diff --git a/Cargo.toml b/Cargo.toml
index 39fb4a4..c316a81 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,7 +16,8 @@ hex = "0.4"
 err-derive = "0.3"
 
 dryoc = { version = "0.4", optional = true }
-serde = { version = "1.0", optional = true, features = ["derive"] }
+serde = { version = "1.0", optional = true, default-features = false, features = ["derive"] }
 
 [features]
-default = [ "dryoc", "serde" ]
+#default = [ "dryoc", "serde" ]
+#default = [ "serde" ]
diff --git a/src/buf.rs b/src/buf.rs
new file mode 100644
index 0000000..edd2766
--- /dev/null
+++ b/src/buf.rs
@@ -0,0 +1,691 @@
+//! Index-based decoding of nettext terms.
+//!
+//! [`decode`] parses a byte buffer into a [`Buf`]. Parsed terms live in flat tables and
+//! refer to the input by byte offsets; callers navigate them through [`ITerm`] handles.
+
+use std::borrow::Cow;
+
+use crate::{
+    is_string_char, is_whitespace, DICT_ASSIGN, DICT_CLOSE, DICT_DELIM, DICT_OPEN, LIST_CLOSE,
+    LIST_DELIM, LIST_OPEN,
+};
+
+/// Byte offset into the input; also used to index the tables of a [`Buf`].
+pub type Pos = u32;
+
+/// Reason why [`decode`] rejected its input.
+#[derive(Debug)]
+pub enum ParseError {
+    /// The input ended where more bytes were expected.
+    IncompleteInput,
+    /// A dictionary holds the same key twice; carries the offset of the first occurrence.
+    DuplicateKey(Pos),
+    /// An unexpected byte was found at the given offset.
+    Garbage(Pos),
+}
+
+/// Handle to a parsed term: an index into the term table of the [`Buf`] that produced it.
+#[derive(Clone, Copy, Debug)]
+pub struct ITerm(Pos);
+
+/// Half-open byte range `start..end` of the input.
+#[derive(Clone, Copy, Debug)]
+struct IRaw {
+    start: Pos,
+    end: Pos,
+}
+
+/// Half-open index range `seq_start..seq_end` into `Buf::seqs`.
+#[derive(Clone, Copy, Debug)]
+struct ISeq {
+    seq_start: Pos,
+    seq_end: Pos,
+}
+
+/// Half-open index range `dict_start..dict_end` into `Buf::dicts`.
+#[derive(Clone, Copy, Debug)]
+struct IDict {
+    dict_start: Pos,
+    dict_end: Pos,
+}
+
+/// A parsed term; every variant carries the byte range of its raw representation.
+#[derive(Debug)]
+enum TTerm {
+    /// A single string.
+    Str(IRaw),
+    /// Two or more consecutive terms.
+    RawSeq(IRaw, ISeq),
+    /// A list enclosed in `LIST_OPEN` / `LIST_CLOSE`.
+    RawList(IRaw, ISeq),
+    /// A dictionary enclosed in `DICT_OPEN` / `DICT_CLOSE`.
+    RawDict(IRaw, IDict),
+}
+
+/// Parsed view of an input buffer, produced by [`decode`].
+#[derive(Debug)]
+pub struct Buf<'a> {
+    /// The input bytes.
+    bytes: Cow<'a, [u8]>,
+    /// Children of all sequences and lists; each parent owns one contiguous range.
+    seqs: Vec<ITerm>,
+    /// Entries of all dictionaries; each dictionary owns one contiguous range sorted by key.
+    dicts: Vec<(IRaw, ITerm)>,
+    /// Every parsed term, indexed by [`ITerm`].
+    terms: Vec<TTerm>,
+}
+
+/// Parses `input` and returns the resulting [`Buf`] with the handle of the top-level term.
+///
+/// # Errors
+///
+/// Returns a [`ParseError`] when no term can be parsed, when anything other than
+/// whitespace follows the top-level term, or when a dictionary repeats a key. Input
+/// whose length does not fit in a [`Pos`] is rejected with `Garbage(Pos::MAX)`.
+pub fn decode(input: &[u8]) -> Result<(Buf<'_>, ITerm), ParseError> {
+    // Every offset is stored as a `Pos`. A plain `as` cast would silently truncate the
+    // length, so that only a prefix of an oversized input would be parsed and validated.
+    let end = match Pos::try_from(input.len()) {
+        Ok(end) => end,
+        Err(_) => return Err(ParseError::Garbage(Pos::MAX)),
+    };
+
+    let mut buf = Buf {
+        bytes: input.into(),
+        seqs: Vec::with_capacity(16),
+        dicts: Vec::with_capacity(16),
+        terms: Vec::with_capacity(16),
+    };
+    let all_buf = IRaw { start: 0, end };
+
+    let mut stack = Vec::with_capacity(16);
+    let (term, rest) = buf.decode(all_buf, &mut stack)?;
+    assert!(stack.is_empty());
+
+    // Only whitespace may follow the top-level term.
+    let rest = buf.take_whitespace(rest);
+    if rest.start < all_buf.end {
+        return Err(ParseError::Garbage(rest.start));
+    }
+
+    Ok((buf, term))
+}
+
+/// Entry of the scratch stack shared by the recursive parsing functions.
+enum StackItem {
+    /// Child of a sequence or list under construction.
+    Term(ITerm),
+    /// Key and value of a dictionary entry under construction.
+    KeyValue(IRaw, ITerm),
+}
+type Stack = Vec<StackItem>;
+impl StackItem {
+    /// Unwraps a `Term` entry; any other entry is a parser bug.
+    fn term(self) -> ITerm {
+        match self {
+            StackItem::Term(term) => term,
+            _ => unreachable!(),
+        }
+    }
+    /// Unwraps a `KeyValue` entry; any other entry is a parser bug.
+    fn kv(self) -> (IRaw, ITerm) {
+        match self {
+            StackItem::KeyValue(key, term) => (key, term),
+            _ => unreachable!(),
+        }
+    }
+}
+
+impl<'a> Buf<'a> {
+    // ============================================ PUBLIC ACCESS FUNCTIONS
+
+    /// Returns the raw bytes of `term`, or `None` if the handle is not valid for this buffer.
+    pub fn raw(&self, term: ITerm) -> Option<&[u8]> {
+        match self.terms.get(term.0 as usize)? {
+            TTerm::Str(r) | TTerm::RawSeq(r, _) | TTerm::RawList(r, _) | TTerm::RawDict(r, _) => {
+                self.get_bytes(*r)
+            }
+        }
+    }
+
+    /// Returns `term` as a string slice, or `None` if it is not a string.
+    pub fn str(&self, term: ITerm) -> Option<&str> {
+        match self.terms.get(term.0 as usize)? {
+            TTerm::Str(r) => self
+                .bytes
+                .get(r.start as usize..r.end as usize)
+                // SAFETY: a string term only spans bytes accepted by `is_string_char`.
+                // NOTE(review): this is sound only if `is_string_char` rejects every
+                // non-ASCII byte -- confirm against its definition.
+                .map(|x| unsafe { std::str::from_utf8_unchecked(x) }),
+            _ => None,
+        }
+    }
+
+    /// Returns the items of a sequence; any other term is returned as a sequence of one.
+    pub fn seq<'x>(&'x self, term: &'x ITerm) -> Option<&'x [ITerm]> {
+        match self.terms.get(term.0 as usize)? {
+            TTerm::RawSeq(_, s) => self.seqs.get(s.seq_start as usize..s.seq_end as usize),
+            _ => Some(std::slice::from_ref(term)),
+        }
+    }
+
+    /// Returns the items of a sequence of exactly `N` terms, or `None` otherwise.
+    pub fn seq_of<const N: usize>(&self, term: ITerm) -> Option<[ITerm; N]> {
+        match self.terms.get(term.0 as usize)? {
+            TTerm::RawSeq(_, s) => {
+                let seq = self.seqs.get(s.seq_start as usize..s.seq_end as usize)?;
+                // The slice-to-array conversion fails exactly when the length is not `N`.
+                seq.try_into().ok()
+            }
+            _ => None,
+        }
+    }
+
+    /// Returns the items of a list, or `None` if `term` is not a list.
+    pub fn list(&self, term: ITerm) -> Option<&[ITerm]> {
+        match self.terms.get(term.0 as usize)? {
+            TTerm::RawList(_, s) => self.seqs.get(s.seq_start as usize..s.seq_end as usize),
+            _ => None,
+        }
+    }
+
+    /// Returns the items of a list of exactly `N` items, or `None` otherwise.
+    pub fn list_of<const N: usize>(&self, term: ITerm) -> Option<[ITerm; N]> {
+        match self.terms.get(term.0 as usize)? {
+            TTerm::RawList(_, s) => {
+                let seq = self.seqs.get(s.seq_start as usize..s.seq_end as usize)?;
+                seq.try_into().ok()
+            }
+            _ => None,
+        }
+    }
+
+    /// Returns the value stored under `key`, or `None` if `term` is not a dictionary or
+    /// has no such key.
+    pub fn dict_get(&self, term: ITerm, key: &str) -> Option<ITerm> {
+        match self.terms.get(term.0 as usize)? {
+            TTerm::RawDict(_, d) => {
+                let dict = self.dicts.get(d.dict_start as usize..d.dict_end as usize)?;
+                self.find_key(dict, key)
+            }
+            _ => None,
+        }
+    }
+
+    /// Returns the values stored under `keys`, in the same order.
+    ///
+    /// Yields `None` if `term` is not a dictionary, if one of `keys` is missing, or if the
+    /// dictionary has more than `N` entries while `allow_other` is `false`.
+    pub fn dict_of<const N: usize>(
+        &self,
+        term: ITerm,
+        keys: [&str; N],
+        allow_other: bool,
+    ) -> Option<[ITerm; N]> {
+        match self.terms.get(term.0 as usize)? {
+            TTerm::RawDict(_, d) => {
+                let dict = self.dicts.get(d.dict_start as usize..d.dict_end as usize)?;
+                if dict.len() < N || (dict.len() > N && !allow_other) {
+                    return None;
+                }
+
+                let mut ret = [ITerm(0); N];
+                for (slot, key) in ret.iter_mut().zip(keys) {
+                    *slot = self.find_key(dict, key)?;
+                }
+
+                Some(ret)
+            }
+            _ => None,
+        }
+    }
+
+    /// Iterates over the entries of a dictionary in ascending key order, or returns
+    /// `None` if `term` is not a dictionary.
+    pub fn dict_iter(&self, term: ITerm) -> Option<impl Iterator<Item = (&str, ITerm)> + '_> {
+        match self.terms.get(term.0 as usize)? {
+            TTerm::RawDict(_, d) => {
+                let dict = self.dicts.get(d.dict_start as usize..d.dict_end as usize)?;
+                let iter = dict.iter().map(|(k, v)| {
+                    let key = self.get_bytes(*k).unwrap();
+                    // SAFETY: a key only spans bytes accepted by `is_string_char`.
+                    // NOTE(review): this is sound only if `is_string_char` rejects every
+                    // non-ASCII byte -- confirm against its definition.
+                    (unsafe { std::str::from_utf8_unchecked(key) }, *v)
+                });
+                Some(iter)
+            }
+            _ => None,
+        }
+    }
+
+    /// Binary-searches `dict`, a slice of `self.dicts` sorted by key bytes, for `key`.
+    fn find_key(&self, dict: &[(IRaw, ITerm)], key: &str) -> Option<ITerm> {
+        let pos = dict
+            .binary_search_by(|(k, _)| self.get_bytes(*k).unwrap().cmp(key.as_bytes()))
+            .ok()?;
+        Some(dict[pos].1)
+    }
+
+    // ============================================ NETTEXT PARSING
+
+    /// Parses one term starting at `raw`, after skipping leading whitespace.
+    ///
+    /// Consecutive terms are gathered into a single `RawSeq`. Returns the term and the
+    /// unparsed remainder with its leading whitespace already skipped.
+    fn decode(&mut self, raw: IRaw, stack: &mut Stack) -> Result<(ITerm, IRaw), ParseError> {
+        let start = self.take_whitespace(raw);
+        let stack_start = stack.len();
+
+        let mut cur_end = start;
+        let mut next_start = start;
+        loop {
+            match self.decode_nonseq_term(next_start, stack) {
+                Ok((term, rest)) => {
+                    stack.push(StackItem::Term(term));
+                    cur_end = rest;
+                    next_start = self.take_whitespace(rest);
+                }
+                // A duplicate key is a definite error, not merely the end of the sequence.
+                Err(e @ ParseError::DuplicateKey(_)) => {
+                    stack.truncate(stack_start);
+                    return Err(e);
+                }
+                Err(_) => break,
+            }
+        }
+
+        if stack.len() == stack_start {
+            Err(self.error_at(next_start))
+        } else if stack.len() == stack_start + 1 {
+            Ok((stack.pop().unwrap().term(), next_start))
+        } else {
+            let seq_raw = IRaw {
+                start: start.start,
+                end: cur_end.start,
+            };
+            let seq_start = self.seqs.len();
+            self.seqs
+                .extend(stack.drain(stack_start..).map(StackItem::term));
+            let seq = TTerm::RawSeq(
+                seq_raw,
+                ISeq {
+                    seq_start: seq_start as Pos,
+                    seq_end: self.seqs.len() as Pos,
+                },
+            );
+            Ok((self.push_term(seq), next_start))
+        }
+    }
+
+    /// Parses a string, a list or a dictionary starting exactly at `raw`.
+    ///
+    /// A `DuplicateKey` error is passed on unchanged; otherwise the errors of the individual
+    /// parsers are discarded and the failure is reported at `raw`.
+    fn decode_nonseq_term(
+        &mut self,
+        raw: IRaw,
+        stack: &mut Stack,
+    ) -> Result<(ITerm, IRaw), ParseError> {
+        if let Ok(parsed) = self.decode_string(raw) {
+            return Ok(parsed);
+        }
+        match self.decode_list(raw, stack) {
+            Ok(parsed) => return Ok(parsed),
+            Err(e @ ParseError::DuplicateKey(_)) => return Err(e),
+            Err(_) => {}
+        }
+        match self.decode_dict(raw, stack) {
+            Ok(parsed) => Ok(parsed),
+            Err(e @ ParseError::DuplicateKey(_)) => Err(e),
+            Err(_) => Err(self.error_at(raw)),
+        }
+    }
+
+    /// Parses a list: `LIST_OPEN`, items separated by `LIST_DELIM`, then `LIST_CLOSE`.
+    ///
+    /// On failure the stack is restored to its length on entry.
+    fn decode_list(&mut self, raw: IRaw, stack: &mut Stack) -> Result<(ITerm, IRaw), ParseError> {
+        let stack_start = stack.len();
+
+        let mut cur = self.take_whitespace(self.take_char(raw, LIST_OPEN)?);
+
+        loop {
+            match self.decode(cur, stack) {
+                Ok((term, rest)) => {
+                    stack.push(StackItem::Term(term));
+                    cur = self.take_whitespace(rest);
+                    match self.take_char(cur, LIST_DELIM) {
+                        Ok(rest) => cur = self.take_whitespace(rest),
+                        Err(_) => break,
+                    }
+                }
+                Err(e @ ParseError::DuplicateKey(_)) => {
+                    stack.truncate(stack_start);
+                    return Err(e);
+                }
+                // No further item; `LIST_CLOSE` is checked below.
+                Err(_) => break,
+            }
+        }
+
+        if let Ok(rest) = self.take_char(cur, LIST_CLOSE) {
+            let seq_raw = IRaw {
+                start: raw.start,
+                end: rest.start,
+            };
+            let seq_start = self.seqs.len();
+            self.seqs
+                .extend(stack.drain(stack_start..).map(StackItem::term));
+            let seq = TTerm::RawList(
+                seq_raw,
+                ISeq {
+                    seq_start: seq_start as Pos,
+                    seq_end: self.seqs.len() as Pos,
+                },
+            );
+            Ok((self.push_term(seq), rest))
+        } else {
+            stack.truncate(stack_start);
+            Err(self.error_at(cur))
+        }
+    }
+
+    /// Parses a dictionary and stores its entries sorted by key.
+    ///
+    /// On failure the stack is restored to its length on entry. A repeated key is
+    /// reported as `DuplicateKey` with the offset of its first occurrence.
+    fn decode_dict(&mut self, raw: IRaw, stack: &mut Stack) -> Result<(ITerm, IRaw), ParseError> {
+        let stack_start = stack.len();
+
+        let rest = match self.decode_dict_inner(raw, stack) {
+            Ok(rest) => rest,
+            Err(e) => {
+                stack.truncate(stack_start);
+                return Err(e);
+            }
+        };
+
+        let dict_raw = IRaw {
+            start: raw.start,
+            end: rest.start,
+        };
+        let dict_start = self.dicts.len();
+        self.dicts
+            .extend(stack.drain(stack_start..).map(StackItem::kv));
+        // Sorting by key bytes enables binary search; the offset breaks ties between equal
+        // keys so that the first occurrence comes first.
+        self.dicts[dict_start..]
+            .sort_by_key(|(k, _)| (&self.bytes[k.start as usize..k.end as usize], k.start));
+
+        // Equal keys are adjacent once sorted. `windows(2)` is empty for dictionaries with
+        // fewer than two entries, whereas slicing from `dict_start + 1` panics when empty.
+        let duplicate = self.dicts[dict_start..]
+            .windows(2)
+            .find(|w| self.get_bytes(w[0].0) == self.get_bytes(w[1].0))
+            .map(|w| w[0].0.start);
+        if let Some(pos) = duplicate {
+            // Drop the entries of the rejected dictionary.
+            self.dicts.truncate(dict_start);
+            return Err(ParseError::DuplicateKey(pos));
+        }
+
+        let dict = TTerm::RawDict(
+            dict_raw,
+            IDict {
+                dict_start: dict_start as Pos,
+                dict_end: self.dicts.len() as Pos,
+            },
+        );
+        Ok((self.push_term(dict), rest))
+    }
+
+    /// Parses the entries of a dictionary, pushing one `KeyValue` per entry onto `stack`.
+    ///
+    /// Returns the remainder after `DICT_CLOSE`. On failure, entries pushed so far stay on
+    /// the stack for the caller to discard.
+    fn decode_dict_inner(&mut self, raw: IRaw, stack: &mut Stack) -> Result<IRaw, ParseError> {
+        let mut cur = self.take_whitespace(self.take_char(raw, DICT_OPEN)?);
+
+        while let Ok((key, rest)) = self.take_string(cur) {
+            cur = self.take_whitespace(rest);
+
+            cur = self.take_char(cur, DICT_ASSIGN)?;
+
+            let (value, rest) = self.decode(cur, stack)?;
+            cur = self.take_whitespace(rest);
+            stack.push(StackItem::KeyValue(key, value));
+
+            if let Ok(rest) = self.take_char(cur, DICT_DELIM) {
+                cur = self.take_whitespace(rest);
+            } else {
+                break;
+            }
+        }
+
+        self.take_char(cur, DICT_CLOSE)
+    }
+
+    /// Parses a string and registers it as a term.
+    fn decode_string(&mut self, raw: IRaw) -> Result<(ITerm, IRaw), ParseError> {
+        let (string_raw, rest) = self.take_string(raw)?;
+        let string = self.push_term(TTerm::Str(string_raw));
+        Ok((string, rest))
+    }
+
+    /// Splits off the longest prefix of `raw` made of string characters.
+    ///
+    /// Returns the prefix and the remainder, or an error if the prefix is empty.
+    fn take_string(&self, raw: IRaw) -> Result<(IRaw, IRaw), ParseError> {
+        let mut rest = raw;
+        while rest.start < rest.end && is_string_char(self.bytes[rest.start as usize]) {
+            rest.start += 1;
+        }
+
+        if rest.start > raw.start {
+            let string_raw = IRaw {
+                start: raw.start,
+                end: rest.start,
+            };
+            Ok((string_raw, rest))
+        } else {
+            Err(self.error_at(rest))
+        }
+    }
+
+    /// Consumes the byte `c` at the start of `raw` and returns the remainder.
+    #[inline]
+    fn take_char(&self, raw: IRaw, c: u8) -> Result<IRaw, ParseError> {
+        if raw.start >= raw.end {
+            Err(ParseError::IncompleteInput)
+        } else if self.bytes[raw.start as usize] != c {
+            Err(ParseError::Garbage(raw.start))
+        } else {
+            Ok(IRaw {
+                start: raw.start + 1,
+                end: raw.end,
+            })
+        }
+    }
+
+    /// Returns `raw` without its leading whitespace.
+    #[inline]
+    fn take_whitespace(&self, mut raw: IRaw) -> IRaw {
+        while raw.start < raw.end && is_whitespace(self.bytes[raw.start as usize]) {
+            raw.start += 1;
+        }
+        raw
+    }
+
+    /// Returns the input bytes covered by `raw`, or `None` if the range is out of bounds.
+    #[inline]
+    fn get_bytes(&self, raw: IRaw) -> Option<&[u8]> {
+        self.bytes.get(raw.start as usize..raw.end as usize)
+    }
+
+    /// Builds the error for a failure at the start of `raw`: `Garbage` if bytes remain,
+    /// `IncompleteInput` if the input is exhausted.
+    #[inline]
+    fn error_at(&self, raw: IRaw) -> ParseError {
+        if raw.start < raw.end {
+            ParseError::Garbage(raw.start)
+        } else {
+            ParseError::IncompleteInput
+        }
+    }
+
+    /// Appends `term` to the term table and returns its handle.
+    #[inline]
+    fn push_term(&mut self, term: TTerm) -> ITerm {
+        let ret = ITerm(self.terms.len() as Pos);
+        self.terms.push(term);
+        ret
+    }
+
+    /// Prints `i` and, recursively, its children to stderr.
+    #[cfg(test)]
+    fn debug(&self, i: ITerm) {
+        use crate::debug as debug_str;
+
+        let term = &self.terms[i.0 as usize];
+        match term {
+            TTerm::Str(r) => {
+                eprintln!(
+                    "{} -> {:?} = `{}`",
+                    i.0,
+                    term,
+                    debug_str(&self.bytes[r.start as usize..r.end as usize])
+                );
+            }
+            TTerm::RawSeq(r, s) => {
+                eprintln!(
+                    "{} -> {:?} = `{}` ((",
+                    i.0,
+                    term,
+                    debug_str(&self.bytes[r.start as usize..r.end as usize])
+                );
+                for j in self.seqs[s.seq_start as usize..s.seq_end as usize].iter() {
+                    self.debug(*j);
+                }
+                eprintln!("))");
+            }
+            TTerm::RawList(r, l) => {
+                eprintln!(
+                    "{} -> {:?} = `{}` [[",
+                    i.0,
+                    term,
+                    debug_str(&self.bytes[r.start as usize..r.end as usize])
+                );
+                for j in self.seqs[l.seq_start as usize..l.seq_end as usize].iter() {
+                    self.debug(*j);
+                }
+                eprintln!("]]");
+            }
+            TTerm::RawDict(r, d) => {
+                eprintln!(
+                    "{} -> {:?} = `{}` {{{{",
+                    i.0,
+                    term,
+                    debug_str(&self.bytes[r.start as usize..r.end as usize])
+                );
+                for (k, v) in self.dicts[d.dict_start as usize..d.dict_end as usize].iter() {
+                    eprint!(
+                        "[`{}`] = ",
+                        debug_str(&self.bytes[k.start as usize..k.end as usize])
+                    );
+                    self.debug(*v);
+                }
+                eprintln!("}}}}");
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn list_and_seq() {
+        let string = "[ [ h1; h2; h3 ]; hello world [ acc bii; cooj dlpa] ]";
+        eprintln!("{}", string);
+        let (buf, term) = decode(string.as_bytes()).unwrap();
+        buf.debug(term);
+
+        let [a, b] = buf.list_of(term).unwrap();
+
+        assert_eq!(buf.raw(a).unwrap(), b"[ h1; h2; h3 ]");
+        let l = buf.list(a).unwrap();
+        assert_eq!(l.len(), 3);
+        assert_eq!(buf.str(l[0]).unwrap(), "h1");
+        assert_eq!(buf.str(l[1]).unwrap(), "h2");
+        assert_eq!(buf.str(l[2]).unwrap(), "h3");
+
+        assert_eq!(buf.raw(b).unwrap(), b"hello world [ acc bii; cooj dlpa]");
+        let [h, w, l] = buf.seq_of(b).unwrap();
+        assert_eq!(buf.str(h).unwrap(), "hello");
+        assert_eq!(buf.str(w).unwrap(), "world");
+
+        assert_eq!(buf.raw(l).unwrap(), b"[ acc bii; cooj dlpa]");
+        let [l1, l2] = buf.list_of(l).unwrap();
+
+        assert_eq!(buf.raw(l1).unwrap(), b"acc bii");
+        let s = buf.seq(&l1).unwrap();
+        assert_eq!(s.len(), 2);
+        assert_eq!(buf.str(s[0]).unwrap(), "acc");
+        assert_eq!(buf.str(s[1]).unwrap(), "bii");
+
+        assert_eq!(buf.raw(l2).unwrap(), b"cooj dlpa");
+        let [s2a, s2b] = buf.seq_of(l2).unwrap();
+        assert_eq!(buf.str(s2a).unwrap(), "cooj");
+        assert_eq!(buf.str(s2b).unwrap(), "dlpa");
+    }
+
+    #[test]
+    fn dict() {
+        let string =
+            "[ { a = plop; b = hello world }; ploplop { e=15; d=12 ;c = {key=val;key2=val2}} ]";
+        eprintln!("{}", string);
+        let (buf, term) = decode(string.as_bytes()).unwrap();
+        buf.debug(term);
+
+        let [a, b] = buf.list_of(term).unwrap();
+
+        assert_eq!(buf.raw(a).unwrap(), b"{ a = plop; b = hello world }");
+        let [aa, ab] = buf.dict_of(a, ["a", "b"], false).unwrap();
+        assert_eq!(buf.raw(aa).unwrap(), b"plop");
+        assert_eq!(buf.raw(ab).unwrap(), b"hello world");
+
+        assert_eq!(buf.raw(b).unwrap(), b"ploplop { e=15; d=12 ;c = {key=val;key2=val2}}");
+        let [ba, bb] = buf.seq_of(b).unwrap();
+        assert_eq!(buf.str(ba).unwrap(), "ploplop");
+
+        assert_eq!(buf.str(buf.dict_get(bb, "e").unwrap()).unwrap(), "15");
+        let mut iter = buf.dict_iter(bb).unwrap();
+        let (k1, v1) = iter.next().unwrap();
+        assert_eq!(k1, "c");
+        assert_eq!(buf.raw(v1).unwrap(), b"{key=val;key2=val2}");
+        let (k2, v2) = iter.next().unwrap();
+        assert_eq!(k2, "d");
+        assert_eq!(buf.str(v2).unwrap(), "12");
+        let (k3, v3) = iter.next().unwrap();
+        assert_eq!(k3, "e");
+        assert_eq!(buf.str(v3).unwrap(), "15");
+        assert!(iter.next().is_none());
+    }
+
+    #[test]
+    fn empty_dict() {
+        let (buf, term) = decode(b"{}").unwrap();
+        assert_eq!(buf.raw(term).unwrap(), b"{}");
+        assert!(buf.dict_get(term, "a").is_none());
+        assert!(buf.dict_iter(term).unwrap().next().is_none());
+    }
+
+    #[test]
+    fn duplicate_key() {
+        let res = decode(b"{ a = 1; a = 2 }");
+        assert!(matches!(res, Err(ParseError::DuplicateKey(2))));
+    }
+}
diff --git a/src/enc/mod.rs b/src/enc/mod.rs
index f15f041..9fd1a2a 100644
--- a/src/enc/mod.rs
+++ b/src/enc/mod.rs
@@ -22,8 +22,8 @@ mod error;
 use std::borrow::{Borrow, Cow};
 use std::collections::HashMap;
 
-use crate::*;
 use crate::dec::{self, decode};
+use crate::*;
 use crate::{is_string_char, is_whitespace, switch64, BytesEncoding};
 
 pub use error::Error;
diff --git a/src/lib.rs b/src/lib.rs
index bf06149..589c2a8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -84,6 +84,7 @@
 //! Note that the value of `text1` is embedded as-is inside `text2`. This is what allows us
 //! to check the hash and the signature: the raw representation of the term hasn't changed.
 
+pub mod buf;
 pub mod dec;
 pub mod enc;
 pub mod switch64;
diff --git a/src/switch64.rs b/src/switch64.rs
index 5043e98..b99174e 100644
--- a/src/switch64.rs
+++ b/src/switch64.rs
@@ -103,10 +103,7 @@ mod tests {
     #[test]
     fn test_encode() {
         assert_eq!(debug(&encode(&b"hello world"[..], true)), "hello world");
-        assert_eq!(
-            debug(&encode(&b"hello, world!"[..], true)),
-            "hello, world!"
-        );
+        assert_eq!(debug(&encode(&b"hello, world!"[..], true)), "hello, world!");
     }
 
     #[test]