From 7cc2212786277e33d94d20b2e410f4bdffe07ed6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 10 May 2023 17:19:46 +0200 Subject: [PATCH] work on buf parser --- src/{buf.rs => buf/decode.rs} | 218 +++++--------------- src/buf/mod.rs | 374 ++++++++++++++++++++++++++++++++++ 2 files changed, 430 insertions(+), 162 deletions(-) rename src/{buf.rs => buf/decode.rs} (69%) create mode 100644 src/buf/mod.rs diff --git a/src/buf.rs b/src/buf/decode.rs similarity index 69% rename from src/buf.rs rename to src/buf/decode.rs index edd2766..8676f4f 100644 --- a/src/buf.rs +++ b/src/buf/decode.rs @@ -1,11 +1,9 @@ -use std::borrow::Cow; - use crate::{ is_string_char, is_whitespace, DICT_ASSIGN, DICT_CLOSE, DICT_DELIM, DICT_OPEN, LIST_CLOSE, LIST_DELIM, LIST_OPEN, }; -pub type Pos = u32; +use super::*; #[derive(Debug)] pub enum ParseError { @@ -14,43 +12,6 @@ pub enum ParseError { Garbage(Pos), } -#[derive(Clone, Copy, Debug)] -pub struct ITerm(Pos); - -#[derive(Clone, Copy, Debug)] -struct IRaw { - start: Pos, - end: Pos, -} - -#[derive(Clone, Copy, Debug)] -struct ISeq { - seq_start: Pos, - seq_end: Pos, -} - -#[derive(Clone, Copy, Debug)] -struct IDict { - dict_start: Pos, - dict_end: Pos, -} - -#[derive(Debug)] -enum TTerm { - Str(IRaw), - RawSeq(IRaw, ISeq), - RawList(IRaw, ISeq), - RawDict(IRaw, IDict), -} - -#[derive(Debug)] -pub struct Buf<'a> { - bytes: Cow<'a, [u8]>, - seqs: Vec, - dicts: Vec<(IRaw, ITerm)>, - terms: Vec, -} - pub fn decode(input: &[u8]) -> Result<(Buf<'_>, ITerm), ParseError> { let mut buf = Buf { bytes: input.into(), @@ -63,23 +24,20 @@ pub fn decode(input: &[u8]) -> Result<(Buf<'_>, ITerm), ParseError> { end: input.len() as Pos, }; - let mut stack = Vec::with_capacity(16); - let (term, rest) = buf.decode(all_buf, &mut stack)?; - assert!(stack.is_empty()); - - let rest = buf.take_whitespace(rest); - if rest.start < all_buf.end { - return Err(ParseError::Garbage(rest.start)); - } + let term = buf.decode(all_buf)?; Ok((buf, term)) } +// ================ + enum StackItem { Term(ITerm), KeyValue(IRaw, ITerm), } + type Stack = Vec; + impl StackItem { fn term(self) -> ITerm { match self { @@ -96,109 +54,21 @@ impl StackItem { } impl<'a> Buf<'a> { - // ============================================ PUBLIC ACCESS FUNCTIONS + pub(crate) fn decode(&mut self, raw: IRaw) -> Result { + let mut stack = Vec::with_capacity(16); - pub fn raw(&self, term: ITerm) -> Option<&[u8]> { - match self.terms.get(term.0 as usize)? { - TTerm::Str(r) | TTerm::RawSeq(r, _) | TTerm::RawList(r, _) | TTerm::RawDict(r, _) => { - self.get_bytes(*r) - } + let (term, rest) = self.decode_seq(raw, &mut stack)?; + assert!(stack.is_empty()); + + let rest = self.take_whitespace(rest); + if rest.start < raw.end { + return Err(ParseError::Garbage(rest.start)); } + + Ok(term) } - pub fn str(&self, term: ITerm) -> Option<&str> { - match self.terms.get(term.0 as usize)? { - TTerm::Str(r) => self - .bytes - .get(r.start as usize..r.end as usize) - .map(|x| unsafe { std::str::from_utf8_unchecked(x) }), - _ => None, - } - } - - pub fn seq<'x>(&'x self, term: &'x ITerm) -> Option<&'x [ITerm]> { - match self.terms.get(term.0 as usize)? { - TTerm::RawSeq(_, s) => self.seqs.get(s.seq_start as usize..s.seq_end as usize), - _ => Some(std::slice::from_ref(term)), - } - } - - pub fn seq_of(&self, term: ITerm) -> Option<[ITerm; N]> { - match self.terms.get(term.0 as usize)? { - TTerm::RawSeq(_, s) => { - if (s.seq_end - s.seq_start) as usize == N { - let seq = self.seqs.get(s.seq_start as usize..s.seq_end as usize)?; - Some(seq.try_into().unwrap()) - } else { - None - } - } - _ => None, - } - } - - pub fn list(&self, term: ITerm) -> Option<&[ITerm]> { - match self.terms.get(term.0 as usize)? { - TTerm::RawList(_, s) => self.seqs.get(s.seq_start as usize..s.seq_end as usize), - _ => None, - } - } - - pub fn list_of(&self, term: ITerm) -> Option<[ITerm; N]> { - match self.terms.get(term.0 as usize)? { - TTerm::RawList(_, s) if (s.seq_end - s.seq_start) as usize == N => { - let seq = self.seqs.get(s.seq_start as usize..s.seq_end as usize)?; - Some(seq.try_into().unwrap()) - } - _ => None, - } - } - - pub fn dict_get(&self, term: ITerm, key: &str) -> Option { - match self.terms.get(term.0 as usize)? { - TTerm::RawDict(_, d) => { - let dict = self.dicts.get(d.dict_start as usize..d.dict_end as usize)?; - let pos = dict.binary_search_by(|(k, _)| self.get_bytes(*k).unwrap().cmp(key.as_bytes())).ok()?; - Some(dict[pos].1) - } - _ => None, - } - } - - pub fn dict_of(&self, term: ITerm, keys: [&str; N], allow_other: bool) -> Option<[ITerm; N]> { - match self.terms.get(term.0 as usize)? { - TTerm::RawDict(_, d) => { - let dict = self.dicts.get(d.dict_start as usize..d.dict_end as usize)?; - if dict.len() < N || (dict.len() > N && !allow_other) { - return None; - } - - let mut ret = [ITerm(0); N]; - for i in 0..N { - let pos = dict.binary_search_by(|(k, _)| self.get_bytes(*k).unwrap().cmp(keys[i].as_bytes())).ok()?; - ret[i] = dict[pos].1; - } - - Some(ret) - } - _ => None, - } - } - - pub fn dict_iter(&self, term: ITerm) -> Option + '_> { - match self.terms.get(term.0 as usize)? { - TTerm::RawDict(_, d) => { - let dict = self.dicts.get(d.dict_start as usize..d.dict_end as usize)?; - let iter = dict.iter().map(|(k, v)| (unsafe { std::str::from_utf8_unchecked(self.get_bytes(*k).unwrap()) }, *v)); - Some(iter) - } - _ => None, - } - } - - // ============================================ NETTEXT PARSING - - fn decode(&mut self, raw: IRaw, stack: &mut Stack) -> Result<(ITerm, IRaw), ParseError> { + fn decode_seq(&mut self, raw: IRaw, stack: &mut Stack) -> Result<(ITerm, IRaw), ParseError> { let start = self.take_whitespace(raw); let stack_start = stack.len(); @@ -259,7 +129,7 @@ impl<'a> Buf<'a> { let mut cur = self.take_whitespace(self.take_char(raw, LIST_OPEN)?); - while let Ok((term, rest)) = self.decode(cur, stack) { + while let Ok((term, rest)) = self.decode_seq(cur, stack) { stack.push(StackItem::Term(term)); cur = self.take_whitespace(rest); if let Ok(rest) = self.take_char(rest, LIST_DELIM) { @@ -310,7 +180,7 @@ impl<'a> Buf<'a> { .iter() .zip(self.dicts[dict_start + 1..].iter()) { - if self.get_bytes(*k1).unwrap() == self.get_bytes(*k2).unwrap() { + if self.get_bytes(*k1) == self.get_bytes(*k2) { return Err(ParseError::DuplicateKey(k1.start)); } } @@ -339,7 +209,7 @@ impl<'a> Buf<'a> { cur = self.take_char(cur, DICT_ASSIGN)?; - let (value, rest) = self.decode(cur, stack)?; + let (value, rest) = self.decode_seq(cur, stack)?; cur = self.take_whitespace(rest); stack.push(StackItem::KeyValue(key, value)); @@ -408,8 +278,8 @@ impl<'a> Buf<'a> { } #[inline] - fn get_bytes(&self, raw: IRaw) -> Option<&[u8]> { - self.bytes.get(raw.start as usize..raw.end as usize) + pub(crate) fn get_bytes(&self, raw: IRaw) -> &[u8] { + &self.bytes[raw.start as usize..raw.end as usize] } #[inline] @@ -421,13 +291,6 @@ impl<'a> Buf<'a> { } } - #[inline] - fn push_term(&mut self, term: TTerm) -> ITerm { - let ret = ITerm(self.terms.len() as Pos); - self.terms.push(term); - ret - } - #[cfg(test)] fn debug(&self, i: ITerm) { use crate::debug as debug_str; @@ -454,6 +317,13 @@ impl<'a> Buf<'a> { } eprintln!("))"); } + TTerm::Seq(s) => { + eprintln!("{} -> {:?} ((", i.0, term); + for j in self.seqs[s.seq_start as usize..s.seq_end as usize].iter() { + self.debug(*j); + } + eprintln!("))"); + } TTerm::RawList(r, l) => { eprintln!( "{} -> {:?} = `{}` [[", @@ -466,6 +336,13 @@ impl<'a> Buf<'a> { } eprintln!("]]"); } + TTerm::List(l) => { + eprintln!("{} -> {:?} [[", i.0, term); + for j in self.seqs[l.seq_start as usize..l.seq_end as usize].iter() { + self.debug(*j); + } + eprintln!("]]"); + } TTerm::RawDict(r, d) => { eprintln!( "{} -> {:?} = `{}` {{{{", @@ -482,6 +359,17 @@ impl<'a> Buf<'a> { } eprintln!("}}}}"); } + TTerm::Dict(d) => { + eprintln!("{} -> {:?} {{{{", i.0, term); + for (k, v) in self.dicts[d.dict_start as usize..d.dict_end as usize].iter() { + eprint!( + "[`{}`] = ", + debug_str(&self.bytes[k.start as usize..k.end as usize]) + ); + self.debug(*v); + } + eprintln!("}}}}"); + } } } } @@ -535,17 +423,23 @@ mod tests { buf.debug(term); let [a, b] = buf.list_of(term).unwrap(); - + assert_eq!(buf.raw(a).unwrap(), b"{ a = plop; b = hello world }"); let [aa, ab] = buf.dict_of(a, ["a", "b"], false).unwrap(); assert_eq!(buf.raw(aa).unwrap(), b"plop"); assert_eq!(buf.raw(ab).unwrap(), b"hello world"); - assert_eq!(buf.raw(b).unwrap(), b"ploplop { e=15; d=12 ;c = {key=val;key2=val2}}"); + assert_eq!( + buf.raw(b).unwrap(), + b"ploplop { e=15; d=12 ;c = {key=val;key2=val2}}" + ); let [ba, bb] = buf.seq_of(b).unwrap(); assert_eq!(buf.str(ba).unwrap(), "ploplop"); - assert_eq!(buf.str(buf.dict_get(bb, "e").unwrap()).unwrap(), "15"); + assert_eq!( + buf.str(buf.dict_get(bb, "e").unwrap().unwrap()).unwrap(), + "15" + ); let mut iter = buf.dict_iter(bb).unwrap(); let (k1, v1) = iter.next().unwrap(); assert_eq!(k1, "c"); diff --git a/src/buf/mod.rs b/src/buf/mod.rs new file mode 100644 index 0000000..72c4ee1 --- /dev/null +++ b/src/buf/mod.rs @@ -0,0 +1,374 @@ +pub mod decode; + +use std::borrow::Cow; + +use crate::is_string_char; + +pub use decode::*; + +pub type Pos = u32; + +#[derive(Clone, Copy, Debug)] +pub struct ITerm(Pos); + +#[derive(Clone, Copy, Debug)] +pub(crate) struct IRaw { + start: Pos, + end: Pos, +} + +#[derive(Clone, Copy, Debug)] +pub(crate) struct ISeq { + seq_start: Pos, + seq_end: Pos, +} + +#[derive(Clone, Copy, Debug)] +pub(crate) struct IDict { + dict_start: Pos, + dict_end: Pos, +} + +#[derive(Debug)] +pub(crate) enum TTerm { + Str(IRaw), + RawSeq(IRaw, ISeq), + RawList(IRaw, ISeq), + RawDict(IRaw, IDict), + Seq(ISeq), + List(ISeq), + Dict(IDict), +} + +#[derive(Debug)] +pub struct Buf<'a> { + bytes: Cow<'a, [u8]>, + seqs: Vec, + dicts: Vec<(IRaw, ITerm)>, + terms: Vec, +} + +#[derive(Debug)] +pub enum TermError { + InvalidIndex(ITerm), + WrongType(&'static str, &'static str), + WrongLength(usize, usize), + WrongKeys, + NoRawRepresentation, +} + +#[derive(Debug)] +pub enum ValueError { + InvalidIndex(ITerm), + DuplicateKey, + BadString, + SeqInSeq, +} + +impl<'a> Buf<'a> { + pub fn new() -> Self { + Self { + bytes: Default::default(), + seqs: Vec::with_capacity(16), + dicts: Vec::with_capacity(16), + terms: Vec::with_capacity(16), + } + } + + // ================ READING FUNCTIONS ================== + + pub fn raw(&self, term: ITerm) -> Result<&[u8], TermError> { + match self.get_term(term)? { + TTerm::Str(r) | TTerm::RawSeq(r, _) | TTerm::RawList(r, _) | TTerm::RawDict(r, _) => { + Ok(self.get_bytes(*r)) + } + _ => Err(TermError::NoRawRepresentation), + } + } + + pub fn str(&self, term: ITerm) -> Result<&str, TermError> { + match self.get_term(term)? { + TTerm::Str(r) => { + let bytes = self.get_bytes(*r); + let s = unsafe { std::str::from_utf8_unchecked(bytes) }; + Ok(s) + } + t => Err(TermError::WrongType("string", t.typename())), + } + } + + pub fn seq<'x>(&'x self, term: &'x ITerm) -> Result<&'x [ITerm], TermError> { + match self.get_term(*term)? { + TTerm::RawSeq(_, s) | TTerm::Seq(s) => { + Ok(&self.seqs[s.seq_start as usize..s.seq_end as usize]) + } + _ => Ok(std::slice::from_ref(term)), + } + } + + pub fn seq_of(&self, term: ITerm) -> Result<[ITerm; N], TermError> { + match self.get_term(term)? { + TTerm::RawSeq(_, s) | TTerm::Seq(s) => { + let seq_len = (s.seq_end - s.seq_start) as usize; + if seq_len == N { + let seq = &self.seqs[s.seq_start as usize..s.seq_end as usize]; + Ok(seq.try_into().unwrap()) + } else { + Err(TermError::WrongLength(N, seq_len)) + } + } + t => Err(TermError::WrongType("seq", t.typename())), + } + } + + pub fn list(&self, term: ITerm) -> Result<&[ITerm], TermError> { + match self.get_term(term)? { + TTerm::RawList(_, s) | TTerm::List(s) => { + Ok(&self.seqs[s.seq_start as usize..s.seq_end as usize]) + } + t => Err(TermError::WrongType("list", t.typename())), + } + } + + pub fn list_of(&self, term: ITerm) -> Result<[ITerm; N], TermError> { + match self.get_term(term)? { + TTerm::RawList(_, s) | TTerm::List(s) => { + let list_len = (s.seq_end - s.seq_start) as usize; + if list_len == N { + let seq = &self.seqs[s.seq_start as usize..s.seq_end as usize]; + Ok(seq.try_into().unwrap()) + } else { + Err(TermError::WrongLength(N, list_len)) + } + } + t => Err(TermError::WrongType("list", t.typename())), + } + } + + pub fn dict_get(&self, term: ITerm, key: &str) -> Result, TermError> { + match self.get_term(term)? { + TTerm::RawDict(_, d) | TTerm::Dict(d) => { + let dict = &self.dicts[d.dict_start as usize..d.dict_end as usize]; + let pos_opt = dict + .binary_search_by(|(k, _)| self.get_bytes(*k).cmp(key.as_bytes())) + .ok(); + Ok(pos_opt.map(|pos| dict[pos].1)) + } + t => Err(TermError::WrongType("dict", t.typename())), + } + } + + pub fn dict_of( + &self, + term: ITerm, + keys: [&str; N], + allow_other: bool, + ) -> Result<[ITerm; N], TermError> { + match self.get_term(term)? { + TTerm::RawDict(_, d) | TTerm::Dict(d) => { + let dict = &self.dicts[d.dict_start as usize..d.dict_end as usize]; + if dict.len() < N || (dict.len() > N && !allow_other) { + return Err(TermError::WrongKeys); + } + + let mut ret = [ITerm(0); N]; + for i in 0..N { + let pos = dict + .binary_search_by(|(k, _)| self.get_bytes(*k).cmp(keys[i].as_bytes())) + .map_err(|_| TermError::WrongKeys)?; + ret[i] = dict[pos].1; + } + + Ok(ret) + } + t => Err(TermError::WrongType("dict", t.typename())), + } + } + + pub fn dict_iter( + &self, + term: ITerm, + ) -> Result + '_, TermError> { + match self.get_term(term)? { + TTerm::RawDict(_, d) | TTerm::Dict(d) => { + let dict = &self.dicts[d.dict_start as usize..d.dict_end as usize]; + let iter = dict.iter().map(|(k, v)| { + ( + unsafe { std::str::from_utf8_unchecked(self.get_bytes(*k)) }, + *v, + ) + }); + Ok(iter) + } + t => Err(TermError::WrongType("dict", t.typename())), + } + } + + // ================= WRITING FUNCTIONS ================ + + pub fn push_str(&mut self, s: &str) -> Result { + if !s.as_bytes().iter().copied().all(is_string_char) { + return Err(ValueError::BadString); + } + + let term = TTerm::Str(self.push_bytes(s.as_bytes())); + Ok(self.push_term(term)) + } + + pub fn push_seq(&mut self, iterator: impl Iterator) -> Result { + let seq_start = self.seqs.len(); + + for term in iterator { + match self.terms.get(term.0 as usize) { + None => { + self.seqs.truncate(seq_start); + return Err(ValueError::InvalidIndex(term)); + } + Some(TTerm::RawSeq(_, _)) => { + self.seqs.truncate(seq_start); + return Err(ValueError::SeqInSeq); + } + _ => { + self.seqs.push(term); + } + } + } + + let seq = ISeq { + seq_start: seq_start as Pos, + seq_end: self.seqs.len() as Pos, + }; + + Ok(self.push_term(TTerm::Seq(seq))) + } + + pub fn push_list( + &mut self, + iterator: impl Iterator, + ) -> Result { + let list_start = self.seqs.len(); + + for term in iterator { + match self.terms.get(term.0 as usize) { + None => { + self.seqs.truncate(list_start); + return Err(ValueError::InvalidIndex(term)); + } + _ => { + self.seqs.push(term); + } + } + } + + let list = ISeq { + seq_start: list_start as Pos, + seq_end: self.seqs.len() as Pos, + }; + + Ok(self.push_term(TTerm::List(list))) + } + + pub fn push_dict<'k>( + &mut self, + iterator: impl Iterator, + ) -> Result { + let bytes_start = self.bytes.len(); + let dict_start = self.dicts.len(); + + for (key, term) in iterator { + if !key.as_bytes().iter().copied().all(is_string_char) { + return Err(ValueError::BadString); + } + let key = self.push_bytes(key.as_bytes()); + + match self.terms.get(term.0 as usize) { + None => { + self.bytes.to_mut().truncate(bytes_start); + self.dicts.truncate(dict_start); + return Err(ValueError::InvalidIndex(term)); + } + _ => { + self.dicts.push((key, term)); + } + } + } + + self.dicts[dict_start..] + .sort_by_key(|(k, _)| (&self.bytes[k.start as usize..k.end as usize], k.start)); + + for ((k1, _), (k2, _)) in self.dicts[dict_start..] + .iter() + .zip(self.dicts[dict_start + 1..].iter()) + { + if self.get_bytes(*k1) == self.get_bytes(*k2) { + self.bytes.to_mut().truncate(bytes_start); + self.dicts.truncate(dict_start); + return Err(ValueError::DuplicateKey); + } + } + + let dict = IDict { + dict_start: dict_start as Pos, + dict_end: self.dicts.len() as Pos, + }; + + Ok(self.push_term(TTerm::Dict(dict))) + } + + pub fn push_raw(&mut self, raw: &[u8]) -> Result { + let bytes_len = self.bytes.len(); + let seqs_len = self.seqs.len(); + let dicts_len = self.dicts.len(); + let terms_len = self.terms.len(); + + let raw = self.push_bytes(raw); + let result = self.decode(raw); + + if result.is_err() { + // reset to initial state + self.bytes.to_mut().truncate(bytes_len); + self.seqs.truncate(seqs_len); + self.dicts.truncate(dicts_len); + self.terms.truncate(terms_len); + } + + result + } + + // ==== Internal ==== + + #[inline] + fn get_term(&self, term: ITerm) -> Result<&TTerm, TermError> { + self.terms + .get(term.0 as usize) + .ok_or(TermError::InvalidIndex(term)) + } + + #[inline] + fn push_term(&mut self, term: TTerm) -> ITerm { + let ret = ITerm(self.terms.len() as Pos); + self.terms.push(term); + ret + } + + #[inline] + fn push_bytes(&mut self, raw: &[u8]) -> IRaw { + let bytes_start = self.bytes.len(); + self.bytes.to_mut().extend(raw); + IRaw{ + start: bytes_start as Pos, + end: self.bytes.len() as Pos, + } + } +} + +impl TTerm { + fn typename(&self) -> &'static str { + match self { + TTerm::Str(_) => "string", + TTerm::RawSeq(_, _) | TTerm::Seq(_) => "seq", + TTerm::RawList(_, _) | TTerm::List(_) => "list", + TTerm::RawDict(_, _) | TTerm::Dict(_) => "dict", + } + } +}