From ab8c3e70c3de58b855815d11838a2840b765ec95 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 17 Nov 2022 17:55:50 +0100 Subject: [PATCH] Begin working encoder --- src/dec/decode.rs | 33 +++------ src/enc/mod.rs | 183 ++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 17 +++++ 3 files changed, 210 insertions(+), 23 deletions(-) create mode 100644 src/enc/mod.rs diff --git a/src/dec/decode.rs b/src/dec/decode.rs index a145d4f..33aadee 100644 --- a/src/dec/decode.rs +++ b/src/dec/decode.rs @@ -9,12 +9,7 @@ use nom::{ }; use crate::dec::{debug, AnyTerm, NonListTerm, Term}; - -const DICT_OPEN: &[u8] = b"{"; -const DICT_CLOSE: &[u8] = b"}"; -const DICT_ASSIGN: &[u8] = b"="; -const DICT_DELIM: &[u8] = b","; -const STR_EXTRA_CHARS: &[u8] = b"._-*?"; +use crate::{is_string_char, is_whitespace, DICT_ASSIGN, DICT_CLOSE, DICT_DELIM, DICT_OPEN}; // ---- @@ -51,7 +46,7 @@ impl<'a> From>> for DecodeError<'a> { // ---- /// Decodes a nettext string into the term it represents. -pub fn decode(input: &[u8]) -> std::result::Result, DecodeError<'_>> { +pub fn decode(input: &[u8]) -> std::result::Result, DecodeError<'_>> { let (rest, term) = decode_term(input)?; let (end, _) = take_while(is_whitespace)(rest)?; if !end.is_empty() { @@ -60,7 +55,7 @@ pub fn decode(input: &[u8]) -> std::result::Result, DecodeErro Ok(Term(term)) } -fn decode_term(input: &[u8]) -> IResult<&'_ [u8], AnyTerm<'_, 'static>> { +fn decode_term(input: &[u8]) -> IResult<&'_ [u8], AnyTerm<'_, '_>> { let (start, _) = take_while(is_whitespace)(input)?; let (rest, list) = separated_list1(take_while1(is_whitespace), decode_nonlist_term)(start)?; @@ -73,7 +68,7 @@ fn decode_term(input: &[u8]) -> IResult<&'_ [u8], AnyTerm<'_, 'static>> { } } -fn decode_nonlist_term(input: &[u8]) -> IResult<&'_ [u8], NonListTerm<'_, 'static>> { +fn decode_nonlist_term(input: &[u8]) -> IResult<&'_ [u8], NonListTerm<'_, '_>> { let (rest, term) = alt(( map(decode_str, NonListTerm::Str), map(decode_dict, |(raw, d)| 
NonListTerm::Dict(raw, d)), @@ -86,14 +81,14 @@ fn decode_str(input: &[u8]) -> IResult<&'_ [u8], &'_ [u8]> { Ok((rest, data)) } -type DictType<'a> = (&'a [u8], HashMap<&'a [u8], AnyTerm<'a, 'static>>); +type DictType<'a> = (&'a [u8], HashMap<&'a [u8], AnyTerm<'a, 'a>>); fn decode_dict(dict_begin: &[u8]) -> IResult<&'_ [u8], DictType<'_>> { - let (d, _) = tag(DICT_OPEN)(dict_begin)?; + let (d, _) = tag(&[DICT_OPEN][..])(dict_begin)?; let (d, items) = separated_list0(dict_separator, decode_dict_item)(d)?; let (d, _) = opt(dict_separator)(d)?; let (d, _) = take_while(is_whitespace)(d)?; - let (dict_end, _) = tag(DICT_CLOSE)(d)?; + let (dict_end, _) = tag(&[DICT_CLOSE][..])(d)?; let dict = items.into_iter().collect::>(); @@ -105,27 +100,19 @@ fn decode_dict(dict_begin: &[u8]) -> IResult<&'_ [u8], DictType<'_>> { fn dict_separator(d: &[u8]) -> IResult<&'_ [u8], ()> { let (d, _) = take_while(is_whitespace)(d)?; - let (d, _) = tag(DICT_DELIM)(d)?; + let (d, _) = tag(&[DICT_DELIM][..])(d)?; Ok((d, ())) } -fn decode_dict_item(d: &[u8]) -> IResult<&'_ [u8], (&'_ [u8], AnyTerm<'_, 'static>)> { +fn decode_dict_item(d: &[u8]) -> IResult<&'_ [u8], (&'_ [u8], AnyTerm<'_, '_>)> { let (d, _) = take_while(is_whitespace)(d)?; let (d, key) = decode_str(d)?; let (d, _) = take_while(is_whitespace)(d)?; - let (d, _) = tag(DICT_ASSIGN)(d)?; + let (d, _) = tag(&[DICT_ASSIGN][..])(d)?; let (d, value) = decode_term(d)?; Ok((d, (key, value))) } -fn is_string_char(c: u8) -> bool { - c.is_ascii_alphanumeric() || STR_EXTRA_CHARS.contains(&c) -} - -fn is_whitespace(c: u8) -> bool { - c.is_ascii_whitespace() -} - // ---- #[cfg(test)] diff --git a/src/enc/mod.rs b/src/enc/mod.rs new file mode 100644 index 0000000..2dc2f4c --- /dev/null +++ b/src/enc/mod.rs @@ -0,0 +1,183 @@ +use std::collections::HashMap; + +use crate::dec::decode; +use crate::{is_string_char, is_whitespace}; + +pub struct Term<'a>(T<'a>); + +enum T<'a> { + Str(&'a [u8]), + OwnedStr(Vec), + Dict(HashMap<&'a [u8], T<'a>>), + 
List(Vec>), +} + +#[derive(Debug)] +pub enum Error { + InvalidCharacter(u8), + InvalidRaw, + NotADictionnary, +} + +// ---- helpers to build terms ---- + +/// Encode a string (may contain whitespace) +/// +/// ``` +/// use nettext::enc::*; +/// +/// assert_eq!(encode(&string("Hello world .").unwrap()), b"Hello world ."); +/// ``` +pub fn string(s: &str) -> Result, Error> { + for c in s.as_bytes().iter() { + if !(is_string_char(*c) || is_whitespace(*c)) { + return Err(Error::InvalidCharacter(*c)); + } + } + Ok(Term(T::Str(s.as_bytes()))) +} + +/// Include a raw nettext value +/// +/// ``` +/// use nettext::enc::*; +/// +/// assert_eq!(encode(&raw(b"Hello { a = b, c = d} .").unwrap()), b"Hello { a = b, c = d} ."); +/// ``` +pub fn raw(bytes: &[u8]) -> Result, Error> { + if decode(bytes).is_err() { + return Err(Error::InvalidRaw); + } + Ok(Term(T::Str(bytes))) +} + +/// Encode a list of items +/// +/// ``` +/// use nettext::enc::*; +/// +/// assert_eq!(encode(&list([ +/// string("Hello").unwrap(), +/// string("world").unwrap() +/// ])), b"Hello world"); +/// ``` +pub fn list<'a, I: IntoIterator>>(terms: I) -> Term<'a> { + Term(T::List(terms.into_iter().map(|x| x.0).collect())) +} + +/// Encode a dictionary of key/value pairs +/// +/// ``` +/// use nettext::enc::*; +/// +/// assert_eq!(encode(&dict([ +/// ("a", string("Hello").unwrap()), +/// ("b", string("world").unwrap()) +/// ])), b"{\n a = Hello,\n b = world,\n}"); +/// ``` +pub fn dict<'a, I: IntoIterator)>>(pairs: I) -> Term<'a> { + Term(T::Dict( + pairs + .into_iter() + .map(|(k, v)| (k.as_bytes(), v.0)) + .collect(), + )) +} + +impl<'a> Term<'a> { + pub fn push(self, t: Term<'a>) -> Term<'a> { + match self.0 { + T::List(mut v) => { + v.push(t.0); + Term(T::List(v)) + } + x => Term(T::List(vec![x, t.0])), + } + } + + pub fn insert(self, k: &'a str, v: Term<'a>) -> Result, Error> { + match self.0 { + T::Dict(mut d) => { + d.insert(k.as_bytes(), v.0); + Ok(Term(T::Dict(d))) + } + _ => Err(Error::NotADictionnary), + } + } +} + +// 
---- encoding function ---- + +pub fn encode<'a>(t: &Term<'a>) -> Vec { + let mut buf = Vec::with_capacity(128); + encode_aux(&mut buf, &t.0, 0); + buf +} + +fn encode_aux<'a>(buf: &mut Vec, term: &T<'a>, indent: usize) { + match term { + T::Str(s) => buf.extend_from_slice(s), + T::OwnedStr(s) => buf.extend_from_slice(&s), + T::Dict(d) => { + buf.extend_from_slice(b"{\n"); + let indent2 = indent + 2; + let mut keys = d.keys().collect::>(); + keys.sort(); + for k in keys { + let v = d.get(k).unwrap(); + for _ in 0..indent2 { + buf.push(b' '); + } + buf.extend_from_slice(k); + buf.extend_from_slice(b" = "); + encode_aux(buf, v, indent2); + buf.extend_from_slice(b",\n"); + } + for _ in 0..indent { + buf.push(b' '); + } + buf.push(b'}'); + } + T::List(l) => { + let indent2 = indent + 2; + for (i, v) in l.iter().enumerate() { + if buf.iter().rev().take_while(|c| **c != b'\n').count() > 80 { + buf.push(b'\n'); + for _ in 0..indent2 { + buf.push(b' '); + } + } else if i > 0 { + buf.push(b' '); + } + encode_aux(buf, v, indent2); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn complex1() { + let input = list([ + string("HELLO").unwrap(), + string("alexhelloworld").unwrap(), + dict([ + ("from", string("jxx").unwrap()), + ("subject", string("hello").unwrap()), + ("data", raw(b"{ f1 = plop, f2 = kuko }").unwrap()), + ]), + ]); + let expected = b"HELLO alexhelloworld { + data = { f1 = plop, f2 = kuko }, + from = jxx, + subject = hello, + }"; + let enc = encode(&input); + eprintln!("{}", std::str::from_utf8(&enc).unwrap()); + eprintln!("{}", std::str::from_utf8(&expected[..]).unwrap()); + assert_eq!(encode(&input), expected); + } +} diff --git a/src/lib.rs b/src/lib.rs index 7ceabba..d9f2167 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,19 @@ pub mod crypto; pub mod dec; +pub mod enc; + +// ---- syntactic elements of the data format ---- + +pub(crate) const DICT_OPEN: u8 = b'{'; +pub(crate) const DICT_CLOSE: u8 = b'}'; +pub(crate) const 
DICT_ASSIGN: u8 = b'='; +pub(crate) const DICT_DELIM: u8 = b','; +pub(crate) const STR_EXTRA_CHARS: &[u8] = b"._-*?"; + +pub(crate) fn is_string_char(c: u8) -> bool { + c.is_ascii_alphanumeric() || STR_EXTRA_CHARS.contains(&c) +} + +pub(crate) fn is_whitespace(c: u8) -> bool { + c.is_ascii_whitespace() +}