commit 911da57d74e4522f072ad8f2f9701b49806dfdbc Author: Alex Auvolat Date: Thu Nov 17 11:48:43 2022 +0100 First commit including a decoder and a readme file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4fffb2f --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +/Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..f504e07 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "nettext" +description = "A text-based data format for cryptographic network protocols" +authors = ["Alex Auvolat "] +version = "0.1.0" +edition = "2021" +license = "AGPL-3.0" +readme = "README.md" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +nom = "7.1" diff --git a/README.md b/README.md new file mode 100644 index 0000000..a0a532d --- /dev/null +++ b/README.md @@ -0,0 +1,74 @@ +# NetText + +A text-based data format for cryptographic network protocols. + +## Principles + +- Only uses a limited subset of ASCII characters +- Has a minimal set of fundamental data types +- Retains the raw representation of complex data structures for hashing and cryptographic signing +- Minimal value data type: a string type that can only be used to represent identifiers, numbers and base64-encoded byte strings. 
+ +## Fundamental types + +A term can be of any of the following kinds: + +- a string, which may contain only ASCII alphanumeric characters and `.-_*?` +- a dict, which maps strings (as defined above) to any term type +- a list, which is a consecutive sequence of at least 2 strings or dicts (can be mixed), simply separated by whitespace + +Dicts are represented as follows: + +``` +{ + key1 = value1, + key2 = value2 +} +``` + +Lists are represented as follows: + +``` +term1 term2 term3 +``` + +As a consequence, complex data structures can be defined as follows: + +``` +SENDTO alex { + topic = blah, + body = blah blah +} +``` + +The raw representation of a parsed dict or list is retained for hashing purposes. +It is the sequence of bytes, in the encoded string, trimmed of whitespace at both ends, +that represents the encoded dict or list in that string. + +In the complex example above, here are the lists and dicts and their raw representations: + +- the toplevel term is a list, whose raw representation is the entire encoded string (assuming no whitespace at beginning or end) +- the third term of the list is a dict, whose raw representation starts at `{` and ends at `}` +- the second mapping of the dict is a list, whose raw representation is exactly `blah blah`. + +Since strings cannot contain whitespace, they are always equivalent to their raw representation. 
+ +## Structural mappings + +Terms can be interpreted in a number of different ways, depending on the context: + +- RAW: the term is interpreted as its raw encoding (see above) +- STRING: if the term is a string or a list composed exclusively of strings, the term is interpreted as its raw encoding +- VARIANT: if the term is a list whose first item is a string, it is interpreted as a variant with the following properties: + - a discriminator (the first item) + - a value, which is either the second item in case there are only two items, or the list composed of all items starting from the second if there are more than two +- DICT: if the term is a dict, interpret it as such +- LIST: if the term is a string or a dict, interpret it as a list composed of that single term. Otherwise, the term is a list, interpret it as a list of terms. + +## Data mappings + +Terms further have mappings as different data types: + +- BYTES: if the term maps as a STRING, decode it using base64 +- INT: if the term maps as a STRING, decode it as an integer written in decimal notation +- HASH, PUBKEY, SECKEY, SIGNATURE, ENCKEY, DECKEY, SYMKEY: a bunch of things that interpret BYTES as specific cryptographic items diff --git a/src/dec/decode.rs b/src/dec/decode.rs new file mode 100644 index 0000000..1e4060b --- /dev/null +++ b/src/dec/decode.rs @@ -0,0 +1,267 @@ +use std::collections::HashMap; + +use nom::{ + branch::alt, + bytes::complete::{tag, take_while, take_while1}, + combinator::{opt, map}, + multi::{separated_list0, separated_list1}, + IResult, + InputLength, +}; + +use crate::dec::{NonListTerm, Term, debug}; + +const DICT_OPEN: &[u8] = b"{"; +const DICT_CLOSE: &[u8] = b"}"; +const DICT_ASSIGN: &[u8] = b"="; +const DICT_DELIM: &[u8] = b","; +const STR_EXTRA_CHARS: &[u8] = b"._-*?"; + +// ---- + +#[derive(Eq, PartialEq)] +pub enum Error<'a> { + Garbage(&'a [u8]), + IncompleteInput, + NomError(&'a [u8], nom::error::ErrorKind), +} + +impl<'a> std::fmt::Debug for Error<'a> { + fn fmt(&self, 
f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> { + match self { + Error::Garbage(g) => write!(f, "Garbage: `{}`", debug(g)), + Error::IncompleteInput => write!(f, "Incomplete input"), + Error::NomError(s, e) => write!(f, "Nom: {:?}, at: `{}`", e, debug(s)), + } + } +} + +pub type Result<'a, T> = std::result::Result>; + +impl<'a> From>> for Error<'a> { + fn from(e: nom::Err>) -> Error<'a> { + match e { + nom::Err::Incomplete(_) => Error::IncompleteInput, + nom::Err::Error(e) | nom::Err::Failure(e) => Error::NomError(e.input, e.code), + } + } +} + +// ---- + +pub fn decode<'a>(input: &'a [u8]) -> Result<'a, Term<'a>> { + let (rest, term) = decode_term(input)?; + let (end, _) = take_while(is_whitespace)(rest)?; + if !end.is_empty() { + return Err(Error::Garbage(end)); + } + Ok(term) +} + +pub fn decode_term<'a>(input: &'a [u8]) -> IResult<&'a [u8], Term<'a>> { + eprintln!("DT: `{}`", debug(input)); + let (start, _) = take_while(is_whitespace)(input)?; + eprintln!("DT2: `{}`", debug(start)); + let (rest, list) = separated_list1(take_while1(is_whitespace), decode_nonlist_term)(start)?; + eprintln!("DT3: `{}`", debug(rest)); + + if list.len() == 1 { + Ok((rest, list.into_iter().next().unwrap().into())) + } else { + let raw_len = start.input_len() - rest.input_len(); + let list_raw = &start[..raw_len]; + Ok((rest, Term::List(list_raw, list))) + } +} + +pub fn decode_nonlist_term<'a>(input: &'a [u8]) -> IResult<&'a [u8], NonListTerm<'a>> { + eprintln!("DNLT: `{}`", debug(input)); + let (rest, term) = alt(( + map(decode_str, NonListTerm::Str), + map(decode_dict, |(raw, d)| NonListTerm::Dict(raw, d)), + ))(input)?; + eprintln!("DNLTend: `{}` {:?}", debug(rest), term); + Ok((rest, term)) +} + +fn decode_str<'a>(input: &'a [u8]) -> IResult<&'a [u8], &'a [u8]> { + eprintln!("DS: `{}`", debug(input)); + let (rest, data) = take_while1(is_string_char)(input)?; + Ok((rest, data)) +} + +type DictType<'a> = (&'a [u8], HashMap<&'a [u8], Term<'a>>); + +fn 
decode_dict<'a>(dict_begin: &'a [u8]) -> IResult<&'a [u8], DictType<'a>> { + eprintln!("DDbegin: `{}`", debug(dict_begin)); + let (d, _) = tag(DICT_OPEN)(dict_begin)?; + eprintln!("DD2: `{}`", debug(d)); + let (d, items) = separated_list0(dict_separator, decode_dict_item)(d)?; + eprintln!("DD3: `{}`", debug(d)); + let (d, _) = opt(dict_separator)(d)?; + let (d, _) = take_while(is_whitespace)(d)?; + eprintln!("DD4: `{}`", debug(d)); + let (dict_end, _) = tag(DICT_CLOSE)(d)?; + eprintln!("DDend: `{}`", debug(dict_end)); + + let dict = items.into_iter().collect::>(); + + let raw_len = dict_begin.input_len() - dict_end.input_len(); + let dict_raw = &dict_begin[..raw_len]; + + Ok((dict_end, (dict_raw, dict))) +} + +fn dict_separator<'a>(d: &'a [u8]) -> IResult<&'a [u8], ()> { + let (d, _) = take_while(is_whitespace)(d)?; + let (d, _) = tag(DICT_DELIM)(d)?; + Ok((d, ())) +} + +fn decode_dict_item<'a>(d: &'a [u8]) -> IResult<&'a [u8], (&'a [u8], Term<'a>)> { + eprintln!("DDI: `{}`", debug(d)); + let (d, _) = take_while(is_whitespace)(d)?; + eprintln!("DDI1: `{}`", debug(d)); + let (d, key) = decode_str(d)?; + eprintln!("DDI2: `{}`", debug(d)); + let (d, _) = take_while(is_whitespace)(d)?; + let (d, _) = tag(DICT_ASSIGN)(d)?; + eprintln!("DDI3: `{}`", debug(d)); + let (d, value) = decode_term(d)?; + eprintln!("DDI4: `{}`", debug(d)); + Ok((d, (key, value))) +} + +fn is_string_char(c: u8) -> bool { + c.is_ascii_alphanumeric() || STR_EXTRA_CHARS.contains(&c) +} + +fn is_whitespace(c: u8) -> bool { + c.is_ascii_whitespace() +} + +// ---- + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn simple_str() { + let bytes = b" plop "; + assert_eq!(decode(bytes), Ok(Term::Str(b"plop"))); + } + + #[test] + fn list_of_str_str() { + let bytes = b" plop plap plip ploup "; + assert_eq!( + decode(bytes), + Ok(Term::List( + b"plop plap plip ploup", + vec![ + NonListTerm::Str(b"plop"), + NonListTerm::Str(b"plap"), + NonListTerm::Str(b"plip"), + NonListTerm::Str(b"ploup"), + ] + )) 
+ ); + } + + #[test] + fn simple_dict() { + let bytes = b" { aze = hello, by = bojzkz pipo, ccde = ke } "; + assert_eq!( + decode(bytes), + Ok(Term::Dict( + b"{ aze = hello, by = bojzkz pipo, ccde = ke }", + [ + (&b"aze"[..], Term::Str(b"hello")), + ( + &b"by"[..], + Term::List( + b"bojzkz pipo", + vec![NonListTerm::Str(b"bojzkz"), NonListTerm::Str(b"pipo")] + ) + ), + (&b"ccde"[..], Term::Str(b"ke")), + ] + .into_iter() + .collect() + )) + ); + } + + #[test] + fn simple_dict_2() { + let bytes = b" { aze = hello, by = bojzkz pipo , ccde = ke , } "; + assert_eq!( + decode(bytes), + Ok(Term::Dict( + b"{ aze = hello, by = bojzkz pipo , ccde = ke , }", + [ + (&b"aze"[..], Term::Str(b"hello")), + ( + &b"by"[..], + Term::List( + b"bojzkz pipo", + vec![NonListTerm::Str(b"bojzkz"), NonListTerm::Str(b"pipo")] + ) + ), + (&b"ccde"[..], Term::Str(b"ke")), + ] + .into_iter() + .collect() + )) + ); + } + + #[test] + fn real_world_1() { + let bytes = b"HEAD alexpubkey"; + assert_eq!( + decode(bytes), + Ok(Term::List( + b"HEAD alexpubkey", + vec![NonListTerm::Str(b"HEAD"), NonListTerm::Str(b"alexpubkey")] + )), + ); + } + + #[test] + fn real_world_2() { + let bytes = b"STANCE sthash stsign { author = alexpubkey, height = 12, parent = parenthash, data = MESSAGE { text = hello } }"; + assert_eq!( + decode(bytes), + Ok(Term::List( + &bytes[..], + vec![ + NonListTerm::Str(b"STANCE"), + NonListTerm::Str(b"sthash"), + NonListTerm::Str(b"stsign"), + NonListTerm::Dict(b"{ author = alexpubkey, height = 12, parent = parenthash, data = MESSAGE { text = hello } }", + [ + (&b"author"[..], Term::Str(b"alexpubkey")), + (&b"height"[..], Term::Str(b"12")), + (&b"parent"[..], Term::Str(b"parenthash")), + (&b"data"[..], Term::List( + b"MESSAGE { text = hello }", + vec![ + NonListTerm::Str(b"MESSAGE"), + NonListTerm::Dict( + b"{ text = hello }", + [ + (&b"text"[..], Term::Str(b"hello")), + ] + .into_iter() + .collect() + ) + ] + )) + ].into_iter().collect() + ), + ])), + ); + } +} diff --git 
a/src/dec/mod.rs b/src/dec/mod.rs new file mode 100644 index 0000000..aa75a9d --- /dev/null +++ b/src/dec/mod.rs @@ -0,0 +1,70 @@ +mod decode; + +use std::collections::HashMap; + +pub use decode::*; + +#[derive(Eq, PartialEq)] +pub enum Term<'a> { + Str(&'a [u8]), + Dict(&'a [u8], HashMap<&'a [u8], Term<'a>>), + List(&'a [u8], Vec>), +} + +#[derive(Eq, PartialEq)] +pub enum NonListTerm<'a> { + Str(&'a [u8]), + Dict(&'a [u8], HashMap<&'a [u8], Term<'a>>), +} + +impl<'a> From> for Term<'a> { + fn from(x: NonListTerm<'a>) -> Term<'a> { + match x { + NonListTerm::Str(s) => Term::Str(s), + NonListTerm::Dict(raw, d) => Term::Dict(raw, d), + } + } +} + +// ---- + +pub fn debug<'a>(x: &'a [u8]) -> &'a str { + std::str::from_utf8(x).unwrap_or("") +} + +impl<'a> std::fmt::Debug for Term<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> { + match self { + Term::Str(s) => write!(f, "Str(`{}`)", debug(s)), + Term::Dict(raw, d) => { + write!(f, "Dict<`{}`", debug(raw))?; + for (k, v) in d.iter() { + write!(f, "\n `{}`={:?}", debug(k), v)?; + } + write!(f, ">") + } + Term::List(raw, l) => { + write!(f, "List[`{}`", debug(raw))?; + for i in l.iter() { + write!(f, "\n {:?}", i)?; + } + write!(f, "]") + } + } + } +} + +impl<'a> std::fmt::Debug for NonListTerm<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> { + match self { + NonListTerm::Str(s) => write!(f, "Str(`{}`)", debug(s)), + NonListTerm::Dict(raw, d) => { + write!(f, "Dict<`{}`", debug(raw))?; + for (k, v) in d.iter() { + write!(f, "\n `{}`={:?}", debug(k), v)?; + } + write!(f, ">") + } + } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..df63aae --- /dev/null +++ b/src/lib.rs @@ -0,0 +1 @@ +pub mod dec;