First commit including a decoder and a readme file
commit 911da57d74
6 changed files with 427 additions and 0 deletions
2  .gitignore  vendored  Normal file
@@ -0,0 +1,2 @@
/target
/Cargo.lock
13  Cargo.toml  Normal file
@@ -0,0 +1,13 @@
[package]
name = "nettext"
description = "A text-based data format for cryptographic network protocols"
authors = ["Alex Auvolat <alex@adnab.me>"]
version = "0.1.0"
edition = "2021"
license = "AGPL-3.0"
readme = "README.md"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
nom = "7.1"
74  README.md  Normal file
@@ -0,0 +1,74 @@
# NetText

A text-based data format for cryptographic network protocols.

## Principles

- Only uses a limited subset of ASCII characters
- Has a minimal set of fundamental data types
- Retains the raw representation of complex data structures for hashing and cryptographic signing
- Minimal value data type: a string type that can only be used to represent identifiers, numbers and base64-encoded byte strings.

## Fundamental types
A term can be of any of the following kinds:

- a string, which may contain only ASCII alphanumeric characters and `.-_*?`
- a dict, which maps strings (as defined above) to any term type
- a list, which is a consecutive sequence of at least 2 strings or dicts (can be mixed), simply separated by whitespace

Dicts are represented as follows:

```
{
  key1 = value1,
  key2 = value2
}
```

Lists are represented as follows:

```
term1 term2 term3
```

As a consequence, complex data structures can be defined as follows:

```
SENDTO alex {
  topic = blah,
  body = blah blah
}
```

The raw representation of a parsed dict or list is retained for hashing purposes.
It is the sequence of bytes, within the encoded string, trimmed of whitespace at both ends,
that represents the encoded dict or list in that string.

In the complex example above, here are the lists and dicts and their raw representations:

- the toplevel term is a list, whose raw representation is the entire encoded string (assuming no whitespace at the beginning or end)
- the third term of the list is a dict, whose raw representation starts at `{` and ends at `}`
- the second mapping of the dict is a list, whose raw representation is exactly `blah blah`.

Since strings cannot contain whitespace, they are always equivalent to their raw representation.
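To make the raw-representation rule concrete, here is a minimal sketch that runs the decoder added in this commit (`nettext::dec::decode`) on the `SENDTO` example above, written on a single line with no surrounding whitespace; the assertions simply restate the points listed above:

```rust
use nettext::dec::{decode, Term};

fn main() {
    // The SENDTO example from above, on one line, with no leading or
    // trailing whitespace.
    let bytes = b"SENDTO alex { topic = blah, body = blah blah }";

    match decode(bytes) {
        // The toplevel term is a list (SENDTO, alex, and the dict);
        // its raw representation is the whole encoded string.
        Ok(Term::List(raw, items)) => {
            assert_eq!(raw, &bytes[..]);
            assert_eq!(items.len(), 3);
        }
        other => panic!("unexpected decoding result: {:?}", other),
    }
}
```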
## Structural mappings

Terms can be interpreted in a number of different ways, depending on the context (a sketch of the VARIANT mapping is given after the list):

- RAW: the term is interpreted as its raw encoding (see above)
- STRING: if the term is a string or a list composed exclusively of strings, the term is interpreted as its raw encoding
- VARIANT: if the term is a list whose first item is a string, it is interpreted as a variant with the following properties:
  - a discriminator (the first item)
  - a value, which is either the second item if there are only two items, or the list composed of all items starting from the second if there are more than two
- DICT: if the term is a dict, interpret it as such
- LIST: if the term is a string or a dict, interpret it as a list composed of that single term. Otherwise, the term is a list; interpret it as a list of terms.
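The mapping functions themselves are not part of this commit; the following is only a hypothetical sketch of how the VARIANT mapping could be expressed against the `Term` and `NonListTerm` types from `src/dec/mod.rs` (the helper name `as_variant` is illustrative):

```rust
use nettext::dec::{NonListTerm, Term};

/// Hypothetical helper (not in this commit): applies the VARIANT mapping.
/// The discriminator is the first item of the list; the returned slice holds
/// the remaining items (a single item when the list has exactly two terms).
fn as_variant<'a, 'b>(term: &'b Term<'a>) -> Option<(&'a [u8], &'b [NonListTerm<'a>])> {
    match term {
        Term::List(_raw, items) => match items.as_slice() {
            [NonListTerm::Str(discriminator), rest @ ..] if !rest.is_empty() => {
                Some((*discriminator, rest))
            }
            _ => None,
        },
        _ => None,
    }
}
```

For the `HEAD alexpubkey` input used in the decoder tests, this would yield the discriminator `HEAD` and a one-item rest.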
## Data mappings

Terms further have mappings as different data types (a sketch of the INT mapping is given after the list):

- BYTES: if the term maps as a STRING, decode it using base64
- INT: if the term maps as a STRING, decode it as an integer written in decimal notation
- HASH, PUBKEY, SECKEY, SIGNATURE, ENCKEY, DECKEY, SYMKEY: a bunch of things that interpret BYTES as specific cryptographic items
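Again as a hypothetical sketch (none of these mappings exist in this commit), an INT mapping over a plain-string term could look like this; it deliberately ignores the list-of-strings case of the STRING mapping:

```rust
use nettext::dec::Term;

/// Hypothetical helper (not in this commit): applies the INT mapping to a
/// plain string term, parsing it as a decimal integer.
fn as_int(term: &Term<'_>) -> Option<i64> {
    match term {
        Term::Str(s) => std::str::from_utf8(s).ok()?.parse::<i64>().ok(),
        _ => None,
    }
}
```

In the `STANCE` test case in `src/dec/decode.rs`, the `height = 12` entry is the kind of string such a mapping would turn into an integer.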
267  src/dec/decode.rs  Normal file
@@ -0,0 +1,267 @@
use std::collections::HashMap;

use nom::{
    branch::alt,
    bytes::complete::{tag, take_while, take_while1},
    combinator::{opt, map},
    multi::{separated_list0, separated_list1},
    IResult,
    InputLength,
};

use crate::dec::{NonListTerm, Term, debug};

const DICT_OPEN: &[u8] = b"{";
const DICT_CLOSE: &[u8] = b"}";
const DICT_ASSIGN: &[u8] = b"=";
const DICT_DELIM: &[u8] = b",";
const STR_EXTRA_CHARS: &[u8] = b"._-*?";

// ----

/// Errors that can be returned by `decode`.
#[derive(Eq, PartialEq)]
pub enum Error<'a> {
    Garbage(&'a [u8]),
    IncompleteInput,
    NomError(&'a [u8], nom::error::ErrorKind),
}

impl<'a> std::fmt::Debug for Error<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
        match self {
            Error::Garbage(g) => write!(f, "Garbage: `{}`", debug(g)),
            Error::IncompleteInput => write!(f, "Incomplete input"),
            Error::NomError(s, e) => write!(f, "Nom: {:?}, at: `{}`", e, debug(s)),
        }
    }
}

pub type Result<'a, T> = std::result::Result<T, Error<'a>>;

impl<'a> From<nom::Err<nom::error::Error<&'a [u8]>>> for Error<'a> {
    fn from(e: nom::Err<nom::error::Error<&'a [u8]>>) -> Error<'a> {
        match e {
            nom::Err::Incomplete(_) => Error::IncompleteInput,
            nom::Err::Error(e) | nom::Err::Failure(e) => Error::NomError(e.input, e.code),
        }
    }
}

// ----

/// Decodes a whole input buffer as a single term; anything other than
/// trailing whitespace left after the term is reported as `Error::Garbage`.
pub fn decode<'a>(input: &'a [u8]) -> Result<'a, Term<'a>> {
    let (rest, term) = decode_term(input)?;
    let (end, _) = take_while(is_whitespace)(rest)?;
    if !end.is_empty() {
        return Err(Error::Garbage(end));
    }
    Ok(term)
}

/// Parses a term: one or more whitespace-separated non-list terms.
/// A single item is returned as-is; several items form a list whose raw
/// representation (without surrounding whitespace) is kept alongside it.
pub fn decode_term<'a>(input: &'a [u8]) -> IResult<&'a [u8], Term<'a>> {
    eprintln!("DT: `{}`", debug(input));
    let (start, _) = take_while(is_whitespace)(input)?;
    eprintln!("DT2: `{}`", debug(start));
    let (rest, list) = separated_list1(take_while1(is_whitespace), decode_nonlist_term)(start)?;
    eprintln!("DT3: `{}`", debug(rest));

    if list.len() == 1 {
        Ok((rest, list.into_iter().next().unwrap().into()))
    } else {
        let raw_len = start.input_len() - rest.input_len();
        let list_raw = &start[..raw_len];
        Ok((rest, Term::List(list_raw, list)))
    }
}

/// Parses a term that is not a list: either a string or a dict.
pub fn decode_nonlist_term<'a>(input: &'a [u8]) -> IResult<&'a [u8], NonListTerm<'a>> {
    eprintln!("DNLT: `{}`", debug(input));
    let (rest, term) = alt((
        map(decode_str, NonListTerm::Str),
        map(decode_dict, |(raw, d)| NonListTerm::Dict(raw, d)),
    ))(input)?;
    eprintln!("DNLTend: `{}` {:?}", debug(rest), term);
    Ok((rest, term))
}

fn decode_str<'a>(input: &'a [u8]) -> IResult<&'a [u8], &'a [u8]> {
    eprintln!("DS: `{}`", debug(input));
    let (rest, data) = take_while1(is_string_char)(input)?;
    Ok((rest, data))
}

type DictType<'a> = (&'a [u8], HashMap<&'a [u8], Term<'a>>);

/// Parses a `{ key = value, ... }` dict (a trailing comma is allowed) and
/// returns its raw representation together with the key/value map.
fn decode_dict<'a>(dict_begin: &'a [u8]) -> IResult<&'a [u8], DictType<'a>> {
    eprintln!("DDbegin: `{}`", debug(dict_begin));
    let (d, _) = tag(DICT_OPEN)(dict_begin)?;
    eprintln!("DD2: `{}`", debug(d));
    let (d, items) = separated_list0(dict_separator, decode_dict_item)(d)?;
    eprintln!("DD3: `{}`", debug(d));
    let (d, _) = opt(dict_separator)(d)?;
    let (d, _) = take_while(is_whitespace)(d)?;
    eprintln!("DD4: `{}`", debug(d));
    let (dict_end, _) = tag(DICT_CLOSE)(d)?;
    eprintln!("DDend: `{}`", debug(dict_end));

    let dict = items.into_iter().collect::<HashMap<_, _>>();

    let raw_len = dict_begin.input_len() - dict_end.input_len();
    let dict_raw = &dict_begin[..raw_len];

    Ok((dict_end, (dict_raw, dict)))
}

fn dict_separator<'a>(d: &'a [u8]) -> IResult<&'a [u8], ()> {
    let (d, _) = take_while(is_whitespace)(d)?;
    let (d, _) = tag(DICT_DELIM)(d)?;
    Ok((d, ()))
}

fn decode_dict_item<'a>(d: &'a [u8]) -> IResult<&'a [u8], (&'a [u8], Term<'a>)> {
    eprintln!("DDI: `{}`", debug(d));
    let (d, _) = take_while(is_whitespace)(d)?;
    eprintln!("DDI1: `{}`", debug(d));
    let (d, key) = decode_str(d)?;
    eprintln!("DDI2: `{}`", debug(d));
    let (d, _) = take_while(is_whitespace)(d)?;
    let (d, _) = tag(DICT_ASSIGN)(d)?;
    eprintln!("DDI3: `{}`", debug(d));
    let (d, value) = decode_term(d)?;
    eprintln!("DDI4: `{}`", debug(d));
    Ok((d, (key, value)))
}

fn is_string_char(c: u8) -> bool {
    c.is_ascii_alphanumeric() || STR_EXTRA_CHARS.contains(&c)
}

fn is_whitespace(c: u8) -> bool {
    c.is_ascii_whitespace()
}

// ----

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn simple_str() {
        let bytes = b" plop ";
        assert_eq!(decode(bytes), Ok(Term::Str(b"plop")));
    }

    #[test]
    fn list_of_str_str() {
        let bytes = b" plop plap plip ploup ";
        assert_eq!(
            decode(bytes),
            Ok(Term::List(
                b"plop plap plip ploup",
                vec![
                    NonListTerm::Str(b"plop"),
                    NonListTerm::Str(b"plap"),
                    NonListTerm::Str(b"plip"),
                    NonListTerm::Str(b"ploup"),
                ]
            ))
        );
    }

    #[test]
    fn simple_dict() {
        let bytes = b" { aze = hello, by = bojzkz pipo, ccde = ke } ";
        assert_eq!(
            decode(bytes),
            Ok(Term::Dict(
                b"{ aze = hello, by = bojzkz pipo, ccde = ke }",
                [
                    (&b"aze"[..], Term::Str(b"hello")),
                    (
                        &b"by"[..],
                        Term::List(
                            b"bojzkz pipo",
                            vec![NonListTerm::Str(b"bojzkz"), NonListTerm::Str(b"pipo")]
                        )
                    ),
                    (&b"ccde"[..], Term::Str(b"ke")),
                ]
                .into_iter()
                .collect()
            ))
        );
    }

    #[test]
    fn simple_dict_2() {
        let bytes = b" { aze = hello, by = bojzkz pipo , ccde = ke , } ";
        assert_eq!(
            decode(bytes),
            Ok(Term::Dict(
                b"{ aze = hello, by = bojzkz pipo , ccde = ke , }",
                [
                    (&b"aze"[..], Term::Str(b"hello")),
                    (
                        &b"by"[..],
                        Term::List(
                            b"bojzkz pipo",
                            vec![NonListTerm::Str(b"bojzkz"), NonListTerm::Str(b"pipo")]
                        )
                    ),
                    (&b"ccde"[..], Term::Str(b"ke")),
                ]
                .into_iter()
                .collect()
            ))
        );
    }

    #[test]
    fn real_world_1() {
        let bytes = b"HEAD alexpubkey";
        assert_eq!(
            decode(bytes),
            Ok(Term::List(
                b"HEAD alexpubkey",
                vec![NonListTerm::Str(b"HEAD"), NonListTerm::Str(b"alexpubkey")]
            )),
        );
    }

    #[test]
    fn real_world_2() {
        let bytes = b"STANCE sthash stsign { author = alexpubkey, height = 12, parent = parenthash, data = MESSAGE { text = hello } }";
        assert_eq!(
            decode(bytes),
            Ok(Term::List(
                &bytes[..],
                vec![
                    NonListTerm::Str(b"STANCE"),
                    NonListTerm::Str(b"sthash"),
                    NonListTerm::Str(b"stsign"),
                    NonListTerm::Dict(
                        b"{ author = alexpubkey, height = 12, parent = parenthash, data = MESSAGE { text = hello } }",
                        [
                            (&b"author"[..], Term::Str(b"alexpubkey")),
                            (&b"height"[..], Term::Str(b"12")),
                            (&b"parent"[..], Term::Str(b"parenthash")),
                            (
                                &b"data"[..],
                                Term::List(
                                    b"MESSAGE { text = hello }",
                                    vec![
                                        NonListTerm::Str(b"MESSAGE"),
                                        NonListTerm::Dict(
                                            b"{ text = hello }",
                                            [(&b"text"[..], Term::Str(b"hello"))]
                                                .into_iter()
                                                .collect()
                                        )
                                    ]
                                )
                            ),
                        ]
                        .into_iter()
                        .collect()
                    ),
                ]
            )),
        );
    }
}
70  src/dec/mod.rs  Normal file
@@ -0,0 +1,70 @@
mod decode;

use std::collections::HashMap;

pub use decode::*;

/// A decoded nettext term. Dicts and lists keep a reference to their raw
/// encoding so that it can later be hashed or signed.
#[derive(Eq, PartialEq)]
pub enum Term<'a> {
    Str(&'a [u8]),
    Dict(&'a [u8], HashMap<&'a [u8], Term<'a>>),
    List(&'a [u8], Vec<NonListTerm<'a>>),
}

/// A term that cannot itself be a list; lists are sequences of these.
#[derive(Eq, PartialEq)]
pub enum NonListTerm<'a> {
    Str(&'a [u8]),
    Dict(&'a [u8], HashMap<&'a [u8], Term<'a>>),
}

impl<'a> From<NonListTerm<'a>> for Term<'a> {
    fn from(x: NonListTerm<'a>) -> Term<'a> {
        match x {
            NonListTerm::Str(s) => Term::Str(s),
            NonListTerm::Dict(raw, d) => Term::Dict(raw, d),
        }
    }
}

// ----

/// Best-effort display of a byte slice as UTF-8, for debug output.
pub fn debug<'a>(x: &'a [u8]) -> &'a str {
    std::str::from_utf8(x).unwrap_or("<invalid ascii>")
}

impl<'a> std::fmt::Debug for Term<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
        match self {
            Term::Str(s) => write!(f, "Str(`{}`)", debug(s)),
            Term::Dict(raw, d) => {
                write!(f, "Dict<`{}`", debug(raw))?;
                for (k, v) in d.iter() {
                    write!(f, "\n `{}`={:?}", debug(k), v)?;
                }
                write!(f, ">")
            }
            Term::List(raw, l) => {
                write!(f, "List[`{}`", debug(raw))?;
                for i in l.iter() {
                    write!(f, "\n {:?}", i)?;
                }
                write!(f, "]")
            }
        }
    }
}

impl<'a> std::fmt::Debug for NonListTerm<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
        match self {
            NonListTerm::Str(s) => write!(f, "Str(`{}`)", debug(s)),
            NonListTerm::Dict(raw, d) => {
                write!(f, "Dict<`{}`", debug(raw))?;
                for (k, v) in d.iter() {
                    write!(f, "\n `{}`={:?}", debug(k), v)?;
                }
                write!(f, ">")
            }
        }
    }
}
1  src/lib.rs  Normal file
@@ -0,0 +1 @@
pub mod dec;