First commit including a decoder and a readme file

2022-11-17 11:48:43 +01:00 · 2022-11-17 11:48:43 +01:00 · 911da57d74
commit 911da57d74
6 changed files with 427 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+/target
+/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,13 @@
+[package]
+name = "nettext"
+description = "A text-based data format for cryptographic network protocols"
+authors = ["Alex Auvolat <alex@adnab.me>"]
+version = "0.1.0"
+edition = "2021"
+license = "AGPL-3.0"
+readme = "README.md"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+nom = "7.1"
--- a/README.md
+++ b/README.md
@ -0,0 +1,74 @@
+# NetText
+
+A text-based data format for cryptographic network protocols.
+
+## Principles
+
+- Only uses a limited subset of ASCII characters
+- Has a minimal set of fundamental data types
+- Retains the raw representation of complex data structures for hashing and cryptographic signing
+- Minimal value data type: a string type that can only be used to represent identifiers, numbers and base64-encoded byte strings.
+
+## Fundamental types
+
+A term can be of any of the following kinds:
+
+- a string, which may contain only ASCII alphanumeric characters and `.-_*?`
+- a dict, which maps strings (as defined above) to any term type
+- a list, which is a consecutive sequence of at least 2 strings or dicts (can be mixed), simply separated by whitespace
+
+Dicts are represented as follows:
+
+```
+{
+    key1 = value1,
+    key2 = value2
+}
+```
+
+Lists are represented as follows:
+
+```
+term1 term2 term3
+```
+
+As a consequence, complex data structures can be defined as follows:
+
+```
+SENDTO alex {
+    topic = blah,
+    body = blah blah
+}
+```
+
+The raw representation of a parsed dict or list is retained for hashing purposes.
+It is the sequence of bytes in the encoded string that represents the encoded dict or list,
+trimmed of whitespace at both extremities.
+
+In the complex example above, here are the lists and dicts and their raw representation:
+
+- the toplevel term is a list, whose raw representation is the entire encoded string (assuming no whitespace at beginning or end)
+- the third term of the list is a dict, whose raw representation starts at `{` and ends at `}`
+- the second mapping of the dict is a list, whose raw representation is exactly `blah blah`.
+
+Since strings cannot contain whitespace, they are always equivalent to their raw representation.
+
+## Structural mappings
+
+Terms can be interpreted in a number of different ways, depending on the context:
+
+- RAW: the term is interpreted as its raw encoding (see above)
+- STRING: if the term is a string or a list composed exclusively of strings, the term is interpreted as its raw encoding
+- VARIANT: if the term is a list whose first item is a string, it is interpreted as a variant with the following properties:
+  - a discriminator (the first item)
+  - a value, which is either the second item in case there are only two items, or the list composed of all items starting from the second if there are more than two
+- DICT: if the term is a dict, interpret it as such
+- LIST: if the term is a string or a dict, interpret it as a list composed of that single term. Otherwise, the term is a list, interpret it as a list of terms.
+
+## Data mappings
+
+Terms further have mappings as different data types:
+
+- BYTES: if the term maps as a STRING, decode it using base64
+- INT: if the term maps as a STRING, decode it as an integer written in decimal notation
+- HASH, PUBKEY, SECKEY, SIGNATURE, ENCKEY, DECKEY, SYMKEY: a bunch of things that interpret BYTES as specific cryptographic items
--- a/src/dec/decode.rs
+++ b/src/dec/decode.rs
@ -0,0 +1,267 @@
+use std::collections::HashMap;
+
+use nom::{
+    branch::alt,
+    bytes::complete::{tag, take_while, take_while1},
+    combinator::{opt, map},
+    multi::{separated_list0, separated_list1},
+    IResult,
+	InputLength,
+};
+
+use crate::dec::{NonListTerm, Term, debug};
+
+const DICT_OPEN: &[u8] = b"{"; // byte sequence that opens a dict
+const DICT_CLOSE: &[u8] = b"}"; // byte sequence that closes a dict
+const DICT_ASSIGN: &[u8] = b"="; // separates a dict key from its value
+const DICT_DELIM: &[u8] = b","; // separates consecutive dict items
+const STR_EXTRA_CHARS: &[u8] = b"._-*?"; // non-alphanumeric bytes accepted by `is_string_char`
+
+// ----
+
+#[derive(Eq, PartialEq)]
+pub enum Error<'a> { // errors returned by `decode`
+    Garbage(&'a [u8]), // bytes left over after the decoded term and the whitespace following it
+    IncompleteInput, // nom reported `Err::Incomplete`
+    NomError(&'a [u8], nom::error::ErrorKind), // `input` and `code` of a nom `Error` or `Failure`
+}
+
+impl<'a> std::fmt::Debug for Error<'a> {
+    /// Writes a human-readable description of the error; byte slices are
+    /// rendered through `debug`.
+    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::IncompleteInput => out.write_str("Incomplete input"),
+            Self::Garbage(bytes) => write!(out, "Garbage: `{}`", debug(bytes)),
+            Self::NomError(at, kind) => write!(out, "Nom: {:?}, at: `{}`", kind, debug(at)),
+        }
+    }
+}
+
+pub type Result<'a, T> = std::result::Result<T, Error<'a>>; // `std` result specialised to this module's `Error`
+
+impl<'a> From<nom::Err<nom::error::Error<&'a [u8]>>> for Error<'a> {
+    /// Converts a nom error into this module's `Error`.
+    ///
+    /// `Incomplete` becomes `IncompleteInput`; both `Error` and `Failure`
+    /// become `NomError`, carrying the nom error's input and error kind.
+    fn from(err: nom::Err<nom::error::Error<&'a [u8]>>) -> Self {
+        let inner = match err {
+            nom::Err::Incomplete(_) => return Error::IncompleteInput,
+            nom::Err::Error(inner) => inner,
+            nom::Err::Failure(inner) => inner,
+        };
+        Error::NomError(inner.input, inner.code)
+    }
+}
+
+// ----
+
+/// Decodes a complete term from `input`.
+///
+/// Whitespace following the term is skipped; any bytes remaining after that
+/// are reported as `Error::Garbage`.
+pub fn decode<'a>(input: &'a [u8]) -> Result<'a, Term<'a>> {
+    let (after_term, term) = decode_term(input)?;
+    let (leftover, _) = take_while(is_whitespace)(after_term)?;
+    if leftover.is_empty() {
+        Ok(term)
+    } else {
+        Err(Error::Garbage(leftover))
+    }
+}
+
+/// Parses one term from the start of `input`, skipping leading whitespace.
+///
+/// One or more non-list terms separated by whitespace are read. A single item
+/// is returned as itself, converted into a `Term`. Several items form a
+/// `Term::List` whose raw slice starts at the first item and ends where the
+/// list parser stopped (whitespace after the last item is not included).
+///
+/// Returns the unconsumed input together with the parsed term.
+pub fn decode_term<'a>(input: &'a [u8]) -> IResult<&'a [u8], Term<'a>> {
+    let (start, _) = take_while(is_whitespace)(input)?;
+    let (rest, mut items) =
+        separated_list1(take_while1(is_whitespace), decode_nonlist_term)(start)?;
+
+    if items.len() == 1 {
+        // A lone item is not wrapped in a list.
+        let only = items.pop().expect("length was just checked to be 1");
+        return Ok((rest, only.into()));
+    }
+
+    // The raw representation is exactly the bytes consumed after the leading whitespace.
+    let raw_len = start.input_len() - rest.input_len();
+    Ok((rest, Term::List(&start[..raw_len], items)))
+}
+
+/// Parses a single non-list term: either a string or a dict.
+///
+/// The string parser is tried first, then the dict parser. Leading whitespace
+/// is not skipped here.
+pub fn decode_nonlist_term<'a>(input: &'a [u8]) -> IResult<&'a [u8], NonListTerm<'a>> {
+    alt((
+        map(decode_str, NonListTerm::Str),
+        map(decode_dict, |(raw, d)| NonListTerm::Dict(raw, d)),
+    ))(input)
+}
+
+/// Parses a string: one or more bytes accepted by `is_string_char`.
+///
+/// Returns the unconsumed input together with the matched bytes.
+fn decode_str<'a>(input: &'a [u8]) -> IResult<&'a [u8], &'a [u8]> {
+    take_while1(is_string_char)(input)
+}
+
+type DictType<'a> = (&'a [u8], HashMap<&'a [u8], Term<'a>>); // (raw bytes of the dict, parsed key/value map), as returned by `decode_dict`
+
+/// Parses a dict: `{`, zero or more `key = value` items separated by `,`,
+/// an optional trailing `,`, optional whitespace, then `}`.
+///
+/// Returns the unconsumed input together with the raw bytes of the dict
+/// (from `{` to `}` inclusive) and the parsed key/value map. The map is built
+/// with `collect`, so when a key appears more than once only one of its
+/// values is kept.
+fn decode_dict<'a>(dict_begin: &'a [u8]) -> IResult<&'a [u8], DictType<'a>> {
+    let (d, _) = tag(DICT_OPEN)(dict_begin)?;
+    let (d, items) = separated_list0(dict_separator, decode_dict_item)(d)?;
+    let (d, _) = opt(dict_separator)(d)?;
+    let (d, _) = take_while(is_whitespace)(d)?;
+    let (dict_end, _) = tag(DICT_CLOSE)(d)?;
+
+    let dict = items.into_iter().collect::<HashMap<_, _>>();
+
+    // The raw representation is exactly the bytes consumed by this parser.
+    let raw_len = dict_begin.input_len() - dict_end.input_len();
+    let dict_raw = &dict_begin[..raw_len];
+
+    Ok((dict_end, (dict_raw, dict)))
+}
+
+/// Consumes optional whitespace followed by the dict item delimiter.
+///
+/// Returns the unconsumed input; the parsed value is the unit type.
+fn dict_separator<'a>(d: &'a [u8]) -> IResult<&'a [u8], ()> {
+    let (after_ws, _) = take_while(is_whitespace)(d)?;
+    tag(DICT_DELIM)(after_ws).map(|(after_delim, _)| (after_delim, ()))
+}
+
+/// Parses one dict item of the form `key = value`.
+///
+/// Whitespace is accepted before the key and before the assignment sign. The
+/// key is parsed by `decode_str` and the value by `decode_term`, which skips
+/// its own leading whitespace.
+///
+/// Returns the unconsumed input together with the `(key, value)` pair.
+fn decode_dict_item<'a>(d: &'a [u8]) -> IResult<&'a [u8], (&'a [u8], Term<'a>)> {
+    let (d, _) = take_while(is_whitespace)(d)?;
+    let (d, key) = decode_str(d)?;
+    let (d, _) = take_while(is_whitespace)(d)?;
+    let (d, _) = tag(DICT_ASSIGN)(d)?;
+    let (d, value) = decode_term(d)?;
+    Ok((d, (key, value)))
+}
+
+/// Tells whether `c` may appear in a string term: ASCII letters, ASCII
+/// digits, and the bytes listed in `STR_EXTRA_CHARS`.
+fn is_string_char(c: u8) -> bool {
+    match c {
+        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' => true,
+        other => STR_EXTRA_CHARS.iter().any(|&extra| extra == other),
+    }
+}
+
+/// Tells whether `c` is an ASCII whitespace byte: space, horizontal tab,
+/// line feed, form feed or carriage return.
+fn is_whitespace(c: u8) -> bool {
+    matches!(c, b' ' | b'\t' | b'\n' | b'\x0C' | b'\r')
+}
+
+// ----
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // A lone string surrounded by whitespace decodes to `Term::Str`.
+    #[test]
+    fn simple_str() {
+        let bytes = b" plop ";
+        assert_eq!(decode(bytes), Ok(Term::Str(b"plop")));
+    }
+
+    // Several strings form a list whose raw bytes exclude the outer whitespace.
+    #[test]
+    fn list_of_str_str() {
+        let bytes = b" plop plap  plip   ploup   ";
+        assert_eq!(
+            decode(bytes),
+            Ok(Term::List(
+                b"plop plap  plip   ploup",
+                vec![
+                    NonListTerm::Str(b"plop"),
+                    NonListTerm::Str(b"plap"),
+                    NonListTerm::Str(b"plip"),
+                    NonListTerm::Str(b"ploup"),
+                ]
+            ))
+        );
+    }
+
+    // A dict whose values are a string, a list and a string.
+    #[test]
+    fn simple_dict() {
+        let bytes = b" { aze = hello, by = bojzkz  pipo,    ccde = ke } ";
+        assert_eq!(
+            decode(bytes),
+            Ok(Term::Dict(
+                b"{ aze = hello, by = bojzkz  pipo,    ccde = ke }",
+                [
+                    (&b"aze"[..], Term::Str(b"hello")),
+                    (
+                        &b"by"[..],
+                        Term::List(
+                            b"bojzkz  pipo",
+                            vec![NonListTerm::Str(b"bojzkz"), NonListTerm::Str(b"pipo")]
+                        )
+                    ),
+                    (&b"ccde"[..], Term::Str(b"ke")),
+                ]
+                .into_iter()
+                .collect()
+            ))
+        );
+    }
+
+    // Same dict with whitespace before the delimiters and a trailing delimiter.
+    #[test]
+    fn simple_dict_2() {
+        let bytes = b" { aze = hello, by = bojzkz  pipo  ,    ccde = ke  , } ";
+        assert_eq!(
+            decode(bytes),
+            Ok(Term::Dict(
+                b"{ aze = hello, by = bojzkz  pipo  ,    ccde = ke  , }",
+                [
+                    (&b"aze"[..], Term::Str(b"hello")),
+                    (
+                        &b"by"[..],
+                        Term::List(
+                            b"bojzkz  pipo",
+                            vec![NonListTerm::Str(b"bojzkz"), NonListTerm::Str(b"pipo")]
+                        )
+                    ),
+                    (&b"ccde"[..], Term::Str(b"ke")),
+                ]
+                .into_iter()
+                .collect()
+            ))
+        );
+    }
+
+    // An empty dict decodes to a dict term with no entries.
+    #[test]
+    fn empty_dict() {
+        let bytes = b" {} ";
+        assert_eq!(decode(bytes), Ok(Term::Dict(b"{}", HashMap::new())));
+    }
+
+    // Bytes that cannot continue the term are reported as garbage.
+    #[test]
+    fn trailing_garbage() {
+        let bytes = b"plop }";
+        assert_eq!(decode(bytes), Err(Error::Garbage(b"}")));
+    }
+
+    #[test]
+    fn real_world_1() {
+        let bytes = b"HEAD alexpubkey";
+        assert_eq!(
+            decode(bytes),
+            Ok(Term::List(
+                b"HEAD alexpubkey",
+                vec![NonListTerm::Str(b"HEAD"), NonListTerm::Str(b"alexpubkey")]
+            )),
+        );
+    }
+
+    // A list ending in a dict, one of whose values is itself a list ending in a dict.
+    #[test]
+    fn real_world_2() {
+        let bytes = b"STANCE sthash stsign { author = alexpubkey, height = 12, parent = parenthash, data = MESSAGE { text = hello } }";
+        assert_eq!(
+            decode(bytes),
+            Ok(Term::List(
+                &bytes[..],
+                vec![
+                    NonListTerm::Str(b"STANCE"),
+                    NonListTerm::Str(b"sthash"),
+                    NonListTerm::Str(b"stsign"),
+                    NonListTerm::Dict(
+                        b"{ author = alexpubkey, height = 12, parent = parenthash, data = MESSAGE { text = hello } }",
+                        [
+                            (&b"author"[..], Term::Str(b"alexpubkey")),
+                            (&b"height"[..], Term::Str(b"12")),
+                            (&b"parent"[..], Term::Str(b"parenthash")),
+                            (
+                                &b"data"[..],
+                                Term::List(
+                                    b"MESSAGE { text = hello }",
+                                    vec![
+                                        NonListTerm::Str(b"MESSAGE"),
+                                        NonListTerm::Dict(
+                                            b"{ text = hello }",
+                                            [(&b"text"[..], Term::Str(b"hello"))]
+                                                .into_iter()
+                                                .collect()
+                                        )
+                                    ]
+                                )
+                            ),
+                        ]
+                        .into_iter()
+                        .collect()
+                    ),
+                ]
+            )),
+        );
+    }
+}
--- a/src/dec/mod.rs
+++ b/src/dec/mod.rs
@ -0,0 +1,70 @@
+mod decode;
+
+use std::collections::HashMap;
+
+pub use decode::*;
+
+#[derive(Eq, PartialEq)]
+pub enum Term<'a> { // a decoded term: string, dict or list
+    Str(&'a [u8]), // the bytes of a string term
+    Dict(&'a [u8], HashMap<&'a [u8], Term<'a>>), // raw bytes of the dict, then its key/value map
+    List(&'a [u8], Vec<NonListTerm<'a>>), // raw bytes of the list, then its items
+}
+
+#[derive(Eq, PartialEq)]
+pub enum NonListTerm<'a> { // a term that may be an item of a list: string or dict
+    Str(&'a [u8]), // the bytes of a string term
+    Dict(&'a [u8], HashMap<&'a [u8], Term<'a>>), // raw bytes of the dict, then its key/value map
+}
+
+impl<'a> From<NonListTerm<'a>> for Term<'a> {
+    /// Converts a non-list term into the `Term` variant of the same kind,
+    /// moving its contents unchanged.
+    fn from(x: NonListTerm<'a>) -> Self {
+        match x {
+            NonListTerm::Dict(raw, entries) => Self::Dict(raw, entries),
+            NonListTerm::Str(bytes) => Self::Str(bytes),
+        }
+    }
+}
+
+// ----
+
+/// Views `x` as text for diagnostic output.
+///
+/// Returns the bytes as a `&str` when they are valid UTF-8, and the
+/// placeholder `<invalid ascii>` otherwise. Note that the check is for UTF-8
+/// validity even though the placeholder mentions ASCII.
+pub fn debug<'a>(x: &'a [u8]) -> &'a str {
+    match std::str::from_utf8(x) {
+        Ok(text) => text,
+        Err(_) => "<invalid ascii>",
+    }
+}
+
+impl<'a> std::fmt::Debug for Term<'a> {
+    /// Diagnostic rendering: a string is shown inline; a dict or list is
+    /// shown as its raw bytes followed by one line per entry or item.
+    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Str(bytes) => write!(out, "Str(`{}`)", debug(bytes)),
+            Self::Dict(raw, entries) => {
+                write!(out, "Dict<`{}`", debug(raw))?;
+                entries
+                    .iter()
+                    .try_for_each(|(key, value)| write!(out, "\n  `{}`={:?}", debug(key), value))?;
+                out.write_str(">")
+            }
+            Self::List(raw, items) => {
+                write!(out, "List[`{}`", debug(raw))?;
+                items
+                    .iter()
+                    .try_for_each(|item| write!(out, "\n  {:?}", item))?;
+                out.write_str("]")
+            }
+        }
+    }
+}
+
+impl<'a> std::fmt::Debug for NonListTerm<'a> {
+    /// Diagnostic rendering: a string is shown inline; a dict is shown as its
+    /// raw bytes followed by one line per entry.
+    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Str(bytes) => write!(out, "Str(`{}`)", debug(bytes)),
+            Self::Dict(raw, entries) => {
+                write!(out, "Dict<`{}`", debug(raw))?;
+                entries.iter().try_for_each(|(key, value)| {
+                    write!(out, "\n     `{}`={:?}", debug(key), value)
+                })?;
+                out.write_str(">")
+            }
+        }
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -0,0 +1 @@
+pub mod dec;