From eae4a0443a1f927a988323f146bdd557745daafe Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Thu, 15 Dec 2022 16:47:04 +0100
Subject: [PATCH] Switch64 encoding available in serde encoder (TODO decoder)

---
 Cargo.toml       |   3 +-
 src/enc/mod.rs   |  99 +++++++++++++++++++++++------------
 src/lib.rs       |  37 ++++++++++++-
 src/serde/de.rs  |   2 +-
 src/serde/mod.rs |  12 +++++
 src/serde/ser.rs |  53 +++++++++++++------
 src/switch64.rs  | 131 +++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 285 insertions(+), 52 deletions(-)
 create mode 100644 src/switch64.rs

diff --git a/Cargo.toml b/Cargo.toml
index f1d74c7..5b7ae83 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 name = "nettext"
 description = "A text-based data format for cryptographic network protocols"
 authors = ["Alex Auvolat "]
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"
 license = "AGPL-3.0"
 readme = "README.md"
@@ -12,6 +12,7 @@ readme = "README.md"
 [dependencies]
 nom = "7.1"
 base64 = "0.13"
+hex = "0.4"
 err-derive = "0.3"
 
 dryoc = { version = "0.4", optional = true }
diff --git a/src/enc/mod.rs b/src/enc/mod.rs
index 712027c..5215dca 100644
--- a/src/enc/mod.rs
+++ b/src/enc/mod.rs
@@ -23,7 +23,7 @@ use std::borrow::{Borrow, Cow};
 use std::collections::HashMap;
 
 use crate::dec::{self, decode};
-use crate::{is_string_char, is_whitespace};
+use crate::{is_string_char, is_whitespace, switch64, BytesEncoding};
 
 pub use error::Error;
 
@@ -96,6 +96,71 @@ pub fn raw(bytes: &[u8]) -> Result<'_> {
     Ok(Term(T::Str(bytes)))
 }
 
+/// Term corresponding to a byte slice,
+/// encoded using url-safe base64 without padding.
+/// Since empty strings are not possible in nettext,
+/// an empty byte string is encoded as an empty list (`[]`).
+///
+/// Example:
+///
+/// ```
+/// use nettext::enc::*;
+///
+/// assert_eq!(bytes(b"hello, world!").encode(), b"aGVsbG8sIHdvcmxkIQ");
+/// ```
+pub fn bytes(bytes: &[u8]) -> Term<'static> {
+    bytes_format(bytes, BytesEncoding::Base64 { split: false })
+}
+
+/// Same as `bytes()`, but splits the byte slice into 48-byte chunks
+/// and encodes each chunk separately, putting them in a sequence of terms.
+/// Useful for long byte slices to get a cleaner representation,
+/// mainly for dictionary keys.
+pub fn bytes_split(bytes: &[u8]) -> Term<'static> {
+    bytes_format(bytes, BytesEncoding::Base64 { split: true })
+}
+
+pub fn bytes_format(bytes: &[u8], encoding: BytesEncoding) -> Term<'static> {
+    match encoding {
+        BytesEncoding::Base64 { .. } | BytesEncoding::Hex { .. } if bytes.is_empty() => {
+            Term(T::List(vec![]))
+        }
+        BytesEncoding::Base64 { split: false } => Term(T::OwnedStr(
+            base64::encode_config(bytes, base64::URL_SAFE_NO_PAD).into_bytes(),
+        )),
+        BytesEncoding::Base64 { split: true } => {
+            let chunks = bytes
+                .chunks(48)
+                .map(|b| {
+                    T::OwnedStr(base64::encode_config(b, base64::URL_SAFE_NO_PAD).into_bytes())
+                })
+                .collect::<Vec<_>>();
+            if chunks.len() > 1 {
+                Term(T::Seq(chunks))
+            } else {
+                Term(chunks.into_iter().next().unwrap())
+            }
+        }
+        BytesEncoding::Hex { split: false } => Term(T::OwnedStr(hex::encode(bytes).into_bytes())),
+        BytesEncoding::Hex { split: true } => {
+            let chunks = bytes
+                .chunks(32)
+                .map(|b| T::OwnedStr(hex::encode(b).into_bytes()))
+                .collect::<Vec<_>>();
+            if chunks.len() > 1 {
+                Term(T::Seq(chunks))
+            } else {
+                Term(chunks.into_iter().next().unwrap())
+            }
+        }
+        BytesEncoding::Switch64 { allow_whitespace } => {
+            Term(T::OwnedStr(switch64::encode(bytes, allow_whitespace)))
+        }
+    }
+}
+
+// ---- composed terms -----
+
 /// Term corresponding to a sequence of terms. Subsequences are banned and will raise an error.
 ///
 /// ```
@@ -164,38 +229,6 @@ pub fn dict<'a, I: IntoIterator<Item = (&'a str, Term<'a>)>>(pairs: I) -> Result<'a> {
     Ok(Term(T::Dict(tmp)))
 }
 
-/// Term corresponding to a byte slice,
-/// encoding using base64 url-safe encoding without padding
-///
-/// Example:
-///
-/// ```
-/// use nettext::enc::*;
-///
-/// assert_eq!(bytes(b"hello, world!").encode(), b"aGVsbG8sIHdvcmxkIQ");
-/// ```
-pub fn bytes(bytes: &[u8]) -> Term<'static> {
-    Term(T::OwnedStr(
-        base64::encode_config(bytes, base64::URL_SAFE_NO_PAD).into_bytes(),
-    ))
-}
-
-/// Same as `bytes()`, but splits the byte slice in 48-byte chunks
-/// and encodes each chunk separately, putting them in a sequence of terms.
-/// Usefull for long byte slices to have cleaner representations,
-/// mainly usefull for dictionnary keys.
-pub fn bytes_split(bytes: &[u8]) -> Term<'static> {
-    let chunks = bytes
-        .chunks(48)
-        .map(|b| T::OwnedStr(base64::encode_config(b, base64::URL_SAFE_NO_PAD).into_bytes()))
-        .collect::<Vec<_>>();
-    if chunks.len() > 1 {
-        Term(T::Seq(chunks))
-    } else {
-        Term(chunks.into_iter().next().unwrap_or(T::Str(b".")))
-    }
-}
-
 impl<'a> Term<'a> {
     /// Append a term to an existing term.
     /// Transforms the initial term into a seq if necessary.
diff --git a/src/lib.rs b/src/lib.rs
index b2aef2c..c342c8e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -87,6 +87,7 @@
 pub mod dec;
 pub mod enc;
+pub mod switch64;
 
 #[cfg(feature = "dryoc")]
 pub mod crypto;
 
@@ -94,6 +95,35 @@ pub mod crypto;
 #[cfg(feature = "serde")]
 pub mod serde;
 
+/// Possible encodings for byte strings in NetText
+#[derive(Clone, Copy)]
+pub enum BytesEncoding {
+    /// Base64 encoding (default)
+    Base64 { split: bool },
+    /// Hexadecimal encoding
+    Hex { split: bool },
+    /// Switch64 encoding, a mix of plain text and base64
+    Switch64 { allow_whitespace: bool },
+}
+
+impl Default for BytesEncoding {
+    fn default() -> Self {
+        BytesEncoding::Base64 { split: true }
+    }
+}
+
+impl BytesEncoding {
+    pub fn without_whitespace(&self) -> Self {
+        match self {
+            BytesEncoding::Base64 { .. } => BytesEncoding::Base64 { split: false },
+            BytesEncoding::Hex { .. } => BytesEncoding::Hex { split: false },
+            BytesEncoding::Switch64 { .. } => BytesEncoding::Switch64 {
+                allow_whitespace: false,
+            },
+        }
+    }
+}
+
 // ---- syntactic elements of the data format ----
 
 pub(crate) const DICT_OPEN: u8 = b'{';
@@ -103,12 +133,17 @@ pub(crate) const DICT_DELIM: u8 = b',';
 pub(crate) const LIST_OPEN: u8 = b'[';
 pub(crate) const LIST_CLOSE: u8 = b']';
 pub(crate) const LIST_DELIM: u8 = b',';
-pub(crate) const STR_EXTRA_CHARS: &[u8] = b"._-+*?@:";
+pub(crate) const STR_EXTRA_CHARS: &[u8] = b"._-+*?@:/\\";
 
+pub(crate) const SWITCH64_SEPARATOR: u8 = b'\\';
+pub(crate) const SWITCH64_EXTRA_CHARS: &[u8] = b"._-+*?@:/";
+
+#[inline]
 pub(crate) fn is_string_char(c: u8) -> bool {
     c.is_ascii_alphanumeric() || STR_EXTRA_CHARS.contains(&c)
 }
 
+#[inline]
 pub(crate) fn is_whitespace(c: u8) -> bool {
     c.is_ascii_whitespace()
 }
diff --git a/src/serde/de.rs b/src/serde/de.rs
index 3a1b427..1a70098 100644
--- a/src/serde/de.rs
+++ b/src/serde/de.rs
@@ -14,7 +14,7 @@ use crate::serde::error::{Error, Result};
 pub struct Deserializer<'de, 'a>(Term<'de, 'a>);
 
 impl<'de, 'a> Deserializer<'de, 'a> {
-    fn from_term(input: &'a Term<'de, 'a>) -> Deserializer<'de, 'a> {
+    pub fn from_term(input: &'a Term<'de, 'a>) -> Deserializer<'de, 'a> {
         Deserializer(Term(input.0.mkref()))
     }
 }
diff --git a/src/serde/mod.rs b/src/serde/mod.rs
index 1760155..c2145f3 100644
--- a/src/serde/mod.rs
+++ b/src/serde/mod.rs
@@ -4,6 +4,7 @@ mod de;
 mod error;
 mod ser;
 
+pub use crate::BytesEncoding;
 pub use de::{from_bytes, from_term, Deserializer};
 pub use error::{Error, Result};
 pub use ser::{to_bytes, to_term, Serializer};
@@ -31,6 +32,17 @@ mod tests {
         eprintln!("Serialized (concise): {}", ser_concise);
         assert_eq!(ser_concise, expected_concise);
         assert_eq!(from_bytes::<T>(ser_concise.as_bytes()).unwrap(), input);
+
+        let ser_str_hex = input
+            .serialize(&mut Serializer {
+                string_format: BytesEncoding::Switch64 {
+                    allow_whitespace: true,
+                },
+                bytes_format: BytesEncoding::Hex { split: true },
+            })
+            .unwrap()
+            .encode();
+        panic!("{}", debug(&ser_str_hex));
     }
 
     #[test]
diff --git a/src/serde/ser.rs b/src/serde/ser.rs
index 2de2fe9..5d9e0b5 100644
--- a/src/serde/ser.rs
+++ b/src/serde/ser.rs
@@ -2,17 +2,22 @@ use serde::{ser, Serialize};
 use crate::enc::*;
 use crate::serde::error::{Error, Result};
+use crate::BytesEncoding;
 
 use serde::ser::Error as SerError;
 
 /// Serde serializer for nettext
-pub struct Serializer;
+#[derive(Clone, Copy, Default)]
+pub struct Serializer {
+    pub string_format: BytesEncoding,
+    pub bytes_format: BytesEncoding,
+}
 
 /// Serialize value to nettext encoder term
 pub fn to_term<T>(value: &T) -> Result<Term<'static>>
 where
     T: Serialize,
 {
-    value.serialize(&mut Serializer)
+    value.serialize(&mut Serializer::default())
 }
 
 /// Serialize value to nettext
@@ -20,7 +25,7 @@ pub fn to_bytes<T>(value: &T) -> Result<Vec<u8>>
 where
     T: Serialize,
 {
-    Ok(value.serialize(&mut Serializer)?.encode())
+    Ok(value.serialize(&mut Serializer::default())?.encode())
 }
 
 impl<'a> ser::Serializer for &'a mut Serializer {
@@ -89,11 +94,11 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     }
 
     fn serialize_str(self, v: &str) -> Result<Self::Ok> {
-        Ok(bytes(v.as_bytes()))
+        Ok(bytes_format(v.as_bytes(), self.string_format))
     }
 
     fn serialize_bytes(self, v: &[u8]) -> Result<Self::Ok> {
-        Ok(bytes(v))
+        Ok(bytes_format(v, self.bytes_format))
     }
 
     fn serialize_none(self) -> Result<Self::Ok> {
@@ -148,12 +153,16 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     }
 
     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq> {
-        Ok(ListSerializer { items: vec![] })
+        Ok(ListSerializer {
+            items: vec![],
+            ser: *self,
+        })
     }
 
     fn serialize_tuple(self, len: usize) -> Result<Self::SerializeTuple> {
         Ok(SeqSerializer {
             items: Vec::with_capacity(len),
+            ser: *self,
         })
     }
 
@@ -164,7 +173,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     ) -> Result<Self::SerializeTupleStruct> {
         let mut items = Vec::with_capacity(len + 1);
         items.push(string(name)?);
-        Ok(SeqSerializer { items })
+        Ok(SeqSerializer { items, ser: *self })
     }
 
     fn serialize_tuple_variant(
@@ -176,13 +185,14 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     ) -> Result<Self::SerializeTupleVariant> {
         let mut items = Vec::with_capacity(len + 1);
         items.push(string_owned(format!("{}.{}", name, variant))?);
-        Ok(SeqSerializer { items })
+        Ok(SeqSerializer { items, ser: *self })
     }
 
     fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap> {
         Ok(MapSerializer {
             next: None,
             fields: vec![],
+            ser: *self,
         })
     }
 
@@ -190,6 +200,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
         Ok(StructSerializer {
             name,
             fields: Vec::with_capacity(len),
+            ser: *self,
         })
     }
 
@@ -204,6 +215,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
             name,
             variant,
             fields: Vec::with_capacity(len),
+            ser: *self,
         })
     }
 }
@@ -212,6 +224,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
 
 pub struct SeqSerializer {
     items: Vec<Term<'static>>,
+    ser: Serializer,
 }
 
 impl ser::SerializeTuple for SeqSerializer {
@@ -222,7 +235,7 @@ impl ser::SerializeTuple for SeqSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.items.push(value.serialize(&mut Serializer)?);
+        self.items.push(value.serialize(&mut self.ser)?);
         Ok(())
     }
 
@@ -239,7 +252,7 @@ impl ser::SerializeTupleStruct for SeqSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.items.push(value.serialize(&mut Serializer)?);
+        self.items.push(value.serialize(&mut self.ser)?);
         Ok(())
     }
 
@@ -256,7 +269,7 @@ impl ser::SerializeTupleVariant for SeqSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.items.push(value.serialize(&mut Serializer)?);
+        self.items.push(value.serialize(&mut self.ser)?);
         Ok(())
     }
 
@@ -267,6 +280,7 @@ impl ser::SerializeTupleVariant for SeqSerializer {
 pub struct ListSerializer {
     items: Vec<Term<'static>>,
+    ser: Serializer,
 }
 
 impl ser::SerializeSeq for ListSerializer {
     type Ok = Term<'static>;
@@ -276,7 +290,7 @@ impl ser::SerializeSeq for ListSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.items.push(value.serialize(&mut Serializer)?);
+        self.items.push(value.serialize(&mut self.ser)?);
         Ok(())
     }
 
@@ -288,6 +302,7 @@ impl ser::SerializeSeq for ListSerializer {
 pub struct MapSerializer {
     next: Option<Vec<u8>>,
     fields: Vec<(Vec<u8>, Term<'static>)>,
+    ser: Serializer,
 }
 
 impl ser::SerializeMap for MapSerializer {
@@ -298,7 +313,11 @@ impl ser::SerializeMap for MapSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.next = Some(key.serialize(&mut Serializer)?.encode());
+        let mut ser = Serializer {
+            string_format: self.ser.string_format.without_whitespace(),
+            bytes_format: self.ser.bytes_format.without_whitespace(),
+        };
+        self.next = Some(key.serialize(&mut ser)?.encode());
         Ok(())
     }
 
@@ -310,7 +329,7 @@ impl ser::SerializeMap for MapSerializer {
             self.next
                 .take()
                 .ok_or_else(|| Self::Error::custom("no key"))?,
-            value.serialize(&mut Serializer)?,
+            value.serialize(&mut self.ser)?,
         ));
         Ok(())
     }
 
@@ -323,6 +342,7 @@ impl ser::SerializeMap for MapSerializer {
 pub struct StructSerializer {
     name: &'static str,
     fields: Vec<(&'static str, Term<'static>)>,
+    ser: Serializer,
 }
 
 impl ser::SerializeStruct for StructSerializer {
@@ -333,7 +353,7 @@ impl ser::SerializeStruct for StructSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.fields.push((key, value.serialize(&mut Serializer)?));
+        self.fields.push((key, value.serialize(&mut self.ser)?));
         Ok(())
     }
 
@@ -346,6 +366,7 @@ pub struct StructVariantSerializer {
     name: &'static str,
     variant: &'static str,
     fields: Vec<(&'static str, Term<'static>)>,
+    ser: Serializer,
 }
 
 impl ser::SerializeStructVariant for StructVariantSerializer {
@@ -356,7 +377,7 @@ impl ser::SerializeStructVariant for StructVariantSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.fields.push((key, value.serialize(&mut Serializer)?));
+        self.fields.push((key, value.serialize(&mut self.ser)?));
         Ok(())
     }
 
diff --git a/src/switch64.rs b/src/switch64.rs
new file mode 100644
index 0000000..8d47629
--- /dev/null
+++ b/src/switch64.rs
@@ -0,0 +1,131 @@
+//! The Switch64 encoding for text strings
+//!
+//! Allowed characters are encoded as-is.
+//! Others are encoded using base64.
+//! Plain parts and base64-encoded parts are separated by a backslash `\`
+
+use crate::{SWITCH64_EXTRA_CHARS, SWITCH64_SEPARATOR};
+
+pub fn encode(bytes: &[u8], allow_whitespace: bool) -> Vec<u8> {
+    let mut output = Vec::with_capacity(bytes.len());
+
+    let mut pos = 0;
+    while pos < bytes.len() {
+        // Determine how many bytes to copy as-is
+        let cnt = bytes[pos..]
+            .iter()
+            .take_while(|c| is_valid_plaintext_char(**c, allow_whitespace))
+            .count();
+
+        // Copy those bytes as-is
+        output.extend_from_slice(&bytes[pos..pos + cnt]);
+        pos += cnt;
+
+        // If some bytes remain, switch to base64 encoding
+        if pos < bytes.len() {
+            output.push(SWITCH64_SEPARATOR);
+        } else {
+            break;
+        }
+
+        // Count how many bytes to write as base64.
+        // We stop at the first position where we find three consecutive
+        // characters that can be encoded as-is.
+        let mut b64end = bytes.len();
+        for i in pos..bytes.len() - 3 {
+            if bytes[i..i + 3]
+                .iter()
+                .all(|c| is_valid_plaintext_char(*c, allow_whitespace))
+            {
+                b64end = i;
+                break;
+            }
+        }
+
+        output.extend_from_slice(
+            base64::encode_config(&bytes[pos..b64end], base64::URL_SAFE_NO_PAD).as_bytes(),
+        );
+        pos = b64end;
+
+        if pos < bytes.len() {
+            output.push(SWITCH64_SEPARATOR);
+        }
+    }
+
+    output
+}
+
+pub fn decode(bytes: &[u8]) -> Result<Vec<u8>, base64::DecodeError> {
+    let mut output = Vec::with_capacity(bytes.len());
+
+    let mut pos = 0;
+    while pos < bytes.len() {
+        let cnt = bytes[pos..]
+            .iter()
+            .take_while(|c| **c != SWITCH64_SEPARATOR)
+            .count();
+        output.extend_from_slice(&bytes[pos..pos + cnt]);
+        pos += cnt + 1;
+
+        if pos >= bytes.len() {
+            break;
+        }
+
+        let cnt = bytes[pos..]
+            .iter()
+            .take_while(|c| **c != SWITCH64_SEPARATOR)
+            .count();
+        output.extend_from_slice(&base64::decode_config(
+            &bytes[pos..pos + cnt],
+            base64::URL_SAFE_NO_PAD,
+        )?);
+        pos += cnt + 1;
+    }
+
+    Ok(output)
+}
+
+#[inline]
+fn is_valid_plaintext_char(c: u8, allow_whitespace: bool) -> bool {
+    c.is_ascii_alphanumeric()
+        || (allow_whitespace && c.is_ascii_whitespace())
+        || SWITCH64_EXTRA_CHARS.contains(&c)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::debug;
+
+    #[test]
+    fn test_encode() {
+        assert_eq!(debug(&encode(&b"hello world"[..], true)), "hello world");
+        assert_eq!(
+            debug(&encode(&b"hello, world!"[..], true)),
+            "hello\\LA\\ world\\IQ"
+        );
+        assert_eq!(debug(&encode(&b",;,@$;8"[..], true)), "\\LDssQCQ7OA");
+    }
+
+    #[test]
+    fn test_decode() {
+        assert_eq!(debug(&decode(&b"hello world"[..]).unwrap()), "hello world");
+        assert_eq!(
+            debug(&decode(&b"hello\\LA\\ world\\IQ"[..]).unwrap()),
+            "hello, world!"
+        );
+        assert_eq!(debug(&decode(&b"\\LDssQCQ7OA"[..]).unwrap()), ",;,@$;8");
+    }
+
+    #[test]
+    fn test_encdec() {
+        for s in [
+            br#"assert_eq!(debug(&decode(&b"hello\\LA\\ world\\IQ"[..]).unwrap()), "hello, world!");"#.to_vec(),
+            br#"- a list, which may contain any number of any kind of terms (can be mixed)"#.to_vec(),
+            base64::decode("dVcG5EzJqGP/2ZGkVu4ewzfAug1W96tb2KiBOVyPUXfw8uD34DEepW/PPqRzi0HL").unwrap()
+        ] {
+            assert_eq!(decode(&encode(&s, true)).unwrap(), s);
+            assert_eq!(decode(&encode(&s, false)).unwrap(), s);
+        }
+    }
+}
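Usage sketch for reviewers: the snippet below shows roughly how the new serializer options introduced by this patch are meant to be driven from calling code. It relies only on items visible in the diff (`nettext::serde::{Serializer, BytesEncoding}`, the public `string_format`/`bytes_format` fields, `Serializer::default()`, and `Term::encode()`); the `Message` struct and the printed output handling are hypothetical, for illustration only, and assume serde's `derive` feature.

```rust
use serde::Serialize;

use nettext::serde::{BytesEncoding, Serializer};

// Hypothetical payload type, for illustration only.
#[derive(Serialize)]
struct Message {
    subject: String,
    body: String,
}

fn main() {
    let msg = Message {
        subject: "hello, world!".into(),
        body: "nettext keeps plain words readable".into(),
    };

    // Default behaviour: strings and byte strings both go through
    // split base64 (BytesEncoding::Base64 { split: true }).
    let term = msg.serialize(&mut Serializer::default()).unwrap();
    println!("{}", String::from_utf8_lossy(&term.encode()));

    // Custom behaviour: keep strings mostly human-readable with Switch64,
    // and hex-encode anything that goes through serialize_bytes.
    let mut ser = Serializer {
        string_format: BytesEncoding::Switch64 {
            allow_whitespace: true,
        },
        bytes_format: BytesEncoding::Hex { split: true },
    };
    let term = msg.serialize(&mut ser).unwrap();
    println!("{}", String::from_utf8_lossy(&term.encode()));
}
```

Note that map keys are serialized with the `without_whitespace()` variants of the chosen formats, so dictionary keys remain single tokens even when `allow_whitespace` is enabled for values.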