From eae4a0443a1f927a988323f146bdd557745daafe Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Thu, 15 Dec 2022 16:47:04 +0100
Subject: [PATCH] Switch64 encoding available in serde encoder (TODO decoder)

---
 Cargo.toml       |   3 +-
 src/enc/mod.rs   |  99 +++++++++++++++++++++++------------
 src/lib.rs       |  37 ++++++++++++-
 src/serde/de.rs  |   2 +-
 src/serde/mod.rs |  12 +++++
 src/serde/ser.rs |  53 +++++++++++++------
 src/switch64.rs  | 131 +++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 285 insertions(+), 52 deletions(-)
 create mode 100644 src/switch64.rs

diff --git a/Cargo.toml b/Cargo.toml
index f1d74c7..5b7ae83 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 name = "nettext"
 description = "A text-based data format for cryptographic network protocols"
 authors = ["Alex Auvolat "]
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"
 license = "AGPL-3.0"
 readme = "README.md"
@@ -12,6 +12,7 @@ readme = "README.md"
 [dependencies]
 nom = "7.1"
 base64 = "0.13"
+hex = "0.4"
 err-derive = "0.3"
 
 dryoc = { version = "0.4", optional = true }
diff --git a/src/enc/mod.rs b/src/enc/mod.rs
index 712027c..5215dca 100644
--- a/src/enc/mod.rs
+++ b/src/enc/mod.rs
@@ -23,7 +23,7 @@ use std::borrow::{Borrow, Cow};
 use std::collections::HashMap;
 
 use crate::dec::{self, decode};
-use crate::{is_string_char, is_whitespace};
+use crate::{is_string_char, is_whitespace, switch64, BytesEncoding};
 
 pub use error::Error;
 
@@ -96,6 +96,71 @@ pub fn raw(bytes: &[u8]) -> Result<'_> {
     Ok(Term(T::Str(bytes)))
 }
 
+/// Term corresponding to a byte slice,
+/// encoded using url-safe base64 without padding.
+/// Since empty strings are not possible in nettext,
+/// an empty byte string is encoded as an empty list (`[]`).
+///
+/// Example:
+///
+/// ```
+/// use nettext::enc::*;
+///
+/// assert_eq!(bytes(b"hello, world!").encode(), b"aGVsbG8sIHdvcmxkIQ");
+/// ```
+pub fn bytes(bytes: &[u8]) -> Term<'static> {
+    bytes_format(bytes, BytesEncoding::Base64 { split: false })
+}
+
+/// Same as `bytes()`, but splits the byte slice into 48-byte chunks
+/// and encodes each chunk separately, putting them in a sequence of terms.
+/// Useful for long byte slices to get a cleaner representation,
+/// mainly for dictionary keys.
+pub fn bytes_split(bytes: &[u8]) -> Term<'static> {
+    bytes_format(bytes, BytesEncoding::Base64 { split: true })
+}
+
+pub fn bytes_format(bytes: &[u8], encoding: BytesEncoding) -> Term<'static> {
+    match encoding {
+        BytesEncoding::Base64 { .. } | BytesEncoding::Hex { .. } if bytes.is_empty() => {
+            Term(T::List(vec![]))
+        }
+        BytesEncoding::Base64 { split: false } => Term(T::OwnedStr(
+            base64::encode_config(bytes, base64::URL_SAFE_NO_PAD).into_bytes(),
+        )),
+        BytesEncoding::Base64 { split: true } => {
+            let chunks = bytes
+                .chunks(48)
+                .map(|b| {
+                    T::OwnedStr(base64::encode_config(b, base64::URL_SAFE_NO_PAD).into_bytes())
+                })
+                .collect::<Vec<_>>();
+            if chunks.len() > 1 {
+                Term(T::Seq(chunks))
+            } else {
+                Term(chunks.into_iter().next().unwrap())
+            }
+        }
+        BytesEncoding::Hex { split: false } => Term(T::OwnedStr(hex::encode(bytes).into_bytes())),
+        BytesEncoding::Hex { split: true } => {
+            let chunks = bytes
+                .chunks(32)
+                .map(|b| T::OwnedStr(hex::encode(b).into_bytes()))
+                .collect::<Vec<_>>();
+            if chunks.len() > 1 {
+                Term(T::Seq(chunks))
+            } else {
+                Term(chunks.into_iter().next().unwrap())
+            }
+        }
+        BytesEncoding::Switch64 { allow_whitespace } => {
+            Term(T::OwnedStr(switch64::encode(bytes, allow_whitespace)))
+        }
+    }
+}
+
+// ---- composed terms -----
+
 /// Term corresponding to a sequence of terms. Subsequences are banned and will raise an error.
 ///
 /// ```
@@ -164,38 +229,6 @@ pub fn dict<'a, I: IntoIterator<Item = (&'a str, Term<'a>)>>(pairs: I) -> Result<'a> {
     Ok(Term(T::Dict(tmp)))
 }
 
-/// Term corresponding to a byte slice,
-/// encoding using base64 url-safe encoding without padding
-///
-/// Example:
-///
-/// ```
-/// use nettext::enc::*;
-///
-/// assert_eq!(bytes(b"hello, world!").encode(), b"aGVsbG8sIHdvcmxkIQ");
-/// ```
-pub fn bytes(bytes: &[u8]) -> Term<'static> {
-    Term(T::OwnedStr(
-        base64::encode_config(bytes, base64::URL_SAFE_NO_PAD).into_bytes(),
-    ))
-}
-
-/// Same as `bytes()`, but splits the byte slice in 48-byte chunks
-/// and encodes each chunk separately, putting them in a sequence of terms.
-/// Usefull for long byte slices to have cleaner representations,
-/// mainly usefull for dictionnary keys.
-pub fn bytes_split(bytes: &[u8]) -> Term<'static> {
-    let chunks = bytes
-        .chunks(48)
-        .map(|b| T::OwnedStr(base64::encode_config(b, base64::URL_SAFE_NO_PAD).into_bytes()))
-        .collect::<Vec<_>>();
-    if chunks.len() > 1 {
-        Term(T::Seq(chunks))
-    } else {
-        Term(chunks.into_iter().next().unwrap_or(T::Str(b".")))
-    }
-}
-
 impl<'a> Term<'a> {
     /// Append a term to an existing term.
     /// Transforms the initial term into a seq if necessary.
diff --git a/src/lib.rs b/src/lib.rs
index b2aef2c..c342c8e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -87,6 +87,7 @@
 pub mod dec;
 pub mod enc;
+pub mod switch64;
 
 #[cfg(feature = "dryoc")]
 pub mod crypto;
 
@@ -94,6 +95,35 @@ pub mod crypto;
 #[cfg(feature = "serde")]
 pub mod serde;
 
+/// Possible encodings for byte strings in NetText
+#[derive(Clone, Copy)]
+pub enum BytesEncoding {
+    /// Base64 encoding (default)
+    Base64 { split: bool },
+    /// Hexadecimal encoding
+    Hex { split: bool },
+    /// Switch64 encoding, a mix of plain text and base64
+    Switch64 { allow_whitespace: bool },
+}
+
+impl Default for BytesEncoding {
+    fn default() -> Self {
+        BytesEncoding::Base64 { split: true }
+    }
+}
+
+impl BytesEncoding {
+    pub fn without_whitespace(&self) -> Self {
+        match self {
+            BytesEncoding::Base64 { .. } => BytesEncoding::Base64 { split: false },
+            BytesEncoding::Hex { .. } => BytesEncoding::Hex { split: false },
+            BytesEncoding::Switch64 { .. } => BytesEncoding::Switch64 {
+                allow_whitespace: false,
+            },
+        }
+    }
+}
+
 // ---- syntactic elements of the data format ----
 
 pub(crate) const DICT_OPEN: u8 = b'{';
@@ -103,12 +133,17 @@ pub(crate) const DICT_DELIM: u8 = b',';
 pub(crate) const LIST_OPEN: u8 = b'[';
 pub(crate) const LIST_CLOSE: u8 = b']';
 pub(crate) const LIST_DELIM: u8 = b',';
-pub(crate) const STR_EXTRA_CHARS: &[u8] = b"._-+*?@:";
+pub(crate) const STR_EXTRA_CHARS: &[u8] = b"._-+*?@:/\\";
 
+pub(crate) const SWITCH64_SEPARATOR: u8 = b'\\';
+pub(crate) const SWITCH64_EXTRA_CHARS: &[u8] = b"._-+*?@:/";
+
+#[inline]
 pub(crate) fn is_string_char(c: u8) -> bool {
     c.is_ascii_alphanumeric() || STR_EXTRA_CHARS.contains(&c)
 }
 
+#[inline]
 pub(crate) fn is_whitespace(c: u8) -> bool {
     c.is_ascii_whitespace()
 }
diff --git a/src/serde/de.rs b/src/serde/de.rs
index 3a1b427..1a70098 100644
--- a/src/serde/de.rs
+++ b/src/serde/de.rs
@@ -14,7 +14,7 @@ use crate::serde::error::{Error, Result};
 pub struct Deserializer<'de, 'a>(Term<'de, 'a>);
 
 impl<'de, 'a> Deserializer<'de, 'a> {
-    fn from_term(input: &'a Term<'de, 'a>) -> Deserializer<'de, 'a> {
+    pub fn from_term(input: &'a Term<'de, 'a>) -> Deserializer<'de, 'a> {
         Deserializer(Term(input.0.mkref()))
     }
 }
diff --git a/src/serde/mod.rs b/src/serde/mod.rs
index 1760155..c2145f3 100644
--- a/src/serde/mod.rs
+++ b/src/serde/mod.rs
@@ -4,6 +4,7 @@ mod de;
 mod error;
 mod ser;
 
+pub use crate::BytesEncoding;
 pub use de::{from_bytes, from_term, Deserializer};
 pub use error::{Error, Result};
 pub use ser::{to_bytes, to_term, Serializer};
@@ -31,6 +32,17 @@ mod tests {
         eprintln!("Serialized (concise): {}", ser_concise);
         assert_eq!(ser_concise, expected_concise);
         assert_eq!(from_bytes::<T>(ser_concise.as_bytes()).unwrap(), input);
+
+        let ser_str_hex = input
+            .serialize(&mut Serializer {
+                string_format: BytesEncoding::Switch64 {
+                    allow_whitespace: true,
+                },
+                bytes_format: BytesEncoding::Hex { split: true },
+            })
+            .unwrap()
+            .encode();
+        panic!("{}", debug(&ser_str_hex));
     }
 
     #[test]
diff --git a/src/serde/ser.rs b/src/serde/ser.rs
index 2de2fe9..5d9e0b5 100644
--- a/src/serde/ser.rs
+++ b/src/serde/ser.rs
@@ -2,17 +2,22 @@ use serde::{ser, Serialize};
 use crate::enc::*;
 use crate::serde::error::{Error, Result};
+use crate::BytesEncoding;
 
 use serde::ser::Error as SerError;
 
 /// Serde serializer for nettext
-pub struct Serializer;
+#[derive(Clone, Copy, Default)]
+pub struct Serializer {
+    pub string_format: BytesEncoding,
+    pub bytes_format: BytesEncoding,
+}
 
 /// Serialize value to nettext encoder term
 pub fn to_term<T>(value: &T) -> Result<Term<'static>>
 where
     T: Serialize,
 {
-    value.serialize(&mut Serializer)
+    value.serialize(&mut Serializer::default())
 }
 
 /// Serialize value to nettext
@@ -20,7 +25,7 @@ pub fn to_bytes<T>(value: &T) -> Result<Vec<u8>>
 where
     T: Serialize,
 {
-    Ok(value.serialize(&mut Serializer)?.encode())
+    Ok(value.serialize(&mut Serializer::default())?.encode())
 }
 
 impl<'a> ser::Serializer for &'a mut Serializer {
@@ -89,11 +94,11 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     }
 
     fn serialize_str(self, v: &str) -> Result<Self::Ok> {
-        Ok(bytes(v.as_bytes()))
+        Ok(bytes_format(v.as_bytes(), self.string_format))
     }
 
     fn serialize_bytes(self, v: &[u8]) -> Result<Self::Ok> {
-        Ok(bytes(v))
+        Ok(bytes_format(v, self.bytes_format))
     }
 
     fn serialize_none(self) -> Result<Self::Ok> {
@@ -148,12 +153,16 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     }
 
     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq> {
-        Ok(ListSerializer { items: vec![] })
+        Ok(ListSerializer {
+            items: vec![],
+            ser: *self,
+        })
     }
 
     fn serialize_tuple(self, len: usize) -> Result<Self::SerializeTuple> {
         Ok(SeqSerializer {
             items: Vec::with_capacity(len),
+            ser: *self,
         })
     }
 
@@ -164,7 +173,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     ) -> Result<Self::SerializeTupleStruct> {
         let mut items = Vec::with_capacity(len + 1);
         items.push(string(name)?);
-        Ok(SeqSerializer { items })
+        Ok(SeqSerializer { items, ser: *self })
     }
 
     fn serialize_tuple_variant(
@@ -176,13 +185,14 @@ impl<'a> ser::Serializer for &'a mut Serializer {
     ) -> Result<Self::SerializeTupleVariant> {
         let mut items = Vec::with_capacity(len + 1);
         items.push(string_owned(format!("{}.{}", name, variant))?);
-        Ok(SeqSerializer { items })
+        Ok(SeqSerializer { items, ser: *self })
     }
 
     fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap> {
         Ok(MapSerializer {
             next: None,
             fields: vec![],
+            ser: *self,
         })
     }
 
@@ -190,6 +200,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
         Ok(StructSerializer {
             name,
             fields: Vec::with_capacity(len),
+            ser: *self,
         })
     }
 
@@ -204,6 +215,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
             name,
             variant,
             fields: Vec::with_capacity(len),
+            ser: *self,
         })
     }
 }
@@ -212,6 +224,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
 
 pub struct SeqSerializer {
     items: Vec<Term<'static>>,
+    ser: Serializer,
 }
 
 impl ser::SerializeTuple for SeqSerializer {
@@ -222,7 +235,7 @@ impl ser::SerializeTuple for SeqSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.items.push(value.serialize(&mut Serializer)?);
+        self.items.push(value.serialize(&mut self.ser)?);
         Ok(())
     }
 
@@ -239,7 +252,7 @@ impl ser::SerializeTupleStruct for SeqSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.items.push(value.serialize(&mut Serializer)?);
+        self.items.push(value.serialize(&mut self.ser)?);
         Ok(())
     }
 
@@ -256,7 +269,7 @@ impl ser::SerializeTupleVariant for SeqSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.items.push(value.serialize(&mut Serializer)?);
+        self.items.push(value.serialize(&mut self.ser)?);
         Ok(())
     }
 
@@ -267,6 +280,7 @@ impl ser::SerializeTupleVariant for SeqSerializer {
 pub struct ListSerializer {
     items: Vec<Term<'static>>,
+    ser: Serializer,
 }
 
 impl ser::SerializeSeq for ListSerializer {
     type Ok = Term<'static>;
@@ -276,7 +290,7 @@ impl ser::SerializeSeq for ListSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.items.push(value.serialize(&mut Serializer)?);
+        self.items.push(value.serialize(&mut self.ser)?);
         Ok(())
     }
 
@@ -288,6 +302,7 @@ impl ser::SerializeSeq for ListSerializer {
 pub struct MapSerializer {
     next: Option<Vec<u8>>,
     fields: Vec<(Vec<u8>, Term<'static>)>,
+    ser: Serializer,
 }
 
 impl ser::SerializeMap for MapSerializer {
@@ -298,7 +313,11 @@ impl ser::SerializeMap for MapSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.next = Some(key.serialize(&mut Serializer)?.encode());
+        let mut ser = Serializer {
+            string_format: self.ser.string_format.without_whitespace(),
+            bytes_format: self.ser.bytes_format.without_whitespace(),
+        };
+        self.next = Some(key.serialize(&mut ser)?.encode());
         Ok(())
     }
 
@@ -310,7 +329,7 @@ impl ser::SerializeMap for MapSerializer {
             self.next
                 .take()
                 .ok_or_else(|| Self::Error::custom("no key"))?,
-            value.serialize(&mut Serializer)?,
+            value.serialize(&mut self.ser)?,
         ));
         Ok(())
     }
 
@@ -323,6 +342,7 @@ impl ser::SerializeMap for MapSerializer {
 pub struct StructSerializer {
     name: &'static str,
     fields: Vec<(&'static str, Term<'static>)>,
+    ser: Serializer,
 }
 
 impl ser::SerializeStruct for StructSerializer {
@@ -333,7 +353,7 @@ impl ser::SerializeStruct for StructSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.fields.push((key, value.serialize(&mut Serializer)?));
+        self.fields.push((key, value.serialize(&mut self.ser)?));
         Ok(())
     }
 
@@ -346,6 +366,7 @@ pub struct StructVariantSerializer {
     name: &'static str,
     variant: &'static str,
     fields: Vec<(&'static str, Term<'static>)>,
+    ser: Serializer,
 }
 
 impl ser::SerializeStructVariant for StructVariantSerializer {
@@ -356,7 +377,7 @@ impl ser::SerializeStructVariant for StructVariantSerializer {
     where
         T: ?Sized + Serialize,
     {
-        self.fields.push((key, value.serialize(&mut Serializer)?));
+        self.fields.push((key, value.serialize(&mut self.ser)?));
         Ok(())
     }
 
diff --git a/src/switch64.rs b/src/switch64.rs
new file mode 100644
index 0000000..8d47629
--- /dev/null
+++ b/src/switch64.rs
@@ -0,0 +1,131 @@
+//! The Switch64 encoding for text strings
+//!
+//! Allowed characters are encoded as-is.
+//! Others are encoded using base64.
+//! Plain parts and base64-encoded parts are separated by a backslash `\`
+
+use crate::{SWITCH64_EXTRA_CHARS, SWITCH64_SEPARATOR};
+
+pub fn encode(bytes: &[u8], allow_whitespace: bool) -> Vec<u8> {
+    let mut output = Vec::with_capacity(bytes.len());
+
+    let mut pos = 0;
+    while pos < bytes.len() {
+        // Determine how many bytes to copy as-is
+        let cnt = bytes[pos..]
+            .iter()
+            .take_while(|c| is_valid_plaintext_char(**c, allow_whitespace))
+            .count();
+
+        // Copy those bytes as-is
+        output.extend_from_slice(&bytes[pos..pos + cnt]);
+        pos += cnt;
+
+        // If some bytes remain, switch to base64 encoding
+        if pos < bytes.len() {
+            output.push(SWITCH64_SEPARATOR);
+        } else {
+            break;
+        }
+
+        // Count how many bytes to write as base64.
+        // We stop at the first position where we find three consecutive
+        // characters that can be encoded as-is.
+        let mut b64end = bytes.len();
+        for i in pos..bytes.len() - 3 {
+            if bytes[i..i + 3]
+                .iter()
+                .all(|c| is_valid_plaintext_char(*c, allow_whitespace))
+            {
+                b64end = i;
+                break;
+            }
+        }
+
+        output.extend_from_slice(
+            base64::encode_config(&bytes[pos..b64end], base64::URL_SAFE_NO_PAD).as_bytes(),
+        );
+        pos = b64end;
+
+        if pos < bytes.len() {
+            output.push(SWITCH64_SEPARATOR);
+        }
+    }
+
+    output
+}
+
+pub fn decode(bytes: &[u8]) -> Result<Vec<u8>, base64::DecodeError> {
+    let mut output = Vec::with_capacity(bytes.len());
+
+    let mut pos = 0;
+    while pos < bytes.len() {
+        let cnt = bytes[pos..]
+            .iter()
+            .take_while(|c| **c != SWITCH64_SEPARATOR)
+            .count();
+        output.extend_from_slice(&bytes[pos..pos + cnt]);
+        pos += cnt + 1;
+
+        if pos >= bytes.len() {
+            break;
+        }
+
+        let cnt = bytes[pos..]
+            .iter()
+            .take_while(|c| **c != SWITCH64_SEPARATOR)
+            .count();
+        output.extend_from_slice(&base64::decode_config(
+            &bytes[pos..pos + cnt],
+            base64::URL_SAFE_NO_PAD,
+        )?);
+        pos += cnt + 1;
+    }
+
+    Ok(output)
+}
+
+#[inline]
+fn is_valid_plaintext_char(c: u8, allow_whitespace: bool) -> bool {
+    c.is_ascii_alphanumeric()
+        || (allow_whitespace && c.is_ascii_whitespace())
+        || SWITCH64_EXTRA_CHARS.contains(&c)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::debug;
+
+    #[test]
+    fn test_encode() {
+        assert_eq!(debug(&encode(&b"hello world"[..], true)), "hello world");
+        assert_eq!(
+            debug(&encode(&b"hello, world!"[..], true)),
+            "hello\\LA\\ world\\IQ"
+        );
+        assert_eq!(debug(&encode(&b",;,@$;8"[..], true)), "\\LDssQCQ7OA");
+    }
+
+    #[test]
+    fn test_decode() {
+        assert_eq!(debug(&decode(&b"hello world"[..]).unwrap()), "hello world");
+        assert_eq!(
+            debug(&decode(&b"hello\\LA\\ world\\IQ"[..]).unwrap()),
+            "hello, world!"
+        );
+        assert_eq!(debug(&decode(&b"\\LDssQCQ7OA"[..]).unwrap()), ",;,@$;8");
+    }
+
+    #[test]
+    fn test_encdec() {
+        for s in [
+            br#"assert_eq!(debug(&decode(&b"hello\\LA\\ world\\IQ"[..]).unwrap()), "hello, world!");"#.to_vec(),
+            br#"- a list, which may contain any number of any kind of terms (can be mixed)"#.to_vec(),
+            base64::decode("dVcG5EzJqGP/2ZGkVu4ewzfAug1W96tb2KiBOVyPUXfw8uD34DEepW/PPqRzi0HL").unwrap()
+        ] {
+            assert_eq!(decode(&encode(&s, true)).unwrap(), s);
+            assert_eq!(decode(&encode(&s, false)).unwrap(), s);
+        }
+    }
+}
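Usage sketch for reviewers: the snippet below shows roughly how the new serializer options introduced by this patch are meant to be driven from calling code. It relies only on items visible in the diff (`nettext::serde::{Serializer, BytesEncoding}`, the public `string_format`/`bytes_format` fields, `Serializer::default()`, and `Term::encode()`); the `Message` struct and the printed output handling are hypothetical, for illustration only, and assume serde's `derive` feature.

```rust
use serde::Serialize;

use nettext::serde::{BytesEncoding, Serializer};

// Hypothetical payload type, for illustration only.
#[derive(Serialize)]
struct Message {
    subject: String,
    body: String,
}

fn main() {
    let msg = Message {
        subject: "hello, world!".into(),
        body: "nettext keeps plain words readable".into(),
    };

    // Default behaviour: strings and byte strings both go through
    // split base64 (BytesEncoding::Base64 { split: true }).
    let term = msg.serialize(&mut Serializer::default()).unwrap();
    println!("{}", String::from_utf8_lossy(&term.encode()));

    // Custom behaviour: keep strings mostly human-readable with Switch64,
    // and hex-encode anything that goes through serialize_bytes.
    let mut ser = Serializer {
        string_format: BytesEncoding::Switch64 {
            allow_whitespace: true,
        },
        bytes_format: BytesEncoding::Hex { split: true },
    };
    let term = msg.serialize(&mut ser).unwrap();
    println!("{}", String::from_utf8_lossy(&term.encode()));
}
```

Note that map keys are serialized with the `without_whitespace()` variants of the chosen formats, so dictionary keys remain single tokens even when `allow_whitespace` is enabled for values.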