nettext/src/enc/mod.rs

//! Functions to generate nettext representations of data structures
//!
//! Example:
//!
//! ```
//! use nettext::enc::*;
//!
//! let nettext_encoding = seq([
//!     string("CALL").unwrap(),
//!     string("myfunction").unwrap(),
//!     dict([
//!         ("a", string("hello").unwrap()),
//!         ("b", string("world").unwrap()),
//!         ("c", raw(b"{ a = 12; b = 42 }").unwrap()),
//!         ("d", bytes_split(&((0..128u8).collect::<Vec<_>>()))),
//!     ]).unwrap(),
//! ]).unwrap().encode();
//! ```
mod error;
use std::borrow::{Borrow, Cow};
use std::collections::HashMap;
use crate::dec::{self, decode};
use crate::*;
use crate::{is_string_char, is_whitespace};
pub use error::Error;
/// A term meant to be encoded into a nettext representation
2022-11-17 16:55:50 +00:00
pub struct Term<'a>(T<'a>);
enum T<'a> {
Str(&'a [u8]),
OwnedStr(Vec<u8>),
Dict(HashMap<Cow<'a, [u8]>, T<'a>>),
List(Vec<T<'a>>),
Seq(Vec<T<'a>>),
}
/// The result type for trying to encode something as nettext
pub type Result<'a> = std::result::Result<Term<'a>, Error>;
// ---- helpers to transform datatypes into encoder terms ----
/// Trait for anything that can be encoded as nettext
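///
/// For example, a custom type can implement `Encode` by assembling a term from the
/// helpers in this module (a minimal sketch; `Point` is a made-up example type):
///
/// ```
/// use nettext::enc::*;
///
/// struct Point {
///     x: u32,
///     y: u32,
/// }
///
/// impl Encode for Point {
///     fn term(&self) -> Result<'_> {
///         dict([
///             ("x", string_owned(self.x.to_string())?),
///             ("y", string_owned(self.y.to_string())?),
///         ])
///     }
/// }
/// ```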
pub trait Encode {
fn term(&self) -> Result<'_>;
}
impl<'a, 'b> Encode for dec::Term<'a, 'b> {
fn term(&self) -> Result<'_> {
Ok(Term(T::Str(self.raw())))
}
}
// ---- helpers to build terms ----
/// Term corresponding to a string (that may contain whitespace)
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(string("Hello world .").unwrap().encode(), b"Hello world .");
/// ```
pub fn string(s: &str) -> Result<'_> {
for c in s.as_bytes().iter() {
if !(is_string_char(*c) || is_whitespace(*c)) {
return Err(Error::InvalidCharacter(*c));
}
}
Ok(Term(T::Str(s.as_bytes())))
}
/// Same as `string` but takes an owned String
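///
/// Example (mirrors the `string` doctest above, with an owned `String`):
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(string_owned("Hello world .".to_string()).unwrap().encode(), b"Hello world .");
/// ```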
pub fn string_owned(s: String) -> Result<'static> {
for c in s.as_bytes().iter() {
if !(is_string_char(*c) || is_whitespace(*c)) {
return Err(Error::InvalidCharacter(*c));
}
}
Ok(Term(T::OwnedStr(s.into_bytes())))
}
/// Include a raw nettext value
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(raw(b"Hello { a = b; c = d} .").unwrap().encode(), b"Hello { a = b; c = d} .");
/// ```
pub fn raw(bytes: &[u8]) -> Result<'_> {
if decode(bytes).is_err() {
return Err(Error::InvalidRaw);
}
Ok(Term(T::Str(bytes)))
}
/// Term corresponding to a byte slice,
/// encoding using base64 url-safe encoding without padding.
/// Since empty strings are not possible in nettext,
/// an empty byte string is encoded as the special string `-`.
///
/// Example:
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(bytes(b"").encode(), b"-");
/// assert_eq!(bytes(b"hello, world!").encode(), b"aGVsbG8sIHdvcmxkIQ");
/// ```
pub fn bytes(bytes: &[u8]) -> Term<'static> {
if bytes.is_empty() {
Term(T::Str(b"-"))
} else {
Term(T::OwnedStr(
base64::encode_config(bytes, base64::URL_SAFE_NO_PAD).into_bytes(),
))
}
}
/// Same as `bytes()`, but splits the byte slice into 48-byte chunks
/// and encodes each chunk separately, putting them in a sequence of terms.
/// Useful for long byte slices, as it produces cleaner representations.
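///
/// Example (a short slice fits in a single 48-byte chunk, so the result below is the
/// same as with `bytes()`; longer slices yield a sequence of base64 chunks):
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(bytes_split(b"").encode(), b"-");
/// assert_eq!(bytes_split(b"hello, world!").encode(), b"aGVsbG8sIHdvcmxkIQ");
/// ```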
pub fn bytes_split(bytes: &[u8]) -> Term<'static> {
if bytes.is_empty() {
Term(T::Str(b"-"))
} else {
let chunks = bytes
.chunks(48)
.map(|b| T::OwnedStr(base64::encode_config(b, base64::URL_SAFE_NO_PAD).into_bytes()))
.collect::<Vec<_>>();
if chunks.len() > 1 {
Term(T::Seq(chunks))
} else {
Term(chunks.into_iter().next().unwrap())
}
}
}
/// Term corresponding to a byte slice,
/// encoding using base64 url-safe encoding without padding,
/// with a prefix used to identify its content type.
/// The marker prefix is typically used in crypto settings to identify
/// a cryptographic protocol or algorithm; it may not contain the `:` character.
///
/// Example:
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(marked_bytes("mytype", b"").unwrap().encode(), b"mytype:-");
/// assert_eq!(marked_bytes("mytype", b"hello, world!").unwrap().encode(), b"mytype:aGVsbG8sIHdvcmxkIQ");
/// ```
pub fn marked_bytes(marker: &str, bytes: &[u8]) -> Result<'static> {
for c in marker.as_bytes().iter() {
if !is_string_char(*c) || *c == b':' {
return Err(Error::InvalidCharacter(*c));
}
}
if bytes.is_empty() {
Ok(Term(T::OwnedStr(format!("{}:-", marker).into_bytes())))
} else {
Ok(Term(T::OwnedStr(
format!(
"{}:{}",
marker,
base64::encode_config(bytes, base64::URL_SAFE_NO_PAD)
)
.into_bytes(),
)))
}
}
// ---- composed terms -----
/// Term corresponding to a sequence of terms. Sub-sequences are not allowed and will result in an error.
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(seq([
///     string("Hello").unwrap(),
///     string("world").unwrap()
/// ]).unwrap().encode(), b"Hello world");
/// ```
pub fn seq<'a, I: IntoIterator<Item = Term<'a>>>(terms: I) -> Result<'a> {
let mut tmp = Vec::with_capacity(8);
for t in terms {
match t.0 {
T::Seq(_) => return Err(Error::SeqInSeq),
x => tmp.push(x),
}
}
Ok(Term(T::Seq(tmp)))
}
/// Term corresponding to a sequence of terms. Sub-sequences are flattened.
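///
/// Example (the inner `seq` is merged into the outer one, so the encoding is the
/// same as that of a flat sequence):
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(seq_flatten([
///     string("Hello").unwrap(),
///     seq([string("world").unwrap(), string("dude").unwrap()]).unwrap(),
/// ]).encode(), b"Hello world dude");
/// ```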
pub fn seq_flatten<'a, I: IntoIterator<Item = Term<'a>>>(terms: I) -> Term<'a> {
let mut tmp = Vec::with_capacity(8);
for t in terms {
match t.0 {
T::Seq(t) => tmp.extend(t),
x => tmp.push(x),
}
}
Term(T::Seq(tmp))
}
/// Term corresponding to a list of terms.
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(list([
///     string("Hello").unwrap(),
///     string("world").unwrap()
/// ]).encode(), b"[\n  Hello;\n  world;\n]");
/// ```
pub fn list<'a, I: IntoIterator<Item = Term<'a>>>(terms: I) -> Term<'a> {
let terms = terms.into_iter().map(|x| x.0).collect::<Vec<_>>();
Term(T::List(terms))
}
/// Term corresponding to a dictionary of items
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(dict([
///     ("a", string("Hello").unwrap()),
///     ("b", string("world").unwrap())
/// ]).unwrap().encode(), b"{\n  a = Hello;\n  b = world;\n}");
/// ```
pub fn dict<'a, I: IntoIterator<Item = (&'a str, Term<'a>)>>(pairs: I) -> Result<'a> {
let mut tmp = HashMap::new();
for (k, v) in pairs {
if tmp.insert(Cow::from(k.as_bytes()), v.0).is_some() {
return Err(Error::DuplicateKey(k.to_string()));
}
}
Ok(Term(T::Dict(tmp)))
}
impl<'a> Term<'a> {
/// Append a term to an existing term.
/// Transforms the initial term into a seq if necessary.
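///
/// Example:
///
/// ```
/// use nettext::enc::*;
///
/// let t = string("hello").unwrap().append(string("world").unwrap());
/// assert_eq!(t.encode(), b"hello world");
/// ```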
#[must_use]
pub fn append(self, t: Term<'a>) -> Term<'a> {
match self.0 {
T::Seq(mut v) => {
v.push(t.0);
Term(T::Seq(v))
}
x => Term(T::Seq(vec![x, t.0])),
2022-11-17 16:55:50 +00:00
}
}
/// Inserts a key-value pair into a term that is a dictionary.
/// Fails if `self` is not a dictionary.
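///
/// Example:
///
/// ```
/// use nettext::enc::*;
///
/// let d = dict([("a", string("hello").unwrap())]).unwrap()
///     .insert("b", string("world").unwrap()).unwrap();
/// assert_eq!(d.encode(), b"{\n  a = hello;\n  b = world;\n}");
/// ```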
pub fn insert(self, k: &'a str, v: Term<'a>) -> Result<'a> {
match self.0 {
T::Dict(mut d) => {
if d.insert(Cow::from(k.as_bytes()), v.0).is_some() {
return Err(Error::DuplicateKey(k.to_string()));
}
Ok(Term(T::Dict(d)))
}
_ => Err(Error::NotADictionnary),
2022-11-17 16:55:50 +00:00
}
}
}
// ---- additional internal functions for serde module ----
#[cfg(feature = "serde")]
pub(crate) fn dict_owned_u8<'a, I: IntoIterator<Item = (Vec<u8>, Term<'a>)>>(
pairs: I,
) -> Result<'a> {
let mut tmp = HashMap::new();
for (k, v) in pairs {
tmp.insert(Cow::from(k), v.0);
}
Ok(Term(T::Dict(tmp)))
}
#[cfg(feature = "serde")]
pub(crate) fn safe_raw(bytes: &[u8]) -> Term<'_> {
Term(T::Str(bytes))
}
#[cfg(feature = "serde")]
pub(crate) fn safe_raw_owned(bytes: Vec<u8>) -> Term<'static> {
Term(T::OwnedStr(bytes))
2022-11-17 16:55:50 +00:00
}
// ---- encoding function ----
impl<'a> Term<'a> {
/// Generate the nettext representation of a term
pub fn encode(self) -> Vec<u8> {
let mut buf = Vec::with_capacity(128);
self.0.encode_aux(&mut buf, 0, true);
buf
}
/// Generate the nettext representation of a term, as a String
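///
/// Example:
///
/// ```
/// use nettext::enc::*;
///
/// assert_eq!(string("hello world").unwrap().encode_string(), "hello world");
/// ```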
pub fn encode_string(self) -> String {
unsafe { String::from_utf8_unchecked(self.encode()) }
}
/// Generate the concise nettext representation of a term
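///
/// Example (same dictionary as in the `dict` doctest, in concise form):
///
/// ```
/// use nettext::enc::*;
///
/// let t = dict([
///     ("a", string("Hello").unwrap()),
///     ("b", string("world").unwrap()),
/// ]).unwrap();
/// assert_eq!(t.encode_concise(), b"{a=Hello;b=world}");
/// ```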
pub fn encode_concise(self) -> Vec<u8> {
let mut buf = Vec::with_capacity(128);
self.0.encode_concise_aux(&mut buf);
buf
}
}
impl<'a> T<'a> {
// Pretty-printing encoder. `indent` is the current indentation level (in spaces);
// when `is_toplevel` is true, long sequences are not wrapped onto new lines.
fn encode_aux(self, buf: &mut Vec<u8>, indent: usize, is_toplevel: bool) {
match self {
T::Str(s) => buf.extend_from_slice(s),
T::OwnedStr(s) => buf.extend_from_slice(&s),
T::Dict(mut d) => {
if d.is_empty() {
buf.extend_from_slice(&[DICT_OPEN, DICT_CLOSE]);
} else if d.len() == 1 {
let (k, v) = d.into_iter().next().unwrap();
buf.extend_from_slice(&[DICT_OPEN, b' ']);
buf.extend_from_slice(k.borrow());
buf.extend_from_slice(&[b' ', DICT_ASSIGN, b' ']);
v.encode_aux(buf, indent + 2, false);
buf.extend_from_slice(&[b' ', DICT_CLOSE]);
} else {
buf.extend_from_slice(&[DICT_OPEN, b'\n']);
let indent2 = indent + 2;
let mut keys = d.keys().cloned().collect::<Vec<_>>();
keys.sort();
for k in keys {
let v = d.remove(&k).unwrap();
for _ in 0..indent2 {
buf.push(b' ');
}
buf.extend_from_slice(k.borrow());
buf.extend_from_slice(&[b' ', DICT_ASSIGN, b' ']);
v.encode_aux(buf, indent2, false);
buf.extend_from_slice(&[DICT_DELIM, b'\n']);
}
for _ in 0..indent {
buf.push(b' ');
}
buf.push(DICT_CLOSE);
}
}
T::List(l) => {
if l.is_empty() {
buf.extend_from_slice(&[LIST_OPEN, LIST_CLOSE]);
} else if l.len() == 1 {
buf.extend_from_slice(&[LIST_OPEN, b' ']);
l.into_iter()
.next()
.unwrap()
.encode_aux(buf, indent + 2, false);
buf.extend_from_slice(&[b' ', LIST_CLOSE]);
} else {
let indent2 = indent + 2;
buf.extend_from_slice(&[LIST_OPEN, b'\n']);
for item in l {
for _ in 0..indent2 {
buf.push(b' ');
}
item.encode_aux(buf, indent2, false);
buf.extend_from_slice(&[LIST_DELIM, b'\n']);
}
for _ in 0..indent {
buf.push(b' ');
}
buf.push(LIST_CLOSE);
}
}
T::Seq(l) => {
let indent2 = indent + 2;
for (i, v) in l.into_iter().enumerate() {
// Continue on a new indented line once the current line reaches 70 characters
// (never for the top-level sequence).
if !is_toplevel && buf.iter().rev().take_while(|c| **c != b'\n').count() >= 70 {
buf.push(b'\n');
for _ in 0..indent2 {
buf.push(b' ');
}
} else if i > 0 {
buf.push(b' ');
}
v.encode_aux(buf, indent2, is_toplevel);
}
}
}
}
fn encode_concise_aux(self, buf: &mut Vec<u8>) {
match self {
T::Str(s) => buf.extend_from_slice(s),
T::OwnedStr(s) => buf.extend_from_slice(&s),
T::Dict(mut d) => {
buf.push(DICT_OPEN);
let mut keys = d.keys().cloned().collect::<Vec<_>>();
keys.sort();
for (i, k) in keys.into_iter().enumerate() {
if i > 0 {
buf.push(DICT_DELIM);
}
let v = d.remove(&k).unwrap();
buf.extend_from_slice(k.borrow());
buf.push(DICT_ASSIGN);
v.encode_concise_aux(buf);
}
buf.push(DICT_CLOSE);
}
T::List(l) => {
buf.push(LIST_OPEN);
for (i, item) in l.into_iter().enumerate() {
if i > 0 {
buf.push(LIST_DELIM);
}
item.encode_concise_aux(buf);
}
buf.push(LIST_CLOSE);
}
T::Seq(l) => {
for (i, v) in l.into_iter().enumerate() {
if i > 0 {
buf.push(b' ');
}
v.encode_concise_aux(buf);
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::debug;
#[test]
fn complex1() {
let input = seq([
string("HELLO").unwrap(),
string("alexhelloworld").unwrap(),
list([string("dude").unwrap(), string("why").unwrap()]),
dict([
("from", string("jxx").unwrap()),
("subject", string("hello").unwrap()),
("data", raw(b"{ f1 = plop; f2 = kuko }").unwrap()),
])
.unwrap(),
])
.unwrap();
let expected = "HELLO alexhelloworld [
2023-05-10 10:05:25 +00:00
dude;
why;
] {
2023-05-10 10:05:25 +00:00
data = { f1 = plop; f2 = kuko };
from = jxx;
subject = hello;
2022-11-17 16:55:50 +00:00
}";
assert_eq!(debug(&input.encode()), expected);
}
#[test]
fn complex1_concise() {
let input = seq([
string("HELLO").unwrap(),
string("alexhelloworld").unwrap(),
list([string("dude").unwrap(), string("why").unwrap()]),
dict([
("from", string("jxx").unwrap()),
("subject", string("hello").unwrap()),
("data", raw(b"{ f1 = plop; f2 = kuko }").unwrap()),
])
.unwrap(),
])
.unwrap();
let expected_concise = "HELLO alexhelloworld [dude;why] {data={ f1 = plop; f2 = kuko };from=jxx;subject=hello}";
assert_eq!(debug(&input.encode_concise()), expected_concise);
}
}