From 6e76fed684359eeb9f520006b8d0459c9eb66cf1 Mon Sep 17 00:00:00 2001 From: Quentin Dufour Date: Mon, 12 Jun 2023 16:05:06 +0200 Subject: [PATCH] implement comment foldable whitespace --- README.md | 4 ++ src/abnf.rs | 50 ---------------- src/headers.rs | 23 +++++--- src/lib.rs | 2 +- src/model.rs | 41 ++++++++++++- src/tokens.rs | 155 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 213 insertions(+), 62 deletions(-) delete mode 100644 src/abnf.rs create mode 100644 src/tokens.rs diff --git a/README.md b/README.md index 7b0555e..73052ab 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ # imf-codec +**Work in progress, do not use in production** + +**Focus: correctness over performance** + **This is currently only a decoder (parser), encoding is not supported.** diff --git a/src/abnf.rs b/src/abnf.rs deleted file mode 100644 index 8a6626c..0000000 --- a/src/abnf.rs +++ /dev/null @@ -1,50 +0,0 @@ -use nom::{ - IResult, - branch::alt, - bytes::complete::{tag, take_while1}, - character::complete::{crlf, space0, space1}, - combinator::opt, - sequence::terminated, -}; - -/// ABNF rfc5234 - -/// Permissive CRLF -/// -/// Theoretically, all lines must end with \r\n -/// but mail servers support malformated emails, -/// for example with only \n eol. It works because -/// \r\n is allowed nowhere else, so we also add this support. 
-pub fn perm_crlf(input: &str) -> IResult<&str, &str> { - alt((crlf, tag("\r"), tag("\n")))(input) -} - -// Note: WSP = SP / HTAB = %x20 / %x09 -// nom::*::space0 = *WSP -// nom::*::space1 = 1*WSP - -/// Parse a folding white space -/// -/// Folding white space are used for long headers splitted on multiple lines -/// -/// ```abnf -/// FWS = ([*WSP CRLF] 1*WSP) / obs-FWS -/// obs-FWS = 1*WSP *(CRLF 1*WSP) -/// ``` -pub fn fws(input: &str) -> IResult<&str, &str> { - let (input, _) = opt(terminated(space0, perm_crlf))(input)?; - // @FIXME: not implemented obs-FWS - space1(input) -} - -/// Sequence of visible chars with the UTF-8 extension -/// -/// ```abnf -/// VCHAR = %x21-7E -/// ; visible (printing) characters -/// VCHAR =/ UTF8-non-ascii -/// SEQ = 1*VCHAR -///``` -pub fn vchar_seq(input: &str) -> IResult<&str, &str> { - take_while1(|c: char| (c >= '\x21' && c <= '\x7E') || !c.is_ascii())(input) -} diff --git a/src/headers.rs b/src/headers.rs index cf832c4..bdcad2f 100644 --- a/src/headers.rs +++ b/src/headers.rs @@ -11,18 +11,18 @@ use nom::{ sequence::tuple, }; -use crate::abnf::{fws, vchar_seq, perm_crlf}; -use crate::model::{HeaderSection, HeaderDate}; +use crate::tokens::{perm_fws, vchar_seq, perm_crlf}; +use crate::model::{PermissiveHeaderSection, HeaderDate, MailboxRef}; /// HEADERS /// Header section /// /// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2 -pub fn header_section(input: &str) -> IResult<&str, HeaderSection> { +pub fn header_section(input: &str) -> IResult<&str, PermissiveHeaderSection> { let (input, headers) = fold_many0( header_field, - HeaderSection::default, + PermissiveHeaderSection::default, |mut section, head| { match head { HeaderField::Date(d) => { @@ -114,7 +114,11 @@ fn header_field(input: &str) -> IResult<&str, HeaderField> { }; (input, HeaderField::Date(date)) }, - //"From" => unimplemented!(), + "From" => { + let (input, mbx) = mailbox(input)?; + //many0( + unimplemented!() + }, "Sender" => unimplemented!(), 
"Subject" => { let (input, body) = unstructured(input)?; @@ -136,17 +140,17 @@ fn header_field(input: &str) -> IResult<&str, HeaderField> { /// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct /// ``` fn unstructured(input: &str) -> IResult<&str, String> { - let (input, r) = many0(tuple((opt(fws), vchar_seq)))(input)?; + let (input, r) = many0(tuple((opt(perm_fws), vchar_seq)))(input)?; let (input, _) = space0(input)?; // Try to optimize for the most common cases let body = match r.as_slice() { [(None, content)] => content.to_string(), - [(Some(ws), content)] => ws.to_string() + content, + [(Some(_), content)] => " ".to_string() + content, lines => lines.iter().fold(String::with_capacity(255), |acc, item| { let (may_ws, content) = item; match may_ws { - Some(ws) => acc + ws + content, + Some(ws) => acc + " " + content, None => acc + content, } }), @@ -155,3 +159,6 @@ fn unstructured(input: &str) -> IResult<&str, String> { Ok((input, body)) } +fn mailbox(input: &str) -> IResult<&str, MailboxRef> { + unimplemented!(); +} diff --git a/src/lib.rs b/src/lib.rs index 023e824..6f7d11d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,3 @@ pub mod headers; pub mod model; -mod abnf; +mod tokens; diff --git a/src/model.rs b/src/model.rs index f2db3b2..b1d23c5 100644 --- a/src/model.rs +++ b/src/model.rs @@ -9,10 +9,37 @@ pub enum HeaderDate { None, } +#[derive(Debug)] +pub struct MailboxRef<'a> { + // The actual "email address" like hello@example.com + pub addrspec: &'a str, + pub name: Option<&'a str>, +} + +#[derive(Debug)] +pub struct GroupRef<'a> { + pub name: &'a str, + pub mbx: Vec>, +} + +#[derive(Debug)] +pub enum AddressRef<'a> { + Single(MailboxRef<'a>), + Many(GroupRef<'a>), +} + +/// Permissive Header Section +/// +/// This is a structure intended for parsing/decoding, +/// hence it's support cases where the email is considered +/// as invalid according to RFC5322 but for which we can +/// still extract some data. 
#[derive(Debug, Default)] -pub struct HeaderSection<'a> { +pub struct PermissiveHeaderSection<'a> { pub subject: Option, - pub from: Vec, + pub from: Vec>, + pub sender: Option>, + pub reply_to: Vec>, pub date: HeaderDate, pub optional: HashMap<&'a str, String>, } @@ -21,7 +48,15 @@ enum InvalidEmailErr { NoUsableDate, } -impl<'a> HeaderSection<'a> { +impl<'a> PermissiveHeaderSection<'a> { + /// Check validity of the email + /// + /// Especially check that there are no missing fields, + /// or no unique fields declared multiple times. + /// + /// See: https://www.rfc-editor.org/rfc/rfc5322#section-3.6 + //@FIXME could be changed to a to_StrictHeaderSection call. All fixed errors would be returned in + // a vec of errors. fn is_valid(&self) -> Result<(), InvalidEmailErr> { match self.date { HeaderDate::Parsed(_) => (), diff --git a/src/tokens.rs b/src/tokens.rs new file mode 100644 index 0000000..de76c99 --- /dev/null +++ b/src/tokens.rs @@ -0,0 +1,155 @@ +use nom::{ + IResult, + branch::alt, + bytes::complete::{tag, take_while1}, + character::complete::{crlf, satisfy, space0, space1}, + combinator::{recognize, opt}, + multi::{many0, many1}, + sequence::{preceded, terminated, tuple}, +}; + +/// Lexical tokens +/// +/// Approx. maps to section 3.2 of the RFC +/// https://www.rfc-editor.org/rfc/rfc5322#section-3.2 +/// Also https://datatracker.ietf.org/doc/html/rfc6532 + +/// Permissive CRLF +/// +/// Theoretically, all lines must end with \r\n +/// but some mail servers like Dovecot support malformed emails, +/// for example with only \n eol. It works because +/// \r or \n is allowed nowhere else, so we also add this support. 
+pub fn perm_crlf(input: &str) -> IResult<&str, &str> { + alt((crlf, tag("\r"), tag("\n")))(input) +} + +// Note: WSP = SP / HTAB = %x20 / %x09 +// nom::*::space0 = *WSP +// nom::*::space1 = 1*WSP + +/// Quoted pair +/// +/// ```abnf +/// quoted-pair = ("\" (VCHAR / WSP)) / obs-qp +/// ``` +pub fn quoted_pair(input: &str) -> IResult<&str, char> { + preceded(tag("\\"), satisfy(|c| is_vchar(c) || c == '\t' || c == ' '))(input) +} + +/// Permissive foldable white space +/// +/// Folding white space is used for long headers split across multiple lines. +/// The obsolete syntax allows multiple lines without content; implemented for compatibility +/// reasons +pub fn perm_fws(input: &str) -> IResult<&str, &str> { + alt((recognize(many1(fold_marker)), space1))(input) +} +fn fold_marker(input: &str) -> IResult<&str, &str> { + let (input, _) = space0(input)?; + let (input, _) = perm_crlf(input)?; + space1(input) +} + + +/// Folding White Space with Comment +/// +/// Note: we drop the comments for now... 
+/// +/// ctext = %d33-39 / ; Printable US-ASCII +/// %d42-91 / ; characters not including +/// %d93-126 / ; "(", ")", or "\" +/// obs-ctext +/// +/// ccontent = ctext / quoted-pair / comment +/// +/// comment = "(" *([FWS] ccontent) [FWS] ")" +/// +/// CFWS = (1*([FWS] comment) [FWS]) / FWS +/// ``` +pub fn cfws(input: &str) -> IResult<&str, &str> { + alt((perm_fws, recognize(comments)))(input) +} + +pub fn comments(input: &str) -> IResult<&str, ()> { + let (input, _) = many1(tuple((opt(perm_fws), comment)))(input)?; + let (input, _) = opt(perm_fws)(input)?; + Ok((input, ())) +} + +pub fn comment(input: &str) -> IResult<&str, ()> { + let (input, _) = tag("(")(input)?; + let (input, _) = many0(tuple((opt(perm_fws), ccontent)))(input)?; + let (input, _) = opt(perm_fws)(input)?; + let (input, _) = tag(")")(input)?; + Ok((input, ())) +} + +pub fn ccontent(input: &str) -> IResult<&str, &str> { + alt((recognize(ctext), recognize(quoted_pair), recognize(comment)))(input) +} + +pub fn ctext(input: &str) -> IResult<&str, char> { + satisfy(is_ctext)(input) +} + +/// Check if it's a comment text character +/// +/// ```abnf +/// ctext = %d33-39 / ; Printable US-ASCII +/// %d42-91 / ; characters not including +/// %d93-126 / ; "(", ")", or "\" +/// obs-ctext +///``` +pub fn is_ctext(c: char) -> bool { + (c >= '\x21' && c <= '\x27') || (c >= '\x2A' && c <= '\x5B') || (c >= '\x5D' && c <= '\x7E') || !c.is_ascii() +} + +/// VCHAR definition +pub fn is_vchar(c: char) -> bool { + (c >= '\x21' && c <= '\x7E') || !c.is_ascii() +} + +/// Sequence of visible chars with the UTF-8 extension +/// +/// ```abnf +/// VCHAR = %x21-7E +/// ; visible (printing) characters +/// VCHAR =/ UTF8-non-ascii +/// SEQ = 1*VCHAR +///``` +pub fn vchar_seq(input: &str) -> IResult<&str, &str> { + take_while1(is_vchar)(input) +} + +#[cfg(test)] +mod tests { + use super::*; + use nom; + + #[test] + fn test_vchar_seq() { + assert_eq!(vchar_seq("hello world"), Ok((" world", "hello"))); + 
assert_eq!(vchar_seq("hello👋 world"), Ok((" world", "hello👋"))); + } + + #[test] + fn test_perm_crlf() { + assert_eq!(perm_crlf("\rworld"), Ok(("world", "\r"))); + assert_eq!(perm_crlf("\r\nworld"), Ok(("world", "\r\n"))); + assert_eq!(perm_crlf("\nworld"), Ok(("world", "\n"))); + } + + #[test] + fn test_perm_fws() { + assert_eq!(perm_fws("\r\n world"), Ok(("world", "\r\n "))); + assert_eq!(perm_fws(" \r\n \r\n world"), Ok(("world", " \r\n \r\n "))); + assert_eq!(perm_fws(" world"), Ok(("world", " "))); + assert!(perm_fws("\r\nFrom: test").is_err()); + } + + #[test] + fn test_cfws() { + assert_eq!(cfws("(A nice \\) chap) "), Ok(("", "(A nice \\) chap) "))); + } +}