implement comment foldable whitespace

2023-06-12 16:05:06 +02:00 · 2023-06-12 16:05:06 +02:00 · 6e76fed684
commit 6e76fed684
parent 7d3b3ff053
6 changed files with 213 additions and 62 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,7 @@
 # imf-codec

+**Work in progress, do not use in production**
+
+**Focus: correctness over performance**
+
 **This is currently only a decoder (parser), encoding is not supported.**
--- a/src/abnf.rs
+++ b/src/abnf.rs
@ -1,50 +0,0 @@
-use nom::{
-    IResult,
-    branch::alt,
-    bytes::complete::{tag, take_while1},
-    character::complete::{crlf, space0, space1},
-    combinator::opt,
-    sequence::terminated,
-};
-
-/// ABNF rfc5234
-
-/// Permissive CRLF
-///
-/// Theoretically, all lines must end with \r\n
-/// but mail servers support malformated emails,
-/// for example with only \n eol. It works because
-/// \r\n is allowed nowhere else, so we also add this support.
-pub fn perm_crlf(input: &str) -> IResult<&str, &str> {
-    alt((crlf, tag("\r"), tag("\n")))(input)
-}
-
-// Note: WSP = SP / HTAB = %x20 / %x09
-// nom::*::space0 = *WSP
-// nom::*::space1 = 1*WSP
-
-/// Parse a folding white space
-///
-/// Folding white space are used for long headers splitted on multiple lines
-///
-/// ```abnf
-/// FWS             =   ([*WSP CRLF] 1*WSP) /  obs-FWS
-/// obs-FWS         =   1*WSP *(CRLF 1*WSP)
-/// ```
-pub fn fws(input: &str) -> IResult<&str, &str> {
-    let (input, _) = opt(terminated(space0, perm_crlf))(input)?;
-    // @FIXME: not implemented obs-FWS
-    space1(input)
-}
-
-/// Sequence of visible chars with the UTF-8 extension
-///
-/// ```abnf
-/// VCHAR   =  %x21-7E
-///            ; visible (printing) characters
-/// VCHAR   =/  UTF8-non-ascii
-/// SEQ     = 1*VCHAR
-///```
-pub fn vchar_seq(input: &str) -> IResult<&str, &str> {
-   take_while1(|c: char| (c >= '\x21' && c <= '\x7E') || !c.is_ascii())(input)
-}
--- a/src/headers.rs
+++ b/src/headers.rs
@ -11,18 +11,18 @@ use nom::{
    sequence::tuple,
 };

-use crate::abnf::{fws, vchar_seq, perm_crlf};
-use crate::model::{HeaderSection, HeaderDate};
+use crate::tokens::{perm_fws, vchar_seq, perm_crlf};
+use crate::model::{PermissiveHeaderSection, HeaderDate, MailboxRef};

 /// HEADERS

 /// Header section
 ///
 /// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2
-pub fn header_section(input: &str) -> IResult<&str, HeaderSection> {
+pub fn header_section(input: &str) -> IResult<&str, PermissiveHeaderSection> {
    let (input, headers) = fold_many0(
        header_field,
-        HeaderSection::default,
+        PermissiveHeaderSection::default,
        |mut section, head| {
            match head {
                HeaderField::Date(d) => {
@ -114,7 +114,11 @@ fn header_field(input: &str) -> IResult<&str, HeaderField> {
            };
            (input, HeaderField::Date(date))
        },
-        //"From" => unimplemented!(),
+        "From" => {
+           let (input, mbx) = mailbox(input)?;
+           //many0(
+           unimplemented!()
+        },
        "Sender" => unimplemented!(),
        "Subject" => {
            let (input, body) = unstructured(input)?;
@ -136,17 +140,17 @@ fn header_field(input: &str) -> IResult<&str, HeaderField> {
 /// unstructured    =   (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct
 /// ```
 fn unstructured(input: &str) -> IResult<&str, String> {
-    let (input, r) = many0(tuple((opt(fws), vchar_seq)))(input)?;
+    let (input, r) = many0(tuple((opt(perm_fws), vchar_seq)))(input)?;
    let (input, _) = space0(input)?;

    // Try to optimize for the most common cases
    let body = match r.as_slice() {
        [(None, content)] => content.to_string(),
-        [(Some(ws), content)] => ws.to_string() + content,
+        [(Some(_), content)] => " ".to_string() + content,
        lines => lines.iter().fold(String::with_capacity(255), |acc, item| {
            let (may_ws, content) = item;
            match may_ws {
-                Some(ws) => acc + ws + content,
+                Some(ws) => acc + " " + content,
                None => acc + content,
            }
        }),
@ -155,3 +159,6 @@ fn unstructured(input: &str) -> IResult<&str, String> {
    Ok((input, body))
 }

+fn mailbox(input: &str) -> IResult<&str, MailboxRef> {
+    unimplemented!();
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -1,3 +1,3 @@
 pub mod headers;
 pub mod model;
-mod abnf;
+mod tokens;
--- a/src/model.rs
+++ b/src/model.rs
@ -9,10 +9,37 @@ pub enum HeaderDate {
    None,
 }

+#[derive(Debug)]
+pub struct MailboxRef<'a> {
+    // The actual "email address" like hello@example.com
+    pub addrspec: &'a str,
+    pub name: Option<&'a str>,
+}
+
+#[derive(Debug)]
+pub struct GroupRef<'a> {
+    pub name: &'a str,
+    pub mbx: Vec<MailboxRef<'a>>,
+}
+
+#[derive(Debug)]
+pub enum AddressRef<'a> {
+    Single(MailboxRef<'a>),
+    Many(GroupRef<'a>),
+}
+
+/// Permissive Header Section
+///
+/// This is a structure intended for parsing/decoding,
+/// hence it's support cases where the email is considered
+/// as invalid according to RFC5322 but for which we can
+/// still extract some data.
 #[derive(Debug, Default)]
-pub struct HeaderSection<'a> {
+pub struct PermissiveHeaderSection<'a> {
    pub subject: Option<String>,
-    pub from: Vec<String>,
+    pub from: Vec<MailboxRef<'a>>,
+    pub sender: Option<MailboxRef<'a>>,
+    pub reply_to: Vec<AddressRef<'a>>,
    pub date: HeaderDate,
    pub optional: HashMap<&'a str, String>,
 }
@ -21,7 +48,15 @@ enum InvalidEmailErr {
    NoUsableDate,
 }

-impl<'a> HeaderSection<'a> {
+impl<'a> PermissiveHeaderSection<'a> {
+    /// Check validity of the email
+    ///
+    /// In particular, check that no required field is missing
+    /// and that no unique field is declared multiple times.
+    ///
+    /// See: https://www.rfc-editor.org/rfc/rfc5322#section-3.6
+    //@FIXME could be changed to a to_StrictHeaderSection call. All fixed errors would be returned in
+    // a vec of errors.
    fn is_valid(&self) -> Result<(), InvalidEmailErr> {
        match self.date {
            HeaderDate::Parsed(_) => (),
--- a/src/tokens.rs
+++ b/src/tokens.rs
@ -0,0 +1,155 @@
+use nom::{
+    IResult,
+    branch::alt,
+    bytes::complete::{tag, take_while1},
+    character::complete::{crlf, satisfy, space0, space1},
+    combinator::{recognize, opt},
+    multi::{many0, many1},
+    sequence::{preceded, terminated, tuple},
+};
+
+/// Lexical tokens
+///
+/// Approximately maps to section 3.2 of RFC 5322:
+/// https://www.rfc-editor.org/rfc/rfc5322#section-3.2
+/// Also https://datatracker.ietf.org/doc/html/rfc6532
+
+/// Permissive CRLF
+///
+/// Theoretically, all lines must end with \r\n
+/// but some mail servers like Dovecot support malformed emails,
+/// for example with only \n as end of line. It works because
+/// \r and \n are allowed nowhere else, so we also add this support.
+pub fn perm_crlf(input: &str) -> IResult<&str, &str> {
+    alt((crlf, tag("\r"), tag("\n")))(input)
+}
+
+// Note: WSP = SP / HTAB = %x20 / %x09
+// nom::*::space0 = *WSP
+// nom::*::space1 = 1*WSP
+
+/// Quoted pair
+///
+/// ```abnf
+///    quoted-pair     =   ("\" (VCHAR / WSP)) / obs-qp
+/// ```
+pub fn quoted_pair(input: &str) -> IResult<&str, char> {
+    preceded(tag("\\"), satisfy(|c| is_vchar(c) || c == '\t' || c == ' '))(input)
+}
+
+/// Permissive foldable white space
+///
+/// Folding white space is used for long headers split across multiple lines.
+/// The obsolete syntax allows multiple lines without content; implemented for compatibility
+/// reasons.
+pub fn perm_fws(input: &str) -> IResult<&str, &str> {
+    alt((recognize(many1(fold_marker)), space1))(input)
+}
+fn fold_marker(input: &str) -> IResult<&str, &str> {
+   let (input, _) = space0(input)?;
+   let (input, _) = perm_crlf(input)?;
+   space1(input)
+}
+
+
+/// Folding White Space with Comment
+///
+/// Note: we drop the comments for now...  
+/// ```abnf
+///   ctext           =   %d33-39 /          ; Printable US-ASCII
+///                       %d42-91 /          ;  characters not including
+///                       %d93-126 /         ;  "(", ")", or "\"
+///                       obs-ctext
+///
+///   ccontent        =   ctext / quoted-pair / comment
+///
+///   comment         =   "(" *([FWS] ccontent) [FWS] ")"
+///
+///   CFWS            =   (1*([FWS] comment) [FWS]) / FWS
+/// ```
+pub fn cfws(input: &str) -> IResult<&str, &str> {
+    alt((perm_fws, recognize(comments)))(input)
+}
+
+pub fn comments(input: &str) -> IResult<&str, ()> {
+    let (input, _) = many1(tuple((opt(perm_fws), comment)))(input)?;
+    let (input, _) = opt(perm_fws)(input)?;
+    Ok((input, ()))
+}
+
+pub fn comment(input: &str) -> IResult<&str, ()> {
+    let (input, _) = tag("(")(input)?;
+    let (input, _) = many0(tuple((opt(perm_fws), ccontent)))(input)?;
+    let (input, _) = opt(perm_fws)(input)?;
+    let (input, _) = tag(")")(input)?;
+    Ok((input, ()))
+}
+
+pub fn ccontent(input: &str) -> IResult<&str, &str> {
+   alt((recognize(ctext), recognize(quoted_pair), recognize(comment)))(input) 
+}
+
+pub fn ctext(input: &str) -> IResult<&str, char> {
+    satisfy(is_ctext)(input)
+}
+
+/// Check if it's a comment text character
+///
+/// ```abnf
+///   ctext           =   %d33-39 /          ; Printable US-ASCII
+///                       %d42-91 /          ;  characters not including
+///                       %d93-126 /         ;  "(", ")", or "\"
+///                       obs-ctext
+///```
+pub fn is_ctext(c: char) -> bool {
+    (c >= '\x21' && c <= '\x27') || (c >= '\x2A' && c <= '\x5B') || (c >= '\x5D' && c <= '\x7E') || !c.is_ascii()
+}
+
+/// VCHAR definition
+pub fn is_vchar(c: char) -> bool {
+  (c >= '\x21' && c <= '\x7E') || !c.is_ascii()
+}
+
+/// Sequence of visible chars with the UTF-8 extension
+///
+/// ```abnf
+/// VCHAR   =  %x21-7E
+///            ; visible (printing) characters
+/// VCHAR   =/  UTF8-non-ascii
+/// SEQ     = 1*VCHAR
+///```
+pub fn vchar_seq(input: &str) -> IResult<&str, &str> {
+   take_while1(is_vchar)(input)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use nom;
+
+    #[test]
+    fn test_vchar_seq() {
+        assert_eq!(vchar_seq("hello world"), Ok((" world", "hello")));
+        assert_eq!(vchar_seq("hello👋 world"), Ok((" world", "hello👋")));
+    }
+
+    #[test]
+    fn test_perm_crlf() {
+        assert_eq!(perm_crlf("\rworld"), Ok(("world", "\r")));
+        assert_eq!(perm_crlf("\r\nworld"), Ok(("world", "\r\n")));
+        assert_eq!(perm_crlf("\nworld"), Ok(("world", "\n")));
+    }
+
+    #[test]
+    fn test_perm_fws() {
+        assert_eq!(perm_fws("\r\n world"), Ok(("world", "\r\n ")));
+        assert_eq!(perm_fws(" \r\n \r\n world"), Ok(("world", " \r\n \r\n ")));
+        assert_eq!(perm_fws(" world"), Ok(("world", " ")));
+        assert!(perm_fws("\r\nFrom: test").is_err());
+    }
+
+    #[test]
+    fn test_cfws() {
+        assert_eq!(cfws("(A nice \\) chap) <pete(his account)@silly.test(his host)>"), Ok(("<pete(his account)@silly.test(his host)>", "(A nice \\) chap) ")));
+    }
+}