diff --git a/.gitignore b/.gitignore index ea8c4bf..a8f3b05 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +.*sw* diff --git a/src/headers.rs b/src/headers.rs index 1d737e5..b87c9fd 100644 --- a/src/headers.rs +++ b/src/headers.rs @@ -11,7 +11,9 @@ use nom::{ sequence::tuple, }; -use crate::tokens::{fws, vchar_seq, perm_crlf, unstructured}; +use crate::whitespace::{fws, perm_crlf}; +use crate::words::vchar_seq; +use crate::misc_token::unstructured; use crate::model::{PermissiveHeaderSection, HeaderDate, MailboxRef}; /// HEADERS diff --git a/src/lib.rs b/src/lib.rs index 6f7d11d..0996105 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,6 @@ pub mod headers; pub mod model; -mod tokens; +mod whitespace; +mod words; +mod quoted; +mod misc_token; diff --git a/src/misc_token.rs b/src/misc_token.rs new file mode 100644 index 0000000..a21cdbc --- /dev/null +++ b/src/misc_token.rs @@ -0,0 +1,69 @@ +use std::borrow::Cow; +use nom::{ + IResult, + branch::alt, + character::complete::space0, + combinator::{into, opt}, + multi::{many0, many1}, + sequence::{pair, tuple}, +}; + +use crate::quoted::quoted_string; +use crate::whitespace::fws; +use crate::words::{atom, vchar_seq}; + +/// Word +/// +/// ```abnf +/// word = atom / quoted-string +/// ``` +pub fn word(input: &str) -> IResult<&str, Cow> { + alt((into(quoted_string), into(atom)))(input) +} + +/// Phrase +/// +/// ```abnf +/// phrase = 1*word / obs-phrase +/// ``` +pub fn phrase(input: &str) -> IResult<&str, String> { + let (input, words) = many1(word)(input)?; + let phrase = words.join(" "); + Ok((input, phrase)) +} + +/// Unstructured header field body +/// +/// ```abnf +/// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct +/// ``` +pub fn unstructured(input: &str) -> IResult<&str, String> { + let (input, r) = many0(tuple((opt(fws), vchar_seq)))(input)?; + let (input, _) = space0(input)?; + + // Try to optimize for the most common cases + let body = match r.as_slice() { + [(None, content)] => 
content.to_string(), + [(Some(_), content)] => " ".to_string() + content, + lines => lines.iter().fold(String::with_capacity(255), |acc, item| { + let (may_ws, content) = item; + match may_ws { + Some(_) => acc + " " + content, + None => acc + content, + } + }), + }; + + Ok((input, body)) +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_phrase() { + assert_eq!(phrase("hello world"), Ok(("", "hello world".into()))); + assert_eq!(phrase("salut \"le\" monde"), Ok(("", "salut le monde".into()))); + assert_eq!(phrase("fin\r\n du\r\nmonde"), Ok(("\r\nmonde", "fin du".into()))); + } +} diff --git a/src/quoted.rs b/src/quoted.rs new file mode 100644 index 0000000..4b8af27 --- /dev/null +++ b/src/quoted.rs @@ -0,0 +1,86 @@ +use nom::{ + IResult, + branch::alt, + bytes::complete::tag, + character::complete::satisfy, + combinator::opt, + multi::many0, + sequence::{pair, preceded}, +}; + +use crate::words::is_vchar; +use crate::whitespace::{fws, cfws}; + +/// Quoted pair +/// +/// ```abnf +/// quoted-pair = ("\" (VCHAR / WSP)) / obs-qp +/// ``` +pub fn quoted_pair(input: &str) -> IResult<&str, char> { + preceded(tag("\\"), satisfy(|c| is_vchar(c) || c == '\t' || c == ' '))(input) +} + +/// Allowed characters in a quoted string (obs-qtext is not handled) +/// +/// ```abnf +/// qtext = %d33 / ; Printable US-ASCII +/// %d35-91 / ; characters not including +/// %d93-126 / ; "\" or the quote character +/// obs-qtext +/// ``` +fn is_qtext(c: char) -> bool { + c == '\x21' || (c >= '\x23' && c <= '\x5B') || (c >= '\x5D' && c <= '\x7E') +} + +/// Quoted string content +/// +/// ```abnf +/// qcontent = qtext / quoted-pair +/// ``` +fn qcontent(input: &str) -> IResult<&str, char> { + alt((satisfy(is_qtext), quoted_pair))(input) +} + +/// Quoted string +/// +/// ```abnf +/// quoted-string = [CFWS] +/// DQUOTE *([FWS] qcontent) [FWS] DQUOTE +/// [CFWS] +/// ``` +pub fn quoted_string(input: &str) -> IResult<&str, String> { + let (input, _) = opt(cfws)(input)?; + let (input, _) = tag("\"")(input)?; + let 
(input, content) = many0(pair(opt(fws), qcontent))(input)?; + + // Rebuild string + let mut qstring = content.iter().fold( + String::with_capacity(16), + |mut acc, (maybe_wsp, c)| { + if let Some(wsp) = maybe_wsp { + acc.push(*wsp); + } + acc.push(*c); + acc + }); + + let (input, maybe_wsp) = opt(fws)(input)?; + if let Some(wsp) = maybe_wsp { + qstring.push(wsp); + } + + let (input, _) = tag("\"")(input)?; + let (input, _) = opt(cfws)(input)?; + Ok((input, qstring)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_quoted_string() { + assert_eq!(quoted_string(" \"hello\\\"world\" "), Ok(("", "hello\"world".to_string()))); + assert_eq!(quoted_string("\"hello\r\n world\""), Ok(("", "hello world".to_string()))); + } +} diff --git a/src/tokens.rs b/src/tokens.rs deleted file mode 100644 index 39cb688..0000000 --- a/src/tokens.rs +++ /dev/null @@ -1,327 +0,0 @@ -use std::borrow::Cow; -use nom::{ - IResult, - branch::alt, - bytes::complete::{tag, take_while1}, - character::complete::{crlf, satisfy, space0, space1}, - combinator::{into, recognize, opt}, - multi::{many0, many1}, - sequence::{delimited, pair, preceded, terminated, tuple}, -}; - -/// Lexical tokens -/// -/// Approx. 
maps to section 3.2 of the RFC -/// https://www.rfc-editor.org/rfc/rfc5322#section-3.2 -/// Also https://datatracker.ietf.org/doc/html/rfc6532 - -// quoted characters and strings - -/// Quoted pair -/// -/// ```abnf -/// quoted-pair = ("\" (VCHAR / WSP)) / obs-qp -/// ``` -pub fn quoted_pair(input: &str) -> IResult<&str, char> { - preceded(tag("\\"), satisfy(|c| is_vchar(c) || c == '\t' || c == ' '))(input) -} - -/// Allowed characters in quote -/// -/// ```abnf -/// qtext = %d33 / ; Printable US-ASCII -/// %d35-91 / ; characters not including -/// %d93-126 / ; "\" or the quote character -/// obs-qtext -/// ``` -fn is_qtext(c: char) -> bool { - c == '\x21' || (c >= '\x23' && c <= '\x5B') || (c >= '\x5D' && c <= '\x7E') -} - -/// Quoted pair content -/// -/// ```abnf -/// qcontent = qtext / quoted-pair -/// ``` -fn qcontent(input: &str) -> IResult<&str, char> { - alt((satisfy(is_qtext), quoted_pair))(input) -} - -/// Quoted string -/// -/// ```abnf -/// quoted-string = [CFWS] -/// DQUOTE *([FWS] qcontent) [FWS] DQUOTE -/// [CFWS] -/// ``` -pub fn quoted_string(input: &str) -> IResult<&str, String> { - let (input, _) = opt(cfws)(input)?; - let (input, _) = tag("\"")(input)?; - let (input, content) = many0(pair(opt(fws), qcontent))(input)?; - - // Rebuild string - let mut qstring = content.iter().fold( - String::with_capacity(16), - |mut acc, (maybe_wsp, c)| { - if let Some(wsp) = maybe_wsp { - acc.push(*wsp); - } - acc.push(*c); - acc - }); - - let (input, maybe_wsp) = opt(fws)(input)?; - if let Some(wsp) = maybe_wsp { - qstring.push(wsp); - } - - let (input, _) = tag("\"")(input)?; - let (input, _) = opt(cfws)(input)?; - Ok((input, qstring)) -} - -/// Word -/// -/// ```abnf -/// word = atom / quoted-string -/// ``` -pub fn word(input: &str) -> IResult<&str, Cow> { - alt((into(quoted_string), into(atom)))(input) -} - -/// Phrase -/// -/// ```abnf -/// phrase = 1*word / obs-phrase -/// ``` -pub fn phrase(input: &str) -> IResult<&str, String> { - let (input, words) = 
many1(word)(input)?; - let phrase = words.join(" "); - Ok((input, phrase)) -} - -/// Unstructured header field body -/// -/// ```abnf -/// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct -/// ``` -pub fn unstructured(input: &str) -> IResult<&str, String> { - let (input, r) = many0(tuple((opt(fws), vchar_seq)))(input)?; - let (input, _) = space0(input)?; - - // Try to optimize for the most common cases - let body = match r.as_slice() { - [(None, content)] => content.to_string(), - [(Some(_), content)] => " ".to_string() + content, - lines => lines.iter().fold(String::with_capacity(255), |acc, item| { - let (may_ws, content) = item; - match may_ws { - Some(_) => acc + " " + content, - None => acc + content, - } - }), - }; - - Ok((input, body)) -} - -// --- whitespaces and comments - -// Note: WSP = SP / HTAB = %x20 / %x09 -// nom::*::space0 = *WSP -// nom::*::space1 = 1*WSP - -/// Permissive CRLF -/// -/// Theoretically, all lines must end with \r\n -/// but some mail servers like Dovecot support malformated emails, -/// for example with only \n eol. It works because -/// \r or \n is allowed nowhere else, so we also add this support. -pub fn perm_crlf(input: &str) -> IResult<&str, &str> { - alt((crlf, tag("\r"), tag("\n")))(input) -} - -/// Permissive foldable white space -/// -/// Folding white space are used for long headers splitted on multiple lines. -/// The obsolete syntax allowes multiple lines without content; implemented for compatibility -/// reasons -pub fn fws(input: &str) -> IResult<&str, char> { - let (input, _) = alt((recognize(many1(fold_marker)), space1))(input)?; - Ok((input, ' ')) -} -fn fold_marker(input: &str) -> IResult<&str, &str> { - let (input, _) = space0(input)?; - let (input, _) = perm_crlf(input)?; - space1(input) -} - - -/// Folding White Space with Comment -/// -/// Note: we drop the comments for now... 
-/// -/// ctext = %d33-39 / ; Printable US-ASCII -/// %d42-91 / ; characters not including -/// %d93-126 / ; "(", ")", or "\" -/// obs-ctext -/// -/// ccontent = ctext / quoted-pair / comment -/// -/// comment = "(" *([FWS] ccontent) [FWS] ")" -/// -/// CFWS = (1*([FWS] comment) [FWS]) / FWS -/// ``` -pub fn cfws(input: &str) -> IResult<&str, &str> { - alt((recognize(comments), recognize(fws)))(input) -} - -pub fn comments(input: &str) -> IResult<&str, ()> { - let (input, _) = many1(tuple((opt(fws), comment)))(input)?; - let (input, _) = opt(fws)(input)?; - Ok((input, ())) -} - -pub fn comment(input: &str) -> IResult<&str, ()> { - let (input, _) = tag("(")(input)?; - let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?; - let (input, _) = opt(fws)(input)?; - let (input, _) = tag(")")(input)?; - Ok((input, ())) -} - -pub fn ccontent(input: &str) -> IResult<&str, &str> { - alt((recognize(ctext), recognize(quoted_pair), recognize(comment)))(input) -} - -pub fn ctext(input: &str) -> IResult<&str, char> { - satisfy(is_ctext)(input) -} - -/// Check if it's a comment text character -/// -/// ```abnf -/// ctext = %d33-39 / ; Printable US-ASCII -/// %d42-91 / ; characters not including -/// %d93-126 / ; "(", ")", or "\" -/// obs-ctext -///``` -pub fn is_ctext(c: char) -> bool { - (c >= '\x21' && c <= '\x27') || (c >= '\x2A' && c <= '\x5B') || (c >= '\x5D' && c <= '\x7E') || !c.is_ascii() -} - -// atoms, words, phrases, vchar - -/// VCHAR definition -pub fn is_vchar(c: char) -> bool { - (c >= '\x21' && c <= '\x7E') || !c.is_ascii() -} - -/// Sequence of visible chars with the UTF-8 extension -/// -/// ```abnf -/// VCHAR = %x21-7E -/// ; visible (printing) characters -/// VCHAR =/ UTF8-non-ascii -/// SEQ = 1*VCHAR -///``` -pub fn vchar_seq(input: &str) -> IResult<&str, &str> { - take_while1(is_vchar)(input) -} - -/// Atom allowed characters -fn is_atext(c: char) -> bool { - c.is_ascii_alphanumeric() || "!#$%&'*+-/=?^_`{|}~".contains(c) -} - -/// Atom -/// -/// `[CFWS] 
1*atext [CFWS]` -fn atom(input: &str) -> IResult<&str, &str> { - delimited(opt(cfws), take_while1(is_atext), opt(cfws))(input) -} - -/// dot-atom-text -/// -/// `1*atext *("." 1*atext)` -fn dot_atom_text(input: &str) -> IResult<&str, &str> { - recognize(pair(take_while1(is_atext), many0(pair(tag("."), take_while1(is_atext)))))(input) -} - -/// dot-atom -/// -/// `[CFWS] dot-atom-text [CFWS]` -fn dot_atom(input: &str) -> IResult<&str, &str> { - delimited(opt(cfws), dot_atom_text, opt(cfws))(input) -} - - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_vchar_seq() { - assert_eq!(vchar_seq("hello world"), Ok((" world", "hello"))); - assert_eq!(vchar_seq("hello👋 world"), Ok((" world", "hello👋"))); - } - - #[test] - fn test_perm_crlf() { - assert_eq!(perm_crlf("\rworld"), Ok(("world", "\r"))); - assert_eq!(perm_crlf("\r\nworld"), Ok(("world", "\r\n"))); - assert_eq!(perm_crlf("\nworld"), Ok(("world", "\n"))); - } - - #[test] - fn test_fws() { - assert_eq!(fws("\r\n world"), Ok(("world", ' '))); - assert_eq!(fws(" \r\n \r\n world"), Ok(("world", ' '))); - assert_eq!(fws(" world"), Ok(("world", ' '))); - assert!(fws("\r\nFrom: test").is_err()); - } - - #[test] - fn test_cfws() { - assert_eq!(cfws("(A nice \\) chap) "), Ok(("", "(A nice \\) chap) "))); - assert_eq!(cfws("(Chris's host.)public.example>,"), Ok(("public.example>,", "(Chris's host.)"))); - assert_eq!(cfws("(double (comment) is fun) wouch"), Ok(("wouch", "(double (comment) is fun) "))); - } - - #[test] - fn test_atext() { - assert!(is_atext('=')); - assert!(is_atext('5')); - assert!(is_atext('Q')); - assert!(!is_atext(' ')); - assert!(!is_atext('É')); - } - - #[test] - fn test_atom() { - assert_eq!(atom("(skip) imf_codec (hidden) aerogramme"), Ok(("aerogramme", "imf_codec"))); - } - - #[test] - fn test_dot_atom_text() { - assert_eq!(dot_atom_text("quentin.dufour.io abcdef"), Ok((" abcdef", "quentin.dufour.io"))); - } - - #[test] - fn test_dot_atom() { - assert_eq!(dot_atom(" (skip) 
quentin.dufour.io abcdef"), Ok(("abcdef", "quentin.dufour.io"))); - } - - #[test] - fn test_quoted_string() { - assert_eq!(quoted_string(" \"hello\\\"world\" "), Ok(("", "hello\"world".to_string()))); - assert_eq!(quoted_string("\"hello\r\n world\""), Ok(("", "hello world".to_string()))); - } - - #[test] - fn test_phrase() { - assert_eq!(phrase("hello world"), Ok(("", "hello world".into()))); - assert_eq!(phrase("salut \"le\" monde"), Ok(("", "salut le monde".into()))); - assert_eq!(phrase("fin\r\n du\r\nmonde"), Ok(("\r\nmonde", "fin du".into()))); - } -} diff --git a/src/whitespace.rs b/src/whitespace.rs new file mode 100644 index 0000000..f6d6d9b --- /dev/null +++ b/src/whitespace.rs @@ -0,0 +1,122 @@ +use nom::{ + IResult, + branch::alt, + bytes::complete::tag, + character::complete::{crlf, satisfy, space0, space1}, + combinator::{recognize, opt}, + multi::{many0, many1}, + sequence::{pair, tuple}, +}; +use crate::quoted::quoted_pair; + +// --- whitespaces and comments + +// Note: WSP = SP / HTAB = %x20 / %x09 +// nom::*::space0 = *WSP +// nom::*::space1 = 1*WSP + +/// Permissive CRLF +/// +/// Theoretically, all lines must end with \r\n +/// but some mail servers like Dovecot support malformed emails, +/// for example with only \n eol. It works because +/// \r or \n is allowed nowhere else, so we also add this support. +pub fn perm_crlf(input: &str) -> IResult<&str, &str> { + alt((crlf, tag("\r"), tag("\n")))(input) +} + +/// Permissive foldable white space +/// +/// Folding white space is used for long headers split across multiple lines. 
+/// The obsolete syntax allows multiple lines without content; implemented for compatibility +/// reasons +pub fn fws(input: &str) -> IResult<&str, char> { + let (input, _) = alt((recognize(many1(fold_marker)), space1))(input)?; + Ok((input, ' ')) +} +fn fold_marker(input: &str) -> IResult<&str, &str> { + let (input, _) = space0(input)?; + let (input, _) = perm_crlf(input)?; + space1(input) +} + + +/// Folding White Space with Comment +/// +/// Note: we drop the comments for now... +/// ```abnf +/// ctext = %d33-39 / ; Printable US-ASCII +/// %d42-91 / ; characters not including +/// %d93-126 / ; "(", ")", or "\" +/// obs-ctext +/// +/// ccontent = ctext / quoted-pair / comment +/// +/// comment = "(" *([FWS] ccontent) [FWS] ")" +/// +/// CFWS = (1*([FWS] comment) [FWS]) / FWS +/// ``` +pub fn cfws(input: &str) -> IResult<&str, &str> { + alt((recognize(comments), recognize(fws)))(input) +} + +pub fn comments(input: &str) -> IResult<&str, ()> { + let (input, _) = many1(tuple((opt(fws), comment)))(input)?; + let (input, _) = opt(fws)(input)?; + Ok((input, ())) +} + +pub fn comment(input: &str) -> IResult<&str, ()> { + let (input, _) = tag("(")(input)?; + let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?; + let (input, _) = opt(fws)(input)?; + let (input, _) = tag(")")(input)?; + Ok((input, ())) +} + +pub fn ccontent(input: &str) -> IResult<&str, &str> { + alt((recognize(ctext), recognize(quoted_pair), recognize(comment)))(input) +} + +pub fn ctext(input: &str) -> IResult<&str, char> { + satisfy(is_ctext)(input) +} + +/// Check if it's a comment text character +/// +/// ```abnf +/// ctext = %d33-39 / ; Printable US-ASCII +/// %d42-91 / ; characters not including +/// %d93-126 / ; "(", ")", or "\" +/// obs-ctext +///``` +pub fn is_ctext(c: char) -> bool { + (c >= '\x21' && c <= '\x27') || (c >= '\x2A' && c <= '\x5B') || (c >= '\x5D' && c <= '\x7E') || !c.is_ascii() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_perm_crlf() { + 
assert_eq!(perm_crlf("\rworld"), Ok(("world", "\r"))); + assert_eq!(perm_crlf("\r\nworld"), Ok(("world", "\r\n"))); + assert_eq!(perm_crlf("\nworld"), Ok(("world", "\n"))); + } + + #[test] + fn test_fws() { + assert_eq!(fws("\r\n world"), Ok(("world", ' '))); + assert_eq!(fws(" \r\n \r\n world"), Ok(("world", ' '))); + assert_eq!(fws(" world"), Ok(("world", ' '))); + assert!(fws("\r\nFrom: test").is_err()); + } + + #[test] + fn test_cfws() { + assert_eq!(cfws("(A nice \\) chap) "), Ok(("", "(A nice \\) chap) "))); + assert_eq!(cfws("(Chris's host.)public.example>,"), Ok(("public.example>,", "(Chris's host.)"))); + assert_eq!(cfws("(double (comment) is fun) wouch"), Ok(("wouch", "(double (comment) is fun) "))); + } +} diff --git a/src/words.rs b/src/words.rs new file mode 100644 index 0000000..9535471 --- /dev/null +++ b/src/words.rs @@ -0,0 +1,88 @@ +use nom::{ + IResult, + bytes::complete::{tag, take_while1}, + combinator::{recognize, opt}, + multi::many0, + sequence::{delimited, pair}, +}; +use crate::whitespace::cfws; + + +/// VCHAR definition +pub fn is_vchar(c: char) -> bool { + (c >= '\x21' && c <= '\x7E') || !c.is_ascii() +} + +/// Sequence of visible chars with the UTF-8 extension +/// +/// ```abnf +/// VCHAR = %x21-7E +/// ; visible (printing) characters +/// VCHAR =/ UTF8-non-ascii +/// SEQ = 1*VCHAR +///``` +pub fn vchar_seq(input: &str) -> IResult<&str, &str> { + take_while1(is_vchar)(input) +} + +/// Atom allowed characters +fn is_atext(c: char) -> bool { + c.is_ascii_alphanumeric() || "!#$%&'*+-/=?^_`{|}~".contains(c) +} + +/// Atom +/// +/// `[CFWS] 1*atext [CFWS]` +pub fn atom(input: &str) -> IResult<&str, &str> { + delimited(opt(cfws), take_while1(is_atext), opt(cfws))(input) +} + +/// dot-atom-text +/// +/// `1*atext *("." 
1*atext)` +fn dot_atom_text(input: &str) -> IResult<&str, &str> { + recognize(pair(take_while1(is_atext), many0(pair(tag("."), take_while1(is_atext)))))(input) +} + +/// dot-atom +/// +/// `[CFWS] dot-atom-text [CFWS]` +fn dot_atom(input: &str) -> IResult<&str, &str> { + delimited(opt(cfws), dot_atom_text, opt(cfws))(input) +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_vchar_seq() { + assert_eq!(vchar_seq("hello world"), Ok((" world", "hello"))); + assert_eq!(vchar_seq("hello👋 world"), Ok((" world", "hello👋"))); + } + + #[test] + fn test_atext() { + assert!(is_atext('=')); + assert!(is_atext('5')); + assert!(is_atext('Q')); + assert!(!is_atext(' ')); + assert!(!is_atext('É')); + } + + #[test] + fn test_atom() { + assert_eq!(atom("(skip) imf_codec (hidden) aerogramme"), Ok(("aerogramme", "imf_codec"))); + } + + #[test] + fn test_dot_atom_text() { + assert_eq!(dot_atom_text("quentin.dufour.io abcdef"), Ok((" abcdef", "quentin.dufour.io"))); + } + + #[test] + fn test_dot_atom() { + assert_eq!(dot_atom(" (skip) quentin.dufour.io abcdef"), Ok(("abcdef", "quentin.dufour.io"))); + } +}