eml-codec/src/text/whitespace.rs

use nom::{
    branch::alt,
    bytes::complete::{is_not, tag, take_while1},
    character::complete::{space0, space1},
    combinator::{opt, recognize},
    multi::{many0, many1},
    sequence::{pair, tuple},
    IResult,
};
use crate::text::encoding::encoded_word;
use crate::text::quoted::quoted_pair;
use crate::text::ascii;

// Whitespace (space, new line, tab) content and
// delimited content (e.g. comments, lines, sections, etc.)

/// Parses one end-of-line marker, tolerating the obsolete forms.
///
/// In theory every line ends with `\r\n`, but some mail servers (Dovecot,
/// for instance) accept malformed messages that use a bare `\n` instead.
/// Because `\r` and `\n` are allowed nowhere else, a lone `\r` or a lone
/// `\n` is accepted here as well.
///
/// On success the matched terminator bytes are returned.
pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> {
    // The two-byte form is tried first so that `\r\n` is taken as a whole
    // instead of being matched as a lone `\r`.
    let (rest, eol) = alt((
        tag(ascii::CRLF),
        tag(&[ascii::CR]),
        tag(&[ascii::LF]),
    ))(input)?;
    Ok((rest, eol))
}
pub fn line(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> {
    // is_not(CRLF) is a hack, it means "is not CR or LF"
    // and not "is not CRLF". In other words, it continues while
    // it does not encounter 0x0D or 0x0A.
    pair(is_not(ascii::CRLF), obs_crlf)(input)
}

/// Recognizes a logical line that may be folded over several physical lines.
///
/// ```abnf
/// fold_line = any *(1*(crlf WS) any) crlf
/// ```
///
/// Here `any` is a run of bytes that are neither CR nor LF, `crlf` is
/// whatever [`obs_crlf`] accepts and `WS` is one or more spaces or tabs.
/// The whole matched span, final terminator included, is returned.
pub fn foldable_line(input: &[u8]) -> IResult<&[u8], &[u8]> {
    // One continuation: one or more `crlf WS` folds followed by more content.
    let continuation = pair(many1(pair(obs_crlf, space1)), is_not(ascii::CRLF));

    recognize(tuple((
        is_not(ascii::CRLF),
        many0(continuation),
        obs_crlf,
    )))(input)
}

// --- whitespaces and comments

// Note: WSP = SP / HTAB = %x20 / %x09
// nom::*::space0 = *WSP
// nom::*::space1 = 1*WSP

/// Permissive folding white space.
///
/// Folding white space is used for long headers that are split over
/// multiple lines. The obsolete syntax allows several consecutive folds,
/// i.e. lines without content; this is supported for compatibility reasons.
///
/// Accepts either one or more fold markers (optional blanks, a line ending,
/// then at least one blank) or a plain run of blanks. Whatever was matched,
/// the parsed value is always a single `ascii::SP`.
pub fn fws(input: &[u8]) -> IResult<&[u8], u8> {
    // Folds are tried before the plain run of blanks: `space1` alone would
    // stop right in front of the line ending of a fold.
    let folds = recognize(many1(fold_marker));
    alt((folds, space1))(input).map(|(rest, _)| (rest, ascii::SP))
}
fn fold_marker(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let (input, _) = space0(input)?;
    let (input, _) = obs_crlf(input)?;
    space1(input)
}

/// Folding white space with comments.
///
/// The content of the comments is not interpreted for now: the parser only
/// recognizes the span and returns it as raw bytes.
///
/// ```abnf
///   ctext           =   %d33-39 /          ; Printable US-ASCII
///                       %d42-91 /          ;  characters not including
///                       %d93-126 /         ;  "(", ")", or "\"
///                       obs-ctext
///
///   ccontent        =   ctext / quoted-pair / comment
///
///   comment         =   "(" *([FWS] ccontent) [FWS] ")"
///
///   CFWS            =   (1*([FWS] comment) [FWS]) / FWS
/// ```
pub fn cfws(input: &[u8]) -> IResult<&[u8], &[u8]> {
    // First alternative of the CFWS rule: one or more comments, each
    // optionally surrounded by folding white space.
    let commented = recognize(comments);
    // Second alternative: folding white space alone.
    let blank = recognize(fws);

    alt((commented, blank))(input)
}

/// Parses `1*([FWS] comment) [FWS]`: one or more comments, each optionally
/// preceded by folding white space, plus optional trailing white space.
///
/// Nothing is kept from the match; the parsed value is `()`.
pub fn comments(input: &[u8]) -> IResult<&[u8], ()> {
    let leading = many1(tuple((opt(fws), comment)));
    tuple((leading, opt(fws)))(input).map(|(rest, _)| (rest, ()))
}

pub fn comment(input: &[u8]) -> IResult<&[u8], ()> {
    let (input, _) = tag("(")(input)?;
    let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?;
    let (input, _) = opt(fws)(input)?;
    let (input, _) = tag(")")(input)?;
    Ok((input, ()))
}

pub fn ccontent(input: &[u8]) -> IResult<&[u8], &[u8]> {
    alt((ctext, recognize(quoted_pair), recognize(encoded_word), recognize(comment)))(input)
}

/// Parses a non-empty run of comment text bytes, as defined by [`is_ctext`].
pub fn ctext(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let (rest, run) = take_while1(is_ctext)(input)?;
    Ok((rest, run))
}

/// Tells whether `c` may appear as plain text inside a comment: either a
/// restricted `ctext` byte or one of the obsolete control characters.
pub fn is_ctext(c: u8) -> bool {
    if is_restr_ctext(c) {
        return true;
    }
    is_obs_no_ws_ctl(c)
}

/// Tells whether `c` is a comment text character in the strict syntax,
/// i.e. the three explicit ranges of the rule below, without `obs-ctext`.
///
/// ```abnf
///   ctext           =   %d33-39 /          ; Printable US-ASCII
///                       %d42-91 /          ;  characters not including
///                       %d93-126 /         ;  "(", ")", or "\"
///                       obs-ctext
/// ```
pub fn is_restr_ctext(c: u8) -> bool {
    // Three inclusive ranges; their bounds come from the `ascii` constants.
    let up_to_squote = ascii::EXCLAMATION..=ascii::SQUOTE;
    let up_to_left_bracket = ascii::ASTERISK..=ascii::LEFT_BRACKET;
    let up_to_tilde = ascii::RIGHT_BRACKET..=ascii::TILDE;

    up_to_squote.contains(&c) || up_to_left_bracket.contains(&c) || up_to_tilde.contains(&c)
}

/// Tells whether `c` is one of the US-ASCII control characters that the
/// obsolete syntax tolerates.
///
/// ```abnf
///   obs-NO-WS-CTL   =   %d1-8 /            ; US-ASCII control
///                       %d11 /             ;  characters that do not
///                       %d12 /             ;  include the carriage
///                       %d14-31 /          ;  return, line feed, and
///                       %d127              ;  white space characters
/// ```
pub fn is_obs_no_ws_ctl(c: u8) -> bool {
    // The isolated code points of the rule.
    if c == ascii::VT || c == ascii::FF || c == ascii::DEL {
        return true;
    }
    // The two inclusive ranges of the rule.
    (ascii::SOH..=ascii::BS).contains(&c) || (ascii::SO..=ascii::US).contains(&c)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every accepted terminator is consumed and returned unchanged.
    #[test]
    fn test_obs_crlf() {
        // (input, expected terminator); the remainder is always "world".
        let cases: [(&[u8], &[u8]); 3] = [
            (&b"\rworld"[..], &b"\r"[..]),
            (&b"\r\nworld"[..], &b"\r\n"[..]),
            (&b"\nworld"[..], &b"\n"[..]),
        ];
        for &(input, eol) in cases.iter() {
            assert_eq!(obs_crlf(input), Ok((&b"world"[..], eol)));
        }
    }

    /// Folds and plain blanks both collapse to a single space; a line ending
    /// that is not followed by a blank is not folding white space.
    #[test]
    fn test_fws() {
        let accepted: [&[u8]; 3] = [
            &b"\r\n world"[..],
            &b" \r\n \r\n world"[..],
            &b" world"[..],
        ];
        for &input in accepted.iter() {
            assert_eq!(fws(input), Ok((&b"world"[..], ascii::SP)));
        }

        assert!(fws(b"\r\nFrom: test").is_err());
    }

    /// Comments (with quoted pairs or nested comments) and the white space
    /// after them are recognized as one span.
    #[test]
    fn test_cfws() {
        // (input, recognized span, remainder)
        let cases: [(&[u8], &[u8], &[u8]); 3] = [
            (
                &b"(A nice \\) chap) <pete(his account)@silly.test(his host)>"[..],
                &b"(A nice \\) chap) "[..],
                &b"<pete(his account)@silly.test(his host)>"[..],
            ),
            (
                &b"(Chris's host.)public.example>,"[..],
                &b"(Chris's host.)"[..],
                &b"public.example>,"[..],
            ),
            (
                &b"(double (comment) is fun) wouch"[..],
                &b"(double (comment) is fun) "[..],
                &b"wouch"[..],
            ),
        ];
        for &(input, recognized, rest) in cases.iter() {
            assert_eq!(cfws(input), Ok((rest, recognized)));
        }
    }

    /// A comment holding an encoded word is recognized in full.
    #[test]
    fn test_cfws_encoded_word() {
        let input = &b"(=?US-ASCII?Q?Keith_Moore?=)"[..];
        assert_eq!(cfws(input), Ok((&b""[..], input)));
    }
}
refactor parser 2023-06-12 22:08:34 +02:00			`use nom::{`
			`branch::alt,`
wip refactor 2023-07-18 23:25:10 +02:00			`bytes::complete::{is_not, tag, take_while1},`
			`character::complete::{space0, space1},`
cargo fmt 2023-06-22 15:08:50 +02:00			`combinator::{opt, recognize},`
refactor parser 2023-06-12 22:08:34 +02:00			`multi::{many0, many1},`
wip refactor 2023-07-18 23:25:10 +02:00			`sequence::{pair, tuple},`
cargo fmt 2023-06-22 15:08:50 +02:00			`IResult,`
refactor parser 2023-06-12 22:08:34 +02:00			`};`
wip refactor 2023-07-18 23:25:10 +02:00			`use crate::text::encoding::encoded_word;`
			`use crate::text::quoted::quoted_pair;`
			`use crate::text::ascii;`
refactor parser 2023-06-12 22:08:34 +02:00
wip parts 2023-07-17 17:14:08 +02:00			`/// Whitespace (space, new line, tab) content and`
			`/// delimited content (eg. comment, line, sections, etc.)`

wip refactor 2023-07-18 23:25:10 +02:00			`/// Obsolete/Compatible CRLF`
			`///`
			`/// Theoretically, all lines must end with \r\n`
			`/// but some mail servers like Dovecot support malformated emails,`
			`/// for example with only \n eol. It works because`
			`/// \r or \n is allowed nowhere else, so we also add this support.`
fix compilation 2023-07-16 09:55:47 +02:00
wip refactor 2023-07-18 23:25:10 +02:00			`pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> {`
			`alt((tag(ascii::CRLF), tag(&[ascii::CR]), tag(&[ascii::LF])))(input)`
wip parts 2023-07-17 17:14:08 +02:00			`}`
fix compilation 2023-07-16 09:55:47 +02:00			`pub fn line(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> {`
add a preamble test 2023-07-17 11:44:55 +02:00			`// is_not(CRLF) is a hack, it means "is not CR or LF"`
			`// and not "is not CRLF". In other words, it continues while`
			`// it does not encounter 0x0D or 0x0A.`
wip refactor 2023-07-18 23:25:10 +02:00			`pair(is_not(ascii::CRLF), obs_crlf)(input)`
fix compilation 2023-07-16 09:55:47 +02:00			`}`

wip refactor 2023-07-18 23:25:10 +02:00			/// ```abnf
			`/// fold_line = any (1(crlf WS) any) crlf`
			/// ```
			`pub fn foldable_line(input: &[u8]) -> IResult<&[u8], &[u8]> {`
			`recognize(tuple((`
			`is_not(ascii::CRLF),`
			`many0(pair(`
			`many1(pair(obs_crlf, space1)),`
			`is_not(ascii::CRLF),`
			`)),`
			`obs_crlf,`
			`)))(input)`
fix compilation 2023-07-16 09:55:47 +02:00			`}`

refactor parser 2023-06-12 22:08:34 +02:00			`// --- whitespaces and comments`

			`// Note: WSP = SP / HTAB = %x20 / %x09`
			`// nom::::space0 = WSP`
			`// nom::::space1 = 1WSP`

			`/// Permissive foldable white space`
			`///`
			`/// Folding white space are used for long headers splitted on multiple lines.`
			`/// The obsolete syntax allowes multiple lines without content; implemented for compatibility`
			`/// reasons`
wip refactor 2023-07-18 23:25:10 +02:00			`pub fn fws(input: &[u8]) -> IResult<&[u8], u8> {`
refactor parser 2023-06-12 22:08:34 +02:00			`let (input, _) = alt((recognize(many1(fold_marker)), space1))(input)?;`
wip refactor 2023-07-18 23:25:10 +02:00			`Ok((input, ascii::SP))`
refactor parser 2023-06-12 22:08:34 +02:00			`}`
wip refactor 2023-07-18 23:25:10 +02:00			`fn fold_marker(input: &[u8]) -> IResult<&[u8], &[u8]> {`
cargo fmt 2023-06-22 15:08:50 +02:00			`let (input, _) = space0(input)?;`
wip refactor 2023-07-18 23:25:10 +02:00			`let (input, _) = obs_crlf(input)?;`
cargo fmt 2023-06-22 15:08:50 +02:00			`space1(input)`
refactor parser 2023-06-12 22:08:34 +02:00			`}`

			`/// Folding White Space with Comment`
			`///`
			`/// Note: we drop the comments for now...`
			`///`
mailbox tests 2023-06-13 09:18:36 +02:00			/// ```abnf
refactor parser 2023-06-12 22:08:34 +02:00			`/// ctext = %d33-39 / ; Printable US-ASCII`
			`/// %d42-91 / ; characters not including`
			`/// %d93-126 / ; "(", ")", or "\"`
			`/// obs-ctext`
			`///`
			`/// ccontent = ctext / quoted-pair / comment`
			`///`
			`/// comment = "(" *([FWS] ccontent) [FWS] ")"`
			`///`
			`/// CFWS = (1*([FWS] comment) [FWS]) / FWS`
			/// ```
wip refactor 2023-07-18 23:25:10 +02:00			`pub fn cfws(input: &[u8]) -> IResult<&[u8], &[u8]> {`
refactor parser 2023-06-12 22:08:34 +02:00			`alt((recognize(comments), recognize(fws)))(input)`
			`}`

wip refactor 2023-07-18 23:25:10 +02:00			`pub fn comments(input: &[u8]) -> IResult<&[u8], ()> {`
refactor parser 2023-06-12 22:08:34 +02:00			`let (input, _) = many1(tuple((opt(fws), comment)))(input)?;`
			`let (input, _) = opt(fws)(input)?;`
			`Ok((input, ()))`
			`}`

wip refactor 2023-07-18 23:25:10 +02:00			`pub fn comment(input: &[u8]) -> IResult<&[u8], ()> {`
refactor parser 2023-06-12 22:08:34 +02:00			`let (input, _) = tag("(")(input)?;`
			`let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?;`
			`let (input, _) = opt(fws)(input)?;`
			`let (input, _) = tag(")")(input)?;`
			`Ok((input, ()))`
			`}`

wip refactor 2023-07-18 23:25:10 +02:00			`pub fn ccontent(input: &[u8]) -> IResult<&[u8], &[u8]> {`
			`alt((ctext, recognize(quoted_pair), recognize(encoded_word), recognize(comment)))(input)`
refactor parser 2023-06-12 22:08:34 +02:00			`}`

wip refactor 2023-07-18 23:25:10 +02:00			`pub fn ctext(input: &[u8]) -> IResult<&[u8], &[u8]> {`
			`take_while1(is_ctext)(input)`
			`}`

			`pub fn is_ctext(c: u8) -> bool {`
			`is_restr_ctext(c) \|\| is_obs_no_ws_ctl(c)`
refactor parser 2023-06-12 22:08:34 +02:00			`}`

			`/// Check if it's a comment text character`
			`///`
			/// ```abnf
			`/// ctext = %d33-39 / ; Printable US-ASCII`
			`/// %d42-91 / ; characters not including`
			`/// %d93-126 / ; "(", ")", or "\"`
			`/// obs-ctext`
			///```
wip refactor 2023-07-18 23:25:10 +02:00			`pub fn is_restr_ctext(c: u8) -> bool {`
			`(c >= ascii::EXCLAMATION && c <= ascii::SQUOTE)`
			`\|\| (c >= ascii::ASTERISK && c <= ascii::LEFT_BRACKET)`
			`\|\| (c >= ascii::RIGHT_BRACKET && c <= ascii::TILDE)`
add compatibility with obsolete syntax 2023-06-16 12:07:17 +02:00			`}`

cargo fmt 2023-06-22 15:08:50 +02:00			`/// US ASCII control characters without effect`
add compatibility with obsolete syntax 2023-06-16 12:07:17 +02:00			`///`
			/// ```abnf
			`/// obs-NO-WS-CTL = %d1-8 / ; US-ASCII control`
			`/// %d11 / ; characters that do not`
			`/// %d12 / ; include the carriage`
			`/// %d14-31 / ; return, line feed, and`
			`/// %d127 ; white space characters`
			/// ```
wip refactor 2023-07-18 23:25:10 +02:00			`pub fn is_obs_no_ws_ctl(c: u8) -> bool {`
			`(c >= ascii::SOH && c <= ascii::BS)`
			`\|\| c == ascii::VT`
			`\|\| c == ascii::FF`
			`\|\| (c >= ascii::SO && c <= ascii::US)`
			`\|\| c == ascii::DEL`
add compatibility with obsolete syntax 2023-06-16 12:07:17 +02:00			`}`

refactor parser 2023-06-12 22:08:34 +02:00			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
wip refactor 2023-07-18 23:25:10 +02:00			`fn test_obs_crlf() {`
fixed tests 2023-07-19 11:03:40 +02:00			`assert_eq!(obs_crlf(b"\rworld"), Ok((&b"world"[..], &b"\r"[..])));`
			`assert_eq!(obs_crlf(b"\r\nworld"), Ok((&b"world"[..], &b"\r\n"[..])));`
			`assert_eq!(obs_crlf(b"\nworld"), Ok((&b"world"[..], &b"\n"[..])));`
refactor parser 2023-06-12 22:08:34 +02:00			`}`

			`#[test]`
			`fn test_fws() {`
fixed tests 2023-07-19 11:03:40 +02:00			`assert_eq!(fws(b"\r\n world"), Ok((&b"world"[..], ascii::SP)));`
			`assert_eq!(fws(b" \r\n \r\n world"), Ok((&b"world"[..], ascii::SP)));`
			`assert_eq!(fws(b" world"), Ok((&b"world"[..], ascii::SP)));`
			`assert!(fws(b"\r\nFrom: test").is_err());`
refactor parser 2023-06-12 22:08:34 +02:00			`}`

			`#[test]`
			`fn test_cfws() {`
cargo fmt 2023-06-22 15:08:50 +02:00			`assert_eq!(`
fixed tests 2023-07-19 11:03:40 +02:00			`cfws(b"(A nice \\) chap) <pete(his account)@silly.test(his host)>"),`
cargo fmt 2023-06-22 15:08:50 +02:00			`Ok((`
fixed tests 2023-07-19 11:03:40 +02:00			`&b"<pete(his account)@silly.test(his host)>"[..],`
			`&b"(A nice \\) chap) "[..]`
cargo fmt 2023-06-22 15:08:50 +02:00			`))`
			`);`
			`assert_eq!(`
fixed tests 2023-07-19 11:03:40 +02:00			`cfws(b"(Chris's host.)public.example>,"),`
			`Ok((&b"public.example>,"[..], &b"(Chris's host.)"[..]))`
cargo fmt 2023-06-22 15:08:50 +02:00			`);`
			`assert_eq!(`
fixed tests 2023-07-19 11:03:40 +02:00			`cfws(b"(double (comment) is fun) wouch"),`
			`Ok((&b"wouch"[..], &b"(double (comment) is fun) "[..]))`
cargo fmt 2023-06-22 15:08:50 +02:00			`);`
refactor parser 2023-06-12 22:08:34 +02:00			`}`
implement mime headers 2023-07-14 10:43:31 +02:00
			`#[test]`
			`fn test_cfws_encoded_word() {`
			`assert_eq!(`
fixed tests 2023-07-19 11:03:40 +02:00			`cfws(b"(=?US-ASCII?Q?Keith_Moore?=)"),`
			`Ok((&b""[..], &b"(=?US-ASCII?Q?Keith_Moore?=)"[..])),`
implement mime headers 2023-07-14 10:43:31 +02:00			`);`
			`}`
refactor parser 2023-06-12 22:08:34 +02:00			`}`