eml-codec/src/fragments/whitespace.rs

use crate::fragments::quoted::quoted_pair;
use nom::{
    branch::alt,
    bytes::complete::{is_not, tag},
    character::complete::{crlf, satisfy, space0, space1},
    combinator::{opt, recognize},
    multi::{many0, many1},
    sequence::{pair, tuple},
    IResult,
};
use crate::fragments::encoding::encoded_word;

// Bytes CRLF
const CR: u8 = 0x0D;
const LF: u8 = 0x0A;
pub const CRLF: &[u8] = &[CR, LF];

pub fn line(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> {
    // is_not(CRLF) is a hack, it means "is not CR or LF"
    // and not "is not CRLF". In other words, it continues while
    // it does not encounter 0x0D or 0x0A.
    pair(is_not(CRLF), obs_crlf)(input)
}

/// Parses a line terminator on raw bytes, accepting CR LF, a lone CR or a
/// lone LF.
pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> {
    // The two-byte form is tried first so that a CR LF pair is consumed as a
    // single terminator rather than as a lone CR.
    let mut terminator = alt((tag(CRLF), tag(&[CR]), tag(&[LF])));
    terminator(input)
}


// --- whitespaces and comments

// Note: WSP = SP / HTAB = %x20 / %x09
// nom::*::space0 = *WSP
// nom::*::space1 = 1*WSP

/// Permissive line ending
///
/// In theory every line must end with `\r\n`, but some mail servers
/// (Dovecot for instance) accept malformed emails that end lines with a
/// lone `\n`. Since `\r` and `\n` are allowed nowhere else, accepting a lone
/// `\r` or `\n` here is unambiguous, so we support it as well.
///
/// Returns the matched terminator: `"\r\n"`, `"\r"` or `"\n"`.
pub fn perm_crlf(input: &str) -> IResult<&str, &str> {
    // `crlf` comes first so that "\r\n" is taken as one terminator.
    let mut any_eol = alt((crlf, tag("\r"), tag("\n")));
    any_eol(input)
}

/// Permissive folding white space
///
/// Folding white space is used for long headers that are split over several
/// lines. The obsolete syntax allows several consecutive folds whose lines
/// carry no content; this is accepted for compatibility reasons.
///
/// Whatever amount of white space is consumed, the output is always a single
/// `' '` character.
pub fn fws(input: &str) -> IResult<&str, char> {
    // One or more folds are tried first; plain (unfolded) white space is the
    // fallback.
    let folds = recognize(many1(fold_marker));
    let (rest, _) = alt((folds, space1))(input)?;
    Ok((rest, ' '))
}
fn fold_marker(input: &str) -> IResult<&str, &str> {
    let (input, _) = space0(input)?;
    let (input, _) = perm_crlf(input)?;
    space1(input)
}

/// Folding white space with comments
///
/// Note: comment contents are not interpreted for now; the output is simply
/// the raw slice of input that was matched.
///
/// ```abnf
///   ctext           =   %d33-39 /          ; Printable US-ASCII
///                       %d42-91 /          ;  characters not including
///                       %d93-126 /         ;  "(", ")", or "\"
///                       obs-ctext
///
///   ccontent        =   ctext / quoted-pair / comment
///
///   comment         =   "(" *([FWS] ccontent) [FWS] ")"
///
///   CFWS            =   (1*([FWS] comment) [FWS]) / FWS
/// ```
pub fn cfws(input: &str) -> IResult<&str, &str> {
    // Comments (with their surrounding white space) are tried before bare
    // folding white space.
    let mut comments_or_space = alt((recognize(comments), recognize(fws)));
    comments_or_space(input)
}

pub fn comments(input: &str) -> IResult<&str, ()> {
    let (input, _) = many1(tuple((opt(fws), comment)))(input)?;
    let (input, _) = opt(fws)(input)?;
    Ok((input, ()))
}

pub fn comment(input: &str) -> IResult<&str, ()> {
    let (input, _) = tag("(")(input)?;
    let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?;
    let (input, _) = opt(fws)(input)?;
    let (input, _) = tag(")")(input)?;
    Ok((input, ()))
}

/// Parses one unit of comment content and returns the matched slice.
///
/// Alternatives are tried in this order: a single ctext character, a quoted
/// pair, an encoded word, a nested comment.
///
/// NOTE(review): `=` is itself a ctext character, so if an encoded word
/// always starts with `=?` the ctext branch wins first and the encoded-word
/// branch is never reached. Confirm whether this ordering is intended.
pub fn ccontent(input: &str) -> IResult<&str, &str> {
    let mut content_unit = alt((
        recognize(ctext),
        recognize(quoted_pair),
        recognize(encoded_word),
        recognize(comment),
    ));
    content_unit(input)
}

/// Parses exactly one comment text character, as decided by [`is_ctext`].
pub fn ctext(input: &str) -> IResult<&str, char> {
    let one_ctext_char = satisfy(is_ctext);
    one_ctext_char(input)
}

/// Check if it's a comment text character, obsolete control characters
/// excluded
///
/// Accepts the printable US-ASCII characters listed below (that is,
/// everything printable except `(`, `)` and `\`) and, in addition, any
/// character outside the ASCII range.
///
/// ```abnf
///   ctext           =   %d33-39 /          ; Printable US-ASCII
///                       %d42-91 /          ;  characters not including
///                       %d93-126           ;  "(", ")", or "\"
/// ```
pub fn is_restr_ctext(c: char) -> bool {
    match c {
        // %d33-39, %d42-91 and %d93-126
        '\x21'..='\x27' | '\x2A'..='\x5B' | '\x5D'..='\x7E' => true,
        // Anything else is accepted only when it is not ASCII.
        other => !other.is_ascii(),
    }
}

/// Check if it's a comment text character, obsolete syntax included
///
/// A character qualifies when either [`is_restr_ctext`] or
/// [`is_obs_no_ws_ctl`] accepts it.
pub fn is_ctext(c: char) -> bool {
    if is_restr_ctext(c) {
        return true;
    }
    is_obs_no_ws_ctl(c)
}

/// US ASCII control characters without effect
///
/// Returns `true` when `c` belongs to the obsolete `obs-NO-WS-CTL` set:
///
/// ```abnf
///   obs-NO-WS-CTL   =   %d1-8 /            ; US-ASCII control
///                       %d11 /             ;  characters that do not
///                       %d12 /             ;  include the carriage
///                       %d14-31 /          ;  return, line feed, and
///                       %d127              ;  white space characters
/// ```
pub fn is_obs_no_ws_ctl(c: char) -> bool {
    // %d11 is vertical tab (0x0B) and %d12 is form feed (0x0C); both must be
    // listed. HTAB (0x09), LF (0x0A) and CR (0x0D) are deliberately left out.
    matches!(
        c,
        '\x01'..='\x08' | '\x0b' | '\x0c' | '\x0e'..='\x1f' | '\x7f'
    )
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Each accepted terminator is consumed and returned verbatim.
    #[test]
    fn test_perm_crlf() {
        let cases = [
            ("\rworld", "\r"),
            ("\r\nworld", "\r\n"),
            ("\nworld", "\n"),
        ];
        for &(raw, eol) in &cases {
            assert_eq!(perm_crlf(raw), Ok(("world", eol)));
        }
    }

    /// Folded and plain white space both collapse to one space; a line ending
    /// that is not followed by white space is not a fold.
    #[test]
    fn test_fws() {
        let accepted = ["\r\n world", " \r\n \r\n world", " world"];
        for &raw in &accepted {
            assert_eq!(fws(raw), Ok(("world", ' ')));
        }

        assert!(fws("\r\nFrom: test").is_err());
    }

    /// Comments (escaped parenthesis, no trailing space, nested) are returned
    /// as the raw matched slice.
    #[test]
    fn test_cfws() {
        // (input, expected remainder, expected matched slice)
        let cases = [
            (
                "(A nice \\) chap) <pete(his account)@silly.test(his host)>",
                "<pete(his account)@silly.test(his host)>",
                "(A nice \\) chap) ",
            ),
            (
                "(Chris's host.)public.example>,",
                "public.example>,",
                "(Chris's host.)",
            ),
            (
                "(double (comment) is fun) wouch",
                "wouch",
                "(double (comment) is fun) ",
            ),
        ];
        for &(raw, remainder, matched) in &cases {
            assert_eq!(cfws(raw), Ok((remainder, matched)));
        }
    }

    /// An encoded word inside a comment is consumed together with the comment.
    #[test]
    fn test_cfws_encoded_word() {
        let raw = "(=?US-ASCII?Q?Keith_Moore?=)";
        assert_eq!(cfws(raw), Ok(("", "(=?US-ASCII?Q?Keith_Moore?=)")));
    }
}
cargo fmt 2023-06-22 15:08:50 +02:00			`use crate::fragments::quoted::quoted_pair;`
refactor parser 2023-06-12 22:08:34 +02:00			`use nom::{`
			`branch::alt,`
fix compilation 2023-07-16 09:55:47 +02:00			`bytes::complete::{is_not, tag},`
refactor parser 2023-06-12 22:08:34 +02:00			`character::complete::{crlf, satisfy, space0, space1},`
cargo fmt 2023-06-22 15:08:50 +02:00			`combinator::{opt, recognize},`
refactor parser 2023-06-12 22:08:34 +02:00			`multi::{many0, many1},`
fix compilation 2023-07-16 09:55:47 +02:00			`sequence::{pair, tuple},`
cargo fmt 2023-06-22 15:08:50 +02:00			`IResult,`
refactor parser 2023-06-12 22:08:34 +02:00			`};`
implement mime headers 2023-07-14 10:43:31 +02:00			`use crate::fragments::encoding::encoded_word;`
refactor parser 2023-06-12 22:08:34 +02:00
fix compilation 2023-07-16 09:55:47 +02:00			`// Bytes CRLF`
			`const CR: u8 = 0x0D;`
			`const LF: u8 = 0x0A;`
add a preamble test 2023-07-17 11:44:55 +02:00			`pub const CRLF: &[u8] = &[CR, LF];`
fix compilation 2023-07-16 09:55:47 +02:00
			`pub fn line(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> {`
add a preamble test 2023-07-17 11:44:55 +02:00			`// is_not(CRLF) is a hack, it means "is not CR or LF"`
			`// and not "is not CRLF". In other words, it continues while`
			`// it does not encounter 0x0D or 0x0A.`
fix compilation 2023-07-16 09:55:47 +02:00			`pair(is_not(CRLF), obs_crlf)(input)`
			`}`

			`pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> {`
			`alt((tag(CRLF), tag(&[CR]), tag(&[LF])))(input)`
			`}`


refactor parser 2023-06-12 22:08:34 +02:00			`// --- whitespaces and comments`

			`// Note: WSP = SP / HTAB = %x20 / %x09`
			`// nom::::space0 = WSP`
			`// nom::::space1 = 1WSP`

			`/// Permissive CRLF`
			`///`
			`/// Theoretically, all lines must end with \r\n`
			`/// but some mail servers like Dovecot support malformated emails,`
			`/// for example with only \n eol. It works because`
			`/// \r or \n is allowed nowhere else, so we also add this support.`
			`pub fn perm_crlf(input: &str) -> IResult<&str, &str> {`
			`alt((crlf, tag("\r"), tag("\n")))(input)`
			`}`

			`/// Permissive foldable white space`
			`///`
			`/// Folding white space are used for long headers splitted on multiple lines.`
			`/// The obsolete syntax allowes multiple lines without content; implemented for compatibility`
			`/// reasons`
			`pub fn fws(input: &str) -> IResult<&str, char> {`
			`let (input, _) = alt((recognize(many1(fold_marker)), space1))(input)?;`
			`Ok((input, ' '))`
			`}`
			`fn fold_marker(input: &str) -> IResult<&str, &str> {`
cargo fmt 2023-06-22 15:08:50 +02:00			`let (input, _) = space0(input)?;`
			`let (input, _) = perm_crlf(input)?;`
			`space1(input)`
refactor parser 2023-06-12 22:08:34 +02:00			`}`

			`/// Folding White Space with Comment`
			`///`
			`/// Note: we drop the comments for now...`
			`///`
mailbox tests 2023-06-13 09:18:36 +02:00			/// ```abnf
refactor parser 2023-06-12 22:08:34 +02:00			`/// ctext = %d33-39 / ; Printable US-ASCII`
			`/// %d42-91 / ; characters not including`
			`/// %d93-126 / ; "(", ")", or "\"`
			`/// obs-ctext`
			`///`
			`/// ccontent = ctext / quoted-pair / comment`
			`///`
			`/// comment = "(" *([FWS] ccontent) [FWS] ")"`
			`///`
			`/// CFWS = (1*([FWS] comment) [FWS]) / FWS`
			/// ```
			`pub fn cfws(input: &str) -> IResult<&str, &str> {`
			`alt((recognize(comments), recognize(fws)))(input)`
			`}`

			`pub fn comments(input: &str) -> IResult<&str, ()> {`
			`let (input, _) = many1(tuple((opt(fws), comment)))(input)?;`
			`let (input, _) = opt(fws)(input)?;`
			`Ok((input, ()))`
			`}`

			`pub fn comment(input: &str) -> IResult<&str, ()> {`
			`let (input, _) = tag("(")(input)?;`
			`let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?;`
			`let (input, _) = opt(fws)(input)?;`
			`let (input, _) = tag(")")(input)?;`
			`Ok((input, ()))`
			`}`

			`pub fn ccontent(input: &str) -> IResult<&str, &str> {`
implement mime headers 2023-07-14 10:43:31 +02:00			`alt((recognize(ctext), recognize(quoted_pair), recognize(encoded_word), recognize(comment)))(input)`
refactor parser 2023-06-12 22:08:34 +02:00			`}`

			`pub fn ctext(input: &str) -> IResult<&str, char> {`
			`satisfy(is_ctext)(input)`
			`}`

			`/// Check if it's a comment text character`
			`///`
			/// ```abnf
			`/// ctext = %d33-39 / ; Printable US-ASCII`
			`/// %d42-91 / ; characters not including`
			`/// %d93-126 / ; "(", ")", or "\"`
			`/// obs-ctext`
			///```
add compatibility with obsolete syntax 2023-06-16 12:07:17 +02:00			`pub fn is_restr_ctext(c: char) -> bool {`
cargo fmt 2023-06-22 15:08:50 +02:00			`(c >= '\x21' && c <= '\x27')`
			`\|\| (c >= '\x2A' && c <= '\x5B')`
			`\|\| (c >= '\x5D' && c <= '\x7E')`
			`\|\| !c.is_ascii()`
refactor parser 2023-06-12 22:08:34 +02:00			`}`

add compatibility with obsolete syntax 2023-06-16 12:07:17 +02:00			`pub fn is_ctext(c: char) -> bool {`
			`is_restr_ctext(c) \|\| is_obs_no_ws_ctl(c)`
			`}`

cargo fmt 2023-06-22 15:08:50 +02:00			`/// US ASCII control characters without effect`
add compatibility with obsolete syntax 2023-06-16 12:07:17 +02:00			`///`
			/// ```abnf
			`/// obs-NO-WS-CTL = %d1-8 / ; US-ASCII control`
			`/// %d11 / ; characters that do not`
			`/// %d12 / ; include the carriage`
			`/// %d14-31 / ; return, line feed, and`
			`/// %d127 ; white space characters`
			/// ```
			`pub fn is_obs_no_ws_ctl(c: char) -> bool {`
cargo fmt 2023-06-22 15:08:50 +02:00			`(c >= '\x01' && c <= '\x08')`
			`\|\| c == '\x0b'`
			`\|\| c == '\x0b'`
			`\|\| (c >= '\x0e' && c <= '\x1f')`
			`\|\| c == '\x7F'`
add compatibility with obsolete syntax 2023-06-16 12:07:17 +02:00			`}`

refactor parser 2023-06-12 22:08:34 +02:00			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn test_perm_crlf() {`
			`assert_eq!(perm_crlf("\rworld"), Ok(("world", "\r")));`
			`assert_eq!(perm_crlf("\r\nworld"), Ok(("world", "\r\n")));`
			`assert_eq!(perm_crlf("\nworld"), Ok(("world", "\n")));`
			`}`

			`#[test]`
			`fn test_fws() {`
			`assert_eq!(fws("\r\n world"), Ok(("world", ' ')));`
			`assert_eq!(fws(" \r\n \r\n world"), Ok(("world", ' ')));`
			`assert_eq!(fws(" world"), Ok(("world", ' ')));`
			`assert!(fws("\r\nFrom: test").is_err());`
			`}`

			`#[test]`
			`fn test_cfws() {`
cargo fmt 2023-06-22 15:08:50 +02:00			`assert_eq!(`
			`cfws("(A nice \\) chap) <pete(his account)@silly.test(his host)>"),`
			`Ok((`
			`"<pete(his account)@silly.test(his host)>",`
			`"(A nice \\) chap) "`
			`))`
			`);`
			`assert_eq!(`
			`cfws("(Chris's host.)public.example>,"),`
			`Ok(("public.example>,", "(Chris's host.)"))`
			`);`
			`assert_eq!(`
			`cfws("(double (comment) is fun) wouch"),`
			`Ok(("wouch", "(double (comment) is fun) "))`
			`);`
refactor parser 2023-06-12 22:08:34 +02:00			`}`
implement mime headers 2023-07-14 10:43:31 +02:00
			`#[test]`
			`fn test_cfws_encoded_word() {`
			`assert_eq!(`
			`cfws("(=?US-ASCII?Q?Keith_Moore?=)"),`
			`Ok(("", "(=?US-ASCII?Q?Keith_Moore?=)")),`
			`);`
			`}`
refactor parser 2023-06-12 22:08:34 +02:00			`}`