eml-codec/src/text/whitespace.rs

use nom::{
    branch::alt,
    bytes::complete::{is_not, tag, take_while1},
    character::complete::{space0, space1},
    combinator::{opt, recognize},
    multi::{many0, many1},
    sequence::{pair, tuple},
    IResult,
};
use crate::text::encoding::encoded_word;
use crate::text::quoted::quoted_pair;
use crate::text::ascii;

// Module overview: whitespace (space, newline, tab) content and
// delimited content (e.g. comments, lines, sections, etc.).

/// Obsolete/compatible end-of-line marker.
///
/// In theory every line must end with `\r\n`, but some mail servers such as
/// Dovecot accept malformed emails, for example ones that only use `\n` as
/// end of line. This works because `\r` and `\n` are allowed nowhere else,
/// so the same leniency is implemented here.
///
/// Accepts, in this order, `ascii::CRLF`, a lone `ascii::CR`, or a lone
/// `ascii::LF`, and returns the bytes that were matched.
pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> {
    // The full sequence is tried before the single-byte fallbacks.
    let crlf = tag(ascii::CRLF);
    alt((crlf, tag(&[ascii::CR]), tag(&[ascii::LF])))(input)
}
pub fn line(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> {
    // is_not(CRLF) is a hack, it means "is not CR or LF"
    // and not "is not CRLF". In other words, it continues while
    // it does not encounter 0x0D or 0x0A.
    pair(is_not(ascii::CRLF), obs_crlf)(input)
}

/// Parses a logical line that may be folded over several physical lines.
///
/// ```abnf
/// fold_line = any *(1*(crlf WS) any) crlf
/// ```
///
/// The returned slice is the whole span that was consumed, including the
/// fold markers and the final end-of-line marker.
pub fn foldable_line(input: &[u8]) -> IResult<&[u8], &[u8]> {
    // A continuation: one or more "end-of-line + whitespace" fold markers,
    // followed by more content that contains neither CR nor LF.
    let continuation = pair(many1(pair(obs_crlf, space1)), is_not(ascii::CRLF));

    recognize(tuple((is_not(ascii::CRLF), many0(continuation), obs_crlf)))(input)
}

// --- whitespaces and comments

// Note: WSP = SP / HTAB = %x20 / %x09
// nom::*::space0 = *WSP
// nom::*::space1 = 1*WSP

/// Permissive folding white space.
///
/// Folding white space is used for long headers that are split across
/// multiple lines. The obsolete syntax allows multiple lines without
/// content; it is implemented here for compatibility reasons.
///
/// Whatever whitespace is consumed, the parsed value is always the single
/// byte `ascii::SP`.
pub fn fws(input: &[u8]) -> IResult<&[u8], u8> {
    // Either one or more fold markers, or a plain run of spaces/tabs.
    let folded = recognize(many1(fold_marker));
    alt((folded, space1))(input).map(|(rest, _)| (rest, ascii::SP))
}
fn fold_marker(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let (input, _) = space0(input)?;
    let (input, _) = obs_crlf(input)?;
    space1(input)
}

/// Folding white space with comments.
///
/// Note: comments are not interpreted for now; the function only returns the
/// raw bytes that were consumed.
///
/// ```abnf
///   ctext           =   %d33-39 /          ; Printable US-ASCII
///                       %d42-91 /          ;  characters not including
///                       %d93-126 /         ;  "(", ")", or "\"
///                       obs-ctext
///
///   ccontent        =   ctext / quoted-pair / comment
///
///   comment         =   "(" *([FWS] ccontent) [FWS] ")"
///
///   CFWS            =   (1*([FWS] comment) [FWS]) / FWS
/// ```
pub fn cfws(input: &[u8]) -> IResult<&[u8], &[u8]> {
    // The comment-bearing form is tried first; plain folding white space is
    // the fallback.
    let with_comments = recognize(comments);
    let plain_whitespace = recognize(fws);
    alt((with_comments, plain_whitespace))(input)
}

pub fn comments(input: &[u8]) -> IResult<&[u8], ()> {
    let (input, _) = many1(tuple((opt(fws), comment)))(input)?;
    let (input, _) = opt(fws)(input)?;
    Ok((input, ()))
}

pub fn comment(input: &[u8]) -> IResult<&[u8], ()> {
    let (input, _) = tag("(")(input)?;
    let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?;
    let (input, _) = opt(fws)(input)?;
    let (input, _) = tag(")")(input)?;
    Ok((input, ()))
}

/// One item of comment content, returned as the raw bytes it spans.
///
/// Alternatives are tried in this order: a run of comment text, a quoted
/// pair, an encoded word, then a nested comment.
pub fn ccontent(input: &[u8]) -> IResult<&[u8], &[u8]> {
    // NOTE(review): `=` and `?` look like valid ctext bytes, so the ctext
    // branch presumably matches the start of an encoded word before the
    // encoded-word branch is ever tried — confirm this ordering is intended.
    let escaped = recognize(quoted_pair);
    let encoded = recognize(encoded_word);
    let nested = recognize(comment);
    alt((ctext, escaped, encoded, nested))(input)
}

/// A non-empty run of comment text bytes, as classified by [`is_ctext`].
pub fn ctext(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let run_of_ctext = take_while1(is_ctext);
    run_of_ctext(input)
}

/// Returns `true` when `c` may appear as plain text inside a comment.
///
/// A byte qualifies if it is either restricted comment text
/// ([`is_restr_ctext`]) or an obsolete control character
/// ([`is_obs_no_ws_ctl`]).
pub fn is_ctext(c: u8) -> bool {
    if is_restr_ctext(c) {
        return true;
    }
    is_obs_no_ws_ctl(c)
}

/// Checks whether a byte is a (non-obsolete) comment text character.
///
/// ```abnf
///   ctext           =   %d33-39 /          ; Printable US-ASCII
///                       %d42-91 /          ;  characters not including
///                       %d93-126 /         ;  "(", ")", or "\"
///                       obs-ctext
/// ```
///
/// Only the three explicit ranges are covered here; `obs-ctext` is handled
/// separately.
pub fn is_restr_ctext(c: u8) -> bool {
    // Three inclusive ranges, one per line of the grammar above.
    let first_range = ascii::EXCLAMATION..=ascii::SQUOTE;
    let second_range = ascii::ASTERISK..=ascii::LEFT_BRACKET;
    let third_range = ascii::RIGHT_BRACKET..=ascii::TILDE;

    first_range.contains(&c) || second_range.contains(&c) || third_range.contains(&c)
}

/// Checks whether a byte is a US-ASCII control character without effect.
///
/// ```abnf
///   obs-NO-WS-CTL   =   %d1-8 /            ; US-ASCII control
///                       %d11 /             ;  characters that do not
///                       %d12 /             ;  include the carriage
///                       %d14-31 /          ;  return, line feed, and
///                       %d127              ;  white space characters
/// ```
pub fn is_obs_no_ws_ctl(c: u8) -> bool {
    // The two contiguous runs of the grammar above...
    let lower_run = ascii::SOH..=ascii::BS;
    let upper_run = ascii::SO..=ascii::US;
    // ...and the three isolated code points.
    let isolated = [ascii::VT, ascii::FF, ascii::DEL];

    lower_run.contains(&c) || upper_run.contains(&c) || isolated.contains(&c)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Each accepted end-of-line spelling is consumed and returned verbatim.
    #[test]
    fn test_obs_crlf() {
        let rest = &b"world"[..];
        assert_eq!(obs_crlf(b"\rworld"), Ok((rest, &b"\r"[..])));
        assert_eq!(obs_crlf(b"\r\nworld"), Ok((rest, &b"\r\n"[..])));
        assert_eq!(obs_crlf(b"\nworld"), Ok((rest, &b"\n"[..])));
    }

    /// Folding white space collapses to one space; a line break that is not
    /// followed by whitespace is not a fold.
    #[test]
    fn test_fws() {
        let collapsed = (&b"world"[..], ascii::SP);
        assert_eq!(fws(b"\r\n world"), Ok(collapsed));
        assert_eq!(fws(b" \r\n \r\n world"), Ok(collapsed));
        assert_eq!(fws(b" world"), Ok(collapsed));

        let not_a_fold = fws(b"\r\nFrom: test");
        assert!(not_a_fold.is_err());
    }

    /// Comments (escaped, adjacent to text, nested) are consumed together
    /// with their surrounding white space.
    #[test]
    fn test_cfws() {
        // (input, expected remainder, expected consumed bytes)
        let cases: [(&[u8], &[u8], &[u8]); 3] = [
            (
                &b"(A nice \\) chap) <pete(his account)@silly.test(his host)>"[..],
                &b"<pete(his account)@silly.test(his host)>"[..],
                &b"(A nice \\) chap) "[..],
            ),
            (
                &b"(Chris's host.)public.example>,"[..],
                &b"public.example>,"[..],
                &b"(Chris's host.)"[..],
            ),
            (
                &b"(double (comment) is fun) wouch"[..],
                &b"wouch"[..],
                &b"(double (comment) is fun) "[..],
            ),
        ];

        for &(input, remainder, consumed) in cases.iter() {
            assert_eq!(cfws(input), Ok((remainder, consumed)));
        }
    }

    /// A comment made of a single encoded word is consumed entirely.
    #[test]
    fn test_cfws_encoded_word() {
        let whole = &b"(=?US-ASCII?Q?Keith_Moore?=)"[..];
        assert_eq!(cfws(whole), Ok((&b""[..], whole)));
    }
}