eml-codec/src/text/whitespace.rs

192 lines
5.9 KiB
Rust
Raw Normal View History

2023-06-12 22:08:34 +02:00
use nom::{
branch::alt,
2023-07-18 23:25:10 +02:00
bytes::complete::{is_not, tag, take_while1},
character::complete::{space0, space1},
2023-06-22 15:08:50 +02:00
combinator::{opt, recognize},
2023-06-12 22:08:34 +02:00
multi::{many0, many1},
2023-07-18 23:25:10 +02:00
sequence::{pair, tuple},
2023-06-22 15:08:50 +02:00
IResult,
2023-06-12 22:08:34 +02:00
};
2023-07-18 23:25:10 +02:00
use crate::text::encoding::encoded_word;
use crate::text::quoted::quoted_pair;
use crate::text::ascii;
2023-06-12 22:08:34 +02:00
2023-07-17 17:14:08 +02:00
/// Whitespace (space, new line, tab) content and
/// delimited content (eg. comment, line, sections, etc.)
2023-07-18 23:25:10 +02:00
/// Obsolete/Compatible CRLF
///
/// Theoretically, all lines must end with \r\n
/// but some mail servers like Dovecot support malformated emails,
/// for example with only \n eol. It works because
/// \r or \n is allowed nowhere else, so we also add this support.
2023-07-16 09:55:47 +02:00
2023-07-18 23:25:10 +02:00
pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> {
alt((tag(ascii::CRLF), tag(&[ascii::CR]), tag(&[ascii::LF])))(input)
2023-07-17 17:14:08 +02:00
}
2023-07-16 09:55:47 +02:00
pub fn line(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> {
2023-07-17 11:44:55 +02:00
// is_not(CRLF) is a hack, it means "is not CR or LF"
// and not "is not CRLF". In other words, it continues while
// it does not encounter 0x0D or 0x0A.
2023-07-18 23:25:10 +02:00
pair(is_not(ascii::CRLF), obs_crlf)(input)
2023-07-16 09:55:47 +02:00
}
2023-07-18 23:25:10 +02:00
/// ```abnf
/// fold_line = any *(1*(crlf WS) any) crlf
/// ```
pub fn foldable_line(input: &[u8]) -> IResult<&[u8], &[u8]> {
recognize(tuple((
is_not(ascii::CRLF),
many0(pair(
many1(pair(obs_crlf, space1)),
is_not(ascii::CRLF),
)),
obs_crlf,
)))(input)
2023-07-16 09:55:47 +02:00
}
2023-06-12 22:08:34 +02:00
// --- whitespaces and comments
// Note: WSP = SP / HTAB = %x20 / %x09
// nom::*::space0 = *WSP
// nom::*::space1 = 1*WSP
/// Permissive foldable white space
///
/// Folding white space are used for long headers splitted on multiple lines.
/// The obsolete syntax allowes multiple lines without content; implemented for compatibility
/// reasons
2023-07-18 23:25:10 +02:00
pub fn fws(input: &[u8]) -> IResult<&[u8], u8> {
2023-06-12 22:08:34 +02:00
let (input, _) = alt((recognize(many1(fold_marker)), space1))(input)?;
2023-07-18 23:25:10 +02:00
Ok((input, ascii::SP))
2023-06-12 22:08:34 +02:00
}
2023-07-18 23:25:10 +02:00
fn fold_marker(input: &[u8]) -> IResult<&[u8], &[u8]> {
2023-06-22 15:08:50 +02:00
let (input, _) = space0(input)?;
2023-07-18 23:25:10 +02:00
let (input, _) = obs_crlf(input)?;
2023-06-22 15:08:50 +02:00
space1(input)
2023-06-12 22:08:34 +02:00
}
/// Folding White Space with Comment
///
/// Note: we drop the comments for now...
///
2023-06-13 09:18:36 +02:00
/// ```abnf
2023-06-12 22:08:34 +02:00
/// ctext = %d33-39 / ; Printable US-ASCII
/// %d42-91 / ; characters not including
/// %d93-126 / ; "(", ")", or "\"
/// obs-ctext
///
/// ccontent = ctext / quoted-pair / comment
///
/// comment = "(" *([FWS] ccontent) [FWS] ")"
///
/// CFWS = (1*([FWS] comment) [FWS]) / FWS
/// ```
2023-07-18 23:25:10 +02:00
pub fn cfws(input: &[u8]) -> IResult<&[u8], &[u8]> {
2023-06-12 22:08:34 +02:00
alt((recognize(comments), recognize(fws)))(input)
}
2023-07-18 23:25:10 +02:00
pub fn comments(input: &[u8]) -> IResult<&[u8], ()> {
2023-06-12 22:08:34 +02:00
let (input, _) = many1(tuple((opt(fws), comment)))(input)?;
let (input, _) = opt(fws)(input)?;
Ok((input, ()))
}
2023-07-18 23:25:10 +02:00
pub fn comment(input: &[u8]) -> IResult<&[u8], ()> {
2023-06-12 22:08:34 +02:00
let (input, _) = tag("(")(input)?;
let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?;
let (input, _) = opt(fws)(input)?;
let (input, _) = tag(")")(input)?;
Ok((input, ()))
}
2023-07-18 23:25:10 +02:00
pub fn ccontent(input: &[u8]) -> IResult<&[u8], &[u8]> {
alt((ctext, recognize(quoted_pair), recognize(encoded_word), recognize(comment)))(input)
2023-06-12 22:08:34 +02:00
}
2023-07-18 23:25:10 +02:00
/// Matches a run of one or more comment-text bytes, as decided by
/// [`is_ctext`].
pub fn ctext(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let comment_text_run = take_while1(is_ctext);
    comment_text_run(input)
}
pub fn is_ctext(c: u8) -> bool {
is_restr_ctext(c) || is_obs_no_ws_ctl(c)
2023-06-12 22:08:34 +02:00
}
/// Check if it's a comment text character
///
/// ```abnf
/// ctext = %d33-39 / ; Printable US-ASCII
/// %d42-91 / ; characters not including
/// %d93-126 / ; "(", ")", or "\"
/// obs-ctext
///```
2023-07-18 23:25:10 +02:00
pub fn is_restr_ctext(c: u8) -> bool {
(c >= ascii::EXCLAMATION && c <= ascii::SQUOTE)
|| (c >= ascii::ASTERISK && c <= ascii::LEFT_BRACKET)
|| (c >= ascii::RIGHT_BRACKET && c <= ascii::TILDE)
2023-06-16 12:07:17 +02:00
}
2023-06-22 15:08:50 +02:00
/// US ASCII control characters without effect
2023-06-16 12:07:17 +02:00
///
/// ```abnf
/// obs-NO-WS-CTL = %d1-8 / ; US-ASCII control
/// %d11 / ; characters that do not
/// %d12 / ; include the carriage
/// %d14-31 / ; return, line feed, and
/// %d127 ; white space characters
/// ```
2023-07-18 23:25:10 +02:00
pub fn is_obs_no_ws_ctl(c: u8) -> bool {
(c >= ascii::SOH && c <= ascii::BS)
|| c == ascii::VT
|| c == ascii::FF
|| (c >= ascii::SO && c <= ascii::US)
|| c == ascii::DEL
2023-06-16 12:07:17 +02:00
}
2023-06-12 22:08:34 +02:00
#[cfg(test)]
mod tests {
    use super::*;

    // Every accepted line ending is consumed and returned as-is.
    #[test]
    fn test_obs_crlf() {
        assert_eq!(obs_crlf(b"\rworld"), Ok((&b"world"[..], &b"\r"[..])));
        assert_eq!(obs_crlf(b"\r\nworld"), Ok((&b"world"[..], &b"\r\n"[..])));
        assert_eq!(obs_crlf(b"\nworld"), Ok((&b"world"[..], &b"\n"[..])));
    }

    // Folds and plain blanks collapse to a single space; a line break that
    // is not followed by a blank is not folding white space.
    #[test]
    fn test_fws() {
        assert_eq!(fws(b"\r\n world"), Ok((&b"world"[..], ascii::SP)));
        assert_eq!(fws(b" \r\n \r\n world"), Ok((&b"world"[..], ascii::SP)));
        assert_eq!(fws(b" world"), Ok((&b"world"[..], ascii::SP)));
        assert!(fws(b"\r\nFrom: test").is_err());
    }

    // Comments (including escaped parentheses and nesting) are recognized
    // together with the white space that follows them.
    #[test]
    fn test_cfws() {
        assert_eq!(
            cfws(b"(A nice \\) chap) <pete(his account)@silly.test(his host)>"),
            Ok((
                &b"<pete(his account)@silly.test(his host)>"[..],
                &b"(A nice \\) chap) "[..]
            ))
        );
        assert_eq!(
            cfws(b"(Chris's host.)public.example>,"),
            Ok((&b"public.example>,"[..], &b"(Chris's host.)"[..]))
        );
        assert_eq!(
            cfws(b"(double (comment) is fun) wouch"),
            Ok((&b"wouch"[..], &b"(double (comment) is fun) "[..]))
        );
    }

    // An encoded word inside a comment is consumed as part of the comment.
    #[test]
    fn test_cfws_encoded_word() {
        assert_eq!(
            cfws(b"(=?US-ASCII?Q?Keith_Moore?=)"),
            Ok((&b""[..], &b"(=?US-ASCII?Q?Keith_Moore?=)"[..])),
        );
    }
}