implement comment foldable whitespace
This commit is contained in:
parent
7d3b3ff053
commit
6e76fed684
6 changed files with 213 additions and 62 deletions
|
@ -1,3 +1,7 @@
|
||||||
# imf-codec
|
# imf-codec
|
||||||
|
|
||||||
|
**Work in progress, do not use in production**
|
||||||
|
|
||||||
|
**Focus: correctness over performance**
|
||||||
|
|
||||||
**This is currently only a decoder (parser), encoding is not supported.**
|
**This is currently only a decoder (parser), encoding is not supported.**
|
||||||
|
|
50
src/abnf.rs
50
src/abnf.rs
|
@ -1,50 +0,0 @@
|
||||||
use nom::{
|
|
||||||
IResult,
|
|
||||||
branch::alt,
|
|
||||||
bytes::complete::{tag, take_while1},
|
|
||||||
character::complete::{crlf, space0, space1},
|
|
||||||
combinator::opt,
|
|
||||||
sequence::terminated,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// ABNF rfc5234
|
|
||||||
|
|
||||||
/// Permissive CRLF
|
|
||||||
///
|
|
||||||
/// Theoretically, all lines must end with \r\n
|
|
||||||
/// but mail servers support malformed emails,
|
|
||||||
/// for example with only \n eol. It works because
|
|
||||||
/// \r\n is allowed nowhere else, so we also add this support.
|
|
||||||
pub fn perm_crlf(input: &str) -> IResult<&str, &str> {
|
|
||||||
alt((crlf, tag("\r"), tag("\n")))(input)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Note: WSP = SP / HTAB = %x20 / %x09
|
|
||||||
// nom::*::space0 = *WSP
|
|
||||||
// nom::*::space1 = 1*WSP
|
|
||||||
|
|
||||||
/// Parse a folding white space
|
|
||||||
///
|
|
||||||
/// Folding white space is used for long headers split across multiple lines
|
|
||||||
///
|
|
||||||
/// ```abnf
|
|
||||||
/// FWS = ([*WSP CRLF] 1*WSP) / obs-FWS
|
|
||||||
/// obs-FWS = 1*WSP *(CRLF 1*WSP)
|
|
||||||
/// ```
|
|
||||||
pub fn fws(input: &str) -> IResult<&str, &str> {
|
|
||||||
let (input, _) = opt(terminated(space0, perm_crlf))(input)?;
|
|
||||||
// @FIXME: not implemented obs-FWS
|
|
||||||
space1(input)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Sequence of visible chars with the UTF-8 extension
|
|
||||||
///
|
|
||||||
/// ```abnf
|
|
||||||
/// VCHAR = %x21-7E
|
|
||||||
/// ; visible (printing) characters
|
|
||||||
/// VCHAR =/ UTF8-non-ascii
|
|
||||||
/// SEQ = 1*VCHAR
|
|
||||||
///```
|
|
||||||
pub fn vchar_seq(input: &str) -> IResult<&str, &str> {
|
|
||||||
take_while1(|c: char| (c >= '\x21' && c <= '\x7E') || !c.is_ascii())(input)
|
|
||||||
}
|
|
|
@ -11,18 +11,18 @@ use nom::{
|
||||||
sequence::tuple,
|
sequence::tuple,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::abnf::{fws, vchar_seq, perm_crlf};
|
use crate::tokens::{perm_fws, vchar_seq, perm_crlf};
|
||||||
use crate::model::{HeaderSection, HeaderDate};
|
use crate::model::{PermissiveHeaderSection, HeaderDate, MailboxRef};
|
||||||
|
|
||||||
/// HEADERS
|
/// HEADERS
|
||||||
|
|
||||||
/// Header section
|
/// Header section
|
||||||
///
|
///
|
||||||
/// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2
|
/// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2
|
||||||
pub fn header_section(input: &str) -> IResult<&str, HeaderSection> {
|
pub fn header_section(input: &str) -> IResult<&str, PermissiveHeaderSection> {
|
||||||
let (input, headers) = fold_many0(
|
let (input, headers) = fold_many0(
|
||||||
header_field,
|
header_field,
|
||||||
HeaderSection::default,
|
PermissiveHeaderSection::default,
|
||||||
|mut section, head| {
|
|mut section, head| {
|
||||||
match head {
|
match head {
|
||||||
HeaderField::Date(d) => {
|
HeaderField::Date(d) => {
|
||||||
|
@ -114,7 +114,11 @@ fn header_field(input: &str) -> IResult<&str, HeaderField> {
|
||||||
};
|
};
|
||||||
(input, HeaderField::Date(date))
|
(input, HeaderField::Date(date))
|
||||||
},
|
},
|
||||||
//"From" => unimplemented!(),
|
"From" => {
|
||||||
|
let (input, mbx) = mailbox(input)?;
|
||||||
|
//many0(
|
||||||
|
unimplemented!()
|
||||||
|
},
|
||||||
"Sender" => unimplemented!(),
|
"Sender" => unimplemented!(),
|
||||||
"Subject" => {
|
"Subject" => {
|
||||||
let (input, body) = unstructured(input)?;
|
let (input, body) = unstructured(input)?;
|
||||||
|
@ -136,17 +140,17 @@ fn header_field(input: &str) -> IResult<&str, HeaderField> {
|
||||||
/// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct
|
/// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct
|
||||||
/// ```
|
/// ```
|
||||||
fn unstructured(input: &str) -> IResult<&str, String> {
|
fn unstructured(input: &str) -> IResult<&str, String> {
|
||||||
let (input, r) = many0(tuple((opt(fws), vchar_seq)))(input)?;
|
let (input, r) = many0(tuple((opt(perm_fws), vchar_seq)))(input)?;
|
||||||
let (input, _) = space0(input)?;
|
let (input, _) = space0(input)?;
|
||||||
|
|
||||||
// Try to optimize for the most common cases
|
// Try to optimize for the most common cases
|
||||||
let body = match r.as_slice() {
|
let body = match r.as_slice() {
|
||||||
[(None, content)] => content.to_string(),
|
[(None, content)] => content.to_string(),
|
||||||
[(Some(ws), content)] => ws.to_string() + content,
|
[(Some(_), content)] => " ".to_string() + content,
|
||||||
lines => lines.iter().fold(String::with_capacity(255), |acc, item| {
|
lines => lines.iter().fold(String::with_capacity(255), |acc, item| {
|
||||||
let (may_ws, content) = item;
|
let (may_ws, content) = item;
|
||||||
match may_ws {
|
match may_ws {
|
||||||
Some(ws) => acc + ws + content,
|
Some(ws) => acc + " " + content,
|
||||||
None => acc + content,
|
None => acc + content,
|
||||||
}
|
}
|
||||||
}),
|
}),
|
||||||
|
@ -155,3 +159,6 @@ fn unstructured(input: &str) -> IResult<&str, String> {
|
||||||
Ok((input, body))
|
Ok((input, body))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Placeholder for the mailbox parser; intended to produce a `MailboxRef` from `input`.
///
/// Not implemented yet: the body is a bare `unimplemented!()`, so calling
/// this function panics instead of returning a parse result.
fn mailbox(input: &str) -> IResult<&str, MailboxRef> {
|
||||||
|
unimplemented!();
|
||||||
|
}
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
pub mod headers;
|
pub mod headers;
|
||||||
pub mod model;
|
pub mod model;
|
||||||
mod abnf;
|
mod tokens;
|
||||||
|
|
41
src/model.rs
41
src/model.rs
|
@ -9,10 +9,37 @@ pub enum HeaderDate {
|
||||||
None,
|
None,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A single mailbox, holding string slices with lifetime `'a`.
#[derive(Debug)]
|
||||||
|
pub struct MailboxRef<'a> {
|
||||||
|
/// The actual "email address" like hello@example.com
|
||||||
|
pub addrspec: &'a str,
|
||||||
|
/// Optional name attached to the mailbox
|
||||||
|
/// (presumably the display name; confirm once the parser exists).
pub name: Option<&'a str>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A named group of mailboxes.
#[derive(Debug)]
|
||||||
|
pub struct GroupRef<'a> {
|
||||||
|
/// Name of the group.
pub name: &'a str,
|
||||||
|
/// Mailboxes listed in the group.
pub mbx: Vec<MailboxRef<'a>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An address: either one mailbox or a group of mailboxes.
#[derive(Debug)]
|
||||||
|
pub enum AddressRef<'a> {
|
||||||
|
/// A single mailbox.
Single(MailboxRef<'a>),
|
||||||
|
/// A named group of mailboxes.
Many(GroupRef<'a>),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Permissive Header Section
|
||||||
|
///
|
||||||
|
/// This is a structure intended for parsing/decoding,
|
||||||
|
/// hence it supports cases where the email is considered
|
||||||
|
/// as invalid according to RFC5322 but for which we can
|
||||||
|
/// still extract some data.
|
||||||
#[derive(Debug, Default)]
|
#[derive(Debug, Default)]
|
||||||
pub struct HeaderSection<'a> {
|
pub struct PermissiveHeaderSection<'a> {
|
||||||
pub subject: Option<String>,
|
pub subject: Option<String>,
|
||||||
pub from: Vec<String>,
|
pub from: Vec<MailboxRef<'a>>,
|
||||||
|
pub sender: Option<MailboxRef<'a>>,
|
||||||
|
pub reply_to: Vec<AddressRef<'a>>,
|
||||||
pub date: HeaderDate,
|
pub date: HeaderDate,
|
||||||
pub optional: HashMap<&'a str, String>,
|
pub optional: HashMap<&'a str, String>,
|
||||||
}
|
}
|
||||||
|
@ -21,7 +48,15 @@ enum InvalidEmailErr {
|
||||||
NoUsableDate,
|
NoUsableDate,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> HeaderSection<'a> {
|
impl<'a> PermissiveHeaderSection<'a> {
|
||||||
|
/// Check validity of the email
|
||||||
|
///
|
||||||
|
/// Especially check that there is no missing fields,
|
||||||
|
/// or no unique fields declared multiple times.
|
||||||
|
///
|
||||||
|
/// See: https://www.rfc-editor.org/rfc/rfc5322#section-3.6
|
||||||
|
//@FIXME could be changed to a to_StrictHeaderSection call. All fixed errors would be returned in
|
||||||
|
// a vec of errors.
|
||||||
fn is_valid(&self) -> Result<(), InvalidEmailErr> {
|
fn is_valid(&self) -> Result<(), InvalidEmailErr> {
|
||||||
match self.date {
|
match self.date {
|
||||||
HeaderDate::Parsed(_) => (),
|
HeaderDate::Parsed(_) => (),
|
||||||
|
|
155
src/tokens.rs
Normal file
155
src/tokens.rs
Normal file
|
@ -0,0 +1,155 @@
|
||||||
|
use nom::{
|
||||||
|
IResult,
|
||||||
|
branch::alt,
|
||||||
|
bytes::complete::{tag, take_while1},
|
||||||
|
character::complete::{crlf, satisfy, space0, space1},
|
||||||
|
combinator::{recognize, opt},
|
||||||
|
multi::{many0, many1},
|
||||||
|
sequence::{preceded, terminated, tuple},
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Lexical tokens
|
||||||
|
///
|
||||||
|
/// Approx. maps to section 3.2 of the RFC
|
||||||
|
/// https://www.rfc-editor.org/rfc/rfc5322#section-3.2
|
||||||
|
/// Also https://datatracker.ietf.org/doc/html/rfc6532
|
||||||
|
|
||||||
|
/// Permissive CRLF
|
||||||
|
///
|
||||||
|
/// Theoretically, all lines must end with \r\n
|
||||||
|
/// but some mail servers like Dovecot support malformated emails,
|
||||||
|
/// for example with only \n eol. It works because
|
||||||
|
/// \r or \n is allowed nowhere else, so we also add this support.
|
||||||
|
pub fn perm_crlf(input: &str) -> IResult<&str, &str> {
|
||||||
|
alt((crlf, tag("\r"), tag("\n")))(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: WSP = SP / HTAB = %x20 / %x09
|
||||||
|
// nom::*::space0 = *WSP
|
||||||
|
// nom::*::space1 = 1*WSP
|
||||||
|
|
||||||
|
/// Quoted pair
|
||||||
|
///
|
||||||
|
/// ```abnf
|
||||||
|
/// quoted-pair = ("\" (VCHAR / WSP)) / obs-qp
|
||||||
|
/// ```
|
||||||
|
pub fn quoted_pair(input: &str) -> IResult<&str, char> {
|
||||||
|
preceded(tag("\\"), satisfy(|c| is_vchar(c) || c == '\t' || c == ' '))(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Permissive foldable white space
|
||||||
|
///
|
||||||
|
/// Folding white space are used for long headers splitted on multiple lines.
|
||||||
|
/// The obsolete syntax allowes multiple lines without content; implemented for compatibility
|
||||||
|
/// reasons
|
||||||
|
pub fn perm_fws(input: &str) -> IResult<&str, &str> {
|
||||||
|
alt((recognize(many1(fold_marker)), space1))(input)
|
||||||
|
}
|
||||||
|
fn fold_marker(input: &str) -> IResult<&str, &str> {
|
||||||
|
let (input, _) = space0(input)?;
|
||||||
|
let (input, _) = perm_crlf(input)?;
|
||||||
|
space1(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Folding White Space with Comment
|
||||||
|
///
|
||||||
|
/// Note: we drop the comments for now...
|
||||||
|
///
|
||||||
|
/// ctext = %d33-39 / ; Printable US-ASCII
|
||||||
|
/// %d42-91 / ; characters not including
|
||||||
|
/// %d93-126 / ; "(", ")", or "\"
|
||||||
|
/// obs-ctext
|
||||||
|
///
|
||||||
|
/// ccontent = ctext / quoted-pair / comment
|
||||||
|
///
|
||||||
|
/// comment = "(" *([FWS] ccontent) [FWS] ")"
|
||||||
|
///
|
||||||
|
/// CFWS = (1*([FWS] comment) [FWS]) / FWS
|
||||||
|
/// ```
|
||||||
|
pub fn cfws(input: &str) -> IResult<&str, &str> {
|
||||||
|
alt((perm_fws, recognize(comments)))(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn comments(input: &str) -> IResult<&str, ()> {
|
||||||
|
let (input, _) = many1(tuple((opt(perm_fws), comment)))(input)?;
|
||||||
|
let (input, _) = opt(perm_fws)(input)?;
|
||||||
|
Ok((input, ()))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn comment(input: &str) -> IResult<&str, ()> {
|
||||||
|
let (input, _) = tag("(")(input)?;
|
||||||
|
let (input, _) = many0(tuple((opt(perm_fws), ccontent)))(input)?;
|
||||||
|
let (input, _) = opt(perm_fws)(input)?;
|
||||||
|
let (input, _) = tag(")")(input)?;
|
||||||
|
Ok((input, ()))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn ccontent(input: &str) -> IResult<&str, &str> {
|
||||||
|
alt((recognize(ctext), recognize(quoted_pair), recognize(comment)))(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn ctext(input: &str) -> IResult<&str, char> {
|
||||||
|
satisfy(is_ctext)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if it's a comment text character
///
/// ```abnf
///   ctext           =   %d33-39 /          ; Printable US-ASCII
///                       %d42-91 /          ;  characters not including
///                       %d93-126 /         ;  "(", ")", or "\"
///                       obs-ctext
/// ```
///
/// Any non-ASCII character is accepted as well. The control characters of
/// `obs-ctext` are not accepted by this predicate.
pub fn is_ctext(c: char) -> bool {
    // 0x21-0x27, 0x2A-0x5B and 0x5D-0x7E are the decimal ranges 33-39, 42-91
    // and 93-126 of the grammar, i.e. printable ASCII minus "(", ")" and "\".
    matches!(c, '\x21'..='\x27' | '\x2A'..='\x5B' | '\x5D'..='\x7E') || !c.is_ascii()
}
|
||||||
|
|
||||||
|
/// VCHAR definition
///
/// ```abnf
///   VCHAR   =  %x21-7E
///              ; visible (printing) characters
///   VCHAR   =/  UTF8-non-ascii
/// ```
///
/// Returns `true` for printable ASCII characters (space excluded) and for
/// any non-ASCII character.
pub fn is_vchar(c: char) -> bool {
    matches!(c, '\x21'..='\x7E') || !c.is_ascii()
}
|
||||||
|
|
||||||
|
/// Sequence of visible chars with the UTF-8 extension
|
||||||
|
///
|
||||||
|
/// ```abnf
|
||||||
|
/// VCHAR = %x21-7E
|
||||||
|
/// ; visible (printing) characters
|
||||||
|
/// VCHAR =/ UTF8-non-ascii
|
||||||
|
/// SEQ = 1*VCHAR
|
||||||
|
///```
|
||||||
|
pub fn vchar_seq(input: &str) -> IResult<&str, &str> {
|
||||||
|
take_while1(is_vchar)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_vchar_seq() {
        assert_eq!(vchar_seq("hello world"), Ok((" world", "hello")));
        assert_eq!(vchar_seq("hello👋 world"), Ok((" world", "hello👋")));
    }

    #[test]
    fn test_perm_crlf() {
        assert_eq!(perm_crlf("\rworld"), Ok(("world", "\r")));
        assert_eq!(perm_crlf("\r\nworld"), Ok(("world", "\r\n")));
        assert_eq!(perm_crlf("\nworld"), Ok(("world", "\n")));
    }

    #[test]
    fn test_perm_fws() {
        assert_eq!(perm_fws("\r\n world"), Ok(("world", "\r\n ")));
        assert_eq!(perm_fws(" \r\n \r\n world"), Ok(("world", " \r\n \r\n ")));
        assert_eq!(perm_fws(" world"), Ok(("world", " ")));
        // A line ending that is not followed by white space starts a new
        // header field; it must not be treated as a fold.
        assert!(perm_fws("\r\nFrom: test").is_err());
    }

    #[test]
    fn test_quoted_pair() {
        assert_eq!(quoted_pair("\\)rest"), Ok(("rest", ')')));
        assert_eq!(quoted_pair("\\ rest"), Ok(("rest", ' ')));
        // A backslash followed by a bare CR is not a valid quoted pair.
        assert!(quoted_pair("\\\r").is_err());
    }

    #[test]
    fn test_is_ctext() {
        assert!(is_ctext('a'));
        assert!(is_ctext('👋'));
        assert!(!is_ctext('('));
        assert!(!is_ctext(')'));
        assert!(!is_ctext('\\'));
        assert!(!is_ctext(' '));
    }

    #[test]
    fn test_cfws() {
        assert_eq!(
            cfws("(A nice \\) chap) <pete(his account)@silly.test(his host)>"),
            Ok(("<pete(his account)@silly.test(his host)>", "(A nice \\) chap) "))
        );
    }
}
|
Loading…
Reference in a new issue