From a503eb1de6c5641a9304b39f71a4d8affce51eb4 Mon Sep 17 00:00:00 2001 From: Quentin Dufour Date: Tue, 18 Jul 2023 23:25:10 +0200 Subject: [PATCH] wip refactor --- src/fragments/misc_token.rs | 129 ------- src/fragments/mod.rs | 23 -- src/fragments/model.rs | 146 -------- src/fragments/words.rs | 116 ------ src/headers.rs | 27 ++ src/lib.rs | 6 +- .../mime.rs => mime/content_fields.rs} | 12 - src/{fragments => mime}/part.rs | 0 .../body_structure.rs | 0 .../extract_fields.rs | 0 .../field_eager.rs | 0 .../field_lazy.rs | 0 .../guess_charset.rs | 0 .../header_section.rs | 0 src/{multipass => old.multipass}/mod.rs | 0 src/{multipass => old.multipass}/segment.rs | 0 src/parse.rs | 10 +- src/{fragments => rfc5322}/address.rs | 25 +- src/{fragments => rfc5322}/datetime.rs | 0 src/{fragments => rfc5322}/eager.rs | 0 src/{fragments => rfc5322}/identification.rs | 8 + src/{fragments => rfc5322}/lazy.rs | 0 src/{fragments => rfc5322}/mailbox.rs | 28 +- .../section.rs => rfc5322/message.rs} | 0 src/{fragments => rfc5322}/trace.rs | 0 src/text/ascii.rs | 142 ++++++++ src/text/buffer.rs | 43 +++ src/{fragments => text}/encoding.rs | 129 ++++--- src/text/misc_token.rs | 166 +++++++++ src/text/mod.rs | 7 + src/{fragments => text}/quoted.rs | 49 ++- src/{fragments => text}/whitespace.rs | 116 +++--- src/text/words.rs | 133 +++++++ tests/enron.rs | 129 ------- tests/known.rs | 340 ------------------ 35 files changed, 746 insertions(+), 1038 deletions(-) delete mode 100644 src/fragments/misc_token.rs delete mode 100644 src/fragments/mod.rs delete mode 100644 src/fragments/model.rs delete mode 100644 src/fragments/words.rs create mode 100644 src/headers.rs rename src/{fragments/mime.rs => mime/content_fields.rs} (97%) rename src/{fragments => mime}/part.rs (100%) rename src/{multipass => old.multipass}/body_structure.rs (100%) rename src/{multipass => old.multipass}/extract_fields.rs (100%) rename src/{multipass => old.multipass}/field_eager.rs (100%) rename src/{multipass => old.multipass}/field_lazy.rs (100%) rename src/{multipass => old.multipass}/guess_charset.rs (100%) rename src/{multipass => old.multipass}/header_section.rs (100%) rename src/{multipass => old.multipass}/mod.rs (100%) rename src/{multipass => old.multipass}/segment.rs (100%) rename src/{fragments => rfc5322}/address.rs (90%) rename src/{fragments => rfc5322}/datetime.rs (100%) rename src/{fragments => rfc5322}/eager.rs (100%) rename src/{fragments => rfc5322}/identification.rs (93%) rename src/{fragments => rfc5322}/lazy.rs (100%) rename src/{fragments => rfc5322}/mailbox.rs (95%) rename src/{fragments/section.rs => rfc5322/message.rs} (100%) rename src/{fragments => rfc5322}/trace.rs (100%) create mode 100644 src/text/ascii.rs create mode 100644 src/text/buffer.rs rename src/{fragments => text}/encoding.rs (50%) create mode 100644 src/text/misc_token.rs create mode 100644 src/text/mod.rs rename src/{fragments => text}/quoted.rs (55%) rename src/{fragments => text}/whitespace.rs (67%) create mode 100644 src/text/words.rs delete mode 100644 tests/enron.rs delete mode 100644 tests/known.rs diff --git a/src/fragments/misc_token.rs b/src/fragments/misc_token.rs deleted file mode 100644 index 11e25af..0000000 --- a/src/fragments/misc_token.rs +++ /dev/null @@ -1,129 +0,0 @@ -use nom::{ - branch::alt, - bytes::complete::{tag, take_while1}, - character::complete::space0, - combinator::{into, map, opt}, - multi::{many0, many1, separated_list1}, - sequence::tuple, - IResult, -}; -use std::borrow::Cow; - -use crate::error::IMFError; -use crate::fragments::lazy; -use crate::fragments::quoted::quoted_string; -use crate::fragments::whitespace::{fws, is_obs_no_ws_ctl}; -use crate::fragments::words::{atom, is_vchar}; -use crate::fragments::encoding::encoded_word; - -#[derive(Debug, PartialEq, Default)] -pub struct Unstructured(pub String); - -#[derive(Debug, PartialEq, Default)] -pub struct PhraseList(pub Vec); - -impl<'a> TryFrom<&'a lazy::Unstructured<'a>> for Unstructured { - type Error = IMFError<'a>; - - fn try_from(input: &'a lazy::Unstructured<'a>) -> Result { - unstructured(input.0) - .map(|(_, v)| Unstructured(v)) - .map_err(|e| IMFError::Unstructured(e)) - } -} - -impl<'a> TryFrom<&'a lazy::PhraseList<'a>> for PhraseList { - type Error = IMFError<'a>; - - fn try_from(p: &'a lazy::PhraseList<'a>) -> Result { - separated_list1(tag(","), phrase)(p.0) - .map(|(_, q)| PhraseList(q)) - .map_err(|e| IMFError::PhraseList(e)) - } -} - -/// Word -/// -/// ```abnf -/// word = atom / quoted-string -/// ``` -pub fn word(input: &str) -> IResult<&str, Cow> { - alt((into(quoted_string), into(encoded_word), into(atom)))(input) -} - -/// Phrase -/// -/// ```abnf -/// phrase = 1*word / obs-phrase -/// ``` -pub fn phrase(input: &str) -> IResult<&str, String> { - let (input, words) = many1(word)(input)?; - let phrase = words.join(" "); - Ok((input, phrase)) -} - -/// Compatible unstructured input -/// -/// ```abnf -/// obs-utext = %d0 / obs-NO-WS-CTL / VCHAR -/// ``` -fn is_unstructured(c: char) -> bool { - is_vchar(c) || is_obs_no_ws_ctl(c) || c == '\x00' -} - -enum UnstrToken { - Init, - Encoded, - Plain, -} - -/// Unstructured header field body -/// -/// ```abnf -/// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct -/// ``` -pub fn unstructured(input: &str) -> IResult<&str, String> { - let (input, r) = many0(tuple((opt(fws), alt(( - map(encoded_word, |v| (Cow::Owned(v), UnstrToken::Encoded)), - map(take_while1(is_unstructured), |v| (Cow::Borrowed(v), UnstrToken::Plain)), - )))))(input)?; - - let (input, _) = space0(input)?; - - // Try to optimize for the most common cases - let body = match r.as_slice() { - // Optimization when there is only one line - [(None, (content, _))] | [(_, (content, UnstrToken::Encoded))] => content.to_string(), - [(Some(_), (content, _))] => " ".to_string() + content, - // Generic case, with multiple lines - lines => lines.iter().fold( - (&UnstrToken::Init, String::with_capacity(255)), - |(prev_token, result), (may_ws, (content, current_token))| { - let new_res = match (may_ws, prev_token, current_token) { - (_, UnstrToken::Encoded, UnstrToken::Encoded) | (None, _, _) => result + content, - _ => result + " " + content, - }; - (current_token, new_res) - }).1, - }; - - Ok((input, body)) -} - - -#[cfg(test)] -mod tests { - use super::*; - #[test] - fn test_phrase() { - assert_eq!(phrase("hello world"), Ok(("", "hello world".into()))); - assert_eq!( - phrase("salut \"le\" monde"), - Ok(("", "salut le monde".into())) - ); - assert_eq!( - phrase("fin\r\n du\r\nmonde"), - Ok(("\r\nmonde", "fin du".into())) - ); - } -} diff --git a/src/fragments/mod.rs b/src/fragments/mod.rs deleted file mode 100644 index 5ac9bf3..0000000 --- a/src/fragments/mod.rs +++ /dev/null @@ -1,23 +0,0 @@ -// Model -pub mod model; - -// Generic -pub mod misc_token; -mod quoted; -pub mod whitespace; -mod words; - -// Header specific -mod address; -mod datetime; -pub mod eager; -mod identification; -pub mod lazy; -mod mailbox; -pub mod section; -pub mod trace; - -// MIME related -pub mod mime; -pub mod encoding; -pub mod part; diff --git a/src/fragments/model.rs b/src/fragments/model.rs deleted file mode 100644 index fb0fa30..0000000 --- a/src/fragments/model.rs +++ /dev/null @@ -1,146 +0,0 @@ -use chrono::{DateTime, FixedOffset}; -use std::collections::HashMap; - -#[derive(Debug, PartialEq)] -pub struct AddrSpec { - pub local_part: String, - pub domain: String, -} -impl AddrSpec { - pub fn fully_qualified(&self) -> String { - format!("{}@{}", self.local_part, self.domain) - } -} - -#[derive(Debug, PartialEq)] -pub struct MailboxRef { - // The actual "email address" like hello@example.com - pub addrspec: AddrSpec, - pub name: Option, -} -impl From for MailboxRef { - fn from(addr: AddrSpec) -> Self { - MailboxRef { - name: None, - addrspec: addr, - } - } -} -pub type MailboxList = Vec; - -#[derive(Debug, PartialEq)] -pub struct GroupRef { - pub name: String, - pub participants: Vec, -} - -#[derive(Debug, PartialEq)] -pub enum AddressRef { - Single(MailboxRef), - Many(GroupRef), -} -impl From for AddressRef { - fn from(mx: MailboxRef) -> Self { - AddressRef::Single(mx) - } -} -impl From for AddressRef { - fn from(grp: GroupRef) -> Self { - AddressRef::Many(grp) - } -} -pub type AddressList = Vec; - -#[derive(Debug, PartialEq)] -pub struct MessageId<'a> { - pub left: &'a str, - pub right: &'a str, -} -pub type MessageIdList<'a> = Vec>; - -#[derive(Debug, PartialEq)] -pub enum FieldBody<'a, T> { - Correct(T), - Failed(&'a str), -} - -#[derive(Debug, PartialEq)] -pub enum Field<'a> { - // 3.6.1. The Origination Date Field - Date(FieldBody<'a, Option>>), - - // 3.6.2. Originator Fields - From(FieldBody<'a, Vec>), - Sender(FieldBody<'a, MailboxRef>), - ReplyTo(FieldBody<'a, Vec>), - - // 3.6.3. Destination Address Fields - To(FieldBody<'a, Vec>), - Cc(FieldBody<'a, Vec>), - Bcc(FieldBody<'a, Vec>), - - // 3.6.4. Identification Fields - MessageID(FieldBody<'a, MessageId<'a>>), - InReplyTo(FieldBody<'a, Vec>>), - References(FieldBody<'a, Vec>>), - - // 3.6.5. Informational Fields - Subject(FieldBody<'a, String>), - Comments(FieldBody<'a, String>), - Keywords(FieldBody<'a, Vec>), - - // 3.6.6 Resent Fields (not implemented) - // 3.6.7 Trace Fields - Received(FieldBody<'a, &'a str>), - ReturnPath(FieldBody<'a, Option>), - - // 3.6.8. Optional Fields - Optional(&'a str, String), - - // None - Rescue(&'a str), -} - -/// Permissive Header Section -/// -/// This is a structure intended for parsing/decoding, -/// hence it's support cases where the email is considered -/// as invalid according to RFC5322 but for which we can -/// still extract some data. -#[derive(Debug, PartialEq, Default)] -pub struct HeaderSection<'a> { - // 3.6.1. The Origination Date Field - pub date: Option>, - - // 3.6.2. Originator Fields - pub from: Vec, - pub sender: Option, - pub reply_to: Vec, - - // 3.6.3. Destination Address Fields - pub to: Vec, - pub cc: Vec, - pub bcc: Vec, - - // 3.6.4. Identification Fields - pub msg_id: Option>, - pub in_reply_to: Vec>, - pub references: Vec>, - - // 3.6.5. Informational Fields - pub subject: Option, - pub comments: Vec, - pub keywords: Vec, - - // 3.6.6 Not implemented - // 3.6.7 Trace Fields - pub return_path: Vec, - pub received: Vec<&'a str>, - - // 3.6.8. Optional Fields - pub optional: HashMap<&'a str, String>, - - // Recovery - pub bad_fields: Vec>, - pub unparsed: Vec<&'a str>, -} diff --git a/src/fragments/words.rs b/src/fragments/words.rs deleted file mode 100644 index acc5584..0000000 --- a/src/fragments/words.rs +++ /dev/null @@ -1,116 +0,0 @@ -use crate::fragments::whitespace::cfws; -use nom::{ - bytes::complete::{tag, take_while1}, - combinator::{opt, recognize}, - multi::many0, - sequence::{delimited, pair}, - IResult, -}; - -/// VCHAR definition -pub fn is_vchar(c: char) -> bool { - (c >= '\x21' && c <= '\x7E') || !c.is_ascii() -} - -/// Sequence of visible chars with the UTF-8 extension -/// -/// ```abnf -/// VCHAR = %x21-7E -/// ; visible (printing) characters -/// VCHAR =/ UTF8-non-ascii -/// SEQ = 1*VCHAR -///``` -#[allow(dead_code)] -pub fn vchar_seq(input: &str) -> IResult<&str, &str> { - take_while1(is_vchar)(input) -} - -/// Atom allowed characters -fn is_atext(c: char) -> bool { - c.is_ascii_alphanumeric() || "!#$%&'*+-/=?^_`{|}~".contains(c) || !c.is_ascii() -} - -/// Atom -/// -/// `[CFWS] 1*atext [CFWS]` -pub fn atom(input: &str) -> IResult<&str, &str> { - delimited(opt(cfws), take_while1(is_atext), opt(cfws))(input) -} - -/// dot-atom-text -/// -/// `1*atext *("." 1*atext)` -pub fn dot_atom_text(input: &str) -> IResult<&str, &str> { - recognize(pair( - take_while1(is_atext), - many0(pair(tag("."), take_while1(is_atext))), - ))(input) -} - -/// dot-atom -/// -/// `[CFWS] dot-atom-text [CFWS]` -pub fn dot_atom(input: &str) -> IResult<&str, &str> { - delimited(opt(cfws), dot_atom_text, opt(cfws))(input) -} - -#[allow(dead_code)] -pub fn is_special(c: char) -> bool { - c == '(' - || c == ')' - || c == '<' - || c == '>' - || c == '[' - || c == ']' - || c == ':' - || c == ';' - || c == '@' - || c == '\\' - || c == ',' - || c == '.' - || c == '"' -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_vchar_seq() { - assert_eq!(vchar_seq("hello world"), Ok((" world", "hello"))); - assert_eq!(vchar_seq("hello👋 world"), Ok((" world", "hello👋"))); - } - - #[test] - fn test_atext() { - assert!(is_atext('=')); - assert!(is_atext('5')); - assert!(is_atext('Q')); - assert!(!is_atext(' ')); - assert!(is_atext('É')); // support utf8 - } - - #[test] - fn test_atom() { - assert_eq!( - atom("(skip) imf_codec (hidden) aerogramme"), - Ok(("aerogramme", "imf_codec")) - ); - } - - #[test] - fn test_dot_atom_text() { - assert_eq!( - dot_atom_text("quentin.dufour.io abcdef"), - Ok((" abcdef", "quentin.dufour.io")) - ); - } - - #[test] - fn test_dot_atom() { - assert_eq!( - dot_atom(" (skip) quentin.dufour.io abcdef"), - Ok(("abcdef", "quentin.dufour.io")) - ); - } -} diff --git a/src/headers.rs b/src/headers.rs new file mode 100644 index 0000000..5bf0661 --- /dev/null +++ b/src/headers.rs @@ -0,0 +1,27 @@ +use nom::{ + self, + combinator::{all_consuming, recognize}, + multi::many0, + sequence::terminated, + IResult, +}; + +use crate::text::whitespace::{foldable_line, line, obs_crlf}; + +pub fn headers(input: &[u8]) -> IResult<&[u8], Vec<&[u8]>> { + let (body, hdrs) = segment(input)?; + let (_, fields) = fields(hdrs)?; + Ok((body, fields)) +} + +// -- part 1, segment +fn segment(input: &[u8]) -> IResult<&[u8], &[u8]> { + terminated(recognize(many0(line)), obs_crlf)(input) +} + +// -- part 2, isolate fields +fn fields(input: &[u8]) -> IResult<&[u8], Vec<&[u8]>> { + let (rest, parsed) = all_consuming(many0(foldable_line))(input)?; + Ok((rest, parsed)) +} + diff --git a/src/lib.rs b/src/lib.rs index 257344f..974cd54 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ pub mod error; -pub mod fragments; -pub mod multipass; +//pub mod mime; +//pub mod message; +pub mod headers; +pub mod text; diff --git a/src/fragments/mime.rs b/src/mime/content_fields.rs similarity index 97% rename from src/fragments/mime.rs rename to src/mime/content_fields.rs index 14ece11..31ac49c 100644 --- a/src/fragments/mime.rs +++ b/src/mime/content_fields.rs @@ -292,18 +292,6 @@ pub fn version(input: &str) -> IResult<&str, Version> { Ok((rest, Version { major, minor })) } -/// Token allowed characters -fn is_token_text(c: char) -> bool { - c.is_ascii() && !c.is_ascii_control() && !c.is_ascii_whitespace() && !"()<>@,;:\\\"/[]?=".contains(c) -} - -/// Token -/// -/// `[CFWS] 1*token_text [CFWS]` -pub fn token(input: &str) -> IResult<&str, &str> { - delimited(opt(cfws), take_while1(is_token_text), opt(cfws))(input) -} - pub fn parameter(input: &str) -> IResult<&str, Parameter> { let (rest, (pname, _, pvalue)) = tuple(( token, diff --git a/src/fragments/part.rs b/src/mime/part.rs similarity index 100% rename from src/fragments/part.rs rename to src/mime/part.rs diff --git a/src/multipass/body_structure.rs b/src/old.multipass/body_structure.rs similarity index 100% rename from src/multipass/body_structure.rs rename to src/old.multipass/body_structure.rs diff --git a/src/multipass/extract_fields.rs b/src/old.multipass/extract_fields.rs similarity index 100% rename from src/multipass/extract_fields.rs rename to src/old.multipass/extract_fields.rs diff --git a/src/multipass/field_eager.rs b/src/old.multipass/field_eager.rs similarity index 100% rename from src/multipass/field_eager.rs rename to src/old.multipass/field_eager.rs diff --git a/src/multipass/field_lazy.rs b/src/old.multipass/field_lazy.rs similarity index 100% rename from src/multipass/field_lazy.rs rename to src/old.multipass/field_lazy.rs diff --git a/src/multipass/guess_charset.rs b/src/old.multipass/guess_charset.rs similarity index 100% rename from src/multipass/guess_charset.rs rename to src/old.multipass/guess_charset.rs diff --git a/src/multipass/header_section.rs b/src/old.multipass/header_section.rs similarity index 100% rename from src/multipass/header_section.rs rename to src/old.multipass/header_section.rs diff --git a/src/multipass/mod.rs b/src/old.multipass/mod.rs similarity index 100% rename from src/multipass/mod.rs rename to src/old.multipass/mod.rs diff --git a/src/multipass/segment.rs b/src/old.multipass/segment.rs similarity index 100% rename from src/multipass/segment.rs rename to src/old.multipass/segment.rs diff --git a/src/parse.rs b/src/parse.rs index 60306c1..9f5407b 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -1,8 +1,9 @@ -use imf_codec::fragments::section::Section; -use imf_codec::multipass::segment; +//use imf_codec::fragments::section::Section; +//use imf_codec::multipass::segment; use std::io; use std::io::Read; +/* fn parser<'a, F>(input: &'a [u8], func: F) -> () where F: FnOnce(&Section) -> (), @@ -15,9 +16,10 @@ where let section = field_body.section(); func(§ion.fields); -} +}*/ fn main() { + /* // Read full mail in memory let mut rawmail = Vec::new(); io::stdin().lock().read_to_end(&mut rawmail).unwrap(); @@ -30,4 +32,6 @@ fn main() { assert!(section.from.len() > 0); assert!(section.bad_fields.len() == 0); }); + */ + println!("hello world"); } diff --git a/src/fragments/address.rs b/src/rfc5322/address.rs similarity index 90% rename from src/fragments/address.rs rename to src/rfc5322/address.rs index 7bf2a43..c829ac3 100644 --- a/src/fragments/address.rs +++ b/src/rfc5322/address.rs @@ -11,9 +11,32 @@ use crate::error::IMFError; use crate::fragments::lazy; use crate::fragments::mailbox::mailbox; use crate::fragments::misc_token::phrase; -use crate::fragments::model::{AddressList, AddressRef, GroupRef, MailboxList, MailboxRef}; +//use crate::fragments::model::{AddressList, AddressRef, GroupRef, MailboxList, MailboxRef}; use crate::fragments::whitespace::cfws; +#[derive(Debug, PartialEq)] +pub struct GroupRef { + pub name: String, + pub participants: Vec, +} + +#[derive(Debug, PartialEq)] +pub enum AddressRef { + Single(MailboxRef), + Many(GroupRef), +} +impl From for AddressRef { + fn from(mx: MailboxRef) -> Self { + AddressRef::Single(mx) + } +} +impl From for AddressRef { + fn from(grp: GroupRef) -> Self { + AddressRef::Many(grp) + } +} +pub type AddressList = Vec; + impl<'a> TryFrom<&'a lazy::Mailbox<'a>> for MailboxRef { type Error = IMFError<'a>; diff --git a/src/fragments/datetime.rs b/src/rfc5322/datetime.rs similarity index 100% rename from src/fragments/datetime.rs rename to src/rfc5322/datetime.rs diff --git a/src/fragments/eager.rs b/src/rfc5322/eager.rs similarity index 100% rename from src/fragments/eager.rs rename to src/rfc5322/eager.rs diff --git a/src/fragments/identification.rs b/src/rfc5322/identification.rs similarity index 93% rename from src/fragments/identification.rs rename to src/rfc5322/identification.rs index 8ba4b89..179505e 100644 --- a/src/fragments/identification.rs +++ b/src/rfc5322/identification.rs @@ -14,6 +14,14 @@ use crate::fragments::model::{MessageId, MessageIdList}; use crate::fragments::whitespace::cfws; use crate::fragments::words::dot_atom_text; + +#[derive(Debug, PartialEq)] +pub struct MessageId<'a> { + pub left: &'a str, + pub right: &'a str, +} +pub type MessageIdList<'a> = Vec>; + impl<'a> TryFrom<&'a lazy::Identifier<'a>> for MessageId<'a> { type Error = IMFError<'a>; diff --git a/src/fragments/lazy.rs b/src/rfc5322/lazy.rs similarity index 100% rename from src/fragments/lazy.rs rename to src/rfc5322/lazy.rs diff --git a/src/fragments/mailbox.rs b/src/rfc5322/mailbox.rs similarity index 95% rename from src/fragments/mailbox.rs rename to src/rfc5322/mailbox.rs index 6860c7c..fd87309 100644 --- a/src/fragments/mailbox.rs +++ b/src/rfc5322/mailbox.rs @@ -10,11 +10,37 @@ use nom::{ use std::borrow::Cow; use crate::fragments::misc_token::{phrase, word}; -use crate::fragments::model::{AddrSpec, MailboxRef}; use crate::fragments::quoted::quoted_string; use crate::fragments::whitespace::{cfws, fws, is_obs_no_ws_ctl}; use crate::fragments::words::{atom, dot_atom}; +#[derive(Debug, PartialEq)] +pub struct AddrSpec { + pub local_part: String, + pub domain: String, +} +impl AddrSpec { + pub fn fully_qualified(&self) -> String { + format!("{}@{}", self.local_part, self.domain) + } +} + +#[derive(Debug, PartialEq)] +pub struct MailboxRef { + // The actual "email address" like hello@example.com + pub addrspec: AddrSpec, + pub name: Option, +} +impl From for MailboxRef { + fn from(addr: AddrSpec) -> Self { + MailboxRef { + name: None, + addrspec: addr, + } + } +} +pub type MailboxList = Vec; + /// Mailbox /// /// ```abnf diff --git a/src/fragments/section.rs b/src/rfc5322/message.rs similarity index 100% rename from src/fragments/section.rs rename to src/rfc5322/message.rs diff --git a/src/fragments/trace.rs b/src/rfc5322/trace.rs similarity index 100% rename from src/fragments/trace.rs rename to src/rfc5322/trace.rs diff --git a/src/text/ascii.rs b/src/text/ascii.rs new file mode 100644 index 0000000..bb5c9b4 --- /dev/null +++ b/src/text/ascii.rs @@ -0,0 +1,142 @@ +// ASCII +// -- CONTROL CHARACTERS +pub const NULL: u8 = 0x00; // NULL +pub const SOH: u8 = 0x01; // START OF HEADER +pub const STX: u8 = 0x02; // START OF TEXT +pub const ETX: u8 = 0x03; // END OF TEXT +pub const EOT: u8 = 0x04; // +pub const ANQ: u8 = 0x05; +pub const ACK: u8 = 0x06; +pub const BEL: u8 = 0x07; +pub const BS: u8 = 0x08; // BACKSPACE +pub const HT: u8 = 0x09; // horizontal tab +pub const LF: u8 = 0x0A; +pub const VT: u8 = 0x0B; +pub const FF: u8 = 0x0C; +pub const CR: u8 = 0x0D; +pub const SO: u8 = 0x0E; +pub const SI: u8 = 0x0F; +pub const DLE: u8 = 0x10; +pub const DC1: u8 = 0x11; +pub const DC2: u8 = 0x12; +pub const DC3: u8 = 0x13; +pub const DC4 : u8 = 0x14; +pub const NAK: u8 = 0x15; +pub const SYN: u8 = 0x16; +pub const ETB: u8 = 0x17; +pub const CAN: u8 = 0x18; +pub const EM: u8 = 0x19; +pub const SUB: u8 = 0x1A; +pub const ESC: u8 = 0x1B; +pub const FS: u8 = 0x1C; +pub const GS: u8 = 0x1D; +pub const RS: u8 = 0x1E; +pub const US: u8 = 0x1F; +pub const DEL: u8 = 0x7F; + +// -- GRAPHIC CHARACTERS +pub const SP: u8 = 0x20; // space +pub const EXCLAMATION: u8 = 0x21; // ! +pub const DQUOTE: u8 = 0x22; // " +pub const NUM: u8 = 0x23; // # +pub const DOLLAR: u8 = 0x24; // $ +pub const PERCENT: u8 = 0x25; // % +pub const AMPERSAND: u8 = 0x26; // & +pub const SQUOTE: u8 = 0x27; // ' +pub const LEFT_PAR: u8 = 0x28; // ( +pub const RIGHT_PAR: u8 = 0x29; // ) +pub const ASTERISK: u8 = 0x2A; // * +pub const PLUS: u8 = 0x2B; // + +pub const COMMA: u8 = 0x2C; // , +pub const MINUS: u8 = 0x2D; // - +pub const PERIOD: u8 = 0x2E; // . +pub const SLASH: u8 = 0x2F; // / +pub const N0: u8 = 0x30; // 0 +pub const N1: u8 = 0x31; // 1 +pub const N2: u8 = 0x32; // 2 +pub const N3: u8 = 0x33; // 3 +pub const N4: u8 = 0x34; // 4 +pub const N5: u8 = 0x35; // 5 +pub const N6: u8 = 0x36; // 6 +pub const N7: u8 = 0x37; // 7 +pub const N8: u8 = 0x38; // 8 +pub const N9: u8 = 0x39; // 9 +pub const COL: u8 = 0x3A; // : +pub const SEM_COL: u8 = 0x3B; // ; +pub const LT: u8 = 0x3C; // < +pub const EQ: u8 = 0x3D; // = +pub const GT: u8 = 0x3E; // > +pub const QUESTION: u8 = 0x3F; // ? +pub const AT: u8 = 0x40; // @ +pub const LCA: u8 = 0x41; // A +pub const LCB: u8 = 0x42; // B +pub const LCC: u8 = 0x43; // C +pub const LCD: u8 = 0x44; // D +pub const LCE: u8 = 0x45; // E +pub const LCF: u8 = 0x46; // F +pub const LCG: u8 = 0x47; // G +pub const LCH: u8 = 0x48; // H +pub const LCI: u8 = 0x49; // I +pub const LCJ: u8 = 0x4A; // J +pub const LCK: u8 = 0x4B; // K +pub const LCL: u8 = 0x4C; // L +pub const LCM: u8 = 0x4D; // M +pub const LCN: u8 = 0x4E; // N +pub const LCO: u8 = 0x4F; // O +pub const LCP: u8 = 0x50; // P +pub const LCQ: u8 = 0x51; // Q +pub const LCR: u8 = 0x52; // R +pub const LCS: u8 = 0x53; // S +pub const LCT: u8 = 0x54; // T +pub const LCU: u8 = 0x55; // U +pub const LCV: u8 = 0x56; // V +pub const LCW: u8 = 0x57; // W +pub const LCX: u8 = 0x58; // X +pub const LCY: u8 = 0x59; // Y +pub const LCZ: u8 = 0x5A; // Z +pub const LEFT_BRACKET: u8 = 0x5B; // [ +pub const BACKSLASH: u8 = 0x5C; // \ +pub const RIGHT_BRACKET: u8 = 0x5D; // ] +pub const CARRET: u8 = 0x5E; // ^ +pub const UNDERSCORE: u8 = 0x5F; // _ +pub const GRAVE: u8 = 0x60; // ` +pub const LSA: u8 = 0x61; // a +pub const LSB: u8 = 0x62; // b +pub const LSC: u8 = 0x63; // c +pub const LSD: u8 = 0x64; // d +pub const LSE: u8 = 0x65; // e +pub const LSF: u8 = 0x66; // f +pub const LSG: u8 = 0x67; // g +pub const LSH: u8 = 0x68; // h +pub const LSI: u8 = 0x69; // i +pub const LSJ: u8 = 0x6A; // j +pub const LSK: u8 = 0x6B; // k +pub const LSL: u8 = 0x6C; // l +pub const LSM: u8 = 0x6D; // m +pub const LSN: u8 = 0x6E; // n +pub const LSO: u8 = 0x6F; // o +pub const LSP: u8 = 0x70; // p +pub const LSQ: u8 = 0x71; // q +pub const LSR: u8 = 0x72; // r +pub const LSS: u8 = 0x73; // s +pub const LST: u8 = 0x74; // t +pub const LSU: u8 = 0x75; // u +pub const LSV: u8 = 0x76; // v +pub const LSW: u8 = 0x77; // w +pub const LSX: u8 = 0x78; // x +pub const LSY: u8 = 0x79; // y +pub const LSZ: u8 = 0x7A; // z +pub const LEFT_CURLY: u8 = 0x7B; // { +pub const PIPE: u8 = 0x7C; // | +pub const RIGHT_CURLY: u8 = 0x7D; // } +pub const TILDE: u8 = 0x7E; // ~ + +// GROUP OF CHARACTERS +// -- CRLF +pub const CRLF: &[u8] = &[CR, LF]; + +// -- WHITESPACE +pub const WS: &[u8] = &[HT, SP]; + +pub const GRAPHIC_BEGIN: u8 = SP; +pub const GRAPHIC_END: u8 = TILDE; diff --git a/src/text/buffer.rs b/src/text/buffer.rs new file mode 100644 index 0000000..bd9cbb8 --- /dev/null +++ b/src/text/buffer.rs @@ -0,0 +1,43 @@ +use encoding_rs::Encoding; + +#[derive(Debug, PartialEq, Default)] +pub struct Text<'a> { + parts: Vec<&'a [u8]>, +} + +impl<'a> Text<'a> { + pub fn push(&mut self, e: &[u8]) { + self.parts.push(e) + } + + pub fn to_string(&self) -> String { + let enc = encoding_rs::UTF_8; + let size = self.parts.iter().fold(0, |acc, v| acc + v.len()); + + self.parts.iter().fold( + String::with_capacity(size), + |mut acc, v| { + let (content, _) = enc.decode_without_bom_handling(v); + acc.push_str(content.as_ref()); + acc + }, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_text() { + let mut text = Text::default(); + text.push(b"hello"); + text.push(&[ascii::SP]); + text.push(b"world"); + assert_eq!( + text.to_string(), + "hello world".to_string(), + ); + } +} diff --git a/src/fragments/encoding.rs b/src/text/encoding.rs similarity index 50% rename from src/fragments/encoding.rs rename to src/text/encoding.rs index 5ad0dc5..84a2c0c 100644 --- a/src/fragments/encoding.rs +++ b/src/text/encoding.rs @@ -1,5 +1,3 @@ -use std::borrow::Cow; -use chardetng::EncodingDetector; use encoding_rs::Encoding; use nom::{ @@ -7,92 +5,107 @@ use nom::{ branch::alt, bytes::complete::{tag, take, take_while1, take_while}, character::complete::{one_of}, + character::is_alphanumeric, combinator::map, sequence::{preceded, terminated, tuple}, multi::many0, }; -use encoding_rs::Encoding; use base64::{Engine as _, engine::general_purpose}; -use crate::fragments::mime; +use crate::text::words; +use crate::text::ascii; -const IS_LAST_BUFFER: bool = true; -const ALLOW_UTF8: bool = true; -const NO_TLD: Option<&[u8]> = None; - -pub fn header_decode(input: &[u8]) -> Cow { - // Create detector - let mut detector = EncodingDetector::new(); - detector.feed(input, IS_LAST_BUFFER); - - // Get encoding - let enc: &Encoding = detector.guess(NO_TLD, ALLOW_UTF8); - let (header, _, _) = enc.decode(input); - header -} - -pub fn encoded_word(input: &str) -> IResult<&str, String> { +pub fn encoded_word(input: &[u8]) -> IResult<&[u8], EncodedWord> { alt((encoded_word_quoted, encoded_word_base64))(input) } -pub fn encoded_word_quoted(input: &str) -> IResult<&str, String> { +pub fn encoded_word_quoted(input: &[u8]) -> IResult<&[u8], EncodedWord> { let (rest, (_, charset, _, _, _, txt, _)) = tuple(( - tag("=?"), mime::token, + tag("=?"), words::mime_token, tag("?"), one_of("Qq"), tag("?"), ptext, tag("?=")))(input)?; - let renc = Encoding::for_label(charset.as_bytes()).unwrap_or(encoding_rs::WINDOWS_1252); - let parsed = decode_quoted_encoding(renc, txt.iter()); + let renc = Encoding::for_label(charset).unwrap_or(encoding_rs::WINDOWS_1252); + let parsed = EncodedWord::Quoted(QuotedWord { enc: renc, chunks: txt }); Ok((rest, parsed)) } -pub fn encoded_word_base64(input: &str) -> IResult<&str, String> { +pub fn encoded_word_base64(input: &[u8]) -> IResult<&[u8], EncodedWord> { let (rest, (_, charset, _, _, _, txt, _)) = tuple(( - tag("=?"), mime::token, + tag("=?"), words::mime_token, tag("?"), one_of("Bb"), tag("?"), btext, tag("?=")))(input)?; - let renc = Encoding::for_label(charset.as_bytes()).unwrap_or(encoding_rs::WINDOWS_1252); - let parsed = general_purpose::STANDARD_NO_PAD.decode(txt).map(|d| renc.decode(d.as_slice()).0.to_string()).unwrap_or("".into()); - + let renc = Encoding::for_label(charset).unwrap_or(encoding_rs::WINDOWS_1252); + let parsed = EncodedWord::Base64(Base64Word { enc: renc, content: txt }); Ok((rest, parsed)) } -fn decode_quoted_encoding<'a>(enc: &'static Encoding, q: impl Iterator>) -> String { - q.fold( - String::new(), - |mut acc, c| { - let dec = match c { - QuotedChunk::Safe(v) => Cow::Borrowed(*v), - QuotedChunk::Space => Cow::Borrowed(" "), - QuotedChunk::Encoded(v) => { - let w = &[*v]; - let (d, _, _) = enc.decode(w); - Cow::Owned(d.into_owned()) - }, - }; - acc.push_str(dec.as_ref()); - acc - }) +#[derive(PartialEq,Debug)] +pub enum EncodedWord<'a> { + Quoted(QuotedWord<'a>), + Base64(Base64Word<'a>), } +#[derive(PartialEq,Debug)] +pub struct Base64Word<'a> { + pub enc: &'static Encoding, + pub content: &'a [u8], +} + +impl<'a> Base64Word<'a> { + pub fn to_string(&self) -> String { + general_purpose::STANDARD_NO_PAD + .decode(self.content) + .map(|d| self.enc.decode(d.as_slice()).0.to_string()) + .unwrap_or("".into()) + } +} + +#[derive(PartialEq,Debug)] +pub struct QuotedWord<'a> { + pub enc: &'static Encoding, + pub chunks: Vec>, +} + +impl<'a> QuotedWord<'a> { + pub fn to_string(&self) -> String { + self.chunks.iter().fold( + String::new(), + |mut acc, c| { + match c { + QuotedChunk::Safe(v) => { + let (content, _) = encoding_rs::UTF_8.decode_without_bom_handling(v); + acc.push_str(content.as_ref()); + } + QuotedChunk::Space => acc.push(' '), + QuotedChunk::Encoded(v) => { + let w = &[*v]; + let (d, _) = self.enc.decode_without_bom_handling(w); + acc.push_str(d.as_ref()); + }, + }; + acc + }) + } +} #[derive(PartialEq,Debug)] pub enum QuotedChunk<'a> { - Safe(&'a str), + Safe(&'a [u8]), Encoded(u8), Space, } //quoted_printable -pub fn ptext(input: &str) -> IResult<&str, Vec> { +pub fn ptext(input: &[u8]) -> IResult<&[u8], Vec> { many0(alt((safe_char2, encoded_space, hex_octet)))(input) } -fn safe_char2(input: &str) -> IResult<&str, QuotedChunk> { +fn safe_char2(input: &[u8]) -> IResult<&[u8], QuotedChunk> { map(take_while1(is_safe_char2), |v| QuotedChunk::Safe(v))(input) } @@ -101,8 +114,8 @@ fn safe_char2(input: &str) -> IResult<&str, QuotedChunk> { /// 8-bit values which correspond to printable ASCII characters other /// than "=", "?", and "_" (underscore), MAY be represented as those /// characters. -fn is_safe_char2(c: char) -> bool { - c.is_ascii() && !c.is_ascii_control() && c != '_' && c != '?' && c != '=' +fn is_safe_char2(c: u8) -> bool { + c >= ascii::SP && c != ascii::UNDERSCORE && c != ascii::QUESTION && c != ascii::EQ } /* @@ -111,28 +124,30 @@ fn is_safe_char(c: char) -> bool { (c >= '\x3e' && c <= '\x7e') }*/ -fn encoded_space(input: &str) -> IResult<&str, QuotedChunk> { +fn encoded_space(input: &[u8]) -> IResult<&[u8], QuotedChunk> { map(tag("_"), |_| QuotedChunk::Space)(input) } -fn hex_octet(input: &str) -> IResult<&str, QuotedChunk> { +fn hex_octet(input: &[u8]) -> IResult<&[u8], QuotedChunk> { use nom::error::*; - let (rest, hstr) = preceded(tag("="), take(2usize))(input)?; + let (rest, hbytes) = preceded(tag("="), take(2usize))(input)?; - let parsed = u8::from_str_radix(hstr, 16) + let (hstr, _) = encoding_rs::UTF_8.decode_without_bom_handling(hbytes); + + let parsed = u8::from_str_radix(hstr.as_ref(), 16) .map_err(|_| nom::Err::Error(Error::new(input, ErrorKind::Verify)))?; Ok((rest, QuotedChunk::Encoded(parsed))) } //base64 (maybe use a crate) -pub fn btext(input: &str) -> IResult<&str, &str> { +pub fn btext(input: &[u8]) -> IResult<&[u8], &[u8]> { terminated(take_while(is_bchar), many0(tag("=")))(input) } -fn is_bchar(c: char) -> bool { - c.is_ascii_alphanumeric() || c == '+' || c == '/' +fn is_bchar(c: u8) -> bool { + is_alphanumeric(c) || c == ascii::PLUS || c == ascii::SLASH } #[cfg(test)] diff --git a/src/text/misc_token.rs b/src/text/misc_token.rs new file mode 100644 index 0000000..35869fe --- /dev/null +++ b/src/text/misc_token.rs @@ -0,0 +1,166 @@ +use nom::{ + branch::alt, + bytes::complete::take_while1, + character::complete::space0, + combinator::{into, map, opt}, + multi::{many0, many1}, + sequence::{preceded, tuple}, + IResult, +}; +use std::borrow::Cow; + +use crate::text::{ + quoted::quoted_string, + whitespace::{fws, is_obs_no_ws_ctl}, + words::{atom, is_vchar}, + encoding::{self, encoded_word}, + buffer, + ascii, +}; + +#[derive(Debug, PartialEq, Default)] +pub struct PhraseList(pub Vec); + +/* +impl<'a> TryFrom<&'a lazy::Unstructured<'a>> for Unstructured { + type Error = IMFError<'a>; + + fn try_from(input: &'a lazy::Unstructured<'a>) -> Result { + unstructured(input.0) + .map(|(_, v)| Unstructured(v)) + .map_err(|e| IMFError::Unstructured(e)) + } +} + +impl<'a> TryFrom<&'a lazy::PhraseList<'a>> for PhraseList { + type Error = IMFError<'a>; + + fn try_from(p: &'a lazy::PhraseList<'a>) -> Result { + separated_list1(tag(","), phrase)(p.0) + .map(|(_, q)| PhraseList(q)) + .map_err(|e| IMFError::PhraseList(e)) + } +}*/ + +pub enum Word<'a> { + Quoted(buffer::Text<'a>), + Encoded(encoding::EncodedWord<'a>), + Atom(&'a [u8]), +} +impl<'a> Word<'a> { + pub fn to_string(&self) -> String { + match self { + Word::Quoted(v) => v.to_string(), + Word::Encoded(v) => v.to_string(), + Word::Atom(v) => v.to_string(), + } + } +} + +/// Word +/// +/// ```abnf +/// word = atom / quoted-string +/// ``` +pub fn word(input: &[u8]) -> IResult<&[u8], Word> { + alt(( + map(quoted_string, |v| Word::Quoted(v)), + map(encoded_word, |v| Word::Encoded(v)), + map(atom, |v| Word::Atom(v)) + ))(input) +} + +pub struct Phrase<'a>(pub Vec>); +impl<'a> Phrase<'a> { + pub fn to_string(&self) -> String { + self.0.join(" ") + } +} + +/// Phrase +/// +/// ```abnf +/// phrase = 1*word / obs-phrase +/// ``` +pub fn phrase(input: &[u8]) -> IResult<&[u8], Phrase> { + let (input, phrase) = map(many1(word), |v| Phrase(v))(input)?; + Ok((input, phrase)) +} + +/// Compatible unstructured input +/// +/// ```abnf +/// obs-utext = %d0 / obs-NO-WS-CTL / VCHAR +/// ``` +fn is_unstructured(c: u8) -> bool { + is_vchar(c) || is_obs_no_ws_ctl(c) || c == ascii::NULL +} + +enum UnstrToken<'a> { + Init, + Encoded(encoding::EncodedWord<'a>), + Plain(&'a [u8]), +} +impl<'a> UnstrToken<'a> { + pub fn to_string(&self) -> String { + match self { + UnstrToken::Init => "".into(), + UnstrToken::Encoded(e) => e.to_string(), + UnstrToken::Plain(e) => encoding_rs::UTF_8.decode_without_bom_handling(e).into_owned(), + } + } +} + +pub struct Unstructured<'a>(pub Vec>); +impl<'a> Unstructured<'a> { + pub fn to_string(&self) -> String { + self.0.iter().fold( + (&UnstrToken::Init, String::new()), + |(prev_token, result), current_token| { + match (prev_token, current_token) { + (UnstrToken::Init, v) => result.push_str(v.to_string().as_ref()), + (UnstrToken::EncodedWord(_), UnstrToken::EncodedWord(v)) => result.push_str(v.to_string()).as_ref(), + (_, v) => { + result.push(' '); + result.push_str(v.to_string().as_ref()) + }, + }; + + result + } + ) + } +} + +/// Unstructured header field body +/// +/// ```abnf +/// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct +/// ``` +pub fn unstructured(input: &[u8]) -> IResult<&[u8], Unstructured> { + let (input, r) = many0(preceded(opt(fws), alt(( + map(encoded_word, |v| UnstrToken::Encoded(v)), + map(take_while1(is_unstructured), |v| UnstrToken::Plain(v)), + ))))(input)?; + + let (input, _) = space0(input)?; + Ok((input, Unstructured(r))) +} + + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_phrase() { + assert_eq!(phrase("hello world"), Ok(("", "hello world".into()))); + assert_eq!( + phrase("salut \"le\" monde"), + Ok(("", "salut le monde".into())) + ); + assert_eq!( + phrase("fin\r\n du\r\nmonde"), + Ok(("\r\nmonde", "fin du".into())) + ); + } +} diff --git a/src/text/mod.rs b/src/text/mod.rs new file mode 100644 index 0000000..6baecdb --- /dev/null +++ b/src/text/mod.rs @@ -0,0 +1,7 @@ +pub mod ascii; +pub mod encoding; +pub mod misc_token; +pub mod quoted; +pub mod whitespace; +pub mod words; +pub mod buffer; diff --git a/src/fragments/quoted.rs b/src/text/quoted.rs similarity index 55% rename from src/fragments/quoted.rs rename to src/text/quoted.rs index 261f499..78ef7a3 100644 --- a/src/fragments/quoted.rs +++ b/src/text/quoted.rs @@ -1,14 +1,16 @@ use nom::{ branch::alt, - bytes::complete::tag, - character::complete::{anychar, satisfy}, - combinator::opt, + bytes::complete::{take_while1, tag}, + character::complete::anychar, + combinator::{recognize, opt}, multi::many0, sequence::{pair, preceded}, IResult, }; -use crate::fragments::whitespace::{cfws, fws, is_obs_no_ws_ctl}; +use crate::text::whitespace::{cfws, fws, is_obs_no_ws_ctl}; +use crate::text::ascii; +use crate::text::buffer; /// Quoted pair /// @@ -16,8 +18,8 @@ use crate::fragments::whitespace::{cfws, fws, is_obs_no_ws_ctl}; /// quoted-pair = ("\" (VCHAR / WSP)) / obs-qp /// obs-qp = "\" (%d0 / obs-NO-WS-CTL / LF / CR) /// ``` -pub fn quoted_pair(input: &str) -> IResult<&str, char> { - preceded(tag("\\"), anychar)(input) +pub fn quoted_pair(input: &[u8]) -> IResult<&[u8], u8> { + preceded(tag(&[ascii::SLASH]), anychar)(input) } /// Allowed characters in quote @@ -28,11 +30,11 @@ pub fn quoted_pair(input: &str) -> IResult<&str, char> { /// %d93-126 / ; "\" or the quote character /// obs-qtext /// ``` -fn is_restr_qtext(c: char) -> bool { - c == '\x21' || (c >= '\x23' && c <= '\x5B') || (c >= '\x5D' && c <= '\x7E') +fn is_restr_qtext(c: u8) -> bool { + c == ascii::EXCLAMATION || (c >= ascii::NUM && c <= ascii::LEFT_BRACKET) || (c >= ascii::RIGHT_BRACKET && c <= ascii::TILDE) } -fn is_qtext(c: char) -> bool { +fn is_qtext(c: u8) -> bool { is_restr_qtext(c) || is_obs_no_ws_ctl(c) } @@ -41,8 +43,8 @@ fn is_qtext(c: char) -> bool { /// ```abnf /// qcontent = qtext / quoted-pair /// ``` -fn qcontent(input: &str) -> IResult<&str, char> { - alt((satisfy(is_qtext), quoted_pair))(input) +fn qcontent(input: &u8) -> IResult<&[u8], &[u8]> { + alt((take_while1(is_qtext), recognize(quoted_pair)))(input) } /// Quoted string @@ -52,7 +54,7 @@ fn qcontent(input: &str) -> IResult<&str, char> { /// DQUOTE *([FWS] qcontent) [FWS] DQUOTE /// [CFWS] /// ``` -pub fn quoted_string(input: &str) -> IResult<&str, String> { +pub fn quoted_string(input: &[u8]) -> IResult<&[u8], buffer::Text> { let (input, _) = opt(cfws)(input)?; let (input, _) = tag("\"")(input)?; let (input, content) = many0(pair(opt(fws), qcontent))(input)?; @@ -60,11 +62,11 @@ pub fn quoted_string(input: &str) -> IResult<&str, String> { // Rebuild string let mut qstring = content .iter() - .fold(String::with_capacity(16), |mut acc, (maybe_wsp, c)| { + .fold(buffer::Text::default(), |mut acc, (maybe_wsp, c)| { if let Some(wsp) = maybe_wsp { - acc.push(*wsp); + acc.push(&[ascii::SP]); } - acc.push(*c); + acc.push(c); acc }); @@ -84,13 +86,22 @@ mod tests { #[test] fn test_quoted_string() { + let mut text = buffer::Text::default(); + text.push(b"hello"); + text.push(&[ascii::DQUOTE]); + text.push(b"world"); assert_eq!( - quoted_string(" \"hello\\\"world\" "), - Ok(("", "hello\"world".to_string())) + quoted_string(b" \"hello\\\"world\" "), + Ok(("", text)) ); + + let mut text = buffer::Text::default(); + text.push(b"hello"); + text.push(&[ascii::SP]); + text.push(b"world"); assert_eq!( - quoted_string("\"hello\r\n world\""), - Ok(("", "hello world".to_string())) + quoted_string(b"\"hello\r\n world\""), + Ok(("", text)) ); } } diff --git a/src/fragments/whitespace.rs b/src/text/whitespace.rs similarity index 67% rename from src/fragments/whitespace.rs rename to src/text/whitespace.rs index 08b8a2d..28050b2 100644 --- a/src/fragments/whitespace.rs +++ b/src/text/whitespace.rs @@ -1,71 +1,68 @@ -use crate::fragments::quoted::quoted_pair; use nom::{ branch::alt, - bytes::complete::{is_not, tag}, - character::complete::{crlf, satisfy, space0, space1}, + bytes::complete::{is_not, tag, take_while1}, + character::complete::{space0, space1}, combinator::{opt, recognize}, multi::{many0, many1}, - sequence::{pair, terminated, tuple}, + sequence::{pair, tuple}, IResult, }; -use crate::fragments::encoding::encoded_word; +use crate::text::encoding::encoded_word; +use crate::text::quoted::quoted_pair; +use crate::text::ascii; /// Whitespace (space, new line, tab) content and /// delimited content (eg. comment, line, sections, etc.) -// Bytes CRLF -const CR: u8 = 0x0D; -const LF: u8 = 0x0A; -pub const CRLF: &[u8] = &[CR, LF]; +/// Obsolete/Compatible CRLF +/// +/// Theoretically, all lines must end with \r\n +/// but some mail servers like Dovecot support malformated emails, +/// for example with only \n eol. It works because +/// \r or \n is allowed nowhere else, so we also add this support. -pub fn headers(input: &[u8]) -> IResult<&[u8], &[u8]> { - terminated(recognize(many0(line)), obs_crlf)(input) +pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> { + alt((tag(ascii::CRLF), tag(&[ascii::CR]), tag(&[ascii::LF])))(input) } - -pub fn fields(input: &str) -> IResult<&str, Vec<&str>> { - all_consuming(many0(foldable_line))(input) -} - pub fn line(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> { // is_not(CRLF) is a hack, it means "is not CR or LF" // and not "is not CRLF". In other words, it continues while // it does not encounter 0x0D or 0x0A. - pair(is_not(CRLF), obs_crlf)(input) + pair(is_not(ascii::CRLF), obs_crlf)(input) } -pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> { - alt((tag(CRLF), tag(&[CR]), tag(&[LF])))(input) +/// ```abnf +/// fold_line = any *(1*(crlf WS) any) crlf +/// ``` +pub fn foldable_line(input: &[u8]) -> IResult<&[u8], &[u8]> { + recognize(tuple(( + is_not(ascii::CRLF), + many0(pair( + many1(pair(obs_crlf, space1)), + is_not(ascii::CRLF), + )), + obs_crlf, + )))(input) } - // --- whitespaces and comments // Note: WSP = SP / HTAB = %x20 / %x09 // nom::*::space0 = *WSP // nom::*::space1 = 1*WSP -/// Permissive CRLF -/// -/// Theoretically, all lines must end with \r\n -/// but some mail servers like Dovecot support malformated emails, -/// for example with only \n eol. It works because -/// \r or \n is allowed nowhere else, so we also add this support. -pub fn perm_crlf(input: &str) -> IResult<&str, &str> { - alt((crlf, tag("\r"), tag("\n")))(input) -} - /// Permissive foldable white space /// /// Folding white space are used for long headers splitted on multiple lines. /// The obsolete syntax allowes multiple lines without content; implemented for compatibility /// reasons -pub fn fws(input: &str) -> IResult<&str, char> { +pub fn fws(input: &[u8]) -> IResult<&[u8], u8> { let (input, _) = alt((recognize(many1(fold_marker)), space1))(input)?; - Ok((input, ' ')) + Ok((input, ascii::SP)) } -fn fold_marker(input: &str) -> IResult<&str, &str> { +fn fold_marker(input: &[u8]) -> IResult<&[u8], &[u8]> { let (input, _) = space0(input)?; - let (input, _) = perm_crlf(input)?; + let (input, _) = obs_crlf(input)?; space1(input) } @@ -85,17 +82,17 @@ fn fold_marker(input: &str) -> IResult<&str, &str> { /// /// CFWS = (1*([FWS] comment) [FWS]) / FWS /// ``` -pub fn cfws(input: &str) -> IResult<&str, &str> { +pub fn cfws(input: &[u8]) -> IResult<&[u8], &[u8]> { alt((recognize(comments), recognize(fws)))(input) } -pub fn comments(input: &str) -> IResult<&str, ()> { +pub fn comments(input: &[u8]) -> IResult<&[u8], ()> { let (input, _) = many1(tuple((opt(fws), comment)))(input)?; let (input, _) = opt(fws)(input)?; Ok((input, ())) } -pub fn comment(input: &str) -> IResult<&str, ()> { +pub fn comment(input: &[u8]) -> IResult<&[u8], ()> { let (input, _) = tag("(")(input)?; let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?; let (input, _) = opt(fws)(input)?; @@ -103,12 +100,16 @@ pub fn comment(input: &str) -> IResult<&str, ()> { Ok((input, ())) } -pub fn ccontent(input: &str) -> IResult<&str, &str> { - alt((recognize(ctext), recognize(quoted_pair), recognize(encoded_word), recognize(comment)))(input) +pub fn ccontent(input: &[u8]) -> IResult<&[u8], &[u8]> { + alt((ctext, recognize(quoted_pair), recognize(encoded_word), recognize(comment)))(input) } -pub fn ctext(input: &str) -> IResult<&str, char> { - satisfy(is_ctext)(input) +pub fn ctext(input: &[u8]) -> IResult<&[u8], &[u8]> { + take_while1(is_ctext)(input) +} + +pub fn is_ctext(c: u8) -> bool { + is_restr_ctext(c) || is_obs_no_ws_ctl(c) } /// Check if it's a comment text character @@ -119,15 +120,10 @@ pub fn ctext(input: &str) -> IResult<&str, char> { /// %d93-126 / ; "(", ")", or "\" /// obs-ctext ///``` -pub fn is_restr_ctext(c: char) -> bool { - (c >= '\x21' && c <= '\x27') - || (c >= '\x2A' && c <= '\x5B') - || (c >= '\x5D' && c <= '\x7E') - || !c.is_ascii() -} - -pub fn is_ctext(c: char) -> bool { - is_restr_ctext(c) || is_obs_no_ws_ctl(c) +pub fn is_restr_ctext(c: u8) -> bool { + (c >= ascii::EXCLAMATION && c <= ascii::SQUOTE) + || (c >= ascii::ASTERISK && c <= ascii::LEFT_BRACKET) + || (c >= ascii::RIGHT_BRACKET && c <= ascii::TILDE) } /// US ASCII control characters without effect @@ -139,12 +135,12 @@ pub fn is_ctext(c: char) -> bool { /// %d14-31 / ; return, line feed, and /// %d127 ; white space characters /// ``` -pub fn is_obs_no_ws_ctl(c: char) -> bool { - (c >= '\x01' && c <= '\x08') - || c == '\x0b' - || c == '\x0b' - || (c >= '\x0e' && c <= '\x1f') - || c == '\x7F' +pub fn is_obs_no_ws_ctl(c: u8) -> bool { + (c >= ascii::SOH && c <= ascii::BS) + || c == ascii::VT + || c == ascii::FF + || (c >= ascii::SO && c <= ascii::US) + || c == ascii::DEL } #[cfg(test)] @@ -152,10 +148,10 @@ mod tests { use super::*; #[test] - fn test_perm_crlf() { - assert_eq!(perm_crlf("\rworld"), Ok(("world", "\r"))); - assert_eq!(perm_crlf("\r\nworld"), Ok(("world", "\r\n"))); - assert_eq!(perm_crlf("\nworld"), Ok(("world", "\n"))); + fn test_obs_crlf() { + assert_eq!(obs_crlf("\rworld"), Ok(("world", "\r"))); + assert_eq!(obs_crlf("\r\nworld"), Ok(("world", "\r\n"))); + assert_eq!(obs_crlf("\nworld"), Ok(("world", "\n"))); } #[test] diff --git a/src/text/words.rs b/src/text/words.rs new file mode 100644 index 0000000..6a50d7a --- /dev/null +++ b/src/text/words.rs @@ -0,0 +1,133 @@ +use crate::text::whitespace::cfws; +use crate::text::ascii; +use nom::{ + bytes::complete::{tag, take_while1}, + character::is_alphanumeric, + combinator::{opt, recognize}, + multi::many0, + sequence::{delimited, pair}, + IResult, +}; + +pub fn is_vchar(c: u8) -> bool { + c >= ascii::EXCLAMATION && c <= ascii::TILDE +} + +/// MIME Token allowed characters +/// +/// forbidden: ()<>@,;:\"/[]?= +fn is_mime_token_text(c: u8) -> bool { + is_alphanumeric(c) + || c == ascii::EXCLAMATION + || c == ascii::NUM + || c == ascii::DOLLAR + || c == ascii::PERCENT + || c == ascii::AMPERSAND + || c == ascii::SQUOTE + || c == ascii::ASTERISK + || c == ascii::PLUS + || c == ascii::MINUS + || c == ascii::PERIOD + || c == ascii::CARRET + || c == ascii::UNDERSCORE + || c == ascii::GRAVE + || c == ascii::LEFT_CURLY + || c == ascii::PIPE + || c == ascii::RIGHT_CURLY + || c == ascii::TILDE +} + +/// MIME Token +/// +/// `[CFWS] 1*token_text [CFWS]` +pub fn mime_token(input: &[u8]) -> IResult<&[u8], &[u8]> { + delimited(opt(cfws), take_while1(is_mime_token_text), opt(cfws))(input) +} + +/// Atom allowed characters +/// +/// authorized: !#$%&'*+-/=?^_`{|}~ +fn is_atext(c: u8) -> bool { + is_alphanumeric(c) + || c == ascii::EXCLAMATION + || c == ascii::NUM + || c == ascii::DOLLAR + || c == ascii::PERCENT + || c == ascii::AMPERSAND + || c == ascii::SQUOTE + || c == ascii::ASTERISK + || c == ascii::PLUS + || c == ascii::MINUS + || c == ascii::SLASH + || c == ascii::EQ + || c == ascii::QUESTION + || c == ascii::CARRET + || c == ascii::UNDERSCORE + || c == ascii::GRAVE + || c == ascii::LEFT_CURLY + || c == ascii::PIPE + || c == ascii::RIGHT_CURLY + || c == ascii::TILDE +} + +/// Atom +/// +/// `[CFWS] 1*atext [CFWS]` +pub fn atom(input: &[u8]) -> IResult<&[u8], &[u8]> { + delimited(opt(cfws), take_while1(is_atext), opt(cfws))(input) +} + +/// dot-atom-text +/// +/// `1*atext *("." 1*atext)` +pub fn dot_atom_text(input: &[u8]) -> IResult<&[u8], &[u8]> { + recognize(pair( + take_while1(is_atext), + many0(pair(tag("."), take_while1(is_atext))), + ))(input) +} + +/// dot-atom +/// +/// `[CFWS] dot-atom-text [CFWS]` +pub fn dot_atom(input: &[u8]) -> IResult<&[u8], &[u8]> { + delimited(opt(cfws), dot_atom_text, opt(cfws))(input) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_atext() { + assert!(is_atext('=' as u8)); + assert!(is_atext('5' as u8)); + assert!(is_atext('Q' as u8)); + assert!(!is_atext(' ' as u8)); + //assert!(is_atext('É')); // support utf8 + } + + #[test] + fn test_atom() { + assert_eq!( + atom(b"(skip) imf_codec (hidden) aerogramme"), + Ok((&b"aerogramme"[..], &b"imf_codec"[..])) + ); + } + + #[test] + fn test_dot_atom_text() { + assert_eq!( + dot_atom_text("quentin.dufour.io abcdef"), + Ok((" abcdef", "quentin.dufour.io")) + ); + } + + #[test] + fn test_dot_atom() { + assert_eq!( + dot_atom(" (skip) quentin.dufour.io abcdef"), + Ok(("abcdef", "quentin.dufour.io")) + ); + } +} diff --git a/tests/enron.rs b/tests/enron.rs deleted file mode 100644 index 8020bd9..0000000 --- a/tests/enron.rs +++ /dev/null @@ -1,129 +0,0 @@ -use imf_codec::fragments::section; -use imf_codec::multipass; -use std::collections::HashSet; -use std::fs::File; -use std::io::Read; -use std::path::PathBuf; -use walkdir::WalkDir; - -fn parser<'a, F>(input: &'a [u8], func: F) -> () -where - F: FnOnce(§ion::Section) -> (), -{ - let seg = multipass::segment::new(input).unwrap(); - let charset = seg.charset(); - let fields = charset.fields().unwrap(); - let field_names = fields.names(); - let field_body = field_names.body(); - let section = field_body.section(); - - func(§ion.fields); -} - -#[test] -#[ignore] -fn test_enron500k() { - let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - d.push("resources/enron/maildir/"); - let prefix_sz = d.as_path().to_str().unwrap().len(); - //d.push("williams-w3/"); - - let known_bad_fields = HashSet::from([ - "white-s/calendar/113.", // To: east <7..> - "skilling-j/inbox/223.", // From: pep - "jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com> - "jones-t/notes_inbox/3303.", // To: <"tibor.vizkelety":@enron.com> - "lokey-t/calendar/33.", // A second Date entry for the calendar containing - // Date: Monday, March 12 - "zipper-a/inbox/199.", // To: e-mail - "dasovich-j/deleted_items/128.", // To: f62489 - "dasovich-j/all_documents/677.", // To: w/assts - "dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com> - "dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com> - "dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com> - "dasovich-j/all_documents/578.", // To: w/assts - "dasovich-j/all_documents/3148.", // To: <"economist.com.readers":@enron.com> - "dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com> - "dasovich-j/risk_analytics/3.", // To: w/assts - "dasovich-j/notes_inbox/5391.", // To: <"ft.com.users":@enron.com> - "dasovich-j/notes_inbox/4952.", // To: <"economist.com.reader":@enron.com> - "dasovich-j/notes_inbox/2386.", // To: <"ft.com.users":@enron.com> - "dasovich-j/notes_inbox/1706.", // To: <"ft.com.users":@enron.com> - "dasovich-j/notes_inbox/1489.", // To: <"economist.com.readers":@enron.com> - "dasovich-j/notes_inbox/5.", // To: w/assts - "kaminski-v/sites/19.", // To: <"the.desk":@enron.com> - "kaminski-v/sites/1.", // To: <"the.desk":@enron.com> - "kaminski-v/discussion_threads/5082.", // To: <"ft.com.users":@enron.com> - "kaminski-v/discussion_threads/4046.", // To: <"the.desk":@enron.com> - "kaminski-v/discussion_threads/4187.", // To: <"the.desk":@enron.com> - "kaminski-v/discussion_threads/8068.", // To: cats , risk , leaders - "kaminski-v/discussion_threads/7980.", // To: dogs , cats , risk ,\r\n\tleaders - "kaminski-v/all_documents/5970.", //To: dogs , cats , risk ,\r\n\tleaders - "kaminski-v/all_documents/5838.", // To + Cc: dogs , breakthrough.adm@enron.com, breakthrough.adm@enron.com,\r\n\tbreakthrough.adm@enron.com - "kaminski-v/all_documents/10070.", // To: <"ft.com.users":@enron.com> - "kaminski-v/all_documents/92.", // To: <"the.desk":@enron.com> - "kaminski-v/all_documents/276.", // To: <"the.desk":@enron.com> - "kaminski-v/technical/1.", // To: <"the.desk":@enron.com> - "kaminski-v/technical/7.", // To: <"the.desk":@enron.com> - "kaminski-v/notes_inbox/140.", // To: dogs , cats , risk ,\r\n\tleaders - "kaminski-v/notes_inbox/95.", // To + CC failed: cats , risk , leaders - "kean-s/archiving/untitled/1232.", // To: w/assts , mark.palmer@enron.com, karen.denne@enron.com - "kean-s/archiving/untitled/1688.", // To: w/assts - "kean-s/sent/198.", // To: w/assts , mark.palmer@enron.com, karen.denne@enron.com - "kean-s/reg_risk/9.", // To: w/assts - "kean-s/discussion_threads/950.", // To: w/assts , mark.palmer@enron.com, karen.denne@enron.com - "kean-s/discussion_threads/577.", // To: w/assts - "kean-s/calendar/untitled/1096.", // To: w/assts , mark.palmer@enron.com, karen.denne@enron.com - "kean-s/calendar/untitled/640.", // To: w/assts - "kean-s/all_documents/640.", // To: w/assts - "kean-s/all_documents/1095.", // To: w/assts - "kean-s/attachments/2030.", // To: w/assts - "williams-w3/operations_committee_isas/10.", // To: z34655 - ]); - - let known_bad_from = HashSet::from([ - "skilling-j/inbox/223.", // From: pep - ]); - - let mut i = 0; - for entry in WalkDir::new(d.as_path()) - .into_iter() - .filter_map(|file| file.ok()) - { - if entry.metadata().unwrap().is_file() { - let mail_path = entry.path(); - let suffix = &mail_path.to_str().unwrap()[prefix_sz..]; - - // read file - let mut raw = Vec::new(); - let mut f = File::open(mail_path).unwrap(); - f.read_to_end(&mut raw).unwrap(); - - // parse - parser(&raw, |hdrs| { - let ok_date = hdrs.date.is_some(); - let ok_from = hdrs.from.len() > 0; - let ok_fields = hdrs.bad_fields.len() == 0; - - if !ok_date || !ok_from || !ok_fields { - println!("Issue with: {}", suffix); - } - - assert!(ok_date); - - if !known_bad_from.contains(suffix) { - assert!(ok_from); - } - - if !known_bad_fields.contains(suffix) { - assert!(ok_fields); - } - - i += 1; - if i % 1000 == 0 { - println!("Analyzed emails: {}", i); - } - }) - } - } -} diff --git a/tests/known.rs b/tests/known.rs deleted file mode 100644 index 3cd756d..0000000 --- a/tests/known.rs +++ /dev/null @@ -1,340 +0,0 @@ -use chrono::{FixedOffset, TimeZone}; -use imf_codec::fragments::{misc_token, model, section, part, trace}; -use imf_codec::multipass; -use std::collections::HashMap; - -fn parser<'a, F>(input: &'a [u8], func: F) -> () -where - F: FnOnce(§ion::Section) -> (), -{ - let seg = multipass::segment::new(input).unwrap(); - let charset = seg.charset(); - let fields = charset.fields().unwrap(); - let field_names = fields.names(); - let field_body = field_names.body(); - let section = field_body.section(); - - func(§ion.fields); -} - -#[test] -fn test_headers() { - let fullmail: &[u8] = r#"Return-Path: -Delivered-To: quentin@example.com -Received: from smtp.example.com ([10.83.2.2]) - by doradille with LMTP - id xyzabcd - (envelope-from ) - for ; Tue, 13 Jun 2023 19:01:08 +0000 -Date: Tue, 13 Jun 2023 10:01:10 +0200 -From: Mary Smith - , "A\lan" -Sender: imf@example.com -Reply-To: "Mary Smith: Personal Account" -To: John Doe -Cc: imf2@example.com -Bcc: (hidden) -Subject: Re: Saying Hello -Comments: A simple message -Comments: Not that complicated -comments : not valid header name but should be accepted - by the parser. -Keywords: hello, world -Héron: Raté - Raté raté -Keywords: salut, le, monde -Not a real header but should still recover -Message-ID: <3456@example.net> -In-Reply-To: <1234@local.machine.example> -References: <1234@local.machine.example> -Unknown: unknown - -This is a reply to your hello. -"# - .as_bytes(); - parser(fullmail, |parsed_section| { - assert_eq!( - parsed_section, - §ion::Section { - date: Some( - &FixedOffset::east_opt(2 * 3600) - .unwrap() - .with_ymd_and_hms(2023, 06, 13, 10, 01, 10) - .unwrap() - ), - - from: vec![ - &model::MailboxRef { - name: Some("Mary Smith".into()), - addrspec: model::AddrSpec { - local_part: "mary".into(), - domain: "example.net".into(), - } - }, - &model::MailboxRef { - name: Some("Alan".into()), - addrspec: model::AddrSpec { - local_part: "alan".into(), - domain: "example".into(), - } - } - ], - - sender: Some(&model::MailboxRef { - name: None, - addrspec: model::AddrSpec { - local_part: "imf".into(), - domain: "example.com".into(), - } - }), - - reply_to: vec![&model::AddressRef::Single(model::MailboxRef { - name: Some("Mary Smith: Personal Account".into()), - addrspec: model::AddrSpec { - local_part: "smith".into(), - domain: "home.example".into(), - } - })], - - to: vec![&model::AddressRef::Single(model::MailboxRef { - name: Some("John Doe".into()), - addrspec: model::AddrSpec { - local_part: "jdoe".into(), - domain: "machine.example".into(), - } - })], - - cc: vec![&model::AddressRef::Single(model::MailboxRef { - name: None, - addrspec: model::AddrSpec { - local_part: "imf2".into(), - domain: "example.com".into(), - } - })], - - bcc: vec![], - - msg_id: Some(&model::MessageId { - left: "3456", - right: "example.net" - }), - in_reply_to: vec![&model::MessageId { - left: "1234", - right: "local.machine.example" - }], - references: vec![&model::MessageId { - left: "1234", - right: "local.machine.example" - }], - - subject: Some(&misc_token::Unstructured("Re: Saying Hello".into())), - - comments: vec![ - &misc_token::Unstructured("A simple message".into()), - &misc_token::Unstructured("Not that complicated".into()), - &misc_token::Unstructured( - "not valid header name but should be accepted by the parser.".into() - ), - ], - - keywords: vec![ - &misc_token::PhraseList(vec!["hello".into(), "world".into(),]), - &misc_token::PhraseList(vec!["salut".into(), "le".into(), "monde".into(),]), - ], - - received: vec![&trace::ReceivedLog( - r#"from smtp.example.com ([10.83.2.2]) - by doradille with LMTP - id xyzabcd - (envelope-from ) - for "# - )], - - return_path: vec![&model::MailboxRef { - name: None, - addrspec: model::AddrSpec { - local_part: "gitlab".into(), - domain: "example.com".into(), - } - }], - - optional: HashMap::from([ - ( - "Delivered-To", - &misc_token::Unstructured("quentin@example.com".into()) - ), - ("Unknown", &misc_token::Unstructured("unknown".into())), - ]), - - bad_fields: vec![], - - unparsed: vec![ - "Héron: Raté\n Raté raté\n", - "Not a real header but should still recover\n", - ], - ..section::Section::default() - } - ) - }) -} - -#[test] -fn test_headers_mime() { - use imf_codec::fragments::mime; - let fullmail: &[u8] = r#"From: =?US-ASCII?Q?Keith_Moore?= -To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= -CC: =?ISO-8859-1?Q?Andr=E9?= Pirard -Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?= - =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?= -MIME-Version: 1.0 -Content-Type: text/plain; charset=ISO-8859-1 -Content-Transfer-Encoding: quoted-printable -Content-ID: -Content-Description: hello - -Now's the time = -for all folk to come= - to the aid of their country. -"# - .as_bytes(); - - parser(fullmail, |parsed_section| { - assert_eq!( - parsed_section, - §ion::Section { - from: vec![ - &model::MailboxRef { - name: Some("Keith Moore".into()), - addrspec: model::AddrSpec { - local_part: "moore".into(), - domain: "cs.utk.edu".into(), - } - }, - ], - - to: vec![&model::AddressRef::Single(model::MailboxRef { - name: Some("Keld Jørn Simonsen".into()), - addrspec: model::AddrSpec { - local_part: "keld".into(), - domain: "dkuug.dk".into(), - } - })], - - cc: vec![&model::AddressRef::Single(model::MailboxRef { - name: Some("André Pirard".into()), - addrspec: model::AddrSpec { - local_part: "PIRARD".into(), - domain: "vm1.ulg.ac.be".into(), - } - })], - - subject: Some(&misc_token::Unstructured("If you can read this you understand the example.".into())), - mime_version: Some(&mime::Version{ major: 1, minor: 0 }), - mime: section::MIMESection { - content_type: Some(&mime::Type::Text(mime::TextDesc { - charset: Some(mime::EmailCharset::ISO_8859_1), - subtype: mime::TextSubtype::Plain, - unknown_parameters: vec![] - })), - content_transfer_encoding: Some(&mime::Mechanism::QuotedPrintable), - content_id: Some(&model::MessageId { - left: "a", - right: "example.com" - }), - content_description: Some(&misc_token::Unstructured("hello".into())), - ..section::MIMESection::default() - }, - ..section::Section::default() - } - ); - }) -} - -fn parser_bodystruct<'a, F>(input: &'a [u8], func: F) -> () -where - F: FnOnce(&part::PartNode) -> (), -{ - let seg = multipass::segment::new(input).unwrap(); - let charset = seg.charset(); - let fields = charset.fields().unwrap(); - let field_names = fields.names(); - let field_body = field_names.body(); - let section = field_body.section(); - let bodystruct = section.body_structure(); - - func(&bodystruct.body); -} - -#[test] -fn test_multipart() { - let fullmail: &[u8] = r#"Date: Sat, 8 Jul 2023 07:14:29 +0200 -From: Grrrnd Zero -To: John Doe -Subject: Re: Saying Hello -Message-ID: -MIME-Version: 1.0 -Content-Type: multipart/alternative; - boundary="b1_e376dc71bafc953c0b0fdeb9983a9956" -Content-Transfer-Encoding: 7bit - -This is a multi-part message in MIME format. - ---b1_e376dc71bafc953c0b0fdeb9983a9956 -Content-Type: text/plain; charset=utf-8 -Content-Transfer-Encoding: quoted-printable - -GZ -OoOoO -oOoOoOoOo -oOoOoOoOoOoOoOoOo -oOoOoOoOoOoOoOoOoOoOoOo -oOoOoOoOoOoOoOoOoOoOoOoOoOoOo -OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO - ---b1_e376dc71bafc953c0b0fdeb9983a9956 -Content-Type: text/html; charset=us-ascii - -
GZ
-OoOoO
-oOoOoOoOo
-oOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOoOoOoOo
-OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO
- ---b1_e376dc71bafc953c0b0fdeb9983a9956-- -"#.as_bytes(); - - parser_bodystruct(fullmail, |part| { - assert_eq!(part, &part::PartNode::Composite( - part::PartHeader { - ..part::PartHeader::default() - }, - vec![ - part::PartNode::Discrete( - part::PartHeader { - ..part::PartHeader::default() - }, - r#"GZ -OoOoO -oOoOoOoOo -oOoOoOoOoOoOoOoOo -oOoOoOoOoOoOoOoOoOoOoOo -oOoOoOoOoOoOoOoOoOoOoOoOoOoOo -OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO"#.as_bytes() - ), - part::PartNode::Discrete( - part::PartHeader { - ..part::PartHeader::default() - }, - r#"
GZ
-OoOoO
-oOoOoOoOo
-oOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOoOoOoOo
-OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO
"#.as_bytes() - ), - ])); - }); -}