wip refactor

This commit is contained in:
Quentin 2023-07-18 23:25:10 +02:00
parent 23c663b943
commit a503eb1de6
Signed by: quentin
GPG key ID: E9602264D639FF68
35 changed files with 746 additions and 1038 deletions

View file

@ -1,129 +0,0 @@
use nom::{
branch::alt,
bytes::complete::{tag, take_while1},
character::complete::space0,
combinator::{into, map, opt},
multi::{many0, many1, separated_list1},
sequence::tuple,
IResult,
};
use std::borrow::Cow;
use crate::error::IMFError;
use crate::fragments::lazy;
use crate::fragments::quoted::quoted_string;
use crate::fragments::whitespace::{fws, is_obs_no_ws_ctl};
use crate::fragments::words::{atom, is_vchar};
use crate::fragments::encoding::encoded_word;
/// Decoded body of an unstructured header field, kept as one owned string.
#[derive(Debug, PartialEq, Default)]
pub struct Unstructured(pub String);
/// Decoded list of phrases, one owned string per phrase.
#[derive(Debug, PartialEq, Default)]
pub struct PhraseList(pub Vec<String>);
/// Eager decoding of a lazily captured unstructured field body.
///
/// The unparsed remainder reported by the parser is discarded; a parser
/// failure is wrapped in `IMFError::Unstructured`.
impl<'a> TryFrom<&'a lazy::Unstructured<'a>> for Unstructured {
    type Error = IMFError<'a>;

    fn try_from(input: &'a lazy::Unstructured<'a>) -> Result<Self, Self::Error> {
        match unstructured(input.0) {
            Ok((_rest, body)) => Ok(Unstructured(body)),
            Err(err) => Err(IMFError::Unstructured(err)),
        }
    }
}
/// Eager decoding of a lazily captured, comma-separated list of phrases.
///
/// At least one phrase is required. The unparsed remainder is discarded and
/// a parser failure is wrapped in `IMFError::PhraseList`.
impl<'a> TryFrom<&'a lazy::PhraseList<'a>> for PhraseList {
    type Error = IMFError<'a>;

    fn try_from(p: &'a lazy::PhraseList<'a>) -> Result<Self, Self::Error> {
        match separated_list1(tag(","), phrase)(p.0) {
            Ok((_rest, phrases)) => Ok(PhraseList(phrases)),
            Err(err) => Err(IMFError::PhraseList(err)),
        }
    }
}
/// Word
///
/// ```abnf
/// word = atom / quoted-string
/// ```
///
/// In addition to the grammar above, this parser also accepts an
/// encoded word. Alternatives are tried in this order: quoted string,
/// encoded word, atom. Every result is converted into a `Cow<str>`.
pub fn word(input: &str) -> IResult<&str, Cow<str>> {
alt((into(quoted_string), into(encoded_word), into(atom)))(input)
}
/// Phrase
///
/// ```abnf
/// phrase = 1*word / obs-phrase
/// ```
pub fn phrase(input: &str) -> IResult<&str, String> {
let (input, words) = many1(word)(input)?;
let phrase = words.join(" ");
Ok((input, phrase))
}
/// Compatible unstructured input
///
/// ```abnf
/// obs-utext = %d0 / obs-NO-WS-CTL / VCHAR
/// ```
///
/// Returns `true` when `c` is a visible character, an obsolete
/// non-whitespace control character, or NUL.
fn is_unstructured(c: char) -> bool {
is_vchar(c) || is_obs_no_ws_ctl(c) || c == '\x00'
}
/// Kind of token met while decoding an unstructured field body.
enum UnstrToken {
// Start marker: no token has been processed yet
Init,
// The token was an encoded word
Encoded,
// The token was a run of plain unstructured characters
Plain,
}
/// Unstructured header field body
///
/// ```abnf
/// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct
/// ```
///
/// Tokens are either encoded words or runs of unstructured characters,
/// each optionally preceded by folding white space. Trailing spaces are
/// consumed. The returned string is rebuilt from the tokens: adjacent
/// encoded words are concatenated directly, other tokens that were
/// preceded by white space are joined with a single space.
pub fn unstructured(input: &str) -> IResult<&str, String> {
// Each element is (optional leading FWS, (content, token kind))
let (input, r) = many0(tuple((opt(fws), alt((
map(encoded_word, |v| (Cow::Owned(v), UnstrToken::Encoded)),
map(take_while1(is_unstructured), |v| (Cow::Borrowed(v), UnstrToken::Plain)),
)))))(input)?;
let (input, _) = space0(input)?;
// Try to optimize for the most common cases
let body = match r.as_slice() {
// Optimization when there is only one line
// A single token without leading white space, or a single encoded word, is returned as is
[(None, (content, _))] | [(_, (content, UnstrToken::Encoded))] => content.to_string(),
// A single plain token preceded by white space keeps one leading space
[(Some(_), (content, _))] => " ".to_string() + content,
// Generic case, with multiple lines
lines => lines.iter().fold(
(&UnstrToken::Init, String::with_capacity(255)),
|(prev_token, result), (may_ws, (content, current_token))| {
let new_res = match (may_ws, prev_token, current_token) {
// No separator between two encoded words, or when no white space preceded the token
(_, UnstrToken::Encoded, UnstrToken::Encoded) | (None, _, _) => result + content,
_ => result + " " + content,
};
(current_token, new_res)
}).1,
};
Ok((input, body))
}
#[cfg(test)]
mod tests {
use super::*;
// Checks that a phrase is rebuilt with single spaces between words,
// that quotes are removed, and that parsing stops at an unfolded line break.
#[test]
fn test_phrase() {
assert_eq!(phrase("hello world"), Ok(("", "hello world".into())));
assert_eq!(
phrase("salut \"le\" monde"),
Ok(("", "salut le monde".into()))
);
assert_eq!(
phrase("fin\r\n du\r\nmonde"),
Ok(("\r\nmonde", "fin du".into()))
);
}
}

View file

@ -1,23 +0,0 @@
// Model
pub mod model;
// Generic
pub mod misc_token;
mod quoted;
pub mod whitespace;
mod words;
// Header specific
mod address;
mod datetime;
pub mod eager;
mod identification;
pub mod lazy;
mod mailbox;
pub mod section;
pub mod trace;
// MIME related
pub mod mime;
pub mod encoding;
pub mod part;

View file

@ -1,146 +0,0 @@
use chrono::{DateTime, FixedOffset};
use std::collections::HashMap;
/// An address specification made of a local part and a domain.
#[derive(Debug, PartialEq)]
pub struct AddrSpec {
    pub local_part: String,
    pub domain: String,
}

impl AddrSpec {
    /// Returns the address rendered as `local_part@domain`.
    pub fn fully_qualified(&self) -> String {
        let mut address = String::with_capacity(self.local_part.len() + 1 + self.domain.len());
        address.push_str(&self.local_part);
        address.push('@');
        address.push_str(&self.domain);
        address
    }
}
/// A mailbox: an address specification with an optional display name.
#[derive(Debug, PartialEq)]
pub struct MailboxRef {
    // The actual "email address" like hello@example.com
    pub addrspec: AddrSpec,
    pub name: Option<String>,
}

/// A bare address specification becomes a mailbox without a name.
impl From<AddrSpec> for MailboxRef {
    fn from(addr: AddrSpec) -> Self {
        Self {
            addrspec: addr,
            name: None,
        }
    }
}

/// A list of mailboxes.
pub type MailboxList = Vec<MailboxRef>;
/// A named group of mailboxes.
#[derive(Debug, PartialEq)]
pub struct GroupRef {
// Name of the group
pub name: String,
// Mailboxes that belong to the group
pub participants: Vec<MailboxRef>,
}
/// An address: either a single mailbox or a group of mailboxes.
#[derive(Debug, PartialEq)]
pub enum AddressRef {
    Single(MailboxRef),
    Many(GroupRef),
}

/// A mailbox converts to the `Single` variant.
impl From<MailboxRef> for AddressRef {
    fn from(mx: MailboxRef) -> Self {
        Self::Single(mx)
    }
}

/// A group converts to the `Many` variant.
impl From<GroupRef> for AddressRef {
    fn from(grp: GroupRef) -> Self {
        Self::Many(grp)
    }
}

/// A list of addresses.
pub type AddressList = Vec<AddressRef>;
/// A message identifier stored as two borrowed string slices.
// NOTE(review): presumably `left` and `right` are the parts on each side of the "@" - confirm against the parser
#[derive(Debug, PartialEq)]
pub struct MessageId<'a> {
pub left: &'a str,
pub right: &'a str,
}
/// A list of message identifiers.
pub type MessageIdList<'a> = Vec<MessageId<'a>>;
/// Body of a header field: either a value of type `T` or a borrowed string.
// NOTE(review): `Failed` presumably keeps the raw text that could not be parsed - confirm against callers
#[derive(Debug, PartialEq)]
pub enum FieldBody<'a, T> {
Correct(T),
Failed(&'a str),
}
/// One header field.
///
/// Known fields wrap their value in a `FieldBody`. The comments below
/// group the variants by section number (3.6.x).
#[derive(Debug, PartialEq)]
pub enum Field<'a> {
// 3.6.1. The Origination Date Field
Date(FieldBody<'a, Option<DateTime<FixedOffset>>>),
// 3.6.2. Originator Fields
From(FieldBody<'a, Vec<MailboxRef>>),
Sender(FieldBody<'a, MailboxRef>),
ReplyTo(FieldBody<'a, Vec<AddressRef>>),
// 3.6.3. Destination Address Fields
To(FieldBody<'a, Vec<AddressRef>>),
Cc(FieldBody<'a, Vec<AddressRef>>),
Bcc(FieldBody<'a, Vec<AddressRef>>),
// 3.6.4. Identification Fields
MessageID(FieldBody<'a, MessageId<'a>>),
InReplyTo(FieldBody<'a, Vec<MessageId<'a>>>),
References(FieldBody<'a, Vec<MessageId<'a>>>),
// 3.6.5. Informational Fields
Subject(FieldBody<'a, String>),
Comments(FieldBody<'a, String>),
Keywords(FieldBody<'a, Vec<String>>),
// 3.6.6 Resent Fields (not implemented)
// 3.6.7 Trace Fields
Received(FieldBody<'a, &'a str>),
ReturnPath(FieldBody<'a, Option<MailboxRef>>),
// 3.6.8. Optional Fields
// NOTE(review): presumably (field name, decoded body) - confirm against the parser
Optional(&'a str, String),
// None
// NOTE(review): presumably a raw line that matched no field syntax - confirm
Rescue(&'a str),
}
/// Permissive Header Section
///
/// This structure is intended for parsing/decoding,
/// hence it supports cases where the email is considered
/// invalid according to RFC5322 but from which we can
/// still extract some data.
#[derive(Debug, PartialEq, Default)]
pub struct HeaderSection<'a> {
// 3.6.1. The Origination Date Field
pub date: Option<DateTime<FixedOffset>>,
// 3.6.2. Originator Fields
pub from: Vec<MailboxRef>,
pub sender: Option<MailboxRef>,
pub reply_to: Vec<AddressRef>,
// 3.6.3. Destination Address Fields
pub to: Vec<AddressRef>,
pub cc: Vec<AddressRef>,
pub bcc: Vec<AddressRef>,
// 3.6.4. Identification Fields
pub msg_id: Option<MessageId<'a>>,
pub in_reply_to: Vec<MessageId<'a>>,
pub references: Vec<MessageId<'a>>,
// 3.6.5. Informational Fields
pub subject: Option<String>,
pub comments: Vec<String>,
pub keywords: Vec<String>,
// 3.6.6 Not implemented
// 3.6.7 Trace Fields
pub return_path: Vec<MailboxRef>,
pub received: Vec<&'a str>,
// 3.6.8. Optional Fields
pub optional: HashMap<&'a str, String>,
// Recovery
// NOTE(review): presumably fields whose body failed to parse, and raw unparsed lines - confirm
pub bad_fields: Vec<Field<'a>>,
pub unparsed: Vec<&'a str>,
}

View file

@ -1,116 +0,0 @@
use crate::fragments::whitespace::cfws;
use nom::{
bytes::complete::{tag, take_while1},
combinator::{opt, recognize},
multi::many0,
sequence::{delimited, pair},
IResult,
};
/// VCHAR definition
///
/// True for printable ASCII (`0x21..=0x7E`) and for any non-ASCII character.
pub fn is_vchar(c: char) -> bool {
    !c.is_ascii() || matches!(c, '\x21'..='\x7E')
}
/// Sequence of visible chars with the UTF-8 extension
///
/// ```abnf
/// VCHAR = %x21-7E
/// ; visible (printing) characters
/// VCHAR =/ UTF8-non-ascii
/// SEQ = 1*VCHAR
/// ```
///
/// Consumes the longest non-empty prefix made of characters accepted by
/// `is_vchar`.
#[allow(dead_code)]
pub fn vchar_seq(input: &str) -> IResult<&str, &str> {
take_while1(is_vchar)(input)
}
/// Atom allowed characters
///
/// ASCII letters and digits, the punctuation ``!#$%&'*+-/=?^_`{|}~``,
/// and any non-ASCII character.
fn is_atext(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' => true,
        '!' | '#' | '$' | '%' | '&' | '\'' | '*' | '+' | '-' | '/' | '=' | '?' | '^' | '_'
        | '`' | '{' | '|' | '}' | '~' => true,
        other => !other.is_ascii(),
    }
}
/// Atom
///
/// `[CFWS] 1*atext [CFWS]`
///
/// Returns only the run of atext characters; the optional surrounding
/// CFWS is consumed and dropped.
pub fn atom(input: &str) -> IResult<&str, &str> {
delimited(opt(cfws), take_while1(is_atext), opt(cfws))(input)
}
/// dot-atom-text
///
/// `1*atext *("." 1*atext)`
///
/// Returns the whole matched span, dots included.
pub fn dot_atom_text(input: &str) -> IResult<&str, &str> {
recognize(pair(
take_while1(is_atext),
many0(pair(tag("."), take_while1(is_atext))),
))(input)
}
/// dot-atom
///
/// `[CFWS] dot-atom-text [CFWS]`
///
/// Returns only the dot-atom-text; the optional surrounding CFWS is
/// consumed and dropped.
pub fn dot_atom(input: &str) -> IResult<&str, &str> {
delimited(opt(cfws), dot_atom_text, opt(cfws))(input)
}
/// Special characters: `( ) < > [ ] : ; @ \ , . "`
#[allow(dead_code)]
pub fn is_special(c: char) -> bool {
    matches!(
        c,
        '(' | ')' | '<' | '>' | '[' | ']' | ':' | ';' | '@' | '\\' | ',' | '.' | '"'
    )
}
#[cfg(test)]
mod tests {
use super::*;
// Visible character runs stop at the first space and accept non-ASCII characters.
#[test]
fn test_vchar_seq() {
assert_eq!(vchar_seq("hello world"), Ok((" world", "hello")));
assert_eq!(vchar_seq("hello👋 world"), Ok((" world", "hello👋")));
}
#[test]
fn test_atext() {
assert!(is_atext('='));
assert!(is_atext('5'));
assert!(is_atext('Q'));
assert!(!is_atext(' '));
assert!(is_atext('É')); // support utf8
}
// Comments and white space around the atom are skipped.
#[test]
fn test_atom() {
assert_eq!(
atom("(skip) imf_codec (hidden) aerogramme"),
Ok(("aerogramme", "imf_codec"))
);
}
// Without CFWS handling, the trailing space stays in the remainder.
#[test]
fn test_dot_atom_text() {
assert_eq!(
dot_atom_text("quentin.dufour.io abcdef"),
Ok((" abcdef", "quentin.dufour.io"))
);
}
#[test]
fn test_dot_atom() {
assert_eq!(
dot_atom(" (skip) quentin.dufour.io abcdef"),
Ok(("abcdef", "quentin.dufour.io"))
);
}
}

27
src/headers.rs Normal file
View file

@ -0,0 +1,27 @@
use nom::{
self,
combinator::{all_consuming, recognize},
multi::many0,
sequence::terminated,
IResult,
};
use crate::text::whitespace::{foldable_line, line, obs_crlf};
/// Splits a raw message into its header fields and its body.
///
/// First isolates the header segment, then cuts this segment into
/// (possibly folded) field lines. Returns the body as the remaining input
/// and the field lines as the parsed value.
pub fn headers(input: &[u8]) -> IResult<&[u8], Vec<&[u8]>> {
    segment(input).and_then(|(body, raw_headers)| {
        fields(raw_headers).map(|(_, field_lines)| (body, field_lines))
    })
}
// -- part 1, segment
// Recognizes zero or more lines followed by one (possibly obsolete) line
// ending; returns the recognized lines, the remainder is what follows.
fn segment(input: &[u8]) -> IResult<&[u8], &[u8]> {
terminated(recognize(many0(line)), obs_crlf)(input)
}
// -- part 2, isolate fields
// Cuts the header segment into foldable lines; fails unless the whole
// input is consumed.
fn fields(input: &[u8]) -> IResult<&[u8], Vec<&[u8]>> {
    all_consuming(many0(foldable_line))(input)
}

View file

@ -1,3 +1,5 @@
pub mod error;
pub mod fragments;
pub mod multipass;
//pub mod mime;
//pub mod message;
pub mod headers;
pub mod text;

View file

@ -292,18 +292,6 @@ pub fn version(input: &str) -> IResult<&str, Version> {
Ok((rest, Version { major, minor }))
}
/// Token allowed characters
///
/// Printable ASCII (no space, no control character) except
/// `( ) < > @ , ; : \ " / [ ] ? =`.
fn is_token_text(c: char) -> bool {
    let is_excluded = matches!(
        c,
        '(' | ')' | '<' | '>' | '@' | ',' | ';' | ':' | '\\' | '"' | '/' | '[' | ']' | '?' | '='
    );
    c.is_ascii_graphic() && !is_excluded
}
/// Token
///
/// `[CFWS] 1*token_text [CFWS]`
///
/// Returns only the run of token characters; the optional surrounding
/// CFWS is consumed and dropped.
pub fn token(input: &str) -> IResult<&str, &str> {
delimited(opt(cfws), take_while1(is_token_text), opt(cfws))(input)
}
pub fn parameter(input: &str) -> IResult<&str, Parameter> {
let (rest, (pname, _, pvalue)) = tuple((
token,

View file

@ -1,8 +1,9 @@
use imf_codec::fragments::section::Section;
use imf_codec::multipass::segment;
//use imf_codec::fragments::section::Section;
//use imf_codec::multipass::segment;
use std::io;
use std::io::Read;
/*
fn parser<'a, F>(input: &'a [u8], func: F) -> ()
where
F: FnOnce(&Section) -> (),
@ -15,9 +16,10 @@ where
let section = field_body.section();
func(&section.fields);
}
}*/
fn main() {
/*
// Read full mail in memory
let mut rawmail = Vec::new();
io::stdin().lock().read_to_end(&mut rawmail).unwrap();
@ -30,4 +32,6 @@ fn main() {
assert!(section.from.len() > 0);
assert!(section.bad_fields.len() == 0);
});
*/
println!("hello world");
}

View file

@ -11,9 +11,32 @@ use crate::error::IMFError;
use crate::fragments::lazy;
use crate::fragments::mailbox::mailbox;
use crate::fragments::misc_token::phrase;
use crate::fragments::model::{AddressList, AddressRef, GroupRef, MailboxList, MailboxRef};
//use crate::fragments::model::{AddressList, AddressRef, GroupRef, MailboxList, MailboxRef};
use crate::fragments::whitespace::cfws;
/// A named group of mailboxes.
#[derive(Debug, PartialEq)]
pub struct GroupRef {
    pub name: String,
    pub participants: Vec<MailboxRef>,
}

/// An address: either a single mailbox or a group of mailboxes.
#[derive(Debug, PartialEq)]
pub enum AddressRef {
    Single(MailboxRef),
    Many(GroupRef),
}

/// A mailbox converts to the `Single` variant.
impl From<MailboxRef> for AddressRef {
    fn from(mx: MailboxRef) -> Self {
        Self::Single(mx)
    }
}

/// A group converts to the `Many` variant.
impl From<GroupRef> for AddressRef {
    fn from(grp: GroupRef) -> Self {
        Self::Many(grp)
    }
}

/// A list of addresses.
pub type AddressList = Vec<AddressRef>;
impl<'a> TryFrom<&'a lazy::Mailbox<'a>> for MailboxRef {
type Error = IMFError<'a>;

View file

@ -14,6 +14,14 @@ use crate::fragments::model::{MessageId, MessageIdList};
use crate::fragments::whitespace::cfws;
use crate::fragments::words::dot_atom_text;
/// A message identifier stored as two borrowed string slices.
// NOTE(review): presumably `left` and `right` are the parts on each side of the "@" - confirm against the parser
#[derive(Debug, PartialEq)]
pub struct MessageId<'a> {
pub left: &'a str,
pub right: &'a str,
}
/// A list of message identifiers.
pub type MessageIdList<'a> = Vec<MessageId<'a>>;
impl<'a> TryFrom<&'a lazy::Identifier<'a>> for MessageId<'a> {
type Error = IMFError<'a>;

View file

@ -10,11 +10,37 @@ use nom::{
use std::borrow::Cow;
use crate::fragments::misc_token::{phrase, word};
use crate::fragments::model::{AddrSpec, MailboxRef};
use crate::fragments::quoted::quoted_string;
use crate::fragments::whitespace::{cfws, fws, is_obs_no_ws_ctl};
use crate::fragments::words::{atom, dot_atom};
/// An address specification made of a local part and a domain.
#[derive(Debug, PartialEq)]
pub struct AddrSpec {
    pub local_part: String,
    pub domain: String,
}

impl AddrSpec {
    /// Returns the address rendered as `local_part@domain`.
    pub fn fully_qualified(&self) -> String {
        [self.local_part.as_str(), self.domain.as_str()].join("@")
    }
}
/// A mailbox: an address specification with an optional display name.
#[derive(Debug, PartialEq)]
pub struct MailboxRef {
    // The actual "email address" like hello@example.com
    pub addrspec: AddrSpec,
    pub name: Option<String>,
}

/// A bare address specification becomes a mailbox without a name.
impl From<AddrSpec> for MailboxRef {
    fn from(addr: AddrSpec) -> Self {
        Self {
            addrspec: addr,
            name: None,
        }
    }
}

/// A list of mailboxes.
pub type MailboxList = Vec<MailboxRef>;
/// Mailbox
///
/// ```abnf

142
src/text/ascii.rs Normal file
View file

@ -0,0 +1,142 @@
// ASCII
// -- CONTROL CHARACTERS
pub const NULL: u8 = 0x00; // NULL
pub const SOH: u8 = 0x01; // START OF HEADING
pub const STX: u8 = 0x02; // START OF TEXT
pub const ETX: u8 = 0x03; // END OF TEXT
pub const EOT: u8 = 0x04; // END OF TRANSMISSION
pub const ENQ: u8 = 0x05; // ENQUIRY
// Historical misspelling of ENQ, kept so that existing users still compile.
pub const ANQ: u8 = ENQ;
pub const ACK: u8 = 0x06; // ACKNOWLEDGE
pub const BEL: u8 = 0x07; // BELL
pub const BS: u8 = 0x08; // BACKSPACE
pub const HT: u8 = 0x09; // horizontal tab
pub const LF: u8 = 0x0A; // LINE FEED
pub const VT: u8 = 0x0B; // VERTICAL TAB
pub const FF: u8 = 0x0C; // FORM FEED
pub const CR: u8 = 0x0D; // CARRIAGE RETURN
pub const SO: u8 = 0x0E; // SHIFT OUT
pub const SI: u8 = 0x0F; // SHIFT IN
pub const DLE: u8 = 0x10; // DATA LINK ESCAPE
pub const DC1: u8 = 0x11; // DEVICE CONTROL 1
pub const DC2: u8 = 0x12; // DEVICE CONTROL 2
pub const DC3: u8 = 0x13; // DEVICE CONTROL 3
pub const DC4: u8 = 0x14; // DEVICE CONTROL 4
pub const NAK: u8 = 0x15; // NEGATIVE ACKNOWLEDGE
pub const SYN: u8 = 0x16; // SYNCHRONOUS IDLE
pub const ETB: u8 = 0x17; // END OF TRANSMISSION BLOCK
pub const CAN: u8 = 0x18; // CANCEL
pub const EM: u8 = 0x19; // END OF MEDIUM
pub const SUB: u8 = 0x1A; // SUBSTITUTE
pub const ESC: u8 = 0x1B; // ESCAPE
pub const FS: u8 = 0x1C; // FILE SEPARATOR
pub const GS: u8 = 0x1D; // GROUP SEPARATOR
pub const RS: u8 = 0x1E; // RECORD SEPARATOR
pub const US: u8 = 0x1F; // UNIT SEPARATOR
pub const DEL: u8 = 0x7F; // DELETE
// -- GRAPHIC CHARACTERS
pub const SP: u8 = 0x20; // space
pub const EXCLAMATION: u8 = 0x21; // !
pub const DQUOTE: u8 = 0x22; // "
pub const NUM: u8 = 0x23; // #
pub const DOLLAR: u8 = 0x24; // $
pub const PERCENT: u8 = 0x25; // %
pub const AMPERSAND: u8 = 0x26; // &
pub const SQUOTE: u8 = 0x27; // '
pub const LEFT_PAR: u8 = 0x28; // (
pub const RIGHT_PAR: u8 = 0x29; // )
pub const ASTERISK: u8 = 0x2A; // *
pub const PLUS: u8 = 0x2B; // +
pub const COMMA: u8 = 0x2C; // ,
pub const MINUS: u8 = 0x2D; // -
pub const PERIOD: u8 = 0x2E; // .
pub const SLASH: u8 = 0x2F; // /
// Digits: N<digit>
pub const N0: u8 = 0x30; // 0
pub const N1: u8 = 0x31; // 1
pub const N2: u8 = 0x32; // 2
pub const N3: u8 = 0x33; // 3
pub const N4: u8 = 0x34; // 4
pub const N5: u8 = 0x35; // 5
pub const N6: u8 = 0x36; // 6
pub const N7: u8 = 0x37; // 7
pub const N8: u8 = 0x38; // 8
pub const N9: u8 = 0x39; // 9
pub const COL: u8 = 0x3A; // :
pub const SEM_COL: u8 = 0x3B; // ;
pub const LT: u8 = 0x3C; // <
pub const EQ: u8 = 0x3D; // =
pub const GT: u8 = 0x3E; // >
pub const QUESTION: u8 = 0x3F; // ?
pub const AT: u8 = 0x40; // @
// Capital (uppercase) letters: LC<letter>. Beware, "LC" does NOT mean lowercase here.
pub const LCA: u8 = 0x41; // A
pub const LCB: u8 = 0x42; // B
pub const LCC: u8 = 0x43; // C
pub const LCD: u8 = 0x44; // D
pub const LCE: u8 = 0x45; // E
pub const LCF: u8 = 0x46; // F
pub const LCG: u8 = 0x47; // G
pub const LCH: u8 = 0x48; // H
pub const LCI: u8 = 0x49; // I
pub const LCJ: u8 = 0x4A; // J
pub const LCK: u8 = 0x4B; // K
pub const LCL: u8 = 0x4C; // L
pub const LCM: u8 = 0x4D; // M
pub const LCN: u8 = 0x4E; // N
pub const LCO: u8 = 0x4F; // O
pub const LCP: u8 = 0x50; // P
pub const LCQ: u8 = 0x51; // Q
pub const LCR: u8 = 0x52; // R
pub const LCS: u8 = 0x53; // S
pub const LCT: u8 = 0x54; // T
pub const LCU: u8 = 0x55; // U
pub const LCV: u8 = 0x56; // V
pub const LCW: u8 = 0x57; // W
pub const LCX: u8 = 0x58; // X
pub const LCY: u8 = 0x59; // Y
pub const LCZ: u8 = 0x5A; // Z
pub const LEFT_BRACKET: u8 = 0x5B; // [
pub const BACKSLASH: u8 = 0x5C; // \
pub const RIGHT_BRACKET: u8 = 0x5D; // ]
pub const CARET: u8 = 0x5E; // ^
// Historical misspelling of CARET, kept so that existing users still compile.
pub const CARRET: u8 = CARET;
pub const UNDERSCORE: u8 = 0x5F; // _
pub const GRAVE: u8 = 0x60; // `
// Small (lowercase) letters: LS<letter>
pub const LSA: u8 = 0x61; // a
pub const LSB: u8 = 0x62; // b
pub const LSC: u8 = 0x63; // c
pub const LSD: u8 = 0x64; // d
pub const LSE: u8 = 0x65; // e
pub const LSF: u8 = 0x66; // f
pub const LSG: u8 = 0x67; // g
pub const LSH: u8 = 0x68; // h
pub const LSI: u8 = 0x69; // i
pub const LSJ: u8 = 0x6A; // j
pub const LSK: u8 = 0x6B; // k
pub const LSL: u8 = 0x6C; // l
pub const LSM: u8 = 0x6D; // m
pub const LSN: u8 = 0x6E; // n
pub const LSO: u8 = 0x6F; // o
pub const LSP: u8 = 0x70; // p
pub const LSQ: u8 = 0x71; // q
pub const LSR: u8 = 0x72; // r
pub const LSS: u8 = 0x73; // s
pub const LST: u8 = 0x74; // t
pub const LSU: u8 = 0x75; // u
pub const LSV: u8 = 0x76; // v
pub const LSW: u8 = 0x77; // w
pub const LSX: u8 = 0x78; // x
pub const LSY: u8 = 0x79; // y
pub const LSZ: u8 = 0x7A; // z
pub const LEFT_CURLY: u8 = 0x7B; // {
pub const PIPE: u8 = 0x7C; // |
pub const RIGHT_CURLY: u8 = 0x7D; // }
pub const TILDE: u8 = 0x7E; // ~
// GROUP OF CHARACTERS
// -- CRLF
// Carriage return followed by line feed
pub const CRLF: &[u8] = &[CR, LF];
// -- WHITESPACE
// Horizontal tab and space
pub const WS: &[u8] = &[HT, SP];
// Inclusive bounds of the range starting at space (0x20) and ending at tilde (0x7E)
pub const GRAPHIC_BEGIN: u8 = SP;
pub const GRAPHIC_END: u8 = TILDE;

43
src/text/buffer.rs Normal file
View file

@ -0,0 +1,43 @@
use encoding_rs::Encoding;
#[derive(Debug, PartialEq, Default)]
pub struct Text<'a> {
parts: Vec<&'a [u8]>,
}
impl<'a> Text<'a> {
pub fn push(&mut self, e: &[u8]) {
self.parts.push(e)
}
pub fn to_string(&self) -> String {
let enc = encoding_rs::UTF_8;
let size = self.parts.iter().fold(0, |acc, v| acc + v.len());
self.parts.iter().fold(
String::with_capacity(size),
|mut acc, v| {
let (content, _) = enc.decode_without_bom_handling(v);
acc.push_str(content.as_ref());
acc
},
)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    // `ascii` is not imported by the parent module, so bring it in here.
    use crate::text::ascii;

    /// Parts are concatenated in insertion order.
    #[test]
    fn test_text() {
        let mut text = Text::default();
        text.push(b"hello");
        text.push(&[ascii::SP]);
        text.push(b"world");
        assert_eq!(text.to_string(), "hello world".to_string());
    }
}

View file

@ -1,5 +1,3 @@
use std::borrow::Cow;
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
use nom::{
@ -7,92 +5,107 @@ use nom::{
branch::alt,
bytes::complete::{tag, take, take_while1, take_while},
character::complete::{one_of},
character::is_alphanumeric,
combinator::map,
sequence::{preceded, terminated, tuple},
multi::many0,
};
use encoding_rs::Encoding;
use base64::{Engine as _, engine::general_purpose};
use crate::fragments::mime;
use crate::text::words;
use crate::text::ascii;
const IS_LAST_BUFFER: bool = true;
const ALLOW_UTF8: bool = true;
const NO_TLD: Option<&[u8]> = None;
pub fn header_decode(input: &[u8]) -> Cow<str> {
// Create detector
let mut detector = EncodingDetector::new();
detector.feed(input, IS_LAST_BUFFER);
// Get encoding
let enc: &Encoding = detector.guess(NO_TLD, ALLOW_UTF8);
let (header, _, _) = enc.decode(input);
header
}
pub fn encoded_word(input: &str) -> IResult<&str, String> {
pub fn encoded_word(input: &[u8]) -> IResult<&[u8], EncodedWord> {
alt((encoded_word_quoted, encoded_word_base64))(input)
}
pub fn encoded_word_quoted(input: &str) -> IResult<&str, String> {
pub fn encoded_word_quoted(input: &[u8]) -> IResult<&[u8], EncodedWord> {
let (rest, (_, charset, _, _, _, txt, _)) = tuple((
tag("=?"), mime::token,
tag("=?"), words::mime_token,
tag("?"), one_of("Qq"),
tag("?"), ptext,
tag("?=")))(input)?;
let renc = Encoding::for_label(charset.as_bytes()).unwrap_or(encoding_rs::WINDOWS_1252);
let parsed = decode_quoted_encoding(renc, txt.iter());
let renc = Encoding::for_label(charset).unwrap_or(encoding_rs::WINDOWS_1252);
let parsed = EncodedWord::Quoted(QuotedWord { enc: renc, chunks: txt });
Ok((rest, parsed))
}
pub fn encoded_word_base64(input: &str) -> IResult<&str, String> {
pub fn encoded_word_base64(input: &[u8]) -> IResult<&[u8], EncodedWord> {
let (rest, (_, charset, _, _, _, txt, _)) = tuple((
tag("=?"), mime::token,
tag("=?"), words::mime_token,
tag("?"), one_of("Bb"),
tag("?"), btext,
tag("?=")))(input)?;
let renc = Encoding::for_label(charset.as_bytes()).unwrap_or(encoding_rs::WINDOWS_1252);
let parsed = general_purpose::STANDARD_NO_PAD.decode(txt).map(|d| renc.decode(d.as_slice()).0.to_string()).unwrap_or("".into());
let renc = Encoding::for_label(charset).unwrap_or(encoding_rs::WINDOWS_1252);
let parsed = EncodedWord::Base64(Base64Word { enc: renc, content: txt });
Ok((rest, parsed))
}
fn decode_quoted_encoding<'a>(enc: &'static Encoding, q: impl Iterator<Item = &'a QuotedChunk<'a>>) -> String {
q.fold(
String::new(),
|mut acc, c| {
let dec = match c {
QuotedChunk::Safe(v) => Cow::Borrowed(*v),
QuotedChunk::Space => Cow::Borrowed(" "),
QuotedChunk::Encoded(v) => {
let w = &[*v];
let (d, _, _) = enc.decode(w);
Cow::Owned(d.into_owned())
},
};
acc.push_str(dec.as_ref());
acc
})
/// A parsed encoded word whose payload has not been decoded yet.
///
/// `Quoted` is built from the "Q"/"q" form and `Base64` from the
/// "B"/"b" form.
#[derive(PartialEq,Debug)]
pub enum EncodedWord<'a> {
Quoted(QuotedWord<'a>),
Base64(Base64Word<'a>),
}
/// Payload of a base64 encoded word, with the charset to decode it with.
#[derive(PartialEq, Debug)]
pub struct Base64Word<'a> {
    pub enc: &'static Encoding,
    pub content: &'a [u8],
}

impl<'a> Base64Word<'a> {
    /// Decodes the base64 payload, then converts the bytes to text with
    /// `enc`. Returns an empty string when the payload is not valid base64.
    pub fn to_string(&self) -> String {
        match general_purpose::STANDARD_NO_PAD.decode(self.content) {
            Ok(raw) => {
                let (text, _, _) = self.enc.decode(raw.as_slice());
                text.into_owned()
            }
            Err(_) => String::new(),
        }
    }
}
/// Payload of a "Q" encoded word, with the charset to decode it with.
#[derive(PartialEq, Debug)]
pub struct QuotedWord<'a> {
    pub enc: &'static Encoding,
    pub chunks: Vec<QuotedChunk<'a>>,
}

impl<'a> QuotedWord<'a> {
    /// Rebuilds the decoded text.
    ///
    /// Consecutive `Encoded` octets are gathered and decoded together with
    /// `enc`, so that characters spanning several octets (e.g. `=C3=A9` in
    /// UTF-8) are decoded as one character instead of one replacement
    /// character per octet. `Safe` chunks are decoded as UTF-8 and `Space`
    /// yields a single space.
    pub fn to_string(&self) -> String {
        let mut out = String::new();
        // Octets of the current run of `Encoded` chunks, not decoded yet.
        let mut pending: Vec<u8> = Vec::new();
        for chunk in &self.chunks {
            match chunk {
                QuotedChunk::Encoded(byte) => pending.push(*byte),
                QuotedChunk::Safe(raw) => {
                    self.flush_encoded(&mut pending, &mut out);
                    let (content, _) = encoding_rs::UTF_8.decode_without_bom_handling(raw);
                    out.push_str(content.as_ref());
                }
                QuotedChunk::Space => {
                    self.flush_encoded(&mut pending, &mut out);
                    out.push(' ');
                }
            }
        }
        // The word may end with encoded octets.
        self.flush_encoded(&mut pending, &mut out);
        out
    }

    /// Decodes the pending octets with `enc`, appends them to `out`, then empties `pending`.
    fn flush_encoded(&self, pending: &mut Vec<u8>, out: &mut String) {
        if pending.is_empty() {
            return;
        }
        let (decoded, _) = self.enc.decode_without_bom_handling(pending.as_slice());
        out.push_str(decoded.as_ref());
        pending.clear();
    }
}
#[derive(PartialEq,Debug)]
pub enum QuotedChunk<'a> {
Safe(&'a str),
Safe(&'a [u8]),
Encoded(u8),
Space,
}
//quoted_printable
pub fn ptext(input: &str) -> IResult<&str, Vec<QuotedChunk>> {
pub fn ptext(input: &[u8]) -> IResult<&[u8], Vec<QuotedChunk>> {
many0(alt((safe_char2, encoded_space, hex_octet)))(input)
}
fn safe_char2(input: &str) -> IResult<&str, QuotedChunk> {
fn safe_char2(input: &[u8]) -> IResult<&[u8], QuotedChunk> {
map(take_while1(is_safe_char2), |v| QuotedChunk::Safe(v))(input)
}
@ -101,8 +114,8 @@ fn safe_char2(input: &str) -> IResult<&str, QuotedChunk> {
/// 8-bit values which correspond to printable ASCII characters other
/// than "=", "?", and "_" (underscore), MAY be represented as those
/// characters.
fn is_safe_char2(c: char) -> bool {
c.is_ascii() && !c.is_ascii_control() && c != '_' && c != '?' && c != '='
fn is_safe_char2(c: u8) -> bool {
c >= ascii::SP && c != ascii::UNDERSCORE && c != ascii::QUESTION && c != ascii::EQ
}
/*
@ -111,28 +124,30 @@ fn is_safe_char(c: char) -> bool {
(c >= '\x3e' && c <= '\x7e')
}*/
fn encoded_space(input: &str) -> IResult<&str, QuotedChunk> {
fn encoded_space(input: &[u8]) -> IResult<&[u8], QuotedChunk> {
map(tag("_"), |_| QuotedChunk::Space)(input)
}
fn hex_octet(input: &str) -> IResult<&str, QuotedChunk> {
fn hex_octet(input: &[u8]) -> IResult<&[u8], QuotedChunk> {
use nom::error::*;
let (rest, hstr) = preceded(tag("="), take(2usize))(input)?;
let (rest, hbytes) = preceded(tag("="), take(2usize))(input)?;
let parsed = u8::from_str_radix(hstr, 16)
let (hstr, _) = encoding_rs::UTF_8.decode_without_bom_handling(hbytes);
let parsed = u8::from_str_radix(hstr.as_ref(), 16)
.map_err(|_| nom::Err::Error(Error::new(input, ErrorKind::Verify)))?;
Ok((rest, QuotedChunk::Encoded(parsed)))
}
//base64 (maybe use a crate)
pub fn btext(input: &str) -> IResult<&str, &str> {
pub fn btext(input: &[u8]) -> IResult<&[u8], &[u8]> {
terminated(take_while(is_bchar), many0(tag("=")))(input)
}
fn is_bchar(c: char) -> bool {
c.is_ascii_alphanumeric() || c == '+' || c == '/'
fn is_bchar(c: u8) -> bool {
is_alphanumeric(c) || c == ascii::PLUS || c == ascii::SLASH
}
#[cfg(test)]

166
src/text/misc_token.rs Normal file
View file

@ -0,0 +1,166 @@
use nom::{
branch::alt,
bytes::complete::take_while1,
character::complete::space0,
combinator::{into, map, opt},
multi::{many0, many1},
sequence::{preceded, tuple},
IResult,
};
use std::borrow::Cow;
use crate::text::{
quoted::quoted_string,
whitespace::{fws, is_obs_no_ws_ctl},
words::{atom, is_vchar},
encoding::{self, encoded_word},
buffer,
ascii,
};
/// List of phrases, one owned string per phrase.
#[derive(Debug, PartialEq, Default)]
pub struct PhraseList(pub Vec<String>);
/*
impl<'a> TryFrom<&'a lazy::Unstructured<'a>> for Unstructured {
type Error = IMFError<'a>;
fn try_from(input: &'a lazy::Unstructured<'a>) -> Result<Self, Self::Error> {
unstructured(input.0)
.map(|(_, v)| Unstructured(v))
.map_err(|e| IMFError::Unstructured(e))
}
}
impl<'a> TryFrom<&'a lazy::PhraseList<'a>> for PhraseList {
type Error = IMFError<'a>;
fn try_from(p: &'a lazy::PhraseList<'a>) -> Result<Self, Self::Error> {
separated_list1(tag(","), phrase)(p.0)
.map(|(_, q)| PhraseList(q))
.map_err(|e| IMFError::PhraseList(e))
}
}*/
pub enum Word<'a> {
Quoted(buffer::Text<'a>),
Encoded(encoding::EncodedWord<'a>),
Atom(&'a [u8]),
}
impl<'a> Word<'a> {
pub fn to_string(&self) -> String {
match self {
Word::Quoted(v) => v.to_string(),
Word::Encoded(v) => v.to_string(),
Word::Atom(v) => v.to_string(),
}
}
}
/// Word
///
/// ```abnf
/// word = atom / quoted-string
/// ```
///
/// Encoded words are accepted too. Alternatives are tried in this order:
/// quoted string, encoded word, atom.
pub fn word(input: &[u8]) -> IResult<&[u8], Word> {
    let quoted = map(quoted_string, |text| Word::Quoted(text));
    let encoded = map(encoded_word, |enc_word| Word::Encoded(enc_word));
    let bare = map(atom, |raw| Word::Atom(raw));
    alt((quoted, encoded, bare))(input)
}
/// A phrase: a sequence of words.
pub struct Phrase<'a>(pub Vec<Word<'a>>);

impl<'a> Phrase<'a> {
    /// Returns the decoded words joined by a single space.
    pub fn to_string(&self) -> String {
        // `Word` is not a string type, so each word is decoded before joining.
        self.0
            .iter()
            .map(|word| word.to_string())
            .collect::<Vec<String>>()
            .join(" ")
    }
}
/// Phrase
///
/// ```abnf
/// phrase = 1*word / obs-phrase
/// ```
///
/// Parses one or more words and wraps them in a `Phrase`.
pub fn phrase(input: &[u8]) -> IResult<&[u8], Phrase> {
    let (rest, words) = many1(word)(input)?;
    Ok((rest, Phrase(words)))
}
/// Compatible unstructured input
///
/// ```abnf
/// obs-utext = %d0 / obs-NO-WS-CTL / VCHAR
/// ```
///
/// Returns `true` when the byte is a visible character, an obsolete
/// non-whitespace control character, or NUL.
fn is_unstructured(c: u8) -> bool {
is_vchar(c) || is_obs_no_ws_ctl(c) || c == ascii::NULL
}
/// A token of an unstructured field body.
///
/// Public because it appears in the public field of `Unstructured`.
pub enum UnstrToken<'a> {
    /// Start marker, used while rebuilding the text; holds no content.
    Init,
    /// An encoded word, not decoded yet.
    Encoded(encoding::EncodedWord<'a>),
    /// A run of plain bytes.
    Plain(&'a [u8]),
}

impl<'a> UnstrToken<'a> {
    /// Returns the decoded text of the token (empty for `Init`).
    ///
    /// Plain bytes are decoded as UTF-8. Encoded words are dispatched on
    /// their variant, as `EncodedWord` itself provides no `to_string`.
    pub fn to_string(&self) -> String {
        match self {
            UnstrToken::Init => String::new(),
            UnstrToken::Encoded(encoding::EncodedWord::Quoted(e)) => e.to_string(),
            UnstrToken::Encoded(encoding::EncodedWord::Base64(e)) => e.to_string(),
            UnstrToken::Plain(e) => encoding_rs::UTF_8
                .decode_without_bom_handling(e)
                .0
                .into_owned(),
        }
    }
}
/// An unstructured field body, kept as a list of tokens.
pub struct Unstructured<'a>(pub Vec<UnstrToken<'a>>);

impl<'a> Unstructured<'a> {
    /// Rebuilds the decoded text of the field.
    ///
    /// Tokens are separated by a single space, except that two consecutive
    /// encoded words are concatenated directly and that nothing is
    /// inserted before the first token.
    pub fn to_string(&self) -> String {
        self.0
            .iter()
            .fold(
                // The accumulator carries the previous token so that the
                // separator can be chosen from the (previous, current) pair.
                (&UnstrToken::Init, String::new()),
                |(prev_token, mut result), current_token| {
                    match (prev_token, current_token) {
                        (UnstrToken::Init, v) => result.push_str(v.to_string().as_ref()),
                        (UnstrToken::Encoded(_), v @ UnstrToken::Encoded(_)) => {
                            result.push_str(v.to_string().as_ref())
                        }
                        (_, v) => {
                            result.push(' ');
                            result.push_str(v.to_string().as_ref())
                        }
                    };
                    (current_token, result)
                },
            )
            .1
    }
}
/// Unstructured header field body
///
/// ```abnf
/// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct
/// ```
///
/// Collects zero or more tokens (encoded word or run of unstructured
/// bytes), each optionally preceded by folding white space, then consumes
/// trailing spaces.
pub fn unstructured(input: &[u8]) -> IResult<&[u8], Unstructured> {
    let token = alt((
        map(encoded_word, |word| UnstrToken::Encoded(word)),
        map(take_while1(is_unstructured), |raw| UnstrToken::Plain(raw)),
    ));
    let (after_tokens, tokens) = many0(preceded(opt(fws), token))(input)?;
    let (rest, _) = space0(after_tokens)?;
    Ok((rest, Unstructured(tokens)))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `input` as a phrase and returns (remaining bytes, decoded text).
    ///
    /// `Phrase` implements no comparison trait, so tests compare its
    /// decoded text instead.
    fn parse(input: &[u8]) -> (&[u8], String) {
        let (rest, parsed) = phrase(input).expect("input should parse as a phrase");
        (rest, parsed.to_string())
    }

    #[test]
    fn test_phrase() {
        assert_eq!(parse(b"hello world"), (&b""[..], "hello world".to_string()));
        assert_eq!(
            parse(b"salut \"le\" monde"),
            (&b""[..], "salut le monde".to_string())
        );
        assert_eq!(
            parse(b"fin\r\n du\r\nmonde"),
            (&b"\r\nmonde"[..], "fin du".to_string())
        );
    }
}

7
src/text/mod.rs Normal file
View file

@ -0,0 +1,7 @@
pub mod ascii;
pub mod encoding;
pub mod misc_token;
pub mod quoted;
pub mod whitespace;
pub mod words;
pub mod buffer;

View file

@ -1,14 +1,16 @@
use nom::{
branch::alt,
bytes::complete::tag,
character::complete::{anychar, satisfy},
combinator::opt,
bytes::complete::{take_while1, tag},
character::complete::anychar,
combinator::{recognize, opt},
multi::many0,
sequence::{pair, preceded},
IResult,
};
use crate::fragments::whitespace::{cfws, fws, is_obs_no_ws_ctl};
use crate::text::whitespace::{cfws, fws, is_obs_no_ws_ctl};
use crate::text::ascii;
use crate::text::buffer;
/// Quoted pair
///
@ -16,8 +18,8 @@ use crate::fragments::whitespace::{cfws, fws, is_obs_no_ws_ctl};
/// quoted-pair = ("\" (VCHAR / WSP)) / obs-qp
/// obs-qp = "\" (%d0 / obs-NO-WS-CTL / LF / CR)
/// ```
pub fn quoted_pair(input: &str) -> IResult<&str, char> {
preceded(tag("\\"), anychar)(input)
pub fn quoted_pair(input: &[u8]) -> IResult<&[u8], u8> {
preceded(tag(&[ascii::SLASH]), anychar)(input)
}
/// Allowed characters in quote
@ -28,11 +30,11 @@ pub fn quoted_pair(input: &str) -> IResult<&str, char> {
/// %d93-126 / ; "\" or the quote character
/// obs-qtext
/// ```
fn is_restr_qtext(c: char) -> bool {
c == '\x21' || (c >= '\x23' && c <= '\x5B') || (c >= '\x5D' && c <= '\x7E')
fn is_restr_qtext(c: u8) -> bool {
c == ascii::EXCLAMATION || (c >= ascii::NUM && c <= ascii::LEFT_BRACKET) || (c >= ascii::RIGHT_BRACKET && c <= ascii::TILDE)
}
fn is_qtext(c: char) -> bool {
fn is_qtext(c: u8) -> bool {
is_restr_qtext(c) || is_obs_no_ws_ctl(c)
}
@ -41,8 +43,8 @@ fn is_qtext(c: char) -> bool {
/// ```abnf
/// qcontent = qtext / quoted-pair
/// ```
fn qcontent(input: &str) -> IResult<&str, char> {
alt((satisfy(is_qtext), quoted_pair))(input)
fn qcontent(input: &u8) -> IResult<&[u8], &[u8]> {
alt((take_while1(is_qtext), recognize(quoted_pair)))(input)
}
/// Quoted string
@ -52,7 +54,7 @@ fn qcontent(input: &str) -> IResult<&str, char> {
/// DQUOTE *([FWS] qcontent) [FWS] DQUOTE
/// [CFWS]
/// ```
pub fn quoted_string(input: &str) -> IResult<&str, String> {
pub fn quoted_string(input: &[u8]) -> IResult<&[u8], buffer::Text> {
let (input, _) = opt(cfws)(input)?;
let (input, _) = tag("\"")(input)?;
let (input, content) = many0(pair(opt(fws), qcontent))(input)?;
@ -60,11 +62,11 @@ pub fn quoted_string(input: &str) -> IResult<&str, String> {
// Rebuild string
let mut qstring = content
.iter()
.fold(String::with_capacity(16), |mut acc, (maybe_wsp, c)| {
.fold(buffer::Text::default(), |mut acc, (maybe_wsp, c)| {
if let Some(wsp) = maybe_wsp {
acc.push(*wsp);
acc.push(&[ascii::SP]);
}
acc.push(*c);
acc.push(c);
acc
});
@ -84,13 +86,22 @@ mod tests {
#[test]
fn test_quoted_string() {
let mut text = buffer::Text::default();
text.push(b"hello");
text.push(&[ascii::DQUOTE]);
text.push(b"world");
assert_eq!(
quoted_string(" \"hello\\\"world\" "),
Ok(("", "hello\"world".to_string()))
quoted_string(b" \"hello\\\"world\" "),
Ok(("", text))
);
let mut text = buffer::Text::default();
text.push(b"hello");
text.push(&[ascii::SP]);
text.push(b"world");
assert_eq!(
quoted_string("\"hello\r\n world\""),
Ok(("", "hello world".to_string()))
quoted_string(b"\"hello\r\n world\""),
Ok(("", text))
);
}
}

View file

@ -1,71 +1,68 @@
use crate::fragments::quoted::quoted_pair;
use nom::{
branch::alt,
bytes::complete::{is_not, tag},
character::complete::{crlf, satisfy, space0, space1},
bytes::complete::{is_not, tag, take_while1},
character::complete::{space0, space1},
combinator::{opt, recognize},
multi::{many0, many1},
sequence::{pair, terminated, tuple},
sequence::{pair, tuple},
IResult,
};
use crate::fragments::encoding::encoded_word;
use crate::text::encoding::encoded_word;
use crate::text::quoted::quoted_pair;
use crate::text::ascii;
/// Whitespace (space, new line, tab) content and
/// delimited content (eg. comment, line, sections, etc.)
// Bytes CRLF
const CR: u8 = 0x0D;
const LF: u8 = 0x0A;
pub const CRLF: &[u8] = &[CR, LF];
/// Obsolete/Compatible CRLF
///
/// Theoretically, all lines must end with \r\n
/// but some mail servers like Dovecot support malformed emails,
/// for example with only \n eol. It works because
/// \r or \n is allowed nowhere else, so we also add this support.
pub fn headers(input: &[u8]) -> IResult<&[u8], &[u8]> {
terminated(recognize(many0(line)), obs_crlf)(input)
pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> {
alt((tag(ascii::CRLF), tag(&[ascii::CR]), tag(&[ascii::LF])))(input)
}
/// Parse the complete input as zero or more foldable lines.
///
/// `many0(foldable_line)` collects the lines and `all_consuming` makes the
/// parser fail when some input is left over.
///
/// NOTE(review): this signature works on `&str` while `foldable_line` is
/// declared over `&[u8]`, and `all_consuming` is not part of the visible
/// `nom` import list. Confirm this still compiles after the byte refactor.
pub fn fields(input: &str) -> IResult<&str, Vec<&str>> {
all_consuming(many0(foldable_line))(input)
}
pub fn line(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> {
// is_not(CRLF) is a hack, it means "is not CR or LF"
// and not "is not CRLF". In other words, it continues while
// it does not encounter 0x0D or 0x0A.
pair(is_not(CRLF), obs_crlf)(input)
pair(is_not(ascii::CRLF), obs_crlf)(input)
}
pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> {
alt((tag(CRLF), tag(&[CR]), tag(&[LF])))(input)
/// Foldable line
///
/// Recognizes one logical line: some content, then any number of folded
/// continuations (one or more line breaks each followed by whitespace, then
/// more content), then the final line break. The whole span is returned.
///
/// ```abnf
///   fold_line = any *(1*(crlf WS) any) crlf
/// ```
pub fn foldable_line(input: &[u8]) -> IResult<&[u8], &[u8]> {
    // A fold is a run of line breaks, each immediately followed by whitespace.
    let fold = many1(pair(obs_crlf, space1));
    // Each fold must be followed by content on the continuation line.
    let continuations = many0(pair(fold, is_not(ascii::CRLF)));

    recognize(tuple((is_not(ascii::CRLF), continuations, obs_crlf)))(input)
}
// --- whitespaces and comments
// Note: WSP = SP / HTAB = %x20 / %x09
// nom::*::space0 = *WSP
// nom::*::space1 = 1*WSP
/// Permissive CRLF
///
/// Lines are supposed to end with `\r\n`, but some mail servers like Dovecot
/// tolerate malformed emails whose lines end with a lone `\r` or `\n`.
/// As `\r` and `\n` are allowed nowhere else, accepting them as line
/// terminators is supported here too.
pub fn perm_crlf(input: &str) -> IResult<&str, &str> {
    // The two-character terminator is tried first so "\r\n" is taken as a whole.
    let mut line_ending = alt((crlf, tag("\r"), tag("\n")));
    line_ending(input)
}
/// Permissive foldable white space
///
/// Folding white space is used for long headers split across multiple lines.
/// The obsolete syntax allows multiple lines without content; it is implemented
/// for compatibility reasons.
pub fn fws(input: &str) -> IResult<&str, char> {
pub fn fws(input: &[u8]) -> IResult<&[u8], u8> {
let (input, _) = alt((recognize(many1(fold_marker)), space1))(input)?;
Ok((input, ' '))
Ok((input, ascii::SP))
}
fn fold_marker(input: &str) -> IResult<&str, &str> {
fn fold_marker(input: &[u8]) -> IResult<&[u8], &[u8]> {
let (input, _) = space0(input)?;
let (input, _) = perm_crlf(input)?;
let (input, _) = obs_crlf(input)?;
space1(input)
}
@ -85,17 +82,17 @@ fn fold_marker(input: &str) -> IResult<&str, &str> {
///
/// CFWS = (1*([FWS] comment) [FWS]) / FWS
/// ```
pub fn cfws(input: &str) -> IResult<&str, &str> {
pub fn cfws(input: &[u8]) -> IResult<&[u8], &[u8]> {
alt((recognize(comments), recognize(fws)))(input)
}
pub fn comments(input: &str) -> IResult<&str, ()> {
pub fn comments(input: &[u8]) -> IResult<&[u8], ()> {
let (input, _) = many1(tuple((opt(fws), comment)))(input)?;
let (input, _) = opt(fws)(input)?;
Ok((input, ()))
}
pub fn comment(input: &str) -> IResult<&str, ()> {
pub fn comment(input: &[u8]) -> IResult<&[u8], ()> {
let (input, _) = tag("(")(input)?;
let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?;
let (input, _) = opt(fws)(input)?;
@ -103,12 +100,16 @@ pub fn comment(input: &str) -> IResult<&str, ()> {
Ok((input, ()))
}
pub fn ccontent(input: &str) -> IResult<&str, &str> {
alt((recognize(ctext), recognize(quoted_pair), recognize(encoded_word), recognize(comment)))(input)
pub fn ccontent(input: &[u8]) -> IResult<&[u8], &[u8]> {
alt((ctext, recognize(quoted_pair), recognize(encoded_word), recognize(comment)))(input)
}
pub fn ctext(input: &str) -> IResult<&str, char> {
satisfy(is_ctext)(input)
pub fn ctext(input: &[u8]) -> IResult<&[u8], &[u8]> {
take_while1(is_ctext)(input)
}
/// Check whether a byte is accepted as comment text, either by the
/// restricted rule (`is_restr_ctext`) or by the obsolete control-character
/// rule (`is_obs_no_ws_ctl`).
pub fn is_ctext(c: u8) -> bool {
    if is_restr_ctext(c) {
        return true;
    }
    is_obs_no_ws_ctl(c)
}
/// Check if it's a comment text character
@ -119,15 +120,10 @@ pub fn ctext(input: &str) -> IResult<&str, char> {
/// %d93-126 / ; "(", ")", or "\"
/// obs-ctext
///```
pub fn is_restr_ctext(c: char) -> bool {
    // Printable US-ASCII split around "(", ")" and "\"; every non-ASCII
    // character is accepted as well.
    matches!(c, '\x21'..='\x27' | '\x2A'..='\x5B' | '\x5D'..='\x7E') || !c.is_ascii()
}
pub fn is_ctext(c: char) -> bool {
is_restr_ctext(c) || is_obs_no_ws_ctl(c)
/// Check whether a byte falls into one of the three comment-text ranges:
/// `EXCLAMATION..=SQUOTE`, `ASTERISK..=LEFT_BRACKET` or
/// `RIGHT_BRACKET..=TILDE` (bounds taken from the `ascii` module).
pub fn is_restr_ctext(c: u8) -> bool {
    [
        ascii::EXCLAMATION..=ascii::SQUOTE,
        ascii::ASTERISK..=ascii::LEFT_BRACKET,
        ascii::RIGHT_BRACKET..=ascii::TILDE,
    ]
    .iter()
    .any(|range| range.contains(&c))
}
/// US ASCII control characters without effect
@ -139,12 +135,12 @@ pub fn is_ctext(c: char) -> bool {
/// %d14-31 / ; return, line feed, and
/// %d127 ; white space characters
/// ```
pub fn is_obs_no_ws_ctl(c: char) -> bool {
(c >= '\x01' && c <= '\x08')
|| c == '\x0b'
|| c == '\x0b'
|| (c >= '\x0e' && c <= '\x1f')
|| c == '\x7F'
/// Check whether a byte is one of the obsolete control characters:
/// the `SOH..=BS` and `SO..=US` blocks plus `VT`, `FF` and `DEL`.
pub fn is_obs_no_ws_ctl(c: u8) -> bool {
    let in_low_block = (ascii::SOH..=ascii::BS).contains(&c);
    let in_high_block = (ascii::SO..=ascii::US).contains(&c);
    let is_isolated_ctl = [ascii::VT, ascii::FF, ascii::DEL].contains(&c);

    in_low_block || in_high_block || is_isolated_ctl
}
#[cfg(test)]
@ -152,10 +148,10 @@ mod tests {
use super::*;
#[test]
fn test_perm_crlf() {
assert_eq!(perm_crlf("\rworld"), Ok(("world", "\r")));
assert_eq!(perm_crlf("\r\nworld"), Ok(("world", "\r\n")));
assert_eq!(perm_crlf("\nworld"), Ok(("world", "\n")));
fn test_obs_crlf() {
    // `obs_crlf` parses byte slices, so both the input and the expected
    // (remaining, matched) pair are given as bytes.
    assert_eq!(obs_crlf(b"\rworld"), Ok((&b"world"[..], &b"\r"[..])));
    assert_eq!(obs_crlf(b"\r\nworld"), Ok((&b"world"[..], &b"\r\n"[..])));
    assert_eq!(obs_crlf(b"\nworld"), Ok((&b"world"[..], &b"\n"[..])));
}
#[test]

133
src/text/words.rs Normal file
View file

@ -0,0 +1,133 @@
use crate::text::whitespace::cfws;
use crate::text::ascii;
use nom::{
bytes::complete::{tag, take_while1},
character::is_alphanumeric,
combinator::{opt, recognize},
multi::many0,
sequence::{delimited, pair},
IResult,
};
/// Check whether a byte lies in the inclusive range
/// `ascii::EXCLAMATION..=ascii::TILDE`.
pub fn is_vchar(c: u8) -> bool {
    (ascii::EXCLAMATION..=ascii::TILDE).contains(&c)
}
/// MIME token characters
///
/// A byte belongs to a MIME token when it is alphanumeric or one of the
/// punctuation bytes listed in the body.
///
/// forbidden: ()<>@,;:\"/[]?=
fn is_mime_token_text(c: u8) -> bool {
    let allowed_punctuation = [
        ascii::EXCLAMATION,
        ascii::NUM,
        ascii::DOLLAR,
        ascii::PERCENT,
        ascii::AMPERSAND,
        ascii::SQUOTE,
        ascii::ASTERISK,
        ascii::PLUS,
        ascii::MINUS,
        ascii::PERIOD,
        ascii::CARRET,
        ascii::UNDERSCORE,
        ascii::GRAVE,
        ascii::LEFT_CURLY,
        ascii::PIPE,
        ascii::RIGHT_CURLY,
        ascii::TILDE,
    ];

    is_alphanumeric(c) || allowed_punctuation.contains(&c)
}
/// MIME token
///
/// Skips optional comments/folding whitespace on both sides and returns the
/// run of token bytes in between.
///
/// `[CFWS] 1*token_text [CFWS]`
pub fn mime_token(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let (input, _) = opt(cfws)(input)?;
    let (input, token) = take_while1(is_mime_token_text)(input)?;
    let (input, _) = opt(cfws)(input)?;
    Ok((input, token))
}
/// Atom text characters
///
/// A byte is atom text when it is alphanumeric or one of the authorized
/// punctuation marks.
///
/// authorized: !#$%&'*+-/=?^_`{|}~
fn is_atext(c: u8) -> bool {
    let authorized_punctuation = [
        ascii::EXCLAMATION,
        ascii::NUM,
        ascii::DOLLAR,
        ascii::PERCENT,
        ascii::AMPERSAND,
        ascii::SQUOTE,
        ascii::ASTERISK,
        ascii::PLUS,
        ascii::MINUS,
        ascii::SLASH,
        ascii::EQ,
        ascii::QUESTION,
        ascii::CARRET,
        ascii::UNDERSCORE,
        ascii::GRAVE,
        ascii::LEFT_CURLY,
        ascii::PIPE,
        ascii::RIGHT_CURLY,
        ascii::TILDE,
    ];

    is_alphanumeric(c) || authorized_punctuation.contains(&c)
}
/// Atom
///
/// Skips optional comments/folding whitespace on both sides and returns the
/// run of atom text in between.
///
/// `[CFWS] 1*atext [CFWS]`
pub fn atom(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let (input, _) = opt(cfws)(input)?;
    let (input, text) = take_while1(is_atext)(input)?;
    let (input, _) = opt(cfws)(input)?;
    Ok((input, text))
}
/// dot-atom-text
///
/// One run of atom text followed by any number of `.`-prefixed runs; the
/// whole matched span (dots included) is returned.
///
/// `1*atext *("." 1*atext)`
pub fn dot_atom_text(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let first_label = take_while1(is_atext);
    let dotted_labels = many0(pair(tag("."), take_while1(is_atext)));

    recognize(pair(first_label, dotted_labels))(input)
}
/// dot-atom
///
/// `[CFWS] dot-atom-text [CFWS]`
pub fn dot_atom(input: &[u8]) -> IResult<&[u8], &[u8]> {
delimited(opt(cfws), dot_atom_text, opt(cfws))(input)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks the atom text predicate on single bytes.
    #[test]
    fn test_atext() {
        assert!(is_atext(b'='));
        assert!(is_atext(b'5'));
        assert!(is_atext(b'Q'));
        assert!(!is_atext(b' '));
        // UTF-8 is not supported yet: `is_atext` only sees single bytes.
        //assert!(is_atext('É')); // support utf8
    }

    /// Surrounding comments and whitespace are dropped around an atom.
    #[test]
    fn test_atom() {
        assert_eq!(
            atom(b"(skip) imf_codec (hidden) aerogramme"),
            Ok((&b"aerogramme"[..], &b"imf_codec"[..]))
        );
    }

    /// The parsers work on bytes, so inputs and expectations are byte slices.
    #[test]
    fn test_dot_atom_text() {
        assert_eq!(
            dot_atom_text(b"quentin.dufour.io abcdef"),
            Ok((&b" abcdef"[..], &b"quentin.dufour.io"[..]))
        );
    }

    /// Same as above, with leading/trailing comments and whitespace consumed.
    #[test]
    fn test_dot_atom() {
        assert_eq!(
            dot_atom(b" (skip) quentin.dufour.io abcdef"),
            Ok((&b"abcdef"[..], &b"quentin.dufour.io"[..]))
        );
    }
}

View file

@ -1,129 +0,0 @@
use imf_codec::fragments::section;
use imf_codec::multipass;
use std::collections::HashSet;
use std::fs::File;
use std::io::Read;
use std::path::PathBuf;
use walkdir::WalkDir;
/// Run every multipass stage on `input` and pass the `fields` of the final
/// section to `func`.
///
/// Panics when `multipass::segment::new` or the `fields()` stage fails, as
/// both results are unwrapped.
fn parser<'a, F>(input: &'a [u8], func: F)
where
    F: FnOnce(&section::Section),
{
    // Each stage is kept in its own binding so the next one can use it.
    let stage_segment = multipass::segment::new(input).unwrap();
    let stage_charset = stage_segment.charset();
    let stage_fields = stage_charset.fields().unwrap();
    let stage_names = stage_fields.names();
    let stage_body = stage_names.body();
    let stage_section = stage_body.section();

    func(&stage_section.fields)
}
/// Parses every file found under `resources/enron/maildir/` and checks that
/// each one yields a date, a sender and no bad field, except for the mails
/// listed as known exceptions. Marked `#[ignore]`, so it only runs on demand.
#[test]
#[ignore]
fn test_enron500k() {
let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
d.push("resources/enron/maildir/");
// Length of the corpus root path, used below to cut it off every mail path.
let prefix_sz = d.as_path().to_str().unwrap().len();
//d.push("williams-w3/");
// Mails (paths relative to the corpus root) for which a non-empty
// `bad_fields` is tolerated; each trailing comment quotes the offending header.
let known_bad_fields = HashSet::from([
"white-s/calendar/113.", // To: east <7..>
"skilling-j/inbox/223.", // From: pep <performance.>
"jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com>
"jones-t/notes_inbox/3303.", // To: <"tibor.vizkelety":@enron.com>
"lokey-t/calendar/33.", // A second Date entry for the calendar containing
// Date: Monday, March 12
"zipper-a/inbox/199.", // To: e-mail <mari.>
"dasovich-j/deleted_items/128.", // To: f62489 <g>
"dasovich-j/all_documents/677.", // To: w/assts <govt.>
"dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com>
"dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com>
"dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com>
"dasovich-j/all_documents/578.", // To: w/assts <govt.>
"dasovich-j/all_documents/3148.", // To: <"economist.com.readers":@enron.com>
"dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com>
"dasovich-j/risk_analytics/3.", // To: w/assts <govt.>
"dasovich-j/notes_inbox/5391.", // To: <"ft.com.users":@enron.com>
"dasovich-j/notes_inbox/4952.", // To: <"economist.com.reader":@enron.com>
"dasovich-j/notes_inbox/2386.", // To: <"ft.com.users":@enron.com>
"dasovich-j/notes_inbox/1706.", // To: <"ft.com.users":@enron.com>
"dasovich-j/notes_inbox/1489.", // To: <"economist.com.readers":@enron.com>
"dasovich-j/notes_inbox/5.", // To: w/assts <govt.>
"kaminski-v/sites/19.", // To: <"the.desk":@enron.com>
"kaminski-v/sites/1.", // To: <"the.desk":@enron.com>
"kaminski-v/discussion_threads/5082.", // To: <"ft.com.users":@enron.com>
"kaminski-v/discussion_threads/4046.", // To: <"the.desk":@enron.com>
"kaminski-v/discussion_threads/4187.", // To: <"the.desk":@enron.com>
"kaminski-v/discussion_threads/8068.", // To: cats <breaktkhrough.>, risk <breakthrough.>, leaders <breaktkhrough.>
"kaminski-v/discussion_threads/7980.", // To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
"kaminski-v/all_documents/5970.", //To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
"kaminski-v/all_documents/5838.", // To + Cc: dogs <breakthrough.>, breakthrough.adm@enron.com, breakthrough.adm@enron.com,\r\n\tbreakthrough.adm@enron.com
"kaminski-v/all_documents/10070.", // To: <"ft.com.users":@enron.com>
"kaminski-v/all_documents/92.", // To: <"the.desk":@enron.com>
"kaminski-v/all_documents/276.", // To: <"the.desk":@enron.com>
"kaminski-v/technical/1.", // To: <"the.desk":@enron.com>
"kaminski-v/technical/7.", // To: <"the.desk":@enron.com>
"kaminski-v/notes_inbox/140.", // To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
"kaminski-v/notes_inbox/95.", // To + CC failed: cats <breaktkhrough.>, risk <breakthrough.>, leaders <breaktkhrough.>
"kean-s/archiving/untitled/1232.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
"kean-s/archiving/untitled/1688.", // To: w/assts <govt.>
"kean-s/sent/198.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
"kean-s/reg_risk/9.", // To: w/assts <govt.>
"kean-s/discussion_threads/950.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
"kean-s/discussion_threads/577.", // To: w/assts <govt.>
"kean-s/calendar/untitled/1096.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
"kean-s/calendar/untitled/640.", // To: w/assts <govt.>
"kean-s/all_documents/640.", // To: w/assts <govt.>
"kean-s/all_documents/1095.", // To: w/assts <govt.>
"kean-s/attachments/2030.", // To: w/assts <govt.>
"williams-w3/operations_committee_isas/10.", // To: z34655 <m>
]);
// Mails for which an empty `from` list is tolerated.
let known_bad_from = HashSet::from([
"skilling-j/inbox/223.", // From: pep <performance.>
]);
// Number of mails analyzed so far, only used for progress output.
let mut i = 0;
// Directory entries that cannot be read are silently dropped by `filter_map`.
for entry in WalkDir::new(d.as_path())
.into_iter()
.filter_map(|file| file.ok())
{
if entry.metadata().unwrap().is_file() {
let mail_path = entry.path();
// Path relative to the corpus root: the key used in the two sets above.
let suffix = &mail_path.to_str().unwrap()[prefix_sz..];
// read file
let mut raw = Vec::new();
let mut f = File::open(mail_path).unwrap();
f.read_to_end(&mut raw).unwrap();
// parse
parser(&raw, |hdrs| {
let ok_date = hdrs.date.is_some();
let ok_from = hdrs.from.len() > 0;
let ok_fields = hdrs.bad_fields.len() == 0;
// Report the mail before asserting so the failing file is visible.
if !ok_date || !ok_from || !ok_fields {
println!("Issue with: {}", suffix);
}
// A date is required for every mail, without exception.
assert!(ok_date);
if !known_bad_from.contains(suffix) {
assert!(ok_from);
}
if !known_bad_fields.contains(suffix) {
assert!(ok_fields);
}
i += 1;
// Progress report every 1000 mails.
if i % 1000 == 0 {
println!("Analyzed emails: {}", i);
}
})
}
}
}

View file

@ -1,340 +0,0 @@
use chrono::{FixedOffset, TimeZone};
use imf_codec::fragments::{misc_token, model, section, part, trace};
use imf_codec::multipass;
use std::collections::HashMap;
/// Run every multipass stage on `input` and pass the `fields` of the final
/// section to `func`.
///
/// Panics when `multipass::segment::new` or the `fields()` stage fails, as
/// both results are unwrapped.
fn parser<'a, F>(input: &'a [u8], func: F)
where
    F: FnOnce(&section::Section),
{
    // Each stage is kept in its own binding so the next one can use it.
    let stage_segment = multipass::segment::new(input).unwrap();
    let stage_charset = stage_segment.charset();
    let stage_fields = stage_charset.fields().unwrap();
    let stage_names = stage_fields.names();
    let stage_body = stage_names.body();
    let stage_section = stage_body.section();

    func(&stage_section.fields)
}
/// Parses a complete message and compares the resulting header section with
/// the expected value, field by field. The input mixes well-formed headers,
/// repeated headers, a header name followed by a space (`comments :`) and
/// lines that are expected to end up in `unparsed`.
#[test]
fn test_headers() {
// Raw message used as parser input; its content is part of the test data.
let fullmail: &[u8] = r#"Return-Path: <gitlab@example.com>
Delivered-To: quentin@example.com
Received: from smtp.example.com ([10.83.2.2])
by doradille with LMTP
id xyzabcd
(envelope-from <gitlab@example.com>)
for <quentin@example.com>; Tue, 13 Jun 2023 19:01:08 +0000
Date: Tue, 13 Jun 2023 10:01:10 +0200
From: Mary Smith
<mary@example.net>, "A\lan" <alan@example>
Sender: imf@example.com
Reply-To: "Mary Smith: Personal Account" <smith@home.example>
To: John Doe <jdoe@machine.example>
Cc: imf2@example.com
Bcc: (hidden)
Subject: Re: Saying Hello
Comments: A simple message
Comments: Not that complicated
comments : not valid header name but should be accepted
by the parser.
Keywords: hello, world
Héron: Raté
Raté raté
Keywords: salut, le, monde
Not a real header but should still recover
Message-ID: <3456@example.net>
In-Reply-To: <1234@local.machine.example>
References: <1234@local.machine.example>
Unknown: unknown
This is a reply to your hello.
"#
.as_bytes();
parser(fullmail, |parsed_section| {
assert_eq!(
parsed_section,
&section::Section {
// The `Date:` header carries a +0200 offset, hence `east_opt(2 * 3600)`.
date: Some(
&FixedOffset::east_opt(2 * 3600)
.unwrap()
.with_ymd_and_hms(2023, 06, 13, 10, 01, 10)
.unwrap()
),
// `From:` holds two mailboxes; the quoted pair in "A\lan" is expected
// to come out as "Alan".
from: vec![
&model::MailboxRef {
name: Some("Mary Smith".into()),
addrspec: model::AddrSpec {
local_part: "mary".into(),
domain: "example.net".into(),
}
},
&model::MailboxRef {
name: Some("Alan".into()),
addrspec: model::AddrSpec {
local_part: "alan".into(),
domain: "example".into(),
}
}
],
sender: Some(&model::MailboxRef {
name: None,
addrspec: model::AddrSpec {
local_part: "imf".into(),
domain: "example.com".into(),
}
}),
reply_to: vec![&model::AddressRef::Single(model::MailboxRef {
name: Some("Mary Smith: Personal Account".into()),
addrspec: model::AddrSpec {
local_part: "smith".into(),
domain: "home.example".into(),
}
})],
to: vec![&model::AddressRef::Single(model::MailboxRef {
name: Some("John Doe".into()),
addrspec: model::AddrSpec {
local_part: "jdoe".into(),
domain: "machine.example".into(),
}
})],
cc: vec![&model::AddressRef::Single(model::MailboxRef {
name: None,
addrspec: model::AddrSpec {
local_part: "imf2".into(),
domain: "example.com".into(),
}
})],
// `Bcc: (hidden)` is expected to yield no address at all.
bcc: vec![],
msg_id: Some(&model::MessageId {
left: "3456",
right: "example.net"
}),
in_reply_to: vec![&model::MessageId {
left: "1234",
right: "local.machine.example"
}],
references: vec![&model::MessageId {
left: "1234",
right: "local.machine.example"
}],
subject: Some(&misc_token::Unstructured("Re: Saying Hello".into())),
// All three comment headers are expected, including the `comments :` one.
comments: vec![
&misc_token::Unstructured("A simple message".into()),
&misc_token::Unstructured("Not that complicated".into()),
&misc_token::Unstructured(
"not valid header name but should be accepted by the parser.".into()
),
],
// One phrase list per `Keywords:` header, in order of appearance.
keywords: vec![
&misc_token::PhraseList(vec!["hello".into(), "world".into(),]),
&misc_token::PhraseList(vec!["salut".into(), "le".into(), "monde".into(),]),
],
received: vec![&trace::ReceivedLog(
r#"from smtp.example.com ([10.83.2.2])
by doradille with LMTP
id xyzabcd
(envelope-from <gitlab@example.com>)
for <quentin@example.com>"#
)],
return_path: vec![&model::MailboxRef {
name: None,
addrspec: model::AddrSpec {
local_part: "gitlab".into(),
domain: "example.com".into(),
}
}],
// Headers without a dedicated field are expected as unstructured values.
optional: HashMap::from([
(
"Delivered-To",
&misc_token::Unstructured("quentin@example.com".into())
),
("Unknown", &misc_token::Unstructured("unknown".into())),
]),
bad_fields: vec![],
// Lines expected to be kept verbatim instead of being parsed as headers.
unparsed: vec![
"Héron: Raté\n Raté raté\n",
"Not a real header but should still recover\n",
],
..section::Section::default()
}
)
})
}
/// Parses a message whose address and subject headers contain encoded words
/// (`=?charset?Q?...?=` / `=?charset?B?...?=`) and checks both the decoded
/// values and the MIME headers collected in the `mime` sub-section.
#[test]
fn test_headers_mime() {
use imf_codec::fragments::mime;
// Raw message used as parser input; its content is part of the test data.
let fullmail: &[u8] = r#"From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>
To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>
CC: =?ISO-8859-1?Q?Andr=E9?= Pirard <PIRARD@vm1.ulg.ac.be>
Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
MIME-Version: 1.0
Content-Type: text/plain; charset=ISO-8859-1
Content-Transfer-Encoding: quoted-printable
Content-ID: <a@example.com>
Content-Description: hello
Now's the time =
for all folk to come=
to the aid of their country.
"#
.as_bytes();
parser(fullmail, |parsed_section| {
assert_eq!(
parsed_section,
&section::Section {
// Display names are expected in decoded form.
from: vec![
&model::MailboxRef {
name: Some("Keith Moore".into()),
addrspec: model::AddrSpec {
local_part: "moore".into(),
domain: "cs.utk.edu".into(),
}
},
],
to: vec![&model::AddressRef::Single(model::MailboxRef {
name: Some("Keld Jørn Simonsen".into()),
addrspec: model::AddrSpec {
local_part: "keld".into(),
domain: "dkuug.dk".into(),
}
})],
cc: vec![&model::AddressRef::Single(model::MailboxRef {
name: Some("André Pirard".into()),
addrspec: model::AddrSpec {
local_part: "PIRARD".into(),
domain: "vm1.ulg.ac.be".into(),
}
})],
// The two encoded words of the subject are expected to be joined
// into a single sentence.
subject: Some(&misc_token::Unstructured("If you can read this you understand the example.".into())),
mime_version: Some(&mime::Version{ major: 1, minor: 0 }),
// Expected values for the `Content-*` headers of the input.
mime: section::MIMESection {
content_type: Some(&mime::Type::Text(mime::TextDesc {
charset: Some(mime::EmailCharset::ISO_8859_1),
subtype: mime::TextSubtype::Plain,
unknown_parameters: vec![]
})),
content_transfer_encoding: Some(&mime::Mechanism::QuotedPrintable),
content_id: Some(&model::MessageId {
left: "a",
right: "example.com"
}),
content_description: Some(&misc_token::Unstructured("hello".into())),
..section::MIMESection::default()
},
..section::Section::default()
}
);
})
}
/// Run every multipass stage on `input`, including `body_structure()`, and
/// pass the resulting `body` node to `func`.
///
/// Panics when `multipass::segment::new` or the `fields()` stage fails, as
/// both results are unwrapped.
fn parser_bodystruct<'a, F>(input: &'a [u8], func: F)
where
    F: FnOnce(&part::PartNode),
{
    // Each stage is kept in its own binding so the next one can use it.
    let stage_segment = multipass::segment::new(input).unwrap();
    let stage_charset = stage_segment.charset();
    let stage_fields = stage_charset.fields().unwrap();
    let stage_names = stage_fields.names();
    let stage_body = stage_names.body();
    let stage_section = stage_body.section();
    let stage_structure = stage_section.body_structure();

    func(&stage_structure.body)
}
/// Parses a `multipart/alternative` message and checks that the body
/// structure is a composite node holding two discrete parts, each carrying
/// the raw bytes of its part body.
#[test]
fn test_multipart() {
// Raw message used as parser input; its content is part of the test data.
let fullmail: &[u8] = r#"Date: Sat, 8 Jul 2023 07:14:29 +0200
From: Grrrnd Zero <grrrndzero@example.org>
To: John Doe <jdoe@machine.example>
Subject: Re: Saying Hello
Message-ID: <NTAxNzA2AC47634Y366BAMTY4ODc5MzQyODY0ODY5@www.grrrndzero.org>
MIME-Version: 1.0
Content-Type: multipart/alternative;
boundary="b1_e376dc71bafc953c0b0fdeb9983a9956"
Content-Transfer-Encoding: 7bit
This is a multi-part message in MIME format.
--b1_e376dc71bafc953c0b0fdeb9983a9956
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: quoted-printable
GZ
OoOoO
oOoOoOoOo
oOoOoOoOoOoOoOoOo
oOoOoOoOoOoOoOoOoOoOoOo
oOoOoOoOoOoOoOoOoOoOoOoOoOoOo
OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO
--b1_e376dc71bafc953c0b0fdeb9983a9956
Content-Type: text/html; charset=us-ascii
<div style="text-align: center;"><strong>GZ</strong><br />
OoOoO<br />
oOoOoOoOo<br />
oOoOoOoOoOoOoOoOo<br />
oOoOoOoOoOoOoOoOoOoOoOo<br />
oOoOoOoOoOoOoOoOoOoOoOoOoOoOo<br />
OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO<br />
--b1_e376dc71bafc953c0b0fdeb9983a9956--
"#.as_bytes();
parser_bodystruct(fullmail, |part| {
// Part headers are compared against their default value here; only the
// tree shape and the part bodies are spelled out.
assert_eq!(part, &part::PartNode::Composite(
part::PartHeader {
..part::PartHeader::default()
},
vec![
// First part of the input: the text/plain alternative.
part::PartNode::Discrete(
part::PartHeader {
..part::PartHeader::default()
},
r#"GZ
OoOoO
oOoOoOoOo
oOoOoOoOoOoOoOoOo
oOoOoOoOoOoOoOoOoOoOoOo
oOoOoOoOoOoOoOoOoOoOoOoOoOoOo
OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO"#.as_bytes()
),
// Second part of the input: the text/html alternative.
part::PartNode::Discrete(
part::PartHeader {
..part::PartHeader::default()
},
r#"<div style="text-align: center;"><strong>GZ</strong><br />
OoOoO<br />
oOoOoOoOo<br />
oOoOoOoOoOoOoOoOo<br />
oOoOoOoOoOoOoOoOoOoOoOo<br />
oOoOoOoOoOoOoOoOoOoOoOoOoOoOo<br />
OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO<br />"#.as_bytes()
),
]));
});
}