wip refactor
This commit is contained in:
parent
23c663b943
commit
a503eb1de6
35 changed files with 746 additions and 1038 deletions
|
@ -1,129 +0,0 @@
|
|||
use nom::{
|
||||
branch::alt,
|
||||
bytes::complete::{tag, take_while1},
|
||||
character::complete::space0,
|
||||
combinator::{into, map, opt},
|
||||
multi::{many0, many1, separated_list1},
|
||||
sequence::tuple,
|
||||
IResult,
|
||||
};
|
||||
use std::borrow::Cow;
|
||||
|
||||
use crate::error::IMFError;
|
||||
use crate::fragments::lazy;
|
||||
use crate::fragments::quoted::quoted_string;
|
||||
use crate::fragments::whitespace::{fws, is_obs_no_ws_ctl};
|
||||
use crate::fragments::words::{atom, is_vchar};
|
||||
use crate::fragments::encoding::encoded_word;
|
||||
|
||||
/// Decoded body of an unstructured header field, stored as owned text.
#[derive(Default, PartialEq, Debug)]
pub struct Unstructured(pub String);
|
||||
|
||||
/// Ordered list of decoded phrases, each one stored as owned text.
#[derive(Default, PartialEq, Debug)]
pub struct PhraseList(pub Vec<String>);
|
||||
|
||||
impl<'a> TryFrom<&'a lazy::Unstructured<'a>> for Unstructured {
|
||||
type Error = IMFError<'a>;
|
||||
|
||||
fn try_from(input: &'a lazy::Unstructured<'a>) -> Result<Self, Self::Error> {
|
||||
unstructured(input.0)
|
||||
.map(|(_, v)| Unstructured(v))
|
||||
.map_err(|e| IMFError::Unstructured(e))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TryFrom<&'a lazy::PhraseList<'a>> for PhraseList {
|
||||
type Error = IMFError<'a>;
|
||||
|
||||
fn try_from(p: &'a lazy::PhraseList<'a>) -> Result<Self, Self::Error> {
|
||||
separated_list1(tag(","), phrase)(p.0)
|
||||
.map(|(_, q)| PhraseList(q))
|
||||
.map_err(|e| IMFError::PhraseList(e))
|
||||
}
|
||||
}
|
||||
|
||||
/// Word
|
||||
///
|
||||
/// ```abnf
|
||||
/// word = atom / quoted-string
|
||||
/// ```
|
||||
pub fn word(input: &str) -> IResult<&str, Cow<str>> {
|
||||
alt((into(quoted_string), into(encoded_word), into(atom)))(input)
|
||||
}
|
||||
|
||||
/// Phrase
|
||||
///
|
||||
/// ```abnf
|
||||
/// phrase = 1*word / obs-phrase
|
||||
/// ```
|
||||
pub fn phrase(input: &str) -> IResult<&str, String> {
|
||||
let (input, words) = many1(word)(input)?;
|
||||
let phrase = words.join(" ");
|
||||
Ok((input, phrase))
|
||||
}
|
||||
|
||||
/// Compatible unstructured input
|
||||
///
|
||||
/// ```abnf
|
||||
/// obs-utext = %d0 / obs-NO-WS-CTL / VCHAR
|
||||
/// ```
|
||||
fn is_unstructured(c: char) -> bool {
|
||||
is_vchar(c) || is_obs_no_ws_ctl(c) || c == '\x00'
|
||||
}
|
||||
|
||||
enum UnstrToken {
|
||||
Init,
|
||||
Encoded,
|
||||
Plain,
|
||||
}
|
||||
|
||||
/// Unstructured header field body
|
||||
///
|
||||
/// ```abnf
|
||||
/// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct
|
||||
/// ```
|
||||
pub fn unstructured(input: &str) -> IResult<&str, String> {
|
||||
let (input, r) = many0(tuple((opt(fws), alt((
|
||||
map(encoded_word, |v| (Cow::Owned(v), UnstrToken::Encoded)),
|
||||
map(take_while1(is_unstructured), |v| (Cow::Borrowed(v), UnstrToken::Plain)),
|
||||
)))))(input)?;
|
||||
|
||||
let (input, _) = space0(input)?;
|
||||
|
||||
// Try to optimize for the most common cases
|
||||
let body = match r.as_slice() {
|
||||
// Optimization when there is only one line
|
||||
[(None, (content, _))] | [(_, (content, UnstrToken::Encoded))] => content.to_string(),
|
||||
[(Some(_), (content, _))] => " ".to_string() + content,
|
||||
// Generic case, with multiple lines
|
||||
lines => lines.iter().fold(
|
||||
(&UnstrToken::Init, String::with_capacity(255)),
|
||||
|(prev_token, result), (may_ws, (content, current_token))| {
|
||||
let new_res = match (may_ws, prev_token, current_token) {
|
||||
(_, UnstrToken::Encoded, UnstrToken::Encoded) | (None, _, _) => result + content,
|
||||
_ => result + " " + content,
|
||||
};
|
||||
(current_token, new_res)
|
||||
}).1,
|
||||
};
|
||||
|
||||
Ok((input, body))
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Each case is `(raw input, expected remaining input, expected phrase)`.
    #[test]
    fn test_phrase() {
        let cases = [
            ("hello world", "", "hello world"),
            ("salut \"le\" monde", "", "salut le monde"),
            ("fin\r\n du\r\nmonde", "\r\nmonde", "fin du"),
        ];
        for (raw, rest, parsed) in cases.iter().copied() {
            assert_eq!(phrase(raw), Ok((rest, parsed.to_string())));
        }
    }
}
|
|
@ -1,23 +0,0 @@
|
|||
// Model
|
||||
pub mod model;
|
||||
|
||||
// Generic
|
||||
pub mod misc_token;
|
||||
mod quoted;
|
||||
pub mod whitespace;
|
||||
mod words;
|
||||
|
||||
// Header specific
|
||||
mod address;
|
||||
mod datetime;
|
||||
pub mod eager;
|
||||
mod identification;
|
||||
pub mod lazy;
|
||||
mod mailbox;
|
||||
pub mod section;
|
||||
pub mod trace;
|
||||
|
||||
// MIME related
|
||||
pub mod mime;
|
||||
pub mod encoding;
|
||||
pub mod part;
|
|
@ -1,146 +0,0 @@
|
|||
use chrono::{DateTime, FixedOffset};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// An address specification made of a local part and a domain.
#[derive(PartialEq, Debug)]
pub struct AddrSpec {
    /// Text rendered before the `@` by [`AddrSpec::fully_qualified`].
    pub local_part: String,
    /// Text rendered after the `@` by [`AddrSpec::fully_qualified`].
    pub domain: String,
}

impl AddrSpec {
    /// Renders the address as `local_part@domain`.
    pub fn fully_qualified(&self) -> String {
        let mut rendered = String::with_capacity(self.local_part.len() + 1 + self.domain.len());
        rendered.push_str(&self.local_part);
        rendered.push('@');
        rendered.push_str(&self.domain);
        rendered
    }
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct MailboxRef {
|
||||
// The actual "email address" like hello@example.com
|
||||
pub addrspec: AddrSpec,
|
||||
pub name: Option<String>,
|
||||
}
|
||||
impl From<AddrSpec> for MailboxRef {
|
||||
fn from(addr: AddrSpec) -> Self {
|
||||
MailboxRef {
|
||||
name: None,
|
||||
addrspec: addr,
|
||||
}
|
||||
}
|
||||
}
|
||||
pub type MailboxList = Vec<MailboxRef>;
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct GroupRef {
|
||||
pub name: String,
|
||||
pub participants: Vec<MailboxRef>,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum AddressRef {
|
||||
Single(MailboxRef),
|
||||
Many(GroupRef),
|
||||
}
|
||||
impl From<MailboxRef> for AddressRef {
|
||||
fn from(mx: MailboxRef) -> Self {
|
||||
AddressRef::Single(mx)
|
||||
}
|
||||
}
|
||||
impl From<GroupRef> for AddressRef {
|
||||
fn from(grp: GroupRef) -> Self {
|
||||
AddressRef::Many(grp)
|
||||
}
|
||||
}
|
||||
pub type AddressList = Vec<AddressRef>;
|
||||
|
||||
/// A message identifier split in two borrowed halves.
#[derive(PartialEq, Debug)]
pub struct MessageId<'a> {
    /// Left half of the identifier.
    pub left: &'a str,
    /// Right half of the identifier.
    pub right: &'a str,
}

/// A list of message identifiers.
pub type MessageIdList<'a> = Vec<MessageId<'a>>;
|
||||
|
||||
/// Outcome of decoding the body of a header field.
#[derive(PartialEq, Debug)]
pub enum FieldBody<'a, T> {
    /// The body was decoded into a `T`.
    Correct(T),
    /// The body could not be decoded; carries a borrowed string
    /// (presumably the raw body text, to be confirmed against the parser).
    Failed(&'a str),
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum Field<'a> {
|
||||
// 3.6.1. The Origination Date Field
|
||||
Date(FieldBody<'a, Option<DateTime<FixedOffset>>>),
|
||||
|
||||
// 3.6.2. Originator Fields
|
||||
From(FieldBody<'a, Vec<MailboxRef>>),
|
||||
Sender(FieldBody<'a, MailboxRef>),
|
||||
ReplyTo(FieldBody<'a, Vec<AddressRef>>),
|
||||
|
||||
// 3.6.3. Destination Address Fields
|
||||
To(FieldBody<'a, Vec<AddressRef>>),
|
||||
Cc(FieldBody<'a, Vec<AddressRef>>),
|
||||
Bcc(FieldBody<'a, Vec<AddressRef>>),
|
||||
|
||||
// 3.6.4. Identification Fields
|
||||
MessageID(FieldBody<'a, MessageId<'a>>),
|
||||
InReplyTo(FieldBody<'a, Vec<MessageId<'a>>>),
|
||||
References(FieldBody<'a, Vec<MessageId<'a>>>),
|
||||
|
||||
// 3.6.5. Informational Fields
|
||||
Subject(FieldBody<'a, String>),
|
||||
Comments(FieldBody<'a, String>),
|
||||
Keywords(FieldBody<'a, Vec<String>>),
|
||||
|
||||
// 3.6.6 Resent Fields (not implemented)
|
||||
// 3.6.7 Trace Fields
|
||||
Received(FieldBody<'a, &'a str>),
|
||||
ReturnPath(FieldBody<'a, Option<MailboxRef>>),
|
||||
|
||||
// 3.6.8. Optional Fields
|
||||
Optional(&'a str, String),
|
||||
|
||||
// None
|
||||
Rescue(&'a str),
|
||||
}
|
||||
|
||||
/// Permissive Header Section
|
||||
///
|
||||
/// This is a structure intended for parsing/decoding,
|
||||
/// hence it's support cases where the email is considered
|
||||
/// as invalid according to RFC5322 but for which we can
|
||||
/// still extract some data.
|
||||
#[derive(Debug, PartialEq, Default)]
|
||||
pub struct HeaderSection<'a> {
|
||||
// 3.6.1. The Origination Date Field
|
||||
pub date: Option<DateTime<FixedOffset>>,
|
||||
|
||||
// 3.6.2. Originator Fields
|
||||
pub from: Vec<MailboxRef>,
|
||||
pub sender: Option<MailboxRef>,
|
||||
pub reply_to: Vec<AddressRef>,
|
||||
|
||||
// 3.6.3. Destination Address Fields
|
||||
pub to: Vec<AddressRef>,
|
||||
pub cc: Vec<AddressRef>,
|
||||
pub bcc: Vec<AddressRef>,
|
||||
|
||||
// 3.6.4. Identification Fields
|
||||
pub msg_id: Option<MessageId<'a>>,
|
||||
pub in_reply_to: Vec<MessageId<'a>>,
|
||||
pub references: Vec<MessageId<'a>>,
|
||||
|
||||
// 3.6.5. Informational Fields
|
||||
pub subject: Option<String>,
|
||||
pub comments: Vec<String>,
|
||||
pub keywords: Vec<String>,
|
||||
|
||||
// 3.6.6 Not implemented
|
||||
// 3.6.7 Trace Fields
|
||||
pub return_path: Vec<MailboxRef>,
|
||||
pub received: Vec<&'a str>,
|
||||
|
||||
// 3.6.8. Optional Fields
|
||||
pub optional: HashMap<&'a str, String>,
|
||||
|
||||
// Recovery
|
||||
pub bad_fields: Vec<Field<'a>>,
|
||||
pub unparsed: Vec<&'a str>,
|
||||
}
|
|
@ -1,116 +0,0 @@
|
|||
use crate::fragments::whitespace::cfws;
|
||||
use nom::{
|
||||
bytes::complete::{tag, take_while1},
|
||||
combinator::{opt, recognize},
|
||||
multi::many0,
|
||||
sequence::{delimited, pair},
|
||||
IResult,
|
||||
};
|
||||
|
||||
/// VCHAR definition
///
/// Accepts the ASCII characters `0x21..=0x7E` as well as every non-ASCII
/// character.
pub fn is_vchar(c: char) -> bool {
    matches!(c, '\x21'..='\x7E') || !c.is_ascii()
}
|
||||
|
||||
/// Sequence of visible chars with the UTF-8 extension
|
||||
///
|
||||
/// ```abnf
|
||||
/// VCHAR = %x21-7E
|
||||
/// ; visible (printing) characters
|
||||
/// VCHAR =/ UTF8-non-ascii
|
||||
/// SEQ = 1*VCHAR
|
||||
///```
|
||||
#[allow(dead_code)]
|
||||
pub fn vchar_seq(input: &str) -> IResult<&str, &str> {
|
||||
take_while1(is_vchar)(input)
|
||||
}
|
||||
|
||||
/// Atom allowed characters
///
/// ASCII letters and digits, the punctuation listed below, and every
/// non-ASCII character.
fn is_atext(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' => true,
        '!' | '#' | '$' | '%' | '&' | '\'' | '*' | '+' | '-' | '/' | '=' | '?' | '^' | '_'
        | '`' | '{' | '|' | '}' | '~' => true,
        other => !other.is_ascii(),
    }
}
|
||||
|
||||
/// Atom
|
||||
///
|
||||
/// `[CFWS] 1*atext [CFWS]`
|
||||
pub fn atom(input: &str) -> IResult<&str, &str> {
|
||||
delimited(opt(cfws), take_while1(is_atext), opt(cfws))(input)
|
||||
}
|
||||
|
||||
/// dot-atom-text
|
||||
///
|
||||
/// `1*atext *("." 1*atext)`
|
||||
pub fn dot_atom_text(input: &str) -> IResult<&str, &str> {
|
||||
recognize(pair(
|
||||
take_while1(is_atext),
|
||||
many0(pair(tag("."), take_while1(is_atext))),
|
||||
))(input)
|
||||
}
|
||||
|
||||
/// dot-atom
|
||||
///
|
||||
/// `[CFWS] dot-atom-text [CFWS]`
|
||||
pub fn dot_atom(input: &str) -> IResult<&str, &str> {
|
||||
delimited(opt(cfws), dot_atom_text, opt(cfws))(input)
|
||||
}
|
||||
|
||||
/// Returns `true` for the characters `( ) < > [ ] : ; @ \ , . "`.
#[allow(dead_code)]
pub fn is_special(c: char) -> bool {
    matches!(
        c,
        '(' | ')' | '<' | '>' | '[' | ']' | ':' | ';' | '@' | '\\' | ',' | '.' | '"'
    )
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The sequence stops at the first space; non-ASCII characters are kept.
    #[test]
    fn test_vchar_seq() {
        let cases = [
            ("hello world", (" world", "hello")),
            ("hello👋 world", (" world", "hello👋")),
        ];
        for (raw, expected) in cases.iter().copied() {
            assert_eq!(vchar_seq(raw), Ok(expected));
        }
    }

    #[test]
    fn test_atext() {
        for accepted in ['=', '5', 'Q'].iter().copied() {
            assert!(is_atext(accepted));
        }
        assert!(!is_atext(' '));
        assert!(is_atext('É')); // support utf8
    }

    /// Comments and whitespace around the atom are skipped.
    #[test]
    fn test_atom() {
        let parsed = atom("(skip)  imf_codec (hidden) aerogramme");
        assert_eq!(parsed, Ok(("aerogramme", "imf_codec")));
    }

    #[test]
    fn test_dot_atom_text() {
        let parsed = dot_atom_text("quentin.dufour.io abcdef");
        assert_eq!(parsed, Ok((" abcdef", "quentin.dufour.io")));
    }

    #[test]
    fn test_dot_atom() {
        let parsed = dot_atom("   (skip) quentin.dufour.io abcdef");
        assert_eq!(parsed, Ok(("abcdef", "quentin.dufour.io")));
    }
}
|
27
src/headers.rs
Normal file
27
src/headers.rs
Normal file
|
@ -0,0 +1,27 @@
|
|||
use nom::{
|
||||
self,
|
||||
combinator::{all_consuming, recognize},
|
||||
multi::many0,
|
||||
sequence::terminated,
|
||||
IResult,
|
||||
};
|
||||
|
||||
use crate::text::whitespace::{foldable_line, line, obs_crlf};
|
||||
|
||||
pub fn headers(input: &[u8]) -> IResult<&[u8], Vec<&[u8]>> {
|
||||
let (body, hdrs) = segment(input)?;
|
||||
let (_, fields) = fields(hdrs)?;
|
||||
Ok((body, fields))
|
||||
}
|
||||
|
||||
// -- part 1, segment
|
||||
fn segment(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
terminated(recognize(many0(line)), obs_crlf)(input)
|
||||
}
|
||||
|
||||
// -- part 2, isolate fields
|
||||
fn fields(input: &[u8]) -> IResult<&[u8], Vec<&[u8]>> {
|
||||
let (rest, parsed) = all_consuming(many0(foldable_line))(input)?;
|
||||
Ok((rest, parsed))
|
||||
}
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
pub mod error;
|
||||
pub mod fragments;
|
||||
pub mod multipass;
|
||||
//pub mod mime;
|
||||
//pub mod message;
|
||||
pub mod headers;
|
||||
pub mod text;
|
||||
|
|
|
@ -292,18 +292,6 @@ pub fn version(input: &str) -> IResult<&str, Version> {
|
|||
Ok((rest, Version { major, minor }))
|
||||
}
|
||||
|
||||
/// Token allowed characters
///
/// Any visible ASCII character (`0x21..=0x7E`) except
/// `( ) < > @ , ; : \ " / [ ] ? =`.
fn is_token_text(c: char) -> bool {
    c.is_ascii_graphic()
        && !matches!(
            c,
            '(' | ')' | '<' | '>' | '@' | ',' | ';' | ':' | '\\' | '"' | '/' | '[' | ']' | '?' | '='
        )
}
|
||||
|
||||
/// Token
|
||||
///
|
||||
/// `[CFWS] 1*token_text [CFWS]`
|
||||
pub fn token(input: &str) -> IResult<&str, &str> {
|
||||
delimited(opt(cfws), take_while1(is_token_text), opt(cfws))(input)
|
||||
}
|
||||
|
||||
pub fn parameter(input: &str) -> IResult<&str, Parameter> {
|
||||
let (rest, (pname, _, pvalue)) = tuple((
|
||||
token,
|
10
src/parse.rs
10
src/parse.rs
|
@ -1,8 +1,9 @@
|
|||
use imf_codec::fragments::section::Section;
|
||||
use imf_codec::multipass::segment;
|
||||
//use imf_codec::fragments::section::Section;
|
||||
//use imf_codec::multipass::segment;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
|
||||
/*
|
||||
fn parser<'a, F>(input: &'a [u8], func: F) -> ()
|
||||
where
|
||||
F: FnOnce(&Section) -> (),
|
||||
|
@ -15,9 +16,10 @@ where
|
|||
let section = field_body.section();
|
||||
|
||||
func(&section.fields);
|
||||
}
|
||||
}*/
|
||||
|
||||
fn main() {
|
||||
/*
|
||||
// Read full mail in memory
|
||||
let mut rawmail = Vec::new();
|
||||
io::stdin().lock().read_to_end(&mut rawmail).unwrap();
|
||||
|
@ -30,4 +32,6 @@ fn main() {
|
|||
assert!(section.from.len() > 0);
|
||||
assert!(section.bad_fields.len() == 0);
|
||||
});
|
||||
*/
|
||||
println!("hello world");
|
||||
}
|
||||
|
|
|
@ -11,9 +11,32 @@ use crate::error::IMFError;
|
|||
use crate::fragments::lazy;
|
||||
use crate::fragments::mailbox::mailbox;
|
||||
use crate::fragments::misc_token::phrase;
|
||||
use crate::fragments::model::{AddressList, AddressRef, GroupRef, MailboxList, MailboxRef};
|
||||
//use crate::fragments::model::{AddressList, AddressRef, GroupRef, MailboxList, MailboxRef};
|
||||
use crate::fragments::whitespace::cfws;
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct GroupRef {
|
||||
pub name: String,
|
||||
pub participants: Vec<MailboxRef>,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum AddressRef {
|
||||
Single(MailboxRef),
|
||||
Many(GroupRef),
|
||||
}
|
||||
impl From<MailboxRef> for AddressRef {
|
||||
fn from(mx: MailboxRef) -> Self {
|
||||
AddressRef::Single(mx)
|
||||
}
|
||||
}
|
||||
impl From<GroupRef> for AddressRef {
|
||||
fn from(grp: GroupRef) -> Self {
|
||||
AddressRef::Many(grp)
|
||||
}
|
||||
}
|
||||
pub type AddressList = Vec<AddressRef>;
|
||||
|
||||
impl<'a> TryFrom<&'a lazy::Mailbox<'a>> for MailboxRef {
|
||||
type Error = IMFError<'a>;
|
||||
|
|
@ -14,6 +14,14 @@ use crate::fragments::model::{MessageId, MessageIdList};
|
|||
use crate::fragments::whitespace::cfws;
|
||||
use crate::fragments::words::dot_atom_text;
|
||||
|
||||
|
||||
/// A message identifier split in two borrowed halves.
#[derive(PartialEq, Debug)]
pub struct MessageId<'a> {
    /// Left half of the identifier.
    pub left: &'a str,
    /// Right half of the identifier.
    pub right: &'a str,
}

/// A list of message identifiers.
pub type MessageIdList<'a> = Vec<MessageId<'a>>;
|
||||
|
||||
impl<'a> TryFrom<&'a lazy::Identifier<'a>> for MessageId<'a> {
|
||||
type Error = IMFError<'a>;
|
||||
|
|
@ -10,11 +10,37 @@ use nom::{
|
|||
use std::borrow::Cow;
|
||||
|
||||
use crate::fragments::misc_token::{phrase, word};
|
||||
use crate::fragments::model::{AddrSpec, MailboxRef};
|
||||
use crate::fragments::quoted::quoted_string;
|
||||
use crate::fragments::whitespace::{cfws, fws, is_obs_no_ws_ctl};
|
||||
use crate::fragments::words::{atom, dot_atom};
|
||||
|
||||
/// An address specification made of a local part and a domain.
#[derive(PartialEq, Debug)]
pub struct AddrSpec {
    /// Text rendered before the `@` by [`AddrSpec::fully_qualified`].
    pub local_part: String,
    /// Text rendered after the `@` by [`AddrSpec::fully_qualified`].
    pub domain: String,
}

impl AddrSpec {
    /// Renders the address as `local_part@domain`.
    pub fn fully_qualified(&self) -> String {
        let mut rendered = String::with_capacity(self.local_part.len() + 1 + self.domain.len());
        rendered.push_str(&self.local_part);
        rendered.push('@');
        rendered.push_str(&self.domain);
        rendered
    }
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct MailboxRef {
|
||||
// The actual "email address" like hello@example.com
|
||||
pub addrspec: AddrSpec,
|
||||
pub name: Option<String>,
|
||||
}
|
||||
impl From<AddrSpec> for MailboxRef {
|
||||
fn from(addr: AddrSpec) -> Self {
|
||||
MailboxRef {
|
||||
name: None,
|
||||
addrspec: addr,
|
||||
}
|
||||
}
|
||||
}
|
||||
pub type MailboxList = Vec<MailboxRef>;
|
||||
|
||||
/// Mailbox
|
||||
///
|
||||
/// ```abnf
|
142
src/text/ascii.rs
Normal file
142
src/text/ascii.rs
Normal file
|
@ -0,0 +1,142 @@
|
|||
// ASCII
//
// Byte values of the 7-bit ASCII table, named so that byte-level parsers
// can refer to characters without magic numbers.

// -- CONTROL CHARACTERS
pub const NULL: u8 = 0x00; // NULL
pub const SOH: u8 = 0x01; // START OF HEADING
pub const STX: u8 = 0x02; // START OF TEXT
pub const ETX: u8 = 0x03; // END OF TEXT
pub const EOT: u8 = 0x04; // END OF TRANSMISSION
pub const ENQ: u8 = 0x05; // ENQUIRY
/// Misspelt alias of [`ENQ`], kept so existing users keep compiling.
pub const ANQ: u8 = ENQ;
pub const ACK: u8 = 0x06; // ACKNOWLEDGE
pub const BEL: u8 = 0x07; // BELL
pub const BS: u8 = 0x08; // BACKSPACE
pub const HT: u8 = 0x09; // HORIZONTAL TAB
pub const LF: u8 = 0x0A; // LINE FEED
pub const VT: u8 = 0x0B; // VERTICAL TAB
pub const FF: u8 = 0x0C; // FORM FEED
pub const CR: u8 = 0x0D; // CARRIAGE RETURN
pub const SO: u8 = 0x0E; // SHIFT OUT
pub const SI: u8 = 0x0F; // SHIFT IN
pub const DLE: u8 = 0x10; // DATA LINK ESCAPE
pub const DC1: u8 = 0x11; // DEVICE CONTROL 1
pub const DC2: u8 = 0x12; // DEVICE CONTROL 2
pub const DC3: u8 = 0x13; // DEVICE CONTROL 3
pub const DC4: u8 = 0x14; // DEVICE CONTROL 4
pub const NAK: u8 = 0x15; // NEGATIVE ACKNOWLEDGE
pub const SYN: u8 = 0x16; // SYNCHRONOUS IDLE
pub const ETB: u8 = 0x17; // END OF TRANSMISSION BLOCK
pub const CAN: u8 = 0x18; // CANCEL
pub const EM: u8 = 0x19; // END OF MEDIUM
pub const SUB: u8 = 0x1A; // SUBSTITUTE
pub const ESC: u8 = 0x1B; // ESCAPE
pub const FS: u8 = 0x1C; // FILE SEPARATOR
pub const GS: u8 = 0x1D; // GROUP SEPARATOR
pub const RS: u8 = 0x1E; // RECORD SEPARATOR
pub const US: u8 = 0x1F; // UNIT SEPARATOR
pub const DEL: u8 = 0x7F; // DELETE

// -- GRAPHIC CHARACTERS
pub const SP: u8 = 0x20; // space
pub const EXCLAMATION: u8 = 0x21; // !
pub const DQUOTE: u8 = 0x22; // "
pub const NUM: u8 = 0x23; // #
pub const DOLLAR: u8 = 0x24; // $
pub const PERCENT: u8 = 0x25; // %
pub const AMPERSAND: u8 = 0x26; // &
pub const SQUOTE: u8 = 0x27; // '
pub const LEFT_PAR: u8 = 0x28; // (
pub const RIGHT_PAR: u8 = 0x29; // )
pub const ASTERISK: u8 = 0x2A; // *
pub const PLUS: u8 = 0x2B; // +
pub const COMMA: u8 = 0x2C; // ,
pub const MINUS: u8 = 0x2D; // -
pub const PERIOD: u8 = 0x2E; // .
pub const SLASH: u8 = 0x2F; // /
pub const N0: u8 = 0x30; // 0
pub const N1: u8 = 0x31; // 1
pub const N2: u8 = 0x32; // 2
pub const N3: u8 = 0x33; // 3
pub const N4: u8 = 0x34; // 4
pub const N5: u8 = 0x35; // 5
pub const N6: u8 = 0x36; // 6
pub const N7: u8 = 0x37; // 7
pub const N8: u8 = 0x38; // 8
pub const N9: u8 = 0x39; // 9
pub const COL: u8 = 0x3A; // :
pub const SEM_COL: u8 = 0x3B; // ;
pub const LT: u8 = 0x3C; // <
pub const EQ: u8 = 0x3D; // =
pub const GT: u8 = 0x3E; // >
pub const QUESTION: u8 = 0x3F; // ?
pub const AT: u8 = 0x40; // @
pub const LCA: u8 = 0x41; // A
pub const LCB: u8 = 0x42; // B
pub const LCC: u8 = 0x43; // C
pub const LCD: u8 = 0x44; // D
pub const LCE: u8 = 0x45; // E
pub const LCF: u8 = 0x46; // F
pub const LCG: u8 = 0x47; // G
pub const LCH: u8 = 0x48; // H
pub const LCI: u8 = 0x49; // I
pub const LCJ: u8 = 0x4A; // J
pub const LCK: u8 = 0x4B; // K
pub const LCL: u8 = 0x4C; // L
pub const LCM: u8 = 0x4D; // M
pub const LCN: u8 = 0x4E; // N
pub const LCO: u8 = 0x4F; // O
pub const LCP: u8 = 0x50; // P
pub const LCQ: u8 = 0x51; // Q
pub const LCR: u8 = 0x52; // R
pub const LCS: u8 = 0x53; // S
pub const LCT: u8 = 0x54; // T
pub const LCU: u8 = 0x55; // U
pub const LCV: u8 = 0x56; // V
pub const LCW: u8 = 0x57; // W
pub const LCX: u8 = 0x58; // X
pub const LCY: u8 = 0x59; // Y
pub const LCZ: u8 = 0x5A; // Z
pub const LEFT_BRACKET: u8 = 0x5B; // [
pub const BACKSLASH: u8 = 0x5C; // backslash
pub const RIGHT_BRACKET: u8 = 0x5D; // ]
pub const CARET: u8 = 0x5E; // ^
/// Misspelt alias of [`CARET`], kept so existing users keep compiling.
pub const CARRET: u8 = CARET;
pub const UNDERSCORE: u8 = 0x5F; // _
pub const GRAVE: u8 = 0x60; // `
pub const LSA: u8 = 0x61; // a
pub const LSB: u8 = 0x62; // b
pub const LSC: u8 = 0x63; // c
pub const LSD: u8 = 0x64; // d
pub const LSE: u8 = 0x65; // e
pub const LSF: u8 = 0x66; // f
pub const LSG: u8 = 0x67; // g
pub const LSH: u8 = 0x68; // h
pub const LSI: u8 = 0x69; // i
pub const LSJ: u8 = 0x6A; // j
pub const LSK: u8 = 0x6B; // k
pub const LSL: u8 = 0x6C; // l
pub const LSM: u8 = 0x6D; // m
pub const LSN: u8 = 0x6E; // n
pub const LSO: u8 = 0x6F; // o
pub const LSP: u8 = 0x70; // p
pub const LSQ: u8 = 0x71; // q
pub const LSR: u8 = 0x72; // r
pub const LSS: u8 = 0x73; // s
pub const LST: u8 = 0x74; // t
pub const LSU: u8 = 0x75; // u
pub const LSV: u8 = 0x76; // v
pub const LSW: u8 = 0x77; // w
pub const LSX: u8 = 0x78; // x
pub const LSY: u8 = 0x79; // y
pub const LSZ: u8 = 0x7A; // z
pub const LEFT_CURLY: u8 = 0x7B; // {
pub const PIPE: u8 = 0x7C; // |
pub const RIGHT_CURLY: u8 = 0x7D; // }
pub const TILDE: u8 = 0x7E; // ~

// GROUP OF CHARACTERS
// -- CRLF
pub const CRLF: &[u8] = &[CR, LF];

// -- WHITESPACE
pub const WS: &[u8] = &[HT, SP];

// -- First and last byte of the graphic range (space through tilde)
pub const GRAPHIC_BEGIN: u8 = SP;
pub const GRAPHIC_END: u8 = TILDE;
|
43
src/text/buffer.rs
Normal file
43
src/text/buffer.rs
Normal file
|
@ -0,0 +1,43 @@
|
|||
use encoding_rs::Encoding;
|
||||
|
||||
#[derive(Debug, PartialEq, Default)]
|
||||
pub struct Text<'a> {
|
||||
parts: Vec<&'a [u8]>,
|
||||
}
|
||||
|
||||
impl<'a> Text<'a> {
|
||||
pub fn push(&mut self, e: &[u8]) {
|
||||
self.parts.push(e)
|
||||
}
|
||||
|
||||
pub fn to_string(&self) -> String {
|
||||
let enc = encoding_rs::UTF_8;
|
||||
let size = self.parts.iter().fold(0, |acc, v| acc + v.len());
|
||||
|
||||
self.parts.iter().fold(
|
||||
String::with_capacity(size),
|
||||
|mut acc, v| {
|
||||
let (content, _) = enc.decode_without_bom_handling(v);
|
||||
acc.push_str(content.as_ref());
|
||||
acc
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    // `super::*` does not re-export `ascii`, so it has to be imported here.
    use crate::text::ascii;

    /// Pushed chunks are concatenated in insertion order.
    #[test]
    fn test_text() {
        let mut text = Text::default();
        text.push(b"hello");
        text.push(&[ascii::SP]);
        text.push(b"world");
        assert_eq!(
            text.to_string(),
            "hello world".to_string(),
        );
    }
}
|
|
@ -1,5 +1,3 @@
|
|||
use std::borrow::Cow;
|
||||
use chardetng::EncodingDetector;
|
||||
use encoding_rs::Encoding;
|
||||
|
||||
use nom::{
|
||||
|
@ -7,92 +5,107 @@ use nom::{
|
|||
branch::alt,
|
||||
bytes::complete::{tag, take, take_while1, take_while},
|
||||
character::complete::{one_of},
|
||||
character::is_alphanumeric,
|
||||
combinator::map,
|
||||
sequence::{preceded, terminated, tuple},
|
||||
multi::many0,
|
||||
};
|
||||
use encoding_rs::Encoding;
|
||||
use base64::{Engine as _, engine::general_purpose};
|
||||
|
||||
use crate::fragments::mime;
|
||||
use crate::text::words;
|
||||
use crate::text::ascii;
|
||||
|
||||
const IS_LAST_BUFFER: bool = true;
|
||||
const ALLOW_UTF8: bool = true;
|
||||
const NO_TLD: Option<&[u8]> = None;
|
||||
|
||||
pub fn header_decode(input: &[u8]) -> Cow<str> {
|
||||
// Create detector
|
||||
let mut detector = EncodingDetector::new();
|
||||
detector.feed(input, IS_LAST_BUFFER);
|
||||
|
||||
// Get encoding
|
||||
let enc: &Encoding = detector.guess(NO_TLD, ALLOW_UTF8);
|
||||
let (header, _, _) = enc.decode(input);
|
||||
header
|
||||
}
|
||||
|
||||
pub fn encoded_word(input: &str) -> IResult<&str, String> {
|
||||
pub fn encoded_word(input: &[u8]) -> IResult<&[u8], EncodedWord> {
|
||||
alt((encoded_word_quoted, encoded_word_base64))(input)
|
||||
}
|
||||
|
||||
pub fn encoded_word_quoted(input: &str) -> IResult<&str, String> {
|
||||
pub fn encoded_word_quoted(input: &[u8]) -> IResult<&[u8], EncodedWord> {
|
||||
let (rest, (_, charset, _, _, _, txt, _)) = tuple((
|
||||
tag("=?"), mime::token,
|
||||
tag("=?"), words::mime_token,
|
||||
tag("?"), one_of("Qq"),
|
||||
tag("?"), ptext,
|
||||
tag("?=")))(input)?;
|
||||
|
||||
let renc = Encoding::for_label(charset.as_bytes()).unwrap_or(encoding_rs::WINDOWS_1252);
|
||||
let parsed = decode_quoted_encoding(renc, txt.iter());
|
||||
let renc = Encoding::for_label(charset).unwrap_or(encoding_rs::WINDOWS_1252);
|
||||
let parsed = EncodedWord::Quoted(QuotedWord { enc: renc, chunks: txt });
|
||||
Ok((rest, parsed))
|
||||
}
|
||||
|
||||
pub fn encoded_word_base64(input: &str) -> IResult<&str, String> {
|
||||
pub fn encoded_word_base64(input: &[u8]) -> IResult<&[u8], EncodedWord> {
|
||||
let (rest, (_, charset, _, _, _, txt, _)) = tuple((
|
||||
tag("=?"), mime::token,
|
||||
tag("=?"), words::mime_token,
|
||||
tag("?"), one_of("Bb"),
|
||||
tag("?"), btext,
|
||||
tag("?=")))(input)?;
|
||||
|
||||
let renc = Encoding::for_label(charset.as_bytes()).unwrap_or(encoding_rs::WINDOWS_1252);
|
||||
let parsed = general_purpose::STANDARD_NO_PAD.decode(txt).map(|d| renc.decode(d.as_slice()).0.to_string()).unwrap_or("".into());
|
||||
|
||||
let renc = Encoding::for_label(charset).unwrap_or(encoding_rs::WINDOWS_1252);
|
||||
let parsed = EncodedWord::Base64(Base64Word { enc: renc, content: txt });
|
||||
Ok((rest, parsed))
|
||||
}
|
||||
|
||||
fn decode_quoted_encoding<'a>(enc: &'static Encoding, q: impl Iterator<Item = &'a QuotedChunk<'a>>) -> String {
|
||||
q.fold(
|
||||
String::new(),
|
||||
|mut acc, c| {
|
||||
let dec = match c {
|
||||
QuotedChunk::Safe(v) => Cow::Borrowed(*v),
|
||||
QuotedChunk::Space => Cow::Borrowed(" "),
|
||||
QuotedChunk::Encoded(v) => {
|
||||
let w = &[*v];
|
||||
let (d, _, _) = enc.decode(w);
|
||||
Cow::Owned(d.into_owned())
|
||||
},
|
||||
};
|
||||
acc.push_str(dec.as_ref());
|
||||
acc
|
||||
})
|
||||
#[derive(PartialEq,Debug)]
|
||||
pub enum EncodedWord<'a> {
|
||||
Quoted(QuotedWord<'a>),
|
||||
Base64(Base64Word<'a>),
|
||||
}
|
||||
|
||||
#[derive(PartialEq,Debug)]
|
||||
pub struct Base64Word<'a> {
|
||||
pub enc: &'static Encoding,
|
||||
pub content: &'a [u8],
|
||||
}
|
||||
|
||||
impl<'a> Base64Word<'a> {
|
||||
pub fn to_string(&self) -> String {
|
||||
general_purpose::STANDARD_NO_PAD
|
||||
.decode(self.content)
|
||||
.map(|d| self.enc.decode(d.as_slice()).0.to_string())
|
||||
.unwrap_or("".into())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq,Debug)]
|
||||
pub struct QuotedWord<'a> {
|
||||
pub enc: &'static Encoding,
|
||||
pub chunks: Vec<QuotedChunk<'a>>,
|
||||
}
|
||||
|
||||
impl<'a> QuotedWord<'a> {
|
||||
pub fn to_string(&self) -> String {
|
||||
self.chunks.iter().fold(
|
||||
String::new(),
|
||||
|mut acc, c| {
|
||||
match c {
|
||||
QuotedChunk::Safe(v) => {
|
||||
let (content, _) = encoding_rs::UTF_8.decode_without_bom_handling(v);
|
||||
acc.push_str(content.as_ref());
|
||||
}
|
||||
QuotedChunk::Space => acc.push(' '),
|
||||
QuotedChunk::Encoded(v) => {
|
||||
let w = &[*v];
|
||||
let (d, _) = self.enc.decode_without_bom_handling(w);
|
||||
acc.push_str(d.as_ref());
|
||||
},
|
||||
};
|
||||
acc
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq,Debug)]
|
||||
pub enum QuotedChunk<'a> {
|
||||
Safe(&'a str),
|
||||
Safe(&'a [u8]),
|
||||
Encoded(u8),
|
||||
Space,
|
||||
}
|
||||
|
||||
//quoted_printable
|
||||
pub fn ptext(input: &str) -> IResult<&str, Vec<QuotedChunk>> {
|
||||
pub fn ptext(input: &[u8]) -> IResult<&[u8], Vec<QuotedChunk>> {
|
||||
many0(alt((safe_char2, encoded_space, hex_octet)))(input)
|
||||
}
|
||||
|
||||
|
||||
fn safe_char2(input: &str) -> IResult<&str, QuotedChunk> {
|
||||
fn safe_char2(input: &[u8]) -> IResult<&[u8], QuotedChunk> {
|
||||
map(take_while1(is_safe_char2), |v| QuotedChunk::Safe(v))(input)
|
||||
}
|
||||
|
||||
|
@ -101,8 +114,8 @@ fn safe_char2(input: &str) -> IResult<&str, QuotedChunk> {
|
|||
/// 8-bit values which correspond to printable ASCII characters other
|
||||
/// than "=", "?", and "_" (underscore), MAY be represented as those
|
||||
/// characters.
|
||||
fn is_safe_char2(c: char) -> bool {
|
||||
c.is_ascii() && !c.is_ascii_control() && c != '_' && c != '?' && c != '='
|
||||
fn is_safe_char2(c: u8) -> bool {
|
||||
c >= ascii::SP && c != ascii::UNDERSCORE && c != ascii::QUESTION && c != ascii::EQ
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -111,28 +124,30 @@ fn is_safe_char(c: char) -> bool {
|
|||
(c >= '\x3e' && c <= '\x7e')
|
||||
}*/
|
||||
|
||||
fn encoded_space(input: &str) -> IResult<&str, QuotedChunk> {
|
||||
fn encoded_space(input: &[u8]) -> IResult<&[u8], QuotedChunk> {
|
||||
map(tag("_"), |_| QuotedChunk::Space)(input)
|
||||
}
|
||||
|
||||
fn hex_octet(input: &str) -> IResult<&str, QuotedChunk> {
|
||||
fn hex_octet(input: &[u8]) -> IResult<&[u8], QuotedChunk> {
|
||||
use nom::error::*;
|
||||
|
||||
let (rest, hstr) = preceded(tag("="), take(2usize))(input)?;
|
||||
let (rest, hbytes) = preceded(tag("="), take(2usize))(input)?;
|
||||
|
||||
let parsed = u8::from_str_radix(hstr, 16)
|
||||
let (hstr, _) = encoding_rs::UTF_8.decode_without_bom_handling(hbytes);
|
||||
|
||||
let parsed = u8::from_str_radix(hstr.as_ref(), 16)
|
||||
.map_err(|_| nom::Err::Error(Error::new(input, ErrorKind::Verify)))?;
|
||||
|
||||
Ok((rest, QuotedChunk::Encoded(parsed)))
|
||||
}
|
||||
|
||||
//base64 (maybe use a crate)
|
||||
pub fn btext(input: &str) -> IResult<&str, &str> {
|
||||
pub fn btext(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
terminated(take_while(is_bchar), many0(tag("=")))(input)
|
||||
}
|
||||
|
||||
fn is_bchar(c: char) -> bool {
|
||||
c.is_ascii_alphanumeric() || c == '+' || c == '/'
|
||||
fn is_bchar(c: u8) -> bool {
|
||||
is_alphanumeric(c) || c == ascii::PLUS || c == ascii::SLASH
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
166
src/text/misc_token.rs
Normal file
166
src/text/misc_token.rs
Normal file
|
@ -0,0 +1,166 @@
|
|||
use nom::{
|
||||
branch::alt,
|
||||
bytes::complete::take_while1,
|
||||
character::complete::space0,
|
||||
combinator::{into, map, opt},
|
||||
multi::{many0, many1},
|
||||
sequence::{preceded, tuple},
|
||||
IResult,
|
||||
};
|
||||
use std::borrow::Cow;
|
||||
|
||||
use crate::text::{
|
||||
quoted::quoted_string,
|
||||
whitespace::{fws, is_obs_no_ws_ctl},
|
||||
words::{atom, is_vchar},
|
||||
encoding::{self, encoded_word},
|
||||
buffer,
|
||||
ascii,
|
||||
};
|
||||
|
||||
/// Ordered list of decoded phrases, each one stored as owned text.
#[derive(Default, PartialEq, Debug)]
pub struct PhraseList(pub Vec<String>);
|
||||
|
||||
/*
|
||||
impl<'a> TryFrom<&'a lazy::Unstructured<'a>> for Unstructured {
|
||||
type Error = IMFError<'a>;
|
||||
|
||||
fn try_from(input: &'a lazy::Unstructured<'a>) -> Result<Self, Self::Error> {
|
||||
unstructured(input.0)
|
||||
.map(|(_, v)| Unstructured(v))
|
||||
.map_err(|e| IMFError::Unstructured(e))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TryFrom<&'a lazy::PhraseList<'a>> for PhraseList {
|
||||
type Error = IMFError<'a>;
|
||||
|
||||
fn try_from(p: &'a lazy::PhraseList<'a>) -> Result<Self, Self::Error> {
|
||||
separated_list1(tag(","), phrase)(p.0)
|
||||
.map(|(_, q)| PhraseList(q))
|
||||
.map_err(|e| IMFError::PhraseList(e))
|
||||
}
|
||||
}*/
|
||||
|
||||
pub enum Word<'a> {
|
||||
Quoted(buffer::Text<'a>),
|
||||
Encoded(encoding::EncodedWord<'a>),
|
||||
Atom(&'a [u8]),
|
||||
}
|
||||
impl<'a> Word<'a> {
|
||||
pub fn to_string(&self) -> String {
|
||||
match self {
|
||||
Word::Quoted(v) => v.to_string(),
|
||||
Word::Encoded(v) => v.to_string(),
|
||||
Word::Atom(v) => v.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Word
|
||||
///
|
||||
/// ```abnf
|
||||
/// word = atom / quoted-string
|
||||
/// ```
|
||||
pub fn word(input: &[u8]) -> IResult<&[u8], Word> {
|
||||
alt((
|
||||
map(quoted_string, |v| Word::Quoted(v)),
|
||||
map(encoded_word, |v| Word::Encoded(v)),
|
||||
map(atom, |v| Word::Atom(v))
|
||||
))(input)
|
||||
}
|
||||
|
||||
pub struct Phrase<'a>(pub Vec<Word<'a>>);
|
||||
impl<'a> Phrase<'a> {
|
||||
pub fn to_string(&self) -> String {
|
||||
self.0.join(" ")
|
||||
}
|
||||
}
|
||||
|
||||
/// Phrase
|
||||
///
|
||||
/// ```abnf
|
||||
/// phrase = 1*word / obs-phrase
|
||||
/// ```
|
||||
pub fn phrase(input: &[u8]) -> IResult<&[u8], Phrase> {
|
||||
let (input, phrase) = map(many1(word), |v| Phrase(v))(input)?;
|
||||
Ok((input, phrase))
|
||||
}
|
||||
|
||||
/// Compatible unstructured input
|
||||
///
|
||||
/// ```abnf
|
||||
/// obs-utext = %d0 / obs-NO-WS-CTL / VCHAR
|
||||
/// ```
|
||||
fn is_unstructured(c: u8) -> bool {
|
||||
is_vchar(c) || is_obs_no_ws_ctl(c) || c == ascii::NULL
|
||||
}
|
||||
|
||||
enum UnstrToken<'a> {
|
||||
Init,
|
||||
Encoded(encoding::EncodedWord<'a>),
|
||||
Plain(&'a [u8]),
|
||||
}
|
||||
impl<'a> UnstrToken<'a> {
|
||||
pub fn to_string(&self) -> String {
|
||||
match self {
|
||||
UnstrToken::Init => "".into(),
|
||||
UnstrToken::Encoded(e) => e.to_string(),
|
||||
UnstrToken::Plain(e) => encoding_rs::UTF_8.decode_without_bom_handling(e).into_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Unstructured<'a>(pub Vec<UnstrToken<'a>>);
|
||||
impl<'a> Unstructured<'a> {
|
||||
pub fn to_string(&self) -> String {
|
||||
self.0.iter().fold(
|
||||
(&UnstrToken::Init, String::new()),
|
||||
|(prev_token, result), current_token| {
|
||||
match (prev_token, current_token) {
|
||||
(UnstrToken::Init, v) => result.push_str(v.to_string().as_ref()),
|
||||
(UnstrToken::EncodedWord(_), UnstrToken::EncodedWord(v)) => result.push_str(v.to_string()).as_ref(),
|
||||
(_, v) => {
|
||||
result.push(' ');
|
||||
result.push_str(v.to_string().as_ref())
|
||||
},
|
||||
};
|
||||
|
||||
result
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Unstructured header field body
|
||||
///
|
||||
/// ```abnf
|
||||
/// unstructured = (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct
|
||||
/// ```
|
||||
pub fn unstructured(input: &[u8]) -> IResult<&[u8], Unstructured> {
|
||||
let (input, r) = many0(preceded(opt(fws), alt((
|
||||
map(encoded_word, |v| UnstrToken::Encoded(v)),
|
||||
map(take_while1(is_unstructured), |v| UnstrToken::Plain(v)),
|
||||
))))(input)?;
|
||||
|
||||
let (input, _) = space0(input)?;
|
||||
Ok((input, Unstructured(r)))
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `input` as a phrase and returns the unparsed remainder together
    /// with the rendered text. `Phrase` has no `PartialEq`/`Debug`, so the
    /// assertions compare its string form instead.
    fn parse(input: &[u8]) -> (&[u8], String) {
        let (rest, parsed) = phrase(input).unwrap();
        (rest, parsed.to_string())
    }

    #[test]
    fn test_phrase() {
        assert_eq!(
            parse(b"hello world"),
            (&b""[..], "hello world".to_string())
        );
        assert_eq!(
            parse(b"salut \"le\" monde"),
            (&b""[..], "salut le monde".to_string())
        );
        assert_eq!(
            parse(b"fin\r\n du\r\nmonde"),
            (&b"\r\nmonde"[..], "fin du".to_string())
        );
    }
}
|
7
src/text/mod.rs
Normal file
7
src/text/mod.rs
Normal file
|
@ -0,0 +1,7 @@
|
|||
pub mod ascii;
|
||||
pub mod encoding;
|
||||
pub mod misc_token;
|
||||
pub mod quoted;
|
||||
pub mod whitespace;
|
||||
pub mod words;
|
||||
pub mod buffer;
|
|
@ -1,14 +1,16 @@
|
|||
use nom::{
|
||||
branch::alt,
|
||||
bytes::complete::tag,
|
||||
character::complete::{anychar, satisfy},
|
||||
combinator::opt,
|
||||
bytes::complete::{take_while1, tag},
|
||||
character::complete::anychar,
|
||||
combinator::{recognize, opt},
|
||||
multi::many0,
|
||||
sequence::{pair, preceded},
|
||||
IResult,
|
||||
};
|
||||
|
||||
use crate::fragments::whitespace::{cfws, fws, is_obs_no_ws_ctl};
|
||||
use crate::text::whitespace::{cfws, fws, is_obs_no_ws_ctl};
|
||||
use crate::text::ascii;
|
||||
use crate::text::buffer;
|
||||
|
||||
/// Quoted pair
|
||||
///
|
||||
|
@ -16,8 +18,8 @@ use crate::fragments::whitespace::{cfws, fws, is_obs_no_ws_ctl};
|
|||
/// quoted-pair = ("\" (VCHAR / WSP)) / obs-qp
|
||||
/// obs-qp = "\" (%d0 / obs-NO-WS-CTL / LF / CR)
|
||||
/// ```
|
||||
pub fn quoted_pair(input: &str) -> IResult<&str, char> {
|
||||
preceded(tag("\\"), anychar)(input)
|
||||
pub fn quoted_pair(input: &[u8]) -> IResult<&[u8], u8> {
|
||||
preceded(tag(&[ascii::SLASH]), anychar)(input)
|
||||
}
|
||||
|
||||
/// Allowed characters in quote
|
||||
|
@ -28,11 +30,11 @@ pub fn quoted_pair(input: &str) -> IResult<&str, char> {
|
|||
/// %d93-126 / ; "\" or the quote character
|
||||
/// obs-qtext
|
||||
/// ```
|
||||
fn is_restr_qtext(c: char) -> bool {
|
||||
c == '\x21' || (c >= '\x23' && c <= '\x5B') || (c >= '\x5D' && c <= '\x7E')
|
||||
fn is_restr_qtext(c: u8) -> bool {
|
||||
c == ascii::EXCLAMATION || (c >= ascii::NUM && c <= ascii::LEFT_BRACKET) || (c >= ascii::RIGHT_BRACKET && c <= ascii::TILDE)
|
||||
}
|
||||
|
||||
fn is_qtext(c: char) -> bool {
|
||||
fn is_qtext(c: u8) -> bool {
|
||||
is_restr_qtext(c) || is_obs_no_ws_ctl(c)
|
||||
}
|
||||
|
||||
|
@ -41,8 +43,8 @@ fn is_qtext(c: char) -> bool {
|
|||
/// ```abnf
|
||||
/// qcontent = qtext / quoted-pair
|
||||
/// ```
|
||||
fn qcontent(input: &str) -> IResult<&str, char> {
|
||||
alt((satisfy(is_qtext), quoted_pair))(input)
|
||||
fn qcontent(input: &u8) -> IResult<&[u8], &[u8]> {
|
||||
alt((take_while1(is_qtext), recognize(quoted_pair)))(input)
|
||||
}
|
||||
|
||||
/// Quoted string
|
||||
|
@ -52,7 +54,7 @@ fn qcontent(input: &str) -> IResult<&str, char> {
|
|||
/// DQUOTE *([FWS] qcontent) [FWS] DQUOTE
|
||||
/// [CFWS]
|
||||
/// ```
|
||||
pub fn quoted_string(input: &str) -> IResult<&str, String> {
|
||||
pub fn quoted_string(input: &[u8]) -> IResult<&[u8], buffer::Text> {
|
||||
let (input, _) = opt(cfws)(input)?;
|
||||
let (input, _) = tag("\"")(input)?;
|
||||
let (input, content) = many0(pair(opt(fws), qcontent))(input)?;
|
||||
|
@ -60,11 +62,11 @@ pub fn quoted_string(input: &str) -> IResult<&str, String> {
|
|||
// Rebuild string
|
||||
let mut qstring = content
|
||||
.iter()
|
||||
.fold(String::with_capacity(16), |mut acc, (maybe_wsp, c)| {
|
||||
.fold(buffer::Text::default(), |mut acc, (maybe_wsp, c)| {
|
||||
if let Some(wsp) = maybe_wsp {
|
||||
acc.push(*wsp);
|
||||
acc.push(&[ascii::SP]);
|
||||
}
|
||||
acc.push(*c);
|
||||
acc.push(c);
|
||||
acc
|
||||
});
|
||||
|
||||
|
@ -84,13 +86,22 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_quoted_string() {
|
||||
let mut text = buffer::Text::default();
|
||||
text.push(b"hello");
|
||||
text.push(&[ascii::DQUOTE]);
|
||||
text.push(b"world");
|
||||
assert_eq!(
|
||||
quoted_string(" \"hello\\\"world\" "),
|
||||
Ok(("", "hello\"world".to_string()))
|
||||
quoted_string(b" \"hello\\\"world\" "),
|
||||
Ok(("", text))
|
||||
);
|
||||
|
||||
let mut text = buffer::Text::default();
|
||||
text.push(b"hello");
|
||||
text.push(&[ascii::SP]);
|
||||
text.push(b"world");
|
||||
assert_eq!(
|
||||
quoted_string("\"hello\r\n world\""),
|
||||
Ok(("", "hello world".to_string()))
|
||||
quoted_string(b"\"hello\r\n world\""),
|
||||
Ok(("", text))
|
||||
);
|
||||
}
|
||||
}
|
|
@ -1,71 +1,68 @@
|
|||
use crate::fragments::quoted::quoted_pair;
|
||||
use nom::{
|
||||
branch::alt,
|
||||
bytes::complete::{is_not, tag},
|
||||
character::complete::{crlf, satisfy, space0, space1},
|
||||
bytes::complete::{is_not, tag, take_while1},
|
||||
character::complete::{space0, space1},
|
||||
combinator::{opt, recognize},
|
||||
multi::{many0, many1},
|
||||
sequence::{pair, terminated, tuple},
|
||||
sequence::{pair, tuple},
|
||||
IResult,
|
||||
};
|
||||
use crate::fragments::encoding::encoded_word;
|
||||
use crate::text::encoding::encoded_word;
|
||||
use crate::text::quoted::quoted_pair;
|
||||
use crate::text::ascii;
|
||||
|
||||
/// Whitespace (space, new line, tab) content and
|
||||
/// delimited content (eg. comment, line, sections, etc.)
|
||||
|
||||
// Bytes CRLF
|
||||
const CR: u8 = 0x0D;
|
||||
const LF: u8 = 0x0A;
|
||||
pub const CRLF: &[u8] = &[CR, LF];
|
||||
/// Obsolete/Compatible CRLF
|
||||
///
|
||||
/// Theoretically, all lines must end with \r\n
|
||||
/// but some mail servers like Dovecot support malformated emails,
|
||||
/// for example with only \n eol. It works because
|
||||
/// \r or \n is allowed nowhere else, so we also add this support.
|
||||
|
||||
pub fn headers(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
terminated(recognize(many0(line)), obs_crlf)(input)
|
||||
pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
alt((tag(ascii::CRLF), tag(&[ascii::CR]), tag(&[ascii::LF])))(input)
|
||||
}
|
||||
|
||||
pub fn fields(input: &str) -> IResult<&str, Vec<&str>> {
|
||||
all_consuming(many0(foldable_line))(input)
|
||||
}
|
||||
|
||||
pub fn line(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> {
|
||||
// is_not(CRLF) is a hack, it means "is not CR or LF"
|
||||
// and not "is not CRLF". In other words, it continues while
|
||||
// it does not encounter 0x0D or 0x0A.
|
||||
pair(is_not(CRLF), obs_crlf)(input)
|
||||
pair(is_not(ascii::CRLF), obs_crlf)(input)
|
||||
}
|
||||
|
||||
pub fn obs_crlf(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
alt((tag(CRLF), tag(&[CR]), tag(&[LF])))(input)
|
||||
/// ```abnf
|
||||
/// fold_line = any *(1*(crlf WS) any) crlf
|
||||
/// ```
|
||||
pub fn foldable_line(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
recognize(tuple((
|
||||
is_not(ascii::CRLF),
|
||||
many0(pair(
|
||||
many1(pair(obs_crlf, space1)),
|
||||
is_not(ascii::CRLF),
|
||||
)),
|
||||
obs_crlf,
|
||||
)))(input)
|
||||
}
|
||||
|
||||
|
||||
// --- whitespaces and comments
|
||||
|
||||
// Note: WSP = SP / HTAB = %x20 / %x09
|
||||
// nom::*::space0 = *WSP
|
||||
// nom::*::space1 = 1*WSP
|
||||
|
||||
/// Permissive CRLF
|
||||
///
|
||||
/// Theoretically, all lines must end with \r\n
|
||||
/// but some mail servers like Dovecot support malformated emails,
|
||||
/// for example with only \n eol. It works because
|
||||
/// \r or \n is allowed nowhere else, so we also add this support.
|
||||
pub fn perm_crlf(input: &str) -> IResult<&str, &str> {
|
||||
alt((crlf, tag("\r"), tag("\n")))(input)
|
||||
}
|
||||
|
||||
/// Permissive foldable white space
|
||||
///
|
||||
/// Folding white space are used for long headers splitted on multiple lines.
|
||||
/// The obsolete syntax allowes multiple lines without content; implemented for compatibility
|
||||
/// reasons
|
||||
pub fn fws(input: &str) -> IResult<&str, char> {
|
||||
pub fn fws(input: &[u8]) -> IResult<&[u8], u8> {
|
||||
let (input, _) = alt((recognize(many1(fold_marker)), space1))(input)?;
|
||||
Ok((input, ' '))
|
||||
Ok((input, ascii::SP))
|
||||
}
|
||||
fn fold_marker(input: &str) -> IResult<&str, &str> {
|
||||
fn fold_marker(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
let (input, _) = space0(input)?;
|
||||
let (input, _) = perm_crlf(input)?;
|
||||
let (input, _) = obs_crlf(input)?;
|
||||
space1(input)
|
||||
}
|
||||
|
||||
|
@ -85,17 +82,17 @@ fn fold_marker(input: &str) -> IResult<&str, &str> {
|
|||
///
|
||||
/// CFWS = (1*([FWS] comment) [FWS]) / FWS
|
||||
/// ```
|
||||
pub fn cfws(input: &str) -> IResult<&str, &str> {
|
||||
pub fn cfws(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
alt((recognize(comments), recognize(fws)))(input)
|
||||
}
|
||||
|
||||
pub fn comments(input: &str) -> IResult<&str, ()> {
|
||||
pub fn comments(input: &[u8]) -> IResult<&[u8], ()> {
|
||||
let (input, _) = many1(tuple((opt(fws), comment)))(input)?;
|
||||
let (input, _) = opt(fws)(input)?;
|
||||
Ok((input, ()))
|
||||
}
|
||||
|
||||
pub fn comment(input: &str) -> IResult<&str, ()> {
|
||||
pub fn comment(input: &[u8]) -> IResult<&[u8], ()> {
|
||||
let (input, _) = tag("(")(input)?;
|
||||
let (input, _) = many0(tuple((opt(fws), ccontent)))(input)?;
|
||||
let (input, _) = opt(fws)(input)?;
|
||||
|
@ -103,12 +100,16 @@ pub fn comment(input: &str) -> IResult<&str, ()> {
|
|||
Ok((input, ()))
|
||||
}
|
||||
|
||||
pub fn ccontent(input: &str) -> IResult<&str, &str> {
|
||||
alt((recognize(ctext), recognize(quoted_pair), recognize(encoded_word), recognize(comment)))(input)
|
||||
pub fn ccontent(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
alt((ctext, recognize(quoted_pair), recognize(encoded_word), recognize(comment)))(input)
|
||||
}
|
||||
|
||||
pub fn ctext(input: &str) -> IResult<&str, char> {
|
||||
satisfy(is_ctext)(input)
|
||||
pub fn ctext(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
take_while1(is_ctext)(input)
|
||||
}
|
||||
|
||||
pub fn is_ctext(c: u8) -> bool {
|
||||
is_restr_ctext(c) || is_obs_no_ws_ctl(c)
|
||||
}
|
||||
|
||||
/// Check if it's a comment text character
|
||||
|
@ -119,15 +120,10 @@ pub fn ctext(input: &str) -> IResult<&str, char> {
|
|||
/// %d93-126 / ; "(", ")", or "\"
|
||||
/// obs-ctext
|
||||
///```
|
||||
pub fn is_restr_ctext(c: char) -> bool {
|
||||
(c >= '\x21' && c <= '\x27')
|
||||
|| (c >= '\x2A' && c <= '\x5B')
|
||||
|| (c >= '\x5D' && c <= '\x7E')
|
||||
|| !c.is_ascii()
|
||||
}
|
||||
|
||||
pub fn is_ctext(c: char) -> bool {
|
||||
is_restr_ctext(c) || is_obs_no_ws_ctl(c)
|
||||
pub fn is_restr_ctext(c: u8) -> bool {
|
||||
(c >= ascii::EXCLAMATION && c <= ascii::SQUOTE)
|
||||
|| (c >= ascii::ASTERISK && c <= ascii::LEFT_BRACKET)
|
||||
|| (c >= ascii::RIGHT_BRACKET && c <= ascii::TILDE)
|
||||
}
|
||||
|
||||
/// US ASCII control characters without effect
|
||||
|
@ -139,12 +135,12 @@ pub fn is_ctext(c: char) -> bool {
|
|||
/// %d14-31 / ; return, line feed, and
|
||||
/// %d127 ; white space characters
|
||||
/// ```
|
||||
pub fn is_obs_no_ws_ctl(c: char) -> bool {
|
||||
(c >= '\x01' && c <= '\x08')
|
||||
|| c == '\x0b'
|
||||
|| c == '\x0b'
|
||||
|| (c >= '\x0e' && c <= '\x1f')
|
||||
|| c == '\x7F'
|
||||
pub fn is_obs_no_ws_ctl(c: u8) -> bool {
|
||||
(c >= ascii::SOH && c <= ascii::BS)
|
||||
|| c == ascii::VT
|
||||
|| c == ascii::FF
|
||||
|| (c >= ascii::SO && c <= ascii::US)
|
||||
|| c == ascii::DEL
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -152,10 +148,10 @@ mod tests {
|
|||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_perm_crlf() {
|
||||
assert_eq!(perm_crlf("\rworld"), Ok(("world", "\r")));
|
||||
assert_eq!(perm_crlf("\r\nworld"), Ok(("world", "\r\n")));
|
||||
assert_eq!(perm_crlf("\nworld"), Ok(("world", "\n")));
|
||||
fn test_obs_crlf() {
|
||||
assert_eq!(obs_crlf("\rworld"), Ok(("world", "\r")));
|
||||
assert_eq!(obs_crlf("\r\nworld"), Ok(("world", "\r\n")));
|
||||
assert_eq!(obs_crlf("\nworld"), Ok(("world", "\n")));
|
||||
}
|
||||
|
||||
#[test]
|
133
src/text/words.rs
Normal file
133
src/text/words.rs
Normal file
|
@ -0,0 +1,133 @@
|
|||
use crate::text::whitespace::cfws;
|
||||
use crate::text::ascii;
|
||||
use nom::{
|
||||
bytes::complete::{tag, take_while1},
|
||||
character::is_alphanumeric,
|
||||
combinator::{opt, recognize},
|
||||
multi::many0,
|
||||
sequence::{delimited, pair},
|
||||
IResult,
|
||||
};
|
||||
|
||||
pub fn is_vchar(c: u8) -> bool {
|
||||
c >= ascii::EXCLAMATION && c <= ascii::TILDE
|
||||
}
|
||||
|
||||
/// MIME Token allowed characters
|
||||
///
|
||||
/// forbidden: ()<>@,;:\"/[]?=
|
||||
fn is_mime_token_text(c: u8) -> bool {
|
||||
is_alphanumeric(c)
|
||||
|| c == ascii::EXCLAMATION
|
||||
|| c == ascii::NUM
|
||||
|| c == ascii::DOLLAR
|
||||
|| c == ascii::PERCENT
|
||||
|| c == ascii::AMPERSAND
|
||||
|| c == ascii::SQUOTE
|
||||
|| c == ascii::ASTERISK
|
||||
|| c == ascii::PLUS
|
||||
|| c == ascii::MINUS
|
||||
|| c == ascii::PERIOD
|
||||
|| c == ascii::CARRET
|
||||
|| c == ascii::UNDERSCORE
|
||||
|| c == ascii::GRAVE
|
||||
|| c == ascii::LEFT_CURLY
|
||||
|| c == ascii::PIPE
|
||||
|| c == ascii::RIGHT_CURLY
|
||||
|| c == ascii::TILDE
|
||||
}
|
||||
|
||||
/// MIME Token
|
||||
///
|
||||
/// `[CFWS] 1*token_text [CFWS]`
|
||||
pub fn mime_token(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
delimited(opt(cfws), take_while1(is_mime_token_text), opt(cfws))(input)
|
||||
}
|
||||
|
||||
/// Atom allowed characters
|
||||
///
|
||||
/// authorized: !#$%&'*+-/=?^_`{|}~
|
||||
fn is_atext(c: u8) -> bool {
|
||||
is_alphanumeric(c)
|
||||
|| c == ascii::EXCLAMATION
|
||||
|| c == ascii::NUM
|
||||
|| c == ascii::DOLLAR
|
||||
|| c == ascii::PERCENT
|
||||
|| c == ascii::AMPERSAND
|
||||
|| c == ascii::SQUOTE
|
||||
|| c == ascii::ASTERISK
|
||||
|| c == ascii::PLUS
|
||||
|| c == ascii::MINUS
|
||||
|| c == ascii::SLASH
|
||||
|| c == ascii::EQ
|
||||
|| c == ascii::QUESTION
|
||||
|| c == ascii::CARRET
|
||||
|| c == ascii::UNDERSCORE
|
||||
|| c == ascii::GRAVE
|
||||
|| c == ascii::LEFT_CURLY
|
||||
|| c == ascii::PIPE
|
||||
|| c == ascii::RIGHT_CURLY
|
||||
|| c == ascii::TILDE
|
||||
}
|
||||
|
||||
/// Atom
|
||||
///
|
||||
/// `[CFWS] 1*atext [CFWS]`
|
||||
pub fn atom(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
delimited(opt(cfws), take_while1(is_atext), opt(cfws))(input)
|
||||
}
|
||||
|
||||
/// dot-atom-text
|
||||
///
|
||||
/// `1*atext *("." 1*atext)`
|
||||
pub fn dot_atom_text(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
recognize(pair(
|
||||
take_while1(is_atext),
|
||||
many0(pair(tag("."), take_while1(is_atext))),
|
||||
))(input)
|
||||
}
|
||||
|
||||
/// dot-atom
|
||||
///
|
||||
/// `[CFWS] dot-atom-text [CFWS]`
|
||||
pub fn dot_atom(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
delimited(opt(cfws), dot_atom_text, opt(cfws))(input)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_atext() {
        assert!(is_atext(b'='));
        assert!(is_atext(b'5'));
        assert!(is_atext(b'Q'));
        assert!(!is_atext(b' '));
        // TODO: non-ASCII input such as 'É' (UTF-8) is not covered yet.
    }

    #[test]
    fn test_atom() {
        assert_eq!(
            atom(b"(skip)  imf_codec (hidden) aerogramme"),
            Ok((&b"aerogramme"[..], &b"imf_codec"[..]))
        );
    }

    // The parsers take byte slices, so the inputs and the expected values
    // below are byte strings rather than `&str`.
    #[test]
    fn test_dot_atom_text() {
        assert_eq!(
            dot_atom_text(b"quentin.dufour.io abcdef"),
            Ok((&b" abcdef"[..], &b"quentin.dufour.io"[..]))
        );
    }

    #[test]
    fn test_dot_atom() {
        assert_eq!(
            dot_atom(b"   (skip) quentin.dufour.io abcdef"),
            Ok((&b"abcdef"[..], &b"quentin.dufour.io"[..]))
        );
    }
}
|
129
tests/enron.rs
129
tests/enron.rs
|
@ -1,129 +0,0 @@
|
|||
use imf_codec::fragments::section;
|
||||
use imf_codec::multipass;
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::path::PathBuf;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
fn parser<'a, F>(input: &'a [u8], func: F) -> ()
|
||||
where
|
||||
F: FnOnce(§ion::Section) -> (),
|
||||
{
|
||||
let seg = multipass::segment::new(input).unwrap();
|
||||
let charset = seg.charset();
|
||||
let fields = charset.fields().unwrap();
|
||||
let field_names = fields.names();
|
||||
let field_body = field_names.body();
|
||||
let section = field_body.section();
|
||||
|
||||
func(§ion.fields);
|
||||
}
|
||||
|
||||
/// Walks the Enron maildir under `resources/enron/maildir/` and checks that
/// every message yields a date, at least one `From` mailbox and no bad
/// field, except for the files explicitly listed as known failures.
#[test]
#[ignore]
fn test_enron500k() {
    let mut root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    root.push("resources/enron/maildir/");
    let prefix_len = root.as_path().to_str().unwrap().len();

    // Files whose headers are known to contain at least one malformed field.
    let known_bad_fields = HashSet::from([
        "white-s/calendar/113.",          // To: east <7..>
        "skilling-j/inbox/223.",          // From: pep <performance.>
        "jones-t/all_documents/9806.",    // To: <"tibor.vizkelety":@enron.com>
        "jones-t/notes_inbox/3303.",      // To: <"tibor.vizkelety":@enron.com>
        "lokey-t/calendar/33.",           // A second Date entry for the calendar containing
                                          // Date: Monday, March 12
        "zipper-a/inbox/199.",            // To: e-mail <mari.>
        "dasovich-j/deleted_items/128.",  // To: f62489 <g>
        "dasovich-j/all_documents/677.",  // To: w/assts <govt.>
        "dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com>
        "dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com>
        "dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com>
        "dasovich-j/all_documents/578.",  // To: w/assts <govt.>
        "dasovich-j/all_documents/3148.", // To: <"economist.com.readers":@enron.com>
        "dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com>
        "dasovich-j/risk_analytics/3.",   // To: w/assts <govt.>
        "dasovich-j/notes_inbox/5391.",   // To: <"ft.com.users":@enron.com>
        "dasovich-j/notes_inbox/4952.",   // To: <"economist.com.reader":@enron.com>
        "dasovich-j/notes_inbox/2386.",   // To: <"ft.com.users":@enron.com>
        "dasovich-j/notes_inbox/1706.",   // To: <"ft.com.users":@enron.com>
        "dasovich-j/notes_inbox/1489.",   // To: <"economist.com.readers":@enron.com>
        "dasovich-j/notes_inbox/5.",      // To: w/assts <govt.>
        "kaminski-v/sites/19.",           // To: <"the.desk":@enron.com>
        "kaminski-v/sites/1.",            // To: <"the.desk":@enron.com>
        "kaminski-v/discussion_threads/5082.", // To: <"ft.com.users":@enron.com>
        "kaminski-v/discussion_threads/4046.", // To: <"the.desk":@enron.com>
        "kaminski-v/discussion_threads/4187.", // To: <"the.desk":@enron.com>
        "kaminski-v/discussion_threads/8068.", // To: cats <breaktkhrough.>, risk <breakthrough.>, leaders <breaktkhrough.>
        "kaminski-v/discussion_threads/7980.", // To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
        "kaminski-v/all_documents/5970.", // To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
        "kaminski-v/all_documents/5838.", // To + Cc: dogs <breakthrough.>, breakthrough.adm@enron.com, breakthrough.adm@enron.com,\r\n\tbreakthrough.adm@enron.com
        "kaminski-v/all_documents/10070.", // To: <"ft.com.users":@enron.com>
        "kaminski-v/all_documents/92.",   // To: <"the.desk":@enron.com>
        "kaminski-v/all_documents/276.",  // To: <"the.desk":@enron.com>
        "kaminski-v/technical/1.",        // To: <"the.desk":@enron.com>
        "kaminski-v/technical/7.",        // To: <"the.desk":@enron.com>
        "kaminski-v/notes_inbox/140.",    // To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
        "kaminski-v/notes_inbox/95.",     // To + CC failed: cats <breaktkhrough.>, risk <breakthrough.>, leaders <breaktkhrough.>
        "kean-s/archiving/untitled/1232.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
        "kean-s/archiving/untitled/1688.", // To: w/assts <govt.>
        "kean-s/sent/198.",               // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
        "kean-s/reg_risk/9.",             // To: w/assts <govt.>
        "kean-s/discussion_threads/950.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
        "kean-s/discussion_threads/577.", // To: w/assts <govt.>
        "kean-s/calendar/untitled/1096.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
        "kean-s/calendar/untitled/640.",  // To: w/assts <govt.>
        "kean-s/all_documents/640.",      // To: w/assts <govt.>
        "kean-s/all_documents/1095.",     // To: w/assts <govt.>
        "kean-s/attachments/2030.",       // To: w/assts <govt.>
        "williams-w3/operations_committee_isas/10.", // To: z34655 <m>
    ]);

    // Files whose `From` header is known to be unparseable.
    let known_bad_from = HashSet::from([
        "skilling-j/inbox/223.", // From: pep <performance.>
    ]);

    let mut analyzed = 0;
    for entry in WalkDir::new(root.as_path())
        .into_iter()
        .filter_map(Result::ok)
    {
        // Directories and other non-file entries are not messages.
        if !entry.metadata().unwrap().is_file() {
            continue;
        }

        let mail_path = entry.path();
        // Path relative to the maildir root, used as the key of the sets above.
        let suffix = &mail_path.to_str().unwrap()[prefix_len..];

        // Load the raw message.
        let mut raw = Vec::new();
        File::open(mail_path)
            .unwrap()
            .read_to_end(&mut raw)
            .unwrap();

        // Parse it and check the extracted headers.
        parser(&raw, |hdrs| {
            let ok_date = hdrs.date.is_some();
            let ok_from = !hdrs.from.is_empty();
            let ok_fields = hdrs.bad_fields.is_empty();

            if !(ok_date && ok_from && ok_fields) {
                println!("Issue with: {}", suffix);
            }

            assert!(ok_date);

            if !known_bad_from.contains(suffix) {
                assert!(ok_from);
            }

            if !known_bad_fields.contains(suffix) {
                assert!(ok_fields);
            }

            analyzed += 1;
            if analyzed % 1000 == 0 {
                println!("Analyzed emails: {}", analyzed);
            }
        })
    }
}
|
340
tests/known.rs
340
tests/known.rs
|
@ -1,340 +0,0 @@
|
|||
use chrono::{FixedOffset, TimeZone};
|
||||
use imf_codec::fragments::{misc_token, model, section, part, trace};
|
||||
use imf_codec::multipass;
|
||||
use std::collections::HashMap;
|
||||
|
||||
fn parser<'a, F>(input: &'a [u8], func: F) -> ()
|
||||
where
|
||||
F: FnOnce(§ion::Section) -> (),
|
||||
{
|
||||
let seg = multipass::segment::new(input).unwrap();
|
||||
let charset = seg.charset();
|
||||
let fields = charset.fields().unwrap();
|
||||
let field_names = fields.names();
|
||||
let field_body = field_names.body();
|
||||
let section = field_body.section();
|
||||
|
||||
func(§ion.fields);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_headers() {
|
||||
let fullmail: &[u8] = r#"Return-Path: <gitlab@example.com>
|
||||
Delivered-To: quentin@example.com
|
||||
Received: from smtp.example.com ([10.83.2.2])
|
||||
by doradille with LMTP
|
||||
id xyzabcd
|
||||
(envelope-from <gitlab@example.com>)
|
||||
for <quentin@example.com>; Tue, 13 Jun 2023 19:01:08 +0000
|
||||
Date: Tue, 13 Jun 2023 10:01:10 +0200
|
||||
From: Mary Smith
|
||||
<mary@example.net>, "A\lan" <alan@example>
|
||||
Sender: imf@example.com
|
||||
Reply-To: "Mary Smith: Personal Account" <smith@home.example>
|
||||
To: John Doe <jdoe@machine.example>
|
||||
Cc: imf2@example.com
|
||||
Bcc: (hidden)
|
||||
Subject: Re: Saying Hello
|
||||
Comments: A simple message
|
||||
Comments: Not that complicated
|
||||
comments : not valid header name but should be accepted
|
||||
by the parser.
|
||||
Keywords: hello, world
|
||||
Héron: Raté
|
||||
Raté raté
|
||||
Keywords: salut, le, monde
|
||||
Not a real header but should still recover
|
||||
Message-ID: <3456@example.net>
|
||||
In-Reply-To: <1234@local.machine.example>
|
||||
References: <1234@local.machine.example>
|
||||
Unknown: unknown
|
||||
|
||||
This is a reply to your hello.
|
||||
"#
|
||||
.as_bytes();
|
||||
parser(fullmail, |parsed_section| {
|
||||
assert_eq!(
|
||||
parsed_section,
|
||||
&section::Section {
|
||||
date: Some(
|
||||
&FixedOffset::east_opt(2 * 3600)
|
||||
.unwrap()
|
||||
.with_ymd_and_hms(2023, 06, 13, 10, 01, 10)
|
||||
.unwrap()
|
||||
),
|
||||
|
||||
from: vec![
|
||||
&model::MailboxRef {
|
||||
name: Some("Mary Smith".into()),
|
||||
addrspec: model::AddrSpec {
|
||||
local_part: "mary".into(),
|
||||
domain: "example.net".into(),
|
||||
}
|
||||
},
|
||||
&model::MailboxRef {
|
||||
name: Some("Alan".into()),
|
||||
addrspec: model::AddrSpec {
|
||||
local_part: "alan".into(),
|
||||
domain: "example".into(),
|
||||
}
|
||||
}
|
||||
],
|
||||
|
||||
sender: Some(&model::MailboxRef {
|
||||
name: None,
|
||||
addrspec: model::AddrSpec {
|
||||
local_part: "imf".into(),
|
||||
domain: "example.com".into(),
|
||||
}
|
||||
}),
|
||||
|
||||
reply_to: vec![&model::AddressRef::Single(model::MailboxRef {
|
||||
name: Some("Mary Smith: Personal Account".into()),
|
||||
addrspec: model::AddrSpec {
|
||||
local_part: "smith".into(),
|
||||
domain: "home.example".into(),
|
||||
}
|
||||
})],
|
||||
|
||||
to: vec![&model::AddressRef::Single(model::MailboxRef {
|
||||
name: Some("John Doe".into()),
|
||||
addrspec: model::AddrSpec {
|
||||
local_part: "jdoe".into(),
|
||||
domain: "machine.example".into(),
|
||||
}
|
||||
})],
|
||||
|
||||
cc: vec![&model::AddressRef::Single(model::MailboxRef {
|
||||
name: None,
|
||||
addrspec: model::AddrSpec {
|
||||
local_part: "imf2".into(),
|
||||
domain: "example.com".into(),
|
||||
}
|
||||
})],
|
||||
|
||||
bcc: vec![],
|
||||
|
||||
msg_id: Some(&model::MessageId {
|
||||
left: "3456",
|
||||
right: "example.net"
|
||||
}),
|
||||
in_reply_to: vec![&model::MessageId {
|
||||
left: "1234",
|
||||
right: "local.machine.example"
|
||||
}],
|
||||
references: vec![&model::MessageId {
|
||||
left: "1234",
|
||||
right: "local.machine.example"
|
||||
}],
|
||||
|
||||
subject: Some(&misc_token::Unstructured("Re: Saying Hello".into())),
|
||||
|
||||
comments: vec![
|
||||
&misc_token::Unstructured("A simple message".into()),
|
||||
&misc_token::Unstructured("Not that complicated".into()),
|
||||
&misc_token::Unstructured(
|
||||
"not valid header name but should be accepted by the parser.".into()
|
||||
),
|
||||
],
|
||||
|
||||
keywords: vec![
|
||||
&misc_token::PhraseList(vec!["hello".into(), "world".into(),]),
|
||||
&misc_token::PhraseList(vec!["salut".into(), "le".into(), "monde".into(),]),
|
||||
],
|
||||
|
||||
received: vec![&trace::ReceivedLog(
|
||||
r#"from smtp.example.com ([10.83.2.2])
|
||||
by doradille with LMTP
|
||||
id xyzabcd
|
||||
(envelope-from <gitlab@example.com>)
|
||||
for <quentin@example.com>"#
|
||||
)],
|
||||
|
||||
return_path: vec![&model::MailboxRef {
|
||||
name: None,
|
||||
addrspec: model::AddrSpec {
|
||||
local_part: "gitlab".into(),
|
||||
domain: "example.com".into(),
|
||||
}
|
||||
}],
|
||||
|
||||
optional: HashMap::from([
|
||||
(
|
||||
"Delivered-To",
|
||||
&misc_token::Unstructured("quentin@example.com".into())
|
||||
),
|
||||
("Unknown", &misc_token::Unstructured("unknown".into())),
|
||||
]),
|
||||
|
||||
bad_fields: vec![],
|
||||
|
||||
unparsed: vec![
|
||||
"Héron: Raté\n Raté raté\n",
|
||||
"Not a real header but should still recover\n",
|
||||
],
|
||||
..section::Section::default()
|
||||
}
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Parses a message whose `From`, `To`, `CC` and `Subject` fields carry
/// encoded words (`=?charset?encoding?text?=`) alongside `MIME-Version` and
/// `Content-*` fields, and checks the decoded `section::Section`.
///
/// The expected subject is the concatenation of the two encoded words found
/// on the `Subject` line and on its folded continuation line.
#[test]
fn test_headers_mime() {
    use imf_codec::fragments::mime;

    // The second `Subject` line must begin with whitespace so that it is read
    // as a continuation of the header rather than as a new (invalid) field.
    // NOTE(review): the exact indentation width was not recoverable from the
    // reviewed copy; four spaces are used here, confirm against the repository.
    let fullmail: &[u8] = r#"From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>
To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>
CC: =?ISO-8859-1?Q?Andr=E9?= Pirard <PIRARD@vm1.ulg.ac.be>
Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
    =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
MIME-Version: 1.0
Content-Type: text/plain; charset=ISO-8859-1
Content-Transfer-Encoding: quoted-printable
Content-ID: <a@example.com>
Content-Description: hello

Now's the time =
for all folk to come=
to the aid of their country.
"#
    .as_bytes();

    parser(fullmail, |parsed_section| {
        assert_eq!(
            parsed_section,
            &section::Section {
                // Address fields: display names are the decoded encoded words.
                from: vec![&model::MailboxRef {
                    name: Some("Keith Moore".into()),
                    addrspec: model::AddrSpec {
                        local_part: "moore".into(),
                        domain: "cs.utk.edu".into(),
                    }
                }],

                to: vec![&model::AddressRef::Single(model::MailboxRef {
                    name: Some("Keld Jørn Simonsen".into()),
                    addrspec: model::AddrSpec {
                        local_part: "keld".into(),
                        domain: "dkuug.dk".into(),
                    }
                })],

                cc: vec![&model::AddressRef::Single(model::MailboxRef {
                    name: Some("André Pirard".into()),
                    addrspec: model::AddrSpec {
                        local_part: "PIRARD".into(),
                        domain: "vm1.ulg.ac.be".into(),
                    }
                })],

                subject: Some(&misc_token::Unstructured(
                    "If you can read this you understand the example.".into()
                )),
                mime_version: Some(&mime::Version { major: 1, minor: 0 }),

                // MIME-specific fields are grouped in their own sub-section.
                mime: section::MIMESection {
                    content_type: Some(&mime::Type::Text(mime::TextDesc {
                        charset: Some(mime::EmailCharset::ISO_8859_1),
                        subtype: mime::TextSubtype::Plain,
                        unknown_parameters: vec![]
                    })),
                    content_transfer_encoding: Some(&mime::Mechanism::QuotedPrintable),
                    content_id: Some(&model::MessageId {
                        left: "a",
                        right: "example.com"
                    }),
                    content_description: Some(&misc_token::Unstructured("hello".into())),
                    ..section::MIMESection::default()
                },
                ..section::Section::default()
            }
        );
    })
}
|
||||
|
||||
fn parser_bodystruct<'a, F>(input: &'a [u8], func: F) -> ()
|
||||
where
|
||||
F: FnOnce(&part::PartNode) -> (),
|
||||
{
|
||||
let seg = multipass::segment::new(input).unwrap();
|
||||
let charset = seg.charset();
|
||||
let fields = charset.fields().unwrap();
|
||||
let field_names = fields.names();
|
||||
let field_body = field_names.body();
|
||||
let section = field_body.section();
|
||||
let bodystruct = section.body_structure();
|
||||
|
||||
func(&bodystruct.body);
|
||||
}
|
||||
|
||||
/// Feeds a `multipart/alternative` message with a text part and an HTML part
/// to `parser_bodystruct` and checks the resulting part tree: one composite
/// node holding two discrete nodes whose bodies are the raw part contents.
#[test]
fn test_multipart() {
    // NOTE(review): the `boundary=` line is a folded continuation of
    // `Content-Type` and therefore has to start with whitespace; the original
    // indentation width was not recoverable, a single space is used here.
    let raw_message: &[u8] = r#"Date: Sat, 8 Jul 2023 07:14:29 +0200
From: Grrrnd Zero <grrrndzero@example.org>
To: John Doe <jdoe@machine.example>
Subject: Re: Saying Hello
Message-ID: <NTAxNzA2AC47634Y366BAMTY4ODc5MzQyODY0ODY5@www.grrrndzero.org>
MIME-Version: 1.0
Content-Type: multipart/alternative;
 boundary="b1_e376dc71bafc953c0b0fdeb9983a9956"
Content-Transfer-Encoding: 7bit

This is a multi-part message in MIME format.

--b1_e376dc71bafc953c0b0fdeb9983a9956
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: quoted-printable

GZ
OoOoO
oOoOoOoOo
oOoOoOoOoOoOoOoOo
oOoOoOoOoOoOoOoOoOoOoOo
oOoOoOoOoOoOoOoOoOoOoOoOoOoOo
OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO

--b1_e376dc71bafc953c0b0fdeb9983a9956
Content-Type: text/html; charset=us-ascii

<div style="text-align: center;"><strong>GZ</strong><br />
OoOoO<br />
oOoOoOoOo<br />
oOoOoOoOoOoOoOoOo<br />
oOoOoOoOoOoOoOoOoOoOoOo<br />
oOoOoOoOoOoOoOoOoOoOoOoOoOoOo<br />
OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO<br />

--b1_e376dc71bafc953c0b0fdeb9983a9956--
"#
    .as_bytes();

    // Expected body of the text/plain part.
    let plain_body: &[u8] = r#"GZ
OoOoO
oOoOoOoOo
oOoOoOoOoOoOoOoOo
oOoOoOoOoOoOoOoOoOoOoOo
oOoOoOoOoOoOoOoOoOoOoOoOoOoOo
OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO"#
        .as_bytes();

    // Expected body of the text/html part.
    let html_body: &[u8] = r#"<div style="text-align: center;"><strong>GZ</strong><br />
OoOoO<br />
oOoOoOoOo<br />
oOoOoOoOoOoOoOoOo<br />
oOoOoOoOoOoOoOoOoOoOoOo<br />
oOoOoOoOoOoOoOoOoOoOoOoOoOoOo<br />
OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO<br />"#
        .as_bytes();

    parser_bodystruct(raw_message, |root| {
        // Part headers are compared against their default value throughout.
        let expected = part::PartNode::Composite(
            part::PartHeader::default(),
            vec![
                part::PartNode::Discrete(part::PartHeader::default(), plain_body),
                part::PartNode::Discrete(part::PartHeader::default(), html_body),
            ],
        );

        assert_eq!(root, &expected);
    });
}
|
Loading…
Reference in a new issue