wip enron, todo list

This commit is contained in:
Quentin 2023-06-19 11:22:51 +02:00
parent cad1ca9978
commit 4fe6c9c8df
Signed by: quentin
GPG key ID: E9602264D639FF68
5 changed files with 132 additions and 15 deletions

View file

@ -28,6 +28,16 @@ Current known limitations/bugs:
## Design ## Design
Based on nom, a parser combinator lib in Rust. Based on nom, a parser combinator lib in Rust.
multipass parser
- extract header block: `&[u8]` (find \r\n\r\n OR \n\n OR \r\r OR \r\n)
- decode/convert it with chardetng + encoding\_rs to support latin-1: Cow<str>
- extract header lines iter::&str (only requires searching for FWS + obs\_CRLF)
- extract header names iter::Name::From(&str)
- extract header body iter::Body::From(Vec<MailboxRef>)
- extract header section Section
recovery
- based on multipass, equivalent to sentinel / synchronization tokens
## Testing strategy ## Testing strategy
@ -40,6 +50,17 @@ Based on nom, a parser combinator lib in Rust.
Early development. Not ready. Early development. Not ready.
Do not use it in production or any software at all. Do not use it in production or any software at all.
Todo:
- [ ] test over enron dataset
- [ ] convert to multipass parser
- [ ] implement mime part 3 (encoded headers)
- [ ] implement mime part 1 (new headers)
- [ ] review part 2 (media types) and part 4 (registration procedure) but might be out of scope
- [ ] implement some targeted testing as part of mime part 5
- [ ] implement fuzzing through cargo fuzz
- [ ] test over other datasets (jpbush, ml, my inbox)
- [ ] backport to aerogramme
## Targeted RFC ## Targeted RFC
| # | Name | | # | Name |

View file

@ -1,3 +1,4 @@
use std::borrow::Cow;
use chrono::{DateTime, FixedOffset}; use chrono::{DateTime, FixedOffset};
use nom::{ use nom::{
IResult, IResult,
@ -10,6 +11,9 @@ use nom::{
sequence::{terminated, preceded, pair, tuple}, sequence::{terminated, preceded, pair, tuple},
}; };
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
use crate::whitespace::{fws, perm_crlf}; use crate::whitespace::{fws, perm_crlf};
use crate::words::vchar_seq; use crate::words::vchar_seq;
use crate::misc_token::{phrase, unstructured}; use crate::misc_token::{phrase, unstructured};
@ -21,10 +25,21 @@ use crate::{datetime, trace, model};
/// HEADERS /// HEADERS
/// Header section ///
pub fn from_bytes<'a>(rawmail: &'a [u8]) -> (Cow<'a, str>, &Encoding, bool) {
// Create detector
let mut detector = EncodingDetector::new();
detector.feed(&rawmail, true);
// Get encoding
let enc: &Encoding = detector.guess(None, true);
enc.decode(&rawmail)
}
/// Internal header section
/// ///
/// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2 /// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2
pub fn section(input: &str) -> IResult<&str, HeaderSection> { pub fn section<'a>(input: &'a str) -> IResult<&'a str, HeaderSection> {
let (input, headers) = fold_many0( let (input, headers) = fold_many0(
alt((known_field, unknown_field, rescue_field)), alt((known_field, unknown_field, rescue_field)),
HeaderSection::default, HeaderSection::default,

View file

@ -65,7 +65,7 @@ fn obs_domain_list(input: &str) -> IResult<&str, Vec<String>> {
let (input, head) = preceded(pair(many0(alt((recognize(cfws), tag(",")))), tag("@")), obs_domain)(input)?; let (input, head) = preceded(pair(many0(alt((recognize(cfws), tag(",")))), tag("@")), obs_domain)(input)?;
let (input, mut rest) = obs_domain_list_rest(input)?; let (input, mut rest) = obs_domain_list_rest(input)?;
rest.insert(0, head); rest.insert(0, head);
Ok(("", rest)) Ok((input, rest))
} }
fn obs_domain_list_rest(input: &str) -> IResult<&str, Vec<String>> { fn obs_domain_list_rest(input: &str) -> IResult<&str, Vec<String>> {

View file

@ -2,29 +2,21 @@ use imf_codec::header;
use std::io; use std::io;
use std::io::Read; use std::io::Read;
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
fn main() { fn main() {
// Read full mail in memory // Read full mail in memory
let mut rawmail = Vec::new(); let mut rawmail = Vec::new();
io::stdin().lock().read_to_end(&mut rawmail).unwrap(); io::stdin().lock().read_to_end(&mut rawmail).unwrap();
// Create detector // Parse it
let mut detector = EncodingDetector::new(); let (email, encoding, malformed) = header::from_bytes(&rawmail);
detector.feed(&rawmail, true);
// Get encoding
let enc: &Encoding = detector.guess(None, true);
let (email, encoding, malformed) = enc.decode(&rawmail);
println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed); println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
let (_, hdrs) = header::section(&email).unwrap(); let (input, hdrs) = header::section(&email).unwrap();
// Checks/debug
println!("{:?}", hdrs); println!("{:?}", hdrs);
assert!(hdrs.date.is_some()); assert!(hdrs.date.is_some());
assert!(hdrs.from.len() > 0); assert!(hdrs.from.len() > 0);
assert!(hdrs.bad_fields.len() == 0); assert!(hdrs.bad_fields.len() == 0);
} }

89
tests/enron.rs Normal file
View file

@ -0,0 +1,89 @@
use std::path::PathBuf;
use std::fs::File;
use std::io::Read;
use imf_codec::header;
use walkdir::WalkDir;
/// Parse the header section of every file found in the Enron maildir.
///
/// The dataset is expected under `resources/enron/maildir/` relative to the
/// crate root. The test is `#[ignore]`d, so it only runs when ignored tests
/// are requested explicitly.
///
/// For each file the test checks that a date was parsed, that at least one
/// `From` entry was parsed and that no field ended up in `bad_fields`.
/// Files listed in `known_bad_from` / `known_bad_fields` are exempt from the
/// corresponding check.
///
/// The test fails if no file at all was analyzed, so that a missing dataset
/// is not reported as a success.
#[test]
#[ignore]
fn test_enron500k() {
    let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    d.push("resources/enron/maildir/");

    // Files for which at least one header field is known not to parse.
    let known_bad_fields = [
        "maildir/white-s/calendar/113.", // To: east <7..>
        "maildir/skilling-j/inbox/223.", // From: pep <performance.>
        "maildir/jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com>
        "maildir/jones-t/notes_inbox/3303.", // To: <"tibor.vizkelety":@enron.com>
        "maildir/lokey-t/calendar/33.", // A second Date entry for the calendar containing
                                        // Date: Monday, March 12
        "maildir/zipper-a/inbox/199.", // To: e-mail <mari.>
        "maildir/dasovich-j/deleted_items/128.", // To: f62489 <g>
        "maildir/dasovich-j/all_documents/677.", // To: w/assts <govt.>
        "maildir/dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/578.", // To: w/assts <govt.>
        "maildir/dasovich-j/all_documents/3148.", // To: <"economist.com.readers":@enron.com>
        "maildir/dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com>
        "maildir/dasovich-j/risk_analytics/3.", // To: w/assts <govt.>
        "maildir/dasovich-j/notes_inbox/5391.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/4952.", // To: <"economist.com.reader":@enron.com>
        "maildir/dasovich-j/notes_inbox/2386.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/1706.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/1489.", // To: <"economist.com.readers":@enron.com>
        "maildir/dasovich-j/notes_inbox/5.", // To: w/assts <govt.>
    ];

    // Files for which the From header is known not to parse.
    let known_bad_from = [
        "maildir/skilling-j/inbox/223.", // From: pep <performance.>
    ];

    let mut i: usize = 0;

    // Unreadable entries are still skipped (best effort), but they are
    // reported instead of being dropped silently.
    let entries = WalkDir::new(d.as_path())
        .into_iter()
        .filter_map(|entry| match entry {
            Ok(entry) => Some(entry),
            Err(err) => {
                eprintln!("Skipping unreadable entry: {}", err);
                None
            }
        });

    for entry in entries {
        if !entry.file_type().is_file() {
            continue;
        }
        let p = entry.path();

        //@TODO check list
        // read file
        let mut raw = Vec::new();
        let mut f = File::open(p)
            .unwrap_or_else(|e| panic!("{}: unable to open file: {}", p.display(), e));
        f.read_to_end(&mut raw)
            .unwrap_or_else(|e| panic!("{}: unable to read file: {}", p.display(), e));

        // parse
        let (email, _encoding, _malformed) = header::from_bytes(&raw);
        let (_, hdrs) = header::section(&email)
            .unwrap_or_else(|e| panic!("{}: header section parser failed: {:?}", p.display(), e));

        let ok_date = hdrs.date.is_some();
        let ok_from = hdrs.from.len() > 0;
        let ok_fields = hdrs.bad_fields.len() == 0;

        if !ok_date || !ok_from || !ok_fields {
            println!("Issue with: {}", p.display());
        }

        assert!(ok_date, "{}: no date was parsed", p.display());

        // `Path::ends_with` compares whole path components, so each list
        // entry must match the trailing components of the file path.
        if !known_bad_from.iter().any(|&s| p.ends_with(s)) {
            assert!(ok_from, "{}: no From entry was parsed", p.display());
        }

        if !known_bad_fields.iter().any(|&s| p.ends_with(s)) {
            assert!(ok_fields, "{}: some fields could not be parsed", p.display());
        }

        i += 1;
        if i % 1000 == 0 {
            println!("Analyzed emails: {}", i);
        }
    }

    // Without this check a missing dataset directory yields zero entries
    // and the test would pass without having verified anything.
    assert!(
        i > 0,
        "no email found under {}: is the Enron dataset present?",
        d.display()
    );
}