wip enron, todo list
This commit is contained in:
parent
cad1ca9978
commit
4fe6c9c8df
5 changed files with 132 additions and 15 deletions
21
README.md
21
README.md
|
@ -28,6 +28,16 @@ Current known limitations/bugs:
|
||||||
## Design
|
## Design
|
||||||
|
|
||||||
Based on nom, a parser combinator lib in Rust.
|
Based on nom, a parser combinator lib in Rust.
|
||||||
|
multipass parser
|
||||||
|
- extract header block: `&[u8]` (find \r\n\r\n OR \n\n OR \r\r OR \r\n)
|
||||||
|
- decode/convert it with chardetng + encoding\_rs to support latin-1: `Cow<str>`
|
||||||
|
- extract header lines iter::&str (only requires searching for FWS + obs\_CRLF)
|
||||||
|
- extract header names iter::Name::From(&str)
|
||||||
|
- extract header body iter::Body::From(Vec<MailboxRef>)
|
||||||
|
- extract header section Section
|
||||||
|
|
||||||
|
recovery
|
||||||
|
- based on multipass, equivalent to sentinel / synchronization tokens
|
||||||
|
|
||||||
## Testing strategy
|
## Testing strategy
|
||||||
|
|
||||||
|
@ -40,6 +50,17 @@ Based on nom, a parser combinator lib in Rust.
|
||||||
Early development. Not ready.
|
Early development. Not ready.
|
||||||
Do not use it in production or any software at all.
|
Do not use it in production or any software at all.
|
||||||
|
|
||||||
|
Todo:
|
||||||
|
- [ ] test over enron dataset
|
||||||
|
- [ ] convert to multipass parser
|
||||||
|
- [ ] implement mime part 3 (encoded headers)
|
||||||
|
- [ ] implement mime part 1 (new headers)
|
||||||
|
- [ ] review part 2 (media types) and part 4 (registration procedure) but might be out of scope
|
||||||
|
- [ ] implement some targeted testing as part of mime part 5
|
||||||
|
- [ ] implement fuzzing through cargo fuzz
|
||||||
|
- [ ] test over other datasets (jpbush, ml, my inbox)
|
||||||
|
- [ ] backport to aerogramme
|
||||||
|
|
||||||
## Targeted RFC
|
## Targeted RFC
|
||||||
|
|
||||||
| # | Name |
|
| # | Name |
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
use std::borrow::Cow;
|
||||||
use chrono::{DateTime, FixedOffset};
|
use chrono::{DateTime, FixedOffset};
|
||||||
use nom::{
|
use nom::{
|
||||||
IResult,
|
IResult,
|
||||||
|
@ -10,6 +11,9 @@ use nom::{
|
||||||
sequence::{terminated, preceded, pair, tuple},
|
sequence::{terminated, preceded, pair, tuple},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
use chardetng::EncodingDetector;
|
||||||
|
use encoding_rs::Encoding;
|
||||||
|
|
||||||
use crate::whitespace::{fws, perm_crlf};
|
use crate::whitespace::{fws, perm_crlf};
|
||||||
use crate::words::vchar_seq;
|
use crate::words::vchar_seq;
|
||||||
use crate::misc_token::{phrase, unstructured};
|
use crate::misc_token::{phrase, unstructured};
|
||||||
|
@ -21,10 +25,21 @@ use crate::{datetime, trace, model};
|
||||||
|
|
||||||
/// HEADERS
|
/// HEADERS
|
||||||
|
|
||||||
/// Header section
|
///
|
||||||
|
pub fn from_bytes<'a>(rawmail: &'a [u8]) -> (Cow<'a, str>, &Encoding, bool) {
|
||||||
|
// Create detector
|
||||||
|
let mut detector = EncodingDetector::new();
|
||||||
|
detector.feed(&rawmail, true);
|
||||||
|
|
||||||
|
// Get encoding
|
||||||
|
let enc: &Encoding = detector.guess(None, true);
|
||||||
|
enc.decode(&rawmail)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Internal header section
|
||||||
///
|
///
|
||||||
/// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2
|
/// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2
|
||||||
pub fn section(input: &str) -> IResult<&str, HeaderSection> {
|
pub fn section<'a>(input: &'a str) -> IResult<&'a str, HeaderSection> {
|
||||||
let (input, headers) = fold_many0(
|
let (input, headers) = fold_many0(
|
||||||
alt((known_field, unknown_field, rescue_field)),
|
alt((known_field, unknown_field, rescue_field)),
|
||||||
HeaderSection::default,
|
HeaderSection::default,
|
||||||
|
|
|
@ -65,7 +65,7 @@ fn obs_domain_list(input: &str) -> IResult<&str, Vec<String>> {
|
||||||
let (input, head) = preceded(pair(many0(alt((recognize(cfws), tag(",")))), tag("@")), obs_domain)(input)?;
|
let (input, head) = preceded(pair(many0(alt((recognize(cfws), tag(",")))), tag("@")), obs_domain)(input)?;
|
||||||
let (input, mut rest) = obs_domain_list_rest(input)?;
|
let (input, mut rest) = obs_domain_list_rest(input)?;
|
||||||
rest.insert(0, head);
|
rest.insert(0, head);
|
||||||
Ok(("", rest))
|
Ok((input, rest))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn obs_domain_list_rest(input: &str) -> IResult<&str, Vec<String>> {
|
fn obs_domain_list_rest(input: &str) -> IResult<&str, Vec<String>> {
|
||||||
|
|
16
src/parse.rs
16
src/parse.rs
|
@ -2,29 +2,21 @@ use imf_codec::header;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
|
|
||||||
use chardetng::EncodingDetector;
|
|
||||||
use encoding_rs::Encoding;
|
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
// Read full mail in memory
|
// Read full mail in memory
|
||||||
let mut rawmail = Vec::new();
|
let mut rawmail = Vec::new();
|
||||||
io::stdin().lock().read_to_end(&mut rawmail).unwrap();
|
io::stdin().lock().read_to_end(&mut rawmail).unwrap();
|
||||||
|
|
||||||
// Create detector
|
// Parse it
|
||||||
let mut detector = EncodingDetector::new();
|
let (email, encoding, malformed) = header::from_bytes(&rawmail);
|
||||||
detector.feed(&rawmail, true);
|
|
||||||
|
|
||||||
// Get encoding
|
|
||||||
let enc: &Encoding = detector.guess(None, true);
|
|
||||||
let (email, encoding, malformed) = enc.decode(&rawmail);
|
|
||||||
println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
|
println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
|
||||||
|
|
||||||
let (_, hdrs) = header::section(&email).unwrap();
|
let (input, hdrs) = header::section(&email).unwrap();
|
||||||
|
|
||||||
|
// Checks/debug
|
||||||
println!("{:?}", hdrs);
|
println!("{:?}", hdrs);
|
||||||
|
|
||||||
assert!(hdrs.date.is_some());
|
assert!(hdrs.date.is_some());
|
||||||
assert!(hdrs.from.len() > 0);
|
assert!(hdrs.from.len() > 0);
|
||||||
assert!(hdrs.bad_fields.len() == 0);
|
assert!(hdrs.bad_fields.len() == 0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
89
tests/enron.rs
Normal file
89
tests/enron.rs
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::Read;
|
||||||
|
use imf_codec::header;
|
||||||
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
|
|
||||||
|
/// Parses the header section of every file found under
/// `resources/enron/maildir/` (relative to the crate root) and checks that:
///
/// - a `Date` header was parsed for every message,
/// - at least one `From` entry was parsed, except for the messages listed in
///   `known_bad_from`,
/// - no header ended up in `bad_fields`, except for the messages listed in
///   `known_bad_fields`.
///
/// The test is marked `#[ignore]`, so it only runs when explicitly requested
/// (e.g. `cargo test -- --ignored`).
#[test]
#[ignore]
fn test_enron500k() {
    let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    d.push("resources/enron/maildir/");

    // Messages for which at least one malformed header field is expected.
    let known_bad_fields = [
        "maildir/white-s/calendar/113.", // To: east <7..>

        "maildir/skilling-j/inbox/223.", // From: pep <performance.>

        "maildir/jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com>
        "maildir/jones-t/notes_inbox/3303.", // To: <"tibor.vizkelety":@enron.com>

        "maildir/lokey-t/calendar/33.", // A second Date entry for the calendar containing
                                        // Date: Monday, March 12

        "maildir/zipper-a/inbox/199.", // To: e-mail <mari.>

        "maildir/dasovich-j/deleted_items/128.", // To: f62489 <g>
        "maildir/dasovich-j/all_documents/677.", // To: w/assts <govt.>
        "maildir/dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/578.", // To: w/assts <govt.>
        "maildir/dasovich-j/all_documents/3148.", // To: <"economist.com.readers":@enron.com>
        "maildir/dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com>
        "maildir/dasovich-j/risk_analytics/3.", // To: w/assts <govt.>
        "maildir/dasovich-j/notes_inbox/5391.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/4952.", // To: <"economist.com.reader":@enron.com>
        "maildir/dasovich-j/notes_inbox/2386.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/1706.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/1489.", // To: <"economist.com.readers":@enron.com>
        "maildir/dasovich-j/notes_inbox/5.", // To: w/assts <govt.>
    ];

    // Messages for which no usable From entry is expected.
    let known_bad_from = [
        "maildir/skilling-j/inbox/223.", // From: pep <performance.>
    ];

    let mut i = 0;
    for entry in WalkDir::new(d.as_path()).into_iter().filter_map(|file| file.ok()) {
        // The file type comes with the directory entry, so unlike
        // `metadata()` there is no error to unwrap here.
        if !entry.file_type().is_file() {
            continue;
        }
        //@TODO check list

        let p = entry.path();

        // read file
        let mut raw = Vec::new();
        let mut f = File::open(p)
            .unwrap_or_else(|e| panic!("unable to open {}: {}", p.display(), e));
        f.read_to_end(&mut raw)
            .unwrap_or_else(|e| panic!("unable to read {}: {}", p.display(), e));

        // parse
        let (email, _encoding, _malformed) = header::from_bytes(&raw);
        //println!("Encoding: {:?}, Malformed: {:?}", _encoding, _malformed);

        // Name the file on failure: with this many inputs a bare `unwrap()`
        // gives no way to find the message that broke the parser.
        let (_input, hdrs) = header::section(&email)
            .unwrap_or_else(|e| panic!("header parsing failed for {}: {:?}", p.display(), e));
        //println!("{:?}", hdrs);

        let ok_date = hdrs.date.is_some();
        let ok_from = hdrs.from.len() > 0;
        let ok_fields = hdrs.bad_fields.len() == 0;

        // Also reported for known-bad messages, which do not fail the test.
        if !ok_date || !ok_from || !ok_fields {
            println!("Issue with: {}", p.display());
        }

        assert!(ok_date, "no Date header parsed in {}", p.display());

        if !known_bad_from.iter().any(|&s| p.ends_with(s)) {
            assert!(ok_from, "no From entry parsed in {}", p.display());
        }

        if !known_bad_fields.iter().any(|&s| p.ends_with(s)) {
            assert!(ok_fields, "unexpected bad header fields in {}", p.display());
        }

        // Progress report every 1000 messages.
        i += 1;
        if i % 1000 == 0 {
            println!("Analyzed emails: {}", i);
        }
    }
}
|
Loading…
Reference in a new issue