wip enron, todo list
This commit is contained in:
parent
cad1ca9978
commit
4fe6c9c8df
5 changed files with 132 additions and 15 deletions
21
README.md
21
README.md
|
@ -28,6 +28,16 @@ Current known limitations/bugs:
|
|||
## Design
|
||||
|
||||
Based on nom, a parser combinator lib in Rust.
|
||||
multipass parser
|
||||
- extract header block: `&[u8]` (find \r\n\r\n OR \n\n OR \r\r OR \r\n)
|
||||
- decode/convert it with chardetng + encoding\_rs to support latin-1: Cow<str>
|
||||
- extract header lines iter::&str (requires only to search for FWS + obs\_CRLF)
|
||||
- extract header names iter::Name::From(&str)
|
||||
- extract header body iter::Body::From(Vec<MailboxRef>)
|
||||
- extract header section Section
|
||||
|
||||
recovery
|
||||
- based on multipass, equivalent to sentinel / synchronization tokens
|
||||
|
||||
## Testing strategy
|
||||
|
||||
|
@ -40,6 +50,17 @@ Based on nom, a parser combinator lib in Rust.
|
|||
Early development. Not ready.
|
||||
Do not use it in production or in any other software at all.
|
||||
|
||||
Todo:
|
||||
- [ ] test over enron dataset
|
||||
- [ ] convert to multipass parser
|
||||
- [ ] implement mime part 3 (encoded headers)
|
||||
- [ ] implement mime part 1 (new headers)
|
||||
- [ ] review part 2 (media types) and part 4 (registration procedure) but might be out of scope
|
||||
- [ ] implement some targeted testing as part of mime part 5
|
||||
- [ ] implement fuzzing through cargo fuzz
|
||||
- [ ] test over other datasets (jpbush, ml, my inbox)
|
||||
- [ ] backport to aerogramme
|
||||
|
||||
## Targeted RFC
|
||||
|
||||
| # | Name |
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
use std::borrow::Cow;
|
||||
use chrono::{DateTime, FixedOffset};
|
||||
use nom::{
|
||||
IResult,
|
||||
|
@ -10,6 +11,9 @@ use nom::{
|
|||
sequence::{terminated, preceded, pair, tuple},
|
||||
};
|
||||
|
||||
use chardetng::EncodingDetector;
|
||||
use encoding_rs::Encoding;
|
||||
|
||||
use crate::whitespace::{fws, perm_crlf};
|
||||
use crate::words::vchar_seq;
|
||||
use crate::misc_token::{phrase, unstructured};
|
||||
|
@ -21,10 +25,21 @@ use crate::{datetime, trace, model};
|
|||
|
||||
/// HEADERS
|
||||
|
||||
/// Header section
|
||||
///
|
||||
pub fn from_bytes<'a>(rawmail: &'a [u8]) -> (Cow<'a, str>, &Encoding, bool) {
|
||||
// Create detector
|
||||
let mut detector = EncodingDetector::new();
|
||||
detector.feed(&rawmail, true);
|
||||
|
||||
// Get encoding
|
||||
let enc: &Encoding = detector.guess(None, true);
|
||||
enc.decode(&rawmail)
|
||||
}
|
||||
|
||||
/// Internal header section
|
||||
///
|
||||
/// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2
|
||||
pub fn section(input: &str) -> IResult<&str, HeaderSection> {
|
||||
pub fn section<'a>(input: &'a str) -> IResult<&'a str, HeaderSection> {
|
||||
let (input, headers) = fold_many0(
|
||||
alt((known_field, unknown_field, rescue_field)),
|
||||
HeaderSection::default,
|
||||
|
|
|
@ -65,7 +65,7 @@ fn obs_domain_list(input: &str) -> IResult<&str, Vec<String>> {
|
|||
let (input, head) = preceded(pair(many0(alt((recognize(cfws), tag(",")))), tag("@")), obs_domain)(input)?;
|
||||
let (input, mut rest) = obs_domain_list_rest(input)?;
|
||||
rest.insert(0, head);
|
||||
Ok(("", rest))
|
||||
Ok((input, rest))
|
||||
}
|
||||
|
||||
fn obs_domain_list_rest(input: &str) -> IResult<&str, Vec<String>> {
|
||||
|
|
16
src/parse.rs
16
src/parse.rs
|
@ -2,29 +2,21 @@ use imf_codec::header;
|
|||
use std::io;
|
||||
use std::io::Read;
|
||||
|
||||
use chardetng::EncodingDetector;
|
||||
use encoding_rs::Encoding;
|
||||
|
||||
fn main() {
|
||||
// Read full mail in memory
|
||||
let mut rawmail = Vec::new();
|
||||
io::stdin().lock().read_to_end(&mut rawmail).unwrap();
|
||||
|
||||
// Create detector
|
||||
let mut detector = EncodingDetector::new();
|
||||
detector.feed(&rawmail, true);
|
||||
|
||||
// Get encoding
|
||||
let enc: &Encoding = detector.guess(None, true);
|
||||
let (email, encoding, malformed) = enc.decode(&rawmail);
|
||||
// Parse it
|
||||
let (email, encoding, malformed) = header::from_bytes(&rawmail);
|
||||
println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
|
||||
|
||||
let (_, hdrs) = header::section(&email).unwrap();
|
||||
let (input, hdrs) = header::section(&email).unwrap();
|
||||
|
||||
// Checks/debug
|
||||
println!("{:?}", hdrs);
|
||||
|
||||
assert!(hdrs.date.is_some());
|
||||
assert!(hdrs.from.len() > 0);
|
||||
assert!(hdrs.bad_fields.len() == 0);
|
||||
|
||||
}
|
||||
|
|
89
tests/enron.rs
Normal file
89
tests/enron.rs
Normal file
|
@ -0,0 +1,89 @@
|
|||
use std::path::PathBuf;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use imf_codec::header;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
|
||||
/// Parse the header section of every file found under
/// `resources/enron/maildir/` (relative to the crate manifest directory).
///
/// For each mail the test requires:
/// - a `date` header to be present,
/// - a non-empty `from`, unless the file is listed in `known_bad_from`,
/// - no `bad_fields`, unless the file is listed in `known_bad_fields`.
///
/// Marked `#[ignore]`, so it only runs when explicitly requested.
#[test]
#[ignore]
fn test_enron500k() {
    let maildir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("resources/enron/maildir/");

    // Mails known to contain at least one field that cannot be parsed.
    let known_bad_fields = [
        "maildir/white-s/calendar/113.", // To: east <7..>

        "maildir/skilling-j/inbox/223.", // From: pep <performance.>

        "maildir/jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com>
        "maildir/jones-t/notes_inbox/3303.", // To: <"tibor.vizkelety":@enron.com>

        "maildir/lokey-t/calendar/33.", // A second Date entry for the calendar containing
                                        // Date: Monday, March 12

        "maildir/zipper-a/inbox/199.", // To: e-mail <mari.>

        "maildir/dasovich-j/deleted_items/128.", // To: f62489 <g>
        "maildir/dasovich-j/all_documents/677.", // To: w/assts <govt.>
        "maildir/dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/578.", // To: w/assts <govt.>
        "maildir/dasovich-j/all_documents/3148.", // To: <"economist.com.readers":@enron.com>
        "maildir/dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com>
        "maildir/dasovich-j/risk_analytics/3.", // To: w/assts <govt.>
        "maildir/dasovich-j/notes_inbox/5391.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/4952.", // To: <"economist.com.reader":@enron.com>
        "maildir/dasovich-j/notes_inbox/2386.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/1706.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/1489.", // To: <"economist.com.readers":@enron.com>
        "maildir/dasovich-j/notes_inbox/5.", // To: w/assts <govt.>
    ];

    // Mails whose From header is known to be unparsable.
    let known_bad_from = [
        "maildir/skilling-j/inbox/223.", // From: pep <performance.>
    ];

    let mut analyzed = 0;
    for entry in WalkDir::new(maildir.as_path()).into_iter().filter_map(Result::ok) {
        // Only regular files are mails; skip everything else.
        if !entry.metadata().unwrap().is_file() {
            continue;
        }
        //@TODO check list

        // Load the whole mail in memory.
        let mut raw = Vec::new();
        File::open(entry.path())
            .unwrap()
            .read_to_end(&mut raw)
            .unwrap();

        // Decode, then parse the header section. The detected encoding, the
        // malformed flag and the unparsed remainder are not checked here.
        let (email, _encoding, _malformed) = header::from_bytes(&raw);
        let (_rest, hdrs) = header::section(&email).unwrap();

        let ok_date = hdrs.date.is_some();
        let ok_from = hdrs.from.len() > 0;
        let ok_fields = hdrs.bad_fields.len() == 0;

        let p = entry.path();
        // True when the current path ends with one of the listed suffixes.
        let is_listed = |list: &[&str]| list.iter().any(|&s| p.ends_with(s));

        // Report the offending file before any assertion can abort the run.
        if !(ok_date && ok_from && ok_fields) {
            println!("Issue with: {}", p.display());
        }

        assert!(ok_date);

        if !is_listed(&known_bad_from) {
            assert!(ok_from);
        }

        if !is_listed(&known_bad_fields) {
            assert!(ok_fields);
        }

        // Progress report every thousand mails.
        analyzed += 1;
        if analyzed % 1000 == 0 {
            println!("Analyzed emails: {}", analyzed);
        }
    }
}
|
Loading…
Reference in a new issue