wip enron, todo list

2023-06-19 11:22:51 +02:00 · 2023-06-19 11:22:51 +02:00 · 4fe6c9c8df
commit 4fe6c9c8df
parent cad1ca9978
5 changed files with 132 additions and 15 deletions
--- a/README.md
+++ b/README.md
@ -28,6 +28,16 @@ Current known limitations/bugs:
 ## Design
 Based on nom, a parser combinator lib in Rust.
 multipass parser
 - extract header block: `&[u8]` (find \r\n\r\n OR \n\n OR \r\r OR \r\n)
 - decode/convert it with chardetng + encoding\_rs to support latin-1: `Cow<str>`
 - extract header lines iter::&str (requires only to search for FWS + obs\_CRLF)
 - extract header names iter::Name::From(&str)
 - extract header body iter::Body::From(Vec<MailboxRef>)
 - extract header section Section
 recovery
 - based on multipass, equivalent to sentinel / synchronization tokens
 ## Testing strategy
@ -40,6 +50,17 @@ Based on nom, a parser combinator lib in Rust.
 Early development. Not ready.
 Do not use it in production or any software at all.
 Todo:
 - [ ] test over enron dataset
 - [ ] convert to multipass parser
 - [ ] implement mime part 3 (encoded headers)
 - [ ] implement mime part 1 (new headers)
 - [ ] review part 2 (media types) and part 4 (registration procedure) but might be out of scope
 - [ ] implement some targeted testing as part of mime part 5
 - [ ] implement fuzzing through cargo fuzz
 - [ ] test over other datasets (jpbush, ml, my inbox)
 - [ ] backport to aerogramme
 ## Targeted RFC
 | # | Name |
--- a/src/header.rs
+++ b/src/header.rs
@ -1,3 +1,4 @@
 use std::borrow::Cow;
 use chrono::{DateTime, FixedOffset};
 use nom::{
    IResult,
@ -10,6 +11,9 @@ use nom::{
    sequence::{terminated, preceded, pair, tuple},
 };
 use chardetng::EncodingDetector;
 use encoding_rs::Encoding;
 use crate::whitespace::{fws, perm_crlf};
 use crate::words::vchar_seq;
 use crate::misc_token::{phrase, unstructured};
@ -21,10 +25,21 @@ use crate::{datetime, trace, model};
 /// HEADERS
-/// Header section
+///
 pub fn from_bytes<'a>(rawmail: &'a [u8]) -> (Cow<'a, str>, &Encoding, bool) {
    // Create detector
    let mut detector = EncodingDetector::new();
    detector.feed(&rawmail, true);
    // Get encoding
    let enc: &Encoding = detector.guess(None, true);
    enc.decode(&rawmail)
 }
 /// Internal header section
 ///
 /// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2
-pub fn section(input: &str) -> IResult<&str, HeaderSection> {
+pub fn section<'a>(input: &'a str) -> IResult<&'a str, HeaderSection> {
    let (input, headers) = fold_many0(
        alt((known_field, unknown_field, rescue_field)),
        HeaderSection::default,
--- a/src/mailbox.rs
+++ b/src/mailbox.rs
@ -65,7 +65,7 @@ fn obs_domain_list(input: &str) -> IResult<&str, Vec<String>> {
    let (input, head) = preceded(pair(many0(alt((recognize(cfws), tag(",")))), tag("@")), obs_domain)(input)?; 
    let (input, mut rest) = obs_domain_list_rest(input)?;
    rest.insert(0, head);
-    Ok(("", rest))
+    Ok((input, rest))
 }
 fn obs_domain_list_rest(input: &str) -> IResult<&str, Vec<String>> {
--- a/src/parse.rs
+++ b/src/parse.rs
@ -2,29 +2,21 @@ use imf_codec::header;
 use std::io;
 use std::io::Read;
 use chardetng::EncodingDetector;
 use encoding_rs::Encoding;
 fn main() {
    // Read full mail in memory
    let mut rawmail = Vec::new();
    io::stdin().lock().read_to_end(&mut rawmail).unwrap();
-    // Create detector
+    // Parse it
-    let mut detector = EncodingDetector::new();
+    let (email, encoding, malformed) = header::from_bytes(&rawmail);
-    detector.feed(&rawmail, true);
-    // Get encoding
-    let enc: &Encoding = detector.guess(None, true);
-    let (email, encoding, malformed) = enc.decode(&rawmail);
    println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
-    let (_, hdrs) = header::section(&email).unwrap();
+    let (input, hdrs) = header::section(&email).unwrap();
    // Checks/debug
    println!("{:?}", hdrs);
    assert!(hdrs.date.is_some());
    assert!(hdrs.from.len() > 0);
    assert!(hdrs.bad_fields.len() == 0);
 }
--- a/tests/enron.rs
+++ b/tests/enron.rs
@ -0,0 +1,89 @@
 use std::path::PathBuf;
 use std::fs::File;
 use std::io::Read;
 use imf_codec::header;
 use walkdir::WalkDir;
 /// Runs the header parser over every mail of the Enron corpus.
 ///
 /// The corpus is read from `resources/enron/maildir/` under the crate root.
 /// The test is marked `#[ignore]`, so it only runs when explicitly requested
 /// (e.g. `cargo test -- --ignored`).
 ///
 /// For each regular file it checks that a `Date` was parsed, that `from` is
 /// not empty and that `bad_fields` is empty; the last two checks are skipped
 /// for the mails listed in the allow-lists below. The test also fails when no
 /// mail at all was found, so a missing dataset cannot pass silently.
 #[test]
 #[ignore]
 fn test_enron500k() {
    let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    d.push("resources/enron/maildir/");
    // Mails for which a non-empty `bad_fields` is tolerated; the trailing
    // comment shows the offending header.
    let known_bad_fields = [
        "maildir/white-s/calendar/113.", // To: east <7..>
        "maildir/skilling-j/inbox/223.", // From: pep <performance.>
        "maildir/jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com>
        "maildir/jones-t/notes_inbox/3303.", // To: <"tibor.vizkelety":@enron.com>
        "maildir/lokey-t/calendar/33.", // A second Date entry for the calendar containing
                                        // Date:       Monday, March 12
        "maildir/zipper-a/inbox/199.", // To: e-mail <mari.>
        "maildir/dasovich-j/deleted_items/128.", // To: f62489 <g>
        "maildir/dasovich-j/all_documents/677.", // To: w/assts <govt.>
        "maildir/dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/578.", // To: w/assts <govt.>
        "maildir/dasovich-j/all_documents/3148.", // To: <"economist.com.readers":@enron.com>
        "maildir/dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com>
        "maildir/dasovich-j/risk_analytics/3.", // To: w/assts <govt.>
        "maildir/dasovich-j/notes_inbox/5391.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/4952.", // To: <"economist.com.reader":@enron.com>
        "maildir/dasovich-j/notes_inbox/2386.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/1706.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/1489.", // To: <"economist.com.readers":@enron.com>
        "maildir/dasovich-j/notes_inbox/5.", // To: w/assts <govt.>
    ];
    // Mails for which an empty `from` is tolerated.
    let known_bad_from = [
        "maildir/skilling-j/inbox/223.", // From: pep <performance.>
    ];
    let mut analyzed: usize = 0;
    for entry in WalkDir::new(d.as_path()).into_iter().filter_map(|file| file.ok()) {
        // Only regular files are mails; directories are skipped.
        if !entry.file_type().is_file() {
            continue;
        }
        //@TODO check list
        let p = entry.path();
        // read file
        let mut raw = Vec::new();
        let mut f = File::open(p)
            .unwrap_or_else(|e| panic!("Unable to open {}: {}", p.display(), e));
        f.read_to_end(&mut raw)
            .unwrap_or_else(|e| panic!("Unable to read {}: {}", p.display(), e));
        // parse
        let (email, _encoding, _malformed) = header::from_bytes(&raw);
        let (_rest, hdrs) = header::section(&email)
            .unwrap_or_else(|e| panic!("Unable to parse headers of {}: {:?}", p.display(), e));
        let ok_date = hdrs.date.is_some();
        let ok_from = hdrs.from.len() > 0;
        let ok_fields = hdrs.bad_fields.len() == 0;
        if !ok_date || !ok_from || !ok_fields {
            println!("Issue with: {}", p.display());
        }
        // Every failure names the mail so that it can be inspected directly.
        assert!(ok_date, "no Date parsed in {}", p.display());
        if !known_bad_from.iter().any(|&s| p.ends_with(s)) {
            assert!(ok_from, "no From parsed in {}", p.display());
        }
        if !known_bad_fields.iter().any(|&s| p.ends_with(s)) {
            assert!(ok_fields, "unexpected bad fields in {}", p.display());
        }
        analyzed += 1;
        if analyzed % 1000 == 0 {
            println!("Analyzed emails: {}", analyzed);
        }
    }
    // Walk errors are dropped above, so an absent corpus yields zero entries:
    // make that case fail instead of passing without checking anything.
    assert!(analyzed > 0, "no email found under {}", d.display());
 }