wip enron, todo list

2023-06-19 11:22:51 +02:00 · 2023-06-19 11:22:51 +02:00 · 4fe6c9c8df
commit 4fe6c9c8df
parent cad1ca9978
5 changed files with 132 additions and 15 deletions
--- a/README.md
+++ b/README.md
@ -28,6 +28,16 @@ Current known limitations/bugs:
 ## Design

 Based on nom, a parser combinator lib in Rust.
+multipass parser
+ - extract header block: `&[u8]` (find \r\n\r\n OR \n\n OR \r\r OR \r\n)
+ - decode/convert it with chardetng + encoding\_rs to support latin-1: Cow<str>
+ - extract header lines iter::&str (requires only to search for FWS + obs\_CRLF)
+ - extract header names iter::Name::From(&str)
+ - extract header body iter::Body::From(Vec<MailboxRef>)
+ - extract header section Section
+
+recovery
+ - based on multipass, equivalent to sentinel / synchronization tokens

 ## Testing strategy

@ -40,6 +50,17 @@ Based on nom, a parser combinator lib in Rust.
 Early development. Not ready.
 Do not use it in production or any software at all.

+Todo:
+ - [ ] test over enron dataset
+ - [ ] convert to multipass parser
+ - [ ] implement mime part 3 (encoded headers)
+ - [ ] implement mime part 1 (new headers)
+ - [ ] review part 2 (media types) and part 4 (registration procedure) but might be out of scope
+ - [ ] implement some targeted testing as part of mime part 5
+ - [ ] implement fuzzing through cargo fuzz
+ - [ ] test over other datasets (jpbush, ml, my inbox)
+ - [ ] backport to aerogramme
+
 ## Targeted RFC

 | # | Name |
--- a/src/header.rs
+++ b/src/header.rs
@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use chrono::{DateTime, FixedOffset};
 use nom::{
    IResult,
@ -10,6 +11,9 @@ use nom::{
    sequence::{terminated, preceded, pair, tuple},
 };

+use chardetng::EncodingDetector;
+use encoding_rs::Encoding;
+
 use crate::whitespace::{fws, perm_crlf};
 use crate::words::vchar_seq;
 use crate::misc_token::{phrase, unstructured};
@ -21,10 +25,21 @@ use crate::{datetime, trace, model};

 /// HEADERS

-/// Header section
+/// Detect the charset of `rawmail` (chardetng) and decode it, returning `decode`'s (text, encoding, flag) tuple; callers name the flag `malformed`.
+pub fn from_bytes<'a>(rawmail: &'a [u8]) -> (Cow<'a, str>, &Encoding, bool) {
+    // Feed the whole message to the detector in a single call
+    let mut detector = EncodingDetector::new();
+    detector.feed(&rawmail, true);
+
+    // Take the detector's guess and decode the same bytes with it
+    let enc: &Encoding = detector.guess(None, true);
+    enc.decode(&rawmail)
+}
+
+/// Internal header section
 ///
 /// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2
-pub fn section(input: &str) -> IResult<&str, HeaderSection> {
+pub fn section<'a>(input: &'a str) -> IResult<&'a str, HeaderSection> {
    let (input, headers) = fold_many0(
        alt((known_field, unknown_field, rescue_field)),
        HeaderSection::default,
--- a/src/mailbox.rs
+++ b/src/mailbox.rs
@ -65,7 +65,7 @@ fn obs_domain_list(input: &str) -> IResult<&str, Vec<String>> {
    let (input, head) = preceded(pair(many0(alt((recognize(cfws), tag(",")))), tag("@")), obs_domain)(input)?; 
    let (input, mut rest) = obs_domain_list_rest(input)?;
    rest.insert(0, head);
-    Ok(("", rest))
+    Ok((input, rest))
 }

 fn obs_domain_list_rest(input: &str) -> IResult<&str, Vec<String>> {
--- a/src/parse.rs
+++ b/src/parse.rs
@ -2,29 +2,21 @@ use imf_codec::header;
 use std::io;
 use std::io::Read;

-use chardetng::EncodingDetector;
-use encoding_rs::Encoding;

 fn main() {
    // Read full mail in memory
    let mut rawmail = Vec::new();
    io::stdin().lock().read_to_end(&mut rawmail).unwrap();

-    // Create detector
-    let mut detector = EncodingDetector::new();
-    detector.feed(&rawmail, true);
-    
-    // Get encoding
-    let enc: &Encoding = detector.guess(None, true);
-    let (email, encoding, malformed) = enc.decode(&rawmail);
+    // Decode the raw bytes to text (charset detection now lives in header::from_bytes)
+    let (email, encoding, malformed) = header::from_bytes(&rawmail);
    println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);

-    let (_, hdrs) = header::section(&email).unwrap();
+    let (input, hdrs) = header::section(&email).unwrap(); // NOTE(review): `input` is never read below — presumably the unparsed remainder; confirm whether it should be checked

+    // Debug dump, then sanity checks on the parsed header section
    println!("{:?}", hdrs);
-
    assert!(hdrs.date.is_some());
    assert!(hdrs.from.len() > 0);
    assert!(hdrs.bad_fields.len() == 0);
-
 }
--- a/tests/enron.rs
+++ b/tests/enron.rs
@ -0,0 +1,89 @@
+use std::path::PathBuf;
+use std::fs::File;
+use std::io::Read;
+use imf_codec::header;
+use walkdir::WalkDir;
+
+/// Parse the headers of every file under `resources/enron/maildir/` and assert each has a date, a sender and no bad fields (listed paths are exempted from the last two).
+#[test]
+#[ignore]
+fn test_enron500k() {
+    let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    d.push("resources/enron/maildir/");
+
+    let known_bad_fields = [ // paths allowed to report bad fields; the trailing comment quotes the offending header
+        "maildir/white-s/calendar/113.", // To: east <7..>
+                                         
+        "maildir/skilling-j/inbox/223.", // From: pep <performance.>
+                                         
+        "maildir/jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com>
+        "maildir/jones-t/notes_inbox/3303.", // To: <"tibor.vizkelety":@enron.com>
+                                             
+        "maildir/lokey-t/calendar/33.", // A second Date entry for the calendar containing
+                                        // Date:       Monday, March 12
+                                        
+        "maildir/zipper-a/inbox/199.", // To: e-mail <mari.>
+
+        "maildir/dasovich-j/deleted_items/128.", // To: f62489 <g>
+        "maildir/dasovich-j/all_documents/677.", // To: w/assts <govt.>
+        "maildir/dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com>
+        "maildir/dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com>
+        "maildir/dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com>
+        "maildir/dasovich-j/all_documents/578.", // To: w/assts <govt.>
+        "maildir/dasovich-j/all_documents/3148.", // To: <"economist.com.readers":@enron.com>
+        "maildir/dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com>
+        "maildir/dasovich-j/risk_analytics/3.", // To: w/assts <govt.>
+        "maildir/dasovich-j/notes_inbox/5391.", // To: <"ft.com.users":@enron.com>
+        "maildir/dasovich-j/notes_inbox/4952.", // To: <"economist.com.reader":@enron.com>
+        "maildir/dasovich-j/notes_inbox/2386.", // To: <"ft.com.users":@enron.com>
+        "maildir/dasovich-j/notes_inbox/1706.", // To: <"ft.com.users":@enron.com>
+        "maildir/dasovich-j/notes_inbox/1489.", // To: <"economist.com.readers":@enron.com>
+        "maildir/dasovich-j/notes_inbox/5.", // To: w/assts <govt.>
+    ];
+
+    let known_bad_from = [ // paths allowed to have an empty `from`
+        "maildir/skilling-j/inbox/223.", // From: pep <performance.>
+    ];
+
+    let mut i = 0;
+    for entry in WalkDir::new(d.as_path()).into_iter().filter_map(|file| file.ok()) { // entries that fail to read are silently skipped
+        if entry.metadata().unwrap().is_file() {
+            //@TODO check list
+
+            // read file
+            let mut raw = Vec::new();
+            let mut f = File::open(entry.path()).unwrap();
+            f.read_to_end(&mut raw).unwrap();
+
+            // parse
+            let (email, encoding, malformed) = header::from_bytes(&raw);
+            //println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
+
+            let (input, hdrs) = header::section(&email).unwrap(); // a parse error panics here and aborts the whole walk; `input` is never read
+            //println!("{:?}", hdrs);
+            let ok_date = hdrs.date.is_some();
+            let ok_from = hdrs.from.len() > 0;
+            let ok_fields = hdrs.bad_fields.len() == 0;
+
+            let p = entry.path();
+            if !ok_date || !ok_from || !ok_fields {
+                println!("Issue with: {}", p.display()); // printed before the asserts so the failing path shows up in the output
+            }
+
+            assert!(ok_date); // no exemption list for dates: every mail must have one
+
+            if !known_bad_from.iter().any(|&s| p.ends_with(s)) { // `Path::ends_with` compares whole path components, not a string suffix
+                assert!(ok_from);
+            }
+
+            if !known_bad_fields.iter().any(|&s| p.ends_with(s)) {
+                assert!(ok_fields);
+            }
+
+            i += 1;
+            if i % 1000 == 0 {
+                println!("Analyzed emails: {}", i); // progress marker every 1000 files
+            }
+        } 
+    }
+}