wip enron, todo list

This commit is contained in:
Quentin 2023-06-19 11:22:51 +02:00
parent cad1ca9978
commit 4fe6c9c8df
Signed by: quentin
GPG key ID: E9602264D639FF68
5 changed files with 132 additions and 15 deletions

View file

@ -28,6 +28,16 @@ Current known limitations/bugs:
## Design
Based on nom, a parser combinator lib in Rust.
multipass parser
- extract header block: `&[u8]` (find \r\n\r\n OR \n\n OR \r\r OR \r\n)
- decode/convert it with chardetng + encoding\_rs to support latin-1: `Cow<str>`
- extract header lines iter::&str (requires only to search for FWS + obs\_CRLF)
- extract header names iter::Name::From(&str)
- extract header body iter::Body::From(Vec<MailboxRef>)
- extract header section Section
recovery
- based on multipass, equivalent to sentinel / synchronization tokens
## Testing strategy
@ -40,6 +50,17 @@ Based on nom, a parser combinator lib in Rust.
Early development. Not ready.
Do not use it in production or any software at all.
Todo:
- [ ] test over enron dataset
- [ ] convert to multipass parser
- [ ] implement mime part 3 (encoded headers)
- [ ] implement mime part 1 (new headers)
- [ ] review mime part 2 (media types) and part 4 (registration procedure), although they might be out of scope
- [ ] implement some targeted testing as part of mime part 5
- [ ] implement fuzzing through cargo fuzz
- [ ] test over other datasets (jpbush, ml, my inbox)
- [ ] backport to aerogramme
## Targeted RFC
| # | Name |

View file

@ -1,3 +1,4 @@
use std::borrow::Cow;
use chrono::{DateTime, FixedOffset};
use nom::{
IResult,
@ -10,6 +11,9 @@ use nom::{
sequence::{terminated, preceded, pair, tuple},
};
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
use crate::whitespace::{fws, perm_crlf};
use crate::words::vchar_seq;
use crate::misc_token::{phrase, unstructured};
@ -21,10 +25,21 @@ use crate::{datetime, trace, model};
/// HEADERS
/// Header section
///
pub fn from_bytes<'a>(rawmail: &'a [u8]) -> (Cow<'a, str>, &Encoding, bool) {
// Create detector
let mut detector = EncodingDetector::new();
detector.feed(&rawmail, true);
// Get encoding
let enc: &Encoding = detector.guess(None, true);
enc.decode(&rawmail)
}
/// Internal header section
///
/// See: https://www.rfc-editor.org/rfc/rfc5322.html#section-2.2
pub fn section(input: &str) -> IResult<&str, HeaderSection> {
pub fn section<'a>(input: &'a str) -> IResult<&'a str, HeaderSection> {
let (input, headers) = fold_many0(
alt((known_field, unknown_field, rescue_field)),
HeaderSection::default,

View file

@ -65,7 +65,7 @@ fn obs_domain_list(input: &str) -> IResult<&str, Vec<String>> {
let (input, head) = preceded(pair(many0(alt((recognize(cfws), tag(",")))), tag("@")), obs_domain)(input)?;
let (input, mut rest) = obs_domain_list_rest(input)?;
rest.insert(0, head);
Ok(("", rest))
Ok((input, rest))
}
fn obs_domain_list_rest(input: &str) -> IResult<&str, Vec<String>> {

View file

@ -2,29 +2,21 @@ use imf_codec::header;
use std::io;
use std::io::Read;
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
fn main() {
// Read full mail in memory
let mut rawmail = Vec::new();
io::stdin().lock().read_to_end(&mut rawmail).unwrap();
// Create detector
let mut detector = EncodingDetector::new();
detector.feed(&rawmail, true);
// Get encoding
let enc: &Encoding = detector.guess(None, true);
let (email, encoding, malformed) = enc.decode(&rawmail);
// Parse it
let (email, encoding, malformed) = header::from_bytes(&rawmail);
println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
let (_, hdrs) = header::section(&email).unwrap();
let (input, hdrs) = header::section(&email).unwrap();
// Checks/debug
println!("{:?}", hdrs);
assert!(hdrs.date.is_some());
assert!(hdrs.from.len() > 0);
assert!(hdrs.bad_fields.len() == 0);
}

89
tests/enron.rs Normal file
View file

@ -0,0 +1,89 @@
use std::path::PathBuf;
use std::fs::File;
use std::io::Read;
use imf_codec::header;
use walkdir::WalkDir;
/// Parses the header section of every file found under
/// `resources/enron/maildir/` (relative to the crate manifest directory).
///
/// For each file the test requires a parsed date, at least one `from`
/// entry and no bad fields, except for the files listed in
/// `known_bad_from` / `known_bad_fields`. Every failure message names the
/// offending file.
///
/// Marked `#[ignore]`, so it only runs when explicitly requested.
#[test]
#[ignore]
fn test_enron500k() {
    let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    d.push("resources/enron/maildir/");

    let known_bad_fields = [
        "maildir/white-s/calendar/113.", // To: east <7..>
        "maildir/skilling-j/inbox/223.", // From: pep <performance.>
        "maildir/jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com>
        "maildir/jones-t/notes_inbox/3303.", // To: <"tibor.vizkelety":@enron.com>
        "maildir/lokey-t/calendar/33.", // A second Date entry for the calendar containing
                                        // Date: Monday, March 12
        "maildir/zipper-a/inbox/199.", // To: e-mail <mari.>
        "maildir/dasovich-j/deleted_items/128.", // To: f62489 <g>
        "maildir/dasovich-j/all_documents/677.", // To: w/assts <govt.>
        "maildir/dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/all_documents/578.", // To: w/assts <govt.>
        "maildir/dasovich-j/all_documents/3148.", // To: <"economist.com.readers":@enron.com>
        "maildir/dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com>
        "maildir/dasovich-j/risk_analytics/3.", // To: w/assts <govt.>
        "maildir/dasovich-j/notes_inbox/5391.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/4952.", // To: <"economist.com.reader":@enron.com>
        "maildir/dasovich-j/notes_inbox/2386.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/1706.", // To: <"ft.com.users":@enron.com>
        "maildir/dasovich-j/notes_inbox/1489.", // To: <"economist.com.readers":@enron.com>
        "maildir/dasovich-j/notes_inbox/5.", // To: w/assts <govt.>
    ];

    let known_bad_from = [
        "maildir/skilling-j/inbox/223.", // From: pep <performance.>
    ];

    let mut i = 0;
    for entry in WalkDir::new(d.as_path()).into_iter().filter_map(|file| file.ok()) {
        // Only regular files are emails; skip directories and the like.
        if !entry.metadata().unwrap().is_file() {
            continue;
        }
        let p = entry.path();
        //@TODO check list

        // read file
        let mut raw = Vec::new();
        let mut f = File::open(p)
            .unwrap_or_else(|e| panic!("unable to open {}: {:?}", p.display(), e));
        f.read_to_end(&mut raw)
            .unwrap_or_else(|e| panic!("unable to read {}: {:?}", p.display(), e));

        // parse
        let (email, _encoding, _malformed) = header::from_bytes(&raw);
        //println!("Encoding: {:?}, Malformed: {:?}", _encoding, _malformed);

        let (_, hdrs) = header::section(&email)
            .unwrap_or_else(|e| panic!("unable to parse headers of {}: {:?}", p.display(), e));
        //println!("{:?}", hdrs);

        let ok_date = hdrs.date.is_some();
        let ok_from = hdrs.from.len() > 0;
        let ok_fields = hdrs.bad_fields.len() == 0;

        if !ok_date || !ok_from || !ok_fields {
            println!("Issue with: {}", p.display());
        }

        assert!(ok_date, "no date parsed in {}", p.display());

        // `Path::ends_with` compares whole path components, so each entry
        // above must match the trailing components of the file path.
        if !known_bad_from.iter().any(|&s| p.ends_with(s)) {
            assert!(ok_from, "no from parsed in {}", p.display());
        }

        if !known_bad_fields.iter().any(|&s| p.ends_with(s)) {
            assert!(ok_fields, "bad fields found in {}", p.display());
        }

        i += 1;
        if i % 1000 == 0 {
            println!("Analyzed emails: {}", i);
        }
    }
}