2023-06-19 10:15:05 +00:00
|
|
|
use std::collections::HashSet;
|
2023-06-19 09:22:51 +00:00
|
|
|
use std::path::PathBuf;
|
|
|
|
use std::fs::File;
|
|
|
|
use std::io::Read;
|
2023-06-19 15:25:16 +00:00
|
|
|
use imf_codec::fragments::header;
|
2023-06-19 09:22:51 +00:00
|
|
|
use walkdir::WalkDir;
|
|
|
|
|
|
|
|
|
|
|
|
/// Regression test over the Enron e-mail corpus.
///
/// Walks every regular file below `resources/enron/maildir/` (resolved against
/// the crate's manifest directory), parses its header section and checks that:
///
/// * a date was extracted (`hdrs.date` is `Some`),
/// * at least one sender was extracted (`hdrs.from` is non-empty),
/// * no header field ended up in `hdrs.bad_fields`.
///
/// Messages listed in `known_bad_from` / `known_bad_fields` are exempt from the
/// corresponding check; the trailing comment on each entry shows the offending
/// header. The date check has no exemptions.
///
/// The test is `#[ignore]`d, so it only runs when requested explicitly
/// (e.g. `cargo test -- --ignored`).
///
/// # Panics
///
/// Panics if a file cannot be inspected or read, if a header section cannot be
/// parsed at all, or if one of the checks above fails for a message that is
/// not listed as known-bad.
#[test]
#[ignore]
fn test_enron500k() {
    let mut maildir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    maildir.push("resources/enron/maildir/");

    // Messages whose headers are known to contain fields that cannot be parsed.
    // Keys are paths relative to `maildir`.
    let known_bad_fields = HashSet::from([
        "white-s/calendar/113.", // To: east <7..>

        "skilling-j/inbox/223.", // From: pep <performance.>

        "jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com>
        "jones-t/notes_inbox/3303.", // To: <"tibor.vizkelety":@enron.com>

        "lokey-t/calendar/33.", // A second Date entry for the calendar containing
                                // Date: Monday, March 12

        "zipper-a/inbox/199.", // To: e-mail <mari.>

        "dasovich-j/deleted_items/128.", // To: f62489 <g>
        "dasovich-j/all_documents/677.", // To: w/assts <govt.>
        "dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com>
        "dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com>
        "dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com>
        "dasovich-j/all_documents/578.", // To: w/assts <govt.>
        "dasovich-j/all_documents/3148.", // To: <"economist.com.readers":@enron.com>
        "dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com>
        "dasovich-j/risk_analytics/3.", // To: w/assts <govt.>
        "dasovich-j/notes_inbox/5391.", // To: <"ft.com.users":@enron.com>
        "dasovich-j/notes_inbox/4952.", // To: <"economist.com.reader":@enron.com>
        "dasovich-j/notes_inbox/2386.", // To: <"ft.com.users":@enron.com>
        "dasovich-j/notes_inbox/1706.", // To: <"ft.com.users":@enron.com>
        "dasovich-j/notes_inbox/1489.", // To: <"economist.com.readers":@enron.com>
        "dasovich-j/notes_inbox/5.", // To: w/assts <govt.>

        "kaminski-v/sites/19.", // To: <"the.desk":@enron.com>
        "kaminski-v/sites/1.", // To: <"the.desk":@enron.com>
        "kaminski-v/discussion_threads/5082.", // To: <"ft.com.users":@enron.com>
        "kaminski-v/discussion_threads/4046.", // To: <"the.desk":@enron.com>
        "kaminski-v/discussion_threads/4187.", // To: <"the.desk":@enron.com>
        "kaminski-v/discussion_threads/8068.", // To: cats <breaktkhrough.>, risk <breakthrough.>, leaders <breaktkhrough.>
        "kaminski-v/discussion_threads/7980.", // To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
        "kaminski-v/all_documents/5970.", // To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
        "kaminski-v/all_documents/5838.", // To + Cc: dogs <breakthrough.>, breakthrough.adm@enron.com, breakthrough.adm@enron.com,\r\n\tbreakthrough.adm@enron.com
        "kaminski-v/all_documents/10070.", // To: <"ft.com.users":@enron.com>
        "kaminski-v/all_documents/92.", // To: <"the.desk":@enron.com>
        "kaminski-v/all_documents/276.", // To: <"the.desk":@enron.com>
        "kaminski-v/technical/1.", // To: <"the.desk":@enron.com>
        "kaminski-v/technical/7.", // To: <"the.desk":@enron.com>
        "kaminski-v/notes_inbox/140.", // To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
        "kaminski-v/notes_inbox/95.", // To + CC failed: cats <breaktkhrough.>, risk <breakthrough.>, leaders <breaktkhrough.>

        "kean-s/archiving/untitled/1232.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
        "kean-s/archiving/untitled/1688.", // To: w/assts <govt.>
        "kean-s/sent/198.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
        "kean-s/reg_risk/9.", // To: w/assts <govt.>
        "kean-s/discussion_threads/950.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
        "kean-s/discussion_threads/577.", // To: w/assts <govt.>
        "kean-s/calendar/untitled/1096.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
        "kean-s/calendar/untitled/640.", // To: w/assts <govt.>
        "kean-s/all_documents/640.", // To: w/assts <govt.>
        "kean-s/all_documents/1095.", // To: w/assts <govt.>
        "kean-s/attachments/2030.", // To: w/assts <govt.>

        "williams-w3/operations_committee_isas/10.", // To: z34655 <m>
    ]);

    // Messages for which no sender can be extracted.
    let known_bad_from = HashSet::from([
        "skilling-j/inbox/223.", // From: pep <performance.>
    ]);

    // Debugging hint: to analyze a single mailbox, walk a sub-directory such as
    // `maildir.join("williams-w3/")` instead; the keys above stay valid because
    // they are always computed relative to `maildir`.
    let mut analyzed = 0usize;
    for entry in WalkDir::new(maildir.as_path()).into_iter().filter_map(|file| file.ok()) {
        let is_file = entry
            .metadata()
            .unwrap_or_else(|e| panic!("cannot stat {}: {:?}", entry.path().display(), e))
            .is_file();
        if !is_file {
            continue;
        }

        // Path relative to the corpus root; this is the key used by the
        // known-bad sets above.
        let mail_path = entry.path();
        let suffix = mail_path
            .strip_prefix(&maildir)
            .expect("every walked entry lives below the corpus root")
            .to_str()
            .expect("corpus paths are expected to be valid UTF-8");

        // Read the whole message into memory.
        let mut raw = Vec::new();
        File::open(mail_path)
            .and_then(|mut f| f.read_to_end(&mut raw))
            .unwrap_or_else(|e| panic!("cannot read {}: {}", suffix, e));

        // Parse the header section. Only the decoded text and the extracted
        // headers are inspected; the other returned values are unused here.
        let (email, _encoding, _malformed) = header::from_bytes(&raw);
        let (_input, hdrs) = header::section(&email).expect("header section must be parseable");

        let ok_date = hdrs.date.is_some();
        let ok_from = hdrs.from.len() > 0;
        let ok_fields = hdrs.bad_fields.len() == 0;

        // Report every problematic message, including known-bad ones, before
        // any assertion fires.
        if !(ok_date && ok_from && ok_fields) {
            println!("Issue with: {}", suffix);
        }

        assert!(ok_date, "no date extracted from {}", suffix);
        assert!(
            ok_from || known_bad_from.contains(suffix),
            "no sender extracted from {}",
            suffix
        );
        assert!(
            ok_fields || known_bad_fields.contains(suffix),
            "unparseable header field(s) in {}",
            suffix
        );

        // Progress indicator for the long run.
        analyzed += 1;
        if analyzed % 1000 == 0 {
            println!("Analyzed emails: {}", analyzed);
        }
    }
}
|