validate enron
This commit is contained in:
parent
849c741be9
commit
98f42d3efc
2 changed files with 24 additions and 7 deletions
27
README.md
27
README.md
|
@ -41,11 +41,26 @@ recovery
|
||||||
|
|
||||||
## Testing strategy
|
## Testing strategy
|
||||||
|
|
||||||
- Unit testing: parser combinator independently (done)
|
imf-codec aims to be tested as much as possible against real data.
|
||||||
- Selected full emails (done)
|
|
||||||
- Enron 500k (done)
|
### Unit testing: parser combinator independently (done)
|
||||||
- Fuzzing (expected)
|
|
||||||
- Across reference IMAP servers (dovecot, cyrus) (expected)
|
### Selected full emails (expected)
|
||||||
|
|
||||||
|
### Existing datasets
|
||||||
|
|
||||||
|
**Enron 500k** - Took 20 minutes to parse ~517k emails and check that
|
||||||
|
RFC5322 headers (From, To, Cc, etc.) are correctly parsed.
|
||||||
|
From this list, we had to exclude ~50 emails on which
|
||||||
|
the From/To/Cc fields were completely malformed; however, while
|
||||||
|
some fields failed to parse, the parser did not crash and
|
||||||
|
parsed the other fields of the email correctly.
|
||||||
|
|
||||||
|
Planned: jpbush, my inbox, etc.
|
||||||
|
|
||||||
|
### Fuzzing (expected)
|
||||||
|
|
||||||
|
### Across reference IMAP servers (dovecot, cyrus) (expected)
|
||||||
|
|
||||||
## Development status
|
## Development status
|
||||||
|
|
||||||
|
@ -53,7 +68,7 @@ Early development. Not ready.
|
||||||
Do not use it in production or any software at all.
|
Do not use it in production or any software at all.
|
||||||
|
|
||||||
Todo:
|
Todo:
|
||||||
- [ ] test over the enron dataset
|
- [X] test over the enron dataset
|
||||||
- [ ] convert to multipass parser
|
- [ ] convert to multipass parser
|
||||||
- [ ] implement mime part 3 (encoded headers)
|
- [ ] implement mime part 3 (encoded headers)
|
||||||
- [ ] implement mime part 1 (new headers)
|
- [ ] implement mime part 1 (new headers)
|
||||||
|
|
|
@ -12,7 +12,7 @@ fn test_enron500k() {
|
||||||
let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||||
d.push("resources/enron/maildir/");
|
d.push("resources/enron/maildir/");
|
||||||
let prefix_sz = d.as_path().to_str().unwrap().len();
|
let prefix_sz = d.as_path().to_str().unwrap().len();
|
||||||
//d.push("kean-s/");
|
//d.push("williams-w3/");
|
||||||
|
|
||||||
let known_bad_fields = HashSet::from([
|
let known_bad_fields = HashSet::from([
|
||||||
"white-s/calendar/113.", // To: east <7..>
|
"white-s/calendar/113.", // To: east <7..>
|
||||||
|
@ -71,6 +71,8 @@ fn test_enron500k() {
|
||||||
"kean-s/all_documents/640.", // To: w/assts <govt.>
|
"kean-s/all_documents/640.", // To: w/assts <govt.>
|
||||||
"kean-s/all_documents/1095.", // To: w/assts <govt.>
|
"kean-s/all_documents/1095.", // To: w/assts <govt.>
|
||||||
"kean-s/attachments/2030.", // To: w/assts <govt.>
|
"kean-s/attachments/2030.", // To: w/assts <govt.>
|
||||||
|
|
||||||
|
"williams-w3/operations_committee_isas/10.", // To: z34655 <m>
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let known_bad_from = HashSet::from([
|
let known_bad_from = HashSet::from([
|
||||||
|
|
Loading…
Add table
Reference in a new issue