validate enron
This commit is contained in:
parent
849c741be9
commit
98f42d3efc
2 changed files with 24 additions and 7 deletions
27
README.md
27
README.md
|
@ -41,11 +41,26 @@ recovery
|
|||
|
||||
## Testing strategy
|
||||
|
||||
- Unit testing: parser combinator independently (done)
|
||||
- Selected full emails (done)
|
||||
- Enron 500k (done)
|
||||
- Fuzzing (expected)
|
||||
- Across reference IMAP servers (dovecot, cyrus) (expected)
|
||||
imf-codec aims to be tested as much as possible against real-world data.
|
||||
|
||||
### Unit testing: parser combinator independently (done)
|
||||
|
||||
### Selected full emails (expected)
|
||||
|
||||
### Existing datasets
|
||||
|
||||
**Enron 500k** - It took 20 minutes to parse ~517k emails and check that
|
||||
RFC5322 headers (From, To, Cc, etc.) are correctly parsed.
|
||||
From this list, we had to exclude ~50 emails on which
|
||||
the From/To/Cc fields were completely malformed. Although
|
||||
some fields failed to parse, the parser did not crash and
|
||||
parsed the other fields of the email correctly.
|
||||
|
||||
Planned: jpbush, my inbox, etc.
|
||||
|
||||
### Fuzzing (expected)
|
||||
|
||||
### Across reference IMAP servers (dovecot, cyrus) (expected)
|
||||
|
||||
## Development status
|
||||
|
||||
|
@ -53,7 +68,7 @@ Early development. Not ready.
|
|||
Do not use it in production or any software at all.
|
||||
|
||||
Todo:
|
||||
- [ ] test over the enron dataset
|
||||
- [X] test over the enron dataset
|
||||
- [ ] convert to multipass parser
|
||||
- [ ] implement mime part 3 (encoded headers)
|
||||
- [ ] implement mime part 1 (new headers)
|
||||
|
|
|
@ -12,7 +12,7 @@ fn test_enron500k() {
|
|||
let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
d.push("resources/enron/maildir/");
|
||||
let prefix_sz = d.as_path().to_str().unwrap().len();
|
||||
//d.push("kean-s/");
|
||||
//d.push("williams-w3/");
|
||||
|
||||
let known_bad_fields = HashSet::from([
|
||||
"white-s/calendar/113.", // To: east <7..>
|
||||
|
@ -71,6 +71,8 @@ fn test_enron500k() {
|
|||
"kean-s/all_documents/640.", // To: w/assts <govt.>
|
||||
"kean-s/all_documents/1095.", // To: w/assts <govt.>
|
||||
"kean-s/attachments/2030.", // To: w/assts <govt.>
|
||||
|
||||
"williams-w3/operations_committee_isas/10.", // To: z34655 <m>
|
||||
]);
|
||||
|
||||
let known_bad_from = HashSet::from([
|
||||
|
|
Loading…
Reference in a new issue