From bcb5a81d32c384ca2211e728dd9ddfb233a8ece1 Mon Sep 17 00:00:00 2001 From: Quentin Dufour Date: Mon, 24 Jul 2023 13:04:21 +0200 Subject: [PATCH] Improve README --- README.md | 45 +++--- ignore.test/enron.rs | 129 ---------------- ignore.test/known.rs | 340 ------------------------------------------- 3 files changed, 17 insertions(+), 497 deletions(-) delete mode 100644 ignore.test/enron.rs delete mode 100644 ignore.test/known.rs diff --git a/README.md b/README.md index 3eed100..50167ed 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,9 @@ **⚠️ This is currently only a decoder (ie. a parser), encoding is not yet implemented.** +`eml-codec` is a child project of [Aerogramme](https://aerogramme.deuxfleurs.fr), a distributed and encrypted IMAP server developed by the non-profit organization [Deuxfleurs](https://deuxfleurs.fr). +Its aim is to be a Swiss Army knife to handle emails, whether it is to build an IMAP/JMAP server, a mail filter (like an antispam), or a mail client. + ## Example ```rust @@ -56,34 +59,17 @@ Speak about parser combinators. ## Testing strategy -eml-codec aims to be as much tested as possible against real word data. +Currently this crate has some unit tests on most of its parsing functions. +It is also tested as part of Aerogramme, its parent project where it handles email parsing. +In this project, `eml-codec` parsing capabilities are compared to Dovecot, Cyrus, Maddy and other IMAP servers. -### Unit testing: parser combinator independently (done) +It is planned to test it on large email datasets (like Enron, jpbush, mailing lists, etc.) but it's not done yet. +Fuzzing the library would also be interesting, probably to detect crashing due to stack overflow for example +due to the infinite recursion of MIME. -### Selected full emails (expected) +## RFC and IANA references -### Existing datasets - -**Enron 500k** - Took 20 minutes to parse ~517k emails and check that -RFC5322 headers (From, To, Cc, etc.) are correctly parsed. 
-From this list, we had to exclude ~50 emails on which -the From/To/Cc fields were simply completely wrong, but while -some fields failed to parse, the parser did not crash and -parsed the other fields of the email correctly. - -Run it on your machine: - -```bash -cargo test -- --ignored --nocapture enron500k -``` - -Planned: jpbush, my inbox, etc. - -### Fuzzing (expected) - -### Across reference IMAP servers (dovevot, cyrus) (expected) - -## Targeted RFC and IANA references +RFC | 🚩 | # | Name | |----|---|------| @@ -106,9 +92,12 @@ Planned: jpbush, my inbox, etc. | 🔴 |3798 | ↳ Message Disposition Notification | | 🔴 |6838 | ↳ Media Type Specifications and Registration Procedures | -IANA references : - - (tbd) MIME subtypes - - [IANA character sets](https://www.iana.org/assignments/character-sets/character-sets.xhtml) +IANA + +| Name | Description | Note | +|------|-------------|------| +| [Media Types](https://www.iana.org/assignments/media-types/media-types.xhtml) | Registered media types for the Content-Type field | Currently only the media types in the MIME RFC have dedicated support in `eml-codec`. 
| +| [Character sets](https://www.iana.org/assignments/character-sets/character-sets.xhtml) | Supported character sets for the `charset` parameter | They should all be supported through the `encoding_rs` crate | ## State of the art / alternatives diff --git a/ignore.test/enron.rs b/ignore.test/enron.rs deleted file mode 100644 index 8020bd9..0000000 --- a/ignore.test/enron.rs +++ /dev/null @@ -1,129 +0,0 @@ -use imf_codec::fragments::section; -use imf_codec::multipass; -use std::collections::HashSet; -use std::fs::File; -use std::io::Read; -use std::path::PathBuf; -use walkdir::WalkDir; - -fn parser<'a, F>(input: &'a [u8], func: F) -> () -where - F: FnOnce(§ion::Section) -> (), -{ - let seg = multipass::segment::new(input).unwrap(); - let charset = seg.charset(); - let fields = charset.fields().unwrap(); - let field_names = fields.names(); - let field_body = field_names.body(); - let section = field_body.section(); - - func(§ion.fields); -} - -#[test] -#[ignore] -fn test_enron500k() { - let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - d.push("resources/enron/maildir/"); - let prefix_sz = d.as_path().to_str().unwrap().len(); - //d.push("williams-w3/"); - - let known_bad_fields = HashSet::from([ - "white-s/calendar/113.", // To: east <7..> - "skilling-j/inbox/223.", // From: pep - "jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com> - "jones-t/notes_inbox/3303.", // To: <"tibor.vizkelety":@enron.com> - "lokey-t/calendar/33.", // A second Date entry for the calendar containing - // Date: Monday, March 12 - "zipper-a/inbox/199.", // To: e-mail - "dasovich-j/deleted_items/128.", // To: f62489 - "dasovich-j/all_documents/677.", // To: w/assts - "dasovich-j/all_documents/8984.", // To: <"ft.com.users":@enron.com> - "dasovich-j/all_documents/3514.", // To: <"ft.com.users":@enron.com> - "dasovich-j/all_documents/4467.", // To: <"ft.com.users":@enron.com> - "dasovich-j/all_documents/578.", // To: w/assts - "dasovich-j/all_documents/3148.", // To: 
<"economist.com.readers":@enron.com> - "dasovich-j/all_documents/9953.", // To: <"economist.com.reader":@enron.com> - "dasovich-j/risk_analytics/3.", // To: w/assts - "dasovich-j/notes_inbox/5391.", // To: <"ft.com.users":@enron.com> - "dasovich-j/notes_inbox/4952.", // To: <"economist.com.reader":@enron.com> - "dasovich-j/notes_inbox/2386.", // To: <"ft.com.users":@enron.com> - "dasovich-j/notes_inbox/1706.", // To: <"ft.com.users":@enron.com> - "dasovich-j/notes_inbox/1489.", // To: <"economist.com.readers":@enron.com> - "dasovich-j/notes_inbox/5.", // To: w/assts - "kaminski-v/sites/19.", // To: <"the.desk":@enron.com> - "kaminski-v/sites/1.", // To: <"the.desk":@enron.com> - "kaminski-v/discussion_threads/5082.", // To: <"ft.com.users":@enron.com> - "kaminski-v/discussion_threads/4046.", // To: <"the.desk":@enron.com> - "kaminski-v/discussion_threads/4187.", // To: <"the.desk":@enron.com> - "kaminski-v/discussion_threads/8068.", // To: cats , risk , leaders - "kaminski-v/discussion_threads/7980.", // To: dogs , cats , risk ,\r\n\tleaders - "kaminski-v/all_documents/5970.", //To: dogs , cats , risk ,\r\n\tleaders - "kaminski-v/all_documents/5838.", // To + Cc: dogs , breakthrough.adm@enron.com, breakthrough.adm@enron.com,\r\n\tbreakthrough.adm@enron.com - "kaminski-v/all_documents/10070.", // To: <"ft.com.users":@enron.com> - "kaminski-v/all_documents/92.", // To: <"the.desk":@enron.com> - "kaminski-v/all_documents/276.", // To: <"the.desk":@enron.com> - "kaminski-v/technical/1.", // To: <"the.desk":@enron.com> - "kaminski-v/technical/7.", // To: <"the.desk":@enron.com> - "kaminski-v/notes_inbox/140.", // To: dogs , cats , risk ,\r\n\tleaders - "kaminski-v/notes_inbox/95.", // To + CC failed: cats , risk , leaders - "kean-s/archiving/untitled/1232.", // To: w/assts , mark.palmer@enron.com, karen.denne@enron.com - "kean-s/archiving/untitled/1688.", // To: w/assts - "kean-s/sent/198.", // To: w/assts , mark.palmer@enron.com, karen.denne@enron.com - 
"kean-s/reg_risk/9.", // To: w/assts - "kean-s/discussion_threads/950.", // To: w/assts , mark.palmer@enron.com, karen.denne@enron.com - "kean-s/discussion_threads/577.", // To: w/assts - "kean-s/calendar/untitled/1096.", // To: w/assts , mark.palmer@enron.com, karen.denne@enron.com - "kean-s/calendar/untitled/640.", // To: w/assts - "kean-s/all_documents/640.", // To: w/assts - "kean-s/all_documents/1095.", // To: w/assts - "kean-s/attachments/2030.", // To: w/assts - "williams-w3/operations_committee_isas/10.", // To: z34655 - ]); - - let known_bad_from = HashSet::from([ - "skilling-j/inbox/223.", // From: pep - ]); - - let mut i = 0; - for entry in WalkDir::new(d.as_path()) - .into_iter() - .filter_map(|file| file.ok()) - { - if entry.metadata().unwrap().is_file() { - let mail_path = entry.path(); - let suffix = &mail_path.to_str().unwrap()[prefix_sz..]; - - // read file - let mut raw = Vec::new(); - let mut f = File::open(mail_path).unwrap(); - f.read_to_end(&mut raw).unwrap(); - - // parse - parser(&raw, |hdrs| { - let ok_date = hdrs.date.is_some(); - let ok_from = hdrs.from.len() > 0; - let ok_fields = hdrs.bad_fields.len() == 0; - - if !ok_date || !ok_from || !ok_fields { - println!("Issue with: {}", suffix); - } - - assert!(ok_date); - - if !known_bad_from.contains(suffix) { - assert!(ok_from); - } - - if !known_bad_fields.contains(suffix) { - assert!(ok_fields); - } - - i += 1; - if i % 1000 == 0 { - println!("Analyzed emails: {}", i); - } - }) - } - } -} diff --git a/ignore.test/known.rs b/ignore.test/known.rs deleted file mode 100644 index 3cd756d..0000000 --- a/ignore.test/known.rs +++ /dev/null @@ -1,340 +0,0 @@ -use chrono::{FixedOffset, TimeZone}; -use imf_codec::fragments::{misc_token, model, section, part, trace}; -use imf_codec::multipass; -use std::collections::HashMap; - -fn parser<'a, F>(input: &'a [u8], func: F) -> () -where - F: FnOnce(§ion::Section) -> (), -{ - let seg = multipass::segment::new(input).unwrap(); - let charset = seg.charset(); 
- let fields = charset.fields().unwrap(); - let field_names = fields.names(); - let field_body = field_names.body(); - let section = field_body.section(); - - func(§ion.fields); -} - -#[test] -fn test_headers() { - let fullmail: &[u8] = r#"Return-Path: -Delivered-To: quentin@example.com -Received: from smtp.example.com ([10.83.2.2]) - by doradille with LMTP - id xyzabcd - (envelope-from ) - for ; Tue, 13 Jun 2023 19:01:08 +0000 -Date: Tue, 13 Jun 2023 10:01:10 +0200 -From: Mary Smith - , "A\lan" -Sender: imf@example.com -Reply-To: "Mary Smith: Personal Account" -To: John Doe -Cc: imf2@example.com -Bcc: (hidden) -Subject: Re: Saying Hello -Comments: A simple message -Comments: Not that complicated -comments : not valid header name but should be accepted - by the parser. -Keywords: hello, world -Héron: Raté - Raté raté -Keywords: salut, le, monde -Not a real header but should still recover -Message-ID: <3456@example.net> -In-Reply-To: <1234@local.machine.example> -References: <1234@local.machine.example> -Unknown: unknown - -This is a reply to your hello. 
-"# - .as_bytes(); - parser(fullmail, |parsed_section| { - assert_eq!( - parsed_section, - §ion::Section { - date: Some( - &FixedOffset::east_opt(2 * 3600) - .unwrap() - .with_ymd_and_hms(2023, 06, 13, 10, 01, 10) - .unwrap() - ), - - from: vec![ - &model::MailboxRef { - name: Some("Mary Smith".into()), - addrspec: model::AddrSpec { - local_part: "mary".into(), - domain: "example.net".into(), - } - }, - &model::MailboxRef { - name: Some("Alan".into()), - addrspec: model::AddrSpec { - local_part: "alan".into(), - domain: "example".into(), - } - } - ], - - sender: Some(&model::MailboxRef { - name: None, - addrspec: model::AddrSpec { - local_part: "imf".into(), - domain: "example.com".into(), - } - }), - - reply_to: vec![&model::AddressRef::Single(model::MailboxRef { - name: Some("Mary Smith: Personal Account".into()), - addrspec: model::AddrSpec { - local_part: "smith".into(), - domain: "home.example".into(), - } - })], - - to: vec![&model::AddressRef::Single(model::MailboxRef { - name: Some("John Doe".into()), - addrspec: model::AddrSpec { - local_part: "jdoe".into(), - domain: "machine.example".into(), - } - })], - - cc: vec![&model::AddressRef::Single(model::MailboxRef { - name: None, - addrspec: model::AddrSpec { - local_part: "imf2".into(), - domain: "example.com".into(), - } - })], - - bcc: vec![], - - msg_id: Some(&model::MessageId { - left: "3456", - right: "example.net" - }), - in_reply_to: vec![&model::MessageId { - left: "1234", - right: "local.machine.example" - }], - references: vec![&model::MessageId { - left: "1234", - right: "local.machine.example" - }], - - subject: Some(&misc_token::Unstructured("Re: Saying Hello".into())), - - comments: vec![ - &misc_token::Unstructured("A simple message".into()), - &misc_token::Unstructured("Not that complicated".into()), - &misc_token::Unstructured( - "not valid header name but should be accepted by the parser.".into() - ), - ], - - keywords: vec![ - &misc_token::PhraseList(vec!["hello".into(), "world".into(),]), 
- &misc_token::PhraseList(vec!["salut".into(), "le".into(), "monde".into(),]), - ], - - received: vec![&trace::ReceivedLog( - r#"from smtp.example.com ([10.83.2.2]) - by doradille with LMTP - id xyzabcd - (envelope-from ) - for "# - )], - - return_path: vec![&model::MailboxRef { - name: None, - addrspec: model::AddrSpec { - local_part: "gitlab".into(), - domain: "example.com".into(), - } - }], - - optional: HashMap::from([ - ( - "Delivered-To", - &misc_token::Unstructured("quentin@example.com".into()) - ), - ("Unknown", &misc_token::Unstructured("unknown".into())), - ]), - - bad_fields: vec![], - - unparsed: vec![ - "Héron: Raté\n Raté raté\n", - "Not a real header but should still recover\n", - ], - ..section::Section::default() - } - ) - }) -} - -#[test] -fn test_headers_mime() { - use imf_codec::fragments::mime; - let fullmail: &[u8] = r#"From: =?US-ASCII?Q?Keith_Moore?= -To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= -CC: =?ISO-8859-1?Q?Andr=E9?= Pirard -Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?= - =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?= -MIME-Version: 1.0 -Content-Type: text/plain; charset=ISO-8859-1 -Content-Transfer-Encoding: quoted-printable -Content-ID: -Content-Description: hello - -Now's the time = -for all folk to come= - to the aid of their country. 
-"# - .as_bytes(); - - parser(fullmail, |parsed_section| { - assert_eq!( - parsed_section, - §ion::Section { - from: vec![ - &model::MailboxRef { - name: Some("Keith Moore".into()), - addrspec: model::AddrSpec { - local_part: "moore".into(), - domain: "cs.utk.edu".into(), - } - }, - ], - - to: vec![&model::AddressRef::Single(model::MailboxRef { - name: Some("Keld Jørn Simonsen".into()), - addrspec: model::AddrSpec { - local_part: "keld".into(), - domain: "dkuug.dk".into(), - } - })], - - cc: vec![&model::AddressRef::Single(model::MailboxRef { - name: Some("André Pirard".into()), - addrspec: model::AddrSpec { - local_part: "PIRARD".into(), - domain: "vm1.ulg.ac.be".into(), - } - })], - - subject: Some(&misc_token::Unstructured("If you can read this you understand the example.".into())), - mime_version: Some(&mime::Version{ major: 1, minor: 0 }), - mime: section::MIMESection { - content_type: Some(&mime::Type::Text(mime::TextDesc { - charset: Some(mime::EmailCharset::ISO_8859_1), - subtype: mime::TextSubtype::Plain, - unknown_parameters: vec![] - })), - content_transfer_encoding: Some(&mime::Mechanism::QuotedPrintable), - content_id: Some(&model::MessageId { - left: "a", - right: "example.com" - }), - content_description: Some(&misc_token::Unstructured("hello".into())), - ..section::MIMESection::default() - }, - ..section::Section::default() - } - ); - }) -} - -fn parser_bodystruct<'a, F>(input: &'a [u8], func: F) -> () -where - F: FnOnce(&part::PartNode) -> (), -{ - let seg = multipass::segment::new(input).unwrap(); - let charset = seg.charset(); - let fields = charset.fields().unwrap(); - let field_names = fields.names(); - let field_body = field_names.body(); - let section = field_body.section(); - let bodystruct = section.body_structure(); - - func(&bodystruct.body); -} - -#[test] -fn test_multipart() { - let fullmail: &[u8] = r#"Date: Sat, 8 Jul 2023 07:14:29 +0200 -From: Grrrnd Zero -To: John Doe -Subject: Re: Saying Hello -Message-ID: -MIME-Version: 1.0 
-Content-Type: multipart/alternative; - boundary="b1_e376dc71bafc953c0b0fdeb9983a9956" -Content-Transfer-Encoding: 7bit - -This is a multi-part message in MIME format. - ---b1_e376dc71bafc953c0b0fdeb9983a9956 -Content-Type: text/plain; charset=utf-8 -Content-Transfer-Encoding: quoted-printable - -GZ -OoOoO -oOoOoOoOo -oOoOoOoOoOoOoOoOo -oOoOoOoOoOoOoOoOoOoOoOo -oOoOoOoOoOoOoOoOoOoOoOoOoOoOo -OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO - ---b1_e376dc71bafc953c0b0fdeb9983a9956 -Content-Type: text/html; charset=us-ascii - -
GZ
-OoOoO
-oOoOoOoOo
-oOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOoOoOoOo
-OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO
- ---b1_e376dc71bafc953c0b0fdeb9983a9956-- -"#.as_bytes(); - - parser_bodystruct(fullmail, |part| { - assert_eq!(part, &part::PartNode::Composite( - part::PartHeader { - ..part::PartHeader::default() - }, - vec![ - part::PartNode::Discrete( - part::PartHeader { - ..part::PartHeader::default() - }, - r#"GZ -OoOoO -oOoOoOoOo -oOoOoOoOoOoOoOoOo -oOoOoOoOoOoOoOoOoOoOoOo -oOoOoOoOoOoOoOoOoOoOoOoOoOoOo -OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO"#.as_bytes() - ), - part::PartNode::Discrete( - part::PartHeader { - ..part::PartHeader::default() - }, - r#"
GZ
-OoOoO
-oOoOoOoOo
-oOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOoOoOoOo
-OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO
"#.as_bytes() - ), - ])); - }); -}