Improve README

2023-07-24 13:04:21 +02:00 · 2023-07-24 13:04:21 +02:00 · bcb5a81d32
commit bcb5a81d32
parent 6af2e38ae3
3 changed files with 17 additions and 497 deletions
--- a/README.md
+++ b/README.md
@ -2,6 +2,9 @@

 **⚠️ This is currently only a decoder (ie. a parser), encoding is not yet implemented.**

+`eml-codec` is a child project of [Aerogramme](https://aerogramme.deuxfleurs.fr), a distributed and encrypted IMAP server developed by the non-profit organization [Deuxfleurs](https://deuxfleurs.fr).
+Its aim is to be a Swiss Army knife for handling emails, whether you are building an IMAP/JMAP server, a mail filter (such as an antispam), or a mail client.
+
 ## Example

 ```rust
@ -56,34 +59,17 @@ Speak about parser combinators.

 ## Testing strategy

-eml-codec aims to be as much tested as possible against real word data.
+Currently this crate has some unit tests on most of its parsing functions.
+It is also tested as part of Aerogramme, its parent project where it handles email parsing.
+In Aerogramme, the parsing capabilities of `eml-codec` are compared to those of Dovecot, Cyrus, Maddy and other IMAP servers.

-### Unit testing: parser combinator independently (done)
+It is planned to test it on large email datasets (like Enron, jpbush, mailing lists, etc.) but it's not done yet.
+Fuzzing the library would also be interesting, for example to detect crashes caused by stack overflows,
+which could occur because MIME parts can be nested recursively without any depth limit.

-### Selected full emails (expected)
+## RFC and IANA references

-### Existing datasets
-
-**Enron 500k** - Took 20 minutes to parse ~517k emails and check that 
-RFC5322 headers (From, To, Cc, etc.) are correctly parsed.
-From this list, we had to exclude ~50 emails on which
-the From/To/Cc fields were simply completely wrong, but while
-some fields failed to parse, the parser did not crash and
-parsed the other fields of the email correctly.
-
-Run it on your machine:
-
-```bash
-cargo test -- --ignored --nocapture enron500k
-```
-
-Planned: jpbush, my inbox, etc.
-
-### Fuzzing (expected)
-
-### Across reference IMAP servers (dovevot, cyrus) (expected)
-
-## Targeted RFC and IANA references
+RFC

 | 🚩 | # | Name |
 |----|---|------|
@ -106,9 +92,12 @@ Planned: jpbush, my inbox, etc.
 | 🔴 |3798  | ↳ Message Disposition Notification |
 | 🔴 |6838  | ↳ Media Type Specifications and Registration Procedures |

-IANA references :
- - (tbd) MIME subtypes
- - [IANA character sets](https://www.iana.org/assignments/character-sets/character-sets.xhtml)
+IANA
+
+| Name | Description | Note |
+|------|-------------|------|
+| [Media Types](https://www.iana.org/assignments/media-types/media-types.xhtml) | Registered media types for the Content-Type field | Currently only the media types in the MIME RFC have dedicated support in `eml-codec`. |
+| [Character sets](https://www.iana.org/assignments/character-sets/character-sets.xhtml) | Supported character sets for the `charset` parameter | They should all be supported through the `encoding_rs` crate. |

 ## State of the art / alternatives

--- a/ignore.test/enron.rs
+++ b/ignore.test/enron.rs
@ -1,129 +0,0 @@
-use imf_codec::fragments::section;
-use imf_codec::multipass;
-use std::collections::HashSet;
-use std::fs::File;
-use std::io::Read;
-use std::path::PathBuf;
-use walkdir::WalkDir;
-
-fn parser<'a, F>(input: &'a [u8], func: F) -> ()
-where
-    F: FnOnce(&section::Section) -> (),
-{
-    let seg = multipass::segment::new(input).unwrap();
-    let charset = seg.charset();
-    let fields = charset.fields().unwrap();
-    let field_names = fields.names();
-    let field_body = field_names.body();
-    let section = field_body.section();
-
-    func(&section.fields);
-}
-
-#[test]
-#[ignore]
-fn test_enron500k() {
-    let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
-    d.push("resources/enron/maildir/");
-    let prefix_sz = d.as_path().to_str().unwrap().len();
-    //d.push("williams-w3/");
-
-    let known_bad_fields = HashSet::from([
-        "white-s/calendar/113.",       // To: east <7..>
-        "skilling-j/inbox/223.",       // From: pep <performance.>
-        "jones-t/all_documents/9806.", // To: <"tibor.vizkelety":@enron.com>
-        "jones-t/notes_inbox/3303.",   // To: <"tibor.vizkelety":@enron.com>
-        "lokey-t/calendar/33.",        // A second Date entry for the calendar containing
-        // Date:       Monday, March 12
-        "zipper-a/inbox/199.",                       // To: e-mail <mari.>
-        "dasovich-j/deleted_items/128.",             // To: f62489 <g>
-        "dasovich-j/all_documents/677.",             // To: w/assts <govt.>
-        "dasovich-j/all_documents/8984.",            // To: <"ft.com.users":@enron.com>
-        "dasovich-j/all_documents/3514.",            // To: <"ft.com.users":@enron.com>
-        "dasovich-j/all_documents/4467.",            // To: <"ft.com.users":@enron.com>
-        "dasovich-j/all_documents/578.",             // To: w/assts <govt.>
-        "dasovich-j/all_documents/3148.",            // To: <"economist.com.readers":@enron.com>
-        "dasovich-j/all_documents/9953.",            // To: <"economist.com.reader":@enron.com>
-        "dasovich-j/risk_analytics/3.",              // To: w/assts <govt.>
-        "dasovich-j/notes_inbox/5391.",              // To: <"ft.com.users":@enron.com>
-        "dasovich-j/notes_inbox/4952.",              // To: <"economist.com.reader":@enron.com>
-        "dasovich-j/notes_inbox/2386.",              // To: <"ft.com.users":@enron.com>
-        "dasovich-j/notes_inbox/1706.",              // To: <"ft.com.users":@enron.com>
-        "dasovich-j/notes_inbox/1489.",              // To: <"economist.com.readers":@enron.com>
-        "dasovich-j/notes_inbox/5.",                 // To: w/assts <govt.>
-        "kaminski-v/sites/19.",                      // To: <"the.desk":@enron.com>
-        "kaminski-v/sites/1.",                       // To: <"the.desk":@enron.com>
-        "kaminski-v/discussion_threads/5082.",       // To: <"ft.com.users":@enron.com>
-        "kaminski-v/discussion_threads/4046.",       // To: <"the.desk":@enron.com>
-        "kaminski-v/discussion_threads/4187.",       // To: <"the.desk":@enron.com>
-        "kaminski-v/discussion_threads/8068.", // To: cats <breaktkhrough.>, risk <breakthrough.>, leaders <breaktkhrough.>
-        "kaminski-v/discussion_threads/7980.", // To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
-        "kaminski-v/all_documents/5970.", //To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
-        "kaminski-v/all_documents/5838.", // To + Cc: dogs <breakthrough.>, breakthrough.adm@enron.com, breakthrough.adm@enron.com,\r\n\tbreakthrough.adm@enron.com
-        "kaminski-v/all_documents/10070.", // To: <"ft.com.users":@enron.com>
-        "kaminski-v/all_documents/92.",   // To: <"the.desk":@enron.com>
-        "kaminski-v/all_documents/276.",  // To: <"the.desk":@enron.com>
-        "kaminski-v/technical/1.",        // To: <"the.desk":@enron.com>
-        "kaminski-v/technical/7.",        // To: <"the.desk":@enron.com>
-        "kaminski-v/notes_inbox/140.", // To: dogs <breakthrough.>, cats <breaktkhrough.>, risk <breakthrough.>,\r\n\tleaders <breaktkhrough.>
-        "kaminski-v/notes_inbox/95.", // To + CC failed: cats <breaktkhrough.>, risk <breakthrough.>, leaders <breaktkhrough.>
-        "kean-s/archiving/untitled/1232.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
-        "kean-s/archiving/untitled/1688.", // To: w/assts <govt.>
-        "kean-s/sent/198.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
-        "kean-s/reg_risk/9.", // To: w/assts <govt.>
-        "kean-s/discussion_threads/950.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
-        "kean-s/discussion_threads/577.", // To: w/assts <govt.>
-        "kean-s/calendar/untitled/1096.", // To: w/assts <govt.>, mark.palmer@enron.com, karen.denne@enron.com
-        "kean-s/calendar/untitled/640.",  // To: w/assts <govt.>
-        "kean-s/all_documents/640.",      // To: w/assts <govt.>
-        "kean-s/all_documents/1095.",     // To: w/assts <govt.>
-        "kean-s/attachments/2030.",       // To: w/assts <govt.>
-        "williams-w3/operations_committee_isas/10.", // To: z34655 <m>
-    ]);
-
-    let known_bad_from = HashSet::from([
-        "skilling-j/inbox/223.", // From: pep <performance.>
-    ]);
-
-    let mut i = 0;
-    for entry in WalkDir::new(d.as_path())
-        .into_iter()
-        .filter_map(|file| file.ok())
-    {
-        if entry.metadata().unwrap().is_file() {
-            let mail_path = entry.path();
-            let suffix = &mail_path.to_str().unwrap()[prefix_sz..];
-
-            // read file
-            let mut raw = Vec::new();
-            let mut f = File::open(mail_path).unwrap();
-            f.read_to_end(&mut raw).unwrap();
-
-            // parse
-            parser(&raw, |hdrs| {
-                let ok_date = hdrs.date.is_some();
-                let ok_from = hdrs.from.len() > 0;
-                let ok_fields = hdrs.bad_fields.len() == 0;
-
-                if !ok_date || !ok_from || !ok_fields {
-                    println!("Issue with: {}", suffix);
-                }
-
-                assert!(ok_date);
-
-                if !known_bad_from.contains(suffix) {
-                    assert!(ok_from);
-                }
-
-                if !known_bad_fields.contains(suffix) {
-                    assert!(ok_fields);
-                }
-
-                i += 1;
-                if i % 1000 == 0 {
-                    println!("Analyzed emails: {}", i);
-                }
-            })
-        }
-    }
-}
--- a/ignore.test/known.rs
+++ b/ignore.test/known.rs
@ -1,340 +0,0 @@
-use chrono::{FixedOffset, TimeZone};
-use imf_codec::fragments::{misc_token, model, section, part, trace};
-use imf_codec::multipass;
-use std::collections::HashMap;
-
-fn parser<'a, F>(input: &'a [u8], func: F) -> ()
-where
-    F: FnOnce(&section::Section) -> (),
-{
-    let seg = multipass::segment::new(input).unwrap();
-    let charset = seg.charset();
-    let fields = charset.fields().unwrap();
-    let field_names = fields.names();
-    let field_body = field_names.body();
-    let section = field_body.section();
-
-    func(&section.fields);
-}
-
-#[test]
-fn test_headers() {
-    let fullmail: &[u8] = r#"Return-Path: <gitlab@example.com>
-Delivered-To: quentin@example.com
-Received: from smtp.example.com ([10.83.2.2])
-	by doradille with LMTP
-	id xyzabcd
-	(envelope-from <gitlab@example.com>)
-	for <quentin@example.com>; Tue, 13 Jun 2023 19:01:08 +0000
-Date: Tue, 13 Jun 2023 10:01:10 +0200
-From: Mary Smith
- <mary@example.net>, "A\lan" <alan@example>
-Sender: imf@example.com
-Reply-To: "Mary Smith: Personal Account" <smith@home.example>
-To: John Doe <jdoe@machine.example>
-Cc: imf2@example.com
-Bcc: (hidden)
-Subject: Re: Saying Hello
-Comments: A simple message
-Comments: Not that complicated
-comments : not valid header name but should be accepted
-    by the parser.
-Keywords: hello, world
-Héron: Raté
- Raté raté
-Keywords: salut, le, monde
-Not a real header but should still recover
-Message-ID: <3456@example.net>
-In-Reply-To: <1234@local.machine.example>
-References: <1234@local.machine.example>
-Unknown: unknown
-
-This is a reply to your hello.
-"#
-    .as_bytes();
-    parser(fullmail, |parsed_section| {
-        assert_eq!(
-            parsed_section,
-            &section::Section {
-                date: Some(
-                    &FixedOffset::east_opt(2 * 3600)
-                        .unwrap()
-                        .with_ymd_and_hms(2023, 06, 13, 10, 01, 10)
-                        .unwrap()
-                ),
-
-                from: vec![
-                    &model::MailboxRef {
-                        name: Some("Mary Smith".into()),
-                        addrspec: model::AddrSpec {
-                            local_part: "mary".into(),
-                            domain: "example.net".into(),
-                        }
-                    },
-                    &model::MailboxRef {
-                        name: Some("Alan".into()),
-                        addrspec: model::AddrSpec {
-                            local_part: "alan".into(),
-                            domain: "example".into(),
-                        }
-                    }
-                ],
-
-                sender: Some(&model::MailboxRef {
-                    name: None,
-                    addrspec: model::AddrSpec {
-                        local_part: "imf".into(),
-                        domain: "example.com".into(),
-                    }
-                }),
-
-                reply_to: vec![&model::AddressRef::Single(model::MailboxRef {
-                    name: Some("Mary Smith: Personal Account".into()),
-                    addrspec: model::AddrSpec {
-                        local_part: "smith".into(),
-                        domain: "home.example".into(),
-                    }
-                })],
-
-                to: vec![&model::AddressRef::Single(model::MailboxRef {
-                    name: Some("John Doe".into()),
-                    addrspec: model::AddrSpec {
-                        local_part: "jdoe".into(),
-                        domain: "machine.example".into(),
-                    }
-                })],
-
-                cc: vec![&model::AddressRef::Single(model::MailboxRef {
-                    name: None,
-                    addrspec: model::AddrSpec {
-                        local_part: "imf2".into(),
-                        domain: "example.com".into(),
-                    }
-                })],
-
-                bcc: vec![],
-
-                msg_id: Some(&model::MessageId {
-                    left: "3456",
-                    right: "example.net"
-                }),
-                in_reply_to: vec![&model::MessageId {
-                    left: "1234",
-                    right: "local.machine.example"
-                }],
-                references: vec![&model::MessageId {
-                    left: "1234",
-                    right: "local.machine.example"
-                }],
-
-                subject: Some(&misc_token::Unstructured("Re: Saying Hello".into())),
-
-                comments: vec![
-                    &misc_token::Unstructured("A simple message".into()),
-                    &misc_token::Unstructured("Not that complicated".into()),
-                    &misc_token::Unstructured(
-                        "not valid header name but should be accepted by the parser.".into()
-                    ),
-                ],
-
-                keywords: vec![
-                    &misc_token::PhraseList(vec!["hello".into(), "world".into(),]),
-                    &misc_token::PhraseList(vec!["salut".into(), "le".into(), "monde".into(),]),
-                ],
-
-                received: vec![&trace::ReceivedLog(
-                    r#"from smtp.example.com ([10.83.2.2])
-	by doradille with LMTP
-	id xyzabcd
-	(envelope-from <gitlab@example.com>)
-	for <quentin@example.com>"#
-                )],
-
-                return_path: vec![&model::MailboxRef {
-                    name: None,
-                    addrspec: model::AddrSpec {
-                        local_part: "gitlab".into(),
-                        domain: "example.com".into(),
-                    }
-                }],
-
-                optional: HashMap::from([
-                    (
-                        "Delivered-To",
-                        &misc_token::Unstructured("quentin@example.com".into())
-                    ),
-                    ("Unknown", &misc_token::Unstructured("unknown".into())),
-                ]),
-
-                bad_fields: vec![],
-
-                unparsed: vec![
-                    "Héron: Raté\n Raté raté\n",
-                    "Not a real header but should still recover\n",
-                ],
-                ..section::Section::default()
-            }
-        )
-    })
-}
-
-#[test]
-fn test_headers_mime() {
-    use imf_codec::fragments::mime;
-    let fullmail: &[u8] = r#"From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>
-To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>
-CC: =?ISO-8859-1?Q?Andr=E9?= Pirard <PIRARD@vm1.ulg.ac.be>
-Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
-    =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
-MIME-Version: 1.0
-Content-Type: text/plain; charset=ISO-8859-1
-Content-Transfer-Encoding: quoted-printable
-Content-ID: <a@example.com>
-Content-Description: hello
-
-Now's the time =
-for all folk to come=
- to the aid of their country.
-"#
-    .as_bytes();
-
-   parser(fullmail, |parsed_section| {
-        assert_eq!(
-            parsed_section,
-            &section::Section {
-                from: vec![
-                    &model::MailboxRef {
-                        name: Some("Keith Moore".into()),
-                        addrspec: model::AddrSpec {
-                            local_part: "moore".into(),
-                            domain: "cs.utk.edu".into(),
-                        }
-                    },
-                ],
-
-                to: vec![&model::AddressRef::Single(model::MailboxRef {
-                    name: Some("Keld Jørn Simonsen".into()),
-                    addrspec: model::AddrSpec {
-                        local_part: "keld".into(),
-                        domain: "dkuug.dk".into(),
-                    }
-                })],
-
-                cc: vec![&model::AddressRef::Single(model::MailboxRef {
-                    name: Some("André Pirard".into()),
-                    addrspec: model::AddrSpec {
-                        local_part: "PIRARD".into(),
-                        domain: "vm1.ulg.ac.be".into(),
-                    }
-                })],
-
-                subject: Some(&misc_token::Unstructured("If you can read this you understand the example.".into())),
-                mime_version: Some(&mime::Version{ major: 1, minor: 0 }),
-                mime: section::MIMESection {
-                    content_type: Some(&mime::Type::Text(mime::TextDesc { 
-                        charset: Some(mime::EmailCharset::ISO_8859_1), 
-                        subtype: mime::TextSubtype::Plain, 
-                        unknown_parameters: vec![]
-                    })),
-                    content_transfer_encoding: Some(&mime::Mechanism::QuotedPrintable),
-                    content_id: Some(&model::MessageId {
-                        left: "a",
-                        right: "example.com"
-                    }),
-                    content_description: Some(&misc_token::Unstructured("hello".into())),
-                    ..section::MIMESection::default()
-                },
-                ..section::Section::default()
-            }
-        );
-   })
-}
-
-fn parser_bodystruct<'a, F>(input: &'a [u8], func: F) -> ()
-where
-    F: FnOnce(&part::PartNode) -> (),
-{
-    let seg = multipass::segment::new(input).unwrap();
-    let charset = seg.charset();
-    let fields = charset.fields().unwrap();
-    let field_names = fields.names();
-    let field_body = field_names.body();
-    let section = field_body.section();
-    let bodystruct = section.body_structure();
-
-    func(&bodystruct.body);
-}
-
-#[test]
-fn test_multipart() {
-    let fullmail: &[u8] = r#"Date: Sat, 8 Jul 2023 07:14:29 +0200
-From: Grrrnd Zero <grrrndzero@example.org>
-To: John Doe <jdoe@machine.example>
-Subject: Re: Saying Hello
-Message-ID: <NTAxNzA2AC47634Y366BAMTY4ODc5MzQyODY0ODY5@www.grrrndzero.org>
-MIME-Version: 1.0
-Content-Type: multipart/alternative;
- boundary="b1_e376dc71bafc953c0b0fdeb9983a9956"
-Content-Transfer-Encoding: 7bit
-
-This is a multi-part message in MIME format.
-
--b1_e376dc71bafc953c0b0fdeb9983a9956
-Content-Type: text/plain; charset=utf-8
-Content-Transfer-Encoding: quoted-printable
-
-GZ
-OoOoO
-oOoOoOoOo
-oOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOoOoOoOo
-OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO
-
--b1_e376dc71bafc953c0b0fdeb9983a9956
-Content-Type: text/html; charset=us-ascii
-
-<div style="text-align: center;"><strong>GZ</strong><br />
-OoOoO<br />
-oOoOoOoOo<br />
-oOoOoOoOoOoOoOoOo<br />
-oOoOoOoOoOoOoOoOoOoOoOo<br />
-oOoOoOoOoOoOoOoOoOoOoOoOoOoOo<br />
-OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO<br />
-
--b1_e376dc71bafc953c0b0fdeb9983a9956--
-"#.as_bytes();
-    
-    parser_bodystruct(fullmail, |part| {
-        assert_eq!(part, &part::PartNode::Composite(
-            part::PartHeader {
-                ..part::PartHeader::default()
-            },
-            vec![
-                part::PartNode::Discrete(
-                    part::PartHeader {
-                        ..part::PartHeader::default()
-                    },
-                    r#"GZ
-OoOoO
-oOoOoOoOo
-oOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOo
-oOoOoOoOoOoOoOoOoOoOoOoOoOoOo
-OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO"#.as_bytes()
-                ),
-                part::PartNode::Discrete(
-                    part::PartHeader {
-                        ..part::PartHeader::default()
-                    },
-                    r#"<div style="text-align: center;"><strong>GZ</strong><br />
-OoOoO<br />
-oOoOoOoOo<br />
-oOoOoOoOoOoOoOoOo<br />
-oOoOoOoOoOoOoOoOoOoOoOo<br />
-oOoOoOoOoOoOoOoOoOoOoOoOoOoOo<br />
-OoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoO<br />"#.as_bytes()
-                ),
-            ]));
-    });
-}