parse version header

2023-07-03 17:05:17 +02:00 · 2023-07-03 17:05:17 +02:00 · f3dbf79927
commit f3dbf79927
parent d64b7a0bbc
6 changed files with 156 additions and 17 deletions
--- a/README.md
+++ b/README.md
@ -5,14 +5,14 @@
 ## Goals
- - Correctness: do no deviate from the RFC, support edge and obsolete cases
+- Maintainability - modifying the code does not create regressions and is possible for someone external to the project. Keep cyclomatic complexity low.
- - Straightforward/maintainable: implement the RFCs as close as possible, minimizing the amount of clever tricks and optimizations
+- Composability - build your own parser by picking the relevant passes, avoid work that is not needed.
- - Multiple syntax: Write the parser so it's easy to alternate between the strict and obsolete/compatible syntax
+- Compatibility - always try to parse something, do not panic or return an error.
 - Never fail: Provide as many fallbacks as possible
 ## Non goals
-  - Parsing optimization (greedy parser, etc.) as it would require to significantly deviate from the RFC ABNF syntax (would consider this case if we could prove that the transformation we make are equivalent)
+  - Parsing optimizations that would make the logic more complicated to understand.
  - Optimization for a specific use case, to the detriment of other use cases.
  - Pipelining/streaming/buffering as the parser can arbitrarily backtrack + our result contains reference to the whole buffer, imf-codec must keep the whole buffer in memory. Avoiding the sequential approach would certainly speed-up a little bit the parsing, but it's too much work to implement currently.
  - Zerocopy. It might be implementable in the future, but to quickly bootstrap this project, I avoided it for now.
@ -23,21 +23,24 @@ Current known limitations/bugs:
 - Resent Header Fields are not implemented
 - Return-Path/Received headers might be hard to use as their order is important, and it's currently lost in the final datastructure.
 - Datetime parsing of invalid date might return `None` instead of falling back to the `bad_body` field
- - Comments are dropped
+ - Comments contained in the email headers are dropped during parsing
 - No support is provided for message/external-body (read data from local computer) and message/partial (aggregate multiple fragmented emails) as they seem obsolete and dangerous to implement.
 ## Design
-Based on nom, a parser combinator lib in Rust.
+Multipass design: each pass is in charge of a specific work.
-multipass parser
+*Having multiple passes does not necessarily lead to abysmal performance.
- - extract header block: `&[u8]` (find \r\n\r\n OR \n\n OR \r\r OR \r\n)
+For example, the [Chez Scheme compiler](https://legacy.cs.indiana.edu/~dyb/pubs/commercial-nanopass.pdf) 
- - decode/convert it with chardet + encoding\_rs to support latin-1: Cow<&str>
+pioneered the "Nanopass" concept and showcases excellent performance.*
- - extract header lines iter::&str (requires only to search for FWS + obs\_CRLF)
+
- - extract header names iter::Name::From(&str)
+Currently, you can use the following passes:
- - extract header body iter::Body::From(Vec<MailboxRef>)
+ - `segment.rs` - Extract the header section by finding the `CRLFCRLF` token.
- - extract header section Section
+ - `guess_charset.rs` - Find the header section encoding (should be ASCII or UTF8 but some corpus contains ISO-8859-1 headers)
 - `extract_fields.rs` - Extract the headers line by line, taking into account Folding White Space.
 - `field_lazy.rs` - Try to recognize the header fields (`From`, `To`, `Date`, etc.) but do not parse their value.  
 - `field_eager.rs` - Parse the value of each known header field.  
 - `header_section.rs` - Aggregate the various fields in a single structure.  
 recovery
 - based on multipass, equivalent to sentinel / synchronization tokens
 ## Testing strategy
--- a/src/error.rs
+++ b/src/error.rs
@ -15,4 +15,5 @@ pub enum IMFError<'a> {
    Unstructured(nom::Err<nom::error::Error<&'a str>>),
    PhraseList(nom::Err<nom::error::Error<&'a str>>),
    ReceivedLog(nom::Err<nom::error::Error<&'a str>>),
    Version(nom::Err<nom::error::Error<&'a str>>),
 }
--- a/src/fragments/eager.rs
+++ b/src/fragments/eager.rs
@ -1,5 +1,6 @@
 use crate::error::IMFError;
 use crate::fragments::lazy::Field as Lazy;
 use crate::fragments::mime::{Mechanism, Type, Version};
 use crate::fragments::misc_token::{PhraseList, Unstructured};
 use crate::fragments::model::{AddressList, MailboxList, MailboxRef, MessageId, MessageIdList};
 use crate::fragments::trace::ReceivedLog;
@ -35,6 +36,13 @@ pub enum Field<'a> {
    Received(ReceivedLog<'a>),
    ReturnPath(MailboxRef),
    // MIME RFC2045
    MIMEVersion(Version),
    ContentType(Type<'a>),
    ContentTransferEncoding(Mechanism<'a>),
    ContentID(MessageId<'a>),
    ContentDescription(Unstructured),
    // 3.6.8.  Optional Fields
    Optional(&'a str, Unstructured),
@ -63,9 +71,13 @@ impl<'a> TryFrom<&'a Lazy<'a>> for Field<'a> {
            Lazy::Keywords(v) => v.try_into().map(|v| Keywords(v)),
            Lazy::Received(v) => v.try_into().map(|v| Received(v)),
            Lazy::ReturnPath(v) => v.try_into().map(|v| ReturnPath(v)),
            Lazy::MIMEVersion(v) => v.try_into().map(|v| MIMEVersion(v)),
            Lazy::ContentType(v) => v.try_into().map(|v| ContentType(v)),
            Lazy::ContentTransferEncoding(v) => v.try_into().map(|v| ContentTransferEncoding(v)),
            Lazy::ContentID(v) => v.try_into().map(|v| ContentID(v)),
            Lazy::ContentDescription(v) => v.try_into().map(|v| ContentDescription(v)),
            Lazy::Optional(k, v) => v.try_into().map(|v| Optional(k, v)),
            Lazy::Rescue(v) => Ok(Rescue(v)),
            _ => todo!(),
        }
    }
 }
--- a/src/fragments/mime.rs
+++ b/src/fragments/mime.rs
@ -0,0 +1,120 @@
 use encoding_rs::Encoding;
 use nom::{
    bytes::complete::tag, character::complete as character, combinator::opt, sequence::tuple,
    IResult,
 };
 use crate::error::IMFError;
 use crate::fragments::lazy;
 use crate::fragments::whitespace::cfws;
 /// A two-component version number (`major.minor`), as produced by [`version`].
 ///
 /// NOTE(review): both fields are private, so code outside this module cannot
 /// read them — confirm whether accessors or `pub` fields are intended.
 #[derive(Debug, PartialEq)]
 pub struct Version {
    /// Number parsed before the `.` separator.
    major: u32,
    /// Number parsed after the `.` separator.
    minor: u32,
 }
 /// Media type value, split by top-level type.
 ///
 /// This is the payload of the eager `ContentType` header field variant.
 #[derive(Debug, PartialEq)]
 pub enum Type<'a> {
    /// A multipart type, refined by its [`MultipartSubtype`].
    Multipart(MultipartSubtype<'a>),
    /// A message type, refined by its [`MessageSubtype`].
    Message(MessageSubtype<'a>),
    /// Any other type.
    /// Presumably `(type, subtype, parameters)` — TODO confirm the field order.
    Other(&'a str, &'a str, Vec<Parameter<'a>>),
 }
 /// Subtype of a multipart media type.
 ///
 /// NOTE(review): every variant carries exactly one [`Parameter`], whereas
 /// [`MessageSubtype`] carries a `Vec<Parameter>`; presumably the single
 /// parameter is the boundary — confirm.
 #[derive(Debug, PartialEq)]
 pub enum MultipartSubtype<'a> {
    Alternative(Parameter<'a>),
    Mixed(Parameter<'a>),
    Digest(Parameter<'a>),
    Parallel(Parameter<'a>),
    /// Unrecognized subtype; the string is presumably the raw subtype name.
    Other(&'a str, Parameter<'a>),
 }
 /// Subtype of a message media type, each with its list of parameters.
 #[derive(Debug, PartialEq)]
 pub enum MessageSubtype<'a> {
    RFC822(Vec<Parameter<'a>>),
    Partial(Vec<Parameter<'a>>),
    External(Vec<Parameter<'a>>),
    /// Unrecognized subtype; the string is presumably the raw subtype name.
    Other(&'a str, Vec<Parameter<'a>>),
 }
 /// A single parameter attached to a media type.
 #[derive(Debug, PartialEq)]
 pub enum Parameter<'a> {
    /// Character set, already resolved to an `encoding_rs` [`Encoding`].
    Charset(&'static Encoding),
    /// Boundary string, borrowed from the input.
    Boundary(&'a str),
    /// Any other parameter; presumably `(name, value)` — TODO confirm.
    Other(&'a str, &'a str),
 }
 /// Transfer-encoding mechanism.
 ///
 /// This is the payload of the eager `ContentTransferEncoding` header field
 /// variant.
 #[derive(Debug, PartialEq)]
 pub enum Mechanism<'a> {
    // The leading underscore is needed because an identifier cannot start
    // with a digit.
    _7Bit,
    _8Bit,
    Binary,
    QuotedPrintable,
    Base64,
    /// Unrecognized mechanism; presumably the raw token from the header.
    Other(&'a str),
 }
 /// Eager conversion of a lazily recognized version field into a [`Version`].
 impl<'a> TryFrom<&'a lazy::Version<'a>> for Version {
    type Error = IMFError<'a>;
    /// Runs the [`version`] parser on the raw field value.
    ///
    /// Any input left over after the version is discarded. A parser failure
    /// is wrapped in [`IMFError::Version`].
    fn try_from(vs: &'a lazy::Version<'a>) -> Result<Self, Self::Error> {
        match version(vs.0) {
            Ok((_remaining, parsed)) => Ok(parsed),
            Err(failure) => Err(IMFError::Version(failure)),
        }
    }
 }
 /// Eager conversion of a lazily recognized content-type field into a [`Type`].
 impl<'a> TryFrom<&'a lazy::Type<'a>> for Type<'a> {
    type Error = IMFError<'a>;
    /// Placeholder: the input is currently ignored and the conversion always
    /// succeeds with an empty `Type::Other` (empty strings, no parameters).
    fn try_from(tp: &'a lazy::Type<'a>) -> Result<Self, Self::Error> {
        let no_parameters = Vec::new();
        let placeholder = Type::Other("", "", no_parameters);
        Ok(placeholder)
    }
 }
 /// Eager conversion of a lazily recognized transfer-encoding field into a
 /// [`Mechanism`].
 impl<'a> TryFrom<&'a lazy::Mechanism<'a>> for Mechanism<'a> {
    type Error = IMFError<'a>;
    /// Placeholder: the input is currently ignored and the conversion always
    /// succeeds with `Mechanism::Other("")`.
    fn try_from(mc: &'a lazy::Mechanism<'a>) -> Result<Self, Self::Error> {
        let placeholder = Mechanism::Other("");
        Ok(placeholder)
    }
 }
 /// Parses a `major.minor` version number.
 ///
 /// An optional `cfws` run is accepted (and discarded) before the major
 /// number, on either side of the `.`, and after the minor number.
 ///
 /// Returns the unconsumed input together with the parsed [`Version`]. The
 /// error of the first failing step is propagated unchanged.
 pub fn version(input: &str) -> IResult<&str, Version> {
    // Each step consumes from where the previous one stopped.
    let (rest, _) = opt(cfws)(input)?;
    let (rest, major) = character::u32(rest)?;
    let (rest, _) = opt(cfws)(rest)?;
    let (rest, _) = tag(".")(rest)?;
    let (rest, _) = opt(cfws)(rest)?;
    let (rest, minor) = character::u32(rest)?;
    let (rest, _) = opt(cfws)(rest)?;
    Ok((rest, Version { major, minor }))
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Every sample must parse completely to version 1.0, wherever the
    /// comment or surrounding whitespace is placed.
    #[test]
    fn test_version() {
        let samples = [
            "1.0",
            " 1.0 (produced by MetaSend Vx.x)",
            "(produced by MetaSend Vx.x) 1.0",
            "1.(produced by MetaSend Vx.x)0",
        ];
        for raw in samples.iter().copied() {
            assert_eq!(version(raw), Ok(("", Version { major: 1, minor: 0 })));
        }
    }
 }
--- a/src/fragments/mod.rs
+++ b/src/fragments/mod.rs
@ -16,3 +16,5 @@ pub mod lazy;
 mod mailbox;
 pub mod section;
 pub mod trace;
 pub mod mime;
--- a/src/fragments/section.rs
+++ b/src/fragments/section.rs
@ -71,6 +71,7 @@ impl<'a> FromIterator<&'a Field<'a>> for Section<'a> {
                    section.optional.insert(k, v);
                }
                Field::Rescue(v) => section.unparsed.push(v),
                _ => todo!(),
            }
        }
        section