From a6dd1e1718a4e8624492185a692c26b709d06ded Mon Sep 17 00:00:00 2001 From: Quentin Dufour Date: Sat, 17 Jun 2023 11:43:54 +0200 Subject: [PATCH] improve readme, wip datetime --- README.md | 46 +++++++++++++++++++++++++++++++++++++---- src/datetime.rs | 54 +++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 88 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 6f4837b..d710606 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,46 @@ # imf-codec **Work in progress, do not use in production** - -**Focus: correctness over performance** - **This is currently only a decoder (parser), encoding is not supported.** -Targeted RFC +## Goals + + - Correctness: do not deviate from the RFC, support edge and obsolete cases + - Straightforward/maintainable: implement the RFCs as closely as possible, minimizing the number of clever tricks and optimizations + - Multiple syntaxes: Write the parser so it's easy to alternate between the strict and obsolete/compatible syntax + - Never fail: Provide as many fallbacks as possible + +## Non goals + + - Parsing optimization (greedy parser, etc.) as it would require significantly deviating from the RFC ABNF syntax (would consider this case if we could prove that the transformations we make are equivalent) + - Pipelining/streaming/buffering as the parser can arbitrarily backtrack + our result contains references to the whole buffer, imf-codec must keep the whole buffer in memory. Avoiding the sequential approach would certainly speed up the parsing a little bit, but it's too much work to implement currently. + - Zerocopy. It might be implementable in the future, but to quickly bootstrap this project, I avoided it for now. + +## Missing / known bugs + +Current known limitations/bugs: + + - Resent Header Fields are not implemented + - Return-Path/Received headers might be hard to use as their order is important, and it's currently lost in the final data structure.
+ - Datetime parsing of an invalid date might return `None` instead of falling back to the `bad_body` field + - Comments are dropped + +## Design + +Based on nom, a parser combinator lib in Rust. + +## Testing strategy + + - Unit testing: each parser combinator independently. + - Selected full emails + - Enron 500k + +## Development status + +Early development. Not ready. +Do not use it in production or any software at all. + +## Targeted RFC | # | Name | |---|------| @@ -20,3 +54,7 @@ Targeted RFC |2049 | ↳ Multipurpose Internet Mail Extensions (MIME) Part Five: Conformance Criteria and Examples | |6532 | Internationalized Email Headers | |9228 | Delivered-To Email Header Field | + +## Alternatives + +`stalwartlab/mail_parser` diff --git a/src/datetime.rs b/src/datetime.rs index 988616d..fd31208 100644 --- a/src/datetime.rs +++ b/src/datetime.rs @@ -1,4 +1,4 @@ -use chrono::{DateTime, FixedOffset, NaiveDate}; +use chrono::{DateTime, FixedOffset, NaiveDate, NaiveTime, TimeZone}; use nom::{ IResult, bytes::complete::take_while_m_n, @@ -7,22 +7,23 @@ use nom::{ use crate::misc_token; /// date-time = [ day-of-week "," ] date time [CFWS] +/// time = time-of-day zone /// @FIXME: if parsing fails, Option::None is silently returned... pub fn section(input: &str) -> IResult<&str, Option>> { - let (input, (_, date, time, _) = tuple(( - opt(terminated(day_of_week), tag(",")), - date, time, + let (input, (_, date, time, tz, _)) = tuple(( + opt(terminated(day_of_week, tag(","))), + date, time_of_day, zone opt(cfws)))(input)?; - + //@TODO: rebuild DateTime from NaiveDate, NaiveTime and TimeZone // @FIXME want to extract datetime our way in the future // to better handle obsolete/bad cases instead of returning raw text.
- let (input, raw_date) = misc_token::unstructured(input)?; - Ok((input, DateTime::parse_from_rfc2822(&raw_date).unwrap())) + //let (input, raw_date) = misc_token::unstructured(input)?; + //Ok((input, DateTime::parse_from_rfc2822(&raw_date).unwrap())) } /// day-of-week = ([FWS] day-name) / obs-day-of-week @@ -102,5 +103,42 @@ fn month(input: &str) -> IResult<&str, u32) { /// year = (FWS 4*DIGIT FWS) / obs-year /// obs-year = [CFWS] 2*DIGIT [CFWS] fn year(input: &str) -> IResult<&str, i32> { - alt(( + alt((strict_year, obs_year))(input) } + +fn strict_year(input &str) -> IResult<&str, i32> { + delimited(fws, strict_year_digit, fws)(input) +} + +fn obs_year(input: &str) -> IResult<&str, i32> { + delimited(opt(cfws), obs_year_digit, opt(cfws))(input) +} + +fn strict_year_digit(input: &str) -> IResult<&str, i32> { + // Max value for i32 is 2,147,483,647 ; in other words 10 digits. + // 9 digits should always be parsable into an i32 and enough for a year. + // @FIXME a better implementation is desirable + map(take_while_m_n(4, 9, is_digit), |d| d.parse::().unwrap())(input) +} + +fn obs_year_digit(input: &str) -> IResult<&str, i32> { + // @FIXME same as strict_year_digit + map(take_while_m_n(2, 9, is_digit), |d| d.parse::().unwrap())(input) +} + +/// time-of-day = hour ":" minute [ ":" second ] +/// +fn time(input: &str) -> IResult<&str, (NaiveTime, TimeZone)> { + map( + tuple((time_digit, tag(":"), time_digit, opt(preceded(tag(":"), time_digit)))), + |(hour, _, minute, maybe_sec)| +} + +fn time_digit(input: &str) -> IResult<&str, u32> { + alt((strict_time_digit, obs_time_digit))(input) +} + +fn strict_time_digit(input: &str) -> IResult<&str, u32> { + take_while_m_n(4, 4, is_digit)(input) +} +