improve readme, wip datetime

This commit is contained in:
Quentin 2023-06-17 11:43:54 +02:00
parent 1f5d9ebf78
commit a6dd1e1718
Signed by: quentin
GPG key ID: E9602264D639FF68
2 changed files with 88 additions and 12 deletions

View file

@ -1,12 +1,46 @@
# imf-codec
**Work in progress, do not use in production**
**Focus: correctness over performance**
**This is currently only a decoder (parser), encoding is not supported.**
Targeted RFC
## Goals
- Correctness: do not deviate from the RFC, support edge and obsolete cases
- Straightforward/maintainable: implement the RFCs as closely as possible, minimizing the amount of clever tricks and optimizations
- Multiple syntaxes: write the parser so it's easy to switch between the strict and the obsolete/compatible syntaxes
- Never fail: Provide as many fallbacks as possible
## Non goals
- Parsing optimization (greedy parser, etc.), as it would require deviating significantly from the RFC ABNF syntax (we would consider it if we could prove that the transformations we make are equivalent)
- Pipelining/streaming/buffering: because the parser can backtrack arbitrarily and our result contains references into the whole buffer, imf-codec must keep the whole buffer in memory. Avoiding the sequential approach would certainly speed up parsing a little, but it is too much work to implement currently.
- Zerocopy. It might be implementable in the future, but to quickly bootstrap this project, I avoided it for now.
## Missing / known bugs
Current known limitations/bugs:
- Resent Header Fields are not implemented
- Return-Path/Received headers might be hard to use as their order is important, and it's currently lost in the final data structure.
- Datetime parsing of an invalid date might return `None` instead of falling back to the `bad_body` field
- Comments are dropped
## Design
Based on nom, a parser combinator lib in Rust.
## Testing strategy
- Unit testing: each parser combinator is tested independently.
- Selected full emails
- Enron 500k
## Development status
Early development. Not ready.
Do not use it in production or any software at all.
## Targeted RFC
| # | Name |
|---|------|
@ -20,3 +54,7 @@ Targeted RFC
|2049 | ↳ Multipurpose Internet Mail Extensions (MIME) Part Five: Conformance Criteria and Examples |
|6532 | Internationalized Email Headers |
|9228 | Delivered-To Email Header Field |
## Alternatives
`stalwartlabs/mail-parser`

View file

@ -1,4 +1,4 @@
use chrono::{DateTime, FixedOffset, NaiveDate};
use chrono::{DateTime, FixedOffset, NaiveDate, NaiveTime, TimeZone};
use nom::{
IResult,
bytes::complete::take_while_m_n,
@ -7,22 +7,23 @@ use nom::{
use crate::misc_token;
/// Parses a `date-time` value according to the grammar rules below.
///
/// date-time = [ day-of-week "," ] date time [CFWS]
/// time = time-of-day zone
///
/// The signature promises the remaining input together with an optional
/// `DateTime<FixedOffset>`.
/// @FIXME: if parsing fails, Option::None is silently returned...
pub fn section(input: &str) -> IResult<&str, Option<DateTime<FixedOffset>>> {
// NOTE(review): two overlapping destructuring statements follow. This first one
// never closes its pattern nor its `tuple((` call and hands `terminated` a single
// parser; it looks superseded by the statement right after it — confirm.
let (input, (_, date, time, _) = tuple((
opt(terminated(day_of_week), tag(",")),
date, time,
// Optional day-of-week followed by ",", then date, time-of-day, zone and an
// optional trailing CFWS; only date, time and zone are kept.
// NOTE(review): there is no comma between `zone` and `opt(cfws)` below.
let (input, (_, date, time, tz, _)) = tuple((
opt(terminated(day_of_week, tag(","))),
date, time_of_day, zone
opt(cfws)))(input)?;
//@TODO: rebuild DateTime from NaiveDate, NaiveTime and TimeZone
// @FIXME want to extract datetime our way in the future
// to better handle obsolete/bad cases instead of returning raw text.
// NOTE(review): the two statements below are repeated, commented out, right
// after. The live `Ok(...)` unwraps the result of `parse_from_rfc2822` and yields
// a bare `DateTime`, while the signature expects an `Option` — confirm which
// variant is meant to stay.
let (input, raw_date) = misc_token::unstructured(input)?;
Ok((input, DateTime::parse_from_rfc2822(&raw_date).unwrap()))
//let (input, raw_date) = misc_token::unstructured(input)?;
//Ok((input, DateTime::parse_from_rfc2822(&raw_date).unwrap()))
}
/// day-of-week = ([FWS] day-name) / obs-day-of-week
@ -102,5 +103,42 @@ fn month(input: &str) -> IResult<&str, u32) {
/// year = (FWS 4*DIGIT FWS) / obs-year
/// obs-year = [CFWS] 2*DIGIT [CFWS]
fn year(input: &str) -> IResult<&str, i32> {
alt((
alt((strict_year, obs_year))(input)
}
fn strict_year(input &str) -> IResult<&str, i32> {
delimited(fws, strict_year_digit, fws)(input)
}
fn obs_year(input: &str) -> IResult<&str, i32> {
delimited(opt(cfws), obs_year_digit, opt(cfws))(input)
}
fn strict_year_digit(input: &str) -> IResult<&str, i32> {
// Max value for i32 is 2,147,483,647 ; in other words 10 digits.
// 9 digits should always be parsable into an i32 and enough for a year.
// @FIXME a better implementation is desirable
map(take_while_m_n(4, 9, is_digit), |d| d.parse::<i32>().unwrap())(input)
}
fn obs_year_digit(input: &str) -> IResult<&str, i32> {
// @FIXME same as strict_year_digit
map(take_while_m_n(2, 9, is_digit), |d| d.parse::<i32>().unwrap())(input)
}
/// time-of-day = hour ":" minute [ ":" second ]
///
/// Reads an hour and a minute separated by ":" and, optionally, a second
/// introduced by another ":"; each field goes through `time_digit`.
///
/// NOTE(review): work in progress — the mapping closure has no body and the
/// `map(` call is never closed, so nothing is built from the parsed fields yet.
/// NOTE(review): the return type names `TimeZone` next to `NaiveTime`, but no
/// zone is read by the parsers below — confirm the intended output type.
fn time(input: &str) -> IResult<&str, (NaiveTime, TimeZone)> {
map(
// hour ":" minute, then an optional ":" second
tuple((time_digit, tag(":"), time_digit, opt(preceded(tag(":"), time_digit)))),
|(hour, _, minute, maybe_sec)|
}
/// Parses one numeric time field (used for hour, minute and second).
///
/// The strict parser is attempted first; `obs_time_digit` is the fallback.
/// Returns the remaining input and the field value as a `u32`.
fn time_digit(input: &str) -> IResult<&str, u32> {
    // Build the two-way choice once, then run it on the input.
    let mut strict_or_obsolete = alt((strict_time_digit, obs_time_digit));
    strict_or_obsolete(input)
}
fn strict_time_digit(input: &str) -> IResult<&str, u32> {
take_while_m_n(4, 4, is_digit)(input)
}