improve readme, wip datetime
This commit is contained in:
parent
1f5d9ebf78
commit
a6dd1e1718
2 changed files with 88 additions and 12 deletions
46
README.md
46
README.md
|
@ -1,12 +1,46 @@
|
||||||
# imf-codec
|
# imf-codec
|
||||||
|
|
||||||
**Work in progress, do not use in production**
|
**Work in progress, do not use in production**
|
||||||
|
|
||||||
**Focus: correctness over performance**
|
|
||||||
|
|
||||||
**This is currently only a decoder (parser), encoding is not supported.**
|
**This is currently only a decoder (parser), encoding is not supported.**
|
||||||
|
|
||||||
Targeted RFC
|
## Goals
|
||||||
|
|
||||||
|
- Correctness: do not deviate from the RFC, support edge and obsolete cases
|
||||||
|
- Straightforward/maintainable: implement the RFCs as closely as possible, minimizing the number of clever tricks and optimizations
|
||||||
|
- Multiple syntax: Write the parser so it's easy to alternate between the strict and obsolete/compatible syntax
|
||||||
|
- Never fail: Provide as many fallbacks as possible
|
||||||
|
|
||||||
|
## Non goals
|
||||||
|
|
||||||
|
- Parsing optimization (greedy parser, etc.), as it would require deviating significantly from the RFC ABNF syntax (we would reconsider this if we could prove that the transformations we make are equivalent)
|
||||||
|
- Pipelining/streaming/buffering: as the parser can backtrack arbitrarily and our result contains references to the whole buffer, imf-codec must keep the whole buffer in memory. Avoiding the sequential approach would certainly speed up parsing a little, but it is too much work to implement currently.
|
||||||
|
- Zerocopy. It might be implementable in the future, but to quickly bootstrap this project, I avoided it for now.
|
||||||
|
|
||||||
|
## Missing / known bugs
|
||||||
|
|
||||||
|
Current known limitations/bugs:
|
||||||
|
|
||||||
|
- Resent Header Fields are not implemented
|
||||||
|
- Return-Path/Received headers might be hard to use, as their order is important and is currently lost in the final data structure.
|
||||||
|
- Datetime parsing of an invalid date might return `None` instead of falling back to the `bad_body` field
|
||||||
|
- Comments are dropped
|
||||||
|
|
||||||
|
## Design
|
||||||
|
|
||||||
|
Based on nom, a parser combinator lib in Rust.
|
||||||
|
|
||||||
|
## Testing strategy
|
||||||
|
|
||||||
|
- Unit testing: each parser combinator is tested independently.
|
||||||
|
- Selected full emails
|
||||||
|
- Enron 500k
|
||||||
|
|
||||||
|
## Development status
|
||||||
|
|
||||||
|
Early development. Not ready.
|
||||||
|
Do not use it in production or any software at all.
|
||||||
|
|
||||||
|
## Targeted RFC
|
||||||
|
|
||||||
| # | Name |
|
| # | Name |
|
||||||
|---|------|
|
|---|------|
|
||||||
|
@ -20,3 +54,7 @@ Targeted RFC
|
||||||
|2049 | ↳ Multipurpose Internet Mail Extensions (MIME) Part Five: Conformance Criteria and Examples |
|
|2049 | ↳ Multipurpose Internet Mail Extensions (MIME) Part Five: Conformance Criteria and Examples |
|
||||||
|6532 | Internationalized Email Headers |
|
|6532 | Internationalized Email Headers |
|
||||||
|9228 | Delivered-To Email Header Field |
|
|9228 | Delivered-To Email Header Field |
|
||||||
|
|
||||||
|
## Alternatives
|
||||||
|
|
||||||
|
`stalwartlabs/mail-parser`
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
use chrono::{DateTime, FixedOffset, NaiveDate};
|
use chrono::{DateTime, FixedOffset, NaiveDate, NaiveTime, TimeZone};
|
||||||
use nom::{
|
use nom::{
|
||||||
IResult,
|
IResult,
|
||||||
bytes::complete::take_while_m_n,
|
bytes::complete::take_while_m_n,
|
||||||
|
@ -7,22 +7,23 @@ use nom::{
|
||||||
use crate::misc_token;
|
use crate::misc_token;
|
||||||
|
|
||||||
/// date-time = [ day-of-week "," ] date time [CFWS]
|
/// date-time = [ day-of-week "," ] date time [CFWS]
|
||||||
|
/// time = time-of-day zone
|
||||||
/// @FIXME: if parsing fails, Option::None is silently returned...
|
/// @FIXME: if parsing fails, Option::None is silently returned...
|
||||||
pub fn section(input: &str) -> IResult<&str, Option<DateTime<FixedOffset>>> {
|
pub fn section(input: &str) -> IResult<&str, Option<DateTime<FixedOffset>>> {
|
||||||
let (input, (_, date, time, _) = tuple((
|
let (input, (_, date, time, tz, _)) = tuple((
|
||||||
opt(terminated(day_of_week), tag(",")),
|
opt(terminated(day_of_week, tag(","))),
|
||||||
date, time,
|
date, time_of_day, zone
|
||||||
opt(cfws)))(input)?;
|
opt(cfws)))(input)?;
|
||||||
|
|
||||||
|
|
||||||
|
//@TODO: rebuild DateTime from NaiveDate, NaiveTime and TimeZone
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// @FIXME want to extract datetime our way in the future
|
// @FIXME want to extract datetime our way in the future
|
||||||
// to better handle obsolete/bad cases instead of returning raw text.
|
// to better handle obsolete/bad cases instead of returning raw text.
|
||||||
let (input, raw_date) = misc_token::unstructured(input)?;
|
//let (input, raw_date) = misc_token::unstructured(input)?;
|
||||||
Ok((input, DateTime::parse_from_rfc2822(&raw_date).unwrap()))
|
//Ok((input, DateTime::parse_from_rfc2822(&raw_date).unwrap()))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// day-of-week = ([FWS] day-name) / obs-day-of-week
|
/// day-of-week = ([FWS] day-name) / obs-day-of-week
|
||||||
|
@ -102,5 +103,42 @@ fn month(input: &str) -> IResult<&str, u32) {
|
||||||
/// year = (FWS 4*DIGIT FWS) / obs-year
|
/// year = (FWS 4*DIGIT FWS) / obs-year
|
||||||
/// obs-year = [CFWS] 2*DIGIT [CFWS]
|
/// obs-year = [CFWS] 2*DIGIT [CFWS]
|
||||||
fn year(input: &str) -> IResult<&str, i32> {
|
fn year(input: &str) -> IResult<&str, i32> {
|
||||||
alt((
|
alt((strict_year, obs_year))(input)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn strict_year(input &str) -> IResult<&str, i32> {
|
||||||
|
delimited(fws, strict_year_digit, fws)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn obs_year(input: &str) -> IResult<&str, i32> {
|
||||||
|
delimited(opt(cfws), obs_year_digit, opt(cfws))(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn strict_year_digit(input: &str) -> IResult<&str, i32> {
|
||||||
|
// Max value for i32 is 2,147,483,647 ; in other words 10 digits.
|
||||||
|
// 9 digits should always be parsable into an i32 and enough for a year.
|
||||||
|
// @FIXME a better implementation is desirable
|
||||||
|
map(take_while_m_n(4, 9, is_digit), |d| d.parse::<i32>().unwrap())(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn obs_year_digit(input: &str) -> IResult<&str, i32> {
|
||||||
|
// @FIXME same as strict_year_digit
|
||||||
|
map(take_while_m_n(2, 9, is_digit), |d| d.parse::<i32>().unwrap())(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// time-of-day = hour ":" minute [ ":" second ]
|
||||||
|
///
|
||||||
|
fn time(input: &str) -> IResult<&str, (NaiveTime, TimeZone)> {
|
||||||
|
map(
|
||||||
|
tuple((time_digit, tag(":"), time_digit, opt(preceded(tag(":"), time_digit)))),
|
||||||
|
|(hour, _, minute, maybe_sec)|
|
||||||
|
}
|
||||||
|
|
||||||
|
fn time_digit(input: &str) -> IResult<&str, u32> {
|
||||||
|
alt((strict_time_digit, obs_time_digit))(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn strict_time_digit(input: &str) -> IResult<&str, u32> {
|
||||||
|
take_while_m_n(4, 4, is_digit)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue