From 1e6b18de5aee9f1660a34054bf778cd6dc4350b2 Mon Sep 17 00:00:00 2001 From: Quentin Dufour Date: Mon, 12 Jun 2023 17:20:24 +0200 Subject: [PATCH] wip atom --- src/headers.rs | 61 +++++++++++++++++++++++++++++++++++--------------- src/tokens.rs | 57 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 97 insertions(+), 21 deletions(-) diff --git a/src/headers.rs b/src/headers.rs index bdcad2f..35f82be 100644 --- a/src/headers.rs +++ b/src/headers.rs @@ -46,6 +46,7 @@ pub fn header_section(input: &str) -> IResult<&str, PermissiveHeaderSection> { Ok((input, headers)) } +#[derive(Debug)] enum HeaderField<'a> { // 3.6.1. The Origination Date Field Date(HeaderDate), @@ -86,16 +87,17 @@ enum HeaderField<'a> { Optional(&'a str, String) } -/// Extract one header field +/// Parse one header field /// -/// Derived grammar inspired by RFC5322 optional-field: +/// RFC5322 optional-field seems to be a generalization of the field terminology. +/// We use it to parse all header names: /// /// ```abnf /// field = field-name ":" unstructured CRLF /// field-name = 1*ftext /// ftext = %d33-57 / ; Printable US-ASCII /// %d59-126 ; characters not including -/// ; ":". +/// ; ":". /// ``` fn header_field(input: &str) -> IResult<&str, HeaderField> { // Extract field name @@ -104,21 +106,8 @@ fn header_field(input: &str) -> IResult<&str, HeaderField> { // Extract field body let (input, hfield) = match field_name { - "Date" => { - // @FIXME want to extract datetime our way in the future - // to better handle obsolete/bad cases instead of crashing. - let (input, raw_date) = unstructured(input)?; - let date = match DateTime::parse_from_rfc2822(&raw_date) { - Ok(chronodt) => HeaderDate::Parsed(chronodt), - Err(e) => HeaderDate::Unknown(raw_date, e), - }; - (input, HeaderField::Date(date)) - }, - "From" => { - let (input, mbx) = mailbox(input)?; - //many0( - unimplemented!() - }, + "Date" => datetime(input)?, + "From" => from(input)?, "Sender" => unimplemented!(), "Subject" => { let (input, body) = unstructured(input)?; @@ -130,6 +119,7 @@ fn header_field(input: &str) -> IResult<&str, HeaderField> { } }; + // Drop EOL let (input, _) = crlf(input)?; return Ok((input, hfield)); } @@ -159,6 +149,41 @@ fn unstructured(input: &str) -> IResult<&str, String> { Ok((input, body)) } +fn datetime(input: &str) -> IResult<&str, HeaderField> { + // @FIXME want to extract datetime our way in the future + // to better handle obsolete/bad cases instead of returning raw text. + let (input, raw_date) = unstructured(input)?; + let date = match DateTime::parse_from_rfc2822(&raw_date) { + Ok(chronodt) => HeaderDate::Parsed(chronodt), + Err(e) => HeaderDate::Unknown(raw_date, e), + }; + Ok((input, HeaderField::Date(date))) +} + +fn from(input: &str) -> IResult<&str, HeaderField> { + //let (input, mbox_list) = many0(mailbox)(input)?; + //Ok((input, HeaderField::From(mbox_list))) + unimplemented!(); +} + fn mailbox(input: &str) -> IResult<&str, MailboxRef> { unimplemented!(); } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_datetime() { + let datefield = "Thu,\r\n 13\r\n Feb\r\n 1969\r\n 23:32\r\n -0330 (Newfoundland Time)"; + let (input, v) = datetime(datefield).unwrap(); + assert_eq!(input, ""); + match v { + HeaderField::Date(HeaderDate::Parsed(_)) => (), + _ => panic!("Date has not been parsed"), + }; + } +} + + diff --git a/src/tokens.rs b/src/tokens.rs index de76c99..bd03f64 100644 --- a/src/tokens.rs +++ b/src/tokens.rs @@ -5,7 +5,7 @@ use nom::{ character::complete::{crlf, satisfy, space0, space1}, combinator::{recognize, opt}, multi::{many0, many1}, - sequence::{preceded, terminated, tuple}, + sequence::{delimited, pair, preceded, terminated, tuple}, }; /// Lexical tokens @@ -68,7 +68,7 @@ fn fold_marker(input: &str) -> IResult<&str, &str> { /// CFWS = (1*([FWS] comment) [FWS]) / FWS /// ``` pub fn cfws(input: &str) -> IResult<&str, &str> { - alt((perm_fws, recognize(comments)))(input) + alt((recognize(comments), perm_fws))(input) } pub fn comments(input: &str) -> IResult<&str, ()> { @@ -122,10 +122,35 @@ pub fn vchar_seq(input: &str) -> IResult<&str, &str> { take_while1(is_vchar)(input) } +fn is_atext(c: char) -> bool { + c.is_ascii_alphanumeric() || "!#$%&'*+-/=?^_`{|}~".contains(c) +} + +/// atom +/// +/// `[CFWS] 1*atext [CFWS]` +fn atom(input: &str) -> IResult<&str, &str> { + delimited(opt(cfws), take_while1(is_atext), opt(cfws))(input) +} + +/// dot-atom-text +/// +/// `1*atext *("." 1*atext)` +fn dot_atom_text(input: &str) -> IResult<&str, &str> { + recognize(pair(take_while1(is_atext), many0(pair(tag("."), take_while1(is_atext)))))(input) +} + +/// dot-atom +/// +/// `[CFWS] dot-atom-text [CFWS]` +fn dot_atom(input: &str) -> IResult<&str, &str> { + delimited(opt(cfws), dot_atom_text, opt(cfws))(input) +} + + #[cfg(test)] mod tests { use super::*; - use nom; #[test] fn test_vchar_seq() { @@ -151,5 +176,31 @@ mod tests { #[test] fn test_cfws() { assert_eq!(cfws("(A nice \\) chap) "), Ok(("", "(A nice \\) chap) "))); + assert_eq!(cfws("(Chris's host.)public.example>,"), Ok(("public.example>,", "(Chris's host.)"))); + assert_eq!(cfws("(double (comment) is fun) wouch"), Ok(("wouch", "(double (comment) is fun) "))); + } + + #[test] + fn test_atext() { + assert!(is_atext('=')); + assert!(is_atext('5')); + assert!(is_atext('Q')); + assert!(!is_atext(' ')); + assert!(!is_atext('É')); + } + + #[test] + fn test_atom() { + assert_eq!(atom("(skip) imf_codec (hidden) aerogramme"), Ok(("aerogramme", "imf_codec"))); + } + + #[test] + fn test_dot_atom_text() { + assert_eq!(dot_atom_text("quentin.dufour.io abcdef"), Ok((" abcdef", "quentin.dufour.io"))); + } + + #[test] + fn test_dot_atom() { + assert_eq!(dot_atom(" (skip) quentin.dufour.io abcdef"), Ok(("abcdef", "quentin.dufour.io"))); } }