eml-codec/src/text/misc_token.rs

use nom::{
    branch::alt,
    bytes::complete::{tag, take_while1},
    character::complete::space0,
    combinator::{map, opt},
    multi::{many0, many1, separated_list1},
    sequence::{preceded},
    IResult,
};

use crate::text::{
    quoted::{QuotedString, quoted_string},
    whitespace::{fws, is_obs_no_ws_ctl},
    words::{atom, mime_atom, is_vchar},
    encoding::{self, encoded_word},
    ascii,
};

/// An ordered list of phrases, as produced by `phrase_list`.
///
/// The wrapped vector is public so callers can inspect the individual
/// `Phrase` values directly.
#[derive(Debug, PartialEq, Default)]
pub struct PhraseList<'a>(pub Vec<Phrase<'a>>);
pub fn phrase_list(input: &[u8]) -> IResult<&[u8], PhraseList> {
    map(separated_list1(tag(","), phrase), PhraseList)(input)
}

/// A MIME word: either a quoted string or a bare MIME atom.
#[derive(Debug, PartialEq)]
pub enum MIMEWord<'a> {
    /// A value produced by the `quoted_string` parser.
    Quoted(QuotedString<'a>),
    /// The raw bytes matched by the `mime_atom` parser, borrowed from the input.
    Atom(&'a [u8]),
}
impl<'a> MIMEWord<'a> {
    /// Renders the word as an owned `String`.
    ///
    /// Quoted values delegate to the quoted string's own `to_string`.
    /// Atoms are decoded as UTF-8 without BOM handling.
    pub fn to_string(&self) -> String {
        match self {
            Self::Quoted(v) => v.to_string(),
            // `decode_without_bom_handling` returns `(text, had_errors)`:
            // field 0 is the decoded text, field 1 is only an error flag.
            Self::Atom(v) => encoding_rs::UTF_8.decode_without_bom_handling(v).0.to_string(),
        }
    }
}
/// Parses a single MIME word.
///
/// A quoted string is attempted first; if that does not match, a MIME atom
/// is attempted on the same input.
pub fn mime_word(input: &[u8]) -> IResult<&[u8], MIMEWord> {
    let quoted = map(quoted_string, MIMEWord::Quoted);
    let bare = map(mime_atom, MIMEWord::Atom);
    alt((quoted, bare))(input)
}

/// A single word of a phrase.
#[derive(Debug, PartialEq)]
pub enum Word<'a> {
    /// A value produced by the `quoted_string` parser.
    Quoted(QuotedString<'a>),
    /// A value produced by the `encoded_word` parser.
    Encoded(encoding::EncodedWord<'a>),
    /// The raw bytes matched by the `atom` parser, borrowed from the input.
    Atom(&'a [u8]),
}

impl<'a> Word<'a> {
    /// Renders the word as an owned `String`.
    ///
    /// Quoted and encoded words delegate to their own `to_string`; atoms are
    /// decoded as UTF-8 without BOM handling.
    pub fn to_string(&self) -> String {
        match self {
            Self::Quoted(quoted) => quoted.to_string(),
            Self::Encoded(encoded) => encoded.to_string(),
            Self::Atom(bytes) => {
                // Only the decoded text is kept; the error flag is ignored.
                let (text, _had_errors) = encoding_rs::UTF_8.decode_without_bom_handling(bytes);
                text.into_owned()
            }
        }
    }
}

/// Word
///
/// ```abnf
///    word            =   atom / quoted-string
/// ```
///
/// In addition to the grammar above, this parser also accepts an encoded
/// word. Alternatives are tried in this order: quoted string, encoded word,
/// atom.
pub fn word(input: &[u8]) -> IResult<&[u8], Word> {
    alt((
        map(quoted_string, Word::Quoted),
        map(encoded_word, Word::Encoded),
        map(atom, Word::Atom),
    ))(input)
}

/// A sequence of words, as produced by `phrase`.
#[derive(Debug, PartialEq)]
pub struct Phrase<'a>(pub Vec<Word<'a>>);

impl<'a> Phrase<'a> {
    /// Renders every word and joins them with a single space.
    ///
    /// An empty phrase yields an empty string.
    pub fn to_string(&self) -> String {
        let mut rendered = String::new();
        for (idx, word) in self.0.iter().enumerate() {
            // Separator goes between words only, never before the first one.
            if idx > 0 {
                rendered.push(' ');
            }
            rendered.push_str(&word.to_string());
        }
        rendered
    }
}

/// Phrase
///
/// ```abnf
///    phrase          =   1*word / obs-phrase
/// ```
pub fn phrase(input: &[u8]) -> IResult<&[u8], Phrase> {
    let (input, phrase) = map(many1(word), |v| Phrase(v))(input)?;
    Ok((input, phrase))
}

/// Byte predicate for (obsolete-compatible) unstructured text.
///
/// ```abnf
/// obs-utext       =   %d0 / obs-NO-WS-CTL / VCHAR
/// ```
///
/// Returns `true` when `c` is accepted by `is_vchar`, by `is_obs_no_ws_ctl`,
/// or equals `ascii::NULL`.
fn is_unstructured(c: u8) -> bool {
    if is_vchar(c) {
        return true;
    }
    if is_obs_no_ws_ctl(c) {
        return true;
    }
    c == ascii::NULL
}

/// One token of an unstructured header body.
#[derive(Debug, PartialEq)]
pub enum UnstrToken<'a> {
    /// Placeholder meaning "no previous token"; used as the starting state
    /// when rendering an `Unstructured` value. Renders as the empty string.
    Init,
    /// A value produced by the `encoded_word` parser.
    Encoded(encoding::EncodedWord<'a>),
    /// A run of raw bytes borrowed from the input.
    Plain(&'a [u8]),
}

impl<'a> UnstrToken<'a> {
    /// Renders the token as an owned `String`.
    ///
    /// Plain bytes are decoded as UTF-8 without BOM handling; encoded words
    /// delegate to their own `to_string`.
    pub fn to_string(&self) -> String {
        match self {
            Self::Init => String::new(),
            Self::Encoded(encoded) => encoded.to_string(),
            Self::Plain(bytes) => {
                // Only the decoded text is kept; the error flag is ignored.
                let (text, _had_errors) = encoding_rs::UTF_8.decode_without_bom_handling(bytes);
                text.into_owned()
            }
        }
    }
}

/// The tokens of an unstructured header body, in input order.
#[derive(Debug, PartialEq)]
pub struct Unstructured<'a>(pub Vec<UnstrToken<'a>>);

impl<'a> Unstructured<'a> {
    /// Renders all tokens into a single `String`.
    ///
    /// Tokens are separated by one space, except that nothing is inserted
    /// before the first token or between two directly adjacent encoded words.
    pub fn to_string(&self) -> String {
        let mut rendered = String::new();
        // `Init` stands for "nothing has been emitted yet".
        let mut previous = &UnstrToken::Init;

        for token in &self.0 {
            let glued = matches!(
                (previous, token),
                (UnstrToken::Init, _) | (UnstrToken::Encoded(_), UnstrToken::Encoded(_))
            );
            if !glued {
                rendered.push(' ');
            }
            rendered.push_str(&token.to_string());
            previous = token;
        }

        rendered
    }
}

/// Unstructured header field body
///
/// ```abnf
/// unstructured    =   (*([FWS] VCHAR_SEQ) *WSP) / obs-unstruct
/// ```
///
/// Collects zero or more tokens, each optionally preceded by folding
/// whitespace. At every position an encoded word is tried first, then a
/// non-empty run of bytes accepted by `is_unstructured`. Trailing spaces and
/// tabs after the last token are consumed and discarded.
pub fn unstructured(input: &[u8]) -> IResult<&[u8], Unstructured> {
    let token = alt((
        map(encoded_word, UnstrToken::Encoded),
        map(take_while1(is_unstructured), UnstrToken::Plain),
    ));
    let (rest, tokens) = many0(preceded(opt(fws), token))(input)?;

    let (rest, _trailing) = space0(rest)?;
    Ok((rest, Unstructured(tokens)))
}


#[cfg(test)]
mod tests {
    use super::*;

    /// Checks phrase rendering and where parsing stops on folded input.
    #[test]
    fn test_phrase() {
        // Inputs paired with the text they must render to.
        let cases: [(&[u8], &str); 2] = [
            (&b"hello world"[..], "hello world"),
            (&b"salut \"le\" monde"[..], "salut le monde"),
        ];
        for (raw, expected) in cases {
            let (_, parsed) = phrase(raw).unwrap();
            assert_eq!(parsed.to_string(), expected.to_string());
        }

        // A CRLF followed by a space is a fold and continues the phrase;
        // a CRLF not followed by whitespace ends it.
        let (rest, parsed) = phrase(b"fin\r\n du\r\nmonde").unwrap();
        assert_eq!(rest, &b"\r\nmonde"[..]);
        assert_eq!(parsed.to_string(), "fin du".to_string());
    }
}
wip refactor 2023-07-18 23:25:10 +02:00			`use nom::{`
			`branch::alt,`
working field parsing 2023-07-20 09:41:10 +02:00			`bytes::complete::{tag, take_while1},`
wip refactor 2023-07-18 23:25:10 +02:00			`character::complete::space0,`
compile subset 2023-07-19 10:41:51 +02:00			`combinator::{map, opt},`
working field parsing 2023-07-20 09:41:10 +02:00			`multi::{many0, many1, separated_list1},`
compile subset 2023-07-19 10:41:51 +02:00			`sequence::{preceded},`
wip refactor 2023-07-18 23:25:10 +02:00			`IResult,`
			`};`

			`use crate::text::{`
fix mailbox tests 2023-07-19 15:28:17 +02:00			`quoted::{QuotedString, quoted_string},`
wip refactor 2023-07-18 23:25:10 +02:00			`whitespace::{fws, is_obs_no_ws_ctl},`
wip type 2023-07-21 18:31:56 +02:00			`words::{atom, mime_atom, is_vchar},`
wip refactor 2023-07-18 23:25:10 +02:00			`encoding::{self, encoded_word},`
			`ascii,`
			`};`

			`#[derive(Debug, PartialEq, Default)]`
working field parsing 2023-07-20 09:41:10 +02:00			`pub struct PhraseList<'a>(pub Vec<Phrase<'a>>);`
			`pub fn phrase_list(input: &[u8]) -> IResult<&[u8], PhraseList> {`
			`map(separated_list1(tag(","), phrase), PhraseList)(input)`
wip, still broken 2023-07-19 22:27:59 +02:00			`}`
wip refactor 2023-07-18 23:25:10 +02:00
wip type 2023-07-21 18:31:56 +02:00			`#[derive(Debug, PartialEq)]`
			`pub enum MIMEWord<'a> {`
			`Quoted(QuotedString<'a>),`
			`Atom(&'a [u8]),`
			`}`
			`impl<'a> MIMEWord<'a> {`
			`pub fn to_string(&self) -> String {`
			`match self {`
			`Quoted(v) => v.to_string(),`
			`Atom(v) => encoding_rs::UTF_8.decode_without_bom_handling(v).1.to_string(),`
			`}`
			`}`
			`}`
			`pub fn mime_word(input: &[u8]) -> IResult<&[u8], MIMEWord> {`
			`alt((`
			`map(quoted_string, MIMEWord::Quoted),`
			`map(mime_atom, MIMEWord::Atom),`
			`))(input)`
			`}`

compile rfc5322/mailbox 2023-07-19 12:09:23 +02:00			`#[derive(Debug, PartialEq)]`
wip refactor 2023-07-18 23:25:10 +02:00			`pub enum Word<'a> {`
fix mailbox tests 2023-07-19 15:28:17 +02:00			`Quoted(QuotedString<'a>),`
wip refactor 2023-07-18 23:25:10 +02:00			`Encoded(encoding::EncodedWord<'a>),`
			`Atom(&'a [u8]),`
			`}`
compile rfc5322/mailbox 2023-07-19 12:09:23 +02:00
wip refactor 2023-07-18 23:25:10 +02:00			`impl<'a> Word<'a> {`
			`pub fn to_string(&self) -> String {`
			`match self {`
			`Word::Quoted(v) => v.to_string(),`
			`Word::Encoded(v) => v.to_string(),`
compile subset 2023-07-19 10:41:51 +02:00			`Word::Atom(v) => encoding_rs::UTF_8.decode_without_bom_handling(v).0.to_string(),`
wip refactor 2023-07-18 23:25:10 +02:00			`}`
			`}`
			`}`

			`/// Word`
			`///`
			/// ```abnf
			`/// word = atom / quoted-string`
			/// ```
			`pub fn word(input: &[u8]) -> IResult<&[u8], Word> {`
			`alt((`
			`map(quoted_string, \|v\| Word::Quoted(v)),`
			`map(encoded_word, \|v\| Word::Encoded(v)),`
			`map(atom, \|v\| Word::Atom(v))`
			`))(input)`
			`}`

compile rfc5322/mailbox 2023-07-19 12:09:23 +02:00			`#[derive(Debug, PartialEq)]`
wip refactor 2023-07-18 23:25:10 +02:00			`pub struct Phrase<'a>(pub Vec<Word<'a>>);`
compile rfc5322/mailbox 2023-07-19 12:09:23 +02:00
wip refactor 2023-07-18 23:25:10 +02:00			`impl<'a> Phrase<'a> {`
			`pub fn to_string(&self) -> String {`
compile subset 2023-07-19 10:41:51 +02:00			`self.0.iter().map(\|v\| v.to_string()).collect::<Vec<String>>().join(" ")`
wip refactor 2023-07-18 23:25:10 +02:00			`}`
			`}`

			`/// Phrase`
			`///`
			/// ```abnf
			`/// phrase = 1*word / obs-phrase`
			/// ```
			`pub fn phrase(input: &[u8]) -> IResult<&[u8], Phrase> {`
			`let (input, phrase) = map(many1(word), \|v\| Phrase(v))(input)?;`
			`Ok((input, phrase))`
			`}`

			`/// Compatible unstructured input`
			`///`
			/// ```abnf
			`/// obs-utext = %d0 / obs-NO-WS-CTL / VCHAR`
			/// ```
			`fn is_unstructured(c: u8) -> bool {`
			`is_vchar(c) \|\| is_obs_no_ws_ctl(c) \|\| c == ascii::NULL`
			`}`

compile rfc5322/mailbox 2023-07-19 12:09:23 +02:00			`#[derive(Debug, PartialEq)]`
compile subset 2023-07-19 10:41:51 +02:00			`pub enum UnstrToken<'a> {`
wip refactor 2023-07-18 23:25:10 +02:00			`Init,`
			`Encoded(encoding::EncodedWord<'a>),`
			`Plain(&'a [u8]),`
			`}`
compile rfc5322/mailbox 2023-07-19 12:09:23 +02:00
wip refactor 2023-07-18 23:25:10 +02:00			`impl<'a> UnstrToken<'a> {`
			`pub fn to_string(&self) -> String {`
			`match self {`
			`UnstrToken::Init => "".into(),`
			`UnstrToken::Encoded(e) => e.to_string(),`
compile subset 2023-07-19 10:41:51 +02:00			`UnstrToken::Plain(e) => encoding_rs::UTF_8.decode_without_bom_handling(e).0.into_owned(),`
wip refactor 2023-07-18 23:25:10 +02:00			`}`
			`}`
			`}`

compile rfc5322/mailbox 2023-07-19 12:09:23 +02:00			`#[derive(Debug, PartialEq)]`
wip refactor 2023-07-18 23:25:10 +02:00			`pub struct Unstructured<'a>(pub Vec<UnstrToken<'a>>);`
compile rfc5322/mailbox 2023-07-19 12:09:23 +02:00
wip refactor 2023-07-18 23:25:10 +02:00			`impl<'a> Unstructured<'a> {`
			`pub fn to_string(&self) -> String {`
			`self.0.iter().fold(`
			`(&UnstrToken::Init, String::new()),`
compile subset 2023-07-19 10:41:51 +02:00			`\|(prev_token, mut result), current_token\| {`
wip refactor 2023-07-18 23:25:10 +02:00			`match (prev_token, current_token) {`
			`(UnstrToken::Init, v) => result.push_str(v.to_string().as_ref()),`
compile subset 2023-07-19 10:41:51 +02:00			`(UnstrToken::Encoded(_), UnstrToken::Encoded(v)) => result.push_str(v.to_string().as_ref()),`
wip refactor 2023-07-18 23:25:10 +02:00			`(_, v) => {`
			`result.push(' ');`
			`result.push_str(v.to_string().as_ref())`
			`},`
			`};`

compile subset 2023-07-19 10:41:51 +02:00			`(current_token, result)`
wip refactor 2023-07-18 23:25:10 +02:00			`}`
compile subset 2023-07-19 10:41:51 +02:00			`).1`
wip refactor 2023-07-18 23:25:10 +02:00			`}`
			`}`

			`/// Unstructured header field body`
			`///`
			/// ```abnf
			`/// unstructured = (([FWS] VCHAR_SEQ) WSP) / obs-unstruct`
			/// ```
			`pub fn unstructured(input: &[u8]) -> IResult<&[u8], Unstructured> {`
			`let (input, r) = many0(preceded(opt(fws), alt((`
			`map(encoded_word, \|v\| UnstrToken::Encoded(v)),`
			`map(take_while1(is_unstructured), \|v\| UnstrToken::Plain(v)),`
			`))))(input)?;`

			`let (input, _) = space0(input)?;`
			`Ok((input, Unstructured(r)))`
			`}`


			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`
			`#[test]`
			`fn test_phrase() {`
			`assert_eq!(`
fixed tests 2023-07-19 11:03:40 +02:00			`phrase(b"hello world").unwrap().1.to_string(),`
			`"hello world".to_string(),`
wip refactor 2023-07-18 23:25:10 +02:00			`);`
			`assert_eq!(`
fixed tests 2023-07-19 11:03:40 +02:00			`phrase(b"salut \"le\" monde").unwrap().1.to_string(),`
			`"salut le monde".to_string(),`
wip refactor 2023-07-18 23:25:10 +02:00			`);`
fixed tests 2023-07-19 11:03:40 +02:00
			`let (rest, parsed) = phrase(b"fin\r\n du\r\nmonde").unwrap();`
			`assert_eq!(rest, &b"\r\nmonde"[..]);`
			`assert_eq!(parsed.to_string(), "fin du".to_string());`
wip refactor 2023-07-18 23:25:10 +02:00			`}`
			`}`