reintroduce charset
This commit is contained in:
parent
ec937bf11d
commit
b444281729
6 changed files with 160 additions and 181 deletions
|
@ -2,4 +2,4 @@ pub mod error;
|
||||||
pub mod text;
|
pub mod text;
|
||||||
pub mod header;
|
pub mod header;
|
||||||
pub mod rfc5322;
|
pub mod rfc5322;
|
||||||
//pub mod mime;
|
pub mod mime;
|
||||||
|
|
144
src/mime/charset.rs
Normal file
144
src/mime/charset.rs
Normal file
|
@ -0,0 +1,144 @@
|
||||||
|
use encoding_rs::Encoding;
|
||||||
|
|
||||||
|
/// Charset of a MIME entity, as named in the IANA character-set registry.
///
/// imf_codec keeps its own charset list so that it can follow IANA's one.
/// encoding_rs implements a different standard that does not know US_ASCII,
/// so using the encoding_rs data structures directly would lead to a loss of
/// information.
///
/// Labels that are not part of the list are kept verbatim, as raw bytes, in
/// [`EmailCharset::Other`].
///
/// <https://www.iana.org/assignments/character-sets/character-sets.xhtml>
// Variant names deliberately mirror the IANA spelling (e.g. `ISO_8859_1`,
// `Shift_JIS`), hence the lint exemption.
#[allow(non_camel_case_types)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmailCharset<'a> {
    US_ASCII,
    ISO_8859_1,
    ISO_8859_2,
    ISO_8859_3,
    ISO_8859_4,
    ISO_8859_5,
    ISO_8859_6,
    ISO_8859_7,
    ISO_8859_8,
    ISO_8859_9,
    ISO_8859_10,
    Shift_JIS,
    EUC_JP,
    ISO_2022_KR,
    EUC_KR,
    ISO_2022_JP,
    ISO_2022_JP_2,
    ISO_8859_6_E,
    ISO_8859_6_I,
    ISO_8859_8_E,
    ISO_8859_8_I,
    GB2312,
    Big5,
    KOI8_R,
    UTF_8,
    /// A label that is not in the list above, stored exactly as received.
    Other(&'a [u8]),
}
|
||||||
|
|
||||||
|
impl<'a> From<&'a [u8]> for EmailCharset<'a> {
|
||||||
|
fn from(s: &'a [u8]) -> Self {
|
||||||
|
match s.to_ascii_lowercase().as_slice() {
|
||||||
|
b"us-ascii" | b"ascii" => EmailCharset::US_ASCII,
|
||||||
|
b"iso-8859-1" => EmailCharset::ISO_8859_1,
|
||||||
|
b"iso-8859-2" => EmailCharset::ISO_8859_2,
|
||||||
|
b"iso-8859-3" => EmailCharset::ISO_8859_3,
|
||||||
|
b"iso-8859-4" => EmailCharset::ISO_8859_4,
|
||||||
|
b"iso-8859-5" => EmailCharset::ISO_8859_5,
|
||||||
|
b"iso-8859-6" => EmailCharset::ISO_8859_6,
|
||||||
|
b"iso-8859-7" => EmailCharset::ISO_8859_7,
|
||||||
|
b"iso-8859-8" => EmailCharset::ISO_8859_8,
|
||||||
|
b"iso-8859-9" => EmailCharset::ISO_8859_9,
|
||||||
|
b"iso-8859-10" => EmailCharset::ISO_8859_10,
|
||||||
|
b"shift_jis" => EmailCharset::Shift_JIS,
|
||||||
|
b"euc-jp" => EmailCharset::EUC_JP,
|
||||||
|
b"iso-2022-kr" => EmailCharset::ISO_2022_KR,
|
||||||
|
b"euc-kr" => EmailCharset::EUC_KR,
|
||||||
|
b"iso-2022-jp" => EmailCharset::ISO_2022_JP,
|
||||||
|
b"iso-2022-jp-2" => EmailCharset::ISO_2022_JP_2,
|
||||||
|
b"iso-8859-6-e" => EmailCharset::ISO_8859_6_E,
|
||||||
|
b"iso-8859-6-i" => EmailCharset::ISO_8859_6_I,
|
||||||
|
b"iso-8859-8-e" => EmailCharset::ISO_8859_8_E,
|
||||||
|
b"iso-8859-8-i" => EmailCharset::ISO_8859_8_I,
|
||||||
|
b"gb2312" => EmailCharset::GB2312,
|
||||||
|
b"big5" => EmailCharset::Big5,
|
||||||
|
b"koi8-r" => EmailCharset::KOI8_R,
|
||||||
|
b"utf-8" | b"utf8" => EmailCharset::UTF_8,
|
||||||
|
_ => EmailCharset::Other(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> EmailCharset<'a> {
|
||||||
|
pub fn as_str(&self) -> &'static str {
|
||||||
|
use EmailCharset::*;
|
||||||
|
match self {
|
||||||
|
US_ASCII => "US-ASCII",
|
||||||
|
ISO_8859_1 => "ISO-8859-1",
|
||||||
|
ISO_8859_2 => "ISO-8859-2",
|
||||||
|
ISO_8859_3 => "ISO-8859-3",
|
||||||
|
ISO_8859_4 => "ISO-8859-4",
|
||||||
|
ISO_8859_5 => "ISO-8859-5",
|
||||||
|
ISO_8859_6 => "ISO-8859-6",
|
||||||
|
ISO_8859_7 => "ISO-8859-7",
|
||||||
|
ISO_8859_8 => "ISO-8859-8",
|
||||||
|
ISO_8859_9 => "ISO-8859-9",
|
||||||
|
ISO_8859_10 => "ISO-8859-10",
|
||||||
|
Shift_JIS => "Shift_JIS",
|
||||||
|
EUC_JP => "EUC-JP",
|
||||||
|
ISO_2022_KR => "ISO-2022-KR",
|
||||||
|
EUC_KR => "EUC-KR",
|
||||||
|
ISO_2022_JP => "ISO-2022-JP",
|
||||||
|
ISO_2022_JP_2 => "ISO-2022-JP-2",
|
||||||
|
ISO_8859_6_E => "ISO-8859-6-E",
|
||||||
|
ISO_8859_6_I => "ISO-8859-6-I",
|
||||||
|
ISO_8859_8_E => "ISO-8859-8-E",
|
||||||
|
ISO_8859_8_I => "ISO-8859-8-I",
|
||||||
|
GB2312 => "GB2312",
|
||||||
|
Big5 => "Big5",
|
||||||
|
KOI8_R => "KOI8-R",
|
||||||
|
UTF_8 => "UTF-8",
|
||||||
|
Other(_) => "UTF-8", //@FIXME bad idea...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_encoding(&self) -> &'static Encoding {
|
||||||
|
Encoding::for_label(self.as_str().as_bytes())
|
||||||
|
.unwrap_or(encoding_rs::WINDOWS_1252)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks label parsing (case-insensitive) and the mapping of parsed
    /// charsets to `encoding_rs` encodings.
    #[test]
    fn test_charset() {
        // Mixed-case input resolves to the canonical name.
        let ascii = EmailCharset::from(&b"Us-Ascii"[..]);
        assert_eq!(ascii.as_str(), "US-ASCII");

        // (input label, encoding expected from `as_encoding`)
        let expectations: [(&[u8], &'static encoding_rs::Encoding); 4] = [
            (&b"Us-Ascii"[..], encoding_rs::WINDOWS_1252),
            (&b"ISO-8859-1"[..], encoding_rs::WINDOWS_1252),
            (&b"utf-8"[..], encoding_rs::UTF_8),
            (&b"utf8"[..], encoding_rs::UTF_8),
        ];

        for (label, expected) in expectations.iter() {
            assert_eq!(EmailCharset::from(*label).as_encoding(), *expected);
        }
    }
}
|
|
@ -84,43 +84,6 @@ pub enum Parameter<'a> {
|
||||||
Other(&'a str, String),
|
Other(&'a str, String),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Specific implementation of charset
|
|
||||||
///
|
|
||||||
/// imf_codec has its own charset list to follow IANA's one.
|
|
||||||
/// encoding_rs implements a different standard that does not know US_ASCII.
|
|
||||||
/// using encoding_rs datastructures directly would lead to a loss of information.
|
|
||||||
/// https://www.iana.org/assignments/character-sets/character-sets.xhtml
|
|
||||||
#[allow(non_camel_case_types)]
|
|
||||||
#[derive(Debug, PartialEq)]
|
|
||||||
pub enum EmailCharset<'a> {
|
|
||||||
US_ASCII,
|
|
||||||
ISO_8859_1,
|
|
||||||
ISO_8859_2,
|
|
||||||
ISO_8859_3,
|
|
||||||
ISO_8859_4,
|
|
||||||
ISO_8859_5,
|
|
||||||
ISO_8859_6,
|
|
||||||
ISO_8859_7,
|
|
||||||
ISO_8859_8,
|
|
||||||
ISO_8859_9,
|
|
||||||
ISO_8859_10,
|
|
||||||
Shift_JIS,
|
|
||||||
EUC_JP,
|
|
||||||
ISO_2022_KR,
|
|
||||||
EUC_KR,
|
|
||||||
ISO_2022_JP,
|
|
||||||
ISO_2022_JP_2,
|
|
||||||
ISO_8859_6_E,
|
|
||||||
ISO_8859_6_I,
|
|
||||||
ISO_8859_8_E,
|
|
||||||
ISO_8859_8_I,
|
|
||||||
GB2312,
|
|
||||||
Big5,
|
|
||||||
KOI8_R,
|
|
||||||
UTF_8,
|
|
||||||
Other(Cow<'a, str>),
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Mechanism<'a> {
|
pub enum Mechanism<'a> {
|
||||||
_7Bit,
|
_7Bit,
|
||||||
|
@ -130,114 +93,12 @@ pub enum Mechanism<'a> {
|
||||||
Base64,
|
Base64,
|
||||||
Other(&'a str),
|
Other(&'a str),
|
||||||
}
|
}
|
||||||
impl<'a> From<&'a str> for EmailCharset<'a> {
|
|
||||||
fn from(s: &'a str) -> Self {
|
|
||||||
EmailCharset::from(Cow::Borrowed(s))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> From<Cow<'a, str>> for EmailCharset<'a> {
|
|
||||||
fn from(s: Cow<'a, str>) -> Self {
|
|
||||||
match s.to_lowercase().as_ref() {
|
|
||||||
"us-ascii" | "ascii" => EmailCharset::US_ASCII,
|
|
||||||
"iso-8859-1" => EmailCharset::ISO_8859_1,
|
|
||||||
"iso-8859-2" => EmailCharset::ISO_8859_2,
|
|
||||||
"iso-8859-3" => EmailCharset::ISO_8859_3,
|
|
||||||
"iso-8859-4" => EmailCharset::ISO_8859_4,
|
|
||||||
"iso-8859-5" => EmailCharset::ISO_8859_5,
|
|
||||||
"iso-8859-6" => EmailCharset::ISO_8859_6,
|
|
||||||
"iso-8859-7" => EmailCharset::ISO_8859_7,
|
|
||||||
"iso-8859-8" => EmailCharset::ISO_8859_8,
|
|
||||||
"iso-8859-9" => EmailCharset::ISO_8859_9,
|
|
||||||
"iso-8859-10" => EmailCharset::ISO_8859_10,
|
|
||||||
"shift_jis" => EmailCharset::Shift_JIS,
|
|
||||||
"euc-jp" => EmailCharset::EUC_JP,
|
|
||||||
"iso-2022-kr" => EmailCharset::ISO_2022_KR,
|
|
||||||
"euc-kr" => EmailCharset::EUC_KR,
|
|
||||||
"iso-2022-jp" => EmailCharset::ISO_2022_JP,
|
|
||||||
"iso-2022-jp-2" => EmailCharset::ISO_2022_JP_2,
|
|
||||||
"iso-8859-6-e" => EmailCharset::ISO_8859_6_E,
|
|
||||||
"iso-8859-6-i" => EmailCharset::ISO_8859_6_I,
|
|
||||||
"iso-8859-8-e" => EmailCharset::ISO_8859_8_E,
|
|
||||||
"iso-8859-8-i" => EmailCharset::ISO_8859_8_I,
|
|
||||||
"gb2312" => EmailCharset::GB2312,
|
|
||||||
"big5" => EmailCharset::Big5,
|
|
||||||
"koi8-r" => EmailCharset::KOI8_R,
|
|
||||||
"utf-8" | "utf8" => EmailCharset::UTF_8,
|
|
||||||
_ => EmailCharset::Other(s)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> EmailCharset<'a> {
|
|
||||||
pub fn as_str(&'a self) -> &'a str {
|
|
||||||
use EmailCharset::*;
|
|
||||||
match self {
|
|
||||||
US_ASCII => "US-ASCII",
|
|
||||||
ISO_8859_1 => "ISO-8859-1",
|
|
||||||
ISO_8859_2 => "ISO-8859-2",
|
|
||||||
ISO_8859_3 => "ISO-8859-3",
|
|
||||||
ISO_8859_4 => "ISO-8859-4",
|
|
||||||
ISO_8859_5 => "ISO-8859-5",
|
|
||||||
ISO_8859_6 => "ISO-8859-6",
|
|
||||||
ISO_8859_7 => "ISO-8859-7",
|
|
||||||
ISO_8859_8 => "ISO-8859-8",
|
|
||||||
ISO_8859_9 => "ISO-8859-9",
|
|
||||||
ISO_8859_10 => "ISO-8859-10",
|
|
||||||
Shift_JIS => "Shift_JIS",
|
|
||||||
EUC_JP => "EUC-JP",
|
|
||||||
ISO_2022_KR => "ISO-2022-KR",
|
|
||||||
EUC_KR => "EUC-KR",
|
|
||||||
ISO_2022_JP => "ISO-2022-JP",
|
|
||||||
ISO_2022_JP_2 => "ISO-2022-JP-2",
|
|
||||||
ISO_8859_6_E => "ISO-8859-6-E",
|
|
||||||
ISO_8859_6_I => "ISO-8859-6-I",
|
|
||||||
ISO_8859_8_E => "ISO-8859-8-E",
|
|
||||||
ISO_8859_8_I => "ISO-8859-8-I",
|
|
||||||
GB2312 => "GB2312",
|
|
||||||
Big5 => "Big5",
|
|
||||||
KOI8_R => "KOI8-R",
|
|
||||||
UTF_8 => "UTF-8",
|
|
||||||
Other(raw) => raw.as_ref(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn as_encoding(&self) -> &'static Encoding {
|
|
||||||
Encoding::for_label(self.as_str().as_bytes())
|
|
||||||
.unwrap_or(encoding_rs::WINDOWS_1252)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> TryFrom<&'a lazy::Version<'a>> for Version {
|
|
||||||
type Error = IMFError<'a>;
|
|
||||||
|
|
||||||
fn try_from(vs: &'a lazy::Version<'a>) -> Result<Self, Self::Error> {
|
|
||||||
version(vs.0)
|
|
||||||
.map(|(_, v)| v)
|
|
||||||
.map_err(|e| IMFError::Version(e))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> TryFrom<&'a lazy::Type<'a>> for Type<'a> {
|
|
||||||
type Error = IMFError<'a>;
|
|
||||||
|
|
||||||
fn try_from(tp: &'a lazy::Type<'a>) -> Result<Self, Self::Error> {
|
|
||||||
content_type(tp.0)
|
|
||||||
.map(|(_, v)| v)
|
|
||||||
.map_err(|e| IMFError::ContentType(e))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> TryFrom<&'a lazy::Mechanism<'a>> for Mechanism<'a> {
|
|
||||||
type Error = IMFError<'a>;
|
|
||||||
|
|
||||||
fn try_from(mc: &'a lazy::Mechanism<'a>) -> Result<Self, Self::Error> {
|
|
||||||
mechanism(mc.0)
|
|
||||||
.map(|(_, v)| v)
|
|
||||||
.map_err(|e| IMFError::Mechanism(e))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
/*
|
||||||
impl<'a> From<&'a str> for MultipartSubtype<'a> {
|
impl<'a> From<&'a str> for MultipartSubtype<'a> {
|
||||||
fn from(csub: &'a str) -> Self {
|
fn from(csub: &'a str) -> Self {
|
||||||
match csub.to_lowercase().as_ref() {
|
match csub.to_lowercase().as_ref() {
|
||||||
|
@ -271,6 +132,7 @@ impl<'a> From<&'a str> for TextSubtype<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
pub fn parameter(input: &str) -> IResult<&str, Parameter> {
|
pub fn parameter(input: &str) -> IResult<&str, Parameter> {
|
||||||
let (rest, (pname, _, pvalue)) = tuple((
|
let (rest, (pname, _, pvalue)) = tuple((
|
||||||
|
@ -396,34 +258,6 @@ mod tests {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_charset() {
|
|
||||||
assert_eq!(
|
|
||||||
EmailCharset::from("Us-Ascii").as_str(),
|
|
||||||
"US-ASCII",
|
|
||||||
);
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
EmailCharset::from("Us-Ascii").as_encoding(),
|
|
||||||
encoding_rs::WINDOWS_1252,
|
|
||||||
);
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
EmailCharset::from("ISO-8859-1").as_encoding(),
|
|
||||||
encoding_rs::WINDOWS_1252,
|
|
||||||
);
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
EmailCharset::from("utf-8").as_encoding(),
|
|
||||||
encoding_rs::UTF_8,
|
|
||||||
);
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
EmailCharset::from("utf8").as_encoding(),
|
|
||||||
encoding_rs::UTF_8,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_parameter() {
|
fn test_parameter() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|
|
@ -1,18 +1,20 @@
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Field<'a> {
|
pub enum Content<'a> {
|
||||||
ContentType(Type<'a>),
|
Type(Type<'a>),
|
||||||
ContentTransferEncoding(Mechanism<'a>),
|
TransferEncoding(Mechanism<'a>),
|
||||||
ContentID(MessageId<'a>),
|
ID(MessageId<'a>),
|
||||||
ContentDescription(Unstructured),
|
Description(Unstructured),
|
||||||
}
|
}
|
||||||
|
|
||||||
fn correct_mime_field(input: &str) -> IResult<&str, MIMEField> {
|
fn field(input: &str) -> IResult<&str, Content> {
|
||||||
use MIMEField::*;
|
terminated(alt((
|
||||||
|
preceded(field_name(b"content-type"), map(date, Field::Date)),
|
||||||
|
|
||||||
field_name(input).map(|(rest, name)| {
|
field_name(input).map(|(rest, name)| {
|
||||||
(
|
(
|
||||||
"",
|
"",
|
||||||
match name.to_lowercase().as_ref() {
|
match name.to_lowercase().as_ref() {
|
||||||
"content-type" => ContentType(Type(rest)),
|
"" => ContentType(Type(rest)),
|
||||||
"content-transfer-encoding" => ContentTransferEncoding(Mechanism(rest)),
|
"content-transfer-encoding" => ContentTransferEncoding(Mechanism(rest)),
|
||||||
"content-id" => ContentID(Identifier(rest)),
|
"content-id" => ContentID(Identifier(rest)),
|
||||||
"content-description" => ContentDescription(Unstructured(rest)),
|
"content-description" => ContentDescription(Unstructured(rest)),
|
||||||
|
|
|
@ -1 +1,2 @@
|
||||||
pub mod field;
|
pub mod charset;
|
||||||
|
//pub mod field;
|
||||||
|
|
|
@ -2,11 +2,9 @@ use chrono::{DateTime, FixedOffset};
|
||||||
use nom::{
|
use nom::{
|
||||||
IResult,
|
IResult,
|
||||||
branch::alt,
|
branch::alt,
|
||||||
bytes::complete::{tag, tag_no_case, take_while1},
|
|
||||||
character::complete::space0,
|
|
||||||
combinator::map,
|
combinator::map,
|
||||||
multi::many0,
|
multi::many0,
|
||||||
sequence::{pair, preceded, terminated, tuple},
|
sequence::{preceded, terminated},
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::text::whitespace::{obs_crlf, foldable_line};
|
use crate::text::whitespace::{obs_crlf, foldable_line};
|
||||||
|
|
Loading…
Reference in a new issue