add a charset detector

This commit is contained in:
Quentin 2023-06-18 22:05:11 +02:00
parent 8f2c944ab8
commit 950947ee3e
Signed by: quentin
GPG key ID: E9602264D639FF68
4 changed files with 55 additions and 10 deletions

22
Cargo.lock generated
View file

@ -41,6 +41,17 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chardetng"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
dependencies = [
"cfg-if",
"encoding_rs",
"memchr",
]
[[package]]
name = "chrono"
version = "0.4.26"
@ -62,6 +73,15 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
[[package]]
name = "encoding_rs"
version = "0.8.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
dependencies = [
"cfg-if",
]
[[package]]
name = "iana-time-zone"
version = "0.1.57"
@ -89,7 +109,9 @@ dependencies = [
name = "imf-codec"
version = "0.1.0"
dependencies = [
"chardetng",
"chrono",
"encoding_rs",
"nom",
"walkdir",
]

View file

@ -18,6 +18,8 @@ path = "src/parse.rs"
[dependencies]
nom = "7"
chrono = "0.4"
chardetng = "0.1"
encoding_rs = "0.8"
[dev-dependencies]
walkdir = "2"

View file

@ -111,16 +111,13 @@ fn strict_local_part(input: &str) -> IResult<&str, String> {
/// obs_local_part.
///
/// ```abnf
/// obs-local-part = word *(1*"." word)
/// obs-local-part = *(*"." word)
/// ```
fn obs_local_part(input: &str) -> IResult<&str, String> {
// NOTE(review): two complete parser expressions appear back to back in this
// body. This looks like a diff hunk whose removed and added lines were merged
// into one listing -- confirm against the repository before relying on it.
//
// First expression: requires a leading `word`, then folds zero or more
// (`is_a(".")`, `word`) pairs into a String that is appended to that head.
// Because the head `word` is mandatory, input starting with "." cannot match.
map(pair(
word,
fold_many0(
pair(is_a("."), word),
String::new,
|acc, (dots, txt)| acc + dots + &txt),
), |(head, rest)| head.into_owned() + &rest)(input)
// Second expression: folds zero or more (optional `is_a(".")`, `word`) pairs
// into a String, starting from an empty accumulator. The dot run is optional
// for every pair, including the first, and a missing run contributes "".
// Presumably this is what lets a local part such as ".nelson" parse -- verify
// against the `addr_spec` tests.
fold_many0(
pair(opt(is_a(".")), word),
String::new,
|acc, (dots, txt)| acc + dots.unwrap_or("") + &txt)(input)
}
/// Domain
@ -295,4 +292,15 @@ mod tests {
}))
);
}
// A local part that starts with a dot must be accepted by `addr_spec`:
// the dot is kept in `local_part` and no input is left unparsed.
#[test]
fn test_enron2() {
    let expected = AddrSpec {
        local_part: ".nelson".into(),
        domain: "enron.com".into(),
    };
    let parsed = addr_spec(".nelson@enron.com");
    assert_eq!(parsed, Ok(("", expected)));
}
}

View file

@ -2,9 +2,22 @@ use imf_codec::header;
use std::io;
use std::io::Read;
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
fn main() {
let mut email = String::new();
io::stdin().lock().read_to_string(&mut email).unwrap();
// Read full mail in memory
let mut rawmail = Vec::new();
io::stdin().lock().read_to_end(&mut rawmail).unwrap();
// Create detector
let mut detector = EncodingDetector::new();
detector.feed(&rawmail, true);
// Get encoding
let enc: &Encoding = detector.guess(None, true);
let (email, encoding, malformed) = enc.decode(&rawmail);
println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
let (_, hdrs) = header::section(&email).unwrap();
assert!(hdrs.date.is_some());