add a charset detector

This commit is contained in:
Quentin 2023-06-18 22:05:11 +02:00
parent 8f2c944ab8
commit 950947ee3e
Signed by: quentin
GPG key ID: E9602264D639FF68
4 changed files with 55 additions and 10 deletions

22
Cargo.lock generated
View file

@ -41,6 +41,17 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chardetng"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
dependencies = [
"cfg-if",
"encoding_rs",
"memchr",
]
[[package]] [[package]]
name = "chrono" name = "chrono"
version = "0.4.26" version = "0.4.26"
@ -62,6 +73,15 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
[[package]]
name = "encoding_rs"
version = "0.8.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
dependencies = [
"cfg-if",
]
[[package]] [[package]]
name = "iana-time-zone" name = "iana-time-zone"
version = "0.1.57" version = "0.1.57"
@ -89,7 +109,9 @@ dependencies = [
name = "imf-codec" name = "imf-codec"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"chardetng",
"chrono", "chrono",
"encoding_rs",
"nom", "nom",
"walkdir", "walkdir",
] ]

View file

@ -18,6 +18,8 @@ path = "src/parse.rs"
[dependencies] [dependencies]
nom = "7" nom = "7"
chrono = "0.4" chrono = "0.4"
chardetng = "0.1"
encoding_rs = "0.8"
[dev-dependencies] [dev-dependencies]
walkdir = "2" walkdir = "2"

View file

@ -111,16 +111,13 @@ fn strict_local_part(input: &str) -> IResult<&str, String> {
/// obs_local_part. /// obs_local_part.
/// ///
/// ```abnf /// ```abnf
/// obs-local-part = word *(1*"." word) /// obs-local-part = *(*"." word)
/// ``` /// ```
fn obs_local_part(input: &str) -> IResult<&str, String> { fn obs_local_part(input: &str) -> IResult<&str, String> {
map(pair( fold_many0(
word, pair(opt(is_a(".")), word),
fold_many0( String::new,
pair(is_a("."), word), |acc, (dots, txt)| acc + dots.unwrap_or("") + &txt)(input)
String::new,
|acc, (dots, txt)| acc + dots + &txt),
), |(head, rest)| head.into_owned() + &rest)(input)
} }
/// Domain /// Domain
@ -295,4 +292,15 @@ mod tests {
})) }))
); );
} }
// Checks that `addr_spec` accepts an address whose local part starts with a
// dot, and that the leading dot is kept in the parsed `local_part`.
#[test]
fn test_enron2() {
    // Expected parse result: the input is fully consumed (empty remainder)
    // and split at the `@` into local part and domain.
    let expected = AddrSpec {
        local_part: ".nelson".into(),
        domain: "enron.com".into(),
    };

    assert_eq!(addr_spec(".nelson@enron.com"), Ok(("", expected)));
}
} }

View file

@ -2,9 +2,22 @@ use imf_codec::header;
use std::io; use std::io;
use std::io::Read; use std::io::Read;
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
fn main() { fn main() {
let mut email = String::new(); // Read full mail in memory
io::stdin().lock().read_to_string(&mut email).unwrap(); let mut rawmail = Vec::new();
io::stdin().lock().read_to_end(&mut rawmail).unwrap();
// Create detector
let mut detector = EncodingDetector::new();
detector.feed(&rawmail, true);
// Get encoding
let enc: &Encoding = detector.guess(None, true);
let (email, encoding, malformed) = enc.decode(&rawmail);
println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
let (_, hdrs) = header::section(&email).unwrap(); let (_, hdrs) = header::section(&email).unwrap();
assert!(hdrs.date.is_some()); assert!(hdrs.date.is_some());