add a charset detector
This commit is contained in:
parent
8f2c944ab8
commit
950947ee3e
4 changed files with 55 additions and 10 deletions
22
Cargo.lock
generated
22
Cargo.lock
generated
|
@ -41,6 +41,17 @@ version = "1.0.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chardetng"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"encoding_rs",
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.26"
|
||||
|
@ -62,6 +73,15 @@ version = "0.8.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.57"
|
||||
|
@ -89,7 +109,9 @@ dependencies = [
|
|||
name = "imf-codec"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"chardetng",
|
||||
"chrono",
|
||||
"encoding_rs",
|
||||
"nom",
|
||||
"walkdir",
|
||||
]
|
||||
|
|
|
@ -18,6 +18,8 @@ path = "src/parse.rs"
|
|||
[dependencies]
|
||||
nom = "7"
|
||||
chrono = "0.4"
|
||||
chardetng = "0.1"
|
||||
encoding_rs = "0.8"
|
||||
|
||||
[dev-dependencies]
|
||||
walkdir = "2"
|
||||
|
|
|
@ -111,16 +111,13 @@ fn strict_local_part(input: &str) -> IResult<&str, String> {
|
|||
/// obs_local_part.
|
||||
///
|
||||
/// ```abnf
|
||||
/// obs-local-part = word *(1*"." word)
|
||||
/// obs-local-part = *(*"." word)
|
||||
/// ```
|
||||
fn obs_local_part(input: &str) -> IResult<&str, String> {
|
||||
map(pair(
|
||||
word,
|
||||
fold_many0(
|
||||
pair(is_a("."), word),
|
||||
String::new,
|
||||
|acc, (dots, txt)| acc + dots + &txt),
|
||||
), |(head, rest)| head.into_owned() + &rest)(input)
|
||||
fold_many0(
|
||||
pair(opt(is_a(".")), word),
|
||||
String::new,
|
||||
|acc, (dots, txt)| acc + dots.unwrap_or("") + &txt)(input)
|
||||
}
|
||||
|
||||
/// Domain
|
||||
|
@ -295,4 +292,15 @@ mod tests {
|
|||
}))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_enron2() {
|
||||
assert_eq!(
|
||||
addr_spec(".nelson@enron.com"),
|
||||
Ok(("", AddrSpec {
|
||||
local_part: ".nelson".into(),
|
||||
domain: "enron.com".into(),
|
||||
}))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
17
src/parse.rs
17
src/parse.rs
|
@ -2,9 +2,22 @@ use imf_codec::header;
|
|||
use std::io;
|
||||
use std::io::Read;
|
||||
|
||||
use chardetng::EncodingDetector;
|
||||
use encoding_rs::Encoding;
|
||||
|
||||
fn main() {
|
||||
let mut email = String::new();
|
||||
io::stdin().lock().read_to_string(&mut email).unwrap();
|
||||
// Read full mail in memory
|
||||
let mut rawmail = Vec::new();
|
||||
io::stdin().lock().read_to_end(&mut rawmail).unwrap();
|
||||
|
||||
// Create detector
|
||||
let mut detector = EncodingDetector::new();
|
||||
detector.feed(&rawmail, true);
|
||||
|
||||
// Get encoding
|
||||
let enc: &Encoding = detector.guess(None, true);
|
||||
let (email, encoding, malformed) = enc.decode(&rawmail);
|
||||
println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
|
||||
|
||||
let (_, hdrs) = header::section(&email).unwrap();
|
||||
assert!(hdrs.date.is_some());
|
||||
|
|
Loading…
Reference in a new issue