add a charset detector
This commit is contained in:
parent
8f2c944ab8
commit
950947ee3e
4 changed files with 55 additions and 10 deletions
22
Cargo.lock
generated
22
Cargo.lock
generated
|
@ -41,6 +41,17 @@ version = "1.0.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chardetng"
|
||||||
|
version = "0.1.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"encoding_rs",
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "chrono"
|
name = "chrono"
|
||||||
version = "0.4.26"
|
version = "0.4.26"
|
||||||
|
@ -62,6 +73,15 @@ version = "0.8.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
|
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "encoding_rs"
|
||||||
|
version = "0.8.32"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "iana-time-zone"
|
name = "iana-time-zone"
|
||||||
version = "0.1.57"
|
version = "0.1.57"
|
||||||
|
@ -89,7 +109,9 @@ dependencies = [
|
||||||
name = "imf-codec"
|
name = "imf-codec"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"chardetng",
|
||||||
"chrono",
|
"chrono",
|
||||||
|
"encoding_rs",
|
||||||
"nom",
|
"nom",
|
||||||
"walkdir",
|
"walkdir",
|
||||||
]
|
]
|
||||||
|
|
|
@ -18,6 +18,8 @@ path = "src/parse.rs"
|
||||||
[dependencies]
|
[dependencies]
|
||||||
nom = "7"
|
nom = "7"
|
||||||
chrono = "0.4"
|
chrono = "0.4"
|
||||||
|
chardetng = "0.1"
|
||||||
|
encoding_rs = "0.8"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
walkdir = "2"
|
walkdir = "2"
|
||||||
|
|
|
@ -111,16 +111,13 @@ fn strict_local_part(input: &str) -> IResult<&str, String> {
|
||||||
/// obs_local_part.
|
/// obs_local_part.
|
||||||
///
|
///
|
||||||
/// ```abnf
|
/// ```abnf
|
||||||
/// obs-local-part = word *(1*"." word)
|
/// obs-local-part = *(*"." word)
|
||||||
/// ```
|
/// ```
|
||||||
fn obs_local_part(input: &str) -> IResult<&str, String> {
|
fn obs_local_part(input: &str) -> IResult<&str, String> {
|
||||||
map(pair(
|
fold_many0(
|
||||||
word,
|
pair(opt(is_a(".")), word),
|
||||||
fold_many0(
|
String::new,
|
||||||
pair(is_a("."), word),
|
|acc, (dots, txt)| acc + dots.unwrap_or("") + &txt)(input)
|
||||||
String::new,
|
|
||||||
|acc, (dots, txt)| acc + dots + &txt),
|
|
||||||
), |(head, rest)| head.into_owned() + &rest)(input)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Domain
|
/// Domain
|
||||||
|
@ -295,4 +292,15 @@ mod tests {
|
||||||
}))
|
}))
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_enron2() {
|
||||||
|
assert_eq!(
|
||||||
|
addr_spec(".nelson@enron.com"),
|
||||||
|
Ok(("", AddrSpec {
|
||||||
|
local_part: ".nelson".into(),
|
||||||
|
domain: "enron.com".into(),
|
||||||
|
}))
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
17
src/parse.rs
17
src/parse.rs
|
@ -2,9 +2,22 @@ use imf_codec::header;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
|
|
||||||
|
use chardetng::EncodingDetector;
|
||||||
|
use encoding_rs::Encoding;
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let mut email = String::new();
|
// Read full mail in memory
|
||||||
io::stdin().lock().read_to_string(&mut email).unwrap();
|
let mut rawmail = Vec::new();
|
||||||
|
io::stdin().lock().read_to_end(&mut rawmail).unwrap();
|
||||||
|
|
||||||
|
// Create detector
|
||||||
|
let mut detector = EncodingDetector::new();
|
||||||
|
detector.feed(&rawmail, true);
|
||||||
|
|
||||||
|
// Get encoding
|
||||||
|
let enc: &Encoding = detector.guess(None, true);
|
||||||
|
let (email, encoding, malformed) = enc.decode(&rawmail);
|
||||||
|
println!("Encoding: {:?}, Malformed: {:?}", encoding, malformed);
|
||||||
|
|
||||||
let (_, hdrs) = header::section(&email).unwrap();
|
let (_, hdrs) = header::section(&email).unwrap();
|
||||||
assert!(hdrs.date.is_some());
|
assert!(hdrs.date.is_some());
|
||||||
|
|
Loading…
Reference in a new issue