implement the guess charset parser
This commit is contained in:
parent
ddf6311925
commit
2bc62edba8
2 changed files with 52 additions and 0 deletions
|
@ -0,0 +1,51 @@
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use chardetng::EncodingDetector;
|
||||||
|
use encoding_rs::Encoding;
|
||||||
|
|
||||||
|
use crate::multipass::segment::Segment;
|
||||||
|
|
||||||
|
#[derive(Debug, PartialEq)]
|
||||||
|
pub struct GuessCharset<'a> {
|
||||||
|
pub header: Cow<'a, str>,
|
||||||
|
pub encoding: &'static Encoding,
|
||||||
|
pub malformed: bool,
|
||||||
|
pub body: &'a [u8],
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Passed as the `last` argument of `EncodingDetector::feed`: the header is
/// handed to the detector in a single call, so that chunk is also the final one.
const IS_LAST_BUFFER: bool = true;

/// Passed as the `allow_utf8` argument of `EncodingDetector::guess`, letting
/// the detector answer UTF-8.
const ALLOW_UTF8: bool = true;

/// Passed as the `tld` argument of `EncodingDetector::guess`: no top-level
/// domain hint is supplied.
const NO_TLD: Option<&[u8]> = None;
|
||||||
|
|
||||||
|
impl<'a> From<Segment<'a>> for GuessCharset<'a> {
|
||||||
|
fn from(seg: Segment<'a>) -> Self {
|
||||||
|
// Create detector
|
||||||
|
let mut detector = EncodingDetector::new();
|
||||||
|
detector.feed(&seg.header, IS_LAST_BUFFER);
|
||||||
|
|
||||||
|
// Get encoding
|
||||||
|
let enc: &Encoding = detector.guess(NO_TLD, ALLOW_UTF8);
|
||||||
|
let (header, encoding, malformed) = enc.decode(&seg.header);
|
||||||
|
|
||||||
|
GuessCharset { header, encoding, malformed, body: seg.body }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain ASCII header is expected to be detected as UTF-8, decode to
    /// the same text without being flagged as malformed, and leave the body
    /// bytes untouched.
    #[test]
    fn test_charset() {
        let actual = GuessCharset::from(Segment {
            body: b"Hello world!",
            header: b"From: hello@world.com\r\nDate: 12 Mar 1997 07:33:25 Z\r\n",
        });

        let expected = GuessCharset {
            header: Cow::Borrowed("From: hello@world.com\r\nDate: 12 Mar 1997 07:33:25 Z\r\n"),
            encoding: encoding_rs::UTF_8,
            malformed: false,
            body: b"Hello world!",
        };

        assert_eq!(actual, expected);
    }
}
|
|
@ -1 +1,2 @@
|
||||||
pub mod segment;
|
pub mod segment;
|
||||||
|
pub mod guess_charset;
|
||||||
|
|
Loading…
Reference in a new issue