implement the guess charset parser

This commit is contained in:
Quentin 2023-06-19 18:31:24 +02:00
parent ddf6311925
commit 2bc62edba8
Signed by: quentin
GPG key ID: E9602264D639FF68
2 changed files with 52 additions and 0 deletions

View file

@ -0,0 +1,51 @@
use std::borrow::Cow;
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
use crate::multipass::segment::Segment;
/// A message segment whose header bytes have been decoded to text using a
/// guessed character encoding; the body is carried along undecoded.
///
/// Built through `From<Segment>`, which runs the charset detector over the
/// raw header bytes and decodes them with the guessed encoding.
#[derive(Debug, PartialEq)]
pub struct GuessCharset<'a> {
/// Header decoded to text, as returned by `Encoding::decode`.
pub header: Cow<'a, str>,
/// Encoding reported by `Encoding::decode` for the header.
pub encoding: &'static Encoding,
/// Error flag returned by `Encoding::decode` (per the encoding_rs docs:
/// `true` when malformed sequences were encountered while decoding).
pub malformed: bool,
/// Raw body bytes, copied over from the source segment without decoding.
pub body: &'a [u8],
}
/// Passed as the second argument of `EncodingDetector::feed`: the whole header
/// is handed to the detector in a single call, so that buffer is also the last.
const IS_LAST_BUFFER: bool = true;
/// Passed as the second argument of `EncodingDetector::guess`, so that UTF-8
/// is an acceptable guess.
const ALLOW_UTF8: bool = true;
/// Passed as the first argument of `EncodingDetector::guess`: no top-level
/// domain hint is supplied to the detector.
const NO_TLD: Option<&[u8]> = None;
impl<'a> From<Segment<'a>> for GuessCharset<'a> {
    /// Guesses the character encoding of the segment's header and decodes it.
    ///
    /// The raw header bytes are fed to a fresh `EncodingDetector` in one go,
    /// the detector's guess is then used to decode those same bytes, and the
    /// decoding outcome (text, encoding, error flag) is stored next to the
    /// untouched body.
    fn from(seg: Segment<'a>) -> Self {
        let raw_header = seg.header;

        // The detector is only needed long enough to produce a guess.
        let guessed: &Encoding = {
            let mut sniffer = EncodingDetector::new();
            sniffer.feed(&raw_header, IS_LAST_BUFFER);
            sniffer.guess(NO_TLD, ALLOW_UTF8)
        };

        // Decode the header with the guessed encoding.
        let (text, used, had_errors) = guessed.decode(&raw_header);

        Self {
            header: text,
            encoding: used,
            malformed: had_errors,
            body: seg.body,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain ASCII header is expected to be guessed as UTF-8 and decoded
    /// without errors, with the body passed through unchanged.
    #[test]
    fn test_charset() {
        let segment = Segment {
            body: b"Hello world!",
            header: b"From: hello@world.com\r\nDate: 12 Mar 1997 07:33:25 Z\r\n",
        };

        let expected = GuessCharset {
            header: Cow::Borrowed("From: hello@world.com\r\nDate: 12 Mar 1997 07:33:25 Z\r\n"),
            encoding: encoding_rs::UTF_8,
            malformed: false,
            body: b"Hello world!",
        };

        assert_eq!(GuessCharset::from(segment), expected);
    }
}

View file

@ -1 +1,2 @@
pub mod segment;
pub mod guess_charset;