From 2bc62edba85f6973a28ea2c39596c98a6514a6ae Mon Sep 17 00:00:00 2001
From: Quentin Dufour
Date: Mon, 19 Jun 2023 18:31:24 +0200
Subject: [PATCH] implement the guess charset parser

---
Notes (not part of the commit message):

  * src/multipass/guess_charset.rs adds `GuessCharset<'a>` and a
    `From<Segment<'a>>` conversion for it. The conversion feeds `seg.header`
    to a chardetng `EncodingDetector` as one final buffer
    (IS_LAST_BUFFER = true), asks for a guess with no TLD hint (NO_TLD = None)
    and UTF-8 allowed (ALLOW_UTF8 = true), then decodes the same header bytes
    with the guessed encoding.
  * The three values returned by `decode` become the `header`, `encoding` and
    `malformed` fields; `seg.body` is stored as-is, without decoding.
  * src/multipass/mod.rs registers the new `guess_charset` module.
  * NOTE(review): the From: header above carries no e-mail address. `git apply`
    does not care, but `git am` expects one - confirm the author address
    before using `git am`.

 src/multipass/guess_charset.rs | 51 ++++++++++++++++++++++++++++++++++
 src/multipass/mod.rs           |  1 +
 2 files changed, 52 insertions(+)

diff --git a/src/multipass/guess_charset.rs b/src/multipass/guess_charset.rs
index e69de29..bcf223f 100644
--- a/src/multipass/guess_charset.rs
+++ b/src/multipass/guess_charset.rs
@@ -0,0 +1,51 @@
+use std::borrow::Cow;
+use chardetng::EncodingDetector;
+use encoding_rs::Encoding;
+
+use crate::multipass::segment::Segment;
+
+#[derive(Debug, PartialEq)]
+pub struct GuessCharset<'a> {
+    pub header: Cow<'a, str>,
+    pub encoding: &'static Encoding,
+    pub malformed: bool,
+    pub body: &'a [u8],
+}
+
+const IS_LAST_BUFFER: bool = true;
+const ALLOW_UTF8: bool = true;
+const NO_TLD: Option<&[u8]> = None;
+
+impl<'a> From<Segment<'a>> for GuessCharset<'a> {
+    fn from(seg: Segment<'a>) -> Self {
+        // Create detector
+        let mut detector = EncodingDetector::new();
+        detector.feed(&seg.header, IS_LAST_BUFFER);
+
+        // Get encoding
+        let enc: &Encoding = detector.guess(NO_TLD, ALLOW_UTF8);
+        let (header, encoding, malformed) = enc.decode(&seg.header);
+
+        GuessCharset { header, encoding, malformed, body: seg.body }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_charset() {
+        assert_eq!(
+            GuessCharset::from(Segment {
+                body: b"Hello world!",
+                header: b"From: hello@world.com\r\nDate: 12 Mar 1997 07:33:25 Z\r\n",
+            }),
+            GuessCharset {
+                header: Cow::Borrowed("From: hello@world.com\r\nDate: 12 Mar 1997 07:33:25 Z\r\n"),
+                encoding: encoding_rs::UTF_8,
+                malformed: false,
+                body: b"Hello world!",
+            });
+    }
+}
diff --git a/src/multipass/mod.rs b/src/multipass/mod.rs
index c5780e0..e3fa3b1 100644
--- a/src/multipass/mod.rs
+++ b/src/multipass/mod.rs
@@ -1 +1,2 @@
 pub mod segment;
+pub mod guess_charset;