use std::num::NonZeroU32; use anyhow::Result; use imap_codec::imap_types::core::NonEmptyVec; use imap_codec::imap_types::search::SearchKey; use imap_codec::imap_types::sequence::{SeqOrUid, Sequence, SequenceSet}; use crate::mail::query::{QueryScope, QueryResult}; use crate::imap::index::MailIndex; use crate::imap::mail_view::MailView; pub enum SeqType { Undefined, NonUid, Uid, } impl SeqType { pub fn is_uid(&self) -> bool { matches!(self, Self::Uid) } } pub struct Criteria<'a>(pub &'a SearchKey<'a>); impl<'a> Criteria<'a> { /// Returns a set of email identifiers that is greater or equal /// to the set of emails to return pub fn to_sequence_set(&self) -> (SequenceSet, SeqType) { match self.0 { SearchKey::All => (sequence_set_all(), SeqType::Undefined), SearchKey::SequenceSet(seq_set) => (seq_set.clone(), SeqType::NonUid), SearchKey::Uid(seq_set) => (seq_set.clone(), SeqType::Uid), SearchKey::Not(_inner) => { tracing::debug!( "using NOT in a search request is slow: it selects all identifiers" ); (sequence_set_all(), SeqType::Undefined) } SearchKey::Or(left, right) => { tracing::debug!("using OR in a search request is slow: no deduplication is done"); let (base, base_seqtype) = Self(&left).to_sequence_set(); let (ext, ext_seqtype) = Self(&right).to_sequence_set(); // Check if we have a UID/ID conflict in fetching: now we don't know how to handle them match (base_seqtype, ext_seqtype) { (SeqType::Uid, SeqType::NonUid) | (SeqType::NonUid, SeqType::Uid) => { (sequence_set_all(), SeqType::Undefined) } (SeqType::Undefined, x) | (x, _) => { let mut new_vec = base.0.into_inner(); new_vec.extend_from_slice(ext.0.as_ref()); let seq = SequenceSet( NonEmptyVec::try_from(new_vec) .expect("merging non empty vec lead to non empty vec"), ); (seq, x) } } } SearchKey::And(search_list) => { tracing::debug!( "using AND in a search request is slow: no intersection is performed" ); // As we perform no intersection, we don't care if we mix uid or id. // We only keep the smallest range, being it ID or UID, depending of // which one has the less items. This is an approximation as UID ranges // can have holes while ID ones can't. search_list .as_ref() .iter() .map(|crit| Self(&crit).to_sequence_set()) .min_by(|(x, _), (y, _)| { let x_size = approx_sequence_set_size(x); let y_size = approx_sequence_set_size(y); x_size.cmp(&y_size) }) .unwrap_or((sequence_set_all(), SeqType::Undefined)) } _ => (sequence_set_all(), SeqType::Undefined), } } /// Not really clever as we can have cases where we filter out /// the email before needing to inspect its meta. /// But for now we are seeking the most basic/stupid algorithm. pub fn query_scope(&self) -> QueryScope { use SearchKey::*; match self.0 { // Combinators And(and_list) => and_list.as_ref().iter().fold(QueryScope::Index, |prev, sk| { prev.union(&Criteria(sk).query_scope()) }), Not(inner) => Criteria(inner).query_scope(), Or(left, right) => Criteria(left).query_scope().union(&Criteria(right).query_scope()), All => QueryScope::Index, // IMF Headers Bcc(_) | Cc(_) | From(_) | Header(..) | SentBefore(_) | SentOn(_) | SentSince(_) | Subject(_) | To(_) => QueryScope::Partial, // Internal Date is also stored in MailMeta Before(_) | On(_) | Since(_) => QueryScope::Partial, // Message size is also stored in MailMeta Larger(_) | Smaller(_) => QueryScope::Partial, // Text and Body require that we fetch the full content! Text(_) | Body(_) => QueryScope::Full, _ => QueryScope::Index, } } /// Returns emails that we now for sure we want to keep /// but also a second list of emails we need to investigate further by /// fetching some remote data pub fn filter_on_idx<'b>(&self, midx_list: &[MailIndex<'b>]) -> (Vec>, Vec>) { let (p1, p2): (Vec<_>, Vec<_>) = midx_list .iter() .map(|x| (x, self.is_keep_on_idx(x))) .filter(|(_midx, decision)| decision.is_keep()) .map(|(midx, decision)| ((*midx).clone(), decision)) .partition(|(_midx, decision)| matches!(decision, PartialDecision::Keep)); let to_keep = p1.into_iter().map(|(v, _)| v).collect(); let to_fetch = p2.into_iter().map(|(v, _)| v).collect(); (to_keep, to_fetch) } pub fn filter_on_query<'b>(&self, midx_list: &[MailIndex<'b>], query_result: &'b Vec>) -> Result>> { Ok(midx_list .iter() .zip(query_result.iter()) .map(|(midx, qr)| MailView::new(qr, midx.clone())) .collect::, _>>()? .into_iter() .filter(|mail_view| self.is_keep_on_query(mail_view)) .map(|mail_view| mail_view.in_idx) .collect()) } // ---- /// Here we are doing a partial filtering: we do not have access /// to the headers or to the body, so every time we encounter a rule /// based on them, we need to keep it. /// /// @TODO Could be optimized on a per-email basis by also returning the QueryScope /// when more information is needed! fn is_keep_on_idx(&self, midx: &MailIndex) -> PartialDecision { use SearchKey::*; match self.0 { // Combinator logic And(expr_list) => expr_list .as_ref() .iter() .fold(PartialDecision::Keep, |acc, cur| acc.and(&Criteria(cur).is_keep_on_idx(midx))), Or(left, right) => { let left_decision = Criteria(left).is_keep_on_idx(midx); let right_decision = Criteria(right).is_keep_on_idx(midx); left_decision.or(&right_decision) } Not(expr) => Criteria(expr).is_keep_on_idx(midx).not(), All => PartialDecision::Keep, // Sequence logic maybe_seq if is_sk_seq(maybe_seq) => is_keep_seq(maybe_seq, midx).into(), maybe_flag if is_sk_flag(maybe_flag) => is_keep_flag(maybe_flag, midx).into(), // All the stuff we can't evaluate yet Bcc(_) | Cc(_) | From(_) | Header(..) | SentBefore(_) | SentOn(_) | SentSince(_) | Subject(_) | To(_) | Before(_) | On(_) | Since(_) | Larger(_) | Smaller(_) | Text(_) | Body(_) => PartialDecision::Postpone, unknown => { tracing::error!("Unknown filter {:?}", unknown); PartialDecision::Discard }, } } /// @TODO we re-eveluate twice the same logic. The correct way would be, on each pass, /// to simplify the searck query, by removing the elements that were already checked. /// For example if we have AND(OR(seqid(X), body(Y)), body(X)), we can't keep for sure /// the email, as body(x) might be false. So we need to check it. But as seqid(x) is true, /// we could simplify the request to just body(x) and truncate the first OR. Today, we are /// not doing that, and thus we reevaluate everything. fn is_keep_on_query(&self, mail_view: &MailView) -> bool { use SearchKey::*; match self.0 { // Combinator logic And(expr_list) => expr_list .as_ref() .iter() .all(|cur| Criteria(cur).is_keep_on_query(mail_view)), Or(left, right) => { Criteria(left).is_keep_on_query(mail_view) || Criteria(right).is_keep_on_query(mail_view) } Not(expr) => !Criteria(expr).is_keep_on_query(mail_view), All => true, // Reevaluating our previous logic... maybe_seq if is_sk_seq(maybe_seq) => is_keep_seq(maybe_seq, &mail_view.in_idx), maybe_flag if is_sk_flag(maybe_flag) => is_keep_flag(maybe_flag, &mail_view.in_idx), // Filter on mail meta Before(search_naive) => match mail_view.stored_naive_date() { Ok(msg_naive) => &msg_naive < search_naive.as_ref(), _ => false, }, On(search_naive) => match mail_view.stored_naive_date() { Ok(msg_naive) => &msg_naive == search_naive.as_ref(), _ => false, }, Since(search_naive) => match mail_view.stored_naive_date() { Ok(msg_naive) => &msg_naive > search_naive.as_ref(), _ => false, }, // Message size is also stored in MailMeta Larger(size_ref) => mail_view.query_result.metadata().expect("metadata were fetched").rfc822_size > *size_ref as usize, Smaller(size_ref) => mail_view.query_result.metadata().expect("metadata were fetched").rfc822_size < *size_ref as usize, // Filter on well-known headers Bcc(txt) => mail_view.is_header_contains_pattern(&b"bcc"[..], txt.as_ref()), Cc(txt) => mail_view.is_header_contains_pattern(&b"cc"[..], txt.as_ref()), From(txt) => mail_view.is_header_contains_pattern(&b"from"[..], txt.as_ref()), Subject(txt)=> mail_view.is_header_contains_pattern(&b"subject"[..], txt.as_ref()), To(txt) => mail_view.is_header_contains_pattern(&b"to"[..], txt.as_ref()), Header(hdr, txt) => mail_view.is_header_contains_pattern(hdr.as_ref(), txt.as_ref()), // Filter on Date header SentBefore(search_naive) => mail_view.imf().map(|imf| imf.naive_date().ok()).flatten().map(|msg_naive| &msg_naive < search_naive.as_ref()).unwrap_or(false), SentOn(search_naive) => mail_view.imf().map(|imf| imf.naive_date().ok()).flatten().map(|msg_naive| &msg_naive == search_naive.as_ref()).unwrap_or(false), SentSince(search_naive) => mail_view.imf().map(|imf| imf.naive_date().ok()).flatten().map(|msg_naive| &msg_naive > search_naive.as_ref()).unwrap_or(false), // Filter on the full content of the email Text(txt) => mail_view.content.as_msg().map(|msg| msg.raw_part.windows(txt.as_ref().len()).any(|win| win == txt.as_ref())).unwrap_or(false), Body(txt) => mail_view.content.as_msg().map(|msg| msg.raw_body.windows(txt.as_ref().len()).any(|win| win == txt.as_ref())).unwrap_or(false), unknown => { tracing::error!("Unknown filter {:?}", unknown); false }, } } } // ---- Sequence things ---- fn sequence_set_all() -> SequenceSet { SequenceSet::from(Sequence::Range( SeqOrUid::Value(NonZeroU32::MIN), SeqOrUid::Asterisk, )) } // This is wrong as sequences can overlap fn approx_sequence_set_size(seq_set: &SequenceSet) -> u64 { seq_set.0.as_ref().iter().fold(0u64, |acc, seq| { acc.saturating_add(approx_sequence_size(seq)) }) } // This is wrong as sequence UID can have holes, // as we don't know the number of messages in the mailbox also fn approx_sequence_size(seq: &Sequence) -> u64 { match seq { Sequence::Single(_) => 1, Sequence::Range(SeqOrUid::Asterisk, _) | Sequence::Range(_, SeqOrUid::Asterisk) => u64::MAX, Sequence::Range(SeqOrUid::Value(x1), SeqOrUid::Value(x2)) => { let x2 = x2.get() as i64; let x1 = x1.get() as i64; (x2 - x1).abs().try_into().unwrap_or(1) } } } // --- Partial decision things ---- enum PartialDecision { Keep, Discard, Postpone, } impl From for PartialDecision { fn from(x: bool) -> Self { match x { true => PartialDecision::Keep, _ => PartialDecision::Discard, } } } impl PartialDecision { fn not(&self) -> Self { match self { Self::Keep => Self::Discard, Self::Discard => Self::Keep, Self::Postpone => Self::Postpone, } } fn or(&self, other: &Self) -> Self { match (self, other) { (Self::Keep, _) | (_, Self::Keep) => Self::Keep, (Self::Postpone, _) | (_, Self::Postpone) => Self::Postpone, (Self::Discard, Self::Discard) => Self::Discard, } } fn and(&self, other: &Self) -> Self { match (self, other) { (Self::Discard, _) | (_, Self::Discard) => Self::Discard, (Self::Postpone, _) | (_, Self::Postpone) => Self::Postpone, (Self::Keep, Self::Keep) => Self::Keep, } } fn is_keep(&self) -> bool { !matches!(self, Self::Discard) } } // ----- Search Key things --- fn is_sk_flag(sk: &SearchKey) -> bool { use SearchKey::*; match sk { Answered | Deleted | Draft | Flagged | Keyword(..) | New | Old | Recent | Seen | Unanswered | Undeleted | Undraft | Unflagged | Unkeyword(..) | Unseen => true, _ => false, } } fn is_keep_flag(sk: &SearchKey, midx: &MailIndex) -> bool { use SearchKey::*; match sk { Answered => midx.is_flag_set("\\Answered"), Deleted => midx.is_flag_set("\\Deleted"), Draft => midx.is_flag_set("\\Draft"), Flagged => midx.is_flag_set("\\Flagged"), Keyword(kw) => midx.is_flag_set(kw.inner()), New => { let is_recent = midx.is_flag_set("\\Recent"); let is_seen = midx.is_flag_set("\\Seen"); is_recent && !is_seen }, Old => { let is_recent = midx.is_flag_set("\\Recent"); !is_recent }, Recent => midx.is_flag_set("\\Recent"), Seen => midx.is_flag_set("\\Seen"), Unanswered => { let is_answered = midx.is_flag_set("\\Recent"); !is_answered }, Undeleted => { let is_deleted = midx.is_flag_set("\\Deleted"); !is_deleted }, Undraft => { let is_draft = midx.is_flag_set("\\Draft"); !is_draft }, Unflagged => { let is_flagged = midx.is_flag_set("\\Flagged"); !is_flagged }, Unkeyword(kw) => { let is_keyword_set = midx.is_flag_set(kw.inner()); !is_keyword_set }, Unseen => { let is_seen = midx.is_flag_set("\\Seen"); !is_seen }, // Not flag logic _ => unreachable!(), } } fn is_sk_seq(sk: &SearchKey) -> bool { use SearchKey::*; match sk { SequenceSet(..) | Uid(..) => true, _ => false, } } fn is_keep_seq(sk: &SearchKey, midx: &MailIndex) -> bool { use SearchKey::*; match sk { SequenceSet(seq_set) => seq_set.0.as_ref().iter().any(|seq| midx.is_in_sequence_i(seq)), Uid(seq_set) => seq_set.0.as_ref().iter().any(|seq| midx.is_in_sequence_uid(seq)), _ => unreachable!(), } }