diff options
Diffstat (limited to 'crates/store/src/fts')
-rw-r--r-- | crates/store/src/fts/bloom.rs | 257 | ||||
-rw-r--r-- | crates/store/src/fts/builder.rs | 250 | ||||
-rw-r--r-- | crates/store/src/fts/index.rs | 372 | ||||
-rw-r--r-- | crates/store/src/fts/mod.rs | 211 | ||||
-rw-r--r-- | crates/store/src/fts/query.rs | 291 | ||||
-rw-r--r-- | crates/store/src/fts/search_snippet.rs | 275 | ||||
-rw-r--r-- | crates/store/src/fts/term_index.rs | 2 |
7 files changed, 731 insertions, 927 deletions
diff --git a/crates/store/src/fts/bloom.rs b/crates/store/src/fts/bloom.rs deleted file mode 100644 index 6145a637..00000000 --- a/crates/store/src/fts/bloom.rs +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (c) 2023 Stalwart Labs Ltd. - * - * This file is part of the Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::{ - borrow::Cow, - f64::consts::LN_2, - hash::{Hash, Hasher}, -}; - -use nlp::{language::stemmer::StemmedToken, tokenizers::Token}; -use roaring::RoaringBitmap; -use utils::codec::leb128::{Leb128Reader, Leb128Vec}; - -use crate::{Deserialize, Error, Serialize}; - -pub struct BloomFilter { - m: u64, - b: RoaringBitmap, -} - -#[derive(Debug)] -pub struct BloomHash { - pub h: [u64; 7], -} - -#[derive(Debug)] -pub struct BloomHashGroup { - pub h1: BloomHash, - pub h2: Option<BloomHash>, -} - -const AHASHER: ahash::RandomState = ahash::RandomState::with_seeds( - 0xaf1f2242106c64b3, - 0x60ca4cfb4b3ed0ce, - 0xc7dbc0bb615e82b3, - 0x520ad065378daf88, -); -lazy_static::lazy_static! { - static ref SIPHASHER: siphasher::sip::SipHasher13 = - siphasher::sip::SipHasher13::new_with_keys(0x56205cbdba8f02a6, 0xbd0dbc4bb06d687b); -} - -const P: f64 = 0.01; - -impl BloomFilter { - pub fn new(items: usize) -> Self { - Self { - m: if items > 0 { - std::cmp::max(Self::estimate_m(items, P), 10240) - } else { - 0 - }, - b: RoaringBitmap::new(), - } - } - - fn from_params(m: u64, b: RoaringBitmap) -> Self { - Self { m, b } - } - - fn estimate_m(n: usize, p: f64) -> u64 { - (((n as f64) * f64::ln(p) / (-8.0 * LN_2.powi(2))).ceil() as u64) * 8 - } - - #[allow(dead_code)] - fn estimate_k(m: u64, n: usize) -> u32 { - std::cmp::max(((m as f64) / (n as f64) * f64::ln(2.0f64)).ceil() as u32, 1) - } - - pub fn insert(&mut self, hash: &BloomHash) { - self.b.insert((hash.h[0] % self.m) as u32); - self.b.insert((hash.h[1] % self.m) as u32); - self.b.insert((hash.h[2] % self.m) as u32); - self.b.insert((hash.h[3] % self.m) as u32); - self.b.insert((hash.h[4] % self.m) as u32); - self.b.insert((hash.h[5] % self.m) as u32); - self.b.insert((hash.h[6] % self.m) as u32); - } - - pub fn contains(&self, hash: &BloomHash) -> bool { - self.b.contains((hash.h[0] % self.m) as u32) - && self.b.contains((hash.h[1] % self.m) as u32) - && self.b.contains((hash.h[2] % self.m) as u32) - && self.b.contains((hash.h[3] % self.m) as u32) - && self.b.contains((hash.h[4] % self.m) as u32) - && self.b.contains((hash.h[5] % self.m) as u32) - && self.b.contains((hash.h[6] % self.m) as u32) - } - - pub fn is_subset(&self, other: &Self) -> bool { - self.b.is_subset(&other.b) - } - - pub fn is_empty(&self) -> bool { - self.m == 0 || self.b.is_empty() - } -} - -pub trait BloomHasher { - fn hash<T: Hash + AsRef<[u8]> + ?Sized>(item: &T) -> Self; -} - -impl BloomHash { - pub fn hash<T: Hash + AsRef<[u8]> + ?Sized>(item: &T) -> Self { - let h1 = xxhash_rust::xxh3::xxh3_64(item.as_ref()); - let h2 = farmhash::hash64(item.as_ref()); - let h3 = AHASHER.hash_one(item); - let mut sh = *SIPHASHER; - sh.write(item.as_ref()); - let h4 = sh.finish(); - - Self { - h: [h1, h2, h3, h4, h1 ^ h2, h2 ^ h3, h3 ^ h4], - } - } -} - -pub fn hash_token(item: &str) -> Vec<u8> { - let h1 = xxhash_rust::xxh3::xxh3_64(item.as_ref()).to_le_bytes(); - let h2 = farmhash::hash64(item.as_ref()).to_le_bytes(); - let h3 = AHASHER.hash_one(item).to_le_bytes(); - let mut sh = *SIPHASHER; - sh.write(item.as_ref()); - let h4 = sh.finish().to_le_bytes(); - - match item.len() { - 0..=8 => { - let mut hash = Vec::with_capacity(6); - hash.extend_from_slice(&h1[..2]); - hash.extend_from_slice(&h2[..2]); - hash.push(h3[0]); - hash.push(h4[0]); - hash - } - 9..=16 => { - let mut hash = Vec::with_capacity(8); - hash.extend_from_slice(&h1[..2]); - hash.extend_from_slice(&h2[..2]); - hash.extend_from_slice(&h3[..2]); - hash.extend_from_slice(&h4[..2]); - hash - } - 17..=32 => { - let mut hash = Vec::with_capacity(12); - hash.extend_from_slice(&h1[..3]); - hash.extend_from_slice(&h2[..3]); - hash.extend_from_slice(&h3[..3]); - hash.extend_from_slice(&h4[..3]); - hash - } - _ => { - let mut hash = Vec::with_capacity(16); - hash.extend_from_slice(&h1[..4]); - hash.extend_from_slice(&h2[..4]); - hash.extend_from_slice(&h3[..4]); - hash.extend_from_slice(&h4[..4]); - hash - } - } -} - -impl From<&str> for BloomHash { - fn from(s: &str) -> Self { - Self::hash(&s) - } -} - -impl From<String> for BloomHash { - fn from(s: String) -> Self { - Self::hash(&s) - } -} - -impl From<&String> for BloomHash { - fn from(s: &String) -> Self { - Self::hash(&s) - } -} - -impl From<Cow<'_, str>> for BloomHash { - fn from(s: Cow<'_, str>) -> Self { - Self::hash(s.as_ref()) - } -} - -impl From<Token<Cow<'_, str>>> for BloomHashGroup { - fn from(t: Token<Cow<'_, str>>) -> Self { - Self { - h1: BloomHash::hash(t.word.as_ref()), - h2: None, - } - } -} - -impl From<StemmedToken<'_>> for BloomHashGroup { - fn from(t: StemmedToken<'_>) -> Self { - Self { - h1: BloomHash::hash(t.word.as_ref()), - h2: t.stemmed_word.map(|w| BloomHash::hash(&format!("{w}_"))), - } - } -} - -impl From<Cow<'_, str>> for BloomHashGroup { - fn from(t: Cow<'_, str>) -> Self { - Self { - h1: BloomHash::hash(t.as_ref()), - h2: None, - } - } -} - -impl Serialize for BloomFilter { - fn serialize(self) -> Vec<u8> { - let mut buf = Vec::with_capacity(U64_LEN + self.b.serialized_size()); - buf.push_leb128(self.m); - let _ = self.b.serialize_into(&mut buf); - buf - } -} - -impl Deserialize for BloomFilter { - fn deserialize(bytes: &[u8]) -> crate::Result<Self> { - let (m, pos) = bytes.read_leb128().ok_or_else(|| { - Error::InternalError( - "Failed to read 'm' value while deserializing bloom filter.".to_string(), - ) - })?; - RoaringBitmap::deserialize_unchecked_from(bytes.get(pos..).ok_or_else(|| { - Error::InternalError( - "Failed to read bitmap while deserializing bloom filter.".to_string(), - ) - })?) - .map_err(|err| Error::InternalError(format!("Failed to deserialize bloom filter: {err}."))) - .map(|b| Self::from_params(m, b)) - } -} diff --git a/crates/store/src/fts/builder.rs b/crates/store/src/fts/builder.rs deleted file mode 100644 index f4a8422d..00000000 --- a/crates/store/src/fts/builder.rs +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (c) 2023 Stalwart Labs Ltd. - * - * This file is part of the Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::{borrow::Cow, collections::HashSet, fmt::Display}; - -use ahash::AHashSet; -use nlp::{ - language::{ - detect::{LanguageDetector, MIN_LANGUAGE_SCORE}, - stemmer::Stemmer, - Language, - }, - tokenizers::{space::SpaceTokenizer, Token}, -}; -use utils::map::vec_map::VecMap; - -use crate::{ - query::RawValue, - write::{BatchBuilder, IntoOperations, Operation, ValueClass}, - Serialize, HASH_EXACT, HASH_STEMMED, -}; - -use super::term_index::{TermIndexBuilder, TokenIndex}; - -pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 2) as usize; -pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1; - -struct Text<'x, T: Into<u8> + Display> { - field: T, - text: Cow<'x, str>, - language: Type, -} - -enum Type { - Stem(Language), - Tokenize, - Static, -} - -pub struct FtsIndexBuilder<'x, T: Into<u8> + Display> { - parts: Vec<Text<'x, T>>, - default_language: Language, -} - -impl<'x, T: Into<u8> + Display> FtsIndexBuilder<'x, T> { - pub fn with_default_language(default_language: Language) -> FtsIndexBuilder<'x, T> { - FtsIndexBuilder { - parts: vec![], - default_language, - } - } - - pub fn index(&mut self, field: T, text: impl Into<Cow<'x, str>>, language: Language) { - self.parts.push(Text { - field, - text: text.into(), - language: Type::Stem(language), - }); - } - - pub fn index_raw(&mut self, field: T, text: impl Into<Cow<'x, str>>) { - self.parts.push(Text { - field, - text: text.into(), - language: Type::Tokenize, - }); - } - - pub fn index_raw_token(&mut self, field: T, text: impl Into<Cow<'x, str>>) { - self.parts.push(Text { - field, - text: text.into(), - language: Type::Static, - }); - } -} - -impl<'x, T: Into<u8> + Display> IntoOperations for FtsIndexBuilder<'x, T> { - fn build(self, batch: &mut BatchBuilder) { - let mut detect = LanguageDetector::new(); - let mut tokens: VecMap<u8, AHashSet<String>> = VecMap::new(); - let mut parts = Vec::new(); - - for text in self.parts { - match text.language { - Type::Stem(language) => { - let language = if language == Language::Unknown { - detect.detect(&text.text, MIN_LANGUAGE_SCORE) - } else { - language - }; - parts.push((text.field, language, text.text)); - } - Type::Tokenize => { - let tokens = tokens.get_mut_or_insert(text.field.into()); - for token in SpaceTokenizer::new(text.text.as_ref(), MAX_TOKEN_LENGTH) { - tokens.insert(token); - } - } - Type::Static => { - tokens - .get_mut_or_insert(text.field.into()) - .insert(text.text.into_owned()); - } - } - } - - let default_language = detect - .most_frequent_language() - .unwrap_or(self.default_language); - let mut term_index = TermIndexBuilder::new(); - let mut ops = AHashSet::new(); - - for (part_id, (field, language, text)) in parts.into_iter().enumerate() { - let language = if language != Language::Unknown { - language - } else { - default_language - }; - let mut terms = Vec::new(); - let field: u8 = field.into(); - - for token in Stemmer::new(&text, language, MAX_TOKEN_LENGTH).collect::<Vec<_>>() { - ops.insert(Operation::hash(&token.word, HASH_EXACT, field, true)); - if let Some(stemmed_word) = &token.stemmed_word { - ops.insert(Operation::hash(stemmed_word, HASH_STEMMED, field, true)); - } - terms.push(term_index.add_stemmed_token(token)); - } - - if !terms.is_empty() { - term_index.add_terms(field, part_id as u32, terms); - } - } - - for (field, tokens) in tokens { - let mut terms = Vec::with_capacity(tokens.len()); - for token in tokens { - ops.insert(Operation::hash(&token, HASH_EXACT, field, true)); - terms.push(term_index.add_token(Token { - word: token.into(), - from: 0, - to: 0, - })); - } - term_index.add_terms(field, 0, terms); - } - - for op in ops { - batch.ops.push(op); - } - - batch.ops.push(Operation::Value { - class: ValueClass::Property { - field: u8::MAX, - family: u8::MAX, - }, - set: term_index.serialize().into(), - }); - } -} - -impl TokenIndex { - fn build_index(self, batch: &mut BatchBuilder, set: bool) { - let mut ops = AHashSet::with_capacity(self.tokens.len() * 2); - for term in self.terms { - for (term_ids, is_exact) in [(term.exact_terms, true), (term.stemmed_terms, false)] { - for term_id in term_ids { - if let Some(word) = self.tokens.get(term_id as usize) { - ops.insert(Operation::hash( - word, - if is_exact { HASH_EXACT } else { HASH_STEMMED }, - term.field_id, - set, - )); - } - } - } - } - for op in ops { - batch.ops.push(op); - } - } -} - -impl IntoOperations for TokenIndex { - fn build(self, batch: &mut BatchBuilder) { - self.build_index(batch, false); - batch.ops.push(Operation::Value { - class: ValueClass::Property { - field: u8::MAX, - family: u8::MAX, - }, - set: None, - }); - } -} - -impl IntoOperations for RawValue<TokenIndex> { - fn build(self, batch: &mut BatchBuilder) { - self.inner.build_index(batch, true); - batch.ops.push(Operation::Value { - class: ValueClass::Property { - field: u8::MAX, - family: u8::MAX, - }, - set: self.raw.into(), - }); - } -} - -pub trait ToTokens { - fn to_tokens(&self) -> HashSet<String>; -} - -impl ToTokens for &str { - fn to_tokens(&self) -> HashSet<String> { - let mut tokens = HashSet::new(); - for token in SpaceTokenizer::new(self, MAX_TOKEN_LENGTH) { - tokens.insert(token); - } - tokens - } -} - -impl ToTokens for &String { - fn to_tokens(&self) -> HashSet<String> { - self.as_str().to_tokens() - } -} diff --git a/crates/store/src/fts/index.rs b/crates/store/src/fts/index.rs new file mode 100644 index 00000000..1493fdcd --- /dev/null +++ b/crates/store/src/fts/index.rs @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2023 Stalwart Labs Ltd. + * + * This file is part of the Stalwart Mail Server. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * in the LICENSE file at the top-level directory of this distribution. + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * You can be released from the requirements of the AGPLv3 license by + * purchasing a commercial license. Please contact licensing@stalw.art + * for more details. +*/ + +use std::{borrow::Cow, fmt::Display}; + +use ahash::{AHashMap, AHashSet}; +use nlp::{ + language::{ + detect::{LanguageDetector, MIN_LANGUAGE_SCORE}, + stemmer::Stemmer, + Language, + }, + tokenizers::word::WordTokenizer, +}; + +use crate::{ + backend::MAX_TOKEN_LENGTH, + write::{ + hash::TokenType, key::KeySerializer, BatchBuilder, BitmapClass, BitmapHash, Operation, + ValueClass, + }, + Deserialize, Error, Store, ValueKey, U64_LEN, +}; + +use super::Field; + +#[derive(Debug)] +struct Text<'x, T: Into<u8> + Display + Clone + std::fmt::Debug> { + field: Field<T>, + text: Cow<'x, str>, + typ: Type, +} + +#[derive(Debug)] +enum Type { + Text(Language), + Tokenize, + Keyword, +} + +#[derive(Debug)] +pub struct FtsDocument<'x, T: Into<u8> + Display + Clone + std::fmt::Debug> { + parts: Vec<Text<'x, T>>, + default_language: Language, + account_id: u32, + collection: u8, + document_id: u32, +} + +impl<'x, T: Into<u8> + Display + Clone + std::fmt::Debug> FtsDocument<'x, T> { + pub fn with_default_language(default_language: Language) -> FtsDocument<'x, T> { + FtsDocument { + parts: vec![], + default_language, + account_id: 0, + document_id: 0, + collection: 0, + } + } + + pub fn with_account_id(mut self, account_id: u32) -> Self { + self.account_id = account_id; + self + } + + pub fn with_document_id(mut self, document_id: u32) -> Self { + self.document_id = document_id; + self + } + + pub fn with_collection(mut self, collection: impl Into<u8>) -> Self { + self.collection = collection.into(); + self + } + + pub fn index(&mut self, field: Field<T>, text: impl Into<Cow<'x, str>>, language: Language) { + self.parts.push(Text { + field, + text: text.into(), + typ: Type::Text(language), + }); + } + + pub fn index_tokenized(&mut self, field: Field<T>, text: impl Into<Cow<'x, str>>) { + self.parts.push(Text { + field, + text: text.into(), + typ: Type::Tokenize, + }); + } + + pub fn index_keyword(&mut self, field: Field<T>, text: impl Into<Cow<'x, str>>) { + self.parts.push(Text { + field, + text: text.into(), + typ: Type::Keyword, + }); + } +} + +impl<T: Into<u8> + Display + Clone + std::fmt::Debug> From<Field<T>> for u8 { + fn from(value: Field<T>) -> Self { + match value { + Field::Body => 0, + Field::Attachment => 1, + Field::Keyword => 2, + Field::Header(value) => 3 + value.into(), + } + } +} + +impl Store { + pub async fn fts_index<T: Into<u8> + Display + Clone + std::fmt::Debug>( + &self, + document: FtsDocument<'_, T>, + ) -> crate::Result<()> { + let mut detect = LanguageDetector::new(); + let mut tokens: AHashMap<BitmapHash, AHashSet<u8>> = AHashMap::new(); + let mut parts = Vec::new(); + + for text in document.parts { + match text.typ { + Type::Text(language) => { + let language = if language == Language::Unknown { + detect.detect(&text.text, MIN_LANGUAGE_SCORE) + } else { + language + }; + parts.push((text.field, language, text.text)); + } + Type::Tokenize => { + let field = u8::from(text.field); + for token in WordTokenizer::new(text.text.as_ref(), MAX_TOKEN_LENGTH) { + tokens + .entry(BitmapHash::new(token.word.as_ref())) + .or_default() + .insert(TokenType::word(field)); + } + } + Type::Keyword => { + let field = u8::from(text.field); + tokens + .entry(BitmapHash::new(text.text.as_ref())) + .or_default() + .insert(TokenType::word(field)); + } + } + } + + let default_language = detect + .most_frequent_language() + .unwrap_or(document.default_language); + + for (field, language, text) in parts.into_iter() { + let language = if language != Language::Unknown { + language + } else { + default_language + }; + let field: u8 = field.into(); + + let mut last_token = Cow::Borrowed(""); + for token in Stemmer::new(&text, language, MAX_TOKEN_LENGTH) { + if !last_token.is_empty() { + tokens + .entry(BitmapHash::new(&format!("{} {}", last_token, token.word))) + .or_default() + .insert(TokenType::bigram(field)); + } + + tokens + .entry(BitmapHash::new(token.word.as_ref())) + .or_default() + .insert(TokenType::word(field)); + + if let Some(stemmed_word) = token.stemmed_word { + tokens + .entry(BitmapHash::new(stemmed_word.as_ref())) + .or_default() + .insert(TokenType::stemmed(field)); + } + + last_token = token.word; + } + } + + if tokens.is_empty() { + return Ok(()); + } + + // Serialize tokens + let mut serializer = KeySerializer::new(tokens.len() * U64_LEN * 2); + let mut keys = Vec::with_capacity(tokens.len()); + + for (hash, fields) in tokens.into_iter() { + serializer = serializer + .write(hash.hash.as_slice()) + .write(hash.len) + .write(fields.len() as u8); + for field in fields.into_iter() { + serializer = serializer.write(field); + keys.push(Operation::Bitmap { + class: BitmapClass::Text { field, token: hash }, + set: true, + }); + } + } + + // Write term index + let mut batch = BatchBuilder::new(); + batch + .with_account_id(document.account_id) + .with_collection(document.collection) + .update_document(document.document_id) + .set( + ValueClass::TermIndex, + lz4_flex::compress_prepend_size(&serializer.finalize()), + ); + self.write(batch.build()).await?; + let mut batch = BatchBuilder::new(); + batch + .with_account_id(document.account_id) + .with_collection(document.collection) + .update_document(document.document_id); + + for (pos, key) in keys.into_iter().enumerate() { + if pos > 0 && pos & 1023 == 0 { + self.write(batch.build()).await?; + batch = BatchBuilder::new(); + batch + .with_account_id(document.account_id) + .with_collection(document.collection) + .update_document(document.document_id); + } + batch.ops.push(key); + } + + if !batch.is_empty() { + self.write(batch.build()).await?; + } + + Ok(()) + } + + pub async fn fts_remove( + &self, + account_id: u32, + collection: u8, + document_id: u32, + ) -> crate::Result<bool> { + // Obtain term index + let term_index = if let Some(term_index) = self + .get_value::<TermIndex>(ValueKey { + account_id, + collection, + document_id, + class: ValueClass::TermIndex, + }) + .await? + { + term_index + } else { + return Ok(false); + }; + + // Remove keys + let mut batch = BatchBuilder::new(); + batch + .with_account_id(account_id) + .with_collection(collection) + .update_document(document_id); + + for (pos, key) in term_index.ops.into_iter().enumerate() { + if pos > 0 && pos & 1023 == 0 { + self.write(batch.build()).await?; + batch = BatchBuilder::new(); + batch + .with_account_id(account_id) + .with_collection(collection) + .update_document(document_id); + } + batch.ops.push(key); + } + + if !batch.is_empty() { + self.write(batch.build()).await?; + } + + // Remove term index + let mut batch = BatchBuilder::new(); + batch + .with_account_id(account_id) + .with_collection(collection) + .update_document(document_id) + .clear(ValueClass::TermIndex); + + self.write(batch.build()).await?; + + Ok(true) + } + + pub async fn fts_remove_all(&self, _: u32) -> crate::Result<()> { + // No-op + // Term indexes are stored in the same key range as the document + + Ok(()) + } +} + +struct TermIndex { + ops: Vec<Operation>, +} + +impl Deserialize for TermIndex { + fn deserialize(bytes: &[u8]) -> crate::Result<Self> { + let bytes = lz4_flex::decompress_size_prepended(bytes) + .map_err(|_| Error::InternalError("Failed to decompress term index".to_string()))?; + let mut ops = Vec::new(); + let mut bytes = bytes.iter().peekable(); + + while bytes.peek().is_some() { + let mut hash = BitmapHash { + hash: [0; 8], + len: 0, + }; + + for byte in hash.hash.iter_mut() { + *byte = *bytes.next().ok_or(Error::InternalError( + "Unexpected EOF reading term index".to_string(), + ))?; + } + + hash.len = *bytes.next().ok_or(Error::InternalError( + "Unexpected EOF reading term index".to_string(), + ))?; + let num_fields = *bytes.next().ok_or(Error::InternalError( + "Unexpected EOF reading term index".to_string(), + ))?; + for _ in 0..num_fields { + let field = *bytes.next().ok_or(Error::InternalError( + "Unexpected EOF reading term index".to_string(), + ))?; + ops.push(Operation::Bitmap { + class: BitmapClass::Text { field, token: hash }, + set: false, + }); + } + } + + Ok(Self { ops }) + } +} diff --git a/crates/store/src/fts/mod.rs b/crates/store/src/fts/mod.rs index 8761f076..cb38f1e9 100644 --- a/crates/store/src/fts/mod.rs +++ b/crates/store/src/fts/mod.rs @@ -21,55 +21,188 @@ * for more details. */ -use crate::{ - write::{BitmapFamily, Operation}, - BitmapKey, Serialize, BM_HASH, -}; +use std::fmt::Display; -use self::{bloom::hash_token, builder::MAX_TOKEN_MASK}; +use nlp::language::Language; -pub mod bloom; -pub mod builder; +pub mod index; pub mod query; -pub mod search_snippet; -pub mod term_index; - -impl BitmapKey<Vec<u8>> { - pub fn hash(word: &str, account_id: u32, collection: u8, family: u8, field: u8) -> Self { - BitmapKey { - account_id, - collection, - family: BM_HASH | family | (word.len() & MAX_TOKEN_MASK) as u8, - field, - block_num: 0, - key: hash_token(word), + +#[derive(Clone, Debug)] +pub enum Field<T: Into<u8> + Display + Clone + std::fmt::Debug> { + Header(T), + Body, + Attachment, + Keyword, +} + +#[derive(Debug)] +pub enum FtsFilter<T: Into<u8> + Display + Clone + std::fmt::Debug> { + Exact { + field: Field<T>, + text: String, + language: Language, + }, + Contains { + field: Field<T>, + text: String, + language: Language, + }, + Keyword { + field: Field<T>, + text: String, + }, + And, + Or, + Not, + End, +} + +impl<T: Into<u8> + Display + Clone + std::fmt::Debug> FtsFilter<T> { + pub fn has_text_detect( + field: Field<T>, + text: impl Into<String>, + default_language: Language, + ) -> Self { + let (text, language) = Language::detect(text.into(), default_language); + Self::has_text(field, text, language) + } + + pub fn has_text(field: Field<T>, text: impl Into<String>, language: Language) -> Self { + let text = text.into(); + if !matches!(language, Language::None) && (text.starts_with('"') && text.ends_with('"')) + || (text.starts_with('\'') && text.ends_with('\'')) + { + FtsFilter::Exact { + field, + text, + language, + } + } else { + FtsFilter::Contains { + field, + text, + language, + } } } - pub fn value( - account_id: u32, - collection: impl Into<u8>, - field: impl Into<u8>, - value: impl BitmapFamily + Serialize, - ) -> Self { - BitmapKey { - account_id, - collection: collection.into(), - family: value.family(), - field: field.into(), - block_num: 0, - key: value.serialize(), + pub fn has_keyword(field: Field<T>, text: impl Into<String>) -> Self { + FtsFilter::Keyword { + field, + text: text.into(), } } + + pub fn has_english_text(field: Field<T>, text: impl Into<String>) -> Self { + Self::has_text(field, text, Language::English) + } } -impl Operation { - pub fn hash(word: &str, family: u8, field: u8, set: bool) -> Self { - Operation::Bitmap { - family: BM_HASH | family | (word.len() & MAX_TOKEN_MASK) as u8, - field, - key: hash_token(word), - set, +#[derive(Clone, Copy)] +pub enum FilterType { + And, + Or, + Not, + End, + Store, + Fts, +} + +pub enum FilterGroup<T: FilterItem> { + Fts(Vec<T>), + Store(T), +} + +pub trait FilterItem: Clone { + fn filter_type(&self) -> FilterType; +} + +pub trait IntoFilterGroup<T: FilterItem + From<FilterType>> { + fn into_filter_group(self) -> Vec<FilterGroup<T>>; +} + +impl<T: FilterItem + From<FilterType>> IntoFilterGroup<T> for Vec<T> { + fn into_filter_group(self) -> Vec<FilterGroup<T>> { + let mut filter = Vec::with_capacity(self.len()); + let mut iter = self.into_iter(); + let mut logical_op = None; + + while let Some(item) = iter.next() { + if matches!(item.filter_type(), FilterType::Fts) { + let mut store_item = None; + let mut depth = 0; + let mut fts = Vec::with_capacity(5); + + // Add the logical operator if there is one + let in_logical_op = if let Some(op) = logical_op.take() { + fts.push(op); + true + } else { + false + }; + fts.push(item); + + for item in iter.by_ref() { + match item.filter_type() { + FilterType::And | FilterType::Or | FilterType::Not => { + depth += 1; + fts.push(item); + } + FilterType::End if depth > 0 => { + depth -= 1; + fts.push(item); + } + FilterType::Fts => { + fts.push(item); + } + _ => { + store_item = Some(item); + break; + } + } + } + + if in_logical_op { + fts.push(T::from(FilterType::End)); + } + + if depth > 0 { + let mut store = Vec::with_capacity(depth * 2); + while depth > 0 { + let item = fts.pop().unwrap(); + if matches!( + item.filter_type(), + FilterType::And | FilterType::Or | FilterType::Not + ) { + depth -= 1; + } + store.push(FilterGroup::Store(item)); + } + + filter.push(FilterGroup::Fts(fts)); + filter.extend(store); + } else { + filter.push(FilterGroup::Fts(fts)); + } + + if let Some(item) = store_item { + filter.push(FilterGroup::Store(item)); + } + } else { + match item.filter_type() { + FilterType::And | FilterType::Or => { + logical_op = Some(item.clone()); + } + FilterType::Not => { + logical_op = Some(T::from(FilterType::And)); + } + _ => {} + } + filter.push(FilterGroup::Store(item)); + } } + + filter } } diff --git a/crates/store/src/fts/query.rs b/crates/store/src/fts/query.rs index 37938e3f..6be67a75 100644 --- a/crates/store/src/fts/query.rs +++ b/crates/store/src/fts/query.rs @@ -21,138 +21,210 @@ * for more details. */ -use std::ops::BitOrAssign; +use std::{ + fmt::Display, + ops::{BitAndAssign, BitOrAssign, BitXorAssign}, +}; -use nlp::language::{stemmer::Stemmer, Language}; +use nlp::language::stemmer::Stemmer; use roaring::RoaringBitmap; -use crate::{fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ValueKey, HASH_EXACT, HASH_STEMMED}; +use crate::{backend::MAX_TOKEN_LENGTH, fts::FtsFilter, write::BitmapClass, BitmapKey, Store}; -use super::term_index::TermIndex; +struct State<T: Into<u8> + Display + Clone + std::fmt::Debug> { + pub op: FtsFilter<T>, + pub bm: Option<RoaringBitmap>, +} -#[async_trait::async_trait] -pub trait StoreFts: StoreRead { - async fn fts_query( - &mut self, +impl Store { + pub async fn fts_query<T: Into<u8> + Display + Clone + std::fmt::Debug>( + &self, account_id: u32, - collection: u8, - field: u8, - text: &str, - language: Language, - match_phrase: bool, - ) -> crate::Result<Option<RoaringBitmap>> { - if match_phrase { - let mut phrase = Vec::new(); - let mut bit_keys = Vec::new(); - for token in language.tokenize_text(text, MAX_TOKEN_LENGTH) { - let key = BitmapKey::hash( - token.word.as_ref(), - account_id, - collection, - HASH_EXACT, + collection: impl Into<u8>, + filters: Vec<FtsFilter<T>>, + ) -> crate::Result<RoaringBitmap> { + let collection = collection.into(); + let mut not_mask = RoaringBitmap::new(); + let mut not_fetch = false; + + let mut state: State<T> = FtsFilter::And.into(); + let mut stack = Vec::new(); + let mut filters = filters.into_iter().peekable(); + + while let Some(filter) = filters.next() { + let mut result = match filter { + FtsFilter::Exact { field, - ); - if !bit_keys.contains(&key) { - bit_keys.push(key); + text, + language, + } => { + let field: u8 = field.clone().into(); + + let tokens = language + .tokenize_text(text.as_ref(), MAX_TOKEN_LENGTH) + .map(|t| t.word) + .collect::<Vec<_>>(); + let keys = if tokens.len() > 1 { + tokens + .windows(2) + .map(|bg| BitmapKey { + account_id, + collection, + class: BitmapClass::bigram(format!("{} {}", bg[0], bg[1]), field), + block_num: 0, + }) + .collect::<Vec<_>>() + } else { + tokens + .into_iter() + .map(|word| BitmapKey { + account_id, + collection, + class: BitmapClass::word(word.as_ref(), field), + block_num: 0, + }) + .collect::<Vec<_>>() + }; + + self.get_bitmaps_intersection(keys).await? } + FtsFilter::Contains { + field, + text, + language, + } => { + let mut result = RoaringBitmap::new(); + let field: u8 = field.clone().into(); - phrase.push(token.word); - } - let bitmaps = match self.get_bitmaps_intersection(bit_keys).await? { - Some(b) if !b.is_empty() => b, - _ => return Ok(None), - }; + for token in Stemmer::new(text.as_ref(), language, MAX_TOKEN_LENGTH) { + let token1 = BitmapKey { + account_id, + collection, + class: BitmapClass::word(token.word.as_ref(), field), + block_num: 0, + }; + let token2 = BitmapKey { + account_id, + collection, + class: BitmapClass::stemmed( + if let Some(stemmed_word) = token.stemmed_word { + stemmed_word + } else { + token.word + } + .as_ref(), + field, + ), + block_num: 0, + }; - match phrase.len() { - 0 => return Ok(None), - 1 => return Ok(Some(bitmaps)), - _ => (), - } + match self.get_bitmaps_union(vec![token1, token2]).await? { + Some(b) if !b.is_empty() => { + if !result.is_empty() { + result &= b; + if result.is_empty() { + break; + } + } else { + result = b; + } + } + _ => break, + } + } - let mut results = RoaringBitmap::new(); - for document_id in bitmaps { - if let Some(term_index) = self - .get_value::<TermIndex>(ValueKey::term_index( + if !result.is_empty() { + Some(result) + } else { + None + } + } + FtsFilter::Keyword { field, text } => { + self.get_bitmap(BitmapKey { account_id, collection, - document_id, - )) + class: BitmapClass::word(text, field), + block_num: 0, + }) .await? - { - if term_index - .match_terms( - &phrase - .iter() - .map(|w| term_index.get_match_term(w, None)) - .collect::<Vec<_>>(), - field.into(), - true, - false, - false, - ) - .map_err(|e| { - crate::Error::InternalError(format!( - "TermIndex match_terms failed for {account_id}/{collection}/{document_id}: {e:?}" - )) - })? - .is_some() - { - results.insert(document_id); + } + op @ (FtsFilter::And | FtsFilter::Or | FtsFilter::Not) => { + stack.push(state); + state = op.into(); + continue; + } + FtsFilter::End => { + if let Some(prev_state) = stack.pop() { + let bm = state.bm; + state = prev_state; + bm + } else { + break; } - } else { - tracing::debug!( - event = "error", - context = "fts_query", - account_id = account_id, - collection = collection, - document_id = document_id, - "Document is missing a term index", - ); } - } + }; - if !results.is_empty() { - Ok(Some(results)) - } else { - Ok(None) + // Only fetch not mask if we need it + if matches!(state.op, FtsFilter::Not) && !not_fetch { + not_mask = self + .get_bitmap(BitmapKey::document_ids(account_id, collection)) + .await? + .unwrap_or_else(RoaringBitmap::new); + not_fetch = true; } - } else { - let mut bitmaps = RoaringBitmap::new(); - - for token in Stemmer::new(text, language, MAX_TOKEN_LENGTH) { - let token1 = - BitmapKey::hash(&token.word, account_id, collection, HASH_EXACT, field); - let token2 = if let Some(stemmed_word) = token.stemmed_word { - BitmapKey::hash(&stemmed_word, account_id, collection, HASH_STEMMED, field) - } else { - let mut token2 = token1.clone(); - token2.family &= !HASH_EXACT; - token2.family |= HASH_STEMMED; - token2 - }; - - match self.get_bitmaps_union(vec![token1, token2]).await? { - Some(b) if !b.is_empty() => { - if !bitmaps.is_empty() { - bitmaps &= b; - if bitmaps.is_empty() { - return Ok(None); - } + + // Apply logical operation + if let Some(dest) = &mut state.bm { + match state.op { + FtsFilter::And => { + if let Some(result) = result { + dest.bitand_assign(result); } else { - bitmaps = b; + dest.clear(); + } + } + FtsFilter::Or => { + if let Some(result) = result { + dest.bitor_assign(result); + } + } + FtsFilter::Not => { + if let Some(mut result) = result { + result.bitxor_assign(¬_mask); + dest.bitand_assign(result); } } - _ => return Ok(None), - }; + _ => unreachable!(), + } + } else if let Some(ref mut result_) = result { + if let FtsFilter::Not = state.op { + result_.bitxor_assign(¬_mask); + } + state.bm = result; + } else if let FtsFilter::Not = state.op { + state.bm = Some(not_mask.clone()); + } else { + state.bm = Some(RoaringBitmap::new()); } - Ok(Some(bitmaps)) + // And short circuit + if matches!(state.op, FtsFilter::And) && state.bm.as_ref().unwrap().is_empty() { + while let Some(filter) = filters.peek() { + if matches!(filter, FtsFilter::End) { + break; + } else { + filters.next(); + } + } + } } + + Ok(state.bm.unwrap_or_default()) } - async fn get_bitmaps_union<T: AsRef<[u8]> + Sync + Send>( + async fn get_bitmaps_union( &self, - keys: Vec<BitmapKey<T>>, + keys: Vec<BitmapKey<BitmapClass>>, ) -> crate::Result<Option<RoaringBitmap>> { let mut bm = RoaringBitmap::new(); @@ -165,3 +237,12 @@ pub trait StoreFts: StoreRead { Ok(if !bm.is_empty() { Some(bm) } else { None }) } } + +impl<T: Into<u8> + Display + Clone + std::fmt::Debug> From<FtsFilter<T>> for State<T> { + fn from(value: FtsFilter<T>) -> Self { + Self { + op: value, + bm: None, + } + } +} diff --git a/crates/store/src/fts/search_snippet.rs b/crates/store/src/fts/search_snippet.rs deleted file mode 100644 index 55d6b6b7..00000000 --- a/crates/store/src/fts/search_snippet.rs +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use super::term_index::Term; - -fn escape_char(c: char, string: &mut String) { - match c { - '&' => string.push_str("&"), - '<' => string.push_str("<"), - '>' => string.push_str(">"), - '"' => string.push_str("""), - '\n' | '\r' => string.push(' '), - _ => string.push(c), - } -} - -fn escape_char_len(c: char) -> usize { - match c { - '&' => "&".len(), - '<' => "<".len(), - '>' => ">".len(), - '"' => """.len(), - '\r' | '\n' => 1, - _ => c.len_utf8(), - } -} - -pub fn generate_snippet(terms: &[Term], text: &str) -> Option<String> { - let mut snippet = String::with_capacity(text.len()); - let start_offset = terms.get(0)?.offset as usize; - - if start_offset > 0 { - let mut word_count = 0; - let mut from_offset = 0; - let mut last_is_space = false; - - if text.len() > 240 { - for (pos, char) in text.get(0..start_offset)?.char_indices().rev() { - // Add up to 2 words or 40 characters of context - if char.is_whitespace() { - if !last_is_space { - word_count += 1; - if word_count == 3 { - break; - } - last_is_space = true; - } - } else { - last_is_space = false; - } - from_offset = pos; - if start_offset - from_offset >= 40 { - break; - } - } - } - - last_is_space = false; - for char in text.get(from_offset..start_offset)?.chars() { - if !char.is_whitespace() { - last_is_space = false; - } else { - if last_is_space { - continue; - } - last_is_space = true; - } - escape_char(char, &mut snippet); - } - } - - let mut terms = terms.iter().peekable(); - - 'outer: while let Some(term) = terms.next() { - if snippet.len() + ("<mark>".len() * 2) + term.len as usize + 1 > 255 { - break; - } - - snippet.push_str("<mark>"); - snippet.push_str(text.get(term.offset as usize..term.offset as usize + term.len as usize)?); - snippet.push_str("</mark>"); - - let next_offset = if let Some(next_term) = terms.peek() { - next_term.offset as usize - } else { - text.len() - }; - - let mut last_is_space = false; - for char in text - .get(term.offset as usize + term.len as usize..next_offset)? - .chars() - { - if !char.is_whitespace() { - last_is_space = false; - } else { - if last_is_space { - continue; - } - last_is_space = true; - } - - if snippet.len() + escape_char_len(char) <= 255 { - escape_char(char, &mut snippet); - } else { - break 'outer; - } - } - } - - Some(snippet) -} - -#[cfg(test)] -mod tests { - - use nlp::language::Language; - - use crate::{ - fts::term_index::{TermIndex, TermIndexBuilder}, - Deserialize, Serialize, - }; - - use super::*; - - #[test] - fn search_snippets() { - let inputs = [ - (vec![ - "Help a friend from Abidjan Côte d'Ivoire", - concat!( - "When my mother died when she was given birth to me, my father took me so ", - "special because I am motherless. Before the death of my late father on 22nd June ", - "2013 in a private hospital here in Abidjan Côte d'Ivoire. He secretly called me on his ", - "bedside and told me that he has a sum of $7.5M (Seven Million five Hundred ", - "Thousand Dollars) left in a suspense account in a local bank here in Abidjan Côte ", - "d'Ivoire, that he used my name as his only daughter for the next of kin in deposit of ", - "the fund. ", - "I am 24year old. Dear I am honorably seeking your assistance in the following ways. ", - "1) To provide any bank account where this money would be transferred into. ", - "2) To serve as the guardian of this fund. ", - "3) To make arrangement for me to come over to your country to further my ", - "education and to secure a residential permit for me in your country. ", - "Moreover, I am willing to offer you 30 percent of the total sum as compensation for ", - "your effort input after the successful transfer of this fund to your nominated ", - "account overseas." - )], - vec![ - ( - vec!["côte"], - vec![ - "Help a friend from Abidjan <mark>Côte</mark> d'Ivoire", - concat!( - "in Abidjan <mark>Côte</mark> d'Ivoire. He secretly called me on his bedside ", - "and told me that he has a sum of $7.5M (Seven Million five Hundred Thousand ", - "Dollars) left in a suspense account in a local bank here in Abidjan ", - "<mark>Côte</mark> d'Ivoire, that ") - ] - ), - ( - vec!["your", "country"], - vec![ - concat!( - "honorably seeking <mark>your</mark> assistance in the following ways. ", - "1) To provide any bank account where this money would be transferred into. 2) ", - "To serve as the guardian of this fund. 3) To make arrangement for me to come ", - "over to <mark>your</mark> " - )] - ), - ( - vec!["overseas"], - vec![ - "nominated account <mark>overseas</mark>." - ] - ), - - ], - ), - (vec![ - "孫子兵法", - concat!( - "<\"孫子兵法:\">", - "孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。", - "孫子曰:凡用兵之法,馳車千駟,革車千乘,帶甲十萬;千里饋糧,則內外之費賓客之用,膠漆之材,", - "車甲之奉,日費千金,然後十萬之師舉矣。", - "孫子曰:凡用兵之法,全國為上,破國次之;全旅為上,破旅次之;全卒為上,破卒次之;全伍為上,破伍次之。", - "是故百戰百勝,非善之善者也;不戰而屈人之兵,善之善者也。", - "孫子曰:昔之善戰者,先為不可勝,以待敵之可勝,不可勝在己,可勝在敵。故善戰者,能為不可勝,不能使敵必可勝。", - "故曰:勝可知,而不可為。", - "兵者,詭道也。故能而示之不能,用而示之不用,近而示之遠,遠而示之近。利而誘之,亂而取之,實而備之,強而避之,", - "怒而撓之,卑而驕之,佚而勞之,親而離之。攻其無備,出其不意,此兵家之勝,不可先傳也。", - "夫未戰而廟算勝者,得算多也;未戰而廟算不勝者,得算少也;多算勝,少算不勝,而況於無算乎?吾以此觀之,勝負見矣。", - "孫子曰:凡治眾如治寡,分數是也。鬥眾如鬥寡,形名是也。三軍之眾,可使必受敵而無敗者,奇正是也。兵之所加,", - "如以碬投卵者,虛實是也。", - )], - vec![ - ( - vec!["孫子兵法"], - vec![ - "<mark>孫子兵法</mark>", - concat!( - "<"<mark>孫子兵法</mark>:">孫子曰:兵者,國之大事,死生之地,存亡之道,", - "不可不察也。孫子曰:凡用兵之法,馳車千駟,革車千乘,帶甲十萬;千里饋糧,則內外之費賓客之用,膠"), - ] - ), - ( - vec!["孫子曰"], - vec![ - concat!( - "<"孫子兵法:"><mark>孫子曰</mark>:兵者,國之大事,死生之地,存亡之道,", - "不可不察也。<mark>孫子曰</mark>:凡用兵之法,馳車千駟,革車千乘,帶甲十萬;千里饋糧,則內外之費賓", - )] - ), - ], - ), - ]; - - for (parts, tests) in inputs { - let mut builder = TermIndexBuilder::new(); - - for (field_num, part) in parts.iter().enumerate() { - let mut terms = Vec::new(); - for token in Language::English.tokenize_text(part, 40) { - terms.push(builder.add_token(token)); - } - builder.add_terms(field_num as u8, 0, terms); - } - - let compressed_term_index = builder.serialize(); - let term_index = TermIndex::deserialize(&compressed_term_index[..]).unwrap(); - - for (match_words, snippets) in tests { - let mut match_terms = Vec::new(); - for word in &match_words { - match_terms.push(term_index.get_match_term(word, None)); - } - - let term_groups = term_index - .match_terms(&match_terms, None, false, true, true) - .unwrap() - .unwrap(); - - assert_eq!(term_groups.len(), snippets.len()); - - for (term_group, snippet) in term_groups.iter().zip(snippets.iter()) { - assert_eq!( - snippet, - &generate_snippet(&term_group.terms, parts[term_group.field_id as usize]) - .unwrap() - ); - } - } - } - } -} diff --git a/crates/store/src/fts/term_index.rs b/crates/store/src/fts/term_index.rs index 2b876578..3ec6e7e8 100644 --- a/crates/store/src/fts/term_index.rs +++ b/crates/store/src/fts/term_index.rs @@ -23,7 +23,7 @@ use std::{borrow::Cow, convert::TryInto}; -use crate::{Deserialize, Serialize}; +use crate::{Deserialize, Serialize, U32_LEN, U64_LEN}; use ahash::{AHashMap, AHashSet}; use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x}; |