Diffstat (limited to 'crates/store/src/fts')
-rw-r--r--  crates/store/src/fts/bloom.rs            257
-rw-r--r--  crates/store/src/fts/builder.rs          250
-rw-r--r--  crates/store/src/fts/index.rs            372
-rw-r--r--  crates/store/src/fts/mod.rs              211
-rw-r--r--  crates/store/src/fts/query.rs            291
-rw-r--r--  crates/store/src/fts/search_snippet.rs   275
-rw-r--r--  crates/store/src/fts/term_index.rs         2
7 files changed, 731 insertions(+), 927 deletions(-)
diff --git a/crates/store/src/fts/bloom.rs b/crates/store/src/fts/bloom.rs
deleted file mode 100644
index 6145a637..00000000
--- a/crates/store/src/fts/bloom.rs
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2023 Stalwart Labs Ltd.
- *
- * This file is part of the Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use std::{
- borrow::Cow,
- f64::consts::LN_2,
- hash::{Hash, Hasher},
-};
-
-use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
-use roaring::RoaringBitmap;
-use utils::codec::leb128::{Leb128Reader, Leb128Vec};
-
-use crate::{Deserialize, Error, Serialize};
-
-pub struct BloomFilter {
- m: u64,
- b: RoaringBitmap,
-}
-
-#[derive(Debug)]
-pub struct BloomHash {
- pub h: [u64; 7],
-}
-
-#[derive(Debug)]
-pub struct BloomHashGroup {
- pub h1: BloomHash,
- pub h2: Option<BloomHash>,
-}
-
-const AHASHER: ahash::RandomState = ahash::RandomState::with_seeds(
- 0xaf1f2242106c64b3,
- 0x60ca4cfb4b3ed0ce,
- 0xc7dbc0bb615e82b3,
- 0x520ad065378daf88,
-);
-lazy_static::lazy_static! {
- static ref SIPHASHER: siphasher::sip::SipHasher13 =
- siphasher::sip::SipHasher13::new_with_keys(0x56205cbdba8f02a6, 0xbd0dbc4bb06d687b);
-}
-
-const P: f64 = 0.01;
-
-impl BloomFilter {
- pub fn new(items: usize) -> Self {
- Self {
- m: if items > 0 {
- std::cmp::max(Self::estimate_m(items, P), 10240)
- } else {
- 0
- },
- b: RoaringBitmap::new(),
- }
- }
-
- fn from_params(m: u64, b: RoaringBitmap) -> Self {
- Self { m, b }
- }
-
- fn estimate_m(n: usize, p: f64) -> u64 {
- (((n as f64) * f64::ln(p) / (-8.0 * LN_2.powi(2))).ceil() as u64) * 8
- }
-
- #[allow(dead_code)]
- fn estimate_k(m: u64, n: usize) -> u32 {
- std::cmp::max(((m as f64) / (n as f64) * f64::ln(2.0f64)).ceil() as u32, 1)
- }
-
- pub fn insert(&mut self, hash: &BloomHash) {
- self.b.insert((hash.h[0] % self.m) as u32);
- self.b.insert((hash.h[1] % self.m) as u32);
- self.b.insert((hash.h[2] % self.m) as u32);
- self.b.insert((hash.h[3] % self.m) as u32);
- self.b.insert((hash.h[4] % self.m) as u32);
- self.b.insert((hash.h[5] % self.m) as u32);
- self.b.insert((hash.h[6] % self.m) as u32);
- }
-
- pub fn contains(&self, hash: &BloomHash) -> bool {
- self.b.contains((hash.h[0] % self.m) as u32)
- && self.b.contains((hash.h[1] % self.m) as u32)
- && self.b.contains((hash.h[2] % self.m) as u32)
- && self.b.contains((hash.h[3] % self.m) as u32)
- && self.b.contains((hash.h[4] % self.m) as u32)
- && self.b.contains((hash.h[5] % self.m) as u32)
- && self.b.contains((hash.h[6] % self.m) as u32)
- }
-
- pub fn is_subset(&self, other: &Self) -> bool {
- self.b.is_subset(&other.b)
- }
-
- pub fn is_empty(&self) -> bool {
- self.m == 0 || self.b.is_empty()
- }
-}
-
-pub trait BloomHasher {
- fn hash<T: Hash + AsRef<[u8]> + ?Sized>(item: &T) -> Self;
-}
-
-impl BloomHash {
- pub fn hash<T: Hash + AsRef<[u8]> + ?Sized>(item: &T) -> Self {
- let h1 = xxhash_rust::xxh3::xxh3_64(item.as_ref());
- let h2 = farmhash::hash64(item.as_ref());
- let h3 = AHASHER.hash_one(item);
- let mut sh = *SIPHASHER;
- sh.write(item.as_ref());
- let h4 = sh.finish();
-
- Self {
- h: [h1, h2, h3, h4, h1 ^ h2, h2 ^ h3, h3 ^ h4],
- }
- }
-}
-
-pub fn hash_token(item: &str) -> Vec<u8> {
- let h1 = xxhash_rust::xxh3::xxh3_64(item.as_ref()).to_le_bytes();
- let h2 = farmhash::hash64(item.as_ref()).to_le_bytes();
- let h3 = AHASHER.hash_one(item).to_le_bytes();
- let mut sh = *SIPHASHER;
- sh.write(item.as_ref());
- let h4 = sh.finish().to_le_bytes();
-
- match item.len() {
- 0..=8 => {
- let mut hash = Vec::with_capacity(6);
- hash.extend_from_slice(&h1[..2]);
- hash.extend_from_slice(&h2[..2]);
- hash.push(h3[0]);
- hash.push(h4[0]);
- hash
- }
- 9..=16 => {
- let mut hash = Vec::with_capacity(8);
- hash.extend_from_slice(&h1[..2]);
- hash.extend_from_slice(&h2[..2]);
- hash.extend_from_slice(&h3[..2]);
- hash.extend_from_slice(&h4[..2]);
- hash
- }
- 17..=32 => {
- let mut hash = Vec::with_capacity(12);
- hash.extend_from_slice(&h1[..3]);
- hash.extend_from_slice(&h2[..3]);
- hash.extend_from_slice(&h3[..3]);
- hash.extend_from_slice(&h4[..3]);
- hash
- }
- _ => {
- let mut hash = Vec::with_capacity(16);
- hash.extend_from_slice(&h1[..4]);
- hash.extend_from_slice(&h2[..4]);
- hash.extend_from_slice(&h3[..4]);
- hash.extend_from_slice(&h4[..4]);
- hash
- }
- }
-}
-
-impl From<&str> for BloomHash {
- fn from(s: &str) -> Self {
- Self::hash(&s)
- }
-}
-
-impl From<String> for BloomHash {
- fn from(s: String) -> Self {
- Self::hash(&s)
- }
-}
-
-impl From<&String> for BloomHash {
- fn from(s: &String) -> Self {
- Self::hash(&s)
- }
-}
-
-impl From<Cow<'_, str>> for BloomHash {
- fn from(s: Cow<'_, str>) -> Self {
- Self::hash(s.as_ref())
- }
-}
-
-impl From<Token<Cow<'_, str>>> for BloomHashGroup {
- fn from(t: Token<Cow<'_, str>>) -> Self {
- Self {
- h1: BloomHash::hash(t.word.as_ref()),
- h2: None,
- }
- }
-}
-
-impl From<StemmedToken<'_>> for BloomHashGroup {
- fn from(t: StemmedToken<'_>) -> Self {
- Self {
- h1: BloomHash::hash(t.word.as_ref()),
- h2: t.stemmed_word.map(|w| BloomHash::hash(&format!("{w}_"))),
- }
- }
-}
-
-impl From<Cow<'_, str>> for BloomHashGroup {
- fn from(t: Cow<'_, str>) -> Self {
- Self {
- h1: BloomHash::hash(t.as_ref()),
- h2: None,
- }
- }
-}
-
-impl Serialize for BloomFilter {
- fn serialize(self) -> Vec<u8> {
- let mut buf = Vec::with_capacity(U64_LEN + self.b.serialized_size());
- buf.push_leb128(self.m);
- let _ = self.b.serialize_into(&mut buf);
- buf
- }
-}
-
-impl Deserialize for BloomFilter {
- fn deserialize(bytes: &[u8]) -> crate::Result<Self> {
- let (m, pos) = bytes.read_leb128().ok_or_else(|| {
- Error::InternalError(
- "Failed to read 'm' value while deserializing bloom filter.".to_string(),
- )
- })?;
- RoaringBitmap::deserialize_unchecked_from(bytes.get(pos..).ok_or_else(|| {
- Error::InternalError(
- "Failed to read bitmap while deserializing bloom filter.".to_string(),
- )
- })?)
- .map_err(|err| Error::InternalError(format!("Failed to deserialize bloom filter: {err}.")))
- .map(|b| Self::from_params(m, b))
- }
-}
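Note: the deleted bloom.rs sized its filter for a fixed 1% false-positive target, m = -n·ln p / (ln 2)^2 bits, rounded up to whole bytes and floored at 10,240 bits, with seven probe positions derived from four base hashes (xxh3, farmhash, ahash, SipHash-1-3) plus three XOR combinations — a variant of the Kirsch–Mitzenmacher double-hashing trick. A standalone sketch of just the sizing formula, not the project's API:

```rust
// Sizing math from the deleted bloom.rs: bits for `n` items at a 1%
// false-positive target, rounded up to whole bytes (the original also
// floored the result at 10,240 bits).
const P: f64 = 0.01;

fn estimate_m(n: usize) -> u64 {
    (((n as f64) * P.ln() / (-8.0 * std::f64::consts::LN_2.powi(2))).ceil() as u64) * 8
}

fn main() {
    let m = estimate_m(1000);
    // ~9.6 bits per item for p = 0.01
    assert!(m >= 9585 && m % 8 == 0);
    println!("bits for 1000 items: {m}");
}
```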
diff --git a/crates/store/src/fts/builder.rs b/crates/store/src/fts/builder.rs
deleted file mode 100644
index f4a8422d..00000000
--- a/crates/store/src/fts/builder.rs
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (c) 2023 Stalwart Labs Ltd.
- *
- * This file is part of the Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use std::{borrow::Cow, collections::HashSet, fmt::Display};
-
-use ahash::AHashSet;
-use nlp::{
- language::{
- detect::{LanguageDetector, MIN_LANGUAGE_SCORE},
- stemmer::Stemmer,
- Language,
- },
- tokenizers::{space::SpaceTokenizer, Token},
-};
-use utils::map::vec_map::VecMap;
-
-use crate::{
- query::RawValue,
- write::{BatchBuilder, IntoOperations, Operation, ValueClass},
- Serialize, HASH_EXACT, HASH_STEMMED,
-};
-
-use super::term_index::{TermIndexBuilder, TokenIndex};
-
-pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 2) as usize;
-pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1;
-
-struct Text<'x, T: Into<u8> + Display> {
- field: T,
- text: Cow<'x, str>,
- language: Type,
-}
-
-enum Type {
- Stem(Language),
- Tokenize,
- Static,
-}
-
-pub struct FtsIndexBuilder<'x, T: Into<u8> + Display> {
- parts: Vec<Text<'x, T>>,
- default_language: Language,
-}
-
-impl<'x, T: Into<u8> + Display> FtsIndexBuilder<'x, T> {
- pub fn with_default_language(default_language: Language) -> FtsIndexBuilder<'x, T> {
- FtsIndexBuilder {
- parts: vec![],
- default_language,
- }
- }
-
- pub fn index(&mut self, field: T, text: impl Into<Cow<'x, str>>, language: Language) {
- self.parts.push(Text {
- field,
- text: text.into(),
- language: Type::Stem(language),
- });
- }
-
- pub fn index_raw(&mut self, field: T, text: impl Into<Cow<'x, str>>) {
- self.parts.push(Text {
- field,
- text: text.into(),
- language: Type::Tokenize,
- });
- }
-
- pub fn index_raw_token(&mut self, field: T, text: impl Into<Cow<'x, str>>) {
- self.parts.push(Text {
- field,
- text: text.into(),
- language: Type::Static,
- });
- }
-}
-
-impl<'x, T: Into<u8> + Display> IntoOperations for FtsIndexBuilder<'x, T> {
- fn build(self, batch: &mut BatchBuilder) {
- let mut detect = LanguageDetector::new();
- let mut tokens: VecMap<u8, AHashSet<String>> = VecMap::new();
- let mut parts = Vec::new();
-
- for text in self.parts {
- match text.language {
- Type::Stem(language) => {
- let language = if language == Language::Unknown {
- detect.detect(&text.text, MIN_LANGUAGE_SCORE)
- } else {
- language
- };
- parts.push((text.field, language, text.text));
- }
- Type::Tokenize => {
- let tokens = tokens.get_mut_or_insert(text.field.into());
- for token in SpaceTokenizer::new(text.text.as_ref(), MAX_TOKEN_LENGTH) {
- tokens.insert(token);
- }
- }
- Type::Static => {
- tokens
- .get_mut_or_insert(text.field.into())
- .insert(text.text.into_owned());
- }
- }
- }
-
- let default_language = detect
- .most_frequent_language()
- .unwrap_or(self.default_language);
- let mut term_index = TermIndexBuilder::new();
- let mut ops = AHashSet::new();
-
- for (part_id, (field, language, text)) in parts.into_iter().enumerate() {
- let language = if language != Language::Unknown {
- language
- } else {
- default_language
- };
- let mut terms = Vec::new();
- let field: u8 = field.into();
-
- for token in Stemmer::new(&text, language, MAX_TOKEN_LENGTH).collect::<Vec<_>>() {
- ops.insert(Operation::hash(&token.word, HASH_EXACT, field, true));
- if let Some(stemmed_word) = &token.stemmed_word {
- ops.insert(Operation::hash(stemmed_word, HASH_STEMMED, field, true));
- }
- terms.push(term_index.add_stemmed_token(token));
- }
-
- if !terms.is_empty() {
- term_index.add_terms(field, part_id as u32, terms);
- }
- }
-
- for (field, tokens) in tokens {
- let mut terms = Vec::with_capacity(tokens.len());
- for token in tokens {
- ops.insert(Operation::hash(&token, HASH_EXACT, field, true));
- terms.push(term_index.add_token(Token {
- word: token.into(),
- from: 0,
- to: 0,
- }));
- }
- term_index.add_terms(field, 0, terms);
- }
-
- for op in ops {
- batch.ops.push(op);
- }
-
- batch.ops.push(Operation::Value {
- class: ValueClass::Property {
- field: u8::MAX,
- family: u8::MAX,
- },
- set: term_index.serialize().into(),
- });
- }
-}
-
-impl TokenIndex {
- fn build_index(self, batch: &mut BatchBuilder, set: bool) {
- let mut ops = AHashSet::with_capacity(self.tokens.len() * 2);
- for term in self.terms {
- for (term_ids, is_exact) in [(term.exact_terms, true), (term.stemmed_terms, false)] {
- for term_id in term_ids {
- if let Some(word) = self.tokens.get(term_id as usize) {
- ops.insert(Operation::hash(
- word,
- if is_exact { HASH_EXACT } else { HASH_STEMMED },
- term.field_id,
- set,
- ));
- }
- }
- }
- }
- for op in ops {
- batch.ops.push(op);
- }
- }
-}
-
-impl IntoOperations for TokenIndex {
- fn build(self, batch: &mut BatchBuilder) {
- self.build_index(batch, false);
- batch.ops.push(Operation::Value {
- class: ValueClass::Property {
- field: u8::MAX,
- family: u8::MAX,
- },
- set: None,
- });
- }
-}
-
-impl IntoOperations for RawValue<TokenIndex> {
- fn build(self, batch: &mut BatchBuilder) {
- self.inner.build_index(batch, true);
- batch.ops.push(Operation::Value {
- class: ValueClass::Property {
- field: u8::MAX,
- family: u8::MAX,
- },
- set: self.raw.into(),
- });
- }
-}
-
-pub trait ToTokens {
- fn to_tokens(&self) -> HashSet<String>;
-}
-
-impl ToTokens for &str {
- fn to_tokens(&self) -> HashSet<String> {
- let mut tokens = HashSet::new();
- for token in SpaceTokenizer::new(self, MAX_TOKEN_LENGTH) {
- tokens.insert(token);
- }
- tokens
- }
-}
-
-impl ToTokens for &String {
- fn to_tokens(&self) -> HashSet<String> {
- self.as_str().to_tokens()
- }
-}
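Note: the deleted builder worked in two passes — it buffered all Type::Stem parts, ran language detection on those tagged Language::Unknown, and only then tokenized, so that undetectable parts fell back to the document's most frequent detected language rather than a fixed default. A toy, dependency-free sketch of that fallback rule, with detection itself stubbed out as precomputed results:

```rust
use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
enum Language {
    Unknown,
    English,
    Spanish,
}

fn main() {
    // Detection results gathered during the first pass (stubbed).
    let detected = [Language::English, Language::English, Language::Spanish];
    let mut counts: HashMap<Language, u32> = HashMap::new();
    for lang in detected {
        *counts.entry(lang).or_insert(0) += 1;
    }
    // most_frequent_language(), falling back to the configured default.
    let default = counts
        .into_iter()
        .max_by_key(|&(_, n)| n)
        .map(|(lang, _)| lang)
        .unwrap_or(Language::English);
    // Second pass: parts still tagged Unknown inherit that language.
    let part_language = Language::Unknown;
    let effective = if part_language != Language::Unknown {
        part_language
    } else {
        default
    };
    assert_eq!(effective, Language::English);
}
```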
diff --git a/crates/store/src/fts/index.rs b/crates/store/src/fts/index.rs
new file mode 100644
index 00000000..1493fdcd
--- /dev/null
+++ b/crates/store/src/fts/index.rs
@@ -0,0 +1,372 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::{borrow::Cow, fmt::Display};
+
+use ahash::{AHashMap, AHashSet};
+use nlp::{
+ language::{
+ detect::{LanguageDetector, MIN_LANGUAGE_SCORE},
+ stemmer::Stemmer,
+ Language,
+ },
+ tokenizers::word::WordTokenizer,
+};
+
+use crate::{
+ backend::MAX_TOKEN_LENGTH,
+ write::{
+ hash::TokenType, key::KeySerializer, BatchBuilder, BitmapClass, BitmapHash, Operation,
+ ValueClass,
+ },
+ Deserialize, Error, Store, ValueKey, U64_LEN,
+};
+
+use super::Field;
+
+#[derive(Debug)]
+struct Text<'x, T: Into<u8> + Display + Clone + std::fmt::Debug> {
+ field: Field<T>,
+ text: Cow<'x, str>,
+ typ: Type,
+}
+
+#[derive(Debug)]
+enum Type {
+ Text(Language),
+ Tokenize,
+ Keyword,
+}
+
+#[derive(Debug)]
+pub struct FtsDocument<'x, T: Into<u8> + Display + Clone + std::fmt::Debug> {
+ parts: Vec<Text<'x, T>>,
+ default_language: Language,
+ account_id: u32,
+ collection: u8,
+ document_id: u32,
+}
+
+impl<'x, T: Into<u8> + Display + Clone + std::fmt::Debug> FtsDocument<'x, T> {
+ pub fn with_default_language(default_language: Language) -> FtsDocument<'x, T> {
+ FtsDocument {
+ parts: vec![],
+ default_language,
+ account_id: 0,
+ document_id: 0,
+ collection: 0,
+ }
+ }
+
+ pub fn with_account_id(mut self, account_id: u32) -> Self {
+ self.account_id = account_id;
+ self
+ }
+
+ pub fn with_document_id(mut self, document_id: u32) -> Self {
+ self.document_id = document_id;
+ self
+ }
+
+ pub fn with_collection(mut self, collection: impl Into<u8>) -> Self {
+ self.collection = collection.into();
+ self
+ }
+
+ pub fn index(&mut self, field: Field<T>, text: impl Into<Cow<'x, str>>, language: Language) {
+ self.parts.push(Text {
+ field,
+ text: text.into(),
+ typ: Type::Text(language),
+ });
+ }
+
+ pub fn index_tokenized(&mut self, field: Field<T>, text: impl Into<Cow<'x, str>>) {
+ self.parts.push(Text {
+ field,
+ text: text.into(),
+ typ: Type::Tokenize,
+ });
+ }
+
+ pub fn index_keyword(&mut self, field: Field<T>, text: impl Into<Cow<'x, str>>) {
+ self.parts.push(Text {
+ field,
+ text: text.into(),
+ typ: Type::Keyword,
+ });
+ }
+}
+
+impl<T: Into<u8> + Display + Clone + std::fmt::Debug> From<Field<T>> for u8 {
+ fn from(value: Field<T>) -> Self {
+ match value {
+ Field::Body => 0,
+ Field::Attachment => 1,
+ Field::Keyword => 2,
+ Field::Header(value) => 3 + value.into(),
+ }
+ }
+}
+
+impl Store {
+ pub async fn fts_index<T: Into<u8> + Display + Clone + std::fmt::Debug>(
+ &self,
+ document: FtsDocument<'_, T>,
+ ) -> crate::Result<()> {
+ let mut detect = LanguageDetector::new();
+ let mut tokens: AHashMap<BitmapHash, AHashSet<u8>> = AHashMap::new();
+ let mut parts = Vec::new();
+
+ for text in document.parts {
+ match text.typ {
+ Type::Text(language) => {
+ let language = if language == Language::Unknown {
+ detect.detect(&text.text, MIN_LANGUAGE_SCORE)
+ } else {
+ language
+ };
+ parts.push((text.field, language, text.text));
+ }
+ Type::Tokenize => {
+ let field = u8::from(text.field);
+ for token in WordTokenizer::new(text.text.as_ref(), MAX_TOKEN_LENGTH) {
+ tokens
+ .entry(BitmapHash::new(token.word.as_ref()))
+ .or_default()
+ .insert(TokenType::word(field));
+ }
+ }
+ Type::Keyword => {
+ let field = u8::from(text.field);
+ tokens
+ .entry(BitmapHash::new(text.text.as_ref()))
+ .or_default()
+ .insert(TokenType::word(field));
+ }
+ }
+ }
+
+ let default_language = detect
+ .most_frequent_language()
+ .unwrap_or(document.default_language);
+
+ for (field, language, text) in parts.into_iter() {
+ let language = if language != Language::Unknown {
+ language
+ } else {
+ default_language
+ };
+ let field: u8 = field.into();
+
+ let mut last_token = Cow::Borrowed("");
+ for token in Stemmer::new(&text, language, MAX_TOKEN_LENGTH) {
+ if !last_token.is_empty() {
+ tokens
+ .entry(BitmapHash::new(&format!("{} {}", last_token, token.word)))
+ .or_default()
+ .insert(TokenType::bigram(field));
+ }
+
+ tokens
+ .entry(BitmapHash::new(token.word.as_ref()))
+ .or_default()
+ .insert(TokenType::word(field));
+
+ if let Some(stemmed_word) = token.stemmed_word {
+ tokens
+ .entry(BitmapHash::new(stemmed_word.as_ref()))
+ .or_default()
+ .insert(TokenType::stemmed(field));
+ }
+
+ last_token = token.word;
+ }
+ }
+
+ if tokens.is_empty() {
+ return Ok(());
+ }
+
+ // Serialize tokens
+ let mut serializer = KeySerializer::new(tokens.len() * U64_LEN * 2);
+ let mut keys = Vec::with_capacity(tokens.len());
+
+ for (hash, fields) in tokens.into_iter() {
+ serializer = serializer
+ .write(hash.hash.as_slice())
+ .write(hash.len)
+ .write(fields.len() as u8);
+ for field in fields.into_iter() {
+ serializer = serializer.write(field);
+ keys.push(Operation::Bitmap {
+ class: BitmapClass::Text { field, token: hash },
+ set: true,
+ });
+ }
+ }
+
+ // Write term index
+ let mut batch = BatchBuilder::new();
+ batch
+ .with_account_id(document.account_id)
+ .with_collection(document.collection)
+ .update_document(document.document_id)
+ .set(
+ ValueClass::TermIndex,
+ lz4_flex::compress_prepend_size(&serializer.finalize()),
+ );
+ self.write(batch.build()).await?;
+ let mut batch = BatchBuilder::new();
+ batch
+ .with_account_id(document.account_id)
+ .with_collection(document.collection)
+ .update_document(document.document_id);
+
+ for (pos, key) in keys.into_iter().enumerate() {
+ if pos > 0 && pos & 1023 == 0 {
+ self.write(batch.build()).await?;
+ batch = BatchBuilder::new();
+ batch
+ .with_account_id(document.account_id)
+ .with_collection(document.collection)
+ .update_document(document.document_id);
+ }
+ batch.ops.push(key);
+ }
+
+ if !batch.is_empty() {
+ self.write(batch.build()).await?;
+ }
+
+ Ok(())
+ }
+
+ pub async fn fts_remove(
+ &self,
+ account_id: u32,
+ collection: u8,
+ document_id: u32,
+ ) -> crate::Result<bool> {
+ // Obtain term index
+ let term_index = if let Some(term_index) = self
+ .get_value::<TermIndex>(ValueKey {
+ account_id,
+ collection,
+ document_id,
+ class: ValueClass::TermIndex,
+ })
+ .await?
+ {
+ term_index
+ } else {
+ return Ok(false);
+ };
+
+ // Remove keys
+ let mut batch = BatchBuilder::new();
+ batch
+ .with_account_id(account_id)
+ .with_collection(collection)
+ .update_document(document_id);
+
+ for (pos, key) in term_index.ops.into_iter().enumerate() {
+ if pos > 0 && pos & 1023 == 0 {
+ self.write(batch.build()).await?;
+ batch = BatchBuilder::new();
+ batch
+ .with_account_id(account_id)
+ .with_collection(collection)
+ .update_document(document_id);
+ }
+ batch.ops.push(key);
+ }
+
+ if !batch.is_empty() {
+ self.write(batch.build()).await?;
+ }
+
+ // Remove term index
+ let mut batch = BatchBuilder::new();
+ batch
+ .with_account_id(account_id)
+ .with_collection(collection)
+ .update_document(document_id)
+ .clear(ValueClass::TermIndex);
+
+ self.write(batch.build()).await?;
+
+ Ok(true)
+ }
+
+ pub async fn fts_remove_all(&self, _: u32) -> crate::Result<()> {
+ // No-op
+ // Term indexes are stored in the same key range as the document
+
+ Ok(())
+ }
+}
+
+struct TermIndex {
+ ops: Vec<Operation>,
+}
+
+impl Deserialize for TermIndex {
+ fn deserialize(bytes: &[u8]) -> crate::Result<Self> {
+ let bytes = lz4_flex::decompress_size_prepended(bytes)
+ .map_err(|_| Error::InternalError("Failed to decompress term index".to_string()))?;
+ let mut ops = Vec::new();
+ let mut bytes = bytes.iter().peekable();
+
+ while bytes.peek().is_some() {
+ let mut hash = BitmapHash {
+ hash: [0; 8],
+ len: 0,
+ };
+
+ for byte in hash.hash.iter_mut() {
+ *byte = *bytes.next().ok_or(Error::InternalError(
+ "Unexpected EOF reading term index".to_string(),
+ ))?;
+ }
+
+ hash.len = *bytes.next().ok_or(Error::InternalError(
+ "Unexpected EOF reading term index".to_string(),
+ ))?;
+ let num_fields = *bytes.next().ok_or(Error::InternalError(
+ "Unexpected EOF reading term index".to_string(),
+ ))?;
+ for _ in 0..num_fields {
+ let field = *bytes.next().ok_or(Error::InternalError(
+ "Unexpected EOF reading term index".to_string(),
+ ))?;
+ ops.push(Operation::Bitmap {
+ class: BitmapClass::Text { field, token: hash },
+ set: false,
+ });
+ }
+ }
+
+ Ok(Self { ops })
+ }
+}
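Note: the new index.rs replaces the bloom filters and positional term index with per-token bitmap operations: each token is hashed into a BitmapHash, tagged per field as word, stemmed, or bigram, and the (hash, len, fields) triples are LZ4-compressed into ValueClass::TermIndex so that fts_remove can later replay the same operations with set: false. Phrase support now comes from indexing consecutive token pairs; a minimal sketch of that bigram pass, with hashing and storage stubbed out:

```rust
// Consecutive token pairs are indexed as a single "a b" token so that
// fts_query can later match exact phrases with the same windows(2) walk.
fn bigrams(tokens: &[&str]) -> Vec<String> {
    tokens
        .windows(2)
        .map(|pair| format!("{} {}", pair[0], pair[1]))
        .collect()
}

fn main() {
    let tokens = ["full", "text", "search"];
    assert_eq!(bigrams(&tokens), vec!["full text", "text search"]);
}
```

Writes are also chunked: the batch is flushed after every 1,024 bitmap operations to keep individual transactions small.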
diff --git a/crates/store/src/fts/mod.rs b/crates/store/src/fts/mod.rs
index 8761f076..cb38f1e9 100644
--- a/crates/store/src/fts/mod.rs
+++ b/crates/store/src/fts/mod.rs
@@ -21,55 +21,188 @@
* for more details.
*/
-use crate::{
- write::{BitmapFamily, Operation},
- BitmapKey, Serialize, BM_HASH,
-};
+use std::fmt::Display;
-use self::{bloom::hash_token, builder::MAX_TOKEN_MASK};
+use nlp::language::Language;
-pub mod bloom;
-pub mod builder;
+pub mod index;
pub mod query;
-pub mod search_snippet;
-pub mod term_index;
-
-impl BitmapKey<Vec<u8>> {
- pub fn hash(word: &str, account_id: u32, collection: u8, family: u8, field: u8) -> Self {
- BitmapKey {
- account_id,
- collection,
- family: BM_HASH | family | (word.len() & MAX_TOKEN_MASK) as u8,
- field,
- block_num: 0,
- key: hash_token(word),
+
+#[derive(Clone, Debug)]
+pub enum Field<T: Into<u8> + Display + Clone + std::fmt::Debug> {
+ Header(T),
+ Body,
+ Attachment,
+ Keyword,
+}
+
+#[derive(Debug)]
+pub enum FtsFilter<T: Into<u8> + Display + Clone + std::fmt::Debug> {
+ Exact {
+ field: Field<T>,
+ text: String,
+ language: Language,
+ },
+ Contains {
+ field: Field<T>,
+ text: String,
+ language: Language,
+ },
+ Keyword {
+ field: Field<T>,
+ text: String,
+ },
+ And,
+ Or,
+ Not,
+ End,
+}
+
+impl<T: Into<u8> + Display + Clone + std::fmt::Debug> FtsFilter<T> {
+ pub fn has_text_detect(
+ field: Field<T>,
+ text: impl Into<String>,
+ default_language: Language,
+ ) -> Self {
+ let (text, language) = Language::detect(text.into(), default_language);
+ Self::has_text(field, text, language)
+ }
+
+ pub fn has_text(field: Field<T>, text: impl Into<String>, language: Language) -> Self {
+ let text = text.into();
+ if !matches!(language, Language::None) && (text.starts_with('"') && text.ends_with('"'))
+ || (text.starts_with('\'') && text.ends_with('\''))
+ {
+ FtsFilter::Exact {
+ field,
+ text,
+ language,
+ }
+ } else {
+ FtsFilter::Contains {
+ field,
+ text,
+ language,
+ }
}
}
- pub fn value(
- account_id: u32,
- collection: impl Into<u8>,
- field: impl Into<u8>,
- value: impl BitmapFamily + Serialize,
- ) -> Self {
- BitmapKey {
- account_id,
- collection: collection.into(),
- family: value.family(),
- field: field.into(),
- block_num: 0,
- key: value.serialize(),
+ pub fn has_keyword(field: Field<T>, text: impl Into<String>) -> Self {
+ FtsFilter::Keyword {
+ field,
+ text: text.into(),
}
}
+
+ pub fn has_english_text(field: Field<T>, text: impl Into<String>) -> Self {
+ Self::has_text(field, text, Language::English)
+ }
}
-impl Operation {
- pub fn hash(word: &str, family: u8, field: u8, set: bool) -> Self {
- Operation::Bitmap {
- family: BM_HASH | family | (word.len() & MAX_TOKEN_MASK) as u8,
- field,
- key: hash_token(word),
- set,
+#[derive(Clone, Copy)]
+pub enum FilterType {
+ And,
+ Or,
+ Not,
+ End,
+ Store,
+ Fts,
+}
+
+pub enum FilterGroup<T: FilterItem> {
+ Fts(Vec<T>),
+ Store(T),
+}
+
+pub trait FilterItem: Clone {
+ fn filter_type(&self) -> FilterType;
+}
+
+pub trait IntoFilterGroup<T: FilterItem + From<FilterType>> {
+ fn into_filter_group(self) -> Vec<FilterGroup<T>>;
+}
+
+impl<T: FilterItem + From<FilterType>> IntoFilterGroup<T> for Vec<T> {
+ fn into_filter_group(self) -> Vec<FilterGroup<T>> {
+ let mut filter = Vec::with_capacity(self.len());
+ let mut iter = self.into_iter();
+ let mut logical_op = None;
+
+ while let Some(item) = iter.next() {
+ if matches!(item.filter_type(), FilterType::Fts) {
+ let mut store_item = None;
+ let mut depth = 0;
+ let mut fts = Vec::with_capacity(5);
+
+ // Add the logical operator if there is one
+ let in_logical_op = if let Some(op) = logical_op.take() {
+ fts.push(op);
+ true
+ } else {
+ false
+ };
+ fts.push(item);
+
+ for item in iter.by_ref() {
+ match item.filter_type() {
+ FilterType::And | FilterType::Or | FilterType::Not => {
+ depth += 1;
+ fts.push(item);
+ }
+ FilterType::End if depth > 0 => {
+ depth -= 1;
+ fts.push(item);
+ }
+ FilterType::Fts => {
+ fts.push(item);
+ }
+ _ => {
+ store_item = Some(item);
+ break;
+ }
+ }
+ }
+
+ if in_logical_op {
+ fts.push(T::from(FilterType::End));
+ }
+
+ if depth > 0 {
+ let mut store = Vec::with_capacity(depth * 2);
+ while depth > 0 {
+ let item = fts.pop().unwrap();
+ if matches!(
+ item.filter_type(),
+ FilterType::And | FilterType::Or | FilterType::Not
+ ) {
+ depth -= 1;
+ }
+ store.push(FilterGroup::Store(item));
+ }
+
+ filter.push(FilterGroup::Fts(fts));
+ filter.extend(store);
+ } else {
+ filter.push(FilterGroup::Fts(fts));
+ }
+
+ if let Some(item) = store_item {
+ filter.push(FilterGroup::Store(item));
+ }
+ } else {
+ match item.filter_type() {
+ FilterType::And | FilterType::Or => {
+ logical_op = Some(item.clone());
+ }
+ FilterType::Not => {
+ logical_op = Some(T::from(FilterType::And));
+ }
+ _ => {}
+ }
+ filter.push(FilterGroup::Store(item));
+ }
}
+
+ filter
}
}
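Note: FtsFilter expressions are flat prefix sequences — And/Or/Not open a group that the matching End closes, and the top level behaves as an implicit And. A toy evaluator over plain booleans, mirroring the stack discipline that fts_query applies to RoaringBitmaps (in the real code, Not is resolved by XOR-ing each operand against the mask of all document ids):

```rust
enum Expr {
    Val(bool),
    And,
    Or,
    Not,
    End,
}

fn eval(filters: Vec<Expr>) -> bool {
    // (op, accumulator) frames; 0 = And, 1 = Or, 2 = Not.
    // The top level is an implicit And, as in fts_query.
    let mut stack: Vec<(u8, Option<bool>)> = Vec::new();
    let mut state: (u8, Option<bool>) = (0, None);
    for f in filters {
        let result = match f {
            Expr::Val(v) => v,
            Expr::And => { stack.push(state); state = (0, None); continue; }
            Expr::Or => { stack.push(state); state = (1, None); continue; }
            Expr::Not => { stack.push(state); state = (2, None); continue; }
            Expr::End => {
                // Close the current group and feed its value to the parent.
                let inner = state.1.unwrap_or(false);
                state = stack.pop().expect("unbalanced End");
                inner
            }
        };
        // Fold `result` into the current frame with its operator.
        let acc = state.1.unwrap_or(matches!(state.0, 0 | 2));
        state.1 = Some(match state.0 {
            0 => acc && result,
            1 => acc || result,
            _ => acc && !result, // Not accumulates the AND of complements
        });
    }
    state.1.unwrap_or(false)
}

fn main() {
    // body:hello AND (keyword:$seen OR keyword:$flagged)
    let expr = vec![
        Expr::Val(true),
        Expr::Or,
        Expr::Val(false),
        Expr::Val(true),
        Expr::End,
    ];
    assert!(eval(expr));
}
```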
diff --git a/crates/store/src/fts/query.rs b/crates/store/src/fts/query.rs
index 37938e3f..6be67a75 100644
--- a/crates/store/src/fts/query.rs
+++ b/crates/store/src/fts/query.rs
@@ -21,138 +21,210 @@
* for more details.
*/
-use std::ops::BitOrAssign;
+use std::{
+ fmt::Display,
+ ops::{BitAndAssign, BitOrAssign, BitXorAssign},
+};
-use nlp::language::{stemmer::Stemmer, Language};
+use nlp::language::stemmer::Stemmer;
use roaring::RoaringBitmap;
-use crate::{fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ValueKey, HASH_EXACT, HASH_STEMMED};
+use crate::{backend::MAX_TOKEN_LENGTH, fts::FtsFilter, write::BitmapClass, BitmapKey, Store};
-use super::term_index::TermIndex;
+struct State<T: Into<u8> + Display + Clone + std::fmt::Debug> {
+ pub op: FtsFilter<T>,
+ pub bm: Option<RoaringBitmap>,
+}
-#[async_trait::async_trait]
-pub trait StoreFts: StoreRead {
- async fn fts_query(
- &mut self,
+impl Store {
+ pub async fn fts_query<T: Into<u8> + Display + Clone + std::fmt::Debug>(
+ &self,
account_id: u32,
- collection: u8,
- field: u8,
- text: &str,
- language: Language,
- match_phrase: bool,
- ) -> crate::Result<Option<RoaringBitmap>> {
- if match_phrase {
- let mut phrase = Vec::new();
- let mut bit_keys = Vec::new();
- for token in language.tokenize_text(text, MAX_TOKEN_LENGTH) {
- let key = BitmapKey::hash(
- token.word.as_ref(),
- account_id,
- collection,
- HASH_EXACT,
+ collection: impl Into<u8>,
+ filters: Vec<FtsFilter<T>>,
+ ) -> crate::Result<RoaringBitmap> {
+ let collection = collection.into();
+ let mut not_mask = RoaringBitmap::new();
+ let mut not_fetch = false;
+
+ let mut state: State<T> = FtsFilter::And.into();
+ let mut stack = Vec::new();
+ let mut filters = filters.into_iter().peekable();
+
+ while let Some(filter) = filters.next() {
+ let mut result = match filter {
+ FtsFilter::Exact {
field,
- );
- if !bit_keys.contains(&key) {
- bit_keys.push(key);
+ text,
+ language,
+ } => {
+ let field: u8 = field.clone().into();
+
+ let tokens = language
+ .tokenize_text(text.as_ref(), MAX_TOKEN_LENGTH)
+ .map(|t| t.word)
+ .collect::<Vec<_>>();
+ let keys = if tokens.len() > 1 {
+ tokens
+ .windows(2)
+ .map(|bg| BitmapKey {
+ account_id,
+ collection,
+ class: BitmapClass::bigram(format!("{} {}", bg[0], bg[1]), field),
+ block_num: 0,
+ })
+ .collect::<Vec<_>>()
+ } else {
+ tokens
+ .into_iter()
+ .map(|word| BitmapKey {
+ account_id,
+ collection,
+ class: BitmapClass::word(word.as_ref(), field),
+ block_num: 0,
+ })
+ .collect::<Vec<_>>()
+ };
+
+ self.get_bitmaps_intersection(keys).await?
}
+ FtsFilter::Contains {
+ field,
+ text,
+ language,
+ } => {
+ let mut result = RoaringBitmap::new();
+ let field: u8 = field.clone().into();
- phrase.push(token.word);
- }
- let bitmaps = match self.get_bitmaps_intersection(bit_keys).await? {
- Some(b) if !b.is_empty() => b,
- _ => return Ok(None),
- };
+ for token in Stemmer::new(text.as_ref(), language, MAX_TOKEN_LENGTH) {
+ let token1 = BitmapKey {
+ account_id,
+ collection,
+ class: BitmapClass::word(token.word.as_ref(), field),
+ block_num: 0,
+ };
+ let token2 = BitmapKey {
+ account_id,
+ collection,
+ class: BitmapClass::stemmed(
+ if let Some(stemmed_word) = token.stemmed_word {
+ stemmed_word
+ } else {
+ token.word
+ }
+ .as_ref(),
+ field,
+ ),
+ block_num: 0,
+ };
- match phrase.len() {
- 0 => return Ok(None),
- 1 => return Ok(Some(bitmaps)),
- _ => (),
- }
+ match self.get_bitmaps_union(vec![token1, token2]).await? {
+ Some(b) if !b.is_empty() => {
+ if !result.is_empty() {
+ result &= b;
+ if result.is_empty() {
+ break;
+ }
+ } else {
+ result = b;
+ }
+ }
+ _ => break,
+ }
+ }
- let mut results = RoaringBitmap::new();
- for document_id in bitmaps {
- if let Some(term_index) = self
- .get_value::<TermIndex>(ValueKey::term_index(
+ if !result.is_empty() {
+ Some(result)
+ } else {
+ None
+ }
+ }
+ FtsFilter::Keyword { field, text } => {
+ self.get_bitmap(BitmapKey {
account_id,
collection,
- document_id,
- ))
+ class: BitmapClass::word(text, field),
+ block_num: 0,
+ })
.await?
- {
- if term_index
- .match_terms(
- &phrase
- .iter()
- .map(|w| term_index.get_match_term(w, None))
- .collect::<Vec<_>>(),
- field.into(),
- true,
- false,
- false,
- )
- .map_err(|e| {
- crate::Error::InternalError(format!(
- "TermIndex match_terms failed for {account_id}/{collection}/{document_id}: {e:?}"
- ))
- })?
- .is_some()
- {
- results.insert(document_id);
+ }
+ op @ (FtsFilter::And | FtsFilter::Or | FtsFilter::Not) => {
+ stack.push(state);
+ state = op.into();
+ continue;
+ }
+ FtsFilter::End => {
+ if let Some(prev_state) = stack.pop() {
+ let bm = state.bm;
+ state = prev_state;
+ bm
+ } else {
+ break;
}
- } else {
- tracing::debug!(
- event = "error",
- context = "fts_query",
- account_id = account_id,
- collection = collection,
- document_id = document_id,
- "Document is missing a term index",
- );
}
- }
+ };
- if !results.is_empty() {
- Ok(Some(results))
- } else {
- Ok(None)
+ // Only fetch not mask if we need it
+ if matches!(state.op, FtsFilter::Not) && !not_fetch {
+ not_mask = self
+ .get_bitmap(BitmapKey::document_ids(account_id, collection))
+ .await?
+ .unwrap_or_else(RoaringBitmap::new);
+ not_fetch = true;
}
- } else {
- let mut bitmaps = RoaringBitmap::new();
-
- for token in Stemmer::new(text, language, MAX_TOKEN_LENGTH) {
- let token1 =
- BitmapKey::hash(&token.word, account_id, collection, HASH_EXACT, field);
- let token2 = if let Some(stemmed_word) = token.stemmed_word {
- BitmapKey::hash(&stemmed_word, account_id, collection, HASH_STEMMED, field)
- } else {
- let mut token2 = token1.clone();
- token2.family &= !HASH_EXACT;
- token2.family |= HASH_STEMMED;
- token2
- };
-
- match self.get_bitmaps_union(vec![token1, token2]).await? {
- Some(b) if !b.is_empty() => {
- if !bitmaps.is_empty() {
- bitmaps &= b;
- if bitmaps.is_empty() {
- return Ok(None);
- }
+
+ // Apply logical operation
+ if let Some(dest) = &mut state.bm {
+ match state.op {
+ FtsFilter::And => {
+ if let Some(result) = result {
+ dest.bitand_assign(result);
} else {
- bitmaps = b;
+ dest.clear();
+ }
+ }
+ FtsFilter::Or => {
+ if let Some(result) = result {
+ dest.bitor_assign(result);
+ }
+ }
+ FtsFilter::Not => {
+ if let Some(mut result) = result {
+ result.bitxor_assign(&not_mask);
+ dest.bitand_assign(result);
}
}
- _ => return Ok(None),
- };
+ _ => unreachable!(),
+ }
+ } else if let Some(ref mut result_) = result {
+ if let FtsFilter::Not = state.op {
+ result_.bitxor_assign(&not_mask);
+ }
+ state.bm = result;
+ } else if let FtsFilter::Not = state.op {
+ state.bm = Some(not_mask.clone());
+ } else {
+ state.bm = Some(RoaringBitmap::new());
}
- Ok(Some(bitmaps))
+ // And short circuit
+ if matches!(state.op, FtsFilter::And) && state.bm.as_ref().unwrap().is_empty() {
+ while let Some(filter) = filters.peek() {
+ if matches!(filter, FtsFilter::End) {
+ break;
+ } else {
+ filters.next();
+ }
+ }
+ }
}
+
+ Ok(state.bm.unwrap_or_default())
}
- async fn get_bitmaps_union<T: AsRef<[u8]> + Sync + Send>(
+ async fn get_bitmaps_union(
&self,
- keys: Vec<BitmapKey<T>>,
+ keys: Vec<BitmapKey<BitmapClass>>,
) -> crate::Result<Option<RoaringBitmap>> {
let mut bm = RoaringBitmap::new();
@@ -165,3 +237,12 @@ pub trait StoreFts: StoreRead {
Ok(if !bm.is_empty() { Some(bm) } else { None })
}
}
+
+impl<T: Into<u8> + Display + Clone + std::fmt::Debug> From<FtsFilter<T>> for State<T> {
+ fn from(value: FtsFilter<T>) -> Self {
+ Self {
+ op: value,
+ bm: None,
+ }
+ }
+}
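Note: the Not branch above relies on XOR against not_mask — the bitmap of every document id in the collection — to complement a result without a dedicated NOT operation, which is valid whenever the result is a subset of the mask (true for ids produced by the index). A small runnable check of that identity, assuming the roaring crate:

```rust
use roaring::RoaringBitmap;

fn main() {
    // All document ids in the collection (`not_mask` in fts_query)...
    let all: RoaringBitmap = (0..10u32).collect();
    // ...and the ids matched by the negated sub-filter.
    let matched: RoaringBitmap = [2u32, 5].iter().copied().collect();
    // XOR complements `matched` within `all`, provided matched ⊆ all.
    let negated = matched ^ &all;
    assert!(negated.contains(3) && !negated.contains(5));
    assert_eq!(negated.len(), 8);
}
```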
diff --git a/crates/store/src/fts/search_snippet.rs b/crates/store/src/fts/search_snippet.rs
deleted file mode 100644
index 55d6b6b7..00000000
--- a/crates/store/src/fts/search_snippet.rs
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- * Copyright (c) 2023, Stalwart Labs Ltd.
- *
- * This file is part of Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use super::term_index::Term;
-
-fn escape_char(c: char, string: &mut String) {
- match c {
- '&' => string.push_str("&amp;"),
- '<' => string.push_str("&lt;"),
- '>' => string.push_str("&gt;"),
- '"' => string.push_str("&quot;"),
- '\n' | '\r' => string.push(' '),
- _ => string.push(c),
- }
-}
-
-fn escape_char_len(c: char) -> usize {
- match c {
- '&' => "&amp;".len(),
- '<' => "&lt;".len(),
- '>' => "&gt;".len(),
- '"' => "&quot;".len(),
- '\r' | '\n' => 1,
- _ => c.len_utf8(),
- }
-}
-
-pub fn generate_snippet(terms: &[Term], text: &str) -> Option<String> {
- let mut snippet = String::with_capacity(text.len());
- let start_offset = terms.get(0)?.offset as usize;
-
- if start_offset > 0 {
- let mut word_count = 0;
- let mut from_offset = 0;
- let mut last_is_space = false;
-
- if text.len() > 240 {
- for (pos, char) in text.get(0..start_offset)?.char_indices().rev() {
- // Add up to 2 words or 40 characters of context
- if char.is_whitespace() {
- if !last_is_space {
- word_count += 1;
- if word_count == 3 {
- break;
- }
- last_is_space = true;
- }
- } else {
- last_is_space = false;
- }
- from_offset = pos;
- if start_offset - from_offset >= 40 {
- break;
- }
- }
- }
-
- last_is_space = false;
- for char in text.get(from_offset..start_offset)?.chars() {
- if !char.is_whitespace() {
- last_is_space = false;
- } else {
- if last_is_space {
- continue;
- }
- last_is_space = true;
- }
- escape_char(char, &mut snippet);
- }
- }
-
- let mut terms = terms.iter().peekable();
-
- 'outer: while let Some(term) = terms.next() {
- if snippet.len() + ("<mark>".len() * 2) + term.len as usize + 1 > 255 {
- break;
- }
-
- snippet.push_str("<mark>");
- snippet.push_str(text.get(term.offset as usize..term.offset as usize + term.len as usize)?);
- snippet.push_str("</mark>");
-
- let next_offset = if let Some(next_term) = terms.peek() {
- next_term.offset as usize
- } else {
- text.len()
- };
-
- let mut last_is_space = false;
- for char in text
- .get(term.offset as usize + term.len as usize..next_offset)?
- .chars()
- {
- if !char.is_whitespace() {
- last_is_space = false;
- } else {
- if last_is_space {
- continue;
- }
- last_is_space = true;
- }
-
- if snippet.len() + escape_char_len(char) <= 255 {
- escape_char(char, &mut snippet);
- } else {
- break 'outer;
- }
- }
- }
-
- Some(snippet)
-}
-
-#[cfg(test)]
-mod tests {
-
- use nlp::language::Language;
-
- use crate::{
- fts::term_index::{TermIndex, TermIndexBuilder},
- Deserialize, Serialize,
- };
-
- use super::*;
-
- #[test]
- fn search_snippets() {
- let inputs = [
- (vec![
- "Help a friend from Abidjan Côte d'Ivoire",
- concat!(
- "When my mother died when she was given birth to me, my father took me so ",
- "special because I am motherless. Before the death of my late father on 22nd June ",
- "2013 in a private hospital here in Abidjan Côte d'Ivoire. He secretly called me on his ",
- "bedside and told me that he has a sum of $7.5M (Seven Million five Hundred ",
- "Thousand Dollars) left in a suspense account in a local bank here in Abidjan Côte ",
- "d'Ivoire, that he used my name as his only daughter for the next of kin in deposit of ",
- "the fund. ",
- "I am 24year old. Dear I am honorably seeking your assistance in the following ways. ",
- "1) To provide any bank account where this money would be transferred into. ",
- "2) To serve as the guardian of this fund. ",
- "3) To make arrangement for me to come over to your country to further my ",
- "education and to secure a residential permit for me in your country. ",
- "Moreover, I am willing to offer you 30 percent of the total sum as compensation for ",
- "your effort input after the successful transfer of this fund to your nominated ",
- "account overseas."
- )],
- vec![
- (
- vec!["côte"],
- vec![
- "Help a friend from Abidjan <mark>Côte</mark> d'Ivoire",
- concat!(
- "in Abidjan <mark>Côte</mark> d'Ivoire. He secretly called me on his bedside ",
- "and told me that he has a sum of $7.5M (Seven Million five Hundred Thousand ",
- "Dollars) left in a suspense account in a local bank here in Abidjan ",
- "<mark>Côte</mark> d'Ivoire, that ")
- ]
- ),
- (
- vec!["your", "country"],
- vec![
- concat!(
- "honorably seeking <mark>your</mark> assistance in the following ways. ",
- "1) To provide any bank account where this money would be transferred into. 2) ",
- "To serve as the guardian of this fund. 3) To make arrangement for me to come ",
- "over to <mark>your</mark> "
- )]
- ),
- (
- vec!["overseas"],
- vec![
- "nominated account <mark>overseas</mark>."
- ]
- ),
-
- ],
- ),
- (vec![
- "孫子兵法",
- concat!(
- "<\"孫子兵法:\">",
- "孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
- "孫子曰:凡用兵之法,馳車千駟,革車千乘,帶甲十萬;千里饋糧,則內外之費賓客之用,膠漆之材,",
- "車甲之奉,日費千金,然後十萬之師舉矣。",
- "孫子曰:凡用兵之法,全國為上,破國次之;全旅為上,破旅次之;全卒為上,破卒次之;全伍為上,破伍次之。",
- "是故百戰百勝,非善之善者也;不戰而屈人之兵,善之善者也。",
- "孫子曰:昔之善戰者,先為不可勝,以待敵之可勝,不可勝在己,可勝在敵。故善戰者,能為不可勝,不能使敵必可勝。",
- "故曰:勝可知,而不可為。",
- "兵者,詭道也。故能而示之不能,用而示之不用,近而示之遠,遠而示之近。利而誘之,亂而取之,實而備之,強而避之,",
- "怒而撓之,卑而驕之,佚而勞之,親而離之。攻其無備,出其不意,此兵家之勝,不可先傳也。",
- "夫未戰而廟算勝者,得算多也;未戰而廟算不勝者,得算少也;多算勝,少算不勝,而況於無算乎?吾以此觀之,勝負見矣。",
- "孫子曰:凡治眾如治寡,分數是也。鬥眾如鬥寡,形名是也。三軍之眾,可使必受敵而無敗者,奇正是也。兵之所加,",
- "如以碬投卵者,虛實是也。",
- )],
- vec![
- (
- vec!["孫子兵法"],
- vec![
- "<mark>孫子兵法</mark>",
- concat!(
- "&lt;&quot;<mark>孫子兵法</mark>:&quot;&gt;孫子曰:兵者,國之大事,死生之地,存亡之道,",
- "不可不察也。孫子曰:凡用兵之法,馳車千駟,革車千乘,帶甲十萬;千里饋糧,則內外之費賓客之用,膠"),
- ]
- ),
- (
- vec!["孫子曰"],
- vec![
- concat!(
- "&lt;&quot;孫子兵法:&quot;&gt;<mark>孫子曰</mark>:兵者,國之大事,死生之地,存亡之道,",
- "不可不察也。<mark>孫子曰</mark>:凡用兵之法,馳車千駟,革車千乘,帶甲十萬;千里饋糧,則內外之費賓",
- )]
- ),
- ],
- ),
- ];
-
- for (parts, tests) in inputs {
- let mut builder = TermIndexBuilder::new();
-
- for (field_num, part) in parts.iter().enumerate() {
- let mut terms = Vec::new();
- for token in Language::English.tokenize_text(part, 40) {
- terms.push(builder.add_token(token));
- }
- builder.add_terms(field_num as u8, 0, terms);
- }
-
- let compressed_term_index = builder.serialize();
- let term_index = TermIndex::deserialize(&compressed_term_index[..]).unwrap();
-
- for (match_words, snippets) in tests {
- let mut match_terms = Vec::new();
- for word in &match_words {
- match_terms.push(term_index.get_match_term(word, None));
- }
-
- let term_groups = term_index
- .match_terms(&match_terms, None, false, true, true)
- .unwrap()
- .unwrap();
-
- assert_eq!(term_groups.len(), snippets.len());
-
- for (term_group, snippet) in term_groups.iter().zip(snippets.iter()) {
- assert_eq!(
- snippet,
- &generate_snippet(&term_group.terms, parts[term_group.field_id as usize])
- .unwrap()
- );
- }
- }
- }
- }
-}
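Note: the deleted snippet generator escaped HTML as it copied text, wrapped matched terms in <mark> tags, capped snippets at 255 bytes, and kept up to two words or 40 characters of leading context; with the positional term index gone, none of this survives in the new scheme. Its escaping rule, lifted into a standalone program whose assertion reproduces the expected output from the deleted tests:

```rust
// The escaping rule from the deleted generate_snippet(): HTML metacharacters
// become entities, and line breaks collapse to a single space.
fn escape_char(c: char, out: &mut String) {
    match c {
        '&' => out.push_str("&amp;"),
        '<' => out.push_str("&lt;"),
        '>' => out.push_str("&gt;"),
        '"' => out.push_str("&quot;"),
        '\n' | '\r' => out.push(' '),
        _ => out.push(c),
    }
}

fn main() {
    let mut escaped = String::new();
    for c in "<\"孫子兵法:\">".chars() {
        escape_char(c, &mut escaped);
    }
    assert_eq!(escaped, "&lt;&quot;孫子兵法:&quot;&gt;");
}
```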
diff --git a/crates/store/src/fts/term_index.rs b/crates/store/src/fts/term_index.rs
index 2b876578..3ec6e7e8 100644
--- a/crates/store/src/fts/term_index.rs
+++ b/crates/store/src/fts/term_index.rs
@@ -23,7 +23,7 @@
use std::{borrow::Cow, convert::TryInto};
-use crate::{Deserialize, Serialize};
+use crate::{Deserialize, Serialize, U32_LEN, U64_LEN};
use ahash::{AHashMap, AHashSet};
use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};