Diffstat (limited to 'crates/store/src/fts/builder.rs')
-rw-r--r--   crates/store/src/fts/builder.rs   250
1 file changed, 0 insertions, 250 deletions
diff --git a/crates/store/src/fts/builder.rs b/crates/store/src/fts/builder.rs
deleted file mode 100644
index f4a8422d..00000000
--- a/crates/store/src/fts/builder.rs
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (c) 2023 Stalwart Labs Ltd.
- *
- * This file is part of the Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * A copy of the license is included in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use std::{borrow::Cow, collections::HashSet, fmt::Display};
-
-use ahash::AHashSet;
-use nlp::{
- language::{
- detect::{LanguageDetector, MIN_LANGUAGE_SCORE},
- stemmer::Stemmer,
- Language,
- },
- tokenizers::{space::SpaceTokenizer, Token},
-};
-use utils::map::vec_map::VecMap;
-
-use crate::{
- query::RawValue,
- write::{BatchBuilder, IntoOperations, Operation, ValueClass},
- Serialize, HASH_EXACT, HASH_STEMMED,
-};
-
-use super::term_index::{TermIndexBuilder, TokenIndex};
-
-pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 2) as usize;
-pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1;
-
-struct Text<'x, T: Into<u8> + Display> {
- field: T,
- text: Cow<'x, str>,
- language: Type,
-}
-
-enum Type {
- Stem(Language),
- Tokenize,
- Static,
-}
-
-pub struct FtsIndexBuilder<'x, T: Into<u8> + Display> {
- parts: Vec<Text<'x, T>>,
- default_language: Language,
-}
-
-impl<'x, T: Into<u8> + Display> FtsIndexBuilder<'x, T> {
- pub fn with_default_language(default_language: Language) -> FtsIndexBuilder<'x, T> {
- FtsIndexBuilder {
- parts: vec![],
- default_language,
- }
- }
-
- pub fn index(&mut self, field: T, text: impl Into<Cow<'x, str>>, language: Language) {
- self.parts.push(Text {
- field,
- text: text.into(),
- language: Type::Stem(language),
- });
- }
-
- pub fn index_raw(&mut self, field: T, text: impl Into<Cow<'x, str>>) {
- self.parts.push(Text {
- field,
- text: text.into(),
- language: Type::Tokenize,
- });
- }
-
- pub fn index_raw_token(&mut self, field: T, text: impl Into<Cow<'x, str>>) {
- self.parts.push(Text {
- field,
- text: text.into(),
- language: Type::Static,
- });
- }
-}
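For context, a minimal, hypothetical sketch of how a caller would drive the three indexing modes above. The Field enum, the crate paths in the use statements, and the surrounding function are illustrative assumptions rather than code from this diff; only the FtsIndexBuilder calls mirror the API shown here.

use std::fmt;

use nlp::language::Language;               // path assumed
use store::fts::builder::FtsIndexBuilder;  // path assumed
use store::write::{BatchBuilder, IntoOperations};

// Hypothetical field identifier; any type implementing Into<u8> + Display works.
#[derive(Clone, Copy)]
enum Field {
    Subject = 0,
    Body = 1,
    Keyword = 2,
}

impl From<Field> for u8 {
    fn from(field: Field) -> u8 {
        field as u8
    }
}

impl fmt::Display for Field {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", u8::from(*self))
    }
}

fn index_message(batch: &mut BatchBuilder) {
    let mut fts = FtsIndexBuilder::with_default_language(Language::English);

    // Stemmed indexing; passing Language::Unknown triggers language detection.
    fts.index(Field::Subject, "Quarterly report attached", Language::Unknown);
    fts.index(Field::Body, "Please find the figures enclosed", Language::English);

    // Whitespace-tokenized indexing, no stemming.
    fts.index_raw(Field::Keyword, "finance q3 report");

    // A single literal token stored as-is.
    fts.index_raw_token(Field::Keyword, "urgent");

    // IntoOperations converts the accumulated parts into batch operations.
    fts.build(batch);
}
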
-
-impl<'x, T: Into<u8> + Display> IntoOperations for FtsIndexBuilder<'x, T> {
- fn build(self, batch: &mut BatchBuilder) {
- let mut detect = LanguageDetector::new();
- let mut tokens: VecMap<u8, AHashSet<String>> = VecMap::new();
- let mut parts = Vec::new();
-
- for text in self.parts {
- match text.language {
- Type::Stem(language) => {
- let language = if language == Language::Unknown {
- detect.detect(&text.text, MIN_LANGUAGE_SCORE)
- } else {
- language
- };
- parts.push((text.field, language, text.text));
- }
- Type::Tokenize => {
- let tokens = tokens.get_mut_or_insert(text.field.into());
- for token in SpaceTokenizer::new(text.text.as_ref(), MAX_TOKEN_LENGTH) {
- tokens.insert(token);
- }
- }
- Type::Static => {
- tokens
- .get_mut_or_insert(text.field.into())
- .insert(text.text.into_owned());
- }
- }
- }
-
- let default_language = detect
- .most_frequent_language()
- .unwrap_or(self.default_language);
- let mut term_index = TermIndexBuilder::new();
- let mut ops = AHashSet::new();
-
- for (part_id, (field, language, text)) in parts.into_iter().enumerate() {
- let language = if language != Language::Unknown {
- language
- } else {
- default_language
- };
- let mut terms = Vec::new();
- let field: u8 = field.into();
-
- for token in Stemmer::new(&text, language, MAX_TOKEN_LENGTH).collect::<Vec<_>>() {
- ops.insert(Operation::hash(&token.word, HASH_EXACT, field, true));
- if let Some(stemmed_word) = &token.stemmed_word {
- ops.insert(Operation::hash(stemmed_word, HASH_STEMMED, field, true));
- }
- terms.push(term_index.add_stemmed_token(token));
- }
-
- if !terms.is_empty() {
- term_index.add_terms(field, part_id as u32, terms);
- }
- }
-
- for (field, tokens) in tokens {
- let mut terms = Vec::with_capacity(tokens.len());
- for token in tokens {
- ops.insert(Operation::hash(&token, HASH_EXACT, field, true));
- terms.push(term_index.add_token(Token {
- word: token.into(),
- from: 0,
- to: 0,
- }));
- }
- term_index.add_terms(field, 0, terms);
- }
-
- for op in ops {
- batch.ops.push(op);
- }
-
- batch.ops.push(Operation::Value {
- class: ValueClass::Property {
- field: u8::MAX,
- family: u8::MAX,
- },
- set: term_index.serialize().into(),
- });
- }
-}
-
-impl TokenIndex {
- fn build_index(self, batch: &mut BatchBuilder, set: bool) {
- let mut ops = AHashSet::with_capacity(self.tokens.len() * 2);
- for term in self.terms {
- for (term_ids, is_exact) in [(term.exact_terms, true), (term.stemmed_terms, false)] {
- for term_id in term_ids {
- if let Some(word) = self.tokens.get(term_id as usize) {
- ops.insert(Operation::hash(
- word,
- if is_exact { HASH_EXACT } else { HASH_STEMMED },
- term.field_id,
- set,
- ));
- }
- }
- }
- }
- for op in ops {
- batch.ops.push(op);
- }
- }
-}
-
-impl IntoOperations for TokenIndex {
- fn build(self, batch: &mut BatchBuilder) {
- self.build_index(batch, false);
- batch.ops.push(Operation::Value {
- class: ValueClass::Property {
- field: u8::MAX,
- family: u8::MAX,
- },
- set: None,
- });
- }
-}
-
-impl IntoOperations for RawValue<TokenIndex> {
- fn build(self, batch: &mut BatchBuilder) {
- self.inner.build_index(batch, true);
- batch.ops.push(Operation::Value {
- class: ValueClass::Property {
- field: u8::MAX,
- family: u8::MAX,
- },
- set: self.raw.into(),
- });
- }
-}
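Taken together, the three IntoOperations impls above are complementary: FtsIndexBuilder writes a document's term hashes plus a serialized term index, TokenIndex alone removes those hashes (set = false) and clears the stored value, and RawValue<TokenIndex> re-adds them (set = true) while keeping the raw bytes. A minimal sketch of the removal and restore paths, assuming the stored term index has already been fetched and decoded elsewhere (those APIs are not part of this file, and the crate paths are assumed):

use store::fts::term_index::TokenIndex;    // path assumed
use store::query::RawValue;                // path assumed
use store::write::{BatchBuilder, IntoOperations};

// Un-index a document: emits set = false hashes for every exact and
// stemmed term, then clears the stored term-index value.
fn remove_from_fts(batch: &mut BatchBuilder, stored: TokenIndex) {
    stored.build(batch);
}

// Re-index from the raw stored bytes: emits set = true hashes and
// writes the raw value back.
fn restore_fts(batch: &mut BatchBuilder, stored: RawValue<TokenIndex>) {
    stored.build(batch);
}
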
-
-pub trait ToTokens {
- fn to_tokens(&self) -> HashSet<String>;
-}
-
-impl ToTokens for &str {
- fn to_tokens(&self) -> HashSet<String> {
- let mut tokens = HashSet::new();
- for token in SpaceTokenizer::new(self, MAX_TOKEN_LENGTH) {
- tokens.insert(token);
- }
- tokens
- }
-}
-
-impl ToTokens for &String {
- fn to_tokens(&self) -> HashSet<String> {
- self.as_str().to_tokens()
- }
-}
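
Lastly, a small usage sketch of the ToTokens helper: it collects whitespace-separated tokens into a de-duplicated HashSet, with token length bounded by MAX_TOKEN_LENGTH. The import path is assumed for illustration.

use store::fts::builder::ToTokens; // path assumed

fn keyword_set() {
    let tokens = "hello world hello".to_tokens();
    assert!(tokens.contains("hello"));
    assert!(tokens.contains("world"));
    assert_eq!(tokens.len(), 2); // the set de-duplicates the repeated word
}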