diff options
author | mdecimus <mauro@stalw.art> | 2023-10-10 18:58:38 +0200 |
---|---|---|
committer | mdecimus <mauro@stalw.art> | 2023-10-10 18:58:38 +0200 |
commit | 3d9efd363a3ed1fab306e3f5e9fea41bfe58d8be (patch) | |
tree | a123a7622a6d21dcf42c566a478650b863cff476 /crates/store | |
parent | a0812095efd7aaf21ce45420bd70bc1d3230e77f (diff) |
Bayes classifier, type tokenizer and NLP module reorganization
Diffstat (limited to 'crates/store')
-rw-r--r-- | crates/store/Cargo.toml | 7 | ||||
-rw-r--r-- | crates/store/src/fts/bloom.rs | 7 | ||||
-rw-r--r-- | crates/store/src/fts/builder.rs | 20 | ||||
-rw-r--r-- | crates/store/src/fts/lang.rs | 252 | ||||
-rw-r--r-- | crates/store/src/fts/mod.rs | 154 | ||||
-rw-r--r-- | crates/store/src/fts/ngram.rs | 61 | ||||
-rw-r--r-- | crates/store/src/fts/query.rs | 8 | ||||
-rw-r--r-- | crates/store/src/fts/search_snippet.rs | 10 | ||||
-rw-r--r-- | crates/store/src/fts/stemmer.rs | 168 | ||||
-rw-r--r-- | crates/store/src/fts/term_index.rs | 22 | ||||
-rw-r--r-- | crates/store/src/fts/tokenizers/chinese.rs | 197 | ||||
-rw-r--r-- | crates/store/src/fts/tokenizers/indo_european.rs | 167 | ||||
-rw-r--r-- | crates/store/src/fts/tokenizers/japanese.rs | 168 | ||||
-rw-r--r-- | crates/store/src/fts/tokenizers/mod.rs | 96 | ||||
-rw-r--r-- | crates/store/src/fts/tokenizers/space.rs | 74 | ||||
-rw-r--r-- | crates/store/src/fts/tokenizers/word.rs | 80 | ||||
-rw-r--r-- | crates/store/src/query/filter.rs | 6 | ||||
-rw-r--r-- | crates/store/src/query/mod.rs | 5 | ||||
-rw-r--r-- | crates/store/src/write/mod.rs | 4 |
19 files changed, 40 insertions, 1466 deletions
diff --git a/crates/store/Cargo.toml b/crates/store/Cargo.toml index 9c4bb149..5a2dc3f5 100644 --- a/crates/store/Cargo.toml +++ b/crates/store/Cargo.toml @@ -6,6 +6,7 @@ resolver = "2" [dependencies] utils = { path = "../utils" } +nlp = { path = "../nlp" } maybe-async = { path = "../maybe-async" } rocksdb = { version = "0.20.1", optional = true } foundationdb = { version = "0.8.0", features = ["embedded-fdb-include"], optional = true } @@ -21,13 +22,9 @@ serde = { version = "1.0", features = ["derive"]} ahash = { version = "0.8.0", features = ["serde"] } bitpacking = "0.8.4" lazy_static = "1.4" -whatlang = "0.16" # Language detection -rust-stemmers = "1.2" # Stemmers -tinysegmenter = "0.1" # Japanese tokenizer -jieba-rs = "0.6" # Chinese stemmer xxhash-rust = { version = "0.8.5", features = ["xxh3"] } farmhash = "1.1.5" -siphasher = "0.3" +siphasher = "1.0" parking_lot = "0.12.1" lru-cache = { version = "0.1.2", optional = true } num_cpus = { version = "1.15.0", optional = true } diff --git a/crates/store/src/fts/bloom.rs b/crates/store/src/fts/bloom.rs index 54905458..31e36427 100644 --- a/crates/store/src/fts/bloom.rs +++ b/crates/store/src/fts/bloom.rs @@ -27,13 +27,12 @@ use std::{ hash::{Hash, Hasher}, }; +use nlp::{language::stemmer::StemmedToken, tokenizers::Token}; use roaring::RoaringBitmap; use utils::codec::leb128::{Leb128Reader, Leb128Vec}; use crate::{Deserialize, Error, Serialize}; -use super::{stemmer::StemmedToken, tokenizers::Token}; - pub struct BloomFilter { m: u64, b: RoaringBitmap, @@ -204,8 +203,8 @@ impl From<Cow<'_, str>> for BloomHash { } } -impl From<Token<'_>> for BloomHashGroup { - fn from(t: Token<'_>) -> Self { +impl From<Token<Cow<'_, str>>> for BloomHashGroup { + fn from(t: Token<Cow<'_, str>>) -> Self { Self { h1: BloomHash::hash(t.word.as_ref()), h2: None, diff --git a/crates/store/src/fts/builder.rs b/crates/store/src/fts/builder.rs index 3ddf538f..508d1e87 100644 --- a/crates/store/src/fts/builder.rs +++ b/crates/store/src/fts/builder.rs @@ -24,6 +24,14 @@ use std::{borrow::Cow, collections::HashSet}; use ahash::AHashSet; +use nlp::{ + language::{ + detect::{LanguageDetector, MIN_LANGUAGE_SCORE}, + stemmer::Stemmer, + Language, + }, + tokenizers::{space::SpaceTokenizer, Token}, +}; use utils::map::vec_map::VecMap; use crate::{ @@ -32,13 +40,7 @@ use crate::{ Serialize, HASH_EXACT, HASH_STEMMED, }; -use super::{ - lang::{LanguageDetector, MIN_LANGUAGE_SCORE}, - stemmer::Stemmer, - term_index::{TermIndexBuilder, TokenIndex}, - tokenizers::{space::SpaceTokenizer, Token}, - Language, -}; +use super::term_index::{TermIndexBuilder, TokenIndex}; pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 2) as usize; pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1; @@ -138,8 +140,8 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> { ops.insert(Operation::hash(&token, HASH_EXACT, field, true)); terms.push(term_index.add_token(Token { word: token.into(), - offset: 0, - len: 0, + from: 0, + to: 0, })); } term_index.add_terms(field, 0, terms); diff --git a/crates/store/src/fts/lang.rs b/crates/store/src/fts/lang.rs deleted file mode 100644 index e5b780dc..00000000 --- a/crates/store/src/fts/lang.rs +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use ahash::AHashMap; -use whatlang::{detect, Lang}; - -use super::Language; - -pub const MIN_LANGUAGE_SCORE: f64 = 0.5; - -#[derive(Debug)] -struct WeightedAverage { - weight: usize, - occurrences: usize, - confidence: f64, -} - -#[derive(Debug)] -pub struct LanguageDetector { - lang_detected: AHashMap<Language, WeightedAverage>, -} - -impl Default for LanguageDetector { - fn default() -> Self { - Self::new() - } -} - -impl LanguageDetector { - pub fn new() -> LanguageDetector { - LanguageDetector { - lang_detected: AHashMap::default(), - } - } - - pub fn detect(&mut self, text: &str, min_score: f64) -> Language { - if let Some((language, confidence)) = LanguageDetector::detect_single(text) { - let w = self - .lang_detected - .entry(language) - .or_insert_with(|| WeightedAverage { - weight: 0, - confidence: 0.0, - occurrences: 0, - }); - w.occurrences += 1; - w.weight += text.len(); - w.confidence += confidence * text.len() as f64; - if confidence < min_score { - Language::Unknown - } else { - language - } - } else { - Language::Unknown - } - } - - pub fn most_frequent_language(&self) -> Option<Language> { - self.lang_detected - .iter() - .max_by(|(_, a), (_, b)| { - ((a.confidence / a.weight as f64) * a.occurrences as f64) - .partial_cmp(&((b.confidence / b.weight as f64) * b.occurrences as f64)) - .unwrap_or(std::cmp::Ordering::Less) - }) - .map(|(l, _)| *l) - } - - pub fn detect_single(text: &str) -> Option<(Language, f64)> { - detect(text).map(|info| { - ( - match info.lang() { - Lang::Epo => Language::Esperanto, - Lang::Eng => Language::English, - Lang::Rus => Language::Russian, - Lang::Cmn => Language::Mandarin, - Lang::Spa => Language::Spanish, - Lang::Por => Language::Portuguese, - Lang::Ita => Language::Italian, - Lang::Ben => Language::Bengali, - Lang::Fra => Language::French, - Lang::Deu => Language::German, - Lang::Ukr => Language::Ukrainian, - Lang::Kat => Language::Georgian, - Lang::Ara => Language::Arabic, - Lang::Hin => Language::Hindi, - Lang::Jpn => Language::Japanese, - Lang::Heb => Language::Hebrew, - Lang::Yid => Language::Yiddish, - Lang::Pol => Language::Polish, - Lang::Amh => Language::Amharic, - Lang::Jav => Language::Javanese, - Lang::Kor => Language::Korean, - Lang::Nob => Language::Bokmal, - Lang::Dan => Language::Danish, - Lang::Swe => Language::Swedish, - Lang::Fin => Language::Finnish, - Lang::Tur => Language::Turkish, - Lang::Nld => Language::Dutch, - Lang::Hun => Language::Hungarian, - Lang::Ces => Language::Czech, - Lang::Ell => Language::Greek, - Lang::Bul => Language::Bulgarian, - Lang::Bel => Language::Belarusian, - Lang::Mar => Language::Marathi, - Lang::Kan => Language::Kannada, - Lang::Ron => Language::Romanian, - Lang::Slv => Language::Slovene, - Lang::Hrv => Language::Croatian, - Lang::Srp => Language::Serbian, - Lang::Mkd => Language::Macedonian, - Lang::Lit => Language::Lithuanian, - Lang::Lav => Language::Latvian, - Lang::Est => Language::Estonian, - Lang::Tam => Language::Tamil, - Lang::Vie => Language::Vietnamese, - Lang::Urd => Language::Urdu, - Lang::Tha => Language::Thai, - Lang::Guj => Language::Gujarati, - Lang::Uzb => Language::Uzbek, - Lang::Pan => Language::Punjabi, - Lang::Aze => Language::Azerbaijani, - Lang::Ind => Language::Indonesian, - Lang::Tel => Language::Telugu, - Lang::Pes => Language::Persian, - Lang::Mal => Language::Malayalam, - Lang::Ori => Language::Oriya, - Lang::Mya => Language::Burmese, - Lang::Nep => Language::Nepali, - Lang::Sin => Language::Sinhalese, - Lang::Khm => Language::Khmer, - Lang::Tuk => Language::Turkmen, - Lang::Aka => Language::Akan, - Lang::Zul => Language::Zulu, - Lang::Sna => Language::Shona, - Lang::Afr => Language::Afrikaans, - Lang::Lat => Language::Latin, - Lang::Slk => Language::Slovak, - Lang::Cat => Language::Catalan, - Lang::Tgl => Language::Tagalog, - Lang::Hye => Language::Armenian, - }, - info.confidence(), - ) - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn detect_languages() { - let inputs = [ - ( - "The quick brown fox jumps over the lazy dog", - Language::English, - ), - ( - "Jovencillo emponzoñado de whisky: ¡qué figurota exhibe!", - Language::Spanish, - ), - ( - "Ma la volpe col suo balzo ha raggiunto il quieto Fido", - Language::Italian, - ), - ( - "Jaz em prisão bota que vexa dez cegonhas felizes", - Language::Portuguese, - ), - ( - "Zwölf Boxkämpfer jagten Victor quer über den großen Sylter Deich", - Language::German, - ), - ("עטלף אבק נס דרך מזגן שהתפוצץ כי חם", Language::Hebrew), - ( - "Съешь ещё этих мягких французских булок, да выпей же чаю", - Language::Russian, - ), - ( - "Чуєш їх, доцю, га? Кумедна ж ти, прощайся без ґольфів!", - Language::Ukrainian, - ), - ( - "Љубазни фењерџија чађавог лица хоће да ми покаже штос", - Language::Serbian, - ), - ( - "Pijamalı hasta yağız şoföre çabucak güvendi", - Language::Turkish, - ), - ("己所不欲,勿施于人。", Language::Mandarin), - ("井の中の蛙大海を知らず", Language::Japanese), - ("시작이 반이다", Language::Korean), - ]; - - let mut detector = LanguageDetector::new(); - - for input in inputs.iter() { - assert_eq!(detector.detect(input.0, 0.0), input.1); - } - } - - #[test] - fn weighted_language() { - let mut detector = LanguageDetector::new(); - for lang in [ - (Language::Spanish, 0.5, 70), - (Language::Japanese, 0.2, 100), - (Language::Japanese, 0.3, 100), - (Language::Japanese, 0.4, 200), - (Language::English, 0.7, 50), - ] - .iter() - { - let w = detector - .lang_detected - .entry(lang.0) - .or_insert_with(|| WeightedAverage { - weight: 0, - confidence: 0.0, - occurrences: 0, - }); - w.occurrences += 1; - w.weight += lang.2; - w.confidence += lang.1 * lang.2 as f64; - } - assert_eq!(detector.most_frequent_language(), Some(Language::Japanese)); - } -} diff --git a/crates/store/src/fts/mod.rs b/crates/store/src/fts/mod.rs index 3f3d0b9e..8761f076 100644 --- a/crates/store/src/fts/mod.rs +++ b/crates/store/src/fts/mod.rs @@ -26,149 +26,13 @@ use crate::{ BitmapKey, Serialize, BM_HASH, }; -use self::{bloom::hash_token, builder::MAX_TOKEN_MASK, lang::LanguageDetector}; +use self::{bloom::hash_token, builder::MAX_TOKEN_MASK}; -pub mod lang; -//pub mod pdf; pub mod bloom; pub mod builder; -pub mod ngram; pub mod query; pub mod search_snippet; -pub mod stemmer; pub mod term_index; -pub mod tokenizers; - -#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)] -pub enum Language { - Esperanto = 0, - English = 1, - Russian = 2, - Mandarin = 3, - Spanish = 4, - Portuguese = 5, - Italian = 6, - Bengali = 7, - French = 8, - German = 9, - Ukrainian = 10, - Georgian = 11, - Arabic = 12, - Hindi = 13, - Japanese = 14, - Hebrew = 15, - Yiddish = 16, - Polish = 17, - Amharic = 18, - Javanese = 19, - Korean = 20, - Bokmal = 21, - Danish = 22, - Swedish = 23, - Finnish = 24, - Turkish = 25, - Dutch = 26, - Hungarian = 27, - Czech = 28, - Greek = 29, - Bulgarian = 30, - Belarusian = 31, - Marathi = 32, - Kannada = 33, - Romanian = 34, - Slovene = 35, - Croatian = 36, - Serbian = 37, - Macedonian = 38, - Lithuanian = 39, - Latvian = 40, - Estonian = 41, - Tamil = 42, - Vietnamese = 43, - Urdu = 44, - Thai = 45, - Gujarati = 46, - Uzbek = 47, - Punjabi = 48, - Azerbaijani = 49, - Indonesian = 50, - Telugu = 51, - Persian = 52, - Malayalam = 53, - Oriya = 54, - Burmese = 55, - Nepali = 56, - Sinhalese = 57, - Khmer = 58, - Turkmen = 59, - Akan = 60, - Zulu = 61, - Shona = 62, - Afrikaans = 63, - Latin = 64, - Slovak = 65, - Catalan = 66, - Tagalog = 67, - Armenian = 68, - Unknown = 69, - None = 70, -} - -impl Language { - pub fn from_iso_639(code: &str) -> Option<Self> { - match code.split_once('-').map(|c| c.0).unwrap_or(code) { - "en" => Language::English, - "es" => Language::Spanish, - "pt" => Language::Portuguese, - "it" => Language::Italian, - "fr" => Language::French, - "de" => Language::German, - "ru" => Language::Russian, - "zh" => Language::Mandarin, - "ja" => Language::Japanese, - "ar" => Language::Arabic, - "hi" => Language::Hindi, - "ko" => Language::Korean, - "bn" => Language::Bengali, - "he" => Language::Hebrew, - "ur" => Language::Urdu, - "fa" => Language::Persian, - "ml" => Language::Malayalam, - "or" => Language::Oriya, - "my" => Language::Burmese, - "ne" => Language::Nepali, - "si" => Language::Sinhalese, - "km" => Language::Khmer, - "tk" => Language::Turkmen, - "am" => Language::Amharic, - "az" => Language::Azerbaijani, - "id" => Language::Indonesian, - "te" => Language::Telugu, - "ta" => Language::Tamil, - "vi" => Language::Vietnamese, - "gu" => Language::Gujarati, - "pa" => Language::Punjabi, - "uz" => Language::Uzbek, - "hy" => Language::Armenian, - "ka" => Language::Georgian, - "la" => Language::Latin, - "sl" => Language::Slovene, - "hr" => Language::Croatian, - "sr" => Language::Serbian, - "mk" => Language::Macedonian, - "lt" => Language::Lithuanian, - "lv" => Language::Latvian, - "et" => Language::Estonian, - "tl" => Language::Tagalog, - "af" => Language::Afrikaans, - "zu" => Language::Zulu, - "sn" => Language::Shona, - "ak" => Language::Akan, - _ => return None, - } - .into() - } -} impl BitmapKey<Vec<u8>> { pub fn hash(word: &str, account_id: u32, collection: u8, family: u8, field: u8) -> Self { @@ -209,19 +73,3 @@ impl Operation { } } } - -impl Language { - pub fn detect(text: String, default: Language) -> (String, Language) { - if let Some((l, t)) = text - .split_once(':') - .and_then(|(l, t)| (Language::from_iso_639(l)?, t).into()) - { - (t.to_string(), l) - } else { - let l = LanguageDetector::detect_single(&text) - .and_then(|(l, c)| if c > 0.3 { Some(l) } else { None }) - .unwrap_or(default); - (text, l) - } - } -} diff --git a/crates/store/src/fts/ngram.rs b/crates/store/src/fts/ngram.rs deleted file mode 100644 index 2ca2c781..00000000 --- a/crates/store/src/fts/ngram.rs +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2023 Stalwart Labs Ltd. - * - * This file is part of the Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::borrow::Cow; - -use super::bloom::{BloomFilter, BloomHashGroup}; - -pub trait ToNgrams: Sized { - fn new(items: usize) -> Self; - fn insert(&mut self, item: &str); - fn to_ngrams(tokens: &[Cow<'_, str>], n: usize) -> Self { - let mut filter = Self::new(tokens.len().saturating_sub(1)); - for words in tokens.windows(n) { - filter.insert(&words.join(" ")); - } - filter - } -} - -impl ToNgrams for BloomFilter { - fn new(items: usize) -> Self { - BloomFilter::new(items) - } - - fn insert(&mut self, item: &str) { - self.insert(&item.into()) - } -} - -impl ToNgrams for Vec<BloomHashGroup> { - fn new(items: usize) -> Self { - Vec::with_capacity(items) - } - - fn insert(&mut self, item: &str) { - self.push(BloomHashGroup { - h1: item.into(), - h2: None, - }) - } -} diff --git a/crates/store/src/fts/query.rs b/crates/store/src/fts/query.rs index 09439d30..77bc4dbd 100644 --- a/crates/store/src/fts/query.rs +++ b/crates/store/src/fts/query.rs @@ -21,14 +21,14 @@ * for more details. */ +use nlp::language::{stemmer::Stemmer, Language}; use roaring::RoaringBitmap; use crate::{ - fts::{builder::MAX_TOKEN_LENGTH, stemmer::Stemmer, tokenizers::Tokenizer}, - BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED, + fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED, }; -use super::{term_index::TermIndex, Language}; +use super::term_index::TermIndex; impl ReadTransaction<'_> { #[maybe_async::maybe_async] @@ -44,7 +44,7 @@ impl ReadTransaction<'_> { if match_phrase { let mut phrase = Vec::new(); let mut bit_keys = Vec::new(); - for token in Tokenizer::new(text, language, MAX_TOKEN_LENGTH) { + for token in language.tokenize_text(text, MAX_TOKEN_LENGTH) { let key = BitmapKey::hash( token.word.as_ref(), account_id, diff --git a/crates/store/src/fts/search_snippet.rs b/crates/store/src/fts/search_snippet.rs index 89c557b1..55d6b6b7 100644 --- a/crates/store/src/fts/search_snippet.rs +++ b/crates/store/src/fts/search_snippet.rs @@ -134,12 +134,10 @@ pub fn generate_snippet(terms: &[Term], text: &str) -> Option<String> { #[cfg(test)] mod tests { + use nlp::language::Language; + use crate::{ - fts::{ - term_index::{TermIndex, TermIndexBuilder}, - tokenizers::Tokenizer, - Language, - }, + fts::term_index::{TermIndex, TermIndexBuilder}, Deserialize, Serialize, }; @@ -242,7 +240,7 @@ mod tests { for (field_num, part) in parts.iter().enumerate() { let mut terms = Vec::new(); - for token in Tokenizer::new(part, Language::English, 40) { + for token in Language::English.tokenize_text(part, 40) { terms.push(builder.add_token(token)); } builder.add_terms(field_num as u8, 0, terms); diff --git a/crates/store/src/fts/stemmer.rs b/crates/store/src/fts/stemmer.rs deleted file mode 100644 index aa056d22..00000000 --- a/crates/store/src/fts/stemmer.rs +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::borrow::Cow; - -use rust_stemmers::Algorithm; - -use super::{tokenizers::Tokenizer, Language}; - -#[derive(Debug, PartialEq, Eq)] -pub struct StemmedToken<'x> { - pub word: Cow<'x, str>, - pub stemmed_word: Option<Cow<'x, str>>, - pub offset: u32, // Word offset in the text part - pub len: u8, // Word length -} - -pub struct Stemmer<'x> { - stemmer: Option<rust_stemmers::Stemmer>, - tokenizer: Tokenizer<'x>, -} - -impl<'x> Stemmer<'x> { - pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Stemmer<'x> { - Stemmer { - tokenizer: Tokenizer::new(text, language, max_token_length), - stemmer: STEMMER_MAP[language as usize].map(rust_stemmers::Stemmer::create), - } - } -} - -impl<'x> Iterator for Stemmer<'x> { - type Item = StemmedToken<'x>; - - fn next(&mut self) -> Option<Self::Item> { - let token = self.tokenizer.next()?; - Some(StemmedToken { - stemmed_word: self.stemmer.as_ref().and_then(|stemmer| { - match stemmer.stem(&token.word) { - Cow::Owned(text) if text.len() != token.len as usize || text != token.word => { - Some(text.into()) - } - _ => None, - } - }), - word: token.word, - offset: token.offset, - len: token.len, - }) - } -} - -static STEMMER_MAP: &[Option<Algorithm>] = &[ - None, // Esperanto = 0, - Some(Algorithm::English), // English = 1, - Some(Algorithm::Russian), // Russian = 2, - None, // Mandarin = 3, - Some(Algorithm::Spanish), // Spanish = 4, - Some(Algorithm::Portuguese), // Portuguese = 5, - Some(Algorithm::Italian), // Italian = 6, - None, // Bengali = 7, - Some(Algorithm::French), // French = 8, - Some(Algorithm::German), // German = 9, - None, // Ukrainian = 10, - None, // Georgian = 11, - Some(Algorithm::Arabic), // Arabic = 12, - None, // Hindi = 13, - None, // Japanese = 14, - None, // Hebrew = 15, - None, // Yiddish = 16, - None, // Polish = 17, - None, // Amharic = 18, - None, // Javanese = 19, - None, // Korean = 20, - Some(Algorithm::Norwegian), // Bokmal = 21, - Some(Algorithm::Danish), // Danish = 22, - Some(Algorithm::Swedish), // Swedish = 23, - Some(Algorithm::Finnish), // Finnish = 24, - Some(Algorithm::Turkish), // Turkish = 25, - Some(Algorithm::Dutch), // Dutch = 26, - Some(Algorithm::Hungarian), // Hungarian = 27, - None, // Czech = 28, - Some(Algorithm::Greek), // Greek = 29, - None, // Bulgarian = 30, - None, // Belarusian = 31, - None, // Marathi = 32, - None, // Kannada = 33, - Some(Algorithm::Romanian), // Romanian = 34, - None, // Slovene = 35, - None, // Croatian = 36, - None, // Serbian = 37, - None, // Macedonian = 38, - None, // Lithuanian = 39, - None, // Latvian = 40, - None, // Estonian = 41, - Some(Algorithm::Tamil), // Tamil = 42, - None, // Vietnamese = 43, - None, // Urdu = 44, - None, // Thai = 45, - None, // Gujarati = 46, - None, // Uzbek = 47, - None, // Punjabi = 48, - None, // Azerbaijani = 49, - None, // Indonesian = 50, - None, // Telugu = 51, - None, // Persian = 52, - None, // Malayalam = 53, - None, // Oriya = 54, - None, // Burmese = 55, - None, // Nepali = 56, - None, // Sinhalese = 57, - None, // Khmer = 58, - None, // Turkmen = 59, - None, // Akan = 60, - None, // Zulu = 61, - None, // Shona = 62, - None, // Afrikaans = 63, - None, // Latin = 64, - None, // Slovak = 65, - None, // Catalan = 66, - None, // Tagalog = 67, - None, // Armenian = 68, - None, // Unknown = 69, -]; - -#[cfg(test)] -mod tests { - - use super::*; - - #[test] - fn stemmer() { - let inputs = [ - ( - "love loving lovingly loved lovely", - Language::English, - "love", - ), - ("querer queremos quer", Language::Spanish, "quer"), - ]; - - for (input, language, result) in inputs { - for token in Stemmer::new(input, language, 40) { - assert_eq!(token.stemmed_word.unwrap_or(token.word), result); - } - } - } -} diff --git a/crates/store/src/fts/term_index.rs b/crates/store/src/fts/term_index.rs index e2653853..b91f74db 100644 --- a/crates/store/src/fts/term_index.rs +++ b/crates/store/src/fts/term_index.rs @@ -21,14 +21,13 @@ * for more details. */ -use std::convert::TryInto; +use std::{borrow::Cow, convert::TryInto}; use crate::{Deserialize, Serialize}; -use super::{stemmer::StemmedToken, tokenizers::Token}; - use ahash::{AHashMap, AHashSet}; use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x}; +use nlp::{language::stemmer::StemmedToken, tokenizers::Token}; use utils::codec::leb128::{Leb128Reader, Leb128Vec}; #[derive(Debug)] @@ -227,7 +226,7 @@ impl TermIndexBuilder { } } - pub fn add_token(&mut self, token: Token) -> Term { + pub fn add_token(&mut self, token: Token<Cow<str>>) -> Term { let id = self.terms.len() as u32; let id = self .terms @@ -236,8 +235,8 @@ impl TermIndexBuilder { Term { id: *id, id_stemmed: *id, - offset: token.offset, - len: token.len, + offset: token.from as u32, + len: (token.to - token.from) as u8, } } @@ -259,8 +258,8 @@ impl TermIndexBuilder { Term { id, id_stemmed, - offset: token.offset, - len: token.len, + offset: token.from as u32, + len: (token.to - token.from) as u8, } } @@ -775,13 +774,10 @@ impl TokenIndex { mod tests { use ahash::AHashMap; + use nlp::language::{stemmer::Stemmer, Language}; use crate::{ - fts::{ - stemmer::Stemmer, - term_index::{TermIndexBuilder, TokenIndex}, - Language, - }, + fts::term_index::{TermIndexBuilder, TokenIndex}, Deserialize, Serialize, }; diff --git a/crates/store/src/fts/tokenizers/chinese.rs b/crates/store/src/fts/tokenizers/chinese.rs deleted file mode 100644 index e741571d..00000000 --- a/crates/store/src/fts/tokenizers/chinese.rs +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::{borrow::Cow, vec::IntoIter}; - -use jieba_rs::Jieba; - -use super::{word::WordTokenizer, Token}; -use lazy_static::lazy_static; - -lazy_static! { - static ref JIEBA: Jieba = Jieba::new(); -} - -pub struct ChineseTokenizer<'x> { - word_tokenizer: WordTokenizer<'x>, - tokens: IntoIter<&'x str>, - token_offset: usize, - token_len: usize, - token_len_cur: usize, - max_token_length: usize, -} - -impl<'x> ChineseTokenizer<'x> { - pub fn new(text: &str, max_token_length: usize) -> ChineseTokenizer { - ChineseTokenizer { - word_tokenizer: WordTokenizer::new(text), - tokens: Vec::new().into_iter(), - max_token_length, - token_offset: 0, - token_len: 0, - token_len_cur: 0, - } - } -} - -impl<'x> Iterator for ChineseTokenizer<'x> { - type Item = Token<'x>; - - fn next(&mut self) -> Option<Self::Item> { - loop { - if let Some(ch_token) = self.tokens.next() { - let offset_start = self.token_offset + self.token_len_cur; - self.token_len_cur += ch_token.len(); - - if ch_token.len() <= self.max_token_length { - return Token::new(offset_start, ch_token.len(), ch_token.into()).into(); - } - } else { - loop { - let (token, is_ascii) = self.word_tokenizer.next()?; - if !is_ascii { - let word = match token.word { - Cow::Borrowed(word) => word, - Cow::Owned(_) => unreachable!(), - }; - self.tokens = JIEBA.cut(word, false).into_iter(); - self.token_offset = token.offset as usize; - self.token_len = token.len as usize; - self.token_len_cur = 0; - break; - } else if token.len as usize <= self.max_token_length { - return token.into(); - } - } - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn chinese_tokenizer() { - assert_eq!( - ChineseTokenizer::new( - "孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。", - 40 - ) - .collect::<Vec<_>>(), - vec![ - Token { - word: "孫".into(), - offset: 0, - len: 3 - }, - Token { - word: "子".into(), - offset: 3, - len: 3 - }, - Token { - word: "曰".into(), - offset: 6, - len: 3 - }, - Token { - word: "兵".into(), - offset: 12, - len: 3 - }, - Token { - word: "者".into(), - offset: 15, - len: 3 - }, - Token { - word: "國".into(), - offset: 21, - len: 3 - }, - Token { - word: "之".into(), - offset: 24, - len: 3 - }, - Token { - word: "大事".into(), - offset: 27, - len: 6 - }, - Token { - word: "死".into(), - offset: 36, - len: 3 - }, - Token { - word: "生".into(), - offset: 39, - len: 3 - }, - Token { - word: "之".into(), - offset: 42, - len: 3 - }, - Token { - word: "地".into(), - offset: 45, - len: 3 - }, - Token { - word: "存亡".into(), - offset: 51, - len: 6 - }, - Token { - word: "之".into(), - offset: 57, - len: 3 - }, - Token { - word: "道".into(), - offset: 60, - len: 3 - }, - Token { - word: "不可不".into(), - offset: 66, - len: 9 - }, - Token { - word: "察".into(), - offset: 75, - len: 3 - }, - Token { - word: "也".into(), - offset: 78, - len: 3 - } - ] - ); - } -} diff --git a/crates/store/src/fts/tokenizers/indo_european.rs b/crates/store/src/fts/tokenizers/indo_european.rs deleted file mode 100644 index e1f34ce6..00000000 --- a/crates/store/src/fts/tokenizers/indo_european.rs +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::str::CharIndices; - -use super::Token; - -pub struct IndoEuropeanTokenizer<'x> { - max_token_length: usize, - text: &'x str, - iterator: CharIndices<'x>, -} - -impl<'x> IndoEuropeanTokenizer<'x> { - pub fn new(text: &str, max_token_length: usize) -> IndoEuropeanTokenizer { - IndoEuropeanTokenizer { - max_token_length, - text, - iterator: text.char_indices(), - } - } -} - -/// Parses indo-european text into lowercase tokens. -impl<'x> Iterator for IndoEuropeanTokenizer<'x> { - type Item = Token<'x>; - - fn next(&mut self) -> Option<Self::Item> { - while let Some((token_start, ch)) = self.iterator.next() { - if ch.is_alphanumeric() { - let mut is_uppercase = ch.is_uppercase(); - let token_end = (&mut self.iterator) - .filter_map(|(pos, ch)| { - if ch.is_alphanumeric() { - if !is_uppercase && ch.is_uppercase() { - is_uppercase = true; - } - None - } else { - pos.into() - } - }) - .next() - .unwrap_or(self.text.len()); - - let token_len = token_end - token_start; - if token_end > token_start && token_len <= self.max_token_length { - return Token::new( - token_start, - token_len, - if is_uppercase { - self.text[token_start..token_end].to_lowercase().into() - } else { - self.text[token_start..token_end].into() - }, - ) - .into(); - } - } - } - None - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn indo_european_tokenizer() { - let inputs = [ - ( - "The quick brown fox jumps over the lazy dog", - vec![ - Token::new(0, 3, "the".into()), - Token::new(4, 5, "quick".into()), - Token::new(10, 5, "brown".into()), - Token::new(16, 3, "fox".into()), - Token::new(20, 5, "jumps".into()), - Token::new(26, 4, "over".into()), - Token::new(31, 3, "the".into()), - Token::new(35, 4, "lazy".into()), - Token::new(40, 3, "dog".into()), - ], - ), - ( - "Jovencillo EMPONZOÑADO de whisky: ¡qué figurota exhibe!", - vec![ - Token::new(0, 10, "jovencillo".into()), - Token::new(11, 12, "emponzoñado".into()), - Token::new(24, 2, "de".into()), - Token::new(27, 6, "whisky".into()), - Token::new(37, 4, "qué".into()), - Token::new(42, 8, "figurota".into()), - Token::new(51, 6, "exhibe".into()), - ], - ), - ( - "ZWÖLF Boxkämpfer jagten Victor quer über den großen Sylter Deich", - vec![ - Token::new(0, 6, "zwölf".into()), - Token::new(7, 11, "boxkämpfer".into()), - Token::new(19, 6, "jagten".into()), - Token::new(26, 6, "victor".into()), - Token::new(33, 4, "quer".into()), - Token::new(38, 5, "über".into()), - Token::new(44, 3, "den".into()), - Token::new(48, 7, "großen".into()), - Token::new(56, 6, "sylter".into()), - Token::new(63, 5, "deich".into()), - ], - ), - ( - "Съешь ещё этих мягких французских булок, да выпей же чаю", - vec![ - Token::new(0, 10, "съешь".into()), - Token::new(11, 6, "ещё".into()), - Token::new(18, 8, "этих".into()), - Token::new(27, 12, "мягких".into()), - Token::new(40, 22, "французских".into()), - Token::new(63, 10, "булок".into()), - Token::new(75, 4, "да".into()), - Token::new(80, 10, "выпей".into()), - Token::new(91, 4, "же".into()), - Token::new(96, 6, "чаю".into()), - ], - ), - ( - "Pijamalı hasta yağız şoföre çabucak güvendi", - vec![ - Token::new(0, 9, "pijamalı".into()), - Token::new(10, 5, "hasta".into()), - Token::new(16, 7, "yağız".into()), - Token::new(24, 8, "şoföre".into()), - Token::new(33, 8, "çabucak".into()), - Token::new(42, 8, "güvendi".into()), - ], - ), - ]; - - for (input, tokens) in inputs.iter() { - for (pos, token) in IndoEuropeanTokenizer::new(input, 40).enumerate() { - assert_eq!(token, tokens[pos]); - } - } - } -} diff --git a/crates/store/src/fts/tokenizers/japanese.rs b/crates/store/src/fts/tokenizers/japanese.rs deleted file mode 100644 index 816ba0a3..00000000 --- a/crates/store/src/fts/tokenizers/japanese.rs +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::vec::IntoIter; - -use super::{word::WordTokenizer, Token}; - -pub struct JapaneseTokenizer<'x> { - word_tokenizer: WordTokenizer<'x>, - tokens: IntoIter<String>, - token_offset: usize, - token_len: usize, - token_len_cur: usize, - max_token_length: usize, -} - -impl<'x> JapaneseTokenizer<'x> { - pub fn new(text: &str, max_token_length: usize) -> JapaneseTokenizer { - JapaneseTokenizer { - word_tokenizer: WordTokenizer::new(text), - tokens: Vec::new().into_iter(), - max_token_length, - token_offset: 0, - token_len: 0, - token_len_cur: 0, - } - } -} - -impl<'x> Iterator for JapaneseTokenizer<'x> { - type Item = Token<'x>; - - fn next(&mut self) -> Option<Self::Item> { - loop { - if let Some(jp_token) = self.tokens.next() { - let offset_start = self.token_offset + self.token_len_cur; - self.token_len_cur += jp_token.len(); - - if jp_token.len() <= self.max_token_length { - return Token::new(offset_start, jp_token.len(), jp_token.into()).into(); - } - } else { - loop { - let (token, is_ascii) = self.word_tokenizer.next()?; - if !is_ascii { - self.tokens = tinysegmenter::tokenize(token.word.as_ref()).into_iter(); - self.token_offset = token.offset as usize; - self.token_len = token.len as usize; - self.token_len_cur = 0; - break; - } else if token.len as usize <= self.max_token_length { - return token.into(); - } - } - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn japanese_tokenizer() { - assert_eq!( - JapaneseTokenizer::new("お先に失礼します あなたの名前は何ですか 123 abc-872", 40) - .collect::<Vec<_>>(), - vec![ - Token { - word: "お先".into(), - offset: 0, - len: 6 - }, - Token { - word: "に".into(), - offset: 6, - len: 3 - }, - Token { - word: "失礼".into(), - offset: 9, - len: 6 - }, - Token { - word: "し".into(), - offset: 15, - len: 3 - }, - Token { - word: "ます".into(), - offset: 18, - len: 6 - }, - Token { - word: "あなた".into(), - offset: 25, - len: 9 - }, - Token { - word: "の".into(), - offset: 34, - len: 3 - }, - Token { - word: "名前".into(), - offset: 37, - len: 6 - }, - Token { - word: "は".into(), - offset: 43, - len: 3 - }, - Token { - word: "何".into(), - offset: 46, - len: 3 - }, - Token { - word: "です".into(), - offset: 49, - len: 6 - }, - Token { - word: "か".into(), - offset: 55, - len: 3 - }, - Token { - word: "123".into(), - offset: 59, - len: 3 - }, - Token { - word: "abc".into(), - offset: 63, - len: 3 - }, - Token { - word: "872".into(), - offset: 67, - len: 3 - } - ] - ); - } -} diff --git a/crates/store/src/fts/tokenizers/mod.rs b/crates/store/src/fts/tokenizers/mod.rs deleted file mode 100644 index 3679b2b3..00000000 --- a/crates/store/src/fts/tokenizers/mod.rs +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -pub mod chinese; -pub mod indo_european; -pub mod japanese; -pub mod space; -pub mod word; - -use std::borrow::Cow; - -use self::{ - chinese::ChineseTokenizer, indo_european::IndoEuropeanTokenizer, japanese::JapaneseTokenizer, -}; - -use super::Language; - -#[derive(Debug, PartialEq, Eq)] -pub struct Token<'x> { - pub word: Cow<'x, str>, - pub offset: u32, // Word offset in the text part - pub len: u8, // Word length -} - -impl<'x> Token<'x> { - pub fn new(offset: usize, len: usize, word: Cow<'x, str>) -> Token<'x> { - debug_assert!(offset <= u32::max_value() as usize); - debug_assert!(len <= u8::max_value() as usize); - Token { - offset: offset as u32, - len: len as u8, - word, - } - } -} - -enum LanguageTokenizer<'x> { - IndoEuropean(IndoEuropeanTokenizer<'x>), - Japanese(JapaneseTokenizer<'x>), - Chinese(ChineseTokenizer<'x>), -} - -pub struct Tokenizer<'x> { - tokenizer: LanguageTokenizer<'x>, -} - -impl<'x> Tokenizer<'x> { - pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Self { - Tokenizer { - tokenizer: match language { - Language::Japanese => { - LanguageTokenizer::Japanese(JapaneseTokenizer::new(text, max_token_length)) - } - Language::Mandarin => { - LanguageTokenizer::Chinese(ChineseTokenizer::new(text, max_token_length)) - } - _ => LanguageTokenizer::IndoEuropean(IndoEuropeanTokenizer::new( - text, - max_token_length, - )), - }, - } - } -} - -impl<'x> Iterator for Tokenizer<'x> { - type Item = Token<'x>; - - fn next(&mut self) -> Option<Self::Item> { - match &mut self.tokenizer { - LanguageTokenizer::IndoEuropean(tokenizer) => tokenizer.next(), - LanguageTokenizer::Chinese(tokenizer) => tokenizer.next(), - LanguageTokenizer::Japanese(tokenizer) => tokenizer.next(), - } - } -} diff --git a/crates/store/src/fts/tokenizers/space.rs b/crates/store/src/fts/tokenizers/space.rs deleted file mode 100644 index f3ef6891..00000000 --- a/crates/store/src/fts/tokenizers/space.rs +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2023 Stalwart Labs Ltd. - * - * This file is part of the Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::str::Chars; - -pub struct SpaceTokenizer<'x> { - iterator: Chars<'x>, - token: String, - max_token_length: usize, -} - -impl SpaceTokenizer<'_> { - pub fn new(text: &str, max_token_length: usize) -> SpaceTokenizer { - SpaceTokenizer { - iterator: text.chars(), - token: String::new(), - max_token_length, - } - } -} - -impl Iterator for SpaceTokenizer<'_> { - type Item = String; - - fn next(&mut self) -> Option<Self::Item> { - for ch in self.iterator.by_ref() { - if ch.is_alphanumeric() { - if ch.is_uppercase() { - for ch in ch.to_lowercase() { - self.token.push(ch); - } - } else { - self.token.push(ch); - } - } else if !self.token.is_empty() { - if self.token.len() < self.max_token_length { - return Some(std::mem::take(&mut self.token)); - } else { - self.token.clear(); - } - } - } - - if !self.token.is_empty() { - if self.token.len() < self.max_token_length { - return Some(std::mem::take(&mut self.token)); - } else { - self.token.clear(); - } - } - - None - } -} diff --git a/crates/store/src/fts/tokenizers/word.rs b/crates/store/src/fts/tokenizers/word.rs deleted file mode 100644 index 3e50ba1a..00000000 --- a/crates/store/src/fts/tokenizers/word.rs +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::str::CharIndices; - -use super::Token; - -pub struct WordTokenizer<'x> { - text: &'x str, - iterator: CharIndices<'x>, -} - -impl<'x> WordTokenizer<'x> { - pub fn new(text: &str) -> WordTokenizer { - WordTokenizer { - text, - iterator: text.char_indices(), - } - } -} - -/// Parses text into tokens, used by non-IndoEuropean tokenizers. -impl<'x> Iterator for WordTokenizer<'x> { - type Item = (Token<'x>, bool); - - fn next(&mut self) -> Option<Self::Item> { - let mut is_ascii = true; - while let Some((token_start, ch)) = self.iterator.next() { - if ch.is_alphanumeric() { - let token_end = (&mut self.iterator) - .filter_map(|(pos, ch)| { - if ch.is_alphanumeric() { - if is_ascii && !ch.is_ascii() { - is_ascii = false; - } - None - } else { - pos.into() - } - }) - .next() - .unwrap_or(self.text.len()); - - let token_len = token_end - token_start; - if token_end > token_start { - return ( - Token::new( - token_start, - token_len, - self.text[token_start..token_end].into(), - ), - is_ascii, - ) - .into(); - } - } - } - None - } -} diff --git a/crates/store/src/query/filter.rs b/crates/store/src/query/filter.rs index 5b74a9ae..9e4b7109 100644 --- a/crates/store/src/query/filter.rs +++ b/crates/store/src/query/filter.rs @@ -24,12 +24,10 @@ use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; use ahash::HashSet; +use nlp::tokenizers::space::SpaceTokenizer; use roaring::RoaringBitmap; -use crate::{ - fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer}, - BitmapKey, ReadTransaction, Store, -}; +use crate::{fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, Store}; use super::{Filter, ResultSet, TextMatch}; diff --git a/crates/store/src/query/mod.rs b/crates/store/src/query/mod.rs index 86f7eec9..05442caf 100644 --- a/crates/store/src/query/mod.rs +++ b/crates/store/src/query/mod.rs @@ -26,11 +26,10 @@ pub mod get; pub mod log; pub mod sort; +use nlp::language::Language; use roaring::RoaringBitmap; -use crate::{ - fts::Language, write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS, -}; +use crate::{write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS}; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Operator { diff --git a/crates/store/src/write/mod.rs b/crates/store/src/write/mod.rs index 48d8027a..44826133 100644 --- a/crates/store/src/write/mod.rs +++ b/crates/store/src/write/mod.rs @@ -23,11 +23,11 @@ use std::{collections::HashSet, slice::Iter, time::SystemTime}; +use nlp::tokenizers::space::SpaceTokenizer; use utils::codec::leb128::{Leb128Iterator, Leb128Vec}; use crate::{ - fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer}, - Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC, + fts::builder::MAX_TOKEN_LENGTH, Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC, }; use self::assert::AssertValue; |