author    mdecimus <mauro@stalw.art>    2023-10-10 18:58:38 +0200
committer mdecimus <mauro@stalw.art>    2023-10-10 18:58:38 +0200
commit    3d9efd363a3ed1fab306e3f5e9fea41bfe58d8be (patch)
tree      a123a7622a6d21dcf42c566a478650b863cff476 /crates/store
parent    a0812095efd7aaf21ce45420bd70bc1d3230e77f (diff)
Bayes classifier, type tokenizer and NLP module reorganization
Diffstat (limited to 'crates/store')
-rw-r--r--  crates/store/Cargo.toml                           |    7
-rw-r--r--  crates/store/src/fts/bloom.rs                     |    7
-rw-r--r--  crates/store/src/fts/builder.rs                   |   20
-rw-r--r--  crates/store/src/fts/lang.rs                      |  252
-rw-r--r--  crates/store/src/fts/mod.rs                       |  154
-rw-r--r--  crates/store/src/fts/ngram.rs                     |   61
-rw-r--r--  crates/store/src/fts/query.rs                     |    8
-rw-r--r--  crates/store/src/fts/search_snippet.rs            |   10
-rw-r--r--  crates/store/src/fts/stemmer.rs                   |  168
-rw-r--r--  crates/store/src/fts/term_index.rs                |   22
-rw-r--r--  crates/store/src/fts/tokenizers/chinese.rs        |  197
-rw-r--r--  crates/store/src/fts/tokenizers/indo_european.rs  |  167
-rw-r--r--  crates/store/src/fts/tokenizers/japanese.rs       |  168
-rw-r--r--  crates/store/src/fts/tokenizers/mod.rs            |   96
-rw-r--r--  crates/store/src/fts/tokenizers/space.rs          |   74
-rw-r--r--  crates/store/src/fts/tokenizers/word.rs           |   80
-rw-r--r--  crates/store/src/query/filter.rs                  |    6
-rw-r--r--  crates/store/src/query/mod.rs                     |    5
-rw-r--r--  crates/store/src/write/mod.rs                     |    4
19 files changed, 40 insertions(+), 1466 deletions(-)
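
Note: the hunks below move language detection, stemming, and tokenization out of crates/store/src/fts and into a new `nlp` crate. A minimal sketch of the relocated API as implied by the updated call sites; the `tokenize_text` method and the `from`/`to` token fields are taken from the hunks below, while the exact signatures in the `nlp` crate are assumptions:

    use nlp::language::Language;

    fn collect_words(text: &str) -> Vec<String> {
        // Tokenization is now dispatched through Language rather than the
        // deleted store-local Tokenizer (see the query.rs hunk below).
        Language::English
            .tokenize_text(text, 40) // 40 = max token length in bytes
            .map(|token| token.word.into_owned())
            .collect()
    }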
diff --git a/crates/store/Cargo.toml b/crates/store/Cargo.toml
index 9c4bb149..5a2dc3f5 100644
--- a/crates/store/Cargo.toml
+++ b/crates/store/Cargo.toml
@@ -6,6 +6,7 @@ resolver = "2"
[dependencies]
utils = { path = "../utils" }
+nlp = { path = "../nlp" }
maybe-async = { path = "../maybe-async" }
rocksdb = { version = "0.20.1", optional = true }
foundationdb = { version = "0.8.0", features = ["embedded-fdb-include"], optional = true }
@@ -21,13 +22,9 @@ serde = { version = "1.0", features = ["derive"]}
ahash = { version = "0.8.0", features = ["serde"] }
bitpacking = "0.8.4"
lazy_static = "1.4"
-whatlang = "0.16" # Language detection
-rust-stemmers = "1.2" # Stemmers
-tinysegmenter = "0.1" # Japanese tokenizer
-jieba-rs = "0.6" # Chinese stemmer
xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
farmhash = "1.1.5"
-siphasher = "0.3"
+siphasher = "1.0"
parking_lot = "0.12.1"
lru-cache = { version = "0.1.2", optional = true }
num_cpus = { version = "1.15.0", optional = true }
diff --git a/crates/store/src/fts/bloom.rs b/crates/store/src/fts/bloom.rs
index 54905458..31e36427 100644
--- a/crates/store/src/fts/bloom.rs
+++ b/crates/store/src/fts/bloom.rs
@@ -27,13 +27,12 @@ use std::{
hash::{Hash, Hasher},
};
+use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
use roaring::RoaringBitmap;
use utils::codec::leb128::{Leb128Reader, Leb128Vec};
use crate::{Deserialize, Error, Serialize};
-use super::{stemmer::StemmedToken, tokenizers::Token};
-
pub struct BloomFilter {
m: u64,
b: RoaringBitmap,
@@ -204,8 +203,8 @@ impl From<Cow<'_, str>> for BloomHash {
}
}
-impl From<Token<'_>> for BloomHashGroup {
- fn from(t: Token<'_>) -> Self {
+impl From<Token<Cow<'_, str>>> for BloomHashGroup {
+ fn from(t: Token<Cow<'_, str>>) -> Self {
Self {
h1: BloomHash::hash(t.word.as_ref()),
h2: None,
diff --git a/crates/store/src/fts/builder.rs b/crates/store/src/fts/builder.rs
index 3ddf538f..508d1e87 100644
--- a/crates/store/src/fts/builder.rs
+++ b/crates/store/src/fts/builder.rs
@@ -24,6 +24,14 @@
use std::{borrow::Cow, collections::HashSet};
use ahash::AHashSet;
+use nlp::{
+ language::{
+ detect::{LanguageDetector, MIN_LANGUAGE_SCORE},
+ stemmer::Stemmer,
+ Language,
+ },
+ tokenizers::{space::SpaceTokenizer, Token},
+};
use utils::map::vec_map::VecMap;
use crate::{
@@ -32,13 +40,7 @@ use crate::{
Serialize, HASH_EXACT, HASH_STEMMED,
};
-use super::{
- lang::{LanguageDetector, MIN_LANGUAGE_SCORE},
- stemmer::Stemmer,
- term_index::{TermIndexBuilder, TokenIndex},
- tokenizers::{space::SpaceTokenizer, Token},
- Language,
-};
+use super::term_index::{TermIndexBuilder, TokenIndex};
pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 2) as usize;
pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1;
@@ -138,8 +140,8 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> {
ops.insert(Operation::hash(&token, HASH_EXACT, field, true));
terms.push(term_index.add_token(Token {
word: token.into(),
- offset: 0,
- len: 0,
+ from: 0,
+ to: 0,
}));
}
term_index.add_terms(field, 0, terms);
diff --git a/crates/store/src/fts/lang.rs b/crates/store/src/fts/lang.rs
deleted file mode 100644
index e5b780dc..00000000
--- a/crates/store/src/fts/lang.rs
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) 2023, Stalwart Labs Ltd.
- *
- * This file is part of Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use ahash::AHashMap;
-use whatlang::{detect, Lang};
-
-use super::Language;
-
-pub const MIN_LANGUAGE_SCORE: f64 = 0.5;
-
-#[derive(Debug)]
-struct WeightedAverage {
- weight: usize,
- occurrences: usize,
- confidence: f64,
-}
-
-#[derive(Debug)]
-pub struct LanguageDetector {
- lang_detected: AHashMap<Language, WeightedAverage>,
-}
-
-impl Default for LanguageDetector {
- fn default() -> Self {
- Self::new()
- }
-}
-
-impl LanguageDetector {
- pub fn new() -> LanguageDetector {
- LanguageDetector {
- lang_detected: AHashMap::default(),
- }
- }
-
- pub fn detect(&mut self, text: &str, min_score: f64) -> Language {
- if let Some((language, confidence)) = LanguageDetector::detect_single(text) {
- let w = self
- .lang_detected
- .entry(language)
- .or_insert_with(|| WeightedAverage {
- weight: 0,
- confidence: 0.0,
- occurrences: 0,
- });
- w.occurrences += 1;
- w.weight += text.len();
- w.confidence += confidence * text.len() as f64;
- if confidence < min_score {
- Language::Unknown
- } else {
- language
- }
- } else {
- Language::Unknown
- }
- }
-
- pub fn most_frequent_language(&self) -> Option<Language> {
- self.lang_detected
- .iter()
- .max_by(|(_, a), (_, b)| {
- ((a.confidence / a.weight as f64) * a.occurrences as f64)
- .partial_cmp(&((b.confidence / b.weight as f64) * b.occurrences as f64))
- .unwrap_or(std::cmp::Ordering::Less)
- })
- .map(|(l, _)| *l)
- }
-
- pub fn detect_single(text: &str) -> Option<(Language, f64)> {
- detect(text).map(|info| {
- (
- match info.lang() {
- Lang::Epo => Language::Esperanto,
- Lang::Eng => Language::English,
- Lang::Rus => Language::Russian,
- Lang::Cmn => Language::Mandarin,
- Lang::Spa => Language::Spanish,
- Lang::Por => Language::Portuguese,
- Lang::Ita => Language::Italian,
- Lang::Ben => Language::Bengali,
- Lang::Fra => Language::French,
- Lang::Deu => Language::German,
- Lang::Ukr => Language::Ukrainian,
- Lang::Kat => Language::Georgian,
- Lang::Ara => Language::Arabic,
- Lang::Hin => Language::Hindi,
- Lang::Jpn => Language::Japanese,
- Lang::Heb => Language::Hebrew,
- Lang::Yid => Language::Yiddish,
- Lang::Pol => Language::Polish,
- Lang::Amh => Language::Amharic,
- Lang::Jav => Language::Javanese,
- Lang::Kor => Language::Korean,
- Lang::Nob => Language::Bokmal,
- Lang::Dan => Language::Danish,
- Lang::Swe => Language::Swedish,
- Lang::Fin => Language::Finnish,
- Lang::Tur => Language::Turkish,
- Lang::Nld => Language::Dutch,
- Lang::Hun => Language::Hungarian,
- Lang::Ces => Language::Czech,
- Lang::Ell => Language::Greek,
- Lang::Bul => Language::Bulgarian,
- Lang::Bel => Language::Belarusian,
- Lang::Mar => Language::Marathi,
- Lang::Kan => Language::Kannada,
- Lang::Ron => Language::Romanian,
- Lang::Slv => Language::Slovene,
- Lang::Hrv => Language::Croatian,
- Lang::Srp => Language::Serbian,
- Lang::Mkd => Language::Macedonian,
- Lang::Lit => Language::Lithuanian,
- Lang::Lav => Language::Latvian,
- Lang::Est => Language::Estonian,
- Lang::Tam => Language::Tamil,
- Lang::Vie => Language::Vietnamese,
- Lang::Urd => Language::Urdu,
- Lang::Tha => Language::Thai,
- Lang::Guj => Language::Gujarati,
- Lang::Uzb => Language::Uzbek,
- Lang::Pan => Language::Punjabi,
- Lang::Aze => Language::Azerbaijani,
- Lang::Ind => Language::Indonesian,
- Lang::Tel => Language::Telugu,
- Lang::Pes => Language::Persian,
- Lang::Mal => Language::Malayalam,
- Lang::Ori => Language::Oriya,
- Lang::Mya => Language::Burmese,
- Lang::Nep => Language::Nepali,
- Lang::Sin => Language::Sinhalese,
- Lang::Khm => Language::Khmer,
- Lang::Tuk => Language::Turkmen,
- Lang::Aka => Language::Akan,
- Lang::Zul => Language::Zulu,
- Lang::Sna => Language::Shona,
- Lang::Afr => Language::Afrikaans,
- Lang::Lat => Language::Latin,
- Lang::Slk => Language::Slovak,
- Lang::Cat => Language::Catalan,
- Lang::Tgl => Language::Tagalog,
- Lang::Hye => Language::Armenian,
- },
- info.confidence(),
- )
- })
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn detect_languages() {
- let inputs = [
- (
- "The quick brown fox jumps over the lazy dog",
- Language::English,
- ),
- (
- "Jovencillo emponzoñado de whisky: ¡qué figurota exhibe!",
- Language::Spanish,
- ),
- (
- "Ma la volpe col suo balzo ha raggiunto il quieto Fido",
- Language::Italian,
- ),
- (
- "Jaz em prisão bota que vexa dez cegonhas felizes",
- Language::Portuguese,
- ),
- (
- "Zwölf Boxkämpfer jagten Victor quer über den großen Sylter Deich",
- Language::German,
- ),
- ("עטלף אבק נס דרך מזגן שהתפוצץ כי חם", Language::Hebrew),
- (
- "Съешь ещё этих мягких французских булок, да выпей же чаю",
- Language::Russian,
- ),
- (
- "Чуєш їх, доцю, га? Кумедна ж ти, прощайся без ґольфів!",
- Language::Ukrainian,
- ),
- (
- "Љубазни фењерџија чађавог лица хоће да ми покаже штос",
- Language::Serbian,
- ),
- (
- "Pijamalı hasta yağız şoföre çabucak güvendi",
- Language::Turkish,
- ),
- ("己所不欲,勿施于人。", Language::Mandarin),
- ("井の中の蛙大海を知らず", Language::Japanese),
- ("시작이 반이다", Language::Korean),
- ];
-
- let mut detector = LanguageDetector::new();
-
- for input in inputs.iter() {
- assert_eq!(detector.detect(input.0, 0.0), input.1);
- }
- }
-
- #[test]
- fn weighted_language() {
- let mut detector = LanguageDetector::new();
- for lang in [
- (Language::Spanish, 0.5, 70),
- (Language::Japanese, 0.2, 100),
- (Language::Japanese, 0.3, 100),
- (Language::Japanese, 0.4, 200),
- (Language::English, 0.7, 50),
- ]
- .iter()
- {
- let w = detector
- .lang_detected
- .entry(lang.0)
- .or_insert_with(|| WeightedAverage {
- weight: 0,
- confidence: 0.0,
- occurrences: 0,
- });
- w.occurrences += 1;
- w.weight += lang.2;
- w.confidence += lang.1 * lang.2 as f64;
- }
- assert_eq!(detector.most_frequent_language(), Some(Language::Japanese));
- }
-}
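
The language detector deleted above reappears as `nlp::language::detect` (see the builder.rs import earlier in this diff). A usage sketch, assuming the relocated module keeps the deleted API surface:

    use nlp::language::detect::{LanguageDetector, MIN_LANGUAGE_SCORE};

    let mut detector = LanguageDetector::new();
    // Per-string detection; returns Language::Unknown below the score.
    let lang = detector.detect("Ma la volpe col suo balzo", MIN_LANGUAGE_SCORE);
    // Weighted across all detect() calls made so far.
    let dominant = detector.most_frequent_language();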
diff --git a/crates/store/src/fts/mod.rs b/crates/store/src/fts/mod.rs
index 3f3d0b9e..8761f076 100644
--- a/crates/store/src/fts/mod.rs
+++ b/crates/store/src/fts/mod.rs
@@ -26,149 +26,13 @@ use crate::{
BitmapKey, Serialize, BM_HASH,
};
-use self::{bloom::hash_token, builder::MAX_TOKEN_MASK, lang::LanguageDetector};
+use self::{bloom::hash_token, builder::MAX_TOKEN_MASK};
-pub mod lang;
-//pub mod pdf;
pub mod bloom;
pub mod builder;
-pub mod ngram;
pub mod query;
pub mod search_snippet;
-pub mod stemmer;
pub mod term_index;
-pub mod tokenizers;
-
-#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
-pub enum Language {
- Esperanto = 0,
- English = 1,
- Russian = 2,
- Mandarin = 3,
- Spanish = 4,
- Portuguese = 5,
- Italian = 6,
- Bengali = 7,
- French = 8,
- German = 9,
- Ukrainian = 10,
- Georgian = 11,
- Arabic = 12,
- Hindi = 13,
- Japanese = 14,
- Hebrew = 15,
- Yiddish = 16,
- Polish = 17,
- Amharic = 18,
- Javanese = 19,
- Korean = 20,
- Bokmal = 21,
- Danish = 22,
- Swedish = 23,
- Finnish = 24,
- Turkish = 25,
- Dutch = 26,
- Hungarian = 27,
- Czech = 28,
- Greek = 29,
- Bulgarian = 30,
- Belarusian = 31,
- Marathi = 32,
- Kannada = 33,
- Romanian = 34,
- Slovene = 35,
- Croatian = 36,
- Serbian = 37,
- Macedonian = 38,
- Lithuanian = 39,
- Latvian = 40,
- Estonian = 41,
- Tamil = 42,
- Vietnamese = 43,
- Urdu = 44,
- Thai = 45,
- Gujarati = 46,
- Uzbek = 47,
- Punjabi = 48,
- Azerbaijani = 49,
- Indonesian = 50,
- Telugu = 51,
- Persian = 52,
- Malayalam = 53,
- Oriya = 54,
- Burmese = 55,
- Nepali = 56,
- Sinhalese = 57,
- Khmer = 58,
- Turkmen = 59,
- Akan = 60,
- Zulu = 61,
- Shona = 62,
- Afrikaans = 63,
- Latin = 64,
- Slovak = 65,
- Catalan = 66,
- Tagalog = 67,
- Armenian = 68,
- Unknown = 69,
- None = 70,
-}
-
-impl Language {
- pub fn from_iso_639(code: &str) -> Option<Self> {
- match code.split_once('-').map(|c| c.0).unwrap_or(code) {
- "en" => Language::English,
- "es" => Language::Spanish,
- "pt" => Language::Portuguese,
- "it" => Language::Italian,
- "fr" => Language::French,
- "de" => Language::German,
- "ru" => Language::Russian,
- "zh" => Language::Mandarin,
- "ja" => Language::Japanese,
- "ar" => Language::Arabic,
- "hi" => Language::Hindi,
- "ko" => Language::Korean,
- "bn" => Language::Bengali,
- "he" => Language::Hebrew,
- "ur" => Language::Urdu,
- "fa" => Language::Persian,
- "ml" => Language::Malayalam,
- "or" => Language::Oriya,
- "my" => Language::Burmese,
- "ne" => Language::Nepali,
- "si" => Language::Sinhalese,
- "km" => Language::Khmer,
- "tk" => Language::Turkmen,
- "am" => Language::Amharic,
- "az" => Language::Azerbaijani,
- "id" => Language::Indonesian,
- "te" => Language::Telugu,
- "ta" => Language::Tamil,
- "vi" => Language::Vietnamese,
- "gu" => Language::Gujarati,
- "pa" => Language::Punjabi,
- "uz" => Language::Uzbek,
- "hy" => Language::Armenian,
- "ka" => Language::Georgian,
- "la" => Language::Latin,
- "sl" => Language::Slovene,
- "hr" => Language::Croatian,
- "sr" => Language::Serbian,
- "mk" => Language::Macedonian,
- "lt" => Language::Lithuanian,
- "lv" => Language::Latvian,
- "et" => Language::Estonian,
- "tl" => Language::Tagalog,
- "af" => Language::Afrikaans,
- "zu" => Language::Zulu,
- "sn" => Language::Shona,
- "ak" => Language::Akan,
- _ => return None,
- }
- .into()
- }
-}
impl BitmapKey<Vec<u8>> {
pub fn hash(word: &str, account_id: u32, collection: u8, family: u8, field: u8) -> Self {
@@ -209,19 +73,3 @@ impl Operation {
}
}
}
-
-impl Language {
- pub fn detect(text: String, default: Language) -> (String, Language) {
- if let Some((l, t)) = text
- .split_once(':')
- .and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
- {
- (t.to_string(), l)
- } else {
- let l = LanguageDetector::detect_single(&text)
- .and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
- .unwrap_or(default);
- (text, l)
- }
- }
-}
diff --git a/crates/store/src/fts/ngram.rs b/crates/store/src/fts/ngram.rs
deleted file mode 100644
index 2ca2c781..00000000
--- a/crates/store/src/fts/ngram.rs
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2023 Stalwart Labs Ltd.
- *
- * This file is part of the Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use std::borrow::Cow;
-
-use super::bloom::{BloomFilter, BloomHashGroup};
-
-pub trait ToNgrams: Sized {
- fn new(items: usize) -> Self;
- fn insert(&mut self, item: &str);
- fn to_ngrams(tokens: &[Cow<'_, str>], n: usize) -> Self {
- let mut filter = Self::new(tokens.len().saturating_sub(1));
- for words in tokens.windows(n) {
- filter.insert(&words.join(" "));
- }
- filter
- }
-}
-
-impl ToNgrams for BloomFilter {
- fn new(items: usize) -> Self {
- BloomFilter::new(items)
- }
-
- fn insert(&mut self, item: &str) {
- self.insert(&item.into())
- }
-}
-
-impl ToNgrams for Vec<BloomHashGroup> {
- fn new(items: usize) -> Self {
- Vec::with_capacity(items)
- }
-
- fn insert(&mut self, item: &str) {
- self.push(BloomHashGroup {
- h1: item.into(),
- h2: None,
- })
- }
-}
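
This diff shows no new home for the deleted `ToNgrams` trait, so the sketch below reuses the removed definition as-is to illustrate what it provided: building a bloom filter over word n-grams (here bigrams) of already-tokenized text.

    use std::borrow::Cow;

    let tokens: Vec<Cow<str>> =
        vec!["the".into(), "quick".into(), "brown".into()];
    // windows(2) joins "the quick" and "quick brown" into the filter.
    let filter = BloomFilter::to_ngrams(&tokens, 2);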
diff --git a/crates/store/src/fts/query.rs b/crates/store/src/fts/query.rs
index 09439d30..77bc4dbd 100644
--- a/crates/store/src/fts/query.rs
+++ b/crates/store/src/fts/query.rs
@@ -21,14 +21,14 @@
* for more details.
*/
+use nlp::language::{stemmer::Stemmer, Language};
use roaring::RoaringBitmap;
use crate::{
- fts::{builder::MAX_TOKEN_LENGTH, stemmer::Stemmer, tokenizers::Tokenizer},
- BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
+ fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
};
-use super::{term_index::TermIndex, Language};
+use super::term_index::TermIndex;
impl ReadTransaction<'_> {
#[maybe_async::maybe_async]
@@ -44,7 +44,7 @@ impl ReadTransaction<'_> {
if match_phrase {
let mut phrase = Vec::new();
let mut bit_keys = Vec::new();
- for token in Tokenizer::new(text, language, MAX_TOKEN_LENGTH) {
+ for token in language.tokenize_text(text, MAX_TOKEN_LENGTH) {
let key = BitmapKey::hash(
token.word.as_ref(),
account_id,
diff --git a/crates/store/src/fts/search_snippet.rs b/crates/store/src/fts/search_snippet.rs
index 89c557b1..55d6b6b7 100644
--- a/crates/store/src/fts/search_snippet.rs
+++ b/crates/store/src/fts/search_snippet.rs
@@ -134,12 +134,10 @@ pub fn generate_snippet(terms: &[Term], text: &str) -> Option<String> {
#[cfg(test)]
mod tests {
+ use nlp::language::Language;
+
use crate::{
- fts::{
- term_index::{TermIndex, TermIndexBuilder},
- tokenizers::Tokenizer,
- Language,
- },
+ fts::term_index::{TermIndex, TermIndexBuilder},
Deserialize, Serialize,
};
@@ -242,7 +240,7 @@ mod tests {
for (field_num, part) in parts.iter().enumerate() {
let mut terms = Vec::new();
- for token in Tokenizer::new(part, Language::English, 40) {
+ for token in Language::English.tokenize_text(part, 40) {
terms.push(builder.add_token(token));
}
builder.add_terms(field_num as u8, 0, terms);
diff --git a/crates/store/src/fts/stemmer.rs b/crates/store/src/fts/stemmer.rs
deleted file mode 100644
index aa056d22..00000000
--- a/crates/store/src/fts/stemmer.rs
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2023, Stalwart Labs Ltd.
- *
- * This file is part of Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use std::borrow::Cow;
-
-use rust_stemmers::Algorithm;
-
-use super::{tokenizers::Tokenizer, Language};
-
-#[derive(Debug, PartialEq, Eq)]
-pub struct StemmedToken<'x> {
- pub word: Cow<'x, str>,
- pub stemmed_word: Option<Cow<'x, str>>,
- pub offset: u32, // Word offset in the text part
- pub len: u8, // Word length
-}
-
-pub struct Stemmer<'x> {
- stemmer: Option<rust_stemmers::Stemmer>,
- tokenizer: Tokenizer<'x>,
-}
-
-impl<'x> Stemmer<'x> {
- pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Stemmer<'x> {
- Stemmer {
- tokenizer: Tokenizer::new(text, language, max_token_length),
- stemmer: STEMMER_MAP[language as usize].map(rust_stemmers::Stemmer::create),
- }
- }
-}
-
-impl<'x> Iterator for Stemmer<'x> {
- type Item = StemmedToken<'x>;
-
- fn next(&mut self) -> Option<Self::Item> {
- let token = self.tokenizer.next()?;
- Some(StemmedToken {
- stemmed_word: self.stemmer.as_ref().and_then(|stemmer| {
- match stemmer.stem(&token.word) {
- Cow::Owned(text) if text.len() != token.len as usize || text != token.word => {
- Some(text.into())
- }
- _ => None,
- }
- }),
- word: token.word,
- offset: token.offset,
- len: token.len,
- })
- }
-}
-
-static STEMMER_MAP: &[Option<Algorithm>] = &[
- None, // Esperanto = 0,
- Some(Algorithm::English), // English = 1,
- Some(Algorithm::Russian), // Russian = 2,
- None, // Mandarin = 3,
- Some(Algorithm::Spanish), // Spanish = 4,
- Some(Algorithm::Portuguese), // Portuguese = 5,
- Some(Algorithm::Italian), // Italian = 6,
- None, // Bengali = 7,
- Some(Algorithm::French), // French = 8,
- Some(Algorithm::German), // German = 9,
- None, // Ukrainian = 10,
- None, // Georgian = 11,
- Some(Algorithm::Arabic), // Arabic = 12,
- None, // Hindi = 13,
- None, // Japanese = 14,
- None, // Hebrew = 15,
- None, // Yiddish = 16,
- None, // Polish = 17,
- None, // Amharic = 18,
- None, // Javanese = 19,
- None, // Korean = 20,
- Some(Algorithm::Norwegian), // Bokmal = 21,
- Some(Algorithm::Danish), // Danish = 22,
- Some(Algorithm::Swedish), // Swedish = 23,
- Some(Algorithm::Finnish), // Finnish = 24,
- Some(Algorithm::Turkish), // Turkish = 25,
- Some(Algorithm::Dutch), // Dutch = 26,
- Some(Algorithm::Hungarian), // Hungarian = 27,
- None, // Czech = 28,
- Some(Algorithm::Greek), // Greek = 29,
- None, // Bulgarian = 30,
- None, // Belarusian = 31,
- None, // Marathi = 32,
- None, // Kannada = 33,
- Some(Algorithm::Romanian), // Romanian = 34,
- None, // Slovene = 35,
- None, // Croatian = 36,
- None, // Serbian = 37,
- None, // Macedonian = 38,
- None, // Lithuanian = 39,
- None, // Latvian = 40,
- None, // Estonian = 41,
- Some(Algorithm::Tamil), // Tamil = 42,
- None, // Vietnamese = 43,
- None, // Urdu = 44,
- None, // Thai = 45,
- None, // Gujarati = 46,
- None, // Uzbek = 47,
- None, // Punjabi = 48,
- None, // Azerbaijani = 49,
- None, // Indonesian = 50,
- None, // Telugu = 51,
- None, // Persian = 52,
- None, // Malayalam = 53,
- None, // Oriya = 54,
- None, // Burmese = 55,
- None, // Nepali = 56,
- None, // Sinhalese = 57,
- None, // Khmer = 58,
- None, // Turkmen = 59,
- None, // Akan = 60,
- None, // Zulu = 61,
- None, // Shona = 62,
- None, // Afrikaans = 63,
- None, // Latin = 64,
- None, // Slovak = 65,
- None, // Catalan = 66,
- None, // Tagalog = 67,
- None, // Armenian = 68,
- None, // Unknown = 69,
-];
-
-#[cfg(test)]
-mod tests {
-
- use super::*;
-
- #[test]
- fn stemmer() {
- let inputs = [
- (
- "love loving lovingly loved lovely",
- Language::English,
- "love",
- ),
- ("querer queremos quer", Language::Spanish, "quer"),
- ];
-
- for (input, language, result) in inputs {
- for token in Stemmer::new(input, language, 40) {
- assert_eq!(token.stemmed_word.unwrap_or(token.word), result);
- }
- }
- }
-}
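
The stemmer moves to `nlp::language::stemmer` (imported under that path in the query.rs and term_index.rs hunks). A sketch mirroring the deleted unit test, assuming the constructor survives the move unchanged; note that `StemmedToken` presumably now carries `from`/`to` spans like `Token`, rather than the deleted `offset`/`len` pair:

    use nlp::language::{stemmer::Stemmer, Language};

    for token in Stemmer::new("love loving loved lovely", Language::English, 40) {
        // stemmed_word is None when stemming would not change the word.
        let stem = token.stemmed_word.unwrap_or(token.word);
        assert_eq!(stem, "love");
    }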
diff --git a/crates/store/src/fts/term_index.rs b/crates/store/src/fts/term_index.rs
index e2653853..b91f74db 100644
--- a/crates/store/src/fts/term_index.rs
+++ b/crates/store/src/fts/term_index.rs
@@ -21,14 +21,13 @@
* for more details.
*/
-use std::convert::TryInto;
+use std::{borrow::Cow, convert::TryInto};
use crate::{Deserialize, Serialize};
-use super::{stemmer::StemmedToken, tokenizers::Token};
-
use ahash::{AHashMap, AHashSet};
use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
+use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
use utils::codec::leb128::{Leb128Reader, Leb128Vec};
#[derive(Debug)]
@@ -227,7 +226,7 @@ impl TermIndexBuilder {
}
}
- pub fn add_token(&mut self, token: Token) -> Term {
+ pub fn add_token(&mut self, token: Token<Cow<str>>) -> Term {
let id = self.terms.len() as u32;
let id = self
.terms
@@ -236,8 +235,8 @@ impl TermIndexBuilder {
Term {
id: *id,
id_stemmed: *id,
- offset: token.offset,
- len: token.len,
+ offset: token.from as u32,
+ len: (token.to - token.from) as u8,
}
}
@@ -259,8 +258,8 @@ impl TermIndexBuilder {
Term {
id,
id_stemmed,
- offset: token.offset,
- len: token.len,
+ offset: token.from as u32,
+ len: (token.to - token.from) as u8,
}
}
@@ -775,13 +774,10 @@ impl TokenIndex {
mod tests {
use ahash::AHashMap;
+ use nlp::language::{stemmer::Stemmer, Language};
use crate::{
- fts::{
- stemmer::Stemmer,
- term_index::{TermIndexBuilder, TokenIndex},
- Language,
- },
+ fts::term_index::{TermIndexBuilder, TokenIndex},
Deserialize, Serialize,
};
diff --git a/crates/store/src/fts/tokenizers/chinese.rs b/crates/store/src/fts/tokenizers/chinese.rs
deleted file mode 100644
index e741571d..00000000
--- a/crates/store/src/fts/tokenizers/chinese.rs
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (c) 2023, Stalwart Labs Ltd.
- *
- * This file is part of Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use std::{borrow::Cow, vec::IntoIter};
-
-use jieba_rs::Jieba;
-
-use super::{word::WordTokenizer, Token};
-use lazy_static::lazy_static;
-
-lazy_static! {
- static ref JIEBA: Jieba = Jieba::new();
-}
-
-pub struct ChineseTokenizer<'x> {
- word_tokenizer: WordTokenizer<'x>,
- tokens: IntoIter<&'x str>,
- token_offset: usize,
- token_len: usize,
- token_len_cur: usize,
- max_token_length: usize,
-}
-
-impl<'x> ChineseTokenizer<'x> {
- pub fn new(text: &str, max_token_length: usize) -> ChineseTokenizer {
- ChineseTokenizer {
- word_tokenizer: WordTokenizer::new(text),
- tokens: Vec::new().into_iter(),
- max_token_length,
- token_offset: 0,
- token_len: 0,
- token_len_cur: 0,
- }
- }
-}
-
-impl<'x> Iterator for ChineseTokenizer<'x> {
- type Item = Token<'x>;
-
- fn next(&mut self) -> Option<Self::Item> {
- loop {
- if let Some(ch_token) = self.tokens.next() {
- let offset_start = self.token_offset + self.token_len_cur;
- self.token_len_cur += ch_token.len();
-
- if ch_token.len() <= self.max_token_length {
- return Token::new(offset_start, ch_token.len(), ch_token.into()).into();
- }
- } else {
- loop {
- let (token, is_ascii) = self.word_tokenizer.next()?;
- if !is_ascii {
- let word = match token.word {
- Cow::Borrowed(word) => word,
- Cow::Owned(_) => unreachable!(),
- };
- self.tokens = JIEBA.cut(word, false).into_iter();
- self.token_offset = token.offset as usize;
- self.token_len = token.len as usize;
- self.token_len_cur = 0;
- break;
- } else if token.len as usize <= self.max_token_length {
- return token.into();
- }
- }
- }
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn chinese_tokenizer() {
- assert_eq!(
- ChineseTokenizer::new(
- "孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
- 40
- )
- .collect::<Vec<_>>(),
- vec![
- Token {
- word: "孫".into(),
- offset: 0,
- len: 3
- },
- Token {
- word: "子".into(),
- offset: 3,
- len: 3
- },
- Token {
- word: "曰".into(),
- offset: 6,
- len: 3
- },
- Token {
- word: "兵".into(),
- offset: 12,
- len: 3
- },
- Token {
- word: "者".into(),
- offset: 15,
- len: 3
- },
- Token {
- word: "國".into(),
- offset: 21,
- len: 3
- },
- Token {
- word: "之".into(),
- offset: 24,
- len: 3
- },
- Token {
- word: "大事".into(),
- offset: 27,
- len: 6
- },
- Token {
- word: "死".into(),
- offset: 36,
- len: 3
- },
- Token {
- word: "生".into(),
- offset: 39,
- len: 3
- },
- Token {
- word: "之".into(),
- offset: 42,
- len: 3
- },
- Token {
- word: "地".into(),
- offset: 45,
- len: 3
- },
- Token {
- word: "存亡".into(),
- offset: 51,
- len: 6
- },
- Token {
- word: "之".into(),
- offset: 57,
- len: 3
- },
- Token {
- word: "道".into(),
- offset: 60,
- len: 3
- },
- Token {
- word: "不可不".into(),
- offset: 66,
- len: 9
- },
- Token {
- word: "察".into(),
- offset: 75,
- len: 3
- },
- Token {
- word: "也".into(),
- offset: 78,
- len: 3
- }
- ]
- );
- }
-}
diff --git a/crates/store/src/fts/tokenizers/indo_european.rs b/crates/store/src/fts/tokenizers/indo_european.rs
deleted file mode 100644
index e1f34ce6..00000000
--- a/crates/store/src/fts/tokenizers/indo_european.rs
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright (c) 2023, Stalwart Labs Ltd.
- *
- * This file is part of Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use std::str::CharIndices;
-
-use super::Token;
-
-pub struct IndoEuropeanTokenizer<'x> {
- max_token_length: usize,
- text: &'x str,
- iterator: CharIndices<'x>,
-}
-
-impl<'x> IndoEuropeanTokenizer<'x> {
- pub fn new(text: &str, max_token_length: usize) -> IndoEuropeanTokenizer {
- IndoEuropeanTokenizer {
- max_token_length,
- text,
- iterator: text.char_indices(),
- }
- }
-}
-
-/// Parses indo-european text into lowercase tokens.
-impl<'x> Iterator for IndoEuropeanTokenizer<'x> {
- type Item = Token<'x>;
-
- fn next(&mut self) -> Option<Self::Item> {
- while let Some((token_start, ch)) = self.iterator.next() {
- if ch.is_alphanumeric() {
- let mut is_uppercase = ch.is_uppercase();
- let token_end = (&mut self.iterator)
- .filter_map(|(pos, ch)| {
- if ch.is_alphanumeric() {
- if !is_uppercase && ch.is_uppercase() {
- is_uppercase = true;
- }
- None
- } else {
- pos.into()
- }
- })
- .next()
- .unwrap_or(self.text.len());
-
- let token_len = token_end - token_start;
- if token_end > token_start && token_len <= self.max_token_length {
- return Token::new(
- token_start,
- token_len,
- if is_uppercase {
- self.text[token_start..token_end].to_lowercase().into()
- } else {
- self.text[token_start..token_end].into()
- },
- )
- .into();
- }
- }
- }
- None
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn indo_european_tokenizer() {
- let inputs = [
- (
- "The quick brown fox jumps over the lazy dog",
- vec![
- Token::new(0, 3, "the".into()),
- Token::new(4, 5, "quick".into()),
- Token::new(10, 5, "brown".into()),
- Token::new(16, 3, "fox".into()),
- Token::new(20, 5, "jumps".into()),
- Token::new(26, 4, "over".into()),
- Token::new(31, 3, "the".into()),
- Token::new(35, 4, "lazy".into()),
- Token::new(40, 3, "dog".into()),
- ],
- ),
- (
- "Jovencillo EMPONZOÑADO de whisky: ¡qué figurota exhibe!",
- vec![
- Token::new(0, 10, "jovencillo".into()),
- Token::new(11, 12, "emponzoñado".into()),
- Token::new(24, 2, "de".into()),
- Token::new(27, 6, "whisky".into()),
- Token::new(37, 4, "qué".into()),
- Token::new(42, 8, "figurota".into()),
- Token::new(51, 6, "exhibe".into()),
- ],
- ),
- (
- "ZWÖLF Boxkämpfer jagten Victor quer über den großen Sylter Deich",
- vec![
- Token::new(0, 6, "zwölf".into()),
- Token::new(7, 11, "boxkämpfer".into()),
- Token::new(19, 6, "jagten".into()),
- Token::new(26, 6, "victor".into()),
- Token::new(33, 4, "quer".into()),
- Token::new(38, 5, "über".into()),
- Token::new(44, 3, "den".into()),
- Token::new(48, 7, "großen".into()),
- Token::new(56, 6, "sylter".into()),
- Token::new(63, 5, "deich".into()),
- ],
- ),
- (
- "Съешь ещё этих мягких французских булок, да выпей же чаю",
- vec![
- Token::new(0, 10, "съешь".into()),
- Token::new(11, 6, "ещё".into()),
- Token::new(18, 8, "этих".into()),
- Token::new(27, 12, "мягких".into()),
- Token::new(40, 22, "французских".into()),
- Token::new(63, 10, "булок".into()),
- Token::new(75, 4, "да".into()),
- Token::new(80, 10, "выпей".into()),
- Token::new(91, 4, "же".into()),
- Token::new(96, 6, "чаю".into()),
- ],
- ),
- (
- "Pijamalı hasta yağız şoföre çabucak güvendi",
- vec![
- Token::new(0, 9, "pijamalı".into()),
- Token::new(10, 5, "hasta".into()),
- Token::new(16, 7, "yağız".into()),
- Token::new(24, 8, "şoföre".into()),
- Token::new(33, 8, "çabucak".into()),
- Token::new(42, 8, "güvendi".into()),
- ],
- ),
- ];
-
- for (input, tokens) in inputs.iter() {
- for (pos, token) in IndoEuropeanTokenizer::new(input, 40).enumerate() {
- assert_eq!(token, tokens[pos]);
- }
- }
- }
-}
diff --git a/crates/store/src/fts/tokenizers/japanese.rs b/crates/store/src/fts/tokenizers/japanese.rs
deleted file mode 100644
index 816ba0a3..00000000
--- a/crates/store/src/fts/tokenizers/japanese.rs
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2023, Stalwart Labs Ltd.
- *
- * This file is part of Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use std::vec::IntoIter;
-
-use super::{word::WordTokenizer, Token};
-
-pub struct JapaneseTokenizer<'x> {
- word_tokenizer: WordTokenizer<'x>,
- tokens: IntoIter<String>,
- token_offset: usize,
- token_len: usize,
- token_len_cur: usize,
- max_token_length: usize,
-}
-
-impl<'x> JapaneseTokenizer<'x> {
- pub fn new(text: &str, max_token_length: usize) -> JapaneseTokenizer {
- JapaneseTokenizer {
- word_tokenizer: WordTokenizer::new(text),
- tokens: Vec::new().into_iter(),
- max_token_length,
- token_offset: 0,
- token_len: 0,
- token_len_cur: 0,
- }
- }
-}
-
-impl<'x> Iterator for JapaneseTokenizer<'x> {
- type Item = Token<'x>;
-
- fn next(&mut self) -> Option<Self::Item> {
- loop {
- if let Some(jp_token) = self.tokens.next() {
- let offset_start = self.token_offset + self.token_len_cur;
- self.token_len_cur += jp_token.len();
-
- if jp_token.len() <= self.max_token_length {
- return Token::new(offset_start, jp_token.len(), jp_token.into()).into();
- }
- } else {
- loop {
- let (token, is_ascii) = self.word_tokenizer.next()?;
- if !is_ascii {
- self.tokens = tinysegmenter::tokenize(token.word.as_ref()).into_iter();
- self.token_offset = token.offset as usize;
- self.token_len = token.len as usize;
- self.token_len_cur = 0;
- break;
- } else if token.len as usize <= self.max_token_length {
- return token.into();
- }
- }
- }
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn japanese_tokenizer() {
- assert_eq!(
- JapaneseTokenizer::new("お先に失礼します あなたの名前は何ですか 123 abc-872", 40)
- .collect::<Vec<_>>(),
- vec![
- Token {
- word: "お先".into(),
- offset: 0,
- len: 6
- },
- Token {
- word: "に".into(),
- offset: 6,
- len: 3
- },
- Token {
- word: "失礼".into(),
- offset: 9,
- len: 6
- },
- Token {
- word: "し".into(),
- offset: 15,
- len: 3
- },
- Token {
- word: "ます".into(),
- offset: 18,
- len: 6
- },
- Token {
- word: "あなた".into(),
- offset: 25,
- len: 9
- },
- Token {
- word: "の".into(),
- offset: 34,
- len: 3
- },
- Token {
- word: "名前".into(),
- offset: 37,
- len: 6
- },
- Token {
- word: "は".into(),
- offset: 43,
- len: 3
- },
- Token {
- word: "何".into(),
- offset: 46,
- len: 3
- },
- Token {
- word: "です".into(),
- offset: 49,
- len: 6
- },
- Token {
- word: "か".into(),
- offset: 55,
- len: 3
- },
- Token {
- word: "123".into(),
- offset: 59,
- len: 3
- },
- Token {
- word: "abc".into(),
- offset: 63,
- len: 3
- },
- Token {
- word: "872".into(),
- offset: 67,
- len: 3
- }
- ]
- );
- }
-}
diff --git a/crates/store/src/fts/tokenizers/mod.rs b/crates/store/src/fts/tokenizers/mod.rs
deleted file mode 100644
index 3679b2b3..00000000
--- a/crates/store/src/fts/tokenizers/mod.rs
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2023, Stalwart Labs Ltd.
- *
- * This file is part of Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-pub mod chinese;
-pub mod indo_european;
-pub mod japanese;
-pub mod space;
-pub mod word;
-
-use std::borrow::Cow;
-
-use self::{
- chinese::ChineseTokenizer, indo_european::IndoEuropeanTokenizer, japanese::JapaneseTokenizer,
-};
-
-use super::Language;
-
-#[derive(Debug, PartialEq, Eq)]
-pub struct Token<'x> {
- pub word: Cow<'x, str>,
- pub offset: u32, // Word offset in the text part
- pub len: u8, // Word length
-}
-
-impl<'x> Token<'x> {
- pub fn new(offset: usize, len: usize, word: Cow<'x, str>) -> Token<'x> {
- debug_assert!(offset <= u32::max_value() as usize);
- debug_assert!(len <= u8::max_value() as usize);
- Token {
- offset: offset as u32,
- len: len as u8,
- word,
- }
- }
-}
-
-enum LanguageTokenizer<'x> {
- IndoEuropean(IndoEuropeanTokenizer<'x>),
- Japanese(JapaneseTokenizer<'x>),
- Chinese(ChineseTokenizer<'x>),
-}
-
-pub struct Tokenizer<'x> {
- tokenizer: LanguageTokenizer<'x>,
-}
-
-impl<'x> Tokenizer<'x> {
- pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Self {
- Tokenizer {
- tokenizer: match language {
- Language::Japanese => {
- LanguageTokenizer::Japanese(JapaneseTokenizer::new(text, max_token_length))
- }
- Language::Mandarin => {
- LanguageTokenizer::Chinese(ChineseTokenizer::new(text, max_token_length))
- }
- _ => LanguageTokenizer::IndoEuropean(IndoEuropeanTokenizer::new(
- text,
- max_token_length,
- )),
- },
- }
- }
-}
-
-impl<'x> Iterator for Tokenizer<'x> {
- type Item = Token<'x>;
-
- fn next(&mut self) -> Option<Self::Item> {
- match &mut self.tokenizer {
- LanguageTokenizer::IndoEuropean(tokenizer) => tokenizer.next(),
- LanguageTokenizer::Chinese(tokenizer) => tokenizer.next(),
- LanguageTokenizer::Japanese(tokenizer) => tokenizer.next(),
- }
- }
-}
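
The deleted `Token` stored `offset`/`len` as `u32`/`u8`; the term_index.rs hunks above cast `token.from`/`token.to` down to those widths, so the relocated `nlp::tokenizers::Token<T>` is assumed to be generic over the word type with plain `usize` span bounds. A sketch of that assumed shape and the span arithmetic the call sites now perform:

    use std::borrow::Cow;
    use nlp::tokenizers::Token;

    // Byte length of a token, as computed in the term_index.rs hunks.
    fn span_len(token: &Token<Cow<'_, str>>) -> usize {
        token.to - token.from
    }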
diff --git a/crates/store/src/fts/tokenizers/space.rs b/crates/store/src/fts/tokenizers/space.rs
deleted file mode 100644
index f3ef6891..00000000
--- a/crates/store/src/fts/tokenizers/space.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2023 Stalwart Labs Ltd.
- *
- * This file is part of the Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use std::str::Chars;
-
-pub struct SpaceTokenizer<'x> {
- iterator: Chars<'x>,
- token: String,
- max_token_length: usize,
-}
-
-impl SpaceTokenizer<'_> {
- pub fn new(text: &str, max_token_length: usize) -> SpaceTokenizer {
- SpaceTokenizer {
- iterator: text.chars(),
- token: String::new(),
- max_token_length,
- }
- }
-}
-
-impl Iterator for SpaceTokenizer<'_> {
- type Item = String;
-
- fn next(&mut self) -> Option<Self::Item> {
- for ch in self.iterator.by_ref() {
- if ch.is_alphanumeric() {
- if ch.is_uppercase() {
- for ch in ch.to_lowercase() {
- self.token.push(ch);
- }
- } else {
- self.token.push(ch);
- }
- } else if !self.token.is_empty() {
- if self.token.len() < self.max_token_length {
- return Some(std::mem::take(&mut self.token));
- } else {
- self.token.clear();
- }
- }
- }
-
- if !self.token.is_empty() {
- if self.token.len() < self.max_token_length {
- return Some(std::mem::take(&mut self.token));
- } else {
- self.token.clear();
- }
- }
-
- None
- }
-}
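
`SpaceTokenizer` is the one tokenizer the store still uses directly, now via `nlp::tokenizers::space` (see the builder.rs, filter.rs, and write.rs hunks). A sketch assuming the deleted constructor is unchanged after the move:

    use nlp::tokenizers::space::SpaceTokenizer;

    // Yields lowercased alphanumeric words shorter than the length cap.
    let words: Vec<String> = SpaceTokenizer::new("Hello WORLD 42", 40).collect();
    assert_eq!(words, ["hello", "world", "42"]);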
diff --git a/crates/store/src/fts/tokenizers/word.rs b/crates/store/src/fts/tokenizers/word.rs
deleted file mode 100644
index 3e50ba1a..00000000
--- a/crates/store/src/fts/tokenizers/word.rs
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2023, Stalwart Labs Ltd.
- *
- * This file is part of Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
-*/
-
-use std::str::CharIndices;
-
-use super::Token;
-
-pub struct WordTokenizer<'x> {
- text: &'x str,
- iterator: CharIndices<'x>,
-}
-
-impl<'x> WordTokenizer<'x> {
- pub fn new(text: &str) -> WordTokenizer {
- WordTokenizer {
- text,
- iterator: text.char_indices(),
- }
- }
-}
-
-/// Parses text into tokens, used by non-IndoEuropean tokenizers.
-impl<'x> Iterator for WordTokenizer<'x> {
- type Item = (Token<'x>, bool);
-
- fn next(&mut self) -> Option<Self::Item> {
- let mut is_ascii = true;
- while let Some((token_start, ch)) = self.iterator.next() {
- if ch.is_alphanumeric() {
- let token_end = (&mut self.iterator)
- .filter_map(|(pos, ch)| {
- if ch.is_alphanumeric() {
- if is_ascii && !ch.is_ascii() {
- is_ascii = false;
- }
- None
- } else {
- pos.into()
- }
- })
- .next()
- .unwrap_or(self.text.len());
-
- let token_len = token_end - token_start;
- if token_end > token_start {
- return (
- Token::new(
- token_start,
- token_len,
- self.text[token_start..token_end].into(),
- ),
- is_ascii,
- )
- .into();
- }
- }
- }
- None
- }
-}
diff --git a/crates/store/src/query/filter.rs b/crates/store/src/query/filter.rs
index 5b74a9ae..9e4b7109 100644
--- a/crates/store/src/query/filter.rs
+++ b/crates/store/src/query/filter.rs
@@ -24,12 +24,10 @@
use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign};
use ahash::HashSet;
+use nlp::tokenizers::space::SpaceTokenizer;
use roaring::RoaringBitmap;
-use crate::{
- fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
- BitmapKey, ReadTransaction, Store,
-};
+use crate::{fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, Store};
use super::{Filter, ResultSet, TextMatch};
diff --git a/crates/store/src/query/mod.rs b/crates/store/src/query/mod.rs
index 86f7eec9..05442caf 100644
--- a/crates/store/src/query/mod.rs
+++ b/crates/store/src/query/mod.rs
@@ -26,11 +26,10 @@ pub mod get;
pub mod log;
pub mod sort;
+use nlp::language::Language;
use roaring::RoaringBitmap;
-use crate::{
- fts::Language, write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS,
-};
+use crate::{write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Operator {
diff --git a/crates/store/src/write/mod.rs b/crates/store/src/write/mod.rs
index 48d8027a..44826133 100644
--- a/crates/store/src/write/mod.rs
+++ b/crates/store/src/write/mod.rs
@@ -23,11 +23,11 @@
use std::{collections::HashSet, slice::Iter, time::SystemTime};
+use nlp::tokenizers::space::SpaceTokenizer;
use utils::codec::leb128::{Leb128Iterator, Leb128Vec};
use crate::{
- fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
- Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC,
+ fts::builder::MAX_TOKEN_LENGTH, Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC,
};
use self::assert::AssertValue;