diff options
author | mdecimus <mauro@stalw.art> | 2023-12-04 10:23:10 +0100 |
---|---|---|
committer | mdecimus <mauro@stalw.art> | 2023-12-04 10:23:10 +0100 |
commit | a7acc67cf157e635f8a498a92119dcf6fc5cd29c (patch) | |
tree | 8ff4c93008aff584cc7c5d9457b935a0a6187739 /crates/store/src/fts | |
parent | 7e94a08067b3273f78fd727cb01ccf878ebc84a0 (diff) |
FTS store bigrams in kv rather than index
Diffstat (limited to 'crates/store/src/fts')
-rw-r--r-- | crates/store/src/fts/index.rs | 32 | ||||
-rw-r--r-- | crates/store/src/fts/query.rs | 115 |
2 files changed, 113 insertions, 34 deletions
diff --git a/crates/store/src/fts/index.rs b/crates/store/src/fts/index.rs index 596b737e..6894b2b9 100644 --- a/crates/store/src/fts/index.rs +++ b/crates/store/src/fts/index.rs @@ -21,7 +21,7 @@ * for more details. */ -use std::{borrow::Cow, fmt::Display}; +use std::{borrow::Cow, collections::BTreeSet, fmt::Display}; use ahash::{AHashMap, AHashSet}; use nlp::{ @@ -32,6 +32,7 @@ use nlp::{ }, tokenizers::word::WordTokenizer, }; +use utils::codec::leb128::Leb128Reader; use crate::{ backend::MAX_TOKEN_LENGTH, @@ -170,6 +171,7 @@ impl Store { let default_language = detect .most_frequent_language() .unwrap_or(document.default_language); + let mut bigrams = BTreeSet::new(); for (field, language, text) in parts.into_iter() { let language = if language != Language::Unknown { @@ -182,10 +184,7 @@ impl Store { let mut last_token = Cow::Borrowed(""); for token in Stemmer::new(&text, language, MAX_TOKEN_LENGTH) { if !last_token.is_empty() { - tokens - .entry(BitmapHash::new(&format!("{} {}", last_token, token.word))) - .or_default() - .insert(TokenType::bigram(field)); + bigrams.insert(BitmapHash::new(&format!("{} {}", last_token, token.word)).hash); } tokens @@ -212,6 +211,13 @@ impl Store { let mut serializer = KeySerializer::new(tokens.len() * U64_LEN * 2); let mut keys = Vec::with_capacity(tokens.len()); + // Write bigrams + serializer = serializer.write_leb128(bigrams.len()); + for bigram in bigrams { + serializer = serializer.write(bigram.as_slice()); + } + + // Write index keys for (hash, fields) in tokens.into_iter() { serializer = serializer .write(hash.hash.as_slice()) @@ -336,7 +342,21 @@ impl Deserialize for TermIndex { let bytes = lz4_flex::decompress_size_prepended(bytes) .map_err(|_| Error::InternalError("Failed to decompress term index".to_string()))?; let mut ops = Vec::new(); - let mut bytes = bytes.iter().peekable(); + + // Skip bigrams + let (num_items, pos) = + bytes + .as_slice() + .read_leb128::<usize>() + .ok_or(Error::InternalError( + "Failed to read term index marker".to_string(), + ))?; + + let mut bytes = bytes + .get(pos + (num_items * 8)..) + .unwrap_or_default() + .iter() + .peekable(); while bytes.peek().is_some() { let mut hash = BitmapHash { diff --git a/crates/store/src/fts/query.rs b/crates/store/src/fts/query.rs index 6be67a75..e549956e 100644 --- a/crates/store/src/fts/query.rs +++ b/crates/store/src/fts/query.rs @@ -22,20 +22,32 @@ */ use std::{ + borrow::Cow, fmt::Display, ops::{BitAndAssign, BitOrAssign, BitXorAssign}, }; +use ahash::AHashSet; use nlp::language::stemmer::Stemmer; use roaring::RoaringBitmap; +use utils::codec::leb128::Leb128Reader; -use crate::{backend::MAX_TOKEN_LENGTH, fts::FtsFilter, write::BitmapClass, BitmapKey, Store}; +use crate::{ + backend::MAX_TOKEN_LENGTH, + fts::FtsFilter, + write::{BitmapClass, BitmapHash, ValueClass}, + BitmapKey, Deserialize, Error, Store, ValueKey, +}; struct State<T: Into<u8> + Display + Clone + std::fmt::Debug> { pub op: FtsFilter<T>, pub bm: Option<RoaringBitmap>, } +struct BigramIndex { + grams: Vec<[u8; 8]>, +} + impl Store { pub async fn fts_query<T: Into<u8> + Display + Clone + std::fmt::Debug>( &self, @@ -59,34 +71,60 @@ impl Store { language, } => { let field: u8 = field.clone().into(); + let mut keys = Vec::new(); + let mut bigrams = AHashSet::new(); + let mut last_token = Cow::Borrowed(""); + for token in language.tokenize_text(text.as_ref(), MAX_TOKEN_LENGTH) { + keys.push(BitmapKey { + account_id, + collection, + class: BitmapClass::word(token.word.as_ref(), field), + block_num: 0, + }); - let tokens = language - .tokenize_text(text.as_ref(), MAX_TOKEN_LENGTH) - .map(|t| t.word) - .collect::<Vec<_>>(); - let keys = if tokens.len() > 1 { - tokens - .windows(2) - .map(|bg| BitmapKey { - account_id, - collection, - class: BitmapClass::bigram(format!("{} {}", bg[0], bg[1]), field), - block_num: 0, - }) - .collect::<Vec<_>>() - } else { - tokens - .into_iter() - .map(|word| BitmapKey { - account_id, - collection, - class: BitmapClass::word(word.as_ref(), field), - block_num: 0, - }) - .collect::<Vec<_>>() - }; - - self.get_bitmaps_intersection(keys).await? + if !last_token.is_empty() { + bigrams.insert( + BitmapHash::new(&format!("{} {}", last_token, token.word)).hash, + ); + } + + last_token = token.word; + } + + match keys.len().cmp(&1) { + std::cmp::Ordering::Less => None, + std::cmp::Ordering::Equal => self.get_bitmaps_intersection(keys).await?, + std::cmp::Ordering::Greater => { + if let Some(document_ids) = self.get_bitmaps_intersection(keys).await? { + let mut results = RoaringBitmap::new(); + for document_id in document_ids { + if let Some(bigram_index) = self + .get_value::<BigramIndex>(ValueKey { + account_id, + collection, + document_id, + class: ValueClass::TermIndex, + }) + .await? + { + if bigrams.iter().all(|bigram| { + bigram_index.grams.binary_search(bigram).is_ok() + }) { + results.insert(document_id); + } + } + } + + if !results.is_empty() { + Some(results) + } else { + None + } + } else { + None + } + } + } } FtsFilter::Contains { field, @@ -238,6 +276,27 @@ impl Store { } } +impl Deserialize for BigramIndex { + fn deserialize(bytes: &[u8]) -> crate::Result<Self> { + let bytes = lz4_flex::decompress_size_prepended(bytes) + .map_err(|_| Error::InternalError("Failed to decompress term index".to_string()))?; + + let (num_items, pos) = bytes.read_leb128::<usize>().ok_or(Error::InternalError( + "Failed to read term index marker".to_string(), + ))?; + + bytes + .get(pos..pos + (num_items * 8)) + .map(|bytes| Self { + grams: bytes + .chunks_exact(8) + .map(|chunk| chunk.try_into().unwrap()) + .collect(), + }) + .ok_or_else(|| Error::InternalError("Failed to read term index".to_string())) + } +} + impl<T: Into<u8> + Display + Clone + std::fmt::Debug> From<FtsFilter<T>> for State<T> { fn from(value: FtsFilter<T>) -> Self { Self { |