author     mdecimus <mauro@stalw.art>    2023-12-04 10:23:10 +0100
committer  mdecimus <mauro@stalw.art>    2023-12-04 10:23:10 +0100
commit     a7acc67cf157e635f8a498a92119dcf6fc5cd29c (patch)
tree       8ff4c93008aff584cc7c5d9457b935a0a6187739 /crates/store/src/fts
parent     7e94a08067b3273f78fd727cb01ccf878ebc84a0 (diff)
FTS store bigrams in kv rather than index
Diffstat (limited to 'crates/store/src/fts')
-rw-r--r--  crates/store/src/fts/index.rs   32
-rw-r--r--  crates/store/src/fts/query.rs  115
2 files changed, 113 insertions(+), 34 deletions(-)
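
This commit moves full-text bigram (phrase) data out of the bitmap index and into the per-document term-index value: instead of writing one `BitmapClass::bigram` key per word pair, the indexer collects the 8-byte `BitmapHash` of every bigram into a `BTreeSet` and serializes the set at the front of the value as a LEB128 count followed by the sorted hashes; the token index entries follow, and the stored value is LZ4-compressed. The sketch below illustrates only that prefix layout, assuming simplified stand-ins (`write_uleb128`, `read_uleb128`, `serialize_bigrams`, `deserialize_bigrams`) for the repository's `KeySerializer::write_leb128` and `Leb128Reader::read_leb128` helpers, and omitting the `lz4_flex` compression step.

// Sketch of the bigram prefix written in front of the term index:
// a LEB128 item count followed by `count` fixed 8-byte bigram hashes.
// The LEB128 helpers are simplified stand-ins, not the crate's own API.
use std::collections::BTreeSet;

fn write_uleb128(out: &mut Vec<u8>, mut value: usize) {
    loop {
        let mut byte = (value & 0x7f) as u8;
        value >>= 7;
        if value != 0 {
            byte |= 0x80; // more bytes follow
        }
        out.push(byte);
        if value == 0 {
            break;
        }
    }
}

fn read_uleb128(bytes: &[u8]) -> Option<(usize, usize)> {
    let (mut value, mut pos) = (0usize, 0usize);
    loop {
        let byte = *bytes.get(pos)?;
        value |= ((byte & 0x7f) as usize) << (7 * pos);
        pos += 1;
        if byte & 0x80 == 0 {
            return Some((value, pos)); // (decoded value, bytes consumed)
        }
    }
}

// Write side: count + raw hashes, already sorted because they come from a BTreeSet.
fn serialize_bigrams(bigrams: &BTreeSet<[u8; 8]>) -> Vec<u8> {
    let mut out = Vec::with_capacity(1 + bigrams.len() * 8);
    write_uleb128(&mut out, bigrams.len());
    for hash in bigrams {
        out.extend_from_slice(hash);
    }
    out
}

// Read side: mirrors BigramIndex::deserialize, and the "skip bigrams" step of
// TermIndex::deserialize simply jumps over `pos + count * 8` bytes.
fn deserialize_bigrams(bytes: &[u8]) -> Option<Vec<[u8; 8]>> {
    let (count, pos) = read_uleb128(bytes)?;
    let grams = bytes.get(pos..pos + count * 8)?;
    Some(grams.chunks_exact(8).map(|c| c.try_into().unwrap()).collect())
}

Keeping the hashes sorted is what lets the query side verify a phrase with one binary search per bigram instead of one bitmap fetch per bigram.
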
diff --git a/crates/store/src/fts/index.rs b/crates/store/src/fts/index.rs
index 596b737e..6894b2b9 100644
--- a/crates/store/src/fts/index.rs
+++ b/crates/store/src/fts/index.rs
@@ -21,7 +21,7 @@
  * for more details.
  */
 
-use std::{borrow::Cow, fmt::Display};
+use std::{borrow::Cow, collections::BTreeSet, fmt::Display};
 
 use ahash::{AHashMap, AHashSet};
 use nlp::{
@@ -32,6 +32,7 @@ use nlp::{
     },
     tokenizers::word::WordTokenizer,
 };
+use utils::codec::leb128::Leb128Reader;
 
 use crate::{
     backend::MAX_TOKEN_LENGTH,
@@ -170,6 +171,7 @@ impl Store {
         let default_language = detect
             .most_frequent_language()
             .unwrap_or(document.default_language);
+        let mut bigrams = BTreeSet::new();
 
         for (field, language, text) in parts.into_iter() {
             let language = if language != Language::Unknown {
@@ -182,10 +184,7 @@ impl Store {
             let mut last_token = Cow::Borrowed("");
             for token in Stemmer::new(&text, language, MAX_TOKEN_LENGTH) {
                 if !last_token.is_empty() {
-                    tokens
-                        .entry(BitmapHash::new(&format!("{} {}", last_token, token.word)))
-                        .or_default()
-                        .insert(TokenType::bigram(field));
+                    bigrams.insert(BitmapHash::new(&format!("{} {}", last_token, token.word)).hash);
                 }
 
                 tokens
@@ -212,6 +211,13 @@ impl Store {
         let mut serializer = KeySerializer::new(tokens.len() * U64_LEN * 2);
         let mut keys = Vec::with_capacity(tokens.len());
 
+        // Write bigrams
+        serializer = serializer.write_leb128(bigrams.len());
+        for bigram in bigrams {
+            serializer = serializer.write(bigram.as_slice());
+        }
+
+        // Write index keys
         for (hash, fields) in tokens.into_iter() {
             serializer = serializer
                 .write(hash.hash.as_slice())
@@ -336,7 +342,21 @@ impl Deserialize for TermIndex {
         let bytes = lz4_flex::decompress_size_prepended(bytes)
             .map_err(|_| Error::InternalError("Failed to decompress term index".to_string()))?;
         let mut ops = Vec::new();
-        let mut bytes = bytes.iter().peekable();
+
+        // Skip bigrams
+        let (num_items, pos) =
+            bytes
+                .as_slice()
+                .read_leb128::<usize>()
+                .ok_or(Error::InternalError(
+                    "Failed to read term index marker".to_string(),
+                ))?;
+
+        let mut bytes = bytes
+            .get(pos + (num_items * 8)..)
+            .unwrap_or_default()
+            .iter()
+            .peekable();
 
         while bytes.peek().is_some() {
             let mut hash = BitmapHash {
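
On the query side (next diff), an exact match no longer looks up per-bigram bitmaps. The filter intersects the ordinary word bitmaps and, when the phrase has more than one token, confirms each candidate document by loading its `BigramIndex` value (`ValueClass::TermIndex`) and requiring every query bigram to be present via binary search. A minimal sketch of that verification step, assuming hypothetical standalone types in place of `Store`, `RoaringBitmap` and the async `get_value` call:

// Hypothetical, self-contained model of the phrase verification added in query.rs.

/// Per-document bigram hashes as stored by the indexer (sorted, from a BTreeSet).
struct DocBigrams {
    grams: Vec<[u8; 8]>,
}

impl DocBigrams {
    /// True when every query bigram is present; mirrors the
    /// `bigrams.iter().all(|b| grams.binary_search(b).is_ok())` check.
    fn contains_all(&self, wanted: &[[u8; 8]]) -> bool {
        wanted.iter().all(|b| self.grams.binary_search(b).is_ok())
    }
}

/// Keep only the intersection candidates whose stored bigrams cover the whole phrase.
fn filter_phrase_candidates(
    candidates: Vec<(u32, DocBigrams)>, // (document_id, its stored bigram index)
    query_bigrams: &[[u8; 8]],
) -> Vec<u32> {
    candidates
        .into_iter()
        .filter(|(_, doc)| doc.contains_all(query_bigrams))
        .map(|(id, _)| id)
        .collect()
}

Single-token queries take the plain intersection path (the `Ordering::Equal` arm), and an empty token list short-circuits to `None`.
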
diff --git a/crates/store/src/fts/query.rs b/crates/store/src/fts/query.rs
index 6be67a75..e549956e 100644
--- a/crates/store/src/fts/query.rs
+++ b/crates/store/src/fts/query.rs
@@ -22,20 +22,32 @@
  */
 
 use std::{
+    borrow::Cow,
     fmt::Display,
     ops::{BitAndAssign, BitOrAssign, BitXorAssign},
 };
 
+use ahash::AHashSet;
 use nlp::language::stemmer::Stemmer;
 use roaring::RoaringBitmap;
+use utils::codec::leb128::Leb128Reader;
 
-use crate::{backend::MAX_TOKEN_LENGTH, fts::FtsFilter, write::BitmapClass, BitmapKey, Store};
+use crate::{
+    backend::MAX_TOKEN_LENGTH,
+    fts::FtsFilter,
+    write::{BitmapClass, BitmapHash, ValueClass},
+    BitmapKey, Deserialize, Error, Store, ValueKey,
+};
 
 struct State<T: Into<u8> + Display + Clone + std::fmt::Debug> {
     pub op: FtsFilter<T>,
     pub bm: Option<RoaringBitmap>,
 }
 
+struct BigramIndex {
+    grams: Vec<[u8; 8]>,
+}
+
 impl Store {
     pub async fn fts_query<T: Into<u8> + Display + Clone + std::fmt::Debug>(
         &self,
@@ -59,34 +71,60 @@ impl Store {
                     language,
                 } => {
                     let field: u8 = field.clone().into();
 
+                    let mut keys = Vec::new();
+                    let mut bigrams = AHashSet::new();
+                    let mut last_token = Cow::Borrowed("");
+                    for token in language.tokenize_text(text.as_ref(), MAX_TOKEN_LENGTH) {
+                        keys.push(BitmapKey {
+                            account_id,
+                            collection,
+                            class: BitmapClass::word(token.word.as_ref(), field),
+                            block_num: 0,
+                        });
-                    let tokens = language
-                        .tokenize_text(text.as_ref(), MAX_TOKEN_LENGTH)
-                        .map(|t| t.word)
-                        .collect::<Vec<_>>();
-                    let keys = if tokens.len() > 1 {
-                        tokens
-                            .windows(2)
-                            .map(|bg| BitmapKey {
-                                account_id,
-                                collection,
-                                class: BitmapClass::bigram(format!("{} {}", bg[0], bg[1]), field),
-                                block_num: 0,
-                            })
-                            .collect::<Vec<_>>()
-                    } else {
-                        tokens
-                            .into_iter()
-                            .map(|word| BitmapKey {
-                                account_id,
-                                collection,
-                                class: BitmapClass::word(word.as_ref(), field),
-                                block_num: 0,
-                            })
-                            .collect::<Vec<_>>()
-                    };
-
-                    self.get_bitmaps_intersection(keys).await?
+                        if !last_token.is_empty() {
+                            bigrams.insert(
+                                BitmapHash::new(&format!("{} {}", last_token, token.word)).hash,
+                            );
+                        }
+
+                        last_token = token.word;
+                    }
+
+                    match keys.len().cmp(&1) {
+                        std::cmp::Ordering::Less => None,
+                        std::cmp::Ordering::Equal => self.get_bitmaps_intersection(keys).await?,
+                        std::cmp::Ordering::Greater => {
+                            if let Some(document_ids) = self.get_bitmaps_intersection(keys).await? {
+                                let mut results = RoaringBitmap::new();
+                                for document_id in document_ids {
+                                    if let Some(bigram_index) = self
+                                        .get_value::<BigramIndex>(ValueKey {
+                                            account_id,
+                                            collection,
+                                            document_id,
+                                            class: ValueClass::TermIndex,
+                                        })
+                                        .await?
+                                    {
+                                        if bigrams.iter().all(|bigram| {
+                                            bigram_index.grams.binary_search(bigram).is_ok()
+                                        }) {
+                                            results.insert(document_id);
+                                        }
+                                    }
+                                }
+
+                                if !results.is_empty() {
+                                    Some(results)
+                                } else {
+                                    None
+                                }
+                            } else {
+                                None
+                            }
+                        }
+                    }
                 }
                 FtsFilter::Contains {
                     field,
@@ -238,6 +276,27 @@ impl Store {
     }
 }
 
+impl Deserialize for BigramIndex {
+    fn deserialize(bytes: &[u8]) -> crate::Result<Self> {
+        let bytes = lz4_flex::decompress_size_prepended(bytes)
+            .map_err(|_| Error::InternalError("Failed to decompress term index".to_string()))?;
+
+        let (num_items, pos) = bytes.read_leb128::<usize>().ok_or(Error::InternalError(
+            "Failed to read term index marker".to_string(),
+        ))?;
+
+        bytes
+            .get(pos..pos + (num_items * 8))
+            .map(|bytes| Self {
+                grams: bytes
+                    .chunks_exact(8)
+                    .map(|chunk| chunk.try_into().unwrap())
+                    .collect(),
+            })
+            .ok_or_else(|| Error::InternalError("Failed to read term index".to_string()))
+    }
+}
+
 impl<T: Into<u8> + Display + Clone + std::fmt::Debug> From<FtsFilter<T>> for State<T> {
     fn from(value: FtsFilter<T>) -> Self {
         Self {