summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Dillinger <peterd@meta.com>2024-07-17 14:08:35 -0700
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>2024-07-17 14:08:35 -0700
commit93b163d1a2d6c5a003a55ef3de5e802beb0ba7f9 (patch)
tree2d5cac0434b16c689b98e5dd4bc02b9d3232fa36
parent21db55f8164d2a6519dcc993f74bf7f49c700854 (diff)
Fix major bug with prefixes, SeekForPrev, and partitioned filters (#12872)
Summary: Basically, the fix in https://github.com/facebook/rocksdb/issues/8137 was incomplete (and I missed it in the review), because if `whole_key_filtering` is false, then `last_prefix_str_` will never be set to non-empty and the fix doesn't work. Also related to https://github.com/facebook/rocksdb/issues/5835. This is intended as a safe, simple fix that will regress CPU efficiency slightly (for `whole_key_filtering=false` cases, because of extra prefix string copies during flush & compaction). An efficient fix is not possible without some substantial refactoring. Also in this PR: new test DBBloomFilterTest.FilterNumEntriesCoalesce tests an adjacent code path that was previously untested for its effect of ensuring the number of unique prefixes and keys is tracked properly when both prefixes and whole keys are going into a filter. (Test fails when either of the two code segments checking for duplicates is disabled.) In addition, the same test would fail before the main bug fix here because the code would inappropriately add the empty string to the filter (because of unmodified `last_prefix_str_`). Pull Request resolved: https://github.com/facebook/rocksdb/pull/12872 Test Plan: In addition to DBBloomFilterTest.FilterNumEntriesCoalesce, extended DBBloomFilterTest.SeekForPrevWithPartitionedFilters to cover the broken case. (Mostly whitespace change.) Reviewed By: jowlyzhang Differential Revision: D59873793 Pulled By: pdillinger fbshipit-source-id: 2a7b7f09ca73dc188fb4dab833826ad6da7ebb11
-rw-r--r--db/db_bloom_filter_test.cc150
-rw-r--r--table/block_based/full_filter_block.cc9
-rw-r--r--table/block_based/partitioned_filter_block.cc2
-rw-r--r--unreleased_history/bug_fixes/0seek_for_prev_prefix.md1
4 files changed, 117 insertions, 45 deletions
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index a13820535..83cfe32a4 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -322,6 +322,67 @@ TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) {
}
}
+TEST_F(DBBloomFilterTest, FilterNumEntriesCoalesce) {
+ for (bool partition_filters : {true, false}) {
+ SCOPED_TRACE("partition_filters=" + std::to_string(partition_filters));
+ for (bool prefix : {true, false}) {
+ SCOPED_TRACE("prefix=" + std::to_string(prefix));
+ for (bool whole : {true, false}) {
+ SCOPED_TRACE("whole=" + std::to_string(whole));
+ Options options = last_options_;
+ options.prefix_extractor.reset();
+ if (prefix) {
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ }
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = whole;
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Need a snapshot to allow keeping multiple entries for the same key
+ std::vector<const Snapshot*> snapshots;
+ for (int i = 1; i <= 3; ++i) {
+ std::string val = "val" + std::to_string(i);
+ ASSERT_OK(Put("foo1", val));
+ ASSERT_OK(Put("foo2", val));
+ ASSERT_OK(Put("bar1", val));
+ ASSERT_OK(Put("bar2", val));
+ ASSERT_OK(Put("bar3", val));
+ snapshots.push_back(db_->GetSnapshot());
+ }
+ ASSERT_OK(Flush());
+
+ TablePropertiesCollection tpc;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&tpc));
+ // sanity checks
+ ASSERT_EQ(tpc.size(), 1U);
+ auto& tp = *tpc.begin()->second;
+ EXPECT_EQ(tp.num_entries, 3U * 5U);
+
+ // test checks
+ unsigned ex_filter_entries = 0;
+ if (whole) {
+ ex_filter_entries += 5; // unique keys
+ }
+ if (prefix) {
+ ex_filter_entries += 2; // unique prefixes
+ }
+ EXPECT_EQ(tp.num_filter_entries, ex_filter_entries);
+
+ for (auto* sn : snapshots) {
+ db_->ReleaseSnapshot(sn);
+ }
+ }
+ }
+ }
+}
+
TEST_F(DBBloomFilterTest, WholeKeyFilterProp) {
for (bool partition_filters : {true, false}) {
Options options = last_options_;
@@ -3143,52 +3204,55 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
}
TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) {
- Options options = CurrentOptions();
- constexpr size_t kNumKeys = 10000;
- static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000");
- options.memtable_factory.reset(
- test::NewSpecialSkipListFactory(kNumKeys + 10));
- options.create_if_missing = true;
- constexpr size_t kPrefixLength = 4;
- options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength));
- options.compression = kNoCompression;
- BlockBasedTableOptions bbto;
- bbto.filter_policy.reset(NewBloomFilterPolicy(50));
- bbto.index_shortening =
- BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
- bbto.block_size = 128;
- bbto.metadata_block_size = 128;
- bbto.partition_filters = true;
- bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
- options.table_factory.reset(NewBlockBasedTableFactory(bbto));
- DestroyAndReopen(options);
+ for (bool wkf : {true, false}) {
+ SCOPED_TRACE("whole_key_filtering=" + std::to_string(wkf));
+ Options options = CurrentOptions();
+ constexpr size_t kNumKeys = 10000;
+ static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000");
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeys + 10));
+ options.create_if_missing = true;
+ constexpr size_t kPrefixLength = 4;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength));
+ options.compression = kNoCompression;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(50));
+ bbto.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ bbto.block_size = 128;
+ bbto.metadata_block_size = 128;
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ bbto.whole_key_filtering = wkf;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
- const std::string value(64, '\0');
+ const std::string value(64, '\0');
- WriteOptions write_opts;
- write_opts.disableWAL = true;
- for (size_t i = 0; i < kNumKeys; ++i) {
- std::ostringstream oss;
- oss << std::setfill('0') << std::setw(4) << std::fixed << i;
- ASSERT_OK(db_->Put(write_opts, oss.str(), value));
- }
- ASSERT_OK(Flush());
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ std::ostringstream oss;
+ oss << std::setfill('0') << std::setw(4) << std::fixed << i;
+ ASSERT_OK(db_->Put(write_opts, oss.str(), value));
+ }
+ ASSERT_OK(Flush());
- ReadOptions read_opts;
- // Use legacy, implicit prefix seek
- read_opts.total_order_seek = false;
- read_opts.auto_prefix_mode = false;
- std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
- for (size_t i = 0; i < kNumKeys; ++i) {
- // Seek with a key after each one added but with same prefix. One will
- // surely cross a partition boundary.
- std::ostringstream oss;
- oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
- it->SeekForPrev(oss.str());
- ASSERT_OK(it->status());
- ASSERT_TRUE(it->Valid());
- }
- it.reset();
+ ReadOptions read_opts;
+ // Use legacy, implicit prefix seek
+ read_opts.total_order_seek = false;
+ read_opts.auto_prefix_mode = false;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ // Seek with a key after each one added but with same prefix. One will
+ // surely cross a partition boundary.
+ std::ostringstream oss;
+ oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
+ it->SeekForPrev(oss.str());
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(it->Valid());
+ }
+ }
}
namespace {
diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc
index 8f31d9e83..c6f2a624a 100644
--- a/table/block_based/full_filter_block.cc
+++ b/table/block_based/full_filter_block.cc
@@ -82,7 +82,14 @@ inline void FullFilterBlockBuilder::AddKey(const Slice& key) {
void FullFilterBlockBuilder::AddPrefix(const Slice& key) {
assert(prefix_extractor_ && prefix_extractor_->InDomain(key));
Slice prefix = prefix_extractor_->Transform(key);
- if (whole_key_filtering_) {
+ if (true /*WART*/ || whole_key_filtering_) {
+ // WART/FIXME: Because last_prefix_str_ is needed above to make
+ // SeekForPrev work, we are currently using this inefficient code always.
+ // Hopefully this can be optimized with some refactoring up the call chain
+ // to BlockBasedTableBuilder. Even in PartitionedFilterBlockBuilder, we
+ // don't currently have access to the previous key/prefix by the time we
+ // know we are starting a new partition.
+
// if both whole_key and prefix are added to bloom then we will have whole
// key and prefix addition being interleaved and thus cannot rely on the
// bits builder to properly detect the duplicates by comparing with the last
diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index 6df8df92b..e8d736ab9 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -86,7 +86,7 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
}
// Add the prefix of the next key before finishing the partition without
- // updating last_prefix_str_. This hack, fixes a bug with format_verison=3
+ // updating last_prefix_str_. This hack fixes a bug with format_verison=3
// where seeking for the prefix would lead us to the previous partition.
const bool maybe_add_prefix =
next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key);
diff --git a/unreleased_history/bug_fixes/0seek_for_prev_prefix.md b/unreleased_history/bug_fixes/0seek_for_prev_prefix.md
new file mode 100644
index 000000000..24c2360a5
--- /dev/null
+++ b/unreleased_history/bug_fixes/0seek_for_prev_prefix.md
@@ -0,0 +1 @@
+* Fix a major bug in which an iterator using prefix filtering and SeekForPrev might miss data when the DB is using `whole_key_filtering=false` and `partition_filters=true`.