author     Peter Dillinger <peterd@meta.com>  2024-09-20 15:54:19 -0700
committer  Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>  2024-09-20 15:54:19 -0700
commit     a1a102ffcefab83e75b9d026d204395d00f2a582 (patch)
tree       3f07fbbd3f4ac45edba3aba982a33354dd9152a0
parent     5f4a8c3da41327e9844d8d14793d34a0ec3453b9 (diff)
Steps toward deprecating implicit prefix seek, related fixes (#13026)
Summary:
With some new use cases onboarding to prefix extractors/seek/filters, one of the risks is existing iterator code, e.g. for maintenance tasks, being unintentionally subject to prefix seek semantics. This is a longstanding known design flaw with prefix seek, and `prefix_same_as_start` and `auto_prefix_mode` were steps in the direction of making it obsolete. However, we can't just immediately set `total_order_seek` to true by default, because that would impact so much code instantly.

Here we add a new DB option, `prefix_seek_opt_in_only`, which allows users to transition to the future behavior when they are ready. When set to true, all iterators are treated as if `total_order_seek=true`, and the only ways to get prefix seek semantics are with `prefix_same_as_start` or `auto_prefix_mode`.

Related fixes / changes:

* Make sure that `prefix_same_as_start` and `auto_prefix_mode` are compatible with (or override) `total_order_seek` (depending on your interpretation).
* Fix a bug in which a new iterator created after dynamically changing the prefix extractor might mix different prefix semantics between memtable and SSTs. Both should use the latest extractor semantics, which means iterators ignore memtable prefix filters built with an old extractor, and the latest prefix extractor is passed to new memtable iterators that might use prefix seek. (Without the fix, the test added for this fails in many ways.)

Suggested follow-up:

* Investigate a FIXME where a MergeIteratorBuilder is created in db_impl.cc. No unit test detects a change in value that should impact correctness.
* Make memtable prefix bloom compatible with `auto_prefix_mode`, which might require involving the memtablereps because we don't know at iterator creation time (only seek time) whether an auto_prefix_mode seek will be a prefix seek.
* Add `prefix_same_as_start` testing to db_stress

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13026

Test Plan: Tests updated and added. Added the combination of `total_order_seek=true` and `auto_prefix_mode=true` to the stress test. Ran `make blackbox_crash_test` for a long while. Manually ran tests with `prefix_seek_opt_in_only=true` as default, looking for unexpected issues; I inspected most of the results and migrated many tests to be ready for such a change (but not all).

Reviewed By: ltamasi

Differential Revision: D63147378

Pulled By: pdillinger

fbshipit-source-id: 1f4477b730683d43b4be7e933338583702d3c25e
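Usage sketch (not taken from the commit): a minimal example of opting in to the new behavior, assuming the options land as in this patch; the database path and keys are placeholder values.

    // Sketch: opting in to non-implicit prefix seek.
    #include <cassert>
    #include <memory>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/slice_transform.h"

    namespace rdb = ROCKSDB_NAMESPACE;

    int main() {
      rdb::Options options;
      options.create_if_missing = true;
      options.prefix_extractor.reset(rdb::NewFixedPrefixTransform(4));
      // Opt in to the future default: iterators act as if total_order_seek=true
      // unless prefix seek is requested per ReadOptions.
      options.prefix_seek_opt_in_only = true;

      rdb::DB* db = nullptr;
      rdb::Status s = rdb::DB::Open(options, "/tmp/prefix_opt_in_demo", &db);
      assert(s.ok());
      s = db->Put(rdb::WriteOptions(), "goat1", "g1");
      assert(s.ok());
      s = db->Put(rdb::WriteOptions(), "pony1", "p1");
      assert(s.ok());

      // Plain iterator: no implicit prefix seek, so a maintenance-style scan is
      // not silently truncated at a prefix boundary.
      std::unique_ptr<rdb::Iterator> it(db->NewIterator(rdb::ReadOptions()));
      for (it->Seek("goat1"); it->Valid(); it->Next()) {
        // visits "goat1", then "pony1"
      }

      // Explicit opt-in to prefix seek semantics.
      rdb::ReadOptions ropts;
      ropts.prefix_same_as_start = true;
      std::unique_ptr<rdb::Iterator> prefix_it(db->NewIterator(ropts));
      for (prefix_it->Seek("goat1"); prefix_it->Valid(); prefix_it->Next()) {
        // bounded to the 4-byte prefix "goat"
      }

      delete db;
      return 0;
    }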
-rw-r--r--  db/arena_wrapped_db_iter.cc  21
-rw-r--r--  db/column_family.cc  4
-rw-r--r--  db/db_bloom_filter_test.cc  202
-rw-r--r--  db/db_impl/db_impl.cc  17
-rw-r--r--  db/db_impl/db_impl_open.cc  3
-rw-r--r--  db/db_iter.cc  8
-rw-r--r--  db/db_test2.cc  58
-rw-r--r--  db/db_with_timestamp_basic_test.cc  2
-rw-r--r--  db/flush_job.cc  8
-rw-r--r--  db/forward_iterator.cc  16
-rw-r--r--  db/memtable.cc  38
-rw-r--r--  db/memtable.h  3
-rw-r--r--  db/memtable_list.cc  11
-rw-r--r--  db/memtable_list.h  2
-rw-r--r--  db/repair.cc  3
-rw-r--r--  db/write_batch_test.cc  2
-rw-r--r--  db_stress_tool/db_stress_test_base.cc  2
-rw-r--r--  include/rocksdb/options.h  19
-rw-r--r--  java/rocksjni/write_batch_test.cc  3
-rw-r--r--  options/db_options.cc  7
-rw-r--r--  options/db_options.h  1
-rw-r--r--  table/block_based/block_based_table_reader.cc  8
-rw-r--r--  table/plain/plain_table_reader.cc  6
-rw-r--r--  table/table_test.cc  7
-rw-r--r--  unreleased_history/bug_fixes/memtable_prefix.md  1
-rw-r--r--  unreleased_history/new_features/prefix_seek_opt_in_only.md  1
26 files changed, 325 insertions, 128 deletions
diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc
index 83cc3cfa5..22be567fd 100644
--- a/db/arena_wrapped_db_iter.cc
+++ b/db/arena_wrapped_db_iter.cc
@@ -45,20 +45,23 @@ void ArenaWrappedDBIter::Init(
const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
uint64_t version_number, ReadCallback* read_callback,
ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) {
- auto mem = arena_.AllocateAligned(sizeof(DBIter));
- db_iter_ = new (mem) DBIter(
- env, read_options, ioptions, mutable_cf_options, ioptions.user_comparator,
- /* iter */ nullptr, version, sequence, true,
- max_sequential_skip_in_iteration, read_callback, cfh, expose_blob_index);
- sv_number_ = version_number;
read_options_ = read_options;
- allow_refresh_ = allow_refresh;
- memtable_range_tombstone_iter_ = nullptr;
-
if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
FSSupportedOps::kAsyncIO)) {
read_options_.async_io = false;
}
+ read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only;
+
+ auto mem = arena_.AllocateAligned(sizeof(DBIter));
+ db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options,
+ ioptions.user_comparator,
+ /* iter */ nullptr, version, sequence, true,
+ max_sequential_skip_in_iteration, read_callback,
+ cfh, expose_blob_index);
+
+ sv_number_ = version_number;
+ allow_refresh_ = allow_refresh;
+ memtable_range_tombstone_iter_ = nullptr;
}
Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); }
diff --git a/db/column_family.cc b/db/column_family.cc
index 2b611fda7..8abad2941 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -1201,8 +1201,10 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
read_opts.total_order_seek = true;
MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
merge_iter_builder.AddIterator(super_version->mem->NewIterator(
- read_opts, /*seqno_to_time_mapping=*/nullptr, &arena));
+ read_opts, /*seqno_to_time_mapping=*/nullptr, &arena,
+ /*prefix_extractor=*/nullptr));
super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr,
+ /*prefix_extractor=*/nullptr,
&merge_iter_builder,
false /* add_range_tombstone_iter */);
ScopedArenaPtr<InternalIterator> memtable_iter(merge_iter_builder.Finish());
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index 7b42f4ee5..f0b3c4066 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -171,7 +171,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
CreateAndReopenWithCF({"pikachu"}, options);
- ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
ASSERT_OK(Put(1, "a", "b"));
bool value_found = false;
@@ -187,7 +187,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
ASSERT_TRUE(
db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
- ASSERT_TRUE(!value_found);
+ ASSERT_FALSE(value_found);
// assert that no new files were opened and no new blocks were
// read into block cache.
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
@@ -197,7 +197,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
numopen = TestGetTickerCount(options, NO_FILE_OPENS);
cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
- ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
@@ -207,7 +207,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
numopen = TestGetTickerCount(options, NO_FILE_OPENS);
cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
- ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
@@ -215,7 +215,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
numopen = TestGetTickerCount(options, NO_FILE_OPENS);
cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
- ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+ ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "c", &value));
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
@@ -2177,24 +2177,146 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
db_->ReleaseSnapshot(snapshot);
}
+namespace {
+std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
+ if (sst) {
+ return {options.statistics->getAndResetTickerCount(
+ NON_LAST_LEVEL_SEEK_FILTER_MATCH),
+ options.statistics->getAndResetTickerCount(
+ NON_LAST_LEVEL_SEEK_FILTERED)};
+ } else {
+ auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
+ auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
+ return {hit, miss};
+ }
+}
+
+std::pair<uint64_t, uint64_t> HitAndMiss(uint64_t hits, uint64_t misses) {
+ return {hits, misses};
+}
+} // namespace
-TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
- constexpr size_t kPrefixSize = 8;
- const std::string kKey = "key";
- assert(kKey.size() < kPrefixSize);
+TEST_F(DBBloomFilterTest, MemtablePrefixBloom) {
Options options = CurrentOptions();
- options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(4));
options.memtable_prefix_bloom_size_ratio = 0.25;
Reopen(options);
- ASSERT_OK(Put(kKey, "v"));
- ASSERT_EQ("v", Get(kKey));
- std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
- iter->Seek(kKey);
+ ASSERT_FALSE(options.prefix_extractor->InDomain("key"));
+ ASSERT_OK(Put("key", "v"));
+ ASSERT_OK(Put("goat1", "g1"));
+ ASSERT_OK(Put("goat2", "g2"));
+
+ // Reset from other tests
+ GetBloomStat(options, false);
+
+ // Out of domain (Get)
+ ASSERT_EQ("v", Get("key"));
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+ // In domain (Get)
+ ASSERT_EQ("g1", Get("goat1"));
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+ ASSERT_EQ("NOT_FOUND", Get("goat9"));
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+ ASSERT_EQ("NOT_FOUND", Get("goan1"));
+ ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+
+ ReadOptions ropts;
+ if (options.prefix_seek_opt_in_only) {
+ ropts.prefix_same_as_start = true;
+ }
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
+ // Out of domain (scan)
+ iter->Seek("ke");
+ ASSERT_OK(iter->status());
ASSERT_TRUE(iter->Valid());
- ASSERT_EQ(kKey, iter->key());
- iter->SeekForPrev(kKey);
+ ASSERT_EQ("key", iter->key());
+ iter->SeekForPrev("kez");
+ ASSERT_OK(iter->status());
ASSERT_TRUE(iter->Valid());
- ASSERT_EQ(kKey, iter->key());
+ ASSERT_EQ("key", iter->key());
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+ // In domain (scan)
+ iter->Seek("goan");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+ iter->Seek("goat");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("goat1", iter->key());
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+
+ // Changing prefix extractor should affect prefix query semantics
+ // and bypass the existing memtable Bloom filter
+ ASSERT_OK(db_->SetOptions({{"prefix_extractor", "fixed:5"}}));
+ iter.reset(db_->NewIterator(ropts));
+ // Now out of domain (scan)
+ iter->Seek("goan");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("goat1", iter->key());
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+ // In domain (scan)
+ iter->Seek("goat2");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("goat2", iter->key());
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+ // In domain (scan)
+ if (ropts.prefix_same_as_start) {
+ iter->Seek("goat0");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+ } else {
+ // NOTE: legacy prefix Seek may return keys outside of prefix
+ }
+
+ // Start a fresh new memtable, using new prefix extractor
+ ASSERT_OK(SingleDelete("key"));
+ ASSERT_OK(SingleDelete("goat1"));
+ ASSERT_OK(SingleDelete("goat2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("key", "_v"));
+ ASSERT_OK(Put("goat1", "_g1"));
+ ASSERT_OK(Put("goat2", "_g2"));
+
+ iter.reset(db_->NewIterator(ropts));
+
+ // Still out of domain (Get)
+ ASSERT_EQ("_v", Get("key"));
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+ // Still in domain (Get)
+ ASSERT_EQ("_g1", Get("goat1"));
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+ ASSERT_EQ("NOT_FOUND", Get("goat11"));
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+ ASSERT_EQ("NOT_FOUND", Get("goat9"));
+ ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+ ASSERT_EQ("NOT_FOUND", Get("goan1"));
+ ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+
+ // Now out of domain (scan)
+ iter->Seek("goan");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("goat1", iter->key());
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+ // In domain (scan)
+ iter->Seek("goat2");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("goat2", iter->key());
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+ // In domain (scan)
+ iter->Seek("goat0");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
}
class DBBloomFilterTestVaryPrefixAndFormatVer
@@ -2507,7 +2629,11 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
ASSERT_OK(Put(key1, value1, WriteOptions()));
ASSERT_OK(Put(key3, value3, WriteOptions()));
- std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+ ReadOptions ropts;
+ if (options_.prefix_seek_opt_in_only) {
+ ropts.prefix_same_as_start = true;
+ }
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
// check memtable bloom stats
iter->Seek(key1);
@@ -2526,13 +2652,13 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
iter->Seek(key2);
ASSERT_OK(iter->status());
- ASSERT_TRUE(!iter->Valid());
+ ASSERT_FALSE(iter->Valid());
ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
ASSERT_OK(Flush());
- iter.reset(dbfull()->NewIterator(ReadOptions()));
+ iter.reset(dbfull()->NewIterator(ropts));
// Check SST bloom stats
iter->Seek(key1);
@@ -2550,7 +2676,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
iter->Seek(key2);
ASSERT_OK(iter->status());
- ASSERT_TRUE(!iter->Valid());
+ ASSERT_FALSE(iter->Valid());
ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
}
@@ -2659,9 +2785,14 @@ TEST_F(DBBloomFilterTest, PrefixScan) {
PrefixScanInit(this);
count = 0;
env_->random_read_counter_.Reset();
- iter = db_->NewIterator(ReadOptions());
+ ReadOptions ropts;
+ if (options.prefix_seek_opt_in_only) {
+ ropts.prefix_same_as_start = true;
+ }
+ iter = db_->NewIterator(ropts);
for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
if (!iter->key().starts_with(prefix)) {
+ ASSERT_FALSE(ropts.prefix_same_as_start);
break;
}
count++;
@@ -3397,23 +3528,6 @@ class FixedSuffix4Transform : public SliceTransform {
bool InDomain(const Slice& src) const override { return src.size() >= 4; }
};
-
-std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
- if (sst) {
- return {options.statistics->getAndResetTickerCount(
- NON_LAST_LEVEL_SEEK_FILTER_MATCH),
- options.statistics->getAndResetTickerCount(
- NON_LAST_LEVEL_SEEK_FILTERED)};
- } else {
- auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
- auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
- return {hit, miss};
- }
-}
-
-std::pair<uint64_t, uint64_t> HitAndMiss(uint64_t hits, uint64_t misses) {
- return {hits, misses};
-}
} // anonymous namespace
// This uses a prefix_extractor + comparator combination that violates
@@ -3520,9 +3634,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) {
if (flushed) { // TODO: support auto_prefix_mode in memtable?
read_options.auto_prefix_mode = true;
} else {
- // TODO: why needed?
- get_perf_context()->bloom_memtable_hit_count = 0;
- get_perf_context()->bloom_memtable_miss_count = 0;
+ // Reset from other tests
+ GetBloomStat(options, flushed);
}
EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0));
{
@@ -3664,9 +3777,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) {
if (flushed) { // TODO: support auto_prefix_mode in memtable?
read_options.auto_prefix_mode = true;
} else {
- // TODO: why needed?
- get_perf_context()->bloom_memtable_hit_count = 0;
- get_perf_context()->bloom_memtable_miss_count = 0;
+ // Reset from other tests
+ GetBloomStat(options, flushed);
}
EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0));
{
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index e95561efa..b218bd0ba 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -2066,15 +2066,19 @@ InternalIterator* DBImpl::NewInternalIterator(
bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
InternalIterator* internal_iter;
assert(arena != nullptr);
+ auto prefix_extractor =
+ super_version->mutable_cf_options.prefix_extractor.get();
// Need to create internal iterator from the arena.
MergeIteratorBuilder merge_iter_builder(
&cfd->internal_comparator(), arena,
- !read_options.total_order_seek &&
- super_version->mutable_cf_options.prefix_extractor != nullptr,
+ // FIXME? It's not clear what interpretation of prefix seek is needed
+ // here, and no unit test cares about the value provided here.
+ !read_options.total_order_seek && prefix_extractor != nullptr,
read_options.iterate_upper_bound);
// Collect iterator for mutable memtable
auto mem_iter = super_version->mem->NewIterator(
- read_options, super_version->GetSeqnoToTimeMapping(), arena);
+ read_options, super_version->GetSeqnoToTimeMapping(), arena,
+ super_version->mutable_cf_options.prefix_extractor.get());
Status s;
if (!read_options.ignore_range_deletions) {
std::unique_ptr<TruncatedRangeDelIterator> mem_tombstone_iter;
@@ -2098,6 +2102,7 @@ InternalIterator* DBImpl::NewInternalIterator(
if (s.ok()) {
super_version->imm->AddIterators(
read_options, super_version->GetSeqnoToTimeMapping(),
+ super_version->mutable_cf_options.prefix_extractor.get(),
&merge_iter_builder, !read_options.ignore_range_deletions);
}
TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
@@ -3843,6 +3848,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
}
}
if (read_options.tailing) {
+ read_options.total_order_seek |=
+ immutable_db_options_.prefix_seek_opt_in_only;
+
auto iter = new ForwardIterator(this, read_options, cfd, sv,
/* allow_unprepared_value */ true);
result = NewDBIterator(
@@ -4044,6 +4052,9 @@ Status DBImpl::NewIterators(
assert(cf_sv_pairs.size() == column_families.size());
if (read_options.tailing) {
+ read_options.total_order_seek |=
+ immutable_db_options_.prefix_seek_opt_in_only;
+
for (const auto& cf_sv_pair : cf_sv_pairs) {
auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
cf_sv_pair.super_version,
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 4fb100876..dec61c050 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -1669,7 +1669,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
TableProperties table_properties;
{
ScopedArenaPtr<InternalIterator> iter(
- mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+ mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
+ /*prefix_extractor=*/nullptr));
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] [WriteLevel0TableForRecovery]"
" Level-0 table #%" PRIu64 ": started",
diff --git a/db/db_iter.cc b/db/db_iter.cc
index b42acc4bc..e02586377 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -65,9 +65,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
valid_(false),
current_entry_is_merged_(false),
is_key_seqnum_zero_(false),
- prefix_same_as_start_(mutable_cf_options.prefix_extractor
- ? read_options.prefix_same_as_start
- : false),
+ prefix_same_as_start_(
+ prefix_extractor_ ? read_options.prefix_same_as_start : false),
pin_thru_lifetime_(read_options.pin_data),
expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
read_options.total_order_seek ||
@@ -93,6 +92,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
status_.PermitUncheckedError();
assert(timestamp_size_ ==
user_comparator_.user_comparator()->timestamp_size());
+ // prefix_seek_opt_in_only should force total_order_seek wherever the caller
+ // is duplicating the original ReadOptions
+ assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
}
Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 92211ad42..fbe66a986 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -5597,32 +5597,45 @@ TEST_F(DBTest2, PrefixBloomFilteredOut) {
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
bbto.whole_key_filtering = false;
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
- DestroyAndReopen(options);
- // Construct two L1 files with keys:
- // f1:[aaa1 ccc1] f2:[ddd0]
- ASSERT_OK(Put("aaa1", ""));
- ASSERT_OK(Put("ccc1", ""));
- ASSERT_OK(Flush());
- ASSERT_OK(Put("ddd0", ""));
- ASSERT_OK(Flush());
- CompactRangeOptions cro;
- cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
- ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ // This test is also the primary test for prefix_seek_opt_in_only
+ for (bool opt_in : {false, true}) {
+ options.prefix_seek_opt_in_only = opt_in;
+ DestroyAndReopen(options);
- Iterator* iter = db_->NewIterator(ReadOptions());
- ASSERT_OK(iter->status());
+ // Construct two L1 files with keys:
+ // f1:[aaa1 ccc1] f2:[ddd0]
+ ASSERT_OK(Put("aaa1", ""));
+ ASSERT_OK(Put("ccc1", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ddd0", ""));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
- // Bloom filter is filterd out by f1.
- // This is just one of several valid position following the contract.
- // Postioning to ccc1 or ddd0 is also valid. This is just to validate
- // the behavior of the current implementation. If underlying implementation
- // changes, the test might fail here.
- iter->Seek("bbb1");
- ASSERT_OK(iter->status());
- ASSERT_FALSE(iter->Valid());
+ ReadOptions ropts;
+ for (bool same : {false, true}) {
+ ropts.prefix_same_as_start = same;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
+ ASSERT_OK(iter->status());
- delete iter;
+ iter->Seek("bbb1");
+ ASSERT_OK(iter->status());
+ if (opt_in && !same) {
+ // Unbounded total order seek
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), "ccc1");
+ } else {
+ // Bloom filter is filtered out by f1. When same == false, this is just
+ // one valid position following the contract. Positioning to ccc1 or ddd0
+ // is also valid. This is just to validate the behavior of the current
+ // implementation. If underlying implementation changes, the test might
+ // fail here.
+ ASSERT_FALSE(iter->Valid());
+ }
+ }
+ }
}
TEST_F(DBTest2, RowCacheSnapshot) {
@@ -5987,6 +6000,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) {
// create a DB with block prefix index
BlockBasedTableOptions table_options;
Options options = CurrentOptions();
+ options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
// Sometimes filter is checked based on upper bound. Assert counters
// for that case. Otherwise, only check data correctness.
diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index acb1c05c1..60441da0b 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -832,6 +832,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) {
TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
Options options = CurrentOptions();
+ options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
options.env = env_;
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(3));
@@ -1009,6 +1010,7 @@ TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) {
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
DestroyAndReopen(options);
const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 6bd71dd56..8206bd298 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -421,8 +421,8 @@ Status FlushJob::MemPurge() {
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters;
for (MemTable* m : mems_) {
- memtables.push_back(
- m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+ memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr,
+ &arena, /*prefix_extractor=*/nullptr));
auto* range_del_iter = m->NewRangeTombstoneIterator(
ro, kMaxSequenceNumber, true /* immutable_memtable */);
if (range_del_iter != nullptr) {
@@ -893,8 +893,8 @@ Status FlushJob::WriteLevel0Table() {
db_options_.info_log,
"[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
- memtables.push_back(
- m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+ memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr,
+ &arena, /*prefix_extractor=*/nullptr));
auto* range_del_iter = m->NewRangeTombstoneIterator(
ro, kMaxSequenceNumber, true /* immutable_memtable */);
if (range_del_iter != nullptr) {
diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc
index a4cbdb466..0bf7c15ab 100644
--- a/db/forward_iterator.cc
+++ b/db/forward_iterator.cc
@@ -712,9 +712,11 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
sv_->GetSeqnoToTimeMapping();
mutable_iter_ =
- sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
- sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
- &arena_);
+ sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_,
+ sv_->mutable_cf_options.prefix_extractor.get());
+ sv_->imm->AddIterators(read_options_, seqno_to_time_mapping,
+ sv_->mutable_cf_options.prefix_extractor.get(),
+ &imm_iters_, &arena_);
if (!read_options_.ignore_range_deletions) {
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
sv_->mem->NewRangeTombstoneIterator(
@@ -781,9 +783,11 @@ void ForwardIterator::RenewIterators() {
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
svnew->GetSeqnoToTimeMapping();
mutable_iter_ =
- svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
- svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
- &arena_);
+ svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_,
+ svnew->mutable_cf_options.prefix_extractor.get());
+ svnew->imm->AddIterators(read_options_, seqno_to_time_mapping,
+ svnew->mutable_cf_options.prefix_extractor.get(),
+ &imm_iters_, &arena_);
ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
kMaxSequenceNumber /* upper_bound */);
if (!read_options_.ignore_range_deletions) {
diff --git a/db/memtable.cc b/db/memtable.cc
index ef1184ded..5ba0a0dac 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -365,9 +365,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
class MemTableIterator : public InternalIterator {
public:
- MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
- UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
- Arena* arena, bool use_range_del_table = false)
+ enum Kind { kPointEntries, kRangeDelEntries };
+ MemTableIterator(
+ Kind kind, const MemTable& mem, const ReadOptions& read_options,
+ UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping = nullptr,
+ Arena* arena = nullptr,
+ const SliceTransform* cf_prefix_extractor = nullptr)
: bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_),
comparator_(mem.comparator_),
@@ -382,14 +385,21 @@ class MemTableIterator : public InternalIterator {
arena_mode_(arena != nullptr),
paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
allow_data_in_error(mem.moptions_.allow_data_in_errors) {
- if (use_range_del_table) {
+ if (kind == kRangeDelEntries) {
iter_ = mem.range_del_table_->GetIterator(arena);
- } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
- !read_options.auto_prefix_mode) {
+ } else if (prefix_extractor_ != nullptr &&
+ // NOTE: checking extractor equivalence when not pointer
+ // equivalent is arguably too expensive for memtable
+ prefix_extractor_ == cf_prefix_extractor &&
+ (read_options.prefix_same_as_start ||
+ (!read_options.total_order_seek &&
+ !read_options.auto_prefix_mode))) {
// Auto prefix mode is not implemented in memtable yet.
+ assert(kind == kPointEntries);
bloom_ = mem.bloom_filter_.get();
iter_ = mem.table_->GetDynamicPrefixIterator(arena);
} else {
+ assert(kind == kPointEntries);
iter_ = mem.table_->GetIterator(arena);
}
status_.PermitUncheckedError();
@@ -433,8 +443,8 @@ class MemTableIterator : public InternalIterator {
// iterator should only use prefix bloom filter
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
if (prefix_extractor_->InDomain(user_k_without_ts)) {
- if (!bloom_->MayContain(
- prefix_extractor_->Transform(user_k_without_ts))) {
+ Slice prefix = prefix_extractor_->Transform(user_k_without_ts);
+ if (!bloom_->MayContain(prefix)) {
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
valid_ = false;
return;
@@ -594,11 +604,13 @@ class MemTableIterator : public InternalIterator {
InternalIterator* MemTable::NewIterator(
const ReadOptions& read_options,
- UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena) {
+ UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+ const SliceTransform* prefix_extractor) {
assert(arena != nullptr);
auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
return new (mem)
- MemTableIterator(*this, read_options, seqno_to_time_mapping, arena);
+ MemTableIterator(MemTableIterator::kPointEntries, *this, read_options,
+ seqno_to_time_mapping, arena, prefix_extractor);
}
FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
@@ -633,8 +645,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
cache->reader_mutex.lock();
if (!cache->tombstones) {
auto* unfragmented_iter = new MemTableIterator(
- *this, read_options, nullptr /* seqno_to_time_mapping= */,
- nullptr /* arena */, true /* use_range_del_table */);
+ MemTableIterator::kRangeDelEntries, *this, read_options);
cache->tombstones.reset(new FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator>(unfragmented_iter),
comparator_.comparator));
@@ -655,8 +666,7 @@ void MemTable::ConstructFragmentedRangeTombstones() {
if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
// TODO: plumb Env::IOActivity, Env::IOPriority
auto* unfragmented_iter = new MemTableIterator(
- *this, ReadOptions(), nullptr /*seqno_to_time_mapping=*/,
- nullptr /* arena */, true /* use_range_del_table */);
+ MemTableIterator::kRangeDelEntries, *this, ReadOptions());
fragmented_range_tombstone_list_ =
std::make_unique<FragmentedRangeTombstoneList>(
diff --git a/db/memtable.h b/db/memtable.h
index ca0652bc0..194b4543c 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -210,7 +210,8 @@ class MemTable {
// data, currently only needed for iterators serving user reads.
InternalIterator* NewIterator(
const ReadOptions& read_options,
- UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena);
+ UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+ const SliceTransform* prefix_extractor);
// Returns an iterator that yields the range tombstones of the memtable.
// The caller must ensure that the underlying MemTable remains live
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
index c3612656e..c81c096b5 100644
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -214,20 +214,23 @@ Status MemTableListVersion::AddRangeTombstoneIterators(
void MemTableListVersion::AddIterators(
const ReadOptions& options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+ const SliceTransform* prefix_extractor,
std::vector<InternalIterator*>* iterator_list, Arena* arena) {
for (auto& m : memlist_) {
- iterator_list->push_back(
- m->NewIterator(options, seqno_to_time_mapping, arena));
+ iterator_list->push_back(m->NewIterator(options, seqno_to_time_mapping,
+ arena, prefix_extractor));
}
}
void MemTableListVersion::AddIterators(
const ReadOptions& options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+ const SliceTransform* prefix_extractor,
MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter) {
for (auto& m : memlist_) {
- auto mem_iter = m->NewIterator(options, seqno_to_time_mapping,
- merge_iter_builder->GetArena());
+ auto mem_iter =
+ m->NewIterator(options, seqno_to_time_mapping,
+ merge_iter_builder->GetArena(), prefix_extractor);
if (!add_range_tombstone_iter || options.ignore_range_deletions) {
merge_iter_builder->AddIterator(mem_iter);
} else {
diff --git a/db/memtable_list.h b/db/memtable_list.h
index dd439de55..390b4137d 100644
--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@@ -113,11 +113,13 @@ class MemTableListVersion {
void AddIterators(const ReadOptions& options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+ const SliceTransform* prefix_extractor,
std::vector<InternalIterator*>* iterator_list,
Arena* arena);
void AddIterators(const ReadOptions& options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+ const SliceTransform* prefix_extractor,
MergeIteratorBuilder* merge_iter_builder,
bool add_range_tombstone_iter);
diff --git a/db/repair.cc b/db/repair.cc
index 114d36a6a..f7f4fbafb 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -443,7 +443,8 @@ class Repairer {
ro.total_order_seek = true;
Arena arena;
ScopedArenaPtr<InternalIterator> iter(
- mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+ mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
+ /*prefix_extractor=*/nullptr));
int64_t _current_time = 0;
immutable_db_options_.clock->GetCurrentTime(&_current_time)
.PermitUncheckedError(); // ignore error
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
index 8db8c32a0..d09dd0167 100644
--- a/db/write_batch_test.cc
+++ b/db/write_batch_test.cc
@@ -59,7 +59,7 @@ static std::string PrintContents(WriteBatch* b,
InternalIterator* iter;
if (i == 0) {
iter = mem->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
- &arena);
+ &arena, /*prefix_extractor=*/nullptr);
arena_iter_guard.reset(iter);
} else {
iter = mem->NewRangeTombstoneIterator(ReadOptions(),
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 5d4839df8..5a4c310c4 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1592,7 +1592,7 @@ Status StressTest::TestIterateImpl(ThreadState* thread,
ro.total_order_seek = true;
expect_total_order = true;
} else if (thread->rand.OneIn(4)) {
- ro.total_order_seek = false;
+ ro.total_order_seek = thread->rand.OneIn(2);
ro.auto_prefix_mode = true;
expect_total_order = true;
} else if (options_.prefix_extractor.get() == nullptr) {
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 27feadb80..e272e3a69 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1399,6 +1399,17 @@ struct DBOptions {
// eventually be obsolete and removed as Identity files are phased out.
bool write_identity_file = true;
+ // Historically, when prefix_extractor != nullptr, iterators have an
+ // unfortunate default semantics of *possibly* only returning data
+ // within the same prefix. To avoid "spooky action at a distance," iterator
+ // bounds should come from the instantiation or seeking of the iterator,
+ // not from a mutable column family option.
+ //
+ // When set to true, it is as if every iterator is created with
+ // total_order_seek=true, and only auto_prefix_mode=true and
+ // prefix_same_as_start=true can take advantage of prefix seek optimizations.
+ bool prefix_seek_opt_in_only = false;
+
// The number of bytes to prefetch when reading the log. This is mostly useful
// for reading a remotely located log, as it can save the number of
// round-trips. If 0, then the prefetching is disabled.
@@ -1848,10 +1859,10 @@ struct ReadOptions {
bool auto_prefix_mode = false;
// Enforce that the iterator only iterates over the same prefix as the seek.
- // This option is effective only for prefix seeks, i.e. prefix_extractor is
- // non-null for the column family and total_order_seek is false. Unlike
- // iterate_upper_bound, prefix_same_as_start only works within a prefix
- // but in both directions.
+ // This makes the iterator bounds dependent on the column family's current
+ // prefix_extractor, which is mutable. When SST files have been built with
+ // the same prefix extractor, prefix filtering optimizations will be used
+ // for both Seek and SeekForPrev.
bool prefix_same_as_start = false;
// Keep the blocks loaded by the iterator pinned in memory as long as the
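Illustrative sketch of the two opt-in read options documented above, assuming a DB opened with a fixed 4-byte prefix extractor as in the earlier sketch; per this change, both compose with `total_order_seek=true` rather than being disabled by it.

    #include <cassert>
    #include <memory>

    #include "rocksdb/db.h"

    namespace rdb = ROCKSDB_NAMESPACE;

    // Assumes a DB opened with a fixed 4-byte prefix extractor.
    void ScanExamples(rdb::DB* db) {
      // Prefix-bounded iteration (both directions) via prefix_same_as_start.
      rdb::ReadOptions bounded;
      bounded.prefix_same_as_start = true;
      std::unique_ptr<rdb::Iterator> it(db->NewIterator(bounded));
      for (it->Seek("goat1"); it->Valid(); it->Next()) {
        assert(it->key().starts_with("goat"));  // stays within the seek prefix
      }

      // auto_prefix_mode: results match total-order iteration, but prefix
      // filter optimizations may still apply where provably safe.
      rdb::ReadOptions auto_mode;
      auto_mode.auto_prefix_mode = true;
      auto_mode.total_order_seek = true;  // now compatible with auto_prefix_mode
      std::unique_ptr<rdb::Iterator> it2(db->NewIterator(auto_mode));
      it2->Seek("goat1");
    }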
diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc
index 53f10998c..ed456acc3 100644
--- a/java/rocksjni/write_batch_test.cc
+++ b/java/rocksjni/write_batch_test.cc
@@ -60,7 +60,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env,
ROCKSDB_NAMESPACE::Arena arena;
ROCKSDB_NAMESPACE::ScopedArenaPtr<ROCKSDB_NAMESPACE::InternalIterator> iter(
mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(),
- /*seqno_to_time_mapping=*/nullptr, &arena));
+ /*seqno_to_time_mapping=*/nullptr, &arena,
+ /*prefix_extractor=*/nullptr));
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ROCKSDB_NAMESPACE::ParsedInternalKey ikey;
ikey.clear();
diff --git a/options/db_options.cc b/options/db_options.cc
index 0b880f4b9..e9e8fc5b7 100644
--- a/options/db_options.cc
+++ b/options/db_options.cc
@@ -403,6 +403,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
+ {"prefix_seek_opt_in_only",
+ {offsetof(struct ImmutableDBOptions, prefix_seek_opt_in_only),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
{"write_dbid_to_manifest",
{offsetof(struct ImmutableDBOptions, write_dbid_to_manifest),
OptionType::kBoolean, OptionVerificationType::kNormal,
@@ -774,6 +778,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
background_close_inactive_wals(options.background_close_inactive_wals),
atomic_flush(options.atomic_flush),
avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io),
+ prefix_seek_opt_in_only(options.prefix_seek_opt_in_only),
persist_stats_to_disk(options.persist_stats_to_disk),
write_dbid_to_manifest(options.write_dbid_to_manifest),
write_identity_file(options.write_identity_file),
@@ -948,6 +953,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
ROCKS_LOG_HEADER(log,
" Options.avoid_unnecessary_blocking_io: %d",
avoid_unnecessary_blocking_io);
+ ROCKS_LOG_HEADER(log, " Options.prefix_seek_opt_in_only: %d",
+ prefix_seek_opt_in_only);
ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u",
persist_stats_to_disk);
ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d",
diff --git a/options/db_options.h b/options/db_options.h
index 842fefca8..ac76ea40d 100644
--- a/options/db_options.h
+++ b/options/db_options.h
@@ -87,6 +87,7 @@ struct ImmutableDBOptions {
bool background_close_inactive_wals;
bool atomic_flush;
bool avoid_unnecessary_blocking_io;
+ bool prefix_seek_opt_in_only;
bool persist_stats_to_disk;
bool write_dbid_to_manifest;
bool write_identity_file;
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index fe45224d0..0dfe3e38a 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -2063,7 +2063,9 @@ InternalIterator* BlockBasedTable::NewIterator(
if (arena == nullptr) {
return new BlockBasedTableIterator(
this, read_options, rep_->internal_comparator, std::move(index_iter),
- !skip_filters && !read_options.total_order_seek &&
+ !skip_filters &&
+ (!read_options.total_order_seek || read_options.auto_prefix_mode ||
+ read_options.prefix_same_as_start) &&
prefix_extractor != nullptr,
need_upper_bound_check, prefix_extractor, caller,
compaction_readahead_size, allow_unprepared_value);
@@ -2071,7 +2073,9 @@ InternalIterator* BlockBasedTable::NewIterator(
auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
return new (mem) BlockBasedTableIterator(
this, read_options, rep_->internal_comparator, std::move(index_iter),
- !skip_filters && !read_options.total_order_seek &&
+ !skip_filters &&
+ (!read_options.total_order_seek || read_options.auto_prefix_mode ||
+ read_options.prefix_same_as_start) &&
prefix_extractor != nullptr,
need_upper_bound_check, prefix_extractor, caller,
compaction_readahead_size, allow_unprepared_value);
diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc
index d3c968f73..9d4e8eccf 100644
--- a/table/plain/plain_table_reader.cc
+++ b/table/plain/plain_table_reader.cc
@@ -201,8 +201,10 @@ InternalIterator* PlainTableReader::NewIterator(
assert(table_properties_);
// Auto prefix mode is not implemented in PlainTable.
- bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
- !options.auto_prefix_mode;
+ bool use_prefix_seek =
+ !IsTotalOrderMode() &&
+ (options.prefix_same_as_start ||
+ (!options.total_order_seek && !options.auto_prefix_mode));
if (arena == nullptr) {
return new PlainTableIterator(this, use_prefix_seek);
} else {
diff --git a/table/table_test.cc b/table/table_test.cc
index 9eee26761..bb0b70222 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -534,7 +534,7 @@ class MemTableConstructor : public Constructor {
const SliceTransform* /*prefix_extractor*/) const override {
return new KeyConvertingIterator(
memtable_->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
- &arena_),
+ &arena_, /*prefix_extractor=*/nullptr),
true);
}
@@ -4904,8 +4904,9 @@ TEST_F(MemTableTest, Simple) {
std::unique_ptr<InternalIterator> iter_guard;
InternalIterator* iter;
if (i == 0) {
- iter = GetMemTable()->NewIterator(
- ReadOptions(), /*seqno_to_time_mapping=*/nullptr, &arena);
+ iter = GetMemTable()->NewIterator(ReadOptions(),
+ /*seqno_to_time_mapping=*/nullptr,
+ &arena, /*prefix_extractor=*/nullptr);
arena_iter_guard.reset(iter);
} else {
iter = GetMemTable()->NewRangeTombstoneIterator(
diff --git a/unreleased_history/bug_fixes/memtable_prefix.md b/unreleased_history/bug_fixes/memtable_prefix.md
new file mode 100644
index 000000000..d7b45c65e
--- /dev/null
+++ b/unreleased_history/bug_fixes/memtable_prefix.md
@@ -0,0 +1 @@
+* Fix handling of dynamic change of `prefix_extractor` with memtable prefix filter. Previously, prefix seek could mix different prefix interpretations between memtable and SST files. Now the latest `prefix_extractor` at the time of iterator creation or refresh is respected.
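Sketch of the scenario this release note describes, assuming a DB currently using a fixed 4-byte prefix extractor (as in the test added above); after `SetOptions`, a freshly created iterator should apply the new extractor consistently to both memtable and SST data.

    #include <cassert>
    #include <memory>
    #include <unordered_map>

    #include "rocksdb/db.h"

    namespace rdb = ROCKSDB_NAMESPACE;

    // Assumes a DB currently configured with a fixed 4-byte prefix extractor.
    void SwitchPrefixExtractor(rdb::DB* db) {
      // Change the extractor at runtime, then create a fresh iterator so both
      // memtable and SST prefix handling follow the latest extractor.
      rdb::Status s = db->SetOptions({{"prefix_extractor", "fixed:5"}});
      assert(s.ok());

      rdb::ReadOptions ropts;
      ropts.prefix_same_as_start = true;
      std::unique_ptr<rdb::Iterator> it(db->NewIterator(ropts));
      it->Seek("goat2");  // bounded by the 5-byte prefix "goat2"
    }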
diff --git a/unreleased_history/new_features/prefix_seek_opt_in_only.md b/unreleased_history/new_features/prefix_seek_opt_in_only.md
new file mode 100644
index 000000000..71c3cd689
--- /dev/null
+++ b/unreleased_history/new_features/prefix_seek_opt_in_only.md
@@ -0,0 +1 @@
+* Add new option `prefix_seek_opt_in_only` that makes iterators generally safer when you might set a `prefix_extractor`. When `prefix_seek_opt_in_only=true`, which is expected to be the future default, prefix seek is only used when `prefix_same_as_start` or `auto_prefix_mode` are set. Also, `prefix_same_as_start` and `auto_prefix_mode` now allow prefix filtering even with `total_order_seek=true`.