summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
author: Changyu Bi <changyubi@meta.com> 2024-08-19 13:53:25 -0700
committer: Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com> 2024-08-19 13:53:25 -0700
commit: defd97bc9d5e86d0e003c6cea49fa4062cbc00ff (patch)
tree: 7ae8702c4bca491bfbe02c3bdb35893858ab4302 /include
parent: 273b3eadf0ad06acaaeaf30efc35be5ab7588a9c (diff)
Add an option to verify memtable key order during reads (#12889)
Summary: add a new CF option `paranoid_memory_checks` that allows additional data integrity validations during read/scan. Currently, skiplist-based memtable will validate the order of keys visited. Further data validation can be added in different layers. The option will be opt-in due to performance overhead. The motivation for this feature is to support services where data correctness is critical and that want to detect in-memory corruption earlier. For a corrupted memtable key, this feature can help to detect it during reads instead of during flush with existing protections (OutputValidator that verifies key order or per kv checksum). See internally linked task for more context. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12889 Test Plan: * new unit test added for paranoid_memory_checks=true. * existing unit test for paranoid_memory_checks=false. * enable in stress test. Performance Benchmark: we check for performance regression in read path where data is in memtable only. 
For each benchmark, the script was run at the same time for main and this PR: * Memtable-only random read ops/sec: ``` (for I in $(seq 1 50); do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --num=250000 --reads=500000 --seed=1723056275 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; Main: 608146 PR with paranoid_memory_checks=false: 607727 (-%0.07) PR with paranoid_memory_checks=true: 521889 (-%14.2) ``` * Memtable-only sequential scan ops/sec: ``` (for I in $(seq 1 50); do ./db_bench --benchmarks=fillseq,readseq[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>&1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 9180077 PR with paranoid_memory_checks=false: 9536241 (+%3.8) PR with paranoid_memory_checks=true: 7653934 (-%16.6) ``` * Memtable-only reverse scan ops/sec: ``` (for I in $(seq 1 20); do ./db_bench --benchmarks=fillseq,readreverse[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>&1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 1285719 PR with paranoid_memory_checks=false: 1431626 (+%11.3) PR with paranoid_memory_checks=true: 811031 (-%36.9) ``` With paranoid_memory_checks=false, the `readrandom` benchmark shows no regression, and the scanning benchmarks show an improvement that I can't explain. Reviewed By: pdillinger Differential Revision: D60414267 Pulled By: cbi42 fbshipit-source-id: a70b0cbeea131f1a249a5f78f9dc3a62dacfaa91
Diffstat (limited to 'include')
-rw-r--r-- include/rocksdb/advanced_options.h 7
-rw-r--r-- include/rocksdb/memtablerep.h 34
2 files changed, 41 insertions, 0 deletions
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index cbe1eb52f..11f971c24 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -1090,6 +1090,13 @@ struct AdvancedColumnFamilyOptions {
// Dynamically changeable through the SetOptions() API.
uint32_t bottommost_file_compaction_delay = 0;
+ // Enables additional integrity checks during reads/scans.
+ // Specifically, for skiplist-based memtables, we verify that keys visited
+ // are in order. This is helpful to detect corrupted memtable keys during
+ // reads. Enabling this feature incurs a performance overhead due to an
+ // additional key comparison during memtable lookup.
+ bool paranoid_memory_checks = false;
+
// Create ColumnFamilyOptions with default values for all fields
AdvancedColumnFamilyOptions();
// Create ColumnFamilyOptions from Options
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
index d109a542f..fd63f127f 100644
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -194,6 +194,15 @@ class MemTableRep {
virtual void Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry));
+ // Same as Get(), but also performs data integrity validation. This default implementation returns NotSupported.
+ virtual Status GetAndValidate(const LookupKey& /* k */,
+ void* /* callback_args */,
+ bool (* /* callback_func */)(void* arg,
+ const char* entry),
+ bool /* allow_data_in_errors */) {
+ return Status::NotSupported("GetAndValidate() not implemented.");
+ }
+
virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
const Slice& /*end_key*/) {
return 0;
@@ -235,13 +244,38 @@ class MemTableRep {
// REQUIRES: Valid()
virtual void Next() = 0;
+ // Advances to the next position while performing integrity validations on
+ // the skip list. If a corruption is found, the iterator becomes invalid and
+ // Corruption is returned. This default implementation returns NotSupported.
+ // REQUIRES: Valid()
+ virtual Status NextAndValidate(bool /* allow_data_in_errors */) {
+ return Status::NotSupported("NextAndValidate() not implemented.");
+ }
+
// Advances to the previous position.
// REQUIRES: Valid()
virtual void Prev() = 0;
+ // Advances to the previous position while performing integrity validations
+ // on the skip list. If a corruption is found, the iterator becomes invalid
+ // and Corruption is returned. This default implementation returns NotSupported.
+ // REQUIRES: Valid()
+ virtual Status PrevAndValidate(bool /* allow_data_in_errors */) {
+ return Status::NotSupported("PrevAndValidate() not implemented.");
+ }
+
// Advance to the first entry with a key >= target
virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
+ // Seeks while performing integrity validations on the skip list.
+ // If a corruption is found, the iterator becomes invalid and Corruption
+ // is returned. This default implementation returns NotSupported.
+ virtual Status SeekAndValidate(const Slice& /* internal_key */,
+ const char* /* memtable_key */,
+ bool /* allow_data_in_errors */) {
+ return Status::NotSupported("SeekAndValidate() not implemented.");
+ }
+
// retreat to the first entry with a key <= target
virtual void SeekForPrev(const Slice& internal_key,
const char* memtable_key) = 0;