summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYu Zhang <yuzhangyu@fb.com>2024-02-13 20:30:07 -0800
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>2024-02-13 20:30:07 -0800
commitf405e55cfa1e8a722b23223407e0d3a600cb0855 (patch)
treef1cc54b09ce23e09b002f0083f7c129a458445df
parent4bea83aa44cc299def0639eba2320f1eeed6febf (diff)
Add support in SstFileWriter to not persist user defined timestamps (#12348)
Summary: This PR adds support in `SstFileWriter` to create SST files without persisting timestamps when the column family has enabled UDTs in Memtable only feature. The sst files created from flush and compaction do not contain timestamps, we want to make the sst files created by `SstFileWriter` to follow the same pattern and not persist timestamps. This is to prepare for ingesting external SST files for this type of column family. There are timestamp-aware APIs and non timestamp-aware APIs in `SstFileWriter`. The former are exclusively used for when the column family's comparator is timestamp-aware, a.k.a `Comparator::timestamp_size() > 0`, while the latter are exclusively used for the column family's comparator is non timestamp-aware, a.k.a `Comparator::timestamp_size() == 0`. There are sanity checks to make sure these APIs are correctly used. In this PR, the APIs usage continue with above enforcement, where even though timestamps are not eventually persisted, users are still asked to use only the timestamp-aware APIs. But because data points will logically all have minimum timestamps, we don't allow multiple versions of the same user key (without timestamp) to be added. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12348 Test Plan: Added unit tests Manual inspection of generated sst files with `sst_dump` Reviewed By: ltamasi Differential Revision: D53732667 Pulled By: jowlyzhang fbshipit-source-id: e43beba0d3a1736b94ee5c617163a6280efd65b7
-rw-r--r--include/rocksdb/sst_file_writer.h6
-rw-r--r--table/sst_file_dumper.cc2
-rw-r--r--table/sst_file_reader.cc14
-rw-r--r--table/sst_file_reader_test.cc118
-rw-r--r--table/sst_file_writer.cc64
-rw-r--r--unreleased_history/new_features/sst_file_writer_not_persist_timestamp.md1
6 files changed, 197 insertions, 8 deletions
diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h
index 7477ad85f..c09e55f42 100644
--- a/include/rocksdb/sst_file_writer.h
+++ b/include/rocksdb/sst_file_writer.h
@@ -128,6 +128,8 @@ class SstFileWriter {
// REQUIRES: user_key is after any previously added point (Put/Merge/Delete)
// key according to the comparator.
// REQUIRES: timestamp's size is equal to what is expected by the comparator.
+ // When Options.persist_user_defined_timestamps is set to false, only the
+ // minimum timestamp is accepted, and it will not be persisted.
Status Put(const Slice& user_key, const Slice& timestamp, const Slice& value);
// Add a PutEntity (key with the wide-column entity defined by "columns") to
@@ -150,6 +152,8 @@ class SstFileWriter {
// REQUIRES: user_key is after any previously added point (Put/Merge/Delete)
// key according to the comparator.
// REQUIRES: timestamp's size is equal to what is expected by the comparator.
+ // When Options.persist_user_defined_timestamps is set to false, only the
+ // minimum timestamp is accepted, and it will not be persisted.
Status Delete(const Slice& user_key, const Slice& timestamp);
// Add a range deletion tombstone to currently opened file. Such a range
@@ -175,6 +179,8 @@ class SstFileWriter {
// REQUIRES: begin_key and end_key are user keys without timestamp.
// REQUIRES: The comparator orders `begin_key` at or before `end_key`
// REQUIRES: timestamp's size is equal to what is expected by the comparator.
+ // When Options.persist_user_defined_timestamps is set to false, only the
+ // minimum timestamp is accepted, and it will not be persisted.
Status DeleteRange(const Slice& begin_key, const Slice& end_key,
const Slice& timestamp);
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index 348a68b5a..0ad0d04bf 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -171,8 +171,6 @@ Status SstFileDumper::NewTableReader(
const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/,
const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
std::unique_ptr<TableReader>* /*table_reader*/) {
- // TODO(yuzhangyu): full support in sst_dump for SST files generated when
- // `user_defined_timestamps_persisted` is false.
auto t_opt = TableReaderOptions(
ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_,
0 /* block_protection_bytes_per_key */, false /* skip_filters */,
diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc
index d23c58deb..b34496f75 100644
--- a/table/sst_file_reader.cc
+++ b/table/sst_file_reader.cc
@@ -55,9 +55,17 @@ Status SstFileReader::Open(const std::string& file_path) {
file_reader.reset(new RandomAccessFileReader(std::move(file), file_path));
}
if (s.ok()) {
- TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor,
- r->soptions, r->ioptions.internal_comparator,
- r->moptions.block_protection_bytes_per_key);
+ TableReaderOptions t_opt(
+ r->ioptions, r->moptions.prefix_extractor, r->soptions,
+ r->ioptions.internal_comparator,
+ r->moptions.block_protection_bytes_per_key,
+ /*skip_filters*/ false, /*immortal*/ false,
+ /*force_direct_prefetch*/ false, /*level*/ -1,
+ /*block_cache_tracer*/ nullptr,
+ /*max_file_size_for_l0_meta_pin*/ 0, /*cur_db_session_id*/ "",
+ /*cur_file_num*/ 0,
+ /* unique_id */ {}, /* largest_seqno */ 0,
+ /* tail_size */ 0, r->ioptions.persist_user_defined_timestamps);
// Allow open file with global sequence number for backward compatibility.
t_opt.largest_seqno = kMaxSequenceNumber;
s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader),
diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc
index 36a7975cf..3dcb820ca 100644
--- a/table/sst_file_reader_test.cc
+++ b/table/sst_file_reader_test.cc
@@ -248,7 +248,8 @@ class SstFileReaderTimestampTest : public testing::Test {
: KeyValueDesc(std::move(k), std::string(ts), std::string(v)) {}
};
- void CreateFile(const std::vector<InputKeyValueDesc>& descs) {
+ void CreateFile(const std::vector<InputKeyValueDesc>& descs,
+ ExternalSstFileInfo* file_info = nullptr) {
SstFileWriter writer(soptions_, options_);
ASSERT_OK(writer.Open(sst_name_));
@@ -276,7 +277,7 @@ class SstFileReaderTimestampTest : public testing::Test {
}
}
- ASSERT_OK(writer.Finish());
+ ASSERT_OK(writer.Finish(file_info));
}
void CheckFile(const std::string& timestamp,
@@ -413,6 +414,119 @@ TEST_F(SstFileReaderTimestampTest, TimestampSizeMismatch) {
"end_key_with_a_complete_lack_of_timestamps"));
}
+class SstFileReaderTimestampNotPersistedTest
+ : public SstFileReaderTimestampTest {
+ public:
+ SstFileReaderTimestampNotPersistedTest() {
+ Env* env = Env::Default();
+ EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env, &env_guard_));
+ EXPECT_NE(nullptr, env);
+
+ options_.env = env;
+
+ options_.comparator = test::BytewiseComparatorWithU64TsWrapper();
+
+ options_.persist_user_defined_timestamps = false;
+
+ sst_name_ = test::PerThreadDBPath("sst_file_ts_not_persisted");
+ }
+
+ ~SstFileReaderTimestampNotPersistedTest() {}
+};
+
+TEST_F(SstFileReaderTimestampNotPersistedTest, Basic) {
+ std::vector<InputKeyValueDesc> input_descs;
+
+ for (uint64_t k = 0; k < kNumKeys; k++) {
+ input_descs.emplace_back(
+ /* key */ EncodeAsString(k), /* timestamp */ EncodeAsUint64(0),
+ /* value */ EncodeAsString(k), /* is_delete */ false,
+ /* use_contiguous_buffer */ (k % 2) == 0);
+ }
+
+ ExternalSstFileInfo external_sst_file_info;
+
+ CreateFile(input_descs, &external_sst_file_info);
+ std::vector<OutputKeyValueDesc> output_descs;
+
+ for (uint64_t k = 0; k < kNumKeys; k++) {
+ output_descs.emplace_back(/* key */ EncodeAsString(k),
+ /* timestamp */ EncodeAsUint64(0),
+ /* value */ EncodeAsString(k));
+ }
+ CheckFile(EncodeAsUint64(0), output_descs);
+ ASSERT_EQ(external_sst_file_info.smallest_key, EncodeAsString(0));
+ ASSERT_EQ(external_sst_file_info.largest_key, EncodeAsString(kNumKeys - 1));
+ ASSERT_EQ(external_sst_file_info.smallest_range_del_key, "");
+ ASSERT_EQ(external_sst_file_info.largest_range_del_key, "");
+}
+
+TEST_F(SstFileReaderTimestampNotPersistedTest, NonMinTimestampNotAllowed) {
+ SstFileWriter writer(soptions_, options_);
+
+ ASSERT_OK(writer.Open(sst_name_));
+
+ ASSERT_NOK(writer.Delete("baz", EncodeAsUint64(2)));
+ ASSERT_OK(writer.Put("baz", EncodeAsUint64(0), "foo_val"));
+
+ ASSERT_NOK(writer.Put("key", EncodeAsUint64(2), "value1"));
+ ASSERT_OK(writer.Put("key", EncodeAsUint64(0), "value2"));
+
+ // The `SstFileWriter::DeleteRange` API documentation specifies that
+ // a range deletion tombstone added in the file does NOT delete point
+ // (Put/Merge/Delete) keys in the same file. While there is no checks in
+ // `SstFileWriter` to ensure this requirement is met, when such a range
+ // deletion does exist, it will get over-written by point data in the same
+ // file after ingestion because they have the same sequence number.
+ // We allow having a point data entry and having a range deletion entry for
+ // a key in the same file when timestamps are removed for the same reason.
+ // After the file is ingested, the range deletion will effectively get
+ // over-written by the point data since they will have the same sequence
+ // number and the same user-defined timestamps.
+ ASSERT_NOK(writer.DeleteRange("bar", "foo", EncodeAsUint64(2)));
+ ASSERT_OK(writer.DeleteRange("bar", "foo", EncodeAsUint64(0)));
+
+ ExternalSstFileInfo external_sst_file_info;
+
+ ASSERT_OK(writer.Finish(&external_sst_file_info));
+ ASSERT_EQ(external_sst_file_info.smallest_key, "baz");
+ ASSERT_EQ(external_sst_file_info.largest_key, "key");
+ ASSERT_EQ(external_sst_file_info.smallest_range_del_key, "bar");
+ ASSERT_EQ(external_sst_file_info.largest_range_del_key, "foo");
+}
+
+TEST_F(SstFileReaderTimestampNotPersistedTest, KeyWithoutTimestampOutOfOrder) {
+ SstFileWriter writer(soptions_, options_);
+
+ ASSERT_OK(writer.Open(sst_name_));
+
+ ASSERT_OK(writer.Put("foo", EncodeAsUint64(0), "value1"));
+ ASSERT_NOK(writer.Put("bar", EncodeAsUint64(0), "value2"));
+}
+
+TEST_F(SstFileReaderTimestampNotPersistedTest, IncompatibleTimestampFormat) {
+ SstFileWriter writer(soptions_, options_);
+
+ ASSERT_OK(writer.Open(sst_name_));
+
+ // Even though in this mode timestamps are not persisted, we require users
+ // to call the timestamp-aware APIs only.
+ ASSERT_TRUE(writer.Put("key", "not_an_actual_64_bit_timestamp", "value")
+ .IsInvalidArgument());
+ ASSERT_TRUE(writer.Delete("another_key", "timestamp_of_unexpected_size")
+ .IsInvalidArgument());
+
+ ASSERT_TRUE(writer.Put("key_without_timestamp", "value").IsInvalidArgument());
+ ASSERT_TRUE(writer.Merge("another_key_missing_a_timestamp", "merge_operand")
+ .IsInvalidArgument());
+ ASSERT_TRUE(
+ writer.Delete("yet_another_key_still_no_timestamp").IsInvalidArgument());
+ ASSERT_TRUE(writer
+ .DeleteRange("begin_key_timestamp_absent",
+ "end_key_with_a_complete_lack_of_timestamps")
+ .IsInvalidArgument());
+}
+
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc
index f35911250..437adb2cc 100644
--- a/table/sst_file_writer.cc
+++ b/table/sst_file_writer.cc
@@ -41,7 +41,10 @@ struct SstFileWriter::Rep {
cfh(_cfh),
invalidate_page_cache(_invalidate_page_cache),
skip_filters(_skip_filters),
- db_session_id(_db_session_id) {
+ db_session_id(_db_session_id),
+ ts_sz(_user_comparator->timestamp_size()),
+ strip_timestamp(ts_sz > 0 &&
+ !ioptions.persist_user_defined_timestamps) {
// TODO (hx235): pass in `WriteOptions` instead of `rate_limiter_priority`
// during construction
write_options.rate_limiter_priority = io_priority;
@@ -68,6 +71,8 @@ struct SstFileWriter::Rep {
bool skip_filters;
std::string db_session_id;
uint64_t next_file_number = 1;
+ size_t ts_sz;
+ bool strip_timestamp;
Status AddImpl(const Slice& user_key, const Slice& value,
ValueType value_type) {
@@ -78,6 +83,17 @@ struct SstFileWriter::Rep {
return builder->status();
}
+ assert(user_key.size() >= ts_sz);
+ if (strip_timestamp) {
+ // In this mode, we expect users to always provide a min timestamp.
+ if (internal_comparator.user_comparator()->CompareTimestamp(
+ Slice(user_key.data() + user_key.size() - ts_sz, ts_sz),
+ MinU64Ts()) != 0) {
+ return Status::InvalidArgument(
+ "persist_user_defined_timestamps flag is set to false, only "
+ "minimum timestamp is accepted.");
+ }
+ }
if (file_info.num_entries == 0) {
file_info.smallest_key.assign(user_key.data(), user_key.size());
} else {
@@ -171,6 +187,28 @@ struct SstFileWriter::Rep {
return Status::OK();
}
+ assert(begin_key.size() >= ts_sz);
+ assert(end_key.size() >= ts_sz);
+ Slice begin_key_ts =
+ Slice(begin_key.data() + begin_key.size() - ts_sz, ts_sz);
+ Slice end_key_ts = Slice(end_key.data() + end_key.size() - ts_sz, ts_sz);
+ assert(begin_key_ts.compare(end_key_ts) == 0);
+ if (strip_timestamp) {
+ // In this mode, we expect users to always provide a min timestamp.
+ if (internal_comparator.user_comparator()->CompareTimestamp(
+ begin_key_ts, MinU64Ts()) != 0) {
+ return Status::InvalidArgument(
+ "persist_user_defined_timestamps flag is set to false, only "
+ "minimum timestamp is accepted for start key.");
+ }
+ if (internal_comparator.user_comparator()->CompareTimestamp(
+ end_key_ts, MinU64Ts()) != 0) {
+ return Status::InvalidArgument(
+ "persist_user_defined_timestamps flag is set to false, only "
+ "minimum timestamp is accepted for end key.");
+ }
+ }
+
RangeTombstone tombstone(begin_key, end_key, 0 /* Sequence Number */);
if (file_info.num_range_del_entries == 0) {
file_info.smallest_range_del_key.assign(tombstone.start_key_.data(),
@@ -462,6 +500,30 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
if (file_info != nullptr) {
*file_info = r->file_info;
+ Slice smallest_key = r->file_info.smallest_key;
+ Slice largest_key = r->file_info.largest_key;
+ Slice smallest_range_del_key = r->file_info.smallest_range_del_key;
+ Slice largest_range_del_key = r->file_info.largest_range_del_key;
+ assert(smallest_key.empty() == largest_key.empty());
+ assert(smallest_range_del_key.empty() == largest_range_del_key.empty());
+ // Remove user-defined timestamps from external file metadata too when they
+ // should not be persisted.
+ if (r->strip_timestamp) {
+ if (!smallest_key.empty()) {
+ assert(smallest_key.size() >= r->ts_sz);
+ assert(largest_key.size() >= r->ts_sz);
+ file_info->smallest_key.resize(smallest_key.size() - r->ts_sz);
+ file_info->largest_key.resize(largest_key.size() - r->ts_sz);
+ }
+ if (!smallest_range_del_key.empty()) {
+ assert(smallest_range_del_key.size() >= r->ts_sz);
+ assert(largest_range_del_key.size() >= r->ts_sz);
+ file_info->smallest_range_del_key.resize(smallest_range_del_key.size() -
+ r->ts_sz);
+ file_info->largest_range_del_key.resize(largest_range_del_key.size() -
+ r->ts_sz);
+ }
+ }
}
r->builder.reset();
diff --git a/unreleased_history/new_features/sst_file_writer_not_persist_timestamp.md b/unreleased_history/new_features/sst_file_writer_not_persist_timestamp.md
new file mode 100644
index 000000000..1a3da84f4
--- /dev/null
+++ b/unreleased_history/new_features/sst_file_writer_not_persist_timestamp.md
@@ -0,0 +1 @@
+*Make `SstFileWriter` create SST files without persisting user defined timestamps when the `Option.persist_user_defined_timestamps` flag is set to false. \ No newline at end of file