From 1fca175eece9213a07f99973bae8e9a7d6aea93c Mon Sep 17 00:00:00 2001 From: Jay Huh Date: Wed, 24 Apr 2024 15:28:55 -0700 Subject: MultiCFSnapshot for NewIterators() API (#12573) Summary: As mentioned in https://github.com/facebook/rocksdb/issues/12561 and https://github.com/facebook/rocksdb/issues/12566 , `NewIterators()` API has not been providing consistent view of the db across multiple column families. This PR addresses it by utilizing `MultiCFSnapshot()` function which has been used for `MultiGet()` APIs. To be able to obtain the thread-local super version with ref, `sv_exclusive_access` parameter has been added to `MultiCFSnapshot()` so that we could call `GetReferencedSuperVersion()` or `GetAndRefSuperVersion()` depending on the param and support `Refresh()` API for MultiCfIterators Pull Request resolved: https://github.com/facebook/rocksdb/pull/12573 Test Plan: **Unit Tests Added** ``` ./db_iterator_test --gtest_filter="*IteratorsConsistentView*" ``` ``` ./multi_cf_iterator_test -- --gtest_filter="*ConsistentView*" ``` **Performance Check** Setup ``` make -j64 release TEST_TMPDIR=/dev/shm/db_bench ./db_bench -benchmarks="filluniquerandom" -key_size=32 -value_size=512 -num=10000000 -compression_type=none ``` Run ``` TEST_TMPDIR=/dev/shm/db_bench ./db_bench -use_existing_db=1 -benchmarks="multireadrandom" -cache_size=10485760000 ``` Before the change ``` DB path: [/dev/shm/db_bench/dbbench] multireadrandom : 6.374 micros/op 156892 ops/sec 6.374 seconds 1000000 operations; (0 of 1000000 found) ``` After the change ``` DB path: [/dev/shm/db_bench/dbbench] multireadrandom : 6.265 micros/op 159627 ops/sec 6.265 seconds 1000000 operations; (0 of 1000000 found) ``` Reviewed By: jowlyzhang Differential Revision: D56444066 Pulled By: jaykorean fbshipit-source-id: 327ce73c072da30c221e18d4f3389f49115b8f99 --- db/db_impl/db_impl.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'db/db_impl/db_impl.h') diff --git 
a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index ae9a8d5a1..504d7ec60 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -2355,18 +2355,20 @@ class DBImpl : public DB { // A structure to contain ColumnFamilyData and the SuperVersion obtained for // the consistent view of DB - struct ColumnFamilyDataSuperVersionPair { + struct ColumnFamilySuperVersionPair { + ColumnFamilyHandleImpl* cfh; ColumnFamilyData* cfd; // SuperVersion for the column family obtained in a manner that ensures a // consistent view across all column families in the DB SuperVersion* super_version; - ColumnFamilyDataSuperVersionPair(ColumnFamilyHandle* column_family, - SuperVersion* sv) - : cfd(static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd()), + ColumnFamilySuperVersionPair(ColumnFamilyHandle* column_family, + SuperVersion* sv) + : cfh(static_cast<ColumnFamilyHandleImpl*>(column_family)), + cfd(cfh->cfd()), super_version(sv) {} - ColumnFamilyDataSuperVersionPair() = default; + ColumnFamilySuperVersionPair() = default; }; // A common function to obtain a consistent snapshot, which can be implicit @@ -2380,9 +2382,17 @@ class DBImpl : public DB { // If callback is non-null, the callback is refreshed with the snapshot // sequence number // + // `extra_sv_ref` is used to indicate whether thread-local SuperVersion + // should be obtained with an extra ref (by GetReferencedSuperVersion()) or + // not (by GetAndRefSuperVersion()). For instance, point lookup like MultiGet + // does not require SuperVersion to be re-acquired throughout the entire + // invocation (no need extra ref), while MultiCfIterators may need the + // SuperVersion to be updated during Refresh() (requires extra ref). + // // `sv_from_thread_local` being set to false indicates that the SuperVersion // obtained from the ColumnFamilyData, whereas true indicates they are thread // local. 
+ // // A non-OK status will be returned if for a column family that enables // user-defined timestamp feature, the specified `ReadOptions.timestamp` // attempts to read collapsed history. @@ -2390,7 +2400,8 @@ class DBImpl : public DB { Status MultiCFSnapshot(const ReadOptions& read_options, ReadCallback* callback, IterDerefFuncType iter_deref_func, T* cf_list, - SequenceNumber* snapshot, bool* sv_from_thread_local); + bool extra_sv_ref, SequenceNumber* snapshot, + bool* sv_from_thread_local); // The actual implementation of the batching MultiGet. The caller is expected // to have acquired the SuperVersion and pass in a snapshot sequence number -- cgit v1.2.3-70-g09d2