author    Peter Dillinger <peterd@meta.com>  2024-07-12 16:01:57 -0700
committer Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>  2024-07-12 16:01:57 -0700
commit    72438a678872544809393b831c7273794c074215 (patch)
tree      8b91008ead43c8221a0a3041250a06ca496c7f14 /utilities
parent    3db030d7ee1b887ce818ec6f6a8d10949f9e9a22 (diff)
Support read & write with unsynced data in FaultInjectionTestFS (#12852)
Summary: Follow-up to https://github.com/facebook/rocksdb/issues/12729 and others, fixing FaultInjectionTestFS handling of the case where a live WAL is appended to and synced while also being copied for checkpoint or backup, up to a known flushed (but not necessarily synced) prefix of the file. The hard part was structuring the code so that it handles a subtle race with Sync in another thread (see code comments; thanks Changyu) while maintaining good performance and testability. For more context, see the call to FlushWAL() in DBImpl::GetLiveFilesStorageInfo().

Also, the unit test for https://github.com/facebook/rocksdb/issues/12729 was neutered by https://github.com/facebook/rocksdb/issues/12797; this change re-enables the functionality it tests.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12852

Test Plan: unit test expanded/updated; local runs of blackbox_crash_test. The implementation is structured so that a multi-threaded unit test is not needed to cover at least the code lines, because the race handling is folded into "catch up after returning unsynced data, then a sync."

Reviewed By: cbi42

Differential Revision: D59594045

Pulled By: pdillinger

fbshipit-source-id: 94667bb72255e2952586c53bae2c2dd384e85a50
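A minimal usage sketch of the feature this patch extends (not part of the patch; the setter name SetReadUnsyncedData() is an assumption inferred from the ReadUnsyncedData() getter used in the diff below):

  #include <memory>
  #include "utilities/fault_injection_fs.h"
  using ROCKSDB_NAMESPACE::FaultInjectionTestFS;
  using ROCKSDB_NAMESPACE::FileSystem;

  void ExampleSetup() {
    auto fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
    fs->SetReadUnsyncedData(true);  // assumed setter name; see note above
    // A writer thread appends to and occasionally syncs a live WAL through
    // `fs`, while a reader (e.g. checkpoint/backup via
    // DBImpl::GetLiveFilesStorageInfo) sequentially reads the same file.
    // With unsynced reads enabled, each Read() first consumes synced bytes
    // from the underlying file, then fills the tail from the in-memory
    // unsynced buffer.
  }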
Diffstat (limited to 'utilities')
-rw-r--r--  utilities/fault_injection_fs.cc  192
-rw-r--r--  utilities/fault_injection_fs.h    17
2 files changed, 173 insertions, 36 deletions
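The core of the new ReadUnsynced() in the diff below is an interval-overlap computation between the requested read range and the buffered-but-unsynced bytes. A standalone sketch of just that computation (hypothetical helper name; the real code operates on FSFileState under the FS mutex and returns a Slice):

  #include <algorithm>
  #include <cstdint>
  #include <string>

  // Copy the overlap of the requested range [offset, offset + n) with the
  // unsynced range [pos_at_last_sync, pos_at_last_sync + buffer.size())
  // into the corresponding position in scratch; return bytes copied.
  size_t CopyUnsyncedOverlap(uint64_t offset, size_t n,
                             int64_t pos_at_last_sync,
                             const std::string& buffer, char* scratch) {
    int64_t begin = std::max(static_cast<int64_t>(offset), pos_at_last_sync);
    int64_t end =
        std::min(static_cast<int64_t>(offset + n),
                 pos_at_last_sync + static_cast<int64_t>(buffer.size()));
    if (begin >= end) {
      return 0;  // no unsynced bytes fall inside the requested range
    }
    std::copy_n(buffer.data() + (begin - pos_at_last_sync),
                static_cast<size_t>(end - begin), scratch + (begin - offset));
    return static_cast<size_t>(end - begin);
  }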
diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc
index 5498721a8..9c952e7f2 100644
--- a/utilities/fault_injection_fs.cc
+++ b/utilities/fault_injection_fs.cc
@@ -559,38 +559,44 @@ size_t TestFSRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
}
}
-void FaultInjectionTestFS::AddUnsyncedToRead(const std::string& fname,
- size_t pos, size_t n,
- Slice* result, char* scratch) {
- // Should be checked prior
- assert(result->size() < n);
- size_t pos_after = pos + result->size();
+namespace {
+// Modifies `result` to start at the beginning of `scratch` if not already,
+// copying data there if needed.
+void MoveToScratchIfNeeded(Slice* result, char* scratch) {
+ if (result->data() != scratch) {
+ // NOTE: might overlap
+ std::copy_n(result->data(), result->size(), scratch);
+ }
+ *result = Slice(scratch, result->size());
+}
+} // namespace
+
+void FaultInjectionTestFS::ReadUnsynced(const std::string& fname,
+ uint64_t offset, size_t n,
+ Slice* result, char* scratch,
+ int64_t* pos_at_last_sync) {
+ *result = Slice(scratch, 0); // default empty result
MutexLock l(&mutex_);
auto it = db_file_state_.find(fname);
if (it != db_file_state_.end()) {
auto& st = it->second;
- if (st.pos_at_last_append_ > static_cast<ssize_t>(pos_after)) {
- size_t remaining_requested = n - result->size();
- size_t to_copy =
- std::min(remaining_requested,
- static_cast<size_t>(st.pos_at_last_append_) - pos_after);
- size_t buffer_offset = pos_after - static_cast<size_t>(std::max(
- st.pos_at_last_sync_, ssize_t{0}));
- // Data might have been dropped from buffer
- if (st.buffer_.size() > buffer_offset) {
- to_copy = std::min(to_copy, st.buffer_.size() - buffer_offset);
- if (result->data() != scratch) {
- // TODO: this will be needed when supporting random reads
- // but not currently used
- abort();
- // NOTE: might overlap
- // std::copy_n(result->data(), result->size(), scratch);
- }
- std::copy_n(st.buffer_.data() + buffer_offset, to_copy,
- scratch + result->size());
- *result = Slice(scratch, result->size() + to_copy);
- }
+ // Use 0 to mean "tracked but nothing synced"
+ *pos_at_last_sync = std::max(st.pos_at_last_sync_, int64_t{0});
+ // Find overlap between [offset, offset + n) and
+ // [*pos_at_last_sync, *pos_at_last_sync + st.buffer_.size())
+ int64_t begin = std::max(static_cast<int64_t>(offset), *pos_at_last_sync);
+ int64_t end =
+ std::min(static_cast<int64_t>(offset + n),
+ *pos_at_last_sync + static_cast<int64_t>(st.buffer_.size()));
+
+ // Copy and return overlap if there is any
+ if (begin < end) {
+ size_t offset_in_buffer = static_cast<size_t>(begin - *pos_at_last_sync);
+ size_t offset_in_scratch = static_cast<size_t>(begin - offset);
+ std::copy_n(st.buffer_.data() + offset_in_buffer, end - begin,
+ scratch + offset_in_scratch);
+ *result = Slice(scratch + offset_in_scratch, end - begin);
}
}
}
@@ -606,13 +612,137 @@ IOStatus TestFSSequentialFile::Read(size_t n, const IOOptions& options,
return s;
}
- s = target()->Read(n, options, result, scratch, dbg);
- if (!s.ok()) {
- return s;
+ // Some complex logic is needed to deal with concurrent write to the same
+ // file, while keeping good performance (e.g. not holding FS mutex during
+ // I/O op), especially in common cases.
+
+ if (read_pos_ == target_read_pos_) {
+ // Normal case: start by reading from underlying file
+ s = target()->Read(n, options, result, scratch, dbg);
+ if (!s.ok()) {
+ return s;
+ }
+ target_read_pos_ += result->size();
+ } else {
+ // We must have previously read buffered data (unsynced) not written to
+ // target. Deal with this case (and more) below.
+ *result = {};
}
if (fs_->ReadUnsyncedData() && result->size() < n) {
- fs_->AddUnsyncedToRead(fname_, read_pos_, n, result, scratch);
+ // We need to check if there's unsynced data to fill out the rest of the
+ // read.
+
+ // First, ensure target read data is in scratch for easy handling.
+ MoveToScratchIfNeeded(result, scratch);
+ assert(result->data() == scratch);
+
+ // If we just did a target Read, we only want unsynced data after it
+ // (target_read_pos_). Otherwise (e.g. if target is behind because of
+ // unsynced data) we want unsynced data starting at the current read pos
+ // (read_pos_, not yet updated).
+ const uint64_t unsynced_read_pos = std::max(target_read_pos_, read_pos_);
+ const size_t offset_from_read_pos =
+ static_cast<size_t>(unsynced_read_pos - read_pos_);
+ Slice unsynced_result;
+ int64_t pos_at_last_sync = -1;
+ fs_->ReadUnsynced(fname_, unsynced_read_pos, n - offset_from_read_pos,
+ &unsynced_result, scratch + offset_from_read_pos,
+ &pos_at_last_sync);
+ assert(unsynced_result.data() >= scratch + offset_from_read_pos);
+ assert(unsynced_result.data() < scratch + n);
+ // Now, there are several cases to consider (some grouped together):
+ if (pos_at_last_sync <= static_cast<int64_t>(unsynced_read_pos)) {
+ // 1. We didn't get any unsynced data because nothing has been written
+ // to the file beyond unsynced_read_pos (including untracked
+ // pos_at_last_sync == -1)
+ // 2. We got some unsynced data starting at unsynced_read_pos (possibly
+ // on top of some synced data from target). We don't need to try reading
+ // any more from target because we established a "point in time" for
+ // completing this Read in which we read as much tail data (unsynced) as
+ // we could.
+ assert(pos_at_last_sync >= 0 || unsynced_result.size() == 0);
+
+ // Combined data is already lined up in scratch.
+ assert(result->data() + result->size() == unsynced_result.data());
+ assert(result->size() + unsynced_result.size() <= n);
+ // Combine results
+ *result = Slice(result->data(), result->size() + unsynced_result.size());
+ } else {
+ // 3. Any unsynced data we got was after unsynced_read_pos because the
+ // file was synced some time since our last target Read (either from this
+ // Read or a prior Read). We need to read more data from target to ensure
+ // this Read is filled out, even though we might have already read some
+ // (but not all due to a race). This code handles:
+ //
+ // * Catching up target after prior read(s) of unsynced data
+ // * Racing Sync in another thread since we called target Read above
+ //
+ // And merging potentially three results together for this Read:
+ // * The original target Read above
+ // * The following (non-throw-away) target Read
+ // * The ReadUnsynced above, which is always last if it returned data,
+ // so that we have a "point in time" for completing this Read in which we
+ // read as much tail data (unsynced) as we could.
+ //
+ // Deeper note about the race: we cannot just treat the original target
+ // Read as a "point in time" view of available data in the file, because
+ // there might have been unsynced data at that time, which became synced
+ // data by the time we read unsynced data. That is the race we are
+ // resolving with this "double check"-style code.
+ const size_t supplemental_read_pos = unsynced_read_pos;
+
+ // First, if there's any data from target that we know we would need to
+ // throw away to catch up, try to do it.
+ if (target_read_pos_ < supplemental_read_pos) {
+ Slice throw_away_result;
+ size_t throw_away_n = supplemental_read_pos - target_read_pos_;
+ std::unique_ptr<char[]> throw_away_scratch{new char[throw_away_n]};
+ s = target()->Read(throw_away_n, options, &throw_away_result,
+ throw_away_scratch.get(), dbg);
+ if (!s.ok()) {
+ read_pos_ += result->size();
+ return s;
+ }
+ target_read_pos_ += throw_away_result.size();
+ if (target_read_pos_ < supplemental_read_pos) {
+ // Because of pos_at_last_sync > supplemental_read_pos, we should
+ // have been able to catch up
+ read_pos_ += result->size();
+ return IOStatus::IOError(
+ "Unexpected truncation or short read of file " + fname_);
+ }
+ }
+ // Now we can do a productive supplemental Read from target
+ assert(target_read_pos_ == supplemental_read_pos);
+ Slice supplemental_result;
+ size_t supplemental_n =
+ unsynced_result.size() == 0
+ ? n - offset_from_read_pos
+ : unsynced_result.data() - (scratch + offset_from_read_pos);
+ s = target()->Read(supplemental_n, options, &supplemental_result,
+ scratch + offset_from_read_pos, dbg);
+ if (!s.ok()) {
+ read_pos_ += result->size();
+ return s;
+ }
+ target_read_pos_ += supplemental_result.size();
+ MoveToScratchIfNeeded(&supplemental_result,
+ scratch + offset_from_read_pos);
+
+ // Combined data is already lined up in scratch.
+ assert(result->data() + result->size() == supplemental_result.data());
+ assert(unsynced_result.size() == 0 ||
+ supplemental_result.data() + supplemental_result.size() ==
+ unsynced_result.data());
+ assert(result->size() + supplemental_result.size() +
+ unsynced_result.size() <=
+ n);
+ // Combine results
+ *result =
+ Slice(result->data(), result->size() + supplemental_result.size() +
+ unsynced_result.size());
+ }
}
read_pos_ += result->size();
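To make "case 3" in the code above concrete (illustrative numbers, not from the patch): suppose Read(n=200) starts at read_pos_ == 0 when the file has 100 synced bytes. The initial target Read returns [0, 100). Before ReadUnsynced() runs, a concurrent Sync() advances pos_at_last_sync_ to 150, so the unsynced buffer holds only [150, 160); ReadUnsynced() returns those bytes along with pos_at_last_sync == 150 > 100, revealing that bytes [100, 150) landed in neither piece. The supplemental target Read fills exactly that gap, and the three pieces are merged into one contiguous result in scratch.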
diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h
index 504a6b33c..fdc716f98 100644
--- a/utilities/fault_injection_fs.h
+++ b/utilities/fault_injection_fs.h
@@ -41,8 +41,8 @@ enum class FaultInjectionIOType {
struct FSFileState {
std::string filename_;
- ssize_t pos_at_last_append_;
- ssize_t pos_at_last_sync_;
+ int64_t pos_at_last_append_;
+ int64_t pos_at_last_sync_;
std::string buffer_;
explicit FSFileState(const std::string& filename)
@@ -178,7 +178,8 @@ class TestFSSequentialFile : public FSSequentialFileOwnerWrapper {
private:
FaultInjectionTestFS* fs_;
std::string fname_;
- size_t read_pos_ = 0;
+ uint64_t read_pos_ = 0;
+ uint64_t target_read_pos_ = 0;
};
class TestFSDirectory : public FSDirectory {
@@ -548,8 +549,14 @@ class FaultInjectionTestFS : public FileSystemWrapper {
void PrintInjectedThreadLocalErrorBacktrace(FaultInjectionIOType type);
- void AddUnsyncedToRead(const std::string& fname, size_t offset, size_t n,
- Slice* result, char* scratch);
+ // If there is unsynced data in the specified file within the specified
+ // range [offset, offset + n), return the unsynced data overlapping with
+ // that range, in a corresponding range of scratch. When known, also return
+ // the position of the last sync, so that the caller can determine whether
+ // more data is available from the target file when not available from
+ // unsynced.
+ void ReadUnsynced(const std::string& fname, uint64_t offset, size_t n,
+ Slice* result, char* scratch, int64_t* pos_at_last_sync);
private:
port::Mutex mutex_;