summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChangyu Bi <changyubi@meta.com>2024-09-17 14:10:33 -0700
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>2024-09-17 14:10:33 -0700
commitf97e33454f97e88c803b7354166258f950472aa4 (patch)
treeec37aa28fde9a17c07e17bf34bed0575f4c1e69f
parenta16457625243e22b08cd1c87ce01153e4f554d77 (diff)
Fix a bug with auto recovery on WAL write error (#12995)
Summary: A recent crash test failure shows that auto recovery from WAL write failure can cause CFs to be inconsistent. A unit test repro in P1569398553. The following is an example sequence of events: ``` 0. manual_wal_flush is true. There are multiple CFs in a DB. 1. Submit a write batch with updates to multiple CF 2. A FlushWAL or a memtable swtich that will try to write the buffered WAL data. Fail this write so that buffered WAL data is dropped: https://github.com/facebook/rocksdb/blob/4b1d595306fae602b56d2aa5128b11b1162bfa81/file/writable_file_writer.cc#L624 The error needs to be retryable to start background auto recovery. 3. One CF successfully flushes its memtable during auto recovery. 4. Crash the process. 5. Reopen the DB, one CF will have the update as a result of successful flush. Other CFs will miss all the updates in the write batch since WAL does not have them. ``` This can happen if a users configures manual_wal_flush, uses more than one CF, and can hit retryable error for WAL writes. This PR is a short-term fix that upgrades WAL related errors to fatal and not trigger auto recovery. A long-term fix may be not drop buffered WAL data by checking how much data is actually written, or require atomically flushing all column families during error recovery from this kind of errors. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12995 Test Plan: added unit test to check error severity and if recovery is triggered. A crash test repro command that fails in a few runs before this PR: ``` python3 ./tools/db_crashtest.py blackbox --interval=60 --metadata_write_fault_one_in=1000 --column_families=10 --exclude_wal_from_write_fault_injection=0 --manual_wal_flush_one_in=1000 --WAL_size_limit_MB=10240 --WAL_ttl_seconds=0 --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --adm_policy=1 --advise_random_on_open=1 --allow_data_in_errors=True --allow_fallocate=1 --async_io=0 --auto_readahead_size=0 --avoid_flush_during_recovery=1 --avoid_flush_during_shutdown=1 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --block_align=1 --block_protection_bytes_per_key=0 --block_size=16384 --bloom_before_level=2147483647 --bottommost_compression_type=none --bottommost_file_compaction_delay=0 --bytes_per_sync=0 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=33554432 --cache_type=auto_hyper_clock_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=1 --charge_table_reader=0 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=0 --checksum_type=kxxHash64 --clear_column_family_one_in=0 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=1 --compaction_readahead_size=1048576 --compaction_ttl=0 --compress_format_version=1 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=4 --compression_type=none --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc= --data_block_index_type=0 --db_write_buffer_size=0 --decouple_partitioned_filters=1 --default_temperature=kCold --default_write_temperature=kWarm --delete_obsolete_files_period_micros=30000000 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_file_deletions_one_in=1000000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=1 --enable_checksum_handoff=1 --enable_compaction_filter=0 --enable_custom_split_merge=0 --enable_do_not_compress_roles=0 --enable_index_compression=0 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=0 --enable_thread_tracking=1 --enable_write_thread_adaptive_yield=1 --error_recovery_with_no_fault_injection=1 --fail_if_options_file_error=1 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=6 --get_all_column_family_metadata_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=10000 --get_properties_of_all_tables_one_in=1000000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --index_block_restart_interval=4 --index_shortening=1 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=16384 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100000 --last_level_temperature=kWarm --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=0 --lowest_used_cache_tier=2 --manifest_preallocation_size=5120 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=100000 --max_key_len=3 --max_log_file_size=0 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=2097152 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=0 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=2 --memtable_whole_key_filtering=0 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --min_write_buffer_number_to_merge=1 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=2 --open_files=100 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --optimize_filters_for_hits=0 --optimize_filters_for_memory=0 --optimize_multiget_for_io=0 --paranoid_file_checks=1 --paranoid_memory_checks=0 --partition_filters=0 --partition_pinning=2 --pause_background_one_in=10000 --periodic_compaction_seconds=0 --prefix_size=8 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=45 --recycle_log_file_num=0 --reopen=0 --report_bg_io_stats=0 --reset_stats_one_in=10000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --set_options_one_in=10000 --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=1048576 --sqfc_name=bar --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=600 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=1 --table_cache_numshardbits=6 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=8 --universal_max_read_amp=-1 --unpartitioned_pinning=2 --use_adaptive_mutex=1 --use_adaptive_mutex_lru=0 --use_attribute_group=1 --use_delta_encoding=0 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=1 --use_multi_cf_iterator=1 --use_multi_get_entity=0 --use_multiget=0 --use_put_entity_one_in=1 --use_sqfc_for_range_queries=0 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=1 --verify_db_one_in=100000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=4194304 --write_dbid_to_manifest=0 --write_fault_one_in=50 --writepercent=35 --ops_per_thread=100000 --preserve_unverified_changes=1 ``` Reviewed By: hx235 Differential Revision: D62888510 Pulled By: cbi42 fbshipit-source-id: 308bdbbb8d897cc8eba950155cd0e37cf7eb76fe
-rw-r--r--db/db_impl/db_impl.cc4
-rw-r--r--db/db_impl/db_impl.h5
-rw-r--r--db/db_impl/db_impl_debug.cc5
-rw-r--r--db/db_impl/db_impl_write.cc14
-rw-r--r--db/db_wal_test.cc23
-rw-r--r--db/error_handler.cc31
-rw-r--r--db/error_handler.h3
-rw-r--r--unreleased_history/bug_fixes/wal-error-recovery.md1
8 files changed, 73 insertions, 13 deletions
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 352727892..5b54b4d2d 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1526,7 +1526,7 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
io_s.ToString().c_str());
// In case there is a fs error we should set it globally to prevent the
// future writes
- IOStatusCheck(io_s);
+ WALIOStatusCheck(io_s);
// whether sync or not, we should abort the rest of function upon error
return static_cast<Status>(io_s);
}
@@ -1683,7 +1683,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
io_s.ToString().c_str());
// In case there is a fs error we should set it globally to prevent the
// future writes
- IOStatusCheck(io_s);
+ WALIOStatusCheck(io_s);
}
if (io_s.ok() && need_wal_dir_sync) {
io_s = directories_.GetWalDir()->FsyncWithDirOptions(
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 1b3bafaae..da0625ae7 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1151,6 +1151,8 @@ class DBImpl : public DB {
// Get the background error status
Status TEST_GetBGError();
+ bool TEST_IsRecoveryInProgress();
+
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
uint64_t TEST_MaxNextLevelOverlappingBytes(
@@ -1956,6 +1958,7 @@ class DBImpl : public DB {
void ReleaseFileNumberFromPendingOutputs(
std::unique_ptr<std::list<uint64_t>::iterator>& v);
+ // Sets bg error if there is an error writing to WAL.
IOStatus SyncClosedWals(const WriteOptions& write_options,
JobContext* job_context, VersionEdit* synced_wals,
bool error_recovery_in_prog);
@@ -2174,7 +2177,7 @@ class DBImpl : public DB {
// Used by WriteImpl to update bg_error_ when IO error happens, e.g., write
// WAL, sync WAL fails, if paranoid check is enabled.
- void IOStatusCheck(const IOStatus& status);
+ void WALIOStatusCheck(const IOStatus& status);
// Used by WriteImpl to update bg_error_ in case of memtable insert error.
void MemTableInsertStatusCheck(const Status& memtable_insert_status);
diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc
index 5ebe921cf..790a50d7a 100644
--- a/db/db_impl/db_impl_debug.cc
+++ b/db/db_impl/db_impl_debug.cc
@@ -199,6 +199,11 @@ Status DBImpl::TEST_GetBGError() {
return error_handler_.GetBGError();
}
+bool DBImpl::TEST_IsRecoveryInProgress() {
+ InstrumentedMutexLock l(&mutex_);
+ return error_handler_.IsRecoveryInProgress();
+}
+
void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index da773bac0..e28d3fa91 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -656,7 +656,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
if (!io_s.ok()) {
// Check WriteToWAL status
- IOStatusCheck(io_s);
+ WALIOStatusCheck(io_s);
}
if (!w.CallbackFailed()) {
if (!io_s.ok()) {
@@ -799,7 +799,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
if (!io_s.ok()) {
// Check WriteToWAL status
- IOStatusCheck(io_s);
+ WALIOStatusCheck(io_s);
} else if (!w.CallbackFailed()) {
WriteStatusCheck(w.status);
}
@@ -1077,7 +1077,7 @@ Status DBImpl::WriteImplWALOnly(
// This error checking and return is moved up to avoid using uninitialized
// last_sequence.
if (!io_s.ok()) {
- IOStatusCheck(io_s);
+ WALIOStatusCheck(io_s);
write_thread->ExitAsBatchGroupLeader(write_group, status);
return status;
}
@@ -1175,7 +1175,7 @@ void DBImpl::WriteStatusCheck(const Status& status) {
}
}
-void DBImpl::IOStatusCheck(const IOStatus& io_status) {
+void DBImpl::WALIOStatusCheck(const IOStatus& io_status) {
// Is setting bg_error_ enough here? This will at least stop
// compaction and fail any further writes.
if ((immutable_db_options_.paranoid_checks && !io_status.ok() &&
@@ -1183,7 +1183,8 @@ void DBImpl::IOStatusCheck(const IOStatus& io_status) {
io_status.IsIOFenced()) {
mutex_.Lock();
// Maybe change the return status to void?
- error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback);
+ error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback,
+ /*wal_related=*/true);
mutex_.Unlock();
} else {
// Force writable file to be continue writable.
@@ -2326,7 +2327,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
// We may have lost data from the WritableFileBuffer in-memory buffer for
// the current log, so treat it as a fatal error and set bg_error
if (!io_s.ok()) {
- error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable);
+ error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable,
+ /*wal_related=*/true);
} else {
error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
}
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc
index 51c25746d..06f3e7725 100644
--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@@ -2931,6 +2931,29 @@ TEST_F(DBWALTest, RecoveryFlushSwitchWALOnEmptyMemtable) {
ASSERT_EQ("new_v", Get("k"));
Destroy(options);
}
+
+TEST_F(DBWALTest, WALWriteErrorNoRecovery) {
+ Options options = CurrentOptions();
+ auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ options.env = fault_fs_env.get();
+ options.manual_wal_flush = true;
+ DestroyAndReopen(options);
+ fault_fs->SetThreadLocalErrorContext(
+ FaultInjectionIOType::kWrite, 7 /* seed*/, 1 /* one_in */,
+ true /* retryable */, false /* has_data_loss*/);
+ fault_fs->EnableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);
+
+ ASSERT_OK(Put("k", "v"));
+ Status s;
+ s = db_->FlushWAL(false);
+ ASSERT_TRUE(s.IsIOError());
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
+ ASSERT_FALSE(dbfull()->TEST_IsRecoveryInProgress());
+ fault_fs->DisableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);
+ Destroy(options);
+}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
diff --git a/db/error_handler.cc b/db/error_handler.cc
index 57577d7c3..2a8a7dee5 100644
--- a/db/error_handler.cc
+++ b/db/error_handler.cc
@@ -381,7 +381,7 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
// BackgroundErrorReason reason) will be called to handle other error cases
// such as delegating to SstFileManager to handle no space error.
void ErrorHandler::SetBGError(const Status& bg_status,
- BackgroundErrorReason reason) {
+ BackgroundErrorReason reason, bool wal_related) {
db_mutex_->AssertHeld();
Status tmp_status = bg_status;
IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
@@ -389,8 +389,8 @@ void ErrorHandler::SetBGError(const Status& bg_status,
if (bg_io_err.ok()) {
return;
}
- ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s",
- bg_io_err.ToString().c_str());
+ ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s, reason %d",
+ bg_io_err.ToString().c_str(), static_cast<int>(reason));
RecordStats({ERROR_HANDLER_BG_ERROR_COUNT, ERROR_HANDLER_BG_IO_ERROR_COUNT},
{} /* int_histograms */);
@@ -412,6 +412,31 @@ void ErrorHandler::SetBGError(const Status& bg_status,
recover_context_ = context;
return;
}
+ if (wal_related) {
+ assert(reason == BackgroundErrorReason::kWriteCallback ||
+ reason == BackgroundErrorReason::kMemTable ||
+ reason == BackgroundErrorReason::kFlush);
+ }
+ if (db_options_.manual_wal_flush && wal_related && bg_io_err.IsIOError()) {
+ // With manual_wal_flush, a WAL write failure can drop buffered WAL writes.
+ // Memtables and WAL then become inconsistent. A successful memtable flush
+ // on one CF can cause CFs to be inconsistent upon restart. Before we fix
+ // the bug in auto recovery from WAL write failures that can flush one CF
+ // at a time, we set the error severity to fatal to disallow auto recovery.
+ // TODO: remove parameter `wal_related` once we can automatically recover
+ // from WAL write failures.
+ bool auto_recovery = false;
+ Status bg_err(new_bg_io_err, Status::Severity::kFatalError);
+ CheckAndSetRecoveryAndBGError(bg_err);
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "ErrorHandler: A potentially WAL error happened, set "
+ "background IO error as fatal error\n");
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
+ &bg_err, db_mutex_, &auto_recovery);
+ recover_context_ = context;
+ return;
+ }
+
if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
(bg_io_err.GetScope() == IOStatus::IOErrorScope::kIOErrorScopeFile ||
bg_io_err.GetRetryable())) {
diff --git a/db/error_handler.h b/db/error_handler.h
index b257f69cf..7535d654d 100644
--- a/db/error_handler.h
+++ b/db/error_handler.h
@@ -56,7 +56,8 @@ class ErrorHandler {
Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
Status::Code code, Status::SubCode subcode);
- void SetBGError(const Status& bg_err, BackgroundErrorReason reason);
+ void SetBGError(const Status& bg_err, BackgroundErrorReason reason,
+ bool wal_related = false);
Status GetBGError() const { return bg_error_; }
diff --git a/unreleased_history/bug_fixes/wal-error-recovery.md b/unreleased_history/bug_fixes/wal-error-recovery.md
new file mode 100644
index 000000000..07d87473d
--- /dev/null
+++ b/unreleased_history/bug_fixes/wal-error-recovery.md
@@ -0,0 +1 @@
+* Fix a bug with manual_wal_flush and auto error recovery from WAL failure that may cause CFs to be inconsistent (#12995). The fix will set potential WAL write failure as fatal error when manual_wal_flush is true, and disables auto error recovery from these errors. \ No newline at end of file