Fix duplicate WAL entries caused by write after error recovery (#12873)
Summary:
**Context/Summary:**
We recently discovered a case where a write of a key, issued right after error recovery for a previous failed write of the same key finishes, produces two identical WAL entries, violating an assertion during WAL replay. This happens because we do not advance the seqno on a failed write, and, if the memtable is empty at the time, we reuse the WAL that already contains the failed write for the new write.
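
A condensed sketch of the failure sequence, adapted from the new unit test added in db/db_wal_test.cc below (it assumes the DBWALTest harness; the fault-injection setup is abbreviated):

```
// Runs inside the DBWALTest harness; condensed from
// DBWALTest.RecoveryFlushSwitchWALOnEmptyMemtable below.
Options options = CurrentOptions();
auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
options.env = fault_fs_env.get();
DestroyAndReopen(options);

// 1. A sync write fails with an injected retryable IO error. The failed
//    write does not advance the seqno and the memtable stays empty.
fault_fs->EnableThreadLocalErrorInjection(FaultInjectionIOType::kMetadataWrite);
WriteOptions wo;
wo.sync = true;
Status s = Put("k", "old_v", wo);
ASSERT_TRUE(s.IsIOError());

// 2. Stop injecting errors and retry until error recovery finishes and the
//    write succeeds. Before this fix, the recovery flush saw an empty
//    memtable and skipped switching the WAL, so the new write landed in the
//    WAL that already held the failed write -- same key, same seqno.
fault_fs->DisableThreadLocalErrorInjection(FaultInjectionIOType::kMetadataWrite);
while (!s.ok()) {
  s = Put("k", "new_v");
}

// 3. Reopening replays the WAL, hits the duplicate entries, and trips the
//    seq_per_batch_ assertion.
Reopen(options);
```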

This PR reuses the flush path for an empty memtable so that an error-recovery flush switches the WAL and updates the minimum WAL number to keep. It also updates the INFO log messages for clarity, e.g.:

```
2024/07/17-15:01:32.271789 327757 (Original Log Time 2024/07/17-15:01:25.942234) [/flush_job.cc:1017] [default] [JOB 2] Level-0 flush table #9: 0 bytes OK It's an empty SST file from a successful flush so won't be kept in the DB
2024/07/17-15:01:32.271798 327757 (Original Log Time 2024/07/17-15:01:32.269954) [/memtable_list.cc:560] [default] Level-0 commit flush result of table #9 started
2024/07/17-15:01:32.271802 327757 (Original Log Time 2024/07/17-15:01:32.271217) [/memtable_list.cc:760] [default] Level-0 commit flush result of table #9: memtable #1 done
```
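
The crux of the fix, excerpted from the diff below (the annotating comment is added here for explanation): a small helper identifies error-recovery flushes, and `DBImpl::FlushMemTable()` now switches the memtable, and hence the WAL, even when the memtable is empty, if the flush is part of error recovery:

```
static bool IsRecoveryFlush(FlushReason flush_reason) {
  return flush_reason == FlushReason::kErrorRecoveryRetryFlush ||
         flush_reason == FlushReason::kErrorRecovery;
}

// In DBImpl::FlushMemTable(): previously the switch was skipped whenever the
// memtable was empty and there was no cached recoverable state, which is what
// allowed the old WAL (holding the failed write) to be reused.
if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load() ||
    IsRecoveryFlush(flush_reason)) {
  s = SwitchMemtable(cfd, &context);
}
```

The same `IsRecoveryFlush()` check is applied on the atomic-flush path (`SelectColumnFamiliesForAtomicFlush()` and `AtomicFlushMemTables()`), which now takes the flush reason as a parameter.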

Pull Request resolved: #12873

Test Plan:
A new unit test that failed before this PR with the following assertion failure (i.e., duplicate WAL entries) and passes after it:
```
db_wal_test: db/write_batch.cc:2254: rocksdb::Status rocksdb::{anonymous}::MemTableInserter::PutCFImpl(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::ValueType, RebuildTxnOp, const ProtectionInfoKVOS64*) [with RebuildTxnOp = rocksdb::{anonymous}::MemTableInserter::PutCF(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&)::<lambda(rocksdb::WriteBatch*, uint32_t, const rocksdb::Slice&, const rocksdb::Slice&)>; uint32_t = unsigned int; rocksdb::ProtectionInfoKVOS64 = rocksdb::ProtectionInfoKVOS<long unsigned int>]: Assertion `seq_per_batch_' failed.
```

Reviewed By: anand1976

Differential Revision: D59884468

Pulled By: hx235

fbshipit-source-id: 5d854b719092552c69727a979f269fb7f6c39756
hx235 authored and facebook-github-bot committed Jul 22, 2024
1 parent c064ac3 commit 349b1ec
Showing 7 changed files with 73 additions and 13 deletions.
db/db_impl/db_impl.h (6 additions, 1 deletion)
```
@@ -1888,6 +1888,10 @@ class DBImpl : public DB {
   Env::Priority compaction_pri_;
 };
 
+static bool IsRecoveryFlush(FlushReason flush_reason) {
+  return flush_reason == FlushReason::kErrorRecoveryRetryFlush ||
+         flush_reason == FlushReason::kErrorRecovery;
+}
 // Initialize the built-in column family for persistent stats. Depending on
 // whether on-disk persistent stats have been enabled before, it may either
 // create a new column family and column family handle or just a column family
@@ -2032,7 +2036,8 @@
   // REQUIRES: mutex held
   void SelectColumnFamiliesForAtomicFlush(
       autovector<ColumnFamilyData*>* selected_cfds,
-      const autovector<ColumnFamilyData*>& provided_candidate_cfds = {});
+      const autovector<ColumnFamilyData*>& provided_candidate_cfds = {},
+      FlushReason flush_reason = FlushReason::kOthers);
 
   // Force current memtable contents to be flushed.
   Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
```
db/db_impl/db_impl_compaction_flush.cc (7 additions, 4 deletions)
```
@@ -2306,13 +2306,15 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
   }
   WaitForPendingWrites();
 
-  if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
+  if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load() ||
+      IsRecoveryFlush(flush_reason)) {
     s = SwitchMemtable(cfd, &context);
   }
   const uint64_t flush_memtable_id = std::numeric_limits<uint64_t>::max();
   if (s.ok()) {
     if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
-        !cached_recoverable_state_empty_.load()) {
+        !cached_recoverable_state_empty_.load() ||
+        IsRecoveryFlush(flush_reason)) {
       FlushRequest req{flush_reason, {{cfd, flush_memtable_id}}};
       flush_reqs.emplace_back(std::move(req));
       memtable_ids_to_wait.emplace_back(
@@ -2489,7 +2491,7 @@ Status DBImpl::AtomicFlushMemTables(
   }
   WaitForPendingWrites();
 
-  SelectColumnFamiliesForAtomicFlush(&cfds, candidate_cfds);
+  SelectColumnFamiliesForAtomicFlush(&cfds, candidate_cfds, flush_reason);
 
   // Unref the newly generated candidate cfds (when not provided) in
   // `candidate_cfds`
@@ -2500,7 +2502,8 @@
   }
 
   for (auto cfd : cfds) {
-    if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
+    if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load() &&
+        !IsRecoveryFlush(flush_reason)) {
       continue;
     }
     cfd->Ref();
```
db/db_impl/db_impl_write.cc (4 additions, 2 deletions)
```
@@ -1650,7 +1650,8 @@ Status DBImpl::WriteRecoverableState() {
 
 void DBImpl::SelectColumnFamiliesForAtomicFlush(
     autovector<ColumnFamilyData*>* selected_cfds,
-    const autovector<ColumnFamilyData*>& provided_candidate_cfds) {
+    const autovector<ColumnFamilyData*>& provided_candidate_cfds,
+    FlushReason flush_reason) {
   mutex_.AssertHeld();
   assert(selected_cfds);
 
@@ -1673,7 +1674,8 @@ void DBImpl::SelectColumnFamiliesForAtomicFlush(
       continue;
     }
     if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
-        !cached_recoverable_state_empty_.load()) {
+        !cached_recoverable_state_empty_.load() ||
+        IsRecoveryFlush(flush_reason)) {
       selected_cfds->push_back(cfd);
     }
   }
```
db/db_wal_test.cc (43 additions)
```
@@ -2757,6 +2757,49 @@ TEST_F(DBWALTest, EmptyWalReopenTest) {
   }
 }
 
+TEST_F(DBWALTest, RecoveryFlushSwitchWALOnEmptyMemtable) {
+  Options options = CurrentOptions();
+  auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  options.env = fault_fs_env.get();
+  options.avoid_flush_during_shutdown = true;
+  DestroyAndReopen(options);
+
+  fault_fs->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kMetadataWrite, 7 /* seed*/, 1 /* one_in */,
+      true /* retryable */, false /* has_data_loss*/);
+  fault_fs->EnableThreadLocalErrorInjection(
+      FaultInjectionIOType::kMetadataWrite);
+
+  WriteOptions wo;
+  wo.sync = true;
+  Status s = Put("k", "old_v", wo);
+  ASSERT_TRUE(s.IsIOError());
+  // To verify the key is not in memtable nor SST
+  ASSERT_TRUE(static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+                  ->cfd()
+                  ->mem()
+                  ->IsEmpty());
+  ASSERT_EQ("NOT_FOUND", Get("k"));
+
+  fault_fs->DisableThreadLocalErrorInjection(
+      FaultInjectionIOType::kMetadataWrite);
+
+  // Keep trying write until recovery of the previous IO error finishes
+  while (!s.ok()) {
+    options.env->SleepForMicroseconds(1000);
+    s = Put("k", "new_v");
+  }
+
+  // If recovery flush didn't switch WAL, we will end up having two duplicate
+  // WAL entries with same seqno and same key that violate assertion during WAL
+  // recovery and fail DB reopen
+  options.avoid_flush_during_recovery = false;
+  Reopen(options);
+
+  ASSERT_EQ("new_v", Get("k"));
+  Destroy(options);
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
```
db/flush_job.cc (7 additions, 2 deletions)
```
@@ -235,7 +235,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
 
   AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN);
   if (mems_.empty()) {
-    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] No memtable to flush",
                      cfd_->GetName().c_str());
     return Status::OK();
   }
@@ -1017,10 +1017,15 @@ Status FlushJob::WriteLevel0Table() {
     ROCKS_LOG_BUFFER(log_buffer_,
                      "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
                      " bytes %s"
-                     "%s",
+                     " %s"
+                     " %s",
                      cfd_->GetName().c_str(), job_context_->job_id,
                      meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
                      s.ToString().c_str(),
+                     s.ok() && meta_.fd.GetFileSize() == 0
+                         ? "It's an empty SST file from a successful flush so "
+                           "won't be kept in the DB"
+                         : "",
                      meta_.marked_for_compaction ? " (needs compaction)" : "");
 
   if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
```
db/memtable_list.cc (5 additions, 4 deletions)
```
@@ -558,11 +558,12 @@ Status MemTableList::TryInstallMemtableFlushResults(
       batch_file_number = m->file_number_;
       if (m->edit_.GetBlobFileAdditions().empty()) {
         ROCKS_LOG_BUFFER(log_buffer,
-                         "[%s] Level-0 commit table #%" PRIu64 " started",
+                         "[%s] Level-0 commit flush result of table #%" PRIu64
+                         " started",
                          cfd->GetName().c_str(), m->file_number_);
       } else {
         ROCKS_LOG_BUFFER(log_buffer,
-                         "[%s] Level-0 commit table #%" PRIu64
+                         "[%s] Level-0 commit flush result of table #%" PRIu64
                          " (+%zu blob files) started",
                          cfd->GetName().c_str(), m->file_number_,
                          m->edit_.GetBlobFileAdditions().size());
@@ -757,12 +758,12 @@ void MemTableList::RemoveMemTablesOrRestoreFlags(
     MemTable* m = current_->memlist_.back();
     if (m->edit_.GetBlobFileAdditions().empty()) {
       ROCKS_LOG_BUFFER(log_buffer,
-                       "[%s] Level-0 commit table #%" PRIu64
+                       "[%s] Level-0 commit flush result of table #%" PRIu64
                        ": memtable #%" PRIu64 " done",
                        cfd->GetName().c_str(), m->file_number_, mem_id);
     } else {
       ROCKS_LOG_BUFFER(log_buffer,
-                       "[%s] Level-0 commit table #%" PRIu64
+                       "[%s] Level-0 commit flush result of table #%" PRIu64
                        " (+%zu blob files)"
                        ": memtable #%" PRIu64 " done",
                        cfd->GetName().c_str(), m->file_number_,
```
unreleased_history/bug_fixes/duplicate_wal_entries.md (1 addition)
```
@@ -0,0 +1 @@
+Fixed a bug where a successful write right after error recovery for the last failed write finishes causes duplicate WAL entries
```
