From 748539c183453bdeaff1eb0da8ccf5adacb796e7 Mon Sep 17 00:00:00 2001 From: David Grogan Date: Wed, 21 Aug 2013 11:12:47 -0700 Subject: [PATCH] LevelDB 1.13 Fix issues 77, 87, 182, 190. Additionally, fix the bug described in https://groups.google.com/d/msg/leveldb/yL6h1mAOc20/vLU64RylIdMJ where a large contiguous keyspace of deleted data was not getting compacted. Also fix a bug where options.max_open_files was not getting clamped properly. --- Makefile | 6 ++- db/autocompact_test.cc | 118 +++++++++++++++++++++++++++++++++++++++++ db/corruption_test.cc | 51 ++++++++---------- db/db_impl.cc | 41 +++++++++----- db/db_impl.h | 9 +++- db/db_iter.cc | 41 +++++++++----- db/db_iter.h | 8 +-- db/dbformat.h | 3 ++ db/version_set.cc | 96 +++++++++++++++++++++++++++++++-- db/version_set.h | 15 ++++++ include/leveldb/db.h | 2 +- util/env_posix.cc | 33 +++++++++++- util/random.h | 7 ++- 13 files changed, 363 insertions(+), 67 deletions(-) create mode 100644 db/autocompact_test.cc diff --git a/Makefile b/Makefile index 96af7765be41e..20c9c4f28751a 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,7 @@ TESTHARNESS = ./util/testharness.o $(TESTUTIL) TESTS = \ arena_test \ + autocompact_test \ bloom_test \ c_test \ cache_test \ @@ -70,7 +71,7 @@ SHARED = $(SHARED1) else # Update db.h if you change these. SHARED_MAJOR = 1 -SHARED_MINOR = 12 +SHARED_MINOR = 13 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) @@ -114,6 +115,9 @@ leveldbutil: db/leveldb_main.o $(LIBOBJECTS) arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +autocompact_test: db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) $(LDFLAGS) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) diff --git a/db/autocompact_test.cc b/db/autocompact_test.cc new file mode 100644 index 0000000000000..d20a2362c30c4 --- /dev/null +++ b/db/autocompact_test.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2013 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/db.h" +#include "db/db_impl.h" +#include "leveldb/cache.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +class AutoCompactTest { + public: + std::string dbname_; + Cache* tiny_cache_; + Options options_; + DB* db_; + + AutoCompactTest() { + dbname_ = test::TmpDir() + "/autocompact_test"; + tiny_cache_ = NewLRUCache(100); + options_.block_cache = tiny_cache_; + DestroyDB(dbname_, options_); + options_.create_if_missing = true; + options_.compression = kNoCompression; + ASSERT_OK(DB::Open(options_, dbname_, &db_)); + } + + ~AutoCompactTest() { + delete db_; + DestroyDB(dbname_, Options()); + delete tiny_cache_; + } + + std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key%06d", i); + return std::string(buf); + } + + uint64_t Size(const Slice& start, const Slice& limit) { + Range r(start, limit); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + void DoReads(int n); +}; + +static const int kValueSize = 200 * 1024; +static const int kTotalSize = 100 * 1024 * 1024; +static const int kCount = kTotalSize / kValueSize; + +// Read through the first n keys repeatedly and check that they get +// compacted (verified by checking the size of the key space). +void AutoCompactTest::DoReads(int n) { + std::string value(kValueSize, 'x'); + DBImpl* dbi = reinterpret_cast(db_); + + // Fill database + for (int i = 0; i < kCount; i++) { + ASSERT_OK(db_->Put(WriteOptions(), Key(i), value)); + } + ASSERT_OK(dbi->TEST_CompactMemTable()); + + // Delete everything + for (int i = 0; i < kCount; i++) { + ASSERT_OK(db_->Delete(WriteOptions(), Key(i))); + } + ASSERT_OK(dbi->TEST_CompactMemTable()); + + // Get initial measurement of the space we will be reading. + const int64_t initial_size = Size(Key(0), Key(n)); + const int64_t initial_other_size = Size(Key(n), Key(kCount)); + + // Read until size drops significantly. + std::string limit_key = Key(n); + for (int read = 0; true; read++) { + ASSERT_LT(read, 100) << "Taking too long to compact"; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); + iter->Valid() && iter->key().ToString() < limit_key; + iter->Next()) { + // Drop data + } + delete iter; + // Wait a little bit to allow any triggered compactions to complete. + Env::Default()->SleepForMicroseconds(1000000); + uint64_t size = Size(Key(0), Key(n)); + fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n", + read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0); + if (size <= initial_size/10) { + break; + } + } + + // Verify that the size of the key space not touched by the reads + // is pretty much unchanged. + const int64_t final_other_size = Size(Key(n), Key(kCount)); + ASSERT_LE(final_other_size, initial_other_size + 1048576); + ASSERT_GE(final_other_size, initial_other_size/5 - 1048576); +} + +TEST(AutoCompactTest, ReadAll) { + DoReads(kCount); +} + +TEST(AutoCompactTest, ReadHalf) { + DoReads(kCount/2); +} + +} // namespace leveldb + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 31b2d5f4166bc..b37ffdfe645cd 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -35,6 +35,7 @@ class CorruptionTest { CorruptionTest() { tiny_cache_ = NewLRUCache(100); options_.env = &env_; + options_.block_cache = tiny_cache_; dbname_ = test::TmpDir() + "/db_test"; DestroyDB(dbname_, options_); @@ -50,17 +51,14 @@ class CorruptionTest { delete tiny_cache_; } - Status TryReopen(Options* options = NULL) { + Status TryReopen() { delete db_; db_ = NULL; - Options opt = (options ? *options : options_); - opt.env = &env_; - opt.block_cache = tiny_cache_; - return DB::Open(opt, dbname_, &db_); + return DB::Open(options_, dbname_, &db_); } - void Reopen(Options* options = NULL) { - ASSERT_OK(TryReopen(options)); + void Reopen() { + ASSERT_OK(TryReopen()); } void RepairDB() { @@ -92,6 +90,10 @@ class CorruptionTest { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { uint64_t key; Slice in(iter->key()); + if (in == "" || in == "~") { + // Ignore boundary keys. + continue; + } if (!ConsumeDecimalNumber(&in, &key) || !in.empty() || key < next_expected) { @@ -233,7 +235,7 @@ TEST(CorruptionTest, TableFile) { dbi->TEST_CompactRange(1, NULL, NULL); Corrupt(kTableFile, 100, 1); - Check(99, 99); + Check(90, 99); } TEST(CorruptionTest, TableFileIndexData) { @@ -299,7 +301,7 @@ TEST(CorruptionTest, CompactionInputError) { ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last))); Corrupt(kTableFile, 100, 1); - Check(9, 9); + Check(5, 9); // Force compactions by writing lots of values Build(10000); @@ -307,32 +309,23 @@ TEST(CorruptionTest, CompactionInputError) { } TEST(CorruptionTest, CompactionInputErrorParanoid) { - Options options; - options.paranoid_checks = true; - options.write_buffer_size = 1048576; - Reopen(&options); + options_.paranoid_checks = true; + options_.write_buffer_size = 512 << 10; + Reopen(); DBImpl* dbi = reinterpret_cast(db_); - // Fill levels >= 1 so memtable compaction outputs to level 1 - for (int level = 1; level < config::kNumLevels; level++) { - dbi->Put(WriteOptions(), "", "begin"); - dbi->Put(WriteOptions(), "~", "end"); + // Make multiple inputs so we need to compact. + for (int i = 0; i < 2; i++) { + Build(10); dbi->TEST_CompactMemTable(); + Corrupt(kTableFile, 100, 1); + env_.SleepForMicroseconds(100000); } + dbi->CompactRange(NULL, NULL); - Build(10); - dbi->TEST_CompactMemTable(); - ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); - - Corrupt(kTableFile, 100, 1); - Check(9, 9); - - // Write must eventually fail because of corrupted table - Status s; + // Write must fail because of corrupted table std::string tmp1, tmp2; - for (int i = 0; i < 10000 && s.ok(); i++) { - s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); - } + Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2)); ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 395d3172ad3a1..fa1351038bcb0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -113,14 +113,14 @@ Options SanitizeOptions(const std::string& dbname, return result; } -DBImpl::DBImpl(const Options& options, const std::string& dbname) - : env_(options.env), - internal_comparator_(options.comparator), - internal_filter_policy_(options.filter_policy), - options_(SanitizeOptions( - dbname, &internal_comparator_, &internal_filter_policy_, options)), - owns_info_log_(options_.info_log != options.info_log), - owns_cache_(options_.block_cache != options.block_cache), +DBImpl::DBImpl(const Options& raw_options, const std::string& dbname) + : env_(raw_options.env), + internal_comparator_(raw_options.comparator), + internal_filter_policy_(raw_options.filter_policy), + options_(SanitizeOptions(dbname, &internal_comparator_, + &internal_filter_policy_, raw_options)), + owns_info_log_(options_.info_log != raw_options.info_log), + owns_cache_(options_.block_cache != raw_options.block_cache), dbname_(dbname), db_lock_(NULL), shutting_down_(NULL), @@ -130,6 +130,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) logfile_(NULL), logfile_number_(0), log_(NULL), + seed_(0), tmp_batch_(new WriteBatch), bg_compaction_scheduled_(false), manual_compaction_(NULL), @@ -138,7 +139,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) has_imm_.Release_Store(NULL); // Reserve ten files or so for other uses and give the rest to TableCache. - const int table_cache_size = options.max_open_files - kNumNonTableCacheFiles; + const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles; table_cache_ = new TableCache(dbname_, &options_, table_cache_size); versions_ = new VersionSet(dbname_, &options_, table_cache_, @@ -1027,7 +1028,8 @@ static void CleanupIteratorState(void* arg1, void* arg2) { } // namespace Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, - SequenceNumber* latest_snapshot) { + SequenceNumber* latest_snapshot, + uint32_t* seed) { IterState* cleanup = new IterState; mutex_.Lock(); *latest_snapshot = versions_->LastSequence(); @@ -1051,13 +1053,15 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, cleanup->version = versions_->current(); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL); + *seed = ++seed_; mutex_.Unlock(); return internal_iter; } Iterator* DBImpl::TEST_NewInternalIterator() { SequenceNumber ignored; - return NewInternalIterator(ReadOptions(), &ignored); + uint32_t ignored_seed; + return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed); } int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { @@ -1114,12 +1118,21 @@ Status DBImpl::Get(const ReadOptions& options, Iterator* DBImpl::NewIterator(const ReadOptions& options) { SequenceNumber latest_snapshot; - Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); + uint32_t seed; + Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed); return NewDBIterator( - &dbname_, env_, user_comparator(), internal_iter, + this, user_comparator(), iter, (options.snapshot != NULL ? reinterpret_cast(options.snapshot)->number_ - : latest_snapshot)); + : latest_snapshot), + seed); +} + +void DBImpl::RecordReadSample(Slice key) { + MutexLock l(&mutex_); + if (versions_->current()->RecordReadSample(key)) { + MaybeScheduleCompaction(); + } } const Snapshot* DBImpl::GetSnapshot() { diff --git a/db/db_impl.h b/db/db_impl.h index 3c8d711ae06de..75fd30abe9a56 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -59,13 +59,19 @@ class DBImpl : public DB { // file at a level >= 1. int64_t TEST_MaxNextLevelOverlappingBytes(); + // Record a sample of bytes read at the specified internal key. + // Samples are taken approximately once every config::kReadBytesPeriod + // bytes. + void RecordReadSample(Slice key); + private: friend class DB; struct CompactionState; struct Writer; Iterator* NewInternalIterator(const ReadOptions&, - SequenceNumber* latest_snapshot); + SequenceNumber* latest_snapshot, + uint32_t* seed); Status NewDB(); @@ -135,6 +141,7 @@ class DBImpl : public DB { WritableFile* logfile_; uint64_t logfile_number_; log::Writer* log_; + uint32_t seed_; // For sampling. // Queue of writers. std::deque writers_; diff --git a/db/db_iter.cc b/db/db_iter.cc index 87dca2ded4645..071a54e3f457a 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -5,12 +5,14 @@ #include "db/db_iter.h" #include "db/filename.h" +#include "db/db_impl.h" #include "db/dbformat.h" #include "leveldb/env.h" #include "leveldb/iterator.h" #include "port/port.h" #include "util/logging.h" #include "util/mutexlock.h" +#include "util/random.h" namespace leveldb { @@ -46,15 +48,16 @@ class DBIter: public Iterator { kReverse }; - DBIter(const std::string* dbname, Env* env, - const Comparator* cmp, Iterator* iter, SequenceNumber s) - : dbname_(dbname), - env_(env), + DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s, + uint32_t seed) + : db_(db), user_comparator_(cmp), iter_(iter), sequence_(s), direction_(kForward), - valid_(false) { + valid_(false), + rnd_(seed), + bytes_counter_(RandomPeriod()) { } virtual ~DBIter() { delete iter_; @@ -100,8 +103,12 @@ class DBIter: public Iterator { } } - const std::string* const dbname_; - Env* const env_; + // Pick next gap with average value of config::kReadBytesPeriod. + ssize_t RandomPeriod() { + return rnd_.Uniform(2*config::kReadBytesPeriod); + } + + DBImpl* db_; const Comparator* const user_comparator_; Iterator* const iter_; SequenceNumber const sequence_; @@ -112,13 +119,23 @@ class DBIter: public Iterator { Direction direction_; bool valid_; + Random rnd_; + ssize_t bytes_counter_; + // No copying allowed DBIter(const DBIter&); void operator=(const DBIter&); }; inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { - if (!ParseInternalKey(iter_->key(), ikey)) { + Slice k = iter_->key(); + ssize_t n = k.size() + iter_->value().size(); + bytes_counter_ -= n; + while (bytes_counter_ < 0) { + bytes_counter_ += RandomPeriod(); + db_->RecordReadSample(k); + } + if (!ParseInternalKey(k, ikey)) { status_ = Status::Corruption("corrupted internal key in DBIter"); return false; } else { @@ -288,12 +305,12 @@ void DBIter::SeekToLast() { } // anonymous namespace Iterator* NewDBIterator( - const std::string* dbname, - Env* env, + DBImpl* db, const Comparator* user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence) { - return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence); + SequenceNumber sequence, + uint32_t seed) { + return new DBIter(db, user_key_comparator, internal_iter, sequence, seed); } } // namespace leveldb diff --git a/db/db_iter.h b/db/db_iter.h index d9e1b174ab872..04927e937badf 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -11,15 +11,17 @@ namespace leveldb { +class DBImpl; + // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified "sequence" number // into appropriate user keys. extern Iterator* NewDBIterator( - const std::string* dbname, - Env* env, + DBImpl* db, const Comparator* user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence); + SequenceNumber sequence, + uint32_t seed); } // namespace leveldb diff --git a/db/dbformat.h b/db/dbformat.h index f7f64dafb6d5b..5d8a032bd3166 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -38,6 +38,9 @@ static const int kL0_StopWritesTrigger = 12; // space if the same key space is being repeatedly overwritten. static const int kMaxMemCompactLevel = 2; +// Approximate gap in bytes between samples of data read during iteration. +static const int kReadBytesPeriod = 1048576; + } // namespace config class InternalKey; diff --git a/db/version_set.cc b/db/version_set.cc index 4fd1ddef21011..66d73be71fbbe 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -289,6 +289,51 @@ static bool NewestFirst(FileMetaData* a, FileMetaData* b) { return a->number > b->number; } +void Version::ForEachOverlapping(Slice user_key, Slice internal_key, + void* arg, + bool (*func)(void*, int, FileMetaData*)) { + // TODO(sanjay): Change Version::Get() to use this function. + const Comparator* ucmp = vset_->icmp_.user_comparator(); + + // Search level-0 in order from newest to oldest. + std::vector tmp; + tmp.reserve(files_[0].size()); + for (uint32_t i = 0; i < files_[0].size(); i++) { + FileMetaData* f = files_[0][i]; + if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 && + ucmp->Compare(user_key, f->largest.user_key()) <= 0) { + tmp.push_back(f); + } + } + if (!tmp.empty()) { + std::sort(tmp.begin(), tmp.end(), NewestFirst); + for (uint32_t i = 0; i < tmp.size(); i++) { + if (!(*func)(arg, 0, tmp[i])) { + return; + } + } + } + + // Search other levels. + for (int level = 1; level < config::kNumLevels; level++) { + size_t num_files = files_[level].size(); + if (num_files == 0) continue; + + // Binary search to find earliest index whose largest key >= internal_key. + uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key); + if (index < num_files) { + FileMetaData* f = files_[level][index]; + if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) { + // All of "f" is past any data for user_key + } else { + if (!(*func)(arg, level, f)) { + return; + } + } + } + } +} + Status Version::Get(const ReadOptions& options, const LookupKey& k, std::string* value, @@ -401,6 +446,44 @@ bool Version::UpdateStats(const GetStats& stats) { return false; } +bool Version::RecordReadSample(Slice internal_key) { + ParsedInternalKey ikey; + if (!ParseInternalKey(internal_key, &ikey)) { + return false; + } + + struct State { + GetStats stats; // Holds first matching file + int matches; + + static bool Match(void* arg, int level, FileMetaData* f) { + State* state = reinterpret_cast(arg); + state->matches++; + if (state->matches == 1) { + // Remember first match. + state->stats.seek_file = f; + state->stats.seek_file_level = level; + } + // We can stop iterating once we have a second match. + return state->matches < 2; + } + }; + + State state; + state.matches = 0; + ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match); + + // Must have at least two matches since we want to merge across + // files. But what if we have a single file that contains many + // overwrites and deletions? Should we have another mechanism for + // finding such files? + if (state.matches >= 2) { + // 1MB cost is about 1 seek (see comment in Builder::Apply). + return UpdateStats(state.stats); + } + return false; +} + void Version::Ref() { ++refs_; } @@ -435,10 +518,13 @@ int Version::PickLevelForMemTableOutput( if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } - GetOverlappingInputs(level + 2, &start, &limit, &overlaps); - const int64_t sum = TotalFileSize(overlaps); - if (sum > kMaxGrandParentOverlapBytes) { - break; + if (level + 2 < config::kNumLevels) { + // Check that file does not overlap too many grandparent bytes. + GetOverlappingInputs(level + 2, &start, &limit, &overlaps); + const int64_t sum = TotalFileSize(overlaps); + if (sum > kMaxGrandParentOverlapBytes) { + break; + } } level++; } @@ -452,6 +538,8 @@ void Version::GetOverlappingInputs( const InternalKey* begin, const InternalKey* end, std::vector* inputs) { + assert(level >= 0); + assert(level < config::kNumLevels); inputs->clear(); Slice user_begin, user_end; if (begin != NULL) { diff --git a/db/version_set.h b/db/version_set.h index 9d084fdb7d005..20de0e262914c 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -78,6 +78,12 @@ class Version { // REQUIRES: lock is held bool UpdateStats(const GetStats& stats); + // Record a sample of bytes read at the specified internal key. + // Samples are taken approximately once every config::kReadBytesPeriod + // bytes. Returns true if a new compaction may need to be triggered. + // REQUIRES: lock is held + bool RecordReadSample(Slice key); + // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); @@ -114,6 +120,15 @@ class Version { class LevelFileNumIterator; Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; + // Call func(arg, level, f) for every file that overlaps user_key in + // order from newest to oldest. If an invocation of func returns + // false, makes no more calls. + // + // REQUIRES: user portion of internal_key == user_key. + void ForEachOverlapping(Slice user_key, Slice internal_key, + void* arg, + bool (*func)(void*, int, FileMetaData*)); + VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list diff --git a/include/leveldb/db.h b/include/leveldb/db.h index da8b11a8c05b0..57c00a5da0058 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -14,7 +14,7 @@ namespace leveldb { // Update Makefile if you change these static const int kMajorVersion = 1; -static const int kMinorVersion = 12; +static const int kMinorVersion = 13; struct Options; struct ReadOptions; diff --git a/util/env_posix.cc b/util/env_posix.cc index a3f197da8cddf..3e2925d111581 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -319,8 +319,39 @@ class PosixMmapFile : public WritableFile { return Status::OK(); } - virtual Status Sync() { + Status SyncDirIfManifest() { + const char* f = filename_.c_str(); + const char* sep = strrchr(f, '/'); + Slice basename; + std::string dir; + if (sep == NULL) { + dir = "."; + basename = f; + } else { + dir = std::string(f, sep - f); + basename = sep + 1; + } Status s; + if (basename.starts_with("MANIFEST")) { + int fd = open(dir.c_str(), O_RDONLY); + if (fd < 0) { + s = IOError(dir, errno); + } else { + if (fsync(fd) < 0) { + s = IOError(dir, errno); + } + close(fd); + } + } + return s; + } + + virtual Status Sync() { + // Ensure new files referred to by the manifest are in the filesystem. + Status s = SyncDirIfManifest(); + if (!s.ok()) { + return s; + } if (pending_sync_) { // Some unmapped data was not synced diff --git a/util/random.h b/util/random.h index 07538242ea559..ddd51b1c7b517 100644 --- a/util/random.h +++ b/util/random.h @@ -16,7 +16,12 @@ class Random { private: uint32_t seed_; public: - explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } + explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { + // Avoid bad seeds. + if (seed_ == 0 || seed_ == 2147483647L) { + seed_ = 1; + } + } uint32_t Next() { static const uint32_t M = 2147483647L; // 2^31-1 static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0