diff --git a/AUTHORS b/AUTHORS index 27a9407e5..fc40194ab 100644 --- a/AUTHORS +++ b/AUTHORS @@ -6,3 +6,6 @@ Google Inc. # Initial version authors: Jeffrey Dean Sanjay Ghemawat + +# Partial list of contributors: +Kevin Regan diff --git a/Makefile b/Makefile index 42c4952fe..38b9bf772 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,7 @@ TESTS = \ env_test \ filename_test \ filter_block_test \ + issue178_test \ log_test \ memenv_test \ skiplist_test \ @@ -69,7 +70,7 @@ SHARED = $(SHARED1) else # Update db.h if you change these. SHARED_MAJOR = 1 -SHARED_MINOR = 9 +SHARED_MINOR = 12 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) @@ -146,6 +147,9 @@ filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) $(LDFLAGS) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +issue178_test: issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) diff --git a/build_detect_platform b/build_detect_platform index 609cb5122..a3ad057ee 100755 --- a/build_detect_platform +++ b/build_detect_platform @@ -94,6 +94,12 @@ case "$TARGET_OS" in PLATFORM_LIBS="-lpthread" PORT_FILE=port/port_posix.cc ;; + GNU/kFreeBSD) + PLATFORM=OS_KFREEBSD + COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_KFREEBSD" + PLATFORM_LIBS="-lpthread" + PORT_FILE=port/port_posix.cc + ;; NetBSD) PLATFORM=OS_NETBSD COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD" diff --git a/db/db_impl.cc b/db/db_impl.cc index c9de169f2..af02467b3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -35,6 +35,8 @@ namespace leveldb { +const int kNumNonTableCacheFiles = 10; + // Information kept for every waiting writer struct DBImpl::Writer { Status status; @@ -92,9 +94,9 @@ Options SanitizeOptions(const std::string& dbname, Options result = src; result.comparator = icmp; result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL; - ClipToRange(&result.max_open_files, 20, 50000); - ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); - ClipToRange(&result.block_size, 1<<10, 4<<20); + ClipToRange(&result.max_open_files, 64 + kNumNonTableCacheFiles, 50000); + ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); + ClipToRange(&result.block_size, 1<<10, 4<<20); if (result.info_log == NULL) { // Open a log file in the same directory as the db src.env->CreateDir(dbname); // In case it does not exist @@ -130,12 +132,13 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) log_(NULL), tmp_batch_(new WriteBatch), bg_compaction_scheduled_(false), - manual_compaction_(NULL) { + manual_compaction_(NULL), + consecutive_compaction_errors_(0) { mem_->Ref(); has_imm_.Release_Store(NULL); // Reserve ten files or so for other uses and give the rest to TableCache. - const int table_cache_size = options.max_open_files - 10; + const int table_cache_size = options.max_open_files - kNumNonTableCacheFiles; table_cache_ = new TableCache(dbname_, &options_, table_cache_size); versions_ = new VersionSet(dbname_, &options_, table_cache_, @@ -310,16 +313,24 @@ Status DBImpl::Recover(VersionEdit* edit) { if (!s.ok()) { return s; } + std::set expected; + versions_->AddLiveFiles(&expected); uint64_t number; FileType type; std::vector logs; for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) - && type == kLogFile - && ((number >= min_log) || (number == prev_log))) { + if (ParseFileName(filenames[i], &number, &type)) { + expected.erase(number); + if (type == kLogFile && ((number >= min_log) || (number == prev_log))) logs.push_back(number); } } + if (!expected.empty()) { + char buf[50]; + snprintf(buf, sizeof(buf), "%d missing files; e.g.", + static_cast(expected.size())); + return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin()))); + } // Recover in the order in which the logs were generated std::sort(logs.begin(), logs.end()); @@ -611,6 +622,7 @@ void DBImpl::BackgroundCall() { Status s = BackgroundCompaction(); if (s.ok()) { // Success + consecutive_compaction_errors_ = 0; } else if (shutting_down_.Acquire_Load()) { // Error most likely due to shutdown; do not wait } else { @@ -622,7 +634,12 @@ void DBImpl::BackgroundCall() { Log(options_.info_log, "Waiting after background compaction error: %s", s.ToString().c_str()); mutex_.Unlock(); - env_->SleepForMicroseconds(1000000); + ++consecutive_compaction_errors_; + int seconds_to_sleep = 1; + for (int i = 0; i < 3 && i < consecutive_compaction_errors_ - 1; ++i) { + seconds_to_sleep *= 2; + } + env_->SleepForMicroseconds(seconds_to_sleep * 1000000); mutex_.Lock(); } } @@ -805,6 +822,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, (unsigned long long) output_number, (unsigned long long) current_entries, (unsigned long long) current_bytes); + + // rate-limit compaction file creation with a 100ms pause + env_->SleepForMicroseconds(100000); } } return s; @@ -1268,10 +1288,11 @@ Status DBImpl::MakeRoomForWrite(bool force) { } else if (imm_ != NULL) { // We have filled up the current memtable, but the previous // one is still being compacted, so we wait. + Log(options_.info_log, "Current memtable full; waiting...\n"); bg_cv_.Wait(); } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) { // There are too many level-0 files. - Log(options_.info_log, "waiting...\n"); + Log(options_.info_log, "Too many L0 files; waiting...\n"); bg_cv_.Wait(); } else { // Attempt to switch to a new memtable and trigger compaction of old diff --git a/db/db_impl.h b/db/db_impl.h index bd29dd805..3c8d711ae 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -163,6 +163,7 @@ class DBImpl : public DB { // Have we encountered a background error in paranoid mode? Status bg_error_; + int consecutive_compaction_errors_; // Per level compaction stats. stats_[level] stores the stats for // compactions that produced data for the specified "level". diff --git a/db/db_test.cc b/db/db_test.cc index 684ea3bdb..49aae04db 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -33,8 +33,11 @@ class AtomicCounter { public: AtomicCounter() : count_(0) { } void Increment() { + IncrementBy(1); + } + void IncrementBy(int count) { MutexLock l(&mu_); - count_++; + count_ += count; } int Read() { MutexLock l(&mu_); @@ -45,6 +48,10 @@ class AtomicCounter { count_ = 0; } }; + +void DelayMilliseconds(int millis) { + Env::Default()->SleepForMicroseconds(millis * 1000); +} } // Special Env used to delay background operations @@ -69,6 +76,7 @@ class SpecialEnv : public EnvWrapper { AtomicCounter random_read_counter_; AtomicCounter sleep_counter_; + AtomicCounter sleep_time_counter_; explicit SpecialEnv(Env* base) : EnvWrapper(base) { delay_sstable_sync_.Release_Store(NULL); @@ -103,7 +111,7 @@ class SpecialEnv : public EnvWrapper { Status Flush() { return base_->Flush(); } Status Sync() { while (env_->delay_sstable_sync_.Acquire_Load() != NULL) { - env_->SleepForMicroseconds(100000); + DelayMilliseconds(100); } return base_->Sync(); } @@ -174,8 +182,9 @@ class SpecialEnv : public EnvWrapper { virtual void SleepForMicroseconds(int micros) { sleep_counter_.Increment(); - target()->SleepForMicroseconds(micros); + sleep_time_counter_.IncrementBy(micros); } + }; class DBTest { @@ -461,6 +470,20 @@ class DBTest { } return result; } + + bool DeleteAnSSTFile() { + std::vector filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) { + ASSERT_OK(env_->DeleteFile(TableFileName(dbname_, number))); + return true; + } + } + return false; + } }; TEST(DBTest, Empty) { @@ -611,7 +634,7 @@ TEST(DBTest, GetEncountersEmptyLevel) { } // Step 4: Wait for compaction to finish - env_->SleepForMicroseconds(1000000); + DelayMilliseconds(1000); ASSERT_EQ(NumTableFilesAtLevel(0), 0); } while (ChangeOptions()); @@ -1295,7 +1318,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) { Reopen(); Reopen(); ASSERT_EQ("(a->v)", Contents()); - env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + DelayMilliseconds(1000); // Wait for compaction to finish ASSERT_EQ("(a->v)", Contents()); } @@ -1311,7 +1334,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) { Put("",""); Reopen(); Put("",""); - env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + DelayMilliseconds(1000); // Wait for compaction to finish Reopen(); Put("d","dv"); Reopen(); @@ -1321,7 +1344,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) { Delete("b"); Reopen(); ASSERT_EQ("(->)(c->cv)", Contents()); - env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + DelayMilliseconds(1000); // Wait for compaction to finish ASSERT_EQ("(->)(c->cv)", Contents()); } @@ -1506,6 +1529,30 @@ TEST(DBTest, NoSpace) { ASSERT_GE(env_->sleep_counter_.Read(), 5); } +TEST(DBTest, ExponentialBackoff) { + Options options = CurrentOptions(); + options.env = env_; + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + Compact("a", "z"); + env_->non_writable_.Release_Store(env_); // Force errors for new files + env_->sleep_counter_.Reset(); + env_->sleep_time_counter_.Reset(); + for (int i = 0; i < 5; i++) { + dbfull()->TEST_CompactRange(2, NULL, NULL); + } + env_->non_writable_.Release_Store(NULL); + + // Wait for compaction to finish + DelayMilliseconds(1000); + + ASSERT_GE(env_->sleep_counter_.Read(), 5); + ASSERT_LT(env_->sleep_counter_.Read(), 10); + ASSERT_GE(env_->sleep_time_counter_.Read(), 10e6); +} + TEST(DBTest, NonWritableFileSystem) { Options options = CurrentOptions(); options.write_buffer_size = 1000; @@ -1519,7 +1566,7 @@ TEST(DBTest, NonWritableFileSystem) { fprintf(stderr, "iter %d; errors %d\n", i, errors); if (!Put("foo", big).ok()) { errors++; - env_->SleepForMicroseconds(100000); + DelayMilliseconds(100); } } ASSERT_GT(errors, 0); @@ -1567,6 +1614,24 @@ TEST(DBTest, ManifestWriteError) { } } +TEST(DBTest, MissingSSTFile) { + ASSERT_OK(Put("foo", "bar")); + ASSERT_EQ("bar", Get("foo")); + + // Dump the memtable to disk. + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("bar", Get("foo")); + + Close(); + ASSERT_TRUE(DeleteAnSSTFile()); + Options options = CurrentOptions(); + options.paranoid_checks = true; + Status s = TryReopen(&options); + ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ToString().find("issing") != std::string::npos) + << s.ToString(); +} + TEST(DBTest, FilesDeletedAfterCompaction) { ASSERT_OK(Put("foo", "v2")); Compact("a", "z"); @@ -1711,13 +1776,13 @@ TEST(DBTest, MultiThreaded) { } // Let them run for a while - env_->SleepForMicroseconds(kTestSeconds * 1000000); + DelayMilliseconds(kTestSeconds * 1000); // Stop the threads and wait for them to finish mt.stop.Release_Store(&mt); for (int id = 0; id < kNumThreads; id++) { while (mt.thread_done[id].Acquire_Load() == NULL) { - env_->SleepForMicroseconds(100000); + DelayMilliseconds(100); } } } while (ChangeOptions()); diff --git a/db/dbformat.cc b/db/dbformat.cc index 28e11b398..20a7ca446 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -26,7 +26,7 @@ std::string ParsedInternalKey::DebugString() const { (unsigned long long) sequence, int(type)); std::string result = "'"; - result += user_key.ToString(); + result += EscapeString(user_key.ToString()); result += buf; return result; } diff --git a/db/filename_test.cc b/db/filename_test.cc index 47353d6c9..5a26da472 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -70,7 +70,7 @@ TEST(FileNameTest, Parse) { for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { std::string f = errors[i]; ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; - }; + } } TEST(FileNameTest, Construction) { diff --git a/db/version_set.cc b/db/version_set.cc index 7d0a5de2b..4fd1ddef2 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1331,14 +1331,19 @@ Compaction* VersionSet::CompactRange( } // Avoid compacting too much in one shot in case the range is large. - const uint64_t limit = MaxFileSizeForLevel(level); - uint64_t total = 0; - for (size_t i = 0; i < inputs.size(); i++) { - uint64_t s = inputs[i]->file_size; - total += s; - if (total >= limit) { - inputs.resize(i + 1); - break; + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. + if (level > 0) { + const uint64_t limit = MaxFileSizeForLevel(level); + uint64_t total = 0; + for (size_t i = 0; i < inputs.size(); i++) { + uint64_t s = inputs[i]->file_size; + total += s; + if (total >= limit) { + inputs.resize(i + 1); + break; + } } } diff --git a/include/leveldb/db.h b/include/leveldb/db.h index 29d367447..da8b11a8c 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -14,7 +14,7 @@ namespace leveldb { // Update Makefile if you change these static const int kMajorVersion = 1; -static const int kMinorVersion = 9; +static const int kMinorVersion = 12; struct Options; struct ReadOptions; diff --git a/port/port_win.cc b/port/port_win.cc index 99c1d8e34..1b0f060a1 100644 --- a/port/port_win.cc +++ b/port/port_win.cc @@ -109,12 +109,10 @@ void CondVar::Signal() { void CondVar::SignalAll() { wait_mtx_.Lock(); - for(long i = 0; i < waiting_; ++i) { - ::ReleaseSemaphore(sem1_, 1, NULL); - while(waiting_ > 0) { - --waiting_; - ::WaitForSingleObject(sem2_, INFINITE); - } + ::ReleaseSemaphore(sem1_, waiting_, NULL); + while(waiting_ > 0) { + --waiting_; + ::WaitForSingleObject(sem2_, INFINITE); } wait_mtx_.Unlock(); } diff --git a/table/block.cc b/table/block.cc index ab83c1112..79ea9d9ee 100644 --- a/table/block.cc +++ b/table/block.cc @@ -16,7 +16,7 @@ namespace leveldb { inline uint32_t Block::NumRestarts() const { - assert(size_ >= 2*sizeof(uint32_t)); + assert(size_ >= sizeof(uint32_t)); return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } @@ -27,11 +27,12 @@ Block::Block(const BlockContents& contents) if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { - restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); - if (restart_offset_ > size_ - sizeof(uint32_t)) { - // The size is too small for NumRestarts() and therefore - // restart_offset_ wrapped around. + size_t max_restarts_allowed = (size_-sizeof(uint32_t)) / sizeof(uint32_t); + if (NumRestarts() > max_restarts_allowed) { + // The size is too small for NumRestarts() size_ = 0; + } else { + restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); } } } @@ -253,7 +254,7 @@ class Block::Iter : public Iterator { }; Iterator* Block::NewIterator(const Comparator* cmp) { - if (size_ < 2*sizeof(uint32_t)) { + if (size_ < sizeof(uint32_t)) { return NewErrorIterator(Status::Corruption("bad block contents")); } const uint32_t num_restarts = NumRestarts(); diff --git a/table/table.cc b/table/table.cc index dbd6d3a1b..71c1756e5 100644 --- a/table/table.cc +++ b/table/table.cc @@ -228,7 +228,6 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k, !filter->KeyMayMatch(handle.offset(), k)) { // Not found } else { - Slice handle = iiter->value(); Iterator* block_iter = BlockReader(this, options, iiter->value()); block_iter->Seek(k); if (block_iter->Valid()) { diff --git a/table/table_test.cc b/table/table_test.cc index 57cea2533..c723bf84c 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -644,6 +644,36 @@ class Harness { Constructor* constructor_; }; +// Test empty table/block. +TEST(Harness, Empty) { + for (int i = 0; i < kNumTestArgs; i++) { + Init(kTestArgList[i]); + Random rnd(test::RandomSeed() + 1); + Test(&rnd); + } +} + +// Special test for a block with no restart entries. The C++ leveldb +// code never generates such blocks, but the Java version of leveldb +// seems to. +TEST(Harness, ZeroRestartPointsInBlock) { + char data[sizeof(uint32_t)]; + memset(data, 0, sizeof(data)); + BlockContents contents; + contents.data = Slice(data, sizeof(data)); + contents.cachable = false; + contents.heap_allocated = false; + Block block(contents); + Iterator* iter = block.NewIterator(BytewiseComparator()); + iter->SeekToFirst(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToLast(); + ASSERT_TRUE(!iter->Valid()); + iter->Seek("foo"); + ASSERT_TRUE(!iter->Valid()); + delete iter; +} + // Test the empty key TEST(Harness, SimpleEmptyKey) { for (int i = 0; i < kNumTestArgs; i++) { diff --git a/util/cache.cc b/util/cache.cc index 24f1f63f4..8b197bc02 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -116,7 +116,6 @@ class HandleTable { LRUHandle* h = list_[i]; while (h != NULL) { LRUHandle* next = h->next_hash; - Slice key = h->key(); uint32_t hash = h->hash; LRUHandle** ptr = &new_list[hash & (new_length - 1)]; h->next_hash = *ptr; @@ -160,7 +159,6 @@ class LRUCache { // mutex_ protects the following state. port::Mutex mutex_; size_t usage_; - uint64_t last_id_; // Dummy head of LRU list. // lru.prev is newest entry, lru.next is oldest entry. @@ -170,8 +168,7 @@ class LRUCache { }; LRUCache::LRUCache() - : usage_(0), - last_id_(0) { + : usage_(0) { // Make empty circular linked list lru_.next = &lru_; lru_.prev = &lru_; diff --git a/util/coding_test.cc b/util/coding_test.cc index 2c52b17b6..fb5726e33 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -109,7 +109,7 @@ TEST(Coding, Varint64) { values.push_back(power); values.push_back(power-1); values.push_back(power+1); - }; + } std::string s; for (int i = 0; i < values.size(); i++) { diff --git a/util/comparator.cc b/util/comparator.cc index 4b7b5724e..6cc319242 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -66,7 +66,7 @@ class BytewiseComparatorImpl : public Comparator { }; } // namespace -static port::OnceType once = LEVELDB_ONCE_INIT; +static port::OnceType once_comparator = LEVELDB_ONCE_INIT; static const Comparator* bytewise; static void InitModule() { @@ -74,7 +74,7 @@ static void InitModule() { } const Comparator* BytewiseComparator() { - port::InitOnce(&once, InitModule); + port::InitOnce(&once_comparator, InitModule); return bytewise; } diff --git a/util/env_posix.cc b/util/env_posix.cc index db81f56d1..6badfdc23 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -386,7 +386,7 @@ class PosixEnv : public Env { PosixEnv(); virtual ~PosixEnv() { fprintf(stderr, "Destroying Env::Default()\n"); - exit(1); + abort(); } virtual Status NewSequentialFile(const std::string& fname, @@ -467,7 +467,7 @@ class PosixEnv : public Env { result = IOError(fname, errno); } return result; - }; + } virtual Status CreateDir(const std::string& name) { Status result; @@ -475,7 +475,7 @@ class PosixEnv : public Env { result = IOError(name, errno); } return result; - }; + } virtual Status DeleteDir(const std::string& name) { Status result; @@ -483,7 +483,7 @@ class PosixEnv : public Env { result = IOError(name, errno); } return result; - }; + } virtual Status GetFileSize(const std::string& fname, uint64_t* size) { Status s; @@ -589,7 +589,7 @@ class PosixEnv : public Env { void PthreadCall(const char* label, int result) { if (result != 0) { fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - exit(1); + abort(); } } diff --git a/util/hash.cc b/util/hash.cc index ba1818082..07cf02206 100644 --- a/util/hash.cc +++ b/util/hash.cc @@ -6,6 +6,13 @@ #include "util/coding.h" #include "util/hash.h" +// The FALLTHROUGH_INTENDED macro can be used to annotate implicit fall-through +// between switch labels. The real definition should be provided externally. +// This one is a fallback version for unsupported compilers. +#ifndef FALLTHROUGH_INTENDED +#define FALLTHROUGH_INTENDED do { } while (0) +#endif + namespace leveldb { uint32_t Hash(const char* data, size_t n, uint32_t seed) { @@ -28,10 +35,10 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) { switch (limit - data) { case 3: h += data[2] << 16; - // fall through + FALLTHROUGH_INTENDED; case 2: h += data[1] << 8; - // fall through + FALLTHROUGH_INTENDED; case 1: h += data[0]; h *= m;