From 0a0617359d31a70c2137ef5e27f82ee5bb43f6cd Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Wed, 8 Jan 2020 17:41:28 +0530 Subject: [PATCH 1/8] Fix checkOverlap in compaction (#1166) The overlap check in compaction would keep additional keys in case the levels under compaction had overlap amongst themselves. This commit fixes it. This commit also fixes the `numVersionsToKeep` check. Without this commit, we would end up keeping `2` versions of a key even when the number of versions to keep was set to `1`. See `level 0 to level 1 with lower overlap` test. Fixes https://github.com/dgraph-io/badger/issues/1053 The following test fails on master but works with this commit ```go func TestCompaction(t *testing.T) { t.Run("level 0 to level 1", func(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) // Disable compactions and keep single version of each key. opt := DefaultOptions(dir).WithNumCompactors(0).WithNumVersionsToKeep(1) db, err := OpenManaged(opt) require.NoError(t, err) l0 := []keyValVersion{{"foo", "bar", 3}, {"fooz", "baz", 1}} l01 := []keyValVersion{{"foo", "bar", 2}} l1 := []keyValVersion{{"foo", "bar", 1}} // Level 0 has table l0 and l01. createAndOpen(db, l0, 0) createAndOpen(db, l01, 0) // Level 1 has table l1. createAndOpen(db, l1, 1) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3}, {"foo", "bar", 2}, {"foo", "bar", 1}, {"fooz", "baz", 1}, }) cdef := compactDef{ thisLevel: db.lc.levels[0], nextLevel: db.lc.levels[1], top: db.lc.levels[0].tables, bot: db.lc.levels[1].tables, } require.NoError(t, db.lc.runCompactDef(0, cdef)) // foo version 2 should be dropped after compaction. getAllAndCheck(t, db, []keyValVersion{{"foo", "bar", 3}, {"fooz", "baz", 1}}) }) } ``` --- levels.go | 54 +++--- levels_test.go | 441 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 474 insertions(+), 21 deletions(-) create mode 100644 levels_test.go diff --git a/levels.go b/levels.go index 0a4b92f26..39a68b4a4 100644 --- a/levels.go +++ b/levels.go @@ -430,28 +430,35 @@ func (s *levelsController) pickCompactLevels() (prios []compactionPriority) { return prios } -// compactBuildTables merge topTables and botTables to form a list of new tables. +// checkOverlap checks if the given tables overlap with any level from the given "lev" onwards. +func (s *levelsController) checkOverlap(tables []*table.Table, lev int) bool { + kr := getKeyRange(tables...) + for i, lh := range s.levels { + if i < lev { // Skip upper levels. + continue + } + lh.RLock() + left, right := lh.overlappingTables(levelHandlerRLocked{}, kr) + lh.RUnlock() + if right-left > 0 { + return true + } + } + return false +} + +// compactBuildTables merges topTables and botTables to form a list of new tables. func (s *levelsController) compactBuildTables( lev int, cd compactDef) ([]*table.Table, func() error, error) { topTables := cd.top botTables := cd.bot - var hasOverlap bool - { - kr := getKeyRange(cd.top...) - for i, lh := range s.levels { - if i <= lev { // Skip upper levels. - continue - } - lh.RLock() - left, right := lh.overlappingTables(levelHandlerRLocked{}, kr) - lh.RUnlock() - if right-left > 0 { - hasOverlap = true - break - } - } - } + // Check overlap of the top level with the levels which are not being + // compacted in this compaction. 
We don't need to check overlap of the bottom + // tables with other levels because if the top tables overlap with any of the lower + // levels, it implies bottom level also overlaps because top and bottom tables + // overlap with each other. + hasOverlap := s.checkOverlap(cd.top, cd.nextLevel.level+1) // Try to collect stats so that we can inform value log about GC. That would help us find which // value log file should be GCed. @@ -561,10 +568,15 @@ func (s *levelsController) compactBuildTables( // versions which are below the minReadTs, otherwise, we might end up discarding the // only valid version for a running transaction. numVersions++ - lastValidVersion := vs.Meta&bitDiscardEarlierVersions > 0 - if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) || - numVersions > s.kv.opt.NumVersionsToKeep || - lastValidVersion { + + // Keep the current version and discard all the next versions if + // - The `discardEarlierVersions` bit is set OR + // - We've already processed `NumVersionsToKeep` number of versions + // (including the current item being processed) + lastValidVersion := vs.Meta&bitDiscardEarlierVersions > 0 || + numVersions == s.kv.opt.NumVersionsToKeep + + if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) || lastValidVersion { // If this version of the key is deleted or expired, skip all the rest of the // versions. Ensure that we're only removing versions below readTs. skipKey = y.SafeCopy(skipKey, it.Key()) diff --git a/levels_test.go b/levels_test.go new file mode 100644 index 000000000..51e7baf72 --- /dev/null +++ b/levels_test.go @@ -0,0 +1,441 @@ +/* + * Copyright 2019 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "math" + "testing" + + "github.com/dgraph-io/badger/v2/options" + "github.com/dgraph-io/badger/v2/pb" + "github.com/dgraph-io/badger/v2/table" + "github.com/dgraph-io/badger/v2/y" + "github.com/stretchr/testify/require" +) + +// createAndOpen creates a table with the given data and adds it to the given level. +func createAndOpen(db *DB, td []keyValVersion, level int) { + opts := table.Options{ + BlockSize: db.opt.BlockSize, + BloomFalsePositive: db.opt.BloomFalsePositive, + LoadingMode: options.LoadToRAM, + ChkMode: options.NoVerification, + } + b := table.NewTableBuilder(opts) + + // Add all keys and versions to the table. + for _, item := range td { + key := y.KeyWithTs([]byte(item.key), uint64(item.version)) + val := y.ValueStruct{Value: []byte(item.val), Meta: item.meta} + b.Add(key, val, 0) + } + fd, err := y.CreateSyncedFile(table.NewFilename(db.lc.reserveFileID(), db.opt.Dir), true) + if err != nil { + panic(err) + } + + if _, err = fd.Write(b.Finish()); err != nil { + panic(err) + } + tab, err := table.OpenTable(fd, opts) + if err != nil { + panic(err) + } + if err := db.manifest.addChanges([]*pb.ManifestChange{ + newCreateChange(tab.ID(), level, 0, tab.CompressionType()), + }); err != nil { + panic(err) + } + // Add table to the given level. 
+ db.lc.levels[level].tables = append(db.lc.levels[level].tables, tab) +} + +type keyValVersion struct { + key string + val string + version int + meta byte +} + +func TestCheckOverlap(t *testing.T) { + t.Run("overlap", func(t *testing.T) { + // This test consists of one table on level 0 and one on level 1. + // There is an overlap amongst the tables but there is no overlap + // with rest of the levels. + t.Run("same keys", func(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + l0 := []keyValVersion{{"foo", "bar", 3, 0}} + l1 := []keyValVersion{{"foo", "bar", 2, 0}} + createAndOpen(db, l0, 0) + createAndOpen(db, l1, 1) + + // Level 0 should overlap with level 0 tables. + require.True(t, db.lc.checkOverlap(db.lc.levels[0].tables, 0)) + // Level 1 should overlap with level 0 tables (they have the same keys). + require.True(t, db.lc.checkOverlap(db.lc.levels[0].tables, 1)) + // Level 2 and 3 should not overlap with level 0 tables. + require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 2)) + require.False(t, db.lc.checkOverlap(db.lc.levels[1].tables, 2)) + require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 3)) + require.False(t, db.lc.checkOverlap(db.lc.levels[1].tables, 3)) + + }) + }) + t.Run("overlapping keys", func(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + l0 := []keyValVersion{{"a", "x", 1, 0}, {"b", "x", 1, 0}, {"foo", "bar", 3, 0}} + l1 := []keyValVersion{{"foo", "bar", 2, 0}} + createAndOpen(db, l0, 0) + createAndOpen(db, l1, 1) + + // Level 0 should overlap with level 0 tables. + require.True(t, db.lc.checkOverlap(db.lc.levels[0].tables, 0)) + require.True(t, db.lc.checkOverlap(db.lc.levels[1].tables, 1)) + // Level 1 should overlap with level 0 tables, "foo" key is common. + require.True(t, db.lc.checkOverlap(db.lc.levels[0].tables, 1)) + // Level 2 and 3 should not overlap with level 0 tables. + require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 2)) + require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 3)) + }) + }) + }) + t.Run("non-overlapping", func(t *testing.T) { + runBadgerTest(t, nil, func(t *testing.T, db *DB) { + l0 := []keyValVersion{{"a", "x", 1, 0}, {"b", "x", 1, 0}, {"c", "bar", 3, 0}} + l1 := []keyValVersion{{"foo", "bar", 2, 0}} + createAndOpen(db, l0, 0) + createAndOpen(db, l1, 1) + + // Level 1 should not overlap with level 0 tables + require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 1)) + // Level 2 and 3 should not overlap with level 0 tables. 
+ require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 2)) + require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 3)) + }) + }) +} + +func getAllAndCheck(t *testing.T, db *DB, expected []keyValVersion) { + db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.AllVersions = true + opt.InternalAccess = true + it := txn.NewIterator(opt) + defer it.Close() + i := 0 + for it.Rewind(); it.Valid(); it.Next() { + require.Less(t, i, len(expected), "DB has more number of key than expected") + item := it.Item() + v, err := item.ValueCopy(nil) + require.NoError(t, err) + // fmt.Printf("k: %s v: %d val: %s\n", item.key, item.Version(), v) + expect := expected[i] + require.Equal(t, expect.key, string(item.Key()), "expected key: %s actual key: %s", + expect.key, item.Key()) + require.Equal(t, expect.val, string(v), "key: %s expected value: %s actual %s", + item.key, expect.val, v) + require.Equal(t, expect.version, int(item.Version()), + "key: %s expected version: %d actual %d", item.key, expect.version, item.Version()) + require.Equal(t, expect.meta, item.meta, + "key: %s expected meta: %d meta %d", item.key, expect.meta, item.meta) + i++ + } + require.Equal(t, len(expected), i, "keys examined should be equal to keys expected") + return nil + }) + +} + +func TestCompaction(t *testing.T) { + // Disable compactions and keep single version of each key. + opt := DefaultOptions("").WithNumCompactors(0).WithNumVersionsToKeep(1) + opt.managedTxns = true + t.Run("level 0 to level 1", func(t *testing.T) { + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + l0 := []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, 0}} + l01 := []keyValVersion{{"foo", "bar", 2, 0}} + l1 := []keyValVersion{{"foo", "bar", 1, 0}} + // Level 0 has table l0 and l01. + createAndOpen(db, l0, 0) + createAndOpen(db, l01, 0) + // Level 1 has table l1. + createAndOpen(db, l1, 1) + + // Set a high discard timestamp so that all the keys are below the discard timestamp. + db.SetDiscardTs(10) + + getAllAndCheck(t, db, []keyValVersion{ + {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, + {"foo", "bar", 1, 0}, {"fooz", "baz", 1, 0}, + }) + cdef := compactDef{ + thisLevel: db.lc.levels[0], + nextLevel: db.lc.levels[1], + top: db.lc.levels[0].tables, + bot: db.lc.levels[1].tables, + } + require.NoError(t, db.lc.runCompactDef(0, cdef)) + // foo version 2 should be dropped after compaction. + getAllAndCheck(t, db, []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, 0}}) + }) + }) + + t.Run("level 0 to level 1 with lower overlap", func(t *testing.T) { + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + l0 := []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, 0}} + l01 := []keyValVersion{{"foo", "bar", 2, 0}} + l1 := []keyValVersion{{"foo", "bar", 1, 0}} + l2 := []keyValVersion{{"foo", "bar", 0, 0}} + // Level 0 has table l0 and l01. + createAndOpen(db, l0, 0) + createAndOpen(db, l01, 0) + // Level 1 has table l1. + createAndOpen(db, l1, 1) + // Level 2 has table l2. + createAndOpen(db, l2, 2) + + // Set a high discard timestamp so that all the keys are below the discard timestamp. 
+ db.SetDiscardTs(10) + + getAllAndCheck(t, db, []keyValVersion{ + {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"foo", "bar", 1, 0}, + {"foo", "bar", 0, 0}, {"fooz", "baz", 1, 0}, + }) + cdef := compactDef{ + thisLevel: db.lc.levels[0], + nextLevel: db.lc.levels[1], + top: db.lc.levels[0].tables, + bot: db.lc.levels[1].tables, + } + require.NoError(t, db.lc.runCompactDef(0, cdef)) + // foo version 2 and version 1 should be dropped after compaction. + getAllAndCheck(t, db, []keyValVersion{ + {"foo", "bar", 3, 0}, {"foo", "bar", 0, 0}, {"fooz", "baz", 1, 0}, + }) + }) + }) + + t.Run("level 1 to level 2", func(t *testing.T) { + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + l1 := []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, 0}} + l2 := []keyValVersion{{"foo", "bar", 2, 0}} + createAndOpen(db, l1, 1) + createAndOpen(db, l2, 2) + + // Set a high discard timestamp so that all the keys are below the discard timestamp. + db.SetDiscardTs(10) + + getAllAndCheck(t, db, []keyValVersion{ + {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"fooz", "baz", 1, 0}, + }) + cdef := compactDef{ + thisLevel: db.lc.levels[1], + nextLevel: db.lc.levels[2], + top: db.lc.levels[1].tables, + bot: db.lc.levels[2].tables, + } + require.NoError(t, db.lc.runCompactDef(1, cdef)) + // foo version 2 should be dropped after compaction. + getAllAndCheck(t, db, []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, 0}}) + }) + }) +} + +func TestHeadKeyCleanup(t *testing.T) { + // Disable compactions and keep single version of each key. + opt := DefaultOptions("").WithNumCompactors(0).WithNumVersionsToKeep(1) + opt.managedTxns = true + + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + l0 := []keyValVersion{ + {string(head), "foo", 5, 0}, {string(head), "bar", 4, 0}, {string(head), "baz", 3, 0}, + } + l1 := []keyValVersion{{string(head), "fooz", 2, 0}, {string(head), "foozbaz", 1, 0}} + // Level 0 has table l0 and l01. + createAndOpen(db, l0, 0) + // Level 1 has table l1. + createAndOpen(db, l1, 1) + + // Set a high discard timestamp so that all the keys are below the discard timestamp. + db.SetDiscardTs(10) + + getAllAndCheck(t, db, []keyValVersion{ + {string(head), "foo", 5, 0}, {string(head), "bar", 4, 0}, {string(head), "baz", 3, 0}, + {string(head), "fooz", 2, 0}, {string(head), "foozbaz", 1, 0}, + }) + cdef := compactDef{ + thisLevel: db.lc.levels[0], + nextLevel: db.lc.levels[1], + top: db.lc.levels[0].tables, + bot: db.lc.levels[1].tables, + } + require.NoError(t, db.lc.runCompactDef(0, cdef)) + // foo version 2 should be dropped after compaction. + getAllAndCheck(t, db, []keyValVersion{{string(head), "foo", 5, 0}}) + }) +} + +func TestDiscardTs(t *testing.T) { + // Disable compactions and keep single version of each key. + opt := DefaultOptions("").WithNumCompactors(0).WithNumVersionsToKeep(1) + opt.managedTxns = true + + t.Run("all keys above discardTs", func(t *testing.T) { + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + l0 := []keyValVersion{{"foo", "bar", 4, 0}, {"fooz", "baz", 3, 0}} + l01 := []keyValVersion{{"foo", "bar", 3, 0}} + l1 := []keyValVersion{{"foo", "bar", 2, 0}} + // Level 0 has table l0 and l01. + createAndOpen(db, l0, 0) + createAndOpen(db, l01, 0) + // Level 1 has table l1. + createAndOpen(db, l1, 1) + + // Set dicardTs to 1. All the keys are above discardTs. 
+ db.SetDiscardTs(1) + + getAllAndCheck(t, db, []keyValVersion{ + {"foo", "bar", 4, 0}, {"foo", "bar", 3, 0}, + {"foo", "bar", 2, 0}, {"fooz", "baz", 3, 0}, + }) + cdef := compactDef{ + thisLevel: db.lc.levels[0], + nextLevel: db.lc.levels[1], + top: db.lc.levels[0].tables, + bot: db.lc.levels[1].tables, + } + require.NoError(t, db.lc.runCompactDef(0, cdef)) + // No keys should be dropped. + getAllAndCheck(t, db, []keyValVersion{ + {"foo", "bar", 4, 0}, {"foo", "bar", 3, 0}, + {"foo", "bar", 2, 0}, {"fooz", "baz", 3, 0}, + }) + }) + }) + t.Run("some keys above discardTs", func(t *testing.T) { + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + l0 := []keyValVersion{ + {"foo", "bar", 4, 0}, {"foo", "bar", 3, 0}, + {"foo", "bar", 2, 0}, {"fooz", "baz", 2, 0}, + } + l1 := []keyValVersion{{"foo", "bbb", 1, 0}} + createAndOpen(db, l0, 0) + createAndOpen(db, l1, 1) + + // Set dicardTs to 3. foo2 and foo1 should be dropped. + db.SetDiscardTs(3) + + getAllAndCheck(t, db, []keyValVersion{ + {"foo", "bar", 4, 0}, {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, + {"foo", "bbb", 1, 0}, {"fooz", "baz", 2, 0}, + }) + cdef := compactDef{ + thisLevel: db.lc.levels[0], + nextLevel: db.lc.levels[1], + top: db.lc.levels[0].tables, + bot: db.lc.levels[1].tables, + } + require.NoError(t, db.lc.runCompactDef(0, cdef)) + // foo1 and foo2 should be dropped. + getAllAndCheck(t, db, []keyValVersion{ + {"foo", "bar", 4, 0}, {"foo", "bar", 3, 0}, {"fooz", "baz", 2, 0}, + }) + }) + }) + t.Run("all keys below discardTs", func(t *testing.T) { + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + l0 := []keyValVersion{{"foo", "bar", 4, 0}, {"fooz", "baz", 3, 0}} + l01 := []keyValVersion{{"foo", "bar", 3, 0}} + l1 := []keyValVersion{{"foo", "bar", 2, 0}} + // Level 0 has table l0 and l01. + createAndOpen(db, l0, 0) + createAndOpen(db, l01, 0) + // Level 1 has table l1. + createAndOpen(db, l1, 1) + + // Set dicardTs to 10. All the keys are below discardTs. + db.SetDiscardTs(10) + + getAllAndCheck(t, db, []keyValVersion{ + {"foo", "bar", 4, 0}, {"foo", "bar", 3, 0}, + {"foo", "bar", 2, 0}, {"fooz", "baz", 3, 0}, + }) + cdef := compactDef{ + thisLevel: db.lc.levels[0], + nextLevel: db.lc.levels[1], + top: db.lc.levels[0].tables, + bot: db.lc.levels[1].tables, + } + require.NoError(t, db.lc.runCompactDef(0, cdef)) + // Only one version of every key should be left. + getAllAndCheck(t, db, []keyValVersion{{"foo", "bar", 4, 0}, {"fooz", "baz", 3, 0}}) + }) + }) +} + +// This is a test to ensure that the first entry with DiscardEarlierversion bit < DiscardTs +// is kept around (when numversionstokeep is infinite). +func TestDiscardFirstVersion(t *testing.T) { + opt := DefaultOptions("") + opt.NumCompactors = 0 + opt.NumVersionsToKeep = math.MaxUint32 + opt.managedTxns = true + + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + l0 := []keyValVersion{{"foo", "bar", 1, 0}} + l01 := []keyValVersion{{"foo", "bar", 2, bitDiscardEarlierVersions}} + l02 := []keyValVersion{{"foo", "bar", 3, 0}} + l03 := []keyValVersion{{"foo", "bar", 4, 0}} + l04 := []keyValVersion{{"foo", "bar", 9, 0}} + l05 := []keyValVersion{{"foo", "bar", 10, bitDiscardEarlierVersions}} + + // Level 0 has all the tables. + createAndOpen(db, l0, 0) + createAndOpen(db, l01, 0) + createAndOpen(db, l02, 0) + createAndOpen(db, l03, 0) + createAndOpen(db, l04, 0) + createAndOpen(db, l05, 0) + + // Discard Time stamp is set to 7. 
+ db.SetDiscardTs(7) + + // Compact L0 to L1 + cdef := compactDef{ + thisLevel: db.lc.levels[0], + nextLevel: db.lc.levels[1], + top: db.lc.levels[0].tables, + bot: db.lc.levels[1].tables, + } + require.NoError(t, db.lc.runCompactDef(0, cdef)) + + // - Version 10, 9 lie above version 7 so they should be there. + // - Version 4, 3, 2 lie below the discardTs but they don't have the + // "bitDiscardEarlierVersions" versions set so they should not be removed because number + // of versions to keep is set to infinite. + // - Version 1 is below DiscardTS and below the first "bitDiscardEarlierVersions" + // marker so IT WILL BE REMOVED. + ExpectedKeys := []keyValVersion{ + {"foo", "bar", 10, bitDiscardEarlierVersions}, + {"foo", "bar", 9, 0}, + {"foo", "bar", 4, 0}, + {"foo", "bar", 3, 0}, + {"foo", "bar", 2, bitDiscardEarlierVersions}} + + getAllAndCheck(t, db, ExpectedKeys) + }) +} From 2a90c665f1e57fdc48256f222ed5d826c554043c Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Wed, 8 Jan 2020 17:55:04 +0530 Subject: [PATCH 2/8] Remove the 'this entry should've caught' log from value.go (#1170) Fixes - https://github.com/dgraph-io/badger/issues/1031 (There wasn't a bug to fix. The log statement shouldn't have been there) This PR removes the warning message `WARNING: This entry should have been caught.`. The warning message assumed that we will always find the **newest key if two keys have the same version** This assumption is valid in case of a normal key but it's **NOT TRUE** in case of **move keys**. Here's how we can end up fetching the older version of a move key if two move keys have the same version. ``` It might be possible that the entry read from LSM Tree points to an older vlog file. This can happen in the following situation. Assume DB is opened with numberOfVersionsToKeep=1 Now, if we have ONLY one key in the system "FOO" which has been updated 3 times and the same key has been garbage collected 3 times, we'll have 3 versions of the movekey for the same key "FOO". NOTE: moveKeyi is the moveKey with version i Assume we have 3 move keys in L0. - moveKey1 (points to vlog file 10), - moveKey2 (points to vlog file 14) and - moveKey3 (points to vlog file 15). Also, assume there is another move key "moveKey1" (points to vlog file 6) (this is also a move Key for key "FOO" ) on upper levels (let's say level 3). The move key "moveKey1" on level 0 was inserted because vlog file 6 was GCed. Here's what the arrangement looks like L0 => (moveKey1 => vlog10), (moveKey2 => vlog14), (moveKey3 => vlog15) L1 => .... L2 => .... L3 => (moveKey1 => vlog6) When L0 compaction runs, it keeps only moveKey3 because the number of versions to keep is set to 1. (we've dropped moveKey1's latest version) The new arrangement of keys is L0 => .... L1 => (moveKey3 => vlog15) L2 => .... L3 => (moveKey1 => vlog6) Now if we try to GC vlog file 10, the entry read from vlog file will point to vlog10 but the entry read from LSM Tree will point to vlog6. The move key read from LSM tree will point to vlog6 because we've asked for version 1 of the move key. This might seem like an issue but it's not really an issue because the user has set the number of versions to keep to 1 and the latest version of moveKey points to the correct vlog file and offset. The stale move key on L3 will be eventually dropped by compaction because there is a newer version in the upper levels. 
``` --- value.go | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/value.go b/value.go index feca8a1a2..ca9f0ab6b 100644 --- a/value.go +++ b/value.go @@ -523,12 +523,19 @@ func (vlog *valueLog) rewrite(f *logFile, tr trace.Trace) error { var vp valuePointer vp.Decode(vs.Value) + // If the entry found from the LSM Tree points to a newer vlog file, don't do anything. if vp.Fid > f.fid { return nil } + // If the entry found from the LSM Tree points to an offset greater than the one + // read from vlog, don't do anything. if vp.Offset > e.offset { return nil } + // If the entry read from LSM Tree and vlog file point to the same vlog file and offset, + // insert them back into the DB. + // NOTE: It might be possible that the entry read from the LSM Tree points to + // an older vlog file. See the comments in the else part. if vp.Fid == f.fid && vp.Offset == e.offset { moved++ // This new entry only contains the key, and a pointer to the value. @@ -564,7 +571,46 @@ func (vlog *valueLog) rewrite(f *logFile, tr trace.Trace) error { wb = append(wb, ne) size += es } else { - vlog.db.opt.Warningf("This entry should have been caught. %+v\n", e) + // It might be possible that the entry read from LSM Tree points to an older vlog file. + // This can happen in the following situation. Assume DB is opened with + // numberOfVersionsToKeep=1 + // + // Now, if we have ONLY one key in the system "FOO" which has been updated 3 times and + // the same key has been garbage collected 3 times, we'll have 3 versions of the movekey + // for the same key "FOO". + // NOTE: moveKeyi is the moveKey with version i + // Assume we have 3 move keys in L0. + // - moveKey1 (points to vlog file 10), + // - moveKey2 (points to vlog file 14) and + // - moveKey3 (points to vlog file 15). + + // Also, assume there is another move key "moveKey1" (points to vlog file 6) (this is + // also a move Key for key "FOO" ) on upper levels (let's say 3). The move key + // "moveKey1" on level 0 was inserted because vlog file 6 was GCed. + // + // Here's what the arrangement looks like + // L0 => (moveKey1 => vlog10), (moveKey2 => vlog14), (moveKey3 => vlog15) + // L1 => .... + // L2 => .... + // L3 => (moveKey1 => vlog6) + // + // When L0 compaction runs, it keeps only moveKey3 because the number of versions + // to keep is set to 1. (we've dropped moveKey1's latest version) + // + // The new arrangement of keys is + // L0 => .... + // L1 => (moveKey3 => vlog15) + // L2 => .... + // L3 => (moveKey1 => vlog6) + // + // Now if we try to GC vlog file 10, the entry read from vlog file will point to vlog10 + // but the entry read from LSM Tree will point to vlog6. The move key read from LSM tree + // will point to vlog6 because we've asked for version 1 of the move key. + // + // This might seem like an issue but it's not really an issue because the user has set + // the number of versions to keep to 1 and the latest version of moveKey points to the + // correct vlog file and offset. The stale move key on L3 will be eventually dropped by + // compaction because there is a newer versions in the upper levels. } return nil } From 2698bfc11d79feddf9515b1c8e9c25d3b5fba1d0 Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Tue, 14 Jan 2020 16:36:03 +0530 Subject: [PATCH 3/8] Avoid sync in inmemory mode (#1190) This makes db.Sync() no-op when badger is running in in-memory mode. The previous code would unnecessarily load up an atomic and acquire locks. 
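For reference, here is a minimal usage sketch of the path this touches, assuming the v2 in-memory option (`WithInMemory`). It only illustrates that calling `db.Sync()` on an in-memory DB is now effectively free, since the value-log sync underneath returns immediately:

```go
package main

import (
	"log"

	badger "github.com/dgraph-io/badger/v2"
)

func main() {
	// In-memory mode: nothing is written to disk, so there is nothing to fsync.
	opt := badger.DefaultOptions("").WithInMemory(true)
	db, err := badger.Open(opt)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// With this patch, the value-log sync underneath this call returns
	// immediately instead of loading the max-fid atomic and taking locks.
	if err := db.Sync(); err != nil {
		log.Fatal(err)
	}
}
```

The actual change is the one-line guard in `valueLog.sync` shown in the diff below.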
--- value.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/value.go b/value.go index ca9f0ab6b..d6d6d79ba 100644 --- a/value.go +++ b/value.go @@ -1315,7 +1315,7 @@ func (reqs requests) IncrRef() { // if fid >= vlog.maxFid. In some cases such as replay(while opening db), it might be called with // fid < vlog.maxFid. To sync irrespective of file id just call it with math.MaxUint32. func (vlog *valueLog) sync(fid uint32) error { - if vlog.opt.SyncWrites { + if vlog.opt.SyncWrites || vlog.opt.InMemory { return nil } From 9d6512b081d0c172f5e5153b289f2c97458da01c Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Tue, 14 Jan 2020 19:52:29 +0530 Subject: [PATCH 4/8] Use fastRand instead of locked-rand in skiplist (#1173) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The math/rand package (https://golang.org/src/math/rand/rand.go) uses a global lock to allow concurrent access to the rand object. The PR replaces `math.Rand` with `ristretto/z.FastRand()`. `FastRand` is much faster than `math.Rand` and `rand.New(..)` generators. The change improves concurrent writes to skiplist by ~30% ```go func BenchmarkWrite(b *testing.B) { value := newValue(123) l := NewSkiplist(int64((b.N + 1) * MaxNodeSize)) defer l.DecrRef() b.ResetTimer() b.RunParallel(func(pb *testing.PB) { rng := rand.New(rand.NewSource(time.Now().UnixNano())) for pb.Next() { l.Put(randomKey(rng), y.ValueStruct{Value: value, Meta: 0, UserMeta: 0}) } }) } ``` ``` name old time/op new time/op delta Write-16 657ns ± 3% 441ns ± 1% -32.94% (p=0.000 n=9+10) ``` --- skl/skl.go | 8 ++++---- skl/skl_test.go | 15 ++++++++++++++- table/table.go | 4 ++++ table/table_test.go | 5 ++++- 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/skl/skl.go b/skl/skl.go index cdfc599be..43694f14b 100644 --- a/skl/skl.go +++ b/skl/skl.go @@ -34,11 +34,11 @@ package skl import ( "math" - "math/rand" "sync/atomic" "unsafe" "github.com/dgraph-io/badger/v2/y" + "github.com/dgraph-io/ristretto/z" ) const ( @@ -165,9 +165,9 @@ func (s *node) casNextOffset(h int, old, val uint32) bool { // return n != nil && y.CompareKeys(key, n.key) > 0 //} -func randomHeight() int { +func (s *Skiplist) randomHeight() int { h := 1 - for h < maxHeight && rand.Uint32() <= heightIncrease { + for h < maxHeight && z.FastRand() <= heightIncrease { h++ } return h @@ -300,7 +300,7 @@ func (s *Skiplist) Put(key []byte, v y.ValueStruct) { } // We do need to create a new node. - height := randomHeight() + height := s.randomHeight() x := newNode(s.arena, key, v, height) // Try to increase s.height via CAS. 
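As an aside on why the switch helps: the package-level functions in `math/rand` serialize every caller on a single mutex, while `z.FastRand()` wraps the Go runtime's lock-free `fastrand`. A purely illustrative micro-benchmark of just the two generators (benchmark names are made up; results vary by machine) could look like this:

```go
package skl

import (
	"math/rand"
	"testing"

	"github.com/dgraph-io/ristretto/z"
)

// BenchmarkGlobalRand hammers the globally locked math/rand source from
// many goroutines at once.
func BenchmarkGlobalRand(b *testing.B) {
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			_ = rand.Uint32()
		}
	})
}

// BenchmarkFastRand does the same with z.FastRand, which has no shared lock.
func BenchmarkFastRand(b *testing.B) {
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			_ = z.FastRand()
		}
	})
}
```

That shared-lock contention is what the ~30% improvement in the skiplist write benchmark above comes from.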
diff --git a/skl/skl_test.go b/skl/skl_test.go index 6bd075862..0be7a64e4 100644 --- a/skl/skl_test.go +++ b/skl/skl_test.go @@ -499,7 +499,7 @@ func BenchmarkReadWriteMap(b *testing.B) { b.RunParallel(func(pb *testing.PB) { rng := rand.New(rand.NewSource(time.Now().UnixNano())) for pb.Next() { - if rand.Float32() < readFrac { + if rng.Float32() < readFrac { mutex.RLock() _, ok := m[string(randomKey(rng))] mutex.RUnlock() @@ -516,3 +516,16 @@ func BenchmarkReadWriteMap(b *testing.B) { }) } } + +func BenchmarkWrite(b *testing.B) { + value := newValue(123) + l := NewSkiplist(int64((b.N + 1) * MaxNodeSize)) + defer l.DecrRef() + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + rng := rand.New(rand.NewSource(time.Now().UnixNano())) + for pb.Next() { + l.Put(randomKey(rng), y.ValueStruct{Value: value, Meta: 0, UserMeta: 0}) + } + }) +} diff --git a/table/table.go b/table/table.go index d68169384..fc20be7c0 100644 --- a/table/table.go +++ b/table/table.go @@ -232,6 +232,7 @@ func OpenTable(fd *os.File, opts Options) (*Table, error) { if err := t.initBiggestAndSmallest(); err != nil { return nil, errors.Wrapf(err, "failed to initialize table") } + if opts.ChkMode == options.OnTableRead || opts.ChkMode == options.OnTableAndBlockRead { if err := t.VerifyChecksum(); err != nil { _ = fd.Close() @@ -320,6 +321,9 @@ func (t *Table) readIndex() error { readPos -= 4 buf := t.readNoFail(readPos, 4) checksumLen := int(y.BytesToU32(buf)) + if checksumLen < 0 { + return errors.New("checksum length less than zero. Data corrupted") + } // Read checksum. expectedChk := &pb.Checksum{} diff --git a/table/table_test.go b/table/table_test.go index 82bddf591..7aa47d547 100644 --- a/table/table_test.go +++ b/table/table_test.go @@ -743,7 +743,10 @@ func TestTableChecksum(t *testing.T) { f := buildTestTable(t, "k", 10000, opts) fi, err := f.Stat() require.NoError(t, err, "unable to get file information") - f.WriteAt(rb, rand.Int63n(fi.Size())) + // Write random bytes at random location. + n, err := f.WriteAt(rb, rand.Int63n(fi.Size())) + require.NoError(t, err) + require.Equal(t, n, len(rb)) _, err = OpenTable(f, opts) if err == nil || !strings.Contains(err.Error(), "checksum") { From 01a00cb4e6e39ece7ebff96217ac9ee8e128597f Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Tue, 14 Jan 2020 22:04:38 +0530 Subject: [PATCH 5/8] Add Jaegar to list of projects (#1192) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a3064653c..2f6599602 100644 --- a/README.md +++ b/README.md @@ -748,6 +748,7 @@ Below is a list of known projects that use Badger: * [0-stor](https://github.com/zero-os/0-stor) - Single device object store. * [Dgraph](https://github.com/dgraph-io/dgraph) - Distributed graph database. +* [Jaeger](https://github.com/jaegertracing/jaeger) - Distributed tracing platform. * [TalariaDB](https://github.com/grab/talaria) - Distributed, low latency time-series database. * [Dispatch Protocol](https://github.com/dispatchlabs/disgo) - Blockchain protocol for distributed application data analytics. * [Sandglass](https://github.com/celrenheit/sandglass) - distributed, horizontally scalable, persistent, time sorted message queue. From 5870b7b1975e031549f8a3d4309335f6b03f0fa4 Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Wed, 15 Jan 2020 15:10:34 +0530 Subject: [PATCH 6/8] Run all tests on CI (#1189) This could possibly be a bug in `go test` command https://github.com/golang/go/issues/36527 . 
The `go test` command would skip tests in sub-packages if the top-level package has a `custom flag` defined and the sub-packages don't define it. Issue https://github.com/golang/go/issues/36527#issue-548887632 has an example of this. This PR also removes the code from the test that would unnecessary start a web server. I see two problems here 1. An unnecessary web server running. 2. We cannot run multiple tests are the same time since the second run of the test would try to start a web server and crash saying `port already in use`. --- db_test.go | 8 +------- test.sh | 31 ++++++++++++++++++++++--------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/db_test.go b/db_test.go index f6185e44c..56c95de98 100644 --- a/db_test.go +++ b/db_test.go @@ -26,7 +26,6 @@ import ( "log" "math" "math/rand" - "net/http" "os" "path/filepath" "runtime" @@ -1957,12 +1956,7 @@ func TestVerifyChecksum(t *testing.T) { } func TestMain(m *testing.M) { - // call flag.Parse() here if TestMain uses flags - go func() { - if err := http.ListenAndServe("localhost:8080", nil); err != nil { - panic("Unable to open http port at 8080") - } - }() + flag.Parse() os.Exit(m.Run()) } diff --git a/test.sh b/test.sh index 90d21889c..b4e40601a 100755 --- a/test.sh +++ b/test.sh @@ -4,30 +4,43 @@ set -e go version +packages=$(go list ./... | grep github.com/dgraph-io/badger/v2/) + +if [[ ! -z "$TEAMCITY_VERSION" ]]; then + export GOFLAGS="-json" +fi + # Ensure that we can compile the binary. pushd badger go build -v . popd # Run the memory intensive tests first. -go test -v --manual=true -run='TestBigKeyValuePairs$' -go test -v --manual=true -run='TestPushValueLogLimit' +go test -v -run='TestBigKeyValuePairs$' --manual=true +go test -v -run='TestPushValueLogLimit' --manual=true # Run the special Truncate test. rm -rf p -go test -v --manual=true -run='TestTruncateVlogNoClose$' . +go test -v -run='TestTruncateVlogNoClose$' --manual=true truncate --size=4096 p/000000.vlog -go test -v --manual=true -run='TestTruncateVlogNoClose2$' . -go test -v --manual=true -run='TestTruncateVlogNoClose3$' . +go test -v -run='TestTruncateVlogNoClose2$' --manual=true +go test -v -run='TestTruncateVlogNoClose3$' --manual=true rm -rf p # Then the normal tests. +echo +echo "==> Starting test for table, skl and y package" +go test -v -race github.com/dgraph-io/badger/v2/skl +# Run test for all package except the top level package. The top level package support the +# `vlog_mmap` flag which rest of the packages don't support. +go test -v -race $packages + echo echo "==> Starting tests with value log mmapped..." -sleep 5 -go test -v --vlog_mmap=true -race ./... +# Run top level package tests with mmap flag. +go test -v -race github.com/dgraph-io/badger/v2 --vlog_mmap=true echo echo "==> Starting tests with value log not mmapped..." -sleep 5 -go test -v --vlog_mmap=false -race ./... +go test -v -race github.com/dgraph-io/badger/v2 --vlog_mmap=false + From 3747be59ebbbf34382ab8d84207151c7f7014da2 Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Wed, 15 Jan 2020 15:57:31 +0530 Subject: [PATCH 7/8] Improve write stalling on level 0 and 1 We don't need to stall writes if Level 1 does not have enough space. Level 1 is stored on the disk and it should be okay to have more number of tables (more size) on Level 1 than the `max level 1 size`. These tables will eventually be compacted to lower levels. This commit changes the following - We no longer stall writes if L1 doesn't have enough space. 
- We stall writes on level 0 only if `KeepL0InMemory` is true. - Upper levels (L0, L1, etc) get priority in compaction (previously, level with higher priority score would get preference) --- level_handler.go | 4 +- levels.go | 21 ++++---- levels_test.go | 138 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 12 deletions(-) diff --git a/level_handler.go b/level_handler.go index dbc2532ba..19ba0892b 100644 --- a/level_handler.go +++ b/level_handler.go @@ -188,7 +188,9 @@ func (s *levelHandler) tryAddLevel0Table(t *table.Table) bool { // Need lock as we may be deleting the first table during a level 0 compaction. s.Lock() defer s.Unlock() - if len(s.tables) >= s.db.opt.NumLevelZeroTablesStall { + // Return false only if L0 is in memory and number of tables is more than number of + // ZeroTableStall. For on disk L0, we should just add the tables to the level. + if s.db.opt.KeepL0InMemory && len(s.tables) >= s.db.opt.NumLevelZeroTablesStall { return false } diff --git a/levels.go b/levels.go index 39a68b4a4..8d621f3e5 100644 --- a/levels.go +++ b/levels.go @@ -424,9 +424,10 @@ func (s *levelsController) pickCompactLevels() (prios []compactionPriority) { prios = append(prios, pri) } } - sort.Slice(prios, func(i, j int) bool { - return prios[i].score > prios[j].score - }) + // We used to sort compaction priorities based on the score. But, we + // decided to compact based on the level, not the priority. So, upper + // levels (level 0, level 1, etc) always get compacted first, before the + // lower levels -- this allows us to avoid stalls. return prios } @@ -937,15 +938,13 @@ func (s *levelsController) addLevel0Table(t *table.Table) error { s.cstatus.RUnlock() timeStart = time.Now() } - // Before we unstall, we need to make sure that level 0 and 1 are healthy. Otherwise, we - // will very quickly fill up level 0 again and if the compaction strategy favors level 0, - // then level 1 is going to super full. + // Before we unstall, we need to make sure that level 0 is healthy. Otherwise, we + // will very quickly fill up level 0 again. for i := 0; ; i++ { - // Passing 0 for delSize to compactable means we're treating incomplete compactions as - // not having finished -- we wait for them to finish. Also, it's crucial this behavior - // replicates pickCompactLevels' behavior in computing compactability in order to - // guarantee progress. - if !s.isLevel0Compactable() && !s.levels[1].isCompactable(0) { + // It's crucial that this behavior replicates pickCompactLevels' behavior in + // computing compactability in order to guarantee progress. + // Break the loop once L0 has enough space to accommodate new tables. + if !s.isLevel0Compactable() { break } time.Sleep(10 * time.Millisecond) diff --git a/levels_test.go b/levels_test.go index 51e7baf72..66df2de01 100644 --- a/levels_test.go +++ b/levels_test.go @@ -19,6 +19,7 @@ package badger import ( "math" "testing" + "time" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" @@ -439,3 +440,140 @@ func TestDiscardFirstVersion(t *testing.T) { getAllAndCheck(t, db, ExpectedKeys) }) } + +// This test ensures we don't stall when L1's size is greater than opt.LevelOneSize. +// We should stall only when L0 tables more than the opt.NumLevelZeroTableStall. +func TestL1Stall(t *testing.T) { + opt := DefaultOptions("") + // Disable all compactions. + opt.NumCompactors = 0 + // Number of level zero tables. + opt.NumLevelZeroTables = 3 + // Addition of new tables will stall if there are 4 or more L0 tables. 
+ opt.NumLevelZeroTablesStall = 4 + // Level 1 size is 10 bytes. + opt.LevelOneSize = 10 + + runBadgerTest(t, &opt, func(t *testing.T, db *DB) { + // Level 0 has 4 tables. + db.lc.levels[0].Lock() + db.lc.levels[0].tables = []*table.Table{createEmptyTable(db), createEmptyTable(db), + createEmptyTable(db), createEmptyTable(db)} + db.lc.levels[0].Unlock() + + timeout := time.After(5 * time.Second) + done := make(chan bool) + + // This is important. Set level 1 size more than the opt.LevelOneSize (we've set it to 10). + db.lc.levels[1].totalSize = 100 + go func() { + tab := createEmptyTable(db) + db.lc.addLevel0Table(tab) + tab.DecrRef() + done <- true + }() + time.Sleep(time.Second) + + db.lc.levels[0].Lock() + // Drop two tables from Level 0 so that addLevel0Table can make progress. Earlier table + // count was 4 which is equal to L0 stall count. + toDrop := db.lc.levels[0].tables[:2] + decrRefs(toDrop) + db.lc.levels[0].tables = db.lc.levels[0].tables[2:] + db.lc.levels[0].Unlock() + + select { + case <-timeout: + t.Fatal("Test didn't finish in time") + case <-done: + } + }) +} + +func createEmptyTable(db *DB) *table.Table { + opts := table.Options{ + BloomFalsePositive: db.opt.BloomFalsePositive, + LoadingMode: options.LoadToRAM, + ChkMode: options.NoVerification, + } + b := table.NewTableBuilder(opts) + // Add one key so that we can open this table. + b.Add(y.KeyWithTs([]byte("foo"), 1), y.ValueStruct{}, 0) + fd, err := y.CreateSyncedFile(table.NewFilename(db.lc.reserveFileID(), db.opt.Dir), true) + if err != nil { + panic(err) + } + + if _, err := fd.Write(b.Finish()); err != nil { + panic(err) + } + tab, err := table.OpenTable(fd, table.Options{}) + if err != nil { + panic(err) + } + // Add dummy entry to manifest file so that it doesn't complain during compaction. + if err := db.manifest.addChanges([]*pb.ManifestChange{ + newCreateChange(tab.ID(), 0, 0, tab.CompressionType()), + }); err != nil { + panic(err) + } + + return tab +} + +func TestL0Stall(t *testing.T) { + test := func(t *testing.T, opt *Options) { + runBadgerTest(t, opt, func(t *testing.T, db *DB) { + db.lc.levels[0].Lock() + // Add NumLevelZeroTableStall+1 number of tables to level 0. This would fill up level + // zero and all new additions are expected to stall if L0 is in memory. + for i := 0; i < opt.NumLevelZeroTablesStall+1; i++ { + db.lc.levels[0].tables = append(db.lc.levels[0].tables, createEmptyTable(db)) + } + db.lc.levels[0].Unlock() + + timeout := time.After(5 * time.Second) + done := make(chan bool) + + go func() { + tab := createEmptyTable(db) + db.lc.addLevel0Table(tab) + tab.DecrRef() + done <- true + }() + // Let it stall for a second. + time.Sleep(time.Second) + + select { + case <-timeout: + if opt.KeepL0InMemory { + t.Log("Timeout triggered") + // Mark this test as successful since L0 is in memory and the + // addition of new table to L0 is supposed to stall. + } else { + t.Fatal("Test didn't finish in time") + } + case <-done: + // The test completed before 5 second timeout. Mark it as successful. + } + }) + } + + opt := DefaultOptions("") + opt.EventLogging = false + // Disable all compactions. + opt.NumCompactors = 0 + // Number of level zero tables. + opt.NumLevelZeroTables = 3 + // Addition of new tables will stall if there are 4 or more L0 tables. 
+ opt.NumLevelZeroTablesStall = 4 + + t.Run("with KeepL0InMemory", func(t *testing.T) { + opt.KeepL0InMemory = true + test(t, &opt) + }) + t.Run("with L0 on disk", func(t *testing.T) { + opt.KeepL0InMemory = false + test(t, &opt) + }) +} From 82381aceeed606625256e47b877edd6eef5e4011 Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Thu, 16 Jan 2020 13:19:39 +0530 Subject: [PATCH 8/8] Update ristretto to version 8f368f2 (#1195) This commit pulls following changes from ristretto ``` git log c1f00be0418e...8f368f2 --oneline ``` ``` 8f368f2 (HEAD -> master) Fix DeepSource warnings adb35f0 delete item immediately fce5e91 Support nil *Cache values in Clear and Close 4e224f9 Add .travis.yml 8350826 Fix the way metrics are handled for deletions 99d1bbb (tag: v0.0.1) default to 128bit hashing for collision checks 1eea1b1 atomic Get-Delete operation for eviction 8d6a8a7 add cache test for identifying MaxCost overflows ae09250 use the custom KeyToHash function if one is set 1dd5a4d #19 Fixes memory leak due to Oct 1st regression in processItems ``` --- go.mod | 2 +- go.sum | 4 ++-- table/table.go | 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index f7f1fb0d3..eae04e485 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.12 require ( github.com/DataDog/zstd v1.4.1 github.com/cespare/xxhash v1.1.0 - github.com/dgraph-io/ristretto v0.0.0-20191025175511-c1f00be0418e + github.com/dgraph-io/ristretto v0.0.2-0.20200115201040-8f368f2f2ab3 github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 github.com/dustin/go-humanize v1.0.0 github.com/golang/protobuf v1.3.1 diff --git a/go.sum b/go.sum index 60e673a75..4c71dbdf4 100644 --- a/go.sum +++ b/go.sum @@ -13,8 +13,8 @@ github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwc github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dgraph-io/ristretto v0.0.0-20191025175511-c1f00be0418e h1:aeUNgwup7PnDOBAD1BOKAqzb/W/NksOj6r3dwKKuqfg= -github.com/dgraph-io/ristretto v0.0.0-20191025175511-c1f00be0418e/go.mod h1:edzKIzGvqUCMzhTVWbiTSe75zD9Xxq0GtSBtFmaUTZs= +github.com/dgraph-io/ristretto v0.0.2-0.20200115201040-8f368f2f2ab3 h1:MQLRM35Pp0yAyBYksjbj1nZI/w6eyRY/mWoM1sFf4kU= +github.com/dgraph-io/ristretto v0.0.2-0.20200115201040-8f368f2f2ab3/go.mod h1:KPxhHT9ZxKefz+PCeOGsrHpl1qZ7i70dGTu2u+Ahh6E= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 h1:tdlZCpZ/P9DhczCTSixgIKmwPv6+wP5DGjqLYw5SUiA= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= diff --git a/table/table.go b/table/table.go index fc20be7c0..fe6ca6e25 100644 --- a/table/table.go +++ b/table/table.go @@ -358,7 +358,10 @@ func (t *Table) readIndex() error { y.Check(err) t.estimatedSize = index.EstimatedSize - t.bf = z.JSONUnmarshal(index.BloomFilter) + if t.bf, err = z.JSONUnmarshal(index.BloomFilter); err != nil { + return y.Wrapf(err, "failed to unmarshal bloom filter for the table %d in Table.readIndex", + t.id) + } t.blockIndex = index.Offsets return nil }
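To illustrate how the stricter index validation in this series (the negative checksum-length guard and the wrapped bloom-filter error in `Table.readIndex`) surfaces to callers, here is a hedged sketch; the file name, open flags, and option values are illustrative:

```go
package main

import (
	"log"
	"os"
	"strings"

	"github.com/dgraph-io/badger/v2/options"
	"github.com/dgraph-io/badger/v2/table"
)

func main() {
	// Hypothetical existing table file; Badger names tables like 000001.sst.
	fd, err := os.OpenFile("000001.sst", os.O_RDWR, 0666)
	if err != nil {
		log.Fatal(err)
	}
	opts := table.Options{
		LoadingMode: options.LoadToRAM,
		ChkMode:     options.OnTableAndBlockRead,
	}
	t, err := table.OpenTable(fd, opts)
	if err != nil {
		// Index corruption (negative checksum length, undecodable bloom
		// filter) now arrives here as a wrapped error.
		if strings.Contains(err.Error(), "checksum") || strings.Contains(err.Error(), "bloom filter") {
			log.Fatalf("table looks corrupted: %v", err)
		}
		log.Fatal(err)
	}
	defer t.DecrRef()
	log.Printf("opened table %d", t.ID())
}
```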