theme: TargetFira, 1
slidenumbers: true
No coordination - "Silence is Golden"1
[.code-highlight: 3, 10-16]
@file:DependsOn("org.jetbrains.kotlin:kotlin-stdlib-jdk7:1.7.10")
@file:DependsOn("org.rocksdb:rocksdbjni:7.7.3")
import org.rocksdb.*
RocksDB.loadLibrary() // loads the rocksdb native library; done once, before any database is opened
val simpleRocksDbDir = "/tmp/simpleRocksDb" // default directory used by withSimpleRocksDb
// Opens a rocksdb instance in `dir` (a temp directory by default), hands it to `block` and returns
// the block's result. `use` closes the instance afterwards; the "default" column family stores all values.
fun <T> withSimpleRocksDb(dir: String = simpleRocksDbDir, block: (RocksDB) -> T): T =
    RocksDB.open(dir).use { rocksDb ->
        block(rocksDb) // the lambda's last expression is already the result; no `return` needed
    }
// RocksDB itself only speaks ByteArray for keys and values; these extensions accept/return Strings (UTF-8)
fun RocksDB.put(key: String, value: String) = put(key.toByteArray(Charsets.UTF_8), value.toByteArray(Charsets.UTF_8))
fun RocksDB.get(key: String): String? = get(key.toByteArray(Charsets.UTF_8))?.toString(Charsets.UTF_8)
[.code-highlight: 2-4, 7]
withSimpleRocksDb { db ->
    db.put(key = "current_year", value = "2024")
    println(db.get(key = "current_year"))
}
2024
[.code-highlight: 1, 6-12, 15, 18]
// write 100M key/value pairs in 1M tuple batches
withSimpleRocksDb { rocksDb ->
    val defaultColumnFamily = rocksDb.defaultColumnFamily
    // WriteOptions holds a native handle just like WriteBatch, so it is scoped with `use` too
    WriteOptions().use { writeOptions ->
        WriteBatch().use { batch ->
            (1..100_000_000).forEach { ordinal ->
                batch.put(defaultColumnFamily, "key_${ordinal}".toByteArray(), """{ "value": ${ordinal} }""".toByteArray())
                if (ordinal % 1_000_000 == 0) {
                    // flush the accumulated batch, then reuse the same batch object
                    rocksDb.write(writeOptions, batch)
                    batch.clear()
                }
            }
        }
    }
    println(rocksDb.get("key_9000"))
}
{ "value": 9000 }
[.code-highlight: 1, 4-11, 14-18]
// seek to key_9000 and print the next 5 values
withSimpleRocksDb { db ->
    db.newIterator(db.defaultColumnFamily).use { cursor ->
        cursor.seek("key_9000".toByteArray())
        for (position in 1..5) {
            if (!cursor.isValid) break // ran off the end of the keyspace
            println("${position}: ${String(cursor.key())} -> ${String(cursor.value())}")
            cursor.next()
        }
    }
}
1: key_9000 -> { "value": 9000 }
2: key_90000 -> { "value": 90000 }
3: key_900000 -> { "value": 900000 }
4: key_9000000 -> { "value": 9000000 }
5: key_90000000 -> { "value": 90000000 }
Writing is Random I/O & Can Update Many Pages on Disk 2
SST Files - LSM Building Block3
SSTable - Block Size4
SSTable - Index 4
Bloom/Ribbon Filters5
SkipList
(binary search) is the default6
Recommendation: use ZSTD for the bottom level (best compression, more CPU) and LZ4 for everything else.
// The compression type setters are called directly on the options object being configured
// (Options / ColumnFamilyOptions); CompressionOptions only carries tuning knobs and is not needed here.
setCompressionType(CompressionType.LZ4_COMPRESSION)
setBottommostCompressionType(CompressionType.ZSTD_COMPRESSION)
[.code-highlight: 1, 6-8, 17, 24-27]
// write 100M key/value pairs in 1M tuple batches twice, so each key has a duplicate value
withSimpleRocksDb { rocksDb ->
    val defaultColumnFamily = rocksDb.defaultColumnFamily
    // WriteOptions holds a native handle just like WriteBatch, so it is scoped with `use` too
    WriteOptions().use { writeOptions ->
        WriteBatch().use { batch ->
            repeat(2) {
                (1..100_000_000).forEach { ordinal ->
                    batch.put(defaultColumnFamily, "key_${ordinal}".toByteArray(), """{ "value": ${ordinal} }""".toByteArray())
                    if (ordinal % 1_000_000 == 0) {
                        // flush the accumulated batch, then reuse the same batch object
                        rocksDb.write(writeOptions, batch)
                        batch.clear()
                    }
                }
            }
        }
    }
    // last expression: the stats text is the value of the whole block
    rocksDb.getProperty(defaultColumnFamily, "rocksdb.stats")
}
** Compaction Stats [default] **
Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 19/1 327.53 MB 4.5 0.0 0.0 0.0 1.7 1.7 0.0 1.0 0.0 15.0 113.43 99 1.146 0 0 0.0 0.0
L1 12/1 742.03 MB 2.7 3.9 1.3 2.6 3.6 1.0 0.0 2.7 11.1 10.2 361.10 23 15.700 454M 40M 0.0 0.0
L2 11/0 661.98 MB 0.3 0.6 0.3 0.3 0.3 0.0 0.0 1.0 13.8 6.9 47.03 5 9.407 85M 36M 0.0 0.0
Sum 42/2 1.69 GB 0.0 4.5 1.7 2.9 5.6 2.7 0.0 3.3 8.9 11.0 521.56 127 4.107 539M 76M 0.0 0.0
Int 0/0 0.00 KB 0.0 4.1 1.5 2.7 4.6 2.0 0.0 4.2 9.5 10.7 443.91 89 4.988 493M 76M 0.0 0.0
[.code-highlight: 2, 3, 9-12]
withSimpleRocksDb { db ->
    db.compactRange(db.defaultColumnFamily)
    db.getProperty(db.defaultColumnFamily, "rocksdb.stats")
}
** Compaction Stats [default] **
Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 0/0 0.00 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 14.9 1.15 1 1.154 0 0 0.0 0.0
L1 0/0 0.00 KB 0.0 1.1 0.3 0.7 0.8 0.1 0.0 2.4 11.9 8.9 91.37 1 91.368 123M 31M 0.0 0.0
L2 11/0 661.98 MB 0.3 1.5 0.8 0.7 0.7 0.0 0.0 0.9 14.1 6.6 109.67 2 54.835 201M 91M 0.0 0.0
Sum 11/0 661.98 MB 0.0 2.6 1.1 1.4 1.5 0.1 0.0 90.5 13.0 7.7 202.19 4 50.548 325M 123M 0.0 0.0
Int 0/0 0.00 KB 0.0 2.6 1.1 1.4 1.5 0.1 0.0 161.0 13.1 7.7 201.04 3 67.013 325M 123M 0.0 0.0
[.code-highlight: 1, 6, 7, 14, 15, 18, 22-24]
// write 10M key/value pairs OF THE SAME KEY in 1M tuple batches
withSimpleRocksDb { rocksDb ->
    val defaultColumnFamily = rocksDb.defaultColumnFamily
    // WriteOptions holds a native handle just like WriteBatch, so it is scoped with `use` too
    WriteOptions().use { writeOptions ->
        WriteBatch().use { batch ->
            (1..10_000_000).forEach { ordinal ->
                batch.put(defaultColumnFamily, "the_key".toByteArray(), """{ "value": ${ordinal} }""".toByteArray())
                if (ordinal % 1_000_000 == 0) {
                    // flush the accumulated batch, then reuse the same batch object
                    rocksDb.write(writeOptions, batch)
                    batch.clear()
                }
            }
        }
    }
    println(rocksDb.get("the_key"))
    // last expression: the stats text is the value of the whole block
    rocksDb.getProperty(defaultColumnFamily, "rocksdb.stats")
}
{ "value": 10000000 }
** Compaction Stats [default] **
Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 3/0 3.03 KB 0.8 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.50 5 0.100 0 0 0.0 0.0
L1 1/0 1.01 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.3 0.8 0.2 0.01 1 0.006 5 4 0.0 0.0
Sum 4/0 4.04 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.2 0.0 0.0 0.51 6 0.084 5 4 0.0 0.0
Int 0/0 0.00 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.2 0.0 0.0 0.51 6 0.084 5 4 0.0 0.0
[.code-highlight: 1, 3, 4, 11-13]
// run a manual compaction over the default column family, then fetch its stats
withSimpleRocksDb { db ->
    db.compactRange(db.defaultColumnFamily)
    db.getProperty(db.defaultColumnFamily, "rocksdb.stats")
}
** Compaction Stats [default] **
Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 0/0 0.00 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.12 1 0.123 0 0 0.0 0.0
L1 1/0 1.01 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.3 2.5 0.5 0.00 1 0.002 5 4 0.0 0.0
Sum 1/0 1.01 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.13 2 0.063 5 4 0.0 0.0
Int 0/0 0.00 KB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.13 2 0.063 5 4 0.0 0.0
Read/Write/Space Amplification
RUM Conjecture7
cacheIndexAndFilterBlocks = true
cacheIndexAndFilterBlocksWithHighPriority = true
pinL0FilterAndIndexBlocksInCache = true
# Add to your ubuntu-flavored Dockerfile: installs jemalloc and preloads it via LD_PRELOAD
# NOTE(review): the library path below is the x86_64 multiarch directory — confirm it for other architectures
RUN apt-get update && apt-get install -y libjemalloc-dev
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide
https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html
https://github.com/facebook/rocksdb/blob/main/include/rocksdb/options.h
https://github.com/facebook/rocksdb/blob/main/include/rocksdb/advanced_options.h
Footnotes
-
youtube: Peter Bailis - Silence is Golden: Coordination-Avoiding System Design ↩
-
image source: Designing Data Intensive Applications, P76 ↩
-
image source: Designing Data Intensive Applications, P77 ↩ ↩2
-
image source: https://en.wikipedia.org/wiki/Bloom_filter ↩
-
image source: https://en.wikipedia.org/wiki/Skip_list ↩
-
PDF - https://stratos.seas.harvard.edu/files/stratos/files/rum.pdf ↩