Skip to content

Commit

Permalink
leveldb: add log and descriptor decoders
Browse files Browse the repository at this point in the history
  • Loading branch information
mikez committed Dec 6, 2023
1 parent 78a3e94 commit 2df0f0f
Show file tree
Hide file tree
Showing 25 changed files with 571 additions and 92 deletions.
6 changes: 6 additions & 0 deletions format/all/all.fqtest
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ $ fq -n _registry.groups.probe
"gif",
"gzip",
"jpeg",
"leveldb_descriptor",
"leveldb_log",
"leveldb_table",
"luajit",
"macho",
"macho_fat",
Expand Down Expand Up @@ -111,6 +114,9 @@ ipv6_packet Internet protocol v6 packet
jpeg Joint Photographic Experts Group file
json JavaScript Object Notation
jsonl JavaScript Object Notation Lines
leveldb_descriptor LevelDB Descriptor
leveldb_log LevelDB Log
leveldb_table LevelDB Table
luajit LuaJIT 2.0 bytecode
macho Mach-O macOS executable
macho_fat Fat Mach-O macOS executable (multi-architecture)
Expand Down
4 changes: 3 additions & 1 deletion format/format.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,9 @@ var (
JPEG = &decode.Group{Name: "jpeg"}
JSON = &decode.Group{Name: "json"}
JSONL = &decode.Group{Name: "jsonl"}
LDB = &decode.Group{Name: "leveldb_ldb"}
LevelDB_Descriptor = &decode.Group{Name: "leveldb_descriptor"}
LDB = &decode.Group{Name: "leveldb_table"}
LOG = &decode.Group{Name: "leveldb_log"}
LuaJIT = &decode.Group{Name: "luajit"}
MachO = &decode.Group{Name: "macho"}
MachO_Fat = &decode.Group{Name: "macho_fat"}
Expand Down
125 changes: 125 additions & 0 deletions format/leveldb/leveldb_descriptor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package leveldb

// https://github.com/google/leveldb/blob/main/doc/impl.md#manifest
// https://github.com/google/leveldb/blob/main/db/version_edit.cc
//
// Files in LevelDB using this format include:
// - MANIFEST-*

import (
"embed"

"github.com/wader/fq/format"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/interp"
"github.com/wader/fq/pkg/scalar"
)

//go:embed leveldb_log.md
var leveldbDescriptorFS embed.FS

func init() {
interp.RegisterFormat(
format.LevelDB_Descriptor,
&decode.Format{
Description: "LevelDB Descriptor",
Groups: []*decode.Group{format.Probe},
DecodeFn: ldbDescriptorDecode,
})
interp.RegisterFS(leveldbDescriptorFS)
}

// Tag identifiers found in a manifest record. readManifest reads one of these
// as the "key" of every tag and uses it to decide how the tag's value is laid
// out. See the version_edit.cc reference at the top of this file for the
// upstream definitions.
const (
tagTypeComparator = 1
tagTypeLogNumber = 2
tagTypeNextFileNumber = 3
tagTypeLastSequence = 4
tagTypeCompactPointer = 5
tagTypeDeletedFile = 6
tagTypeNewFile = 7
// 8 not used anymore
tagTypePrevLogNumber = 9
)

// tagTypes maps each manifest tag identifier to the symbol attached to the
// "key" field decoded in readManifest.
//
// All symbols use snake_case so they are consistent with each other
// ("log_number" already was) and usable as plain identifiers in queries.
var tagTypes = scalar.UintMapSymStr{
	tagTypeComparator:     "comparator",
	tagTypeLogNumber:      "log_number",
	tagTypeNextFileNumber: "next_file_number",
	tagTypeLastSequence:   "last_sequence",
	tagTypeCompactPointer: "compact_pointer",
	tagTypeDeletedFile:    "deleted_file",
	tagTypeNewFile:        "new_file",
	tagTypePrevLogNumber:  "previous_log_number",
}

// ldbDescriptorDecode is the DecodeFn of the leveldb_descriptor format.
//
// It walks the input with readBlockSequence and supplies a data callback that
// decodes the payload of "full" records with readManifest. Payloads of every
// other record type are added as a raw "data" field. It always returns nil.
func ldbDescriptorDecode(d *decode.D) any {
	decodeData := func(size int64, recordType int, d *decode.D) {
		// Only unfragmented records carry a complete manifest payload.
		if recordType != recordTypeFull {
			d.FieldRawLen("data", size)
			return
		}
		d.FieldStruct("data", func(d *decode.D) {
			d.LimitedFn(size, readManifest)
		})
	}

	readBlockSequence(recordReadOptions{readDataFn: decodeData}, d)
	return nil
}

// readManifest decodes a manifest payload: a list of sorted tables for each
// level involving key ranges and other metadata.
//
// The payload is read as a "tags" array. Each "tag" struct starts with a
// ULEB128 "key" that selects the layout of its "value". Tags are read until
// d reports the end of its input; an unrecognised key aborts decoding via
// d.Fatalf.
func readManifest(d *decode.D) {
	decodeTag := func(d *decode.D) {
		switch key := d.FieldULEB128("key", tagTypes); key {
		case tagTypeComparator:
			readLengthPrefixedString("value", d)

		case tagTypeLogNumber, tagTypePrevLogNumber, tagTypeNextFileNumber, tagTypeLastSequence:
			// These tags carry a single varint.
			d.FieldULEB128("value")

		case tagTypeCompactPointer:
			d.FieldStruct("value", func(d *decode.D) {
				d.FieldULEB128("level")
				readTagInternalKey("internal_key", d)
			})

		case tagTypeDeletedFile:
			d.FieldStruct("value", func(d *decode.D) {
				d.FieldULEB128("level")
				d.FieldULEB128("file_number")
			})

		case tagTypeNewFile:
			d.FieldStruct("value", func(d *decode.D) {
				d.FieldULEB128("level")
				d.FieldULEB128("file_number")
				d.FieldULEB128("file_size")
				readTagInternalKey("smallest_internal_key", d)
				readTagInternalKey("largest_internal_key", d)
			})

		default:
			d.Fatalf("unknown tag: %d", key)
		}
	}

	d.FieldArray("tags", func(d *decode.D) {
		for !d.End() {
			d.FieldStruct("tag", decodeTag)
		}
	})
}

// readLengthPrefixedString adds a struct field called name that holds a
// ULEB128 "length" followed by a UTF-8 "data" field of that length.
func readLengthPrefixedString(name string, d *decode.D) {
	d.FieldStruct(name, func(d *decode.D) {
		n := int(d.FieldULEB128("length"))
		d.FieldUTF8("data", n)
	})
}

// readTagInternalKey adds a struct field called name that holds a ULEB128
// "length" followed by a "data" field, which readInternalKey decodes using
// that length.
func readTagInternalKey(name string, d *decode.D) {
	d.FieldStruct(name, func(d *decode.D) {
		keyLen := int64(d.FieldULEB128("length"))
		readInternalKey("data", keyLen, d)
	})
}
13 changes: 13 additions & 0 deletions format/leveldb/leveldb_descriptor.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
### Limitations

- Fragmented records (any record whose type is not "full") are kept as raw data and are not decoded further.


### Authors

- [@mikez](https://github.com/mikez), original author

### References

- https://github.com/google/leveldb/blob/main/doc/impl.md#manifest
- https://github.com/google/leveldb/blob/main/db/version_edit.cc
152 changes: 152 additions & 0 deletions format/leveldb/leveldb_log.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
package leveldb

// https://github.com/google/leveldb/blob/main/doc/log_format.md
//
// Files in LevelDB using this format include:
// - *.log
// - MANIFEST-*

import (
"embed"

"github.com/wader/fq/format"
"github.com/wader/fq/internal/mathex"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/interp"
"github.com/wader/fq/pkg/scalar"
)

// leveldbLogFS embeds leveldb_log.md so that init can pass it to
// interp.RegisterFS.
//
//go:embed leveldb_log.md
var leveldbLogFS embed.FS

func init() {
interp.RegisterFormat(
format.LOG,
&decode.Format{
Description: "LevelDB Log",
Groups: []*decode.Group{format.Probe},
DecodeFn: ldbLogDecode,
})
interp.RegisterFS(leveldbLogFS)
}

// recordReadOptions customizes how the shared log reader (readBlockSequence,
// readLogBlock, readLogRecord) decodes the payload of each record.
type recordReadOptions struct {
// Both .log- and MANIFEST-files use the Log-format,
// i.e., a sequence of records split into 32KB blocks.
// However, the format of the data within the records differ.
// This function specifies how to read said data.
//
// readLogRecord calls it once per record, after the header, with:
//   - size: the payload length in bits (the header's "length" times 8)
//   - recordType: the header's "record_type" value
//   - d: the decoder positioned at the start of the payload
readDataFn func(size int64, recordType int, d *decode.D)
}

// Log-format constants. Sizes are expressed in bits (note the "* 8"), since
// they are compared against d.BitsLeft() and passed to d.LimitedFn.
//
// https://github.com/google/leveldb/blob/main/db/log_format.h
const (
// checksum (4 bytes) + length (2 bytes) + record type (1 byte)
headerSize = (4 + 2 + 1) * 8

blockSize = (32 * 1024) * 8 // 32KB

// Values of the "record_type" header field.
recordTypeZero = 0 // preallocated file regions
recordTypeFull = 1
recordTypeFirst = 2 // fragments
recordTypeMiddle = 3
recordTypeLast = 4
)

// recordTypes maps each record type value to the symbol attached to the
// "record_type" header field in readLogRecord.
var recordTypes = scalar.UintMapSymStr{
recordTypeZero: "zero",
recordTypeFull: "full",
recordTypeFirst: "first",
recordTypeMiddle: "middle",
recordTypeLast: "last",
}

// ldbLogDecode is the DecodeFn of the leveldb_log format.
//
// It walks the input with readBlockSequence and adds every record payload as
// a raw "data" field, regardless of record type. It always returns nil.
func ldbLogDecode(d *decode.D) any {
	rawData := func(size int64, _ int, d *decode.D) {
		d.FieldRawLen("data", size)
	}

	readBlockSequence(recordReadOptions{readDataFn: rawData}, d)
	return nil
}

// readBlockSequence decodes the input as a "blocks" array of log blocks, each
// at most blockSize bits long (the final block may be shorter). Decoding is
// switched to little-endian first.
//
// Blocks are read for as long as at least a record header's worth of bits
// remains. Whatever is left afterwards is added as a raw "truncated_block".
//
// https://github.com/google/leveldb/blob/main/db/log_reader.cc#L189
func readBlockSequence(rro recordReadOptions, d *decode.D) {
	d.Endian = decode.LittleEndian

	decodeBlock := bind(readLogBlock, rro)
	d.FieldArray("blocks", func(d *decode.D) {
		for d.BitsLeft() >= headerSize {
			span := mathex.Min(blockSize, d.BitsLeft())
			d.LimitedFn(span, func(d *decode.D) {
				d.FieldStruct("block", decodeBlock)
			})
		}
	})

	// The reference implementation says:
	// "[...] if buffer_ is non-empty, we have a truncated header at the
	// end of the file, which can be caused by the writer crashing in the
	// middle of writing the header. Instead of considering this an error,
	// just report EOF."
	if leftover := d.BitsLeft(); leftover > 0 {
		d.FieldRawLen("truncated_block", leftover)
	}
}

// readLogBlock decodes one log block: up to 32KB of records followed by an
// optional trailer.
//
//	block := record* trailer?
//
// The caller is expected to have limited d to the block; decoding aborts via
// d.Fatalf if more than blockSize bits are available. Records are read while
// at least a header's worth of bits remains, and any remaining bits are added
// as a raw "trailer" field.
func readLogBlock(rro recordReadOptions, d *decode.D) {
	if d.BitsLeft() > blockSize {
		d.Fatalf("Bits left greater than maximum log-block size of 32KB.")
	}

	// record*
	decodeRecord := bind(readLogRecord, rro)
	d.FieldArray("records", func(d *decode.D) {
		for d.BitsLeft() >= headerSize {
			d.FieldStruct("record", decodeRecord)
		}
	})

	// trailer?
	if rest := d.BitsLeft(); rest > 0 {
		d.FieldRawLen("trailer", rest)
	}
}

// readLogRecord decodes a single log record: a fixed-size header followed by
// its payload.
//
//	checksum: uint32     // crc32c of type and data[] ; little-endian
//	length: uint16       // little-endian
//	type: uint8          // One of FULL, FIRST, MIDDLE, LAST
//	data: uint8[length]
//
// After the header, the checksum is recomputed over the record type byte plus
// the payload and attached to the "checksum" field as an assertion. The
// payload itself is decoded by rro.readDataFn.
//
// via https://github.com/google/leveldb/blob/main/doc/log_format.md
func readLogRecord(rro recordReadOptions, d *decode.D) {
	var (
		checksumField *decode.Value
		payloadBytes  int64
		kind          int
	)

	// header
	d.LimitedFn(headerSize, func(d *decode.D) {
		d.FieldStruct("header", func(d *decode.D) {
			d.FieldU32("checksum", scalar.UintHex)
			checksumField = d.FieldGet("checksum")
			payloadBytes = int64(d.FieldU16("length"))
			kind = int(d.FieldU8("record_type", recordTypes))
		})
	})

	// verify checksum: record type (1 byte) + data (`length` bytes).
	// The record type is the last header byte, hence the step back of 8 bits.
	coveredStart := d.Pos() - 8
	coveredBits := (1 + payloadBytes) * 8
	d.RangeFn(coveredStart, coveredBits, func(d *decode.D) {
		covered := d.Bits(int(d.BitsLeft()))
		expected := computeChecksum(covered)
		// The returned error is deliberately discarded; presumably a mismatch
		// should be reported on the field instead of aborting (TODO confirm).
		_ = checksumField.TryUintScalarFn(d.UintAssert(uint64(expected)))
	})

	// data
	rro.readDataFn(payloadBytes*8, kind, d)
}

// bind fixes the recordReadOptions argument of f and returns a function that
// takes only the decoder, which is the shape d.FieldStruct is called with in
// this file.
func bind(f func(recordReadOptions, *decode.D), rro recordReadOptions) func(*decode.D) {
	bound := func(d *decode.D) { f(rro, d) }
	return bound
}
11 changes: 11 additions & 0 deletions format/leveldb/leveldb_log.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
### Limitations

- Fragmented records are not merged, and record data is not decoded further.

### Authors

- [@mikez](https://github.com/mikez), original author

### References

- https://github.com/google/leveldb/blob/main/doc/log_format.md
Loading

0 comments on commit 2df0f0f

Please sign in to comment.