Skip to content

Commit

Permalink
WIP: sqlite3: Add decoder
Browse files Browse the repository at this point in the history
See sqlite3.{go,jq} for TODO

Related to #27
  • Loading branch information
wader committed Dec 28, 2021
1 parent 1a4b332 commit 9959d59
Show file tree
Hide file tree
Showing 7 changed files with 339 additions and 0 deletions.
1 change: 1 addition & 0 deletions format/all/all.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
_ "github.com/wader/fq/format/png"
_ "github.com/wader/fq/format/protobuf"
_ "github.com/wader/fq/format/raw"
_ "github.com/wader/fq/format/sqlite3"
_ "github.com/wader/fq/format/tar"
_ "github.com/wader/fq/format/tiff"
_ "github.com/wader/fq/format/vorbis"
Expand Down
2 changes: 2 additions & 0 deletions format/format.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ const (
WAV = "wav"
WEBP = "webp"
ZIP = "zip"

SQLITE3 = "sqlite3"
)

// below are data types used to communicate between formats <FormatName>In/Out
Expand Down
274 changes: 274 additions & 0 deletions format/sqlite3/sqlite3.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
package sqlite3

// https://www.sqlite.org/fileformat.html
// https://sqlite.org/schematab.html

// TODO: page overflow
// TODO: format version
// TODO: text encoding
// TODO: table/column names
// TODO: assert version and schema version?
// TODO: ptrmap
// TODO: how to represent NULL serials

// CREATE TABLE sqlite_schema(
// type text,
// name text,
// tbl_name text,
// rootpage integer,
// sql text
// );
// > A table with the name "sqlite_sequence" that is used to keep track of the maximum historical INTEGER PRIMARY KEY for a table using AUTOINCREMENT.
// CREATE TABLE sqlite_sequence(name,seq);
// > Tables with names of the form "sqlite_statN" where N is an integer. Such tables store database statistics gathered by the ANALYZE command and used by the query planner to help determine the best algorithm to use for each query.
// CREATE TABLE sqlite_stat1(tbl,idx,stat);
// Only if compiled with SQLITE_ENABLE_STAT2:
// CREATE TABLE sqlite_stat2(tbl,idx,sampleno,sample);
// Only if compiled with SQLITE_ENABLE_STAT3:
// CREATE TABLE sqlite_stat3(tbl,idx,nEq,nLt,nDLt,sample);
// Only if compiled with SQLITE_ENABLE_STAT4:
// CREATE TABLE sqlite_stat4(tbl,idx,nEq,nLt,nDLt,sample);
// TODO: sqlite_autoindex_TABLE_N index

import (
"embed"

"github.com/wader/fq/format"
"github.com/wader/fq/format/registry"
"github.com/wader/fq/internal/num"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/scalar"
)

// sqlite3FS embeds the *.jq files that sit next to this source file. It is
// handed to the registry as the format's Files in init.
//
//go:embed *.jq
var sqlite3FS embed.FS

func init() {
registry.MustRegister(decode.Format{
Name: format.SQLITE3,
Description: "SQLite v3 database",
Groups: []string{format.PROBE},
DecodeFn: sqlite3Decode,
Files: sqlite3FS,
})
}

// B-tree page type codes, as read from the one-byte "type" field that starts
// every page header decoded in sqlite3Decode.
const (
bTreeIndexInterior = 0x02
bTreeTableInterior = 0x05
bTreeIndexLeaf = 0x0a
bTreeTableLeaf = 0x0d
)

// bTreeTypeMap maps a b-tree page type code to its symbolic name and a
// human-readable description; it is applied to the page "type" field.
var bTreeTypeMap = scalar.UToScalar{
bTreeIndexInterior: scalar.S{Sym: "index_interior", Description: "Index interior b-tree page"},
bTreeTableInterior: scalar.S{Sym: "table_interior", Description: "Table interior b-tree page"},
bTreeIndexLeaf: scalar.S{Sym: "index_leaf", Description: "Index leaf b-tree page"},
bTreeTableLeaf: scalar.S{Sym: "table_leaf", Description: "Table leaf b-tree page"},
}

// Values of the "text_encoding" field in the database header.
const (
textEncodingUTF8 = 1
textEncodingUTF16LE = 2
textEncodingUTF16BE = 3
)

// textEncodingMap maps a "text_encoding" header value to its symbolic name.
var textEncodingMap = scalar.UToSymStr{
textEncodingUTF8: "utf8",
textEncodingUTF16LE: "utf16le",
textEncodingUTF16BE: "utf16be",
}

// versionMap maps the "write_version" and "read_version" header values to
// their symbolic names: 1 is the legacy (rollback journal) format, 2 is WAL.
var versionMap = scalar.UToSymStr{
1: "legacy",
2: "wal",
}

// varintDecode reads one SQLite variable-length integer from d and returns it
// as a signed 64-bit value.
//
// A varint is 1 to 9 bytes long, most significant part first. Each of the
// first eight bytes contributes its low 7 bits and uses its high bit as a
// "more bytes follow" flag. If a ninth byte is reached it has no flag bit:
// all 8 of its bits are data, giving 8*7 + 8 = 64 bits in total.
//
// TODO: two complement on bit read count
func varintDecode(d *decode.D) int64 {
	var n uint64
	for i := 0; i < 8; i++ {
		v := d.U8()
		n = n<<7 | v&0b0111_1111
		if v&0b1000_0000 == 0 {
			return num.TwosComplement(64, n)
		}
	}
	// Ninth byte: every bit is payload, there is no continuation flag.
	n = n<<8 | d.U8()
	return num.TwosComplement(64, n)
}

// sqlite3DecodeSerialType adds a "value" field to d for one record column
// whose serial type code is typ.
//
// Codes 0-9 have a fixed meaning: NULL, signed integers of 8/16/24/32/48/64
// bits, a 64-bit float, and the constants 0 and 1 (which occupy no bytes).
// Codes 10 and 11 add no field at all. Every other code describes
// variable-length content: an even code is a blob of (typ-12)/2 bytes, an odd
// code is a text of (typ-13)/2 bytes.
func sqlite3DecodeSerialType(d *decode.D, typ int64) {
	// Everything outside the fixed 0..11 range carries its length in the code.
	if typ < 0 || typ > 11 {
		if typ%2 != 0 {
			// N => 13 and odd: (N-13)/2 bytes text
			d.FieldUTF8("value", int(typ-13)/2, scalar.Description("text"))
			return
		}
		// N => 12 and even: (N-12)/2 bytes blob.
		d.FieldRawLen("value", (typ-12)/2*8, scalar.Description("blob"))
		return
	}

	switch typ {
	case 0:
		d.FieldValueStr("value", "NULL", scalar.Description("null"))
	case 1:
		d.FieldS8("value", scalar.Description("8-bit integer"))
	case 2:
		d.FieldS16("value", scalar.Description("16-bit integer"))
	case 3:
		d.FieldS24("value", scalar.Description("24-bit integer"))
	case 4:
		d.FieldS32("value", scalar.Description("32-bit integer"))
	case 5:
		d.FieldS48("value", scalar.Description("48-bit integer"))
	case 6:
		d.FieldS64("value", scalar.Description("64-bit integer"))
	case 7:
		d.FieldF64("value", scalar.Description("64-bit float"))
	case 8:
		d.FieldValueU("value", 0, scalar.Description("constant 0"))
	case 9:
		d.FieldValueU("value", 1, scalar.Description("constant 1"))
	}
}

// sqlite3CellFreeblockDecode decodes one freeblock at the current position of
// d and returns the offset of the next freeblock in the chain, or 0 when this
// was the last one. The caller adds the returned offset to the page start.
//
// Every freeblock, including the last one in the chain, starts with the same
// 4-byte header (next_offset followed by size), so size and the free space
// are decoded even when next_offset is 0.
func sqlite3CellFreeblockDecode(d *decode.D) uint64 {
	nextOffset := d.FieldU16("next_offset")
	// TODO: "header" is size bytes or offset+size? seems to be just size
	// "size of the freeblock in bytes, including the 4-byte header"
	size := d.FieldU16("size")
	d.FieldRawLen("space", int64(size-4)*8)
	return nextOffset
}

// sqlite3CellPayloadDecode decodes a record: a header made of a varint
// "length" followed by one serial type varint per column ("serials"), and
// then the column values themselves ("contents"), one per serial type.
func sqlite3CellPayloadDecode(d *decode.D) {
	headerStart := d.Pos()
	headerLen := d.FieldSFn("length", varintDecode)
	// The header length is in bytes and is reduced here by the bits already
	// consumed for the length varint itself.
	remainingHeaderBits := headerLen*8 - (d.Pos() - headerStart)

	var columnTypes []int64
	d.LenFn(remainingHeaderBits, func(d *decode.D) {
		d.FieldArray("serials", func(d *decode.D) {
			for {
				if d.End() {
					break
				}
				columnTypes = append(columnTypes, d.FieldSFn("serial", varintDecode))
			}
		})
	})

	d.FieldArray("contents", func(d *decode.D) {
		for i := range columnTypes {
			sqlite3DecodeSerialType(d, columnTypes[i])
		}
	})
}

// sqlite3Decode is the DecodeFn of the sqlite3 format. It decodes the
// 100-byte database header into a "header" struct and then every page
// (database_size_pages of them, each page_size bytes apart) into a "pages"
// array. The in argument is not used and the function always returns nil.
//
// NOTE(review): every page is decoded as a b-tree page; nothing here checks
// for freelist, overflow or pointer-map pages (see the TODOs at the top of
// the file).
func sqlite3Decode(d *decode.D, in interface{}) interface{} {
var pageSizeS *scalar.S
var databaseSizePages uint64

d.FieldStruct("header", func(d *decode.D) {
d.FieldUTF8("magic", 16, d.AssertStr("SQLite format 3\x00"))
pageSizeS = d.FieldScalarU16("page_size", scalar.UToSymU{1: 65536}) // Page size in bytes. Must be a power of two between 512 and 32768 inclusive, or the value 1 representing a page size of 65536.
d.FieldU8("write_version", versionMap) // File format write version. 1 for legacy; 2 for WAL.
d.FieldU8("read_version", versionMap) // File format read version. 1 for legacy; 2 for WAL.
d.FieldU8("unused_space") // Bytes of unused space at the end of each page. Usually 0.
d.FieldU8("maximum_embedded_payload_fraction") // Must be 64.
d.FieldU8("minimum_embedded_payload_fraction") // Must be 32.
d.FieldU8("leaf_payload_fraction") // Must be 32.
d.FieldU32("file_change_counter") // File change counter.
databaseSizePages = d.FieldU32("database_size_pages") // Size of the database file in pages. The "in-header database size".
d.FieldU32("page_number_freelist") // Page number of the first freelist trunk page.
d.FieldU32("total_number_freelist") // Total number of freelist pages.
d.FieldU32("schema_cookie") // The schema cookie.
d.FieldU32("schema_format_number") // The schema format number. Supported schema formats are 1, 2, 3, and 4.
d.FieldU32("default_page_cache_size") // Default page cache size.
d.FieldU32("page_number_largest_root_btree") // Page number of the largest root b-tree page when in auto-vacuum or incremental-vacuum modes, or zero otherwise.
d.FieldU32("text_encoding", textEncodingMap) // Database text encoding, see textEncodingMap.
d.FieldU32("user_version") // The "user version" as read and set by the user_version pragma.
d.FieldU32("incremental_vacuum_mode") // True (non-zero) for incremental-vacuum mode. False (zero) otherwise.
d.FieldU32("application_id") // The "Application ID" set by PRAGMA application_id.
d.FieldRawLen("reserved", 160, d.BitBufIsZero()) // Reserved for expansion (160 bits). Must be zero.
d.FieldU32("version_valid_for") // The version-valid-for number.
d.FieldU32("sqlite_version_number") // SQLITE_VERSION_NUMBER.
})

// Use the mapped symbol (65536) when the raw page_size value has one,
// otherwise the raw value itself.
// TODO: nicer API for fallback?
pageSize := pageSizeS.ActualU()
if pageSizeS.Sym != nil {
pageSize = pageSizeS.SymU()
}

d.FieldArray("pages", func(d *decode.D) {
for i := uint64(0); i < databaseSizePages; i++ {
// Byte offset of this page from the start of the file; in-page offsets
// (cell pointers, freeblock offsets) are added to it below.
pageOffset := int64(pageSize) * int64(i)
d.SeekAbs(pageOffset * 8)
// skip header for first page
if i == 0 {
d.SeekRel(100 * 8)
}

d.FieldStruct("page", func(d *decode.D) {
typ := d.FieldU8("type", bTreeTypeMap)
startFreeblocks := d.FieldU16("start_freeblocks") // The two-byte integer at offset 1 gives the start of the first freeblock on the page, or is zero if there are no freeblocks.
pageCells := d.FieldU16("page_cells") // The two-byte integer at offset 3 gives the number of cells on the page.
d.FieldU16("cell_start") // The two-byte integer at offset 5 designates the start of the cell content area. A zero value for this integer is interpreted as 65536.
d.FieldU8("cell_fragments") // The one-byte integer at offset 7 gives the number of fragmented free bytes within the cell content area.
switch typ {
case bTreeIndexInterior,
bTreeTableInterior:
d.FieldU32("right_pointer") // The four-byte page number at offset 8 is the right-most pointer. This value appears in the header of interior b-tree pages only and is omitted from all other pages.
}
// The cell pointer array follows the page header: one two-byte
// in-page offset per cell.
var cellPointers []uint64
d.FieldArray("cells_pointers", func(d *decode.D) {
for i := uint64(0); i < pageCells; i++ {
cellPointers = append(cellPointers, d.FieldU16("pointer"))
}
})
// Follow the freeblock chain until a freeblock reports 0 as next offset.
if startFreeblocks != 0 {
d.FieldArray("freeblocks", func(d *decode.D) {
nextOffset := startFreeblocks
for nextOffset != 0 {
d.SeekAbs((pageOffset + int64(nextOffset)) * 8)
d.FieldStruct("freeblock", func(d *decode.D) {
nextOffset = sqlite3CellFreeblockDecode(d)
})
}
})
}
// Decode each cell at its pointer; the cell layout depends on the
// page type.
d.FieldArray("cells", func(d *decode.D) {
for _, p := range cellPointers {
d.FieldStruct("cell", func(d *decode.D) {
// TODO: SeekAbs with fn later?
d.SeekAbs((pageOffset + int64(p)) * 8)
switch typ {
case bTreeIndexInterior:
// left child page number, payload length, payload
d.FieldU32("left_child")
payLoadLen := d.FieldSFn("payload_len", varintDecode)
d.LenFn(payLoadLen*8, func(d *decode.D) {
d.FieldStruct("payload", sqlite3CellPayloadDecode)
})
case bTreeTableInterior:
// left child page number and rowid key; no payload
d.FieldU32("left_child")
d.FieldSFn("rowid", varintDecode)
case bTreeIndexLeaf:
// payload length, payload
payLoadLen := d.FieldSFn("payload_len", varintDecode)
d.LenFn(payLoadLen*8, func(d *decode.D) {
d.FieldStruct("payload", sqlite3CellPayloadDecode)
})
case bTreeTableLeaf:
// payload length, rowid, payload
payLoadLen := d.FieldSFn("payload_len", varintDecode)
d.FieldSFn("rowid", varintDecode)
d.LenFn(payLoadLen*8, func(d *decode.D) {
d.FieldStruct("payload", sqlite3CellPayloadDecode)
})
}
})
}
})
})
}
})

return nil
}
41 changes: 41 additions & 0 deletions format/sqlite3/sqlite3.jq
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@

# TODO: two columns tables are index tables?
# NOTE: page numbers are 1-based (0 means "no page") while .pages is a 0-based array, hence the -1
# TODO: traverse is wrong somehow
# TODO: chinook.db => [sqlite3_table("Track")] | length => 3496, should be 3503 rows

# Walk the b-tree rooted at $page and output every cell that holds an entry,
# in key order. $root is the decoded sqlite3 root value (used to look up
# child pages by page number; page N is $root.pages[N-1]).
#
# - table b-trees keep all entries in leaf cells; interior cells only hold
#   child pointers and are not output.
# - index b-trees keep entries in interior cells as well as leaf cells, so
#   each interior cell is output right after the subtree of its left child.
# - pages of any other type output nothing.
def sqlite3_traverse($root; $page):
  def _t:
    if .type == "table_interior" then
      ( $root.pages[.cells[].left_child-1, .right_pointer-1]
      | _t
      )
    elif .type == "index_interior" then
      ( ( .cells[]
        | ( ($root.pages[.left_child-1] | _t)
          , .
          )
        )
      , ($root.pages[.right_pointer-1] | _t)
      )
    elif .type == "table_leaf" or .type == "index_leaf" then
      .cells[]
    else
      empty
    end;
  ( $page
  | _t
  );

# Output the record contents (array of column values) of every row in the
# table named $name, in b-tree key order. Input is the decoded sqlite3 root.
#
# The table's root page is taken from the first schema row (schema b-tree is
# rooted at page 1, i.e. .pages[0]) whose type is "table" and whose tbl_name
# is $name; schema columns are [type, name, tbl_name, rootpage, sql].
# The table b-tree is walked directly, so tables without any index work too.
# Outputs nothing if no such table exists.
def sqlite3_table($name):
  ( . as $root
  | first(
      ( sqlite3_traverse($root; $root.pages[0])
      | select(.payload.contents | .[0] == "table" and .[2] == $name)
      )
    ) as $table_start_cell
  | sqlite3_traverse($root; $root.pages[$table_start_cell.payload.contents[3]-1])
  | .payload.contents
  );
Binary file added format/sqlite3/testdata/test.db
Binary file not shown.
3 changes: 3 additions & 0 deletions format/sqlite3/testdata/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Regenerates test.db from test.sql. Run from the testdata directory.

set -e

# Start from an empty database: feeding test.sql into an existing test.db
# would hit errors for the already existing table and rows.
rm -f test.db
sqlite3 test.db < test.sql
18 changes: 18 additions & 0 deletions format/sqlite3/testdata/test.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
-- Fixture for the sqlite3 decoder; test.sh feeds this file to sqlite3 to
-- build test.db (regenerate test.db after editing this file).
--
-- Integer values cover 0, 1, 128, -128 and both 64-bit extremes.
-- Blob values are written as X'..' literals (the ASCII bytes of "blobN"):
-- a double-quoted "blobN" is an identifier in SQL which SQLite merely falls
-- back to reading as a text string, so it would never store a blob.
CREATE TABLE aaa (
  cint int primary key,
  cvarchar varchar(30),
  ctext text,
  creal real,
  cblob blob
);
INSERT INTO "aaa" VALUES(0, 'var1', 'text1', 0, X'626c6f6231');
INSERT INTO "aaa" VALUES(1, 'var2', 'test2', 1, X'626c6f6232');
INSERT INTO "aaa" VALUES(128, 'var3', 'test3', 128, X'626c6f6233');
INSERT INTO "aaa" VALUES(-128, 'var3', 'test3', -128, X'626c6f6233');
INSERT INTO "aaa" VALUES(9223372036854775807, 'var4', 'test4', 9223372036854775807, X'626c6f6234');
INSERT INTO "aaa" VALUES(-9223372036854775808, 'var5', 'test5', -9223372036854775808, X'626c6f6235');

-- CREATE TABLE aaa (
-- cint int primary key
-- );
-- INSERT INTO "aaa" VALUES(123);

0 comments on commit 9959d59

Please sign in to comment.