From afed805fefcb4f737ac25db607b0d856e10eb4ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 13:32:33 -0800 Subject: [PATCH 01/21] /{.gitignore,AGENTS.md}: ignore beads stuff --- .gitignore | 5 ++++- AGENTS.md | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 AGENTS.md diff --git a/.gitignore b/.gitignore index b333592153c..1e6b24365d0 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,7 @@ integration-tests/bats/batsee_results CLAUDE.md *~ -.dir-locals.el \ No newline at end of file +.dir-locals.el +.beads +.gitattributes + diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000000..8b696b78726 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,26 @@ +# Agent Instructions + +This project uses **bd** (beads) for issue tracking. Run `bd onboard` to get started. + +## Quick Reference + +```bash +bd ready # Find available work +bd show # View issue details +bd update --status in_progress # Claim work +bd close # Complete work +bd sync # Sync with git +``` + +## Landing the Plane (Session Completion) + +**When ending a work session**, you MUST complete ALL steps below. Work is NOT complete until `git push` succeeds. + +**MANDATORY WORKFLOW:** + +1. **File issues for remaining work** - Create issues for anything that needs follow-up +2. **Run quality gates** (if code changed) - Tests, linters, builds +3. **Update issue status** - Close finished work, update in-progress items +4. **Clean up** - Clear stashes, prune remote branches +5. **Hand off** - Provide context for next session + From 96f9edc6b3a733d2fd0c4d9e441b752e8d3aba84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 14:05:27 -0800 Subject: [PATCH 02/21] /GITBLOBSTORE_DESIGN.md: design doc --- GITBLOBSTORE_DESIGN.md | 314 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 GITBLOBSTORE_DESIGN.md diff --git a/GITBLOBSTORE_DESIGN.md b/GITBLOBSTORE_DESIGN.md new file mode 100644 index 00000000000..fd62a4b832f --- /dev/null +++ b/GITBLOBSTORE_DESIGN.md @@ -0,0 +1,314 @@ +# GitBlobstore design (Git object DB-backed `Blobstore`) + +## Summary + +This document proposes a new `Blobstore` implementation, **`GitBlobstore`**, whose backing store is a **git repository’s object database** (a bare repo or `.git` directory). `GitBlobstore` is intended to eventually enable using **Git remotes as Dolt remotes** by storing Dolt’s NBS artifacts as git objects, **without creating a working tree checkout**. + +Key constraints: + +- Must implement the existing `go/store/blobstore/blobstore.go` `Blobstore` interface (no interface changes). +- Must be compatible with NBS usage patterns (`go/store/nbs/*`) and the existing blobstore test suite (`go/store/blobstore/blobstore_test.go`). +- Must operate using only the `.git` directory / bare repo and **manipulate git objects directly** (blobs/trees/commits/refs). No checkout. +- Initial ref to use: **`refs/dolt/data`**. + +## Background: current `Blobstore` expectations beyond the interface + +Although `Blobstore`’s interface is small, the rest of the codebase imposes several **implicit behavioral requirements**. + +### Where `Blobstore` is used + +`Blobstore` is used by NBS when you construct a store via: + +- `nbs.NewBSStore(ctx, ..., bs, ...)` (full conjoin path, requires `Concatenate`) +- `nbs.NewNoConjoinBSStore(ctx, ..., bs, ...)` (OCI-like path, no conjoin/compose) + +and for the NBS manifest: + +- `go/store/nbs/bs_manifest.go` stores the manifest as a blob named `"manifest"` using `Get` + `CheckAndPut`. + +### Required semantics (tests + NBS) + +#### 1) Missing keys must return `blobstore.NotFound` + +`blobstore_test.go` checks missing-key behavior with `IsNotFoundError(err)` which relies on `err.(NotFound)` type assertion (`go/store/blobstore/errors.go`). + +`GitBlobstore.Get()` must return `NotFound{Key: key}` (or with a more descriptive Key string) when the blob does not exist. + +#### 2) `Get()` must implement `BlobRange` correctly, including negative offsets + +`blobstore_test.go` validates: + +- `AllRange` returns the entire blob +- `NewBlobRange(offset, length)` returns exactly the requested bytes +- `NewBlobRange(-n, 0)` returns the last `n` bytes +- `NewBlobRange(-n, m)` returns a slice from the tail + +NBS relies on these forms heavily to read table indices/footers: + +- `NewBlobRange(-N, 0)` to read the last `N` bytes of table files (index/footer) +- range reads to implement `ReadAtWithStats` for table readers + +#### 3) `Get()` must return the **total blob size** as `size` + +NBS code uses the `size` returned by `Get()` (even for a range request) as the full object size, e.g. archive readers use it as the total file size. + +For range requests, `size` must be the full object size, not the response byte count. + +#### 4) `Put()` and `Get()` versions must be consistent + +`blobstore_test.go` verifies that: + +- `Put()` returns a version string +- `Get()` returns the same version string immediately after the `Put()` + +#### 5) `CheckAndPut()` must provide real CAS behavior under concurrency + +`blobstore_test.go` has a concurrent read/modify/write loop that: + +1. calls `Get()` to read bytes and a version +2. calls `CheckAndPut(expectedVersion=thatVersion, ...)` + +Under contention, `CheckAndPut()` must: + +- fail with a `blobstore.CheckAndPutError` (recognized via `IsCheckAndPutError`) +- succeed only when the expected version matches the current version + +`CheckAndPut(expectedVersion="")` must also support create-if-absent semantics (the tests call it on a missing key). + +#### 6) `Concatenate()` is required for the full NBS blobstore persister + +NBS’s `blobstorePersister` (`go/store/nbs/bs_persister.go`) persists tables by writing: + +- `.records` +- `.tail` + +then calling: + +- `bs.Concatenate(name, []string{name+".records", name+".tail"})` + +Conjoin operations also call `Concatenate` to compose record-only sub-objects. + +If a blobstore cannot implement `Concatenate`, it must be used with `NewNoConjoinBSStore`, and conjoin is disabled (as is done for OCI). + +## Concept: representing a Blobstore inside git + +The `GitBlobstore` will map the blobstore keyspace onto the **tree of a commit** referenced by `refs/dolt/data`. + +- The ref `refs/dolt/data` points to a commit. +- The commit’s tree is the “directory” holding all blobstore keys. +- Each blobstore `key` is a **path** in that tree; its contents are a git **blob** object. + +This gives us: + +- immutable content-addressed blob objects for data +- an append-only history of updates (commits) +- a single ref head that can serve as a “version” for CAS updates + +No working tree is required; we only read/write objects and update refs. + +## Key design decisions + +### Ref name + +Use: + +- `refs/dolt/data` + +as the authoritative reference for the blobstore. + +### Version string + +Define the blobstore “version” as: + +- the **commit hash** (hex object id) currently pointed to by `refs/dolt/data` + +Rationale: + +- it is globally consistent across keys (a snapshot version) +- it naturally supports CAS: update ref from old commit → new commit +- it mirrors object-store generation/etag semantics as a stable identifier + +Implications: + +- `Get()` returns the commit hash of `refs/dolt/data` at the time of lookup. +- `Put()` returns the new commit hash it wrote. +- `CheckAndPut()` compares expected commit hash to current ref commit hash. + +### Atomic CAS + +`CheckAndPut()` must be atomic: it must only update if the ref still points to the expected commit id. + +Implementation requirement: + +- update `refs/dolt/data` with a compare-and-swap on the old object id + +This can be achieved by: + +- using git plumbing (`update-ref `) which is designed to be atomic, or +- implementing direct ref updates with proper locking and packed-refs handling (more complex). + +### Key/path validation + +Blobstore keys are treated as paths in the git tree. We must prevent traversal or invalid path components. + +Rules: + +- reject keys that are absolute (`/…`) or contain `..` path segments +- reject NUL bytes +- normalize path separators to `/` (git tree paths are `/`) +- optionally, restrict keys to a conservative character set if desired (not required initially) + +### `.bs` extension + +Unlike `LocalBlobstore` which appends `.bs` on disk, `GitBlobstore` should store keys **exactly as provided** (e.g. `manifest`, ``, `.records`), matching other remote blobstores (GCS/OCI/OSS) which also store by key directly. + +## Interface mapping + +Below is the planned behavior for each `Blobstore` method. + +### `Path() string` + +Return a stable identifier for logging/debugging, e.g.: + +- `@refs/dolt/data` + +### `Exists(ctx, key) (bool, error)` + +- Resolve `refs/dolt/data` to a commit (if the ref does not exist, return `(false, nil)`). +- Read the commit tree and look up the path for `key`. +- Return true if present and is a blob, else false. + +### `Get(ctx, key, br) (rc, size, version, err)` + +Steps: + +1. Resolve `refs/dolt/data` → commit id (`version`). +2. Locate the blob object id at path `key` in that commit’s tree. +3. Determine `size` (full blob size), even if the caller requests a range. +4. Return an `io.ReadCloser` over the requested range: + - If `br` is `AllRange`, stream full blob. + - If `br.offset < 0`, convert to a positive range using total `size` (same behavior as `BlobRange.positiveRange`). + - If `br.length == 0`, stream from offset to end. + - If `br.length > 0`, stream exactly `length` bytes starting at `offset`. + +Errors: + +- if key is missing: return `NotFound{Key: key}` + +### `Put(ctx, key, totalSize, reader) (version, error)` + +Behavior: + +- Write a new git blob object with the content from `reader`. +- Create a new tree that is the previous tree with path `key` updated to point to that blob. +- Create a new commit and update `refs/dolt/data` to point to it (last-writer-wins). + +Notes: + +- `Put()` does not provide CAS; `CheckAndPut()` is used where CAS is required (manifest updates and concurrent RMW patterns). +- `Put()` should return the new commit hash as `version`. + +### `CheckAndPut(ctx, expectedVersion, key, totalSize, reader) (version, error)` + +CAS semantics: + +- If `expectedVersion != ""`: + - require that `refs/dolt/data` currently points to `expectedVersion` + - if not, return `CheckAndPutError{Key, ExpectedVersion, ActualVersion}` +- If `expectedVersion == ""`: + - implement create-if-absent behavior: + - if `key` exists in the current ref snapshot, return `CheckAndPutError` + - otherwise, proceed + +If check passes: + +- perform the same write as `Put()` (write blob, update tree, commit) +- atomically update the ref from old commit id to new commit id (CAS) + +Return: + +- new commit hash + +### `Concatenate(ctx, key, sources) (version, error)` + +Correctness-first approach: + +- Stream all sources in order and write a new git blob object whose content is their concatenation. +- Update path `key` to point to that new blob. +- Commit and update `refs/dolt/data` (either last-writer-wins, or CAS on current head; CAS is preferred to avoid lost updates if multiple writers are composing concurrently). + +Notes: + +- NBS calls `Concatenate` for `Persist()` and for conjoin; correctness matters more than git-level “compose” optimization. +- This is expected to be efficient enough initially because objects are already local to the `.git` store; it avoids network. + +## NBS compatibility modes + +`GitBlobstore` is expected to implement `Concatenate`, so it can be used with: + +- `nbs.NewBSStore(...)` (full conjoin enabled) + +If at some point we introduce a mode that cannot implement efficient `Concatenate`, then it must be paired with: + +- `nbs.NewNoConjoinBSStore(...)` (conjoin disabled) + +## Performance notes and future work + +### Range reads + +Git does not provide a native “range read” API for blobs. Implementations will likely: + +- read the full decompressed blob stream and slice/limit, or +- optimize tail reads (`offset < 0`) with a ring buffer to avoid buffering whole blobs in memory + +NBS does many tail reads for index/footer, so tail-range optimization is a likely follow-up. + +### Tree updates at scale + +Updating a file path in a git tree requires rebuilding trees along the path. If the blobstore keyspace gets large and flat, tree operations may become hot. + +Potential future optimizations: + +- key sharding (e.g. store keys under `aa/bb/` derived from hash prefixes) +- batching multiple key updates into one commit when possible +- using lower-level object formats or packfile streaming for high write throughput + +### Repository maintenance + +Because each write creates new objects and commits: + +- git GC / repack policy will matter +- prune of unreachable objects may be required in long-running deployments + +These are explicitly out of scope for the first iteration. + +## Dependency / implementation strategy + +This design is compatible with multiple backends: + +- **Git CLI plumbing** (e.g., `git --git-dir cat-file`, `hash-object`, `mktree`, `commit-tree`, `update-ref`) + - Pros: low code, uses battle-tested git object manipulation, no extra Go dependencies + - Cons: runtime dependency on `git`, process overhead + +- **Pure-Go implementation** + - Pros: no external process dependency + - Cons: larger implementation effort; likely requires introducing a library dependency (e.g., `go-git`) or writing object plumbing in-house + +Given the current module does not include a Go git implementation dependency, the initial implementation can reasonably use **git plumbing commands** while still meeting the “no checkout” requirement. + +## Testing plan (expected to pass existing suite) + +`GitBlobstore` should be able to join the existing blobstore tests in `go/store/blobstore/blobstore_test.go`: + +- Put/Get version equality +- NotFound behavior +- CAS correctness (concurrent CheckAndPut test) +- Range read correctness (including negative offsets) +- Concatenate correctness + +Additional recommended tests: + +- behavior when `refs/dolt/data` does not exist yet (bootstrap) +- packed-refs handling (if the ref is packed) +- concurrent writers updating different keys (should not corrupt ref) + From 505e9e5743b75f46642907b30319d792411ba43a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 14:19:24 -0800 Subject: [PATCH 03/21] /GITBLOBSTORE_DESIGN.md: update with git internals info --- GITBLOBSTORE_DESIGN.md | 118 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/GITBLOBSTORE_DESIGN.md b/GITBLOBSTORE_DESIGN.md index fd62a4b832f..4c3c4ea19f7 100644 --- a/GITBLOBSTORE_DESIGN.md +++ b/GITBLOBSTORE_DESIGN.md @@ -296,6 +296,124 @@ This design is compatible with multiple backends: Given the current module does not include a Go git implementation dependency, the initial implementation can reasonably use **git plumbing commands** while still meeting the “no checkout” requirement. +## Git internals and plumbing commands (implementation preview) + +This section previews the **git internals** and a concrete set of **plumbing commands** we expect `GitBlobstore` to use. The approach explicitly avoids a working tree checkout. + +### Git internals we rely on + +- **Objects** live in the object database (ODB) as either: + - **loose** objects under `.git/objects/aa/bb…`, or + - **packed** objects under `.git/objects/pack/*.pack` + `GitBlobstore` must be agnostic to storage format; it should interact through git’s plumbing which abstracts over loose/packed storage. +- **Object types**: + - **blob**: raw file contents (this is the value for a blobstore key) + - **tree**: directory mapping `name -> (mode, type, oid)` (this is the blobstore keyspace index) + - **commit**: points to a root tree + parent(s) + metadata (this is the snapshot “version”) +- **Ref**: + - `refs/dolt/data` points at the current commit snapshot + - updating this ref publishes a new blobstore state + +### Minimal plumbing command set + +Below are the commands we expect to use (exact invocation may vary). + +#### Resolve ref / versions + +- Check ref existence: + - `git --git-dir show-ref --verify --quiet refs/dolt/data` +- Resolve current commit (version): + - `git --git-dir rev-parse --verify refs/dolt/data` + +#### Key existence and path lookup + +- Check if a path exists in a commit tree: + - `git --git-dir cat-file -e :` +- Resolve the blob oid for a path: + - `git --git-dir rev-parse :` + +#### Blob size and blob content streaming + +- Object size (needed so `Get()` can return total blob size even for range requests): + - `git --git-dir cat-file -s ` +- Stream blob contents: + - `git --git-dir cat-file blob ` + +Note: git does not have a native “range read” for blobs; for `BlobRange` we will stream and slice/limit in Go. Tail-range reads (`offset < 0`) can be optimized later with a ring buffer. + +#### Write a blob object + +- Write blob from stdin and get its oid: + - `git --git-dir hash-object -w --stdin` + +This is used by `Put`, `CheckAndPut`, and `Concatenate` (after concatenating sources). + +### Tree updates without a checkout (Approach A: temporary index) + +We prefer **Approach A**: use a *temporary git index file* to stage a tree update without a working directory. + +Core idea: + +- set `GIT_DIR=` +- set `GIT_INDEX_FILE=` +- use `read-tree` + `update-index --cacheinfo` + `write-tree` + +Commands: + +- Initialize the temporary index from the current tree: + - if `refs/dolt/data` exists: + - `git read-tree ^{tree}` + - if the ref is missing (bootstrap): + - `git read-tree --empty` +- Add or replace a path with a specific blob oid: + - `git update-index --add --cacheinfo 100644 ` + - (mode may be `100644` for regular file; no executable bits) +- Write the resulting tree: + - `git write-tree` → outputs `` + +Advantages: + +- avoids manual tree reconstruction for nested paths +- avoids checkout / working tree entirely +- supports deep key paths naturally + +### Commit creation and atomic ref update (CAS) + +- Create a commit object from the new tree: + - `git commit-tree -p -m ` + - (author/committer identity can be supplied via env; we can use a fixed identity initially) +- Atomic compare-and-swap ref update: + - `git update-ref -m "" refs/dolt/data ` + +This `update-ref` form is the primary CAS primitive; it will fail if the ref no longer points to ``. + +Bootstrap note: + +- for “create ref only if absent”, git accepts an “old” value of all-zeros (`000…000`) which enforces that the ref does not exist. + +### Command-to-method mapping (quick reference) + +- `Exists(key)`: + - `rev-parse refs/dolt/data` (if missing → false) + - `cat-file -e :` +- `Get(key, br)`: + - `rev-parse refs/dolt/data` → version + - `rev-parse :` → blob oid + - `cat-file -s ` → total size + - `cat-file blob ` → stream; slice/limit in Go per `BlobRange` +- `Put(key, reader)`: + - `hash-object -w --stdin` → blob oid + - temp index: `read-tree` → `update-index --cacheinfo` → `write-tree` → tree oid + - `commit-tree` → new commit oid + - `update-ref refs/dolt/data ` (non-CAS) +- `CheckAndPut(expectedVersion, key, reader)`: + - same as `Put`, but final step is: + - `update-ref refs/dolt/data ` (CAS) +- `Concatenate(key, sources)`: + - for each source: resolve blob oid and stream `cat-file blob` + - pipe concatenation into `hash-object -w --stdin` + - then same tree/commit/ref update flow as `Put` (CAS preferred) + ## Testing plan (expected to pass existing suite) `GitBlobstore` should be able to join the existing blobstore tests in `go/store/blobstore/blobstore_test.go`: From f42421c332797d5a26d203df37c5be260528a56e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 14:24:36 -0800 Subject: [PATCH 04/21] /go/store/blobstore/internal/git/runner.go: internal git runner --- go/store/blobstore/internal/git/runner.go | 255 ++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 go/store/blobstore/internal/git/runner.go diff --git a/go/store/blobstore/internal/git/runner.go b/go/store/blobstore/internal/git/runner.go new file mode 100644 index 00000000000..0448887323e --- /dev/null +++ b/go/store/blobstore/internal/git/runner.go @@ -0,0 +1,255 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package git provides helpers for invoking git plumbing commands against a bare +// repository or .git directory without a working tree checkout. +package git + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "os" + "os/exec" + "strings" +) + +const maxCapturedOutputBytes = 64 * 1024 + +// Runner executes git commands with GIT_DIR set (and optionally GIT_INDEX_FILE). +// It is intended for git plumbing usage and should not require a working tree. +type Runner struct { + gitPath string + gitDir string + // extraEnv is appended to os.Environ() for every command. + extraEnv []string +} + +// NewRunner creates a Runner using the git binary on PATH. +func NewRunner(gitDir string) (*Runner, error) { + p, err := exec.LookPath("git") + if err != nil { + return nil, fmt.Errorf("git not found on PATH: %w", err) + } + return NewRunnerWithGitPath(gitDir, p), nil +} + +// NewRunnerWithGitPath creates a Runner using an explicit git binary path. +func NewRunnerWithGitPath(gitDir, gitPath string) *Runner { + return &Runner{ + gitPath: gitPath, + gitDir: gitDir, + } +} + +// WithExtraEnv returns a copy of r that appends env entries (e.g. "K=V") to all commands. +func (r *Runner) WithExtraEnv(env ...string) *Runner { + cp := *r + cp.extraEnv = append(append([]string(nil), r.extraEnv...), env...) + return &cp +} + +// RunOptions control a single git invocation. +type RunOptions struct { + // Dir is the working directory for the git process. Optional. + Dir string + // IndexFile sets GIT_INDEX_FILE for the git process. Optional. + IndexFile string + // Stdin provides stdin to the git process. Optional. + Stdin io.Reader + // Stdout and Stderr override output destinations. If both are nil, output is captured and returned. + Stdout io.Writer + Stderr io.Writer + // Env is appended to the process environment. + Env []string +} + +// CmdError represents a failed git invocation with captured output. +type CmdError struct { + Args []string + Dir string + ExitCode int + Output []byte + Cause error +} + +func (e *CmdError) Error() string { + var b strings.Builder + b.WriteString("git command failed") + if e.ExitCode != 0 { + b.WriteString(fmt.Sprintf(" (exit %d)", e.ExitCode)) + } + if len(e.Args) > 0 { + b.WriteString("\ncommand: git ") + b.WriteString(strings.Join(e.Args, " ")) + } + if e.Dir != "" { + b.WriteString("\ndir: ") + b.WriteString(e.Dir) + } + if len(e.Output) > 0 { + b.WriteString("\noutput:\n") + b.WriteString(formatOutput(e.Output)) + } else { + b.WriteString("\noutput:\n(no output)") + } + if e.Cause != nil { + b.WriteString("\nerror: ") + b.WriteString(e.Cause.Error()) + } + return b.String() +} + +func (e *CmdError) Unwrap() error { return e.Cause } + +// Run executes "git " with GIT_DIR set and returns captured combined output +// when Stdout/Stderr are not supplied. +func (r *Runner) Run(ctx context.Context, opts RunOptions, args ...string) ([]byte, error) { + cmd := exec.CommandContext(ctx, r.gitPath, args...) //nolint:gosec // args are controlled by caller; used for internal plumbing. + if opts.Dir != "" { + cmd.Dir = opts.Dir + } + cmd.Env = r.env(opts) + + if opts.Stdin != nil { + cmd.Stdin = opts.Stdin + } + + // Capture combined output unless caller provided destinations. + var buf bytes.Buffer + if opts.Stdout == nil && opts.Stderr == nil { + cmd.Stdout = &buf + cmd.Stderr = &buf + } else { + if opts.Stdout != nil { + cmd.Stdout = opts.Stdout + } + if opts.Stderr != nil { + cmd.Stderr = opts.Stderr + } else if opts.Stdout != nil { + // Reasonable default: if only Stdout is set, send stderr there too. + cmd.Stderr = opts.Stdout + } + } + + err := cmd.Run() + out := buf.Bytes() + if err == nil { + return out, nil + } + + exitCode := 0 + var ee *exec.ExitError + if errors.As(err, &ee) { + exitCode = ee.ExitCode() + } + return out, &CmdError{ + Args: append([]string(nil), args...), + Dir: cmd.Dir, + ExitCode: exitCode, + Output: append([]byte(nil), out...), + Cause: err, + } +} + +// Start starts "git " and returns a ReadCloser for stdout. +// The caller must call Wait() on the returned Cmd to release resources. +func (r *Runner) Start(ctx context.Context, opts RunOptions, args ...string) (io.ReadCloser, *exec.Cmd, error) { + cmd := exec.CommandContext(ctx, r.gitPath, args...) //nolint:gosec // args are controlled by caller; used for internal plumbing. + if opts.Dir != "" { + cmd.Dir = opts.Dir + } + cmd.Env = r.env(opts) + if opts.Stdin != nil { + cmd.Stdin = opts.Stdin + } + + stdout, err := cmd.StdoutPipe() + if err != nil { + return nil, nil, err + } + // Capture stderr into a buffer so failures have actionable output. + var stderr bytes.Buffer + cmd.Stderr = &stderr + + if err := cmd.Start(); err != nil { + _ = stdout.Close() + return nil, nil, err + } + + // Wrap stdout so that Close also waits to avoid zombies if callers bail early. + rc := &cmdReadCloser{ + r: stdout, + cmd: cmd, + stderr: &stderr, + args: append([]string(nil), args...), + dir: cmd.Dir, + } + return rc, cmd, nil +} + +type cmdReadCloser struct { + r io.ReadCloser + cmd *exec.Cmd + stderr *bytes.Buffer + args []string + dir string +} + +func (c *cmdReadCloser) Read(p []byte) (int, error) { return c.r.Read(p) } + +func (c *cmdReadCloser) Close() error { + _ = c.r.Close() + err := c.cmd.Wait() + if err == nil { + return nil + } + exitCode := 0 + var ee *exec.ExitError + if errors.As(err, &ee) { + exitCode = ee.ExitCode() + } + return &CmdError{ + Args: c.args, + Dir: c.dir, + ExitCode: exitCode, + Output: append([]byte(nil), c.stderr.Bytes()...), + Cause: err, + } +} + +func (r *Runner) env(opts RunOptions) []string { + env := append([]string(nil), os.Environ()...) + env = append(env, "GIT_DIR="+r.gitDir) + if opts.IndexFile != "" { + env = append(env, "GIT_INDEX_FILE="+opts.IndexFile) + } + env = append(env, r.extraEnv...) + env = append(env, opts.Env...) + return env +} + +func formatOutput(out []byte) string { + if len(out) == 0 { + return "(no output)" + } + if len(out) <= maxCapturedOutputBytes { + return strings.TrimRight(string(out), "\n") + } + trimmed := out[len(out)-maxCapturedOutputBytes:] + return fmt.Sprintf("... (truncated; showing last %d bytes)\n%s", maxCapturedOutputBytes, strings.TrimRight(string(trimmed), "\n")) +} + From 418a46b9031266444e0b8438278999fee0f0c263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 14:28:26 -0800 Subject: [PATCH 05/21] /go/store/blobstore/internal/git: wip, add read api --- go/store/blobstore/internal/git/errors.go | 61 ++++++++ go/store/blobstore/internal/git/read.go | 174 ++++++++++++++++++++++ 2 files changed, 235 insertions(+) create mode 100644 go/store/blobstore/internal/git/errors.go create mode 100644 go/store/blobstore/internal/git/read.go diff --git a/go/store/blobstore/internal/git/errors.go b/go/store/blobstore/internal/git/errors.go new file mode 100644 index 00000000000..55771670f93 --- /dev/null +++ b/go/store/blobstore/internal/git/errors.go @@ -0,0 +1,61 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package git + +import "fmt" + +// RefNotFoundError indicates that a ref (e.g. refs/dolt/data) could not be resolved. +type RefNotFoundError struct { + Ref string +} + +func (e RefNotFoundError) Error() string { + return fmt.Sprintf("git ref not found: %s", e.Ref) +} + +// PathNotFoundError indicates that a tree path could not be resolved within a commit. +type PathNotFoundError struct { + Commit string + Path string +} + +func (e PathNotFoundError) Error() string { + return fmt.Sprintf("git path not found: %s:%s", e.Commit, e.Path) +} + +// NotBlobError indicates that a resolved path did not refer to a blob object. +type NotBlobError struct { + Commit string + Path string + Type string +} + +func (e NotBlobError) Error() string { + if e.Type == "" { + return fmt.Sprintf("git path is not a blob: %s:%s", e.Commit, e.Path) + } + return fmt.Sprintf("git path is not a blob (%s): %s:%s", e.Type, e.Commit, e.Path) +} + +func IsRefNotFound(err error) bool { + _, ok := err.(RefNotFoundError) + return ok +} + +func IsPathNotFound(err error) bool { + _, ok := err.(PathNotFoundError) + return ok +} + diff --git a/go/store/blobstore/internal/git/read.go b/go/store/blobstore/internal/git/read.go new file mode 100644 index 00000000000..28479945cdc --- /dev/null +++ b/go/store/blobstore/internal/git/read.go @@ -0,0 +1,174 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package git + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "strconv" + "strings" +) + +// OID is a git object id in hex (typically 40-char SHA1). +type OID string + +func (o OID) String() string { return string(o) } + +// TryResolveRefCommit resolves |ref| to a commit OID. Returns ok=false if the ref does not exist. +func TryResolveRefCommit(ctx context.Context, r *Runner, ref string) (oid OID, ok bool, err error) { + out, err := r.Run(ctx, RunOptions{}, "rev-parse", "--verify", "--quiet", ref+"^{commit}") + if err == nil { + s := strings.TrimSpace(string(out)) + if s == "" { + // Shouldn't happen, but treat as missing. + return "", false, nil + } + return OID(s), true, nil + } + + if isRefNotFoundErr(err) { + return "", false, nil + } + return "", false, err +} + +// ResolveRefCommit resolves |ref| to a commit OID. +func ResolveRefCommit(ctx context.Context, r *Runner, ref string) (OID, error) { + oid, ok, err := TryResolveRefCommit(ctx, r, ref) + if err != nil { + return "", err + } + if !ok { + return "", RefNotFoundError{Ref: ref} + } + return oid, nil +} + +// ResolvePathBlob resolves |path| within |commit| to a blob OID. +// It returns PathNotFoundError if the path does not exist. +func ResolvePathBlob(ctx context.Context, r *Runner, commit OID, path string) (OID, error) { + spec := commit.String() + ":" + path + out, err := r.Run(ctx, RunOptions{}, "rev-parse", "--verify", spec) + if err != nil { + if isPathNotFoundErr(err) { + return "", PathNotFoundError{Commit: commit.String(), Path: path} + } + return "", err + } + oid := strings.TrimSpace(string(out)) + if oid == "" { + return "", fmt.Errorf("git rev-parse returned empty oid for %q", spec) + } + + typ, err := CatFileType(ctx, r, OID(oid)) + if err != nil { + return "", err + } + if typ != "blob" { + return "", NotBlobError{Commit: commit.String(), Path: path, Type: typ} + } + return OID(oid), nil +} + +// CatFileType returns the git object type for |oid| (e.g. "blob", "tree", "commit"). +func CatFileType(ctx context.Context, r *Runner, oid OID) (string, error) { + out, err := r.Run(ctx, RunOptions{}, "cat-file", "-t", oid.String()) + if err != nil { + return "", err + } + return strings.TrimSpace(string(out)), nil +} + +// BlobSize returns the size in bytes of the blob object |oid|. +func BlobSize(ctx context.Context, r *Runner, oid OID) (int64, error) { + out, err := r.Run(ctx, RunOptions{}, "cat-file", "-s", oid.String()) + if err != nil { + return 0, err + } + s := strings.TrimSpace(string(out)) + n, err := strconv.ParseInt(s, 10, 64) + if err != nil { + return 0, fmt.Errorf("git cat-file -s parse error (%q): %w", s, err) + } + return n, nil +} + +// BlobReader returns a reader for blob contents. The returned ReadCloser will wait for +// the git process to exit when closed, returning a CmdError if the process fails. +func BlobReader(ctx context.Context, r *Runner, oid OID) (io.ReadCloser, error) { + rc, _, err := r.Start(ctx, RunOptions{}, "cat-file", "blob", oid.String()) + return rc, err +} + +func isRefNotFoundErr(err error) bool { + ce, ok := err.(*CmdError) + if !ok { + return false + } + // For `git rev-parse --verify --quiet ^{commit}`, a missing ref typically yields exit 1 and no output. + if ce.ExitCode == 1 && len(bytes.TrimSpace(ce.Output)) == 0 { + return true + } + // Some git versions may still emit "fatal: Needed a single revision" without --quiet; keep a defensive check. + msg := strings.ToLower(string(ce.Output)) + return strings.Contains(msg, "needed a single revision") || + strings.Contains(msg, "unknown revision") || + strings.Contains(msg, "not a valid object name") +} + +func isPathNotFoundErr(err error) bool { + ce, ok := err.(*CmdError) + if !ok { + return false + } + if ce.ExitCode == 128 || ce.ExitCode == 1 { + msg := strings.ToLower(string(ce.Output)) + // Common patterns: + // - "fatal: Path 'x' does not exist in 'HEAD'" + // - "fatal: invalid object name 'HEAD:x'" + // - "fatal: ambiguous argument '...': unknown revision or path not in the working tree." + if strings.Contains(msg, "does not exist in") || + strings.Contains(msg, "invalid object name") || + strings.Contains(msg, "unknown revision or path not in the working tree") { + return true + } + } + return false +} + +// ReadAllBytes is a small helper for read-path callers that want a whole object. +// This is not used by GitBlobstore.Get (which must support BlobRange), but it is useful in tests. +func ReadAllBytes(ctx context.Context, r *Runner, oid OID) ([]byte, error) { + rc, err := BlobReader(ctx, r, oid) + if err != nil { + return nil, err + } + defer rc.Close() + return io.ReadAll(rc) +} + +// NormalizeGitPlumbingError unwraps CmdError wrappers, returning the underlying error. +// Mostly useful for callers that want to compare against context cancellation. +func NormalizeGitPlumbingError(err error) error { + var ce *CmdError + if errors.As(err, &ce) && ce.Cause != nil { + return ce.Cause + } + return err +} + From 6ef9931a44ffb26702329a7cf3915aa412636e9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 14:32:15 -0800 Subject: [PATCH 06/21] /go/store/blobstore/internal/git: wip, add unimplemented write api --- go/store/blobstore/internal/git/errors.go | 9 +- go/store/blobstore/internal/git/write.go | 105 ++++++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 go/store/blobstore/internal/git/write.go diff --git a/go/store/blobstore/internal/git/errors.go b/go/store/blobstore/internal/git/errors.go index 55771670f93..dad8836c46b 100644 --- a/go/store/blobstore/internal/git/errors.go +++ b/go/store/blobstore/internal/git/errors.go @@ -14,7 +14,14 @@ package git -import "fmt" +import ( + "errors" + "fmt" +) + +// ErrUnimplemented is returned by stubbed write-path APIs. It is intentionally +// exported so higher layers (e.g. GitBlobstore) can wrap or match it. +var ErrUnimplemented = errors.New("unimplemented") // RefNotFoundError indicates that a ref (e.g. refs/dolt/data) could not be resolved. type RefNotFoundError struct { diff --git a/go/store/blobstore/internal/git/write.go b/go/store/blobstore/internal/git/write.go new file mode 100644 index 00000000000..8590acb24c6 --- /dev/null +++ b/go/store/blobstore/internal/git/write.go @@ -0,0 +1,105 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package git + +import ( + "context" + "fmt" +) + +// WriteAPI defines the git plumbing operations needed for Approach A (temporary index +// via GIT_INDEX_FILE) to perform updates without a working tree checkout. +// +// This file intentionally does not implement these operations yet; the current +// GitBlobstore milestone is read-only. All methods on the default implementation +// return ErrUnimplemented. +type WriteAPI interface { + // ReadTree populates |indexFile| with the entries from |commit|'s root tree. + // Equivalent plumbing: + // GIT_DIR=... GIT_INDEX_FILE= git read-tree ^{tree} + ReadTree(ctx context.Context, commit OID, indexFile string) error + + // ReadTreeEmpty initializes |indexFile| to an empty index. + // Equivalent plumbing: + // GIT_DIR=... GIT_INDEX_FILE= git read-tree --empty + ReadTreeEmpty(ctx context.Context, indexFile string) error + + // UpdateIndexCacheInfo adds or replaces |path| in |indexFile| with the given blob |oid| and filemode. + // Equivalent plumbing: + // GIT_DIR=... GIT_INDEX_FILE= git update-index --add --cacheinfo + UpdateIndexCacheInfo(ctx context.Context, indexFile string, mode string, oid OID, path string) error + + // WriteTree writes a tree object from the contents of |indexFile| and returns its oid. + // Equivalent plumbing: + // GIT_DIR=... GIT_INDEX_FILE= git write-tree + WriteTree(ctx context.Context, indexFile string) (OID, error) + + // CommitTree creates a commit object from |tree| with optional |parent| and returns its oid. + // Equivalent plumbing: + // GIT_DIR=... git commit-tree [-p ] -m + CommitTree(ctx context.Context, tree OID, parent *OID, message string, author *Identity) (OID, error) + + // UpdateRefCAS atomically updates |ref| from |old| to |new|. + // Equivalent plumbing: + // GIT_DIR=... git update-ref -m + UpdateRefCAS(ctx context.Context, ref string, newOID OID, oldOID OID, msg string) error + + // UpdateRef updates |ref| to |new| without a compare-and-swap. + // Equivalent plumbing: + // GIT_DIR=... git update-ref -m + UpdateRef(ctx context.Context, ref string, newOID OID, msg string) error +} + +// Identity represents git author/committer metadata. A future implementation +// may set this via environment variables (GIT_AUTHOR_NAME, etc.). +type Identity struct { + Name string + Email string +} + +// UnimplementedWriteAPI is the default write API for the read-only milestone. +// It can be embedded or returned by constructors to make write paths fail fast. +type UnimplementedWriteAPI struct{} + +var _ WriteAPI = UnimplementedWriteAPI{} + +func (UnimplementedWriteAPI) ReadTree(ctx context.Context, commit OID, indexFile string) error { + return fmt.Errorf("%w: ReadTree", ErrUnimplemented) +} + +func (UnimplementedWriteAPI) ReadTreeEmpty(ctx context.Context, indexFile string) error { + return fmt.Errorf("%w: ReadTreeEmpty", ErrUnimplemented) +} + +func (UnimplementedWriteAPI) UpdateIndexCacheInfo(ctx context.Context, indexFile string, mode string, oid OID, path string) error { + return fmt.Errorf("%w: UpdateIndexCacheInfo", ErrUnimplemented) +} + +func (UnimplementedWriteAPI) WriteTree(ctx context.Context, indexFile string) (OID, error) { + return "", fmt.Errorf("%w: WriteTree", ErrUnimplemented) +} + +func (UnimplementedWriteAPI) CommitTree(ctx context.Context, tree OID, parent *OID, message string, author *Identity) (OID, error) { + return "", fmt.Errorf("%w: CommitTree", ErrUnimplemented) +} + +func (UnimplementedWriteAPI) UpdateRefCAS(ctx context.Context, ref string, newOID OID, oldOID OID, msg string) error { + return fmt.Errorf("%w: UpdateRefCAS", ErrUnimplemented) +} + +func (UnimplementedWriteAPI) UpdateRef(ctx context.Context, ref string, newOID OID, msg string) error { + return fmt.Errorf("%w: UpdateRef", ErrUnimplemented) +} + From 089ccfbc99251f703809e8ee3de09b70be7233e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 14:35:05 -0800 Subject: [PATCH 07/21] /go/store/blobstore/git_blobstore.go: wip, support all range --- go/store/blobstore/git_blobstore.go | 117 ++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 go/store/blobstore/git_blobstore.go diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go new file mode 100644 index 00000000000..773637e4c76 --- /dev/null +++ b/go/store/blobstore/git_blobstore.go @@ -0,0 +1,117 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package blobstore + +import ( + "context" + "fmt" + "io" + + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" +) + +// GitBlobstore is a Blobstore implementation backed by a git repository's object +// database (bare repo or .git directory). It stores keys as paths within the tree +// of the commit referenced by a git ref (e.g. refs/dolt/data). +// +// This initial implementation is intentionally READ-ONLY. Write-path methods +// (Put / CheckAndPut / Concatenate) return an explicit unimplemented error while +// we lock down read behavior for manifests and table files. +type GitBlobstore struct { + gitDir string + ref string + runner *git.Runner +} + +var _ Blobstore = (*GitBlobstore)(nil) + +// NewGitBlobstore creates a new read-only GitBlobstore rooted at |gitDir| and |ref|. +// |gitDir| should point at a bare repo directory or a .git directory. +func NewGitBlobstore(gitDir, ref string) (*GitBlobstore, error) { + r, err := git.NewRunner(gitDir) + if err != nil { + return nil, err + } + return &GitBlobstore{gitDir: gitDir, ref: ref, runner: r}, nil +} + +func (gbs *GitBlobstore) Path() string { + return fmt.Sprintf("%s@%s", gbs.gitDir, gbs.ref) +} + +func (gbs *GitBlobstore) Exists(ctx context.Context, key string) (bool, error) { + commit, ok, err := git.TryResolveRefCommit(ctx, gbs.runner, gbs.ref) + if err != nil { + return false, err + } + if !ok { + return false, nil + } + _, err = git.ResolvePathBlob(ctx, gbs.runner, commit, key) + if err != nil { + if git.IsPathNotFound(err) { + return false, nil + } + return false, err + } + return true, nil +} + +func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io.ReadCloser, uint64, string, error) { + commit, ok, err := git.TryResolveRefCommit(ctx, gbs.runner, gbs.ref) + if err != nil { + return nil, 0, "", err + } + if !ok { + return nil, 0, "", NotFound{Key: key} + } + + blobOID, err := git.ResolvePathBlob(ctx, gbs.runner, commit, key) + if err != nil { + if git.IsPathNotFound(err) { + return nil, 0, commit.String(), NotFound{Key: key} + } + return nil, 0, commit.String(), err + } + + sz, err := git.BlobSize(ctx, gbs.runner, blobOID) + if err != nil { + return nil, 0, commit.String(), err + } + + // Range support is implemented in a follow-up task. For now, only AllRange. + if !br.isAllRange() { + return nil, uint64(sz), commit.String(), fmt.Errorf("%w: GitBlobstore.Get supports only AllRange (for now)", git.ErrUnimplemented) + } + + rc, err := git.BlobReader(ctx, gbs.runner, blobOID) + if err != nil { + return nil, 0, commit.String(), err + } + return rc, uint64(sz), commit.String(), nil +} + +func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, reader io.Reader) (string, error) { + return "", fmt.Errorf("%w: GitBlobstore.Put", git.ErrUnimplemented) +} + +func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key string, totalSize int64, reader io.Reader) (string, error) { + return "", fmt.Errorf("%w: GitBlobstore.CheckAndPut", git.ErrUnimplemented) +} + +func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) { + return "", fmt.Errorf("%w: GitBlobstore.Concatenate", git.ErrUnimplemented) +} + From 838a9f453bd8caa7051dae0a83e269e14f86e102 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 14:43:14 -0800 Subject: [PATCH 08/21] /go/store/blobstore/git_blobstore.go: wip, implement ranges --- go/store/blobstore/git_blobstore.go | 47 +++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 773637e4c76..d5995164208 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -91,18 +91,53 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. return nil, 0, commit.String(), err } - // Range support is implemented in a follow-up task. For now, only AllRange. - if !br.isAllRange() { - return nil, uint64(sz), commit.String(), fmt.Errorf("%w: GitBlobstore.Get supports only AllRange (for now)", git.ErrUnimplemented) - } - rc, err := git.BlobReader(ctx, gbs.runner, blobOID) if err != nil { return nil, 0, commit.String(), err } - return rc, uint64(sz), commit.String(), nil + + // Implement BlobRange by slicing the streamed blob contents. + if br.isAllRange() { + return rc, uint64(sz), commit.String(), nil + } + + pos := br.positiveRange(sz) + if pos.offset < 0 || pos.offset > sz { + _ = rc.Close() + return nil, uint64(sz), commit.String(), fmt.Errorf("invalid BlobRange offset %d for blob of size %d", pos.offset, sz) + } + if pos.length < 0 { + _ = rc.Close() + return nil, uint64(sz), commit.String(), fmt.Errorf("invalid BlobRange length %d", pos.length) + } + if pos.length == 0 { + // Read from offset to end. + pos.length = sz - pos.offset + } + // Clamp to end (defensive; positiveRange should already do this). + if pos.offset+pos.length > sz { + pos.length = sz - pos.offset + } + + // Skip to offset. + if pos.offset > 0 { + if _, err := io.CopyN(io.Discard, rc, pos.offset); err != nil { + _ = rc.Close() + return nil, uint64(sz), commit.String(), err + } + } + + return &limitReadCloser{r: io.LimitReader(rc, pos.length), c: rc}, uint64(sz), commit.String(), nil +} + +type limitReadCloser struct { + r io.Reader + c io.Closer } +func (l *limitReadCloser) Read(p []byte) (int, error) { return l.r.Read(p) } +func (l *limitReadCloser) Close() error { return l.c.Close() } + func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, reader io.Reader) (string, error) { return "", fmt.Errorf("%w: GitBlobstore.Put", git.ErrUnimplemented) } From 0b327d0f274f440e8ca4dc0d381be7ca147ae7e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 14:45:20 -0800 Subject: [PATCH 09/21] /go/store/blobstore/git_blobstore.go: wip, normalize paths --- go/store/blobstore/git_blobstore.go | 58 +++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index d5995164208..ff84ff9613f 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -18,6 +18,7 @@ import ( "context" "fmt" "io" + "strings" git "github.com/dolthub/dolt/go/store/blobstore/internal/git" ) @@ -52,6 +53,10 @@ func (gbs *GitBlobstore) Path() string { } func (gbs *GitBlobstore) Exists(ctx context.Context, key string) (bool, error) { + key, err := normalizeGitTreePath(key) + if err != nil { + return false, err + } commit, ok, err := git.TryResolveRefCommit(ctx, gbs.runner, gbs.ref) if err != nil { return false, err @@ -70,6 +75,10 @@ func (gbs *GitBlobstore) Exists(ctx context.Context, key string) (bool, error) { } func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io.ReadCloser, uint64, string, error) { + key, err := normalizeGitTreePath(key) + if err != nil { + return nil, 0, "", err + } commit, ok, err := git.TryResolveRefCommit(ctx, gbs.runner, gbs.ref) if err != nil { return nil, 0, "", err @@ -139,14 +148,63 @@ func (l *limitReadCloser) Read(p []byte) (int, error) { return l.r.Read(p) } func (l *limitReadCloser) Close() error { return l.c.Close() } func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, reader io.Reader) (string, error) { + if _, err := normalizeGitTreePath(key); err != nil { + return "", err + } return "", fmt.Errorf("%w: GitBlobstore.Put", git.ErrUnimplemented) } func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key string, totalSize int64, reader io.Reader) (string, error) { + if _, err := normalizeGitTreePath(key); err != nil { + return "", err + } return "", fmt.Errorf("%w: GitBlobstore.CheckAndPut", git.ErrUnimplemented) } func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) { + if _, err := normalizeGitTreePath(key); err != nil { + return "", err + } + for _, src := range sources { + if _, err := normalizeGitTreePath(src); err != nil { + return "", err + } + } return "", fmt.Errorf("%w: GitBlobstore.Concatenate", git.ErrUnimplemented) } +// normalizeGitTreePath normalizes and validates a blobstore key for use as a git tree path. +// +// Rules: +// - convert Windows-style separators: "\" -> "/" +// - disallow absolute paths (leading "/") +// - disallow empty segments and trailing "/" +// - disallow "." and ".." segments +// - disallow NUL bytes +func normalizeGitTreePath(key string) (string, error) { + if strings.ContainsRune(key, '\x00') { + return "", fmt.Errorf("invalid git blobstore key (NUL byte): %q", key) + } + key = strings.ReplaceAll(key, "\\", "/") + if key == "" { + return "", fmt.Errorf("invalid git blobstore key (empty)") + } + if strings.HasPrefix(key, "/") { + return "", fmt.Errorf("invalid git blobstore key (absolute path): %q", key) + } + + parts := strings.Split(key, "/") + for _, p := range parts { + if p == "" { + return "", fmt.Errorf("invalid git blobstore key (empty path segment): %q", key) + } + if p == "." || p == ".." { + return "", fmt.Errorf("invalid git blobstore key (path traversal): %q", key) + } + if strings.ContainsRune(p, '\x00') { + return "", fmt.Errorf("invalid git blobstore key (NUL byte): %q", key) + } + } + return key, nil +} + From 7b72763b4f0d4b5c6803a37c7ead187668bc455a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 14:51:35 -0800 Subject: [PATCH 10/21] /go/store/testutils/gitrepo: wip, adding tests --- go/store/testutils/gitrepo/gitrepo.go | 210 +++++++++++++++++++++ go/store/testutils/gitrepo/gitrepo_test.go | 44 +++++ 2 files changed, 254 insertions(+) create mode 100644 go/store/testutils/gitrepo/gitrepo.go create mode 100644 go/store/testutils/gitrepo/gitrepo_test.go diff --git a/go/store/testutils/gitrepo/gitrepo.go b/go/store/testutils/gitrepo/gitrepo.go new file mode 100644 index 00000000000..2d80c03c5fb --- /dev/null +++ b/go/store/testutils/gitrepo/gitrepo.go @@ -0,0 +1,210 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package gitrepo contains test helpers for creating and manipulating git repositories +// using plumbing commands without requiring a working tree checkout. +// +// This package is intended for tests of GitBlobstore and related read paths. It +// deliberately uses the git CLI (not a Go git library) to keep the harness small +// and to match how the initial GitBlobstore implementation interacts with git. +package gitrepo + +import ( + "bytes" + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "sort" + "strings" +) + +// Repo is a test-only handle to a bare git repository (its directory is the GIT_DIR). +type Repo struct { + // GitDir is the path to the bare repository directory. + GitDir string +} + +// InitBare initializes a new bare git repository at |dir| (which should not exist yet). +func InitBare(ctx context.Context, dir string) (*Repo, error) { + if err := runGit(ctx, "", "", "", "init", "--bare", dir); err != nil { + return nil, err + } + return &Repo{GitDir: dir}, nil +} + +// InitBareTemp creates and initializes a new bare git repository under |parentDir| +// (or os.TempDir if empty). +func InitBareTemp(ctx context.Context, parentDir string) (*Repo, error) { + if parentDir == "" { + parentDir = os.TempDir() + } + dir, err := os.MkdirTemp(parentDir, "gitrepo-bare-") + if err != nil { + return nil, err + } + // git init --bare expects the target directory to not exist in some versions; + // to avoid that, create a child directory. + bareDir := filepath.Join(dir, "repo.git") + return InitBare(ctx, bareDir) +} + +// SetRefToTree writes a commit whose tree contains |files| and updates |ref| to point at it. +// This is done without a working tree checkout using a temporary index (GIT_INDEX_FILE). +// +// - |ref| example: "refs/dolt/data" +// - |files| keys are tree paths (e.g. "manifest", "a/b/c") +// - |message| becomes the commit message (defaults to "test commit" if empty) +func (r *Repo) SetRefToTree(ctx context.Context, ref string, files map[string][]byte, message string) (commitOID string, err error) { + if message == "" { + message = "test commit" + } + + indexDir, err := os.MkdirTemp("", "gitrepo-index-") + if err != nil { + return "", err + } + defer func() { + _ = os.RemoveAll(indexDir) + }() + + indexFile := filepath.Join(indexDir, "index") + + // Empty index. + if err := runGit(ctx, r.GitDir, indexFile, "", "read-tree", "--empty"); err != nil { + return "", err + } + + // Add paths. Sort for determinism. + paths := make([]string, 0, len(files)) + for p := range files { + paths = append(paths, p) + } + sort.Strings(paths) + + for _, p := range paths { + oid, err := hashObject(ctx, r.GitDir, files[p]) + if err != nil { + return "", err + } + if err := runGit(ctx, r.GitDir, indexFile, "", "update-index", "--add", "--cacheinfo", "100644", oid, p); err != nil { + return "", err + } + } + + treeOID, err := outputGit(ctx, r.GitDir, indexFile, nil, "write-tree") + if err != nil { + return "", err + } + treeOID = strings.TrimSpace(treeOID) + if treeOID == "" { + return "", fmt.Errorf("write-tree returned empty oid") + } + + commitOID, err = outputGit(ctx, r.GitDir, "", commitEnv(), "commit-tree", treeOID, "-m", message) + if err != nil { + return "", err + } + commitOID = strings.TrimSpace(commitOID) + if commitOID == "" { + return "", fmt.Errorf("commit-tree returned empty oid") + } + + if err := runGit(ctx, r.GitDir, "", "", "update-ref", ref, commitOID); err != nil { + return "", err + } + return commitOID, nil +} + +func commitEnv() []string { + // Deterministic-ish author/committer identity for tests. + return []string{ + "GIT_AUTHOR_NAME=gitrepo test", + "GIT_AUTHOR_EMAIL=gitrepo@test.invalid", + "GIT_COMMITTER_NAME=gitrepo test", + "GIT_COMMITTER_EMAIL=gitrepo@test.invalid", + } +} + +func hashObject(ctx context.Context, gitDir string, data []byte) (string, error) { + out, err := outputGitWithStdin(ctx, gitDir, "", "", bytes.NewReader(data), "hash-object", "-w", "--stdin") + if err != nil { + return "", err + } + oid := strings.TrimSpace(out) + if oid == "" { + return "", fmt.Errorf("hash-object returned empty oid") + } + return oid, nil +} + +func runGit(ctx context.Context, gitDir, indexFile string, extraEnv string, args ...string) error { + _, err := outputGit(ctx, gitDir, indexFile, splitEnv(extraEnv), args...) + return err +} + +func outputGit(ctx context.Context, gitDir, indexFile string, extraEnv []string, args ...string) (string, error) { + cmd := exec.CommandContext(ctx, "git", args...) //nolint:gosec // test harness invokes git with controlled args. + cmd.Env = envForGit(gitDir, indexFile, extraEnv) + var buf bytes.Buffer + cmd.Stdout = &buf + cmd.Stderr = &buf + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("%w\ncommand: %s\noutput:\n%s", err, cmd.String(), strings.TrimRight(buf.String(), "\n")) + } + return buf.String(), nil +} + +func outputGitWithStdin(ctx context.Context, gitDir, indexFile string, extraEnv string, stdin *bytes.Reader, args ...string) (string, error) { + cmd := exec.CommandContext(ctx, "git", args...) //nolint:gosec // test harness invokes git with controlled args. + cmd.Env = envForGit(gitDir, indexFile, splitEnv(extraEnv)) + cmd.Stdin = stdin + var buf bytes.Buffer + cmd.Stdout = &buf + cmd.Stderr = &buf + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("%w\ncommand: %s\noutput:\n%s", err, cmd.String(), strings.TrimRight(buf.String(), "\n")) + } + return buf.String(), nil +} + +func envForGit(gitDir, indexFile string, extra []string) []string { + env := append([]string(nil), os.Environ()...) + if gitDir != "" { + env = append(env, "GIT_DIR="+gitDir) + } + if indexFile != "" { + env = append(env, "GIT_INDEX_FILE="+indexFile) + } + env = append(env, extra...) + return env +} + +func splitEnv(extraEnv string) []string { + if extraEnv == "" { + return nil + } + // Allow callers to pass "K=V\nK2=V2" style strings. + lines := strings.Split(extraEnv, "\n") + out := lines[:0] + for _, l := range lines { + l = strings.TrimSpace(l) + if l != "" { + out = append(out, l) + } + } + return out +} + diff --git a/go/store/testutils/gitrepo/gitrepo_test.go b/go/store/testutils/gitrepo/gitrepo_test.go new file mode 100644 index 00000000000..7a5fbc8089d --- /dev/null +++ b/go/store/testutils/gitrepo/gitrepo_test.go @@ -0,0 +1,44 @@ +package gitrepo + +import ( + "context" + "os/exec" + "path/filepath" + "strings" + "testing" +) + +func TestInitBareAndSetRefToTree(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + root := t.TempDir() + bareDir := filepath.Join(root, "repo.git") + + repo, err := InitBare(ctx, bareDir) + if err != nil { + t.Fatalf("InitBare failed: %v", err) + } + + commit, err := repo.SetRefToTree(ctx, "refs/dolt/data", map[string][]byte{ + "manifest": []byte("hello\n"), + "dir/file": []byte("abc"), + "dir/file2": []byte("def"), + "dir2/x.txt": []byte("xyz"), + }, "seed refs/dolt/data") + if err != nil { + t.Fatalf("SetRefToTree failed: %v", err) + } + if len(strings.TrimSpace(commit)) == 0 { + t.Fatalf("expected non-empty commit oid") + } + + // Validate the path exists in the commit. + cmd := exec.CommandContext(ctx, "git", "--git-dir", repo.GitDir, "cat-file", "-e", commit+":manifest") //nolint:gosec + if out, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("cat-file -e failed: %v\n%s", err, string(out)) + } +} + From 363dac9d52e76ec3a2d19bc86c5ffffb9cbab0dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 14:56:06 -0800 Subject: [PATCH 11/21] /go/store/blobstore: wip, gitblobstore tests --- go/store/blobstore/git_blobstore_test.go | 188 +++++++++++++++++++++++ go/store/blobstore/internal/git/read.go | 2 + 2 files changed, 190 insertions(+) create mode 100644 go/store/blobstore/git_blobstore_test.go diff --git a/go/store/blobstore/git_blobstore_test.go b/go/store/blobstore/git_blobstore_test.go new file mode 100644 index 00000000000..3571d583bc7 --- /dev/null +++ b/go/store/blobstore/git_blobstore_test.go @@ -0,0 +1,188 @@ +package blobstore + +import ( + "context" + "os/exec" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/dolthub/dolt/go/store/testutils/gitrepo" +) + +func TestGitBlobstore_RefMissingIsNotFound(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstore(repo.GitDir, "refs/dolt/data") + require.NoError(t, err) + + ok, err := bs.Exists(ctx, "manifest") + require.NoError(t, err) + require.False(t, ok) + + _, _, err = GetBytes(ctx, bs, "manifest", AllRange) + require.Error(t, err) + require.True(t, IsNotFoundError(err)) +} + +func TestGitBlobstore_ExistsAndGet_AllRange(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + want := []byte("hello manifest\n") + commit, err := repo.SetRefToTree(ctx, "refs/dolt/data", map[string][]byte{ + "manifest": want, + "dir/file": []byte("abc"), + }, "seed") + require.NoError(t, err) + + bs, err := NewGitBlobstore(repo.GitDir, "refs/dolt/data") + require.NoError(t, err) + + ok, err := bs.Exists(ctx, "manifest") + require.NoError(t, err) + require.True(t, ok) + + ok, err = bs.Exists(ctx, "missing") + require.NoError(t, err) + require.False(t, ok) + + // Validate key normalization: backslash -> slash. + ok, err = bs.Exists(ctx, "dir\\file") + require.NoError(t, err) + require.True(t, ok) + + got, ver, err := GetBytes(ctx, bs, "manifest", AllRange) + require.NoError(t, err) + require.Equal(t, commit, ver) + require.Equal(t, want, got) + + // Validate size + version on Get. + rc, sz, ver2, err := bs.Get(ctx, "manifest", NewBlobRange(0, 5)) + require.NoError(t, err) + require.Equal(t, uint64(len(want)), sz) + require.Equal(t, commit, ver2) + _ = rc.Close() +} + +func TestGitBlobstore_Get_NotFoundMissingKey(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + _, err = repo.SetRefToTree(ctx, "refs/dolt/data", map[string][]byte{ + "present": []byte("x"), + }, "seed") + require.NoError(t, err) + + bs, err := NewGitBlobstore(repo.GitDir, "refs/dolt/data") + require.NoError(t, err) + + _, _, err = GetBytes(ctx, bs, "missing", AllRange) + require.Error(t, err) + require.True(t, IsNotFoundError(err)) +} + +func TestGitBlobstore_BlobRangeSemantics(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + maxValue := int64(16 * 1024) + testData := rangeData(0, maxValue) + + commit, err := repo.SetRefToTree(ctx, "refs/dolt/data", map[string][]byte{ + "range": testData, + }, "range fixture") + require.NoError(t, err) + + bs, err := NewGitBlobstore(repo.GitDir, "refs/dolt/data") + require.NoError(t, err) + + // full range + got, ver, err := GetBytes(ctx, bs, "range", AllRange) + require.NoError(t, err) + require.Equal(t, commit, ver) + require.Equal(t, rangeData(0, maxValue), got) + + // first 2048 bytes (1024 shorts) + got, ver, err = GetBytes(ctx, bs, "range", NewBlobRange(0, 2048)) + require.NoError(t, err) + require.Equal(t, commit, ver) + require.Equal(t, rangeData(0, 1024), got) + + // bytes 2048..4096 of original + got, ver, err = GetBytes(ctx, bs, "range", NewBlobRange(2*1024, 2*1024)) + require.NoError(t, err) + require.Equal(t, commit, ver) + require.Equal(t, rangeData(1024, 2048), got) + + // last 2048 bytes + got, ver, err = GetBytes(ctx, bs, "range", NewBlobRange(-2*1024, 0)) + require.NoError(t, err) + require.Equal(t, commit, ver) + require.Equal(t, rangeData(maxValue-1024, maxValue), got) + + // tail slice: beginning 2048 bytes from end, size 512 + got, ver, err = GetBytes(ctx, bs, "range", NewBlobRange(-2*1024, 512)) + require.NoError(t, err) + require.Equal(t, commit, ver) + require.Equal(t, rangeData(maxValue-1024, maxValue-768), got) +} + +func TestGitBlobstore_InvalidKeysError(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + _, err = repo.SetRefToTree(ctx, "refs/dolt/data", map[string][]byte{"ok": []byte("x")}, "seed") + require.NoError(t, err) + + bs, err := NewGitBlobstore(repo.GitDir, "refs/dolt/data") + require.NoError(t, err) + + invalid := []string{ + "", + "/abs", + "../x", + "a/../b", + "a//b", + "a/", + ".", + "..", + "a/./b", + "a/\x00/b", + } + + for _, k := range invalid { + _, err := bs.Exists(ctx, k) + require.Error(t, err, "expected error for key %q", k) + + _, _, _, err = bs.Get(ctx, k, AllRange) + require.Error(t, err, "expected error for key %q", k) + } +} + diff --git a/go/store/blobstore/internal/git/read.go b/go/store/blobstore/internal/git/read.go index 28479945cdc..5356ac82b99 100644 --- a/go/store/blobstore/internal/git/read.go +++ b/go/store/blobstore/internal/git/read.go @@ -141,9 +141,11 @@ func isPathNotFoundErr(err error) bool { // Common patterns: // - "fatal: Path 'x' does not exist in 'HEAD'" // - "fatal: invalid object name 'HEAD:x'" + // - "fatal: Needed a single revision" // - "fatal: ambiguous argument '...': unknown revision or path not in the working tree." if strings.Contains(msg, "does not exist in") || strings.Contains(msg, "invalid object name") || + strings.Contains(msg, "needed a single revision") || strings.Contains(msg, "unknown revision or path not in the working tree") { return true } From ecdaff9465637330cda79a93e6533ebeced85873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 15:02:09 -0800 Subject: [PATCH 12/21] /go/store/nbs/git_blobstore_read_smoke_test.go: smoke test for git blobstore --- go/store/nbs/git_blobstore_read_smoke_test.go | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 go/store/nbs/git_blobstore_read_smoke_test.go diff --git a/go/store/nbs/git_blobstore_read_smoke_test.go b/go/store/nbs/git_blobstore_read_smoke_test.go new file mode 100644 index 00000000000..65d9c704665 --- /dev/null +++ b/go/store/nbs/git_blobstore_read_smoke_test.go @@ -0,0 +1,97 @@ +package nbs + +import ( + "bytes" + "context" + "io" + "os/exec" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/dolthub/dolt/go/store/blobstore" + "github.com/dolthub/dolt/go/store/hash" + "github.com/dolthub/dolt/go/store/testutils/gitrepo" + "github.com/dolthub/dolt/go/store/types" +) + +func TestGitBlobstoreReadSmoke_ManifestAndTableAccessPatterns(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + // Seed a valid v5 manifest with no tables. This should allow NBS to open + // without triggering any write paths. + mc := manifestContents{ + nbfVers: types.Format_DOLT.VersionString(), + lock: hash.Of([]byte("lock")), + root: hash.Of([]byte("root")), + gcGen: hash.Of([]byte("gcgen")), + specs: nil, + } + var buf bytes.Buffer + require.NoError(t, writeManifest(&buf, mc)) + + // Seed a "table-like" blob to exercise the same access patterns NBS uses: + // - tail reads via negative BlobRange offsets + // - ReadAt-style ranged reads (ReadAtWithStats) + table := make([]byte, 64*1024) + for i := range table { + table[i] = byte(i % 251) + } + + commit, err := repo.SetRefToTree(ctx, "refs/dolt/data", map[string][]byte{ + "manifest": buf.Bytes(), + "table": table, + }, "seed refs/dolt/data for smoke test") + require.NoError(t, err) + require.NotEmpty(t, commit) + + bs, err := blobstore.NewGitBlobstore(repo.GitDir, "refs/dolt/data") + require.NoError(t, err) + + // 1) Manifest read path via blobstoreManifest.ParseIfExists. + stats := NewStats() + exists, got, err := blobstoreManifest{bs: bs}.ParseIfExists(ctx, stats, nil) + require.NoError(t, err) + require.True(t, exists) + require.Equal(t, mc.nbfVers, got.nbfVers) + require.Equal(t, mc.root, got.root) + require.Equal(t, mc.lock, got.lock) + require.Equal(t, mc.gcGen, got.gcGen) + require.Len(t, got.specs, 0) + + // 2) Tail-read pattern used by table index/footer loads: + // bs.Get(key, NewBlobRange(-N, 0)) and io.ReadFull. + const tailN = 1024 + rc, totalSz, ver, err := bs.Get(ctx, "table", blobstore.NewBlobRange(-tailN, 0)) + require.NoError(t, err) + require.Equal(t, uint64(len(table)), totalSz) + require.Equal(t, commit, ver) + tail := make([]byte, tailN) + _, err = io.ReadFull(rc, tail) + require.NoError(t, err) + require.NoError(t, rc.Close()) + require.Equal(t, table[len(table)-tailN:], tail) + + // 3) ReadAt-style ranged reads used by table readers. + tr := &bsTableReaderAt{bs: bs, key: "table"} + out := make([]byte, 4096) + n, err := tr.ReadAtWithStats(ctx, out, 1234, stats) + require.NoError(t, err) + require.Equal(t, len(out), n) + require.Equal(t, table[1234:1234+int64(len(out))], out) + + // Near-end reads should return short read without error. + out2 := make([]byte, 4096) + start := int64(len(table) - 100) + n, err = tr.ReadAtWithStats(ctx, out2, start, stats) + require.NoError(t, err) + require.Equal(t, 100, n) + require.Equal(t, table[start:], out2[:n]) +} + From a7fa1f6ca8beb607ef85e5d4fccce8ccd011d9b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 15:03:40 -0800 Subject: [PATCH 13/21] /go/{store, utils}: formatting --- go/store/blobstore/git_blobstore.go | 1 - go/store/blobstore/git_blobstore_test.go | 1 - go/store/blobstore/internal/git/errors.go | 1 - go/store/blobstore/internal/git/read.go | 1 - go/store/blobstore/internal/git/runner.go | 1 - go/store/blobstore/internal/git/write.go | 1 - go/store/nbs/git_blobstore_read_smoke_test.go | 1 - go/store/testutils/gitrepo/gitrepo.go | 1 - go/store/testutils/gitrepo/gitrepo_test.go | 1 - go/utils/repofmt/format_repo.sh | 2 +- 10 files changed, 1 insertion(+), 10 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index ff84ff9613f..086316c5629 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -207,4 +207,3 @@ func normalizeGitTreePath(key string) (string, error) { } return key, nil } - diff --git a/go/store/blobstore/git_blobstore_test.go b/go/store/blobstore/git_blobstore_test.go index 3571d583bc7..bed925bb4bd 100644 --- a/go/store/blobstore/git_blobstore_test.go +++ b/go/store/blobstore/git_blobstore_test.go @@ -185,4 +185,3 @@ func TestGitBlobstore_InvalidKeysError(t *testing.T) { require.Error(t, err, "expected error for key %q", k) } } - diff --git a/go/store/blobstore/internal/git/errors.go b/go/store/blobstore/internal/git/errors.go index dad8836c46b..0ad0221fc30 100644 --- a/go/store/blobstore/internal/git/errors.go +++ b/go/store/blobstore/internal/git/errors.go @@ -65,4 +65,3 @@ func IsPathNotFound(err error) bool { _, ok := err.(PathNotFoundError) return ok } - diff --git a/go/store/blobstore/internal/git/read.go b/go/store/blobstore/internal/git/read.go index 5356ac82b99..b27faa3d3e3 100644 --- a/go/store/blobstore/internal/git/read.go +++ b/go/store/blobstore/internal/git/read.go @@ -173,4 +173,3 @@ func NormalizeGitPlumbingError(err error) error { } return err } - diff --git a/go/store/blobstore/internal/git/runner.go b/go/store/blobstore/internal/git/runner.go index 0448887323e..5bec49f45a4 100644 --- a/go/store/blobstore/internal/git/runner.go +++ b/go/store/blobstore/internal/git/runner.go @@ -252,4 +252,3 @@ func formatOutput(out []byte) string { trimmed := out[len(out)-maxCapturedOutputBytes:] return fmt.Sprintf("... (truncated; showing last %d bytes)\n%s", maxCapturedOutputBytes, strings.TrimRight(string(trimmed), "\n")) } - diff --git a/go/store/blobstore/internal/git/write.go b/go/store/blobstore/internal/git/write.go index 8590acb24c6..a3d7a09c1ac 100644 --- a/go/store/blobstore/internal/git/write.go +++ b/go/store/blobstore/internal/git/write.go @@ -102,4 +102,3 @@ func (UnimplementedWriteAPI) UpdateRefCAS(ctx context.Context, ref string, newOI func (UnimplementedWriteAPI) UpdateRef(ctx context.Context, ref string, newOID OID, msg string) error { return fmt.Errorf("%w: UpdateRef", ErrUnimplemented) } - diff --git a/go/store/nbs/git_blobstore_read_smoke_test.go b/go/store/nbs/git_blobstore_read_smoke_test.go index 65d9c704665..5b1063e0d21 100644 --- a/go/store/nbs/git_blobstore_read_smoke_test.go +++ b/go/store/nbs/git_blobstore_read_smoke_test.go @@ -94,4 +94,3 @@ func TestGitBlobstoreReadSmoke_ManifestAndTableAccessPatterns(t *testing.T) { require.Equal(t, 100, n) require.Equal(t, table[start:], out2[:n]) } - diff --git a/go/store/testutils/gitrepo/gitrepo.go b/go/store/testutils/gitrepo/gitrepo.go index 2d80c03c5fb..c2ed9d78c75 100644 --- a/go/store/testutils/gitrepo/gitrepo.go +++ b/go/store/testutils/gitrepo/gitrepo.go @@ -207,4 +207,3 @@ func splitEnv(extraEnv string) []string { } return out } - diff --git a/go/store/testutils/gitrepo/gitrepo_test.go b/go/store/testutils/gitrepo/gitrepo_test.go index 7a5fbc8089d..aa0cf074db5 100644 --- a/go/store/testutils/gitrepo/gitrepo_test.go +++ b/go/store/testutils/gitrepo/gitrepo_test.go @@ -41,4 +41,3 @@ func TestInitBareAndSetRefToTree(t *testing.T) { t.Fatalf("cat-file -e failed: %v\n%s", err, string(out)) } } - diff --git a/go/utils/repofmt/format_repo.sh b/go/utils/repofmt/format_repo.sh index 64bf3292d88..6a0edd26751 100755 --- a/go/utils/repofmt/format_repo.sh +++ b/go/utils/repofmt/format_repo.sh @@ -10,7 +10,7 @@ paths=`find . -maxdepth 1 -mindepth 1 \( -type d -print -o -type f -name '*.go' goimports -w -local github.com/dolthub/dolt,github.com/dolthub/eventsapi_schema $paths bad_files=$(find $paths -name '*.go' | while read f; do - if [[ $(awk '/import \(/{flag=1;next}/\)/{flag=0}flag' < $f | egrep -c '$^') -gt 2 ]]; then + if [[ $(awk '/import \(/{flag=1;next}/\)/{flag=0}flag' < $f | grep -Ec '$^') -gt 2 ]]; then echo $f fi done) From b6121452d86cff4919272202f0889008ac0ea4e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 15:15:39 -0800 Subject: [PATCH 14/21] /go/store/blobstore/internal/git/runner.go: fix comment --- go/store/blobstore/internal/git/runner.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/go/store/blobstore/internal/git/runner.go b/go/store/blobstore/internal/git/runner.go index 5bec49f45a4..2dfa62bd75f 100644 --- a/go/store/blobstore/internal/git/runner.go +++ b/go/store/blobstore/internal/git/runner.go @@ -166,7 +166,12 @@ func (r *Runner) Run(ctx context.Context, opts RunOptions, args ...string) ([]by } // Start starts "git " and returns a ReadCloser for stdout. -// The caller must call Wait() on the returned Cmd to release resources. +// +// Resource management: +// - Call Close() on the returned ReadCloser to ensure the underlying git process +// is waited (cmd.Wait()) and resources are released. +// - The returned *exec.Cmd is provided for advanced uses (e.g. signals), but most +// callers should not call Wait() directly. func (r *Runner) Start(ctx context.Context, opts RunOptions, args ...string) (io.ReadCloser, *exec.Cmd, error) { cmd := exec.CommandContext(ctx, r.gitPath, args...) //nolint:gosec // args are controlled by caller; used for internal plumbing. if opts.Dir != "" { From 2c6ffcdb39d1642bdf732581ce50f1f04ac7dff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 15:17:46 -0800 Subject: [PATCH 15/21] /go/store/{blobstore,testutils}: some comment updates --- go/store/blobstore/internal/git/read.go | 3 ++- go/store/testutils/gitrepo/gitrepo.go | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/go/store/blobstore/internal/git/read.go b/go/store/blobstore/internal/git/read.go index b27faa3d3e3..4a7f4148a39 100644 --- a/go/store/blobstore/internal/git/read.go +++ b/go/store/blobstore/internal/git/read.go @@ -60,7 +60,8 @@ func ResolveRefCommit(ctx context.Context, r *Runner, ref string) (OID, error) { } // ResolvePathBlob resolves |path| within |commit| to a blob OID. -// It returns PathNotFoundError if the path does not exist. +// It returns PathNotFoundError if the path does not exist, and NotBlobError if the +// path resolves to a non-blob object (e.g. a tree). func ResolvePathBlob(ctx context.Context, r *Runner, commit OID, path string) (OID, error) { spec := commit.String() + ":" + path out, err := r.Run(ctx, RunOptions{}, "rev-parse", "--verify", spec) diff --git a/go/store/testutils/gitrepo/gitrepo.go b/go/store/testutils/gitrepo/gitrepo.go index c2ed9d78c75..31113c356d1 100644 --- a/go/store/testutils/gitrepo/gitrepo.go +++ b/go/store/testutils/gitrepo/gitrepo.go @@ -37,7 +37,9 @@ type Repo struct { GitDir string } -// InitBare initializes a new bare git repository at |dir| (which should not exist yet). +// InitBare initializes a new bare git repository at |dir|. +// For portability across git versions, callers should generally pass a path that +// does not exist yet. func InitBare(ctx context.Context, dir string) (*Repo, error) { if err := runGit(ctx, "", "", "", "init", "--bare", dir); err != nil { return nil, err From 12ce028bb6f2622c5398405626557254e3ee742f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 15:33:49 -0800 Subject: [PATCH 16/21] /go/store/{blobstore,nbs,testutils}: fix copyright headers --- go/store/blobstore/git_blobstore_test.go | 14 ++++++++++++++ go/store/nbs/git_blobstore_read_smoke_test.go | 14 ++++++++++++++ go/store/testutils/gitrepo/gitrepo_test.go | 14 ++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/go/store/blobstore/git_blobstore_test.go b/go/store/blobstore/git_blobstore_test.go index bed925bb4bd..543d5f059a1 100644 --- a/go/store/blobstore/git_blobstore_test.go +++ b/go/store/blobstore/git_blobstore_test.go @@ -1,3 +1,17 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package blobstore import ( diff --git a/go/store/nbs/git_blobstore_read_smoke_test.go b/go/store/nbs/git_blobstore_read_smoke_test.go index 5b1063e0d21..590f39588df 100644 --- a/go/store/nbs/git_blobstore_read_smoke_test.go +++ b/go/store/nbs/git_blobstore_read_smoke_test.go @@ -1,3 +1,17 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package nbs import ( diff --git a/go/store/testutils/gitrepo/gitrepo_test.go b/go/store/testutils/gitrepo/gitrepo_test.go index aa0cf074db5..b7438712c31 100644 --- a/go/store/testutils/gitrepo/gitrepo_test.go +++ b/go/store/testutils/gitrepo/gitrepo_test.go @@ -1,3 +1,17 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package gitrepo import ( From 54debeca139fe5cb78b72526f9bdc739c257ddc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 16:32:31 -0800 Subject: [PATCH 17/21] /{AGENTS.md,GITBLOBSTORE_DESGIN.md}: remove md files --- AGENTS.md | 26 --- GITBLOBSTORE_DESIGN.md | 432 ----------------------------------------- 2 files changed, 458 deletions(-) delete mode 100644 AGENTS.md delete mode 100644 GITBLOBSTORE_DESIGN.md diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 8b696b78726..00000000000 --- a/AGENTS.md +++ /dev/null @@ -1,26 +0,0 @@ -# Agent Instructions - -This project uses **bd** (beads) for issue tracking. Run `bd onboard` to get started. - -## Quick Reference - -```bash -bd ready # Find available work -bd show # View issue details -bd update --status in_progress # Claim work -bd close # Complete work -bd sync # Sync with git -``` - -## Landing the Plane (Session Completion) - -**When ending a work session**, you MUST complete ALL steps below. Work is NOT complete until `git push` succeeds. - -**MANDATORY WORKFLOW:** - -1. **File issues for remaining work** - Create issues for anything that needs follow-up -2. **Run quality gates** (if code changed) - Tests, linters, builds -3. **Update issue status** - Close finished work, update in-progress items -4. **Clean up** - Clear stashes, prune remote branches -5. **Hand off** - Provide context for next session - diff --git a/GITBLOBSTORE_DESIGN.md b/GITBLOBSTORE_DESIGN.md deleted file mode 100644 index 4c3c4ea19f7..00000000000 --- a/GITBLOBSTORE_DESIGN.md +++ /dev/null @@ -1,432 +0,0 @@ -# GitBlobstore design (Git object DB-backed `Blobstore`) - -## Summary - -This document proposes a new `Blobstore` implementation, **`GitBlobstore`**, whose backing store is a **git repository’s object database** (a bare repo or `.git` directory). `GitBlobstore` is intended to eventually enable using **Git remotes as Dolt remotes** by storing Dolt’s NBS artifacts as git objects, **without creating a working tree checkout**. - -Key constraints: - -- Must implement the existing `go/store/blobstore/blobstore.go` `Blobstore` interface (no interface changes). -- Must be compatible with NBS usage patterns (`go/store/nbs/*`) and the existing blobstore test suite (`go/store/blobstore/blobstore_test.go`). -- Must operate using only the `.git` directory / bare repo and **manipulate git objects directly** (blobs/trees/commits/refs). No checkout. -- Initial ref to use: **`refs/dolt/data`**. - -## Background: current `Blobstore` expectations beyond the interface - -Although `Blobstore`’s interface is small, the rest of the codebase imposes several **implicit behavioral requirements**. - -### Where `Blobstore` is used - -`Blobstore` is used by NBS when you construct a store via: - -- `nbs.NewBSStore(ctx, ..., bs, ...)` (full conjoin path, requires `Concatenate`) -- `nbs.NewNoConjoinBSStore(ctx, ..., bs, ...)` (OCI-like path, no conjoin/compose) - -and for the NBS manifest: - -- `go/store/nbs/bs_manifest.go` stores the manifest as a blob named `"manifest"` using `Get` + `CheckAndPut`. - -### Required semantics (tests + NBS) - -#### 1) Missing keys must return `blobstore.NotFound` - -`blobstore_test.go` checks missing-key behavior with `IsNotFoundError(err)` which relies on `err.(NotFound)` type assertion (`go/store/blobstore/errors.go`). - -`GitBlobstore.Get()` must return `NotFound{Key: key}` (or with a more descriptive Key string) when the blob does not exist. - -#### 2) `Get()` must implement `BlobRange` correctly, including negative offsets - -`blobstore_test.go` validates: - -- `AllRange` returns the entire blob -- `NewBlobRange(offset, length)` returns exactly the requested bytes -- `NewBlobRange(-n, 0)` returns the last `n` bytes -- `NewBlobRange(-n, m)` returns a slice from the tail - -NBS relies on these forms heavily to read table indices/footers: - -- `NewBlobRange(-N, 0)` to read the last `N` bytes of table files (index/footer) -- range reads to implement `ReadAtWithStats` for table readers - -#### 3) `Get()` must return the **total blob size** as `size` - -NBS code uses the `size` returned by `Get()` (even for a range request) as the full object size, e.g. archive readers use it as the total file size. - -For range requests, `size` must be the full object size, not the response byte count. - -#### 4) `Put()` and `Get()` versions must be consistent - -`blobstore_test.go` verifies that: - -- `Put()` returns a version string -- `Get()` returns the same version string immediately after the `Put()` - -#### 5) `CheckAndPut()` must provide real CAS behavior under concurrency - -`blobstore_test.go` has a concurrent read/modify/write loop that: - -1. calls `Get()` to read bytes and a version -2. calls `CheckAndPut(expectedVersion=thatVersion, ...)` - -Under contention, `CheckAndPut()` must: - -- fail with a `blobstore.CheckAndPutError` (recognized via `IsCheckAndPutError`) -- succeed only when the expected version matches the current version - -`CheckAndPut(expectedVersion="")` must also support create-if-absent semantics (the tests call it on a missing key). - -#### 6) `Concatenate()` is required for the full NBS blobstore persister - -NBS’s `blobstorePersister` (`go/store/nbs/bs_persister.go`) persists tables by writing: - -- `.records` -- `.tail` - -then calling: - -- `bs.Concatenate(name, []string{name+".records", name+".tail"})` - -Conjoin operations also call `Concatenate` to compose record-only sub-objects. - -If a blobstore cannot implement `Concatenate`, it must be used with `NewNoConjoinBSStore`, and conjoin is disabled (as is done for OCI). - -## Concept: representing a Blobstore inside git - -The `GitBlobstore` will map the blobstore keyspace onto the **tree of a commit** referenced by `refs/dolt/data`. - -- The ref `refs/dolt/data` points to a commit. -- The commit’s tree is the “directory” holding all blobstore keys. -- Each blobstore `key` is a **path** in that tree; its contents are a git **blob** object. - -This gives us: - -- immutable content-addressed blob objects for data -- an append-only history of updates (commits) -- a single ref head that can serve as a “version” for CAS updates - -No working tree is required; we only read/write objects and update refs. - -## Key design decisions - -### Ref name - -Use: - -- `refs/dolt/data` - -as the authoritative reference for the blobstore. - -### Version string - -Define the blobstore “version” as: - -- the **commit hash** (hex object id) currently pointed to by `refs/dolt/data` - -Rationale: - -- it is globally consistent across keys (a snapshot version) -- it naturally supports CAS: update ref from old commit → new commit -- it mirrors object-store generation/etag semantics as a stable identifier - -Implications: - -- `Get()` returns the commit hash of `refs/dolt/data` at the time of lookup. -- `Put()` returns the new commit hash it wrote. -- `CheckAndPut()` compares expected commit hash to current ref commit hash. - -### Atomic CAS - -`CheckAndPut()` must be atomic: it must only update if the ref still points to the expected commit id. - -Implementation requirement: - -- update `refs/dolt/data` with a compare-and-swap on the old object id - -This can be achieved by: - -- using git plumbing (`update-ref `) which is designed to be atomic, or -- implementing direct ref updates with proper locking and packed-refs handling (more complex). - -### Key/path validation - -Blobstore keys are treated as paths in the git tree. We must prevent traversal or invalid path components. - -Rules: - -- reject keys that are absolute (`/…`) or contain `..` path segments -- reject NUL bytes -- normalize path separators to `/` (git tree paths are `/`) -- optionally, restrict keys to a conservative character set if desired (not required initially) - -### `.bs` extension - -Unlike `LocalBlobstore` which appends `.bs` on disk, `GitBlobstore` should store keys **exactly as provided** (e.g. `manifest`, ``, `.records`), matching other remote blobstores (GCS/OCI/OSS) which also store by key directly. - -## Interface mapping - -Below is the planned behavior for each `Blobstore` method. - -### `Path() string` - -Return a stable identifier for logging/debugging, e.g.: - -- `@refs/dolt/data` - -### `Exists(ctx, key) (bool, error)` - -- Resolve `refs/dolt/data` to a commit (if the ref does not exist, return `(false, nil)`). -- Read the commit tree and look up the path for `key`. -- Return true if present and is a blob, else false. - -### `Get(ctx, key, br) (rc, size, version, err)` - -Steps: - -1. Resolve `refs/dolt/data` → commit id (`version`). -2. Locate the blob object id at path `key` in that commit’s tree. -3. Determine `size` (full blob size), even if the caller requests a range. -4. Return an `io.ReadCloser` over the requested range: - - If `br` is `AllRange`, stream full blob. - - If `br.offset < 0`, convert to a positive range using total `size` (same behavior as `BlobRange.positiveRange`). - - If `br.length == 0`, stream from offset to end. - - If `br.length > 0`, stream exactly `length` bytes starting at `offset`. - -Errors: - -- if key is missing: return `NotFound{Key: key}` - -### `Put(ctx, key, totalSize, reader) (version, error)` - -Behavior: - -- Write a new git blob object with the content from `reader`. -- Create a new tree that is the previous tree with path `key` updated to point to that blob. -- Create a new commit and update `refs/dolt/data` to point to it (last-writer-wins). - -Notes: - -- `Put()` does not provide CAS; `CheckAndPut()` is used where CAS is required (manifest updates and concurrent RMW patterns). -- `Put()` should return the new commit hash as `version`. - -### `CheckAndPut(ctx, expectedVersion, key, totalSize, reader) (version, error)` - -CAS semantics: - -- If `expectedVersion != ""`: - - require that `refs/dolt/data` currently points to `expectedVersion` - - if not, return `CheckAndPutError{Key, ExpectedVersion, ActualVersion}` -- If `expectedVersion == ""`: - - implement create-if-absent behavior: - - if `key` exists in the current ref snapshot, return `CheckAndPutError` - - otherwise, proceed - -If check passes: - -- perform the same write as `Put()` (write blob, update tree, commit) -- atomically update the ref from old commit id to new commit id (CAS) - -Return: - -- new commit hash - -### `Concatenate(ctx, key, sources) (version, error)` - -Correctness-first approach: - -- Stream all sources in order and write a new git blob object whose content is their concatenation. -- Update path `key` to point to that new blob. -- Commit and update `refs/dolt/data` (either last-writer-wins, or CAS on current head; CAS is preferred to avoid lost updates if multiple writers are composing concurrently). - -Notes: - -- NBS calls `Concatenate` for `Persist()` and for conjoin; correctness matters more than git-level “compose” optimization. -- This is expected to be efficient enough initially because objects are already local to the `.git` store; it avoids network. - -## NBS compatibility modes - -`GitBlobstore` is expected to implement `Concatenate`, so it can be used with: - -- `nbs.NewBSStore(...)` (full conjoin enabled) - -If at some point we introduce a mode that cannot implement efficient `Concatenate`, then it must be paired with: - -- `nbs.NewNoConjoinBSStore(...)` (conjoin disabled) - -## Performance notes and future work - -### Range reads - -Git does not provide a native “range read” API for blobs. Implementations will likely: - -- read the full decompressed blob stream and slice/limit, or -- optimize tail reads (`offset < 0`) with a ring buffer to avoid buffering whole blobs in memory - -NBS does many tail reads for index/footer, so tail-range optimization is a likely follow-up. - -### Tree updates at scale - -Updating a file path in a git tree requires rebuilding trees along the path. If the blobstore keyspace gets large and flat, tree operations may become hot. - -Potential future optimizations: - -- key sharding (e.g. store keys under `aa/bb/` derived from hash prefixes) -- batching multiple key updates into one commit when possible -- using lower-level object formats or packfile streaming for high write throughput - -### Repository maintenance - -Because each write creates new objects and commits: - -- git GC / repack policy will matter -- prune of unreachable objects may be required in long-running deployments - -These are explicitly out of scope for the first iteration. - -## Dependency / implementation strategy - -This design is compatible with multiple backends: - -- **Git CLI plumbing** (e.g., `git --git-dir cat-file`, `hash-object`, `mktree`, `commit-tree`, `update-ref`) - - Pros: low code, uses battle-tested git object manipulation, no extra Go dependencies - - Cons: runtime dependency on `git`, process overhead - -- **Pure-Go implementation** - - Pros: no external process dependency - - Cons: larger implementation effort; likely requires introducing a library dependency (e.g., `go-git`) or writing object plumbing in-house - -Given the current module does not include a Go git implementation dependency, the initial implementation can reasonably use **git plumbing commands** while still meeting the “no checkout” requirement. - -## Git internals and plumbing commands (implementation preview) - -This section previews the **git internals** and a concrete set of **plumbing commands** we expect `GitBlobstore` to use. The approach explicitly avoids a working tree checkout. - -### Git internals we rely on - -- **Objects** live in the object database (ODB) as either: - - **loose** objects under `.git/objects/aa/bb…`, or - - **packed** objects under `.git/objects/pack/*.pack` - `GitBlobstore` must be agnostic to storage format; it should interact through git’s plumbing which abstracts over loose/packed storage. -- **Object types**: - - **blob**: raw file contents (this is the value for a blobstore key) - - **tree**: directory mapping `name -> (mode, type, oid)` (this is the blobstore keyspace index) - - **commit**: points to a root tree + parent(s) + metadata (this is the snapshot “version”) -- **Ref**: - - `refs/dolt/data` points at the current commit snapshot - - updating this ref publishes a new blobstore state - -### Minimal plumbing command set - -Below are the commands we expect to use (exact invocation may vary). - -#### Resolve ref / versions - -- Check ref existence: - - `git --git-dir show-ref --verify --quiet refs/dolt/data` -- Resolve current commit (version): - - `git --git-dir rev-parse --verify refs/dolt/data` - -#### Key existence and path lookup - -- Check if a path exists in a commit tree: - - `git --git-dir cat-file -e :` -- Resolve the blob oid for a path: - - `git --git-dir rev-parse :` - -#### Blob size and blob content streaming - -- Object size (needed so `Get()` can return total blob size even for range requests): - - `git --git-dir cat-file -s ` -- Stream blob contents: - - `git --git-dir cat-file blob ` - -Note: git does not have a native “range read” for blobs; for `BlobRange` we will stream and slice/limit in Go. Tail-range reads (`offset < 0`) can be optimized later with a ring buffer. - -#### Write a blob object - -- Write blob from stdin and get its oid: - - `git --git-dir hash-object -w --stdin` - -This is used by `Put`, `CheckAndPut`, and `Concatenate` (after concatenating sources). - -### Tree updates without a checkout (Approach A: temporary index) - -We prefer **Approach A**: use a *temporary git index file* to stage a tree update without a working directory. - -Core idea: - -- set `GIT_DIR=` -- set `GIT_INDEX_FILE=` -- use `read-tree` + `update-index --cacheinfo` + `write-tree` - -Commands: - -- Initialize the temporary index from the current tree: - - if `refs/dolt/data` exists: - - `git read-tree ^{tree}` - - if the ref is missing (bootstrap): - - `git read-tree --empty` -- Add or replace a path with a specific blob oid: - - `git update-index --add --cacheinfo 100644 ` - - (mode may be `100644` for regular file; no executable bits) -- Write the resulting tree: - - `git write-tree` → outputs `` - -Advantages: - -- avoids manual tree reconstruction for nested paths -- avoids checkout / working tree entirely -- supports deep key paths naturally - -### Commit creation and atomic ref update (CAS) - -- Create a commit object from the new tree: - - `git commit-tree -p -m ` - - (author/committer identity can be supplied via env; we can use a fixed identity initially) -- Atomic compare-and-swap ref update: - - `git update-ref -m "" refs/dolt/data ` - -This `update-ref` form is the primary CAS primitive; it will fail if the ref no longer points to ``. - -Bootstrap note: - -- for “create ref only if absent”, git accepts an “old” value of all-zeros (`000…000`) which enforces that the ref does not exist. - -### Command-to-method mapping (quick reference) - -- `Exists(key)`: - - `rev-parse refs/dolt/data` (if missing → false) - - `cat-file -e :` -- `Get(key, br)`: - - `rev-parse refs/dolt/data` → version - - `rev-parse :` → blob oid - - `cat-file -s ` → total size - - `cat-file blob ` → stream; slice/limit in Go per `BlobRange` -- `Put(key, reader)`: - - `hash-object -w --stdin` → blob oid - - temp index: `read-tree` → `update-index --cacheinfo` → `write-tree` → tree oid - - `commit-tree` → new commit oid - - `update-ref refs/dolt/data ` (non-CAS) -- `CheckAndPut(expectedVersion, key, reader)`: - - same as `Put`, but final step is: - - `update-ref refs/dolt/data ` (CAS) -- `Concatenate(key, sources)`: - - for each source: resolve blob oid and stream `cat-file blob` - - pipe concatenation into `hash-object -w --stdin` - - then same tree/commit/ref update flow as `Put` (CAS preferred) - -## Testing plan (expected to pass existing suite) - -`GitBlobstore` should be able to join the existing blobstore tests in `go/store/blobstore/blobstore_test.go`: - -- Put/Get version equality -- NotFound behavior -- CAS correctness (concurrent CheckAndPut test) -- Range read correctness (including negative offsets) -- Concatenate correctness - -Additional recommended tests: - -- behavior when `refs/dolt/data` does not exist yet (bootstrap) -- packed-refs handling (if the ref is packed) -- concurrent writers updating different keys (should not corrupt ref) - From b79b3e5a469fecc461300616ecfe076c7e1dc3c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 16:34:45 -0800 Subject: [PATCH 18/21] /go/store/blobstore/internal/git/runner.go: formatting --- go/store/blobstore/internal/git/runner.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/go/store/blobstore/internal/git/runner.go b/go/store/blobstore/internal/git/runner.go index 2dfa62bd75f..c9196cc8fa3 100644 --- a/go/store/blobstore/internal/git/runner.go +++ b/go/store/blobstore/internal/git/runner.go @@ -168,10 +168,10 @@ func (r *Runner) Run(ctx context.Context, opts RunOptions, args ...string) ([]by // Start starts "git " and returns a ReadCloser for stdout. // // Resource management: -// - Call Close() on the returned ReadCloser to ensure the underlying git process -// is waited (cmd.Wait()) and resources are released. -// - The returned *exec.Cmd is provided for advanced uses (e.g. signals), but most -// callers should not call Wait() directly. +// - Call Close() on the returned ReadCloser to ensure the underlying git process +// is waited (cmd.Wait()) and resources are released. +// - The returned *exec.Cmd is provided for advanced uses (e.g. signals), but most +// callers should not call Wait() directly. func (r *Runner) Start(ctx context.Context, opts RunOptions, args ...string) (io.ReadCloser, *exec.Cmd, error) { cmd := exec.CommandContext(ctx, r.gitPath, args...) //nolint:gosec // args are controlled by caller; used for internal plumbing. if opts.Dir != "" { From d25fcead42faf65b1db4bb0ec84d65365972d5e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 16:40:28 -0800 Subject: [PATCH 19/21] /go/store/blobstore/internal/git: addressing pr feedback --- go/store/blobstore/internal/git/errors.go | 14 +++++++------- go/store/blobstore/internal/git/read.go | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/go/store/blobstore/internal/git/errors.go b/go/store/blobstore/internal/git/errors.go index 0ad0221fc30..e72bd28977c 100644 --- a/go/store/blobstore/internal/git/errors.go +++ b/go/store/blobstore/internal/git/errors.go @@ -28,7 +28,7 @@ type RefNotFoundError struct { Ref string } -func (e RefNotFoundError) Error() string { +func (e *RefNotFoundError) Error() string { return fmt.Sprintf("git ref not found: %s", e.Ref) } @@ -38,7 +38,7 @@ type PathNotFoundError struct { Path string } -func (e PathNotFoundError) Error() string { +func (e *PathNotFoundError) Error() string { return fmt.Sprintf("git path not found: %s:%s", e.Commit, e.Path) } @@ -49,7 +49,7 @@ type NotBlobError struct { Type string } -func (e NotBlobError) Error() string { +func (e *NotBlobError) Error() string { if e.Type == "" { return fmt.Sprintf("git path is not a blob: %s:%s", e.Commit, e.Path) } @@ -57,11 +57,11 @@ func (e NotBlobError) Error() string { } func IsRefNotFound(err error) bool { - _, ok := err.(RefNotFoundError) - return ok + var e *RefNotFoundError + return errors.As(err, &e) } func IsPathNotFound(err error) bool { - _, ok := err.(PathNotFoundError) - return ok + var e *PathNotFoundError + return errors.As(err, &e) } diff --git a/go/store/blobstore/internal/git/read.go b/go/store/blobstore/internal/git/read.go index 4a7f4148a39..4a6b16f09ab 100644 --- a/go/store/blobstore/internal/git/read.go +++ b/go/store/blobstore/internal/git/read.go @@ -54,7 +54,7 @@ func ResolveRefCommit(ctx context.Context, r *Runner, ref string) (OID, error) { return "", err } if !ok { - return "", RefNotFoundError{Ref: ref} + return "", &RefNotFoundError{Ref: ref} } return oid, nil } @@ -67,7 +67,7 @@ func ResolvePathBlob(ctx context.Context, r *Runner, commit OID, path string) (O out, err := r.Run(ctx, RunOptions{}, "rev-parse", "--verify", spec) if err != nil { if isPathNotFoundErr(err) { - return "", PathNotFoundError{Commit: commit.String(), Path: path} + return "", &PathNotFoundError{Commit: commit.String(), Path: path} } return "", err } @@ -81,7 +81,7 @@ func ResolvePathBlob(ctx context.Context, r *Runner, commit OID, path string) (O return "", err } if typ != "blob" { - return "", NotBlobError{Commit: commit.String(), Path: path, Type: typ} + return "", &NotBlobError{Commit: commit.String(), Path: path, Type: typ} } return OID(oid), nil } From b45fce36fcd19bbcd7b8f02b3c273d007d77419f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 16:45:08 -0800 Subject: [PATCH 20/21] /go/store/blobstore/internal/git/runner.go: remove buffer slices --- go/store/blobstore/internal/git/runner.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/go/store/blobstore/internal/git/runner.go b/go/store/blobstore/internal/git/runner.go index c9196cc8fa3..02a4353709b 100644 --- a/go/store/blobstore/internal/git/runner.go +++ b/go/store/blobstore/internal/git/runner.go @@ -100,12 +100,8 @@ func (e *CmdError) Error() string { b.WriteString("\ndir: ") b.WriteString(e.Dir) } - if len(e.Output) > 0 { - b.WriteString("\noutput:\n") - b.WriteString(formatOutput(e.Output)) - } else { - b.WriteString("\noutput:\n(no output)") - } + b.WriteString("\noutput:\n") + b.WriteString(formatOutput(e.Output)) if e.Cause != nil { b.WriteString("\nerror: ") b.WriteString(e.Cause.Error()) @@ -160,7 +156,7 @@ func (r *Runner) Run(ctx context.Context, opts RunOptions, args ...string) ([]by Args: append([]string(nil), args...), Dir: cmd.Dir, ExitCode: exitCode, - Output: append([]byte(nil), out...), + Output: out, Cause: err, } } @@ -231,7 +227,7 @@ func (c *cmdReadCloser) Close() error { Args: c.args, Dir: c.dir, ExitCode: exitCode, - Output: append([]byte(nil), c.stderr.Bytes()...), + Output: c.stderr.Bytes(), Cause: err, } } From 83a31e261c22e4cef9ef70f6f9fa628672dd7f1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Tue, 3 Feb 2026 16:51:23 -0800 Subject: [PATCH 21/21] /go/store/blobstore: more feedback --- go/store/blobstore/git_blobstore.go | 10 +++++++++- go/store/blobstore/git_blobstore_test.go | 9 +++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 086316c5629..c1e7055f2f3 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -84,7 +84,12 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. return nil, 0, "", err } if !ok { - return nil, 0, "", NotFound{Key: key} + // If the ref doesn't exist, treat the manifest as missing (empty store), + // but surface a hard error for other keys: the store itself is missing. + if key == "manifest" { + return nil, 0, "", NotFound{Key: key} + } + return nil, 0, "", &git.RefNotFoundError{Ref: gbs.ref} } blobOID, err := git.ResolvePathBlob(ctx, gbs.runner, commit, key) @@ -100,6 +105,9 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. return nil, 0, commit.String(), err } + // TODO(gitblobstore): This streaming implementation is correct but may be slow for workloads + // that do many small ranged reads (e.g. table index/footer reads). Consider caching/materializing + // blobs to a local file (or using a batched git cat-file mode) to serve ranges efficiently. rc, err := git.BlobReader(ctx, gbs.runner, blobOID) if err != nil { return nil, 0, commit.String(), err diff --git a/go/store/blobstore/git_blobstore_test.go b/go/store/blobstore/git_blobstore_test.go index 543d5f059a1..1d1b920f88a 100644 --- a/go/store/blobstore/git_blobstore_test.go +++ b/go/store/blobstore/git_blobstore_test.go @@ -16,11 +16,13 @@ package blobstore import ( "context" + "errors" "os/exec" "testing" "github.com/stretchr/testify/require" + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" "github.com/dolthub/dolt/go/store/testutils/gitrepo" ) @@ -43,6 +45,13 @@ func TestGitBlobstore_RefMissingIsNotFound(t *testing.T) { _, _, err = GetBytes(ctx, bs, "manifest", AllRange) require.Error(t, err) require.True(t, IsNotFoundError(err)) + + // For non-manifest keys, missing the ref is a hard error. + _, _, _, err = bs.Get(ctx, "table", AllRange) + require.Error(t, err) + require.False(t, IsNotFoundError(err)) + var rnf *git.RefNotFoundError + require.True(t, errors.As(err, &rnf)) } func TestGitBlobstore_ExistsAndGet_AllRange(t *testing.T) {