diff --git a/go/cmd/dolt/commands/clone.go b/go/cmd/dolt/commands/clone.go index 5009ca63895..4baca8cebe3 100644 --- a/go/cmd/dolt/commands/clone.go +++ b/go/cmd/dolt/commands/clone.go @@ -222,6 +222,12 @@ func parseArgs(apr *argparser.ArgParseResults) (string, string, errhand.VerboseE } else if dir == "/" { return "", "", errhand.BuildDError("Could not infer repo name. Please explicitly define a directory for this url").Build() } + if strings.HasSuffix(dir, ".git") { + dir = strings.TrimSuffix(dir, ".git") + if dir == "" { + return "", "", errhand.BuildDError("Could not infer repo name. Please explicitly define a directory for this url").Build() + } + } } return dir, urlStr, nil diff --git a/go/cmd/dolt/commands/clone_test.go b/go/cmd/dolt/commands/clone_test.go index a0947612063..0be2641a168 100644 --- a/go/cmd/dolt/commands/clone_test.go +++ b/go/cmd/dolt/commands/clone_test.go @@ -73,6 +73,17 @@ func TestCloneParseArgs_InferDir(t *testing.T) { dir, urlStr, verr := parseArgs(apr) require.Nil(t, verr) - require.Equal(t, "repo.git", dir) + require.Equal(t, "repo", dir) require.Equal(t, "https://example.com/org/repo.git", urlStr) } + +func TestCloneParseArgs_InferDir_GitFile(t *testing.T) { + ap := CloneCmd{}.ArgParser() + apr, err := ap.Parse([]string{"git+file:///tmp/remote.git"}) + require.NoError(t, err) + + dir, urlStr, verr := parseArgs(apr) + require.Nil(t, verr) + require.Equal(t, "remote", dir) + require.Equal(t, "git+file:///tmp/remote.git", urlStr) +} diff --git a/go/cmd/dolt/commands/remote.go b/go/cmd/dolt/commands/remote.go index 73f522b2e1e..865e655bcbd 100644 --- a/go/cmd/dolt/commands/remote.go +++ b/go/cmd/dolt/commands/remote.go @@ -168,7 +168,13 @@ func addRemote(sqlCtx *sql.Context, queryist cli.Queryist, dEnv *env.DoltEnv, ap } if len(params) == 0 { - err := callSQLRemoteAdd(sqlCtx, queryist, remoteName, remoteUrl) + // For git remotes, ensure the stored URL is the normalized git+* dbfactory URL (e.g. scp-style ssh becomes + // git+ssh://..., and local paths become git+file://...). + urlToStore := remoteUrl + if strings.HasPrefix(strings.ToLower(scheme), "git+") { + urlToStore = absRemoteUrl + } + err := callSQLRemoteAdd(sqlCtx, queryist, remoteName, urlToStore) if err != nil { return errhand.BuildDError("error: Unable to add remote.").AddCause(err).Build() } diff --git a/go/cmd/dolt/commands/remote_test.go b/go/cmd/dolt/commands/remote_test.go index a0984251c0c..af0c626f1f4 100644 --- a/go/cmd/dolt/commands/remote_test.go +++ b/go/cmd/dolt/commands/remote_test.go @@ -17,6 +17,7 @@ package commands import ( "fmt" "path/filepath" + "strings" "testing" "github.com/stretchr/testify/assert" @@ -146,3 +147,20 @@ func TestParseRemoteArgs_GitRef(t *testing.T) { assert.Nil(t, verr) assert.Equal(t, "refs/dolt/custom", params[dbfactory.GitRefParam]) } + +func TestRemoteAdd_StoresNormalizedGitUrl(t *testing.T) { + // This mirrors the behavior in addRemote(): for git remotes, we must store the normalized git+* URL + // (in particular, scp-style ssh should never be stored as a schemeless string). + scheme := dbfactory.GitSSHScheme + original := "git@github.com:timsehn/dolt-in-git.git" + normalized, ok, err := env.NormalizeGitRemoteUrl(original) + assert.NoError(t, err) + assert.True(t, ok) + assert.Equal(t, "git+ssh://git@github.com/timsehn/dolt-in-git.git", normalized) + + urlToStore := original + if strings.HasPrefix(strings.ToLower(scheme), "git+") { + urlToStore = normalized + } + assert.Equal(t, normalized, urlToStore) +} diff --git a/go/libraries/doltcore/dbfactory/git_remote.go b/go/libraries/doltcore/dbfactory/git_remote.go index e69d553b3e3..1534ba80e6d 100644 --- a/go/libraries/doltcore/dbfactory/git_remote.go +++ b/go/libraries/doltcore/dbfactory/git_remote.go @@ -44,6 +44,8 @@ const ( defaultGitRemoteName = "origin" ) +var ErrGitRemoteHasNoBranches = errors.New("git remote has no branches") + // GitCacheRootProvider provides the local Dolt repo root for per-repo git remote caches. // Implementations should return ok=false when no repo root is available. type GitCacheRootProvider interface { @@ -116,6 +118,9 @@ func (fact GitRemoteFactory) CreateDB(ctx context.Context, nbf *types.NomsBinFor if err := ensureGitRemoteURL(ctx, cacheRepo, remoteName, remoteURL.String()); err != nil { return nil, nil, nil, err } + if err := ensureRemoteHasBranches(ctx, cacheRepo, remoteName, remoteURL.String()); err != nil { + return nil, nil, nil, err + } q := nbs.NewUnlimitedMemQuotaProvider() cs, err := nbs.NewGitStore(ctx, nbf.VersionString(), cacheRepo, ref, blobstore.GitBlobstoreOptions{RemoteName: remoteName}, defaultMemTableSize, q) @@ -129,6 +134,17 @@ func (fact GitRemoteFactory) CreateDB(ctx context.Context, nbf *types.NomsBinFor return db, vrw, ns, nil } +func ensureRemoteHasBranches(ctx context.Context, gitDir string, remoteName string, remoteURL string) error { + out, err := runGitInDir(ctx, gitDir, "ls-remote", "--heads", "--", remoteName) + if err != nil { + return err + } + if strings.TrimSpace(out) == "" { + return fmt.Errorf("%w: cannot push to %q; initialize the repository with an initial branch/commit first", ErrGitRemoteHasNoBranches, remoteURL) + } + return nil +} + func parseGitRemoteFactoryURL(urlObj *url.URL, params map[string]interface{}) (remoteURL *url.URL, ref string, err error) { if urlObj == nil { return nil, "", fmt.Errorf("nil url") diff --git a/go/libraries/doltcore/dbfactory/git_remote_test.go b/go/libraries/doltcore/dbfactory/git_remote_test.go index a46c5387102..8fd3c1a4f95 100644 --- a/go/libraries/doltcore/dbfactory/git_remote_test.go +++ b/go/libraries/doltcore/dbfactory/git_remote_test.go @@ -64,6 +64,8 @@ func TestGitRemoteFactory_GitFile_CachesUnderRepoDoltDirAndCanWrite(t *testing.T ctx := context.Background() remoteRepo, err := gitrepo.InitBare(ctx, filepath.Join(shortTempDir(t), "remote.git")) require.NoError(t, err) + _, err = remoteRepo.SetRefToTree(ctx, "refs/heads/main", map[string][]byte{"README": []byte("seed\n")}, "seed") + require.NoError(t, err) localRepoRoot := shortTempDir(t) @@ -121,6 +123,8 @@ func TestGitRemoteFactory_TwoClientsDistinctCacheDirsRoundtrip(t *testing.T) { ctx := context.Background() remoteRepo, err := gitrepo.InitBare(ctx, filepath.Join(shortTempDir(t), "remote.git")) require.NoError(t, err) + _, err = remoteRepo.SetRefToTree(ctx, "refs/heads/main", map[string][]byte{"README": []byte("seed\n")}, "seed") + require.NoError(t, err) remotePath := filepath.ToSlash(remoteRepo.GitDir) urlStr := "git+file://" + remotePath @@ -185,3 +189,24 @@ func TestGitRemoteFactory_TwoClientsDistinctCacheDirsRoundtrip(t *testing.T) { require.Equal(t, "clientB\n", string(gotB.Data())) require.NoError(t, dbA2.Close()) } + +func TestGitRemoteFactory_GitFile_RemoteWithNoBranchesFails(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + remoteRepo, err := gitrepo.InitBare(ctx, filepath.Join(shortTempDir(t), "remote.git")) + require.NoError(t, err) + + localRepoRoot := shortTempDir(t) + remotePath := filepath.ToSlash(remoteRepo.GitDir) + urlStr := "git+file://" + remotePath + params := map[string]interface{}{ + GitCacheRootParam: localRepoRoot, + } + + _, _, _, err = CreateDB(ctx, types.Format_Default, urlStr, params) + require.Error(t, err) + require.ErrorIs(t, err, ErrGitRemoteHasNoBranches) +} diff --git a/go/store/nbs/git_blobstore_empty_remote_test.go b/go/store/nbs/git_blobstore_empty_remote_test.go index 0534d0ea701..09dc2fa9b0a 100644 --- a/go/store/nbs/git_blobstore_empty_remote_test.go +++ b/go/store/nbs/git_blobstore_empty_remote_test.go @@ -112,3 +112,55 @@ func TestNBS_GitBlobstore_EmptyRemote_FirstManifestUpdateBootstrapsRef(t *testin require.NoError(t, writeManifest(&buf, contents)) require.NotEmpty(t, buf.Bytes()) } + +func TestNBS_NewGitStore_DefaultsMaxPartSizeTo50MB(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + remoteRepo, err := gitrepo.InitBare(ctx, t.TempDir()+"/remote.git") + require.NoError(t, err) + _, err = remoteRepo.SetRefToTree(ctx, blobstore.DoltDataRef, nil, "seed empty") + require.NoError(t, err) + + localRepo, err := gitrepo.InitBare(ctx, t.TempDir()+"/local.git") + require.NoError(t, err) + cmd := exec.CommandContext(ctx, "git", "--git-dir", localRepo.GitDir, "remote", "add", "origin", remoteRepo.GitDir) + out, err := cmd.CombinedOutput() + require.NoError(t, err, "git remote add failed: %s", string(out)) + + store, err := NewGitStore(ctx, types.Format_DOLT.VersionString(), localRepo.GitDir, blobstore.DoltDataRef, blobstore.GitBlobstoreOptions{}, 0, NewUnlimitedMemQuotaProvider()) + require.NoError(t, err) + defer store.Close() + + // Assert the underlying blobstore is a GitBlobstore and that chunked writes are enabled by default. + bsp, ok := store.persister.(*blobstorePersister) + require.True(t, ok, "expected persister to be *blobstorePersister, got %T", store.persister) + + gbs, ok := bsp.bs.(*blobstore.GitBlobstore) + require.True(t, ok, "expected blobstore to be *blobstore.GitBlobstore, got %T", bsp.bs) + + // Use a totalSize larger than the default 50MB threshold, but provide a tiny reader. + // planPutWrites decides chunked-vs-inline based on totalSize, and should pick chunked mode + // when the default max part size is enabled. + _, err = gbs.Put(ctx, "k", int64(50*1024*1024+1), bytes.NewReader([]byte("hi"))) + require.NoError(t, err) + + // On the remote, key "k" should be represented as a tree containing part "0001". + cmd = exec.CommandContext(ctx, "git", "--git-dir", remoteRepo.GitDir, "rev-parse", "--verify", "--quiet", blobstore.DoltDataRef+"^{commit}") + headBytes, err := cmd.CombinedOutput() + require.NoError(t, err, "git rev-parse failed: %s", string(headBytes)) + + commit := string(bytes.TrimSpace(headBytes)) + cmd = exec.CommandContext(ctx, "git", "--git-dir", remoteRepo.GitDir, "ls-tree", commit, "k") + lsOut, err := cmd.CombinedOutput() + require.NoError(t, err, "git ls-tree failed: %s", string(lsOut)) + require.Contains(t, string(lsOut), "\tk\n") + require.Contains(t, string(lsOut), "tree") + + cmd = exec.CommandContext(ctx, "git", "--git-dir", remoteRepo.GitDir, "ls-tree", commit, "k/0001") + partOut, err := cmd.CombinedOutput() + require.NoError(t, err, "git ls-tree part failed: %s", string(partOut)) + require.Contains(t, string(partOut), "0001") +} diff --git a/go/store/nbs/store.go b/go/store/nbs/store.go index adc2e2a3cef..8249383e76d 100644 --- a/go/store/nbs/store.go +++ b/go/store/nbs/store.go @@ -70,6 +70,10 @@ const ( defaultMaxTables = 256 defaultManifestCacheSize = 1 << 23 // 8MB + + // defaultGitBlobstoreMaxPartSize is the default maximum size (in bytes) for a single part blob written to a + // git-backed blobstore. Large values may be rejected by some Git remotes. + defaultGitBlobstoreMaxPartSize uint64 = 50 * 1024 * 1024 ) var ( @@ -628,6 +632,12 @@ func NewOCISStore(ctx context.Context, nbfVerStr string, bucketName, path string func NewGitStore(ctx context.Context, nbfVerStr string, gitDir string, ref string, opts blobstore.GitBlobstoreOptions, memTableSize uint64, q MemoryQuotaProvider) (*NomsBlockStore, error) { cacheOnce.Do(makeGlobalCaches) + // A Git remote may reject large blobs. To keep git-backed remotes broadly usable by default, enable + // chunked-object writes with a conservative max part size unless the caller explicitly overrides it. + if opts.MaxPartSize == 0 { + opts.MaxPartSize = defaultGitBlobstoreMaxPartSize + } + bs, err := blobstore.NewGitBlobstoreWithOptions(gitDir, ref, opts) if err != nil { return nil, err @@ -640,6 +650,10 @@ func NewGitStore(ctx context.Context, nbfVerStr string, gitDir string, ref strin func NewNoConjoinGitStore(ctx context.Context, nbfVerStr string, gitDir string, ref string, opts blobstore.GitBlobstoreOptions, memTableSize uint64, q MemoryQuotaProvider) (*NomsBlockStore, error) { cacheOnce.Do(makeGlobalCaches) + if opts.MaxPartSize == 0 { + opts.MaxPartSize = defaultGitBlobstoreMaxPartSize + } + bs, err := blobstore.NewGitBlobstoreWithOptions(gitDir, ref, opts) if err != nil { return nil, err diff --git a/integration-tests/bats/remotes-git.bats b/integration-tests/bats/remotes-git.bats index 64d897bc610..e31fc86787b 100644 --- a/integration-tests/bats/remotes-git.bats +++ b/integration-tests/bats/remotes-git.bats @@ -18,9 +18,39 @@ teardown() { teardown_common } +seed_git_remote_branch() { + # Create an initial branch on an otherwise-empty bare git remote. + # Dolt git remotes require at least one git branch to exist on the remote. + local remote_git_dir="$1" + local branch="${2:-main}" + + local remote_abs + remote_abs="$(cd "$remote_git_dir" && pwd)" + + local seed_dir + seed_dir="$(mktemp -d "${BATS_TMPDIR:-/tmp}/seed-repo.XXXXXX")" + + ( + set -euo pipefail + trap 'rm -rf "$seed_dir"' EXIT + cd "$seed_dir" + + git init >/dev/null + git config user.email "bats@email.fake" + git config user.name "Bats Tests" + echo "seed" > README + git add README + git commit -m "seed" >/dev/null + git branch -M "$branch" + git remote add origin "$remote_abs" + git push origin "$branch" >/dev/null + ) +} + @test "remotes-git: smoke push/clone/push-back/pull" { mkdir remote.git git init --bare remote.git + seed_git_remote_branch remote.git main mkdir repo1 cd repo1 @@ -58,6 +88,7 @@ teardown() { @test "remotes-git: empty remote bootstrap creates refs/dolt/data" { mkdir remote.git git init --bare remote.git + seed_git_remote_branch remote.git main # Assert the dolt data ref doesn't exist yet. run git --git-dir remote.git show-ref refs/dolt/data @@ -82,6 +113,7 @@ teardown() { @test "remotes-git: pull also fetches branches from git remote" { mkdir remote.git git init --bare remote.git + seed_git_remote_branch remote.git main mkdir repo1 cd repo1 @@ -115,6 +147,7 @@ teardown() { @test "remotes-git: pull fetches but does not merge other branches" { mkdir remote.git git init --bare remote.git + seed_git_remote_branch remote.git main mkdir repo1 cd repo1 @@ -156,6 +189,7 @@ teardown() { @test "remotes-git: custom --ref writes to configured dolt data ref" { mkdir remote.git git init --bare remote.git + seed_git_remote_branch remote.git main mkdir repo1 cd repo1 @@ -192,6 +226,7 @@ teardown() { @test "remotes-git: push works with per-repo git cache under .dolt/" { mkdir remote.git git init --bare remote.git + seed_git_remote_branch remote.git main mkdir repo1 cd repo1 diff --git a/integration-tests/bats/sql-remotes-git.bats b/integration-tests/bats/sql-remotes-git.bats index 44a289423ef..cb7e776b219 100644 --- a/integration-tests/bats/sql-remotes-git.bats +++ b/integration-tests/bats/sql-remotes-git.bats @@ -17,9 +17,39 @@ teardown() { teardown_common } +seed_git_remote_branch() { + # Create an initial branch on an otherwise-empty bare git remote. + # Dolt git remotes require at least one git branch to exist on the remote. + local remote_git_dir="$1" + local branch="${2:-main}" + + local remote_abs + remote_abs="$(cd "$remote_git_dir" && pwd)" + + local seed_dir + seed_dir="$(mktemp -d "${BATS_TMPDIR:-/tmp}/seed-repo.XXXXXX")" + + ( + set -euo pipefail + trap 'rm -rf "$seed_dir"' EXIT + cd "$seed_dir" + + git init >/dev/null + git config user.email "bats@email.fake" + git config user.name "Bats Tests" + echo "seed" > README + git add README + git commit -m "seed" >/dev/null + git branch -M "$branch" + git remote add origin "$remote_abs" + git push origin "$branch" >/dev/null + ) +} + @test "sql-remotes-git: dolt_remote add supports --ref for git remotes" { mkdir remote.git git init --bare remote.git + seed_git_remote_branch remote.git main mkdir repo1 cd repo1 @@ -44,6 +74,7 @@ SQL @test "sql-remotes-git: dolt_clone supports --ref for git remotes" { mkdir remote.git git init --bare remote.git + seed_git_remote_branch remote.git main mkdir repo1 cd repo1 @@ -78,6 +109,7 @@ SQL @test "sql-remotes-git: dolt_backup sync-url supports --ref for git remotes" { mkdir remote.git git init --bare remote.git + seed_git_remote_branch remote.git main mkdir repo1 cd repo1