diff --git a/go/cmd/dolt/commands/tblcmds/export.go b/go/cmd/dolt/commands/tblcmds/export.go index a4e4d76b963..f7b6b656d45 100644 --- a/go/cmd/dolt/commands/tblcmds/export.go +++ b/go/cmd/dolt/commands/tblcmds/export.go @@ -42,6 +42,12 @@ var exportDocs = cli.CommandDocumentationContent{ ShortDesc: `Export the contents of a table to a file.`, LongDesc: `{{.EmphasisLeft}}dolt table export{{.EmphasisRight}} will export the contents of {{.LessThan}}table{{.GreaterThan}} to {{.LessThan}}|file{{.GreaterThan}} +The output format is inferred from the file extension, or can be set explicitly with {{.EmphasisLeft}}--file-type{{.EmphasisRight}}. + +Supported file types: {{.EmphasisLeft}}csv{{.EmphasisRight}}, {{.EmphasisLeft}}psv{{.EmphasisRight}}, {{.EmphasisLeft}}json{{.EmphasisRight}}, {{.EmphasisLeft}}jsonl{{.EmphasisRight}}, {{.EmphasisLeft}}sql{{.EmphasisRight}}, {{.EmphasisLeft}}parquet{{.EmphasisRight}}. + +{{.EmphasisLeft}}.json{{.EmphasisRight}} exports a single JSON object containing a {{.EmphasisLeft}}rows{{.EmphasisRight}} array; {{.EmphasisLeft}}.jsonl{{.EmphasisRight}} exports one JSON object per line. + See the help for {{.EmphasisLeft}}dolt table import{{.EmphasisRight}} as the options are the same. `, Synopsis: []string{ diff --git a/go/cmd/dolt/commands/tblcmds/import.go b/go/cmd/dolt/commands/tblcmds/import.go index a94b16544cb..8ff1e1866aa 100644 --- a/go/cmd/dolt/commands/tblcmds/import.go +++ b/go/cmd/dolt/commands/tblcmds/import.go @@ -91,11 +91,20 @@ var jsonInputFileHelp = "The expected JSON input file format is:" + ` where column_name is the name of a column of the table being imported and value is the data for that column in the table. ` +var jsonlInputFileHelp = "The expected JSONL input file format is:" + ` + + {"column_name":"value", ...} + {"column_name":"value", ...} + ... + +where each line is a JSON object representing a row. 
+` + var importDocs = cli.CommandDocumentationContent{ ShortDesc: `Imports data into a dolt table`, LongDesc: `If {{.EmphasisLeft}}--create-table | -c{{.EmphasisRight}} is given the operation will create {{.LessThan}}table{{.GreaterThan}} and import the contents of file into it. If a table already exists at this location then the operation will fail, unless the {{.EmphasisLeft}}--force | -f{{.EmphasisRight}} flag is provided. The force flag forces the existing table to be overwritten. -The schema for the new table can be specified explicitly by providing a SQL schema definition file, or will be inferred from the imported file. All schemas, inferred or explicitly defined must define a primary key. If the file format being imported does not support defining a primary key, then the {{.EmphasisLeft}}--pk{{.EmphasisRight}} parameter must supply the name of the field that should be used as the primary key. If no primary key is explicitly defined, the first column in the import file will be used as the primary key. +The schema for the new table can be specified explicitly by providing a SQL schema definition file, or may be inferred from the imported file (depending on file type). All schemas, inferred or explicitly defined must define a primary key. If the file format being imported does not support defining a primary key, then the {{.EmphasisLeft}}--pk{{.EmphasisRight}} parameter must supply the name of the field that should be used as the primary key. If no primary key is explicitly defined, the first column in the import file will be used as the primary key. For {{.EmphasisLeft}}json{{.EmphasisRight}}, {{.EmphasisLeft}}jsonl{{.EmphasisRight}}, and {{.EmphasisLeft}}parquet{{.EmphasisRight}} create operations, a schema file must be provided with {{.EmphasisLeft}}--schema{{.EmphasisRight}}. If {{.EmphasisLeft}}--update-table | -u{{.EmphasisRight}} is given the operation will update {{.LessThan}}table{{.GreaterThan}} with the contents of file. 
The table's existing schema will be used, and field names will be used to match file fields with table fields unless a mapping file is specified. @@ -113,7 +122,9 @@ During import, if there is an error importing any row, the import will be aborte ` ` + jsonInputFileHelp + ` -In create, update, and replace scenarios the file's extension is used to infer the type of the file. If a file does not have the expected extension then the {{.EmphasisLeft}}--file-type{{.EmphasisRight}} parameter should be used to explicitly define the format of the file in one of the supported formats (csv, psv, json, xlsx). For files separated by a delimiter other than a ',' (type csv) or a '|' (type psv), the --delim parameter can be used to specify a delimiter`, +` + jsonlInputFileHelp + + ` + In create, update, and replace scenarios the file's extension is used to infer the type of the file. If a file does not have the expected extension then the {{.EmphasisLeft}}--file-type{{.EmphasisRight}} parameter should be used to explicitly define the format of the file in one of the supported formats (csv, psv, json, jsonl, xlsx, parquet). 
For files separated by a delimiter other than a ',' (type csv) or a '|' (type psv), the --delim parameter can be used to specify a delimiter`, Synopsis: []string{ "-c [-f] [--pk {{.LessThan}}field{{.GreaterThan}}] [--all-text] [--schema {{.LessThan}}file{{.GreaterThan}}] [--map {{.LessThan}}file{{.GreaterThan}}] [--continue] [--quiet] [--disable-fk-checks] [--file-type {{.LessThan}}type{{.GreaterThan}}] [--no-header] [--columns {{.LessThan}}col1,col2,...{{.GreaterThan}}] {{.LessThan}}table{{.GreaterThan}} {{.LessThan}}file{{.GreaterThan}}", @@ -226,7 +237,7 @@ func getImportMoveOptions(ctx *sql.Context, apr *argparser.ArgParseResults, dEnv } else if val.Format == mvdata.XlsxFile { // table name must match sheet name currently srcOpts = mvdata.XlsxOptions{SheetName: tableName} - } else if val.Format == mvdata.JsonFile { + } else if val.Format == mvdata.JsonFile || val.Format == mvdata.JsonlFile { opts := mvdata.JSONOptions{TableName: tableName, SchFile: schemaFile} if schemaFile != "" { opts.SqlCtx = ctx @@ -371,8 +382,8 @@ func validateImportArgs(apr *argparser.ArgParseResults) errhand.VerboseError { } _, hasSchema := apr.GetValue(schemaParam) - if srcFileLoc.Format == mvdata.JsonFile && apr.Contains(createParam) && !hasSchema { - return errhand.BuildDError("Please specify schema file for .json tables.").Build() + if (srcFileLoc.Format == mvdata.JsonFile || srcFileLoc.Format == mvdata.JsonlFile) && apr.Contains(createParam) && !hasSchema { + return errhand.BuildDError("Please specify schema file for .json/.jsonl tables.").Build() } else if srcFileLoc.Format == mvdata.ParquetFile && apr.Contains(createParam) && !hasSchema { return errhand.BuildDError("Please specify schema file for .parquet tables.").Build() } diff --git a/go/libraries/doltcore/mvdata/data_loc.go b/go/libraries/doltcore/mvdata/data_loc.go index e1a5176497e..098d7d36c52 100644 --- a/go/libraries/doltcore/mvdata/data_loc.go +++ b/go/libraries/doltcore/mvdata/data_loc.go @@ -52,6 +52,9 @@ const ( // 
JsonFile is the format of a data location that is a json file JsonFile DataFormat = ".json" + // JsonlFile is the format of a data location that is a jsonl file + JsonlFile DataFormat = ".jsonl" + // SqlFile is the format of a data location that is a .sql file SqlFile DataFormat = ".sql" @@ -72,6 +75,8 @@ func (df DataFormat) ReadableStr() string { return "xlsx file" case JsonFile: return "json file" + case JsonlFile: + return "jsonl file" case SqlFile: return "sql file" case ParquetFile: @@ -116,6 +121,8 @@ func NewDataLocation(path, fileFmtStr string) DataLocation { dataFmt = XlsxFile case string(JsonFile): dataFmt = JsonFile + case string(JsonlFile): + dataFmt = JsonlFile case string(SqlFile): dataFmt = SqlFile case string(ParquetFile): diff --git a/go/libraries/doltcore/mvdata/data_loc_test.go b/go/libraries/doltcore/mvdata/data_loc_test.go index 436be4b71cf..0e1280ad696 100644 --- a/go/libraries/doltcore/mvdata/data_loc_test.go +++ b/go/libraries/doltcore/mvdata/data_loc_test.go @@ -69,6 +69,8 @@ func TestBasics(t *testing.T) { {NewDataLocation("file.csv", ""), CsvFile.ReadableStr() + ":file.csv", true}, {NewDataLocation("file.psv", ""), PsvFile.ReadableStr() + ":file.psv", true}, {NewDataLocation("file.json", ""), JsonFile.ReadableStr() + ":file.json", true}, + {NewDataLocation("file.jsonl", ""), JsonlFile.ReadableStr() + ":file.jsonl", true}, + {NewDataLocation("file.ignored", "jsonl"), JsonlFile.ReadableStr() + ":file.ignored", true}, // {NewDataLocation("file.nbf", ""), NbfFile, "file.nbf", true}, } @@ -87,6 +89,7 @@ func TestExists(t *testing.T) { NewDataLocation("file.csv", ""), NewDataLocation("file.psv", ""), NewDataLocation("file.json", ""), + NewDataLocation("file.jsonl", ""), // NewDataLocation("file.nbf", ""), } diff --git a/go/libraries/doltcore/mvdata/file_data_loc.go b/go/libraries/doltcore/mvdata/file_data_loc.go index eb3c3d68a20..d85260dbe87 100644 --- a/go/libraries/doltcore/mvdata/file_data_loc.go +++ 
b/go/libraries/doltcore/mvdata/file_data_loc.go
@@ -47,6 +47,8 @@ func DFFromString(dfStr string) DataFormat {
 		return XlsxFile
 	case "json", ".json":
 		return JsonFile
+	case "jsonl", ".jsonl":
+		return JsonlFile
 	case "sql", ".sql":
 		return SqlFile
 	case "parquet", ".parquet":
@@ -110,37 +112,23 @@ func (dl FileDataLocation) NewReader(ctx context.Context, dEnv *env.DoltEnv, opt
 		return rd, false, err
 
 	case JsonFile:
-		var sch schema.Schema
-		jsonOpts, _ := opts.(JSONOptions)
-		if jsonOpts.SchFile != "" {
-			tn, s, err := SchAndTableNameFromFile(jsonOpts.SqlCtx, jsonOpts.SchFile, dEnv.FS, root, jsonOpts.Engine)
-			if err != nil {
-				return nil, false, err
-			}
-			if tn != jsonOpts.TableName {
-				return nil, false, fmt.Errorf("table name '%s' from schema file %s does not match table arg '%s'", tn, jsonOpts.SchFile, jsonOpts.TableName)
-			}
-			sch = s
-		} else {
-			if opts == nil {
-				return nil, false, errors.New("Unable to determine table name on JSON import")
-			}
-			tbl, exists, err := root.GetTable(context.TODO(), doltdb.TableName{Name: jsonOpts.TableName})
-			if !exists {
-				return nil, false, fmt.Errorf("The following table could not be found:\n%v", jsonOpts.TableName)
-			}
-			if err != nil {
-				return nil, false, fmt.Errorf("An error occurred attempting to read the table:\n%v", err.Error())
-			}
-			sch, err = tbl.GetSchema(context.TODO())
-			if err != nil {
-				return nil, false, fmt.Errorf("An error occurred attempting to read the table schema:\n%v", err.Error())
-			}
+		sch, err := resolveJSONSchema(dEnv, root, opts)
+		if err != nil {
+			return nil, false, err
 		}
 
 		rd, err := json.OpenJSONReader(root.VRW(), dl.Path, fs, sch)
 		return rd, false, err
 
+	case JsonlFile:
+		sch, err := resolveJSONSchema(dEnv, root, opts)
+		if err != nil {
+			return nil, false, err
+		}
+
+		rd, err := json.OpenJSONLReader(root.VRW(), dl.Path, fs, sch)
+		return rd, false, err
+
 	case ParquetFile:
 		var tableSch schema.Schema
 		parquetOpts, _ := opts.(ParquetOptions)
@@ -176,6 +164,54 @@ func (dl FileDataLocation) NewReader(ctx context.Context, dEnv *env.DoltEnv, opt
 	return nil, false, errors.New("unsupported format")
 }
 
+// resolveJSONSchema returns the schema to use when reading a JSON or JSONL file. If |opts| names a schema file, the
+// schema is parsed from that file and its table name must match the table being imported. Otherwise the schema of
+// the existing table named in |opts| is read from |root|. |opts| must be a non-nil JSONOptions.
+func resolveJSONSchema(dEnv *env.DoltEnv, root doltdb.RootValue, opts interface{}) (schema.Schema, error) {
+	if opts == nil {
+		return nil, errors.New("Unable to determine table name on JSON import")
+	}
+
+	jsonOpts, ok := opts.(JSONOptions)
+	if !ok {
+		return nil, fmt.Errorf("invalid JSON import options: expected mvdata.JSONOptions, got %T", opts)
+	}
+
+	if jsonOpts.SchFile != "" {
+		tn, s, err := SchAndTableNameFromFile(jsonOpts.SqlCtx, jsonOpts.SchFile, dEnv.FS, root, jsonOpts.Engine)
+		if err != nil {
+			return nil, err
+		}
+		if tn != jsonOpts.TableName {
+			return nil, fmt.Errorf("table name '%s' from schema file %s does not match table arg '%s'", tn, jsonOpts.SchFile, jsonOpts.TableName)
+		}
+		return s, nil
+	}
+
+	// The import command only populates SqlCtx when a schema file is given, so it is normally nil on this path.
+	// Fall back to a plain context instead of passing a nil context into the table lookup.
+	ctx := context.TODO()
+	if jsonOpts.SqlCtx != nil {
+		ctx = jsonOpts.SqlCtx
+	}
+
+	tbl, exists, err := root.GetTable(ctx, doltdb.TableName{Name: jsonOpts.TableName})
+	// Check the error before |exists| so that a failed lookup is not misreported as a missing table.
+	if err != nil {
+		return nil, fmt.Errorf("An error occurred attempting to read the table:\n%v", err.Error())
+	}
+	if !exists {
+		return nil, fmt.Errorf("The following table could not be found:\n%v", jsonOpts.TableName)
+	}
+
+	sch, err := tbl.GetSchema(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("An error occurred attempting to read the table schema:\n%v", err.Error())
+	}
+
+	return sch, nil
+}
+
 // NewCreatingWriter will create a TableWriteCloser for a DataLocation that will create a new table, or overwrite
 // an existing table.
 func (dl FileDataLocation) NewCreatingWriter(ctx context.Context, mvOpts DataMoverOptions, root doltdb.RootValue, outSch schema.Schema, opts editor.Options, wr io.WriteCloser) (table.SqlRowWriter, error) {
@@ -188,6 +224,10 @@ func (dl FileDataLocation) NewCreatingWriter(ctx context.Context, mvOpts DataMov
 		panic("writing to xlsx files is not supported yet")
 	case JsonFile:
 		return json.NewJSONWriter(wr, outSch)
+	case JsonlFile:
+		// JSONL is newline-delimited JSON objects, one object per row.
+		// We reuse the existing JSON export writer to ensure identical row serialization.
+		return json.NewJSONWriterWithHeader(wr, outSch, "", "\n", "\n")
 	case SqlFile:
 		if mvOpts.IsBatched() {
 			return sqlexport.OpenBatchedSQLExportWriter(ctx, wr, root, mvOpts.SrcName(), mvOpts.IsAutocommitOff(), outSch, opts)
diff --git a/go/libraries/doltcore/table/typed/json/jsonl_reader.go b/go/libraries/doltcore/table/typed/json/jsonl_reader.go
new file mode 100644
index 00000000000..dcde8a68e43
--- /dev/null
+++ b/go/libraries/doltcore/table/typed/json/jsonl_reader.go
@@ -0,0 +1,199 @@
+// Copyright 2026 Dolthub, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package json
+
+import (
+	"bufio"
+	"context"
+	stdjson "encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+
+	"github.com/dolthub/go-mysql-server/sql"
+	"golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/transform"
+
+	"github.com/dolthub/dolt/go/libraries/doltcore/row"
+	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
+	"github.com/dolthub/dolt/go/libraries/doltcore/table"
+	"github.com/dolthub/dolt/go/libraries/utils/filesys"
+	"github.com/dolthub/dolt/go/libraries/utils/iohelp"
+	"github.com/dolthub/dolt/go/store/types"
+)
+
+// JSONLReader reads newline-delimited JSON objects, one per line. Blank lines are skipped, and every key of an
+// object must name a column of the reader's schema.
+type JSONLReader struct {
+	vrw       types.ValueReadWriter
+	closer    io.Closer
+	sch       schema.Schema
+	bRd       *bufio.Reader
+	numLine   int     // count of lines read so far, blank lines included; reported in error messages
+	sampleRow sql.Row // row read ahead by VerifySchema, handed back by the next ReadSqlRow
+}
+
+var _ table.SqlTableReader = (*JSONLReader)(nil)
+
+// OpenJSONLReader opens |path| on |fs| and returns a JSONLReader over its contents. Rows are decoded against |sch|,
+// which must not be nil.
+func OpenJSONLReader(vrw types.ValueReadWriter, path string, fs filesys.ReadableFS, sch schema.Schema) (*JSONLReader, error) {
+	r, err := fs.OpenForRead(path)
+	if err != nil {
+		return nil, err
+	}
+
+	rd, err := NewJSONLReader(vrw, r, sch)
+	if err != nil {
+		// No reader took ownership of the file, so it has to be closed here to avoid leaking the handle.
+		_ = r.Close()
+		return nil, err
+	}
+
+	return rd, nil
+}
+
+// NewJSONLReader creates a JSONL reader. The bytes of the supplied reader are treated as UTF-8. If there is a UTF8,
+// UTF16LE or UTF16BE BOM at the first bytes read, then it is stripped and the remaining contents of the reader are
+// treated as that encoding.
+func NewJSONLReader(vrw types.ValueReadWriter, r io.ReadCloser, sch schema.Schema) (*JSONLReader, error) {
+	if sch == nil {
+		return nil, errors.New("schema must be provided to JSONLReader")
+	}
+
+	textReader := transform.NewReader(r, unicode.BOMOverride(unicode.UTF8.NewDecoder()))
+	br := bufio.NewReaderSize(textReader, ReadBufSize)
+
+	return &JSONLReader{
+		vrw:    vrw,
+		closer: r,
+		sch:    sch,
+		bRd:    br,
+	}, nil
+}
+
+// Close closes the underlying reader. Calls after the first return nil without doing anything.
+func (r *JSONLReader) Close(ctx context.Context) error {
+	if r.closer == nil {
+		return nil
+	}
+
+	err := r.closer.Close()
+	r.closer = nil
+	return err
+}
+
+// GetSchema returns the schema that rows are decoded against.
+func (r *JSONLReader) GetSchema() schema.Schema {
+	return r.sch
+}
+
+// VerifySchema reports whether a first row can be read and decoded against the reader's own schema; the |sch|
+// argument is not consulted. The row that was read is buffered and returned by the next call to ReadSqlRow. An
+// input with no rows yields (false, nil).
+func (r *JSONLReader) VerifySchema(sch schema.Schema) (bool, error) {
+	if r.sampleRow != nil {
+		return true, nil
+	}
+
+	first, err := r.ReadSqlRow(context.Background())
+	if err == io.EOF {
+		return false, nil
+	}
+	if err != nil {
+		return false, err
+	}
+
+	r.sampleRow = first
+	return true, nil
+}
+
+// ReadRow is not supported by this reader and always panics; use ReadSqlRow instead.
+func (r *JSONLReader) ReadRow(ctx context.Context) (row.Row, error) {
+	panic("deprecated")
+}
+
+// ReadSqlRow returns the next row, or io.EOF once the input is exhausted. A row buffered by VerifySchema is returned
+// first. Blank lines are skipped. A line that is not valid JSON, is not a JSON object, or cannot be converted to the
+// schema's column types produces an error that includes its line number.
+func (r *JSONLReader) ReadSqlRow(ctx context.Context) (sql.Row, error) {
+	if r.sampleRow != nil {
+		ret := r.sampleRow
+		r.sampleRow = nil
+		return ret, nil
+	}
+
+	for {
+		line, done, err := iohelp.ReadLine(r.bRd)
+		if err != nil {
+			return nil, err
+		}
+		if done && line == "" {
+			return nil, io.EOF
+		}
+		r.numLine++
+
+		line = strings.TrimSpace(line)
+		if line == "" {
+			// Nothing but whitespace: stop if ReadLine reported done, otherwise skip the line.
+			if done {
+				return nil, io.EOF
+			}
+			continue
+		}
+
+		var val any
+		if err := stdjson.Unmarshal([]byte(line), &val); err != nil {
+			return nil, fmt.Errorf("invalid JSON at line %d: %w", r.numLine, err)
+		}
+
+		mapVal, ok := val.(map[string]any)
+		if !ok {
+			return nil, fmt.Errorf("expected JSON object at line %d", r.numLine)
+		}
+
+		row, err := r.convToSqlRow(ctx, mapVal)
+		if err != nil {
+			return nil, fmt.Errorf("error converting JSONL row at line %d: %w", r.numLine, err)
+		}
+
+		return row, nil
+	}
+}
+
+// convToSqlRow converts a decoded JSON object into a row, placing each value at its column's index in the schema.
+// Keys that do not name a column are an error; columns missing from the object are left nil.
+func (r *JSONLReader) convToSqlRow(ctx context.Context, rowMap map[string]interface{}) (sql.Row, error) {
+	allCols := r.sch.GetAllCols()
+
+	ret := make(sql.Row, allCols.Size())
+	for k, v := range rowMap {
+		col, ok := allCols.GetByName(k)
+		if !ok {
+			return nil, fmt.Errorf("column %s not found in schema", k)
+		}
+
+		v, _, err := col.TypeInfo.ToSqlType().Convert(ctx, v)
+		if err != nil {
+			return nil, err
+		}
+
+		idx := allCols.TagToIdx[col.Tag]
+		ret[idx] = v
+	}
+
+	return ret, nil
+}
diff --git a/go/libraries/doltcore/table/typed/json/jsonl_reader_test.go b/go/libraries/doltcore/table/typed/json/jsonl_reader_test.go
new file mode 100644
index 00000000000..4b0f16a1cca
--- /dev/null
+++ b/go/libraries/doltcore/table/typed/json/jsonl_reader_test.go
@@ -0,0 +1,188 @@
+// Copyright 2026 Dolthub, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package json
+
+import (
+	"bytes"
+	"context"
+	"io"
+	"os"
+	"testing"
+
+	"github.com/dolthub/go-mysql-server/enginetest"
+	"github.com/dolthub/go-mysql-server/sql"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/transform"
+
+	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
+	"github.com/dolthub/dolt/go/libraries/doltcore/schema/typeinfo"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/sqlutil"
+	"github.com/dolthub/dolt/go/libraries/utils/filesys"
+	"github.com/dolthub/dolt/go/store/types"
+)
+
+// testGoodJSONL reads every row from the reader built by |getReader| over a three-column schema (id, first name,
+// last name) and asserts that exactly the rows (0, tim, sehn) and (1, brian, hendriks) come back.
+func testGoodJSONL(t *testing.T, getReader func(types.ValueReadWriter, schema.Schema) (sqlTableReaderWithVerify, error)) {
+	cols := schema.NewColCollection(
+		schema.Column{
+			Name:       "id",
+			Tag:        0,
+			Kind:       types.IntKind,
+			IsPartOfPK: true,
+			TypeInfo:   typeinfo.Int64Type,
+		},
+		schema.Column{
+			Name:       "first name",
+			Tag:        1,
+			Kind:       types.StringKind,
+			IsPartOfPK: false,
+			TypeInfo:   typeinfo.StringDefaultType,
+		},
+		schema.Column{
+			Name:       "last name",
+			Tag:        2,
+			Kind:       types.StringKind,
+			IsPartOfPK: false,
+			TypeInfo:   typeinfo.StringDefaultType,
+		},
+	)
+
+	sch, err := schema.SchemaFromCols(cols)
+	require.NoError(t, err)
+
+	sqlSch, err := sqlutil.FromDoltSchema("", "", sch)
+	require.NoError(t, err)
+
+	rd, err := getReader(types.NewMemoryValueStore(), sch)
+	require.NoError(t, err)
+
+	ok, err := rd.VerifySchema(sch)
+	require.NoError(t, err)
+	assert.True(t, ok)
+
+	var got []sql.Row
+	for {
+		next, err := rd.ReadSqlRow(context.Background())
+		if err == io.EOF {
+			break
+		}
+		require.NoError(t, err)
+		got = append(got, next)
+	}
+
+	want := []sql.Row{
+		{0, "tim", "sehn"},
+		{1, "brian", "hendriks"},
+	}
+
+	assert.Equal(t, enginetest.WidenRows(t, sqlSch.Schema, want), got)
+}
+
+// sqlTableReaderWithVerify is the subset of reader methods that testGoodJSONL exercises.
+type sqlTableReaderWithVerify interface {
+	ReadSqlRow(ctx context.Context) (sql.Row, error)
+	VerifySchema(sch schema.Schema) (bool, error)
+}
+
+func TestJSONLReader(t *testing.T) {
+	data := `{"id":0,"first name":"tim","last name":"sehn"}
+{"id":1,"first name":"brian","last name":"hendriks"}`
+
+	memFS := filesys.EmptyInMemFS("/")
+	require.NoError(t, memFS.WriteFile("file.jsonl", []byte(data), os.ModePerm))
+
+	testGoodJSONL(t, func(vrw types.ValueReadWriter, sch schema.Schema) (sqlTableReaderWithVerify, error) {
+		return OpenJSONLReader(vrw, "file.jsonl", memFS, sch)
+	})
+}
+
+func TestJSONLReaderSkipsBlankLines(t *testing.T) {
+	data := `{"id":0,"first name":"tim","last name":"sehn"}
+
+
+{"id":1,"first name":"brian","last name":"hendriks"}
+`
+
+	memFS := filesys.EmptyInMemFS("/")
+	require.NoError(t, memFS.WriteFile("file.jsonl", []byte(data), os.ModePerm))
+
+	testGoodJSONL(t, func(vrw types.ValueReadWriter, sch schema.Schema) (sqlTableReaderWithVerify, error) {
+		return OpenJSONLReader(vrw, "file.jsonl", memFS, sch)
+	})
+}
+
+// TestJSONLReaderBadJsonIncludesLineNumber checks that a malformed line is reported with its 1-based line number.
+func TestJSONLReaderBadJsonIncludesLineNumber(t *testing.T) {
+	data := `{"id":0,"first name":"tim","last name":"sehn"}
+bad
+{"id":1,"first name":"brian","last name":"hendriks"}`
+
+	memFS := filesys.EmptyInMemFS("/")
+	require.NoError(t, memFS.WriteFile("file.jsonl", []byte(data), os.ModePerm))
+
+	cols := schema.NewColCollection(
+		schema.Column{Name: "id", Tag: 0, Kind: types.IntKind, IsPartOfPK: true, TypeInfo: typeinfo.Int64Type},
+		schema.Column{Name: "first name", Tag: 1, Kind: types.StringKind, IsPartOfPK: false, TypeInfo: typeinfo.StringDefaultType},
+		schema.Column{Name: "last name", Tag: 2, Kind: types.StringKind, IsPartOfPK: false, TypeInfo: typeinfo.StringDefaultType},
+	)
+	sch, err := schema.SchemaFromCols(cols)
+	require.NoError(t, err)
+
+	rd, err := OpenJSONLReader(types.NewMemoryValueStore(), "file.jsonl", memFS, sch)
+	require.NoError(t, err)
+
+	_, err = rd.ReadSqlRow(context.Background())
+	require.NoError(t, err)
+
+	_, err = rd.ReadSqlRow(context.Background())
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "invalid JSON at line 2")
+}
+
+// TestJSONLReaderRejectsNonObject checks that a line holding a JSON array rather than an object is an error.
+func TestJSONLReaderRejectsNonObject(t *testing.T) {
+	data := `[]`
+
+	memFS := filesys.EmptyInMemFS("/")
+	require.NoError(t, memFS.WriteFile("file.jsonl", []byte(data), os.ModePerm))
+
+	cols := schema.NewColCollection(
+		schema.Column{Name: "id", Tag: 0, Kind: types.IntKind, IsPartOfPK: true, TypeInfo: typeinfo.Int64Type},
+	)
+	sch, err := schema.SchemaFromCols(cols)
+	require.NoError(t, err)
+
+	rd, err := OpenJSONLReader(types.NewMemoryValueStore(), "file.jsonl", memFS, sch)
+	require.NoError(t, err)
+
+	_, err = rd.ReadSqlRow(context.Background())
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "expected JSON object at line 1")
+}
+
+func TestJSONLReaderBOMHandlingUTF8(t *testing.T) {
+	data := `{"id":0,"first name":"tim","last name":"sehn"}
+{"id":1,"first name":"brian","last name":"hendriks"}`
+
+	buf := bytes.NewBuffer([]byte(data))
+	withBOM := transform.NewReader(buf, unicode.UTF8BOM.NewEncoder())
+
+	testGoodJSONL(t, func(vrw types.ValueReadWriter, sch schema.Schema) (sqlTableReaderWithVerify, error) {
+		return NewJSONLReader(vrw, io.NopCloser(withBOM), sch)
+	})
+}
diff --git a/go/libraries/doltcore/table/typed/json/writer_jsonl_test.go b/go/libraries/doltcore/table/typed/json/writer_jsonl_test.go
new file mode 100644
index 00000000000..64b139a95dc
--- /dev/null
+++ b/go/libraries/doltcore/table/typed/json/writer_jsonl_test.go
@@ -0,0 +1,104 @@
+// Copyright 2026 Dolthub, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package json + +import ( + "bytes" + "context" + stdjson "encoding/json" + "strings" + "testing" + + "github.com/dolthub/go-mysql-server/sql" + gmstypes "github.com/dolthub/go-mysql-server/sql/types" + "github.com/stretchr/testify/require" + + "github.com/dolthub/dolt/go/libraries/doltcore/schema" + "github.com/dolthub/dolt/go/libraries/doltcore/schema/typeinfo" + "github.com/dolthub/dolt/go/libraries/utils/iohelp" + "github.com/dolthub/dolt/go/store/types" +) + +func TestJSONLWriterViaHeaderConfig(t *testing.T) { + colColl := schema.NewColCollection( + schema.Column{ + Name: "id", + Tag: 0, + Kind: types.IntKind, + IsPartOfPK: true, + TypeInfo: typeinfo.Int64Type, + }, + schema.Column{ + Name: "payload", + Tag: 1, + Kind: types.JSONKind, + IsPartOfPK: false, + TypeInfo: typeinfo.JSONType, + }, + ) + sch, err := schema.SchemaFromCols(colColl) + require.NoError(t, err) + + var buf bytes.Buffer + wr, err := NewJSONWriterWithHeader(iohelp.NopWrCloser(&buf), sch, "", "\n", "\n") + require.NoError(t, err) + + sqlCtx := sql.NewEmptyContext() + require.NoError(t, wr.WriteSqlRow(sqlCtx, sql.Row{int64(0), gmstypes.MustJSON(`{"a": 1}`)})) + require.NoError(t, wr.WriteSqlRow(sqlCtx, sql.Row{int64(1), gmstypes.MustJSON(`[1, 2]`)})) + require.NoError(t, wr.Close(context.Background())) + + out := buf.String() + require.True(t, strings.HasSuffix(out, "\n")) + + lines := strings.Split(strings.TrimSuffix(out, "\n"), "\n") + require.Len(t, lines, 2) + + var row0 map[string]any + require.NoError(t, stdjson.Unmarshal([]byte(lines[0]), &row0)) + require.Equal(t, 
float64(0), row0["id"]) + + payload0, ok := row0["payload"].(map[string]any) + require.True(t, ok) + require.Equal(t, float64(1), payload0["a"]) + + var row1 map[string]any + require.NoError(t, stdjson.Unmarshal([]byte(lines[1]), &row1)) + require.Equal(t, float64(1), row1["id"]) + + payload1, ok := row1["payload"].([]any) + require.True(t, ok) + require.Equal(t, []any{float64(1), float64(2)}, payload1) +} + +func TestJSONLWriterEmptyWritesNothing(t *testing.T) { + colColl := schema.NewColCollection( + schema.Column{ + Name: "id", + Tag: 0, + Kind: types.IntKind, + IsPartOfPK: true, + TypeInfo: typeinfo.Int64Type, + }, + ) + sch, err := schema.SchemaFromCols(colColl) + require.NoError(t, err) + + var buf bytes.Buffer + wr, err := NewJSONWriterWithHeader(iohelp.NopWrCloser(&buf), sch, "", "\n", "\n") + require.NoError(t, err) + require.NoError(t, wr.Close(context.Background())) + require.Equal(t, "", buf.String()) +} diff --git a/integration-tests/bats/export-tables.bats b/integration-tests/bats/export-tables.bats index 4eccc3c9786..0ca623eee23 100644 --- a/integration-tests/bats/export-tables.bats +++ b/integration-tests/bats/export-tables.bats @@ -51,6 +51,14 @@ SQL dolt table export test test.json run cat test.json [ "$output" = '{"rows": [{"pk":1,"v1":"2020-04-08","v2":"11:11:11","v3":2020,"v4":"2020-04-08 11:11:11"},{"pk":2,"v1":"2020-04-08","v2":"12:12:12","v3":2020,"v4":"2020-04-08 12:12:12"}]}' ] + + dolt table export test test.jsonl + run wc -l test.jsonl + [ "$status" -eq 0 ] + [[ "$output" =~ "2 test.jsonl" ]] || false + run cat test.jsonl + [ "${lines[0]}" = '{"pk":1,"v1":"2020-04-08","v2":"11:11:11","v3":2020,"v4":"2020-04-08 11:11:11"}' ] + [ "${lines[1]}" = '{"pk":2,"v1":"2020-04-08","v2":"12:12:12","v3":2020,"v4":"2020-04-08 12:12:12"}' ] } @test "export-tables: dolt table import from stdin export to stdout" { @@ -82,6 +90,76 @@ if rows[2] != "9,8,7,6,5,4".split(","): ' } +@test "export-tables: table export jsonl edge cases" { + skiponwindows "Need 
to install python before this test will work." + + dolt sql < employees.jsonl +{"id":0,"first name":"tim","last name":"sehn","title":"ceo","start date":"","end date":""} +{"id":1,"first name":"aaron","last name":"son","title":"founder","start date":"","end date":""} +{"id":2,"first name":"brian","last name":"hendricks","title":"founder","start date":"","end date":""} +JSONL + + run dolt table import -c -s `batshelper employees-sch.sql` employees employees.jsonl + [ "$status" -eq 0 ] + [[ "$output" =~ "Import completed successfully." ]] || false + run dolt ls + [ "$status" -eq 0 ] + [[ "$output" =~ "employees" ]] || false + run dolt sql -q "select * from employees" + [ "$status" -eq 0 ] + [[ "$output" =~ "tim" ]] || false + [ "${#lines[@]}" -eq 7 ] +} + @test "import-create-tables: create a table with json import, utf8 with bom" { run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.utf8bom.json` echo "$output" @@ -116,7 +135,7 @@ teardown() { @test "import-create-tables: create a table with json import. no schema." { run dolt table import -c employees `batshelper employees-tbl.json` [ "$status" -ne 0 ] - [ "$output" = "Please specify schema file for .json tables." ] + [ "$output" = "Please specify schema file for .json/.jsonl tables." ] } @test "import-create-tables: create a table with json data import. bad json data." {