diff --git a/go/cmd/dolt/commands/tblcmds/import.go b/go/cmd/dolt/commands/tblcmds/import.go index 59adf469451..464d6af8cee 100644 --- a/go/cmd/dolt/commands/tblcmds/import.go +++ b/go/cmd/dolt/commands/tblcmds/import.go @@ -378,6 +378,56 @@ func validateImportArgs(apr *argparser.ArgParseResults) errhand.VerboseError { return nil } +// validatePrimaryKeysAgainstSchema checks if all provided primary keys exist in the reader's schema +// considering any name mapping that may be applied +func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Schema, nameMapper rowconv.NameMapper) error { + if len(primaryKeys) == 0 { + return nil + } + + cols := rdSchema.GetAllCols() + var missingKeys []string + + for _, pk := range primaryKeys { + // First check if the primary key exists directly in the schema + if _, ok := cols.GetByName(pk); ok { + continue + } + + // If not found directly, check if any column maps to this primary key name + found := false + cols.Iter(func(tag uint64, col schema.Column) (stop bool, err error) { + if nameMapper.Map(col.Name) == pk { + found = true + return true, nil + } + return false, nil + }) + + if !found { + missingKeys = append(missingKeys, pk) + } + } + + if len(missingKeys) > 0 { + // Build list of available columns for helpful error message + var availableCols []string + cols.Iter(func(tag uint64, col schema.Column) (bool, error) { + availableCols = append(availableCols, col.Name) + return false, nil + }) + + if len(missingKeys) == 1 { + return fmt.Errorf("primary key '%s' not found in import file. Available columns: %s", + missingKeys[0], strings.Join(availableCols, ", ")) + } + return fmt.Errorf("primary keys %v not found in import file. Available columns: %s", + missingKeys, strings.Join(availableCols, ", ")) + } + + return nil +} + type ImportCmd struct{} // Name is returns the name of the Dolt cli command. 
This is what is used on the command line to invoke the command @@ -529,6 +579,15 @@ func newImportDataReader(ctx context.Context, root doltdb.RootValue, dEnv *env.D return nil, &mvdata.DataMoverCreationError{ErrType: mvdata.CreateReaderErr, Cause: err} } + // Validate primary keys early for create operations, so that we return validation errors early + if impOpts.operation == mvdata.CreateOp && len(impOpts.primaryKeys) > 0 && impOpts.schFile == "" { + rdSchema := rd.GetSchema() + if err := validatePrimaryKeysAgainstSchema(impOpts.primaryKeys, rdSchema, impOpts.nameMapper); err != nil { + rd.Close(ctx) + return nil, &mvdata.DataMoverCreationError{ErrType: mvdata.SchemaErr, Cause: err} + } + } + return rd, nil } diff --git a/go/cmd/dolt/commands/tblcmds/import_test.go b/go/cmd/dolt/commands/tblcmds/import_test.go new file mode 100644 index 00000000000..09ceaa64a14 --- /dev/null +++ b/go/cmd/dolt/commands/tblcmds/import_test.go @@ -0,0 +1,191 @@ +// Copyright 2025 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tblcmds + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/dolthub/dolt/go/libraries/doltcore/rowconv" + "github.com/dolthub/dolt/go/libraries/doltcore/schema" +) + +func TestValidatePrimaryKeysAgainstSchema(t *testing.T) { + // Create a test schema with columns: id, name, email, age + testCols := []schema.Column{ + {Name: "id", Tag: 0, IsPartOfPK: true}, + {Name: "name", Tag: 1, IsPartOfPK: false}, + {Name: "email", Tag: 2, IsPartOfPK: false}, + {Name: "age", Tag: 3, IsPartOfPK: false}, + } + testSchema, err := schema.NewSchema( + schema.NewColCollection(testCols...), + nil, // pkOrdinals - nil means use default ordinals based on IsPartOfPK + schema.Collation_Default, + nil, // indexes + nil, // checks + ) + require.NoError(t, err) + + tests := []struct { + name string + primaryKeys []string + schema schema.Schema + expectError bool + errorContains string + }{ + { + name: "valid single primary key", + primaryKeys: []string{"id"}, + schema: testSchema, + expectError: false, + }, + { + name: "valid multiple primary keys", + primaryKeys: []string{"id", "name"}, + schema: testSchema, + expectError: false, + }, + { + name: "empty primary keys", + primaryKeys: []string{}, + schema: testSchema, + expectError: false, + }, + { + name: "invalid single primary key", + primaryKeys: []string{"invalid_col"}, + schema: testSchema, + expectError: true, + errorContains: "primary key 'invalid_col' not found in import file. Available columns: id, name, email, age", + }, + { + name: "mix of valid and invalid primary keys", + primaryKeys: []string{"id", "invalid_col1", "name", "invalid_col2"}, + schema: testSchema, + expectError: true, + errorContains: "primary keys [invalid_col1 invalid_col2] not found in import file. 
Available columns: id, name, email, age", + }, + { + name: "all invalid primary keys", + primaryKeys: []string{"col1", "col2", "col3"}, + schema: testSchema, + expectError: true, + errorContains: "primary keys [col1 col2 col3] not found in import file. Available columns: id, name, email, age", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, tt.schema, rowconv.NameMapper{}) + + if tt.expectError { + assert.Error(t, err) + if tt.errorContains != "" { + assert.Contains(t, err.Error(), tt.errorContains) + } + } else { + assert.NoError(t, err) + } + }) + } +} + +func TestValidatePrimaryKeysAgainstSchemaColumnOrder(t *testing.T) { + // Test that available columns are listed in a consistent order + testCols := []schema.Column{ + {Name: "zebra", Tag: 3, IsPartOfPK: false}, + {Name: "alpha", Tag: 0, IsPartOfPK: true}, + {Name: "beta", Tag: 1, IsPartOfPK: false}, + {Name: "gamma", Tag: 2, IsPartOfPK: false}, + } + testSchema, err := schema.NewSchema( + schema.NewColCollection(testCols...), + nil, // pkOrdinals - nil means use default ordinals based on IsPartOfPK + schema.Collation_Default, + nil, // indexes + nil, // checks + ) + require.NoError(t, err) + + err = validatePrimaryKeysAgainstSchema([]string{"invalid"}, testSchema, rowconv.NameMapper{}) + assert.Error(t, err) + // The implementation returns a detailed error with available columns + assert.Contains(t, err.Error(), "primary key 'invalid' not found in import file. 
Available columns: zebra, alpha, beta, gamma") +} + +func TestValidatePrimaryKeysWithNameMapping(t *testing.T) { + // Create a test schema with columns: user_id, user_name, user_email + testCols := []schema.Column{ + {Name: "user_id", Tag: 0, IsPartOfPK: true}, + {Name: "user_name", Tag: 1, IsPartOfPK: false}, + {Name: "user_email", Tag: 2, IsPartOfPK: false}, + } + testSchema, err := schema.NewSchema( + schema.NewColCollection(testCols...), + nil, + schema.Collation_Default, + nil, + nil, + ) + require.NoError(t, err) + + // Create a name mapper that maps user_id -> id, user_name -> name, user_email -> email + nameMapper := rowconv.NameMapper{ + "user_id": "id", + "user_name": "name", + "user_email": "email", + } + + tests := []struct { + name string + primaryKeys []string + expectError bool + }{ + { + name: "primary key using mapped name", + primaryKeys: []string{"id"}, // mapped from user_id + expectError: false, + }, + { + name: "primary key using original name", + primaryKeys: []string{"user_id"}, // original column name + expectError: false, + }, + { + name: "multiple primary keys with mixed names", + primaryKeys: []string{"id", "name"}, // mapped names + expectError: false, + }, + { + name: "invalid primary key not in mapping", + primaryKeys: []string{"invalid_col"}, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, testSchema, nameMapper) + if tt.expectError { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + }) + } +} diff --git a/integration-tests/bats/import-create-tables.bats b/integration-tests/bats/import-create-tables.bats index 16c4c42baf7..6e571618c53 100755 --- a/integration-tests/bats/import-create-tables.bats +++ b/integration-tests/bats/import-create-tables.bats @@ -254,14 +254,16 @@ CSV run dolt table import -c -pk="batmansparents" test 1pk5col-ints.csv [ "$status" -eq 1 ] [[ "$output" =~ "Error determining the output 
schema." ]] || false - [[ "$output" =~ "column 'batmansparents' not found" ]] || false + [[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false + [[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false } @test "import-create-tables: try to table import with one valid and one nonexistent --pk arg" { run dolt table import -c -pk="pk,batmansparents" test 1pk5col-ints.csv [ "$status" -eq 1 ] [[ "$output" =~ "Error determining the output schema." ]] || false - [[ "$output" =~ "column 'batmansparents' not found" ]] || false + [[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false + [[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false } @test "import-create-tables: create a table with two primary keys from csv import" { @@ -947,4 +949,106 @@ DELIM [[ "$output" =~ '5,contains null,"[4,null]"' ]] || false [[ "$output" =~ '6,empty,[]' ]] || false +} + +# See: https://github.com/dolthub/dolt/issues/1083 +@test "import-create-tables: validate primary keys exist in CSV file" { + # Create a test CSV file + cat <<DELIM > test_pk_validation.csv +id,name,email,age +1,Alice,alice@example.com,30 +2,Bob,bob@example.com,25 +3,Charlie,charlie@example.com,35 +DELIM + + # Test 1: Invalid single primary key should fail immediately + run dolt table import -c --pk "invalid_column" test_table test_pk_validation.csv + [ "$status" -eq 1 ] + [[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false + + # Test 2: Multiple invalid primary keys should fail immediately + run dolt table import -c --pk "invalid1,invalid2" test_table test_pk_validation.csv + [ "$status" -eq 1 ] + [[ "$output" =~ "primary keys" ]] || false + [[ "$output" =~ "invalid1 invalid2" ]] || false + [[ "$output" =~ "not found in import file" ]] || false + + # Test 3: Mix of valid and invalid primary keys should fail + run dolt table import -c --pk "id,invalid_col,name" test_table test_pk_validation.csv + [ "$status" -eq
1 ] + [[ "$output" =~ "primary key 'invalid_col' not found in import file" ]] || false + + # Test 4: Valid primary key should succeed + run dolt table import -c --pk "id" test_table test_pk_validation.csv + [ "$status" -eq 0 ] + + # Verify table was created correctly + run dolt sql -q "DESCRIBE test_table;" + [ "$status" -eq 0 ] + [[ "$output" =~ "id" ]] || false + [[ "$output" =~ "PRI" ]] || false + + # Test 5: Valid multiple primary keys should succeed + run dolt table import -c --pk "id,name" test_table2 test_pk_validation.csv + [ "$status" -eq 0 ] + + # Test 6: PSV file with invalid primary key should also fail immediately + cat <<DELIM > test_pk_validation.psv +id|name|email|age +1|Alice|alice@example.com|30 +2|Bob|bob@example.com|25 +3|Charlie|charlie@example.com|35 +DELIM + + run dolt table import -c --pk "nonexistent" test_table3 test_pk_validation.psv + [ "$status" -eq 1 ] + [[ "$output" =~ "primary key 'nonexistent' not found in import file" ]] || false + + # Test 7: Large CSV should fail quickly (not after reading entire file) + # Create a larger CSV to simulate the original issue + { + echo "year,state_fips,county_fips,precinct,candidate,votes" + for i in {1..1000}; do + echo "2020,$i,$i,precinct$i,candidate$i,$i" + done + } > large_test.csv + + # Time the command - it should fail immediately, not after processing all rows + start_time=$(date +%s) + run dolt table import -c --pk "year,state_fips,county_fips,precinct,invalid_column" precinct_results large_test.csv + end_time=$(date +%s) + duration=$((end_time - start_time)) + + [ "$status" -eq 1 ] + [[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false + # Should fail in less than 2 seconds (immediate validation) + [ "$duration" -lt 2 ] || false +} + +@test "import-create-tables: primary key validation with schema file should skip validation" { + # Create a test CSV file + cat <<DELIM > test_data.csv +id,name,email +1,Alice,alice@example.com +2,Bob,bob@example.com +DELIM + + # Create a
schema file with different column as primary key + cat <<SQL > test_schema.sql +CREATE TABLE test_with_schema ( + id INT, + name VARCHAR(100), + email VARCHAR(100), + PRIMARY KEY (name) +); +SQL + + # When schema file is provided, it should work without primary key validation + run dolt table import -c --schema test_schema.sql test_with_schema test_data.csv + [ "$status" -eq 0 ] + + # Verify that 'name' is the primary key (from schema file) + run dolt sql -q "SHOW CREATE TABLE test_with_schema;" + [ "$status" -eq 0 ] + [[ "$output" =~ "PRIMARY KEY (\`name\`)" ]] || false } \ No newline at end of file diff --git a/integration-tests/bats/import-tables.bats b/integration-tests/bats/import-tables.bats index 7d69a66f554..92467cf86f7 100644 --- a/integration-tests/bats/import-tables.bats +++ b/integration-tests/bats/import-tables.bats @@ -47,3 +47,4 @@ teardown() { dolt table import -r -pk '`Key4' t `batshelper escaped-characters.csv` dolt table import -r -pk "/Key5" t `batshelper escaped-characters.csv` } +