From 5548e0f6477c65f2186f7c63a15cf628da2fd574 Mon Sep 17 00:00:00 2001 From: Tim Sehn Date: Thu, 29 May 2025 14:16:05 -0700 Subject: [PATCH] Added a new constant for string columns created during schema import. Length is 200 instead of 1023. --- go/cmd/dolt/commands/tblcmds/import.go | 2 +- .../doltcore/env/actions/infer_schema.go | 18 ++++---- .../doltcore/env/actions/infer_schema_test.go | 28 ++++++------ .../doltcore/schema/typeinfo/varstring.go | 2 + .../bats/import-create-tables.bats | 14 +++--- integration-tests/bats/schema-import.bats | 45 +++++++++++++------ 6 files changed, 64 insertions(+), 45 deletions(-) diff --git a/go/cmd/dolt/commands/tblcmds/import.go b/go/cmd/dolt/commands/tblcmds/import.go index b9d7c24977f..59adf469451 100644 --- a/go/cmd/dolt/commands/tblcmds/import.go +++ b/go/cmd/dolt/commands/tblcmds/import.go @@ -798,7 +798,7 @@ func generateAllTextSchema(rd table.ReadCloser, impOpts *importOptions) (schema. var colType typeinfo.TypeInfo if slices.Contains(impOpts.primaryKeys, col.Name) || (len(impOpts.primaryKeys) == 0 && len(cols) == 0) { // text type is not supported for primary keys, pk is either explicitly set or is the first column - colType = typeinfo.StringDefaultType + colType = typeinfo.StringImportDefaultType } else { colType = typeinfo.TextType } diff --git a/go/libraries/doltcore/env/actions/infer_schema.go b/go/libraries/doltcore/env/actions/infer_schema.go index de99c155cba..cfc4f5b5008 100644 --- a/go/libraries/doltcore/env/actions/infer_schema.go +++ b/go/libraries/doltcore/env/actions/infer_schema.go @@ -193,7 +193,7 @@ func leastPermissiveType(strVal string, floatThreshold float64) typeinfo.TypeInf if int64(len(strVal)) > typeinfo.MaxVarcharLength { return typeinfo.TextType } else { - return typeinfo.StringDefaultType + return typeinfo.StringImportDefaultType } } @@ -234,7 +234,7 @@ func leastPermissiveNumericType(strVal string, floatThreshold float64) (ti typei // use string for out of range if errors.Is(err, strconv.ErrRange) { - return typeinfo.StringDefaultType + return typeinfo.StringImportDefaultType } if err != nil { @@ -243,7 +243,7 @@ func leastPermissiveNumericType(strVal string, floatThreshold float64) (ti typei // handle leading zero case if len(strVal) > 1 && strVal[0] == '0' { - return typeinfo.StringDefaultType + return typeinfo.StringImportDefaultType } if i >= math.MinInt32 && i <= math.MaxInt32 { @@ -325,7 +325,7 @@ func findCommonType(ts typeInfoSet) typeinfo.TypeInfo { if len(ts) == 0 { // use strings if all values were empty - return typeinfo.StringDefaultType + return typeinfo.StringImportDefaultType } if len(ts) == 1 { @@ -339,9 +339,9 @@ func findCommonType(ts typeInfoSet) typeinfo.TypeInfo { if setHasType(ts, typeinfo.TextType) { return typeinfo.TextType } else if setHasType(ts, typeinfo.StringDefaultType) { - return typeinfo.StringDefaultType - } else if setHasType(ts, typeinfo.StringDefaultType) { - return typeinfo.StringDefaultType + return typeinfo.StringImportDefaultType + } else if setHasType(ts, typeinfo.StringImportDefaultType) { + return typeinfo.StringImportDefaultType } hasNumeric := false @@ -364,7 +364,7 @@ func findCommonType(ts typeInfoSet) typeinfo.TypeInfo { } if hasNumeric && hasNonNumeric { - return typeinfo.StringDefaultType + return typeinfo.StringImportDefaultType } if hasNumeric { @@ -383,7 +383,7 @@ func findCommonType(ts typeInfoSet) typeinfo.TypeInfo { if setHasType(ts, nct) { // types in nonChronoTypes have only string // as a common type with any other type - return typeinfo.StringDefaultType + return typeinfo.StringImportDefaultType } } diff --git a/go/libraries/doltcore/env/actions/infer_schema_test.go b/go/libraries/doltcore/env/actions/infer_schema_test.go index 0fcd644682f..8b3e6e5ed44 100644 --- a/go/libraries/doltcore/env/actions/infer_schema_test.go +++ b/go/libraries/doltcore/env/actions/infer_schema_test.go @@ -45,10 +45,10 @@ func TestLeastPermissiveType(t *testing.T) { }{ {"empty string", "", 0.0, typeinfo.UnknownType}, {"valid uuid", "00000000-0000-0000-0000-000000000000", 0.0, typeinfo.UuidType}, - {"invalid uuid", "00000000-0000-0000-0000-00000000000z", 0.0, typeinfo.StringDefaultType}, + {"invalid uuid", "00000000-0000-0000-0000-00000000000z", 0.0, typeinfo.StringImportDefaultType}, {"lower bool", "true", 0.0, typeinfo.BoolType}, {"upper bool", "FALSE", 0.0, typeinfo.BoolType}, - {"yes", "yes", 0.0, typeinfo.StringDefaultType}, + {"yes", "yes", 0.0, typeinfo.StringImportDefaultType}, {"one", "1", 0.0, typeinfo.Int32Type}, {"negative one", "-1", 0.0, typeinfo.Int32Type}, {"negative one point 0", "-1.0", 0.0, typeinfo.Float32Type}, @@ -57,7 +57,7 @@ func TestLeastPermissiveType(t *testing.T) { {"negative one point 999 with FT of 1.0", "-1.999", 1.0, typeinfo.Int32Type}, {"zero point zero zero zero zero", "0.0000", 0.0, typeinfo.Float32Type}, {"max int", strconv.FormatUint(math.MaxInt64, 10), 0.0, typeinfo.Int64Type}, - {"bigger than max int", strconv.FormatUint(math.MaxUint64, 10) + "0", 0.0, typeinfo.StringDefaultType}, + {"bigger than max int", strconv.FormatUint(math.MaxUint64, 10) + "0", 0.0, typeinfo.StringImportDefaultType}, } for _, test := range tests { @@ -82,11 +82,11 @@ func TestLeastPermissiveNumericType(t *testing.T) { {"double decimal point", "0.00.0", 0.0, typeinfo.UnknownType}, {"leading zero floats", "05.78", 0.0, typeinfo.Float32Type}, {"zero float with high precision", "0.0000", 0.0, typeinfo.Float32Type}, - {"all zeroes", "0000", 0.0, typeinfo.StringDefaultType}, - {"leading zeroes", "01", 0.0, typeinfo.StringDefaultType}, + {"all zeroes", "0000", 0.0, typeinfo.StringImportDefaultType}, + {"leading zeroes", "01", 0.0, typeinfo.StringImportDefaultType}, {"negative int", "-1234", 0.0, typeinfo.Int32Type}, - {"fits in uint64 but not int64", strconv.FormatUint(math.MaxUint64, 10), 0.0, typeinfo.StringDefaultType}, - {"negative less than math.MinInt64", "-" + strconv.FormatUint(math.MaxUint64, 10), 0.0, typeinfo.StringDefaultType}, + {"fits in uint64 but not int64", strconv.FormatUint(math.MaxUint64, 10), 0.0, typeinfo.StringImportDefaultType}, + {"negative less than math.MinInt64", "-" + strconv.FormatUint(math.MaxUint64, 10), 0.0, typeinfo.StringImportDefaultType}, {"math.MinInt64", strconv.FormatInt(math.MinInt64, 10), 0.0, typeinfo.Int64Type}, } @@ -186,7 +186,7 @@ func testFindCommonType(t *testing.T) { typeinfo.Int32Type: {}, typeinfo.BoolType: {}, }, - expType: typeinfo.StringDefaultType, + expType: typeinfo.StringImportDefaultType, }, { name: "floats and bools", @@ -194,7 +194,7 @@ func testFindCommonType(t *testing.T) { typeinfo.Float32Type: {}, typeinfo.BoolType: {}, }, - expType: typeinfo.StringDefaultType, + expType: typeinfo.StringImportDefaultType, }, { name: "floats and uuids", @@ -202,7 +202,7 @@ func testFindCommonType(t *testing.T) { typeinfo.Float32Type: {}, typeinfo.UuidType: {}, }, - expType: typeinfo.StringDefaultType, + expType: typeinfo.StringImportDefaultType, }, } @@ -230,7 +230,7 @@ func testFindCommonTypeFromSingleType(t *testing.T) { typeinfo.TimeType, typeinfo.TimestampType, typeinfo.DatetimeType, - typeinfo.StringDefaultType, + typeinfo.StringImportDefaultType, } for _, ti := range allTypes { @@ -371,11 +371,11 @@ func TestInferSchema(t *testing.T) { }, map[string]typeinfo.TypeInfo{ "int": typeinfo.Int32Type, - "uint": typeinfo.StringDefaultType, + "uint": typeinfo.StringImportDefaultType, "uuid": typeinfo.UuidType, "float": typeinfo.Float32Type, "bool": typeinfo.BoolType, - "string": typeinfo.StringDefaultType, + "string": typeinfo.StringImportDefaultType, }, nil, }, @@ -387,7 +387,7 @@ func TestInferSchema(t *testing.T) { floatThreshold: 0, }, map[string]typeinfo.TypeInfo{ - "mix": typeinfo.StringDefaultType, + "mix": typeinfo.StringImportDefaultType, "uuid": typeinfo.UuidType, }, nil, diff --git a/go/libraries/doltcore/schema/typeinfo/varstring.go b/go/libraries/doltcore/schema/typeinfo/varstring.go index 7a8ca06341b..9c8edbcbce9 100644 --- a/go/libraries/doltcore/schema/typeinfo/varstring.go +++ b/go/libraries/doltcore/schema/typeinfo/varstring.go @@ -52,6 +52,8 @@ var ( MaxVarcharLength = int64(16383) // StringDefaultType is sized to 1k, which allows up to 16 fields per row StringDefaultType = &varStringType{gmstypes.MustCreateStringWithDefaults(sqltypes.VarChar, MaxVarcharLength/16)} + // StringImportDefaultType is sized to 200, which allows up to 80+ fields per row during import operations + StringImportDefaultType = &varStringType{gmstypes.MustCreateStringWithDefaults(sqltypes.VarChar, 200)} ) func CreateVarStringTypeFromSqlType(stringType sql.StringType) TypeInfo { diff --git a/integration-tests/bats/import-create-tables.bats b/integration-tests/bats/import-create-tables.bats index 5de9c5205c4..7fa1351a1d1 100755 --- a/integration-tests/bats/import-create-tables.bats +++ b/integration-tests/bats/import-create-tables.bats @@ -596,7 +596,7 @@ DELIM [ "$status" -eq 0 ] [[ "$output" =~ "CREATE TABLE \`test\`" ]] || false [[ "$output" =~ "\`pk\` int" ]] || false - [[ "$output" =~ "\`str\` varchar(1023)" ]] || false + [[ "$output" =~ "\`str\` varchar(200)" ]] || false [[ "$output" =~ "\`int\` int" ]] || false [[ "$output" =~ "\`bool\` tinyint" ]] || false [[ "$output" =~ "\`float\` float" ]] || false @@ -895,9 +895,9 @@ DELIM run dolt sql -q "describe test" [ "$status" -eq 0 ] - [[ "$output" =~ "| id | varchar(1023) |" ]] || false - [[ "$output" =~ "| state | text |" ]] || false - [[ "$output" =~ "| data | text |" ]] || false + [[ "$output" =~ "| id | varchar(200) |" ]] || false + [[ "$output" =~ "| state | text |" ]] || false + [[ "$output" =~ "| data | text |" ]] || false # pk defaults to first column if not explicitly defined run dolt table import -c --all-text test2 test.csv @@ -906,9 +906,9 @@ DELIM run dolt sql -q "describe test2" [ "$status" -eq 0 ] - [[ "$output" =~ "| id | varchar(1023) |" ]] || false - [[ "$output" =~ "| state | text |" ]] || false - [[ "$output" =~ "| data | text |" ]] || false + [[ "$output" =~ "| id | varchar(200) |" ]] || false + [[ "$output" =~ "| state | text |" ]] || false + [[ "$output" =~ "| data | text |" ]] || false } @test "import-create-tables: --all-text and --schema are mutually exclusive" { diff --git a/integration-tests/bats/schema-import.bats b/integration-tests/bats/schema-import.bats index 5b66f997c4a..3f5904bc9c9 100755 --- a/integration-tests/bats/schema-import.bats +++ b/integration-tests/bats/schema-import.bats @@ -116,7 +116,7 @@ teardown() { [[ "${lines[0]}" =~ "test" ]] || false [[ "$output" =~ "\`pk\` int" ]] || false [[ "$output" =~ "\`int\` int" ]] || false - [[ "$output" =~ "\`string\` varchar(1023)" ]] || false + [[ "$output" =~ "\`string\` varchar(200)" ]] || false [[ "$output" =~ "\`boolean\` tinyint" ]] || false [[ "$output" =~ "\`float\` float" ]] || false [[ "$output" =~ "\`uint\` int" ]] || false @@ -141,7 +141,7 @@ DELIM [[ "${lines[0]}" =~ "test" ]] || false [[ "$output" =~ "\`pk\` int" ]] || false [[ "$output" =~ "\`int\` int" ]] || false - [[ "$output" =~ "\`string\` varchar(1023)" ]] || false + [[ "$output" =~ "\`string\` varchar(200)" ]] || false [[ "$output" =~ "\`boolean\` tinyint" ]] || false [[ "$output" =~ "\`float\` float" ]] || false [[ "$output" =~ "\`uint\` int" ]] || false @@ -201,8 +201,8 @@ DELIM [ "$status" -eq 0 ] [ "${#lines[@]}" -eq 7 ] [[ "${lines[0]}" =~ "test" ]] || false - [[ "$output" =~ "\`pk\` varchar(1023)" ]] || false - [[ "$output" =~ "\`headerOne\` varchar(1023)" ]] || false + [[ "$output" =~ "\`pk\` varchar(200)" ]] || false + [[ "$output" =~ "\`headerOne\` varchar(200)" ]] || false [[ "$output" =~ "\`headerTwo\` int" ]] || false } @@ -227,7 +227,7 @@ DELIM [[ "$output" =~ "\`c3\` int" ]] || false [[ "$output" =~ "\`c4\` int" ]] || false [[ "$output" =~ "\`c5\` int" ]] || false - [[ "$output" =~ "\`c6\` varchar(1023)" ]] || false + [[ "$output" =~ "\`c6\` varchar(200)" ]] || false [[ "$output" =~ "PRIMARY KEY (\`pk\`)" ]] || false } @@ -243,12 +243,12 @@ DELIM [ "${#lines[@]}" -eq 11 ] [[ "${lines[0]}" =~ "test" ]] || false [[ "$output" =~ "\`pk\` int" ]] || false - [[ "$output" =~ "\`c1\` varchar(1023)" ]] || false - [[ "$output" =~ "\`c2\` varchar(1023)" ]] || false - [[ "$output" =~ "\`c3\` varchar(1023)" ]] || false - [[ "$output" =~ "\`c4\` varchar(1023)" ]] || false - [[ "$output" =~ "\`c5\` varchar(1023)" ]] || false - [[ "$output" =~ "\`c6\` varchar(1023)" ]] || false + [[ "$output" =~ "\`c1\` varchar(200)" ]] || false + [[ "$output" =~ "\`c2\` varchar(200)" ]] || false + [[ "$output" =~ "\`c3\` varchar(200)" ]] || false + [[ "$output" =~ "\`c4\` varchar(200)" ]] || false + [[ "$output" =~ "\`c5\` varchar(200)" ]] || false + [[ "$output" =~ "\`c6\` varchar(200)" ]] || false [[ "$output" =~ "PRIMARY KEY (\`pk\`)" ]] || false } @@ -295,7 +295,7 @@ DELIM run dolt diff --schema [ "$status" -eq 0 ] - [[ "$output" =~ '+ `x` varchar(1023),' ]] || false + [[ "$output" =~ '+ `x` varchar(200),' ]] || false [[ "$output" =~ '+ `y` float,' ]] || false [[ "$output" =~ '+ `z` int,' ]] || false # assert no columns were deleted/replaced @@ -331,7 +331,7 @@ DELIM run dolt diff --schema [ "$status" -eq 0 ] - [[ "$output" =~ '+ `x` varchar(1023),' ]] || false + [[ "$output" =~ '+ `x` varchar(200),' ]] || false [[ "$output" =~ '+ `y` float,' ]] || false [[ "$output" =~ '+ `z` int,' ]] || false # assert no columns were deleted/replaced @@ -373,7 +373,7 @@ DELIM run dolt diff --schema [ "$status" -eq 0 ] - [[ "$output" =~ '- `a` varchar(1023),' ]] || false + [[ "$output" =~ '- `a` varchar(200),' ]] || false [[ "$output" =~ '- `b` float,' ]] || false [[ "$output" =~ '- `c` tinyint(1),' ]] || false # assert no columns were added @@ -410,3 +410,20 @@ CSV [[ "$output" =~ "name" ]] || false [[ "$output" =~ "invalid schema" ]] || false } + +@test "schema-import: varchar(200) allows many columns" { + # Test that import operations use varchar(200) as default length, allowing many varchar columns + # With varchar(200), we should be able to have 80+ columns vs only 16 with varchar(1023) + cat < many_varchar_cols.csv +pk,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30 +1,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,a1,b1,c1,d1 +DELIM + run dolt schema import -c --pks=pk test many_varchar_cols.csv + [ "$status" -eq 0 ] + [[ "$output" =~ "Created table successfully." ]] || false + run dolt schema show test + [ "$status" -eq 0 ] + # Verify that columns were created with varchar(200) + [[ "$output" =~ "\`c1\` varchar(200)" ]] || false + [[ "$output" =~ "\`c30\` varchar(200)" ]] || false +}