From 2963d80fafa53c91458e13f764e14d80a9be7827 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 25 Dec 2021 16:45:16 -0500 Subject: [PATCH 01/16] To-do items for broader platform/go-version benchmarking --- docs/src/new-in-miller-6.md | 4 ++- docs/src/new-in-miller-6.md.in | 4 ++- go.mod | 2 +- scripts/chain-lengths.sh | 20 ++++++------- todo.txt | 54 ++++++++++++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 13 deletions(-) diff --git a/docs/src/new-in-miller-6.md b/docs/src/new-in-miller-6.md index 22e3a9974a..211c8e4f9b 100644 --- a/docs/src/new-in-miller-6.md +++ b/docs/src/new-in-miller-6.md @@ -270,7 +270,9 @@ The following differences are rather technical. If they don't sound familiar to As a benchmark, the [example.csv](https://github.com/johnkerl/miller/blob/main/docs/src/example.csv) file [was expanded](https://github.com/johnkerl/miller/blob/main/scripts/make-big-files) into a million-line CSV file, -then converted to DKVP, JSON, etc. These were run on a commodity Mac laptop with four CPUs. +then converted to DKVP, JSON, etc. + +These were run on a commodity Mac laptop with four CPUs, using `go1.16.5 darwin/amd64`. For the [first benchmark](https://github.com/johnkerl/miller/blob/main/scripts/time-big-files), we have `mlr cat` of those files, with processing times shown: diff --git a/docs/src/new-in-miller-6.md.in b/docs/src/new-in-miller-6.md.in index 000763f52b..82539b98c6 100644 --- a/docs/src/new-in-miller-6.md.in +++ b/docs/src/new-in-miller-6.md.in @@ -228,7 +228,9 @@ The following differences are rather technical. If they don't sound familiar to As a benchmark, the [example.csv](https://github.com/johnkerl/miller/blob/main/docs/src/example.csv) file [was expanded](https://github.com/johnkerl/miller/blob/main/scripts/make-big-files) into a million-line CSV file, -then converted to DKVP, JSON, etc. These were run on a commodity Mac laptop with four CPUs. +then converted to DKVP, JSON, etc. 
+ +These were run on a commodity Mac laptop with four CPUs, using `go1.16.5 darwin/amd64`. For the [first benchmark](https://github.com/johnkerl/miller/blob/main/scripts/time-big-files), we have `mlr cat` of those files, with processing times shown: diff --git a/go.mod b/go.mod index 65acb8d294..48a652e941 100644 --- a/go.mod +++ b/go.mod @@ -17,7 +17,7 @@ require ( github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 github.com/lestrrat-go/strftime v1.0.4 github.com/mattn/go-isatty v0.0.12 - github.com/pkg/profile v1.6.0 // indirect + github.com/pkg/profile v1.6.0 github.com/stretchr/testify v1.7.0 // indirect golang.org/x/sys v0.0.0-20210326220804-49726bf1d181 golang.org/x/term v0.0.0-20201210144234-2321bbc49cbf diff --git a/scripts/chain-lengths.sh b/scripts/chain-lengths.sh index 4acc539e5c..c36e7f208e 100755 --- a/scripts/chain-lengths.sh +++ b/scripts/chain-lengths.sh @@ -1,14 +1,14 @@ -mlrs="mlr5 ~/tmp/miller/mlr ./mlr" -reps="1" +#mlrs="mlr5 ~/tmp/miller/mlr ./mlr" +mlrs="mlr5 ./mlr" -#mlrs="mlr5 ./mlr" -#reps="1 2 3" +#reps="1" +reps="1 2 3" echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done @@ -17,7 +17,7 @@ echo; for mlr in $mlrs; do justtime $mlr --csv --from ~/tmp/big.csv \ then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done @@ -27,7 +27,7 @@ echo; for mlr in $mlrs; do then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done @@ -38,7 +38,7 @@ echo; for mlr in $mlrs; do then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done @@ -50,7 +50,7 @@ echo; for mlr in $mlrs; do then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done @@ -63,6 +63,6 
@@ echo; for mlr in $mlrs; do then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ then put -f scripts/chain-1.mlr \ - | md5sum; + > /dev/null done done diff --git a/todo.txt b/todo.txt index ace2f5e36f..a52526f0a3 100644 --- a/todo.txt +++ b/todo.txt @@ -2,6 +2,7 @@ PUNCHDOWN LIST * blockers: + - linux/1.17 perf checks - fractional-strptime - improved regex doc w/ lots of examples - cmp-matrices @@ -12,6 +13,59 @@ PUNCHDOWN LIST - big-picture note ? array/map fields: marshal as JSON_SINGLE_LINE +* numeric-inference perf + o perf docs mac -> mac w/ 1.16 + o multiple go versions + 1.15, 1.16, 1.17; 1.18 beta + https://go.dev/doc/manage-install + https://go.dev/dl/#go1.18beta1 + go install golang.org/dl/go1.18beta1@latest + go install golang.org/dl/go1.17.5@latest + go install golang.org/dl/go1.16.12@latest + go install golang.org/dl/go1.15.15@latest + + go1.15.15 download + go1.16.12 download + go1.17.5 download + go1.18beta1 download + + for go in go1.15.15 go1.16.12 go1.17.5 go1.18beta1; do + $go clean github.com/johnkerl/miller/cmd/mlr + $go build github.com/johnkerl/miller/cmd/mlr + mv mlr mlr-$go + done + + for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv check ~/tmp/big.csv > /dev/null; done + for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv cat ~/tmp/big.csv > /dev/null; done + for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv --from ~/tmp/big.csv put -f ./scripts/chain-1.mlr > /dev/null; done + + o https://stackoverflow.com/questions/64513411/why-is-strconv-parseuint-so-slow-compared-to-strconv-atoi + o benchmark per se + o enough UT cases + o reconsider TryInt/TryFloat -> maybe try number -- ? + o get octal regexp out of hot path + o make a grammar for numbers & case-though + - len 0 + - len 1 + - has leading minus; strip & rest + - 0x, 0b, 0[0-9] + - decimal: leading minus; [0-9]+ + - octal: leading minus; 0[0-7]+ + - hex: leading minus; 0[xX][0-9a-fA-F]+ + - float: leading minus; + + o float literals: + 123 123. 
123.4 .234 + 1e2 1e-2 1.2e3 1.e3 1.2e-3 1.e-3 + .2e3 .2e-3 1.e-3 + + ?- [0-9]+ + ?- [0-9]+ '.' [0-9]* + ?- [0-9]* '.' [0-9]+ + ?- [0-9]+ [eE] ?- [0-9]+ + ?- [0-9]+ '.' [0-9]* [eE] ?- [0-9]+ + ?- [0-9]* '.' [0-9]+ [eE] ?- [0-9]+ + * nikos materials -> fold in * cases/dsl-min-max-types: cmp-matrices need to be fixed to follow the advertised rule for mixed types From 4f549d23e448da0fb42426f18c4b01cc45c6d996 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 25 Dec 2021 19:19:52 -0500 Subject: [PATCH 02/16] neaten inferrer API --- internal/pkg/mlrval/mlrval_infer.go | 38 ++++++++++++++--------------- internal/pkg/mlrval/mlrval_new.go | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/internal/pkg/mlrval/mlrval_infer.go b/internal/pkg/mlrval/mlrval_infer.go index bdfb10a2be..df25895753 100644 --- a/internal/pkg/mlrval/mlrval_infer.go +++ b/internal/pkg/mlrval/mlrval_infer.go @@ -15,13 +15,13 @@ import ( func (mv *Mlrval) Type() MVType { if mv.mvtype == MT_PENDING { - packageLevelInferrer(mv, mv.printrep, false) + packageLevelInferrer(mv, false) } return mv.mvtype } // Support for mlr -S, mlr -A, mlr -O. -type tInferrer func(mv *Mlrval, input string, inferBool bool) *Mlrval +type tInferrer func(mv *Mlrval, inferBool bool) *Mlrval var packageLevelInferrer tInferrer = inferWithOctalAsString @@ -60,49 +60,49 @@ var downcasedFloatNamesToNotInfer = map[string]bool{ var octalDetector = regexp.MustCompile("^-?0[0-9]+") // inferWithOctalAsString is for default behavior. -func inferWithOctalAsString(mv *Mlrval, input string, inferBool bool) *Mlrval { - inferWithOctalAsInt(mv, input, inferBool) +func inferWithOctalAsString(mv *Mlrval, inferBool bool) *Mlrval { + inferWithOctalAsInt(mv, inferBool) if mv.mvtype != MT_INT && mv.mvtype != MT_FLOAT { return mv } if octalDetector.MatchString(mv.printrep) { - return mv.SetFromString(input) + return mv.SetFromString(mv.printrep) } else { return mv } } // inferWithOctalAsInt is for mlr -O. 
-func inferWithOctalAsInt(mv *Mlrval, input string, inferBool bool) *Mlrval { - if input == "" { +func inferWithOctalAsInt(mv *Mlrval, inferBool bool) *Mlrval { + if mv.printrep == "" { return mv.SetFromVoid() } - intval, iok := lib.TryIntFromString(input) + intval, iok := lib.TryIntFromString(mv.printrep) if iok { - return mv.SetFromPrevalidatedIntString(input, intval) + return mv.SetFromPrevalidatedIntString(mv.printrep, intval) } - if downcasedFloatNamesToNotInfer[strings.ToLower(input)] == false { - floatval, fok := lib.TryFloatFromString(input) + if downcasedFloatNamesToNotInfer[strings.ToLower(mv.printrep)] == false { + floatval, fok := lib.TryFloatFromString(mv.printrep) if fok { - return mv.SetFromPrevalidatedFloatString(input, floatval) + return mv.SetFromPrevalidatedFloatString(mv.printrep, floatval) } } if inferBool { - boolval, bok := lib.TryBoolFromBoolString(input) + boolval, bok := lib.TryBoolFromBoolString(mv.printrep) if bok { - return mv.SetFromPrevalidatedBoolString(input, boolval) + return mv.SetFromPrevalidatedBoolString(mv.printrep, boolval) } } - return mv.SetFromString(input) + return mv.SetFromString(mv.printrep) } // inferWithIntAsFloat is for mlr -A. -func inferWithIntAsFloat(mv *Mlrval, input string, inferBool bool) *Mlrval { - inferWithOctalAsString(mv, input, inferBool) +func inferWithIntAsFloat(mv *Mlrval, inferBool bool) *Mlrval { + inferWithOctalAsString(mv, inferBool) if mv.Type() == MT_INT { mv.floatval = float64(mv.intval) mv.mvtype = MT_FLOAT @@ -111,6 +111,6 @@ func inferWithIntAsFloat(mv *Mlrval, input string, inferBool bool) *Mlrval { } // inferStringOnly is for mlr -S. 
-func inferStringOnly(mv *Mlrval, input string, inferBool bool) *Mlrval { - return mv.SetFromString(input) +func inferStringOnly(mv *Mlrval, inferBool bool) *Mlrval { + return mv.SetFromString(mv.printrep) } diff --git a/internal/pkg/mlrval/mlrval_new.go b/internal/pkg/mlrval/mlrval_new.go index 507f5ad535..371d3ee8d1 100644 --- a/internal/pkg/mlrval/mlrval_new.go +++ b/internal/pkg/mlrval/mlrval_new.go @@ -40,7 +40,7 @@ func FromInferredType(input string) *Mlrval { printrepValid: true, } // TODO: comment re inferBool arg - packageLevelInferrer(mv, mv.printrep, true) + packageLevelInferrer(mv, true) return mv } From 7c753886b978d4135eb74468bb978b7d63338692 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 25 Dec 2021 19:43:28 -0500 Subject: [PATCH 03/16] extend type-inference unit-test cases --- .vimrc | 1 + Makefile | 9 + internal/pkg/mlrval/mlrval_infer_test.go | 259 +++++++++++++++++++++++ 3 files changed, 269 insertions(+) create mode 100644 internal/pkg/mlrval/mlrval_infer_test.go diff --git a/.vimrc b/.vimrc index 7d420eb5ae..ad59ce3e33 100644 --- a/.vimrc +++ b/.vimrc @@ -1,2 +1,3 @@ map \d :w:!clear;echo Building ...; echo; make mlr map \f :w:!clear;echo Building ...; echo; make ut +map \r :w:!clear;echo Building ...; echo; make mv-ut diff --git a/Makefile b/Makefile index 29c5cd5ab7..4af0cc6f78 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,15 @@ install: build unit-test ut: go test github.com/johnkerl/miller/internal/pkg/... +lib-ut: + go test github.com/johnkerl/miller/internal/pkg/lib... +mv-ut: + go test github.com/johnkerl/miller/internal/pkg/mlrval/... +bifs-ut: + go test github.com/johnkerl/miller/internal/pkg/bifs/... +input-ut: + go test github.com/johnkerl/miller/internal/pkg/input/... 
+ # ---------------------------------------------------------------- # Regression tests (large number) # diff --git a/internal/pkg/mlrval/mlrval_infer_test.go b/internal/pkg/mlrval/mlrval_infer_test.go new file mode 100644 index 0000000000..e5b8f8577c --- /dev/null +++ b/internal/pkg/mlrval/mlrval_infer_test.go @@ -0,0 +1,259 @@ +// ================================================================ +// Tests mlrval constructors. +// ================================================================ + +package mlrval + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestInferWithOctalAsString(t *testing.T) { + assert.True(t, inferWithOctalAsString(FromDeferredType(""), false).IsVoid()) + + assert.True(t, inferWithOctalAsString(FromDeferredType("true"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("false"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("true"), true).IsBool()) + assert.True(t, inferWithOctalAsString(FromDeferredType("false"), true).IsBool()) + + assert.True(t, inferWithOctalAsString(FromDeferredType("abc"), false).IsString()) + + assert.True(t, inferWithOctalAsString(FromDeferredType("0123"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-0123"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("0377"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-0377"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("0923"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-0923"), false).IsString()) + + assert.True(t, inferWithOctalAsString(FromDeferredType("123"), false).IsInt()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-123"), false).IsInt()) + assert.True(t, inferWithOctalAsString(FromDeferredType("0xff"), false).IsInt()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-0xff"), false).IsInt()) + 
assert.True(t, inferWithOctalAsString(FromDeferredType("0b1011"), false).IsInt()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-0b1011"), false).IsInt()) + assert.True(t, inferWithOctalAsString(FromDeferredType("0x7fffffffffffffff"), false).IsInt()) + assert.True(t, inferWithOctalAsString(FromDeferredType("0x8000000000000000"), false).IsInt()) + assert.True(t, inferWithOctalAsString(FromDeferredType("0xffffffffffffffff"), false).IsInt()) + + assert.True(t, inferWithOctalAsString(FromDeferredType("12_3"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-12_3"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("1_2.3_4"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-1_2.3_4"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("0xca_fe"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-0xca_fe"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("0b1011_1101"), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-0b1011_1101"), false).IsString()) + + assert.True(t, inferWithOctalAsString(FromDeferredType("."), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-."), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("123."), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-123."), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType(".123"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-.123"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("123.456"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-123.456"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("1e2."), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-1e2."), 
false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("1e-2."), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-1e-2."), false).IsString()) + assert.True(t, inferWithOctalAsString(FromDeferredType("1.2e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-1.2e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("1.2e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-1.2e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("1.e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-1.e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("1.e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-1.e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType(".2e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-.2e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType(".2e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsString(FromDeferredType("-.2e-3"), false).IsFloat()) +} + +func TestInferWithOctalAsInt(t *testing.T) { + assert.True(t, inferWithOctalAsInt(FromDeferredType(""), false).IsVoid()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("true"), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("false"), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("true"), true).IsBool()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("false"), true).IsBool()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("abc"), false).IsString()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("0123"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0123"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0377"), false).IsInt()) + 
assert.True(t, inferWithOctalAsInt(FromDeferredType("-0377"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0923"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0923"), false).IsFloat()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("123"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-123"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0xff"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0xff"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0b1011"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0b1011"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0x7fffffffffffffff"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0x8000000000000000"), false).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0xffffffffffffffff"), false).IsInt()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("12_3"), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-12_3"), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1_2.3_4"), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1_2.3_4"), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0xca_fe"), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0xca_fe"), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0b1011_1101"), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0b1011_1101"), false).IsString()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("."), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-."), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("123."), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-123."), 
false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType(".123"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.123"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("123.456"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-123.456"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1e2."), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1e2."), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1e-2."), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1e-2."), false).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.2e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.2e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.2e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.2e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType(".2e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType(".2e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e-3"), false).IsFloat()) +} + +func TestInferWithIntAsFloat(t *testing.T) { + assert.True(t, inferWithIntAsFloat(FromDeferredType(""), false).IsVoid()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("true"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("false"), false).IsString()) + assert.True(t, 
inferWithIntAsFloat(FromDeferredType("true"), true).IsBool()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("false"), true).IsBool()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("abc"), false).IsString()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("0123"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0123"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0377"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0377"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0923"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0923"), false).IsString()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("123"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-123"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0xff"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0xff"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0b1011"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0b1011"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0x7fffffffffffffff"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0x8000000000000000"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0xffffffffffffffff"), false).IsFloat()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("12_3"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-12_3"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1_2.3_4"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1_2.3_4"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0xca_fe"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0xca_fe"), 
false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0b1011_1101"), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0b1011_1101"), false).IsString()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("."), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-."), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("123."), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-123."), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType(".123"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-.123"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("123.456"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-123.456"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1e2."), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1e2."), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1e-2."), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1e-2."), false).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.2e3"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.2e3"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.2e-3"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.2e-3"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.e3"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.e3"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.e-3"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.e-3"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType(".2e3"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-.2e3"), 
false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType(".2e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e-3"), false).IsFloat()) +} + +func TestInferStringOnly(t *testing.T) { + assert.True(t, inferStringOnly(FromDeferredType(""), false).IsVoid()) + + assert.True(t, inferStringOnly(FromDeferredType("true"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("false"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("true"), true).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("false"), true).IsString()) + + assert.True(t, inferStringOnly(FromDeferredType("abc"), false).IsString()) + + assert.True(t, inferStringOnly(FromDeferredType("0123"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-0123"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("0377"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-0377"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("0923"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-0923"), false).IsString()) + + assert.True(t, inferStringOnly(FromDeferredType("123"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-123"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("0xff"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-0xff"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("0b1011"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-0b1011"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("0x7fffffffffffffff"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("0x8000000000000000"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("0xffffffffffffffff"), false).IsString()) + + assert.True(t, inferStringOnly(FromDeferredType("12_3"), false).IsString()) 
+ assert.True(t, inferStringOnly(FromDeferredType("-12_3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("1_2.3_4"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-1_2.3_4"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("0xca_fe"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-0xca_fe"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("0b1011_1101"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-0b1011_1101"), false).IsString()) + + assert.True(t, inferStringOnly(FromDeferredType("."), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-."), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("123."), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-123."), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType(".123"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-.123"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("123.456"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-123.456"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("1e2."), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-1e2."), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("1e-2."), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-1e-2."), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("1.2e3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-1.2e3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("1.2e-3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-1.2e-3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("1.e3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-1.e3"), false).IsString()) 
+ assert.True(t, inferStringOnly(FromDeferredType("1.e-3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-1.e-3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType(".2e3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-.2e3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType(".2e-3"), false).IsString()) + assert.True(t, inferStringOnly(FromDeferredType("-.2e-3"), false).IsString()) +} From 80b4396bfd2801905ce3429f99d4ccf1240a17cb Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 25 Dec 2021 22:26:52 -0500 Subject: [PATCH 04/16] Add benchmark scripts for comparing compiler versions --- docs/src/new-in-miller-6.md | 4 ++++ docs/src/new-in-miller-6.md.in | 4 ++++ scripts/compiler-versions-build | 7 +++++++ scripts/compiler-versions-install | 13 +++++++++++++ scripts/compiler-versions-time | 9 +++++++++ todo.txt | 23 ----------------------- 6 files changed, 37 insertions(+), 23 deletions(-) create mode 100755 scripts/compiler-versions-build create mode 100755 scripts/compiler-versions-install create mode 100755 scripts/compiler-versions-time diff --git a/docs/src/new-in-miller-6.md b/docs/src/new-in-miller-6.md index 211c8e4f9b..c948d4b5e0 100644 --- a/docs/src/new-in-miller-6.md +++ b/docs/src/new-in-miller-6.md @@ -274,6 +274,10 @@ then converted to DKVP, JSON, etc. These were run on a commodity Mac laptop with four CPUs, using `go1.16.5 darwin/amd64`. +Linux benchmarks are pending. + +As of late 2021, Miller has been benchmarks using Go compiler versions 1.15.15, 1.16.12, 1.17.5, and 1.18beta1, with no significant performance changes attributable to compiler versions. 
+ For the [first benchmark](https://github.com/johnkerl/miller/blob/main/scripts/time-big-files), we have `mlr cat` of those files, with processing times shown: | Format | Miller 5 | Miller 6 | Speedup | diff --git a/docs/src/new-in-miller-6.md.in b/docs/src/new-in-miller-6.md.in index 82539b98c6..f6595fe1fe 100644 --- a/docs/src/new-in-miller-6.md.in +++ b/docs/src/new-in-miller-6.md.in @@ -232,6 +232,10 @@ then converted to DKVP, JSON, etc. These were run on a commodity Mac laptop with four CPUs, using `go1.16.5 darwin/amd64`. +Linux benchmarks are pending. + +As of late 2021, Miller has been benchmarks using Go compiler versions 1.15.15, 1.16.12, 1.17.5, and 1.18beta1, with no significant performance changes attributable to compiler versions. + For the [first benchmark](https://github.com/johnkerl/miller/blob/main/scripts/time-big-files), we have `mlr cat` of those files, with processing times shown: | Format | Miller 5 | Miller 6 | Speedup | diff --git a/scripts/compiler-versions-build b/scripts/compiler-versions-build new file mode 100755 index 0000000000..49e2c2ad22 --- /dev/null +++ b/scripts/compiler-versions-build @@ -0,0 +1,7 @@ +#!/bin/sh + +for go in go1.15.15 go1.16.12 go1.17.5 go1.18beta1; do + $go clean github.com/johnkerl/miller/cmd/mlr + $go build github.com/johnkerl/miller/cmd/mlr + mv mlr mlr-$go +done diff --git a/scripts/compiler-versions-install b/scripts/compiler-versions-install new file mode 100755 index 0000000000..873e8857fc --- /dev/null +++ b/scripts/compiler-versions-install @@ -0,0 +1,13 @@ +#!/bin/sh + +# https://go.dev/doc/manage-install + +go install golang.org/dl/go1.18beta1@latest +go install golang.org/dl/go1.17.5@latest +go install golang.org/dl/go1.16.12@latest +go install golang.org/dl/go1.15.15@latest + +go1.15.15 download +go1.16.12 download +go1.17.5 download +go1.18beta1 download diff --git a/scripts/compiler-versions-time b/scripts/compiler-versions-time new file mode 100755 index 0000000000..03ed64965e --- /dev/null +++ 
b/scripts/compiler-versions-time @@ -0,0 +1,9 @@ +#!/bin/sh + +for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv check ~/tmp/big.csv > /dev/null; done +echo + +for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv cat ~/tmp/big.csv > /dev/null; done +echo + +for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv --from ~/tmp/big.csv put -f ./scripts/chain-1.mlr > /dev/null; done diff --git a/todo.txt b/todo.txt index a52526f0a3..298672ed72 100644 --- a/todo.txt +++ b/todo.txt @@ -16,29 +16,6 @@ PUNCHDOWN LIST * numeric-inference perf o perf docs mac -> mac w/ 1.16 o multiple go versions - 1.15, 1.16, 1.17; 1.18 beta - https://go.dev/doc/manage-install - https://go.dev/dl/#go1.18beta1 - go install golang.org/dl/go1.18beta1@latest - go install golang.org/dl/go1.17.5@latest - go install golang.org/dl/go1.16.12@latest - go install golang.org/dl/go1.15.15@latest - - go1.15.15 download - go1.16.12 download - go1.17.5 download - go1.18beta1 download - - for go in go1.15.15 go1.16.12 go1.17.5 go1.18beta1; do - $go clean github.com/johnkerl/miller/cmd/mlr - $go build github.com/johnkerl/miller/cmd/mlr - mv mlr mlr-$go - done - - for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv check ~/tmp/big.csv > /dev/null; done - for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv cat ~/tmp/big.csv > /dev/null; done - for mlr in mlr5 mlr-go1.1*; do justtime $mlr --csv --from ~/tmp/big.csv put -f ./scripts/chain-1.mlr > /dev/null; done - o https://stackoverflow.com/questions/64513411/why-is-strconv-parseuint-so-slow-compared-to-strconv-atoi o benchmark per se o enough UT cases From c1dcdd652c2522f97b00992624d425ddd26a5042 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 25 Dec 2021 22:39:41 -0500 Subject: [PATCH 05/16] mlr version in addition to mlr --version --- docs/src/new-in-miller-6.md | 8 ++++---- docs/src/new-in-miller-6.md.in | 8 ++++---- internal/pkg/auxents/auxents.go | 8 ++++++++ internal/pkg/auxents/repl/prompt.go | 2 +- todo.txt | 5 +---- 5 files changed, 18 insertions(+), 13 
deletions(-) diff --git a/docs/src/new-in-miller-6.md b/docs/src/new-in-miller-6.md index c948d4b5e0..563fad8315 100644 --- a/docs/src/new-in-miller-6.md +++ b/docs/src/new-in-miller-6.md @@ -272,11 +272,11 @@ As a benchmark, the [example.csv](https://github.com/johnkerl/miller/blob/main/d [was expanded](https://github.com/johnkerl/miller/blob/main/scripts/make-big-files) into a million-line CSV file, then converted to DKVP, JSON, etc. -These were run on a commodity Mac laptop with four CPUs, using `go1.16.5 darwin/amd64`. - -Linux benchmarks are pending. +Notes: -As of late 2021, Miller has been benchmarks using Go compiler versions 1.15.15, 1.16.12, 1.17.5, and 1.18beta1, with no significant performance changes attributable to compiler versions. +* These were run on a commodity Mac laptop with four CPUs, on macOS Monterey, using `go1.16.5 darwin/amd64`. +* Linux benchmarks are pending. +* As of late 2021, Miller has been benchmarked using Go compiler versions 1.15.15, 1.16.12, 1.17.5, and 1.18beta1, with no significant performance changes attributable to compiler versions. For the [first benchmark](https://github.com/johnkerl/miller/blob/main/scripts/time-big-files), we have `mlr cat` of those files, with processing times shown: diff --git a/docs/src/new-in-miller-6.md.in b/docs/src/new-in-miller-6.md.in index f6595fe1fe..1fe1facb23 100644 --- a/docs/src/new-in-miller-6.md.in +++ b/docs/src/new-in-miller-6.md.in @@ -230,11 +230,11 @@ As a benchmark, the [example.csv](https://github.com/johnkerl/miller/blob/main/d [was expanded](https://github.com/johnkerl/miller/blob/main/scripts/make-big-files) into a million-line CSV file, then converted to DKVP, JSON, etc. -These were run on a commodity Mac laptop with four CPUs, using `go1.16.5 darwin/amd64`. - -Linux benchmarks are pending.
+Notes: -As of late 2021, Miller has been benchmarks using Go compiler versions 1.15.15, 1.16.12, 1.17.5, and 1.18beta1, with no significant performance changes attributable to compiler versions. +* These were run on a commodity Mac laptop with four CPUs, on macOS Monterey, using `go1.16.5 darwin/amd64`. +* Linux benchmarks are pending. +* As of late 2021, Miller has been benchmarked using Go compiler versions 1.15.15, 1.16.12, 1.17.5, and 1.18beta1, with no significant performance changes attributable to compiler versions. For the [first benchmark](https://github.com/johnkerl/miller/blob/main/scripts/time-big-files), we have `mlr cat` of those files, with processing times shown: diff --git a/internal/pkg/auxents/auxents.go b/internal/pkg/auxents/auxents.go index 231a168990..ef87d510ef 100644 --- a/internal/pkg/auxents/auxents.go +++ b/internal/pkg/auxents/auxents.go @@ -8,10 +8,12 @@ package auxents import ( "fmt" "os" + "runtime" "github.com/johnkerl/miller/internal/pkg/auxents/help" "github.com/johnkerl/miller/internal/pkg/auxents/regtest" "github.com/johnkerl/miller/internal/pkg/auxents/repl" + "github.com/johnkerl/miller/internal/pkg/version" ) // tAuxMain is a function-pointer type for the entrypoint handler for a given auxent, @@ -38,6 +40,7 @@ func init() { {"help", help.HelpMain}, {"regtest", regtest.RegTestMain}, {"repl", repl.ReplMain}, + {"version", showVersion}, } } @@ -82,3 +85,8 @@ func ShowAuxEntries(o *os.File) { fmt.Fprintf(o, "For more information, please invoke mlr {subcommand} --help.\n") } + +func showVersion(args []string) int { + fmt.Printf("mlr version %s for %s/%s/%s\n", version.STRING, runtime.GOOS, runtime.GOARCH, runtime.Version()) + return 0 +} diff --git a/internal/pkg/auxents/repl/prompt.go b/internal/pkg/auxents/repl/prompt.go index 350bf849f3..c4c4a4a767 100644 --- a/internal/pkg/auxents/repl/prompt.go +++ b/internal/pkg/auxents/repl/prompt.go @@ -50,7 +50,7 @@ func getPrompt2() string { func (repl *Repl) printStartupBanner() { if
repl.inputIsTerminal { - fmt.Printf("Miller %s REPL for %s:%s:%s\n", version.STRING, runtime.GOOS, runtime.GOARCH, runtime.Version()) + fmt.Printf("Miller %s REPL for %s/%s/%s\n", version.STRING, runtime.GOOS, runtime.GOARCH, runtime.Version()) fmt.Printf("Docs: %s\n", lib.DOC_URL) fmt.Printf("Type ':h' or ':help' for online help; ':q' or ':quit' to quit.\n") } diff --git a/todo.txt b/todo.txt index 298672ed72..7a4d960a6e 100644 --- a/todo.txt +++ b/todo.txt @@ -14,14 +14,11 @@ PUNCHDOWN LIST ? array/map fields: marshal as JSON_SINGLE_LINE * numeric-inference perf - o perf docs mac -> mac w/ 1.16 - o multiple go versions o https://stackoverflow.com/questions/64513411/why-is-strconv-parseuint-so-slow-compared-to-strconv-atoi o benchmark per se - o enough UT cases o reconsider TryInt/TryFloat -> maybe try number -- ? o get octal regexp out of hot path - o make a grammar for numbers & case-though + o make a grammar for numbers & case-through - len 0 - len 1 - has leading minus; strip & rest From fe439d04ec2045ec7b48b622e287d08a0f393e42 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 25 Dec 2021 23:34:22 -0500 Subject: [PATCH 06/16] some go-benchmark files for Mac/Linux perf comparisons --- Makefile | 17 +++-- .../pkg/input/record_reader_benchmark_test.go | 71 +++++++++++++++++++ internal/pkg/mlrval/mlrval_benchmark_test.go | 35 +++++++++ scripts/chain-1.mlr | 2 +- todo.txt | 1 + 5 files changed, 120 insertions(+), 6 deletions(-) create mode 100644 internal/pkg/input/record_reader_benchmark_test.go create mode 100644 internal/pkg/mlrval/mlrval_benchmark_test.go diff --git a/Makefile b/Makefile index 4af0cc6f78..39270b578b 100644 --- a/Makefile +++ b/Makefile @@ -31,15 +31,22 @@ install: build unit-test ut: go test github.com/johnkerl/miller/internal/pkg/... -lib-ut: +ut-lib: go test github.com/johnkerl/miller/internal/pkg/lib... -mv-ut: +ut-mlv: go test github.com/johnkerl/miller/internal/pkg/mlrval/... 
-bifs-ut: +ut-bifs: go test github.com/johnkerl/miller/internal/pkg/bifs/... -input-ut: +ut-input: go test github.com/johnkerl/miller/internal/pkg/input/... +bench: + go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/... +bench-mlv: + go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/mlrval/... +bench-input: + go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/input/... + # ---------------------------------------------------------------- # Regression tests (large number) # @@ -102,4 +109,4 @@ release_tarball: build check # ================================================================ # Go does its own dependency management, outside of make. -.PHONY: build mlr check unit_test regression_test fmt staticcheck dev docs +.PHONY: build mlr check unit_test regression_test bench fmt staticcheck dev docs diff --git a/internal/pkg/input/record_reader_benchmark_test.go b/internal/pkg/input/record_reader_benchmark_test.go new file mode 100644 index 0000000000..93ce898570 --- /dev/null +++ b/internal/pkg/input/record_reader_benchmark_test.go @@ -0,0 +1,71 @@ +package input + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/johnkerl/miller/internal/pkg/cli" +) + +// go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/input/... 
+ +func BenchmarkDKVPParse(b *testing.B) { + readerOptions := &cli.TReaderOptions{ + InputFileFormat: "dkvp", + IFS: ",", + IPS: "=", + IRS: "\n", + } + reader, err := NewRecordReaderDKVP(readerOptions, 1) + assert.Nil(b, err) + + for i := 0; i < b.N; i++ { + _, _ = recordFromDKVPLine( + reader, + "color=yellow,shape=triangle,flag=true,k=1,index=11,quantity=43.6498,rate=9.8870", + ) + } +} + +func BenchmarkNIDXParse(b *testing.B) { + readerOptions := &cli.TReaderOptions{ + InputFileFormat: "nidx", + IFS: " ", + AllowRepeatIFS: true, + IRS: "\n", + } + reader, err := NewRecordReaderNIDX(readerOptions, 1) + assert.Nil(b, err) + + for i := 0; i < b.N; i++ { + _, _ = recordFromDKVPLine( + reader, + "yellow triangle true 1 11 43.6498 9.8870", + ) + } +} + +func BenchmarkXTABParse(b *testing.B) { + readerOptions := &cli.TReaderOptions{ + InputFileFormat: "xtab", + IPS: " ", + IFS: "\n", + IRS: "\n", + } + reader, err := NewRecordReaderXTAB(readerOptions, 1) + assert.Nil(b, err) + + stanza := newStanza() + stanza.dataLines.PushBack("color yellow") + stanza.dataLines.PushBack("shape triangle") + stanza.dataLines.PushBack("flag true") + stanza.dataLines.PushBack("k 1") + stanza.dataLines.PushBack("index 11") + stanza.dataLines.PushBack("quantity 43.6498") + stanza.dataLines.PushBack("rate 9.8870") + + for i := 0; i < b.N; i++ { + _, _ = reader.recordFromXTABLines(stanza.dataLines) + } +} diff --git a/internal/pkg/mlrval/mlrval_benchmark_test.go b/internal/pkg/mlrval/mlrval_benchmark_test.go new file mode 100644 index 0000000000..141e855db2 --- /dev/null +++ b/internal/pkg/mlrval/mlrval_benchmark_test.go @@ -0,0 +1,35 @@ +package mlrval + +import ( + "testing" +) + +// go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/mlrval/... 
+ +func BenchmarkFromDeferredType(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = FromDeferredType("123") + } +} + +func BenchmarkInferIntFromDeferredType(b *testing.B) { + for i := 0; i < b.N; i++ { + mv := FromDeferredType("123") + mv.Type() + } +} + +func BenchmarkInferFloatFromDeferredType(b *testing.B) { + for i := 0; i < b.N; i++ { + mv := FromDeferredType("123.4") + mv.Type() + } +} + +func BenchmarkInferStringFromDeferredType(b *testing.B) { + for i := 0; i < b.N; i++ { + mv := FromDeferredType("abc") + mv.Type() + } +} + diff --git a/scripts/chain-1.mlr b/scripts/chain-1.mlr index c2279799bc..05fa011141 100644 --- a/scripts/chain-1.mlr +++ b/scripts/chain-1.mlr @@ -1,2 +1,2 @@ $color_shape = $color . $shape; -$y = int($k) + int($index) **3 + log10(float($quantity)/float($rate)); +$y = $k + $index **3 + log10($quantity/$rate); diff --git a/todo.txt b/todo.txt index 7a4d960a6e..672dfbd1bc 100644 --- a/todo.txt +++ b/todo.txt @@ -14,6 +14,7 @@ PUNCHDOWN LIST ? array/map fields: marshal as JSON_SINGLE_LINE * numeric-inference perf + o README-profiling.md re various scripts o https://stackoverflow.com/questions/64513411/why-is-strconv-parseuint-so-slow-compared-to-strconv-atoi o benchmark per se o reconsider TryInt/TryFloat -> maybe try number -- ? 
From 115d99999e559d1d9c9bfd1b5ed1c44fb0df76fc Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 25 Dec 2021 23:45:40 -0500 Subject: [PATCH 07/16] neaten perf-scripts --- scripts/chain-cmps.sh | 22 +++++++++++----------- scripts/time-big-file | 2 +- scripts/time-big-files | 20 ++++++++++++-------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/scripts/chain-cmps.sh b/scripts/chain-cmps.sh index de4bf262e5..31a541887f 100755 --- a/scripts/chain-cmps.sh +++ b/scripts/chain-cmps.sh @@ -1,13 +1,13 @@ -mlrs="mlr5 ~/tmp/miller/mlr ./mlr" -reps="1" +#mlrs="mlr5 ~/tmp/miller/mlr ./mlr" +#reps="1" -#mlrs="mlr5 ./mlr" -#reps="1 2 3" +mlrs="mlr5 ./mlr" +reps="1 2 3" -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv check | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv cat | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv head | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tail | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tac | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -f shape | md5sum; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -n quantity | md5sum; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv check > /dev/null; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv cat > /dev/null; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv head > /dev/null; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tail > /dev/null; done; done +echo; for mlr in $mlrs; do for 
k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tac > /dev/null; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -f shape > /dev/null; done; done +echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -n quantity > /dev/null; done; done diff --git a/scripts/time-big-file b/scripts/time-big-file index 5da24aa787..f660391a46 100755 --- a/scripts/time-big-file +++ b/scripts/time-big-file @@ -15,4 +15,4 @@ fi if [ $# -eq 2 ]; then mlr="$2" fi -justtime $mlr $iflag cat ~/tmp/big.$suffix | md5sum - +justtime $mlr $iflag cat ~/tmp/big.$suffix > /dev/null diff --git a/scripts/time-big-files b/scripts/time-big-files index 135fd4184e..882e20b1d0 100755 --- a/scripts/time-big-files +++ b/scripts/time-big-files @@ -2,12 +2,16 @@ ourdir=$(dirname $0) -mlrs="mlr5 ~/tmp/miller/mlr ./mlr" - -echo; for mlr in $mlrs; do $ourdir/time-big-file csv $mlr; done -echo; for mlr in $mlrs; do $ourdir/time-big-file csvlite $mlr; done -echo; for mlr in $mlrs; do $ourdir/time-big-file dkvp $mlr; done -echo; for mlr in $mlrs; do $ourdir/time-big-file nidx $mlr; done -echo; for mlr in $mlrs; do $ourdir/time-big-file xtab $mlr; done -echo; for mlr in $mlrs; do $ourdir/time-big-file json $mlr; done +#mlrs="mlr5 ~/tmp/miller/mlr ./mlr" +mlrs="mlr5 ./mlr" + +#reps="1" +reps="1 2 3" + +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file csv $mlr; done; done +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file csvlite $mlr; done; done +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file dkvp $mlr; done; done +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file nidx $mlr; done; done +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file xtab $mlr; done; done +echo; for mlr in $mlrs; do for k in $reps; do $ourdir/time-big-file json $mlr; done; done From a834bd9228ab5215d7f08e946273bf24137f029d Mon Sep 17 00:00:00 2001 From: 
John Kerl Date: Sun, 26 Dec 2021 16:43:00 -0500 Subject: [PATCH 08/16] merge --- scripts/chain-cmps.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/chain-cmps.sh b/scripts/chain-cmps.sh index 31a541887f..68bb9ad690 100755 --- a/scripts/chain-cmps.sh +++ b/scripts/chain-cmps.sh @@ -6,7 +6,6 @@ reps="1 2 3" echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv check > /dev/null; done; done echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv cat > /dev/null; done; done -echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv head > /dev/null; done; done echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tail > /dev/null; done; done echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv tac > /dev/null; done; done echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv sort -f shape > /dev/null; done; done From a5e136b36e068bcf6dcfd17c269b4bf61dcf7a76 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 26 Dec 2021 11:05:34 -0500 Subject: [PATCH 09/16] type-scan optimization tests --- Makefile | 12 +- cmd/scan/main.go | 338 +++++++++++++++++++++++++++++++++++++++++++++++ todo.txt | 8 +- 3 files changed, 355 insertions(+), 3 deletions(-) create mode 100644 cmd/scan/main.go diff --git a/Makefile b/Makefile index 39270b578b..f2f55ce01b 100644 --- a/Makefile +++ b/Makefile @@ -57,12 +57,22 @@ bench-input: regression-test: go test -v regression_test.go +# ---------------------------------------------------------------- +# Experimental executables: +scan: + go build github.com/johnkerl/miller/cmd/scan + +# ---------------------------------------------------------------- +# Formatting # go fmt ./... finds experimental C files which we want to ignore. fmt: -go fmt ./cmd/... -go fmt ./internal/pkg/... 
-go fmt ./regression_test.go +# ---------------------------------------------------------------- +# Static analysis + # Needs first: go install honnef.co/go/tools/cmd/staticcheck@latest # See also: https://staticcheck.io staticcheck: @@ -109,4 +119,4 @@ release_tarball: build check # ================================================================ # Go does its own dependency management, outside of make. -.PHONY: build mlr check unit_test regression_test bench fmt staticcheck dev docs +.PHONY: build mlr scan check unit_test regression_test bench fmt staticcheck dev docs diff --git a/cmd/scan/main.go b/cmd/scan/main.go new file mode 100644 index 0000000000..343465011d --- /dev/null +++ b/cmd/scan/main.go @@ -0,0 +1,338 @@ +// ================================================================ +// Experiments for type-inference performance optimization +// ================================================================ + +package main + +import ( + "fmt" + "os" + "runtime" + "runtime/debug" + "runtime/pprof" + "strconv" + + "github.com/pkg/profile" // for trace.out +) + +type tScanType int + +const ( + scanTypeString tScanType = 0 + scanTypeDecimalInt = 1 + scanTypeOctalInt = 2 + scanTypeHexInt = 3 + scanTypeBinaryInt = 4 + scanTypeMaybeFloat = 5 + scanTypeBool = 6 +) + +var scanTypeNames = []string{ + "string", + "decint", + "octint", + "hexint", + "binint", + "float?", + "bool", +} + +// 00000000: 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f |................| +// 00000010: 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f |................| +// 00000020: 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f | !"#$%&'()*+,-./| +// 00000030: 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f |0123456789:;<=>?| +// 00000040: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f |@ABCDEFGHIJKLMNO| +// 00000050: 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f |PQRSTUVWXYZ[\]^_| +// 00000060: 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f |`abcdefghijklmno| +// 00000070: 70 71 72 73 
74 75 76 77 78 79 7a 7b 7c 7d 7e 7f |pqrstuvwxyz{|}~.| + +var isDecimalDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 20-2f + true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f +} + +var isHexDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 20-2f + true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f + false, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, 
false, false, false, false, false, false, false, false, // 70-7f +} + +// Possible character in floats include '.', 0-9, [eE], [-+] -- the latter two for things like 1.2e-8. +// Miller intentionally does not accept 'inf' or 'NaN' as float numbers in file-input data. +var isFloatDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, false, false, false, false, false, false, false, false, false, false, true, false, true, true, false, // 20-2f + true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f + false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f +} + +func isDecimalDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isDecimalDigitTable[c] + } else { + return false + } +} + +func isHexDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isHexDigitTable[c] + } else { + return false + } +} + +func isFloatDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isFloatDigitTable[c] + } else { + return false + } +} + +// ---------------------------------------------------------------- + +// TODO: UT the type-names LUT +// TODO: inout tabls & CLI access & UT access & bench access + +func findScanType(input []byte) tScanType { + if len(input) == 0 { + return scanTypeString + } + + i0 := input[0] + if i0 == '-' { + 
return findScanTypePositiveNumberOrString(input[1:]) + } + if i0 >= '0' && i0 <= '9' { + return findScanTypePositiveNumberOrString(input) + } + if i0 == '.' { + return findScanTypePositiveDecimalOrFloatOrString(input) + } + + sinput := string(input) + if sinput == "true" || sinput == "false" { + return scanTypeBool + } + + return scanTypeString +} + +// TODO: type up the why +// o make a grammar for numbers & case-through +// k len 0 +// - len 1 +// k has leading minus; strip & rest +// - 0x, 0b, 0[0-9] +// - decimal: leading minus; [0-9]+ +// - octal: leading minus; 0[0-7]+ +// - hex: leading minus; 0[xX][0-9a-fA-F]+ +// - float: leadinug minus; [0-9] or '.' +// +// o float literals: +// 123 123. 123.4 .234 +// 1e2 1e-2 1.2e3 1.e3 1.2e-3 1.e-3 +// .2e3 .2e-3 1.e-3 +// +// ?- [0-9]+ +// ?- [0-9]+ '.' [0-9]* +// ?- [0-9]* '.' [0-9]+ +// ?- [0-9]+ [eE] ?- [0-9]+ +// ?- [0-9]+ '.' [0-9]* [eE] ?- [0-9]+ +// ?- [0-9]* '.' [0-9]+ [eE] ?- [0-9]+ + +func findScanTypePositiveNumberOrString(input []byte) tScanType { + if len(input) == 0 { + return scanTypeString + } + i0 := input[0] + + if i0 == '.' 
{ + return findScanTypePositiveFloatOrString(input) + } + + if isDecimalDigit(i0) { + if len(input) == 1 { + return scanTypeDecimalInt + } + if i0 == '0' { + i1 := input[1] + if i1 == 'x' || i1 == 'X' { + return findScanTypePositiveHexOrString(input[2:]) + } + if i1 == 'b' || i1 == 'B' { + return findScanTypePositiveBinaryOrString(input[2:]) + } + } + + // TODO: nope, could be float too + return findScanTypePositiveDecimalOrFloatOrString(input) + } + + return scanTypeString +} + +func findScanTypePositiveFloatOrString(input []byte) tScanType { + for _, c := range []byte(input) { + if !isFloatDigit(c) { + return scanTypeString + } + } + return scanTypeMaybeFloat +} + +func findScanTypePositiveDecimalOrFloatOrString(input []byte) tScanType { + maybeInt := true + for _, c := range []byte(input) { + // All float digits are decimal-int digits so if the current character + // is not a float digit, this can't be either a float or a decimal int. + // Example: "1x2" + if !isFloatDigit(c) { + return scanTypeString + } + + // Examples: "1e2" or "1x2". 
+ if !isDecimalDigit(c) { + maybeInt = false + } + } + if maybeInt { + return scanTypeDecimalInt + } else { + return scanTypeMaybeFloat + } +} + +// Leading 0x has already been stripped +func findScanTypePositiveHexOrString(input []byte) tScanType { + for _, c := range []byte(input) { + if !isHexDigit(c) { + return scanTypeString + } + } + return scanTypeHexInt +} + +// Leading 0b has already been stripped +func findScanTypePositiveBinaryOrString(input []byte) tScanType { + for _, c := range []byte(input) { + if c < '0' || c > '1' { + return scanTypeString + } + } + return scanTypeBinaryInt +} + +// ---------------------------------------------------------------- +func scanMain() { + + // var c byte + // + // fmt.Printf("dec: ") + // for c = 0x20; c <= 0x6f; c++ { + // if isDecimalDigit(c) { + // fmt.Printf("%c", c) + // } + // } + // fmt.Println() + // + // fmt.Printf("hex: ") + // for c = 0x20; c <= 0x6f; c++ { + // if isHexDigit(c) { + // fmt.Printf("%c", c) + // } + // } + // fmt.Println() + // + // fmt.Printf("float: ") + // for c = 0x20; c <= 0x6f; c++ { + // if isFloatDigit(c) { + // fmt.Printf("%c", c) + // } + // } + // fmt.Println() + + // TODO: + // func ParseInt(s string, base int, bitSize int) (int64, error) + // func ParseUint(s string, base int, bitSize int) (uint64, error) + + for _, arg := range os.Args[1:] { + scanType := findScanType([]byte(arg)) + fmt.Printf("%-10s -> %s\n", arg, scanTypeNames[scanType]) + } +} + +// ---------------------------------------------------------------- +func main() { + + // Respect env $GOMAXPROCS if provided and it parses as an integer, else set default. + haveSetGoMaxProcs := false + goMaxProcsString := os.Getenv("GOMAXPROCS") + if goMaxProcsString != "" { + goMaxProcs, err := strconv.Atoi(goMaxProcsString) + if err == nil { + runtime.GOMAXPROCS(goMaxProcs) + haveSetGoMaxProcs = true + } + } + if !haveSetGoMaxProcs { + // As of Go 1.16 this is the default anyway. For 1.15 and below we need + // to explicitly set this.
+ runtime.GOMAXPROCS(runtime.NumCPU()) + } + + debug.SetGCPercent(500) // Empirical: See README-profiling.md + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // CPU profiling + // + // We do this here, not in the command-line parser, since + // pprof.StopCPUProfile() needs to be called at the very end of everything. + // Putting this pprof logic into a go func running in parallel with main, + // and properly stopping the profile only when main ends via chan-sync, + // results in a zero-length pprof file. + // + // Please see README-profiling.md for more information. + + if len(os.Args) >= 3 && os.Args[1] == "--cpuprofile" { + profFilename := os.Args[2] + handle, err := os.Create(profFilename) + if err != nil { + fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) + return + } + defer handle.Close() + + if err := pprof.StartCPUProfile(handle); err != nil { + fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) + return + } + defer pprof.StopCPUProfile() + + fmt.Fprintf(os.Stderr, "CPU profile started.\n") + defer fmt.Fprintf(os.Stderr, "CPU profile finished.\ngo tool pprof -http=:8080 %s\n", profFilename) + } + + if len(os.Args) >= 3 && os.Args[1] == "--traceprofile" { + defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop() + defer fmt.Fprintf(os.Stderr, "go tool trace trace.out\n") + } + + scanMain() +} diff --git a/todo.txt b/todo.txt index 672dfbd1bc..1e0c04bcf4 100644 --- a/todo.txt +++ b/todo.txt @@ -2,6 +2,7 @@ PUNCHDOWN LIST * blockers: + ! .mlrrc suppress - linux/1.17 perf checks - fractional-strptime - improved regex doc w/ lots of examples @@ -16,9 +17,12 @@ PUNCHDOWN LIST * numeric-inference perf o README-profiling.md re various scripts o https://stackoverflow.com/questions/64513411/why-is-strconv-parseuint-so-slow-compared-to-strconv-atoi - o benchmark per se - o reconsider TryInt/TryFloat -> maybe try number -- ?
+ k benchmark per se + o mlr --time + o try traceprofile? + o find a second linux machine to check -- ? o get octal regexp out of hot path + o make a grammar for numbers & case-through - len 0 - len 1 From 19c5a5171bba14199b318dc9fd9520aa7334c84e Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 26 Dec 2021 12:05:14 -0500 Subject: [PATCH 10/16] type-scan optimization infra --- Makefile | 2 + cmd/scan/main.go | 331 +---------------------- internal/pkg/scan/digits.go | 71 +++++ internal/pkg/scan/digits_test.go | 46 ++++ internal/pkg/scan/doc.go | 3 + internal/pkg/scan/find.go | 148 ++++++++++ internal/pkg/scan/find_benchmark_test.go | 65 +++++ internal/pkg/scan/find_test.go | 62 +++++ internal/pkg/scan/type.go | 33 +++ internal/pkg/scan/type_test.go | 17 ++ todo.txt | 2 + 11 files changed, 462 insertions(+), 318 deletions(-) create mode 100644 internal/pkg/scan/digits.go create mode 100644 internal/pkg/scan/digits_test.go create mode 100644 internal/pkg/scan/doc.go create mode 100644 internal/pkg/scan/find.go create mode 100644 internal/pkg/scan/find_benchmark_test.go create mode 100644 internal/pkg/scan/find_test.go create mode 100644 internal/pkg/scan/type.go create mode 100644 internal/pkg/scan/type_test.go diff --git a/Makefile b/Makefile index f2f55ce01b..263ace6c8a 100644 --- a/Makefile +++ b/Makefile @@ -33,6 +33,8 @@ unit-test ut: ut-lib: go test github.com/johnkerl/miller/internal/pkg/lib... +ut-scan: + go test github.com/johnkerl/miller/internal/pkg/scan/... ut-mlv: go test github.com/johnkerl/miller/internal/pkg/mlrval/... 
ut-bifs: diff --git a/cmd/scan/main.go b/cmd/scan/main.go index 343465011d..9b4537577c 100644 --- a/cmd/scan/main.go +++ b/cmd/scan/main.go @@ -7,332 +7,27 @@ package main import ( "fmt" "os" - "runtime" - "runtime/debug" - "runtime/pprof" - "strconv" - "github.com/pkg/profile" // for trace.out + "github.com/johnkerl/miller/internal/pkg/scan" ) -type tScanType int - -const ( - scanTypeString tScanType = 0 - scanTypeDecimalInt = 1 - scanTypeOctalInt = 2 - scanTypeHexInt = 3 - scanTypeBinaryInt = 4 - scanTypeMaybeFloat = 5 - scanTypeBool = 6 -) - -var scanTypeNames = []string{ - "string", - "decint", - "octint", - "hexint", - "binint", - "float?", - "bool", -} - -// 00000000: 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f |................| -// 00000010: 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f |................| -// 00000020: 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f | !"#$%&'()*+,-./| -// 00000030: 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f |0123456789:;<=>?| -// 00000040: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f |@ABCDEFGHIJKLMNO| -// 00000050: 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f |PQRSTUVWXYZ[\]^_| -// 00000060: 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f |`abcdefghijklmno| -// 00000070: 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f |pqrstuvwxyz{|}~.| - -var isDecimalDigitTable = []bool{ - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 20-2f - true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 40-4f - false, false, false, false, false, 
false, false, false, false, false, false, false, false, false, false, false, // 50-5f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 60-6f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f -} - -var isHexDigitTable = []bool{ - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 20-2f - true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f - false, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, // 40-4f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f - false, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, // 60-6f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f -} - -// Possible character in floats include '.', 0-9, [eE], [-+] -- the latter two for things like 1.2e-8. -// Miller intentionally does not accept 'inf' or 'NaN' as float numbers in file-input data. 
-var isFloatDigitTable = []bool{ - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f - false, false, false, false, false, false, false, false, false, false, false, true, false, true, true, false, // 20-2f - true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f - false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, // 40-4f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f - false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, // 60-6f - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f -} - -func isDecimalDigit(c byte) bool { - if c < 128 { // byte is unsigned in Go - return isDecimalDigitTable[c] - } else { - return false - } -} - -func isHexDigit(c byte) bool { - if c < 128 { // byte is unsigned in Go - return isHexDigitTable[c] - } else { - return false - } -} - -func isFloatDigit(c byte) bool { - if c < 128 { // byte is unsigned in Go - return isFloatDigitTable[c] - } else { - return false - } -} - -// ---------------------------------------------------------------- - -// TODO: UT the type-names LUT -// TODO: inout tabls & CLI access & UT access & bench access - -func findScanType(input []byte) tScanType { - if len(input) == 0 { - return scanTypeString - } - - i0 := input[0] - if i0 == '-' { - return findScanTypePositiveNumberOrString(input[1:]) - } - if i0 >= '0' && i0 <= '9' { - return findScanTypePositiveNumberOrString(input) - } - if i0 == '.' 
{ - return findScanTypePositiveDecimalOrFloatOrString(input) - } - - sinput := string(input) - if sinput == "true" || sinput == "false" { - return scanTypeBool - } - - return scanTypeString -} - -// TODO: type up the why -// o make a grammar for numbers & case-through -// k len 0 -// - len 1 -// k has leading minus; strip & rest -// - 0x, 0b, 0[0-9] -// - decimal: leading minus; [0-9]+ -// - octal: leading minus; 0[0-7]+ -// - hex: leading minus; 0[xX][0-9a-fA-F]+ -// - float: leadinug minus; [0-9] or '.' -// -// o float literals: -// 123 123. 123.4 .234 -// 1e2 1e-2 1.2e3 1.e3 1.2e-3 1.e-3 -// .2e3 .2e-3 1.e-3 -// -// ?- [0-9]+ -// ?- [0-9]+ '.' [0-9]* -// ?- [0-9]* '.' [0-9]+ -// ?- [0-9]+ [eE] ?- [0-9]+ -// ?- [0-9]+ '.' [0-9]* [eE] ?- [0-9]+ -// ?- [0-9]* '.' [0-9]+ [eE] ?- [0-9]+ - -func findScanTypePositiveNumberOrString(input []byte) tScanType { - if len(input) == 0 { - return scanTypeString - } - i0 := input[0] - - if i0 == '.' { - return findScanTypePositiveFloatOrString(input) - } - - if isDecimalDigit(i0) { - if len(input) == 1 { - return scanTypeDecimalInt - } - if i0 == '0' { - i1 := input[1] - if i1 == 'x' || i1 == 'X' { - return findScanTypePositiveHexOrString(input[2:]) - } - if i1 == 'b' || i1 == 'B' { - return findScanTypePositiveBinaryOrString(input[2:]) - } - } - - // TODO: nope, could be float too - return findScanTypePositiveDecimalOrFloatOrString(input) - } - - return scanTypeString -} - -func findScanTypePositiveFloatOrString(input []byte) tScanType { - for _, c := range []byte(input) { - if !isFloatDigit(c) { - return scanTypeString - } - } - return scanTypeMaybeFloat -} - -func findScanTypePositiveDecimalOrFloatOrString(input []byte) tScanType { - maybeInt := true - for _, c := range []byte(input) { - // All float digits are decimal-int digits so if the current character - // is not a float digit, this can't be either a float or a decimal int. 
- // Example: "1x2" - if !isFloatDigit(c) { - return scanTypeString - } - - // Examples: "1e2" or "1x2". - if !isDecimalDigit(c) { - maybeInt = false - } - } - if maybeInt { - return scanTypeDecimalInt - } else { - return scanTypeMaybeFloat - } -} - -// Leading 0x has already been stripped -func findScanTypePositiveHexOrString(input []byte) tScanType { - for _, c := range []byte(input) { - if !isHexDigit(c) { - return scanTypeString - } - } - return scanTypeHexInt -} - -// Leading 0b has already been stripped -func findScanTypePositiveBinaryOrString(input []byte) tScanType { - for _, c := range []byte(input) { - if c < '0' || c > '1' { - return scanTypeString - } - } - return scanTypeBinaryInt -} - -// ---------------------------------------------------------------- -func scanMain() { - - // var c byte - // - // fmt.Printf("dec: ") - // for c = 0x20; c <= 0x6f; c++ { - // if isDecimalDigit(c) { - // fmt.Printf("%c", c) - // } - // } - // fmt.Println() - // - // fmt.Printf("hex: ") - // for c = 0x20; c <= 0x6f; c++ { - // if isHexDigit(c) { - // fmt.Printf("%c", c) - // } - // } - // fmt.Println() - // - // fmt.Printf("float: ") - // for c = 0x20; c <= 0x6f; c++ { - // if isFloatDigit(c) { - // fmt.Printf("%c", c) - // } - // } - // fmt.Println() +// const ( +// scanTypeString ScanType = 0 +// scanTypeDecimalInt = 1 +// scanTypeOctalInt = 2 +// scanTypeHexInt = 3 +// scanTypeBinaryInt = 4 +// scanTypeMaybeFloat = 5 +// scanTypeBool = 6 +// ) +func main() { // TODO: // func ParseInt(s string, base int, bitSize int) (int64, error) // func ParseUint(s string, base int, bitSize int) (uint64, error) for _, arg := range os.Args[1:] { - scanType := findScanType([]byte(arg)) - fmt.Printf("%-10s -> %s\n", arg, scanTypeNames[scanType]) - } -} - -// ---------------------------------------------------------------- -func main() { - - // Respect env $GOMAXPROCS, if provided, else set default. 
- haveSetGoMaxProcs := false - goMaxProcsString := os.Getenv("GOMAXPROCS") - if goMaxProcsString != "" { - goMaxProcs, err := strconv.Atoi(goMaxProcsString) - if err != nil { - runtime.GOMAXPROCS(goMaxProcs) - haveSetGoMaxProcs = true - } - } - if !haveSetGoMaxProcs { - // As of Go 1.16 this is the default anyway. For 1.15 and below we need - // to explicitly set this. - runtime.GOMAXPROCS(runtime.NumCPU()) + scanType := scan.FindScanType(arg) + fmt.Printf("%-10s -> %s\n", arg, scan.TypeNames[scanType]) } - - debug.SetGCPercent(500) // Empirical: See README-profiling.md - - // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // CPU profiling - // - // We do this here, not in the command-line parser, since - // pprof.StopCPUProfile() needs to be called at the very end of everything. - // Putting this pprof logic into a go func running in parallel with main, - // and properly stopping the profile only when main ends via chan-sync, - // results in a zero-length pprof file. - // - // Please see README-profiling.md for more information. 
- - if len(os.Args) >= 3 && os.Args[1] == "--cpuprofile" { - profFilename := os.Args[2] - handle, err := os.Create(profFilename) - if err != nil { - fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) - return - } - defer handle.Close() - - if err := pprof.StartCPUProfile(handle); err != nil { - fmt.Fprintln(os.Stderr, os.Args[0], ": ", "Could not start CPU profile: ", err) - return - } - defer pprof.StopCPUProfile() - - fmt.Fprintf(os.Stderr, "CPU profile started.\n") - defer fmt.Fprintf(os.Stderr, "CPU profile finished.\ngo tool pprof -http=:8080 %s\n", profFilename) - } - - if len(os.Args) >= 3 && os.Args[1] == "--traceprofile" { - defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop() - defer fmt.Fprintf(os.Stderr, "go tool trace trace.out\n") - } - - scanMain() } diff --git a/internal/pkg/scan/digits.go b/internal/pkg/scan/digits.go new file mode 100644 index 0000000000..032ac9e7d5 --- /dev/null +++ b/internal/pkg/scan/digits.go @@ -0,0 +1,71 @@ +package scan + +// TODO: comment re context + +// 00000000: 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f |................| +// 00000010: 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f |................| +// 00000020: 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f | !"#$%&'()*+,-./| +// 00000030: 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f |0123456789:;<=>?| +// 00000040: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f |@ABCDEFGHIJKLMNO| +// 00000050: 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f |PQRSTUVWXYZ[\]^_| +// 00000060: 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f |`abcdefghijklmno| +// 00000070: 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f |pqrstuvwxyz{|}~.| + +var isDecimalDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, 
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 20-2f + true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f +} + +var isHexDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 20-2f + true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f + false, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f +} + +// Possible character in floats include '.', 0-9, [eE], [-+] -- the latter two for things like 1.2e-8. +// Miller intentionally does not accept 'inf' or 'NaN' as float numbers in file-input data. 
+var isFloatDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, false, false, false, false, false, false, false, false, false, false, true, false, true, true, false, // 20-2f + true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, // 30-3f + false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f +} + +func isDecimalDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isDecimalDigitTable[c] + } else { + return false + } +} + +func isHexDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isHexDigitTable[c] + } else { + return false + } +} + +func isFloatDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isFloatDigitTable[c] + } else { + return false + } +} diff --git a/internal/pkg/scan/digits_test.go b/internal/pkg/scan/digits_test.go new file mode 100644 index 0000000000..06c1329a85 --- /dev/null +++ b/internal/pkg/scan/digits_test.go @@ -0,0 +1,46 @@ +package scan + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIsDecimalDigit(t *testing.T) { + var c byte + for c = 0x00; c < 0xff; c++ { + if c >= '0' && c <= '9' { + assert.True(t, isDecimalDigit(c)) + } else { + assert.False(t, isDecimalDigit(c)) + } + } +} + +func TestIsHexDigit(t *testing.T) { + var c byte + for c = 0x00; c < 0xff; 
c++ { + if c >= '0' && c <= '9' { + assert.True(t, isHexDigit(c)) + } else if c >= 'a' && c <= 'f' { + assert.True(t, isHexDigit(c)) + } else if c >= 'A' && c <= 'F' { + assert.True(t, isHexDigit(c)) + } else { + assert.False(t, isHexDigit(c)) + } + } +} + +func TestIsFloatDigit(t *testing.T) { + var c byte + for c = 0x00; c < 0xff; c++ { + if c >= '0' && c <= '9' { + assert.True(t, isFloatDigit(c)) + } else if c == '.' || c == '-' || c == '+' || c == 'e' || c == 'E' { + assert.True(t, isFloatDigit(c)) + } else { + assert.False(t, isFloatDigit(c)) + } + } +} diff --git a/internal/pkg/scan/doc.go b/internal/pkg/scan/doc.go new file mode 100644 index 0000000000..67737c3c3a --- /dev/null +++ b/internal/pkg/scan/doc.go @@ -0,0 +1,3 @@ +// Package scan contains low-level logic for efficient type-inference of string +// to int/float/bool/string. +package scan diff --git a/internal/pkg/scan/find.go b/internal/pkg/scan/find.go new file mode 100644 index 0000000000..a0623a6b6a --- /dev/null +++ b/internal/pkg/scan/find.go @@ -0,0 +1,148 @@ +package scan + +// TODO: comment re context + +// o grammar for numbers & case-through +// k len 0 +// - len 1 +// k has leading minus; strip & rest +// - 0x, 0b, 0[0-9] +// - decimal: leading minus; [0-9]+ +// - octal: leading minus; 0[0-7]+ +// - hex: leading minus; 0[xX][0-9a-fA-F]+ +// - float: leadinug minus; [0-9] or '.' +// +// o float literals: +// 123 123. 123.4 .234 +// 1e2 1e-2 1.2e3 1.e3 1.2e-3 1.e-3 +// .2e3 .2e-3 1.e-3 +// +// ?- [0-9]+ +// ?- [0-9]+ '.' [0-9]* +// ?- [0-9]* '.' [0-9]+ +// ?- [0-9]+ [eE] ?- [0-9]+ +// ?- [0-9]+ '.' [0-9]* [eE] ?- [0-9]+ +// ?- [0-9]* '.' [0-9]+ [eE] ?- [0-9]+ + +func FindScanType(sinput string) ScanType { + input := []byte(sinput) + + if len(input) == 0 { + return scanTypeString + } + + i0 := input[0] + if i0 == '-' { + return findScanTypePositiveNumberOrString(input[1:]) + } + if i0 >= '0' && i0 <= '9' { + return findScanTypePositiveNumberOrString(input) + } + if i0 == '.' 
{ + if len(input) == 1 { + return scanTypeString + } else { + return findScanTypePositiveDecimalOrFloatOrString(input) + } + } + + if sinput == "true" || sinput == "false" { + return scanTypeBool + } + + return scanTypeString +} + +// Convenience function for unit test +func findScanTypeName(sinput string) string { + return TypeNames[FindScanType(sinput)] +} + +func findScanTypePositiveNumberOrString(input []byte) ScanType { + if len(input) == 0 { + return scanTypeString + } + i0 := input[0] + + if i0 == '.' { + return findScanTypePositiveFloatOrString(input) + } + + if isDecimalDigit(i0) { + if len(input) == 1 { + return scanTypeDecimalInt + } + if i0 == '0' { + i1 := input[1] + if i1 == 'x' || i1 == 'X' { + if len(input) == 2 { + return scanTypeString + } else { + return findScanTypePositiveHexOrString(input[2:]) + } + } + if i1 == 'b' || i1 == 'B' { + if len(input) == 2 { + return scanTypeString + } else { + return findScanTypePositiveBinaryOrString(input[2:]) + } + } + } + + return findScanTypePositiveDecimalOrFloatOrString(input) + } + + return scanTypeString +} + +func findScanTypePositiveFloatOrString(input []byte) ScanType { + for _, c := range []byte(input) { + if !isFloatDigit(c) { + return scanTypeString + } + } + return scanTypeMaybeFloat +} + +func findScanTypePositiveDecimalOrFloatOrString(input []byte) ScanType { + maybeInt := true + for _, c := range []byte(input) { + // All float digits are decimal-int digits so if the current character + // is not a float digit, this can't be either a float or a decimal int. + // Example: "1x2" + if !isFloatDigit(c) { + return scanTypeString + } + + // Examples: "1e2" or "1x2". 
+ if !isDecimalDigit(c) { + maybeInt = false + } + } + if maybeInt { + return scanTypeDecimalInt + } else { + return scanTypeMaybeFloat + } +} + +// Leading 0x has already been stripped +func findScanTypePositiveHexOrString(input []byte) ScanType { + for _, c := range []byte(input) { + if !isHexDigit(c) { + return scanTypeString + } + } + return scanTypeHexInt +} + +// Leading 0b has already been stripped +func findScanTypePositiveBinaryOrString(input []byte) ScanType { + for _, c := range []byte(input) { + if c < '0' || c > '1' { + return scanTypeString + } + } + return scanTypeBinaryInt +} diff --git a/internal/pkg/scan/find_benchmark_test.go b/internal/pkg/scan/find_benchmark_test.go new file mode 100644 index 0000000000..ba0880091b --- /dev/null +++ b/internal/pkg/scan/find_benchmark_test.go @@ -0,0 +1,65 @@ +package scan + +import ( + "testing" +) + +// go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/scan/... + +func BenchmarkFromNormalCases(b *testing.B) { + + data := []string{ + "yellow", "triangle", "true", "1", "11", "43.6498", "9.8870", + "red", "square", "true", "2", "15", "79.2778", "0.0130", + "red", "circle", "true", "3", "16", "13.8103", "2.9010", + "red", "square", "false", "4", "48", "77.5542", "7.4670", + "purple", "triangle", "false", "5", "51", "81.2290", "8.5910", + "red", "square", "false", "6", "64", "77.1991", "9.5310", + "purple", "triangle", "false", "7", "65", "80.1405", "5.8240", + "yellow", "circle", "true", "8", "73", "63.9785", "4.2370", + "yellow", "circle", "true", "9", "87", "63.5058", "8.3350", + "purple", "square", "false", "10", "91", "72.3735", "8.2430", + } + ndata := len(data) + + for i := 0; i < b.N; i++ { + _ = FindScanType(data[i%ndata]) + } +} + +func BenchmarkFromAbnormalCases(b *testing.B) { + + data := []string{ + "", "-", + "abc", "-abc", + "0", "-0", + "1", "-1", + "2", "-2", + "123", "-123", + "1.", "-1.", + ".2", "-.2", + ".", "-.", + "1.2", "-1.2", + "1.2.3", "-1.2.3", + "1e2e3", "-1e2e3", + 
"12e-2", "-12e-2", + "1e2x3", "-1e2x3", + "0x", "-0x", + "0x0", "-0x0", + "0xcafe", "-0xcafe", + "0xcape", "-0xcape", + "0b", "-0b", + "0b0", "-0b0", + "0b1011", "-0b1011", + "0b1021", "-0b1021", + "true", "true", + "false", "false", + "True", "True", + "False", "False", + } + ndata := len(data) + + for i := 0; i < b.N; i++ { + _ = FindScanType(data[i%ndata]) + } +} diff --git a/internal/pkg/scan/find_test.go b/internal/pkg/scan/find_test.go new file mode 100644 index 0000000000..0909df0c6b --- /dev/null +++ b/internal/pkg/scan/find_test.go @@ -0,0 +1,62 @@ +package scan + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestFindScanTypeName(t *testing.T) { + assert.Equal(t, typeNameString, findScanTypeName("")) + assert.Equal(t, typeNameString, findScanTypeName("-")) + assert.Equal(t, typeNameString, findScanTypeName("abc")) + assert.Equal(t, typeNameString, findScanTypeName("-abc")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("0")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("-0")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("1")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("-1")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("2")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("-2")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("123")) + assert.Equal(t, typeNameDecimalInt, findScanTypeName("-123")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1.")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName(".2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-.2")) + assert.Equal(t, typeNameString, findScanTypeName(".")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-.")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1.2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.2.3")) + assert.Equal(t, 
typeNameMaybeFloat, findScanTypeName("-1.2.3")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1e2e3")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1e2e3")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("12e-2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-12e-2")) + assert.Equal(t, typeNameString, findScanTypeName("1e2x3")) + assert.Equal(t, typeNameString, findScanTypeName("-1e2x3")) + assert.Equal(t, typeNameString, findScanTypeName("0x")) + assert.Equal(t, typeNameString, findScanTypeName("-0x")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0x0")) + assert.Equal(t, typeNameHexInt, findScanTypeName("-0x0")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0xcafe")) + assert.Equal(t, typeNameHexInt, findScanTypeName("-0xcafe")) + assert.Equal(t, typeNameString, findScanTypeName("0xcape")) + assert.Equal(t, typeNameString, findScanTypeName("-0xcape")) + assert.Equal(t, typeNameString, findScanTypeName("0b")) + assert.Equal(t, typeNameString, findScanTypeName("-0b")) + assert.Equal(t, typeNameBinaryInt, findScanTypeName("0b0")) + assert.Equal(t, typeNameBinaryInt, findScanTypeName("-0b0")) + assert.Equal(t, typeNameBinaryInt, findScanTypeName("0b1011")) + assert.Equal(t, typeNameBinaryInt, findScanTypeName("-0b1011")) + assert.Equal(t, typeNameString, findScanTypeName("0b1021")) + assert.Equal(t, typeNameString, findScanTypeName("-0b1021")) + assert.Equal(t, typeNameBool, findScanTypeName("true")) + assert.Equal(t, typeNameBool, findScanTypeName("true")) + assert.Equal(t, typeNameBool, findScanTypeName("false")) + assert.Equal(t, typeNameBool, findScanTypeName("false")) + assert.Equal(t, typeNameString, findScanTypeName("True")) + assert.Equal(t, typeNameString, findScanTypeName("True")) + assert.Equal(t, typeNameString, findScanTypeName("False")) + assert.Equal(t, typeNameString, findScanTypeName("False")) +} diff --git a/internal/pkg/scan/type.go b/internal/pkg/scan/type.go new file mode 100644 index 
0000000000..f8ad5eb3d4 --- /dev/null +++ b/internal/pkg/scan/type.go @@ -0,0 +1,33 @@ +package scan + +// TODO: comment re context + +type ScanType int + +const ( + scanTypeString ScanType = 0 + scanTypeDecimalInt = 1 + scanTypeOctalInt = 2 + scanTypeHexInt = 3 + scanTypeBinaryInt = 4 + scanTypeMaybeFloat = 5 + scanTypeBool = 6 +) + +const typeNameString = "string" +const typeNameDecimalInt = "decint" +const typeNameOctalInt = "octint" +const typeNameHexInt = "hexint" +const typeNameBinaryInt = "binint" +const typeNameMaybeFloat = "float?" +const typeNameBool = "bool" + +var TypeNames = []string{ + typeNameString, + typeNameDecimalInt, + typeNameOctalInt, + typeNameHexInt, + typeNameBinaryInt, + typeNameMaybeFloat, + typeNameBool, +} diff --git a/internal/pkg/scan/type_test.go b/internal/pkg/scan/type_test.go new file mode 100644 index 0000000000..b68627f4a6 --- /dev/null +++ b/internal/pkg/scan/type_test.go @@ -0,0 +1,17 @@ +package scan + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestTypeNames(t *testing.T) { + assert.Equal(t, TypeNames[scanTypeString], "string") + assert.Equal(t, TypeNames[scanTypeDecimalInt], "decint") + assert.Equal(t, TypeNames[scanTypeOctalInt], "octint") + assert.Equal(t, TypeNames[scanTypeHexInt], "hexint") + assert.Equal(t, TypeNames[scanTypeBinaryInt], "binint") + assert.Equal(t, TypeNames[scanTypeMaybeFloat], "float?") + assert.Equal(t, TypeNames[scanTypeBool], "bool") +} diff --git a/todo.txt b/todo.txt index 1e0c04bcf4..db68fc32f7 100644 --- a/todo.txt +++ b/todo.txt @@ -18,6 +18,8 @@ PUNCHDOWN LIST o README-profiling.md re various scripts o https://stackoverflow.com/questions/64513411/why-is-strconv-parseuint-so-slow-compared-to-strconv-atoi k benchmark per se + ! octal handling + ! opt-bool handling o mlr --time o try traceprofile? o find a second linux machine to check -- ? 
From 6e7031a31a159297871c53a025e9fd46e5b7af97 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 26 Dec 2021 15:44:36 -0500 Subject: [PATCH 11/16] test new inferrer --- internal/pkg/mlrval/mlrval_infer.go | 93 ++++++++++++++++++++++++++++- scripts/chain-cmps.sh | 6 +- scripts/chain-lengths.sh | 4 +- scripts/time-big-files | 4 +- todo.txt | 35 +++-------- 5 files changed, 107 insertions(+), 35 deletions(-) diff --git a/internal/pkg/mlrval/mlrval_infer.go b/internal/pkg/mlrval/mlrval_infer.go index df25895753..4b3a1edbca 100644 --- a/internal/pkg/mlrval/mlrval_infer.go +++ b/internal/pkg/mlrval/mlrval_infer.go @@ -2,9 +2,11 @@ package mlrval import ( "regexp" + "strconv" "strings" "github.com/johnkerl/miller/internal/pkg/lib" + "github.com/johnkerl/miller/internal/pkg/scan" ) // TODO: comment no infer-bool from data files. Always false in this path. @@ -23,7 +25,8 @@ func (mv *Mlrval) Type() MVType { // Support for mlr -S, mlr -A, mlr -O. type tInferrer func(mv *Mlrval, inferBool bool) *Mlrval -var packageLevelInferrer tInferrer = inferWithOctalAsString +// xxx temp var packageLevelInferrer tInferrer = inferWithOctalAsString +var packageLevelInferrer tInferrer = inferTemp // SetInferrerOctalAsInt is for default behavior. 
func SetInferrerOctalAsString() { @@ -114,3 +117,91 @@ func inferWithIntAsFloat(mv *Mlrval, inferBool bool) *Mlrval { func inferStringOnly(mv *Mlrval, inferBool bool) *Mlrval { return mv.SetFromString(mv.printrep) } + +// ---------------------------------------------------------------- +// experimental + +// TODO: comment +func inferFromDecimalInt(mv *Mlrval, inferBool bool) *Mlrval { + intval, err := strconv.ParseInt(mv.printrep, 10, 64) + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } +} + +// TODO: comment +func inferFromOctalInt(mv *Mlrval, inferBool bool) *Mlrval { + intval, err := strconv.ParseInt(mv.printrep, 8, 64) + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } +} + +// TODO: comment +func inferFromHexInt(mv *Mlrval, inferBool bool) *Mlrval { + intval, err := strconv.ParseInt(mv.printrep, 16, 64) + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } +} + +// TODO: comment +func inferFromBinaryInt(mv *Mlrval, inferBool bool) *Mlrval { + intval, err := strconv.ParseInt(mv.printrep, 2, 64) + // xxx to do: length check & overflow/uint check + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } +} + +// TODO: comment +func inferFromMaybeFloat(mv *Mlrval, inferBool bool) *Mlrval { + floatval, err := strconv.ParseFloat(mv.printrep, 64) + if err == nil { + return mv.SetFromPrevalidatedFloatString(mv.printrep, floatval) + } else { + return mv.SetFromString(mv.printrep) + } +} + +// TODO: comment +func inferFromBool(mv *Mlrval, inferBool bool) *Mlrval { + if mv.printrep == "true" { + return mv.SetFromPrevalidatedBoolString(mv.printrep, true) + } else { + return mv.SetFromPrevalidatedBoolString(mv.printrep, false) + } +} + 
+// const ( +// scanTypeString ScanType = 0 +// scanTypeDecimalInt = 1 +// scanTypeOctalInt = 2 +// scanTypeHexInt = 3 +// scanTypeBinaryInt = 4 +// scanTypeMaybeFloat = 5 +// scanTypeBool = 6 +// ) + +var tempScanTypeInferrerTable []tInferrer = []tInferrer{ + inferStringOnly, + inferFromDecimalInt, + inferFromOctalInt, + inferFromHexInt, + inferFromBinaryInt, + inferFromMaybeFloat, + inferFromBool, +} + +func inferTemp(mv *Mlrval, inferBool bool) *Mlrval { + scanType := scan.FindScanType(mv.printrep) + return tempScanTypeInferrerTable[scanType](mv, inferBool) +} diff --git a/scripts/chain-cmps.sh b/scripts/chain-cmps.sh index 68bb9ad690..4ef6b814fa 100755 --- a/scripts/chain-cmps.sh +++ b/scripts/chain-cmps.sh @@ -1,7 +1,7 @@ -#mlrs="mlr5 ~/tmp/miller/mlr ./mlr" -#reps="1" +mlrs="mlr5 ~/tmp/miller/mlr ./mlr" +#mlrs="mlr5 ./mlr" -mlrs="mlr5 ./mlr" +#reps="1" reps="1 2 3" echo; for mlr in $mlrs; do for k in $reps; do justtime $mlr --csv --from ~/tmp/big.csv check > /dev/null; done; done diff --git a/scripts/chain-lengths.sh b/scripts/chain-lengths.sh index c36e7f208e..dd999c919b 100755 --- a/scripts/chain-lengths.sh +++ b/scripts/chain-lengths.sh @@ -1,5 +1,5 @@ -#mlrs="mlr5 ~/tmp/miller/mlr ./mlr" -mlrs="mlr5 ./mlr" +mlrs="mlr5 ~/tmp/miller/mlr ./mlr" +#mlrs="mlr5 ./mlr" #reps="1" reps="1 2 3" diff --git a/scripts/time-big-files b/scripts/time-big-files index 882e20b1d0..2e2d3917b1 100755 --- a/scripts/time-big-files +++ b/scripts/time-big-files @@ -2,8 +2,8 @@ ourdir=$(dirname $0) -#mlrs="mlr5 ~/tmp/miller/mlr ./mlr" -mlrs="mlr5 ./mlr" +mlrs="mlr5 ~/tmp/miller/mlr ./mlr" +#mlrs="mlr5 ./mlr" #reps="1" reps="1 2 3" diff --git a/todo.txt b/todo.txt index db68fc32f7..3620b4c1b3 100644 --- a/todo.txt +++ b/todo.txt @@ -15,37 +15,18 @@ PUNCHDOWN LIST ? 
array/map fields: marshal as JSON_SINGLE_LINE * numeric-inference perf - o README-profiling.md re various scripts - o https://stackoverflow.com/questions/64513411/why-is-strconv-parseuint-so-slow-compared-to-strconv-atoi k benchmark per se + ! octal handling ! opt-bool handling + o mlr --time - o try traceprofile? - o find a second linux machine to check -- ? - o get octal regexp out of hot path - - o make a grammar for numbers & case-through - - len 0 - - len 1 - - has leading minus; strip & rest - - 0x, 0b, 0[0-9] - - decimal: leading minus; [0-9]+ - - octal: leading minus; 0[0-7]+ - - hex: leading minus; 0[xX][0-9a-fA-F]+ - - float: leadinug minus; - - o float literals: - 123 123. 123.4 .234 - 1e2 1e-2 1.2e3 1.e3 1.2e-3 1.e-3 - .2e3 .2e-3 1.e-3 - - ?- [0-9]+ - ?- [0-9]+ '.' [0-9]* - ?- [0-9]* '.' [0-9]+ - ?- [0-9]+ [eE] ?- [0-9]+ - ?- [0-9]+ '.' [0-9]* [eE] ?- [0-9]+ - ?- [0-9]* '.' [0-9]+ [eE] ?- [0-9]+ + + o README-profiling.md re various scripts + o README-profiling.md re this PR + o webdoc re on-battery anecdote + + o look at trace-profile again ... 
* nikos materials -> fold in From 44f24dba6cc611e26d57e6038e2aecd1a626d030 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 26 Dec 2021 16:11:35 -0500 Subject: [PATCH 12/16] mlr --time option --- cmd/mlr/main.go | 29 ++++++++++++++++++++++++++- internal/pkg/cli/option_parse.go | 11 ++++++++++ internal/pkg/cli/option_types.go | 2 ++ internal/pkg/climain/mlrcli_parse.go | 2 ++ internal/pkg/entrypoint/entrypoint.go | 11 ++++++++-- todo.txt | 1 + 6 files changed, 53 insertions(+), 3 deletions(-) diff --git a/cmd/mlr/main.go b/cmd/mlr/main.go index 0e8c9f3e4d..04e4a76973 100644 --- a/cmd/mlr/main.go +++ b/cmd/mlr/main.go @@ -8,12 +8,16 @@ import ( "runtime/debug" "runtime/pprof" "strconv" + "strings" + "time" "github.com/johnkerl/miller/internal/pkg/entrypoint" "github.com/pkg/profile" // for trace.out ) func main() { + // For mlr --time + startTime := time.Now() // Respect env $GOMAXPROCS, if provided, else set default. haveSetGoMaxProcs := false @@ -68,7 +72,30 @@ func main() { defer fmt.Fprintf(os.Stderr, "go tool trace trace.out\n") } + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // This will obtain os.Args and go from there. All the usual contents of // main() are put into this package for ease of testing. - entrypoint.Main() + mainReturn := entrypoint.Main() + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Timing + // + // The system 'time' command is built-in, of course but it's nice to have + // simply wall-time without the real/user/sys distinction. Also, making + // this a Miller built-in is nice for Windows. 
+ if mainReturn.PrintElapsedTime { + endTime := time.Now() + startNanos := startTime.UnixNano() + endNanos := endTime.UnixNano() + seconds := float64(endNanos-startNanos) / 1e9 + fmt.Fprintf(os.Stderr, "%.6f", seconds) + for _, arg := range os.Args { + if strings.Contains(arg, " ") || strings.Contains(arg, "\t") { + fmt.Fprintf(os.Stderr, " '%s'", arg) + } else { + fmt.Fprintf(os.Stderr, " %s", arg) + } + } + fmt.Fprintf(os.Stderr, "\n") + } } diff --git a/internal/pkg/cli/option_parse.go b/internal/pkg/cli/option_parse.go index 95cddff895..7340b0e71c 100644 --- a/internal/pkg/cli/option_parse.go +++ b/internal/pkg/cli/option_parse.go @@ -99,6 +99,7 @@ var FLAG_TABLE = FlagTable{ &OutputColorizationFlagSection, &FlattenUnflattenFlagSection, &MiscFlagSection, + // TODO: make a Profiling section }, } @@ -2672,5 +2673,15 @@ has its own overhead.`, *pargi += 1 }, }, + + // TODO: make a Profiling section + { + name: "--time", + help: `Print elapsed execution time in seconds to stderr at the end of the execution of the program.`, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.PrintElapsedTime = true + *pargi += 1 + }, + }, }, } diff --git a/internal/pkg/cli/option_types.go b/internal/pkg/cli/option_types.go index 68c08fc8a5..7b366a5cdf 100644 --- a/internal/pkg/cli/option_types.go +++ b/internal/pkg/cli/option_types.go @@ -153,6 +153,8 @@ type TOptions struct { HaveRandSeed bool RandSeed int + + PrintElapsedTime bool // mlr --time } // Not usable until FinalizeReaderOptions and FinalizeWriterOptions are called. diff --git a/internal/pkg/climain/mlrcli_parse.go b/internal/pkg/climain/mlrcli_parse.go index 2a2cb3bb32..220487b0fe 100644 --- a/internal/pkg/climain/mlrcli_parse.go +++ b/internal/pkg/climain/mlrcli_parse.go @@ -1,3 +1,4 @@ +// ================================================================ // Miller main command-line parsing. 
// // Before Miller 6 the ordering was: @@ -65,6 +66,7 @@ // foo.csv' the '--csv' looks like it belongs to the 'head' verb. When people // use '#!/bin/sh' scripts they need to insert the '--' in 'mlr head -n 10 -- // --csv foo.csv'; for 'mlr -s' we insert the '--' for them. +// ================================================================ package climain diff --git a/internal/pkg/entrypoint/entrypoint.go b/internal/pkg/entrypoint/entrypoint.go index 26fb345658..fd6e8eeabd 100644 --- a/internal/pkg/entrypoint/entrypoint.go +++ b/internal/pkg/entrypoint/entrypoint.go @@ -20,8 +20,11 @@ import ( "github.com/johnkerl/miller/internal/pkg/transformers" ) -// ---------------------------------------------------------------- -func Main() { +type MainReturn struct { + PrintElapsedTime bool +} + +func Main() MainReturn { // Special handling for Windows so we can do things like: // // mlr put '$a = $b . "cd \"efg\" hi"' foo.dat @@ -55,6 +58,10 @@ func Main() { } else { processInPlace(options) } + + return MainReturn { + PrintElapsedTime: options.PrintElapsedTime, + } } // ---------------------------------------------------------------- diff --git a/todo.txt b/todo.txt index 3620b4c1b3..96d7b480a6 100644 --- a/todo.txt +++ b/todo.txt @@ -25,6 +25,7 @@ PUNCHDOWN LIST o README-profiling.md re various scripts o README-profiling.md re this PR o webdoc re on-battery anecdote + o webdoc section --cpuprofile --traceprofile --time o look at trace-profile again ... 
From 15b0c27167d73ca77943a6ddcb6c77b1c7195afe Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 26 Dec 2021 16:59:34 -0500 Subject: [PATCH 13/16] include --cpuprofile and --traceprofile in on-line help --- cmd/mlr/main.go | 2 +- docs/src/manpage.md | 18 +- docs/src/manpage.txt | 18 +- docs/src/mk-flag-info.rb | 2 +- docs/src/online-help.md | 1 + docs/src/reference-main-auxiliary-commands.md | 1 + docs/src/reference-main-flag-list.md | 382 ++++++------------ internal/pkg/cli/option_parse.go | 57 ++- internal/pkg/climain/mlrcli_parse.go | 10 +- internal/pkg/entrypoint/entrypoint.go | 2 +- internal/pkg/mlrval/mlrval_benchmark_test.go | 17 +- internal/pkg/mlrval/mlrval_infer.go | 6 +- man/manpage.txt | 18 +- man/mlr.1 | 28 +- todo.txt | 7 +- 15 files changed, 276 insertions(+), 293 deletions(-) diff --git a/cmd/mlr/main.go b/cmd/mlr/main.go index 04e4a76973..23ed5b5b62 100644 --- a/cmd/mlr/main.go +++ b/cmd/mlr/main.go @@ -67,7 +67,7 @@ func main() { defer fmt.Fprintf(os.Stderr, "CPU profile finished.\ngo tool pprof -http=:8080 %s\n", profFilename) } - if len(os.Args) >= 3 && os.Args[1] == "--traceprofile" { + if len(os.Args) >= 2 && os.Args[1] == "--traceprofile" { defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop() defer fmt.Fprintf(os.Stderr, "go tool trace trace.out\n") } diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 91d703d3cc..1453ce12d0 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -144,6 +144,7 @@ HELP OPTIONS mlr help miscellaneous-flags mlr help output-colorization-flags mlr help pprint-only-flags + mlr help profiling-flags mlr help separator-flags Verbs: mlr help list-verbs @@ -637,6 +638,20 @@ PPRINT-ONLY FLAGS for input). --right Right-justifies all fields for PPRINT output. +PROFILING FLAGS + These are flags for profiling Miller performance. + --cpuprofile {CPU-profile file name} + Create a CPU-profile file for performance analysis. + Instructions will be printed to stderr. 
This flag + must be the very first thing after 'mlr' on the + command line. + --time Print elapsed execution time in seconds to stderr at + the end of the execution of the program. + --traceprofile Create a trace-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. + SEPARATOR FLAGS See the Separators doc page for more about record separators, field separators, and pair separators. Also see the File formats doc page, or @@ -756,6 +771,7 @@ AUXILIARY COMMANDS help regtest repl + version For more information, please invoke mlr {subcommand} --help. MLRRC @@ -3024,5 +3040,5 @@ SEE ALSO - 2021-12-25 MILLER(1) + 2021-12-26 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 07154c73b9..e96a68b310 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -123,6 +123,7 @@ HELP OPTIONS mlr help miscellaneous-flags mlr help output-colorization-flags mlr help pprint-only-flags + mlr help profiling-flags mlr help separator-flags Verbs: mlr help list-verbs @@ -616,6 +617,20 @@ PPRINT-ONLY FLAGS for input). --right Right-justifies all fields for PPRINT output. +PROFILING FLAGS + These are flags for profiling Miller performance. + --cpuprofile {CPU-profile file name} + Create a CPU-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. + --time Print elapsed execution time in seconds to stderr at + the end of the execution of the program. + --traceprofile Create a trace-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. + SEPARATOR FLAGS See the Separators doc page for more about record separators, field separators, and pair separators. 
Also see the File formats doc page, or @@ -735,6 +750,7 @@ AUXILIARY COMMANDS help regtest repl + version For more information, please invoke mlr {subcommand} --help. MLRRC @@ -3003,4 +3019,4 @@ SEE ALSO - 2021-12-25 MILLER(1) + 2021-12-26 MILLER(1) diff --git a/docs/src/mk-flag-info.rb b/docs/src/mk-flag-info.rb index c6b07cf7a6..3f54fe08a6 100755 --- a/docs/src/mk-flag-info.rb +++ b/docs/src/mk-flag-info.rb @@ -46,7 +46,7 @@ for flag in flags headline = `mlr help show-headline-for-flag '#{flag}'` help = `mlr help show-help-for-flag '#{flag}'` - puts "* `#{headline}`: #{help}" + puts "* `#{headline.chomp}`: #{help}" end end diff --git a/docs/src/online-help.md b/docs/src/online-help.md index d74921e909..0216e0cda5 100644 --- a/docs/src/online-help.md +++ b/docs/src/online-help.md @@ -62,6 +62,7 @@ Flags: mlr help miscellaneous-flags mlr help output-colorization-flags mlr help pprint-only-flags + mlr help profiling-flags mlr help separator-flags Verbs: mlr help list-verbs diff --git a/docs/src/reference-main-auxiliary-commands.md b/docs/src/reference-main-auxiliary-commands.md index 7ee241ac20..a55ce0361a 100644 --- a/docs/src/reference-main-auxiliary-commands.md +++ b/docs/src/reference-main-auxiliary-commands.md @@ -31,6 +31,7 @@ Available subcommands: help regtest repl + version For more information, please invoke mlr {subcommand} --help. diff --git a/docs/src/reference-main-flag-list.md b/docs/src/reference-main-flag-list.md index 124b1f027e..2704577980 100644 --- a/docs/src/reference-main-flag-list.md +++ b/docs/src/reference-main-flag-list.md @@ -60,14 +60,10 @@ Notes: **Flags:** -* `--pass-comments -`: Immediately print commented lines (prefixed by `#`) within the input. -* `--pass-comments-with {string} -`: Immediately print commented lines within input, with specified prefix. -* `--skip-comments -`: Ignore commented lines (prefixed by `#`) within the input. -* `--skip-comments-with {string} -`: Ignore commented lines within input, with specified prefix. 
+* `--pass-comments`: Immediately print commented lines (prefixed by `#`) within the input. +* `--pass-comments-with {string}`: Immediately print commented lines within input, with specified prefix. +* `--skip-comments`: Ignore commented lines (prefixed by `#`) within the input. +* `--skip-comments-with {string}`: Ignore commented lines within input, with specified prefix. ## Compressed-data flags @@ -102,22 +98,14 @@ decisions that might have been made based on the file suffix. Likewise, **Flags:** -* `--bz2in -`: Uncompress bzip2 within the Miller process. Done by default if file ends in `.bz2`. -* `--gzin -`: Uncompress gzip within the Miller process. Done by default if file ends in `.gz`. -* `--prepipe {decompression command} -`: You can, of course, already do without this for single input files, e.g. `gunzip < myfile.csv.gz | mlr ...`. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution. -* `--prepipe-bz2 -`: Same as `--prepipe bz2`, except this is allowed in `.mlrrc`. -* `--prepipe-gunzip -`: Same as `--prepipe gunzip`, except this is allowed in `.mlrrc`. -* `--prepipe-zcat -`: Same as `--prepipe zcat`, except this is allowed in `.mlrrc`. -* `--prepipex {decompression command} -`: Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. Useful for some commands like `unzip -qc` which don't read standard input. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution. -* `--zin -`: Uncompress zlib within the Miller process. Done by default if file ends in `.z`. +* `--bz2in`: Uncompress bzip2 within the Miller process. Done by default if file ends in `.bz2`. +* `--gzin`: Uncompress gzip within the Miller process. Done by default if file ends in `.gz`. +* `--prepipe {decompression command}`: You can, of course, already do without this for single input files, e.g. `gunzip < myfile.csv.gz | mlr ...`. 
Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution. +* `--prepipe-bz2`: Same as `--prepipe bz2`, except this is allowed in `.mlrrc`. +* `--prepipe-gunzip`: Same as `--prepipe gunzip`, except this is allowed in `.mlrrc`. +* `--prepipe-zcat`: Same as `--prepipe zcat`, except this is allowed in `.mlrrc`. +* `--prepipex {decompression command}`: Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. Useful for some commands like `unzip -qc` which don't read standard input. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution. +* `--zin`: Uncompress zlib within the Miller process. Done by default if file ends in `.z`. ## CSV-only flags @@ -126,16 +114,11 @@ These are flags which are applicable to CSV format. **Flags:** -* `--allow-ragged-csv-input or --ragged -`: If a data line has fewer fields than the header line, fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case. -* `--headerless-csv-output or --ho -`: Print only CSV data lines; do not print CSV header lines. -* `--implicit-csv-header or --headerless-csv-input or --hi -`: Use 1,2,3,... as field labels, rather than from line 1 of input files. Tip: combine with `label` to recreate missing headers. -* `--no-implicit-csv-header -`: Opposite of `--implicit-csv-header`. This is the default anyway -- the main use is for the flags to `mlr join` if you have main file(s) which are headerless but you want to join in on a file which does have a CSV header. Then you could use `mlr --csv --implicit-csv-header join --no-implicit-csv-header -l your-join-in-with-header.csv ... your-headerless.csv`. -* `-N -`: Keystroke-saver for `--implicit-csv-header --headerless-csv-output`. +* `--allow-ragged-csv-input or --ragged`: If a data line has fewer fields than the header line, fill remaining keys with empty string. 
If a data line has more fields than the header line, use integer field labels as in the implicit-header case. +* `--headerless-csv-output or --ho`: Print only CSV data lines; do not print CSV header lines. +* `--implicit-csv-header or --headerless-csv-input or --hi`: Use 1,2,3,... as field labels, rather than from line 1 of input files. Tip: combine with `label` to recreate missing headers. +* `--no-implicit-csv-header`: Opposite of `--implicit-csv-header`. This is the default anyway -- the main use is for the flags to `mlr join` if you have main file(s) which are headerless but you want to join in on a file which does have a CSV header. Then you could use `mlr --csv --implicit-csv-header join --no-implicit-csv-header -l your-join-in-with-header.csv ... your-headerless.csv`. +* `-N`: Keystroke-saver for `--implicit-csv-header --headerless-csv-output`. ## File-format flags @@ -152,90 +135,48 @@ are overridden in all cases by setting output format to `format2`. **Flags:** -* `--asv or --asvlite -`: Use ASV format for input and output data. -* `--csv or -c -`: Use CSV format for input and output data. -* `--csvlite -`: Use CSV-lite format for input and output data. -* `--dkvp -`: Use DKVP format for input and output data. -* `--gen-field-name -`: Specify field name for --igen. Defaults to "i". -* `--gen-start -`: Specify start value for --igen. Defaults to 1. -* `--gen-step -`: Specify step value for --igen. Defaults to 1. -* `--gen-stop -`: Specify stop value for --igen. Defaults to 100. -* `--iasv or --iasvlite -`: Use ASV format for input data. -* `--icsv -`: Use CSV format for input data. -* `--icsvlite -`: Use CSV-lite format for input data. -* `--idkvp -`: Use DKVP format for input data. -* `--igen -`: Ignore input files and instead generate sequential numeric input using --gen-field-name, --gen-start, --gen-step, and --gen-stop values. See also the seqgen verb, which is more useful/intuitive. -* `--ijson -`: Use JSON format for input data. 
-* `--inidx -`: Use NIDX format for input data. -* `--io {format name} -`: Use format name for input and output data. For example: `--io csv` is the same as `--csv`. -* `--ipprint -`: Use PPRINT format for input data. -* `--itsv -`: Use TSV format for input data. -* `--itsvlite -`: Use TSV-lite format for input data. -* `--iusv or --iusvlite -`: Use USV format for input data. -* `--ixtab -`: Use XTAB format for input data. -* `--json or -j -`: Use JSON format for input and output data. -* `--nidx -`: Use NIDX format for input and output data. -* `--oasv or --oasvlite -`: Use ASV format for output data. -* `--ocsv -`: Use CSV format for output data. -* `--ocsvlite -`: Use CSV-lite format for output data. -* `--odkvp -`: Use DKVP format for output data. -* `--ojson -`: Use JSON format for output data. -* `--omd -`: Use markdown-tabular format for output data. -* `--onidx -`: Use NIDX format for output data. -* `--opprint -`: Use PPRINT format for output data. -* `--otsv -`: Use TSV format for output data. -* `--otsvlite -`: Use TSV-lite format for output data. -* `--ousv or --ousvlite -`: Use USV format for output data. -* `--oxtab -`: Use XTAB format for output data. -* `--pprint -`: Use PPRINT format for input and output data. -* `--tsv -`: Use TSV format for input and output data. -* `--tsvlite or -t -`: Use TSV-lite format for input and output data. -* `--usv or --usvlite -`: Use USV format for input and output data. -* `--xtab -`: Use XTAB format for input and output data. -* `-i {format name} -`: Use format name for input data. For example: `-i csv` is the same as `--icsv`. -* `-o {format name} -`: Use format name for output data. For example: `-o csv` is the same as `--ocsv`. +* `--asv or --asvlite`: Use ASV format for input and output data. +* `--csv or -c`: Use CSV format for input and output data. +* `--csvlite`: Use CSV-lite format for input and output data. +* `--dkvp`: Use DKVP format for input and output data. 
+* `--gen-field-name`: Specify field name for --igen. Defaults to "i". +* `--gen-start`: Specify start value for --igen. Defaults to 1. +* `--gen-step`: Specify step value for --igen. Defaults to 1. +* `--gen-stop`: Specify stop value for --igen. Defaults to 100. +* `--iasv or --iasvlite`: Use ASV format for input data. +* `--icsv`: Use CSV format for input data. +* `--icsvlite`: Use CSV-lite format for input data. +* `--idkvp`: Use DKVP format for input data. +* `--igen`: Ignore input files and instead generate sequential numeric input using --gen-field-name, --gen-start, --gen-step, and --gen-stop values. See also the seqgen verb, which is more useful/intuitive. +* `--ijson`: Use JSON format for input data. +* `--inidx`: Use NIDX format for input data. +* `--io {format name}`: Use format name for input and output data. For example: `--io csv` is the same as `--csv`. +* `--ipprint`: Use PPRINT format for input data. +* `--itsv`: Use TSV format for input data. +* `--itsvlite`: Use TSV-lite format for input data. +* `--iusv or --iusvlite`: Use USV format for input data. +* `--ixtab`: Use XTAB format for input data. +* `--json or -j`: Use JSON format for input and output data. +* `--nidx`: Use NIDX format for input and output data. +* `--oasv or --oasvlite`: Use ASV format for output data. +* `--ocsv`: Use CSV format for output data. +* `--ocsvlite`: Use CSV-lite format for output data. +* `--odkvp`: Use DKVP format for output data. +* `--ojson`: Use JSON format for output data. +* `--omd`: Use markdown-tabular format for output data. +* `--onidx`: Use NIDX format for output data. +* `--opprint`: Use PPRINT format for output data. +* `--otsv`: Use TSV format for output data. +* `--otsvlite`: Use TSV-lite format for output data. +* `--ousv or --ousvlite`: Use USV format for output data. +* `--oxtab`: Use XTAB format for output data. +* `--pprint`: Use PPRINT format for input and output data. +* `--tsv`: Use TSV format for input and output data. 
+* `--tsvlite or -t`: Use TSV-lite format for input and output data. +* `--usv or --usvlite`: Use USV format for input and output data. +* `--xtab`: Use XTAB format for input and output data. +* `-i {format name}`: Use format name for input data. For example: `-i csv` is the same as `--icsv`. +* `-o {format name}`: Use format name for output data. For example: `-o csv` is the same as `--ocsv`. ## Flatten-unflatten flags @@ -246,14 +187,10 @@ See the Flatten/unflatten doc page for more information. **Flags:** -* `--flatsep or --jflatsep {string} -`: Separator for flattening multi-level JSON keys, e.g. `{"a":{"b":3}}` becomes `a:b => 3` for non-JSON formats. Defaults to `.`. -* `--no-auto-flatten -`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`. -* `--no-auto-unflatten -`: When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`. -* `--xvright -`: Right-justify values for XTAB format. +* `--flatsep or --jflatsep {string}`: Separator for flattening multi-level JSON keys, e.g. `{"a":{"b":3}}` becomes `a:b => 3` for non-JSON formats. Defaults to `.`. +* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[7, 8, 9]`. +* `--no-auto-unflatten`: When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`. With `--no-auto-unflatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.
+* `--xvright`: Right-justify values for XTAB format. ## Format-conversion keystroke-saver flags @@ -283,12 +220,9 @@ These are flags which are applicable to JSON format. **Flags:** -* `--jlistwrap or --jl -`: Wrap JSON output in outermost `[ ]`. -* `--jvstack -`: Put one key-value pair per line for JSON output (multi-line output). -* `--no-jvstack -`: Put objects/arrays all on one line for JSON output. +* `--jlistwrap or --jl`: Wrap JSON output in outermost `[ ]`. +* `--jvstack`: Put one key-value pair per line for JSON output (multi-line output). +* `--no-jvstack`: Put objects/arrays all on one line for JSON output. ## Legacy flags @@ -298,38 +232,22 @@ They are accepted as no-op flags in order to keep old scripts from breaking. **Flags:** -* `--jknquoteint -`: Type information from JSON input files is now preserved throughout the processing stream. -* `--jquoteall -`: Type information from JSON input files is now preserved throughout the processing stream. -* `--json-fatal-arrays-on-input -`: Miller now supports arrays as of version 6. -* `--json-map-arrays-on-input -`: Miller now supports arrays as of version 6. -* `--json-skip-arrays-on-input -`: Miller now supports arrays as of version 6. -* `--jsonx -`: The `--jvstack` flag is now default true in Miller 6. -* `--jvquoteall -`: Type information from JSON input files is now preserved throughout the processing stream. -* `--mmap -`: Miller no longer uses memory-mapping to access data files. -* `--no-mmap -`: Miller no longer uses memory-mapping to access data files. -* `--ojsonx -`: The `--jvstack` flag is now default true in Miller 6. -* `--quote-all -`: Ignored as of version 6. Types are inferred/retained through the processing flow now. -* `--quote-minimal -`: Ignored as of version 6. Types are inferred/retained through the processing flow now. -* `--quote-none -`: Ignored as of version 6. Types are inferred/retained through the processing flow now. -* `--quote-numeric -`: Ignored as of version 6. 
Types are inferred/retained through the processing flow now. -* `--quote-original -`: Ignored as of version 6. Types are inferred/retained through the processing flow now. -* `--vflatsep -`: Ignored as of version 6. This functionality is subsumed into JSON formatting. +* `--jknquoteint`: Type information from JSON input files is now preserved throughout the processing stream. +* `--jquoteall`: Type information from JSON input files is now preserved throughout the processing stream. +* `--json-fatal-arrays-on-input`: Miller now supports arrays as of version 6. +* `--json-map-arrays-on-input`: Miller now supports arrays as of version 6. +* `--json-skip-arrays-on-input`: Miller now supports arrays as of version 6. +* `--jsonx`: The `--jvstack` flag is now default true in Miller 6. +* `--jvquoteall`: Type information from JSON input files is now preserved throughout the processing stream. +* `--mmap`: Miller no longer uses memory-mapping to access data files. +* `--no-mmap`: Miller no longer uses memory-mapping to access data files. +* `--ojsonx`: The `--jvstack` flag is now default true in Miller 6. +* `--quote-all`: Ignored as of version 6. Types are inferred/retained through the processing flow now. +* `--quote-minimal`: Ignored as of version 6. Types are inferred/retained through the processing flow now. +* `--quote-none`: Ignored as of version 6. Types are inferred/retained through the processing flow now. +* `--quote-numeric`: Ignored as of version 6. Types are inferred/retained through the processing flow now. +* `--quote-original`: Ignored as of version 6. Types are inferred/retained through the processing flow now. +* `--vflatsep`: Ignored as of version 6. This functionality is subsumed into JSON formatting. ## Miscellaneous flags @@ -337,44 +255,25 @@ These are flags which don't fit into any other category. **Flags:** -* `--fflush -`: Force buffered output to be written after every output record. 
The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to force frequent updates even when output is to a pipe or file, at a performance cost. -* `--from {filename} -`: Use this to specify an input file before the verb(s), rather than after. May be used more than once. Example: `mlr --from a.dat --from b.dat cat` is the same as `mlr cat a.dat b.dat`. -* `--hash-records -`: This is an internal parameter which normally does not need to be modified. It controls the mechanism by which Miller accesses fields within records. In general --no-hash-records is faster, and is the default. For specific use-cases involving data having many fields, and many of them being processed during a given processing run, --hash-records might offer a slight performance benefit. -* `--infer-int-as-float or -A -`: Cast all integers in data files to floats. -* `--infer-none or -S -`: Don't treat values like 123 or 456.7 in data files as int/float; leave them as strings. -* `--infer-octal or -O -`: Treat numbers like 0123 in data files as numeric; default is string. Note that 00--07 etc scan as int; 08-09 scan as float. -* `--load {filename} -`: Load DSL script file for all put/filter operations on the command line. If the name following `--load` is a directory, load all `*.mlr` files in that directory. This is just like `put -f` and `filter -f` except it's up-front on the command line, so you can do something like `alias mlr='mlr --load ~/myscripts'` if you like. -* `--mfrom {filenames} -`: Use this to specify one of more input files before the verb(s), rather than after. May be used more than once. The list of filename must end with `--`. This is useful for example since `--from *.csv` doesn't do what you might hope but `--mfrom *.csv --` does. -* `--mload {filenames} -`: Like `--load` but works with more than one filename, e.g. 
`--mload *.mlr --`. -* `--no-dedupe-field-names -`: By default, if an input record has a field named `x` and another also named `x`, the second will be renamed `x_2`, and so on. With this flag provided, the second `x`'s value will replace the first `x`'s value when the record is read. This flag has no effect on JSON input records, where duplicate keys always result in the last one's value being retained. -* `--no-fflush -`: Let buffered output not be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to allow less-frequent updates when output is to the terminal. This is unlikely to be a noticeable performance improvement, since direct-to-screen output for large files has its own overhead. -* `--no-hash-records -`: See --hash-records. -* `--nr-progress-mod {m} -`: With m a positive integer: print filename and record count to os.Stderr every m input records. -* `--ofmt {format} -`: E.g. `%.18f`, `%.0f`, `%9.6e`. Please use sprintf-style codes for floating-point nummbers. If not specified, default formatting is used. See also the `fmtnum` function and the `format-values` verb. -* `--records-per-batch {n} -`: This is an internal parameter for maximum number of records in a batch size. Normally this does not need to be modified. -* `--seed {n} -`: with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`. -* `--tz {timezone} -`: Specify timezone, overriding `$TZ` environment variable (if any). -* `-I -`: Process files in-place. For each file name on the command line, output is written to a temp file in the same directory, which is then renamed over the original. 
Each file is processed in isolation: if the output format is CSV, CSV headers will be present in each output file, statistics are only over each file's own records; and so on. -* `-n -`: Process no input files, nor standard input either. Useful for `mlr put` with `begin`/`end` statements only. (Same as `--from /dev/null`.) Also useful in `mlr -n put -v '...'` for analyzing abstract syntax trees (if that's your thing). +* `--fflush`: Force buffered output to be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to force frequent updates even when output is to a pipe or file, at a performance cost. +* `--from {filename}`: Use this to specify an input file before the verb(s), rather than after. May be used more than once. Example: `mlr --from a.dat --from b.dat cat` is the same as `mlr cat a.dat b.dat`. +* `--hash-records`: This is an internal parameter which normally does not need to be modified. It controls the mechanism by which Miller accesses fields within records. In general --no-hash-records is faster, and is the default. For specific use-cases involving data having many fields, and many of them being processed during a given processing run, --hash-records might offer a slight performance benefit. +* `--infer-int-as-float or -A`: Cast all integers in data files to floats. +* `--infer-none or -S`: Don't treat values like 123 or 456.7 in data files as int/float; leave them as strings. +* `--infer-octal or -O`: Treat numbers like 0123 in data files as numeric; default is string. Note that 00--07 etc scan as int; 08-09 scan as float. +* `--load {filename}`: Load DSL script file for all put/filter operations on the command line. If the name following `--load` is a directory, load all `*.mlr` files in that directory. 
This is just like `put -f` and `filter -f` except it's up-front on the command line, so you can do something like `alias mlr='mlr --load ~/myscripts'` if you like. +* `--mfrom {filenames}`: Use this to specify one or more input files before the verb(s), rather than after. May be used more than once. The list of filenames must end with `--`. This is useful for example since `--from *.csv` doesn't do what you might hope but `--mfrom *.csv --` does. +* `--mload {filenames}`: Like `--load` but works with more than one filename, e.g. `--mload *.mlr --`. +* `--no-dedupe-field-names`: By default, if an input record has a field named `x` and another also named `x`, the second will be renamed `x_2`, and so on. With this flag provided, the second `x`'s value will replace the first `x`'s value when the record is read. This flag has no effect on JSON input records, where duplicate keys always result in the last one's value being retained. +* `--no-fflush`: Let buffered output not be written after every output record. The default is flush output after every record if the output is to the terminal, or less often if the output is to a file or a pipe. The default is a significant performance optimization for large files. Use this flag to allow less-frequent updates when output is to the terminal. This is unlikely to be a noticeable performance improvement, since direct-to-screen output for large files has its own overhead. +* `--no-hash-records`: See --hash-records. +* `--nr-progress-mod {m}`: With m a positive integer: print filename and record count to os.Stderr every m input records. +* `--ofmt {format}`: E.g. `%.18f`, `%.0f`, `%9.6e`. Please use sprintf-style codes for floating-point numbers. If not specified, default formatting is used. See also the `fmtnum` function and the `format-values` verb. +* `--records-per-batch {n}`: This is an internal parameter for maximum number of records in a batch size. Normally this does not need to be modified.
+* `--seed {n}`: with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`. +* `--tz {timezone}`: Specify timezone, overriding `$TZ` environment variable (if any). +* `-I`: Process files in-place. For each file name on the command line, output is written to a temp file in the same directory, which is then renamed over the original. Each file is processed in isolation: if the output format is CSV, CSV headers will be present in each output file, statistics are only over each file's own records; and so on. +* `-n`: Process no input files, nor standard input either. Useful for `mlr put` with `begin`/`end` statements only. (Same as `--from /dev/null`.) Also useful in `mlr -n put -v '...'` for analyzing abstract syntax trees (if that's your thing). ## Output-colorization flags @@ -436,24 +335,15 @@ and `mlr --list-color-names` to see available names (like `orchid`). **Flags:** -* `--always-color or -C -`: Instructs Miller to colorize output even when it normally would not. Useful for piping output to `less -r`. -* `--fail-color -`: Specify the color (see `--list-color-codes` and `--list-color-names`) for failing cases in `mlr regtest`. -* `--help-color -`: Specify the color (see `--list-color-codes` and `--list-color-names`) for highlights in `mlr help` output. -* `--key-color -`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record keys. -* `--list-color-codes -`: Show the available color codes in the range 0..255, such as 170 for example. -* `--list-color-names -`: Show the names for the available color codes, such as `orchid` for example. -* `--no-color or -M -`: Instructs Miller to not colorize any output. -* `--pass-color -`: Specify the color (see `--list-color-codes` and `--list-color-names`) for passing cases in `mlr regtest`. -* `--value-color -`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record values. 
+* `--always-color or -C`: Instructs Miller to colorize output even when it normally would not. Useful for piping output to `less -r`. +* `--fail-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for failing cases in `mlr regtest`. +* `--help-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for highlights in `mlr help` output. +* `--key-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record keys. +* `--list-color-codes`: Show the available color codes in the range 0..255, such as 170 for example. +* `--list-color-names`: Show the names for the available color codes, such as `orchid` for example. +* `--no-color or -M`: Instructs Miller to not colorize any output. +* `--pass-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for passing cases in `mlr regtest`. +* `--value-color`: Specify the color (see `--list-color-codes` and `--list-color-names`) for record values. ## PPRINT-only flags @@ -462,10 +352,18 @@ These are flags which are applicable to PPRINT output format. **Flags:** -* `--barred -`: Prints a border around PPRINT output (not available for input). -* `--right -`: Right-justifies all fields for PPRINT output. +* `--barred`: Prints a border around PPRINT output (not available for input). +* `--right`: Right-justifies all fields for PPRINT output. + +## Profiling flags + +These are flags for profiling Miller performance. + +**Flags:** + +* `--cpuprofile {CPU-profile file name}`: Create a CPU-profile file for performance analysis. Instructions will be printed to stderr. This flag must be the very first thing after 'mlr' on the command line. +* `--time`: Print elapsed execution time in seconds to stderr at the end of the execution of the program. +* `--traceprofile`: Create a trace-profile file for performance analysis. Instructions will be printed to stderr. This flag must be the very first thing after 'mlr' on the command line. 
## Separator flags @@ -566,28 +464,16 @@ Notes about all other separators: **Flags:** -* `--fs {string} -`: Specify FS for input and output. -* `--ifs {string} -`: Specify FS for input. -* `--ifs-regex {string} -`: Specify FS for input as a regular expression. -* `--ips {string} -`: Specify PS for input. -* `--ips-regex {string} -`: Specify PS for input as a regular expression. -* `--irs {string} -`: Specify RS for input. -* `--ofs {string} -`: Specify FS for output. -* `--ops {string} -`: Specify PS for output. -* `--ors {string} -`: Specify RS for output. -* `--ps {string} -`: Specify PS for input and output. -* `--repifs -`: Let IFS be repeated: e.g. for splitting on multiple spaces. -* `--rs {string} -`: Specify RS for input and output. +* `--fs {string}`: Specify FS for input and output. +* `--ifs {string}`: Specify FS for input. +* `--ifs-regex {string}`: Specify FS for input as a regular expression. +* `--ips {string}`: Specify PS for input. +* `--ips-regex {string}`: Specify PS for input as a regular expression. +* `--irs {string}`: Specify RS for input. +* `--ofs {string}`: Specify FS for output. +* `--ops {string}`: Specify PS for output. +* `--ors {string}`: Specify RS for output. +* `--ps {string}`: Specify PS for input and output. +* `--repifs`: Let IFS be repeated: e.g. for splitting on multiple spaces. +* `--rs {string}`: Specify RS for input and output. 
diff --git a/internal/pkg/cli/option_parse.go b/internal/pkg/cli/option_parse.go index 7340b0e71c..7ed501f361 100644 --- a/internal/pkg/cli/option_parse.go +++ b/internal/pkg/cli/option_parse.go @@ -98,8 +98,8 @@ var FLAG_TABLE = FlagTable{ &CommentsInDataFlagSection, &OutputColorizationFlagSection, &FlattenUnflattenFlagSection, + &ProfilingFlagSection, &MiscFlagSection, - // TODO: make a Profiling section }, } @@ -2411,6 +2411,51 @@ var FlattenUnflattenFlagSection = FlagSection{ }, } +// ================================================================ +// PROFILING FLAGS + +func ProfilingPrintInfo() { + fmt.Print("These are flags for profiling Miller performance.") +} + +func init() { ProfilingFlagSection.Sort() } + +var ProfilingFlagSection = FlagSection{ + name: "Profiling flags", + infoPrinter: ProfilingPrintInfo, + flags: []Flag{ + { + name: "--cpuprofile", + arg: "{CPU-profile file name}", + help: `Create a CPU-profile file for performance analysis. Instructions will be printed to stderr. +This flag must be the very first thing after 'mlr' on the command line.`, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + // Already handled in main(). Nothing to do here except to accept this as valid syntax. + *pargi += 2 + }, + }, + + { + name: "--traceprofile", + help: `Create a trace-profile file for performance analysis. Instructions will be printed to stderr. +This flag must be the very first thing after 'mlr' on the command line.`, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + // Already handled in main(). Nothing to do here except to accept this as valid syntax. 
+ *pargi += 1 + }, + }, + + { + name: "--time", + help: "Print elapsed execution time in seconds to stderr at the end of the execution of the program.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.PrintElapsedTime = true + *pargi += 1 + }, + }, + }, +} + // ================================================================ // MISC FLAGS @@ -2673,15 +2718,5 @@ has its own overhead.`, *pargi += 1 }, }, - - // TODO: make a Profiling section - { - name: "--time", - help: `Print elapsed execution time in seconds to stderr at the end of the execution of the program.`, - parser: func(args []string, argc int, pargi *int, options *TOptions) { - options.PrintElapsedTime = true - *pargi += 1 - }, - }, }, } diff --git a/internal/pkg/climain/mlrcli_parse.go b/internal/pkg/climain/mlrcli_parse.go index 220487b0fe..d0eb2c649f 100644 --- a/internal/pkg/climain/mlrcli_parse.go +++ b/internal/pkg/climain/mlrcli_parse.go @@ -130,15 +130,7 @@ func parseCommandLinePassOne( oargi := argi if args[argi][0] == '-' { - - if args[argi] == "--cpuprofile" { - // Already handled in main(); ignore here, and don't send it to pass two. - cli.CheckArgCount(args, argi, argc, 1) - argi += 2 - } else if args[argi] == "--traceprofile" { - // Already handled in main(); ignore here, and don't send it to pass two. - argi += 1 - } else if args[argi] == "--version" { + if args[argi] == "--version" { // Exiting flag: handle it immediately. 
fmt.Printf("mlr %s\n", version.STRING) os.Exit(0) diff --git a/internal/pkg/entrypoint/entrypoint.go b/internal/pkg/entrypoint/entrypoint.go index fd6e8eeabd..6445b87331 100644 --- a/internal/pkg/entrypoint/entrypoint.go +++ b/internal/pkg/entrypoint/entrypoint.go @@ -59,7 +59,7 @@ func Main() MainReturn { processInPlace(options) } - return MainReturn { + return MainReturn{ PrintElapsedTime: options.PrintElapsedTime, } } diff --git a/internal/pkg/mlrval/mlrval_benchmark_test.go b/internal/pkg/mlrval/mlrval_benchmark_test.go index 141e855db2..cb8a1f6ee4 100644 --- a/internal/pkg/mlrval/mlrval_benchmark_test.go +++ b/internal/pkg/mlrval/mlrval_benchmark_test.go @@ -7,29 +7,28 @@ import ( // go test -run=nonesuch -bench=. github.com/johnkerl/miller/internal/pkg/mlrval/... func BenchmarkFromDeferredType(b *testing.B) { - for i := 0; i < b.N; i++ { + for i := 0; i < b.N; i++ { _ = FromDeferredType("123") - } + } } func BenchmarkInferIntFromDeferredType(b *testing.B) { - for i := 0; i < b.N; i++ { + for i := 0; i < b.N; i++ { mv := FromDeferredType("123") mv.Type() - } + } } func BenchmarkInferFloatFromDeferredType(b *testing.B) { - for i := 0; i < b.N; i++ { + for i := 0; i < b.N; i++ { mv := FromDeferredType("123.4") mv.Type() - } + } } func BenchmarkInferStringFromDeferredType(b *testing.B) { - for i := 0; i < b.N; i++ { + for i := 0; i < b.N; i++ { mv := FromDeferredType("abc") mv.Type() - } + } } - diff --git a/internal/pkg/mlrval/mlrval_infer.go b/internal/pkg/mlrval/mlrval_infer.go index 4b3a1edbca..442511f492 100644 --- a/internal/pkg/mlrval/mlrval_infer.go +++ b/internal/pkg/mlrval/mlrval_infer.go @@ -25,8 +25,10 @@ func (mv *Mlrval) Type() MVType { // Support for mlr -S, mlr -A, mlr -O. 
type tInferrer func(mv *Mlrval, inferBool bool) *Mlrval -// xxx temp var packageLevelInferrer tInferrer = inferWithOctalAsString -var packageLevelInferrer tInferrer = inferTemp +// xxx temp +var packageLevelInferrer tInferrer = inferWithOctalAsString + +//var packageLevelInferrer tInferrer = inferTemp // SetInferrerOctalAsInt is for default behavior. func SetInferrerOctalAsString() { diff --git a/man/manpage.txt b/man/manpage.txt index 07154c73b9..e96a68b310 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -123,6 +123,7 @@ HELP OPTIONS mlr help miscellaneous-flags mlr help output-colorization-flags mlr help pprint-only-flags + mlr help profiling-flags mlr help separator-flags Verbs: mlr help list-verbs @@ -616,6 +617,20 @@ PPRINT-ONLY FLAGS for input). --right Right-justifies all fields for PPRINT output. +PROFILING FLAGS + These are flags for profiling Miller performance. + --cpuprofile {CPU-profile file name} + Create a CPU-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. + --time Print elapsed execution time in seconds to stderr at + the end of the execution of the program. + --traceprofile Create a trace-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. + SEPARATOR FLAGS See the Separators doc page for more about record separators, field separators, and pair separators. Also see the File formats doc page, or @@ -735,6 +750,7 @@ AUXILIARY COMMANDS help regtest repl + version For more information, please invoke mlr {subcommand} --help. 
MLRRC @@ -3003,4 +3019,4 @@ SEE ALSO - 2021-12-25 MILLER(1) + 2021-12-26 MILLER(1) diff --git a/man/mlr.1 b/man/mlr.1 index 468512bf13..4ce4b1cd18 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2021-12-25 +.\" Date: 2021-12-26 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2021-12-25" "\ \&" "\ \&" +.TH "MILLER" "1" "2021-12-26" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -158,6 +158,7 @@ Flags: mlr help miscellaneous-flags mlr help output-colorization-flags mlr help pprint-only-flags + mlr help profiling-flags mlr help separator-flags Verbs: mlr help list-verbs @@ -753,6 +754,28 @@ These are flags which are applicable to PPRINT output format. .fi .if n \{\ .RE +.SH "PROFILING FLAGS" +.sp + +.if n \{\ +.RS 0 +.\} +.nf +These are flags for profiling Miller performance. +--cpuprofile {CPU-profile file name} + Create a CPU-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. +--time Print elapsed execution time in seconds to stderr at + the end of the execution of the program. +--traceprofile Create a trace-profile file for performance analysis. + Instructions will be printed to stderr. This flag + must be the very first thing after 'mlr' on the + command line. +.fi +.if n \{\ +.RE .SH "SEPARATOR FLAGS" .sp @@ -884,6 +907,7 @@ Available subcommands: help regtest repl + version For more information, please invoke mlr {subcommand} --help. .fi .if n \{\ diff --git a/todo.txt b/todo.txt index 96d7b480a6..d27ad200af 100644 --- a/todo.txt +++ b/todo.txt @@ -2,7 +2,6 @@ PUNCHDOWN LIST * blockers: - ! 
.mlrrc suppress - linux/1.17 perf checks - fractional-strptime - improved regex doc w/ lots of examples @@ -16,18 +15,14 @@ PUNCHDOWN LIST * numeric-inference perf k benchmark per se + ? webdoc --cpuprofile wut ! octal handling ! opt-bool handling - o mlr --time - o README-profiling.md re various scripts o README-profiling.md re this PR o webdoc re on-battery anecdote - o webdoc section --cpuprofile --traceprofile --time - - o look at trace-profile again ... * nikos materials -> fold in From b23080191620549435a2877f7ac6cffa2e33577b Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 27 Dec 2021 00:08:33 -0500 Subject: [PATCH 14/16] sharpen inferred/deferred-type API distinction --- .vimrc | 2 +- internal/pkg/mlrval/mlrval_infer.go | 221 ++++++----- internal/pkg/mlrval/mlrval_infer_test.go | 469 +++++++++++------------ internal/pkg/mlrval/mlrval_new.go | 12 +- internal/pkg/scan/digits.go | 19 + internal/pkg/scan/digits_test.go | 11 + internal/pkg/scan/find.go | 43 ++- internal/pkg/scan/find_benchmark_test.go | 3 + internal/pkg/scan/find_test.go | 80 +++- internal/pkg/scan/type.go | 31 +- internal/pkg/scan/type_test.go | 11 +- todo.txt | 26 +- 12 files changed, 535 insertions(+), 393 deletions(-) diff --git a/.vimrc b/.vimrc index ad59ce3e33..d3d35005ff 100644 --- a/.vimrc +++ b/.vimrc @@ -1,3 +1,3 @@ map \d :w:!clear;echo Building ...; echo; make mlr map \f :w:!clear;echo Building ...; echo; make ut -map \r :w:!clear;echo Building ...; echo; make mv-ut +map \r :w:!clear;echo Building ...; echo; make ut-scan ut-mlv diff --git a/internal/pkg/mlrval/mlrval_infer.go b/internal/pkg/mlrval/mlrval_infer.go index 442511f492..3704a0e13b 100644 --- a/internal/pkg/mlrval/mlrval_infer.go +++ b/internal/pkg/mlrval/mlrval_infer.go @@ -1,11 +1,8 @@ package mlrval import ( - "regexp" "strconv" - "strings" - "github.com/johnkerl/miller/internal/pkg/lib" "github.com/johnkerl/miller/internal/pkg/scan" ) @@ -17,22 +14,19 @@ import ( func (mv *Mlrval) Type() MVType { if mv.mvtype == 
MT_PENDING { - packageLevelInferrer(mv, false) + packageLevelInferrer(mv) } return mv.mvtype } // Support for mlr -S, mlr -A, mlr -O. -type tInferrer func(mv *Mlrval, inferBool bool) *Mlrval +type tInferrer func(mv *Mlrval) *Mlrval -// xxx temp -var packageLevelInferrer tInferrer = inferWithOctalAsString - -//var packageLevelInferrer tInferrer = inferTemp +var packageLevelInferrer tInferrer = inferNormally -// SetInferrerOctalAsInt is for default behavior. -func SetInferrerOctalAsString() { - packageLevelInferrer = inferWithOctalAsString +// SetInferNormally is the default behavior. +func SetInferNormally() { + packageLevelInferrer = inferNormally } // SetInferrerOctalAsInt is for mlr -O. @@ -47,67 +41,25 @@ func SetInferrerIntAsFloat() { // SetInferrerStringOnly is for mlr -S. func SetInferrerStringOnly() { - packageLevelInferrer = inferStringOnly -} - -// When loading data files, don't scan these words into floats -- even though -// the Go library is willing to do so. -var downcasedFloatNamesToNotInfer = map[string]bool{ - "inf": true, - "+inf": true, - "-inf": true, - "infinity": true, - "+infinity": true, - "-infinity": true, - "nan": true, + packageLevelInferrer = inferString } -var octalDetector = regexp.MustCompile("^-?0[0-9]+") - -// inferWithOctalAsString is for default behavior. -func inferWithOctalAsString(mv *Mlrval, inferBool bool) *Mlrval { - inferWithOctalAsInt(mv, inferBool) - if mv.mvtype != MT_INT && mv.mvtype != MT_FLOAT { - return mv - } +// ---------------------------------------------------------------- - if octalDetector.MatchString(mv.printrep) { - return mv.SetFromString(mv.printrep) - } else { - return mv - } +func inferNormally(mv *Mlrval) *Mlrval { + scanType := scan.FindScanType(mv.printrep) + return normalInferrerTable[scanType](mv) } -// inferWithOctalAsInt is for mlr -O. 
-func inferWithOctalAsInt(mv *Mlrval, inferBool bool) *Mlrval { - if mv.printrep == "" { - return mv.SetFromVoid() - } - - intval, iok := lib.TryIntFromString(mv.printrep) - if iok { - return mv.SetFromPrevalidatedIntString(mv.printrep, intval) - } - - if downcasedFloatNamesToNotInfer[strings.ToLower(mv.printrep)] == false { - floatval, fok := lib.TryFloatFromString(mv.printrep) - if fok { - return mv.SetFromPrevalidatedFloatString(mv.printrep, floatval) - } - } - - if inferBool { - boolval, bok := lib.TryBoolFromBoolString(mv.printrep) - if bok { - return mv.SetFromPrevalidatedBoolString(mv.printrep, boolval) - } - } - return mv.SetFromString(mv.printrep) +// xxx temp +func inferWithOctalAsInt(mv *Mlrval) *Mlrval { + scanType := scan.FindScanType(mv.printrep) + return leadingZeroAsIntInferrerTable[scanType](mv) } // inferWithIntAsFloat is for mlr -A. -func inferWithIntAsFloat(mv *Mlrval, inferBool bool) *Mlrval { - inferWithOctalAsString(mv, inferBool) +func inferWithIntAsFloat(mv *Mlrval) *Mlrval { + inferNormally(mv) if mv.Type() == MT_INT { mv.floatval = float64(mv.intval) mv.mvtype = MT_FLOAT @@ -115,16 +67,39 @@ func inferWithIntAsFloat(mv *Mlrval, inferBool bool) *Mlrval { return mv } -// inferStringOnly is for mlr -S. -func inferStringOnly(mv *Mlrval, inferBool bool) *Mlrval { +// inferString is for mlr -S. +func inferString(mv *Mlrval) *Mlrval { return mv.SetFromString(mv.printrep) } // ---------------------------------------------------------------- -// experimental + +// Important: synchronize this with the type-ordering in the scan package. +var normalInferrerTable []tInferrer = []tInferrer{ + inferString, + inferDecimalInt, + inferString, // inferLeadingZeroDecimalIntAsInt, + inferOctalInt, + inferString, // inferFromLeadingZeroOctalIntAsInt, + inferHexInt, + inferBinaryInt, + inferMaybeFloat, +} + +// Important: synchronize this with the type-ordering in the scan package. 
+var leadingZeroAsIntInferrerTable []tInferrer = []tInferrer{ + inferString, + inferDecimalInt, + inferLeadingZeroDecimalIntAsInt, + inferOctalInt, + inferFromLeadingZeroOctalIntAsInt, + inferHexInt, + inferBinaryInt, + inferMaybeFloat, +} // TODO: comment -func inferFromDecimalInt(mv *Mlrval, inferBool bool) *Mlrval { +func inferDecimalInt(mv *Mlrval) *Mlrval { intval, err := strconv.ParseInt(mv.printrep, 10, 64) if err == nil { return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) @@ -134,8 +109,26 @@ func inferFromDecimalInt(mv *Mlrval, inferBool bool) *Mlrval { } // TODO: comment -func inferFromOctalInt(mv *Mlrval, inferBool bool) *Mlrval { - intval, err := strconv.ParseInt(mv.printrep, 8, 64) +func inferLeadingZeroDecimalIntAsInt(mv *Mlrval) *Mlrval { + intval, err := strconv.ParseInt(mv.printrep, 10, 64) + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } +} + +// TODO: comment +// E.g.
explicit 0o377, not 0377 +func inferOctalInt(mv *Mlrval) *Mlrval { + var input string + // Drop the 0o prefix; keep any leading minus so ParseInt applies the sign + if mv.printrep[0] == '-' { + input = "-" + mv.printrep[3:] + } else { + input = mv.printrep[2:] + } + intval, err := strconv.ParseInt(input, 8, 64) if err == nil { return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) } else { @@ -144,8 +137,8 @@ func inferFromOctalInt(mv *Mlrval, inferBool bool) *Mlrval { } // TODO: comment -func inferFromHexInt(mv *Mlrval, inferBool bool) *Mlrval { - intval, err := strconv.ParseInt(mv.printrep, 16, 64) +func inferFromLeadingZeroOctalIntAsInt(mv *Mlrval) *Mlrval { + intval, err := strconv.ParseInt(mv.printrep, 8, 64) if err == nil { return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) } else { @@ -154,9 +147,56 @@ func inferFromHexInt(mv *Mlrval, inferBool bool) *Mlrval { } // TODO: comment -func inferFromBinaryInt(mv *Mlrval, inferBool bool) *Mlrval { - intval, err := strconv.ParseInt(mv.printrep, 2, 64) - // xxx to do: length check & overflow/uint check +// The 2: (or 3: after a minus sign) is to get past the known 0x prefix +func inferHexInt(mv *Mlrval) *Mlrval { + var input string + // Drop the 0x prefix; keep any leading minus so ParseInt applies the sign + if mv.printrep[0] == '-' { + input = "-" + mv.printrep[3:] + } else { + input = mv.printrep[2:] + } + + // Following twos-complement formatting familiar from all manners of + // languages, including C which was Miller's original implementation + // language, we want to allow 0x00....00 through 0x7f....ff as positive + // 64-bit integers and 0x80....00 through 0xff....ff as negative ones. Go's + // signed-int parsing explicitly doesn't allow that, but we don't want Go + // semantics to dictate Miller semantics. So, we try signed-int parsing + // for 0x00....00 through 0x7f....ff, as well as positive or negative + // decimal. Failing that, we try unsigned-int parsing for 0x80....00 + // through 0xff....ff.
+ + i0 := input[0] + if len(input) == 16 && ('8' <= i0 && i0 <= 'f') { + uintval, err := strconv.ParseUint(input, 16, 64) + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(uintval)) + } else { + return mv.SetFromString(mv.printrep) + } + } else { + intval, err := strconv.ParseInt(input, 16, 64) + if err == nil { + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } + } + +} + +// inferBinaryInt parses 0b-prefixed binary text as a base-2 int. +// The 2: (or 3: after a minus sign) is to get past the known 0b prefix +func inferBinaryInt(mv *Mlrval) *Mlrval { + var input string + // Drop the 0b prefix; keep any leading minus so ParseInt applies the sign + if mv.printrep[0] == '-' { + input = "-" + mv.printrep[3:] + } else { + input = mv.printrep[2:] + } + intval, err := strconv.ParseInt(input, 2, 64) if err == nil { return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) } else { @@ -165,7 +205,7 @@ func inferFromBinaryInt(mv *Mlrval, inferBool bool) *Mlrval { } // TODO: comment -func inferFromMaybeFloat(mv *Mlrval, inferBool bool) *Mlrval { +func inferMaybeFloat(mv *Mlrval) *Mlrval { floatval, err := strconv.ParseFloat(mv.printrep, 64) if err == nil { return mv.SetFromPrevalidatedFloatString(mv.printrep, floatval) @@ -175,35 +215,10 @@ func inferFromMaybeFloat(mv *Mlrval, inferBool bool) *Mlrval { } // TODO: comment -func inferFromBool(mv *Mlrval, inferBool bool) *Mlrval { +func inferFromBool(mv *Mlrval) *Mlrval { if mv.printrep == "true" { return mv.SetFromPrevalidatedBoolString(mv.printrep, true) } else { return mv.SetFromPrevalidatedBoolString(mv.printrep, false) } } - -// const ( -// scanTypeString ScanType = 0 -// scanTypeDecimalInt = 1 -// scanTypeOctalInt = 2 -// scanTypeHexInt = 3 -// scanTypeBinaryInt = 4 -// scanTypeMaybeFloat = 5 -// scanTypeBool = 6 -// ) - -var tempScanTypeInferrerTable []tInferrer = []tInferrer{ - inferStringOnly, - inferFromDecimalInt, - inferFromOctalInt, - inferFromHexInt, - inferFromBinaryInt, - inferFromMaybeFloat, - inferFromBool, -} - -func
inferTemp(mv *Mlrval, inferBool bool) *Mlrval { - scanType := scan.FindScanType(mv.printrep) - return tempScanTypeInferrerTable[scanType](mv, inferBool) -} diff --git a/internal/pkg/mlrval/mlrval_infer_test.go b/internal/pkg/mlrval/mlrval_infer_test.go index e5b8f8577c..dcb4b01b9d 100644 --- a/internal/pkg/mlrval/mlrval_infer_test.go +++ b/internal/pkg/mlrval/mlrval_infer_test.go @@ -10,250 +10,243 @@ import ( "github.com/stretchr/testify/assert" ) -func TestInferWithOctalAsString(t *testing.T) { - assert.True(t, inferWithOctalAsString(FromDeferredType(""), false).IsVoid()) - - assert.True(t, inferWithOctalAsString(FromDeferredType("true"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("false"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("true"), true).IsBool()) - assert.True(t, inferWithOctalAsString(FromDeferredType("false"), true).IsBool()) - - assert.True(t, inferWithOctalAsString(FromDeferredType("abc"), false).IsString()) - - assert.True(t, inferWithOctalAsString(FromDeferredType("0123"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-0123"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("0377"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-0377"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("0923"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-0923"), false).IsString()) - - assert.True(t, inferWithOctalAsString(FromDeferredType("123"), false).IsInt()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-123"), false).IsInt()) - assert.True(t, inferWithOctalAsString(FromDeferredType("0xff"), false).IsInt()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-0xff"), false).IsInt()) - assert.True(t, inferWithOctalAsString(FromDeferredType("0b1011"), false).IsInt()) - assert.True(t, 
inferWithOctalAsString(FromDeferredType("-0b1011"), false).IsInt()) - assert.True(t, inferWithOctalAsString(FromDeferredType("0x7fffffffffffffff"), false).IsInt()) - assert.True(t, inferWithOctalAsString(FromDeferredType("0x8000000000000000"), false).IsInt()) - assert.True(t, inferWithOctalAsString(FromDeferredType("0xffffffffffffffff"), false).IsInt()) - - assert.True(t, inferWithOctalAsString(FromDeferredType("12_3"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-12_3"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("1_2.3_4"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-1_2.3_4"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("0xca_fe"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-0xca_fe"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("0b1011_1101"), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-0b1011_1101"), false).IsString()) - - assert.True(t, inferWithOctalAsString(FromDeferredType("."), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-."), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("123."), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-123."), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType(".123"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-.123"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("123.456"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-123.456"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("1e2."), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-1e2."), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("1e-2."), 
false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-1e-2."), false).IsString()) - assert.True(t, inferWithOctalAsString(FromDeferredType("1.2e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-1.2e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("1.2e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-1.2e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("1.e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-1.e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("1.e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-1.e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType(".2e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-.2e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType(".2e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsString(FromDeferredType("-.2e-3"), false).IsFloat()) +func TestInferNormally(t *testing.T) { + assert.True(t, inferNormally(FromDeferredType("")).IsVoid()) + + assert.True(t, inferNormally(FromDeferredType("true")).IsString()) + assert.True(t, inferNormally(FromDeferredType("false")).IsString()) + + assert.True(t, inferNormally(FromDeferredType("abc")).IsString()) + + assert.True(t, inferNormally(FromDeferredType("0123")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-0123")).IsString()) + assert.True(t, inferNormally(FromDeferredType("0377")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-0377")).IsString()) + assert.True(t, inferNormally(FromDeferredType("0923")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-0923")).IsString()) + + assert.True(t, inferNormally(FromDeferredType("123")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("-123")).IsInt()) + + 
assert.True(t, inferNormally(FromDeferredType("0xff")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("-0xff")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("0b1011")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("-0b1011")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("0x7fffffffffffffff")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("0x8000000000000000")).IsInt()) + assert.True(t, inferNormally(FromDeferredType("0xffffffffffffffff")).IsInt()) + + assert.True(t, inferNormally(FromDeferredType("12_3")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-12_3")).IsString()) + assert.True(t, inferNormally(FromDeferredType("1_2.3_4")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-1_2.3_4")).IsString()) + assert.True(t, inferNormally(FromDeferredType("0xca_fe")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-0xca_fe")).IsString()) + assert.True(t, inferNormally(FromDeferredType("0b1011_1101")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-0b1011_1101")).IsString()) + + assert.True(t, inferNormally(FromDeferredType(".")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-.")).IsString()) + assert.True(t, inferNormally(FromDeferredType("123.")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-123.")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType(".123")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-.123")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("123.456")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-123.456")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("1e2.")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-1e2.")).IsString()) + assert.True(t, inferNormally(FromDeferredType("1e-2.")).IsString()) + assert.True(t, inferNormally(FromDeferredType("-1e-2.")).IsString()) + assert.True(t, inferNormally(FromDeferredType("1.2e3")).IsFloat()) + 
assert.True(t, inferNormally(FromDeferredType("-1.2e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("1.2e-3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-1.2e-3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("1.e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-1.e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("1.e-3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-1.e-3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType(".2e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-.2e3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType(".2e-3")).IsFloat()) + assert.True(t, inferNormally(FromDeferredType("-.2e-3")).IsFloat()) } func TestInferWithOctalAsInt(t *testing.T) { - assert.True(t, inferWithOctalAsInt(FromDeferredType(""), false).IsVoid()) - - assert.True(t, inferWithOctalAsInt(FromDeferredType("true"), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("false"), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("true"), true).IsBool()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("false"), true).IsBool()) - - assert.True(t, inferWithOctalAsInt(FromDeferredType("abc"), false).IsString()) - - assert.True(t, inferWithOctalAsInt(FromDeferredType("0123"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-0123"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("0377"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-0377"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("0923"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-0923"), false).IsFloat()) - - assert.True(t, inferWithOctalAsInt(FromDeferredType("123"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-123"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("0xff"), 
false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-0xff"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("0b1011"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-0b1011"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("0x7fffffffffffffff"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("0x8000000000000000"), false).IsInt()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("0xffffffffffffffff"), false).IsInt()) - - assert.True(t, inferWithOctalAsInt(FromDeferredType("12_3"), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-12_3"), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("1_2.3_4"), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-1_2.3_4"), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("0xca_fe"), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-0xca_fe"), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("0b1011_1101"), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-0b1011_1101"), false).IsString()) - - assert.True(t, inferWithOctalAsInt(FromDeferredType("."), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-."), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("123."), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-123."), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType(".123"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-.123"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("123.456"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-123.456"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("1e2."), false).IsString()) - assert.True(t, 
inferWithOctalAsInt(FromDeferredType("-1e2."), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("1e-2."), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-1e-2."), false).IsString()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("1.2e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.2e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("1.2e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.2e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("1.e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("1.e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType(".2e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType(".2e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e-3"), false).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("")).IsVoid()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("true")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("false")).IsString()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("abc")).IsString()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("0123")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0123")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0377")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0377")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0923")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0923")).IsInt()) + + assert.True(t, 
inferWithOctalAsInt(FromDeferredType("123")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-123")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0xff")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0xff")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0b1011")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0b1011")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0x7fffffffffffffff")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0x8000000000000000")).IsInt()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0xffffffffffffffff")).IsInt()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType("12_3")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-12_3")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1_2.3_4")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1_2.3_4")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0xca_fe")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0xca_fe")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("0b1011_1101")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-0b1011_1101")).IsString()) + + assert.True(t, inferWithOctalAsInt(FromDeferredType(".")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("123.")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-123.")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType(".123")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.123")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("123.456")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-123.456")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1e2.")).IsString()) + 
assert.True(t, inferWithOctalAsInt(FromDeferredType("-1e2.")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1e-2.")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1e-2.")).IsString()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.2e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.2e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.2e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.2e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("1.e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-1.e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType(".2e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType(".2e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e-3")).IsFloat()) } func TestInferWithIntAsFloat(t *testing.T) { - assert.True(t, inferWithIntAsFloat(FromDeferredType(""), false).IsVoid()) - - assert.True(t, inferWithIntAsFloat(FromDeferredType("true"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("false"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("true"), true).IsBool()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("false"), true).IsBool()) - - assert.True(t, inferWithIntAsFloat(FromDeferredType("abc"), false).IsString()) - - assert.True(t, inferWithIntAsFloat(FromDeferredType("0123"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-0123"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("0377"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-0377"), false).IsString()) - 
assert.True(t, inferWithIntAsFloat(FromDeferredType("0923"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-0923"), false).IsString()) - - assert.True(t, inferWithIntAsFloat(FromDeferredType("123"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-123"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("0xff"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-0xff"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("0b1011"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-0b1011"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("0x7fffffffffffffff"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("0x8000000000000000"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("0xffffffffffffffff"), false).IsFloat()) - - assert.True(t, inferWithIntAsFloat(FromDeferredType("12_3"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-12_3"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("1_2.3_4"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-1_2.3_4"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("0xca_fe"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-0xca_fe"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("0b1011_1101"), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-0b1011_1101"), false).IsString()) - - assert.True(t, inferWithIntAsFloat(FromDeferredType("."), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-."), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("123."), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-123."), false).IsFloat()) - assert.True(t, 
inferWithIntAsFloat(FromDeferredType(".123"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-.123"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("123.456"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-123.456"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("1e2."), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-1e2."), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("1e-2."), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-1e-2."), false).IsString()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("1.2e3"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.2e3"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("1.2e-3"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.2e-3"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("1.e3"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.e3"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("1.e-3"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.e-3"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType(".2e3"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType("-.2e3"), false).IsFloat()) - assert.True(t, inferWithIntAsFloat(FromDeferredType(".2e-3"), false).IsFloat()) - assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e-3"), false).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("")).IsVoid()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("true")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("false")).IsString()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("abc")).IsString()) + + assert.True(t, 
inferWithIntAsFloat(FromDeferredType("0123")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0123")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0377")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0377")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0923")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0923")).IsString()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("123")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-123")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0xff")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0xff")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0b1011")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0b1011")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0x7fffffffffffffff")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0x8000000000000000")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0xffffffffffffffff")).IsFloat()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType("12_3")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-12_3")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1_2.3_4")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1_2.3_4")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0xca_fe")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0xca_fe")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("0b1011_1101")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-0b1011_1101")).IsString()) + + assert.True(t, inferWithIntAsFloat(FromDeferredType(".")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-.")).IsString()) + assert.True(t, 
inferWithIntAsFloat(FromDeferredType("123.")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-123.")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType(".123")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-.123")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("123.456")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-123.456")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1e2.")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1e2.")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1e-2.")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1e-2.")).IsString()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.2e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.2e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.2e-3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.2e-3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("1.e-3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-1.e-3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType(".2e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType("-.2e3")).IsFloat()) + assert.True(t, inferWithIntAsFloat(FromDeferredType(".2e-3")).IsFloat()) + assert.True(t, inferWithOctalAsInt(FromDeferredType("-.2e-3")).IsFloat()) } -func TestInferStringOnly(t *testing.T) { - assert.True(t, inferStringOnly(FromDeferredType(""), false).IsVoid()) - - assert.True(t, inferStringOnly(FromDeferredType("true"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("false"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("true"), true).IsString()) - 
assert.True(t, inferStringOnly(FromDeferredType("false"), true).IsString()) - - assert.True(t, inferStringOnly(FromDeferredType("abc"), false).IsString()) - - assert.True(t, inferStringOnly(FromDeferredType("0123"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-0123"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("0377"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-0377"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("0923"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-0923"), false).IsString()) - - assert.True(t, inferStringOnly(FromDeferredType("123"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-123"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("0xff"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-0xff"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("0b1011"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-0b1011"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("0x7fffffffffffffff"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("0x8000000000000000"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("0xffffffffffffffff"), false).IsString()) - - assert.True(t, inferStringOnly(FromDeferredType("12_3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-12_3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("1_2.3_4"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-1_2.3_4"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("0xca_fe"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-0xca_fe"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("0b1011_1101"), false).IsString()) - assert.True(t, 
inferStringOnly(FromDeferredType("-0b1011_1101"), false).IsString()) - - assert.True(t, inferStringOnly(FromDeferredType("."), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-."), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("123."), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-123."), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType(".123"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-.123"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("123.456"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-123.456"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("1e2."), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-1e2."), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("1e-2."), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-1e-2."), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("1.2e3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-1.2e3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("1.2e-3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-1.2e-3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("1.e3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-1.e3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("1.e-3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-1.e-3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType(".2e3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-.2e3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType(".2e-3"), false).IsString()) - assert.True(t, inferStringOnly(FromDeferredType("-.2e-3"), false).IsString()) +func TestInferString(t 
*testing.T) { + assert.True(t, inferString(FromDeferredType("")).IsVoid()) + + assert.True(t, inferString(FromDeferredType("true")).IsString()) + assert.True(t, inferString(FromDeferredType("false")).IsString()) + + assert.True(t, inferString(FromDeferredType("abc")).IsString()) + + assert.True(t, inferString(FromDeferredType("0123")).IsString()) + assert.True(t, inferString(FromDeferredType("-0123")).IsString()) + assert.True(t, inferString(FromDeferredType("0377")).IsString()) + assert.True(t, inferString(FromDeferredType("-0377")).IsString()) + assert.True(t, inferString(FromDeferredType("0923")).IsString()) + assert.True(t, inferString(FromDeferredType("-0923")).IsString()) + + assert.True(t, inferString(FromDeferredType("123")).IsString()) + assert.True(t, inferString(FromDeferredType("-123")).IsString()) + assert.True(t, inferString(FromDeferredType("0xff")).IsString()) + assert.True(t, inferString(FromDeferredType("-0xff")).IsString()) + assert.True(t, inferString(FromDeferredType("0b1011")).IsString()) + assert.True(t, inferString(FromDeferredType("-0b1011")).IsString()) + assert.True(t, inferString(FromDeferredType("0x7fffffffffffffff")).IsString()) + assert.True(t, inferString(FromDeferredType("0x8000000000000000")).IsString()) + assert.True(t, inferString(FromDeferredType("0xffffffffffffffff")).IsString()) + + assert.True(t, inferString(FromDeferredType("12_3")).IsString()) + assert.True(t, inferString(FromDeferredType("-12_3")).IsString()) + assert.True(t, inferString(FromDeferredType("1_2.3_4")).IsString()) + assert.True(t, inferString(FromDeferredType("-1_2.3_4")).IsString()) + assert.True(t, inferString(FromDeferredType("0xca_fe")).IsString()) + assert.True(t, inferString(FromDeferredType("-0xca_fe")).IsString()) + assert.True(t, inferString(FromDeferredType("0b1011_1101")).IsString()) + assert.True(t, inferString(FromDeferredType("-0b1011_1101")).IsString()) + + assert.True(t, inferString(FromDeferredType(".")).IsString()) + assert.True(t, 
inferString(FromDeferredType("-.")).IsString()) + assert.True(t, inferString(FromDeferredType("123.")).IsString()) + assert.True(t, inferString(FromDeferredType("-123.")).IsString()) + assert.True(t, inferString(FromDeferredType(".123")).IsString()) + assert.True(t, inferString(FromDeferredType("-.123")).IsString()) + assert.True(t, inferString(FromDeferredType("123.456")).IsString()) + assert.True(t, inferString(FromDeferredType("-123.456")).IsString()) + assert.True(t, inferString(FromDeferredType("1e2.")).IsString()) + assert.True(t, inferString(FromDeferredType("-1e2.")).IsString()) + assert.True(t, inferString(FromDeferredType("1e-2.")).IsString()) + assert.True(t, inferString(FromDeferredType("-1e-2.")).IsString()) + assert.True(t, inferString(FromDeferredType("1.2e3")).IsString()) + assert.True(t, inferString(FromDeferredType("-1.2e3")).IsString()) + assert.True(t, inferString(FromDeferredType("1.2e-3")).IsString()) + assert.True(t, inferString(FromDeferredType("-1.2e-3")).IsString()) + assert.True(t, inferString(FromDeferredType("1.e3")).IsString()) + assert.True(t, inferString(FromDeferredType("-1.e3")).IsString()) + assert.True(t, inferString(FromDeferredType("1.e-3")).IsString()) + assert.True(t, inferString(FromDeferredType("-1.e-3")).IsString()) + assert.True(t, inferString(FromDeferredType(".2e3")).IsString()) + assert.True(t, inferString(FromDeferredType("-.2e3")).IsString()) + assert.True(t, inferString(FromDeferredType(".2e-3")).IsString()) + assert.True(t, inferString(FromDeferredType("-.2e-3")).IsString()) } diff --git a/internal/pkg/mlrval/mlrval_new.go b/internal/pkg/mlrval/mlrval_new.go index 371d3ee8d1..8bf3a6dc2b 100644 --- a/internal/pkg/mlrval/mlrval_new.go +++ b/internal/pkg/mlrval/mlrval_new.go @@ -39,9 +39,15 @@ func FromInferredType(input string) *Mlrval { printrep: input, printrepValid: true, } - // TODO: comment re inferBool arg - packageLevelInferrer(mv, true) - return mv + // TODO: comment re data files vs literals context -- this 
is for the latter + if input == "true" { + return TRUE + } else if input == "false" { + return FALSE + } else { + packageLevelInferrer(mv) + return mv + } } func FromString(input string) *Mlrval { diff --git a/internal/pkg/scan/digits.go b/internal/pkg/scan/digits.go index 032ac9e7d5..92f69894db 100644 --- a/internal/pkg/scan/digits.go +++ b/internal/pkg/scan/digits.go @@ -22,6 +22,17 @@ var isDecimalDigitTable = []bool{ false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f } +var isOctalDigitTable = []bool{ + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 20-2f + true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, // 30-3f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 40-4f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 50-5f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 60-6f + false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 70-7f +} + var isHexDigitTable = []bool{ false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 00-0f false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, // 10-1f @@ -54,6 +65,14 @@ func isDecimalDigit(c byte) bool { } } +func isOctalDigit(c byte) bool { + if c < 128 { // byte is unsigned in Go + return isOctalDigitTable[c] + 
} else { + return false + } +} + func isHexDigit(c byte) bool { if c < 128 { // byte is unsigned in Go return isHexDigitTable[c] diff --git a/internal/pkg/scan/digits_test.go b/internal/pkg/scan/digits_test.go index 06c1329a85..d305e1beeb 100644 --- a/internal/pkg/scan/digits_test.go +++ b/internal/pkg/scan/digits_test.go @@ -17,6 +17,17 @@ func TestIsDecimalDigit(t *testing.T) { } } +func TestIsOctalDigit(t *testing.T) { + var c byte + for c = 0x00; c < 0xff; c++ { + if c >= '0' && c <= '7' { + assert.True(t, isOctalDigit(c)) + } else { + assert.False(t, isOctalDigit(c)) + } + } +} + func TestIsHexDigit(t *testing.T) { var c byte for c = 0x00; c < 0xff; c++ { diff --git a/internal/pkg/scan/find.go b/internal/pkg/scan/find.go index a0623a6b6a..f940841554 100644 --- a/internal/pkg/scan/find.go +++ b/internal/pkg/scan/find.go @@ -1,5 +1,8 @@ package scan +import ( +) + // TODO: comment re context // o grammar for numbers & case-through @@ -46,10 +49,6 @@ func FindScanType(sinput string) ScanType { } } - if sinput == "true" || sinput == "false" { - return scanTypeBool - } - return scanTypeString } @@ -81,6 +80,13 @@ func findScanTypePositiveNumberOrString(input []byte) ScanType { return findScanTypePositiveHexOrString(input[2:]) } } + if i1 == 'o' || i1 == 'O' { + if len(input) == 2 { + return scanTypeString + } else { + return findScanTypePositiveOctalOrString(input[2:]) + } + } if i1 == 'b' || i1 == 'B' { if len(input) == 2 { return scanTypeString @@ -88,6 +94,25 @@ func findScanTypePositiveNumberOrString(input []byte) ScanType { return findScanTypePositiveBinaryOrString(input[2:]) } } + + allOctal := true + allDecimal := true + for _, c := range input[1:] { + if !isOctalDigit(c) { + allOctal = false + } + if !isDecimalDigit(c) { + allDecimal = false + break + } + } + if allOctal { + return scanTypeLeadingZeroOctalInt + } + if allDecimal { + return scanTypeLeadingZeroDecimalInt + } + // else fall through } return findScanTypePositiveDecimalOrFloatOrString(input) @@ 
-127,6 +152,16 @@ func findScanTypePositiveDecimalOrFloatOrString(input []byte) ScanType { } } +// Leading 0o has already been stripped +func findScanTypePositiveOctalOrString(input []byte) ScanType { + for _, c := range []byte(input) { + if !isOctalDigit(c) { + return scanTypeString + } + } + return scanTypeOctalInt +} + // Leading 0x has already been stripped func findScanTypePositiveHexOrString(input []byte) ScanType { for _, c := range []byte(input) { diff --git a/internal/pkg/scan/find_benchmark_test.go b/internal/pkg/scan/find_benchmark_test.go index ba0880091b..0d023a25fa 100644 --- a/internal/pkg/scan/find_benchmark_test.go +++ b/internal/pkg/scan/find_benchmark_test.go @@ -48,6 +48,9 @@ func BenchmarkFromAbnormalCases(b *testing.B) { "0x0", "-0x0", "0xcafe", "-0xcafe", "0xcape", "-0xcape", + "0o", "-0o", + "0o0", "-0o0", + "0o1234", "-0o1234", "0b", "-0b", "0b0", "-0b0", "0b1011", "-0b1011", diff --git a/internal/pkg/scan/find_test.go b/internal/pkg/scan/find_test.go index 0909df0c6b..e1eba04376 100644 --- a/internal/pkg/scan/find_test.go +++ b/internal/pkg/scan/find_test.go @@ -6,11 +6,14 @@ import ( "github.com/stretchr/testify/assert" ) -func TestFindScanTypeName(t *testing.T) { +func TestFindScanTypeNameStrings(t *testing.T) { assert.Equal(t, typeNameString, findScanTypeName("")) assert.Equal(t, typeNameString, findScanTypeName("-")) assert.Equal(t, typeNameString, findScanTypeName("abc")) assert.Equal(t, typeNameString, findScanTypeName("-abc")) +} + +func TestFindScanTypeNameDecimals(t *testing.T) { assert.Equal(t, typeNameDecimalInt, findScanTypeName("0")) assert.Equal(t, typeNameDecimalInt, findScanTypeName("-0")) assert.Equal(t, typeNameDecimalInt, findScanTypeName("1")) @@ -19,44 +22,93 @@ func TestFindScanTypeName(t *testing.T) { assert.Equal(t, typeNameDecimalInt, findScanTypeName("-2")) assert.Equal(t, typeNameDecimalInt, findScanTypeName("123")) assert.Equal(t, typeNameDecimalInt, findScanTypeName("-123")) +} + +func 
TestFindScanTypeNameFloats(t *testing.T) { assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.")) assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1.")) assert.Equal(t, typeNameMaybeFloat, findScanTypeName(".2")) assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-.2")) - assert.Equal(t, typeNameString, findScanTypeName(".")) assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-.")) assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.2")) assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1.2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("12e-2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-12e-2")) + assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1.2.3")) assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1.2.3")) assert.Equal(t, typeNameMaybeFloat, findScanTypeName("1e2e3")) assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-1e2e3")) - assert.Equal(t, typeNameMaybeFloat, findScanTypeName("12e-2")) - assert.Equal(t, typeNameMaybeFloat, findScanTypeName("-12e-2")) + + assert.Equal(t, typeNameString, findScanTypeName(".")) assert.Equal(t, typeNameString, findScanTypeName("1e2x3")) assert.Equal(t, typeNameString, findScanTypeName("-1e2x3")) - assert.Equal(t, typeNameString, findScanTypeName("0x")) - assert.Equal(t, typeNameString, findScanTypeName("-0x")) + + assert.Equal(t, typeNameString, findScanTypeName("inf")) + assert.Equal(t, typeNameString, findScanTypeName("infinity")) + assert.Equal(t, typeNameString, findScanTypeName("NaN")) + assert.Equal(t, typeNameString, findScanTypeName("-inf")) + assert.Equal(t, typeNameString, findScanTypeName("-infinity")) + assert.Equal(t, typeNameString, findScanTypeName("-NaN")) +} + +func TestFindScanTypeNameHexes(t *testing.T) { assert.Equal(t, typeNameHexInt, findScanTypeName("0x0")) assert.Equal(t, typeNameHexInt, findScanTypeName("-0x0")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0xf")) + assert.Equal(t, typeNameHexInt, findScanTypeName("-0xf")) 
assert.Equal(t, typeNameHexInt, findScanTypeName("0xcafe")) assert.Equal(t, typeNameHexInt, findScanTypeName("-0xcafe")) + + assert.Equal(t, typeNameHexInt, findScanTypeName("0x7ffffffffffffffe")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0x7fffffffffffffff")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0x8000000000000000")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0x8000000000000001")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0xfffffffffffffffe")) + assert.Equal(t, typeNameHexInt, findScanTypeName("0xffffffffffffffff")) + + assert.Equal(t, typeNameString, findScanTypeName("0x")) + assert.Equal(t, typeNameString, findScanTypeName("-0x")) assert.Equal(t, typeNameString, findScanTypeName("0xcape")) assert.Equal(t, typeNameString, findScanTypeName("-0xcape")) - assert.Equal(t, typeNameString, findScanTypeName("0b")) - assert.Equal(t, typeNameString, findScanTypeName("-0b")) +} + +func TestFindScanTypeNameOctals(t *testing.T) { + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("00")) + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("-00")) + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("01")) + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("-01")) + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("0377")) + assert.Equal(t, typeNameLeadingZeroOctalInt, findScanTypeName("-0377")) + + assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("08")) + assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("-08")) + + assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("06789")) + assert.Equal(t, typeNameLeadingZeroDecimalInt, findScanTypeName("-06789")) + + assert.Equal(t, typeNameOctalInt, findScanTypeName("0o377")) + assert.Equal(t, typeNameOctalInt, findScanTypeName("-0o377")) + + assert.Equal(t, typeNameString, findScanTypeName("0o6789")) + assert.Equal(t, typeNameString, findScanTypeName("-0o6789")) +} + +func 
TestFindScanTypeNameBinaries(t *testing.T) { assert.Equal(t, typeNameBinaryInt, findScanTypeName("0b0")) assert.Equal(t, typeNameBinaryInt, findScanTypeName("-0b0")) assert.Equal(t, typeNameBinaryInt, findScanTypeName("0b1011")) assert.Equal(t, typeNameBinaryInt, findScanTypeName("-0b1011")) + + assert.Equal(t, typeNameString, findScanTypeName("0b")) + assert.Equal(t, typeNameString, findScanTypeName("-0b")) assert.Equal(t, typeNameString, findScanTypeName("0b1021")) assert.Equal(t, typeNameString, findScanTypeName("-0b1021")) - assert.Equal(t, typeNameBool, findScanTypeName("true")) - assert.Equal(t, typeNameBool, findScanTypeName("true")) - assert.Equal(t, typeNameBool, findScanTypeName("false")) - assert.Equal(t, typeNameBool, findScanTypeName("false")) - assert.Equal(t, typeNameString, findScanTypeName("True")) +} + +func TestFindScanTypeNameBooleans(t *testing.T) { + assert.Equal(t, typeNameString, findScanTypeName("true")) assert.Equal(t, typeNameString, findScanTypeName("True")) - assert.Equal(t, typeNameString, findScanTypeName("False")) + assert.Equal(t, typeNameString, findScanTypeName("false")) assert.Equal(t, typeNameString, findScanTypeName("False")) } diff --git a/internal/pkg/scan/type.go b/internal/pkg/scan/type.go index f8ad5eb3d4..674969ee4a 100644 --- a/internal/pkg/scan/type.go +++ b/internal/pkg/scan/type.go @@ -5,29 +5,32 @@ package scan type ScanType int const ( - scanTypeString ScanType = 0 - scanTypeDecimalInt = 1 - scanTypeOctalInt = 2 - scanTypeHexInt = 3 - scanTypeBinaryInt = 4 - scanTypeMaybeFloat = 5 - scanTypeBool = 6 + scanTypeString ScanType = 0 + scanTypeDecimalInt = 1 + scanTypeLeadingZeroDecimalInt = 2 + scanTypeOctalInt = 3 + scanTypeLeadingZeroOctalInt = 4 + scanTypeHexInt = 5 + scanTypeBinaryInt = 6 + scanTypeMaybeFloat = 7 ) const typeNameString = "string" -const typeNameDecimalInt = "decint" -const typeNameOctalInt = "octint" -const typeNameHexInt = "hexint" -const typeNameBinaryInt = "binint" -const typeNameMaybeFloat = 
"float?" -const typeNameBool = "bool" +const typeNameDecimalInt = "decint" // e.g. 123 +const typeNameLeadingZeroDecimalInt = "lzdecint" // e.g. 0899 +const typeNameOctalInt = "octint" // e.g. 0o377 +const typeNameLeadingZeroOctalInt = "lzoctint" // e.g. 0377 +const typeNameHexInt = "hexint" // e.g. 0xcafe +const typeNameBinaryInt = "binint" // e.g. 0b1011 +const typeNameMaybeFloat = "float?" // characters in [0-9\.-+eE] but needs parse to be sure var TypeNames = []string{ typeNameString, typeNameDecimalInt, + typeNameLeadingZeroDecimalInt, typeNameOctalInt, + typeNameLeadingZeroOctalInt, typeNameHexInt, typeNameBinaryInt, typeNameMaybeFloat, - typeNameBool, } diff --git a/internal/pkg/scan/type_test.go b/internal/pkg/scan/type_test.go index b68627f4a6..f64dfc904a 100644 --- a/internal/pkg/scan/type_test.go +++ b/internal/pkg/scan/type_test.go @@ -9,9 +9,10 @@ import ( func TestTypeNames(t *testing.T) { assert.Equal(t, TypeNames[scanTypeString], "string") assert.Equal(t, TypeNames[scanTypeDecimalInt], "decint") - assert.Equal(t, TypeNames[scanTypeOctalInt], "octint") - assert.Equal(t, TypeNames[scanTypeHexInt], "hexint") - assert.Equal(t, TypeNames[scanTypeBinaryInt], "binint") - assert.Equal(t, TypeNames[scanTypeMaybeFloat], "float?") - assert.Equal(t, TypeNames[scanTypeBool], "bool") + assert.Equal(t, TypeNames[scanTypeLeadingZeroDecimalInt], "lzdecint") // e.g. 0899 + assert.Equal(t, TypeNames[scanTypeOctalInt], "octint") // e.g. 0o377 + assert.Equal(t, TypeNames[scanTypeLeadingZeroOctalInt], "lzoctint") // e.g. 0377 + assert.Equal(t, TypeNames[scanTypeHexInt], "hexint") // e.g. 0xcafe + assert.Equal(t, TypeNames[scanTypeBinaryInt], "binint") // e.g. 
0b1011 + assert.Equal(t, TypeNames[scanTypeMaybeFloat], "float?") // characters in [0-9\.-+eE] but needs parse to be sure } diff --git a/todo.txt b/todo.txt index d27ad200af..74f1b1a064 100644 --- a/todo.txt +++ b/todo.txt @@ -1,6 +1,21 @@ ================================================================ PUNCHDOWN LIST +* numeric-inference perf + k benchmark per se + ? webdoc --cpuprofile wut + + ! 0x8..0 ..0xf..f handling + ! octal handling + > note/doc 0o prefix + > leading-0 tests @ scan & mlv + > what about 0999? + ! opt-bool handling + + o README-profiling.md re various scripts + o README-profiling.md re this PR + o webdoc re on-battery anecdote + * blockers: - linux/1.17 perf checks - fractional-strptime @@ -13,17 +28,6 @@ PUNCHDOWN LIST - big-picture note ? array/map fields: marshal as JSON_SINGLE_LINE -* numeric-inference perf - k benchmark per se - ? webdoc --cpuprofile wut - - ! octal handling - ! opt-bool handling - - o README-profiling.md re various scripts - o README-profiling.md re this PR - o webdoc re on-battery anecdote - * nikos materials -> fold in * cases/dsl-min-max-types: cmp-matrices need to be fixed to follow the advertised rule for mixed types From e340a75915f37460fdc051c9d2be99d62ba7c345 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 27 Dec 2021 00:26:25 -0500 Subject: [PATCH 15/16] replace old inferrer with newer/faster --- internal/pkg/lib/util.go | 2 +- internal/pkg/mlrval/mlrval_infer.go | 70 ++++++++++++++----------- internal/pkg/scan/find.go | 3 +- internal/pkg/scan/type_test.go | 10 ++-- test/cases/io-infer-flags/dash-O/expout | 4 +- todo.txt | 2 + 6 files changed, 50 insertions(+), 41 deletions(-) diff --git a/internal/pkg/lib/util.go b/internal/pkg/lib/util.go index bee04660a8..ff4c692cb0 100644 --- a/internal/pkg/lib/util.go +++ b/internal/pkg/lib/util.go @@ -101,7 +101,7 @@ func TryIntFromString(input string) (int, bool) { } } - // Following twos-complement formatting familiar from all manners of + // Following 
twos-complement formatting familiar from all manner of // languages, including C which was Miller's original implementation // language, we want to allow 0x00....00 through 0x7f....ff as positive // 64-bit integers and 0x80....00 through 0xff....ff as negative ones. Go's diff --git a/internal/pkg/mlrval/mlrval_infer.go b/internal/pkg/mlrval/mlrval_infer.go index 3704a0e13b..68d88763c1 100644 --- a/internal/pkg/mlrval/mlrval_infer.go +++ b/internal/pkg/mlrval/mlrval_infer.go @@ -110,7 +110,7 @@ func inferDecimalInt(mv *Mlrval) *Mlrval { // TODO: comment func inferLeadingZeroDecimalIntAsInt(mv *Mlrval) *Mlrval { - intval, err := strconv.ParseInt(mv.printrep[1:], 10, 64) + intval, err := strconv.ParseInt(mv.printrep, 10, 64) if err == nil { return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) } else { @@ -121,19 +121,7 @@ func inferLeadingZeroDecimalIntAsInt(mv *Mlrval) *Mlrval { // TODO: comment // E.g. explicit 0o377, not 0377 func inferOctalInt(mv *Mlrval) *Mlrval { - var input string - // Skip known leading 0x or -0x prefix - if mv.printrep[0] == '-' { - input = mv.printrep[3:] - } else { - input = mv.printrep[2:] - } - intval, err := strconv.ParseInt(input, 8, 64) - if err == nil { - return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) - } else { - return mv.SetFromString(mv.printrep) - } + return inferBaseInt(mv, 8) } // TODO: comment @@ -147,17 +135,19 @@ func inferFromLeadingZeroOctalIntAsInt(mv *Mlrval) *Mlrval { } // TODO: comment -// The 2: is to get past the known 0x prefix func inferHexInt(mv *Mlrval) *Mlrval { var input string + var negate bool // Skip known leading 0x or -0x prefix if mv.printrep[0] == '-' { input = mv.printrep[3:] + negate = true } else { input = mv.printrep[2:] + negate = false } - // Following twos-complement formatting familiar from all manners of + // Following twos-complement formatting familiar from all manner of // languages, including C which was Miller's original implementation // language, we want to 
allow 0x00....00 through 0x7f....ff as positive // 64-bit integers and 0x80....00 through 0xff....ff as negative ones. Go's @@ -170,13 +160,20 @@ func inferHexInt(mv *Mlrval) *Mlrval { i0 := input[0] if len(input) == 16 && ('8' <= i0 && i0 <= 'f') { uintval, err := strconv.ParseUint(input, 16, 64) + intval := int(uintval) + if negate { + intval = -intval + } if err == nil { - return mv.SetFromPrevalidatedIntString(mv.printrep, int(uintval)) + return mv.SetFromPrevalidatedIntString(mv.printrep, intval) } else { return mv.SetFromString(mv.printrep) } } else { intval, err := strconv.ParseInt(input, 16, 64) + if negate { + intval = -intval + } if err == nil { return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) } else { @@ -187,21 +184,8 @@ func inferHexInt(mv *Mlrval) *Mlrval { } // TODO: comment -// The 2: is to get past the known 0b prefix func inferBinaryInt(mv *Mlrval) *Mlrval { - var input string - // Skip known leading 0x or -0x prefix - if mv.printrep[0] == '-' { - input = mv.printrep[3:] - } else { - input = mv.printrep[2:] - } - intval, err := strconv.ParseInt(input, 16, 64) - if err == nil { - return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) - } else { - return mv.SetFromString(mv.printrep) - } + return inferBaseInt(mv, 2) } // TODO: comment @@ -222,3 +206,27 @@ func inferFromBool(mv *Mlrval) *Mlrval { return mv.SetFromPrevalidatedBoolString(mv.printrep, false) } } + +// TODO: comment +// Shared code for 0o/0b integers +func inferBaseInt(mv *Mlrval, base int) *Mlrval { + var input string + var negate bool + // Skip known leading 0x or -0x prefix + if mv.printrep[0] == '-' { + input = mv.printrep[3:] + negate = true + } else { + input = mv.printrep[2:] + negate = false + } + intval, err := strconv.ParseInt(input, base, 64) + if err == nil { + if negate { + intval = -intval + } + return mv.SetFromPrevalidatedIntString(mv.printrep, int(intval)) + } else { + return mv.SetFromString(mv.printrep) + } +} diff --git 
a/internal/pkg/scan/find.go b/internal/pkg/scan/find.go index f940841554..d3a9112054 100644 --- a/internal/pkg/scan/find.go +++ b/internal/pkg/scan/find.go @@ -1,7 +1,6 @@ package scan -import ( -) +import () // TODO: comment re context diff --git a/internal/pkg/scan/type_test.go b/internal/pkg/scan/type_test.go index f64dfc904a..4960963e6c 100644 --- a/internal/pkg/scan/type_test.go +++ b/internal/pkg/scan/type_test.go @@ -10,9 +10,9 @@ func TestTypeNames(t *testing.T) { assert.Equal(t, TypeNames[scanTypeString], "string") assert.Equal(t, TypeNames[scanTypeDecimalInt], "decint") assert.Equal(t, TypeNames[scanTypeLeadingZeroDecimalInt], "lzdecint") // e.g. 0899 - assert.Equal(t, TypeNames[scanTypeOctalInt], "octint") // e.g. 0o377 - assert.Equal(t, TypeNames[scanTypeLeadingZeroOctalInt], "lzoctint") // e.g. 0377 - assert.Equal(t, TypeNames[scanTypeHexInt], "hexint") // e.g. 0xcafe - assert.Equal(t, TypeNames[scanTypeBinaryInt], "binint") // e.g. 0b1011 - assert.Equal(t, TypeNames[scanTypeMaybeFloat], "float?") // characters in [0-9\.-+eE] but needs parse to be sure + assert.Equal(t, TypeNames[scanTypeOctalInt], "octint") // e.g. 0o377 + assert.Equal(t, TypeNames[scanTypeLeadingZeroOctalInt], "lzoctint") // e.g. 0377 + assert.Equal(t, TypeNames[scanTypeHexInt], "hexint") // e.g. 0xcafe + assert.Equal(t, TypeNames[scanTypeBinaryInt], "binint") // e.g. 0b1011 + assert.Equal(t, TypeNames[scanTypeMaybeFloat], "float?") // characters in [0-9\.-+eE] but needs parse to be sure } diff --git a/test/cases/io-infer-flags/dash-O/expout b/test/cases/io-infer-flags/dash-O/expout index 2ecaa2daba..55e06d2b1f 100644 --- a/test/cases/io-infer-flags/dash-O/expout +++ b/test/cases/io-infer-flags/dash-O/expout @@ -3,7 +3,7 @@ x t y z 123.45 float 124.45 123.95 0123 int 84 83.5 07 int 8 7.5 -08 float 9 8.5 +08 int 9 8.5 0 int 1 0.5 0. 
float 1 0.5 0.0 float 1 0.5 @@ -16,7 +16,7 @@ x t y z -0b0100 int -3 -3.5 -0x1000 int -4095 -4095.5 -07 int -6 -6.5 --08 float -7 -7.5 +-08 int -7 -7.5 -0 int 1 0.5 -0. float 1 0.5 -0.0 float 1 0.5 diff --git a/todo.txt b/todo.txt index 74f1b1a064..194e2925cc 100644 --- a/todo.txt +++ b/todo.txt @@ -10,10 +10,12 @@ PUNCHDOWN LIST > note/doc 0o prefix > leading-0 tests @ scan & mlv > what about 0999? + > ensure ut 0x/0b/0o pos neg ! opt-bool handling o README-profiling.md re various scripts o README-profiling.md re this PR + o webdoc 0899 w/ mlr -O float -> int o webdoc re on-battery anecdote * blockers: From 5de39defb445d25db76e37683b4f0252a09ac462 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 27 Dec 2021 00:40:51 -0500 Subject: [PATCH 16/16] update docs for new type-inferrer --- cmd/scan/main.go | 14 -------------- docs/src/manpage.md | 2 +- docs/src/manpage.txt | 2 +- docs/src/new-in-miller-6.md | 3 ++- docs/src/new-in-miller-6.md.in | 3 ++- docs/src/reference-main-arithmetic.md | 17 +++++++++++++---- docs/src/reference-main-arithmetic.md.in | 17 +++++++++++++---- man/manpage.txt | 2 +- man/mlr.1 | 4 ++-- todo.txt | 17 +++++------------ 10 files changed, 40 insertions(+), 41 deletions(-) diff --git a/cmd/scan/main.go b/cmd/scan/main.go index 9b4537577c..c185b87524 100644 --- a/cmd/scan/main.go +++ b/cmd/scan/main.go @@ -11,21 +11,7 @@ import ( "github.com/johnkerl/miller/internal/pkg/scan" ) -// const ( -// scanTypeString ScanType = 0 -// scanTypeDecimalInt = 1 -// scanTypeOctalInt = 2 -// scanTypeHexInt = 3 -// scanTypeBinaryInt = 4 -// scanTypeMaybeFloat = 5 -// scanTypeBool = 6 -// ) - func main() { - // TODO: - // func ParseInt(s string, base int, bitSize int) (int64, error) - // func ParseUint(s string, base int, bitSize int) (uint64, error) - for _, arg := range os.Args[1:] { scanType := scan.FindScanType(arg) fmt.Printf("%-10s -> %s\n", arg, scan.TypeNames[scanType]) diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 1453ce12d0..4a591235a4 
100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -3040,5 +3040,5 @@ SEE ALSO - 2021-12-26 MILLER(1) + 2021-12-27 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index e96a68b310..0eb8399070 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -3019,4 +3019,4 @@ SEE ALSO - 2021-12-26 MILLER(1) + 2021-12-27 MILLER(1) diff --git a/docs/src/new-in-miller-6.md b/docs/src/new-in-miller-6.md index 563fad8315..20e59c9557 100644 --- a/docs/src/new-in-miller-6.md +++ b/docs/src/new-in-miller-6.md @@ -255,7 +255,8 @@ The following differences are rather technical. If they don't sound familiar to * See also `mlr help legacy-flags` or the [legacy-flags reference](reference-main-flag-list.md#legacy-flags). * Type-inference: * The `-S` and `-F` flags to `mlr put` and `mlr filter` are ignored, since type-inference is no longer done in `mlr put` and `mlr filter`, but rather, when records are first read. You can use `mlr -S` and `mlr -A`, respectively, instead to control type-inference within the record-readers. - * Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as float. + * Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as decimal integers. + * Any numbers prefixed with `0o`, e.g. `0o377`, are already treated as octal regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled. * See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags). * Emitting a map-valued expression now requires either a temporary variable or the new `emit1` keyword. Please see the [page on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information. 
diff --git a/docs/src/new-in-miller-6.md.in b/docs/src/new-in-miller-6.md.in index 1fe1facb23..321f08e853 100644 --- a/docs/src/new-in-miller-6.md.in +++ b/docs/src/new-in-miller-6.md.in @@ -213,7 +213,8 @@ The following differences are rather technical. If they don't sound familiar to * See also `mlr help legacy-flags` or the [legacy-flags reference](reference-main-flag-list.md#legacy-flags). * Type-inference: * The `-S` and `-F` flags to `mlr put` and `mlr filter` are ignored, since type-inference is no longer done in `mlr put` and `mlr filter`, but rather, when records are first read. You can use `mlr -S` and `mlr -A`, respectively, instead to control type-inference within the record-readers. - * Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as float. + * Octal numbers like `0123` and `07` are type-inferred as string. Use `mlr -O` to infer them as octal integers. Note that `08` and `09` will then infer as decimal integers. + * Any numbers prefixed with `0o`, e.g. `0o377`, are already treated as octal regardless of `mlr -O` -- `mlr -O` only affects how leading-zero integers are handled. * See also the [miscellaneous-flags reference](reference-main-flag-list.md#miscellaneous-flags). * Emitting a map-valued expression now requires either a temporary variable or the new `emit1` keyword. Please see the [page on emit statements](reference-dsl-output-statements.md#emit1-and-emitemitpemitf) for more information. diff --git a/docs/src/reference-main-arithmetic.md b/docs/src/reference-main-arithmetic.md index bd8b797ebe..2c42aa6050 100644 --- a/docs/src/reference-main-arithmetic.md +++ b/docs/src/reference-main-arithmetic.md @@ -20,11 +20,20 @@ Quick links: Numbers in Miller are double-precision float or 64-bit signed integers. 
Anything scannable as int, e.g `123` or `0xabcd`, is treated as an integer; otherwise, input scannable as float (`4.56` or `8e9`) is treated as float; everything else is a string. -If you want all numbers to be treated as floats, then you may use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b` with `$c = float($a) * float($b)`). +Three flags control input-scanning for numbers: `mlr -O`, `mlr -A`, and `mlr -S`. - +Prefix `0x` means hexadecimal, e.g. `0xcafe`; prefix `0b` means binary, e.g. +`0b1011`; prefix `0o` means octal, e.g. `0o377`. Numbers in data files with +leading zeroes, e.g. `0377` or `06789`, are treated as strings in Miller, +unless you specify `mlr -O`: then `0377` will scan as an octal integer (with +value 255), and `06789` will scan as a decimal integer (with value 6789). + +If you want all numbers from data files to be treated as floats, then you may +use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b` +with `$c = float($a) * float($b)`). Or, use `mlr -A`. + +If you use `mlr -S` then all field values from data files are read in as +strings; you can cast them using `int()` or `float()`. ## Conversion by math routines diff --git a/docs/src/reference-main-arithmetic.md.in b/docs/src/reference-main-arithmetic.md.in index 6e481f7364..cbf584643b 100644 --- a/docs/src/reference-main-arithmetic.md.in +++ b/docs/src/reference-main-arithmetic.md.in @@ -4,11 +4,20 @@ Numbers in Miller are double-precision float or 64-bit signed integers. Anything scannable as int, e.g `123` or `0xabcd`, is treated as an integer; otherwise, input scannable as float (`4.56` or `8e9`) is treated as float; everything else is a string. -If you want all numbers to be treated as floats, then you may use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b` with `$c = float($a) * float($b)`). +Three flags control input-scanning for numbers: `mlr -O`, `mlr -A`, and `mlr -S`. - +Prefix `0x` means hexadecimal, e.g. 
`0xcafe`; prefix `0b` means binary, e.g. +`0b1011`; prefix `0o` means octal, e.g. `0o377`. Numbers in data files with +leading zeroes, e.g. `0377` or `06789`, are treated as strings in Miller, +unless you specify `mlr -O`: then `0377` will scan as an octal integer (with +value 255), and `06789` will scan as a decimal integer (with value 6789). + +If you want all numbers from data files to be treated as floats, then you may +use `float()` in your filter/put expressions (e.g. replacing `$c = $a * $b` +with `$c = float($a) * float($b)`). Or, use `mlr -A`. + +If you use `mlr -S` then all field values from data files are read in as +strings; you can cast them using `int()` or `float()`. ## Conversion by math routines diff --git a/man/manpage.txt b/man/manpage.txt index e96a68b310..0eb8399070 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -3019,4 +3019,4 @@ SEE ALSO - 2021-12-26 MILLER(1) + 2021-12-27 MILLER(1) diff --git a/man/mlr.1 b/man/mlr.1 index 4ce4b1cd18..a60a32b46f 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2021-12-26 +.\" Date: 2021-12-27 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2021-12-26" "\ \&" "\ \&" +.TH "MILLER" "1" "2021-12-27" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/todo.txt b/todo.txt index 194e2925cc..fede517f1b 100644 --- a/todo.txt +++ b/todo.txt @@ -2,20 +2,9 @@ PUNCHDOWN LIST * numeric-inference perf - k benchmark per se - ? webdoc --cpuprofile wut - - ! 0x8..0 ..0xf..f handling - ! octal handling - > note/doc 0o prefix - > leading-0 tests @ scan & mlv - > what about 0999? - > ensure ut 0x/0b/0o pos neg - ! 
opt-bool handling - o README-profiling.md re various scripts o README-profiling.md re this PR - o webdoc 0899 w/ mlr -O float -> int + o update mac numbers; type up linux numbers o webdoc re on-battery anecdote * blockers: @@ -89,6 +78,10 @@ PUNCHDOWN LIST ================================================================ NON-BLOCKERS +* pos/neg 0x/0b/0o UTs + +* 0o into BNF + ? BIFs as FCFs? * pv: 'mlr --prepipex pv --gzin tail -n 10 ~/tmp/zhuge.gz' needs --gzin & --prepipex both