diff --git a/.gitignore b/.gitignore index 6a1ef02bc..fd1dcbd0b 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,4 @@ out/ /measurements*.txt /*.out out_expected.txt +/*-timing.json diff --git a/calculate_average_AlexanderYastrebov.sh b/calculate_average_AlexanderYastrebov.sh new file mode 100755 index 000000000..ea951bd4e --- /dev/null +++ b/calculate_average_AlexanderYastrebov.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# +# Copyright 2023 The original authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +INPUT=${1:-"measurements.txt"} + +target/AlexanderYastrebov/1brc "$INPUT" diff --git a/github_users.txt b/github_users.txt index 5bfc5dcbc..6213739e5 100644 --- a/github_users.txt +++ b/github_users.txt @@ -49,3 +49,4 @@ yavuztas;Yavuz Tas yehwankim23;김예환 Ye-Hwan Kim (Sam) hundredwatt;Jason Nochlin gnmathur;Gaurav Mathur +AlexanderYastrebov;Alexander Yastrebov diff --git a/prepare_AlexanderYastrebov.sh b/prepare_AlexanderYastrebov.sh new file mode 100755 index 000000000..3521ecb03 --- /dev/null +++ b/prepare_AlexanderYastrebov.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# +# Copyright 2023 The original authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +DOCKER_BUILDKIT=1 docker build -o target/AlexanderYastrebov src/main/go/AlexanderYastrebov diff --git a/src/main/go/AlexanderYastrebov/Dockerfile b/src/main/go/AlexanderYastrebov/Dockerfile new file mode 100644 index 000000000..a3b28067f --- /dev/null +++ b/src/main/go/AlexanderYastrebov/Dockerfile @@ -0,0 +1,22 @@ +# +# Copyright 2023 The original authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM golang AS build-stage +COPY . src/ +RUN cd src && go build . + +FROM scratch AS export-stage +COPY --from=build-stage /go/src/1brc / diff --git a/src/main/go/AlexanderYastrebov/README.md b/src/main/go/AlexanderYastrebov/README.md new file mode 100644 index 000000000..dc0119252 --- /dev/null +++ b/src/main/go/AlexanderYastrebov/README.md @@ -0,0 +1,58 @@ +# 1brc in go + +It uses Docker with BuildKit plugin to build and [export binary](https://docs.docker.com/engine/reference/commandline/build/#output) binary, +see [prepare_AlexanderYastrebov.sh](../../../../prepare_AlexanderYastrebov.sh) +and [calculate_average_AlexanderYastrebov.sh](../../../../calculate_average_AlexanderYastrebov.sh). + +Demo: +```sh +$ ./test.sh AlexanderYastrebov +[+] Building 0.2s (9/9) FINISHED + => [internal] load .dockerignore 0.0s + => => transferring context: 2B 0.0s + => [internal] load build definition from Dockerfile 0.0s + => => transferring dockerfile: 172B 0.0s + => [internal] load metadata for docker.io/library/golang:latest 0.0s + => [internal] load build context 0.0s + => => transferring context: 145B 0.0s + => [build-stage 1/3] FROM docker.io/library/golang 0.0s + => CACHED [build-stage 2/3] COPY . src/ 0.0s + => CACHED [build-stage 3/3] RUN cd src && go build . 0.0s + => CACHED [export-stage 1/1] COPY --from=build-stage /go/src/1brc / 0.0s + => exporting to client directory 0.1s + => => copying files 2.03MB 0.0s +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-10000-unique-keys.txt +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-10.txt +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-1.txt +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-20.txt +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-2.txt +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-3.txt +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-boundaries.txt +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-complex-utf8.txt +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-dot.txt +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-shortest.txt +Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-short.txt + +# Run once to setup the benchmark +# ./create_measurements.sh 1000000000 +# mv measurements.txt measurements_1B.txt +# ln -s measurements_1B.txt measurements.txt +# ./calculate_average_baseline.sh > out_expected.txt + +$ wc -l measurements_1B.txt +1000000000 measurements_1B.txt + +$ ./evaluate2.sh AlexanderYastrebov royvanrijn +... 0.0s +Benchmark 1: ./calculate_average_AlexanderYastrebov.sh 2>&1 + Time (mean ± σ): 16.786 s ± 0.545 s [User: 56.030 s, System: 10.068 s] + Range (min … max): 15.918 s … 17.309 s 5 runs +... +Benchmark 1: ./calculate_average_royvanrijn.sh 2>&1 + Time (mean ± σ): 16.731 s ± 0.190 s [User: 56.485 s, System: 10.279 s] + Range (min … max): 16.490 s … 16.951 s 5 runs + +Summary + AlexanderYastrebov: trimmed mean 16.901712789513336, raw times 16.69836470718,17.30911065018,16.83413600418,15.91787706218,17.17263765718 + royvanrijn: trimmed mean 16.738037123633333, raw times 16.4900939703,16.9513459953,16.5794539913,16.8297746273,16.8048827523 +``` diff --git a/src/main/go/AlexanderYastrebov/calc.go b/src/main/go/AlexanderYastrebov/calc.go new file mode 100644 index 000000000..7c633ccf6 --- /dev/null +++ b/src/main/go/AlexanderYastrebov/calc.go @@ -0,0 +1,282 @@ +package main + +import ( + "bytes" + "fmt" + "log" + "math" + "os" + "runtime" + "sort" + "sync" + "syscall" +) + +type measurement struct { + min, max, sum, count int64 +} + +func main() { + if len(os.Args) != 2 { + log.Fatalf("Missing measurements filename") + } + + measurements := processFile(os.Args[1]) + + ids := make([]string, 0, len(measurements)) + for id := range measurements { + ids = append(ids, id) + } + sort.Strings(ids) + + fmt.Print("{") + for i, id := range ids { + if i > 0 { + fmt.Print(", ") + } + m := measurements[id] + fmt.Printf("%s=%.1f/%.1f/%.1f", id, round(float64(m.min)/10.0), round(float64(m.sum)/10.0/float64(m.count)), round(float64(m.max)/10.0)) + } + fmt.Println("}") +} + +func processFile(filename string) map[string]*measurement { + f, err := os.Open(filename) + if err != nil { + log.Fatalf("Open: %v", err) + } + defer f.Close() + + fi, err := f.Stat() + if err != nil { + log.Fatalf("Stat: %v", err) + } + + size := fi.Size() + if size <= 0 || size != int64(int(size)) { + log.Fatalf("Invalid file size: %d", size) + } + + data, err := syscall.Mmap(int(f.Fd()), 0, int(size), syscall.PROT_READ, syscall.MAP_SHARED) + if err != nil { + log.Fatalf("Mmap: %v", err) + } + + defer func() { + if err := syscall.Munmap(data); err != nil { + log.Fatalf("Munmap: %v", err) + } + }() + + return process(data) +} + +func process(data []byte) map[string]*measurement { + nChunks := runtime.NumCPU() + + chunkSize := len(data) / nChunks + if chunkSize == 0 { + chunkSize = len(data) + } + + chunks := make([]int, 0, nChunks) + offset := 0 + for offset < len(data) { + offset += chunkSize + if offset >= len(data) { + chunks = append(chunks, len(data)) + break + } + + nlPos := bytes.IndexByte(data[offset:], '\n') + if nlPos == -1 { + chunks = append(chunks, len(data)) + break + } else { + offset += nlPos + 1 + chunks = append(chunks, offset) + } + } + + var wg sync.WaitGroup + wg.Add(len(chunks)) + + results := make([]map[string]*measurement, len(chunks)) + start := 0 + for i, chunk := range chunks { + go func(data []byte, i int) { + results[i] = processChunk(data) + wg.Done() + }(data[start:chunk], i) + start = chunk + } + wg.Wait() + + measurements := make(map[string]*measurement) + for _, r := range results { + for id, rm := range r { + m := measurements[id] + if m == nil { + measurements[id] = rm + } else { + m.min = min(m.min, rm.min) + m.max = max(m.max, rm.max) + m.sum += rm.sum + m.count += rm.count + } + } + } + return measurements +} + +func processChunk(data []byte) map[string]*measurement { + // use uint64 FNV-1a hash of id value as buckets key and keep mapping to the id value. + // This assumes no collisions of id hashes. + const ( + // use power of 2 for fast modulo calculation + nBuckets = 1 << 12 + maxIds = 10_000 + + fnv1aOffset64 = 14695981039346656037 + fnv1aPrime64 = 1099511628211 + ) + + type entry struct { + key uint64 + mid int + } + buckets := make([][]entry, nBuckets) + measurements := make([]measurement, 0, maxIds) + ids := make(map[uint64][]byte) + + getMeasurement := func(key uint64) *measurement { + i := key & uint64(nBuckets-1) + for j := 0; j < len(buckets[i]); j++ { + e := &buckets[i][j] + if e.key == key { + return &measurements[e.mid] + } + } + return nil + } + + putMeasurement := func(key uint64, m measurement) { + i := key & uint64(nBuckets-1) + buckets[i] = append(buckets[i], entry{key: key, mid: len(measurements)}) + measurements = append(measurements, m) + } + + // assume valid input + for len(data) > 0 { + + idHash := uint64(fnv1aOffset64) + semiPos := 0 + for i, b := range data { + if b == ';' { + semiPos = i + break + } + + // calculate FNV-1a hash + idHash ^= uint64(b) + idHash *= fnv1aPrime64 + } + + idData := data[:semiPos] + + data = data[semiPos+1:] + + var temp int64 + // parseNumber + { + negative := data[0] == '-' + if negative { + data = data[1:] + } + + _ = data[3] + if data[1] == '.' { + // 1.2\n + temp = int64(data[0])*10 + int64(data[2]) - '0'*(10+1) + data = data[4:] + // 12.3\n + } else { + _ = data[4] + temp = int64(data[0])*100 + int64(data[1])*10 + int64(data[3]) - '0'*(100+10+1) + data = data[5:] + } + + if negative { + temp = -temp + } + } + + m := getMeasurement(idHash) + if m == nil { + putMeasurement(idHash, measurement{ + min: temp, + max: temp, + sum: temp, + count: 1, + }) + ids[idHash] = idData + } else { + m.min = min(m.min, temp) + m.max = max(m.max, temp) + m.sum += temp + m.count++ + } + } + + result := make(map[string]*measurement, len(measurements)) + for _, bucket := range buckets { + for _, entry := range bucket { + result[string(ids[entry.key])] = &measurements[entry.mid] + } + } + return result +} + +func round(x float64) float64 { + return roundJava(x*10.0) / 10.0 +} + +// roundJava returns the closest integer to the argument, with ties +// rounding to positive infinity, see java's Math.round +func roundJava(x float64) float64 { + t := math.Trunc(x) + if x < 0.0 && t-x == 0.5 { + //return t + } else if math.Abs(x-t) >= 0.5 { + t += math.Copysign(1, x) + } + + if t == 0 { // check -0 + return 0.0 + } + return t +} + +// parseNumber reads decimal number that matches "^-?[0-9]{1,2}[.][0-9]" pattern, +// e.g.: -12.3, -3.4, 5.6, 78.9 and return the value*10, i.e. -123, -34, 56, 789. +func parseNumber(data []byte) int64 { + negative := data[0] == '-' + if negative { + data = data[1:] + } + + var result int64 + switch len(data) { + // 1.2 + case 3: + result = int64(data[0])*10 + int64(data[2]) - '0'*(10+1) + // 12.3 + case 4: + result = int64(data[0])*100 + int64(data[1])*10 + int64(data[3]) - '0'*(100+10+1) + } + + if negative { + return -result + } + return result +} diff --git a/src/main/go/AlexanderYastrebov/calc_test.go b/src/main/go/AlexanderYastrebov/calc_test.go new file mode 100644 index 000000000..db7e27a2c --- /dev/null +++ b/src/main/go/AlexanderYastrebov/calc_test.go @@ -0,0 +1,86 @@ +package main + +import ( + "fmt" + "os" + "testing" +) + +func TestRoundJava(t *testing.T) { + for _, tc := range []struct { + value float64 + expected string + }{ + {value: -1.5, expected: "-1.0"}, + {value: -1.0, expected: "-1.0"}, + {value: -0.7, expected: "-1.0"}, + {value: -0.5, expected: "0.0"}, + {value: -0.3, expected: "0.0"}, + {value: 0.0, expected: "0.0"}, + {value: 0.3, expected: "0.0"}, + {value: 0.5, expected: "1.0"}, + {value: 0.7, expected: "1.0"}, + {value: 1.0, expected: "1.0"}, + {value: 1.5, expected: "2.0"}, + } { + if rounded := roundJava(tc.value); fmt.Sprintf("%.1f", rounded) != tc.expected { + t.Errorf("Wrong rounding of %v, expected: %s, got: %.1f", tc.value, tc.expected, rounded) + } + } +} + +func TestParseNumber(t *testing.T) { + for _, tc := range []struct { + value string + expected string + }{ + {value: "-99.9", expected: "-999"}, + {value: "-12.3", expected: "-123"}, + {value: "-1.5", expected: "-15"}, + {value: "-1.0", expected: "-10"}, + {value: "0.0", expected: "0"}, + {value: "0.3", expected: "3"}, + {value: "12.3", expected: "123"}, + {value: "99.9", expected: "999"}, + } { + if number := parseNumber([]byte(tc.value)); fmt.Sprintf("%d", number) != tc.expected { + t.Errorf("Wrong parsing of %v, expected: %s, got: %d", tc.value, tc.expected, number) + } + } +} + +var parseNumberSink int64 + +func BenchmarkParseNumber(b *testing.B) { + data1 := []byte("1.2") + data2 := []byte("-12.3") + + for i := 0; i < b.N; i++ { + parseNumberSink = parseNumber(data1) + parseNumber(data2) + } +} + +func BenchmarkProcess(b *testing.B) { + // $ ./create_measurements.sh 1000000 && mv measurements.txt measurements-1e6.txt + // Created file with 1,000,000 measurements in 514 ms + const filename = "../../../../measurements-1e6.txt" + + data, err := os.ReadFile(filename) + if err != nil { + b.Fatal(err) + } + + measurements := process(data) + rows := int64(0) + for _, m := range measurements { + rows += m.count + } + + b.ReportAllocs() + b.ResetTimer() + b.ReportMetric(float64(rows), "rows/op") + + for i := 0; i < b.N; i++ { + process(data) + } +} diff --git a/src/main/go/AlexanderYastrebov/go.mod b/src/main/go/AlexanderYastrebov/go.mod new file mode 100644 index 000000000..08f5bd193 --- /dev/null +++ b/src/main/go/AlexanderYastrebov/go.mod @@ -0,0 +1,3 @@ +module github.com/AlexanderYastrebov/1brc + +go 1.21.5