diff --git a/android/guava-tests/benchmark/com/google/common/io/ByteSourceAsCharSourceReadBenchmark.java b/android/guava-tests/benchmark/com/google/common/io/ByteSourceAsCharSourceReadBenchmark.java new file mode 100644 index 000000000000..ee81ea1259b2 --- /dev/null +++ b/android/guava-tests/benchmark/com/google/common/io/ByteSourceAsCharSourceReadBenchmark.java @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2017 The Guava Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package com.google.common.io; + +import com.google.caliper.BeforeExperiment; +import com.google.caliper.Benchmark; +import com.google.caliper.Param; +import com.google.caliper.api.VmOptions; +import com.google.common.base.Optional; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.Random; + +/** + * Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}. + */ +// These benchmarks allocate a lot of data so use a large heap +@VmOptions({"-Xms12g", "-Xmx12g", "-d64"}) +public class ByteSourceAsCharSourceReadBenchmark { + enum ReadStrategy { + TO_BYTE_ARRAY_NEW_STRING { + @Override + String read(ByteSource byteSource, Charset cs) throws IOException { + return new String(byteSource.read(), cs); + } + }, + USING_CHARSTREAMS_COPY { + @Override + String read(ByteSource byteSource, Charset cs) throws IOException { + StringBuilder sb = new StringBuilder(); + try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) { + CharStreams.copy(reader, sb); + } + return sb.toString(); + } + }, + // It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING. But it just isn't + // my best guess is that the jdk authors have spent more time optimizing that callpath than this + // one. (StringCoding$StringDecoder vs. StreamDecoder). StringCoding has a ton of special cases + // theoretically we could duplicate all that logic here to try to beat 'new String' or at least + // come close. + USING_DECODER_WITH_SIZE_HINT { + @Override + String read(ByteSource byteSource, Charset cs) throws IOException { + Optional size = byteSource.sizeIfKnown(); + // if we know the size and it fits in an int + if (size.isPresent() && size.get().longValue() == size.get().intValue()) { + // otherwise try to presize a StringBuilder + // it is kind of lame that we need to construct a decoder to access this value. + // if this is a concern we could add special cases for some known charsets (like utf8) + // or we could avoid inputstreamreader and use the decoder api directly + // TODO(lukes): in a real implementation we would need to handle overflow conditions + int maxChars = (int) (size.get().intValue() * cs.newDecoder().maxCharsPerByte()); + char[] buffer = new char[maxChars]; + int bufIndex = 0; + int remaining = buffer.length; + try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) { + int nRead = 0; + while (remaining > 0 && (nRead = reader.read(buffer, bufIndex, remaining)) != -1) { + bufIndex += nRead; + remaining -= nRead; + } + if (nRead == -1) { + // we reached EOF + return new String(buffer, 0, bufIndex); + } + // otherwise we got the size wrong. This can happen if the size changes between when + // we called sizeIfKnown and when we started reading the file (or i guess if + // maxCharsPerByte is wrong) + // Fallback to an incremental approach + StringBuilder builder = new StringBuilder(bufIndex + 32); + builder.append(buffer, 0, bufIndex); + buffer = null; // release for gc + CharStreams.copy(reader, builder); + return builder.toString(); + } + + } else { + return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs); + } + } + }; + + abstract String read(ByteSource byteSource, Charset cs) throws IOException; + } + + @Param({"UTF-8"}) + String charsetName; + + @Param ReadStrategy strategy; + + @Param({"10", "1024", "1048576"}) + int size; + + Charset charset; + ByteSource data; + + @BeforeExperiment + public void setUp() { + charset = Charset.forName(charsetName); + StringBuilder sb = new StringBuilder(); + Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior + sb.ensureCapacity(size); + for (int k = 0; k < size; k++) { + // [9-127) includes all ascii non-control characters + sb.append((char) (random.nextInt(127 - 9) + 9)); + } + String string = sb.toString(); + sb.setLength(0); + data = ByteSource.wrap(string.getBytes(charset)); + } + + @Benchmark + public int timeCopy(int reps) throws IOException { + int r = 0; + final Charset localCharset = charset; + final ByteSource localData = data; + final ReadStrategy localStrategy = strategy; + for (int i = 0; i < reps; i++) { + r += localStrategy.read(localData, localCharset).hashCode(); + } + return r; + } +} diff --git a/android/guava/src/com/google/common/io/ByteSource.java b/android/guava/src/com/google/common/io/ByteSource.java index 1beb9cacefca..7fa64b89c8ee 100644 --- a/android/guava/src/com/google/common/io/ByteSource.java +++ b/android/guava/src/com/google/common/io/ByteSource.java @@ -455,6 +455,18 @@ public Reader openStream() throws IOException { return new InputStreamReader(ByteSource.this.openStream(), charset); } + @Override + public String read() throws IOException { + // Reading all the data as a byte array is more efficient than the default read() + // implementation because: + // 1. the string constructor can avoid an extra copy most of the time by correctly sizing the + // internal char array (hard to avoid using StringBuilder) + // 2. we avoid extra copies into temporary buffers altogether + // The downside is that this will cause us to store the file bytes in memory twice for a short + // amount of time. + return new String(ByteSource.this.read(), charset); + } + @Override public String toString() { return ByteSource.this.toString() + ".asCharSource(" + charset + ")"; diff --git a/guava-tests/benchmark/com/google/common/io/ByteSourceAsCharSourceReadBenchmark.java b/guava-tests/benchmark/com/google/common/io/ByteSourceAsCharSourceReadBenchmark.java new file mode 100644 index 000000000000..ee81ea1259b2 --- /dev/null +++ b/guava-tests/benchmark/com/google/common/io/ByteSourceAsCharSourceReadBenchmark.java @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2017 The Guava Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package com.google.common.io; + +import com.google.caliper.BeforeExperiment; +import com.google.caliper.Benchmark; +import com.google.caliper.Param; +import com.google.caliper.api.VmOptions; +import com.google.common.base.Optional; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.Random; + +/** + * Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}. + */ +// These benchmarks allocate a lot of data so use a large heap +@VmOptions({"-Xms12g", "-Xmx12g", "-d64"}) +public class ByteSourceAsCharSourceReadBenchmark { + enum ReadStrategy { + TO_BYTE_ARRAY_NEW_STRING { + @Override + String read(ByteSource byteSource, Charset cs) throws IOException { + return new String(byteSource.read(), cs); + } + }, + USING_CHARSTREAMS_COPY { + @Override + String read(ByteSource byteSource, Charset cs) throws IOException { + StringBuilder sb = new StringBuilder(); + try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) { + CharStreams.copy(reader, sb); + } + return sb.toString(); + } + }, + // It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING. But it just isn't + // my best guess is that the jdk authors have spent more time optimizing that callpath than this + // one. (StringCoding$StringDecoder vs. StreamDecoder). StringCoding has a ton of special cases + // theoretically we could duplicate all that logic here to try to beat 'new String' or at least + // come close. + USING_DECODER_WITH_SIZE_HINT { + @Override + String read(ByteSource byteSource, Charset cs) throws IOException { + Optional size = byteSource.sizeIfKnown(); + // if we know the size and it fits in an int + if (size.isPresent() && size.get().longValue() == size.get().intValue()) { + // otherwise try to presize a StringBuilder + // it is kind of lame that we need to construct a decoder to access this value. + // if this is a concern we could add special cases for some known charsets (like utf8) + // or we could avoid inputstreamreader and use the decoder api directly + // TODO(lukes): in a real implementation we would need to handle overflow conditions + int maxChars = (int) (size.get().intValue() * cs.newDecoder().maxCharsPerByte()); + char[] buffer = new char[maxChars]; + int bufIndex = 0; + int remaining = buffer.length; + try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) { + int nRead = 0; + while (remaining > 0 && (nRead = reader.read(buffer, bufIndex, remaining)) != -1) { + bufIndex += nRead; + remaining -= nRead; + } + if (nRead == -1) { + // we reached EOF + return new String(buffer, 0, bufIndex); + } + // otherwise we got the size wrong. This can happen if the size changes between when + // we called sizeIfKnown and when we started reading the file (or i guess if + // maxCharsPerByte is wrong) + // Fallback to an incremental approach + StringBuilder builder = new StringBuilder(bufIndex + 32); + builder.append(buffer, 0, bufIndex); + buffer = null; // release for gc + CharStreams.copy(reader, builder); + return builder.toString(); + } + + } else { + return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs); + } + } + }; + + abstract String read(ByteSource byteSource, Charset cs) throws IOException; + } + + @Param({"UTF-8"}) + String charsetName; + + @Param ReadStrategy strategy; + + @Param({"10", "1024", "1048576"}) + int size; + + Charset charset; + ByteSource data; + + @BeforeExperiment + public void setUp() { + charset = Charset.forName(charsetName); + StringBuilder sb = new StringBuilder(); + Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior + sb.ensureCapacity(size); + for (int k = 0; k < size; k++) { + // [9-127) includes all ascii non-control characters + sb.append((char) (random.nextInt(127 - 9) + 9)); + } + String string = sb.toString(); + sb.setLength(0); + data = ByteSource.wrap(string.getBytes(charset)); + } + + @Benchmark + public int timeCopy(int reps) throws IOException { + int r = 0; + final Charset localCharset = charset; + final ByteSource localData = data; + final ReadStrategy localStrategy = strategy; + for (int i = 0; i < reps; i++) { + r += localStrategy.read(localData, localCharset).hashCode(); + } + return r; + } +} diff --git a/guava/src/com/google/common/io/ByteSource.java b/guava/src/com/google/common/io/ByteSource.java index 1beb9cacefca..7fa64b89c8ee 100644 --- a/guava/src/com/google/common/io/ByteSource.java +++ b/guava/src/com/google/common/io/ByteSource.java @@ -455,6 +455,18 @@ public Reader openStream() throws IOException { return new InputStreamReader(ByteSource.this.openStream(), charset); } + @Override + public String read() throws IOException { + // Reading all the data as a byte array is more efficient than the default read() + // implementation because: + // 1. the string constructor can avoid an extra copy most of the time by correctly sizing the + // internal char array (hard to avoid using StringBuilder) + // 2. we avoid extra copies into temporary buffers altogether + // The downside is that this will cause us to store the file bytes in memory twice for a short + // amount of time. + return new String(ByteSource.this.read(), charset); + } + @Override public String toString() { return ByteSource.this.toString() + ".asCharSource(" + charset + ")";