Skip to content

Commit 8c39d3f

Browse files
committed
PARQUET-1505: Use Java 7 NIO StandardCharsets
1 parent e7835e0 commit 8c39d3f

File tree

7 files changed

+30
-44
lines changed

7 files changed

+30
-44
lines changed

parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
*/
1919
package org.apache.parquet.avro;
2020

21-
import com.google.common.base.Charsets;
2221
import com.google.common.collect.ImmutableMap;
2322
import com.google.common.collect.Lists;
2423
import com.google.common.io.Resources;
@@ -27,6 +26,7 @@
2726
import java.math.BigDecimal;
2827
import java.math.BigInteger;
2928
import java.nio.ByteBuffer;
29+
import java.nio.charset.StandardCharsets;
3030
import java.util.ArrayList;
3131
import java.util.Arrays;
3232
import java.util.Collection;
@@ -369,7 +369,7 @@ public void testAll() throws Exception {
369369
.set("mylong", 2L)
370370
.set("myfloat", 3.1f)
371371
.set("mydouble", 4.1)
372-
.set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
372+
.set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
373373
.set("mystring", "hello")
374374
.set("mynestedrecord", nestedRecord)
375375
.set("myenum", "a")
@@ -398,7 +398,7 @@ public void testAll() throws Exception {
398398
assertEquals(2L, nextRecord.get("mylong"));
399399
assertEquals(3.1f, nextRecord.get("myfloat"));
400400
assertEquals(4.1, nextRecord.get("mydouble"));
401-
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
401+
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
402402
assertEquals(str("hello"), nextRecord.get("mystring"));
403403
assertEquals(expectedEnumSymbol, nextRecord.get("myenum"));
404404
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -567,7 +567,7 @@ public void write(Map<String, Object> record) {
567567
record.put("mylong", 2L);
568568
record.put("myfloat", 3.1f);
569569
record.put("mydouble", 4.1);
570-
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
570+
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
571571
record.put("mystring", "hello");
572572
record.put("myenum", "a");
573573
record.put("mynestedint", 1);
@@ -615,7 +615,7 @@ public void write(Map<String, Object> record) {
615615
assertEquals(2L, nextRecord.get("mylong"));
616616
assertEquals(3.1f, nextRecord.get("myfloat"));
617617
assertEquals(4.1, nextRecord.get("mydouble"));
618-
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
618+
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
619619
assertEquals(str("hello"), nextRecord.get("mystring"));
620620
assertEquals(str("a"), nextRecord.get("myenum")); // enum symbols are unknown
621621
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));

parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWriteOldListBehavior.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818
*/
1919
package org.apache.parquet.avro;
2020

21-
import com.google.common.base.Charsets;
2221
import com.google.common.collect.ImmutableMap;
2322
import com.google.common.collect.Lists;
2423
import com.google.common.io.Resources;
2524
import java.io.File;
2625
import java.nio.ByteBuffer;
26+
import java.nio.charset.StandardCharsets;
2727
import java.util.ArrayList;
2828
import java.util.Arrays;
2929
import java.util.Collection;
@@ -247,7 +247,7 @@ public void testAll() throws Exception {
247247
.set("mylong", 2L)
248248
.set("myfloat", 3.1f)
249249
.set("mydouble", 4.1)
250-
.set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
250+
.set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
251251
.set("mystring", "hello")
252252
.set("mynestedrecord", nestedRecord)
253253
.set("myenum", "a")
@@ -276,7 +276,7 @@ public void testAll() throws Exception {
276276
assertEquals(2L, nextRecord.get("mylong"));
277277
assertEquals(3.1f, nextRecord.get("myfloat"));
278278
assertEquals(4.1, nextRecord.get("mydouble"));
279-
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
279+
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
280280
assertEquals(str("hello"), nextRecord.get("mystring"));
281281
assertEquals(expectedEnumSymbol, nextRecord.get("myenum"));
282282
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -327,7 +327,7 @@ public void testArrayWithNullValues() throws Exception {
327327
.set("mylong", 2L)
328328
.set("myfloat", 3.1f)
329329
.set("mydouble", 4.1)
330-
.set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
330+
.set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
331331
.set("mystring", "hello")
332332
.set("mynestedrecord", nestedRecord)
333333
.set("myenum", "a")
@@ -512,7 +512,7 @@ public void write(Map<String, Object> record) {
512512
record.put("mylong", 2L);
513513
record.put("myfloat", 3.1f);
514514
record.put("mydouble", 4.1);
515-
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
515+
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
516516
record.put("mystring", "hello");
517517
record.put("myenum", "a");
518518
record.put("mynestedint", 1);
@@ -573,7 +573,7 @@ public void write(Map<String, Object> record) {
573573
assertEquals(2L, nextRecord.get("mylong"));
574574
assertEquals(3.1f, nextRecord.get("myfloat"));
575575
assertEquals(4.1, nextRecord.get("mydouble"));
576-
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
576+
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
577577
assertEquals(str("hello"), nextRecord.get("mystring"));
578578
assertEquals(str("a"), nextRecord.get("myenum"));
579579
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));

parquet-cli/src/main/java/org/apache/parquet/cli/BaseCommand.java

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
package org.apache.parquet.cli;
2121

2222
import com.beust.jcommander.internal.Lists;
23-
import com.google.common.annotations.VisibleForTesting;
2423
import com.google.common.base.Preconditions;
2524
import com.google.common.io.CharStreams;
2625
import com.google.common.io.Resources;
@@ -52,17 +51,14 @@
5251
import java.net.MalformedURLException;
5352
import java.net.URI;
5453
import java.net.URL;
55-
import java.nio.charset.Charset;
54+
import java.nio.charset.StandardCharsets;
5655
import java.security.AccessController;
5756
import java.util.Iterator;
5857
import java.util.List;
5958
import java.util.NoSuchElementException;
6059

6160
public abstract class BaseCommand implements Command, Configurable {
6261

63-
@VisibleForTesting
64-
static final Charset UTF8 = Charset.forName("utf8");
65-
6662
private static final String RESOURCE_URI_SCHEME = "resource";
6763
private static final String STDIN_AS_SOURCE = "stdin";
6864

@@ -103,7 +99,7 @@ public void output(String content, Logger console, String filename)
10399
} else {
104100
FSDataOutputStream outgoing = create(filename);
105101
try {
106-
outgoing.write(content.getBytes(UTF8));
102+
outgoing.write(content.getBytes(StandardCharsets.UTF_8));
107103
} finally {
108104
outgoing.close();
109105
}

parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -23,20 +23,16 @@
2323
import java.io.ObjectStreamException;
2424
import java.io.OutputStream;
2525
import java.io.Serializable;
26-
import java.io.UnsupportedEncodingException;
2726
import java.nio.ByteBuffer;
2827
import java.nio.CharBuffer;
2928
import java.nio.charset.CharacterCodingException;
3029
import java.nio.charset.CharsetEncoder;
3130
import java.nio.charset.StandardCharsets;
3231
import java.util.Arrays;
3332

34-
import org.apache.parquet.io.ParquetDecodingException;
3533
import org.apache.parquet.io.ParquetEncodingException;
3634
import org.apache.parquet.schema.PrimitiveComparator;
3735

38-
import static org.apache.parquet.bytes.BytesUtils.UTF8;
39-
4036
abstract public class Binary implements Comparable<Binary>, Serializable {
4137

4238
protected boolean isBackingBytesReused;
@@ -133,11 +129,10 @@ public ByteArraySliceBackedBinary(byte[] value, int offset, int length, boolean
133129

134130
@Override
135131
public String toStringUsingUTF8() {
136-
return UTF8.decode(ByteBuffer.wrap(value, offset, length)).toString();
137-
// TODO: figure out why the following line was much slower
138-
// rdb: new String(...) is slower because it instantiates a new Decoder,
139-
// while Charset#decode uses a thread-local decoder cache
140-
// return new String(value, offset, length, BytesUtils.UTF8);
132+
// Charset#decode uses a thread-local decoder cache and is faster than
133+
// new String(...) which instantiates a new Decoder per invocation
134+
return StandardCharsets.UTF_8
135+
.decode(ByteBuffer.wrap(value, offset, length)).toString();
141136
}
142137

143138
@Override
@@ -220,11 +215,7 @@ public String toString() {
220215
}
221216

222217
private static ByteBuffer encodeUTF8(String value) {
223-
try {
224-
return ByteBuffer.wrap(value.getBytes("UTF-8"));
225-
} catch (UnsupportedEncodingException e) {
226-
throw new ParquetEncodingException("UTF-8 not supported.", e);
227-
}
218+
return ByteBuffer.wrap(value.getBytes(StandardCharsets.UTF_8));
228219
}
229220
}
230221

@@ -284,7 +275,7 @@ public ByteArrayBackedBinary(byte[] value, boolean isBackingBytesReused) {
284275

285276
@Override
286277
public String toStringUsingUTF8() {
287-
return UTF8.decode(ByteBuffer.wrap(value)).toString();
278+
return StandardCharsets.UTF_8.decode(ByteBuffer.wrap(value)).toString();
288279
}
289280

290281
@Override
@@ -393,11 +384,8 @@ public ByteBufferBackedBinary(ByteBuffer value, int offset, int length, boolean
393384
public String toStringUsingUTF8() {
394385
String ret;
395386
if (value.hasArray()) {
396-
try {
397-
ret = new String(value.array(), value.arrayOffset() + offset, length, "UTF-8");
398-
} catch (UnsupportedEncodingException e) {
399-
throw new ParquetDecodingException("UTF-8 not supported");
400-
}
387+
ret = new String(value.array(), value.arrayOffset() + offset, length,
388+
StandardCharsets.UTF_8);
401389
} else {
402390
int limit = value.limit();
403391
value.limit(offset+length);
@@ -406,7 +394,7 @@ public String toStringUsingUTF8() {
406394
// no corresponding interface to read a subset of a buffer, would have to slice it
407395
// which creates another ByteBuffer object or do what is done here to adjust the
408396
// limit/offset and set them back after
409-
ret = UTF8.decode(value).toString();
397+
ret = StandardCharsets.UTF_8.decode(value).toString();
410398
value.limit(limit);
411399
value.position(position);
412400
}

parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
2828

2929
import java.io.IOException;
30-
import java.io.UnsupportedEncodingException;
3130
import java.nio.ByteBuffer;
31+
import java.nio.charset.StandardCharsets;
3232

3333
import org.apache.parquet.bytes.ByteBufferInputStream;
3434
import org.junit.Assert;
@@ -627,9 +627,8 @@ private void writeRepeated(int COUNT, ValuesWriter cw, String prefix) {
627627
}
628628
}
629629

630-
private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw,
631-
String prefix) throws UnsupportedEncodingException {
632-
Binary reused = Binary.fromReusedByteArray((prefix + "0").getBytes("UTF-8"));
630+
private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw, String prefix) {
631+
Binary reused = Binary.fromReusedByteArray((prefix + "0").getBytes(StandardCharsets.UTF_8));
633632
for (int i = 0; i < COUNT; i++) {
634633
Binary content = Binary.fromString(prefix + i % 10);
635634
System.arraycopy(content.getBytesUnsafe(), 0, reused.getBytesUnsafe(), 0, reused.length());

parquet-common/src/main/java/org/apache/parquet/bytes/BytesUtils.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.io.OutputStream;
2525
import java.nio.ByteBuffer;
2626
import java.nio.charset.Charset;
27+
import java.nio.charset.StandardCharsets;
2728

2829
import org.slf4j.Logger;
2930
import org.slf4j.LoggerFactory;
@@ -34,6 +35,8 @@
3435
public class BytesUtils {
3536
private static final Logger LOG = LoggerFactory.getLogger(BytesUtils.class);
3637

38+
/** @deprecated Use {@link StandardCharsets#UTF_8} instead */
39+
@Deprecated
3740
public static final Charset UTF8 = Charset.forName("UTF-8");
3841

3942
/**

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;
2525

2626
import java.io.IOException;
27-
import java.nio.charset.Charset;
27+
import java.nio.charset.StandardCharsets;
2828
import java.util.ArrayList;
2929
import java.util.Arrays;
3030
import java.util.Collections;
@@ -101,7 +101,7 @@ public class ParquetFileWriter {
101101

102102
public static final String PARQUET_METADATA_FILE = "_metadata";
103103
public static final String MAGIC_STR = "PAR1";
104-
public static final byte[] MAGIC = MAGIC_STR.getBytes(Charset.forName("ASCII"));
104+
public static final byte[] MAGIC = MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
105105
public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
106106
public static final int CURRENT_VERSION = 1;
107107

0 commit comments

Comments (0)