diff --git a/.github/config/labeler-config.yml b/.github/config/labeler-config.yml index f968947a02e2..00a1b2948657 100644 --- a/.github/config/labeler-config.yml +++ b/.github/config/labeler-config.yml @@ -2,7 +2,7 @@ "tests:hive": - lib/trino-orc/** - lib/trino-parquet/** - - lib/trino-rcfile/** + - lib/trino-hive-formats/** - plugin/trino-hive-hadoop2/** - plugin/trino-hive/** - testing/trino-product-tests/** diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/FileRcFileDataSource.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/local/LocalInput.java similarity index 58% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/FileRcFileDataSource.java rename to lib/trino-filesystem/src/main/java/io/trino/filesystem/local/LocalInput.java index 21d3188188a9..a6c9f291a518 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/FileRcFileDataSource.java +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/local/LocalInput.java @@ -11,78 +11,65 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.filesystem.local; + +import io.trino.filesystem.TrinoInput; +import org.apache.iceberg.Files; +import org.apache.iceberg.io.SeekableInputStream; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; +import static java.lang.Math.min; import static java.util.Objects.requireNonNull; -public class FileRcFileDataSource - implements RcFileDataSource +class LocalInput + implements TrinoInput { - private final File path; - private final long size; + private final File file; private final RandomAccessFile input; - private long readTimeNanos; - private long readBytes; - - public FileRcFileDataSource(File path) - throws IOException - { - this.path = requireNonNull(path, "path is null"); - this.size = path.length(); - this.input = new RandomAccessFile(path, "r"); - } - @Override - public void close() + public LocalInput(File file) throws IOException { - input.close(); - } - - @Override - public long getReadBytes() - { - return readBytes; + this.file = requireNonNull(file, "file is null"); + this.input = new RandomAccessFile(file, "r"); } @Override - public long getReadTimeNanos() + public SeekableInputStream inputStream() { - return readTimeNanos; - } - - @Override - public long getSize() - { - return size; + return Files.localInput(file).newStream(); } @Override public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) throws IOException { - long start = System.nanoTime(); - input.seek(position); input.readFully(buffer, bufferOffset, bufferLength); + } - readTimeNanos += System.nanoTime() - start; - readBytes += bufferLength; + @Override + public int readTail(byte[] buffer, int bufferOffset, int bufferLength) + throws IOException + { + int readSize = (int) min(file.length(), bufferLength); + readFully(file.length() - readSize, buffer, bufferOffset, readSize); + return readSize; } @Override - public RcFileDataSourceId getId() + public void close() + throws IOException { - return new RcFileDataSourceId(path.getPath()); + input.close(); } @Override public String toString() { - return path.getPath(); + return file.getPath(); } } diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/local/LocalInputFile.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/local/LocalInputFile.java new file mode 100644 index 000000000000..a0475d5fc415 --- /dev/null +++ 
b/lib/trino-filesystem/src/main/java/io/trino/filesystem/local/LocalInputFile.java @@ -0,0 +1,73 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.filesystem.local; + +import io.trino.filesystem.TrinoInput; +import io.trino.filesystem.TrinoInputFile; + +import java.io.File; +import java.io.IOException; + +import static java.util.Objects.requireNonNull; + +public class LocalInputFile + implements TrinoInputFile +{ + private final File file; + + public LocalInputFile(File file) + { + this.file = requireNonNull(file, "file is null"); + } + + @Override + public TrinoInput newInput() + throws IOException + { + return new LocalInput(file); + } + + @Override + public long length() + throws IOException + { + return file.length(); + } + + @Override + public long modificationTime() + throws IOException + { + return file.lastModified(); + } + + @Override + public boolean exists() + throws IOException + { + return file.exists(); + } + + @Override + public String location() + { + return file.getPath(); + } + + @Override + public String toString() + { + return file.getPath(); + } +} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/MemoryRcFileDataSource.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemoryInput.java similarity index 59% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/MemoryRcFileDataSource.java rename to lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemoryInput.java index 9e2710f9bb16..85a185ef8d56 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/MemoryRcFileDataSource.java +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemoryInput.java @@ -11,57 +11,54 @@ * See the License for the specific language governing permissions and * limitations under the License. 
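The rename above replaces the RCFile-specific RcFileDataSource with the generic TrinoInputFile/TrinoInput pair from trino-filesystem. A minimal usage sketch, assuming only the interfaces visible in this diff (ReadTailExample and readTail are illustrative names):

    import io.trino.filesystem.TrinoInput;
    import io.trino.filesystem.TrinoInputFile;
    import io.trino.filesystem.local.LocalInputFile;

    import java.io.File;
    import java.io.IOException;
    import java.util.Arrays;

    class ReadTailExample
    {
        // reads up to maxBytes from the end of a local file via the new API
        static byte[] readTail(File file, int maxBytes)
                throws IOException
        {
            TrinoInputFile inputFile = new LocalInputFile(file);
            try (TrinoInput input = inputFile.newInput()) {
                byte[] buffer = new byte[maxBytes];
                // readTail returns the bytes actually copied, which can be
                // fewer than maxBytes when the file is shorter than the tail
                int read = input.readTail(buffer, 0, buffer.length);
                return Arrays.copyOf(buffer, read);
            }
        }
    }
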
*/ -package io.trino.rcfile; +package io.trino.filesystem.memory; import io.airlift.slice.Slice; +import io.trino.filesystem.TrinoInput; +import org.apache.iceberg.io.SeekableInputStream; +import static java.lang.Math.min; import static java.lang.Math.toIntExact; import static java.util.Objects.requireNonNull; -public class MemoryRcFileDataSource - implements RcFileDataSource +class MemoryInput + implements TrinoInput { - private final RcFileDataSourceId id; + private final String location; private final Slice data; - private long readBytes; - public MemoryRcFileDataSource(RcFileDataSourceId id, Slice data) + public MemoryInput(String location, Slice data) { - this.id = requireNonNull(id, "id is null"); + this.location = requireNonNull(location, "location is null"); this.data = requireNonNull(data, "data is null"); } @Override - public RcFileDataSourceId getId() + public SeekableInputStream inputStream() { - return id; + return new MemorySeekableInputStream(data); } @Override - public long getReadBytes() + public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) { - return readBytes; + data.getBytes(toIntExact(position), buffer, bufferOffset, bufferLength); } @Override - public long getReadTimeNanos() + public int readTail(byte[] buffer, int bufferOffset, int bufferLength) { - return 0; + int readSize = min(data.length(), bufferLength); + readFully(data.length() - readSize, buffer, bufferOffset, readSize); + return readSize; } @Override - public long getSize() - { - return data.length(); - } + public void close() {} @Override - public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) + public String toString() { - data.getBytes(toIntExact(position), buffer, bufferOffset, bufferLength); - readBytes += bufferLength; + return location; } - - @Override - public void close() {} } diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemoryInputFile.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemoryInputFile.java new file mode 100644 index 000000000000..81b66ce986e4 --- /dev/null +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemoryInputFile.java @@ -0,0 +1,75 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.filesystem.memory; + +import io.airlift.slice.Slice; +import io.trino.filesystem.TrinoInput; +import io.trino.filesystem.TrinoInputFile; + +import java.io.IOException; + +import static java.util.Objects.requireNonNull; + +public class MemoryInputFile + implements TrinoInputFile +{ + private final String location; + private final Slice data; + + public MemoryInputFile(String location, Slice data) + { + this.location = requireNonNull(location, "location is null"); + this.data = requireNonNull(data, "data is null"); + } + + @Override + public TrinoInput newInput() + throws IOException + { + return new MemoryInput(location, data); + } + + @Override + public long length() + throws IOException + { + return data.length(); + } + + @Override + public long modificationTime() + throws IOException + { + return 0; + } + + @Override + public boolean exists() + throws IOException + { + return true; + } + + @Override + public String location() + { + return location; + } + + @Override + public String toString() + { + return location; + } +} diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemorySeekableInputStream.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemorySeekableInputStream.java new file mode 100644 index 000000000000..966f0eb176da --- /dev/null +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemorySeekableInputStream.java @@ -0,0 +1,62 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
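MemoryInputFile above backs a TrinoInputFile with a Slice, which is handy for tests that should not touch disk. A sketch under the same assumptions (MemoryInputFileExample is an illustrative name):

    import io.airlift.slice.Slices;
    import io.trino.filesystem.TrinoInput;
    import io.trino.filesystem.TrinoInputFile;
    import io.trino.filesystem.memory.MemoryInputFile;

    import java.io.IOException;

    class MemoryInputFileExample
    {
        static byte[] readAll()
                throws IOException
        {
            TrinoInputFile inputFile = new MemoryInputFile("memory:///test", Slices.utf8Slice("hello"));
            byte[] buffer = new byte[(int) inputFile.length()];
            try (TrinoInput input = inputFile.newInput()) {
                // positioned read of the whole content; no stream state involved
                input.readFully(0, buffer, 0, buffer.length);
            }
            return buffer;
        }
    }
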
+ */ +package io.trino.filesystem.memory; + +import io.airlift.slice.Slice; +import io.airlift.slice.SliceInput; +import org.apache.iceberg.io.SeekableInputStream; + +import java.io.IOException; + +public class MemorySeekableInputStream + extends SeekableInputStream +{ + private final SliceInput input; + + public MemorySeekableInputStream(Slice data) + { + input = data.getInput(); + } + + @Override + public long getPos() + { + return input.position(); + } + + @Override + public void seek(long newPos) + { + input.setPosition(newPos); + } + + @Override + public int read() + throws IOException + { + return input.read(); + } + + @Override + public int read(byte[] destination, int destinationIndex, int length) + { + return input.read(destination, destinationIndex, length); + } + + @Override + public long skip(long length) + { + return input.skip(length); + } +} diff --git a/lib/trino-rcfile/pom.xml b/lib/trino-hive-formats/pom.xml similarity index 89% rename from lib/trino-rcfile/pom.xml rename to lib/trino-hive-formats/pom.xml index 377a4a845c16..03d915cb137e 100644 --- a/lib/trino-rcfile/pom.xml +++ b/lib/trino-hive-formats/pom.xml @@ -9,15 +9,20 @@ ../../pom.xml - trino-rcfile - trino-rcfile - Trino - RCFile + trino-hive-formats + trino-hive-formats + Trino - Hive Formats ${project.parent.basedir} + + io.trino + trino-filesystem + + io.trino trino-hadoop-toolkit @@ -65,6 +70,11 @@ joda-time + + org.apache.iceberg + iceberg-api + + org.openjdk.jol jol-core diff --git a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/DataOutputStream.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/DataOutputStream.java new file mode 100644 index 000000000000..a6aaae2ab218 --- /dev/null +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/DataOutputStream.java @@ -0,0 +1,331 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.hive.formats; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.openjdk.jol.info.ClassLayout; + +import java.io.Closeable; +import java.io.DataOutput; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Arrays; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.airlift.slice.SizeOf.SIZE_OF_BYTE; +import static io.airlift.slice.SizeOf.SIZE_OF_INT; +import static io.airlift.slice.SizeOf.SIZE_OF_LONG; +import static io.airlift.slice.SizeOf.SIZE_OF_SHORT; +import static java.lang.Math.toIntExact; + +public final class DataOutputStream + extends OutputStream + implements DataOutput +{ + private static final int DEFAULT_BUFFER_SIZE = 4 * 1024; + private static final int MINIMUM_CHUNK_SIZE = 1024; + + private static final int INSTANCE_SIZE = toIntExact(ClassLayout.parseClass(DataOutputStream.class).instanceSize()); + + private final OutputStream outputStream; + + private final Slice slice; + private final byte[] buffer; + + /** + * Offset of buffer within stream. 
+ */ + private long bufferOffset; + /** + * Current position for writing in buffer. + */ + private int bufferPosition; + + public DataOutputStream(OutputStream inputStream) + { + this(inputStream, DEFAULT_BUFFER_SIZE); + } + + public DataOutputStream(OutputStream outputStream, int bufferSize) + { + checkArgument(bufferSize >= MINIMUM_CHUNK_SIZE, "minimum buffer size of " + MINIMUM_CHUNK_SIZE + " required"); + if (outputStream == null) { + throw new NullPointerException("outputStream is null"); + } + + this.outputStream = outputStream; + this.buffer = new byte[bufferSize]; + this.slice = Slices.wrappedBuffer(buffer); + } + + @Override + public void flush() + throws IOException + { + flushBufferToOutputStream(); + outputStream.flush(); + } + + @Override + public void close() + throws IOException + { + try (Closeable ignored = outputStream) { + flushBufferToOutputStream(); + } + } + + public long longSize() + { + return bufferOffset + bufferPosition; + } + + public long getRetainedSize() + { + return slice.getRetainedSize() + INSTANCE_SIZE; + } + + @Override + public void writeBoolean(boolean value) + throws IOException + { + writeByte(value ? 1 : 0); + } + + @Override + public void write(int value) + throws IOException + { + writeByte(value); + } + + @Override + public void writeByte(int value) + throws IOException + { + ensureWritableBytes(SIZE_OF_BYTE); + slice.setByte(bufferPosition, value); + bufferPosition += SIZE_OF_BYTE; + } + + @Override + public void writeShort(int value) + throws IOException + { + ensureWritableBytes(SIZE_OF_SHORT); + slice.setShort(bufferPosition, value); + bufferPosition += SIZE_OF_SHORT; + } + + @Override + public void writeInt(int value) + throws IOException + { + ensureWritableBytes(SIZE_OF_INT); + slice.setInt(bufferPosition, value); + bufferPosition += SIZE_OF_INT; + } + + @Override + public void writeLong(long value) + throws IOException + { + ensureWritableBytes(SIZE_OF_LONG); + slice.setLong(bufferPosition, value); + bufferPosition += SIZE_OF_LONG; + } + + @Override + public void writeFloat(float value) + throws IOException + { + writeInt(Float.floatToIntBits(value)); + } + + @Override + public void writeDouble(double value) + throws IOException + { + writeLong(Double.doubleToLongBits(value)); + } + + public void write(Slice source) + throws IOException + { + write(source, 0, source.length()); + } + + public void write(Slice source, int sourceIndex, int length) + throws IOException + { + // Write huge chunks direct to OutputStream + if (length >= MINIMUM_CHUNK_SIZE) { + flushBufferToOutputStream(); + writeToOutputStream(source, sourceIndex, length); + bufferOffset += length; + } + else { + ensureWritableBytes(length); + slice.setBytes(bufferPosition, source, sourceIndex, length); + bufferPosition += length; + } + } + + @Override + public void write(byte[] source) + throws IOException + { + write(source, 0, source.length); + } + + @Override + public void write(byte[] source, int sourceIndex, int length) + throws IOException + { + // Write huge chunks direct to OutputStream + if (length >= MINIMUM_CHUNK_SIZE) { + flushBufferToOutputStream(); + writeToOutputStream(source, sourceIndex, length); + bufferOffset += length; + } + else { + ensureWritableBytes(length); + slice.setBytes(bufferPosition, source, sourceIndex, length); + bufferPosition += length; + } + } + + public void write(InputStream in, int length) + throws IOException + { + while (length > 0) { + int batch = ensureBatchSize(length); + slice.setBytes(bufferPosition, in, batch); + bufferPosition += 
batch; + length -= batch; + } + } + + public void writeZero(int length) + throws IOException + { + checkArgument(length >= 0, "length must be 0 or greater than 0."); + + while (length > 0) { + int batch = ensureBatchSize(length); + Arrays.fill(buffer, bufferPosition, bufferPosition + batch, (byte) 0); + bufferPosition += batch; + length -= batch; + } + } + + @Override + public String toString() + { + StringBuilder builder = new StringBuilder("OutputStreamSliceOutputAdapter{"); + builder.append("outputStream=").append(outputStream); + builder.append("bufferSize=").append(slice.length()); + builder.append('}'); + return builder.toString(); + } + + private void ensureWritableBytes(int minWritableBytes) + throws IOException + { + if (bufferPosition + minWritableBytes > slice.length()) { + flushBufferToOutputStream(); + } + } + + private int ensureBatchSize(int length) + throws IOException + { + ensureWritableBytes(Math.min(MINIMUM_CHUNK_SIZE, length)); + return Math.min(length, slice.length() - bufferPosition); + } + + private void flushBufferToOutputStream() + throws IOException + { + writeToOutputStream(buffer, 0, bufferPosition); + bufferOffset += bufferPosition; + bufferPosition = 0; + } + + private void writeToOutputStream(byte[] source, int sourceIndex, int length) + throws IOException + { + outputStream.write(source, sourceIndex, length); + } + + private void writeToOutputStream(Slice source, int sourceIndex, int length) + throws IOException + { + source.getBytes(sourceIndex, outputStream, length); + } + + // + // Unsupported operations + // + + /** + * Unsupported operation + * + * @throws UnsupportedOperationException always + */ + @Override + @Deprecated + public void writeChar(int value) + { + throw new UnsupportedOperationException(); + } + + /** + * Unsupported operation + * + * @throws UnsupportedOperationException always + */ + @Override + @Deprecated + public void writeChars(String s) + { + throw new UnsupportedOperationException(); + } + + /** + * Unsupported operation + * + * @throws UnsupportedOperationException always + */ + @Override + @Deprecated + public void writeUTF(String s) + { + throw new UnsupportedOperationException(); + } + + /** + * Unsupported operation + * + * @throws UnsupportedOperationException always + */ + @Override + @Deprecated + public void writeBytes(String s) + { + throw new UnsupportedOperationException(); + } +} diff --git a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/DataSeekableInputStream.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/DataSeekableInputStream.java new file mode 100644 index 000000000000..2ec6d94e0a9f --- /dev/null +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/DataSeekableInputStream.java @@ -0,0 +1,457 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
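DataOutputStream above buffers writes in a Slice-backed array and writes chunks of MINIMUM_CHUNK_SIZE (1024) bytes or more directly to the underlying stream. A usage sketch (illustrative names; note that multi-byte values follow airlift Slice byte order, little-endian, rather than the big-endian DataOutput contract):

    import io.trino.hive.formats.DataOutputStream;

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;

    class DataOutputStreamExample
    {
        static long writeHeader(ByteArrayOutputStream target)
                throws IOException
        {
            try (DataOutputStream out = new DataOutputStream(target)) {
                out.writeInt(0xCAFE);  // buffered until the 4K buffer fills
                out.writeLong(42L);
                out.writeZero(4);      // zero padding
                return out.longSize(); // bytes written so far, buffered or flushed
            }
        }
    }
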
+ */ +package io.trino.hive.formats; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.apache.iceberg.io.SeekableInputStream; +import org.openjdk.jol.info.ClassLayout; + +import java.io.DataInput; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Verify.verify; +import static io.airlift.slice.SizeOf.SIZE_OF_BYTE; +import static io.airlift.slice.SizeOf.SIZE_OF_INT; +import static io.airlift.slice.SizeOf.SIZE_OF_LONG; +import static io.airlift.slice.SizeOf.SIZE_OF_SHORT; +import static io.airlift.slice.SizeOf.sizeOf; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; + +public final class DataSeekableInputStream + extends InputStream + implements DataInput +{ + private static final int INSTANCE_SIZE = toIntExact(ClassLayout.parseClass(DataSeekableInputStream.class).instanceSize()); + private static final int DEFAULT_BUFFER_SIZE = 4 * 1024; + private static final int MINIMUM_CHUNK_SIZE = 1024; + + private final SeekableInputStream inputStream; + private long readTimeNanos; + private long readBytes; + + private final byte[] buffer; + private final Slice slice; + /** + * Offset of buffer within stream. + */ + private long bufferOffset; + /** + * Current position for reading from buffer. + */ + private int bufferPosition; + + private int bufferFill; + + public DataSeekableInputStream(SeekableInputStream inputStream) + { + this(inputStream, DEFAULT_BUFFER_SIZE); + } + + public DataSeekableInputStream(SeekableInputStream inputStream, int bufferSize) + { + requireNonNull(inputStream, "inputStream is null"); + checkArgument(bufferSize >= MINIMUM_CHUNK_SIZE, "minimum buffer size of " + MINIMUM_CHUNK_SIZE + " required"); + + this.inputStream = inputStream; + this.buffer = new byte[bufferSize]; + this.slice = Slices.wrappedBuffer(buffer); + } + + public long getReadTimeNanos() + { + return readTimeNanos; + } + + public long getReadBytes() + { + return readBytes; + } + + public long getPos() + throws IOException + { + return checkedCast(bufferOffset + bufferPosition); + } + + public void seek(long newPos) + throws IOException + { + // todo check if new position is within the current buffer + + // drop current buffer + bufferPosition = 0; + bufferFill = 0; + + // skip the rest in inputStream + inputStream.seek(newPos); + + // update buffer offset to the new position + bufferOffset = newPos; + + verify(newPos == getPos()); + } + + @Override + public int available() + throws IOException + { + if (bufferPosition < bufferFill) { + return availableBytes(); + } + + return fillBuffer(); + } + + @Override + public int skipBytes(int n) + throws IOException + { + return (int) skip(n); + } + + @Override + public boolean readBoolean() + throws IOException + { + return readByte() != 0; + } + + @Override + public byte readByte() + throws IOException + { + ensureAvailable(SIZE_OF_BYTE); + byte v = slice.getByte(bufferPosition); + bufferPosition += SIZE_OF_BYTE; + return v; + } + + @Override + public int readUnsignedByte() + throws IOException + { + return readByte() & 0xFF; + } + + @Override + public short readShort() + throws IOException + { + ensureAvailable(SIZE_OF_SHORT); + short v = slice.getShort(bufferPosition); + bufferPosition += SIZE_OF_SHORT; + return v; + } + + @Override + public int readUnsignedShort() + throws IOException + { + return readShort() & 0xFFFF; + } + + 
@Override + public int readInt() + throws IOException + { + ensureAvailable(SIZE_OF_INT); + int v = slice.getInt(bufferPosition); + bufferPosition += SIZE_OF_INT; + return v; + } + + /** + * Gets an unsigned 32-bit integer at the current {@code position} + * and increases the {@code position} by {@code 4} in this buffer. + * + * @throws IndexOutOfBoundsException if {@code this.available()} is less than {@code 4} + */ + public long readUnsignedInt() + throws IOException + { + return readInt() & 0xFFFFFFFFL; + } + + @Override + public long readLong() + throws IOException + { + ensureAvailable(SIZE_OF_LONG); + long v = slice.getLong(bufferPosition); + bufferPosition += SIZE_OF_LONG; + return v; + } + + @Override + public float readFloat() + throws IOException + { + return Float.intBitsToFloat(readInt()); + } + + @Override + public double readDouble() + throws IOException + { + return Double.longBitsToDouble(readLong()); + } + + @Override + public int read() + throws IOException + { + if (available() == 0) { + return -1; + } + + verify(availableBytes() > 0); + int v = slice.getByte(bufferPosition) & 0xFF; + bufferPosition += SIZE_OF_BYTE; + return v; + } + + @Override + public long skip(long length) + throws IOException + { + int availableBytes = availableBytes(); + // is skip within the current buffer? + if (availableBytes >= length) { + bufferPosition += length; + return length; + } + + // drop current buffer + bufferPosition = bufferFill; + + // skip the rest in inputStream + long start = System.nanoTime(); + long inputStreamSkip = inputStream.skip(length - availableBytes); + readTimeNanos += System.nanoTime() - start; + readBytes += inputStreamSkip; + + bufferOffset += inputStreamSkip; + return availableBytes + inputStreamSkip; + } + + @Override + public int read(byte[] destination) + throws IOException + { + return read(destination, 0, destination.length); + } + + @Override + public int read(byte[] destination, int destinationIndex, int length) + throws IOException + { + if (available() == 0) { + return -1; + } + + verify(availableBytes() > 0); + int batch = Math.min(availableBytes(), length); + slice.getBytes(bufferPosition, destination, destinationIndex, batch); + bufferPosition += batch; + return batch; + } + + @Override + public void readFully(byte[] destination) + throws IOException + { + readFully(destination, 0, destination.length); + } + + @Override + public void readFully(byte[] destination, int destinationIndex, int length) + throws IOException + { + while (length > 0) { + int batch = Math.min(availableBytes(), length); + slice.getBytes(bufferPosition, destination, destinationIndex, batch); + + bufferPosition += batch; + destinationIndex += batch; + length -= batch; + + ensureAvailable(Math.min(length, MINIMUM_CHUNK_SIZE)); + } + } + + public Slice readSlice(int length) + throws IOException + { + if (length == 0) { + return Slices.EMPTY_SLICE; + } + + Slice newSlice = Slices.allocate(length); + readFully(newSlice, 0, length); + return newSlice; + } + + public void readFully(Slice destination) + throws IOException + { + readFully(destination, 0, destination.length()); + } + + public void readFully(Slice destination, int destinationIndex, int length) + throws IOException + { + while (length > 0) { + int batch = Math.min(availableBytes(), length); + slice.getBytes(bufferPosition, destination, destinationIndex, batch); + + bufferPosition += batch; + destinationIndex += batch; + length -= batch; + + ensureAvailable(Math.min(length, MINIMUM_CHUNK_SIZE)); + } + } + + public void 
readFully(OutputStream out, int length) + throws IOException + { + while (length > 0) { + int batch = Math.min(availableBytes(), length); + out.write(buffer, bufferPosition, batch); + + bufferPosition += batch; + length -= batch; + + ensureAvailable(Math.min(length, MINIMUM_CHUNK_SIZE)); + } + } + + @Override + public void close() + throws IOException + { + inputStream.close(); + } + + public long getRetainedSize() + { + return INSTANCE_SIZE + sizeOf(buffer); + } + + private int availableBytes() + { + return bufferFill - bufferPosition; + } + + private void ensureAvailable(int size) + throws IOException + { + if (bufferPosition + size < bufferFill) { + return; + } + + if (fillBuffer() < size) { + throw new EOFException("End of stream"); + } + } + + private int fillBuffer() + throws IOException + { + // Keep the rest + int rest = bufferFill - bufferPosition; + // Use System.arraycopy for small copies + System.arraycopy(buffer, bufferPosition, buffer, 0, rest); + + bufferFill = rest; + bufferOffset += bufferPosition; + bufferPosition = 0; + // Fill buffer with a minimum of bytes + long start = System.nanoTime(); + while (bufferFill < MINIMUM_CHUNK_SIZE) { + int bytesRead = inputStream.read(buffer, bufferFill, buffer.length - bufferFill); + if (bytesRead < 0) { + break; + } + + readBytes += bytesRead; + bufferFill += bytesRead; + } + readTimeNanos += System.nanoTime() - start; + + return bufferFill; + } + + private static int checkedCast(long value) + { + int result = (int) value; + checkArgument(result == value, "Size is greater than maximum int value"); + return result; + } + + // + // Unsupported operations + // + + @Override + @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") + @Deprecated + public void mark(int readLimit) + { + throw new UnsupportedOperationException(); + } + + @Override + @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") + @Deprecated + public void reset() + + { + throw new UnsupportedOperationException(); + } + + @Override + @Deprecated + public boolean markSupported() + { + throw new UnsupportedOperationException(); + } + + @Override + @Deprecated + public char readChar() + { + throw new UnsupportedOperationException(); + } + + @Override + @Deprecated + public String readLine() + { + throw new UnsupportedOperationException(); + } + + @Override + @Deprecated + public String readUTF() + { + throw new UnsupportedOperationException(); + } +} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDecoderUtils.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ReadWriteUtils.java similarity index 69% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDecoderUtils.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ReadWriteUtils.java index ca496ad6d563..7d9a73619a49 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDecoderUtils.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/ReadWriteUtils.java @@ -11,12 +11,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
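DataSeekableInputStream above adds buffering, positioning, and read statistics on top of any Iceberg SeekableInputStream. A sketch pairing it with the in-memory stream from trino-filesystem (illustrative names):

    import io.airlift.slice.Slices;
    import io.trino.filesystem.memory.MemorySeekableInputStream;
    import io.trino.hive.formats.DataSeekableInputStream;

    import java.io.IOException;

    class DataSeekableInputStreamExample
    {
        static int readIntAt(byte[] bytes, long position)
                throws IOException
        {
            try (DataSeekableInputStream in = new DataSeekableInputStream(
                    new MemorySeekableInputStream(Slices.wrappedBuffer(bytes)))) {
                in.seek(position);   // drops the buffer and repositions the underlying stream
                return in.readInt(); // refills the buffer on demand
            }
        }
    }
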
*/ -package io.trino.rcfile; +package io.trino.hive.formats; import io.airlift.slice.Slice; import io.airlift.slice.SliceInput; import io.airlift.slice.SliceOutput; import io.airlift.slice.Slices; +import io.trino.filesystem.TrinoInput; +import io.trino.filesystem.TrinoInputFile; import io.trino.spi.type.CharType; import io.trino.spi.type.Type; import io.trino.spi.type.VarcharType; @@ -32,14 +34,12 @@ import static java.util.Objects.requireNonNull; // faster versions of org.apache.hadoop.io.WritableUtils methods adapted for Slice -public final class RcFileDecoderUtils +public final class ReadWriteUtils { // 0xFFFF_FFFF + syncFirst(long) + syncSecond(long) private static final int SYNC_SEQUENCE_LENGTH = SIZE_OF_INT + SIZE_OF_LONG + SIZE_OF_LONG; - private RcFileDecoderUtils() - { - } + private ReadWriteUtils() {} public static int decodeVIntSize(Slice slice, int offset) { @@ -67,6 +67,23 @@ public static boolean isNegativeVInt(byte value) return value < -120 || (value >= -112 && value < 0); } + public static long readVInt(DataSeekableInputStream in) + throws IOException + { + byte firstByte = in.readByte(); + int length = decodeVIntSize(firstByte); + if (length == 1) { + return firstByte; + } + + long value = 0; + for (int i = 1; i < length; i++) { + value <<= 8; + value |= (in.readByte() & 0xFF); + } + return isNegativeVInt(firstByte) ? ~value : value; + } + public static long readVInt(SliceInput in) { byte firstByte = in.readByte(); @@ -116,13 +133,13 @@ private static long readVIntInternal(Slice slice, int start, int length) /** * Find the beginning of the first full sync sequence that starts within the specified range. */ - public static long findFirstSyncPosition(RcFileDataSource dataSource, long offset, long length, long syncFirst, long syncSecond) + public static long findFirstSyncPosition(TrinoInputFile inputFile, long offset, long length, long syncFirst, long syncSecond) throws IOException { - requireNonNull(dataSource, "dataSource is null"); + requireNonNull(inputFile, "inputFile is null"); checkArgument(offset >= 0, "offset is negative"); checkArgument(length >= 1, "length must be at least 1"); - checkArgument(offset + length <= dataSource.getSize(), "offset plus length is greater than data size"); + checkArgument(offset + length <= inputFile.length(), "offset plus length is greater than data size"); // The full sync sequence is "0xFFFFFFFF syncFirst syncSecond". 
If // this sequence begins the file range, the start position is returned @@ -138,36 +155,69 @@ public static long findFirstSyncPosition(RcFileDataSource dataSource, long offse // this causes a re-read of SYNC_SEQUENCE_LENGTH bytes each time, but is much simpler code byte[] buffer = new byte[toIntExact(min(1 << 22, length + (SYNC_SEQUENCE_LENGTH - 1)))]; Slice bufferSlice = Slices.wrappedBuffer(buffer); - for (long position = 0; position < length; position += bufferSlice.length() - (SYNC_SEQUENCE_LENGTH - 1)) { - // either fill the buffer entirely, or read enough to allow all bytes in offset + length to be a start sequence - int bufferSize = toIntExact(min(buffer.length, length + (SYNC_SEQUENCE_LENGTH - 1) - position)); - // don't read off the end of the file - bufferSize = toIntExact(min(bufferSize, dataSource.getSize() - offset - position)); - - dataSource.readFully(offset + position, buffer, 0, bufferSize); - - // find the starting index position of the sync sequence - int index = bufferSlice.indexOf(sync); - if (index >= 0) { - // If the starting position is before the end of the search region, return the - // absolute start position of the sequence. - if (position + index < length) { - long startOfSyncSequence = offset + position + index; - return startOfSyncSequence; + try (TrinoInput input = inputFile.newInput()) { + for (long position = 0; position < length; position += bufferSlice.length() - (SYNC_SEQUENCE_LENGTH - 1)) { + // either fill the buffer entirely, or read enough to allow all bytes in offset + length to be a start sequence + int bufferSize = toIntExact(min(buffer.length, length + (SYNC_SEQUENCE_LENGTH - 1) - position)); + // don't read off the end of the file + bufferSize = toIntExact(min(bufferSize, inputFile.length() - offset - position)); + + input.readFully(offset + position, buffer, 0, bufferSize); + + // find the starting index position of the sync sequence + int index = bufferSlice.indexOf(sync); + if (index >= 0) { + // If the starting position is before the end of the search region, return the + // absolute start position of the sequence. + if (position + index < length) { + long startOfSyncSequence = offset + position + index; + return startOfSyncSequence; + } + // Otherwise, this is not a match for this region + // Note: this case isn't strictly needed as the loop will exit, but it is + // simpler to explicitly call it out. + return -1; } - // Otherwise, this is not a match for this region - // Note: this case isn't strictly needed as the loop will exit, but it is - // simpler to explicitly call it out. - return -1; } } return -1; } - public static void writeLengthPrefixedString(SliceOutput out, Slice slice) + public static void writeLengthPrefixedString(DataOutputStream out, Slice slice) + throws IOException { writeVInt(out, slice.length()); - out.writeBytes(slice); + out.write(slice); + } + + public static void writeVInt(DataOutputStream out, int value) + throws IOException + { + if (value >= -112 && value <= 127) { + out.writeByte(value); + return; + } + + int length = -112; + if (value < 0) { + value ^= -1; // take one's complement' + length = -120; + } + + int tmp = value; + while (tmp != 0) { + tmp = tmp >> 8; + length--; + } + + out.writeByte(length); + + length = (length < -120) ? 
-(length + 120) : -(length + 112); + + for (int idx = length; idx != 0; idx--) { + int shiftBits = (idx - 1) * 8; + out.writeByte((value >> shiftBits) & 0xFF); + } } public static void writeVInt(SliceOutput out, int value) diff --git a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/AircompressorCodec.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/AircompressorCodec.java new file mode 100644 index 000000000000..a5b4d3a46a16 --- /dev/null +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/AircompressorCodec.java @@ -0,0 +1,146 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.hive.formats.compression; + +import io.airlift.slice.DynamicSliceOutput; +import io.airlift.slice.Slice; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionInputStream; +import org.apache.hadoop.io.compress.CompressionOutputStream; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.UncheckedIOException; +import java.util.function.Supplier; + +import static java.util.Objects.requireNonNull; + +public class AircompressorCodec + implements Codec +{ + // Airlift Codecs are assumed to not retain memory and are assumed to not be pooled + private final CompressionCodec codec; + + public AircompressorCodec(CompressionCodec codec) + { + this.codec = requireNonNull(codec, "codec is null"); + } + + @Override + public OutputStream createStreamCompressor(OutputStream outputStream) + throws IOException + { + return codec.createOutputStream(outputStream); + } + + @Override + public ValueCompressor createValueCompressor() + { + return new AircompressorValueCompressor(codec); + } + + private static class AircompressorValueCompressor + implements ValueCompressor + { + private final CompressionCodec codec; + private final DynamicSliceOutput buffer; + + private AircompressorValueCompressor(CompressionCodec codec) + { + this.codec = requireNonNull(codec, "codec is null"); + this.buffer = new DynamicSliceOutput(1024); + } + + @Override + public Slice compress(Slice slice) + throws IOException + { + buffer.reset(); + try (CompressionOutputStream compressionStream = codec.createOutputStream(buffer, codec.createCompressor())) { + slice.getInput().transferTo(compressionStream); + } + return buffer.slice(); + } + } + + @Override + public MemoryCompressedSliceOutput createMemoryCompressedSliceOutput(int minChunkSize, int maxChunkSize) + { + return new AircompressorCompressedSliceOutputSupplier(codec, minChunkSize, maxChunkSize).get(); + } + + // this can be dramatically simplified when actual hadoop codecs are dropped + private static class AircompressorCompressedSliceOutputSupplier + implements Supplier + { + private final CompressionCodec codec; + private final ChunkedSliceOutput compressedOutput; + + public AircompressorCompressedSliceOutputSupplier(CompressionCodec codec, int minChunkSize, int maxChunkSize) + { + this.codec = 
requireNonNull(codec, "codec is null"); + this.compressedOutput = new ChunkedSliceOutput(minChunkSize, maxChunkSize); + } + + @Override + public MemoryCompressedSliceOutput get() + { + try { + compressedOutput.reset(); + CompressionOutputStream compressionStream = codec.createOutputStream(compressedOutput); + return new MemoryCompressedSliceOutput(compressionStream, compressedOutput, this, () -> {}); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + + @Override + public InputStream createStreamDecompressor(InputStream inputStream) + throws IOException + { + return codec.createInputStream(inputStream); + } + + @Override + public ValueDecompressor createValueDecompressor() + { + return new AircompressorValueDecompressor(codec); + } + + private static class AircompressorValueDecompressor + implements ValueDecompressor + { + private final CompressionCodec codec; + + private AircompressorValueDecompressor(CompressionCodec codec) + { + this.codec = requireNonNull(codec, "codec is null"); + } + + @Override + public void decompress(Slice compressed, Slice uncompressed) + throws IOException + { + try (CompressionInputStream decompressorStream = codec.createInputStream(compressed.getInput())) { + uncompressed.setBytes(0, decompressorStream, uncompressed.length()); + } + catch (IndexOutOfBoundsException | IOException e) { + throw new IOException("Compressed stream is truncated", e); + } + } + } +} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/BufferedOutputStreamSliceOutput.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/BufferedOutputStreamSliceOutput.java similarity index 99% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/BufferedOutputStreamSliceOutput.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/BufferedOutputStreamSliceOutput.java index 45df863814f9..0dcb3e009d7f 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/BufferedOutputStreamSliceOutput.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/BufferedOutputStreamSliceOutput.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.compression; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/ChunkedSliceOutput.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/ChunkedSliceOutput.java similarity index 99% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/ChunkedSliceOutput.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/ChunkedSliceOutput.java index ccebbb3070e2..740c223f9ffa 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/ChunkedSliceOutput.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/ChunkedSliceOutput.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
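The VInt helpers in ReadWriteUtils keep the Hadoop WritableUtils encoding: values in [-112, 127] occupy one byte, anything else a length marker followed by big-endian payload bytes. A round-trip sketch using the SliceOutput and SliceInput overloads shown above:

    import io.airlift.slice.DynamicSliceOutput;
    import io.trino.hive.formats.ReadWriteUtils;

    class VIntExample
    {
        static long roundTrip(int value)
        {
            DynamicSliceOutput out = new DynamicSliceOutput(16);
            ReadWriteUtils.writeVInt(out, value);
            // decode from the bytes just written
            return ReadWriteUtils.readVInt(out.slice().getInput());
        }
    }
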
*/ -package io.trino.rcfile; +package io.trino.hive.formats.compression; import com.google.common.collect.ImmutableList; import io.airlift.slice.Slice; diff --git a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/Codec.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/Codec.java new file mode 100644 index 000000000000..9b55665110f9 --- /dev/null +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/Codec.java @@ -0,0 +1,34 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.hive.formats.compression; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +public interface Codec +{ + OutputStream createStreamCompressor(OutputStream outputStream) + throws IOException; + + ValueCompressor createValueCompressor(); + + MemoryCompressedSliceOutput createMemoryCompressedSliceOutput(int minChunkSize, int maxChunkSize) + throws IOException; + + InputStream createStreamDecompressor(InputStream inputStream) + throws IOException; + + ValueDecompressor createValueDecompressor(); +} diff --git a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/CompressionKind.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/CompressionKind.java new file mode 100644 index 000000000000..f126e2e5edb7 --- /dev/null +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/CompressionKind.java @@ -0,0 +1,137 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.hive.formats.compression; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.compress.gzip.JdkGzipCodec; +import io.airlift.compress.lz4.Lz4Codec; +import io.airlift.compress.lzo.LzoCodec; +import io.airlift.compress.snappy.SnappyCodec; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; + +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; +import static java.util.Objects.requireNonNull; + +public enum CompressionKind +{ + SNAPPY(".snappy", "org.apache.hadoop.io.compress.SnappyCodec") { + @Override + public Codec createCodec() + { + return new AircompressorCodec(new SnappyCodec()); + } + }, + LZO(".lzo_deflate", "org.apache.hadoop.io.compress.LzoCodec", "com.hadoop.compression.lzo.LzoCodec") { + @Override + public Codec createCodec() + { + return new AircompressorCodec(new LzoCodec()); + } + }, + LZ4(".lz4", "org.apache.hadoop.io.compress.Lz4Codec") { + @Override + public Codec createCodec() + { + return new AircompressorCodec(new Lz4Codec()); + } + }, + GZIP(".gz", "org.apache.hadoop.io.compress.GzipCodec") { + @Override + public Codec createCodec() + { + return new AircompressorCodec(new JdkGzipCodec()); + } + }, + ZSTD(".zst", "org.apache.hadoop.io.compress.ZStandardCodec") { + @Override + public Codec createCodec() + { + org.apache.hadoop.io.compress.ZStandardCodec codec = new org.apache.hadoop.io.compress.ZStandardCodec(); + codec.setConf(newEmptyConfiguration()); + return new HadoopCodec(codec); + } + }, + BZIP2(".bz2", "org.apache.hadoop.io.compress.BZip2Codec") { + @Override + public Codec createCodec() + { + org.apache.hadoop.io.compress.BZip2Codec codec = new org.apache.hadoop.io.compress.BZip2Codec(); + codec.setConf(newEmptyConfiguration()); + return new HadoopCodec(codec); + } + }; + + private final List hadoopClassNames; + private final String fileExtension; + + CompressionKind(String fileExtension, String... 
hadoopClassNames) + { + this.hadoopClassNames = ImmutableList.copyOf(hadoopClassNames); + this.fileExtension = requireNonNull(fileExtension, "fileExtension is null"); + } + + public String getHadoopClassName() + { + return hadoopClassNames.get(0); + } + + public String getFileExtension() + { + return fileExtension; + } + + public abstract Codec createCodec(); + + private static final Map CODECS_BY_HADOOP_CLASS_NAME; + + static { + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (CompressionKind codec : values()) { + for (String hadoopClassNames : codec.hadoopClassNames) { + builder.put(hadoopClassNames, codec); + } + } + CODECS_BY_HADOOP_CLASS_NAME = builder.buildOrThrow(); + } + + public static CompressionKind fromHadoopClassName(String hadoopClassName) + { + return Optional.ofNullable(CODECS_BY_HADOOP_CLASS_NAME.get(hadoopClassName)) + .orElseThrow(() -> new IllegalArgumentException("Unknown codec: " + hadoopClassName)); + } + + public static Codec createCodecFromHadoopClassName(String hadoopClassName) + { + return Optional.ofNullable(CODECS_BY_HADOOP_CLASS_NAME.get(hadoopClassName)) + .orElseThrow(() -> new IllegalArgumentException("Unknown codec: " + hadoopClassName)) + .createCodec(); + } + + private static final Map CODECS_BY_FILE_EXTENSION = Arrays.stream(values()) + .filter(codec -> codec.fileExtension != null) + .collect(toImmutableMap(codec -> codec.fileExtension, Function.identity())); + + public static Optional createCodecFromExtension(String extension) + { + return Optional.ofNullable(CODECS_BY_FILE_EXTENSION.get(extension)) + .map(CompressionKind::createCodec); + } +} diff --git a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/HadoopCodec.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/HadoopCodec.java new file mode 100644 index 000000000000..91afbae054eb --- /dev/null +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/HadoopCodec.java @@ -0,0 +1,175 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
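CompressionKind above maps Hadoop codec class names and file extensions to Codec factories, wrapping aircompressor codecs for SNAPPY, LZO, and LZ4, the JDK gzip codec for GZIP, and the Hadoop codecs for ZSTD and BZIP2. A round-trip sketch against the Codec interface (illustrative names):

    import io.airlift.slice.Slice;
    import io.airlift.slice.Slices;
    import io.trino.hive.formats.compression.Codec;
    import io.trino.hive.formats.compression.CompressionKind;
    import io.trino.hive.formats.compression.ValueCompressor;
    import io.trino.hive.formats.compression.ValueDecompressor;

    import java.io.IOException;

    class CompressionRoundTrip
    {
        static void roundTrip()
                throws IOException
        {
            Slice original = Slices.utf8Slice("hello, hive formats");
            Codec codec = CompressionKind.GZIP.createCodec();

            Slice compressed;
            try (ValueCompressor compressor = codec.createValueCompressor()) {
                compressed = compressor.compress(original);
            }

            // decompress() fills a pre-sized buffer, so the caller must
            // already know the uncompressed length
            Slice uncompressed = Slices.allocate(original.length());
            try (ValueDecompressor decompressor = codec.createValueDecompressor()) {
                decompressor.decompress(compressed, uncompressed);
            }
        }
    }
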
+ */ +package io.trino.hive.formats.compression; + +import io.airlift.slice.DynamicSliceOutput; +import io.airlift.slice.Slice; +import org.apache.hadoop.io.compress.CodecPool; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionInputStream; +import org.apache.hadoop.io.compress.CompressionOutputStream; +import org.apache.hadoop.io.compress.Compressor; +import org.apache.hadoop.io.compress.Decompressor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.UncheckedIOException; +import java.util.function.Supplier; + +import static com.google.common.base.Preconditions.checkState; +import static java.util.Objects.requireNonNull; + +public class HadoopCodec + implements Codec +{ + private final CompressionCodec codec; + + public HadoopCodec(CompressionCodec codec) + { + this.codec = requireNonNull(codec, "codec is null"); + } + + @Override + public OutputStream createStreamCompressor(OutputStream outputStream) + throws IOException + { + return codec.createOutputStream(outputStream); + } + + @Override + public ValueCompressor createValueCompressor() + { + return new HadoopValueCompressor(codec); + } + + private static class HadoopValueCompressor + implements ValueCompressor + { + private final CompressionCodec codec; + private final Compressor compressor; + private final DynamicSliceOutput buffer; + + private HadoopValueCompressor(CompressionCodec codec) + { + this.codec = requireNonNull(codec, "codec is null"); + this.compressor = CodecPool.getCompressor(requireNonNull(codec, "codec is null")); + this.buffer = new DynamicSliceOutput(1024); + } + + @Override + public Slice compress(Slice slice) + throws IOException + { + compressor.reset(); + buffer.reset(); + try (CompressionOutputStream compressionStream = codec.createOutputStream(buffer, compressor)) { + slice.getInput().transferTo(compressionStream); + } + return buffer.slice(); + } + + @Override + public void close() + { + CodecPool.returnCompressor(compressor); + } + } + + @Override + public MemoryCompressedSliceOutput createMemoryCompressedSliceOutput(int minChunkSize, int maxChunkSize) + { + return new HadoopCompressedSliceOutputSupplier(codec, minChunkSize, maxChunkSize).get(); + } + + private static class HadoopCompressedSliceOutputSupplier + implements Supplier + { + private final CompressionCodec codec; + private final Compressor compressor; + private final ChunkedSliceOutput bufferedOutput; + + public HadoopCompressedSliceOutputSupplier(CompressionCodec codec, int minChunkSize, int maxChunkSize) + { + this.codec = requireNonNull(codec, "codec is null"); + this.compressor = CodecPool.getCompressor(requireNonNull(codec, "codec is null")); + this.bufferedOutput = new ChunkedSliceOutput(minChunkSize, maxChunkSize); + } + + @Override + public MemoryCompressedSliceOutput get() + { + try { + compressor.reset(); + bufferedOutput.reset(); + CompressionOutputStream compressionStream = codec.createOutputStream(bufferedOutput, compressor); + return new MemoryCompressedSliceOutput(compressionStream, bufferedOutput, this, () -> CodecPool.returnCompressor(compressor)); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + + @Override + public InputStream createStreamDecompressor(InputStream inputStream) + throws IOException + { + return codec.createInputStream(inputStream); + } + + @Override + public ValueDecompressor createValueDecompressor() + { + return new HadoopValueDecompressor(codec); + } + + private static 
class HadoopValueDecompressor + implements ValueDecompressor + { + private final CompressionCodec codec; + private final Decompressor decompressor; + private boolean closed; + + private HadoopValueDecompressor(CompressionCodec codec) + { + this.codec = requireNonNull(codec, "codec is null"); + decompressor = CodecPool.getDecompressor(codec); + } + + @Override + public void decompress(Slice compressed, Slice uncompressed) + throws IOException + { + checkState(!closed, "Value decompressor has been closed"); + decompressor.reset(); + try (CompressionInputStream decompressorStream = codec.createInputStream(compressed.getInput(), decompressor)) { + uncompressed.setBytes(0, decompressorStream, uncompressed.length()); + } + catch (IndexOutOfBoundsException | IOException e) { + throw new IOException("Compressed stream is truncated", e); + } + } + + @Override + public void close() + { + if (closed) { + return; + } + closed = true; + CodecPool.returnDecompressor(decompressor); + } + } +} diff --git a/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/MemoryCompressedSliceOutput.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/MemoryCompressedSliceOutput.java new file mode 100644 index 000000000000..04b5a636084f --- /dev/null +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/MemoryCompressedSliceOutput.java @@ -0,0 +1,131 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.hive.formats.compression; + +import io.airlift.slice.Slice; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.List; +import java.util.function.Supplier; + +import static com.google.common.base.Preconditions.checkState; +import static java.util.Objects.requireNonNull; + +// This specialized SliceOutput has direct access to the buffered output slices to +// report buffer sizes and to get the final output. Additionally, a new +// MemoryCompressedSliceOutput can be created that reuses the underlying output +// buffer +public final class MemoryCompressedSliceOutput + extends BufferedOutputStreamSliceOutput +{ + private final ChunkedSliceOutput bufferedOutput; + private final Supplier<MemoryCompressedSliceOutput> resetFactory; + private final Runnable onDestroy; + private boolean closed; + private boolean destroyed; + + /** + * @param compressionStream the compressed output stream to delegate to + * @param bufferedOutput the output for the compressionStream + * @param resetFactory the function to create a new MemoryCompressedSliceOutput that reuses the bufferedOutput + * @param onDestroy used to clean up compression resources when done + */ + public MemoryCompressedSliceOutput( + OutputStream compressionStream, + ChunkedSliceOutput bufferedOutput, + Supplier<MemoryCompressedSliceOutput> resetFactory, + Runnable onDestroy) + { + super(compressionStream); + this.bufferedOutput = requireNonNull(bufferedOutput, "bufferedOutput is null"); + this.resetFactory = requireNonNull(resetFactory, "resetFactory is null"); + this.onDestroy = requireNonNull(onDestroy, "onDestroy is null"); + } + + @Override + public long getRetainedSize() + { + return super.getRetainedSize() + bufferedOutput.getRetainedSize(); + } + + public int getCompressedSize() + { + checkState(closed, "Stream has not been closed"); + checkState(!destroyed, "Stream has been destroyed"); + return bufferedOutput.size(); + } + + public List<Slice> getCompressedSlices() + { + checkState(closed, "Stream has not been closed"); + checkState(!destroyed, "Stream has been destroyed"); + return bufferedOutput.getSlices(); + } + + public MemoryCompressedSliceOutput createRecycledCompressedSliceOutput() + { + checkState(closed, "Stream has not been closed"); + checkState(!destroyed, "Stream has been destroyed"); + destroyed = true; + return resetFactory.get(); + } + + @Override + public void close() + throws IOException + { + if (!closed) { + closed = true; + super.close(); + } + } + + public void destroy() + throws IOException + { + if (!destroyed) { + destroyed = true; + try { + close(); + } + finally { + onDestroy.run(); + } + } + } + + public static MemoryCompressedSliceOutput createUncompressedMemorySliceOutput(int minChunkSize, int maxChunkSize) + { + return new UncompressedSliceOutputSupplier(minChunkSize, maxChunkSize).get(); + } + + private static class UncompressedSliceOutputSupplier + implements Supplier<MemoryCompressedSliceOutput> + { + private final ChunkedSliceOutput chunkedSliceOutput; + + private UncompressedSliceOutputSupplier(int minChunkSize, int maxChunkSize) + { + chunkedSliceOutput = new ChunkedSliceOutput(minChunkSize, maxChunkSize); + } + + @Override + public MemoryCompressedSliceOutput get() + { + chunkedSliceOutput.reset(); + return new MemoryCompressedSliceOutput(chunkedSliceOutput, chunkedSliceOutput, this, () -> {}); + } + } +} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDataSource.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/ValueCompressor.java similarity index 73% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDataSource.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/ValueCompressor.java index 6da2c05d63b7..0dd99b5b677b 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDataSource.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/ValueCompressor.java @@ -11,22 +11,19 @@ * See the License for the specific language governing permissions and * limitations under the License.
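[Editor's note] The close-then-recycle lifecycle of MemoryCompressedSliceOutput is easy to get wrong, so here is a minimal usage sketch. It uses only the uncompressed factory defined in the new file above; the chunk sizes and payloads are illustrative, not values taken from this change.

import io.airlift.slice.Slice;
import io.trino.hive.formats.compression.MemoryCompressedSliceOutput;

import java.io.IOException;
import java.util.List;

import static io.trino.hive.formats.compression.MemoryCompressedSliceOutput.createUncompressedMemorySliceOutput;
import static java.nio.charset.StandardCharsets.UTF_8;

public class RecycleLifecycleSketch
{
    public static void main(String[] args)
            throws IOException
    {
        // 4 KB min chunk, 64 KB max chunk (illustrative values, not defaults from this diff)
        MemoryCompressedSliceOutput out = createUncompressedMemorySliceOutput(4096, 65536);
        out.writeBytes("row group one".getBytes(UTF_8));
        out.close(); // size and slices are only readable after close()

        int compressedSize = out.getCompressedSize();
        List<Slice> slices = out.getCompressedSlices(); // copy these out before recycling; recycling resets the buffer

        // reuses the same underlying ChunkedSliceOutput; the old instance is now destroyed
        MemoryCompressedSliceOutput recycled = out.createRecycledCompressedSliceOutput();
        recycled.writeBytes("row group two".getBytes(UTF_8));
        recycled.close();
    }
}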
*/ -package io.trino.rcfile; +package io.trino.hive.formats.compression; + +import io.airlift.slice.Slice; import java.io.Closeable; import java.io.IOException; -public interface RcFileDataSource +public interface ValueCompressor extends Closeable { - RcFileDataSourceId getId(); - - long getReadBytes(); - - long getReadTimeNanos(); - - long getSize(); - - void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) + Slice compress(Slice slice) throws IOException; + + @Override + default void close() {} } diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDecompressor.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/ValueDecompressor.java similarity index 73% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDecompressor.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/ValueDecompressor.java index 3391524c6958..3eaa64836a2e 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDecompressor.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/compression/ValueDecompressor.java @@ -11,14 +11,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.compression; import io.airlift.slice.Slice; -public interface RcFileDecompressor +import java.io.Closeable; +import java.io.IOException; + +public interface ValueDecompressor + extends Closeable { void decompress(Slice compressed, Slice uncompressed) - throws RcFileCorruptionException; + throws IOException; - void destroy(); + @Override + default void close() {} } diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/ColumnData.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/ColumnData.java similarity index 96% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/ColumnData.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/ColumnData.java index 123377ed9c2b..1abec9ddafd3 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/ColumnData.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/ColumnData.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import io.airlift.slice.Slice; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/ColumnEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/ColumnEncoding.java similarity index 95% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/ColumnEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/ColumnEncoding.java index a12846cea8d7..657b76ec9151 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/ColumnEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/ColumnEncoding.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
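[Editor's note] Because the new ValueCompressor/ValueDecompressor interfaces above extend Closeable (replacing the old destroy() method), a value round trip now fits try-with-resources. A hedged sketch under two assumptions: CompressionKind exposes a GZIP constant (the enum constants are not shown in this diff), and createCodec() is the factory referenced later in the RcFileWriter hunk.

import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import io.trino.hive.formats.compression.Codec;
import io.trino.hive.formats.compression.CompressionKind;
import io.trino.hive.formats.compression.ValueCompressor;
import io.trino.hive.formats.compression.ValueDecompressor;

import java.io.IOException;

public class ValueCodecRoundTripSketch
{
    static Slice roundTrip(Slice data)
            throws IOException
    {
        Codec codec = CompressionKind.GZIP.createCodec(); // GZIP constant is an assumption
        try (ValueCompressor compressor = codec.createValueCompressor();
                ValueDecompressor decompressor = codec.createValueDecompressor()) {
            Slice compressed = compressor.compress(data);
            // the caller must know the uncompressed length up front, exactly as RcFileReader does
            Slice uncompressed = Slices.allocate(data.length());
            decompressor.decompress(compressed, uncompressed);
            return uncompressed;
        }
    }

    public static void main(String[] args)
            throws IOException
    {
        Slice data = Slices.utf8Slice("row group header");
        assert data.equals(roundTrip(data));
    }
}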
*/ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import io.airlift.slice.SliceOutput; import io.trino.spi.block.Block; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/EncodeOutput.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/EncodeOutput.java similarity index 94% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/EncodeOutput.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/EncodeOutput.java index a90036988511..240a84b98247 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/EncodeOutput.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/EncodeOutput.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; public interface EncodeOutput { diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/PageSplitterUtil.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/PageSplitterUtil.java similarity index 98% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/PageSplitterUtil.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/PageSplitterUtil.java index ec6cab630d33..db4419935c3f 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/PageSplitterUtil.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/PageSplitterUtil.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import com.google.common.collect.ImmutableList; import io.trino.spi.Page; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileCorruptionException.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileCorruptionException.java similarity index 96% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileCorruptionException.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileCorruptionException.java index c21512163e23..7720660b985c 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileCorruptionException.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileCorruptionException.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import java.io.IOException; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileEncoding.java similarity index 99% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileEncoding.java index 53f3382fffef..4f3ef608907e 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileEncoding.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import io.trino.spi.TrinoException; import io.trino.spi.type.ArrayType; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileReader.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileReader.java similarity index 77% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileReader.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileReader.java index 7ab8849fb06b..bd4d2c9ec7fe 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileReader.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileReader.java @@ -11,20 +11,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import com.google.common.collect.ImmutableMap; import io.airlift.slice.BasicSliceInput; -import io.airlift.slice.ChunkedSliceInput; -import io.airlift.slice.ChunkedSliceInput.BufferReference; -import io.airlift.slice.ChunkedSliceInput.SliceLoader; import io.airlift.slice.Slice; -import io.airlift.slice.SliceInput; import io.airlift.slice.Slices; -import io.airlift.units.DataSize; -import io.airlift.units.DataSize.Unit; -import io.trino.rcfile.RcFileWriteValidation.WriteChecksum; -import io.trino.rcfile.RcFileWriteValidation.WriteChecksumBuilder; +import io.trino.filesystem.TrinoInputFile; +import io.trino.hive.formats.DataSeekableInputStream; +import io.trino.hive.formats.ReadWriteUtils; +import io.trino.hive.formats.compression.CompressionKind; +import io.trino.hive.formats.compression.ValueDecompressor; +import io.trino.hive.formats.rcfile.RcFileWriteValidation.WriteChecksum; +import io.trino.hive.formats.rcfile.RcFileWriteValidation.WriteChecksumBuilder; import io.trino.spi.Page; import io.trino.spi.block.Block; import io.trino.spi.block.RunLengthEncodedBlock; @@ -32,7 +31,6 @@ import java.io.Closeable; import java.io.IOException; -import java.io.UncheckedIOException; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -44,9 +42,6 @@ import static com.google.common.io.ByteStreams.skipFully; import static io.airlift.slice.SizeOf.SIZE_OF_INT; import static io.airlift.slice.SizeOf.SIZE_OF_LONG; -import static io.trino.rcfile.RcFileDecoderUtils.findFirstSyncPosition; -import static io.trino.rcfile.RcFileDecoderUtils.readVInt; -import static io.trino.rcfile.RcFileWriteValidation.WriteChecksumBuilder.createWriteChecksumBuilder; import static java.lang.Math.min; import static java.lang.Math.toIntExact; import static java.util.Objects.requireNonNull; @@ -72,14 +67,15 @@ public class RcFileReader private static final String COLUMN_COUNT_METADATA_KEY = "hive.io.rcfile.column.number"; - private final RcFileDataSource dataSource; + private final String location; + private final long fileSize; private final Map readColumns; - private final ChunkedSliceInput input; + private final DataSeekableInputStream input; private final long length; private final byte version; - private final RcFileDecompressor decompressor; + private final ValueDecompressor decompressor; private final Map metadata; private final int columnCount; @@ -106,49 +102,47 @@ public class RcFileReader private final Optional writeChecksumBuilder; public RcFileReader( - RcFileDataSource dataSource, + TrinoInputFile inputFile, RcFileEncoding encoding, Map readColumns, - RcFileCodecFactory codecFactory, long offset, - long length, - DataSize 
bufferSize) + long length) throws IOException { - this(dataSource, encoding, readColumns, codecFactory, offset, length, bufferSize, Optional.empty()); + this(inputFile, encoding, readColumns, offset, length, Optional.empty()); } private RcFileReader( - RcFileDataSource dataSource, + TrinoInputFile inputFile, RcFileEncoding encoding, Map readColumns, - RcFileCodecFactory codecFactory, long offset, long length, - DataSize bufferSize, Optional writeValidation) throws IOException { - this.dataSource = requireNonNull(dataSource, "dataSource is null"); + requireNonNull(inputFile, "inputFile is null"); + this.location = inputFile.location(); + this.fileSize = inputFile.length(); this.readColumns = ImmutableMap.copyOf(requireNonNull(readColumns, "readColumns is null")); - this.input = new ChunkedSliceInput(new DataSourceSliceLoader(dataSource), toIntExact(bufferSize.toBytes())); + this.input = new DataSeekableInputStream(inputFile.newInput().inputStream()); this.writeValidation = requireNonNull(writeValidation, "writeValidation is null"); - this.writeChecksumBuilder = writeValidation.map(validation -> createWriteChecksumBuilder(readColumns)); + this.writeChecksumBuilder = writeValidation.map(validation -> WriteChecksumBuilder.createWriteChecksumBuilder(readColumns)); verify(offset >= 0, "offset is negative"); - verify(offset < dataSource.getSize(), "offset is greater than data size"); + verify(offset < inputFile.length(), "offset is greater than data size"); verify(length >= 1, "length must be at least 1"); this.length = length; this.end = offset + length; - verify(end <= dataSource.getSize(), "offset plus length is greater than data size"); + verify(end <= fileSize, "offset plus length is greater than data size"); // read header Slice magic = input.readSlice(RCFILE_MAGIC.length()); boolean compressed; if (RCFILE_MAGIC.equals(magic)) { version = input.readByte(); - verify(version <= CURRENT_VERSION, "RCFile version %s not supported: %s", version, dataSource); + verify(version <= CURRENT_VERSION, "RCFile version %s not supported: %s", version, inputFile.location()); validateWrite(validation -> validation.getVersion() == version, "Unexpected file version"); compressed = input.readBoolean(); } @@ -157,30 +151,30 @@ else if (SEQUENCE_FILE_MAGIC.equals(magic)) { // first version of RCFile used magic SEQ with version 6 byte sequenceFileVersion = input.readByte(); - verify(sequenceFileVersion == SEQUENCE_FILE_VERSION, "File %s is a SequenceFile not an RCFile", dataSource); + verify(sequenceFileVersion == SEQUENCE_FILE_VERSION, "File %s is a SequenceFile not an RCFile", inputFile.location()); // this is the first version of RCFile this.version = FIRST_VERSION; Slice keyClassName = readLengthPrefixedString(input); Slice valueClassName = readLengthPrefixedString(input); - verify(RCFILE_KEY_BUFFER_NAME.equals(keyClassName) && RCFILE_VALUE_BUFFER_NAME.equals(valueClassName), "File %s is a SequenceFile not an RCFile", dataSource); + verify(RCFILE_KEY_BUFFER_NAME.equals(keyClassName) && RCFILE_VALUE_BUFFER_NAME.equals(valueClassName), "File %s is a SequenceFile not an RCFile", inputFile); compressed = input.readBoolean(); // RC file is never block compressed if (input.readBoolean()) { - throw corrupt("File %s is a SequenceFile not an RCFile", dataSource); + throw corrupt("File %s is a SequenceFile not an RCFile", inputFile.location()); } } else { - throw corrupt("File %s is not an RCFile", dataSource); + throw corrupt("File %s is not an RCFile", inputFile.location()); } // setup the compression codec if 
(compressed) { String codecClassName = readLengthPrefixedString(input).toStringUtf8(); validateWrite(validation -> validation.getCodecClassName().equals(Optional.of(codecClassName)), "Unexpected compression codec"); - this.decompressor = codecFactory.createDecompressor(codecClassName); + this.decompressor = CompressionKind.createCodecFromHadoopClassName(codecClassName).createValueDecompressor(); } else { validateWrite(validation -> validation.getCodecClassName().equals(Optional.empty()), "Expected file to be compressed"); @@ -189,8 +183,8 @@ else if (SEQUENCE_FILE_MAGIC.equals(magic)) { // read metadata int metadataEntries = Integer.reverseBytes(input.readInt()); - verify(metadataEntries >= 0, "Invalid metadata entry count %s in RCFile %s", metadataEntries, dataSource); - verify(metadataEntries <= MAX_METADATA_ENTRIES, "Too many metadata entries (%s) in RCFile %s", metadataEntries, dataSource); + verify(metadataEntries >= 0, "Invalid metadata entry count %s in RCFile %s", metadataEntries, inputFile.location()); + verify(metadataEntries <= MAX_METADATA_ENTRIES, "Too many metadata entries (%s) in RCFile %s", metadataEntries, inputFile.location()); ImmutableMap.Builder metadataBuilder = ImmutableMap.builder(); for (int i = 0; i < metadataEntries; i++) { metadataBuilder.put(readLengthPrefixedString(input).toStringUtf8(), readLengthPrefixedString(input).toStringUtf8()); @@ -200,15 +194,16 @@ else if (SEQUENCE_FILE_MAGIC.equals(magic)) { // get column count from metadata String columnCountString = metadata.get(COLUMN_COUNT_METADATA_KEY); + verify(columnCountString != null, "Column count not specified in metadata RCFile %s", inputFile.location()); try { columnCount = Integer.parseInt(columnCountString); } catch (NumberFormatException e) { - throw corrupt("Invalid column count %s in RCFile %s", columnCountString, dataSource); + throw corrupt("Invalid column count %s in RCFile %s", columnCountString, inputFile.location()); } // initialize columns - verify(columnCount <= MAX_COLUMN_COUNT, "Too many columns (%s) in RCFile %s", columnCountString, dataSource); + verify(columnCount <= MAX_COLUMN_COUNT, "Too many columns (%s) in RCFile %s", columnCountString, inputFile.location()); columns = new Column[columnCount]; for (Entry entry : readColumns.entrySet()) { if (entry.getKey() < columnCount) { @@ -227,7 +222,12 @@ else if (SEQUENCE_FILE_MAGIC.equals(magic)) { // of the file. In that case, the reader owns all row groups up to the first sync point. if (offset != 0) { // if the specified file region does not contain the start of a sync sequence, this call will close the reader - seekToFirstRowGroupInRange(offset, length); + long startOfSyncSequence = ReadWriteUtils.findFirstSyncPosition(inputFile, offset, length, syncFirst, syncSecond); + if (startOfSyncSequence < 0) { + closeQuietly(); + return; + } + input.seek(startOfSyncSequence); } } @@ -253,7 +253,7 @@ public long getLength() public long getBytesRead() { - return dataSource.getReadBytes(); + return input.getReadBytes(); } public long getRowsRead() @@ -263,7 +263,7 @@ public long getRowsRead() public long getReadTimeNanos() { - return dataSource.getReadTimeNanos(); + return input.getReadTimeNanos(); } public Slice getSync() @@ -290,7 +290,7 @@ public void close() } finally { if (decompressor != null) { - decompressor.destroy(); + decompressor.close(); } } if (writeChecksumBuilder.isPresent()) { @@ -322,18 +322,18 @@ public int advance() } // are we at the end? 
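[Editor's note] As an aside before the next hunk: the codec lookup above goes from the Hadoop codec class name stored in the file header to a ValueDecompressor in a single chain. A minimal sketch of that chain; the GzipCodec class name is one value Hive commonly writes, and handling of unknown names is whatever createCodecFromHadoopClassName does, which this diff does not show.

import io.trino.hive.formats.compression.CompressionKind;
import io.trino.hive.formats.compression.ValueDecompressor;

public class CodecLookupSketch
{
    static ValueDecompressor decompressorFor(String headerCodecClassName)
    {
        // same chain as RcFileReader above; the argument normally comes from the file header
        return CompressionKind.createCodecFromHadoopClassName(headerCodecClassName)
                .createValueDecompressor();
    }

    public static void main(String[] args)
    {
        ValueDecompressor decompressor = decompressorFor("org.apache.hadoop.io.compress.GzipCodec");
        decompressor.close();
    }
}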
- if (input.remaining() == 0) { + if (fileSize - input.getPos() == 0) { close(); return -1; } // read uncompressed size of row group (which is useless information) - verify(input.remaining() >= SIZE_OF_INT, "RCFile truncated %s", dataSource.getId()); + verify(fileSize - input.getPos() >= SIZE_OF_INT, "RCFile truncated %s", location); int unusedRowGroupSize = Integer.reverseBytes(input.readInt()); // read sequence sync if present if (unusedRowGroupSize == -1) { - verify(input.remaining() >= SIZE_OF_LONG + SIZE_OF_LONG + SIZE_OF_INT, "RCFile truncated %s", dataSource.getId()); + verify(fileSize - input.getPos() >= SIZE_OF_LONG + SIZE_OF_LONG + SIZE_OF_INT, "RCFile truncated %s", location); // The full sync sequence is "0xFFFFFFFF syncFirst syncSecond". If // this sequence begins in our segment, we must continue processing until the @@ -341,12 +341,12 @@ public int advance() // We have already read the 0xFFFFFFFF above, so we must test the // end condition back 4 bytes. // NOTE: this decision must agree with RcFileDecoderUtils.findFirstSyncPosition - if (input.position() - SIZE_OF_INT >= end) { + if (input.getPos() - SIZE_OF_INT >= end) { close(); return -1; } - verify(syncFirst == input.readLong() && syncSecond == input.readLong(), "Invalid sync in RCFile %s", dataSource.getId()); + verify(syncFirst == input.readLong() && syncSecond == input.readLong(), "Invalid sync in RCFile %s", location); // read the useless uncompressed length unusedRowGroupSize = Integer.reverseBytes(input.readInt()); @@ -362,7 +362,9 @@ else if (rowsRead > 0) { if (compressedHeaderSize > compressedHeaderBuffer.length()) { compressedHeaderBuffer = Slices.allocate(compressedHeaderSize); } - input.readBytes(compressedHeaderBuffer, 0, compressedHeaderSize); + // use an exact-sized compressed header to avoid problems where compression algorithms over-read + Slice compressedHeader = compressedHeaderBuffer.slice(0, compressedHeaderSize); + input.readFully(compressedHeader); // decompress row group header Slice header; @@ -372,18 +374,18 @@ else if (rowsRead > 0) { } Slice buffer = headerBuffer.slice(0, uncompressedHeaderSize); - decompressor.decompress(compressedHeaderBuffer, buffer); + decompressor.decompress(compressedHeader, buffer); header = buffer; } else { - verify(compressedHeaderSize == uncompressedHeaderSize, "Invalid RCFile %s", dataSource.getId()); - header = compressedHeaderBuffer; + verify(compressedHeaderSize == uncompressedHeaderSize, "Invalid RCFile %s", location); + header = compressedHeader; } BasicSliceInput headerInput = header.getInput(); // read number of rows in row group - rowGroupRowCount = toIntExact(readVInt(headerInput)); + rowGroupRowCount = toIntExact(ReadWriteUtils.readVInt(headerInput)); rowsRead += rowGroupRowCount; rowGroupPosition = 0; currentChunkRowCount = min(ColumnData.MAX_SIZE, rowGroupRowCount); @@ -391,14 +393,14 @@ else if (rowsRead > 0) { // set column buffers int totalCompressedDataSize = 0; for (int columnIndex = 0; columnIndex < columnCount; columnIndex++) { - int compressedDataSize = toIntExact(readVInt(headerInput)); + int compressedDataSize = toIntExact(ReadWriteUtils.readVInt(headerInput)); totalCompressedDataSize += compressedDataSize; - int uncompressedDataSize = toIntExact(readVInt(headerInput)); + int uncompressedDataSize = toIntExact(ReadWriteUtils.readVInt(headerInput)); if (decompressor == null && compressedDataSize != uncompressedDataSize) { - throw corrupt("Invalid RCFile %s", dataSource.getId()); + throw corrupt("Invalid RCFile %s", location); } - int lengthsSize =
toIntExact(readVInt(headerInput)); + int lengthsSize = toIntExact(ReadWriteUtils.readVInt(headerInput)); Slice lengthsBuffer = headerInput.readSlice(lengthsSize); @@ -434,20 +436,9 @@ public Block readBlock(int columnIndex) return columns[columnIndex].readBlock(rowGroupPosition, currentChunkRowCount); } - public RcFileDataSourceId getId() - { - return dataSource.getId(); - } - - private void seekToFirstRowGroupInRange(long offset, long length) - throws IOException + public String getFileLocation() { - long startOfSyncSequence = findFirstSyncPosition(dataSource, offset, length, syncFirst, syncSecond); - if (startOfSyncSequence < 0) { - closeQuietly(); - return; - } - input.setPosition(startOfSyncSequence); + return location; } private void closeQuietly() @@ -459,10 +450,10 @@ private void closeQuietly() } } - private Slice readLengthPrefixedString(SliceInput in) - throws RcFileCorruptionException + private Slice readLengthPrefixedString(DataSeekableInputStream in) + throws IOException { - int length = toIntExact(readVInt(in)); + int length = toIntExact(ReadWriteUtils.readVInt(in)); verify(length <= MAX_METADATA_STRING_LENGTH, "Metadata string value is too long (%s) in RCFile %s", length, in); return in.readSlice(length); } @@ -491,9 +482,7 @@ private void validateWrite(Predicate test, String message private void validateWriteRowGroupChecksum() { - if (writeChecksumBuilder.isPresent()) { - writeChecksumBuilder.get().addRowGroup(rowGroupRowCount); - } + writeChecksumBuilder.ifPresent(checksumBuilder -> checksumBuilder.addRowGroup(rowGroupRowCount)); } private void validateWritePageChecksum() @@ -510,10 +499,9 @@ private void validateWritePageChecksum() static void validateFile( RcFileWriteValidation writeValidation, - RcFileDataSource input, + TrinoInputFile inputFile, RcFileEncoding encoding, - List types, - RcFileCodecFactory codecFactory) + List types) throws RcFileCorruptionException { ImmutableMap.Builder readTypes = ImmutableMap.builder(); @@ -521,13 +509,11 @@ static void validateFile( readTypes.put(columnIndex, types.get(columnIndex)); } try (RcFileReader rcFileReader = new RcFileReader( - input, + inputFile, encoding, readTypes.buildOrThrow(), - codecFactory, 0, - input.getSize(), - DataSize.of(8, Unit.MEGABYTE), + inputFile.length(), Optional.of(writeValidation))) { while (rcFileReader.advance() >= 0) { // ignored @@ -544,7 +530,7 @@ static void validateFile( private static class Column { private final ColumnEncoding encoding; - private final RcFileDecompressor decompressor; + private final ValueDecompressor decompressor; private BasicSliceInput lengthsInput; private Slice dataBuffer; @@ -560,7 +546,7 @@ private static class Column private int runLength; private int lastValueLength = -1; - public Column(ColumnEncoding encoding, RcFileDecompressor decompressor) + public Column(ColumnEncoding encoding, ValueDecompressor decompressor) { this.encoding = encoding; this.decompressor = decompressor; @@ -630,7 +616,7 @@ private int readNextValueLength() return lastValueLength; } - int valueLength = toIntExact(readVInt(lengthsInput)); + int valueLength = toIntExact(ReadWriteUtils.readVInt(lengthsInput)); // negative length is used to encode a run or the last value if (valueLength < 0) { @@ -663,73 +649,4 @@ private Slice getDataBuffer() return dataBuffer; } } - - private static class DataSourceSliceLoader - implements SliceLoader - { - private final RcFileDataSource dataSource; - - public DataSourceSliceLoader(RcFileDataSource dataSource) - { - this.dataSource = dataSource; - } - - @Override 
- public ByteArrayBufferReference createBuffer(int bufferSize) - { - return new ByteArrayBufferReference(bufferSize); - } - - @Override - public long getSize() - { - return dataSource.getSize(); - } - - @Override - public void load(long position, ByteArrayBufferReference bufferReference, int length) - { - try { - dataSource.readFully(position, bufferReference.getByteBuffer(), 0, length); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - @Override - public void close() - { - try { - dataSource.close(); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } - - private static class ByteArrayBufferReference - implements BufferReference - { - private final byte[] byteBuffer; - private final Slice sliceBuffer; - - public ByteArrayBufferReference(int size) - { - byteBuffer = new byte[size]; - sliceBuffer = Slices.wrappedBuffer(byteBuffer); - } - - public byte[] getByteBuffer() - { - return byteBuffer; - } - - @Override - public Slice getSlice() - { - return sliceBuffer; - } - } } diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileWriteValidation.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileWriteValidation.java similarity index 90% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileWriteValidation.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileWriteValidation.java index 21a3ab3103b1..668c43efd4dd 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileWriteValidation.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileWriteValidation.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -200,46 +200,39 @@ public RcFileWriteValidationBuilder(List types) this.checksum = new WriteChecksumBuilder(types); } - public RcFileWriteValidationBuilder setVersion(byte version) + public void setVersion(byte version) { this.version = version; - return this; } - public RcFileWriteValidationBuilder addMetadataProperty(String key, String value) + public void addMetadataProperty(String key, String value) { metadata.put(key, value); - return this; } - public RcFileWriteValidationBuilder setCodecClassName(Optional codecClassName) + public void setCodecClassName(Optional codecClassName) { this.codecClassName = codecClassName; - return this; } - public RcFileWriteValidationBuilder setSyncFirst(long syncFirst) + public void setSyncFirst(long syncFirst) { this.syncFirst = syncFirst; - return this; } - public RcFileWriteValidationBuilder setSyncSecond(long syncSecond) + public void setSyncSecond(long syncSecond) { this.syncSecond = syncSecond; - return this; } - public RcFileWriteValidationBuilder addRowGroup(int rowCount) + public void addRowGroup(int rowCount) { checksum.addRowGroup(rowCount); - return this; } - public RcFileWriteValidationBuilder addPage(Page page) + public void addPage(Page page) { checksum.addPage(page); - return this; } public RcFileWriteValidation build() diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileWriter.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileWriter.java similarity index 85% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileWriter.java rename to 
lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileWriter.java index 572e483add83..d3369063358b 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileWriter.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/RcFileWriter.java @@ -11,15 +11,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import com.google.common.io.Closer; import io.airlift.slice.DynamicSliceOutput; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; import io.airlift.units.DataSize; -import io.trino.rcfile.RcFileCompressor.CompressedSliceOutput; -import io.trino.rcfile.RcFileWriteValidation.RcFileWriteValidationBuilder; +import io.trino.filesystem.TrinoInputFile; +import io.trino.hive.formats.DataOutputStream; +import io.trino.hive.formats.compression.Codec; +import io.trino.hive.formats.compression.CompressionKind; +import io.trino.hive.formats.compression.MemoryCompressedSliceOutput; +import io.trino.hive.formats.rcfile.RcFileWriteValidation.RcFileWriteValidationBuilder; import io.trino.spi.Page; import io.trino.spi.block.Block; import io.trino.spi.type.Type; @@ -29,6 +33,7 @@ import java.io.Closeable; import java.io.IOException; +import java.io.OutputStream; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -41,10 +46,9 @@ import static io.airlift.slice.Slices.utf8Slice; import static io.airlift.units.DataSize.Unit.KILOBYTE; import static io.airlift.units.DataSize.Unit.MEGABYTE; -import static io.trino.rcfile.PageSplitterUtil.splitPage; -import static io.trino.rcfile.RcFileDecoderUtils.writeLengthPrefixedString; -import static io.trino.rcfile.RcFileDecoderUtils.writeVInt; -import static io.trino.rcfile.RcFileReader.validateFile; +import static io.trino.hive.formats.ReadWriteUtils.writeLengthPrefixedString; +import static io.trino.hive.formats.ReadWriteUtils.writeVInt; +import static io.trino.hive.formats.rcfile.RcFileReader.validateFile; import static java.lang.StrictMath.toIntExact; import static java.util.Objects.requireNonNull; @@ -68,15 +72,14 @@ public class RcFileWriter PRESTO_RCFILE_WRITER_VERSION = version == null ? 
"UNKNOWN" : version; } - private final SliceOutput output; + private final DataOutputStream output; private final List types; private final RcFileEncoding encoding; - private final RcFileCodecFactory codecFactory; private final long syncFirst = ThreadLocalRandom.current().nextLong(); private final long syncSecond = ThreadLocalRandom.current().nextLong(); - private CompressedSliceOutput keySectionOutput; + private MemoryCompressedSliceOutput keySectionOutput; private final ColumnEncoder[] columnEncoders; private final int targetMinRowGroupSize; @@ -91,21 +94,19 @@ public class RcFileWriter private final RcFileWriteValidationBuilder validationBuilder; public RcFileWriter( - SliceOutput output, + OutputStream rawOutput, List types, RcFileEncoding encoding, - Optional codecName, - RcFileCodecFactory codecFactory, + Optional compressionKind, Map metadata, boolean validate) throws IOException { this( - output, + rawOutput, types, encoding, - codecName, - codecFactory, + compressionKind, metadata, DEFAULT_TARGET_MIN_ROW_GROUP_SIZE, DEFAULT_TARGET_MAX_ROW_GROUP_SIZE, @@ -113,23 +114,21 @@ public RcFileWriter( } public RcFileWriter( - SliceOutput output, + OutputStream rawOutput, List types, RcFileEncoding encoding, - Optional codecName, - RcFileCodecFactory codecFactory, + Optional compressionKind, Map metadata, DataSize targetMinRowGroupSize, DataSize targetMaxRowGroupSize, boolean validate) throws IOException { - requireNonNull(output, "output is null"); + requireNonNull(rawOutput, "rawOutput is null"); requireNonNull(types, "types is null"); checkArgument(!types.isEmpty(), "types is empty"); requireNonNull(encoding, "encoding is null"); - requireNonNull(codecName, "codecName is null"); - requireNonNull(codecFactory, "codecFactory is null"); + requireNonNull(compressionKind, "compressionKind is null"); requireNonNull(metadata, "metadata is null"); checkArgument(!metadata.containsKey(PRESTO_RCFILE_WRITER_VERSION_METADATA_KEY), "Cannot set property %s", PRESTO_RCFILE_WRITER_VERSION_METADATA_KEY); checkArgument(!metadata.containsKey(COLUMN_COUNT_METADATA_KEY), "Cannot set property %s", COLUMN_COUNT_METADATA_KEY); @@ -139,20 +138,21 @@ public RcFileWriter( this.validationBuilder = validate ? 
new RcFileWriteValidationBuilder(types) : null; - this.output = output; + this.output = new DataOutputStream(rawOutput); this.types = types; this.encoding = encoding; - this.codecFactory = codecFactory; // write header - output.writeBytes(RCFILE_MAGIC); + output.write(RCFILE_MAGIC); output.writeByte(CURRENT_VERSION); recordValidation(validation -> validation.setVersion((byte) CURRENT_VERSION)); // write codec information - output.writeBoolean(codecName.isPresent()); - codecName.ifPresent(name -> writeLengthPrefixedString(output, utf8Slice(name))); - recordValidation(validation -> validation.setCodecClassName(codecName)); + output.writeBoolean(compressionKind.isPresent()); + if (compressionKind.isPresent()) { + writeLengthPrefixedString(output, utf8Slice(compressionKind.get().getHadoopClassName())); + } + recordValidation(validation -> validation.setCodecClassName(compressionKind.map(CompressionKind::getHadoopClassName))); // write metadata output.writeInt(Integer.reverseBytes(metadata.size() + 2)); @@ -169,20 +169,21 @@ public RcFileWriter( recordValidation(validation -> validation.setSyncSecond(syncSecond)); // initialize columns - RcFileCompressor compressor = codecName.map(codecFactory::createCompressor).orElse(new NoneCompressor()); - keySectionOutput = compressor.createCompressedSliceOutput((int) MIN_BUFFER_SIZE.toBytes(), (int) MAX_BUFFER_SIZE.toBytes()); - keySectionOutput.close(); // output is recycled on first use which requires output to be closed + Optional codec = compressionKind.map(CompressionKind::createCodec); + keySectionOutput = createMemoryCompressedSliceOutput(codec); + keySectionOutput.close(); // output is recycled on first use which requires the output to be closed columnEncoders = new ColumnEncoder[types.size()]; for (int columnIndex = 0; columnIndex < types.size(); columnIndex++) { Type type = types.get(columnIndex); ColumnEncoding columnEncoding = encoding.getEncoding(type); - columnEncoders[columnIndex] = new ColumnEncoder(columnEncoding, compressor); + columnEncoders[columnIndex] = new ColumnEncoder(columnEncoding, codec); } this.targetMinRowGroupSize = toIntExact(targetMinRowGroupSize.toBytes()); this.targetMaxRowGroupSize = toIntExact(targetMaxRowGroupSize.toBytes()); } private void writeMetadataProperty(String key, String value) + throws IOException { writeLengthPrefixedString(output, utf8Slice(key)); writeLengthPrefixedString(output, utf8Slice(value)); @@ -210,16 +211,15 @@ private void recordValidation(Consumer task) } } - public void validate(RcFileDataSource input) + public void validate(TrinoInputFile inputFile) throws RcFileCorruptionException { checkState(validationBuilder != null, "validation is not enabled"); validateFile( validationBuilder.build(), - input, + inputFile, encoding, - types, - codecFactory); + types); } public long getRetainedSizeInBytes() @@ -239,7 +239,7 @@ public void write(Page page) if (page.getPositionCount() == 0) { return; } - List pages = splitPage(page, targetMaxRowGroupSize); + List pages = PageSplitterUtil.splitPage(page, targetMaxRowGroupSize); for (Page splitPage : pages) { bufferPage(splitPage); } @@ -310,14 +310,14 @@ private void writeRowGroup() output.writeInt(Integer.reverseBytes(keySectionOutput.size())); output.writeInt(Integer.reverseBytes(keySectionOutput.getCompressedSize())); for (Slice slice : keySectionOutput.getCompressedSlices()) { - output.writeBytes(slice); + output.write(slice); } // write value section for (ColumnEncoder columnEncoder : columnEncoders) { List slices = columnEncoder.getCompressedData(); 
for (Slice slice : slices) { - output.writeBytes(slice); + output.write(slice); } columnEncoder.reset(); } @@ -327,6 +327,15 @@ private void writeRowGroup() bufferedRows = 0; } + private static MemoryCompressedSliceOutput createMemoryCompressedSliceOutput(Optional codec) + throws IOException + { + if (codec.isPresent()) { + return codec.get().createMemoryCompressedSliceOutput((int) MIN_BUFFER_SIZE.toBytes(), (int) MAX_BUFFER_SIZE.toBytes()); + } + return MemoryCompressedSliceOutput.createUncompressedMemorySliceOutput((int) MIN_BUFFER_SIZE.toBytes(), (int) MAX_BUFFER_SIZE.toBytes()); + } + private static class ColumnEncoder { private static final int INSTANCE_SIZE = toIntExact(ClassLayout.parseClass(ColumnEncoder.class).instanceSize() + ClassLayout.parseClass(ColumnEncodeOutput.class).instanceSize()); @@ -337,14 +346,15 @@ private static class ColumnEncoder private final SliceOutput lengthOutput = new DynamicSliceOutput(512); - private CompressedSliceOutput output; + private MemoryCompressedSliceOutput output; private boolean columnClosed; - public ColumnEncoder(ColumnEncoding columnEncoding, RcFileCompressor compressor) + public ColumnEncoder(ColumnEncoding columnEncoding, Optional codec) + throws IOException { this.columnEncoding = columnEncoding; - this.output = compressor.createCompressedSliceOutput((int) MIN_BUFFER_SIZE.toBytes(), (int) MAX_BUFFER_SIZE.toBytes()); + this.output = createMemoryCompressedSliceOutput(codec); this.encodeOutput = new ColumnEncodeOutput(lengthOutput, output); } diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/TimestampHolder.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/TimestampHolder.java similarity index 95% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/TimestampHolder.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/TimestampHolder.java index c770ec18a18e..cb0f38341984 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/TimestampHolder.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/TimestampHolder.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import io.trino.spi.block.Block; import io.trino.spi.type.LongTimestamp; @@ -33,7 +33,7 @@ public final class TimestampHolder private final long seconds; private final int nanosOfSecond; - public TimestampHolder(long epochMicros, int picosOfMicro) + private TimestampHolder(long epochMicros, int picosOfMicro) { this.seconds = floorDiv(epochMicros, MICROSECONDS_PER_SECOND); long picosOfSecond = (long) floorMod(epochMicros, MICROSECONDS_PER_SECOND) * PICOSECONDS_PER_MICROSECOND + picosOfMicro; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/ValidationHash.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/ValidationHash.java similarity index 98% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/ValidationHash.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/ValidationHash.java index 2f90bf94dfbd..dd1f992b0fc5 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/ValidationHash.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/ValidationHash.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
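[Editor's note] One convention worth calling out, since it appears in both the reader and writer hunks above: airlift Slice outputs write little-endian, while the RCFile container follows Hadoop's big-endian DataOutput layout, so every fixed-width int passes through Integer.reverseBytes on the way in and out. A self-contained illustration (not code from this change):

public class EndiannessSketch
{
    public static void main(String[] args)
    {
        int keySectionSize = 1234;
        // what the little-endian Slice output must be handed so the bytes land big-endian on disk
        int onDisk = Integer.reverseBytes(keySectionSize);
        // the reader applies the same transform to recover the value (see advance() above)
        assert Integer.reverseBytes(onDisk) == keySectionSize;
    }
}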
*/ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import io.trino.spi.block.Block; import io.trino.spi.function.InvocationConvention; @@ -59,7 +59,7 @@ class ValidationHash } } - // This should really come from the environment, but there is not good way to get a value here + // This should really come from the environment, but there is no good way to get a value here private static final TypeOperators VALIDATION_TYPE_OPERATORS_CACHE = new TypeOperators(); public static ValidationHash createValidationHash(Type type) diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BinaryColumnEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BinaryColumnEncoding.java similarity index 91% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BinaryColumnEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BinaryColumnEncoding.java index 2ef66e228c70..c00fc5bfce3c 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BinaryColumnEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BinaryColumnEncoding.java @@ -11,11 +11,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnEncoding; +import io.trino.hive.formats.rcfile.ColumnEncoding; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BinaryEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BinaryEncoding.java similarity index 87% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BinaryEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BinaryEncoding.java index 1e4033f5a258..7cd095afd296 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BinaryEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BinaryEncoding.java @@ -11,19 +11,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; -import static io.trino.rcfile.RcFileDecoderUtils.decodeVIntSize; -import static io.trino.rcfile.RcFileDecoderUtils.readVInt; -import static io.trino.rcfile.RcFileDecoderUtils.writeVInt; +import static io.trino.hive.formats.ReadWriteUtils.decodeVIntSize; +import static io.trino.hive.formats.ReadWriteUtils.readVInt; +import static io.trino.hive.formats.ReadWriteUtils.writeVInt; import static java.lang.Math.toIntExact; public class BinaryEncoding @@ -55,7 +55,7 @@ public void encodeColumn(Block block, SliceOutput output, EncodeOutput encodeOut public void encodeValueInto(Block block, int position, SliceOutput output) { Slice slice = type.getSlice(block, position); - // Note binary nested in complex structures do no use the empty marker. 
+ // Note binary nested in complex structures do not use the empty marker. // Therefore, empty VARBINARY values are ok. writeVInt(output, slice.length()); output.writeBytes(slice); diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BinaryRcFileEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BinaryRcFileEncoding.java similarity index 93% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BinaryRcFileEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BinaryRcFileEncoding.java index ac085212816f..60fd3760b832 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BinaryRcFileEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BinaryRcFileEncoding.java @@ -11,10 +11,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; -import io.trino.rcfile.ColumnEncoding; -import io.trino.rcfile.RcFileEncoding; +import io.trino.hive.formats.rcfile.ColumnEncoding; +import io.trino.hive.formats.rcfile.RcFileEncoding; import io.trino.spi.type.TimestampType; import io.trino.spi.type.Type; import org.joda.time.DateTimeZone; @@ -127,7 +127,7 @@ public ColumnEncoding structEncoding(Type type, List fieldEncodi return new StructEncoding( type, fieldEncodings.stream() - .map(field -> (BinaryColumnEncoding) field) + .map(BinaryColumnEncoding.class::cast) .collect(Collectors.toList())); } } diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BlockEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BlockEncoding.java similarity index 95% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BlockEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BlockEncoding.java index 0327dfc39852..4f5e329a148c 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BlockEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BlockEncoding.java @@ -11,13 +11,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.DynamicSliceOutput; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BooleanEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BooleanEncoding.java similarity index 94% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BooleanEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BooleanEncoding.java index fa1b0e3fb9df..04f1e2a6bec9 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/BooleanEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/BooleanEncoding.java @@ -11,12 +11,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
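[Editor's note] A hedged round-trip sketch of the ReadWriteUtils vint helpers that these binary encodings now import. Only overloads that appear elsewhere in this diff are used: writeVInt(SliceOutput, int), readVInt(Slice, int), and decodeVIntSize(byte). The single-byte range of the Hadoop vint format is [-112, 127]; larger values take a size byte plus payload bytes.

import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slice;
import io.trino.hive.formats.ReadWriteUtils;

public class VIntRoundTripSketch
{
    public static void main(String[] args)
    {
        DynamicSliceOutput output = new DynamicSliceOutput(16);
        ReadWriteUtils.writeVInt(output, 300); // 300 is outside [-112, 127], so this takes multiple bytes
        Slice slice = output.slice();

        // the first byte alone tells us how wide the encoded value is
        int encodedSize = ReadWriteUtils.decodeVIntSize(slice.getByte(0));
        long value = ReadWriteUtils.readVInt(slice, 0);

        assert value == 300 && encodedSize == slice.length();
    }
}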
*/ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/ByteEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/ByteEncoding.java similarity index 94% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/ByteEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/ByteEncoding.java index 4e78a098d6cf..d41dc44c58ee 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/ByteEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/ByteEncoding.java @@ -11,12 +11,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/DateEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/DateEncoding.java similarity index 88% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/DateEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/DateEncoding.java index 78f0528239e1..3b872c313d9d 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/DateEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/DateEncoding.java @@ -11,19 +11,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; -import static io.trino.rcfile.RcFileDecoderUtils.decodeVIntSize; -import static io.trino.rcfile.RcFileDecoderUtils.readVInt; -import static io.trino.rcfile.RcFileDecoderUtils.writeVInt; +import static io.trino.hive.formats.ReadWriteUtils.decodeVIntSize; +import static io.trino.hive.formats.ReadWriteUtils.readVInt; +import static io.trino.hive.formats.ReadWriteUtils.writeVInt; import static java.lang.Math.toIntExact; public class DateEncoding diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/DecimalEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/DecimalEncoding.java similarity index 95% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/DecimalEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/DecimalEncoding.java index 4df1218e111a..cf2eddeaad7a 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/DecimalEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/DecimalEncoding.java @@ -11,13 +11,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; import io.airlift.slice.Slices; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.DecimalType; @@ -28,9 +28,9 @@ import java.math.BigInteger; import static com.google.common.base.Preconditions.checkState; -import static io.trino.rcfile.RcFileDecoderUtils.decodeVIntSize; -import static io.trino.rcfile.RcFileDecoderUtils.readVInt; -import static io.trino.rcfile.RcFileDecoderUtils.writeVInt; +import static io.trino.hive.formats.ReadWriteUtils.decodeVIntSize; +import static io.trino.hive.formats.ReadWriteUtils.readVInt; +import static io.trino.hive.formats.ReadWriteUtils.writeVInt; import static io.trino.spi.type.Decimals.rescale; import static java.lang.Math.toIntExact; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/DoubleEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/DoubleEncoding.java similarity index 95% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/DoubleEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/DoubleEncoding.java index 338b67b3939b..c0acf5f96343 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/DoubleEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/DoubleEncoding.java @@ -11,12 +11,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
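[Editor's note] The DateEncoding hunk above only moves imports, but the vint helpers it pulls in hint at the wire format: Hive's lazy binary representation stores a DATE as a vint count of days since 1970-01-01. A hedged sketch of writing one value in that shape (not code from this change):

import io.airlift.slice.DynamicSliceOutput;
import io.trino.hive.formats.ReadWriteUtils;

import java.time.LocalDate;

import static java.lang.Math.toIntExact;

public class DateEncodingSketch
{
    public static void main(String[] args)
    {
        DynamicSliceOutput output = new DynamicSliceOutput(8);
        // DATE on the wire: vint-encoded days since the epoch (here 2023-01-15 -> 19372)
        int days = toIntExact(LocalDate.of(2023, 1, 15).toEpochDay());
        ReadWriteUtils.writeVInt(output, days);
    }
}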
*/ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/FloatEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/FloatEncoding.java similarity index 95% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/FloatEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/FloatEncoding.java index b3ff029ed84f..d6dd88f6474c 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/FloatEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/FloatEncoding.java @@ -11,12 +11,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/ListEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/ListEncoding.java similarity index 88% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/ListEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/ListEncoding.java index 0cd8690ee523..ca455c4d0ae7 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/ListEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/ListEncoding.java @@ -11,17 +11,15 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; +import io.trino.hive.formats.ReadWriteUtils; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; -import static io.trino.rcfile.RcFileDecoderUtils.decodeVIntSize; -import static io.trino.rcfile.RcFileDecoderUtils.readVInt; -import static io.trino.rcfile.RcFileDecoderUtils.writeVInt; import static java.lang.Math.toIntExact; public class ListEncoding @@ -39,7 +37,7 @@ public ListEncoding(Type type, BinaryColumnEncoding elementEncoding) public void encodeValue(Block block, int position, SliceOutput output) { Block list = block.getObject(position, Block.class); - writeVInt(output, list.getPositionCount()); + ReadWriteUtils.writeVInt(output, list.getPositionCount()); // write null bits int nullByte = 0; @@ -66,8 +64,8 @@ public void encodeValue(Block block, int position, SliceOutput output) public void decodeValueInto(BlockBuilder builder, Slice slice, int offset, int length) { // entries in list - int entries = toIntExact(readVInt(slice, offset)); - offset += decodeVIntSize(slice.getByte(offset)); + int entries = toIntExact(ReadWriteUtils.readVInt(slice, offset)); + offset += ReadWriteUtils.decodeVIntSize(slice.getByte(offset)); // null bytes int nullByteCur = offset; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/LongEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/LongEncoding.java similarity index 78% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/LongEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/LongEncoding.java index 622bf0d5592d..c8f60434526f 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/LongEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/LongEncoding.java @@ -11,20 +11,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.ReadWriteUtils; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; -import static io.trino.rcfile.RcFileDecoderUtils.decodeVIntSize; -import static io.trino.rcfile.RcFileDecoderUtils.readVInt; -import static io.trino.rcfile.RcFileDecoderUtils.writeVLong; - public class LongEncoding implements BinaryColumnEncoding { @@ -40,7 +37,7 @@ public void encodeColumn(Block block, SliceOutput output, EncodeOutput encodeOut { for (int position = 0; position < block.getPositionCount(); position++) { if (!block.isNull(position)) { - writeVLong(output, type.getLong(block, position)); + ReadWriteUtils.writeVLong(output, type.getLong(block, position)); } encodeOutput.closeEntry(); } @@ -49,7 +46,7 @@ public void encodeColumn(Block block, SliceOutput output, EncodeOutput encodeOut @Override public void encodeValueInto(Block block, int position, SliceOutput output) { - writeVLong(output, type.getLong(block, position)); + ReadWriteUtils.writeVLong(output, type.getLong(block, position)); } @Override @@ -66,7 +63,7 @@ public Block decodeColumn(ColumnData columnData) builder.appendNull(); } else { - type.writeLong(builder, readVInt(slice, offset, length)); + type.writeLong(builder, ReadWriteUtils.readVInt(slice, offset, length)); } } return builder.build(); @@ -81,12 +78,12 @@ public int getValueOffset(Slice slice, int offset) @Override public int getValueLength(Slice slice, int offset) { - return decodeVIntSize(slice, offset); + return ReadWriteUtils.decodeVIntSize(slice, offset); } @Override public void decodeValueInto(BlockBuilder builder, Slice slice, int offset, int length) { - type.writeLong(builder, readVInt(slice, offset, length)); + type.writeLong(builder, ReadWriteUtils.readVInt(slice, offset, length)); } } diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/MapEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/MapEncoding.java similarity index 92% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/MapEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/MapEncoding.java index 88c049dfa31a..6886db230e5c 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/MapEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/MapEncoding.java @@ -11,19 +11,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
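// A minimal sketch of the variable-length integer format behind the
// ReadWriteUtils.writeVLong/readVInt/decodeVIntSize calls above, assuming the
// helpers follow the Hadoop WritableUtils convention used by RCFile; the
// names below are illustrative, not the actual Trino API (uses
// io.airlift.slice.SliceOutput, imported above).
static void writeVLongSketch(SliceOutput output, long value)
{
    if (value >= -112 && value <= 127) {
        output.writeByte((byte) value); // small values occupy a single byte
        return;
    }
    int firstByte = -112;
    if (value < 0) {
        value = ~value;    // store the complement; the range below -120 marks negatives
        firstByte = -120;
    }
    for (long tmp = value; tmp != 0; tmp >>>= 8) {
        firstByte--;       // the first byte encodes both sign and payload length
    }
    output.writeByte((byte) firstByte);
    int byteCount = (firstByte < -120) ? -(firstByte + 120) : -(firstByte + 112);
    for (int i = byteCount; i != 0; i--) {
        output.writeByte((byte) (value >>> ((i - 1) * 8))); // big-endian payload
    }
}

// Total size in bytes of a VInt, recovered from its first byte alone.
static int decodeVIntSizeSketch(byte firstByte)
{
    if (firstByte >= -112) {
        return 1;
    }
    return (firstByte < -120) ? (-119 - firstByte) : (-111 - firstByte);
}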
*/ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; +import io.trino.hive.formats.ReadWriteUtils; import io.trino.spi.StandardErrorCode; import io.trino.spi.TrinoException; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; -import static io.trino.rcfile.RcFileDecoderUtils.decodeVIntSize; -import static io.trino.rcfile.RcFileDecoderUtils.readVInt; -import static io.trino.rcfile.RcFileDecoderUtils.writeVInt; import static java.lang.Math.toIntExact; public class MapEncoding @@ -45,7 +43,7 @@ public void encodeValue(Block block, int position, SliceOutput output) Block map = block.getObject(position, Block.class); // write entry count - writeVInt(output, map.getPositionCount() / 2); + ReadWriteUtils.writeVInt(output, map.getPositionCount() / 2); // write null bits int nullByte = 0b0101_0101; @@ -86,8 +84,8 @@ public void encodeValue(Block block, int position, SliceOutput output) public void decodeValueInto(BlockBuilder builder, Slice slice, int offset, int length) { // entries in list - int entries = toIntExact(readVInt(slice, offset)); - offset += decodeVIntSize(slice.getByte(offset)); + int entries = toIntExact(ReadWriteUtils.readVInt(slice, offset)); + offset += ReadWriteUtils.decodeVIntSize(slice.getByte(offset)); // null bytes int nullByteCur = offset; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/ShortEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/ShortEncoding.java similarity index 91% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/ShortEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/ShortEncoding.java index 7269e942afd4..d3b8a2ccecbc 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/ShortEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/ShortEncoding.java @@ -11,12 +11,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
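// A minimal sketch of the null-bit packing used by the binary list and map
// encodings above: one bit per entry, least-significant bit first, flushed
// every eight entries. MapEncoding pre-sets alternating bits (0b0101_0101),
// apparently because map keys are never null. Illustrative only; the real
// logic lives inline in encodeValue/decodeValueInto.
static void writeNullBits(SliceOutput output, boolean[] isNull)
{
    int nullByte = 0;
    for (int i = 0; i < isNull.length; i++) {
        if (!isNull[i]) {
            nullByte |= 1 << (i % 8);   // a set bit means the entry is present
        }
        if (i % 8 == 7 || i == isNull.length - 1) {
            output.writeByte(nullByte); // flush a full (or final partial) byte
            nullByte = 0;
        }
    }
}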
*/ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; @@ -62,7 +62,7 @@ public Block decodeColumn(ColumnData columnData) int length = columnData.getLength(i); if (length != 0) { checkState(length == SIZE_OF_SHORT, "Short should be 2 bytes"); - type.writeLong(builder, (long) Short.reverseBytes(slice.getShort(columnData.getOffset(i)))); + type.writeLong(builder, Short.reverseBytes(slice.getShort(columnData.getOffset(i)))); } else { builder.appendNull(); diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/StringEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/StringEncoding.java similarity index 84% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/StringEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/StringEncoding.java index edb2b387caba..eebf717aa4b9 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/StringEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/StringEncoding.java @@ -11,21 +11,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; import static io.airlift.slice.Slices.EMPTY_SLICE; -import static io.trino.rcfile.RcFileDecoderUtils.calculateTruncationLength; -import static io.trino.rcfile.RcFileDecoderUtils.decodeVIntSize; -import static io.trino.rcfile.RcFileDecoderUtils.readVInt; -import static io.trino.rcfile.RcFileDecoderUtils.writeVInt; +import static io.trino.hive.formats.ReadWriteUtils.calculateTruncationLength; +import static io.trino.hive.formats.ReadWriteUtils.decodeVIntSize; +import static io.trino.hive.formats.ReadWriteUtils.readVInt; +import static io.trino.hive.formats.ReadWriteUtils.writeVInt; import static java.lang.Math.toIntExact; public class StringEncoding @@ -61,7 +61,7 @@ public void encodeColumn(Block block, SliceOutput output, EncodeOutput encodeOut public void encodeValueInto(Block block, int position, SliceOutput output) { Slice slice = type.getSlice(block, position); - // Note strings nested in complex structures do no use the empty string marker + // Note strings nested in complex structures do not use the empty string marker writeVInt(output, slice.length()); output.writeBytes(slice); } @@ -107,7 +107,7 @@ public int getValueLength(Slice slice, int offset) @Override public void decodeValueInto(BlockBuilder builder, Slice slice, int offset, int length) { - // Note strings nested in complex structures do no use the empty string marker + // Note strings nested in complex structures do not use the empty string marker length = calculateTruncationLength(type, slice, offset, length); type.writeSlice(builder, slice, offset, length); } diff 
--git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/StructEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/StructEncoding.java similarity index 96% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/StructEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/StructEncoding.java index 9f7d48145fc9..1532be23fad9 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/StructEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/StructEncoding.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import com.google.common.collect.ImmutableList; import io.airlift.slice.Slice; @@ -89,8 +89,7 @@ public void decodeValueInto(BlockBuilder builder, Slice slice, int offset, int l fieldId++; } - // Some times a struct does not have all fields written - // so we fill with nulls + // Sometimes a struct does not have all fields written, so we fill with nulls while (fieldId < structFields.size()) { rowBuilder.appendNull(); fieldId++; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/TimestampEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/TimestampEncoding.java similarity index 87% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/TimestampEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/TimestampEncoding.java index 2f7d7134b902..89d7b3fb6de4 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/binary/TimestampEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/binary/TimestampEncoding.java @@ -11,15 +11,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.rcfile.binary; +package io.trino.hive.formats.rcfile.binary; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; +import io.trino.hive.formats.ReadWriteUtils; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.TimestampHolder; import io.trino.plugin.base.type.DecodedTimestamp; import io.trino.plugin.base.type.TrinoTimestampEncoder; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; -import io.trino.rcfile.TimestampHolder; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.TimestampType; @@ -29,10 +30,6 @@ import static io.airlift.slice.SizeOf.SIZE_OF_INT; import static io.trino.plugin.base.type.TrinoTimestampEncoderFactory.createTimestampEncoder; -import static io.trino.rcfile.RcFileDecoderUtils.decodeVIntSize; -import static io.trino.rcfile.RcFileDecoderUtils.isNegativeVInt; -import static io.trino.rcfile.RcFileDecoderUtils.readVInt; -import static io.trino.rcfile.RcFileDecoderUtils.writeVInt; import static io.trino.spi.type.Timestamps.MILLISECONDS_PER_SECOND; import static java.util.Objects.requireNonNull; @@ -100,12 +97,12 @@ public int getValueLength(Slice slice, int offset) { int length = 4; if (hasNanosVInt(slice.getByte(offset))) { - int nanosVintLength = decodeVIntSize(slice, offset + 4); + int nanosVintLength = ReadWriteUtils.decodeVIntSize(slice, offset + 4); length += nanosVintLength; // is there extra data for "seconds" - if (isNegativeVInt(slice, offset + 4)) { - length += decodeVIntSize(slice, offset + 4 + nanosVintLength); + if (ReadWriteUtils.isNegativeVInt(slice, offset + 4)) { + length += ReadWriteUtils.decodeVIntSize(slice, offset + 4 + nanosVintLength); } } return length; @@ -123,7 +120,7 @@ private static boolean hasNanosVInt(byte b) return (b >> 7) != 0; } - private DecodedTimestamp getTimestamp(Slice slice, int offset) + private static DecodedTimestamp getTimestamp(Slice slice, int offset) { // read seconds (low 32 bits) int lowest31BitsOfSecondsAndFlag = Integer.reverseBytes(slice.getInt(offset)); @@ -133,18 +130,18 @@ private DecodedTimestamp getTimestamp(Slice slice, int offset) int nanos = 0; if (lowest31BitsOfSecondsAndFlag < 0) { // read nanos - // this is an inline version of readVint so it can be stitched together + // this is an inline version of readVint, so it can be stitched together with the code to read the seconds high bits below byte nanosFirstByte = slice.getByte(offset); - int nanosLength = decodeVIntSize(nanosFirstByte); - nanos = (int) readVInt(slice, offset, nanosLength); + int nanosLength = ReadWriteUtils.decodeVIntSize(nanosFirstByte); + nanos = (int) ReadWriteUtils.readVInt(slice, offset, nanosLength); nanos = decodeNanos(nanos); // read seconds (high 32 bits) - if (isNegativeVInt(nanosFirstByte)) { + if (ReadWriteUtils.isNegativeVInt(nanosFirstByte)) { // We compose the seconds field from two parts. The lowest 31 bits come from the first four // bytes. The higher-order bits come from the second VInt that follows the nanos field. - long highBits = readVInt(slice, offset + nanosLength); + long highBits = ReadWriteUtils.readVInt(slice, offset + nanosLength); seconds |= (highBits << 31); } } @@ -208,12 +205,12 @@ private static void writeTimestamp(long seconds, int nanos, SliceOutput output) if (hasSecondsHigh32 || nanosReversed != 0) { // The sign of the reversed-nanoseconds field indicates that there is a second VInt present int value = hasSecondsHigh32 ?
~nanosReversed : nanosReversed; - writeVInt(output, value); + ReadWriteUtils.writeVInt(output, value); } if (hasSecondsHigh32) { int secondsHigh32 = (int) (seconds >> 31); - writeVInt(output, secondsHigh32); + ReadWriteUtils.writeVInt(output, secondsHigh32); } } diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/BinaryEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/BinaryEncoding.java similarity index 95% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/BinaryEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/BinaryEncoding.java index dba5c0912aea..70a201e95809 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/BinaryEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/BinaryEncoding.java @@ -11,13 +11,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; import io.airlift.slice.Slices; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/BlockEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/BlockEncoding.java similarity index 93% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/BlockEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/BlockEncoding.java index e4c9b486dd90..49ebb57f6ca4 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/BlockEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/BlockEncoding.java @@ -11,13 +11,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; -import io.trino.rcfile.RcFileCorruptionException; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.RcFileCorruptionException; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/BooleanEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/BooleanEncoding.java similarity index 96% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/BooleanEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/BooleanEncoding.java index b540ff0b39d0..16fcc8d3db27 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/BooleanEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/BooleanEncoding.java @@ -11,13 +11,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
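// The wire layout that the binary TimestampEncoding above reads and writes,
// summarized from the code in these hunks (Hive's TimestampWritable defines
// the authoritative format):
//
//   bytes 0..3      Integer.reverseBytes(...) of the low 31 bits of the
//                   seconds; the sign bit doubles as a "nanos VInt follows" flag
//   VInt, optional  the nanoseconds, in the reversed form handled by
//                   decodeNanos; written as ~value when high seconds bits also
//                   follow, so its sign signals the presence of the second VInt
//   VInt, optional  the remaining seconds bits (seconds >> 31), OR-ed back in
//                   on the read path via seconds |= (highBits << 31)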
*/ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; import io.airlift.slice.Slices; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/DateEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/DateEncoding.java similarity index 96% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/DateEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/DateEncoding.java index 8ede2e33cd6d..e1bb3f9628d8 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/DateEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/DateEncoding.java @@ -11,12 +11,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; @@ -88,7 +88,6 @@ public Block decodeColumn(ColumnData columnData) builder.appendNull(); } else { - //noinspection deprecation type.writeLong(builder, parseDate(slice, offset, length)); } } diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/DecimalEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/DecimalEncoding.java similarity index 97% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/DecimalEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/DecimalEncoding.java index 4fff1d62ea1b..0d1b254c5243 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/DecimalEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/DecimalEncoding.java @@ -11,12 +11,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.DecimalType; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/DoubleEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/DoubleEncoding.java similarity index 94% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/DoubleEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/DoubleEncoding.java index 18eda352b42c..425349c6facb 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/DoubleEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/DoubleEncoding.java @@ -11,13 +11,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; -import io.trino.rcfile.RcFileCorruptionException; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.RcFileCorruptionException; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/FloatEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/FloatEncoding.java similarity index 94% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/FloatEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/FloatEncoding.java index 6f64f52d5503..0fba42219358 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/FloatEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/FloatEncoding.java @@ -11,13 +11,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; -import io.trino.rcfile.RcFileCorruptionException; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.RcFileCorruptionException; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/ListEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/ListEncoding.java similarity index 96% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/ListEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/ListEncoding.java index 140a445633ab..87f0800ac6ce 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/ListEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/ListEncoding.java @@ -11,11 +11,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.RcFileCorruptionException; +import io.trino.hive.formats.rcfile.RcFileCorruptionException; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/LongEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/LongEncoding.java similarity index 96% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/LongEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/LongEncoding.java index 280064a3f496..e9253fa8bd5e 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/LongEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/LongEncoding.java @@ -11,13 +11,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; import io.airlift.slice.Slices; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/MapEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/MapEncoding.java similarity index 97% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/MapEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/MapEncoding.java index ed5731e47fc3..9000401c8b99 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/MapEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/MapEncoding.java @@ -11,11 +11,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.RcFileCorruptionException; +import io.trino.hive.formats.rcfile.RcFileCorruptionException; import io.trino.spi.StandardErrorCode; import io.trino.spi.TrinoException; import io.trino.spi.block.Block; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/StringEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/StringEncoding.java similarity index 95% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/StringEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/StringEncoding.java index be7a1a8bf427..3fc5fa753638 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/StringEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/StringEncoding.java @@ -11,18 +11,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; import io.airlift.slice.Slices; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; -import static io.trino.rcfile.RcFileDecoderUtils.calculateTruncationLength; +import static io.trino.hive.formats.ReadWriteUtils.calculateTruncationLength; public class StringEncoding implements TextColumnEncoding diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/StructEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/StructEncoding.java similarity index 97% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/StructEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/StructEncoding.java index d70941058fc8..560cbdfda5c8 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/StructEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/StructEncoding.java @@ -11,11 +11,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.RcFileCorruptionException; +import io.trino.hive.formats.rcfile.RcFileCorruptionException; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/TextColumnEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/TextColumnEncoding.java similarity index 86% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/TextColumnEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/TextColumnEncoding.java index 91d2890bead5..4f3da2c52ac0 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/TextColumnEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/TextColumnEncoding.java @@ -11,12 +11,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.trino.rcfile.ColumnEncoding; -import io.trino.rcfile.RcFileCorruptionException; +import io.trino.hive.formats.rcfile.ColumnEncoding; +import io.trino.hive.formats.rcfile.RcFileCorruptionException; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/TextRcFileEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/TextRcFileEncoding.java similarity index 97% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/TextRcFileEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/TextRcFileEncoding.java index 8186ee23d366..4eb1781ab642 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/TextRcFileEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/TextRcFileEncoding.java @@ -11,12 +11,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.Slices; -import io.trino.rcfile.ColumnEncoding; -import io.trino.rcfile.RcFileEncoding; +import io.trino.hive.formats.rcfile.ColumnEncoding; +import io.trino.hive.formats.rcfile.RcFileEncoding; import io.trino.spi.type.TimestampType; import io.trino.spi.type.Type; diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/TimestampEncoding.java b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/TimestampEncoding.java similarity index 96% rename from lib/trino-rcfile/src/main/java/io/trino/rcfile/text/TimestampEncoding.java rename to lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/TimestampEncoding.java index 2230fdf7bb6b..a4630a42b11b 100644 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/text/TimestampEncoding.java +++ b/lib/trino-hive-formats/src/main/java/io/trino/hive/formats/rcfile/text/TimestampEncoding.java @@ -11,15 +11,15 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.rcfile.text; +package io.trino.hive.formats.rcfile.text; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; +import io.trino.hive.formats.rcfile.ColumnData; +import io.trino.hive.formats.rcfile.EncodeOutput; +import io.trino.hive.formats.rcfile.TimestampHolder; import io.trino.plugin.base.type.DecodedTimestamp; import io.trino.plugin.base.type.TrinoTimestampEncoder; -import io.trino.rcfile.ColumnData; -import io.trino.rcfile.EncodeOutput; -import io.trino.rcfile.TimestampHolder; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.type.TimestampType; diff --git a/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/TestDataOutputStream.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/TestDataOutputStream.java new file mode 100644 index 000000000000..290e5a709714 --- /dev/null +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/TestDataOutputStream.java @@ -0,0 +1,234 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.hive.formats; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.openjdk.jol.info.ClassLayout; +import org.testng.annotations.Test; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.concurrent.ThreadLocalRandom; + +import static org.testng.Assert.assertEquals; + +public class TestDataOutputStream +{ + @Test + public void testEncodingBoolean() + throws Exception + { + assertEncoding(sliceOutput -> sliceOutput.writeBoolean(true), + new byte[] {1}); + assertEncoding(sliceOutput -> sliceOutput.writeBoolean(false), + new byte[] {0}); + } + + @Test + public void testEncodingByte() + throws Exception + { + assertEncoding(sliceOutput -> sliceOutput.writeByte(92), + new byte[] {92}); + assertEncoding(sliceOutput -> sliceOutput.writeByte(156), + new byte[] {-100}); + assertEncoding(sliceOutput -> sliceOutput.writeByte(-17), + new byte[] {-17}); + + assertEncoding(sliceOutput -> sliceOutput.write(92), + new byte[] {92}); + assertEncoding(sliceOutput -> sliceOutput.write(156), + new byte[] {-100}); + assertEncoding(sliceOutput -> sliceOutput.write(-17), + new byte[] {-17}); + } + + @Test + public void testEncodingShort() + throws Exception + { + assertEncoding(sliceOutput -> sliceOutput.writeShort(23661), + new byte[] {109, 92}); + assertEncoding(sliceOutput -> sliceOutput.writeShort(40045), + new byte[] {109, -100}); + assertEncoding(sliceOutput -> sliceOutput.writeShort(-27188), + new byte[] {-52, -107}); + } + + @Test + public void testEncodingInteger() + throws Exception + { + assertEncoding(sliceOutput -> sliceOutput.writeInt(978017389), + new byte[] {109, 92, 75, 58}); + assertEncoding(sliceOutput -> sliceOutput.writeInt(-7813904), + new byte[] {-16, -60, -120, -1}); + } + + @Test + public void testEncodingLong() + throws Exception + { + assertEncoding(sliceOutput -> sliceOutput.writeLong(9214541725452766769L), + new byte[] {49, -114, -96, -23, -32, 
-96, -32, 127}); + assertEncoding(sliceOutput -> sliceOutput.writeLong(-1184314682315678611L), + new byte[] {109, 92, 75, 58, 18, 120, -112, -17}); + } + + @Test + public void testEncodingDouble() + throws Exception + { + assertEncoding(sliceOutput -> sliceOutput.writeDouble(3.14), + new byte[] {31, -123, -21, 81, -72, 30, 9, 64}); + assertEncoding(sliceOutput -> sliceOutput.writeDouble(Double.NaN), + new byte[] {0, 0, 0, 0, 0, 0, -8, 127}); + assertEncoding(sliceOutput -> sliceOutput.writeDouble(Double.NEGATIVE_INFINITY), + new byte[] {0, 0, 0, 0, 0, 0, -16, -1}); + assertEncoding(sliceOutput -> sliceOutput.writeDouble(Double.POSITIVE_INFINITY), + new byte[] {0, 0, 0, 0, 0, 0, -16, 127}); + } + + @Test + public void testEncodingFloat() + throws Exception + { + assertEncoding(sliceOutput -> sliceOutput.writeFloat(3.14f), + new byte[] {-61, -11, 72, 64}); + assertEncoding(sliceOutput -> sliceOutput.writeFloat(Float.NaN), + new byte[] {0, 0, -64, 127}); + assertEncoding(sliceOutput -> sliceOutput.writeFloat(Float.NEGATIVE_INFINITY), + new byte[] {0, 0, -128, -1}); + assertEncoding(sliceOutput -> sliceOutput.writeFloat(Float.POSITIVE_INFINITY), + new byte[] {0, 0, -128, 127}); + } + + @Test + public void testEncodingBytes() + throws Exception + { + byte[] data = new byte[18000]; + ThreadLocalRandom.current().nextBytes(data); + + assertEncoding(sliceOutput -> sliceOutput.write(data), data); + assertEncoding(sliceOutput -> sliceOutput.write(data, 0, 0), Arrays.copyOfRange(data, 0, 0)); + assertEncoding(sliceOutput -> sliceOutput.write(data, 0, 3), Arrays.copyOfRange(data, 0, 3)); + assertEncoding(sliceOutput -> sliceOutput.write(data, 0, 370), Arrays.copyOfRange(data, 0, 370)); + assertEncoding(sliceOutput -> sliceOutput.write(data, 0, 4095), Arrays.copyOfRange(data, 0, 4095)); + assertEncoding(sliceOutput -> sliceOutput.write(data, 0, 4096), Arrays.copyOfRange(data, 0, 4096)); + assertEncoding(sliceOutput -> sliceOutput.write(data, 0, 12348), Arrays.copyOfRange(data, 0, 12348)); + assertEncoding(sliceOutput -> sliceOutput.write(data, 0, 16384), Arrays.copyOfRange(data, 0, 16384)); + assertEncoding(sliceOutput -> sliceOutput.write(data, 0, 18000), Arrays.copyOfRange(data, 0, 18000)); + } + + @Test + public void testEncodingSlice() + throws Exception + { + byte[] data = new byte[18000]; + ThreadLocalRandom.current().nextBytes(data); + Slice slice = Slices.wrappedBuffer(data); + + assertEncoding(sliceOutput -> sliceOutput.write(slice), data); + assertEncoding(sliceOutput -> sliceOutput.write(slice, 0, 0), Arrays.copyOfRange(data, 0, 0)); + assertEncoding(sliceOutput -> sliceOutput.write(slice, 0, 3), Arrays.copyOfRange(data, 0, 3)); + assertEncoding(sliceOutput -> sliceOutput.write(slice, 0, 370), Arrays.copyOfRange(data, 0, 370)); + assertEncoding(sliceOutput -> sliceOutput.write(slice, 0, 4095), Arrays.copyOfRange(data, 0, 4095)); + assertEncoding(sliceOutput -> sliceOutput.write(slice, 0, 4096), Arrays.copyOfRange(data, 0, 4096)); + assertEncoding(sliceOutput -> sliceOutput.write(slice, 0, 12348), Arrays.copyOfRange(data, 0, 12348)); + assertEncoding(sliceOutput -> sliceOutput.write(slice, 0, 16384), Arrays.copyOfRange(data, 0, 16384)); + assertEncoding(sliceOutput -> sliceOutput.write(slice, 0, 18000), Arrays.copyOfRange(data, 0, 18000)); + } + + @Test + public void testWriteZero() + throws Exception + { + assertEncoding(sliceOutput -> sliceOutput.writeZero(0), new byte[0]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(1), new byte[1]); + assertEncoding(sliceOutput -> 
sliceOutput.writeZero(2), new byte[2]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(3), new byte[3]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(4), new byte[4]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(6), new byte[6]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(7), new byte[7]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(8), new byte[8]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(9), new byte[9]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(16), new byte[16]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(22), new byte[22]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(227), new byte[227]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(4227), new byte[4227]); + assertEncoding(sliceOutput -> sliceOutput.writeZero(18349), new byte[18349]); + } + + @Test + public void testRetainedSize() + throws IOException + { + int bufferSize = 1337; + DataOutputStream output = new DataOutputStream(new ByteArrayOutputStream(0), bufferSize); + + long originalRetainedSize = output.getRetainedSize(); + assertEquals(originalRetainedSize, ClassLayout.parseClass(DataOutputStream.class).instanceSize() + Slices.allocate(bufferSize).getRetainedSize()); + output.writeLong(0); + output.writeShort(0); + assertEquals(output.getRetainedSize(), originalRetainedSize); + } + + /** + * Asserting different offsets of operations. + */ + private static void assertEncoding(DataOutputTester operations, byte... expected) + throws IOException + { + assertEncoding(operations, 0, expected); + assertEncoding(operations, 1, expected); + assertEncoding(operations, 2, expected); + assertEncoding(operations, 3, expected); + assertEncoding(operations, 4, expected); + assertEncoding(operations, 7, expected); + assertEncoding(operations, 8, expected); + assertEncoding(operations, 16, expected); + assertEncoding(operations, 511, expected); + assertEncoding(operations, 12000, expected); + assertEncoding(operations, 13000, expected); + assertEncoding(operations, 16000, expected); + assertEncoding(operations, 16380, expected); + assertEncoding(operations, 16383, expected); + assertEncoding(operations, 16384, expected); + assertEncoding(operations, 18349, expected); + } + + private static void assertEncoding(DataOutputTester operations, int offset, byte... output) + throws IOException + { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + try (DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream, 16384)) { + dataOutputStream.writeZero(offset); + operations.test(dataOutputStream); + assertEquals(dataOutputStream.longSize(), offset + output.length); + } + + byte[] expected = new byte[offset + output.length]; + System.arraycopy(output, 0, expected, offset, output.length); + assertEquals(byteArrayOutputStream.toByteArray(), expected); + } + + private interface DataOutputTester + { + void test(DataOutputStream dataOutputStream) + throws IOException; + } +} diff --git a/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/TestDataSeekableInputStream.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/TestDataSeekableInputStream.java new file mode 100644 index 000000000000..8b91d6c820a3 --- /dev/null +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/TestDataSeekableInputStream.java @@ -0,0 +1,733 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.hive.formats; + +import com.google.common.collect.ImmutableList; +import com.google.common.io.ByteSource; +import com.google.common.io.ByteStreams; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.trino.filesystem.memory.MemorySeekableInputStream; +import org.apache.iceberg.io.SeekableInputStream; +import org.openjdk.jol.info.ClassLayout; +import org.testng.annotations.Test; + +import java.io.ByteArrayOutputStream; +import java.io.EOFException; +import java.io.IOException; +import java.util.List; + +import static com.google.common.collect.Iterables.cycle; +import static io.airlift.slice.SizeOf.SIZE_OF_BYTE; +import static io.airlift.slice.SizeOf.SIZE_OF_DOUBLE; +import static io.airlift.slice.SizeOf.SIZE_OF_FLOAT; +import static io.airlift.slice.SizeOf.SIZE_OF_INT; +import static io.airlift.slice.SizeOf.SIZE_OF_LONG; +import static io.airlift.slice.SizeOf.SIZE_OF_SHORT; +import static io.airlift.slice.SizeOf.sizeOfByteArray; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + +@SuppressWarnings("resource") +public class TestDataSeekableInputStream +{ + private static final int BUFFER_SIZE = 129; + + private static final List<Integer> VARIABLE_READ_SIZES = ImmutableList.of( + 1, + 7, + 15, + BUFFER_SIZE - 1, + BUFFER_SIZE, + BUFFER_SIZE + 1, + BUFFER_SIZE + 13); + + @Test + public void testReadBoolean() + throws IOException + { + testDataInput(new DataInputTester(SIZE_OF_BYTE) + { + @Override + public void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.writeBoolean(valueIndex % 2 == 0); + } + + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + assertEquals(input.readBoolean(), valueIndex % 2 == 0); + } + }); + } + + @Test + public void testReadByte() + throws IOException + { + testDataInput(new DataInputTester(SIZE_OF_BYTE) + { + @Override + public void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.writeByte((byte) valueIndex); + } + + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + assertEquals(input.readByte(), (byte) valueIndex); + } + }); + } + + @Test + public void testRead() + throws IOException + { + testDataInput(new DataInputTester(SIZE_OF_BYTE) + { + @Override + public void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.writeByte((byte) valueIndex); + } + + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + assertEquals(input.read(), valueIndex & 0xFF); + } + + @Override + public void verifyReadOffEnd(DataSeekableInputStream input) + throws IOException + { + assertEquals(input.read(), -1); + } + }); + } + + @Test + public void testReadShort() + throws IOException + { + testDataInput(new DataInputTester(SIZE_OF_SHORT) + { + @Override + public void
loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.writeShort(valueIndex); + } + + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + assertEquals(input.readShort(), (short) valueIndex); + } + }); + } + + @Test + public void testReadUnsignedShort() + throws IOException + { + testDataInput(new DataInputTester(SIZE_OF_SHORT) + { + @Override + public void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.writeShort(valueIndex); + } + + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + assertEquals(input.readUnsignedShort(), valueIndex & 0xFFFF); + } + }); + } + + @Test + public void testReadInt() + throws IOException + { + testDataInput(new DataInputTester(SIZE_OF_INT) + { + @Override + public void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.writeInt(valueIndex); + } + + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + assertEquals(input.readInt(), valueIndex); + } + }); + } + + @Test + public void testUnsignedReadInt() + throws IOException + { + testDataInput(new DataInputTester(SIZE_OF_INT) + { + @Override + public void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.writeInt(valueIndex); + } + + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + assertEquals(input.readUnsignedInt(), valueIndex); + } + }); + } + + @Test + public void testReadLong() + throws IOException + { + testDataInput(new DataInputTester(SIZE_OF_LONG) + { + @Override + public void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.writeLong(valueIndex); + } + + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + assertEquals(input.readLong(), valueIndex); + } + }); + } + + @Test + public void testReadFloat() + throws IOException + { + testDataInput(new DataInputTester(SIZE_OF_FLOAT) + { + @Override + public void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.writeFloat(valueIndex + 0.12f); + } + + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + assertEquals(input.readFloat(), valueIndex + 0.12f); + } + }); + } + + @Test + public void testReadDouble() + throws IOException + { + testDataInput(new DataInputTester(SIZE_OF_DOUBLE) + { + @Override + public void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.writeDouble(valueIndex + 0.12); + } + + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + assertEquals(input.readDouble(), valueIndex + 0.12); + } + }); + } + + @Test + public void testSkip() + throws IOException + { + for (int readSize : VARIABLE_READ_SIZES) { + // skip without any reads + testDataInput(new SkipDataInputTester(readSize) + { + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + input.skip(valueSize()); + } + + @Override + public void verifyReadOffEnd(DataSeekableInputStream input) + throws IOException + { + assertEquals(input.skip(valueSize()), valueSize() - 1); + } + }); + testDataInput(new SkipDataInputTester(readSize) + { + @Override + public void verifyValue(DataSeekableInputStream
input, int valueIndex) + throws IOException + { + input.skipBytes(valueSize()); + } + + @Override + public void verifyReadOffEnd(DataSeekableInputStream input) + throws IOException + { + assertEquals(input.skip(valueSize()), valueSize() - 1); + } + }); + + // read when no data available to force buffering + testDataInput(new SkipDataInputTester(readSize) + { + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + int length = valueSize(); + while (length > 0) { + if (input.available() == 0) { + input.readByte(); + length--; + } + int skipSize = input.skipBytes(length); + length -= skipSize; + } + assertEquals(input.skip(0), 0); + } + }); + testDataInput(new SkipDataInputTester(readSize) + { + @Override + public void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + long length = valueSize(); + while (length > 0) { + if (input.available() == 0) { + input.readByte(); + length--; + } + long skipSize = input.skip(length); + length -= skipSize; + } + assertEquals(input.skip(0), 0); + } + }); + } + } + + @Test + public void testReadSlice() + throws IOException + { + for (int readSize : VARIABLE_READ_SIZES) { + testDataInput(new StringDataInputTester(readSize) + { + @Override + public String readActual(DataSeekableInputStream input) + throws IOException + { + return input.readSlice(valueSize()).toStringUtf8(); + } + }); + } + } + + @Test + public void testReadFully() + throws IOException + { + for (int readSize : VARIABLE_READ_SIZES) { + testDataInput(new StringDataInputTester(readSize) + { + @Override + public String readActual(DataSeekableInputStream input) + throws IOException + { + Slice slice = Slices.allocate(valueSize()); + input.readFully(slice); + return slice.toStringUtf8(); + } + }); + testDataInput(new StringDataInputTester(readSize) + { + @Override + public String readActual(DataSeekableInputStream input) + throws IOException + { + Slice slice = Slices.allocate(valueSize() + 10); + input.readFully(slice, 5, valueSize()); + return slice.slice(5, valueSize()).toStringUtf8(); + } + }); + testDataInput(new StringDataInputTester(readSize) + { + @Override + public String readActual(DataSeekableInputStream input) + throws IOException + { + byte[] bytes = new byte[valueSize()]; + input.readFully(bytes, 0, valueSize()); + return new String(bytes, 0, valueSize(), UTF_8); + } + }); + testDataInput(new StringDataInputTester(readSize) + { + @Override + public String readActual(DataSeekableInputStream input) + throws IOException + { + byte[] bytes = new byte[valueSize() + 10]; + input.readFully(bytes, 5, valueSize()); + return new String(bytes, 5, valueSize(), UTF_8); + } + }); + testDataInput(new StringDataInputTester(readSize) + { + @Override + public String readActual(DataSeekableInputStream input) + throws IOException + { + byte[] bytes = new byte[valueSize()]; + int bytesRead = input.read(bytes); + if (bytesRead == -1) { + throw new EOFException(); + } + assertTrue(bytesRead > 0, "Expected to read at least one byte"); + input.readFully(bytes, bytesRead, bytes.length - bytesRead); + return new String(bytes, 0, valueSize(), UTF_8); + } + }); + testDataInput(new StringDataInputTester(readSize) + { + @Override + public String readActual(DataSeekableInputStream input) + throws IOException + { + byte[] bytes = new byte[valueSize() + 10]; + ByteStreams.readFully(input, bytes, 5, valueSize()); + return new String(bytes, 5, valueSize(), UTF_8); + } + }); + testDataInput(new StringDataInputTester(readSize) + { + 
@Override + public String readActual(DataSeekableInputStream input) + throws IOException + { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + input.readFully(out, valueSize()); + return out.toString(UTF_8); + } + }); + } + } + + @Test + public void testEmptyInput() + throws Exception + { + DataSeekableInputStream input = createDataSeekableInputStream(new byte[0]); + assertEquals(input.getPos(), 0); + } + + @Test + public void testEmptyRead() + throws Exception + { + DataSeekableInputStream input = createDataSeekableInputStream(new byte[0]); + assertEquals(input.read(), -1); + } + + @Test(expectedExceptions = EOFException.class) + public void testReadByteBeyondEnd() + throws Exception + { + DataSeekableInputStream input = createDataSeekableInputStream(new byte[0]); + input.readByte(); + } + + @Test(expectedExceptions = EOFException.class) + public void testReadShortBeyondEnd() + throws Exception + { + DataSeekableInputStream input = createDataSeekableInputStream(new byte[1]); + input.readShort(); + } + + @Test(expectedExceptions = EOFException.class) + public void testReadIntBeyondEnd() + throws Exception + { + DataSeekableInputStream input = createDataSeekableInputStream(new byte[3]); + input.readInt(); + } + + @Test(expectedExceptions = EOFException.class) + public void testReadLongBeyondEnd() + throws Exception + { + DataSeekableInputStream input = createDataSeekableInputStream(new byte[7]); + input.readLong(); + } + + @Test + public void testEncodingBoolean() + throws Exception + { + assertTrue(createDataSeekableInputStream(new byte[] {1}).readBoolean()); + assertFalse(createDataSeekableInputStream(new byte[] {0}).readBoolean()); + } + + @Test + public void testEncodingByte() + throws Exception + { + assertEquals(createDataSeekableInputStream(new byte[] {92}).readByte(), 92); + assertEquals(createDataSeekableInputStream(new byte[] {-100}).readByte(), -100); + assertEquals(createDataSeekableInputStream(new byte[] {-17}).readByte(), -17); + + assertEquals(createDataSeekableInputStream(new byte[] {92}).readUnsignedByte(), 92); + assertEquals(createDataSeekableInputStream(new byte[] {-100}).readUnsignedByte(), 156); + assertEquals(createDataSeekableInputStream(new byte[] {-17}).readUnsignedByte(), 239); + } + + @Test + public void testEncodingShort() + throws Exception + { + assertEquals(createDataSeekableInputStream(new byte[] {109, 92}).readShort(), 23661); + assertEquals(createDataSeekableInputStream(new byte[] {109, -100}).readShort(), -25491); + assertEquals(createDataSeekableInputStream(new byte[] {-52, -107}).readShort(), -27188); + + assertEquals(createDataSeekableInputStream(new byte[] {109, -100}).readUnsignedShort(), 40045); + assertEquals(createDataSeekableInputStream(new byte[] {-52, -107}).readUnsignedShort(), 38348); + } + + @Test + public void testEncodingInteger() + throws Exception + { + assertEquals(createDataSeekableInputStream(new byte[] {109, 92, 75, 58}).readInt(), 978017389); + assertEquals(createDataSeekableInputStream(new byte[] {-16, -60, -120, -1}).readInt(), -7813904); + } + + @Test + public void testEncodingLong() + throws Exception + { + assertEquals(createDataSeekableInputStream(new byte[] {49, -114, -96, -23, -32, -96, -32, 127}).readLong(), 9214541725452766769L); + assertEquals(createDataSeekableInputStream(new byte[] {109, 92, 75, 58, 18, 120, -112, -17}).readLong(), -1184314682315678611L); + } + + @Test + public void testEncodingDouble() + throws Exception + { + assertEquals(createDataSeekableInputStream(new byte[] {31, -123, -21, 81, -72, 30, 
9, 64}).readDouble(), 3.14); + assertEquals(createDataSeekableInputStream(new byte[] {0, 0, 0, 0, 0, 0, -8, 127}).readDouble(), Double.NaN); + assertEquals(createDataSeekableInputStream(new byte[] {0, 0, 0, 0, 0, 0, -16, -1}).readDouble(), Double.NEGATIVE_INFINITY); + assertEquals(createDataSeekableInputStream(new byte[] {0, 0, 0, 0, 0, 0, -16, 127}).readDouble(), Double.POSITIVE_INFINITY); + } + + @Test + public void testEncodingFloat() + throws Exception + { + assertEquals(createDataSeekableInputStream(new byte[] {-61, -11, 72, 64}).readFloat(), 3.14f); + assertEquals(createDataSeekableInputStream(new byte[] {0, 0, -64, 127}).readFloat(), Float.NaN); + assertEquals(createDataSeekableInputStream(new byte[] {0, 0, -128, -1}).readFloat(), Float.NEGATIVE_INFINITY); + assertEquals(createDataSeekableInputStream(new byte[] {0, 0, -128, 127}).readFloat(), Float.POSITIVE_INFINITY); + } + + @Test + public void testRetainedSize() + { + int bufferSize = 1024; + SeekableInputStream inputStream = new MemorySeekableInputStream(Slices.wrappedBuffer(new byte[] {0, 1})); + DataSeekableInputStream input = new DataSeekableInputStream(inputStream, bufferSize); + assertEquals(input.getRetainedSize(), ClassLayout.parseClass(DataSeekableInputStream.class).instanceSize() + sizeOfByteArray(bufferSize)); + } + + private static void testDataInput(DataInputTester tester) + throws IOException + { + int size = (BUFFER_SIZE * 3) + 10; + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(size); + try (DataOutputStream output = new DataOutputStream(byteArrayOutputStream)) { + for (int i = 0; i < size / tester.valueSize(); i++) { + tester.loadValue(output, i); + } + } + byte[] bytes = byteArrayOutputStream.toByteArray(); + + testReadForward(tester, bytes); + testReadReverse(tester, bytes); + testReadOffEnd(tester, bytes); + } + + private static void testReadForward(DataInputTester tester, byte[] bytes) + throws IOException + { + DataSeekableInputStream input = createDataSeekableInputStream(bytes); + for (int i = 0; i < bytes.length / tester.valueSize(); i++) { + int position = i * tester.valueSize(); + assertEquals(input.getPos(), position); + tester.verifyValue(input, i); + } + } + + private static void testReadReverse(DataInputTester tester, byte[] bytes) + throws IOException + { + DataSeekableInputStream input = createDataSeekableInputStream(bytes); + for (int i = bytes.length / tester.valueSize() - 1; i >= 0; i--) { + int position = i * tester.valueSize(); + input.seek(position); + assertEquals(input.getPos(), position); + tester.verifyValue(input, i); + } + } + + private static void testReadOffEnd(DataInputTester tester, byte[] bytes) + throws IOException + { + DataSeekableInputStream input = createDataSeekableInputStream(bytes); + ByteStreams.skipFully(input, bytes.length - tester.valueSize() + 1); + tester.verifyReadOffEnd(input); + } + + private static String getExpectedStringValue(int index, int size) + throws IOException + { + return ByteSource.concat(cycle(ByteSource.wrap(String.valueOf(index).getBytes(UTF_8)))).slice(0, size).asCharSource(UTF_8).read(); + } + + protected abstract static class DataInputTester + { + private final int size; + + public DataInputTester(int size) + { + this.size = size; + } + + public final int valueSize() + { + return size; + } + + public abstract void loadValue(DataOutputStream slice, int valueIndex) + throws IOException; + + public abstract void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException; + + public void 
verifyReadOffEnd(DataSeekableInputStream input) + throws IOException + { + try { + verifyValue(input, 1); + fail("expected EOFException"); + } + catch (EOFException expected) { + } + } + } + + private abstract static class SkipDataInputTester + extends DataInputTester + { + public SkipDataInputTester(int size) + { + super(size); + } + + @Override + public void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.write(new byte[valueSize()]); + } + } + + private abstract static class StringDataInputTester + extends DataInputTester + { + public StringDataInputTester(int size) + { + super(size); + } + + @Override + public final void loadValue(DataOutputStream output, int valueIndex) + throws IOException + { + output.write(getExpectedStringValue(valueIndex, valueSize()).getBytes(UTF_8)); + } + + @Override + public final void verifyValue(DataSeekableInputStream input, int valueIndex) + throws IOException + { + String actual = readActual(input); + String expected = getExpectedStringValue(valueIndex, valueSize()); + assertEquals(actual, expected); + } + + protected abstract String readActual(DataSeekableInputStream input) + throws IOException; + } + + private static DataSeekableInputStream createDataSeekableInputStream(byte[] bytes) + { + SeekableInputStream inputStream = new MemorySeekableInputStream(Slices.wrappedBuffer(bytes)); + return new DataSeekableInputStream(inputStream, 16 * 1024); + } +} diff --git a/lib/trino-rcfile/src/test/java/io/trino/rcfile/TestRcFileDecoderUtils.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/TestReadWriteUtils.java similarity index 89% rename from lib/trino-rcfile/src/test/java/io/trino/rcfile/TestRcFileDecoderUtils.java rename to lib/trino-hive-formats/src/test/java/io/trino/hive/formats/TestReadWriteUtils.java index 070965c6b2dd..153c3d2ee536 100644 --- a/lib/trino-rcfile/src/test/java/io/trino/rcfile/TestRcFileDecoderUtils.java +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/TestReadWriteUtils.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
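The tests above fix the contract of the new DataSeekableInputStream: a buffered, DataInput-style reader over a SeekableInputStream that tracks its position and supports backward seeks. A minimal usage sketch follows; the constructor, MemorySeekableInputStream, and the byte values are taken from the tests above, while the package names in the imports are assumptions, since this diff shows only the simple class names.

    import io.airlift.slice.Slices;
    import io.trino.hive.formats.DataSeekableInputStream;        // assumed package
    import io.trino.filesystem.memory.MemorySeekableInputStream; // assumed package

    public class DataSeekableInputStreamSketch
    {
        public static void main(String[] args)
                throws Exception
        {
            byte[] bytes = {109, 92, 75, 58}; // big-endian encoding of 978017389, as in testEncodingInteger
            DataSeekableInputStream input = new DataSeekableInputStream(
                    new MemorySeekableInputStream(Slices.wrappedBuffer(bytes)),
                    16 * 1024); // read-buffer size, matching the tests
            int first = input.readInt();  // consumes four bytes; getPos() is now 4
            input.seek(0);                // random access: rewind to the start
            int second = input.readInt(); // re-reads the same value
            if (first != 978017389 || first != second) {
                throw new AssertionError();
            }
        }
    }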
*/ -package io.trino.rcfile; +package io.trino.hive.formats; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; @@ -23,7 +23,7 @@ import static org.testng.Assert.assertEquals; -public class TestRcFileDecoderUtils +public class TestReadWriteUtils { @Test public void testVInt() @@ -58,10 +58,10 @@ private static void assertVIntRoundTrip(SliceOutput output, long value) long readValueOld = WritableUtils.readVLong(oldBytes.getInput()); assertEquals(readValueOld, value); - long readValueNew = RcFileDecoderUtils.readVInt(oldBytes, 0); + long readValueNew = ReadWriteUtils.readVInt(oldBytes, 0); assertEquals(readValueNew, value); - long readValueNewStream = RcFileDecoderUtils.readVInt(oldBytes.getInput()); + long readValueNewStream = ReadWriteUtils.readVInt(oldBytes.getInput()); assertEquals(readValueNewStream, value); } @@ -73,7 +73,7 @@ private static Slice writeVintOld(SliceOutput output, long value) Slice vLongOld = Slices.copyOf(output.slice()); output.reset(); - RcFileDecoderUtils.writeVLong(output, value); + ReadWriteUtils.writeVLong(output, value); Slice vLongNew = Slices.copyOf(output.slice()); assertEquals(vLongNew, vLongOld); @@ -84,7 +84,7 @@ private static Slice writeVintOld(SliceOutput output, long value) assertEquals(vIntOld, vLongOld); output.reset(); - RcFileDecoderUtils.writeVInt(output, (int) value); + ReadWriteUtils.writeVInt(output, (int) value); Slice vIntNew = Slices.copyOf(output.slice()); assertEquals(vIntNew, vLongOld); } diff --git a/lib/trino-rcfile/src/test/java/io/trino/rcfile/TestBufferedOutputStreamSliceOutput.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/compression/TestBufferedOutputStreamSliceOutput.java similarity index 98% rename from lib/trino-rcfile/src/test/java/io/trino/rcfile/TestBufferedOutputStreamSliceOutput.java rename to lib/trino-hive-formats/src/test/java/io/trino/hive/formats/compression/TestBufferedOutputStreamSliceOutput.java index dc41c1283175..f8b20a6f76c0 100644 --- a/lib/trino-rcfile/src/test/java/io/trino/rcfile/TestBufferedOutputStreamSliceOutput.java +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/compression/TestBufferedOutputStreamSliceOutput.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.compression; import io.airlift.slice.Slice; import io.airlift.slice.Slices; diff --git a/lib/trino-rcfile/src/test/java/io/trino/rcfile/AbstractTestRcFileReader.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/AbstractTestRcFileReader.java similarity index 98% rename from lib/trino-rcfile/src/test/java/io/trino/rcfile/AbstractTestRcFileReader.java rename to lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/AbstractTestRcFileReader.java index 4aec0b4b75b9..5df8dfb866b5 100644 --- a/lib/trino-rcfile/src/test/java/io/trino/rcfile/AbstractTestRcFileReader.java +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/AbstractTestRcFileReader.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
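TestReadWriteUtils above validates the renamed helpers against Hadoop's WritableUtils, which pins down what ReadWriteUtils implements: Hadoop's variable-length integer encoding, where values in [-112, 127] take a single byte and anything larger carries a one-byte length prefix. A round-trip sketch using only the methods the test exercises (DynamicSliceOutput is airlift's growable buffer):

    import io.airlift.slice.DynamicSliceOutput;
    import io.airlift.slice.Slice;
    import io.trino.hive.formats.ReadWriteUtils;

    public class VIntRoundTripSketch
    {
        public static void main(String[] args)
        {
            DynamicSliceOutput output = new DynamicSliceOutput(16);
            ReadWriteUtils.writeVLong(output, -12345); // multi-byte case: prefix byte plus magnitude bytes
            Slice encoded = output.slice();

            // the test reads writeVLong output back through readVInt, so the
            // read side handles both encodings uniformly
            long decoded = ReadWriteUtils.readVInt(encoded, 0);
            if (decoded != -12345) {
                throw new AssertionError();
            }
        }
    }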
*/ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import com.google.common.collect.ContiguousSet; import com.google.common.collect.DiscreteDomain; @@ -32,7 +32,7 @@ import static com.google.common.collect.Iterables.cycle; import static com.google.common.collect.Iterables.limit; -import static io.trino.rcfile.RcFileTester.Format.BINARY; +import static io.trino.hive.formats.rcfile.RcFileTester.Format.BINARY; import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.spi.type.BooleanType.BOOLEAN; import static io.trino.spi.type.DateType.DATE; diff --git a/lib/trino-rcfile/src/test/java/io/trino/rcfile/RcFileTester.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/RcFileTester.java similarity index 92% rename from lib/trino-rcfile/src/test/java/io/trino/rcfile/RcFileTester.java rename to lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/RcFileTester.java index d657066f9cd1..c60224f69d43 100644 --- a/lib/trino-rcfile/src/test/java/io/trino/rcfile/RcFileTester.java +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/RcFileTester.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import com.google.common.collect.AbstractIterator; import com.google.common.collect.ImmutableList; @@ -19,13 +19,15 @@ import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; -import io.airlift.slice.OutputStreamSliceOutput; import io.airlift.slice.Slice; import io.airlift.slice.Slices; import io.airlift.units.DataSize; +import io.trino.filesystem.TrinoInputFile; +import io.trino.filesystem.local.LocalInputFile; import io.trino.hadoop.HadoopNative; -import io.trino.rcfile.binary.BinaryRcFileEncoding; -import io.trino.rcfile.text.TextRcFileEncoding; +import io.trino.hive.formats.compression.CompressionKind; +import io.trino.hive.formats.rcfile.binary.BinaryRcFileEncoding; +import io.trino.hive.formats.rcfile.text.TextRcFileEncoding; import io.trino.spi.Page; import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; @@ -79,10 +81,6 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.compress.BZip2Codec; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.hadoop.io.compress.Lz4Codec; -import org.apache.hadoop.io.compress.SnappyCodec; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.JobConf; @@ -119,16 +117,15 @@ import static io.airlift.slice.SizeOf.SIZE_OF_INT; import static io.airlift.slice.SizeOf.SIZE_OF_LONG; import static io.airlift.units.DataSize.Unit.KILOBYTE; -import static io.airlift.units.DataSize.Unit.MEGABYTE; import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; -import static io.trino.rcfile.RcFileDecoderUtils.findFirstSyncPosition; -import static io.trino.rcfile.RcFileTester.Compression.BZIP2; -import static io.trino.rcfile.RcFileTester.Compression.LZ4; -import static io.trino.rcfile.RcFileTester.Compression.NONE; -import static io.trino.rcfile.RcFileTester.Compression.SNAPPY; -import static io.trino.rcfile.RcFileTester.Compression.ZLIB; -import static io.trino.rcfile.RcFileWriter.PRESTO_RCFILE_WRITER_VERSION; -import static 
io.trino.rcfile.RcFileWriter.PRESTO_RCFILE_WRITER_VERSION_METADATA_KEY; +import static io.trino.hive.formats.ReadWriteUtils.findFirstSyncPosition; +import static io.trino.hive.formats.rcfile.RcFileTester.Compression.BZIP2; +import static io.trino.hive.formats.rcfile.RcFileTester.Compression.GZIP; +import static io.trino.hive.formats.rcfile.RcFileTester.Compression.LZ4; +import static io.trino.hive.formats.rcfile.RcFileTester.Compression.NONE; +import static io.trino.hive.formats.rcfile.RcFileTester.Compression.SNAPPY; +import static io.trino.hive.formats.rcfile.RcFileWriter.PRESTO_RCFILE_WRITER_VERSION; +import static io.trino.hive.formats.rcfile.RcFileWriter.PRESTO_RCFILE_WRITER_VERSION_METADATA_KEY; import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.spi.type.BooleanType.BOOLEAN; import static io.trino.spi.type.DateType.DATE; @@ -231,43 +228,24 @@ public RcFileEncoding getVectorEncoding() public enum Compression { - BZIP2 { - @Override - Optional getCodecName() - { - return Optional.of(BZip2Codec.class.getName()); - } - }, - ZLIB { - @Override - Optional getCodecName() - { - return Optional.of(GzipCodec.class.getName()); - } - }, - SNAPPY { - @Override - Optional getCodecName() - { - return Optional.of(SnappyCodec.class.getName()); - } - }, - LZ4 { - @Override - Optional getCodecName() - { - return Optional.of(Lz4Codec.class.getName()); - } - }, - NONE { - @Override - Optional getCodecName() - { - return Optional.empty(); - } - }; + SNAPPY(CompressionKind.SNAPPY), + LZ4(CompressionKind.LZ4), + GZIP(CompressionKind.GZIP), + ZSTD(CompressionKind.ZSTD), + BZIP2(CompressionKind.BZIP2), + NONE(null); + + private final Optional compressionKind; + + Compression(CompressionKind compressionKind) + { + this.compressionKind = Optional.ofNullable(compressionKind); + } - abstract Optional getCodecName(); + public Optional getCompressionKind() + { + return compressionKind; + } } private boolean structTestsEnabled; @@ -303,7 +281,7 @@ public static RcFileTester fullTestRcFileReader() // These compression algorithms were chosen to cover the three different // cases: uncompressed, aircompressor, and hadoop compression // We assume that the compression algorithms generally work - rcFileTester.compressions = ImmutableSet.of(NONE, LZ4, ZLIB, BZIP2); + rcFileTester.compressions = ImmutableSet.of(NONE, LZ4, GZIP, BZIP2); return rcFileTester; } @@ -589,21 +567,20 @@ private static List getSyncPositionsSimple(RcFileReader recordReader, File long syncFirst = sync.getLong(0); long syncSecond = sync.getLong(8); long syncPosition = 0; - try (RcFileDataSource dataSource = new FileRcFileDataSource(file)) { - while (syncPosition >= 0) { - syncPosition = findFirstSyncPosition(dataSource, syncPosition, file.length() - syncPosition, syncFirst, syncSecond); - if (syncPosition > 0) { - assertEquals(findFirstSyncPosition(dataSource, syncPosition, 1, syncFirst, syncSecond), syncPosition); - assertEquals(findFirstSyncPosition(dataSource, syncPosition, 2, syncFirst, syncSecond), syncPosition); - assertEquals(findFirstSyncPosition(dataSource, syncPosition, 10, syncFirst, syncSecond), syncPosition); - - assertEquals(findFirstSyncPosition(dataSource, syncPosition - 1, 1, syncFirst, syncSecond), -1); - assertEquals(findFirstSyncPosition(dataSource, syncPosition - 2, 2, syncFirst, syncSecond), -1); - assertEquals(findFirstSyncPosition(dataSource, syncPosition + 1, 1, syncFirst, syncSecond), -1); - - syncPositions.add(syncPosition); - syncPosition++; - } + TrinoInputFile inputFile = new LocalInputFile(file); + 
while (syncPosition >= 0) { + syncPosition = findFirstSyncPosition(inputFile, syncPosition, file.length() - syncPosition, syncFirst, syncSecond); + if (syncPosition > 0) { + assertEquals(findFirstSyncPosition(inputFile, syncPosition, 1, syncFirst, syncSecond), syncPosition); + assertEquals(findFirstSyncPosition(inputFile, syncPosition, 2, syncFirst, syncSecond), syncPosition); + assertEquals(findFirstSyncPosition(inputFile, syncPosition, 10, syncFirst, syncSecond), syncPosition); + + assertEquals(findFirstSyncPosition(inputFile, syncPosition - 1, 1, syncFirst, syncSecond), -1); + assertEquals(findFirstSyncPosition(inputFile, syncPosition - 2, 2, syncFirst, syncSecond), -1); + assertEquals(findFirstSyncPosition(inputFile, syncPosition + 1, 1, syncFirst, syncSecond), -1); + + syncPositions.add(syncPosition); + syncPosition++; } } return syncPositions; @@ -612,15 +589,13 @@ private static List getSyncPositionsSimple(RcFileReader recordReader, File private static RcFileReader createRcFileReader(TempFile tempFile, Type type, RcFileEncoding encoding) throws IOException { - RcFileDataSource rcFileDataSource = new FileRcFileDataSource(tempFile.getFile()); + TrinoInputFile rcFileDataSource = new LocalInputFile(tempFile.getFile()); RcFileReader rcFileReader = new RcFileReader( rcFileDataSource, encoding, ImmutableMap.of(0, type), - new AircompressorCodecFactory(new HadoopCodecFactory(RcFileTester.class.getClassLoader())), 0, - tempFile.getFile().length(), - DataSize.of(8, MEGABYTE)); + tempFile.getFile().length()); assertEquals(rcFileReader.getColumnCount(), 1); @@ -630,14 +605,11 @@ private static RcFileReader createRcFileReader(TempFile tempFile, Type type, RcF private static DataSize writeRcFileColumnNew(File outputFile, Format format, Compression compression, Type type, Iterator values, Map metadata) throws Exception { - OutputStreamSliceOutput output = new OutputStreamSliceOutput(new FileOutputStream(outputFile)); - AircompressorCodecFactory codecFactory = new AircompressorCodecFactory(new HadoopCodecFactory(RcFileTester.class.getClassLoader())); RcFileWriter writer = new RcFileWriter( - output, + new FileOutputStream(outputFile), ImmutableList.of(type), format.getVectorEncoding(), - compression.getCodecName(), - codecFactory, + compression.getCompressionKind(), metadata, DataSize.of(100, KILOBYTE), // use a smaller size to create more row groups DataSize.of(200, KILOBYTE), @@ -651,9 +623,9 @@ private static DataSize writeRcFileColumnNew(File outputFile, Format format, Com writer.write(new Page(blockBuilder.build())); writer.close(); - writer.validate(new FileRcFileDataSource(outputFile)); + writer.validate(new LocalInputFile(outputFile)); - return DataSize.ofBytes(output.size()); + return DataSize.ofBytes(outputFile.length()); } private static void writeValue(Type type, BlockBuilder blockBuilder, Object value) @@ -1059,7 +1031,7 @@ private static RecordWriter createRcFileWriterOld(File outputFile, Compression c throws IOException { JobConf jobConf = new JobConf(false); - Optional codecName = compression.getCodecName(); + Optional codecName = compression.getCompressionKind().map(CompressionKind::getHadoopClassName); codecName.ifPresent(s -> jobConf.set(COMPRESS_CODEC, s)); return new RCFileOutputFormat().getHiveRecordWriter( diff --git a/lib/trino-rcfile/src/test/java/io/trino/rcfile/TestFullRcFileReader.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/TestFullRcFileReader.java similarity index 95% rename from 
lib/trino-rcfile/src/test/java/io/trino/rcfile/TestFullRcFileReader.java rename to lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/TestFullRcFileReader.java index 6d9db727169d..bfa0722f2aa3 100644 --- a/lib/trino-rcfile/src/test/java/io/trino/rcfile/TestFullRcFileReader.java +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/TestFullRcFileReader.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; public class TestFullRcFileReader extends AbstractTestRcFileReader diff --git a/lib/trino-rcfile/src/test/java/io/trino/rcfile/TestRcFileReader.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/TestRcFileReader.java similarity index 95% rename from lib/trino-rcfile/src/test/java/io/trino/rcfile/TestRcFileReader.java rename to lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/TestRcFileReader.java index c975c4a7d4b3..2cd920674543 100644 --- a/lib/trino-rcfile/src/test/java/io/trino/rcfile/TestRcFileReader.java +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/TestRcFileReader.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; public class TestRcFileReader extends AbstractTestRcFileReader diff --git a/lib/trino-rcfile/src/test/java/io/trino/rcfile/TestRcFileReaderManual.java b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/TestRcFileReaderManual.java similarity index 93% rename from lib/trino-rcfile/src/test/java/io/trino/rcfile/TestRcFileReaderManual.java rename to lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/TestRcFileReaderManual.java index e87f4527aaac..66ee6bfedd77 100644 --- a/lib/trino-rcfile/src/test/java/io/trino/rcfile/TestRcFileReaderManual.java +++ b/lib/trino-hive-formats/src/test/java/io/trino/hive/formats/rcfile/TestRcFileReaderManual.java @@ -11,15 +11,15 @@ * See the License for the specific language governing permissions and * limitations under the License. 
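TestFullRcFileReader and TestRcFileReader both run the shared AbstractTestRcFileReader suite, and what they now exercise is the slimmed-down read path: RcFileReader takes a TrinoInputFile plus a split range directly, with the codec-factory and buffer-size arguments gone. A sketch assembled only from the constructor and calls visible in this diff; the per-column accessors are not shown in these hunks, so the loop body is elided.

    import com.google.common.collect.ImmutableMap;
    import io.trino.filesystem.TrinoInputFile;
    import io.trino.filesystem.local.LocalInputFile;
    import io.trino.hive.formats.rcfile.RcFileReader;
    import io.trino.hive.formats.rcfile.binary.BinaryRcFileEncoding;
    import org.joda.time.DateTimeZone;

    import java.io.File;
    import java.io.IOException;

    import static io.trino.spi.type.BigintType.BIGINT;

    public class ReaderSketch
    {
        static void readAllRowGroups(File file)
                throws IOException
        {
            TrinoInputFile inputFile = new LocalInputFile(file);
            RcFileReader reader = new RcFileReader(
                    inputFile,
                    new BinaryRcFileEncoding(DateTimeZone.UTC),
                    ImmutableMap.of(0, BIGINT), // read column 0 as BIGINT
                    0,                          // split offset
                    file.length());             // split length
            while (reader.advance() >= 0) {
                // process the current row group; column accessors elided
            }
            // closing elided; the reader's close method is not shown in this diff
        }
    }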
*/ -package io.trino.rcfile; +package io.trino.hive.formats.rcfile; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import io.airlift.slice.DynamicSliceOutput; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; -import io.airlift.units.DataSize; -import io.trino.rcfile.binary.BinaryRcFileEncoding; +import io.trino.filesystem.memory.MemoryInputFile; +import io.trino.hive.formats.rcfile.binary.BinaryRcFileEncoding; import io.trino.spi.block.Block; import org.joda.time.DateTimeZone; import org.testng.annotations.Test; @@ -29,7 +29,6 @@ import static com.google.common.base.Preconditions.checkArgument; import static io.airlift.slice.Slices.utf8Slice; -import static io.airlift.units.DataSize.Unit.MEGABYTE; import static io.trino.spi.type.SmallintType.SMALLINT; import static java.util.stream.Collectors.toList; import static org.testng.Assert.assertEquals; @@ -236,13 +235,11 @@ private static List readValues(Slice data, int offset, int length) } RcFileReader reader = new RcFileReader( - new MemoryRcFileDataSource(new RcFileDataSourceId("test"), data), + new MemoryInputFile("test", data), new BinaryRcFileEncoding(DateTimeZone.UTC), ImmutableMap.of(0, SMALLINT), - new BogusRcFileCodecFactory(), offset, - length, - DataSize.of(8, MEGABYTE)); + length); ImmutableList.Builder values = ImmutableList.builder(); while (reader.advance() >= 0) { @@ -290,20 +287,4 @@ public List getRowGroupSegmentOffsets() return rowGroupSegmentOffsets; } } - - private static class BogusRcFileCodecFactory - implements RcFileCodecFactory - { - @Override - public RcFileCompressor createCompressor(String codecName) - { - throw new UnsupportedOperationException(); - } - - @Override - public RcFileDecompressor createDecompressor(String codecName) - { - throw new UnsupportedOperationException(); - } - } } diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/AircompressorCodecFactory.java b/lib/trino-rcfile/src/main/java/io/trino/rcfile/AircompressorCodecFactory.java deleted file mode 100644 index 869be0c63973..000000000000 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/AircompressorCodecFactory.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
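From here on the diff deletes the entire codec-factory layer (AircompressorCodecFactory, HadoopCodecFactory, and the compressor and decompressor wrappers), because readers and writers now receive a CompressionKind directly. The old class-name translation survives as two CompressionKind methods used elsewhere in this diff; a sketch, assuming fromHadoopClassName accepts the Hadoop class names the deleted factory matched on:

    import io.trino.hive.formats.compression.CompressionKind;

    public class CodecNameMappingSketch
    {
        public static void main(String[] args)
        {
            // job configurations still carry Hadoop codec class names, so the
            // writer factory maps them to CompressionKind and back
            CompressionKind kind = CompressionKind.fromHadoopClassName("org.apache.hadoop.io.compress.GzipCodec");
            String hadoopName = kind.getHadoopClassName();
            System.out.println(kind + " <-> " + hadoopName);
        }
    }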
- */ -package io.trino.rcfile; - -import io.airlift.compress.gzip.JdkGzipCodec; -import io.airlift.compress.lz4.Lz4Codec; -import io.airlift.compress.lzo.LzoCodec; -import io.airlift.compress.snappy.SnappyCodec; - -import static java.util.Objects.requireNonNull; - -public class AircompressorCodecFactory - implements RcFileCodecFactory -{ - private static final String SNAPPY_CODEC_NAME = "org.apache.hadoop.io.compress.SnappyCodec"; - private static final String LZO_CODEC_NAME = "com.hadoop.compression.lzo.LzoCodec"; - private static final String LZO_CODEC_NAME_DEPRECATED = "org.apache.hadoop.io.compress.LzoCodec"; - private static final String LZ4_CODEC_NAME = "org.apache.hadoop.io.compress.Lz4Codec"; - private static final String LZ4_HC_CODEC_NAME = "org.apache.hadoop.io.compress.Lz4Codec"; - private static final String GZIP_CODEC_NAME = "org.apache.hadoop.io.compress.GzipCodec"; - - private final RcFileCodecFactory delegate; - - public AircompressorCodecFactory(RcFileCodecFactory delegate) - { - this.delegate = requireNonNull(delegate, "delegate is null"); - } - - @Override - public RcFileCompressor createCompressor(String codecName) - { - if (SNAPPY_CODEC_NAME.equals(codecName)) { - return new AircompressorCompressor(new SnappyCodec()); - } - if (LZO_CODEC_NAME.equals(codecName) || LZO_CODEC_NAME_DEPRECATED.equals(codecName)) { - return new AircompressorCompressor(new LzoCodec()); - } - if (LZ4_CODEC_NAME.equals(codecName)) { - return new AircompressorCompressor(new Lz4Codec()); - } - if (GZIP_CODEC_NAME.equals(codecName)) { - return new AircompressorCompressor(new JdkGzipCodec()); - } - return delegate.createCompressor(codecName); - } - - @Override - public RcFileDecompressor createDecompressor(String codecName) - { - if (SNAPPY_CODEC_NAME.equals(codecName)) { - return new AircompressorDecompressor(new SnappyCodec()); - } - if (LZO_CODEC_NAME.equals(codecName) || LZO_CODEC_NAME_DEPRECATED.equals(codecName)) { - return new AircompressorDecompressor(new LzoCodec()); - } - if (LZ4_CODEC_NAME.equals(codecName) || LZ4_HC_CODEC_NAME.equals(codecName)) { - return new AircompressorDecompressor(new Lz4Codec()); - } - if (GZIP_CODEC_NAME.equals(codecName)) { - return new AircompressorDecompressor(new JdkGzipCodec()); - } - return delegate.createDecompressor(codecName); - } -} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/AircompressorCompressor.java b/lib/trino-rcfile/src/main/java/io/trino/rcfile/AircompressorCompressor.java deleted file mode 100644 index 849fae82baf4..000000000000 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/AircompressorCompressor.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.rcfile; - -import org.apache.hadoop.io.compress.CompressionCodec; -import org.apache.hadoop.io.compress.CompressionOutputStream; -import org.apache.hadoop.io.compress.Compressor; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.function.Supplier; - -import static java.util.Objects.requireNonNull; - -public class AircompressorCompressor - implements RcFileCompressor -{ - private final CompressionCodec codec; - - public AircompressorCompressor(CompressionCodec codec) - { - this.codec = requireNonNull(codec, "codec is null"); - } - - @Override - public CompressedSliceOutput createCompressedSliceOutput(int minChunkSize, int maxChunkSize) - { - return new AircompressorCompressedSliceOutputSupplier(codec, minChunkSize, maxChunkSize).get(); - } - - private static class AircompressorCompressedSliceOutputSupplier - implements Supplier - { - private final CompressionCodec codec; - private final Compressor compressor; - private final ChunkedSliceOutput compressedOutput; - - public AircompressorCompressedSliceOutputSupplier(CompressionCodec codec, int minChunkSize, int maxChunkSize) - { - this.codec = requireNonNull(codec, "codec is null"); - this.compressor = codec.createCompressor(); - this.compressedOutput = new ChunkedSliceOutput(minChunkSize, maxChunkSize); - } - - @Override - public CompressedSliceOutput get() - { - try { - compressor.reset(); - compressedOutput.reset(); - CompressionOutputStream compressionStream = codec.createOutputStream(compressedOutput, compressor); - return new CompressedSliceOutput(compressionStream, compressedOutput, this, () -> {}); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } -} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/AircompressorDecompressor.java b/lib/trino-rcfile/src/main/java/io/trino/rcfile/AircompressorDecompressor.java deleted file mode 100644 index b7759c43a6ea..000000000000 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/AircompressorDecompressor.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.rcfile; - -import io.airlift.slice.Slice; -import org.apache.hadoop.io.compress.CompressionCodec; -import org.apache.hadoop.io.compress.CompressionInputStream; - -import java.io.IOException; - -import static java.util.Objects.requireNonNull; - -public class AircompressorDecompressor - implements RcFileDecompressor -{ - private final CompressionCodec codec; - - public AircompressorDecompressor(CompressionCodec codec) - { - this.codec = requireNonNull(codec, "codec is null"); - } - - @Override - public void decompress(Slice compressed, Slice uncompressed) - throws RcFileCorruptionException - { - try (CompressionInputStream decompressorStream = codec.createInputStream(compressed.getInput())) { - uncompressed.setBytes(0, decompressorStream, uncompressed.length()); - } - catch (IndexOutOfBoundsException | IOException e) { - throw new RcFileCorruptionException(e, "Compressed stream is truncated"); - } - } - - @Override - public void destroy() - { - } -} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/HadoopCodecFactory.java b/lib/trino-rcfile/src/main/java/io/trino/rcfile/HadoopCodecFactory.java deleted file mode 100644 index f33c91ef4a1a..000000000000 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/HadoopCodecFactory.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.rcfile; - -import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.io.compress.CompressionCodec; - -import java.lang.reflect.Constructor; - -import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; - -public class HadoopCodecFactory - implements RcFileCodecFactory -{ - private final ClassLoader classLoader; - - public HadoopCodecFactory(ClassLoader classLoader) - { - this.classLoader = classLoader; - } - - @Override - public RcFileCompressor createCompressor(String codecName) - { - CompressionCodec codec = createCompressionCodec(codecName); - return new HadoopCompressor(codec); - } - - @Override - public RcFileDecompressor createDecompressor(String codecName) - { - CompressionCodec codec = createCompressionCodec(codecName); - return new HadoopDecompressor(codec); - } - - private CompressionCodec createCompressionCodec(String codecName) - { - try { - Class codecClass = classLoader.loadClass(codecName).asSubclass(CompressionCodec.class); - Constructor constructor = codecClass.getDeclaredConstructor(); - constructor.setAccessible(true); - CompressionCodec codec = constructor.newInstance(); - if (codec instanceof Configurable) { - // Hadoop is crazy... 
you have to give codecs an empty configuration or they throw NPEs - // but you need to make sure the configuration doesn't "load" defaults or it spends - // forever loading XML with no useful information - ((Configurable) codec).setConf(newEmptyConfiguration()); - } - return codec; - } - catch (ReflectiveOperationException e) { - throw new IllegalArgumentException("Unknown codec: " + codecName, e); - } - } -} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/HadoopCompressor.java b/lib/trino-rcfile/src/main/java/io/trino/rcfile/HadoopCompressor.java deleted file mode 100644 index 9c4ff29b9fda..000000000000 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/HadoopCompressor.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.rcfile; - -import org.apache.hadoop.io.compress.CodecPool; -import org.apache.hadoop.io.compress.CompressionCodec; -import org.apache.hadoop.io.compress.CompressionOutputStream; -import org.apache.hadoop.io.compress.Compressor; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.function.Supplier; - -import static java.util.Objects.requireNonNull; - -public class HadoopCompressor - implements RcFileCompressor -{ - private final CompressionCodec codec; - - public HadoopCompressor(CompressionCodec codec) - { - this.codec = requireNonNull(codec, "codec is null"); - } - - @Override - public CompressedSliceOutput createCompressedSliceOutput(int minChunkSize, int maxChunkSize) - { - return new HadoopCompressedSliceOutputSupplier(codec, minChunkSize, maxChunkSize).get(); - } - - private static class HadoopCompressedSliceOutputSupplier - implements Supplier - { - private final CompressionCodec codec; - private final Compressor compressor; - private final ChunkedSliceOutput bufferedOutput; - - public HadoopCompressedSliceOutputSupplier(CompressionCodec codec, int minChunkSize, int maxChunkSize) - { - this.codec = requireNonNull(codec, "codec is null"); - this.compressor = CodecPool.getCompressor(requireNonNull(codec, "codec is null")); - this.bufferedOutput = new ChunkedSliceOutput(minChunkSize, maxChunkSize); - } - - @Override - public CompressedSliceOutput get() - { - try { - compressor.reset(); - bufferedOutput.reset(); - CompressionOutputStream compressionStream = codec.createOutputStream(bufferedOutput, compressor); - return new CompressedSliceOutput(compressionStream, bufferedOutput, this, () -> CodecPool.returnCompressor(compressor)); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } -} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/HadoopDecompressor.java b/lib/trino-rcfile/src/main/java/io/trino/rcfile/HadoopDecompressor.java deleted file mode 100644 index 99de02061158..000000000000 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/HadoopDecompressor.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the 
License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.rcfile; - -import io.airlift.slice.Slice; -import org.apache.hadoop.io.compress.CodecPool; -import org.apache.hadoop.io.compress.CompressionCodec; -import org.apache.hadoop.io.compress.CompressionInputStream; -import org.apache.hadoop.io.compress.Decompressor; - -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkState; -import static java.util.Objects.requireNonNull; - -public class HadoopDecompressor - implements RcFileDecompressor -{ - private final CompressionCodec codec; - private final Decompressor decompressor; - private boolean destroyed; - - public HadoopDecompressor(CompressionCodec codec) - { - this.codec = requireNonNull(codec, "codec is null"); - decompressor = CodecPool.getDecompressor(codec); - } - - @Override - public void decompress(Slice compressed, Slice uncompressed) - throws RcFileCorruptionException - { - checkState(!destroyed, "Codec has been destroyed"); - decompressor.reset(); - try (CompressionInputStream decompressorStream = codec.createInputStream(compressed.getInput(), decompressor)) { - uncompressed.setBytes(0, decompressorStream, uncompressed.length()); - } - catch (IndexOutOfBoundsException | IOException e) { - throw new RcFileCorruptionException(e, "Compressed stream is truncated"); - } - } - - @Override - public void destroy() - { - if (destroyed) { - return; - } - destroyed = true; - CodecPool.returnDecompressor(decompressor); - } -} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/NoneCompressor.java b/lib/trino-rcfile/src/main/java/io/trino/rcfile/NoneCompressor.java deleted file mode 100644 index c93a2d1e5ee1..000000000000 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/NoneCompressor.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.rcfile; - -import java.util.function.Supplier; - -class NoneCompressor - implements RcFileCompressor -{ - @Override - public CompressedSliceOutput createCompressedSliceOutput(int minChunkSize, int maxChunkSize) - { - return new NoneCompressedSliceOutputSupplier(minChunkSize, maxChunkSize).get(); - } - - private static class NoneCompressedSliceOutputSupplier - implements Supplier - { - private final ChunkedSliceOutput chunkedSliceOutput; - - private NoneCompressedSliceOutputSupplier(int minChunkSize, int maxChunkSize) - { - chunkedSliceOutput = new ChunkedSliceOutput(minChunkSize, maxChunkSize); - } - - @Override - public CompressedSliceOutput get() - { - chunkedSliceOutput.reset(); - return new CompressedSliceOutput(chunkedSliceOutput, chunkedSliceOutput, this, () -> {}); - } - } -} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileCodecFactory.java b/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileCodecFactory.java deleted file mode 100644 index 059d4403e7c7..000000000000 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileCodecFactory.java +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.rcfile; - -public interface RcFileCodecFactory -{ - RcFileCompressor createCompressor(String codecName); - - RcFileDecompressor createDecompressor(String codecName); -} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileCompressor.java b/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileCompressor.java deleted file mode 100644 index f4fb9cc6d1bc..000000000000 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileCompressor.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.rcfile; - -import io.airlift.slice.Slice; - -import java.io.IOException; -import java.io.OutputStream; -import java.util.List; -import java.util.function.Supplier; - -import static com.google.common.base.Preconditions.checkState; -import static java.util.Objects.requireNonNull; - -public interface RcFileCompressor -{ - CompressedSliceOutput createCompressedSliceOutput(int minChunkSize, int maxChunkSize); - - // This specialized SliceOutput has direct access to buffered output slices to - // report buffer sizes and to get the final output. 
Additionally, a new - // CompressedSliceOutput can be created that reuses the underlying output - // buffer - final class CompressedSliceOutput - extends BufferedOutputStreamSliceOutput - { - private final ChunkedSliceOutput bufferedOutput; - private final Supplier resetFactory; - private final Runnable onDestroy; - private boolean closed; - private boolean destroyed; - - /** - * @param compressionStream the compressed output stream to delegate to - * @param bufferedOutput the output for the compressionStream - * @param resetFactory the function to create a new CompressedSliceOutput that reuses the bufferedOutput - * @param onDestroy used to cleanup the compression when done - */ - public CompressedSliceOutput(OutputStream compressionStream, ChunkedSliceOutput bufferedOutput, Supplier resetFactory, Runnable onDestroy) - { - super(compressionStream); - this.bufferedOutput = requireNonNull(bufferedOutput, "bufferedOutput is null"); - this.resetFactory = requireNonNull(resetFactory, "resetFactory is null"); - this.onDestroy = requireNonNull(onDestroy, "onDestroy is null"); - } - - @Override - public long getRetainedSize() - { - return super.getRetainedSize() + bufferedOutput.getRetainedSize(); - } - - public int getCompressedSize() - { - checkState(closed, "Stream has not been closed"); - checkState(!destroyed, "Stream has been destroyed"); - return bufferedOutput.size(); - } - - public List getCompressedSlices() - { - checkState(closed, "Stream has not been closed"); - checkState(!destroyed, "Stream has been destroyed"); - return bufferedOutput.getSlices(); - } - - public CompressedSliceOutput createRecycledCompressedSliceOutput() - { - checkState(closed, "Stream has not been closed"); - checkState(!destroyed, "Stream has been destroyed"); - destroyed = true; - return resetFactory.get(); - } - - @Override - public void close() - throws IOException - { - if (!closed) { - closed = true; - super.close(); - } - } - - public void destroy() - throws IOException - { - if (!destroyed) { - destroyed = true; - try { - close(); - } - finally { - onDestroy.run(); - } - } - } - } -} diff --git a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDataSourceId.java b/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDataSourceId.java deleted file mode 100644 index b15257685184..000000000000 --- a/lib/trino-rcfile/src/main/java/io/trino/rcfile/RcFileDataSourceId.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.rcfile; - -import java.util.Objects; - -import static java.util.Objects.requireNonNull; - -public final class RcFileDataSourceId -{ - private final String id; - - public RcFileDataSourceId(String id) - { - this.id = requireNonNull(id, "id is null"); - } - - @Override - public boolean equals(Object o) - { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - RcFileDataSourceId that = (RcFileDataSourceId) o; - return Objects.equals(id, that.id); - } - - @Override - public int hashCode() - { - return Objects.hash(id); - } - - @Override - public String toString() - { - return id; - } -} diff --git a/plugin/trino-hive/pom.xml b/plugin/trino-hive/pom.xml index 0912ff40b426..2511b125199e 100644 --- a/plugin/trino-hive/pom.xml +++ b/plugin/trino-hive/pom.xml @@ -49,27 +49,27 @@ io.trino - trino-memory-context + trino-hive-formats io.trino - trino-orc + trino-memory-context io.trino - trino-parquet + trino-orc io.trino - trino-plugin-toolkit + trino-parquet io.trino - trino-rcfile + trino-plugin-toolkit @@ -260,6 +260,11 @@ alluxio-shaded-client + + org.apache.iceberg + iceberg-api + + org.apache.thrift libthrift diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveCompressionCodec.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveCompressionCodec.java index 7a9aab74fd9a..eeda7a40a137 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveCompressionCodec.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveCompressionCodec.java @@ -15,11 +15,6 @@ import io.trino.orc.metadata.CompressionKind; import org.apache.avro.file.DataFileConstants; -import org.apache.hadoop.io.compress.CompressionCodec; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.hadoop.io.compress.Lz4Codec; -import org.apache.hadoop.io.compress.SnappyCodec; -import org.apache.hadoop.io.compress.ZStandardCodec; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import java.util.Optional; @@ -29,34 +24,34 @@ public enum HiveCompressionCodec { NONE(null, CompressionKind.NONE, CompressionCodecName.UNCOMPRESSED, DataFileConstants.NULL_CODEC), - SNAPPY(SnappyCodec.class, CompressionKind.SNAPPY, CompressionCodecName.SNAPPY, DataFileConstants.SNAPPY_CODEC), - LZ4(Lz4Codec.class, CompressionKind.LZ4, CompressionCodecName.LZ4, null), - ZSTD(ZStandardCodec.class, CompressionKind.ZSTD, CompressionCodecName.ZSTD, DataFileConstants.ZSTANDARD_CODEC), + SNAPPY(io.trino.hive.formats.compression.CompressionKind.SNAPPY, CompressionKind.SNAPPY, CompressionCodecName.SNAPPY, DataFileConstants.SNAPPY_CODEC), + LZ4(io.trino.hive.formats.compression.CompressionKind.LZ4, CompressionKind.LZ4, CompressionCodecName.LZ4, null), + ZSTD(io.trino.hive.formats.compression.CompressionKind.ZSTD, CompressionKind.ZSTD, CompressionCodecName.ZSTD, DataFileConstants.ZSTANDARD_CODEC), // Using DEFLATE for GZIP for Avro for now so Avro files can be written in default configuration // TODO(https://github.com/trinodb/trino/issues/12580) change GZIP to be unsupported for Avro when we change Trino default compression to be storage format aware - GZIP(GzipCodec.class, CompressionKind.ZLIB, CompressionCodecName.GZIP, DataFileConstants.DEFLATE_CODEC); + GZIP(io.trino.hive.formats.compression.CompressionKind.GZIP, CompressionKind.ZLIB, CompressionCodecName.GZIP, DataFileConstants.DEFLATE_CODEC); - private final Optional> codec; + private final Optional hiveCompressionKind; private final CompressionKind 
orcCompressionKind; private final CompressionCodecName parquetCompressionCodec; private final Optional avroCompressionCodec; HiveCompressionCodec( - Class codec, + io.trino.hive.formats.compression.CompressionKind hiveCompressionKind, CompressionKind orcCompressionKind, CompressionCodecName parquetCompressionCodec, String avroCompressionCodec) { - this.codec = Optional.ofNullable(codec); + this.hiveCompressionKind = Optional.ofNullable(hiveCompressionKind); this.orcCompressionKind = requireNonNull(orcCompressionKind, "orcCompressionKind is null"); this.parquetCompressionCodec = requireNonNull(parquetCompressionCodec, "parquetCompressionCodec is null"); this.avroCompressionCodec = Optional.ofNullable(avroCompressionCodec); } - public Optional> getCodec() + public Optional getHiveCompressionKind() { - return codec; + return hiveCompressionKind; } public CompressionKind getOrcCompressionKind() diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/MonitoredTrinoInputFile.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/MonitoredTrinoInputFile.java new file mode 100644 index 000000000000..2e642cba3db7 --- /dev/null +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/MonitoredTrinoInputFile.java @@ -0,0 +1,184 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
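The HiveCompressionCodec hunk above replaces the Hadoop codec class reference with an Optional of the new CompressionKind, so callers never touch Hadoop types. A sketch of the accessor behavior implied by the constructor (Optional.ofNullable, with NONE passing null):

    import io.trino.hive.formats.compression.CompressionKind;
    import io.trino.plugin.hive.HiveCompressionCodec;

    import java.util.Optional;

    public class HiveCompressionCodecSketch
    {
        public static void main(String[] args)
        {
            Optional<CompressionKind> gzip = HiveCompressionCodec.GZIP.getHiveCompressionKind(); // Optional.of(GZIP)
            Optional<CompressionKind> none = HiveCompressionCodec.NONE.getHiveCompressionKind(); // Optional.empty()
            System.out.println(gzip + " / " + none);
        }
    }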
+ */ +package io.trino.plugin.hive; + +import io.trino.filesystem.TrinoInput; +import io.trino.filesystem.TrinoInputFile; +import org.apache.iceberg.io.SeekableInputStream; + +import java.io.IOException; + +import static java.util.Objects.requireNonNull; + +public class MonitoredTrinoInputFile + implements TrinoInputFile +{ + private final FileFormatDataSourceStats stats; + private final TrinoInputFile delegate; + + public MonitoredTrinoInputFile(FileFormatDataSourceStats stats, TrinoInputFile delegate) + { + this.stats = requireNonNull(stats, "stats is null"); + this.delegate = requireNonNull(delegate, "delegate is null"); + } + + @Override + public TrinoInput newInput() + throws IOException + { + return new MonitoredTrinoInput(stats, delegate.newInput()); + } + + @Override + public long length() + throws IOException + { + return delegate.length(); + } + + @Override + public long modificationTime() + throws IOException + { + return delegate.modificationTime(); + } + + @Override + public boolean exists() + throws IOException + { + return delegate.exists(); + } + + @Override + public String location() + { + return delegate.location(); + } + + @Override + public String toString() + { + return delegate.toString(); + } + + private static final class MonitoredTrinoInput + implements TrinoInput + { + private final FileFormatDataSourceStats stats; + private final TrinoInput delegate; + + public MonitoredTrinoInput(FileFormatDataSourceStats stats, TrinoInput delegate) + { + this.stats = requireNonNull(stats, "stats is null"); + this.delegate = requireNonNull(delegate, "delegate is null"); + } + + @Override + public SeekableInputStream inputStream() + { + return new MonitoredSeekableInputStream(stats, delegate.inputStream()); + } + + @Override + public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) + throws IOException + { + long readStart = System.nanoTime(); + delegate.readFully(position, buffer, bufferOffset, bufferLength); + stats.readDataBytesPerSecond(bufferLength, System.nanoTime() - readStart); + } + + @Override + public int readTail(byte[] buffer, int bufferOffset, int bufferLength) + throws IOException + { + long readStart = System.nanoTime(); + int size = delegate.readTail(buffer, bufferOffset, bufferLength); + stats.readDataBytesPerSecond(size, System.nanoTime() - readStart); + return size; + } + + @Override + public void close() + throws IOException + { + delegate.close(); + } + } + + private static final class MonitoredSeekableInputStream + extends SeekableInputStream + { + private final FileFormatDataSourceStats stats; + private final SeekableInputStream delegate; + + public MonitoredSeekableInputStream(FileFormatDataSourceStats stats, SeekableInputStream delegate) + { + this.stats = requireNonNull(stats, "stats is null"); + this.delegate = requireNonNull(delegate, "delegate is null"); + } + + @Override + public long getPos() + throws IOException + { + return delegate.getPos(); + } + + @Override + public void seek(long newPos) + throws IOException + { + delegate.seek(newPos); + } + + @Override + public int read() + throws IOException + { + long readStart = System.nanoTime(); + int value = delegate.read(); + stats.readDataBytesPerSecond(1, System.nanoTime() - readStart); + return value; + } + + @Override + public int read(byte[] b, int off, int len) + throws IOException + { + long readStart = System.nanoTime(); + int size = delegate.read(b, off, len); + stats.readDataBytesPerSecond(size, System.nanoTime() - readStart); + return size; + } + + @Override 
+ public long skip(long n) + throws IOException + { + long readStart = System.nanoTime(); + long size = delegate.skip(n); + stats.readDataBytesPerSecond(size, System.nanoTime() - readStart); + return size; + } + + @Override + public void close() + throws IOException + { + delegate.close(); + } + } +} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RcFileFileWriter.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RcFileFileWriter.java index 50ec569d9024..7f27b70b8204 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RcFileFileWriter.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RcFileFileWriter.java @@ -15,12 +15,11 @@ import com.google.common.collect.ImmutableList; import com.google.common.io.CountingOutputStream; -import io.airlift.slice.OutputStreamSliceOutput; -import io.trino.rcfile.AircompressorCodecFactory; -import io.trino.rcfile.HadoopCodecFactory; -import io.trino.rcfile.RcFileDataSource; -import io.trino.rcfile.RcFileEncoding; -import io.trino.rcfile.RcFileWriter; +import io.trino.filesystem.TrinoInputFile; +import io.trino.hive.formats.compression.CompressionKind; +import io.trino.hive.formats.rcfile.RcFileEncoding; +import io.trino.hive.formats.rcfile.RcFileWriter; +import io.trino.memory.context.AggregatedMemoryContext; import io.trino.spi.Page; import io.trino.spi.TrinoException; import io.trino.spi.block.Block; @@ -54,32 +53,34 @@ public class RcFileFileWriter private static final ThreadMXBean THREAD_MX_BEAN = ManagementFactory.getThreadMXBean(); private final CountingOutputStream outputStream; + private final AggregatedMemoryContext outputStreamMemoryContext; private final RcFileWriter rcFileWriter; private final Closeable rollbackAction; private final int[] fileInputColumnIndexes; private final List nullBlocks; - private final Optional> validationInputFactory; + private final Optional> validationInputFactory; private long validationCpuNanos; public RcFileFileWriter( OutputStream outputStream, + AggregatedMemoryContext outputStreamMemoryContext, Closeable rollbackAction, RcFileEncoding rcFileEncoding, List fileColumnTypes, - Optional codecName, + Optional compressionKind, int[] fileInputColumnIndexes, Map metadata, - Optional> validationInputFactory) + Optional> validationInputFactory) throws IOException { this.outputStream = new CountingOutputStream(outputStream); + this.outputStreamMemoryContext = outputStreamMemoryContext; rcFileWriter = new RcFileWriter( - new OutputStreamSliceOutput(this.outputStream), + this.outputStream, fileColumnTypes, rcFileEncoding, - codecName, - new AircompressorCodecFactory(new HadoopCodecFactory(getClass().getClassLoader())), + compressionKind, metadata, validationInputFactory.isPresent()); this.rollbackAction = requireNonNull(rollbackAction, "rollbackAction is null"); @@ -105,7 +106,7 @@ public long getWrittenBytes() @Override public long getMemoryUsage() { - return INSTANCE_SIZE + rcFileWriter.getRetainedSizeInBytes(); + return INSTANCE_SIZE + rcFileWriter.getRetainedSizeInBytes() + outputStreamMemoryContext.getBytes(); } @Override @@ -148,11 +149,10 @@ public Closeable commit() if (validationInputFactory.isPresent()) { try { - try (RcFileDataSource input = validationInputFactory.get().get()) { - long startThreadCpuTime = THREAD_MX_BEAN.getCurrentThreadCpuTime(); - rcFileWriter.validate(input); - validationCpuNanos += THREAD_MX_BEAN.getCurrentThreadCpuTime() - startThreadCpuTime; - } + TrinoInputFile inputFile = validationInputFactory.get().get(); + long startThreadCpuTime = 
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RcFileFileWriterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RcFileFileWriterFactory.java
index 61672051aebe..9dcc2c7a890a 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RcFileFileWriterFactory.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RcFileFileWriterFactory.java
@@ -14,18 +14,20 @@
 package io.trino.plugin.hive;
 
 import com.google.common.collect.ImmutableMap;
+import io.trino.filesystem.TrinoFileSystem;
+import io.trino.filesystem.TrinoInputFile;
+import io.trino.filesystem.hdfs.HdfsFileSystemFactory;
 import io.trino.hdfs.HdfsEnvironment;
+import io.trino.hive.formats.compression.CompressionKind;
+import io.trino.hive.formats.rcfile.RcFileEncoding;
+import io.trino.hive.formats.rcfile.binary.BinaryRcFileEncoding;
+import io.trino.memory.context.AggregatedMemoryContext;
 import io.trino.plugin.hive.acid.AcidTransaction;
 import io.trino.plugin.hive.metastore.StorageFormat;
-import io.trino.plugin.hive.rcfile.HdfsRcFileDataSource;
-import io.trino.rcfile.RcFileDataSource;
-import io.trino.rcfile.RcFileEncoding;
-import io.trino.rcfile.binary.BinaryRcFileEncoding;
 import io.trino.spi.TrinoException;
 import io.trino.spi.connector.ConnectorSession;
 import io.trino.spi.type.Type;
 import io.trino.spi.type.TypeManager;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
@@ -34,7 +36,6 @@
 import javax.inject.Inject;
 
 import java.io.Closeable;
-import java.io.IOException;
 import java.io.OutputStream;
 import java.util.List;
 import java.util.Optional;
@@ -42,8 +43,8 @@
 import java.util.Properties;
 import java.util.function.Supplier;
 
+import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext;
 import static io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR;
-import static io.trino.plugin.hive.HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED;
 import static io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME;
 import static io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME;
 import static io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision;
@@ -64,31 +65,27 @@ public class RcFileFileWriterFactory
     private final HdfsEnvironment hdfsEnvironment;
     private final TypeManager typeManager;
     private final NodeVersion nodeVersion;
-    private final FileFormatDataSourceStats stats;
 
     @Inject
     public RcFileFileWriterFactory(
             HdfsEnvironment hdfsEnvironment,
             TypeManager typeManager,
             NodeVersion nodeVersion,
-            HiveConfig hiveConfig,
-            FileFormatDataSourceStats stats)
+            HiveConfig hiveConfig)
     {
-        this(hdfsEnvironment, typeManager, nodeVersion, hiveConfig.getRcfileDateTimeZone(), stats);
+        this(hdfsEnvironment, typeManager, nodeVersion, hiveConfig.getRcfileDateTimeZone());
     }
 
     public RcFileFileWriterFactory(
             HdfsEnvironment hdfsEnvironment,
             TypeManager typeManager,
             NodeVersion nodeVersion,
-            DateTimeZone timeZone,
-            FileFormatDataSourceStats stats)
+            DateTimeZone timeZone)
     {
         this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
         this.typeManager = requireNonNull(typeManager, "typeManager is null");
         this.nodeVersion = requireNonNull(nodeVersion, "nodeVersion is null");
         this.timeZone = requireNonNull(timeZone, "timeZone is null");
-        this.stats = requireNonNull(stats, "stats is null");
     }
 
     @Override
@@ -119,7 +116,8 @@ else if (COLUMNAR_SERDE_CLASS.equals(storageFormat.getSerde())) {
             return Optional.empty();
         }
 
-        Optional<String> codecName = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC));
+        Optional<CompressionKind> compressionKind = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC))
+                .map(CompressionKind::fromHadoopClassName);
 
         // existing tables and partitions may have columns in a different order than the writer is providing, so build
         // an index to rearrange columns in the proper order
@@ -133,33 +131,24 @@ else if (COLUMNAR_SERDE_CLASS.equals(storageFormat.getSerde())) {
                 .toArray();
 
         try {
-            FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
-            OutputStream outputStream = fileSystem.create(path, false);
+            TrinoFileSystem fileSystem = new HdfsFileSystemFactory(hdfsEnvironment).create(session.getIdentity());
+            AggregatedMemoryContext outputStreamMemoryContext = newSimpleAggregatedMemoryContext();
+            OutputStream outputStream = fileSystem.newOutputFile(path.toString()).create(outputStreamMemoryContext);
 
-            Optional<Supplier<RcFileDataSource>> validationInputFactory = Optional.empty();
+            Optional<Supplier<TrinoInputFile>> validationInputFactory = Optional.empty();
             if (isRcfileOptimizedWriterValidate(session)) {
-                validationInputFactory = Optional.of(() -> {
-                    try {
-                        return new HdfsRcFileDataSource(
-                                path.toString(),
-                                fileSystem.open(path),
-                                fileSystem.getFileStatus(path).getLen(),
-                                stats);
-                    }
-                    catch (IOException e) {
-                        throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
-                    }
-                });
+                validationInputFactory = Optional.of(() -> fileSystem.newInputFile(path.toString()));
             }
 
-            Closeable rollbackAction = () -> fileSystem.delete(path, false);
+            Closeable rollbackAction = () -> fileSystem.deleteFile(path.toString());
 
             return Optional.of(new RcFileFileWriter(
                     outputStream,
+                    outputStreamMemoryContext,
                     rollbackAction,
                     rcFileEncoding,
                     fileColumnTypes,
-                    codecName,
+                    compressionKind,
                     fileInputColumnIndexes,
                     ImmutableMap.<String, String>builder()
                             .put(PRESTO_VERSION_NAME, nodeVersion.toString())
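One design point in the factory hunk above: validation input is now created lazily. The old supplier opened an FSDataInputStream up front and had to translate IOException into HIVE_WRITE_VALIDATION_FAILED at that point (hence the removed import); with TrinoFileSystem the supplier merely captures the location, so no I/O happens until the validator asks for the file. The whole supplier collapses to a one-liner, reproduced here for emphasis (fileSystem and path are the local variables visible in the hunk):

    Optional<Supplier<TrinoInputFile>> validationInputFactory =
            Optional.of(() -> fileSystem.newInputFile(path.toString()));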
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/HdfsRcFileDataSource.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/HdfsRcFileDataSource.java
deleted file mode 100644
index 6def5227717e..000000000000
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/HdfsRcFileDataSource.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package io.trino.plugin.hive.rcfile;
-
-import io.trino.plugin.hive.FileFormatDataSourceStats;
-import io.trino.rcfile.RcFileDataSource;
-import io.trino.rcfile.RcFileDataSourceId;
-import org.apache.hadoop.fs.FSDataInputStream;
-
-import java.io.IOException;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static java.util.Objects.requireNonNull;
-
-public class HdfsRcFileDataSource
-        implements RcFileDataSource
-{
-    private final FSDataInputStream inputStream;
-    private final String path;
-    private final long size;
-    private final FileFormatDataSourceStats stats;
-    private long readTimeNanos;
-    private long readBytes;
-
-    public HdfsRcFileDataSource(String path, FSDataInputStream inputStream, long size, FileFormatDataSourceStats stats)
-    {
-        this.path = requireNonNull(path, "path is null");
-        this.inputStream = requireNonNull(inputStream, "inputStream is null");
-        this.size = size;
-        checkArgument(size >= 0, "size is negative");
-        this.stats = requireNonNull(stats, "stats is null");
-    }
-
-    @Override
-    public RcFileDataSourceId getId()
-    {
-        return new RcFileDataSourceId(path);
-    }
-
-    @Override
-    public void close()
-            throws IOException
-    {
-        inputStream.close();
-    }
-
-    @Override
-    public long getReadBytes()
-    {
-        return readBytes;
-    }
-
-    @Override
-    public long getReadTimeNanos()
-    {
-        return readTimeNanos;
-    }
-
-    @Override
-    public long getSize()
-    {
-        return size;
-    }
-
-    @Override
-    public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength)
-            throws IOException
-    {
-        long start = System.nanoTime();
-
-        inputStream.readFully(position, buffer, bufferOffset, bufferLength);
-
-        long readDuration = System.nanoTime() - start;
-        stats.readDataBytesPerSecond(bufferLength, readDuration);
-
-        readTimeNanos += readDuration;
-        readBytes += bufferLength;
-    }
-
-    @Override
-    public String toString()
-    {
-        return path;
-    }
-}
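Nothing replaces the deleted class one for one: its byte and timing bookkeeping moved into MonitoredTrinoInputFile above, and positioned reads now go through the narrower TrinoInput contract. The tail handling that FSDataInputStreamTail used to provide (see the RcFilePageSourceFactory hunk below) is covered by TrinoInput.readTail. A hypothetical helper sketching that contract (the method name and the Arrays usage are mine, not part of this change):

    static byte[] readFooter(TrinoInputFile inputFile, int maxFooterSize)
            throws IOException
    {
        try (TrinoInput input = inputFile.newInput()) {
            byte[] buffer = new byte[maxFooterSize];
            // readTail reads up to maxFooterSize bytes from the end of the file
            // and returns the count, which is smaller for short files
            int read = input.readTail(buffer, 0, maxFooterSize);
            return java.util.Arrays.copyOf(buffer, read);
        }
    }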
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/RcFilePageSource.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/RcFilePageSource.java
index 918ff1942236..192a71fce9d6 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/RcFilePageSource.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/RcFilePageSource.java
@@ -15,10 +15,10 @@
 
 import com.google.common.collect.ImmutableList;
 import io.airlift.units.DataSize;
+import io.trino.hive.formats.rcfile.RcFileCorruptionException;
+import io.trino.hive.formats.rcfile.RcFileReader;
 import io.trino.plugin.hive.HiveColumnHandle;
 import io.trino.plugin.hive.HiveType;
-import io.trino.rcfile.RcFileCorruptionException;
-import io.trino.rcfile.RcFileReader;
 import io.trino.spi.Page;
 import io.trino.spi.TrinoException;
 import io.trino.spi.block.Block;
@@ -144,11 +144,11 @@ public Page getNextPage()
         }
         catch (RcFileCorruptionException e) {
             closeAllSuppress(e, this);
-            throw new TrinoException(HIVE_BAD_DATA, format("Corrupted RC file: %s", rcFileReader.getId()), e);
+            throw new TrinoException(HIVE_BAD_DATA, format("Corrupted RC file: %s", rcFileReader.getFileLocation()), e);
         }
         catch (IOException | RuntimeException e) {
             closeAllSuppress(e, this);
-            throw new TrinoException(HIVE_CURSOR_ERROR, format("Failed to read RC file: %s", rcFileReader.getId()), e);
+            throw new TrinoException(HIVE_CURSOR_ERROR, format("Failed to read RC file: %s", rcFileReader.getFileLocation()), e);
         }
     }
@@ -212,10 +212,10 @@ public Block load()
                 block = rcFileReader.readBlock(columnIndex);
             }
             catch (RcFileCorruptionException e) {
-                throw new TrinoException(HIVE_BAD_DATA, format("Corrupted RC file: %s", rcFileReader.getId()), e);
+                throw new TrinoException(HIVE_BAD_DATA, format("Corrupted RC file: %s", rcFileReader.getFileLocation()), e);
            }
             catch (IOException | RuntimeException e) {
-                throw new TrinoException(HIVE_CURSOR_ERROR, format("Failed to read RC file: %s", rcFileReader.getId()), e);
+                throw new TrinoException(HIVE_CURSOR_ERROR, format("Failed to read RC file: %s", rcFileReader.getFileLocation()), e);
             }
 
             loaded = true;
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/RcFilePageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/RcFilePageSourceFactory.java
index 90af2d161ae2..162a8769b0ab 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/RcFilePageSourceFactory.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/rcfile/RcFilePageSourceFactory.java
@@ -18,27 +18,27 @@
 import io.airlift.slice.Slices;
 import io.airlift.units.DataSize;
 import io.airlift.units.DataSize.Unit;
-import io.trino.hdfs.FSDataInputStreamTail;
+import io.trino.filesystem.TrinoFileSystem;
+import io.trino.filesystem.TrinoInput;
+import io.trino.filesystem.TrinoInputFile;
+import io.trino.filesystem.hdfs.HdfsFileSystemFactory;
+import io.trino.filesystem.memory.MemoryInputFile;
 import io.trino.hdfs.HdfsEnvironment;
+import io.trino.hive.formats.rcfile.RcFileCorruptionException;
+import io.trino.hive.formats.rcfile.RcFileEncoding;
+import io.trino.hive.formats.rcfile.RcFileReader;
+import io.trino.hive.formats.rcfile.binary.BinaryRcFileEncoding;
+import io.trino.hive.formats.rcfile.text.TextRcFileEncoding;
 import io.trino.plugin.hive.AcidInfo;
 import io.trino.plugin.hive.FileFormatDataSourceStats;
 import io.trino.plugin.hive.HiveColumnHandle;
 import io.trino.plugin.hive.HiveConfig;
 import io.trino.plugin.hive.HivePageSourceFactory;
 import io.trino.plugin.hive.HiveTimestampPrecision;
+import io.trino.plugin.hive.MonitoredTrinoInputFile;
 import io.trino.plugin.hive.ReaderColumns;
 import io.trino.plugin.hive.ReaderPageSource;
 import io.trino.plugin.hive.acid.AcidTransaction;
-import io.trino.rcfile.AircompressorCodecFactory;
-import io.trino.rcfile.HadoopCodecFactory;
-import io.trino.rcfile.MemoryRcFileDataSource;
-import io.trino.rcfile.RcFileCorruptionException;
-import io.trino.rcfile.RcFileDataSource;
-import io.trino.rcfile.RcFileDataSourceId;
-import io.trino.rcfile.RcFileEncoding;
-import io.trino.rcfile.RcFileReader;
-import io.trino.rcfile.binary.BinaryRcFileEncoding;
-import io.trino.rcfile.text.TextRcFileEncoding;
 import io.trino.spi.TrinoException;
 import io.trino.spi.connector.ConnectorPageSource;
 import io.trino.spi.connector.ConnectorSession;
@@ -47,8 +47,6 @@
 import io.trino.spi.type.Type;
 import io.trino.spi.type.TypeManager;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.BlockMissingException;
 import org.joda.time.DateTimeZone;
@@ -56,7 +54,7 @@
 import javax.inject.Inject;
 
 import java.io.FileNotFoundException;
-import java.io.IOException;
+import java.io.InputStream;
 import java.util.List;
 import java.util.Optional;
 import java.util.OptionalInt;
@@ -65,6 +63,8 @@
 import static com.google.common.base.Preconditions.checkArgument;
 import static com.google.common.base.Strings.nullToEmpty;
 import static com.google.common.collect.ImmutableList.toImmutableList;
+import static io.trino.hive.formats.rcfile.text.TextRcFileEncoding.DEFAULT_NULL_SEQUENCE;
+import static io.trino.hive.formats.rcfile.text.TextRcFileEncoding.getDefaultSeparators;
 import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA;
 import static io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT;
 import static io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA;
@@ -82,10 +82,7 @@
 import static io.trino.plugin.hive.util.SerdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST;
 import static io.trino.plugin.hive.util.SerdeConstants.SERIALIZATION_LIB;
 import static io.trino.plugin.hive.util.SerdeConstants.SERIALIZATION_NULL_FORMAT;
-import static io.trino.rcfile.text.TextRcFileEncoding.DEFAULT_NULL_SEQUENCE;
-import static io.trino.rcfile.text.TextRcFileEncoding.getDefaultSeparators;
 import static java.lang.Math.min;
-import static java.lang.Math.toIntExact;
 import static java.lang.String.format;
 import static java.util.Objects.requireNonNull;
 import static org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters.SERIALIZATION_EXTEND_NESTING_LEVELS;
@@ -161,24 +158,22 @@ else if (deserializerClassName.equals(COLUMNAR_SERDE_CLASS)) {
                     .collect(toImmutableList());
         }
 
-        RcFileDataSource dataSource;
+        TrinoFileSystem trinoFileSystem = new HdfsFileSystemFactory(hdfsEnvironment).create(session.getIdentity());
+        TrinoInputFile inputFile = new MonitoredTrinoInputFile(stats, trinoFileSystem.newInputFile(path.toString()));
         try {
-            FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
-            FSDataInputStream inputStream = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(path));
+            length = min(inputFile.length() - start, length);
+            if (!inputFile.exists()) {
+                throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, "File does not exist");
+            }
             if (estimatedFileSize < BUFFER_SIZE.toBytes()) {
-                // Handle potentially imprecise file lengths by reading the footer
-                try {
-                    FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(path.toString(), estimatedFileSize, inputStream, toIntExact(BUFFER_SIZE.toBytes()));
-                    dataSource = new MemoryRcFileDataSource(new RcFileDataSourceId(path.toString()), fileTail.getTailSlice());
-                }
-                finally {
-                    inputStream.close();
+                try (TrinoInput input = inputFile.newInput(); InputStream inputStream = input.inputStream()) {
+                    byte[] data = inputStream.readAllBytes();
+                    inputFile = new MemoryInputFile(path.toString(), Slices.wrappedBuffer(data));
                 }
             }
-            else {
-                long fileSize = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(path).getLen());
-                dataSource = new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats);
-            }
+        }
+        catch (TrinoException e) {
+            throw e;
         }
         catch (Exception e) {
             if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") ||
@@ -188,7 +183,6 @@ else if (deserializerClassName.equals(COLUMNAR_SERDE_CLASS)) {
             throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
         }
 
-        length = min(dataSource.getSize() - start, length);
         // Split may be empty now that the correct file size is known
         if (length <= 0) {
             return Optional.of(noProjectionAdaptation(new EmptyPageSource()));
@@ -202,23 +196,16 @@ else if (deserializerClassName.equals(COLUMNAR_SERDE_CLASS)) {
         }
 
         RcFileReader rcFileReader = new RcFileReader(
-                dataSource,
+                inputFile,
                 rcFileEncoding,
                 readColumns.buildOrThrow(),
-                new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())),
                 start,
-                length,
-                BUFFER_SIZE);
+                length);
 
         ConnectorPageSource pageSource = new RcFilePageSource(rcFileReader, projectedReaderColumns);
         return Optional.of(new ReaderPageSource(pageSource, readerProjections));
     }
     catch (Throwable e) {
-        try {
-            dataSource.close();
-        }
-        catch (IOException ignored) {
-        }
         if (e instanceof TrinoException) {
             throw (TrinoException) e;
         }
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CompressionConfigUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CompressionConfigUtil.java
index b7ced2c11f5f..525edff6c559 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CompressionConfigUtil.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CompressionConfigUtil.java
@@ -41,9 +41,9 @@ public static void configureCompression(Configuration config, HiveCompressionCod
         OrcConf.COMPRESS.setString(config, compressionCodec.getOrcCompressionKind().name());
 
         // For RCFile and Text
-        if (compressionCodec.getCodec().isPresent()) {
-            config.set("mapred.output.compression.codec", compressionCodec.getCodec().get().getName());
-            config.set(FileOutputFormat.COMPRESS_CODEC, compressionCodec.getCodec().get().getName());
+        if (compressionCodec.getHiveCompressionKind().isPresent()) {
+            config.set("mapred.output.compression.codec", compressionCodec.getHiveCompressionKind().get().getHadoopClassName());
+            config.set(FileOutputFormat.COMPRESS_CODEC, compressionCodec.getHiveCompressionKind().get().getHadoopClassName());
         }
         else {
             config.unset("mapred.output.compression.codec");
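Codec selection now round-trips through CompressionKind instead of Hadoop codec Class objects, so choosing a file extension no longer requires instantiating a codec reflectively (compare the TestHiveFileFormats hunk below). An illustrative round trip using the accessors that appear in this diff; whether the gzip codec is among the mapped class names, and the exact extension value, are assumptions:

    // Hadoop class name -> CompressionKind -> back to class name / extension
    CompressionKind kind = CompressionKind.fromHadoopClassName("org.apache.hadoop.io.compress.GzipCodec");
    String hadoopClassName = kind.getHadoopClassName(); // for mapred.output.compression.codec
    String fileExtension = kind.getFileExtension();     // e.g. ".gz" (assumed value)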
NodeVersion("test"), HIVE_STORAGE_TIME_ZONE)) .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new RcFilePageSourceFactory(TESTING_TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig())); } @@ -289,7 +290,7 @@ public void testRcBinaryOptimizedWriter(int rowCount) assertThatFileFormat(RCBINARY) .withColumns(testColumns) .withRowsCount(rowCount) - .withFileWriterFactory(new RcFileFileWriterFactory(HDFS_ENVIRONMENT, TESTING_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE, STATS)) + .withFileWriterFactory(new RcFileFileWriterFactory(HDFS_ENVIRONMENT, TESTING_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE)) .isReadableByPageSource(new RcFilePageSourceFactory(TESTING_TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig())) .withColumns(testColumnsNoTimestamps) .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); @@ -773,7 +774,7 @@ public void testRCBinaryProjectedColumns(int rowCount) .withWriteColumns(writeColumns) .withReadColumns(readColumns) .withRowsCount(rowCount) - .withFileWriterFactory(new RcFileFileWriterFactory(HDFS_ENVIRONMENT, TESTING_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE, STATS)) + .withFileWriterFactory(new RcFileFileWriterFactory(HDFS_ENVIRONMENT, TESTING_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE)) .isReadableByPageSource(new RcFilePageSourceFactory(TESTING_TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig())); } @@ -801,7 +802,7 @@ public void testRCBinaryProjectedColumnsPageSource(int rowCount) .withWriteColumns(writeColumns) .withReadColumns(readColumns) .withRowsCount(rowCount) - .withFileWriterFactory(new RcFileFileWriterFactory(HDFS_ENVIRONMENT, TESTING_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE, STATS)) + .withFileWriterFactory(new RcFileFileWriterFactory(HDFS_ENVIRONMENT, TESTING_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE)) .isReadableByPageSource(new RcFilePageSourceFactory(TESTING_TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig())); } @@ -1270,15 +1271,8 @@ private void assertRead(Optional pageSourceFactory, Optio assertNotNull(session, "session must be specified"); assertTrue(rowsCount >= 0, "rowsCount must be non-negative"); - String compressionSuffix = compressionCodec.getCodec() - .map(codec -> { - try { - return codec.getConstructor().newInstance().getDefaultExtension(); - } - catch (Exception e) { - throw new RuntimeException(e); - } - }) + String compressionSuffix = compressionCodec.getHiveCompressionKind() + .map(CompressionKind::getFileExtension) .orElse(""); File file = File.createTempFile("trino_test", formatName + compressionSuffix); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/StandardFileFormats.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/StandardFileFormats.java index b30ea5c6053f..25dab4c05fb2 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/StandardFileFormats.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/StandardFileFormats.java @@ -14,9 +14,12 @@ package io.trino.plugin.hive.benchmark; import com.google.common.collect.ImmutableMap; -import io.airlift.slice.OutputStreamSliceOutput; import io.trino.filesystem.hdfs.HdfsFileSystemFactory; import io.trino.hdfs.HdfsEnvironment; +import io.trino.hive.formats.rcfile.RcFileEncoding; +import io.trino.hive.formats.rcfile.RcFileWriter; +import 
io.trino.hive.formats.rcfile.binary.BinaryRcFileEncoding; +import io.trino.hive.formats.rcfile.text.TextRcFileEncoding; import io.trino.orc.OrcReaderOptions; import io.trino.orc.OrcWriter; import io.trino.orc.OrcWriterOptions; @@ -37,12 +40,6 @@ import io.trino.plugin.hive.parquet.ParquetPageSourceFactory; import io.trino.plugin.hive.parquet.ParquetReaderConfig; import io.trino.plugin.hive.rcfile.RcFilePageSourceFactory; -import io.trino.rcfile.AircompressorCodecFactory; -import io.trino.rcfile.HadoopCodecFactory; -import io.trino.rcfile.RcFileEncoding; -import io.trino.rcfile.RcFileWriter; -import io.trino.rcfile.binary.BinaryRcFileEncoding; -import io.trino.rcfile.text.TextRcFileEncoding; import io.trino.spi.Page; import io.trino.spi.connector.ConnectorSession; import io.trino.spi.type.Type; @@ -386,11 +383,10 @@ public PrestoRcFileFormatWriter(File targetFile, List types, RcFileEncodin throws IOException { writer = new RcFileWriter( - new OutputStreamSliceOutput(new FileOutputStream(targetFile)), + new FileOutputStream(targetFile), types, encoding, - compressionCodec.getCodec().map(Class::getName), - new AircompressorCodecFactory(new HadoopCodecFactory(getClass().getClassLoader())), + compressionCodec.getHiveCompressionKind(), ImmutableMap.of(), true); } diff --git a/pom.xml b/pom.xml index 8acafb4b0d44..1ae8edee5ea0 100644 --- a/pom.xml +++ b/pom.xml @@ -118,6 +118,7 @@ lib/trino-geospatial-toolkit lib/trino-hadoop-toolkit lib/trino-hdfs + lib/trino-hive-formats lib/trino-matching lib/trino-memory-context lib/trino-orc @@ -125,7 +126,6 @@ lib/trino-phoenix5-patched lib/trino-plugin-toolkit - lib/trino-rcfile lib/trino-record-decoder plugin/trino-accumulo plugin/trino-accumulo-iterators @@ -357,6 +357,12 @@ ${project.version} + + io.trino + trino-hive-formats + ${project.version} + + io.trino trino-hive-hadoop2 @@ -558,12 +564,6 @@ ${project.version} - - io.trino - trino-rcfile - ${project.version} - - io.trino trino-record-decoder