@@ -14,6 +14,9 @@ export enum ParquetWriterVersion {
1414 V2 = 'V2' ,
1515}
1616
17+ /**
18+ * Props for Parquet output format for data record format conversion
19+ */
1720export interface ParquetOutputFormatProps {
1821
1922 /**
@@ -82,7 +85,7 @@ export interface ParquetOutputFormatProps {
8285 * You should only need to specify an instance of this class if the default configuration does not suit your needs.
8386 */
8487export class ParquetOutputFormat implements IOutputFormat {
85- private readonly VALID_COMPRESSIONS = [ Compression . SNAPPY , Compression . UNCOMPRESSED , Compression . GZIP ] ;
88+ private static readonly VALID_COMPRESSIONS = [ Compression . SNAPPY , Compression . UNCOMPRESSED , Compression . GZIP ] ;
8689
8790 public constructor ( readonly props ?: ParquetOutputFormatProps ) {
8891 this . validateProps ( props ) ;
@@ -93,8 +96,8 @@ export class ParquetOutputFormat implements IOutputFormat {
9396 return ;
9497 }
9598
96- if ( props . compression !== undefined && ! this . VALID_COMPRESSIONS . map ( compression => compression . value ) . includes ( props . compression . value ) ) {
97- throw new core . UnscopedValidationError ( `Compression ${ props . compression } is invalid, it must be one of ${ this . VALID_COMPRESSIONS } ` ) ;
99+ if ( props . compression !== undefined && ! ParquetOutputFormat . VALID_COMPRESSIONS . map ( compression => compression . value ) . includes ( props . compression . value ) ) {
100+ throw new core . UnscopedValidationError ( `Compression ${ props . compression } is invalid, it must be one of ${ ParquetOutputFormat . VALID_COMPRESSIONS } ` ) ;
98101 }
99102
100103 if ( props . blockSize !== undefined && props . blockSize . toMebibytes ( ) < 64 ) {
@@ -132,21 +135,132 @@ export enum OrcFormatVersion {
132135 V0_12 = 'V0_12' ,
133136}
134137
138+ /**
139+ * Props for ORC output format for data record format conversion
140+ */
135141export interface OrcOutputFormatProps {
142+
143+ /**
144+ * The Hadoop Distributed File System (HDFS) block size.
145+ * This is useful if you intend to copy the data from Amazon S3 to HDFS before querying.
146+ * Firehose uses this value for padding calculations.
147+ *
148+ * @minimum `Size.mebibytes(64)`
149+ * @default `Size.mebibytes(256)`
150+ */
136151 readonly blockSize ?: core . Size ;
152+
153+ /**
154+ * The compression code to use over data blocks.
155+ *
156+ * The possible values are `UNCOMPRESSED` , `SNAPPY` , and `GZIP`.
157+ * Use `SNAPPY` for higher decompression speed.
158+ * Use `GZIP` if the compression ratio is more important than speed.
 159+ * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-compression
160+ * @default `SNAPPY`
161+ */
162+ readonly compression ?: Compression ;
163+
164+ /**
165+ * The column names for which you want Firehose to create bloom filters.
166+ *
167+ * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-bloomfiltercolumns
168+ *
169+ * @default no bloom filters are created
170+ */
137171 readonly bloomFilterColumns ?: string [ ] ;
172+
173+ /**
174+ * The Bloom filter false positive probability (FPP).
175+ *
176+ * The lower the FPP, the bigger the bloom filter.
177+ * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-bloomfilterfalsepositiveprobability
178+ *
179+ * @minimum `0`
180+ * @maximum `1`
181+ * @default `0.05`
182+ */
138183 readonly bloomFilterFalsePositiveProbability ?: number ;
139- readonly compression ?: Compression ;
184+
185+ /**
186+ * Determines whether dictionary encoding should be applied to a column.
187+ *
188+ * If the number of distinct keys (unique values) in a column exceeds this fraction of the total non-null rows in that column, dictionary encoding will be turned off for that specific column.
189+ *
190+ * To turn off dictionary encoding, set this threshold to 0. To always use dictionary encoding, set this threshold to 1.
191+ * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-dictionarykeythreshold
192+ *
193+ * @minimum `0`
194+ * @maximum `1`
195+ * @default `0.8`
196+ */
140197 readonly dictionaryKeyThreshold ?: number ;
198+
199+ /**
200+ * Set this to `true` to indicate that you want stripes to be padded to the HDFS block boundaries.
201+ *
202+ * This is useful if you intend to copy the data from Amazon S3 to HDFS before querying.
203+ * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-enablepadding
204+ *
205+ * @default `false`
206+ */
141207 readonly enablePadding ?: boolean ;
208+
209+ /**
210+ * The version of the ORC format to write.
211+ *
212+ * The possible values are `V0_11` and `V0_12`.
213+ * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-formatversion
214+ *
215+ * @default `V0_12`
216+ */
142217 readonly formatVersion ?: OrcFormatVersion ;
218+
219+ /**
220+ * A number between 0 and 1 that defines the tolerance for block padding as a decimal fraction of stripe size.
221+ *
222+ * The default value is 0.05, which means 5 percent of stripe size.
223+ *
224+ * For the default values of 64 MiB ORC stripes and 256 MiB HDFS blocks, the default block padding tolerance of 5 percent reserves a maximum of 3.2 MiB for padding within the 256 MiB block.
225+ * In such a case, if the available size within the block is more than 3.2 MiB, a new, smaller stripe is inserted to fit within that space.
226+ * This ensures that no stripe crosses block boundaries and causes remote reads within a node-local task.
227+ *
 228+ * Firehose ignores this parameter when `enablePadding` is `false` .
229+ * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-paddingtolerance
230+ *
231+ * @default `0.05` if `enablePadding` is `true`
232+ */
143233 readonly paddingTolerance ?: number ;
234+
235+ /**
236+ * The number of rows between index entries.
237+ *
238+ * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-rowindexstride
239+ *
240+ * @minimum 1000
241+ * @default 10000
242+ */
144243 readonly rowIndexStride ?: number ;
244+
245+ /**
246+ * The number of bytes in each stripe.
247+ *
248+ * The default is 64 MiB and the minimum is 8 MiB.
249+ * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-stripesizebytes
250+ *
251+ * @minimum `Size.mebibytes(8)`
252+ * @default `Size.mebibytes(64)`
253+ */
145254 readonly stripeSize ?: core . Size ;
146255}
147256
257+ /**
258+ * This class specifies properties for ORC output format for record format conversion.
259+ *
260+ * You should only need to specify an instance of this class if the default configuration does not suit your needs.
261+ */
148262class OrcOutputFormat implements IOutputFormat {
149- private readonly VALID_COMPRESSIONS = [ Compression . SNAPPY , Compression . UNCOMPRESSED , Compression . GZIP ] ;
263+ private static readonly VALID_COMPRESSIONS = [ Compression . SNAPPY , Compression . UNCOMPRESSED , Compression . GZIP ] ;
150264
151265 public constructor ( readonly props ?: OrcOutputFormatProps ) {
152266 this . validateProps ( props ) ;
@@ -161,8 +275,8 @@ class OrcOutputFormat implements IOutputFormat {
161275 return ;
162276 }
163277
164- if ( props . compression !== undefined && ! this . VALID_COMPRESSIONS . map ( compression => compression . value ) . includes ( props . compression . value ) ) {
165- throw new core . UnscopedValidationError ( `Compression ${ props . compression } is invalid, it must be one of ${ this . VALID_COMPRESSIONS } ` ) ;
278+ if ( props . compression !== undefined && ! OrcOutputFormat . VALID_COMPRESSIONS . map ( compression => compression . value ) . includes ( props . compression . value ) ) {
279+ throw new core . UnscopedValidationError ( `Compression ${ props . compression } is invalid, it must be one of ${ OrcOutputFormat . VALID_COMPRESSIONS } ` ) ;
166280 }
167281
168282 if ( props . blockSize !== undefined && props . blockSize . toMebibytes ( ) < 64 ) {
@@ -215,8 +329,20 @@ class OrcOutputFormat implements IOutputFormat {
215329 }
216330}
217331
332+
333+ /**
334+ * Represents possible output formats when performing record data conversion.
335+ */
218336export class OutputFormat {
337+
338+ /**
339+ * Write output files in Parquet
340+ */
219341 public static readonly PARQUET = new ParquetOutputFormat ( ) ;
342+
343+ /**
344+ * Write output files in ORC
345+ */
220346 public static readonly ORC = new OrcOutputFormat ( ) ;
221347
222348 private constructor ( ) { }
0 commit comments