
Commit 90f3001

added comments for ORC format
1 parent 3088c0c commit 90f3001

File tree

3 files changed: +142 −16 lines


packages/aws-cdk-lib/aws-kinesisfirehose/lib/record-format/input.ts

Lines changed: 7 additions & 9 deletions
@@ -74,7 +74,7 @@ export class TimestampParser {
  /**
   * Default timestamp parser.
   *
-  * You should specify this parser if you want to preserve the default timestamp parsing logic.
+  * Specify this parser to preserve the default timestamp parsing logic.
   */
  public static readonly DEFAULT = new TimestampParser('java.sql.Timestamp::valueOf');

@@ -109,10 +109,10 @@ export interface HiveJsonInputFormatProps {
  /**
   * List of TimestampParsers.
   *
-  * These are used to parse custom timestamp strings from your input JSON into dates.
+  * These are used to parse custom timestamp strings from input JSON into dates.
   *
-  * Note: Specifying a parser will override the default timestamp parser. If you require the default timestamp parser,
-  * include `TimestampParser.DEFAULT` in the list of parsers along with your custom parser.
+  * Note: Specifying a parser will override the default timestamp parser. If the default timestamp parser is required,
+  * include `TimestampParser.DEFAULT` in the list of parsers along with the custom parser.
   *
   * @default the default timestamp parser is used
   */
@@ -147,18 +147,16 @@ export class HiveJsonInputFormat implements IInputFormat {
}

/**
- * Represents possible input formats when perform record data conversion.
- *
- * You can choose to parse your input JSON with OpenX JSON specification or Hive JSON specification.
+ * Represents possible input formats when performing record data conversion.
 */
export class InputFormat {
  /**
-  * Parse your JSON with OpenX JSON specification. This will typically suffice.
+  * Parse input JSON with OpenX JSON specification. This will typically suffice.
   */
  public static readonly OPENX_JSON = new OpenXJsonInputFormat();

  /**
-  * Parse your JSON with Hive JSON specification.
+  * Parse input JSON with Hive JSON specification.
   */
  public static readonly HIVE_JSON = new HiveJsonInputFormat();
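The note on the timestamp parsers matters in practice: adding any custom parser drops the default one unless it is re-added explicitly. A minimal sketch of what that could look like, assuming the list is passed through a `timestampParsers` prop and that the `TimestampParser` constructor is public (neither detail is shown in this diff):

```ts
import * as firehose from 'aws-cdk-lib/aws-kinesisfirehose';

// Sketch only: `timestampParsers` and a public TimestampParser constructor are
// assumptions; the diff documents the override behaviour but not the API shape.
const inputFormat = new firehose.HiveJsonInputFormat({
  timestampParsers: [
    firehose.TimestampParser.DEFAULT,                    // keep java.sql.Timestamp::valueOf parsing
    new firehose.TimestampParser('yyyy-MM-dd HH:mm:ss'), // hypothetical custom pattern
  ],
});
```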

packages/aws-cdk-lib/aws-kinesisfirehose/lib/record-format/output.ts

Lines changed: 133 additions & 7 deletions
@@ -14,6 +14,9 @@ export enum ParquetWriterVersion {
  V2 = 'V2',
}

+/**
+ * Props for Parquet output format for data record format conversion
+ */
export interface ParquetOutputFormatProps {

  /**
@@ -82,7 +85,7 @@ export interface ParquetOutputFormatProps {
 * You should only need to specify an instance of this class if the default configuration does not suit your needs.
 */
export class ParquetOutputFormat implements IOutputFormat {
-  private readonly VALID_COMPRESSIONS = [Compression.SNAPPY, Compression.UNCOMPRESSED, Compression.GZIP];
+  private static readonly VALID_COMPRESSIONS = [Compression.SNAPPY, Compression.UNCOMPRESSED, Compression.GZIP];

  public constructor(readonly props?: ParquetOutputFormatProps) {
    this.validateProps(props);
@@ -93,8 +96,8 @@ export class ParquetOutputFormat implements IOutputFormat {
      return;
    }

-    if (props.compression !== undefined && !this.VALID_COMPRESSIONS.map(compression => compression.value).includes(props.compression.value)) {
-      throw new core.UnscopedValidationError(`Compression ${props.compression} is invalid, it must be one of ${this.VALID_COMPRESSIONS}`);
+    if (props.compression !== undefined && !ParquetOutputFormat.VALID_COMPRESSIONS.map(compression => compression.value).includes(props.compression.value)) {
+      throw new core.UnscopedValidationError(`Compression ${props.compression} is invalid, it must be one of ${ParquetOutputFormat.VALID_COMPRESSIONS}`);
    }

    if (props.blockSize !== undefined && props.blockSize.toMebibytes() < 64) {
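From the caller's side, the move to a static `VALID_COMPRESSIONS` does not change behaviour: only `SNAPPY`, `UNCOMPRESSED` and `GZIP` pass validation, and anything else throws at construction time. A rough usage sketch, assuming these classes are re-exported from `aws-cdk-lib/aws-kinesisfirehose`:

```ts
import * as firehose from 'aws-cdk-lib/aws-kinesisfirehose';

// Valid compressions for Parquet output: SNAPPY, UNCOMPRESSED or GZIP.
// Any other Compression value makes the constructor throw an UnscopedValidationError.
const parquet = new firehose.ParquetOutputFormat({
  compression: firehose.Compression.GZIP,
});
```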
@@ -132,21 +135,132 @@ export enum OrcFormatVersion {
  V0_12 = 'V0_12',
}

+/**
+ * Props for ORC output format for data record format conversion
+ */
export interface OrcOutputFormatProps {
+
+  /**
+   * The Hadoop Distributed File System (HDFS) block size.
+   * This is useful if you intend to copy the data from Amazon S3 to HDFS before querying.
+   * Firehose uses this value for padding calculations.
+   *
+   * @minimum `Size.mebibytes(64)`
+   * @default `Size.mebibytes(256)`
+   */
  readonly blockSize?: core.Size;
+
+  /**
+   * The compression code to use over data blocks.
+   *
+   * The possible values are `UNCOMPRESSED`, `SNAPPY`, and `GZIP`.
+   * Use `SNAPPY` for higher decompression speed.
+   * Use `GZIP` if the compression ratio is more important than speed.
+   * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-compression
+   * @default `SNAPPY`
+   */
+  readonly compression?: Compression;
+
+  /**
+   * The column names for which you want Firehose to create bloom filters.
+   *
+   * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-bloomfiltercolumns
+   *
+   * @default no bloom filters are created
+   */
  readonly bloomFilterColumns?: string[];
+
+  /**
+   * The Bloom filter false positive probability (FPP).
+   *
+   * The lower the FPP, the bigger the bloom filter.
+   * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-bloomfilterfalsepositiveprobability
+   *
+   * @minimum `0`
+   * @maximum `1`
+   * @default `0.05`
+   */
  readonly bloomFilterFalsePositiveProbability?: number;
-  readonly compression?: Compression;
+
+  /**
+   * Determines whether dictionary encoding should be applied to a column.
+   *
+   * If the number of distinct keys (unique values) in a column exceeds this fraction of the total non-null rows in that column, dictionary encoding will be turned off for that specific column.
+   *
+   * To turn off dictionary encoding, set this threshold to 0. To always use dictionary encoding, set this threshold to 1.
+   * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-dictionarykeythreshold
+   *
+   * @minimum `0`
+   * @maximum `1`
+   * @default `0.8`
+   */
  readonly dictionaryKeyThreshold?: number;
+
+  /**
+   * Set this to `true` to indicate that you want stripes to be padded to the HDFS block boundaries.
+   *
+   * This is useful if you intend to copy the data from Amazon S3 to HDFS before querying.
+   * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-enablepadding
+   *
+   * @default `false`
+   */
  readonly enablePadding?: boolean;
+
+  /**
+   * The version of the ORC format to write.
+   *
+   * The possible values are `V0_11` and `V0_12`.
+   * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-formatversion
+   *
+   * @default `V0_12`
+   */
  readonly formatVersion?: OrcFormatVersion;
+
+  /**
+   * A number between 0 and 1 that defines the tolerance for block padding as a decimal fraction of stripe size.
+   *
+   * The default value is 0.05, which means 5 percent of stripe size.
+   *
+   * For the default values of 64 MiB ORC stripes and 256 MiB HDFS blocks, the default block padding tolerance of 5 percent reserves a maximum of 3.2 MiB for padding within the 256 MiB block.
+   * In such a case, if the available size within the block is more than 3.2 MiB, a new, smaller stripe is inserted to fit within that space.
+   * This ensures that no stripe crosses block boundaries and causes remote reads within a node-local task.
+   *
+   * Kinesis Data Firehose ignores this parameter when `EnablePadding` is `false`.
+   * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-paddingtolerance
+   *
+   * @default `0.05` if `enablePadding` is `true`
+   */
  readonly paddingTolerance?: number;
+
+  /**
+   * The number of rows between index entries.
+   *
+   * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-rowindexstride
+   *
+   * @minimum 1000
+   * @default 10000
+   */
  readonly rowIndexStride?: number;
+
+  /**
+   * The number of bytes in each stripe.
+   *
+   * The default is 64 MiB and the minimum is 8 MiB.
+   * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-orcserde.html#cfn-kinesisfirehose-deliverystream-orcserde-stripesizebytes
+   *
+   * @minimum `Size.mebibytes(8)`
+   * @default `Size.mebibytes(64)`
+   */
  readonly stripeSize?: core.Size;
}

+/**
+ * This class specifies properties for ORC output format for record format conversion.
+ *
+ * You should only need to specify an instance of this class if the default configuration does not suit your needs.
+ */
class OrcOutputFormat implements IOutputFormat {
-  private readonly VALID_COMPRESSIONS = [Compression.SNAPPY, Compression.UNCOMPRESSED, Compression.GZIP];
+  private static readonly VALID_COMPRESSIONS = [Compression.SNAPPY, Compression.UNCOMPRESSED, Compression.GZIP];

  public constructor(readonly props?: OrcOutputFormatProps) {
    this.validateProps(props);
@@ -161,8 +275,8 @@ class OrcOutputFormat implements IOutputFormat {
      return;
    }

-    if (props.compression !== undefined && !this.VALID_COMPRESSIONS.map(compression => compression.value).includes(props.compression.value)) {
-      throw new core.UnscopedValidationError(`Compression ${props.compression} is invalid, it must be one of ${this.VALID_COMPRESSIONS}`);
+    if (props.compression !== undefined && !OrcOutputFormat.VALID_COMPRESSIONS.map(compression => compression.value).includes(props.compression.value)) {
+      throw new core.UnscopedValidationError(`Compression ${props.compression} is invalid, it must be one of ${OrcOutputFormat.VALID_COMPRESSIONS}`);
    }

    if (props.blockSize !== undefined && props.blockSize.toMebibytes() < 64) {
@@ -215,8 +329,20 @@ class OrcOutputFormat implements IOutputFormat {
  }
}

+
+/**
+ * Represents possible output formats when performing record data conversion.
+ */
export class OutputFormat {
+
+  /**
+   * Write output files in Parquet
+   */
  public static readonly PARQUET = new ParquetOutputFormat();
+
+  /**
+   * Write output files in ORC
+   */
  public static readonly ORC = new OrcOutputFormat();

  private constructor() {}
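Taken together, the new props mirror the CloudFormation `OrcSerDe` settings. A configuration sketch of the documented values, assuming `OrcOutputFormatProps`, `OrcFormatVersion` and `Compression` are re-exported from `aws-cdk-lib/aws-kinesisfirehose`; the column name is hypothetical, and note that in this diff `OrcOutputFormat` itself is not exported, only the `OutputFormat.ORC` default:

```ts
import { Size } from 'aws-cdk-lib';
import * as firehose from 'aws-cdk-lib/aws-kinesisfirehose';

// Defaults: 256 MiB blocks, 64 MiB stripes, SNAPPY, V0_12, no bloom filters.
const orcDefaults = firehose.OutputFormat.ORC;

// Shape of a non-default configuration, per the documented props above:
const orcProps: firehose.OrcOutputFormatProps = {
  blockSize: Size.mebibytes(256),            // minimum 64 MiB
  stripeSize: Size.mebibytes(64),            // minimum 8 MiB
  compression: firehose.Compression.GZIP,    // SNAPPY | UNCOMPRESSED | GZIP only
  bloomFilterColumns: ['customer_id'],       // hypothetical column name
  bloomFilterFalsePositiveProbability: 0.05, // between 0 and 1
  dictionaryKeyThreshold: 0.8,               // 0 disables, 1 always encodes
  enablePadding: true,
  paddingTolerance: 0.05,                    // ignored when enablePadding is false
  formatVersion: firehose.OrcFormatVersion.V0_12,
  rowIndexStride: 10000,                     // minimum 1000
};
```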

packages/aws-cdk-lib/aws-kinesisfirehose/lib/s3-bucket.ts

Lines changed: 2 additions & 0 deletions
@@ -36,6 +36,8 @@ export interface S3BucketProps extends CommonDestinationS3Props, CommonDestinati
  /**
   * The input format, output format, and schema for converting data from the JSON format to the Parquet or ORC format before writing it to Amazon S3.
   * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-kinesisfirehose-deliverystream-extendeds3destinationconfiguration.html#cfn-kinesisfirehose-deliverystream-extendeds3destinationconfiguration-dataformatconversionconfiguration
+   *
+   * @default no data format conversion is done
   */
  readonly dataFormatConversionConfiguration?: DataFormatConversionConfiguration;
}
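For orientation, this prop is where the formats above get wired in. The field names inside `DataFormatConversionConfiguration` are assumptions here (only the prop itself and its default appear in this diff), and the schema source, e.g. an AWS Glue table, is omitted:

```ts
import * as firehose from 'aws-cdk-lib/aws-kinesisfirehose';
import * as s3 from 'aws-cdk-lib/aws-s3';

declare const bucket: s3.Bucket;

const destination = new firehose.S3Bucket(bucket, {
  dataFormatConversionConfiguration: {
    // `inputFormat` / `outputFormat` field names are assumed, not shown in this diff.
    inputFormat: firehose.InputFormat.OPENX_JSON,
    outputFormat: firehose.OutputFormat.ORC,
    // ...plus the schema configuration documented above (omitted here).
  },
});
```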
