
Commit 38381bf

Add better comments to examples (#34)
* Add better comments to examples
* format/lint
1 parent 095fc26 commit 38381bf

8 files changed (+85, -32 lines)

Diff for: Python/basics/pardo_with_output.py (+6)

@@ -23,6 +23,10 @@

 class SplitFn(DoFn):
     def process(self, element):
+        # Generate 3 PCollections from the input:
+        # 1) Even elements, with the 'even' tag
+        # 2) Odd elements, with the 'odd' tag
+        # 3) All elements emitted as the main untagged output
         if element % 2 == 0:
             yield pvalue.TaggedOutput("even", element)
         else:
@@ -40,6 +44,8 @@ def run(argv=None):
             | "Split Output" >> ParDo(SplitFn()).with_outputs("even", "odd")
         )

+        # Log each element of both tagged PCollections
+        # and the main untagged PCollection
         odd = output.odd | "odd log" >> Map(
             lambda x: logging.info("odds %d" % x)
         )
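For context, here is a minimal runnable sketch of the multi-output pattern this diff annotates; the Create step and its element values are illustrative additions, not part of the example file:

    import logging

    import apache_beam as beam
    from apache_beam import DoFn, Map, ParDo, pvalue


    class SplitFn(DoFn):
        def process(self, element):
            # Emit to the 'even' or 'odd' tagged output, plus the main output.
            if element % 2 == 0:
                yield pvalue.TaggedOutput("even", element)
            else:
                yield pvalue.TaggedOutput("odd", element)
            yield element  # main, untagged output


    with beam.Pipeline() as p:
        output = (
            p
            | "Create" >> beam.Create([1, 2, 3, 4, 5])  # illustrative input
            | "Split Output" >> ParDo(SplitFn()).with_outputs("even", "odd")
        )
        # Tagged outputs are addressed as attributes of the result tuple.
        output.even | "even log" >> Map(lambda x: logging.info("evens %d" % x))
        output.odd | "odd log" >> Map(lambda x: logging.info("odds %d" % x))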

Diff for: Python/bigquery/read_query_bigquery.py (+5)

@@ -25,6 +25,9 @@
 class ReadQueryOptions(PipelineOptions):
     @classmethod
     def _add_argparse_args(cls, parser):
+        # Add a command line flag to be parsed along with other
+        # normal PipelineOptions. This flag will store the SQL query
+        # to be run against BigQuery.
         parser.add_argument(
             "--query",
             default=(
@@ -37,6 +40,8 @@ def _add_argparse_args(cls, parser):

 def run(argv=None):
     options = ReadQueryOptions()
+    # Create a Beam pipeline with 2 steps:
+    # run a query against BigQuery and log the results
     with beam.Pipeline(options=options) as p:
         output = (
             p
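A hedged sketch of how the documented --query flag could feed a full pipeline; the hunk truncates before the read transform, so the ReadFromBigQuery step and the default query below are assumptions (running it also needs GCP credentials and a project):

    import logging

    import apache_beam as beam
    from apache_beam.io import ReadFromBigQuery
    from apache_beam.options.pipeline_options import PipelineOptions


    class ReadQueryOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            # The SQL query to run against BigQuery (default is illustrative).
            parser.add_argument(
                "--query",
                default=(
                    "SELECT repository_language FROM "
                    "`bigquery-public-data.samples.github_timeline` LIMIT 10"
                ),
            )


    def run(argv=None):
        options = ReadQueryOptions()
        # Two steps: run the query against BigQuery, then log each row.
        with beam.Pipeline(options=options) as p:
            (
                p
                | "Read from BigQuery"
                >> ReadFromBigQuery(query=options.query, use_standard_sql=True)
                | "Log rows" >> beam.Map(logging.info)
            )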

Diff for: Python/bigquery/read_table_ref_bigquery.py (+2)

@@ -23,12 +23,14 @@


 def run(argv=None):
+    # Configure the table we are reading from.
     table = bigquery.TableReference(
         projectId="bigquery-public-data",
         datasetId="samples",
         tableId="github_timeline",
     )

+    # Create a Beam pipeline with 2 steps: read from BigQuery and log the data.
     with beam.Pipeline() as p:
         output = (
             p
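A plausible completion of the truncated pipeline, assuming the read step uses ReadFromBigQuery with the table reference shown above; the step names are illustrative:

    import logging

    import apache_beam as beam
    from apache_beam.io import ReadFromBigQuery
    from apache_beam.io.gcp.internal.clients import bigquery


    def run(argv=None):
        # Fully-qualified reference to a public BigQuery table.
        table = bigquery.TableReference(
            projectId="bigquery-public-data",
            datasetId="samples",
            tableId="github_timeline",
        )

        # Read from BigQuery and log each row.
        with beam.Pipeline() as p:
            (
                p
                | "Read table" >> ReadFromBigQuery(table=table)
                | "Log" >> beam.Map(logging.info)
            )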

Diff for: Python/bigquery/write_bigquery.py (+2)

@@ -44,6 +44,8 @@ def run(argv=None):
     class WriteBigQueryOptions(PipelineOptions):
         @classmethod
         def _add_argparse_args(cls, parser):
+            # Add a command line flag to be parsed along
+            # with other normal PipelineOptions
             parser.add_argument(
                 "--output_table", required=True, help="BQ Table to write"
            )
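A minimal sketch of consuming the required flag, assuming a WriteToBigQuery sink; the sample rows and schema are illustrative, not from the example file:

    import apache_beam as beam
    from apache_beam.io import WriteToBigQuery
    from apache_beam.options.pipeline_options import PipelineOptions


    class WriteBigQueryOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            # required=True makes the pipeline fail fast when the flag is absent.
            parser.add_argument(
                "--output_table", required=True, help="BQ Table to write"
            )


    options = WriteBigQueryOptions()
    with beam.Pipeline(options=options) as p:
        (
            p
            | "Create" >> beam.Create([{"name": "a"}, {"name": "b"}])  # sample rows
            | "Write" >> WriteToBigQuery(options.output_table, schema="name:STRING")
        )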

Diff for: Python/extra_examples/file_system_dynamics.py (+4)

@@ -24,6 +24,10 @@

 class WriteFileSystems(DoFn):
     def process(self, element):
+        # Beam's built-in FileSystems module has built in support for many
+        # different backing storage systems, we use this to write our element.
+        # Each input element is formatted as a Tuple of the form
+        # <destination file, data to write>
         writer = FileSystems.create(element[0])
         writer.write(bytes(element[1], encoding="utf8"))
         writer.close()
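The same DoFn in a self-contained, runnable form; the local /tmp destinations are illustrative stand-ins for gs:// or other supported schemes:

    import apache_beam as beam
    from apache_beam import DoFn, ParDo
    from apache_beam.io.filesystems import FileSystems


    class WriteFileSystems(DoFn):
        def process(self, element):
            # element is a (destination path, data) tuple; FileSystems picks
            # the backend (local, gs://, hdfs://, ...) from the path scheme.
            writer = FileSystems.create(element[0])
            writer.write(bytes(element[1], encoding="utf8"))
            writer.close()


    with beam.Pipeline() as p:
        (
            p
            | beam.Create([("/tmp/beam_a.txt", "hello"), ("/tmp/beam_b.txt", "world")])
            | ParDo(WriteFileSystems())
        )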

Diff for: Python/gcs/read_textio.py (+7)

@@ -27,6 +27,8 @@ def run(argv=None):
     class ReadTextOptions(PipelineOptions):
         @classmethod
         def _add_argparse_args(cls, parser):
+            # Add a command line flag to be parsed along
+            # with other normal PipelineOptions
             parser.add_argument(
                 "--path",
                 default="gs://dataflow-samples/shakespeare/kinglear.txt",
@@ -35,6 +37,11 @@ def _add_argparse_args(cls, parser):

     options = ReadTextOptions()

+    # Create a Beam pipeline with 3 steps:
+    # 1) Read text. This will emit one record per line
+    # 2) Count.Globally(). This will count the number of
+    # elements in the PCollection.
+    # 3) Log the output.
     with beam.Pipeline(options=options) as p:
         (
             p
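The three commented steps, assembled into a minimal runnable sketch (the bucket path is the default from the diff above):

    import logging

    import apache_beam as beam
    from apache_beam import Map
    from apache_beam.io import ReadFromText
    from apache_beam.transforms.combiners import Count

    with beam.Pipeline() as p:
        (
            p
            | "Read text" >> ReadFromText("gs://dataflow-samples/shakespeare/kinglear.txt")
            | "Count lines" >> Count.Globally()
            | "Log count" >> Map(lambda n: logging.info("total lines: %d", n))
        )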

Diff for: Python/pubsub/gcloud_logs_filter_with_dlq.py (+54, -31)

@@ -12,46 +12,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import logging
+# standard libraries
 import json
+import logging

+# third party libraries
 import apache_beam as beam
-from apache_beam import DoFn
-from apache_beam import Filter
-from apache_beam import Map
-from apache_beam import ParDo
-from apache_beam.io import ReadFromPubSub
-from apache_beam.io import WriteToPubSub
+from apache_beam import DoFn, Filter, Map, ParDo
+from apache_beam.io import ReadFromPubSub, WriteToPubSub
 from apache_beam.options.pipeline_options import PipelineOptions

-
 PROCESSED_TAG = "processed"
 UNPROCESSED_TAG = "unprocessed"


 class PubSubOptions(PipelineOptions):
-
     @classmethod
     def _add_argparse_args(cls, parser):
         parser.add_argument(
             "--input_topic",
             default="projects/your-project/topics/your-input-test",
-            help="Input PubSub topic")
+            help="Input PubSub topic",
+        )
         parser.add_argument(
             "--output_topic",
             default="projects/your-project/topics/your-output-test",
-            help="Output PubSub topic")
+            help="Output PubSub topic",
+        )
         parser.add_argument(
             "--dlq_topic",
             default="projects/your-project/topics/your-dlq-test",
-            help="Dead Letter Queue PubSub topic")
+            help="Dead Letter Queue PubSub topic",
+        )


 def run():
     """
-    This Apache Beam pipeline processes log messages from a Google Cloud Pub/Sub topic.
-    The expected data format follows the standard Google Cloud log format,
-    which can be achieved by routing logs to a Pub/Sub topic via https://console.cloud.google.com/logs/router.
+    This Apache Beam pipeline processes log messages from a Google Cloud
+    Pub/Sub topic. The expected data format follows the standard Google Cloud
+    log format, which can be achieved by routing logs to a Pub/Sub topic via
+    https://console.cloud.google.com/logs/router.

     It performs the following steps:
     1. Input Configuration:
@@ -66,40 +66,57 @@ def run():
        b. UNPROCESSED (missing one or both of these fields).

     4. Severity Filtering:
-       - For PROCESSED messages, filters out those with severity other than "ERROR".
+       - For PROCESSED messages, filters out those with severity other than
+         "ERROR".

     5. Data Transformation:
-       - Extracts timestamp and message content from the 'jsonPayload' field for PROCESSED messages.
+       - Extracts timestamp and message content from the 'jsonPayload' field
+         for PROCESSED messages.

     6. Output Handling:
-       - Writes transformed PROCESSED messages to a specified output Pub/Sub topic.
+       - Writes transformed PROCESSED messages to a specified output Pub/Sub
+         topic.
        - Sends UNPROCESSED messages to a Dead Letter Queue (DLQ) topic.
     """

     options = PubSubOptions(streaming=True)

     with beam.Pipeline(options=options) as p:
-        split_result = (p | "Read from PubSub" >> ReadFromPubSub(topic=options.input_topic)
-                        | "Parse JSON" >> Map(lambda msg: json.loads(msg))
-                        | "Split Messages" >> ParDo(SplitMessages()).with_outputs(UNPROCESSED_TAG, PROCESSED_TAG))
+        split_result = (
+            p
+            | "Read from PubSub" >> ReadFromPubSub(topic=options.input_topic)
+            | "Parse JSON" >> Map(lambda msg: json.loads(msg))
+            | "Split Messages"
+            >> ParDo(SplitMessages()).with_outputs(
+                UNPROCESSED_TAG, PROCESSED_TAG
+            )
+        )

         # Filter processed messages and write to output topic
-        (split_result[PROCESSED_TAG]
-         | "Filter by Severity" >> Filter(filter_by_severity)
-         | "Map to PubsubMessage for output" >> Map(to_pubsub_message_for_output)
-         | "Write to PubSub" >> WriteToPubSub(options.output_topic, with_attributes=True))
+        (
+            split_result[PROCESSED_TAG]
+            | "Filter by Severity" >> Filter(filter_by_severity)
+            | "Map to PubsubMessage for output"
+            >> Map(to_pubsub_message_for_output)
+            | "Write to PubSub"
+            >> WriteToPubSub(options.output_topic, with_attributes=True)
+        )

         # Write unprocessed messages to DLQ
-        (split_result[UNPROCESSED_TAG]
-         | "Map to PubsubMessage for DLQ" >> Map(to_pubsub_message_for_dlq)
-         | "Write to DLQ" >> WriteToPubSub(options.dlq_topic, with_attributes=True))
+        (
+            split_result[UNPROCESSED_TAG]
+            | "Map to PubsubMessage for DLQ" >> Map(to_pubsub_message_for_dlq)
+            | "Write to DLQ"
+            >> WriteToPubSub(options.dlq_topic, with_attributes=True)
+        )


 class SplitMessages(DoFn):
     def process(self, element):
+        # third party libraries
         from apache_beam.pvalue import TaggedOutput

-        if ('severity' in element) & ('jsonPayload' in element):
+        if ("severity" in element) & ("jsonPayload" in element):
             yield TaggedOutput(PROCESSED_TAG, element)
         else:
             yield TaggedOutput(UNPROCESSED_TAG, element)
@@ -111,20 +128,26 @@ def filter_by_severity(log):


 def to_pubsub_message_for_dlq(msg):
+    # third party libraries
     from apache_beam.io import PubsubMessage

     return PubsubMessage(data=bytes(json.dumps(msg), "utf-8"), attributes=None)


 def to_pubsub_message_for_output(log):
+    # third party libraries
     from apache_beam.io import PubsubMessage

     # Example transformation: Extract relevant information from the log
     transformed_data = {
         "timestamp": log.get("timestamp"),
-        "message": log.get("jsonPayload").get("message")
+        "message": log.get("jsonPayload").get("message"),
     }
-    data = bytes(f"Error log message: {transformed_data['message']} [{transformed_data['timestamp']}]", "utf-8")
+    data = bytes(
+        f"Error log message: {transformed_data['message']} "
+        f"[{transformed_data['timestamp']}]",
+        "utf-8",
+    )
     return PubsubMessage(data=data, attributes=transformed_data)
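The last hunk header references filter_by_severity, whose body lies outside the diff context. Based on the docstring ("filters out those with severity other than 'ERROR'"), a plausible sketch follows; this is an assumption, not the committed code:

    def filter_by_severity(log):
        # Assumed implementation: keep only logs whose severity is ERROR.
        return log.get("severity", "").upper() == "ERROR"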

Diff for: Python/pubsub/read_pubsub_multiple.py (+5, -1)

@@ -26,14 +26,18 @@ def run():
     class ReadPubSubOptions(PipelineOptions):
         @classmethod
         def _add_argparse_args(cls, parser):
+            # Add a command line flag to be parsed along
+            # with other normal PipelineOptions
             parser.add_argument(
                 "--sources",
                 required=True,
-                help="PubSub topics or subscriptions, separated by a coma,"
+                help="PubSub topics or subscriptions, separated by a comma,"
                 "e.g.: projects/a/topics/t1,projects/a/topics/t2.",
             )

     options = ReadPubSubOptions(streaming=True)
+    # Split the source argument into a list of sources that can be read by
+    # Beam's MultipleReadFromPubSub transform
     sources = [PubSubSourceDescriptor(s) for s in options.sources.split(",")]

     with beam.Pipeline() as p:
