GoogleCloudPlatform
diff --git a/Diff for: ‎.gitignore
+1 b/Diff for: ‎.gitignore
+1
diff --git a/Diff for: ‎Python/.flake8
+16 b/Diff for: ‎Python/.flake8
+16
diff --git a/Diff for: ‎Python/.isort.cfg
+25 b/Diff for: ‎Python/.isort.cfg
+25
diff --git a/Diff for: ‎Python/.pre-commit-config.yaml
+30 b/Diff for: ‎Python/.pre-commit-config.yaml
+30
diff --git a/Diff for: ‎Python/CONTRIBUTING.md
+21 b/Diff for: ‎Python/CONTRIBUTING.md
+21
diff --git a/Diff for: ‎Python/Makefile
+74 b/Diff for: ‎Python/Makefile
+74
diff --git a/Diff for: ‎Python/advanced/stateful_dofn.py
+76-64 b/Diff for: ‎Python/advanced/stateful_dofn.py
+76-64
@@ -0,0 +1 @@
+.vscode
@@ -0,0 +1,16 @@
+[flake8]
+max-line-length = 80
+max-complexity = 40
+ignore =
+  E203
+  W503
+  F841
+  E501
+exclude =
+    .eggs
+    .git
+    .tox
+    __pycache__
+    build
+    dist
+    venv
@@ -0,0 +1,25 @@
+#  Copyright 2023 Google LLC
+
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+
+#       https://www.apache.org/licenses/LICENSE-2.0
+
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+[settings]
+sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
+import_heading_stdlib=standard libraries
+import_heading_thirdparty=third party libraries
+include_trailing_comma=True
+indent='    '
+known_dfml=src
+dedup_headings=True
+line_length=80
+multi_line_output=3
+skip=./venv/,./venv-docs/,./.git/
@@ -0,0 +1,30 @@
+#  Copyright 2023 Google LLC
+
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+
+#       https://www.apache.org/licenses/LICENSE-2.0
+
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+exclude: ^docs/notebooks/
+repos:
+-   repo: https://github.com/ambv/black
+    rev: 23.11.0
+    hooks:
+    - id: black
+      args: ["--config=Python/pyproject.toml", "--check", "--diff"]
+-   repo: https://github.com/pycqa/flake8
+    rev: 6.1.0
+    hooks:
+    - id: flake8
+      args: ["--config=Python/.flake8"]
+-   repo: https://github.com/timothycrosley/isort
+    rev: 5.12.0
+    hooks:
+    - id: isort
@@ -0,0 +1,21 @@
+# How to Contribute Python Examples
+
+We recommend installing `make` to set up your local development environment.
+
+1. Create the local Python environment:
+```bash
+make init
+```
+2. Use `source venv/bin/activate` to activate venv
+3. Clean up the local environment:
+```bash
+make clean
+```
+4. Format the Python code:
+```bash
+make format
+```
+5. Run the Python code linter:
+```bash
+make lint
+```
@@ -0,0 +1,74 @@
+#  Copyright 2023 Google LLC
+
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+
+#       https://www.apache.org/licenses/LICENSE-2.0
+
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+.SILENT: # no prerequisites: suppress command echoing for every recipe
+.PHONY: help init-venv init format lint clean-lite clean # commands, not files
+.DEFAULT_GOAL := help
+
+define PRINT_HELP_PYSCRIPT
+import re, sys # isort:skip
+
+# Collect (target, description) pairs from "target: ... ## description" lines.
+rule = re.compile(r'^([a-zA-Z_-]+):.*?## (.*)$$')
+hits = (rule.match(text) for text in sys.stdin)
+entries = sorted(hit.groups() for hit in hits if hit)
+
+# One row per documented target, in alphabetical order.
+for name, summary in entries:
+    print("     %-25s %s" % (name, summary))
+endef
+export PRINT_HELP_PYSCRIPT
+
+PYTHON = python$(PYTHON_VERSION)
+# MODEL_ENV is "TF" when TF_MODEL_URI is non-empty, else "TORCH" (quotes kept).
+ifndef TF_MODEL_URI
+  MODEL_ENV := "TORCH"
+else
+  MODEL_ENV := "TF"
+endif
+
+help: ## Print this help
+	@echo
+	@echo "  make targets:"
+	@echo
+	@cat $(MAKEFILE_LIST) | $(PYTHON) -c "$$PRINT_HELP_PYSCRIPT"
+
+init-venv: ## Create virtual environment in venv folder
+	@$(PYTHON) -m venv venv
+
+init: init-venv ## Init virtual environment
+	@venv/bin/python3 -m pip install --upgrade pip
+	@venv/bin/python3 -m pip install --requirement requirements.txt
+	@venv/bin/python3 -m pip install --requirement requirements.dev.txt
+	@venv/bin/python3 -m pre_commit install --overwrite --install-hooks
+	@echo "use 'source venv/bin/activate' to activate venv "
+
+format: ## Run formatter on source code
+	@venv/bin/python3 -m black --config pyproject.toml .
+
+lint: ## Run linter on source code
+	@venv/bin/python3 -m black --check --config pyproject.toml .
+	@venv/bin/python3 -m flake8 --config .flake8 .
+
+clean-lite: ## Remove pycache files, pytest files, etc
+	@rm -rf build dist .cache .coverage .coverage.* *.egg-info
+	@find . -name .coverage -prune -exec rm -rf {} +
+	@find . -name .pytest_cache -prune -exec rm -rf {} +
+	@find . -name .tox -prune -exec rm -rf {} +
+	@find . -name __pycache__ -prune -exec rm -rf {} +
+	@find . -name '*.egg-info' -prune -exec rm -rf {} +
+
+clean: clean-lite ## Remove virtual environment, downloaded models, etc
+	@rm -fr venv
+	@echo "run 'make init'"
@@ -12,79 +12,91 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+# standard libraries
 import json
 import logging
 
+# third party libraries
 import apache_beam as beam
-from apache_beam import DoFn
-from apache_beam import Map
-from apache_beam import ParDo
+from apache_beam import DoFn, Map, ParDo
 from apache_beam.coders.coders import StrUtf8Coder
 from apache_beam.io.gcp.pubsub import ReadFromPubSub
 from apache_beam.options.pipeline_options import PipelineOptions
-from apache_beam.transforms.userstate import BagStateSpec
-from apache_beam.transforms.userstate import CombiningValueStateSpec
+from apache_beam.transforms.userstate import (
+    BagStateSpec,
+    CombiningValueStateSpec,
+)
 
 
 def run(argv=None):
-  class StatefulDoFn(DoFn):
-    """
-    GroupIntoBatches implements a similar logic.
-    When using StatefulDoFns be careful of not keeping the state forever and clearing state. This example is OK
-    because we know the keys are always incoming, but if we have  sparse keys, we may keep the buffer up forever
-    (e.g., we trigger every 100 elements but we only got 99 for that key. See `timer_dofn` for an example that
-    would fix that)
-    """
-    BUFFER_RIDES = BagStateSpec('rides', StrUtf8Coder())
-    COUNT_STATE = CombiningValueStateSpec('count', combine_fn=sum)
-
-    def __init__(self):
-      self.status_max_bag = {
-        "pickup": 100,
-        "enroute": 10000,
-        "dropoff": 100
-      }
-
-    def process(self,
-                element,
-                count_state=DoFn.StateParam(COUNT_STATE),
-                ride_state=DoFn.StateParam(BUFFER_RIDES)):
-
-      ride_id = element[1]
-      ride_status = element[0]
-
-      # Add ride id to bag
-      ride_state.add(ride_id)
-
-      # Increase counter
-      count_state.add(1)
-      count = count_state.read()
-
-      max_size = self.status_max_bag[ride_status]
-
-      # If counter is over max bag size, release buffer
-      if count > max_size:
-        logging.info("Releasing buffer for key %s", element[0])
-        for ride in ride_state.read():
-          yield ride
-
-        # Clear states
-        ride_state.clear()
-        count_state.clear()
-
-  options = PipelineOptions(streaming=True)
-  with beam.Pipeline(options=options) as p:
-    topic = "projects/pubsub-public-data/topics/taxirides-realtime"
-
-    pubsub = (p | "Read Topic" >> ReadFromPubSub(topic=topic)
-                | "Json Loads" >> Map(json.loads)
-                # SDFn need KVs as input. They are applied in a Key and Window basis
-                | "KV" >> Map(lambda x: (x["ride_status"], x["ride_id"]))
-              )
-
-    (pubsub | "StatefulDoFn" >> ParDo(StatefulDoFn())
-            | "Pass" >> Map(lambda x: x))
+    class StatefulDoFn(DoFn):
+        """
+        Buffers ride ids in bag state and emits them in batches.
+        GroupIntoBatches implements a similar logic.
+        When using StatefulDoFns, make sure state is eventually cleared so
+        that it is not kept forever. This example is OK because we know the
+        keys are always incoming, but with sparse keys the buffer may never
+        be released (e.g., it is released after more than 100 elements but
+        we only got 99 for that key). See `timer_dofn` for a fix.
+        """
+
+        BUFFER_RIDES = BagStateSpec("rides", StrUtf8Coder())
+        COUNT_STATE = CombiningValueStateSpec("count", combine_fn=sum)
+
+        def __init__(self):
+            self.status_max_bag = {
+                "pickup": 100,
+                "enroute": 10000,
+                "dropoff": 100,
+            }
+
+        def process(
+            self,
+            element,
+            count_state=DoFn.StateParam(COUNT_STATE),
+            ride_state=DoFn.StateParam(BUFFER_RIDES),
+        ):
+            ride_id = element[1]
+            ride_status = element[0]
+
+            # Buffer the ride id in the bag state
+            ride_state.add(ride_id)
+
+            # Increment the counter and read its current value
+            count_state.add(1)
+            count = count_state.read()
+
+            max_size = self.status_max_bag[ride_status]
+
+            # Once the count exceeds the max bag size, release the buffer
+            if count > max_size:
+                logging.info("Releasing buffer for key %s", element[0])
+                for ride in ride_state.read():
+                    yield ride
+
+                # Reset both states so the next batch starts empty
+                ride_state.clear()
+                count_state.clear()
+
+    options = PipelineOptions(streaming=True)
+    with beam.Pipeline(options=options) as p:
+        topic = "projects/pubsub-public-data/topics/taxirides-realtime"
+
+        pubsub = (
+            p
+            | "Read Topic" >> ReadFromPubSub(topic=topic)
+            | "Json Loads" >> Map(json.loads)
+            # Stateful DoFns need KVs as input; they apply per key and window
+            | "KV" >> Map(lambda x: (x["ride_status"], x["ride_id"]))
+        )
+
+        (
+            pubsub
+            | "StatefulDoFn" >> ParDo(StatefulDoFn())
+            | "Pass" >> Map(lambda x: x)
+        )
+
 
 if __name__ == "__main__":
-  logging.getLogger().setLevel(logging.INFO)
-  run()
+    logging.getLogger().setLevel(logging.INFO)
+    run()