Enrichment Transform with BigTable handler #30001

Merged
merged 45 commits on Jan 19, 2024
Commits
0ad521c
enrichment v1
riteshghorse Dec 15, 2023
e83bad7
add documentation
riteshghorse Dec 15, 2023
5162960
add doc comment
riteshghorse Dec 15, 2023
a45392d
rerun
riteshghorse Dec 18, 2023
9fdbeb3
update docs, lint
riteshghorse Dec 18, 2023
c541148
update docs, lint
riteshghorse Dec 18, 2023
5c9be0e
add generic type
riteshghorse Dec 18, 2023
9df679c
add generic type
riteshghorse Dec 18, 2023
883ff0d
adjust doc path
riteshghorse Dec 18, 2023
818bb8a
create test row
riteshghorse Dec 18, 2023
e1feeb8
use request type
riteshghorse Dec 18, 2023
40275e9
use request type
riteshghorse Dec 18, 2023
be67a88
change module name
riteshghorse Dec 20, 2023
27ed250
more tests
riteshghorse Jan 2, 2024
4af90f5
remove non-functional params
riteshghorse Jan 3, 2024
041fcd0
lint, doc
riteshghorse Jan 3, 2024
91f58b5
change types for general use
riteshghorse Jan 4, 2024
9fd6813
callable type
riteshghorse Jan 4, 2024
036eceb
dict type
riteshghorse Jan 4, 2024
021f9c4
update signatures
riteshghorse Jan 9, 2024
062b9ef
fix unit test
riteshghorse Jan 9, 2024
b11d3ea
bigtable with column family, ids, rrio-throttler
riteshghorse Jan 11, 2024
46e3a6d
update tests for row filter
riteshghorse Jan 11, 2024
433d5fa
convert handler types from dict to Row
riteshghorse Jan 11, 2024
d18d583
update tests for bigtable
riteshghorse Jan 12, 2024
c5e792c
ran pydocs
riteshghorse Jan 12, 2024
9285a5b
ran pydocs
riteshghorse Jan 12, 2024
641bdf7
mark postcommit
riteshghorse Jan 12, 2024
c36a21e
remove _test file, fix import
riteshghorse Jan 12, 2024
3989c16
enable postcommit
riteshghorse Jan 12, 2024
7102acd
add more tests
riteshghorse Jan 12, 2024
87e32bb
skip tests when dependencies are not installed
riteshghorse Jan 16, 2024
57efa52
add deleted imports from last commit
riteshghorse Jan 16, 2024
282c608
add skip test condition
riteshghorse Jan 16, 2024
deecdbc
fix import order, add TooManyRequests to try-catch
riteshghorse Jan 16, 2024
253633e
make throttler, repeater non-optional
riteshghorse Jan 16, 2024
932fae3
add exception level and tests
riteshghorse Jan 17, 2024
cf88d6f
correct pydoc statement
riteshghorse Jan 17, 2024
5b702d2
add throttle tests
riteshghorse Jan 17, 2024
18d9539
add bigtable improvements
riteshghorse Jan 18, 2024
6e251ce
default app_profile_id
riteshghorse Jan 18, 2024
20b8ba6
add documentation, ignore None assignment
riteshghorse Jan 18, 2024
27974b8
add to changes.md
riteshghorse Jan 18, 2024
7c9a03c
change test structure that throws exception, skip http test for now
riteshghorse Jan 18, 2024
5a626e3
drop postcommit trigger file
riteshghorse Jan 18, 2024
8 changes: 4 additions & 4 deletions sdks/python/apache_beam/io/requestresponse.py
@@ -278,14 +278,14 @@ def __init__(
timeout (float): timeout value in seconds to wait for response from API.
should_backoff (~apache_beam.io.requestresponse.ShouldBackOff):
(Optional) provides methods for backoff.
- repeater (~apache_beam.io.requestresponse.Repeater): (Optional)
-   provides methods to repeat requests to API.
+ repeater (~apache_beam.io.requestresponse.Repeater): provides methods to
+   repeat requests to API.
cache_reader (~apache_beam.io.requestresponse.CacheReader): (Optional)
provides methods to read external cache.
cache_writer (~apache_beam.io.requestresponse.CacheWriter): (Optional)
provides methods to write to external cache.
throttler (~apache_beam.io.requestresponse.PreCallThrottler):
-   (Optional) provides methods to pre-throttle a request.
+   provides methods to pre-throttle a request.
"""
self._caller = caller
self._timeout = timeout
@@ -387,7 +387,7 @@ def process(self, request: RequestT, *args, **kwargs):
_LOGGER.info(
"Delaying request for %d seconds" % self._throttler.delay_secs)
time.sleep(self._throttler.delay_secs)
- self._metrics_collector.throttled_secs.inc(5)
+ self._metrics_collector.throttled_secs.inc(self._throttler.delay_secs)
is_throttled_request = True

if is_throttled_request:
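The metrics fix above makes the throttled-seconds counter record the throttler's actual `delay_secs` instead of a hardcoded 5. A minimal sketch of how a pipeline wires a repeater and throttler into `RequestResponseIO`; `EchoCaller` is hypothetical, while the constructors and keyword arguments mirror the ones exercised in this PR's tests:

```python
import apache_beam as beam
from apache_beam.io.requestresponse import Caller
from apache_beam.io.requestresponse import DefaultThrottler
from apache_beam.io.requestresponse import ExponentialBackOffRepeater
from apache_beam.io.requestresponse import RequestResponseIO


class EchoCaller(Caller):
  """Hypothetical caller standing in for a real external service."""
  def __call__(self, request, *args, **kwargs):
    # A real caller would issue the RPC/HTTP request here.
    return request


with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | beam.Create(['request-1', 'request-2'])
      | RequestResponseIO(
          caller=EchoCaller(),
          timeout=30,
          # Retry transient failures with exponential backoff.
          repeater=ExponentialBackOffRepeater(),
          # Pre-throttle when recent request volume looks overloaded;
          # each delay now adds delay_secs to cumulativeThrottlingSeconds.
          throttler=DefaultThrottler(
              window_ms=10000, bucket_ms=5000, overload_ratio=2)))
```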
25 changes: 24 additions & 1 deletion sdks/python/apache_beam/io/requestresponse_test.py
@@ -23,7 +23,7 @@
# pylint: disable=ungrouped-imports
try:
from google.api_core.exceptions import TooManyRequests
- from apache_beam.io.requestresponse import Caller
+ from apache_beam.io.requestresponse import Caller, DefaultThrottler
from apache_beam.io.requestresponse import RequestResponseIO
from apache_beam.io.requestresponse import UserCodeExecutionException
from apache_beam.io.requestresponse import UserCodeTimeoutException
@@ -128,6 +128,29 @@ def test_caller_no_retry_strategy(self):
| RequestResponseIO(caller=caller, repeater=None))
self.assertRegex(cm.exception.message, 'retries = 0')

def test_default_throttler(self):
caller = CallerWithTimeout()
throttler = DefaultThrottler(
window_ms=10000, bucket_ms=5000, overload_ratio=1)
# manually override the number of received requests for testing.
throttler.throttler._all_requests.add(time.time() * 1000, 100)
test_pipeline = TestPipeline()
_ = (
test_pipeline
| beam.Create(['sample_request'])
| RequestResponseIO(caller=caller, throttler=throttler))
result = test_pipeline.run()
result.wait_until_finish()
metrics = result.metrics().query(
beam.metrics.MetricsFilter().with_name('throttled_requests'))
self.assertEqual(metrics['counters'][0].committed, 1)
metrics = result.metrics().query(
beam.metrics.MetricsFilter().with_name('cumulativeThrottlingSeconds'))
self.assertGreater(metrics['counters'][0].committed, 0)
metrics = result.metrics().query(
beam.metrics.MetricsFilter().with_name('responses'))
self.assertEqual(metrics['counters'][0].committed, 1)


if __name__ == '__main__':
unittest.main()
20 changes: 18 additions & 2 deletions sdks/python/apache_beam/transforms/enrichment.py
@@ -24,6 +24,10 @@
import apache_beam as beam
from apache_beam.io.requestresponse import DEFAULT_TIMEOUT_SECS
from apache_beam.io.requestresponse import Caller
from apache_beam.io.requestresponse import DefaultThrottler
from apache_beam.io.requestresponse import ExponentialBackOffRepeater
from apache_beam.io.requestresponse import PreCallThrottler
from apache_beam.io.requestresponse import Repeater
from apache_beam.io.requestresponse import RequestResponseIO

__all__ = [
@@ -96,20 +100,32 @@ class Enrichment(beam.PTransform[beam.PCollection[InputT],
join_fn: A lambda function to join original element with lookup metadata.
Defaults to `CROSS_JOIN`.
timeout: (Optional) timeout for source requests. Defaults to 30 seconds.
repeater (~apache_beam.io.requestresponse.Repeater): provides methods to
repeat requests to API.
throttler (~apache_beam.io.requestresponse.PreCallThrottler):
provides methods to pre-throttle a request.
"""
def __init__(
self,
source_handler: EnrichmentSourceHandler,
join_fn: JoinFn = cross_join,
- timeout: Optional[float] = DEFAULT_TIMEOUT_SECS):
+ timeout: Optional[float] = DEFAULT_TIMEOUT_SECS,
+ repeater: Repeater = ExponentialBackOffRepeater(),
+ throttler: PreCallThrottler = DefaultThrottler(),
+ ):
self._source_handler = source_handler
self._join_fn = join_fn
self._timeout = timeout
self._repeater = repeater
self._throttler = throttler

def expand(self,
input_row: beam.PCollection[InputT]) -> beam.PCollection[OutputT]:
fetched_data = input_row | RequestResponseIO(
- caller=self._source_handler, timeout=self._timeout)
+ caller=self._source_handler,
+ timeout=self._timeout,
+ repeater=self._repeater,
+ throttler=self._throttler)

# EnrichmentSourceHandler returns a tuple of (request,response).
return fetched_data | beam.Map(
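With these defaults, every `Enrichment` transform now retries with `ExponentialBackOffRepeater` and pre-throttles with `DefaultThrottler` unless the caller overrides them. A sketch of overriding both; `StaticLookupHandler` is hypothetical, and only the `Enrichment` keyword arguments come from this diff:

```python
import apache_beam as beam
from apache_beam.io.requestresponse import DefaultThrottler
from apache_beam.io.requestresponse import ExponentialBackOffRepeater
from apache_beam.transforms.enrichment import Enrichment
from apache_beam.transforms.enrichment import EnrichmentSourceHandler


class StaticLookupHandler(EnrichmentSourceHandler[beam.Row, beam.Row]):
  """Hypothetical handler that returns a fixed enrichment value."""
  def __call__(self, request: beam.Row, *args, **kwargs):
    # Handlers return a (request, response) tuple, which the join_fn
    # in Enrichment.expand merges into a single output row.
    return request, beam.Row(category='unknown')


with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | beam.Create([beam.Row(id='key-1')])
      | Enrichment(
          source_handler=StaticLookupHandler(),
          timeout=30,
          repeater=ExponentialBackOffRepeater(),
          throttler=DefaultThrottler(
              window_ms=5000, bucket_ms=1000, overload_ratio=2)))
```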
44 changes: 29 additions & 15 deletions sdks/python/apache_beam/transforms/enrichment_handlers/bigtable.py
@@ -21,7 +21,9 @@
from typing import Optional

from google.api_core.exceptions import NotFound
from google.cloud import bigtable
from google.cloud.bigtable import Client
from google.cloud.bigtable.row_filters import CellsColumnLimitFilter
from google.cloud.bigtable.row_filters import RowFilter

import apache_beam as beam
@@ -41,12 +43,12 @@ class ExceptionLevel(Enum):
returns an empty row.

Members:
- WARNING_ONLY: Log a warning for exception without raising it.
- RAISE: Raise the exception.
- WARN: Log a warning for exception without raising it.
- QUIET: Neither log nor raise the exception.
"""
- WARNING_ONLY = 0
- RAISE = 1
+ RAISE = 0
+ WARN = 1
QUIET = 2


@@ -63,32 +65,43 @@ class EnrichWithBigTable(EnrichmentSourceHandler[beam.Row, beam.Row]):
to use as `row_key` for BigTable querying.
row_filter: a ``:class:`google.cloud.bigtable.row_filters.RowFilter``` to
filter data read with ``read_row()``.
Defaults to `CellsColumnLimitFilter(1)`.
app_profile_id (str): App profile ID to use for BigTable.
encoding (str): encoding type to convert the string to bytes and vice-versa
from BigTable. Default is `utf-8`.
exception_level: a `enum.Enum` value from
``apache_beam.transforms.enrichment_handlers.bigtable.ExceptionLevel``
to set the level when an empty row is returned from the BigTable query.
- Defaults to ``ExceptionLevel.QUIET``.
+ Defaults to ``ExceptionLevel.WARN``.
"""
def __init__(
self,
project_id: str,
instance_id: str,
table_id: str,
row_key: str,
- row_filter: Optional[RowFilter] = None,
- exception_level: ExceptionLevel = ExceptionLevel.QUIET,
+ row_filter: Optional[RowFilter] = CellsColumnLimitFilter(1),
+ app_profile_id: str = "",
+ encoding: str = 'utf-8',
+ exception_level: ExceptionLevel = ExceptionLevel.WARN,
):
self._project_id = project_id
self._instance_id = instance_id
self._table_id = table_id
self._row_key = row_key
self._row_filter = row_filter
self._app_profile_id = app_profile_id
self._encoding = encoding
self._exception_level = exception_level

def __enter__(self):
"""connect to the Google BigTable cluster."""
self.client = Client(project=self._project_id)
self.instance = self.client.instance(self._instance_id)
- self._table = self.instance.table(self._table_id)
+ self._table = bigtable.table.Table(
+     table_id=self._table_id,
+     instance=self.instance,
+     app_profile_id=self._app_profile_id)

def __call__(self, request: beam.Row, *args, **kwargs):
"""
@@ -99,27 +112,28 @@ def __call__(self, request: beam.Row, *args, **kwargs):
request: the input `beam.Row` to enrich.
"""
response_dict: Dict[str, Any] = {}
- row_key: str = ""
+ row_key_str: str = ""
try:
request_dict = request._asdict()
- row_key = str(request_dict[self._row_key]).encode()
+ row_key_str = str(request_dict[self._row_key])
+ row_key = row_key_str.encode(self._encoding)
row = self._table.read_row(row_key, filter_=self._row_filter)
if row:
for cf_id, cf_v in row.cells.items():
response_dict[cf_id] = {}
for k, v in cf_v.items():
- response_dict[cf_id][k.decode('utf-8')] = \
-     v[0].value.decode('utf-8')
- elif self._exception_level == ExceptionLevel.WARNING_ONLY:
+ response_dict[cf_id][k.decode(self._encoding)] = \
+     v[0].value.decode(self._encoding)
+ elif self._exception_level == ExceptionLevel.WARN:
_LOGGER.warning(
'no matching row found for row_key: %s '
- 'with row_filter: %s' % (row_key, self._row_filter))
+ 'with row_filter: %s' % (row_key_str, self._row_filter))
elif self._exception_level == ExceptionLevel.RAISE:
raise ValueError(
'no matching row found for row_key: %s '
- 'with row_filter=%s' % (row_key, self._row_filter))
+ 'with row_filter=%s' % (row_key_str, self._row_filter))
except KeyError:
- raise KeyError('row_key %s not found in input PCollection.' % row_key)
+ raise KeyError('row_key %s not found in input PCollection.' % row_key_str)
except NotFound:
raise NotFound(
'GCP BigTable cluster `%s:%s:%s` not found.' %
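Putting the reworked handler together, a sketch of an end-to-end enrichment pipeline; the project, instance, and table IDs are placeholders, while the parameter names and defaults are the ones introduced in this diff:

```python
import apache_beam as beam
from apache_beam.transforms.enrichment import Enrichment
from apache_beam.transforms.enrichment_handlers.bigtable import EnrichWithBigTable
from apache_beam.transforms.enrichment_handlers.bigtable import ExceptionLevel

# Placeholder GCP identifiers; point these at a real BigTable instance.
bigtable_handler = EnrichWithBigTable(
    project_id='my-project',
    instance_id='my-instance',
    table_id='product-catalog',
    row_key='product_id',  # input-row field used as the BigTable row key
    app_profile_id='',     # default app profile, as in the new __enter__
    encoding='utf-8',
    # RAISE fails fast on a missing row instead of the default WARN.
    exception_level=ExceptionLevel.RAISE)

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | beam.Create([beam.Row(product_id='sku-123')])
      | Enrichment(source_handler=bigtable_handler))
```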