qiskit-community · nkanazawa1989 · Oct 28, 2021 · Oct 29, 2021 · Oct 29, 2021 · Oct 29, 2021
diff --git a/qiskit_experiments/data_processing/data_action.py b/qiskit_experiments/data_processing/data_action.py
@@ -13,7 +13,9 @@
 """Defines the steps that can be used to analyse data."""
 
 from abc import ABCMeta, abstractmethod
-from typing import Any, List, Optional, Tuple
+from typing import Generator, Iterator, Optional
+
+import numpy as np
 
 
 class DataAction(metaclass=ABCMeta):
@@ -29,50 +31,40 @@ def __init__(self, validate: bool = True):
         """
         self._validate = validate
 
-    @abstractmethod
-    def _process(self, datum: Any, error: Optional[Any] = None) -> Tuple[Any, Any]:
-        """
-        Applies the data processing step to the datum.
+    def _process(self, gen_datum: Iterator) -> Generator:
+        """Applies the data processing step to the datum.
 
         Args:
-            datum: A single item of data which will be processed.
-            error: An optional error estimation on the datum that can be further propagated.
+            gen_datum: A generator of unprocessed data. Each entry is a tuple of data and error.
 
-        Returns:
-            processed data: The data that has been processed along with the propagated error.
+        Yields:
+            A tuple of processed data and error.
         """
+        yield from gen_datum
 
-    @abstractmethod
-    def _format_data(self, datum: Any, error: Optional[Any] = None) -> Tuple[Any, Any]:
-        """Format and validate the input.
+    def _format_data(self, gen_datum: Iterator) -> Generator:
+        """Validate and format the input.
 
-        Check that the given data and error has the correct structure. This method may
-        additionally change the data type, e.g. converting a list to a numpy array.
+        Check that the given data and error have the correct structure.
 
         Args:
-            datum: The data instance to check and format.
-            error: An optional error estimation on the datum to check and format.
-
-        Returns:
-            datum, error: The formatted datum and its optional error.
+            gen_datum: A generator of unformatted data. Each entry is a tuple of data and error.
 
-        Raises:
-            DataProcessorError: If either the data or the error do not have the proper format.
+        Yields:
+            A tuple of formatted data and error.
         """
+        yield from gen_datum
 
-    def __call__(self, data: Any, error: Optional[Any] = None) -> Tuple[Any, Any]:
+    def __call__(self, gen_datum: Iterator) -> Generator:
         """Call the data action of this node on the data and propagate the error.
 
         Args:
-            data: The data to process. The action nodes in the data processor will
-                raise errors if the data does not have the appropriate format.
-            error: An optional error estimation on the datum that can be further processed.
+            gen_datum: A generator of raw data. Each entry is a tuple of data and error.
 
-        Returns:
-            processed data: The data processed by self as a tuple of processed datum and
-                optionally the propagated error estimate.
+        Yields:
+            A generator that implements a data processing pipeline.
         """
-        return self._process(*self._format_data(data, error))
+        yield from self._process(self._format_data(gen_datum))
 
     def __repr__(self):
         """String representation of the node."""
@@ -94,11 +86,12 @@ def is_trained(self) -> bool:
         """
 
     @abstractmethod
-    def train(self, data: List[Any]):
+    def train(self, full_val_arr: np.ndarray, full_err_arr: Optional[np.ndarray] = None):
         """Train a DataAction.
 
         Certain data processing nodes, such as a SVD, require data to first train.
 
         Args:
-            data: A list of datum. Each datum is a point used to train the node.
+            full_val_arr: A list of values. Each datum will be converted to a 2D array.
+            full_err_arr: A list of errors. Each datm will be converted to a 2D array.
         """
diff --git a/qiskit_experiments/data_processing/data_processor.py b/qiskit_experiments/data_processing/data_processor.py
@@ -12,7 +12,10 @@
 
 """Actions done on the data to bring it in a usable form."""
 
-from typing import Any, Dict, List, Set, Tuple, Union
+import itertools
+from typing import Any, Dict, List, Set, Tuple, Union, Generator, Iterator
+
+import numpy as np
 
 from qiskit_experiments.data_processing.data_action import DataAction, TrainableDataAction
 from qiskit_experiments.data_processing.exceptions import DataProcessorError
@@ -36,7 +39,7 @@ class DataProcessor:
     def __init__(
         self,
         input_key: str,
-        data_actions: List[DataAction] = None,
+        data_actions: Union[DataAction, TrainableDataAction] = None,
     ):
         """Create a chain of data processing actions.
 
@@ -45,12 +48,11 @@ def __init__(
                 will find the data to process.
             data_actions: A list of data processing actions to construct this data processor with.
                 If None is given an empty DataProcessor will be created.
-            to_array: Boolean indicating if the input data will be converted to a numpy array.
         """
         self._input_key = input_key
         self._nodes = data_actions if data_actions else []
 
-    def append(self, node: DataAction):
+    def append(self, node: Union[DataAction, TrainableDataAction]):
         """
         Append new data action node to this data processor.
 
@@ -125,43 +127,59 @@ def _call_internal(
                 then all nodes in the data processing chain will be called.
 
         Returns:
-            datum_ and history if with_history is True or datum_ if with_history is False.
+            When ``with_history`` is ``False`` it returns a tuple of array-like of data and error.
+            Otherwise it returns a tuple of above with a list of intermediate data at each step.
         """
         if call_up_to_node is None:
             call_up_to_node = len(self._nodes)
 
-        datum_, error_ = self._data_extraction(data), None
+        # This is generator
+        gen_datum = self._data_extraction(data)
 
         history = []
-        for index, node in enumerate(self._nodes):
+        for index, node in enumerate(self._nodes[:call_up_to_node]):
+            # Create pipeline of data processing
+            gen_datum = node(gen_datum)
+
+            if with_history and (history_nodes is None or index in history_nodes):
+                # make sure not to kill pipeline by execution
+                gen_datum, gen_datum_copy = itertools.tee(gen_datum)
+                out_values, out_errors = execute_pipeline(gen_datum_copy)
+                history.append((node.__class__.__name__, out_values, out_errors, index))
+
+        # Execute pipeline
+        out_values, out_errors = execute_pipeline(gen_datum)
 
-            if index < call_up_to_node:
-                datum_, error_ = node(datum_, error_)
+        # Return only first element if length=1, e.g. [[0, 1]] -> [0, 1]
+        if out_values.shape[0] == 1:
+            out_values = out_values[0]
 
-                if with_history and (
-                    history_nodes is None or (history_nodes and index in history_nodes)
-                ):
-                    history.append((node.__class__.__name__, datum_, error_, index))
+        # Return only first element if length=1, e.g. [[0, 1]] -> [0, 1]
+        if out_errors.shape[0] == 1:
+            out_errors = out_errors[0]
+
+        # Return None if error is not computed
+        if np.isnan(out_errors).all():
+            out_errors = None
 
         if with_history:
-            return datum_, error_, history
+            return out_values, out_errors, history
         else:
-            return datum_, error_
+            return out_values, out_errors
 
     def train(self, data: List[Dict[str, Any]]):
         """Train the nodes of the data processor.
 
         Args:
             data: The data to use to train the data processor.
         """
-
         for index, node in enumerate(self._nodes):
             if isinstance(node, TrainableDataAction):
                 if not node.is_trained:
                     # Process the data up to the untrained node.
-                    node.train(self._call_internal(data, call_up_to_node=index)[0])
+                    node.train(*self._call_internal(data, call_up_to_node=index))
 
-    def _data_extraction(self, data: Union[Dict, List[Dict]]) -> List:
+    def _data_extraction(self, data: Union[Dict, List[Dict]]) -> Generator:
         """Extracts the data on which to run the nodes.
 
         If the datum is a list of dicts then the data under self._input_key is extracted
@@ -172,35 +190,84 @@ def _data_extraction(self, data: Union[Dict, List[Dict]]) -> List:
         Args:
             data: A list of such dicts where the data is contained under the key self._input_key.
 
-        Returns:
-            The data formatted in such a way that it is ready to be processed by the nodes.
+        Yields:
+            A tuple of numpy array object representing a data and error.
 
         Raises:
             DataProcessorError:
                 - If the input datum is not a list or a dict.
-                - If the data processor received a single datum but requires all the data to
-                  process it properly.
                 - If the input key of the data processor is not contained in the data.
         """
         if isinstance(data, dict):
             data = [data]
 
-        try:
-            data_ = [_datum[self._input_key] for _datum in iter(data)]
-        except KeyError as error:
-            raise DataProcessorError(
-                f"The input key {self._input_key} was not found in the input datum."
-            ) from error
-        except TypeError as error:
-            raise DataProcessorError(
-                f"{self.__class__.__name__} only extracts data from "
-                f"lists or dicts, received {type(data)}."
-            ) from error
-
-        return data_
+        for datum in data:
+            try:
+                target = datum[self._input_key]
+
+                # returns data and initial error
+                if isinstance(target, dict):
+                    # likely level2 data, forcibly convert into array
+                    yield np.asarray([target], dtype=object), np.asarray([np.nan], dtype=float)
+                else:
+                    try:
+                        # level1 or below
+                        nominal_arr = np.asarray(target, dtype=float)
+                        stdev_arr = np.full_like(target, np.nan, dtype=float)
+                    except TypeError:
+                        # level2 memory ["00", "11", "01", ...]
+                        nominal_arr = np.asarray(target, dtype=object)
+                        stdev_arr = np.asarray([np.nan], dtype=float)
+                    yield nominal_arr, stdev_arr
+
+            except KeyError as error:
+                raise DataProcessorError(
+                    f"The input key {self._input_key} was not found in the input datum."
+                ) from error
+            except TypeError as error:
+                raise DataProcessorError(
+                    f"{self.__class__.__name__} only extracts data from "
+                    f"lists or dicts, received {type(data)}."
+                ) from error
 
     def __repr__(self):
         """String representation of data processors."""
         names = ", ".join(node.__class__.__name__ for node in self._nodes)
 
         return f"{self.__class__.__name__}(input_key={self._input_key}, nodes=[{names}])"
+
+
+def execute_pipeline(gen_datum: Iterator) -> Tuple[np.ndarray, np.ndarray]:
+    """Execute processing pipeline and return processed data array.
+
+    Args:
+        gen_datum: A generator to sequentially return datum.
+
+    Returns:
+        A tuple of nominal values and standard errors.
+    """
+    out_values, out_errors = list(zip(*gen_datum))
+
+    try:
+        # try to convert into float object for performance
+        out_values = np.asarray(out_values, dtype=float)
+    except TypeError:
+        # if not convert into arbitrary array
+        out_values = np.asarray(out_values, dtype=object)
+
+    # convert into 1D array e.g. [[0], [1], ...] -> [0, 1, ...]
+    if len(out_values.shape) == 2 and out_values.shape[1] == 1:
+        out_values = out_values[:, 0]
+
+    try:
+        # try to convert into float object for performance
+        out_errors = np.asarray(out_errors, dtype=float)
+    except TypeError:
+        # if not convert into arbitrary array
+        out_errors = np.asarray(out_errors, dtype=object)
+
+    # convert into 1D array e.g. [[0], [1], ...] -> [0, 1, ...]
+    if len(out_errors.shape) == 2 and out_errors.shape[1] == 1:
+        out_errors = out_errors[:, 0]
+
+    return out_values, out_errors