qiskit-community · nkanazawa1989 · Oct 14, 2021 · Sep 29, 2021 · Sep 29, 2021 · Sep 29, 2021
diff --git a/docs/tutorials/randomized_benchmarking.ipynb b/docs/tutorials/randomized_benchmarking.ipynb
diff --git a/qiskit_experiments/curve_analysis/curve_analysis.py b/qiskit_experiments/curve_analysis/curve_analysis.py
@@ -33,6 +33,7 @@
     FitOptions,
 )
 from qiskit_experiments.curve_analysis.curve_fit import multi_curve_fit
+from qiskit_experiments.curve_analysis.data_processing import multi_mean_xy_data, data_sort
 from qiskit_experiments.curve_analysis.visualization import FitResultPlotters, PlotterStyle
 from qiskit_experiments.data_processing import DataProcessor
 from qiskit_experiments.data_processing.exceptions import DataProcessorError
@@ -199,6 +200,8 @@ class AnalysisExample(CurveAnalysis):
         - Customize pre-data processing:
             Override :meth:`~self._format_data`. For example, here you can apply smoothing
             to y values, remove outlier, or apply filter function to the data.
+            By default, data is sorted by x values and the measured values at the same
+            x value are averaged.
 
         - Create extra data from fit result:
             Override :meth:`~self._extra_database_entry`. You need to return a list of
@@ -448,7 +451,6 @@ def _format_data(self, data: CurveData) -> CurveData:
         """An optional subroutine to perform data pre-processing.
 
         Subclasses can override this method to apply pre-precessing to data values to fit.
-        Otherwise the analysis uses extracted data values as-is.
 
         For example,
 
@@ -458,20 +460,42 @@ def _format_data(self, data: CurveData) -> CurveData:
 
         etc...
 
+        By default, the analysis just takes average over the same x values and sort
+        data index by the x values in ascending order.
+
         .. note::
 
             The data returned by this method should have the label "fit_ready".
 
         Returns:
             Formatted CurveData instance.
         """
+        # take average over the same x value by keeping sigma
+        series, xdata, ydata, sigma, shots = multi_mean_xy_data(
+            series=data.data_index,
+            xdata=data.x,
+            ydata=data.y,
+            sigma=data.y_err,
+            shots=data.shots,
+            method="shots_weighted",
+        )
+
+        # sort by x value in ascending order
+        series, xdata, ydata, sigma, shots = data_sort(
+            series=series,
+            xdata=xdata,
+            ydata=ydata,
+            sigma=sigma,
+            shots=shots,
+        )
+
         return CurveData(
             label="fit_ready",
-            x=data.x,
-            y=data.y,
-            y_err=data.y_err,
-            data_index=data.data_index,
-            metadata=data.metadata,
+            x=xdata,
+            y=ydata,
+            y_err=sigma,
+            shots=shots,
+            data_index=series,
         )
 
     # pylint: disable=unused-argument
@@ -566,6 +590,9 @@ def _is_target_series(datum, **filters):
         # Store metadata
         metadata = np.asarray([datum["metadata"] for datum in data], dtype=object)
 
+        # Store shots
+        shots = np.asarray([datum.get("shots", np.nan) for datum in data])
+
         # Format data
         x_values = np.asarray(x_values, dtype=float)
         y_values = np.asarray(y_values, dtype=float)
@@ -585,6 +612,7 @@ def _is_target_series(datum, **filters):
             x=x_values,
             y=y_values,
             y_err=y_sigmas,
+            shots=shots,
             data_index=data_index,
             metadata=metadata,
         )
@@ -733,6 +761,7 @@ def _data(
                     x=data.x[locs],
                     y=data.y[locs],
                     y_err=data.y_err[locs],
+                    shots=data.shots[locs],
                     data_index=idx,
                     metadata=data.metadata[locs] if data.metadata is not None else None,
                 )

diff --git a/qiskit_experiments/curve_analysis/curve_data.py b/qiskit_experiments/curve_analysis/curve_data.py
@@ -68,6 +68,9 @@ class CurveData:
     # Error bar
     y_err: np.ndarray
 
+    # Shots number
+    shots: np.ndarray
+
     # Maping of data index to series index
     data_index: Union[np.ndarray, int]
 

diff --git a/qiskit_experiments/curve_analysis/data_processing.py b/qiskit_experiments/curve_analysis/data_processing.py
@@ -47,7 +47,11 @@ def filter_data(data: List[Dict[str, any]], **filters) -> List[Dict[str, any]]:
 
 
 def mean_xy_data(
-    xdata: np.ndarray, ydata: np.ndarray, sigma: Optional[np.ndarray] = None, method: str = "sample"
+    xdata: np.ndarray,
+    ydata: np.ndarray,
+    sigma: Optional[np.ndarray] = None,
+    shots: Optional[np.ndarray] = None,
+    method: str = "sample",
 ) -> Tuple[np.ndarray, ...]:
     r"""Return (x, y_mean, sigma) data.
 
@@ -61,41 +65,58 @@ def mean_xy_data(
     * ``"iwv"`` *Inverse-weighted variance*
       :math:`\overline{y} = (\sum_{i=1}^N y_i / \sigma_i^2 ) \sigma^2`
       :math:`\sigma^2 = 1 / (\sum_{i=1}^N 1 / \sigma_i^2)`
+    * ``"shots_weighted_variance"`` *Sample mean and variance with weights from shots*
+      :math:`\overline{y} = \sum_{i=1}^N n_i y_i / M`,
+      :math:`\sigma^2 = \sum_{i=1}^N (n_i \sigma_i / M)^2`,
+      where :math:`n_i` is the number of shots per data point and :math:`M = \sum_{i=1}^N n_i`
+      is a total number of shots from different circuit execution at the same x value.
+      If ``shots`` is not provided, this applies uniform weights to all values.
 
     Args
         xdata: 1D or 2D array of xdata from curve_fit_data or
                multi_curve_fit_data
         ydata: array of ydata returned from curve_fit_data or
                multi_curve_fit_data
         sigma: Optional, array of standard deviations in ydata.
+        shots: Optional, array of shots used to get a data point.
         method: The method to use for computing y means and
                 standard deviations sigma (default: "sample").
 
     Returns:
-        tuple: ``(x, y_mean, sigma)`` if ``return_raw==False``, where
+        tuple: ``(x, y_mean, sigma, shots)``, where
                ``x`` is an arrays of unique x-values, ``y`` is an array of
-               sample mean y-values, and ``sigma`` is an array of sample standard
-               deviation of y values.
+               sample mean y-values, ``sigma`` is an array of sample standard
+               deviation of y values, and ``shots`` are the total number of experiment shots
+               used to evaluate the data point. If ``shots`` in the function call is ``None``,
+               the numbers appear in the returned value will represent just a number of
+               duplicated x value entries.
 
     Raises:
         QiskitError: if "ivw" method is used without providing a sigma.
     """
     x_means = np.unique(xdata, axis=0)
     y_means = np.zeros(x_means.size)
     y_sigmas = np.zeros(x_means.size)
+    y_shots = np.zeros(x_means.size)
+
+    if shots is None or any(np.isnan(shots)):
+        # this will become standard average
+        shots = np.ones_like(xdata)
 
     # Sample mean and variance method
     if method == "sample":
         for i in range(x_means.size):
             # Get positions of y to average
             idxs = xdata == x_means[i]
             ys = ydata[idxs]
+            ns = shots[idxs]
 
             # Compute sample mean and biased sample variance
             y_means[i] = np.mean(ys)
             y_sigmas[i] = np.sqrt(np.mean((y_means[i] - ys) ** 2))
+            y_shots[i] = np.sum(ns)
 
-        return x_means, y_means, y_sigmas
+        return x_means, y_means, y_sigmas, y_shots
 
     # Inverse-weighted variance method
     if method == "iwv":
@@ -107,14 +128,33 @@ def mean_xy_data(
             # Get positions of y to average
             idxs = xdata == x_means[i]
             ys = ydata[idxs]
+            ns = shots[idxs]
 
             # Compute the inverse-variance weighted y mean and variance
             weights = 1 / sigma[idxs] ** 2
             y_var = 1 / np.sum(weights)
             y_means[i] = y_var * np.sum(weights * ys)
             y_sigmas[i] = np.sqrt(y_var)
+            y_shots[i] = np.sum(ns)
+
+        return x_means, y_means, y_sigmas, y_shots
+
+    # Quadrature sum of variance
+    if method == "shots_weighted":
+        for i in range(x_means.size):
+            # Get positions of y to average
+            idxs = xdata == x_means[i]
+            ys = ydata[idxs]
+            ss = sigma[idxs]
+            ns = shots[idxs]
+            weights = ns / np.sum(ns)
+
+            # Compute sample mean and sum of variance with weights based on shots
+            y_means[i] = np.sum(weights * ys)
+            y_sigmas[i] = np.sqrt(np.sum(weights ** 2 * ss ** 2))
+            y_shots[i] = np.sum(ns)
 
-        return x_means, y_means, y_sigmas
+        return x_means, y_means, y_sigmas, y_shots
 
     # Invalid method
     raise QiskitError(f"Unsupported method {method}")
@@ -125,40 +165,99 @@ def multi_mean_xy_data(
     xdata: np.ndarray,
     ydata: np.ndarray,
     sigma: Optional[np.ndarray] = None,
+    shots: Optional[np.ndarray] = None,
     method: str = "sample",
-):
-    r"""Return (series, x, y_mean, sigma) data.
-    Performs `mean_xy_data` for each series
-    and returns the concatenated results
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Take mean of multi series data set.
+
+    Args:
+        series: Series index.
+        xdata: 1D or 2D array of xdata from curve_fit_data or
+               multi_curve_fit_data
+        ydata: array of ydata returned from curve_fit_data or
+               multi_curve_fit_data
+        sigma: Optional, array of standard deviations in ydata.
+        shots: Optional, array of shots used to get a data point.
+        method: The method to use for computing y means and
+                standard deviations sigma (default: "sample").
+
+    Returns:
+        Tuple of (series, xdata, ydata, sigma, shots)
+
+    See also:
+        :py:func:`~qiskit_experiments.curve_analysis.data_processing.mean_xy_data`
     """
     series_vals = np.unique(series)
 
     series_means = []
     xdata_means = []
     ydata_means = []
     sigma_means = []
+    shots_sums = []
 
     # Get x, y, sigma data for series and process mean data
     for series_val in series_vals:
         idxs = series == series_val
         sigma_i = sigma[idxs] if sigma is not None else None
-        x_mean, y_mean, sigma_mean = mean_xy_data(
-            xdata[idxs], ydata[idxs], sigma=sigma_i, method=method
+        shots_i = shots[idxs] if shots is not None else None
+
+        x_mean, y_mean, sigma_mean, shots_sum = mean_xy_data(
+            xdata[idxs], ydata[idxs], sigma=sigma_i, shots=shots_i, method=method
         )
         series_means.append(np.full(x_mean.size, series_val, dtype=int))
         xdata_means.append(x_mean)
         ydata_means.append(y_mean)
         sigma_means.append(sigma_mean)
+        shots_sums.append(shots_sum)
 
     # Concatenate lists
     return (
         np.concatenate(series_means),
         np.concatenate(xdata_means),
         np.concatenate(ydata_means),
         np.concatenate(sigma_means),
+        np.concatenate(shots_sums),
     )
 
 
+def data_sort(
+    series: np.ndarray,
+    xdata: np.ndarray,
+    ydata: np.ndarray,
+    sigma: Optional[np.ndarray] = None,
+    shots: Optional[np.ndarray] = None,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Sort data.
+
+    Input x values may not be lined up in order, since experiment may accept user input array,
+    or data may be concatenated with previous scan. This sometimes confuses the algorithmic
+    generation of initial guesses especially when guess depends on derivative.
+
+    This returns data set that is sorted by xdata and series in ascending order.
+
+    Args:
+        series: Series index.
+        xdata: 1D or 2D array of xdata from curve_fit_data or
+               multi_curve_fit_data
+        ydata: array of ydata returned from curve_fit_data or
+               multi_curve_fit_data
+        sigma: Optional, array of standard deviations in ydata.
+        shots: Optional, array of shots used to get a data point.
+
+    Returns:
+        Tuple of (series, xdata, ydata, sigma, shots) sorted in ascending order of xdata and series.
+    """
+    if sigma is None:
+        sigma = np.full(series.size, np.nan, dtype=float)
+
+    if shots is None:
+        shots = np.full(series.size, np.nan, dtype=float)
+
+    sorted_data = sorted(zip(series, xdata, ydata, sigma, shots), key=lambda d: (d[0], d[1]))
+
+    return np.asarray(sorted_data).T
+
+
 def level2_probability(data: Dict[str, any], outcome: str) -> Tuple[float, float]:
     """Return the outcome probability mean and variance.
 

diff --git a/qiskit_experiments/curve_analysis/guess.py b/qiskit_experiments/curve_analysis/guess.py
@@ -22,7 +22,6 @@
 from scipy import signal
 
 from qiskit_experiments.exceptions import AnalysisError
-from .data_processing import mean_xy_data
 
 
 def frequency(
@@ -48,6 +47,7 @@ def frequency(
 
         This function returns always positive frequency.
         This function is sensitive to the DC offset.
+        This function assumes sorted, no-overlapping x values.
 
     Args:
         x: Array of x values.
@@ -58,14 +58,6 @@ def frequency(
     Returns:
         Frequency estimation of oscillation signal.
     """
-    # TODO averaging and sort should be done by data processor
-
-    # average the same data points
-    x, y, _ = mean_xy_data(x, y)
-
-    # sorted by x to be monotonic increase
-    x, y = np.asarray(sorted(zip(x, y), key=lambda xy: xy[0])).T
-
     # to run FFT x interval should be identical
     sampling_interval = np.unique(np.round(np.diff(x), decimals=20))