From ae6d17fbe4a9e65401c3db41c024a7ce0fe2534f Mon Sep 17 00:00:00 2001
From: Hiroshi Horii <horii@jp.ibm.com>
Date: Tue, 15 Mar 2022 01:59:14 +0900
Subject: [PATCH 01/10] Bump version strings to prepare for release

With the bug fix PRs all backported this commit bumps the version string
to indicate this is the next bugfix release.
---
 docs/conf.py                     | 2 +-
 qiskit/providers/aer/VERSION.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/conf.py b/docs/conf.py
index 7bc02513e5..b8163803c0 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -46,7 +46,7 @@
 # The short X.Y version
 version = ''
 # The full version, including alpha/beta/rc tags
-release = '0.10.3'
+release = '0.10.4'
 
 # -- General configuration ---------------------------------------------------
 
diff --git a/qiskit/providers/aer/VERSION.txt b/qiskit/providers/aer/VERSION.txt
index a3f5a8ed4d..9b40aa6c21 100644
--- a/qiskit/providers/aer/VERSION.txt
+++ b/qiskit/providers/aer/VERSION.txt
@@ -1 +1 @@
-0.10.3
+0.10.4

From 4f0cd3db74f922a6a3922d106498bb37d9ae1aaa Mon Sep 17 00:00:00 2001
From: Jake Lishman <jake.lishman@ibm.com>
Date: Tue, 1 Mar 2022 15:27:21 +0000
Subject: [PATCH 02/10] Do not build with `-ffast-math` (#1469)

This flag enables several unsafe floating-point transformations that are
not generally appropriate for a distributed mathematical library.  Most
importantly, however, `-ffast-math` on `gcc` also forcibly enables the
floating-point "flush to zero" mode for subnormal numbers when the
dynamic library is loaded, which affects not just Qiskit Aer, but every
other floating-point operation in the running process.  This is not our
decision to make for users, and it shouldn't be distributed.
---
 CMakeLists.txt                                 |  1 -
 .../notes/no-fast-math-1de357a9650094f3.yaml   | 18 ++++++++++++++++++
 .../extended_stabilizer/ch_runner.hpp          |  4 ----
 3 files changed, 18 insertions(+), 5 deletions(-)
 create mode 100644 releasenotes/notes/no-fast-math-1de357a9650094f3.yaml

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4eeb296f69..8c7e0d296d 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,7 +110,6 @@ if(STATIC_LINKING)
 endif()
 
 if(NOT MSVC)
-	enable_cxx_compiler_flag_if_supported("-ffast-math")
 	if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
 		# PowerPC builds are not meant to be redistributable, we build them
 		# in place, so we can have CPU = native.
diff --git a/releasenotes/notes/no-fast-math-1de357a9650094f3.yaml b/releasenotes/notes/no-fast-math-1de357a9650094f3.yaml
new file mode 100644
index 0000000000..829b474809
--- /dev/null
+++ b/releasenotes/notes/no-fast-math-1de357a9650094f3.yaml
@@ -0,0 +1,18 @@
+---
+upgrade:
+  - |
+    Qiskit Aer is no longer compiled with unsafe floating-point optimisations.
+    While most of the effects should have been localised to Qiskit Aer, some
+    aspects of subnormal handling may previously have been leaked into user code
+    by the library incorrectly setting the "flush to zero" mode.  This will not
+    happen any more.
+fixes:
+  - |
+    Qiskit Aer will no longer set the floating-point mode to "flush to zero"
+    when loaded.  Downstream users may previously have seen warnings from Numpy
+    such as:
+
+      The value of the smallest subnormal for <class 'numpy.float64'> type is zero.
+
+    These will now no longer be emitted, and the floating-point handling will be
+    correct.
diff --git a/src/simulators/extended_stabilizer/ch_runner.hpp b/src/simulators/extended_stabilizer/ch_runner.hpp
index 81303dd29d..b9303e727e 100644
--- a/src/simulators/extended_stabilizer/ch_runner.hpp
+++ b/src/simulators/extended_stabilizer/ch_runner.hpp
@@ -664,11 +664,7 @@ void Runner::metropolis_step(AER::RngEngine &rng)
   }
   complex_t ampsum(real_part, imag_part);
   double p_threshold = std::norm(ampsum)/std::norm(old_ampsum_);
-  #ifdef  __FAST_MATH__ //isnan doesn't behave well under fastmath, so use absolute tolerance check instead
-  if(std::isinf(p_threshold) || std::abs(std::norm(old_ampsum_)-0.) < 1e-8)
-  #else
   if(std::isinf(p_threshold) || std::isnan(p_threshold))
-  #endif
   {
     accept_ = true;
     old_ampsum_ = ampsum;

From e5f78f7cb1f53404af5c56bb18bb0da3257b7059 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 3 Mar 2022 17:26:28 -0500
Subject: [PATCH 03/10] Split out gpu py310 wheel build (#1472)

The aer gpu package wheel job for python 3.10 builds in the manylinux2014
container image, while for the older Python versions it uses the
manylinux2010 image. The manylinux2014 image is based on Centos 7 while
the manylinux2010 image is based on Centos 6. During the qiskit-aer
0.10.3 release the gpu wheel job failed because when running the py3.10
build we installed the cuda package for centos 6 in the manylinux2014
centos 7 based image. This caused the job to fail when nvcc was called
because it was not installed properly. To fix this issue the Python 3.10
build is split out into a separate job. This separate job will install
the same cuda version but for centos 7 which is appropriate for the
manylinux2014 image.
---
 .github/workflows/deploy.yml | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 157db0cb5d..d3be004436 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -134,8 +134,40 @@ jobs:
           python -m pip install cibuildwheel==2.2.2
       - name: Build wheels
         env:
-          CIBW_BEFORE_ALL: "yum install -y yum-utils wget && wget https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-rhel6-10-1-local-10.1.243-418.87.00-1.0-1.x86_64.rpm && rpm -i cuda-repo-rhel6-10-1-local-10.1.243-418.87.00-1.0-1.x86_64.rpm && yum clean all && yum -y install cuda-10-1 openblas-devel"
-          CIBW_SKIP: "*-manylinux_i686 pp* *musllinux*"
+          CIBW_BEFORE_ALL: "yum install -y yum-utils wget && wget -q https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-rhel6-10-1-local-10.1.243-418.87.00-1.0-1.x86_64.rpm && rpm -i cuda-repo-rhel6-10-1-local-10.1.243-418.87.00-1.0-1.x86_64.rpm && yum clean all && yum -y install cuda-10-1 openblas-devel"
+          CIBW_SKIP: "*-manylinux_i686 cp310* pp* cp36* *musllinux*"
+          CIBW_ENVIRONMENT: QISKIT_AER_PACKAGE_NAME=qiskit-aer-gpu AER_THRUST_BACKEND=CUDA CUDACXX=/usr/local/cuda/bin/nvcc
+        run: |
+          python -m cibuildwheel --output-dir wheelhouse
+      - uses: actions/upload-artifact@v2
+        with:
+          path: ./wheelhouse/*.whl
+      - name: Publish Wheels
+        env:
+          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+          TWINE_USERNAME: qiskit
+        run : |
+          pip install -U twine
+          twine upload wheelhouse/*
+  gpu-build-310:
+    name: Build qiskit-aer-gpu wheels
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        name: Install Python
+        with:
+          python-version: '3.7'
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1.0.2
+        if: runner.os == 'Windows'
+      - name: Install cibuildwheel
+        run: |
+          python -m pip install cibuildwheel==2.2.2
+      - name: Build wheels
+        env:
+          CIBW_BEFORE_ALL: "yum install -y yum-utils wget && wget -q https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-rhel7-10-1-local-10.1.105-418.39-1.0-1.x86_64.rpm && rpm -i cuda-repo-rhel7-10-1-local-10.1.105-418.39-1.0-1.x86_64.rpm && yum clean all && yum -y install cuda-10-1 openblas-devel"
+          CIBW_BUILD: "cp310-manylinux_x86_64"
           CIBW_ENVIRONMENT: QISKIT_AER_PACKAGE_NAME=qiskit-aer-gpu AER_THRUST_BACKEND=CUDA CUDACXX=/usr/local/cuda/bin/nvcc
         run: |
           python -m cibuildwheel --output-dir wheelhouse

From 80478fec494bdf942f056cef704d3df3f6a1ac99 Mon Sep 17 00:00:00 2001
From: Toshinari Itoko <15028342+itoko@users.noreply.github.com>
Date: Tue, 8 Mar 2022 05:18:17 +0900
Subject: [PATCH 04/10] Change to truncate T2 up to 2 * T1 in
 NoiseModel.from_backend (#1467)

* Change not to fail to run noisy simulation with T2 > 2 * T1

Previously NoiseModel.from_backend failed when the T2 value was greater than 2 * T1;
however, backends sometimes return such T2 values.
This change truncates the T2 value to 2 * T1 and issues a user warning.

Co-authored-by: Hiroshi Horii <horii@jp.ibm.com>
---
 qiskit/providers/aer/noise/device/models.py   | 12 ++++
 .../providers/aer/noise/device/parameters.py  |  5 --
 qiskit/providers/aer/noise/noise_model.py     |  8 ++-
 ...fix-invalid-t2-error-a3685e4a3ad0a1e7.yaml | 12 ++++
 test/terra/noise/test_noise_model.py          | 67 ++++++++++++++++++-
 5 files changed, 93 insertions(+), 11 deletions(-)
 create mode 100644 releasenotes/notes/fix-invalid-t2-error-a3685e4a3ad0a1e7.yaml

diff --git a/qiskit/providers/aer/noise/device/models.py b/qiskit/providers/aer/noise/device/models.py
index daa6c1ab96..c6ec388ec1 100644
--- a/qiskit/providers/aer/noise/device/models.py
+++ b/qiskit/providers/aer/noise/device/models.py
@@ -245,6 +245,7 @@ def _device_thermal_relaxation_error(qubits,
     error = None
     for qubit in qubits:
         t1, t2, freq = relax_params[qubit]
+        t2 = _truncate_t2_value(t1, t2)
         population = _excited_population(freq, temperature)
         if first:
             error = thermal_relaxation_error(t1, t2, gate_time, population)
@@ -255,6 +256,17 @@ def _device_thermal_relaxation_error(qubits,
     return error
 
 
+def _truncate_t2_value(t1, t2):
+    """Return t2 value truncated to 2 * t1 (for t2 > 2 * t1)"""
+    new_t2 = t2
+    if t2 > 2 * t1:
+        new_t2 = 2 * t1
+        warn("Device model returned an invalid T_2 relaxation time greater than"
+             f" the theoretical maximum value 2 * T_1 ({t2} > 2 * {t1})."
+             " Truncating to maximum value.", UserWarning)
+    return new_t2
+
+
 def _excited_population(freq, temperature):
     """Return excited state population"""
     population = 0
diff --git a/qiskit/providers/aer/noise/device/parameters.py b/qiskit/providers/aer/noise/device/parameters.py
index ead1fa1836..7468e526d4 100644
--- a/qiskit/providers/aer/noise/device/parameters.py
+++ b/qiskit/providers/aer/noise/device/parameters.py
@@ -180,11 +180,6 @@ def thermal_relaxation_values(properties):
                 # Convert to Gigahertz
                 freq *= _GHZ_UNITS.get(freq_params.unit, 1)
 
-        # NOTE: T2 cannot be larger than 2 * T1 for a physical noise
-        # channel, however if a backend erroneously reports such a value we
-        # truncated it here:
-        t2 = min(2 * t1, t2)
-
         values.append((t1, t2, freq))
     return values
 
diff --git a/qiskit/providers/aer/noise/noise_model.py b/qiskit/providers/aer/noise/noise_model.py
index 6bcf58d265..082ec62e1a 100644
--- a/qiskit/providers/aer/noise/noise_model.py
+++ b/qiskit/providers/aer/noise/noise_model.py
@@ -25,7 +25,7 @@
 from qiskit.providers.exceptions import BackendPropertyError
 from qiskit.providers.models import BackendProperties
 from qiskit.transpiler import PassManager
-from .device.models import _excited_population
+from .device.models import _excited_population, _truncate_t2_value
 from .device.models import basic_device_gate_errors
 from .device.models import basic_device_readout_errors
 from .errors.quantum_error import QuantumError
@@ -373,9 +373,11 @@ def from_backend(cls, backend,
             except BackendPropertyError:
                 excited_state_populations = None
             try:
+                t1s = [properties.t1(q) for q in range(num_qubits)]
+                t2s = [properties.t2(q) for q in range(num_qubits)]
                 delay_pass = RelaxationNoisePass(
-                    t1s=[properties.t1(q) for q in range(num_qubits)],
-                    t2s=[properties.t2(q) for q in range(num_qubits)],
+                    t1s=t1s,
+                    t2s=[_truncate_t2_value(t1, t2) for t1, t2 in zip(t1s, t2s)],
                     dt=dt,
                     op_types=Delay,
                     excited_state_populations=excited_state_populations
diff --git a/releasenotes/notes/fix-invalid-t2-error-a3685e4a3ad0a1e7.yaml b/releasenotes/notes/fix-invalid-t2-error-a3685e4a3ad0a1e7.yaml
new file mode 100644
index 0000000000..cb1abebbe7
--- /dev/null
+++ b/releasenotes/notes/fix-invalid-t2-error-a3685e4a3ad0a1e7.yaml
@@ -0,0 +1,12 @@
+---
+fixes:
+  - |
+    Fixes a bug in ``NoiseModel.from_backend()`` that raised an error when
+    a T2 value greater than 2 * T1 was supplied by the backend.
+    After this fix, the T2 value is truncated to 2 * T1 and a user
+    warning is issued whenever truncation occurs.
+    The bug was introduced in #1391; before that, ``NoiseModel.from_backend()``
+    silently truncated the T2 value to 2 * T1.
+
+    See `Issue 1464 <https://github.com/Qiskit/qiskit-aer/issues/1464>`__
+    for details.
diff --git a/test/terra/noise/test_noise_model.py b/test/terra/noise/test_noise_model.py
index 24f05e1bfc..9abd2af487 100644
--- a/test/terra/noise/test_noise_model.py
+++ b/test/terra/noise/test_noise_model.py
@@ -19,17 +19,18 @@
 import numpy as np
 from qiskit.providers.aer.backends import AerSimulator
 from qiskit.providers.aer.noise import NoiseModel
-from qiskit.providers.aer.utils.noise_transformation import transform_noise_model
+from qiskit.providers.aer.noise.device.models import _excited_population
 from qiskit.providers.aer.noise.errors.standard_errors import amplitude_damping_error
 from qiskit.providers.aer.noise.errors.standard_errors import kraus_error
 from qiskit.providers.aer.noise.errors.standard_errors import pauli_error
 from qiskit.providers.aer.noise.errors.standard_errors import reset_error
-from test.terra.common import QiskitAerTestCase
+from qiskit.providers.aer.noise.errors.standard_errors import thermal_relaxation_error
+from qiskit.providers.aer.utils.noise_transformation import transform_noise_model
 
 from qiskit.circuit import QuantumRegister, ClassicalRegister, QuantumCircuit
 from qiskit.compiler import transpile
-from qiskit.transpiler import TranspilerError
 from qiskit.test import mock
+from test.terra.common import QiskitAerTestCase
 
 
 class TestNoiseModel(QiskitAerTestCase):
@@ -229,6 +230,66 @@ def test_noise_model_from_mumbai(self):
         result = AerSimulator().run(circ, noise_model=noise_model).result()
         self.assertTrue(result.success)
 
+    def test_noise_model_from_invalid_t2_backend(self):
+        """Test if issue user warning when creating a noise model from invalid t2 backend"""
+        from qiskit.providers.models.backendproperties import BackendProperties, Gate, Nduv
+        import datetime
+
+        t1_ns, invalid_t2_ns = 75_1000, 200_1000
+        u3_time_ns = 320
+        frequency = 4919.96800692
+
+        class InvalidT2Fake1Q(mock.FakeBackend):
+            def __init__(self):
+                mock_time = datetime.datetime.now()
+                dt = 1.3333
+                configuration = BackendProperties(
+                    backend_name="invalid_t2",
+                    backend_version="0.0.0",
+                    num_qubits=1,
+                    basis_gates=["u3"],
+                    qubits=[
+                        [
+                            Nduv(date=mock_time, name="T1", unit="µs", value=t1_ns/1000),
+                            Nduv(date=mock_time, name="T2", unit="µs", value=invalid_t2_ns/1000),
+                            Nduv(date=mock_time, name="frequency", unit="MHz", value=frequency),
+                        ],
+                    ],
+                    gates=[
+                        Gate(
+                            gate="u3",
+                            name="u3_0",
+                            qubits=[0],
+                            parameters=[
+                                Nduv(date=mock_time, name="gate_error", unit="", value=0.001),
+                                Nduv(date=mock_time, name="gate_length", unit="ns", value=u3_time_ns),
+                            ],
+                        ),
+                    ],
+                    last_update_date=mock_time,
+                    general=[],
+                )
+                super().__init__(configuration)
+
+            def defaults(self):
+                """defaults == configuration"""
+                return self._configuration
+
+            def properties(self):
+                """properties == configuration"""
+                return self._configuration
+
+        backend = InvalidT2Fake1Q()
+        with self.assertWarns(UserWarning):
+            noise_model = NoiseModel.from_backend(backend, gate_error=False)
+            expected = thermal_relaxation_error(
+                t1=t1_ns,
+                t2=2*t1_ns,
+                time=u3_time_ns,
+                excited_state_population=_excited_population(frequency, temperature=0)
+            )
+            self.assertEqual(expected, noise_model._local_quantum_errors["u3"][(0, )])
+
     def test_transform_noise(self):
         org_error = reset_error(0.2)
         new_error = pauli_error([("I", 0.5), ("Z", 0.5)])

From 346ec243d31192eef100663e9a7b90055cb84f6b Mon Sep 17 00:00:00 2001
From: Jun Doi <doichan@jp.ibm.com>
Date: Fri, 18 Mar 2022 11:09:31 +0900
Subject: [PATCH 05/10] Fix: cache blocking transpiler for density_matrix with
 noises (#1480)

* Enable cacheblocking of superop operations and fix a bug in measurement with multi-chunks

Previously, superop was not recognized in the cacheblocking transpilation, so it was not handled correctly in
multi-GPU environments and hybrid environments with GPU and CPU.
This change enhances cacheblocking for superop so that density matrix simulation with GPU(s) works correctly.
In addition, a measurement issue with cacheblocking was also fixed.
---
 ...sity-multi-chunk-fix-e9effc67d0365418.yaml |  13 +
 src/simulators/statevector/qubitvector.hpp    |   2 +-
 .../statevector/qubitvector_thrust.hpp        |  14 +-
 .../statevector/statevector_state.hpp         |  10 +-
 src/transpile/cacheblocking.hpp               | 363 +++++++-----------
 .../backends/aer_simulator/test_chunk.py      |  29 ++
 .../backends/aer_simulator/test_noise.py      |  22 +-
 7 files changed, 217 insertions(+), 236 deletions(-)
 create mode 100644 releasenotes/notes/density-multi-chunk-fix-e9effc67d0365418.yaml

diff --git a/releasenotes/notes/density-multi-chunk-fix-e9effc67d0365418.yaml b/releasenotes/notes/density-multi-chunk-fix-e9effc67d0365418.yaml
new file mode 100644
index 0000000000..0815925509
--- /dev/null
+++ b/releasenotes/notes/density-multi-chunk-fix-e9effc67d0365418.yaml
@@ -0,0 +1,13 @@
+---
+fixes:
+  - |
+    Fixed the cache blocking transpiler pass so that superop operations are
+    recognized as cache blockable. This fixes
+    `issue 1479 <https://github.com/Qiskit/qiskit-aer/issues/1479>`__,
+    so density_matrix simulation with noise models can now be parallelized.
+    A new test, test_noise.TestNoise.test_kraus_gate_noise_on_QFT_cache_blocking,
+    was added to verify the fix.
+    This change also includes a fix for
+    `issue 1483 <https://github.com/Qiskit/qiskit-aer/issues/1483>`__,
+    which was discovered while adding the new test case:
+    measurement across multiple chunks now works correctly for statevector.
diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp
index 79fad5745b..79be2dc657 100755
--- a/src/simulators/statevector/qubitvector.hpp
+++ b/src/simulators/statevector/qubitvector.hpp
@@ -399,7 +399,7 @@ class QubitVector {
   // Get the sample_measure index size
   int get_sample_measure_index_size() {return sample_measure_index_size_;}
 
-  virtual bool enable_batch(bool flg)
+  virtual bool enable_batch(bool flg) const
   {
     return false;
   }
diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp
index fdbbbbe2b5..b18568f003 100644
--- a/src/simulators/statevector/qubitvector_thrust.hpp
+++ b/src/simulators/statevector/qubitvector_thrust.hpp
@@ -312,7 +312,7 @@ class QubitVectorThrust {
 #endif
   }
 
-  bool enable_batch(bool flg);
+  bool enable_batch(bool flg) const;
 
   virtual void apply_bfunc(const Operations::Op &op);
   virtual void set_conditional(int_t reg);
@@ -450,7 +450,7 @@ class QubitVectorThrust {
   uint_t chunk_index_;
   bool multi_chunk_distribution_;
   bool multi_shots_;
-  bool enable_batch_;
+  mutable bool enable_batch_;
 
   bool register_blocking_;
 
@@ -1274,7 +1274,7 @@ void QubitVectorThrust<data_t>::set_conditional(int_t reg)
 }
 
 template <typename data_t>
-bool QubitVectorThrust<data_t>::enable_batch(bool flg)
+bool QubitVectorThrust<data_t>::enable_batch(bool flg) const
 {
   bool prev = enable_batch_;
 
@@ -1530,6 +1530,8 @@ void QubitVectorThrust<data_t>::apply_function_sum(double* pSum,Function func,bo
   if(func.batch_enable() && ((multi_chunk_distribution_ && chunk_.device() >= 0 && num_qubits_ == num_qubits()) || (enable_batch_))){
     if(chunk_.pos() != 0){
       //only first chunk on device calculates all the chunks
+      if(pSum)
+        *pSum = 0.0;
       return;
     }
     count = chunk_.container()->num_chunks();
@@ -1555,6 +1557,10 @@ void QubitVectorThrust<data_t>::apply_function_sum2(double* pSum,Function func,b
   if(func.batch_enable() && ((multi_chunk_distribution_ && chunk_.device() >= 0 && num_qubits_ == num_qubits()) || (enable_batch_))){
     if(chunk_.pos() != 0){
       //only first chunk on device calculates all the chunks
+      if(pSum){
+        pSum[0] = 0.0;
+        pSum[1] = 0.0;
+      }
       return;
     }
     count = chunk_.container()->num_chunks();
@@ -3439,7 +3445,7 @@ double QubitVectorThrust<data_t>::norm() const
 {
   double ret;
 #ifdef AER_THRUST_CUDA
-  if((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_){
+  if(enable_batch_ && ((multi_chunk_distribution_ && chunk_.device() >= 0) || !multi_chunk_distribution_)){
     if(chunk_.pos() != 0)
       return 0.0;   //first chunk execute all in batch
   }
diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp
index 5606be96e7..e9084b8713 100755
--- a/src/simulators/statevector/statevector_state.hpp
+++ b/src/simulators/statevector/statevector_state.hpp
@@ -1838,12 +1838,18 @@ std::vector<reg_t> State<statevec_t>::sample_measure(const reg_t &qubits,
     //calculate per chunk sum
     if(BaseState::chunk_omp_parallel_){
 #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) 
-      for(i=0;i<BaseState::qregs_.size();i++)
+      for(i=0;i<BaseState::qregs_.size();i++){
+        bool batched = BaseState::qregs_[i].enable_batch(true);   //return sum of all chunks in group
         chunkSum[i] = BaseState::qregs_[i].norm();
+        BaseState::qregs_[i].enable_batch(batched);
+      }
     }
     else{
-      for(i=0;i<BaseState::qregs_.size();i++)
+      for(i=0;i<BaseState::qregs_.size();i++){
+        bool batched = BaseState::qregs_[i].enable_batch(true);   //return sum of all chunks in group
         chunkSum[i] = BaseState::qregs_[i].norm();
+        BaseState::qregs_[i].enable_batch(batched);
+      }
     }
 
     localSum = 0.0;
diff --git a/src/transpile/cacheblocking.hpp b/src/transpile/cacheblocking.hpp
index 900365ae38..041228d686 100644
--- a/src/transpile/cacheblocking.hpp
+++ b/src/transpile/cacheblocking.hpp
@@ -1,7 +1,7 @@
 /**
  * This code is part of Qiskit.
  *
- * (C) Copyright IBM 2018, 2019.
+ * (C) Copyright IBM 2018, 2019, 2022.
  *
  * This code is licensed under the Apache License, Version 2.0. You may
  * obtain a copy of this license in the LICENSE.txt file in the root directory
@@ -332,13 +332,9 @@ bool CacheBlocking::can_reorder(Operations::Op& op,std::vector<Operations::Op>&
   //check if the operation can be reordered in front of waiting queue
   uint_t j,iq,jq;
 
-  //only gate and matrix can be reordered
-  if(op.type != Operations::OpType::gate && op.type != Operations::OpType::matrix && op.type != Operations::OpType::diagonal_matrix){
-    //except for reset for density matrix
-    if(!density_matrix_ || op.type != Operations::OpType::reset){
-      return false;
-    }
-  }
+  //only blockable ops can be reordered
+  if(!is_blockable_operation(op))
+    return false;
 
   for(j=0;j<waiting_ops.size();j++){
     if(is_blockable_operation(waiting_ops[j])){
@@ -505,266 +501,187 @@ uint_t CacheBlocking::add_ops(std::vector<Operations::Op>& ops,std::vector<Opera
   pos_begin = out.size();
   num_gates_added = 0;
 
-//  if(doSwap){
-    //find qubits to be blocked
-    if(first && doSwap){
-      //use lower bits for initialization
-      for(i=0;i<block_bits_;i++){
-        blockedQubits.push_back(i);
-      }
+  //find qubits to be blocked
+  if(first && doSwap){
+    //use lower bits for initialization
+    for(i=0;i<block_bits_;i++){
+      blockedQubits.push_back(i);
     }
-    else{
-      if(crossQubitOnly){
-        //add multi-qubits gate at first
-        define_blocked_qubits(ops,blockedQubits,true);
+  }
+  else{
+    if(crossQubitOnly){
+      //add multi-qubits gate at first
+      define_blocked_qubits(ops,blockedQubits,true);
 
-        //not enough qubits are blocked, then add one qubit gate
-        if(blockedQubits.size() < block_bits_)
-          define_blocked_qubits(ops,blockedQubits,false);
-      }
-      else{
+      //not enough qubits are blocked, then add one qubit gate
+      if(blockedQubits.size() < block_bits_)
         define_blocked_qubits(ops,blockedQubits,false);
-      }
     }
+    else{
+      define_blocked_qubits(ops,blockedQubits,false);
+    }
+  }
 
-    pos_begin = out.size();
-    num_gates_added = 0;
+  pos_begin = out.size();
+  num_gates_added = 0;
 
-    if(doSwap){
-      //insert swap gates to block operations
-      reg_t swap(block_bits_);
-      std::vector<bool> mapped(block_bits_,false);
-      nq = blockedQubits.size();
-      for(i=0;i<nq;i++){
-        swap[i] = qubits_;  //not defined
+  if(doSwap){
+    //insert swap gates to block operations
+    reg_t swap(block_bits_);
+    std::vector<bool> mapped(block_bits_,false);
+    nq = blockedQubits.size();
+    for(i=0;i<nq;i++){
+      swap[i] = qubits_;  //not defined
+      for(j=0;j<block_bits_;j++){
+        if(blockedQubits[i] == qubitSwapped_[j]){
+          swap[i] = j;
+          mapped[j] = true;
+          break;
+        }
+      }
+    }
+    for(i=0;i<nq;i++){
+      if(swap[i] == qubits_){
         for(j=0;j<block_bits_;j++){
-          if(blockedQubits[i] == qubitSwapped_[j]){
+          if(!mapped[j]){
             swap[i] = j;
             mapped[j] = true;
             break;
           }
         }
       }
-      for(i=0;i<nq;i++){
-        if(swap[i] == qubits_){
-          for(j=0;j<block_bits_;j++){
-            if(!mapped[j]){
-              swap[i] = j;
-              mapped[j] = true;
-              break;
-            }
-          }
+    }
+    for(i=0;i<nq;i++){
+      if(qubitSwapped_[swap[i]] != blockedQubits[i]){ //need swap gate
+        if(!first){   //swap gate is not required for initial state
+          insert_swap(out,swap[i],qubitMap_[blockedQubits[i]],true);
         }
-      }
-      for(i=0;i<nq;i++){
-        if(qubitSwapped_[swap[i]] != blockedQubits[i]){ //need swap gate
-          if(!first){   //swap gate is not required for initial state
-            insert_swap(out,swap[i],qubitMap_[blockedQubits[i]],true);
-          }
 
-          //swap map
-          j = qubitMap_[blockedQubits[i]];
-          qubitMap_[qubitSwapped_[swap[i]]] = j;
-          qubitMap_[blockedQubits[i]] = swap[i];
+        //swap map
+        j = qubitMap_[blockedQubits[i]];
+        qubitMap_[qubitSwapped_[swap[i]]] = j;
+        qubitMap_[blockedQubits[i]] = swap[i];
 
-          qubitSwapped_[j] = qubitSwapped_[swap[i]];
-          qubitSwapped_[swap[i]] = blockedQubits[i];
-        }
+        qubitSwapped_[j] = qubitSwapped_[swap[i]];
+        qubitSwapped_[swap[i]] = blockedQubits[i];
       }
     }
+  }
 
-    if(doSwap)
-      insert_sim_op(out,"begin_blocking",blockedQubits);
-    else
-      insert_sim_op(out,"begin_memory_blocking",blockedQubits);
-    end_block_inserted = false;
-
-    //gather blocked gates
-    for(i=0;i<ops.size();i++){
-      if(is_blockable_operation(ops[i])){
-        if(!end_block_inserted){
-          if(is_diagonal_op(ops[i]) || can_block(ops[i],blockedQubits)){
-            if(can_reorder(ops[i],queue)){
-              //mapping swapped qubits
-              for(iq=0;iq<ops[i].qubits.size();iq++){
-                ops[i].qubits[iq] = qubitMap_[ops[i].qubits[iq]];
-              }
-              out.push_back(ops[i]);
-              num_gates_added++;
-              continue;
+  if(doSwap)
+    insert_sim_op(out,"begin_blocking",blockedQubits);
+  else
+    insert_sim_op(out,"begin_memory_blocking",blockedQubits);
+  end_block_inserted = false;
+
+  //gather blocked gates
+  for(i=0;i<ops.size();i++){
+    if(is_blockable_operation(ops[i])){
+      if(!end_block_inserted){
+        if(is_diagonal_op(ops[i]) || can_block(ops[i],blockedQubits)){
+          if(can_reorder(ops[i],queue)){
+            //mapping swapped qubits
+            for(iq=0;iq<ops[i].qubits.size();iq++){
+              ops[i].qubits[iq] = qubitMap_[ops[i].qubits[iq]];
             }
+            out.push_back(ops[i]);
+            num_gates_added++;
+            continue;
           }
-          else if(ops[i].name == "pauli"){
-            if(can_reorder(ops[i],queue)){
-              if(split_pauli(ops[i],blockedQubits,out,queue))
-                num_gates_added++;
-              continue;
-            }
+        }
+        else if(ops[i].name == "pauli"){
+          if(can_reorder(ops[i],queue)){
+            if(split_pauli(ops[i],blockedQubits,out,queue))
+              num_gates_added++;
+            continue;
           }
-          else if(ops[i].type == Operations::OpType::reset){    //reset for density matrix can be cache blocked
-            if(can_reorder(ops[i],queue)){
-              if(split_op(ops[i],blockedQubits,out,queue))
-                num_gates_added++;
-              continue;
-            }
+        }
+        else if(ops[i].type == Operations::OpType::reset){    //reset for density matrix can be cache blocked
+          if(can_reorder(ops[i],queue)){
+            if(split_op(ops[i],blockedQubits,out,queue))
+              num_gates_added++;
+            continue;
           }
         }
       }
-      else{
-        if(queue.size() == 0){          //if queue is empty, apply op here
-          bool restore_qubits = false;
-          if(ops[i].type == Operations::OpType::kraus){
-            if(ops[i].qubits.size() > block_bits_){
-              throw std::runtime_error("CacheBlocking : Kraus operator, number of qubits should be smaller than chunk qubit size");
-              break;
-            }
+    }
+    else{
+      if(queue.size() == 0){          //if queue is empty, apply op here
+        bool restore_qubits = false;
+        if(ops[i].type == Operations::OpType::kraus){
+          if(ops[i].qubits.size() > block_bits_){
+            throw std::runtime_error("CacheBlocking : Kraus operator, number of qubits should be smaller than chunk qubit size");
+            break;
+          }
+          if(!can_block(ops[i],blockedQubits)){  //if some qubits are out of chunk, queued for next step
+            queue.push_back(ops[i]);
+            continue;
+          }
+        }
+        else if(ops[i].type == Operations::OpType::initialize){
+          if(ops[i].qubits.size() <= block_bits_){
             if(!can_block(ops[i],blockedQubits)){  //if some qubits are out of chunk, queued for next step
               queue.push_back(ops[i]);
               continue;
             }
           }
-          else if(ops[i].type == Operations::OpType::initialize){
-            if(ops[i].qubits.size() <= block_bits_){
-              if(!can_block(ops[i],blockedQubits)){  //if some qubits are out of chunk, queued for next step
-                queue.push_back(ops[i]);
-                continue;
-              }
-            }
-            //otherwise StateChunk have to parallelize initialize operation
-          }
-          else if(sample_measure_ && ops[i].type == Operations::OpType::measure){
-            //currently sampling should be done with original qubit mapping (TO DO : sampling without inserting swaps)
+          //otherwise StateChunk have to parallelize initialize operation
+        }
+        else if(sample_measure_ && ops[i].type == Operations::OpType::measure){
+          //currently sampling should be done with original qubit mapping (TO DO : sampling without inserting swaps)
+          restore_qubits = true;
+        }
+        else if(ops[i].type != Operations::OpType::measure && ops[i].type != Operations::OpType::reset && 
+                ops[i].type != Operations::OpType::save_amps && ops[i].type != Operations::OpType::save_amps_sq &&
+                ops[i].type != Operations::OpType::save_densmat){
+          if(!(ops[i].type == Operations::OpType::snapshot && ops[i].name == "density_matrix")){
             restore_qubits = true;
           }
-          else if(ops[i].type != Operations::OpType::measure && ops[i].type != Operations::OpType::reset && 
-                  ops[i].type != Operations::OpType::save_amps && ops[i].type != Operations::OpType::save_amps_sq &&
-                  ops[i].type != Operations::OpType::save_densmat){
-            if(!(ops[i].type == Operations::OpType::snapshot && ops[i].name == "density_matrix")){
-              restore_qubits = true;
-            }
-          }
-
-          if(num_gates_added > 0 && !end_block_inserted){  //insert end of block to synchronize chunks
-            if(doSwap)
-              insert_sim_op(out,"end_blocking",blockedQubits);
-            else
-              insert_sim_op(out,"end_memory_blocking",blockedQubits);
-          }
-          else if(!end_block_inserted){
-            out.pop_back();
-          }
-          if(restore_qubits && doSwap)
-            restore_qubits_order(out);
-
-          //mapping swapped qubits
-          if(doSwap){
-            for(iq=0;iq<ops[i].qubits.size();iq++){
-              ops[i].qubits[iq] = qubitMap_[ops[i].qubits[iq]];
-            }
-          }
-
-          out.push_back(ops[i]);
-          num_gates_added++;
-
-          end_block_inserted = true;
-          continue;
         }
-      }
-      queue.push_back(ops[i]);
-    }
 
-    if(!end_block_inserted){
-      if(num_gates_added > 0){
-        if(doSwap)
-          insert_sim_op(out,"end_blocking",blockedQubits);
-        else
-          insert_sim_op(out,"end_memory_blocking",blockedQubits);
-      }
-      else{
-        //pop unnecessary operations
-        while(out.size() > pos_begin){
-          out.pop_back();
-        }
-      }
-    }
-/*  }
-  else{
-    i = 0;
-    //add chunk swap and block ops (if blocking is enabled)
-    if(blocking_enabled_){
-      while(i <ops.size()){
-        if(ops[i].type == Operations::OpType::sim_op){
-          out.push_back(ops[i]);
-        }
-        else if(ops[i].type == Operations::OpType::gate && ops[i].name == "swap_chunk"){
-          out.push_back(ops[i]);
+        if(num_gates_added > 0 && !end_block_inserted){  //insert end of block to synchronize chunks
+          if(doSwap)
+            insert_sim_op(out,"end_blocking",blockedQubits);
+          else
+            insert_sim_op(out,"end_memory_blocking",blockedQubits);
         }
-        else{
-          break;
+        else if(!end_block_inserted){
+          out.pop_back();
         }
-        i++;
-      }
-    }
+        if(restore_qubits && doSwap)
+          restore_qubits_order(out);
 
-    insert_sim_op(out,"begin_register_blocking",blockedQubits);
-    //gather blocked gates
-    while(i < ops.size()){
-      if(ops[i].type == Operations::OpType::gate || ops[i].type == Operations::OpType::matrix){
-        if((ops[i].qubits.size() > 1 && ops[i].type == Operations::OpType::matrix) || ops[i].name == "pauli"){
-          queue.push_back(ops[i]);
-        }
-        else{
-          if(can_reorder(ops[i],queue)){
-            if(is_diagonal_op(ops[i])){
-              //diagonal gate can be applied
-              out.push_back(ops[i]);
-              num_gates_added++;
-            }
-            else{
-              exist = false;
-              iq = ops[i].qubits[ops[i].qubits.size()-1]; //block target bit
-              nq = blockedQubits.size();
-              for(j=0;j<nq;j++){
-                if(iq == blockedQubits[j]){
-                  exist = true;
-                  break;
-                }
-              }
-              if(exist){
-                out.push_back(ops[i]);
-                num_gates_added++;
-              }
-              else{
-                if(nq == memory_blocking_bits_){
-                  queue.push_back(ops[i]);
-                }
-                else{
-                  blockedQubits.push_back(iq);
-                  out.push_back(ops[i]);
-                  num_gates_added++;
-                }
-              }
-            }
-          }
-          else{
-            queue.push_back(ops[i]);
+        //mapping swapped qubits
+        if(doSwap){
+          for(iq=0;iq<ops[i].qubits.size();iq++){
+            ops[i].qubits[iq] = qubitMap_[ops[i].qubits[iq]];
           }
         }
+
+        out.push_back(ops[i]);
+        num_gates_added++;
+
+        end_block_inserted = true;
+        continue;
       }
-      else{
-        queue.push_back(ops[i]);
-      }
-      i++;
     }
+    queue.push_back(ops[i]);
+  }
 
-    if(out.size() > pos_begin + 1){
-      out[pos_begin].qubits = blockedQubits;  //store qubits to be blocked in the sim_op::begin_register_blocking
-      insert_sim_op(out,"end_register_blocking",blockedQubits);
+  if(!end_block_inserted){
+    if(num_gates_added > 0){
+      if(doSwap)
+        insert_sim_op(out,"end_blocking",blockedQubits);
+      else
+        insert_sim_op(out,"end_memory_blocking",blockedQubits);
     }
     else{
-      out.pop_back();
+      //pop unnecessary operations
+      while(out.size() > pos_begin){
+        out.pop_back();
+      }
     }
-  }*/
+  }
 
   return num_gates_added;
 }
diff --git a/test/terra/backends/aer_simulator/test_chunk.py b/test/terra/backends/aer_simulator/test_chunk.py
index 1939818a7e..1ccaf35b85 100644
--- a/test/terra/backends/aer_simulator/test_chunk.py
+++ b/test/terra/backends/aer_simulator/test_chunk.py
@@ -88,6 +88,35 @@ def test_chunk_QuantumVolumeWithFusion(self, method, device):
 
         self.assertEqual(counts_no_chunk, counts)
 
+    @supported_methods(['statevector', 'density_matrix'])
+    def test_chunk_QFT(self, method, device):
+        """Test multi-chunk with QFT"""
+        opts_no_chunk = {
+            "fusion_enable": False,
+            "fusion_threshold": 10,
+        } 
+        opts_chunk = copy.copy(opts_no_chunk)
+        opts_chunk["blocking_enable"] = True
+        opts_chunk["blocking_qubits"] = 2
+
+        backend = self.backend(
+            method=method, device=device, **opts_chunk)
+        backend_no_chunk = self.backend(
+            method=method, device=device, **opts_no_chunk)
+
+        shots = 100
+        num_qubits = 3
+        circuit = transpile(QFT(num_qubits), backend=backend,
+                            optimization_level=0)
+        circuit.measure_all()
+        
+        result = backend.run(circuit, shots=shots, memory=True).result()
+        counts = result.get_counts(circuit)
+        result_no_chunk = backend_no_chunk.run(circuit, shots=shots, memory=True).result()
+        counts_no_chunk = result_no_chunk.get_counts(circuit)
+
+        self.assertEqual(counts_no_chunk, counts)
+
     @supported_methods(['statevector', 'density_matrix'])
     def test_chunk_QFTWithFusion(self, method, device):
         """Test multi-chunk with fused QFT (testing multi-chunk diagonal matrix)"""
diff --git a/test/terra/backends/aer_simulator/test_noise.py b/test/terra/backends/aer_simulator/test_noise.py
index 0c9e8f9d44..2549d22f2d 100644
--- a/test/terra/backends/aer_simulator/test_noise.py
+++ b/test/terra/backends/aer_simulator/test_noise.py
@@ -153,10 +153,7 @@ def test_kraus_gate_noise(self, method, device):
             self.assertSuccess(result)
             self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
 
-    @supported_methods([
-        'automatic', 'statevector', 'density_matrix', 'matrix_product_state'])
-    def test_kraus_gate_noise_on_QFT(self, method, device):
-        """Test Kraus noise on a QFT circuit"""
+    def _test_kraus_gate_noise_on_QFT(self, **options):
         shots = 10000
 
         # Build noise model
@@ -166,8 +163,7 @@ def test_kraus_gate_noise_on_QFT(self, method, device):
         noise_model.add_all_qubit_quantum_error(error1, ['h'])
         noise_model.add_all_qubit_quantum_error(error2, ['cp', 'swap'])
 
-        backend = self.backend(
-            method=method, device=device, noise_model=noise_model)
+        backend = self.backend(**options, noise_model=noise_model)
         ideal_circuit = transpile(QFT(3), backend)
 
         # manaully build noise circuit
@@ -188,6 +184,20 @@ def test_kraus_gate_noise_on_QFT(self, method, device):
         self.assertSuccess(result)
         self.compare_counts(result, [ideal_circuit], [ref_target], hex_counts=False, delta=0.1 * shots)
 
+    @supported_methods([
+        'automatic', 'statevector', 'density_matrix', 'matrix_product_state'])
+    def test_kraus_gate_noise_on_QFT(self, method, device):
+        """Test Kraus noise on a QFT circuit"""
+        self._test_kraus_gate_noise_on_QFT(
+            method=method, device=device)
+
+    @supported_methods([
+        'statevector', 'density_matrix'])
+    def test_kraus_gate_noise_on_QFT_cache_blocking(self, method, device):
+        """Test Kraus noise on a QFT circuit with cache blocking"""
+        self._test_kraus_gate_noise_on_QFT(
+            method=method, device=device, blocking_qubits=2)
+
     @supported_methods(ALL_METHODS)
     def test_clifford_circuit_noise(self, method, device):
         """Test simulation with mixed Clifford quantum errors in circuit."""

From 0f338cd03641386705eca8127f109b37d56078f9 Mon Sep 17 00:00:00 2001
From: Jake Lishman <jake.lishman@ibm.com>
Date: Fri, 25 Mar 2022 14:23:06 +0000
Subject: [PATCH 06/10] Pin jinja2 in CI (#1490)

The 3.1 release of Jinja breaks some part of the Sphinx/nbsphinx
pipeline, causing tutorial jobs to fail.  The jinja2 developers consider
this a Sphinx bug, so we need to pin Jinja until a new version of Sphinx
or the nbsphinx extension is released to fix the issue.
---
 constraints.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/constraints.txt b/constraints.txt
index 7934cfcbee..a0a43723c5 100644
--- a/constraints.txt
+++ b/constraints.txt
@@ -7,3 +7,8 @@ scipy>=1.0
 # with modern importlib-metadata (4.8.1). importlib-metadata is only needed on
 # Python <3.8.
 importlib-metadata==4.6.4
+
+# Jinja2 3.1.0 is incompatible with sphinx and/or jupyter until they are updated
+# to work with the new jinja version (the jinja maintainers aren't going to
+# fix things), so pin to the previous working version.
+jinja2==3.0.3

From 14799691fdad6b593428d8a14c86221b14b34938 Mon Sep 17 00:00:00 2001
From: Hiroshi Horii <hhorii@users.noreply.github.com>
Date: Tue, 29 Mar 2022 11:45:19 +0900
Subject: [PATCH 07/10] error message includes required memory mb (#1494)

* Add a size of required memory in an error message

Previously there was no way to know the memory required to simulate a circuit when a simulation
failed due to a shortage of memory. This change adds the required memory and the available max
memory to the error status message.
---
 src/controllers/aer_controller.hpp            | 12 +++++---
 .../backends/aer_simulator/test_options.py    | 28 +++++++++++++++++++
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp
index 2f1f35639f..316effa8e4 100755
--- a/src/controllers/aer_controller.hpp
+++ b/src/controllers/aer_controller.hpp
@@ -1930,10 +1930,14 @@ bool Controller::validate_state(const state_t &state, const Circuit &circ,
     size_t required_mb = state.required_memory_mb(circ.num_qubits, circ.ops) / num_process_per_experiment_;                                        
     size_t mem_size = (sim_device_ == Device::GPU) ? max_memory_mb_ + max_gpu_memory_mb_ : max_memory_mb_;
     memory_valid = (required_mb <= mem_size);
-  }
-  if (throw_except && !memory_valid) {
-    error_msg << "Insufficient memory to run circuit " << circ_name;
-    error_msg << " using the " << state.name() << " simulator.";
+    if (throw_except && !memory_valid) {
+      error_msg << "Insufficient memory to run circuit " << circ_name;
+      error_msg << " using the " << state.name() << " simulator.";
+      error_msg << " Required memory: " << required_mb << "M, max memory: " << max_memory_mb_ << "M";
+      if (sim_device_ == Device::GPU) {
+        error_msg << " (Host) + " << max_gpu_memory_mb_ << "M (GPU)";
+      }
+    }
   }
 
   if (noise_valid && circ_valid && memory_valid) {
diff --git a/test/terra/backends/aer_simulator/test_options.py b/test/terra/backends/aer_simulator/test_options.py
index cf5e31ab8a..4400701e39 100644
--- a/test/terra/backends/aer_simulator/test_options.py
+++ b/test/terra/backends/aer_simulator/test_options.py
@@ -186,3 +186,31 @@ def test_mps_options(self):
         # Check that the approximated result is not identical to the exact
         # result, because that could mean there was actually no approximation
         self.assertLessEqual(state_fidelity(sv_left, sv_approx), 0.999)
+
+    def test_statevector_memory(self):
+        """Test required memory is correctly checked in statevector"""
+        method = "statevector"
+        backend = self.backend(method=method)
+
+        # attempt to simulate a circuit with too many qubits
+        n = 50
+        circuit = QuantumCircuit(n)
+        for q in range(n):
+            circuit.h(q)
+        circuit.measure_all()
+        result = backend.run(circuit).result()
+        self.assertNotSuccess(result)
+        self.assertTrue('Insufficient memory' in result.results[0].status)
+        self.assertTrue('Required memory: {}'.format(2**(n-20)*16) in result.results[0].status)
+
+        n = 30
+        max_memory_mb = 16
+        circuit = QuantumCircuit(n)
+        for q in range(n):
+            circuit.h(q)
+        circuit.measure_all()
+        result = backend.run(circuit, max_memory_mb=max_memory_mb).result()
+        self.assertNotSuccess(result)
+        self.assertTrue('Insufficient memory' in result.results[0].status)
+        self.assertTrue('Required memory: {}'.format(2**(n-20)*16) in result.results[0].status)
+        self.assertTrue('max memory: {}'.format(max_memory_mb) in result.results[0].status)

From 23f7c4b52119ceaa7332f638d6115472c08129d5 Mon Sep 17 00:00:00 2001
From: Hiroshi Horii <hhorii@users.noreply.github.com>
Date: Tue, 29 Mar 2022 23:40:18 +0900
Subject: [PATCH 08/10] Fix handling of circuit metadata (#1436)

The to_json() method  is called for each circuit header.
A header may include metadata and metadata can be a python object.
This PR changes serialization of circuit headers to use py::handle without serializing to a json.

Fixes #1435


* use python parser for circuit.header

* support metadata copy with parameterization

* avoid serialization of circuit metadata

* use circuit_index to specify metadata

* remove metadata from qobj for Aer to simulate circuits

* add release note

* clear circuit metadata correctly.

* take unnecessary tests for circuit metadata backup/recovery

* work around metadata serialization issue within _run method

* Update releasenotes/notes/remove_circuit_metadata_from_qobj-324e7ea9b369ee67.yaml
---
 qiskit/providers/aer/backends/aerbackend.py   |  29 +++++
 ...t_metadata_from_qobj-324e7ea9b369ee67.yaml |  13 +++
 .../backends/aer_simulator/test_metadata.py   | 104 ++++++++++++++++++
 3 files changed, 146 insertions(+)
 create mode 100644 releasenotes/notes/remove_circuit_metadata_from_qobj-324e7ea9b369ee67.yaml
 create mode 100644 test/terra/backends/aer_simulator/test_metadata.py

diff --git a/qiskit/providers/aer/backends/aerbackend.py b/qiskit/providers/aer/backends/aerbackend.py
index cc219e9fcb..2e802c66d1 100644
--- a/qiskit/providers/aer/backends/aerbackend.py
+++ b/qiskit/providers/aer/backends/aerbackend.py
@@ -288,9 +288,30 @@ def _run(self, qobj, job_id='', format_result=True):
         # Start timer
         start = time.time()
 
+        # Take metadata from headers of experiments to work around JSON serialization error
+        metadata_list = []
+        metadata_index = 0
+        for expr in qobj.experiments:
+            if hasattr(expr.header, "metadata"):
+                metadata_copy = expr.header.metadata.copy()
+                metadata_list.append(metadata_copy)
+                expr.header.metadata.clear()
+                if "id" in metadata_copy:
+                    expr.header.metadata["id"] = metadata_copy["id"]
+                expr.header.metadata["metadata_index"] = metadata_index
+                metadata_index += 1
+
         # Run simulation
         output = self._execute(qobj)
 
+        # Recover metadata
+        metadata_index = 0
+        for expr in qobj.experiments:
+            if hasattr(expr.header, "metadata"):
+                expr.header.metadata.clear()
+                expr.header.metadata.update(metadata_list[metadata_index])
+                metadata_index += 1
+
         # Validate output
         if not isinstance(output, dict):
             logger.error("%s: simulation failed.", self.name())
@@ -305,6 +326,14 @@ def _run(self, qobj, job_id='', format_result=True):
         output["backend_name"] = self.name()
         output["backend_version"] = self.configuration().backend_version
 
+        # Push metadata to experiment headers
+        for result in output["results"]:
+            if ("header" in result and
+                    "metadata" in result["header"] and
+                    "metadata_index" in result["header"]["metadata"]):
+                metadata_index = result["header"]["metadata"]["metadata_index"]
+                result["header"]["metadata"] = metadata_list[metadata_index]
+
         # Add execution time
         output["time_taken"] = time.time() - start
 
diff --git a/releasenotes/notes/remove_circuit_metadata_from_qobj-324e7ea9b369ee67.yaml b/releasenotes/notes/remove_circuit_metadata_from_qobj-324e7ea9b369ee67.yaml
new file mode 100644
index 0000000000..b5c0266890
--- /dev/null
+++ b/releasenotes/notes/remove_circuit_metadata_from_qobj-324e7ea9b369ee67.yaml
@@ -0,0 +1,13 @@
+---
+fixes:
+  - |
+    Fixed a potential issue with running simulations on circuits that have the
+    :attr:`.QuantumCircuit.metadata` attribute set. The :attr:`~.QuantumCircuit.metadata`
+    attribute can be any python dictionary and previously qiskit-aer would attempt to
+    JSON serialize the contents of the attribute to process it with the rest
+    of the circuit input, even if the contents were not JSON serializable. This no longer
+    occurs as the :attr:`.QuantumCircuit.metadata` attribute is not used to run the
+    simulation, so now the contents are not serialized and instead are directly attached
+    to the :class:`qiskit.result.Result` object without attempting to JSON serialize
+    the contents.
+    Fixed `#1435 <https://github.com/Qiskit/qiskit-aer/issues/1435>`__
diff --git a/test/terra/backends/aer_simulator/test_metadata.py b/test/terra/backends/aer_simulator/test_metadata.py
new file mode 100644
index 0000000000..96902d322a
--- /dev/null
+++ b/test/terra/backends/aer_simulator/test_metadata.py
@@ -0,0 +1,104 @@
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2018, 2019, 2020, 2021, 2022.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+"""
+AerSimulator Integration Tests
+"""
+from math import sqrt
+from ddt import ddt
+from qiskit import transpile, QuantumCircuit
+from test.terra.reference import ref_algorithms
+
+from test.terra.backends.simulator_test_case import (
+    SimulatorTestCase, supported_methods)
+
+
+@ddt
+class TestMetadata(SimulatorTestCase):
+    """AerSimulator circuit metadata tests"""
+
+    @supported_methods(
+        ['automatic', 'statevector', 'density_matrix',
+         'matrix_product_state', 'extended_stabilizer'])
+    def test_single_circuit_metadata(self, method, device):
+        """Test circuits with object metadata."""
+        backend = self.backend(method=method, device=device)
+        metadata = {1: object}
+        circuit = QuantumCircuit(1, name='circ0', metadata=metadata.copy())
+        result = backend.run(circuit).result()
+        self.assertSuccess(result)
+        self.assertEqual(result.results[0].header.metadata, metadata)
+        self.assertEqual(circuit.metadata, metadata)
+
+    @supported_methods(
+        ['automatic', 'statevector', 'density_matrix',
+         'matrix_product_state', 'extended_stabilizer'])
+    def test_three_circuit_metadata(self, method, device):
+        """Test circuits with object metadata."""
+        backend = self.backend(method=method, device=device)
+
+        metadata0 = {0: object}
+        circuit0 = QuantumCircuit(1, name='circ0', metadata=metadata0.copy())
+        
+        metadata1 = {1: object}
+        circuit1 = QuantumCircuit(1, name='circ1', metadata=metadata1.copy())
+
+        metadata2 = {2: object}
+        circuit2 = QuantumCircuit(1, name='circ2', metadata=metadata2.copy())
+        
+        result = backend.run([circuit0, circuit1, circuit2]).result()
+        self.assertSuccess(result)
+        self.assertEqual(len(result.results), 3)
+        self.assertEqual(result.results[0].header.metadata, metadata0)
+        self.assertEqual(result.results[1].header.metadata, metadata1)
+        self.assertEqual(result.results[2].header.metadata, metadata2)
+        self.assertEqual(circuit0.metadata, metadata0)
+        self.assertEqual(circuit1.metadata, metadata1)
+        self.assertEqual(circuit2.metadata, metadata2)
+
+    @supported_methods(
+        ['automatic', 'statevector', 'density_matrix', 'matrix_product_state'])
+    def test_three_parameterized_circuit_metadata(self, method, device):
+        """Test circuits with object metadata."""
+        backend = self.backend(method=method, device=device)
+
+        metadata0 = {0: object}
+        circuit0 = QuantumCircuit(1, name='circ0', metadata=metadata0.copy())
+        circuit0.ry(0.1, 0)
+        circuit0.measure_all()
+        
+        metadata1 = {1: object}
+        circuit1 = QuantumCircuit(1, name='circ1', metadata=metadata1.copy())
+        circuit1.ry(0.1, 0)
+        circuit1.measure_all()
+
+        metadata2 = {2: object}
+        circuit2 = QuantumCircuit(1, name='circ2', metadata=metadata2.copy())
+        circuit2.ry(0.1, 0)
+        circuit2.measure_all()
+        
+        parameterizations=[[[[0, 0], [0, 1]]],
+                           [[[0, 0], [0, 1, 2]]],
+                           []]
+        
+        result = backend.run([circuit0, circuit1, circuit2],
+                             parameterizations=parameterizations).result()
+        self.assertSuccess(result)
+        self.assertEqual(len(result.results), 6)
+        self.assertEqual(result.results[0].header.metadata, metadata0)
+        self.assertEqual(result.results[1].header.metadata, metadata0)
+        self.assertEqual(result.results[2].header.metadata, metadata1)
+        self.assertEqual(result.results[3].header.metadata, metadata1)
+        self.assertEqual(result.results[4].header.metadata, metadata1)
+        self.assertEqual(result.results[5].header.metadata, metadata2)
+        self.assertEqual(circuit0.metadata, metadata0)
+        self.assertEqual(circuit1.metadata, metadata1)
+        self.assertEqual(circuit2.metadata, metadata2)

From 61e91e2277b72ff6e0feaf85054c06821fb1a6a0 Mon Sep 17 00:00:00 2001
From: Jun Doi <doichan@jp.ibm.com>
Date: Mon, 4 Apr 2022 19:13:28 +0900
Subject: [PATCH 09/10] thread control for Thrust CPU is now same as device=CPU

---
 ...x-thrust-cpu-threads-67db86b2edcf06b3.yaml |  6 +++++
 .../statevector/chunk/chunk_container.hpp     | 26 ++++++++++++++++---
 .../chunk/device_chunk_container.hpp          | 25 +++++++++++++-----
 .../statevector/qubitvector_thrust.hpp        | 20 ++++++++++++--
 4 files changed, 66 insertions(+), 11 deletions(-)
 create mode 100644 releasenotes/notes/fix-thrust-cpu-threads-67db86b2edcf06b3.yaml

diff --git a/releasenotes/notes/fix-thrust-cpu-threads-67db86b2edcf06b3.yaml b/releasenotes/notes/fix-thrust-cpu-threads-67db86b2edcf06b3.yaml
new file mode 100644
index 0000000000..5495540da3
--- /dev/null
+++ b/releasenotes/notes/fix-thrust-cpu-threads-67db86b2edcf06b3.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    device=Thrust was very slow for a small number of qubits because OpenMP
+    threading was always applied. This fix applies OpenMP threads in the same
+    way as device=CPU by using statevector_parallel_threshold.
diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp
index 5fd68798e4..69157fcb04 100644
--- a/src/simulators/statevector/chunk/chunk_container.hpp
+++ b/src/simulators/statevector/chunk/chunk_container.hpp
@@ -487,6 +487,10 @@ class ChunkContainer : public std::enable_shared_from_this<ChunkContainer<data_t
   uint_t num_cmemory_;
   mutable int_t conditional_bit_;
   bool keep_conditional_bit_;         //keep conditional bit alive
+  int_t num_pow2_qubits_;             //largest number of qubits that meets num_chunks_ = m*(2^num_pow2_qubits_)
+  bool density_matrix_;
+
+  int_t omp_threads_;                 //number of threads can be used for parallelization on CPU
 public:
   ChunkContainer()
   {
@@ -498,6 +502,8 @@ class ChunkContainer : public std::enable_shared_from_this<ChunkContainer<data_t
     conditional_bit_ = -1;
     keep_conditional_bit_ = false;
     matrix_bits_ = AER_DEFAULT_MATRIX_BITS;
+    density_matrix_ = false;
+    omp_threads_ = 1;
   }
   virtual ~ChunkContainer(){}
 
@@ -569,6 +575,11 @@ class ChunkContainer : public std::enable_shared_from_this<ChunkContainer<data_t
     keep_conditional_bit_ = keep;
   }
 
+  void set_omp_threads(int_t nthreads)
+  {
+    omp_threads_ = nthreads;
+  }
+
   virtual thrust::complex<data_t>& operator[](uint_t i) = 0;
 
   virtual uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers = AER_MAX_BUFFERS,bool multi_shots = false,int matrix_bit = AER_DEFAULT_MATRIX_BITS) = 0;
@@ -820,7 +831,10 @@ void ChunkContainer<data_t>::Execute(Function func,uint_t iChunk,uint_t count)
 #else
   uint_t size = count * func.size(chunk_bits_);
   auto ci = thrust::counting_iterator<uint_t>(0);
-  thrust::for_each_n(thrust::device, ci , size, func);
+  if(omp_threads_ > 1)
+    thrust::for_each_n(thrust::device, ci , size, func);
+  else
+    thrust::for_each_n(thrust::seq, ci , size, func);
 #endif
 
 }
@@ -971,7 +985,10 @@ void ChunkContainer<data_t>::ExecuteSum(double* pSum,Function func,uint_t iChunk
     auto ci = thrust::counting_iterator<uint_t>(0);
 
     double sum;
-    sum = thrust::transform_reduce(thrust::device, ci, ci + size, func,0.0,thrust::plus<double>());
+    if(omp_threads_ > 1)
+      sum = thrust::transform_reduce(thrust::device, ci, ci + size, func,0.0,thrust::plus<double>());
+    else
+      sum = thrust::transform_reduce(thrust::seq, ci, ci + size, func,0.0,thrust::plus<double>());
     if(count == 1 && pSum){
       *pSum = sum;
     }
@@ -1108,7 +1125,10 @@ void ChunkContainer<data_t>::ExecuteSum2(double* pSum,Function func,uint_t iChun
 
     auto ci = thrust::counting_iterator<uint_t>(0);
 
-    ret = thrust::transform_reduce(thrust::device, ci, ci + size, func,zero,complex_sum());
+    if(omp_threads_ > 1)
+      ret = thrust::transform_reduce(thrust::device, ci, ci + size, func,zero,complex_sum());
+    else
+      ret = thrust::transform_reduce(thrust::seq, ci, ci + size, func,zero,complex_sum());
 
     if(count == 1 && pSum){
       *((thrust::complex<double>*)pSum) = ret;
diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp
index 34e92ab1c8..c9ce2c9a07 100644
--- a/src/simulators/statevector/chunk/device_chunk_container.hpp
+++ b/src/simulators/statevector/chunk/device_chunk_container.hpp
@@ -617,7 +617,10 @@ void DeviceChunkContainer<data_t>::Zero(uint_t iChunk,uint_t count)
 #ifdef AER_THRUST_CUDA
   thrust::fill_n(thrust::cuda::par.on(stream_[iChunk]),data_.begin() + (iChunk << this->chunk_bits_),count,0.0);
 #else
-  thrust::fill_n(thrust::device,data_.begin() + (iChunk << this->chunk_bits_),count,0.0);
+  if(this->omp_threads_ > 1)
+    thrust::fill_n(thrust::device,data_.begin() + (iChunk << this->chunk_bits_),count,0.0);
+  else
+    thrust::fill_n(thrust::seq,data_.begin() + (iChunk << this->chunk_bits_),count,0.0);
 #endif
 }
 
@@ -665,12 +668,22 @@ reg_t DeviceChunkContainer<data_t>::sample_measure(uint_t iChunk,const std::vect
   cudaStreamSynchronize(stream_[iChunk]);
 
 #else
-  if(dot)
-    thrust::transform_inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),complex_dot_scan<data_t>(),thrust::plus<thrust::complex<data_t>>());
-  else
-    thrust::inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),thrust::plus<thrust::complex<data_t>>());
+  if(this->omp_threads_ > 1){
+    if(dot)
+      thrust::transform_inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),complex_dot_scan<data_t>(),thrust::plus<thrust::complex<data_t>>());
+    else
+      thrust::inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),thrust::plus<thrust::complex<data_t>>());
 
-  thrust::lower_bound(thrust::device, iter.begin(), iter.end(), rnds.begin(), rnds.begin() + SHOTS, samples.begin() ,complex_less<data_t>());
+    thrust::lower_bound(thrust::device, iter.begin(), iter.end(), rnds.begin(), rnds.begin() + SHOTS, samples.begin() ,complex_less<data_t>());
+  }
+  else{
+    if(dot)
+      thrust::transform_inclusive_scan(thrust::seq,iter.begin(),iter.end(),iter.begin(),complex_dot_scan<data_t>(),thrust::plus<thrust::complex<data_t>>());
+    else
+      thrust::inclusive_scan(thrust::seq,iter.begin(),iter.end(),iter.begin(),thrust::plus<thrust::complex<data_t>>());
+
+    thrust::lower_bound(thrust::seq, iter.begin(), iter.end(), rnds.begin(), rnds.begin() + SHOTS, samples.begin() ,complex_less<data_t>());
+  }
 #endif
 
   return samples;
diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp
index b18568f003..c183ab9abf 100644
--- a/src/simulators/statevector/qubitvector_thrust.hpp
+++ b/src/simulators/statevector/qubitvector_thrust.hpp
@@ -34,6 +34,11 @@
 
 #include "simulators/statevector/chunk/chunk_manager.hpp"
 
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+
 namespace AER {
 namespace QV {
 
@@ -463,7 +468,7 @@ class QubitVectorThrust {
   // Config settings
   //----------------------------------------------------------------------- 
   uint_t omp_threads_ = 1;     // Disable multithreading by default
-  uint_t omp_threshold_ = 1;  // Qubit threshold for multithreading when enabled
+  uint_t omp_threshold_ = 14;  // Qubit threshold for multithreading when enabled
   int sample_measure_index_size_ = 1; // Sample measure indexing qubit size
   double json_chop_threshold_ = 0;  // Threshold for choping small values
                                     // in JSON serialization
@@ -1053,6 +1058,10 @@ void QubitVectorThrust<data_t>::set_num_qubits(size_t num_qubits)
 
   register_blocking_ = false;
 
+  //set OpenMP threads for ThrustCPU
+  if(num_qubits_ > omp_threshold_ && omp_threads_ > 1)
+    chunk_.container()->set_omp_threads(omp_threads_);
+
 #ifdef AER_DEBUG
   if(chunk_.pos() == 0){
     spdlog::debug(" ==== Thrust qubit vector initialization {} qubits ====",num_qubits_);
@@ -1584,9 +1593,16 @@ void QubitVectorThrust<data_t>::apply_function_sum2(double* pSum,Function func,b
  ******************************************************************************/
 
 template <typename data_t>
-void QubitVectorThrust<data_t>::set_omp_threads(int n) {
+void QubitVectorThrust<data_t>::set_omp_threads(int n) 
+{
   if (n > 0)
     omp_threads_ = n;
+
+#ifdef _OPENMP
+  //disable nested parallel for ThrustCPU
+  if(omp_get_num_threads() > 1)
+    omp_threads_ = 1;
+#endif
 }
 
 template <typename data_t>

From ce1030a527465899fed6fa5bb9234cb36e493cb1 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Wed, 6 Apr 2022 00:11:14 -0400
Subject: [PATCH 10/10] Stop using deprecated BaseBackend class (#1501)

The BaseBackend class in qiskit-terra (along with the rest of the legacy
provider interface) is deprecated and will soon be removed in
Qiskit/qiskit-terra#7886. To avoid potential compatibility issues or
deprecation warnings, we should no longer use these legacy classes. This
commit removes the last 2 uses of them to avoid issues
moving forward.

Co-authored-by: Hiroshi Horii <hhorii@users.noreply.github.com>
(cherry picked from commit 07a853269f7d436bc79493fd484a17151654c032)
---
 qiskit/providers/aer/noise/noise_model.py                | 9 ++++++---
 .../aer/pulse/system_models/pulse_system_model.py        | 4 ++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/qiskit/providers/aer/noise/noise_model.py b/qiskit/providers/aer/noise/noise_model.py
index 082ec62e1a..bdbed55143 100644
--- a/qiskit/providers/aer/noise/noise_model.py
+++ b/qiskit/providers/aer/noise/noise_model.py
@@ -21,7 +21,6 @@
 from numpy import ndarray
 
 from qiskit.circuit import Instruction, Delay
-from qiskit.providers import BaseBackend, BackendV1, BackendV2
 from qiskit.providers.exceptions import BackendPropertyError
 from qiskit.providers.models import BackendProperties
 from qiskit.transpiler import PassManager
@@ -303,10 +302,14 @@ def from_backend(cls, backend,
         Raises:
             NoiseError: If the input backend is not valid.
         """
-        if isinstance(backend, BackendV2):
+        backend_interface_version = getattr(backend, "version", None)
+        if not isinstance(backend_interface_version, int):
+            backend_interface_version = 0
+
+        if backend_interface_version == 2:
             raise NoiseError(
                 "NoiseModel.from_backend does not currently support V2 Backends.")
-        if isinstance(backend, (BaseBackend, BackendV1)):
+        if backend_interface_version <= 1:
             properties = backend.properties()
             configuration = backend.configuration()
             basis_gates = configuration.basis_gates
diff --git a/qiskit/providers/aer/pulse/system_models/pulse_system_model.py b/qiskit/providers/aer/pulse/system_models/pulse_system_model.py
index 72efc860b0..67a0af9b59 100644
--- a/qiskit/providers/aer/pulse/system_models/pulse_system_model.py
+++ b/qiskit/providers/aer/pulse/system_models/pulse_system_model.py
@@ -17,7 +17,7 @@
 
 from warnings import warn
 from collections import OrderedDict
-from qiskit.providers import BaseBackend, Backend
+from qiskit.providers import Backend
 from ...aererror import AerError
 from .hamiltonian_model import HamiltonianModel
 
@@ -94,7 +94,7 @@ def from_backend(cls, backend, subsystem_list=None):
             AerError: If channel or u_channel_lo are invalid.
         """
 
-        if not isinstance(backend, (BaseBackend, Backend)):
+        if not isinstance(backend, Backend):
             raise AerError("{} is not a Qiskit backend".format(backend))
 
         # get relevant information from backend