Qiskit · hhorii · Mar 29, 2022 · Feb 8, 2022 · Feb 8, 2022 · Feb 9, 2022
diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py
@@ -252,6 +252,11 @@ class AerSimulator(AerBackend):
       number of active circuit qubits is greater than this value batching of
       simulation shots will not be used. (Default: 16).
 
+    * ``num_threads_per_device`` (int): This option sets the number of
+      threads per device. For GPU simulation, this value sets the number of
+      threads per GPU. This parameter is used to optimize Pauli noise
+      simulation with multiple GPUs (Default: 1).
+
     These backend options only apply when using the ``"statevector"``
     simulation method:
 
@@ -551,6 +556,7 @@ def _default_options(cls):
             # multi-shots optimization options (GPU only)
             batched_shots_gpu=True,
             batched_shots_gpu_max_qubits=16,
+            num_threads_per_device=1,
             # statevector options
             statevector_parallel_threshold=14,
             statevector_sample_measure_opt=10,

diff --git a/releasenotes/notes/multi-shots-pauli-noise-improvements-87637a02e81806cf.yaml b/releasenotes/notes/multi-shots-pauli-noise-improvements-87637a02e81806cf.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixed a performance issue in the multi-shot batched optimization for GPU
+    when using Pauli noise. This fix enables multi-threaded runtime noise
+    sampling, and uses nested OpenMP parallelization when using multiple GPUs.
+    This is a fix for
+    `issue 1473 <https://github.com/Qiskit/qiskit-aer/issues/1473>`__.
diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp
@@ -978,19 +978,13 @@ Result Controller::execute(std::vector<Circuit> &circuits,
         parallel_experiments_ < max_parallel_threads_) {
       // Nested parallel experiments
       parallel_nested_ = true;
-#ifdef _WIN32
-      omp_set_nested(1);
-#else
-      omp_set_max_active_levels(3);
-#endif
+
+      // Nested parallelism should be disabled (set to zero) when the num_threads clause is used
+      omp_set_nested(0);
+
       result.metadata.add(parallel_nested_, "omp_nested");
     } else {
       parallel_nested_ = false;
-#ifdef _WIN32
-      omp_set_nested(0);
-#else
-      omp_set_max_active_levels(1);
-#endif
     }
 #endif
 

diff --git a/src/framework/utils.hpp b/src/framework/utils.hpp
@@ -1291,12 +1291,19 @@ size_t get_system_memory_mb()
 
 //apply OpenMP parallel loop to lambda function if enabled
 template<typename Lambda>
-void apply_omp_parallel_for(bool enabled, int_t i_begin, int_t i_end, Lambda& func)
+void apply_omp_parallel_for(bool enabled, int_t i_begin, int_t i_end, Lambda& func, int nthreads = 0)
 {
   if(enabled){
+    if(nthreads > 0){
+#pragma omp parallel for num_threads(nthreads)
+      for(int_t i=i_begin;i<i_end;i++)
+        func(i);
+    }
+    else{
 #pragma omp parallel for
-    for(int_t i=i_begin;i<i_end;i++)
-      func(i);
+      for(int_t i=i_begin;i<i_end;i++)
+        func(i);
+    }
   }
   else{
     for(int_t i=i_begin;i<i_end;i++)
@@ -1306,13 +1313,20 @@ void apply_omp_parallel_for(bool enabled, int_t i_begin, int_t i_end, Lambda& fu
 
 //apply OpenMP parallel loop to lambda function and return reduced double if enabled
 template<typename Lambda>
-double apply_omp_parallel_for_reduction(bool enabled, int_t i_begin, int_t i_end, Lambda& func)
+double apply_omp_parallel_for_reduction(bool enabled, int_t i_begin, int_t i_end, Lambda& func, int nthreads = 0)
 {
   double val = 0.0;
   if(enabled){
+    if(nthreads > 0){
+#pragma omp parallel for reduction(+:val) num_threads(nthreads)
+      for(int_t i=i_begin;i<i_end;i++)
+        val += func(i);
+    }
+    else{
 #pragma omp parallel for reduction(+:val)
-    for(int_t i=i_begin;i<i_end;i++)
-      val += func(i);
+      for(int_t i=i_begin;i<i_end;i++)
+        val += func(i);
+    }
   }
   else{
     for(int_t i=i_begin;i<i_end;i++)

diff --git a/src/noise/noise_model.hpp b/src/noise/noise_model.hpp
@@ -1047,6 +1047,7 @@ inline void from_json(const json_t &js, NoiseModel &model) {
   model = NoiseModel(js);
 }
 
+
 //-------------------------------------------------------------------------
 } // end namespace Noise
 //-------------------------------------------------------------------------

diff --git a/src/noise/quantum_error.hpp b/src/noise/quantum_error.hpp
@@ -387,6 +387,8 @@ void QuantumError::load_from_json(const json_t &js) {
   set_circuits(circuits, probs);
 }
 
+
+
 //-------------------------------------------------------------------------
 } // end namespace Noise
 //-------------------------------------------------------------------------

diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp
@@ -701,7 +701,9 @@ size_t State<densmat_t>::required_memory_mb(
 }
 
 template <class densmat_t>
-void State<densmat_t>::set_config(const json_t &config) {
+void State<densmat_t>::set_config(const json_t &config) 
+{
+  BaseState::set_config(config);
 
   // Set threshold for truncating snapshots
   JSON::get_value(json_chop_threshold_, "chop_threshold", config);

diff --git a/src/simulators/state.hpp b/src/simulators/state.hpp
@@ -342,6 +342,8 @@ class State {
   complex_t global_phase_ = 1;
 
   int_t max_matrix_qubits_ = 0;
+
+  std::string sim_device_name_ = "CPU";
 };
 
 
@@ -357,7 +359,7 @@ State<state_t>::~State(void)
 template <class state_t>
 void State<state_t>::set_config(const json_t &config) 
 {
-
+  JSON::get_value(sim_device_name_, "device", config);
 }
 
 template <class state_t>

diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp
@@ -390,6 +390,7 @@ class StateChunk : public State<state_t> {
   uint_t num_groups_;            //number of groups of chunks
   reg_t top_chunk_of_group_;
   reg_t num_chunks_in_group_;
+  int num_threads_per_group_;   //number of outer threads per group
 
   //cuStateVec settings
   bool cuStateVec_enable_ = false;
@@ -532,6 +533,11 @@ void StateChunk<state_t>::set_config(const json_t &config)
 {
   BaseState::set_config(config);
 
+  num_threads_per_group_ = 1;
+  if(JSON::check_key("num_threads_per_device", config)) {
+    JSON::get_value(num_threads_per_group_, "num_threads_per_device", config);
+  }
+
 #ifdef AER_CUSTATEVEC
   //cuStateVec configs
   if(JSON::check_key("cuStateVec_enable", config)) {
@@ -616,16 +622,9 @@ bool StateChunk<state_t>::allocate(uint_t num_qubits,uint_t block_bits,uint_t nu
   global_chunk_index_ = chunk_index_begin_[distributed_rank_];
   local_shot_index_ = 0;
 
-  if(multi_shots_parallelization_){
-    allocate_qregs(std::min(num_local_chunks_,max_batched_shots_));
-  }
-  else{
-    allocate_qregs(num_local_chunks_);
-  }
-
   thrust_optimization_ = false;
   chunk_omp_parallel_ = false;
-  if(qregs_[0].name().find("gpu") != std::string::npos){
+  if(BaseState::sim_device_name_ == "GPU"){
 #ifdef _OPENMP
     if(omp_get_num_threads() == 1)
       chunk_omp_parallel_ = true;
@@ -644,11 +643,18 @@ bool StateChunk<state_t>::allocate(uint_t num_qubits,uint_t block_bits,uint_t nu
       thrust_optimization_ = true;    //cuStateVec does not handle global chunk index for diagonal matrix
 #endif
   }
-  else if(qregs_[0].name().find("thrust") != std::string::npos){
+  else if(BaseState::sim_device_name_ == "Thrust"){
     thrust_optimization_ = true;
     chunk_omp_parallel_ = false;
   }
 
+  if(multi_shots_parallelization_){
+    allocate_qregs(std::min(num_local_chunks_,max_batched_shots_));
+  }
+  else{
+    allocate_qregs(num_local_chunks_);
+  }
+
 
   //initialize qubit map
   qubit_map_.resize(num_qubits_);
@@ -677,11 +683,13 @@ bool StateChunk<state_t>::allocate_qregs(uint_t num_chunks)
   uint_t chunk_id = multi_chunk_distribution_ ? global_chunk_index_ : 0;
   bool ret = true;
   qregs_[0].set_max_matrix_bits(BaseState::max_matrix_qubits_);
+  qregs_[0].set_num_threads_per_group(num_threads_per_group_);
   qregs_[0].cuStateVec_enable(cuStateVec_enable_);
   ret &= qregs_[0].chunk_setup(chunk_bits_*qubit_scale(), num_qubits_*qubit_scale(), chunk_id, num_chunks);
   for(i=1;i<num_chunks;i++){
     uint_t gid = i + chunk_id;
     ret &= qregs_[i].chunk_setup(qregs_[0],gid);
+    qregs_[i].set_num_threads_per_group(num_threads_per_group_);
   }
 
   //initialize groups
@@ -922,7 +930,7 @@ void StateChunk<state_t>::apply_ops_multi_shots(InputIterator first, InputIterat
     //apply ops to multiple-shots
     if(num_groups_ > 1 && chunk_omp_parallel_){
       std::vector<ExperimentResult> par_results(num_groups_);
-#pragma omp parallel for
+#pragma omp parallel for num_threads(num_groups_)
       for(i=0;i<num_groups_;i++)
         apply_ops_multi_shots_for_group(i, first, last, noise, par_results[i], rng_seed, final_ops);
 
@@ -952,10 +960,15 @@ void StateChunk<state_t>::apply_ops_multi_shots_for_group(int_t i_group,
                                const Noise::NoiseModel &noise,
                                ExperimentResult &result,
                                uint_t rng_seed,
-                               bool final_ops) 
+                               bool final_ops)
 {
   uint_t istate = top_chunk_of_group_[i_group];
   std::vector<RngEngine> rng(num_chunks_in_group_[i_group]);
+#ifdef _OPENMP
+  int num_inner_threads = omp_get_max_threads() / omp_get_num_threads();
+#else
+  int num_inner_threads = 1;
+#endif
 
   for(uint_t j=top_chunk_of_group_[i_group];j<top_chunk_of_group_[i_group+1];j++)
     rng[j-top_chunk_of_group_[i_group]].set_seed(rng_seed + global_chunk_index_ + local_shot_index_ + j);
@@ -964,32 +977,46 @@ void StateChunk<state_t>::apply_ops_multi_shots_for_group(int_t i_group,
     if(op->type == Operations::OpType::qerror_loc){
       //sample error here
       uint_t count = num_chunks_in_group_[i_group];
-      uint_t max_ops = 0;
-      bool pauli_only = true;
       std::vector<std::vector<Operations::Op>> noise_ops(count);
-      for(uint_t j=0;j<count;j++){
-        noise_ops[j] = noise.sample_noise_loc(*op,rng[j]);
 
-        if(noise_ops[j].size() == 0 || (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))
-          continue;
-        else{
-          if(max_ops < noise_ops[j].size())
-            max_ops = noise_ops[j].size();
-          if(pauli_only){
+      uint_t count_ops = 0;
+      uint_t non_pauli_gate_count = 0;
+      if(num_inner_threads > 1){
+#pragma omp parallel for reduction(+: count_ops,non_pauli_gate_count) num_threads(num_inner_threads)
+        for(int_t j=0;j<count;j++){
+          noise_ops[j] = noise.sample_noise_loc(*op,rng[j]);
+
+          if(!(noise_ops[j].size() == 0 || (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))){
+            count_ops++;
+            for(int_t k=0;k<noise_ops[j].size();k++){
+              if(noise_ops[j][k].name != "id" && noise_ops[j][k].name != "x" && noise_ops[j][k].name != "y" && noise_ops[j][k].name != "z" && noise_ops[j][k].name != "pauli"){
+                non_pauli_gate_count++;
+                break;
+              }
+            }
+          }
+        }
+      }
+      else{
+        for(int_t j=0;j<count;j++){
+          noise_ops[j] = noise.sample_noise_loc(*op,rng[j]);
+
+          if(!(noise_ops[j].size() == 0 || (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))){
+            count_ops++;
             for(int_t k=0;k<noise_ops[j].size();k++){
-              if(noise_ops[j][k].name != "x" && noise_ops[j][k].name != "y" && noise_ops[j][k].name != "z" 
-                                             && noise_ops[j][k].name != "pauli" && noise_ops[j][k].name != "id"){
-                pauli_only = false;
+              if(noise_ops[j][k].name != "id" && noise_ops[j][k].name != "x" && noise_ops[j][k].name != "y" && noise_ops[j][k].name != "z" && noise_ops[j][k].name != "pauli"){
+                non_pauli_gate_count++;
+                break;
               }
             }
           }
         }
       }
 
-      if(max_ops == 0){
+      if(count_ops == 0){
         continue;   //do nothing
       }
-      if(pauli_only){   //batched Pauli can be applied (optimization for Pauli error)
+      if(non_pauli_gate_count == 0){   //optimization for Pauli error
         qregs_[istate].apply_batched_pauli_ops(noise_ops);
       }
       else{