Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions qiskit/providers/aer/backends/aer_simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,11 @@ class AerSimulator(AerBackend):
number of active circuit qubits is greater than this value batching of
simulation shots will not be used. (Default: 16).

* ``num_threads_per_device`` (int): This option sets the number of
threads per device. For GPU simulation, this value sets the number of
threads per GPU. This parameter is used to optimize Pauli noise
simulation with multiple GPUs. (Default: 1).

These backend options only apply when using the ``"statevector"``
simulation method:

Expand Down Expand Up @@ -551,6 +556,7 @@ def _default_options(cls):
# multi-shots optimization options (GPU only)
batched_shots_gpu=True,
batched_shots_gpu_max_qubits=16,
num_threads_per_device=1,
# statevector options
statevector_parallel_threshold=14,
statevector_sample_measure_opt=10,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
fixes:
- |
Fix a performance issue in the multi-shots batched optimization for GPU when
using Pauli noise. This fix enables multi-threading for runtime noise
sampling, and uses nested OpenMP parallelization when using multiple GPUs.
This is a fix for
`issue 1473 <https://github.com/Qiskit/qiskit-aer/issues/1473>`__
14 changes: 4 additions & 10 deletions src/controllers/aer_controller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -978,19 +978,13 @@ Result Controller::execute(std::vector<Circuit> &circuits,
parallel_experiments_ < max_parallel_threads_) {
// Nested parallel experiments
parallel_nested_ = true;
#ifdef _WIN32
omp_set_nested(1);
#else
omp_set_max_active_levels(3);
#endif

//nested should be set to zero if num_threads clause will be used
omp_set_nested(0);

result.metadata.add(parallel_nested_, "omp_nested");
} else {
parallel_nested_ = false;
#ifdef _WIN32
omp_set_nested(0);
#else
omp_set_max_active_levels(1);
#endif
}
#endif

Expand Down
26 changes: 20 additions & 6 deletions src/framework/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1291,12 +1291,19 @@ size_t get_system_memory_mb()

//apply OpenMP parallel loop to lambda function if enabled
template<typename Lambda>
void apply_omp_parallel_for(bool enabled, int_t i_begin, int_t i_end, Lambda& func)
void apply_omp_parallel_for(bool enabled, int_t i_begin, int_t i_end, Lambda& func, int nthreads = 0)
{
if(enabled){
if(nthreads > 0){
#pragma omp parallel for num_threads(nthreads)
for(int_t i=i_begin;i<i_end;i++)
func(i);
}
else{
#pragma omp parallel for
for(int_t i=i_begin;i<i_end;i++)
func(i);
for(int_t i=i_begin;i<i_end;i++)
func(i);
}
}
else{
for(int_t i=i_begin;i<i_end;i++)
Expand All @@ -1306,13 +1313,20 @@ void apply_omp_parallel_for(bool enabled, int_t i_begin, int_t i_end, Lambda& fu

//apply OpenMP parallel loop to lambda function and return reduced double if enabled
template<typename Lambda>
double apply_omp_parallel_for_reduction(bool enabled, int_t i_begin, int_t i_end, Lambda& func)
double apply_omp_parallel_for_reduction(bool enabled, int_t i_begin, int_t i_end, Lambda& func, int nthreads = 0)
{
double val = 0.0;
if(enabled){
if(nthreads > 0){
#pragma omp parallel for reduction(+:val) num_threads(nthreads)
for(int_t i=i_begin;i<i_end;i++)
val += func(i);
}
else{
#pragma omp parallel for reduction(+:val)
for(int_t i=i_begin;i<i_end;i++)
val += func(i);
for(int_t i=i_begin;i<i_end;i++)
val += func(i);
}
}
else{
for(int_t i=i_begin;i<i_end;i++)
Expand Down
1 change: 1 addition & 0 deletions src/noise/noise_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1047,6 +1047,7 @@ inline void from_json(const json_t &js, NoiseModel &model) {
model = NoiseModel(js);
}


//-------------------------------------------------------------------------
} // end namespace Noise
//-------------------------------------------------------------------------
Expand Down
2 changes: 2 additions & 0 deletions src/noise/quantum_error.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,8 @@ void QuantumError::load_from_json(const json_t &js) {
set_circuits(circuits, probs);
}



//-------------------------------------------------------------------------
} // end namespace Noise
//-------------------------------------------------------------------------
Expand Down
4 changes: 3 additions & 1 deletion src/simulators/density_matrix/densitymatrix_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,9 @@ size_t State<densmat_t>::required_memory_mb(
}

template <class densmat_t>
void State<densmat_t>::set_config(const json_t &config) {
void State<densmat_t>::set_config(const json_t &config)
{
BaseState::set_config(config);

// Set threshold for truncating snapshots
JSON::get_value(json_chop_threshold_, "chop_threshold", config);
Expand Down
4 changes: 3 additions & 1 deletion src/simulators/state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,8 @@ class State {
complex_t global_phase_ = 1;

int_t max_matrix_qubits_ = 0;

std::string sim_device_name_ = "CPU";
};


Expand All @@ -357,7 +359,7 @@ State<state_t>::~State(void)
// Loads base-state settings from the JSON configuration: the value stored
// under the "device" key is read into sim_device_name_.
template <class state_t>
void State<state_t>::set_config(const json_t &config)
{
  JSON::get_value(this->sim_device_name_, "device", config);
}

template <class state_t>
Expand Down
79 changes: 53 additions & 26 deletions src/simulators/state_chunk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,7 @@ class StateChunk : public State<state_t> {
uint_t num_groups_; //number of groups of chunks
reg_t top_chunk_of_group_;
reg_t num_chunks_in_group_;
int num_threads_per_group_; //number of outer threads per group

//cuStateVec settings
bool cuStateVec_enable_ = false;
Expand Down Expand Up @@ -532,6 +533,11 @@ void StateChunk<state_t>::set_config(const json_t &config)
{
BaseState::set_config(config);

num_threads_per_group_ = 1;
if(JSON::check_key("num_threads_per_device", config)) {
JSON::get_value(num_threads_per_group_, "num_threads_per_device", config);
}

#ifdef AER_CUSTATEVEC
//cuStateVec configs
if(JSON::check_key("cuStateVec_enable", config)) {
Expand Down Expand Up @@ -616,16 +622,9 @@ bool StateChunk<state_t>::allocate(uint_t num_qubits,uint_t block_bits,uint_t nu
global_chunk_index_ = chunk_index_begin_[distributed_rank_];
local_shot_index_ = 0;

if(multi_shots_parallelization_){
allocate_qregs(std::min(num_local_chunks_,max_batched_shots_));
}
else{
allocate_qregs(num_local_chunks_);
}

thrust_optimization_ = false;
chunk_omp_parallel_ = false;
if(qregs_[0].name().find("gpu") != std::string::npos){
if(BaseState::sim_device_name_ == "GPU"){
#ifdef _OPENMP
if(omp_get_num_threads() == 1)
chunk_omp_parallel_ = true;
Expand All @@ -644,11 +643,18 @@ bool StateChunk<state_t>::allocate(uint_t num_qubits,uint_t block_bits,uint_t nu
thrust_optimization_ = true; //cuStateVec does not handle global chunk index for diagonal matrix
#endif
}
else if(qregs_[0].name().find("thrust") != std::string::npos){
else if(BaseState::sim_device_name_ == "Thrust"){
thrust_optimization_ = true;
chunk_omp_parallel_ = false;
}

if(multi_shots_parallelization_){
allocate_qregs(std::min(num_local_chunks_,max_batched_shots_));
}
else{
allocate_qregs(num_local_chunks_);
}


//initialize qubit map
qubit_map_.resize(num_qubits_);
Expand Down Expand Up @@ -677,11 +683,13 @@ bool StateChunk<state_t>::allocate_qregs(uint_t num_chunks)
uint_t chunk_id = multi_chunk_distribution_ ? global_chunk_index_ : 0;
bool ret = true;
qregs_[0].set_max_matrix_bits(BaseState::max_matrix_qubits_);
qregs_[0].set_num_threads_per_group(num_threads_per_group_);
qregs_[0].cuStateVec_enable(cuStateVec_enable_);
ret &= qregs_[0].chunk_setup(chunk_bits_*qubit_scale(), num_qubits_*qubit_scale(), chunk_id, num_chunks);
for(i=1;i<num_chunks;i++){
uint_t gid = i + chunk_id;
ret &= qregs_[i].chunk_setup(qregs_[0],gid);
qregs_[i].set_num_threads_per_group(num_threads_per_group_);
}

//initialize groups
Expand Down Expand Up @@ -922,7 +930,7 @@ void StateChunk<state_t>::apply_ops_multi_shots(InputIterator first, InputIterat
//apply ops to multiple-shots
if(num_groups_ > 1 && chunk_omp_parallel_){
std::vector<ExperimentResult> par_results(num_groups_);
#pragma omp parallel for
#pragma omp parallel for num_threads(num_groups_)
for(i=0;i<num_groups_;i++)
apply_ops_multi_shots_for_group(i, first, last, noise, par_results[i], rng_seed, final_ops);

Expand Down Expand Up @@ -952,10 +960,15 @@ void StateChunk<state_t>::apply_ops_multi_shots_for_group(int_t i_group,
const Noise::NoiseModel &noise,
ExperimentResult &result,
uint_t rng_seed,
bool final_ops)
bool final_ops)
{
uint_t istate = top_chunk_of_group_[i_group];
std::vector<RngEngine> rng(num_chunks_in_group_[i_group]);
#ifdef _OPENMP
int num_inner_threads = omp_get_max_threads() / omp_get_num_threads();
#else
int num_inner_threads = 1;
#endif

for(uint_t j=top_chunk_of_group_[i_group];j<top_chunk_of_group_[i_group+1];j++)
rng[j-top_chunk_of_group_[i_group]].set_seed(rng_seed + global_chunk_index_ + local_shot_index_ + j);
Expand All @@ -964,32 +977,46 @@ void StateChunk<state_t>::apply_ops_multi_shots_for_group(int_t i_group,
if(op->type == Operations::OpType::qerror_loc){
//sample error here
uint_t count = num_chunks_in_group_[i_group];
uint_t max_ops = 0;
bool pauli_only = true;
std::vector<std::vector<Operations::Op>> noise_ops(count);
for(uint_t j=0;j<count;j++){
noise_ops[j] = noise.sample_noise_loc(*op,rng[j]);

if(noise_ops[j].size() == 0 || (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))
continue;
else{
if(max_ops < noise_ops[j].size())
max_ops = noise_ops[j].size();
if(pauli_only){
uint_t count_ops = 0;
uint_t non_pauli_gate_count = 0;
if(num_inner_threads > 1){
#pragma omp parallel for reduction(+: count_ops,non_pauli_gate_count) num_threads(num_inner_threads)
for(int_t j=0;j<count;j++){
noise_ops[j] = noise.sample_noise_loc(*op,rng[j]);

if(!(noise_ops[j].size() == 0 || (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))){
count_ops++;
for(int_t k=0;k<noise_ops[j].size();k++){
if(noise_ops[j][k].name != "id" && noise_ops[j][k].name != "x" && noise_ops[j][k].name != "y" && noise_ops[j][k].name != "z" && noise_ops[j][k].name != "pauli"){
non_pauli_gate_count++;
break;
}
}
}
}
}
else{
for(int_t j=0;j<count;j++){
noise_ops[j] = noise.sample_noise_loc(*op,rng[j]);

if(!(noise_ops[j].size() == 0 || (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))){
count_ops++;
for(int_t k=0;k<noise_ops[j].size();k++){
if(noise_ops[j][k].name != "x" && noise_ops[j][k].name != "y" && noise_ops[j][k].name != "z"
&& noise_ops[j][k].name != "pauli" && noise_ops[j][k].name != "id"){
pauli_only = false;
if(noise_ops[j][k].name != "id" && noise_ops[j][k].name != "x" && noise_ops[j][k].name != "y" && noise_ops[j][k].name != "z" && noise_ops[j][k].name != "pauli"){
non_pauli_gate_count++;
break;
}
}
}
}
}

if(max_ops == 0){
if(count_ops == 0){
continue; //do nothing
}
if(pauli_only){ //batched Pauli can be applied (optimization for Pauli error)
if(non_pauli_gate_count == 0){ //ptimization for Pauli error
qregs_[istate].apply_batched_pauli_ops(noise_ops);
}
else{
Expand Down
Loading