Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
fixes:
- |
device=Thrust was very slow for small numbers of qubits because OpenMP
threading was always applied. This fix applies OpenMP threading in the same
way as device=CPU, by using statevector_parallel_threshold.
2 changes: 1 addition & 1 deletion src/controllers/aer_controller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,7 @@ Result Controller::execute(std::vector<Circuit> &circuits,
const int NUM_RESULTS = result.results.size();
//The following two branches look very similar, but they must be kept separate to avoid nested OpenMP loops, which cause performance degradation
//(DO NOT use an if statement in #pragma omp)
if (parallel_experiments_ == 1 || sim_device_ == Device::ThrustCPU) {
if (parallel_experiments_ == 1) {
for (int j = 0; j < NUM_RESULTS; ++j) {
set_parallelization_circuit(circuits[j], noise_model, methods[j]);
run_circuit(circuits[j], noise_model,methods[j],
Expand Down
23 changes: 20 additions & 3 deletions src/simulators/statevector/chunk/chunk_container.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ class ChunkContainer : public std::enable_shared_from_this<ChunkContainer<data_t
bool keep_conditional_bit_; //keep conditional bit alive
int_t num_pow2_qubits_; //largest number of qubits that meets num_chunks_ = m*(2^num_pow2_qubits_)
bool density_matrix_;

int_t omp_threads_; //number of threads can be used for parallelization on CPU
public:
ChunkContainer()
{
Expand All @@ -121,6 +123,7 @@ class ChunkContainer : public std::enable_shared_from_this<ChunkContainer<data_t
keep_conditional_bit_ = false;
matrix_bits_ = AER_DEFAULT_MATRIX_BITS;
density_matrix_ = false;
omp_threads_ = 1;
}
virtual ~ChunkContainer(){}

Expand Down Expand Up @@ -197,6 +200,11 @@ class ChunkContainer : public std::enable_shared_from_this<ChunkContainer<data_t
chunk_index_ = chunk_index;
}

//Set the number of threads this container may use for parallelization on CPU.
//The stored value is compared against 1 by the thrust calls in this container:
//when it is > 1 they run with the thrust::device policy, otherwise with thrust::seq.
//No validation is done here; the value is stored as given.
void set_omp_threads(int_t nthreads)
{
omp_threads_ = nthreads;
}

virtual thrust::complex<data_t>& operator[](uint_t i) = 0;

virtual uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers = AER_MAX_BUFFERS,bool multi_shots = false,int matrix_bit = AER_DEFAULT_MATRIX_BITS, bool density_matrix = false) = 0;
Expand Down Expand Up @@ -485,7 +493,10 @@ void ChunkContainer<data_t>::Execute(Function func,uint_t iChunk,uint_t count)
else
size = count * func.size(chunk_bits_);
auto ci = thrust::counting_iterator<uint_t>(0);
thrust::for_each_n(thrust::device, ci , size, func);
if(omp_threads_ > 1)
thrust::for_each_n(thrust::device, ci , size, func);
else
thrust::for_each_n(thrust::seq, ci , size, func);
#endif

}
Expand Down Expand Up @@ -642,7 +653,10 @@ void ChunkContainer<data_t>::ExecuteSum(double* pSum,Function func,uint_t iChunk
auto ci = thrust::counting_iterator<uint_t>(0);

double sum;
sum = thrust::transform_reduce(thrust::device, ci, ci + size, func,0.0,thrust::plus<double>());
if(omp_threads_ > 1)
sum = thrust::transform_reduce(thrust::device, ci, ci + size, func,0.0,thrust::plus<double>());
else
sum = thrust::transform_reduce(thrust::seq, ci, ci + size, func,0.0,thrust::plus<double>());
if(count == 1 && pSum){
*pSum = sum;
}
Expand Down Expand Up @@ -783,7 +797,10 @@ void ChunkContainer<data_t>::ExecuteSum2(double* pSum,Function func,uint_t iChun

auto ci = thrust::counting_iterator<uint_t>(0);

ret = thrust::transform_reduce(thrust::device, ci, ci + size, func,zero,complex_sum());
if(omp_threads_ > 1)
ret = thrust::transform_reduce(thrust::device, ci, ci + size, func,zero,complex_sum());
else
ret = thrust::transform_reduce(thrust::seq, ci, ci + size, func,zero,complex_sum());

if(count == 1 && pSum){
*((thrust::complex<double>*)pSum) = ret;
Expand Down
25 changes: 19 additions & 6 deletions src/simulators/statevector/chunk/device_chunk_container.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,10 @@ void DeviceChunkContainer<data_t>::Zero(uint_t iChunk,uint_t count)
#ifdef AER_THRUST_CUDA
thrust::fill_n(thrust::cuda::par.on(stream_),data_.begin() + (iChunk << this->chunk_bits_),count,0.0);
#else
thrust::fill_n(thrust::device,data_.begin() + (iChunk << this->chunk_bits_),count,0.0);
if(this->omp_threads_ > 1)
thrust::fill_n(thrust::device,data_.begin() + (iChunk << this->chunk_bits_),count,0.0);
else
thrust::fill_n(thrust::seq,data_.begin() + (iChunk << this->chunk_bits_),count,0.0);
#endif
}

Expand Down Expand Up @@ -700,12 +703,22 @@ reg_t DeviceChunkContainer<data_t>::sample_measure(uint_t iChunk,const std::vect
cudaStreamSynchronize(stream_);

#else
if(dot)
thrust::transform_inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),complex_dot_scan<data_t>(),thrust::plus<thrust::complex<data_t>>());
else
thrust::inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),thrust::plus<thrust::complex<data_t>>());
if(this->omp_threads_ > 1){
if(dot)
thrust::transform_inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),complex_dot_scan<data_t>(),thrust::plus<thrust::complex<data_t>>());
else
thrust::inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),thrust::plus<thrust::complex<data_t>>());

thrust::lower_bound(thrust::device, iter.begin(), iter.end(), rnds.begin(), rnds.begin() + SHOTS, samples.begin() ,complex_less<data_t>());
thrust::lower_bound(thrust::device, iter.begin(), iter.end(), rnds.begin(), rnds.begin() + SHOTS, samples.begin() ,complex_less<data_t>());
}
else{
if(dot)
thrust::transform_inclusive_scan(thrust::seq,iter.begin(),iter.end(),iter.begin(),complex_dot_scan<data_t>(),thrust::plus<thrust::complex<data_t>>());
else
thrust::inclusive_scan(thrust::seq,iter.begin(),iter.end(),iter.begin(),thrust::plus<thrust::complex<data_t>>());

thrust::lower_bound(thrust::seq, iter.begin(), iter.end(), rnds.begin(), rnds.begin() + SHOTS, samples.begin() ,complex_less<data_t>());
}
#endif

return samples;
Expand Down
20 changes: 18 additions & 2 deletions src/simulators/statevector/qubitvector_thrust.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@

#include "simulators/statevector/chunk/chunk_manager.hpp"

#ifdef _OPENMP
#include <omp.h>
#endif


namespace AER {
namespace QV {

Expand Down Expand Up @@ -483,7 +488,7 @@ class QubitVectorThrust {
// Config settings
//-----------------------------------------------------------------------
uint_t omp_threads_ = 1; // Disable multithreading by default
uint_t omp_threshold_ = 1; // Qubit threshold for multithreading when enabled
uint_t omp_threshold_ = 14; // Qubit threshold for multithreading when enabled
int sample_measure_index_size_ = 1; // Sample measure indexing qubit size
double json_chop_threshold_ = 0; // Threshold for choping small values
// in JSON serialization
Expand Down Expand Up @@ -901,6 +906,10 @@ void QubitVectorThrust<data_t>::set_num_qubits(size_t num_qubits)

register_blocking_ = false;

//set OpenMP threads for ThrustCPU
if(num_qubits_ > omp_threshold_ && omp_threads_ > 1)
chunk_.container()->set_omp_threads(omp_threads_);

#ifdef AER_DEBUG
if(chunk_.pos() == 0){
spdlog::debug(" ==== Thrust qubit vector initialization {} qubits ====",num_qubits_);
Expand Down Expand Up @@ -1391,9 +1400,16 @@ void QubitVectorThrust<data_t>::apply_function_sum2(double* pSum,Function func,b
******************************************************************************/

template <typename data_t>
void QubitVectorThrust<data_t>::set_omp_threads(int n) {
void QubitVectorThrust<data_t>::set_omp_threads(int n)
{
if (n > 0)
omp_threads_ = n;

#ifdef _OPENMP
//disable nested parallel for ThrustCPU
if(omp_get_num_threads() > 1)
omp_threads_ = 1;
#endif
}

template <typename data_t>
Expand Down