diff --git a/releasenotes/notes/fix-thrust-cpu-threads-67db86b2edcf06b3.yaml b/releasenotes/notes/fix-thrust-cpu-threads-67db86b2edcf06b3.yaml new file mode 100644 index 0000000000..5495540da3 --- /dev/null +++ b/releasenotes/notes/fix-thrust-cpu-threads-67db86b2edcf06b3.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + device=Thrust was very slow for small numbers of qubits because OpenMP + threading was always applied. This fix applies OpenMP threading in the + same way as device=CPU, using statevector_parallel_threshold. diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 50b57b4eb2..e0f2de079f 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -991,7 +991,7 @@ Result Controller::execute(std::vector &circuits, const int NUM_RESULTS = result.results.size(); //following looks very similar but we have to separate them to avoid omp nested loops that causes performance degradation //(DO NOT use if statement in #pragma omp) - if (parallel_experiments_ == 1 || sim_device_ == Device::ThrustCPU) { + if (parallel_experiments_ == 1) { for (int j = 0; j < NUM_RESULTS; ++j) { set_parallelization_circuit(circuits[j], noise_model, methods[j]); run_circuit(circuits[j], noise_model,methods[j], diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index 0a5fd0a91a..834c4f30ea 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -108,6 +108,8 @@ class ChunkContainer : public std::enable_shared_from_this& operator[](uint_t i) = 0; virtual uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers = AER_MAX_BUFFERS,bool multi_shots = false,int matrix_bit = AER_DEFAULT_MATRIX_BITS, bool density_matrix = false) = 0; @@ -485,7 +493,10 @@ void ChunkContainer::Execute(Function func,uint_t iChunk,uint_t count) else size = count * func.size(chunk_bits_); auto ci = 
thrust::counting_iterator(0); - thrust::for_each_n(thrust::device, ci , size, func); + if(omp_threads_ > 1) + thrust::for_each_n(thrust::device, ci , size, func); + else + thrust::for_each_n(thrust::seq, ci , size, func); #endif } @@ -642,7 +653,10 @@ void ChunkContainer::ExecuteSum(double* pSum,Function func,uint_t iChunk auto ci = thrust::counting_iterator(0); double sum; - sum = thrust::transform_reduce(thrust::device, ci, ci + size, func,0.0,thrust::plus()); + if(omp_threads_ > 1) + sum = thrust::transform_reduce(thrust::device, ci, ci + size, func,0.0,thrust::plus()); + else + sum = thrust::transform_reduce(thrust::seq, ci, ci + size, func,0.0,thrust::plus()); if(count == 1 && pSum){ *pSum = sum; } @@ -783,7 +797,10 @@ void ChunkContainer::ExecuteSum2(double* pSum,Function func,uint_t iChun auto ci = thrust::counting_iterator(0); - ret = thrust::transform_reduce(thrust::device, ci, ci + size, func,zero,complex_sum()); + if(omp_threads_ > 1) + ret = thrust::transform_reduce(thrust::device, ci, ci + size, func,zero,complex_sum()); + else + ret = thrust::transform_reduce(thrust::seq, ci, ci + size, func,zero,complex_sum()); if(count == 1 && pSum){ *((thrust::complex*)pSum) = ret; diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index 7d5cf6f108..b836a7c4b1 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -653,7 +653,10 @@ void DeviceChunkContainer::Zero(uint_t iChunk,uint_t count) #ifdef AER_THRUST_CUDA thrust::fill_n(thrust::cuda::par.on(stream_),data_.begin() + (iChunk << this->chunk_bits_),count,0.0); #else - thrust::fill_n(thrust::device,data_.begin() + (iChunk << this->chunk_bits_),count,0.0); + if(this->omp_threads_ > 1) + thrust::fill_n(thrust::device,data_.begin() + (iChunk << this->chunk_bits_),count,0.0); + else + thrust::fill_n(thrust::seq,data_.begin() + (iChunk << 
this->chunk_bits_),count,0.0); #endif } @@ -700,12 +703,22 @@ reg_t DeviceChunkContainer::sample_measure(uint_t iChunk,const std::vect cudaStreamSynchronize(stream_); #else - if(dot) - thrust::transform_inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),complex_dot_scan(),thrust::plus>()); - else - thrust::inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),thrust::plus>()); + if(this->omp_threads_ > 1){ + if(dot) + thrust::transform_inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),complex_dot_scan(),thrust::plus>()); + else + thrust::inclusive_scan(thrust::device,iter.begin(),iter.end(),iter.begin(),thrust::plus>()); - thrust::lower_bound(thrust::device, iter.begin(), iter.end(), rnds.begin(), rnds.begin() + SHOTS, samples.begin() ,complex_less()); + thrust::lower_bound(thrust::device, iter.begin(), iter.end(), rnds.begin(), rnds.begin() + SHOTS, samples.begin() ,complex_less()); + } + else{ + if(dot) + thrust::transform_inclusive_scan(thrust::seq,iter.begin(),iter.end(),iter.begin(),complex_dot_scan(),thrust::plus>()); + else + thrust::inclusive_scan(thrust::seq,iter.begin(),iter.end(),iter.begin(),thrust::plus>()); + + thrust::lower_bound(thrust::seq, iter.begin(), iter.end(), rnds.begin(), rnds.begin() + SHOTS, samples.begin() ,complex_less()); + } #endif return samples; diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index ec28046fdb..485310cc7d 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -34,6 +34,11 @@ #include "simulators/statevector/chunk/chunk_manager.hpp" +#ifdef _OPENMP +#include +#endif + + namespace AER { namespace QV { @@ -483,7 +488,7 @@ class QubitVectorThrust { // Config settings //----------------------------------------------------------------------- uint_t omp_threads_ = 1; // Disable multithreading by default - uint_t omp_threshold_ = 1; // Qubit 
threshold for multithreading when enabled + uint_t omp_threshold_ = 14; // Qubit threshold for multithreading when enabled int sample_measure_index_size_ = 1; // Sample measure indexing qubit size double json_chop_threshold_ = 0; // Threshold for choping small values // in JSON serialization @@ -901,6 +906,10 @@ void QubitVectorThrust::set_num_qubits(size_t num_qubits) register_blocking_ = false; + //set OpenMP threads for ThrustCPU + if(num_qubits_ > omp_threshold_ && omp_threads_ > 1) + chunk_.container()->set_omp_threads(omp_threads_); + #ifdef AER_DEBUG if(chunk_.pos() == 0){ spdlog::debug(" ==== Thrust qubit vector initialization {} qubits ====",num_qubits_); @@ -1391,9 +1400,16 @@ void QubitVectorThrust::apply_function_sum2(double* pSum,Function func,b ******************************************************************************/ template -void QubitVectorThrust::set_omp_threads(int n) { +void QubitVectorThrust::set_omp_threads(int n) +{ if (n > 0) omp_threads_ = n; + +#ifdef _OPENMP + //disable nested parallel for ThrustCPU + if(omp_get_num_threads() > 1) + omp_threads_ = 1; +#endif } template