From 309c73daf18764f34b7248c0a024ae0e849fffd0 Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Mon, 13 Dec 2021 18:27:02 +0900 Subject: [PATCH 01/17] add cuStateVec support --- CMakeLists.txt | 5 + CONTRIBUTING.md | 20 ++ .../providers/aer/backends/aer_simulator.py | 7 +- src/controllers/aer_controller.hpp | 59 ++++-- .../density_matrix/densitymatrix_state.hpp | 2 +- src/simulators/state.hpp | 8 +- src/simulators/state_chunk.hpp | 198 +++++++++++------- src/simulators/statevector/chunk/chunk.hpp | 10 +- .../statevector/chunk/chunk_container.hpp | 5 +- .../statevector/chunk/chunk_manager.hpp | 13 +- .../chunk/device_chunk_container.hpp | 173 ++++++++++++++- .../chunk/host_chunk_container.hpp | 4 +- src/simulators/statevector/qubitvector.hpp | 4 +- .../statevector/qubitvector_thrust.hpp | 57 ++++- .../superoperator/superoperator_state.hpp | 2 +- src/simulators/unitary/unitary_state.hpp | 2 +- 16 files changed, 451 insertions(+), 118 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 60b19219e3..1d704e83a3 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -255,6 +255,11 @@ if(AER_THRUST_SUPPORTED) set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA) set(THRUST_DEPENDENT_LIBS "") + if(CUSTATEVEC_ROOT) + set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC) + set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -I${CUSTATEVEC_ROOT}/include") + set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib64 -lcustatevec") + endif() elseif(AER_THRUST_BACKEND STREQUAL "TBB") message(STATUS "TBB Support found!") set(THRUST_DEPENDENT_LIBS AER_DEPENDENCY_PKG::tbb) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 976c93f7a0..b64821d10e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -643,6 +643,26 @@ Few notes on GPU builds: 3. We don't need NVIDIA® drivers for building, but we need them for running simulations 4. 
Only Linux platforms are supported +Qiskit Aer now supports cuQuantum, the optimized quantum computing APIs from NVIDIA®. +cuStateVec APIs can be used to accelerate the statevector and density_matrix methods. +Because cuQuantum is a beta version, the use of cuStateVec is limited to matrix multiplications. +Some gate operations that can be applied by matrix multiplication are accelerated. + +To build Qiskit Aer with cuStateVec support, set CUSTATEVEC_ROOT to the path of the cuQuantum root directory, as follows. + +For example, + + qiskit-aer$ python ./setup.py bdist_wheel -- -DAER_THRUST_BACKEND=CUDA -DCUSTATEVEC_ROOT=path_to_cuQuantum + +To run with cuStateVec, set the simulator's device argument to cuStateVec, as follows. + + +``` +sim = AerSimulator(method='statevector', device='cuStateVec') +``` + + + ### Building with MPI support Qiskit Aer can parallelize its simulation on the cluster systems by using MPI. diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py index 1bfb496026..7fa11b7e49 100644 --- a/qiskit/providers/aer/backends/aer_simulator.py +++ b/qiskit/providers/aer/backends/aer_simulator.py @@ -147,6 +147,11 @@ class AerSimulator(AerBackend): initialization or with :meth:`set_options`. The list of supported devices for the current system can be returned using :meth:`available_devices`. + If AerSimulator is built with cuQuantum support, cuQuantum APIs are enabled + by using ``device="cuStateVec"``. This is an experimental implementation + for cuQuantum Beta 1. All gate calculations that can be executed by + multiplying matrices are performed by the cuStateVec matrix API. 
+ **Additional Backend Options** The following simulator specific backend options are supported @@ -441,7 +446,7 @@ class AerSimulator(AerBackend): _AVAILABLE_METHODS = None - _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust') + _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust', 'cuStateVec') _AVAILABLE_DEVICES = None diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 451054a5e4..cc1b31c85c 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -115,7 +115,7 @@ class Controller { superop }; - enum class Device { CPU, GPU, ThrustCPU }; + enum class Device { CPU, GPU, ThrustCPU, cuStateVec }; // Simulation precision enum class Precision { Double, Single }; @@ -316,7 +316,7 @@ class Controller { size_t get_gpu_memory_mb(); size_t get_min_memory_mb() const { - if (sim_device_ == Device::GPU && num_gpus_ > 0) { + if ((sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) && num_gpus_ > 0) { return max_gpu_memory_mb_ / num_gpus_; // return per GPU memory size } return max_memory_mb_; @@ -495,18 +495,37 @@ void Controller::set_config(const json_t &config) { #endif } else if (sim_device_name_ == "GPU") { #ifndef AER_THRUST_CUDA - throw std::runtime_error( - "Simulation device \"GPU\" is not supported on this system"); + throw std::runtime_error( + "Simulation device \"GPU\" is not supported on this system"); #else - int nDev; - if (cudaGetDeviceCount(&nDev) != cudaSuccess) { - cudaGetLastError(); - throw std::runtime_error("No CUDA device available!"); - } - - sim_device_ = Device::GPU; + int nDev; + if (cudaGetDeviceCount(&nDev) != cudaSuccess) { + cudaGetLastError(); + throw std::runtime_error("No CUDA device available!"); + } + sim_device_ = Device::GPU; #endif + } + else if(sim_device_name_ == "cuStateVec"){ +#ifndef AER_CUSTATEVEC + throw std::runtime_error( + "Simulation device \"cuStateVec\" is not supported on this system"); +#else + int nDev; + if (cudaGetDeviceCount(&nDev) != 
cudaSuccess) { + cudaGetLastError(); + throw std::runtime_error("No CUDA device available!"); + } + sim_device_ = Device::cuStateVec; + //initialize custatevevtor handle once before actual calculation (takes long time at first call) + custatevecStatus_t err; + custatevecHandle_t stHandle; + err = custatevecCreate(&stHandle); + if(err == CUSTATEVEC_STATUS_SUCCESS){ + custatevecDestroy(stHandle); } +#endif + } else { throw std::runtime_error(std::string("Invalid simulation device (\"") + sim_device_name_ + std::string("\").")); @@ -629,8 +648,9 @@ void Controller::set_parallelization_circuit(const Circuit &circ, const Method method) { enable_batch_multi_shots_ = false; - if(batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && max_batched_states_ >= num_gpus_ && - batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ + if(batched_shots_gpu_ && (sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) && + circ.shots > 1 && max_batched_states_ >= num_gpus_ && + batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ enable_batch_multi_shots_ = true; } @@ -687,7 +707,7 @@ void Controller::set_parallelization_circuit(const Circuit &circ, // And assign the remaining threads to state update int circ_memory_mb = required_memory_mb(circ, noise, method) / num_process_per_experiment_; - size_t mem_size = (sim_device_ == Device::GPU) ? max_gpu_memory_mb_ : max_memory_mb_; + size_t mem_size = (sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) ? 
max_gpu_memory_mb_ : max_memory_mb_; if (mem_size < circ_memory_mb) throw std::runtime_error( "a circuit requires more memory than max_memory_mb."); @@ -713,12 +733,12 @@ bool Controller::multiple_chunk_required(const Circuit &circ, if (cache_block_qubit_ >= 2 && cache_block_qubit_ < circ.num_qubits) return true; - if(num_process_per_experiment_ == 1 && sim_device_ == Device::GPU && num_gpus_ > 0){ + if(num_process_per_experiment_ == 1 && (sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) && num_gpus_ > 0){ return (max_gpu_memory_mb_ / num_gpus_ < required_memory_mb(circ, noise, method)); } if(num_process_per_experiment_ > 1){ size_t total_mem = max_memory_mb_; - if(sim_device_ == Device::GPU) + if(sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) total_mem += max_gpu_memory_mb_; if(total_mem*num_process_per_experiment_ > required_memory_mb(circ, noise, method)) return true; @@ -778,6 +798,7 @@ size_t Controller::get_gpu_memory_mb() { } num_gpus_ = nDev; #endif + #ifdef AER_MPI // get minimum memory size per process uint64_t locMem, minMem; @@ -810,7 +831,7 @@ Controller::transpile_cache_blocking(Controller::Method method, const Circuit &c // if blocking is not set by config, automatically set if required if (multiple_chunk_required(circ, noise, method)) { int nplace = num_process_per_experiment_; - if(sim_device_ == Device::GPU && num_gpus_ > 0) + if((sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) && num_gpus_ > 0) nplace *= num_gpus_; cache_block_pass.set_blocking(circ.num_qubits, get_min_memory_mb() << 20, nplace, complex_size, is_matrix); @@ -865,7 +886,7 @@ Result Controller::execute(const inputdata_t &input_qobj) { auto timer_stop = myclock_t::now(); auto time_taken = std::chrono::duration(timer_stop - timer_start).count(); - result.metadata.add(time_taken, "time_taken"); + result.metadata.add(time_taken, "time_taken_qobj"); return result; } catch (std::exception &e) { // qobj was invalid, return valid output 
containing error message @@ -1887,7 +1908,7 @@ bool Controller::validate_state(const state_t &state, const Circuit &circ, bool memory_valid = true; if (max_memory_mb_ > 0) { size_t required_mb = state.required_memory_mb(circ.num_qubits, circ.ops) / num_process_per_experiment_; - size_t mem_size = (sim_device_ == Device::GPU) ? max_memory_mb_ + max_gpu_memory_mb_ : max_memory_mb_; + size_t mem_size = (sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) ? max_memory_mb_ + max_gpu_memory_mb_ : max_memory_mb_; memory_valid = (required_mb <= mem_size); } if (throw_except && !memory_valid) { diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp index d7c4fa0e26..7ae37e689c 100644 --- a/src/simulators/density_matrix/densitymatrix_state.hpp +++ b/src/simulators/density_matrix/densitymatrix_state.hpp @@ -1348,7 +1348,7 @@ void State::apply_gate_u3(const int_t iChunk, uint_t qubit, double th template void State::apply_diagonal_unitary_matrix(const int_t iChunk, const reg_t &qubits, const cvector_t & diag) { - if(BaseState::thrust_optimization_){ + if(BaseState::thrust_optimization_ || !BaseState::multi_chunk_distribution_){ //GPU computes all chunks in one kernel, so pass qubits and diagonal matrix as is BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(qubits,diag); } diff --git a/src/simulators/state.hpp b/src/simulators/state.hpp index 9893a2c6c2..f795cc2244 100644 --- a/src/simulators/state.hpp +++ b/src/simulators/state.hpp @@ -341,6 +341,8 @@ class State { complex_t global_phase_ = 1; int_t max_matrix_qubits_ = 0; + + std::string sim_device_name_; //name of device }; @@ -354,8 +356,10 @@ State::~State(void) } template -void State::set_config(const json_t &config) { - (ignore_argument)config; +void State::set_config(const json_t &config) +{ + //get device name + JSON::get_value(sim_device_name_, "device", config); } template diff --git a/src/simulators/state_chunk.hpp 
b/src/simulators/state_chunk.hpp index 353c1715b1..35a89a140f 100644 --- a/src/simulators/state_chunk.hpp +++ b/src/simulators/state_chunk.hpp @@ -408,6 +408,15 @@ class StateChunk : public State { ExperimentResult &result, RngEngine &rng); + //apply ops for multi-shots to one group + template + void apply_ops_multi_shots_for_group(int_t i_group, + InputIterator first, InputIterator last, + const Noise::NoiseModel &noise, + ExperimentResult &result, + uint_t rng_seed, + bool final_ops); + //apply op to multiple shots , return flase if op is not supported to execute in a batch virtual bool apply_batched_op(const int_t iChunk, const Operations::Op &op, ExperimentResult &result, @@ -516,8 +525,9 @@ StateChunk::~StateChunk(void) } template -void StateChunk::set_config(const json_t &config) { - (ignore_argument)config; +void StateChunk::set_config(const json_t &config) +{ + BaseState::set_config(config); } template @@ -607,12 +617,14 @@ bool StateChunk::allocate(uint_t num_qubits,uint_t block_bits,uint_t nu chunk_omp_parallel_ = false; if(qregs_[0].name().find("gpu") != std::string::npos){ #ifdef _OPENMP - if(multi_chunk_distribution_){ - if(omp_get_num_threads() == 1) - chunk_omp_parallel_ = true; - } + if(omp_get_num_threads() == 1) + chunk_omp_parallel_ = true; #endif - thrust_optimization_ = true; + + if(BaseState::sim_device_name_ == "cuStateVec") + chunk_omp_parallel_ = false; //because cuQuantum Beta 1 is not thread safe (TODO: check if cuQuantum will be updated) + else + thrust_optimization_ = true; //cuStateVec does not handle global chunk index for diagonal matrix } else if(qregs_[0].name().find("thrust") != std::string::npos){ thrust_optimization_ = true; @@ -646,7 +658,7 @@ bool StateChunk::allocate_qregs(uint_t num_chunks) uint_t chunk_id = multi_chunk_distribution_ ? 
global_chunk_index_ : 0; bool ret = true; qregs_[0].set_max_matrix_bits(BaseState::max_matrix_qubits_); - ret &= qregs_[0].chunk_setup(chunk_bits_*qubit_scale(),num_qubits_*qubit_scale(),chunk_id,num_chunks); + ret &= qregs_[0].chunk_setup(chunk_bits_*qubit_scale(), num_qubits_*qubit_scale(), chunk_id, num_chunks, BaseState::sim_device_name_); for(i=1;i::apply_ops_multi_shots(InputIterator first, InputIterat //resize qregs allocate_qregs(n_shots); } - std::vector par_results(num_groups_); //initialization (equivalent to initialize_qreg + initialize_creg) -#pragma omp parallel for if(num_groups_ > 1) - for(i=0;i 1 && chunk_omp_parallel_){ +#pragma omp parallel for + for(i=0;i 1) - for(i=0;i rng(num_chunks_in_group_[i]); - - for(uint_t j=top_chunk_of_group_[i];jtype == Operations::OpType::qerror_loc){ - //sample error here - uint_t count = num_chunks_in_group_[i]; - uint_t max_ops = 0; - bool pauli_only = true; - std::vector> noise_ops(count); - for(uint_t j=0;j 1 && chunk_omp_parallel_){ + std::vector par_results(num_groups_); +#pragma omp parallel for + for(i=0;i::apply_ops_multi_shots(InputIterator first, InputIterat gather_creg_memory(); } +template +template +void StateChunk::apply_ops_multi_shots_for_group(int_t i_group, + InputIterator first, InputIterator last, + const Noise::NoiseModel &noise, + ExperimentResult &result, + uint_t rng_seed, + bool final_ops) +{ + uint_t istate = top_chunk_of_group_[i_group]; + std::vector rng(num_chunks_in_group_[i_group]); + + for(uint_t j=top_chunk_of_group_[i_group];jtype == Operations::OpType::qerror_loc){ + //sample error here + uint_t count = num_chunks_in_group_[i_group]; + uint_t max_ops = 0; + bool pauli_only = true; + std::vector> noise_ops(count); + for(uint_t j=0;j void StateChunk::apply_batched_noise_ops(const int_t i_group, const std::vector> &ops, ExperimentResult &result, diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp index f5c7993cff..ddaa20fae7 100644 --- 
a/src/simulators/statevector/chunk/chunk.hpp +++ b/src/simulators/statevector/chunk/chunk.hpp @@ -349,7 +349,15 @@ class Chunk chunk_container_.lock()->keep_conditional(keep); } - + //apply matrix / diagonal matrix using cuStatevec (each call is forwarded to the matching method of the chunk container) + void apply_matrix(const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) + { + chunk_container_.lock()->apply_matrix(chunk_pos_,qubits,control_bits,mat,count); + } + void apply_diagonal_matrix(const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) + { + chunk_container_.lock()->apply_diagonal_matrix(chunk_pos_,qubits,control_bits,diag,count); + } }; //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index 5fd68798e4..e9d69cdff2 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -571,7 +571,7 @@ class ChunkContainer : public std::enable_shared_from_this& operator[](uint_t i) = 0; - virtual uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers = AER_MAX_BUFFERS,bool multi_shots = false,int matrix_bit = AER_DEFAULT_MATRIX_BITS) = 0; + virtual uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers = AER_MAX_BUFFERS,bool multi_shots = false,int matrix_bit = AER_DEFAULT_MATRIX_BITS, bool enable_cuStatevec = false) = 0; virtual void Deallocate(void) = 0; virtual void Set(uint_t i,const thrust::complex& t) = 0; @@ -683,6 +683,9 @@ class ChunkContainer : public std::enable_shared_from_this &mat,const uint_t count){} + virtual void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count){} protected: int convert_blocked_qubit(int qubit) diff --git a/src/simulators/statevector/chunk/chunk_manager.hpp b/src/simulators/statevector/chunk/chunk_manager.hpp index 
be3abf65c2..20e769373d 100644 --- a/src/simulators/statevector/chunk/chunk_manager.hpp +++ b/src/simulators/statevector/chunk/chunk_manager.hpp @@ -49,6 +49,8 @@ class ChunkManager int iplace_host_; //chunk container for host memory bool multi_shots_; + + bool enable_cuStatevec_; public: ChunkManager(); @@ -65,7 +67,7 @@ class ChunkManager return chunks_.size(); } - uint_t Allocate(int chunk_bits,int nqubits,uint_t nchunks,int matrix_bit); + uint_t Allocate(int chunk_bits,int nqubits,uint_t nchunks,int matrix_bit,bool enable_cuStatevec); void Free(void); int num_devices(void) @@ -161,7 +163,7 @@ ChunkManager::~ChunkManager() } template -uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks,int matrix_bit) +uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks,int matrix_bit, bool enable_cuStatevec) { uint_t num_buffers; int iDev; @@ -182,7 +184,8 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks, hybrid = true; } //--- - + enable_cuStatevec_ = enable_cuStatevec; + if(num_qubits_ != nqubits || chunk_bits_ != chunk_bits || nchunks > num_chunks_){ //free previous allocation Free(); @@ -263,9 +266,9 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks, nc /= 2; } if(num_devices_ > 0) - chunks_allocated += chunks_[iDev]->Allocate((iDev + idev_start)%num_devices_,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); + chunks_allocated += chunks_[iDev]->Allocate((iDev + idev_start)%num_devices_,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit,enable_cuStatevec_); else - chunks_allocated += chunks_[iDev]->Allocate(iDev,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); + chunks_allocated += chunks_[iDev]->Allocate(iDev,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit,enable_cuStatevec_); } if(chunks_allocated < nchunks){ //rest of chunks are stored on host diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp 
b/src/simulators/statevector/chunk/device_chunk_container.hpp index 34e92ab1c8..6246892add 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -18,7 +18,9 @@ #include "simulators/statevector/chunk/chunk_container.hpp" - +#ifdef AER_CUSTATEVEC +#include "custatevec.h" +#endif namespace AER { namespace QV { @@ -49,6 +51,8 @@ class DeviceChunkContainer : public ChunkContainer bool creg_host_update_; + bool enable_cuStatevec_; + //for register blocking thrust::host_vector blocked_qubits_holder_; uint_t max_blocked_gates_; @@ -58,7 +62,17 @@ class DeviceChunkContainer : public ChunkContainer #ifdef AER_THRUST_CUDA std::vector stream_; //asynchronous execution + +#ifdef AER_CUSTATEVEC + //for cuStatevec + custatevecHandle_t custatevec_handle_; //cuStatevec handle for this chunk container + AERDeviceVector custatevec_work_; //work buffer for cuStatevec + uint_t custatevec_work_size_; //buffer size + uint_t custatevec_chunk_total_qubits_; //total qubits of statevector passed to ApplyMatrix + uint_t custatevec_chunk_count_; //number of counts for all chunks #endif +#endif + public: DeviceChunkContainer() { @@ -103,7 +117,7 @@ class DeviceChunkContainer : public ChunkContainer return raw_reference_cast(data_[i]); } - uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit); + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit,bool enable_cuStatevec); void Deallocate(void); void StoreMatrix(const std::vector>& mat,uint_t iChunk); @@ -123,6 +137,20 @@ class DeviceChunkContainer : public ChunkContainer { return stream_[iChunk]; } + +#ifdef AER_CUSTATEVEC + unsigned char* custatevec_work_pointer(uint_t iChunk) const + { + if(custatevec_work_size_ == 0) + return nullptr; + if(iChunk >= this->num_chunks_){ //for buffer chunks + return ((unsigned 
char*)thrust::raw_pointer_cast(custatevec_work_.data())) + ((num_matrices_ + iChunk - this->num_chunks_) * custatevec_work_size_); + } + else{ + return ((unsigned char*)thrust::raw_pointer_cast(custatevec_work_.data())) + ((iChunk % num_matrices_) * custatevec_work_size_); + } + } +#endif #endif void Set(uint_t i,const thrust::complex& t) @@ -234,6 +262,10 @@ class DeviceChunkContainer : public ChunkContainer //queue gate for blocked execution void queue_blocked_gate(uint_t iChunk,char gate,uint_t qubit,uint_t mask,const std::complex* pMat = NULL); + + //apply matrix using cuStatevec + void apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count); + void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count); }; template @@ -243,7 +275,7 @@ DeviceChunkContainer::~DeviceChunkContainer(void) } template -uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) +uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit,bool enable_cuStatevec) { uint_t nc = chunks; uint_t i; @@ -255,6 +287,8 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu device_id_ = idev; set_device(); + enable_cuStatevec_ = enable_cuStatevec; + #ifdef AER_THRUST_CUDA if(!multi_shots){ int ip,nd; @@ -282,6 +316,19 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu } #endif +#ifdef AER_CUSTATEVEC + if(enable_cuStatevec_){ + //initialize custatevevtor handle + custatevecStatus_t err; + err = custatevecCreate(&custatevec_handle_); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "DeviceChunkContainer::allocate : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + } +#endif + this->num_buffers_ = buffers; 
if(multi_shots){ //mult-shot parallelization for small qubits @@ -338,6 +385,44 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu reduce_buffer_size_ = 1; #endif +#ifdef AER_CUSTATEVEC + if(enable_cuStatevec_){ + custatevecStatus_t err; + //set stream to custatevec handle + err = custatevecSetStream(custatevec_handle_,stream_[0]); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "DeviceChunkContainer::allocate : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + //allocate extra workspace for custatevec + std::vector> mat(1ull << (matrix_bit*2)); + + //count bits for multi-chunks + custatevec_chunk_total_qubits_ = this->chunk_bits_; + custatevec_chunk_count_ = this->num_chunks_; + if(custatevec_chunk_count_ > 1){ + while((custatevec_chunk_count_ & 1) == 0){ + custatevec_chunk_count_ >>= 1; + custatevec_chunk_total_qubits_++; + } + } + + err = custatevecApplyMatrix_bufferSize( + custatevec_handle_, CUDA_C_64F, custatevec_chunk_total_qubits_ , &mat[0], CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_COL, + 0, matrix_bit, 0, CUSTATEVEC_COMPUTE_64F, &custatevec_work_size_); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "DeviceChunkContainer::ResizeMatrixBuffers : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + if(custatevec_work_size_ > 0) + custatevec_work_.resize(custatevec_work_size_*num_matrices_); + } +#endif + reduce_buffer_size_ *= 2; reduce_buffer_.resize(reduce_buffer_size_*nc); probability_buffer_.resize(nc*QV_PROBABILITY_BUFFER_SIZE); @@ -400,6 +485,15 @@ void DeviceChunkContainer::Deallocate(void) num_blocked_qubits_.clear(); blocked_qubits_holder_.clear(); +#ifdef AER_CUSTATEVEC + if(enable_cuStatevec_){ + custatevec_work_.clear(); + custatevec_work_.shrink_to_fit(); + + custatevecDestroy(custatevec_handle_); + } +#endif + #ifdef AER_THRUST_CUDA uint_t i; for(i=0;i::copy_to_probability_buffer(std::vector +void 
DeviceChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) +{ +#ifdef AER_CUSTATEVEC + if(!enable_cuStatevec_) + return; + + thrust::complex* pMat; + + if(count == this->num_chunks_ && iChunk == 0){ + StoreMatrix(mat,iChunk); + pMat = matrix_pointer(iChunk); + } + else{ + //if operation is not batchable, use host memory + pMat = (thrust::complex*)&mat[0]; + } + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + custatevecStatus_t err; + for(int_t i=0;i +void DeviceChunkContainer::apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) +{ + if(enable_cuStatevec_){ + //convert diagonal elements to matrix unless cuQuantum has no diagonal matrix multiplication + //TO DO: call diagonal matrix multiplication API if cuStatevec supports it + cvector_t mat(diag.size()*diag.size(),0.0); + for(int_t i=0;i return data_[i]; } - uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit); + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit,bool enable_cuStatevec = false); void Deallocate(void); void StoreMatrix(const std::vector>& mat,uint_t iChunk) @@ -124,7 +124,7 @@ HostChunkContainer::~HostChunkContainer(void) } template -uint_t HostChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) +uint_t HostChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit, bool 
enable_cuStatevec) { uint_t nc = chunks; uint_t i; diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp index 79fad5745b..7166ba574d 100755 --- a/src/simulators/statevector/qubitvector.hpp +++ b/src/simulators/statevector/qubitvector.hpp @@ -131,7 +131,7 @@ class QubitVector { void initialize_component(const reg_t &qubits, const cvector_t &state); //setup chunk - bool chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks); + bool chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks, std::string& device_name); bool chunk_setup(QubitVector& base,const uint_t chunk_index); //cache control for chunks on host @@ -925,7 +925,7 @@ std::complex QubitVector::inner_product() const { //setup chunk template -bool QubitVector::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks) +bool QubitVector::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks, std::string& device_name) { chunk_index_ = chunk_index; return true; diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index f5e9e22ebc..3f55e88cfe 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -142,7 +142,7 @@ class QubitVectorThrust { void initialize_component(const reg_t &qubits, const cvector_t &state); //chunk setup - bool chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks); + bool chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks, std::string& device_name); bool chunk_setup(QubitVectorThrust& base,const uint_t chunk_index); //cache control for chunks on host @@ -451,6 +451,7 @@ class QubitVectorThrust { bool multi_chunk_distribution_; bool multi_shots_; bool enable_batch_; + bool enable_cuStatevec_ = false; bool register_blocking_; @@ -966,11 
+967,16 @@ void QubitVectorThrust::zero() template -bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks) +bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks, std::string& device_name) { //set global chunk ID / shot ID chunk_index_ = chunk_index; + //check device name if cuStateVec is specified + if(device_name == "cuStateVec"){ + enable_cuStatevec_ = true; + } + if(chunk_manager_){ if(chunk_.is_mapped()){ chunk_.unmap(); @@ -988,7 +994,7 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t //only first chunk call allocation function if(chunk_bits > 0 && num_qubits > 0){ chunk_manager_ = std::make_shared>(); - chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,max_matrix_bits_); + chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,max_matrix_bits_, enable_cuStatevec_); } multi_chunk_distribution_ = false; @@ -1020,6 +1026,7 @@ bool QubitVectorThrust::chunk_setup(QubitVectorThrust& base,cons base.multi_shots_ = true; } } + enable_cuStatevec_ = base.enable_cuStatevec_; //set global chunk ID / shot ID chunk_index_ = chunk_index; @@ -2269,6 +2276,10 @@ void QubitVectorThrust::apply_matrix(const reg_t &qubits, if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch + if(enable_cuStatevec_){ + return chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); + } + const size_t N = qubits.size(); auto qubits_sorted = qubits; std::sort(qubits_sorted.begin(), qubits_sorted.end()); @@ -2513,6 +2524,10 @@ void QubitVectorThrust::apply_diagonal_matrix(const reg_t &qubits, if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch + if(enable_cuStatevec_){ + return chunk_.apply_diagonal_matrix(qubits,0,diag,chunk_.container()->num_chunks()); + } + 
const int_t N = qubits.size(); if(N == 1){ @@ -2710,6 +2725,11 @@ void QubitVectorThrust::apply_mcx(const reg_t &qubits) if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch + if(enable_cuStatevec_){ + //TO DO: implement MCX specific function for cuStatevec + return chunk_.apply_matrix(qubits,qubits.size()-1,Linalg::VMatrix::X,chunk_.container()->num_chunks()); + } + if(register_blocking_){ int i; uint_t mask = 0; @@ -2793,6 +2813,11 @@ void QubitVectorThrust::apply_mcy(const reg_t &qubits) if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch + if(enable_cuStatevec_){ + //TO DO: implement MCY specific function for cuStatevec + return chunk_.apply_matrix(qubits,qubits.size()-1,Linalg::VMatrix::Y,chunk_.container()->num_chunks()); + } + if(register_blocking_){ int i; uint_t mask = 0; @@ -2889,7 +2914,15 @@ class CSwap_func : public GateFuncBase template void QubitVectorThrust::apply_mcswap(const reg_t &qubits) { - apply_function(CSwap_func(qubits)); + if(enable_cuStatevec_){ + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch + + chunk_.apply_matrix(qubits,qubits.size()-2,Linalg::VMatrix::SWAP,chunk_.container()->num_chunks()); + } + else{ + apply_function(CSwap_func(qubits)); + } } @@ -3148,6 +3181,9 @@ void QubitVectorThrust::apply_mcphase(const reg_t &qubits, const std::co if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch + if(enable_cuStatevec_) + return chunk_.apply_matrix(qubits,qubits.size()-1,Linalg::VMatrix::phase(phase),chunk_.container()->num_chunks()); + if(register_blocking_){ int i; uint_t mask = 0; @@ -3299,6 +3335,9 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, if(((multi_chunk_distribution_ && 
chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch + if(enable_cuStatevec_){ + return chunk_.apply_matrix(qubits,qubits.size()-1,mat,chunk_.container()->num_chunks()); + } // Calculate the permutation positions for the last qubit. const size_t N = qubits.size(); @@ -3367,6 +3406,11 @@ void QubitVectorThrust::apply_matrix(const uint_t qubit, if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch + if(enable_cuStatevec_){ + reg_t qubits(1,qubit); + return chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); + } + // Check if matrix is diagonal and if so use optimized lambda if (mat[1] == 0.0 && mat[2] == 0.0) { const std::vector> diag = {{mat[0], mat[3]}}; @@ -3388,6 +3432,11 @@ void QubitVectorThrust::apply_diagonal_matrix(const uint_t qubit, if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch + if(enable_cuStatevec_){ + reg_t qubits(1,qubit); + return chunk_.apply_diagonal_matrix(qubits,0,diag,chunk_.container()->num_chunks()); + } + if(register_blocking_){ chunk_.queue_blocked_gate('d',qubit,0,&diag[0]); } diff --git a/src/simulators/superoperator/superoperator_state.hpp b/src/simulators/superoperator/superoperator_state.hpp index 5a85c36028..7bb81c31f1 100755 --- a/src/simulators/superoperator/superoperator_state.hpp +++ b/src/simulators/superoperator/superoperator_state.hpp @@ -358,7 +358,7 @@ template void State::initialize_omp() { template bool State::allocate(uint_t num_qubits, uint_t block_bits,uint_t num_parallel_shots) { - return BaseState::qreg_.chunk_setup(num_qubits * 4, num_qubits * 4, 0, 1); + return BaseState::qreg_.chunk_setup(num_qubits * 4, num_qubits * 4, 0, 1, BaseState::sim_device_name_); } //========================================================================= diff --git 
a/src/simulators/unitary/unitary_state.hpp b/src/simulators/unitary/unitary_state.hpp index 51b8582298..f0b2771983 100755 --- a/src/simulators/unitary/unitary_state.hpp +++ b/src/simulators/unitary/unitary_state.hpp @@ -671,7 +671,7 @@ void State::apply_matrix(const int_t iChunk, const reg_t &qubi template void State::apply_diagonal_matrix(const int_t iChunk, const reg_t &qubits, const cvector_t &diag) { - if(BaseState::thrust_optimization_){ + if(BaseState::thrust_optimization_ || !BaseState::multi_chunk_distribution_){ //GPU computes all chunks in one kernel, so pass qubits and diagonal matrix as is reg_t qubits_chunk = qubits; for(uint_t i;i Date: Mon, 13 Dec 2021 18:42:28 +0900 Subject: [PATCH 02/17] delete space --- qiskit/providers/aer/backends/aer_simulator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py index a28090d8f2..6e8151efee 100644 --- a/qiskit/providers/aer/backends/aer_simulator.py +++ b/qiskit/providers/aer/backends/aer_simulator.py @@ -149,7 +149,7 @@ class AerSimulator(AerBackend): If AerSimulator is built with cuQuantum support, cuQuantum APIs are enabled by using ``device="cuStateVec"``. This is experimental implementation - for cuQuantum Beta 1. All the calculations of gates that can be executed by + for cuQuantum Beta 1. All the calculations of gates that can be executed by multiplying matrices will be done by cuStateVec matrix API. 
**Additional Backend Options** From a40898c8ee4fc73544a49b11b783ab615985ad8a Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Wed, 15 Dec 2021 15:00:24 +0900 Subject: [PATCH 03/17] disable batched shots optimization for cuStateVec --- src/controllers/aer_controller.hpp | 7 +- .../backends/aer_simulator/test_algorithms.py | 8 +- .../aer_simulator/test_control_flow.py | 490 ++++++++++++++++++ test/terra/backends/test_compatibility.py | 87 ++++ test/terra/common.py | 6 + .../noise/passes/test_local_noise_pass.py | 3 +- .../passes/test_relaxation_noise_pass.py | 3 +- 7 files changed, 594 insertions(+), 10 deletions(-) create mode 100644 test/terra/backends/aer_simulator/test_control_flow.py diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index cc1b31c85c..53c3a869f5 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -482,6 +482,9 @@ void Controller::set_config(const json_t &config) { } } + if(method_ == Method::density_matrix || method_ == Method::unitary) + batched_shots_gpu_max_qubits_ /= 2; + // Override automatic simulation method with a fixed method if (JSON::get_value(sim_device_name_, "device", config)) { if (sim_device_name_ == "CPU") { @@ -648,9 +651,9 @@ void Controller::set_parallelization_circuit(const Circuit &circ, const Method method) { enable_batch_multi_shots_ = false; - if(batched_shots_gpu_ && (sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) && + if(batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && max_batched_states_ >= num_gpus_ && - batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ + batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ //cuStateVec is not supported currently enable_batch_multi_shots_ = true; } diff --git a/test/terra/backends/aer_simulator/test_algorithms.py b/test/terra/backends/aer_simulator/test_algorithms.py index 2b2a476b2d..ff4c69ef7c 100644 --- a/test/terra/backends/aer_simulator/test_algorithms.py +++ 
b/test/terra/backends/aer_simulator/test_algorithms.py @@ -103,7 +103,7 @@ def test_extended_stabilizer_sparse_output_probs(self): ) shots = 100 - nqubits = 5 + nqubits = 2 circ = QuantumCircuit(nqubits) circ.h(0) circ.t(0) @@ -114,9 +114,9 @@ def test_extended_stabilizer_sparse_output_probs(self): circ = transpile(circ, backend) target = { - '0x0': shots * (0.5 + sqrt(2)/4.), - '0x1f': shots * (0.5 - sqrt(2)/4.) + nqubits * "0": shots * (0.5 + sqrt(2)/4.), + nqubits * "1": shots * (0.5 - sqrt(2)/4.) } result = backend.run(circ, shots=shots).result() self.assertSuccess(result) - self.compare_counts(result, [circ], [target], delta=0.1 * shots) + self.compare_counts(result, [circ], [target], hex_counts=False, delta=0.1 * shots) diff --git a/test/terra/backends/aer_simulator/test_control_flow.py b/test/terra/backends/aer_simulator/test_control_flow.py new file mode 100644 index 0000000000..98e549309b --- /dev/null +++ b/test/terra/backends/aer_simulator/test_control_flow.py @@ -0,0 +1,490 @@ +# This code is part of Qiskit. +# +# (C) Copyright IBM 2018, 2021. +# +# This code is licensed under the Apache License, Version 2.0. You may +# obtain a copy of this license in the LICENSE.txt file in the root directory +# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +# +# Any modifications or derivative works of this code must retain this +# copyright notice, and modified files need to carry a notice indicating +# that they have been altered from the originals. 
+""" +Integration Tests for jump/mark instructions +""" +from ddt import ddt, data +import unittest +import numpy +import logging +from test.terra.backends.simulator_test_case import ( + SimulatorTestCase, supported_methods) +from qiskit.providers.aer import AerSimulator +from qiskit import QuantumCircuit, transpile +from qiskit.circuit import Parameter, Qubit, QuantumRegister, ClassicalRegister +from qiskit.circuit.controlflow import * +from qiskit.providers.aer.library.default_qubits import default_qubits +from qiskit.providers.aer.library.control_flow_instructions import AerMark, AerJump + +@ddt +class TestControlFlow(SimulatorTestCase): + """Test instructions for jump and mark instructions and compiler functions.""" + + def add_mark(self, circ, name): + """Create a mark instruction which can be a destination of jump instructions. + + Args: + name (str): an unique name of this mark instruction in a circuit + """ + qubits = default_qubits(circ) + instr = AerMark(name, + len(qubits)) + return circ.append(instr, qubits) + + def add_jump(self, circ, jump_to, clbit=None, value=0): + """Create a jump instruction to move a program counter to a named mark. + + Args: + jump_to (str): a name of a destination mark instruction + clbit (Clbit): a classical bit for a condition + value (int): an int value for a condition. if clbit is value, jump is performed. 
+ """ + qubits = default_qubits(circ) + instr = AerJump(jump_to, len(qubits)) + if clbit: + instr.c_if(clbit, value) + return circ.append(instr, qubits) + + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_jump_always(self, method): + backend = self.backend(method=method) + + circ = QuantumCircuit(4) + mark = 'mark' + self.add_jump(circ, mark) + + for i in range(4): + circ.h(i) + + self.add_mark(circ, mark) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('0000', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_jump_conditional(self, method): + backend = self.backend(method=method) + + circ = QuantumCircuit(4, 1) + mark = 'mark' + self.add_jump(circ, mark, circ.clbits[0]) + + for i in range(4): + circ.h(i) + + self.add_mark(circ, mark) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('0000 0', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_no_jump_conditional(self, method): + backend = self.backend(method=method) + + circ = QuantumCircuit(4, 1) + mark = 'mark' + self.add_jump(circ, mark, circ.clbits[0], 1) + + for i in range(4): + circ.h(i) + + self.add_mark(circ, mark) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertNotEqual(len(counts), 1) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_invalid_jump(self, method): + logging.disable(level=logging.WARN) + + backend = self.backend(method=method) + + circ = QuantumCircuit(4, 1) + mark = 'mark' + invalid_mark = 'invalid_mark' + self.add_jump(circ, invalid_mark, circ.clbits[0]) + + for i in range(4): + 
circ.h(i) + + self.add_mark(circ, mark) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertNotSuccess(result) + + logging.disable(level=logging.NOTSET) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_duplicated_mark(self, method): + logging.disable(level=logging.WARN) + + backend = self.backend(method=method) + + circ = QuantumCircuit(4, 1) + mark = 'mark' + self.add_jump(circ, mark, circ.clbits[0]) + + for i in range(4): + circ.h(i) + + self.add_mark(circ, mark) + self.add_mark(circ, mark) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertNotSuccess(result) + + logging.disable(level=logging.NOTSET) + + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_if_true_body_builder(self, method): + backend = self.backend(method=method) + + qreg = QuantumRegister(4) + creg = ClassicalRegister(1) + circ = QuantumCircuit(qreg, creg) + circ.y(0) + circ.h(circ.qubits[1:4]) + circ.barrier() + circ.measure(0, 0) + + with circ.if_test((creg, 1)): + circ.h(circ.qubits[1:4]) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('0001 1', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_if_else_body_builder(self, method): + backend = self.backend(method=method) + + qreg = QuantumRegister(4) + creg = ClassicalRegister(1) + circ = QuantumCircuit(qreg, creg) + circ.h(circ.qubits[1:4]) + circ.barrier() + circ.measure(0, 0) + + with circ.if_test((creg, 1)) as else_: + pass + with else_: + circ.h(circ.qubits[1:4]) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('0000 0', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') 
+ def test_for_loop_builder(self, method): + backend = self.backend(method=method) + + circ = QuantumCircuit(5, 0) + + with circ.for_loop(range(0)) as a: + circ.ry(a * numpy.pi, 0) + with circ.for_loop(range(1)) as a: + circ.ry(a * numpy.pi, 1) + with circ.for_loop(range(2)) as a: + circ.ry(a * numpy.pi, 2) + with circ.for_loop(range(3)) as a: + circ.ry(a * numpy.pi, 3) + with circ.for_loop(range(4)) as a: + circ.ry(a * numpy.pi, 4) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('01100', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_for_loop_break_builder(self, method): + backend = self.backend(method=method) + + qreg = QuantumRegister(5) + creg = ClassicalRegister(1) + circ = QuantumCircuit(qreg, creg) + + with circ.for_loop(range(0)) as a: + circ.ry(a * numpy.pi, 0) + circ.measure(0, 0) + with circ.if_test((creg, 1)): + circ.break_loop() + with circ.for_loop(range(1)) as a: + circ.ry(a * numpy.pi, 1) + circ.measure(1, 0) + with circ.if_test((creg, 1)): + circ.break_loop() + with circ.for_loop(range(2)) as a: + circ.ry(a * numpy.pi, 2) + circ.measure(2, 0) + with circ.if_test((creg, 1)): + circ.break_loop() + with circ.for_loop(range(3)) as a: + circ.ry(a * numpy.pi, 3) + circ.measure(3, 0) + with circ.if_test((creg, 1)): + circ.break_loop() + with circ.for_loop(range(4)) as a: + circ.ry(a * numpy.pi, 4) + circ.measure(4, 0) + with circ.if_test((creg, 1)): + circ.break_loop() + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('11100 1', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_for_loop_continue_builder(self, method): + backend = self.backend(method=method) + + qreg = QuantumRegister(5) + cregs = 
[ClassicalRegister(1) for _ in range(5)] + circ = QuantumCircuit(qreg, *cregs) + + with circ.for_loop(range(0)) as a: + circ.ry(a * numpy.pi, 0) # dead code + circ.measure(0, 0) # dead code + with circ.if_test((cregs[0], 1)): + circ.continue_loop() # dead code + circ.y(0) # dead code + # 1st cbit -> 0 + # 1st meas cbit -> 0 + + with circ.for_loop(range(1)) as a: + circ.ry(a * numpy.pi, 1) + circ.measure(1, 1) + with circ.if_test((cregs[1], 1)): + circ.continue_loop() # dead code + circ.y(1) + # 2nd cbit -> 0 + # 2nd meas cbit -> 1 + + with circ.for_loop(range(2)) as a: + circ.ry(a * numpy.pi, 2) + circ.measure(2, 2) + with circ.if_test((cregs[2], 1)): + circ.continue_loop() + circ.y(2) + # 3rd cbit -> 0 + # 3rd meas cbit -> 1 + + with circ.for_loop(range(3)) as a: + circ.ry(a * numpy.pi, 3) + circ.measure(3, 3) + with circ.if_test((cregs[3], 1)): + circ.continue_loop() + circ.y(3) + # 4th cbit -> 1 + # 4th meas cbit -> 1 + + with circ.for_loop(range(4)) as a: + circ.ry(a * numpy.pi, 4) + circ.measure(4, 4) + with circ.if_test((cregs[4], 1)): + circ.continue_loop() + circ.y(4) + # 5th cbit -> 0 + # 5th meas cbit -> 1 + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('11110 0 1 0 0 0', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_while_loop_no_iteration(self, method): + backend = self.backend(method=method) + + qreg = QuantumRegister(1) + creg = ClassicalRegister(1) + circ = QuantumCircuit(qreg, creg) + circ.measure(0, 0) + with circ.while_loop((creg, 1)): + circ.y(0) + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('0 0', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_while_loop_single_iteration(self, method): + 
backend = self.backend(method=method) + + qreg = QuantumRegister(2) + creg = ClassicalRegister(1) + circ = QuantumCircuit(qreg, creg) + circ.y(0) + circ.measure(0, 0) + + # does not work + # while circ.while_loop((creg, 1)): + # circ.y(0) + # circ.measure(0, 0) + # circ.y(1) + + circ_while = QuantumCircuit(qreg, creg) + circ_while.y(0) + circ_while.measure(0, 0) + circ_while.y(1) + circ.while_loop((creg, 1), circ_while, [0, 1], [0]) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('10 0', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_while_loop_double_iterations(self, method): + backend = self.backend(method=method) + + qreg = QuantumRegister(2) + creg = ClassicalRegister(1) + circ = QuantumCircuit(qreg, creg) + circ.y(0) + circ.measure(0, 0) + + # does not work + # while circ.while_loop((creg, 1)): + # circ.y(0) + # circ.measure(0, 0) + # circ.y(1) + + circ_while = QuantumCircuit(qreg, creg) + circ_while.measure(0, 0) + circ_while.y(0) + circ_while.y(1) + circ.while_loop((creg, 1), circ_while, [0, 1], [0]) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('01 0', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_while_loop_continue(self, method): + backend = self.backend(method=method) + + qreg = QuantumRegister(1) + creg = ClassicalRegister(1) + circ = QuantumCircuit(qreg, creg) + circ.y(0) + circ.measure(0, 0) + + # does not work + # while circ.while_loop((creg, 1)): + # circ.y(0) + # circ.measure(0, 0) + # circ.continue_loop() + # circ.y(0) + + circ_while = QuantumCircuit(qreg, creg) + circ_while.y(0) + circ_while.measure(0, 0) + circ_while.continue_loop() + circ_while.y(0) + circ_while.break_loop() + 
circ.while_loop((creg, 1), circ_while, [0], [0]) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('0 0', counts) + + @data('statevector', 'density_matrix', 'matrix_product_state') + def test_nested_loop(self, method): + backend = self.backend(method=method) + + circ = QuantumCircuit(3) + + with circ.for_loop(range(2)) as a: + with circ.for_loop(range(2)) as b: + circ.ry(a * b * numpy.pi, 0) + + with circ.for_loop(range(3)) as a: + with circ.for_loop(range(3)) as b: + circ.ry(a * b * numpy.pi, 1) + + with circ.for_loop(range(4)) as a: + with circ.for_loop(range(2)) as b: + circ.ry(a * b * numpy.pi, 2) + + circ.measure_all() + + result = backend.run(circ, method=method).result() + self.assertSuccess(result) + + counts = result.get_counts() + self.assertEqual(len(counts), 1) + self.assertIn('011', counts) diff --git a/test/terra/backends/test_compatibility.py b/test/terra/backends/test_compatibility.py index bd9c8898d6..f1170603b1 100644 --- a/test/terra/backends/test_compatibility.py +++ b/test/terra/backends/test_compatibility.py @@ -66,6 +66,22 @@ def test_statevector_evolve(self): self.assertEqual(compat.evolve(orig_op), target) self.assertEqual(compat.evolve(compat_op), target) + def test_statevector_iterable_methods(self): + """Test that the iterable magic methods and related Numpy properties + work on the compatibility classes.""" + compat = cqi.Statevector([0.5, 0.5j, -0.5, 0.5j]) + compat_data = compat.data + + with self.assertWarns(DeprecationWarning): + compat_len = len(compat) + self.assertEqual(compat_len, len(compat_data)) + with self.assertWarns(DeprecationWarning): + compat_shape = compat.shape + self.assertEqual(compat_shape, compat_data.shape) + with self.assertWarns(DeprecationWarning): + compat_iter = tuple(compat) + self.assertEqual(compat_iter, tuple(compat.data)) + def test_density_matrix_eq(self): orig = 
qi.random_density_matrix(4, seed=10) compat = cqi.DensityMatrix(orig.data) @@ -107,6 +123,22 @@ def test_density_matrix_evolve(self): self.assertEqual(compat.evolve(orig_op), target) self.assertEqual(compat.evolve(compat_op), target) + def test_density_matrix_iterable_methods(self): + """Test that the iterable magic methods and related Numpy properties + work on the compatibility classes.""" + compat = cqi.DensityMatrix([[0.5, 0.5j], [-0.5j, 0.5]]) + compat_data = compat.data + + with self.assertWarns(DeprecationWarning): + compat_len = len(compat) + self.assertEqual(compat_len, len(compat_data)) + with self.assertWarns(DeprecationWarning): + compat_shape = compat.shape + self.assertEqual(compat_shape, compat_data.shape) + with self.assertWarns(DeprecationWarning): + compat_iter = tuple(compat) + np.testing.assert_array_equal(compat_iter, compat.data) + def test_unitary_eq(self): orig = qi.random_unitary(4, seed=10) compat = cqi.Operator(orig.data) @@ -152,6 +184,22 @@ def test_unitary_evolve(self): target = state.evolve(orig) self.assertEqual(state.evolve(compat), target) + def test_unitary_iterable_methods(self): + """Test that the iterable magic methods and related Numpy properties + work on the compatibility classes.""" + compat = cqi.Operator(qi.random_unitary(2, seed=10)) + compat_data = compat.data + + with self.assertWarns(DeprecationWarning): + compat_len = len(compat) + self.assertEqual(compat_len, len(compat_data)) + with self.assertWarns(DeprecationWarning): + compat_shape = compat.shape + self.assertEqual(compat_shape, compat_data.shape) + with self.assertWarns(DeprecationWarning): + compat_iter = tuple(compat) + np.testing.assert_array_equal(compat_iter, compat.data) + def test_superop_eq(self): orig = qi.SuperOp(qi.random_quantum_channel(4, seed=10)) compat = cqi.SuperOp(orig.data) @@ -176,6 +224,22 @@ def test_superop_linop(self): self.assertEqual(2 * compat - orig, orig) self.assertEqual(2 * orig - compat, orig) + def 
test_superop_iterable_methods(self): + """Test that the iterable magic methods and related Numpy properties + work on the compatibility classes.""" + compat = cqi.SuperOp(np.eye(4)) + compat_data = compat.data + + with self.assertWarns(DeprecationWarning): + compat_len = len(compat) + self.assertEqual(compat_len, len(compat_data)) + with self.assertWarns(DeprecationWarning): + compat_shape = compat.shape + self.assertEqual(compat_shape, compat_data.shape) + with self.assertWarns(DeprecationWarning): + compat_iter = tuple(compat) + np.testing.assert_array_equal(compat_iter, compat.data) + def test_stabilizer_eq(self): orig = qi.StabilizerState(qi.random_clifford(4, seed=10)) compat = cqi.StabilizerState(orig.clifford) @@ -205,3 +269,26 @@ def test_stabilizer_copy(self): compat = cqi.StabilizerState(clifford) cpy = copy.copy(compat) self.assertEqual(cpy, compat) + + def test_stabilizer_iterable_methods(self): + """Test that the iterable magic methods and related dict properties + work on the compatibility classes.""" + clifford = qi.random_clifford(4, seed=10) + cliff_dict = clifford.to_dict() + compat = cqi.StabilizerState(clifford) + + with self.assertWarns(DeprecationWarning): + compat_keys = compat.keys() + self.assertEqual(compat_keys, cliff_dict.keys()) + + with self.assertWarns(DeprecationWarning): + compat_iter = set(compat) + self.assertEqual(compat_iter, set(cliff_dict)) + + with self.assertWarns(DeprecationWarning): + compat_items = compat.items() + self.assertEqual(sorted(compat_items), sorted(cliff_dict.items())) + + with self.assertWarns(DeprecationWarning): + compat_len = len(compat) + self.assertEqual(compat_len, len(cliff_dict)) diff --git a/test/terra/common.py b/test/terra/common.py index 256f3b95d8..9d159b86bb 100644 --- a/test/terra/common.py +++ b/test/terra/common.py @@ -102,6 +102,12 @@ def assertSuccess(self, result): msg += ', (Circuit {}) {}'.format(i, res.status) self.assertTrue(success, msg=msg) + def assertNotSuccess(self, result): + 
"""Assert that simulation executed with errors""" + success = getattr(result, 'success', False) + msg = result.status + self.assertFalse(success, msg=msg) + @staticmethod def gate_circuits(gate_cls, num_angles=0, has_ctrl_qubits=False, rng=None, basis_states=None): diff --git a/test/terra/noise/passes/test_local_noise_pass.py b/test/terra/noise/passes/test_local_noise_pass.py index 60d08f35cb..d4cb211e44 100644 --- a/test/terra/noise/passes/test_local_noise_pass.py +++ b/test/terra/noise/passes/test_local_noise_pass.py @@ -13,8 +13,7 @@ LocalNoisePass class tests """ -from qiskit.providers.aer.noise.errors import ReadoutError -from qiskit.providers.aer.noise.passes import LocalNoisePass +from qiskit.providers.aer.noise import ReadoutError, LocalNoisePass from qiskit.circuit import QuantumCircuit from qiskit.circuit.library.standard_gates import SXGate, HGate diff --git a/test/terra/noise/passes/test_relaxation_noise_pass.py b/test/terra/noise/passes/test_relaxation_noise_pass.py index 2037dd7fc1..916a7f9eb2 100644 --- a/test/terra/noise/passes/test_relaxation_noise_pass.py +++ b/test/terra/noise/passes/test_relaxation_noise_pass.py @@ -13,8 +13,7 @@ RelaxationNoisePass class tests """ -from qiskit.providers.aer.noise.errors import thermal_relaxation_error -from qiskit.providers.aer.noise.passes import RelaxationNoisePass +from qiskit.providers.aer.noise import thermal_relaxation_error, RelaxationNoisePass import qiskit.quantum_info as qi from qiskit.circuit import QuantumCircuit, Delay From 26c45380c1ab1999e13c244f3e3d71ac5987e191 Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Wed, 15 Dec 2021 19:16:24 +0900 Subject: [PATCH 04/17] Fix cuStateVec test fails --- qiskit/providers/aer/backends/backend_utils.py | 3 +++ qiskit/providers/aer/backends/qasm_simulator.py | 16 ++++++++-------- .../aer/backends/statevector_simulator.py | 2 +- .../providers/aer/backends/unitary_simulator.py | 2 +- src/controllers/aer_controller.hpp | 5 +++++ 
src/simulators/statevector/chunk/chunk.hpp | 2 +- .../statevector/chunk/device_chunk_container.hpp | 14 +++++++------- .../statevector/qubitvector_thrust.hpp | 8 ++++++-- 8 files changed, 32 insertions(+), 20 deletions(-) diff --git a/qiskit/providers/aer/backends/backend_utils.py b/qiskit/providers/aer/backends/backend_utils.py index db44c0ec5b..19888980dc 100644 --- a/qiskit/providers/aer/backends/backend_utils.py +++ b/qiskit/providers/aer/backends/backend_utils.py @@ -40,12 +40,15 @@ LEGACY_METHOD_MAP = { "statevector_cpu": ("statevector", "CPU"), "statevector_gpu": ("statevector", "GPU"), + "statevector_custatevec": ("statevector", "cuStateVec"), "statevector_thrust": ("statevector", "Thrust"), "density_matrix_cpu": ("density_matrix", "CPU"), "density_matrix_gpu": ("density_matrix", "GPU"), + "density_matrix_custatevec": ("density_matrix", "cuStateVec"), "density_matrix_thrust": ("density_matrix", "Thrust"), "unitary_cpu": ("unitary", "CPU"), "unitary_gpu": ("unitary", "GPU"), + "unitary_custatevec": ("unitary", "cuStateVec"), "unitary_thrust": ("unitary", "Thrust"), } diff --git a/qiskit/providers/aer/backends/qasm_simulator.py b/qiskit/providers/aer/backends/qasm_simulator.py index 9abbce9056..05e2c1d5bc 100644 --- a/qiskit/providers/aer/backends/qasm_simulator.py +++ b/qiskit/providers/aer/backends/qasm_simulator.py @@ -339,15 +339,15 @@ class QasmSimulator(AerBackend): } _SIMULATION_METHODS = [ - 'automatic', 'statevector', 'statevector_gpu', + 'automatic', 'statevector', 'statevector_gpu', 'statevector_custatevec', 'statevector_thrust', 'density_matrix', - 'density_matrix_gpu', 'density_matrix_thrust', + 'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust', 'stabilizer', 'matrix_product_state', 'extended_stabilizer' ] _AVAILABLE_METHODS = None - _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust') + _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust', 'cuStateVec') _AVAILABLE_DEVICES = None @@ -595,7 +595,7 @@ def _basis_gates(self): def 
_method_basis_gates(self): """Return method basis gates and custom instructions""" method = self._options.get('method', None) - if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']: + if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust']: return sorted([ 'u1', 'u2', 'u3', 'u', 'p', 'r', 'rx', 'ry', 'rz', 'id', 'x', 'y', 'z', 'h', 's', 'sdg', 'sx', 'sxdg', 't', 'tdg', 'swap', 'cx', @@ -628,7 +628,7 @@ def _custom_instructions(self): return self._options_configuration['custom_instructions'] method = self._options.get('method', None) - if method in ['statevector', 'statevector_gpu', 'statevector_thrust']: + if method in ['statevector', 'statevector_gpu', 'statevector_custatevec', 'statevector_thrust']: return sorted([ 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'snapshot', 'save_expval', 'save_expval_var', 'save_probabilities', 'save_probabilities_dict', @@ -636,7 +636,7 @@ def _custom_instructions(self): 'save_density_matrix', 'save_statevector', 'save_statevector_dict', 'set_statevector' ]) - if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']: + if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust']: return sorted([ 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'superop', 'snapshot', 'save_expval', 'save_expval_var', 'save_probabilities', 'save_probabilities_dict', @@ -666,10 +666,10 @@ def _custom_instructions(self): def _set_method_config(self, method=None): """Set non-basis gate options when setting method""" # Update configuration description and number of qubits - if method in ['statevector', 'statevector_gpu', 'statevector_thrust']: + if method in ['statevector', 'statevector_gpu', 'statevector_custatevec', 'statevector_thrust']: description = 'A C++ statevector simulator with noise' n_qubits = MAX_QUBITS_STATEVECTOR - elif method in ['density_matrix', 'density_matrix_gpu', 
'density_matrix_thrust']: + elif method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust']: description = 'A C++ density matrix simulator with noise' n_qubits = MAX_QUBITS_STATEVECTOR // 2 elif method == 'matrix_product_state': diff --git a/qiskit/providers/aer/backends/statevector_simulator.py b/qiskit/providers/aer/backends/statevector_simulator.py index 100cfb7b57..2cc6d09327 100644 --- a/qiskit/providers/aer/backends/statevector_simulator.py +++ b/qiskit/providers/aer/backends/statevector_simulator.py @@ -165,7 +165,7 @@ class StatevectorSimulator(AerBackend): 'gates': [] } - _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust') + _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust', 'cuStateVec') _AVAILABLE_DEVICES = None diff --git a/qiskit/providers/aer/backends/unitary_simulator.py b/qiskit/providers/aer/backends/unitary_simulator.py index 2db5880aa9..a3fa9de7a7 100644 --- a/qiskit/providers/aer/backends/unitary_simulator.py +++ b/qiskit/providers/aer/backends/unitary_simulator.py @@ -163,7 +163,7 @@ class UnitarySimulator(AerBackend): 'gates': [] } - _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust') + _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust', 'cuStateVec') _AVAILABLE_DEVICES = None diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 8278bf5454..5f812c0938 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -668,6 +668,11 @@ void Controller::set_parallelization_circuit(const Circuit &circ, enable_batch_multi_shots_ = true; } + if(sim_device_ == Device::cuStateVec){ + parallel_shots_ = 1; //cuStateVec beta 1 is not thread safe + return; + } + if(explicit_parallelization_) return; diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp index ddaa20fae7..02b8956f51 100644 --- a/src/simulators/statevector/chunk/chunk.hpp +++ b/src/simulators/statevector/chunk/chunk.hpp @@ -356,7 +356,7 @@ class Chunk } 
void apply_diagonal_matrix(const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) { - chunk_container_.lock()->apply_matrix(chunk_pos_,qubits,control_bits,diag,count); + chunk_container_.lock()->apply_diagonal_matrix(chunk_pos_,qubits,control_bits,diag,count); } }; diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index 6246892add..a47db7b0c1 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -264,8 +264,8 @@ class DeviceChunkContainer : public ChunkContainer void queue_blocked_gate(uint_t iChunk,char gate,uint_t qubit,uint_t mask,const std::complex* pMat = NULL); //apply matrix using cuStatevec - void apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count); - void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count); + void apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) override; + void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) override; }; template @@ -1280,14 +1280,14 @@ void DeviceChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& thrust::complex* pMat; - if(count == this->num_chunks_ && iChunk == 0){ +// if(count == this->num_chunks_ && iChunk == 0){ StoreMatrix(mat,iChunk); pMat = matrix_pointer(iChunk); - } - else{ +// } +// else{ //if operation is not batchable, use host memory - pMat = (thrust::complex*)&mat[0]; - } +// pMat = (thrust::complex*)&mat[0]; +// } std::vector qubits32(qubits.size()); for(int_t i=0;i::apply_mcphase(const reg_t &qubits, const std::co if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() 
!= 0) return; //first chunk execute all in batch - if(enable_cuStatevec_) - return chunk_.apply_matrix(qubits,qubits.size()-1,Linalg::VMatrix::phase(phase),chunk_.container()->num_chunks()); + if(enable_cuStatevec_){ + cvector_t diag(2); + diag[0] = 1.0; + diag[1] = phase; + return chunk_.apply_diagonal_matrix(qubits,qubits.size()-1,diag,chunk_.container()->num_chunks()); + } if(register_blocking_){ int i; From 87afff5bc1eb715412d4a762878ebbe3b25c96f6 Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Thu, 16 Dec 2021 15:27:45 +0900 Subject: [PATCH 05/17] Fix qasm_simulator.py --- qiskit/providers/aer/backends/qasm_simulator.py | 15 ++++++++++----- src/simulators/statevector/chunk/chunk.hpp | 5 +++++ .../statevector/chunk/device_chunk_container.hpp | 9 +++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/qiskit/providers/aer/backends/qasm_simulator.py b/qiskit/providers/aer/backends/qasm_simulator.py index 05e2c1d5bc..5f3570cf2c 100644 --- a/qiskit/providers/aer/backends/qasm_simulator.py +++ b/qiskit/providers/aer/backends/qasm_simulator.py @@ -595,7 +595,8 @@ def _basis_gates(self): def _method_basis_gates(self): """Return method basis gates and custom instructions""" method = self._options.get('method', None) - if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust']: + if method in ['density_matrix', 'density_matrix_gpu', + 'density_matrix_custatevec', 'density_matrix_thrust']: return sorted([ 'u1', 'u2', 'u3', 'u', 'p', 'r', 'rx', 'ry', 'rz', 'id', 'x', 'y', 'z', 'h', 's', 'sdg', 'sx', 'sxdg', 't', 'tdg', 'swap', 'cx', @@ -628,7 +629,8 @@ def _custom_instructions(self): return self._options_configuration['custom_instructions'] method = self._options.get('method', None) - if method in ['statevector', 'statevector_gpu', 'statevector_custatevec', 'statevector_thrust']: + if method in ['statevector', 'statevector_gpu', + 'statevector_custatevec', 'statevector_thrust']: return sorted([ 
'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'snapshot', 'save_expval', 'save_expval_var', 'save_probabilities', 'save_probabilities_dict', @@ -636,7 +638,8 @@ def _custom_instructions(self): 'save_density_matrix', 'save_statevector', 'save_statevector_dict', 'set_statevector' ]) - if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust']: + if method in ['density_matrix', 'density_matrix_gpu', + 'density_matrix_custatevec', 'density_matrix_thrust']: return sorted([ 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'superop', 'snapshot', 'save_expval', 'save_expval_var', 'save_probabilities', 'save_probabilities_dict', @@ -666,10 +669,12 @@ def _custom_instructions(self): def _set_method_config(self, method=None): """Set non-basis gate options when setting method""" # Update configuration description and number of qubits - if method in ['statevector', 'statevector_gpu', 'statevector_custatevec', 'statevector_thrust']: + if method in ['statevector', 'statevector_gpu', + 'statevector_custatevec', 'statevector_thrust']: description = 'A C++ statevector simulator with noise' n_qubits = MAX_QUBITS_STATEVECTOR - elif method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust']: + elif method in ['density_matrix', 'density_matrix_gpu', + 'density_matrix_custatevec', 'density_matrix_thrust']: description = 'A C++ density matrix simulator with noise' n_qubits = MAX_QUBITS_STATEVECTOR // 2 elif method == 'matrix_product_state': diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp index 02b8956f51..326591344f 100644 --- a/src/simulators/statevector/chunk/chunk.hpp +++ b/src/simulators/statevector/chunk/chunk.hpp @@ -44,6 +44,7 @@ class Chunk num_qubits_ = 0; chunk_index_ = 0; mapped_ = false; + cache_ = nullptr; } Chunk(std::weak_ptr> cc,uint_t pos) @@ -54,6 +55,7 @@ class Chunk num_qubits_ = 0; chunk_index_ = 0; mapped_ = 
false; + cache_ = nullptr; } Chunk(Chunk& chunk) //map chunk from exisiting chunk (used fo cache chunk) { @@ -63,9 +65,12 @@ class Chunk num_qubits_ = chunk.num_qubits_; chunk_index_ = chunk.chunk_index_; mapped_ = true; + cache_ = nullptr; } ~Chunk() { + if(cache_) + cache_.reset(); } void set_device(void) const diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index a47db7b0c1..6154f4dd32 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -317,6 +317,7 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu #endif #ifdef AER_CUSTATEVEC + custatevec_handle_ = nullptr; if(enable_cuStatevec_){ //initialize custatevevtor handle custatevecStatus_t err; @@ -486,10 +487,10 @@ void DeviceChunkContainer::Deallocate(void) blocked_qubits_holder_.clear(); #ifdef AER_CUSTATEVEC - if(enable_cuStatevec_){ - custatevec_work_.clear(); - custatevec_work_.shrink_to_fit(); - + custatevec_work_.clear(); + custatevec_work_.shrink_to_fit(); + if(custatevec_handle_){ + custatevecSetStream(custatevec_handle_,nullptr); custatevecDestroy(custatevec_handle_); } #endif From f16a35c62ad3e4a082b5ad0eddcf0d33fcb17b19 Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Tue, 4 Jan 2022 18:46:55 +0900 Subject: [PATCH 06/17] update for the latest cuQuantum / added diagonal matrix --- CMakeLists.txt | 2 +- src/controllers/aer_controller.hpp | 9 +- src/simulators/state_chunk.hpp | 6 +- src/simulators/statevector/chunk/chunk.hpp | 8 + .../statevector/chunk/chunk_container.hpp | 18 ++ .../chunk/device_chunk_container.hpp | 245 ++++++++++++++---- .../statevector/qubitvector_thrust.hpp | 52 ++-- .../statevector/statevector_state.hpp | 1 + 8 files changed, 257 insertions(+), 84 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d704e83a3..395f90b0ba 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-258,7 +258,7 @@ if(AER_THRUST_SUPPORTED) if(CUSTATEVEC_ROOT) set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC) set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -I${CUSTATEVEC_ROOT}/include") - set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib64 -lcustatevec") + set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec") endif() elseif(AER_THRUST_BACKEND STREQUAL "TBB") message(STATUS "TBB Support found!") diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 5f812c0938..f2a62e3b57 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -531,6 +531,7 @@ void Controller::set_config(const json_t &config) { throw std::runtime_error("No CUDA device available!"); } sim_device_ = Device::cuStateVec; + //initialize custatevevtor handle once before actual calculation (takes long time at first call) custatevecStatus_t err; custatevecHandle_t stHandle; @@ -664,12 +665,12 @@ void Controller::set_parallelization_circuit(const Circuit &circ, enable_batch_multi_shots_ = false; if(batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && max_batched_states_ >= num_gpus_ && - batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ //cuStateVec is not supported currently + batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ //cuStateVec is not supported currently, because cuStateVec does not handle conditional functions enable_batch_multi_shots_ = true; } if(sim_device_ == Device::cuStateVec){ - parallel_shots_ = 1; //cuStateVec beta 1 is not thread safe + parallel_shots_ = 1; //cuStateVec is not thread safe return; } @@ -1478,7 +1479,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ, // Check if measure sampler and optimization are valid if (can_sample) { // Implement measure sampler - if (parallel_shots_ <= 1) { + if (parallel_shots_ <= 1 || (sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec)) { 
state.set_max_matrix_qubits(max_bits); RngEngine rng; rng.set_seed(circ.seed); @@ -1499,7 +1500,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ, shot_state.set_parallelization(parallel_state_update_); shot_state.set_global_phase(circ.global_phase_angle); - state.set_max_matrix_qubits(max_bits); + shot_state.set_max_matrix_qubits(max_bits); RngEngine rng; rng.set_seed(circ.seed + i); diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp index 056234d72d..239e3bfa55 100644 --- a/src/simulators/state_chunk.hpp +++ b/src/simulators/state_chunk.hpp @@ -623,7 +623,7 @@ bool StateChunk::allocate(uint_t num_qubits,uint_t block_bits,uint_t nu #endif if(BaseState::sim_device_name_ == "cuStateVec") - chunk_omp_parallel_ = false; //because cuQuantum Beta 1 is not thread safe (TODO: check if cuQuantum will be updated) + chunk_omp_parallel_ = false; //because cuStateVec is not thread safe else thrust_optimization_ = true; //cuStateVec does not handle global chunk index for diagonal matrix } @@ -967,8 +967,10 @@ void StateChunk::apply_ops_multi_shots_for_group(int_t i_group, max_ops = noise_ops[j].size(); if(pauli_only){ for(int_t k=0;kapply_diagonal_matrix(chunk_pos_,qubits,control_bits,diag,count); } + + //largest number of qubits that meets num_chunks_ = m*(2^num_pow2_qubits_) + uint_t num_pow2_qubits(void) + { + chunk_container_.lock()->num_pow2_qubits(); + } + + }; //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index e9d69cdff2..451fa6c345 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -487,6 +487,7 @@ class ChunkContainer : public std::enable_shared_from_this::deallocate_chunks(void) reduced_queue_end_.clear(); } +template +void ChunkContainer::update_pow2_qubits(void) +{ + uint_t n = num_chunks_; 
+ num_pow2_qubits_ = chunk_bits_; + while((n & 1) == 0){ + num_pow2_qubits_++; + n >>= 1; + } +} + //------------------------------------------------------------------------------ } // end namespace QV } // end namespace AER diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index 6154f4dd32..5c482cf72a 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -65,7 +65,7 @@ class DeviceChunkContainer : public ChunkContainer #ifdef AER_CUSTATEVEC //for cuStatevec - custatevecHandle_t custatevec_handle_; //cuStatevec handle for this chunk container + std::vector custatevec_handle_; //cuStatevec handle for this chunk container AERDeviceVector custatevec_work_; //work buffer for cuStatevec uint_t custatevec_work_size_; //buffer size uint_t custatevec_chunk_total_qubits_; //total qubits of statevector passed to ApplyMatrix @@ -316,20 +316,6 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu } #endif -#ifdef AER_CUSTATEVEC - custatevec_handle_ = nullptr; - if(enable_cuStatevec_){ - //initialize custatevevtor handle - custatevecStatus_t err; - err = custatevecCreate(&custatevec_handle_); - if(err != CUSTATEVEC_STATUS_SUCCESS){ - std::stringstream str; - str << "DeviceChunkContainer::allocate : " << custatevecGetErrorString(err); - throw std::runtime_error(str.str()); - } - } -#endif - this->num_buffers_ = buffers; if(multi_shots){ //mult-shot parallelization for small qubits @@ -370,6 +356,8 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu this->num_chunks_ = nc; data_.resize((nc+buffers) << chunk_bits); + this->update_pow2_qubits(); + #ifdef AER_THRUST_CUDA stream_.resize(nc + buffers); for(i=0;i::Allocate(int idev,int chunk_bits,int num_qu #ifdef AER_CUSTATEVEC if(enable_cuStatevec_){ + //initialize custatevevtor handle custatevecStatus_t err; - //set 
stream to custatevec handle - err = custatevecSetStream(custatevec_handle_,stream_[0]); - if(err != CUSTATEVEC_STATUS_SUCCESS){ - std::stringstream str; - str << "DeviceChunkContainer::allocate : " << custatevecGetErrorString(err); - throw std::runtime_error(str.str()); + + custatevec_handle_.resize(nc + buffers); + for(i=0;i> mat(1ull << (matrix_bit*2)); //count bits for multi-chunks - custatevec_chunk_total_qubits_ = this->chunk_bits_; - custatevec_chunk_count_ = this->num_chunks_; - if(custatevec_chunk_count_ > 1){ - while((custatevec_chunk_count_ & 1) == 0){ - custatevec_chunk_count_ >>= 1; - custatevec_chunk_total_qubits_++; - } - } + custatevec_chunk_total_qubits_ = this->num_pow2_qubits_; + custatevec_chunk_count_ = this->num_chunks_ >> (this->num_pow2_qubits_ - this->chunk_bits_); + //matrix err = custatevecApplyMatrix_bufferSize( - custatevec_handle_, CUDA_C_64F, custatevec_chunk_total_qubits_ , &mat[0], CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_COL, + custatevec_handle_[0], CUDA_C_64F, custatevec_chunk_total_qubits_ , &mat[0], CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_COL, 0, matrix_bit, 0, CUSTATEVEC_COMPUTE_64F, &custatevec_work_size_); if(err != CUSTATEVEC_STATUS_SUCCESS){ std::stringstream str; @@ -419,6 +414,24 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu throw std::runtime_error(str.str()); } + //diagonal matrix + size_t diag_size; + std::vector perm(matrix_bit); + std::vector basis(matrix_bit); + for(i=0;i 0) custatevec_work_.resize(custatevec_work_size_*num_matrices_); } @@ -489,15 +502,14 @@ void DeviceChunkContainer::Deallocate(void) #ifdef AER_CUSTATEVEC custatevec_work_.clear(); custatevec_work_.shrink_to_fit(); - if(custatevec_handle_){ - custatevecSetStream(custatevec_handle_,nullptr); - custatevecDestroy(custatevec_handle_); + for(int_t i=0;i::sample_measure(uint_t iChunk,const std::vect set_device(); +#ifdef AER_CUSTATEVEC + if(enable_cuStatevec_ && count == (1ull << (this->num_qubits_ - this->chunk_bits_))){ + 
//custatevecSampler_sample only can be applied to whole statevector + custatevecStatus_t err; + custatevecSamplerDescriptor_t sampler; + size_t extSize; + + cudaStreamSynchronize(stream_[iChunk]); + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + err = custatevecSampler_create(custatevec_handle_[iChunk], chunk_pointer(iChunk), state_type, this->num_qubits_, &sampler, SHOTS, &extSize); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "DeviceChunkContainer::sample_measure : custatevecSampler_create " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + AERDeviceVector extBuf; + void* pExtBuf = nullptr; + if(extSize > 0){ + extBuf.resize(extSize); + pExtBuf = thrust::raw_pointer_cast(extBuf.data()); + } + + err = custatevecSampler_preprocess(custatevec_handle_[iChunk],&sampler,pExtBuf,extSize); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "DeviceChunkContainer::sample_measure : custatevecSampler_preprocess " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + std::vector bitStr(SHOTS); + std::vector bitOrdering(this->num_qubits_); + for(int_t i=0;inum_qubits_;i++){ + bitOrdering[i] = i; + } + + err = custatevecSampler_sample(custatevec_handle_[iChunk], &sampler, &bitStr[0], &bitOrdering[0], this->num_qubits_, &rnds[0], SHOTS, + CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER ) ; + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "DeviceChunkContainer::sample_measure : custatevecSampler_sample " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + for(int_t i=0;i 0){ + extBuf.clear(); + extBuf.shrink_to_fit(); + } + return samples; + } +#endif + strided_range*> iter(chunk_pointer(iChunk), chunk_pointer(iChunk+count), stride); #ifdef AER_THRUST_CUDA -// cudaGetLastError(); if(dot) 
thrust::transform_inclusive_scan(thrust::cuda::par.on(stream_[iChunk]),iter.begin(),iter.end(),iter.begin(),complex_dot_scan(),thrust::plus>()); else @@ -1280,15 +1353,17 @@ void DeviceChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& return; thrust::complex* pMat; + int_t num_qubits = qubits.size()-control_bits; -// if(count == this->num_chunks_ && iChunk == 0){ + if((matrix_buffer_size_ >= (1ull << (num_qubits*2))) && ((count == this->num_chunks_ && iChunk == 0) || num_matrices_ > 1)){ StoreMatrix(mat,iChunk); pMat = matrix_pointer(iChunk); -// } -// else{ + } + else{ //if operation is not batchable, use host memory -// pMat = (thrust::complex*)&mat[0]; -// } + pMat = (thrust::complex*)&mat[0]; + set_device(); + } std::vector qubits32(qubits.size()); for(int_t i=0;i::apply_matrix(const uint_t iChunk,const reg_t& } } } + cudaDataType_t state_type; + custatevecComputeType_t comp_type; + if(sizeof(data_t) == sizeof(double)){ + state_type = CUDA_C_64F; + comp_type = CUSTATEVEC_COMPUTE_64F; + } + else{ + state_type = CUDA_C_32F; + comp_type = CUSTATEVEC_COMPUTE_32F; + } custatevecStatus_t err; for(int_t i=0;i void DeviceChunkContainer::apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) { - if(enable_cuStatevec_){ - //convert diagonal elements to matrix unless cuQuantum has no diagonal matrix multiplication - //TO DO: call diagonal matrix multiplication API if cuStatevec supports it +#ifdef AER_CUSTATEVEC + if(!enable_cuStatevec_) + return; + + thrust::complex* pMat; + int_t num_qubits = qubits.size()-control_bits; + + if(control_bits > 0){ + //custatevecApplyGeneralizedPermutationMatrix does not support control bits??? 
cvector_t mat(diag.size()*diag.size(),0.0); for(int_t i=0;i= (1ull << num_qubits)) && ((count == this->num_chunks_ && iChunk == 0) || num_matrices_ > 1)){ + StoreMatrix(diag,iChunk); + pMat = matrix_pointer(iChunk); + } + else{ + //if operation is not batchable, use host memory + pMat = (thrust::complex*)&diag[0]; + set_device(); + } + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i::apply_matrix(const reg_t &qubits, for(i=0;i(N), mat, qubits_sorted); } else{ @@ -2310,9 +2307,6 @@ void QubitVectorThrust::apply_matrix(const reg_t &qubits, reg_t params; MatrixMultNxN_LU f(mat,qubits_sorted,matLU,params); -// chunk_.StoreMatrix(matLU); -// chunk_.StoreUintParams(params); - apply_function(f, matLU, params); } @@ -2542,9 +2536,6 @@ void QubitVectorThrust::apply_diagonal_matrix(const reg_t &qubits, apply_function(DiagonalMult4x4(diag,qubits[0],qubits[1])); } else{ -// chunk_.StoreMatrix(diag); -// chunk_.StoreUintParams(qubits); - apply_function(DiagonalMultNxN(qubits), diag, qubits); } } @@ -2725,11 +2716,6 @@ void QubitVectorThrust::apply_mcx(const reg_t &qubits) if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch - if(enable_cuStatevec_){ - //TO DO: implement MCX specific function for cuStatevec - return chunk_.apply_matrix(qubits,qubits.size()-1,Linalg::VMatrix::X,chunk_.container()->num_chunks()); - } - if(register_blocking_){ int i; uint_t mask = 0; @@ -2813,11 +2799,6 @@ void QubitVectorThrust::apply_mcy(const 
reg_t &qubits) if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch - if(enable_cuStatevec_){ - //TO DO: implement MCY specific function for cuStatevec - return chunk_.apply_matrix(qubits,qubits.size()-1,Linalg::VMatrix::Y,chunk_.container()->num_chunks()); - } - if(register_blocking_){ int i; uint_t mask = 0; @@ -3339,9 +3320,6 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch - if(enable_cuStatevec_){ - return chunk_.apply_matrix(qubits,qubits.size()-1,mat,chunk_.container()->num_chunks()); - } // Calculate the permutation positions for the last qubit. const size_t N = qubits.size(); @@ -3372,7 +3350,10 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, chunk_.queue_blocked_gate('d',qubits[qubits.size()-1],mask,&diag[0]); } else{ - apply_function(DiagonalMult2x2Controlled(diag,qubits) ); + if(enable_cuStatevec_) + chunk_.apply_diagonal_matrix(qubits,qubits.size()-1,diag,chunk_.container()->num_chunks()); + else + apply_function(DiagonalMult2x2Controlled(diag,qubits) ); } } } @@ -3392,7 +3373,10 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, chunk_.queue_blocked_gate('u',qubits[qubits.size()-1],mask,&mat[0]); } else{ - apply_function(MatrixMult2x2Controlled(mat,qubits) ); + if(enable_cuStatevec_) + chunk_.apply_matrix(qubits,qubits.size()-1,mat,chunk_.container()->num_chunks()); + else + apply_function(MatrixMult2x2Controlled(mat,qubits) ); } } } @@ -3491,6 +3475,7 @@ template double QubitVectorThrust::norm() const { double ret; + #ifdef AER_THRUST_CUDA if((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_){ if(chunk_.pos() != 0) @@ -4545,9 +4530,11 @@ reg_t QubitVectorThrust::sample_measure(const std::vector &rnds) { uint_t count = 1; #ifdef AER_THRUST_CUDA - if(((multi_chunk_distribution_ && 
chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return reg_t(); //first chunk execute all in batch - count = chunk_.container()->num_chunks(); + if((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_){ + if(chunk_.pos() != 0) + return reg_t(); //first chunk execute all in batch + count = chunk_.container()->num_chunks(); + } #endif #ifdef AER_DEBUG @@ -5076,7 +5063,7 @@ void QubitVectorThrust::apply_batched_pauli_ops(const std::vector::apply_batched_pauli_ops(const std::vector(x_max, (ops[i][j].qubits[0])); num_y++; } + else if(ops[i][j].name == "pauli"){ + uint_t pauli_x_mask = 0, pauli_z_mask = 0, pauli_num_y = 0, pauli_x_max = 0; + std::tie(pauli_x_mask, pauli_z_mask, pauli_num_y, pauli_x_max) = pauli_masks_and_phase(ops[i][j].qubits, ops[i][j].string_params[0]); + + x_mask ^= pauli_x_mask; + z_mask ^= pauli_z_mask; + x_max = std::max(x_max, pauli_x_max); + num_y += pauli_num_y; + } } params[i*4] = x_max; params[i*4+1] = num_y % 4; diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp index 5606be96e7..70fed314eb 100755 --- a/src/simulators/statevector/statevector_state.hpp +++ b/src/simulators/statevector/statevector_state.hpp @@ -1835,6 +1835,7 @@ std::vector State::sample_measure(const reg_t &qubits, else{ std::vector chunkSum(BaseState::qregs_.size()+1,0); double sum,localSum; + //calculate per chunk sum if(BaseState::chunk_omp_parallel_){ #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) From 0c10325024675d2229f12a9850189ff909f8bcba Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Tue, 18 Jan 2022 14:47:50 +0900 Subject: [PATCH 07/17] add more cuStateVec support / refactor qubitvector_thrust and chunk_container --- src/controllers/aer_controller.hpp | 2 +- src/simulators/statevector/chunk/chunk.hpp | 53 +- .../statevector/chunk/chunk_container.hpp | 596 ++-- .../statevector/chunk/chunk_manager.hpp | 37 +- .../chunk/cuStateVec_chunk_container.hpp | 
772 +++++ .../chunk/device_chunk_container.hpp | 370 +-- .../chunk/host_chunk_container.hpp | 53 +- .../statevector/chunk/thrust_kernels.hpp | 2697 +++++++++++++++++ .../statevector/qubitvector_thrust.hpp | 2465 +-------------- .../unitary/unitarymatrix_thrust.hpp | 43 +- 10 files changed, 3887 insertions(+), 3201 deletions(-) create mode 100644 src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp create mode 100644 src/simulators/statevector/chunk/thrust_kernels.hpp diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 6f3c4a0592..4e2742582f 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -666,7 +666,7 @@ void Controller::set_parallelization_circuit(const Circuit &circ, } if(sim_device_ == Device::cuStateVec){ - parallel_shots_ = 1; //cuStateVec is not thread safe + parallel_shots_ = 1; //cuStateVec is currently not thread safe return; } diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp index b9e5b2823c..13e0603dfa 100644 --- a/src/simulators/statevector/chunk/chunk.hpp +++ b/src/simulators/statevector/chunk/chunk.hpp @@ -18,6 +18,11 @@ #include "simulators/statevector/chunk/device_chunk_container.hpp" #include "simulators/statevector/chunk/host_chunk_container.hpp" +#ifdef AER_CUSTATEVEC +#include "simulators/statevector/chunk/cuStateVec_chunk_container.hpp" +#endif + + namespace AER { namespace QV { @@ -261,9 +266,13 @@ class Chunk return chunk_container_.lock()->sample_measure(chunk_pos_,rnds,stride,dot,count); } - thrust::complex norm(uint_t count=1,uint_t stride = 1,bool dot = true) const + double norm(uint_t count) const { - return chunk_container_.lock()->norm(chunk_pos_,count,stride,dot); + return chunk_container_.lock()->norm(chunk_pos_,count); + } + double trace(uint_t row, uint_t count) const + { + return chunk_container_.lock()->trace(chunk_pos_,row,count); } #ifdef AER_THRUST_CUDA @@ -354,16 +363,54 @@ class 
Chunk chunk_container_.lock()->keep_conditional(keep); } - //apply matrix using cuStatevec + //apply matrix void apply_matrix(const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) { chunk_container_.lock()->apply_matrix(chunk_pos_,qubits,control_bits,mat,count); } + //apply diagonal matrix void apply_diagonal_matrix(const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) { chunk_container_.lock()->apply_diagonal_matrix(chunk_pos_,qubits,control_bits,diag,count); } + //apply (controlled) X + void apply_X(const reg_t& qubits,const uint_t count) + { + chunk_container_.lock()->apply_X(chunk_pos_,qubits,count); + } + //apply (controlled) Y + void apply_Y(const reg_t& qubits,const uint_t count) + { + chunk_container_.lock()->apply_Y(chunk_pos_,qubits,count); + } + //apply (controlled) phase + void apply_phase(const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) + { + chunk_container_.lock()->apply_phase(chunk_pos_,qubits,control_bits,phase,count); + } + //apply (controlled) swap gate + void apply_swap(const reg_t& qubits,const int_t control_bits,const uint_t count) + { + chunk_container_.lock()->apply_swap(chunk_pos_,qubits,control_bits,count); + } + //apply permutation + void apply_permutation(const reg_t& qubits,const std::vector> &pairs, const uint_t count) + { + chunk_container_.lock()->apply_permutation(chunk_pos_,qubits,pairs,count); + } + + //get probabilities of chunk + void probabilities(std::vector& probs, const reg_t& qubits) const + { + chunk_container_.lock()->probabilities(probs, chunk_pos_,qubits); + } + //Pauli expectation values + double expval_pauli(const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const + { + return chunk_container_.lock()->expval_pauli(chunk_pos_,qubits,pauli,initial_phase); + } + //largest number of qubits that meets num_chunks_ = m*(2^num_pow2_qubits_) uint_t num_pow2_qubits(void) { diff --git 
a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index 451fa6c345..67866d650e 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -63,6 +63,8 @@ DISABLE_WARNING_POP #include "simulators/statevector/chunk/cuda_kernels.hpp" #endif +#include "simulators/statevector/chunk/thrust_kernels.hpp" + namespace AER { namespace QV { @@ -77,391 +79,6 @@ struct BlockedGateParams unsigned char qubit_; }; -//======================================== -// base class of gate functions -//======================================== -template -class GateFuncBase -{ -protected: - thrust::complex* data_; //pointer to state vector buffer - thrust::complex* matrix_; //storage for matrix on device - uint_t* params_; //storage for additional parameters on device - uint_t base_index_; //start index of state vector - uint_t chunk_bits_; - uint_t* cregs_; - uint_t num_creg_bits_; - int_t conditional_bit_; -#ifndef AER_THRUST_CUDA - uint_t index_offset_; -#endif -public: - GateFuncBase() - { - data_ = NULL; - base_index_ = 0; - cregs_ = NULL; - num_creg_bits_ = 0; - conditional_bit_ = -1; -#ifndef AER_THRUST_CUDA - index_offset_ = 0; -#endif - } - virtual void set_data(thrust::complex* p) - { - data_ = p; - } - void set_matrix(thrust::complex* mat) - { - matrix_ = mat; - } - void set_params(uint_t* p) - { - params_ = p; - } - void set_chunk_bits(uint_t bits) - { - chunk_bits_ = bits; - } - - void set_base_index(uint_t i) - { - base_index_ = i; - } - void set_cregs_(uint_t* cbits,uint_t nreg) - { - cregs_ = cbits; - num_creg_bits_ = nreg; - } - void set_conditional(int_t bit) - { - conditional_bit_ = bit; - } - -#ifndef AER_THRUST_CUDA - void set_index_offset(uint_t i) - { - index_offset_ = i; - } -#endif - - __host__ __device__ thrust::complex* data(void) - { - return data_; - } - - virtual bool is_diagonal(void) - { - return false; - } - virtual int 
qubits_count(void) - { - return 1; - } - virtual int num_control_bits(void) - { - return 0; - } - virtual int control_mask(void) - { - return 1; - } - virtual bool use_cache(void) - { - return false; - } - virtual bool batch_enable(void) - { - return true; - } - - virtual const char* name(void) - { - return "base function"; - } - virtual uint_t size(int num_qubits) - { - if(is_diagonal()){ - chunk_bits_ = num_qubits; - return (1ull << num_qubits); - } - else{ - chunk_bits_ = num_qubits - (qubits_count() - num_control_bits()); - return (1ull << (num_qubits - (qubits_count() - num_control_bits()))); - } - } - - virtual __host__ __device__ uint_t thread_to_index(uint_t _tid) const - { - return _tid; - } - virtual __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - //implemente this in the kernel class - } - virtual __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - //implemente this in the kernel class - return 0.0; - } - - virtual __host__ __device__ bool check_conditional(uint_t i) const - { - if(conditional_bit_ < 0) - return true; - - uint_t iChunk = i >> chunk_bits_; - uint_t n64,i64,ibit; - n64 = (num_creg_bits_ + 63) >> 6; - i64 = conditional_bit_ >> 6; - ibit = conditional_bit_ & 63; - return (((cregs_[iChunk*n64 + i64] >> ibit) & 1) != 0); - } -}; - -//======================================== - // gate functions with cache -//======================================== -template -class GateFuncWithCache : public GateFuncBase -{ -protected: - int nqubits_; -public: - GateFuncWithCache(uint_t nq) - { - nqubits_ = nq; - } - - bool use_cache(void) - { - return true; - } - - __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const - { - uint_t idx,ii,t,j; - uint_t* qubits; - uint_t* qubits_sorted; - - qubits_sorted = this->params_; - qubits = qubits_sorted + nqubits_; - - idx = 0; - ii = _tid >> nqubits_; - for(j=0;j> j) & 1) != 0){ - idx += (1ull << 
qubits[j]); - } - } - idx += ii; - return idx; - } - - __host__ __device__ void sync_threads() const - { -#ifdef CUDA_ARCH - __syncthreads(); -#endif - } - - __host__ __device__ void operator()(const uint_t &i) const - { - if(!this->check_conditional(i)) - return; - - thrust::complex cache[1024]; - uint_t j,idx; - uint_t matSize = 1ull << nqubits_; - - //load data to cache - for(j=0;jdata_[idx]; - } - - //execute using cache - for(j=0;jrun_with_cache(j,idx,cache); - } - } - - virtual int qubits_count(void) - { - return nqubits_; - } -}; - -template -class GateFuncSumWithCache : public GateFuncBase -{ -protected: - int nqubits_; -public: - GateFuncSumWithCache(uint_t nq) - { - nqubits_ = nq; - } - - bool use_cache(void) - { - return true; - } - - - __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const - { - uint_t idx,ii,t,j; - uint_t* qubits; - uint_t* qubits_sorted; - - qubits_sorted = this->params_; - qubits = qubits_sorted + nqubits_; - - idx = 0; - ii = _tid >> nqubits_; - for(j=0;j> j) & 1) != 0){ - idx += (1ull << qubits[j]); - } - } - idx += ii; - return idx; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - if(!this->check_conditional(i)) - return 0.0; - - thrust::complex cache[1024]; - uint_t j,idx; - uint_t matSize = 1ull << nqubits_; - double sum = 0.0; - - //load data to cache - for(j=0;jdata_[idx]; - } - - //execute using cache - for(j=0;jrun_with_cache_sum(j,idx,cache); - } - return sum; - } - - virtual int qubits_count(void) - { - return nqubits_; - } - -}; - -//stridded iterator to access diagonal probabilities -template -class strided_range -{ - public: - - typedef typename thrust::iterator_difference::type difference_type; - - struct stride_functor : public thrust::unary_function - { - difference_type stride; - - stride_functor(difference_type stride) - : stride(stride) {} - - __host__ __device__ - difference_type operator()(const difference_type& i) const - { - if(stride == 1) //statevector - return i; - - 
//density matrix - difference_type i_chunk; - i_chunk = i / (stride - 1); - difference_type ret = stride * i - i_chunk*(stride-1); - return ret; - } - }; - - typedef typename thrust::counting_iterator CountingIterator; - typedef typename thrust::transform_iterator TransformIterator; - typedef typename thrust::permutation_iterator PermutationIterator; - - // type of the strided_range iterator - typedef PermutationIterator iterator; - - // construct strided_range for the range [first,last) - strided_range(Iterator first, Iterator last, difference_type stride) - : first(first), last(last), stride(stride) {} - - iterator begin(void) const - { - return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); - } - - iterator end(void) const - { - if(stride == 1) //statevector - return begin() + (last - first); - - //density matrix - return begin() + (last - first) / (stride-1); - } - - protected: - Iterator first; - Iterator last; - difference_type stride; -}; - -template -struct complex_dot_scan : public thrust::unary_function,thrust::complex> -{ - __host__ __device__ - thrust::complex operator()(thrust::complex x) { return thrust::complex(x.real()*x.real()+x.imag()*x.imag(),0); } -}; - -template -struct complex_norm : public thrust::unary_function,thrust::complex> -{ - __host__ __device__ - thrust::complex operator()(thrust::complex x) { return thrust::complex((double)x.real()*(double)x.real(),(double)x.imag()*(double)x.imag()); } -}; - -template -struct complex_less -{ - typedef thrust::complex first_argument_type; - typedef thrust::complex second_argument_type; - typedef bool result_type; - __thrust_exec_check_disable__ - __host__ __device__ bool operator()(const thrust::complex &lhs, const thrust::complex &rhs) const {return lhs.real() < rhs.real();} -}; // end less - - -class HostFuncBase -{ -protected: -public: - HostFuncBase(){} - - virtual void execute(){} -}; 
//============================================================================ // chunk container base class @@ -474,6 +91,7 @@ class ChunkContainer : public std::enable_shared_from_this& operator[](uint_t i) = 0; - virtual uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers = AER_MAX_BUFFERS,bool multi_shots = false,int matrix_bit = AER_DEFAULT_MATRIX_BITS, bool enable_cuStatevec = false) = 0; + virtual uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers = AER_MAX_BUFFERS,bool multi_shots = false,int matrix_bit = AER_DEFAULT_MATRIX_BITS) = 0; virtual void Deallocate(void) = 0; virtual void Set(uint_t i,const thrust::complex& t) = 0; @@ -607,7 +231,8 @@ class ChunkContainer : public std::enable_shared_from_this &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const = 0; - virtual thrust::complex norm(uint_t iChunk,uint_t count,uint_t stride = 1,bool dot = true) const = 0; + virtual double norm(uint_t iChunk,uint_t count) const; + virtual double trace(uint_t iChunk,uint_t row,uint_t count) const; size_t size_of_complex(void) @@ -690,9 +315,32 @@ class ChunkContainer : public std::enable_shared_from_this &mat,const uint_t count){} - virtual void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count){} + //apply matrix + virtual void apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count); + + //apply diagonal matrix + virtual void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count); + + //apply (controlled) X + virtual void apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count); + + //apply (controlled) Y + virtual void apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count); + + //apply (controlled) phase + virtual void apply_phase(const uint_t iChunk,const 
reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count); + + //apply (controlled) swap gate + virtual void apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count); + + //apply permutation + virtual void apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count); + + //get probabilities of chunk + virtual void probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const; + + //Pauli expectation values + virtual double expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const; protected: int convert_blocked_qubit(int qubit) @@ -775,6 +423,7 @@ void ChunkContainer::Execute(Function func,uint_t iChunk,uint_t count) { set_device(); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_data( chunk_pointer(iChunk) ); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -845,6 +494,7 @@ void ChunkContainer::ExecuteSum(double* pSum,Function func,uint_t iChunk set_device(); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_data( chunk_pointer(iChunk) ); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -1010,6 +660,7 @@ void ChunkContainer::ExecuteSum2(double* pSum,Function func,uint_t iChun set_device(); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_data( chunk_pointer(iChunk) ); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -1169,6 +820,171 @@ void ChunkContainer::update_pow2_qubits(void) } } +template +void ChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) +{ + const size_t N = qubits.size() - control_bits; + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + + if(N == 1){ + 
if(control_bits == 0) + Execute(MatrixMult2x2(mat,qubits[0]), iChunk, count); + else //2x2 matrix with control bits + Execute(MatrixMult2x2Controlled(mat,qubits), iChunk, count); + } + else if(N == 2){ + Execute(MatrixMult4x4(mat,qubits[0],qubits[1]), iChunk, count); + } + else{ + if(N <= 10){ + int i; + for(i=0;i(N), iChunk, count); + } + else{ + cvector_t matLU; + reg_t params; + MatrixMultNxN_LU f(mat,qubits_sorted,matLU,params); + + StoreMatrix(matLU, iChunk); + StoreUintParams(params, iChunk); + + Execute(f, iChunk, count); + } + } +} + +template +void ChunkContainer::apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) +{ + const size_t N = qubits.size() - control_bits; + + if(N == 1){ + if(control_bits == 0) + Execute(DiagonalMult2x2(diag,qubits[0]), iChunk, count); + else + Execute(DiagonalMult2x2Controlled(diag,qubits), iChunk, count); + } + else if(N == 2){ + Execute(DiagonalMult4x4(diag,qubits[0],qubits[1]), iChunk, count); + } + else{ + StoreMatrix(diag, iChunk); + StoreUintParams(qubits, iChunk); + + Execute(DiagonalMultNxN(qubits), iChunk, count); + } +} + +template +void ChunkContainer::apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + Execute(CX_func(qubits), iChunk, count); +} + +template +void ChunkContainer::apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + Execute(CY_func(qubits), iChunk, count); +} + +template +void ChunkContainer::apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) +{ + Execute(phase_func(qubits,*(thrust::complex*)&phase), iChunk, count ); +} + +template +void ChunkContainer::apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count) +{ + Execute(CSwap_func(qubits), iChunk, count); +} + +template +void ChunkContainer::apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, 
const uint_t count) +{ + const size_t N = qubits.size(); + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + + reg_t params; + Permutation f(qubits_sorted,qubits,pairs,params); + + StoreUintParams(params, iChunk); + + Execute(f, iChunk, count); +} + +template +void ChunkContainer::probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const +{ + const size_t N = qubits.size(); + const int_t DIM = 1 << N; + probs.resize(DIM); + + if(N == 1){ //special case for 1 qubit (optimized for measure) + ExecuteSum2(&probs[0],probability_1qubit_func(qubits[0]), iChunk, 1); + } + else{ + for(int_t i=0;i(qubits,i), iChunk, 1); + } + } +} + +template +double ChunkContainer::norm(uint_t iChunk,uint_t count) const +{ + double ret; + ExecuteSum(&ret,norm_func(), iChunk, count); + + return ret; +} + +template +double ChunkContainer::trace(uint_t iChunk,uint_t row,uint_t count) const +{ + double ret; + ExecuteSum(&ret,trace_func(row), iChunk, count); + + return ret; +} + +template +double ChunkContainer::expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const +{ + uint_t x_mask, z_mask, num_y, x_max; + std::tie(x_mask, z_mask, num_y, x_max) = pauli_masks_and_phase(qubits, pauli); + + // Special case for only I Paulis + if (x_mask + z_mask == 0) { + thrust::complex ret = norm(iChunk, 1); + return ret.real() + ret.imag(); + } + double ret; + // specialize x_max == 0 + if(x_mask == 0) { + ExecuteSum(&ret, expval_pauli_Z_func(z_mask), iChunk, 1 ); + return ret; + } + + // Compute the overall phase of the operator. 
+ // This is (-1j) ** number of Y terms modulo 4 + auto phase = std::complex(initial_phase); + add_y_phase(num_y, phase); + ExecuteSum(&ret, expval_pauli_XYZ_func(x_mask, z_mask, x_max, phase), iChunk, 1 ); + return ret; +} + + + + //------------------------------------------------------------------------------ } // end namespace QV } // end namespace AER diff --git a/src/simulators/statevector/chunk/chunk_manager.hpp b/src/simulators/statevector/chunk/chunk_manager.hpp index 20e769373d..313a6f486e 100644 --- a/src/simulators/statevector/chunk/chunk_manager.hpp +++ b/src/simulators/statevector/chunk/chunk_manager.hpp @@ -1,7 +1,7 @@ /** * This code is part of Qiskit. * - * (C) Copyright IBM 2018, 2019, 2020. + * (C) Copyright IBM 2018, 2019, 2020, 2021, 2022. * * This code is licensed under the Apache License, Version 2.0. You may * obtain a copy of this license in the LICENSE.txt file in the root directory @@ -43,6 +43,7 @@ class ChunkManager int num_qubits_; //number of global qubits uint_t num_chunks_; //number of chunks on this process + uint_t chunk_index_; //global chunk index for the first chunk int i_dev_map_; //device index chunk to be mapped int idev_buffer_map_; //device index buffer to be mapped @@ -67,7 +68,7 @@ class ChunkManager return chunks_.size(); } - uint_t Allocate(int chunk_bits,int nqubits,uint_t nchunks,int matrix_bit,bool enable_cuStatevec); + uint_t Allocate(int chunk_bits,int nqubits,uint_t nchunks,uint_t chunk_index,int matrix_bit,bool enable_cuStatevec); void Free(void); int num_devices(void) @@ -115,6 +116,7 @@ ChunkManager::ChunkManager() num_places_ = 1; chunk_bits_ = 0; num_chunks_ = 0; + chunk_index_ = 0; num_qubits_ = 0; multi_shots_ = false; @@ -163,7 +165,7 @@ ChunkManager::~ChunkManager() } template -uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks,int matrix_bit, bool enable_cuStatevec) +uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks,uint_t chunk_index,int matrix_bit, bool 
enable_cuStatevec) { uint_t num_buffers; int iDev; @@ -185,7 +187,9 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks, } //--- enable_cuStatevec_ = enable_cuStatevec; - + + chunk_index_ = chunk_index; + if(num_qubits_ != nqubits || chunk_bits_ != chunk_bits || nchunks > num_chunks_){ //free previous allocation Free(); @@ -249,40 +253,45 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks, num_places_ = num_chunks_; } - nchunks = num_chunks_; - //allocate chunk container before parallel loop using push_back to store shared pointer for(i=0;i>()); + continue; + } +#endif chunks_.push_back(std::make_shared>()); } uint_t chunks_allocated = 0; #pragma omp parallel for if(num_places_ > 1) private(is,ie,nc) reduction(+:chunks_allocated) for(iDev=0;iDevset_chunk_index(chunk_index_ + chunks_allocated); //set first chunk index for the container if(num_devices_ > 0) - chunks_allocated += chunks_[iDev]->Allocate((iDev + idev_start)%num_devices_,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit,enable_cuStatevec_); + chunks_allocated += chunks_[iDev]->Allocate((iDev + idev_start)%num_devices_,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); else - chunks_allocated += chunks_[iDev]->Allocate(iDev,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit,enable_cuStatevec_); + chunks_allocated += chunks_[iDev]->Allocate(iDev,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); } - if(chunks_allocated < nchunks){ + if(chunks_allocated < num_chunks_){ //rest of chunks are stored on host for(iDev=0;iDev 0){ + chunks_[num_places_]->set_chunk_index(chunk_index_ + chunks_allocated + is); //set first chunk index for the container chunks_.push_back(std::make_shared>()); chunks_[num_places_]->Allocate(-1,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); num_places_ += 1; } } - num_chunks_ = chunks_allocated; } #ifdef AER_DISABLE_GDR diff --git 
a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp new file mode 100644 index 0000000000..d764053a03 --- /dev/null +++ b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp @@ -0,0 +1,772 @@ +/** + * This code is part of Qiskit. + * + * (C) Copyright IBM 2018, 2019, 2020, 2021, 2022. + * + * This code is licensed under the Apache License, Version 2.0. You may + * obtain a copy of this license in the LICENSE.txt file in the root directory + * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. + * + * Any modifications or derivative works of this code must retain this + * copyright notice, and modified files need to carry a notice indicating + * that they have been altered from the originals. + */ + + +#ifndef _qv_cuStateVec_chunk_container_hpp_ +#define _qv_cuStateVec_chunk_container_hpp_ + +#include "simulators/statevector/chunk/device_chunk_container.hpp" + +#include "custatevec.h" + +namespace AER { +namespace QV { + + +//============================================================================ +// cuStateVec chunk container class +//============================================================================ +template +class cuStateVecChunkContainer : public DeviceChunkContainer +{ +protected: + std::vector custatevec_handle_; //cuStatevec handle for this chunk container + AERDeviceVector custatevec_work_; //work buffer for cuStatevec + uint_t custatevec_work_size_; //buffer size + uint_t custatevec_chunk_total_qubits_; //total qubits of statevector passed to ApplyMatrix + uint_t custatevec_chunk_count_; //number of counts for all chunks + +public: + using BaseContainer = DeviceChunkContainer; + + cuStateVecChunkContainer() + { + } + ~cuStateVecChunkContainer(); + + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) override; + void Deallocate(void) override; + + unsigned char* 
custatevec_work_pointer(uint_t iChunk) const + { + if(custatevec_work_size_ == 0) + return nullptr; + if(iChunk >= this->num_chunks_){ //for buffer chunks + return ((unsigned char*)thrust::raw_pointer_cast(custatevec_work_.data())) + ((BaseContainer::num_matrices_ + iChunk - this->num_chunks_) * custatevec_work_size_); + } + else{ + return ((unsigned char*)thrust::raw_pointer_cast(custatevec_work_.data())) + ((iChunk % BaseContainer::num_matrices_) * custatevec_work_size_); + } + } + + reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const override; + double norm(uint_t iChunk,uint_t count) const override; + + //apply matrix + void apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) override; + + //apply diagonal matrix + void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) override; + + //apply (controlled) X + void apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count) override; + + //apply (controlled) Y + void apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count) override; + + //apply (controlled) phase + virtual void apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) override; + + //apply (controlled) swap gate + void apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count) override; + + //apply permutation + void apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count) override; + + //get probabilities of chunk + void probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const override; + + //Pauli expectation values + double expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const override; 
+}; + +template +cuStateVecChunkContainer::~cuStateVecChunkContainer(void) +{ + Deallocate(); +} + +template +uint_t cuStateVecChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) +{ + uint_t nc; + nc = BaseContainer::Allocate(idev,chunk_bits,num_qubits,chunks,buffers,multi_shots,matrix_bit); + + //initialize custatevevtor handle + custatevecStatus_t err; + + custatevec_handle_.resize(nc + buffers); + for(uint_t i=0;i> mat(1ull << (matrix_bit*2)); + + //count bits for multi-chunks + custatevec_chunk_total_qubits_ = this->num_pow2_qubits_; + custatevec_chunk_count_ = this->num_chunks_ >> (this->num_pow2_qubits_ - this->chunk_bits_); + + //matrix + err = custatevecApplyMatrix_bufferSize( + custatevec_handle_[0], CUDA_C_64F, custatevec_chunk_total_qubits_ , &mat[0], CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_COL, + 0, matrix_bit, 0, CUSTATEVEC_COMPUTE_64F, &custatevec_work_size_); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::ResizeMatrixBuffers : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + //diagonal matrix + size_t diag_size; + std::vector perm(matrix_bit); + std::vector basis(matrix_bit); + for(int_t i=0;i 0) + custatevec_work_.resize(custatevec_work_size_*BaseContainer::num_matrices_); + + return nc; +} + +template +void cuStateVecChunkContainer::Deallocate(void) +{ + BaseContainer::Deallocate(); + + custatevec_work_.clear(); + custatevec_work_.shrink_to_fit(); + for(int_t i=0;i +reg_t cuStateVecChunkContainer::sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride, bool dot,uint_t count) const +{ + if(count == (1ull << (this->num_qubits_ - this->chunk_bits_))){ + //custatevecSampler_sample only can be applied to whole statevector + const int_t SHOTS = rnds.size(); + reg_t samples(SHOTS,0); + + BaseContainer::set_device(); + + custatevecStatus_t err; + custatevecSamplerDescriptor_t sampler; + 
size_t extSize; + + cudaStreamSynchronize(BaseContainer::stream_[iChunk]); + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + err = custatevecSampler_create(custatevec_handle_[iChunk], BaseContainer::chunk_pointer(iChunk), state_type, this->num_qubits_, &sampler, SHOTS, &extSize); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_create " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + AERDeviceVector extBuf; + void* pExtBuf = nullptr; + if(extSize > 0){ + extBuf.resize(extSize); + pExtBuf = thrust::raw_pointer_cast(extBuf.data()); + } + + err = custatevecSampler_preprocess(custatevec_handle_[iChunk],&sampler,pExtBuf,extSize); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_preprocess " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + std::vector bitStr(SHOTS); + std::vector bitOrdering(this->num_qubits_); + for(int_t i=0;inum_qubits_;i++){ + bitOrdering[i] = i; + } + + err = custatevecSampler_sample(custatevec_handle_[iChunk], &sampler, &bitStr[0], &bitOrdering[0], this->num_qubits_, &rnds[0], SHOTS, + CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER ) ; + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_sample " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + for(int_t i=0;i 0){ + extBuf.clear(); + extBuf.shrink_to_fit(); + } + return samples; + } + else{ + return BaseContainer::sample_measure(iChunk, rnds, stride, dot, count); + } +} + +template +void cuStateVecChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) +{ + thrust::complex* pMat; + int_t 
num_qubits = qubits.size()-control_bits; + + if((BaseContainer::matrix_buffer_size_ >= (1ull << (num_qubits*2))) && ((count == this->num_chunks_ && iChunk == 0) || BaseContainer::num_matrices_ > 1)){ + BaseContainer::StoreMatrix(mat,iChunk); + pMat = BaseContainer::matrix_pointer(iChunk); + } + else{ + //if operation is not batchable, use host memory + pMat = (thrust::complex*)&mat[0]; + BaseContainer::set_device(); + } + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + cudaDataType_t state_type; + custatevecComputeType_t comp_type; + if(sizeof(data_t) == sizeof(double)){ + state_type = CUDA_C_64F; + comp_type = CUSTATEVEC_COMPUTE_64F; + } + else{ + state_type = CUDA_C_32F; + comp_type = CUSTATEVEC_COMPUTE_32F; + } + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) +{ + thrust::complex* pMat; + int_t num_qubits = qubits.size(); + + if(control_bits > 0){ + uint_t size = 1ull << num_qubits; + cvector_t diag_ctrl(size); //make diagonal matrix with controls + + for(int_t i=0;i= (1ull << num_qubits)) && ((count == this->num_chunks_ && iChunk == 0) || BaseContainer::num_matrices_ > 1)){ + BaseContainer::StoreMatrix(diag,iChunk); + pMat = BaseContainer::matrix_pointer(iChunk); + } + else{ + //if operation is not batchable, use host memory + pMat = (thrust::complex*)&diag[0]; + BaseContainer::set_device(); + } + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = 
custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + int_t num_qubits = qubits.size(); + + BaseContainer::set_device(); + + uint_t perm_size = 1ull << num_qubits; + std::vector perm(perm_size); + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + int_t num_qubits = qubits.size(); + + BaseContainer::set_device(); + + uint_t perm_size = 1ull << num_qubits; + cvector_t diag(perm_size); + std::vector perm(perm_size); + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) +{ + uint_t size = 1ull << qubits.size(); + cvector_t diag(size); + for(int_t i=0;i +void 
cuStateVecChunkContainer::apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count) +{ + int_t num_qubits = qubits.size(); + + BaseContainer::set_device(); + + uint_t perm_size = 1ull << num_qubits; + std::vector swap(perm_size); + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count) +{ + BaseContainer::set_device(); + + int_t size = 1ull << qubits.size(); + custatevecIndex_t perm[size]; + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +double cuStateVecChunkContainer::norm(uint_t iChunk,uint_t count) const +{ + double ret = 0.0; + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void 
cuStateVecChunkContainer::probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const +{ + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + std::vector qubits32(qubits.size()); + for(int_t i=0;ichunk_bits_, + &p0, &p1, &qubits32[0], 1); + probs.resize(2); + probs[0] = p0; + probs[1] = p1; + } + else{ + probs.resize(1ull << qubits.size()); + err = custatevecAbs2SumArray(custatevec_handle_[iChunk], BaseContainer::chunk_pointer(iChunk), state_type, this->chunk_bits_, + &probs[0], &qubits32[0], qubits.size(), nullptr,nullptr,0); + } + + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::probabilities : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } +} + +template +double cuStateVecChunkContainer::expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const +{ + if(initial_phase != 1.0){ + return BaseContainer::expval_pauli(iChunk, qubits, pauli, initial_phase); + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecPauli_t pauliOps[pauli.size()]; + int32_t qubits32[qubits.size()]; + for(int_t i=0;ichunk_bits_, + ret, pauliOperatorsArray, basisBitsArray, nBasisBitsArray, 1); + + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::expval_pauli : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + return ret[0]; +} + + + +//------------------------------------------------------------------------------ +} // end namespace QV +} // end namespace AER +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +#endif // end module diff --git 
a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index 5c482cf72a..035c60ad32 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -1,7 +1,7 @@ /** * This code is part of Qiskit. * - * (C) Copyright IBM 2018, 2019, 2020. + * (C) Copyright IBM 2018, 2019, 2020, 2021, 2022. * * This code is licensed under the Apache License, Version 2.0. You may * obtain a copy of this license in the LICENSE.txt file in the root directory @@ -18,10 +18,6 @@ #include "simulators/statevector/chunk/chunk_container.hpp" -#ifdef AER_CUSTATEVEC -#include "custatevec.h" -#endif - namespace AER { namespace QV { @@ -51,8 +47,6 @@ class DeviceChunkContainer : public ChunkContainer bool creg_host_update_; - bool enable_cuStatevec_; - //for register blocking thrust::host_vector blocked_qubits_holder_; uint_t max_blocked_gates_; @@ -62,15 +56,6 @@ class DeviceChunkContainer : public ChunkContainer #ifdef AER_THRUST_CUDA std::vector stream_; //asynchronous execution - -#ifdef AER_CUSTATEVEC - //for cuStatevec - std::vector custatevec_handle_; //cuStatevec handle for this chunk container - AERDeviceVector custatevec_work_; //work buffer for cuStatevec - uint_t custatevec_work_size_; //buffer size - uint_t custatevec_chunk_total_qubits_; //total qubits of statevector passed to ApplyMatrix - uint_t custatevec_chunk_count_; //number of counts for all chunks -#endif #endif public: @@ -117,13 +102,13 @@ class DeviceChunkContainer : public ChunkContainer return raw_reference_cast(data_[i]); } - uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit,bool enable_cuStatevec); - void Deallocate(void); + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) override; + void Deallocate(void) override; - void StoreMatrix(const 
std::vector>& mat,uint_t iChunk); - void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size); - void StoreUintParams(const std::vector& prm,uint_t iChunk); - void ResizeMatrixBuffers(int bits); + void StoreMatrix(const std::vector>& mat,uint_t iChunk) override; + void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size) override; + void StoreUintParams(const std::vector& prm,uint_t iChunk) override; + void ResizeMatrixBuffers(int bits) override; void set_device(void) const { @@ -137,20 +122,6 @@ class DeviceChunkContainer : public ChunkContainer { return stream_[iChunk]; } - -#ifdef AER_CUSTATEVEC - unsigned char* custatevec_work_pointer(uint_t iChunk) const - { - if(custatevec_work_size_ == 0) - return nullptr; - if(iChunk >= this->num_chunks_){ //for buffer chunks - return ((unsigned char*)thrust::raw_pointer_cast(custatevec_work_.data())) + ((num_matrices_ + iChunk - this->num_chunks_) * custatevec_work_size_); - } - else{ - return ((unsigned char*)thrust::raw_pointer_cast(custatevec_work_.data())) + ((iChunk % num_matrices_) * custatevec_work_size_); - } - } -#endif #endif void Set(uint_t i,const thrust::complex& t) @@ -162,16 +133,15 @@ class DeviceChunkContainer : public ChunkContainer return data_[i]; } - void CopyIn(Chunk& src,uint_t iChunk); - void CopyOut(Chunk& src,uint_t iChunk); - void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size); - void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size); - void Swap(Chunk& src,uint_t iChunk); + void CopyIn(Chunk& src,uint_t iChunk) override; + void CopyOut(Chunk& src,uint_t iChunk) override; + void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size) override; + void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size) override; + void Swap(Chunk& src,uint_t iChunk) override; - void Zero(uint_t iChunk,uint_t count); + void Zero(uint_t iChunk,uint_t count) override; - reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t 
count = 1) const; - thrust::complex norm(uint_t iChunk,uint_t count,uint_t stride = 1,bool dot = true) const; + reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const override; thrust::complex* chunk_pointer(uint_t iChunk) const { @@ -262,10 +232,6 @@ class DeviceChunkContainer : public ChunkContainer //queue gate for blocked execution void queue_blocked_gate(uint_t iChunk,char gate,uint_t qubit,uint_t mask,const std::complex* pMat = NULL); - - //apply matrix using cuStatevec - void apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) override; - void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) override; }; template @@ -275,7 +241,7 @@ DeviceChunkContainer::~DeviceChunkContainer(void) } template -uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit,bool enable_cuStatevec) +uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) { uint_t nc = chunks; uint_t i; @@ -287,8 +253,6 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu device_id_ = idev; set_device(); - enable_cuStatevec_ = enable_cuStatevec; - #ifdef AER_THRUST_CUDA if(!multi_shots){ int ip,nd; @@ -374,69 +338,6 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu reduce_buffer_size_ = 1; #endif -#ifdef AER_CUSTATEVEC - if(enable_cuStatevec_){ - //initialize custatevevtor handle - custatevecStatus_t err; - - custatevec_handle_.resize(nc + buffers); - for(i=0;i> mat(1ull << (matrix_bit*2)); - - //count bits for multi-chunks - custatevec_chunk_total_qubits_ = this->num_pow2_qubits_; - custatevec_chunk_count_ = this->num_chunks_ >> (this->num_pow2_qubits_ - this->chunk_bits_); - - 
//matrix - err = custatevecApplyMatrix_bufferSize( - custatevec_handle_[0], CUDA_C_64F, custatevec_chunk_total_qubits_ , &mat[0], CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_COL, - 0, matrix_bit, 0, CUSTATEVEC_COMPUTE_64F, &custatevec_work_size_); - if(err != CUSTATEVEC_STATUS_SUCCESS){ - std::stringstream str; - str << "DeviceChunkContainer::ResizeMatrixBuffers : " << custatevecGetErrorString(err); - throw std::runtime_error(str.str()); - } - - //diagonal matrix - size_t diag_size; - std::vector perm(matrix_bit); - std::vector basis(matrix_bit); - for(i=0;i 0) - custatevec_work_.resize(custatevec_work_size_*num_matrices_); - } -#endif - reduce_buffer_size_ *= 2; reduce_buffer_.resize(reduce_buffer_size_*nc); probability_buffer_.resize(nc*QV_PROBABILITY_BUFFER_SIZE); @@ -499,15 +400,6 @@ void DeviceChunkContainer::Deallocate(void) num_blocked_qubits_.clear(); blocked_qubits_holder_.clear(); -#ifdef AER_CUSTATEVEC - custatevec_work_.clear(); - custatevec_work_.shrink_to_fit(); - for(int_t i=0;i::sample_measure(uint_t iChunk,const std::vect set_device(); -#ifdef AER_CUSTATEVEC - if(enable_cuStatevec_ && count == (1ull << (this->num_qubits_ - this->chunk_bits_))){ - //custatevecSampler_sample only can be applied to whole statevector - custatevecStatus_t err; - custatevecSamplerDescriptor_t sampler; - size_t extSize; - - cudaStreamSynchronize(stream_[iChunk]); - - cudaDataType_t state_type; - if(sizeof(data_t) == sizeof(double)) - state_type = CUDA_C_64F; - else - state_type = CUDA_C_32F; - - err = custatevecSampler_create(custatevec_handle_[iChunk], chunk_pointer(iChunk), state_type, this->num_qubits_, &sampler, SHOTS, &extSize); - if(err != CUSTATEVEC_STATUS_SUCCESS){ - std::stringstream str; - str << "DeviceChunkContainer::sample_measure : custatevecSampler_create " << custatevecGetErrorString(err); - throw std::runtime_error(str.str()); - } - - AERDeviceVector extBuf; - void* pExtBuf = nullptr; - if(extSize > 0){ - extBuf.resize(extSize); - pExtBuf = 
thrust::raw_pointer_cast(extBuf.data()); - } - - err = custatevecSampler_preprocess(custatevec_handle_[iChunk],&sampler,pExtBuf,extSize); - if(err != CUSTATEVEC_STATUS_SUCCESS){ - std::stringstream str; - str << "DeviceChunkContainer::sample_measure : custatevecSampler_preprocess " << custatevecGetErrorString(err); - throw std::runtime_error(str.str()); - } - - std::vector bitStr(SHOTS); - std::vector bitOrdering(this->num_qubits_); - for(int_t i=0;inum_qubits_;i++){ - bitOrdering[i] = i; - } - - err = custatevecSampler_sample(custatevec_handle_[iChunk], &sampler, &bitStr[0], &bitOrdering[0], this->num_qubits_, &rnds[0], SHOTS, - CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER ) ; - if(err != CUSTATEVEC_STATUS_SUCCESS){ - std::stringstream str; - str << "DeviceChunkContainer::sample_measure : custatevecSampler_sample " << custatevecGetErrorString(err); - throw std::runtime_error(str.str()); - } - - for(int_t i=0;i 0){ - extBuf.clear(); - extBuf.shrink_to_fit(); - } - return samples; - } -#endif - strided_range*> iter(chunk_pointer(iChunk), chunk_pointer(iChunk+count), stride); #ifdef AER_THRUST_CUDA @@ -844,30 +674,6 @@ reg_t DeviceChunkContainer::sample_measure(uint_t iChunk,const std::vect return samples; } -template -thrust::complex DeviceChunkContainer::norm(uint_t iChunk, uint_t count, uint_t stride, bool dot) const -{ - thrust::complex sum,zero(0.0,0.0); - set_device(); - - strided_range*> iter(chunk_pointer(iChunk), chunk_pointer(iChunk+count), stride); - -#ifdef AER_THRUST_CUDA - cudaStreamSynchronize(stream_[iChunk]); - if(dot) - sum = thrust::transform_reduce(thrust::device, iter.begin(),iter.end(),complex_norm() ,zero,thrust::plus>()); - else - sum = thrust::reduce(thrust::device, iter.begin(),iter.end(),zero,thrust::plus>()); -#else - if(dot) - sum = thrust::transform_reduce(thrust::device, iter.begin(),iter.end(),complex_norm() ,zero,thrust::plus>()); - else - sum = thrust::reduce(thrust::device, iter.begin(),iter.end(),zero,thrust::plus>()); -#endif - - 
return sum; -} - //set qubits to be blocked template @@ -1345,152 +1151,6 @@ void DeviceChunkContainer::copy_to_probability_buffer(std::vector -void DeviceChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) -{ -#ifdef AER_CUSTATEVEC - if(!enable_cuStatevec_) - return; - - thrust::complex* pMat; - int_t num_qubits = qubits.size()-control_bits; - - if((matrix_buffer_size_ >= (1ull << (num_qubits*2))) && ((count == this->num_chunks_ && iChunk == 0) || num_matrices_ > 1)){ - StoreMatrix(mat,iChunk); - pMat = matrix_pointer(iChunk); - } - else{ - //if operation is not batchable, use host memory - pMat = (thrust::complex*)&mat[0]; - set_device(); - } - - std::vector qubits32(qubits.size()); - for(int_t i=0;i 0) - pControl = &qubits32[0]; - - uint_t bits; - uint_t nc; - if(count == this->num_chunks_){ - bits = custatevec_chunk_total_qubits_; - nc = custatevec_chunk_count_; - } - else{ - nc = count; - bits = this->chunk_bits_; - if(nc > 0){ - while((nc & 1) == 0){ - nc >>= 1; - bits++; - } - } - } - cudaDataType_t state_type; - custatevecComputeType_t comp_type; - if(sizeof(data_t) == sizeof(double)){ - state_type = CUDA_C_64F; - comp_type = CUSTATEVEC_COMPUTE_64F; - } - else{ - state_type = CUDA_C_32F; - comp_type = CUSTATEVEC_COMPUTE_32F; - } - - custatevecStatus_t err; - for(int_t i=0;i -void DeviceChunkContainer::apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) -{ -#ifdef AER_CUSTATEVEC - if(!enable_cuStatevec_) - return; - - thrust::complex* pMat; - int_t num_qubits = qubits.size()-control_bits; - - if(control_bits > 0){ - //custatevecApplyGeneralizedPermutationMatrix does not support control bits??? 
- cvector_t mat(diag.size()*diag.size(),0.0); - for(int_t i=0;i= (1ull << num_qubits)) && ((count == this->num_chunks_ && iChunk == 0) || num_matrices_ > 1)){ - StoreMatrix(diag,iChunk); - pMat = matrix_pointer(iChunk); - } - else{ - //if operation is not batchable, use host memory - pMat = (thrust::complex*)&diag[0]; - set_device(); - } - - std::vector qubits32(qubits.size()); - for(int_t i=0;i 0) - pControl = &qubits32[0]; - - uint_t bits; - uint_t nc; - if(count == this->num_chunks_){ - bits = custatevec_chunk_total_qubits_; - nc = custatevec_chunk_count_; - } - else{ - nc = count; - bits = this->chunk_bits_; - if(nc > 0){ - while((nc & 1) == 0){ - nc >>= 1; - bits++; - } - } - } - - cudaDataType_t state_type; - if(sizeof(data_t) == sizeof(double)) - state_type = CUDA_C_64F; - else - state_type = CUDA_C_32F; - - custatevecStatus_t err; - for(int_t i=0;i return data_[i]; } - uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit,bool enable_cuStatevec = false); - void Deallocate(void); + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) override; + void Deallocate(void) override; - void StoreMatrix(const std::vector>& mat,uint_t iChunk) + void StoreMatrix(const std::vector>& mat,uint_t iChunk) override { matrix_[iChunk] = (thrust::complex*)&mat[0]; } - void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size) + void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size) override { matrix_[iChunk] = (thrust::complex*)mat; } - void StoreUintParams(const std::vector& prm,uint_t iChunk) + void StoreUintParams(const std::vector& prm,uint_t iChunk) override { params_[iChunk] = (uint_t*)&prm[0]; } void ResizeMatrixBuffers(int bits){} - void Set(uint_t i,const thrust::complex& t) + void Set(uint_t i,const thrust::complex& t) override { data_[i] = t; } - thrust::complex Get(uint_t i) const + thrust::complex Get(uint_t i) const 
override { return data_[i]; } - thrust::complex* chunk_pointer(uint_t iChunk) const + thrust::complex* chunk_pointer(uint_t iChunk) const override { return (thrust::complex*)thrust::raw_pointer_cast(data_.data()) + (iChunk << this->chunk_bits_); } - thrust::complex* matrix_pointer(uint_t iChunk) const + thrust::complex* matrix_pointer(uint_t iChunk) const override { return matrix_[iChunk]; } - uint_t* param_pointer(uint_t iChunk) const + uint_t* param_pointer(uint_t iChunk) const override { return params_[iChunk]; } @@ -104,16 +104,15 @@ class HostChunkContainer : public ChunkContainer #endif } - void CopyIn(Chunk& src,uint_t iChunk); - void CopyOut(Chunk& src,uint_t iChunk); - void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size); - void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size); - void Swap(Chunk& src,uint_t iChunk); + void CopyIn(Chunk& src,uint_t iChunk) override; + void CopyOut(Chunk& src,uint_t iChunk) override; + void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size) override; + void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size) override; + void Swap(Chunk& src,uint_t iChunk) override; - void Zero(uint_t iChunk,uint_t count); + void Zero(uint_t iChunk,uint_t count) override; - reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const; - thrust::complex norm(uint_t iChunk,uint_t count,uint_t stride = 1,bool dot = true) const; + reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const override; }; @@ -124,7 +123,7 @@ HostChunkContainer::~HostChunkContainer(void) } template -uint_t HostChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit, bool enable_cuStatevec) +uint_t HostChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) { uint_t nc = chunks; uint_t i; 
@@ -267,20 +266,6 @@ reg_t HostChunkContainer::sample_measure(uint_t iChunk,const std::vector return samples; } -template -thrust::complex HostChunkContainer::norm(uint_t iChunk, uint_t count, uint_t stride, bool dot) const -{ - thrust::complex sum,zero(0.0,0.0); - - strided_range*> iter(chunk_pointer(iChunk), chunk_pointer(iChunk+count), stride); - - if(dot) - sum = thrust::transform_reduce(thrust::omp::par, iter.begin(),iter.end(),complex_norm() ,zero,thrust::plus>()); - else - sum = thrust::reduce(thrust::omp::par, iter.begin(),iter.end(),zero,thrust::plus>()); - - return sum; -} //------------------------------------------------------------------------------ } // end namespace QV diff --git a/src/simulators/statevector/chunk/thrust_kernels.hpp b/src/simulators/statevector/chunk/thrust_kernels.hpp new file mode 100644 index 0000000000..701eb05e0a --- /dev/null +++ b/src/simulators/statevector/chunk/thrust_kernels.hpp @@ -0,0 +1,2697 @@ +/** + * This code is part of Qiskit. + * + * (C) Copyright IBM 2018, 2019, 2020. + * + * This code is licensed under the Apache License, Version 2.0. You may + * obtain a copy of this license in the LICENSE.txt file in the root directory + * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. + * + * Any modifications or derivative works of this code must retain this + * copyright notice, and modified files need to carry a notice indicating + * that they have been altered from the originals. 
+ */ + + +#ifndef _qv_thrust_kernels_hpp_ +#define _qv_thrust_kernels_hpp_ + +#include "misc/warnings.hpp" +DISABLE_WARNING_PUSH +#ifdef AER_THRUST_CUDA +#include +#include +#endif +DISABLE_WARNING_POP + +#include "misc/wrap_thrust.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "framework/utils.hpp" + +#ifdef AER_THRUST_CUDA +#include "simulators/statevector/chunk/cuda_kernels.hpp" +#endif + +namespace AER { +namespace QV { + +//======================================== +// base class of gate functions +//======================================== +template +class GateFuncBase +{ +protected: + thrust::complex* data_; //pointer to state vector buffer + thrust::complex* matrix_; //storage for matrix on device + uint_t* params_; //storage for additional parameters on device + uint_t base_index_; //start index of state vector + uint_t chunk_bits_; + uint_t* cregs_; + uint_t num_creg_bits_; + int_t conditional_bit_; +#ifndef AER_THRUST_CUDA + uint_t index_offset_; +#endif +public: + GateFuncBase() + { + data_ = NULL; + base_index_ = 0; + cregs_ = NULL; + num_creg_bits_ = 0; + conditional_bit_ = -1; +#ifndef AER_THRUST_CUDA + index_offset_ = 0; +#endif + } + virtual void set_data(thrust::complex* p) + { + data_ = p; + } + void set_matrix(thrust::complex* mat) + { + matrix_ = mat; + } + void set_params(uint_t* p) + { + params_ = p; + } + void set_chunk_bits(uint_t bits) + { + chunk_bits_ = bits; + } + + void set_base_index(uint_t i) + { + base_index_ = i; + } + void set_cregs_(uint_t* cbits,uint_t nreg) + { + cregs_ = cbits; + num_creg_bits_ = nreg; + } + void set_conditional(int_t bit) + { + conditional_bit_ = bit; + } + +#ifndef AER_THRUST_CUDA + void set_index_offset(uint_t i) + { + index_offset_ = i; + } +#endif + + __host__ __device__ thrust::complex* data(void) + { + return data_; + } + + virtual bool is_diagonal(void) + { + return false; + } + virtual int qubits_count(void) + { + return 1; + } + 
virtual int num_control_bits(void) + { + return 0; + } + virtual int control_mask(void) + { + return 1; + } + virtual bool use_cache(void) + { + return false; + } + virtual bool batch_enable(void) + { + return true; + } + + virtual const char* name(void) + { + return "base function"; + } + virtual uint_t size(int num_qubits) + { + if(is_diagonal()){ + chunk_bits_ = num_qubits; + return (1ull << num_qubits); + } + else{ + chunk_bits_ = num_qubits - (qubits_count() - num_control_bits()); + return (1ull << (num_qubits - (qubits_count() - num_control_bits()))); + } + } + + virtual __host__ __device__ uint_t thread_to_index(uint_t _tid) const + { + return _tid; + } + virtual __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + //implemente this in the kernel class + } + virtual __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + //implemente this in the kernel class + return 0.0; + } + + virtual __host__ __device__ bool check_conditional(uint_t i) const + { + if(conditional_bit_ < 0) + return true; + + uint_t iChunk = i >> chunk_bits_; + uint_t n64,i64,ibit; + n64 = (num_creg_bits_ + 63) >> 6; + i64 = conditional_bit_ >> 6; + ibit = conditional_bit_ & 63; + return (((cregs_[iChunk*n64 + i64] >> ibit) & 1) != 0); + } +}; + +//======================================== + // gate functions with cache +//======================================== +template +class GateFuncWithCache : public GateFuncBase +{ +protected: + int nqubits_; +public: + GateFuncWithCache(uint_t nq) + { + nqubits_ = nq; + } + + bool use_cache(void) + { + return true; + } + + __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const + { + uint_t idx,ii,t,j; + uint_t* qubits; + uint_t* qubits_sorted; + + qubits_sorted = this->params_; + qubits = qubits_sorted + nqubits_; + + idx = 0; + ii = _tid >> nqubits_; + for(j=0;j> j) & 1) != 0){ + idx += (1ull << qubits[j]); + } + } + idx += ii; + return 
idx; + } + + __host__ __device__ void sync_threads() const + { +#ifdef CUDA_ARCH + __syncthreads(); +#endif + } + + __host__ __device__ void operator()(const uint_t &i) const + { + if(!this->check_conditional(i)) + return; + + thrust::complex cache[1024]; + uint_t j,idx; + uint_t matSize = 1ull << nqubits_; + + //load data to cache + for(j=0;jdata_[idx]; + } + + //execute using cache + for(j=0;jrun_with_cache(j,idx,cache); + } + } + + virtual int qubits_count(void) + { + return nqubits_; + } +}; + +template +class GateFuncSumWithCache : public GateFuncBase +{ +protected: + int nqubits_; +public: + GateFuncSumWithCache(uint_t nq) + { + nqubits_ = nq; + } + + bool use_cache(void) + { + return true; + } + + + __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const + { + uint_t idx,ii,t,j; + uint_t* qubits; + uint_t* qubits_sorted; + + qubits_sorted = this->params_; + qubits = qubits_sorted + nqubits_; + + idx = 0; + ii = _tid >> nqubits_; + for(j=0;j> j) & 1) != 0){ + idx += (1ull << qubits[j]); + } + } + idx += ii; + return idx; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + if(!this->check_conditional(i)) + return 0.0; + + thrust::complex cache[1024]; + uint_t j,idx; + uint_t matSize = 1ull << nqubits_; + double sum = 0.0; + + //load data to cache + for(j=0;jdata_[idx]; + } + + //execute using cache + for(j=0;jrun_with_cache_sum(j,idx,cache); + } + return sum; + } + + virtual int qubits_count(void) + { + return nqubits_; + } + +}; + +//stridded iterator to access diagonal probabilities +template +class strided_range +{ + public: + + typedef typename thrust::iterator_difference::type difference_type; + + struct stride_functor : public thrust::unary_function + { + difference_type stride; + + stride_functor(difference_type stride) + : stride(stride) {} + + __host__ __device__ + difference_type operator()(const difference_type& i) const + { + if(stride == 1) //statevector + return i; + + //density matrix + difference_type 
i_chunk; + i_chunk = i / (stride - 1); + difference_type ret = stride * i - i_chunk*(stride-1); + return ret; + } + }; + + typedef typename thrust::counting_iterator CountingIterator; + typedef typename thrust::transform_iterator TransformIterator; + typedef typename thrust::permutation_iterator PermutationIterator; + + // type of the strided_range iterator + typedef PermutationIterator iterator; + + // construct strided_range for the range [first,last) + strided_range(Iterator first, Iterator last, difference_type stride) + : first(first), last(last), stride(stride) {} + + iterator begin(void) const + { + return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); + } + + iterator end(void) const + { + if(stride == 1) //statevector + return begin() + (last - first); + + //density matrix + return begin() + (last - first) / (stride-1); + } + + protected: + Iterator first; + Iterator last; + difference_type stride; +}; + +template +struct complex_dot_scan : public thrust::unary_function,thrust::complex> +{ + __host__ __device__ + thrust::complex operator()(thrust::complex x) { return thrust::complex(x.real()*x.real()+x.imag()*x.imag(),0); } +}; + +template +struct complex_norm : public thrust::unary_function,thrust::complex> +{ + __host__ __device__ + thrust::complex operator()(thrust::complex x) { return thrust::complex((double)x.real()*(double)x.real(),(double)x.imag()*(double)x.imag()); } +}; + +template +struct complex_less +{ + typedef thrust::complex first_argument_type; + typedef thrust::complex second_argument_type; + typedef bool result_type; + __thrust_exec_check_disable__ + __host__ __device__ bool operator()(const thrust::complex &lhs, const thrust::complex &rhs) const {return lhs.real() < rhs.real();} +}; // end less + + +class HostFuncBase +{ +protected: +public: + HostFuncBase(){} + + virtual void execute(){} +}; + + +//------------------------------------------------------------------------------ +// State 
initialize component +//------------------------------------------------------------------------------ +template +class initialize_component_1qubit_func : public GateFuncBase +{ +protected: + thrust::complex s0,s1; + uint_t mask; + uint_t offset; +public: + initialize_component_1qubit_func(int qubit,thrust::complex state0,thrust::complex state1) + { + s0 = state0; + s1 = state1; + + mask = (1ull << qubit) - 1; + offset = 1ull << qubit; + } + + virtual __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec0[i0]; + + vec0[i0] = s0*q0; + vec1[i0] = s1*q0; + } + + const char* name(void) + { + return "initialize_component 1 qubit"; + } +}; + +template +class initialize_component_func : public GateFuncBase +{ +protected: + int nqubits; + uint_t matSize; +public: + initialize_component_func(const cvector_t& mat,const reg_t &qb) + { + nqubits = qb.size(); + matSize = 1ull << nqubits; + } + + int qubits_count(void) + { + return nqubits; + } + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q; + thrust::complex* state; + uint_t* qubits; + uint_t* qubits_sorted; + uint_t j,k; + uint_t ii,idx,t; + uint_t mask; + + //get parameters from iterator + vec = this->data_; + state = this->matrix_; + qubits = this->params_; + qubits_sorted = qubits + nqubits; + + idx = 0; + ii = i; + for(j=0;j> j) & 1) != 0) + ii += (1ull << qubits[j]); + } + q = q0 * state[k]; + vec[ii] = q; + } + } + + const char* name(void) + { + return "initialize_component"; + } +}; + +template +class initialize_large_component_func : public GateFuncBase +{ +protected: + int num_qubits_; + uint_t mask_; + uint_t cmask_; + thrust::complex init_; +public: + initialize_large_component_func(thrust::complex m,const reg_t& qubits,int i) 
+ { + num_qubits_ = qubits.size(); + init_ = m; + + mask_ = 0; + cmask_ = 0; + for(int k=0;k> k) & 1) != 0){ + cmask_ |= (1ull << qubits[k]); + } + } + } + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q; + vec = this->data_; + if((i & mask_) == cmask_){ + q = vec[i]; + vec[i] = init_*q; + } + } + const char* name(void) + { + return "initialize_large_component"; + } +}; + +//------------------------------------------------------------------------------ +// Zero clear +//------------------------------------------------------------------------------ +template +class ZeroClear : public GateFuncBase +{ +protected: +public: + ZeroClear() {} + bool is_diagonal(void) + { + return true; + } + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + vec = this->data_; + vec[i] = 0.0; + } + const char* name(void) + { + return "zero"; + } +}; + + +//------------------------------------------------------------------------------ +// Initialize state +//------------------------------------------------------------------------------ +template +class initialize_kernel : public GateFuncBase +{ +protected: + int num_qubits_state_; + uint_t offset_; + thrust::complex init_val_; +public: + initialize_kernel(thrust::complex v,int nqs,uint_t offset) + { + num_qubits_state_ = nqs; + offset_ = offset; + init_val_ = v; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + uint_t iChunk = (i >> num_qubits_state_); + + vec = this->data_; + + if(i == iChunk * offset_){ + vec[i] = init_val_; + } + else{ + vec[i] = 0.0; + } + } + const char* name(void) + { + return "initialize"; + } +}; + +//------------------------------------------------------------------------------ +// Matrix multiplication 
+//------------------------------------------------------------------------------ +template +class MatrixMult2x2 : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + int qubit; + uint_t mask; + uint_t offset0; + +public: + MatrixMult2x2(const cvector_t& mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + + mask = (1ull << qubit) - 1; + + offset0 = 1ull << qubit; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + vec1 = vec0 + offset0; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = m0 * q0 + m2 * q1; + vec1[i0] = m1 * q0 + m3 * q1; + } + const char* name(void) + { + return "mult2x2"; + } +}; + + +template +class MatrixMult4x4 : public GateFuncBase +{ +protected: + thrust::complex m00,m10,m20,m30; + thrust::complex m01,m11,m21,m31; + thrust::complex m02,m12,m22,m32; + thrust::complex m03,m13,m23,m33; + uint_t mask0; + uint_t mask1; + uint_t offset0; + uint_t offset1; + +public: + MatrixMult4x4(const cvector_t& mat,int qubit0,int qubit1) + { + m00 = mat[0]; + m01 = mat[1]; + m02 = mat[2]; + m03 = mat[3]; + + m10 = mat[4]; + m11 = mat[5]; + m12 = mat[6]; + m13 = mat[7]; + + m20 = mat[8]; + m21 = mat[9]; + m22 = mat[10]; + m23 = mat[11]; + + m30 = mat[12]; + m31 = mat[13]; + m32 = mat[14]; + m33 = mat[15]; + + offset0 = 1ull << qubit0; + offset1 = 1ull << qubit1; + if(qubit0 < qubit1){ + mask0 = offset0 - 1; + mask1 = offset1 - 1; + } + else{ + mask0 = offset1 - 1; + mask1 = offset0 - 1; + } + } + + int qubits_count(void) + { + return 2; + } + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1,i2; + thrust::complex* vec0; + thrust::complex* vec1; + thrust::complex* vec2; + thrust::complex* vec3; + thrust::complex q0,q1,q2,q3; + + vec0 = this->data_; + + i0 = i & mask0; + i2 = (i - i0) << 1; + i1 = 
i2 & mask1; + i2 = (i2 - i1) << 1; + + i0 = i0 + i1 + i2; + + vec1 = vec0 + offset0; + vec2 = vec0 + offset1; + vec3 = vec2 + offset0; + + q0 = vec0[i0]; + q1 = vec1[i0]; + q2 = vec2[i0]; + q3 = vec3[i0]; + + vec0[i0] = m00 * q0 + m10 * q1 + m20 * q2 + m30 * q3; + vec1[i0] = m01 * q0 + m11 * q1 + m21 * q2 + m31 * q3; + vec2[i0] = m02 * q0 + m12 * q1 + m22 * q2 + m32 * q3; + vec3[i0] = m03 * q0 + m13 * q1 + m23 * q2 + m33 * q3; + } + const char* name(void) + { + return "mult4x4"; + } +}; + +template +class MatrixMult8x8 : public GateFuncBase +{ +protected: + uint_t offset0; + uint_t offset1; + uint_t offset2; + uint_t mask0; + uint_t mask1; + uint_t mask2; + +public: + MatrixMult8x8(const reg_t &qubit,const reg_t &qubit_ordered) + { + offset0 = (1ull << qubit[0]); + offset1 = (1ull << qubit[1]); + offset2 = (1ull << qubit[2]); + + mask0 = (1ull << qubit_ordered[0]) - 1; + mask1 = (1ull << qubit_ordered[1]) - 1; + mask2 = (1ull << qubit_ordered[2]) - 1; + } + + int qubits_count(void) + { + return 3; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1,i2,i3; + thrust::complex* vec; + thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; + thrust::complex m0,m1,m2,m3,m4,m5,m6,m7; + thrust::complex* pMat; + + vec = this->data_; + pMat = this->matrix_; + + i0 = i & mask0; + i3 = (i - i0) << 1; + i1 = i3 & mask1; + i3 = (i3 - i1) << 1; + i2 = i3 & mask2; + i3 = (i3 - i2) << 1; + + i0 = i0 + i1 + i2 + i3; + + q0 = vec[i0]; + q1 = vec[i0 + offset0]; + q2 = vec[i0 + offset1]; + q3 = vec[i0 + offset1 + offset0]; + q4 = vec[i0 + offset2]; + q5 = vec[i0 + offset2 + offset0]; + q6 = vec[i0 + offset2 + offset1]; + q7 = vec[i0 + offset2 + offset1 + offset0]; + + m0 = pMat[0]; + m1 = pMat[8]; + m2 = pMat[16]; + m3 = pMat[24]; + m4 = pMat[32]; + m5 = pMat[40]; + m6 = pMat[48]; + m7 = pMat[56]; + + vec[i0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[1]; + m1 = pMat[9]; + m2 = pMat[17]; + m3 = pMat[25]; + m4 = 
pMat[33]; + m5 = pMat[41]; + m6 = pMat[49]; + m7 = pMat[57]; + + vec[i0 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[2]; + m1 = pMat[10]; + m2 = pMat[18]; + m3 = pMat[26]; + m4 = pMat[34]; + m5 = pMat[42]; + m6 = pMat[50]; + m7 = pMat[58]; + + vec[i0 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[3]; + m1 = pMat[11]; + m2 = pMat[19]; + m3 = pMat[27]; + m4 = pMat[35]; + m5 = pMat[43]; + m6 = pMat[51]; + m7 = pMat[59]; + + vec[i0 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[4]; + m1 = pMat[12]; + m2 = pMat[20]; + m3 = pMat[28]; + m4 = pMat[36]; + m5 = pMat[44]; + m6 = pMat[52]; + m7 = pMat[60]; + + vec[i0 + offset2] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[5]; + m1 = pMat[13]; + m2 = pMat[21]; + m3 = pMat[29]; + m4 = pMat[37]; + m5 = pMat[45]; + m6 = pMat[53]; + m7 = pMat[61]; + + vec[i0 + offset2 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[6]; + m1 = pMat[14]; + m2 = pMat[22]; + m3 = pMat[30]; + m4 = pMat[38]; + m5 = pMat[46]; + m6 = pMat[54]; + m7 = pMat[62]; + + vec[i0 + offset2 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[7]; + m1 = pMat[15]; + m2 = pMat[23]; + m3 = pMat[31]; + m4 = pMat[39]; + m5 = pMat[47]; + m6 = pMat[55]; + m7 = pMat[63]; + + vec[i0 + offset2 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + } + const char* name(void) + { + return "mult8x8"; + } +}; + +template +class MatrixMult16x16 : public GateFuncBase +{ +protected: + uint_t offset0; + uint_t offset1; + uint_t offset2; + uint_t offset3; + uint_t mask0; + uint_t mask1; + uint_t mask2; + uint_t mask3; +public: + MatrixMult16x16(const reg_t &qubit,const reg_t &qubit_ordered) + 
{ + offset0 = (1ull << qubit[0]); + offset1 = (1ull << qubit[1]); + offset2 = (1ull << qubit[2]); + offset3 = (1ull << qubit[3]); + + mask0 = (1ull << qubit_ordered[0]) - 1; + mask1 = (1ull << qubit_ordered[1]) - 1; + mask2 = (1ull << qubit_ordered[2]) - 1; + mask3 = (1ull << qubit_ordered[3]) - 1; + } + + int qubits_count(void) + { + return 4; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1,i2,i3,i4,offset,f0,f1,f2; + thrust::complex* vec; + thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; + thrust::complex q8,q9,q10,q11,q12,q13,q14,q15; + thrust::complex r; + thrust::complex* pMat; + int j; + + vec = this->data_; + pMat = this->matrix_; + + i0 = i & mask0; + i4 = (i - i0) << 1; + i1 = i4 & mask1; + i4 = (i4 - i1) << 1; + i2 = i4 & mask2; + i4 = (i4 - i2) << 1; + i3 = i4 & mask3; + i4 = (i4 - i3) << 1; + + i0 = i0 + i1 + i2 + i3 + i4; + + q0 = vec[i0]; + q1 = vec[i0 + offset0]; + q2 = vec[i0 + offset1]; + q3 = vec[i0 + offset1 + offset0]; + q4 = vec[i0 + offset2]; + q5 = vec[i0 + offset2 + offset0]; + q6 = vec[i0 + offset2 + offset1]; + q7 = vec[i0 + offset2 + offset1 + offset0]; + q8 = vec[i0 + offset3]; + q9 = vec[i0 + offset3 + offset0]; + q10 = vec[i0 + offset3 + offset1]; + q11 = vec[i0 + offset3 + offset1 + offset0]; + q12 = vec[i0 + offset3 + offset2]; + q13 = vec[i0 + offset3 + offset2 + offset0]; + q14 = vec[i0 + offset3 + offset2 + offset1]; + q15 = vec[i0 + offset3 + offset2 + offset1 + offset0]; + + offset = 0; + f0 = 0; + f1 = 0; + f2 = 0; + for(j=0;j<16;j++){ + r = pMat[0+j]*q0; + r += pMat[16+j]*q1; + r += pMat[32+j]*q2; + r += pMat[48+j]*q3; + r += pMat[64+j]*q4; + r += pMat[80+j]*q5; + r += pMat[96+j]*q6; + r += pMat[112+j]*q7; + r += pMat[128+j]*q8; + r += pMat[144+j]*q9; + r += pMat[160+j]*q10; + r += pMat[176+j]*q11; + r += pMat[192+j]*q12; + r += pMat[208+j]*q13; + r += pMat[224+j]*q14; + r += pMat[240+j]*q15; + + offset = offset3 * (((uint_t)j >> 3) & 1) + + offset2 * (((uint_t)j >> 2) & 1) + + offset1 * 
(((uint_t)j >> 1) & 1) + + offset0 * ((uint_t)j & 1); + + vec[i0 + offset] = r; + } + } + const char* name(void) + { + return "mult16x16"; + } +}; + +template +class MatrixMultNxN : public GateFuncWithCache +{ +protected: +public: + MatrixMultNxN(uint_t nq) : GateFuncWithCache(nq) + { + ; + } + + __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + uint_t j,threadID; + thrust::complex q,r; + thrust::complex m; + uint_t mat_size,irow; + thrust::complex* vec; + thrust::complex* pMat; + + vec = this->data_; + pMat = this->matrix_; + + mat_size = 1ull << this->nqubits_; + irow = _tid & (mat_size - 1); + + r = 0.0; + for(j=0;j +class MatrixMultNxN_LU : public GateFuncBase +{ +protected: + int nqubits; + uint_t matSize; + int nswap; +public: + MatrixMultNxN_LU(const cvector_t& mat,const reg_t &qb,cvector_t& matLU,reg_t& params) + { + uint_t i,j,k,imax; + std::complex c0,c1; + double d,dmax; + uint_t* pSwap; + + nqubits = qb.size(); + matSize = 1ull << nqubits; + + matLU = mat; + params.resize(nqubits + matSize*2); + + for(k=0;k dmax){ + dmax = d; + imax = j; + } + } + if(imax != i){ + j = params[nqubits + imax]; + params[nqubits + imax] = params[nqubits + i]; + params[nqubits + i] = j; + } + + if(dmax != 0){ + c0 = matLU[(i << nqubits) + params[nqubits + i]]; + + for(j=i+1;j q,qt; + thrust::complex m; + thrust::complex r; + uint_t j,k,l,iq; + uint_t ii,idx,t; + uint_t mask,offset_j,offset_k; + thrust::complex* vec; + thrust::complex* pMat; + uint_t* qubits; + uint_t* pivot; + uint_t* table; + + vec = this->data_; + pMat = this->matrix_; + qubits = this->params_; + + pivot = qubits + nqubits; + table = pivot + matSize; + + idx = 0; + ii = i; + for(j=0;j> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + q = vec[offset_k+idx]; + + r += m*q; + } + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + vec[offset_j+idx] = r; + } + + //mult L + for(j=matSize-1;j>0;j--){ + offset_j = 0; + 
for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + r = vec[offset_j+idx]; + + for(k=0;k> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + q = vec[offset_k+idx]; + + r += m*q; + } + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + vec[offset_j+idx] = r; + } + + //swap results + if(nswap > 0){ + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + q = vec[offset_j+idx]; + k = pivot[table[0]]; + for(j=1;j> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + qt = vec[offset_j+idx]; + + offset_k = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + vec[offset_k+idx] = q; + q = qt; + k = pivot[table[j]]; + } + offset_k = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + vec[offset_k+idx] = q; + } + } + const char* name(void) + { + return "multNxN"; + } +}; + +template +class MatrixMult2x2Controlled : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + uint_t mask; + uint_t cmask; + uint_t offset; + int nqubits; +public: + MatrixMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) + { + int i; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + nqubits = qubits.size(); + + offset = 1ull << qubits[nqubits-1]; + mask = (1ull << qubits[nqubits-1]) - 1; + cmask = 0; + for(i=0;i q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + if((i0 & cmask) == cmask){ + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = m0 * q0 + m2 * q1; + vec1[i0] = m1 * q0 + m3 * q1; + } + } + const char* name(void) + { + return "matrix_Cmult2x2"; + } +}; + +//------------------------------------------------------------------------------ +// Diagonal matrix multiplication +//------------------------------------------------------------------------------ +template +class DiagonalMult2x2 : public GateFuncBase +{ 
+protected: + thrust::complex m0,m1; + int qubit; +public: + + DiagonalMult2x2(const cvector_t& mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + thrust::complex m; + uint_t gid; + + vec = this->data_; + gid = this->base_index_; + + q = vec[i]; + if((((i + gid) >> qubit) & 1) == 0){ + m = m0; + } + else{ + m = m1; + } + + vec[i] = m * q; + } + const char* name(void) + { + return "diagonal_mult2x2"; + } +}; + +template +class DiagonalMult4x4 : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + int qubit0; + int qubit1; +public: + + DiagonalMult4x4(const cvector_t& mat,int q0,int q1) + { + qubit0 = q0; + qubit1 = q1; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + } + + bool is_diagonal(void) + { + return true; + } + int qubits_count(void) + { + return 2; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + thrust::complex m; + uint_t gid; + + vec = this->data_; + gid = this->base_index_; + + q = vec[i]; + if((((i+gid) >> qubit1) & 1) == 0){ + if((((i+gid) >> qubit0) & 1) == 0){ + m = m0; + } + else{ + m = m1; + } + } + else{ + if((((i+gid) >> qubit0) & 1) == 0){ + m = m2; + } + else{ + m = m3; + } + } + + vec[i] = m * q; + } + const char* name(void) + { + return "diagonal_mult4x4"; + } +}; + +template +class DiagonalMultNxN : public GateFuncBase +{ +protected: + int nqubits; +public: + DiagonalMultNxN(const reg_t &qb) + { + nqubits = qb.size(); + } + + bool is_diagonal(void) + { + return true; + } + int qubits_count(void) + { + return nqubits; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t j,im; + thrust::complex* vec; + thrust::complex q; + thrust::complex m; + thrust::complex* pMat; + uint_t* qubits; + uint_t gid; + + vec = this->data_; + gid = this->base_index_; + 
+ pMat = this->matrix_; + qubits = this->params_; + + im = 0; + for(j=0;j> qubits[j]) & 1) != 0){ + im += (1 << j); + } + } + + q = vec[i]; + m = pMat[im]; + + vec[i] = m * q; + } + const char* name(void) + { + return "diagonal_multNxN"; + } +}; + +template +class DiagonalMult2x2Controlled : public GateFuncBase +{ +protected: + thrust::complex m0,m1; + uint_t mask; + uint_t cmask; + int nqubits; +public: + DiagonalMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + m0 = mat[0]; + m1 = mat[1]; + + mask = (1ull << qubits[nqubits-1]) - 1; + cmask = 0; + for(i=0;i* vec; + thrust::complex q0; + thrust::complex m; + + vec = this->data_; + gid = this->base_index_; + + if(((i + gid) & cmask) == cmask){ + if((i + gid) & mask){ + m = m1; + } + else{ + m = m0; + } + + q0 = vec[i]; + vec[i] = m*q0; + } + } + const char* name(void) + { + return "diagonal_Cmult2x2"; + } +}; + +//------------------------------------------------------------------------------ +// Permutation +//------------------------------------------------------------------------------ +template +class Permutation : public GateFuncBase +{ +protected: + uint_t nqubits; + uint_t npairs; + +public: + Permutation(const reg_t& qubits_sorted,const reg_t& qubits,const std::vector> &pairs,reg_t& params) + { + uint_t j,k; + uint_t offset0,offset1; + + nqubits = qubits.size(); + npairs = pairs.size(); + + params.resize(nqubits + npairs*2); + + for(j=0;j> k) & 1) != 0){ + offset0 += (1ull << qubits[k]); + } + if(((pairs[j].second >> k) & 1) != 0){ + offset1 += (1ull << qubits[k]); + } + } + params[nqubits + j*2 ] = offset0; + params[nqubits + j*2+1] = offset1; + } + } + int qubits_count(void) + { + return nqubits; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + uint_t j; + uint_t ii,idx,t; + uint_t* mask; + uint_t* pairs; + + vec = this->data_; + mask = this->params_; + pairs = 
mask + nqubits; + + idx = 0; + ii = i; + for(j=0;j +class CX_func : public GateFuncBase +{ +protected: + uint_t offset; + uint_t mask; + uint_t cmask; + int nqubits; + int qubit_t; +public: + + CX_func(const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + qubit_t = qubits[nqubits-1]; + offset = 1ull << qubit_t; + mask = offset - 1; + + cmask = 0; + for(i=0;i q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + if((i0 & cmask) == cmask){ + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = q1; + vec1[i0] = q0; + } + } + const char* name(void) + { + return "CX"; + } +}; + +//------------------------------------------------------------------------------ +// Y gate +//------------------------------------------------------------------------------ +template +class CY_func : public GateFuncBase +{ +protected: + uint_t mask; + uint_t cmask; + uint_t offset; + int nqubits; + int qubit_t; +public: + CY_func(const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + qubit_t = qubits[nqubits-1]; + offset = (1ull << qubit_t); + mask = (1ull << qubit_t) - 1; + + cmask = 0; + for(i=0;i q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + if((i0 & cmask) == cmask){ + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = thrust::complex(q1.imag(),-q1.real()); + vec1[i0] = thrust::complex(-q0.imag(),q0.real()); + } + } + const char* name(void) + { + return "CY"; + } +}; + +//------------------------------------------------------------------------------ +// Swap gate +//------------------------------------------------------------------------------ +template +class CSwap_func : public GateFuncBase +{ +protected: + uint_t mask0; + uint_t mask1; + uint_t cmask; + int nqubits; + int qubit_t0; + int qubit_t1; + uint_t offset1; + uint_t offset2; +public: + + 
CSwap_func(const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + if(qubits[nqubits-2] < qubits[nqubits-1]){ + qubit_t0 = qubits[nqubits-2]; + qubit_t1 = qubits[nqubits-1]; + } + else{ + qubit_t1 = qubits[nqubits-2]; + qubit_t0 = qubits[nqubits-1]; + } + mask0 = (1ull << qubit_t0) - 1; + mask1 = (1ull << qubit_t1) - 1; + + offset1 = 1ull << qubit_t0; + offset2 = 1ull << qubit_t1; + + cmask = 0; + for(i=0;i q1,q2; + thrust::complex* vec1; + thrust::complex* vec2; + + vec1 = this->data_; + + vec2 = vec1 + offset2; + vec1 = vec1 + offset1; + + i0 = i & mask0; + i2 = (i - i0) << 1; + i1 = i2 & mask1; + i2 = (i2 - i1) << 1; + + i0 = i0 + i1 + i2; + + if((i0 & cmask) == cmask){ + q1 = vec1[i0]; + q2 = vec2[i0]; + vec1[i0] = q2; + vec2[i0] = q1; + } + } + const char* name(void) + { + return "CSWAP"; + } +}; + +//swap operator between chunks +template +class CSwapChunk_func : public GateFuncBase +{ +protected: + uint_t mask; + thrust::complex* vec0; + thrust::complex* vec1; + bool write_back_; + bool swap_all_; +public: + + CSwapChunk_func(const reg_t &qubits,uint_t block_bits,thrust::complex* pVec0,thrust::complex* pVec1,bool wb) + { + int i; + int nqubits; + int qubit_t; + nqubits = qubits.size(); + + if(qubits[nqubits-2] < qubits[nqubits-1]){ + qubit_t = qubits[nqubits-2]; + } + else{ + qubit_t = qubits[nqubits-1]; + } + mask = (1ull << qubit_t) - 1; + + vec0 = pVec0; + vec1 = pVec1; + + write_back_ = wb; + if(qubit_t >= block_bits) + swap_all_ = true; + else + swap_all_ = false; + } + + bool batch_enable(void) + { + return false; + } + bool is_diagonal(void) + { + return swap_all_; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0,q1; + + i0 = i & mask; + i1 = (i - i0) << 1; + i0 += i1; + + q0 = vec0[i0]; + q1 = vec1[i0]; + vec0[i0] = q1; + if(write_back_) + vec1[i0] = q0; + } + const char* name(void) + { + return "Chunk SWAP"; + } +}; + + 
+//------------------------------------------------------------------------------ +// Phase gate +//------------------------------------------------------------------------------ +template +class phase_func : public GateFuncBase +{ +protected: + thrust::complex phase; + uint_t mask; + int nqubits; +public: + phase_func(const reg_t &qubits,thrust::complex p) + { + int i; + nqubits = qubits.size(); + phase = p; + + mask = 0; + for(i=0;i* vec; + thrust::complex q0; + + vec = this->data_; + gid = this->base_index_; + + if(((i+gid) & mask) == mask){ + q0 = vec[i]; + vec[i] = q0 * phase; + } + } + const char* name(void) + { + return "phase"; + } +}; + +//------------------------------------------------------------------------------ +// Norm functions +//------------------------------------------------------------------------------ +template +class norm_func : public GateFuncBase +{ +protected: +public: + norm_func(void) + { + + } + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + double d; + + vec = this->data_; + q = vec[i]; + d = (double)(q.real()*q.real() + q.imag()*q.imag()); + return d; + } + + const char* name(void) + { + return "norm"; + } +}; + +template +class trace_func : public GateFuncBase +{ +protected: + uint_t rows_; +public: + trace_func(uint_t nrow) + { + rows_ = nrow; + } + bool is_diagonal(void) + { + return true; + } + uint_t size(int num_qubits) + { + this->chunk_bits_ = num_qubits; + return rows_; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + + uint_t iChunk = (i / rows_); + uint_t lid = i - (iChunk * rows_); + uint_t idx = (iChunk << this->chunk_bits_) + lid*(rows_ + 1); + + vec = this->data_; + q = vec[idx]; + return q.real(); + } + + const char* name(void) + { + return "trace"; + } +}; + +template +class NormMatrixMultNxN : public GateFuncSumWithCache +{ 
+protected: +public: + NormMatrixMultNxN(uint_t nq) : GateFuncSumWithCache(nq) + { + ; + } + + __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + uint_t j; + thrust::complex q,r; + thrust::complex m; + uint_t mat_size,irow; + thrust::complex* vec; + thrust::complex* pMat; + + vec = this->data_; + pMat = this->matrix_; + + mat_size = 1ull << this->nqubits_; + irow = _tid & (mat_size - 1); + + r = 0.0; + for(j=0;j +class NormDiagonalMultNxN : public GateFuncBase +{ +protected: + int nqubits; +public: + NormDiagonalMultNxN(const reg_t &qb) + { + nqubits = qb.size(); + } + + bool is_diagonal(void) + { + return true; + } + int qubits_count(void) + { + return nqubits; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + uint_t im,j,gid; + thrust::complex q; + thrust::complex m,r; + thrust::complex* pMat; + thrust::complex* vec; + uint_t* qubits; + + vec = this->data_; + pMat = this->matrix_; + qubits = this->params_; + gid = this->base_index_; + + im = 0; + for(j=0;j +class NormMatrixMult2x2 : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + int qubit; + uint_t mask; + uint_t offset; +public: + NormMatrixMult2x2(const cvector_t &mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + + offset = 1ull << qubit; + mask = (1ull << qubit) - 1; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex* vec; + thrust::complex q0,q1; + thrust::complex r0,r1; + double sum = 0.0; + + vec = this->data_; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec[i0]; + q1 = vec[offset+i0]; + + r0 = m0 * q0 + m2 * q1; + sum += r0.real()*r0.real() + r0.imag()*r0.imag(); + r1 = m1 * q0 + m3 * q1; + sum += r1.real()*r1.real() + r1.imag()*r1.imag(); + return sum; + } + const char* name(void) + { + return "Norm_mult2x2"; + } +}; + +template +class NormDiagonalMult2x2 : public GateFuncBase +{ +protected: + 
thrust::complex m0,m1; + int qubit; +public: + NormDiagonalMult2x2(cvector_t &mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + uint_t gid; + thrust::complex* vec; + thrust::complex q; + thrust::complex m,r; + + vec = this->data_; + gid = this->base_index_; + + q = vec[i]; + if((((i+gid) >> qubit) & 1) == 0){ + m = m0; + } + else{ + m = m1; + } + + r = m * q; + + return (r.real()*r.real() + r.imag()*r.imag()); + } + const char* name(void) + { + return "Norm_diagonal_mult2x2"; + } +}; + +//------------------------------------------------------------------------------ +// Probabilities +//------------------------------------------------------------------------------ +template +class probability_func : public GateFuncBase +{ +protected: + uint_t mask; + uint_t cmask; +public: + probability_func(const reg_t &qubits,int i) + { + int k; + int nq = qubits.size(); + + mask = 0; + cmask = 0; + for(k=0;k> k) & 1) != 0){ + cmask |= (1ull << qubits[k]); + } + } + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + double ret; + + vec = this->data_; + + ret = 0.0; + + if((i & mask) == cmask){ + q = vec[i]; + ret = q.real()*q.real() + q.imag()*q.imag(); + } + return ret; + } + + const char* name(void) + { + return "probabilities"; + } +}; + +template +class probability_1qubit_func : public GateFuncBase +{ +protected: + uint_t offset; +public: + probability_1qubit_func(const uint_t qubit) + { + offset = 1ull << qubit; + } + + __host__ __device__ thrust::complex operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + thrust::complex ret; + double d0,d1; + + vec0 = this->data_; + vec1 = vec0 + offset; + + i1 = i & (offset - 1); + i0 = (i - i1) << 1; + i0 += i1; + 
+ q0 = vec0[i0]; + q1 = vec1[i0]; + + d0 = (double)(q0.real()*q0.real() + q0.imag()*q0.imag()); + d1 = (double)(q1.real()*q1.real() + q1.imag()*q1.imag()); + + ret = thrust::complex(d0,d1); + return ret; + } + + const char* name(void) + { + return "probabilities_1qubit"; + } +}; + +//------------------------------------------------------------------------------ +// Expectation values +//------------------------------------------------------------------------------ +inline __host__ __device__ uint_t pop_count_kernel(uint_t val) +{ + uint_t count = val; + count = (count & 0x5555555555555555) + ((count >> 1) & 0x5555555555555555); + count = (count & 0x3333333333333333) + ((count >> 2) & 0x3333333333333333); + count = (count & 0x0f0f0f0f0f0f0f0f) + ((count >> 4) & 0x0f0f0f0f0f0f0f0f); + count = (count & 0x00ff00ff00ff00ff) + ((count >> 8) & 0x00ff00ff00ff00ff); + count = (count & 0x0000ffff0000ffff) + ((count >> 16) & 0x0000ffff0000ffff); + count = (count & 0x00000000ffffffff) + ((count >> 32) & 0x00000000ffffffff); + return count; +} + +//special case Z only +template +class expval_pauli_Z_func : public GateFuncBase +{ +protected: + uint_t z_mask_; + +public: + expval_pauli_Z_func(uint_t z) + { + z_mask_ = z; + } + + bool is_diagonal(void) + { + return true; + } + bool batch_enable(void) + { + return false; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + double ret = 0.0; + + vec = this->data_; + + q0 = vec[i]; + ret = q0.real()*q0.real() + q0.imag()*q0.imag(); + + if(z_mask_ != 0){ + if(pop_count_kernel(i & z_mask_) & 1) + ret = -ret; + } + + return ret; + } + const char* name(void) + { + return "expval_pauli_Z"; + } +}; + +template +class expval_pauli_XYZ_func : public GateFuncBase +{ +protected: + uint_t x_mask_; + uint_t z_mask_; + uint_t mask_l_; + uint_t mask_u_; + thrust::complex phase_; +public: + expval_pauli_XYZ_func(uint_t x,uint_t z,uint_t x_max,std::complex p) + { + x_mask_ = x; + 
z_mask_ = z; + phase_ = p; + + mask_u_ = ~((1ull << (x_max+1)) - 1); + mask_l_ = (1ull << x_max) - 1; + } + bool batch_enable(void) + { + return false; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + thrust::complex q0p; + thrust::complex q1p; + double d0,d1,ret = 0.0; + uint_t idx0,idx1; + + vec = this->data_; + + idx0 = ((i << 1) & mask_u_) | (i & mask_l_); + idx1 = idx0 ^ x_mask_; + + q0 = vec[idx0]; + q1 = vec[idx1]; + q0p = q1 * phase_; + q1p = q0 * phase_; + d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); + d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); + + if(z_mask_ != 0){ + if(pop_count_kernel(idx0 & z_mask_) & 1) + ret = -d0; + else + ret = d0; + if(pop_count_kernel(idx1 & z_mask_) & 1) + ret -= d1; + else + ret += d1; + } + else{ + ret = d0 + d1; + } + + return ret; + } + const char* name(void) + { + return "expval_pauli_XYZ"; + } +}; + +template +class expval_pauli_inter_chunk_func : public GateFuncBase +{ +protected: + uint_t x_mask_; + uint_t z_mask_; + thrust::complex phase_; + thrust::complex* pair_chunk_; + uint_t z_count_; + uint_t z_count_pair_; +public: + expval_pauli_inter_chunk_func(uint_t x,uint_t z,std::complex p,thrust::complex* pair_chunk,uint_t zc,uint_t zcp) + { + x_mask_ = x; + z_mask_ = z; + phase_ = p; + + pair_chunk_ = pair_chunk; + z_count_ = zc; + z_count_pair_ = zcp; + } + + bool is_diagonal(void) + { + return true; + } + bool batch_enable(void) + { + return false; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + thrust::complex q0p; + thrust::complex q1p; + double d0,d1,ret = 0.0; + uint_t ip; + + vec = this->data_; + + ip = i ^ x_mask_; + q0 = vec[i]; + q1 = pair_chunk_[ip]; + q0p = q1 * phase_; + q1p = q0 * phase_; + d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); + d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); + + 
if((pop_count_kernel(i & z_mask_) + z_count_) & 1) + ret = -d0; + else + ret = d0; + if((pop_count_kernel(ip & z_mask_) + z_count_pair_) & 1) + ret -= d1; + else + ret += d1; + + return ret; + } + const char* name(void) + { + return "expval_pauli_inter_chunk"; + } +}; + +//------------------------------------------------------------------------------ +// Pauli application +//------------------------------------------------------------------------------ +template +class multi_pauli_func : public GateFuncBase +{ +protected: + uint_t x_mask_; + uint_t z_mask_; + uint_t mask_l_; + uint_t mask_u_; + thrust::complex phase_; + uint_t nqubits_; +public: + multi_pauli_func(uint_t x,uint_t z,uint_t x_max,std::complex p) + { + x_mask_ = x; + z_mask_ = z; + phase_ = p; + + mask_u_ = ~((1ull << (x_max+1)) - 1); + mask_l_ = (1ull << x_max) - 1; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + uint_t idx0,idx1; + + vec = this->data_; + + idx0 = ((i << 1) & mask_u_) | (i & mask_l_); + idx1 = idx0 ^ x_mask_; + + q0 = vec[idx0]; + q1 = vec[idx1]; + + if(z_mask_ != 0){ + if(pop_count_kernel(idx0 & z_mask_) & 1) + q0 *= -1; + + if(pop_count_kernel(idx1 & z_mask_) & 1) + q1 *= -1; + } + vec[idx0] = q1 * phase_; + vec[idx1] = q0 * phase_; + } + const char* name(void) + { + return "multi_pauli"; + } +}; + +//special case Z only +template +class multi_pauli_Z_func : public GateFuncBase +{ +protected: + uint_t z_mask_; + thrust::complex phase_; +public: + multi_pauli_Z_func(uint_t z,std::complex p) + { + z_mask_ = z; + phase_ = p; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + + vec = this->data_; + + q0 = vec[i]; + + if(z_mask_ != 0){ + if(pop_count_kernel(i & z_mask_) & 1) + q0 = -q0; + } + vec[i] = q0 * phase_; + } + const char* name(void) + { + return "multi_pauli_Z"; + } +}; 
+ + +//------------------------------------------------------------------------------ +} // end namespace QV +} // end namespace AER +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +#endif // end module diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index fa76daab78..1ade632a14 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -740,161 +740,10 @@ AER::Vector> QubitVectorThrust::move_to_vector() return AER::Vector>::copy_from_buffer(data_size_, &ret[0]); } + //------------------------------------------------------------------------------ // State initialize component //------------------------------------------------------------------------------ -template -class initialize_component_1qubit_func : public GateFuncBase -{ -protected: - thrust::complex s0,s1; - uint_t mask; - uint_t offset; -public: - initialize_component_1qubit_func(int qubit,thrust::complex state0,thrust::complex state1) - { - s0 = state0; - s1 = state1; - - mask = (1ull << qubit) - 1; - offset = 1ull << qubit; - } - - virtual __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - q0 = vec0[i0]; - - vec0[i0] = s0*q0; - vec1[i0] = s1*q0; - } - - const char* name(void) - { - return "initialize_component 1 qubit"; - } -}; - -template -class initialize_component_func : public GateFuncBase -{ -protected: - int nqubits; - uint_t matSize; -public: - initialize_component_func(const cvector_t& mat,const reg_t &qb) - { - nqubits = qb.size(); - matSize = 1ull << nqubits; - } - - int qubits_count(void) - { - return nqubits; - } - __host__ __device__ void 
operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q; - thrust::complex* state; - uint_t* qubits; - uint_t* qubits_sorted; - uint_t j,k; - uint_t ii,idx,t; - uint_t mask; - - //get parameters from iterator - vec = this->data_; - state = this->matrix_; - qubits = this->params_; - qubits_sorted = qubits + nqubits; - - idx = 0; - ii = i; - for(j=0;j> j) & 1) != 0) - ii += (1ull << qubits[j]); - } - q = q0 * state[k]; - vec[ii] = q; - } - } - - const char* name(void) - { - return "initialize_component"; - } -}; - -template -class initialize_large_component_func : public GateFuncBase -{ -protected: - int num_qubits_; - uint_t mask_; - uint_t cmask_; - thrust::complex init_; -public: - initialize_large_component_func(thrust::complex m,const reg_t& qubits,int i) - { - num_qubits_ = qubits.size(); - init_ = m; - - mask_ = 0; - cmask_ = 0; - for(int k=0;k> k) & 1) != 0){ - cmask_ |= (1ull << qubits[k]); - } - } - } - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q; - vec = this->data_; - if((i & mask_) == cmask_){ - q = vec[i]; - vec[i] = init_*q; - } - } - const char* name(void) - { - return "initialize_large_component"; - } -}; - template void QubitVectorThrust::initialize_component(const reg_t &qubits, const cvector_t &state0) { @@ -928,29 +777,6 @@ void QubitVectorThrust::initialize_component(const reg_t &qubits, const //------------------------------------------------------------------------------ // Utility //------------------------------------------------------------------------------ - -template -class ZeroClear : public GateFuncBase -{ -protected: -public: - ZeroClear() {} - bool is_diagonal(void) - { - return true; - } - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - vec = this->data_; - vec[i] = 0.0; - } - const char* name(void) - { - return "zero"; - } -}; - 
template void QubitVectorThrust::zero() { @@ -965,7 +791,6 @@ void QubitVectorThrust::zero() #endif } - template bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks, std::string& device_name) { @@ -994,7 +819,7 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t //only first chunk call allocation function if(chunk_bits > 0 && num_qubits > 0){ chunk_manager_ = std::make_shared>(); - chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,max_matrix_bits_, enable_cuStatevec_); + chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,chunk_index_,max_matrix_bits_, enable_cuStatevec_); } multi_chunk_distribution_ = false; @@ -1296,47 +1121,6 @@ bool QubitVectorThrust::enable_batch(bool flg) //------------------------------------------------------------------------------ // Initialization //------------------------------------------------------------------------------ - -template -class initialize_kernel : public GateFuncBase -{ -protected: - int num_qubits_state_; - uint_t offset_; - thrust::complex init_val_; -public: - initialize_kernel(thrust::complex v,int nqs,uint_t offset) - { - num_qubits_state_ = nqs; - offset_ = offset; - init_val_ = v; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - uint_t iChunk = (i >> num_qubits_state_); - - vec = this->data_; - - if(i == iChunk * offset_){ - vec[i] = init_val_; - } - else{ - vec[i] = 0.0; - } - } - const char* name(void) - { - return "initialize"; - } -}; - template void QubitVectorThrust::initialize() { @@ -1607,1114 +1391,101 @@ void QubitVectorThrust::set_json_chop_threshold(double threshold) { * MATRIX MULTIPLICATION * ******************************************************************************/ + + template -class MatrixMult2x2 : public GateFuncBase +void QubitVectorThrust::apply_matrix(const reg_t &qubits, + const 
cvector_t &mat) { -protected: - thrust::complex m0,m1,m2,m3; - int qubit; - uint_t mask; - uint_t offset0; + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch -public: - MatrixMult2x2(const cvector_t& mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; + const size_t N = qubits.size(); + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); - mask = (1ull << qubit) - 1; + if(N == 1 && register_blocking_) + chunk_.queue_blocked_gate('u',qubits[0],0,&mat[0]); + else + chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); +} - offset0 = 1ull << qubit; - } +template +void QubitVectorThrust::apply_multiplexer(const reg_t &control_qubits, + const reg_t &target_qubits, + const cvector_t &mat) +{ + const size_t control_count = control_qubits.size(); + const size_t target_count = target_qubits.size(); + const uint_t DIM = 1ull << (target_count+control_count); + const uint_t columns = 1ull << target_count; + const uint_t blocks = 1ull << control_count; - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; + auto qubits = target_qubits; + for (const auto &q : control_qubits) {qubits.push_back(q);} + size_t N = qubits.size(); - vec0 = this->data_; - vec1 = vec0 + offset0; + cvector_t matMP(DIM*DIM,0.0); + uint_t b,i,j; - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; + //make DIMxDIM matrix + for(b = 0; b < blocks; b++){ + for(i = 0; i < columns; i++){ + for(j = 0; j < columns; j++){ + matMP[(i+b*columns) + DIM*(b*columns+j)] += mat[i+b*columns + DIM * j]; + } + } + } - q0 = vec0[i0]; - q1 = vec1[i0]; +#ifdef AER_DEBUG + DebugMsg("apply_multiplexer",control_qubits); + DebugMsg(" ",target_qubits); +#endif - vec0[i0] = m0 * q0 + m2 * q1; - vec1[i0] = m1 * q0 + m3 * q1; - } - const char* name(void) - { - 
return "mult2x2"; - } -}; + apply_matrix(qubits,matMP); +} template -class MatrixMult4x4 : public GateFuncBase +void QubitVectorThrust::apply_diagonal_matrix(const reg_t &qubits, + const cvector_t &diag) { -protected: - thrust::complex m00,m10,m20,m30; - thrust::complex m01,m11,m21,m31; - thrust::complex m02,m12,m22,m32; - thrust::complex m03,m13,m23,m33; - uint_t mask0; - uint_t mask1; - uint_t offset0; - uint_t offset1; - -public: - MatrixMult4x4(const cvector_t& mat,int qubit0,int qubit1) - { - m00 = mat[0]; - m01 = mat[1]; - m02 = mat[2]; - m03 = mat[3]; - - m10 = mat[4]; - m11 = mat[5]; - m12 = mat[6]; - m13 = mat[7]; - - m20 = mat[8]; - m21 = mat[9]; - m22 = mat[10]; - m23 = mat[11]; - - m30 = mat[12]; - m31 = mat[13]; - m32 = mat[14]; - m33 = mat[15]; - - offset0 = 1ull << qubit0; - offset1 = 1ull << qubit1; - if(qubit0 < qubit1){ - mask0 = offset0 - 1; - mask1 = offset1 - 1; - } - else{ - mask0 = offset1 - 1; - mask1 = offset0 - 1; - } - } - - int qubits_count(void) - { - return 2; - } - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1,i2; - thrust::complex* vec0; - thrust::complex* vec1; - thrust::complex* vec2; - thrust::complex* vec3; - thrust::complex q0,q1,q2,q3; + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch - vec0 = this->data_; + const int_t N = qubits.size(); + if(N == 1 && register_blocking_) + chunk_.queue_blocked_gate('d',qubits[0],0,&diag[0]); + else + chunk_.apply_diagonal_matrix(qubits,0,diag,chunk_.container()->num_chunks()); +} - i0 = i & mask0; - i2 = (i - i0) << 1; - i1 = i2 & mask1; - i2 = (i2 - i1) << 1; - i0 = i0 + i1 + i2; +template +void QubitVectorThrust::apply_permutation_matrix(const reg_t& qubits, + const std::vector> &pairs) +{ + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch - vec1 = vec0 + offset0; - vec2 = vec0 
+ offset1; - vec3 = vec2 + offset0; + chunk_.apply_permutation(qubits,pairs,chunk_.container()->num_chunks()); +} - q0 = vec0[i0]; - q1 = vec1[i0]; - q2 = vec2[i0]; - q3 = vec3[i0]; - vec0[i0] = m00 * q0 + m10 * q1 + m20 * q2 + m30 * q3; - vec1[i0] = m01 * q0 + m11 * q1 + m21 * q2 + m31 * q3; - vec2[i0] = m02 * q0 + m12 * q1 + m22 * q2 + m32 * q3; - vec3[i0] = m03 * q0 + m13 * q1 + m23 * q2 + m33 * q3; - } - const char* name(void) - { - return "mult4x4"; - } -}; +/******************************************************************************* + * + * APPLY OPTIMIZED GATES + * + ******************************************************************************/ +//------------------------------------------------------------------------------ +// Multi-controlled gates +//------------------------------------------------------------------------------ template -class MatrixMult8x8 : public GateFuncBase +void QubitVectorThrust::apply_mcx(const reg_t &qubits) { -protected: - uint_t offset0; - uint_t offset1; - uint_t offset2; - uint_t mask0; - uint_t mask1; - uint_t mask2; - -public: - MatrixMult8x8(const reg_t &qubit,const reg_t &qubit_ordered) - { - offset0 = (1ull << qubit[0]); - offset1 = (1ull << qubit[1]); - offset2 = (1ull << qubit[2]); - - mask0 = (1ull << qubit_ordered[0]) - 1; - mask1 = (1ull << qubit_ordered[1]) - 1; - mask2 = (1ull << qubit_ordered[2]) - 1; - } - - int qubits_count(void) - { - return 3; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1,i2,i3; - thrust::complex* vec; - thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; - thrust::complex m0,m1,m2,m3,m4,m5,m6,m7; - thrust::complex* pMat; - - vec = this->data_; - pMat = this->matrix_; - - i0 = i & mask0; - i3 = (i - i0) << 1; - i1 = i3 & mask1; - i3 = (i3 - i1) << 1; - i2 = i3 & mask2; - i3 = (i3 - i2) << 1; - - i0 = i0 + i1 + i2 + i3; - - q0 = vec[i0]; - q1 = vec[i0 + offset0]; - q2 = vec[i0 + offset1]; - q3 = vec[i0 + offset1 + offset0]; - q4 = vec[i0 + offset2]; - q5 = 
vec[i0 + offset2 + offset0]; - q6 = vec[i0 + offset2 + offset1]; - q7 = vec[i0 + offset2 + offset1 + offset0]; - - m0 = pMat[0]; - m1 = pMat[8]; - m2 = pMat[16]; - m3 = pMat[24]; - m4 = pMat[32]; - m5 = pMat[40]; - m6 = pMat[48]; - m7 = pMat[56]; - - vec[i0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[1]; - m1 = pMat[9]; - m2 = pMat[17]; - m3 = pMat[25]; - m4 = pMat[33]; - m5 = pMat[41]; - m6 = pMat[49]; - m7 = pMat[57]; - - vec[i0 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[2]; - m1 = pMat[10]; - m2 = pMat[18]; - m3 = pMat[26]; - m4 = pMat[34]; - m5 = pMat[42]; - m6 = pMat[50]; - m7 = pMat[58]; - - vec[i0 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[3]; - m1 = pMat[11]; - m2 = pMat[19]; - m3 = pMat[27]; - m4 = pMat[35]; - m5 = pMat[43]; - m6 = pMat[51]; - m7 = pMat[59]; - - vec[i0 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[4]; - m1 = pMat[12]; - m2 = pMat[20]; - m3 = pMat[28]; - m4 = pMat[36]; - m5 = pMat[44]; - m6 = pMat[52]; - m7 = pMat[60]; - - vec[i0 + offset2] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[5]; - m1 = pMat[13]; - m2 = pMat[21]; - m3 = pMat[29]; - m4 = pMat[37]; - m5 = pMat[45]; - m6 = pMat[53]; - m7 = pMat[61]; - - vec[i0 + offset2 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[6]; - m1 = pMat[14]; - m2 = pMat[22]; - m3 = pMat[30]; - m4 = pMat[38]; - m5 = pMat[46]; - m6 = pMat[54]; - m7 = pMat[62]; - - vec[i0 + offset2 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[7]; - m1 = pMat[15]; - m2 = pMat[23]; - m3 = pMat[31]; - m4 = pMat[39]; - m5 = pMat[47]; - m6 = pMat[55]; - m7 = pMat[63]; - - vec[i0 + offset2 + offset1 + offset0] = m0 * q0 + 
m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - } - const char* name(void) - { - return "mult8x8"; - } -}; - -template -class MatrixMult16x16 : public GateFuncBase -{ -protected: - uint_t offset0; - uint_t offset1; - uint_t offset2; - uint_t offset3; - uint_t mask0; - uint_t mask1; - uint_t mask2; - uint_t mask3; -public: - MatrixMult16x16(const reg_t &qubit,const reg_t &qubit_ordered) - { - offset0 = (1ull << qubit[0]); - offset1 = (1ull << qubit[1]); - offset2 = (1ull << qubit[2]); - offset3 = (1ull << qubit[3]); - - mask0 = (1ull << qubit_ordered[0]) - 1; - mask1 = (1ull << qubit_ordered[1]) - 1; - mask2 = (1ull << qubit_ordered[2]) - 1; - mask3 = (1ull << qubit_ordered[3]) - 1; - } - - int qubits_count(void) - { - return 4; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1,i2,i3,i4,offset,f0,f1,f2; - thrust::complex* vec; - thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; - thrust::complex q8,q9,q10,q11,q12,q13,q14,q15; - thrust::complex r; - thrust::complex* pMat; - int j; - - vec = this->data_; - pMat = this->matrix_; - - i0 = i & mask0; - i4 = (i - i0) << 1; - i1 = i4 & mask1; - i4 = (i4 - i1) << 1; - i2 = i4 & mask2; - i4 = (i4 - i2) << 1; - i3 = i4 & mask3; - i4 = (i4 - i3) << 1; - - i0 = i0 + i1 + i2 + i3 + i4; - - q0 = vec[i0]; - q1 = vec[i0 + offset0]; - q2 = vec[i0 + offset1]; - q3 = vec[i0 + offset1 + offset0]; - q4 = vec[i0 + offset2]; - q5 = vec[i0 + offset2 + offset0]; - q6 = vec[i0 + offset2 + offset1]; - q7 = vec[i0 + offset2 + offset1 + offset0]; - q8 = vec[i0 + offset3]; - q9 = vec[i0 + offset3 + offset0]; - q10 = vec[i0 + offset3 + offset1]; - q11 = vec[i0 + offset3 + offset1 + offset0]; - q12 = vec[i0 + offset3 + offset2]; - q13 = vec[i0 + offset3 + offset2 + offset0]; - q14 = vec[i0 + offset3 + offset2 + offset1]; - q15 = vec[i0 + offset3 + offset2 + offset1 + offset0]; - - offset = 0; - f0 = 0; - f1 = 0; - f2 = 0; - for(j=0;j<16;j++){ - r = pMat[0+j]*q0; - r += pMat[16+j]*q1; - r += 
pMat[32+j]*q2; - r += pMat[48+j]*q3; - r += pMat[64+j]*q4; - r += pMat[80+j]*q5; - r += pMat[96+j]*q6; - r += pMat[112+j]*q7; - r += pMat[128+j]*q8; - r += pMat[144+j]*q9; - r += pMat[160+j]*q10; - r += pMat[176+j]*q11; - r += pMat[192+j]*q12; - r += pMat[208+j]*q13; - r += pMat[224+j]*q14; - r += pMat[240+j]*q15; - - offset = offset3 * (((uint_t)j >> 3) & 1) + - offset2 * (((uint_t)j >> 2) & 1) + - offset1 * (((uint_t)j >> 1) & 1) + - offset0 * ((uint_t)j & 1); - - vec[i0 + offset] = r; - } - } - const char* name(void) - { - return "mult16x16"; - } -}; - -template -class MatrixMultNxN : public GateFuncWithCache -{ -protected: -public: - MatrixMultNxN(uint_t nq) : GateFuncWithCache(nq) - { - ; - } - - __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - uint_t j,threadID; - thrust::complex q,r; - thrust::complex m; - uint_t mat_size,irow; - thrust::complex* vec; - thrust::complex* pMat; - - vec = this->data_; - pMat = this->matrix_; - - mat_size = 1ull << this->nqubits_; - irow = _tid & (mat_size - 1); - - r = 0.0; - for(j=0;j -class MatrixMultNxN_LU : public GateFuncBase -{ -protected: - int nqubits; - uint_t matSize; - int nswap; -public: - MatrixMultNxN_LU(const cvector_t& mat,const reg_t &qb,cvector_t& matLU,reg_t& params) - { - uint_t i,j,k,imax; - std::complex c0,c1; - double d,dmax; - uint_t* pSwap; - - nqubits = qb.size(); - matSize = 1ull << nqubits; - - matLU = mat; - params.resize(nqubits + matSize*2); - - for(k=0;k dmax){ - dmax = d; - imax = j; - } - } - if(imax != i){ - j = params[nqubits + imax]; - params[nqubits + imax] = params[nqubits + i]; - params[nqubits + i] = j; - } - - if(dmax != 0){ - c0 = matLU[(i << nqubits) + params[nqubits + i]]; - - for(j=i+1;j q,qt; - thrust::complex m; - thrust::complex r; - uint_t j,k,l,iq; - uint_t ii,idx,t; - uint_t mask,offset_j,offset_k; - thrust::complex* vec; - thrust::complex* pMat; - uint_t* qubits; - uint_t* pivot; - uint_t* table; - - vec = this->data_; - 
pMat = this->matrix_; - qubits = this->params_; - - pivot = qubits + nqubits; - table = pivot + matSize; - - idx = 0; - ii = i; - for(j=0;j> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - q = vec[offset_k+idx]; - - r += m*q; - } - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - vec[offset_j+idx] = r; - } - - //mult L - for(j=matSize-1;j>0;j--){ - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - r = vec[offset_j+idx]; - - for(k=0;k> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - q = vec[offset_k+idx]; - - r += m*q; - } - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - vec[offset_j+idx] = r; - } - - //swap results - if(nswap > 0){ - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - q = vec[offset_j+idx]; - k = pivot[table[0]]; - for(j=1;j> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - qt = vec[offset_j+idx]; - - offset_k = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - vec[offset_k+idx] = q; - q = qt; - k = pivot[table[j]]; - } - offset_k = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - vec[offset_k+idx] = q; - } - } - const char* name(void) - { - return "multNxN"; - } -}; - - - -template -void QubitVectorThrust::apply_matrix(const reg_t &qubits, - const cvector_t &mat) -{ - if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return; //first chunk execute all in batch - - if(enable_cuStatevec_){ - return chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); - } - - const size_t N = qubits.size(); - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - - if(N == 1){ - if(register_blocking_){ - chunk_.queue_blocked_gate('u',qubits[0],0,&mat[0]); - } - else{ - apply_function(MatrixMult2x2(mat,qubits[0])); - } - } - else if(N == 2){ - 
apply_function(MatrixMult4x4(mat,qubits[0],qubits[1])); - } - else if(N <= 10){ - int i; - for(i=0;i(N), mat, qubits_sorted); - } - else{ - cvector_t matLU; - reg_t params; - MatrixMultNxN_LU f(mat,qubits_sorted,matLU,params); - - apply_function(f, matLU, params); - } - -} - -template -void QubitVectorThrust::apply_multiplexer(const reg_t &control_qubits, - const reg_t &target_qubits, - const cvector_t &mat) -{ - const size_t control_count = control_qubits.size(); - const size_t target_count = target_qubits.size(); - const uint_t DIM = 1ull << (target_count+control_count); - const uint_t columns = 1ull << target_count; - const uint_t blocks = 1ull << control_count; - - auto qubits = target_qubits; - for (const auto &q : control_qubits) {qubits.push_back(q);} - size_t N = qubits.size(); - - cvector_t matMP(DIM*DIM,0.0); - uint_t b,i,j; - - //make DIMxDIM matrix - for(b = 0; b < blocks; b++){ - for(i = 0; i < columns; i++){ - for(j = 0; j < columns; j++){ - matMP[(i+b*columns) + DIM*(b*columns+j)] += mat[i+b*columns + DIM * j]; - } - } - } - -#ifdef AER_DEBUG - DebugMsg("apply_multiplexer",control_qubits); - DebugMsg(" ",target_qubits); -#endif - - apply_matrix(qubits,matMP); -} - -template -class DiagonalMult2x2 : public GateFuncBase -{ -protected: - thrust::complex m0,m1; - int qubit; -public: - - DiagonalMult2x2(const cvector_t& mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - thrust::complex m; - uint_t gid; - - vec = this->data_; - gid = this->base_index_; - - q = vec[i]; - if((((i + gid) >> qubit) & 1) == 0){ - m = m0; - } - else{ - m = m1; - } - - vec[i] = m * q; - } - const char* name(void) - { - return "diagonal_mult2x2"; - } -}; - -template -class DiagonalMult4x4 : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - int qubit0; - int qubit1; -public: - - 
DiagonalMult4x4(const cvector_t& mat,int q0,int q1) - { - qubit0 = q0; - qubit1 = q1; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - } - - bool is_diagonal(void) - { - return true; - } - int qubits_count(void) - { - return 2; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - thrust::complex m; - uint_t gid; - - vec = this->data_; - gid = this->base_index_; - - q = vec[i]; - if((((i+gid) >> qubit1) & 1) == 0){ - if((((i+gid) >> qubit0) & 1) == 0){ - m = m0; - } - else{ - m = m1; - } - } - else{ - if((((i+gid) >> qubit0) & 1) == 0){ - m = m2; - } - else{ - m = m3; - } - } - - vec[i] = m * q; - } - const char* name(void) - { - return "diagonal_mult4x4"; - } -}; - -template -class DiagonalMultNxN : public GateFuncBase -{ -protected: - int nqubits; -public: - DiagonalMultNxN(const reg_t &qb) - { - nqubits = qb.size(); - } - - bool is_diagonal(void) - { - return true; - } - int qubits_count(void) - { - return nqubits; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t j,im; - thrust::complex* vec; - thrust::complex q; - thrust::complex m; - thrust::complex* pMat; - uint_t* qubits; - uint_t gid; - - vec = this->data_; - gid = this->base_index_; - - pMat = this->matrix_; - qubits = this->params_; - - im = 0; - for(j=0;j> qubits[j]) & 1) != 0){ - im += (1 << j); - } - } - - q = vec[i]; - m = pMat[im]; - - vec[i] = m * q; - } - const char* name(void) - { - return "diagonal_multNxN"; - } -}; - -template -void QubitVectorThrust::apply_diagonal_matrix(const reg_t &qubits, - const cvector_t &diag) -{ - if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return; //first chunk execute all in batch - - if(enable_cuStatevec_){ - return chunk_.apply_diagonal_matrix(qubits,0,diag,chunk_.container()->num_chunks()); - } - - const int_t N = qubits.size(); - - if(N == 1){ - if(register_blocking_){ - 
chunk_.queue_blocked_gate('d',qubits[0],0,&diag[0]); - } - else{ - apply_function(DiagonalMult2x2(diag,qubits[0])); - } - } - else if(N == 2){ - apply_function(DiagonalMult4x4(diag,qubits[0],qubits[1])); - } - else{ - apply_function(DiagonalMultNxN(qubits), diag, qubits); - } -} - - -template -class Permutation : public GateFuncBase -{ -protected: - uint_t nqubits; - uint_t npairs; - -public: - Permutation(const reg_t& qubits_sorted,const reg_t& qubits,const std::vector> &pairs,reg_t& params) - { - uint_t j,k; - uint_t offset0,offset1; - - nqubits = qubits.size(); - npairs = pairs.size(); - - params.resize(nqubits + npairs*2); - - for(j=0;j> k) & 1) != 0){ - offset0 += (1ull << qubits[k]); - } - if(((pairs[j].second >> k) & 1) != 0){ - offset1 += (1ull << qubits[k]); - } - } - params[nqubits + j*2 ] = offset0; - params[nqubits + j*2+1] = offset1; - } - } - int qubits_count(void) - { - return nqubits; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - uint_t j; - uint_t ii,idx,t; - uint_t* mask; - uint_t* pairs; - - vec = this->data_; - mask = this->params_; - pairs = mask + nqubits; - - idx = 0; - ii = i; - for(j=0;j -void QubitVectorThrust::apply_permutation_matrix(const reg_t& qubits, - const std::vector> &pairs) -{ - const size_t N = qubits.size(); - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - - reg_t params; - Permutation f(qubits_sorted,qubits,pairs,params); -// chunk_.StoreUintParams(params); - - apply_function(f, cvector_t(), params); -} - - -/******************************************************************************* - * - * APPLY OPTIMIZED GATES - * - ******************************************************************************/ - -//------------------------------------------------------------------------------ -// Multi-controlled gates -//------------------------------------------------------------------------------ - 
-template -class CX_func : public GateFuncBase -{ -protected: - uint_t offset; - uint_t mask; - uint_t cmask; - int nqubits; - int qubit_t; -public: - - CX_func(const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - qubit_t = qubits[nqubits-1]; - offset = 1ull << qubit_t; - mask = offset - 1; - - cmask = 0; - for(i=0;i q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - if((i0 & cmask) == cmask){ - q0 = vec0[i0]; - q1 = vec1[i0]; - - vec0[i0] = q1; - vec1[i0] = q0; - } - } - const char* name(void) - { - return "CX"; - } -}; - -template -void QubitVectorThrust::apply_mcx(const reg_t &qubits) -{ - if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return; //first chunk execute all in batch + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch if(register_blocking_){ int i; @@ -2725,74 +1496,11 @@ void QubitVectorThrust::apply_mcx(const reg_t &qubits) chunk_.queue_blocked_gate('x',qubits[qubits.size()-1],mask); } else{ - apply_function(CX_func(qubits)); + chunk_.apply_X(qubits, chunk_.container()->num_chunks()); } } -template -class CY_func : public GateFuncBase -{ -protected: - uint_t mask; - uint_t cmask; - uint_t offset; - int nqubits; - int qubit_t; -public: - CY_func(const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - qubit_t = qubits[nqubits-1]; - offset = (1ull << qubit_t); - mask = (1ull << qubit_t) - 1; - - cmask = 0; - for(i=0;i q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - if((i0 & cmask) == cmask){ - q0 = vec0[i0]; - q1 = vec1[i0]; - - vec0[i0] = thrust::complex(q1.imag(),-q1.real()); - vec1[i0] = thrust::complex(-q0.imag(),q0.real()); - } - } - const char* name(void) - { - 
return "CY"; - } -}; - template void QubitVectorThrust::apply_mcy(const reg_t &qubits) { @@ -2808,172 +1516,19 @@ void QubitVectorThrust::apply_mcy(const reg_t &qubits) chunk_.queue_blocked_gate('y',qubits[qubits.size()-1],mask); } else{ - apply_function(CY_func(qubits)); + chunk_.apply_Y(qubits, chunk_.container()->num_chunks()); } } -template -class CSwap_func : public GateFuncBase -{ -protected: - uint_t mask0; - uint_t mask1; - uint_t cmask; - int nqubits; - int qubit_t0; - int qubit_t1; - uint_t offset1; - uint_t offset2; -public: - - CSwap_func(const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - if(qubits[nqubits-2] < qubits[nqubits-1]){ - qubit_t0 = qubits[nqubits-2]; - qubit_t1 = qubits[nqubits-1]; - } - else{ - qubit_t1 = qubits[nqubits-2]; - qubit_t0 = qubits[nqubits-1]; - } - mask0 = (1ull << qubit_t0) - 1; - mask1 = (1ull << qubit_t1) - 1; - - offset1 = 1ull << qubit_t0; - offset2 = 1ull << qubit_t1; - - cmask = 0; - for(i=0;i q1,q2; - thrust::complex* vec1; - thrust::complex* vec2; - - vec1 = this->data_; - - vec2 = vec1 + offset2; - vec1 = vec1 + offset1; - - i0 = i & mask0; - i2 = (i - i0) << 1; - i1 = i2 & mask1; - i2 = (i2 - i1) << 1; - - i0 = i0 + i1 + i2; - - if((i0 & cmask) == cmask){ - q1 = vec1[i0]; - q2 = vec2[i0]; - vec1[i0] = q2; - vec2[i0] = q1; - } - } - const char* name(void) - { - return "CSWAP"; - } -}; - template void QubitVectorThrust::apply_mcswap(const reg_t &qubits) { - if(enable_cuStatevec_){ - if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return; //first chunk execute all in batch + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch - chunk_.apply_matrix(qubits,qubits.size()-2,Linalg::VMatrix::SWAP,chunk_.container()->num_chunks()); - } - else{ - apply_function(CSwap_func(qubits)); - } + chunk_.apply_swap(qubits,qubits.size()-2,chunk_.container()->num_chunks()); } - -//swap 
operator between chunks -template -class CSwapChunk_func : public GateFuncBase -{ -protected: - uint_t mask; - thrust::complex* vec0; - thrust::complex* vec1; - bool write_back_; - bool swap_all_; -public: - - CSwapChunk_func(const reg_t &qubits,uint_t block_bits,thrust::complex* pVec0,thrust::complex* pVec1,bool wb) - { - int i; - int nqubits; - int qubit_t; - nqubits = qubits.size(); - - if(qubits[nqubits-2] < qubits[nqubits-1]){ - qubit_t = qubits[nqubits-2]; - } - else{ - qubit_t = qubits[nqubits-1]; - } - mask = (1ull << qubit_t) - 1; - - vec0 = pVec0; - vec1 = pVec1; - - write_back_ = wb; - if(qubit_t >= block_bits) - swap_all_ = true; - else - swap_all_ = false; - } - - bool batch_enable(void) - { - return false; - } - bool is_diagonal(void) - { - return swap_all_; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0,q1; - - i0 = i & mask; - i1 = (i - i0) << 1; - i0 += i1; - - q0 = vec0[i0]; - q1 = vec1[i0]; - vec0[i0] = q1; - if(write_back_) - vec1[i0] = q0; - } - const char* name(void) - { - return "Chunk SWAP"; - } -}; - template void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, QubitVectorThrust &src, bool write_back) { @@ -3097,64 +1652,21 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, uint_t rem DebugMsg("chunk swap (process)",qubits); #endif - chunk_.Execute(CSwapChunk_func(qubits,num_qubits_,pLocal,pRemote,false),1); - chunk_.synchronize(); //should be synchronized here - - if(buffer.is_mapped()){ - chunk_manager_->UnmapBufferChunk(buffer); - } - } - - release_recv_buffer(); - -#ifdef AER_DISABLE_GDR - release_send_buffer(); -#endif -} - -template -class phase_func : public GateFuncBase -{ -protected: - thrust::complex phase; - uint_t mask; - int nqubits; -public: - phase_func(const reg_t &qubits,thrust::complex p) - { - int i; - nqubits = qubits.size(); - phase = p; - - mask = 0; - for(i=0;i* vec; - thrust::complex q0; - - vec = this->data_; - gid = 
this->base_index_; + chunk_.Execute(CSwapChunk_func(qubits,num_qubits_,pLocal,pRemote,false),1); + chunk_.synchronize(); //should be synchronized here - if(((i+gid) & mask) == mask){ - q0 = vec[i]; - vec[i] = q0 * phase; + if(buffer.is_mapped()){ + chunk_manager_->UnmapBufferChunk(buffer); } } - const char* name(void) - { - return "phase"; - } -}; + + release_recv_buffer(); + +#ifdef AER_DISABLE_GDR + release_send_buffer(); +#endif +} + template void QubitVectorThrust::apply_mcphase(const reg_t &qubits, const std::complex phase) @@ -3162,13 +1674,6 @@ void QubitVectorThrust::apply_mcphase(const reg_t &qubits, const std::co if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch - if(enable_cuStatevec_){ - cvector_t diag(2); - diag[0] = 1.0; - diag[1] = phase; - return chunk_.apply_diagonal_matrix(qubits,qubits.size()-1,diag,chunk_.container()->num_chunks()); - } - if(register_blocking_){ int i; uint_t mask = 0; @@ -3178,140 +1683,10 @@ void QubitVectorThrust::apply_mcphase(const reg_t &qubits, const std::co chunk_.queue_blocked_gate('p',qubits[qubits.size()-1],mask,&phase); } else{ - apply_function(phase_func(qubits,*(thrust::complex*)&phase) ); + chunk_.apply_phase(qubits,qubits.size()-1,phase,chunk_.container()->num_chunks()); } } -template -class DiagonalMult2x2Controlled : public GateFuncBase -{ -protected: - thrust::complex m0,m1; - uint_t mask; - uint_t cmask; - int nqubits; -public: - DiagonalMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - m0 = mat[0]; - m1 = mat[1]; - - mask = (1ull << qubits[nqubits-1]) - 1; - cmask = 0; - for(i=0;i* vec; - thrust::complex q0; - thrust::complex m; - - vec = this->data_; - gid = this->base_index_; - - if(((i + gid) & cmask) == cmask){ - if((i + gid) & mask){ - m = m1; - } - else{ - m = m0; - } - - q0 = vec[i]; - vec[i] = m*q0; - } - } - const char* name(void) - { - return 
"diagonal_Cmult2x2"; - } -}; - -template -class MatrixMult2x2Controlled : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - uint_t mask; - uint_t cmask; - uint_t offset; - int nqubits; -public: - MatrixMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) - { - int i; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - nqubits = qubits.size(); - - offset = 1ull << qubits[nqubits-1]; - mask = (1ull << qubits[nqubits-1]) - 1; - cmask = 0; - for(i=0;i q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - if((i0 & cmask) == cmask){ - q0 = vec0[i0]; - q1 = vec1[i0]; - - vec0[i0] = m0 * q0 + m2 * q1; - vec1[i0] = m1 * q0 + m3 * q1; - } - } - const char* name(void) - { - return "matrix_Cmult2x2"; - } -}; template void QubitVectorThrust::apply_mcu(const reg_t &qubits, @@ -3350,10 +1725,7 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, chunk_.queue_blocked_gate('d',qubits[qubits.size()-1],mask,&diag[0]); } else{ - if(enable_cuStatevec_) - chunk_.apply_diagonal_matrix(qubits,qubits.size()-1,diag,chunk_.container()->num_chunks()); - else - apply_function(DiagonalMult2x2Controlled(diag,qubits) ); + chunk_.apply_diagonal_matrix(qubits,qubits.size()-1,diag,chunk_.container()->num_chunks()); } } } @@ -3373,10 +1745,7 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, chunk_.queue_blocked_gate('u',qubits[qubits.size()-1],mask,&mat[0]); } else{ - if(enable_cuStatevec_) - chunk_.apply_matrix(qubits,qubits.size()-1,mat,chunk_.container()->num_chunks()); - else - apply_function(MatrixMult2x2Controlled(mat,qubits) ); + chunk_.apply_matrix(qubits,qubits.size()-1,mat,chunk_.container()->num_chunks()); } } } @@ -3394,11 +1763,6 @@ void QubitVectorThrust::apply_matrix(const uint_t qubit, if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch - 
if(enable_cuStatevec_){ - reg_t qubits(1,qubit); - return chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); - } - // Check if matrix is diagonal and if so use optimized lambda if (mat[1] == 0.0 && mat[2] == 0.0) { const std::vector> diag = {{mat[0], mat[3]}}; @@ -3409,7 +1773,8 @@ void QubitVectorThrust::apply_matrix(const uint_t qubit, chunk_.queue_blocked_gate('u',qubit,0,&mat[0]); } else{ - apply_function(MatrixMult2x2(mat,qubit)); + reg_t qubits = {qubit}; + chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); } } @@ -3420,17 +1785,12 @@ void QubitVectorThrust::apply_diagonal_matrix(const uint_t qubit, if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch - if(enable_cuStatevec_){ - reg_t qubits(1,qubit); - return chunk_.apply_diagonal_matrix(qubits,0,diag,chunk_.container()->num_chunks()); - } - if(register_blocking_){ chunk_.queue_blocked_gate('d',qubit,0,&diag[0]); } else{ reg_t qubits = {qubit}; - apply_function(DiagonalMult2x2(diag,qubits[0])); + chunk_.apply_diagonal_matrix(qubits,0,diag,chunk_.container()->num_chunks()); } } @@ -3439,38 +1799,6 @@ void QubitVectorThrust::apply_diagonal_matrix(const uint_t qubit, * NORMS * ******************************************************************************/ -template -class norm_func : public GateFuncBase -{ -protected: -public: - norm_func(void) - { - - } - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - double d; - - vec = this->data_; - q = vec[i]; - d = (double)(q.real()*q.real() + q.imag()*q.imag()); - return d; - } - - const char* name(void) - { - return "norm"; - } -}; - template double QubitVectorThrust::norm() const { @@ -3483,7 +1811,7 @@ double QubitVectorThrust::norm() const } #endif - apply_function_sum(&ret,norm_func()); + ret = 
chunk_.norm(chunk_.container()->num_chunks()); #ifdef AER_DEBUG DebugMsg("norm",ret); @@ -3492,48 +1820,6 @@ double QubitVectorThrust::norm() const return ret; } -template -class NormMatrixMultNxN : public GateFuncSumWithCache -{ -protected: -public: - NormMatrixMultNxN(uint_t nq) : GateFuncSumWithCache(nq) - { - ; - } - - __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - uint_t j; - thrust::complex q,r; - thrust::complex m; - uint_t mat_size,irow; - thrust::complex* vec; - thrust::complex* pMat; - - vec = this->data_; - pMat = this->matrix_; - - mat_size = 1ull << this->nqubits_; - irow = _tid & (mat_size - 1); - - r = 0.0; - for(j=0;j double QubitVectorThrust::norm(const reg_t &qubits, const cvector_t &mat) const @@ -3559,58 +1845,6 @@ double QubitVectorThrust::norm(const reg_t &qubits, const cvector_t -class NormDiagonalMultNxN : public GateFuncBase -{ -protected: - int nqubits; -public: - NormDiagonalMultNxN(const reg_t &qb) - { - nqubits = qb.size(); - } - - bool is_diagonal(void) - { - return true; - } - int qubits_count(void) - { - return nqubits; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - uint_t im,j,gid; - thrust::complex q; - thrust::complex m,r; - thrust::complex* pMat; - thrust::complex* vec; - uint_t* qubits; - - vec = this->data_; - pMat = this->matrix_; - qubits = this->params_; - gid = this->base_index_; - - im = 0; - for(j=0;j double QubitVectorThrust::norm_diagonal(const reg_t &qubits, const cvector_t &mat) const { @@ -3633,112 +1867,14 @@ double QubitVectorThrust::norm_diagonal(const reg_t &qubits, const cvect //------------------------------------------------------------------------------ // Single-qubit specialization //------------------------------------------------------------------------------ -template -class NormMatrixMult2x2 : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - int qubit; - uint_t mask; - uint_t offset; -public: - 
NormMatrixMult2x2(const cvector_t &mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - - offset = 1ull << qubit; - mask = (1ull << qubit) - 1; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex* vec; - thrust::complex q0,q1; - thrust::complex r0,r1; - double sum = 0.0; - - vec = this->data_; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - q0 = vec[i0]; - q1 = vec[offset+i0]; - - r0 = m0 * q0 + m2 * q1; - sum += r0.real()*r0.real() + r0.imag()*r0.imag(); - r1 = m1 * q0 + m3 * q1; - sum += r1.real()*r1.real() + r1.imag()*r1.imag(); - return sum; - } - const char* name(void) - { - return "Norm_mult2x2"; - } -}; - template double QubitVectorThrust::norm(const uint_t qubit, const cvector_t &mat) const { double ret; apply_function_sum(&ret,NormMatrixMult2x2(mat,qubit)); - return ret; -} - - -template -class NormDiagonalMult2x2 : public GateFuncBase -{ -protected: - thrust::complex m0,m1; - int qubit; -public: - NormDiagonalMult2x2(cvector_t &mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - uint_t gid; - thrust::complex* vec; - thrust::complex q; - thrust::complex m,r; - - vec = this->data_; - gid = this->base_index_; - - q = vec[i]; - if((((i+gid) >> qubit) & 1) == 0){ - m = m0; - } - else{ - m = m1; - } - - r = m * q; - - return (r.real()*r.real() + r.imag()*r.imag()); - } - const char* name(void) - { - return "Norm_diagonal_mult2x2"; - } -}; + return ret; +} template double QubitVectorThrust::norm_diagonal(const uint_t qubit, const cvector_t &mat) const @@ -3784,101 +1920,6 @@ std::vector QubitVectorThrust::probabilities() const { return probs; } - -template -class probability_func : public GateFuncBase -{ -protected: - uint_t mask; - uint_t cmask; -public: - probability_func(const reg_t &qubits,int i) - { - int k; - int nq = 
qubits.size(); - - mask = 0; - cmask = 0; - for(k=0;k> k) & 1) != 0){ - cmask |= (1ull << qubits[k]); - } - } - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - double ret; - - vec = this->data_; - - ret = 0.0; - - if((i & mask) == cmask){ - q = vec[i]; - ret = q.real()*q.real() + q.imag()*q.imag(); - } - return ret; - } - - const char* name(void) - { - return "probabilities"; - } -}; - -template -class probability_1qubit_func : public GateFuncBase -{ -protected: - uint_t offset; -public: - probability_1qubit_func(const uint_t qubit) - { - offset = 1ull << qubit; - } - - __host__ __device__ thrust::complex operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - thrust::complex ret; - double d0,d1; - - vec0 = this->data_; - vec1 = vec0 + offset; - - i1 = i & (offset - 1); - i0 = (i - i1) << 1; - i0 += i1; - - q0 = vec0[i0]; - q1 = vec1[i0]; - - d0 = (double)(q0.real()*q0.real() + q0.imag()*q0.imag()); - d1 = (double)(q1.real()*q1.real() + q1.imag()*q1.imag()); - - ret = thrust::complex(d0,d1); - return ret; - } - - const char* name(void) - { - return "probabilities_1qubit"; - } -}; - template std::vector QubitVectorThrust::probabilities(const reg_t &qubits) const { @@ -3886,25 +1927,7 @@ std::vector QubitVectorThrust::probabilities(const reg_t &qubits const int_t DIM = 1 << N; std::vector probs(DIM, 0.); - if(N == 1){ //special case for 1 qubit (optimized for measure) - apply_function_sum2(&probs[0],probability_1qubit_func(qubits[0])); - -#ifdef AER_DEBUG - DebugMsg("probabilities",probs); -#endif - return probs; - } - - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - if ((N == num_qubits_) && (qubits == qubits_sorted)) - return probabilities(); - - - int i; - for(i=0;i(qubits,i)); - } + chunk_.probabilities(probs, qubits); #ifdef 
AER_DEBUG DebugMsg("probabilities",probs); @@ -4556,136 +2579,12 @@ reg_t QubitVectorThrust::sample_measure(const std::vector &rnds) * ******************************************************************************/ -inline __host__ __device__ uint_t pop_count_kernel(uint_t val) -{ - uint_t count = val; - count = (count & 0x5555555555555555) + ((count >> 1) & 0x5555555555555555); - count = (count & 0x3333333333333333) + ((count >> 2) & 0x3333333333333333); - count = (count & 0x0f0f0f0f0f0f0f0f) + ((count >> 4) & 0x0f0f0f0f0f0f0f0f); - count = (count & 0x00ff00ff00ff00ff) + ((count >> 8) & 0x00ff00ff00ff00ff); - count = (count & 0x0000ffff0000ffff) + ((count >> 16) & 0x0000ffff0000ffff); - count = (count & 0x00000000ffffffff) + ((count >> 32) & 0x00000000ffffffff); - return count; -} - -//special case Z only -template -class expval_pauli_Z_func : public GateFuncBase -{ -protected: - uint_t z_mask_; - -public: - expval_pauli_Z_func(uint_t z) - { - z_mask_ = z; - } - - bool is_diagonal(void) - { - return true; - } - bool batch_enable(void) - { - return false; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - double ret = 0.0; - - vec = this->data_; - - q0 = vec[i]; - ret = q0.real()*q0.real() + q0.imag()*q0.imag(); - - if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) - ret = -ret; - } - - return ret; - } - const char* name(void) - { - return "expval_pauli_Z"; - } -}; - -template -class expval_pauli_XYZ_func : public GateFuncBase -{ -protected: - uint_t x_mask_; - uint_t z_mask_; - uint_t mask_l_; - uint_t mask_u_; - thrust::complex phase_; -public: - expval_pauli_XYZ_func(uint_t x,uint_t z,uint_t x_max,std::complex p) - { - x_mask_ = x; - z_mask_ = z; - phase_ = p; - - mask_u_ = ~((1ull << (x_max+1)) - 1); - mask_l_ = (1ull << x_max) - 1; - } - bool batch_enable(void) - { - return false; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex* vec; - 
thrust::complex q0; - thrust::complex q1; - thrust::complex q0p; - thrust::complex q1p; - double d0,d1,ret = 0.0; - uint_t idx0,idx1; - - vec = this->data_; - - idx0 = ((i << 1) & mask_u_) | (i & mask_l_); - idx1 = idx0 ^ x_mask_; - - q0 = vec[idx0]; - q1 = vec[idx1]; - q0p = q1 * phase_; - q1p = q0 * phase_; - d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); - d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); - - if(z_mask_ != 0){ - if(pop_count_kernel(idx0 & z_mask_) & 1) - ret = -d0; - else - ret = d0; - if(pop_count_kernel(idx1 & z_mask_) & 1) - ret -= d1; - else - ret += d1; - } - else{ - ret = d0 + d1; - } - - return ret; - } - const char* name(void) - { - return "expval_pauli_XYZ"; - } -}; - template double QubitVectorThrust::expval_pauli(const reg_t &qubits, const std::string &pauli,const complex_t initial_phase) const { + return chunk_.expval_pauli(qubits,pauli,initial_phase); + uint_t x_mask, z_mask, num_y, x_max; std::tie(x_mask, z_mask, num_y, x_max) = pauli_masks_and_phase(qubits, pauli); @@ -4709,73 +2608,6 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, return ret; } -template -class expval_pauli_inter_chunk_func : public GateFuncBase -{ -protected: - uint_t x_mask_; - uint_t z_mask_; - thrust::complex phase_; - thrust::complex* pair_chunk_; - uint_t z_count_; - uint_t z_count_pair_; -public: - expval_pauli_inter_chunk_func(uint_t x,uint_t z,std::complex p,thrust::complex* pair_chunk,uint_t zc,uint_t zcp) - { - x_mask_ = x; - z_mask_ = z; - phase_ = p; - - pair_chunk_ = pair_chunk; - z_count_ = zc; - z_count_pair_ = zcp; - } - - bool is_diagonal(void) - { - return true; - } - bool batch_enable(void) - { - return false; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - thrust::complex q0p; - thrust::complex q1p; - double d0,d1,ret = 0.0; - uint_t ip; - - vec = this->data_; - - ip = i ^ x_mask_; - q0 = vec[i]; - q1 = pair_chunk_[ip]; - q0p = q1 
* phase_; - q1p = q0 * phase_; - d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); - d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); - - if((pop_count_kernel(i & z_mask_) + z_count_) & 1) - ret = -d0; - else - ret = d0; - if((pop_count_kernel(ip & z_mask_) + z_count_pair_) & 1) - ret -= d1; - else - ret += d1; - - return ret; - } - const char* name(void) - { - return "expval_pauli_inter_chunk"; - } -}; template double QubitVectorThrust::expval_pauli(const reg_t &qubits, @@ -4860,97 +2692,6 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, * ******************************************************************************/ -template -class multi_pauli_func : public GateFuncBase -{ -protected: - uint_t x_mask_; - uint_t z_mask_; - uint_t mask_l_; - uint_t mask_u_; - thrust::complex phase_; - uint_t nqubits_; -public: - multi_pauli_func(uint_t x,uint_t z,uint_t x_max,std::complex p) - { - x_mask_ = x; - z_mask_ = z; - phase_ = p; - - mask_u_ = ~((1ull << (x_max+1)) - 1); - mask_l_ = (1ull << x_max) - 1; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - uint_t idx0,idx1; - - vec = this->data_; - - idx0 = ((i << 1) & mask_u_) | (i & mask_l_); - idx1 = idx0 ^ x_mask_; - - q0 = vec[idx0]; - q1 = vec[idx1]; - - if(z_mask_ != 0){ - if(pop_count_kernel(idx0 & z_mask_) & 1) - q0 *= -1; - - if(pop_count_kernel(idx1 & z_mask_) & 1) - q1 *= -1; - } - vec[idx0] = q1 * phase_; - vec[idx1] = q0 * phase_; - } - const char* name(void) - { - return "multi_pauli"; - } -}; - -//special case Z only -template -class multi_pauli_Z_func : public GateFuncBase -{ -protected: - uint_t z_mask_; - thrust::complex phase_; -public: - multi_pauli_Z_func(uint_t z,std::complex p) - { - z_mask_ = z; - phase_ = p; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - - vec = this->data_; 
- - q0 = vec[i]; - - if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) - q0 = -q0; - } - vec[i] = q0 * phase_; - } - const char* name(void) - { - return "multi_pauli_Z"; - } -}; template void QubitVectorThrust::apply_pauli(const reg_t &qubits, diff --git a/src/simulators/unitary/unitarymatrix_thrust.hpp b/src/simulators/unitary/unitarymatrix_thrust.hpp index 0fc2bfa6be..9c17562203 100755 --- a/src/simulators/unitary/unitarymatrix_thrust.hpp +++ b/src/simulators/unitary/unitarymatrix_thrust.hpp @@ -280,54 +280,13 @@ void UnitaryMatrixThrust::set_num_qubits(size_t num_qubits) { BaseVector::set_num_qubits(2 * num_qubits); } -template -class trace_func : public GateFuncBase -{ -protected: - uint_t rows_; -public: - trace_func(uint_t nrow) - { - rows_ = nrow; - } - bool is_diagonal(void) - { - return true; - } - uint_t size(int num_qubits) - { - this->chunk_bits_ = num_qubits; - return rows_; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - - uint_t iChunk = (i / rows_); - uint_t lid = i - (iChunk * rows_); - uint_t idx = (iChunk << this->chunk_bits_) + lid*(rows_ + 1); - - vec = this->data_; - q = vec[idx]; - return q.real(); - } - - const char* name(void) - { - return "trace"; - } -}; template std::complex UnitaryMatrixThrust::trace() const { thrust::complex sum; - double ret; - BaseVector::apply_function_sum(&ret,trace_func(rows_),false); - sum = ret; + sum = BaseVector::chunk_.trace(rows_, 1); #ifdef AER_DEBUG BaseVector::DebugMsg("trace",sum); From eba2594c258dbd7e1c48b8228771957e7a6e3e4d Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Tue, 18 Jan 2022 16:54:46 +0900 Subject: [PATCH 08/17] Fix norm() for Thrust CPU --- src/simulators/statevector/chunk/chunk_container.hpp | 2 ++ src/simulators/statevector/qubitvector_thrust.hpp | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/simulators/statevector/chunk/chunk_container.hpp 
b/src/simulators/statevector/chunk/chunk_container.hpp index 67866d650e..f53244bd56 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -619,6 +619,7 @@ void ChunkContainer::ExecuteSum(double* pSum,Function func,uint_t iChunk #else uint_t size = func.size(chunk_bits_); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -758,6 +759,7 @@ void ChunkContainer::ExecuteSum2(double* pSum,Function func,uint_t iChun #else uint_t size = func.size(chunk_bits_); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index 1ade632a14..7f002acf05 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -1803,15 +1803,17 @@ template double QubitVectorThrust::norm() const { double ret; + uint_t count = 1; #ifdef AER_THRUST_CUDA if((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_){ if(chunk_.pos() != 0) return 0.0; //first chunk execute all in batch + count = chunk_.container()->num_chunks(); } #endif - ret = chunk_.norm(chunk_.container()->num_chunks()); + ret = chunk_.norm(count); #ifdef AER_DEBUG DebugMsg("norm",ret); From 5a9380774aaa4fbc20e199f62a6d9ff7831a05d4 Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Wed, 26 Jan 2022 16:58:10 +0900 Subject: [PATCH 09/17] change cuStateVec from device to option --- CMakeLists.txt | 2 +- CONTRIBUTING.md | 15 ++-- .../providers/aer/backends/aer_simulator.py | 25 +++++-- src/controllers/aer_controller.hpp | 75 +++++++++++-------- src/simulators/state_chunk.hpp | 40 +++++++++- src/simulators/statevector/qubitvector.hpp | 11 ++- .../statevector/qubitvector_thrust.hpp | 27 ++++--- 
.../superoperator/superoperator_state.hpp | 2 +- 8 files changed, 138 insertions(+), 59 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 395f90b0ba..2bd46bbc5f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,7 +258,7 @@ if(AER_THRUST_SUPPORTED) if(CUSTATEVEC_ROOT) set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC) set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -I${CUSTATEVEC_ROOT}/include") - set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec") + set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec_static -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lcublas") endif() elseif(AER_THRUST_BACKEND STREQUAL "TBB") message(STATUS "TBB Support found!") diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b64821d10e..836e4824ba 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -644,9 +644,8 @@ Few notes on GPU builds: 4. Only Linux platforms are supported Qiskit Aer now supports cuQuantum optimized Quantum computing APIs from NVIDIA®. -cuStateVec APIs can be exploited to accelerate statevector and density_matrix methods. -Because cuQuantum is beta version, usage of cuStateVec is limited to matrix multiplications. -Some gate operations that can be applied by matrix multiplication are accelerated. +cuStateVec APIs can be exploited to accelerate statevector, density_matrix and unitary methods. +Because cuQuantum is beta version currently, some of the operations are not accelerated by cuStateVec. To build Qiskit Aer with cuStateVec support, please set the path to cuQuantum root directory to CUSTATEVEC_ROOT as following. @@ -654,11 +653,15 @@ For example, qiskit-aer$ python ./setup.py bdist_wheel -- -DAER_THRUST_BACKEND=CUDA -DCUSTATEVEC_ROOT=path_to_cuQuantum -To run with cuStateVec, set the simulator device argument to cuStateVec as following. 
- +To run with cuStateVec, set the AerSimulator option `device='GPU'`; by default cuStateVec is enabled +if the number of qubits of the input circuit is equal to or greater than 22. +This threshold can be modified by setting the `cuStateVec_threshold` option. +Set `cuStateVec_enable=False` to disable cuStateVec. +The following example shows how to accelerate simulations of 10 or more qubits using cuStateVec. ``` -sim = AerSimulator(method='statevector', device='cuStateVec') +sim = AerSimulator(method='statevector', device='GPU') +results = execute(circuit,sim,cuStateVec_enable=True,cuStateVec_threshold=10).result() ``` diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py index 880157cab4..4dea785728 100644 --- a/qiskit/providers/aer/backends/aer_simulator.py +++ b/qiskit/providers/aer/backends/aer_simulator.py @@ -148,10 +148,9 @@ class AerSimulator(AerBackend): initialization or with :meth:`set_options`. The list of supported devices for the current system can be returned using :meth:`available_devices`. - If AerSimulator is built with cuQuantum support, cuQuantum APIs are enabled - by using ``device="cuStateVec"``. This is experimental implementation - for cuQuantum Beta 1. All the calculations of gates that can be executed by - multiplying matrices will be done by cuStateVec matrix API. + If AerSimulator is built with cuStateVec support, cuStateVec APIs are enabled + by setting ``cuStateVec_enable=True``. This is experimental implementation + based on cuQuantum Beta 2. **Additional Backend Options** @@ -221,6 +220,19 @@ class AerSimulator(AerBackend): values (16 Bytes). If set to 0, the maximum will be automatically set to the system memory size (Default: 0). + * ``cuStateVec_enable`` (bool): This option enables accelerating by + cuStateVec library of cuQuantum from NVIDIA, that has highly optimized + kernels for GPUs.
This option is enabled when the number of qubits of + the input circuit is equal or greater than ``cuStateVec_threshold``. + Currently this option only works well for large number of qubits. + Also this option will be disabled for noise simulation + (Default: True). + + * ``cuStateVec_threshold`` (int): This option sets the threshold + number of qubits to enable ``cuStateVec_enable`` option. + cuStateVec is enabled when the number of qubits is equal or greater + than this option (Default: 22). + * ``blocking_enable`` (bool): This option enables parallelization with multiple GPUs or multiple processes with MPI (CPU/GPU). This option is only available for ``"statevector"``, ``"density_matrix"`` and @@ -459,7 +471,7 @@ class AerSimulator(AerBackend): _AVAILABLE_METHODS = None - _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust', 'cuStateVec') + _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust') _AVAILABLE_DEVICES = None @@ -519,6 +531,9 @@ def _default_options(cls): memory=None, noise_model=None, seed_simulator=None, + # cuStateVec (cuQuantum) options + cuStateVec_enable=True, + cuStateVec_threshold=22, # cache blocking for multi-GPUs/MPI options blocking_qubits=None, blocking_enable=False, diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 4e2742582f..b000906a37 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -115,7 +115,7 @@ class Controller { superop }; - enum class Device { CPU, GPU, ThrustCPU, cuStateVec }; + enum class Device { CPU, GPU, ThrustCPU }; // Simulation precision enum class Precision { Double, Single }; @@ -327,7 +327,7 @@ class Controller { size_t get_gpu_memory_mb(); size_t get_min_memory_mb() const { - if ((sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) && num_gpus_ > 0) { + if (sim_device_ == Device::GPU && num_gpus_ > 0) { return max_gpu_memory_mb_ / num_gpus_; // return per GPU memory size } return max_memory_mb_; @@ -377,6 +377,9 @@ class Controller { 
int_t batched_shots_gpu_max_qubits_ = 16; //multi-shot parallelization is applied if qubits is less than max qubits bool enable_batch_multi_shots_ = false; //multi-shot parallelization can be applied + //settings for cuStateVec + bool cuStateVec_enable_ = false; + int cuStateVec_threshold_ = 22; }; //========================================================================= @@ -466,6 +469,16 @@ void Controller::set_config(const json_t &config) { JSON::get_value(batched_shots_gpu_max_qubits_, "batched_shots_gpu_max_qubits", config); } +#ifdef AER_CUSTATEVEC + //cuStateVec configs + if(JSON::check_key("cuStateVec_enable", config)) { + JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config); + } + if(JSON::check_key("cuStateVec_threshold", config)) { + JSON::get_value(cuStateVec_threshold_, "cuStateVec_threshold", config); + } +#endif + // Override automatic simulation method with a fixed method std::string method; if (JSON::get_value(method, "method", config)) { @@ -514,27 +527,18 @@ void Controller::set_config(const json_t &config) { throw std::runtime_error("No CUDA device available!"); } sim_device_ = Device::GPU; -#endif - } - else if(sim_device_name_ == "cuStateVec"){ -#ifndef AER_CUSTATEVEC - throw std::runtime_error( - "Simulation device \"cuStateVec\" is not supported on this system"); -#else - int nDev; - if (cudaGetDeviceCount(&nDev) != cudaSuccess) { - cudaGetLastError(); - throw std::runtime_error("No CUDA device available!"); - } - sim_device_ = Device::cuStateVec; - - //initialize custatevevtor handle once before actual calculation (takes long time at first call) - custatevecStatus_t err; - custatevecHandle_t stHandle; - err = custatevecCreate(&stHandle); - if(err == CUSTATEVEC_STATUS_SUCCESS){ - custatevecDestroy(stHandle); + +#ifdef AER_CUSTATEVEC + if(cuStateVec_enable_){ + //initialize custatevevtor handle once before actual calculation (takes long time at first call) + custatevecStatus_t err; + custatevecHandle_t stHandle; + err = 
custatevecCreate(&stHandle); + if(err == CUSTATEVEC_STATUS_SUCCESS){ + custatevecDestroy(stHandle); + } } +#endif #endif } else { @@ -661,11 +665,15 @@ void Controller::set_parallelization_circuit(const Circuit &circ, enable_batch_multi_shots_ = false; if(batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && max_batched_states_ >= num_gpus_ && - batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ //cuStateVec is not supported currently, because cuStateVec does not handle conditional functions - enable_batch_multi_shots_ = true; + batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ + //cuStateVec is not supported currently, because cuStateVec does not handle conditional functions + if(cuStateVec_enable_ && circ.num_qubits >= cuStateVec_threshold_) + enable_batch_multi_shots_ = false; + else + enable_batch_multi_shots_ = true; } - if(sim_device_ == Device::cuStateVec){ + if(cuStateVec_enable_ && circ.num_qubits >= cuStateVec_threshold_){ parallel_shots_ = 1; //cuStateVec is currently not thread safe return; } @@ -723,7 +731,7 @@ void Controller::set_parallelization_circuit(const Circuit &circ, // And assign the remaining threads to state update int circ_memory_mb = required_memory_mb(circ, noise, method) / num_process_per_experiment_; - size_t mem_size = (sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) ? max_gpu_memory_mb_ : max_memory_mb_; + size_t mem_size = (sim_device_ == Device::GPU) ? 
max_gpu_memory_mb_ : max_memory_mb_; if (mem_size < circ_memory_mb) throw std::runtime_error( "a circuit requires more memory than max_memory_mb."); @@ -749,12 +757,12 @@ bool Controller::multiple_chunk_required(const Circuit &circ, if (cache_block_qubit_ >= 2 && cache_block_qubit_ < circ.num_qubits) return true; - if(num_process_per_experiment_ == 1 && (sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) && num_gpus_ > 0){ + if(num_process_per_experiment_ == 1 && sim_device_ == Device::GPU && num_gpus_ > 0){ return (max_gpu_memory_mb_ / num_gpus_ < required_memory_mb(circ, noise, method)); } if(num_process_per_experiment_ > 1){ size_t total_mem = max_memory_mb_; - if(sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) + if(sim_device_ == Device::GPU) total_mem += max_gpu_memory_mb_; if(total_mem*num_process_per_experiment_ > required_memory_mb(circ, noise, method)) return true; @@ -847,7 +855,7 @@ Controller::transpile_cache_blocking(Controller::Method method, const Circuit &c // if blocking is not set by config, automatically set if required if (multiple_chunk_required(circ, noise, method)) { int nplace = num_process_per_experiment_; - if((sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) && num_gpus_ > 0) + if(sim_device_ == Device::GPU && num_gpus_ > 0) nplace *= num_gpus_; cache_block_pass.set_blocking(circ.num_qubits, get_min_memory_mb() << 20, nplace, complex_size, is_matrix); @@ -1468,7 +1476,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ, // Check if measure sampler and optimization are valid if (can_sample) { // Implement measure sampler - if (parallel_shots_ <= 1 || (sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec)) { + if (parallel_shots_ <= 1 || sim_device_ == Device::GPU) { state.set_max_matrix_qubits(max_bits); RngEngine rng; rng.set_seed(circ.seed); @@ -1765,7 +1773,12 @@ void Controller::measure_sampler( shots_or_index = shots; else shots_or_index = shot_index; + + auto 
timer_start = myclock_t::now(); auto all_samples = state.sample_measure(meas_qubits, shots_or_index, rng); + auto time_taken = + std::chrono::duration(myclock_t::now() - timer_start).count(); + result.metadata.add(time_taken, "sample_measure_time"); // Make qubit map of position in vector of measured qubits std::unordered_map qubit_map; @@ -1957,7 +1970,7 @@ bool Controller::validate_state(const state_t &state, const Circuit &circ, bool memory_valid = true; if (max_memory_mb_ > 0) { size_t required_mb = state.required_memory_mb(circ.num_qubits, circ.ops) / num_process_per_experiment_; - size_t mem_size = (sim_device_ == Device::GPU || sim_device_ == Device::cuStateVec) ? max_memory_mb_ + max_gpu_memory_mb_ : max_memory_mb_; + size_t mem_size = (sim_device_ == Device::GPU) ? max_memory_mb_ + max_gpu_memory_mb_ : max_memory_mb_; memory_valid = (required_mb <= mem_size); } if (throw_except && !memory_valid) { diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp index 239e3bfa55..b9b6990fcb 100644 --- a/src/simulators/state_chunk.hpp +++ b/src/simulators/state_chunk.hpp @@ -391,6 +391,10 @@ class StateChunk : public State { reg_t top_chunk_of_group_; reg_t num_chunks_in_group_; + //cuStateVec settings + bool cuStateVec_enable_ = false; + int cuStateVec_threshold_ = 22; + //----------------------------------------------------------------------- // Apply circuits and ops //----------------------------------------------------------------------- @@ -529,6 +533,16 @@ template void StateChunk::set_config(const json_t &config) { BaseState::set_config(config); + +#ifdef AER_CUSTATEVEC + //cuStateVec configs + if(JSON::check_key("cuStateVec_enable", config)) { + JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config); + } + if(JSON::check_key("cuStateVec_threshold", config)) { + JSON::get_value(cuStateVec_threshold_, "cuStateVec_threshold", config); + } +#endif } template @@ -622,10 +636,20 @@ bool StateChunk::allocate(uint_t num_qubits,uint_t 
block_bits,uint_t nu chunk_omp_parallel_ = true; #endif - if(BaseState::sim_device_name_ == "cuStateVec") +#ifdef AER_CUSTATEVEC + //set cuStateVec_enable_ + if(cuStateVec_enable_){ + if(num_qubits_ < cuStateVec_threshold_) + cuStateVec_enable_ = false; //disable if number of qubits is smaller than threshold + else if(multi_shots_parallelization_) + cuStateVec_enable_ = false; //multi-shots parallelization is not supported for cuStateVec + } + + if(cuStateVec_enable_) chunk_omp_parallel_ = false; //because cuStateVec is not thread safe else thrust_optimization_ = true; //cuStateVec does not handle global chunk index for diagonal matrix +#endif } else if(qregs_[0].name().find("thrust") != std::string::npos){ thrust_optimization_ = true; @@ -659,7 +683,8 @@ bool StateChunk::allocate_qregs(uint_t num_chunks) uint_t chunk_id = multi_chunk_distribution_ ? global_chunk_index_ : 0; bool ret = true; qregs_[0].set_max_matrix_bits(BaseState::max_matrix_qubits_); - ret &= qregs_[0].chunk_setup(chunk_bits_*qubit_scale(), num_qubits_*qubit_scale(), chunk_id, num_chunks, BaseState::sim_device_name_); + qregs_[0].cuStateVec_enable(cuStateVec_enable_); + ret &= qregs_[0].chunk_setup(chunk_bits_*qubit_scale(), num_qubits_*qubit_scale(), chunk_id, num_chunks); for(i=1;i::apply_ops(InputIterator first, InputIterator last, } } } + + qregs_[0].synchronize(); + +#ifdef AER_CUSTATEVEC + result.metadata.add(cuStateVec_enable_, "cuStateVec_enable"); +#endif } template @@ -804,6 +835,11 @@ void StateChunk::apply_ops_chunks(InputIterator first, InputIterator la } iOp++; } + + qregs_[0].synchronize(); +#ifdef AER_CUSTATEVEC + result.metadata.add(cuStateVec_enable_, "cuStateVec_enable"); +#endif } template diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp index 7166ba574d..6e925ecde4 100755 --- a/src/simulators/statevector/qubitvector.hpp +++ b/src/simulators/statevector/qubitvector.hpp @@ -131,7 +131,7 @@ class QubitVector { void 
initialize_component(const reg_t &qubits, const cvector_t &state); //setup chunk - bool chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks, std::string& device_name); + bool chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks); bool chunk_setup(QubitVector& base,const uint_t chunk_index); //cache control for chunks on host @@ -159,6 +159,8 @@ class QubitVector { void set_max_matrix_bits(int_t bits){} + void synchronize(void){} + //----------------------------------------------------------------------- // Check point operations //----------------------------------------------------------------------- @@ -389,6 +391,11 @@ class QubitVector { // Get the qubit threshold for activating OpenMP. uint_t get_omp_threshold() {return omp_threshold_;} + //cuStateVec + void cuStateVec_enable(bool flg) + { + } + //----------------------------------------------------------------------- // Optimization configuration settings //----------------------------------------------------------------------- @@ -925,7 +932,7 @@ std::complex QubitVector::inner_product() const { //setup chunk template -bool QubitVector::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks, std::string& device_name) +bool QubitVector::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks) { chunk_index_ = chunk_index; return true; diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index 7f002acf05..5d26570bb9 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -142,7 +142,7 @@ class QubitVectorThrust { void initialize_component(const reg_t &qubits, const cvector_t &state); //chunk setup - bool chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks, std::string& device_name); + bool chunk_setup(int chunk_bits,int 
num_qubits,uint_t chunk_index,uint_t num_local_chunks); bool chunk_setup(QubitVectorThrust& base,const uint_t chunk_index); //cache control for chunks on host @@ -162,6 +162,11 @@ class QubitVectorThrust { void set_max_matrix_bits(int_t bits); + void synchronize(void) + { + chunk_.synchronize(); + } + //----------------------------------------------------------------------- // Check point operations //----------------------------------------------------------------------- @@ -420,6 +425,12 @@ class QubitVectorThrust { // Get the qubit threshold for activating OpenMP. uint_t get_omp_threshold() {return omp_threshold_;} + //cuStateVec + void cuStateVec_enable(bool flg) + { + cuStateVec_enable_ = flg; + } + //----------------------------------------------------------------------- // Optimization configuration settings //----------------------------------------------------------------------- @@ -430,7 +441,6 @@ class QubitVectorThrust { // Get the sample_measure index size int get_sample_measure_index_size() {return sample_measure_index_size_;} - protected: //----------------------------------------------------------------------- @@ -451,7 +461,7 @@ class QubitVectorThrust { bool multi_chunk_distribution_; bool multi_shots_; bool enable_batch_; - bool enable_cuStatevec_ = false; + bool cuStateVec_enable_ = false; bool register_blocking_; @@ -792,16 +802,11 @@ void QubitVectorThrust::zero() } template -bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks, std::string& device_name) +bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks) { //set global chunk ID / shot ID chunk_index_ = chunk_index; - //check device name if cuStateVec is specified - if(device_name == "cuStateVec"){ - enable_cuStatevec_ = true; - } - if(chunk_manager_){ if(chunk_.is_mapped()){ chunk_.unmap(); @@ -819,7 +824,7 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits,int 
num_qubits,uint_t //only first chunk call allocation function if(chunk_bits > 0 && num_qubits > 0){ chunk_manager_ = std::make_shared>(); - chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,chunk_index_,max_matrix_bits_, enable_cuStatevec_); + chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,chunk_index_,max_matrix_bits_, cuStateVec_enable_); } multi_chunk_distribution_ = false; @@ -851,7 +856,7 @@ bool QubitVectorThrust::chunk_setup(QubitVectorThrust& base,cons base.multi_shots_ = true; } } - enable_cuStatevec_ = base.enable_cuStatevec_; + cuStateVec_enable_ = base.cuStateVec_enable_; //set global chunk ID / shot ID chunk_index_ = chunk_index; diff --git a/src/simulators/superoperator/superoperator_state.hpp b/src/simulators/superoperator/superoperator_state.hpp index 9a8be6ab19..403622d354 100755 --- a/src/simulators/superoperator/superoperator_state.hpp +++ b/src/simulators/superoperator/superoperator_state.hpp @@ -360,7 +360,7 @@ template void State::initialize_omp() { template bool State::allocate(uint_t num_qubits, uint_t block_bits,uint_t num_parallel_shots) { - return BaseState::qreg_.chunk_setup(num_qubits * 4, num_qubits * 4, 0, 1, BaseState::sim_device_name_); + return BaseState::qreg_.chunk_setup(num_qubits * 4, num_qubits * 4, 0, 1); } //========================================================================= From 983773b448fbe6cdb40d3401297253c7f84440a2 Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Wed, 26 Jan 2022 17:16:02 +0900 Subject: [PATCH 10/17] Fix unchanged device=cuStateVec --- qiskit/providers/aer/backends/backend_utils.py | 3 --- qiskit/providers/aer/backends/qasm_simulator.py | 2 +- qiskit/providers/aer/backends/statevector_simulator.py | 2 +- qiskit/providers/aer/backends/unitary_simulator.py | 2 +- 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/qiskit/providers/aer/backends/backend_utils.py b/qiskit/providers/aer/backends/backend_utils.py index dca818bcdd..110ed4383d 100644 --- 
a/qiskit/providers/aer/backends/backend_utils.py +++ b/qiskit/providers/aer/backends/backend_utils.py @@ -40,15 +40,12 @@ LEGACY_METHOD_MAP = { "statevector_cpu": ("statevector", "CPU"), "statevector_gpu": ("statevector", "GPU"), - "statevector_custatevec": ("statevector", "cuStateVec"), "statevector_thrust": ("statevector", "Thrust"), "density_matrix_cpu": ("density_matrix", "CPU"), "density_matrix_gpu": ("density_matrix", "GPU"), - "density_matrix_custatevec": ("density_matrix", "cuStateVec"), "density_matrix_thrust": ("density_matrix", "Thrust"), "unitary_cpu": ("unitary", "CPU"), "unitary_gpu": ("unitary", "GPU"), - "unitary_custatevec": ("unitary", "cuStateVec"), "unitary_thrust": ("unitary", "Thrust"), } diff --git a/qiskit/providers/aer/backends/qasm_simulator.py b/qiskit/providers/aer/backends/qasm_simulator.py index 5f3570cf2c..23ad8a4927 100644 --- a/qiskit/providers/aer/backends/qasm_simulator.py +++ b/qiskit/providers/aer/backends/qasm_simulator.py @@ -347,7 +347,7 @@ class QasmSimulator(AerBackend): _AVAILABLE_METHODS = None - _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust', 'cuStateVec') + _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust') _AVAILABLE_DEVICES = None diff --git a/qiskit/providers/aer/backends/statevector_simulator.py b/qiskit/providers/aer/backends/statevector_simulator.py index 2cc6d09327..100cfb7b57 100644 --- a/qiskit/providers/aer/backends/statevector_simulator.py +++ b/qiskit/providers/aer/backends/statevector_simulator.py @@ -165,7 +165,7 @@ class StatevectorSimulator(AerBackend): 'gates': [] } - _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust', 'cuStateVec') + _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust') _AVAILABLE_DEVICES = None diff --git a/qiskit/providers/aer/backends/unitary_simulator.py b/qiskit/providers/aer/backends/unitary_simulator.py index a3fa9de7a7..2db5880aa9 100644 --- a/qiskit/providers/aer/backends/unitary_simulator.py +++ b/qiskit/providers/aer/backends/unitary_simulator.py @@ -163,7 +163,7 @@ class 
UnitarySimulator(AerBackend): 'gates': [] } - _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust', 'cuStateVec') + _SIMULATION_DEVICES = ('CPU', 'GPU', 'Thrust') _AVAILABLE_DEVICES = None From 5bea04d630d649b27d5cad8eb033ba613bf44dfd Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Thu, 27 Jan 2022 17:19:20 +0900 Subject: [PATCH 11/17] Add build option to link cuStateVec statically --- CMakeLists.txt | 6 ++++- .../statevector/chunk/chunk_container.hpp | 22 ++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bd46bbc5f..4134e1c7e8 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,7 +258,11 @@ if(AER_THRUST_SUPPORTED) if(CUSTATEVEC_ROOT) set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC) set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -I${CUSTATEVEC_ROOT}/include") - set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec_static -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lcublas") + if(CUSTATEVEC_STATIC) + set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec_static -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lcublas") + else() + set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec") + endif() endif() elseif(AER_THRUST_BACKEND STREQUAL "TBB") message(STATUS "TBB Support found!") diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index f53244bd56..660659b214 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -477,7 +477,11 @@ void ChunkContainer::Execute(Function func,uint_t iChunk,uint_t count) thrust::for_each_n(thrust::seq, ci , size, func); } #else - uint_t size = count * func.size(chunk_bits_); + uint_t size; + if(func.use_cache()) + size = count << (chunk_bits_ - func.qubits_count()); + else + size = count * func.size(chunk_bits_); auto ci = 
thrust::counting_iterator(0); thrust::for_each_n(thrust::device, ci , size, func); #endif @@ -826,8 +830,6 @@ template void ChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) { const size_t N = qubits.size() - control_bits; - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); if(N == 1){ if(control_bits == 0) @@ -839,7 +841,21 @@ void ChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubit Execute(MatrixMult4x4(mat,qubits[0],qubits[1]), iChunk, count); } else{ + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); +#ifndef AER_THRUST_CUDA + if(N == 3){ + StoreMatrix(mat, iChunk); + Execute(MatrixMult8x8(qubits,qubits_sorted), iChunk, count); + } + else if(N == 4){ + StoreMatrix(mat, iChunk); + Execute(MatrixMult16x16(qubits,qubits_sorted), iChunk, count); + } + else if(N <= 10){ +#else if(N <= 10){ +#endif int i; for(i=0;i Date: Thu, 27 Jan 2022 17:28:01 +0900 Subject: [PATCH 12/17] removed whitespace --- qiskit/providers/aer/backends/aer_simulator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py index 4dea785728..a8ef4a9573 100644 --- a/qiskit/providers/aer/backends/aer_simulator.py +++ b/qiskit/providers/aer/backends/aer_simulator.py @@ -150,7 +150,7 @@ class AerSimulator(AerBackend): If AerSimulator is built with cuStateVec support, cuStateVec APIs are enabled by setting ``cuStateVec_enable=True``. This is experimental implementation - based on cuQuantum Beta 2. + based on cuQuantum Beta 2. 
**Additional Backend Options** From c7812080fa48b4b40ba8716e24a7cd4b0b559ddb Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Tue, 1 Feb 2022 17:33:22 +0900 Subject: [PATCH 13/17] reflecting review comments --- CONTRIBUTING.md | 7 ++ .../providers/aer/backends/aer_simulator.py | 6 +- src/controllers/aer_controller.hpp | 4 - .../density_matrix/densitymatrix_thrust.hpp | 36 +++---- src/simulators/state.hpp | 5 +- src/simulators/statevector/chunk/chunk.hpp | 9 +- .../statevector/chunk/chunk_container.hpp | 19 +--- .../statevector/chunk/chunk_manager.hpp | 3 + .../chunk/cuStateVec_chunk_container.hpp | 13 ++- .../statevector/chunk/cuda_kernels.hpp | 2 + .../chunk/device_chunk_container.hpp | 10 +- .../chunk/host_chunk_container.hpp | 2 + .../statevector/chunk/thrust_kernels.hpp | 2 + .../statevector/qubitvector_thrust.hpp | 98 +++++++++---------- 14 files changed, 109 insertions(+), 107 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 836e4824ba..68483d1675 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -664,6 +664,13 @@ sim = AerSimulator(method='statevector', device='GPU') results = execute(circuit,sim,cuStateVec_enable=True,cuStateVec_threshold=10).result() ``` +Also you can accelrate density matrix simulation as well. +Following example shows how to enable cuStateVec for 5 or more qubits circuit on density matrix method +``` +sim = AerSimulator(method='density_matrix', device='GPU') +results = execute(circuit,sim,cuStateVec_enable=True,cuStateVec_threshold=5).result() +``` + ### Building with MPI support diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py index a8ef4a9573..07531c5647 100644 --- a/qiskit/providers/aer/backends/aer_simulator.py +++ b/qiskit/providers/aer/backends/aer_simulator.py @@ -226,12 +226,14 @@ class AerSimulator(AerBackend): the input circuit is equal or greater than ``cuStateVec_threshold``. Currently this option only works well for large number of qubits. 
Also this option will be disabled for noise simulation - (Default: True). + (Default: True). This option will be ignored + if AerSimulator is not built with cuStateVec support. * ``cuStateVec_threshold`` (int): This option sets the threshold number of qubits to enable ``cuStateVec_enable`` option. cuStateVec is enabled when the number of qubits is equal or greater - than this option (Default: 22). + than this option (Default: 22). This option will be ignored + if AerSimulator is not built with cuStateVec support. * ``blocking_enable`` (bool): This option enables parallelization with multiple GPUs or multiple processes with MPI (CPU/GPU). This option diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index b000906a37..1de932a293 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -666,10 +666,6 @@ void Controller::set_parallelization_circuit(const Circuit &circ, if(batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && max_batched_states_ >= num_gpus_ && batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ - //cuStateVec is not supported currently, because cuStateVec does not handle conditional functions - if(cuStateVec_enable_ && circ.num_qubits >= cuStateVec_threshold_) - enable_batch_multi_shots_ = false; - else enable_batch_multi_shots_ = true; } diff --git a/src/simulators/density_matrix/densitymatrix_thrust.hpp b/src/simulators/density_matrix/densitymatrix_thrust.hpp index 810ef2056b..7eaddb75c1 100755 --- a/src/simulators/density_matrix/densitymatrix_thrust.hpp +++ b/src/simulators/density_matrix/densitymatrix_thrust.hpp @@ -262,7 +262,7 @@ void DensityMatrixThrust::apply_diagonal_superop_matrix(const reg_t &qub template -class DensityMatrixUnitary2x2 : public GateFuncBase +class DensityMatrixUnitary2x2 : public Chunk::GateFuncBase { protected: thrust::complex m0,m1,m2,m3; @@ -364,7 +364,7 @@ void DensityMatrixThrust::apply_unitary_matrix(const reg_t &qubits, } template 
-class DensityDiagMatMult2x2 : public GateFuncBase +class DensityDiagMatMult2x2 : public Chunk::GateFuncBase { protected: uint_t offset; @@ -429,7 +429,7 @@ class DensityDiagMatMult2x2 : public GateFuncBase }; template -class DensityDiagMatMultNxN : public GateFuncBase +class DensityDiagMatMultNxN : public Chunk::GateFuncBase { protected: int nqubits_; @@ -512,7 +512,7 @@ void DensityMatrixThrust::apply_diagonal_unitary_matrix(const reg_t &qub // Apply Specialized Gates //----------------------------------------------------------------------- template -class DensityCX : public GateFuncBase +class DensityCX : public Chunk::GateFuncBase { protected: uint_t offset; @@ -599,7 +599,7 @@ void DensityMatrixThrust::apply_cnot(const uint_t qctrl, const uint_t qt } template -class DensityPhase : public GateFuncBase +class DensityPhase : public Chunk::GateFuncBase { protected: thrust::complex phase_; @@ -665,7 +665,7 @@ void DensityMatrixThrust::apply_phase(const uint_t q,const complex_t &ph } template -class DensityCPhase : public GateFuncBase +class DensityCPhase : public Chunk::GateFuncBase { protected: uint_t offset; @@ -753,7 +753,7 @@ void DensityMatrixThrust::apply_swap(const uint_t q0, const uint_t q1) { } template -class DensityX : public GateFuncBase +class DensityX : public Chunk::GateFuncBase { protected: uint_t mask0; @@ -829,7 +829,7 @@ void DensityMatrixThrust::apply_x(const uint_t qubit) } template -class DensityY : public GateFuncBase +class DensityY : public Chunk::GateFuncBase { protected: uint_t mask0; @@ -929,7 +929,7 @@ void DensityMatrixThrust::apply_toffoli(const uint_t qctrl0, //special case Z only template -class expval_pauli_Z_func_dm : public GateFuncBase +class expval_pauli_Z_func_dm : public Chunk::GateFuncBase { protected: uint_t z_mask_; @@ -966,7 +966,7 @@ class expval_pauli_Z_func_dm : public GateFuncBase ret = q0.real(); if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) + if(Chunk::pop_count_kernel(i & z_mask_) & 1) ret = -ret; } @@ 
-979,7 +979,7 @@ class expval_pauli_Z_func_dm : public GateFuncBase }; template -class expval_pauli_XYZ_func_dm : public GateFuncBase +class expval_pauli_XYZ_func_dm : public Chunk::GateFuncBase { protected: uint_t x_mask_; @@ -1026,7 +1026,7 @@ class expval_pauli_XYZ_func_dm : public GateFuncBase q0 = 2 * phase_ * q0; ret = q0.real(); if(z_mask_ != 0){ - if(pop_count_kernel(idx_vec & z_mask_) & 1) + if(Chunk::pop_count_kernel(idx_vec & z_mask_) & 1) ret = -ret; } return ret; @@ -1067,7 +1067,7 @@ double DensityMatrixThrust::expval_pauli(const reg_t &qubits, } template -class expval_pauli_XYZ_func_dm_non_diagonal : public GateFuncBase +class expval_pauli_XYZ_func_dm_non_diagonal : public Chunk::GateFuncBase { protected: uint_t x_mask_; @@ -1108,7 +1108,7 @@ class expval_pauli_XYZ_func_dm_non_diagonal : public GateFuncBase q0 = phase_ * q0; ret = q0.real(); if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) + if(Chunk::pop_count_kernel(i & z_mask_) & 1) ret = -ret; } return ret; @@ -1151,7 +1151,7 @@ double DensityMatrixThrust::probability(const uint_t outcome) const template -class density_probability_func : public GateFuncBase +class density_probability_func : public Chunk::GateFuncBase { protected: uint_t qubit_sp_; @@ -1257,7 +1257,7 @@ reg_t DensityMatrixThrust::sample_measure(const std::vector &rnd } template -class density_reset_after_measure_func : public GateFuncBase +class density_reset_after_measure_func : public Chunk::GateFuncBase { protected: uint_t num_qubits_; @@ -1325,7 +1325,7 @@ void DensityMatrixThrust::apply_batched_measure(const reg_t& qubits,std: count = BaseVector::chunk_.container()->num_chunks(); //total probability - BaseVector::apply_function_sum(nullptr,trace_func(BaseMatrix::rows_),true); + BaseVector::apply_function_sum(nullptr,Chunk::trace_func(BaseMatrix::rows_),true); BaseVector::apply_function(set_probability_buffer_for_reset_func(BaseVector::chunk_.probability_buffer(),BaseVector::chunk_.container()->num_chunks(), 
BaseVector::chunk_.reduce_buffer(),BaseVector::chunk_.reduce_buffer_size()) ); @@ -1374,7 +1374,7 @@ void DensityMatrixThrust::apply_batched_measure(const reg_t& qubits,std: } template -class density_reset_func : public GateFuncBase +class density_reset_func : public Chunk::GateFuncBase { protected: uint_t num_qubits_; diff --git a/src/simulators/state.hpp b/src/simulators/state.hpp index 7476d88c65..c07b5e99df 100644 --- a/src/simulators/state.hpp +++ b/src/simulators/state.hpp @@ -342,8 +342,6 @@ class State { complex_t global_phase_ = 1; int_t max_matrix_qubits_ = 0; - - std::string sim_device_name_; //name of device }; @@ -359,8 +357,7 @@ State::~State(void) template void State::set_config(const json_t &config) { - //get device name - JSON::get_value(sim_device_name_, "device", config); + } template diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp index 13e0603dfa..f2fc4f5d29 100644 --- a/src/simulators/statevector/chunk/chunk.hpp +++ b/src/simulators/statevector/chunk/chunk.hpp @@ -25,6 +25,7 @@ namespace AER { namespace QV { +namespace Chunk { //============================================================================ @@ -410,17 +411,11 @@ class Chunk return chunk_container_.lock()->expval_pauli(chunk_pos_,qubits,pauli,initial_phase); } - - //largest number of qubits that meets num_chunks_ = m*(2^num_pow2_qubits_) - uint_t num_pow2_qubits(void) - { - chunk_container_.lock()->num_pow2_qubits(); - } - }; //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index 660659b214..59ca7f7f03 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ 
-67,6 +67,7 @@ DISABLE_WARNING_POP namespace AER { namespace QV { +namespace Chunk { template class Chunk; template class DeviceChunkContainer; @@ -154,12 +155,6 @@ class ChunkContainer : public std::enable_shared_from_this::deallocate_chunks(void) reduced_queue_end_.clear(); } -template -void ChunkContainer::update_pow2_qubits(void) -{ - uint_t n = num_chunks_; - num_pow2_qubits_ = chunk_bits_; - while((n & 1) == 0){ - num_pow2_qubits_++; - n >>= 1; - } -} - template void ChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) { @@ -1004,6 +988,7 @@ double ChunkContainer::expval_pauli(const uint_t iChunk,const reg_t& qub //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/chunk_manager.hpp b/src/simulators/statevector/chunk/chunk_manager.hpp index 313a6f486e..d8a8a4fbfa 100644 --- a/src/simulators/statevector/chunk/chunk_manager.hpp +++ b/src/simulators/statevector/chunk/chunk_manager.hpp @@ -25,6 +25,8 @@ namespace AER { namespace QV { +namespace Chunk { + //============================================================================ // chunk manager class @@ -400,6 +402,7 @@ void ChunkManager::execute_on_device(Function func,const std::vector::apply_matrix(const uint_t iChunk,const re { thrust::complex* pMat; int_t num_qubits = qubits.size()-control_bits; - +/* if((BaseContainer::matrix_buffer_size_ >= (1ull << (num_qubits*2))) && ((count == this->num_chunks_ && iChunk == 0) || BaseContainer::num_matrices_ > 1)){ BaseContainer::StoreMatrix(mat,iChunk); pMat = BaseContainer::matrix_pointer(iChunk); } - else{ + else{*/ //if operation is not batchable, use host memory pMat = (thrust::complex*)&mat[0]; BaseContainer::set_device(); - } +// } std::vector 
qubits32(qubits.size()); for(int_t i=0;i::apply_diagonal_matrix(const uint_t iChunk return apply_diagonal_matrix(iChunk, qubits, 0, diag_ctrl, count); } + /* if((BaseContainer::matrix_buffer_size_ >= (1ull << num_qubits)) && ((count == this->num_chunks_ && iChunk == 0) || BaseContainer::num_matrices_ > 1)){ BaseContainer::StoreMatrix(diag,iChunk); pMat = BaseContainer::matrix_pointer(iChunk); } - else{ + else{*/ //if operation is not batchable, use host memory pMat = (thrust::complex*)&diag[0]; BaseContainer::set_device(); - } +// } std::vector qubits32(qubits.size()); for(int_t i=0;i::expval_pauli(const uint_t iChunk,const //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/cuda_kernels.hpp b/src/simulators/statevector/chunk/cuda_kernels.hpp index 9322a69279..4380578813 100644 --- a/src/simulators/statevector/chunk/cuda_kernels.hpp +++ b/src/simulators/statevector/chunk/cuda_kernels.hpp @@ -18,6 +18,7 @@ namespace AER { namespace QV { +namespace Chunk { template __global__ @@ -339,6 +340,7 @@ __global__ void dev_reduce_sum_uint(uint_t *pReduceBuffer,uint_t n,uint_t buf_si //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index 035c60ad32..68126695c6 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -20,6 +20,7 @@ namespace AER { namespace QV { +namespace Chunk { //============================================================================ 
@@ -320,7 +321,13 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu this->num_chunks_ = nc; data_.resize((nc+buffers) << chunk_bits); - this->update_pow2_qubits(); + //init number of bits for chunk count + uint_t nc_tmp = this->num_chunks_; + this->num_pow2_qubits_ = this->chunk_bits_; + while((nc_tmp & 1) == 0){ + this->num_pow2_qubits_++; + nc_tmp >>= 1; + } #ifdef AER_THRUST_CUDA stream_.resize(nc + buffers); @@ -1153,6 +1160,7 @@ void DeviceChunkContainer::copy_to_probability_buffer(std::vector::sample_measure(uint_t iChunk,const std::vector //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/thrust_kernels.hpp b/src/simulators/statevector/chunk/thrust_kernels.hpp index 701eb05e0a..a882f5c8fc 100644 --- a/src/simulators/statevector/chunk/thrust_kernels.hpp +++ b/src/simulators/statevector/chunk/thrust_kernels.hpp @@ -45,6 +45,7 @@ DISABLE_WARNING_POP namespace AER { namespace QV { +namespace Chunk { //======================================== // base class of gate functions @@ -2689,6 +2690,7 @@ class multi_pauli_Z_func : public GateFuncBase //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index 5d26570bb9..87bc26c4c4 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -449,11 +449,11 @@ class QubitVectorThrust { size_t num_qubits_; size_t data_size_; - mutable Chunk chunk_; - mutable Chunk buffer_chunk_; - mutable Chunk send_chunk_; - mutable Chunk 
recv_chunk_; - std::shared_ptr> chunk_manager_ = nullptr; + mutable Chunk::Chunk chunk_; + mutable Chunk::Chunk buffer_chunk_; + mutable Chunk::Chunk send_chunk_; + mutable Chunk::Chunk recv_chunk_; + std::shared_ptr> chunk_manager_ = nullptr; mutable thrust::host_vector> checkpoint_; @@ -758,7 +758,7 @@ template void QubitVectorThrust::initialize_component(const reg_t &qubits, const cvector_t &state0) { if(qubits.size() == 1){ - apply_function(initialize_component_1qubit_func(qubits[0],state0[0],state0[1]) ); + apply_function(Chunk::initialize_component_1qubit_func(qubits[0],state0[0],state0[1]) ); } else if(qubits.size() <= chunk_.container()->matrix_bits()){ auto qubits_sorted = qubits; @@ -772,14 +772,14 @@ void QubitVectorThrust::initialize_component(const reg_t &qubits, const // chunk_.StoreMatrix(state0); // chunk_.StoreUintParams(qubits_param); - apply_function(initialize_component_func(state0,qubits_sorted), state0, qubits_param ); + apply_function(Chunk::initialize_component_func(state0,qubits_sorted), state0, qubits_param ); } else{ //if initial state is larger that matrix buffer, set one by one. 
uint_t DIM = 1ull << qubits.size(); uint_t i; for(i=0;i(state0[i],qubits,i) ); + apply_function(Chunk::initialize_large_component_func(state0[i],qubits,i) ); } } } @@ -794,7 +794,7 @@ void QubitVectorThrust::zero() DebugMsg("zero"); #endif - apply_function(ZeroClear(), cvector_t(), reg_t()); + apply_function(Chunk::ZeroClear(), cvector_t(), reg_t()); #ifdef AER_DEBUG DebugMsg("zero done"); @@ -823,7 +823,7 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t //only first chunk call allocation function if(chunk_bits > 0 && num_qubits > 0){ - chunk_manager_ = std::make_shared>(); + chunk_manager_ = std::make_shared>(); chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,chunk_index_,max_matrix_bits_, cuStateVec_enable_); } @@ -1134,7 +1134,7 @@ void QubitVectorThrust::initialize() if(multi_chunk_distribution_){ if(chunk_index_ == 0){ - apply_function(initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->num_qubits()))); + apply_function(Chunk::initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->num_qubits()))); } else{ zero(); @@ -1142,7 +1142,7 @@ void QubitVectorThrust::initialize() chunk_.synchronize(); } else{ - apply_function(initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->chunk_bits()))); + apply_function(Chunk::initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->chunk_bits()))); } #ifdef AER_DEBUG @@ -1550,7 +1550,7 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, QubitVecto thrust::complex* pChunk0; thrust::complex* pChunk1; - Chunk bufferChunk; + Chunk::Chunk bufferChunk; bool exec_on_src = false; if(chunk_.device() >= 0){ @@ -1594,13 +1594,13 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, QubitVecto } if(exec_on_src){ - src.apply_function(CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); + src.apply_function(Chunk::CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); 
src.chunk_.synchronize(); //should be synchronized here if(bufferChunk.is_mapped()) bufferChunk.CopyOut(chunk_); } else{ - apply_function(CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); + apply_function(Chunk::CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); chunk_.synchronize(); //should be synchronized here if(bufferChunk.is_mapped()) bufferChunk.CopyOut(src.chunk_); @@ -1632,7 +1632,7 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, uint_t rem else{ thrust::complex* pLocal; thrust::complex* pRemote; - Chunk buffer; + Chunk::Chunk buffer; #ifdef AER_DISABLE_GDR if(chunk_.device() >= 0){ //if there is no GPUDirectRDMA support, copy chunk from CPU @@ -1657,7 +1657,7 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, uint_t rem DebugMsg("chunk swap (process)",qubits); #endif - chunk_.Execute(CSwapChunk_func(qubits,num_qubits_,pLocal,pRemote,false),1); + chunk_.Execute(Chunk::CSwapChunk_func(qubits,num_qubits_,pLocal,pRemote,false),1); chunk_.synchronize(); //should be synchronized here if(buffer.is_mapped()){ @@ -1847,7 +1847,7 @@ double QubitVectorThrust::norm(const reg_t &qubits, const cvector_t(N)); + apply_function_sum(&ret,Chunk::NormMatrixMultNxN(N)); return ret; } } @@ -1866,7 +1866,7 @@ double QubitVectorThrust::norm_diagonal(const reg_t &qubits, const cvect chunk_.StoreUintParams(qubits); double ret; - apply_function_sum(&ret,NormDiagonalMultNxN(qubits) ); + apply_function_sum(&ret,Chunk::NormDiagonalMultNxN(qubits) ); return ret; } } @@ -1878,7 +1878,7 @@ template double QubitVectorThrust::norm(const uint_t qubit, const cvector_t &mat) const { double ret; - apply_function_sum(&ret,NormMatrixMult2x2(mat,qubit)); + apply_function_sum(&ret,Chunk::NormMatrixMult2x2(mat,qubit)); return ret; } @@ -1887,7 +1887,7 @@ template double QubitVectorThrust::norm_diagonal(const uint_t qubit, const cvector_t &mat) const { double ret; - apply_function_sum(&ret,NormDiagonalMult2x2(mat,qubit)); + 
apply_function_sum(&ret,Chunk::NormDiagonalMult2x2(mat,qubit)); return ret; } @@ -1949,7 +1949,7 @@ std::vector QubitVectorThrust::probabilities(const reg_t &qubits #define QV_RESET_TARGET_PROB 3 template -class reset_after_measure_func : public GateFuncBase +class reset_after_measure_func : public Chunk::GateFuncBase { protected: int num_qubits_; @@ -2001,7 +2001,7 @@ class reset_after_measure_func : public GateFuncBase }; template -class set_probability_buffer_for_reset_func : public GateFuncBase +class set_probability_buffer_for_reset_func : public Chunk::GateFuncBase { protected: uint_t reduce_buf_size_; @@ -2042,7 +2042,7 @@ class set_probability_buffer_for_reset_func : public GateFuncBase }; template -class check_measure_probability_func : public GateFuncBase +class check_measure_probability_func : public Chunk::GateFuncBase { protected: int num_qubits_; @@ -2184,7 +2184,7 @@ void QubitVectorThrust::apply_batched_measure(const reg_t& qubits,std::v chunk_.keep_conditional(true); //total probability - apply_function_sum(nullptr,norm_func(),true); + apply_function_sum(nullptr,Chunk::norm_func(),true); apply_function(set_probability_buffer_for_reset_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size()) ); @@ -2210,7 +2210,7 @@ void QubitVectorThrust::apply_batched_measure(const reg_t& qubits,std::v //loop for probability for(i=0;i(qubits,i),true); + apply_function_sum(nullptr,Chunk::probability_func(qubits,i),true); apply_function(check_measure_probability_func(qubits.size(),chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size(), @@ -2229,7 +2229,7 @@ void QubitVectorThrust::apply_batched_measure(const reg_t& qubits,std::v } template -class reset_func : public GateFuncBase +class reset_func : public Chunk::GateFuncBase { protected: int num_qubits_; @@ -2322,7 +2322,7 @@ void QubitVectorThrust::apply_batched_reset(const reg_t& qubits,std::vec 
chunk_.keep_conditional(true); //total probability - apply_function_sum(nullptr,norm_func(),true); + apply_function_sum(nullptr,Chunk::norm_func(),true); apply_function(set_probability_buffer_for_reset_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size()) ); @@ -2336,7 +2336,7 @@ void QubitVectorThrust::apply_batched_reset(const reg_t& qubits,std::vec chunk_.StoreUintParams(qubits); for(i=0;i(qubits,i),true); + apply_function_sum(nullptr,Chunk::probability_func(qubits,i),true); apply_function(check_measure_probability_func(qubits.size(),chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size(), @@ -2393,7 +2393,7 @@ void QubitVectorThrust::get_creg(ClassicalRegister& creg) } template -class set_creg_func : public GateFuncBase +class set_creg_func : public Chunk::GateFuncBase { protected: uint_t reg_set_; @@ -2443,7 +2443,7 @@ void QubitVectorThrust::store_cmemory(uint_t qubit,int val) } template -class set_batched_creg_func : public GateFuncBase +class set_batched_creg_func : public Chunk::GateFuncBase { protected: int_t reg_set_; @@ -2505,7 +2505,7 @@ int_t QubitVectorThrust::set_batched_system_conditional(int_t src_reg, r } template -class copy_creg_func : public GateFuncBase +class copy_creg_func : public Chunk::GateFuncBase { protected: uint_t reg_dest_; @@ -2603,7 +2603,7 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, double ret; // specialize x_max == 0 if(x_mask == 0) { - apply_function_sum(&ret, expval_pauli_Z_func(z_mask) ); + apply_function_sum(&ret, Chunk::expval_pauli_Z_func(z_mask) ); return ret; } @@ -2611,7 +2611,7 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, // This is (-1j) ** number of Y terms modulo 4 auto phase = std::complex(initial_phase); add_y_phase(num_y, phase); - apply_function_sum(&ret, expval_pauli_XYZ_func(x_mask, z_mask, x_max, phase) ); + apply_function_sum(&ret, 
Chunk::expval_pauli_XYZ_func(x_mask, z_mask, x_max, phase) ); return ret; } @@ -2628,7 +2628,7 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, //get pointer to pairing chunk (copy if needed) double ret; thrust::complex* pair_ptr; - Chunk buffer; + Chunk::Chunk buffer; if(pair_chunk.data() == this->data()){ #ifdef AER_DISABLE_GDR @@ -2676,7 +2676,7 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, auto phase = std::complex(initial_phase); add_y_phase(num_y, phase); - apply_function_sum(&ret, expval_pauli_inter_chunk_func(x_mask, z_mask, phase, pair_ptr,z_count,z_count_pair) ); + apply_function_sum(&ret, Chunk::expval_pauli_inter_chunk_func(x_mask, z_mask, phase, pair_ptr,z_count,z_count_pair) ); if(buffer.is_mapped()){ chunk_manager_->UnmapBufferChunk(buffer); @@ -2716,16 +2716,16 @@ void QubitVectorThrust::apply_pauli(const reg_t &qubits, add_y_phase(num_y, phase); if(x_mask == 0){ - apply_function(multi_pauli_Z_func(z_mask, phase)); + apply_function(Chunk::multi_pauli_Z_func(z_mask, phase)); } else{ - apply_function(multi_pauli_func(x_mask, z_mask, x_max, phase) ); + apply_function(Chunk::multi_pauli_func(x_mask, z_mask, x_max, phase) ); } } //batched Pauli operation used for Pauli noise template -class batched_pauli_func : public GateFuncBase +class batched_pauli_func : public Chunk::GateFuncBase { protected: thrust::complex coeff_; @@ -2783,10 +2783,10 @@ class batched_pauli_func : public GateFuncBase phase = thrust::complex(-coeff_.imag(),coeff_.real()); if(z_mask_ != 0){ - if(pop_count_kernel(idx0 & z_mask_) & 1) + if(Chunk::pop_count_kernel(idx0 & z_mask_) & 1) q0 *= -1; - if(pop_count_kernel(idx1 & z_mask_) & 1) + if(Chunk::pop_count_kernel(idx1 & z_mask_) & 1) q1 *= -1; } if(x_mask_ == 0){ @@ -2859,7 +2859,7 @@ void QubitVectorThrust::apply_batched_pauli_ops(const std::vector -class MatrixMult2x2_conditional : public GateFuncBase +class MatrixMult2x2_conditional : public Chunk::GateFuncBase { protected: thrust::complex 
m0,m1,m2,m3; @@ -2910,13 +2910,13 @@ class MatrixMult2x2_conditional : public GateFuncBase }; template -class MatrixMultNxN_conditional : public GateFuncWithCache +class MatrixMultNxN_conditional : public Chunk::GateFuncWithCache { protected: uint_t prob_buf_size_; double* probs_; public: - MatrixMultNxN_conditional(uint_t nq,double* probs,uint_t prob_size) : GateFuncWithCache(nq) + MatrixMultNxN_conditional(uint_t nq,double* probs,uint_t prob_size) : Chunk::GateFuncWithCache(nq) { probs_ = probs; prob_buf_size_ = prob_size; @@ -2957,7 +2957,7 @@ class MatrixMultNxN_conditional : public GateFuncWithCache }; template -class check_kraus_probability_func : public GateFuncBase +class check_kraus_probability_func : public Chunk::GateFuncBase { protected: uint_t reduce_buf_size_; @@ -3065,7 +3065,7 @@ void QubitVectorThrust::apply_batched_kraus(const reg_t &qubits, cvector_t vmat = Utils::vectorize_matrix(kmats[i]); chunk_.set_conditional(system_reg); - apply_function_sum(nullptr,NormMatrixMult2x2(vmat,qubits[0]),true); + apply_function_sum(nullptr,Chunk::NormMatrixMult2x2(vmat,qubits[0]),true); apply_function(check_kraus_probability_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size() ) ); @@ -3087,7 +3087,7 @@ void QubitVectorThrust::apply_batched_kraus(const reg_t &qubits, chunk_.set_conditional(system_reg); chunk_.StoreMatrix(Utils::vectorize_matrix(kmats[i])); - apply_function_sum(nullptr,NormMatrixMultNxN(N),true); + apply_function_sum(nullptr,Chunk::NormMatrixMultNxN(N),true); apply_function(check_kraus_probability_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size() ) ); @@ -3103,7 +3103,7 @@ void QubitVectorThrust::apply_batched_kraus(const reg_t &qubits, } template -class bfunc_kernel : public GateFuncBase +class bfunc_kernel : public Chunk::GateFuncBase { protected: uint_t bfunc_num_regs_; @@ -3229,7 +3229,7 @@ void 
QubitVectorThrust::apply_bfunc(const Operations::Op &op) } template -class roerror_kernel : public GateFuncBase +class roerror_kernel : public Chunk::GateFuncBase { protected: uint_t num_regs_; From 0f4a93eb1649bef747c14c4a4904844c213445fd Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Tue, 1 Feb 2022 17:54:39 +0900 Subject: [PATCH 14/17] added release note --- .../notes/cuQuantum-support-d33abe5b1cb778a8.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml diff --git a/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml b/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml new file mode 100644 index 0000000000..a302cda5fb --- /dev/null +++ b/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml @@ -0,0 +1,13 @@ +--- +features: + - | + Added support for cuQuantum, NVIDIA's APIs for quantum computing, + to accelerate statevector, density matrix and unitary simulators + by using GPUs. + This is an experimental implementation for cuQuantum Beta 2. (0.1.0) + cuStateVec APIs are enabled to accelerate instead of Aer's implementations + by building Aer by setting path of cuQuantum to ``CUSTATEVEC_ROOT``. + (binary distribution is not available currently.) + cuStateVec is enabled by setting ``device='GPU'`` and + ``cuStateVec_threshold`` options. cuStateVec is enabled when number of + qubits of input circuit is equal or greater than ``cuStateVec_threshold``. 
From c509131224a8963be64d96bb39d51ef3f764b331 Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Thu, 3 Feb 2022 17:07:23 +0900 Subject: [PATCH 15/17] set cuStateVec_enable to False as default, added test cases for cuStateVec --- CONTRIBUTING.md | 16 ++- .../providers/aer/backends/aer_simulator.py | 17 +--- src/controllers/aer_controller.hpp | 18 ++-- src/simulators/state_chunk.hpp | 8 +- .../chunk/cuStateVec_chunk_container.hpp | 98 ++++++++----------- .../backends/aer_simulator/test_options.py | 4 +- .../test_wrapper_qasm_simulator.py | 3 + test/terra/backends/simulator_test_case.py | 37 ++++++- 8 files changed, 106 insertions(+), 95 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 68483d1675..8ae8bc9ac1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -653,22 +653,20 @@ For example, qiskit-aer$ python ./setup.py bdist_wheel -- -DAER_THRUST_BACKEND=CUDA -DCUSTATEVEC_ROOT=path_to_cuQuantum -To run with cuStateVec, set `device='GPU'` to AerSimulator option and cuStateVec is enabled -if the number of qubits of input circuit is equal or greater than 22 qubits by default. -This threshold can be modified by setting `cuStateVec_threshold` option. -By setting `cuStateVec_enable=False` to disable using cuStateVec. -Following example shows how you accelerate 10 or more qubits simulations using cuStateVec. +If you want to link the cuQuantum library statically, set `CUSTATEVEC_STATIC` when running setup.py. +Otherwise you also have to set the environment variable LD_LIBRARY_PATH to the path of the cuQuantum libraries. + +To run with cuStateVec, set `device='GPU'` in the AerSimulator options and pass `cuStateVec_enable=True` as an option to the execute method. ``` sim = AerSimulator(method='statevector', device='GPU') -results = execute(circuit,sim,cuStateVec_enable=True,cuStateVec_threshold=10).result() +results = execute(circuit,sim,cuStateVec_enable=True).result() ``` -Also you can accelrate density matrix simulation as well. 
-Following example shows how to enable cuStateVec for 5 or more qubits circuit on density matrix method +You can also accelerate density matrix and unitary matrix simulations. ``` sim = AerSimulator(method='density_matrix', device='GPU') -results = execute(circuit,sim,cuStateVec_enable=True,cuStateVec_threshold=5).result() +results = execute(circuit,sim,cuStateVec_enable=True).result() ``` diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py index 07531c5647..c9836b68b5 100644 --- a/qiskit/providers/aer/backends/aer_simulator.py +++ b/qiskit/providers/aer/backends/aer_simulator.py @@ -222,17 +222,7 @@ class AerSimulator(AerBackend): * ``cuStateVec_enable`` (bool): This option enables accelerating by cuStateVec library of cuQuantum from NVIDIA, that has highly optimized - kernels for GPUs. This option is enabled when the number of qubits of - the input circuit is equal or greater than ``cuStateVec_threshold``. - Currently this option only works well for large number of qubits. - Also this option will be disabled for noise simulation - (Default: True). This option will be ignored - if AerSimulator is not built with cuStateVec support. - - * ``cuStateVec_threshold`` (int): This option sets the threshold - number of qubits to enable ``cuStateVec_enable`` option. - cuStateVec is enabled when the number of qubits is equal or greater - than this option (Default: 22). This option will be ignored + kernels for GPUs (Default: False). This option will be ignored if AerSimulator is not built with cuStateVec support. 
* ``blocking_enable`` (bool): This option enables parallelization with @@ -533,9 +523,8 @@ def _default_options(cls): memory=None, noise_model=None, seed_simulator=None, - # cuStateVec (cuQuantum) options - cuStateVec_enable=True, - cuStateVec_threshold=22, + # cuStateVec (cuQuantum) option + cuStateVec_enable=False, # cache blocking for multi-GPUs/MPI options blocking_qubits=None, blocking_enable=False, diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 1de932a293..8c3a383890 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -379,7 +379,6 @@ class Controller { //settings for cuStateVec bool cuStateVec_enable_ = false; - int cuStateVec_threshold_ = 22; }; //========================================================================= @@ -469,15 +468,11 @@ void Controller::set_config(const json_t &config) { JSON::get_value(batched_shots_gpu_max_qubits_, "batched_shots_gpu_max_qubits", config); } -#ifdef AER_CUSTATEVEC //cuStateVec configs + cuStateVec_enable_ = false; if(JSON::check_key("cuStateVec_enable", config)) { JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config); } - if(JSON::check_key("cuStateVec_threshold", config)) { - JSON::get_value(cuStateVec_threshold_, "cuStateVec_threshold", config); - } -#endif // Override automatic simulation method with a fixed method std::string method; @@ -521,6 +516,14 @@ void Controller::set_config(const json_t &config) { throw std::runtime_error( "Simulation device \"GPU\" is not supported on this system"); #else + +#ifndef AER_CUSTATEVEC + if(cuStateVec_enable_){ + //Aer is not built for cuStateVec + throw std::runtime_error( + "Simulation device \"GPU\" does not supported cuStateVec on this system"); + } +#endif int nDev; if (cudaGetDeviceCount(&nDev) != cudaSuccess) { cudaGetLastError(); @@ -669,7 +672,8 @@ void Controller::set_parallelization_circuit(const Circuit &circ, enable_batch_multi_shots_ = true; } - if(cuStateVec_enable_ && 
circ.num_qubits >= cuStateVec_threshold_){ + if(cuStateVec_enable_){ + enable_batch_multi_shots_ = false; //cuStateVec does not support batch execution of multi-shots parallel_shots_ = 1; //cuStateVec is currently not thread safe return; } diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp index b9b6990fcb..f59cc1a12c 100644 --- a/src/simulators/state_chunk.hpp +++ b/src/simulators/state_chunk.hpp @@ -393,7 +393,6 @@ class StateChunk : public State { //cuStateVec settings bool cuStateVec_enable_ = false; - int cuStateVec_threshold_ = 22; //----------------------------------------------------------------------- // Apply circuits and ops @@ -539,9 +538,6 @@ void StateChunk::set_config(const json_t &config) if(JSON::check_key("cuStateVec_enable", config)) { JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config); } - if(JSON::check_key("cuStateVec_threshold", config)) { - JSON::get_value(cuStateVec_threshold_, "cuStateVec_threshold", config); - } #endif } @@ -639,9 +635,7 @@ bool StateChunk::allocate(uint_t num_qubits,uint_t block_bits,uint_t nu #ifdef AER_CUSTATEVEC //set cuStateVec_enable_ if(cuStateVec_enable_){ - if(num_qubits_ < cuStateVec_threshold_) - cuStateVec_enable_ = false; //disable if number of qubits is smaller than threshold - else if(multi_shots_parallelization_) + if(multi_shots_parallelization_) cuStateVec_enable_ = false; //multi-shots parallelization is not supported for cuStateVec } diff --git a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp index 671ac9355f..783a4492e4 100644 --- a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp +++ b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp @@ -32,7 +32,7 @@ template class cuStateVecChunkContainer : public DeviceChunkContainer { protected: - std::vector custatevec_handle_; //cuStatevec handle for this chunk container + custatevecHandle_t custatevec_handle_; 
//cuStatevec handle for this chunk container AERDeviceVector custatevec_work_; //work buffer for cuStatevec uint_t custatevec_work_size_; //buffer size uint_t custatevec_chunk_total_qubits_; //total qubits of statevector passed to ApplyMatrix @@ -107,22 +107,11 @@ uint_t cuStateVecChunkContainer::Allocate(int idev,int chunk_bits,int nu //initialize custatevevtor handle custatevecStatus_t err; - custatevec_handle_.resize(nc + buffers); - for(uint_t i=0;i::Allocate(int idev,int chunk_bits,int nu //matrix err = custatevecApplyMatrix_bufferSize( - custatevec_handle_[0], CUDA_C_64F, custatevec_chunk_total_qubits_ , &mat[0], CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_COL, + custatevec_handle_, CUDA_C_64F, custatevec_chunk_total_qubits_ , &mat[0], CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_COL, 0, matrix_bit, 0, CUSTATEVEC_COMPUTE_64F, &custatevec_work_size_); if(err != CUSTATEVEC_STATUS_SUCCESS){ std::stringstream str; @@ -151,7 +140,7 @@ uint_t cuStateVecChunkContainer::Allocate(int idev,int chunk_bits,int nu basis[i] = i; } err = custatevecApplyGeneralizedPermutationMatrix_bufferSize( - custatevec_handle_[0], CUDA_C_64F, custatevec_chunk_total_qubits_ , &perm[0], &mat[0], CUDA_C_64F, + custatevec_handle_, CUDA_C_64F, custatevec_chunk_total_qubits_ , &perm[0], &mat[0], CUDA_C_64F, &basis[0], matrix_bit, 0, &diag_size); if(err != CUSTATEVEC_STATUS_SUCCESS){ std::stringstream str; @@ -173,10 +162,7 @@ void cuStateVecChunkContainer::Deallocate(void) custatevec_work_.clear(); custatevec_work_.shrink_to_fit(); - for(int_t i=0;i @@ -188,6 +174,7 @@ reg_t cuStateVecChunkContainer::sample_measure(uint_t iChunk,const std:: reg_t samples(SHOTS,0); BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); custatevecStatus_t err; custatevecSamplerDescriptor_t sampler; @@ -201,7 +188,7 @@ reg_t cuStateVecChunkContainer::sample_measure(uint_t iChunk,const std:: else state_type = CUDA_C_32F; - err = custatevecSampler_create(custatevec_handle_[iChunk], 
BaseContainer::chunk_pointer(iChunk), state_type, this->num_qubits_, &sampler, SHOTS, &extSize); + err = custatevecSampler_create(custatevec_handle_, BaseContainer::chunk_pointer(iChunk), state_type, this->num_qubits_, &sampler, SHOTS, &extSize); if(err != CUSTATEVEC_STATUS_SUCCESS){ std::stringstream str; str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_create " << custatevecGetErrorString(err); @@ -215,7 +202,7 @@ reg_t cuStateVecChunkContainer::sample_measure(uint_t iChunk,const std:: pExtBuf = thrust::raw_pointer_cast(extBuf.data()); } - err = custatevecSampler_preprocess(custatevec_handle_[iChunk],&sampler,pExtBuf,extSize); + err = custatevecSampler_preprocess(custatevec_handle_,&sampler,pExtBuf,extSize); if(err != CUSTATEVEC_STATUS_SUCCESS){ std::stringstream str; str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_preprocess " << custatevecGetErrorString(err); @@ -228,7 +215,7 @@ reg_t cuStateVecChunkContainer::sample_measure(uint_t iChunk,const std:: bitOrdering[i] = i; } - err = custatevecSampler_sample(custatevec_handle_[iChunk], &sampler, &bitStr[0], &bitOrdering[0], this->num_qubits_, &rnds[0], SHOTS, + err = custatevecSampler_sample(custatevec_handle_, &sampler, &bitStr[0], &bitOrdering[0], this->num_qubits_, &rnds[0], SHOTS, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER ) ; if(err != CUSTATEVEC_STATUS_SUCCESS){ std::stringstream str; @@ -256,16 +243,10 @@ void cuStateVecChunkContainer::apply_matrix(const uint_t iChunk,const re { thrust::complex* pMat; int_t num_qubits = qubits.size()-control_bits; -/* - if((BaseContainer::matrix_buffer_size_ >= (1ull << (num_qubits*2))) && ((count == this->num_chunks_ && iChunk == 0) || BaseContainer::num_matrices_ > 1)){ - BaseContainer::StoreMatrix(mat,iChunk); - pMat = BaseContainer::matrix_pointer(iChunk); - } - else{*/ - //if operation is not batchable, use host memory - pMat = (thrust::complex*)&mat[0]; - BaseContainer::set_device(); -// } + + pMat = (thrust::complex*)&mat[0]; + 
BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); std::vector qubits32(qubits.size()); for(int_t i=0;i::apply_matrix(const uint_t iChunk,const re custatevecStatus_t err; for(int_t i=0;i::apply_diagonal_matrix(const uint_t iChunk return apply_diagonal_matrix(iChunk, qubits, 0, diag_ctrl, count); } - /* - if((BaseContainer::matrix_buffer_size_ >= (1ull << num_qubits)) && ((count == this->num_chunks_ && iChunk == 0) || BaseContainer::num_matrices_ > 1)){ - BaseContainer::StoreMatrix(diag,iChunk); - pMat = BaseContainer::matrix_pointer(iChunk); - } - else{*/ - //if operation is not batchable, use host memory - pMat = (thrust::complex*)&diag[0]; - BaseContainer::set_device(); -// } + pMat = (thrust::complex*)&diag[0]; + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); std::vector qubits32(qubits.size()); for(int_t i=0;i::apply_diagonal_matrix(const uint_t iChunk custatevecStatus_t err; for(int_t i=0;i::apply_X(const uint_t iChunk,const reg_t& int_t num_qubits = qubits.size(); BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); uint_t perm_size = 1ull << num_qubits; std::vector perm(perm_size); @@ -439,7 +414,7 @@ void cuStateVecChunkContainer::apply_X(const uint_t iChunk,const reg_t& custatevecStatus_t err; for(int_t i=0;i::apply_Y(const uint_t iChunk,const reg_t& int_t num_qubits = qubits.size(); BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); uint_t perm_size = 1ull << num_qubits; cvector_t diag(perm_size); @@ -503,7 +479,7 @@ void cuStateVecChunkContainer::apply_Y(const uint_t iChunk,const reg_t& custatevecStatus_t err; for(int_t i=0;i::apply_swap(const uint_t iChunk,const reg_ int_t num_qubits = qubits.size(); BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); uint_t perm_size = 1ull << num_qubits; 
std::vector swap(perm_size); @@ -575,7 +552,7 @@ void cuStateVecChunkContainer::apply_swap(const uint_t iChunk,const reg_ custatevecStatus_t err; for(int_t i=0;i void cuStateVecChunkContainer::apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count) { BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); int_t size = 1ull << qubits.size(); custatevecIndex_t perm[size]; @@ -629,7 +607,7 @@ void cuStateVecChunkContainer::apply_permutation(const uint_t iChunk,con custatevecStatus_t err; for(int_t i=0;i::apply_permutation(const uint_t iChunk,con template double cuStateVecChunkContainer::norm(uint_t iChunk,uint_t count) const { + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + double ret = 0.0; uint_t bits; uint_t nc; @@ -670,7 +651,7 @@ double cuStateVecChunkContainer::norm(uint_t iChunk,uint_t count) const custatevecStatus_t err; for(int_t i=0;i::norm(uint_t iChunk,uint_t count) const template void cuStateVecChunkContainer::probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const { + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + cudaDataType_t state_type; if(sizeof(data_t) == sizeof(double)) state_type = CUDA_C_64F; @@ -699,7 +683,7 @@ void cuStateVecChunkContainer::probabilities(std::vector& probs, custatevecStatus_t err; if(qubits.size() == 1){ double p0,p1; - err = custatevecAbs2SumOnZBasis(custatevec_handle_[iChunk], BaseContainer::chunk_pointer(iChunk), state_type, this->chunk_bits_, + err = custatevecAbs2SumOnZBasis(custatevec_handle_, BaseContainer::chunk_pointer(iChunk), state_type, this->chunk_bits_, &p0, &p1, &qubits32[0], 1); probs.resize(2); probs[0] = p0; @@ -707,7 +691,7 @@ void cuStateVecChunkContainer::probabilities(std::vector& probs, } else{ probs.resize(1ull << qubits.size()); - err = 
custatevecAbs2SumArray(custatevec_handle_[iChunk], BaseContainer::chunk_pointer(iChunk), state_type, this->chunk_bits_, + err = custatevecAbs2SumArray(custatevec_handle_, BaseContainer::chunk_pointer(iChunk), state_type, this->chunk_bits_, &probs[0], &qubits32[0], qubits.size(), nullptr,nullptr,0); } @@ -724,6 +708,8 @@ double cuStateVecChunkContainer::expval_pauli(const uint_t iChunk,const if(initial_phase != 1.0){ return BaseContainer::expval_pauli(iChunk, qubits, pauli, initial_phase); } + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); cudaDataType_t state_type; if(sizeof(data_t) == sizeof(double)) @@ -751,7 +737,7 @@ double cuStateVecChunkContainer::expval_pauli(const uint_t iChunk,const const uint32_t nBasisBitsArray[] = {qubits.size()}; custatevecStatus_t err; - err = custatevecExpectationsOnPauliBasis(custatevec_handle_[iChunk], BaseContainer::chunk_pointer(iChunk), state_type, this->chunk_bits_, + err = custatevecExpectationsOnPauliBasis(custatevec_handle_, BaseContainer::chunk_pointer(iChunk), state_type, this->chunk_bits_, ret, pauliOperatorsArray, basisBitsArray, nBasisBitsArray, 1); if(err != CUSTATEVEC_STATUS_SUCCESS){ diff --git a/test/terra/backends/aer_simulator/test_options.py b/test/terra/backends/aer_simulator/test_options.py index cf5e31ab8a..e96f2a1719 100644 --- a/test/terra/backends/aer_simulator/test_options.py +++ b/test/terra/backends/aer_simulator/test_options.py @@ -91,7 +91,9 @@ def test_device_option(self, method, device): result = backend.run(qc).result() value = result.results[0].metadata.get('device', None) - self.assertEqual(value, device) + # device = 'GPU_cuStateVec' when cuStateVec is enabled + # so check if 'GPU' is included in value from result + self.assertTrue((value in device)) @data('automatic', 'statevector', 'density_matrix', 'stabilizer', 'matrix_product_state', 'extended_stabilizer') diff --git a/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py 
b/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py index 57c2422168..5f79d43f83 100644 --- a/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py +++ b/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py @@ -30,6 +30,9 @@ class TestQasmSimulator(SimulatorTestCase): def test_legacy_methods(self, method, device): """Test legacy device method options.""" backend = self.backend() + # GPU_cuStateVec is converted to GPU + if device == "GPU_cuStateVec": + device = "GPU" legacy_method = f"{method}_{device.lower()}" backend.set_options(method=legacy_method) self.assertEqual(backend.options.method, method) diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py index 331fb1fcf8..9f3fa91484 100644 --- a/test/terra/backends/simulator_test_case.py +++ b/test/terra/backends/simulator_test_case.py @@ -18,6 +18,10 @@ import itertools as it from qiskit.providers.aer import AerSimulator from test.terra.common import QiskitAerTestCase +from qiskit.circuit import QuantumCircuit +from qiskit.compiler import assemble +from qiskit.providers.aer.backends.backend_utils import cpp_execute +from qiskit.providers.aer.backends.controller_wrappers import aer_controller_execute class SimulatorTestCase(QiskitAerTestCase): @@ -30,7 +34,11 @@ def backend(self, **options): """Return AerSimulator backend using current class options""" sim_options = self.OPTIONS.copy() for key, val in options.items(): - sim_options[key] = val + if 'device' == key and 'cuStateVec' in val: + sim_options['device'] = 'GPU' + sim_options['cuStateVec_enable'] = True + else: + sim_options[key] = val return self.BACKEND(**sim_options) @@ -66,12 +74,39 @@ def _method_device(methods): if not methods: methods = AerSimulator().available_methods() available_devices = AerSimulator().available_devices() + #add special test device for cuStateVec if available + cuStateVec = check_cuStateVec(available_devices) + gpu_methods = ['statevector', 
'density_matrix', 'unitary'] data_args = [] for method in methods: if method in gpu_methods: for device in available_devices: data_args.append((method, device)) + #add test cases for cuStateVec if available using special device = 'GPU_cuStateVec' + #'GPU_cuStateVec' is used only inside tests not available in Aer + #and this is converted to "device='GPU'" and option "cuStateVec_enable = True" is added + if cuStateVec: + data_args.append((method, 'GPU_cuStateVec')) else: data_args.append((method, 'CPU')) return data_args + +def check_cuStateVec(devices): + """Return if the system supports cuStateVec or not""" + if 'GPU' in devices: + dummy_circ = QuantumCircuit(1) + dummy_circ.i(0) + qobj = assemble(dummy_circ, + optimization_level=0, + shots=1, + method="statevector", + device="GPU", + cuStateVec_enable=True) + #run dummy circuit to check if Aer is built with cuStateVec + result = cpp_execute(aer_controller_execute(), qobj) + return result.get('success', False) + else: + return False + + From 3a31cefeadb773ed57aa6041310d80714aee9d49 Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Fri, 4 Feb 2022 18:33:36 +0900 Subject: [PATCH 16/17] Fix omp setting for non-GPU / Fix omp nested loops --- src/controllers/aer_controller.hpp | 6 +- .../density_matrix/densitymatrix_state.hpp | 423 +++++++++++++----- src/simulators/state_chunk.hpp | 230 +++++----- .../statevector/statevector_state.hpp | 363 ++++++++++----- src/simulators/unitary/unitary_state.hpp | 136 ++++-- 5 files changed, 791 insertions(+), 367 deletions(-) diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 8c3a383890..c3f4f9aac9 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -672,7 +672,7 @@ void Controller::set_parallelization_circuit(const Circuit &circ, enable_batch_multi_shots_ = true; } - if(cuStateVec_enable_){ + if(sim_device_ == Device::GPU && cuStateVec_enable_){ enable_batch_multi_shots_ = false; //cuStateVec does not support 
batch execution of multi-shots parallel_shots_ = 1; //cuStateVec is currently not thread safe return; @@ -996,7 +996,7 @@ Result Controller::execute(std::vector &circuits, const int NUM_RESULTS = result.results.size(); //following looks very similar but we have to separate them to avoid omp nested loops that causes performance degradation //(DO NOT use if statement in #pragma omp) - if (parallel_experiments_ == 1) { + if (parallel_experiments_ == 1 || sim_device_ == Device::ThrustCPU) { for (int j = 0; j < NUM_RESULTS; ++j) { set_parallelization_circuit(circuits[j], noise_model, methods[j]); run_circuit(circuits[j], noise_model,methods[j], @@ -1476,7 +1476,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ, // Check if measure sampler and optimization are valid if (can_sample) { // Implement measure sampler - if (parallel_shots_ <= 1 || sim_device_ == Device::GPU) { + if (parallel_shots_ <= 1 || sim_device_ == Device::GPU || sim_device_ == Device::ThrustCPU) { state.set_max_matrix_qubits(max_bits); RngEngine rng; rng.set_seed(circ.seed); diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp index 858b3835b9..dcce3e8e09 100644 --- a/src/simulators/density_matrix/densitymatrix_state.hpp +++ b/src/simulators/density_matrix/densitymatrix_state.hpp @@ -443,20 +443,38 @@ void State::initialize_qreg(uint_t num_qubits, if(BaseState::multi_chunk_distribution_){ auto input = state.copy_to_matrix(); -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); - for(i=0;i<(1ull << 
(BaseState::chunk_bits_*2));i++){ - uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); - uint_t irow = i >> (BaseState::chunk_bits_); - tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -485,20 +503,38 @@ void State::initialize_qreg(uint_t num_qubits, } if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> 
((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); - for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ - uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); - uint_t irow = i >> (BaseState::chunk_bits_); - tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) 
<< (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -526,20 +562,38 @@ void State::initialize_qreg(uint_t num_qubits, } if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); - for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ - uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); - uint_t irow = i >> (BaseState::chunk_bits_); - tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - 
BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -569,21 +623,40 @@ void State::initialize_from_vector(const int_t iChunkIn, const list_t else if((1ull << (BaseState::num_qubits_*2)) == vec.size() * vec.size()) { int_t iChunk; if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - list_t vec1(1ull << BaseState::chunk_bits_); - list_t vec2(1ull << BaseState::chunk_bits_); - - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; - vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + list_t vec1(1ull << BaseState::chunk_bits_); + list_t vec2(1ull << BaseState::chunk_bits_); + + for(i=0;i<(1ull << 
BaseState::chunk_bits_);i++){ + vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + } + BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + list_t vec1(1ull << BaseState::chunk_bits_); + list_t vec2(1ull << BaseState::chunk_bits_); + + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + } + BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); } - BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); } } else{ @@ -876,38 +949,76 @@ double State::expval_pauli(const int_t iChunk, const reg_t &qubits, const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); const uint_t mask_l = (1ull << x_max) - 1; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) - for(i=0;i iChunk){ //on this process - double sign = 2.0; - if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) - sign = -2.0; - expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli_non_diagonal_chunk(qubits_in_chunk, pauli_in_chunk,phase); + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(i) reduction(+:expval) + for(i=0;i iChunk){ //on this process + double sign = 2.0; + if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) + sign = -2.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli_non_diagonal_chunk(qubits_in_chunk, 
pauli_in_chunk,phase); + } + } + } + else{ + for(i=0;i iChunk){ //on this process + double sign = 2.0; + if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) + sign = -2.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli_non_diagonal_chunk(qubits_in_chunk, pauli_in_chunk,phase); + } } } } else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(i) reduction(+:expval) + for(i=0;i iChunk){ //on this process + double sign = 1.0; + if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) + sign = -1.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,1.0); + } + } + } + else{ + for(i=0;i iChunk){ //on this process + double sign = 1.0; + if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) + sign = -1.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,1.0); + } + } + } + } + } + else{ //all bits are inside chunk + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(i) reduction(+:expval) for(i=0;i iChunk){ //on this process - double sign = 1.0; - if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) - sign = -1.0; - expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,1.0); + expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, pauli,1.0); } } } - } - else{ //all bits are inside chunk -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) - for(i=0;i iChunk){ //on this process - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, pauli,1.0); + else{ + for(i=0;i iChunk){ //on this process + expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, 
pauli,1.0); + } } } } @@ -1441,51 +1552,99 @@ rvector_t State::measure_probs(const int_t iChunk, const reg_t &qubit } } -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i,j,k) - for(i=0;i> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - - if(irow == icol){ //diagonal chunk - if(qubits_in_chunk.size() > 0){ - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if(qubits_in_chunk.size() == qubits.size()){ - for(j=0;j> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + + if(irow == icol){ //diagonal chunk + if(qubits_in_chunk.size() > 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); - i_in++; - } - else{ - if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ - idx += 1ull << k; + else{ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ + idx += 1ull << k; + } } } - } #pragma omp atomic - sum[idx] += chunkSum[j]; + sum[idx] += chunkSum[j]; + } } } + else{ //there is no bit in chunk + auto tr = std::real(BaseState::qregs_[i].trace()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += tr; + } } - else{ //there is no bit in chunk - auto tr = std::real(BaseState::qregs_[i].trace()); - int idx = 0; - for(k=0;k> qubits_out_chunk[k]) & 1){ - idx += 1ull << k; + } + } + else{ + for(i=0;i> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + + if(irow == icol){ //diagonal chunk + 
if(qubits_in_chunk.size() > 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ + idx += 1ull << k; + } + } + } + sum[idx] += chunkSum[j]; + } } } -#pragma omp atomic - sum[idx] += tr; + else{ //there is no bit in chunk + auto tr = std::real(BaseState::qregs_[i].trace()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } + sum[idx] += tr; + } } } } @@ -1531,9 +1690,14 @@ void State::measure_reset_update(const int_t iChunk, const reg_t &qub if(!BaseState::multi_chunk_distribution_) apply_diagonal_unitary_matrix(iChunk, qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::measure_reset_update(const int_t iChunk, const reg_t &qub BaseState::qregs_[iChunk].apply_x(qubits[0]); else{ if(qubits[0] < BaseState::chunk_bits_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::measure_reset_update(const int_t iChunk, const reg_t &qub if(!BaseState::multi_chunk_distribution_) apply_diagonal_unitary_matrix(iChunk, qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::measure_reset_update(const int_t iChunk, const reg_t &qub } } if(qubits_in_chunk.size() > 0){ //in chunk exchange -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i 0){ //out of chunk exchange diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp index f59cc1a12c..85571f98e0 100644 --- a/src/simulators/state_chunk.hpp +++ b/src/simulators/state_chunk.hpp @@ -511,6 +511,12 @@ class StateChunk : public State { uint_t mapped_index(const uint_t idx); + //apply OpenMP parallelization if enabled + template + void apply_omp_parallel(bool enabled, int_t i_begin, int_t i_end, Lambda& func); + + template + 
double apply_omp_parallel_reduction(bool enabled, int_t i_begin, int_t i_end, Lambda& func); }; @@ -568,6 +574,38 @@ void StateChunk::set_distribution(uint_t nprocs) #endif } +template +template +void StateChunk::apply_omp_parallel(bool enabled, int_t i_begin, int_t i_end, Lambda& func) +{ + if(enabled){ +#pragma omp parallel for + for(int_t i=i_begin;i +template +double StateChunk::apply_omp_parallel_reduction(bool enabled, int_t i_begin, int_t i_end, Lambda& func) +{ + double val = 0.0; + if(enabled){ +#pragma omp parallel for reduction(+:val) + for(int_t i=i_begin;i bool StateChunk::allocate(uint_t num_qubits,uint_t block_bits,uint_t num_parallel_shots) { @@ -647,6 +685,7 @@ bool StateChunk::allocate(uint_t num_qubits,uint_t block_bits,uint_t nu } else if(qregs_[0].name().find("thrust") != std::string::npos){ thrust_optimization_ = true; + chunk_omp_parallel_ = false; } @@ -901,42 +940,23 @@ void StateChunk::apply_ops_multi_shots(InputIterator first, InputIterat //resize qregs allocate_qregs(n_shots); } - //initialization (equivalent to initialize_qreg + initialize_creg) - if(num_groups_ > 1 && chunk_omp_parallel_){ -#pragma omp parallel for - for(i=0;i 1 && chunk_omp_parallel_),0,num_groups_,init_group); - for(uint_t j=top_chunk_of_group_[i];j::initialize_from_vector(const int_t iChunkIn, const lis int_t iChunk; if(multi_chunk_distribution_){ -#pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk::initialize_from_matrix(const int_t iChunkIn, const lis { int_t iChunk; if(multi_chunk_distribution_){ -#pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); - uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ - uint_t icol = i & ((1ull << chunk_bits_)-1); - uint_t irow = i 
>> chunk_bits_; - tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + if(chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); + uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ + uint_t icol = i & ((1ull << chunk_bits_)-1); + uint_t irow = i >> chunk_bits_; + tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + } + qregs_[iChunk].initialize_from_matrix(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); + uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ + uint_t icol = i & ((1ull << chunk_bits_)-1); + uint_t irow = i >> chunk_bits_; + tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + } + qregs_[iChunk].initialize_from_matrix(tmp); } - qregs_[iChunk].initialize_from_matrix(tmp); } } else{ @@ -1644,49 +1693,26 @@ void StateChunk::apply_chunk_swap(const reg_t &qubits) nPair = num_local_chunks_ >> 2; } - if(chunk_omp_parallel_){ -#pragma omp parallel for private(iPair,baseChunk,iChunk1,iChunk2) - for(iPair=0;iPair::apply_chunk_x(const uint_t qubit) if(qubit < chunk_bits_*qubit_scale()){ - reg_t qubits(1,qubit); -#pragma omp parallel for if(chunk_omp_parallel_ && num_groups_ > 1) - for(int_t ig=0;ig 1),0,num_groups_,apply_mcx); } else{ //exchange over chunks int_t iPair; @@ -1825,16 +1852,17 @@ void StateChunk::apply_chunk_x(const uint_t qubit) if(distributed_procs_ == 1 || (proc_bits >= 0 && qubit < (num_qubits_*qubit_scale() - proc_bits))){ //no data transfer between processes is needed nPair = num_local_chunks_ >> 1; 
-#pragma omp parallel for if(chunk_omp_parallel_) private(iPair,baseChunk,iChunk1,iChunk2) - for(iPair=0;iPair::expval_pauli(const int_t iChunk, const reg_t &qubits, z_mask >>= BaseState::chunk_bits_; x_max -= BaseState::chunk_bits_; - const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); - const uint_t mask_l = (1ull << x_max) - 1; - -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && on_same_process) private(i) reduction(+:expval) - for(i=0;i iChunk){ //on this process uint_t z_count,z_count_pair; z_count = AER::Utils::popcount(iChunk & z_mask); z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); if(iProc == BaseState::distributed_rank_){ //pair is on the same process - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[pair_chunk - BaseState::global_chunk_index_],z_count,z_count_pair,phase); + expval = BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[pair_chunk - BaseState::global_chunk_index_],z_count,z_count_pair,phase); } else{ BaseState::recv_chunk(iChunk-BaseState::global_chunk_index_,pair_chunk); //refer receive buffer to calculate expectation value - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[iChunk-BaseState::global_chunk_index_],z_count,z_count_pair,phase); + expval = BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[iChunk-BaseState::global_chunk_index_],z_count,z_count_pair,phase); } } else if(iProc == BaseState::distributed_rank_){ //pair is on this process BaseState::send_chunk(iChunk-BaseState::global_chunk_index_,pair_chunk); } - } + return expval; + }; + expval += BaseState::apply_omp_parallel_reduction((BaseState::chunk_omp_parallel_ && on_same_process),0,BaseState::num_global_chunks_/2,apply_expval_pauli_chunk); } else{ //no 
exchange between chunks z_mask >>= BaseState::chunk_bits_; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) - for(i=0;i::apply_save_density_matrix(const int_t iChunk, const Oper } else{ double sum = 0.0; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:sum) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].checkpoint(); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].revert(true); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) apply_diagonal_matrix(iChunk, sub_qubits, vmat); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati exp_im += exp_tmp.imag(); } else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:exp_re,exp_im) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].revert(false); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_density_matrix(const int_t iChunk, const Operat reduced_state[0] = BaseState::qregs_[iChunk].norm(); else{ double sum = 0.0; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:sum) - for(int_t i=0;i::measure_probs(const int_t iChunk, const reg_t &qubi BaseState::qubits_inout(qubits,qubits_in_chunk,qubits_out_chunk); + if(BaseState::chunk_omp_parallel_){ #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i,j,k) - for(i=0;i 0){ - auto chunkSum = 
BaseState::qregs_[i].probabilities(qubits_in_chunk); + for(i=0;i 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if(qubits_in_chunk.size() == qubits.size()){ - for(j=0;j> i_in) & 1) << k); - i_in++; - } - else{ - if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ - idx += 1ull << k; + else{ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ + idx += 1ull << k; + } } } - } #pragma omp atomic - sum[idx] += chunkSum[j]; + sum[idx] += chunkSum[j]; + } + } + } + else{ //there is no bit in chunk + auto nr = std::real(BaseState::qregs_[i].norm()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } } +#pragma omp atomic + sum[idx] += nr; } } - else{ //there is no bit in chunk - auto nr = std::real(BaseState::qregs_[i].norm()); - int idx = 0; - for(k=0;k> qubits_out_chunk[k]) & 1){ - idx += 1ull << k; + } + else{ + for(i=0;i 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ + idx += 1ull << k; + } + } + } + sum[idx] += chunkSum[j]; + } } } -#pragma omp atomic - sum[idx] += nr; + else{ //there is no bit in chunk + auto nr = std::real(BaseState::qregs_[i].norm()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } + sum[idx] += nr; + } } } @@ -1753,10 +1861,18 @@ void State::measure_reset_update(const int_t iChunk, const std::vect if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig::measure_reset_update(const int_t iChunk, const 
std::vect if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig::apply_initialize(const int_t iChunk, const reg_t &qubits BaseState::qubits_inout(qubits,qubits_in_chunk,qubits_out_chunk); if(qubits_out_chunk.size() == 0){ //no qubits outside of chunk -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::apply_initialize(const int_t iChunk, const reg_t &qubits perm[i] = 1.0; } -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i 0){ //then scatter outside chunk @@ -2009,9 +2144,14 @@ void State::apply_initialize(const int_t iChunk, const reg_t &qubits } //initialize by params -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::apply_kraus(const int_t iChunk, const reg_t &qubits, } else{ p = 0.0; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:p) - for(int_t i=0;i::apply_kraus(const int_t iChunk, const reg_t &qubits, if(!BaseState::multi_chunk_distribution_) apply_matrix(iChunk, qubits, vmat); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig::apply_kraus(const int_t iChunk, const reg_t &qubits, if(!BaseState::multi_chunk_distribution_) apply_matrix(iChunk, qubits, vmat); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig::initialize_qreg(uint_t num_qubits) } if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + iChunk) - (irow << 
((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if(irow == icol) - BaseState::qregs_[iChunk].initialize(); - else - BaseState::qregs_[iChunk].zero(); + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + if(irow == icol) + BaseState::qregs_[iChunk].initialize(); + else + BaseState::qregs_[iChunk].zero(); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + if(irow == icol) + BaseState::qregs_[iChunk].initialize(); + else + BaseState::qregs_[iChunk].zero(); + } } } else{ @@ -441,21 +454,40 @@ void State::initialize_qreg(uint_t num_qubits, auto input = unitary.copy_to_matrix(); uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - uint_t icol = i >> (BaseState::chunk_bits_); - uint_t irow = i & mask; - uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; - tmp[i] = input[idx]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); 
+ + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = input[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = input[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -489,21 +521,40 @@ void State::initialize_qreg(uint_t num_qubits, BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); } -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - uint_t icol = i >> (BaseState::chunk_bits_); - uint_t irow = i & mask; - uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << 
BaseState::chunk_bits_) + irow; - tmp[i] = unitary[idx]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = unitary[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = unitary[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -736,9 +787,14 @@ template void State::apply_global_phase() { if (BaseState::has_global_phase_) { -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i Date: Mon, 14 Feb 2022 17:58:32 +0900 Subject: [PATCH 17/17] Implemented optimized rotation gates --- src/simulators/statevector/chunk/chunk.hpp | 6 ++ .../statevector/chunk/chunk_container.hpp | 35 +++++++ .../chunk/cuStateVec_chunk_container.hpp | 99 
+++++++++++++++++++ src/simulators/statevector/qubitvector.hpp | 40 ++++++++ .../statevector/qubitvector_thrust.hpp | 17 +++- .../statevector/statevector_state.hpp | 14 +-- 6 files changed, 199 insertions(+), 12 deletions(-) diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp index f2fc4f5d29..612ac23378 100644 --- a/src/simulators/statevector/chunk/chunk.hpp +++ b/src/simulators/statevector/chunk/chunk.hpp @@ -400,6 +400,12 @@ class Chunk chunk_container_.lock()->apply_permutation(chunk_pos_,qubits,pairs,count); } + //apply rotation around axis + void apply_rotation(const reg_t &qubits, const Rotation r, const double theta, const uint_t count) + { + chunk_container_.lock()->apply_rotation(chunk_pos_,qubits,r,theta,count); + } + //get probabilities of chunk void probabilities(std::vector& probs, const reg_t& qubits) const { diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index 59ca7f7f03..69604d6e55 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -331,6 +331,9 @@ class ChunkContainer : public std::enable_shared_from_this> &pairs, const uint_t count); + //apply rotation around axis + virtual void apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count); + //get probabilities of chunk virtual void probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const; @@ -923,6 +926,38 @@ void ChunkContainer::apply_permutation(const uint_t iChunk,const reg_t& Execute(f, iChunk, count); } +template +void ChunkContainer::apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count) +{ + int control_bits = qubits.size() - 1; + switch(r){ + case Rotation::x: + apply_matrix(iChunk, qubits, control_bits, Linalg::VMatrix::rx(theta), count); + break; + case 
Rotation::y: + apply_matrix(iChunk, qubits, control_bits, Linalg::VMatrix::ry(theta), count); + break; + case Rotation::z: + apply_diagonal_matrix(iChunk, qubits, control_bits, Linalg::VMatrix::rz_diag(theta), count); + break; + case Rotation::xx: + apply_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::rxx(theta), count); + break; + case Rotation::yy: + apply_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::ryy(theta), count); + break; + case Rotation::zz: + apply_diagonal_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::rzz_diag(theta), count); + break; + case Rotation::zx: + apply_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::rzx(theta), count); + break; + default: + throw std::invalid_argument( + "QubitVectorThrust::invalid rotation axis."); + } +} + template void ChunkContainer::probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const { diff --git a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp index 783a4492e4..248d3a50fc 100644 --- a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp +++ b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp @@ -85,6 +85,9 @@ class cuStateVecChunkContainer : public DeviceChunkContainer //apply permutation void apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count) override; + //apply rotation around axis + void apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count) override; + //get probabilities of chunk void probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const override; @@ -618,6 +621,102 @@ void cuStateVecChunkContainer::apply_permutation(const uint_t iChunk,con } } +template +void cuStateVecChunkContainer::apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count) +{ + 
custatevecPauli_t pauli[2]; + int nPauli = 1; + + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + int control_bits = qubits.size() - 1; + + switch(r){ + case Rotation::x: + pauli[0] = CUSTATEVEC_PAULI_X; + break; + case Rotation::y: + pauli[0] = CUSTATEVEC_PAULI_Y; + break; + case Rotation::z: + pauli[0] = CUSTATEVEC_PAULI_Z; + break; + case Rotation::xx: + pauli[0] = CUSTATEVEC_PAULI_X; + pauli[1] = CUSTATEVEC_PAULI_X; + nPauli = 2; + control_bits--; + break; + case Rotation::yy: + pauli[0] = CUSTATEVEC_PAULI_Y; + pauli[1] = CUSTATEVEC_PAULI_Y; + nPauli = 2; + control_bits--; + break; + case Rotation::zz: + pauli[0] = CUSTATEVEC_PAULI_Z; + pauli[1] = CUSTATEVEC_PAULI_Z; + nPauli = 2; + control_bits--; + break; + case Rotation::zx: + pauli[0] = CUSTATEVEC_PAULI_Z; + pauli[1] = CUSTATEVEC_PAULI_X; + nPauli = 2; + control_bits--; + break; + default: + throw std::invalid_argument( + "QubitVectorThrust::invalid rotation axis."); + } + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i double cuStateVecChunkContainer::norm(uint_t iChunk,uint_t count) const { diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp index 6e925ecde4..ee037cb5fb 100755 --- a/src/simulators/statevector/qubitvector.hpp +++ b/src/simulators/statevector/qubitvector.hpp @@ -41,6 +41,12 @@ namespace QV { template using cvector_t = std::vector>; template using cdict_t = std::map>; +enum class Rotation { + x, y, z, + xx, yy, zz, 
+ zx, +}; + //============================================================================ // QubitVector class //============================================================================ @@ -258,6 +264,9 @@ class QubitVector { // If N=3 this implements an optimized Fredkin gate void apply_mcswap(const reg_t &qubits); + //apply rotation around axis + void apply_rotation(const reg_t &qubits, const Rotation r, const double theta); + //swap between chunk void apply_chunk_swap(const reg_t &qubits, QubitVector &chunk, bool write_back = true); void apply_chunk_swap(const reg_t &qubits, uint_t remote_chunk_index); @@ -1583,6 +1592,37 @@ void QubitVector::apply_mcu(const reg_t &qubits, } // end switch } +template +void QubitVector::apply_rotation(const reg_t &qubits, const Rotation r, const double theta) +{ + switch(r){ + case Rotation::x: + apply_mcu(qubits, Linalg::VMatrix::rx(theta)); + break; + case Rotation::y: + apply_mcu(qubits, Linalg::VMatrix::ry(theta)); + break; + case Rotation::z: + apply_mcu(qubits, Linalg::VMatrix::rz(theta)); + break; + case Rotation::xx: + apply_matrix(qubits, Linalg::VMatrix::rxx(theta)); + break; + case Rotation::yy: + apply_matrix(qubits, Linalg::VMatrix::ryy(theta)); + break; + case Rotation::zz: + apply_diagonal_matrix(qubits, Linalg::VMatrix::rzz_diag(theta)); + break; + case Rotation::zx: + apply_matrix(qubits, Linalg::VMatrix::rzx(theta)); + break; + default: + throw std::invalid_argument( + "QubitVector::invalid rotation axis."); + } +} + template void QubitVector::apply_chunk_swap(const reg_t &qubits, QubitVector &src, bool write_back) { diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index 87bc26c4c4..3c4ca7c334 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -272,6 +272,9 @@ class QubitVectorThrust { // If N=3 this implements an optimized Fredkin gate void apply_mcswap(const reg_t 
&qubits); + //apply rotation around axis + void apply_rotation(const reg_t &qubits, const Rotation r, const double theta); + //swap between chunk void apply_chunk_swap(const reg_t &qubits, QubitVectorThrust &chunk, bool write_back = true); void apply_chunk_swap(const reg_t &qubits, uint_t remote_chunk_index); @@ -1405,11 +1408,7 @@ void QubitVectorThrust::apply_matrix(const reg_t &qubits, if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) return; //first chunk execute all in batch - const size_t N = qubits.size(); - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - - if(N == 1 && register_blocking_) + if(qubits.size() == 1 && register_blocking_) chunk_.queue_blocked_gate('u',qubits[0],0,&mat[0]); else chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); @@ -1756,6 +1755,14 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, } } +template +void QubitVectorThrust::apply_rotation(const reg_t &qubits, const Rotation r, const double theta) +{ + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch + + chunk_.apply_rotation(qubits,r,theta,chunk_.container()->num_chunks()); +} //------------------------------------------------------------------------------ // Single-qubit matrices diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp index b0ccfee054..542d839fc0 100755 --- a/src/simulators/statevector/statevector_state.hpp +++ b/src/simulators/statevector/statevector_state.hpp @@ -1551,25 +1551,25 @@ void State::apply_gate(const int_t iChunk, const Operations::Op &op) BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); break; case Gates::mcrx: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::rx(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, 
QV::Rotation::x, std::real(op.params[0])); break; case Gates::mcry: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::ry(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::y, std::real(op.params[0])); break; case Gates::mcrz: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::rz(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::z, std::real(op.params[0])); break; case Gates::rxx: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::rxx(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::xx, std::real(op.params[0])); break; case Gates::ryy: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::ryy(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::yy, std::real(op.params[0])); break; case Gates::rzz: - BaseState::qregs_[iChunk].apply_diagonal_matrix(op.qubits, Linalg::VMatrix::rzz_diag(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::zz, std::real(op.params[0])); break; case Gates::rzx: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::rzx(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::zx, std::real(op.params[0])); break; case Gates::id: break;