diff --git a/CMakeLists.txt b/CMakeLists.txt index 4eeb296f69..79ad5f80db 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -257,6 +257,15 @@ if(AER_THRUST_SUPPORTED) set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA) set(THRUST_DEPENDENT_LIBS "") + if(CUSTATEVEC_ROOT) + set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC) + set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -I${CUSTATEVEC_ROOT}/include") + if(CUSTATEVEC_STATIC) + set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec_static -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lcublas") + else() + set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec") + endif() + endif() elseif(AER_THRUST_BACKEND STREQUAL "TBB") message(STATUS "TBB Support found!") set(THRUST_DEPENDENT_LIBS AER_DEPENDENCY_PKG::tbb) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 976c93f7a0..8ae8bc9ac1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -643,6 +643,34 @@ Few notes on GPU builds: 3. We don't need NVIDIA® drivers for building, but we need them for running simulations 4. Only Linux platforms are supported +Qiskit Aer now supports cuQuantum optimized Quantum computing APIs from NVIDIA®. +cuStateVec APIs can be exploited to accelerate statevector, density_matrix and unitary methods. +Because cuQuantum is beta version currently, some of the operations are not accelerated by cuStateVec. + +To build Qiskit Aer with cuStateVec support, please set the path to cuQuantum root directory to CUSTATEVEC_ROOT as following. + +For example, + + qiskit-aer$ python ./setup.py bdist_wheel -- -DAER_THRUST_BACKEND=CUDA -DCUSTATEVEC_ROOT=path_to_cuQuantum + +if you want to link cuQuantum library statically, set `CUSTATEVEC_STATIC` to setup.py. +Otherwise you also have to set environmental variable LD_LIBRARY_PATH to indicate path to the cuQuantum libraries. + +To run with cuStateVec, set `device='GPU'` to AerSimulator option and set `cuStateVec_enable=True` to option in execute method. + +``` +sim = AerSimulator(method='statevector', device='GPU') +results = execute(circuit,sim,cuStateVec_enable=True).result() +``` + +Also you can accelrate density matrix and unitary matrix simulations as well. +``` +sim = AerSimulator(method='density_matrix', device='GPU') +results = execute(circuit,sim,cuStateVec_enable=True).result() +``` + + + ### Building with MPI support Qiskit Aer can parallelize its simulation on the cluster systems by using MPI. diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py index 4846a9e0f6..c9836b68b5 100644 --- a/qiskit/providers/aer/backends/aer_simulator.py +++ b/qiskit/providers/aer/backends/aer_simulator.py @@ -148,6 +148,10 @@ class AerSimulator(AerBackend): initialization or with :meth:`set_options`. The list of supported devices for the current system can be returned using :meth:`available_devices`. + If AerSimulator is built with cuStateVec support, cuStateVec APIs are enabled + by setting ``cuStateVec_enable=True``. This is experimental implementation + based on cuQuantum Beta 2. + **Additional Backend Options** The following simulator specific backend options are supported @@ -216,6 +220,11 @@ class AerSimulator(AerBackend): values (16 Bytes). If set to 0, the maximum will be automatically set to the system memory size (Default: 0). + * ``cuStateVec_enable`` (bool): This option enables accelerating by + cuStateVec library of cuQuantum from NVIDIA, that has highly optimized + kernels for GPUs (Default: False). This option will be ignored + if AerSimulator is not built with cuStateVec support. + * ``blocking_enable`` (bool): This option enables parallelization with multiple GPUs or multiple processes with MPI (CPU/GPU). This option is only available for ``"statevector"``, ``"density_matrix"`` and @@ -514,6 +523,8 @@ def _default_options(cls): memory=None, noise_model=None, seed_simulator=None, + # cuStateVec (cuQuantum) option + cuStateVec_enable=False, # cache blocking for multi-GPUs/MPI options blocking_qubits=None, blocking_enable=False, diff --git a/qiskit/providers/aer/backends/qasm_simulator.py b/qiskit/providers/aer/backends/qasm_simulator.py index 9abbce9056..23ad8a4927 100644 --- a/qiskit/providers/aer/backends/qasm_simulator.py +++ b/qiskit/providers/aer/backends/qasm_simulator.py @@ -339,9 +339,9 @@ class QasmSimulator(AerBackend): } _SIMULATION_METHODS = [ - 'automatic', 'statevector', 'statevector_gpu', + 'automatic', 'statevector', 'statevector_gpu', 'statevector_custatevec', 'statevector_thrust', 'density_matrix', - 'density_matrix_gpu', 'density_matrix_thrust', + 'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust', 'stabilizer', 'matrix_product_state', 'extended_stabilizer' ] @@ -595,7 +595,8 @@ def _basis_gates(self): def _method_basis_gates(self): """Return method basis gates and custom instructions""" method = self._options.get('method', None) - if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']: + if method in ['density_matrix', 'density_matrix_gpu', + 'density_matrix_custatevec', 'density_matrix_thrust']: return sorted([ 'u1', 'u2', 'u3', 'u', 'p', 'r', 'rx', 'ry', 'rz', 'id', 'x', 'y', 'z', 'h', 's', 'sdg', 'sx', 'sxdg', 't', 'tdg', 'swap', 'cx', @@ -628,7 +629,8 @@ def _custom_instructions(self): return self._options_configuration['custom_instructions'] method = self._options.get('method', None) - if method in ['statevector', 'statevector_gpu', 'statevector_thrust']: + if method in ['statevector', 'statevector_gpu', + 'statevector_custatevec', 'statevector_thrust']: return sorted([ 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'snapshot', 'save_expval', 'save_expval_var', 'save_probabilities', 'save_probabilities_dict', @@ -636,7 +638,8 @@ def _custom_instructions(self): 'save_density_matrix', 'save_statevector', 'save_statevector_dict', 'set_statevector' ]) - if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']: + if method in ['density_matrix', 'density_matrix_gpu', + 'density_matrix_custatevec', 'density_matrix_thrust']: return sorted([ 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'superop', 'snapshot', 'save_expval', 'save_expval_var', 'save_probabilities', 'save_probabilities_dict', @@ -666,10 +669,12 @@ def _custom_instructions(self): def _set_method_config(self, method=None): """Set non-basis gate options when setting method""" # Update configuration description and number of qubits - if method in ['statevector', 'statevector_gpu', 'statevector_thrust']: + if method in ['statevector', 'statevector_gpu', + 'statevector_custatevec', 'statevector_thrust']: description = 'A C++ statevector simulator with noise' n_qubits = MAX_QUBITS_STATEVECTOR - elif method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']: + elif method in ['density_matrix', 'density_matrix_gpu', + 'density_matrix_custatevec', 'density_matrix_thrust']: description = 'A C++ density matrix simulator with noise' n_qubits = MAX_QUBITS_STATEVECTOR // 2 elif method == 'matrix_product_state': diff --git a/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml b/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml new file mode 100644 index 0000000000..a302cda5fb --- /dev/null +++ b/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml @@ -0,0 +1,13 @@ +--- +features: + - | + Added support for cuQuantum, NVIDIA's APIs for quantum computing, + to accelerate statevector, density matrix and unitary simulators + by using GPUs. + This is experiemental implementation for cuQuantum Beta 2. (0.1.0) + cuStateVec APIs are enabled to accelerate instead of Aer's implementations + by building Aer by setting path of cuQuantum to ``CUSTATEVEC_ROOT``. + (binary distribution is not available currently.) + cuStateVector is enabled by setting ``device='GPU'`` and + ``cuStateVec_threshold`` options. cuStateVec is enabled when number of + qubits of input circuit is equal or greater than ``cuStateVec_threshold``. diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 2f1f35639f..c3f4f9aac9 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -377,6 +377,8 @@ class Controller { int_t batched_shots_gpu_max_qubits_ = 16; //multi-shot parallelization is applied if qubits is less than max qubits bool enable_batch_multi_shots_ = false; //multi-shot parallelization can be applied + //settings for cuStateVec + bool cuStateVec_enable_ = false; }; //========================================================================= @@ -466,6 +468,12 @@ void Controller::set_config(const json_t &config) { JSON::get_value(batched_shots_gpu_max_qubits_, "batched_shots_gpu_max_qubits", config); } + //cuStateVec configs + cuStateVec_enable_ = false; + if(JSON::check_key("cuStateVec_enable", config)) { + JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config); + } + // Override automatic simulation method with a fixed method std::string method; if (JSON::get_value(method, "method", config)) { @@ -489,6 +497,9 @@ void Controller::set_config(const json_t &config) { } } + if(method_ == Method::density_matrix || method_ == Method::unitary) + batched_shots_gpu_max_qubits_ /= 2; + // Override automatic simulation method with a fixed method if (JSON::get_value(sim_device_name_, "device", config)) { if (sim_device_name_ == "CPU") { @@ -502,18 +513,37 @@ void Controller::set_config(const json_t &config) { #endif } else if (sim_device_name_ == "GPU") { #ifndef AER_THRUST_CUDA - throw std::runtime_error( - "Simulation device \"GPU\" is not supported on this system"); + throw std::runtime_error( + "Simulation device \"GPU\" is not supported on this system"); #else - int nDev; - if (cudaGetDeviceCount(&nDev) != cudaSuccess) { - cudaGetLastError(); - throw std::runtime_error("No CUDA device available!"); - } - sim_device_ = Device::GPU; +#ifndef AER_CUSTATEVEC + if(cuStateVec_enable_){ + //Aer is not built for cuStateVec + throw std::runtime_error( + "Simulation device \"GPU\" does not supported cuStateVec on this system"); + } #endif + int nDev; + if (cudaGetDeviceCount(&nDev) != cudaSuccess) { + cudaGetLastError(); + throw std::runtime_error("No CUDA device available!"); + } + sim_device_ = Device::GPU; + +#ifdef AER_CUSTATEVEC + if(cuStateVec_enable_){ + //initialize custatevevtor handle once before actual calculation (takes long time at first call) + custatevecStatus_t err; + custatevecHandle_t stHandle; + err = custatevecCreate(&stHandle); + if(err == CUSTATEVEC_STATUS_SUCCESS){ + custatevecDestroy(stHandle); + } } +#endif +#endif + } else { throw std::runtime_error(std::string("Invalid simulation device (\"") + sim_device_name_ + std::string("\").")); @@ -636,9 +666,16 @@ void Controller::set_parallelization_circuit(const Circuit &circ, const Method method) { enable_batch_multi_shots_ = false; - if(batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && max_batched_states_ >= num_gpus_ && - batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ - enable_batch_multi_shots_ = true; + if(batched_shots_gpu_ && sim_device_ == Device::GPU && + circ.shots > 1 && max_batched_states_ >= num_gpus_ && + batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ + enable_batch_multi_shots_ = true; + } + + if(sim_device_ == Device::GPU && cuStateVec_enable_){ + enable_batch_multi_shots_ = false; //cuStateVec does not support batch execution of multi-shots + parallel_shots_ = 1; //cuStateVec is currently not thread safe + return; } if(explicit_parallelization_) @@ -785,6 +822,7 @@ size_t Controller::get_gpu_memory_mb() { } num_gpus_ = nDev; #endif + #ifdef AER_MPI // get minimum memory size per process uint64_t locMem, minMem; @@ -866,7 +904,6 @@ Result Controller::execute(const inputdata_t &input_qobj) { auto time_taken = std::chrono::duration(myclock_t::now() - timer_start).count(); result.metadata.add(time_taken, "time_taken"); - return result; } catch (std::exception &e) { // qobj was invalid, return valid output containing error message @@ -959,7 +996,7 @@ Result Controller::execute(std::vector &circuits, const int NUM_RESULTS = result.results.size(); //following looks very similar but we have to separate them to avoid omp nested loops that causes performance degradation //(DO NOT use if statement in #pragma omp) - if (parallel_experiments_ == 1) { + if (parallel_experiments_ == 1 || sim_device_ == Device::ThrustCPU) { for (int j = 0; j < NUM_RESULTS; ++j) { set_parallelization_circuit(circuits[j], noise_model, methods[j]); run_circuit(circuits[j], noise_model,methods[j], @@ -1439,7 +1476,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ, // Check if measure sampler and optimization are valid if (can_sample) { // Implement measure sampler - if (parallel_shots_ <= 1) { + if (parallel_shots_ <= 1 || sim_device_ == Device::GPU || sim_device_ == Device::ThrustCPU) { state.set_max_matrix_qubits(max_bits); RngEngine rng; rng.set_seed(circ.seed); @@ -1460,7 +1497,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ, shot_state.set_parallelization(parallel_state_update_); shot_state.set_global_phase(circ.global_phase_angle); - state.set_max_matrix_qubits(max_bits); + shot_state.set_max_matrix_qubits(max_bits); RngEngine rng; rng.set_seed(circ.seed + i); @@ -1736,7 +1773,12 @@ void Controller::measure_sampler( shots_or_index = shots; else shots_or_index = shot_index; + + auto timer_start = myclock_t::now(); auto all_samples = state.sample_measure(meas_qubits, shots_or_index, rng); + auto time_taken = + std::chrono::duration(myclock_t::now() - timer_start).count(); + result.metadata.add(time_taken, "sample_measure_time"); // Make qubit map of position in vector of measured qubits std::unordered_map qubit_map; diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp index 5804321769..dcce3e8e09 100644 --- a/src/simulators/density_matrix/densitymatrix_state.hpp +++ b/src/simulators/density_matrix/densitymatrix_state.hpp @@ -443,20 +443,38 @@ void State::initialize_qreg(uint_t num_qubits, if(BaseState::multi_chunk_distribution_){ auto input = state.copy_to_matrix(); -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); - for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ - uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); - uint_t irow = i >> (BaseState::chunk_bits_); - tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -485,20 +503,38 @@ void State::initialize_qreg(uint_t num_qubits, } if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); - for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ - uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); - uint_t irow = i >> (BaseState::chunk_bits_); - tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -526,20 +562,38 @@ void State::initialize_qreg(uint_t num_qubits, } if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); - for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ - uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); - uint_t irow = i >> (BaseState::chunk_bits_); - tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -569,21 +623,40 @@ void State::initialize_from_vector(const int_t iChunkIn, const list_t else if((1ull << (BaseState::num_qubits_*2)) == vec.size() * vec.size()) { int_t iChunk; if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - list_t vec1(1ull << BaseState::chunk_bits_); - list_t vec2(1ull << BaseState::chunk_bits_); - - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; - vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + list_t vec1(1ull << BaseState::chunk_bits_); + list_t vec2(1ull << BaseState::chunk_bits_); + + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + } + BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + list_t vec1(1ull << BaseState::chunk_bits_); + list_t vec2(1ull << BaseState::chunk_bits_); + + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + } + BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); } - BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); } } else{ @@ -876,38 +949,76 @@ double State::expval_pauli(const int_t iChunk, const reg_t &qubits, const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); const uint_t mask_l = (1ull << x_max) - 1; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) - for(i=0;i iChunk){ //on this process - double sign = 2.0; - if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) - sign = -2.0; - expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli_non_diagonal_chunk(qubits_in_chunk, pauli_in_chunk,phase); + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(i) reduction(+:expval) + for(i=0;i iChunk){ //on this process + double sign = 2.0; + if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) + sign = -2.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli_non_diagonal_chunk(qubits_in_chunk, pauli_in_chunk,phase); + } + } + } + else{ + for(i=0;i iChunk){ //on this process + double sign = 2.0; + if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) + sign = -2.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli_non_diagonal_chunk(qubits_in_chunk, pauli_in_chunk,phase); + } } } } else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(i) reduction(+:expval) + for(i=0;i iChunk){ //on this process + double sign = 1.0; + if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) + sign = -1.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,1.0); + } + } + } + else{ + for(i=0;i iChunk){ //on this process + double sign = 1.0; + if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) + sign = -1.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,1.0); + } + } + } + } + } + else{ //all bits are inside chunk + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(i) reduction(+:expval) for(i=0;i iChunk){ //on this process - double sign = 1.0; - if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) - sign = -1.0; - expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,1.0); + expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, pauli,1.0); } } } - } - else{ //all bits are inside chunk -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) - for(i=0;i iChunk){ //on this process - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, pauli,1.0); + else{ + for(i=0;i iChunk){ //on this process + expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, pauli,1.0); + } } } } @@ -1344,7 +1455,7 @@ void State::apply_gate_u3(const int_t iChunk, uint_t qubit, double th template void State::apply_diagonal_unitary_matrix(const int_t iChunk, const reg_t &qubits, const cvector_t & diag) { - if(BaseState::thrust_optimization_){ + if(BaseState::thrust_optimization_ || !BaseState::multi_chunk_distribution_){ //GPU computes all chunks in one kernel, so pass qubits and diagonal matrix as is BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(qubits,diag); } @@ -1441,51 +1552,99 @@ rvector_t State::measure_probs(const int_t iChunk, const reg_t &qubit } } -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i,j,k) - for(i=0;i> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - - if(irow == icol){ //diagonal chunk - if(qubits_in_chunk.size() > 0){ - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if(qubits_in_chunk.size() == qubits.size()){ - for(j=0;j> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + + if(irow == icol){ //diagonal chunk + if(qubits_in_chunk.size() > 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); - i_in++; - } - else{ - if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ - idx += 1ull << k; + else{ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ + idx += 1ull << k; + } } } - } #pragma omp atomic - sum[idx] += chunkSum[j]; + sum[idx] += chunkSum[j]; + } } } + else{ //there is no bit in chunk + auto tr = std::real(BaseState::qregs_[i].trace()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += tr; + } } - else{ //there is no bit in chunk - auto tr = std::real(BaseState::qregs_[i].trace()); - int idx = 0; - for(k=0;k> qubits_out_chunk[k]) & 1){ - idx += 1ull << k; + } + } + else{ + for(i=0;i> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + + if(irow == icol){ //diagonal chunk + if(qubits_in_chunk.size() > 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ + idx += 1ull << k; + } + } + } + sum[idx] += chunkSum[j]; + } } } -#pragma omp atomic - sum[idx] += tr; + else{ //there is no bit in chunk + auto tr = std::real(BaseState::qregs_[i].trace()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } + sum[idx] += tr; + } } } } @@ -1531,9 +1690,14 @@ void State::measure_reset_update(const int_t iChunk, const reg_t &qub if(!BaseState::multi_chunk_distribution_) apply_diagonal_unitary_matrix(iChunk, qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::measure_reset_update(const int_t iChunk, const reg_t &qub BaseState::qregs_[iChunk].apply_x(qubits[0]); else{ if(qubits[0] < BaseState::chunk_bits_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::measure_reset_update(const int_t iChunk, const reg_t &qub if(!BaseState::multi_chunk_distribution_) apply_diagonal_unitary_matrix(iChunk, qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::measure_reset_update(const int_t iChunk, const reg_t &qub } } if(qubits_in_chunk.size() > 0){ //in chunk exchange -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i 0){ //out of chunk exchange diff --git a/src/simulators/density_matrix/densitymatrix_thrust.hpp b/src/simulators/density_matrix/densitymatrix_thrust.hpp index 810ef2056b..7eaddb75c1 100755 --- a/src/simulators/density_matrix/densitymatrix_thrust.hpp +++ b/src/simulators/density_matrix/densitymatrix_thrust.hpp @@ -262,7 +262,7 @@ void DensityMatrixThrust::apply_diagonal_superop_matrix(const reg_t &qub template -class DensityMatrixUnitary2x2 : public GateFuncBase +class DensityMatrixUnitary2x2 : public Chunk::GateFuncBase { protected: thrust::complex m0,m1,m2,m3; @@ -364,7 +364,7 @@ void DensityMatrixThrust::apply_unitary_matrix(const reg_t &qubits, } template -class DensityDiagMatMult2x2 : public GateFuncBase +class DensityDiagMatMult2x2 : public Chunk::GateFuncBase { protected: uint_t offset; @@ -429,7 +429,7 @@ class DensityDiagMatMult2x2 : public GateFuncBase }; template -class DensityDiagMatMultNxN : public GateFuncBase +class DensityDiagMatMultNxN : public Chunk::GateFuncBase { protected: int nqubits_; @@ -512,7 +512,7 @@ void DensityMatrixThrust::apply_diagonal_unitary_matrix(const reg_t &qub // Apply Specialized Gates //----------------------------------------------------------------------- template -class DensityCX : public GateFuncBase +class DensityCX : public Chunk::GateFuncBase { protected: uint_t offset; @@ -599,7 +599,7 @@ void DensityMatrixThrust::apply_cnot(const uint_t qctrl, const uint_t qt } template -class DensityPhase : public GateFuncBase +class DensityPhase : public Chunk::GateFuncBase { protected: thrust::complex phase_; @@ -665,7 +665,7 @@ void DensityMatrixThrust::apply_phase(const uint_t q,const complex_t &ph } template -class DensityCPhase : public GateFuncBase +class DensityCPhase : public Chunk::GateFuncBase { protected: uint_t offset; @@ -753,7 +753,7 @@ void DensityMatrixThrust::apply_swap(const uint_t q0, const uint_t q1) { } template -class DensityX : public GateFuncBase +class DensityX : public Chunk::GateFuncBase { protected: uint_t mask0; @@ -829,7 +829,7 @@ void DensityMatrixThrust::apply_x(const uint_t qubit) } template -class DensityY : public GateFuncBase +class DensityY : public Chunk::GateFuncBase { protected: uint_t mask0; @@ -929,7 +929,7 @@ void DensityMatrixThrust::apply_toffoli(const uint_t qctrl0, //special case Z only template -class expval_pauli_Z_func_dm : public GateFuncBase +class expval_pauli_Z_func_dm : public Chunk::GateFuncBase { protected: uint_t z_mask_; @@ -966,7 +966,7 @@ class expval_pauli_Z_func_dm : public GateFuncBase ret = q0.real(); if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) + if(Chunk::pop_count_kernel(i & z_mask_) & 1) ret = -ret; } @@ -979,7 +979,7 @@ class expval_pauli_Z_func_dm : public GateFuncBase }; template -class expval_pauli_XYZ_func_dm : public GateFuncBase +class expval_pauli_XYZ_func_dm : public Chunk::GateFuncBase { protected: uint_t x_mask_; @@ -1026,7 +1026,7 @@ class expval_pauli_XYZ_func_dm : public GateFuncBase q0 = 2 * phase_ * q0; ret = q0.real(); if(z_mask_ != 0){ - if(pop_count_kernel(idx_vec & z_mask_) & 1) + if(Chunk::pop_count_kernel(idx_vec & z_mask_) & 1) ret = -ret; } return ret; @@ -1067,7 +1067,7 @@ double DensityMatrixThrust::expval_pauli(const reg_t &qubits, } template -class expval_pauli_XYZ_func_dm_non_diagonal : public GateFuncBase +class expval_pauli_XYZ_func_dm_non_diagonal : public Chunk::GateFuncBase { protected: uint_t x_mask_; @@ -1108,7 +1108,7 @@ class expval_pauli_XYZ_func_dm_non_diagonal : public GateFuncBase q0 = phase_ * q0; ret = q0.real(); if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) + if(Chunk::pop_count_kernel(i & z_mask_) & 1) ret = -ret; } return ret; @@ -1151,7 +1151,7 @@ double DensityMatrixThrust::probability(const uint_t outcome) const template -class density_probability_func : public GateFuncBase +class density_probability_func : public Chunk::GateFuncBase { protected: uint_t qubit_sp_; @@ -1257,7 +1257,7 @@ reg_t DensityMatrixThrust::sample_measure(const std::vector &rnd } template -class density_reset_after_measure_func : public GateFuncBase +class density_reset_after_measure_func : public Chunk::GateFuncBase { protected: uint_t num_qubits_; @@ -1325,7 +1325,7 @@ void DensityMatrixThrust::apply_batched_measure(const reg_t& qubits,std: count = BaseVector::chunk_.container()->num_chunks(); //total probability - BaseVector::apply_function_sum(nullptr,trace_func(BaseMatrix::rows_),true); + BaseVector::apply_function_sum(nullptr,Chunk::trace_func(BaseMatrix::rows_),true); BaseVector::apply_function(set_probability_buffer_for_reset_func(BaseVector::chunk_.probability_buffer(),BaseVector::chunk_.container()->num_chunks(), BaseVector::chunk_.reduce_buffer(),BaseVector::chunk_.reduce_buffer_size()) ); @@ -1374,7 +1374,7 @@ void DensityMatrixThrust::apply_batched_measure(const reg_t& qubits,std: } template -class density_reset_func : public GateFuncBase +class density_reset_func : public Chunk::GateFuncBase { protected: uint_t num_qubits_; diff --git a/src/simulators/state.hpp b/src/simulators/state.hpp index 4547ec02fa..c07b5e99df 100644 --- a/src/simulators/state.hpp +++ b/src/simulators/state.hpp @@ -355,8 +355,9 @@ State::~State(void) } template -void State::set_config(const json_t &config) { - (ignore_argument)config; +void State::set_config(const json_t &config) +{ + } template diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp index 20ac3b8657..85571f98e0 100644 --- a/src/simulators/state_chunk.hpp +++ b/src/simulators/state_chunk.hpp @@ -391,6 +391,9 @@ class StateChunk : public State { reg_t top_chunk_of_group_; reg_t num_chunks_in_group_; + //cuStateVec settings + bool cuStateVec_enable_ = false; + //----------------------------------------------------------------------- // Apply circuits and ops //----------------------------------------------------------------------- @@ -508,6 +511,12 @@ class StateChunk : public State { uint_t mapped_index(const uint_t idx); + //apply OpenMP parallelization if enabled + template + void apply_omp_parallel(bool enabled, int_t i_begin, int_t i_end, Lambda& func); + + template + double apply_omp_parallel_reduction(bool enabled, int_t i_begin, int_t i_end, Lambda& func); }; @@ -526,8 +535,16 @@ StateChunk::~StateChunk(void) } template -void StateChunk::set_config(const json_t &config) { - (ignore_argument)config; +void StateChunk::set_config(const json_t &config) +{ + BaseState::set_config(config); + +#ifdef AER_CUSTATEVEC + //cuStateVec configs + if(JSON::check_key("cuStateVec_enable", config)) { + JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config); + } +#endif } template @@ -557,6 +574,38 @@ void StateChunk::set_distribution(uint_t nprocs) #endif } +template +template +void StateChunk::apply_omp_parallel(bool enabled, int_t i_begin, int_t i_end, Lambda& func) +{ + if(enabled){ +#pragma omp parallel for + for(int_t i=i_begin;i +template +double StateChunk::apply_omp_parallel_reduction(bool enabled, int_t i_begin, int_t i_end, Lambda& func) +{ + double val = 0.0; + if(enabled){ +#pragma omp parallel for reduction(+:val) + for(int_t i=i_begin;i bool StateChunk::allocate(uint_t num_qubits,uint_t block_bits,uint_t num_parallel_shots) { @@ -617,15 +666,26 @@ bool StateChunk::allocate(uint_t num_qubits,uint_t block_bits,uint_t nu chunk_omp_parallel_ = false; if(qregs_[0].name().find("gpu") != std::string::npos){ #ifdef _OPENMP - if(multi_chunk_distribution_){ - if(omp_get_num_threads() == 1) - chunk_omp_parallel_ = true; + if(omp_get_num_threads() == 1) + chunk_omp_parallel_ = true; +#endif + +#ifdef AER_CUSTATEVEC + //set cuStateVec_enable_ + if(cuStateVec_enable_){ + if(multi_shots_parallelization_) + cuStateVec_enable_ = false; //multi-shots parallelization is not supported for cuStateVec } + + if(cuStateVec_enable_) + chunk_omp_parallel_ = false; //because cuStateVec is not thread safe + else + thrust_optimization_ = true; //cuStateVec does not handle global chunk index for diagonal matrix #endif - thrust_optimization_ = true; } else if(qregs_[0].name().find("thrust") != std::string::npos){ thrust_optimization_ = true; + chunk_omp_parallel_ = false; } @@ -656,7 +716,8 @@ bool StateChunk::allocate_qregs(uint_t num_chunks) uint_t chunk_id = multi_chunk_distribution_ ? global_chunk_index_ : 0; bool ret = true; qregs_[0].set_max_matrix_bits(BaseState::max_matrix_qubits_); - ret &= qregs_[0].chunk_setup(chunk_bits_*qubit_scale(),num_qubits_*qubit_scale(),chunk_id,num_chunks); + qregs_[0].cuStateVec_enable(cuStateVec_enable_); + ret &= qregs_[0].chunk_setup(chunk_bits_*qubit_scale(), num_qubits_*qubit_scale(), chunk_id, num_chunks); for(i=1;i::apply_ops(InputIterator first, InputIterator last, } } } + + qregs_[0].synchronize(); + +#ifdef AER_CUSTATEVEC + result.metadata.add(cuStateVec_enable_, "cuStateVec_enable"); +#endif } template @@ -801,6 +868,11 @@ void StateChunk::apply_ops_chunks(InputIterator first, InputIterator la } iOp++; } + + qregs_[0].synchronize(); +#ifdef AER_CUSTATEVEC + result.metadata.add(cuStateVec_enable_, "cuStateVec_enable"); +#endif } template @@ -868,42 +940,23 @@ void StateChunk::apply_ops_multi_shots(InputIterator first, InputIterat //resize qregs allocate_qregs(n_shots); } - //initialization (equivalent to initialize_qreg + initialize_creg) - if(num_groups_ > 1 && chunk_omp_parallel_){ -#pragma omp parallel for - for(i=0;i 1 && chunk_omp_parallel_),0,num_groups_,init_group); - for(uint_t j=top_chunk_of_group_[i];j::initialize_from_vector(const int_t iChunkIn, const lis int_t iChunk; if(multi_chunk_distribution_){ -#pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk::initialize_from_matrix(const int_t iChunkIn, const lis { int_t iChunk; if(multi_chunk_distribution_){ -#pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); - uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ - uint_t icol = i & ((1ull << chunk_bits_)-1); - uint_t irow = i >> chunk_bits_; - tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + if(chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); + uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ + uint_t icol = i & ((1ull << chunk_bits_)-1); + uint_t irow = i >> chunk_bits_; + tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + } + qregs_[iChunk].initialize_from_matrix(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); + uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ + uint_t icol = i & ((1ull << chunk_bits_)-1); + uint_t irow = i >> chunk_bits_; + tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + } + qregs_[iChunk].initialize_from_matrix(tmp); } - qregs_[iChunk].initialize_from_matrix(tmp); } } else{ @@ -1611,49 +1693,26 @@ void StateChunk::apply_chunk_swap(const reg_t &qubits) nPair = num_local_chunks_ >> 2; } - if(chunk_omp_parallel_){ -#pragma omp parallel for private(iPair,baseChunk,iChunk1,iChunk2) - for(iPair=0;iPair::apply_chunk_x(const uint_t qubit) if(qubit < chunk_bits_*qubit_scale()){ - reg_t qubits(1,qubit); -#pragma omp parallel for if(chunk_omp_parallel_ && num_groups_ > 1) - for(int_t ig=0;ig 1),0,num_groups_,apply_mcx); } else{ //exchange over chunks int_t iPair; @@ -1792,16 +1852,17 @@ void StateChunk::apply_chunk_x(const uint_t qubit) if(distributed_procs_ == 1 || (proc_bits >= 0 && qubit < (num_qubits_*qubit_scale() - proc_bits))){ //no data transfer between processes is needed nPair = num_local_chunks_ >> 1; -#pragma omp parallel for if(chunk_omp_parallel_) private(iPair,baseChunk,iChunk1,iChunk2) - for(iPair=0;iPair> cc,uint_t pos) @@ -54,6 +61,7 @@ class Chunk num_qubits_ = 0; chunk_index_ = 0; mapped_ = false; + cache_ = nullptr; } Chunk(Chunk& chunk) //map chunk from exisiting chunk (used fo cache chunk) { @@ -63,9 +71,12 @@ class Chunk num_qubits_ = chunk.num_qubits_; chunk_index_ = chunk.chunk_index_; mapped_ = true; + cache_ = nullptr; } ~Chunk() { + if(cache_) + cache_.reset(); } void set_device(void) const @@ -256,9 +267,13 @@ class Chunk return chunk_container_.lock()->sample_measure(chunk_pos_,rnds,stride,dot,count); } - thrust::complex norm(uint_t count=1,uint_t stride = 1,bool dot = true) const + double norm(uint_t count) const { - return chunk_container_.lock()->norm(chunk_pos_,count,stride,dot); + return chunk_container_.lock()->norm(chunk_pos_,count); + } + double trace(uint_t row, uint_t count) const + { + return chunk_container_.lock()->trace(chunk_pos_,row,count); } #ifdef AER_THRUST_CUDA @@ -349,10 +364,64 @@ class Chunk chunk_container_.lock()->keep_conditional(keep); } + //apply matrix + void apply_matrix(const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) + { + chunk_container_.lock()->apply_matrix(chunk_pos_,qubits,control_bits,mat,count); + } + //apply diagonal matrix + void apply_diagonal_matrix(const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) + { + chunk_container_.lock()->apply_diagonal_matrix(chunk_pos_,qubits,control_bits,diag,count); + } + //apply (controlled) X + void apply_X(const reg_t& qubits,const uint_t count) + { + chunk_container_.lock()->apply_X(chunk_pos_,qubits,count); + } + //apply (controlled) Y + void apply_Y(const reg_t& qubits,const uint_t count) + { + chunk_container_.lock()->apply_Y(chunk_pos_,qubits,count); + } + //apply (controlled) phase + void apply_phase(const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) + { + chunk_container_.lock()->apply_phase(chunk_pos_,qubits,control_bits,phase,count); + } + //apply (controlled) swap gate + void apply_swap(const reg_t& qubits,const int_t control_bits,const uint_t count) + { + chunk_container_.lock()->apply_swap(chunk_pos_,qubits,control_bits,count); + } + //apply permutation + void apply_permutation(const reg_t& qubits,const std::vector> &pairs, const uint_t count) + { + chunk_container_.lock()->apply_permutation(chunk_pos_,qubits,pairs,count); + } + + //apply rotation around axis + void apply_rotation(const reg_t &qubits, const Rotation r, const double theta, const uint_t count) + { + chunk_container_.lock()->apply_rotation(chunk_pos_,qubits,r,theta,count); + } + + //get probabilities of chunk + void probabilities(std::vector& probs, const reg_t& qubits) const + { + chunk_container_.lock()->probabilities(probs, chunk_pos_,qubits); + } + //Pauli expectation values + double expval_pauli(const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const + { + return chunk_container_.lock()->expval_pauli(chunk_pos_,qubits,pauli,initial_phase); + } + }; //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index 5fd68798e4..69604d6e55 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -63,8 +63,11 @@ DISABLE_WARNING_POP #include "simulators/statevector/chunk/cuda_kernels.hpp" #endif +#include "simulators/statevector/chunk/thrust_kernels.hpp" + namespace AER { namespace QV { +namespace Chunk { template class Chunk; template class DeviceChunkContainer; @@ -77,391 +80,6 @@ struct BlockedGateParams unsigned char qubit_; }; -//======================================== -// base class of gate functions -//======================================== -template -class GateFuncBase -{ -protected: - thrust::complex* data_; //pointer to state vector buffer - thrust::complex* matrix_; //storage for matrix on device - uint_t* params_; //storage for additional parameters on device - uint_t base_index_; //start index of state vector - uint_t chunk_bits_; - uint_t* cregs_; - uint_t num_creg_bits_; - int_t conditional_bit_; -#ifndef AER_THRUST_CUDA - uint_t index_offset_; -#endif -public: - GateFuncBase() - { - data_ = NULL; - base_index_ = 0; - cregs_ = NULL; - num_creg_bits_ = 0; - conditional_bit_ = -1; -#ifndef AER_THRUST_CUDA - index_offset_ = 0; -#endif - } - virtual void set_data(thrust::complex* p) - { - data_ = p; - } - void set_matrix(thrust::complex* mat) - { - matrix_ = mat; - } - void set_params(uint_t* p) - { - params_ = p; - } - void set_chunk_bits(uint_t bits) - { - chunk_bits_ = bits; - } - - void set_base_index(uint_t i) - { - base_index_ = i; - } - void set_cregs_(uint_t* cbits,uint_t nreg) - { - cregs_ = cbits; - num_creg_bits_ = nreg; - } - void set_conditional(int_t bit) - { - conditional_bit_ = bit; - } - -#ifndef AER_THRUST_CUDA - void set_index_offset(uint_t i) - { - index_offset_ = i; - } -#endif - - __host__ __device__ thrust::complex* data(void) - { - return data_; - } - - virtual bool is_diagonal(void) - { - return false; - } - virtual int qubits_count(void) - { - return 1; - } - virtual int num_control_bits(void) - { - return 0; - } - virtual int control_mask(void) - { - return 1; - } - virtual bool use_cache(void) - { - return false; - } - virtual bool batch_enable(void) - { - return true; - } - - virtual const char* name(void) - { - return "base function"; - } - virtual uint_t size(int num_qubits) - { - if(is_diagonal()){ - chunk_bits_ = num_qubits; - return (1ull << num_qubits); - } - else{ - chunk_bits_ = num_qubits - (qubits_count() - num_control_bits()); - return (1ull << (num_qubits - (qubits_count() - num_control_bits()))); - } - } - - virtual __host__ __device__ uint_t thread_to_index(uint_t _tid) const - { - return _tid; - } - virtual __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - //implemente this in the kernel class - } - virtual __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - //implemente this in the kernel class - return 0.0; - } - - virtual __host__ __device__ bool check_conditional(uint_t i) const - { - if(conditional_bit_ < 0) - return true; - - uint_t iChunk = i >> chunk_bits_; - uint_t n64,i64,ibit; - n64 = (num_creg_bits_ + 63) >> 6; - i64 = conditional_bit_ >> 6; - ibit = conditional_bit_ & 63; - return (((cregs_[iChunk*n64 + i64] >> ibit) & 1) != 0); - } -}; - -//======================================== - // gate functions with cache -//======================================== -template -class GateFuncWithCache : public GateFuncBase -{ -protected: - int nqubits_; -public: - GateFuncWithCache(uint_t nq) - { - nqubits_ = nq; - } - - bool use_cache(void) - { - return true; - } - - __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const - { - uint_t idx,ii,t,j; - uint_t* qubits; - uint_t* qubits_sorted; - - qubits_sorted = this->params_; - qubits = qubits_sorted + nqubits_; - - idx = 0; - ii = _tid >> nqubits_; - for(j=0;j> j) & 1) != 0){ - idx += (1ull << qubits[j]); - } - } - idx += ii; - return idx; - } - - __host__ __device__ void sync_threads() const - { -#ifdef CUDA_ARCH - __syncthreads(); -#endif - } - - __host__ __device__ void operator()(const uint_t &i) const - { - if(!this->check_conditional(i)) - return; - - thrust::complex cache[1024]; - uint_t j,idx; - uint_t matSize = 1ull << nqubits_; - - //load data to cache - for(j=0;jdata_[idx]; - } - - //execute using cache - for(j=0;jrun_with_cache(j,idx,cache); - } - } - - virtual int qubits_count(void) - { - return nqubits_; - } -}; - -template -class GateFuncSumWithCache : public GateFuncBase -{ -protected: - int nqubits_; -public: - GateFuncSumWithCache(uint_t nq) - { - nqubits_ = nq; - } - - bool use_cache(void) - { - return true; - } - - - __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const - { - uint_t idx,ii,t,j; - uint_t* qubits; - uint_t* qubits_sorted; - - qubits_sorted = this->params_; - qubits = qubits_sorted + nqubits_; - - idx = 0; - ii = _tid >> nqubits_; - for(j=0;j> j) & 1) != 0){ - idx += (1ull << qubits[j]); - } - } - idx += ii; - return idx; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - if(!this->check_conditional(i)) - return 0.0; - - thrust::complex cache[1024]; - uint_t j,idx; - uint_t matSize = 1ull << nqubits_; - double sum = 0.0; - - //load data to cache - for(j=0;jdata_[idx]; - } - - //execute using cache - for(j=0;jrun_with_cache_sum(j,idx,cache); - } - return sum; - } - - virtual int qubits_count(void) - { - return nqubits_; - } - -}; - -//stridded iterator to access diagonal probabilities -template -class strided_range -{ - public: - - typedef typename thrust::iterator_difference::type difference_type; - - struct stride_functor : public thrust::unary_function - { - difference_type stride; - - stride_functor(difference_type stride) - : stride(stride) {} - - __host__ __device__ - difference_type operator()(const difference_type& i) const - { - if(stride == 1) //statevector - return i; - - //density matrix - difference_type i_chunk; - i_chunk = i / (stride - 1); - difference_type ret = stride * i - i_chunk*(stride-1); - return ret; - } - }; - - typedef typename thrust::counting_iterator CountingIterator; - typedef typename thrust::transform_iterator TransformIterator; - typedef typename thrust::permutation_iterator PermutationIterator; - - // type of the strided_range iterator - typedef PermutationIterator iterator; - - // construct strided_range for the range [first,last) - strided_range(Iterator first, Iterator last, difference_type stride) - : first(first), last(last), stride(stride) {} - - iterator begin(void) const - { - return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); - } - - iterator end(void) const - { - if(stride == 1) //statevector - return begin() + (last - first); - - //density matrix - return begin() + (last - first) / (stride-1); - } - - protected: - Iterator first; - Iterator last; - difference_type stride; -}; - -template -struct complex_dot_scan : public thrust::unary_function,thrust::complex> -{ - __host__ __device__ - thrust::complex operator()(thrust::complex x) { return thrust::complex(x.real()*x.real()+x.imag()*x.imag(),0); } -}; - -template -struct complex_norm : public thrust::unary_function,thrust::complex> -{ - __host__ __device__ - thrust::complex operator()(thrust::complex x) { return thrust::complex((double)x.real()*(double)x.real(),(double)x.imag()*(double)x.imag()); } -}; - -template -struct complex_less -{ - typedef thrust::complex first_argument_type; - typedef thrust::complex second_argument_type; - typedef bool result_type; - __thrust_exec_check_disable__ - __host__ __device__ bool operator()(const thrust::complex &lhs, const thrust::complex &rhs) const {return lhs.real() < rhs.real();} -}; // end less - - -class HostFuncBase -{ -protected: -public: - HostFuncBase(){} - - virtual void execute(){} -}; //============================================================================ // chunk container base class @@ -474,6 +92,7 @@ class ChunkContainer : public std::enable_shared_from_this& operator[](uint_t i) = 0; virtual uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers = AER_MAX_BUFFERS,bool multi_shots = false,int matrix_bit = AER_DEFAULT_MATRIX_BITS) = 0; @@ -600,7 +226,8 @@ class ChunkContainer : public std::enable_shared_from_this &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const = 0; - virtual thrust::complex norm(uint_t iChunk,uint_t count,uint_t stride = 1,bool dot = true) const = 0; + virtual double norm(uint_t iChunk,uint_t count) const; + virtual double trace(uint_t iChunk,uint_t row,uint_t count) const; size_t size_of_complex(void) @@ -683,6 +310,35 @@ class ChunkContainer : public std::enable_shared_from_this &mat,const uint_t count); + + //apply diagonal matrix + virtual void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count); + + //apply (controlled) X + virtual void apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count); + + //apply (controlled) Y + virtual void apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count); + + //apply (controlled) phase + virtual void apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count); + + //apply (controlled) swap gate + virtual void apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count); + + //apply permutation + virtual void apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count); + + //apply rotation around axis + virtual void apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count); + + //get probabilities of chunk + virtual void probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const; + + //Pauli expectation values + virtual double expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const; protected: int convert_blocked_qubit(int qubit) @@ -765,6 +421,7 @@ void ChunkContainer::Execute(Function func,uint_t iChunk,uint_t count) { set_device(); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_data( chunk_pointer(iChunk) ); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -818,7 +475,11 @@ void ChunkContainer::Execute(Function func,uint_t iChunk,uint_t count) thrust::for_each_n(thrust::seq, ci , size, func); } #else - uint_t size = count * func.size(chunk_bits_); + uint_t size; + if(func.use_cache()) + size = count << (chunk_bits_ - func.qubits_count()); + else + size = count * func.size(chunk_bits_); auto ci = thrust::counting_iterator(0); thrust::for_each_n(thrust::device, ci , size, func); #endif @@ -835,6 +496,7 @@ void ChunkContainer::ExecuteSum(double* pSum,Function func,uint_t iChunk set_device(); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_data( chunk_pointer(iChunk) ); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -959,6 +621,7 @@ void ChunkContainer::ExecuteSum(double* pSum,Function func,uint_t iChunk #else uint_t size = func.size(chunk_bits_); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -1000,6 +663,7 @@ void ChunkContainer::ExecuteSum2(double* pSum,Function func,uint_t iChun set_device(); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_data( chunk_pointer(iChunk) ); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -1097,6 +761,7 @@ void ChunkContainer::ExecuteSum2(double* pSum,Function func,uint_t iChun #else uint_t size = func.size(chunk_bits_); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -1148,7 +813,217 @@ void ChunkContainer::deallocate_chunks(void) reduced_queue_end_.clear(); } +template +void ChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) +{ + const size_t N = qubits.size() - control_bits; + + if(N == 1){ + if(control_bits == 0) + Execute(MatrixMult2x2(mat,qubits[0]), iChunk, count); + else //2x2 matrix with control bits + Execute(MatrixMult2x2Controlled(mat,qubits), iChunk, count); + } + else if(N == 2){ + Execute(MatrixMult4x4(mat,qubits[0],qubits[1]), iChunk, count); + } + else{ + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); +#ifndef AER_THRUST_CUDA + if(N == 3){ + StoreMatrix(mat, iChunk); + Execute(MatrixMult8x8(qubits,qubits_sorted), iChunk, count); + } + else if(N == 4){ + StoreMatrix(mat, iChunk); + Execute(MatrixMult16x16(qubits,qubits_sorted), iChunk, count); + } + else if(N <= 10){ +#else + if(N <= 10){ +#endif + int i; + for(i=0;i(N), iChunk, count); + } + else{ + cvector_t matLU; + reg_t params; + MatrixMultNxN_LU f(mat,qubits_sorted,matLU,params); + + StoreMatrix(matLU, iChunk); + StoreUintParams(params, iChunk); + + Execute(f, iChunk, count); + } + } +} + +template +void ChunkContainer::apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) +{ + const size_t N = qubits.size() - control_bits; + + if(N == 1){ + if(control_bits == 0) + Execute(DiagonalMult2x2(diag,qubits[0]), iChunk, count); + else + Execute(DiagonalMult2x2Controlled(diag,qubits), iChunk, count); + } + else if(N == 2){ + Execute(DiagonalMult4x4(diag,qubits[0],qubits[1]), iChunk, count); + } + else{ + StoreMatrix(diag, iChunk); + StoreUintParams(qubits, iChunk); + + Execute(DiagonalMultNxN(qubits), iChunk, count); + } +} + +template +void ChunkContainer::apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + Execute(CX_func(qubits), iChunk, count); +} + +template +void ChunkContainer::apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + Execute(CY_func(qubits), iChunk, count); +} + +template +void ChunkContainer::apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) +{ + Execute(phase_func(qubits,*(thrust::complex*)&phase), iChunk, count ); +} + +template +void ChunkContainer::apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count) +{ + Execute(CSwap_func(qubits), iChunk, count); +} + +template +void ChunkContainer::apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count) +{ + const size_t N = qubits.size(); + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + + reg_t params; + Permutation f(qubits_sorted,qubits,pairs,params); + + StoreUintParams(params, iChunk); + + Execute(f, iChunk, count); +} + +template +void ChunkContainer::apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count) +{ + int control_bits = qubits.size() - 1; + switch(r){ + case Rotation::x: + apply_matrix(iChunk, qubits, control_bits, Linalg::VMatrix::rx(theta), count); + break; + case Rotation::y: + apply_matrix(iChunk, qubits, control_bits, Linalg::VMatrix::ry(theta), count); + break; + case Rotation::z: + apply_diagonal_matrix(iChunk, qubits, control_bits, Linalg::VMatrix::rz_diag(theta), count); + break; + case Rotation::xx: + apply_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::rxx(theta), count); + break; + case Rotation::yy: + apply_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::ryy(theta), count); + break; + case Rotation::zz: + apply_diagonal_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::rzz_diag(theta), count); + break; + case Rotation::zx: + apply_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::rzx(theta), count); + break; + default: + throw std::invalid_argument( + "QubitVectorThrust::invalid rotation axis."); + } +} + +template +void ChunkContainer::probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const +{ + const size_t N = qubits.size(); + const int_t DIM = 1 << N; + probs.resize(DIM); + + if(N == 1){ //special case for 1 qubit (optimized for measure) + ExecuteSum2(&probs[0],probability_1qubit_func(qubits[0]), iChunk, 1); + } + else{ + for(int_t i=0;i(qubits,i), iChunk, 1); + } + } +} + +template +double ChunkContainer::norm(uint_t iChunk,uint_t count) const +{ + double ret; + ExecuteSum(&ret,norm_func(), iChunk, count); + + return ret; +} + +template +double ChunkContainer::trace(uint_t iChunk,uint_t row,uint_t count) const +{ + double ret; + ExecuteSum(&ret,trace_func(row), iChunk, count); + + return ret; +} + +template +double ChunkContainer::expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const +{ + uint_t x_mask, z_mask, num_y, x_max; + std::tie(x_mask, z_mask, num_y, x_max) = pauli_masks_and_phase(qubits, pauli); + + // Special case for only I Paulis + if (x_mask + z_mask == 0) { + thrust::complex ret = norm(iChunk, 1); + return ret.real() + ret.imag(); + } + double ret; + // specialize x_max == 0 + if(x_mask == 0) { + ExecuteSum(&ret, expval_pauli_Z_func(z_mask), iChunk, 1 ); + return ret; + } + + // Compute the overall phase of the operator. + // This is (-1j) ** number of Y terms modulo 4 + auto phase = std::complex(initial_phase); + add_y_phase(num_y, phase); + ExecuteSum(&ret, expval_pauli_XYZ_func(x_mask, z_mask, x_max, phase), iChunk, 1 ); + return ret; +} + + + + //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/chunk_manager.hpp b/src/simulators/statevector/chunk/chunk_manager.hpp index be3abf65c2..d8a8a4fbfa 100644 --- a/src/simulators/statevector/chunk/chunk_manager.hpp +++ b/src/simulators/statevector/chunk/chunk_manager.hpp @@ -1,7 +1,7 @@ /** * This code is part of Qiskit. * - * (C) Copyright IBM 2018, 2019, 2020. + * (C) Copyright IBM 2018, 2019, 2020, 2021, 2022. * * This code is licensed under the Apache License, Version 2.0. You may * obtain a copy of this license in the LICENSE.txt file in the root directory @@ -25,6 +25,8 @@ namespace AER { namespace QV { +namespace Chunk { + //============================================================================ // chunk manager class @@ -43,12 +45,15 @@ class ChunkManager int num_qubits_; //number of global qubits uint_t num_chunks_; //number of chunks on this process + uint_t chunk_index_; //global chunk index for the first chunk int i_dev_map_; //device index chunk to be mapped int idev_buffer_map_; //device index buffer to be mapped int iplace_host_; //chunk container for host memory bool multi_shots_; + + bool enable_cuStatevec_; public: ChunkManager(); @@ -65,7 +70,7 @@ class ChunkManager return chunks_.size(); } - uint_t Allocate(int chunk_bits,int nqubits,uint_t nchunks,int matrix_bit); + uint_t Allocate(int chunk_bits,int nqubits,uint_t nchunks,uint_t chunk_index,int matrix_bit,bool enable_cuStatevec); void Free(void); int num_devices(void) @@ -113,6 +118,7 @@ ChunkManager::ChunkManager() num_places_ = 1; chunk_bits_ = 0; num_chunks_ = 0; + chunk_index_ = 0; num_qubits_ = 0; multi_shots_ = false; @@ -161,7 +167,7 @@ ChunkManager::~ChunkManager() } template -uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks,int matrix_bit) +uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks,uint_t chunk_index,int matrix_bit, bool enable_cuStatevec) { uint_t num_buffers; int iDev; @@ -182,6 +188,9 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks, hybrid = true; } //--- + enable_cuStatevec_ = enable_cuStatevec; + + chunk_index_ = chunk_index; if(num_qubits_ != nqubits || chunk_bits_ != chunk_bits || nchunks > num_chunks_){ //free previous allocation @@ -246,40 +255,45 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks, num_places_ = num_chunks_; } - nchunks = num_chunks_; - //allocate chunk container before parallel loop using push_back to store shared pointer for(i=0;i>()); + continue; + } +#endif chunks_.push_back(std::make_shared>()); } uint_t chunks_allocated = 0; #pragma omp parallel for if(num_places_ > 1) private(is,ie,nc) reduction(+:chunks_allocated) for(iDev=0;iDevset_chunk_index(chunk_index_ + chunks_allocated); //set first chunk index for the container if(num_devices_ > 0) chunks_allocated += chunks_[iDev]->Allocate((iDev + idev_start)%num_devices_,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); else chunks_allocated += chunks_[iDev]->Allocate(iDev,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); } - if(chunks_allocated < nchunks){ + if(chunks_allocated < num_chunks_){ //rest of chunks are stored on host for(iDev=0;iDev 0){ + chunks_[num_places_]->set_chunk_index(chunk_index_ + chunks_allocated + is); //set first chunk index for the container chunks_.push_back(std::make_shared>()); chunks_[num_places_]->Allocate(-1,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); num_places_ += 1; } } - num_chunks_ = chunks_allocated; } #ifdef AER_DISABLE_GDR @@ -388,6 +402,7 @@ void ChunkManager::execute_on_device(Function func,const std::vector +class cuStateVecChunkContainer : public DeviceChunkContainer +{ +protected: + custatevecHandle_t custatevec_handle_; //cuStatevec handle for this chunk container + AERDeviceVector custatevec_work_; //work buffer for cuStatevec + uint_t custatevec_work_size_; //buffer size + uint_t custatevec_chunk_total_qubits_; //total qubits of statevector passed to ApplyMatrix + uint_t custatevec_chunk_count_; //number of counts for all chunks + +public: + using BaseContainer = DeviceChunkContainer; + + cuStateVecChunkContainer() + { + } + ~cuStateVecChunkContainer(); + + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) override; + void Deallocate(void) override; + + unsigned char* custatevec_work_pointer(uint_t iChunk) const + { + if(custatevec_work_size_ == 0) + return nullptr; + if(iChunk >= this->num_chunks_){ //for buffer chunks + return ((unsigned char*)thrust::raw_pointer_cast(custatevec_work_.data())) + ((BaseContainer::num_matrices_ + iChunk - this->num_chunks_) * custatevec_work_size_); + } + else{ + return ((unsigned char*)thrust::raw_pointer_cast(custatevec_work_.data())) + ((iChunk % BaseContainer::num_matrices_) * custatevec_work_size_); + } + } + + reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const override; + double norm(uint_t iChunk,uint_t count) const override; + + //apply matrix + void apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) override; + + //apply diagonal matrix + void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) override; + + //apply (controlled) X + void apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count) override; + + //apply (controlled) Y + void apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count) override; + + //apply (controlled) phase + virtual void apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) override; + + //apply (controlled) swap gate + void apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count) override; + + //apply permutation + void apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count) override; + + //apply rotation around axis + void apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count) override; + + //get probabilities of chunk + void probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const override; + + //Pauli expectation values + double expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const override; +}; + +template +cuStateVecChunkContainer::~cuStateVecChunkContainer(void) +{ + Deallocate(); +} + +template +uint_t cuStateVecChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) +{ + uint_t nc; + nc = BaseContainer::Allocate(idev,chunk_bits,num_qubits,chunks,buffers,multi_shots,matrix_bit); + + //initialize custatevevtor handle + custatevecStatus_t err; + + err = custatevecCreate(&custatevec_handle_); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::allocate::custatevecCreate : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + //allocate extra workspace for custatevec + std::vector> mat(1ull << (matrix_bit*2)); + + //count bits for multi-chunks + custatevec_chunk_total_qubits_ = this->num_pow2_qubits_; + custatevec_chunk_count_ = this->num_chunks_ >> (this->num_pow2_qubits_ - this->chunk_bits_); + + //matrix + err = custatevecApplyMatrix_bufferSize( + custatevec_handle_, CUDA_C_64F, custatevec_chunk_total_qubits_ , &mat[0], CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_COL, + 0, matrix_bit, 0, CUSTATEVEC_COMPUTE_64F, &custatevec_work_size_); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::ResizeMatrixBuffers : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + //diagonal matrix + size_t diag_size; + std::vector perm(matrix_bit); + std::vector basis(matrix_bit); + for(int_t i=0;i 0) + custatevec_work_.resize(custatevec_work_size_*BaseContainer::num_matrices_); + + return nc; +} + +template +void cuStateVecChunkContainer::Deallocate(void) +{ + BaseContainer::Deallocate(); + + custatevec_work_.clear(); + custatevec_work_.shrink_to_fit(); + custatevecDestroy(custatevec_handle_); +} + +template +reg_t cuStateVecChunkContainer::sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride, bool dot,uint_t count) const +{ + if(count == (1ull << (this->num_qubits_ - this->chunk_bits_))){ + //custatevecSampler_sample only can be applied to whole statevector + const int_t SHOTS = rnds.size(); + reg_t samples(SHOTS,0); + + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + custatevecStatus_t err; + custatevecSamplerDescriptor_t sampler; + size_t extSize; + + cudaStreamSynchronize(BaseContainer::stream_[iChunk]); + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + err = custatevecSampler_create(custatevec_handle_, BaseContainer::chunk_pointer(iChunk), state_type, this->num_qubits_, &sampler, SHOTS, &extSize); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_create " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + AERDeviceVector extBuf; + void* pExtBuf = nullptr; + if(extSize > 0){ + extBuf.resize(extSize); + pExtBuf = thrust::raw_pointer_cast(extBuf.data()); + } + + err = custatevecSampler_preprocess(custatevec_handle_,&sampler,pExtBuf,extSize); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_preprocess " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + std::vector bitStr(SHOTS); + std::vector bitOrdering(this->num_qubits_); + for(int_t i=0;inum_qubits_;i++){ + bitOrdering[i] = i; + } + + err = custatevecSampler_sample(custatevec_handle_, &sampler, &bitStr[0], &bitOrdering[0], this->num_qubits_, &rnds[0], SHOTS, + CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER ) ; + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_sample " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + for(int_t i=0;i 0){ + extBuf.clear(); + extBuf.shrink_to_fit(); + } + return samples; + } + else{ + return BaseContainer::sample_measure(iChunk, rnds, stride, dot, count); + } +} + +template +void cuStateVecChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) +{ + thrust::complex* pMat; + int_t num_qubits = qubits.size()-control_bits; + + pMat = (thrust::complex*)&mat[0]; + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + cudaDataType_t state_type; + custatevecComputeType_t comp_type; + if(sizeof(data_t) == sizeof(double)){ + state_type = CUDA_C_64F; + comp_type = CUSTATEVEC_COMPUTE_64F; + } + else{ + state_type = CUDA_C_32F; + comp_type = CUSTATEVEC_COMPUTE_32F; + } + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) +{ + thrust::complex* pMat; + int_t num_qubits = qubits.size(); + + if(control_bits > 0){ + uint_t size = 1ull << num_qubits; + cvector_t diag_ctrl(size); //make diagonal matrix with controls + + for(int_t i=0;i*)&diag[0]; + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + int_t num_qubits = qubits.size(); + + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + uint_t perm_size = 1ull << num_qubits; + std::vector perm(perm_size); + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + int_t num_qubits = qubits.size(); + + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + uint_t perm_size = 1ull << num_qubits; + cvector_t diag(perm_size); + std::vector perm(perm_size); + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) +{ + uint_t size = 1ull << qubits.size(); + cvector_t diag(size); + for(int_t i=0;i +void cuStateVecChunkContainer::apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count) +{ + int_t num_qubits = qubits.size(); + + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + uint_t perm_size = 1ull << num_qubits; + std::vector swap(perm_size); + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count) +{ + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + int_t size = 1ull << qubits.size(); + custatevecIndex_t perm[size]; + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count) +{ + custatevecPauli_t pauli[2]; + int nPauli = 1; + + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + int control_bits = qubits.size() - 1; + + switch(r){ + case Rotation::x: + pauli[0] = CUSTATEVEC_PAULI_X; + break; + case Rotation::y: + pauli[0] = CUSTATEVEC_PAULI_Y; + break; + case Rotation::z: + pauli[0] = CUSTATEVEC_PAULI_Z; + break; + case Rotation::xx: + pauli[0] = CUSTATEVEC_PAULI_X; + pauli[1] = CUSTATEVEC_PAULI_X; + nPauli = 2; + control_bits--; + break; + case Rotation::yy: + pauli[0] = CUSTATEVEC_PAULI_Y; + pauli[1] = CUSTATEVEC_PAULI_Y; + nPauli = 2; + control_bits--; + break; + case Rotation::zz: + pauli[0] = CUSTATEVEC_PAULI_Z; + pauli[1] = CUSTATEVEC_PAULI_Z; + nPauli = 2; + control_bits--; + break; + case Rotation::zx: + pauli[0] = CUSTATEVEC_PAULI_Z; + pauli[1] = CUSTATEVEC_PAULI_X; + nPauli = 2; + control_bits--; + break; + default: + throw std::invalid_argument( + "QubitVectorThrust::invalid rotation axis."); + } + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +double cuStateVecChunkContainer::norm(uint_t iChunk,uint_t count) const +{ + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + double ret = 0.0; + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const +{ + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + std::vector qubits32(qubits.size()); + for(int_t i=0;ichunk_bits_, + &p0, &p1, &qubits32[0], 1); + probs.resize(2); + probs[0] = p0; + probs[1] = p1; + } + else{ + probs.resize(1ull << qubits.size()); + err = custatevecAbs2SumArray(custatevec_handle_, BaseContainer::chunk_pointer(iChunk), state_type, this->chunk_bits_, + &probs[0], &qubits32[0], qubits.size(), nullptr,nullptr,0); + } + + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::probabilities : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } +} + +template +double cuStateVecChunkContainer::expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const +{ + if(initial_phase != 1.0){ + return BaseContainer::expval_pauli(iChunk, qubits, pauli, initial_phase); + } + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecPauli_t pauliOps[pauli.size()]; + int32_t qubits32[qubits.size()]; + for(int_t i=0;ichunk_bits_, + ret, pauliOperatorsArray, basisBitsArray, nBasisBitsArray, 1); + + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::expval_pauli : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + return ret[0]; +} + + + +//------------------------------------------------------------------------------ +} // end namespace Chunk +} // end namespace QV +} // end namespace AER +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +#endif // end module diff --git a/src/simulators/statevector/chunk/cuda_kernels.hpp b/src/simulators/statevector/chunk/cuda_kernels.hpp index 9322a69279..4380578813 100644 --- a/src/simulators/statevector/chunk/cuda_kernels.hpp +++ b/src/simulators/statevector/chunk/cuda_kernels.hpp @@ -18,6 +18,7 @@ namespace AER { namespace QV { +namespace Chunk { template __global__ @@ -339,6 +340,7 @@ __global__ void dev_reduce_sum_uint(uint_t *pReduceBuffer,uint_t n,uint_t buf_si //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index 34e92ab1c8..68126695c6 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -1,7 +1,7 @@ /** * This code is part of Qiskit. * - * (C) Copyright IBM 2018, 2019, 2020. + * (C) Copyright IBM 2018, 2019, 2020, 2021, 2022. * * This code is licensed under the Apache License, Version 2.0. You may * obtain a copy of this license in the LICENSE.txt file in the root directory @@ -18,10 +18,9 @@ #include "simulators/statevector/chunk/chunk_container.hpp" - - namespace AER { namespace QV { +namespace Chunk { //============================================================================ @@ -59,6 +58,7 @@ class DeviceChunkContainer : public ChunkContainer #ifdef AER_THRUST_CUDA std::vector stream_; //asynchronous execution #endif + public: DeviceChunkContainer() { @@ -103,13 +103,13 @@ class DeviceChunkContainer : public ChunkContainer return raw_reference_cast(data_[i]); } - uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit); - void Deallocate(void); + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) override; + void Deallocate(void) override; - void StoreMatrix(const std::vector>& mat,uint_t iChunk); - void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size); - void StoreUintParams(const std::vector& prm,uint_t iChunk); - void ResizeMatrixBuffers(int bits); + void StoreMatrix(const std::vector>& mat,uint_t iChunk) override; + void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size) override; + void StoreUintParams(const std::vector& prm,uint_t iChunk) override; + void ResizeMatrixBuffers(int bits) override; void set_device(void) const { @@ -134,16 +134,15 @@ class DeviceChunkContainer : public ChunkContainer return data_[i]; } - void CopyIn(Chunk& src,uint_t iChunk); - void CopyOut(Chunk& src,uint_t iChunk); - void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size); - void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size); - void Swap(Chunk& src,uint_t iChunk); + void CopyIn(Chunk& src,uint_t iChunk) override; + void CopyOut(Chunk& src,uint_t iChunk) override; + void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size) override; + void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size) override; + void Swap(Chunk& src,uint_t iChunk) override; - void Zero(uint_t iChunk,uint_t count); + void Zero(uint_t iChunk,uint_t count) override; - reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const; - thrust::complex norm(uint_t iChunk,uint_t count,uint_t stride = 1,bool dot = true) const; + reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const override; thrust::complex* chunk_pointer(uint_t iChunk) const { @@ -322,6 +321,14 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu this->num_chunks_ = nc; data_.resize((nc+buffers) << chunk_bits); + //init number of bits for chunk count + uint_t nc_tmp = this->num_chunks_; + this->num_pow2_qubits_ = this->chunk_bits_; + while((nc_tmp & 1) == 0){ + this->num_pow2_qubits_++; + nc_tmp >>= 1; + } + #ifdef AER_THRUST_CUDA stream_.resize(nc + buffers); for(i=0;i::Deallocate(void) blocked_qubits_holder_.clear(); #ifdef AER_THRUST_CUDA - uint_t i; - for(i=0;i::sample_measure(uint_t iChunk,const std::vect #ifdef AER_THRUST_CUDA -// cudaGetLastError(); if(dot) thrust::transform_inclusive_scan(thrust::cuda::par.on(stream_[iChunk]),iter.begin(),iter.end(),iter.begin(),complex_dot_scan(),thrust::plus>()); else @@ -676,30 +681,6 @@ reg_t DeviceChunkContainer::sample_measure(uint_t iChunk,const std::vect return samples; } -template -thrust::complex DeviceChunkContainer::norm(uint_t iChunk, uint_t count, uint_t stride, bool dot) const -{ - thrust::complex sum,zero(0.0,0.0); - set_device(); - - strided_range*> iter(chunk_pointer(iChunk), chunk_pointer(iChunk+count), stride); - -#ifdef AER_THRUST_CUDA - cudaStreamSynchronize(stream_[iChunk]); - if(dot) - sum = thrust::transform_reduce(thrust::device, iter.begin(),iter.end(),complex_norm() ,zero,thrust::plus>()); - else - sum = thrust::reduce(thrust::device, iter.begin(),iter.end(),zero,thrust::plus>()); -#else - if(dot) - sum = thrust::transform_reduce(thrust::device, iter.begin(),iter.end(),complex_norm() ,zero,thrust::plus>()); - else - sum = thrust::reduce(thrust::device, iter.begin(),iter.end(),zero,thrust::plus>()); -#endif - - return sum; -} - //set qubits to be blocked template @@ -1177,7 +1158,9 @@ void DeviceChunkContainer::copy_to_probability_buffer(std::vector return data_[i]; } - uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit); - void Deallocate(void); + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) override; + void Deallocate(void) override; - void StoreMatrix(const std::vector>& mat,uint_t iChunk) + void StoreMatrix(const std::vector>& mat,uint_t iChunk) override { matrix_[iChunk] = (thrust::complex*)&mat[0]; } - void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size) + void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size) override { matrix_[iChunk] = (thrust::complex*)mat; } - void StoreUintParams(const std::vector& prm,uint_t iChunk) + void StoreUintParams(const std::vector& prm,uint_t iChunk) override { params_[iChunk] = (uint_t*)&prm[0]; } void ResizeMatrixBuffers(int bits){} - void Set(uint_t i,const thrust::complex& t) + void Set(uint_t i,const thrust::complex& t) override { data_[i] = t; } - thrust::complex Get(uint_t i) const + thrust::complex Get(uint_t i) const override { return data_[i]; } - thrust::complex* chunk_pointer(uint_t iChunk) const + thrust::complex* chunk_pointer(uint_t iChunk) const override { return (thrust::complex*)thrust::raw_pointer_cast(data_.data()) + (iChunk << this->chunk_bits_); } - thrust::complex* matrix_pointer(uint_t iChunk) const + thrust::complex* matrix_pointer(uint_t iChunk) const override { return matrix_[iChunk]; } - uint_t* param_pointer(uint_t iChunk) const + uint_t* param_pointer(uint_t iChunk) const override { return params_[iChunk]; } @@ -104,16 +105,15 @@ class HostChunkContainer : public ChunkContainer #endif } - void CopyIn(Chunk& src,uint_t iChunk); - void CopyOut(Chunk& src,uint_t iChunk); - void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size); - void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size); - void Swap(Chunk& src,uint_t iChunk); + void CopyIn(Chunk& src,uint_t iChunk) override; + void CopyOut(Chunk& src,uint_t iChunk) override; + void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size) override; + void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size) override; + void Swap(Chunk& src,uint_t iChunk) override; - void Zero(uint_t iChunk,uint_t count); + void Zero(uint_t iChunk,uint_t count) override; - reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const; - thrust::complex norm(uint_t iChunk,uint_t count,uint_t stride = 1,bool dot = true) const; + reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const override; }; @@ -267,22 +267,9 @@ reg_t HostChunkContainer::sample_measure(uint_t iChunk,const std::vector return samples; } -template -thrust::complex HostChunkContainer::norm(uint_t iChunk, uint_t count, uint_t stride, bool dot) const -{ - thrust::complex sum,zero(0.0,0.0); - - strided_range*> iter(chunk_pointer(iChunk), chunk_pointer(iChunk+count), stride); - - if(dot) - sum = thrust::transform_reduce(thrust::omp::par, iter.begin(),iter.end(),complex_norm() ,zero,thrust::plus>()); - else - sum = thrust::reduce(thrust::omp::par, iter.begin(),iter.end(),zero,thrust::plus>()); - - return sum; -} //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/thrust_kernels.hpp b/src/simulators/statevector/chunk/thrust_kernels.hpp new file mode 100644 index 0000000000..a882f5c8fc --- /dev/null +++ b/src/simulators/statevector/chunk/thrust_kernels.hpp @@ -0,0 +1,2699 @@ +/** + * This code is part of Qiskit. + * + * (C) Copyright IBM 2018, 2019, 2020. + * + * This code is licensed under the Apache License, Version 2.0. You may + * obtain a copy of this license in the LICENSE.txt file in the root directory + * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. + * + * Any modifications or derivative works of this code must retain this + * copyright notice, and modified files need to carry a notice indicating + * that they have been altered from the originals. + */ + + +#ifndef _qv_thrust_kernels_hpp_ +#define _qv_thrust_kernels_hpp_ + +#include "misc/warnings.hpp" +DISABLE_WARNING_PUSH +#ifdef AER_THRUST_CUDA +#include +#include +#endif +DISABLE_WARNING_POP + +#include "misc/wrap_thrust.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "framework/utils.hpp" + +#ifdef AER_THRUST_CUDA +#include "simulators/statevector/chunk/cuda_kernels.hpp" +#endif + +namespace AER { +namespace QV { +namespace Chunk { + +//======================================== +// base class of gate functions +//======================================== +template +class GateFuncBase +{ +protected: + thrust::complex* data_; //pointer to state vector buffer + thrust::complex* matrix_; //storage for matrix on device + uint_t* params_; //storage for additional parameters on device + uint_t base_index_; //start index of state vector + uint_t chunk_bits_; + uint_t* cregs_; + uint_t num_creg_bits_; + int_t conditional_bit_; +#ifndef AER_THRUST_CUDA + uint_t index_offset_; +#endif +public: + GateFuncBase() + { + data_ = NULL; + base_index_ = 0; + cregs_ = NULL; + num_creg_bits_ = 0; + conditional_bit_ = -1; +#ifndef AER_THRUST_CUDA + index_offset_ = 0; +#endif + } + virtual void set_data(thrust::complex* p) + { + data_ = p; + } + void set_matrix(thrust::complex* mat) + { + matrix_ = mat; + } + void set_params(uint_t* p) + { + params_ = p; + } + void set_chunk_bits(uint_t bits) + { + chunk_bits_ = bits; + } + + void set_base_index(uint_t i) + { + base_index_ = i; + } + void set_cregs_(uint_t* cbits,uint_t nreg) + { + cregs_ = cbits; + num_creg_bits_ = nreg; + } + void set_conditional(int_t bit) + { + conditional_bit_ = bit; + } + +#ifndef AER_THRUST_CUDA + void set_index_offset(uint_t i) + { + index_offset_ = i; + } +#endif + + __host__ __device__ thrust::complex* data(void) + { + return data_; + } + + virtual bool is_diagonal(void) + { + return false; + } + virtual int qubits_count(void) + { + return 1; + } + virtual int num_control_bits(void) + { + return 0; + } + virtual int control_mask(void) + { + return 1; + } + virtual bool use_cache(void) + { + return false; + } + virtual bool batch_enable(void) + { + return true; + } + + virtual const char* name(void) + { + return "base function"; + } + virtual uint_t size(int num_qubits) + { + if(is_diagonal()){ + chunk_bits_ = num_qubits; + return (1ull << num_qubits); + } + else{ + chunk_bits_ = num_qubits - (qubits_count() - num_control_bits()); + return (1ull << (num_qubits - (qubits_count() - num_control_bits()))); + } + } + + virtual __host__ __device__ uint_t thread_to_index(uint_t _tid) const + { + return _tid; + } + virtual __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + //implemente this in the kernel class + } + virtual __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + //implemente this in the kernel class + return 0.0; + } + + virtual __host__ __device__ bool check_conditional(uint_t i) const + { + if(conditional_bit_ < 0) + return true; + + uint_t iChunk = i >> chunk_bits_; + uint_t n64,i64,ibit; + n64 = (num_creg_bits_ + 63) >> 6; + i64 = conditional_bit_ >> 6; + ibit = conditional_bit_ & 63; + return (((cregs_[iChunk*n64 + i64] >> ibit) & 1) != 0); + } +}; + +//======================================== + // gate functions with cache +//======================================== +template +class GateFuncWithCache : public GateFuncBase +{ +protected: + int nqubits_; +public: + GateFuncWithCache(uint_t nq) + { + nqubits_ = nq; + } + + bool use_cache(void) + { + return true; + } + + __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const + { + uint_t idx,ii,t,j; + uint_t* qubits; + uint_t* qubits_sorted; + + qubits_sorted = this->params_; + qubits = qubits_sorted + nqubits_; + + idx = 0; + ii = _tid >> nqubits_; + for(j=0;j> j) & 1) != 0){ + idx += (1ull << qubits[j]); + } + } + idx += ii; + return idx; + } + + __host__ __device__ void sync_threads() const + { +#ifdef CUDA_ARCH + __syncthreads(); +#endif + } + + __host__ __device__ void operator()(const uint_t &i) const + { + if(!this->check_conditional(i)) + return; + + thrust::complex cache[1024]; + uint_t j,idx; + uint_t matSize = 1ull << nqubits_; + + //load data to cache + for(j=0;jdata_[idx]; + } + + //execute using cache + for(j=0;jrun_with_cache(j,idx,cache); + } + } + + virtual int qubits_count(void) + { + return nqubits_; + } +}; + +template +class GateFuncSumWithCache : public GateFuncBase +{ +protected: + int nqubits_; +public: + GateFuncSumWithCache(uint_t nq) + { + nqubits_ = nq; + } + + bool use_cache(void) + { + return true; + } + + + __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const + { + uint_t idx,ii,t,j; + uint_t* qubits; + uint_t* qubits_sorted; + + qubits_sorted = this->params_; + qubits = qubits_sorted + nqubits_; + + idx = 0; + ii = _tid >> nqubits_; + for(j=0;j> j) & 1) != 0){ + idx += (1ull << qubits[j]); + } + } + idx += ii; + return idx; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + if(!this->check_conditional(i)) + return 0.0; + + thrust::complex cache[1024]; + uint_t j,idx; + uint_t matSize = 1ull << nqubits_; + double sum = 0.0; + + //load data to cache + for(j=0;jdata_[idx]; + } + + //execute using cache + for(j=0;jrun_with_cache_sum(j,idx,cache); + } + return sum; + } + + virtual int qubits_count(void) + { + return nqubits_; + } + +}; + +//stridded iterator to access diagonal probabilities +template +class strided_range +{ + public: + + typedef typename thrust::iterator_difference::type difference_type; + + struct stride_functor : public thrust::unary_function + { + difference_type stride; + + stride_functor(difference_type stride) + : stride(stride) {} + + __host__ __device__ + difference_type operator()(const difference_type& i) const + { + if(stride == 1) //statevector + return i; + + //density matrix + difference_type i_chunk; + i_chunk = i / (stride - 1); + difference_type ret = stride * i - i_chunk*(stride-1); + return ret; + } + }; + + typedef typename thrust::counting_iterator CountingIterator; + typedef typename thrust::transform_iterator TransformIterator; + typedef typename thrust::permutation_iterator PermutationIterator; + + // type of the strided_range iterator + typedef PermutationIterator iterator; + + // construct strided_range for the range [first,last) + strided_range(Iterator first, Iterator last, difference_type stride) + : first(first), last(last), stride(stride) {} + + iterator begin(void) const + { + return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); + } + + iterator end(void) const + { + if(stride == 1) //statevector + return begin() + (last - first); + + //density matrix + return begin() + (last - first) / (stride-1); + } + + protected: + Iterator first; + Iterator last; + difference_type stride; +}; + +template +struct complex_dot_scan : public thrust::unary_function,thrust::complex> +{ + __host__ __device__ + thrust::complex operator()(thrust::complex x) { return thrust::complex(x.real()*x.real()+x.imag()*x.imag(),0); } +}; + +template +struct complex_norm : public thrust::unary_function,thrust::complex> +{ + __host__ __device__ + thrust::complex operator()(thrust::complex x) { return thrust::complex((double)x.real()*(double)x.real(),(double)x.imag()*(double)x.imag()); } +}; + +template +struct complex_less +{ + typedef thrust::complex first_argument_type; + typedef thrust::complex second_argument_type; + typedef bool result_type; + __thrust_exec_check_disable__ + __host__ __device__ bool operator()(const thrust::complex &lhs, const thrust::complex &rhs) const {return lhs.real() < rhs.real();} +}; // end less + + +class HostFuncBase +{ +protected: +public: + HostFuncBase(){} + + virtual void execute(){} +}; + + +//------------------------------------------------------------------------------ +// State initialize component +//------------------------------------------------------------------------------ +template +class initialize_component_1qubit_func : public GateFuncBase +{ +protected: + thrust::complex s0,s1; + uint_t mask; + uint_t offset; +public: + initialize_component_1qubit_func(int qubit,thrust::complex state0,thrust::complex state1) + { + s0 = state0; + s1 = state1; + + mask = (1ull << qubit) - 1; + offset = 1ull << qubit; + } + + virtual __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec0[i0]; + + vec0[i0] = s0*q0; + vec1[i0] = s1*q0; + } + + const char* name(void) + { + return "initialize_component 1 qubit"; + } +}; + +template +class initialize_component_func : public GateFuncBase +{ +protected: + int nqubits; + uint_t matSize; +public: + initialize_component_func(const cvector_t& mat,const reg_t &qb) + { + nqubits = qb.size(); + matSize = 1ull << nqubits; + } + + int qubits_count(void) + { + return nqubits; + } + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q; + thrust::complex* state; + uint_t* qubits; + uint_t* qubits_sorted; + uint_t j,k; + uint_t ii,idx,t; + uint_t mask; + + //get parameters from iterator + vec = this->data_; + state = this->matrix_; + qubits = this->params_; + qubits_sorted = qubits + nqubits; + + idx = 0; + ii = i; + for(j=0;j> j) & 1) != 0) + ii += (1ull << qubits[j]); + } + q = q0 * state[k]; + vec[ii] = q; + } + } + + const char* name(void) + { + return "initialize_component"; + } +}; + +template +class initialize_large_component_func : public GateFuncBase +{ +protected: + int num_qubits_; + uint_t mask_; + uint_t cmask_; + thrust::complex init_; +public: + initialize_large_component_func(thrust::complex m,const reg_t& qubits,int i) + { + num_qubits_ = qubits.size(); + init_ = m; + + mask_ = 0; + cmask_ = 0; + for(int k=0;k> k) & 1) != 0){ + cmask_ |= (1ull << qubits[k]); + } + } + } + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q; + vec = this->data_; + if((i & mask_) == cmask_){ + q = vec[i]; + vec[i] = init_*q; + } + } + const char* name(void) + { + return "initialize_large_component"; + } +}; + +//------------------------------------------------------------------------------ +// Zero clear +//------------------------------------------------------------------------------ +template +class ZeroClear : public GateFuncBase +{ +protected: +public: + ZeroClear() {} + bool is_diagonal(void) + { + return true; + } + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + vec = this->data_; + vec[i] = 0.0; + } + const char* name(void) + { + return "zero"; + } +}; + + +//------------------------------------------------------------------------------ +// Initialize state +//------------------------------------------------------------------------------ +template +class initialize_kernel : public GateFuncBase +{ +protected: + int num_qubits_state_; + uint_t offset_; + thrust::complex init_val_; +public: + initialize_kernel(thrust::complex v,int nqs,uint_t offset) + { + num_qubits_state_ = nqs; + offset_ = offset; + init_val_ = v; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + uint_t iChunk = (i >> num_qubits_state_); + + vec = this->data_; + + if(i == iChunk * offset_){ + vec[i] = init_val_; + } + else{ + vec[i] = 0.0; + } + } + const char* name(void) + { + return "initialize"; + } +}; + +//------------------------------------------------------------------------------ +// Matrix multiplication +//------------------------------------------------------------------------------ +template +class MatrixMult2x2 : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + int qubit; + uint_t mask; + uint_t offset0; + +public: + MatrixMult2x2(const cvector_t& mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + + mask = (1ull << qubit) - 1; + + offset0 = 1ull << qubit; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + vec1 = vec0 + offset0; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = m0 * q0 + m2 * q1; + vec1[i0] = m1 * q0 + m3 * q1; + } + const char* name(void) + { + return "mult2x2"; + } +}; + + +template +class MatrixMult4x4 : public GateFuncBase +{ +protected: + thrust::complex m00,m10,m20,m30; + thrust::complex m01,m11,m21,m31; + thrust::complex m02,m12,m22,m32; + thrust::complex m03,m13,m23,m33; + uint_t mask0; + uint_t mask1; + uint_t offset0; + uint_t offset1; + +public: + MatrixMult4x4(const cvector_t& mat,int qubit0,int qubit1) + { + m00 = mat[0]; + m01 = mat[1]; + m02 = mat[2]; + m03 = mat[3]; + + m10 = mat[4]; + m11 = mat[5]; + m12 = mat[6]; + m13 = mat[7]; + + m20 = mat[8]; + m21 = mat[9]; + m22 = mat[10]; + m23 = mat[11]; + + m30 = mat[12]; + m31 = mat[13]; + m32 = mat[14]; + m33 = mat[15]; + + offset0 = 1ull << qubit0; + offset1 = 1ull << qubit1; + if(qubit0 < qubit1){ + mask0 = offset0 - 1; + mask1 = offset1 - 1; + } + else{ + mask0 = offset1 - 1; + mask1 = offset0 - 1; + } + } + + int qubits_count(void) + { + return 2; + } + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1,i2; + thrust::complex* vec0; + thrust::complex* vec1; + thrust::complex* vec2; + thrust::complex* vec3; + thrust::complex q0,q1,q2,q3; + + vec0 = this->data_; + + i0 = i & mask0; + i2 = (i - i0) << 1; + i1 = i2 & mask1; + i2 = (i2 - i1) << 1; + + i0 = i0 + i1 + i2; + + vec1 = vec0 + offset0; + vec2 = vec0 + offset1; + vec3 = vec2 + offset0; + + q0 = vec0[i0]; + q1 = vec1[i0]; + q2 = vec2[i0]; + q3 = vec3[i0]; + + vec0[i0] = m00 * q0 + m10 * q1 + m20 * q2 + m30 * q3; + vec1[i0] = m01 * q0 + m11 * q1 + m21 * q2 + m31 * q3; + vec2[i0] = m02 * q0 + m12 * q1 + m22 * q2 + m32 * q3; + vec3[i0] = m03 * q0 + m13 * q1 + m23 * q2 + m33 * q3; + } + const char* name(void) + { + return "mult4x4"; + } +}; + +template +class MatrixMult8x8 : public GateFuncBase +{ +protected: + uint_t offset0; + uint_t offset1; + uint_t offset2; + uint_t mask0; + uint_t mask1; + uint_t mask2; + +public: + MatrixMult8x8(const reg_t &qubit,const reg_t &qubit_ordered) + { + offset0 = (1ull << qubit[0]); + offset1 = (1ull << qubit[1]); + offset2 = (1ull << qubit[2]); + + mask0 = (1ull << qubit_ordered[0]) - 1; + mask1 = (1ull << qubit_ordered[1]) - 1; + mask2 = (1ull << qubit_ordered[2]) - 1; + } + + int qubits_count(void) + { + return 3; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1,i2,i3; + thrust::complex* vec; + thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; + thrust::complex m0,m1,m2,m3,m4,m5,m6,m7; + thrust::complex* pMat; + + vec = this->data_; + pMat = this->matrix_; + + i0 = i & mask0; + i3 = (i - i0) << 1; + i1 = i3 & mask1; + i3 = (i3 - i1) << 1; + i2 = i3 & mask2; + i3 = (i3 - i2) << 1; + + i0 = i0 + i1 + i2 + i3; + + q0 = vec[i0]; + q1 = vec[i0 + offset0]; + q2 = vec[i0 + offset1]; + q3 = vec[i0 + offset1 + offset0]; + q4 = vec[i0 + offset2]; + q5 = vec[i0 + offset2 + offset0]; + q6 = vec[i0 + offset2 + offset1]; + q7 = vec[i0 + offset2 + offset1 + offset0]; + + m0 = pMat[0]; + m1 = pMat[8]; + m2 = pMat[16]; + m3 = pMat[24]; + m4 = pMat[32]; + m5 = pMat[40]; + m6 = pMat[48]; + m7 = pMat[56]; + + vec[i0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[1]; + m1 = pMat[9]; + m2 = pMat[17]; + m3 = pMat[25]; + m4 = pMat[33]; + m5 = pMat[41]; + m6 = pMat[49]; + m7 = pMat[57]; + + vec[i0 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[2]; + m1 = pMat[10]; + m2 = pMat[18]; + m3 = pMat[26]; + m4 = pMat[34]; + m5 = pMat[42]; + m6 = pMat[50]; + m7 = pMat[58]; + + vec[i0 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[3]; + m1 = pMat[11]; + m2 = pMat[19]; + m3 = pMat[27]; + m4 = pMat[35]; + m5 = pMat[43]; + m6 = pMat[51]; + m7 = pMat[59]; + + vec[i0 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[4]; + m1 = pMat[12]; + m2 = pMat[20]; + m3 = pMat[28]; + m4 = pMat[36]; + m5 = pMat[44]; + m6 = pMat[52]; + m7 = pMat[60]; + + vec[i0 + offset2] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[5]; + m1 = pMat[13]; + m2 = pMat[21]; + m3 = pMat[29]; + m4 = pMat[37]; + m5 = pMat[45]; + m6 = pMat[53]; + m7 = pMat[61]; + + vec[i0 + offset2 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[6]; + m1 = pMat[14]; + m2 = pMat[22]; + m3 = pMat[30]; + m4 = pMat[38]; + m5 = pMat[46]; + m6 = pMat[54]; + m7 = pMat[62]; + + vec[i0 + offset2 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[7]; + m1 = pMat[15]; + m2 = pMat[23]; + m3 = pMat[31]; + m4 = pMat[39]; + m5 = pMat[47]; + m6 = pMat[55]; + m7 = pMat[63]; + + vec[i0 + offset2 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + } + const char* name(void) + { + return "mult8x8"; + } +}; + +template +class MatrixMult16x16 : public GateFuncBase +{ +protected: + uint_t offset0; + uint_t offset1; + uint_t offset2; + uint_t offset3; + uint_t mask0; + uint_t mask1; + uint_t mask2; + uint_t mask3; +public: + MatrixMult16x16(const reg_t &qubit,const reg_t &qubit_ordered) + { + offset0 = (1ull << qubit[0]); + offset1 = (1ull << qubit[1]); + offset2 = (1ull << qubit[2]); + offset3 = (1ull << qubit[3]); + + mask0 = (1ull << qubit_ordered[0]) - 1; + mask1 = (1ull << qubit_ordered[1]) - 1; + mask2 = (1ull << qubit_ordered[2]) - 1; + mask3 = (1ull << qubit_ordered[3]) - 1; + } + + int qubits_count(void) + { + return 4; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1,i2,i3,i4,offset,f0,f1,f2; + thrust::complex* vec; + thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; + thrust::complex q8,q9,q10,q11,q12,q13,q14,q15; + thrust::complex r; + thrust::complex* pMat; + int j; + + vec = this->data_; + pMat = this->matrix_; + + i0 = i & mask0; + i4 = (i - i0) << 1; + i1 = i4 & mask1; + i4 = (i4 - i1) << 1; + i2 = i4 & mask2; + i4 = (i4 - i2) << 1; + i3 = i4 & mask3; + i4 = (i4 - i3) << 1; + + i0 = i0 + i1 + i2 + i3 + i4; + + q0 = vec[i0]; + q1 = vec[i0 + offset0]; + q2 = vec[i0 + offset1]; + q3 = vec[i0 + offset1 + offset0]; + q4 = vec[i0 + offset2]; + q5 = vec[i0 + offset2 + offset0]; + q6 = vec[i0 + offset2 + offset1]; + q7 = vec[i0 + offset2 + offset1 + offset0]; + q8 = vec[i0 + offset3]; + q9 = vec[i0 + offset3 + offset0]; + q10 = vec[i0 + offset3 + offset1]; + q11 = vec[i0 + offset3 + offset1 + offset0]; + q12 = vec[i0 + offset3 + offset2]; + q13 = vec[i0 + offset3 + offset2 + offset0]; + q14 = vec[i0 + offset3 + offset2 + offset1]; + q15 = vec[i0 + offset3 + offset2 + offset1 + offset0]; + + offset = 0; + f0 = 0; + f1 = 0; + f2 = 0; + for(j=0;j<16;j++){ + r = pMat[0+j]*q0; + r += pMat[16+j]*q1; + r += pMat[32+j]*q2; + r += pMat[48+j]*q3; + r += pMat[64+j]*q4; + r += pMat[80+j]*q5; + r += pMat[96+j]*q6; + r += pMat[112+j]*q7; + r += pMat[128+j]*q8; + r += pMat[144+j]*q9; + r += pMat[160+j]*q10; + r += pMat[176+j]*q11; + r += pMat[192+j]*q12; + r += pMat[208+j]*q13; + r += pMat[224+j]*q14; + r += pMat[240+j]*q15; + + offset = offset3 * (((uint_t)j >> 3) & 1) + + offset2 * (((uint_t)j >> 2) & 1) + + offset1 * (((uint_t)j >> 1) & 1) + + offset0 * ((uint_t)j & 1); + + vec[i0 + offset] = r; + } + } + const char* name(void) + { + return "mult16x16"; + } +}; + +template +class MatrixMultNxN : public GateFuncWithCache +{ +protected: +public: + MatrixMultNxN(uint_t nq) : GateFuncWithCache(nq) + { + ; + } + + __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + uint_t j,threadID; + thrust::complex q,r; + thrust::complex m; + uint_t mat_size,irow; + thrust::complex* vec; + thrust::complex* pMat; + + vec = this->data_; + pMat = this->matrix_; + + mat_size = 1ull << this->nqubits_; + irow = _tid & (mat_size - 1); + + r = 0.0; + for(j=0;j +class MatrixMultNxN_LU : public GateFuncBase +{ +protected: + int nqubits; + uint_t matSize; + int nswap; +public: + MatrixMultNxN_LU(const cvector_t& mat,const reg_t &qb,cvector_t& matLU,reg_t& params) + { + uint_t i,j,k,imax; + std::complex c0,c1; + double d,dmax; + uint_t* pSwap; + + nqubits = qb.size(); + matSize = 1ull << nqubits; + + matLU = mat; + params.resize(nqubits + matSize*2); + + for(k=0;k dmax){ + dmax = d; + imax = j; + } + } + if(imax != i){ + j = params[nqubits + imax]; + params[nqubits + imax] = params[nqubits + i]; + params[nqubits + i] = j; + } + + if(dmax != 0){ + c0 = matLU[(i << nqubits) + params[nqubits + i]]; + + for(j=i+1;j q,qt; + thrust::complex m; + thrust::complex r; + uint_t j,k,l,iq; + uint_t ii,idx,t; + uint_t mask,offset_j,offset_k; + thrust::complex* vec; + thrust::complex* pMat; + uint_t* qubits; + uint_t* pivot; + uint_t* table; + + vec = this->data_; + pMat = this->matrix_; + qubits = this->params_; + + pivot = qubits + nqubits; + table = pivot + matSize; + + idx = 0; + ii = i; + for(j=0;j> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + q = vec[offset_k+idx]; + + r += m*q; + } + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + vec[offset_j+idx] = r; + } + + //mult L + for(j=matSize-1;j>0;j--){ + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + r = vec[offset_j+idx]; + + for(k=0;k> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + q = vec[offset_k+idx]; + + r += m*q; + } + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + vec[offset_j+idx] = r; + } + + //swap results + if(nswap > 0){ + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + q = vec[offset_j+idx]; + k = pivot[table[0]]; + for(j=1;j> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + qt = vec[offset_j+idx]; + + offset_k = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + vec[offset_k+idx] = q; + q = qt; + k = pivot[table[j]]; + } + offset_k = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + vec[offset_k+idx] = q; + } + } + const char* name(void) + { + return "multNxN"; + } +}; + +template +class MatrixMult2x2Controlled : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + uint_t mask; + uint_t cmask; + uint_t offset; + int nqubits; +public: + MatrixMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) + { + int i; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + nqubits = qubits.size(); + + offset = 1ull << qubits[nqubits-1]; + mask = (1ull << qubits[nqubits-1]) - 1; + cmask = 0; + for(i=0;i q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + if((i0 & cmask) == cmask){ + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = m0 * q0 + m2 * q1; + vec1[i0] = m1 * q0 + m3 * q1; + } + } + const char* name(void) + { + return "matrix_Cmult2x2"; + } +}; + +//------------------------------------------------------------------------------ +// Diagonal matrix multiplication +//------------------------------------------------------------------------------ +template +class DiagonalMult2x2 : public GateFuncBase +{ +protected: + thrust::complex m0,m1; + int qubit; +public: + + DiagonalMult2x2(const cvector_t& mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + thrust::complex m; + uint_t gid; + + vec = this->data_; + gid = this->base_index_; + + q = vec[i]; + if((((i + gid) >> qubit) & 1) == 0){ + m = m0; + } + else{ + m = m1; + } + + vec[i] = m * q; + } + const char* name(void) + { + return "diagonal_mult2x2"; + } +}; + +template +class DiagonalMult4x4 : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + int qubit0; + int qubit1; +public: + + DiagonalMult4x4(const cvector_t& mat,int q0,int q1) + { + qubit0 = q0; + qubit1 = q1; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + } + + bool is_diagonal(void) + { + return true; + } + int qubits_count(void) + { + return 2; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + thrust::complex m; + uint_t gid; + + vec = this->data_; + gid = this->base_index_; + + q = vec[i]; + if((((i+gid) >> qubit1) & 1) == 0){ + if((((i+gid) >> qubit0) & 1) == 0){ + m = m0; + } + else{ + m = m1; + } + } + else{ + if((((i+gid) >> qubit0) & 1) == 0){ + m = m2; + } + else{ + m = m3; + } + } + + vec[i] = m * q; + } + const char* name(void) + { + return "diagonal_mult4x4"; + } +}; + +template +class DiagonalMultNxN : public GateFuncBase +{ +protected: + int nqubits; +public: + DiagonalMultNxN(const reg_t &qb) + { + nqubits = qb.size(); + } + + bool is_diagonal(void) + { + return true; + } + int qubits_count(void) + { + return nqubits; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t j,im; + thrust::complex* vec; + thrust::complex q; + thrust::complex m; + thrust::complex* pMat; + uint_t* qubits; + uint_t gid; + + vec = this->data_; + gid = this->base_index_; + + pMat = this->matrix_; + qubits = this->params_; + + im = 0; + for(j=0;j> qubits[j]) & 1) != 0){ + im += (1 << j); + } + } + + q = vec[i]; + m = pMat[im]; + + vec[i] = m * q; + } + const char* name(void) + { + return "diagonal_multNxN"; + } +}; + +template +class DiagonalMult2x2Controlled : public GateFuncBase +{ +protected: + thrust::complex m0,m1; + uint_t mask; + uint_t cmask; + int nqubits; +public: + DiagonalMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + m0 = mat[0]; + m1 = mat[1]; + + mask = (1ull << qubits[nqubits-1]) - 1; + cmask = 0; + for(i=0;i* vec; + thrust::complex q0; + thrust::complex m; + + vec = this->data_; + gid = this->base_index_; + + if(((i + gid) & cmask) == cmask){ + if((i + gid) & mask){ + m = m1; + } + else{ + m = m0; + } + + q0 = vec[i]; + vec[i] = m*q0; + } + } + const char* name(void) + { + return "diagonal_Cmult2x2"; + } +}; + +//------------------------------------------------------------------------------ +// Permutation +//------------------------------------------------------------------------------ +template +class Permutation : public GateFuncBase +{ +protected: + uint_t nqubits; + uint_t npairs; + +public: + Permutation(const reg_t& qubits_sorted,const reg_t& qubits,const std::vector> &pairs,reg_t& params) + { + uint_t j,k; + uint_t offset0,offset1; + + nqubits = qubits.size(); + npairs = pairs.size(); + + params.resize(nqubits + npairs*2); + + for(j=0;j> k) & 1) != 0){ + offset0 += (1ull << qubits[k]); + } + if(((pairs[j].second >> k) & 1) != 0){ + offset1 += (1ull << qubits[k]); + } + } + params[nqubits + j*2 ] = offset0; + params[nqubits + j*2+1] = offset1; + } + } + int qubits_count(void) + { + return nqubits; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + uint_t j; + uint_t ii,idx,t; + uint_t* mask; + uint_t* pairs; + + vec = this->data_; + mask = this->params_; + pairs = mask + nqubits; + + idx = 0; + ii = i; + for(j=0;j +class CX_func : public GateFuncBase +{ +protected: + uint_t offset; + uint_t mask; + uint_t cmask; + int nqubits; + int qubit_t; +public: + + CX_func(const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + qubit_t = qubits[nqubits-1]; + offset = 1ull << qubit_t; + mask = offset - 1; + + cmask = 0; + for(i=0;i q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + if((i0 & cmask) == cmask){ + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = q1; + vec1[i0] = q0; + } + } + const char* name(void) + { + return "CX"; + } +}; + +//------------------------------------------------------------------------------ +// Y gate +//------------------------------------------------------------------------------ +template +class CY_func : public GateFuncBase +{ +protected: + uint_t mask; + uint_t cmask; + uint_t offset; + int nqubits; + int qubit_t; +public: + CY_func(const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + qubit_t = qubits[nqubits-1]; + offset = (1ull << qubit_t); + mask = (1ull << qubit_t) - 1; + + cmask = 0; + for(i=0;i q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + if((i0 & cmask) == cmask){ + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = thrust::complex(q1.imag(),-q1.real()); + vec1[i0] = thrust::complex(-q0.imag(),q0.real()); + } + } + const char* name(void) + { + return "CY"; + } +}; + +//------------------------------------------------------------------------------ +// Swap gate +//------------------------------------------------------------------------------ +template +class CSwap_func : public GateFuncBase +{ +protected: + uint_t mask0; + uint_t mask1; + uint_t cmask; + int nqubits; + int qubit_t0; + int qubit_t1; + uint_t offset1; + uint_t offset2; +public: + + CSwap_func(const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + if(qubits[nqubits-2] < qubits[nqubits-1]){ + qubit_t0 = qubits[nqubits-2]; + qubit_t1 = qubits[nqubits-1]; + } + else{ + qubit_t1 = qubits[nqubits-2]; + qubit_t0 = qubits[nqubits-1]; + } + mask0 = (1ull << qubit_t0) - 1; + mask1 = (1ull << qubit_t1) - 1; + + offset1 = 1ull << qubit_t0; + offset2 = 1ull << qubit_t1; + + cmask = 0; + for(i=0;i q1,q2; + thrust::complex* vec1; + thrust::complex* vec2; + + vec1 = this->data_; + + vec2 = vec1 + offset2; + vec1 = vec1 + offset1; + + i0 = i & mask0; + i2 = (i - i0) << 1; + i1 = i2 & mask1; + i2 = (i2 - i1) << 1; + + i0 = i0 + i1 + i2; + + if((i0 & cmask) == cmask){ + q1 = vec1[i0]; + q2 = vec2[i0]; + vec1[i0] = q2; + vec2[i0] = q1; + } + } + const char* name(void) + { + return "CSWAP"; + } +}; + +//swap operator between chunks +template +class CSwapChunk_func : public GateFuncBase +{ +protected: + uint_t mask; + thrust::complex* vec0; + thrust::complex* vec1; + bool write_back_; + bool swap_all_; +public: + + CSwapChunk_func(const reg_t &qubits,uint_t block_bits,thrust::complex* pVec0,thrust::complex* pVec1,bool wb) + { + int i; + int nqubits; + int qubit_t; + nqubits = qubits.size(); + + if(qubits[nqubits-2] < qubits[nqubits-1]){ + qubit_t = qubits[nqubits-2]; + } + else{ + qubit_t = qubits[nqubits-1]; + } + mask = (1ull << qubit_t) - 1; + + vec0 = pVec0; + vec1 = pVec1; + + write_back_ = wb; + if(qubit_t >= block_bits) + swap_all_ = true; + else + swap_all_ = false; + } + + bool batch_enable(void) + { + return false; + } + bool is_diagonal(void) + { + return swap_all_; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0,q1; + + i0 = i & mask; + i1 = (i - i0) << 1; + i0 += i1; + + q0 = vec0[i0]; + q1 = vec1[i0]; + vec0[i0] = q1; + if(write_back_) + vec1[i0] = q0; + } + const char* name(void) + { + return "Chunk SWAP"; + } +}; + + +//------------------------------------------------------------------------------ +// Phase gate +//------------------------------------------------------------------------------ +template +class phase_func : public GateFuncBase +{ +protected: + thrust::complex phase; + uint_t mask; + int nqubits; +public: + phase_func(const reg_t &qubits,thrust::complex p) + { + int i; + nqubits = qubits.size(); + phase = p; + + mask = 0; + for(i=0;i* vec; + thrust::complex q0; + + vec = this->data_; + gid = this->base_index_; + + if(((i+gid) & mask) == mask){ + q0 = vec[i]; + vec[i] = q0 * phase; + } + } + const char* name(void) + { + return "phase"; + } +}; + +//------------------------------------------------------------------------------ +// Norm functions +//------------------------------------------------------------------------------ +template +class norm_func : public GateFuncBase +{ +protected: +public: + norm_func(void) + { + + } + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + double d; + + vec = this->data_; + q = vec[i]; + d = (double)(q.real()*q.real() + q.imag()*q.imag()); + return d; + } + + const char* name(void) + { + return "norm"; + } +}; + +template +class trace_func : public GateFuncBase +{ +protected: + uint_t rows_; +public: + trace_func(uint_t nrow) + { + rows_ = nrow; + } + bool is_diagonal(void) + { + return true; + } + uint_t size(int num_qubits) + { + this->chunk_bits_ = num_qubits; + return rows_; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + + uint_t iChunk = (i / rows_); + uint_t lid = i - (iChunk * rows_); + uint_t idx = (iChunk << this->chunk_bits_) + lid*(rows_ + 1); + + vec = this->data_; + q = vec[idx]; + return q.real(); + } + + const char* name(void) + { + return "trace"; + } +}; + +template +class NormMatrixMultNxN : public GateFuncSumWithCache +{ +protected: +public: + NormMatrixMultNxN(uint_t nq) : GateFuncSumWithCache(nq) + { + ; + } + + __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + uint_t j; + thrust::complex q,r; + thrust::complex m; + uint_t mat_size,irow; + thrust::complex* vec; + thrust::complex* pMat; + + vec = this->data_; + pMat = this->matrix_; + + mat_size = 1ull << this->nqubits_; + irow = _tid & (mat_size - 1); + + r = 0.0; + for(j=0;j +class NormDiagonalMultNxN : public GateFuncBase +{ +protected: + int nqubits; +public: + NormDiagonalMultNxN(const reg_t &qb) + { + nqubits = qb.size(); + } + + bool is_diagonal(void) + { + return true; + } + int qubits_count(void) + { + return nqubits; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + uint_t im,j,gid; + thrust::complex q; + thrust::complex m,r; + thrust::complex* pMat; + thrust::complex* vec; + uint_t* qubits; + + vec = this->data_; + pMat = this->matrix_; + qubits = this->params_; + gid = this->base_index_; + + im = 0; + for(j=0;j +class NormMatrixMult2x2 : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + int qubit; + uint_t mask; + uint_t offset; +public: + NormMatrixMult2x2(const cvector_t &mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + + offset = 1ull << qubit; + mask = (1ull << qubit) - 1; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex* vec; + thrust::complex q0,q1; + thrust::complex r0,r1; + double sum = 0.0; + + vec = this->data_; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec[i0]; + q1 = vec[offset+i0]; + + r0 = m0 * q0 + m2 * q1; + sum += r0.real()*r0.real() + r0.imag()*r0.imag(); + r1 = m1 * q0 + m3 * q1; + sum += r1.real()*r1.real() + r1.imag()*r1.imag(); + return sum; + } + const char* name(void) + { + return "Norm_mult2x2"; + } +}; + +template +class NormDiagonalMult2x2 : public GateFuncBase +{ +protected: + thrust::complex m0,m1; + int qubit; +public: + NormDiagonalMult2x2(cvector_t &mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + uint_t gid; + thrust::complex* vec; + thrust::complex q; + thrust::complex m,r; + + vec = this->data_; + gid = this->base_index_; + + q = vec[i]; + if((((i+gid) >> qubit) & 1) == 0){ + m = m0; + } + else{ + m = m1; + } + + r = m * q; + + return (r.real()*r.real() + r.imag()*r.imag()); + } + const char* name(void) + { + return "Norm_diagonal_mult2x2"; + } +}; + +//------------------------------------------------------------------------------ +// Probabilities +//------------------------------------------------------------------------------ +template +class probability_func : public GateFuncBase +{ +protected: + uint_t mask; + uint_t cmask; +public: + probability_func(const reg_t &qubits,int i) + { + int k; + int nq = qubits.size(); + + mask = 0; + cmask = 0; + for(k=0;k> k) & 1) != 0){ + cmask |= (1ull << qubits[k]); + } + } + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + double ret; + + vec = this->data_; + + ret = 0.0; + + if((i & mask) == cmask){ + q = vec[i]; + ret = q.real()*q.real() + q.imag()*q.imag(); + } + return ret; + } + + const char* name(void) + { + return "probabilities"; + } +}; + +template +class probability_1qubit_func : public GateFuncBase +{ +protected: + uint_t offset; +public: + probability_1qubit_func(const uint_t qubit) + { + offset = 1ull << qubit; + } + + __host__ __device__ thrust::complex operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + thrust::complex ret; + double d0,d1; + + vec0 = this->data_; + vec1 = vec0 + offset; + + i1 = i & (offset - 1); + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec0[i0]; + q1 = vec1[i0]; + + d0 = (double)(q0.real()*q0.real() + q0.imag()*q0.imag()); + d1 = (double)(q1.real()*q1.real() + q1.imag()*q1.imag()); + + ret = thrust::complex(d0,d1); + return ret; + } + + const char* name(void) + { + return "probabilities_1qubit"; + } +}; + +//------------------------------------------------------------------------------ +// Expectation values +//------------------------------------------------------------------------------ +inline __host__ __device__ uint_t pop_count_kernel(uint_t val) +{ + uint_t count = val; + count = (count & 0x5555555555555555) + ((count >> 1) & 0x5555555555555555); + count = (count & 0x3333333333333333) + ((count >> 2) & 0x3333333333333333); + count = (count & 0x0f0f0f0f0f0f0f0f) + ((count >> 4) & 0x0f0f0f0f0f0f0f0f); + count = (count & 0x00ff00ff00ff00ff) + ((count >> 8) & 0x00ff00ff00ff00ff); + count = (count & 0x0000ffff0000ffff) + ((count >> 16) & 0x0000ffff0000ffff); + count = (count & 0x00000000ffffffff) + ((count >> 32) & 0x00000000ffffffff); + return count; +} + +//special case Z only +template +class expval_pauli_Z_func : public GateFuncBase +{ +protected: + uint_t z_mask_; + +public: + expval_pauli_Z_func(uint_t z) + { + z_mask_ = z; + } + + bool is_diagonal(void) + { + return true; + } + bool batch_enable(void) + { + return false; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + double ret = 0.0; + + vec = this->data_; + + q0 = vec[i]; + ret = q0.real()*q0.real() + q0.imag()*q0.imag(); + + if(z_mask_ != 0){ + if(pop_count_kernel(i & z_mask_) & 1) + ret = -ret; + } + + return ret; + } + const char* name(void) + { + return "expval_pauli_Z"; + } +}; + +template +class expval_pauli_XYZ_func : public GateFuncBase +{ +protected: + uint_t x_mask_; + uint_t z_mask_; + uint_t mask_l_; + uint_t mask_u_; + thrust::complex phase_; +public: + expval_pauli_XYZ_func(uint_t x,uint_t z,uint_t x_max,std::complex p) + { + x_mask_ = x; + z_mask_ = z; + phase_ = p; + + mask_u_ = ~((1ull << (x_max+1)) - 1); + mask_l_ = (1ull << x_max) - 1; + } + bool batch_enable(void) + { + return false; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + thrust::complex q0p; + thrust::complex q1p; + double d0,d1,ret = 0.0; + uint_t idx0,idx1; + + vec = this->data_; + + idx0 = ((i << 1) & mask_u_) | (i & mask_l_); + idx1 = idx0 ^ x_mask_; + + q0 = vec[idx0]; + q1 = vec[idx1]; + q0p = q1 * phase_; + q1p = q0 * phase_; + d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); + d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); + + if(z_mask_ != 0){ + if(pop_count_kernel(idx0 & z_mask_) & 1) + ret = -d0; + else + ret = d0; + if(pop_count_kernel(idx1 & z_mask_) & 1) + ret -= d1; + else + ret += d1; + } + else{ + ret = d0 + d1; + } + + return ret; + } + const char* name(void) + { + return "expval_pauli_XYZ"; + } +}; + +template +class expval_pauli_inter_chunk_func : public GateFuncBase +{ +protected: + uint_t x_mask_; + uint_t z_mask_; + thrust::complex phase_; + thrust::complex* pair_chunk_; + uint_t z_count_; + uint_t z_count_pair_; +public: + expval_pauli_inter_chunk_func(uint_t x,uint_t z,std::complex p,thrust::complex* pair_chunk,uint_t zc,uint_t zcp) + { + x_mask_ = x; + z_mask_ = z; + phase_ = p; + + pair_chunk_ = pair_chunk; + z_count_ = zc; + z_count_pair_ = zcp; + } + + bool is_diagonal(void) + { + return true; + } + bool batch_enable(void) + { + return false; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + thrust::complex q0p; + thrust::complex q1p; + double d0,d1,ret = 0.0; + uint_t ip; + + vec = this->data_; + + ip = i ^ x_mask_; + q0 = vec[i]; + q1 = pair_chunk_[ip]; + q0p = q1 * phase_; + q1p = q0 * phase_; + d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); + d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); + + if((pop_count_kernel(i & z_mask_) + z_count_) & 1) + ret = -d0; + else + ret = d0; + if((pop_count_kernel(ip & z_mask_) + z_count_pair_) & 1) + ret -= d1; + else + ret += d1; + + return ret; + } + const char* name(void) + { + return "expval_pauli_inter_chunk"; + } +}; + +//------------------------------------------------------------------------------ +// Pauli application +//------------------------------------------------------------------------------ +template +class multi_pauli_func : public GateFuncBase +{ +protected: + uint_t x_mask_; + uint_t z_mask_; + uint_t mask_l_; + uint_t mask_u_; + thrust::complex phase_; + uint_t nqubits_; +public: + multi_pauli_func(uint_t x,uint_t z,uint_t x_max,std::complex p) + { + x_mask_ = x; + z_mask_ = z; + phase_ = p; + + mask_u_ = ~((1ull << (x_max+1)) - 1); + mask_l_ = (1ull << x_max) - 1; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + uint_t idx0,idx1; + + vec = this->data_; + + idx0 = ((i << 1) & mask_u_) | (i & mask_l_); + idx1 = idx0 ^ x_mask_; + + q0 = vec[idx0]; + q1 = vec[idx1]; + + if(z_mask_ != 0){ + if(pop_count_kernel(idx0 & z_mask_) & 1) + q0 *= -1; + + if(pop_count_kernel(idx1 & z_mask_) & 1) + q1 *= -1; + } + vec[idx0] = q1 * phase_; + vec[idx1] = q0 * phase_; + } + const char* name(void) + { + return "multi_pauli"; + } +}; + +//special case Z only +template +class multi_pauli_Z_func : public GateFuncBase +{ +protected: + uint_t z_mask_; + thrust::complex phase_; +public: + multi_pauli_Z_func(uint_t z,std::complex p) + { + z_mask_ = z; + phase_ = p; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + + vec = this->data_; + + q0 = vec[i]; + + if(z_mask_ != 0){ + if(pop_count_kernel(i & z_mask_) & 1) + q0 = -q0; + } + vec[i] = q0 * phase_; + } + const char* name(void) + { + return "multi_pauli_Z"; + } +}; + + +//------------------------------------------------------------------------------ +} // end namespace Chunk +} // end namespace QV +} // end namespace AER +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +#endif // end module diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp index 79fad5745b..ee037cb5fb 100755 --- a/src/simulators/statevector/qubitvector.hpp +++ b/src/simulators/statevector/qubitvector.hpp @@ -41,6 +41,12 @@ namespace QV { template using cvector_t = std::vector>; template using cdict_t = std::map>; +enum class Rotation { + x, y, z, + xx, yy, zz, + zx, +}; + //============================================================================ // QubitVector class //============================================================================ @@ -159,6 +165,8 @@ class QubitVector { void set_max_matrix_bits(int_t bits){} + void synchronize(void){} + //----------------------------------------------------------------------- // Check point operations //----------------------------------------------------------------------- @@ -256,6 +264,9 @@ class QubitVector { // If N=3 this implements an optimized Fredkin gate void apply_mcswap(const reg_t &qubits); + //apply rotation around axis + void apply_rotation(const reg_t &qubits, const Rotation r, const double theta); + //swap between chunk void apply_chunk_swap(const reg_t &qubits, QubitVector &chunk, bool write_back = true); void apply_chunk_swap(const reg_t &qubits, uint_t remote_chunk_index); @@ -389,6 +400,11 @@ class QubitVector { // Get the qubit threshold for activating OpenMP. uint_t get_omp_threshold() {return omp_threshold_;} + //cuStateVec + void cuStateVec_enable(bool flg) + { + } + //----------------------------------------------------------------------- // Optimization configuration settings //----------------------------------------------------------------------- @@ -1576,6 +1592,37 @@ void QubitVector::apply_mcu(const reg_t &qubits, } // end switch } +template +void QubitVector::apply_rotation(const reg_t &qubits, const Rotation r, const double theta) +{ + switch(r){ + case Rotation::x: + apply_mcu(qubits, Linalg::VMatrix::rx(theta)); + break; + case Rotation::y: + apply_mcu(qubits, Linalg::VMatrix::ry(theta)); + break; + case Rotation::z: + apply_mcu(qubits, Linalg::VMatrix::rz(theta)); + break; + case Rotation::xx: + apply_matrix(qubits, Linalg::VMatrix::rxx(theta)); + break; + case Rotation::yy: + apply_matrix(qubits, Linalg::VMatrix::ryy(theta)); + break; + case Rotation::zz: + apply_diagonal_matrix(qubits, Linalg::VMatrix::rzz_diag(theta)); + break; + case Rotation::zx: + apply_matrix(qubits, Linalg::VMatrix::rzx(theta)); + break; + default: + throw std::invalid_argument( + "QubitVector::invalid rotation axis."); + } +} + template void QubitVector::apply_chunk_swap(const reg_t &qubits, QubitVector &src, bool write_back) { diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index fdbbbbe2b5..3c4ca7c334 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -162,6 +162,11 @@ class QubitVectorThrust { void set_max_matrix_bits(int_t bits); + void synchronize(void) + { + chunk_.synchronize(); + } + //----------------------------------------------------------------------- // Check point operations //----------------------------------------------------------------------- @@ -267,6 +272,9 @@ class QubitVectorThrust { // If N=3 this implements an optimized Fredkin gate void apply_mcswap(const reg_t &qubits); + //apply rotation around axis + void apply_rotation(const reg_t &qubits, const Rotation r, const double theta); + //swap between chunk void apply_chunk_swap(const reg_t &qubits, QubitVectorThrust &chunk, bool write_back = true); void apply_chunk_swap(const reg_t &qubits, uint_t remote_chunk_index); @@ -420,6 +428,12 @@ class QubitVectorThrust { // Get the qubit threshold for activating OpenMP. uint_t get_omp_threshold() {return omp_threshold_;} + //cuStateVec + void cuStateVec_enable(bool flg) + { + cuStateVec_enable_ = flg; + } + //----------------------------------------------------------------------- // Optimization configuration settings //----------------------------------------------------------------------- @@ -430,7 +444,6 @@ class QubitVectorThrust { // Get the sample_measure index size int get_sample_measure_index_size() {return sample_measure_index_size_;} - protected: //----------------------------------------------------------------------- @@ -439,11 +452,11 @@ class QubitVectorThrust { size_t num_qubits_; size_t data_size_; - mutable Chunk chunk_; - mutable Chunk buffer_chunk_; - mutable Chunk send_chunk_; - mutable Chunk recv_chunk_; - std::shared_ptr> chunk_manager_ = nullptr; + mutable Chunk::Chunk chunk_; + mutable Chunk::Chunk buffer_chunk_; + mutable Chunk::Chunk send_chunk_; + mutable Chunk::Chunk recv_chunk_; + std::shared_ptr> chunk_manager_ = nullptr; mutable thrust::host_vector> checkpoint_; @@ -451,6 +464,7 @@ class QubitVectorThrust { bool multi_chunk_distribution_; bool multi_shots_; bool enable_batch_; + bool cuStateVec_enable_ = false; bool register_blocking_; @@ -739,166 +753,15 @@ AER::Vector> QubitVectorThrust::move_to_vector() return AER::Vector>::copy_from_buffer(data_size_, &ret[0]); } + //------------------------------------------------------------------------------ // State initialize component //------------------------------------------------------------------------------ -template -class initialize_component_1qubit_func : public GateFuncBase -{ -protected: - thrust::complex s0,s1; - uint_t mask; - uint_t offset; -public: - initialize_component_1qubit_func(int qubit,thrust::complex state0,thrust::complex state1) - { - s0 = state0; - s1 = state1; - - mask = (1ull << qubit) - 1; - offset = 1ull << qubit; - } - - virtual __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - q0 = vec0[i0]; - - vec0[i0] = s0*q0; - vec1[i0] = s1*q0; - } - - const char* name(void) - { - return "initialize_component 1 qubit"; - } -}; - -template -class initialize_component_func : public GateFuncBase -{ -protected: - int nqubits; - uint_t matSize; -public: - initialize_component_func(const cvector_t& mat,const reg_t &qb) - { - nqubits = qb.size(); - matSize = 1ull << nqubits; - } - - int qubits_count(void) - { - return nqubits; - } - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q; - thrust::complex* state; - uint_t* qubits; - uint_t* qubits_sorted; - uint_t j,k; - uint_t ii,idx,t; - uint_t mask; - - //get parameters from iterator - vec = this->data_; - state = this->matrix_; - qubits = this->params_; - qubits_sorted = qubits + nqubits; - - idx = 0; - ii = i; - for(j=0;j> j) & 1) != 0) - ii += (1ull << qubits[j]); - } - q = q0 * state[k]; - vec[ii] = q; - } - } - - const char* name(void) - { - return "initialize_component"; - } -}; - -template -class initialize_large_component_func : public GateFuncBase -{ -protected: - int num_qubits_; - uint_t mask_; - uint_t cmask_; - thrust::complex init_; -public: - initialize_large_component_func(thrust::complex m,const reg_t& qubits,int i) - { - num_qubits_ = qubits.size(); - init_ = m; - - mask_ = 0; - cmask_ = 0; - for(int k=0;k> k) & 1) != 0){ - cmask_ |= (1ull << qubits[k]); - } - } - } - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q; - vec = this->data_; - if((i & mask_) == cmask_){ - q = vec[i]; - vec[i] = init_*q; - } - } - const char* name(void) - { - return "initialize_large_component"; - } -}; - template void QubitVectorThrust::initialize_component(const reg_t &qubits, const cvector_t &state0) { if(qubits.size() == 1){ - apply_function(initialize_component_1qubit_func(qubits[0],state0[0],state0[1]) ); + apply_function(Chunk::initialize_component_1qubit_func(qubits[0],state0[0],state0[1]) ); } else if(qubits.size() <= chunk_.container()->matrix_bits()){ auto qubits_sorted = qubits; @@ -912,14 +775,14 @@ void QubitVectorThrust::initialize_component(const reg_t &qubits, const // chunk_.StoreMatrix(state0); // chunk_.StoreUintParams(qubits_param); - apply_function(initialize_component_func(state0,qubits_sorted), state0, qubits_param ); + apply_function(Chunk::initialize_component_func(state0,qubits_sorted), state0, qubits_param ); } else{ //if initial state is larger that matrix buffer, set one by one. uint_t DIM = 1ull << qubits.size(); uint_t i; for(i=0;i(state0[i],qubits,i) ); + apply_function(Chunk::initialize_large_component_func(state0[i],qubits,i) ); } } } @@ -927,29 +790,6 @@ void QubitVectorThrust::initialize_component(const reg_t &qubits, const //------------------------------------------------------------------------------ // Utility //------------------------------------------------------------------------------ - -template -class ZeroClear : public GateFuncBase -{ -protected: -public: - ZeroClear() {} - bool is_diagonal(void) - { - return true; - } - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - vec = this->data_; - vec[i] = 0.0; - } - const char* name(void) - { - return "zero"; - } -}; - template void QubitVectorThrust::zero() { @@ -957,14 +797,13 @@ void QubitVectorThrust::zero() DebugMsg("zero"); #endif - apply_function(ZeroClear(), cvector_t(), reg_t()); + apply_function(Chunk::ZeroClear(), cvector_t(), reg_t()); #ifdef AER_DEBUG DebugMsg("zero done"); #endif } - template bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks) { @@ -987,8 +826,8 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t //only first chunk call allocation function if(chunk_bits > 0 && num_qubits > 0){ - chunk_manager_ = std::make_shared>(); - chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,max_matrix_bits_); + chunk_manager_ = std::make_shared>(); + chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,chunk_index_,max_matrix_bits_, cuStateVec_enable_); } multi_chunk_distribution_ = false; @@ -1020,6 +859,7 @@ bool QubitVectorThrust::chunk_setup(QubitVectorThrust& base,cons base.multi_shots_ = true; } } + cuStateVec_enable_ = base.cuStateVec_enable_; //set global chunk ID / shot ID chunk_index_ = chunk_index; @@ -1289,47 +1129,6 @@ bool QubitVectorThrust::enable_batch(bool flg) //------------------------------------------------------------------------------ // Initialization //------------------------------------------------------------------------------ - -template -class initialize_kernel : public GateFuncBase -{ -protected: - int num_qubits_state_; - uint_t offset_; - thrust::complex init_val_; -public: - initialize_kernel(thrust::complex v,int nqs,uint_t offset) - { - num_qubits_state_ = nqs; - offset_ = offset; - init_val_ = v; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - uint_t iChunk = (i >> num_qubits_state_); - - vec = this->data_; - - if(i == iChunk * offset_){ - vec[i] = init_val_; - } - else{ - vec[i] = 0.0; - } - } - const char* name(void) - { - return "initialize"; - } -}; - template void QubitVectorThrust::initialize() { @@ -1338,7 +1137,7 @@ void QubitVectorThrust::initialize() if(multi_chunk_distribution_){ if(chunk_index_ == 0){ - apply_function(initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->num_qubits()))); + apply_function(Chunk::initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->num_qubits()))); } else{ zero(); @@ -1346,7 +1145,7 @@ void QubitVectorThrust::initialize() chunk_.synchronize(); } else{ - apply_function(initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->chunk_bits()))); + apply_function(Chunk::initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->chunk_bits()))); } #ifdef AER_DEBUG @@ -1600,1035 +1399,81 @@ void QubitVectorThrust::set_json_chop_threshold(double threshold) { * MATRIX MULTIPLICATION * ******************************************************************************/ -template -class MatrixMult2x2 : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - int qubit; - uint_t mask; - uint_t offset0; - -public: - MatrixMult2x2(const cvector_t& mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - - mask = (1ull << qubit) - 1; - - offset0 = 1ull << qubit; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - vec1 = vec0 + offset0; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - q0 = vec0[i0]; - q1 = vec1[i0]; - vec0[i0] = m0 * q0 + m2 * q1; - vec1[i0] = m1 * q0 + m3 * q1; - } - const char* name(void) - { - return "mult2x2"; - } -}; +template +void QubitVectorThrust::apply_matrix(const reg_t &qubits, + const cvector_t &mat) +{ + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch + if(qubits.size() == 1 && register_blocking_) + chunk_.queue_blocked_gate('u',qubits[0],0,&mat[0]); + else + chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); +} template -class MatrixMult4x4 : public GateFuncBase +void QubitVectorThrust::apply_multiplexer(const reg_t &control_qubits, + const reg_t &target_qubits, + const cvector_t &mat) { -protected: - thrust::complex m00,m10,m20,m30; - thrust::complex m01,m11,m21,m31; - thrust::complex m02,m12,m22,m32; - thrust::complex m03,m13,m23,m33; - uint_t mask0; - uint_t mask1; - uint_t offset0; - uint_t offset1; + const size_t control_count = control_qubits.size(); + const size_t target_count = target_qubits.size(); + const uint_t DIM = 1ull << (target_count+control_count); + const uint_t columns = 1ull << target_count; + const uint_t blocks = 1ull << control_count; -public: - MatrixMult4x4(const cvector_t& mat,int qubit0,int qubit1) - { - m00 = mat[0]; - m01 = mat[1]; - m02 = mat[2]; - m03 = mat[3]; - - m10 = mat[4]; - m11 = mat[5]; - m12 = mat[6]; - m13 = mat[7]; - - m20 = mat[8]; - m21 = mat[9]; - m22 = mat[10]; - m23 = mat[11]; - - m30 = mat[12]; - m31 = mat[13]; - m32 = mat[14]; - m33 = mat[15]; - - offset0 = 1ull << qubit0; - offset1 = 1ull << qubit1; - if(qubit0 < qubit1){ - mask0 = offset0 - 1; - mask1 = offset1 - 1; - } - else{ - mask0 = offset1 - 1; - mask1 = offset0 - 1; - } - } + auto qubits = target_qubits; + for (const auto &q : control_qubits) {qubits.push_back(q);} + size_t N = qubits.size(); - int qubits_count(void) - { - return 2; + cvector_t matMP(DIM*DIM,0.0); + uint_t b,i,j; + + //make DIMxDIM matrix + for(b = 0; b < blocks; b++){ + for(i = 0; i < columns; i++){ + for(j = 0; j < columns; j++){ + matMP[(i+b*columns) + DIM*(b*columns+j)] += mat[i+b*columns + DIM * j]; + } + } } - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1,i2; - thrust::complex* vec0; - thrust::complex* vec1; - thrust::complex* vec2; - thrust::complex* vec3; - thrust::complex q0,q1,q2,q3; - vec0 = this->data_; +#ifdef AER_DEBUG + DebugMsg("apply_multiplexer",control_qubits); + DebugMsg(" ",target_qubits); +#endif - i0 = i & mask0; - i2 = (i - i0) << 1; - i1 = i2 & mask1; - i2 = (i2 - i1) << 1; + apply_matrix(qubits,matMP); +} - i0 = i0 + i1 + i2; - vec1 = vec0 + offset0; - vec2 = vec0 + offset1; - vec3 = vec2 + offset0; +template +void QubitVectorThrust::apply_diagonal_matrix(const reg_t &qubits, + const cvector_t &diag) +{ + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch - q0 = vec0[i0]; - q1 = vec1[i0]; - q2 = vec2[i0]; - q3 = vec3[i0]; + const int_t N = qubits.size(); + if(N == 1 && register_blocking_) + chunk_.queue_blocked_gate('d',qubits[0],0,&diag[0]); + else + chunk_.apply_diagonal_matrix(qubits,0,diag,chunk_.container()->num_chunks()); +} - vec0[i0] = m00 * q0 + m10 * q1 + m20 * q2 + m30 * q3; - vec1[i0] = m01 * q0 + m11 * q1 + m21 * q2 + m31 * q3; - vec2[i0] = m02 * q0 + m12 * q1 + m22 * q2 + m32 * q3; - vec3[i0] = m03 * q0 + m13 * q1 + m23 * q2 + m33 * q3; - } - const char* name(void) - { - return "mult4x4"; - } -}; template -class MatrixMult8x8 : public GateFuncBase +void QubitVectorThrust::apply_permutation_matrix(const reg_t& qubits, + const std::vector> &pairs) { -protected: - uint_t offset0; - uint_t offset1; - uint_t offset2; - uint_t mask0; - uint_t mask1; - uint_t mask2; + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch -public: - MatrixMult8x8(const reg_t &qubit,const reg_t &qubit_ordered) - { - offset0 = (1ull << qubit[0]); - offset1 = (1ull << qubit[1]); - offset2 = (1ull << qubit[2]); - - mask0 = (1ull << qubit_ordered[0]) - 1; - mask1 = (1ull << qubit_ordered[1]) - 1; - mask2 = (1ull << qubit_ordered[2]) - 1; - } - - int qubits_count(void) - { - return 3; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1,i2,i3; - thrust::complex* vec; - thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; - thrust::complex m0,m1,m2,m3,m4,m5,m6,m7; - thrust::complex* pMat; - - vec = this->data_; - pMat = this->matrix_; - - i0 = i & mask0; - i3 = (i - i0) << 1; - i1 = i3 & mask1; - i3 = (i3 - i1) << 1; - i2 = i3 & mask2; - i3 = (i3 - i2) << 1; - - i0 = i0 + i1 + i2 + i3; - - q0 = vec[i0]; - q1 = vec[i0 + offset0]; - q2 = vec[i0 + offset1]; - q3 = vec[i0 + offset1 + offset0]; - q4 = vec[i0 + offset2]; - q5 = vec[i0 + offset2 + offset0]; - q6 = vec[i0 + offset2 + offset1]; - q7 = vec[i0 + offset2 + offset1 + offset0]; - - m0 = pMat[0]; - m1 = pMat[8]; - m2 = pMat[16]; - m3 = pMat[24]; - m4 = pMat[32]; - m5 = pMat[40]; - m6 = pMat[48]; - m7 = pMat[56]; - - vec[i0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[1]; - m1 = pMat[9]; - m2 = pMat[17]; - m3 = pMat[25]; - m4 = pMat[33]; - m5 = pMat[41]; - m6 = pMat[49]; - m7 = pMat[57]; - - vec[i0 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[2]; - m1 = pMat[10]; - m2 = pMat[18]; - m3 = pMat[26]; - m4 = pMat[34]; - m5 = pMat[42]; - m6 = pMat[50]; - m7 = pMat[58]; - - vec[i0 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[3]; - m1 = pMat[11]; - m2 = pMat[19]; - m3 = pMat[27]; - m4 = pMat[35]; - m5 = pMat[43]; - m6 = pMat[51]; - m7 = pMat[59]; - - vec[i0 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[4]; - m1 = pMat[12]; - m2 = pMat[20]; - m3 = pMat[28]; - m4 = pMat[36]; - m5 = pMat[44]; - m6 = pMat[52]; - m7 = pMat[60]; - - vec[i0 + offset2] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[5]; - m1 = pMat[13]; - m2 = pMat[21]; - m3 = pMat[29]; - m4 = pMat[37]; - m5 = pMat[45]; - m6 = pMat[53]; - m7 = pMat[61]; - - vec[i0 + offset2 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[6]; - m1 = pMat[14]; - m2 = pMat[22]; - m3 = pMat[30]; - m4 = pMat[38]; - m5 = pMat[46]; - m6 = pMat[54]; - m7 = pMat[62]; - - vec[i0 + offset2 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[7]; - m1 = pMat[15]; - m2 = pMat[23]; - m3 = pMat[31]; - m4 = pMat[39]; - m5 = pMat[47]; - m6 = pMat[55]; - m7 = pMat[63]; - - vec[i0 + offset2 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - } - const char* name(void) - { - return "mult8x8"; - } -}; - -template -class MatrixMult16x16 : public GateFuncBase -{ -protected: - uint_t offset0; - uint_t offset1; - uint_t offset2; - uint_t offset3; - uint_t mask0; - uint_t mask1; - uint_t mask2; - uint_t mask3; -public: - MatrixMult16x16(const reg_t &qubit,const reg_t &qubit_ordered) - { - offset0 = (1ull << qubit[0]); - offset1 = (1ull << qubit[1]); - offset2 = (1ull << qubit[2]); - offset3 = (1ull << qubit[3]); - - mask0 = (1ull << qubit_ordered[0]) - 1; - mask1 = (1ull << qubit_ordered[1]) - 1; - mask2 = (1ull << qubit_ordered[2]) - 1; - mask3 = (1ull << qubit_ordered[3]) - 1; - } - - int qubits_count(void) - { - return 4; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1,i2,i3,i4,offset,f0,f1,f2; - thrust::complex* vec; - thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; - thrust::complex q8,q9,q10,q11,q12,q13,q14,q15; - thrust::complex r; - thrust::complex* pMat; - int j; - - vec = this->data_; - pMat = this->matrix_; - - i0 = i & mask0; - i4 = (i - i0) << 1; - i1 = i4 & mask1; - i4 = (i4 - i1) << 1; - i2 = i4 & mask2; - i4 = (i4 - i2) << 1; - i3 = i4 & mask3; - i4 = (i4 - i3) << 1; - - i0 = i0 + i1 + i2 + i3 + i4; - - q0 = vec[i0]; - q1 = vec[i0 + offset0]; - q2 = vec[i0 + offset1]; - q3 = vec[i0 + offset1 + offset0]; - q4 = vec[i0 + offset2]; - q5 = vec[i0 + offset2 + offset0]; - q6 = vec[i0 + offset2 + offset1]; - q7 = vec[i0 + offset2 + offset1 + offset0]; - q8 = vec[i0 + offset3]; - q9 = vec[i0 + offset3 + offset0]; - q10 = vec[i0 + offset3 + offset1]; - q11 = vec[i0 + offset3 + offset1 + offset0]; - q12 = vec[i0 + offset3 + offset2]; - q13 = vec[i0 + offset3 + offset2 + offset0]; - q14 = vec[i0 + offset3 + offset2 + offset1]; - q15 = vec[i0 + offset3 + offset2 + offset1 + offset0]; - - offset = 0; - f0 = 0; - f1 = 0; - f2 = 0; - for(j=0;j<16;j++){ - r = pMat[0+j]*q0; - r += pMat[16+j]*q1; - r += pMat[32+j]*q2; - r += pMat[48+j]*q3; - r += pMat[64+j]*q4; - r += pMat[80+j]*q5; - r += pMat[96+j]*q6; - r += pMat[112+j]*q7; - r += pMat[128+j]*q8; - r += pMat[144+j]*q9; - r += pMat[160+j]*q10; - r += pMat[176+j]*q11; - r += pMat[192+j]*q12; - r += pMat[208+j]*q13; - r += pMat[224+j]*q14; - r += pMat[240+j]*q15; - - offset = offset3 * (((uint_t)j >> 3) & 1) + - offset2 * (((uint_t)j >> 2) & 1) + - offset1 * (((uint_t)j >> 1) & 1) + - offset0 * ((uint_t)j & 1); - - vec[i0 + offset] = r; - } - } - const char* name(void) - { - return "mult16x16"; - } -}; - -template -class MatrixMultNxN : public GateFuncWithCache -{ -protected: -public: - MatrixMultNxN(uint_t nq) : GateFuncWithCache(nq) - { - ; - } - - __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - uint_t j,threadID; - thrust::complex q,r; - thrust::complex m; - uint_t mat_size,irow; - thrust::complex* vec; - thrust::complex* pMat; - - vec = this->data_; - pMat = this->matrix_; - - mat_size = 1ull << this->nqubits_; - irow = _tid & (mat_size - 1); - - r = 0.0; - for(j=0;j -class MatrixMultNxN_LU : public GateFuncBase -{ -protected: - int nqubits; - uint_t matSize; - int nswap; -public: - MatrixMultNxN_LU(const cvector_t& mat,const reg_t &qb,cvector_t& matLU,reg_t& params) - { - uint_t i,j,k,imax; - std::complex c0,c1; - double d,dmax; - uint_t* pSwap; - - nqubits = qb.size(); - matSize = 1ull << nqubits; - - matLU = mat; - params.resize(nqubits + matSize*2); - - for(k=0;k dmax){ - dmax = d; - imax = j; - } - } - if(imax != i){ - j = params[nqubits + imax]; - params[nqubits + imax] = params[nqubits + i]; - params[nqubits + i] = j; - } - - if(dmax != 0){ - c0 = matLU[(i << nqubits) + params[nqubits + i]]; - - for(j=i+1;j q,qt; - thrust::complex m; - thrust::complex r; - uint_t j,k,l,iq; - uint_t ii,idx,t; - uint_t mask,offset_j,offset_k; - thrust::complex* vec; - thrust::complex* pMat; - uint_t* qubits; - uint_t* pivot; - uint_t* table; - - vec = this->data_; - pMat = this->matrix_; - qubits = this->params_; - - pivot = qubits + nqubits; - table = pivot + matSize; - - idx = 0; - ii = i; - for(j=0;j> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - q = vec[offset_k+idx]; - - r += m*q; - } - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - vec[offset_j+idx] = r; - } - - //mult L - for(j=matSize-1;j>0;j--){ - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - r = vec[offset_j+idx]; - - for(k=0;k> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - q = vec[offset_k+idx]; - - r += m*q; - } - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - vec[offset_j+idx] = r; - } - - //swap results - if(nswap > 0){ - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - q = vec[offset_j+idx]; - k = pivot[table[0]]; - for(j=1;j> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - qt = vec[offset_j+idx]; - - offset_k = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - vec[offset_k+idx] = q; - q = qt; - k = pivot[table[j]]; - } - offset_k = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - vec[offset_k+idx] = q; - } - } - const char* name(void) - { - return "multNxN"; - } -}; - - - -template -void QubitVectorThrust::apply_matrix(const reg_t &qubits, - const cvector_t &mat) -{ - if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return; //first chunk execute all in batch - - const size_t N = qubits.size(); - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - - if(N == 1){ - if(register_blocking_){ - chunk_.queue_blocked_gate('u',qubits[0],0,&mat[0]); - } - else{ - apply_function(MatrixMult2x2(mat,qubits[0])); - } - } - else if(N == 2){ - apply_function(MatrixMult4x4(mat,qubits[0],qubits[1])); - } - else if(N <= 10){ - int i; - for(i=0;i(N), mat, qubits_sorted); - } - else{ - cvector_t matLU; - reg_t params; - MatrixMultNxN_LU f(mat,qubits_sorted,matLU,params); - -// chunk_.StoreMatrix(matLU); -// chunk_.StoreUintParams(params); - - apply_function(f, matLU, params); - } - -} - -template -void QubitVectorThrust::apply_multiplexer(const reg_t &control_qubits, - const reg_t &target_qubits, - const cvector_t &mat) -{ - const size_t control_count = control_qubits.size(); - const size_t target_count = target_qubits.size(); - const uint_t DIM = 1ull << (target_count+control_count); - const uint_t columns = 1ull << target_count; - const uint_t blocks = 1ull << control_count; - - auto qubits = target_qubits; - for (const auto &q : control_qubits) {qubits.push_back(q);} - size_t N = qubits.size(); - - cvector_t matMP(DIM*DIM,0.0); - uint_t b,i,j; - - //make DIMxDIM matrix - for(b = 0; b < blocks; b++){ - for(i = 0; i < columns; i++){ - for(j = 0; j < columns; j++){ - matMP[(i+b*columns) + DIM*(b*columns+j)] += mat[i+b*columns + DIM * j]; - } - } - } - -#ifdef AER_DEBUG - DebugMsg("apply_multiplexer",control_qubits); - DebugMsg(" ",target_qubits); -#endif - - apply_matrix(qubits,matMP); -} - -template -class DiagonalMult2x2 : public GateFuncBase -{ -protected: - thrust::complex m0,m1; - int qubit; -public: - - DiagonalMult2x2(const cvector_t& mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - thrust::complex m; - uint_t gid; - - vec = this->data_; - gid = this->base_index_; - - q = vec[i]; - if((((i + gid) >> qubit) & 1) == 0){ - m = m0; - } - else{ - m = m1; - } - - vec[i] = m * q; - } - const char* name(void) - { - return "diagonal_mult2x2"; - } -}; - -template -class DiagonalMult4x4 : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - int qubit0; - int qubit1; -public: - - DiagonalMult4x4(const cvector_t& mat,int q0,int q1) - { - qubit0 = q0; - qubit1 = q1; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - } - - bool is_diagonal(void) - { - return true; - } - int qubits_count(void) - { - return 2; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - thrust::complex m; - uint_t gid; - - vec = this->data_; - gid = this->base_index_; - - q = vec[i]; - if((((i+gid) >> qubit1) & 1) == 0){ - if((((i+gid) >> qubit0) & 1) == 0){ - m = m0; - } - else{ - m = m1; - } - } - else{ - if((((i+gid) >> qubit0) & 1) == 0){ - m = m2; - } - else{ - m = m3; - } - } - - vec[i] = m * q; - } - const char* name(void) - { - return "diagonal_mult4x4"; - } -}; - -template -class DiagonalMultNxN : public GateFuncBase -{ -protected: - int nqubits; -public: - DiagonalMultNxN(const reg_t &qb) - { - nqubits = qb.size(); - } - - bool is_diagonal(void) - { - return true; - } - int qubits_count(void) - { - return nqubits; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t j,im; - thrust::complex* vec; - thrust::complex q; - thrust::complex m; - thrust::complex* pMat; - uint_t* qubits; - uint_t gid; - - vec = this->data_; - gid = this->base_index_; - - pMat = this->matrix_; - qubits = this->params_; - - im = 0; - for(j=0;j> qubits[j]) & 1) != 0){ - im += (1 << j); - } - } - - q = vec[i]; - m = pMat[im]; - - vec[i] = m * q; - } - const char* name(void) - { - return "diagonal_multNxN"; - } -}; - -template -void QubitVectorThrust::apply_diagonal_matrix(const reg_t &qubits, - const cvector_t &diag) -{ - if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return; //first chunk execute all in batch - - const int_t N = qubits.size(); - - if(N == 1){ - if(register_blocking_){ - chunk_.queue_blocked_gate('d',qubits[0],0,&diag[0]); - } - else{ - apply_function(DiagonalMult2x2(diag,qubits[0])); - } - } - else if(N == 2){ - apply_function(DiagonalMult4x4(diag,qubits[0],qubits[1])); - } - else{ -// chunk_.StoreMatrix(diag); -// chunk_.StoreUintParams(qubits); - - apply_function(DiagonalMultNxN(qubits), diag, qubits); - } -} - - -template -class Permutation : public GateFuncBase -{ -protected: - uint_t nqubits; - uint_t npairs; - -public: - Permutation(const reg_t& qubits_sorted,const reg_t& qubits,const std::vector> &pairs,reg_t& params) - { - uint_t j,k; - uint_t offset0,offset1; - - nqubits = qubits.size(); - npairs = pairs.size(); - - params.resize(nqubits + npairs*2); - - for(j=0;j> k) & 1) != 0){ - offset0 += (1ull << qubits[k]); - } - if(((pairs[j].second >> k) & 1) != 0){ - offset1 += (1ull << qubits[k]); - } - } - params[nqubits + j*2 ] = offset0; - params[nqubits + j*2+1] = offset1; - } - } - int qubits_count(void) - { - return nqubits; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - uint_t j; - uint_t ii,idx,t; - uint_t* mask; - uint_t* pairs; - - vec = this->data_; - mask = this->params_; - pairs = mask + nqubits; - - idx = 0; - ii = i; - for(j=0;j -void QubitVectorThrust::apply_permutation_matrix(const reg_t& qubits, - const std::vector> &pairs) -{ - const size_t N = qubits.size(); - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - - reg_t params; - Permutation f(qubits_sorted,qubits,pairs,params); -// chunk_.StoreUintParams(params); - - apply_function(f, cvector_t(), params); -} + chunk_.apply_permutation(qubits,pairs,chunk_.container()->num_chunks()); +} /******************************************************************************* @@ -2640,70 +1485,6 @@ void QubitVectorThrust::apply_permutation_matrix(const reg_t& qubits, //------------------------------------------------------------------------------ // Multi-controlled gates //------------------------------------------------------------------------------ - -template -class CX_func : public GateFuncBase -{ -protected: - uint_t offset; - uint_t mask; - uint_t cmask; - int nqubits; - int qubit_t; -public: - - CX_func(const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - qubit_t = qubits[nqubits-1]; - offset = 1ull << qubit_t; - mask = offset - 1; - - cmask = 0; - for(i=0;i q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - if((i0 & cmask) == cmask){ - q0 = vec0[i0]; - q1 = vec1[i0]; - - vec0[i0] = q1; - vec1[i0] = q0; - } - } - const char* name(void) - { - return "CX"; - } -}; - template void QubitVectorThrust::apply_mcx(const reg_t &qubits) { @@ -2719,74 +1500,11 @@ void QubitVectorThrust::apply_mcx(const reg_t &qubits) chunk_.queue_blocked_gate('x',qubits[qubits.size()-1],mask); } else{ - apply_function(CX_func(qubits)); + chunk_.apply_X(qubits, chunk_.container()->num_chunks()); } } -template -class CY_func : public GateFuncBase -{ -protected: - uint_t mask; - uint_t cmask; - uint_t offset; - int nqubits; - int qubit_t; -public: - CY_func(const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - qubit_t = qubits[nqubits-1]; - offset = (1ull << qubit_t); - mask = (1ull << qubit_t) - 1; - - cmask = 0; - for(i=0;i q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - if((i0 & cmask) == cmask){ - q0 = vec0[i0]; - q1 = vec1[i0]; - - vec0[i0] = thrust::complex(q1.imag(),-q1.real()); - vec1[i0] = thrust::complex(-q0.imag(),q0.real()); - } - } - const char* name(void) - { - return "CY"; - } -}; - template void QubitVectorThrust::apply_mcy(const reg_t &qubits) { @@ -2802,163 +1520,18 @@ void QubitVectorThrust::apply_mcy(const reg_t &qubits) chunk_.queue_blocked_gate('y',qubits[qubits.size()-1],mask); } else{ - apply_function(CY_func(qubits)); + chunk_.apply_Y(qubits, chunk_.container()->num_chunks()); } } -template -class CSwap_func : public GateFuncBase -{ -protected: - uint_t mask0; - uint_t mask1; - uint_t cmask; - int nqubits; - int qubit_t0; - int qubit_t1; - uint_t offset1; - uint_t offset2; -public: - - CSwap_func(const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - if(qubits[nqubits-2] < qubits[nqubits-1]){ - qubit_t0 = qubits[nqubits-2]; - qubit_t1 = qubits[nqubits-1]; - } - else{ - qubit_t1 = qubits[nqubits-2]; - qubit_t0 = qubits[nqubits-1]; - } - mask0 = (1ull << qubit_t0) - 1; - mask1 = (1ull << qubit_t1) - 1; - - offset1 = 1ull << qubit_t0; - offset2 = 1ull << qubit_t1; - - cmask = 0; - for(i=0;i q1,q2; - thrust::complex* vec1; - thrust::complex* vec2; - - vec1 = this->data_; - - vec2 = vec1 + offset2; - vec1 = vec1 + offset1; - - i0 = i & mask0; - i2 = (i - i0) << 1; - i1 = i2 & mask1; - i2 = (i2 - i1) << 1; - - i0 = i0 + i1 + i2; - - if((i0 & cmask) == cmask){ - q1 = vec1[i0]; - q2 = vec2[i0]; - vec1[i0] = q2; - vec2[i0] = q1; - } - } - const char* name(void) - { - return "CSWAP"; - } -}; - template void QubitVectorThrust::apply_mcswap(const reg_t &qubits) { - apply_function(CSwap_func(qubits)); -} - - -//swap operator between chunks -template -class CSwapChunk_func : public GateFuncBase -{ -protected: - uint_t mask; - thrust::complex* vec0; - thrust::complex* vec1; - bool write_back_; - bool swap_all_; -public: - - CSwapChunk_func(const reg_t &qubits,uint_t block_bits,thrust::complex* pVec0,thrust::complex* pVec1,bool wb) - { - int i; - int nqubits; - int qubit_t; - nqubits = qubits.size(); - - if(qubits[nqubits-2] < qubits[nqubits-1]){ - qubit_t = qubits[nqubits-2]; - } - else{ - qubit_t = qubits[nqubits-1]; - } - mask = (1ull << qubit_t) - 1; - - vec0 = pVec0; - vec1 = pVec1; - - write_back_ = wb; - if(qubit_t >= block_bits) - swap_all_ = true; - else - swap_all_ = false; - } - - bool batch_enable(void) - { - return false; - } - bool is_diagonal(void) - { - return swap_all_; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0,q1; - - i0 = i & mask; - i1 = (i - i0) << 1; - i0 += i1; + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch - q0 = vec0[i0]; - q1 = vec1[i0]; - vec0[i0] = q1; - if(write_back_) - vec1[i0] = q0; - } - const char* name(void) - { - return "Chunk SWAP"; - } -}; + chunk_.apply_swap(qubits,qubits.size()-2,chunk_.container()->num_chunks()); +} template void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, QubitVectorThrust &src, bool write_back) @@ -2976,7 +1549,7 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, QubitVecto thrust::complex* pChunk0; thrust::complex* pChunk1; - Chunk bufferChunk; + Chunk::Chunk bufferChunk; bool exec_on_src = false; if(chunk_.device() >= 0){ @@ -3020,13 +1593,13 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, QubitVecto } if(exec_on_src){ - src.apply_function(CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); + src.apply_function(Chunk::CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); src.chunk_.synchronize(); //should be synchronized here if(bufferChunk.is_mapped()) bufferChunk.CopyOut(chunk_); } else{ - apply_function(CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); + apply_function(Chunk::CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); chunk_.synchronize(); //should be synchronized here if(bufferChunk.is_mapped()) bufferChunk.CopyOut(src.chunk_); @@ -3058,7 +1631,7 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, uint_t rem else{ thrust::complex* pLocal; thrust::complex* pRemote; - Chunk buffer; + Chunk::Chunk buffer; #ifdef AER_DISABLE_GDR if(chunk_.device() >= 0){ //if there is no GPUDirectRDMA support, copy chunk from CPU @@ -3083,64 +1656,21 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, uint_t rem DebugMsg("chunk swap (process)",qubits); #endif - chunk_.Execute(CSwapChunk_func(qubits,num_qubits_,pLocal,pRemote,false),1); + chunk_.Execute(Chunk::CSwapChunk_func(qubits,num_qubits_,pLocal,pRemote,false),1); chunk_.synchronize(); //should be synchronized here - if(buffer.is_mapped()){ - chunk_manager_->UnmapBufferChunk(buffer); - } - } - - release_recv_buffer(); - -#ifdef AER_DISABLE_GDR - release_send_buffer(); -#endif -} - -template -class phase_func : public GateFuncBase -{ -protected: - thrust::complex phase; - uint_t mask; - int nqubits; -public: - phase_func(const reg_t &qubits,thrust::complex p) - { - int i; - nqubits = qubits.size(); - phase = p; - - mask = 0; - for(i=0;i* vec; - thrust::complex q0; - - vec = this->data_; - gid = this->base_index_; - - if(((i+gid) & mask) == mask){ - q0 = vec[i]; - vec[i] = q0 * phase; + if(buffer.is_mapped()){ + chunk_manager_->UnmapBufferChunk(buffer); } } - const char* name(void) - { - return "phase"; - } -}; + + release_recv_buffer(); + +#ifdef AER_DISABLE_GDR + release_send_buffer(); +#endif +} + template void QubitVectorThrust::apply_mcphase(const reg_t &qubits, const std::complex phase) @@ -3157,140 +1687,10 @@ void QubitVectorThrust::apply_mcphase(const reg_t &qubits, const std::co chunk_.queue_blocked_gate('p',qubits[qubits.size()-1],mask,&phase); } else{ - apply_function(phase_func(qubits,*(thrust::complex*)&phase) ); + chunk_.apply_phase(qubits,qubits.size()-1,phase,chunk_.container()->num_chunks()); } } -template -class DiagonalMult2x2Controlled : public GateFuncBase -{ -protected: - thrust::complex m0,m1; - uint_t mask; - uint_t cmask; - int nqubits; -public: - DiagonalMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - m0 = mat[0]; - m1 = mat[1]; - - mask = (1ull << qubits[nqubits-1]) - 1; - cmask = 0; - for(i=0;i* vec; - thrust::complex q0; - thrust::complex m; - - vec = this->data_; - gid = this->base_index_; - - if(((i + gid) & cmask) == cmask){ - if((i + gid) & mask){ - m = m1; - } - else{ - m = m0; - } - - q0 = vec[i]; - vec[i] = m*q0; - } - } - const char* name(void) - { - return "diagonal_Cmult2x2"; - } -}; - -template -class MatrixMult2x2Controlled : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - uint_t mask; - uint_t cmask; - uint_t offset; - int nqubits; -public: - MatrixMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) - { - int i; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - nqubits = qubits.size(); - - offset = 1ull << qubits[nqubits-1]; - mask = (1ull << qubits[nqubits-1]) - 1; - cmask = 0; - for(i=0;i q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - if((i0 & cmask) == cmask){ - q0 = vec0[i0]; - q1 = vec1[i0]; - - vec0[i0] = m0 * q0 + m2 * q1; - vec1[i0] = m1 * q0 + m3 * q1; - } - } - const char* name(void) - { - return "matrix_Cmult2x2"; - } -}; template void QubitVectorThrust::apply_mcu(const reg_t &qubits, @@ -3329,7 +1729,7 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, chunk_.queue_blocked_gate('d',qubits[qubits.size()-1],mask,&diag[0]); } else{ - apply_function(DiagonalMult2x2Controlled(diag,qubits) ); + chunk_.apply_diagonal_matrix(qubits,qubits.size()-1,diag,chunk_.container()->num_chunks()); } } } @@ -3349,12 +1749,20 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, chunk_.queue_blocked_gate('u',qubits[qubits.size()-1],mask,&mat[0]); } else{ - apply_function(MatrixMult2x2Controlled(mat,qubits) ); + chunk_.apply_matrix(qubits,qubits.size()-1,mat,chunk_.container()->num_chunks()); } } } } +template +void QubitVectorThrust::apply_rotation(const reg_t &qubits, const Rotation r, const double theta) +{ + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch + + chunk_.apply_rotation(qubits,r,theta,chunk_.container()->num_chunks()); +} //------------------------------------------------------------------------------ // Single-qubit matrices @@ -3377,7 +1785,8 @@ void QubitVectorThrust::apply_matrix(const uint_t qubit, chunk_.queue_blocked_gate('u',qubit,0,&mat[0]); } else{ - apply_function(MatrixMult2x2(mat,qubit)); + reg_t qubits = {qubit}; + chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); } } @@ -3393,7 +1802,7 @@ void QubitVectorThrust::apply_diagonal_matrix(const uint_t qubit, } else{ reg_t qubits = {qubit}; - apply_function(DiagonalMult2x2(diag,qubits[0])); + chunk_.apply_diagonal_matrix(qubits,0,diag,chunk_.container()->num_chunks()); } } @@ -3402,50 +1811,21 @@ void QubitVectorThrust::apply_diagonal_matrix(const uint_t qubit, * NORMS * ******************************************************************************/ -template -class norm_func : public GateFuncBase -{ -protected: -public: - norm_func(void) - { - - } - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - double d; - - vec = this->data_; - q = vec[i]; - d = (double)(q.real()*q.real() + q.imag()*q.imag()); - return d; - } - - const char* name(void) - { - return "norm"; - } -}; - template double QubitVectorThrust::norm() const { double ret; + uint_t count = 1; + #ifdef AER_THRUST_CUDA if((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_){ if(chunk_.pos() != 0) return 0.0; //first chunk execute all in batch + count = chunk_.container()->num_chunks(); } #endif - apply_function_sum(&ret,norm_func()); + ret = chunk_.norm(count); #ifdef AER_DEBUG DebugMsg("norm",ret); @@ -3454,48 +1834,6 @@ double QubitVectorThrust::norm() const return ret; } -template -class NormMatrixMultNxN : public GateFuncSumWithCache -{ -protected: -public: - NormMatrixMultNxN(uint_t nq) : GateFuncSumWithCache(nq) - { - ; - } - - __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - uint_t j; - thrust::complex q,r; - thrust::complex m; - uint_t mat_size,irow; - thrust::complex* vec; - thrust::complex* pMat; - - vec = this->data_; - pMat = this->matrix_; - - mat_size = 1ull << this->nqubits_; - irow = _tid & (mat_size - 1); - - r = 0.0; - for(j=0;j double QubitVectorThrust::norm(const reg_t &qubits, const cvector_t &mat) const @@ -3516,63 +1854,11 @@ double QubitVectorThrust::norm(const reg_t &qubits, const cvector_t(N)); + apply_function_sum(&ret,Chunk::NormMatrixMultNxN(N)); return ret; } } -template -class NormDiagonalMultNxN : public GateFuncBase -{ -protected: - int nqubits; -public: - NormDiagonalMultNxN(const reg_t &qb) - { - nqubits = qb.size(); - } - - bool is_diagonal(void) - { - return true; - } - int qubits_count(void) - { - return nqubits; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - uint_t im,j,gid; - thrust::complex q; - thrust::complex m,r; - thrust::complex* pMat; - thrust::complex* vec; - uint_t* qubits; - - vec = this->data_; - pMat = this->matrix_; - qubits = this->params_; - gid = this->base_index_; - - im = 0; - for(j=0;j double QubitVectorThrust::norm_diagonal(const reg_t &qubits, const cvector_t &mat) const { @@ -3587,7 +1873,7 @@ double QubitVectorThrust::norm_diagonal(const reg_t &qubits, const cvect chunk_.StoreUintParams(qubits); double ret; - apply_function_sum(&ret,NormDiagonalMultNxN(qubits) ); + apply_function_sum(&ret,Chunk::NormDiagonalMultNxN(qubits) ); return ret; } } @@ -3595,118 +1881,20 @@ double QubitVectorThrust::norm_diagonal(const reg_t &qubits, const cvect //------------------------------------------------------------------------------ // Single-qubit specialization //------------------------------------------------------------------------------ -template -class NormMatrixMult2x2 : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - int qubit; - uint_t mask; - uint_t offset; -public: - NormMatrixMult2x2(const cvector_t &mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - - offset = 1ull << qubit; - mask = (1ull << qubit) - 1; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex* vec; - thrust::complex q0,q1; - thrust::complex r0,r1; - double sum = 0.0; - - vec = this->data_; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - q0 = vec[i0]; - q1 = vec[offset+i0]; - - r0 = m0 * q0 + m2 * q1; - sum += r0.real()*r0.real() + r0.imag()*r0.imag(); - r1 = m1 * q0 + m3 * q1; - sum += r1.real()*r1.real() + r1.imag()*r1.imag(); - return sum; - } - const char* name(void) - { - return "Norm_mult2x2"; - } -}; - template double QubitVectorThrust::norm(const uint_t qubit, const cvector_t &mat) const { double ret; - apply_function_sum(&ret,NormMatrixMult2x2(mat,qubit)); - - return ret; -} - - -template -class NormDiagonalMult2x2 : public GateFuncBase -{ -protected: - thrust::complex m0,m1; - int qubit; -public: - NormDiagonalMult2x2(cvector_t &mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - uint_t gid; - thrust::complex* vec; - thrust::complex q; - thrust::complex m,r; - - vec = this->data_; - gid = this->base_index_; - - q = vec[i]; - if((((i+gid) >> qubit) & 1) == 0){ - m = m0; - } - else{ - m = m1; - } - - r = m * q; + apply_function_sum(&ret,Chunk::NormMatrixMult2x2(mat,qubit)); - return (r.real()*r.real() + r.imag()*r.imag()); - } - const char* name(void) - { - return "Norm_diagonal_mult2x2"; - } -}; + return ret; +} template double QubitVectorThrust::norm_diagonal(const uint_t qubit, const cvector_t &mat) const { double ret; - apply_function_sum(&ret,NormDiagonalMult2x2(mat,qubit)); + apply_function_sum(&ret,Chunk::NormDiagonalMult2x2(mat,qubit)); return ret; } @@ -3746,101 +1934,6 @@ std::vector QubitVectorThrust::probabilities() const { return probs; } - -template -class probability_func : public GateFuncBase -{ -protected: - uint_t mask; - uint_t cmask; -public: - probability_func(const reg_t &qubits,int i) - { - int k; - int nq = qubits.size(); - - mask = 0; - cmask = 0; - for(k=0;k> k) & 1) != 0){ - cmask |= (1ull << qubits[k]); - } - } - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - double ret; - - vec = this->data_; - - ret = 0.0; - - if((i & mask) == cmask){ - q = vec[i]; - ret = q.real()*q.real() + q.imag()*q.imag(); - } - return ret; - } - - const char* name(void) - { - return "probabilities"; - } -}; - -template -class probability_1qubit_func : public GateFuncBase -{ -protected: - uint_t offset; -public: - probability_1qubit_func(const uint_t qubit) - { - offset = 1ull << qubit; - } - - __host__ __device__ thrust::complex operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - thrust::complex ret; - double d0,d1; - - vec0 = this->data_; - vec1 = vec0 + offset; - - i1 = i & (offset - 1); - i0 = (i - i1) << 1; - i0 += i1; - - q0 = vec0[i0]; - q1 = vec1[i0]; - - d0 = (double)(q0.real()*q0.real() + q0.imag()*q0.imag()); - d1 = (double)(q1.real()*q1.real() + q1.imag()*q1.imag()); - - ret = thrust::complex(d0,d1); - return ret; - } - - const char* name(void) - { - return "probabilities_1qubit"; - } -}; - template std::vector QubitVectorThrust::probabilities(const reg_t &qubits) const { @@ -3848,25 +1941,7 @@ std::vector QubitVectorThrust::probabilities(const reg_t &qubits const int_t DIM = 1 << N; std::vector probs(DIM, 0.); - if(N == 1){ //special case for 1 qubit (optimized for measure) - apply_function_sum2(&probs[0],probability_1qubit_func(qubits[0])); - -#ifdef AER_DEBUG - DebugMsg("probabilities",probs); -#endif - return probs; - } - - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - if ((N == num_qubits_) && (qubits == qubits_sorted)) - return probabilities(); - - - int i; - for(i=0;i(qubits,i)); - } + chunk_.probabilities(probs, qubits); #ifdef AER_DEBUG DebugMsg("probabilities",probs); @@ -3881,7 +1956,7 @@ std::vector QubitVectorThrust::probabilities(const reg_t &qubits #define QV_RESET_TARGET_PROB 3 template -class reset_after_measure_func : public GateFuncBase +class reset_after_measure_func : public Chunk::GateFuncBase { protected: int num_qubits_; @@ -3933,7 +2008,7 @@ class reset_after_measure_func : public GateFuncBase }; template -class set_probability_buffer_for_reset_func : public GateFuncBase +class set_probability_buffer_for_reset_func : public Chunk::GateFuncBase { protected: uint_t reduce_buf_size_; @@ -3974,7 +2049,7 @@ class set_probability_buffer_for_reset_func : public GateFuncBase }; template -class check_measure_probability_func : public GateFuncBase +class check_measure_probability_func : public Chunk::GateFuncBase { protected: int num_qubits_; @@ -4116,7 +2191,7 @@ void QubitVectorThrust::apply_batched_measure(const reg_t& qubits,std::v chunk_.keep_conditional(true); //total probability - apply_function_sum(nullptr,norm_func(),true); + apply_function_sum(nullptr,Chunk::norm_func(),true); apply_function(set_probability_buffer_for_reset_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size()) ); @@ -4142,7 +2217,7 @@ void QubitVectorThrust::apply_batched_measure(const reg_t& qubits,std::v //loop for probability for(i=0;i(qubits,i),true); + apply_function_sum(nullptr,Chunk::probability_func(qubits,i),true); apply_function(check_measure_probability_func(qubits.size(),chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size(), @@ -4161,7 +2236,7 @@ void QubitVectorThrust::apply_batched_measure(const reg_t& qubits,std::v } template -class reset_func : public GateFuncBase +class reset_func : public Chunk::GateFuncBase { protected: int num_qubits_; @@ -4254,7 +2329,7 @@ void QubitVectorThrust::apply_batched_reset(const reg_t& qubits,std::vec chunk_.keep_conditional(true); //total probability - apply_function_sum(nullptr,norm_func(),true); + apply_function_sum(nullptr,Chunk::norm_func(),true); apply_function(set_probability_buffer_for_reset_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size()) ); @@ -4268,7 +2343,7 @@ void QubitVectorThrust::apply_batched_reset(const reg_t& qubits,std::vec chunk_.StoreUintParams(qubits); for(i=0;i(qubits,i),true); + apply_function_sum(nullptr,Chunk::probability_func(qubits,i),true); apply_function(check_measure_probability_func(qubits.size(),chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size(), @@ -4325,7 +2400,7 @@ void QubitVectorThrust::get_creg(ClassicalRegister& creg) } template -class set_creg_func : public GateFuncBase +class set_creg_func : public Chunk::GateFuncBase { protected: uint_t reg_set_; @@ -4375,7 +2450,7 @@ void QubitVectorThrust::store_cmemory(uint_t qubit,int val) } template -class set_batched_creg_func : public GateFuncBase +class set_batched_creg_func : public Chunk::GateFuncBase { protected: int_t reg_set_; @@ -4437,7 +2512,7 @@ int_t QubitVectorThrust::set_batched_system_conditional(int_t src_reg, r } template -class copy_creg_func : public GateFuncBase +class copy_creg_func : public Chunk::GateFuncBase { protected: uint_t reg_dest_; @@ -4492,9 +2567,11 @@ reg_t QubitVectorThrust::sample_measure(const std::vector &rnds) { uint_t count = 1; #ifdef AER_THRUST_CUDA - if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return reg_t(); //first chunk execute all in batch - count = chunk_.container()->num_chunks(); + if((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_){ + if(chunk_.pos() != 0) + return reg_t(); //first chunk execute all in batch + count = chunk_.container()->num_chunks(); + } #endif #ifdef AER_DEBUG @@ -4516,136 +2593,12 @@ reg_t QubitVectorThrust::sample_measure(const std::vector &rnds) * ******************************************************************************/ -inline __host__ __device__ uint_t pop_count_kernel(uint_t val) -{ - uint_t count = val; - count = (count & 0x5555555555555555) + ((count >> 1) & 0x5555555555555555); - count = (count & 0x3333333333333333) + ((count >> 2) & 0x3333333333333333); - count = (count & 0x0f0f0f0f0f0f0f0f) + ((count >> 4) & 0x0f0f0f0f0f0f0f0f); - count = (count & 0x00ff00ff00ff00ff) + ((count >> 8) & 0x00ff00ff00ff00ff); - count = (count & 0x0000ffff0000ffff) + ((count >> 16) & 0x0000ffff0000ffff); - count = (count & 0x00000000ffffffff) + ((count >> 32) & 0x00000000ffffffff); - return count; -} - -//special case Z only -template -class expval_pauli_Z_func : public GateFuncBase -{ -protected: - uint_t z_mask_; - -public: - expval_pauli_Z_func(uint_t z) - { - z_mask_ = z; - } - - bool is_diagonal(void) - { - return true; - } - bool batch_enable(void) - { - return false; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - double ret = 0.0; - - vec = this->data_; - - q0 = vec[i]; - ret = q0.real()*q0.real() + q0.imag()*q0.imag(); - - if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) - ret = -ret; - } - - return ret; - } - const char* name(void) - { - return "expval_pauli_Z"; - } -}; - -template -class expval_pauli_XYZ_func : public GateFuncBase -{ -protected: - uint_t x_mask_; - uint_t z_mask_; - uint_t mask_l_; - uint_t mask_u_; - thrust::complex phase_; -public: - expval_pauli_XYZ_func(uint_t x,uint_t z,uint_t x_max,std::complex p) - { - x_mask_ = x; - z_mask_ = z; - phase_ = p; - - mask_u_ = ~((1ull << (x_max+1)) - 1); - mask_l_ = (1ull << x_max) - 1; - } - bool batch_enable(void) - { - return false; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - thrust::complex q0p; - thrust::complex q1p; - double d0,d1,ret = 0.0; - uint_t idx0,idx1; - - vec = this->data_; - - idx0 = ((i << 1) & mask_u_) | (i & mask_l_); - idx1 = idx0 ^ x_mask_; - - q0 = vec[idx0]; - q1 = vec[idx1]; - q0p = q1 * phase_; - q1p = q0 * phase_; - d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); - d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); - - if(z_mask_ != 0){ - if(pop_count_kernel(idx0 & z_mask_) & 1) - ret = -d0; - else - ret = d0; - if(pop_count_kernel(idx1 & z_mask_) & 1) - ret -= d1; - else - ret += d1; - } - else{ - ret = d0 + d1; - } - - return ret; - } - const char* name(void) - { - return "expval_pauli_XYZ"; - } -}; - template double QubitVectorThrust::expval_pauli(const reg_t &qubits, const std::string &pauli,const complex_t initial_phase) const { + return chunk_.expval_pauli(qubits,pauli,initial_phase); + uint_t x_mask, z_mask, num_y, x_max; std::tie(x_mask, z_mask, num_y, x_max) = pauli_masks_and_phase(qubits, pauli); @@ -4657,7 +2610,7 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, double ret; // specialize x_max == 0 if(x_mask == 0) { - apply_function_sum(&ret, expval_pauli_Z_func(z_mask) ); + apply_function_sum(&ret, Chunk::expval_pauli_Z_func(z_mask) ); return ret; } @@ -4665,77 +2618,10 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, // This is (-1j) ** number of Y terms modulo 4 auto phase = std::complex(initial_phase); add_y_phase(num_y, phase); - apply_function_sum(&ret, expval_pauli_XYZ_func(x_mask, z_mask, x_max, phase) ); + apply_function_sum(&ret, Chunk::expval_pauli_XYZ_func(x_mask, z_mask, x_max, phase) ); return ret; } -template -class expval_pauli_inter_chunk_func : public GateFuncBase -{ -protected: - uint_t x_mask_; - uint_t z_mask_; - thrust::complex phase_; - thrust::complex* pair_chunk_; - uint_t z_count_; - uint_t z_count_pair_; -public: - expval_pauli_inter_chunk_func(uint_t x,uint_t z,std::complex p,thrust::complex* pair_chunk,uint_t zc,uint_t zcp) - { - x_mask_ = x; - z_mask_ = z; - phase_ = p; - - pair_chunk_ = pair_chunk; - z_count_ = zc; - z_count_pair_ = zcp; - } - - bool is_diagonal(void) - { - return true; - } - bool batch_enable(void) - { - return false; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - thrust::complex q0p; - thrust::complex q1p; - double d0,d1,ret = 0.0; - uint_t ip; - - vec = this->data_; - - ip = i ^ x_mask_; - q0 = vec[i]; - q1 = pair_chunk_[ip]; - q0p = q1 * phase_; - q1p = q0 * phase_; - d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); - d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); - - if((pop_count_kernel(i & z_mask_) + z_count_) & 1) - ret = -d0; - else - ret = d0; - if((pop_count_kernel(ip & z_mask_) + z_count_pair_) & 1) - ret -= d1; - else - ret += d1; - - return ret; - } - const char* name(void) - { - return "expval_pauli_inter_chunk"; - } -}; template double QubitVectorThrust::expval_pauli(const reg_t &qubits, @@ -4749,7 +2635,7 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, //get pointer to pairing chunk (copy if needed) double ret; thrust::complex* pair_ptr; - Chunk buffer; + Chunk::Chunk buffer; if(pair_chunk.data() == this->data()){ #ifdef AER_DISABLE_GDR @@ -4797,7 +2683,7 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, auto phase = std::complex(initial_phase); add_y_phase(num_y, phase); - apply_function_sum(&ret, expval_pauli_inter_chunk_func(x_mask, z_mask, phase, pair_ptr,z_count,z_count_pair) ); + apply_function_sum(&ret, Chunk::expval_pauli_inter_chunk_func(x_mask, z_mask, phase, pair_ptr,z_count,z_count_pair) ); if(buffer.is_mapped()){ chunk_manager_->UnmapBufferChunk(buffer); @@ -4820,97 +2706,6 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, * ******************************************************************************/ -template -class multi_pauli_func : public GateFuncBase -{ -protected: - uint_t x_mask_; - uint_t z_mask_; - uint_t mask_l_; - uint_t mask_u_; - thrust::complex phase_; - uint_t nqubits_; -public: - multi_pauli_func(uint_t x,uint_t z,uint_t x_max,std::complex p) - { - x_mask_ = x; - z_mask_ = z; - phase_ = p; - - mask_u_ = ~((1ull << (x_max+1)) - 1); - mask_l_ = (1ull << x_max) - 1; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - uint_t idx0,idx1; - - vec = this->data_; - - idx0 = ((i << 1) & mask_u_) | (i & mask_l_); - idx1 = idx0 ^ x_mask_; - - q0 = vec[idx0]; - q1 = vec[idx1]; - - if(z_mask_ != 0){ - if(pop_count_kernel(idx0 & z_mask_) & 1) - q0 *= -1; - - if(pop_count_kernel(idx1 & z_mask_) & 1) - q1 *= -1; - } - vec[idx0] = q1 * phase_; - vec[idx1] = q0 * phase_; - } - const char* name(void) - { - return "multi_pauli"; - } -}; - -//special case Z only -template -class multi_pauli_Z_func : public GateFuncBase -{ -protected: - uint_t z_mask_; - thrust::complex phase_; -public: - multi_pauli_Z_func(uint_t z,std::complex p) - { - z_mask_ = z; - phase_ = p; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - - vec = this->data_; - - q0 = vec[i]; - - if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) - q0 = -q0; - } - vec[i] = q0 * phase_; - } - const char* name(void) - { - return "multi_pauli_Z"; - } -}; template void QubitVectorThrust::apply_pauli(const reg_t &qubits, @@ -4928,16 +2723,16 @@ void QubitVectorThrust::apply_pauli(const reg_t &qubits, add_y_phase(num_y, phase); if(x_mask == 0){ - apply_function(multi_pauli_Z_func(z_mask, phase)); + apply_function(Chunk::multi_pauli_Z_func(z_mask, phase)); } else{ - apply_function(multi_pauli_func(x_mask, z_mask, x_max, phase) ); + apply_function(Chunk::multi_pauli_func(x_mask, z_mask, x_max, phase) ); } } //batched Pauli operation used for Pauli noise template -class batched_pauli_func : public GateFuncBase +class batched_pauli_func : public Chunk::GateFuncBase { protected: thrust::complex coeff_; @@ -4995,10 +2790,10 @@ class batched_pauli_func : public GateFuncBase phase = thrust::complex(-coeff_.imag(),coeff_.real()); if(z_mask_ != 0){ - if(pop_count_kernel(idx0 & z_mask_) & 1) + if(Chunk::pop_count_kernel(idx0 & z_mask_) & 1) q0 *= -1; - if(pop_count_kernel(idx1 & z_mask_) & 1) + if(Chunk::pop_count_kernel(idx1 & z_mask_) & 1) q1 *= -1; } if(x_mask_ == 0){ @@ -5023,7 +2818,7 @@ void QubitVectorThrust::apply_batched_pauli_ops(const std::vector::apply_batched_pauli_ops(const std::vector -class MatrixMult2x2_conditional : public GateFuncBase +class MatrixMult2x2_conditional : public Chunk::GateFuncBase { protected: thrust::complex m0,m1,m2,m3; @@ -5122,13 +2917,13 @@ class MatrixMult2x2_conditional : public GateFuncBase }; template -class MatrixMultNxN_conditional : public GateFuncWithCache +class MatrixMultNxN_conditional : public Chunk::GateFuncWithCache { protected: uint_t prob_buf_size_; double* probs_; public: - MatrixMultNxN_conditional(uint_t nq,double* probs,uint_t prob_size) : GateFuncWithCache(nq) + MatrixMultNxN_conditional(uint_t nq,double* probs,uint_t prob_size) : Chunk::GateFuncWithCache(nq) { probs_ = probs; prob_buf_size_ = prob_size; @@ -5169,7 +2964,7 @@ class MatrixMultNxN_conditional : public GateFuncWithCache }; template -class check_kraus_probability_func : public GateFuncBase +class check_kraus_probability_func : public Chunk::GateFuncBase { protected: uint_t reduce_buf_size_; @@ -5277,7 +3072,7 @@ void QubitVectorThrust::apply_batched_kraus(const reg_t &qubits, cvector_t vmat = Utils::vectorize_matrix(kmats[i]); chunk_.set_conditional(system_reg); - apply_function_sum(nullptr,NormMatrixMult2x2(vmat,qubits[0]),true); + apply_function_sum(nullptr,Chunk::NormMatrixMult2x2(vmat,qubits[0]),true); apply_function(check_kraus_probability_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size() ) ); @@ -5299,7 +3094,7 @@ void QubitVectorThrust::apply_batched_kraus(const reg_t &qubits, chunk_.set_conditional(system_reg); chunk_.StoreMatrix(Utils::vectorize_matrix(kmats[i])); - apply_function_sum(nullptr,NormMatrixMultNxN(N),true); + apply_function_sum(nullptr,Chunk::NormMatrixMultNxN(N),true); apply_function(check_kraus_probability_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size() ) ); @@ -5315,7 +3110,7 @@ void QubitVectorThrust::apply_batched_kraus(const reg_t &qubits, } template -class bfunc_kernel : public GateFuncBase +class bfunc_kernel : public Chunk::GateFuncBase { protected: uint_t bfunc_num_regs_; @@ -5441,7 +3236,7 @@ void QubitVectorThrust::apply_bfunc(const Operations::Op &op) } template -class roerror_kernel : public GateFuncBase +class roerror_kernel : public Chunk::GateFuncBase { protected: uint_t num_regs_; diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp index 5606be96e7..542d839fc0 100755 --- a/src/simulators/statevector/statevector_state.hpp +++ b/src/simulators/statevector/statevector_state.hpp @@ -919,49 +919,65 @@ double State::expval_pauli(const int_t iChunk, const reg_t &qubits, z_mask >>= BaseState::chunk_bits_; x_max -= BaseState::chunk_bits_; - const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); - const uint_t mask_l = (1ull << x_max) - 1; - -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && on_same_process) private(i) reduction(+:expval) - for(i=0;i iChunk){ //on this process uint_t z_count,z_count_pair; z_count = AER::Utils::popcount(iChunk & z_mask); z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); if(iProc == BaseState::distributed_rank_){ //pair is on the same process - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[pair_chunk - BaseState::global_chunk_index_],z_count,z_count_pair,phase); + expval = BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[pair_chunk - BaseState::global_chunk_index_],z_count,z_count_pair,phase); } else{ BaseState::recv_chunk(iChunk-BaseState::global_chunk_index_,pair_chunk); //refer receive buffer to calculate expectation value - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[iChunk-BaseState::global_chunk_index_],z_count,z_count_pair,phase); + expval = BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[iChunk-BaseState::global_chunk_index_],z_count,z_count_pair,phase); } } else if(iProc == BaseState::distributed_rank_){ //pair is on this process BaseState::send_chunk(iChunk-BaseState::global_chunk_index_,pair_chunk); } - } + return expval; + }; + expval += BaseState::apply_omp_parallel_reduction((BaseState::chunk_omp_parallel_ && on_same_process),0,BaseState::num_global_chunks_/2,apply_expval_pauli_chunk); } else{ //no exchange between chunks z_mask >>= BaseState::chunk_bits_; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) - for(i=0;i::apply_save_density_matrix(const int_t iChunk, const Oper } else{ double sum = 0.0; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:sum) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].checkpoint(); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].revert(true); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) apply_diagonal_matrix(iChunk, sub_qubits, vmat); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati exp_im += exp_tmp.imag(); } else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:exp_re,exp_im) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].revert(false); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_density_matrix(const int_t iChunk, const Operat reduced_state[0] = BaseState::qregs_[iChunk].norm(); else{ double sum = 0.0; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:sum) - for(int_t i=0;i::apply_gate(const int_t iChunk, const Operations::Op &op) BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); break; case Gates::mcrx: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::rx(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::x, std::real(op.params[0])); break; case Gates::mcry: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::ry(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::y, std::real(op.params[0])); break; case Gates::mcrz: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::rz(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::z, std::real(op.params[0])); break; case Gates::rxx: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::rxx(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::xx, std::real(op.params[0])); break; case Gates::ryy: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::ryy(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::yy, std::real(op.params[0])); break; case Gates::rzz: - BaseState::qregs_[iChunk].apply_diagonal_matrix(op.qubits, Linalg::VMatrix::rzz_diag(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::zz, std::real(op.params[0])); break; case Gates::rzx: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::rzx(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::zx, std::real(op.params[0])); break; case Gates::id: break; @@ -1664,47 +1729,90 @@ rvector_t State::measure_probs(const int_t iChunk, const reg_t &qubi BaseState::qubits_inout(qubits,qubits_in_chunk,qubits_out_chunk); + if(BaseState::chunk_omp_parallel_){ #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i,j,k) - for(i=0;i 0){ - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + for(i=0;i 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if(qubits_in_chunk.size() == qubits.size()){ - for(j=0;j> i_in) & 1) << k); - i_in++; - } - else{ - if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ - idx += 1ull << k; + else{ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ + idx += 1ull << k; + } } } - } #pragma omp atomic - sum[idx] += chunkSum[j]; + sum[idx] += chunkSum[j]; + } + } + } + else{ //there is no bit in chunk + auto nr = std::real(BaseState::qregs_[i].norm()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } } +#pragma omp atomic + sum[idx] += nr; } } - else{ //there is no bit in chunk - auto nr = std::real(BaseState::qregs_[i].norm()); - int idx = 0; - for(k=0;k> qubits_out_chunk[k]) & 1){ - idx += 1ull << k; + } + else{ + for(i=0;i 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ + idx += 1ull << k; + } + } + } + sum[idx] += chunkSum[j]; + } } } -#pragma omp atomic - sum[idx] += nr; + else{ //there is no bit in chunk + auto nr = std::real(BaseState::qregs_[i].norm()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } + sum[idx] += nr; + } } } @@ -1753,10 +1861,18 @@ void State::measure_reset_update(const int_t iChunk, const std::vect if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig::measure_reset_update(const int_t iChunk, const std::vect if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig State::sample_measure(const reg_t &qubits, else{ std::vector chunkSum(BaseState::qregs_.size()+1,0); double sum,localSum; + //calculate per chunk sum if(BaseState::chunk_omp_parallel_){ #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) @@ -1944,9 +2069,14 @@ void State::apply_initialize(const int_t iChunk, const reg_t &qubits BaseState::qubits_inout(qubits,qubits_in_chunk,qubits_out_chunk); if(qubits_out_chunk.size() == 0){ //no qubits outside of chunk -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::apply_initialize(const int_t iChunk, const reg_t &qubits perm[i] = 1.0; } -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i 0){ //then scatter outside chunk @@ -2008,9 +2144,14 @@ void State::apply_initialize(const int_t iChunk, const reg_t &qubits } //initialize by params -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::apply_kraus(const int_t iChunk, const reg_t &qubits, } else{ p = 0.0; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:p) - for(int_t i=0;i::apply_kraus(const int_t iChunk, const reg_t &qubits, if(!BaseState::multi_chunk_distribution_) apply_matrix(iChunk, qubits, vmat); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig::apply_kraus(const int_t iChunk, const reg_t &qubits, if(!BaseState::multi_chunk_distribution_) apply_matrix(iChunk, qubits, vmat); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig::initialize_qreg(uint_t num_qubits) } if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if(irow == icol) - BaseState::qregs_[iChunk].initialize(); - else - BaseState::qregs_[iChunk].zero(); + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + if(irow == icol) + BaseState::qregs_[iChunk].initialize(); + else + BaseState::qregs_[iChunk].zero(); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + if(irow == icol) + BaseState::qregs_[iChunk].initialize(); + else + BaseState::qregs_[iChunk].zero(); + } } } else{ @@ -441,21 +454,40 @@ void State::initialize_qreg(uint_t num_qubits, auto input = unitary.copy_to_matrix(); uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - uint_t icol = i >> (BaseState::chunk_bits_); - uint_t irow = i & mask; - uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; - tmp[i] = input[idx]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = input[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = input[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -489,21 +521,40 @@ void State::initialize_qreg(uint_t num_qubits, BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); } -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - uint_t icol = i >> (BaseState::chunk_bits_); - uint_t irow = i & mask; - uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; - tmp[i] = unitary[idx]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = unitary[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = unitary[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -673,7 +724,7 @@ void State::apply_matrix(const int_t iChunk, const reg_t &qubi template void State::apply_diagonal_matrix(const int_t iChunk, const reg_t &qubits, const cvector_t &diag) { - if(BaseState::thrust_optimization_){ + if(BaseState::thrust_optimization_ || !BaseState::multi_chunk_distribution_){ //GPU computes all chunks in one kernel, so pass qubits and diagonal matrix as is reg_t qubits_chunk = qubits; for(uint_t i;i void State::apply_global_phase() { if (BaseState::has_global_phase_) { -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::set_num_qubits(size_t num_qubits) { BaseVector::set_num_qubits(2 * num_qubits); } -template -class trace_func : public GateFuncBase -{ -protected: - uint_t rows_; -public: - trace_func(uint_t nrow) - { - rows_ = nrow; - } - bool is_diagonal(void) - { - return true; - } - uint_t size(int num_qubits) - { - this->chunk_bits_ = num_qubits; - return rows_; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - - uint_t iChunk = (i / rows_); - uint_t lid = i - (iChunk * rows_); - uint_t idx = (iChunk << this->chunk_bits_) + lid*(rows_ + 1); - - vec = this->data_; - q = vec[idx]; - return q.real(); - } - - const char* name(void) - { - return "trace"; - } -}; template std::complex UnitaryMatrixThrust::trace() const { thrust::complex sum; - double ret; - BaseVector::apply_function_sum(&ret,trace_func(rows_),false); - sum = ret; + sum = BaseVector::chunk_.trace(rows_, 1); #ifdef AER_DEBUG BaseVector::DebugMsg("trace",sum); diff --git a/test/terra/backends/aer_simulator/test_options.py b/test/terra/backends/aer_simulator/test_options.py index cf5e31ab8a..e96f2a1719 100644 --- a/test/terra/backends/aer_simulator/test_options.py +++ b/test/terra/backends/aer_simulator/test_options.py @@ -91,7 +91,9 @@ def test_device_option(self, method, device): result = backend.run(qc).result() value = result.results[0].metadata.get('device', None) - self.assertEqual(value, device) + # device = 'GPU_cuStateVec' when cuStateVec is enabled + # so check if 'GPU' is included in value from result + self.assertTrue((value in device)) @data('automatic', 'statevector', 'density_matrix', 'stabilizer', 'matrix_product_state', 'extended_stabilizer') diff --git a/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py b/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py index 57c2422168..5f79d43f83 100644 --- a/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py +++ b/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py @@ -30,6 +30,9 @@ class TestQasmSimulator(SimulatorTestCase): def test_legacy_methods(self, method, device): """Test legacy device method options.""" backend = self.backend() + # GPU_cuStateVec is converted to GPU + if device == "GPU_cuStateVec": + device = "GPU" legacy_method = f"{method}_{device.lower()}" backend.set_options(method=legacy_method) self.assertEqual(backend.options.method, method) diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py index 331fb1fcf8..9f3fa91484 100644 --- a/test/terra/backends/simulator_test_case.py +++ b/test/terra/backends/simulator_test_case.py @@ -18,6 +18,10 @@ import itertools as it from qiskit.providers.aer import AerSimulator from test.terra.common import QiskitAerTestCase +from qiskit.circuit import QuantumCircuit +from qiskit.compiler import assemble +from qiskit.providers.aer.backends.backend_utils import cpp_execute +from qiskit.providers.aer.backends.controller_wrappers import aer_controller_execute class SimulatorTestCase(QiskitAerTestCase): @@ -30,7 +34,11 @@ def backend(self, **options): """Return AerSimulator backend using current class options""" sim_options = self.OPTIONS.copy() for key, val in options.items(): - sim_options[key] = val + if 'device' == key and 'cuStateVec' in val: + sim_options['device'] = 'GPU' + sim_options['cuStateVec_enable'] = True + else: + sim_options[key] = val return self.BACKEND(**sim_options) @@ -66,12 +74,39 @@ def _method_device(methods): if not methods: methods = AerSimulator().available_methods() available_devices = AerSimulator().available_devices() + #add special test device for cuStateVec if available + cuStateVec = check_cuStateVec(available_devices) + gpu_methods = ['statevector', 'density_matrix', 'unitary'] data_args = [] for method in methods: if method in gpu_methods: for device in available_devices: data_args.append((method, device)) + #add test cases for cuStateVec if available using special device = 'GPU_cuStateVec' + #'GPU_cuStateVec' is used only inside tests not available in Aer + #and this is converted to "device='GPU'" and option "cuStateVec_enalbe = True" is added + if cuStateVec: + data_args.append((method, 'GPU_cuStateVec')) else: data_args.append((method, 'CPU')) return data_args + +def check_cuStateVec(devices): + """Return if the system supports cuStateVec or not""" + if 'GPU' in devices: + dummy_circ = QuantumCircuit(1) + dummy_circ.i(0) + qobj = assemble(dummy_circ, + optimization_level=0, + shots=1, + method="statevector", + device="GPU", + cuStateVec_enable=True) + #run dummy circuit to check if Aer is built with cuStateVec + result = cpp_execute(aer_controller_execute(), qobj) + return result.get('success', False) + else: + return False + +