diff --git a/qiskit/providers/aer/backends/unitary_simulator.py b/qiskit/providers/aer/backends/unitary_simulator.py index 9e91c9bb31..b0353757c7 100644 --- a/qiskit/providers/aer/backends/unitary_simulator.py +++ b/qiskit/providers/aer/backends/unitary_simulator.py @@ -193,6 +193,8 @@ def _default_options(cls): fusion_verbose=False, fusion_max_qubit=5, fusion_threshold=14, + blocking_qubits=None, + blocking_enable=False, # statevector options statevector_parallel_threshold=14) diff --git a/releasenotes/notes/fix-gpu-memory-cf50e1def7072f38.yaml b/releasenotes/notes/fix-gpu-memory-cf50e1def7072f38.yaml new file mode 100644 index 0000000000..01ebe9e31f --- /dev/null +++ b/releasenotes/notes/fix-gpu-memory-cf50e1def7072f38.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixes a bug introduced in 0.8.0 where GPU simulations would allocate + unneeded host memory in addition to the GPU memory. diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index df379ecc93..9b34f6d6ce 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -64,6 +64,7 @@ #include "simulators/statevector/statevector_state_chunk.hpp" #include "simulators/superoperator/superoperator_state.hpp" #include "simulators/unitary/unitary_state.hpp" +#include "simulators/unitary/unitary_state_chunk.hpp" namespace AER { @@ -1399,33 +1400,67 @@ void Controller::run_circuit(const Circuit &circ, } case Method::unitary: { if (sim_device_ == Device::CPU) { - if (sim_precision_ == Precision::Double) { - // Double-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, Method::unitary, - false, result); - } else { - // Single-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, Method::unitary, - false, result); + if (multiple_chunk_required(circ, noise)) { + if (sim_precision_ == Precision::Double) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } + } + else{ + if (sim_precision_ == Precision::Double) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } } } else { #ifdef AER_THRUST_SUPPORTED - if (sim_precision_ == Precision::Double) { - // Double-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, Method::unitary, - false, result); - } else { - // Single-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, Method::unitary, - false, result); + if (multiple_chunk_required(circ, noise)) { + if (sim_precision_ == Precision::Double) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } + } + else{ + if (sim_precision_ == Precision::Double) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } } #endif } diff --git a/src/controllers/statevector_controller.hpp b/src/controllers/statevector_controller.hpp index db5c9a9cfe..cc67fd08c2 100755 --- a/src/controllers/statevector_controller.hpp +++ b/src/controllers/statevector_controller.hpp @@ -356,6 +356,7 @@ void StatevectorController::run_circuit_helper( } Transpile::CacheBlocking cache_block_pass = transpile_cache_blocking(opt_circ,dummy_noise,config,(precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex),false); + cache_block_pass.set_save_state(true); cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); uint_t block_bits = 0; diff --git a/src/controllers/unitary_controller.hpp b/src/controllers/unitary_controller.hpp index 8b7d7126da..bdd1413634 100755 --- a/src/controllers/unitary_controller.hpp +++ b/src/controllers/unitary_controller.hpp @@ -17,6 +17,7 @@ #include "controller.hpp" #include "simulators/unitary/unitary_state.hpp" +#include "simulators/unitary/unitary_state_chunk.hpp" #include "transpile/fusion.hpp" namespace AER { @@ -196,30 +197,60 @@ void UnitaryController::run_circuit(const Circuit &circ, switch (method_) { case Method::automatic: case Method::unitary_cpu: { - if (precision_ == Precision::double_precision) { - // Double-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>(circ, noise, config, - shots, rng_seed, result); - } else { - // Single-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>(circ, noise, config, - shots, rng_seed, result); + if(Base::Controller::multiple_chunk_required(circ,noise)){ + if (precision_ == Precision::double_precision) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>(circ, noise, config, + shots, rng_seed, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>(circ, noise, config, + shots, rng_seed, result); + } + } + else{ + if (precision_ == Precision::double_precision) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>(circ, noise, config, + shots, rng_seed, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>(circ, noise, config, + shots, rng_seed, result); + } } } case Method::unitary_thrust_gpu: { #ifdef AER_THRUST_CUDA - if (precision_ == Precision::double_precision) { - // Double-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, result); - } else { - // Single-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, result); + if(Base::Controller::multiple_chunk_required(circ,noise)){ + if (precision_ == Precision::double_precision) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, result); + } + } + else{ + if (precision_ == Precision::double_precision) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, result); + } } #else throw std::runtime_error( @@ -309,6 +340,15 @@ void UnitaryController::run_circuit_helper( fusion_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); } + Transpile::CacheBlocking cache_block_pass = transpile_cache_blocking(opt_circ,dummy_noise,config,(precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex),true); + cache_block_pass.set_save_state(true); + cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); + + uint_t block_bits = 0; + if(cache_block_pass.enabled()) + block_bits = cache_block_pass.block_bits(); + state.allocate(Base::Controller::max_qubits_,block_bits); + // Run single shot collecting measure data or snapshots if (initial_unitary_.empty()) { diff --git a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp index 055fa08c1a..89aceb50d5 100644 --- a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp +++ b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp @@ -40,11 +40,11 @@ const Operations::OpSet StateOpSet( OpType::barrier, OpType::bfunc, OpType::roerror, OpType::matrix, OpType::diagonal_matrix, OpType::kraus, - OpType::superop, OpType::save_expval, + OpType::superop, OpType::set_statevec, + OpType::set_densmat, OpType::save_expval, OpType::save_expval_var, OpType::save_densmat, - OpType::save_state, OpType::save_probs, OpType::save_probs_ket, - OpType::save_amps_sq + OpType::save_amps_sq, OpType::save_state }, // Gates {"U", "CX", "u1", "u2", "u3", "u", "cx", "cy", "cz", @@ -116,7 +116,9 @@ class State : public Base::StateChunk { auto move_to_matrix(); auto copy_to_matrix(); protected: - auto apply_to_matrix(bool copy = false); + + template + void initialize_from_vector(const list_t &vec); //----------------------------------------------------------------------- // Apply instructions @@ -143,7 +145,7 @@ class State : public Base::StateChunk { RngEngine &rng); // Reset the specified qubits to the |0> state by tracing out qubits - void apply_reset(const reg_t &qubits); + void apply_reset(const int_t iChunk, const reg_t &qubits); // Apply a supported snapshot instruction // If the input is not in allowed_snapshots an exeption will be raised. @@ -166,6 +168,10 @@ class State : public Base::StateChunk { // Apply an N-qubit Pauli gate void apply_pauli(const reg_t &qubits, const std::string &pauli); + // apply phase + void apply_phase(const uint_t iChunk,const uint_t qubit, const complex_t phase); + void apply_phase(const uint_t iChunk,const reg_t& qubits, const complex_t phase); + //----------------------------------------------------------------------- // Save data instructions //----------------------------------------------------------------------- @@ -279,12 +285,22 @@ class State : public Base::StateChunk { return 2; } + virtual bool is_applied_to_each_chunk(const Operations::Op &op); }; //========================================================================= // Implementation: Base class method overrides //========================================================================= +template +bool State::is_applied_to_each_chunk(const Operations::Op &op) +{ + if(op.type == Operations::OpType::reset){ + return true; + } + return BaseState::is_applied_to_each_chunk(op); +} + //------------------------------------------------------------------------- // Initialization //------------------------------------------------------------------------- @@ -303,10 +319,13 @@ void State::initialize_qreg(uint_t num_qubits) } } else{ //multi-chunk distribution + for(i=0;inum_qubits_ == this->chunk_bits_){ BaseState::qregs_[i].initialize(); } @@ -335,6 +354,11 @@ void State::initialize_qreg(uint_t num_qubits, } } else{ //multi-chunk distribution + for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); //copy part of state for this chunk - uint_t i; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); uint_t irow = i >> (BaseState::chunk_bits_); tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; } - - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -374,6 +396,10 @@ void State::initialize_qreg(uint_t num_qubits, } } else{ //multi-chunk distribution + for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); //copy part of state for this chunk - uint_t i; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); uint_t irow = i >> (BaseState::chunk_bits_); tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; } - - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -413,6 +437,10 @@ void State::initialize_qreg(uint_t num_qubits, } } else{ //multi-chunk distribution + for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); //copy part of state for this chunk - uint_t i; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); uint_t irow = i >> (BaseState::chunk_bits_); tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; } - - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -446,13 +472,55 @@ void State::initialize_omp() } } +template +template +void State::initialize_from_vector(const list_t &vec) +{ + if((1ull << (BaseState::num_qubits_*2)) == vec.size()){ + BaseState::initialize_from_vector(vec); + } + else if((1ull << (BaseState::num_qubits_*2)) == vec.size() * vec.size()) { + int_t iChunk; + if(BaseState::chunk_bits_ == BaseState::num_qubits_){ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + list_t vec1(1ull << BaseState::chunk_bits_); + list_t vec2(1ull << BaseState::chunk_bits_); + + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + } + BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); + } + } + } + else { + throw std::runtime_error("DensityMatrixChunk::initialize input vector is incorrect length. Expected: " + + std::to_string((1ull << (BaseState::num_qubits_*2))) + " Received: " + + std::to_string(vec.size())); + } +} + + template auto State::move_to_matrix() { if(BaseState::num_global_chunks_ == 1){ return BaseState::qregs_[0].move_to_matrix(); } - return apply_to_matrix(false); + return BaseState::apply_to_matrix(false); } template @@ -461,86 +529,9 @@ auto State::copy_to_matrix() if(BaseState::num_global_chunks_ == 1){ return BaseState::qregs_[0].copy_to_matrix(); } - return apply_to_matrix(true); + return BaseState::apply_to_matrix(true); } -template -auto State::apply_to_matrix(bool copy) -{ - int_t iChunk; - uint_t size = 1ull << (BaseState::chunk_bits_*2); - uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; - uint_t num_threads = BaseState::qregs_[0].get_omp_threads(); - - auto matrix = BaseState::qregs_[0].copy_to_matrix(); - - if(BaseState::distributed_rank_ == 0){ - //TO DO check memory availability - matrix.resize(1ull << (BaseState::num_qubits_),1ull << (BaseState::num_qubits_)); - -#ifdef AER_MPI - auto recv = BaseState::qregs_[0].copy_to_matrix(); - //gather states from other processes - for(iChunk=BaseState::num_local_chunks_;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); -#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) - for(i=0;i> (BaseState::chunk_bits_); - uint_t icol = i & mask; - matrix(icol_chunk+icol,irow_chunk+irow) = recv(icol,irow); - } - } -#endif - - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - if(copy){ - auto tmp = BaseState::qregs_[iChunk].copy_to_matrix(); -#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) - for(i=0;i> (BaseState::chunk_bits_); - uint_t icol = i & mask; - matrix(icol_chunk+icol,irow_chunk+irow) = tmp(icol,irow); - } - } - else{ - auto tmp = BaseState::qregs_[iChunk].move_to_matrix(); -#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) - for(i=0;i> (BaseState::chunk_bits_); - uint_t icol = i & mask; - matrix(icol_chunk+icol,irow_chunk+irow) = tmp(icol,irow); - } - } - } - } - else{ -#ifdef AER_MPI - //send matrices to process 0 - for(iChunk=0;iChunk::apply_op(const int_t iChunk,const Operations::Op &op, case Operations::OpType::barrier: break; case Operations::OpType::reset: - apply_reset(op.qubits); + apply_reset(iChunk,op.qubits); break; case Operations::OpType::measure: apply_measure(op.qubits, op.memory, op.registers, rng); @@ -617,6 +608,15 @@ void State::apply_op(const int_t iChunk,const Operations::Op &op, case Operations::OpType::superop: BaseState::qregs_[iChunk].apply_superop_matrix(op.qubits, Utils::vectorize_matrix(op.mats[0])); break; + case OpType::kraus: + apply_kraus(op.qubits, op.mats); + break; + case OpType::set_statevec: + initialize_from_vector(op.params); + break; + case OpType::set_densmat: + BaseState::initialize_from_matrix(op.mats[0]); + break; case Operations::OpType::save_expval: case Operations::OpType::save_expval_var: BaseState::apply_save_expval(op, result); @@ -990,54 +990,65 @@ template cmatrix_t State::reduced_density_matrix_helper(const reg_t &qubits, const reg_t &qubits_sorted) { - // Get superoperator qubits - const reg_t squbits = BaseState::qregs_[0].superop_qubits(qubits); - const reg_t squbits_sorted = BaseState::qregs_[0].superop_qubits(qubits_sorted); - - // Get dimensions - const size_t N = qubits.size(); - const size_t DIM = 1ULL << N; - const size_t VDIM = 1ULL << (2 * N); - const size_t END = 1ULL << (BaseState::qregs_[0].num_qubits() - N); - const size_t SHIFT = END + 1; - - // TODO: If we are not going to apply any additional instructions after - // this function we could move the memory when constructing rather - // than copying int_t iChunk; - auto vmat = BaseState::qregs_[0].vector(); + uint_t size = 1ull << (BaseState::chunk_bits_*2); + uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; + uint_t num_threads = BaseState::qregs_[0].get_omp_threads(); //TO DO check memory availability - vmat.resize(BaseState::num_local_chunks_ << (BaseState::chunk_bits_*2)); - -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=1;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << BaseState::chunk_bits_; + uint_t icol_chunk = (iChunk & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << BaseState::chunk_bits_; + if(iChunk < BaseState::num_local_chunks_) + tmp = BaseState::qregs_[iChunk].copy_to_matrix(); #ifdef AER_MPI - BaseState::gather_state(vmat); + else + BaseState::recv_data(tmp.data(),size,0,iChunk); #endif - - cmatrix_t reduced_state(DIM, DIM, false); - { - // Fill matrix with first iteration - const auto inds = QV::indexes(squbits, squbits_sorted, 0); - for (int_t i = 0; i < VDIM; ++i) { - reduced_state[i] = complex_t(vmat[inds[i]]); +#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) + for(i=0;i> (BaseState::chunk_bits_)) + irow_chunk; + uint_t icol = (i & mask) + icol_chunk; + uint_t irow_out = 0; + uint_t icol_out = 0; + int j; + for(j=0;j> qubits[j]) & 1){ + irow &= ~(1ull << qubits[j]); + irow_out += (1ull << j); + } + if((icol >> qubits[j]) & 1){ + icol &= ~(1ull << qubits[j]); + icol_out += (1ull << j); + } + } + if(irow == icol){ //only diagonal base can be reduced + uint_t idx = ((irow_out) << qubits.size()) + icol_out; +#pragma omp critical + reduced_state[idx] += tmp[i]; + } + } } } - // Accumulate with remaning blocks - for (size_t k = 1; k < END; k++) { - const auto inds = QV::indexes(squbits, squbits_sorted, k * SHIFT); - for (int_t i = 0; i < VDIM; ++i) { - reduced_state[i] += complex_t(vmat[inds[i]]); + else{ +#ifdef AER_MPI + //send matrices to process 0 + for(iChunk=0;iChunk::apply_gate(const uint_t iChunk, const Operations::Op &op) std::real(op.params[1])); break; case DensityMatrix::Gates::u1: - BaseState::qregs_[iChunk].apply_phase(op.qubits[0], std::exp(complex_t(0., 1.) * op.params[0])); + apply_phase(iChunk,op.qubits[0], std::exp(complex_t(0., 1.) * op.params[0])); break; case DensityMatrix::Gates::cx: BaseState::qregs_[iChunk].apply_cnot(op.qubits[0], op.qubits[1]); @@ -1097,21 +1108,21 @@ void State::apply_gate(const uint_t iChunk, const Operations::Op &op) apply_gate_u3(iChunk, op.qubits[0], M_PI / 2., 0., M_PI); break; case DensityMatrix::Gates::s: - BaseState::qregs_[iChunk].apply_phase(op.qubits[0], complex_t(0., 1.)); + apply_phase(iChunk,op.qubits[0], complex_t(0., 1.)); break; case DensityMatrix::Gates::sdg: - BaseState::qregs_[iChunk].apply_phase(op.qubits[0], complex_t(0., -1.)); + apply_phase(iChunk,op.qubits[0], complex_t(0., -1.)); break; case DensityMatrix::Gates::sx: BaseState::qregs_[iChunk].apply_unitary_matrix(op.qubits, Linalg::VMatrix::SX); break; case DensityMatrix::Gates::t: { const double isqrt2{1. / std::sqrt(2)}; - BaseState::qregs_[iChunk].apply_phase(op.qubits[0], complex_t(isqrt2, isqrt2)); + apply_phase(iChunk,op.qubits[0], complex_t(isqrt2, isqrt2)); } break; case DensityMatrix::Gates::tdg: { const double isqrt2{1. / std::sqrt(2)}; - BaseState::qregs_[iChunk].apply_phase(op.qubits[0], complex_t(isqrt2, -isqrt2)); + apply_phase(iChunk,op.qubits[0], complex_t(isqrt2, -isqrt2)); } break; case DensityMatrix::Gates::swap: { BaseState::qregs_[iChunk].apply_swap(op.qubits[0], op.qubits[1]); @@ -1192,13 +1203,44 @@ void State::apply_diagonal_unitary_matrix(const int_t iChunk, const r } else{ reg_t qubits_in = qubits; + reg_t qubits_row = qubits; cvector_t diag_in = diag; + cvector_t diag_row = diag; BaseState::block_diagonal_matrix(iChunk,qubits_in,diag_in); - BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(qubits_in,diag_in); + + for(int_t i=0;i= BaseState::chunk_bits_) + qubits_row[i] = qubits[i] + BaseState::num_qubits_ - BaseState::chunk_bits_; + } + BaseState::block_diagonal_matrix(iChunk,qubits_row,diag_row); + + reg_t qubits_chunk(qubits_in.size()*2); + for(int_t i=0;i +void State::apply_phase(const uint_t iChunk,const uint_t qubit, const complex_t phase) +{ + cvector_t diag(2); + diag[0] = 1.0; + diag[1] = phase; + apply_diagonal_unitary_matrix(iChunk,reg_t({qubit}), diag); +} + +template +void State::apply_phase(const uint_t iChunk,const reg_t& qubits, const complex_t phase) +{ + cvector_t diag((1 << qubits.size()),1.0); + diag[(1 << qubits.size()) - 1] = phase; + apply_diagonal_unitary_matrix(iChunk,qubits, diag); +} + //========================================================================= // Implementation: Reset and Measurement Sampling //========================================================================= @@ -1399,18 +1441,14 @@ std::vector State::sample_measure(const reg_t &qubits, template -void State::apply_reset(const reg_t &qubits) +void State::apply_reset(const int_t iChunk,const reg_t &qubits) { // TODO: This can be more efficient by adding reset // to base class rather than doing a matrix multiplication // where all but 1 row is zeros. const auto reset_op = Linalg::SMatrix::reset(1ULL << qubits.size()); - int_t i; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i diff --git a/src/simulators/density_matrix/densitymatrix_thrust.hpp b/src/simulators/density_matrix/densitymatrix_thrust.hpp index 645b108344..4a7730bdb0 100755 --- a/src/simulators/density_matrix/densitymatrix_thrust.hpp +++ b/src/simulators/density_matrix/densitymatrix_thrust.hpp @@ -317,6 +317,73 @@ class DensityDiagMatMult2x2 : public GateFuncBase } }; +template +class DensityDiagMatMultNxN : public GateFuncBase +{ +protected: + int nqubits_; + int total_bits_; + int chunk_bits_; +public: + DensityDiagMatMultNxN(const reg_t &qb,int total,int chunk) + { + nqubits_ = qb.size(); + total_bits_ = total; + chunk_bits_ = chunk; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t j,imr,imc; + thrust::complex* vec; + thrust::complex q; + thrust::complex* pMat; + uint_t* qubits; + uint_t irow,icol,gid,local_mask; + uint_t irow_chunk,icol_chunk; + + vec = this->data_; + gid = this->base_index_; + + irow_chunk = ((gid + i) >> (chunk_bits_*2)) >> (total_bits_ - chunk_bits_); + icol_chunk = ((gid + i) >> (chunk_bits_*2)) & ((1ull << (total_bits_ - chunk_bits_)-1)); + + local_mask = (1ull << (chunk_bits_*2)) - 1; + irow = (i & local_mask) >> chunk_bits_; + icol = (i & local_mask) & ((1ull << chunk_bits_)-1); + + irow += (irow_chunk << chunk_bits_); + icol += (icol_chunk << chunk_bits_); + + pMat = this->matrix_; + qubits = this->params_; + + imr = 0; + imc = 0; + for(j=0;j> qubits[j]) & 1) != 0){ + imr += (1 << j); + } + if(((icol >> qubits[j]) & 1) != 0){ + imc += (1 << j); + } + } + + q = vec[i]; + vec[i] = thrust::conj(pMat[imr])*pMat[imc]*q; + } + const char* name(void) + { + return "DensityDiagMatMultNxN"; + } +}; + + template void DensityMatrixThrust::apply_unitary_matrix(const reg_t &qubits, const cvector_t &mat) @@ -340,6 +407,11 @@ template void DensityMatrixThrust::apply_diagonal_unitary_matrix(const reg_t &qubits, const cvector_t &diag) { + BaseVector::chunk_->StoreMatrix(diag); + BaseVector::chunk_->StoreUintParams(qubits); + BaseVector::apply_function(DensityDiagMatMultNxN(qubits, BaseVector::chunk_manager_.num_qubits()/2, num_qubits())); + + /* if(qubits.size() == 1){ const reg_t qubits_sp = {{qubits[0], qubits[0] + num_qubits()}}; BaseVector::apply_function(DensityDiagMatMult2x2(diag,qubits[0], num_qubits())); @@ -348,6 +420,7 @@ void DensityMatrixThrust::apply_diagonal_unitary_matrix(const reg_t &qub // Apply as single 2N-qubit matrix mult. apply_diagonal_superop_matrix(qubits, AER::Utils::tensor_product(AER::Utils::conjugate(diag), diag)); } + */ } //----------------------------------------------------------------------- @@ -439,6 +512,62 @@ void DensityMatrixThrust::apply_cnot(const uint_t qctrl, const uint_t qt #endif } +template +class DensityPhase : public GateFuncBase +{ +protected: + thrust::complex phase_; + int qubit_; + int total_bits_; + int chunk_bits_; +public: + DensityPhase(int qubit,thrust::complex* phase,int total,int chunk) + { + qubit_ = qubit; + phase_ = *phase; + total_bits_ = total; + chunk_bits_ = chunk; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q; + uint_t irow,icol,gid,local_mask; + uint_t irow_chunk,icol_chunk; + + vec = this->data_; + gid = this->base_index_; + + irow_chunk = ((gid + i) >> (chunk_bits_*2)) >> (total_bits_ - chunk_bits_); + icol_chunk = ((gid + i) >> (chunk_bits_*2)) & ((1ull << (total_bits_ - chunk_bits_)-1)); + + local_mask = (1ull << (chunk_bits_*2)) - 1; + irow = (i & local_mask) >> chunk_bits_; + icol = (i & local_mask) & ((1ull << chunk_bits_)-1); + + irow += (irow_chunk << chunk_bits_); + icol += (icol_chunk << chunk_bits_); + + q = vec[i]; + if((icol >> qubit_) & 1) + q = phase_*q; + if((irow >> qubit_) & 1) + q = thrust::conj(phase_)*q; + vec[i] = q; + } + const char* name(void) + { + return "DensityPhase"; + } +}; + + /* template class DensityPhase : public GateFuncBase { @@ -454,10 +583,11 @@ class DensityPhase : public GateFuncBase phase_ = *phase; } - int qubits_count(void) + bool is_diagonal(void) { - return 2; + return true; } + __host__ __device__ void operator()(const uint_t &i) const { uint_t i0,i1,i2; @@ -490,11 +620,13 @@ class DensityPhase : public GateFuncBase return "DensityPhase"; } }; + */ template void DensityMatrixThrust::apply_phase(const uint_t q,const complex_t &phase) { - BaseVector::apply_function(DensityPhase(q, num_qubits(), (thrust::complex*)&phase )); + BaseVector::apply_function(DensityPhase(q, (thrust::complex*)&phase, BaseVector::chunk_manager_.num_qubits()/2, num_qubits() )); +// BaseVector::apply_function(DensityPhase(q, num_qubits(), (thrust::complex*)&phase )); #ifdef AER_DEBUG BaseVector::DebugMsg(" density::apply_phase"); diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp index fc4f7924f3..f49f68256d 100644 --- a/src/simulators/state_chunk.hpp +++ b/src/simulators/state_chunk.hpp @@ -188,6 +188,15 @@ class StateChunk { const std::string &memory_hex, const std::string ®ister_hex); + //----------------------------------------------------------------------- + // Initialization + //----------------------------------------------------------------------- + template + void initialize_from_vector(const list_t &vec); + + template + void initialize_from_matrix(const list_t &mat); + //----------------------------------------------------------------------- // Save result data //----------------------------------------------------------------------- @@ -355,6 +364,12 @@ class StateChunk { // block diagonal matrix in chunk void block_diagonal_matrix(const int_t iChunk, reg_t &qubits, cvector_t &diag); + void qubits_inout(const reg_t& qubits, reg_t& qubits_in,reg_t& qubits_out) const; + + auto apply_to_matrix(bool copy = false); + + virtual bool is_applied_to_each_chunk(const Operations::Op &op); + // Set a global phase exp(1j * theta) for the state bool has_global_phase_ = false; complex_t global_phase_ = 1; @@ -520,6 +535,16 @@ void StateChunk::set_config(const json_t &config) JSON::get_value(block_bits_, "blocking_qubits", config); } +template +bool StateChunk::is_applied_to_each_chunk(const Operations::Op &op) +{ + if(op.type == Operations::OpType::gate || op.type == Operations::OpType::matrix || + op.type == Operations::OpType::diagonal_matrix || op.type == Operations::OpType::multiplexer || + op.type == Operations::OpType::superop){ + return true; + } + return false; +} template void StateChunk::apply_ops(const std::vector &ops, @@ -566,9 +591,7 @@ void StateChunk::apply_ops(const std::vector &ops, iOp = iOpEnd; } - else if(ops[iOp].type == Operations::OpType::gate || ops[iOp].type == Operations::OpType::matrix || - ops[iOp].type == Operations::OpType::diagonal_matrix || ops[iOp].type == Operations::OpType::multiplexer) - { + else if(is_applied_to_each_chunk(ops[iOp])){ #pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) for(iChunk=0;iChunk::block_diagonal_matrix(const int_t iChunk, reg_t &qubit } } +template +void StateChunk::qubits_inout(const reg_t& qubits, reg_t& qubits_in,reg_t& qubits_out) const +{ + int_t i; + qubits_in.clear(); + qubits_out.clear(); + for(i=0;i std::vector StateChunk::sample_measure(const reg_t &qubits, uint_t shots, @@ -646,6 +686,118 @@ void StateChunk::initialize_creg(uint_t num_memory, creg_.initialize(num_memory, num_register, memory_hex, register_hex); } +template +template +void StateChunk::initialize_from_vector(const list_t &vec) +{ + int_t iChunk; + if(chunk_bits_ == num_qubits_){ + for(iChunk=0;iChunk +template +void StateChunk::initialize_from_matrix(const list_t &mat) +{ + int_t iChunk; + if(chunk_bits_ == num_qubits_){ + for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); + uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ + uint_t icol = i & ((1ull << chunk_bits_)-1); + uint_t irow = i >> chunk_bits_; + tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + } + qregs_[iChunk].initialize_from_matrix(tmp); + } + } +} + +template +auto StateChunk::apply_to_matrix(bool copy) +{ + int_t iChunk; + uint_t size = 1ull << (chunk_bits_*qubit_scale()); + uint_t mask = (1ull << (chunk_bits_)) - 1; + uint_t num_threads = qregs_[0].get_omp_threads(); + + auto matrix = qregs_[0].copy_to_matrix(); + + if(distributed_rank_ == 0){ + //TO DO check memory availability + matrix.resize(1ull << (num_qubits_),1ull << (num_qubits_)); + + auto tmp = qregs_[0].copy_to_matrix(); + for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << chunk_bits_; + uint_t icol_chunk = (iChunk & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << chunk_bits_; + + if(iChunk < num_local_chunks_){ + if(copy) + tmp = qregs_[iChunk].copy_to_matrix(); + else + tmp = qregs_[iChunk].move_to_matrix(); + } +#ifdef AER_MPI + else + recv_data(tmp.data(),size,0,iChunk); +#endif +#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) + for(i=0;i> (chunk_bits_); + uint_t icol = i & mask; + uint_t idx = ((irow+irow_chunk) << (num_qubits_)) + icol_chunk + icol; + matrix[idx] = tmp[i]; + } + } + } + else{ +#ifdef AER_MPI + //send matrices to process 0 + for(iChunk=0;iChunk void StateChunk::save_creg(ExperimentResult &result, const std::string &key, @@ -1194,6 +1346,8 @@ void StateChunk::send_chunk(uint_t local_chunk_index, uint_t global_pai MPI_Isend(pSend,sizeSend,MPI_BYTE,iProc,local_chunk_index + global_chunk_index_,distributed_comm_,&reqSend); MPI_Wait(&reqSend,&st); + + qregs_[local_chunk_index].release_send_buffer(); #endif } diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index b024313ad2..6c4c27a479 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -487,6 +487,7 @@ class ChunkContainer : public std::enable_shared_from_this @@ -709,14 +710,48 @@ void ChunkContainer::allocate_chunks(void) buffers_.resize(num_buffers_); checkpoints_.resize(num_checkpoint_); - for(i=0;i>(this->shared_from_this(),i); + if(num_chunks_ > 0){ + chunks_.resize(num_chunks_); + for(i=0;i>(this->shared_from_this(),i); + } + } + if(num_buffers_ > 0){ + buffers_.resize(num_buffers_); + for(i=0;i>(this->shared_from_this(),num_chunks_+i); + } } - for(i=0;i>(this->shared_from_this(),num_chunks_+i); + if(num_checkpoint_ > 0){ + checkpoints_.resize(num_checkpoint_); + for(i=0;i>(this->shared_from_this(),num_chunks_+num_buffers_+i); + } } - for(i=0;i>(this->shared_from_this(),num_chunks_+num_buffers_+i); +} + +template +void ChunkContainer::deallocate_chunks(void) +{ + uint_t i; + + if(num_chunks_ > 0){ + for(i=0;i 0){ + for(i=0;i 0){ + for(i=0;i::ChunkManager() #endif - chunks_.resize(num_places_*2 + 1); + chunks_.resize(num_places_*2 + 1,nullptr); iplace_host_ = num_places_ ; @@ -173,7 +173,6 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks) char* str; bool multi_gpu = false; bool hybrid = false; - uint_t num_checkpoint,total_checkpoint = 0; bool multi_shot = false; //--- for test @@ -253,25 +252,13 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks) nc /= 2; } - num_checkpoint = nc; chunks_[iDev] = std::make_shared>(); - -#ifdef AER_THRUST_CUDA - size_t freeMem,totalMem; - cudaSetDevice(iDev); - cudaMemGetInfo(&freeMem,&totalMem); - if(freeMem <= ( ((uint_t)sizeof(thrust::complex) * (nc + num_buffers + num_checkpoint)) << chunk_bits_)){ - num_checkpoint = 0; - } -#endif - - total_checkpoint += num_checkpoint; - num_chunks_ += chunks_[iDev]->Allocate(iDev,chunk_bits,nc,num_buffers,num_checkpoint); + num_chunks_ += chunks_[iDev]->Allocate(iDev,chunk_bits,nc,num_buffers); } if(num_chunks_ < nchunks){ //rest of chunks are stored on host chunks_[num_places_] = std::make_shared>(); - chunks_[num_places_]->Allocate(-1,chunk_bits,nchunks-num_chunks_,AER_MAX_BUFFERS); + chunks_[num_places_]->Allocate(-1,chunk_bits,nchunks-num_chunks_,num_buffers); num_places_ += 1; num_chunks_ = nchunks; } @@ -279,7 +266,11 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks) //additional host buffer iplace_host_ = num_places_; chunks_[iplace_host_] = std::make_shared>(); +#ifdef AER_DISABLE_GDR chunks_[iplace_host_]->Allocate(-1,chunk_bits,0,AER_MAX_BUFFERS); +#else + chunks_[iplace_host_]->Allocate(-1,chunk_bits,0,0); +#endif } } @@ -292,9 +283,11 @@ void ChunkManager::Free(void) int i; for(i=0;iDeallocate(); - chunks_[i].reset(); + chunks_[i].reset(); + chunks_[i] = nullptr; + } } chunk_bits_ = 0; diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index 8fe2ba9250..4e55457a13 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -373,6 +373,7 @@ void DeviceChunkContainer::Deallocate(void) } stream_.clear(); #endif + ChunkContainer::deallocate_chunks(); } template diff --git a/src/simulators/statevector/chunk/host_chunk_container.hpp b/src/simulators/statevector/chunk/host_chunk_container.hpp index a6b32d1375..f00dee8195 100644 --- a/src/simulators/statevector/chunk/host_chunk_container.hpp +++ b/src/simulators/statevector/chunk/host_chunk_container.hpp @@ -131,12 +131,16 @@ uint_t HostChunkContainer::Allocate(int idev,int bits,uint_t chunks,uint ChunkContainer::num_buffers_ = buffers; ChunkContainer::num_checkpoint_ = checkpoint; ChunkContainer::num_chunks_ = nc; - data_.resize((nc + buffers + checkpoint) << bits); - matrix_.resize(nc + buffers); - params_.resize(nc + buffers); + if(nc + buffers + checkpoint > 0) + data_.resize((nc + buffers + checkpoint) << bits); + if(nc + buffers > 0){ + matrix_.resize(nc + buffers); + params_.resize(nc + buffers); + } //allocate chunk classes - ChunkContainer::allocate_chunks(); + if(nc + buffers + checkpoint > 0) + ChunkContainer::allocate_chunks(); return nc; } @@ -147,9 +151,12 @@ uint_t HostChunkContainer::Resize(uint_t chunks,uint_t buffers,uint_t ch uint_t i; if(chunks + buffers + checkpoint > this->num_chunks_ + this->num_buffers_ + this->num_checkpoint_){ - data_.resize((chunks + buffers + checkpoint) << this->chunk_bits_); - matrix_.resize(chunks + buffers); - params_.resize(chunks + buffers); + if(chunks + buffers + checkpoint > 0) + data_.resize((chunks + buffers + checkpoint) << this->chunk_bits_); + if(chunks + buffers > 0){ + matrix_.resize(chunks + buffers); + params_.resize(chunks + buffers); + } } this->num_chunks_ = chunks; @@ -157,7 +164,8 @@ uint_t HostChunkContainer::Resize(uint_t chunks,uint_t buffers,uint_t ch this->num_checkpoint_ = checkpoint; //allocate chunk classes - ChunkContainer::allocate_chunks(); + if(chunks + buffers + checkpoint > 0) + ChunkContainer::allocate_chunks(); return chunks + buffers + checkpoint; } @@ -171,6 +179,8 @@ void HostChunkContainer::Deallocate(void) matrix_.shrink_to_fit(); params_.clear(); params_.shrink_to_fit(); + + ChunkContainer::deallocate_chunks(); } diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp index f04d1d5de9..5451ba7e9a 100755 --- a/src/simulators/statevector/qubitvector.hpp +++ b/src/simulators/statevector/qubitvector.hpp @@ -150,6 +150,8 @@ class QubitVector { //prepare buffer for MPI send/recv std::complex* send_buffer(uint_t& size_in_byte); std::complex* recv_buffer(uint_t& size_in_byte); + void release_send_buffer(void) const; + void release_recv_buffer(void) const; //----------------------------------------------------------------------- // Check point operations @@ -364,7 +366,7 @@ class QubitVector { std::complex* checkpoint_; uint_t chunk_index_; //global chunk index - cvector_t recv_buffer_; //receive buffer for MPI + mutable cvector_t recv_buffer_; //receive buffer for MPI //----------------------------------------------------------------------- // Config settings @@ -897,6 +899,18 @@ std::complex* QubitVector::recv_buffer(uint_t& size_in_byte) return &recv_buffer_[0]; } +template +void QubitVector::release_send_buffer(void) const +{ + +} + +template +void QubitVector::release_recv_buffer(void) const +{ + recv_buffer_.clear(); +} + //------------------------------------------------------------------------------ // Initialization //------------------------------------------------------------------------------ @@ -1226,7 +1240,6 @@ void QubitVector::apply_permutation_matrix(const reg_t& qubits, } // end switch } - /******************************************************************************* * * APPLY OPTIMIZED GATES diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index c99cb24cc0..a756a5cf25 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -144,6 +144,9 @@ class QubitVectorThrust { thrust::complex* send_buffer(uint_t& size_in_byte); thrust::complex* recv_buffer(uint_t& size_in_byte); + void release_send_buffer(void) const; + void release_recv_buffer(void) const; + //----------------------------------------------------------------------- // Check point operations //----------------------------------------------------------------------- @@ -364,12 +367,13 @@ class QubitVectorThrust { mutable std::shared_ptr> chunk_; mutable std::shared_ptr> buffer_chunk_; std::shared_ptr> checkpoint_; - std::shared_ptr> send_chunk_; - std::shared_ptr> recv_chunk_; + mutable std::shared_ptr> send_chunk_; + mutable std::shared_ptr> recv_chunk_; static ChunkManager chunk_manager_; uint_t chunk_index_; bool multi_chunk_distribution_; + bool multi_shots_; bool register_blocking_; @@ -517,7 +521,10 @@ QubitVectorThrust::QubitVectorThrust(size_t num_qubits) : num_qubits_(0) chunk_ = nullptr; chunk_index_ = 0; multi_chunk_distribution_ = false; + multi_shots_ = false; checkpoint_ = nullptr; + recv_chunk_ = nullptr; + send_chunk_ = nullptr; #ifdef AER_DEBUG debug_count = 0; @@ -788,6 +795,28 @@ void QubitVectorThrust::initialize_component(const reg_t &qubits, const // Utility //------------------------------------------------------------------------------ +template +class ZeroClear : public GateFuncBase +{ +protected: +public: + ZeroClear() {} + bool is_diagonal(void) + { + return true; + } + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + vec = this->data_; + vec[i] = 0.0; + } + const char* name(void) + { + return "zero"; + } +}; + template void QubitVectorThrust::zero() { @@ -795,8 +824,7 @@ void QubitVectorThrust::zero() DebugMsg("zero"); #endif - chunk_->Zero(); -// chunk_->synchronize(); + apply_function(ZeroClear()); #ifdef AER_DEBUG DebugMsg("zero done"); @@ -818,6 +846,9 @@ void QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t if(chunk_bits < num_qubits){ multi_chunk_distribution_ = true; } + + if(omp_get_num_threads() > 1) + multi_shots_ = true; } template @@ -914,31 +945,14 @@ std::complex QubitVectorThrust::inner_product() const chunk_->set_device(); vec0 = (data_t*)chunk_->pointer(); + vec1 = (data_t*)checkpoint_->pointer(); #ifdef AER_THRUST_CUDA cudaStream_t strm = chunk_->stream(); - if(strm){ - if(chunk_->device() == checkpoint_->device()){ - vec1 = (data_t*)checkpoint_->pointer(); - - dot = thrust::inner_product(thrust::device,vec0,vec0 + data_size_*2,vec1,0.0); - } - else{ - std::shared_ptr> pBuffer = chunk_manager_.MapBufferChunk(chunk_->place()); - pBuffer->CopyIn(checkpoint_); - vec1 = (data_t*)pBuffer->pointer(); - - dot = thrust::inner_product(thrust::device,vec0,vec0 + data_size_*2,vec1,0.0); - chunk_manager_.UnmapBufferChunk(pBuffer); - } - } - else{ - vec1 = (data_t*)checkpoint_->pointer(); - + if(strm) + dot = thrust::inner_product(thrust::device,vec0,vec0 + data_size_*2,vec1,0.0); + else dot = thrust::inner_product(thrust::omp::par,vec0,vec0 + data_size_*2,vec1,0.0); - } #else - vec1 = (data_t*)checkpoint_->pointer(); - if(num_qubits_ > omp_threshold_ && omp_threads_ > 1) dot = thrust::inner_product(thrust::device,vec0,vec0 + data_size_*2,vec1,0.0); else @@ -1056,6 +1070,26 @@ thrust::complex* QubitVectorThrust::recv_buffer(uint_t& size_in_ return recv_chunk_->pointer(); } +template +void QubitVectorThrust::release_send_buffer(void) const +{ +#ifdef AER_DISABLE_GDR + if(send_chunk_){ + chunk_manager_.UnmapBufferChunk(send_chunk_); + send_chunk_ = nullptr; + } +#endif +} + +template +void QubitVectorThrust::release_recv_buffer(void) const +{ + if(recv_chunk_){ + chunk_manager_.UnmapBufferChunk(recv_chunk_); + recv_chunk_ = nullptr; + } +} + //------------------------------------------------------------------------------ // Initialization //------------------------------------------------------------------------------ @@ -1147,13 +1181,14 @@ void QubitVectorThrust::apply_function(Function func) const #endif - func.set_base_index(chunk_index_ << num_qubits_); if(func.batch_enable() && multi_chunk_distribution_ && chunk_->device() >= 0){ if(chunk_->pos() == 0){ //only first chunk on device calculates all the chunks + func.set_base_index(chunk_index_ << num_qubits_); chunk_->Execute(func,chunk_->container()->num_chunks()); } } else{ + func.set_base_index(chunk_index_ << num_qubits_); chunk_->Execute(func,1); } @@ -2700,14 +2735,10 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, uint_t rem } } - chunk_manager_.UnmapBufferChunk(recv_chunk_); -// recv_chunk_.reset(); + release_recv_buffer(); #ifdef AER_DISABLE_GDR - if(send_chunk_){ - chunk_manager_.UnmapBufferChunk(send_chunk_); -// send_chunk_.reset(); - } + release_send_buffer(); #endif } @@ -2730,15 +2761,6 @@ class phase_func : public GateFuncBase mask |= (1ull << qubits[i]); } } - int qubits_count(void) - { - return nqubits; - } - int num_control_bits(void) - { - return nqubits - 1; - } - bool is_diagonal(void) { return true; @@ -3711,6 +3733,10 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, chunk_manager_.UnmapBufferChunk(buffer); } + if(pair_chunk.data() == this->data()){ + release_recv_buffer(); + } + return ret; } diff --git a/src/simulators/statevector/statevector_state_chunk.hpp b/src/simulators/statevector/statevector_state_chunk.hpp index 673a4e10e4..bfa091b35a 100644 --- a/src/simulators/statevector/statevector_state_chunk.hpp +++ b/src/simulators/statevector/statevector_state_chunk.hpp @@ -44,12 +44,12 @@ const Operations::OpSet StateOpSet( OpType::bfunc, OpType::roerror, OpType::matrix, OpType::diagonal_matrix, OpType::multiplexer, OpType::kraus, - OpType::sim_op, OpType::save_expval, - OpType::save_expval_var, OpType::save_densmat, + OpType::sim_op, OpType::set_statevec, + OpType::save_expval, OpType::save_expval_var, OpType::save_probs, OpType::save_probs_ket, OpType::save_amps, OpType::save_amps_sq, - OpType::save_statevec, OpType::save_state, - OpType::save_statevec_dict + OpType::save_state, OpType::save_statevec, + OpType::save_statevec_dict, OpType::save_densmat }, // Gates {"u1", "u2", "u3", "u", "U", "CX", "cx", "cz", @@ -157,6 +157,8 @@ class State : public Base::StateChunk { // /psi> is given in params void apply_initialize(const reg_t &qubits, const cvector_t ¶ms, RngEngine &rng); + void initialize_from_vector(const cvector_t ¶ms); + // Apply a supported snapshot instruction // If the input is not in allowed_snapshots an exeption will be raised. virtual void apply_snapshot(const Operations::Op &op, ExperimentResult &result, bool last_op = false); @@ -293,6 +295,15 @@ class State : public Base::StateChunk { const double phi, const double lambda); + void apply_gate_mcphase(const int_t iChunk, const reg_t& qubits, const complex_t phase); + + //------------------------------------------------------------------------- + // data access + //------------------------------------------------------------------------- + complex_t get_state(const uint_t idx); + void set_state(const uint_t idx,const complex_t s); + + //----------------------------------------------------------------------- // Config Settings //----------------------------------------------------------------------- @@ -332,10 +343,13 @@ void State::initialize_qreg(uint_t num_qubits) } } else{ //multi-chunk distribution + for(i=0;inum_qubits_ == this->chunk_bits_){ BaseState::qregs_[i].initialize(); } @@ -366,9 +380,12 @@ void State::initialize_qreg(uint_t num_qubits, else{ //multi-chunk distribution uint_t local_offset = BaseState::global_chunk_index_ << BaseState::chunk_bits_; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, initialize_omp(); - if(BaseState::chunk_bits_ == BaseState::num_qubits_){ - for(int_t iChunk=0;iChunk::copy_to_vector() } } +template +complex_t State::get_state(const uint_t idx) +{ + complex_t ret = 0.0; + if(BaseState::num_global_chunks_ == 1){ + return BaseState::qregs_[0].get_state(idx); + } + else{ + uint_t iChunk = idx >> BaseState::chunk_bits_; + if(BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= iChunk && BaseState::chunk_index_end_[BaseState::distributed_rank_] > iChunk){ //on this process + ret = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].get_state(idx - (iChunk << BaseState::chunk_bits_)); + } +#ifdef AER_MPI + BaseState::reduce_sum(ret); +#endif + } + return ret; +} + +template +void State::set_state(const uint_t idx,const complex_t s) +{ + if(BaseState::num_global_chunks_ == 1){ + BaseState::qregs_[0][idx] = s; + } + else{ + uint_t iChunk = idx >> BaseState::chunk_bits_; + if(BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= iChunk && BaseState::chunk_index_end_[BaseState::distributed_rank_] > iChunk){ //on this process + BaseState::qregs_[iChunk - BaseState::global_chunk_index_][idx - (iChunk << BaseState::chunk_bits_)] = s; + } + } +} + //========================================================================= // Implementation: apply operations //========================================================================= @@ -582,6 +618,9 @@ void State::apply_op(const int_t iChunk,const Operations::Op &op, case Operations::OpType::kraus: apply_kraus(op.qubits, op.mats, rng); break; + case OpType::set_statevec: + initialize_from_vector(op.params); + break; case Operations::OpType::save_expval: case Operations::OpType::save_expval_var: BaseState::apply_save_expval(op, result); @@ -872,9 +911,8 @@ void State::apply_snapshot(const Operations::Op &op, result.legacy_data.add_pershot_snapshot("statevector", op.string_params[0], move_to_vector()); } else { - //also using move_to_vector(actually no move) result.legacy_data.add_pershot_snapshot("statevector", op.string_params[0], - move_to_vector()); + copy_to_vector()); } break; case Statevector::Snapshots::cmemory: @@ -1181,7 +1219,7 @@ void State::apply_gate(const uint_t iChunk, const Operations::Op &op break; case Statevector::Gates::mcz: // Includes Z, CZ, CCZ, etc - BaseState::qregs_[iChunk].apply_mcphase(op.qubits, -1); + apply_gate_mcphase(iChunk,op.qubits, -1); break; case Statevector::Gates::mcr: BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); @@ -1246,8 +1284,7 @@ void State::apply_gate(const uint_t iChunk, const Operations::Op &op break; case Statevector::Gates::mcp: // Includes u1, cu1, p, cp, mcp etc - BaseState::qregs_[iChunk].apply_mcphase(op.qubits, - std::exp(complex_t(0, 1) * op.params[0])); + apply_gate_mcphase(iChunk,op.qubits,std::exp(complex_t(0, 1) * op.params[0])); break; case Statevector::Gates::mcsx: // Includes sx, csx, mcsx etc @@ -1327,6 +1364,21 @@ void State::apply_gate_phase(const int_t iChunk, uint_t qubit, compl apply_matrix(iChunk,reg_t({qubit}), diag); } +template +void State::apply_gate_mcphase(const int_t iChunk, const reg_t& qubits, const complex_t phase) +{ + if(BaseState::gpu_optimization_){ + //GPU computes all chunks in one kernel + BaseState::qregs_[iChunk].apply_mcphase(qubits,phase); + } + else{ + cvector_t diag(1ull << qubits.size(),1.0); + diag[diag.size()-1] = phase; + + apply_diagonal_matrix(iChunk,qubits,diag); + } +} + template void State::apply_mcswap(const int_t iChunk,const reg_t &qubits) { @@ -1361,14 +1413,7 @@ rvector_t State::measure_probs(const reg_t &qubits) const reg_t qubits_in_chunk; reg_t qubits_out_chunk; - for(i=0;i::measure_reset_update(const std::vector &qubits, // Update a state vector based on an outcome pair [m, p] from // sample_measure_with_prob function, and a desired post-measurement final_state - int_t i; + int_t i,iChunk; // Single-qubit case if (qubits.size() == 1) { // Diagonal matrix for projecting and renormalizing to measurement outcome cvector_t mdiag(2, 0.); mdiag[meas_state] = 1. / std::sqrt(meas_prob); -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i::measure_reset_update(const std::vector &qubits, cvector_t mdiag(dim, 0.); mdiag[meas_state] = 1. / std::sqrt(meas_prob); -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i::measure_reset_update(const std::vector &qubits, reg_t qubits_in_chunk; reg_t qubits_out_chunk; - for(i=0;i 0){ //in chunk exchange - const size_t dim_in = 1ULL << qubits_in_chunk.size(); + if(qubits_in_chunk.size() == qubits.size()){ //all bits are inside chunk // build vectorized permutation matrix - cvector_t perm(dim_in * dim_in, 0.); - perm[final_state * dim_in + meas_state] = 1.; - perm[meas_state * dim_in + final_state] = 1.; - for (size_t j=0; j < dim_in; j++) { + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + meas_state] = 1.; + perm[meas_state * dim + final_state] = 1.; + for (size_t j=0; j < dim; j++) { if (j != final_state && j != meas_state) - perm[j * dim_in + j] = 1.; + perm[j * dim + j] = 1.; } // apply permutation to swap state -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i 0){ //out of chunk exchange - for(i=0;i> i) & 1) != ((meas_state >> i) & 1)){ + BaseState::apply_chunk_x(qubits[i]); + } } } } @@ -1614,24 +1653,120 @@ void State::apply_initialize(const reg_t &qubits, const cvector_t ¶ms, RngEngine &rng) { - uint_t i; + int_t i,iChunk; + auto sorted_qubits = qubits; + std::sort(sorted_qubits.begin(), sorted_qubits.end()); if (qubits.size() == BaseState::num_qubits_) { // If qubits is all ordered qubits in the statevector // we can just initialize the whole state directly - auto sorted_qubits = qubits; - std::sort(sorted_qubits.begin(), sorted_qubits.end()); if (qubits == sorted_qubits) { - initialize_qreg(qubits.size(), params); + initialize_from_vector(params); return; } } // Apply reset to qubits apply_reset(qubits, rng); - // Apply initialize_component - for(i=0;i 0){ + //scatter inside chunks + const size_t dim = 1ULL << qubits_in_chunk.size(); + cvector_t perm(dim * dim, 0.); + for(i=0;i 0){ + //then scatter outside chunk + auto sorted_qubits_out = qubits_out_chunk; + std::sort(sorted_qubits_out.begin(), sorted_qubits_out.end()); + + for(i=0;i<(1ull << (BaseState::num_qubits_ - BaseState::chunk_bits_ - qubits_out_chunk.size()));i++){ + uint_t baseChunk = 0; + uint_t j,ii,t; + ii = i; + for(j=0;j>= BaseState::chunk_bits_; + + for(j=1;j<(1ull << qubits_out_chunk.size());j++){ + iChunk = baseChunk; + for(t=0;t> t) & 1) + iChunk += (1ull << (qubits_out_chunk[t] - BaseState::chunk_bits_)); + } + + if(iChunk >= BaseState::chunk_index_begin_[BaseState::distributed_rank_] && iChunk < BaseState::chunk_index_end_[BaseState::distributed_rank_]){ //on this process + if(baseChunk >= BaseState::chunk_index_begin_[BaseState::distributed_rank_] && baseChunk < BaseState::chunk_index_end_[BaseState::distributed_rank_]){ //base chunk is on this process + BaseState::qregs_[iChunk].initialize_from_data(BaseState::qregs_[baseChunk].data(),1ull << BaseState::chunk_bits_); + } + else{ + BaseState::recv_chunk(iChunk,baseChunk); + //using swap chunk function to release send/recv buffers for Thrust + reg_t swap(2); + swap[0] = BaseState::chunk_bits_; + swap[1] = BaseState::chunk_bits_; + BaseState::qregs_[iChunk].apply_chunk_swap(swap,baseChunk); + } + } + else if(baseChunk >= BaseState::chunk_index_begin_[BaseState::distributed_rank_] && baseChunk < BaseState::chunk_index_end_[BaseState::distributed_rank_]){ //base chunk is on this process + BaseState::send_chunk(baseChunk - BaseState::global_chunk_index_,iChunk); + } + } + } + } + + //initialize by params +#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) + for(iChunk=0;iChunk +void State::initialize_from_vector(const cvector_t ¶ms) +{ + int_t iChunk; + if(BaseState::chunk_bits_ == BaseState::num_qubits_){ + for(iChunk=0;iChunk class State; +} + namespace QubitUnitary { // OpSet of supported instructions @@ -62,6 +67,7 @@ enum class Gates { template > class State : public Base::State { + friend class QubitUnitaryChunk::State; public: using BaseState = Base::State; diff --git a/src/simulators/unitary/unitary_state_chunk.hpp b/src/simulators/unitary/unitary_state_chunk.hpp new file mode 100644 index 0000000000..49be0c8677 --- /dev/null +++ b/src/simulators/unitary/unitary_state_chunk.hpp @@ -0,0 +1,660 @@ +/** + * This code is part of Qiskit. + * + * (C) Copyright IBM 2018, 2019, 2020. + * + * This code is licensed under the Apache License, Version 2.0. You may + * obtain a copy of this license in the LICENSE.txt file in the root directory + * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. + * + * Any modifications or derivative works of this code must retain this + * copyright notice, and modified files need to carry a notice indicating + * that they have been altered from the originals. + */ + +#ifndef _unitary_state_chunk_hpp +#define _unitary_state_chunk_hpp + +#include +#define _USE_MATH_DEFINES +#include +#include "simulators/state.hpp" +#include "framework/json.hpp" +#include "framework/utils.hpp" +#include "simulators/state_chunk.hpp" +#include "unitarymatrix.hpp" +#ifdef AER_THRUST_SUPPORTED +#include "unitarymatrix_thrust.hpp" +#endif + +namespace AER { +namespace QubitUnitaryChunk { + +// OpSet of supported instructions +const Operations::OpSet StateOpSet( + // Op types + {Operations::OpType::gate, Operations::OpType::barrier, + Operations::OpType::bfunc, Operations::OpType::roerror, + Operations::OpType::matrix, Operations::OpType::diagonal_matrix, + Operations::OpType::snapshot, Operations::OpType::save_unitary, + Operations::OpType::save_state, Operations::OpType::set_unitary}, + // Gates + {"u1", "u2", "u3", "u", "U", "CX", "cx", "cz", + "cy", "cp", "cu1", "cu2", "cu3", "swap", "id", "p", + "x", "y", "z", "h", "s", "sdg", "t", "tdg", + "r", "rx", "ry", "rz", "rxx", "ryy", "rzz", "rzx", + "ccx", "cswap", "mcx", "mcy", "mcz", "mcu1", "mcu2", "mcu3", + "mcswap", "mcphase", "mcr", "mcrx", "mcry", "mcry", "sx", "csx", + "mcsx", "delay", "pauli"}, + // Snapshots + {"unitary"}); + +//========================================================================= +// QubitUnitary State subclass +//========================================================================= + +template > +class State : public Base::StateChunk { +public: + using BaseState = Base::StateChunk; + + State() : BaseState(StateOpSet) {} + virtual ~State() = default; + + //----------------------------------------------------------------------- + // Base class overrides + //----------------------------------------------------------------------- + + // Return the string name of the State class + virtual std::string name() const override { return "unitary"; } + + // Initializes an n-qubit unitary to the identity matrix + virtual void initialize_qreg(uint_t num_qubits) override; + + // Initializes to a specific n-qubit unitary matrix + virtual void initialize_qreg(uint_t num_qubits, + const unitary_matrix_t &unitary) override; + + // Returns the required memory for storing an n-qubit state in megabytes. + // For this state the memory is indepdentent of the number of ops + // and is approximately 16 * 1 << 2 * num_qubits bytes + virtual size_t + required_memory_mb(uint_t num_qubits, + const std::vector &ops) const override; + + // Load the threshold for applying OpenMP parallelization + // if the controller/engine allows threads for it + // Config: {"omp_qubit_threshold": 7} + virtual void set_config(const json_t &config) override; + + //----------------------------------------------------------------------- + // Additional methods + //----------------------------------------------------------------------- + + // Initializes to a specific n-qubit unitary given as a complex matrix + virtual void initialize_qreg(uint_t num_qubits, const cmatrix_t &unitary); + + // Initialize OpenMP settings for the underlying QubitVector class + void initialize_omp(); + + auto move_to_matrix(); + auto copy_to_matrix(); + +protected: + //----------------------------------------------------------------------- + // Apply Instructions + //----------------------------------------------------------------------- + virtual void apply_op(const int_t iChunk,const Operations::Op &op, + ExperimentResult &result, + RngEngine &rng, + bool final_ops = false) override; + + //swap between chunks + virtual void apply_chunk_swap(const reg_t &qubits) override; + + // Applies a Gate operation to the state class. + // This should support all and only the operations defined in + // allowed_operations. + void apply_gate(const uint_t iChunk,const Operations::Op &op); + + // Apply a supported snapshot instruction + // If the input is not in allowed_snapshots an exeption will be raised. + virtual void apply_snapshot(const Operations::Op &op, ExperimentResult &result); + + // Apply a matrix to given qubits (identity on all other qubits) + void apply_matrix(const uint_t iChunk,const reg_t &qubits, const cmatrix_t &mat); + + // Apply a matrix to given qubits (identity on all other qubits) + void apply_matrix(const uint_t iChunk,const reg_t &qubits, const cvector_t &vmat); + + // Apply a diagonal matrix + void apply_diagonal_matrix(const uint_t iChunk,const reg_t &qubits, const cvector_t &diag); + + //----------------------------------------------------------------------- + // 1-Qubit Gates + //----------------------------------------------------------------------- + + // Optimize phase gate with diagonal [1, phase] + void apply_gate_phase(const uint_t iChunk,const uint_t qubit, const complex_t phase); + + void apply_gate_phase(const uint_t iChunk,const reg_t& qubits, const complex_t phase); + + //----------------------------------------------------------------------- + // Multi-controlled u3 + //----------------------------------------------------------------------- + + // Apply N-qubit multi-controlled single qubit waltz gate specified by + // parameters u3(theta, phi, lambda) + // NOTE: if N=1 this is just a regular u3 gate. + void apply_gate_mcu3(const uint_t iChunk,const reg_t &qubits, const double theta, + const double phi, const double lambda); + + //----------------------------------------------------------------------- + // Save data instructions + //----------------------------------------------------------------------- + // Save the unitary matrix for the simulator + void apply_save_unitary(const Operations::Op &op, + ExperimentResult &result, + bool last_op); + + // Helper function for computing expectation value + virtual double expval_pauli(const reg_t &qubits, + const std::string& pauli) override; + + //----------------------------------------------------------------------- + // Config Settings + //----------------------------------------------------------------------- + + // Apply the global phase + void apply_global_phase(); + + // OpenMP qubit threshold + int omp_qubit_threshold_ = 6; + + // Threshold for chopping small values to zero in JSON + double json_chop_threshold_ = 1e-10; + + int qubit_scale() override + { + return 2; + } +}; + + +//============================================================================ +// Implementation: Base class method overrides +//============================================================================ +template +void State::apply_op(const int_t iChunk,const Operations::Op &op, + ExperimentResult &result, + RngEngine &rng, + bool final_ops) +{ + switch (op.type) { + case Operations::OpType::barrier: + break; + case Operations::OpType::bfunc: + BaseState::creg_.apply_bfunc(op); + break; + case Operations::OpType::roerror: + BaseState::creg_.apply_roerror(op, rng); + break; + case Operations::OpType::gate: + // Note conditionals will always fail since no classical registers + if (BaseState::creg_.check_conditional(op)) + apply_gate(iChunk,op); + break; + case Operations::OpType::set_unitary: + BaseState::initialize_from_matrix(op.mats[0]); + break; + case Operations::OpType::save_state: + case Operations::OpType::save_unitary: + apply_save_unitary(op, result, final_ops); + break; + case Operations::OpType::snapshot: + apply_snapshot(op, result); + break; + case Operations::OpType::matrix: + apply_matrix(iChunk,op.qubits, op.mats[0]); + break; + case Operations::OpType::diagonal_matrix: + apply_diagonal_matrix(iChunk,op.qubits, op.params); + break; + default: + throw std::invalid_argument( + "QubitUnitary::State::invalid instruction \'" + op.name + "\'."); + } +} + +//swap between chunks +template +void State::apply_chunk_swap(const reg_t &qubits) +{ + uint_t q0,q1; + q0 = qubits[0]; + q1 = qubits[1]; + + std::swap(BaseState::qubit_map_[q0],BaseState::qubit_map_[q1]); + + if(qubits[0] >= BaseState::chunk_bits_){ + q0 += BaseState::chunk_bits_; + } + if(qubits[1] >= BaseState::chunk_bits_){ + q1 += BaseState::chunk_bits_; + } + reg_t qs0 = {{q0, q1}}; + BaseState::apply_chunk_swap(qs0); +} + +template +size_t State::required_memory_mb( + uint_t num_qubits, const std::vector &ops) const { + // An n-qubit unitary as 2^2n complex doubles + // where each complex double is 16 bytes + (void)ops; // avoid unused variable compiler warning + size_t shift_mb = std::max(0, num_qubits + 4 - 20); + size_t mem_mb = 1ULL << (2 * shift_mb); + return mem_mb; +} + +template +void State::set_config(const json_t &config) { + BaseState::set_config(config); + + // Set OMP threshold for state update functions + JSON::get_value(omp_qubit_threshold_, "unitary_parallel_threshold", config); + + // Set threshold for truncating snapshots + JSON::get_value(json_chop_threshold_, "zero_threshold", config); + uint_t i; + for(i=0;i +void State::initialize_qreg(uint_t num_qubits) +{ + int_t iChunk; + + initialize_omp(); + + if(BaseState::chunk_bits_ == BaseState::num_qubits_){ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + if(irow == icol) + BaseState::qregs_[iChunk].initialize(); + else + BaseState::qregs_[iChunk].zero(); + } + } + + apply_global_phase(); +} + +template +void State::initialize_qreg(uint_t num_qubits, + const unitary_matrix_t &unitary) +{ + // Check dimension of state + if (unitary.num_qubits() != num_qubits) { + throw std::invalid_argument( + "Unitary::State::initialize: initial state does not match qubit " + "number"); + } + initialize_omp(); + + int_t iChunk; + if(BaseState::chunk_bits_ == BaseState::num_qubits_){ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = input[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + + apply_global_phase(); +} + +template +void State::initialize_qreg(uint_t num_qubits, + const cmatrix_t &unitary) +{ + // Check dimension of unitary + if (unitary.size() != 1ULL << (2 * num_qubits)) { + throw std::invalid_argument( + "Unitary::State::initialize: initial state does not match qubit " + "number"); + } + initialize_omp(); + + int_t iChunk; + if(BaseState::chunk_bits_ == BaseState::num_qubits_){ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = unitary[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + apply_global_phase(); +} + +template +void State::initialize_omp() +{ + uint_t i; + for(i=0;i 0) + BaseState::qregs_[i].set_omp_threads(BaseState::threads_); // set allowed OMP threads in qubitvector + } +} + +template +auto State::move_to_matrix() +{ + if(BaseState::num_global_chunks_ == 1){ + return BaseState::qregs_[0].move_to_matrix(); + } + return BaseState::apply_to_matrix(false); +} + +template +auto State::copy_to_matrix() +{ + if(BaseState::num_global_chunks_ == 1){ + return BaseState::qregs_[0].copy_to_matrix(); + } + return BaseState::apply_to_matrix(true); +} + +//========================================================================= +// Implementation: Gates +//========================================================================= + +template +void State::apply_gate(const uint_t iChunk,const Operations::Op &op) { + // Look for gate name in gateset + auto it = QubitUnitary::State::gateset_.find(op.name); + if (it == QubitUnitary::State::gateset_.end()) + throw std::invalid_argument("Unitary::State::invalid gate instruction \'" + + op.name + "\'."); + QubitUnitary::Gates g = it->second; + switch (g) { + case QubitUnitary::Gates::mcx: + // Includes X, CX, CCX, etc + BaseState::qregs_[iChunk].apply_mcx(op.qubits); + break; + case QubitUnitary::Gates::mcy: + // Includes Y, CY, CCY, etc + BaseState::qregs_[iChunk].apply_mcy(op.qubits); + break; + case QubitUnitary::Gates::mcz: + // Includes Z, CZ, CCZ, etc + apply_gate_phase(iChunk,op.qubits, -1); + break; + case QubitUnitary::Gates::mcr: + BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); + break; + case QubitUnitary::Gates::mcrx: + BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::rx(op.params[0])); + break; + case QubitUnitary::Gates::mcry: + BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::ry(op.params[0])); + break; + case QubitUnitary::Gates::mcrz: + BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::rz(op.params[0])); + break; + case QubitUnitary::Gates::rxx: + BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::rxx(op.params[0])); + break; + case QubitUnitary::Gates::ryy: + BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::ryy(op.params[0])); + break; + case QubitUnitary::Gates::rzz: + apply_diagonal_matrix(iChunk,op.qubits, Linalg::VMatrix::rzz_diag(op.params[0])); + break; + case QubitUnitary::Gates::rzx: + BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::rzx(op.params[0])); + break; + case QubitUnitary::Gates::id: + break; + case QubitUnitary::Gates::h: + apply_gate_mcu3(iChunk,op.qubits, M_PI / 2., 0., M_PI); + break; + case QubitUnitary::Gates::s: + apply_gate_phase(iChunk,op.qubits[0], complex_t(0., 1.)); + break; + case QubitUnitary::Gates::sdg: + apply_gate_phase(iChunk,op.qubits[0], complex_t(0., -1.)); + break; + case QubitUnitary::Gates::pauli: + BaseState::qregs_[iChunk].apply_pauli(op.qubits, op.string_params[0]); + break; + case QubitUnitary::Gates::t: { + const double isqrt2{1. / std::sqrt(2)}; + apply_gate_phase(iChunk,op.qubits[0], complex_t(isqrt2, isqrt2)); + } break; + case QubitUnitary::Gates::tdg: { + const double isqrt2{1. / std::sqrt(2)}; + apply_gate_phase(iChunk,op.qubits[0], complex_t(isqrt2, -isqrt2)); + } break; + case QubitUnitary::Gates::mcswap: + // Includes SWAP, CSWAP, etc + BaseState::qregs_[iChunk].apply_mcswap(op.qubits); + break; + case QubitUnitary::Gates::mcu3: + // Includes u3, cu3, etc + apply_gate_mcu3(iChunk,op.qubits, std::real(op.params[0]), std::real(op.params[1]), + std::real(op.params[2])); + break; + case QubitUnitary::Gates::mcu2: + // Includes u2, cu2, etc + apply_gate_mcu3(iChunk,op.qubits, M_PI / 2., std::real(op.params[0]), + std::real(op.params[1])); + break; + case QubitUnitary::Gates::mcp: + // Includes u1, cu1, p, cp, mcp, etc + apply_gate_phase(iChunk,op.qubits, std::exp(complex_t(0, 1) * op.params[0])); + break; + case QubitUnitary::Gates::mcsx: + // Includes sx, csx, mcsx etc + BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::SX); + break; + default: + // We shouldn't reach here unless there is a bug in gateset + throw std::invalid_argument("Unitary::State::invalid gate instruction \'" + + op.name + "\'."); + } +} + +template +void State::apply_matrix(const uint_t iChunk,const reg_t &qubits, + const cmatrix_t &mat) { + if (qubits.empty() == false && mat.size() > 0) { + apply_matrix(iChunk,qubits, Utils::vectorize_matrix(mat)); + } +} + +template +void State::apply_matrix(const uint_t iChunk,const reg_t &qubits, + const cvector_t &vmat) { + // Check if diagonal matrix + if (vmat.size() == 1ULL << qubits.size()) { + apply_diagonal_matrix(iChunk,qubits, vmat); + } else { + BaseState::qregs_[iChunk].apply_matrix(qubits, vmat); + } +} + +template +void State::apply_diagonal_matrix(const uint_t iChunk, const reg_t &qubits, const cvector_t &diag) +{ + if(BaseState::gpu_optimization_){ + //GPU computes all chunks in one kernel, so pass qubits and diagonal matrix as is + reg_t qubits_chunk = qubits; + for(uint_t i;i= BaseState::chunk_bits_){ + qubits_chunk[i] += BaseState::chunk_bits_; + } + } + + BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits_chunk,diag); + } + else{ + reg_t qubits_in = qubits; + cvector_t diag_in = diag; + + BaseState::block_diagonal_matrix(iChunk,qubits_in,diag_in); + BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits_in,diag_in); + } +} + +template +void State::apply_gate_phase(const uint_t iChunk,const uint_t qubit, complex_t phase) { + cvector_t diag(2); + diag[0] = 1.0; + diag[1] = phase; + apply_diagonal_matrix(iChunk,reg_t({qubit}), diag); +} + +template +void State::apply_gate_phase(const uint_t iChunk,const reg_t& qubits, complex_t phase) +{ + cvector_t diag((1 << qubits.size()),1.0); + diag[(1 << qubits.size()) - 1] = phase; + apply_diagonal_matrix(iChunk,qubits, diag); +} + +template +void State::apply_gate_mcu3(const uint_t iChunk,const reg_t &qubits, double theta, + double phi, double lambda) { + const auto u3 = Linalg::Matrix::u3(theta, phi, lambda); + BaseState::qregs_[iChunk].apply_mcu(qubits, Utils::vectorize_matrix(u3)); +} + +template +void State::apply_snapshot(const Operations::Op &op, + ExperimentResult &result) { + // Look for snapshot type in snapshotset + if (op.name == "unitary" || op.name == "state") { + auto matrix = copy_to_matrix(); + if(BaseState::distributed_rank_ == 0){ + result.legacy_data.add_pershot_snapshot("unitary", op.string_params[0], + matrix); + } + } else { + throw std::invalid_argument( + "Unitary::State::invalid snapshot instruction \'" + op.name + "\'."); + } +} + +template +void State::apply_global_phase() { + if (BaseState::has_global_phase_) { + int_t i; +#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) + for(i=0;i +void State::apply_save_unitary(const Operations::Op &op, + ExperimentResult &result, + bool last_op) +{ + if (op.qubits.size() != BaseState::num_qubits_) { + throw std::invalid_argument( + op.name + " was not applied to all qubits." + " Only the full unitary can be saved."); + } + std::string key = (op.string_params[0] == "_method_") ? "unitary" : op.string_params[0]; + + if (last_op) { + BaseState::save_data_pershot(result, key, + move_to_matrix(), + op.save_type); + } else { + BaseState::save_data_pershot(result, key, + copy_to_matrix(), + op.save_type); + } +} + +template +double State::expval_pauli(const reg_t &qubits, + const std::string& pauli) { + throw std::runtime_error("Unitary simulator does not support Pauli expectation values."); +} + +//------------------------------------------------------------------------------ +} // namespace QubitUnitary +} // end namespace AER +//------------------------------------------------------------------------------ +#endif diff --git a/src/simulators/unitary/unitarymatrix_thrust.hpp b/src/simulators/unitary/unitarymatrix_thrust.hpp index 2dcceea8da..300f5e9e6f 100755 --- a/src/simulators/unitary/unitarymatrix_thrust.hpp +++ b/src/simulators/unitary/unitarymatrix_thrust.hpp @@ -208,10 +208,7 @@ matrix> UnitaryMatrixThrust::copy_to_matrix() const uint_t irow, icol; #pragma omp parallel for private(i,irow,icol) if (BaseVector::num_qubits_ > BaseVector::omp_threshold_ && BaseVector::omp_threads_ > 1) num_threads(BaseVector::omp_threads_) for (i = 0; i < csize; i++) { - irow = (i >> num_qubits_); - icol = i - (irow << num_qubits_); - - ret(icol, irow) = qreg[i]; + ret[i] = qreg[i]; } return ret; } @@ -229,17 +226,13 @@ void UnitaryMatrixThrust::initialize() BaseVector::zero(); // Set to be identity matrix - uint_t is,ie,idx; + uint_t idx; int_t i; - is = BaseVector::chunk_index_ << BaseVector::num_qubits_; - ie = is + (1ull << BaseVector::num_qubits_); #pragma omp parallel private(idx) if (BaseVector::num_qubits_ > BaseVector::omp_threshold_ && BaseVector::omp_threads_ > 1) num_threads(BaseVector::omp_threads_) for(i=0;i= is && idx < ie){ - BaseVector::set_state(idx-is,one); - } + BaseVector::set_state(idx,one); } } @@ -262,15 +255,15 @@ void UnitaryMatrixThrust::initialize_from_matrix(const matrix tmp(BaseVector::data_size_); - int_t i; + matrix> tmp(nrows,nrows); #pragma omp parallel for if (BaseVector::num_qubits_ > BaseVector::omp_threshold_ && BaseVector::omp_threads_ > 1) num_threads(BaseVector::omp_threads_) for (int_t row = 0; row < nrows; ++row) for (int_t col = 0; col < nrows; ++col) { - tmp[row + nrows * col] = mat(row, col); + tmp(row,col) = mat(row, col); } - BaseVector::initialize_from_vector(tmp); + + BaseVector::initialize_from_data(tmp.data(), tmp.size()); } template diff --git a/src/transpile/cacheblocking.hpp b/src/transpile/cacheblocking.hpp index 035f970c30..db53160e21 100644 --- a/src/transpile/cacheblocking.hpp +++ b/src/transpile/cacheblocking.hpp @@ -69,6 +69,11 @@ class CacheBlocking : public CircuitOptimization { sample_measure_ = enabled; } + void set_save_state(bool enabled) + { + save_state_ = enabled; + } + //setting blocking parameters automatically void set_blocking(int bits, size_t min_memory, uint_t n_place, const size_t complex_size, bool is_matrix = false); @@ -79,7 +84,9 @@ class CacheBlocking : public CircuitOptimization { mutable reg_t qubitSwapped_; mutable bool blocking_enabled_; mutable bool sample_measure_ = false; + mutable bool save_state_ = false; int gpu_blocking_bits_; + bool density_matrix_ = false; bool block_circuit(Circuit& circ,bool doSwap) const; @@ -94,7 +101,7 @@ class CacheBlocking : public CircuitOptimization { bool is_diagonal_op(Operations::Op& op) const; void insert_swap(std::vector& ops,uint_t bit0,uint_t bit1,bool chunk) const; - void insert_sim_op(std::vector& ops,const char* name,const reg_t& qubits) const; + void insert_sim_op(std::vector& ops,char* name,const reg_t& qubits) const; void insert_pauli(std::vector& ops,reg_t& qubits,std::string& pauli) const; void define_blocked_qubits(std::vector& ops,reg_t& blockedQubits,bool crossQubitOnly) const; @@ -104,6 +111,8 @@ class CacheBlocking : public CircuitOptimization { bool split_pauli(const Operations::Op& op, const reg_t blockedQubits, std::vector& out,std::vector& queue) const; + bool split_op(const Operations::Op& op,const reg_t blockedQubits,std::vector& out,std::vector& queue) const; + bool is_blockable_operation(Operations::Op& op) const; }; @@ -123,6 +132,14 @@ void CacheBlocking::set_config(const json_t &config) gpu_blocking_bits_ = 10; } } + + std::string method; + if (JSON::get_value(method, "method", config)) { + if(method.find("density_matrix") != std::string::npos){ + density_matrix_ = true; + } + } + } @@ -130,6 +147,7 @@ void CacheBlocking::set_blocking(int bits, size_t min_memory, uint_t n_place, si { int chunk_bits = bits; uint_t scale = is_matrix ? 2 : 1; + size_t size; //get largest possible chunk bits while((complex_size << (scale*chunk_bits)) > min_memory){ @@ -172,7 +190,7 @@ void CacheBlocking::insert_swap(std::vector& ops,uint_t bit0,uin ops.push_back(sgate); } -void CacheBlocking::insert_sim_op(std::vector& ops,const char* name,const reg_t& qubits) const +void CacheBlocking::insert_sim_op(std::vector& ops,char* name,const reg_t& qubits) const { Operations::Op op; op.type = Operations::OpType::sim_op; @@ -223,7 +241,7 @@ void CacheBlocking::optimize_circuit(Circuit& circ, void CacheBlocking::define_blocked_qubits(std::vector& ops,reg_t& blockedQubits,bool crossQubitOnly) const { uint_t i,j,iq; - int nq; + int nq,nb; bool exist; for(i=0;i= block_bits_) @@ -290,7 +308,10 @@ bool CacheBlocking::can_reorder(Operations::Op& op,std::vector& //only gate and matrix can be reordered if(op.type != Operations::OpType::gate && op.type != Operations::OpType::matrix){ - return false; + //except for reset for density matrix + if(!density_matrix_ || op.type != Operations::OpType::reset){ + return false; + } } for(j=0;j& bool CacheBlocking::block_circuit(Circuit& circ,bool doSwap) const { - uint_t n; + uint_t i,n; std::vector out; std::vector queue; std::vector queue_next; n = add_ops(circ.ops,out,queue,doSwap,true); -// put_nongate_ops(out,queue_next,queue,doSwap); -// queue.clear(); while(queue.size() > 0){ n = add_ops(queue,out,queue_next,doSwap,false); queue = queue_next; queue_next.clear(); -// put_nongate_ops(out,queue_next,queue,doSwap); if(n == 0){ break; } -// queue.clear(); } if(queue.size() > 0) return false; -// restore_qubits_order(out); + if(save_state_) + restore_qubits_order(out); circ.ops = out; return true; } -void CacheBlocking::put_nongate_ops(std::vector& out,std::vector& queue,std::vector& input,bool doSwap) const -{ - uint_t i; - for(i=0;i& ops) const { uint_t i,j,t; @@ -400,6 +395,16 @@ void CacheBlocking::restore_qubits_order(std::vector& ops) const if(qubitMap_[i] != i){ j = qubitMap_[qubitMap_[i]]; if(j != i && j < block_bits_){ + if(nInBlock == 0){ + uint_t last = ops.size() - 1; + if(ops[last].type == Operations::OpType::sim_op && ops[last].name == "end_blocking"){ + ops.pop_back(); + nInBlock = 1; + } + else{ + insert_sim_op(ops,"begin_blocking",qubitMap_); + } + } insert_swap(ops,i,j,false); qubitMap_[qubitSwapped_[i]] = j; @@ -440,9 +445,14 @@ void CacheBlocking::restore_qubits_order(std::vector& ops) const bool CacheBlocking::is_blockable_operation(Operations::Op& op) const { if(op.type == Operations::OpType::gate || op.type == Operations::OpType::matrix || - op.type == Operations::OpType::diagonal_matrix || op.type == Operations::OpType::multiplexer){ + op.type == Operations::OpType::diagonal_matrix || op.type == Operations::OpType::multiplexer || + op.type == Operations::OpType::superop){ + return true; + } + if(density_matrix_ && op.type == Operations::OpType::reset){ return true; } + return false; } @@ -450,6 +460,7 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector& ops,std::vector 0){ //swap gate is not required for initial state + if(!first){ //swap gate is not required for initial state insert_swap(out,swap[i],qubitMap_[blockedQubits[i]],true); } @@ -545,19 +556,21 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector 0 && !end_block_inserted){ //insert end of block to synchronize chunks - insert_sim_op(out,"end_blocking",blockedQubits); - } - else if(!end_block_inserted){ - out.pop_back(); - } + bool restore_qubits = false; if(ops[i].type == Operations::OpType::kraus){ - if(ops[i].qubits.size() >= block_bits_){ - throw std::runtime_error("CacheBlocking : kraus number of qubits should be smaller than chunk qubit size"); + if(ops[i].qubits.size() > block_bits_){ + throw std::runtime_error("CacheBlocking : Kraus operator, number of qubits should be smaller than chunk qubit size"); break; } if(!can_block(ops[i],blockedQubits)){ //if some qubits are out of chunk, queued for next step @@ -565,15 +578,37 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector 0 && !end_block_inserted){ //insert end of block to synchronize chunks + insert_sim_op(out,"end_blocking",blockedQubits); + } + else if(!end_block_inserted){ + out.pop_back(); } + + if(restore_qubits) + restore_qubits_order(out); + //mapping swapped qubits for(iq=0;iq 1) return true; } - else if(op.type == Operations::OpType::matrix){ //fusion + else if(op.type == Operations::OpType::matrix || op.type == Operations::OpType::multiplexer || op.type == Operations::OpType::superop){ if(op.qubits.size() > 1) return true; } @@ -792,6 +827,51 @@ bool CacheBlocking::split_pauli(const Operations::Op& op,const reg_t blockedQubi return false; } +//split op to inside op and outside op +bool CacheBlocking::split_op(const Operations::Op& op,const reg_t blockedQubits,std::vector& out,std::vector& queue) const +{ + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + int_t i,j,n; + bool inside; + + n = op.qubits.size(); + for(i=0;i 0){ //save in queue + Operations::Op op_out = op; + op_out.qubits = qubits_out_chunk; + queue.push_back(op_out); + } + + if(qubits_in_chunk.size() > 0){ + Operations::Op op_in = op; + //mapping swapped qubits + for(i=0;i