From d21d899806cfd8718122ea618f818160ee6ff698 Mon Sep 17 00:00:00 2001 From: doichanj Date: Mon, 29 Mar 2021 16:00:33 +0900 Subject: [PATCH 01/10] Fixing cache blocking transpiler and chunk states --- src/controllers/qasm_controller.hpp | 1 + .../densitymatrix_state_chunk.hpp | 115 ++++++++---- src/simulators/state_chunk.hpp | 166 +++++++++++++++++- .../statevector/statevector_state.hpp | 1 + .../statevector/statevector_state_chunk.hpp | 141 ++++++++------- src/transpile/cacheblocking.hpp | 140 +++++++++++---- 6 files changed, 431 insertions(+), 133 deletions(-) diff --git a/src/controllers/qasm_controller.hpp b/src/controllers/qasm_controller.hpp index a408b2e83d..70ede34137 100755 --- a/src/controllers/qasm_controller.hpp +++ b/src/controllers/qasm_controller.hpp @@ -1103,6 +1103,7 @@ void QasmController::run_circuit_helper(const Circuit& circ, if(method == Method::density_matrix || method == Method::density_matrix_thrust_gpu || method == Method::density_matrix_thrust_cpu) is_matrix = true; auto cache_block_pass = transpile_cache_blocking(opt_circ,noise,config,(simulation_precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex),is_matrix); + cache_block_pass.set_sample_measure(check_measure_sampling_opt(opt_circ, method)); cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); uint_t block_bits = 0; diff --git a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp index f0ce811d05..9b6eee2e77 100644 --- a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp +++ b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp @@ -648,6 +648,9 @@ void State::apply_chunk_swap(const reg_t &qubits) uint_t q0,q1; q0 = qubits[0]; q1 = qubits[1]; + + std::swap(BaseState::qubit_map_[q0],BaseState::qubit_map_[q1]); + if(qubits[0] >= BaseState::chunk_bits_){ q0 += BaseState::chunk_bits_; } @@ -713,8 +716,9 @@ void State::apply_save_amplitudes_sq(const Operations::Op &op, BaseState::threads_ > 1) \ num_threads(BaseState::threads_) for (int_t i = 0; i < size; ++i) { - if(op.int_params[i] >= (irow << BaseState::chunk_bits_) && op.int_params[i] < ((irow+1) << BaseState::chunk_bits_)) - amps_sq[i] = BaseState::qregs_[iChunk].probability(op.int_params[i] - (irow << BaseState::chunk_bits_)); + uint_t idx = BaseState::mapped_index(op.int_params[i]); + if(idx >= (irow << BaseState::chunk_bits_) && idx < ((irow+1) << BaseState::chunk_bits_)) + amps_sq[i] = BaseState::qregs_[iChunk].probability(idx - (irow << BaseState::chunk_bits_)); } } #ifdef AER_MPI @@ -1240,35 +1244,52 @@ rvector_t State::measure_probs(const reg_t &qubits) const icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); if(irow == icol){ //diagonal chunk - auto chunkSum = BaseState::qregs_[i].probabilities(qubits); - if(qubits_in_chunk.size() == qubits.size()){ - for(j=0;j 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); - i_in++; - } - else{ - if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ - idx += 1ull << k; + else{ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ + idx += 1ull << k; + } } } - } #pragma omp atomic - sum[idx] += chunkSum[j]; + sum[idx] += chunkSum[j]; + } } } + else{ //there is no bit in chunk + auto tr = std::real(BaseState::qregs_[i].trace()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += tr; + } } } +#ifdef AER_MPI + BaseState::reduce_sum(sum); +#endif + return sum; } @@ -1426,9 +1447,15 @@ void State::measure_reset_update(const reg_t &qubits, // If it doesn't agree with the reset state update if (final_state != meas_state) { + if(qubits[0] < BaseState::chunk_bits_){ #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i::measure_reset_update(const reg_t &qubits, // If it doesn't agree with the reset state update // TODO This function could be optimized as a permutation update if (final_state != meas_state) { - // build vectorized permutation matrix - cvector_t perm(dim * dim, 0.); - perm[final_state * dim + meas_state] = 1.; - perm[meas_state * dim + final_state] = 1.; - for (size_t j=0; j < dim; j++) { - if (j != final_state && j != meas_state) - perm[j * dim + j] = 1.; + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + + for(i=0;i 0){ //in chunk exchange + const size_t dim_in = 1ULL << qubits_in_chunk.size(); + // build vectorized permutation matrix + cvector_t perm(dim_in * dim_in, 0.); + perm[final_state * dim_in + meas_state] = 1.; + perm[meas_state * dim_in + final_state] = 1.; + for (size_t j=0; j < dim_in; j++) { + if (j != final_state && j != meas_state) + perm[j * dim_in + j] = 1.; + } + + // apply permutation to swap state #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i 0){ //out of chunk exchange + for(i=0;i @@ -488,6 +493,12 @@ void StateChunk::allocate(uint_t num_qubits,uint_t block_bits) //only first one allocates chunks, others only set chunk index nchunks = 0; } + + //initialize qubit map + qubit_map_.resize(num_qubits_); + for(i=0;i @@ -847,19 +858,36 @@ void StateChunk::apply_save_expval(const Operations::Op &op, } } +template +uint_t StateChunk::mapped_index(const uint_t idx) +{ + uint_t i,ret = 0; + uint_t t = idx; + + for(i=0;i>= 1; + } + return ret; +} + template void StateChunk::apply_chunk_swap(const reg_t &qubits) { - uint_t q0,q1,t,nLarge; + uint_t q0,q1,nLarge; int_t iChunk; q0 = qubits[qubits.size() - 2]; q1 = qubits[qubits.size() - 1]; + if(qubit_scale() == 1){ + std::swap(qubit_map_[q0],qubit_map_[q1]); + } + if(q0 > q1){ - t = q0; - q0 = q1; - q1 = t; + std::swap(q0,q1); } if(q1 < chunk_bits_*qubit_scale()){ @@ -1021,6 +1049,136 @@ void StateChunk::apply_chunk_swap(const reg_t &qubits) } } +template +void StateChunk::apply_chunk_x(const uint_t qubit) +{ + uint_t q0,q1,t,nLarge; + int_t iChunk; + + + if(qubit < chunk_bits_*qubit_scale()){ + reg_t qubits(1,qubit); +#pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) + for(iChunk=0;iChunk>= (chunk_bits_*qubit_scale()); + + int proc_bits = 0; + uint_t procs = distributed_procs_; + while(procs > 1){ + if((procs & 1) != 0){ + proc_bits = -1; + break; + } + proc_bits++; + procs >>= 1; + } + + if(distributed_procs_ == 1 || (proc_bits >= 0 && qubit < (num_qubits_*qubit_scale() - proc_bits))){ //no data transfer between processes is needed + nPair = num_local_chunks_ >> 1; + +#pragma omp parallel for if(chunk_omp_parallel_) private(iPair,baseChunk,iChunk1,iChunk2) + for(iPair=0;iPair=0;i--){ + baseChunk += (iu[i] << ub[i]); + //update for next + iu[i] += add; + add = 0; + if(iu[i] >= nu[i]){ + iu[i] = 0; + add = 1; + } + } + + iChunk1 = baseChunk; + iChunk2 = baseChunk | mask; + + if(iChunk1 >= chunk_index_begin_[distributed_rank_] && iChunk1 < chunk_index_end_[distributed_rank_]){ //chunk1 is on this process + if(iChunk2 >= chunk_index_begin_[distributed_rank_] && iChunk2 < chunk_index_end_[distributed_rank_]){ //chunk2 is on this process + qregs_[iChunk1 - global_chunk_index_].apply_chunk_swap(qubits,qregs_[iChunk2 - global_chunk_index_],true); + continue; + } + else{ + iLocalChunk = iChunk1; + iRemoteChunk = iChunk2; + iProc = get_process_by_chunk(iChunk2); + } + } + else{ + if(iChunk2 >= chunk_index_begin_[distributed_rank_] && iChunk2 < chunk_index_end_[distributed_rank_]){ //chunk2 is on this process + iLocalChunk = iChunk2; + iRemoteChunk = iChunk1; + iProc = get_process_by_chunk(iChunk1); + } + else{ + continue; //there is no chunk for this pair on this process + } + } + + MPI_Request reqSend,reqRecv; + MPI_Status st; + uint_t sizeRecv,sizeSend; + + auto pSend = qregs_[iLocalChunk - global_chunk_index_].send_buffer(sizeSend); + MPI_Isend(pSend,sizeSend,MPI_BYTE,iProc,iPair,distributed_comm_,&reqSend); + + auto pRecv = qregs_[iLocalChunk - global_chunk_index_].recv_buffer(sizeRecv); + MPI_Irecv(pRecv,sizeRecv,MPI_BYTE,iProc,iPair,distributed_comm_,&reqRecv); + + MPI_Wait(&reqSend,&st); + MPI_Wait(&reqRecv,&st); + + qregs_[iLocalChunk - global_chunk_index_].apply_chunk_swap(qubits,iRemoteChunk); + } + } +#endif + + } +} template void StateChunk::send_chunk(uint_t local_chunk_index, uint_t global_pair_index) diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp index 84076cc72c..a8077a85f7 100755 --- a/src/simulators/statevector/statevector_state.hpp +++ b/src/simulators/statevector/statevector_state.hpp @@ -580,6 +580,7 @@ void State::apply_ops(const std::vector &ops, else if(op.name == "end_register_blocking"){ BaseState::qreg_.leave_register_blocking(); } + break; case OpType::save_expval: case OpType::save_expval_var: BaseState::apply_save_expval(op, result); diff --git a/src/simulators/statevector/statevector_state_chunk.hpp b/src/simulators/statevector/statevector_state_chunk.hpp index 69992ba261..c7582e72af 100644 --- a/src/simulators/statevector/statevector_state_chunk.hpp +++ b/src/simulators/statevector/statevector_state_chunk.hpp @@ -546,8 +546,6 @@ void State::apply_op(const int_t iChunk,const Operations::Op &op, RngEngine &rng, bool final_ops) { - uint_t ireg; - if(BaseState::creg_.check_conditional(op)) { switch (op.type) { case Operations::OpType::barrier: @@ -562,11 +560,9 @@ void State::apply_op(const int_t iChunk,const Operations::Op &op, apply_measure(op.qubits, op.memory, op.registers, rng); break; case Operations::OpType::bfunc: - if(iChunk == 0 || ireg > 0) BaseState::creg_.apply_bfunc(op); break; case Operations::OpType::roerror: - if(iChunk == 0 || ireg > 0) BaseState::creg_.apply_roerror(op, rng); break; case Operations::OpType::gate: @@ -825,10 +821,11 @@ void State::apply_save_amplitudes(const Operations::Op &op, if (op.type == Operations::OpType::save_amps) { Vector amps(size, false); for (int_t i = 0; i < size; ++i) { - uint_t iChunk = op.int_params[i] >> BaseState::chunk_bits_; + uint_t idx = BaseState::mapped_index(op.int_params[i]); + uint_t iChunk = idx >> BaseState::chunk_bits_; amps[i] = 0.0; if(iChunk >= BaseState::global_chunk_index_ && iChunk < BaseState::global_chunk_index_ + BaseState::num_local_chunks_){ - amps[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].get_state(op.int_params[i] - (iChunk << BaseState::chunk_bits_)); + amps[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].get_state(idx - (iChunk << BaseState::chunk_bits_)); } #ifdef AER_MPI complex_t amp = amps[i]; @@ -842,9 +839,10 @@ void State::apply_save_amplitudes(const Operations::Op &op, else{ rvector_t amps_sq(size,0); for (int_t i = 0; i < size; ++i) { - uint_t iChunk = op.int_params[i] >> BaseState::chunk_bits_; + uint_t idx = BaseState::mapped_index(op.int_params[i]); + uint_t iChunk = idx >> BaseState::chunk_bits_; if(iChunk >= BaseState::global_chunk_index_ && iChunk < BaseState::global_chunk_index_ + BaseState::num_local_chunks_){ - amps_sq[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].probability(op.int_params[i] - (iChunk << BaseState::chunk_bits_)); + amps_sq[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].probability(idx - (iChunk << BaseState::chunk_bits_)); } } #ifdef AER_MPI @@ -1376,33 +1374,46 @@ rvector_t State::measure_probs(const reg_t &qubits) const #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i,j,k) for(i=0;i 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if(qubits_in_chunk.size() == qubits.size()){ - for(j=0;j> i_in) & 1) << k); - i_in++; - } - else{ - if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ - idx += 1ull << k; + else{ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ + idx += 1ull << k; + } } } - } #pragma omp atomic - sum[idx] += chunkSum[j]; + sum[idx] += chunkSum[j]; + } } } + else{ //there is no bit in chunk + auto nr = std::real(BaseState::qregs_[i].norm()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += nr; + } } #ifdef AER_MPI @@ -1493,28 +1504,14 @@ std::vector State::sample_measure(const reg_t &qubits, std::vector all_samples; all_samples.reserve(shots); - if(qubits.size() == 0){ - //return all bits if qubits is empty (for multi-shot parallelization) - for (int_t val : allbit_samples) { - reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::num_qubits_); - reg_t sample; - sample.reserve(BaseState::num_qubits_); - for (uint_t qubit = 0 ; qubit < BaseState::num_qubits_; qubit++) { - sample.push_back(allbit_sample[qubit]); - } - all_samples.push_back(sample); - } - } - else{ - for (int_t val : allbit_samples) { - reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::num_qubits_); - reg_t sample; - sample.reserve(qubits.size()); - for (uint_t qubit : qubits) { - sample.push_back(allbit_sample[qubit]); - } - all_samples.push_back(sample); + for (int_t val : allbit_samples) { + reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::num_qubits_); + reg_t sample; + sample.reserve(qubits.size()); + for (uint_t qubit : qubits) { + sample.push_back(allbit_sample[qubit]); } + all_samples.push_back(sample); } return all_samples; } @@ -1557,13 +1554,11 @@ void State::measure_reset_update(const std::vector &qubits, #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) for(i=0;i::measure_reset_update(const std::vector &qubits, #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) for(i=0;i 0){ //in chunk exchange + const size_t dim_in = 1ULL << qubits_in_chunk.size(); + // build vectorized permutation matrix + cvector_t perm(dim_in * dim_in, 0.); + perm[final_state * dim_in + meas_state] = 1.; + perm[meas_state * dim_in + final_state] = 1.; + for (size_t j=0; j < dim_in; j++) { + if (j != final_state && j != meas_state) + perm[j * dim_in + j] = 1.; + } + // apply permutation to swap state #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i 0){ //out of chunk exchange + for(i=0;i& waiting_ops) const; bool split_pauli(const Operations::Op& op, const reg_t blockedQubits, std::vector& out,std::vector& queue) const; + + bool is_blockable_operation(Operations::Op& op) const; }; void CacheBlocking::set_config(const json_t &config) @@ -281,19 +289,24 @@ bool CacheBlocking::can_reorder(Operations::Op& op,std::vector& //check if the operation can be reordered in front of waiting queue uint_t j,iq,jq; - //only gate and matrix can reorder + //only gate and matrix can be reordered if(op.type != Operations::OpType::gate && op.type != Operations::OpType::matrix){ return false; } for(j=0;j queue_next; n = add_ops(circ.ops,out,queue,doSwap,true); - put_nongate_ops(out,queue_next,queue,doSwap); - queue.clear(); - while(queue_next.size() > 0){ - n = add_ops(queue_next,out,queue,doSwap,false); +// put_nongate_ops(out,queue_next,queue,doSwap); +// queue.clear(); + while(queue.size() > 0){ + n = add_ops(queue,out,queue_next,doSwap,false); + queue = queue_next; queue_next.clear(); - put_nongate_ops(out,queue_next,queue,doSwap); +// put_nongate_ops(out,queue_next,queue,doSwap); if(n == 0){ break; } - queue.clear(); +// queue.clear(); } if(queue.size() > 0) return false; + +// restore_qubits_order(out); + circ.ops = out; return true; } @@ -421,6 +438,15 @@ void CacheBlocking::restore_qubits_order(std::vector& ops) const }while(count != 0); } +bool CacheBlocking::is_blockable_operation(Operations::Op& op) const +{ + if(op.type == Operations::OpType::gate || op.type == Operations::OpType::matrix || + op.type == Operations::OpType::diagonal_matrix || op.type == Operations::OpType::multiplexer){ + return true; + } + return false; +} + uint_t CacheBlocking::add_ops(std::vector& ops,std::vector& out,std::vector& queue,bool doSwap,bool first) const { uint_t i,j,iq; @@ -430,6 +456,7 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector& ops,std::vector 0 && !end_block_inserted){ //insert end of block to synchronize chunks + insert_sim_op(out,"end_blocking",blockedQubits); + } + else if(!end_block_inserted){ + out.pop_back(); + } + if(ops[i].type == Operations::OpType::kraus){ + if(ops[i].qubits.size() >= block_bits_){ + throw std::runtime_error("CacheBlocking : kraus number of qubits should be smaller than chunk qubit size"); + break; + } + if(!can_block(ops[i],blockedQubits)){ //if some qubits are out of chunk, queued for next step + queue.push_back(ops[i]); + continue; + } + } + if(!end_block_inserted){ + if(sample_measure_ && ops[i].type == Operations::OpType::measure){ + //currently sampling should be done with original qubit mapping (TO DO : sampling without inserting swaps) + restore_qubits_order(out); + } + else if(ops[i].type == Operations::OpType::save_statevec || ops[i].type == Operations::OpType::save_statevec_dict || ops[i].type == Operations::OpType::save_densmat || ops[i].type == Operations::OpType::save_unitary){ + restore_qubits_order(out); + } + } + //mapping swapped qubits + for(iq=0;iq 0){ - insert_sim_op(out,"end_blocking",blockedQubits); - } - else{ - //pop unnecessary operations - while(out.size() > pos_begin){ - out.pop_back(); + if(!end_block_inserted){ + if(num_gates_added > 0){ + insert_sim_op(out,"end_blocking",blockedQubits); + } + else{ + //pop unnecessary operations + while(out.size() > pos_begin){ + out.pop_back(); + } } } } @@ -633,6 +703,9 @@ bool CacheBlocking::is_cross_qubits_op(Operations::Op& op) const if(op.qubits.size() > 1) return true; } + if(op.type == Operations::OpType::kraus){ + return true; + } return false; } @@ -721,6 +794,7 @@ bool CacheBlocking::split_pauli(const Operations::Op& op,const reg_t blockedQubi return false; } + //------------------------------------------------------------------------- } // end namespace Transpile } // end namespace AER From 6c00af26653d857c478bdb2959183d7c59ed3311 Mon Sep 17 00:00:00 2001 From: doichanj Date: Mon, 29 Mar 2021 16:58:45 +0900 Subject: [PATCH 02/10] add save_state operation to qubit restoring --- src/transpile/cacheblocking.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transpile/cacheblocking.hpp b/src/transpile/cacheblocking.hpp index 3b9b420976..9fa69a47b6 100644 --- a/src/transpile/cacheblocking.hpp +++ b/src/transpile/cacheblocking.hpp @@ -567,14 +567,14 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector Date: Wed, 31 Mar 2021 18:33:17 +0900 Subject: [PATCH 03/10] fix multi-chunk unitary simulator --- .../aer/backends/unitary_simulator.py | 2 + src/controllers/statevector_controller.hpp | 1 + src/controllers/unitary_controller.hpp | 1 + .../unitary/unitary_state_chunk.hpp | 248 ++++++++++++++---- .../unitary/unitarymatrix_thrust.hpp | 21 +- src/transpile/cacheblocking.hpp | 47 ++-- 6 files changed, 225 insertions(+), 95 deletions(-) diff --git a/qiskit/providers/aer/backends/unitary_simulator.py b/qiskit/providers/aer/backends/unitary_simulator.py index ea74c57486..828822bb9d 100644 --- a/qiskit/providers/aer/backends/unitary_simulator.py +++ b/qiskit/providers/aer/backends/unitary_simulator.py @@ -186,6 +186,8 @@ def _default_options(cls): fusion_verbose=False, fusion_max_qubit=5, fusion_threshold=14, + blocking_qubits=None, + blocking_enable=False, # statevector options statevector_parallel_threshold=14) diff --git a/src/controllers/statevector_controller.hpp b/src/controllers/statevector_controller.hpp index db5c9a9cfe..cc67fd08c2 100755 --- a/src/controllers/statevector_controller.hpp +++ b/src/controllers/statevector_controller.hpp @@ -356,6 +356,7 @@ void StatevectorController::run_circuit_helper( } Transpile::CacheBlocking cache_block_pass = transpile_cache_blocking(opt_circ,dummy_noise,config,(precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex),false); + cache_block_pass.set_save_state(true); cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); uint_t block_bits = 0; diff --git a/src/controllers/unitary_controller.hpp b/src/controllers/unitary_controller.hpp index f54f52d5b2..b73492aaca 100755 --- a/src/controllers/unitary_controller.hpp +++ b/src/controllers/unitary_controller.hpp @@ -357,6 +357,7 @@ void UnitaryController::run_circuit_helper( } Transpile::CacheBlocking cache_block_pass = transpile_cache_blocking(opt_circ,dummy_noise,config,(precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex),true); + cache_block_pass.set_save_state(true); cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); uint_t block_bits = 0; diff --git a/src/simulators/unitary/unitary_state_chunk.hpp b/src/simulators/unitary/unitary_state_chunk.hpp index a0276cc7d1..58fdd1429c 100644 --- a/src/simulators/unitary/unitary_state_chunk.hpp +++ b/src/simulators/unitary/unitary_state_chunk.hpp @@ -35,7 +35,8 @@ const Operations::OpSet StateOpSet( // Op types {Operations::OpType::gate, Operations::OpType::barrier, Operations::OpType::matrix, Operations::OpType::diagonal_matrix, - Operations::OpType::snapshot, Operations::OpType::save_unitary}, + Operations::OpType::snapshot, Operations::OpType::save_unitary, + Operations::OpType::save_state }, // Gates {"u1", "u2", "u3", "u", "U", "CX", "cx", "cz", "cy", "cp", "cu1", "cu2", "cu3", "swap", "id", "p", @@ -96,6 +97,7 @@ class State : public Base::StateChunk { void initialize_omp(); auto move_to_matrix(); + auto copy_to_matrix(); protected: //----------------------------------------------------------------------- @@ -106,6 +108,9 @@ class State : public Base::StateChunk { RngEngine &rng, bool final_ops = false) override; + //swap between chunks + virtual void apply_chunk_swap(const reg_t &qubits) override; + // Applies a Gate operation to the state class. // This should support all and only the operations defined in // allowed_operations. @@ -131,6 +136,8 @@ class State : public Base::StateChunk { // Optimize phase gate with diagonal [1, phase] void apply_gate_phase(const uint_t iChunk,const uint_t qubit, const complex_t phase); + void apply_gate_phase(const uint_t iChunk,const reg_t& qubits, const complex_t phase); + //----------------------------------------------------------------------- // Multi-controlled u3 //----------------------------------------------------------------------- @@ -144,6 +151,12 @@ class State : public Base::StateChunk { //----------------------------------------------------------------------- // Save data instructions //----------------------------------------------------------------------- + // Save the unitary matrix for the simulator + void apply_save_unitary(const Operations::Op &op, + ExperimentResult &result, + bool last_op); + + auto apply_to_matrix(bool copy = false); // Helper function for computing expectation value virtual double expval_pauli(const reg_t &qubits, @@ -186,6 +199,10 @@ void State::apply_op(const int_t iChunk,const Operations::Op & if (BaseState::creg_.check_conditional(op)) apply_gate(iChunk,op); break; + case Operations::OpType::save_state: + case Operations::OpType::save_unitary: + apply_save_unitary(op, result, final_ops); + break; case Operations::OpType::snapshot: apply_snapshot(op, result); break; @@ -201,6 +218,43 @@ void State::apply_op(const int_t iChunk,const Operations::Op & } } +//swap between chunks +template +void State::apply_chunk_swap(const reg_t &qubits) +{ + uint_t q0,q1; + q0 = qubits[0]; + q1 = qubits[1]; + + std::swap(BaseState::qubit_map_[q0],BaseState::qubit_map_[q1]); + + if(qubits[0] >= BaseState::chunk_bits_){ + q0 += BaseState::chunk_bits_; + } + if(qubits[1] >= BaseState::chunk_bits_){ + q1 += BaseState::chunk_bits_; + } + reg_t qs0 = {{q0, q1}}; + BaseState::apply_chunk_swap(qs0); + + /* + if(qubits[0] >= BaseState::chunk_bits_){ + q0 += (BaseState::num_qubits_ - BaseState::chunk_bits_); + } + else{ + q0 += BaseState::chunk_bits_; + } + if(qubits[1] >= BaseState::chunk_bits_){ + q1 += (BaseState::num_qubits_ - BaseState::chunk_bits_); + } + else{ + q1 += BaseState::chunk_bits_; + } + reg_t qs1 = {{q0, q1}}; + BaseState::apply_chunk_swap(qs1); + */ +} + template size_t State::required_memory_mb( uint_t num_qubits, const std::vector &ops) const { @@ -230,29 +284,29 @@ void State::set_config(const json_t &config) { template void State::initialize_qreg(uint_t num_qubits) { - int_t i; + int_t iChunk; initialize_omp(); if(BaseState::chunk_bits_ == BaseState::num_qubits_){ - for(i=0;inum_qubits_ == this->chunk_bits_){ - BaseState::qregs_[i].initialize(); - } - else{ - BaseState::qregs_[i].zero(); - } +#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + + BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); + if(irow == icol) + BaseState::qregs_[iChunk].initialize(); + else + BaseState::qregs_[iChunk].zero(); } } @@ -275,18 +329,30 @@ void State::initialize_qreg(uint_t num_qubits, if(BaseState::chunk_bits_ == BaseState::num_qubits_){ for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = input[idx]; + } + BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); - BaseState::qregs_[iChunk].initialize_from_data(unitary.data() + local_offset + (iChunk << BaseState::chunk_bits_*2), 1ull << BaseState::chunk_bits_*2); + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -309,28 +375,31 @@ void State::initialize_qreg(uint_t num_qubits, if(BaseState::chunk_bits_ == BaseState::num_qubits_){ for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + //copy part of state for this chunk - uint_t i; - cvector_t tmp(1ull << BaseState::chunk_bits_*2); + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - tmp[i] = unitary[local_offset + (iChunk << BaseState::chunk_bits_*2) + i]; + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = unitary[idx]; } BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } - apply_global_phase(); } @@ -351,6 +420,21 @@ auto State::move_to_matrix() if(BaseState::num_global_chunks_ == 1){ return BaseState::qregs_[0].move_to_matrix(); } + return apply_to_matrix(false); +} + +template +auto State::copy_to_matrix() +{ + if(BaseState::num_global_chunks_ == 1){ + return BaseState::qregs_[0].copy_to_matrix(); + } + return apply_to_matrix(true); +} + +template +auto State::apply_to_matrix(bool copy) +{ int_t iChunk; uint_t size = 1ull << (BaseState::chunk_bits_*2); uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; @@ -369,25 +453,41 @@ auto State::move_to_matrix() BaseState::recv_data(recv.data(),size,0,iChunk); int_t i; - uint_t offset = iChunk << (BaseState::chunk_bits_*2); + uint_t irow_chunk = ((iChunk + BaseState::global_chunk_index_) >> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); #pragma omp parallel for if(num_threads > 1) num_threads(num_threads) for(i=0;i> (BaseState::chunk_bits_); - uint_t icol = i & mask; - matrix[offset+i] = recv(icol,irow); + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + matrix[idx] = recv[i]; } } #endif for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + if(copy){ + auto tmp = BaseState::qregs_[iChunk].copy_to_matrix(); #pragma omp parallel for if(num_threads > 1) num_threads(num_threads) - for(i=0;i> (BaseState::chunk_bits_); - uint_t icol = i & mask; - matrix[offset+i] = tmp(icol,irow); + for(i=0;i> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + matrix[idx] = tmp[i]; + } + } + else{ + auto tmp = BaseState::qregs_[iChunk].move_to_matrix(); +#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) + for(i=0;i> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + matrix[idx] = tmp[i]; + } } } } @@ -397,8 +497,14 @@ auto State::move_to_matrix() for(iChunk=0;iChunk::apply_gate(const uint_t iChunk,const Operations::O break; case QubitUnitary::Gates::mcz: // Includes Z, CZ, CCZ, etc - BaseState::qregs_[iChunk].apply_mcphase(op.qubits, -1); + apply_gate_phase(iChunk,op.qubits, -1); break; case QubitUnitary::Gates::mcr: BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); @@ -494,8 +600,7 @@ void State::apply_gate(const uint_t iChunk,const Operations::O break; case QubitUnitary::Gates::mcp: // Includes u1, cu1, p, cp, mcp, etc - BaseState::qregs_[iChunk].apply_mcphase(op.qubits, - std::exp(complex_t(0, 1) * op.params[0])); + apply_gate_phase(iChunk,op.qubits, std::exp(complex_t(0, 1) * op.params[0])); break; case QubitUnitary::Gates::mcsx: // Includes sx, csx, mcsx etc @@ -532,7 +637,14 @@ void State::apply_diagonal_matrix(const uint_t iChunk, const r { if(BaseState::gpu_optimization_){ //GPU computes all chunks in one kernel, so pass qubits and diagonal matrix as is - BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits,diag); + reg_t qubits_chunk = qubits; + for(uint_t i;i= BaseState::chunk_bits_){ + qubits_chunk[i] += BaseState::chunk_bits_; + } + } + + BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits_chunk,diag); } else{ reg_t qubits_in = qubits; @@ -544,11 +656,19 @@ void State::apply_diagonal_matrix(const uint_t iChunk, const r } template -void State::apply_gate_phase(const uint_t iChunk,uint_t qubit, complex_t phase) { - cmatrix_t diag(1, 2); - diag(0, 0) = 1.0; - diag(0, 1) = phase; - apply_matrix(iChunk,reg_t({qubit}), diag); +void State::apply_gate_phase(const uint_t iChunk,const uint_t qubit, complex_t phase) { + cvector_t diag(2); + diag[0] = 1.0; + diag[1] = phase; + apply_diagonal_matrix(iChunk,reg_t({qubit}), diag); +} + +template +void State::apply_gate_phase(const uint_t iChunk,const reg_t& qubits, complex_t phase) +{ + cvector_t diag((1 << qubits.size()),1.0); + diag[(1 << qubits.size()) - 1] = phase; + apply_diagonal_matrix(iChunk,qubits, diag); } template @@ -563,8 +683,8 @@ void State::apply_snapshot(const Operations::Op &op, ExperimentResult &result) { // Look for snapshot type in snapshotset if (op.name == "unitary" || op.name == "state") { - auto matrix = move_to_matrix(); - if(BaseState::myrank_ == 0){ + auto matrix = copy_to_matrix(); + if(BaseState::distributed_rank_ == 0){ result.legacy_data.add_pershot_snapshot("unitary", op.string_params[0], matrix); } @@ -586,6 +706,30 @@ void State::apply_global_phase() { } } + +template +void State::apply_save_unitary(const Operations::Op &op, + ExperimentResult &result, + bool last_op) +{ + if (op.qubits.size() != BaseState::num_qubits_) { + throw std::invalid_argument( + op.name + " was not applied to all qubits." + " Only the full unitary can be saved."); + } + std::string key = (op.string_params[0] == "_method_") ? "unitary" : op.string_params[0]; + + if (last_op) { + BaseState::save_data_pershot(result, key, + move_to_matrix(), + op.save_type); + } else { + BaseState::save_data_pershot(result, key, + copy_to_matrix(), + op.save_type); + } +} + template double State::expval_pauli(const reg_t &qubits, const std::string& pauli) { diff --git a/src/simulators/unitary/unitarymatrix_thrust.hpp b/src/simulators/unitary/unitarymatrix_thrust.hpp index 2dcceea8da..300f5e9e6f 100755 --- a/src/simulators/unitary/unitarymatrix_thrust.hpp +++ b/src/simulators/unitary/unitarymatrix_thrust.hpp @@ -208,10 +208,7 @@ matrix> UnitaryMatrixThrust::copy_to_matrix() const uint_t irow, icol; #pragma omp parallel for private(i,irow,icol) if (BaseVector::num_qubits_ > BaseVector::omp_threshold_ && BaseVector::omp_threads_ > 1) num_threads(BaseVector::omp_threads_) for (i = 0; i < csize; i++) { - irow = (i >> num_qubits_); - icol = i - (irow << num_qubits_); - - ret(icol, irow) = qreg[i]; + ret[i] = qreg[i]; } return ret; } @@ -229,17 +226,13 @@ void UnitaryMatrixThrust::initialize() BaseVector::zero(); // Set to be identity matrix - uint_t is,ie,idx; + uint_t idx; int_t i; - is = BaseVector::chunk_index_ << BaseVector::num_qubits_; - ie = is + (1ull << BaseVector::num_qubits_); #pragma omp parallel private(idx) if (BaseVector::num_qubits_ > BaseVector::omp_threshold_ && BaseVector::omp_threads_ > 1) num_threads(BaseVector::omp_threads_) for(i=0;i= is && idx < ie){ - BaseVector::set_state(idx-is,one); - } + BaseVector::set_state(idx,one); } } @@ -262,15 +255,15 @@ void UnitaryMatrixThrust::initialize_from_matrix(const matrix tmp(BaseVector::data_size_); - int_t i; + matrix> tmp(nrows,nrows); #pragma omp parallel for if (BaseVector::num_qubits_ > BaseVector::omp_threshold_ && BaseVector::omp_threads_ > 1) num_threads(BaseVector::omp_threads_) for (int_t row = 0; row < nrows; ++row) for (int_t col = 0; col < nrows; ++col) { - tmp[row + nrows * col] = mat(row, col); + tmp(row,col) = mat(row, col); } - BaseVector::initialize_from_vector(tmp); + + BaseVector::initialize_from_data(tmp.data(), tmp.size()); } template diff --git a/src/transpile/cacheblocking.hpp b/src/transpile/cacheblocking.hpp index 9fa69a47b6..9eda2cd467 100644 --- a/src/transpile/cacheblocking.hpp +++ b/src/transpile/cacheblocking.hpp @@ -68,6 +68,10 @@ class CacheBlocking : public CircuitOptimization { { sample_measure_ = enabled; } + void set_save_state(bool enabled) + { + save_state_ = enabled; + } //setting blocking parameters automatically void set_blocking(int bits, size_t min_memory, uint_t n_place, const size_t complex_size, bool is_matrix = false); @@ -79,6 +83,7 @@ class CacheBlocking : public CircuitOptimization { mutable reg_t qubitSwapped_; mutable bool blocking_enabled_; mutable bool sample_measure_ = false; + mutable bool save_state_ = false; int gpu_blocking_bits_; bool block_circuit(Circuit& circ,bool doSwap) const; @@ -319,51 +324,25 @@ bool CacheBlocking::block_circuit(Circuit& circ,bool doSwap) const std::vector queue_next; n = add_ops(circ.ops,out,queue,doSwap,true); -// put_nongate_ops(out,queue_next,queue,doSwap); -// queue.clear(); while(queue.size() > 0){ n = add_ops(queue,out,queue_next,doSwap,false); queue = queue_next; queue_next.clear(); -// put_nongate_ops(out,queue_next,queue,doSwap); if(n == 0){ break; } -// queue.clear(); } if(queue.size() > 0) return false; -// restore_qubits_order(out); + if(save_state_) + restore_qubits_order(out); circ.ops = out; return true; } -void CacheBlocking::put_nongate_ops(std::vector& out,std::vector& queue,std::vector& input,bool doSwap) const -{ - uint_t i; - for(i=0;i& ops) const { uint_t i,j,t; @@ -401,6 +380,16 @@ void CacheBlocking::restore_qubits_order(std::vector& ops) const if(qubitMap_[i] != i){ j = qubitMap_[qubitMap_[i]]; if(j != i && j < block_bits_){ + if(nInBlock == 0){ + uint_t last = ops.size() - 1; + if(ops[last].type == Operations::OpType::sim_op && ops[last].name == "end_blocking"){ + ops.pop_back(); + nInBlock = 1; + } + else{ + insert_sim_op(ops,"begin_blocking",qubitMap_); + } + } insert_swap(ops,i,j,false); qubitMap_[qubitSwapped_[i]] = j; @@ -508,7 +497,7 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector 0){ //swap gate is not required for initial state + if(!first){ //swap gate is not required for initial state insert_swap(out,swap[i],qubitMap_[blockedQubits[i]],true); } From 5be23825044f9343f741c769ed0ef244b59c5a78 Mon Sep 17 00:00:00 2001 From: doichanj Date: Wed, 31 Mar 2021 18:38:17 +0900 Subject: [PATCH 04/10] remove comment --- src/simulators/unitary/unitary_state_chunk.hpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/simulators/unitary/unitary_state_chunk.hpp b/src/simulators/unitary/unitary_state_chunk.hpp index 58fdd1429c..683ef85883 100644 --- a/src/simulators/unitary/unitary_state_chunk.hpp +++ b/src/simulators/unitary/unitary_state_chunk.hpp @@ -236,23 +236,6 @@ void State::apply_chunk_swap(const reg_t &qubits) } reg_t qs0 = {{q0, q1}}; BaseState::apply_chunk_swap(qs0); - - /* - if(qubits[0] >= BaseState::chunk_bits_){ - q0 += (BaseState::num_qubits_ - BaseState::chunk_bits_); - } - else{ - q0 += BaseState::chunk_bits_; - } - if(qubits[1] >= BaseState::chunk_bits_){ - q1 += (BaseState::num_qubits_ - BaseState::chunk_bits_); - } - else{ - q1 += BaseState::chunk_bits_; - } - reg_t qs1 = {{q0, q1}}; - BaseState::apply_chunk_swap(qs1); - */ } template From 6a1c1a3f51cb0ceea4c40ed88b6da48b28a0b8fb Mon Sep 17 00:00:00 2001 From: doichanj Date: Wed, 31 Mar 2021 18:49:55 +0900 Subject: [PATCH 05/10] remove comment --- src/simulators/unitary/unitary_state_chunk.hpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/simulators/unitary/unitary_state_chunk.hpp b/src/simulators/unitary/unitary_state_chunk.hpp index 58fdd1429c..683ef85883 100644 --- a/src/simulators/unitary/unitary_state_chunk.hpp +++ b/src/simulators/unitary/unitary_state_chunk.hpp @@ -236,23 +236,6 @@ void State::apply_chunk_swap(const reg_t &qubits) } reg_t qs0 = {{q0, q1}}; BaseState::apply_chunk_swap(qs0); - - /* - if(qubits[0] >= BaseState::chunk_bits_){ - q0 += (BaseState::num_qubits_ - BaseState::chunk_bits_); - } - else{ - q0 += BaseState::chunk_bits_; - } - if(qubits[1] >= BaseState::chunk_bits_){ - q1 += (BaseState::num_qubits_ - BaseState::chunk_bits_); - } - else{ - q1 += BaseState::chunk_bits_; - } - reg_t qs1 = {{q0, q1}}; - BaseState::apply_chunk_swap(qs1); - */ } template From b787607759580263cc42fa53e81da13f2b3ae1a1 Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Thu, 15 Apr 2021 17:18:03 +0900 Subject: [PATCH 06/10] fix initialize/reset, implement set operators --- src/controllers/aer_controller.hpp | 83 +++-- src/controllers/unitary_controller.hpp | 71 ++-- .../densitymatrix_state_chunk.hpp | 340 ++++++++++-------- .../density_matrix/densitymatrix_thrust.hpp | 138 ++++++- src/simulators/state_chunk.hpp | 160 ++++++++- src/simulators/statevector/qubitvector.hpp | 17 +- .../statevector/qubitvector_thrust.hpp | 49 ++- .../statevector/statevector_state_chunk.hpp | 282 +++++++++++---- src/simulators/unitary/unitary_state.hpp | 6 + .../unitary/unitary_state_chunk.hpp | 118 ++---- src/transpile/cacheblocking.hpp | 123 ++++++- 11 files changed, 995 insertions(+), 392 deletions(-) diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 35ff6e4085..377bdc90df 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -64,6 +64,7 @@ #include "simulators/statevector/statevector_state_chunk.hpp" #include "simulators/superoperator/superoperator_state.hpp" #include "simulators/unitary/unitary_state.hpp" +#include "simulators/unitary/unitary_state_chunk.hpp" namespace AER { @@ -1393,33 +1394,67 @@ void Controller::run_circuit(const Circuit &circ, } case Method::unitary: { if (sim_device_ == Device::CPU) { - if (sim_precision_ == Precision::Double) { - // Double-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, Method::unitary, - false, result); - } else { - // Single-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, Method::unitary, - false, result); + if (multiple_chunk_required(circ, noise)) { + if (sim_precision_ == Precision::Double) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } + } + else{ + if (sim_precision_ == Precision::Double) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } } } else { #ifdef AER_THRUST_SUPPORTED - if (sim_precision_ == Precision::Double) { - // Double-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, Method::unitary, - false, result); - } else { - // Single-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, Method::unitary, - false, result); + if (multiple_chunk_required(circ, noise)) { + if (sim_precision_ == Precision::Double) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } + } + else{ + if (sim_precision_ == Precision::Double) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, Method::unitary, + false, result); + } } #endif } diff --git a/src/controllers/unitary_controller.hpp b/src/controllers/unitary_controller.hpp index 36aa7baf9c..bdd1413634 100755 --- a/src/controllers/unitary_controller.hpp +++ b/src/controllers/unitary_controller.hpp @@ -17,6 +17,7 @@ #include "controller.hpp" #include "simulators/unitary/unitary_state.hpp" +#include "simulators/unitary/unitary_state_chunk.hpp" #include "transpile/fusion.hpp" namespace AER { @@ -196,30 +197,60 @@ void UnitaryController::run_circuit(const Circuit &circ, switch (method_) { case Method::automatic: case Method::unitary_cpu: { - if (precision_ == Precision::double_precision) { - // Double-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>(circ, noise, config, - shots, rng_seed, result); - } else { - // Single-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>(circ, noise, config, - shots, rng_seed, result); + if(Base::Controller::multiple_chunk_required(circ,noise)){ + if (precision_ == Precision::double_precision) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>(circ, noise, config, + shots, rng_seed, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>(circ, noise, config, + shots, rng_seed, result); + } + } + else{ + if (precision_ == Precision::double_precision) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>(circ, noise, config, + shots, rng_seed, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>(circ, noise, config, + shots, rng_seed, result); + } } } case Method::unitary_thrust_gpu: { #ifdef AER_THRUST_CUDA - if (precision_ == Precision::double_precision) { - // Double-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, result); - } else { - // Single-precision unitary simulation - return run_circuit_helper< - QubitUnitary::State>>( - circ, noise, config, shots, rng_seed, result); + if(Base::Controller::multiple_chunk_required(circ,noise)){ + if (precision_ == Precision::double_precision) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitaryChunk::State>>( + circ, noise, config, shots, rng_seed, result); + } + } + else{ + if (precision_ == Precision::double_precision) { + // Double-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, result); + } else { + // Single-precision unitary simulation + return run_circuit_helper< + QubitUnitary::State>>( + circ, noise, config, shots, rng_seed, result); + } } #else throw std::runtime_error( diff --git a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp index 9b6eee2e77..89e546345a 100644 --- a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp +++ b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp @@ -40,11 +40,11 @@ const Operations::OpSet StateOpSet( OpType::barrier, OpType::bfunc, OpType::roerror, OpType::matrix, OpType::diagonal_matrix, OpType::kraus, - OpType::superop, OpType::save_expval, + OpType::superop, OpType::set_statevec, + OpType::set_densmat, OpType::save_expval, OpType::save_expval_var, OpType::save_densmat, - OpType::save_state, OpType::save_probs, OpType::save_probs_ket, - OpType::save_amps_sq + OpType::save_amps_sq, OpType::save_state }, // Gates {"U", "CX", "u1", "u2", "u3", "u", "cx", "cy", "cz", @@ -116,7 +116,9 @@ class State : public Base::StateChunk { auto move_to_matrix(); auto copy_to_matrix(); protected: - auto apply_to_matrix(bool copy = false); + + template + void initialize_from_vector(const list_t &vec); //----------------------------------------------------------------------- // Apply instructions @@ -143,7 +145,7 @@ class State : public Base::StateChunk { RngEngine &rng); // Reset the specified qubits to the |0> state by tracing out qubits - void apply_reset(const reg_t &qubits); + void apply_reset(const int_t iChunk, const reg_t &qubits); // Apply a supported snapshot instruction // If the input is not in allowed_snapshots an exeption will be raised. @@ -166,6 +168,10 @@ class State : public Base::StateChunk { // Apply an N-qubit Pauli gate void apply_pauli(const reg_t &qubits, const std::string &pauli); + // apply phase + void apply_phase(const uint_t iChunk,const uint_t qubit, const complex_t phase); + void apply_phase(const uint_t iChunk,const reg_t& qubits, const complex_t phase); + //----------------------------------------------------------------------- // Save data instructions //----------------------------------------------------------------------- @@ -279,12 +285,22 @@ class State : public Base::StateChunk { return 2; } + virtual bool is_applied_to_each_chunk(const Operations::Op &op); }; //========================================================================= // Implementation: Base class method overrides //========================================================================= +template +bool State::is_applied_to_each_chunk(const Operations::Op &op) +{ + if(op.type == Operations::OpType::reset){ + return true; + } + return BaseState::is_applied_to_each_chunk(op); +} + //------------------------------------------------------------------------- // Initialization //------------------------------------------------------------------------- @@ -303,10 +319,13 @@ void State::initialize_qreg(uint_t num_qubits) } } else{ //multi-chunk distribution + for(i=0;inum_qubits_ == this->chunk_bits_){ BaseState::qregs_[i].initialize(); } @@ -335,6 +354,11 @@ void State::initialize_qreg(uint_t num_qubits, } } else{ //multi-chunk distribution + for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, //copy part of state for this chunk uint_t i,row,col; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); uint_t irow = i >> (BaseState::chunk_bits_); tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; } - - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -374,6 +396,10 @@ void State::initialize_qreg(uint_t num_qubits, } } else{ //multi-chunk distribution + for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, //copy part of state for this chunk uint_t i,row,col; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); uint_t irow = i >> (BaseState::chunk_bits_); tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; } - - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -413,6 +437,10 @@ void State::initialize_qreg(uint_t num_qubits, } } else{ //multi-chunk distribution + for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, //copy part of state for this chunk uint_t i,row,col; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); uint_t irow = i >> (BaseState::chunk_bits_); tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; } - - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -446,13 +472,55 @@ void State::initialize_omp() } } +template +template +void State::initialize_from_vector(const list_t &vec) +{ + if((1ull << (BaseState::num_qubits_*2)) == vec.size()){ + BaseState::initialize_from_vector(vec); + } + else if((1ull << (BaseState::num_qubits_*2)) == vec.size() * vec.size()) { + int_t iChunk; + if(BaseState::chunk_bits_ == BaseState::num_qubits_){ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + list_t vec1(1ull << BaseState::chunk_bits_); + list_t vec2(1ull << BaseState::chunk_bits_); + + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + } + BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); + } + } + } + else { + throw std::runtime_error("DensityMatrixChunk::initialize input vector is incorrect length. Expected: " + + std::to_string((1ull << (BaseState::num_qubits_*2))) + " Received: " + + std::to_string(vec.size())); + } +} + + template auto State::move_to_matrix() { if(BaseState::num_global_chunks_ == 1){ return BaseState::qregs_[0].move_to_matrix(); } - return apply_to_matrix(false); + return BaseState::apply_to_matrix(false); } template @@ -461,86 +529,9 @@ auto State::copy_to_matrix() if(BaseState::num_global_chunks_ == 1){ return BaseState::qregs_[0].copy_to_matrix(); } - return apply_to_matrix(true); + return BaseState::apply_to_matrix(true); } -template -auto State::apply_to_matrix(bool copy) -{ - int_t iChunk; - uint_t size = 1ull << (BaseState::chunk_bits_*2); - uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; - uint_t num_threads = BaseState::qregs_[0].get_omp_threads(); - - auto matrix = BaseState::qregs_[0].copy_to_matrix(); - - if(BaseState::distributed_rank_ == 0){ - //TO DO check memory availability - matrix.resize(1ull << (BaseState::num_qubits_),1ull << (BaseState::num_qubits_)); - -#ifdef AER_MPI - auto recv = BaseState::qregs_[0].copy_to_matrix(); - //gather states from other processes - for(iChunk=BaseState::num_local_chunks_;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); -#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) - for(i=0;i> (BaseState::chunk_bits_); - uint_t icol = i & mask; - matrix(icol_chunk+icol,irow_chunk+irow) = recv(icol,irow); - } - } -#endif - - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - if(copy){ - auto tmp = BaseState::qregs_[iChunk].copy_to_matrix(); -#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) - for(i=0;i> (BaseState::chunk_bits_); - uint_t icol = i & mask; - matrix(icol_chunk+icol,irow_chunk+irow) = tmp(icol,irow); - } - } - else{ - auto tmp = BaseState::qregs_[iChunk].move_to_matrix(); -#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) - for(i=0;i> (BaseState::chunk_bits_); - uint_t icol = i & mask; - matrix(icol_chunk+icol,irow_chunk+irow) = tmp(icol,irow); - } - } - } - } - else{ -#ifdef AER_MPI - //send matrices to process 0 - for(iChunk=0;iChunk::apply_op(const int_t iChunk,const Operations::Op &op, case Operations::OpType::barrier: break; case Operations::OpType::reset: - apply_reset(op.qubits); + apply_reset(iChunk,op.qubits); break; case Operations::OpType::measure: apply_measure(op.qubits, op.memory, op.registers, rng); @@ -617,6 +608,15 @@ void State::apply_op(const int_t iChunk,const Operations::Op &op, case Operations::OpType::superop: BaseState::qregs_[iChunk].apply_superop_matrix(op.qubits, Utils::vectorize_matrix(op.mats[0])); break; + case OpType::kraus: + apply_kraus(op.qubits, op.mats); + break; + case OpType::set_statevec: + initialize_from_vector(op.params); + break; + case OpType::set_densmat: + BaseState::initialize_from_matrix(op.mats[0]); + break; case Operations::OpType::save_expval: case Operations::OpType::save_expval_var: BaseState::apply_save_expval(op, result); @@ -991,54 +991,71 @@ template cmatrix_t State::reduced_density_matrix_helper(const reg_t &qubits, const reg_t &qubits_sorted) { - // Get superoperator qubits - const reg_t squbits = BaseState::qregs_[0].superop_qubits(qubits); - const reg_t squbits_sorted = BaseState::qregs_[0].superop_qubits(qubits_sorted); - - // Get dimensions - const size_t N = qubits.size(); - const size_t DIM = 1ULL << N; - const size_t VDIM = 1ULL << (2 * N); - const size_t END = 1ULL << (BaseState::qregs_[0].num_qubits() - N); - const size_t SHIFT = END + 1; - - // TODO: If we are not going to apply any additional instructions after - // this function we could move the memory when constructing rather - // than copying int_t iChunk; - auto vmat = BaseState::qregs_[0].vector(); + uint_t size = 1ull << (BaseState::chunk_bits_*2); + uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; + uint_t num_threads = BaseState::qregs_[0].get_omp_threads(); //TO DO check memory availability - vmat.resize(BaseState::num_local_chunks_ << (BaseState::chunk_bits_*2)); - -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=1;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << BaseState::chunk_bits_; + uint_t icol_chunk = (iChunk & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << BaseState::chunk_bits_; + if(iChunk < BaseState::num_local_chunks_) + tmp = BaseState::qregs_[iChunk].copy_to_matrix(); #ifdef AER_MPI - BaseState::gather_state(vmat); + else + recv_data(tmp.data(),size,0,iChunk); #endif - - cmatrix_t reduced_state(DIM, DIM, false); - { - // Fill matrix with first iteration - const auto inds = QV::indexes(squbits, squbits_sorted, 0); - for (int_t i = 0; i < VDIM; ++i) { - reduced_state[i] = complex_t(vmat[inds[i]]); +#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) + for(i=0;i> (BaseState::chunk_bits_)) + irow_chunk; + uint_t icol = (i & mask) + icol_chunk; + uint_t irow_out = 0; + uint_t icol_out = 0; + int j; + for(j=0;j> qubits[j]) & 1){ + irow &= ~(1ull << qubits[j]); + irow_out += (1ull << j); + } + if((icol >> qubits[j]) & 1){ + icol &= ~(1ull << qubits[j]); + icol_out += (1ull << j); + } + } + if(irow == icol){ //only diagonal base can be reduced + uint_t idx = ((irow_out) << qubits.size()) + icol_out; +#pragma omp critical + reduced_state[idx] += tmp[i]; + } + } } } - // Accumulate with remaning blocks - for (size_t k = 1; k < END; k++) { - const auto inds = QV::indexes(squbits, squbits_sorted, k * SHIFT); - for (int_t i = 0; i < VDIM; ++i) { - reduced_state[i] += complex_t(vmat[inds[i]]); + else{ +#ifdef AER_MPI + //send matrices to process 0 + for(iChunk=0;iChunk::apply_gate(const uint_t iChunk, const Operations::Op &op) std::real(op.params[1])); break; case DensityMatrix::Gates::u1: - BaseState::qregs_[iChunk].apply_phase(op.qubits[0], std::exp(complex_t(0., 1.) * op.params[0])); + apply_phase(iChunk,op.qubits[0], std::exp(complex_t(0., 1.) * op.params[0])); break; case DensityMatrix::Gates::cx: BaseState::qregs_[iChunk].apply_cnot(op.qubits[0], op.qubits[1]); @@ -1098,21 +1115,21 @@ void State::apply_gate(const uint_t iChunk, const Operations::Op &op) apply_gate_u3(iChunk, op.qubits[0], M_PI / 2., 0., M_PI); break; case DensityMatrix::Gates::s: - BaseState::qregs_[iChunk].apply_phase(op.qubits[0], complex_t(0., 1.)); + apply_phase(iChunk,op.qubits[0], complex_t(0., 1.)); break; case DensityMatrix::Gates::sdg: - BaseState::qregs_[iChunk].apply_phase(op.qubits[0], complex_t(0., -1.)); + apply_phase(iChunk,op.qubits[0], complex_t(0., -1.)); break; case DensityMatrix::Gates::sx: BaseState::qregs_[iChunk].apply_unitary_matrix(op.qubits, Linalg::VMatrix::SX); break; case DensityMatrix::Gates::t: { const double isqrt2{1. / std::sqrt(2)}; - BaseState::qregs_[iChunk].apply_phase(op.qubits[0], complex_t(isqrt2, isqrt2)); + apply_phase(iChunk,op.qubits[0], complex_t(isqrt2, isqrt2)); } break; case DensityMatrix::Gates::tdg: { const double isqrt2{1. / std::sqrt(2)}; - BaseState::qregs_[iChunk].apply_phase(op.qubits[0], complex_t(isqrt2, -isqrt2)); + apply_phase(iChunk,op.qubits[0], complex_t(isqrt2, -isqrt2)); } break; case DensityMatrix::Gates::swap: { BaseState::qregs_[iChunk].apply_swap(op.qubits[0], op.qubits[1]); @@ -1193,13 +1210,44 @@ void State::apply_diagonal_unitary_matrix(const int_t iChunk, const r } else{ reg_t qubits_in = qubits; + reg_t qubits_row = qubits; cvector_t diag_in = diag; + cvector_t diag_row = diag; BaseState::block_diagonal_matrix(iChunk,qubits_in,diag_in); - BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(qubits_in,diag_in); + + for(int_t i=0;i= BaseState::chunk_bits_) + qubits_row[i] = qubits[i] + BaseState::num_qubits_ - BaseState::chunk_bits_; + } + BaseState::block_diagonal_matrix(iChunk,qubits_row,diag_row); + + reg_t qubits_chunk(qubits_in.size()*2); + for(int_t i=0;i +void State::apply_phase(const uint_t iChunk,const uint_t qubit, const complex_t phase) +{ + cvector_t diag(2); + diag[0] = 1.0; + diag[1] = phase; + apply_diagonal_unitary_matrix(iChunk,reg_t({qubit}), diag); +} + +template +void State::apply_phase(const uint_t iChunk,const reg_t& qubits, const complex_t phase) +{ + cvector_t diag((1 << qubits.size()),1.0); + diag[(1 << qubits.size()) - 1] = phase; + apply_diagonal_unitary_matrix(iChunk,qubits, diag); +} + //========================================================================= // Implementation: Reset and Measurement Sampling //========================================================================= @@ -1400,18 +1448,14 @@ std::vector State::sample_measure(const reg_t &qubits, template -void State::apply_reset(const reg_t &qubits) +void State::apply_reset(const int_t iChunk,const reg_t &qubits) { // TODO: This can be more efficient by adding reset // to base class rather than doing a matrix multiplication // where all but 1 row is zeros. const auto reset_op = Linalg::SMatrix::reset(1ULL << qubits.size()); - int_t i; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i diff --git a/src/simulators/density_matrix/densitymatrix_thrust.hpp b/src/simulators/density_matrix/densitymatrix_thrust.hpp index 645b108344..4a7730bdb0 100755 --- a/src/simulators/density_matrix/densitymatrix_thrust.hpp +++ b/src/simulators/density_matrix/densitymatrix_thrust.hpp @@ -317,6 +317,73 @@ class DensityDiagMatMult2x2 : public GateFuncBase } }; +template +class DensityDiagMatMultNxN : public GateFuncBase +{ +protected: + int nqubits_; + int total_bits_; + int chunk_bits_; +public: + DensityDiagMatMultNxN(const reg_t &qb,int total,int chunk) + { + nqubits_ = qb.size(); + total_bits_ = total; + chunk_bits_ = chunk; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t j,imr,imc; + thrust::complex* vec; + thrust::complex q; + thrust::complex* pMat; + uint_t* qubits; + uint_t irow,icol,gid,local_mask; + uint_t irow_chunk,icol_chunk; + + vec = this->data_; + gid = this->base_index_; + + irow_chunk = ((gid + i) >> (chunk_bits_*2)) >> (total_bits_ - chunk_bits_); + icol_chunk = ((gid + i) >> (chunk_bits_*2)) & ((1ull << (total_bits_ - chunk_bits_)-1)); + + local_mask = (1ull << (chunk_bits_*2)) - 1; + irow = (i & local_mask) >> chunk_bits_; + icol = (i & local_mask) & ((1ull << chunk_bits_)-1); + + irow += (irow_chunk << chunk_bits_); + icol += (icol_chunk << chunk_bits_); + + pMat = this->matrix_; + qubits = this->params_; + + imr = 0; + imc = 0; + for(j=0;j> qubits[j]) & 1) != 0){ + imr += (1 << j); + } + if(((icol >> qubits[j]) & 1) != 0){ + imc += (1 << j); + } + } + + q = vec[i]; + vec[i] = thrust::conj(pMat[imr])*pMat[imc]*q; + } + const char* name(void) + { + return "DensityDiagMatMultNxN"; + } +}; + + template void DensityMatrixThrust::apply_unitary_matrix(const reg_t &qubits, const cvector_t &mat) @@ -340,6 +407,11 @@ template void DensityMatrixThrust::apply_diagonal_unitary_matrix(const reg_t &qubits, const cvector_t &diag) { + BaseVector::chunk_->StoreMatrix(diag); + BaseVector::chunk_->StoreUintParams(qubits); + BaseVector::apply_function(DensityDiagMatMultNxN(qubits, BaseVector::chunk_manager_.num_qubits()/2, num_qubits())); + + /* if(qubits.size() == 1){ const reg_t qubits_sp = {{qubits[0], qubits[0] + num_qubits()}}; BaseVector::apply_function(DensityDiagMatMult2x2(diag,qubits[0], num_qubits())); @@ -348,6 +420,7 @@ void DensityMatrixThrust::apply_diagonal_unitary_matrix(const reg_t &qub // Apply as single 2N-qubit matrix mult. apply_diagonal_superop_matrix(qubits, AER::Utils::tensor_product(AER::Utils::conjugate(diag), diag)); } + */ } //----------------------------------------------------------------------- @@ -439,6 +512,62 @@ void DensityMatrixThrust::apply_cnot(const uint_t qctrl, const uint_t qt #endif } +template +class DensityPhase : public GateFuncBase +{ +protected: + thrust::complex phase_; + int qubit_; + int total_bits_; + int chunk_bits_; +public: + DensityPhase(int qubit,thrust::complex* phase,int total,int chunk) + { + qubit_ = qubit; + phase_ = *phase; + total_bits_ = total; + chunk_bits_ = chunk; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q; + uint_t irow,icol,gid,local_mask; + uint_t irow_chunk,icol_chunk; + + vec = this->data_; + gid = this->base_index_; + + irow_chunk = ((gid + i) >> (chunk_bits_*2)) >> (total_bits_ - chunk_bits_); + icol_chunk = ((gid + i) >> (chunk_bits_*2)) & ((1ull << (total_bits_ - chunk_bits_)-1)); + + local_mask = (1ull << (chunk_bits_*2)) - 1; + irow = (i & local_mask) >> chunk_bits_; + icol = (i & local_mask) & ((1ull << chunk_bits_)-1); + + irow += (irow_chunk << chunk_bits_); + icol += (icol_chunk << chunk_bits_); + + q = vec[i]; + if((icol >> qubit_) & 1) + q = phase_*q; + if((irow >> qubit_) & 1) + q = thrust::conj(phase_)*q; + vec[i] = q; + } + const char* name(void) + { + return "DensityPhase"; + } +}; + + /* template class DensityPhase : public GateFuncBase { @@ -454,10 +583,11 @@ class DensityPhase : public GateFuncBase phase_ = *phase; } - int qubits_count(void) + bool is_diagonal(void) { - return 2; + return true; } + __host__ __device__ void operator()(const uint_t &i) const { uint_t i0,i1,i2; @@ -490,11 +620,13 @@ class DensityPhase : public GateFuncBase return "DensityPhase"; } }; + */ template void DensityMatrixThrust::apply_phase(const uint_t q,const complex_t &phase) { - BaseVector::apply_function(DensityPhase(q, num_qubits(), (thrust::complex*)&phase )); + BaseVector::apply_function(DensityPhase(q, (thrust::complex*)&phase, BaseVector::chunk_manager_.num_qubits()/2, num_qubits() )); +// BaseVector::apply_function(DensityPhase(q, num_qubits(), (thrust::complex*)&phase )); #ifdef AER_DEBUG BaseVector::DebugMsg(" density::apply_phase"); diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp index e5db28c556..61e8ef0e37 100644 --- a/src/simulators/state_chunk.hpp +++ b/src/simulators/state_chunk.hpp @@ -188,6 +188,15 @@ class StateChunk { const std::string &memory_hex, const std::string ®ister_hex); + //----------------------------------------------------------------------- + // Initialization + //----------------------------------------------------------------------- + template + void initialize_from_vector(const list_t &vec); + + template + void initialize_from_matrix(const list_t &mat); + //----------------------------------------------------------------------- // Save result data //----------------------------------------------------------------------- @@ -355,6 +364,12 @@ class StateChunk { // block diagonal matrix in chunk void block_diagonal_matrix(const int_t iChunk, reg_t &qubits, cvector_t &diag); + void qubits_inout(const reg_t& qubits, reg_t& qubits_in,reg_t& qubits_out) const; + + auto apply_to_matrix(bool copy = false); + + virtual bool is_applied_to_each_chunk(const Operations::Op &op); + // Set a global phase exp(1j * theta) for the state bool has_global_phase_ = false; complex_t global_phase_ = 1; @@ -521,6 +536,16 @@ void StateChunk::set_config(const json_t &config) JSON::get_value(block_bits_, "blocking_qubits", config); } +template +bool StateChunk::is_applied_to_each_chunk(const Operations::Op &op) +{ + if(op.type == Operations::OpType::gate || op.type == Operations::OpType::matrix || + op.type == Operations::OpType::diagonal_matrix || op.type == Operations::OpType::multiplexer || + op.type == Operations::OpType::superop){ + return true; + } + return false; +} template void StateChunk::apply_ops(const std::vector &ops, @@ -567,9 +592,7 @@ void StateChunk::apply_ops(const std::vector &ops, iOp = iOpEnd; } - else if(ops[iOp].type == Operations::OpType::gate || ops[iOp].type == Operations::OpType::matrix || - ops[iOp].type == Operations::OpType::diagonal_matrix || ops[iOp].type == Operations::OpType::multiplexer) - { + else if(is_applied_to_each_chunk(ops[iOp])){ #pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) for(iChunk=0;iChunk::block_diagonal_matrix(const int_t iChunk, reg_t &qubit } } +template +void StateChunk::qubits_inout(const reg_t& qubits, reg_t& qubits_in,reg_t& qubits_out) const +{ + int_t i; + qubits_in.clear(); + qubits_out.clear(); + for(i=0;i std::vector StateChunk::sample_measure(const reg_t &qubits, uint_t shots, @@ -647,6 +687,118 @@ void StateChunk::initialize_creg(uint_t num_memory, creg_.initialize(num_memory, num_register, memory_hex, register_hex); } +template +template +void StateChunk::initialize_from_vector(const list_t &vec) +{ + int_t iChunk; + if(chunk_bits_ == num_qubits_){ + for(iChunk=0;iChunk +template +void StateChunk::initialize_from_matrix(const list_t &mat) +{ + int_t iChunk; + if(chunk_bits_ == num_qubits_){ + for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); + uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ + uint_t icol = i & ((1ull << chunk_bits_)-1); + uint_t irow = i >> chunk_bits_; + tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + } + qregs_[iChunk].initialize_from_matrix(tmp); + } + } +} + +template +auto StateChunk::apply_to_matrix(bool copy) +{ + int_t iChunk; + uint_t size = 1ull << (chunk_bits_*qubit_scale()); + uint_t mask = (1ull << (chunk_bits_)) - 1; + uint_t num_threads = qregs_[0].get_omp_threads(); + + auto matrix = qregs_[0].copy_to_matrix(); + + if(distributed_rank_ == 0){ + //TO DO check memory availability + matrix.resize(1ull << (num_qubits_),1ull << (num_qubits_)); + + auto tmp = qregs_[0].copy_to_matrix(); + for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << chunk_bits_; + uint_t icol_chunk = (iChunk & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << chunk_bits_; + + if(iChunk < num_local_chunks_){ + if(copy) + tmp = qregs_[iChunk].copy_to_matrix(); + else + tmp = qregs_[iChunk].move_to_matrix(); + } +#ifdef AER_MPI + else + recv_data(tmp.data(),size,0,iChunk); +#endif +#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) + for(i=0;i> (chunk_bits_); + uint_t icol = i & mask; + uint_t idx = ((irow+irow_chunk) << (num_qubits_)) + icol_chunk + icol; + matrix[idx] = tmp[i]; + } + } + } + else{ +#ifdef AER_MPI + //send matrices to process 0 + for(iChunk=0;iChunk void StateChunk::save_creg(ExperimentResult &result, const std::string &key, @@ -1195,6 +1347,8 @@ void StateChunk::send_chunk(uint_t local_chunk_index, uint_t global_pai MPI_Isend(pSend,sizeSend,MPI_BYTE,iProc,local_chunk_index + global_chunk_index_,distributed_comm_,&reqSend); MPI_Wait(&reqSend,&st); + + qregs_[local_chunk_index].release_send_buffer(); #endif } diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp index 9599cf7f98..a6c93d429d 100755 --- a/src/simulators/statevector/qubitvector.hpp +++ b/src/simulators/statevector/qubitvector.hpp @@ -150,6 +150,8 @@ class QubitVector { //prepare buffer for MPI send/recv std::complex* send_buffer(uint_t& size_in_byte); std::complex* recv_buffer(uint_t& size_in_byte); + void release_send_buffer(void) const; + void release_recv_buffer(void) const; //----------------------------------------------------------------------- // Check point operations @@ -364,7 +366,7 @@ class QubitVector { std::complex* checkpoint_; uint_t chunk_index_; //global chunk index - cvector_t recv_buffer_; //receive buffer for MPI + mutable cvector_t recv_buffer_; //receive buffer for MPI //----------------------------------------------------------------------- // Config settings @@ -897,6 +899,18 @@ std::complex* QubitVector::recv_buffer(uint_t& size_in_byte) return &recv_buffer_[0]; } +template +void QubitVector::release_send_buffer(void) const +{ + +} + +template +void QubitVector::release_recv_buffer(void) const +{ + recv_buffer_.clear(); +} + //------------------------------------------------------------------------------ // Initialization //------------------------------------------------------------------------------ @@ -1226,7 +1240,6 @@ void QubitVector::apply_permutation_matrix(const reg_t& qubits, } // end switch } - /******************************************************************************* * * APPLY OPTIMIZED GATES diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index c99cb24cc0..13fd1863c9 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -144,6 +144,9 @@ class QubitVectorThrust { thrust::complex* send_buffer(uint_t& size_in_byte); thrust::complex* recv_buffer(uint_t& size_in_byte); + void release_send_buffer(void) const; + void release_recv_buffer(void) const; + //----------------------------------------------------------------------- // Check point operations //----------------------------------------------------------------------- @@ -364,12 +367,13 @@ class QubitVectorThrust { mutable std::shared_ptr> chunk_; mutable std::shared_ptr> buffer_chunk_; std::shared_ptr> checkpoint_; - std::shared_ptr> send_chunk_; - std::shared_ptr> recv_chunk_; + mutable std::shared_ptr> send_chunk_; + mutable std::shared_ptr> recv_chunk_; static ChunkManager chunk_manager_; uint_t chunk_index_; bool multi_chunk_distribution_; + bool multi_shots_; bool register_blocking_; @@ -517,7 +521,10 @@ QubitVectorThrust::QubitVectorThrust(size_t num_qubits) : num_qubits_(0) chunk_ = nullptr; chunk_index_ = 0; multi_chunk_distribution_ = false; + multi_shots_ = false; checkpoint_ = nullptr; + recv_chunk_ = nullptr; + send_chunk_ = nullptr; #ifdef AER_DEBUG debug_count = 0; @@ -818,6 +825,9 @@ void QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t if(chunk_bits < num_qubits){ multi_chunk_distribution_ = true; } + + if(omp_get_num_threads() > 1) + multi_shots_ = true; } template @@ -1056,6 +1066,26 @@ thrust::complex* QubitVectorThrust::recv_buffer(uint_t& size_in_ return recv_chunk_->pointer(); } +template +void QubitVectorThrust::release_send_buffer(void) const +{ +#ifdef AER_DISABLE_GDR + if(send_chunk_){ + chunk_manager_.UnmapBufferChunk(send_chunk_); + send_chunk_ = nullptr; + } +#endif +} + +template +void QubitVectorThrust::release_recv_buffer(void) const +{ + if(recv_chunk_){ + chunk_manager_.UnmapBufferChunk(recv_chunk_); + recv_chunk_ = nullptr; + } +} + //------------------------------------------------------------------------------ // Initialization //------------------------------------------------------------------------------ @@ -1147,13 +1177,14 @@ void QubitVectorThrust::apply_function(Function func) const #endif - func.set_base_index(chunk_index_ << num_qubits_); if(func.batch_enable() && multi_chunk_distribution_ && chunk_->device() >= 0){ if(chunk_->pos() == 0){ //only first chunk on device calculates all the chunks + func.set_base_index(chunk_index_ << num_qubits_); chunk_->Execute(func,chunk_->container()->num_chunks()); } } else{ + func.set_base_index(chunk_index_ << num_qubits_); chunk_->Execute(func,1); } @@ -2700,14 +2731,10 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, uint_t rem } } - chunk_manager_.UnmapBufferChunk(recv_chunk_); -// recv_chunk_.reset(); + release_recv_buffer(); #ifdef AER_DISABLE_GDR - if(send_chunk_){ - chunk_manager_.UnmapBufferChunk(send_chunk_); -// send_chunk_.reset(); - } + release_send_buffer(); #endif } @@ -3711,6 +3738,10 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, chunk_manager_.UnmapBufferChunk(buffer); } + if(pair_chunk.data() == this->data()){ + release_recv_buffer(); + } + return ret; } diff --git a/src/simulators/statevector/statevector_state_chunk.hpp b/src/simulators/statevector/statevector_state_chunk.hpp index c7582e72af..792e20956e 100644 --- a/src/simulators/statevector/statevector_state_chunk.hpp +++ b/src/simulators/statevector/statevector_state_chunk.hpp @@ -44,12 +44,12 @@ const Operations::OpSet StateOpSet( OpType::bfunc, OpType::roerror, OpType::matrix, OpType::diagonal_matrix, OpType::multiplexer, OpType::kraus, - OpType::sim_op, OpType::save_expval, - OpType::save_expval_var, OpType::save_densmat, + OpType::sim_op, OpType::set_statevec, + OpType::save_expval, OpType::save_expval_var, OpType::save_probs, OpType::save_probs_ket, OpType::save_amps, OpType::save_amps_sq, - OpType::save_statevec, OpType::save_state, - OpType::save_statevec_dict + OpType::save_state, OpType::save_statevec, + OpType::save_statevec_dict, OpType::save_densmat }, // Gates {"u1", "u2", "u3", "u", "U", "CX", "cx", "cz", @@ -157,6 +157,8 @@ class State : public Base::StateChunk { // /psi> is given in params void apply_initialize(const reg_t &qubits, const cvector_t ¶ms, RngEngine &rng); + void initialize_from_vector(const cvector_t ¶ms); + // Apply a supported snapshot instruction // If the input is not in allowed_snapshots an exeption will be raised. virtual void apply_snapshot(const Operations::Op &op, ExperimentResult &result, bool last_op = false); @@ -293,6 +295,15 @@ class State : public Base::StateChunk { const double phi, const double lambda); + void apply_gate_mcphase(const int_t iChunk, const reg_t& qubits, const complex_t phase); + + //------------------------------------------------------------------------- + // data access + //------------------------------------------------------------------------- + complex_t get_state(const uint_t idx); + void set_state(const uint_t idx,const complex_t s); + + //----------------------------------------------------------------------- // Config Settings //----------------------------------------------------------------------- @@ -332,10 +343,13 @@ void State::initialize_qreg(uint_t num_qubits) } } else{ //multi-chunk distribution + for(i=0;inum_qubits_ == this->chunk_bits_){ BaseState::qregs_[i].initialize(); } @@ -366,9 +380,12 @@ void State::initialize_qreg(uint_t num_qubits, else{ //multi-chunk distribution uint_t local_offset = BaseState::global_chunk_index_ << BaseState::chunk_bits_; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, initialize_omp(); int_t iChunk; - if(BaseState::chunk_bits_ == BaseState::num_qubits_){ - for(iChunk=0;iChunk::copy_to_vector() } } +template +complex_t State::get_state(const uint_t idx) +{ + complex_t ret = 0.0; + if(BaseState::num_global_chunks_ == 1){ + return BaseState::qregs_[0].get_state(idx); + } + else{ + uint_t iChunk = idx >> BaseState::chunk_bits_; + if(BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= iChunk && BaseState::chunk_index_end_[BaseState::distributed_rank_] > iChunk){ //on this process + ret = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].get_state(idx - (iChunk << BaseState::chunk_bits_)); + } +#ifdef AER_MPI + BaseState::reduce_sum(ret); +#endif + } + return ret; +} + +template +void State::set_state(const uint_t idx,const complex_t s) +{ + if(BaseState::num_global_chunks_ == 1){ + BaseState::qregs_[0][idx] = s; + } + else{ + uint_t iChunk = idx >> BaseState::chunk_bits_; + if(BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= iChunk && BaseState::chunk_index_end_[BaseState::distributed_rank_] > iChunk){ //on this process + BaseState::qregs_[iChunk - BaseState::global_chunk_index_][idx - (iChunk << BaseState::chunk_bits_)] = s; + } + } +} + //========================================================================= // Implementation: apply operations //========================================================================= @@ -583,6 +618,9 @@ void State::apply_op(const int_t iChunk,const Operations::Op &op, case Operations::OpType::kraus: apply_kraus(op.qubits, op.mats, rng); break; + case OpType::set_statevec: + initialize_from_vector(op.params); + break; case Operations::OpType::save_expval: case Operations::OpType::save_expval_var: BaseState::apply_save_expval(op, result); @@ -874,9 +912,8 @@ void State::apply_snapshot(const Operations::Op &op, result.legacy_data.add_pershot_snapshot("statevector", op.string_params[0], move_to_vector()); } else { - //also using move_to_vector(actually no move) result.legacy_data.add_pershot_snapshot("statevector", op.string_params[0], - move_to_vector()); + copy_to_vector()); } break; case Statevector::Snapshots::cmemory: @@ -1183,7 +1220,7 @@ void State::apply_gate(const uint_t iChunk, const Operations::Op &op break; case Statevector::Gates::mcz: // Includes Z, CZ, CCZ, etc - BaseState::qregs_[iChunk].apply_mcphase(op.qubits, -1); + apply_gate_mcphase(iChunk,op.qubits, -1); break; case Statevector::Gates::mcr: BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); @@ -1248,8 +1285,7 @@ void State::apply_gate(const uint_t iChunk, const Operations::Op &op break; case Statevector::Gates::mcp: // Includes u1, cu1, p, cp, mcp etc - BaseState::qregs_[iChunk].apply_mcphase(op.qubits, - std::exp(complex_t(0, 1) * op.params[0])); + apply_gate_mcphase(iChunk,op.qubits,std::exp(complex_t(0, 1) * op.params[0])); break; case Statevector::Gates::mcsx: // Includes sx, csx, mcsx etc @@ -1329,6 +1365,21 @@ void State::apply_gate_phase(const int_t iChunk, uint_t qubit, compl apply_matrix(iChunk,reg_t({qubit}), diag); } +template +void State::apply_gate_mcphase(const int_t iChunk, const reg_t& qubits, const complex_t phase) +{ + if(BaseState::gpu_optimization_){ + //GPU computes all chunks in one kernel + BaseState::qregs_[iChunk].apply_mcphase(qubits,phase); + } + else{ + cvector_t diag(1ull << qubits.size(),1.0); + diag[diag.size()-1] = phase; + + apply_diagonal_matrix(iChunk,qubits,diag); + } +} + template void State::apply_mcswap(const int_t iChunk,const reg_t &qubits) { @@ -1363,14 +1414,7 @@ rvector_t State::measure_probs(const reg_t &qubits) const reg_t qubits_in_chunk; reg_t qubits_out_chunk; - for(i=0;i::measure_reset_update(const std::vector &qubits, // Update a state vector based on an outcome pair [m, p] from // sample_measure_with_prob function, and a desired post-measurement final_state - int_t i; + int_t i,iChunk; // Single-qubit case if (qubits.size() == 1) { // Diagonal matrix for projecting and renormalizing to measurement outcome cvector_t mdiag(2, 0.); mdiag[meas_state] = 1. / std::sqrt(meas_prob); -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i::measure_reset_update(const std::vector &qubits, cvector_t mdiag(dim, 0.); mdiag[meas_state] = 1. / std::sqrt(meas_prob); -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i::measure_reset_update(const std::vector &qubits, reg_t qubits_in_chunk; reg_t qubits_out_chunk; - for(i=0;i 0){ //in chunk exchange - const size_t dim_in = 1ULL << qubits_in_chunk.size(); + if(qubits_in_chunk.size() == qubits.size()){ //all bits are inside chunk // build vectorized permutation matrix - cvector_t perm(dim_in * dim_in, 0.); - perm[final_state * dim_in + meas_state] = 1.; - perm[meas_state * dim_in + final_state] = 1.; - for (size_t j=0; j < dim_in; j++) { + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + meas_state] = 1.; + perm[meas_state * dim + final_state] = 1.; + for (size_t j=0; j < dim; j++) { if (j != final_state && j != meas_state) - perm[j * dim_in + j] = 1.; + perm[j * dim + j] = 1.; } // apply permutation to swap state -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i 0){ //out of chunk exchange - for(i=0;i> i) & 1) != ((meas_state >> i) & 1)){ + BaseState::apply_chunk_x(qubits[i]); + } } } } @@ -1616,24 +1654,120 @@ void State::apply_initialize(const reg_t &qubits, const cvector_t ¶ms, RngEngine &rng) { - uint_t i; + int_t i,iChunk; + auto sorted_qubits = qubits; + std::sort(sorted_qubits.begin(), sorted_qubits.end()); if (qubits.size() == BaseState::num_qubits_) { // If qubits is all ordered qubits in the statevector // we can just initialize the whole state directly - auto sorted_qubits = qubits; - std::sort(sorted_qubits.begin(), sorted_qubits.end()); if (qubits == sorted_qubits) { - initialize_qreg(qubits.size(), params); + initialize_from_vector(params); return; } } // Apply reset to qubits apply_reset(qubits, rng); - // Apply initialize_component - for(i=0;i 0){ + //scatter inside chunks + const size_t dim = 1ULL << qubits_in_chunk.size(); + cvector_t perm(dim * dim, 0.); + for(i=0;i 0){ + //then scatter outside chunk + auto sorted_qubits_out = qubits_out_chunk; + std::sort(sorted_qubits_out.begin(), sorted_qubits_out.end()); + + for(i=0;i<(1ull << (BaseState::num_qubits_ - BaseState::chunk_bits_ - qubits_out_chunk.size()));i++){ + uint_t baseChunk = 0; + uint_t j,ii,t; + ii = i; + for(j=0;j>= BaseState::chunk_bits_; + + for(j=1;j<(1ull << qubits_out_chunk.size());j++){ + iChunk = baseChunk; + for(t=0;t> t) & 1) + iChunk += (1ull << (qubits_out_chunk[t] - BaseState::chunk_bits_)); + } + + if(iChunk >= BaseState::chunk_index_begin_[BaseState::distributed_rank_] && iChunk < BaseState::chunk_index_end_[BaseState::distributed_rank_]){ //on this process + if(baseChunk >= BaseState::chunk_index_begin_[BaseState::distributed_rank_] && baseChunk < BaseState::chunk_index_end_[BaseState::distributed_rank_]){ //base chunk is on this process + BaseState::qregs_[iChunk].initialize_from_data(BaseState::qregs_[baseChunk].data(),1ull << BaseState::chunk_bits_); + } + else{ + BaseState::recv_chunk(iChunk,baseChunk); + //using swap chunk function to release send/recv buffers for Thrust + reg_t swap(2); + swap[0] = BaseState::chunk_bits_; + swap[1] = BaseState::chunk_bits_; + BaseState::qregs_[iChunk].apply_chunk_swap(swap,baseChunk); + } + } + else if(baseChunk >= BaseState::chunk_index_begin_[BaseState::distributed_rank_] && baseChunk < BaseState::chunk_index_end_[BaseState::distributed_rank_]){ //base chunk is on this process + BaseState::send_chunk(baseChunk - BaseState::global_chunk_index_,iChunk); + } + } + } + } + + //initialize by params +#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) + for(iChunk=0;iChunk +void State::initialize_from_vector(const cvector_t ¶ms) +{ + int_t iChunk; + if(BaseState::chunk_bits_ == BaseState::num_qubits_){ + for(iChunk=0;iChunk class State; +} + namespace QubitUnitary { // OpSet of supported instructions @@ -62,6 +67,7 @@ enum class Gates { template > class State : public Base::State { + friend class QubitUnitaryChunk::State; public: using BaseState = Base::State; diff --git a/src/simulators/unitary/unitary_state_chunk.hpp b/src/simulators/unitary/unitary_state_chunk.hpp index 683ef85883..49be0c8677 100644 --- a/src/simulators/unitary/unitary_state_chunk.hpp +++ b/src/simulators/unitary/unitary_state_chunk.hpp @@ -34,9 +34,10 @@ namespace QubitUnitaryChunk { const Operations::OpSet StateOpSet( // Op types {Operations::OpType::gate, Operations::OpType::barrier, + Operations::OpType::bfunc, Operations::OpType::roerror, Operations::OpType::matrix, Operations::OpType::diagonal_matrix, Operations::OpType::snapshot, Operations::OpType::save_unitary, - Operations::OpType::save_state }, + Operations::OpType::save_state, Operations::OpType::set_unitary}, // Gates {"u1", "u2", "u3", "u", "U", "CX", "cx", "cz", "cy", "cp", "cu1", "cu2", "cu3", "swap", "id", "p", @@ -156,8 +157,6 @@ class State : public Base::StateChunk { ExperimentResult &result, bool last_op); - auto apply_to_matrix(bool copy = false); - // Helper function for computing expectation value virtual double expval_pauli(const reg_t &qubits, const std::string& pauli) override; @@ -194,11 +193,20 @@ void State::apply_op(const int_t iChunk,const Operations::Op & switch (op.type) { case Operations::OpType::barrier: break; + case Operations::OpType::bfunc: + BaseState::creg_.apply_bfunc(op); + break; + case Operations::OpType::roerror: + BaseState::creg_.apply_roerror(op, rng); + break; case Operations::OpType::gate: // Note conditionals will always fail since no classical registers if (BaseState::creg_.check_conditional(op)) apply_gate(iChunk,op); break; + case Operations::OpType::set_unitary: + BaseState::initialize_from_matrix(op.mats[0]); + break; case Operations::OpType::save_state: case Operations::OpType::save_unitary: apply_save_unitary(op, result, final_ops); @@ -279,13 +287,16 @@ void State::initialize_qreg(uint_t num_qubits) } } else{ //multi-chunk distribution + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); if(irow == icol) BaseState::qregs_[iChunk].initialize(); else @@ -319,6 +330,10 @@ void State::initialize_qreg(uint_t num_qubits, auto input = unitary.copy_to_matrix(); uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); @@ -333,8 +348,6 @@ void State::initialize_qreg(uint_t num_qubits, uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; tmp[i] = input[idx]; } - - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -363,6 +376,10 @@ void State::initialize_qreg(uint_t num_qubits, } else{ //multi-chunk distribution uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; + for(iChunk=0;iChunk::initialize_qreg(uint_t num_qubits, uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; tmp[i] = unitary[idx]; } - - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } @@ -403,7 +418,7 @@ auto State::move_to_matrix() if(BaseState::num_global_chunks_ == 1){ return BaseState::qregs_[0].move_to_matrix(); } - return apply_to_matrix(false); + return BaseState::apply_to_matrix(false); } template @@ -412,88 +427,7 @@ auto State::copy_to_matrix() if(BaseState::num_global_chunks_ == 1){ return BaseState::qregs_[0].copy_to_matrix(); } - return apply_to_matrix(true); -} - -template -auto State::apply_to_matrix(bool copy) -{ - int_t iChunk; - uint_t size = 1ull << (BaseState::chunk_bits_*2); - uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; - uint_t num_threads = BaseState::qregs_[0].get_omp_threads(); - - auto matrix = BaseState::qregs_[0].copy_to_matrix(); - - if(BaseState::distributed_rank_ == 0){ - //TO DO check memory availability - matrix.resize(1ull << (BaseState::num_qubits_),1ull << (BaseState::num_qubits_)); - -#ifdef AER_MPI - auto recv = BaseState::qregs_[0].copy_to_matrix(); - //gather states from other processes - for(iChunk=BaseState::num_local_chunks_;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); -#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) - for(i=0;i> (BaseState::chunk_bits_); - uint_t irow = i & mask; - uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; - matrix[idx] = recv[i]; - } - } -#endif - - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); - if(copy){ - auto tmp = BaseState::qregs_[iChunk].copy_to_matrix(); -#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) - for(i=0;i> (BaseState::chunk_bits_); - uint_t irow = i & mask; - uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; - matrix[idx] = tmp[i]; - } - } - else{ - auto tmp = BaseState::qregs_[iChunk].move_to_matrix(); -#pragma omp parallel for if(num_threads > 1) num_threads(num_threads) - for(i=0;i> (BaseState::chunk_bits_); - uint_t irow = i & mask; - uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; - matrix[idx] = tmp[i]; - } - } - } - } - else{ -#ifdef AER_MPI - //send matrices to process 0 - for(iChunk=0;iChunk& out,std::vector& queue) const; + bool split_op(const Operations::Op& op,const reg_t blockedQubits,std::vector& out,std::vector& queue) const; + bool is_blockable_operation(Operations::Op& op) const; }; @@ -129,6 +132,14 @@ void CacheBlocking::set_config(const json_t &config) gpu_blocking_bits_ = 10; } } + + std::string method; + if (JSON::get_value(method, "method", config)) { + if(method.find("density_matrix") != std::string::npos){ + density_matrix_ = true; + } + } + } @@ -297,7 +308,10 @@ bool CacheBlocking::can_reorder(Operations::Op& op,std::vector& //only gate and matrix can be reordered if(op.type != Operations::OpType::gate && op.type != Operations::OpType::matrix){ - return false; + //except for reset for density matrix + if(!density_matrix_ || op.type != Operations::OpType::reset){ + return false; + } } for(j=0;j& ops) const bool CacheBlocking::is_blockable_operation(Operations::Op& op) const { if(op.type == Operations::OpType::gate || op.type == Operations::OpType::matrix || - op.type == Operations::OpType::diagonal_matrix || op.type == Operations::OpType::multiplexer){ + op.type == Operations::OpType::diagonal_matrix || op.type == Operations::OpType::multiplexer || + op.type == Operations::OpType::superop){ + return true; + } + if(density_matrix_ && op.type == Operations::OpType::reset){ return true; } + return false; } @@ -537,19 +556,21 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector 0 && !end_block_inserted){ //insert end of block to synchronize chunks - insert_sim_op(out,"end_blocking",blockedQubits); - } - else if(!end_block_inserted){ - out.pop_back(); - } + bool restore_qubits = false; if(ops[i].type == Operations::OpType::kraus){ - if(ops[i].qubits.size() >= block_bits_){ - throw std::runtime_error("CacheBlocking : kraus number of qubits should be smaller than chunk qubit size"); + if(ops[i].qubits.size() > block_bits_){ + throw std::runtime_error("CacheBlocking : Kraus operator, number of qubits should be smaller than chunk qubit size"); break; } if(!can_block(ops[i],blockedQubits)){ //if some qubits are out of chunk, queued for next step @@ -557,19 +578,42 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector 0 && !end_block_inserted){ //insert end of block to synchronize chunks + insert_sim_op(out,"end_blocking",blockedQubits); + } + else if(!end_block_inserted){ + out.pop_back(); + } + + if(restore_qubits) + restore_qubits_order(out); + //mapping swapped qubits for(iq=0;iq 1) return true; } - else if(op.type == Operations::OpType::matrix){ //fusion + else if(op.type == Operations::OpType::matrix || op.type == Operations::OpType::multiplexer || op.type == Operations::OpType::superop){ if(op.qubits.size() > 1) return true; } @@ -783,6 +827,51 @@ bool CacheBlocking::split_pauli(const Operations::Op& op,const reg_t blockedQubi return false; } +//split op to inside op and outside op +bool CacheBlocking::split_op(const Operations::Op& op,const reg_t blockedQubits,std::vector& out,std::vector& queue) const +{ + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + int_t i,j,n; + bool inside; + + n = op.qubits.size(); + for(i=0;i 0){ //save in queue + Operations::Op op_out = op; + op_out.qubits = qubits_out_chunk; + queue.push_back(op_out); + } + + if(qubits_in_chunk.size() > 0){ + Operations::Op op_in = op; + //mapping swapped qubits + for(i=0;i Date: Fri, 16 Apr 2021 18:52:50 +0900 Subject: [PATCH 07/10] GPU optimization of zero fucntion for Thrust --- .../densitymatrix_state_chunk.hpp | 18 +++++-------- .../statevector/qubitvector_thrust.hpp | 25 +++++++++++++++++-- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp index be06280efc..89aceb50d5 100644 --- a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp +++ b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp @@ -1009,7 +1009,7 @@ cmatrix_t State::reduced_density_matrix_helper(const reg_t &qubits, tmp = BaseState::qregs_[iChunk].copy_to_matrix(); #ifdef AER_MPI else - recv_data(tmp.data(),size,0,iChunk); + BaseState::recv_data(tmp.data(),size,0,iChunk); #endif #pragma omp parallel for if(num_threads > 1) num_threads(num_threads) for(i=0;i::reduced_density_matrix_helper(const reg_t &qubits, else{ #ifdef AER_MPI //send matrices to process 0 - for(iChunk=0;iChunk::initialize_component(const reg_t &qubits, const // Utility //------------------------------------------------------------------------------ +template +class ZeroClear : public GateFuncBase +{ +protected: +public: + ZeroClear() {} + bool is_diagonal(void) + { + return true; + } + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + vec = this->data_; + vec[i] = 0.0; + } + const char* name(void) + { + return "zero"; + } +}; + template void QubitVectorThrust::zero() { @@ -802,8 +824,7 @@ void QubitVectorThrust::zero() DebugMsg("zero"); #endif - chunk_->Zero(); -// chunk_->synchronize(); + apply_function(ZeroClear()); #ifdef AER_DEBUG DebugMsg("zero done"); From 9e015cf4efc49709ebcc1f7ba7a6fc0e1699d58c Mon Sep 17 00:00:00 2001 From: Jun Doi Date: Wed, 21 Apr 2021 10:36:27 +0900 Subject: [PATCH 08/10] remove unnecessary buffers --- .../statevector/chunk/chunk_container.hpp | 47 ++++++++++++++++--- .../statevector/chunk/chunk_manager.hpp | 29 +++++------- .../chunk/device_chunk_container.hpp | 1 + .../chunk/host_chunk_container.hpp | 26 ++++++---- .../statevector/qubitvector_thrust.hpp | 25 ++-------- src/transpile/cacheblocking.hpp | 20 ++++++-- 6 files changed, 91 insertions(+), 57 deletions(-) diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index b024313ad2..6c4c27a479 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -487,6 +487,7 @@ class ChunkContainer : public std::enable_shared_from_this @@ -709,14 +710,48 @@ void ChunkContainer::allocate_chunks(void) buffers_.resize(num_buffers_); checkpoints_.resize(num_checkpoint_); - for(i=0;i>(this->shared_from_this(),i); + if(num_chunks_ > 0){ + chunks_.resize(num_chunks_); + for(i=0;i>(this->shared_from_this(),i); + } + } + if(num_buffers_ > 0){ + buffers_.resize(num_buffers_); + for(i=0;i>(this->shared_from_this(),num_chunks_+i); + } } - for(i=0;i>(this->shared_from_this(),num_chunks_+i); + if(num_checkpoint_ > 0){ + checkpoints_.resize(num_checkpoint_); + for(i=0;i>(this->shared_from_this(),num_chunks_+num_buffers_+i); + } } - for(i=0;i>(this->shared_from_this(),num_chunks_+num_buffers_+i); +} + +template +void ChunkContainer::deallocate_chunks(void) +{ + uint_t i; + + if(num_chunks_ > 0){ + for(i=0;i 0){ + for(i=0;i 0){ + for(i=0;i::ChunkManager() #endif - chunks_.resize(num_places_*2 + 1); + chunks_.resize(num_places_*2 + 1,nullptr); iplace_host_ = num_places_ ; @@ -173,7 +173,6 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks) char* str; bool multi_gpu = false; bool hybrid = false; - uint_t num_checkpoint,total_checkpoint = 0; bool multi_shot = false; //--- for test @@ -253,25 +252,13 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks) nc /= 2; } - num_checkpoint = nc; chunks_[iDev] = std::make_shared>(); - -#ifdef AER_THRUST_CUDA - size_t freeMem,totalMem; - cudaSetDevice(iDev); - cudaMemGetInfo(&freeMem,&totalMem); - if(freeMem <= ( ((uint_t)sizeof(thrust::complex) * (nc + num_buffers + num_checkpoint)) << chunk_bits_)){ - num_checkpoint = 0; - } -#endif - - total_checkpoint += num_checkpoint; - num_chunks_ += chunks_[iDev]->Allocate(iDev,chunk_bits,nc,num_buffers,num_checkpoint); + num_chunks_ += chunks_[iDev]->Allocate(iDev,chunk_bits,nc,num_buffers); } if(num_chunks_ < nchunks){ //rest of chunks are stored on host chunks_[num_places_] = std::make_shared>(); - chunks_[num_places_]->Allocate(-1,chunk_bits,nchunks-num_chunks_,AER_MAX_BUFFERS); + chunks_[num_places_]->Allocate(-1,chunk_bits,nchunks-num_chunks_,num_buffers); num_places_ += 1; num_chunks_ = nchunks; } @@ -279,7 +266,11 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks) //additional host buffer iplace_host_ = num_places_; chunks_[iplace_host_] = std::make_shared>(); +#ifdef AER_DISABLE_GDR chunks_[iplace_host_]->Allocate(-1,chunk_bits,0,AER_MAX_BUFFERS); +#else + chunks_[iplace_host_]->Allocate(-1,chunk_bits,0,0); +#endif } } @@ -292,9 +283,11 @@ void ChunkManager::Free(void) int i; for(i=0;iDeallocate(); - chunks_[i].reset(); + chunks_[i].reset(); + chunks_[i] = nullptr; + } } chunk_bits_ = 0; diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index 8fe2ba9250..4e55457a13 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -373,6 +373,7 @@ void DeviceChunkContainer::Deallocate(void) } stream_.clear(); #endif + ChunkContainer::deallocate_chunks(); } template diff --git a/src/simulators/statevector/chunk/host_chunk_container.hpp b/src/simulators/statevector/chunk/host_chunk_container.hpp index a6b32d1375..f00dee8195 100644 --- a/src/simulators/statevector/chunk/host_chunk_container.hpp +++ b/src/simulators/statevector/chunk/host_chunk_container.hpp @@ -131,12 +131,16 @@ uint_t HostChunkContainer::Allocate(int idev,int bits,uint_t chunks,uint ChunkContainer::num_buffers_ = buffers; ChunkContainer::num_checkpoint_ = checkpoint; ChunkContainer::num_chunks_ = nc; - data_.resize((nc + buffers + checkpoint) << bits); - matrix_.resize(nc + buffers); - params_.resize(nc + buffers); + if(nc + buffers + checkpoint > 0) + data_.resize((nc + buffers + checkpoint) << bits); + if(nc + buffers > 0){ + matrix_.resize(nc + buffers); + params_.resize(nc + buffers); + } //allocate chunk classes - ChunkContainer::allocate_chunks(); + if(nc + buffers + checkpoint > 0) + ChunkContainer::allocate_chunks(); return nc; } @@ -147,9 +151,12 @@ uint_t HostChunkContainer::Resize(uint_t chunks,uint_t buffers,uint_t ch uint_t i; if(chunks + buffers + checkpoint > this->num_chunks_ + this->num_buffers_ + this->num_checkpoint_){ - data_.resize((chunks + buffers + checkpoint) << this->chunk_bits_); - matrix_.resize(chunks + buffers); - params_.resize(chunks + buffers); + if(chunks + buffers + checkpoint > 0) + data_.resize((chunks + buffers + checkpoint) << this->chunk_bits_); + if(chunks + buffers > 0){ + matrix_.resize(chunks + buffers); + params_.resize(chunks + buffers); + } } this->num_chunks_ = chunks; @@ -157,7 +164,8 @@ uint_t HostChunkContainer::Resize(uint_t chunks,uint_t buffers,uint_t ch this->num_checkpoint_ = checkpoint; //allocate chunk classes - ChunkContainer::allocate_chunks(); + if(chunks + buffers + checkpoint > 0) + ChunkContainer::allocate_chunks(); return chunks + buffers + checkpoint; } @@ -171,6 +179,8 @@ void HostChunkContainer::Deallocate(void) matrix_.shrink_to_fit(); params_.clear(); params_.shrink_to_fit(); + + ChunkContainer::deallocate_chunks(); } diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index d1a1291eda..5b9475012e 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -945,31 +945,14 @@ std::complex QubitVectorThrust::inner_product() const chunk_->set_device(); vec0 = (data_t*)chunk_->pointer(); + vec1 = (data_t*)checkpoint_->pointer(); #ifdef AER_THRUST_CUDA cudaStream_t strm = chunk_->stream(); - if(strm){ - if(chunk_->device() == checkpoint_->device()){ - vec1 = (data_t*)checkpoint_->pointer(); - - dot = thrust::inner_product(thrust::device,vec0,vec0 + data_size_*2,vec1,0.0); - } - else{ - std::shared_ptr> pBuffer = chunk_manager_.MapBufferChunk(chunk_->place()); - pBuffer->CopyIn(checkpoint_); - vec1 = (data_t*)pBuffer->pointer(); - - dot = thrust::inner_product(thrust::device,vec0,vec0 + data_size_*2,vec1,0.0); - chunk_manager_.UnmapBufferChunk(pBuffer); - } - } - else{ - vec1 = (data_t*)checkpoint_->pointer(); - + if(strm) + dot = thrust::inner_product(thrust::device,vec0,vec0 + data_size_*2,vec1,0.0); + else dot = thrust::inner_product(thrust::omp::par,vec0,vec0 + data_size_*2,vec1,0.0); - } #else - vec1 = (data_t*)checkpoint_->pointer(); - if(num_qubits_ > omp_threshold_ && omp_threads_ > 1) dot = thrust::inner_product(thrust::device,vec0,vec0 + data_size_*2,vec1,0.0); else diff --git a/src/transpile/cacheblocking.hpp b/src/transpile/cacheblocking.hpp index 8935631d75..ae476faa20 100644 --- a/src/transpile/cacheblocking.hpp +++ b/src/transpile/cacheblocking.hpp @@ -101,7 +101,7 @@ class CacheBlocking : public CircuitOptimization { bool is_diagonal_op(Operations::Op& op) const; void insert_swap(std::vector& ops,uint_t bit0,uint_t bit1,bool chunk) const; - void insert_sim_op(std::vector& ops,const char* name,const reg_t& qubits) const; + void insert_sim_op(std::vector& ops,char* name,const reg_t& qubits) const; void insert_pauli(std::vector& ops,reg_t& qubits,std::string& pauli) const; void define_blocked_qubits(std::vector& ops,reg_t& blockedQubits,bool crossQubitOnly) const; @@ -147,6 +147,7 @@ void CacheBlocking::set_blocking(int bits, size_t min_memory, uint_t n_place, si { int chunk_bits = bits; uint_t scale = is_matrix ? 2 : 1; + size_t size; //get largest possible chunk bits while((complex_size << (scale*chunk_bits)) > min_memory){ @@ -189,7 +190,7 @@ void CacheBlocking::insert_swap(std::vector& ops,uint_t bit0,uin ops.push_back(sgate); } -void CacheBlocking::insert_sim_op(std::vector& ops,const char* name,const reg_t& qubits) const +void CacheBlocking::insert_sim_op(std::vector& ops,char* name,const reg_t& qubits) const { Operations::Op op; op.type = Operations::OpType::sim_op; @@ -208,6 +209,10 @@ void CacheBlocking::optimize_circuit(Circuit& circ, if(!blocking_enabled_ && gpu_blocking_bits_ == 0){ return; } + std::cout << " === before cache block " << block_bits_ << std::endl; + for(uint_t i=0;i& ops,reg_t& blockedQubits,bool crossQubitOnly) const { uint_t i,j,iq; - int nq; + int nq,nb; bool exist; for(i=0;i= block_bits_) @@ -332,7 +343,7 @@ bool CacheBlocking::can_reorder(Operations::Op& op,std::vector& bool CacheBlocking::block_circuit(Circuit& circ,bool doSwap) const { - uint_t n; + uint_t i,n; std::vector out; std::vector queue; std::vector queue_next; @@ -459,6 +470,7 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector Date: Wed, 21 Apr 2021 18:41:21 +0900 Subject: [PATCH 09/10] remove debug message --- src/simulators/statevector/qubitvector_thrust.hpp | 9 --------- src/transpile/cacheblocking.hpp | 10 ---------- 2 files changed, 19 deletions(-) diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index 5b9475012e..a756a5cf25 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -2761,15 +2761,6 @@ class phase_func : public GateFuncBase mask |= (1ull << qubits[i]); } } - int qubits_count(void) - { - return nqubits; - } - int num_control_bits(void) - { - return nqubits - 1; - } - bool is_diagonal(void) { return true; diff --git a/src/transpile/cacheblocking.hpp b/src/transpile/cacheblocking.hpp index ae476faa20..db53160e21 100644 --- a/src/transpile/cacheblocking.hpp +++ b/src/transpile/cacheblocking.hpp @@ -209,10 +209,6 @@ void CacheBlocking::optimize_circuit(Circuit& circ, if(!blocking_enabled_ && gpu_blocking_bits_ == 0){ return; } - std::cout << " === before cache block " << block_bits_ << std::endl; - for(uint_t i=0;i& ops,reg_t& blockedQubits,bool crossQubitOnly) const From 6cd12d7b0be65fac4b3ce04901cbaff599ee2216 Mon Sep 17 00:00:00 2001 From: Christopher Wood Date: Thu, 22 Apr 2021 15:59:47 -0400 Subject: [PATCH 10/10] Add release note --- releasenotes/notes/fix-gpu-memory-cf50e1def7072f38.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 releasenotes/notes/fix-gpu-memory-cf50e1def7072f38.yaml diff --git a/releasenotes/notes/fix-gpu-memory-cf50e1def7072f38.yaml b/releasenotes/notes/fix-gpu-memory-cf50e1def7072f38.yaml new file mode 100644 index 0000000000..01ebe9e31f --- /dev/null +++ b/releasenotes/notes/fix-gpu-memory-cf50e1def7072f38.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixes a bug introduced in 0.8.0 where GPU simulations would allocate + unneeded host memory in addition to the GPU memory.