From d21d899806cfd8718122ea618f818160ee6ff698 Mon Sep 17 00:00:00 2001 From: doichanj Date: Mon, 29 Mar 2021 16:00:33 +0900 Subject: [PATCH 1/2] Fixing cache blocking transpiler and chunk states --- src/controllers/qasm_controller.hpp | 1 + .../densitymatrix_state_chunk.hpp | 115 ++++++++---- src/simulators/state_chunk.hpp | 166 +++++++++++++++++- .../statevector/statevector_state.hpp | 1 + .../statevector/statevector_state_chunk.hpp | 141 ++++++++------- src/transpile/cacheblocking.hpp | 140 +++++++++++---- 6 files changed, 431 insertions(+), 133 deletions(-) diff --git a/src/controllers/qasm_controller.hpp b/src/controllers/qasm_controller.hpp index a408b2e83d..70ede34137 100755 --- a/src/controllers/qasm_controller.hpp +++ b/src/controllers/qasm_controller.hpp @@ -1103,6 +1103,7 @@ void QasmController::run_circuit_helper(const Circuit& circ, if(method == Method::density_matrix || method == Method::density_matrix_thrust_gpu || method == Method::density_matrix_thrust_cpu) is_matrix = true; auto cache_block_pass = transpile_cache_blocking(opt_circ,noise,config,(simulation_precision_ == Precision::single_precision) ? sizeof(std::complex) : sizeof(std::complex),is_matrix); + cache_block_pass.set_sample_measure(check_measure_sampling_opt(opt_circ, method)); cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result); uint_t block_bits = 0; diff --git a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp index f0ce811d05..9b6eee2e77 100644 --- a/src/simulators/density_matrix/densitymatrix_state_chunk.hpp +++ b/src/simulators/density_matrix/densitymatrix_state_chunk.hpp @@ -648,6 +648,9 @@ void State::apply_chunk_swap(const reg_t &qubits) uint_t q0,q1; q0 = qubits[0]; q1 = qubits[1]; + + std::swap(BaseState::qubit_map_[q0],BaseState::qubit_map_[q1]); + if(qubits[0] >= BaseState::chunk_bits_){ q0 += BaseState::chunk_bits_; } @@ -713,8 +716,9 @@ void State::apply_save_amplitudes_sq(const Operations::Op &op, BaseState::threads_ > 1) \ num_threads(BaseState::threads_) for (int_t i = 0; i < size; ++i) { - if(op.int_params[i] >= (irow << BaseState::chunk_bits_) && op.int_params[i] < ((irow+1) << BaseState::chunk_bits_)) - amps_sq[i] = BaseState::qregs_[iChunk].probability(op.int_params[i] - (irow << BaseState::chunk_bits_)); + uint_t idx = BaseState::mapped_index(op.int_params[i]); + if(idx >= (irow << BaseState::chunk_bits_) && idx < ((irow+1) << BaseState::chunk_bits_)) + amps_sq[i] = BaseState::qregs_[iChunk].probability(idx - (irow << BaseState::chunk_bits_)); } } #ifdef AER_MPI @@ -1240,35 +1244,52 @@ rvector_t State::measure_probs(const reg_t &qubits) const icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); if(irow == icol){ //diagonal chunk - auto chunkSum = BaseState::qregs_[i].probabilities(qubits); - if(qubits_in_chunk.size() == qubits.size()){ - for(j=0;j 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); - i_in++; - } - else{ - if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ - idx += 1ull << k; + else{ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ + idx += 1ull << k; + } } } - } #pragma omp atomic - sum[idx] += chunkSum[j]; + sum[idx] += chunkSum[j]; + } } } + else{ //there is no bit in chunk + auto tr = std::real(BaseState::qregs_[i].trace()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += tr; + } } } +#ifdef AER_MPI + BaseState::reduce_sum(sum); +#endif + return sum; } @@ -1426,9 +1447,15 @@ void State::measure_reset_update(const reg_t &qubits, // If it doesn't agree with the reset state update if (final_state != meas_state) { + if(qubits[0] < BaseState::chunk_bits_){ #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i::measure_reset_update(const reg_t &qubits, // If it doesn't agree with the reset state update // TODO This function could be optimized as a permutation update if (final_state != meas_state) { - // build vectorized permutation matrix - cvector_t perm(dim * dim, 0.); - perm[final_state * dim + meas_state] = 1.; - perm[meas_state * dim + final_state] = 1.; - for (size_t j=0; j < dim; j++) { - if (j != final_state && j != meas_state) - perm[j * dim + j] = 1.; + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + + for(i=0;i 0){ //in chunk exchange + const size_t dim_in = 1ULL << qubits_in_chunk.size(); + // build vectorized permutation matrix + cvector_t perm(dim_in * dim_in, 0.); + perm[final_state * dim_in + meas_state] = 1.; + perm[meas_state * dim_in + final_state] = 1.; + for (size_t j=0; j < dim_in; j++) { + if (j != final_state && j != meas_state) + perm[j * dim_in + j] = 1.; + } + + // apply permutation to swap state #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i 0){ //out of chunk exchange + for(i=0;i @@ -488,6 +493,12 @@ void StateChunk::allocate(uint_t num_qubits,uint_t block_bits) //only first one allocates chunks, others only set chunk index nchunks = 0; } + + //initialize qubit map + qubit_map_.resize(num_qubits_); + for(i=0;i @@ -847,19 +858,36 @@ void StateChunk::apply_save_expval(const Operations::Op &op, } } +template +uint_t StateChunk::mapped_index(const uint_t idx) +{ + uint_t i,ret = 0; + uint_t t = idx; + + for(i=0;i>= 1; + } + return ret; +} + template void StateChunk::apply_chunk_swap(const reg_t &qubits) { - uint_t q0,q1,t,nLarge; + uint_t q0,q1,nLarge; int_t iChunk; q0 = qubits[qubits.size() - 2]; q1 = qubits[qubits.size() - 1]; + if(qubit_scale() == 1){ + std::swap(qubit_map_[q0],qubit_map_[q1]); + } + if(q0 > q1){ - t = q0; - q0 = q1; - q1 = t; + std::swap(q0,q1); } if(q1 < chunk_bits_*qubit_scale()){ @@ -1021,6 +1049,136 @@ void StateChunk::apply_chunk_swap(const reg_t &qubits) } } +template +void StateChunk::apply_chunk_x(const uint_t qubit) +{ + uint_t q0,q1,t,nLarge; + int_t iChunk; + + + if(qubit < chunk_bits_*qubit_scale()){ + reg_t qubits(1,qubit); +#pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) + for(iChunk=0;iChunk>= (chunk_bits_*qubit_scale()); + + int proc_bits = 0; + uint_t procs = distributed_procs_; + while(procs > 1){ + if((procs & 1) != 0){ + proc_bits = -1; + break; + } + proc_bits++; + procs >>= 1; + } + + if(distributed_procs_ == 1 || (proc_bits >= 0 && qubit < (num_qubits_*qubit_scale() - proc_bits))){ //no data transfer between processes is needed + nPair = num_local_chunks_ >> 1; + +#pragma omp parallel for if(chunk_omp_parallel_) private(iPair,baseChunk,iChunk1,iChunk2) + for(iPair=0;iPair=0;i--){ + baseChunk += (iu[i] << ub[i]); + //update for next + iu[i] += add; + add = 0; + if(iu[i] >= nu[i]){ + iu[i] = 0; + add = 1; + } + } + + iChunk1 = baseChunk; + iChunk2 = baseChunk | mask; + + if(iChunk1 >= chunk_index_begin_[distributed_rank_] && iChunk1 < chunk_index_end_[distributed_rank_]){ //chunk1 is on this process + if(iChunk2 >= chunk_index_begin_[distributed_rank_] && iChunk2 < chunk_index_end_[distributed_rank_]){ //chunk2 is on this process + qregs_[iChunk1 - global_chunk_index_].apply_chunk_swap(qubits,qregs_[iChunk2 - global_chunk_index_],true); + continue; + } + else{ + iLocalChunk = iChunk1; + iRemoteChunk = iChunk2; + iProc = get_process_by_chunk(iChunk2); + } + } + else{ + if(iChunk2 >= chunk_index_begin_[distributed_rank_] && iChunk2 < chunk_index_end_[distributed_rank_]){ //chunk2 is on this process + iLocalChunk = iChunk2; + iRemoteChunk = iChunk1; + iProc = get_process_by_chunk(iChunk1); + } + else{ + continue; //there is no chunk for this pair on this process + } + } + + MPI_Request reqSend,reqRecv; + MPI_Status st; + uint_t sizeRecv,sizeSend; + + auto pSend = qregs_[iLocalChunk - global_chunk_index_].send_buffer(sizeSend); + MPI_Isend(pSend,sizeSend,MPI_BYTE,iProc,iPair,distributed_comm_,&reqSend); + + auto pRecv = qregs_[iLocalChunk - global_chunk_index_].recv_buffer(sizeRecv); + MPI_Irecv(pRecv,sizeRecv,MPI_BYTE,iProc,iPair,distributed_comm_,&reqRecv); + + MPI_Wait(&reqSend,&st); + MPI_Wait(&reqRecv,&st); + + qregs_[iLocalChunk - global_chunk_index_].apply_chunk_swap(qubits,iRemoteChunk); + } + } +#endif + + } +} template void StateChunk::send_chunk(uint_t local_chunk_index, uint_t global_pair_index) diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp index 84076cc72c..a8077a85f7 100755 --- a/src/simulators/statevector/statevector_state.hpp +++ b/src/simulators/statevector/statevector_state.hpp @@ -580,6 +580,7 @@ void State::apply_ops(const std::vector &ops, else if(op.name == "end_register_blocking"){ BaseState::qreg_.leave_register_blocking(); } + break; case OpType::save_expval: case OpType::save_expval_var: BaseState::apply_save_expval(op, result); diff --git a/src/simulators/statevector/statevector_state_chunk.hpp b/src/simulators/statevector/statevector_state_chunk.hpp index 69992ba261..c7582e72af 100644 --- a/src/simulators/statevector/statevector_state_chunk.hpp +++ b/src/simulators/statevector/statevector_state_chunk.hpp @@ -546,8 +546,6 @@ void State::apply_op(const int_t iChunk,const Operations::Op &op, RngEngine &rng, bool final_ops) { - uint_t ireg; - if(BaseState::creg_.check_conditional(op)) { switch (op.type) { case Operations::OpType::barrier: @@ -562,11 +560,9 @@ void State::apply_op(const int_t iChunk,const Operations::Op &op, apply_measure(op.qubits, op.memory, op.registers, rng); break; case Operations::OpType::bfunc: - if(iChunk == 0 || ireg > 0) BaseState::creg_.apply_bfunc(op); break; case Operations::OpType::roerror: - if(iChunk == 0 || ireg > 0) BaseState::creg_.apply_roerror(op, rng); break; case Operations::OpType::gate: @@ -825,10 +821,11 @@ void State::apply_save_amplitudes(const Operations::Op &op, if (op.type == Operations::OpType::save_amps) { Vector amps(size, false); for (int_t i = 0; i < size; ++i) { - uint_t iChunk = op.int_params[i] >> BaseState::chunk_bits_; + uint_t idx = BaseState::mapped_index(op.int_params[i]); + uint_t iChunk = idx >> BaseState::chunk_bits_; amps[i] = 0.0; if(iChunk >= BaseState::global_chunk_index_ && iChunk < BaseState::global_chunk_index_ + BaseState::num_local_chunks_){ - amps[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].get_state(op.int_params[i] - (iChunk << BaseState::chunk_bits_)); + amps[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].get_state(idx - (iChunk << BaseState::chunk_bits_)); } #ifdef AER_MPI complex_t amp = amps[i]; @@ -842,9 +839,10 @@ void State::apply_save_amplitudes(const Operations::Op &op, else{ rvector_t amps_sq(size,0); for (int_t i = 0; i < size; ++i) { - uint_t iChunk = op.int_params[i] >> BaseState::chunk_bits_; + uint_t idx = BaseState::mapped_index(op.int_params[i]); + uint_t iChunk = idx >> BaseState::chunk_bits_; if(iChunk >= BaseState::global_chunk_index_ && iChunk < BaseState::global_chunk_index_ + BaseState::num_local_chunks_){ - amps_sq[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].probability(op.int_params[i] - (iChunk << BaseState::chunk_bits_)); + amps_sq[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_].probability(idx - (iChunk << BaseState::chunk_bits_)); } } #ifdef AER_MPI @@ -1376,33 +1374,46 @@ rvector_t State::measure_probs(const reg_t &qubits) const #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i,j,k) for(i=0;i 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if(qubits_in_chunk.size() == qubits.size()){ - for(j=0;j> i_in) & 1) << k); - i_in++; - } - else{ - if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ - idx += 1ull << k; + else{ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ + idx += 1ull << k; + } } } - } #pragma omp atomic - sum[idx] += chunkSum[j]; + sum[idx] += chunkSum[j]; + } } } + else{ //there is no bit in chunk + auto nr = std::real(BaseState::qregs_[i].norm()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += nr; + } } #ifdef AER_MPI @@ -1493,28 +1504,14 @@ std::vector State::sample_measure(const reg_t &qubits, std::vector all_samples; all_samples.reserve(shots); - if(qubits.size() == 0){ - //return all bits if qubits is empty (for multi-shot parallelization) - for (int_t val : allbit_samples) { - reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::num_qubits_); - reg_t sample; - sample.reserve(BaseState::num_qubits_); - for (uint_t qubit = 0 ; qubit < BaseState::num_qubits_; qubit++) { - sample.push_back(allbit_sample[qubit]); - } - all_samples.push_back(sample); - } - } - else{ - for (int_t val : allbit_samples) { - reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::num_qubits_); - reg_t sample; - sample.reserve(qubits.size()); - for (uint_t qubit : qubits) { - sample.push_back(allbit_sample[qubit]); - } - all_samples.push_back(sample); + for (int_t val : allbit_samples) { + reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::num_qubits_); + reg_t sample; + sample.reserve(qubits.size()); + for (uint_t qubit : qubits) { + sample.push_back(allbit_sample[qubit]); } + all_samples.push_back(sample); } return all_samples; } @@ -1557,13 +1554,11 @@ void State::measure_reset_update(const std::vector &qubits, #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) for(i=0;i::measure_reset_update(const std::vector &qubits, #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) for(i=0;i 0){ //in chunk exchange + const size_t dim_in = 1ULL << qubits_in_chunk.size(); + // build vectorized permutation matrix + cvector_t perm(dim_in * dim_in, 0.); + perm[final_state * dim_in + meas_state] = 1.; + perm[meas_state * dim_in + final_state] = 1.; + for (size_t j=0; j < dim_in; j++) { + if (j != final_state && j != meas_state) + perm[j * dim_in + j] = 1.; + } + // apply permutation to swap state #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) - for(i=0;i 0){ //out of chunk exchange + for(i=0;i& waiting_ops) const; bool split_pauli(const Operations::Op& op, const reg_t blockedQubits, std::vector& out,std::vector& queue) const; + + bool is_blockable_operation(Operations::Op& op) const; }; void CacheBlocking::set_config(const json_t &config) @@ -281,19 +289,24 @@ bool CacheBlocking::can_reorder(Operations::Op& op,std::vector& //check if the operation can be reordered in front of waiting queue uint_t j,iq,jq; - //only gate and matrix can reorder + //only gate and matrix can be reordered if(op.type != Operations::OpType::gate && op.type != Operations::OpType::matrix){ return false; } for(j=0;j queue_next; n = add_ops(circ.ops,out,queue,doSwap,true); - put_nongate_ops(out,queue_next,queue,doSwap); - queue.clear(); - while(queue_next.size() > 0){ - n = add_ops(queue_next,out,queue,doSwap,false); +// put_nongate_ops(out,queue_next,queue,doSwap); +// queue.clear(); + while(queue.size() > 0){ + n = add_ops(queue,out,queue_next,doSwap,false); + queue = queue_next; queue_next.clear(); - put_nongate_ops(out,queue_next,queue,doSwap); +// put_nongate_ops(out,queue_next,queue,doSwap); if(n == 0){ break; } - queue.clear(); +// queue.clear(); } if(queue.size() > 0) return false; + +// restore_qubits_order(out); + circ.ops = out; return true; } @@ -421,6 +438,15 @@ void CacheBlocking::restore_qubits_order(std::vector& ops) const }while(count != 0); } +bool CacheBlocking::is_blockable_operation(Operations::Op& op) const +{ + if(op.type == Operations::OpType::gate || op.type == Operations::OpType::matrix || + op.type == Operations::OpType::diagonal_matrix || op.type == Operations::OpType::multiplexer){ + return true; + } + return false; +} + uint_t CacheBlocking::add_ops(std::vector& ops,std::vector& out,std::vector& queue,bool doSwap,bool first) const { uint_t i,j,iq; @@ -430,6 +456,7 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector& ops,std::vector 0 && !end_block_inserted){ //insert end of block to synchronize chunks + insert_sim_op(out,"end_blocking",blockedQubits); + } + else if(!end_block_inserted){ + out.pop_back(); + } + if(ops[i].type == Operations::OpType::kraus){ + if(ops[i].qubits.size() >= block_bits_){ + throw std::runtime_error("CacheBlocking : kraus number of qubits should be smaller than chunk qubit size"); + break; + } + if(!can_block(ops[i],blockedQubits)){ //if some qubits are out of chunk, queued for next step + queue.push_back(ops[i]); + continue; + } + } + if(!end_block_inserted){ + if(sample_measure_ && ops[i].type == Operations::OpType::measure){ + //currently sampling should be done with original qubit mapping (TO DO : sampling without inserting swaps) + restore_qubits_order(out); + } + else if(ops[i].type == Operations::OpType::save_statevec || ops[i].type == Operations::OpType::save_statevec_dict || ops[i].type == Operations::OpType::save_densmat || ops[i].type == Operations::OpType::save_unitary){ + restore_qubits_order(out); + } + } + //mapping swapped qubits + for(iq=0;iq 0){ - insert_sim_op(out,"end_blocking",blockedQubits); - } - else{ - //pop unnecessary operations - while(out.size() > pos_begin){ - out.pop_back(); + if(!end_block_inserted){ + if(num_gates_added > 0){ + insert_sim_op(out,"end_blocking",blockedQubits); + } + else{ + //pop unnecessary operations + while(out.size() > pos_begin){ + out.pop_back(); + } } } } @@ -633,6 +703,9 @@ bool CacheBlocking::is_cross_qubits_op(Operations::Op& op) const if(op.qubits.size() > 1) return true; } + if(op.type == Operations::OpType::kraus){ + return true; + } return false; } @@ -721,6 +794,7 @@ bool CacheBlocking::split_pauli(const Operations::Op& op,const reg_t blockedQubi return false; } + //------------------------------------------------------------------------- } // end namespace Transpile } // end namespace AER From 6c00af26653d857c478bdb2959183d7c59ed3311 Mon Sep 17 00:00:00 2001 From: doichanj Date: Mon, 29 Mar 2021 16:58:45 +0900 Subject: [PATCH 2/2] add save_state operation to qubit restoring --- src/transpile/cacheblocking.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transpile/cacheblocking.hpp b/src/transpile/cacheblocking.hpp index 3b9b420976..9fa69a47b6 100644 --- a/src/transpile/cacheblocking.hpp +++ b/src/transpile/cacheblocking.hpp @@ -567,14 +567,14 @@ uint_t CacheBlocking::add_ops(std::vector& ops,std::vector