Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/controllers/qasm_controller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1103,6 +1103,7 @@ void QasmController::run_circuit_helper(const Circuit& circ,
if(method == Method::density_matrix || method == Method::density_matrix_thrust_gpu || method == Method::density_matrix_thrust_cpu)
is_matrix = true;
auto cache_block_pass = transpile_cache_blocking(opt_circ,noise,config,(simulation_precision_ == Precision::single_precision) ? sizeof(std::complex<float>) : sizeof(std::complex<double>),is_matrix);
cache_block_pass.set_sample_measure(check_measure_sampling_opt(opt_circ, method));
cache_block_pass.optimize_circuit(opt_circ, dummy_noise, state.opset(), result);

uint_t block_bits = 0;
Expand Down
115 changes: 82 additions & 33 deletions src/simulators/density_matrix/densitymatrix_state_chunk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,9 @@ void State<densmat_t>::apply_chunk_swap(const reg_t &qubits)
uint_t q0,q1;
q0 = qubits[0];
q1 = qubits[1];

std::swap(BaseState::qubit_map_[q0],BaseState::qubit_map_[q1]);

if(qubits[0] >= BaseState::chunk_bits_){
q0 += BaseState::chunk_bits_;
}
Expand Down Expand Up @@ -713,8 +716,9 @@ void State<densmat_t>::apply_save_amplitudes_sq(const Operations::Op &op,
BaseState::threads_ > 1) \
num_threads(BaseState::threads_)
for (int_t i = 0; i < size; ++i) {
if(op.int_params[i] >= (irow << BaseState::chunk_bits_) && op.int_params[i] < ((irow+1) << BaseState::chunk_bits_))
amps_sq[i] = BaseState::qregs_[iChunk].probability(op.int_params[i] - (irow << BaseState::chunk_bits_));
uint_t idx = BaseState::mapped_index(op.int_params[i]);
if(idx >= (irow << BaseState::chunk_bits_) && idx < ((irow+1) << BaseState::chunk_bits_))
amps_sq[i] = BaseState::qregs_[iChunk].probability(idx - (irow << BaseState::chunk_bits_));
}
}
#ifdef AER_MPI
Expand Down Expand Up @@ -1240,35 +1244,52 @@ rvector_t State<densmat_t>::measure_probs(const reg_t &qubits) const
icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_)));

if(irow == icol){ //diagonal chunk
auto chunkSum = BaseState::qregs_[i].probabilities(qubits);
if(qubits_in_chunk.size() == qubits.size()){
for(j=0;j<dim;j++){
if(qubits_in_chunk.size() > 0){
auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk);
if(qubits_in_chunk.size() == qubits.size()){
for(j=0;j<dim;j++){
#pragma omp atomic
sum[j] += chunkSum[j];
sum[j] += chunkSum[j];
}
}
}
else{
for(j=0;j<chunkSum.size();j++){
int idx = 0;
int i_in = 0;
for(k=0;k<qubits.size();k++){
if(qubits[k] < (BaseState::chunk_bits_*2)){
idx += (((j >> i_in) & 1) << k);
i_in++;
}
else{
if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){
idx += 1ull << k;
else{
for(j=0;j<chunkSum.size();j++){
int idx = 0;
int i_in = 0;
for(k=0;k<qubits.size();k++){
if(qubits[k] < (BaseState::chunk_bits_)){
idx += (((j >> i_in) & 1) << k);
i_in++;
}
else{
if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){
idx += 1ull << k;
}
}
}
}
#pragma omp atomic
sum[idx] += chunkSum[j];
sum[idx] += chunkSum[j];
}
}
}
else{ //there is no bit in chunk
auto tr = std::real(BaseState::qregs_[i].trace());
int idx = 0;
for(k=0;k<qubits_out_chunk.size();k++){
if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits_out_chunk[k]) & 1){
idx += 1ull << k;
}
}
#pragma omp atomic
sum[idx] += tr;
}
}
}

#ifdef AER_MPI
BaseState::reduce_sum(sum);
#endif

return sum;
}

Expand Down Expand Up @@ -1426,9 +1447,15 @@ void State<densmat_t>::measure_reset_update(const reg_t &qubits,

// If it doesn't agree with the reset state update
if (final_state != meas_state) {
if(qubits[0] < BaseState::chunk_bits_){
#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i)
for(i=0;i<BaseState::num_local_chunks_;i++){
BaseState::qregs_[i].apply_x(qubits[0]);
for(i=0;i<BaseState::num_local_chunks_;i++){
BaseState::qregs_[i].apply_x(qubits[0]);
}
}
else{
BaseState::apply_chunk_x(qubits[0]);
BaseState::apply_chunk_x(qubits[0]+BaseState::chunk_bits_);
}
}
}
Expand All @@ -1447,18 +1474,40 @@ void State<densmat_t>::measure_reset_update(const reg_t &qubits,
// If it doesn't agree with the reset state update
// TODO This function could be optimized as a permutation update
if (final_state != meas_state) {
// build vectorized permutation matrix
cvector_t perm(dim * dim, 0.);
perm[final_state * dim + meas_state] = 1.;
perm[meas_state * dim + final_state] = 1.;
for (size_t j=0; j < dim; j++) {
if (j != final_state && j != meas_state)
perm[j * dim + j] = 1.;
reg_t qubits_in_chunk;
reg_t qubits_out_chunk;

for(i=0;i<qubits.size();i++){
if(qubits[i] < BaseState::chunk_bits_){
qubits_in_chunk.push_back(qubits[i]);
}
else{
qubits_out_chunk.push_back(qubits[i]);
}
}
// apply permutation to swap state

if(qubits_in_chunk.size() > 0){ //in chunk exchange
const size_t dim_in = 1ULL << qubits_in_chunk.size();
// build vectorized permutation matrix
cvector_t perm(dim_in * dim_in, 0.);
perm[final_state * dim_in + meas_state] = 1.;
perm[meas_state * dim_in + final_state] = 1.;
for (size_t j=0; j < dim_in; j++) {
if (j != final_state && j != meas_state)
perm[j * dim_in + j] = 1.;
}

// apply permutation to swap state
#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i)
for(i=0;i<BaseState::num_local_chunks_;i++){
BaseState::qregs_[i].apply_unitary_matrix(qubits, perm);
for(i=0;i<BaseState::num_local_chunks_;i++){
BaseState::qregs_[i].apply_unitary_matrix(qubits, perm);
}
}
if(qubits_out_chunk.size() > 0){ //out of chunk exchange
for(i=0;i<qubits_out_chunk.size();i++){
BaseState::apply_chunk_x(qubits_out_chunk[i]);
BaseState::apply_chunk_x(qubits_out_chunk[i]+(BaseState::num_qubits_ - BaseState::chunk_bits_));
}
}
}
}
Expand Down
166 changes: 162 additions & 4 deletions src/simulators/state_chunk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,8 @@ class StateChunk {
bool chunk_omp_parallel_; //using thread parallel to process loop of chunks or not
bool gpu_optimization_; //optimization for GPU

reg_t qubit_map_; //qubit map to restore swapped qubits

virtual int qubit_scale(void)
{
return 1; //scale of qubit number (x2 for density and unitary matrices)
Expand All @@ -315,6 +317,8 @@ class StateChunk {
//swap between chunks
virtual void apply_chunk_swap(const reg_t &qubits);

virtual void apply_chunk_x(const uint_t qubit);

//send/receive chunk in receive buffer
void send_chunk(uint_t local_chunk_index, uint_t global_chunk_index);
void recv_chunk(uint_t local_chunk_index, uint_t global_chunk_index);
Expand Down Expand Up @@ -360,6 +364,7 @@ class StateChunk {
MPI_Comm distributed_comm_;
#endif

uint_t mapped_index(const uint_t idx);
};

template <class state_t>
Expand Down Expand Up @@ -488,6 +493,12 @@ void StateChunk<state_t>::allocate(uint_t num_qubits,uint_t block_bits)
//only first one allocates chunks, others only set chunk index
nchunks = 0;
}

//initialize qubit map
qubit_map_.resize(num_qubits_);
for(i=0;i<num_qubits_;i++){
qubit_map_[i] = i;
}
}

template <class state_t>
Expand Down Expand Up @@ -847,19 +858,36 @@ void StateChunk<state_t>::apply_save_expval(const Operations::Op &op,
}
}

template <class state_t>
uint_t StateChunk<state_t>::mapped_index(const uint_t idx)
{
uint_t i,ret = 0;
uint_t t = idx;

for(i=0;i<num_qubits_;i++){
if(t & 1){
ret |= (1ull << qubit_map_[i]);
}
t >>= 1;
}
return ret;
}

template <class state_t>
void StateChunk<state_t>::apply_chunk_swap(const reg_t &qubits)
{
uint_t q0,q1,t,nLarge;
uint_t q0,q1,nLarge;
int_t iChunk;

q0 = qubits[qubits.size() - 2];
q1 = qubits[qubits.size() - 1];

if(qubit_scale() == 1){
std::swap(qubit_map_[q0],qubit_map_[q1]);
}

if(q0 > q1){
t = q0;
q0 = q1;
q1 = t;
std::swap(q0,q1);
}

if(q1 < chunk_bits_*qubit_scale()){
Expand Down Expand Up @@ -1021,6 +1049,136 @@ void StateChunk<state_t>::apply_chunk_swap(const reg_t &qubits)
}
}

template <class state_t>
void StateChunk<state_t>::apply_chunk_x(const uint_t qubit)
{
uint_t q0,q1,t,nLarge;
int_t iChunk;


if(qubit < chunk_bits_*qubit_scale()){
reg_t qubits(1,qubit);
#pragma omp parallel for if(chunk_omp_parallel_) private(iChunk)
for(iChunk=0;iChunk<num_local_chunks_;iChunk++){
qregs_[iChunk].apply_mcx(qubits);
}
}
else{ //exchange over chunks
int_t iPair;
uint_t nPair,mask;
uint_t baseChunk,iChunk1,iChunk2;
reg_t qubits(2);
qubits[0] = qubit;
qubits[1] = qubit;

mask = (1ull << qubit);
mask >>= (chunk_bits_*qubit_scale());

int proc_bits = 0;
uint_t procs = distributed_procs_;
while(procs > 1){
if((procs & 1) != 0){
proc_bits = -1;
break;
}
proc_bits++;
procs >>= 1;
}

if(distributed_procs_ == 1 || (proc_bits >= 0 && qubit < (num_qubits_*qubit_scale() - proc_bits))){ //no data transfer between processes is needed
nPair = num_local_chunks_ >> 1;

#pragma omp parallel for if(chunk_omp_parallel_) private(iPair,baseChunk,iChunk1,iChunk2)
for(iPair=0;iPair<nPair;iPair++){
baseChunk = iPair & (mask-1);
baseChunk += ((iPair - baseChunk) << 1);

iChunk1 = baseChunk;
iChunk2 = baseChunk | mask;

qregs_[iChunk1].apply_chunk_swap(qubits,qregs_[iChunk2],true);
}
}
#ifdef AER_MPI
else{
//chunk scheduler that supports any number of processes
uint_t nu[3];
uint_t ub[3];
uint_t iu[3];
uint_t add;
uint_t iLocalChunk,iRemoteChunk,iProc;
int i;

nLarge = 1;
nu[0] = 1ull << (qubit - chunk_bits_*qubit_scale());
ub[0] = 0;
iu[0] = 0;

nu[1] = 1ull << (num_qubits_*qubit_scale() - qubit - 1);
ub[1] = (qubit - chunk_bits_*qubit_scale()) + 1;
iu[1] = 0;
nPair = 1ull << (num_qubits_*qubit_scale() - chunk_bits_*qubit_scale() - 1);

for(iPair=0;iPair<nPair;iPair++){
//calculate index of pair of chunks
baseChunk = 0;
add = 1;
for(i=1;i>=0;i--){
baseChunk += (iu[i] << ub[i]);
//update for next
iu[i] += add;
add = 0;
if(iu[i] >= nu[i]){
iu[i] = 0;
add = 1;
}
}

iChunk1 = baseChunk;
iChunk2 = baseChunk | mask;

if(iChunk1 >= chunk_index_begin_[distributed_rank_] && iChunk1 < chunk_index_end_[distributed_rank_]){ //chunk1 is on this process
if(iChunk2 >= chunk_index_begin_[distributed_rank_] && iChunk2 < chunk_index_end_[distributed_rank_]){ //chunk2 is on this process
qregs_[iChunk1 - global_chunk_index_].apply_chunk_swap(qubits,qregs_[iChunk2 - global_chunk_index_],true);
continue;
}
else{
iLocalChunk = iChunk1;
iRemoteChunk = iChunk2;
iProc = get_process_by_chunk(iChunk2);
}
}
else{
if(iChunk2 >= chunk_index_begin_[distributed_rank_] && iChunk2 < chunk_index_end_[distributed_rank_]){ //chunk2 is on this process
iLocalChunk = iChunk2;
iRemoteChunk = iChunk1;
iProc = get_process_by_chunk(iChunk1);
}
else{
continue; //there is no chunk for this pair on this process
}
}

MPI_Request reqSend,reqRecv;
MPI_Status st;
uint_t sizeRecv,sizeSend;

auto pSend = qregs_[iLocalChunk - global_chunk_index_].send_buffer(sizeSend);
MPI_Isend(pSend,sizeSend,MPI_BYTE,iProc,iPair,distributed_comm_,&reqSend);

auto pRecv = qregs_[iLocalChunk - global_chunk_index_].recv_buffer(sizeRecv);
MPI_Irecv(pRecv,sizeRecv,MPI_BYTE,iProc,iPair,distributed_comm_,&reqRecv);

MPI_Wait(&reqSend,&st);
MPI_Wait(&reqRecv,&st);

qregs_[iLocalChunk - global_chunk_index_].apply_chunk_swap(qubits,iRemoteChunk);
}
}
#endif

}
}

template <class state_t>
void StateChunk<state_t>::send_chunk(uint_t local_chunk_index, uint_t global_pair_index)
Expand Down
Loading