Skip to content

Commit f27286f

Browse files
melven authored and Takishima committed
C++ simulator performance improvements (#329)
* C++ simulator performance: make the swap gate run in native C++
  It was previously defined as a BasicMathGate, which made it run as Python code through the emulate_math_wrapper. The new variant just uses its matrix representation to run in native code.
* C++ simulator performance: add dedicated C++ code for common math gates
  The BasicMathGate uses a C++ Python wrapper (emulate_math_wrapper) to allow generic calculations, which makes it very slow. This change detects some math gates and provides a native C++ implementation for them.
* C++ simulator performance: use larger memory alignment
* C++ simulator performance: recycle large StateVector memory buffers
  This avoids costly std::vector copying/reallocations by using static std::vector buffers to reuse allocated memory (by std::swap'ing a vector into a buffer for later use when it would otherwise be deallocated).
* C++ simulator performance: improve compiler flags
* Add test coverage for constant math emulation
* Revert "Add test coverage for constant math emulation"
  This reverts commit 3bb8a2c.
* Add test coverage for constant math emulation
1 parent 9db3383 commit f27286f

File tree

6 files changed

+196
-45
lines changed

6 files changed

+196
-45
lines changed

projectq/backends/_sim/_cppkernels/simulator.hpp

Lines changed: 114 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ class Simulator{
3838
public:
3939
using calc_type = double;
4040
using complex_type = std::complex<calc_type>;
41-
using StateVector = std::vector<complex_type, aligned_allocator<complex_type,64>>;
41+
using StateVector = std::vector<complex_type, aligned_allocator<complex_type,512>>;
4242
using Map = std::map<unsigned, unsigned>;
4343
using RndEngine = std::mt19937;
4444
using Term = std::vector<std::pair<unsigned, char>>;
@@ -55,11 +55,18 @@ class Simulator{
5555
void allocate_qubit(unsigned id){
5656
if (map_.count(id) == 0){
5757
map_[id] = N_++;
58-
auto newvec = StateVector(1UL << N_);
59-
#pragma omp parallel for schedule(static)
58+
StateVector newvec; // avoid large memory allocations
59+
if( tmpBuff1_.capacity() >= (1UL << N_) )
60+
std::swap(newvec, tmpBuff1_);
61+
newvec.resize(1UL << N_);
62+
#pragma omp parallel for schedule(static)
6063
for (std::size_t i = 0; i < newvec.size(); ++i)
6164
newvec[i] = (i < vec_.size())?vec_[i]:0.;
62-
vec_ = std::move(newvec);
65+
std::swap(vec_, newvec);
66+
// recycle large memory
67+
std::swap(tmpBuff1_, newvec);
68+
if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
69+
std::swap(tmpBuff1_, tmpBuff2_);
6370
}
6471
else
6572
throw(std::runtime_error(
@@ -113,12 +120,18 @@ class Simulator{
113120
}
114121
}
115122
else{
116-
StateVector newvec((1UL << (N_-1)));
117-
#pragma omp parallel for schedule(static)
123+
StateVector newvec; // avoid costly memory reallocations
124+
if( tmpBuff1_.capacity() >= (1UL << (N_-1)) )
125+
std::swap(tmpBuff1_, newvec);
126+
newvec.resize((1UL << (N_-1)));
127+
#pragma omp parallel for schedule(static) if(0)
118128
for (std::size_t i = 0; i < vec_.size(); i += 2*delta)
119129
std::copy_n(&vec_[i + static_cast<std::size_t>(value)*delta],
120130
delta, &newvec[i/2]);
121-
vec_ = std::move(newvec);
131+
std::swap(vec_, newvec);
132+
std::swap(tmpBuff1_, newvec);
133+
if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
134+
std::swap(tmpBuff1_, tmpBuff2_);
122135

123136
for (auto& p : map_){
124137
if (p.second > pos)
@@ -189,8 +202,8 @@ class Simulator{
189202
}
190203

191204
template <class M>
192-
void apply_controlled_gate(M const& m, std::vector<unsigned> ids,
193-
std::vector<unsigned> ctrl){
205+
void apply_controlled_gate(M const& m, const std::vector<unsigned>& ids,
206+
const std::vector<unsigned>& ctrl){
194207
auto fused_gates = fused_gates_;
195208
fused_gates.insert(m, ids, ctrl);
196209

@@ -209,46 +222,85 @@ class Simulator{
209222
}
210223

211224
template <class F, class QuReg>
212-
void emulate_math(F const& f, QuReg quregs, std::vector<unsigned> ctrl,
213-
unsigned num_threads=1){
225+
void emulate_math(F const& f, QuReg quregs, const std::vector<unsigned>& ctrl,
226+
bool parallelize = false){
214227
run();
215228
auto ctrlmask = get_control_mask(ctrl);
216229

217230
for (unsigned i = 0; i < quregs.size(); ++i)
218231
for (unsigned j = 0; j < quregs[i].size(); ++j)
219232
quregs[i][j] = map_[quregs[i][j]];
220233

221-
StateVector newvec(vec_.size(), 0.);
222-
std::vector<int> res(quregs.size());
223-
224-
#pragma omp parallel for schedule(static) firstprivate(res) num_threads(num_threads)
225-
for (std::size_t i = 0; i < vec_.size(); ++i){
226-
if ((ctrlmask&i) == ctrlmask){
227-
for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
228-
res[qr_i] = 0;
229-
for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
230-
res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i;
231-
}
232-
f(res);
233-
auto new_i = i;
234-
for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
235-
for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
236-
if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
237-
new_i ^= (1UL << quregs[qr_i][qb_i]);
238-
}
239-
}
240-
newvec[new_i] += vec_[i];
241-
}
242-
else
243-
newvec[i] += vec_[i];
234+
StateVector newvec; // avoid costly memory reallocations
235+
if( tmpBuff1_.capacity() >= vec_.size() )
236+
std::swap(newvec, tmpBuff1_);
237+
newvec.resize(vec_.size());
238+
#pragma omp parallel for schedule(static)
239+
for (std::size_t i = 0; i < vec_.size(); i++)
240+
newvec[i] = 0;
241+
242+
//#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5
243+
{
244+
std::vector<int> res(quregs.size());
245+
//#pragma omp for schedule(static)
246+
for (std::size_t i = 0; i < vec_.size(); ++i){
247+
if ((ctrlmask&i) == ctrlmask){
248+
for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
249+
res[qr_i] = 0;
250+
for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
251+
res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i;
252+
}
253+
f(res);
254+
auto new_i = i;
255+
for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
256+
for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
257+
if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
258+
new_i ^= (1UL << quregs[qr_i][qb_i]);
259+
}
260+
}
261+
newvec[new_i] += vec_[i];
262+
}
263+
else
264+
newvec[i] += vec_[i];
265+
}
244266
}
245-
vec_ = std::move(newvec);
267+
std::swap(vec_, newvec);
268+
std::swap(tmpBuff1_, newvec);
269+
}
270+
271+
// faster version without calling python
272+
template<class QuReg>
273+
inline void emulate_math_addConstant(int a, const QuReg& quregs, const std::vector<unsigned>& ctrl)
274+
{
275+
emulate_math([a](std::vector<int> &res){for(auto& x: res) x = x + a;}, quregs, ctrl, true);
276+
}
277+
278+
// faster version without calling python
279+
template<class QuReg>
280+
inline void emulate_math_addConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
281+
{
282+
emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x + a) % N;}, quregs, ctrl, true);
283+
}
284+
285+
// faster version without calling python
286+
template<class QuReg>
287+
inline void emulate_math_multiplyByConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
288+
{
289+
emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x * a) % N;}, quregs, ctrl, true);
246290
}
247291

248292
calc_type get_expectation_value(TermsDict const& td, std::vector<unsigned> const& ids){
249293
run();
250294
calc_type expectation = 0.;
251-
auto current_state = vec_;
295+
296+
StateVector current_state; // avoid costly memory reallocations
297+
if( tmpBuff1_.capacity() >= vec_.size() )
298+
std::swap(tmpBuff1_, current_state);
299+
current_state.resize(vec_.size());
300+
#pragma omp parallel for schedule(static)
301+
for (std::size_t i = 0; i < vec_.size(); ++i)
302+
current_state[i] = vec_[i];
303+
252304
for (auto const& term : td){
253305
auto const& coefficient = term.second;
254306
apply_term(term.first, ids, {});
@@ -260,17 +312,29 @@ class Simulator{
260312
auto const a2 = std::real(vec_[i]);
261313
auto const b2 = std::imag(vec_[i]);
262314
delta += a1 * a2 - b1 * b2;
315+
// reset vec_
316+
vec_[i] = current_state[i];
263317
}
264318
expectation += coefficient * delta;
265-
vec_ = current_state;
266319
}
320+
std::swap(current_state, tmpBuff1_);
267321
return expectation;
268322
}
269323

270324
void apply_qubit_operator(ComplexTermsDict const& td, std::vector<unsigned> const& ids){
271325
run();
272-
auto new_state = StateVector(vec_.size(), 0.);
273-
auto current_state = vec_;
326+
StateVector new_state, current_state; // avoid costly memory reallocations
327+
if( tmpBuff1_.capacity() >= vec_.size() )
328+
std::swap(tmpBuff1_, new_state);
329+
if( tmpBuff2_.capacity() >= vec_.size() )
330+
std::swap(tmpBuff2_, current_state);
331+
new_state.resize(vec_.size());
332+
current_state.resize(vec_.size());
333+
#pragma omp parallel for schedule(static)
334+
for (std::size_t i = 0; i < vec_.size(); ++i){
335+
new_state[i] = 0;
336+
current_state[i] = vec_[i];
337+
}
274338
for (auto const& term : td){
275339
auto const& coefficient = term.second;
276340
apply_term(term.first, ids, {});
@@ -280,7 +344,9 @@ class Simulator{
280344
vec_[i] = current_state[i];
281345
}
282346
}
283-
vec_ = std::move(new_state);
347+
std::swap(vec_, new_state);
348+
std::swap(tmpBuff1_, new_state);
349+
std::swap(tmpBuff2_, current_state);
284350
}
285351

286352
calc_type get_probability(std::vector<bool> const& bit_string,
@@ -452,6 +518,8 @@ class Simulator{
452518
#pragma omp parallel
453519
kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
454520
break;
521+
default:
522+
throw std::invalid_argument("Gates with more than 5 qubits are not supported!");
455523
}
456524

457525
fused_gates_ = Fusion();
@@ -500,6 +568,12 @@ class Simulator{
500568
unsigned fusion_qubits_min_, fusion_qubits_max_;
501569
RndEngine rnd_eng_;
502570
std::function<double()> rng_;
571+
572+
// large array buffers to avoid costly reallocations
573+
static StateVector tmpBuff1_, tmpBuff2_;
503574
};
504575

576+
Simulator::StateVector Simulator::tmpBuff1_;
577+
Simulator::StateVector Simulator::tmpBuff2_;
578+
505579
#endif

projectq/backends/_sim/_cppsim.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ PYBIND11_PLUGIN(_cppsim) {
5050
.def("measure_qubits", &Simulator::measure_qubits_return)
5151
.def("apply_controlled_gate", &Simulator::apply_controlled_gate<MatrixType>)
5252
.def("emulate_math", &emulate_math_wrapper<QuRegs>)
53+
.def("emulate_math_addConstant", &Simulator::emulate_math_addConstant<QuRegs>)
54+
.def("emulate_math_addConstantModN", &Simulator::emulate_math_addConstantModN<QuRegs>)
55+
.def("emulate_math_multiplyByConstantModN", &Simulator::emulate_math_multiplyByConstantModN<QuRegs>)
5356
.def("get_expectation_value", &Simulator::get_expectation_value)
5457
.def("apply_qubit_operator", &Simulator::apply_qubit_operator)
5558
.def("emulate_time_evolution", &Simulator::emulate_time_evolution)

projectq/backends/_sim/_simulator.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,12 @@
3333
TimeEvolution)
3434
from projectq.types import WeakQubitRef
3535

36+
FALLBACK_TO_PYSIM = False
3637
try:
3738
from ._cppsim import Simulator as SimulatorBackend
3839
except ImportError:
3940
from ._pysim import Simulator as SimulatorBackend
41+
FALLBACK_TO_PYSIM = True
4042

4143

4244
class Simulator(BasicEngine):
@@ -384,14 +386,34 @@ def _handle(self, cmd):
384386
ID = cmd.qubits[0][0].id
385387
self._simulator.deallocate_qubit(ID)
386388
elif isinstance(cmd.gate, BasicMathGate):
389+
# improve performance by using C++ code for some common gates
390+
from projectq.libs.math import (AddConstant,
391+
AddConstantModN,
392+
MultiplyByConstantModN)
387393
qubitids = []
388394
for qr in cmd.qubits:
389395
qubitids.append([])
390396
for qb in qr:
391397
qubitids[-1].append(qb.id)
392-
math_fun = cmd.gate.get_math_function(cmd.qubits)
393-
self._simulator.emulate_math(math_fun, qubitids,
394-
[qb.id for qb in cmd.control_qubits])
398+
if FALLBACK_TO_PYSIM:
399+
math_fun = cmd.gate.get_math_function(cmd.qubits)
400+
self._simulator.emulate_math(math_fun, qubitids,
401+
[qb.id for qb in cmd.control_qubits])
402+
else:
403+
# individual code for different standard gates to make it faster!
404+
if isinstance(cmd.gate, AddConstant):
405+
self._simulator.emulate_math_addConstant(cmd.gate.a, qubitids,
406+
[qb.id for qb in cmd.control_qubits])
407+
elif isinstance(cmd.gate, AddConstantModN):
408+
self._simulator.emulate_math_addConstantModN(cmd.gate.a, cmd.gate.N, qubitids,
409+
[qb.id for qb in cmd.control_qubits])
410+
elif isinstance(cmd.gate, MultiplyByConstantModN):
411+
self._simulator.emulate_math_multiplyByConstantModN(cmd.gate.a, cmd.gate.N, qubitids,
412+
[qb.id for qb in cmd.control_qubits])
413+
else:
414+
math_fun = cmd.gate.get_math_function(cmd.qubits)
415+
self._simulator.emulate_math(math_fun, qubitids,
416+
[qb.id for qb in cmd.control_qubits])
395417
elif isinstance(cmd.gate, TimeEvolution):
396418
op = [(list(term), coeff) for (term, coeff)
397419
in cmd.gate.hamiltonian.terms.items()]

projectq/backends/_sim/_simulator_test.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,3 +683,55 @@ def receive(command_list):
683683
qubit1[0].id: qubit0[0].id}
684684
assert (sim._convert_logical_to_mapped_qureg(qubit0 + qubit1) ==
685685
qubit1 + qubit0)
686+
687+
688+
def test_simulator_constant_math_emulation():
689+
if "cpp_simulator" not in get_available_simulators():
690+
pytest.skip("No C++ simulator")
691+
return
692+
693+
results = [[[1, 1, 0, 0, 0]], [[0, 1, 0, 0, 0]], [[0, 1, 1, 1, 0]]]
694+
695+
import projectq.backends._sim._simulator as _sim
696+
from projectq.backends._sim._pysim import Simulator as PySim
697+
from projectq.backends._sim._cppsim import Simulator as CppSim
698+
from projectq.libs.math import (AddConstant, AddConstantModN,
699+
MultiplyByConstantModN)
700+
701+
def gate_filter(eng, cmd):
702+
g = cmd.gate
703+
if isinstance(g, BasicMathGate):
704+
return False
705+
return eng.next_engine.is_available(cmd)
706+
707+
def run_simulation(sim):
708+
eng = MainEngine(sim, [])
709+
quint = eng.allocate_qureg(5)
710+
AddConstant(3) | quint
711+
All(Measure) | quint
712+
eng.flush()
713+
results[0].append([int(qb) for qb in quint])
714+
715+
AddConstantModN(4, 5) | quint
716+
All(Measure) | quint
717+
eng.flush()
718+
results[1].append([int(qb) for qb in quint])
719+
720+
MultiplyByConstantModN(15, 16) | quint
721+
All(Measure) | quint
722+
eng.flush()
723+
results[2].append([int(qb) for qb in quint])
724+
725+
cppsim = Simulator(gate_fusion=False)
726+
cppsim._simulator = CppSim(1)
727+
run_simulation(cppsim)
728+
729+
_sim.FALLBACK_TO_PYSIM = True
730+
pysim = Simulator()
731+
pysim._simulator = PySim(1)
732+
# run_simulation(pysim)
733+
734+
for result in results:
735+
ref = result[0]
736+
for res in result[1:]:
737+
assert ref == res

projectq/ops/_gates.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,10 +157,9 @@ def __str__(self):
157157
SqrtX = SqrtXGate()
158158

159159

160-
class SwapGate(SelfInverseGate, BasicMathGate):
160+
class SwapGate(SelfInverseGate):
161161
""" Swap gate class (swaps 2 qubits) """
162162
def __init__(self):
163-
BasicMathGate.__init__(self, lambda x, y: (y, x))
164163
SelfInverseGate.__init__(self)
165164
self.interchangeable_qubit_indices = [[0, 1]]
166165

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ def build_extensions(self):
124124
opts.append('/arch:AVX')
125125
else:
126126
opts.append('-march=native')
127+
opts.append('-ffast-math')
127128

128129
opts.append(openmp)
129130
if ct == 'unix':

0 commit comments

Comments
 (0)