@@ -38,7 +38,7 @@ class Simulator{
3838public:
3939 using calc_type = double ;
4040 using complex_type = std::complex <calc_type>;
41- using StateVector = std::vector<complex_type, aligned_allocator<complex_type,64 >>;
41+ using StateVector = std::vector<complex_type, aligned_allocator<complex_type,512 >>;
4242 using Map = std::map<unsigned , unsigned >;
4343 using RndEngine = std::mt19937;
4444 using Term = std::vector<std::pair<unsigned , char >>;
@@ -55,11 +55,18 @@ class Simulator{
5555 void allocate_qubit (unsigned id){
5656 if (map_.count (id) == 0 ){
5757 map_[id] = N_++;
58- auto newvec = StateVector (1UL << N_);
59- #pragma omp parallel for schedule(static)
58+ StateVector newvec; // avoid large memory allocations
59+ if ( tmpBuff1_.capacity () >= (1UL << N_) )
60+ std::swap (newvec, tmpBuff1_);
61+ newvec.resize (1UL << N_);
62+ #pragma omp parallel for schedule(static)
6063 for (std::size_t i = 0 ; i < newvec.size (); ++i)
6164 newvec[i] = (i < vec_.size ())?vec_[i]:0 .;
62- vec_ = std::move (newvec);
65+ std::swap (vec_, newvec);
66+ // recycle large memory
67+ std::swap (tmpBuff1_, newvec);
68+ if ( tmpBuff1_.capacity () < tmpBuff2_.capacity () )
69+ std::swap (tmpBuff1_, tmpBuff2_);
6370 }
6471 else
6572 throw (std::runtime_error (
@@ -113,12 +120,18 @@ class Simulator{
113120 }
114121 }
115122 else {
116- StateVector newvec ((1UL << (N_-1 )));
117- #pragma omp parallel for schedule(static)
123+ StateVector newvec; // avoid costly memory reallocations
124+ if ( tmpBuff1_.capacity () >= (1UL << (N_-1 )) )
125+ std::swap (tmpBuff1_, newvec);
126+ newvec.resize ((1UL << (N_-1 )));
127+ #pragma omp parallel for schedule(static) if(0)
118128 for (std::size_t i = 0 ; i < vec_.size (); i += 2 *delta)
119129 std::copy_n (&vec_[i + static_cast <std::size_t >(value)*delta],
120130 delta, &newvec[i/2 ]);
121- vec_ = std::move (newvec);
131+ std::swap (vec_, newvec);
132+ std::swap (tmpBuff1_, newvec);
133+ if ( tmpBuff1_.capacity () < tmpBuff2_.capacity () )
134+ std::swap (tmpBuff1_, tmpBuff2_);
122135
123136 for (auto & p : map_){
124137 if (p.second > pos)
@@ -189,8 +202,8 @@ class Simulator{
189202 }
190203
191204 template <class M >
192- void apply_controlled_gate (M const & m, std::vector<unsigned > ids,
193- std::vector<unsigned > ctrl){
205+ void apply_controlled_gate (M const & m, const std::vector<unsigned >& ids,
206+ const std::vector<unsigned >& ctrl){
194207 auto fused_gates = fused_gates_;
195208 fused_gates.insert (m, ids, ctrl);
196209
@@ -209,46 +222,85 @@ class Simulator{
209222 }
210223
211224 template <class F , class QuReg >
212- void emulate_math (F const & f, QuReg quregs, std::vector<unsigned > ctrl,
213- unsigned num_threads= 1 ){
225+ void emulate_math (F const & f, QuReg quregs, const std::vector<unsigned >& ctrl,
226+ bool parallelize = false ){
214227 run ();
215228 auto ctrlmask = get_control_mask (ctrl);
216229
217230 for (unsigned i = 0 ; i < quregs.size (); ++i)
218231 for (unsigned j = 0 ; j < quregs[i].size (); ++j)
219232 quregs[i][j] = map_[quregs[i][j]];
220233
221- StateVector newvec (vec_.size (), 0 .);
222- std::vector<int > res (quregs.size ());
223-
224- #pragma omp parallel for schedule(static) firstprivate(res) num_threads(num_threads)
225- for (std::size_t i = 0 ; i < vec_.size (); ++i){
226- if ((ctrlmask&i) == ctrlmask){
227- for (unsigned qr_i = 0 ; qr_i < quregs.size (); ++qr_i){
228- res[qr_i] = 0 ;
229- for (unsigned qb_i = 0 ; qb_i < quregs[qr_i].size (); ++qb_i)
230- res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1 ) << qb_i;
231- }
232- f (res);
233- auto new_i = i;
234- for (unsigned qr_i = 0 ; qr_i < quregs.size (); ++qr_i){
235- for (unsigned qb_i = 0 ; qb_i < quregs[qr_i].size (); ++qb_i){
236- if (!(((new_i >> quregs[qr_i][qb_i])&1 ) == ((res[qr_i] >> qb_i)&1 )))
237- new_i ^= (1UL << quregs[qr_i][qb_i]);
238- }
239- }
240- newvec[new_i] += vec_[i];
241- }
242- else
243- newvec[i] += vec_[i];
234+ StateVector newvec; // avoid costly memory reallocations
235+ if ( tmpBuff1_.capacity () >= vec_.size () )
236+ std::swap (newvec, tmpBuff1_);
237+ newvec.resize (vec_.size ());
238+ #pragma omp parallel for schedule(static)
239+ for (std::size_t i = 0 ; i < vec_.size (); i++)
240+ newvec[i] = 0 ;
241+
242+ // #pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5
243+ {
244+ std::vector<int > res (quregs.size ());
245+ // #pragma omp for schedule(static)
246+ for (std::size_t i = 0 ; i < vec_.size (); ++i){
247+ if ((ctrlmask&i) == ctrlmask){
248+ for (unsigned qr_i = 0 ; qr_i < quregs.size (); ++qr_i){
249+ res[qr_i] = 0 ;
250+ for (unsigned qb_i = 0 ; qb_i < quregs[qr_i].size (); ++qb_i)
251+ res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1 ) << qb_i;
252+ }
253+ f (res);
254+ auto new_i = i;
255+ for (unsigned qr_i = 0 ; qr_i < quregs.size (); ++qr_i){
256+ for (unsigned qb_i = 0 ; qb_i < quregs[qr_i].size (); ++qb_i){
257+ if (!(((new_i >> quregs[qr_i][qb_i])&1 ) == ((res[qr_i] >> qb_i)&1 )))
258+ new_i ^= (1UL << quregs[qr_i][qb_i]);
259+ }
260+ }
261+ newvec[new_i] += vec_[i];
262+ }
263+ else
264+ newvec[i] += vec_[i];
265+ }
244266 }
245- vec_ = std::move (newvec);
267+ std::swap (vec_, newvec);
268+ std::swap (tmpBuff1_, newvec);
269+ }
270+
271+ // faster version without calling python
272+ template <class QuReg >
273+ inline void emulate_math_addConstant (int a, const QuReg& quregs, const std::vector<unsigned >& ctrl)
274+ {
275+ emulate_math ([a](std::vector<int > &res){for (auto & x: res) x = x + a;}, quregs, ctrl, true );
276+ }
277+
278+ // faster version without calling python
279+ template <class QuReg >
280+ inline void emulate_math_addConstantModN (int a, int N, const QuReg& quregs, const std::vector<unsigned >& ctrl)
281+ {
282+ emulate_math ([a,N](std::vector<int > &res){for (auto & x: res) x = (x + a) % N;}, quregs, ctrl, true );
283+ }
284+
285+ // faster version without calling python
286+ template <class QuReg >
287+ inline void emulate_math_multiplyByConstantModN (int a, int N, const QuReg& quregs, const std::vector<unsigned >& ctrl)
288+ {
289+ emulate_math ([a,N](std::vector<int > &res){for (auto & x: res) x = (x * a) % N;}, quregs, ctrl, true );
246290 }
247291
248292 calc_type get_expectation_value (TermsDict const & td, std::vector<unsigned > const & ids){
249293 run ();
250294 calc_type expectation = 0 .;
251- auto current_state = vec_;
295+
296+ StateVector current_state; // avoid costly memory reallocations
297+ if ( tmpBuff1_.capacity () >= vec_.size () )
298+ std::swap (tmpBuff1_, current_state);
299+ current_state.resize (vec_.size ());
300+ #pragma omp parallel for schedule(static)
301+ for (std::size_t i = 0 ; i < vec_.size (); ++i)
302+ current_state[i] = vec_[i];
303+
252304 for (auto const & term : td){
253305 auto const & coefficient = term.second ;
254306 apply_term (term.first , ids, {});
@@ -260,17 +312,29 @@ class Simulator{
260312 auto const a2 = std::real (vec_[i]);
261313 auto const b2 = std::imag (vec_[i]);
262314 delta += a1 * a2 - b1 * b2;
315+ // reset vec_
316+ vec_[i] = current_state[i];
263317 }
264318 expectation += coefficient * delta;
265- vec_ = current_state;
266319 }
320+ std::swap (current_state, tmpBuff1_);
267321 return expectation;
268322 }
269323
270324 void apply_qubit_operator (ComplexTermsDict const & td, std::vector<unsigned > const & ids){
271325 run ();
272- auto new_state = StateVector (vec_.size (), 0 .);
273- auto current_state = vec_;
326+ StateVector new_state, current_state; // avoid costly memory reallocations
327+ if ( tmpBuff1_.capacity () >= vec_.size () )
328+ std::swap (tmpBuff1_, new_state);
329+ if ( tmpBuff2_.capacity () >= vec_.size () )
330+ std::swap (tmpBuff2_, current_state);
331+ new_state.resize (vec_.size ());
332+ current_state.resize (vec_.size ());
333+ #pragma omp parallel for schedule(static)
334+ for (std::size_t i = 0 ; i < vec_.size (); ++i){
335+ new_state[i] = 0 ;
336+ current_state[i] = vec_[i];
337+ }
274338 for (auto const & term : td){
275339 auto const & coefficient = term.second ;
276340 apply_term (term.first , ids, {});
@@ -280,7 +344,9 @@ class Simulator{
280344 vec_[i] = current_state[i];
281345 }
282346 }
283- vec_ = std::move (new_state);
347+ std::swap (vec_, new_state);
348+ std::swap (tmpBuff1_, new_state);
349+ std::swap (tmpBuff2_, current_state);
284350 }
285351
286352 calc_type get_probability (std::vector<bool > const & bit_string,
@@ -452,6 +518,8 @@ class Simulator{
452518 #pragma omp parallel
453519 kernel (vec_, ids[4 ], ids[3 ], ids[2 ], ids[1 ], ids[0 ], m, ctrlmask);
454520 break ;
521+ default :
522+ throw std::invalid_argument (" Gates with more than 5 qubits are not supported!" );
455523 }
456524
457525 fused_gates_ = Fusion ();
@@ -500,6 +568,12 @@ class Simulator{
500568 unsigned fusion_qubits_min_, fusion_qubits_max_;
501569 RndEngine rnd_eng_;
502570 std::function<double ()> rng_;
571+
572+ // large array buffers to avoid costly reallocations
573+ static StateVector tmpBuff1_, tmpBuff2_;
503574};
504575
576+ Simulator::StateVector Simulator::tmpBuff1_;
577+ Simulator::StateVector Simulator::tmpBuff2_;
578+
505579#endif
0 commit comments