vm6502q
diff --git a/‎docs/projectq.ops.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/projectq.ops.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/projectq.setups.decompositions.rst‎
Lines changed: 8 additions & 0 deletions b/‎docs/projectq.setups.decompositions.rst‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎projectq/backends/_resource_test.py‎
Lines changed: 3 additions & 2 deletions b/‎projectq/backends/_resource_test.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎projectq/backends/_sim/_cppkernels/simulator.hpp‎
Lines changed: 114 additions & 40 deletions b/‎projectq/backends/_sim/_cppkernels/simulator.hpp‎
Lines changed: 114 additions & 40 deletions
diff --git a/‎projectq/backends/_sim/_cppsim.cpp‎
Lines changed: 3 additions & 0 deletions b/‎projectq/backends/_sim/_cppsim.cpp‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎projectq/backends/_sim/_simulator.py‎
Lines changed: 25 additions & 3 deletions b/‎projectq/backends/_sim/_simulator.py‎
Lines changed: 25 additions & 3 deletions
@@ -55,6 +55,7 @@ The operations collection consists of various default gates and is a work-in-pro
 	projectq.ops.StatePreparation
 	projectq.ops.QPE
 	projectq.ops.FlipBits
+	projectq.ops.QAA
 
 
 Module contents
 
@@ -27,6 +27,7 @@ The decomposition package is a collection of gate decomposition / replacement ru
     projectq.setups.decompositions.toffoli2cnotandtgate
     projectq.setups.decompositions.uniformlycontrolledr2cnot
     projectq.setups.decompositions.phaseestimation
+    projectq.setups.decompositions.amplitudeamplification
 
 
 Submodules
@@ -180,6 +181,13 @@ projectq.setups.decompositions.phaseestimation module
     :members:
     :undoc-members:
 
+projectq.setups.decompositions.amplitudeamplification module
+---------------------------------------------------------------
+
+.. automodule:: projectq.setups.decompositions.amplitudeamplification
+    :members:
+    :undoc-members:
+
 
 Module contents
 ---------------
 
@@ -20,7 +20,7 @@
 
 from projectq.cengines import DummyEngine, MainEngine, NotYetMeasuredError
 from projectq.meta import LogicalQubitIDTag
-from projectq.ops import All, Allocate, CNOT, Command, H, Measure, QFT, Rz, X
+from projectq.ops import All, Allocate, CNOT, Command, H, Measure, QFT, Rz, Rzz, X
 from projectq.types import WeakQubitRef
 
 from projectq.backends import ResourceCounter
@@ -74,14 +74,15 @@ def test_resource_counter():
     CNOT | (qubit1, qubit3)
     Rz(0.1) | qubit1
     Rz(0.3) | qubit1
+    Rzz(0.5) | qubit1
 
     All(Measure) | qubit1 + qubit3
 
     with pytest.raises(NotYetMeasuredError):
         int(qubit1)
 
     assert resource_counter.max_width == 2
-    assert resource_counter.depth_of_dag == 5
+    assert resource_counter.depth_of_dag == 6
 
     str_repr = str(resource_counter)
     assert str_repr.count(" HGate : 1") == 1
 
@@ -38,7 +38,7 @@ class Simulator{
 public:
     using calc_type = double;
     using complex_type = std::complex<calc_type>;
-    using StateVector = std::vector<complex_type, aligned_allocator<complex_type,64>>;
+    using StateVector = std::vector<complex_type, aligned_allocator<complex_type,512>>;
     using Map = std::map<unsigned, unsigned>;
     using RndEngine = std::mt19937;
     using Term = std::vector<std::pair<unsigned, char>>;
@@ -55,11 +55,18 @@ class Simulator{
     void allocate_qubit(unsigned id){
         if (map_.count(id) == 0){
             map_[id] = N_++;
-            auto newvec = StateVector(1UL << N_);
-            #pragma omp parallel for schedule(static)
+            StateVector newvec; // avoid large memory allocations
+            if( tmpBuff1_.capacity() >= (1UL << N_) )
+              std::swap(newvec, tmpBuff1_);
+            newvec.resize(1UL << N_);
+#pragma omp parallel for schedule(static)
             for (std::size_t i = 0; i < newvec.size(); ++i)
                 newvec[i] = (i < vec_.size())?vec_[i]:0.;
-            vec_ = std::move(newvec);
+            std::swap(vec_, newvec);
+            // recycle large memory
+            std::swap(tmpBuff1_, newvec);
+            if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
+              std::swap(tmpBuff1_, tmpBuff2_);
         }
         else
             throw(std::runtime_error(
@@ -113,12 +120,18 @@ class Simulator{
             }
         }
         else{
-            StateVector newvec((1UL << (N_-1)));
-            #pragma omp parallel for schedule(static)
+            StateVector newvec; // avoid costly memory reallocations
+            if( tmpBuff1_.capacity() >= (1UL << (N_-1)) )
+              std::swap(tmpBuff1_, newvec);
+            newvec.resize((1UL << (N_-1)));
+            #pragma omp parallel for schedule(static) if(0)
             for (std::size_t i = 0; i < vec_.size(); i += 2*delta)
                 std::copy_n(&vec_[i + static_cast<std::size_t>(value)*delta],
                             delta, &newvec[i/2]);
-            vec_ = std::move(newvec);
+            std::swap(vec_, newvec);
+            std::swap(tmpBuff1_, newvec);
+            if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
+              std::swap(tmpBuff1_, tmpBuff2_);
 
             for (auto& p : map_){
                 if (p.second > pos)
@@ -189,8 +202,8 @@ class Simulator{
     }
 
     template <class M>
-    void apply_controlled_gate(M const& m, std::vector<unsigned> ids,
-                               std::vector<unsigned> ctrl){
+    void apply_controlled_gate(M const& m, const std::vector<unsigned>& ids,
+                               const std::vector<unsigned>& ctrl){
         auto fused_gates = fused_gates_;
         fused_gates.insert(m, ids, ctrl);
 
@@ -209,46 +222,85 @@ class Simulator{
     }
 
     template <class F, class QuReg>
-    void emulate_math(F const& f, QuReg quregs, std::vector<unsigned> ctrl,
-                      unsigned num_threads=1){
+    void emulate_math(F const& f, QuReg quregs, const std::vector<unsigned>& ctrl,
+                      bool parallelize = false){
         run();
         auto ctrlmask = get_control_mask(ctrl);
 
         for (unsigned i = 0; i < quregs.size(); ++i)
             for (unsigned j = 0; j < quregs[i].size(); ++j)
                 quregs[i][j] = map_[quregs[i][j]];
 
-        StateVector newvec(vec_.size(), 0.);
-        std::vector<int> res(quregs.size());
-
-        #pragma omp parallel for schedule(static) firstprivate(res) num_threads(num_threads)
-        for (std::size_t i = 0; i < vec_.size(); ++i){
-            if ((ctrlmask&i) == ctrlmask){
-                for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
-                    res[qr_i] = 0;
-                    for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
-                        res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i;
-                }
-                f(res);
-                auto new_i = i;
-                for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
-                    for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
-                        if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
-                            new_i ^= (1UL << quregs[qr_i][qb_i]);
-                    }
-                }
-                newvec[new_i] += vec_[i];
-            }
-            else
-                newvec[i] += vec_[i];
+        StateVector newvec; // avoid costly memory reallocations
+        if( tmpBuff1_.capacity() >= vec_.size() )
+          std::swap(newvec, tmpBuff1_);
+        newvec.resize(vec_.size());
+#pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); i++)
+          newvec[i] = 0;
+
+//#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5
+        {
+          std::vector<int> res(quregs.size());
+          //#pragma omp for schedule(static)
+          for (std::size_t i = 0; i < vec_.size(); ++i){
+              if ((ctrlmask&i) == ctrlmask){
+                  for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
+                      res[qr_i] = 0;
+                      for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
+                          res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i;
+                  }
+                  f(res);
+                  auto new_i = i;
+                  for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
+                      for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
+                          if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
+                              new_i ^= (1UL << quregs[qr_i][qb_i]);
+                      }
+                  }
+                  newvec[new_i] += vec_[i];
+              }
+              else
+                  newvec[i] += vec_[i];
+          }
         }
-        vec_ = std::move(newvec);
+        std::swap(vec_, newvec);
+        std::swap(tmpBuff1_, newvec);
+    }
+
+    // faster version without calling python 
+    template<class QuReg>
+    inline void emulate_math_addConstant(int a, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+    {
+      emulate_math([a](std::vector<int> &res){for(auto& x: res) x = x + a;}, quregs, ctrl, true);
+    }
+
+    // faster version without calling python 
+    template<class QuReg>
+    inline void emulate_math_addConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+    {
+      emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x + a) % N;}, quregs, ctrl, true);
+    }
+
+    // faster version without calling python 
+    template<class QuReg>
+    inline void emulate_math_multiplyByConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+    {
+      emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x * a) % N;}, quregs, ctrl, true);
     }
 
     calc_type get_expectation_value(TermsDict const& td, std::vector<unsigned> const& ids){
         run();
         calc_type expectation = 0.;
-        auto current_state = vec_;
+
+        StateVector current_state; // avoid costly memory reallocations
+        if( tmpBuff1_.capacity() >= vec_.size() )
+          std::swap(tmpBuff1_, current_state);
+        current_state.resize(vec_.size());
+#pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); ++i)
+          current_state[i] = vec_[i];
+
         for (auto const& term : td){
             auto const& coefficient = term.second;
             apply_term(term.first, ids, {});
@@ -260,17 +312,29 @@ class Simulator{
                 auto const a2 = std::real(vec_[i]);
                 auto const b2 = std::imag(vec_[i]);
                 delta += a1 * a2 - b1 * b2;
+                // reset vec_
+                vec_[i] = current_state[i];
             }
             expectation += coefficient * delta;
-            vec_ = current_state;
         }
+        std::swap(current_state, tmpBuff1_);
         return expectation;
     }
 
     void apply_qubit_operator(ComplexTermsDict const& td, std::vector<unsigned> const& ids){
         run();
-        auto new_state = StateVector(vec_.size(), 0.);
-        auto current_state = vec_;
+        StateVector new_state, current_state; // avoid costly memory reallocations
+        if( tmpBuff1_.capacity() >= vec_.size() )
+          std::swap(tmpBuff1_, new_state);
+        if( tmpBuff2_.capacity() >= vec_.size() )
+          std::swap(tmpBuff2_, current_state);
+        new_state.resize(vec_.size());
+        current_state.resize(vec_.size());
+#pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); ++i){
+          new_state[i] = 0;
+          current_state[i] = vec_[i];
+        }
         for (auto const& term : td){
             auto const& coefficient = term.second;
             apply_term(term.first, ids, {});
@@ -280,7 +344,9 @@ class Simulator{
                 vec_[i] = current_state[i];
             }
         }
-        vec_ = std::move(new_state);
+        std::swap(vec_, new_state);
+        std::swap(tmpBuff1_, new_state);
+        std::swap(tmpBuff2_, current_state);
     }
 
     calc_type get_probability(std::vector<bool> const& bit_string,
@@ -452,6 +518,8 @@ class Simulator{
                 #pragma omp parallel
                 kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
                 break;
+            default:
+                throw std::invalid_argument("Gates with more than 5 qubits are not supported!");
         }
 
         fused_gates_ = Fusion();
@@ -500,6 +568,12 @@ class Simulator{
     unsigned fusion_qubits_min_, fusion_qubits_max_;
     RndEngine rnd_eng_;
     std::function<double()> rng_;
+
+    // large array buffers to avoid costly reallocations
+    static StateVector tmpBuff1_, tmpBuff2_;
 };
 
+Simulator::StateVector Simulator::tmpBuff1_;
+Simulator::StateVector Simulator::tmpBuff2_;
+
 #endif
@@ -50,6 +50,9 @@ PYBIND11_PLUGIN(_cppsim) {
         .def("measure_qubits", &Simulator::measure_qubits_return)
         .def("apply_controlled_gate", &Simulator::apply_controlled_gate<MatrixType>)
         .def("emulate_math", &emulate_math_wrapper<QuRegs>)
+        .def("emulate_math_addConstant", &Simulator::emulate_math_addConstant<QuRegs>)
+        .def("emulate_math_addConstantModN", &Simulator::emulate_math_addConstantModN<QuRegs>)
+        .def("emulate_math_multiplyByConstantModN", &Simulator::emulate_math_multiplyByConstantModN<QuRegs>)
         .def("get_expectation_value", &Simulator::get_expectation_value)
         .def("apply_qubit_operator", &Simulator::apply_qubit_operator)
         .def("emulate_time_evolution", &Simulator::emulate_time_evolution)
 
@@ -33,10 +33,12 @@
                           TimeEvolution)
 from projectq.types import WeakQubitRef
 
+FALLBACK_TO_PYSIM = False
 try:
     from ._cppsim import Simulator as SimulatorBackend
 except ImportError:
     from ._pysim import Simulator as SimulatorBackend
+    FALLBACK_TO_PYSIM = True
 
 
 class Simulator(BasicEngine):
@@ -384,14 +386,34 @@ def _handle(self, cmd):
             ID = cmd.qubits[0][0].id
             self._simulator.deallocate_qubit(ID)
         elif isinstance(cmd.gate, BasicMathGate):
+            # improve performance by using C++ code for some commomn gates
+            from projectq.libs.math import (AddConstant,
+                                            AddConstantModN,
+                                            MultiplyByConstantModN)
             qubitids = []
             for qr in cmd.qubits:
                 qubitids.append([])
                 for qb in qr:
                     qubitids[-1].append(qb.id)
-            math_fun = cmd.gate.get_math_function(cmd.qubits)
-            self._simulator.emulate_math(math_fun, qubitids,
-                                         [qb.id for qb in cmd.control_qubits])
+            if FALLBACK_TO_PYSIM:
+                math_fun = cmd.gate.get_math_function(cmd.qubits)
+                self._simulator.emulate_math(math_fun, qubitids,
+                                             [qb.id for qb in cmd.control_qubits])
+            else:
+                # individual code for different standard gates to make it faster!
+                if isinstance(cmd.gate, AddConstant):
+                    self._simulator.emulate_math_addConstant(cmd.gate.a, qubitids,
+                                                             [qb.id for qb in cmd.control_qubits])
+                elif isinstance(cmd.gate, AddConstantModN):
+                    self._simulator.emulate_math_addConstantModN(cmd.gate.a, cmd.gate.N, qubitids,
+                                                                 [qb.id for qb in cmd.control_qubits])
+                elif isinstance(cmd.gate, MultiplyByConstantModN):
+                    self._simulator.emulate_math_multiplyByConstantModN(cmd.gate.a, cmd.gate.N, qubitids,
+                                                                        [qb.id for qb in cmd.control_qubits])
+                else:
+                    math_fun = cmd.gate.get_math_function(cmd.qubits)
+                    self._simulator.emulate_math(math_fun, qubitids,
+                                                 [qb.id for qb in cmd.control_qubits])
         elif isinstance(cmd.gate, TimeEvolution):
             op = [(list(term), coeff) for (term, coeff)
                   in cmd.gate.hamiltonian.terms.items()]