C++ simulator performance improvements (#329)

melven · Takishima · commit f27286f520ff · 2019-08-20T10:12:09.000+02:00
* C++ simulator performance: make the swap-gate run in native C++. It was previously defined as a BasicMathGate, which made it run as Python code through the emulate_math_wrapper. The new variant just uses its matrix representation to run it in native code. * C++ simulator performance: add dedicated C++ code for common math gates. The BasicMathGate uses a C++/Python wrapper (emulate_math_wrapper) to allow generic calculations, which makes it very slow. This detects some math gates and provides a native C++ implementation for them. * C++ simulator performance: use larger memory alignment. * C++ simulator performance: recycle large StateVector memory buffers. This avoids costly std::vector copying/reallocations by using static std::vector buffers to reuse the allocated memory (just by std::swap'ing a vector into a buffer for later use when it would be deallocated otherwise). * C++ simulator performance: improve compiler flags. * Add test coverage for constant math emulation. * Revert "Add test coverage for constant math emulation". This reverts commit 3bb8a2c. * Add test coverage for constant math emulation.
diff --git a/projectq/backends/_sim/_cppkernels/simulator.hpp b/projectq/backends/_sim/_cppkernels/simulator.hpp
@@ -38,7 +38,7 @@ class Simulator{
 public:
     using calc_type = double;
     using complex_type = std::complex<calc_type>;
-    using StateVector = std::vector<complex_type, aligned_allocator<complex_type,64>>;
+    using StateVector = std::vector<complex_type, aligned_allocator<complex_type,512>>;
     using Map = std::map<unsigned, unsigned>;
     using RndEngine = std::mt19937;
     using Term = std::vector<std::pair<unsigned, char>>;
@@ -55,11 +55,18 @@ class Simulator{
     void allocate_qubit(unsigned id){
         if (map_.count(id) == 0){
             map_[id] = N_++;
-            auto newvec = StateVector(1UL << N_);
-            #pragma omp parallel for schedule(static)
+            StateVector newvec; // avoid large memory allocations
+            if( tmpBuff1_.capacity() >= (1UL << N_) )
+              std::swap(newvec, tmpBuff1_);
+            newvec.resize(1UL << N_);
+#pragma omp parallel for schedule(static)
             for (std::size_t i = 0; i < newvec.size(); ++i)
                 newvec[i] = (i < vec_.size())?vec_[i]:0.;
-            vec_ = std::move(newvec);
+            std::swap(vec_, newvec);
+            // recycle large memory
+            std::swap(tmpBuff1_, newvec);
+            if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
+              std::swap(tmpBuff1_, tmpBuff2_);
         }
         else
             throw(std::runtime_error(
@@ -113,12 +120,18 @@ class Simulator{
             }
         }
         else{
-            StateVector newvec((1UL << (N_-1)));
-            #pragma omp parallel for schedule(static)
+            StateVector newvec; // avoid costly memory reallocations
+            if( tmpBuff1_.capacity() >= (1UL << (N_-1)) )
+              std::swap(tmpBuff1_, newvec);
+            newvec.resize((1UL << (N_-1)));
+            #pragma omp parallel for schedule(static) if(0)
             for (std::size_t i = 0; i < vec_.size(); i += 2*delta)
                 std::copy_n(&vec_[i + static_cast<std::size_t>(value)*delta],
                             delta, &newvec[i/2]);
-            vec_ = std::move(newvec);
+            std::swap(vec_, newvec);
+            std::swap(tmpBuff1_, newvec);
+            if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
+              std::swap(tmpBuff1_, tmpBuff2_);
 
             for (auto& p : map_){
                 if (p.second > pos)
@@ -189,8 +202,8 @@ class Simulator{
     }
 
     template <class M>
-    void apply_controlled_gate(M const& m, std::vector<unsigned> ids,
-                               std::vector<unsigned> ctrl){
+    void apply_controlled_gate(M const& m, const std::vector<unsigned>& ids,
+                               const std::vector<unsigned>& ctrl){
         auto fused_gates = fused_gates_;
         fused_gates.insert(m, ids, ctrl);
 
@@ -209,46 +222,85 @@ class Simulator{
     }
 
     template <class F, class QuReg>
-    void emulate_math(F const& f, QuReg quregs, std::vector<unsigned> ctrl,
-                      unsigned num_threads=1){
+    void emulate_math(F const& f, QuReg quregs, const std::vector<unsigned>& ctrl,
+                      bool parallelize = false){
         run();
         auto ctrlmask = get_control_mask(ctrl);
 
         for (unsigned i = 0; i < quregs.size(); ++i)
             for (unsigned j = 0; j < quregs[i].size(); ++j)
                 quregs[i][j] = map_[quregs[i][j]];
 
-        StateVector newvec(vec_.size(), 0.);
-        std::vector<int> res(quregs.size());
-
-        #pragma omp parallel for schedule(static) firstprivate(res) num_threads(num_threads)
-        for (std::size_t i = 0; i < vec_.size(); ++i){
-            if ((ctrlmask&i) == ctrlmask){
-                for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
-                    res[qr_i] = 0;
-                    for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
-                        res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i;
-                }
-                f(res);
-                auto new_i = i;
-                for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
-                    for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
-                        if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
-                            new_i ^= (1UL << quregs[qr_i][qb_i]);
-                    }
-                }
-                newvec[new_i] += vec_[i];
-            }
-            else
-                newvec[i] += vec_[i];
+        StateVector newvec; // avoid costly memory reallocations
+        if( tmpBuff1_.capacity() >= vec_.size() )
+          std::swap(newvec, tmpBuff1_);
+        newvec.resize(vec_.size());
+#pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); i++)
+          newvec[i] = 0;
+
+//#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5
+        {
+          std::vector<int> res(quregs.size());
+          //#pragma omp for schedule(static)
+          for (std::size_t i = 0; i < vec_.size(); ++i){
+              if ((ctrlmask&i) == ctrlmask){
+                  for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
+                      res[qr_i] = 0;
+                      for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
+                          res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i;
+                  }
+                  f(res);
+                  auto new_i = i;
+                  for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
+                      for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
+                          if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
+                              new_i ^= (1UL << quregs[qr_i][qb_i]);
+                      }
+                  }
+                  newvec[new_i] += vec_[i];
+              }
+              else
+                  newvec[i] += vec_[i];
+          }
         }
-        vec_ = std::move(newvec);
+        std::swap(vec_, newvec);
+        std::swap(tmpBuff1_, newvec);
+    }
+
+    // faster version without calling python 
+    template<class QuReg>
+    inline void emulate_math_addConstant(int a, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+    {
+      emulate_math([a](std::vector<int> &res){for(auto& x: res) x = x + a;}, quregs, ctrl, true);
+    }
+
+    // faster version without calling python 
+    template<class QuReg>
+    inline void emulate_math_addConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+    {
+      emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x + a) % N;}, quregs, ctrl, true);
+    }
+
+    // faster version without calling python 
+    template<class QuReg>
+    inline void emulate_math_multiplyByConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+    {
+      emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x * a) % N;}, quregs, ctrl, true);
     }
 
     calc_type get_expectation_value(TermsDict const& td, std::vector<unsigned> const& ids){
         run();
         calc_type expectation = 0.;
-        auto current_state = vec_;
+
+        StateVector current_state; // avoid costly memory reallocations
+        if( tmpBuff1_.capacity() >= vec_.size() )
+          std::swap(tmpBuff1_, current_state);
+        current_state.resize(vec_.size());
+#pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); ++i)
+          current_state[i] = vec_[i];
+
         for (auto const& term : td){
             auto const& coefficient = term.second;
             apply_term(term.first, ids, {});
@@ -260,17 +312,29 @@ class Simulator{
                 auto const a2 = std::real(vec_[i]);
                 auto const b2 = std::imag(vec_[i]);
                 delta += a1 * a2 - b1 * b2;
+                // reset vec_
+                vec_[i] = current_state[i];
             }
             expectation += coefficient * delta;
-            vec_ = current_state;
         }
+        std::swap(current_state, tmpBuff1_);
         return expectation;
     }
 
     void apply_qubit_operator(ComplexTermsDict const& td, std::vector<unsigned> const& ids){
         run();
-        auto new_state = StateVector(vec_.size(), 0.);
-        auto current_state = vec_;
+        StateVector new_state, current_state; // avoid costly memory reallocations
+        if( tmpBuff1_.capacity() >= vec_.size() )
+          std::swap(tmpBuff1_, new_state);
+        if( tmpBuff2_.capacity() >= vec_.size() )
+          std::swap(tmpBuff2_, current_state);
+        new_state.resize(vec_.size());
+        current_state.resize(vec_.size());
+#pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); ++i){
+          new_state[i] = 0;
+          current_state[i] = vec_[i];
+        }
         for (auto const& term : td){
             auto const& coefficient = term.second;
             apply_term(term.first, ids, {});
@@ -280,7 +344,9 @@ class Simulator{
                 vec_[i] = current_state[i];
             }
         }
-        vec_ = std::move(new_state);
+        std::swap(vec_, new_state);
+        std::swap(tmpBuff1_, new_state);
+        std::swap(tmpBuff2_, current_state);
     }
 
     calc_type get_probability(std::vector<bool> const& bit_string,
@@ -452,6 +518,8 @@ class Simulator{
                 #pragma omp parallel
                 kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
                 break;
+            default:
+                throw std::invalid_argument("Gates with more than 5 qubits are not supported!");
         }
 
         fused_gates_ = Fusion();
@@ -500,6 +568,12 @@ class Simulator{
     unsigned fusion_qubits_min_, fusion_qubits_max_;
     RndEngine rnd_eng_;
     std::function<double()> rng_;
+
+    // large array buffers to avoid costly reallocations
+    static StateVector tmpBuff1_, tmpBuff2_;
 };
 
+Simulator::StateVector Simulator::tmpBuff1_;
+Simulator::StateVector Simulator::tmpBuff2_;
+
 #endif
diff --git a/projectq/backends/_sim/_cppsim.cpp b/projectq/backends/_sim/_cppsim.cpp
@@ -50,6 +50,9 @@ PYBIND11_PLUGIN(_cppsim) {
         .def("measure_qubits", &Simulator::measure_qubits_return)
         .def("apply_controlled_gate", &Simulator::apply_controlled_gate<MatrixType>)
         .def("emulate_math", &emulate_math_wrapper<QuRegs>)
+        .def("emulate_math_addConstant", &Simulator::emulate_math_addConstant<QuRegs>)
+        .def("emulate_math_addConstantModN", &Simulator::emulate_math_addConstantModN<QuRegs>)
+        .def("emulate_math_multiplyByConstantModN", &Simulator::emulate_math_multiplyByConstantModN<QuRegs>)
         .def("get_expectation_value", &Simulator::get_expectation_value)
         .def("apply_qubit_operator", &Simulator::apply_qubit_operator)
         .def("emulate_time_evolution", &Simulator::emulate_time_evolution)
diff --git a/projectq/backends/_sim/_simulator.py b/projectq/backends/_sim/_simulator.py
@@ -33,10 +33,12 @@
                           TimeEvolution)
 from projectq.types import WeakQubitRef
 
+FALLBACK_TO_PYSIM = False
 try:
     from ._cppsim import Simulator as SimulatorBackend
 except ImportError:
     from ._pysim import Simulator as SimulatorBackend
+    FALLBACK_TO_PYSIM = True
 
 
 class Simulator(BasicEngine):
@@ -384,14 +386,34 @@ def _handle(self, cmd):
             ID = cmd.qubits[0][0].id
             self._simulator.deallocate_qubit(ID)
         elif isinstance(cmd.gate, BasicMathGate):
+            # improve performance by using C++ code for some common gates
+            from projectq.libs.math import (AddConstant,
+                                            AddConstantModN,
+                                            MultiplyByConstantModN)
             qubitids = []
             for qr in cmd.qubits:
                 qubitids.append([])
                 for qb in qr:
                     qubitids[-1].append(qb.id)
-            math_fun = cmd.gate.get_math_function(cmd.qubits)
-            self._simulator.emulate_math(math_fun, qubitids,
-                                         [qb.id for qb in cmd.control_qubits])
+            if FALLBACK_TO_PYSIM:
+                math_fun = cmd.gate.get_math_function(cmd.qubits)
+                self._simulator.emulate_math(math_fun, qubitids,
+                                             [qb.id for qb in cmd.control_qubits])
+            else:
+                # individual code for different standard gates to make it faster!
+                if isinstance(cmd.gate, AddConstant):
+                    self._simulator.emulate_math_addConstant(cmd.gate.a, qubitids,
+                                                             [qb.id for qb in cmd.control_qubits])
+                elif isinstance(cmd.gate, AddConstantModN):
+                    self._simulator.emulate_math_addConstantModN(cmd.gate.a, cmd.gate.N, qubitids,
+                                                                 [qb.id for qb in cmd.control_qubits])
+                elif isinstance(cmd.gate, MultiplyByConstantModN):
+                    self._simulator.emulate_math_multiplyByConstantModN(cmd.gate.a, cmd.gate.N, qubitids,
+                                                                        [qb.id for qb in cmd.control_qubits])
+                else:
+                    math_fun = cmd.gate.get_math_function(cmd.qubits)
+                    self._simulator.emulate_math(math_fun, qubitids,
+                                                 [qb.id for qb in cmd.control_qubits])
         elif isinstance(cmd.gate, TimeEvolution):
             op = [(list(term), coeff) for (term, coeff)
                   in cmd.gate.hamiltonian.terms.items()]
diff --git a/projectq/backends/_sim/_simulator_test.py b/projectq/backends/_sim/_simulator_test.py
@@ -683,3 +683,55 @@ def receive(command_list):
                               qubit1[0].id: qubit0[0].id}
     assert (sim._convert_logical_to_mapped_qureg(qubit0 + qubit1) ==
             qubit1 + qubit0)
+
+
+def test_simulator_constant_math_emulation():
+    if "cpp_simulator" not in get_available_simulators():
+        pytest.skip("No C++ simulator")
+        return
+
+    results = [[[1, 1, 0, 0, 0]], [[0, 1, 0, 0, 0]], [[0, 1, 1, 1, 0]]]
+
+    import projectq.backends._sim._simulator as _sim
+    from projectq.backends._sim._pysim import Simulator as PySim
+    from projectq.backends._sim._cppsim import Simulator as CppSim
+    from projectq.libs.math import (AddConstant, AddConstantModN,
+                                    MultiplyByConstantModN)
+
+    def gate_filter(eng, cmd):
+        g = cmd.gate
+        if isinstance(g, BasicMathGate):
+            return False
+        return eng.next_engine.is_available(cmd)
+
+    def run_simulation(sim):
+        eng = MainEngine(sim, [])
+        quint = eng.allocate_qureg(5)
+        AddConstant(3) | quint
+        All(Measure) | quint
+        eng.flush()
+        results[0].append([int(qb) for qb in quint])
+
+        AddConstantModN(4, 5) | quint
+        All(Measure) | quint
+        eng.flush()
+        results[1].append([int(qb) for qb in quint])
+
+        MultiplyByConstantModN(15, 16) | quint
+        All(Measure) | quint
+        eng.flush()
+        results[2].append([int(qb) for qb in quint])
+
+    cppsim = Simulator(gate_fusion=False)
+    cppsim._simulator = CppSim(1)
+    run_simulation(cppsim)
+
+    _sim.FALLBACK_TO_PYSIM = True
+    pysim = Simulator()
+    pysim._simulator = PySim(1)
+    # run_simulation(pysim)
+
+    for result in results:
+        ref = result[0]
+        for res in result[1:]:
+            assert ref == res
diff --git a/projectq/ops/_gates.py b/projectq/ops/_gates.py
@@ -157,10 +157,9 @@ def __str__(self):
 SqrtX = SqrtXGate()
 
 
-class SwapGate(SelfInverseGate, BasicMathGate):
+class SwapGate(SelfInverseGate):
     """ Swap gate class (swaps 2 qubits) """
     def __init__(self):
-        BasicMathGate.__init__(self, lambda x, y: (y, x))
         SelfInverseGate.__init__(self)
         self.interchangeable_qubit_indices = [[0, 1]]
 
diff --git a/setup.py b/setup.py
@@ -124,6 +124,7 @@ def build_extensions(self):
                 opts.append('/arch:AVX')
             else:
                 opts.append('-march=native')
+                opts.append('-ffast-math')
 
         opts.append(openmp)
         if ct == 'unix':