Qiskit · hhorii · Mar 1, 2022 · Dec 13, 2021 · Dec 13, 2021 · Dec 13, 2021
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -257,6 +257,15 @@ if(AER_THRUST_SUPPORTED)
 
 		set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA)
 		set(THRUST_DEPENDENT_LIBS "")
+		if(CUSTATEVEC_ROOT)
+			set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC)
+			set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -I${CUSTATEVEC_ROOT}/include")
+            if(CUSTATEVEC_STATIC)
+				set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec_static -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lcublas")
+			else()
+				set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec")
+			endif()
+		endif()
 	elseif(AER_THRUST_BACKEND STREQUAL "TBB")
 		message(STATUS "TBB Support found!")
 		set(THRUST_DEPENDENT_LIBS AER_DEPENDENCY_PKG::tbb)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -643,6 +643,34 @@ Few notes on GPU builds:
 3. We don't need NVIDIA® drivers for building, but we need them for running simulations
 4. Only Linux platforms are supported
 
+Qiskit Aer now supports cuQuantum optimized Quantum computing APIs from NVIDIA®.
+cuStateVec APIs can be exploited to accelerate statevector, density_matrix and unitary methods.
+Because cuQuantum is beta version currently, some of the operations are not accelerated by cuStateVec.
+
+To build Qiskit Aer with cuStateVec support, please set the path to cuQuantum root directory to CUSTATEVEC_ROOT as following.
+
+For example,
+
+    qiskit-aer$ python ./setup.py bdist_wheel -- -DAER_THRUST_BACKEND=CUDA -DCUSTATEVEC_ROOT=path_to_cuQuantum
+
+if you want to link cuQuantum library statically, set `CUSTATEVEC_STATIC` to setup.py. 
+Otherwise you also have to set environmental variable LD_LIBRARY_PATH to indicate path to the cuQuantum libraries.
+
+To run with cuStateVec, set `device='GPU'` to AerSimulator option and set `cuStateVec_enable=True` to option in execute method.
+
+```
+sim = AerSimulator(method='statevector', device='GPU')
+results = execute(circuit,sim,cuStateVec_enable=True).result()
+```
+
+Also you can accelrate density matrix and unitary matrix simulations as well.
+```
+sim = AerSimulator(method='density_matrix', device='GPU')
+results = execute(circuit,sim,cuStateVec_enable=True).result()
+```
+
+
+
 ### Building with MPI support
 
 Qiskit Aer can parallelize its simulation on the cluster systems by using MPI. 

diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py
@@ -148,6 +148,10 @@ class AerSimulator(AerBackend):
     initialization or with :meth:`set_options`. The list of supported devices
     for the current system can be returned using :meth:`available_devices`.
 
+    If AerSimulator is built with cuStateVec support, cuStateVec APIs are enabled
+    by setting ``cuStateVec_enable=True``. This is experimental implementation
+    based on cuQuantum Beta 2.
+
     **Additional Backend Options**
 
     The following simulator specific backend options are supported
@@ -216,6 +220,11 @@ class AerSimulator(AerBackend):
       values (16 Bytes). If set to 0, the maximum will be automatically
       set to the system memory size (Default: 0).
 
+    * ``cuStateVec_enable`` (bool): This option enables accelerating by
+      cuStateVec library of cuQuantum from NVIDIA, that has highly optimized
+      kernels for GPUs (Default: False). This option will be ignored
+      if AerSimulator is not built with cuStateVec support.
+
     * ``blocking_enable`` (bool): This option enables parallelization with
       multiple GPUs or multiple processes with MPI (CPU/GPU). This option
       is only available for ``"statevector"``, ``"density_matrix"`` and
@@ -514,6 +523,8 @@ def _default_options(cls):
             memory=None,
             noise_model=None,
             seed_simulator=None,
+            # cuStateVec (cuQuantum) option
+            cuStateVec_enable=False,
             # cache blocking for multi-GPUs/MPI options
             blocking_qubits=None,
             blocking_enable=False,

diff --git a/qiskit/providers/aer/backends/qasm_simulator.py b/qiskit/providers/aer/backends/qasm_simulator.py
@@ -339,9 +339,9 @@ class QasmSimulator(AerBackend):
     }
 
     _SIMULATION_METHODS = [
-        'automatic', 'statevector', 'statevector_gpu',
+        'automatic', 'statevector', 'statevector_gpu', 'statevector_custatevec',
         'statevector_thrust', 'density_matrix',
-        'density_matrix_gpu', 'density_matrix_thrust',
+        'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust',
         'stabilizer', 'matrix_product_state', 'extended_stabilizer'
     ]
 
@@ -595,7 +595,8 @@ def _basis_gates(self):
     def _method_basis_gates(self):
         """Return method basis gates and custom instructions"""
         method = self._options.get('method', None)
-        if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']:
+        if method in ['density_matrix', 'density_matrix_gpu',
+                      'density_matrix_custatevec', 'density_matrix_thrust']:
             return sorted([
                 'u1', 'u2', 'u3', 'u', 'p', 'r', 'rx', 'ry', 'rz', 'id', 'x',
                 'y', 'z', 'h', 's', 'sdg', 'sx', 'sxdg', 't', 'tdg', 'swap', 'cx',
@@ -628,15 +629,17 @@ def _custom_instructions(self):
             return self._options_configuration['custom_instructions']
 
         method = self._options.get('method', None)
-        if method in ['statevector', 'statevector_gpu', 'statevector_thrust']:
+        if method in ['statevector', 'statevector_gpu',
+                      'statevector_custatevec', 'statevector_thrust']:
             return sorted([
                 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'snapshot', 'save_expval',
                 'save_expval_var', 'save_probabilities', 'save_probabilities_dict',
                 'save_amplitudes', 'save_amplitudes_sq', 'save_state',
                 'save_density_matrix', 'save_statevector', 'save_statevector_dict',
                 'set_statevector'
             ])
-        if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']:
+        if method in ['density_matrix', 'density_matrix_gpu',
+                      'density_matrix_custatevec', 'density_matrix_thrust']:
             return sorted([
                 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'superop', 'snapshot',
                 'save_expval', 'save_expval_var', 'save_probabilities', 'save_probabilities_dict',
@@ -666,10 +669,12 @@ def _custom_instructions(self):
     def _set_method_config(self, method=None):
         """Set non-basis gate options when setting method"""
         # Update configuration description and number of qubits
-        if method in ['statevector', 'statevector_gpu', 'statevector_thrust']:
+        if method in ['statevector', 'statevector_gpu',
+                      'statevector_custatevec', 'statevector_thrust']:
             description = 'A C++ statevector simulator with noise'
             n_qubits = MAX_QUBITS_STATEVECTOR
-        elif method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']:
+        elif method in ['density_matrix', 'density_matrix_gpu',
+                        'density_matrix_custatevec', 'density_matrix_thrust']:
             description = 'A C++ density matrix simulator with noise'
             n_qubits = MAX_QUBITS_STATEVECTOR // 2
         elif method == 'matrix_product_state':

diff --git a/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml b/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml
@@ -0,0 +1,13 @@
+---
+features:
+  - |
+    Added support for cuQuantum, NVIDIA's APIs for quantum computing,
+    to accelerate statevector, density matrix and unitary simulators
+    by using GPUs.
+    This is experiemental implementation for cuQuantum Beta 2. (0.1.0)
+    cuStateVec APIs are enabled to accelerate instead of Aer's implementations
+    by building Aer by setting path of cuQuantum to ``CUSTATEVEC_ROOT``.
+    (binary distribution is not available currently.)
+    cuStateVector is enabled by setting ``device='GPU'`` and 
+    ``cuStateVec_threshold`` options. cuStateVec is enabled when number of
+    qubits of input circuit is equal or greater than ``cuStateVec_threshold``.
diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp
@@ -377,6 +377,8 @@ class Controller {
   int_t batched_shots_gpu_max_qubits_ = 16;   //multi-shot parallelization is applied if qubits is less than max qubits
   bool enable_batch_multi_shots_ = false;   //multi-shot parallelization can be applied
 
+  //settings for cuStateVec
+  bool cuStateVec_enable_ = false;
 };
 
 //=========================================================================
@@ -466,6 +468,12 @@ void Controller::set_config(const json_t &config) {
     JSON::get_value(batched_shots_gpu_max_qubits_, "batched_shots_gpu_max_qubits", config);
   }
 
+  //cuStateVec configs
+  cuStateVec_enable_ = false;
+  if(JSON::check_key("cuStateVec_enable", config)) {
+    JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config);
+  }
+
   // Override automatic simulation method with a fixed method
   std::string method;
   if (JSON::get_value(method, "method", config)) {
@@ -489,6 +497,9 @@ void Controller::set_config(const json_t &config) {
     }
   }
 
+  if(method_ == Method::density_matrix || method_ == Method::unitary)
+    batched_shots_gpu_max_qubits_ /= 2;
+
   // Override automatic simulation method with a fixed method
   if (JSON::get_value(sim_device_name_, "device", config)) {
     if (sim_device_name_ == "CPU") {
@@ -502,18 +513,37 @@ void Controller::set_config(const json_t &config) {
 #endif
     } else if (sim_device_name_ == "GPU") {
 #ifndef AER_THRUST_CUDA
-        throw std::runtime_error(
-            "Simulation device \"GPU\" is not supported on this system");
+      throw std::runtime_error(
+          "Simulation device \"GPU\" is not supported on this system");
 #else
-        int nDev;
-        if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
-            cudaGetLastError();
-            throw std::runtime_error("No CUDA device available!");
-        }
 
-        sim_device_ = Device::GPU;
+#ifndef AER_CUSTATEVEC
+      if(cuStateVec_enable_){
+        //Aer is not built for cuStateVec
+        throw std::runtime_error(
+            "Simulation device \"GPU\" does not supported cuStateVec on this system");
+      }
 #endif
+      int nDev;
+      if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
+          cudaGetLastError();
+          throw std::runtime_error("No CUDA device available!");
+      }
+      sim_device_ = Device::GPU;
+
+#ifdef AER_CUSTATEVEC
+      if(cuStateVec_enable_){
+        //initialize custatevevtor handle once before actual calculation (takes long time at first call)
+        custatevecStatus_t err;
+        custatevecHandle_t stHandle;
+        err = custatevecCreate(&stHandle);
+        if(err == CUSTATEVEC_STATUS_SUCCESS){
+          custatevecDestroy(stHandle);
+        }
       }
+#endif
+#endif
+    }
     else {
       throw std::runtime_error(std::string("Invalid simulation device (\"") +
                                sim_device_name_ + std::string("\")."));
@@ -636,9 +666,16 @@ void Controller::set_parallelization_circuit(const Circuit &circ,
                                              const Method method)  
 {
   enable_batch_multi_shots_ = false;
-  if(batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && max_batched_states_ >= num_gpus_ && 
-              batched_shots_gpu_max_qubits_ >= circ.num_qubits ){
-    enable_batch_multi_shots_ = true;
+  if(batched_shots_gpu_ && sim_device_ == Device::GPU && 
+     circ.shots > 1 && max_batched_states_ >= num_gpus_ && 
+     batched_shots_gpu_max_qubits_ >= circ.num_qubits ){
+      enable_batch_multi_shots_ = true;
+  }
+
+  if(sim_device_ == Device::GPU && cuStateVec_enable_){
+    enable_batch_multi_shots_ = false;    //cuStateVec does not support batch execution of multi-shots
+    parallel_shots_ = 1;    //cuStateVec is currently not thread safe
+    return;
   }
 
   if(explicit_parallelization_)
@@ -785,6 +822,7 @@ size_t Controller::get_gpu_memory_mb() {
   }
   num_gpus_ = nDev;
 #endif
+
 #ifdef AER_MPI
   // get minimum memory size per process
   uint64_t locMem, minMem;
@@ -866,7 +904,6 @@ Result Controller::execute(const inputdata_t &input_qobj) {
     auto time_taken =
         std::chrono::duration<double>(myclock_t::now() - timer_start).count();
     result.metadata.add(time_taken, "time_taken");
-
     return result;
   } catch (std::exception &e) {
     // qobj was invalid, return valid output containing error message
@@ -959,7 +996,7 @@ Result Controller::execute(std::vector<Circuit> &circuits,
     const int NUM_RESULTS = result.results.size();
     //following looks very similar but we have to separate them to avoid omp nested loops that causes performance degradation
     //(DO NOT use if statement in #pragma omp)
-    if (parallel_experiments_ == 1) {
+    if (parallel_experiments_ == 1 || sim_device_ == Device::ThrustCPU) {
       for (int j = 0; j < NUM_RESULTS; ++j) {
         set_parallelization_circuit(circuits[j], noise_model, methods[j]);
         run_circuit(circuits[j], noise_model,methods[j],
@@ -1439,7 +1476,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ,
   // Check if measure sampler and optimization are valid
   if (can_sample) {
     // Implement measure sampler
-    if (parallel_shots_ <= 1) {
+    if (parallel_shots_ <= 1 || sim_device_ == Device::GPU || sim_device_ == Device::ThrustCPU) {
       state.set_max_matrix_qubits(max_bits);
       RngEngine rng;
       rng.set_seed(circ.seed);
@@ -1460,7 +1497,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ,
         shot_state.set_parallelization(parallel_state_update_);
         shot_state.set_global_phase(circ.global_phase_angle);
 
-        state.set_max_matrix_qubits(max_bits);
+        shot_state.set_max_matrix_qubits(max_bits);
 
         RngEngine rng;
         rng.set_seed(circ.seed + i);
@@ -1736,7 +1773,12 @@ void Controller::measure_sampler(
     shots_or_index = shots;
   else
     shots_or_index = shot_index;
+
+  auto timer_start = myclock_t::now();
   auto all_samples = state.sample_measure(meas_qubits, shots_or_index, rng);
+  auto time_taken =
+      std::chrono::duration<double>(myclock_t::now() - timer_start).count();
+  result.metadata.add(time_taken, "sample_measure_time");
 
   // Make qubit map of position in vector of measured qubits
   std::unordered_map<uint_t, uint_t> qubit_map;