-
Notifications
You must be signed in to change notification settings - Fork 434
Add experimental support of cuQuantum #1400
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
309c73d
54dc128
a5bc75e
b1bd96e
a40898c
adfc125
26c4538
87afff5
f16a35c
5533b76
0c10325
181eb2c
54d1a68
4d502ed
eba2594
5a93807
983773b
5bea04d
1fb5031
1d01542
da0f42d
c781208
0f4a93e
c509131
5458b7c
61083cb
046036d
3a31cef
de4c978
88d7d95
3ffabcf
7cf50ee
879a4ac
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,13 @@ | ||
| --- | ||
| features: | ||
| - | | ||
| Added support for cuQuantum, NVIDIA's APIs for quantum computing, | ||
| to accelerate statevector, density matrix and unitary simulators | ||
| by using GPUs. | ||
| This is an experimental implementation for cuQuantum Beta 2 (0.1.0). | ||
| cuStateVec APIs are used for acceleration in place of Aer's own implementations | ||
| when Aer is built with ``CUSTATEVEC_ROOT`` set to the path of cuQuantum. | ||
| (binary distribution is not available currently.) | ||
| cuStateVec is enabled by setting the ``device='GPU'`` and | ||
| ``cuStateVec_threshold`` options. cuStateVec is used when the number of | ||
| qubits of the input circuit is equal to or greater than ``cuStateVec_threshold``. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -377,6 +377,8 @@ class Controller { | |
| int_t batched_shots_gpu_max_qubits_ = 16; //multi-shot parallelization is applied if qubits is less than max qubits | ||
| bool enable_batch_multi_shots_ = false; //multi-shot parallelization can be applied | ||
|
|
||
| //settings for cuStateVec | ||
| bool cuStateVec_enable_ = false; | ||
| }; | ||
|
|
||
| //========================================================================= | ||
|
|
@@ -466,6 +468,12 @@ void Controller::set_config(const json_t &config) { | |
| JSON::get_value(batched_shots_gpu_max_qubits_, "batched_shots_gpu_max_qubits", config); | ||
| } | ||
|
|
||
| //cuStateVec configs | ||
| cuStateVec_enable_ = false; | ||
| if(JSON::check_key("cuStateVec_enable", config)) { | ||
| JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config); | ||
| } | ||
|
|
||
| // Override automatic simulation method with a fixed method | ||
| std::string method; | ||
| if (JSON::get_value(method, "method", config)) { | ||
|
|
@@ -489,6 +497,9 @@ void Controller::set_config(const json_t &config) { | |
| } | ||
| } | ||
|
|
||
| if(method_ == Method::density_matrix || method_ == Method::unitary) | ||
| batched_shots_gpu_max_qubits_ /= 2; | ||
|
|
||
| // Override automatic simulation method with a fixed method | ||
| if (JSON::get_value(sim_device_name_, "device", config)) { | ||
| if (sim_device_name_ == "CPU") { | ||
|
|
@@ -502,18 +513,37 @@ void Controller::set_config(const json_t &config) { | |
| #endif | ||
| } else if (sim_device_name_ == "GPU") { | ||
| #ifndef AER_THRUST_CUDA | ||
| throw std::runtime_error( | ||
| "Simulation device \"GPU\" is not supported on this system"); | ||
| throw std::runtime_error( | ||
| "Simulation device \"GPU\" is not supported on this system"); | ||
| #else | ||
| int nDev; | ||
| if (cudaGetDeviceCount(&nDev) != cudaSuccess) { | ||
| cudaGetLastError(); | ||
| throw std::runtime_error("No CUDA device available!"); | ||
| } | ||
|
|
||
| sim_device_ = Device::GPU; | ||
| #ifndef AER_CUSTATEVEC | ||
| if(cuStateVec_enable_){ | ||
| //Aer is not built for cuStateVec | ||
| throw std::runtime_error( | ||
| "Simulation device \"GPU\" does not supported cuStateVec on this system"); | ||
| } | ||
| #endif | ||
| int nDev; | ||
| if (cudaGetDeviceCount(&nDev) != cudaSuccess) { | ||
| cudaGetLastError(); | ||
| throw std::runtime_error("No CUDA device available!"); | ||
| } | ||
| sim_device_ = Device::GPU; | ||
|
|
||
| #ifdef AER_CUSTATEVEC | ||
| if(cuStateVec_enable_){ | ||
| //initialize the cuStateVec handle once before the actual calculation (the first call takes a long time) | ||
| custatevecStatus_t err; | ||
| custatevecHandle_t stHandle; | ||
| err = custatevecCreate(&stHandle); | ||
| if(err == CUSTATEVEC_STATUS_SUCCESS){ | ||
| custatevecDestroy(stHandle); | ||
| } | ||
| } | ||
| #endif | ||
| #endif | ||
| } | ||
| else { | ||
| throw std::runtime_error(std::string("Invalid simulation device (\"") + | ||
| sim_device_name_ + std::string("\").")); | ||
|
|
@@ -636,9 +666,16 @@ void Controller::set_parallelization_circuit(const Circuit &circ, | |
| const Method method) | ||
| { | ||
| enable_batch_multi_shots_ = false; | ||
| if(batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && max_batched_states_ >= num_gpus_ && | ||
| batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ | ||
| enable_batch_multi_shots_ = true; | ||
| if(batched_shots_gpu_ && sim_device_ == Device::GPU && | ||
| circ.shots > 1 && max_batched_states_ >= num_gpus_ && | ||
| batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ | ||
| enable_batch_multi_shots_ = true; | ||
| } | ||
|
|
||
| if(sim_device_ == Device::GPU && cuStateVec_enable_){ | ||
| enable_batch_multi_shots_ = false; //cuStateVec does not support batch execution of multi-shots | ||
| parallel_shots_ = 1; //cuStateVec is currently not thread safe | ||
| return; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Question: when IMHO though it's beyond a "workaround": even after we fix the thread safety issue, generally speaking it is still challenging for library handles to be shared by multiple host threads. For example, despite cuBLAS supports this usage pattern they explicitly recommend to not do so. Thus the handle pool approach is commonly seen in ML/DL frameworks.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for explanation @doichanj. I understand better now. So once we fix thread safety we can unblock you for the shot-level parallelization. |
||
| } | ||
|
|
||
| if(explicit_parallelization_) | ||
|
|
@@ -785,6 +822,7 @@ size_t Controller::get_gpu_memory_mb() { | |
| } | ||
| num_gpus_ = nDev; | ||
| #endif | ||
|
|
||
| #ifdef AER_MPI | ||
| // get minimum memory size per process | ||
| uint64_t locMem, minMem; | ||
|
|
@@ -866,7 +904,6 @@ Result Controller::execute(const inputdata_t &input_qobj) { | |
| auto time_taken = | ||
| std::chrono::duration<double>(myclock_t::now() - timer_start).count(); | ||
| result.metadata.add(time_taken, "time_taken"); | ||
|
|
||
| return result; | ||
| } catch (std::exception &e) { | ||
| // qobj was invalid, return valid output containing error message | ||
|
|
@@ -959,7 +996,7 @@ Result Controller::execute(std::vector<Circuit> &circuits, | |
| const int NUM_RESULTS = result.results.size(); | ||
| //following looks very similar but we have to separate them to avoid omp nested loops that causes performance degradation | ||
| //(DO NOT use if statement in #pragma omp) | ||
| if (parallel_experiments_ == 1) { | ||
| if (parallel_experiments_ == 1 || sim_device_ == Device::ThrustCPU) { | ||
| for (int j = 0; j < NUM_RESULTS; ++j) { | ||
| set_parallelization_circuit(circuits[j], noise_model, methods[j]); | ||
| run_circuit(circuits[j], noise_model,methods[j], | ||
|
|
@@ -1439,7 +1476,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ, | |
| // Check if measure sampler and optimization are valid | ||
| if (can_sample) { | ||
| // Implement measure sampler | ||
| if (parallel_shots_ <= 1) { | ||
| if (parallel_shots_ <= 1 || sim_device_ == Device::GPU || sim_device_ == Device::ThrustCPU) { | ||
| state.set_max_matrix_qubits(max_bits); | ||
| RngEngine rng; | ||
| rng.set_seed(circ.seed); | ||
|
|
@@ -1460,7 +1497,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ, | |
| shot_state.set_parallelization(parallel_state_update_); | ||
| shot_state.set_global_phase(circ.global_phase_angle); | ||
|
|
||
| state.set_max_matrix_qubits(max_bits); | ||
| shot_state.set_max_matrix_qubits(max_bits); | ||
|
hhorii marked this conversation as resolved.
|
||
|
|
||
| RngEngine rng; | ||
| rng.set_seed(circ.seed + i); | ||
|
|
@@ -1736,7 +1773,12 @@ void Controller::measure_sampler( | |
| shots_or_index = shots; | ||
| else | ||
| shots_or_index = shot_index; | ||
|
|
||
| auto timer_start = myclock_t::now(); | ||
| auto all_samples = state.sample_measure(meas_qubits, shots_or_index, rng); | ||
| auto time_taken = | ||
| std::chrono::duration<double>(myclock_t::now() - timer_start).count(); | ||
| result.metadata.add(time_taken, "sample_measure_time"); | ||
|
|
||
| // Make qubit map of position in vector of measured qubits | ||
| std::unordered_map<uint_t, uint_t> qubit_map; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.