diff --git a/include/onnxruntime/core/providers/nnapi/nnapi_provider_factory.h b/include/onnxruntime/core/providers/nnapi/nnapi_provider_factory.h index 1da88b1f07254..e8e1ecb1a0396 100644 --- a/include/onnxruntime/core/providers/nnapi/nnapi_provider_factory.h +++ b/include/onnxruntime/core/providers/nnapi/nnapi_provider_factory.h @@ -19,9 +19,25 @@ enum NNAPIFlags { // Please note for now, NNAPI perform worse using NCHW compare to using NHWC NNAPI_FLAG_USE_NCHW = 0x002, + // Prevent NNAPI from using CPU devices. + // + // NNAPI is more efficient using GPU or NPU for execution, and NNAPI might fall back to its own CPU implementation + // for operations not supported by GPU/NPU. The CPU implementation of NNAPI (which is called nnapi-reference) + // might be less efficient than the optimized versions of the operation of ORT. It might be advantageous to disable + // the NNAPI CPU fallback and handle execution using ORT kernels. + // + // For some models, if NNAPI would use CPU to execute an operation, and this flag is set, the execution of the + // model may fall back to ORT kernels. 
+ // + // This option is only available on Android API level 29 and higher, and will be ignored on Android API level 28 and lower + // + // For NNAPI device assignments, see https://developer.android.com/ndk/guides/neuralnetworks#device-assignment + // For NNAPI CPU fallback, see https://developer.android.com/ndk/guides/neuralnetworks#cpu-fallback + NNAPI_FLAG_CPU_DISABLED = 0x004, + // Keep NNAPI_FLAG_MAX at the end of the enum definition // And assign the last NNAPIFlag to it - NNAPI_FLAG_LAST = NNAPI_FLAG_USE_NCHW, + NNAPI_FLAG_LAST = NNAPI_FLAG_CPU_DISABLED, }; #ifdef __cplusplus diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index ead5a7eb1ba21..3a7f9c1dc7859 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -87,17 +87,23 @@ Status ModelBuilder::GetTargetDevices() { for (uint32_t i = 0; i < num_devices; i++) { ANeuralNetworksDevice* device = nullptr; const char* device_name = nullptr; + int32_t device_type; RETURN_STATUS_ON_ERROR_WITH_NOTE( nnapi_->ANeuralNetworks_getDevice(i, &device), "Getting " + std::to_string(i) + "th device"); RETURN_STATUS_ON_ERROR_WITH_NOTE(nnapi_->ANeuralNetworksDevice_getName(device, &device_name), "Getting " + std::to_string(i) + "th device's name"); + RETURN_STATUS_ON_ERROR_WITH_NOTE(nnapi_->ANeuralNetworksDevice_getType(device, &device_type), + "Getting " + std::to_string(i) + "th device's type"); + bool device_is_cpu = nnapi_cpu == device_name; if ((target_device_option_ == TargetDeviceOption::CPU_DISABLED && !device_is_cpu) || (target_device_option_ == TargetDeviceOption::CPU_ONLY && device_is_cpu)) { nnapi_target_devices_.push_back(device); - LOGS_DEFAULT(VERBOSE) << "Target device [" << device_name << "] added"; + const auto device_detail = MakeString("[Name: [", device_name, "], Type [", device_type, "]], "); + 
nnapi_target_devices_detail_ += device_detail; + LOGS_DEFAULT(VERBOSE) << "Target device " << device_detail << " is added"; } } @@ -489,6 +495,7 @@ Status ModelBuilder::AddOperation(int op, const std::vector& input_ind output_indices.size(), &output_indices[0]), "op = " + std::to_string(op)); + num_nnapi_ops_++; return Status::OK(); } @@ -515,7 +522,38 @@ Status ModelBuilder::Compile(std::unique_ptr& model) { nnapi_->ANeuralNetworksModel_finish(nnapi_model_->model_), "on model finish"); + // We have a list of target devices, try to see if the model can be run entirely + // using the list of target devices + // This is only available on API 29+, for API 28- the nnapi_target_devices_ will + // be empty so we will not check API level here, see GetTargetDevices() + bool use_create_for_devices = false; if (!nnapi_target_devices_.empty()) { + std::unique_ptr supported_ops_holder = onnxruntime::make_unique(num_nnapi_ops_); + auto* supported_ops = supported_ops_holder.get(); + RETURN_STATUS_ON_ERROR_WITH_NOTE( + nnapi_->ANeuralNetworksModel_getSupportedOperationsForDevices( + nnapi_model_->model_, nnapi_target_devices_.data(), + nnapi_target_devices_.size(), supported_ops), + "on getSupportedOperationsForDevices"); + + bool all_ops_supported = std::all_of(supported_ops, supported_ops + num_nnapi_ops_, + [](bool is_supported) { return is_supported; }); + if (!all_ops_supported) { + // There are some ops not supported by the list of the target devices + // Fail the Compile + // + // TODO: add some logic to not fail for some cases + // For example, if there is an acceptable fallback to CPU (nnapi-reference) + // and CPU is not in the target devices list + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The model cannot run using current set of target devices, ", + nnapi_target_devices_detail_); + } else { + use_create_for_devices = true; + } + } + + if (use_create_for_devices) { RETURN_STATUS_ON_ERROR_WITH_NOTE( + nnapi_->ANeuralNetworksCompilation_createForDevices( 
nnapi_model_->model_, nnapi_target_devices_.data(), diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h index 86882c8e7c975..ee2ac944b6327 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h @@ -22,9 +22,12 @@ class ModelBuilder { enum class TargetDeviceOption : int8_t { ALL_DEVICES, // use all avaliable target devices - /* TODO support this option - SINGLE_DEVICE, // use a single target device, must be given + + /* TODO support these options + PREFERRED_DEVICES, // Use one or more preferred devices (must be given) + EXCLUDED_DEVICES, // Exclude one or more devices (must be given) */ + CPU_DISABLED, // use all available target devices except CPU CPU_ONLY, // use CPU only }; @@ -74,6 +77,8 @@ class ModelBuilder { // It is off by default void SetUseFp16(bool use_fp16) { use_fp16_ = use_fp16; } + void SetTargetDeviceOption(TargetDeviceOption option) { target_device_option_ = option; } + // Set NNAPI execution preference // Default preference is PREFER_SUSTAINED_SPEED void ExecutePreference( @@ -148,7 +153,10 @@ class ModelBuilder { TargetDeviceOption target_device_option_{TargetDeviceOption::ALL_DEVICES}; std::vector nnapi_target_devices_; + std::string nnapi_target_devices_detail_; // Debug info for target devices + // The number of nnapi operations in this model + size_t num_nnapi_ops_ = 0; uint32_t next_index_ = 0; // Convert the onnx model to ANeuralNetworksModel diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc index e959a24c2b95a..f1dfe7b03a47d 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc @@ -256,6 +256,10 @@
common::Status NnapiExecutionProvider::Compile(const std::vector nnapi_model; ORT_RETURN_IF_ERROR(builder.Compile(nnapi_model));