Commit e1ffaed

[Bug Fix] Improve error handling and compatibility in TensorRT engine tests (#75948)
- 在 test_tensorrt_engine_instruction.cc 里,原先直接用 TensorRT 的 `FullyConnected` 层,现在改成手工搭建 Shuffle → Constant → MatrixMultiply → ElementWise → Shuffle 的子网,等价地实现带 bias 的全连接。这样做主要是规避 TensorRT 里旧版 FC 层的限制,并能更清楚地控制动态形状和推理流程。 - 每一步都补充了更具体的 `PADDLE_ENFORCE_NOT_NULL` 抛错信息,比如提示 reshape、常量层、矩阵乘、加法等各环节可能失败的原因,便于在引擎生成失败时快速定位问题。 - 针对 TensorRT 8.6 之后 `ICudaEngine` API 的变化,新增了 `IS_TRT_VERSION_GE(8600)` 的分支,在新老版本之间分别检查 `getNbIOTensors()` 或 `getNbBindings()`,保证测试在不同 TensorRT 版本下都能正确校验。 - 动态 shape 的测试把 Shuffle 失败时的报错信息改得更精准,明确指出是运行时 shape 绑定的问题。 - 插件测试同样完善了插件创建、层加入失败时的提示,并加入了前述的 TensorRT 版本兼容检查,使调试自定义插件时的可诊断性更好。
1 parent 31870e6 commit e1ffaed

File tree

1 file changed: +82 −11 lines

test/cpp/inference/tensorrt/test_tensorrt_engine_instruction.cc

@@ -85,20 +85,79 @@ TEST(TensorRTEngineInstructionTest, test_tensorrt_engine_instruction) {
       nvinfer1::DataType::kFLOAT, raw_bias, size);
   auto *x = engine->DeclareInput(
       "x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{-1, 1, 1, 1});
-  auto *fc_layer = TRT_ENGINE_ADD_LAYER(
-      engine, FullyConnected, *x, size, weight.get(), bias.get());
-  PADDLE_ENFORCE_NOT_NULL(fc_layer,
-                          common::errors::InvalidArgument(
-                              "TRT fully connected layer building failed."));
+  auto *flatten_layer = engine->network()->addShuffle(*x);
+  PADDLE_ENFORCE_NOT_NULL(
+      flatten_layer,
+      common::errors::InvalidArgument(
+          "Unable to build the TensorRT shuffle layer for the input tensor "
+          "'x'. "
+          "This usually indicates the TensorRT network failed to allocate the "
+          "intermediate reshape layer."));
+  flatten_layer->setReshapeDimensions(nvinfer1::Dims2{-1, 1});
+
+  auto *weight_layer = TRT_ENGINE_ADD_LAYER(
+      engine, Constant, nvinfer1::Dims2{1, 1}, weight.get());
+  PADDLE_ENFORCE_NOT_NULL(
+      weight_layer,
+      common::errors::InvalidArgument("TensorRT failed to create the constant "
+                                      "layer for parameter 'weight'. "
+                                      "Please confirm the TensorRT builder "
+                                      "supports constant initialisation "
+                                      "for the provided weight shape."));
+
+  auto *bias_layer =
+      TRT_ENGINE_ADD_LAYER(engine, Constant, nvinfer1::Dims2{1, 1}, bias.get());
+  PADDLE_ENFORCE_NOT_NULL(
+      bias_layer,
+      common::errors::InvalidArgument(
+          "TensorRT failed to create the constant layer for parameter 'bias'. "
+          "Check whether the provided bias data matches the expected shape."));
+
+  auto *matmul_layer = TRT_ENGINE_ADD_LAYER(engine,
+                                            MatrixMultiply,
+                                            *flatten_layer->getOutput(0),
+                                            nvinfer1::MatrixOperation::kNONE,
+                                            *weight_layer->getOutput(0),
+                                            nvinfer1::MatrixOperation::kNONE);
+  PADDLE_ENFORCE_NOT_NULL(
+      matmul_layer,
+      common::errors::InvalidArgument(
+          "TensorRT returned a null matrix-multiply layer while fusing the "
+          "fully-connected op. Verify the network input ranks and TensorRT "
+          "version."));
+
+  auto *add_layer = TRT_ENGINE_ADD_LAYER(engine,
+                                         ElementWise,
+                                         *matmul_layer->getOutput(0),
+                                         *bias_layer->getOutput(0),
+                                         nvinfer1::ElementWiseOperation::kSUM);
+  PADDLE_ENFORCE_NOT_NULL(
+      add_layer,
+      common::errors::InvalidArgument(
+          "TensorRT could not construct the elementwise-add layer for bias "
+          "fusion. Ensure the bias tensor uses broadcastable dimensions."));
 
-  engine->DeclareOutput(fc_layer, 0, "y");
+  auto *reshape_layer = engine->network()->addShuffle(*add_layer->getOutput(0));
+  PADDLE_ENFORCE_NOT_NULL(
+      reshape_layer,
+      common::errors::InvalidArgument(
+          "TensorRT could not emit the final shuffle layer to restore the "
+          "output shape. Confirm the shape tensor and inferred dimensions are "
+          "valid."));
+  reshape_layer->setReshapeDimensions(nvinfer1::Dims4{-1, 1, 1, 1});
+
+  engine->DeclareOutput(reshape_layer, 0, "y");
   std::vector<std::string> input_names = {"x", ""};
   std::vector<std::string> output_names = {"y"};
   std::vector<std::vector<int64_t>> outputs_shape = {{1}};
   std::vector<phi::DataType> outputs_dtype = {phi::DataType::FLOAT32};
   LOG(INFO) << "freeze network";
   engine->FreezeNetwork();
+#if IS_TRT_VERSION_GE(8600)
+  ASSERT_EQ(engine->engine()->getNbIOTensors(), 2);
+#else
   ASSERT_EQ(engine->engine()->getNbBindings(), 2);
+#endif
   nvinfer1::IHostMemory *serialized_engine_data = engine->Serialize();
 
   std::ofstream outFile("engine_serialized_data.bin", std::ios::binary);
@@ -220,7 +279,10 @@ TEST(TensorRTEngineInstructionTest, test_tensorrt_engine_instruction_dynamic) {
   layer->setInput(1, *shape);
   PADDLE_ENFORCE_NOT_NULL(
       layer,
-      common::errors::InvalidArgument("TRT shuffle layer building failed."));
+      common::errors::InvalidArgument(
+          "TensorRT failed to construct the dynamic shuffle layer that "
+          "consumes the runtime shape tensor. Please check the provided "
+          "shape binding."));
   engine->DeclareOutput(layer, 0, "y");
   engine->FreezeNetwork();

@@ -401,14 +463,19 @@ TEST(PluginTest, test_generic_plugin) {
       creator->createPlugin("pir_generic_plugin", plugin_collection.get());
   PADDLE_ENFORCE_NOT_NULL(
       generic_plugin,
-      common::errors::InvalidArgument("TRT create generic plugin failed."));
+      common::errors::InvalidArgument(
+          "TensorRT plugin registry returned nullptr while creating "
+          "'pir_generic_plugin'. Verify the plugin has been registered before "
+          "building the engine."));
   std::vector<nvinfer1::ITensor *> plugin_inputs;
   plugin_inputs.emplace_back(x);
   auto plugin_layer = engine->network()->addPluginV2(
       plugin_inputs.data(), plugin_inputs.size(), *generic_plugin);
-  PADDLE_ENFORCE_NOT_NULL(plugin_layer,
-                          common::errors::InvalidArgument(
-                              "TRT generic plugin layer building failed."));
+  PADDLE_ENFORCE_NOT_NULL(
+      plugin_layer,
+      common::errors::InvalidArgument(
+          "TensorRT failed to add the generic plugin layer to the network. "
+          "Ensure the plugin inputs match the expected TensorRT types."));
 
   engine->DeclareOutput(plugin_layer, 0, "y");
   std::vector<std::string> input_names = {"x"};
@@ -417,7 +484,11 @@ TEST(PluginTest, test_generic_plugin) {
   std::vector<phi::DataType> outputs_dtype = {phi::DataType::FLOAT32};
   LOG(INFO) << "freeze network";
   engine->FreezeNetwork();
+#if IS_TRT_VERSION_GE(8600)
+  ASSERT_EQ(engine->engine()->getNbIOTensors(), 2);
+#else
   ASSERT_EQ(engine->engine()->getNbBindings(), 2);
+#endif
   nvinfer1::IHostMemory *serialized_engine_data = engine->Serialize();
   std::ofstream outFile("engine_serialized_data.bin", std::ios::binary);
   outFile.write(static_cast<const char *>(serialized_engine_data->data()),
