[CI] Add op-benchmark (#72991)

swgu98 · web-flow · commit 425c14ddc08c · 2025-05-30T04:50:39.000-07:00
* test=document_fix

* test=document_fix

* test=document_fix

* test=document_fix
diff --git a/tools/op_benchmark.sh b/tools/op_benchmark.sh
@@ -14,7 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set +ex
+set +e  # do not abort the script when a command returns non-zero
+set -x  # print each command before it is executed
 
 [ -z "$PADDLE_ROOT" ] && PADDLE_ROOT=$(cd $(dirname ${BASH_SOURCE[0]})/.. && pwd)
 
@@ -94,35 +95,21 @@ function load_CHANGE_OP_FILES_by_header_file {
 # Load op files that PR changes
 function load_CHANGE_OP_FILES {
   LOG "[INFO] run function load_CHANGE_OP_FILES"
-  local sub_dir change_file
-  # TODO(Avin0323): Need to filter the files added by the new OP.
-  for change_file in $(git diff --name-only develop)
-  do
-    # match directory limit
-    [[ "$change_file" =~ "paddle/fluid/operators/" ]] || [[ "$change_file" =~ "paddle/phi/kernels/" ]]  || continue
-    # match file name limit
-    if [[ "$change_file" =~ "_op.cu" || "$change_file" =~ "_kernel.cu" || "$change_file" =~ "_kernel_gpudnn.cu" ]]
-    then
-      # match cu file directory limit
-      match_cu_file_directory $change_file || continue
-      LOG "[INFO] Found \"${change_file}\" changed."
-      CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file"
-    elif [[ "$change_file" =~ ".h" ]]
-    then
-      match_h_file_directory $change_file || continue
-      LOG "[INFO] Found \"${change_file}\" changed, keep searching."
-      INCLUDE_SEARCH_MAP[${change_file}]="searched"
-      load_CHANGE_OP_FILES_by_header_file $change_file
+  local dir pattern file  # keep loop variables out of the global scope
+  # Collect every matching .cu file in the tree, not just files the PR changed.
+  for dir in "paddle/fluid/operators" "paddle/phi/kernels"; do  # cwd-relative
+    if [ -d "$dir" ]; then
+      for pattern in "_op.cu" "_kernel.cu" "_kernel_gpudnn.cu"; do
+        while IFS= read -r file; do
+          match_cu_file_directory "$file" || continue  # quoted: one argument
+          LOG "[INFO] Found \"${file}\"."
+          CHANGE_OP_FILES+=("$file")
+        done < <(find "$dir" -type f -name "*$pattern" 2>/dev/null)
+      done
+    else
+      echo "Directory $dir does not exist."
     fi
   done
-  if [ ${#CHANGE_OP_FILES[@]} -eq 0 ]; then
-    LOG "[INFO] Uninstall PaddlePaddle ..."
-    pip uninstall -y paddlepaddle paddlepaddle_gpu
-    LOG "[INFO] Install PaddlePaddle ..."
-    pip install build/pr_whl/*.whl
-    collect_kernel_registry_info
-    LOG "[INFO] No op to test, skip this ci." && exit 0
-  fi
 }
 
 # Clone benchmark repo
@@ -233,7 +220,7 @@ function check_op_benchmark_result {
   local logs_dir api_info_file check_status_code
   # default 3 times
   [ -z "${RETRY_TIMES}" ] && RETRY_TIMES=3
-  logs_dir=$(pwd)/logs-test_pr
+  logs_dir=$(pwd)/logs-pr_whl
   api_info_file=$(pwd)/api_info.txt
   for retry_time in $(seq 0 ${RETRY_TIMES})
   do
@@ -255,7 +242,7 @@ function check_op_benchmark_result {
     # check current result and update the file to benchmark test
     python ${PADDLE_ROOT}/tools/check_op_benchmark_result.py \
         --develop_logs_dir $(pwd)/logs-dev_whl \
-        --pr_logs_dir $(pwd)/logs-test_pr \
+        --pr_logs_dir $(pwd)/logs-pr_whl \
         --api_info_file ${api_info_file}
     check_status_code=$?
     # TODO(Avin0323): retry only if the performance check fails
@@ -315,24 +302,5 @@ function gpu_op_benchmark {
   exit 0
 }
 
-
-# The PR will pass quickly when get approval from specific person.
-set +x
-approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000)
-if [ -n "${approval_line}" ]; then
-  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 Xreki zhangting2020)
-  LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-  if [ "${APPROVALS}" == "TRUE" ]; then
-    LOG "[INFO] ==================================="
-    LOG "[INFO] current pr ${GIT_PR_ID} has got approvals. So, Pass CI directly!"
-    LOG "[INFO] ==================================="
-    exit 0
-  fi
-fi
-
-case $1 in
-  run_op_benchmark)
-    prepare_env
-    gpu_op_benchmark
-  ;;
-esac
+prepare_env       # always runs: the script no longer dispatches on "$1"
+gpu_op_benchmark  # its last statement is `exit 0`, which ends the script
diff --git a/tools/op_benchmark_count.py b/tools/op_benchmark_count.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Summarize an op-benchmark check log.
+
+Counts how many "GPU time change" / "Total time change" lines fall into each
+percentage bucket and how often each case is named by a
+'Check speed result with case "..."' line, then prints the summary.
+
+Usage: python op_benchmark_count.py <log_file>
+"""
+
+import argparse
+import re
+import sys
+from collections import Counter
+
+# Bucket names in report order; every bucket is printed, even when empty.
+CATEGORY_NAMES = (
+    "within_1%",
+    "increase_1_to_5%",
+    "increase_above_5_to_10%",
+    "increase_above_10%",
+    "decrease_1_to_5%",
+    "decrease_above_5%",
+)
+
+# An empty GPU capture is possible and is counted as a 0.0 change.
+gpu_time_pattern = re.compile(r"GPU time change: ([\d.-]*)")
+total_time_pattern = re.compile(r"Total time change: ([\d.-]+)%")
+error_pattern = re.compile(r'Check speed result with case "(.*?)"')
+
+
+def classify_change(change):
+    """Return the bucket name for a percentage change (positive = increase)."""
+    if -1 < change < 1:
+        return "within_1%"
+    if 1 <= change < 5:
+        return "increase_1_to_5%"
+    if 5 <= change < 10:
+        return "increase_above_5_to_10%"
+    if change >= 10:
+        return "increase_above_10%"
+    if -5 < change <= -1:
+        return "decrease_1_to_5%"
+    return "decrease_above_5%"
+
+
+def record_change(categories, text, line, default=None):
+    """Parse ``text`` as a float and bump its bucket in ``categories``.
+
+    An empty ``text`` counts as ``default`` (skipped when ``default`` is None).
+    A token that is not a valid number (e.g. "-" or "1.2.3", both of which the
+    regexes accept) is reported on stderr and skipped instead of aborting the
+    whole report with a ValueError.
+    """
+    if text:
+        try:
+            change = float(text)
+        except ValueError:
+            print(
+                f"Skip malformed time change: {line.rstrip()}",
+                file=sys.stderr,
+            )
+            return
+    elif default is None:
+        return
+    else:
+        change = default
+    categories[classify_change(change)] += 1
+
+
+def analyze(lines):
+    """Scan log ``lines`` and return the collected statistics.
+
+    Returns ``(gpu_buckets, total_buckets, error_cases, gpu_time_lines)``:
+    two bucket-name -> count dicts, a case-name -> occurrences Counter, and
+    the number of lines containing "GPU time change".
+    """
+    gpu_buckets = dict.fromkeys(CATEGORY_NAMES, 0)
+    total_buckets = dict.fromkeys(CATEGORY_NAMES, 0)
+    error_cases = Counter()
+    gpu_time_lines = 0
+
+    for line in lines:
+        # A line is attributed to the first kind it matches, in this order.
+        if "GPU time change" in line:
+            gpu_time_lines += 1
+            match = gpu_time_pattern.search(line)
+            if match:
+                record_change(gpu_buckets, match.group(1), line, 0.0)
+        elif "Total time change" in line:
+            match = total_time_pattern.search(line)
+            if match:
+                record_change(total_buckets, match.group(1), line)
+        else:
+            match = error_pattern.search(line)
+            if match:
+                error_cases[match.group(1)] += 1
+
+    return gpu_buckets, total_buckets, error_cases, gpu_time_lines
+
+
+def print_categories(categories, title):
+    """Print each bucket's count and its share of the bucket total."""
+    total = sum(categories.values())
+    print(f"\n{title} Categories:")
+    for category, count in categories.items():
+        percentage = (count / total * 100) if total > 0 else 0
+        print(f"{category}: {count} ({percentage:.2f}%)")
+
+
+def main():
+    """Parse the command line, analyze the log file and print the report."""
+    parser = argparse.ArgumentParser(
+        description="Analyze time changes in log files"
+    )
+    parser.add_argument('file_name', type=str, help='The name of the log file')
+    args = parser.parse_args()
+
+    # errors="replace": undecodable bytes in the log must not abort the scan.
+    with open(args.file_name, 'r', errors="replace") as file:
+        gpu_buckets, total_buckets, error_cases, gpu_time_lines = analyze(file)
+
+    print_categories(gpu_buckets, "GPU Time Change")
+    print_categories(total_buckets, "Total Time Change")
+
+    total_errors = sum(error_cases.values())
+    # Share of reported cases relative to the number of "GPU time change" lines.
+    error_percentage = (
+        (total_errors / gpu_time_lines * 100) if gpu_time_lines > 0 else 0
+    )
+    unique_errors = len(error_cases)
+
+    print(f"\nError Cases Total: {total_errors}")
+    print(f"Error Lines Percentage: {error_percentage:.2f}%")
+    print(f"Unique Error OP: {unique_errors}\n")
+
+    for case, count in error_cases.items():
+        print(f"OP '{case}': {count} occurrences")
+
+
+if __name__ == "__main__":
+    main()