@@ -635,8 +635,11 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
635635
636636static inline void put_obj (struct ras_manager * obj )
637637{
638- if (obj && (-- obj -> use == 0 ))
638+ if (obj && (-- obj -> use == 0 )) { /* use count reached zero: unlink the object and finalize its error data */
639639 list_del (& obj -> node );
640+ amdgpu_ras_error_data_fini (& obj -> err_data ); /* counterpart of amdgpu_ras_error_data_init; presumably releases what init set up - TODO confirm */
641+ }
642+
640643 if (obj && (obj -> use < 0 ))
641644 DRM_ERROR ("RAS ERROR: Unbalance obj(%s) use\n" , get_ras_block_str (& obj -> head ));
642645}
@@ -666,6 +669,9 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
666669 if (alive_obj (obj ))
667670 return NULL ;
668671
672+ if (amdgpu_ras_error_data_init (& obj -> err_data ))
673+ return NULL ;
674+
669675 obj -> head = * head ;
670676 obj -> adev = adev ;
671677 list_add (& obj -> node , & con -> head );
@@ -1023,44 +1029,68 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
10231029}
10241030
10251031static void amdgpu_ras_error_print_error_data (struct amdgpu_device * adev ,
1026- struct ras_query_if * query_if ,
1032+ struct ras_manager * ras_mgr , /* manager whose err_data supplies the in-total counts */
10271033 struct ras_err_data * err_data ,
1034+ const char * blk_name , /* block name inserted into every message */
10281035 bool is_ue )
10291036{
1030- struct ras_manager * ras_mgr = amdgpu_ras_find_obj (adev , & query_if -> head );
1031- const char * blk_name = get_ras_block_str (& query_if -> head );
10321037 struct amdgpu_smuio_mcm_config_info * mcm_info ;
10331038 struct ras_err_node * err_node ;
10341039 struct ras_err_info * err_info ;
10351040
1036- if (is_ue )
1037- dev_info (adev -> dev , "%ld uncorrectable hardware errors detected in %s block\n" ,
1038- ras_mgr -> err_data .ue_count , blk_name );
1039- else
1040- dev_info (adev -> dev , "%ld correctable hardware errors detected in %s block\n" ,
1041- ras_mgr -> err_data .ce_count , blk_name );
1041+ if (is_ue ) { /* uncorrectable: first the new counts from err_data, then the totals from ras_mgr */
1042+ for_each_ras_error (err_node , err_data ) {
1043+ err_info = & err_node -> err_info ;
1044+ mcm_info = & err_info -> mcm_info ;
1045+ if (err_info -> ue_count ) { /* only nodes with a non-zero new ue_count are reported */
1046+ dev_info (adev -> dev , "socket: %d, die: %d, "
1047+ "%lld new uncorrectable hardware errors detected in %s block\n" ,
1048+ mcm_info -> socket_id ,
1049+ mcm_info -> die_id ,
1050+ err_info -> ue_count ,
1051+ blk_name );
1052+ }
1053+ }
10421054
1043- for_each_ras_error (err_node , err_data ) {
1044- err_info = & err_node -> err_info ;
1045- mcm_info = & err_info -> mcm_info ;
1046- if (is_ue && err_info -> ue_count ) {
1047- dev_info (adev -> dev , "socket: %d, die: %d "
1048- "%lld uncorrectable hardware errors detected in %s block\n" ,
1049- mcm_info -> socket_id ,
1050- mcm_info -> die_id ,
1051- err_info -> ue_count ,
1052- blk_name );
1053- } else if (!is_ue && err_info -> ce_count ) {
1054- dev_info (adev -> dev , "socket: %d, die: %d "
1055- "%lld correctable hardware errors detected in %s block\n" ,
1056- mcm_info -> socket_id ,
1057- mcm_info -> die_id ,
1058- err_info -> ce_count ,
1059- blk_name );
1055+ for_each_ras_error (err_node , & ras_mgr -> err_data ) { /* totals are printed for every node, with no zero check */
1056+ err_info = & err_node -> err_info ;
1057+ mcm_info = & err_info -> mcm_info ;
1058+ dev_info (adev -> dev , "socket: %d, die: %d, "
1059+ "%lld uncorrectable hardware errors detected in total in %s block\n" ,
1060+ mcm_info -> socket_id , mcm_info -> die_id , err_info -> ue_count , blk_name );
1061+ }
1062+
1063+ } else { /* correctable: the same two passes, reading ce_count instead */
1064+ for_each_ras_error (err_node , err_data ) {
1065+ err_info = & err_node -> err_info ;
1066+ mcm_info = & err_info -> mcm_info ;
1067+ if (err_info -> ce_count ) { /* only nodes with a non-zero new ce_count are reported */
1068+ dev_info (adev -> dev , "socket: %d, die: %d, "
1069+ "%lld new correctable hardware errors detected in %s block, "
1070+ "no user action is needed\n" ,
1071+ mcm_info -> socket_id ,
1072+ mcm_info -> die_id ,
1073+ err_info -> ce_count ,
1074+ blk_name );
1075+ }
1076+ }
1077+
1078+ for_each_ras_error (err_node , & ras_mgr -> err_data ) { /* totals are printed for every node, with no zero check */
1079+ err_info = & err_node -> err_info ;
1080+ mcm_info = & err_info -> mcm_info ;
1081+ dev_info (adev -> dev , "socket: %d, die: %d, "
1082+ "%lld correctable hardware errors detected in total in %s block, "
1083+ "no user action is needed\n" ,
1084+ mcm_info -> socket_id , mcm_info -> die_id , err_info -> ce_count , blk_name );
10601085 }
10611086 }
10621087}
10631088
1089+ static inline bool err_data_has_source_info (struct ras_err_data * data ) /* true when data->err_node_list holds at least one entry */
1090+ {
1091+ return !list_empty (& data -> err_node_list );
1092+ }
1093+
10641094static void amdgpu_ras_error_generate_report (struct amdgpu_device * adev ,
10651095 struct ras_query_if * query_if ,
10661096 struct ras_err_data * err_data )
@@ -1069,9 +1099,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
10691099 const char * blk_name = get_ras_block_str (& query_if -> head );
10701100
10711101 if (err_data -> ce_count ) {
1072- if (!list_empty (& err_data -> err_node_list )) {
1073- amdgpu_ras_error_print_error_data (adev , query_if ,
1074- err_data , false);
1102+ if (err_data_has_source_info (err_data )) {
1103+ amdgpu_ras_error_print_error_data (adev , ras_mgr , err_data , blk_name , false);
10751104 } else if (!adev -> aid_mask &&
10761105 adev -> smuio .funcs &&
10771106 adev -> smuio .funcs -> get_socket_id &&
@@ -1094,9 +1123,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
10941123 }
10951124
10961125 if (err_data -> ue_count ) {
1097- if (!list_empty (& err_data -> err_node_list )) {
1098- amdgpu_ras_error_print_error_data (adev , query_if ,
1099- err_data , true);
1126+ if (err_data_has_source_info (err_data )) {
1127+ amdgpu_ras_error_print_error_data (adev , ras_mgr , err_data , blk_name , true);
11001128 } else if (!adev -> aid_mask &&
11011129 adev -> smuio .funcs &&
11021130 adev -> smuio .funcs -> get_socket_id &&
@@ -1118,6 +1146,25 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
11181146
11191147}
11201148
1149+ static void amdgpu_rasmgr_error_data_statistic_update (struct ras_manager * obj , struct ras_err_data * err_data )
1150+ { /* Accumulate the counts carried by err_data into obj->err_data. */
1151+ struct ras_err_node * err_node ;
1152+ struct ras_err_info * err_info ;
1153+
1154+ if (err_data_has_source_info (err_data )) { /* error nodes present: pass each node's mcm_info and counts to the statistic helpers */
1155+ for_each_ras_error (err_node , err_data ) {
1156+ err_info = & err_node -> err_info ;
1157+
1158+ amdgpu_ras_error_statistic_ce_count (& obj -> err_data , & err_info -> mcm_info , err_info -> ce_count );
1159+ amdgpu_ras_error_statistic_ue_count (& obj -> err_data , & err_info -> mcm_info , err_info -> ue_count ); /* NOTE(review): any result from the two statistic helpers is discarded - confirm that is intended */
1160+ }
1161+ } else {
1162+ /* for legacy asic path which doesn't have error source info */
1163+ obj -> err_data .ue_count += err_data -> ue_count ;
1164+ obj -> err_data .ce_count += err_data -> ce_count ;
1165+ }
1166+ }
1167+
11211168/* query/inject/cure begin */
11221169int amdgpu_ras_query_error_status (struct amdgpu_device * adev ,
11231170 struct ras_query_if * info )
@@ -1156,8 +1203,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
11561203 }
11571204 }
11581205
1159- obj -> err_data .ue_count += err_data .ue_count ;
1160- obj -> err_data .ce_count += err_data .ce_count ;
1206+ amdgpu_rasmgr_error_data_statistic_update (obj , & err_data );
11611207
11621208 info -> ue_count = obj -> err_data .ue_count ;
11631209 info -> ce_count = obj -> err_data .ce_count ;
0 commit comments