Skip to content

Commit ec3e0a9

Browse files
Yang Wang authored and alexdeucher committed
drm/amdgpu: refine ras error kernel log print
Refine the RAS error kernel log to avoid ambiguity for users.

Signed-off-by: Yang Wang <[email protected]>
Reviewed-by: Hawking Zhang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent 53d4d77 commit ec3e0a9

File tree

2 files changed

+82
-39
lines changed

2 files changed

+82
-39
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 81 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -635,8 +635,11 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
635635

636636
static inline void put_obj(struct ras_manager *obj)
637637
{
638-
if (obj && (--obj->use == 0))
638+
if (obj && (--obj->use == 0)) {
639639
list_del(&obj->node);
640+
amdgpu_ras_error_data_fini(&obj->err_data);
641+
}
642+
640643
if (obj && (obj->use < 0))
641644
DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
642645
}
@@ -666,6 +669,9 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
666669
if (alive_obj(obj))
667670
return NULL;
668671

672+
if (amdgpu_ras_error_data_init(&obj->err_data))
673+
return NULL;
674+
669675
obj->head = *head;
670676
obj->adev = adev;
671677
list_add(&obj->node, &con->head);
@@ -1023,44 +1029,68 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
10231029
}
10241030

10251031
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
1026-
struct ras_query_if *query_if,
1032+
struct ras_manager *ras_mgr,
10271033
struct ras_err_data *err_data,
1034+
const char *blk_name,
10281035
bool is_ue)
10291036
{
1030-
struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
1031-
const char *blk_name = get_ras_block_str(&query_if->head);
10321037
struct amdgpu_smuio_mcm_config_info *mcm_info;
10331038
struct ras_err_node *err_node;
10341039
struct ras_err_info *err_info;
10351040

1036-
if (is_ue)
1037-
dev_info(adev->dev, "%ld uncorrectable hardware errors detected in %s block\n",
1038-
ras_mgr->err_data.ue_count, blk_name);
1039-
else
1040-
dev_info(adev->dev, "%ld correctable hardware errors detected in %s block\n",
1041-
ras_mgr->err_data.ce_count, blk_name);
1041+
if (is_ue) {
1042+
for_each_ras_error(err_node, err_data) {
1043+
err_info = &err_node->err_info;
1044+
mcm_info = &err_info->mcm_info;
1045+
if (err_info->ue_count) {
1046+
dev_info(adev->dev, "socket: %d, die: %d, "
1047+
"%lld new uncorrectable hardware errors detected in %s block\n",
1048+
mcm_info->socket_id,
1049+
mcm_info->die_id,
1050+
err_info->ue_count,
1051+
blk_name);
1052+
}
1053+
}
10421054

1043-
for_each_ras_error(err_node, err_data) {
1044-
err_info = &err_node->err_info;
1045-
mcm_info = &err_info->mcm_info;
1046-
if (is_ue && err_info->ue_count) {
1047-
dev_info(adev->dev, "socket: %d, die: %d "
1048-
"%lld uncorrectable hardware errors detected in %s block\n",
1049-
mcm_info->socket_id,
1050-
mcm_info->die_id,
1051-
err_info->ue_count,
1052-
blk_name);
1053-
} else if (!is_ue && err_info->ce_count) {
1054-
dev_info(adev->dev, "socket: %d, die: %d "
1055-
"%lld correctable hardware errors detected in %s block\n",
1056-
mcm_info->socket_id,
1057-
mcm_info->die_id,
1058-
err_info->ce_count,
1059-
blk_name);
1055+
for_each_ras_error(err_node, &ras_mgr->err_data) {
1056+
err_info = &err_node->err_info;
1057+
mcm_info = &err_info->mcm_info;
1058+
dev_info(adev->dev, "socket: %d, die: %d, "
1059+
"%lld uncorrectable hardware errors detected in total in %s block\n",
1060+
mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
1061+
}
1062+
1063+
} else {
1064+
for_each_ras_error(err_node, err_data) {
1065+
err_info = &err_node->err_info;
1066+
mcm_info = &err_info->mcm_info;
1067+
if (err_info->ce_count) {
1068+
dev_info(adev->dev, "socket: %d, die: %d, "
1069+
"%lld new correctable hardware errors detected in %s block, "
1070+
"no user action is needed\n",
1071+
mcm_info->socket_id,
1072+
mcm_info->die_id,
1073+
err_info->ce_count,
1074+
blk_name);
1075+
}
1076+
}
1077+
1078+
for_each_ras_error(err_node, &ras_mgr->err_data) {
1079+
err_info = &err_node->err_info;
1080+
mcm_info = &err_info->mcm_info;
1081+
dev_info(adev->dev, "socket: %d, die: %d, "
1082+
"%lld correctable hardware errors detected in total in %s block, "
1083+
"no user action is needed\n",
1084+
mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name);
10601085
}
10611086
}
10621087
}
10631088

1089+
static inline bool err_data_has_source_info(struct ras_err_data *data)
1090+
{
1091+
return !list_empty(&data->err_node_list);
1092+
}
1093+
10641094
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
10651095
struct ras_query_if *query_if,
10661096
struct ras_err_data *err_data)
@@ -1069,9 +1099,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
10691099
const char *blk_name = get_ras_block_str(&query_if->head);
10701100

10711101
if (err_data->ce_count) {
1072-
if (!list_empty(&err_data->err_node_list)) {
1073-
amdgpu_ras_error_print_error_data(adev, query_if,
1074-
err_data, false);
1102+
if (err_data_has_source_info(err_data)) {
1103+
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, false);
10751104
} else if (!adev->aid_mask &&
10761105
adev->smuio.funcs &&
10771106
adev->smuio.funcs->get_socket_id &&
@@ -1094,9 +1123,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
10941123
}
10951124

10961125
if (err_data->ue_count) {
1097-
if (!list_empty(&err_data->err_node_list)) {
1098-
amdgpu_ras_error_print_error_data(adev, query_if,
1099-
err_data, true);
1126+
if (err_data_has_source_info(err_data)) {
1127+
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, true);
11001128
} else if (!adev->aid_mask &&
11011129
adev->smuio.funcs &&
11021130
adev->smuio.funcs->get_socket_id &&
@@ -1118,6 +1146,25 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
11181146

11191147
}
11201148

1149+
static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
1150+
{
1151+
struct ras_err_node *err_node;
1152+
struct ras_err_info *err_info;
1153+
1154+
if (err_data_has_source_info(err_data)) {
1155+
for_each_ras_error(err_node, err_data) {
1156+
err_info = &err_node->err_info;
1157+
1158+
amdgpu_ras_error_statistic_ce_count(&obj->err_data, &err_info->mcm_info, err_info->ce_count);
1159+
amdgpu_ras_error_statistic_ue_count(&obj->err_data, &err_info->mcm_info, err_info->ue_count);
1160+
}
1161+
} else {
1162+
/* for the legacy ASIC path, which doesn't have error source info */
1163+
obj->err_data.ue_count += err_data->ue_count;
1164+
obj->err_data.ce_count += err_data->ce_count;
1165+
}
1166+
}
1167+
11211168
/* query/inject/cure begin */
11221169
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
11231170
struct ras_query_if *info)
@@ -1156,8 +1203,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
11561203
}
11571204
}
11581205

1159-
obj->err_data.ue_count += err_data.ue_count;
1160-
obj->err_data.ce_count += err_data.ce_count;
1206+
amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
11611207

11621208
info->ue_count = obj->err_data.ue_count;
11631209
info->ce_count = obj->err_data.ce_count;

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -515,10 +515,7 @@ struct ras_manager {
515515
/* IH data */
516516
struct ras_ih_data ih_data;
517517

518-
struct {
519-
unsigned long ue_count;
520-
unsigned long ce_count;
521-
} err_data;
518+
struct ras_err_data err_data;
522519
};
523520

524521
struct ras_badpage {

0 commit comments

Comments
 (0)