drm/amdgpu: correct the calculation of RAS bad page

After the introduction of NPS RAS, one bad page record on eeprom may be
related to 1 or 16 bad pages, so the bad page record and bad page are
two different concepts, define a new variable to store bad page number.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Tao Zhou
2024-11-29 16:52:41 +08:00
committed by Alex Deucher
parent 1f06e7f344
commit ae756cd853
4 changed files with 36 additions and 22 deletions

View File

@@ -2943,13 +2943,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
bad_page_num = control->ras_num_recs;
/* one record on eeprom stands for all pages in one memory row
* in this mode
*/
if (control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA)
bad_page_num = control->ras_num_recs * adev->umc.retire_unit;
bad_page_num = control->ras_num_bad_pages;
save_count = data->count - bad_page_num;
mutex_unlock(&con->recovery_lock);
@@ -3433,7 +3427,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
return ret;
amdgpu_dpm_send_hbm_bad_pages_num(
adev, control->ras_num_recs);
adev, control->ras_num_bad_pages);
if (con->update_channel_flag == true) {
amdgpu_dpm_send_hbm_bad_channel_flag(

View File

@@ -470,9 +470,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
res = __write_table_ras_info(control);
control->ras_num_recs = 0;
control->ras_num_bad_pages = 0;
control->ras_fri = 0;
amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages);
control->bad_channel_bitmap = 0;
amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
@@ -559,7 +560,7 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
if (amdgpu_bad_page_threshold == -1) {
dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
dev_warn(adev->dev,
"But GPU can be operated due to bad_page_threshold = -1.\n");
return false;
@@ -621,6 +622,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
const u32 num)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
struct amdgpu_device *adev = to_amdgpu_device(control);
u32 a, b, i;
u8 *buf, *pp;
int res;
@@ -723,6 +725,12 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
control->ras_num_recs = 1 + (control->ras_max_record_count + b
- control->ras_fri)
% control->ras_max_record_count;
if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
control->ras_num_bad_pages = control->ras_num_recs;
else
control->ras_num_bad_pages =
control->ras_num_recs * adev->umc.retire_unit;
Out:
kfree(buf);
return res;
@@ -740,10 +748,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
/* Modify the header if it exceeds.
*/
if (amdgpu_bad_page_threshold != 0 &&
control->ras_num_recs >= ras->bad_page_cnt_threshold) {
control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
dev_warn(adev->dev,
"Saved bad pages %d reaches threshold value %d\n",
control->ras_num_recs, ras->bad_page_cnt_threshold);
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
@@ -798,9 +806,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
*/
if (amdgpu_bad_page_threshold != 0 &&
control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
control->ras_num_recs < ras->bad_page_cnt_threshold)
control->ras_num_bad_pages < ras->bad_page_cnt_threshold)
control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
control->ras_num_recs) * 100) /
control->ras_num_bad_pages) * 100) /
ras->bad_page_cnt_threshold;
/* Recalc the checksum.
@@ -1402,9 +1410,15 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
if (!__get_eeprom_i2c_addr(adev, control))
return -EINVAL;
if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
control->ras_num_bad_pages = control->ras_num_recs;
else
control->ras_num_bad_pages =
control->ras_num_recs * adev->umc.retire_unit;
if (hdr->header == RAS_TABLE_HDR_VAL) {
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
control->ras_num_recs);
control->ras_num_bad_pages);
if (hdr->version == RAS_TABLE_VER_V2_1) {
res = __read_table_ras_info(control);
@@ -1419,9 +1433,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
/* Warn if we are at 90% of the threshold or above
*/
if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
control->ras_num_recs,
control->ras_num_bad_pages,
ras->bad_page_cnt_threshold);
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
amdgpu_bad_page_threshold != 0) {
@@ -1437,7 +1451,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
res);
return -EINVAL;
}
if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) {
/* This means that, the threshold was increased since
* the last time the system was booted, and now,
* ras->bad_page_cnt_threshold - control->num_recs > 0,
@@ -1447,13 +1461,13 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
dev_info(adev->dev,
"records:%d threshold:%d, resetting "
"RAS table header signature",
control->ras_num_recs,
control->ras_num_bad_pages,
ras->bad_page_cnt_threshold);
res = amdgpu_ras_eeprom_correct_header_tag(control,
RAS_TABLE_HDR_VAL);
} else {
dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
control->ras_num_recs, ras->bad_page_cnt_threshold);
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
if (amdgpu_bad_page_threshold == -1) {
dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
res = 0;
@@ -1462,7 +1476,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
dev_err(adev->dev,
"RAS records:%d exceed threshold:%d, "
"GPU will not be initialized. Replace this GPU or increase the threshold",
control->ras_num_recs, ras->bad_page_cnt_threshold);
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
}
}
} else {

View File

@@ -95,6 +95,11 @@ struct amdgpu_ras_eeprom_control {
*/
u32 ras_num_recs;
/* the bad page number is ras_num_recs or
* ras_num_recs * umc.retire_unit
*/
u32 ras_num_bad_pages;
/* First record index to read, 0-based.
* Range is [0, num_recs-1]. This is
* an absolute index, starting right after

View File

@@ -169,7 +169,8 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
err_data->err_addr_cnt, false);
amdgpu_ras_save_bad_pages(adev, &err_count);
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
amdgpu_dpm_send_hbm_bad_pages_num(adev,
con->eeprom_control.ras_num_bad_pages);
if (con->update_channel_flag == true) {
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);