mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-04-19 12:49:10 +08:00
drm/amdgpu: correct the calculation of RAS bad page
After the introduction of NPS RAS, one bad page record on the EEPROM may correspond to either 1 or 16 bad pages, so a bad page record and a bad page are two different concepts. Define a new variable to store the number of bad pages. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
@@ -2943,13 +2943,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
|
||||
mutex_lock(&con->recovery_lock);
|
||||
control = &con->eeprom_control;
|
||||
data = con->eh_data;
|
||||
bad_page_num = control->ras_num_recs;
|
||||
/* one record on eeprom stands for all pages in one memory row
|
||||
* in this mode
|
||||
*/
|
||||
if (control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA)
|
||||
bad_page_num = control->ras_num_recs * adev->umc.retire_unit;
|
||||
|
||||
bad_page_num = control->ras_num_bad_pages;
|
||||
save_count = data->count - bad_page_num;
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
|
||||
@@ -3433,7 +3427,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
|
||||
return ret;
|
||||
|
||||
amdgpu_dpm_send_hbm_bad_pages_num(
|
||||
adev, control->ras_num_recs);
|
||||
adev, control->ras_num_bad_pages);
|
||||
|
||||
if (con->update_channel_flag == true) {
|
||||
amdgpu_dpm_send_hbm_bad_channel_flag(
|
||||
|
||||
@@ -470,9 +470,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
|
||||
res = __write_table_ras_info(control);
|
||||
|
||||
control->ras_num_recs = 0;
|
||||
control->ras_num_bad_pages = 0;
|
||||
control->ras_fri = 0;
|
||||
|
||||
amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
|
||||
amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages);
|
||||
|
||||
control->bad_channel_bitmap = 0;
|
||||
amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
|
||||
@@ -559,7 +560,7 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
|
||||
if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
|
||||
if (amdgpu_bad_page_threshold == -1) {
|
||||
dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
|
||||
con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
|
||||
con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
|
||||
dev_warn(adev->dev,
|
||||
"But GPU can be operated due to bad_page_threshold = -1.\n");
|
||||
return false;
|
||||
@@ -621,6 +622,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
|
||||
const u32 num)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
|
||||
struct amdgpu_device *adev = to_amdgpu_device(control);
|
||||
u32 a, b, i;
|
||||
u8 *buf, *pp;
|
||||
int res;
|
||||
@@ -723,6 +725,12 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
|
||||
control->ras_num_recs = 1 + (control->ras_max_record_count + b
|
||||
- control->ras_fri)
|
||||
% control->ras_max_record_count;
|
||||
|
||||
if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
|
||||
control->ras_num_bad_pages = control->ras_num_recs;
|
||||
else
|
||||
control->ras_num_bad_pages =
|
||||
control->ras_num_recs * adev->umc.retire_unit;
|
||||
Out:
|
||||
kfree(buf);
|
||||
return res;
|
||||
@@ -740,10 +748,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
|
||||
/* Modify the header if it exceeds.
|
||||
*/
|
||||
if (amdgpu_bad_page_threshold != 0 &&
|
||||
control->ras_num_recs >= ras->bad_page_cnt_threshold) {
|
||||
control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
|
||||
dev_warn(adev->dev,
|
||||
"Saved bad pages %d reaches threshold value %d\n",
|
||||
control->ras_num_recs, ras->bad_page_cnt_threshold);
|
||||
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
|
||||
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
|
||||
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
|
||||
control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
|
||||
@@ -798,9 +806,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
|
||||
*/
|
||||
if (amdgpu_bad_page_threshold != 0 &&
|
||||
control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
|
||||
control->ras_num_recs < ras->bad_page_cnt_threshold)
|
||||
control->ras_num_bad_pages < ras->bad_page_cnt_threshold)
|
||||
control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
|
||||
control->ras_num_recs) * 100) /
|
||||
control->ras_num_bad_pages) * 100) /
|
||||
ras->bad_page_cnt_threshold;
|
||||
|
||||
/* Recalc the checksum.
|
||||
@@ -1402,9 +1410,15 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
||||
if (!__get_eeprom_i2c_addr(adev, control))
|
||||
return -EINVAL;
|
||||
|
||||
if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
|
||||
control->ras_num_bad_pages = control->ras_num_recs;
|
||||
else
|
||||
control->ras_num_bad_pages =
|
||||
control->ras_num_recs * adev->umc.retire_unit;
|
||||
|
||||
if (hdr->header == RAS_TABLE_HDR_VAL) {
|
||||
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
|
||||
control->ras_num_recs);
|
||||
control->ras_num_bad_pages);
|
||||
|
||||
if (hdr->version == RAS_TABLE_VER_V2_1) {
|
||||
res = __read_table_ras_info(control);
|
||||
@@ -1419,9 +1433,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
||||
|
||||
/* Warn if we are at 90% of the threshold or above
|
||||
*/
|
||||
if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
|
||||
if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
|
||||
dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
|
||||
control->ras_num_recs,
|
||||
control->ras_num_bad_pages,
|
||||
ras->bad_page_cnt_threshold);
|
||||
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
|
||||
amdgpu_bad_page_threshold != 0) {
|
||||
@@ -1437,7 +1451,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
||||
res);
|
||||
return -EINVAL;
|
||||
}
|
||||
if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
|
||||
if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) {
|
||||
/* This means that, the threshold was increased since
|
||||
* the last time the system was booted, and now,
|
||||
* ras->bad_page_cnt_threshold - control->num_recs > 0,
|
||||
@@ -1447,13 +1461,13 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
||||
dev_info(adev->dev,
|
||||
"records:%d threshold:%d, resetting "
|
||||
"RAS table header signature",
|
||||
control->ras_num_recs,
|
||||
control->ras_num_bad_pages,
|
||||
ras->bad_page_cnt_threshold);
|
||||
res = amdgpu_ras_eeprom_correct_header_tag(control,
|
||||
RAS_TABLE_HDR_VAL);
|
||||
} else {
|
||||
dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
|
||||
control->ras_num_recs, ras->bad_page_cnt_threshold);
|
||||
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
|
||||
if (amdgpu_bad_page_threshold == -1) {
|
||||
dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
|
||||
res = 0;
|
||||
@@ -1462,7 +1476,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
||||
dev_err(adev->dev,
|
||||
"RAS records:%d exceed threshold:%d, "
|
||||
"GPU will not be initialized. Replace this GPU or increase the threshold",
|
||||
control->ras_num_recs, ras->bad_page_cnt_threshold);
|
||||
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -95,6 +95,11 @@ struct amdgpu_ras_eeprom_control {
|
||||
*/
|
||||
u32 ras_num_recs;
|
||||
|
||||
/* the bad page number is ras_num_recs or
|
||||
* ras_num_recs * umc.retire_unit
|
||||
*/
|
||||
u32 ras_num_bad_pages;
|
||||
|
||||
/* First record index to read, 0-based.
|
||||
* Range is [0, num_recs-1]. This is
|
||||
* an absolute index, starting right after
|
||||
|
||||
@@ -169,7 +169,8 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
||||
err_data->err_addr_cnt, false);
|
||||
amdgpu_ras_save_bad_pages(adev, &err_count);
|
||||
|
||||
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
|
||||
amdgpu_dpm_send_hbm_bad_pages_num(adev,
|
||||
con->eeprom_control.ras_num_bad_pages);
|
||||
|
||||
if (con->update_channel_flag == true) {
|
||||
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
|
||||
|
||||
Reference in New Issue
Block a user