mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-04 20:19:47 +08:00 
			
		
		
		
	powerpc/powernv: Invoke opal_cec_reboot2() on unrecoverable machine check errors.
On non-recoverable MCE errors in kernel space, the Linux kernel panics and the system reboots. On BMC-based systems, opal-prd runs as a daemon in the host. Hence, a kernel crash may prevent opal-prd from detecting and analyzing the MCE error. This may land us in a situation where the faulty memory never gets de-configured and Linux keeps hitting the same MCE error again and again. If this happens at an early stage of kernel initialization, Linux will keep crashing and rebooting in a loop. This patch fixes the issue by invoking the new opal_cec_reboot2() call with reboot type OPAL_REBOOT_PLATFORM_ERROR to inform the BMC/OCC about the error, so that the BMC can collect relevant data for error analysis and decide which component to de-configure before rebooting. This patch depends on the OPAL patchset posted on the skiboot mailing list at https://lists.ozlabs.org/pipermail/skiboot/2015-July/001771.html that introduces the opal_cec_reboot2() OPAL call. Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
		
							parent
							
								
									1852ae276b
								
							
						
					
					
						commit
						e784b6499d
					
				| @ -154,7 +154,8 @@ | |||||||
| #define OPAL_FLASH_WRITE			111 | #define OPAL_FLASH_WRITE			111 | ||||||
| #define OPAL_FLASH_ERASE			112 | #define OPAL_FLASH_ERASE			112 | ||||||
| #define OPAL_PRD_MSG				113 | #define OPAL_PRD_MSG				113 | ||||||
| #define OPAL_LAST				113 | #define OPAL_CEC_REBOOT2			116 | ||||||
|  | #define OPAL_LAST				116 | ||||||
| 
 | 
 | ||||||
| /* Device tree flags */ | /* Device tree flags */ | ||||||
| 
 | 
 | ||||||
| @ -857,6 +858,12 @@ enum OpalSysCooling { | |||||||
| 	OPAL_SYSCOOL_INSF	= 0x0001, /* System insufficient cooling */ | 	OPAL_SYSCOOL_INSF	= 0x0001, /* System insufficient cooling */ | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | /* Argument to OPAL_CEC_REBOOT2() */ | ||||||
|  | enum { | ||||||
|  | 	OPAL_REBOOT_NORMAL		= 0, | ||||||
|  | 	OPAL_REBOOT_PLATFORM_ERROR	= 1, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| #endif /* __ASSEMBLY__ */ | #endif /* __ASSEMBLY__ */ | ||||||
| 
 | 
 | ||||||
| #endif /* __OPAL_API_H */ | #endif /* __OPAL_API_H */ | ||||||
|  | |||||||
| @ -44,6 +44,7 @@ int64_t opal_tpo_write(uint64_t token, uint32_t year_mon_day, | |||||||
| 		       uint32_t hour_min); | 		       uint32_t hour_min); | ||||||
| int64_t opal_cec_power_down(uint64_t request); | int64_t opal_cec_power_down(uint64_t request); | ||||||
| int64_t opal_cec_reboot(void); | int64_t opal_cec_reboot(void); | ||||||
|  | int64_t opal_cec_reboot2(uint32_t reboot_type, char *diag); | ||||||
| int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset); | int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset); | ||||||
| int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset); | int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset); | ||||||
| int64_t opal_handle_interrupt(uint64_t isn, __be64 *outstanding_event_mask); | int64_t opal_handle_interrupt(uint64_t isn, __be64 *outstanding_event_mask); | ||||||
|  | |||||||
| @ -202,6 +202,7 @@ OPAL_CALL(opal_rtc_read,			OPAL_RTC_READ); | |||||||
| OPAL_CALL(opal_rtc_write,			OPAL_RTC_WRITE);
 | OPAL_CALL(opal_rtc_write,			OPAL_RTC_WRITE);
 | ||||||
| OPAL_CALL(opal_cec_power_down,			OPAL_CEC_POWER_DOWN);
 | OPAL_CALL(opal_cec_power_down,			OPAL_CEC_POWER_DOWN);
 | ||||||
| OPAL_CALL(opal_cec_reboot,			OPAL_CEC_REBOOT);
 | OPAL_CALL(opal_cec_reboot,			OPAL_CEC_REBOOT);
 | ||||||
|  | OPAL_CALL(opal_cec_reboot2,			OPAL_CEC_REBOOT2);
 | ||||||
| OPAL_CALL(opal_read_nvram,			OPAL_READ_NVRAM);
 | OPAL_CALL(opal_read_nvram,			OPAL_READ_NVRAM);
 | ||||||
| OPAL_CALL(opal_write_nvram,			OPAL_WRITE_NVRAM);
 | OPAL_CALL(opal_write_nvram,			OPAL_WRITE_NVRAM);
 | ||||||
| OPAL_CALL(opal_handle_interrupt,		OPAL_HANDLE_INTERRUPT);
 | OPAL_CALL(opal_handle_interrupt,		OPAL_HANDLE_INTERRUPT);
 | ||||||
|  | |||||||
| @ -441,6 +441,7 @@ static int opal_recover_mce(struct pt_regs *regs, | |||||||
| int opal_machine_check(struct pt_regs *regs) | int opal_machine_check(struct pt_regs *regs) | ||||||
| { | { | ||||||
| 	struct machine_check_event evt; | 	struct machine_check_event evt; | ||||||
|  | 	int ret; | ||||||
| 
 | 
 | ||||||
| 	if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) | 	if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) | ||||||
| 		return 0; | 		return 0; | ||||||
| @ -455,6 +456,40 @@ int opal_machine_check(struct pt_regs *regs) | |||||||
| 
 | 
 | ||||||
| 	if (opal_recover_mce(regs, &evt)) | 	if (opal_recover_mce(regs, &evt)) | ||||||
| 		return 1; | 		return 1; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Unrecovered machine check, we are heading to panic path. | ||||||
|  | 	 * | ||||||
|  | 	 * We may have hit this MCE at a very early stage of kernel | ||||||
|  | 	 * initialization even before opal-prd has started running. If | ||||||
|  | 	 * this is the case then this MCE error may go un-noticed or | ||||||
|  | 	 * un-analyzed if we go down panic path. We need to inform | ||||||
|  | 	 * BMC/OCC about this error so that they can collect relevant | ||||||
|  | 	 * data for error analysis before rebooting. | ||||||
|  | 	 * Use opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR) to do so. | ||||||
|  | 	 * This function may not return on BMC based system. | ||||||
|  | 	 */ | ||||||
|  | 	ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, | ||||||
|  | 			"Unrecoverable Machine Check exception"); | ||||||
|  | 	if (ret == OPAL_UNSUPPORTED) { | ||||||
|  | 		pr_emerg("Reboot type %d not supported\n", | ||||||
|  | 					OPAL_REBOOT_PLATFORM_ERROR); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * We reached here. There can be three possibilities: | ||||||
|  | 	 * 1. We are running on a firmware level that does not support | ||||||
|  | 	 *    opal_cec_reboot2() | ||||||
|  | 	 * 2. We are running on a firmware level that does not support | ||||||
|  | 	 *    OPAL_REBOOT_PLATFORM_ERROR reboot type. | ||||||
|  | 	 * 3. We are running on FSP based system that does not need opal | ||||||
|  | 	 *    to trigger checkstop explicitly for error analysis. The FSP | ||||||
|  | 	 *    PRD component would have already got notified about this | ||||||
|  | 	 *    error through other channels. | ||||||
|  | 	 * | ||||||
|  | 	 * In any case, let us just fall through. We are anyway heading | ||||||
|  | 	 * down to panic path. | ||||||
|  | 	 */ | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Mahesh Salgaonkar
						Mahesh Salgaonkar