powerpc/pseries: Display machine check error details.
author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Tue, 11 Sep 2018 14:27:07 +0000 (19:57 +0530)
committer: Michael Ellerman <mpe@ellerman.id.au>
Wed, 19 Sep 2018 11:59:41 +0000 (21:59 +1000)
Extract the MCE error details from the RTAS extended error log and display
them on the console.

With this patch you should now see mce logs like below:

[  142.371818] Severe Machine check interrupt [Recovered]
[  142.371822]   NIP [d00000000ca301b8]: init_module+0x1b8/0x338 [bork_kernel]
[  142.371822]   Initiator: CPU
[  142.371823]   Error type: SLB [Multihit]
[  142.371824]     Effective address: d00000000ca70000

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
arch/powerpc/include/asm/rtas.h
arch/powerpc/platforms/pseries/ras.c

index adefa6493d2923954d4d7953aff53c41d7c4899a..0183e9595acc2d0636ef419353dbccd9082f6264 100644 (file)
@@ -197,6 +197,11 @@ static inline uint8_t rtas_error_extended(const struct rtas_error_log *elog)
        return (elog->byte1 & 0x04) >> 2;
 }
 
+/*
+ * Return the initiator field of an RTAS error log, which is held in the
+ * upper four bits of byte2.
+ */
+static inline uint8_t rtas_error_initiator(const struct rtas_error_log *elog)
+{
+       return (elog->byte2 >> 4) & 0x0f;
+}
+
 #define rtas_error_type(x)     ((x)->byte3)
 
 static inline
index 0578c243ef0180906bc8dfbc42fdf07ab3d78617..49e83c954d2c43b15e27915e6ae1c63bb10f2561 100644 (file)
@@ -523,6 +523,136 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
        return 0; /* need to perform reset */
 }
 
+/*
+ * Look up @val in the string table @ar; yields "Unknown" when @val is past
+ * the end of the table. @ar must be a true array (ARRAY_SIZE is applied to
+ * it) and @val is evaluated twice, so it must be free of side effects.
+ */
+#define VAL_TO_STRING(ar, val) \
+       (((val) < ARRAY_SIZE(ar)) ? (ar)[(val)] : "Unknown")
+
+/*
+ * Print a human-readable summary of a machine check to the kernel log:
+ * severity and disposition, the interrupted NIP, the initiator, the error
+ * type with its sub-type, and the effective address when one is reported.
+ *
+ * @regs: interrupted context; supplies the NIP and the user/kernel mode test
+ * @errp: RTAS error log describing the event
+ *
+ * Only a "Missing extended error log" message is printed when @errp carries
+ * no extended log, and nothing is printed when the extended log has no MCE
+ * section.
+ */
+static void pseries_print_mce_info(struct pt_regs *regs,
+                                  struct rtas_error_log *errp)
+{
+       const char *level, *sevstr;
+       struct pseries_errorlog *pseries_log;
+       struct pseries_mc_errorlog *mce_log;
+       u8 error_type, err_sub_type;
+       u64 addr;
+       u8 initiator = rtas_error_initiator(errp);
+       int disposition = rtas_error_disposition(errp);
+
+       /* Indexed by the value returned from rtas_error_initiator(). */
+       static const char * const initiators[] = {
+               "Unknown",
+               "CPU",
+               "PCI",
+               "ISA",
+               "Memory",
+               "Power Mgmt",
+       };
+
+       /*
+        * Indexed by the raw error_type value, so every gap in the encoding
+        * needs its own placeholder or the later names shift down a slot.
+        * NOTE(review): assumes MC_ERROR_TYPE_TLB is 0x04, D-Cache 0x05 and
+        * I-Cache 0x07 (0x03 and 0x06 unassigned) - confirm against the
+        * MC_ERROR_TYPE_* definitions.
+        */
+       static const char * const mc_err_types[] = {
+               "UE",
+               "SLB",
+               "ERAT",
+               "Unknown",
+               "TLB",
+               "D-Cache",
+               "Unknown",
+               "I-Cache",
+       };
+       static const char * const mc_ue_types[] = {
+               "Indeterminate",
+               "Instruction fetch",
+               "Page table walk ifetch",
+               "Load/Store",
+               "Page table walk Load/Store",
+       };
+
+       /* SLB sub errors valid values are 0x0, 0x1, 0x2 */
+       static const char * const mc_slb_types[] = {
+               "Parity",
+               "Multihit",
+               "Indeterminate",
+       };
+
+       /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
+       static const char * const mc_soft_types[] = {
+               "Unknown",
+               "Parity",
+               "Multihit",
+               "Indeterminate",
+       };
+
+       if (!rtas_error_extended(errp)) {
+               pr_err("Machine check interrupt: Missing extended error log\n");
+               return;
+       }
+
+       pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+       if (pseries_log == NULL)
+               return;
+
+       mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+
+       error_type = mce_log->error_type;
+       err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+       /* Map the RTAS severity to a printk level and a leading label. */
+       switch (rtas_error_severity(errp)) {
+       case RTAS_SEVERITY_NO_ERROR:
+               level = KERN_INFO;
+               sevstr = "Harmless";
+               break;
+       case RTAS_SEVERITY_WARNING:
+               level = KERN_WARNING;
+               sevstr = "";
+               break;
+       case RTAS_SEVERITY_ERROR:
+       case RTAS_SEVERITY_ERROR_SYNC:
+               level = KERN_ERR;
+               sevstr = "Severe";
+               break;
+       case RTAS_SEVERITY_FATAL:
+       default:
+               /* Anything unrecognised is reported as fatal. */
+               level = KERN_ERR;
+               sevstr = "Fatal";
+               break;
+       }
+
+       /* The log level is passed as the first "%s" of every message. */
+       printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+              disposition == RTAS_DISP_FULLY_RECOVERED ?
+              "Recovered" : "Not recovered");
+       if (user_mode(regs)) {
+               /* User context: identify the task instead of a symbol. */
+               printk("%s  NIP: [%016lx] PID: %d Comm: %s\n", level,
+                      regs->nip, current->pid, current->comm);
+       } else {
+               printk("%s  NIP [%016lx]: %pS\n", level, regs->nip,
+                      (void *)regs->nip);
+       }
+       printk("%s  Initiator: %s\n", level,
+              VAL_TO_STRING(initiators, initiator));
+
+       /* Each error type decodes its sub-type through its own table. */
+       switch (error_type) {
+       case MC_ERROR_TYPE_UE:
+               printk("%s  Error type: %s [%s]\n", level,
+                      VAL_TO_STRING(mc_err_types, error_type),
+                      VAL_TO_STRING(mc_ue_types, err_sub_type));
+               break;
+       case MC_ERROR_TYPE_SLB:
+               printk("%s  Error type: %s [%s]\n", level,
+                      VAL_TO_STRING(mc_err_types, error_type),
+                      VAL_TO_STRING(mc_slb_types, err_sub_type));
+               break;
+       case MC_ERROR_TYPE_ERAT:
+       case MC_ERROR_TYPE_TLB:
+               printk("%s  Error type: %s [%s]\n", level,
+                      VAL_TO_STRING(mc_err_types, error_type),
+                      VAL_TO_STRING(mc_soft_types, err_sub_type));
+               break;
+       default:
+               /* No sub-type table for the remaining types. */
+               printk("%s  Error type: %s\n", level,
+                      VAL_TO_STRING(mc_err_types, error_type));
+               break;
+       }
+
+       /* A zero address is treated as "no effective address reported". */
+       addr = rtas_mc_get_effective_addr(mce_log);
+       if (addr)
+               printk("%s    Effective address: %016llx\n", level, addr);
+}
+
 static int mce_handle_error(struct rtas_error_log *errp)
 {
        struct pseries_errorlog *pseries_log;
@@ -585,8 +715,11 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
        int recovered = 0;
        int disposition = rtas_error_disposition(err);
 
+       pseries_print_mce_info(regs, err);
+
        if (!(regs->msr & MSR_RI)) {
                /* If MSR_RI isn't set, we cannot recover */
+               pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
                recovered = 0;
 
        } else if (disposition == RTAS_DISP_FULLY_RECOVERED) {