powerpc/powernv/mce: Reduce MCE console logs to fewer lines.
author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Mon, 29 Apr 2019 18:15:48 +0000 (23:45 +0530)
committer: Michael Ellerman <mpe@ellerman.id.au>
Wed, 1 May 2019 12:22:24 +0000 (22:22 +1000)
Also add the CPU number when displaying the MCE log. This keeps the
logs readable when MCEs hit multiple CPUs simultaneously.

Before the changes the MCE output was:

  Severe Machine check interrupt [Recovered]
    NIP [d00000000ba80280]: insert_slb_entry.constprop.0+0x278/0x2c0 [mcetest_slb]
    Initiator: CPU
    Error type: SLB [Multihit]
      Effective address: d00000000ba80280

After this patch series, the MCE output will be:

  MCE: CPU80: machine check (Warning) Host SLB Multihit [Recovered]
  MCE: CPU80: NIP: [d00000000b550280] insert_slb_entry.constprop.0+0x278/0x2c0 [mcetest_slb]
  MCE: CPU80: Probable software error (some chance of hardware cause)

UE in host application:

  MCE: CPU48: machine check (Severe) Host UE Load/Store DAR: 00007fffc6079a80 paddr: 0000000f8e260000 [Not recovered]
  MCE: CPU48: PID: 4584 Comm: find NIP: [0000000010023368]
  MCE: CPU48: Hardware error

and for MCE in Guest:

  MCE: CPU80: machine check (Warning) Guest SLB Multihit DAR: 000001001b6e0320 [Recovered]
  MCE: CPU80: PID: 24765 Comm: qemu-system-ppc Guest NIP: [00007fffa309dc60]
  MCE: CPU80: Probable software error (some chance of hardware cause)

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
arch/powerpc/include/asm/mce.h
arch/powerpc/kernel/mce.c

index ad47fa8653240f0a566ed94af8e8d690ff49f05d..c888ef9a3eafbf9d58412f41e189723f39ca4c80 100644 (file)
@@ -116,7 +116,7 @@ struct machine_check_event {
        enum MCE_Initiator      initiator:8;    /* 0x03 */
        enum MCE_ErrorType      error_type:8;   /* 0x04 */
        enum MCE_Disposition    disposition:8;  /* 0x05 */
-       uint8_t                 reserved_1[2];  /* 0x06 */
+       uint16_t                cpu;            /* 0x06 */
        uint64_t                gpr3;           /* 0x08 */
        uint64_t                srr0;           /* 0x10 */
        uint64_t                srr1;           /* 0x18 */
index b5fec1f9751a13a04d2b280c6e84c3eaba8dbd47..25a8b20cbbdc37da24c9379d895fc17f9417273c 100644 (file)
@@ -112,6 +112,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
        mce->srr1 = regs->msr;
        mce->gpr3 = regs->gpr[3];
        mce->in_use = 1;
+       mce->cpu = get_paca()->paca_index;
 
        /* Mark it recovered if we have handled it and MSR(RI=1). */
        if (handled && (regs->msr & MSR_RI))
@@ -310,7 +311,11 @@ static void machine_check_process_queued_event(struct irq_work *work)
 void machine_check_print_event_info(struct machine_check_event *evt,
                                    bool user_mode, bool in_guest)
 {
-       const char *level, *sevstr, *subtype;
+       const char *level, *sevstr, *subtype, *err_type;
+       uint64_t ea = 0, pa = 0;
+       int n = 0;
+       char dar_str[50];
+       char pa_str[50];
        static const char *mc_ue_types[] = {
                "Indeterminate",
                "Instruction fetch",
@@ -384,101 +389,103 @@ void machine_check_print_event_info(struct machine_check_event *evt,
                break;
        }
 
-       printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
-              evt->disposition == MCE_DISPOSITION_RECOVERED ?
-              "Recovered" : "Not recovered");
-
-       if (in_guest) {
-               printk("%s  Guest NIP: %016llx\n", level, evt->srr0);
-       } else if (user_mode) {
-               printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
-                       evt->srr0, current->pid, current->comm);
-       } else {
-               printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0,
-                      (void *)evt->srr0);
-       }
-
-       printk("%s  Initiator: %s\n", level,
-              evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
        switch (evt->error_type) {
        case MCE_ERROR_TYPE_UE:
+               err_type = "UE";
                subtype = evt->u.ue_error.ue_error_type <
                        ARRAY_SIZE(mc_ue_types) ?
                        mc_ue_types[evt->u.ue_error.ue_error_type]
                        : "Unknown";
-               printk("%s  Error type: UE [%s]\n", level, subtype);
                if (evt->u.ue_error.effective_address_provided)
-                       printk("%s    Effective address: %016llx\n",
-                              level, evt->u.ue_error.effective_address);
+                       ea = evt->u.ue_error.effective_address;
                if (evt->u.ue_error.physical_address_provided)
-                       printk("%s    Physical address:  %016llx\n",
-                              level, evt->u.ue_error.physical_address);
+                       pa = evt->u.ue_error.physical_address;
                break;
        case MCE_ERROR_TYPE_SLB:
+               err_type = "SLB";
                subtype = evt->u.slb_error.slb_error_type <
                        ARRAY_SIZE(mc_slb_types) ?
                        mc_slb_types[evt->u.slb_error.slb_error_type]
                        : "Unknown";
-               printk("%s  Error type: SLB [%s]\n", level, subtype);
                if (evt->u.slb_error.effective_address_provided)
-                       printk("%s    Effective address: %016llx\n",
-                              level, evt->u.slb_error.effective_address);
+                       ea = evt->u.slb_error.effective_address;
                break;
        case MCE_ERROR_TYPE_ERAT:
+               err_type = "ERAT";
                subtype = evt->u.erat_error.erat_error_type <
                        ARRAY_SIZE(mc_erat_types) ?
                        mc_erat_types[evt->u.erat_error.erat_error_type]
                        : "Unknown";
-               printk("%s  Error type: ERAT [%s]\n", level, subtype);
                if (evt->u.erat_error.effective_address_provided)
-                       printk("%s    Effective address: %016llx\n",
-                              level, evt->u.erat_error.effective_address);
+                       ea = evt->u.erat_error.effective_address;
                break;
        case MCE_ERROR_TYPE_TLB:
+               err_type = "TLB";
                subtype = evt->u.tlb_error.tlb_error_type <
                        ARRAY_SIZE(mc_tlb_types) ?
                        mc_tlb_types[evt->u.tlb_error.tlb_error_type]
                        : "Unknown";
-               printk("%s  Error type: TLB [%s]\n", level, subtype);
                if (evt->u.tlb_error.effective_address_provided)
-                       printk("%s    Effective address: %016llx\n",
-                              level, evt->u.tlb_error.effective_address);
+                       ea = evt->u.tlb_error.effective_address;
                break;
        case MCE_ERROR_TYPE_USER:
+               err_type = "User";
                subtype = evt->u.user_error.user_error_type <
                        ARRAY_SIZE(mc_user_types) ?
                        mc_user_types[evt->u.user_error.user_error_type]
                        : "Unknown";
-               printk("%s  Error type: User [%s]\n", level, subtype);
                if (evt->u.user_error.effective_address_provided)
-                       printk("%s    Effective address: %016llx\n",
-                              level, evt->u.user_error.effective_address);
+                       ea = evt->u.user_error.effective_address;
                break;
        case MCE_ERROR_TYPE_RA:
+               err_type = "Real address";
                subtype = evt->u.ra_error.ra_error_type <
                        ARRAY_SIZE(mc_ra_types) ?
                        mc_ra_types[evt->u.ra_error.ra_error_type]
                        : "Unknown";
-               printk("%s  Error type: Real address [%s]\n", level, subtype);
                if (evt->u.ra_error.effective_address_provided)
-                       printk("%s    Effective address: %016llx\n",
-                              level, evt->u.ra_error.effective_address);
+                       ea = evt->u.ra_error.effective_address;
                break;
        case MCE_ERROR_TYPE_LINK:
+               err_type = "Link";
                subtype = evt->u.link_error.link_error_type <
                        ARRAY_SIZE(mc_link_types) ?
                        mc_link_types[evt->u.link_error.link_error_type]
                        : "Unknown";
-               printk("%s  Error type: Link [%s]\n", level, subtype);
                if (evt->u.link_error.effective_address_provided)
-                       printk("%s    Effective address: %016llx\n",
-                              level, evt->u.link_error.effective_address);
+                       ea = evt->u.link_error.effective_address;
                break;
        default:
        case MCE_ERROR_TYPE_UNKNOWN:
-               printk("%s  Error type: Unknown\n", level);
+               err_type = "Unknown";
+               subtype = "";
                break;
        }
+
+       dar_str[0] = pa_str[0] = '\0';
+       if (ea && evt->srr0 != ea) {
+               /* Load/Store address */
+               n = sprintf(dar_str, "DAR: %016llx ", ea);
+               if (pa)
+                       sprintf(dar_str + n, "paddr: %016llx ", pa);
+       } else if (pa) {
+               sprintf(pa_str, " paddr: %016llx", pa);
+       }
+
+       printk("%sMCE: CPU%d: machine check (%s) %s %s %s %s[%s]\n",
+               level, evt->cpu, sevstr, in_guest ? "Guest" : "Host",
+               err_type, subtype, dar_str,
+               evt->disposition == MCE_DISPOSITION_RECOVERED ?
+               "Recovered" : "Not recovered");
+
+       if (in_guest || user_mode) {
+               printk("%sMCE: CPU%d: PID: %d Comm: %s %sNIP: [%016llx]%s\n",
+                       level, evt->cpu, current->pid, current->comm,
+                       in_guest ? "Guest " : "", evt->srr0, pa_str);
+       } else {
+               printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n",
+                       level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str);
+       }
 }
 EXPORT_SYMBOL_GPL(machine_check_print_event_info);