/* * Copyright (c) 2020 Intel Corporation. * * SPDX-License-Identifier: Apache-2.0 */ #define DT_DRV_COMPAT intel_ibecc #include #include #include #include #include "ibecc.h" /** * In the driver 64 bit registers are used and not all of then at the * moment may be correctly logged. */ #include LOG_MODULE_REGISTER(edac_ibecc, CONFIG_EDAC_LOG_LEVEL); #define DEVICE_NODE DT_NODELABEL(ibecc) #define PCI_HOST_BRIDGE PCIE_BDF(0, 0, 0) struct ibecc_data { mem_addr_t mchbar; edac_notify_callback_f cb; uint32_t error_type; /* Error count */ unsigned int errors_cor; unsigned int errors_uc; }; static void ibecc_write_reg64(const struct device *dev, uint16_t reg, uint64_t value) { struct ibecc_data *data = dev->data; mem_addr_t reg_addr = data->mchbar + reg; sys_write64(value, reg_addr); } static uint64_t ibecc_read_reg64(const struct device *dev, uint16_t reg) { struct ibecc_data *data = dev->data; mem_addr_t reg_addr = data->mchbar + reg; return sys_read64(reg_addr); } #if defined(CONFIG_EDAC_ERROR_INJECT) static void ibecc_write_reg32(const struct device *dev, uint16_t reg, uint32_t value) { struct ibecc_data *data = dev->data; mem_addr_t reg_addr = data->mchbar + reg; sys_write32(value, reg_addr); } #endif static uint32_t ibecc_read_reg32(const struct device *dev, uint16_t reg) { struct ibecc_data *data = dev->data; mem_addr_t reg_addr = data->mchbar + reg; return sys_read32(reg_addr); } static bool ibecc_enabled(const pcie_bdf_t bdf) { return !!(pcie_conf_read(bdf, CAPID0_C_REG) & CAPID0_C_IBECC_ENABLED); } static void ibecc_errcmd_setup(const pcie_bdf_t bdf, bool enable) { uint32_t errcmd; errcmd = pcie_conf_read(bdf, ERRCMD_REG); if (enable) { errcmd |= (ERRCMD_IBECC_COR | ERRCMD_IBECC_UC) << 16; } else { errcmd &= ~(ERRCMD_IBECC_COR | ERRCMD_IBECC_UC) << 16; } pcie_conf_write(bdf, ERRCMD_REG, errcmd); } static void ibecc_errsts_clear(const pcie_bdf_t bdf) { uint32_t errsts; errsts = pcie_conf_read(bdf, ERRSTS_REG); if (!(errsts & (ERRSTS_IBECC_COR | ERRSTS_IBECC_UC))) { return; } pcie_conf_write(bdf, ERRSTS_REG, errsts); } static const char *get_ddr_type(uint8_t type) { switch (type) { case 0: return "DDR4"; case 3: return "LPDDR4"; default: return "Unknown"; } } static const char *get_dimm_width(uint8_t type) { switch (type) { case 0: return "X8"; case 1: return "X16"; case 2: return "X32"; default: return "Unknown"; } } static void mchbar_regs_dump(const struct device *dev) { uint32_t mad_inter_chan, chan_hash; /* Memory configuration */ chan_hash = ibecc_read_reg32(dev, CHANNEL_HASH); LOG_DBG("Channel hash %x", chan_hash); mad_inter_chan = ibecc_read_reg32(dev, MAD_INTER_CHAN); LOG_DBG("DDR memory type %s", get_ddr_type(INTER_CHAN_DDR_TYPE(mad_inter_chan))); for (int ch = 0; ch < DRAM_MAX_CHANNELS; ch++) { uint32_t intra_ch = ibecc_read_reg32(dev, MAD_INTRA_CH(ch)); uint32_t dimm_ch = ibecc_read_reg32(dev, MAD_DIMM_CH(ch)); uint64_t l_size = DIMM_L_SIZE(dimm_ch); uint64_t s_size = DIMM_S_SIZE(dimm_ch); uint8_t l_map = DIMM_L_MAP(intra_ch); LOG_DBG("channel %d: l_size 0x%llx s_size 0x%llx l_map %d\n", ch, l_size, s_size, l_map); for (int d = 0; d < DRAM_MAX_DIMMS; d++) { uint64_t size; const char *type; if (d ^ l_map) { type = get_dimm_width(DIMM_S_WIDTH(dimm_ch)); size = s_size; } else { type = get_dimm_width(DIMM_L_WIDTH(dimm_ch)); size = l_size; } if (!size) { continue; } LOG_DBG("Channel %d DIMM %d size %llu GiB width %s", ch, d, size >> 30, type); } } } static void parse_ecclog(const struct device *dev, const uint64_t ecclog, struct ibecc_error *error_data) { struct ibecc_data *data = dev->data; if (!ecclog) { return; } error_data->type = ECC_ERROR_ERRTYPE(ecclog); error_data->address = ECC_ERROR_ERRADD(ecclog); error_data->syndrome = ECC_ERROR_ERRSYND(ecclog); if (ecclog & ECC_ERROR_MERRSTS) { data->errors_uc++; } if (ecclog & ECC_ERROR_CERRSTS) { data->errors_cor++; } } #if defined(CONFIG_EDAC_ERROR_INJECT) static int inject_set_param1(const struct device *dev, uint64_t addr) { if (addr & ~INJ_ADDR_BASE_MASK) { return -EINVAL; } ibecc_write_reg64(dev, IBECC_INJ_ADDR_BASE, addr); return 0; } static uint64_t inject_get_param1(const struct device *dev) { return ibecc_read_reg64(dev, IBECC_INJ_ADDR_BASE); } static int inject_set_param2(const struct device *dev, uint64_t mask) { if (mask & ~INJ_ADDR_BASE_MASK_MASK) { return -EINVAL; } ibecc_write_reg64(dev, IBECC_INJ_ADDR_MASK, mask); return 0; } static uint64_t inject_get_param2(const struct device *dev) { return ibecc_read_reg64(dev, IBECC_INJ_ADDR_MASK); } static int inject_set_error_type(const struct device *dev, uint32_t error_type) { struct ibecc_data *data = dev->data; data->error_type = error_type; return 0; } static uint32_t inject_get_error_type(const struct device *dev) { struct ibecc_data *data = dev->data; return data->error_type; } static int inject_error_trigger(const struct device *dev) { struct ibecc_data *data = dev->data; uint32_t ctrl = 0; switch (data->error_type) { case EDAC_ERROR_TYPE_DRAM_COR: ctrl |= INJ_CTRL_COR; break; case EDAC_ERROR_TYPE_DRAM_UC: ctrl |= INJ_CTRL_UC; break; default: /* This would clear error injection */ break; } ibecc_write_reg32(dev, IBECC_INJ_ADDR_CTRL, ctrl); return 0; } #endif /* CONFIG_EDAC_ERROR_INJECT */ static uint64_t ecc_error_log_get(const struct device *dev) { return ibecc_read_reg64(dev, IBECC_ECC_ERROR_LOG); } static void ecc_error_log_clear(const struct device *dev) { /* Clear all error bits */ ibecc_write_reg64(dev, IBECC_ECC_ERROR_LOG, ECC_ERROR_MERRSTS | ECC_ERROR_CERRSTS); } static uint64_t parity_error_log_get(const struct device *dev) { return ibecc_read_reg64(dev, IBECC_PARITY_ERROR_LOG); } static void parity_error_log_clear(const struct device *dev) { ibecc_write_reg64(dev, IBECC_PARITY_ERROR_LOG, PARITY_ERROR_ERRSTS); } static unsigned int errors_cor_get(const struct device *dev) { struct ibecc_data *data = dev->data; return data->errors_cor; } static unsigned int errors_uc_get(const struct device *dev) { struct ibecc_data *data = dev->data; return data->errors_uc; } static int notify_callback_set(const struct device *dev, edac_notify_callback_f cb) { struct ibecc_data *data = dev->data; int key = irq_lock(); data->cb = cb; irq_unlock(key); return 0; } static const struct edac_driver_api api = { #if defined(CONFIG_EDAC_ERROR_INJECT) /* Error Injection functions */ .inject_set_param1 = inject_set_param1, .inject_get_param1 = inject_get_param1, .inject_set_param2 = inject_set_param2, .inject_get_param2 = inject_get_param2, .inject_set_error_type = inject_set_error_type, .inject_get_error_type = inject_get_error_type, .inject_error_trigger = inject_error_trigger, #endif /* CONFIG_EDAC_ERROR_INJECT */ /* Error reporting & clearing functions */ .ecc_error_log_get = ecc_error_log_get, .ecc_error_log_clear = ecc_error_log_clear, .parity_error_log_get = parity_error_log_get, .parity_error_log_clear = parity_error_log_clear, /* Get error stats */ .errors_cor_get = errors_cor_get, .errors_uc_get = errors_uc_get, /* Notification callback set */ .notify_cb_set = notify_callback_set, }; int edac_ibecc_init(const struct device *dev) { const pcie_bdf_t bdf = PCI_HOST_BRIDGE; struct ibecc_data *data = dev->data; uint32_t tolud; uint64_t touud, tom, mchbar; LOG_INF("EDAC IBECC initialization"); if (!pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_SKU5)) && !pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_SKU6)) && !pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_SKU7)) && !pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_SKU8)) && !pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_SKU9)) && !pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_SKU10)) && !pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_SKU11)) && !pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_SKU12))) { LOG_ERR("PCI Probe failed"); return -ENODEV; } if (!ibecc_enabled(bdf)) { LOG_ERR("IBECC is not enabled"); return -ENODEV; } mchbar = pcie_conf_read(bdf, MCHBAR_REG); mchbar |= (uint64_t)pcie_conf_read(bdf, MCHBAR_REG + 1) << 32; /* Check that MCHBAR is enabled */ if (!(mchbar & MCHBAR_ENABLE)) { LOG_ERR("MCHBAR is not enabled"); return -ENODEV; } mchbar &= MCHBAR_MASK; /* workaround for 32 bit read */ touud = pcie_conf_read(bdf, TOUUD_REG); touud |= (uint64_t)pcie_conf_read(bdf, TOUUD_REG + 1) << 32; touud &= TOUUD_MASK; /* workaround for 32 bit read */ tom = pcie_conf_read(bdf, TOM_REG); tom |= (uint64_t)pcie_conf_read(bdf, TOM_REG + 1) << 32; tom &= TOM_MASK; tolud = pcie_conf_read(bdf, TOLUD_REG) & TOLUD_MASK; device_map(&data->mchbar, mchbar, MCH_SIZE, K_MEM_CACHE_NONE); LOG_DBG("MCHBAR\t%llx", mchbar); LOG_DBG("TOUUD\t%llx", touud); LOG_DBG("TOM\t%llx", tom); LOG_DBG("TOLUD\t%x", tolud); mchbar_regs_dump(dev); /* Enable Host Bridge generated SERR event */ ibecc_errcmd_setup(bdf, true); return 0; } static struct ibecc_data ibecc_data; DEVICE_DT_DEFINE(DEVICE_NODE, &edac_ibecc_init, NULL, &ibecc_data, NULL, POST_KERNEL, CONFIG_KERNEL_INIT_PRIORITY_DEVICE, &api); /** * An IBECC error causes SERR_NMI_STS set and is indicated by * ERRSTS PCI registers by IBECC_UC and IBECC_COR fields. * Following needs to be done: * - Read ECC_ERR_LOG register * - Clear IBECC_UC and IBECC_COR fields of ERRSTS PCI * - Clear MERRSTS & CERRSTS fields of ECC_ERR_LOG register */ static struct k_spinlock nmi_lock; /* NMI handling */ static bool handle_nmi(void) { uint8_t status; status = sys_in8(NMI_STS_CNT_REG); if (!(status & NMI_STS_SRC_SERR)) { LOG_DBG("Skip NMI, NMI_STS_CNT: 0x%x", status); /** * We should be able to find that this NMI we * should not handle and return false. However this * does not work for some older SKUs */ return true; } LOG_DBG("core: %d status 0x%x", arch_curr_cpu()->id, status); /* Re-enable */ status = (status & NMI_STS_MASK_EN) | NMI_STS_SERR_EN; sys_out8(status, NMI_STS_CNT_REG); status &= ~NMI_STS_SERR_EN; sys_out8(status, NMI_STS_CNT_REG); return true; } bool z_x86_do_kernel_nmi(const z_arch_esf_t *esf) { const struct device *dev = DEVICE_DT_GET(DEVICE_NODE); struct ibecc_data *data = dev->data; struct ibecc_error error_data; k_spinlock_key_t key; bool ret = true; uint64_t ecclog; key = k_spin_lock(&nmi_lock); if (!handle_nmi()) { /* Indicate that we do not handle this NMI */ ret = false; goto out; } /* Skip the same NMI handling for other cores and return handled */ if (arch_curr_cpu()->id) { ret = true; goto out; } ecclog = edac_ecc_error_log_get(dev); parse_ecclog(dev, ecclog, &error_data); if (data->cb) { data->cb(dev, &error_data); } edac_ecc_error_log_clear(dev); ibecc_errsts_clear(PCI_HOST_BRIDGE); out: k_spin_unlock(&nmi_lock, key); return ret; }