Skip to content

Commit ec54b34

Browse files
Ranjan Kumar authored and martinkpetersen committed
scsi: mpi3mr: Record and report controller firmware faults
Capture and retain firmware fault codes and extended fault information whenever the controller enters a fault state. Maintain a persistent firmware fault counter, expose it via sysfs, and generate uevents to aid userspace diagnostics and failure analysis. Co-developed-by: Salomon Dushimirimana <salomondush@google.com> Signed-off-by: Salomon Dushimirimana <salomondush@google.com> Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com> Link: https://patch.msgid.link/20260116060719.32937-7-ranjan.kumar@broadcom.com Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
1 parent d065433 commit ec54b34

3 files changed

Lines changed: 135 additions & 0 deletions

File tree

drivers/scsi/mpi3mr/mpi3mr.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1137,6 +1137,10 @@ struct scmd_priv {
11371137
* @default_qcount: Total Default queues
11381138
* @active_poll_qcount: Currently active poll queue count
11391139
* @requested_poll_qcount: User requested poll queue count
1140+
* @fault_during_init: Indicates a firmware fault occurred during initialization
1141+
* @saved_fault_code: Firmware fault code captured at the time of failure
1142+
* @saved_fault_info: Additional firmware-provided fault information
1143+
* @fwfault_counter: Count of firmware faults detected by the driver
11401144
* @bsg_dev: BSG device structure
11411145
* @bsg_queue: Request queue for BSG device
11421146
* @stop_bsgs: Stop BSG request flag
@@ -1340,6 +1344,10 @@ struct mpi3mr_ioc {
13401344
u16 default_qcount;
13411345
u16 active_poll_qcount;
13421346
u16 requested_poll_qcount;
1347+
u8 fault_during_init;
1348+
u32 saved_fault_code;
1349+
u32 saved_fault_info[3];
1350+
u64 fwfault_counter;
13431351

13441352
struct device bsg_dev;
13451353
struct request_queue *bsg_queue;

drivers/scsi/mpi3mr/mpi3mr_app.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3255,6 +3255,29 @@ adp_state_show(struct device *dev, struct device_attribute *attr,
32553255

32563256
static DEVICE_ATTR_RO(adp_state);
32573257

3258+
/**
3259+
* fwfault_count_show() - SysFS callback to show firmware fault count
3260+
* @dev: class device
3261+
* @attr: Device attribute
3262+
* @buf: Buffer to copy data into
3263+
*
3264+
* Displays the total number of firmware faults detected by the driver
3265+
* since the controller was initialized.
3266+
*
3267+
* Return: Number of bytes written to @buf
3268+
*/
3269+
3270+
static ssize_t
3271+
fwfault_count_show(struct device *dev, struct device_attribute *attr,
3272+
char *buf)
3273+
{
3274+
struct Scsi_Host *shost = class_to_shost(dev);
3275+
struct mpi3mr_ioc *mrioc = shost_priv(shost);
3276+
3277+
return snprintf(buf, PAGE_SIZE, "%llu\n", mrioc->fwfault_counter);
3278+
}
3279+
static DEVICE_ATTR_RO(fwfault_count);
3280+
32583281
static struct attribute *mpi3mr_host_attrs[] = {
32593282
&dev_attr_version_fw.attr,
32603283
&dev_attr_fw_queue_depth.attr,
@@ -3263,6 +3286,7 @@ static struct attribute *mpi3mr_host_attrs[] = {
32633286
&dev_attr_reply_qfull_count.attr,
32643287
&dev_attr_logging_level.attr,
32653288
&dev_attr_adp_state.attr,
3289+
&dev_attr_fwfault_count.attr,
32663290
NULL,
32673291
};
32683292

drivers/scsi/mpi3mr/mpi3mr_fw.c

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,6 +1108,31 @@ void mpi3mr_print_fault_info(struct mpi3mr_ioc *mrioc)
11081108
}
11091109
}
11101110

1111+
/**
1112+
* mpi3mr_save_fault_info - Save fault information
1113+
* @mrioc: Adapter instance reference
1114+
*
1115+
* Save the controller fault information if there is a
1116+
* controller fault.
1117+
*
1118+
* Return: Nothing.
1119+
*/
1120+
static void mpi3mr_save_fault_info(struct mpi3mr_ioc *mrioc)
1121+
{
1122+
u32 ioc_status, i;
1123+
1124+
ioc_status = readl(&mrioc->sysif_regs->ioc_status);
1125+
1126+
if (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT) {
1127+
mrioc->saved_fault_code = readl(&mrioc->sysif_regs->fault) &
1128+
MPI3_SYSIF_FAULT_CODE_MASK;
1129+
for (i = 0; i < 3; i++) {
1130+
mrioc->saved_fault_info[i] =
1131+
readl(&mrioc->sysif_regs->fault_info[i]);
1132+
}
1133+
}
1134+
}
1135+
11111136
/**
11121137
* mpi3mr_get_iocstate - Get IOC State
11131138
* @mrioc: Adapter instance reference
@@ -1249,6 +1274,60 @@ static void mpi3mr_alloc_ioctl_dma_memory(struct mpi3mr_ioc *mrioc)
12491274
mpi3mr_free_ioctl_dma_memory(mrioc);
12501275
}
12511276

1277+
/**
1278+
* mpi3mr_fault_uevent_emit - Emit uevent for any controller
1279+
* fault
1280+
* @mrioc: Pointer to the mpi3mr_ioc structure for the controller instance
1281+
*
1282+
* This function is invoked when the controller undergoes any
1283+
* type of fault.
1284+
*/
1285+
1286+
static void mpi3mr_fault_uevent_emit(struct mpi3mr_ioc *mrioc)
1287+
{
1288+
struct kobj_uevent_env *env;
1289+
int ret;
1290+
1291+
env = kzalloc(sizeof(*env), GFP_KERNEL);
1292+
if (!env)
1293+
return;
1294+
1295+
ret = add_uevent_var(env, "DRIVER=%s", mrioc->driver_name);
1296+
if (ret)
1297+
goto out_free;
1298+
1299+
ret = add_uevent_var(env, "IOC_ID=%u", mrioc->id);
1300+
if (ret)
1301+
goto out_free;
1302+
1303+
ret = add_uevent_var(env, "FAULT_CODE=0x%08x",
1304+
mrioc->saved_fault_code);
1305+
if (ret)
1306+
goto out_free;
1307+
1308+
ret = add_uevent_var(env, "FAULT_INFO0=0x%08x",
1309+
mrioc->saved_fault_info[0]);
1310+
if (ret)
1311+
goto out_free;
1312+
1313+
ret = add_uevent_var(env, "FAULT_INFO1=0x%08x",
1314+
mrioc->saved_fault_info[1]);
1315+
if (ret)
1316+
goto out_free;
1317+
1318+
ret = add_uevent_var(env, "FAULT_INFO2=0x%08x",
1319+
mrioc->saved_fault_info[2]);
1320+
if (ret)
1321+
goto out_free;
1322+
1323+
kobject_uevent_env(&mrioc->shost->shost_gendev.kobj,
1324+
KOBJ_CHANGE, env->envp);
1325+
1326+
out_free:
1327+
kfree(env);
1328+
1329+
}
1330+
12521331
/**
12531332
* mpi3mr_clear_reset_history - clear reset history
12541333
* @mrioc: Adapter instance reference
@@ -1480,6 +1559,10 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc)
14801559
if (ioc_state == MRIOC_STATE_FAULT) {
14811560
timeout = MPI3_SYSIF_DIAG_SAVE_TIMEOUT * 10;
14821561
mpi3mr_print_fault_info(mrioc);
1562+
mpi3mr_save_fault_info(mrioc);
1563+
mrioc->fault_during_init = 1;
1564+
mrioc->fwfault_counter++;
1565+
14831566
do {
14841567
host_diagnostic =
14851568
readl(&mrioc->sysif_regs->host_diagnostic);
@@ -2577,6 +2660,9 @@ void mpi3mr_check_rh_fault_ioc(struct mpi3mr_ioc *mrioc, u32 reason_code)
25772660
mpi3mr_set_trigger_data_in_all_hdb(mrioc,
25782661
MPI3MR_HDB_TRIGGER_TYPE_FAULT, &trigger_data, 0);
25792662
mpi3mr_print_fault_info(mrioc);
2663+
mpi3mr_save_fault_info(mrioc);
2664+
mrioc->fault_during_init = 1;
2665+
mrioc->fwfault_counter++;
25802666
return;
25812667
}
25822668

@@ -2594,6 +2680,10 @@ void mpi3mr_check_rh_fault_ioc(struct mpi3mr_ioc *mrioc, u32 reason_code)
25942680
break;
25952681
msleep(100);
25962682
} while (--timeout);
2683+
2684+
mpi3mr_save_fault_info(mrioc);
2685+
mrioc->fault_during_init = 1;
2686+
mrioc->fwfault_counter++;
25972687
}
25982688

25992689
/**
@@ -2770,6 +2860,11 @@ static void mpi3mr_watchdog_work(struct work_struct *work)
27702860
union mpi3mr_trigger_data trigger_data;
27712861
u16 reset_reason = MPI3MR_RESET_FROM_FAULT_WATCH;
27722862

2863+
if (mrioc->fault_during_init) {
2864+
mpi3mr_fault_uevent_emit(mrioc);
2865+
mrioc->fault_during_init = 0;
2866+
}
2867+
27732868
if (mrioc->reset_in_progress || mrioc->pci_err_recovery)
27742869
return;
27752870

@@ -2842,6 +2937,10 @@ static void mpi3mr_watchdog_work(struct work_struct *work)
28422937
goto schedule_work;
28432938
}
28442939

2940+
mpi3mr_save_fault_info(mrioc);
2941+
mpi3mr_fault_uevent_emit(mrioc);
2942+
mrioc->fwfault_counter++;
2943+
28452944
switch (trigger_data.fault) {
28462945
case MPI3_SYSIF_FAULT_CODE_COMPLETE_RESET_NEEDED:
28472946
case MPI3_SYSIF_FAULT_CODE_POWER_CYCLE_REQUIRED:
@@ -5478,6 +5577,10 @@ int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc,
54785577
break;
54795578
msleep(100);
54805579
} while (--timeout);
5580+
5581+
mpi3mr_save_fault_info(mrioc);
5582+
mpi3mr_fault_uevent_emit(mrioc);
5583+
mrioc->fwfault_counter++;
54815584
mpi3mr_set_trigger_data_in_all_hdb(mrioc,
54825585
MPI3MR_HDB_TRIGGER_TYPE_FAULT, &trigger_data, 0);
54835586
}

0 commit comments

Comments
 (0)