liquidio: CN23XX health monitoring
Adds support for watchdog-based health monitoring of Octeon cores on CN23XX devices.

Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
dc3abcbeae
commit
9ff1a9bad8
@ -24,6 +24,7 @@
|
|||||||
#include <linux/firmware.h>
|
#include <linux/firmware.h>
|
||||||
#include <linux/ptp_clock_kernel.h>
|
#include <linux/ptp_clock_kernel.h>
|
||||||
#include <net/vxlan.h>
|
#include <net/vxlan.h>
|
||||||
|
#include <linux/kthread.h>
|
||||||
#include "liquidio_common.h"
|
#include "liquidio_common.h"
|
||||||
#include "octeon_droq.h"
|
#include "octeon_droq.h"
|
||||||
#include "octeon_iq.h"
|
#include "octeon_iq.h"
|
||||||
@ -948,8 +949,6 @@ static void update_txq_status(struct octeon_device *oct, int iq_num)
|
|||||||
struct lio *lio;
|
struct lio *lio;
|
||||||
struct octeon_instr_queue *iq = oct->instr_queue[iq_num];
|
struct octeon_instr_queue *iq = oct->instr_queue[iq_num];
|
||||||
|
|
||||||
/*octeon_update_iq_read_idx(oct, iq);*/
|
|
||||||
|
|
||||||
netdev = oct->props[iq->ifidx].netdev;
|
netdev = oct->props[iq->ifidx].netdev;
|
||||||
|
|
||||||
/* This is needed because the first IQ does not have
|
/* This is needed because the first IQ does not have
|
||||||
@ -1187,6 +1186,102 @@ static int octeon_setup_interrupt(struct octeon_device *oct)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int liquidio_watchdog(void *param)
|
||||||
|
{
|
||||||
|
u64 wdog;
|
||||||
|
u16 mask_of_stuck_cores = 0;
|
||||||
|
u16 mask_of_crashed_cores = 0;
|
||||||
|
int core_num;
|
||||||
|
u8 core_is_stuck[LIO_MAX_CORES];
|
||||||
|
u8 core_crashed[LIO_MAX_CORES];
|
||||||
|
struct octeon_device *oct = param;
|
||||||
|
|
||||||
|
memset(core_is_stuck, 0, sizeof(core_is_stuck));
|
||||||
|
memset(core_crashed, 0, sizeof(core_crashed));
|
||||||
|
|
||||||
|
while (!kthread_should_stop()) {
|
||||||
|
mask_of_crashed_cores =
|
||||||
|
(u16)octeon_read_csr64(oct, CN23XX_SLI_SCRATCH2);
|
||||||
|
|
||||||
|
for (core_num = 0; core_num < LIO_MAX_CORES; core_num++) {
|
||||||
|
if (!core_is_stuck[core_num]) {
|
||||||
|
wdog = lio_pci_readq(oct, CIU3_WDOG(core_num));
|
||||||
|
|
||||||
|
/* look at watchdog state field */
|
||||||
|
wdog &= CIU3_WDOG_MASK;
|
||||||
|
if (wdog) {
|
||||||
|
/* this watchdog timer has expired */
|
||||||
|
core_is_stuck[core_num] =
|
||||||
|
LIO_MONITOR_WDOG_EXPIRE;
|
||||||
|
mask_of_stuck_cores |= (1 << core_num);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!core_crashed[core_num])
|
||||||
|
core_crashed[core_num] =
|
||||||
|
(mask_of_crashed_cores >> core_num) & 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mask_of_stuck_cores) {
|
||||||
|
for (core_num = 0; core_num < LIO_MAX_CORES;
|
||||||
|
core_num++) {
|
||||||
|
if (core_is_stuck[core_num] == 1) {
|
||||||
|
dev_err(&oct->pci_dev->dev,
|
||||||
|
"ERROR: Octeon core %d is stuck!\n",
|
||||||
|
core_num);
|
||||||
|
/* 2 means we have printk'd an error
|
||||||
|
* so no need to repeat the same printk
|
||||||
|
*/
|
||||||
|
core_is_stuck[core_num] =
|
||||||
|
LIO_MONITOR_CORE_STUCK_MSGD;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mask_of_crashed_cores) {
|
||||||
|
for (core_num = 0; core_num < LIO_MAX_CORES;
|
||||||
|
core_num++) {
|
||||||
|
if (core_crashed[core_num] == 1) {
|
||||||
|
dev_err(&oct->pci_dev->dev,
|
||||||
|
"ERROR: Octeon core %d crashed! See oct-fwdump for details.\n",
|
||||||
|
core_num);
|
||||||
|
/* 2 means we have printk'd an error
|
||||||
|
* so no need to repeat the same printk
|
||||||
|
*/
|
||||||
|
core_crashed[core_num] =
|
||||||
|
LIO_MONITOR_CORE_STUCK_MSGD;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#ifdef CONFIG_MODULE_UNLOAD
|
||||||
|
if (mask_of_stuck_cores || mask_of_crashed_cores) {
|
||||||
|
/* make module refcount=0 so that rmmod will work */
|
||||||
|
long refcount;
|
||||||
|
|
||||||
|
refcount = module_refcount(THIS_MODULE);
|
||||||
|
|
||||||
|
while (refcount > 0) {
|
||||||
|
module_put(THIS_MODULE);
|
||||||
|
refcount = module_refcount(THIS_MODULE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* compensate for and withstand an unlikely (but still
|
||||||
|
* possible) race condition
|
||||||
|
*/
|
||||||
|
while (refcount < 0) {
|
||||||
|
try_module_get(THIS_MODULE);
|
||||||
|
refcount = module_refcount(THIS_MODULE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
/* sleep for two seconds */
|
||||||
|
set_current_state(TASK_INTERRUPTIBLE);
|
||||||
|
schedule_timeout(2 * HZ);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief PCI probe handler
|
* \brief PCI probe handler
|
||||||
* @param pdev PCI device structure
|
* @param pdev PCI device structure
|
||||||
@ -1232,6 +1327,30 @@ liquidio_probe(struct pci_dev *pdev,
|
|||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (OCTEON_CN23XX_PF(oct_dev)) {
|
||||||
|
u64 scratch1;
|
||||||
|
u8 bus, device, function;
|
||||||
|
|
||||||
|
scratch1 = octeon_read_csr64(oct_dev, CN23XX_SLI_SCRATCH1);
|
||||||
|
if (!(scratch1 & 4ULL)) {
|
||||||
|
/* Bit 2 of SLI_SCRATCH_1 is a flag that indicates that
|
||||||
|
* the lio watchdog kernel thread is running for this
|
||||||
|
* NIC. Each NIC gets one watchdog kernel thread.
|
||||||
|
*/
|
||||||
|
scratch1 |= 4ULL;
|
||||||
|
octeon_write_csr64(oct_dev, CN23XX_SLI_SCRATCH1,
|
||||||
|
scratch1);
|
||||||
|
|
||||||
|
bus = pdev->bus->number;
|
||||||
|
device = PCI_SLOT(pdev->devfn);
|
||||||
|
function = PCI_FUNC(pdev->devfn);
|
||||||
|
oct_dev->watchdog_task = kthread_create(
|
||||||
|
liquidio_watchdog, oct_dev,
|
||||||
|
"liowd/%02hhx:%02hhx.%hhx", bus, device, function);
|
||||||
|
wake_up_process(oct_dev->watchdog_task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
oct_dev->rx_pause = 1;
|
oct_dev->rx_pause = 1;
|
||||||
oct_dev->tx_pause = 1;
|
oct_dev->tx_pause = 1;
|
||||||
|
|
||||||
@ -1564,6 +1683,9 @@ static void liquidio_remove(struct pci_dev *pdev)
|
|||||||
|
|
||||||
dev_dbg(&oct_dev->pci_dev->dev, "Stopping device\n");
|
dev_dbg(&oct_dev->pci_dev->dev, "Stopping device\n");
|
||||||
|
|
||||||
|
if (oct_dev->watchdog_task)
|
||||||
|
kthread_stop(oct_dev->watchdog_task);
|
||||||
|
|
||||||
if (oct_dev->app_mode && (oct_dev->app_mode == CVM_DRV_NIC_APP))
|
if (oct_dev->app_mode && (oct_dev->app_mode == CVM_DRV_NIC_APP))
|
||||||
liquidio_stop_nic_module(oct_dev);
|
liquidio_stop_nic_module(oct_dev);
|
||||||
|
|
||||||
|
@ -484,6 +484,8 @@ struct octeon_device {
|
|||||||
|
|
||||||
/* private flags to control driver-specific features through ethtool */
|
/* private flags to control driver-specific features through ethtool */
|
||||||
u32 priv_flags;
|
u32 priv_flags;
|
||||||
|
|
||||||
|
void *watchdog_task;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define OCT_DRV_ONLINE 1
|
#define OCT_DRV_ONLINE 1
|
||||||
|
@ -131,6 +131,12 @@ struct lio {
|
|||||||
#define LIO_SIZE (sizeof(struct lio))
|
#define LIO_SIZE (sizeof(struct lio))
|
||||||
#define GET_LIO(netdev) ((struct lio *)netdev_priv(netdev))
|
#define GET_LIO(netdev) ((struct lio *)netdev_priv(netdev))
|
||||||
|
|
||||||
|
/* Address of the CIU3 watchdog register of core c (one 8-byte register per
 * core). The argument is parenthesized so that any expression can be passed.
 */
#define CIU3_WDOG(c)                 (0x1010000020000ULL + ((c) << 3))
/* Bits of a CIU3 watchdog register that hold the watchdog state field */
#define CIU3_WDOG_MASK               12ULL
/* Per-core monitor state: a problem was detected but not yet reported */
#define LIO_MONITOR_WDOG_EXPIRE      1
/* Per-core monitor state: the problem has already been reported */
#define LIO_MONITOR_CORE_STUCK_MSGD  2
/* Number of Octeon cores checked by the watchdog thread */
#define LIO_MAX_CORES                12
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief Enable or disable feature
|
* \brief Enable or disable feature
|
||||||
* @param netdev pointer to network device
|
* @param netdev pointer to network device
|
||||||
|
Loading…
Reference in New Issue
Block a user