net/mlx5: Cancel delayed recovery work when unloading the driver
authorMohamad Haj Yahia <mohamad@mellanox.com>
Thu, 30 Mar 2017 14:09:00 +0000 (17:09 +0300)
committerSaeed Mahameed <saeedm@mellanox.com>
Tue, 27 Jun 2017 11:49:57 +0000 (14:49 +0300)
Draining the health workqueue will ignore future health works including
the one that report hardware failure and thus we can't enter error state
Instead cancel the recovery flow and make sure only recovery flow won't
be scheduled.

Fixes: 5e44fca50470 ('net/mlx5: Only cancel recovery work when cleaning up device')
Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
drivers/net/ethernet/mellanox/mlx5/core/health.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
include/linux/mlx5/driver.h

index f27f84ffbc850487557ad0184960d7c872abb160..8a8b5f0e497cf45382e02d434429772d7c277a32 100644 (file)
@@ -67,6 +67,7 @@ enum {
 
 enum {
        MLX5_DROP_NEW_HEALTH_WORK,
+       MLX5_DROP_NEW_RECOVERY_WORK,
 };
 
 static u8 get_nic_state(struct mlx5_core_dev *dev)
@@ -193,7 +194,7 @@ static void health_care(struct work_struct *work)
        mlx5_handle_bad_state(dev);
 
        spin_lock(&health->wq_lock);
-       if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
+       if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags))
                schedule_delayed_work(&health->recover_work, recover_delay);
        else
                dev_err(&dev->pdev->dev,
@@ -313,6 +314,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
        init_timer(&health->timer);
        health->sick = 0;
        clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+       clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
        health->health = &dev->iseg->health;
        health->health_counter = &dev->iseg->health_counter;
 
@@ -335,11 +337,22 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 
        spin_lock(&health->wq_lock);
        set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+       set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
        spin_unlock(&health->wq_lock);
        cancel_delayed_work_sync(&health->recover_work);
        cancel_work_sync(&health->work);
 }
 
+void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
+{
+       struct mlx5_core_health *health = &dev->priv.health;
+
+       spin_lock(&health->wq_lock);
+       set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
+       spin_unlock(&health->wq_lock);
+       cancel_delayed_work_sync(&dev->priv.health.recover_work);
+}
+
 void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 {
        struct mlx5_core_health *health = &dev->priv.health;
index fd47b51348418f2d27ae6d55c0e29dfb060363c1..524c16f72e83507b16df9707f73072e92cf641b3 100644 (file)
@@ -1228,7 +1228,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
        int err = 0;
 
        if (cleanup)
-               mlx5_drain_health_wq(dev);
+               mlx5_drain_health_recovery(dev);
 
        mutex_lock(&dev->intf_state_mutex);
        if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) {
index 93273d9ea4d145f125846f0c9a56a5a6e74915e4..ba260330ce5e6c921e28c41d6789a4ae69c249d3 100644 (file)
@@ -925,6 +925,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
 void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
+void mlx5_drain_health_recovery(struct mlx5_core_dev *dev);
 int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
                        struct mlx5_buf *buf, int node);
 int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);