net: ena: fix crash during ena_remove()
authorArthur Kiyanovski <akiyano@amazon.com>
Mon, 19 Nov 2018 10:05:21 +0000 (12:05 +0200)
committerDavid S. Miller <davem@davemloft.net>
Mon, 19 Nov 2018 23:13:00 +0000 (15:13 -0800)
In ena_remove() we have the following stack call:
ena_remove()
  unregister_netdev()
  ena_destroy_device()
    netif_carrier_off()

Calling netif_carrier_off() causes linkwatch to try to handle the
link change event on the already unregistered netdev, which leads
to a read from an unreadable memory address.

This patch switches the order of the two functions, so that
netif_carrier_off() is called on a regiestered netdev.

To accomplish this fix we also had to:
1. Remove the set bit ENA_FLAG_TRIGGER_RESET
2. Add a sanitiy check in ena_close()
both to prevent double device reset (when calling unregister_netdev()
ena_close is called, but the device was already deleted in
ena_destroy_device()).
3. Set the admin_queue running state to false to avoid using it after
device was reset (for example when calling ena_destroy_all_io_queues()
right after ena_com_dev_reset() in ena_down)

Fixes: 944b28aa2982 ("net: ena: fix missing lock during device destruction")
Signed-off-by: Arthur Kiyanovski <akiyano@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/amazon/ena/ena_netdev.c

index 1d3cead0977eb79683f93abb5df9e3ebd7d5ca7f..a70bb1bb90e7d89dc75f6f3dcc4f1455484057bc 100644 (file)
@@ -1848,6 +1848,8 @@ static void ena_down(struct ena_adapter *adapter)
                rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason);
                if (rc)
                        dev_err(&adapter->pdev->dev, "Device reset failed\n");
+               /* stop submitting admin commands on a device that was reset */
+               ena_com_set_admin_running_state(adapter->ena_dev, false);
        }
 
        ena_destroy_all_io_queues(adapter);
@@ -1914,6 +1916,9 @@ static int ena_close(struct net_device *netdev)
 
        netif_dbg(adapter, ifdown, netdev, "%s\n", __func__);
 
+       if (!test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags))
+               return 0;
+
        if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
                ena_down(adapter);
 
@@ -2613,9 +2618,7 @@ static void ena_destroy_device(struct ena_adapter *adapter, bool graceful)
                ena_down(adapter);
 
        /* Stop the device from sending AENQ events (in case reset flag is set
-        *  and device is up, ena_close already reset the device
-        * In case the reset flag is set and the device is up, ena_down()
-        * already perform the reset, so it can be skipped.
+        *  and device is up, ena_down() already reset the device.
         */
        if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up))
                ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason);
@@ -3452,6 +3455,8 @@ err_rss:
        ena_com_rss_destroy(ena_dev);
 err_free_msix:
        ena_com_dev_reset(ena_dev, ENA_REGS_RESET_INIT_ERR);
+       /* stop submitting admin commands on a device that was reset */
+       ena_com_set_admin_running_state(ena_dev, false);
        ena_free_mgmnt_irq(adapter);
        ena_disable_msix(adapter);
 err_worker_destroy:
@@ -3498,18 +3503,12 @@ static void ena_remove(struct pci_dev *pdev)
 
        cancel_work_sync(&adapter->reset_task);
 
-       unregister_netdev(netdev);
-
-       /* If the device is running then we want to make sure the device will be
-        * reset to make sure no more events will be issued by the device.
-        */
-       if (test_bit(ENA_FLAG_DEVICE_RUNNING, &adapter->flags))
-               set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
-
        rtnl_lock();
        ena_destroy_device(adapter, true);
        rtnl_unlock();
 
+       unregister_netdev(netdev);
+
        free_netdev(netdev);
 
        ena_com_rss_destroy(ena_dev);