Skip to content

Commit fc2e6b3

Browse files
SlawomirLabaanguy11
authored andcommitted
iavf: Rework mutexes for better synchronisation
The driver used to crash in multiple spots when put to stress testing of the init, reset and remove paths. The user would experience call traces or hangs when creating, resetting, removing VFs. Depending on the machines, the call traces are happening in random spots, like reset restoring resources racing with driver remove. Make adapter->crit_lock mutex a mandatory lock for guarding the operations performed on all workqueues and functions dealing with resource allocation and disposal. Make __IAVF_REMOVE a final state of the driver respected by workqueues that shall not requeue, when they fail to obtain the crit_lock. Make the IRQ handler not to queue the new work for adminq_task when the __IAVF_REMOVE state is set. Fixes: 5ac49f3 ("iavf: use mutexes for locking of critical sections") Signed-off-by: Slawomir Laba <slawomirx.laba@intel.com> Signed-off-by: Phani Burra <phani.r.burra@intel.com> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
1 parent e01b042 commit fc2e6b3

File tree

2 files changed

+36
-31
lines changed

2 files changed

+36
-31
lines changed

drivers/net/ethernet/intel/iavf/iavf.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,6 @@ struct iavf_adapter {
246246
struct list_head mac_filter_list;
247247
struct mutex crit_lock;
248248
struct mutex client_lock;
249-
struct mutex remove_lock;
250249
/* Lock to protect accesses to MAC and VLAN lists */
251250
spinlock_t mac_vlan_list_lock;
252251
char misc_vector_name[IFNAMSIZ + 9];

drivers/net/ethernet/intel/iavf/iavf_main.c

Lines changed: 36 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -302,8 +302,9 @@ static irqreturn_t iavf_msix_aq(int irq, void *data)
302302
rd32(hw, IAVF_VFINT_ICR01);
303303
rd32(hw, IAVF_VFINT_ICR0_ENA1);
304304

305-
/* schedule work on the private workqueue */
306-
queue_work(iavf_wq, &adapter->adminq_task);
305+
if (adapter->state != __IAVF_REMOVE)
306+
/* schedule work on the private workqueue */
307+
queue_work(iavf_wq, &adapter->adminq_task);
307308

308309
return IRQ_HANDLED;
309310
}
@@ -2374,8 +2375,12 @@ static void iavf_watchdog_task(struct work_struct *work)
23742375
struct iavf_hw *hw = &adapter->hw;
23752376
u32 reg_val;
23762377

2377-
if (!mutex_trylock(&adapter->crit_lock))
2378+
if (!mutex_trylock(&adapter->crit_lock)) {
2379+
if (adapter->state == __IAVF_REMOVE)
2380+
return;
2381+
23782382
goto restart_watchdog;
2383+
}
23792384

23802385
if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED)
23812386
iavf_change_state(adapter, __IAVF_COMM_FAILED);
@@ -2601,13 +2606,13 @@ static void iavf_reset_task(struct work_struct *work)
26012606
/* When device is being removed it doesn't make sense to run the reset
26022607
* task, just return in such a case.
26032608
*/
2604-
if (mutex_is_locked(&adapter->remove_lock))
2605-
return;
2609+
if (!mutex_trylock(&adapter->crit_lock)) {
2610+
if (adapter->state != __IAVF_REMOVE)
2611+
queue_work(iavf_wq, &adapter->reset_task);
26062612

2607-
if (iavf_lock_timeout(&adapter->crit_lock, 200)) {
2608-
schedule_work(&adapter->reset_task);
26092613
return;
26102614
}
2615+
26112616
while (!mutex_trylock(&adapter->client_lock))
26122617
usleep_range(500, 1000);
26132618
if (CLIENT_ENABLED(adapter)) {
@@ -2826,13 +2831,19 @@ static void iavf_adminq_task(struct work_struct *work)
28262831
if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED)
28272832
goto out;
28282833

2834+
if (!mutex_trylock(&adapter->crit_lock)) {
2835+
if (adapter->state == __IAVF_REMOVE)
2836+
return;
2837+
2838+
queue_work(iavf_wq, &adapter->adminq_task);
2839+
goto out;
2840+
}
2841+
28292842
event.buf_len = IAVF_MAX_AQ_BUF_SIZE;
28302843
event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL);
28312844
if (!event.msg_buf)
28322845
goto out;
28332846

2834-
if (iavf_lock_timeout(&adapter->crit_lock, 200))
2835-
goto freedom;
28362847
do {
28372848
ret = iavf_clean_arq_element(hw, &event, &pending);
28382849
v_op = (enum virtchnl_ops)le32_to_cpu(event.desc.cookie_high);
@@ -3800,11 +3811,12 @@ static int iavf_close(struct net_device *netdev)
38003811
struct iavf_adapter *adapter = netdev_priv(netdev);
38013812
int status;
38023813

3803-
if (adapter->state <= __IAVF_DOWN_PENDING)
3804-
return 0;
3814+
mutex_lock(&adapter->crit_lock);
38053815

3806-
while (!mutex_trylock(&adapter->crit_lock))
3807-
usleep_range(500, 1000);
3816+
if (adapter->state <= __IAVF_DOWN_PENDING) {
3817+
mutex_unlock(&adapter->crit_lock);
3818+
return 0;
3819+
}
38083820

38093821
set_bit(__IAVF_VSI_DOWN, adapter->vsi.state);
38103822
if (CLIENT_ENABLED(adapter))
@@ -4431,7 +4443,6 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
44314443
*/
44324444
mutex_init(&adapter->crit_lock);
44334445
mutex_init(&adapter->client_lock);
4434-
mutex_init(&adapter->remove_lock);
44354446
mutex_init(&hw->aq.asq_mutex);
44364447
mutex_init(&hw->aq.arq_mutex);
44374448

@@ -4556,11 +4567,7 @@ static void iavf_remove(struct pci_dev *pdev)
45564567
struct iavf_cloud_filter *cf, *cftmp;
45574568
struct iavf_hw *hw = &adapter->hw;
45584569
int err;
4559-
/* Indicate we are in remove and not to run reset_task */
4560-
mutex_lock(&adapter->remove_lock);
4561-
cancel_work_sync(&adapter->reset_task);
4562-
cancel_delayed_work_sync(&adapter->watchdog_task);
4563-
cancel_delayed_work_sync(&adapter->client_task);
4570+
45644571
if (adapter->netdev_registered) {
45654572
unregister_netdev(netdev);
45664573
adapter->netdev_registered = false;
@@ -4572,25 +4579,30 @@ static void iavf_remove(struct pci_dev *pdev)
45724579
err);
45734580
}
45744581

4582+
mutex_lock(&adapter->crit_lock);
4583+
dev_info(&adapter->pdev->dev, "Remove device\n");
4584+
iavf_change_state(adapter, __IAVF_REMOVE);
4585+
45754586
iavf_request_reset(adapter);
45764587
msleep(50);
45774588
/* If the FW isn't responding, kick it once, but only once. */
45784589
if (!iavf_asq_done(hw)) {
45794590
iavf_request_reset(adapter);
45804591
msleep(50);
45814592
}
4582-
if (iavf_lock_timeout(&adapter->crit_lock, 5000))
4583-
dev_warn(&adapter->pdev->dev, "failed to acquire crit_lock in %s\n", __FUNCTION__);
45844593

4585-
dev_info(&adapter->pdev->dev, "Removing device\n");
4594+
iavf_misc_irq_disable(adapter);
45864595
/* Shut down all the garbage mashers on the detention level */
4587-
iavf_change_state(adapter, __IAVF_REMOVE);
4596+
cancel_work_sync(&adapter->reset_task);
4597+
cancel_delayed_work_sync(&adapter->watchdog_task);
4598+
cancel_work_sync(&adapter->adminq_task);
4599+
cancel_delayed_work_sync(&adapter->client_task);
4600+
45884601
adapter->aq_required = 0;
45894602
adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED;
45904603

45914604
iavf_free_all_tx_resources(adapter);
45924605
iavf_free_all_rx_resources(adapter);
4593-
iavf_misc_irq_disable(adapter);
45944606
iavf_free_misc_irq(adapter);
45954607

45964608
/* In case we enter iavf_remove from erroneous state, free traffic irqs
@@ -4606,10 +4618,6 @@ static void iavf_remove(struct pci_dev *pdev)
46064618
iavf_reset_interrupt_capability(adapter);
46074619
iavf_free_q_vectors(adapter);
46084620

4609-
cancel_delayed_work_sync(&adapter->watchdog_task);
4610-
4611-
cancel_work_sync(&adapter->adminq_task);
4612-
46134621
iavf_free_rss(adapter);
46144622

46154623
if (hw->aq.asq.count)
@@ -4621,8 +4629,6 @@ static void iavf_remove(struct pci_dev *pdev)
46214629
mutex_destroy(&adapter->client_lock);
46224630
mutex_unlock(&adapter->crit_lock);
46234631
mutex_destroy(&adapter->crit_lock);
4624-
mutex_unlock(&adapter->remove_lock);
4625-
mutex_destroy(&adapter->remove_lock);
46264632

46274633
iounmap(hw->hw_addr);
46284634
pci_release_regions(pdev);

0 commit comments

Comments
 (0)