Skip to content

Commit 2d2419a

Browse files
Dust Lipiso77
Dust Li
authored andcommitted
net/smc: fix kernel panic caused by race of smc_sock
BugLink: https://bugs.launchpad.net/bugs/1929035 A crash occurs when smc_cdc_tx_handler() tries to access smc_sock but smc_release() has already freed it. [ 4570.695099] BUG: unable to handle page fault for address: 000000002eae9e88 [ 4570.696048] #PF: supervisor write access in kernel mode [ 4570.696728] #PF: error_code(0x0002) - not-present page [ 4570.697401] PGD 0 P4D 0 [ 4570.697716] Oops: 0002 [#1] PREEMPT SMP NOPTI [ 4570.698228] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-rc4+ #111 [ 4570.699013] Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8c24b4c 04/0 [ 4570.699933] RIP: 0010:_raw_spin_lock+0x1a/0x30 <...> [ 4570.711446] Call Trace: [ 4570.711746] <IRQ> [ 4570.711992] smc_cdc_tx_handler+0x41/0xc0 [ 4570.712470] smc_wr_tx_tasklet_fn+0x213/0x560 [ 4570.712981] ? smc_cdc_tx_dismisser+0x10/0x10 [ 4570.713489] tasklet_action_common.isra.17+0x66/0x140 [ 4570.714083] __do_softirq+0x123/0x2f4 [ 4570.714521] irq_exit_rcu+0xc4/0xf0 [ 4570.714934] common_interrupt+0xba/0xe0 Though smc_cdc_tx_handler() checked the existence of smc connection, smc_release() may have already dismissed and released the smc socket before smc_cdc_tx_handler() further visits it. smc_cdc_tx_handler() |smc_release() if (!conn) | | |smc_cdc_tx_dismiss_slots() | smc_cdc_tx_dismisser() | |sock_put(&smc->sk) <- last sock_put, | smc_sock freed bh_lock_sock(&smc->sk) (panic) | To make sure we won't receive any CDC messages after we free the smc_sock, add a refcount on the smc_connection for inflight CDC message(posted to the QP but haven't received related CQE), and don't release the smc_connection until all the inflight CDC messages haven been done, for both success or failed ones. Using refcount on CDC messages brings another problem: when the link is going to be destroyed, smcr_link_clear() will reset the QP, which then remove all the pending CQEs related to the QP in the CQ. To make sure all the CQEs will always come back so the refcount on the smc_connection can always reach 0, smc_ib_modify_qp_reset() was replaced by smc_ib_modify_qp_error(). And remove the timeout in smc_wr_tx_wait_no_pending_sends() since we need to wait for all pending WQEs done, or we may encounter use-after- free when handling CQEs. For IB device removal routine, we need to wait for all the QPs on that device been destroyed before we can destroy CQs on the device, or the refcount on smc_connection won't reach 0 and smc_sock cannot be released. Fixes: 5f08318 ("smc: connection data control (CDC)") Reported-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: Dust Li <dust.li@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net> (cherry picked from commit 349d431) [picked again from upstream after having to revert due to LP: 1929035] Signed-off-by: Frank Heimes <frank.heimes@canonical.com> Acked-by: Tim Gardner <tim.gardner@canonical.com>
1 parent 0590784 commit 2d2419a

File tree

8 files changed

+41
-75
lines changed

8 files changed

+41
-75
lines changed

net/smc/smc.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,11 @@ struct smc_connection {
186186
u16 tx_cdc_seq; /* sequence # for CDC send */
187187
u16 tx_cdc_seq_fin; /* sequence # - tx completed */
188188
spinlock_t send_lock; /* protect wr_sends */
189+
atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe
190+
* - inc when post wqe,
191+
* - dec on polled tx cqe
192+
*/
193+
wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
189194
struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
190195
u32 tx_off; /* base offset in peer rmb */
191196

net/smc/smc_cdc.c

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
3131
struct smc_sock *smc;
3232
int diff;
3333

34-
if (!conn)
35-
/* already dismissed */
36-
return;
37-
3834
smc = container_of(conn, struct smc_sock, conn);
3935
bh_lock_sock(&smc->sk);
4036
if (!wc_status) {
@@ -51,6 +47,12 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
5147
conn);
5248
conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
5349
}
50+
51+
if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
52+
unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
53+
wake_up(&conn->cdc_pend_tx_wq);
54+
WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
55+
5456
smc_tx_sndbuf_nonfull(smc);
5557
bh_unlock_sock(&smc->sk);
5658
}
@@ -107,13 +109,18 @@ int smc_cdc_msg_send(struct smc_connection *conn,
107109
conn->tx_cdc_seq++;
108110
conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
109111
smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed);
112+
113+
atomic_inc(&conn->cdc_pend_tx_wr);
114+
smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
115+
110116
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
111117
if (!rc) {
112118
smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn);
113119
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
114120
} else {
115121
conn->tx_cdc_seq--;
116122
conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
123+
atomic_dec(&conn->cdc_pend_tx_wr);
117124
}
118125

119126
return rc;
@@ -136,7 +143,18 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn,
136143
peer->token = htonl(local->token);
137144
peer->prod_flags.failover_validation = 1;
138145

146+
/* We need to set pend->conn here to make sure smc_cdc_tx_handler()
147+
* can handle properly
148+
*/
149+
smc_cdc_add_pending_send(conn, pend);
150+
151+
atomic_inc(&conn->cdc_pend_tx_wr);
152+
smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
153+
139154
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
155+
if (unlikely(rc))
156+
atomic_dec(&conn->cdc_pend_tx_wr);
157+
140158
return rc;
141159
}
142160

@@ -193,31 +211,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
193211
return rc;
194212
}
195213

196-
static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
197-
unsigned long data)
214+
void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn)
198215
{
199-
struct smc_connection *conn = (struct smc_connection *)data;
200-
struct smc_cdc_tx_pend *cdc_pend =
201-
(struct smc_cdc_tx_pend *)tx_pend;
202-
203-
return cdc_pend->conn == conn;
204-
}
205-
206-
static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
207-
{
208-
struct smc_cdc_tx_pend *cdc_pend =
209-
(struct smc_cdc_tx_pend *)tx_pend;
210-
211-
cdc_pend->conn = NULL;
212-
}
213-
214-
void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
215-
{
216-
struct smc_link *link = conn->lnk;
217-
218-
smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
219-
smc_cdc_tx_filter, smc_cdc_tx_dismisser,
220-
(unsigned long)conn);
216+
wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr));
221217
}
222218

223219
/* Send a SMC-D CDC header.

net/smc/smc_cdc.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn,
291291
struct smc_wr_buf **wr_buf,
292292
struct smc_rdma_wr **wr_rdma_buf,
293293
struct smc_cdc_tx_pend **pend);
294-
void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
294+
void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn);
295295
int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
296296
struct smc_cdc_tx_pend *pend);
297297
int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);

net/smc/smc_core.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,7 +1126,7 @@ void smc_conn_free(struct smc_connection *conn)
11261126
smc_ism_unset_conn(conn);
11271127
tasklet_kill(&conn->rx_tsklet);
11281128
} else {
1129-
smc_cdc_tx_dismiss_slots(conn);
1129+
smc_cdc_wait_pend_tx_wr(conn);
11301130
if (current_work() != &conn->abort_work)
11311131
cancel_work_sync(&conn->abort_work);
11321132
}
@@ -1203,7 +1203,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
12031203
smc_llc_link_clear(lnk, log);
12041204
smcr_buf_unmap_lgr(lnk);
12051205
smcr_rtoken_clear_link(lnk);
1206-
smc_ib_modify_qp_reset(lnk);
1206+
smc_ib_modify_qp_error(lnk);
12071207
smc_wr_free_link(lnk);
12081208
smc_ib_destroy_queue_pair(lnk);
12091209
smc_ib_dealloc_protection_domain(lnk);
@@ -1335,7 +1335,7 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft)
13351335
else
13361336
tasklet_unlock_wait(&conn->rx_tsklet);
13371337
} else {
1338-
smc_cdc_tx_dismiss_slots(conn);
1338+
smc_cdc_wait_pend_tx_wr(conn);
13391339
}
13401340
smc_lgr_unregister_conn(conn);
13411341
smc_close_active_abort(smc);
@@ -1600,7 +1600,6 @@ static void smcr_link_down(struct smc_link *lnk)
16001600
if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list))
16011601
return;
16021602

1603-
smc_ib_modify_qp_reset(lnk);
16041603
to_lnk = smc_switch_conns(lgr, lnk, true);
16051604
if (!to_lnk) { /* no backup link available */
16061605
smcr_link_clear(lnk, true);
@@ -1837,6 +1836,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
18371836
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
18381837
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
18391838
conn->urg_state = SMC_URG_READ;
1839+
init_waitqueue_head(&conn->cdc_pend_tx_wq);
18401840
INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work);
18411841
if (ini->is_smcd) {
18421842
conn->rx_off = sizeof(struct smcd_cdc_msg);

net/smc/smc_ib.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk)
109109
IB_QP_MAX_QP_RD_ATOMIC);
110110
}
111111

112-
int smc_ib_modify_qp_reset(struct smc_link *lnk)
112+
int smc_ib_modify_qp_error(struct smc_link *lnk)
113113
{
114114
struct ib_qp_attr qp_attr;
115115

116116
memset(&qp_attr, 0, sizeof(qp_attr));
117-
qp_attr.qp_state = IB_QPS_RESET;
117+
qp_attr.qp_state = IB_QPS_ERR;
118118
return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
119119
}
120120

net/smc/smc_ib.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk);
9090
int smc_ib_ready_link(struct smc_link *lnk);
9191
int smc_ib_modify_qp_rts(struct smc_link *lnk);
9292
int smc_ib_modify_qp_reset(struct smc_link *lnk);
93+
int smc_ib_modify_qp_error(struct smc_link *lnk);
9394
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
9495
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
9596
struct smc_buf_desc *buf_slot, u8 link_idx);

net/smc/smc_wr.c

Lines changed: 3 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,9 @@ static inline bool smc_wr_is_tx_pend(struct smc_link *link)
6262
}
6363

6464
/* wait till all pending tx work requests on the given link are completed */
65-
int smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
65+
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
6666
{
67-
if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link),
68-
SMC_WR_TX_WAIT_PENDING_TIME))
69-
return 0;
70-
else /* timeout */
71-
return -EPIPE;
67+
wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
7268
}
7369

7470
static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
@@ -87,7 +83,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
8783
struct smc_wr_tx_pend pnd_snd;
8884
struct smc_link *link;
8985
u32 pnd_snd_idx;
90-
int i;
9186

9287
link = wc->qp->qp_context;
9388

@@ -128,14 +123,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
128123
}
129124

130125
if (wc->status) {
131-
for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
132-
/* clear full struct smc_wr_tx_pend including .priv */
133-
memset(&link->wr_tx_pends[i], 0,
134-
sizeof(link->wr_tx_pends[i]));
135-
memset(&link->wr_tx_bufs[i], 0,
136-
sizeof(link->wr_tx_bufs[i]));
137-
clear_bit(i, link->wr_tx_mask);
138-
}
139126
if (link->lgr->smc_version == SMC_V2) {
140127
memset(link->wr_tx_v2_pend, 0,
141128
sizeof(*link->wr_tx_v2_pend));
@@ -421,25 +408,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
421408
return rc;
422409
}
423410

424-
void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type,
425-
smc_wr_tx_filter filter,
426-
smc_wr_tx_dismisser dismisser,
427-
unsigned long data)
428-
{
429-
struct smc_wr_tx_pend_priv *tx_pend;
430-
struct smc_wr_rx_hdr *wr_tx;
431-
int i;
432-
433-
for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
434-
wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i];
435-
if (wr_tx->type != wr_tx_hdr_type)
436-
continue;
437-
tx_pend = &link->wr_tx_pends[i].priv;
438-
if (filter(tx_pend, data))
439-
dismisser(tx_pend);
440-
}
441-
}
442-
443411
/****************************** receive queue ********************************/
444412

445413
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
@@ -675,10 +643,7 @@ void smc_wr_free_link(struct smc_link *lnk)
675643
smc_wr_wakeup_reg_wait(lnk);
676644
smc_wr_wakeup_tx_wait(lnk);
677645

678-
if (smc_wr_tx_wait_no_pending_sends(lnk))
679-
memset(lnk->wr_tx_mask, 0,
680-
BITS_TO_LONGS(SMC_WR_BUF_CNT) *
681-
sizeof(*lnk->wr_tx_mask));
646+
smc_wr_tx_wait_no_pending_sends(lnk);
682647
wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
683648
wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
684649

net/smc/smc_wr.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
2323

2424
#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
25-
#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
2625

2726
#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
2827

@@ -130,7 +129,7 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
130129
smc_wr_tx_filter filter,
131130
smc_wr_tx_dismisser dismisser,
132131
unsigned long data);
133-
int smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
132+
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
134133

135134
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
136135
int smc_wr_rx_post_init(struct smc_link *link);

0 commit comments

Comments
 (0)