Skip to content

Commit fe00cae

Browse files
chrismason-xx authored and gerd-rausch committed
RDS: give up on half formed connections after 15s
RDS relies on events to transition connections through a few different states, but sometimes we get stuck and end up with a half formed connection that is never able to finish. The other end has either wandered off or there are bugs in other layers, and any future attempts from the other end are rejected because we're already working on a connection attempt. This patch changes things to give up on half formed connections after 15 seconds. Signed-off-by: Chris Mason <chris.mason@oracle.com> Signed-off-by: Bang Nguyen <bang.nguyen@oracle.com> Orabug: 27364391 (cherry picked from commit d783cad) cherry-pick-repo=linux-uek.git Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com> Signed-off-by: Somasundaram Krishnasamy <somasundaram.krishnasamy@oracle.com>
1 parent 8e089a7 commit fe00cae

File tree

3 files changed

+41
-4
lines changed

3 files changed

+41
-4
lines changed

net/rds/ib_cm.c

+33-4
Original file line number | Diff line number | Diff line change
@@ -585,20 +585,49 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
585585
*/
586586
mutex_lock(&conn->c_cm_lock);
587587
if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
588+
/*
589+
* in both of the cases below, the conn is half setup.
590+
* we need to make sure the lower layers don't destroy it
591+
*/
592+
ic = conn->c_transport_data;
593+
if (ic && ic->i_cm_id == cm_id)
594+
destroy = 0;
588595
if (rds_conn_state(conn) == RDS_CONN_UP) {
589596
rdsdebug("incoming connect while connecting\n");
590597
rds_conn_drop(conn);
591598
rds_ib_stats_inc(s_ib_listen_closed_stale);
592-
} else
593-
if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
594-
/* Wait and see - our connect may still be succeeding */
595-
rds_ib_stats_inc(s_ib_connect_raced);
599+
} else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
600+
unsigned long now = get_seconds();
601+
602+
/*
603+
* after 15 seconds, give up on existing connection
604+
* attempts and make them try again. At this point
605+
* it's no longer a race but something has gone
606+
* horribly wrong
607+
*/
608+
if (now > conn->c_connection_start &&
609+
now - conn->c_connection_start > 15) {
610+
printk(KERN_CRIT "rds connection racing for 15s, forcing reset "
611+
"connection %pI4->%pI4\n",
612+
&conn->c_laddr, &conn->c_faddr);
613+
rds_conn_drop(conn);
614+
rds_ib_stats_inc(s_ib_listen_closed_stale);
615+
} else {
616+
/* Wait and see - our connect may still be succeeding */
617+
rds_ib_stats_inc(s_ib_connect_raced);
618+
}
596619
}
597620
goto out;
598621
}
599622

600623
ic = conn->c_transport_data;
601624

625+
/*
626+
* record the time we started trying to connect so that we can
627+
* drop the connection if it doesn't work out after a while
628+
*/
629+
conn->c_connection_start = get_seconds();
630+
602631
rds_ib_set_protocol(conn, version);
603632
rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
604633

net/rds/rds.h

+1
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,7 @@ struct rds_connection {
124124

125125
struct list_head c_map_item;
126126
unsigned long c_map_queued;
127+
unsigned long c_connection_start; /* when was this connection started */
127128

128129
unsigned int c_unacked_packets;
129130
unsigned int c_unacked_bytes;

net/rds/threads.c

+7
Original file line number | Diff line number | Diff line change
@@ -89,6 +89,7 @@ void rds_connect_complete(struct rds_connection *conn)
8989
set_bit(0, &conn->c_map_queued);
9090
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
9191
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
92+
conn->c_connection_start = get_seconds();
9293
}
9394
EXPORT_SYMBOL_GPL(rds_connect_complete);
9495

@@ -143,6 +144,12 @@ void rds_connect_worker(struct work_struct *work)
143144

144145
clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
145146
if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
147+
/*
148+
* record the time we started trying to connect so that we can
149+
* drop the connection if it doesn't work out after a while
150+
*/
151+
conn->c_connection_start = get_seconds();
152+
146153
ret = conn->c_trans->conn_connect(conn);
147154
rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
148155
conn, &conn->c_laddr, &conn->c_faddr, ret);

0 commit comments

Comments
 (0)