From: Liang Zhen Date: Tue, 21 Feb 2012 04:40:25 +0000 (+0800) Subject: LU-78 o2iblnd: kiblnd_check_conns can deadlock X-Git-Tag: 2.1.58~18 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=cc875104bb81313415167425ce21c562ddf540c9 LU-78 o2iblnd: kiblnd_check_conns can deadlock kiblnd_check_conns() called kiblnd_check_sends() with hold of global rwlock, it's wrong because kiblnd_check_sends() could do many things: - call lnet_finalize() which is not safe with hold of spinlock - call kiblnd_close_conn() which requires to write_lock the same global lock - kiblnd_check_sends() might need to allocate NOOP message It can be fixed by moving call of kiblnd_check_sends out from spinlock This patch is from the fix of Bug 20288, with some small changes. Signed-off-by: Liang Zhen Change-Id: Icc9fedc70ecb25b0c41ebaf6d80c971f8281c9c6 Reviewed-on: http://review.whamcloud.com/2166 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Doug Oucharek Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 332928a..59fd3d0 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -573,7 +573,7 @@ typedef struct kib_connvars typedef struct kib_conn { struct kib_peer *ibc_peer; /* owning peer */ - kib_hca_dev_t *ibc_hdev; /* HCA bound on */ + kib_hca_dev_t *ibc_hdev; /* HCA bound on */ cfs_list_t ibc_list; /* stash on peer's conn list */ cfs_list_t ibc_sched_list; /* schedule for attention */ __u16 ibc_version; /* version of connection */ @@ -589,9 +589,14 @@ typedef struct kib_conn int ibc_nrx:16; /* receive buffers owned */ int ibc_scheduled:1; /* scheduled for attention */ int ibc_ready:1; /* CQ callback fired */ - unsigned long ibc_last_send; /* time of last send */ - cfs_list_t ibc_early_rxs; /* rxs completed before ESTABLISHED */ - cfs_list_t ibc_tx_noops; /* IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + /* time of last send */ + unsigned long ibc_last_send; + /** link chain for kiblnd_check_conns only */ + cfs_list_t ibc_connd_list; + /** rxs completed before ESTABLISHED */ + cfs_list_t ibc_early_rxs; + /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + cfs_list_t ibc_tx_noops; cfs_list_t ibc_tx_queue; /* sends that need a credit */ cfs_list_t ibc_tx_queue_nocred;/* sends that don't need a credit */ cfs_list_t ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */ @@ -606,7 +611,7 @@ typedef struct kib_conn kib_connvars_t *ibc_connvars; /* in-progress connection state */ } kib_conn_t; -#define IBLND_CONN_INIT 0 /* being intialised */ +#define IBLND_CONN_INIT 0 /* being initialised */ #define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ #define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ #define IBLND_CONN_ESTABLISHED 3 /* connection established */ @@ -738,7 +743,7 @@ kiblnd_send_keepalive(kib_conn_t *conn) } static inline int -kiblnd_send_noop(kib_conn_t *conn) +kiblnd_need_noop(kib_conn_t *conn) { LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); @@ -752,11 +757,12 @@ kiblnd_send_noop(kib_conn_t *conn) return 0; /* NOOP can be piggybacked */ /* No tx to piggyback NOOP onto or no credit to send a tx */ - return (cfs_list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 0); + return (cfs_list_empty(&conn->ibc_tx_queue) || + conn->ibc_credits == 0); } - if (!cfs_list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ - !cfs_list_empty(&conn->ibc_tx_queue_nocred) || /* can be piggybacked */ + if (!cfs_list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ + !cfs_list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ conn->ibc_credits == 0) /* no credit */ return 0; diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index ce9825d..b95a04f 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -813,7 +813,7 @@ kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit) tx->tx_queued = 0; if (msg->ibm_type == IBLND_MSG_NOOP && - (!kiblnd_send_noop(conn) || /* redundant NOOP */ + (!kiblnd_need_noop(conn) || /* redundant NOOP */ (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */ conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { /* OK to drop when posted enough NOOPs, since @@ -928,7 +928,7 @@ kiblnd_check_sends (kib_conn_t *conn) conn->ibc_reserved_credits--; } - if (kiblnd_send_noop(conn)) { + if (kiblnd_need_noop(conn)) { cfs_spin_unlock(&conn->ibc_lock); tx = kiblnd_get_idle_tx(ni); @@ -2969,14 +2969,11 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) } } -int -kiblnd_check_txs (kib_conn_t *conn, cfs_list_t *txs) +static int +kiblnd_check_txs_locked(kib_conn_t *conn, cfs_list_t *txs) { kib_tx_t *tx; cfs_list_t *ttmp; - int timed_out = 0; - - cfs_spin_lock(&conn->ibc_lock); cfs_list_for_each (ttmp, txs) { tx = cfs_list_entry (ttmp, kib_tx_t, tx_list); @@ -2989,41 +2986,40 @@ kiblnd_check_txs (kib_conn_t *conn, cfs_list_t *txs) } if (cfs_time_aftereq (jiffies, tx->tx_deadline)) { - timed_out = 1; CERROR("Timed out tx: %s, %lu seconds\n", kiblnd_queue2str(conn, txs), cfs_duration_sec(jiffies - tx->tx_deadline)); - break; + return 1; } } - cfs_spin_unlock(&conn->ibc_lock); - return timed_out; + return 0; } -int -kiblnd_conn_timed_out (kib_conn_t *conn) +static int +kiblnd_conn_timed_out_locked(kib_conn_t *conn) { - return kiblnd_check_txs(conn, &conn->ibc_tx_queue) || - kiblnd_check_txs(conn, &conn->ibc_tx_noops) || - kiblnd_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || - kiblnd_check_txs(conn, &conn->ibc_tx_queue_nocred) || - kiblnd_check_txs(conn, &conn->ibc_active_txs); + return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) || + kiblnd_check_txs_locked(conn, &conn->ibc_active_txs); } void kiblnd_check_conns (int idx) { - cfs_list_t *peers = &kiblnd_data.kib_peers[idx]; - cfs_list_t *ptmp; - kib_peer_t *peer; - kib_conn_t *conn; - cfs_list_t *ctmp; - unsigned long flags; + CFS_LIST_HEAD (closes); + CFS_LIST_HEAD (checksends); + cfs_list_t *peers = &kiblnd_data.kib_peers[idx]; + cfs_list_t *ptmp; + kib_peer_t *peer; + kib_conn_t *conn; + cfs_list_t *ctmp; + unsigned long flags; - again: /* NB. We expect to have a look at all the peers and not find any - * rdmas to time out, so we just use a shared lock while we + * RDMAs to time out, so we just use a shared lock while we * take a look... */ cfs_read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); @@ -3031,41 +3027,66 @@ kiblnd_check_conns (int idx) peer = cfs_list_entry (ptmp, kib_peer_t, ibp_list); cfs_list_for_each (ctmp, &peer->ibp_conns) { - conn = cfs_list_entry (ctmp, kib_conn_t, ibc_list); + int timedout; + int sendnoop; + + conn = cfs_list_entry(ctmp, kib_conn_t, ibc_list); LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED); - /* In case we have enough credits to return via a - * NOOP, but there were no non-blocking tx descs - * free to do it last time... */ - kiblnd_check_sends(conn); + cfs_spin_lock(&conn->ibc_lock); - if (!kiblnd_conn_timed_out(conn)) + sendnoop = kiblnd_need_noop(conn); + timedout = kiblnd_conn_timed_out_locked(conn); + if (!sendnoop && !timedout) { + cfs_spin_unlock(&conn->ibc_lock); continue; + } - /* Handle timeout by closing the whole connection. We - * can only be sure RDMA activity has ceased once the - * QP has been modified. */ - - kiblnd_conn_addref(conn); /* 1 ref for me... */ - - cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, - flags); - - CERROR("Timed out RDMA with %s (%lu)\n", - libcfs_nid2str(peer->ibp_nid), - cfs_duration_sec(cfs_time_current() - - peer->ibp_last_alive)); - - kiblnd_close_conn(conn, -ETIMEDOUT); - kiblnd_conn_decref(conn); /* ...until here */ + if (timedout) { + CERROR("Timed out RDMA with %s (%lu): " + "c: %u, oc: %u, rc: %u\n", + libcfs_nid2str(peer->ibp_nid), + cfs_duration_sec(cfs_time_current() - + peer->ibp_last_alive), + conn->ibc_credits, + conn->ibc_outstanding_credits, + conn->ibc_reserved_credits); + cfs_list_add(&conn->ibc_connd_list, &closes); + } else { + cfs_list_add(&conn->ibc_connd_list, + &checksends); + } + /* +ref for 'closes' or 'checksends' */ + kiblnd_conn_addref(conn); - /* start again now I've dropped the lock */ - goto again; + cfs_spin_unlock(&conn->ibc_lock); } } cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Handle timeout by closing the whole + * connection. We can only be sure RDMA activity + * has ceased once the QP has been modified. */ + while (!cfs_list_empty(&closes)) { + conn = cfs_list_entry(closes.next, + kib_conn_t, ibc_connd_list); + cfs_list_del(&conn->ibc_connd_list); + kiblnd_close_conn(conn, -ETIMEDOUT); + kiblnd_conn_decref(conn); + } + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + while (!cfs_list_empty(&checksends)) { + conn = cfs_list_entry(checksends.next, + kib_conn_t, ibc_connd_list); + cfs_list_del(&conn->ibc_connd_list); + kiblnd_check_sends(conn); + kiblnd_conn_decref(conn); + } } void