Whamcloud - gitweb
LU-11756 o2iblnd: kib_conn leak 47/36347/2
author Andriy Skulysh <c17819@cray.com>
Wed, 22 Aug 2018 15:11:53 +0000 (18:11 +0300)
committer Oleg Drokin <green@whamcloud.com>
Tue, 8 Oct 2019 13:25:52 +0000 (13:25 +0000)
A new tx can be queued while kiblnd_finalise_conn()
is aborting txs. The reference held by the new tx then
prevents the connection from moving into kib_connd_zombies.

Insert any tx queued after the connection has reached
IBLND_CONN_DISCONNECTED into the ibc_zombie_txs list and
abort it during kiblnd_destroy_conn().

Lustre-change: https://review.whamcloud.com/33828
Lustre-commit: a155c3fca38d2a3092f9b5d116ad7877d51d1db1

Change-Id: Ib92d8d02e6e3f66f7140041a330fc00b7ad44ae3
Cray-bug-id: LUS-6412
Signed-off-by: Andriy Skulysh <c17819@cray.com>
Reviewed-by: Alexey Lyashkov <c17817@cray.com>
Reviewed-by: Chris Horn <hornc@cray.com>
Signed-off-by: Minh Diep <mdiep@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/36347
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 229033c..ec16a3d 100644 (file)
@@ -832,6 +832,7 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
        INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
        INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
        INIT_LIST_HEAD(&conn->ibc_active_txs);
+       INIT_LIST_HEAD(&conn->ibc_zombie_txs);
        spin_lock_init(&conn->ibc_lock);
 
        LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
@@ -1045,6 +1046,9 @@ kiblnd_destroy_conn(struct kib_conn *conn)
                        CWARN("Error destroying CQ: %d\n", rc);
        }
 
+       kiblnd_txlist_done(&conn->ibc_zombie_txs, -ECONNABORTED,
+                          LNET_MSG_STATUS_OK);
+
        if (conn->ibc_rx_pages != NULL)
                kiblnd_unmap_rx_descs(conn);
 
index 4d23465..59a2be2 100644 (file)
@@ -692,6 +692,8 @@ struct kib_conn {
        struct list_head        ibc_tx_queue_rsrvd;
        /* active tx awaiting completion */
        struct list_head        ibc_active_txs;
+       /* zombie tx awaiting done */
+       struct list_head        ibc_zombie_txs;
        /* serialise */
        spinlock_t              ibc_lock;
        /* the rx descs */
@@ -1146,6 +1148,7 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
 #define KIBLND_CONN_PARAM(e)            ((e)->param.conn.private_data)
 #define KIBLND_CONN_PARAM_LEN(e)        ((e)->param.conn.private_data_len)
 
+void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs);
 void kiblnd_map_rx_descs(struct kib_conn *conn);
 void kiblnd_unmap_rx_descs(struct kib_conn *conn);
 void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node);
index d07e944..a0edce8 100644 (file)
@@ -1293,6 +1293,21 @@ kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn)
        LASSERT(!tx->tx_queued);        /* not queued for sending already */
        LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
 
+       if (conn->ibc_state >= IBLND_CONN_DISCONNECTED) {
+               tx->tx_status = -ECONNABORTED;
+               tx->tx_waiting = 0;
+               if (tx->tx_conn != NULL) {
+                       /* PUT_DONE first attached to conn as a PUT_REQ */
+                       LASSERT(tx->tx_conn == conn);
+                       LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
+                       tx->tx_conn = NULL;
+                       kiblnd_conn_decref(conn);
+               }
+               list_add(&tx->tx_list, &conn->ibc_zombie_txs);
+
+               return;
+       }
+
        timeout_ns = lnet_get_lnd_timeout() * NSEC_PER_SEC;
        tx->tx_queued = 1;
        tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns);
@@ -2132,7 +2147,7 @@ kiblnd_handle_early_rxs(struct kib_conn *conn)
        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 }
 
-static void
+void
 kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs)
 {
        struct list_head         zombies = LIST_HEAD_INIT(zombies);
@@ -2202,13 +2217,13 @@ kiblnd_finalise_conn(struct kib_conn *conn)
        LASSERT (!in_interrupt());
        LASSERT (conn->ibc_state > IBLND_CONN_INIT);
 
-       kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
-
        /* abort_receives moves QP state to IB_QPS_ERR.  This is only required
         * for connections that didn't get as far as being connected, because
         * rdma_disconnect() does this for free. */
        kiblnd_abort_receives(conn);
 
+       kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
+
        /* Complete all tx descs not waiting for sends to complete.
         * NB we should be safe from RDMA now that the QP has changed state */