LU-14499 lnet: Revert "LU-13368 lnet: discard the callback" 37/41937/3
author Serguei Smirnov <ssmirnov@whamcloud.com>
Mon, 8 Mar 2021 17:46:03 +0000 (09:46 -0800)
committer Serguei Smirnov <ssmirnov@whamcloud.com>
Mon, 8 Mar 2021 18:27:35 +0000 (10:27 -0800)
The changes introduced by LU-13368 have been shown to cause
the o2iblnd shutdown procedure to hang on lustre_rmmod,
as it waits indefinitely for peers to disconnect. Revert it.
This reverts commit babf0232273467b7199ec9a7c36047b1968913df.

Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I489ae4af445b18df852ec35adc958c4fac33de09
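
For context, a minimal, hypothetical C sketch of the kind of shutdown wait
loop the commit message describes: module unload blocks until all peers have
disconnected, so if something keeps the peer count from reaching zero the
loop never exits and lustre_rmmod appears to hang. The function name and the
counter below are illustrative assumptions, not the actual o2iblnd code.

    #include <linux/atomic.h>
    #include <linux/sched.h>
    #include <linux/jiffies.h>

    /*
     * Hypothetical sketch of a shutdown-style wait loop: the unload path
     * blocks here until every peer has dropped its reference.  If a stray
     * tx or a missing completion keeps the count from reaching zero, this
     * loop spins forever and rmmod never returns.
     */
    static void wait_for_peers_to_disconnect(atomic_t *npeers)
    {
            while (atomic_read(npeers) != 0) {
                    /* give the connection daemon time to tear peers down */
                    schedule_timeout_uninterruptible(HZ);
            }
    }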

ldiskfs/kernel_patches/patches/base/ext4-htree-lock.patch
lnet/include/lnet/api.h
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/lnet/lib-md.c
lustre/include/lustre_net.h
lustre/ptlrpc/client.c
lustre/ptlrpc/niobuf.c

index 2e76284..eaab8de 100644 (file)
@@ -38,7 +38,7 @@ The idea is simple, like how pdirop is implement at LDLM level.
     while holding CW lock, we drop CW lock and take EX lock and retry.
   - disable pdirops by always EX lock on change and PR lock on lookup/readdir.
 
-Lustre-bug-id: https://jira.whamcloud.com/browse/LU-50
+Lustre-bug-id: LU-50" target="_blank">https://jira.whamcloud.com/browse/LU-50
 Lustre-change: http://review.whamcloud.com/375
 Signed-off-by: Liang Zhen <liang@whamcloud.com>
 
index 506b762..b3901f5 100644 (file)
@@ -125,8 +125,7 @@ int LNetMDBind(const struct lnet_md *md_in,
               enum lnet_unlink unlink_in,
               struct lnet_handle_md *md_handle_out);
 
-int __LNetMDUnlink(struct lnet_handle_md md_in, bool discard);
-#define LNetMDUnlink(handle) __LNetMDUnlink(handle, false)
+int LNetMDUnlink(struct lnet_handle_md md_in);
 
 void lnet_assert_handler_unused(lnet_handler_t handler);
 /** @} lnet_md */
index 4fdcd87..1b426bc 100644 (file)
@@ -679,7 +679,6 @@ void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg,
 void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt);
 void lnet_clean_zombie_rstqs(void);
 
-bool lnet_md_discarded(struct lnet_libmd *md);
 void lnet_finalize(struct lnet_msg *msg, int rc);
 bool lnet_send_error_simulation(struct lnet_msg *msg,
                                enum lnet_msg_hstatus *hstatus);
index 22a1bdd..ee6cd9d 100644 (file)
@@ -227,7 +227,6 @@ struct lnet_libmd {
  * call.
  */
 #define LNET_MD_FLAG_HANDLING   BIT(3)
-#define LNET_MD_FLAG_DISCARD    BIT(4)
 
 struct lnet_test_peer {
        /* info about peers we are trying to fail */
index 152173b..5ab0f77 100644 (file)
@@ -3067,7 +3067,6 @@ kiblnd_base_startup(struct net *ns)
 
        spin_lock_init(&kiblnd_data.kib_connd_lock);
        INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
-       INIT_LIST_HEAD(&kiblnd_data.kib_connd_waits);
        INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
        INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list);
        INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait);
index 3e4013f..2788a24 100644 (file)
@@ -450,8 +450,6 @@ struct kib_data {
        struct list_head        kib_reconn_list;
        /* peers wait for reconnection */
        struct list_head        kib_reconn_wait;
-       /* connections wait for completion */
-       struct list_head        kib_connd_waits;
        /*
         * The second that peers are pulled out from \a kib_reconn_wait
         * for reconnection.
@@ -590,8 +588,6 @@ struct kib_conn {
        __u16                   ibc_queue_depth;
        /* connections max frags */
        __u16                   ibc_max_frags;
-       /* count of timeout txs waiting on cq */
-       __u16                   ibc_waits;
        /* receive buffers owned */
        unsigned int            ibc_nrx:16;
        /* scheduled for attention */
index 658d0ad..14e7257 100644 (file)
@@ -2135,10 +2135,6 @@ kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs)
                if (tx->tx_sending == 0) {
                        tx->tx_queued = 0;
                        list_move(&tx->tx_list, &zombies);
-               } else {
-                       /* keep tx until cq destroy */
-                       list_move(&tx->tx_list, &conn->ibc_zombie_txs);
-                       conn->ibc_waits ++;
                }
        }
 
@@ -2153,31 +2149,6 @@ kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs)
        kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK);
 }
 
-static int
-kiblnd_tx_may_discard(struct kib_conn *conn)
-{
-       int rc = 0;
-       struct kib_tx *nxt;
-       struct kib_tx *tx;
-
-       spin_lock(&conn->ibc_lock);
-
-       list_for_each_entry_safe(tx, nxt, &conn->ibc_zombie_txs, tx_list) {
-               if (tx->tx_sending > 0 && tx->tx_lntmsg[0] &&
-                   lnet_md_discarded(tx->tx_lntmsg[0]->msg_md)) {
-                       tx->tx_sending --;
-                       if (tx->tx_sending == 0) {
-                               kiblnd_conn_decref(tx->tx_conn);
-                               tx->tx_conn = NULL;
-                               rc = 1;
-                       }
-               }
-       }
-
-       spin_unlock(&conn->ibc_lock);
-       return rc;
-}
-
 static void
 kiblnd_finalise_conn(struct kib_conn *conn)
 {
@@ -3325,9 +3296,8 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs)
                }
 
                if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) {
-                       CERROR("Timed out tx: %s(WSQ:%d%d%d), %lld seconds\n",
+                       CERROR("Timed out tx: %s, %lld seconds\n",
                               kiblnd_queue2str(conn, txs),
-                              tx->tx_waiting, tx->tx_sending, tx->tx_queued,
                               kiblnd_timeout() +
                               ktime_ms_delta(ktime_get(),
                                              tx->tx_deadline) / MSEC_PER_SEC);
@@ -3529,7 +3499,6 @@ kiblnd_connd (void *arg)
                }
 
                if (!list_empty(&kiblnd_data.kib_connd_conns)) {
-                       int wait;
                        conn = list_entry(kiblnd_data.kib_connd_conns.next,
                                          struct kib_conn, ibc_list);
                        list_del(&conn->ibc_list);
@@ -3538,15 +3507,9 @@ kiblnd_connd (void *arg)
                        dropped_lock = 1;
 
                        kiblnd_disconnect_conn(conn);
-                       wait = conn->ibc_waits;
-                       if (wait == 0) /* keep ref for connd_wait, see below */
-                               kiblnd_conn_decref(conn);
+                       kiblnd_conn_decref(conn);
 
                        spin_lock_irqsave(lock, flags);
-
-                       if (wait)
-                               list_add_tail(&conn->ibc_list,
-                                             &kiblnd_data.kib_connd_waits);
                }
 
                while (reconn < KIB_RECONN_BREAK) {
@@ -3574,25 +3537,9 @@ kiblnd_connd (void *arg)
                        spin_lock_irqsave(lock, flags);
                }
 
-               if (!list_empty(&kiblnd_data.kib_connd_waits)) {
-                       conn = list_entry(kiblnd_data.kib_connd_waits.next,
-                                         struct kib_conn, ibc_list);
-                       list_del(&conn->ibc_list);
-                       spin_unlock_irqrestore(lock, flags);
-
-                       dropped_lock = kiblnd_tx_may_discard(conn);
-                       if (dropped_lock)
-                               kiblnd_conn_decref(conn);
-
-                       spin_lock_irqsave(lock, flags);
-                       if (dropped_lock == 0)
-                               list_add_tail(&conn->ibc_list,
-                                             &kiblnd_data.kib_connd_waits);
-               }
-
                /* careful with the jiffy wrap... */
                timeout = (int)(deadline - jiffies);
-               if (timeout <= 0) {
+                if (timeout <= 0) {
                        const int n = 4;
                        const int p = 1;
                        int chunk = HASH_SIZE(kiblnd_data.kib_peers);
index dfb8b03..bc10b4a 100644 (file)
@@ -471,7 +471,7 @@ EXPORT_SYMBOL(LNetMDBind);
  * \retval -ENOENT If \a mdh does not point to a valid MD object.
  */
 int
-__LNetMDUnlink(struct lnet_handle_md mdh, bool discard)
+LNetMDUnlink(struct lnet_handle_md mdh)
 {
        struct lnet_event ev;
        struct lnet_libmd *md = NULL;
@@ -505,9 +505,6 @@ __LNetMDUnlink(struct lnet_handle_md mdh, bool discard)
                handler = md->md_handler;
        }
 
-       if (discard)
-               md->md_flags |= LNET_MD_FLAG_DISCARD;
-
        if (md->md_rspt_ptr != NULL)
                lnet_detach_rsp_tracker(md, cpt);
 
@@ -520,22 +517,4 @@ __LNetMDUnlink(struct lnet_handle_md mdh, bool discard)
 
        return 0;
 }
-EXPORT_SYMBOL(__LNetMDUnlink);
-
-bool
-lnet_md_discarded(struct lnet_libmd *md)
-{
-       bool rc;
-       int cpt;
-
-       if (md == NULL)
-               return false;
-
-       cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
-       lnet_res_lock(cpt);
-       rc = md->md_flags & LNET_MD_FLAG_DISCARD;
-       lnet_res_unlock(cpt);
-
-       return rc;
-}
-EXPORT_SYMBOL(lnet_md_discarded);
+EXPORT_SYMBOL(LNetMDUnlink);
index e732457..b8d3c0e 100644 (file)
@@ -2501,10 +2501,8 @@ ptlrpc_client_recv(struct ptlrpc_request *req)
        return req->rq_receiving_reply;
 }
 
-#define ptlrpc_cli_wait_unlink(req) __ptlrpc_cli_wait_unlink(req, NULL)
-
 static inline int
-__ptlrpc_cli_wait_unlink(struct ptlrpc_request *req, bool *discard)
+ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
 {
        int rc;
 
@@ -2518,15 +2516,6 @@ __ptlrpc_cli_wait_unlink(struct ptlrpc_request *req, bool *discard)
                return 1;
        }
 
-       if (discard) {
-               *discard = false;
-               if (req->rq_reply_unlinked && req->rq_req_unlinked == 0) {
-                       *discard = true;
-                       spin_unlock(&req->rq_lock);
-                       return 1; /* Should call again after LNetMDUnlink */
-               }
-       }
-
        rc = !req->rq_req_unlinked || !req->rq_reply_unlinked ||
             req->rq_receiving_reply;
        spin_unlock(&req->rq_lock);
index 6f9bfbf..bbba44d 100644 (file)
@@ -1869,7 +1869,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                         * not corrupt any data.
                         */
                        if (req->rq_phase == RQ_PHASE_UNREG_RPC &&
-                           ptlrpc_cli_wait_unlink(req))
+                           ptlrpc_client_recv_or_unlink(req))
                                continue;
                        if (req->rq_phase == RQ_PHASE_UNREG_BULK &&
                            ptlrpc_client_bulk_active(req))
@@ -1907,7 +1907,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                        /*
                         * Check if we still need to wait for unlink.
                         */
-                       if (ptlrpc_cli_wait_unlink(req) ||
+                       if (ptlrpc_client_recv_or_unlink(req) ||
                            ptlrpc_client_bulk_active(req))
                                continue;
                        /* If there is no need to resend, fail it now. */
@@ -2731,7 +2731,6 @@ EXPORT_SYMBOL(ptlrpc_req_xid);
  */
 static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
 {
-       bool discard = false;
        /*
         * Might sleep.
         */
@@ -2746,18 +2745,15 @@ static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
        /*
         * Nothing left to do.
         */
-       if (!__ptlrpc_cli_wait_unlink(request, &discard))
+       if (!ptlrpc_client_recv_or_unlink(request))
                RETURN(1);
 
        LNetMDUnlink(request->rq_reply_md_h);
 
-       if (discard) /* Discard the request-out callback */
-               __LNetMDUnlink(request->rq_req_md_h, discard);
-
        /*
         * Let's check it once again.
         */
-       if (!ptlrpc_cli_wait_unlink(request))
+       if (!ptlrpc_client_recv_or_unlink(request))
                RETURN(1);
 
        /* Move to "Unregistering" phase as reply was not unlinked yet. */
@@ -2786,7 +2782,7 @@ static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
                while (seconds > 0 &&
                       wait_event_idle_timeout(
                               *wq,
-                              !ptlrpc_cli_wait_unlink(request),
+                              !ptlrpc_client_recv_or_unlink(request),
                               cfs_time_seconds(1)) == 0)
                        seconds -= 1;
                if (seconds > 0) {
index 117c1f2..6bce933 100644 (file)
@@ -104,15 +104,12 @@ static int ptl_send_buf(struct lnet_handle_md *mdh, void *base, int len,
        RETURN (0);
 }
 
-#define mdunlink_iterate_helper(mds, count) \
-               __mdunlink_iterate_helper(mds, count, false) 
-static void __mdunlink_iterate_helper(struct lnet_handle_md *bd_mds,
-                                     int count, bool discard)
+static void mdunlink_iterate_helper(struct lnet_handle_md *bd_mds, int count)
 {
        int i;
 
        for (i = 0; i < count; i++)
-               __LNetMDUnlink(bd_mds[i], discard);
+               LNetMDUnlink(bd_mds[i]);
 }
 
 #ifdef HAVE_SERVER_SUPPORT
@@ -288,7 +285,7 @@ void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc)
         * but we must still wait_event_idle_timeout() in this case, to give
         * us a chance to run server_bulk_callback()
         */
-       __mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw, true);
+       mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
 
        for (;;) {
                /* Network access will complete in finite time but the HUGE