From: Serguei Smirnov Date: Mon, 8 Mar 2021 17:46:03 +0000 (-0800) Subject: LU-14499 lnet: Revert "LU-13368 lnet: discard the callback" X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=refs%2Fchanges%2F37%2F41937%2F3;p=fs%2Flustre-release.git LU-14499 lnet: Revert "LU-13368 lnet: discard the callback" The changes introduced by LU-13368 have been shown to cause the o2iblnd shutdown procedure to hang on lustre_rmmod because it waits indefinitely for peers to disconnect. Revert it. This reverts commit babf0232273467b7199ec9a7c36047b1968913df. Signed-off-by: Serguei Smirnov Change-Id: I489ae4af445b18df852ec35adc958c4fac33de09 --- diff --git a/ldiskfs/kernel_patches/patches/base/ext4-htree-lock.patch b/ldiskfs/kernel_patches/patches/base/ext4-htree-lock.patch index 2e76284..eaab8de 100644 --- a/ldiskfs/kernel_patches/patches/base/ext4-htree-lock.patch +++ b/ldiskfs/kernel_patches/patches/base/ext4-htree-lock.patch @@ -38,7 +38,7 @@ The idea is simple, like how pdirop is implement at LDLM level. while holding CW lock, we drop CW lock and take EX lock and retry. - disable pdirops by always EX lock on change and PR lock on lookup/readdir. 
-Lustre-bug-id: https://jira.whamcloud.com/browse/LU-50 +Lustre-bug-id: LU-50" target="_blank">https://jira.whamcloud.com/browse/LU-50 Lustre-change: http://review.whamcloud.com/375 Signed-off-by: Liang Zhen diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h index 506b762..b3901f5 100644 --- a/lnet/include/lnet/api.h +++ b/lnet/include/lnet/api.h @@ -125,8 +125,7 @@ int LNetMDBind(const struct lnet_md *md_in, enum lnet_unlink unlink_in, struct lnet_handle_md *md_handle_out); -int __LNetMDUnlink(struct lnet_handle_md md_in, bool discard); -#define LNetMDUnlink(handle) __LNetMDUnlink(handle, false) +int LNetMDUnlink(struct lnet_handle_md md_in); void lnet_assert_handler_unused(lnet_handler_t handler); /** @} lnet_md */ diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 4fdcd87..1b426bc 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -679,7 +679,6 @@ void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg, void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt); void lnet_clean_zombie_rstqs(void); -bool lnet_md_discarded(struct lnet_libmd *md); void lnet_finalize(struct lnet_msg *msg, int rc); bool lnet_send_error_simulation(struct lnet_msg *msg, enum lnet_msg_hstatus *hstatus); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 22a1bdd..ee6cd9d 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -227,7 +227,6 @@ struct lnet_libmd { * call. 
*/ #define LNET_MD_FLAG_HANDLING BIT(3) -#define LNET_MD_FLAG_DISCARD BIT(4) struct lnet_test_peer { /* info about peers we are trying to fail */ diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 152173b..5ab0f77 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -3067,7 +3067,6 @@ kiblnd_base_startup(struct net *ns) spin_lock_init(&kiblnd_data.kib_connd_lock); INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); - INIT_LIST_HEAD(&kiblnd_data.kib_connd_waits); INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list); INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait); diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 3e4013f..2788a24 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -450,8 +450,6 @@ struct kib_data { struct list_head kib_reconn_list; /* peers wait for reconnection */ struct list_head kib_reconn_wait; - /* connections wait for completion */ - struct list_head kib_connd_waits; /* * The second that peers are pulled out from \a kib_reconn_wait * for reconnection. 
@@ -590,8 +588,6 @@ struct kib_conn { __u16 ibc_queue_depth; /* connections max frags */ __u16 ibc_max_frags; - /* count of timeout txs waiting on cq */ - __u16 ibc_waits; /* receive buffers owned */ unsigned int ibc_nrx:16; /* scheduled for attention */ diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 658d0ad..14e7257 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -2135,10 +2135,6 @@ kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) if (tx->tx_sending == 0) { tx->tx_queued = 0; list_move(&tx->tx_list, &zombies); - } else { - /* keep tx until cq destroy */ - list_move(&tx->tx_list, &conn->ibc_zombie_txs); - conn->ibc_waits ++; } } @@ -2153,31 +2149,6 @@ kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK); } -static int -kiblnd_tx_may_discard(struct kib_conn *conn) -{ - int rc = 0; - struct kib_tx *nxt; - struct kib_tx *tx; - - spin_lock(&conn->ibc_lock); - - list_for_each_entry_safe(tx, nxt, &conn->ibc_zombie_txs, tx_list) { - if (tx->tx_sending > 0 && tx->tx_lntmsg[0] && - lnet_md_discarded(tx->tx_lntmsg[0]->msg_md)) { - tx->tx_sending --; - if (tx->tx_sending == 0) { - kiblnd_conn_decref(tx->tx_conn); - tx->tx_conn = NULL; - rc = 1; - } - } - } - - spin_unlock(&conn->ibc_lock); - return rc; -} - static void kiblnd_finalise_conn(struct kib_conn *conn) { @@ -3325,9 +3296,8 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) } if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { - CERROR("Timed out tx: %s(WSQ:%d%d%d), %lld seconds\n", + CERROR("Timed out tx: %s, %lld seconds\n", kiblnd_queue2str(conn, txs), - tx->tx_waiting, tx->tx_sending, tx->tx_queued, kiblnd_timeout() + ktime_ms_delta(ktime_get(), tx->tx_deadline) / MSEC_PER_SEC); @@ -3529,7 +3499,6 @@ kiblnd_connd (void *arg) } if (!list_empty(&kiblnd_data.kib_connd_conns)) { - int wait; conn = 
list_entry(kiblnd_data.kib_connd_conns.next, struct kib_conn, ibc_list); list_del(&conn->ibc_list); @@ -3538,15 +3507,9 @@ kiblnd_connd (void *arg) dropped_lock = 1; kiblnd_disconnect_conn(conn); - wait = conn->ibc_waits; - if (wait == 0) /* keep ref for connd_wait, see below */ - kiblnd_conn_decref(conn); + kiblnd_conn_decref(conn); spin_lock_irqsave(lock, flags); - - if (wait) - list_add_tail(&conn->ibc_list, - &kiblnd_data.kib_connd_waits); } while (reconn < KIB_RECONN_BREAK) { @@ -3574,25 +3537,9 @@ kiblnd_connd (void *arg) spin_lock_irqsave(lock, flags); } - if (!list_empty(&kiblnd_data.kib_connd_waits)) { - conn = list_entry(kiblnd_data.kib_connd_waits.next, - struct kib_conn, ibc_list); - list_del(&conn->ibc_list); - spin_unlock_irqrestore(lock, flags); - - dropped_lock = kiblnd_tx_may_discard(conn); - if (dropped_lock) - kiblnd_conn_decref(conn); - - spin_lock_irqsave(lock, flags); - if (dropped_lock == 0) - list_add_tail(&conn->ibc_list, - &kiblnd_data.kib_connd_waits); - } - /* careful with the jiffy wrap... */ timeout = (int)(deadline - jiffies); - if (timeout <= 0) { + if (timeout <= 0) { const int n = 4; const int p = 1; int chunk = HASH_SIZE(kiblnd_data.kib_peers); diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index dfb8b03..bc10b4a 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ -471,7 +471,7 @@ EXPORT_SYMBOL(LNetMDBind); * \retval -ENOENT If \a mdh does not point to a valid MD object. 
*/ int -__LNetMDUnlink(struct lnet_handle_md mdh, bool discard) +LNetMDUnlink(struct lnet_handle_md mdh) { struct lnet_event ev; struct lnet_libmd *md = NULL; @@ -505,9 +505,6 @@ __LNetMDUnlink(struct lnet_handle_md mdh, bool discard) handler = md->md_handler; } - if (discard) - md->md_flags |= LNET_MD_FLAG_DISCARD; - if (md->md_rspt_ptr != NULL) lnet_detach_rsp_tracker(md, cpt); @@ -520,22 +517,4 @@ __LNetMDUnlink(struct lnet_handle_md mdh, bool discard) return 0; } -EXPORT_SYMBOL(__LNetMDUnlink); - -bool -lnet_md_discarded(struct lnet_libmd *md) -{ - bool rc; - int cpt; - - if (md == NULL) - return false; - - cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); - lnet_res_lock(cpt); - rc = md->md_flags & LNET_MD_FLAG_DISCARD; - lnet_res_unlock(cpt); - - return rc; -} -EXPORT_SYMBOL(lnet_md_discarded); +EXPORT_SYMBOL(LNetMDUnlink); diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index e732457..b8d3c0e 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -2501,10 +2501,8 @@ ptlrpc_client_recv(struct ptlrpc_request *req) return req->rq_receiving_reply; } -#define ptlrpc_cli_wait_unlink(req) __ptlrpc_cli_wait_unlink(req, NULL) - static inline int -__ptlrpc_cli_wait_unlink(struct ptlrpc_request *req, bool *discard) +ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req) { int rc; @@ -2518,15 +2516,6 @@ __ptlrpc_cli_wait_unlink(struct ptlrpc_request *req, bool *discard) return 1; } - if (discard) { - *discard = false; - if (req->rq_reply_unlinked && req->rq_req_unlinked == 0) { - *discard = true; - spin_unlock(&req->rq_lock); - return 1; /* Should call again after LNetMDUnlink */ - } - } - rc = !req->rq_req_unlinked || !req->rq_reply_unlinked || req->rq_receiving_reply; spin_unlock(&req->rq_lock); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 6f9bfbf..bbba44d 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1869,7 +1869,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct 
ptlrpc_request_set *set) * not corrupt any data. */ if (req->rq_phase == RQ_PHASE_UNREG_RPC && - ptlrpc_cli_wait_unlink(req)) + ptlrpc_client_recv_or_unlink(req)) continue; if (req->rq_phase == RQ_PHASE_UNREG_BULK && ptlrpc_client_bulk_active(req)) @@ -1907,7 +1907,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) /* * Check if we still need to wait for unlink. */ - if (ptlrpc_cli_wait_unlink(req) || + if (ptlrpc_client_recv_or_unlink(req) || ptlrpc_client_bulk_active(req)) continue; /* If there is no need to resend, fail it now. */ @@ -2731,7 +2731,6 @@ EXPORT_SYMBOL(ptlrpc_req_xid); */ static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) { - bool discard = false; /* * Might sleep. */ @@ -2746,18 +2745,15 @@ static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) /* * Nothing left to do. */ - if (!__ptlrpc_cli_wait_unlink(request, &discard)) + if (!ptlrpc_client_recv_or_unlink(request)) RETURN(1); LNetMDUnlink(request->rq_reply_md_h); - if (discard) /* Discard the request-out callback */ - __LNetMDUnlink(request->rq_req_md_h, discard); - /* * Let's check it once again. */ - if (!ptlrpc_cli_wait_unlink(request)) + if (!ptlrpc_client_recv_or_unlink(request)) RETURN(1); /* Move to "Unregistering" phase as reply was not unlinked yet. 
*/ @@ -2786,7 +2782,7 @@ static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) while (seconds > 0 && wait_event_idle_timeout( *wq, - !ptlrpc_cli_wait_unlink(request), + !ptlrpc_client_recv_or_unlink(request), cfs_time_seconds(1)) == 0) seconds -= 1; if (seconds > 0) { diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 117c1f2..6bce933b 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -104,15 +104,12 @@ static int ptl_send_buf(struct lnet_handle_md *mdh, void *base, int len, RETURN (0); } -#define mdunlink_iterate_helper(mds, count) \ - __mdunlink_iterate_helper(mds, count, false) -static void __mdunlink_iterate_helper(struct lnet_handle_md *bd_mds, - int count, bool discard) +static void mdunlink_iterate_helper(struct lnet_handle_md *bd_mds, int count) { int i; for (i = 0; i < count; i++) - __LNetMDUnlink(bd_mds[i], discard); + LNetMDUnlink(bd_mds[i]); } #ifdef HAVE_SERVER_SUPPORT @@ -288,7 +285,7 @@ void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc) * but we must still wait_event_idle_timeout() in this case, to give * us a chance to run server_bulk_callback() */ - __mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw, true); + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); for (;;) { /* Network access will complete in finite time but the HUGE