From babf0232273467b7199ec9a7c36047b1968913df Mon Sep 17 00:00:00 2001
From: Yang Sheng
Date: Mon, 4 May 2020 18:57:33 +0800
Subject: [PATCH] LU-13368 lnet: discard the callback

Lustre needs a completion callback for the event that a request has
been sent, and then another callback when the reply arrives.
Sometimes the request completion callback is lost for some reason
even though the reply has been received; the system then waits
forever, even past the timeout. There is no need to wait for the
request completion in that case, so provide a way to discard the
callback.

Signed-off-by: Yang Sheng
Change-Id: If9cd8420ee76947ee5053180e0f5219f76bb94c2
Reviewed-on: https://review.whamcloud.com/38845
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Amir Shehata
Reviewed-by: Cyril Bordage
Reviewed-by: Oleg Drokin
---
 .../patches/base/ext4-htree-lock.patch |  2 +-
 lnet/include/lnet/api.h                |  3 +-
 lnet/include/lnet/lib-lnet.h           |  1 +
 lnet/include/lnet/lib-types.h          |  1 +
 lnet/klnds/o2iblnd/o2iblnd.c           |  1 +
 lnet/klnds/o2iblnd/o2iblnd.h           |  4 ++
 lnet/klnds/o2iblnd/o2iblnd_cb.c        | 57 +++++++++++++++++++++-
 lnet/lnet/lib-md.c                     | 25 +++++++++-
 lustre/include/lustre_net.h            | 13 ++++-
 lustre/ptlrpc/client.c                 | 14 ++++--
 lustre/ptlrpc/niobuf.c                 |  9 ++--
 11 files changed, 115 insertions(+), 15 deletions(-)

diff --git a/ldiskfs/kernel_patches/patches/base/ext4-htree-lock.patch b/ldiskfs/kernel_patches/patches/base/ext4-htree-lock.patch
index eaab8de..2e76284 100644
--- a/ldiskfs/kernel_patches/patches/base/ext4-htree-lock.patch
+++ b/ldiskfs/kernel_patches/patches/base/ext4-htree-lock.patch
@@ -38,7 +38,7 @@ The idea is simple, like how pdirop is implement at LDLM level.
   while holding CW lock, we drop CW lock and take EX lock and retry.
 - disable pdirops by always EX lock on change and PR lock on lookup/readdir.
 
-Lustre-bug-id: LU-50" target="_blank">https://jira.whamcloud.com/browse/LU-50
+Lustre-bug-id: https://jira.whamcloud.com/browse/LU-50
 Lustre-change: http://review.whamcloud.com/375
 Signed-off-by: Liang Zhen
 
diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h
index b3901f5..506b762 100644
--- a/lnet/include/lnet/api.h
+++ b/lnet/include/lnet/api.h
@@ -125,7 +125,8 @@ int LNetMDBind(const struct lnet_md *md_in,
                enum lnet_unlink unlink_in,
                struct lnet_handle_md *md_handle_out);
 
-int LNetMDUnlink(struct lnet_handle_md md_in);
+int __LNetMDUnlink(struct lnet_handle_md md_in, bool discard);
+#define LNetMDUnlink(handle) __LNetMDUnlink(handle, false)
 
 void lnet_assert_handler_unused(lnet_handler_t handler);
 /** @} lnet_md */
diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h
index c7d7afc..c3202cb 100644
--- a/lnet/include/lnet/lib-lnet.h
+++ b/lnet/include/lnet/lib-lnet.h
@@ -669,6 +669,7 @@ void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg,
 void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt);
 void lnet_clean_zombie_rstqs(void);
 
+bool lnet_md_discarded(struct lnet_libmd *md);
 void lnet_finalize(struct lnet_msg *msg, int rc);
 bool lnet_send_error_simulation(struct lnet_msg *msg,
                                 enum lnet_msg_hstatus *hstatus);
diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h
index 1a71059..9500550 100644
--- a/lnet/include/lnet/lib-types.h
+++ b/lnet/include/lnet/lib-types.h
@@ -226,6 +226,7 @@ struct lnet_libmd {
  * call.
  */
 #define LNET_MD_FLAG_HANDLING    BIT(3)
+#define LNET_MD_FLAG_DISCARD     BIT(4)
 
 struct lnet_test_peer {
         /* info about peers we are trying to fail */
diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c
index dbd11e95..0ad8553 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/lnet/klnds/o2iblnd/o2iblnd.c
@@ -3052,6 +3052,7 @@ kiblnd_base_startup(struct net *ns)
         spin_lock_init(&kiblnd_data.kib_connd_lock);
         INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
+        INIT_LIST_HEAD(&kiblnd_data.kib_connd_waits);
         INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
         INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list);
         INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait);
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
index 8ff3b8c..5334403 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/lnet/klnds/o2iblnd/o2iblnd.h
@@ -437,6 +437,8 @@ struct kib_data {
         struct list_head        kib_reconn_list;
         /* peers wait for reconnection */
         struct list_head        kib_reconn_wait;
+        /* connections wait for completion */
+        struct list_head        kib_connd_waits;
         /*
          * The second that peers are pulled out from \a kib_reconn_wait
          * for reconnection.
@@ -690,6 +692,8 @@ struct kib_conn {
         __u16                   ibc_queue_depth;
         /* connections max frags */
         __u16                   ibc_max_frags;
+        /* count of timeout txs waiting on cq */
+        __u16                   ibc_waits;
         /* receive buffers owned */
         unsigned int            ibc_nrx:16;
         /* scheduled for attention */
diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
index a7ededd..c6ad2b4 100644
--- a/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -2119,6 +2119,10 @@ kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs)
                 if (tx->tx_sending == 0) {
                         tx->tx_queued = 0;
                         list_move(&tx->tx_list, &zombies);
+                } else {
+                        /* keep tx until cq destroy */
+                        list_move(&tx->tx_list, &conn->ibc_zombie_txs);
+                        conn->ibc_waits ++;
                 }
         }
 
@@ -2133,6 +2137,31 @@ kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs)
         kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK);
 }
 
+static int
+kiblnd_tx_may_discard(struct kib_conn *conn)
+{
+        int rc = 0;
+        struct kib_tx *nxt;
+        struct kib_tx *tx;
+
+        spin_lock(&conn->ibc_lock);
+
+        list_for_each_entry_safe(tx, nxt, &conn->ibc_zombie_txs, tx_list) {
+                if (tx->tx_sending > 0 && tx->tx_lntmsg[0] &&
+                    lnet_md_discarded(tx->tx_lntmsg[0]->msg_md)) {
+                        tx->tx_sending --;
+                        if (tx->tx_sending == 0) {
+                                kiblnd_conn_decref(tx->tx_conn);
+                                tx->tx_conn = NULL;
+                                rc = 1;
+                        }
+                }
+        }
+
+        spin_unlock(&conn->ibc_lock);
+        return rc;
+}
+
 static void
 kiblnd_finalise_conn(struct kib_conn *conn)
 {
@@ -3283,8 +3312,9 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs)
                 }
 
                 if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) {
-                        CERROR("Timed out tx: %s, %lld seconds\n",
+                        CERROR("Timed out tx: %s(WSQ:%d%d%d), %lld seconds\n",
                                kiblnd_queue2str(conn, txs),
+                               tx->tx_waiting, tx->tx_sending, tx->tx_queued,
                                kiblnd_timeout() +
                                ktime_ms_delta(ktime_get(),
                                               tx->tx_deadline) / MSEC_PER_SEC);
@@ -3486,6 +3516,7 @@ kiblnd_connd (void *arg)
                 }
 
                 if (!list_empty(&kiblnd_data.kib_connd_conns)) {
+                        int wait;
                         conn = list_entry(kiblnd_data.kib_connd_conns.next,
                                           struct kib_conn, ibc_list);
                         list_del(&conn->ibc_list);
@@ -3494,9 +3525,15 @@ kiblnd_connd (void *arg)
                         dropped_lock = 1;
 
                         kiblnd_disconnect_conn(conn);
-                        kiblnd_conn_decref(conn);
+                        wait = conn->ibc_waits;
+                        if (wait == 0) /* keep ref for connd_wait, see below */
+                                kiblnd_conn_decref(conn);
 
                         spin_lock_irqsave(lock, flags);
+
+                        if (wait)
+                                list_add_tail(&conn->ibc_list,
+                                              &kiblnd_data.kib_connd_waits);
                 }
 
                 while (reconn < KIB_RECONN_BREAK) {
@@ -3524,6 +3561,22 @@ kiblnd_connd (void *arg)
                         spin_lock_irqsave(lock, flags);
                 }
 
+                if (!list_empty(&kiblnd_data.kib_connd_waits)) {
+                        conn = list_entry(kiblnd_data.kib_connd_waits.next,
+                                          struct kib_conn, ibc_list);
+                        list_del(&conn->ibc_list);
+                        spin_unlock_irqrestore(lock, flags);
+
+                        dropped_lock = kiblnd_tx_may_discard(conn);
+                        if (dropped_lock)
+                                kiblnd_conn_decref(conn);
+
+                        spin_lock_irqsave(lock, flags);
+                        if (dropped_lock == 0)
+                                list_add_tail(&conn->ibc_list,
+                                              &kiblnd_data.kib_connd_waits);
+                }
+
                 /* careful with the jiffy wrap... */
                 timeout = (int)(deadline - jiffies);
                 if (timeout <= 0) {
diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c
index bc10b4a..dfb8b03 100644
--- a/lnet/lnet/lib-md.c
+++ b/lnet/lnet/lib-md.c
@@ -471,7 +471,7 @@ EXPORT_SYMBOL(LNetMDBind);
  * \retval -ENOENT If \a mdh does not point to a valid MD object.
  */
 int
-LNetMDUnlink(struct lnet_handle_md mdh)
+__LNetMDUnlink(struct lnet_handle_md mdh, bool discard)
 {
         struct lnet_event ev;
         struct lnet_libmd *md = NULL;
@@ -505,6 +505,9 @@ LNetMDUnlink(struct lnet_handle_md mdh)
                 handler = md->md_handler;
         }
 
+        if (discard)
+                md->md_flags |= LNET_MD_FLAG_DISCARD;
+
         if (md->md_rspt_ptr != NULL)
                 lnet_detach_rsp_tracker(md, cpt);
 
@@ -517,4 +520,22 @@ LNetMDUnlink(struct lnet_handle_md mdh)
 
         return 0;
 }
-EXPORT_SYMBOL(LNetMDUnlink);
+EXPORT_SYMBOL(__LNetMDUnlink);
+
+bool
+lnet_md_discarded(struct lnet_libmd *md)
+{
+        bool rc;
+        int cpt;
+
+        if (md == NULL)
+                return false;
+
+        cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+        lnet_res_lock(cpt);
+        rc = md->md_flags & LNET_MD_FLAG_DISCARD;
+        lnet_res_unlock(cpt);
+
+        return rc;
+}
+EXPORT_SYMBOL(lnet_md_discarded);
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index 7dad281..f34a272 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -2501,8 +2501,10 @@ ptlrpc_client_recv(struct ptlrpc_request *req)
         return req->rq_receiving_reply;
 }
 
+#define ptlrpc_cli_wait_unlink(req) __ptlrpc_cli_wait_unlink(req, NULL)
+
 static inline int
-ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
+__ptlrpc_cli_wait_unlink(struct ptlrpc_request *req, bool *discard)
 {
         int rc;
 
@@ -2516,6 +2518,15 @@ ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
                 return 1;
         }
 
+        if (discard) {
+                *discard = false;
+                if (req->rq_reply_unlinked && req->rq_req_unlinked == 0) {
+                        *discard = true;
+                        spin_unlock(&req->rq_lock);
+                        return 1; /* Should call again after LNetMDUnlink */
+                }
+        }
+
         rc = !req->rq_req_unlinked || !req->rq_reply_unlinked ||
              req->rq_receiving_reply;
         spin_unlock(&req->rq_lock);
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 4ec69b3..7ee5324 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -1880,7 +1880,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                          * not corrupt any data.
                          */
                         if (req->rq_phase == RQ_PHASE_UNREG_RPC &&
-                            ptlrpc_client_recv_or_unlink(req))
+                            ptlrpc_cli_wait_unlink(req))
                                 continue;
                         if (req->rq_phase == RQ_PHASE_UNREG_BULK &&
                             ptlrpc_client_bulk_active(req))
                                 continue;
@@ -1918,7 +1918,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                         /*
                          * Check if we still need to wait for unlink.
                          */
-                        if (ptlrpc_client_recv_or_unlink(req) ||
+                        if (ptlrpc_cli_wait_unlink(req) ||
                             ptlrpc_client_bulk_active(req))
                                 continue;
                         /* If there is no need to resend, fail it now. */
@@ -2758,6 +2758,7 @@ EXPORT_SYMBOL(ptlrpc_req_xid);
  */
 static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
 {
+        bool discard = false;
         /*
          * Might sleep.
          */
@@ -2772,15 +2773,18 @@ static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
         /*
          * Nothing left to do.
         */
-        if (!ptlrpc_client_recv_or_unlink(request))
+        if (!__ptlrpc_cli_wait_unlink(request, &discard))
                 RETURN(1);
 
         LNetMDUnlink(request->rq_reply_md_h);
 
+        if (discard) /* Discard the request-out callback */
+                __LNetMDUnlink(request->rq_req_md_h, discard);
+
         /*
          * Let's check it once again.
          */
-        if (!ptlrpc_client_recv_or_unlink(request))
+        if (!ptlrpc_cli_wait_unlink(request))
                 RETURN(1);
 
         /* Move to "Unregistering" phase as reply was not unlinked yet. */
@@ -2809,7 +2813,7 @@ static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
                 while (seconds > 0 &&
                        wait_event_idle_timeout(
                                *wq,
-                               !ptlrpc_client_recv_or_unlink(request),
+                               !ptlrpc_cli_wait_unlink(request),
                                cfs_time_seconds(1)) == 0)
                         seconds -= 1;
                 if (seconds > 0) {
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c
index 6bce933b..117c1f2 100644
--- a/lustre/ptlrpc/niobuf.c
+++ b/lustre/ptlrpc/niobuf.c
@@ -104,12 +104,15 @@ static int ptl_send_buf(struct lnet_handle_md *mdh, void *base, int len,
         RETURN (0);
 }
 
-static void mdunlink_iterate_helper(struct lnet_handle_md *bd_mds, int count)
+#define mdunlink_iterate_helper(mds, count) \
+        __mdunlink_iterate_helper(mds, count, false)
+static void __mdunlink_iterate_helper(struct lnet_handle_md *bd_mds,
+                                      int count, bool discard)
 {
         int i;
 
         for (i = 0; i < count; i++)
-                LNetMDUnlink(bd_mds[i]);
+                __LNetMDUnlink(bd_mds[i], discard);
 }
 
 #ifdef HAVE_SERVER_SUPPORT
@@ -285,7 +288,7 @@ void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc)
          * but we must still wait_event_idle_timeout() in this case, to give
          * us a chance to run server_bulk_callback()
          */
-        mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+        __mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw, true);
 
         for (;;) {
                 /* Network access will complete in finite time but the HUGE
-- 
1.8.3.1
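
Note for reviewers: the client-side call pattern introduced above can be summarized by the sketch below. It is distilled from the ptlrpc_unregister_reply() hunk in this patch, relies on the Lustre/LNet declarations added here (__ptlrpc_cli_wait_unlink(), __LNetMDUnlink(), struct ptlrpc_request), and is not standalone kernel code; the wrapper name example_unregister_reply() is hypothetical.

/* Sketch only: how a caller drops a lost request-out callback once the
 * reply has already arrived, following the flow of this patch. */
static int example_unregister_reply(struct ptlrpc_request *request)
{
	bool discard = false;

	/* Returns 0 once both the request-out and reply-in MDs are
	 * unlinked and no reply is being received; sets *discard when
	 * the reply MD is unlinked but the request-out completion
	 * never showed up. */
	if (!__ptlrpc_cli_wait_unlink(request, &discard))
		return 1;		/* nothing left to do */

	/* Unlink the reply MD as before... */
	LNetMDUnlink(request->rq_reply_md_h);

	/* ...and, only in the lost-callback case, mark the request MD
	 * so LNet discards the pending callback instead of making the
	 * client wait for it forever. */
	if (discard)
		__LNetMDUnlink(request->rq_req_md_h, true);

	/* The caller then re-checks with ptlrpc_cli_wait_unlink() and,
	 * if still busy, waits or polls as in the patch. */
	return !ptlrpc_cli_wait_unlink(request);
}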