From 5e30a2c06176f50f5e17aba68fdae7e38d922d33 Mon Sep 17 00:00:00 2001 From: Mr NeilBrown Date: Sat, 18 Jan 2020 09:38:03 -0500 Subject: [PATCH 1/1] LU-10467 lustre: convert most users of LWI_TIMEOUT_INTERVAL() when l_wait_event() is called with an lwi initialised with LWI_TIMEOUT_INTERVAL(t1, t2, NULL, NUL), waits for a total of t1 jiffies, but wakes up every t2 jiffies to check the condition - incase the condition changed without triggering a wakeup. In (nearly) every case, t2 is one second. So this is effectively a poll loop around wait_event_timeout. So replace with with seconds = t1; while (seconds > 0 && wait_event_timeout(q, cond, cfs_time_seconds(1)) == 0) seconds -= 1; Then if seconds is zero at the end, the whole loop timed out. In the one exception ("nearly" above) if t1 is small, t2 is set to one jiffies, so we always wait a little bit and check the condition. For that case, we count to "seconds >= 0" and adjust the timeout accordingly when seconds == 0. Note that in one case, the on_timeout function is target_bulk_timeout() instead of NULL. As this always returns '1', it behaves exactly like passing NULL. Change-Id: I4cddbd2c28f07012cce7915489eedcb668c7e808 Signed-off-by: Mr NeilBrown Reviewed-on: https://review.whamcloud.com/35973 Tested-by: jenkins Reviewed-by: James Simmons Tested-by: Maloo Reviewed-by: Petros Koutoupis Reviewed-by: Shaun Tancheff Reviewed-by: Oleg Drokin --- lustre/ldlm/ldlm_lib.c | 34 +++++++++++----------------------- lustre/lfsck/lfsck_engine.c | 24 ++++++++++++------------ lustre/ptlrpc/client.c | 18 ++++++++---------- lustre/ptlrpc/niobuf.c | 32 ++++++++++++++++---------------- lustre/ptlrpc/service.c | 16 +++++++++------- 5 files changed, 56 insertions(+), 68 deletions(-) diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 5ee714e..5086c28e 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -3306,16 +3306,6 @@ void ldlm_dump_export_locks(struct obd_export *exp) #endif #ifdef HAVE_SERVER_SUPPORT -static int target_bulk_timeout(void *data) -{ - ENTRY; - /* - * We don't fail the connection here, because having the export - * killed makes the (vital) call to commitrw very sad. - */ - RETURN(1); -} - static inline const char *bulk2type(struct ptlrpc_request *req) { if (req->rq_bulk_read) @@ -3330,7 +3320,6 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc) struct ptlrpc_request *req = desc->bd_req; time64_t start = ktime_get_seconds(); time64_t deadline; - struct l_wait_info lwi; int rc = 0; ENTRY; @@ -3375,20 +3364,19 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc) do { time64_t timeoutl = deadline - ktime_get_seconds(); - long timeout_jiffies = timeoutl <= 0 ? - 1 : cfs_time_seconds(timeoutl); time64_t rq_deadline; - lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies, - cfs_time_seconds(1), - target_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, - !ptlrpc_server_bulk_active(desc) || - exp->exp_failed || - exp->exp_conn_cnt > - lustre_msg_get_conn_cnt(req->rq_reqmsg), - &lwi); - LASSERT(rc == 0 || rc == -ETIMEDOUT); + while (timeoutl >= 0 && + wait_event_idle_timeout( + desc->bd_waitq, + !ptlrpc_server_bulk_active(desc) || + exp->exp_failed || + exp->exp_conn_cnt > + lustre_msg_get_conn_cnt(req->rq_reqmsg), + timeoutl ? cfs_time_seconds(1) : 1) == 0) + timeoutl -= 1; + rc = timeoutl < 0 ? -ETIMEDOUT : 0; + /* Wait again if we changed rq_deadline. */ rq_deadline = READ_ONCE(req->rq_deadline); deadline = start + bulk_timeout; diff --git a/lustre/lfsck/lfsck_engine.c b/lustre/lfsck/lfsck_engine.c index 520ff35..914ad83 100644 --- a/lustre/lfsck/lfsck_engine.c +++ b/lustre/lfsck/lfsck_engine.c @@ -1711,6 +1711,8 @@ int lfsck_assistant_engine(void *args) GOTO(cleanup, rc = 0); while (test_bit(LAD_IN_DOUBLE_SCAN, &lad->lad_flags)) { + int seconds = 30; + rc = lfsck_assistant_query_others(env, com); if (lfsck_phase2_next_ready(lad)) goto p2_next; @@ -1720,26 +1722,24 @@ int lfsck_assistant_engine(void *args) /* Pull LFSCK status on related targets once * per 30 seconds if we are not notified. */ - lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(30), - cfs_time_seconds(1), - NULL, NULL); - rc = l_wait_event(athread->t_ctl_waitq, - lfsck_phase2_next_ready(lad) || - test_bit(LAD_EXIT, &lad->lad_flags) || - !thread_is_running(mthread), - &lwi); + while (seconds > 0 && + wait_event_idle_timeout( + athread->t_ctl_waitq, + lfsck_phase2_next_ready(lad) || + test_bit(LAD_EXIT, + &lad->lad_flags) || + !thread_is_running(mthread), + cfs_time_seconds(1)) == 0) + seconds -= 1; if (unlikely( test_bit(LAD_EXIT, &lad->lad_flags) || !thread_is_running(mthread))) GOTO(cleanup, rc = 0); - if (rc == -ETIMEDOUT) + if (seconds == 0) continue; - if (rc < 0) - GOTO(cleanup, rc); - p2_next: rc = lao->la_handler_p2(env, com); if (rc != 0) diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 0182bbd..bee8d3f 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -2741,9 +2741,6 @@ EXPORT_SYMBOL(ptlrpc_req_xid); */ static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) { - int rc; - struct l_wait_info lwi; - /* * Might sleep. */ @@ -2784,24 +2781,25 @@ static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) * unlinked before returning a req to the pool. */ for (;;) { - /* The wq argument is ignored by user-space wait_event macros */ wait_queue_head_t *wq = (request->rq_set) ? &request->rq_set->set_waitq : &request->rq_reply_waitq; + int seconds = LONG_UNLINK; /* * Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), - cfs_time_seconds(1), NULL, NULL); - rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), - &lwi); - if (rc == 0) { + while (seconds > 0 && + wait_event_idle_timeout( + *wq, + !ptlrpc_client_recv_or_unlink(request), + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds > 0) { ptlrpc_rqphase_move(request, request->rq_next_phase); RETURN(1); } - LASSERT(rc == -ETIMEDOUT); DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout receiving_reply=%d req_ulinked=%d reply_unlinked=%d", request->rq_receiving_reply, diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 915e092..c19fecd 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -268,9 +268,6 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) */ void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc) { - struct l_wait_info lwi; - int rc; - LASSERT(!in_interrupt()); /* might sleep */ if (!ptlrpc_server_bulk_active(desc)) /* completed or */ @@ -290,14 +287,16 @@ void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc) for (;;) { /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), - cfs_time_seconds(1), NULL, NULL); - rc = l_wait_event(desc->bd_waitq, - !ptlrpc_server_bulk_active(desc), &lwi); - if (rc == 0) + int seconds = LONG_UNLINK; + + while (seconds > 0 && + wait_event_idle_timeout(desc->bd_waitq, + !ptlrpc_server_bulk_active(desc), + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds > 0) return; - LASSERT(rc == -ETIMEDOUT); CWARN("Unexpectedly long timeout: desc %p\n", desc); } } @@ -437,8 +436,6 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req) int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) { struct ptlrpc_bulk_desc *desc = req->rq_bulk; - struct l_wait_info lwi; - int rc; ENTRY; LASSERT(!in_interrupt()); /* might sleep */ @@ -478,15 +475,18 @@ int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) * Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs. */ - lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), - cfs_time_seconds(1), NULL, NULL); - rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi); - if (rc == 0) { + int seconds = LONG_UNLINK; + + while (seconds > 0 && + wait_event_idle_timeout(*wq, + !ptlrpc_client_bulk_active(req), + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds > 0) { ptlrpc_rqphase_move(req, req->rq_next_phase); RETURN(1); } - LASSERT(rc == -ETIMEDOUT); DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", desc); } diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 64ecbc1..8962690 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -3396,7 +3396,6 @@ ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) { struct ptlrpc_service_part *svcpt; struct ptlrpc_request_buffer_desc *rqbd; - struct l_wait_info lwi; int rc; int i; @@ -3434,18 +3433,21 @@ ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) */ spin_lock(&svcpt->scp_lock); while (svcpt->scp_nrqbds_posted != 0) { + int seconds = LONG_UNLINK; + spin_unlock(&svcpt->scp_lock); /* * Network access will complete in finite time but * the HUGE timeout lets us CWARN for visibility * of sluggish NALs */ - lwi = LWI_TIMEOUT_INTERVAL( - cfs_time_seconds(LONG_UNLINK), - cfs_time_seconds(1), NULL, NULL); - rc = l_wait_event(svcpt->scp_waitq, - svcpt->scp_nrqbds_posted == 0, &lwi); - if (rc == -ETIMEDOUT) { + while (seconds > 0 && + wait_event_idle_timeout( + svcpt->scp_waitq, + svcpt->scp_nrqbds_posted == 0, + cfs_time_seconds(1)) == 0) + seconds -= 1; + if (seconds == 0) { CWARN("Service %s waiting for request buffers\n", svcpt->scp_service->srv_name); } -- 1.8.3.1