From 1aaf8571cb42b36759d92f3beb172f0a15c830cf Mon Sep 17 00:00:00 2001 From: shadow Date: Wed, 3 Dec 2008 04:54:58 +0000 Subject: [PATCH] don't resend llog cancels, fix resend requests for ldlm imports. Branch HEAD b=17695 i=umka i=tappro --- lustre/ptlrpc/client.c | 30 ++++++++++-------------------- lustre/ptlrpc/niobuf.c | 3 +++ lustre/ptlrpc/recov_thread.c | 5 +++++ lustre/tests/replay-single.sh | 27 --------------------------- 4 files changed, 18 insertions(+), 47 deletions(-) diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 0913e85..f06de2c7 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1249,8 +1249,11 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) continue; spin_lock(&imp->imp_lock); - if (ptlrpc_import_delay_req(imp, req, &status)){ + /* put on delay list - only if we wait + * recovery finished - before send */ + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, &imp->imp_delayed_list); spin_unlock(&imp->imp_lock); continue; } @@ -1282,8 +1285,6 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) /* This is re-sending anyways, * let's mark req as resend. */ req->rq_resend = 1; - lustre_msg_add_flags(req->rq_reqmsg, - MSG_RESENT); if (req->rq_bulk) { __u64 old_xid; @@ -1352,17 +1353,8 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) spin_unlock(&req->rq_lock); req->rq_status = after_reply(req); - if (req->rq_resend) { - /* Add this req to the delayed list so - it can be errored if the import is - evicted after recovery. 
*/ - spin_lock(&imp->imp_lock); - list_del_init(&req->rq_list); - list_add_tail(&req->rq_list, - &imp->imp_delayed_list); - spin_unlock(&imp->imp_lock); + if (req->rq_resend) continue; - } /* If there is no bulk associated with this request, * then we're done and should let the interpreter @@ -2154,8 +2146,6 @@ restart: } if (req->rq_resend) { - lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); - if (req->rq_bulk != NULL) { ptlrpc_unregister_bulk(req, 0); @@ -2246,18 +2236,18 @@ after_send: } /* Resend if we need to */ - if (req->rq_resend) { + if (req->rq_resend||req->rq_timedout) { /* ...unless we were specifically told otherwise. */ if (req->rq_no_resend) GOTO(out, rc = -ETIMEDOUT); spin_lock(&imp->imp_lock); + /* we can have rq_timedout on a dlm fake import which does not support + * recovery - but we need to resend the request on this import instead + * of returning an error */ + req->rq_resend = 1; goto restart; } - if (req->rq_timedout) { /* non-recoverable timeout */ - GOTO(out, rc = -ETIMEDOUT); - } - if (!ptlrpc_client_replied(req)) { /* How can this be?
-eeb */ DEBUG_REQ(D_ERROR, req, "!rq_replied: "); diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 786f929..11f6641 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -540,6 +540,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) RETURN(rc); } + if (request->rq_resend) + lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + if (!noreply) { LASSERT (request->rq_replen != 0); if (request->rq_repbuf == NULL) { diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index 69aec83..dc4b13b 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -270,6 +270,11 @@ static int llcd_send(struct llog_canceld_ctxt *llcd) req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; req->rq_interpret_reply = (ptlrpc_interpterer_t)llcd_interpret; req->rq_async_args.pointer_arg[0] = llcd; + + /* llog cancels will be replayed after reconnect, so a resend would do them twice: + * first from the replayed llog, second from the resent rpc */ + req->rq_no_delay = req->rq_no_resend = 1; + rc = ptlrpc_set_add_new_req(&lcm->lcm_pc, req); if (rc) { ptlrpc_request_free(req); diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index e97d286..1135b45 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -1427,33 +1427,6 @@ test_59() { } run_test 59 "test log_commit_thread vs filter_destroy race" -# bug 17323 -test_59b() { - do_facet $SINGLEMDS "lctl set_param debug=+rpctrace" - mkdir -p $DIR/$tdir - createmany -o $DIR/$tdir/$tfile-%d 2000 - sync -#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x606" - unlinkmany $DIR/$tdir/$tfile-%d 2000 - - # make sure that all llcds left ost and nothing left cached - sync - sleep 10 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x0" - - # sleep 2 obd_timeouts from ost to make sure that we get resents.
- local timeout=$(do_facet ost1 lctl get_param -n timeout) - timeout=$((timeout * 2)) - log "Sleep $timeout" - sleep $timeout - do_facet $SINGLEMDS $LCTL dk | grep -q "RESENT cancel req" - local res=$? - rmdir $DIR/$tdir - return $res -} -run_test 59b "resent handle in llog_origin_handle_cancel" - # race between add unlink llog vs cat log init in post_recovery (only for b1_6) # bug 12086: should no oops and No ctxt error for this test test_60() { -- 1.8.3.1