LU-6084 ptlrpc: prevent request timeout growth due to recovery 20/13520/9
author    Mikhail Pershin <mike.pershin@intel.com>
          Tue, 3 Feb 2015 18:30:14 +0000 (10:30 -0800)
committer Oleg Drokin <oleg.drokin@intel.com>
          Mon, 9 Feb 2015 02:40:41 +0000 (02:40 +0000)
Patch fixes the issue of a growing request timeout which occurred
after commit 1d889090 for LU-5079. While the commit itself is correct,
it revealed another issue: if a request is being processed for a long
time on the server, then client adaptive timeouts adapt to that after
the reply is received, and new requests get a bigger timeout. Another
problem is that the server AT history is corrupted by recovery request
processing times, which are not pure service time but also include the
time spent waiting for clients to recover.
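
For context, the client derives each request timeout from a history of
observed service times, so a single long observation raises the timeout
of every subsequent request. A toy model of that feedback loop
(illustration only; at_toy/at_toy_measured are invented names, the real
code is at_measured()/at_get()):

#include <stdio.h>

/* Toy adaptive-timeout estimator; the real history decays over time,
 * this simplification only tracks the worst case observed. */
struct at_toy {
	unsigned int estimate;	/* current service time estimate, seconds */
};

static void at_toy_measured(struct at_toy *at, unsigned int service_time)
{
	if (service_time > at->estimate)
		at->estimate = service_time;
}

int main(void)
{
	struct at_toy at = { .estimate = 15 };

	at_toy_measured(&at, 10);	/* normal traffic: estimate stays low */
	at_toy_measured(&at, 120);	/* one request stuck behind recovery */
	/* Before this patch the 120s observation entered the history, so
	 * every later request started with a ~120s timeout. */
	printf("next timeout estimate: %us\n", at.estimate);
	return 0;
}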

Patch prevents AT stats updates from early replies on the client and
from recovery request processing times on the server.
ptlrpc_at_recv_early_reply() still updates the current request timeout
as requested by the server, but does not include this value in the AT
stats; the real reply will bring that data from the server anyway.
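
On the server side, the recovery branch now extends the request deadline
directly instead of feeding the wait into at_measured(); capping the
extension at a quarter of the recovery window is what guarantees the
"at least 4 early replies" mentioned in the code. A standalone sketch of
that arithmetic (values are illustrative, not from the patch):

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int at_extra = 30;		/* seconds, illustrative */
	unsigned int recovery_window = 300;	/* obd_recovery_timeout */
	/* Each early reply extends the deadline by at most a quarter of
	 * the window, so at least four early replies fit into it. */
	unsigned int ext = min_u(at_extra, recovery_window / 4);

	printf("extension per early reply: %us, replies per window >= %u\n",
	       ext, recovery_window / ext);
	return 0;
}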

Test-Parameters: alwaysuploadlogs testlist=replay-vbr,replay-dual

Signed-off-by: Mikhail Pershin <mike.pershin@intel.com>
Change-Id: Ifcadfd669162013b6ccb386eb2b508bd9f0b22d9
Reviewed-on: http://review.whamcloud.com/13520
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Jian Yu <jian.yu@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/ptlrpc/client.c
lustre/ptlrpc/service.c
lustre/tests/replay-dual.sh
lustre/tests/replay-vbr.sh

diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 3ef3292..f677ea0 100644
@@ -359,27 +359,29 @@ __must_hold(&req->rq_lock)
        rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
        if (rc) {
                spin_lock(&req->rq_lock);
-                RETURN(rc);
-        }
-
-        rc = unpack_reply(early_req);
-        if (rc == 0) {
-                /* Expecting to increase the service time estimate here */
-                ptlrpc_at_adj_service(req,
-                        lustre_msg_get_timeout(early_req->rq_repmsg));
-                ptlrpc_at_adj_net_latency(req,
-                        lustre_msg_get_service_time(early_req->rq_repmsg));
-        }
-
-        sptlrpc_cli_finish_early_reply(early_req);
+               RETURN(rc);
+       }
 
+       rc = unpack_reply(early_req);
        if (rc != 0) {
+               sptlrpc_cli_finish_early_reply(early_req);
                spin_lock(&req->rq_lock);
                RETURN(rc);
        }
 
-       /* Adjust the local timeout for this req */
-       ptlrpc_at_set_req_timeout(req);
+       /* Use the new timeout value just to adjust the local value for this
+        * request; don't include it in at_history. It is unclear yet why the
+        * service time increased and whether it should be counted or skipped,
+        * e.g. it can be a recovery case or some error on the server; the
+        * real reply will add all new data if it is worth adding. */
+       req->rq_timeout = lustre_msg_get_timeout(early_req->rq_repmsg);
+       lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+
+       /* Network latency can be adjusted; it reflects pure network delay */
+       ptlrpc_at_adj_net_latency(req,
+                       lustre_msg_get_service_time(early_req->rq_repmsg));
+
+       sptlrpc_cli_finish_early_reply(early_req);
 
        spin_lock(&req->rq_lock);
        olddl = req->rq_deadline;
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c
index b5b70ab..267685c 100644
@@ -1259,11 +1259,13 @@ ptlrpc_at_remove_timed(struct ptlrpc_request *req)
 static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
 {
        struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
-        struct ptlrpc_request *reqcopy;
-        struct lustre_msg *reqmsg;
-        cfs_duration_t olddl = req->rq_deadline - cfs_time_current_sec();
-        int rc;
-        ENTRY;
+       struct ptlrpc_request *reqcopy;
+       struct lustre_msg *reqmsg;
+       cfs_duration_t olddl = req->rq_deadline - cfs_time_current_sec();
+       time_t  newdl;
+       int rc;
+
+       ENTRY;
 
        if (CFS_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_RECONNECT)) {
                /* don't send early reply */
@@ -1305,10 +1307,11 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
                 * during the recovery period send at least 4 early replies,
                 * spacing them every at_extra if we can. at_estimate should
                 * always equal this fixed value during recovery. */
-               at_measured(&svcpt->scp_at_estimate,
-                           cfs_time_current_sec() -
-                           req->rq_arrival_time.tv_sec + min(at_extra,
-                           req->rq_export->exp_obd->obd_recovery_timeout / 4));
+               /* Don't account request processing time into the AT history
+                * during recovery; it is not pure service time but also
+                * includes waiting time for recovering clients */
+               newdl = cfs_time_current_sec() + min(at_extra,
+                       req->rq_export->exp_obd->obd_recovery_timeout / 4);
        } else {
                /* We want to extend the request deadline by at_extra seconds,
                 * so we set our service estimate to reflect how much time has
@@ -1320,17 +1323,16 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
                at_measured(&svcpt->scp_at_estimate, at_extra +
                            cfs_time_current_sec() -
                            req->rq_arrival_time.tv_sec);
-
+               newdl = req->rq_arrival_time.tv_sec +
+                       at_get(&svcpt->scp_at_estimate);
        }
+
        /* Check to see if we've actually increased the deadline -
         * we may be past adaptive_max */
-       if (req->rq_deadline >= req->rq_arrival_time.tv_sec +
-           at_get(&svcpt->scp_at_estimate)) {
+       if (req->rq_deadline >= newdl) {
                DEBUG_REQ(D_WARNING, req, "Couldn't add any time "
                          "(%ld/%ld), not sending early reply\n",
-                         olddl, req->rq_arrival_time.tv_sec +
-                         at_get(&svcpt->scp_at_estimate) -
-                         cfs_time_current_sec());
+                         olddl, newdl - cfs_time_current_sec());
                RETURN(-ETIMEDOUT);
        }
 
@@ -1388,8 +1390,7 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
 
        if (!rc) {
                /* Adjust our own deadline to what we told the client */
-               req->rq_deadline = req->rq_arrival_time.tv_sec +
-                                  at_get(&svcpt->scp_at_estimate);
+               req->rq_deadline = newdl;
                req->rq_early_count++; /* number sent, server side */
        } else {
                DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
@@ -1951,6 +1952,7 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
                     MSGHDR_AT_SUPPORT) ?
                    /* The max time the client expects us to take */
                    lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
+
         req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
         if (unlikely(deadline == 0)) {
                 DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh
index b8e5094..5cab1a4 100755
@@ -491,9 +491,9 @@ test_20() { #16389
        zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
        local tier2=$((SECONDS - before))
 
-       # timeout is more than 2.25x original timeout
-       ((tier2 < tier1 * 9 / 4)) ||
-               error "recovery time $tier2 >= 2.25x original time $tier1"
+       # timeout is more than 1.5x original timeout
+       ((tier2 < tier1 * 6 / 4)) ||
+               error "recovery time $tier2 >= 1.5x original time $tier1"
 }
 run_test 20 "recovery time is not increasing"
 
diff --git a/lustre/tests/replay-vbr.sh b/lustre/tests/replay-vbr.sh
index f633a0d..ca00df5 100644
@@ -711,12 +711,6 @@ test_7_cycle() {
     do_facet $SINGLEMDS "$LCTL set_param mdd.${!var}.sync_permission=0"
     do_facet $SINGLEMDS "$LCTL set_param mdt.${!var}.commit_on_sharing=0"
 
-       local at_max_saved=0
-       if at_is_enabled; then
-               at_max_saved=$(at_max_get $SINGLEMDS)
-               at_max_set $TIMEOUT mds client
-       fi
-
     do_node $CLIENT1 mkdir -p $DIR/$tdir
     replay_barrier $SINGLEMDS
     # first operation
@@ -736,7 +730,6 @@ test_7_cycle() {
        wait_recovery_complete $SINGLEMDS
        wait_mds_ost_sync || error "wait_mds_ost_sync failed"
 
-       [[ $at_max_saved -eq 0 ]] || at_max_set $at_max_saved mds client
        rm -rf $DIR/$tdir
        return $rc
 }