Whamcloud - gitweb
LU-1329 ptlrpc: resend request on -EINPROGRESS
author Johann Lombardi <johann@whamcloud.com>
Tue, 3 Jul 2012 10:54:54 +0000 (12:54 +0200)
committer Oleg Drokin <green@whamcloud.com>
Thu, 26 Jul 2012 04:49:33 +0000 (00:49 -0400)
It seems that -EINPROGRESS is going to be used for many different
purposes (e.g. for statfs, see review 3198). As a result, ptlrpc looks
like the right place to resend requests on -EINPROGRESS.
Upper layers can still handle -EINPROGRESS themselves by
setting rq_no_retry_einprogress to 1.

Signed-off-by: Johann Lombardi <johann@whamcloud.com>
Change-Id: Iae2a5976666e66be4f6e71f82c5653e5636ba07d
Reviewed-on: http://review.whamcloud.com/3262
Tested-by: Hudson
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Niu Yawei <niu@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_net.h
lustre/include/obd_support.h
lustre/mdc/mdc_locks.c
lustre/osc/osc_request.c
lustre/ost/ost_handler.c
lustre/ptlrpc/client.c
lustre/tests/replay-ost-single.sh

index e09b50e..b5d8985 100644 (file)
@@ -718,7 +718,11 @@ struct ptlrpc_request {
                 rq_committed:1,
                 /* whether the "rq_set" is a valid one */
                 rq_invalid_rqset:1,
-                rq_generation_set:1;
+               rq_generation_set:1,
+               /* do not resend request on -EINPROGRESS */
+               rq_no_retry_einprogress:1;
+
+       unsigned int rq_nr_resend;
 
         enum rq_phase rq_phase; /* one of RQ_PHASE_* */
         enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */
index 665dc89..bcaa60d 100644 (file)
@@ -289,6 +289,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_OST_MAPBLK_ENOSPC       0x228
 #define OBD_FAIL_OST_ENOINO              0x229
 #define OBD_FAIL_OST_DQACQ_NET           0x230
+#define OBD_FAIL_OST_STATFS_EINPROGRESS  0x231
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
index 746012e..855d2ee 100644 (file)
@@ -733,6 +733,11 @@ resend:
         if (IS_ERR(req))
                 RETURN(PTR_ERR(req));
 
+       if (req != NULL && it && it->it_op & IT_CREAT)
+               /* ask ptlrpc not to resend on EINPROGRESS since we have our own
+                * retry logic */
+               req->rq_no_retry_einprogress = 1;
+
         if (resends) {
                 req->rq_generation_set = 1;
                 req->rq_import_generation = generation;
@@ -791,7 +796,7 @@ resend:
                 if (generation == obddev->u.cli.cl_import->imp_generation) {
                         goto resend;
                 } else {
-                        CDEBUG(D_HA, "resned cross eviction\n");
+                       CDEBUG(D_HA, "resend cross eviction\n");
                         RETURN(-EIO);
                 }
         }
index 879b30c..ed0ac4b 100644 (file)
@@ -1251,6 +1251,9 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
         }
         req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
         ptlrpc_at_set_req_timeout(req);
+       /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
+        * retry logic */
+       req->rq_no_retry_einprogress = 1;
 
         if (opc == OST_WRITE)
                 desc = ptlrpc_prep_bulk_imp(req, page_count,
index b9405cb..17a0fb2 100644 (file)
@@ -322,6 +322,9 @@ static int ost_statfs(struct ptlrpc_request *req)
         if (req->rq_status != 0)
                 CERROR("ost: statfs failed: rc %d\n", req->rq_status);
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_OST_STATFS_EINPROGRESS))
+               req->rq_status = -EINPROGRESS;
+
         RETURN(0);
 }
 
index 3ad9a87..aefce46 100644 (file)
@@ -1275,6 +1275,27 @@ static int after_reply(struct ptlrpc_request *req)
                 RETURN(rc);
         }
 
+       /* retry indefinitely on EINPROGRESS */
+       if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
+           ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
+               time_t  now = cfs_time_current_sec();
+
+               DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
+               req->rq_resend = 1;
+               req->rq_nr_resend++;
+
+               /* Readjust the timeout for current conditions */
+               ptlrpc_at_set_req_timeout(req);
+               /* delay resend to give a chance to the server to get ready.
+                * The delay is increased by 1s on every resend and is capped to
+                * the current request timeout (i.e. obd_timeout if AT is off,
+                * or AT service time x 125% + 5s, see at_est2timeout) */
+               if (req->rq_nr_resend > req->rq_timeout)
+                       req->rq_sent = now + req->rq_timeout;
+               else
+                       req->rq_sent = now + req->rq_nr_resend;
+       }
+
         /*
          * Security layer unwrap might ask resend this request.
          */
@@ -1511,7 +1532,12 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
 
                 /* delayed send - skip */
                 if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
-                        continue;
+                       continue;
+
+               /* delayed resend - skip */
+               if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
+                   req->rq_sent > cfs_time_current_sec())
+                       continue;
 
                 if (!(req->rq_phase == RQ_PHASE_RPC ||
                       req->rq_phase == RQ_PHASE_BULK ||
@@ -2042,6 +2068,8 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
 
                 if (req->rq_phase == RQ_PHASE_NEW)
                         deadline = req->rq_sent;
+               else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
+                       deadline = req->rq_sent;
                 else
                         deadline = req->rq_sent + req->rq_timeout;
 
index 0f4194b..fa62200 100755 (executable)
@@ -306,8 +306,7 @@ test_8c() {
 }
 run_test 8c "Verify redo io: redo io should fail after eviction"
 
-
-test_9d() {
+test_8d() {
 #define OBD_FAIL_MDS_DQACQ_NET           0x187
     do_facet $SINGLEMDS "lctl set_param fail_loc=0x187"
     # test the non-intent create path
@@ -338,7 +337,23 @@ test_9d() {
     wait $cpid || return 4
     stat $TDIR/$tfile || error "open failed"
 }
-run_test 9d "Verify redo creation on -EINPROGRESS"
+run_test 8d "Verify redo creation on -EINPROGRESS"
+
+test_8e() {
+       sleep 1 # ensure we have a fresh statfs
+#define OBD_FAIL_OST_STATFS_EINPROGRESS  0x231
+       do_facet ost1 "lctl set_param fail_loc=0x231"
+       df $MOUNT &
+       dfpid=$!
+       sleep $TIMEOUT
+       if ! ps -p $dfpid  > /dev/null 2>&1; then
+                       do_facet ost1 "lctl set_param fail_loc=0"
+                       error "df shouldn't have completed!"
+                       return 1
+       fi
+       do_facet ost1 "lctl set_param fail_loc=0"
+}
+run_test 8e "Verify that ptlrpc resends request on -EINPROGRESS"
 
 complete $(basename $0) $SECONDS
 check_and_cleanup_lustre