From: Mikhail Pershin
Date: Mon, 11 Nov 2013 17:42:43 +0000 (+0400)
Subject: LU-793 ptlrpc: allow client to reconnect with RPC in progress
X-Git-Tag: 2.5.52~28
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=da6a28fc59a70226dd89bac95880f7791f1db075;p=fs%2Flustre-release.git

LU-793 ptlrpc: allow client to reconnect with RPC in progress

Since refusing to let clients with pending RPCs reconnect caused a lot
of problems, this patch allows such reconnections and only refuses to
process requests whose XIDs match requests already being processed.
Bulk requests are aborted upon reconnection by comparing the connection
counts of the request and the export.

Signed-off-by: Mikhail Pershin
Change-Id: Ic4b23a71fd288df02d1040d98867373ae06b60f6
Reviewed-on: http://review.whamcloud.com/4960
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: Johann Lombardi
Reviewed-by: Alexander Boyko
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h
index 2a01b60..3759ebf 100644
--- a/lustre/include/lustre_export.h
+++ b/lustre/include/lustre_export.h
@@ -232,9 +232,6 @@ struct obd_export {
 				  exp_flvr_changed:1,
 				  exp_flvr_adapt:1,
 				  exp_libclient:1, /* liblustre client? */
-				  /* client timed out and tried to reconnect,
-				   * but couldn't because of active rpcs */
-				  exp_abort_active_req:1,
 				  /* if to swap nidtbl entries for 2.2 clients.
 				   * Only used by the MGS to fix LU-1644. */
 				  exp_need_mne_swab:1;
@@ -247,6 +244,7 @@ struct obd_export {
 	/** protects exp_hp_rpcs */
 	spinlock_t		  exp_rpc_lock;
 	cfs_list_t		  exp_hp_rpcs;	/* (potential) HP RPCs */
+	cfs_list_t		  exp_reg_rpcs;  /* RPC being handled */
 
 	/** blocking dlm lock list, protected by exp_bl_list_lock */
 	cfs_list_t		  exp_bl_list;
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index 6b18781..8360290 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -976,21 +976,6 @@ no_export:
 			      libcfs_nid2str(req->rq_peer.nid),
 			      cfs_atomic_read(&export->exp_refcount));
 		GOTO(out, rc = -EBUSY);
-	} else if (req->rq_export != NULL &&
-		   (cfs_atomic_read(&export->exp_rpc_count) > 1)) {
-		/* The current connect RPC has increased exp_rpc_count. */
-		LCONSOLE_WARN("%s: Client %s (at %s) refused reconnection, "
-			      "still busy with %d active RPCs\n",
-			      target->obd_name, cluuid.uuid,
-			      libcfs_nid2str(req->rq_peer.nid),
-			      cfs_atomic_read(&export->exp_rpc_count) - 1);
-		spin_lock(&export->exp_lock);
-		if (req->rq_export->exp_conn_cnt <
-		    lustre_msg_get_conn_cnt(req->rq_reqmsg))
-			/* try to abort active requests */
-			req->rq_export->exp_abort_active_req = 1;
-		spin_unlock(&export->exp_lock);
-		GOTO(out, rc = -EBUSY);
 	} else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) {
 		if (!strstr(cluuid.uuid, "mdt"))
 			LCONSOLE_WARN("%s: Rejecting reconnect from the "
@@ -1013,7 +998,7 @@ no_export:
 	       export, (long)cfs_time_current_sec(),
 	       export ? (long)export->exp_last_request_time : 0);
 
-	/* If this is the first time a client connects, reset the recovery
+	/* If this is the first time a client connects, reset the recovery
 	 * timer. Discard lightweight connections which might be local. */
 	if (!lw_client && rc == 0 && target->obd_recovering)
 		check_and_start_recovery_timer(target, req, export == NULL);
@@ -1132,7 +1117,6 @@ dont_check_exports:
 	}
 	LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0);
 	export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
-	export->exp_abort_active_req = 0;
 
 	/* Don't evict liblustre clients for not pinging. */
 	if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
@@ -2643,89 +2627,91 @@ static inline char *bulk2type(struct ptlrpc_bulk_desc *desc)
 int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
 		   struct l_wait_info *lwi)
 {
-        struct ptlrpc_request *req = desc->bd_req;
-        int rc = 0;
-        ENTRY;
+	struct ptlrpc_request *req = desc->bd_req;
+	time_t start = cfs_time_current_sec();
+	int rc = 0;
+
+	ENTRY;
 
 	/* If there is eviction in progress, wait for it to finish. */
-        if (unlikely(cfs_atomic_read(&exp->exp_obd->obd_evict_inprogress))) {
-                *lwi = LWI_INTR(NULL, NULL);
-                rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq,
-                                  !cfs_atomic_read(&exp->exp_obd->
-                                                   obd_evict_inprogress),
-                                  lwi);
-        }
+	if (unlikely(cfs_atomic_read(&exp->exp_obd->obd_evict_inprogress))) {
+		*lwi = LWI_INTR(NULL, NULL);
+		rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq,
+				  !cfs_atomic_read(&exp->exp_obd->
+						   obd_evict_inprogress),
+				  lwi);
+	}
 
-        /* Check if client was evicted or tried to reconnect already. */
-        if (exp->exp_failed || exp->exp_abort_active_req) {
-                rc = -ENOTCONN;
-        } else {
-                if (desc->bd_type == BULK_PUT_SINK)
-                        rc = sptlrpc_svc_wrap_bulk(req, desc);
-                if (rc == 0)
-                        rc = ptlrpc_start_bulk_transfer(desc);
-        }
-
-        if (rc == 0 && OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) {
-                ptlrpc_abort_bulk(desc);
-        } else if (rc == 0) {
-                time_t start = cfs_time_current_sec();
-                do {
-                        long timeoutl = req->rq_deadline - cfs_time_current_sec();
-                        cfs_duration_t timeout = timeoutl <= 0 ?
-                                CFS_TICK : cfs_time_seconds(timeoutl);
-                        *lwi = LWI_TIMEOUT_INTERVAL(timeout,
-                                                    cfs_time_seconds(1),
-                                                    target_bulk_timeout,
-                                                    desc);
-                        rc = l_wait_event(desc->bd_waitq,
-                                          !ptlrpc_server_bulk_active(desc) ||
-                                          exp->exp_failed ||
-                                          exp->exp_abort_active_req,
-                                          lwi);
-                        LASSERT(rc == 0 || rc == -ETIMEDOUT);
-                        /* Wait again if we changed deadline. */
-                } while ((rc == -ETIMEDOUT) &&
-                         (req->rq_deadline > cfs_time_current_sec()));
-
-                if (rc == -ETIMEDOUT) {
-                        DEBUG_REQ(D_ERROR, req,
-                                  "timeout on bulk %s after %ld%+lds",
-                                  bulk2type(desc),
-                                  req->rq_deadline - start,
-                                  cfs_time_current_sec() -
-                                  req->rq_deadline);
-                        ptlrpc_abort_bulk(desc);
-                } else if (exp->exp_failed) {
-                        DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",
-                                  bulk2type(desc));
-                        rc = -ENOTCONN;
-                        ptlrpc_abort_bulk(desc);
-                } else if (exp->exp_abort_active_req) {
-                        DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s",
-                                  bulk2type(desc));
-                        /* We don't reply anyway. */
-                        rc = -ETIMEDOUT;
-                        ptlrpc_abort_bulk(desc);
-                } else if (desc->bd_failure ||
-                           desc->bd_nob_transferred != desc->bd_nob) {
-                        DEBUG_REQ(D_ERROR, req, "%s bulk %s %d(%d)",
-                                  desc->bd_failure ?
-                                  "network error on" : "truncated",
-                                  bulk2type(desc),
-                                  desc->bd_nob_transferred,
-                                  desc->bd_nob);
-                        /* XXX Should this be a different errno? */
-                        rc = -ETIMEDOUT;
-                } else if (desc->bd_type == BULK_GET_SINK) {
-                        rc = sptlrpc_svc_unwrap_bulk(req, desc);
-                }
-        } else {
-                DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc %d",
-                          bulk2type(desc), rc);
-        }
+	/* Check if client was evicted or reconnected already. */
+	if (exp->exp_failed ||
+	    exp->exp_conn_cnt > lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
+		rc = -ENOTCONN;
+	} else {
+		if (desc->bd_type == BULK_PUT_SINK)
+			rc = sptlrpc_svc_wrap_bulk(req, desc);
+		if (rc == 0)
+			rc = ptlrpc_start_bulk_transfer(desc);
+	}
 
-        RETURN(rc);
+	if (rc < 0) {
+		DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc %d",
+			  bulk2type(desc), rc);
+		RETURN(rc);
+	}
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) {
+		ptlrpc_abort_bulk(desc);
+		RETURN(0);
+	}
+
+	do {
+		long timeoutl = req->rq_deadline - cfs_time_current_sec();
+		cfs_duration_t timeout = timeoutl <= 0 ?
+					 CFS_TICK : cfs_time_seconds(timeoutl);
+
+		*lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
+					    target_bulk_timeout, desc);
+		rc = l_wait_event(desc->bd_waitq,
+				  !ptlrpc_server_bulk_active(desc) ||
+				  exp->exp_failed ||
+				  exp->exp_conn_cnt >
+				  lustre_msg_get_conn_cnt(req->rq_reqmsg),
+				  lwi);
+		LASSERT(rc == 0 || rc == -ETIMEDOUT);
+		/* Wait again if we changed deadline. */
+	} while ((rc == -ETIMEDOUT) &&
+		 (req->rq_deadline > cfs_time_current_sec()));
+
+	if (rc == -ETIMEDOUT) {
+		DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds",
+			  bulk2type(desc), req->rq_deadline - start,
+			  cfs_time_current_sec() - req->rq_deadline);
+		ptlrpc_abort_bulk(desc);
+	} else if (exp->exp_failed) {
+		DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",
+			  bulk2type(desc));
+		rc = -ENOTCONN;
+		ptlrpc_abort_bulk(desc);
+	} else if (exp->exp_conn_cnt >
+		   lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
+		DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s",
+			  bulk2type(desc));
+		/* We don't reply anyway. */
+		rc = -ETIMEDOUT;
+		ptlrpc_abort_bulk(desc);
+	} else if (desc->bd_failure ||
+		   desc->bd_nob_transferred != desc->bd_nob) {
+		DEBUG_REQ(D_ERROR, req, "%s bulk %s %d(%d)",
+			  desc->bd_failure ? "network error on" : "truncated",
+			  bulk2type(desc), desc->bd_nob_transferred,
+			  desc->bd_nob);
+		/* XXX Should this be a different errno? */
+		rc = -ETIMEDOUT;
+	} else if (desc->bd_type == BULK_GET_SINK) {
+		rc = sptlrpc_svc_unwrap_bulk(req, desc);
+	}
+
+	RETURN(rc);
 }
 EXPORT_SYMBOL(target_bulk_io);
 
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c
index 2f0b54b..7f5a6bc 100644
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -860,6 +860,7 @@ struct obd_export *class_new_export(struct obd_device *obd,
 	CFS_INIT_LIST_HEAD(&export->exp_req_replay_queue);
 	CFS_INIT_LIST_HEAD(&export->exp_handle.h_link);
 	CFS_INIT_LIST_HEAD(&export->exp_hp_rpcs);
+	CFS_INIT_LIST_HEAD(&export->exp_reg_rpcs);
 	class_handle_hash(&export->exp_handle, &export_handle_ops);
 	export->exp_last_request_time = cfs_time_current_sec();
 	spin_lock_init(&export->exp_lock);
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c
index 4d2ce16..08827eb 100644
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -1520,6 +1520,45 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
 	RETURN(1); /* return "did_something" for liblustre */
 }
 
+/* Check if we are already handling earlier incarnation of this request.
+ * Called under &req->rq_export->exp_rpc_lock locked */
+static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request *tmp = NULL;
+
+	if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ||
+	    (cfs_atomic_read(&req->rq_export->exp_rpc_count) == 0))
+		return 0;
+
+	/* bulk request are aborted upon reconnect, don't try to
+	 * find a match */
+	if (req->rq_bulk_write || req->rq_bulk_read)
+		return 0;
+
+	/* This list should not be longer than max_requests in
+	 * flights on the client, so it is not all that long.
+	 * Also we only hit this codepath in case of a resent
+	 * request which makes it even more rarely hit */
+	cfs_list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs,
+				rq_exp_list) {
+		/* Found duplicate one */
+		if (tmp->rq_xid == req->rq_xid)
+			goto found;
+	}
+	cfs_list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs,
+				rq_exp_list) {
+		/* Found duplicate one */
+		if (tmp->rq_xid == req->rq_xid)
+			goto found;
+	}
+	return 0;
+
+found:
+	DEBUG_REQ(D_HA, req, "Found duplicate req in processing\n");
+	DEBUG_REQ(D_HA, tmp, "Request being processed\n");
+	return -EBUSY;
+}
+
 /**
  * Put the request to the export list if the request may become
  * a high priority one.
@@ -1527,7 +1566,9 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
 static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
 				    struct ptlrpc_request *req)
 {
-	int rc = 0;
+	cfs_list_t *list;
+	int rc, hp = 0;
+
 	ENTRY;
 
 	if (svcpt->scp_service->srv_ops.so_hpreq_handler) {
@@ -1536,48 +1577,61 @@ static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
 			RETURN(rc);
 		LASSERT(rc == 0);
 	}
-	if (req->rq_export && req->rq_ops) {
-		/* Perform request specific check. We should do this check
-		 * before the request is added into exp_hp_rpcs list otherwise
-		 * it may hit swab race at LU-1044. */
-		if (req->rq_ops->hpreq_check) {
-			rc = req->rq_ops->hpreq_check(req);
-			/**
-			 * XXX: Out of all current
-			 * ptlrpc_hpreq_ops::hpreq_check(), only
-			 * ldlm_cancel_hpreq_check() can return an error code;
-			 * other functions assert in similar places, which seems
-			 * odd. What also does not seem right is that handlers
-			 * for those RPCs do not assert on the same checks, but
-			 * rather handle the error cases. e.g. see
-			 * ost_rw_hpreq_check(), and ost_brw_read(),
-			 * ost_brw_write().
-			 */
-			if (rc < 0)
-				RETURN(rc);
-			LASSERT(rc == 0 || rc == 1);
+	if (req->rq_export) {
+		if (req->rq_ops) {
+			/* Perform request specific check. We should do this
+			 * check before the request is added into exp_hp_rpcs
+			 * list otherwise it may hit swab race at LU-1044. */
+			if (req->rq_ops->hpreq_check) {
+				rc = req->rq_ops->hpreq_check(req);
+				/**
+				 * XXX: Out of all current
+				 * ptlrpc_hpreq_ops::hpreq_check(), only
+				 * ldlm_cancel_hpreq_check() can return an
+				 * error code; other functions assert in
+				 * similar places, which seems odd.
+				 * What also does not seem right is that
+				 * handlers for those RPCs do not assert
+				 * on the same checks, but rather handle the
+				 * error cases. e.g. see ost_rw_hpreq_check(),
+				 * and ost_brw_read(), ost_brw_write().
+				 */
+				if (rc < 0)
+					RETURN(rc);
+				LASSERT(rc == 0 || rc == 1);
+				hp = rc;
+			}
+			list = &req->rq_export->exp_hp_rpcs;
+		} else {
+			list = &req->rq_export->exp_reg_rpcs;
 		}
 
+		/* do search for duplicated xid and the adding to the list
+		 * atomically */
 		spin_lock_bh(&req->rq_export->exp_rpc_lock);
-		cfs_list_add(&req->rq_exp_list,
-			     &req->rq_export->exp_hp_rpcs);
+		rc = ptlrpc_server_check_resend_in_progress(req);
+		if (rc < 0) {
+			spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+			RETURN(rc);
+		}
+		cfs_list_add(&req->rq_exp_list, list);
 		spin_unlock_bh(&req->rq_export->exp_rpc_lock);
 	}
 
-	ptlrpc_nrs_req_initialize(svcpt, req, rc);
+	ptlrpc_nrs_req_initialize(svcpt, req, !!hp);
 
-	RETURN(rc);
+	RETURN(hp);
 }
 
 /** Remove the request from the export list. */
 static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req)
 {
-        ENTRY;
-        if (req->rq_export && req->rq_ops) {
-                /* refresh lock timeout again so that client has more
-                 * room to send lock cancel RPC. */
-                if (req->rq_ops->hpreq_fini)
-                        req->rq_ops->hpreq_fini(req);
+	ENTRY;
+	if (req->rq_export) {
+		/* refresh lock timeout again so that client has more
+		 * room to send lock cancel RPC. */
+		if (req->rq_ops && req->rq_ops->hpreq_fini)
+			req->rq_ops->hpreq_fini(req);
 
 		spin_lock_bh(&req->rq_export->exp_rpc_lock);
 		cfs_list_del_init(&req->rq_exp_list);
diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c
index 6889115..08b40a6 100644
--- a/lustre/target/tgt_handler.c
+++ b/lustre/target/tgt_handler.c
@@ -1493,7 +1493,8 @@ int tgt_brw_read(struct tgt_session_info *tsi)
 	if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK))
 		RETURN(-EIO);
 
-	OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, (obd_timeout + 1) / 4);
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ?
+			 cfs_fail_val : (obd_timeout + 1) / 4);
 
 	/* Check if there is eviction in progress, and if so, wait for it to
 	 * finish */
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index 0d38b23..420a8f9 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -1788,6 +1788,29 @@ test_111 ()
 }
 run_test 111 "mdd setup fail should not cause umount oops"
 
+# LU-793
+test_112a() {
+	remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
+	[[ $(lustre_version_code ost1) -ge $(version_code 2.5.92) ]] ||
+		{ skip "Need OST version at least 2.5.92"; return 0; }
+
+	do_facet_random_file client $TMP/$tfile 100K ||
+		error_noexit "Create random file $TMP/$tfile"
+
+	pause_bulk "cp $TMP/$tfile $DIR/$tfile" $TIMEOUT ||
+		error_noexit "Can't pause_bulk copy"
+
+	df $DIR
+	# expect cmp to succeed, client resent bulk
+	cmp $TMP/$tfile $DIR/$tfile ||
+		error_noexit "Wrong data has been written"
+	rm $DIR/$tfile ||
+		error_noexit "Can't remove file"
+	rm $TMP/$tfile
+}
+run_test 112a "bulk resend while orignal request is in progress"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 2a62f83..a807c6b 100644
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -4266,12 +4266,16 @@ drop_update_reply() {
 
 pause_bulk() {
 #define OBD_FAIL_OST_BRW_PAUSE_BULK      0x214
-    RC=0
-    do_facet ost1 lctl set_param fail_loc=0x214
-    do_facet client "$1" || RC=$?
-    do_facet client "sync"
-    do_facet ost1 lctl set_param fail_loc=0
-    return $RC
+	RC=0
+
+	local timeout=${2:-0}
+	# default is (obd_timeout / 4) if unspecified
+	echo "timeout is $timeout/$2"
+	do_facet ost1 lctl set_param fail_val=$timeout fail_loc=0x80000214
+	do_facet client "$1" || RC=$?
+	do_facet client "sync"
+	do_facet ost1 lctl set_param fail_loc=0
+	return $RC
 }
 
 drop_ldlm_cancel() {
@@ -4313,7 +4317,7 @@ clear_failloc() {
 }
 
 set_nodes_failloc () {
-    do_nodes $(comma_list $1) lctl set_param fail_loc=$2
+    do_nodes $(comma_list $1) lctl set_param fail_val=0 fail_loc=$2
 }
 
 cancel_lru_locks() {