LU-793 ptlrpc: allow client to reconnect with RPC in progress 60/4960/15
author Mikhail Pershin <mike.pershin@intel.com>
Mon, 11 Nov 2013 17:42:43 +0000 (21:42 +0400)
committer Oleg Drokin <oleg.drokin@intel.com>
Mon, 2 Dec 2013 12:05:52 +0000 (12:05 +0000)
Since refusing to let clients with pending RPCs reconnect caused a
lot of problems, this patch allows such reconnections and only
refuses to process requests whose XIDs are already being processed.
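
The idea, as a minimal sketch (condensed from the
ptlrpc_server_check_resend_in_progress() hunk in lustre/ptlrpc/service.c
below; the helper name here is shortened and the bulk/fast-path checks
are omitted for brevity):

	/* Illustrative sketch only -- condensed from the
	 * ptlrpc_server_check_resend_in_progress() hunk below (the real
	 * version also skips bulk requests and the exp_rpc_count == 0
	 * fast path).  A resent request is matched by XID against the
	 * RPCs this export is already handling; the caller holds
	 * exp_rpc_lock. */
	static int resend_in_progress(struct ptlrpc_request *req)
	{
		struct ptlrpc_request *tmp;

		if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
			return 0;

		cfs_list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs,
					rq_exp_list) {
			if (tmp->rq_xid == req->rq_xid)
				return -EBUSY;	/* already in processing */
		}
		cfs_list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs,
					rq_exp_list) {
			if (tmp->rq_xid == req->rq_xid)
				return -EBUSY;
		}
		return 0;
	}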

Bulk requests are aborted upon reconnection by comparing the
connection counts of the request and the export.
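
In target_bulk_io() (lustre/ldlm/ldlm_lib.c hunk below) this replaces
the removed exp_abort_active_req flag; in essence (excerpted, not a
standalone function):

	/* Client evicted or already reconnected: the export's connection
	 * count has moved past the one carried in the request. */
	if (exp->exp_failed ||
	    exp->exp_conn_cnt > lustre_msg_get_conn_cnt(req->rq_reqmsg))
		rc = -ENOTCONN;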

Signed-off-by: Mikhail Pershin <mike.pershin@intel.com>
Change-Id: Ic4b23a71fd288df02d1040d98867373ae06b60f6
Reviewed-on: http://review.whamcloud.com/4960
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Johann Lombardi <johann.lombardi@intel.com>
Reviewed-by: Alexander Boyko <alexander_boyko@xyratex.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/lustre_export.h
lustre/ldlm/ldlm_lib.c
lustre/obdclass/genops.c
lustre/ptlrpc/service.c
lustre/target/tgt_handler.c
lustre/tests/recovery-small.sh
lustre/tests/test-framework.sh

index 2a01b60..3759ebf 100644 (file)
@@ -232,9 +232,6 @@ struct obd_export {
                                   exp_flvr_changed:1,
                                   exp_flvr_adapt:1,
                                   exp_libclient:1, /* liblustre client? */
-                                  /* client timed out and tried to reconnect,
-                                   * but couldn't because of active rpcs */
-                                 exp_abort_active_req:1,
                                  /* if to swap nidtbl entries for 2.2 clients.
                                   * Only used by the MGS to fix LU-1644. */
                                  exp_need_mne_swab:1;
@@ -247,6 +244,7 @@ struct obd_export {
         /** protects exp_hp_rpcs */
        spinlock_t                exp_rpc_lock;
        cfs_list_t                exp_hp_rpcs;  /* (potential) HP RPCs */
+       cfs_list_t                exp_reg_rpcs;  /* RPC being handled */
 
         /** blocking dlm lock list, protected by exp_bl_list_lock */
         cfs_list_t                exp_bl_list;
index 6b18781..8360290 100644 (file)
@@ -976,21 +976,6 @@ no_export:
                               libcfs_nid2str(req->rq_peer.nid),
                               cfs_atomic_read(&export->exp_refcount));
                 GOTO(out, rc = -EBUSY);
-        } else if (req->rq_export != NULL &&
-                   (cfs_atomic_read(&export->exp_rpc_count) > 1)) {
-               /* The current connect RPC has increased exp_rpc_count. */
-                LCONSOLE_WARN("%s: Client %s (at %s) refused reconnection, "
-                              "still busy with %d active RPCs\n",
-                              target->obd_name, cluuid.uuid,
-                              libcfs_nid2str(req->rq_peer.nid),
-                              cfs_atomic_read(&export->exp_rpc_count) - 1);
-               spin_lock(&export->exp_lock);
-               if (req->rq_export->exp_conn_cnt <
-                   lustre_msg_get_conn_cnt(req->rq_reqmsg))
-                       /* try to abort active requests */
-                       req->rq_export->exp_abort_active_req = 1;
-               spin_unlock(&export->exp_lock);
-               GOTO(out, rc = -EBUSY);
         } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) {
                 if (!strstr(cluuid.uuid, "mdt"))
                         LCONSOLE_WARN("%s: Rejecting reconnect from the "
@@ -1013,7 +998,7 @@ no_export:
               export, (long)cfs_time_current_sec(),
               export ? (long)export->exp_last_request_time : 0);
 
-        /* If this is the first time a client connects, reset the recovery
+       /* If this is the first time a client connects, reset the recovery
         * timer. Discard lightweight connections which might be local. */
        if (!lw_client && rc == 0 && target->obd_recovering)
                check_and_start_recovery_timer(target, req, export == NULL);
@@ -1132,7 +1117,6 @@ dont_check_exports:
         }
         LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0);
         export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
-        export->exp_abort_active_req = 0;
 
        /* Don't evict liblustre clients for not pinging. */
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
@@ -2643,89 +2627,91 @@ static inline char *bulk2type(struct ptlrpc_bulk_desc *desc)
 int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
                    struct l_wait_info *lwi)
 {
-        struct ptlrpc_request *req = desc->bd_req;
-        int rc = 0;
-        ENTRY;
+       struct ptlrpc_request   *req = desc->bd_req;
+       time_t                   start = cfs_time_current_sec();
+       int                      rc = 0;
+
+       ENTRY;
 
        /* If there is eviction in progress, wait for it to finish. */
-        if (unlikely(cfs_atomic_read(&exp->exp_obd->obd_evict_inprogress))) {
-                *lwi = LWI_INTR(NULL, NULL);
-                rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq,
-                                  !cfs_atomic_read(&exp->exp_obd->
-                                                   obd_evict_inprogress),
-                                  lwi);
-        }
+       if (unlikely(cfs_atomic_read(&exp->exp_obd->obd_evict_inprogress))) {
+               *lwi = LWI_INTR(NULL, NULL);
+               rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq,
+                                 !cfs_atomic_read(&exp->exp_obd->
+                                                  obd_evict_inprogress),
+                                 lwi);
+       }
 
-       /* Check if client was evicted or tried to reconnect already. */
-        if (exp->exp_failed || exp->exp_abort_active_req) {
-                rc = -ENOTCONN;
-        } else {
-                if (desc->bd_type == BULK_PUT_SINK)
-                        rc = sptlrpc_svc_wrap_bulk(req, desc);
-                if (rc == 0)
-                        rc = ptlrpc_start_bulk_transfer(desc);
-        }
-
-        if (rc == 0 && OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) {
-                ptlrpc_abort_bulk(desc);
-        } else if (rc == 0) {
-                time_t start = cfs_time_current_sec();
-                do {
-                        long timeoutl = req->rq_deadline - cfs_time_current_sec();
-                        cfs_duration_t timeout = timeoutl <= 0 ?
-                                CFS_TICK : cfs_time_seconds(timeoutl);
-                        *lwi = LWI_TIMEOUT_INTERVAL(timeout,
-                                                    cfs_time_seconds(1),
-                                                   target_bulk_timeout,
-                                                   desc);
-                        rc = l_wait_event(desc->bd_waitq,
-                                          !ptlrpc_server_bulk_active(desc) ||
-                                          exp->exp_failed ||
-                                          exp->exp_abort_active_req,
-                                          lwi);
-                        LASSERT(rc == 0 || rc == -ETIMEDOUT);
-                       /* Wait again if we changed deadline. */
-                } while ((rc == -ETIMEDOUT) &&
-                         (req->rq_deadline > cfs_time_current_sec()));
-
-                if (rc == -ETIMEDOUT) {
-                        DEBUG_REQ(D_ERROR, req,
-                                  "timeout on bulk %s after %ld%+lds",
-                                  bulk2type(desc),
-                                  req->rq_deadline - start,
-                                  cfs_time_current_sec() -
-                                  req->rq_deadline);
-                        ptlrpc_abort_bulk(desc);
-                } else if (exp->exp_failed) {
-                        DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",
-                                  bulk2type(desc));
-                        rc = -ENOTCONN;
-                        ptlrpc_abort_bulk(desc);
-                } else if (exp->exp_abort_active_req) {
-                        DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s",
-                                  bulk2type(desc));
-                       /* We don't reply anyway. */
-                        rc = -ETIMEDOUT;
-                        ptlrpc_abort_bulk(desc);
-               } else if (desc->bd_failure ||
-                          desc->bd_nob_transferred != desc->bd_nob) {
-                       DEBUG_REQ(D_ERROR, req, "%s bulk %s %d(%d)",
-                                 desc->bd_failure ?
-                                 "network error on" : "truncated",
-                                 bulk2type(desc),
-                                 desc->bd_nob_transferred,
-                                 desc->bd_nob);
-                       /* XXX Should this be a different errno? */
-                       rc = -ETIMEDOUT;
-                } else if (desc->bd_type == BULK_GET_SINK) {
-                        rc = sptlrpc_svc_unwrap_bulk(req, desc);
-                }
-        } else {
-                DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc %d",
-                          bulk2type(desc), rc);
-        }
+       /* Check if client was evicted or reconnected already. */
+       if (exp->exp_failed ||
+           exp->exp_conn_cnt > lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
+               rc = -ENOTCONN;
+       } else {
+               if (desc->bd_type == BULK_PUT_SINK)
+                       rc = sptlrpc_svc_wrap_bulk(req, desc);
+               if (rc == 0)
+                       rc = ptlrpc_start_bulk_transfer(desc);
+       }
 
-        RETURN(rc);
+       if (rc < 0) {
+               DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc %d",
+                         bulk2type(desc), rc);
+               RETURN(rc);
+       }
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) {
+               ptlrpc_abort_bulk(desc);
+               RETURN(0);
+       }
+
+       do {
+               long timeoutl = req->rq_deadline - cfs_time_current_sec();
+               cfs_duration_t timeout = timeoutl <= 0 ?
+                                        CFS_TICK : cfs_time_seconds(timeoutl);
+
+               *lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
+                                           target_bulk_timeout, desc);
+               rc = l_wait_event(desc->bd_waitq,
+                                 !ptlrpc_server_bulk_active(desc) ||
+                                 exp->exp_failed ||
+                                 exp->exp_conn_cnt >
+                                 lustre_msg_get_conn_cnt(req->rq_reqmsg),
+                                 lwi);
+               LASSERT(rc == 0 || rc == -ETIMEDOUT);
+               /* Wait again if we changed deadline. */
+       } while ((rc == -ETIMEDOUT) &&
+                (req->rq_deadline > cfs_time_current_sec()));
+
+       if (rc == -ETIMEDOUT) {
+               DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds",
+                         bulk2type(desc), req->rq_deadline - start,
+                         cfs_time_current_sec() - req->rq_deadline);
+               ptlrpc_abort_bulk(desc);
+       } else if (exp->exp_failed) {
+               DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",
+                         bulk2type(desc));
+               rc = -ENOTCONN;
+               ptlrpc_abort_bulk(desc);
+       } else if (exp->exp_conn_cnt >
+                  lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
+               DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s",
+                         bulk2type(desc));
+               /* We don't reply anyway. */
+               rc = -ETIMEDOUT;
+               ptlrpc_abort_bulk(desc);
+       } else if (desc->bd_failure ||
+                  desc->bd_nob_transferred != desc->bd_nob) {
+               DEBUG_REQ(D_ERROR, req, "%s bulk %s %d(%d)",
+                         desc->bd_failure ? "network error on" : "truncated",
+                         bulk2type(desc), desc->bd_nob_transferred,
+                         desc->bd_nob);
+               /* XXX Should this be a different errno? */
+               rc = -ETIMEDOUT;
+       } else if (desc->bd_type == BULK_GET_SINK) {
+               rc = sptlrpc_svc_unwrap_bulk(req, desc);
+       }
+
+       RETURN(rc);
 }
 EXPORT_SYMBOL(target_bulk_io);
 
index 2f0b54b..7f5a6bc 100644 (file)
@@ -860,6 +860,7 @@ struct obd_export *class_new_export(struct obd_device *obd,
        CFS_INIT_LIST_HEAD(&export->exp_req_replay_queue);
        CFS_INIT_LIST_HEAD(&export->exp_handle.h_link);
        CFS_INIT_LIST_HEAD(&export->exp_hp_rpcs);
+       CFS_INIT_LIST_HEAD(&export->exp_reg_rpcs);
        class_handle_hash(&export->exp_handle, &export_handle_ops);
        export->exp_last_request_time = cfs_time_current_sec();
        spin_lock_init(&export->exp_lock);
index 4d2ce16..08827eb 100644 (file)
@@ -1520,6 +1520,45 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
        RETURN(1); /* return "did_something" for liblustre */
 }
 
+/* Check if we are already handling earlier incarnation of this request.
+ * Called under &req->rq_export->exp_rpc_lock locked */
+static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req)
+{
+       struct ptlrpc_request   *tmp = NULL;
+
+       if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ||
+           (cfs_atomic_read(&req->rq_export->exp_rpc_count) == 0))
+               return 0;
+
+       /* bulk request are aborted upon reconnect, don't try to
+        * find a match */
+       if (req->rq_bulk_write || req->rq_bulk_read)
+               return 0;
+
+       /* This list should not be longer than max_requests in
+        * flights on the client, so it is not all that long.
+        * Also we only hit this codepath in case of a resent
+        * request which makes it even more rarely hit */
+       cfs_list_for_each_entry(tmp, &req->rq_export->exp_reg_rpcs,
+                               rq_exp_list) {
+               /* Found duplicate one */
+               if (tmp->rq_xid == req->rq_xid)
+                       goto found;
+       }
+       cfs_list_for_each_entry(tmp, &req->rq_export->exp_hp_rpcs,
+                               rq_exp_list) {
+               /* Found duplicate one */
+               if (tmp->rq_xid == req->rq_xid)
+                       goto found;
+       }
+       return 0;
+
+found:
+       DEBUG_REQ(D_HA, req, "Found duplicate req in processing\n");
+       DEBUG_REQ(D_HA, tmp, "Request being processed\n");
+       return -EBUSY;
+}
+
 /**
  * Put the request to the export list if the request may become
  * a high priority one.
@@ -1527,7 +1566,9 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
 static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
                                    struct ptlrpc_request *req)
 {
-       int rc = 0;
+       cfs_list_t      *list;
+       int              rc, hp = 0;
+
        ENTRY;
 
        if (svcpt->scp_service->srv_ops.so_hpreq_handler) {
@@ -1536,48 +1577,61 @@ static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
                        RETURN(rc);
                LASSERT(rc == 0);
        }
-       if (req->rq_export && req->rq_ops) {
-               /* Perform request specific check. We should do this check
-                * before the request is added into exp_hp_rpcs list otherwise
-                * it may hit swab race at LU-1044. */
-               if (req->rq_ops->hpreq_check) {
-                       rc = req->rq_ops->hpreq_check(req);
-                       /**
-                        * XXX: Out of all current
-                        * ptlrpc_hpreq_ops::hpreq_check(), only
-                        * ldlm_cancel_hpreq_check() can return an error code;
-                        * other functions assert in similar places, which seems
-                        * odd. What also does not seem right is that handlers
-                        * for those RPCs do not assert on the same checks, but
-                        * rather handle the error cases. e.g. see
-                        * ost_rw_hpreq_check(), and ost_brw_read(),
-                        * ost_brw_write().
-                        */
-                       if (rc < 0)
-                               RETURN(rc);
-                       LASSERT(rc == 0 || rc == 1);
+       if (req->rq_export) {
+               if (req->rq_ops) {
+                       /* Perform request specific check. We should do this
+                        * check before the request is added into exp_hp_rpcs
+                        * list otherwise it may hit swab race at LU-1044. */
+                       if (req->rq_ops->hpreq_check) {
+                               rc = req->rq_ops->hpreq_check(req);
+                               /**
+                                * XXX: Out of all current
+                                * ptlrpc_hpreq_ops::hpreq_check(), only
+                                * ldlm_cancel_hpreq_check() can return an
+                                * error code; other functions assert in
+                                * similar places, which seems odd.
+                                * What also does not seem right is that
+                                * handlers for those RPCs do not assert
+                                * on the same checks, but rather handle the
+                                * error cases. e.g. see ost_rw_hpreq_check(),
+                                * and ost_brw_read(), ost_brw_write().
+                                */
+                               if (rc < 0)
+                                       RETURN(rc);
+                               LASSERT(rc == 0 || rc == 1);
+                               hp = rc;
+                       }
+                       list = &req->rq_export->exp_hp_rpcs;
+               } else {
+                       list = &req->rq_export->exp_reg_rpcs;
                }
 
+               /* do search for duplicated xid and the adding to the list
+                * atomically */
                spin_lock_bh(&req->rq_export->exp_rpc_lock);
-               cfs_list_add(&req->rq_exp_list,
-                            &req->rq_export->exp_hp_rpcs);
+               rc = ptlrpc_server_check_resend_in_progress(req);
+               if (rc < 0) {
+                       spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+                       RETURN(rc);
+               }
+               cfs_list_add(&req->rq_exp_list, list);
                spin_unlock_bh(&req->rq_export->exp_rpc_lock);
        }
 
-       ptlrpc_nrs_req_initialize(svcpt, req, rc);
+       ptlrpc_nrs_req_initialize(svcpt, req, !!hp);
 
-       RETURN(rc);
+       RETURN(hp);
 }
 
 /** Remove the request from the export list. */
 static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req)
 {
-        ENTRY;
-        if (req->rq_export && req->rq_ops) {
-                /* refresh lock timeout again so that client has more
-                 * room to send lock cancel RPC. */
-                if (req->rq_ops->hpreq_fini)
-                        req->rq_ops->hpreq_fini(req);
+       ENTRY;
+       if (req->rq_export) {
+               /* refresh lock timeout again so that client has more
+                * room to send lock cancel RPC. */
+               if (req->rq_ops && req->rq_ops->hpreq_fini)
+                       req->rq_ops->hpreq_fini(req);
 
                spin_lock_bh(&req->rq_export->exp_rpc_lock);
                cfs_list_del_init(&req->rq_exp_list);
index 6889115..08b40a6 100644 (file)
@@ -1493,7 +1493,8 @@ int tgt_brw_read(struct tgt_session_info *tsi)
        if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK))
                RETURN(-EIO);
 
-       OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, (obd_timeout + 1) / 4);
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, cfs_fail_val > 0 ?
+                        cfs_fail_val : (obd_timeout + 1) / 4);
 
        /* Check if there is eviction in progress, and if so, wait for it to
         * finish */
index 0d38b23..420a8f9 100755 (executable)
@@ -1788,6 +1788,29 @@ test_111 ()
 }
 run_test 111 "mdd setup fail should not cause umount oops"
 
+# LU-793
+test_112a() {
+       remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
+       [[ $(lustre_version_code ost1) -ge $(version_code 2.5.92) ]] ||
+               { skip "Need OST version at least 2.5.92"; return 0; }
+
+       do_facet_random_file client $TMP/$tfile 100K ||
+               error_noexit "Create random file $TMP/$tfile"
+
+       pause_bulk "cp $TMP/$tfile $DIR/$tfile" $TIMEOUT ||
+               error_noexit "Can't pause_bulk copy"
+
+       df $DIR
+       # expect cmp to succeed, client resent bulk
+       cmp $TMP/$tfile $DIR/$tfile ||
+               error_noexit "Wrong data has been written"
+       rm $DIR/$tfile ||
+               error_noexit "Can't remove file"
+       rm $TMP/$tfile
+}
+run_test 112a "bulk resend while original request is in progress"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
index 2a62f83..a807c6b 100644 (file)
@@ -4266,12 +4266,16 @@ drop_update_reply() {
 
 pause_bulk() {
 #define OBD_FAIL_OST_BRW_PAUSE_BULK      0x214
-    RC=0
-    do_facet ost1 lctl set_param fail_loc=0x214
-    do_facet client "$1" || RC=$?
-    do_facet client "sync"
-    do_facet ost1 lctl set_param fail_loc=0
-    return $RC
+       RC=0
+
+       local timeout=${2:-0}
+       # default is (obd_timeout / 4) if unspecified
+       echo "timeout is $timeout/$2"
+       do_facet ost1 lctl set_param fail_val=$timeout fail_loc=0x80000214
+       do_facet client "$1" || RC=$?
+       do_facet client "sync"
+       do_facet ost1 lctl set_param fail_loc=0
+       return $RC
 }
 
 drop_ldlm_cancel() {
@@ -4313,7 +4317,7 @@ clear_failloc() {
 }
 
 set_nodes_failloc () {
-    do_nodes $(comma_list $1)  lctl set_param fail_loc=$2
+       do_nodes $(comma_list $1)  lctl set_param fail_val=0 fail_loc=$2
 }
 
 cancel_lru_locks() {