Whamcloud - gitweb
abort bulk too early if client is reconnected
authorshadow <shadow>
Thu, 28 May 2009 03:55:15 +0000 (03:55 +0000)
committershadow <shadow>
Thu, 28 May 2009 03:55:15 +0000 (03:55 +0000)
Branch b1_8
b=18674
i=rread
i=shadow

lustre/ChangeLog
lustre/include/lustre_export.h
lustre/ldlm/ldlm_lib.c
lustre/ost/ost_handler.c

index a4272eb..884de7b 100644 (file)
@@ -15,6 +15,11 @@ tbd Sun Microsystems, Inc.
          more information, please refer to bugzilla 17630.
 
 Severity   : normal
+Bugzilla   : 18674
+Description: abort bulk too early if client is reconnected
+
+Severity   : normal
+Frequency  : rare, if a widely striped file is used and one OST is down.
 Bugzilla   : 18382
 Description: don't return error if we created a subset of objects for file.
 Details    : lov_update_create_set() uses set->set_success as index for created
index 0e333a8..e4887a4 100644 (file)
@@ -163,7 +163,10 @@ struct obd_export {
                                   exp_vbr_failed:1,
                                   exp_replay_needed:1,
                                   exp_need_sync:1, /* needs sync from connect */
-                                  exp_libclient:1; /* liblustre client? */
+                                  exp_libclient:1, /* liblustre client? */
+                                  /* client timed out and tried to reconnect,
+                                   * but couldn't because of active rpcs */
+                                  exp_abort_active_req:1;
         struct list_head          exp_queued_rpc;  /* RPC to be handled */
         /* VBR: per-export last committed */
         __u64                     exp_last_committed;
index 45e063e..158cf8f 100644 (file)
@@ -830,6 +830,12 @@ no_export:
                       "with %d active RPCs\n", target->obd_name, cluuid.uuid,
                       libcfs_nid2str(req->rq_peer.nid),
                       export, atomic_read(&export->exp_rpc_count) - 1);
+                spin_lock(&export->exp_lock);
+                if (req->rq_export->exp_conn_cnt <
+                    lustre_msg_get_conn_cnt(req->rq_reqmsg))
+                        /* try to abort active requests */
+                        req->rq_export->exp_abort_active_req = 1;
+                spin_unlock(&export->exp_lock);
                 GOTO(out, rc = -EBUSY);
         } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) {
                 CERROR("%s: NID %s (%s) reconnected with 1 conn_cnt; "
@@ -951,6 +957,7 @@ no_export:
                 GOTO(out, rc = -EALREADY);
         }
         export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+        export->exp_abort_active_req = 0;
 
         /* request from liblustre?  Don't evict it for not pinging. */
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
index 5edbd75..40a56ac 100644 (file)
@@ -773,7 +773,9 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                                            desc);
                                 rc = l_wait_event(desc->bd_waitq,
                                                   !ptlrpc_server_bulk_active(desc) ||
-                                                  exp->exp_failed, &lwi);
+                                                  exp->exp_failed ||
+                                                  exp->exp_abort_active_req,
+                                                  &lwi);
                                 LASSERT(rc == 0 || rc == -ETIMEDOUT);
                                 /* Wait again if we changed deadline */
                         } while ((rc == -ETIMEDOUT) &&
@@ -790,6 +792,11 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                 DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT");
                                 rc = -ENOTCONN;
                                 ptlrpc_abort_bulk(desc);
+                        } else if (exp->exp_abort_active_req) {
+                                DEBUG_REQ(D_ERROR, req, "Reconnect on bulk PUT");
+                                /* we don't reply anyway */
+                                rc = -ETIMEDOUT;
+                                ptlrpc_abort_bulk(desc);
                         } else if (!desc->bd_success ||
                                    desc->bd_nob_transferred != desc->bd_nob) {
                                 DEBUG_REQ(D_ERROR, req, "%s bulk PUT %d(%d)",
@@ -990,7 +997,9 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                                    ost_bulk_timeout, desc);
                         rc = l_wait_event(desc->bd_waitq,
                                           !ptlrpc_server_bulk_active(desc) ||
-                                          desc->bd_export->exp_failed, &lwi);
+                                          desc->bd_export->exp_failed ||
+                                          desc->bd_export->exp_abort_active_req,
+                                          &lwi);
                         LASSERT(rc == 0 || rc == -ETIMEDOUT);
                         /* Wait again if we changed deadline */
                 } while ((rc == -ETIMEDOUT) &&
@@ -1007,6 +1016,11 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                         DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET");
                         rc = -ENOTCONN;
                         ptlrpc_abort_bulk(desc);
+                } else if (desc->bd_export->exp_abort_active_req) {
+                        DEBUG_REQ(D_ERROR, req, "Reconnect on bulk GET");
+                        /* we don't reply anyway */
+                        rc = -ETIMEDOUT;
+                        ptlrpc_abort_bulk(desc);
                 } else if (!desc->bd_success ||
                            desc->bd_nob_transferred != desc->bd_nob) {
                         DEBUG_REQ(D_ERROR, req, "%s bulk GET %d(%d)",