Whamcloud - gitweb
Port of Johann's patch "abort bulk request (v2)" to HEAD.
author deen <deen>
Tue, 26 May 2009 18:04:06 +0000 (18:04 +0000)
committer deen <deen>
Tue, 26 May 2009 18:04:06 +0000 (18:04 +0000)
i=robert.read
i=alexey.lyashkov
b=18674

lustre/include/lustre_export.h
lustre/ldlm/ldlm_lib.c
lustre/ost/ost_handler.c

index d0e502d..ef3cd4c 100644 (file)
@@ -181,7 +181,10 @@ struct obd_export {
                                   exp_need_sync:1,
                                   exp_flvr_changed:1,
                                   exp_flvr_adapt:1,
-                                  exp_libclient:1; /* liblustre client? */
+                                  exp_libclient:1, /* liblustre client? */
+                                  /* client timed out and tried to reconnect,
+                                   * but couldn't because of active rpcs */
+                                  exp_abort_active_req:1;
         struct list_head          exp_queued_rpc;  /* RPC to be handled */
         /* also protected by exp_lock */
         enum lustre_sec_part      exp_sp_peer;
index 6475ccb..7cb9a38 100644 (file)
@@ -807,6 +807,12 @@ no_export:
                       target->obd_name, cluuid.uuid,
                       libcfs_nid2str(req->rq_peer.nid),
                       export, atomic_read(&export->exp_rpc_count) - 1);
+                spin_lock(&export->exp_lock);
+                if (req->rq_export->exp_conn_cnt <
+                    lustre_msg_get_conn_cnt(req->rq_reqmsg))
+                        /* try to abort active requests */
+                        req->rq_export->exp_abort_active_req = 1;
+                spin_unlock(&export->exp_lock);
                 GOTO(out, rc = -EBUSY);
         } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) {
                 CERROR("%s: NID %s (%s) reconnected with 1 conn_cnt; "
@@ -918,6 +924,7 @@ dont_check_exports:
                 GOTO(out, rc = -EALREADY);
         }
         export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+        export->exp_abort_active_req = 0;
 
         /* request from liblustre?  Don't evict it for not pinging. */
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
index c47a196..7809aa3 100644 (file)
@@ -773,7 +773,9 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                                            desc);
                                 rc = l_wait_event(desc->bd_waitq,
                                                   !ptlrpc_server_bulk_active(desc) ||
-                                                  exp->exp_failed, &lwi);
+                                                  exp->exp_failed ||
+                                                  exp->exp_abort_active_req,
+                                                  &lwi);
                                 LASSERT(rc == 0 || rc == -ETIMEDOUT);
                                 /* Wait again if we changed deadline */
                         } while ((rc == -ETIMEDOUT) &&
@@ -790,6 +792,11 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                 DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT");
                                 rc = -ENOTCONN;
                                 ptlrpc_abort_bulk(desc);
+                        } else if (exp->exp_abort_active_req) {
+                                DEBUG_REQ(D_ERROR, req, "Reconnect on bulk PUT");
+                                /* we don't reply anyway */
+                                rc = -ETIMEDOUT;
+                                ptlrpc_abort_bulk(desc);
                         } else if (!desc->bd_success ||
                                    desc->bd_nob_transferred != desc->bd_nob) {
                                 DEBUG_REQ(D_ERROR, req, "%s bulk PUT %d(%d)",
@@ -1003,7 +1010,9 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                                    ost_bulk_timeout, desc);
                         rc = l_wait_event(desc->bd_waitq,
                                           !ptlrpc_server_bulk_active(desc) ||
-                                          desc->bd_export->exp_failed, &lwi);
+                                          desc->bd_export->exp_failed ||
+                                          desc->bd_export->exp_abort_active_req,
+                                          &lwi);
                         LASSERT(rc == 0 || rc == -ETIMEDOUT);
                         /* Wait again if we changed deadline */
                 } while ((rc == -ETIMEDOUT) &&
@@ -1020,6 +1029,11 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                         DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET");
                         rc = -ENOTCONN;
                         ptlrpc_abort_bulk(desc);
+                } else if (desc->bd_export->exp_abort_active_req) {
+                        DEBUG_REQ(D_ERROR, req, "Reconnect on bulk GET");
+                        /* we don't reply anyway */
+                        rc = -ETIMEDOUT;
+                        ptlrpc_abort_bulk(desc);
                 } else if (!desc->bd_success) {
                         DEBUG_REQ(D_ERROR, req, "network error on bulk GET");
                         /* XXX should this be a different errno? */