From 37e760558443b95ebe0b0c817608f1ff3e94d57c Mon Sep 17 00:00:00 2001 From: shadow Date: Thu, 28 May 2009 03:55:15 +0000 Subject: [PATCH] abort bulk too early if client is reconnected Branch b1_8 b=18674 i=rread i=shadow --- lustre/ChangeLog | 5 +++++ lustre/include/lustre_export.h | 5 ++++- lustre/ldlm/ldlm_lib.c | 7 +++++++ lustre/ost/ost_handler.c | 18 ++++++++++++++++-- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index a4272eb..884de7b 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -15,6 +15,11 @@ tbd Sun Microsystems, Inc. more information, please refer to bugzilla 17630. Severity : normal +Bugzilla : 18674 +Description: abort bulk too early if client is reconnected + +Severity : normal +Frequency : rare, if used wide striped file and one ost is down. Bugzilla : 18382 Descriptoin: don't return error if we created a subset of objects for file. Details : lov_update_create_set() uses set->set_success as index for created diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index 0e333a8..e4887a4 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -163,7 +163,10 @@ struct obd_export { exp_vbr_failed:1, exp_replay_needed:1, exp_need_sync:1, /* needs sync from connect */ - exp_libclient:1; /* liblustre client? 
*/ + /* client timed out and tried to reconnect, + * but couldn't because of active rpcs */ + exp_abort_active_req:1; struct list_head exp_queued_rpc; /* RPC to be handled */ /* VBR: per-export last committed */ __u64 exp_last_committed; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 45e063e..158cf8f 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -830,6 +830,12 @@ no_export: "with %d active RPCs\n", target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), export, atomic_read(&export->exp_rpc_count) - 1); + spin_lock(&export->exp_lock); + if (req->rq_export->exp_conn_cnt < + lustre_msg_get_conn_cnt(req->rq_reqmsg)) + /* try to abort active requests */ + req->rq_export->exp_abort_active_req = 1; + spin_unlock(&export->exp_lock); GOTO(out, rc = -EBUSY); } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) { CERROR("%s: NID %s (%s) reconnected with 1 conn_cnt; " @@ -951,6 +957,7 @@ no_export: GOTO(out, rc = -EALREADY); } export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + export->exp_abort_active_req = 0; /* request from liblustre? Don't evict it for not pinging. 
*/ if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 5edbd75..40a56ac 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -773,7 +773,9 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) desc); rc = l_wait_event(desc->bd_waitq, !ptlrpc_server_bulk_active(desc) || - exp->exp_failed, &lwi); + exp->exp_failed || + exp->exp_abort_active_req, + &lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); /* Wait again if we changed deadline */ } while ((rc == -ETIMEDOUT) && @@ -790,6 +792,11 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT"); rc = -ENOTCONN; ptlrpc_abort_bulk(desc); + } else if (exp->exp_abort_active_req) { + DEBUG_REQ(D_ERROR, req, "Reconnect on bulk PUT"); + /* we don't reply anyway */ + rc = -ETIMEDOUT; + ptlrpc_abort_bulk(desc); } else if (!desc->bd_success || desc->bd_nob_transferred != desc->bd_nob) { DEBUG_REQ(D_ERROR, req, "%s bulk PUT %d(%d)", @@ -990,7 +997,9 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) ost_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, !ptlrpc_server_bulk_active(desc) || - desc->bd_export->exp_failed, &lwi); + desc->bd_export->exp_failed || + desc->bd_export->exp_abort_active_req, + &lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); /* Wait again if we changed deadline */ } while ((rc == -ETIMEDOUT) && @@ -1007,6 +1016,11 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET"); rc = -ENOTCONN; ptlrpc_abort_bulk(desc); + } else if (desc->bd_export->exp_abort_active_req) { + DEBUG_REQ(D_ERROR, req, "Reconnect on bulk GET"); + /* we don't reply anyway */ + rc = -ETIMEDOUT; + ptlrpc_abort_bulk(desc); } else if (!desc->bd_success || desc->bd_nob_transferred != desc->bd_nob) { DEBUG_REQ(D_ERROR, req, "%s bulk GET 
%d(%d)", -- 1.8.3.1