From 77c40f46f1631212a8ad9f4ff2b3bfc5ca2d8aa0 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Mon, 1 Sep 2014 15:03:51 +0200 Subject: [PATCH] LU-5556 target: limit bulk transfer time Messages lost during bulk transfer are not resent, so there is no point in waiting for a very long time (up to at_max/600s has been seen). This patch adds a new static timeout for the bulk transfer (100s by default). Signed-off-by: Johann Lombardi Change-Id: I3926a7a8f2bce4cbd00b8fe54094a8e9cbec1508 Reviewed-on: http://review.whamcloud.com/11717 Tested-by: Jenkins Reviewed-by: Liang Zhen Reviewed-by: Li Wei Tested-by: Maloo Reviewed-by: Mike Pershin Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/ldlm/ldlm_lib.c | 21 +++++++++++++++------ lustre/obdclass/class_obd.c | 3 +++ lustre/obdclass/linux/linux-sysctl.c | 12 ++++++++++++ lustre/ofd/ofd_dev.c | 3 +++ 5 files changed, 34 insertions(+), 6 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index cf9a8f5..b7cce63 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -59,6 +59,7 @@ extern unsigned int obd_timeout; /* seconds */ extern unsigned int ldlm_timeout; /* seconds */ extern unsigned int obd_timeout_set; extern unsigned int ldlm_timeout_set; +extern unsigned int bulk_timeout; extern unsigned int at_min; extern unsigned int at_max; extern unsigned int at_history; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 99b21dc..1b1c2a6 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -2615,6 +2615,7 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, { struct ptlrpc_request *req = desc->bd_req; time_t start = cfs_time_current_sec(); + time_t deadline; int rc = 0; ENTRY; @@ -2624,7 +2625,7 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, *lwi = LWI_INTR(NULL, NULL); rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq, !atomic_read(&exp->exp_obd-> - obd_evict_inprogress), + obd_evict_inprogress), lwi); } @@ -2650,8 +2651,13 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, RETURN(0); } + /* limit actual bulk transfer to bulk_timeout seconds */ + deadline = start + bulk_timeout; + if (deadline > req->rq_deadline) + deadline = req->rq_deadline; + do { - long timeoutl = req->rq_deadline - cfs_time_current_sec(); + long timeoutl = deadline - cfs_time_current_sec(); cfs_duration_t timeout = timeoutl <= 0 ? CFS_TICK : cfs_time_seconds(timeoutl); @@ -2664,14 +2670,17 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, lustre_msg_get_conn_cnt(req->rq_reqmsg), lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); - /* Wait again if we changed deadline. */ + /* Wait again if we changed rq_deadline. */ + deadline = start + bulk_timeout; + if (deadline > req->rq_deadline) + deadline = req->rq_deadline; } while ((rc == -ETIMEDOUT) && - (req->rq_deadline > cfs_time_current_sec())); + (deadline > cfs_time_current_sec())); if (rc == -ETIMEDOUT) { DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds", - bulk2type(desc), req->rq_deadline - start, - cfs_time_current_sec() - req->rq_deadline); + bulk2type(desc), deadline - start, + cfs_time_current_sec() - deadline); ptlrpc_abort_bulk(desc); } else if (exp->exp_failed) { DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s", diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 0fa08a5..1c93825 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -84,6 +84,9 @@ unsigned int obd_timeout_set; EXPORT_SYMBOL(obd_timeout_set); unsigned int ldlm_timeout_set; EXPORT_SYMBOL(ldlm_timeout_set); +/* bulk transfer timeout, give up after 100s by default */ +unsigned int bulk_timeout = 100; /* seconds */ +EXPORT_SYMBOL(bulk_timeout); /* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */ unsigned int at_min = 0; EXPORT_SYMBOL(at_min); diff --git a/lustre/obdclass/linux/linux-sysctl.c b/lustre/obdclass/linux/linux-sysctl.c index e298a3a..326218f 100644 --- a/lustre/obdclass/linux/linux-sysctl.c +++ b/lustre/obdclass/linux/linux-sysctl.c @@ -233,6 +233,10 @@ int proc_alloc_fail_rate(struct ctl_table *table, int write, } #endif +int LL_PROC_PROTO(proc_bulk_timeout) +{ + return proc_dointvec(table, write, buffer, lenp, ppos); +} int LL_PROC_PROTO(proc_at_min) { return proc_dointvec(table, write, buffer, lenp, ppos); @@ -348,6 +352,14 @@ static struct ctl_table obd_table[] = { }, { INIT_CTL_NAME + .procname = "bulk_timeout", + .data = &bulk_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_bulk_timeout + }, + { + INIT_CTL_NAME .procname = "at_min", .data = &at_min, .maxlen = sizeof(int), diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index ac91062..85068c4 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -1879,8 +1879,11 @@ static int ofd_rw_hpreq_lock_match(struct ptlrpc_request *req, if (!ostid_res_name_eq(&ioo->ioo_oid, &lock->l_resource->lr_name)) RETURN(0); + /* a bulk write can only hold a reference on a PW extent lock */ mode = LCK_PW; if (opc == OST_READ) + /* whereas a bulk read can be protected by either a PR or PW + * extent lock */ mode |= LCK_PR; if (!(lock->l_granted_mode & mode)) -- 1.8.3.1