#!/usr/bin/env python3

# Author: 赖伟文 (Lai Weiwen)
# Email: lwwens@gmail.com ; lai_wei_wen@qq.com

import codecs
import jieba
import json
import numpy
import re
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import Counter
from PIL import Image
from wordcloud import WordCloud


# Print a message prefixed with the current timestamp
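# Example (illustrative): print_info('Loading stopwords...') prints
# "[2019-01-02 21:07:09] Loading stopwords..."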
def print_info(info):
    # Timestamp format: 2019-01-02 21:07:09
    now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print('[%s] %s' % (now, info))


# Write text to a file
def write_data(filepath, text):
    with codecs.open(filepath, 'w', encoding='UTF-8') as afile:
        afile.write(text)


# Load the stopword list
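# The stopword file is assumed to be UTF-8 with one stopword per line
# (e.g. common Chinese function words such as "的" and "了").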
def load_stopwords(filepath):
    print_info('Loading stopwords...')
    with codecs.open(filepath, 'r', encoding='UTF-8') as stopwords_file:
        stopwords = [line.strip() for line in stopwords_file]
    return stopwords


# Remove special strings
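# Example (illustrative): clean_text('详情见 http://t.cn/RAbcDef 谢谢')
# returns '详情见 , 谢谢'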
def clean_text(text):
    # Remove URL strings; the dot is escaped so it matches a literal '.'
    print_info('Clean URLs...')
    url_regex = r'https?://[a-zA-Z]+\.[a-zA-Z]+/[0-9a-zA-Z]+'
    url_pattern = re.compile(url_regex)
    text = url_pattern.sub(',', text)
    return text


# Word segmentation
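# Returns the segmented text with each word followed by a single space,
# e.g. '微博 文本 词云 ' (illustrative).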
def cut_words(data_filepath, stopwords):
    print_info('Cutting words...')
    outstr = ''

    # Read the file to be segmented
    with codecs.open(data_filepath, 'r', encoding='UTF-8') as data_file:
        text = data_file.read()

        # Remove special strings
        text = clean_text(text)
        # Segment with jieba (accurate mode)
        seg_list = jieba.cut(text, cut_all=False)

        # Drop stopwords and newlines; separate the remaining words with a single space
        for word in seg_list:
            if word not in stopwords and word != '\n':
                outstr += word
                outstr += ' '

    return outstr


# Count word frequencies
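# Returns a dict ordered by descending frequency,
# e.g. {'微博': 120, '词云': 85, ...} (illustrative counts).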
def wordcount(cut_words_filepath):
    print_info('Counting words...')
    with codecs.open(cut_words_filepath, 'r', encoding='UTF-8') as cuted_file:
        text = cuted_file.read()
        seg_list = jieba.cut(text, cut_all=False)
        word_dict = dict(Counter(seg_list))
        # Drop the space tokens introduced as word separators (if present)
        word_dict.pop(' ', None)
        # Sort the dict by value in descending order
        word_dict = dict(sorted(word_dict.items(), key=lambda item: item[1], reverse=True))
    return word_dict


# Generate the word cloud
def generate_wordcloud(word_dict, wcfont_path, mask_filepath):
    print_info('Generating wordcloud...')

    # Build the mask array from the mask image
    mask = numpy.array(Image.open(mask_filepath))
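    # Note: the wordcloud library treats pure-white (#FFFFFF) mask pixels as
    # "masked out", so words are drawn only on the non-white regions.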

    # Build the WordCloud from the frequency dict
    word_cloud = WordCloud(font_path=wcfont_path, background_color='white', max_font_size=2000,
                           mask=mask, contour_color='steelblue', contour_width=2)
    word_cloud.generate_from_frequencies(word_dict)
    return word_cloud


# Display the word cloud
def display_wordcloud(word_cloud, mpl_font_path):
    print_info('Displaying wordcloud...')
    mpl_font = mpl.font_manager.FontProperties(fname=mpl_font_path)

    # Avoid the Unicode minus glyph so minus signs render correctly with the Chinese font
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure()
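    # Plot title: "Weibo text word cloud", rendered with the Chinese font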
    plt.title('微博文本词云', fontproperties=mpl_font, fontsize=30)
    plt.imshow(word_cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()


# Segment the text and build the word cloud
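# Pipeline: load stopwords -> segment -> write the segmented text -> count
# word frequencies -> write the frequency JSON -> generate, save and display
# the word cloud.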
def cut_word_and_wordcloud(data_filepath, cut_words_filepath, word_dict_filepath, wc_image_filepath, mask_filepath):
    # Font paths
    mpl_font_path = './fonts/SimHei.ttf'
    wcfont_path = './fonts/SimFang.ttf'
    # Stopword list path
    stopwords_filepath = './stopword.txt'

    # Load stopwords
    stopwords = load_stopwords(stopwords_filepath)
    # Segment the text
    text = cut_words(data_filepath, stopwords)
    # Write the segmented text to a file
    write_data(cut_words_filepath, text)
    # Count word frequencies
    word_dict = wordcount(cut_words_filepath)
    # Write the word frequencies to a file as JSON
    write_data(word_dict_filepath, json.dumps(word_dict, ensure_ascii=False))
    # Generate the word cloud
    word_cloud = generate_wordcloud(word_dict, wcfont_path, mask_filepath)
    # Save the word cloud image
    word_cloud.to_file(wc_image_filepath)
    # Display the word cloud
    display_wordcloud(word_cloud, mpl_font_path)


# Main program
if __name__ == '__main__':
    print_info('Main process...')
    # Dataset and output paths
    mask_filepath = './alice_mask.png'
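    # alice_mask.png is assumed to be a dark silhouette on a white background;
    # see the mask note in generate_wordcloud().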
    # data_filepath = './classify/YunNan_weibo_2017-08-08.txt'
    # cut_words_filepath = './cut_words/分词并去除基本停用词_2017-08-08.txt'
    # word_dict_filepath = './cut_words/词频_2017-08-08.txt'
    # wc_image_filepath = './wcimage/YunNan_weibo_2017-08-08.jpg'
    data_filepath = './classify/YunNan_weibo_2017-08-15.txt'
    cut_words_filepath = './cut_words/分词并去除基本停用词_2017-08-15.txt'
    word_dict_filepath = './cut_words/词频_2017-08-15.json'
    wc_image_filepath = './wcimage/YunNan_weibo_2017-08-15.jpg'
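    # The data file is assumed to be plain UTF-8 text containing the raw Weibo
    # posts for that date; the remaining paths receive the segmented text, the
    # word-frequency JSON, and the rendered word cloud image.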

    cut_word_and_wordcloud(data_filepath, cut_words_filepath,
                           word_dict_filepath, wc_image_filepath,
                           mask_filepath)
