Whamcloud - gitweb
b=19778
author anserper <anserper>
Thu, 23 Jul 2009 10:14:41 +0000 (10:14 +0000)
committer anserper <anserper>
Thu, 23 Jul 2009 10:14:41 +0000 (10:14 +0000)
a=24644

i=Alexey Lyashkov
i=ZhiYong Tian

Do not loop infinitely in client_quota_ctl(): after a timeout, retry only a limited number of times (quota_resend_count, default 10) and then return an error.

lustre/include/lprocfs_status.h
lustre/include/obd.h
lustre/ldlm/ldlm_lib.c
lustre/mdc/lproc_mdc.c
lustre/obdclass/lprocfs_status.c
lustre/osc/lproc_osc.c
lustre/quota/quota_ctl.c
lustre/quota/quota_internal.h
lustre/tests/sanity-quota.sh

index 175ddc1..cb1dab5 100644 (file)
@@ -473,6 +473,7 @@ extern int lprocfs_rd_num_exports(char *page, char **start, off_t off,
                                   int count, int *eof, void *data);
 extern int lprocfs_rd_numrefs(char *page, char **start, off_t off,
                               int count, int *eof, void *data);
+
 struct adaptive_timeout;
 extern int lprocfs_at_hist_helper(char *page, int count, int rc,
                                   struct adaptive_timeout *at);
@@ -485,6 +486,11 @@ extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
 extern int lprocfs_wr_ping(struct file *file, const char *buffer,
                            unsigned long count, void *data);
 
+/* Quota resend count: procfs read and write handlers */
+extern int lprocfs_rd_quota_resend_count(char *page, char **start, off_t off,
+                                         int count, int *eof, void *data);
+extern int lprocfs_wr_quota_resend_count(struct file *file, const char *buffer,
+                                         unsigned long count, void *data);
+
 /* Statfs helpers */
 extern int lprocfs_rd_blksize(char *page, char **start, off_t off,
                               int count, int *eof, void *data);
@@ -501,6 +507,7 @@ extern int lprocfs_rd_filesfree(char *page, char **start, off_t off,
 extern int lprocfs_rd_filegroups(char *page, char **start, off_t off,
                                  int count, int *eof, void *data);
 
+
 extern int lprocfs_write_helper(const char *buffer, unsigned long count,
                                 int *val);
 extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
index a2bc1fa..235ac08 100644 (file)
@@ -393,6 +393,7 @@ struct filter_obd {
 #define MDC_MAX_RIF_DEFAULT       8
 #define MDC_MAX_RIF_MAX         512
 
+/* Initial value of cl_quota_resends in struct client_obd; applied by
+ * client_obd_setup(). */
+#define CLIENT_QUOTA_DEFAULT_RESENDS 10
 
 struct mdc_rpc_lock;
 struct obd_import;
@@ -502,6 +503,8 @@ struct client_obd {
         struct lu_client_seq    *cl_seq;
 
         atomic_t                 cl_resends; /* resend count */
+        atomic_t                 cl_quota_resends; /* quota related resend count */
+
         /* Cache of triples */
         struct lustre_cache     *cl_cache;
         obd_lock_cancel_cb       cl_ext_lock_cancel_cb;
index 78fc799..7459e77 100644 (file)
@@ -286,6 +286,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
         cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32;
 #endif
         atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS);
+        atomic_set(&cli->cl_quota_resends, CLIENT_QUOTA_DEFAULT_RESENDS);
 
         /* This value may be changed at connect time in
            ptlrpc_connect_interpret. */
index 46aae45..63ef292 100644 (file)
@@ -75,6 +75,7 @@ static int mdc_wr_max_rpcs_in_flight(struct file *file, const char *buffer,
 
         return count;
 }
+
 static struct lprocfs_vars lprocfs_mdc_obd_vars[] = {
         { "uuid",            lprocfs_rd_uuid,        0, 0 },
         { "ping",            0, lprocfs_wr_ping,     0, 0, 0222 },
@@ -90,6 +91,8 @@ static struct lprocfs_vars lprocfs_mdc_obd_vars[] = {
         { "mds_conn_uuid",   lprocfs_rd_conn_uuid,   0, 0 },
         { "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight,
                                 mdc_wr_max_rpcs_in_flight, 0 },
+        { "quota_resend_count",  lprocfs_rd_quota_resend_count,
+                                 lprocfs_wr_quota_resend_count, 0},
         { "timeouts",        lprocfs_rd_timeouts,    0, 0 },
         { "import",          lprocfs_rd_import,      0, 0 },
         { "state",           lprocfs_rd_state,       0, 0 },
index 1f99bf9..bb9ebb7 100644 (file)
@@ -910,6 +910,33 @@ int lprocfs_at_hist_helper(char *page, int count, int rc,
         return rc;
 }
 
+/*
+ * procfs read handler for "quota_resend_count".
+ *
+ * @data is the obd_device whose client state is inspected.  The current
+ * value of cl_quota_resends is formatted as one decimal line into @page,
+ * bounded by @count, and snprintf()'s return value is handed back.
+ * @start, @off and @eof are not touched.
+ */
+int lprocfs_rd_quota_resend_count(char *page, char **start, off_t off,
+                                  int count, int *eof, void *data)
+{
+        struct obd_device *dev = data;
+        int resends = atomic_read(&dev->u.cli.cl_quota_resends);
+
+        return snprintf(page, count, "%u\n", resends);
+}
+
+/*
+ * procfs write handler for "quota_resend_count".
+ *
+ * Parses an integer out of @buffer with lprocfs_write_helper() and stores
+ * it in cl_quota_resends of the obd_device passed in @data.
+ *
+ * Returns @count on success, the helper's non-zero result if parsing
+ * failed, or -EINVAL for a negative value.  @file is unused.
+ */
+int lprocfs_wr_quota_resend_count(struct file *file, const char *buffer,
+                                  unsigned long count, void *data)
+{
+        struct obd_device *dev = data;
+        int resends;
+        int err = lprocfs_write_helper(buffer, count, &resends);
+
+        if (err != 0)
+                return err;
+        if (resends < 0)
+                return -EINVAL;
+
+        atomic_set(&dev->u.cli.cl_quota_resends, resends);
+        return count;
+}
+
 /* See also ptlrpc_lprocfs_rd_timeouts */
 int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count,
                         int *eof, void *data)
@@ -2279,6 +2306,8 @@ EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
 EXPORT_SYMBOL(lprocfs_rd_kbytesavail);
 EXPORT_SYMBOL(lprocfs_rd_filestotal);
 EXPORT_SYMBOL(lprocfs_rd_filesfree);
+EXPORT_SYMBOL(lprocfs_rd_quota_resend_count);
+EXPORT_SYMBOL(lprocfs_wr_quota_resend_count);
 
 EXPORT_SYMBOL(lprocfs_write_helper);
 EXPORT_SYMBOL(lprocfs_write_frac_helper);
index 37d2ec8..9ca2cbc 100644 (file)
@@ -541,6 +541,8 @@ static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
         { "checksums",       osc_rd_checksum, osc_wr_checksum, 0 },
         { "checksum_type",   osc_rd_checksum_type, osc_wd_checksum_type, 0 },
         { "resend_count",  osc_rd_resend_count, osc_wr_resend_count, 0},
+        { "quota_resend_count",  lprocfs_rd_quota_resend_count,
+                                 lprocfs_wr_quota_resend_count, 0},
         { "timeouts",        lprocfs_rd_timeouts,      0, 0 },
         { "import",          lprocfs_rd_import,    0, 0 },
         { "state",           lprocfs_rd_state,         0, 0 },
index 5397b94..4a4261c 100644 (file)
@@ -346,7 +346,7 @@ int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
         struct ptlrpc_request *req;
         struct obd_quotactl *oqc;
         __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*oqctl) };
-        int ver, opc, rc;
+        int ver, opc, rc, resends = 0;
         ENTRY;
 
         if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME)) {
@@ -359,6 +359,8 @@ int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
                 RETURN(-EINVAL);
         }
 
+restart_request:
+
         req = ptlrpc_prep_req(class_exp2cliimp(exp), ver, opc, 2, size, NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
@@ -367,6 +369,8 @@ int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
         *oqc = *oqctl;
 
         ptlrpc_req_set_repsize(req, 2, size);
+        ptlrpc_at_set_req_timeout(req);
+        req->rq_no_resend = 1;
 
         rc = ptlrpc_queue_wait(req);
         if (rc) {
@@ -387,6 +391,19 @@ int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
         EXIT;
 out:
         ptlrpc_req_finished(req);
+
+        if (client_quota_recoverable_error(rc)) {
+                resends++;
+                if (!client_quota_should_resend(resends, &exp->exp_obd->u.cli)) {
+                        CERROR("too many resend retries, returning error "
+                               "(cmd = %d, id = %u, type = %d)\n",
+                               oqctl->qc_cmd, oqctl->qc_id, oqctl->qc_type);
+                        RETURN(-EIO);
+                }
+
+                goto restart_request;
+        }
+
         return rc;
 }
 
index 4f17e88..bf07ecf 100644 (file)
@@ -200,4 +200,16 @@ int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
 int client_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl);
 int lov_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl);
 int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
+
+/*
+ * Classify a quota request result: returns 1 when @rc is -ETIMEDOUT or
+ * -EAGAIN (the errors client_quota_ctl() retries on), 0 for anything else.
+ */
+static inline int client_quota_recoverable_error(int rc)
+{
+        switch (rc) {
+        case -ETIMEDOUT:
+        case -EAGAIN:
+                return 1;
+        default:
+                return 0;
+        }
+}
+
+/*
+ * Decide whether a quota request may be sent once more.
+ *
+ * @resend is the number of resends counted so far; @cli supplies the limit
+ * in cl_quota_resends.  A limit of 0 means "no cap".
+ *
+ * Returns 1 if the limit is 0 or still greater than @resend, 0 otherwise.
+ */
+static inline int client_quota_should_resend(int resend, struct client_obd *cli)
+{
+        /* Sample the limit exactly once.  It can be rewritten through procfs
+         * at any moment; testing one read for zero and then comparing a
+         * second read could turn a concurrent switch to 0 ("no cap") into a
+         * refusal to resend. */
+        int limit = atomic_read(&cli->cl_quota_resends);
+
+        return limit == 0 || limit > resend;
+}
+
 #endif
index 8981512..102f27e 100644 (file)
@@ -53,7 +53,7 @@ DIRECTIO=${DIRECTIO:-$LUSTRE/tests/directio}
 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
 remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
 
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="9 10 11 18b 21"
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="9 10 11 18b 21 29"
 
 QUOTALOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
 
@@ -2158,6 +2158,45 @@ test_28() {
 }
 run_test_with_stat 28 "test for consistency for qunit when setquota (18574) ==========="
 
+# test_29: a quotactl that is never answered must make the client give up
+# after a bounded number of resends rather than hang (bug 19778).
+test_29()
+{
+        local BLK_LIMIT=$((100 * 1024 * 1024)) # 100G
+        local timeout
+        local pid
+        local resends
+
+        # Save the current client timeout (the adaptive-timeout maximum when
+        # AT is enabled, the static timeout otherwise) and lower it to 10 so
+        # that every attempt expires quickly.
+        if at_is_enabled; then
+                timeout=$(at_max_get client)
+                at_max_set 10 client
+        else
+                timeout=$(lctl get_param -n timeout)
+                lctl set_param timeout=10
+        fi
+
+        # Resend limit as reported by the first matching MDC device.
+        resends=$(lctl get_param -n mdc.${FSNAME}-*.quota_resend_count | head -1)
+
+        # NOTE(review): presumably this fail_loc makes the MDS drop quotactl
+        # requests so the client only ever sees timeouts -- confirm against
+        # the OBD_FAIL_MDS_QUOTACTL_NET definition.
+        #define OBD_FAIL_MDS_QUOTACTL_NET 0x12e
+        lustre_fail mds 0x12e
+
+        # Run setquota in the background and remember its pid so the test can
+        # check on it after the wait below.
+        $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I 0 $DIR & pid=$!
+
+        # Wait 10 (the timeout set above) per allowed resend, plus 5 of slack.
+        echo "sleeping for $((10 * resends + 5)) seconds"
+        sleep $((10 * resends + 5))
+        # By now lfs must have exited ...
+        ps -p $pid && error "lfs hadn't finished by timeout"
+        # ... and its exit status must be non-zero.
+        wait $pid && error "succeeded, but should have failed"
+
+        # Clear the injected failure.
+        lustre_fail mds 0
+
+        # Put back the timeout value saved at the top of the test.
+        if at_is_enabled; then
+                at_max_set $timeout client
+        else
+                lctl set_param timeout=$timeout
+        fi
+
+        resetquota -u $TSTUSR
+}
+run_test_with_stat 29 "unhandled quotactls must not hang lustre client (19778) ========"
+
 # turn off quota
 test_99()
 {