Whamcloud - gitweb
LU-9228 nrs: TBF realtime policies under congestion 87/26087/8
author Qian Yingjin <qian@ddn.com>
Mon, 6 Mar 2017 07:05:01 +0000 (15:05 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Thu, 25 Jan 2018 04:47:01 +0000 (04:47 +0000)
During TBF evaluation, we found that when the sum of the I/O bandwidth
requirements of all classes exceeds the system capacity, the classes
with the same rate limits get less bandwidth than if preconfigured
evenly.

The reason is as follows: under heavy load, a congested server will
miss deadlines for some classes. The number of calculated tokens may
then be larger than 1 during dequeuing. In the original
implementation, all classes are handled equally by simply discarding
the excess tokens.

Thus, a Hard Token Compensation (HTC) strategy is proposed. A class
can be configured with the HTC feature by the rule it matches. This
feature means that requests in this kind of class queue have high
real-time requirements and that the bandwidth assignment must be
satisfied as well as possible. When deadline misses happen, the
class keeps the deadline unchanged and the time residue (the
remainder of elapsed time divided by 1/r) is compensated to the
next round. This ensures that the next idle I/O thread will always
select this class to serve until all accumulated excess tokens
are handled or there are no pending requests in the class queue.

A new command format is added to enable the realtime feature for a rule:
start $ruleName jobid={dd.0} rate=100 realtime=1

Change-Id: I3c867052c27e57a30ccdfe649e0905d141792663
Signed-off-by: Qian Yingjin <qian@ddn.com>
Reviewed-on: https://review.whamcloud.com/26087
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Emoly Liu <emoly.liu@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/lustre_nrs_tbf.h
lustre/ptlrpc/nrs_tbf.c

index 6e0c736..7da07fb 100644 (file)
@@ -85,6 +85,13 @@ struct nrs_tbf_client {
        __u64                            tc_depth;
        /** Time check-point. */
        __u64                            tc_check_time;
+       /** Deadline of a class */
+       __u64                            tc_deadline;
+       /**
+        * Time residue: the remainder of elapsed time
+        * divided by nsecs when dequeue a request.
+        */
+       __u64                            tc_nsecs_resid;
        /** List of queued requests. */
        struct list_head                 tc_list;
        /** Node in binary heap. */
@@ -102,8 +109,11 @@ struct nrs_tbf_client {
 
 #define MAX_TBF_NAME (16)
 
-#define NTRS_STOPPING  0x0000001
-#define NTRS_DEFAULT   0x0000002
+enum nrs_rule_flags {
+       NTRS_STOPPING   = 0x00000001,
+       NTRS_DEFAULT    = 0x00000002,
+       NTRS_REALTIME   = 0x00000004,
+};
 
 struct nrs_tbf_rule {
        /** Name of the rule. */
@@ -139,7 +149,7 @@ struct nrs_tbf_rule {
        /** List of client. */
        struct list_head                 tr_cli_list;
        /** Flags of the rule. */
-       __u32                            tr_flags;
+       enum nrs_rule_flags              tr_flags;
        /** Usage Reference count taken on the rule. */
        atomic_t                         tr_ref;
        /** Generation of the rule. */
@@ -275,7 +285,7 @@ struct nrs_tbf_cmd {
                        struct list_head         ts_conds;
                        char                    *ts_conds_str;
                        __u32                    ts_valid_type;
-                       __u32                    ts_rule_flags;
+                       enum nrs_rule_flags      ts_rule_flags;
                        char                    *ts_next_name;
                } tc_start;
                struct nrs_tbf_cmd_change {
index c858b6c..ccab4c6 100644 (file)
@@ -300,6 +300,7 @@ nrs_tbf_rule_start(struct ptlrpc_nrs_policy *policy,
 
        memcpy(rule->tr_name, start->tc_name, strlen(start->tc_name));
        rule->tr_rpc_rate = start->u.tc_start.ts_rpc_rate;
+       rule->tr_flags = start->u.tc_start.ts_rule_flags;
        rule->tr_nsecs = NSEC_PER_SEC;
        do_div(rule->tr_nsecs, rule->tr_rpc_rate);
        rule->tr_depth = tbf_depth;
@@ -521,11 +522,9 @@ tbf_cli_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2)
        cli1 = container_of(e1, struct nrs_tbf_client, tc_node);
        cli2 = container_of(e2, struct nrs_tbf_client, tc_node);
 
-       if (cli1->tc_check_time + cli1->tc_nsecs <
-           cli2->tc_check_time + cli2->tc_nsecs)
+       if (cli1->tc_deadline < cli2->tc_deadline)
                return 1;
-       else if (cli1->tc_check_time + cli1->tc_nsecs >
-                cli2->tc_check_time + cli2->tc_nsecs)
+       else if (cli1->tc_deadline > cli2->tc_deadline)
                return 0;
 
        if (cli1->tc_check_time < cli2->tc_check_time)
@@ -2477,10 +2476,12 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy,
                                     struct ptlrpc_nrs_request,
                                     nr_u.tbf.tr_list);
        } else {
+               struct nrs_tbf_rule *rule = cli->tc_rule;
                __u64 now = ktime_to_ns(ktime_get());
                __u64 passed;
                __u64 ntoken;
                __u64 deadline;
+               __u64 old_resid = 0;
 
                deadline = cli->tc_check_time +
                          cli->tc_nsecs;
@@ -2488,9 +2489,19 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy,
                passed = now - cli->tc_check_time;
                ntoken = passed * cli->tc_rpc_rate;
                do_div(ntoken, NSEC_PER_SEC);
+
                ntoken += cli->tc_ntoken;
-               if (ntoken > cli->tc_depth)
+               if (rule->tr_flags & NTRS_REALTIME) {
+                       LASSERT(cli->tc_nsecs_resid < cli->tc_nsecs);
+                       old_resid = cli->tc_nsecs_resid;
+                       cli->tc_nsecs_resid += passed % cli->tc_nsecs;
+                       if (cli->tc_nsecs_resid > cli->tc_nsecs) {
+                               ntoken++;
+                               cli->tc_nsecs_resid -= cli->tc_nsecs;
+                       }
+               } else if (ntoken > cli->tc_depth)
                        ntoken = cli->tc_depth;
+
                if (ntoken > 0) {
                        struct ptlrpc_request *req;
                        nrq = list_entry(cli->tc_list.next,
@@ -2508,6 +2519,8 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy,
                                                   &cli->tc_node);
                                cli->tc_in_heap = false;
                        } else {
+                               if (!(rule->tr_flags & NTRS_REALTIME))
+                                       cli->tc_deadline = now + cli->tc_nsecs;
                                cfs_binheap_relocate(head->th_binheap,
                                                     &cli->tc_node);
                        }
@@ -2521,6 +2534,15 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy,
                } else {
                        ktime_t time;
 
+                       if (rule->tr_flags & NTRS_REALTIME) {
+                               cli->tc_deadline = deadline;
+                               cli->tc_nsecs_resid = old_resid;
+                               cfs_binheap_relocate(head->th_binheap,
+                                                    &cli->tc_node);
+                               if (node != cfs_binheap_root(head->th_binheap))
+                                       return nrs_tbf_req_get(policy,
+                                                              peek, force);
+                       }
                        policy->pol_nrs->nrs_throttling = 1;
                        head->th_deadline = deadline;
                        time = ktime_set(0, 0);
@@ -2556,6 +2578,7 @@ static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy,
                            struct nrs_tbf_head, th_res);
        if (list_empty(&cli->tc_list)) {
                LASSERT(!cli->tc_in_heap);
+               cli->tc_deadline = cli->tc_check_time + cli->tc_nsecs;
                rc = cfs_binheap_insert(head->th_binheap, &cli->tc_node);
                if (rc == 0) {
                        cli->tc_in_heap = true;
@@ -2563,8 +2586,7 @@ static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy,
                        list_add_tail(&nrq->nr_u.tbf.tr_list,
                                          &cli->tc_list);
                        if (policy->pol_nrs->nrs_throttling) {
-                               __u64 deadline = cli->tc_check_time +
-                                                cli->tc_nsecs;
+                               __u64 deadline = cli->tc_deadline;
                                if ((head->th_deadline > deadline) &&
                                    (hrtimer_try_to_cancel(&head->th_timer)
                                     >= 0)) {
@@ -2805,6 +2827,15 @@ nrs_tbf_parse_value_pair(struct nrs_tbf_cmd *cmd, char *buffer)
                        cmd->u.tc_change.tc_next_name = val;
                else
                        return -EINVAL;
+       } else if (strcmp(key, "realtime") == 0) {
+               unsigned long realtime;
+
+               rc = kstrtoul(val, 10, &realtime);
+               if (rc)
+                       return rc;
+
+               if (realtime > 0)
+                       cmd->u.tc_start.ts_rule_flags |= NTRS_REALTIME;
        } else {
                return -EINVAL;
        }