From d11fa2c279593634cf6c4196b413a6d285b24e10 Mon Sep 17 00:00:00 2001 From: Qian Yingjin Date: Mon, 6 Mar 2017 15:05:01 +0800 Subject: [PATCH] LU-9228 nrs: TBF realtime policies under congestion During TBF evaluation, we find that when the sum of the I/O bandwidth requirements of all classes exceeds the system capacity, the classes with the same rate limits get less bandwidth than if preconfigured evenly. The reason is as follows: under heavy load on a congested server, some classes will miss their deadlines. The calculated number of tokens may be larger than 1 during dequeuing. In the original implementation, all classes are handled equally and the excess tokens are simply discarded. Thus, a Hard Token Compensation (HTC) strategy is proposed. A class can be configured with the HTC feature by the rule it matches. This feature means that requests in this kind of class queue have high real-time requirements and that the bandwidth assignment must be satisfied as well as possible. When a deadline is missed, the class keeps the deadline unchanged and the time residue (the remainder of the elapsed time divided by 1/r) is compensated in the next round. This ensures that the next idle I/O thread will always select this class to serve until all accumulated excess tokens are handled or there are no pending requests in the class queue. 
A new command format is added to enable realtime feature for a rule: start $ruleName jobid={dd.0} rate=100 realtime=1 Change-Id: I3c867052c27e57a30ccdfe649e0905d141792663 Signed-off-by: Qian Yingjin Reviewed-on: https://review.whamcloud.com/26087 Reviewed-by: Andreas Dilger Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Emoly Liu Reviewed-by: Oleg Drokin --- lustre/include/lustre_nrs_tbf.h | 18 +++++++++++++---- lustre/ptlrpc/nrs_tbf.c | 45 ++++++++++++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 11 deletions(-) diff --git a/lustre/include/lustre_nrs_tbf.h b/lustre/include/lustre_nrs_tbf.h index 6e0c736..7da07fb 100644 --- a/lustre/include/lustre_nrs_tbf.h +++ b/lustre/include/lustre_nrs_tbf.h @@ -85,6 +85,13 @@ struct nrs_tbf_client { __u64 tc_depth; /** Time check-point. */ __u64 tc_check_time; + /** Deadline of a class */ + __u64 tc_deadline; + /** + * Time residue: the remainder of elapsed time + * divided by nsecs when dequeue a request. + */ + __u64 tc_nsecs_resid; /** List of queued requests. */ struct list_head tc_list; /** Node in binary heap. */ @@ -102,8 +109,11 @@ struct nrs_tbf_client { #define MAX_TBF_NAME (16) -#define NTRS_STOPPING 0x0000001 -#define NTRS_DEFAULT 0x0000002 +enum nrs_rule_flags { + NTRS_STOPPING = 0x00000001, + NTRS_DEFAULT = 0x00000002, + NTRS_REALTIME = 0x00000004, +}; struct nrs_tbf_rule { /** Name of the rule. */ @@ -139,7 +149,7 @@ struct nrs_tbf_rule { /** List of client. */ struct list_head tr_cli_list; /** Flags of the rule. */ - __u32 tr_flags; + enum nrs_rule_flags tr_flags; /** Usage Reference count taken on the rule. */ atomic_t tr_ref; /** Generation of the rule. 
*/ @@ -275,7 +285,7 @@ struct nrs_tbf_cmd { struct list_head ts_conds; char *ts_conds_str; __u32 ts_valid_type; - __u32 ts_rule_flags; + enum nrs_rule_flags ts_rule_flags; char *ts_next_name; } tc_start; struct nrs_tbf_cmd_change { diff --git a/lustre/ptlrpc/nrs_tbf.c b/lustre/ptlrpc/nrs_tbf.c index c858b6c..ccab4c6 100644 --- a/lustre/ptlrpc/nrs_tbf.c +++ b/lustre/ptlrpc/nrs_tbf.c @@ -300,6 +300,7 @@ nrs_tbf_rule_start(struct ptlrpc_nrs_policy *policy, memcpy(rule->tr_name, start->tc_name, strlen(start->tc_name)); rule->tr_rpc_rate = start->u.tc_start.ts_rpc_rate; + rule->tr_flags = start->u.tc_start.ts_rule_flags; rule->tr_nsecs = NSEC_PER_SEC; do_div(rule->tr_nsecs, rule->tr_rpc_rate); rule->tr_depth = tbf_depth; @@ -521,11 +522,9 @@ tbf_cli_compare(struct cfs_binheap_node *e1, struct cfs_binheap_node *e2) cli1 = container_of(e1, struct nrs_tbf_client, tc_node); cli2 = container_of(e2, struct nrs_tbf_client, tc_node); - if (cli1->tc_check_time + cli1->tc_nsecs < - cli2->tc_check_time + cli2->tc_nsecs) + if (cli1->tc_deadline < cli2->tc_deadline) return 1; - else if (cli1->tc_check_time + cli1->tc_nsecs > - cli2->tc_check_time + cli2->tc_nsecs) + else if (cli1->tc_deadline > cli2->tc_deadline) return 0; if (cli1->tc_check_time < cli2->tc_check_time) @@ -2477,10 +2476,12 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, struct ptlrpc_nrs_request, nr_u.tbf.tr_list); } else { + struct nrs_tbf_rule *rule = cli->tc_rule; __u64 now = ktime_to_ns(ktime_get()); __u64 passed; __u64 ntoken; __u64 deadline; + __u64 old_resid = 0; deadline = cli->tc_check_time + cli->tc_nsecs; @@ -2488,9 +2489,19 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, passed = now - cli->tc_check_time; ntoken = passed * cli->tc_rpc_rate; do_div(ntoken, NSEC_PER_SEC); + ntoken += cli->tc_ntoken; - if (ntoken > cli->tc_depth) + if (rule->tr_flags & NTRS_REALTIME) { + LASSERT(cli->tc_nsecs_resid < cli->tc_nsecs); + old_resid = 
cli->tc_nsecs_resid; + cli->tc_nsecs_resid += passed % cli->tc_nsecs; + if (cli->tc_nsecs_resid > cli->tc_nsecs) { + ntoken++; + cli->tc_nsecs_resid -= cli->tc_nsecs; + } + } else if (ntoken > cli->tc_depth) ntoken = cli->tc_depth; + if (ntoken > 0) { struct ptlrpc_request *req; nrq = list_entry(cli->tc_list.next, @@ -2508,6 +2519,8 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, &cli->tc_node); cli->tc_in_heap = false; } else { + if (!(rule->tr_flags & NTRS_REALTIME)) + cli->tc_deadline = now + cli->tc_nsecs; cfs_binheap_relocate(head->th_binheap, &cli->tc_node); } @@ -2521,6 +2534,15 @@ struct ptlrpc_nrs_request *nrs_tbf_req_get(struct ptlrpc_nrs_policy *policy, } else { ktime_t time; + if (rule->tr_flags & NTRS_REALTIME) { + cli->tc_deadline = deadline; + cli->tc_nsecs_resid = old_resid; + cfs_binheap_relocate(head->th_binheap, + &cli->tc_node); + if (node != cfs_binheap_root(head->th_binheap)) + return nrs_tbf_req_get(policy, + peek, force); + } policy->pol_nrs->nrs_throttling = 1; head->th_deadline = deadline; time = ktime_set(0, 0); @@ -2556,6 +2578,7 @@ static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, struct nrs_tbf_head, th_res); if (list_empty(&cli->tc_list)) { LASSERT(!cli->tc_in_heap); + cli->tc_deadline = cli->tc_check_time + cli->tc_nsecs; rc = cfs_binheap_insert(head->th_binheap, &cli->tc_node); if (rc == 0) { cli->tc_in_heap = true; @@ -2563,8 +2586,7 @@ static int nrs_tbf_req_add(struct ptlrpc_nrs_policy *policy, list_add_tail(&nrq->nr_u.tbf.tr_list, &cli->tc_list); if (policy->pol_nrs->nrs_throttling) { - __u64 deadline = cli->tc_check_time + - cli->tc_nsecs; + __u64 deadline = cli->tc_deadline; if ((head->th_deadline > deadline) && (hrtimer_try_to_cancel(&head->th_timer) >= 0)) { @@ -2805,6 +2827,15 @@ nrs_tbf_parse_value_pair(struct nrs_tbf_cmd *cmd, char *buffer) cmd->u.tc_change.tc_next_name = val; else return -EINVAL; + } else if (strcmp(key, "realtime") == 0) { + unsigned long realtime; + + rc = 
kstrtoul(val, 10, &realtime); + if (rc) + return rc; + + if (realtime > 0) + cmd->u.tc_start.ts_rule_flags |= NTRS_REALTIME; } else { return -EINVAL; } -- 1.8.3.1