Whamcloud - gitweb
LU-6283 ptlrpc: Implement NRS Delay Policy 01/14701/32
authorChris Horn <hornc@cray.com>
Thu, 12 Mar 2015 23:25:14 +0000 (18:25 -0500)
committerOleg Drokin <oleg.drokin@intel.com>
Sun, 23 Apr 2017 03:30:27 +0000 (03:30 +0000)
The NRS Delay policy seeks to perturb the timing of request processing
at the PtlRPC layer, with the goal of simulating high server load, and
finding and exposing timing related problems. When this policy is
active, upon arrival of a request the policy will calculate an offset,
within a defined, user-configurable range, from the request arrival
time, to determine a time after which the request should be handled.
The request is then stored using the cfs_binheap implementation,
which sorts the request according to the assigned start time.
Requests are removed from the binheap for handling once their start
time has been passed.

The behavior of the policy can be controlled via three proc files
which can be written to via lctl similar to other policies.

nrs_delay_min: Controls the minimum amount of time, in seconds, that a
request will be delayed by this policy. The default is 5 seconds.

nrs_delay_max: Controls the maximum amount of time, in seconds, that a
request will be delayed by this policy. The default is 300 seconds.

nrs_delay_pct: Controls the percentage of requests that will be delayed
by this policy. The default is 100. Note, when a request is not
selected for handling by the delay policy due to this variable then
the request will be handled by whatever fallback policy is defined
for that service. If no other fallback policy is defined then the
request will be handled by the FIFO policy.

Some examples:

lctl set_param *.*.*.nrs_delay_min=reg_delay_min:5, to set the regular
request minimum delay on all PtlRPC services to 5 seconds.

lctl set_param *.*.*.nrs_delay_min=hp_delay_min:2, to set the
high-priority request minimum delay on all PtlRPC services to 2
seconds.

lctl set_param *.*.ost_io.nrs_delay_min=8, to set both the regular and
high-priority request minimum delay of the ost_io service to 8
seconds.

lctl set_param *.*.*.nrs_delay_max=reg_delay_max:20, to set the
regular request maximum delay on all PtlRPC services to 20 seconds.

lctl set_param *.*.*.nrs_delay_max=hp_delay_max:10, to set the
high-priority request maximum delay on all PtlRPC services to 10
seconds.

lctl set_param *.*.ost_io.nrs_delay_max=35, to set both the regular
and high-priority request maximum delay of the ost_io service to 35
seconds.

lctl set_param *.*.*.nrs_delay_pct=reg_delay_pct:5, to delay 5
percent of regular requests on all PtlRPC services.

lctl set_param *.*.*.nrs_delay_pct=hp_delay_pct:2, to delay 2 percent
of high-priority requests on all PtlRPC services.

lctl set_param *.*.ost_io.nrs_delay_pct=8, to delay 8 percent of both
regular and high-priority requests of the ost_io service.

Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: Iab50a639900adf31893c7b1fe83658932fd59db1
Reviewed-on: https://review.whamcloud.com/14701
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Henri Doreau <henri.doreau@cea.fr>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/Makefile.am
lustre/include/lustre_nrs.h
lustre/include/lustre_nrs_delay.h [new file with mode: 0644]
lustre/ptlrpc/Makefile.in
lustre/ptlrpc/nrs.c
lustre/ptlrpc/nrs_delay.c [new file with mode: 0644]
lustre/ptlrpc/ptlrpc_internal.h
lustre/tests/sanityn.sh [changed mode: 0644->0755]

index 5bbaf4a..32f6680 100644 (file)
@@ -74,6 +74,7 @@ EXTRA_DIST = \
        lustre_nodemap.h \
        lustre_nrs.h \
        lustre_nrs_crr.h \
+       lustre_nrs_delay.h \
        lustre_nrs_fifo.h \
        lustre_nrs_orr.h \
        lustre_nrs_tbf.h \
index 00d0cdf..6397cf2 100644 (file)
@@ -675,6 +675,7 @@ enum {
 #include <lustre_nrs_tbf.h>
 #include <lustre_nrs_crr.h>
 #include <lustre_nrs_orr.h>
+#include <lustre_nrs_delay.h>
 
 /**
  * NRS request
@@ -721,6 +722,10 @@ struct ptlrpc_nrs_request {
                 * TBF request definition
                 */
                struct nrs_tbf_req      tbf;
+               /**
+                * Fields for the delay policy
+                */
+               struct nrs_delay_req    delay;
        } nr_u;
        /**
         * Externally-registering policies may want to use this to allocate
diff --git a/lustre/include/lustre_nrs_delay.h b/lustre/include/lustre_nrs_delay.h
new file mode 100644 (file)
index 0000000..01605a7
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2015, Cray Inc. All Rights Reserved.
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ */
+/*
+ *
+ * Network Request Scheduler (NRS) Delay policy
+ *
+ */
+
+#ifndef _LUSTRE_NRS_DELAY_H
+#define _LUSTRE_NRS_DELAY_H
+
+/**
+ * \name delay
+ *
+ * Delay policy
+ * @{
+ */
+
+/**
+ * Private data structure for the delay policy
+ */
+struct nrs_delay_data {
+       /* Resource handed back to callers by nrs_delay_res_get(). */
+       struct ptlrpc_nrs_resource       delay_res;
+
+       /**
+        * Delayed requests are stored in this binheap until they are
+        * removed for handling.
+        */
+       struct cfs_binheap              *delay_binheap;
+
+       /**
+        * Lower bound, in seconds, of the random delay given to a request
+        */
+       __u32                            min_delay;
+
+       /**
+        * Upper bound, in seconds, of the random delay given to a request
+        */
+       __u32                            max_delay;
+
+       /**
+        * We'll delay this percent of requests (0 disables delaying,
+        * 100 delays every request)
+        */
+       __u32                            delay_pct;
+};
+
+/* Per-request delay policy state, embedded in ptlrpc_nrs_request::nr_u. */
+struct nrs_delay_req {
+       /**
+        * This is the time at which a request becomes eligible for handling,
+        * in seconds; it is compared against ktime_get_real_seconds().
+        */
+       time64_t        req_start_time;
+};
+
+/* Opcodes handled by nrs_delay_ctl(): RD_* read a setting, WR_* update it. */
+enum nrs_ctl_delay {
+       /* Read min_delay */
+       NRS_CTL_DELAY_RD_MIN = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+       /* Write min_delay; rejected when larger than max_delay */
+       NRS_CTL_DELAY_WR_MIN,
+       /* Read max_delay */
+       NRS_CTL_DELAY_RD_MAX,
+       /* Write max_delay; rejected when smaller than min_delay */
+       NRS_CTL_DELAY_WR_MAX,
+       /* Read delay_pct */
+       NRS_CTL_DELAY_RD_PCT,
+       /* Write delay_pct; rejected when larger than 100 */
+       NRS_CTL_DELAY_WR_PCT,
+};
+
+/** @} delay */
+
+#endif
index f1966fa..1847e22 100644 (file)
@@ -22,7 +22,7 @@ ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o
 ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o
 ptlrpc_objs += sec.o sec_ctx.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o
 ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o nrs_crr.o nrs_orr.o
-ptlrpc_objs += nrs_tbf.o errno.o
+ptlrpc_objs += nrs_tbf.o nrs_delay.o errno.o
 
 nodemap_objs := nodemap_handler.o nodemap_lproc.o nodemap_range.o
 nodemap_objs += nodemap_idmap.o nodemap_rbtree.o nodemap_member.o
index 9b627da..43d1a7c 100644 (file)
@@ -1811,6 +1811,10 @@ int ptlrpc_nrs_init(void)
        rc = ptlrpc_nrs_policy_register(&nrs_conf_tbf);
        if (rc != 0)
                GOTO(fail, rc);
+
+       rc = ptlrpc_nrs_policy_register(&nrs_conf_delay);
+       if (rc != 0)
+               GOTO(fail, rc);
 #endif /* HAVE_SERVER_SUPPORT */
 
        RETURN(rc);
diff --git a/lustre/ptlrpc/nrs_delay.c b/lustre/ptlrpc/nrs_delay.c
new file mode 100644 (file)
index 0000000..77da1c1
--- /dev/null
@@ -0,0 +1,842 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Cray Inc. All Rights Reserved.
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ */
+/*
+ * lustre/ptlrpc/nrs_delay.c
+ *
+ * Network Request Scheduler (NRS) Delay policy
+ *
+ * This policy will delay request handling for some configurable amount of
+ * time.
+ *
+ * Author: Chris Horn <hornc@cray.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name delay
+ *
+ * The delay policy schedules RPCs so that they are only processed after some
+ * configurable amount of time (in seconds) has passed.
+ *
+ * The defaults were chosen arbitrarily.
+ *
+ * @{
+ */
+
+#define NRS_POL_NAME_DELAY     "delay"
+
+/* Default minimum delay in seconds. */
+#define NRS_DELAY_MIN_DEFAULT  5
+/* Default maximum delay, in seconds. */
+#define NRS_DELAY_MAX_DEFAULT  300
+/* Default percentage of delayed RPCs. */
+#define NRS_DELAY_PCT_DEFAULT  100
+
+/**
+ * Binary heap predicate.
+ *
+ * Elements are sorted according to the start time assigned to the requests
+ * upon enqueue. An element with an earlier start time is "less than" an
+ * element with a later start time.
+ *
+ * \retval 0 start_time(e1) > start_time(e2)
+ * \retval 1 start_time(e1) <= start_time(e2)
+ */
+static int delay_req_compare(struct cfs_binheap_node *e1,
+                            struct cfs_binheap_node *e2)
+{
+       /* Recover the NRS requests that embed the two heap nodes. */
+       struct ptlrpc_nrs_request *lhs =
+               container_of(e1, struct ptlrpc_nrs_request, nr_node);
+       struct ptlrpc_nrs_request *rhs =
+               container_of(e2, struct ptlrpc_nrs_request, nr_node);
+       time64_t lhs_start = lhs->nr_u.delay.req_start_time;
+       time64_t rhs_start = rhs->nr_u.delay.req_start_time;
+
+       /* 1 when e1 becomes eligible no later than e2, 0 otherwise. */
+       return lhs_start <= rhs_start;
+}
+
+/* Binheap callbacks: only the ordering predicate is supplied. */
+static struct cfs_binheap_ops nrs_delay_heap_ops = {
+       .hop_compare    = delay_req_compare,
+       .hop_enter      = NULL,
+       .hop_exit       = NULL,
+};
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes
+ * the delay-specific private data structure.
+ *
+ * \param[in] policy The policy to start
+ * \param[in] arg    Generic char buffer; unused in this policy
+ *
+ * \retval -ENOMEM OOM error
+ * \retval  0     success
+ *
+ * \see nrs_policy_register()
+ * \see nrs_policy_ctl()
+ */
+static int nrs_delay_start(struct ptlrpc_nrs_policy *policy, char *arg)
+{
+       struct nrs_delay_data *priv;
+       struct cfs_binheap *heap;
+
+       ENTRY;
+
+       /* Allocate the private data using the policy's CPT table and id. */
+       OBD_CPT_ALLOC_PTR(priv, nrs_pol2cptab(policy), nrs_pol2cptid(policy));
+       if (priv == NULL)
+               RETURN(-ENOMEM);
+
+       heap = cfs_binheap_create(&nrs_delay_heap_ops, CBH_FLAG_ATOMIC_GROW,
+                                 4096, NULL, nrs_pol2cptab(policy),
+                                 nrs_pol2cptid(policy));
+       if (heap == NULL) {
+               /* Undo the allocation above before reporting the failure. */
+               OBD_FREE_PTR(priv);
+               RETURN(-ENOMEM);
+       }
+
+       /* Start out with the compile-time defaults. */
+       priv->delay_binheap = heap;
+       priv->min_delay = NRS_DELAY_MIN_DEFAULT;
+       priv->max_delay = NRS_DELAY_MAX_DEFAULT;
+       priv->delay_pct = NRS_DELAY_PCT_DEFAULT;
+
+       /* Publish the fully initialized data on the policy. */
+       policy->pol_private = priv;
+
+       RETURN(0);
+}
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the delay-specific
+ * private data structure.
+ *
+ * \param[in] policy The policy to stop
+ *
+ * \see nrs_policy_stop0()
+ */
+static void nrs_delay_stop(struct ptlrpc_nrs_policy *policy)
+{
+       struct nrs_delay_data *delay_data = policy->pol_private;
+
+       /* The private data and its heap are set up by nrs_delay_start(). */
+       LASSERT(delay_data != NULL);
+       LASSERT(delay_data->delay_binheap != NULL);
+       /* No request may still be queued on the policy when it is stopped. */
+       LASSERT(cfs_binheap_is_empty(delay_data->delay_binheap));
+
+       /* Release the heap first, then the structure that points at it. */
+       cfs_binheap_destroy(delay_data->delay_binheap);
+
+       OBD_FREE_PTR(delay_data);
+}
+
+/**
+ * Is called for obtaining a delay policy resource.
+ *
+ * \param[in]  policy    The policy on which the request is being asked for
+ * \param[in]  nrq       The request for which resources are being taken
+ * \param[in]  parent    Parent resource, unused in this policy
+ * \param[out] resp      Resources references are placed in this array
+ * \param[in]  moving_req Signifies limited caller context; unused in this
+ *                       policy
+ *
+ * \retval 1 The delay policy only has a one-level resource hierarchy
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_delay_res_get(struct ptlrpc_nrs_policy *policy,
+                            struct ptlrpc_nrs_request *nrq,
+                            const struct ptlrpc_nrs_resource *parent,
+                            struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+       struct nrs_delay_data *priv = policy->pol_private;
+
+       /*
+        * The only resource is the one embedded in the private data; hand
+        * it back and return 1 to end the hierarchy walk at this level.
+        */
+       *resp = &priv->delay_res;
+
+       return 1;
+}
+
+/**
+ * Called when getting a request from the delay policy for handling, or just
+ * peeking; removes the request from the policy when it is to be handled.
+ * Requests are only removed from this policy when their start time has
+ * passed.
+ *
+ * \param[in] policy The policy
+ * \param[in] peek   When set, signifies that we just want to examine the
+ *                  request, and not handle it, so the request is not removed
+ *                  from the policy.
+ * \param[in] force  Force the policy to return a request
+ *
+ * \retval The request to be handled
+ * \retval NULL no request available
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request *nrs_delay_req_get(struct ptlrpc_nrs_policy *policy,
+                                            bool peek, bool force)
+{
+       struct nrs_delay_data *priv = policy->pol_private;
+       struct cfs_binheap_node *root = cfs_binheap_root(priv->delay_binheap);
+       struct ptlrpc_nrs_request *head;
+
+       /* Nothing is queued on this policy. */
+       if (unlikely(root == NULL))
+               return NULL;
+
+       head = container_of(root, struct ptlrpc_nrs_request, nr_node);
+
+       /*
+        * The heap root carries the earliest start time; if even that one
+        * is still in the future, nothing is eligible unless forced.
+        */
+       if (!force &&
+           ktime_get_real_seconds() < head->nr_u.delay.req_start_time)
+               return NULL;
+
+       /* A peek leaves the request queued; a real get dequeues it. */
+       if (likely(!peek))
+               cfs_binheap_remove(priv->delay_binheap, &head->nr_node);
+
+       return head;
+}
+
+/**
+ * Adds request \a nrq to a delay \a policy instance's set of queued requests
+ *
+ * A percentage (delay_pct) of incoming requests are delayed by this policy.
+ * If selected for delay a request start time is calculated. A start time
+ * is the current time plus a random offset in the range [min_delay, max_delay]
+ * The start time is recorded in the request, and is then used by
+ * delay_req_compare() to maintain a set of requests ordered by their start
+ * times.
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq    The request to add
+ *
+ * \retval 0 request added
+ * \retval 1 request not added
+ *
+ */
+static int nrs_delay_req_add(struct ptlrpc_nrs_policy *policy,
+                            struct ptlrpc_nrs_request *nrq)
+{
+       struct nrs_delay_data *delay_data = policy->pol_private;
+       __u32 delay_range;
+
+       /*
+        * cfs_rand() % 100 yields a value in [0, 99]. A request is delayed
+        * only when that value is strictly below delay_pct, which selects
+        * delay_pct out of every 100 possible values. Skipping only when
+        * delay_pct < value would also delay the value equal to delay_pct,
+        * i.e. delay_pct + 1 percent of requests.
+        */
+       if (delay_data->delay_pct == 0 || /* Not delaying anything */
+           (delay_data->delay_pct != 100 &&
+            delay_data->delay_pct <= cfs_rand() % 100))
+               return 1;
+
+       /* Number of distinct offsets in the inclusive [min, max] range. */
+       delay_range = delay_data->max_delay - delay_data->min_delay + 1;
+
+       /* Start time = now + random offset within [min_delay, max_delay]. */
+       nrq->nr_u.delay.req_start_time = ktime_get_real_seconds() +
+                                        cfs_rand() % delay_range +
+                                        delay_data->min_delay;
+
+       return cfs_binheap_insert(delay_data->delay_binheap, &nrq->nr_node);
+}
+
+/**
+ * Removes request \a nrq from \a policy's list of queued requests.
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq    The request to remove
+ */
+static void nrs_delay_req_del(struct ptlrpc_nrs_policy *policy,
+                             struct ptlrpc_nrs_request *nrq)
+{
+       struct cfs_binheap *heap;
+
+       /* The heap of queued requests hangs off the policy's private data. */
+       heap = ((struct nrs_delay_data *)policy->pol_private)->delay_binheap;
+       cfs_binheap_remove(heap, &nrq->nr_node);
+}
+
+/**
+ * Prints a debug statement right before the request \a nrq stops being
+ * handled.
+ *
+ * \param[in] policy The policy handling the request
+ * \param[in] nrq    The request being handled
+ *
+ * \see ptlrpc_server_finish_request()
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static void nrs_delay_req_stop(struct ptlrpc_nrs_policy *policy,
+                              struct ptlrpc_nrs_request *nrq)
+{
+       struct ptlrpc_request *req;
+       s64 delayed_for;
+
+       /* The NRS request is embedded in the ptlrpc request as rq_nrq. */
+       req = container_of(nrq, struct ptlrpc_request, rq_nrq);
+
+       /* Seconds between the request's arrival and its assigned start. */
+       delayed_for = (s64)(nrq->nr_u.delay.req_start_time -
+                           req->rq_srv.sr_arrival_time.tv_sec);
+
+       DEBUG_REQ(D_RPCTRACE, req,
+                 "NRS: finished delayed request from %s after %llds",
+                 libcfs_id2str(req->rq_peer), delayed_for);
+}
+
+/**
+ * Performs ctl functions specific to delay policy instances; similar to ioctl
+ *
+ * \param[in]     policy the policy instance
+ * \param[in]     opc    the opcode
+ * \param[in,out] arg    used for passing parameters and information
+ *
+ * \pre assert_spin_locked(&policy->pol_nrs->nrs_lock)
+ * \post assert_spin_locked(&policy->pol_nrs->nrs_lock)
+ *
+ * \retval 0   operation carried out successfully
+ * \retval -ve error
+ */
+static int nrs_delay_ctl(struct ptlrpc_nrs_policy *policy,
+                        enum ptlrpc_nrs_ctl opc, void *arg)
+{
+       struct nrs_delay_data *delay_data = policy->pol_private;
+       /* Every opcode below exchanges exactly one __u32 through arg. */
+       __u32 *val = (__u32 *)arg;
+
+       assert_spin_locked(&policy->pol_nrs->nrs_lock);
+
+       switch ((enum nrs_ctl_delay)opc) {
+       default:
+               RETURN(-EINVAL);
+
+       case NRS_CTL_DELAY_RD_MIN:
+               *val = delay_data->min_delay;
+               break;
+
+       case NRS_CTL_DELAY_WR_MIN:
+               /* Keep the invariant min_delay <= max_delay. */
+               if (*val > delay_data->max_delay)
+                       RETURN(-EINVAL);
+
+               delay_data->min_delay = *val;
+               break;
+
+       case NRS_CTL_DELAY_RD_MAX:
+               *val = delay_data->max_delay;
+               break;
+
+       case NRS_CTL_DELAY_WR_MAX:
+               /* Keep the invariant min_delay <= max_delay. */
+               if (*val < delay_data->min_delay)
+                       RETURN(-EINVAL);
+
+               delay_data->max_delay = *val;
+               break;
+
+       case NRS_CTL_DELAY_RD_PCT:
+               *val = delay_data->delay_pct;
+               break;
+
+       case NRS_CTL_DELAY_WR_PCT:
+               /*
+                * *val is unsigned, so it can never be negative; only the
+                * upper bound of the percentage needs checking.
+                */
+               if (*val > 100)
+                       RETURN(-EINVAL);
+
+               delay_data->delay_pct = *val;
+               break;
+       }
+       RETURN(0);
+}
+
+/**
+ * lprocfs interface
+ */
+
+#ifdef CONFIG_PROC_FS
+
+/* nrs_delay_min and nrs_delay_max are bounded by these values */
+#define LPROCFS_NRS_DELAY_LOWER_BOUND          0
+#define LPROCFS_NRS_DELAY_UPPER_BOUND          65535
+
+#define LPROCFS_NRS_DELAY_MIN_NAME             "delay_min:"
+#define LPROCFS_NRS_DELAY_MIN_NAME_REG         "reg_delay_min:"
+#define LPROCFS_NRS_DELAY_MIN_NAME_HP          "hp_delay_min:"
+
+/**
+ * Max size of the nrs_delay_min seq_write buffer. Needs to be large enough
+ * to hold the string: "reg_delay_min:65535 hp_delay_min:65535"
+ */
+#define LPROCFS_NRS_DELAY_MIN_SIZE                                            \
+       sizeof(LPROCFS_NRS_DELAY_MIN_NAME_REG                                  \
+              __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND)                      \
+              " " LPROCFS_NRS_DELAY_MIN_NAME_HP                               \
+              __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND))
+
+#define LPROCFS_NRS_DELAY_MAX_NAME             "delay_max:"
+#define LPROCFS_NRS_DELAY_MAX_NAME_REG         "reg_delay_max:"
+#define LPROCFS_NRS_DELAY_MAX_NAME_HP          "hp_delay_max:"
+
+/**
+ * Similar to LPROCFS_NRS_DELAY_MIN_SIZE above, but for the nrs_delay_max
+ * variable.
+ */
+#define LPROCFS_NRS_DELAY_MAX_SIZE                                            \
+       sizeof(LPROCFS_NRS_DELAY_MAX_NAME_REG                                  \
+              __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND)                      \
+              " " LPROCFS_NRS_DELAY_MAX_NAME_HP                               \
+              __stringify(LPROCFS_NRS_DELAY_UPPER_BOUND))
+
+#define LPROCFS_NRS_DELAY_PCT_MIN_VAL          0
+#define LPROCFS_NRS_DELAY_PCT_MAX_VAL          100
+#define LPROCFS_NRS_DELAY_PCT_NAME             "delay_pct:"
+#define LPROCFS_NRS_DELAY_PCT_NAME_REG         "reg_delay_pct:"
+#define LPROCFS_NRS_DELAY_PCT_NAME_HP          "hp_delay_pct:"
+
+/**
+ * Similar to LPROCFS_NRS_DELAY_MIN_SIZE above, but for the nrs_delay_pct
+ * variable.
+ */
+#define LPROCFS_NRS_DELAY_PCT_SIZE                                            \
+       sizeof(LPROCFS_NRS_DELAY_PCT_NAME_REG                                  \
+              __stringify(LPROCFS_NRS_DELAY_PCT_MAX_VAL)                      \
+              " " LPROCFS_NRS_DELAY_PCT_NAME_HP                               \
+              __stringify(LPROCFS_NRS_DELAY_PCT_MAX_VAL))
+
+/**
+ * Helper for delay's seq_write functions.
+ *
+ * Copies the user buffer, extracts a value for the regular and/or the
+ * high-priority NRS head, range-checks it and hands it to
+ * ptlrpc_nrs_policy_control().
+ *
+ * The input may name a head explicitly ("reg_<var_name><value>" or
+ * "hp_<var_name><value>"), or be a bare number, which is applied to the
+ * regular head and, if the service has one, to the high-priority head too.
+ *
+ * \param[in] buffer   user-space buffer holding the text that was written
+ * \param[in] bufsize  size of the kernel copy; \a count must be smaller
+ * \param[in] count    number of bytes in \a buffer
+ * \param[in] var_name variable name, including the trailing ':'
+ * \param[in] min_val  smallest accepted value
+ * \param[in] max_val  largest accepted value
+ * \param[in] svc      service whose policy instances are updated
+ * \param[in] pol_name policy name passed to ptlrpc_nrs_policy_control()
+ * \param[in] opc      ctl opcode passed to ptlrpc_nrs_policy_control()
+ * \param[in] single   passed through to ptlrpc_nrs_policy_control()
+ *
+ * \retval count the value(s) were accepted
+ * \retval -ve   error
+ */
+static ssize_t
+lprocfs_nrs_delay_seq_write_common(const char __user *buffer,
+                                  unsigned int bufsize, size_t count,
+                                  const char *var_name, unsigned int min_val,
+                                  unsigned int max_val,
+                                  struct ptlrpc_service *svc, char *pol_name,
+                                  enum ptlrpc_nrs_ctl opc, bool single)
+{
+       enum ptlrpc_nrs_queue_type queue = 0;
+       char *kernbuf;
+       char *val_str;
+       long unsigned int val_reg;
+       long unsigned int val_hp;
+       __u32 ctl_val;
+       size_t count_copy;
+       int rc = 0;
+       char *tmp = NULL;
+       int tmpsize = 0;
+
+       /* Leave room for at least one byte after the copied data. */
+       if (count > bufsize - 1)
+               return -EINVAL;
+
+       OBD_ALLOC(kernbuf, bufsize);
+       if (kernbuf == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(kernbuf, buffer, count))
+               GOTO(free_kernbuf, rc = -EFAULT);
+
+       /* "reg_" is the longer of the two prefixes, so size tmp for it. */
+       tmpsize = strlen("reg_") + strlen(var_name) + 1;
+       OBD_ALLOC(tmp, tmpsize);
+       if (tmp == NULL)
+               /*
+                * tmp was never allocated, so skip the free_tmp label
+                * rather than handing a NULL pointer to OBD_FREE().
+                */
+               GOTO(free_kernbuf, rc = -ENOMEM);
+
+       /*
+        * NOTE(review): kstrtoul() parses the whole string it is given.
+        * If lprocfs_find_named_value() does not NUL-terminate the value
+        * it returns, input naming both heads at once would presumably be
+        * rejected below -- confirm against its implementation.
+        */
+
+       /* look for "reg_<var_name>" in kernbuf */
+       snprintf(tmp, tmpsize, "reg_%s", var_name);
+       count_copy = count;
+       val_str = lprocfs_find_named_value(kernbuf, tmp, &count_copy);
+       if (val_str != kernbuf) {
+               rc = kstrtoul(val_str, 10, &val_reg);
+               if (rc != 0)
+                       GOTO(free_tmp, rc = -EINVAL);
+               queue |= PTLRPC_NRS_QUEUE_REG;
+       }
+
+       /* look for "hp_<var_name>" in kernbuf */
+       snprintf(tmp, tmpsize, "hp_%s", var_name);
+       count_copy = count;
+       val_str = lprocfs_find_named_value(kernbuf, tmp, &count_copy);
+       if (val_str != kernbuf) {
+               if (!nrs_svc_has_hp(svc))
+                       GOTO(free_tmp, rc = -ENODEV);
+
+               rc = kstrtoul(val_str, 10, &val_hp);
+               if (rc != 0)
+                       GOTO(free_tmp, rc = -EINVAL);
+               queue |= PTLRPC_NRS_QUEUE_HP;
+       }
+
+       /* Neither head was named: expect a bare number for all heads. */
+       if (queue == 0) {
+               if (!isdigit(kernbuf[0]))
+                       GOTO(free_tmp, rc = -EINVAL);
+
+               rc = kstrtoul(kernbuf, 10, &val_reg);
+               if (rc != 0)
+                       GOTO(free_tmp, rc = -EINVAL);
+
+               queue = PTLRPC_NRS_QUEUE_REG;
+
+               if (nrs_svc_has_hp(svc)) {
+                       queue |= PTLRPC_NRS_QUEUE_HP;
+                       val_hp = val_reg;
+               }
+       }
+
+       if (queue & PTLRPC_NRS_QUEUE_REG) {
+               if (val_reg > max_val || val_reg < min_val)
+                       GOTO(free_tmp, rc = -EINVAL);
+
+               /*
+                * nrs_delay_ctl() dereferences its argument as a __u32, so
+                * pass a __u32 rather than the address of an unsigned long,
+                * whose first four bytes are not the value on 64-bit
+                * big-endian hosts. The range check above guarantees the
+                * value fits.
+                */
+               ctl_val = val_reg;
+               rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+                                              pol_name, opc, single, &ctl_val);
+               /* -ENODEV is tolerated only if the HP head is targeted too */
+               if ((rc < 0 && rc != -ENODEV) ||
+                   (rc == -ENODEV && queue == PTLRPC_NRS_QUEUE_REG))
+                       GOTO(free_tmp, rc);
+       }
+
+       if (queue & PTLRPC_NRS_QUEUE_HP) {
+               int rc2 = 0;
+               if (val_hp > max_val || val_hp < min_val)
+                       GOTO(free_tmp, rc = -EINVAL);
+
+               /* Same width conversion as for the regular head. */
+               ctl_val = val_hp;
+               rc2 = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+                                               pol_name, opc, single, &ctl_val);
+               /* -ENODEV is tolerated only if the REG head is targeted too */
+               if ((rc2 < 0 && rc2 != -ENODEV) ||
+                   (rc2 == -ENODEV && queue == PTLRPC_NRS_QUEUE_HP))
+                       GOTO(free_tmp, rc = rc2);
+       }
+
+       /* If we've reached here then we want to return count */
+       rc = count;
+
+free_tmp:
+       OBD_FREE(tmp, tmpsize);
+free_kernbuf:
+       OBD_FREE(kernbuf, bufsize);
+
+       return rc;
+}
+
+/**
+ * Retrieves the value of the minimum delay for delay policy instances on both
+ * the regular and high-priority NRS head of a service, as long as a policy
+ * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state;
+ */
+static int
+ptlrpc_lprocfs_nrs_delay_min_seq_show(struct seq_file *m, void *data)
+{
+       struct ptlrpc_service *svc = m->private;
+       unsigned int min_delay;
+       int rc;
+
+       rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+                                      NRS_POL_NAME_DELAY,
+                                      NRS_CTL_DELAY_RD_MIN,
+                                      true, &min_delay);
+
+       /**
+        * Ignore -ENODEV as the regular NRS head's policy may be in
+        * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+        */
+       if (rc == 0)
+               /* min_delay is unsigned, so print it with %u */
+               seq_printf(m, LPROCFS_NRS_DELAY_MIN_NAME_REG"%-5u\n",
+                          min_delay);
+       else if (rc != -ENODEV)
+               return rc;
+
+       /* Services without a high-priority head have nothing more to show. */
+       if (!nrs_svc_has_hp(svc))
+               return 0;
+
+       rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+                                      NRS_POL_NAME_DELAY,
+                                      NRS_CTL_DELAY_RD_MIN,
+                                      true, &min_delay);
+
+       /**
+        * Ignore -ENODEV as the high-priority NRS head's policy may be in
+        * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+        */
+       if (rc == 0)
+               seq_printf(m, LPROCFS_NRS_DELAY_MIN_NAME_HP"%-5u\n",
+                          min_delay);
+       else if (rc == -ENODEV)
+               rc = 0;
+
+       return rc;
+}
+
+/**
+ * Sets the value of the minimum request delay for delay policy instances of a
+ * service. The user can set the minimum request delay for the regular or high
+ * priority NRS head individually by specifying each value, or both together in
+ * a single invocation.
+ *
+ * For example:
+ *
+ * lctl set_param *.*.*.nrs_delay_min=reg_delay_min:5, to set the regular
+ * request minimum delay on all PtlRPC services to 5 seconds
+ *
+ * lctl set_param *.*.*.nrs_delay_min=hp_delay_min:2, to set the high-priority
+ * request minimum delay on all PtlRPC services to 2 seconds, and
+ *
+ * lctl set_param *.*.ost_io.nrs_delay_min=8, to set both the regular and
+ * high priority request minimum delay of the ost_io service to 8 seconds.
+ */
+static ssize_t
+ptlrpc_lprocfs_nrs_delay_min_seq_write(struct file *file,
+                                      const char __user *buffer, size_t count,
+                                      loff_t *off)
+{
+       struct ptlrpc_service *svc;
+
+       /* The service is the private pointer of the file's seq_file. */
+       svc = ((struct seq_file *)file->private_data)->private;
+
+       /* Parse, range-check and apply the new minimum delay. */
+       return lprocfs_nrs_delay_seq_write_common(buffer,
+                                                 LPROCFS_NRS_DELAY_MIN_SIZE,
+                                                 count,
+                                                 LPROCFS_NRS_DELAY_MIN_NAME,
+                                                 LPROCFS_NRS_DELAY_LOWER_BOUND,
+                                                 LPROCFS_NRS_DELAY_UPPER_BOUND,
+                                                 svc, NRS_POL_NAME_DELAY,
+                                                 NRS_CTL_DELAY_WR_MIN, false);
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_min);
+
+/**
+ * Retrieves the value of the maximum delay for delay policy instances on both
+ * the regular and high-priority NRS head of a service, as long as a policy
+ * instance is not in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state;
+ */
+static int
+ptlrpc_lprocfs_nrs_delay_max_seq_show(struct seq_file *m, void *data)
+{
+       struct ptlrpc_service *svc = m->private;
+       unsigned int max_delay;
+       int rc;
+
+       rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+                                      NRS_POL_NAME_DELAY,
+                                      NRS_CTL_DELAY_RD_MAX,
+                                      true, &max_delay);
+
+       if (rc == 0)
+               seq_printf(m, LPROCFS_NRS_DELAY_MAX_NAME_REG"%-5d\n",
+                          max_delay);
+               /**
+                * Ignore -ENODEV as the regular NRS head's policy may be in
+                * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+                */
+       else if (rc != -ENODEV)
+               return rc;
+
+       if (!nrs_svc_has_hp(svc))
+               return 0;
+
+       rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+                                      NRS_POL_NAME_DELAY,
+                                      NRS_CTL_DELAY_RD_MAX,
+                                      true, &max_delay);
+       if (rc == 0)
+               seq_printf(m, LPROCFS_NRS_DELAY_MAX_NAME_HP"%-5d\n",
+                          max_delay);
+               /**
+                * Ignore -ENODEV as the regular NRS head's policy may be in
+                * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+                */
+       else if (rc == -ENODEV)
+               rc = 0;
+
+       return rc;
+}
+
+/**
+ * Sets the value of the maximum request delay for delay policy instances of a
+ * service. The user can set the maximum request delay for the regular or high
+ * priority NRS head individually by specifying each value, or both together in
+ * a single invocation.
+ *
+ * For example:
+ *
+ * lctl set_param *.*.*.nrs_delay_max=reg_delay_max:20, to set the regular
+ * request maximum delay on all PtlRPC services to 20 seconds
+ *
+ * lctl set_param *.*.*.nrs_delay_max=hp_delay_max:10, to set the high-priority
+ * request maximum delay on all PtlRPC services to 10 seconds, and
+ *
+ * lctl set_param *.*.ost_io.nrs_delay_max=35, to set both the regular and
+ * high priority request maximum delay of the ost_io service to 35 seconds.
+ */
+static ssize_t
+ptlrpc_lprocfs_nrs_delay_max_seq_write(struct file *file,
+                                      const char __user *buffer, size_t count,
+                                      loff_t *off)
+{
+       struct ptlrpc_service *svc;
+
+       /* The service is the private pointer of the file's seq_file. */
+       svc = ((struct seq_file *)file->private_data)->private;
+
+       /* Parse, range-check and apply the new maximum delay. */
+       return lprocfs_nrs_delay_seq_write_common(buffer,
+                                                 LPROCFS_NRS_DELAY_MAX_SIZE,
+                                                 count,
+                                                 LPROCFS_NRS_DELAY_MAX_NAME,
+                                                 LPROCFS_NRS_DELAY_LOWER_BOUND,
+                                                 LPROCFS_NRS_DELAY_UPPER_BOUND,
+                                                 svc, NRS_POL_NAME_DELAY,
+                                                 NRS_CTL_DELAY_WR_MAX, false);
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_max);
+
+/**
+ * Retrieves the value of the percentage of requests which should be delayed
+ * for delay policy instances on both the regular and high-priority NRS head
+ * of a service, as long as a policy instance is not in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+ *
+ * One line is printed per NRS head that answered the query; a head that
+ * returns -ENODEV is silently skipped.
+ *
+ * \param[in] m    seq_file to print to; m->private is the ptlrpc_service
+ * \param[in] data unused
+ *
+ * \retval 0      success (including heads skipped because of -ENODEV)
+ * \retval -ve    any other error returned by ptlrpc_nrs_policy_control()
+ */
+static int
+ptlrpc_lprocfs_nrs_delay_pct_seq_show(struct seq_file *m, void *data)
+{
+       struct ptlrpc_service *svc = m->private;
+       unsigned int delay_pct;
+       int rc;
+
+       rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+                                      NRS_POL_NAME_DELAY,
+                                      NRS_CTL_DELAY_RD_PCT,
+                                      true, &delay_pct);
+
+       /**
+        * Ignore -ENODEV as the regular NRS head's policy may be in
+        * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+        */
+       if (rc == 0)
+               /* delay_pct is unsigned, so print it with an unsigned format */
+               seq_printf(m, LPROCFS_NRS_DELAY_PCT_NAME_REG"%-3u\n",
+                          delay_pct);
+       else if (rc != -ENODEV)
+               return rc;
+
+       /* Services without a high-priority head have nothing more to show */
+       if (!nrs_svc_has_hp(svc))
+               return 0;
+
+       rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+                                      NRS_POL_NAME_DELAY,
+                                      NRS_CTL_DELAY_RD_PCT,
+                                      true, &delay_pct);
+
+       /**
+        * Ignore -ENODEV as the high-priority NRS head's policy may be in
+        * the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+        */
+       if (rc == 0)
+               seq_printf(m, LPROCFS_NRS_DELAY_PCT_NAME_HP"%-3u\n",
+                          delay_pct);
+       else if (rc == -ENODEV)
+               rc = 0;
+
+       return rc;
+}
+
+/**
+ * procfs write handler for the "nrs_delay_pct" tunable: updates the
+ * percentage of requests that the delay policy instances of a service will
+ * delay. Either NRS head may be addressed on its own, or both may be set
+ * with a single value.
+ *
+ * Usage examples:
+ *
+ * lctl set_param *.*.*.nrs_delay_pct=reg_delay_pct:5
+ *     delays 5 percent of regular requests on all PtlRPC services;
+ *
+ * lctl set_param *.*.*.nrs_delay_pct=hp_delay_pct:2
+ *     delays 2 percent of high-priority requests on all PtlRPC services;
+ *
+ * lctl set_param *.*.ost_io.nrs_delay_pct=8
+ *     delays 8 percent of both regular and high-priority requests of the
+ *     ost_io service.
+ *
+ * All of the work is delegated to lprocfs_nrs_delay_seq_write_common(); this
+ * wrapper only supplies the tunable-specific size, name, bounds and control
+ * opcode. The \a off argument is not used here.
+ */
+static ssize_t
+ptlrpc_lprocfs_nrs_delay_pct_seq_write(struct file *file,
+                                      const char __user *buffer, size_t count,
+                                      loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct ptlrpc_service *service = seq->private;
+
+       return lprocfs_nrs_delay_seq_write_common(buffer,
+                                                 LPROCFS_NRS_DELAY_PCT_SIZE,
+                                                 count,
+                                                 LPROCFS_NRS_DELAY_PCT_NAME,
+                                                 LPROCFS_NRS_DELAY_PCT_MIN_VAL,
+                                                 LPROCFS_NRS_DELAY_PCT_MAX_VAL,
+                                                 service, NRS_POL_NAME_DELAY,
+                                                 NRS_CTL_DELAY_WR_PCT, false);
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs_delay_pct);
+
+/**
+ * Adds the delay policy's tunables (nrs_delay_min, nrs_delay_max and
+ * nrs_delay_pct) to the procfs directory of \a svc.
+ *
+ * \param[in] svc the service; also stored as the private data of each entry
+ *
+ * \retval 0 when the service has no procfs root, otherwise the return value
+ *          of lprocfs_add_vars()
+ */
+static int nrs_delay_lprocfs_init(struct ptlrpc_service *svc)
+{
+       struct lprocfs_vars delay_vars[] = {
+               {
+                       .name   = "nrs_delay_min",
+                       .fops   = &ptlrpc_lprocfs_nrs_delay_min_fops,
+                       .data   = svc,
+               },
+               {
+                       .name   = "nrs_delay_max",
+                       .fops   = &ptlrpc_lprocfs_nrs_delay_max_fops,
+                       .data   = svc,
+               },
+               {
+                       .name   = "nrs_delay_pct",
+                       .fops   = &ptlrpc_lprocfs_nrs_delay_pct_fops,
+                       .data   = svc,
+               },
+               { NULL }
+       };
+
+       /* Nothing to register against when the service has no procfs root */
+       if (!svc->srv_procroot)
+               return 0;
+
+       return lprocfs_add_vars(svc->srv_procroot, delay_vars, NULL);
+}
+
+/**
+ * Removes the delay policy's procfs entries (nrs_delay_min, nrs_delay_max
+ * and nrs_delay_pct) from the procfs directory of \a svc. Does nothing when
+ * the service has no procfs root.
+ *
+ * \param[in] svc the service whose entries are removed
+ */
+static void nrs_delay_lprocfs_fini(struct ptlrpc_service *svc)
+{
+       if (svc->srv_procroot != NULL) {
+               lprocfs_remove_proc_entry("nrs_delay_min", svc->srv_procroot);
+               lprocfs_remove_proc_entry("nrs_delay_max", svc->srv_procroot);
+               lprocfs_remove_proc_entry("nrs_delay_pct", svc->srv_procroot);
+       }
+}
+
+#endif /* CONFIG_PROC_FS */
+
+/**
+ * Delay policy operations
+ *
+ * Table of the delay policy's handlers, one per ptlrpc_nrs_pol_ops slot:
+ * policy start/stop/control, resource lookup, and request get, enqueue,
+ * dequeue and stop. The procfs init/fini handlers are only filled in when
+ * the kernel is built with CONFIG_PROC_FS; otherwise those slots are left
+ * unset.
+ */
+static const struct ptlrpc_nrs_pol_ops nrs_delay_ops = {
+       .op_policy_start        = nrs_delay_start,
+       .op_policy_stop         = nrs_delay_stop,
+       .op_policy_ctl          = nrs_delay_ctl,
+       .op_res_get             = nrs_delay_res_get,
+       .op_req_get             = nrs_delay_req_get,
+       .op_req_enqueue         = nrs_delay_req_add,
+       .op_req_dequeue         = nrs_delay_req_del,
+       .op_req_stop            = nrs_delay_req_stop,
+#ifdef CONFIG_PROC_FS
+       .op_lprocfs_init        = nrs_delay_lprocfs_init,
+       .op_lprocfs_fini        = nrs_delay_lprocfs_fini,
+#endif
+};
+
+/**
+ * Delay policy configuration
+ *
+ * Ties the policy name (NRS_POL_NAME_DELAY) to its operations table
+ * (nrs_delay_ops). The object is not static, so it can be referenced from
+ * other files.
+ *
+ * NOTE(review): nrs_policy_compat_all presumably makes the policy available
+ * on every service - confirm against its definition.
+ */
+struct ptlrpc_nrs_pol_conf nrs_conf_delay = {
+       .nc_name                = NRS_POL_NAME_DELAY,
+       .nc_ops                 = &nrs_delay_ops,
+       .nc_compat              = nrs_policy_compat_all,
+};
+
+/** @} delay */
+
+/** @} nrs */
index 5da292f..a37be57 100644 (file)
@@ -51,6 +51,7 @@ extern struct ptlrpc_nrs_pol_conf nrs_conf_crrn;
 extern struct ptlrpc_nrs_pol_conf nrs_conf_orr;
 extern struct ptlrpc_nrs_pol_conf nrs_conf_trr;
 extern struct ptlrpc_nrs_pol_conf nrs_conf_tbf;
+extern struct ptlrpc_nrs_pol_conf nrs_conf_delay;
 #endif /* HAVE_SERVER_SUPPORT */
 
 /**
old mode 100644 (file)
new mode 100755 (executable)
index e607822..58fbd5a
@@ -3484,6 +3484,58 @@ test_77k() {
 }
 run_test 77k "check the extended TBF policy with NID/JobID/OPCode expression"
 
+test_77l() {
+       if [ $(lustre_version_code ost1) -lt $(version_code 2.9.54) ]; then
+               skip "Need OST version at least 2.9.54"
+               return 0
+       fi
+
+       local dir=$DIR/$tdir
+
+       mkdir $dir || error "mkdir $dir failed"
+       $LFS setstripe -c $OSTCOUNT $dir || error "setstripe to $dir failed"
+       chmod 777 $dir
+
+       local nodes=$(comma_list $(osts_nodes))
+       do_nodes $nodes lctl set_param ost.OSS.ost_io.nrs_policies=delay \
+                                      ost.OSS.ost_io.nrs_delay_min=4 \
+                                      ost.OSS.ost_io.nrs_delay_max=4 \
+                                      ost.OSS.ost_io.nrs_delay_pct=100
+       [ $? -ne 0 ] && error "Failed to set delay policy"
+
+       local start=$SECONDS
+       do_nodes "${SINGLECLIENT:-$HOSTNAME}" "$RUNAS" \
+                dd if=/dev/zero of="$dir/nrs_delay_$HOSTNAME" bs=1M count=1 \
+                  oflag=direct conv=fdatasync ||
+               { do_nodes $nodes lctl set_param ost.OSS.ost_io.nrs_policies="fifo";
+                 error "dd on client failed (1)"; }
+       local elapsed=$((SECONDS - start))
+
+       # NRS delay doesn't do sub-second timing, so a request enqueued at
+       # 0.9 seconds can be dequeued at 4.0
+       [ $elapsed -lt 3 ] &&
+               { do_nodes $nodes lctl set_param ost.OSS.ost_io.nrs_policies="fifo";
+                 error "Single 1M write should take at least 3 seconds"; }
+
+       start=$SECONDS
+       do_nodes "${SINGLECLIENT:-$HOSTNAME}" "$RUNAS" \
+                dd if=/dev/zero of="$dir/nrs_delay_$HOSTNAME" bs=1M count=10 \
+                  oflag=direct conv=fdatasync ||
+               { do_nodes $nodes lctl set_param ost.OSS.ost_io.nrs_policies="fifo";
+                 error "dd on client failed (2)"; }
+       elapsed=$((SECONDS - start))
+
+       [ $elapsed -lt 30 ] &&
+               { do_nodes $nodes lctl set_param ost.OSS.ost_io.nrs_policies="fifo";
+                 error "Ten 1M writes should take at least 30 seconds"; }
+
+       do_nodes $nodes lctl set_param ost.OSS.ost_io.nrs_policies="fifo"
+       [ $? -ne 0 ] && error "failed to set policy back to fifo"
+
+       return 0
+}
+run_test 77l "check NRS Delay slows write RPC processing"
+
 test_78() { #LU-6673
        local server_version=$(lustre_version_code ost1)
        [[ $server_version -ge $(version_code 2.7.58) ]] ||