Whamcloud - gitweb
LU-398 ptlrpc: Add the NRS CRR-N policy
author Nikitas Angelinas <nikitas_angelinas@xyratex.com>
Wed, 9 Jan 2013 02:02:12 +0000 (02:02 +0000)
committer Oleg Drokin <oleg.drokin@intel.com>
Tue, 9 Apr 2013 05:19:30 +0000 (01:19 -0400)
The CRR-N (Client-based Round Robin over NIDs) policy batch-schedules
all types of RPCs in a Round Robin manner according to the NID of the
client that generated the RPC; the maximum size of the batches is
configurable via interaction with lprocfs. The policy aims to provide
for better resource utilization across the cluster, and to help
shorten completion times of jobs in some cases by distributing
available bandwidth more evenly across client nodes.

Signed-off-by: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
Co-authored-by: Liang Zhen <liang@whamcloud.com>
Change-Id: Ie91ee277fc95564908b20fd0d539a274089657ed
Oracle-bug-id: b=13634
Xyratex-bug-id: MRP-73
Reviewed-on: http://review.whamcloud.com/4937
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Liang Zhen <liang.zhen@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/lustre_net.h
lustre/ptlrpc/Makefile.in
lustre/ptlrpc/autoMakefile.am
lustre/ptlrpc/nrs.c
lustre/ptlrpc/nrs_crr.c [new file with mode: 0644]
lustre/ptlrpc/ptlrpc_internal.h

index 9746c27..22deb5e 100644 (file)
@@ -1413,6 +1413,98 @@ struct nrs_fifo_req {
 /** @} fifo */
 
 /**
+ * \name CRR-N
+ *
+ * CRR-N, Client Round Robin over NIDs
+ * @{
+ */
+
+/**
+ * private data structure for CRR-N NRS
+ */
+struct nrs_crrn_net {
+       struct ptlrpc_nrs_resource      cn_res;
+       cfs_binheap_t                  *cn_binheap;
+       cfs_hash_t                     *cn_cli_hash;
+       /**
+        * Used when a new scheduling round commences, in order to synchronize
+        * all clients with the new round number.
+        */
+       __u64                           cn_round;
+       /**
+        * Determines the relevant ordering amongst request batches within a
+        * scheduling round.
+        */
+       __u64                           cn_sequence;
+       /**
+        * Round Robin quantum; the maximum number of RPCs that each request
+        * batch for each client can have in a scheduling round.
+        */
+       __u16                           cn_quantum;
+};
+
+/**
+ * Object representing a client in CRR-N, as identified by its NID
+ */
+struct nrs_crrn_client {
+       struct ptlrpc_nrs_resource      cc_res;
+       cfs_hlist_node_t                cc_hnode;
+       lnet_nid_t                      cc_nid;
+       /**
+        * The round number against which this client is currently scheduling
+        * requests.
+        */
+       __u64                           cc_round;
+       /**
+        * The sequence number used for requests scheduled by this client during
+        * the current round number.
+        */
+       __u64                           cc_sequence;
+       cfs_atomic_t                    cc_ref;
+       /**
+        * Round Robin quantum; the maximum number of RPCs the client is allowed
+        * to schedule in a single batch of each round.
+        */
+       __u16                           cc_quantum;
+       /**
+        * # of pending requests for this client, on all existing rounds
+        */
+       __u16                           cc_active;
+};
+
+/**
+ * CRR-N NRS request definition
+ */
+struct nrs_crrn_req {
+       /**
+        * Round number for this request; shared with all other requests in the
+        * same batch.
+        */
+       __u64                   cr_round;
+       /**
+        * Sequence number for this request; shared with all other requests in
+        * the same batch.
+        */
+       __u64                   cr_sequence;
+};
+
+/**
+ * CRR-N policy operations.
+ */
+enum nrs_ctl_crr {
+       /**
+        * Read the RR quantum size of a CRR-N policy.
+        */
+       NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+       /**
+        * Write the RR quantum size of a CRR-N policy.
+        */
+       NRS_CTL_CRRN_WR_QUANTUM,
+};
+
+/** @} CRR-N */
+
+/**
  * NRS request
  *
  * Instances of this object exist embedded within ptlrpc_request; the main
@@ -1447,6 +1539,10 @@ struct ptlrpc_nrs_request {
                 * Fields for the FIFO policy
                 */
                struct nrs_fifo_req     fifo;
+               /**
+                * CRR-N request definition
+                */
+               struct nrs_crrn_req     crr;
        } nr_u;
        /**
         * Externally-registering policies may want to use this to allocate
index fbfaa85..9cf66f2 100644 (file)
@@ -14,7 +14,7 @@ ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o
 ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o
 ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o
 ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o
-ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o
+ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o nrs_crr.o
 
 target_objs := $(TARGET)tgt_main.o $(TARGET)tgt_lastrcvd.o
 
index 77ad215..d1f2a83 100644 (file)
@@ -94,6 +94,7 @@ ptlrpc_SOURCES =      \
        service.c       \
        nrs.c           \
        nrs_fifo.c      \
+       nrs_crr.c       \
        wiretest.c      \
        sec.c           \
        sec_bulk.c      \
index a4c72c3..f4700f8 100644 (file)
@@ -1741,6 +1741,10 @@ out:
 
 /* ptlrpc/nrs_fifo.c */
 extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo;
+#if defined HAVE_SERVER_SUPPORT && defined(__KERNEL__)
+/* ptlrpc/nrs_crr.c */
+extern struct ptlrpc_nrs_pol_conf nrs_conf_crrn;
+#endif
 
 /**
  * Adds all policies that ship with the ptlrpc module, to NRS core's list of
@@ -1761,6 +1765,12 @@ int ptlrpc_nrs_init(void)
        if (rc != 0)
                GOTO(fail, rc);
 
+#if defined HAVE_SERVER_SUPPORT && defined(__KERNEL__)
+       rc = ptlrpc_nrs_policy_register(&nrs_conf_crrn);
+       if (rc != 0)
+               GOTO(fail, rc);
+#endif
+
        RETURN(rc);
 fail:
        /**
diff --git a/lustre/ptlrpc/nrs_crr.c b/lustre/ptlrpc/nrs_crr.c
new file mode 100644 (file)
index 0000000..0335bf9
--- /dev/null
@@ -0,0 +1,894 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_crr.c
+ *
+ * Network Request Scheduler (NRS) CRR-N policy
+ *
+ * Request ordering in a batched Round-Robin manner over client NIDs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+#ifdef HAVE_SERVER_SUPPORT
+
+#define DEBUG_SUBSYSTEM S_RPC
+#ifndef __KERNEL__
+#include <liblustre.h>
+#endif
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name CRR-N policy
+ *
+ * Client Round-Robin scheduling over client NIDs
+ *
+ * @{
+ *
+ */
+
+#define NRS_POL_NAME_CRRN      "crrn"
+
+/**
+ * Binary heap predicate.
+ *
+ * Uses ptlrpc_nrs_request::nr_u::crr::cr_round and
+ * ptlrpc_nrs_request::nr_u::crr::cr_sequence to compare two binheap nodes and
+ * produce a binary predicate that shows their relative priority, so that the
+ * binary heap can perform the necessary sorting operations.
+ *
+ * \param[in] e1 the first binheap node to compare
+ * \param[in] e2 the second binheap node to compare
+ *
+ * \retval 0 e1 >= e2 (higher round, or same round and sequence not lower)
+ * \retval 1 e1 < e2 (lower round, or same round and lower sequence)
+ */
+static int crrn_req_compare(cfs_binheap_node_t *e1, cfs_binheap_node_t *e2)
+{
+       struct ptlrpc_nrs_request *nrq1;
+       struct ptlrpc_nrs_request *nrq2;
+
+       nrq1 = container_of(e1, struct ptlrpc_nrs_request, nr_node);
+       nrq2 = container_of(e2, struct ptlrpc_nrs_request, nr_node);
+
+       if (nrq1->nr_u.crr.cr_round < nrq2->nr_u.crr.cr_round)
+               return 1;
+       else if (nrq1->nr_u.crr.cr_round > nrq2->nr_u.crr.cr_round)
+               return 0;
+
+       return nrq1->nr_u.crr.cr_sequence < nrq2->nr_u.crr.cr_sequence;
+}
+
+static cfs_binheap_ops_t nrs_crrn_heap_ops = {
+       .hop_enter      = NULL,
+       .hop_exit       = NULL,
+       .hop_compare    = crrn_req_compare,
+};
+
+/**
+ * libcfs_hash operations for nrs_crrn_net::cn_cli_hash
+ *
+ * This uses ptlrpc_request::rq_peer.nid as its key, in order to hash
+ * nrs_crrn_client objects.
+ */
+#define NRS_NID_BKT_BITS       8
+#define NRS_NID_BITS           16
+
+static unsigned nrs_crrn_hop_hash(cfs_hash_t *hs, const void *key,
+                                 unsigned mask)
+{
+       return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask);
+}
+
+static int nrs_crrn_hop_keycmp(const void *key, cfs_hlist_node_t *hnode)
+{
+       lnet_nid_t              *nid = (lnet_nid_t *)key;
+       struct nrs_crrn_client  *cli = cfs_hlist_entry(hnode,
+                                                      struct nrs_crrn_client,
+                                                      cc_hnode);
+       return *nid == cli->cc_nid;
+}
+
+static void *nrs_crrn_hop_key(cfs_hlist_node_t *hnode)
+{
+       struct nrs_crrn_client  *cli = cfs_hlist_entry(hnode,
+                                                      struct nrs_crrn_client,
+                                                      cc_hnode);
+       return &cli->cc_nid;
+}
+
+static void *nrs_crrn_hop_object(cfs_hlist_node_t *hnode)
+{
+       return cfs_hlist_entry(hnode, struct nrs_crrn_client, cc_hnode);
+}
+
+static void nrs_crrn_hop_get(cfs_hash_t *hs, cfs_hlist_node_t *hnode)
+{
+       struct nrs_crrn_client *cli = cfs_hlist_entry(hnode,
+                                                     struct nrs_crrn_client,
+                                                     cc_hnode);
+       cfs_atomic_inc(&cli->cc_ref);
+}
+
+static void nrs_crrn_hop_put(cfs_hash_t *hs, cfs_hlist_node_t *hnode)
+{
+       struct nrs_crrn_client  *cli = cfs_hlist_entry(hnode,
+                                                      struct nrs_crrn_client,
+                                                      cc_hnode);
+       cfs_atomic_dec(&cli->cc_ref);
+}
+
+static void nrs_crrn_hop_exit(cfs_hash_t *hs, cfs_hlist_node_t *hnode)
+{
+       struct nrs_crrn_client  *cli = cfs_hlist_entry(hnode,
+                                                      struct nrs_crrn_client,
+                                                      cc_hnode);
+       LASSERTF(cfs_atomic_read(&cli->cc_ref) == 0,
+                "Busy CRR-N object from client with NID %s, with %d refs\n",
+                libcfs_nid2str(cli->cc_nid), cfs_atomic_read(&cli->cc_ref));
+
+       OBD_FREE_PTR(cli);
+}
+
+static cfs_hash_ops_t nrs_crrn_hash_ops = {
+       .hs_hash        = nrs_crrn_hop_hash,
+       .hs_keycmp      = nrs_crrn_hop_keycmp,
+       .hs_key         = nrs_crrn_hop_key,
+       .hs_object      = nrs_crrn_hop_object,
+       .hs_get         = nrs_crrn_hop_get,
+       .hs_put         = nrs_crrn_hop_put,
+       .hs_put_locked  = nrs_crrn_hop_put,
+       .hs_exit        = nrs_crrn_hop_exit,
+};
+
+/**
+ * Called when a CRR-N policy instance is started.
+ *
+ * \param[in] policy the policy
+ *
+ * \retval -ENOMEM OOM error
+ * \retval 0      success
+ */
+static int nrs_crrn_start(struct ptlrpc_nrs_policy *policy)
+{
+       struct nrs_crrn_net    *net;
+       int                     rc = 0;
+       ENTRY;
+
+       OBD_CPT_ALLOC_PTR(net, nrs_pol2cptab(policy), nrs_pol2cptid(policy));
+       if (net == NULL)
+               RETURN(-ENOMEM);
+
+       net->cn_binheap = cfs_binheap_create(&nrs_crrn_heap_ops,
+                                            CBH_FLAG_ATOMIC_GROW, 4096, NULL,
+                                            nrs_pol2cptab(policy),
+                                            nrs_pol2cptid(policy));
+       if (net->cn_binheap == NULL)
+               GOTO(failed, rc = -ENOMEM);
+
+       net->cn_cli_hash = cfs_hash_create("nrs_crrn_nid_hash",
+                                          NRS_NID_BITS, NRS_NID_BITS,
+                                          NRS_NID_BKT_BITS, 0,
+                                          CFS_HASH_MIN_THETA,
+                                          CFS_HASH_MAX_THETA,
+                                          &nrs_crrn_hash_ops,
+                                          CFS_HASH_RW_BKTLOCK);
+       if (net->cn_cli_hash == NULL)
+               GOTO(failed, rc = -ENOMEM);
+
+       /**
+        * Set default quantum value to max_rpcs_in_flight for non-MDS OSCs;
+        * there may be more RPCs pending from each struct nrs_crrn_client even
+        * with the default max_rpcs_in_flight value, as we are scheduling over
+        * NIDs, and there may be more than one mount point per client.
+        */
+       net->cn_quantum = OSC_MAX_RIF_DEFAULT;
+       /**
+        * Set to 1 so that the test inside nrs_crrn_req_add() can evaluate to
+        * true.
+        */
+       net->cn_sequence = 1;
+
+       policy->pol_private = net;
+
+       RETURN(rc);
+
+failed:
+       if (net->cn_binheap != NULL)
+               cfs_binheap_destroy(net->cn_binheap);
+
+       OBD_FREE_PTR(net);
+
+       RETURN(rc);
+}
+
+/**
+ * Called when a CRR-N policy instance is stopped.
+ *
+ * Called when the policy has been instructed to transition to the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state and has no more pending
+ * requests to serve.
+ *
+ * \param[in] policy the policy
+ */
+static void nrs_crrn_stop(struct ptlrpc_nrs_policy *policy)
+{
+       struct nrs_crrn_net     *net = policy->pol_private;
+       ENTRY;
+
+       LASSERT(net != NULL);
+       LASSERT(net->cn_binheap != NULL);
+       LASSERT(net->cn_cli_hash != NULL);
+       LASSERT(cfs_binheap_is_empty(net->cn_binheap));
+
+       cfs_binheap_destroy(net->cn_binheap);
+       cfs_hash_putref(net->cn_cli_hash);
+
+       OBD_FREE_PTR(net);
+}
+
+/**
+ * Performs a policy-specific ctl function on CRR-N policy instances; similar
+ * to ioctl.
+ *
+ * \param[in]    policy the policy instance
+ * \param[in]    opc    the opcode
+ * \param[in,out] arg   used for passing parameters and information
+ *
+ * \pre spin_is_locked(&policy->pol_nrs->nrs_lock)
+ * \post spin_is_locked(&policy->pol_nrs->nrs_lock)
+ *
+ * \retval 0   operation carried out successfully
+ * \retval -ve error
+ */
+int nrs_crrn_ctl(struct ptlrpc_nrs_policy *policy, enum ptlrpc_nrs_ctl opc,
+                void *arg)
+{
+       LASSERT(spin_is_locked(&policy->pol_nrs->nrs_lock));
+
+       switch(opc) {
+       default:
+               RETURN(-EINVAL);
+
+       /**
+        * Read Round Robin quantum size of a policy instance.
+        */
+       case NRS_CTL_CRRN_RD_QUANTUM: {
+               struct nrs_crrn_net     *net = policy->pol_private;
+
+               *(__u16 *)arg = net->cn_quantum;
+               }
+               break;
+
+       /**
+        * Write Round Robin quantum size of a policy instance.
+        */
+       case NRS_CTL_CRRN_WR_QUANTUM: {
+               struct nrs_crrn_net     *net = policy->pol_private;
+
+               net->cn_quantum = *(__u16 *)arg;
+               LASSERT(net->cn_quantum != 0);
+               }
+               break;
+       }
+
+       RETURN(0);
+}
+
+/**
+ * Obtains resources from CRR-N policy instances. The top-level resource lives
+ * inside \e nrs_crrn_net and the second-level resource inside
+ * \e nrs_crrn_client object instances.
+ *
+ * \param[in]  policy    the policy for which resources are being taken for
+ *                       request \a nrq
+ * \param[in]  nrq       the request for which resources are being taken
+ * \param[in]  parent    parent resource, embedded in nrs_crrn_net for the
+ *                       CRR-N policy
+ * \param[out] resp      resources references are placed in this array
+ * \param[in]  moving_req signifies limited caller context; used to perform
+ *                       memory allocations in an atomic context in this
+ *                       policy
+ *
+ * \retval 0   we are returning a top-level, parent resource, one that is
+ *            embedded in an nrs_crrn_net object
+ * \retval 1   we are returning a bottom-level resource, one that is embedded
+ *            in an nrs_crrn_client object
+ *
+ * \see nrs_resource_get_safe()
+ */
+int nrs_crrn_res_get(struct ptlrpc_nrs_policy *policy,
+                    struct ptlrpc_nrs_request *nrq,
+                    const struct ptlrpc_nrs_resource *parent,
+                    struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+       struct nrs_crrn_net     *net;
+       struct nrs_crrn_client  *cli;
+       struct nrs_crrn_client  *tmp;
+       struct ptlrpc_request   *req;
+
+       if (parent == NULL) {
+               *resp = &((struct nrs_crrn_net *)policy->pol_private)->cn_res;
+               return 0;
+       }
+
+       net = container_of(parent, struct nrs_crrn_net, cn_res);
+       req = container_of(nrq, struct ptlrpc_request, rq_nrq);
+
+       cli = cfs_hash_lookup(net->cn_cli_hash, &req->rq_peer.nid);
+       if (cli != NULL)
+               goto out;
+
+       OBD_CPT_ALLOC_GFP(cli, nrs_pol2cptab(policy), nrs_pol2cptid(policy),
+                         sizeof(*cli), moving_req ? CFS_ALLOC_ATOMIC :
+                         CFS_ALLOC_IO);
+       if (cli == NULL)
+               return -ENOMEM;
+
+       cli->cc_nid = req->rq_peer.nid;
+
+       cfs_atomic_set(&cli->cc_ref, 1);
+       tmp = cfs_hash_findadd_unique(net->cn_cli_hash, &cli->cc_nid,
+                                     &cli->cc_hnode);
+       if (tmp != cli) {
+               OBD_FREE_PTR(cli);
+               cli = tmp;
+       }
+out:
+       *resp = &cli->cc_res;
+
+       return 1;
+}
+
+/**
+ * Called when releasing references to the resource hierarchy obtained for a
+ * request for scheduling using the CRR-N policy.
+ *
+ * \param[in] policy   the policy the resource belongs to
+ * \param[in] res      the resource to be released
+ */
+static void nrs_crrn_res_put(struct ptlrpc_nrs_policy *policy,
+                            const struct ptlrpc_nrs_resource *res)
+{
+       struct nrs_crrn_net     *net;
+       struct nrs_crrn_client  *cli;
+
+       /**
+        * Do nothing for freeing parent, nrs_crrn_net resources
+        */
+       if (res->res_parent == NULL)
+               return;
+
+       cli = container_of(res, struct nrs_crrn_client, cc_res);
+       net = container_of(res->res_parent, struct nrs_crrn_net, cn_res);
+
+       cfs_hash_put(net->cn_cli_hash, &cli->cc_hnode);
+}
+
+/**
+ * Called when getting a request from the CRR-N policy, so that it can be served
+ *
+ * \param[in] policy the policy being polled
+ * \param[in] peek   when set, signifies that we just want to examine the
+ *                  request, and not handle it, so the request is not removed
+ *                  from the policy.
+ * \param[in] force  force the policy to return a request; unused in this policy
+ *
+ * \retval the request to be handled
+ * \retval NULL no request available
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request *nrs_crrn_req_get(struct ptlrpc_nrs_policy *policy,
+                                           bool peek, bool force)
+{
+       struct nrs_crrn_net       *net = policy->pol_private;
+       cfs_binheap_node_t        *node = cfs_binheap_root(net->cn_binheap);
+       struct ptlrpc_nrs_request *nrq;
+
+       nrq = unlikely(node == NULL) ? NULL :
+             container_of(node, struct ptlrpc_nrs_request, nr_node);
+
+       if (likely(!peek && nrq != NULL)) {
+               struct nrs_crrn_client *cli;
+               struct ptlrpc_request *req = container_of(nrq,
+                                                         struct ptlrpc_request,
+                                                         rq_nrq);
+
+               cli = container_of(nrs_request_resource(nrq),
+                                  struct nrs_crrn_client, cc_res);
+
+               LASSERT(nrq->nr_u.crr.cr_round <= cli->cc_round);
+
+               cfs_binheap_remove(net->cn_binheap, &nrq->nr_node);
+               cli->cc_active--;
+
+               CDEBUG(D_RPCTRACE,
+                      "NRS: starting to handle %s request from %s, with round "
+                      LPU64"\n", NRS_POL_NAME_CRRN,
+                      libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round);
+
+               /** Peek at the next request to be served */
+               node = cfs_binheap_root(net->cn_binheap);
+
+               /** No more requests */
+               if (unlikely(node == NULL)) {
+                       net->cn_round++;
+               } else {
+                       struct ptlrpc_nrs_request *next;
+
+                       next = container_of(node, struct ptlrpc_nrs_request,
+                                           nr_node);
+
+                       if (net->cn_round < next->nr_u.crr.cr_round)
+                               net->cn_round = next->nr_u.crr.cr_round;
+               }
+       }
+
+       return nrq;
+}
+
+/**
+ * Adds request \a nrq to a CRR-N \a policy instance's set of queued requests
+ *
+ * A scheduling round is a stream of requests that have been sorted in batches
+ * according to the client that they originate from (as identified by its NID);
+ * there can be only one batch for each client in each round. The batches are of
+ * maximum size nrs_crrn_net:cn_quantum. When a new request arrives for
+ * scheduling from a client that has exhausted its quantum in its current round,
+ * it will start scheduling requests on the next scheduling round. Clients are
+ * allowed to schedule requests against a round until all requests for the round
+ * are serviced, so a client might miss a round if it is not generating requests
+ * for a long enough period of time. Clients that miss a round will continue
+ * with scheduling the next request that they generate, starting at the round
+ * that requests are being dispatched for, at the time of arrival of this new
+ * request.
+ *
+ * Requests are tagged with the round number and a sequence number; the sequence
+ * number indicates the relative ordering amongst the batches of requests in a
+ * round, and is identical for all requests in a batch, as is the round number.
+ * The round and sequence numbers are used by crrn_req_compare() in order to
+ * maintain an ordered set of rounds, with each round consisting of an ordered
+ * set of batches of requests.
+ *
+ * \param[in] policy the policy
+ * \param[in] nrq    the request to add
+ *
+ * \retval 0   request successfully added
+ * \retval != 0 error
+ */
+static int nrs_crrn_req_add(struct ptlrpc_nrs_policy *policy,
+                           struct ptlrpc_nrs_request *nrq)
+{
+       struct nrs_crrn_net     *net;
+       struct nrs_crrn_client  *cli;
+       int                      rc;
+
+       cli = container_of(nrs_request_resource(nrq),
+                          struct nrs_crrn_client, cc_res);
+       net = container_of(nrs_request_resource(nrq)->res_parent,
+                          struct nrs_crrn_net, cn_res);
+
+       if (cli->cc_quantum == 0 || cli->cc_round < net->cn_round ||
+           (cli->cc_active == 0 && cli->cc_quantum > 0)) {
+
+               /**
+                * If the client has no pending requests, and still some of its
+                * quantum remaining unused, which implies it has not had a
+                * chance to schedule up to its maximum allowed batch size of
+                * requests in the previous round it participated, schedule this
+                * next request on a new round; this avoids fragmentation of
+                * request batches caused by client inactivity, at the expense
+                * of potentially slightly increased service time for the
+                * request batch this request will be a part of.
+                */
+               if (cli->cc_active == 0 && cli->cc_quantum > 0)
+                       cli->cc_round++;
+
+               /** A new scheduling round has commenced */
+               if (cli->cc_round < net->cn_round)
+                       cli->cc_round = net->cn_round;
+
+               /** I was not the last client through here */
+               if (cli->cc_sequence < net->cn_sequence)
+                       cli->cc_sequence = ++net->cn_sequence;
+               /**
+                * Reset the quantum if we have reached the maximum quantum
+                * size for this batch, or even if we have not managed to
+                * complete a batch size up to its maximum allowed size.
+                * XXX: Accessed unlocked
+                */
+               cli->cc_quantum = net->cn_quantum;
+       }
+
+       nrq->nr_u.crr.cr_round = cli->cc_round;
+       nrq->nr_u.crr.cr_sequence = cli->cc_sequence;
+
+       rc = cfs_binheap_insert(net->cn_binheap, &nrq->nr_node);
+       if (rc == 0) {
+               cli->cc_active++;
+               if (--cli->cc_quantum == 0)
+                       cli->cc_round++;
+       }
+       return rc;
+}
+
+/**
+ * Removes request \a nrq from a CRR-N \a policy instance's set of queued
+ * requests.
+ *
+ * \param[in] policy the policy
+ * \param[in] nrq    the request to remove
+ */
+static void nrs_crrn_req_del(struct ptlrpc_nrs_policy *policy,
+                            struct ptlrpc_nrs_request *nrq)
+{
+       struct nrs_crrn_net     *net;
+       struct nrs_crrn_client  *cli;
+       bool                     is_root;
+
+       cli = container_of(nrs_request_resource(nrq),
+                          struct nrs_crrn_client, cc_res);
+       net = container_of(nrs_request_resource(nrq)->res_parent,
+                          struct nrs_crrn_net, cn_res);
+
+       LASSERT(nrq->nr_u.crr.cr_round <= cli->cc_round);
+
+       is_root = &nrq->nr_node == cfs_binheap_root(net->cn_binheap);
+
+       cfs_binheap_remove(net->cn_binheap, &nrq->nr_node);
+       cli->cc_active--;
+
+       /**
+        * If we just deleted the node at the root of the binheap, we may have
+        * to adjust round numbers.
+        */
+       if (unlikely(is_root)) {
+               /** Peek at the next request to be served */
+               cfs_binheap_node_t *node = cfs_binheap_root(net->cn_binheap);
+
+               /** No more requests */
+               if (unlikely(node == NULL)) {
+                       net->cn_round++;
+               } else {
+                       nrq = container_of(node, struct ptlrpc_nrs_request,
+                                          nr_node);
+
+                       if (net->cn_round < nrq->nr_u.crr.cr_round)
+                               net->cn_round = nrq->nr_u.crr.cr_round;
+               }
+       }
+}
+
+/**
+ * Called right after the request \a nrq finishes being handled by CRR-N policy
+ * instance \a policy.
+ *
+ * \param[in] policy the policy that handled the request
+ * \param[in] nrq    the request that was handled
+ */
+static void nrs_crrn_req_stop(struct ptlrpc_nrs_policy *policy,
+                             struct ptlrpc_nrs_request *nrq)
+{
+       struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request,
+                                                 rq_nrq);
+
+       CDEBUG(D_RPCTRACE,
+              "NRS: finished handling %s request from %s, with round "LPU64
+              "\n", NRS_POL_NAME_CRRN,
+              libcfs_id2str(req->rq_peer), nrq->nr_u.crr.cr_round);
+}
+
+#ifdef LPROCFS
+
+/**
+ * lprocfs interface
+ */
+
+/**
+ * Retrieves the value of the Round Robin quantum (i.e. the maximum batch size)
+ * for CRR-N policy instances on both the regular and high-priority NRS head
+ * of a service, as long as a policy instance is not in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state; policy instances in this
+ * state are skipped later by nrs_crrn_ctl().
+ *
+ * Quantum values are in # of RPCs, and output is in YAML format.
+ *
+ * For example:
+ *
+ *     reg_quantum:8
+ *     hp_quantum:4
+ */
+static int ptlrpc_lprocfs_rd_nrs_crrn_quantum(char *page, char **start,
+                                             off_t off, int count, int *eof,
+                                             void *data)
+{
+       struct ptlrpc_service       *svc = data;
+       __u16                        quantum;
+       int                          rc;
+       int                          rc2 = 0;
+
+       /**
+        * Perform two separate calls to this as only one of the NRS heads'
+        * policies may be in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
+        * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING state.
+        */
+       rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+                                      NRS_POL_NAME_CRRN,
+                                      NRS_CTL_CRRN_RD_QUANTUM,
+                                      true, &quantum);
+       if (rc == 0) {
+               *eof = 1;
+               rc2 = snprintf(page, count, NRS_LPROCFS_QUANTUM_NAME_REG
+                              "%-5d\n", quantum);
+               /**
+                * Ignore -ENODEV as the regular NRS head's policy may be in the
+                * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+                */
+       } else if (rc != -ENODEV) {
+               return rc;
+       }
+
+       if (!nrs_svc_has_hp(svc))
+               goto no_hp;
+
+       rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+                                      NRS_POL_NAME_CRRN,
+                                      NRS_CTL_CRRN_RD_QUANTUM,
+                                      true, &quantum);
+       if (rc == 0) {
+               *eof = 1;
+               rc2 += snprintf(page + rc2, count - rc2,
+                               NRS_LPROCFS_QUANTUM_NAME_HP"%-5d\n", quantum);
+               /**
+                * Ignore -ENODEV as the high priority NRS head's policy may be
+                * in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state.
+                */
+       } else if (rc != -ENODEV) {
+               return rc;
+       }
+
+no_hp:
+
+       return rc2 ? : rc;
+}
+
+/**
+ * lprocfs write handler that updates the Round Robin quantum (i.e. the maximum
+ * batch size) used by the CRR-N policy instances of a service. The regular
+ * and high priority NRS heads can be addressed individually through their
+ * labels, together by giving both labels in one command, or together by
+ * writing a bare number.
+ *
+ * For example:
+ *
+ * lctl set_param *.*.*.nrs_crrn_quantum=reg_quantum:32, to set the regular
+ * request quantum size on all PTLRPC services to 32
+ *
+ * lctl set_param *.*.*.nrs_crrn_quantum=hp_quantum:16, to set the high
+ * priority request quantum size on all PTLRPC services to 16, and
+ *
+ * lctl set_param *.*.ost_io.nrs_crrn_quantum=16, to set both the regular and
+ * high priority request quantum sizes of the ost_io service to 16.
+ *
+ * Every requested value has to be in the range [1, LPROCFS_NRS_QUANTUM_MAX].
+ *
+ * Policy instances in the ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED state
+ * are skipped later by nrs_crrn_ctl().
+ *
+ * \param[in] file   not used by this handler
+ * \param[in] buffer the command string, in user space memory
+ * \param[in] count  the number of bytes in \a buffer
+ * \param[in] data   the PTLRPC service the lprocfs entry belongs to
+ *
+ * \retval count   the command was applied to at least one NRS head
+ * \retval -EINVAL the command is too long, does not start with a digit when
+ *                 no label is present, or carries an out of range value
+ * \retval -EFAULT the command could not be copied from user space
+ * \retval -ENODEV the high priority quantum was requested on a service that
+ *                 has no high priority NRS head, or the operation returned
+ *                 -ENODEV on every head named by the command
+ * \retval < 0     any other error from ptlrpc_nrs_policy_control()
+ */
+static int ptlrpc_lprocfs_wr_nrs_crrn_quantum(struct file *file,
+                                             const char *buffer,
+                                             unsigned long count, void *data)
+{
+       struct ptlrpc_service      *svc = data;
+       enum ptlrpc_nrs_queue_type  heads = 0;
+       char                        cmd[LPROCFS_NRS_WR_QUANTUM_MAX_CMD];
+       char                       *pos;
+       long                        reg_val;
+       long                        hp_val;
+       /**
+        * lprocfs_find_named_value() modifies its length argument, so each
+        * lookup works on a scratch copy of \a count
+        */
+       unsigned long               len;
+       int                         reg_rc = 0;
+       int                         hp_rc = 0;
+
+       /** Leave room for the terminating NUL that is appended below */
+       if (count >= sizeof(cmd))
+               return -EINVAL;
+
+       if (cfs_copy_from_user(cmd, buffer, count) != 0)
+               return -EFAULT;
+
+       cmd[count] = '\0';
+
+       /**
+        * A lookup that hands back the start of the buffer means that the
+        * label is absent from the command.
+        *
+        * Regular quantum label first.
+        */
+       len = count;
+       pos = lprocfs_find_named_value(cmd, NRS_LPROCFS_QUANTUM_NAME_REG,
+                                      &len);
+       if (pos != cmd) {
+               reg_val = simple_strtol(pos, NULL, 10);
+               heads |= PTLRPC_NRS_QUEUE_REG;
+       }
+
+       /**
+        * High priority quantum label next; it is only acceptable on
+        * services that have a high priority NRS head.
+        */
+       len = count;
+       pos = lprocfs_find_named_value(cmd, NRS_LPROCFS_QUANTUM_NAME_HP,
+                                      &len);
+       if (pos != cmd) {
+               if (!nrs_svc_has_hp(svc))
+                       return -ENODEV;
+
+               hp_val = simple_strtol(pos, NULL, 10);
+               heads |= PTLRPC_NRS_QUEUE_HP;
+       }
+
+       /**
+        * No label at all: the command has to be a plain number, which is
+        * then applied to every NRS head the service has.
+        */
+       if (heads == 0) {
+               if (!isdigit(cmd[0]))
+                       return -EINVAL;
+
+               reg_val = simple_strtol(cmd, NULL, 10);
+               heads = PTLRPC_NRS_QUEUE_REG;
+
+               if (nrs_svc_has_hp(svc)) {
+                       hp_val = reg_val;
+                       heads |= PTLRPC_NRS_QUEUE_HP;
+               }
+       }
+
+       /** Range check only the values that were actually requested */
+       if ((heads & PTLRPC_NRS_QUEUE_REG) != 0 &&
+           (reg_val <= 0 || reg_val > LPROCFS_NRS_QUANTUM_MAX))
+               return -EINVAL;
+
+       if ((heads & PTLRPC_NRS_QUEUE_HP) != 0 &&
+           (hp_val <= 0 || hp_val > LPROCFS_NRS_QUANTUM_MAX))
+               return -EINVAL;
+
+       /**
+        * The regular and HP NRS heads are updated by separate calls, so that
+        * an -ENODEV from a head on which the user has not started the policy
+        * does not abort the update of the other head. -ENODEV is fatal right
+        * away only when the failing head is the sole head named by the
+        * command; any other error is always fatal.
+        */
+       if ((heads & PTLRPC_NRS_QUEUE_REG) != 0) {
+               reg_rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_REG,
+                                                  NRS_POL_NAME_CRRN,
+                                                  NRS_CTL_CRRN_WR_QUANTUM,
+                                                  false, &reg_val);
+               if (reg_rc == -ENODEV) {
+                       if (heads == PTLRPC_NRS_QUEUE_REG)
+                               return reg_rc;
+               } else if (reg_rc < 0) {
+                       return reg_rc;
+               }
+       }
+
+       if ((heads & PTLRPC_NRS_QUEUE_HP) != 0) {
+               hp_rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_HP,
+                                                 NRS_POL_NAME_CRRN,
+                                                 NRS_CTL_CRRN_WR_QUANTUM,
+                                                 false, &hp_val);
+               if (hp_rc == -ENODEV) {
+                       if (heads == PTLRPC_NRS_QUEUE_HP)
+                               return hp_rc;
+               } else if (hp_rc < 0) {
+                       return hp_rc;
+               }
+       }
+
+       /** Both heads were requested and both reported -ENODEV */
+       if (reg_rc == -ENODEV && hp_rc == -ENODEV)
+               return -ENODEV;
+
+       return count;
+}
+
+/**
+ * Registers the "nrs_crrn_quantum" lprocfs entry of the CRR-N policy under
+ * the proc root of service \a svc. Nothing is registered, and success is
+ * reported, when the service has no proc root.
+ *
+ * \param[in] svc the service
+ *
+ * \retval 0   success, or the service has no proc root
+ * \retval != 0        error returned by lprocfs_add_vars()
+ */
+int nrs_crrn_lprocfs_init(struct ptlrpc_service *svc)
+{
+       /** A single tunable, followed by the terminating entry */
+       struct lprocfs_vars crrn_vars[] = {
+               {
+                       .name           = "nrs_crrn_quantum",
+                       .read_fptr      = ptlrpc_lprocfs_rd_nrs_crrn_quantum,
+                       .write_fptr     = ptlrpc_lprocfs_wr_nrs_crrn_quantum,
+                       .data           = svc
+               },
+               { .name = NULL }
+       };
+
+       if (svc->srv_procroot == NULL)
+               return 0;
+
+       return lprocfs_add_vars(svc->srv_procroot, crrn_vars, NULL);
+}
+
+/**
+ * Removes the "nrs_crrn_quantum" lprocfs entry of the CRR-N policy from the
+ * proc root of service \a svc; does nothing when the service has no proc
+ * root.
+ *
+ * \param[in] svc the service
+ */
+void nrs_crrn_lprocfs_fini(struct ptlrpc_service *svc)
+{
+       if (svc->srv_procroot != NULL)
+               lprocfs_remove_proc_entry("nrs_crrn_quantum",
+                                         svc->srv_procroot);
+}
+
+#endif /* LPROCFS */
+
+/**
+ * CRR-N policy operations
+ *
+ * Callback table that nrs_conf_crrn hands to the NRS framework. The lprocfs
+ * hooks are only part of the table when LPROCFS is defined.
+ */
+static const struct ptlrpc_nrs_pol_ops nrs_crrn_ops = {
+       /** policy start, stop and control hooks */
+       .op_policy_start        = nrs_crrn_start,
+       .op_policy_stop         = nrs_crrn_stop,
+       .op_policy_ctl          = nrs_crrn_ctl,
+       /** resource get and put hooks */
+       .op_res_get             = nrs_crrn_res_get,
+       .op_res_put             = nrs_crrn_res_put,
+       /** request get, enqueue, dequeue and stop hooks */
+       .op_req_get             = nrs_crrn_req_get,
+       .op_req_enqueue         = nrs_crrn_req_add,
+       .op_req_dequeue         = nrs_crrn_req_del,
+       .op_req_stop            = nrs_crrn_req_stop,
+#ifdef LPROCFS
+       /** registration and removal of the nrs_crrn_quantum lprocfs entry */
+       .op_lprocfs_init        = nrs_crrn_lprocfs_init,
+       .op_lprocfs_fini        = nrs_crrn_lprocfs_fini,
+#endif
+};
+
+/**
+ * CRR-N policy configuration
+ *
+ * Ties the policy name NRS_POL_NAME_CRRN to the nrs_crrn_ops operations
+ * table. The compatibility callback is nrs_policy_compat_all; presumably this
+ * makes the policy usable with every service - NOTE(review): confirm against
+ * the definition of nrs_policy_compat_all().
+ */
+struct ptlrpc_nrs_pol_conf nrs_conf_crrn = {
+       .nc_name                = NRS_POL_NAME_CRRN,
+       .nc_ops                 = &nrs_crrn_ops,
+       .nc_compat              = nrs_policy_compat_all,
+};
+
+/** @} CRR-N policy */
+
+/** @} nrs */
+
+#endif /* HAVE_SERVER_SUPPORT */
index ca0a582..45d9fed 100644 (file)
@@ -209,6 +209,22 @@ struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq)
        return nrs_request_resource(nrq)->res_policy;
 }
 
+/** Labels that select the regular or HP quantum in nrs_crrn_quantum commands */
+#define NRS_LPROCFS_QUANTUM_NAME_REG   "reg_quantum:"
+#define NRS_LPROCFS_QUANTUM_NAME_HP    "hp_quantum:"
+
+/** The maximum size of nrs_crrn_client::cc_quantum */
+#define LPROCFS_NRS_QUANTUM_MAX                65535
+
+/**
+ * Buffer size for the longest valid command: both labels, "65535" twice, one
+ * separating space, and the terminating NUL that sizeof counts in the literal.
+ */
+#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD                                        \
+       (sizeof(NRS_LPROCFS_QUANTUM_NAME_REG                                   \
+               __stringify(LPROCFS_NRS_QUANTUM_MAX) " "                       \
+               NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX)))
+
 /* recovd_thread.c */
 
 int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink);