4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License version 2 for more details.
15 * You should have received a copy of the GNU General Public License
16 * version 2 along with this program; If not, see
17 * http://www.gnu.org/licenses/gpl-2.0.html
22 * Copyright (c) 2014, Intel Corporation.
24 * Copyright 2012 Xyratex Technology Limited
28 * Network Request Scheduler (NRS)
36 * \defgroup nrs Network Request Scheduler
/** Forward declarations: these types are used by pointer in the policy operations below, ahead of their struct definitions further down in this header. */
39 struct ptlrpc_nrs_policy;
40 struct ptlrpc_nrs_resource;
41 struct ptlrpc_nrs_request;
44 * NRS control operations.
46 * These are common for all policies.
52 PTLRPC_NRS_CTL_INVALID,
54 * Activate the policy.
58 * Reserved for multiple primary policies, which may be a possibility
63 * Policies can start using opcodes from this value and onwards for
64 * their own purposes; the assigned value itself is arbitrary.
66 PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20,
70 * NRS policy operations.
72 * These determine the behaviour of a policy, and are called in response to
75 struct ptlrpc_nrs_pol_ops {
77 * Called during policy registration; this operation is optional.
79 * \param[in,out] policy The policy being initialized
81 int (*op_policy_init) (struct ptlrpc_nrs_policy *policy);
83 * Called during policy unregistration; this operation is optional.
85 * \param[in,out] policy The policy being unregistered/finalized
87 void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy);
89 * Called when activating a policy via lprocfs; policies allocate and
90 * initialize their resources here; this operation is optional.
92 * \param[in,out] policy The policy being started
93 * \param[in,out] arg A generic char buffer
95 * \see nrs_policy_start_locked()
97 int (*op_policy_start) (struct ptlrpc_nrs_policy *policy,
100 * Called when deactivating a policy via lprocfs; policies deallocate
101 * their resources here; this operation is optional
103 * \param[in,out] policy The policy being stopped
105 * \see nrs_policy_stop0()
107 void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy);
109 * Used for policy-specific operations; i.e. not generic ones like
110 * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous
111 * to an ioctl; this operation is optional.
113 * \param[in,out] policy The policy carrying out operation \a opc
114 * \param[in] opc The command operation being carried out
115 * \param[in,out] arg A generic buffer for communication between the
116 * user and the control operation
121 * \see ptlrpc_nrs_policy_control()
123 int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy,
124 enum ptlrpc_nrs_ctl opc, void *arg);
127 * Called when obtaining references to the resources of the resource
128 * hierarchy for a request that has arrived for handling at the PTLRPC
129 * service. Policies should return -ve for requests they do not wish
130 * to handle. This operation is mandatory.
132 * \param[in,out] policy The policy we're getting resources for.
133 * \param[in,out] nrq The request we are getting resources for.
134 * \param[in] parent The parent resource of the resource being
135 * requested; set to NULL if none.
136 * \param[out] resp The resource is to be returned here; the
137 * fallback policy in an NRS head should
138 * \e always return a non-NULL pointer value.
139 * \param[in] moving_req When set, signifies that this is an attempt
140 * to obtain resources for a request being moved
141 * to the high-priority NRS head by
142 * ldlm_lock_reorder_req().
143 * This implies two things:
144 * 1. We are under obd_export::exp_rpc_lock and
145 * so should not sleep.
146 * 2. We should not perform non-idempotent operations, and can
147 * skip idempotent operations, that
148 * were carried out when resources were first
149 * taken for the request when it was initialized
150 * in ptlrpc_nrs_req_initialize().
152 * \retval 0, +ve The level of the returned resource in the resource
153 * hierarchy; currently only 0 (for a non-leaf resource)
154 * and 1 (for a leaf resource) are supported by the
158 * \see ptlrpc_nrs_req_initialize()
159 * \see ptlrpc_nrs_hpreq_add_nolock()
160 * \see ptlrpc_nrs_req_hp_move()
162 int (*op_res_get) (struct ptlrpc_nrs_policy *policy,
163 struct ptlrpc_nrs_request *nrq,
164 const struct ptlrpc_nrs_resource *parent,
165 struct ptlrpc_nrs_resource **resp,
168 * Called when releasing references taken for resources in the resource
169 * hierarchy for the request; this operation is optional.
171 * \param[in,out] policy The policy the resource belongs to
172 * \param[in] res The resource to be freed
174 * \see ptlrpc_nrs_req_finalize()
175 * \see ptlrpc_nrs_hpreq_add_nolock()
176 * \see ptlrpc_nrs_req_hp_move()
178 void (*op_res_put) (struct ptlrpc_nrs_policy *policy,
179 const struct ptlrpc_nrs_resource *res);
182 * Obtains a request for handling from the policy, and optionally
183 * removes the request from the policy; this operation is mandatory.
185 * \param[in,out] policy The policy to poll
186 * \param[in] peek When set, signifies that we just want to
187 * examine the request, and not handle it, so the
188 * request is not removed from the policy.
189 * \param[in] force When set, it will force a policy to return a
190 * request if it has one queued.
192 * \retval NULL No request available for handling
193 * \retval valid-pointer The request polled for handling
195 * \see ptlrpc_nrs_req_get_nolock()
197 struct ptlrpc_nrs_request *
198 (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek,
201 * Called when attempting to add a request to a policy for later
202 * handling; this operation is mandatory.
204 * \param[in,out] policy The policy on which to enqueue \a nrq
205 * \param[in,out] nrq The request to enqueue
210 * \see ptlrpc_nrs_req_add_nolock()
212 int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy,
213 struct ptlrpc_nrs_request *nrq);
215 * Removes a request from the policy's set of pending requests. Normally
216 * called after a request has been polled successfully from the policy
217 * for handling; this operation is mandatory.
219 * \param[in,out] policy The policy the request \a nrq belongs to
220 * \param[in,out] nrq The request to dequeue
222 * \see ptlrpc_nrs_req_del_nolock()
224 void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy,
225 struct ptlrpc_nrs_request *nrq);
227 * Called after the request has been carried out. Could be used for
228 * job/resource control; this operation is optional.
230 * \param[in,out] policy The policy which is stopping to handle request
232 * \param[in,out] nrq The request
234 * \pre assert_spin_locked(&svcpt->scp_req_lock)
236 * \see ptlrpc_nrs_req_stop_nolock()
238 void (*op_req_stop) (struct ptlrpc_nrs_policy *policy,
239 struct ptlrpc_nrs_request *nrq);
241 * Registers the policy's lprocfs interface with a PTLRPC service.
243 * \param[in] svc The service
248 int (*op_lprocfs_init) (struct ptlrpc_service *svc);
250 * Unregisters the policy's lprocfs interface with a PTLRPC service.
252 * In cases of failed policy registration in
253 * \e ptlrpc_nrs_policy_register(), this function may be called for a
254 * service which has not registered the policy successfully, so
255 * implementations of this method should make sure their operations are
256 * safe in such cases.
258 * \param[in] svc The service
260 void (*op_lprocfs_fini) (struct ptlrpc_service *svc);
266 enum nrs_policy_flags {
268 * Fallback policy, use this flag only on a single supported policy per
269 * service. The flag cannot be used on policies that use
270 * \e PTLRPC_NRS_FL_REG_EXTERN
272 PTLRPC_NRS_FL_FALLBACK = BIT(0),
274 * Start policy immediately after registering.
276 PTLRPC_NRS_FL_REG_START = BIT(1),
278 * This is a policy registering from a module different to the one NRS
279 * core ships in (currently ptlrpc).
281 PTLRPC_NRS_FL_REG_EXTERN = BIT(2),
287 * Denotes whether an NRS instance is for handling normal or high-priority
288 * RPCs, or whether an operation pertains to one or both of the NRS instances
291 enum ptlrpc_nrs_queue_type {
292 PTLRPC_NRS_QUEUE_REG = BIT(0),
293 PTLRPC_NRS_QUEUE_HP = BIT(1),
294 PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP)
300 * A PTLRPC service has at least one NRS head instance for handling normal
301 * priority RPCs, and may optionally have a second NRS head instance for
302 * handling high-priority RPCs. Each NRS head maintains a list of available
303 * policies, of which one and only one policy is acting as the fallback policy,
304 * and optionally a different policy may be acting as the primary policy. For
305 * all RPCs handled by this NRS head instance, NRS core will first attempt to
306 * enqueue the RPC using the primary policy (if any). The fallback policy is
307 * used in the following cases:
308 * - when there was no primary policy in the
309 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request
311 * - when the primary policy that was at the
312 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
313 * RPC was initialized, indicated that it did not wish, or for some other reason was
314 * not able to handle the request, by returning a non-valid NRS resource
316 * - when the primary policy that was at the
317 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
318 * RPC was initialized, fails later during the request enqueueing stage.
320 * \see nrs_resource_get_safe()
321 * \see nrs_request_enqueue()
325 /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */
327 * List of registered policies
329 struct list_head nrs_policy_list;
331 * List of policies with queued requests. Policies that have any
332 * outstanding requests are queued here, and this list is queried
333 * in a round-robin manner from NRS core when obtaining a request
334 * for handling. This ensures that requests from policies that at some
335 * point transition away from the
336 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained.
338 struct list_head nrs_policy_queued;
340 * Service partition for this NRS head
342 struct ptlrpc_service_part *nrs_svcpt;
344 * Primary policy, which is the preferred policy for handling RPCs
346 struct ptlrpc_nrs_policy *nrs_policy_primary;
348 * Fallback policy, which is the backup policy for handling RPCs
350 struct ptlrpc_nrs_policy *nrs_policy_fallback;
352 * This NRS head handles either HP or regular requests
354 enum ptlrpc_nrs_queue_type nrs_queue_type;
356 * # queued requests from all policies in this NRS head
358 unsigned long nrs_req_queued;
360 * # scheduled requests from all policies in this NRS head
362 unsigned long nrs_req_started;
364 * # policies on this NRS
366 unsigned nrs_num_pols;
368 * This NRS head is in progress of starting a policy
370 unsigned nrs_policy_starting:1;
372 * In progress of shutting down the whole NRS head; used during
375 unsigned nrs_stopping:1;
377 * NRS policy is throttling requests
379 unsigned nrs_throttling:1;
/** Size of the char arrays that hold a policy name (nc_name, pd_name, pi_name). */
382 #define NRS_POL_NAME_MAX 16
/** Size of the char arrays that hold a policy argument (pi_arg, pol_arg). */
383 #define NRS_POL_ARG_MAX 16
385 struct ptlrpc_nrs_pol_desc;
388 * Service compatibility predicate; this determines whether a policy is adequate
389 * for handling RPCs of a particular PTLRPC service.
391 * XXX: This should give the same result during policy registration and
392 * unregistration, and for all partitions of a service; so the result should not
393 * depend on temporal service or other properties, that may influence the
/** Function-pointer type of the service compatibility predicate; it receives the service \a svc and the policy descriptor \a desc and returns a bool (presumably true when the policy is adequate for the service -- confirm against implementations). Stored in ptlrpc_nrs_pol_conf::nc_compat and ptlrpc_nrs_pol_desc::pd_compat. */
396 typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc,
397 const struct ptlrpc_nrs_pol_desc *desc);
399 struct ptlrpc_nrs_pol_conf {
401 * Human-readable policy name
403 char nc_name[NRS_POL_NAME_MAX];
405 * NRS operations for this policy
407 const struct ptlrpc_nrs_pol_ops *nc_ops;
409 * Service compatibility predicate
411 nrs_pol_desc_compat_t nc_compat;
413 * Set for policies that support a single ptlrpc service, i.e. ones that
414 * have \a nc_compat set to nrs_policy_compat_one(). The variable value
415 * depicts the name of the single service that such policies are
418 const char *nc_compat_svc_name;
420 * Owner module for this policy descriptor; policies registering from a
421 * different module to the one the NRS framework is held within
422 * (currently ptlrpc), should set this field to THIS_MODULE.
424 struct module *nc_owner;
426 * Policy registration flags; a bitmask of \e nrs_policy_flags
432 * NRS policy registering descriptor
434 * Is used to hold a description of a policy that can be passed to NRS core in
435 * order to register the policy with NRS heads in different PTLRPC services.
437 struct ptlrpc_nrs_pol_desc {
439 * Human-readable policy name
441 char pd_name[NRS_POL_NAME_MAX];
443 * Link into nrs_core::nrs_policies
445 struct list_head pd_list;
447 * NRS operations for this policy
449 const struct ptlrpc_nrs_pol_ops *pd_ops;
451 * Service compatibility predicate
453 nrs_pol_desc_compat_t pd_compat;
455 * Set for policies that are compatible with only one PTLRPC service.
457 * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name
459 const char *pd_compat_svc_name;
461 * Owner module for this policy descriptor.
463 * We need to hold a reference to the module whenever we might make use
464 * of any of the module's contents, i.e.
465 * - If one or more instances of the policy are at a state where they
466 * might be handling a request, i.e.
467 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
468 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to
469 * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference
470 * is taken on the module when
471 * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it
472 * becomes 0, so that we hold only one reference to the module maximum
475 * We do not need to hold a reference to the module, even though we
476 * might use code and data from the module, in the following cases:
477 * - During external policy registration, because this should happen in
478 * the module's init() function, in which case the module is safe from
479 * removal because a reference is being held on the module by the
480 * kernel, and iirc kmod (and I guess module-init-tools also) will
481 * serialize any racing processes properly anyway.
482 * - During external policy unregistration, because this should happen
483 * in a module's exit() function, and any attempts to start a policy
484 * instance would need to take a reference on the module, and this is
485 * not possible once we have reached the point where the exit()
487 * - During service registration and unregistration, as service setup
488 * and cleanup, and policy registration, unregistration and policy
489 * instance starting, are serialized by \e nrs_core::nrs_mutex, so
490 * as long as users adhere to the convention of registering policies
491 * in init() and unregistering them in module exit() functions, there
492 * should not be a race between these operations.
493 * - During any policy-specific lprocfs operations, because a reference
494 * is held by the kernel on a proc entry that has been entered by a
495 * syscall, so as long as proc entries are removed during
496 * unregistration time, then unregistration and lprocfs operations
497 * will be properly serialized.
499 struct module *pd_owner;
501 * Bitmask of \e nrs_policy_flags
505 * # of references on this descriptor
513 * Policies transition from one state to the other during their lifetime
515 enum ptlrpc_nrs_pol_state {
517 * Not a valid policy state.
519 NRS_POL_STATE_INVALID,
521 * Policies are at this state either at the start of their life, or
522 * transition here when the user selects a different policy to act
523 * as the primary one.
525 NRS_POL_STATE_STOPPED,
527 * Policy is in progress of stopping
529 NRS_POL_STATE_STOPPING,
531 * Policy is in progress of starting
533 NRS_POL_STATE_STARTING,
535 * A policy is in this state in two cases:
536 * - it is the fallback policy, which is always in this state.
537 * - it has been activated by the user; i.e. it is the primary policy,
539 NRS_POL_STATE_STARTED,
543 * NRS policy information
545 * Used for obtaining information for the status of a policy via lprocfs
547 struct ptlrpc_nrs_pol_info {
551 char pi_name[NRS_POL_NAME_MAX];
555 char pi_arg[NRS_POL_ARG_MAX];
557 * Current policy state
559 enum ptlrpc_nrs_pol_state pi_state;
561 * # RPCs enqueued for later dispatching by the policy
565 * # RPCs started for dispatch by the policy
569 * Is this a fallback policy?
571 unsigned pi_fallback:1;
577 * There is one instance of this for each policy in each NRS head of each
578 * PTLRPC service partition.
580 struct ptlrpc_nrs_policy {
582 * Linkage into the NRS head's list of policies,
583 * ptlrpc_nrs::nrs_policy_list
585 struct list_head pol_list;
587 * Linkage into the NRS head's list of policies with enqueued
588 * requests, ptlrpc_nrs::nrs_policy_queued
590 struct list_head pol_list_queued;
592 * Current state of this policy
594 enum ptlrpc_nrs_pol_state pol_state;
596 * Bitmask of nrs_policy_flags
600 * # RPCs enqueued for later dispatching by the policy
604 * # RPCs started for dispatch by the policy
606 long pol_req_started;
608 * Usage Reference count taken on the policy instance
612 * Human-readable policy argument
614 char pol_arg[NRS_POL_ARG_MAX];
616 * The NRS head this policy has been created at
618 struct ptlrpc_nrs *pol_nrs;
620 * Private policy data; varies by policy type
624 * Policy descriptor for this policy instance.
626 struct ptlrpc_nrs_pol_desc *pol_desc;
632 * Resources are embedded into two types of NRS entities:
633 * - Inside NRS policies, in the policy's private data in
634 * ptlrpc_nrs_policy::pol_private
635 * - In objects that act as prime-level scheduling entities in different NRS
636 * policies; e.g. on a policy that performs round robin or similar order
637 * scheduling across client NIDs, there would be one NRS resource per unique
638 * client NID. On a policy which performs round robin scheduling across
639 * backend filesystem objects, there would be one resource associated with
640 * each of the backend filesystem objects partaking in the scheduling
641 * performed by the policy.
643 * NRS resources share a parent-child relationship, in which resources embedded
644 * in policy instances are the parent entities, with all scheduling entities
645 * a policy schedules across being the children, thus forming a simple resource
646 * hierarchy. This hierarchy may be extended with one or more levels in the
647 * future if the ability to have more than one primary policy is added.
649 * Upon request initialization, references to the then active NRS policies are
650 * taken and used to later handle the dispatching of the request with one of
653 * \see nrs_resource_get_safe()
654 * \see ptlrpc_nrs_req_add()
656 struct ptlrpc_nrs_resource {
658 * This NRS resource's parent; is NULL for resources embedded in NRS
659 * policy instances; i.e. those are top-level ones.
661 struct ptlrpc_nrs_resource *res_parent;
663 * The policy associated with this resource.
665 struct ptlrpc_nrs_policy *res_policy;
674 #include <lustre_nrs_fifo.h>
678 * Objects of this type are embedded into objects of the ordered set that is to
679 * be maintained by a \e struct binheap instance.
681 struct binheap_node {
682 /** Index into the binary tree */
683 unsigned int chn_index;
685 #ifdef HAVE_SERVER_SUPPORT
686 #include <lustre_nrs_tbf.h>
687 #include <lustre_nrs_crr.h>
688 #include <lustre_nrs_orr.h>
689 #endif /* HAVE_SERVER_SUPPORT */
690 #include <lustre_nrs_delay.h>
695 * Instances of this object exist embedded within ptlrpc_request; the main
696 * purpose of this object is to hold references to the request's resources
697 * for the lifetime of the request, and to hold properties that policies use
698 * for determining the request's scheduling priority.
700 struct ptlrpc_nrs_request {
702 * The request's resource hierarchy.
704 struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX];
706 * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the
707 * policy that was used to enqueue the request.
709 * \see nrs_request_enqueue()
712 unsigned nr_initialized:1;
713 unsigned nr_enqueued:1;
714 unsigned nr_started:1;
715 unsigned nr_finalized:1;
716 struct binheap_node nr_node;
719 * Policy-specific fields, used for determining a request's scheduling
720 * priority, and other supporting functionality.
724 * Fields for the FIFO policy
726 struct nrs_fifo_req fifo;
727 #ifdef HAVE_SERVER_SUPPORT
729 * CRR-N request definition
731 struct nrs_crrn_req crr;
732 /** ORR and TRR share the same request definition */
733 struct nrs_orr_req orr;
735 * TBF request definition
737 struct nrs_tbf_req tbf;
738 #endif /* HAVE_SERVER_SUPPORT */
740 * Fields for the delay policy
742 struct nrs_delay_req delay;
745 * Externally-registering policies may want to use this to allocate
746 * their own request properties.