4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License version 2 for more details.
15 * You should have received a copy of the GNU General Public License
16 * version 2 along with this program; If not, see
17 * http://www.gnu.org/licenses/gpl-2.0.html
22 * Copyright (c) 2014, Intel Corporation.
24 * Copyright 2012 Xyratex Technology Limited
28 * Network Request Scheduler (NRS)
36 * \defgroup nrs Network Request Scheduler
39 struct ptlrpc_nrs_policy;
40 struct ptlrpc_nrs_resource;
41 struct ptlrpc_nrs_request;
44 * NRS control operations.
46 * These are common for all policies.
52 PTLRPC_NRS_CTL_INVALID,
54 * Activate the policy.
58 * Reserved for multiple primary policies, which may be a possibility
63 * Policies can start using opcodes from this value and onwards for
64 * their own purposes; the assigned value itself is arbitrary.
66 PTLRPC_NRS_CTL_POL_SPEC_01 = 0x20,
67 PTLRPC_NRS_CTL_POL_SPEC_02,
68 PTLRPC_NRS_CTL_POL_SPEC_03,
69 PTLRPC_NRS_CTL_POL_SPEC_04,
70 PTLRPC_NRS_CTL_POL_SPEC_05,
71 PTLRPC_NRS_CTL_POL_SPEC_06,
72 PTLRPC_NRS_CTL_POL_SPEC_07,
73 PTLRPC_NRS_CTL_POL_SPEC_08,
74 PTLRPC_NRS_CTL_POL_SPEC_09,
75 PTLRPC_NRS_CTL_POL_SPEC_10
79 * NRS policy operations.
81 * These determine the behaviour of a policy, and are called in response to
84 struct ptlrpc_nrs_pol_ops {
86 * Called during policy registration; this operation is optional.
88 * \param[in,out] policy The policy being initialized
90 int (*op_policy_init) (struct ptlrpc_nrs_policy *policy);
92 * Called during policy unregistration; this operation is optional.
94 * \param[in,out] policy The policy being unregistered/finalized
96 void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy);
98 * Called when activating a policy via lprocfs; policies allocate and
99 * initialize their resources here; this operation is optional.
101 * \param[in,out] policy The policy being started
102 * \param[in,out] arg A generic char buffer
104 * \see nrs_policy_start_locked()
106 int (*op_policy_start) (struct ptlrpc_nrs_policy *policy,
109 * Called when deactivating a policy via lprocfs; policies deallocate
110 * their resources here; this operation is optional
112 * \param[in,out] policy The policy being stopped
114 * \see nrs_policy_stop0()
116 void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy);
118 * Used for policy-specific operations; i.e. not generic ones like
119 * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous
120 * to an ioctl; this operation is optional.
122 * \param[in,out] policy The policy carrying out operation \a opc
123 * \param[in] opc The command operation being carried out
124 * \param[in,out] arg A generic buffer for communication between the
125 * user and the control operation
130 * \see ptlrpc_nrs_policy_control()
132 int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy,
133 enum ptlrpc_nrs_ctl opc, void *arg);
136 * Called when obtaining references to the resources of the resource
137 * hierarchy for a request that has arrived for handling at the PTLRPC
138 * service. Policies should return -ve for requests they do not wish
139 * to handle. This operation is mandatory.
141 * \param[in,out] policy The policy we're getting resources for.
142 * \param[in,out] nrq The request we are getting resources for.
143 * \param[in] parent The parent resource of the resource being
144 * requested; set to NULL if none.
145 * \param[out] resp The resource is to be returned here; the
146 * fallback policy in an NRS head should
147 * \e always return a non-NULL pointer value.
148 * \param[in] moving_req When set, signifies that this is an attempt
149 * to obtain resources for a request being moved
150 * to the high-priority NRS head by
151 * ldlm_lock_reorder_req().
152 * This implies two things:
153 * 1. We are under obd_export::exp_rpc_lock and
154 * so should not sleep.
155 * 2. We should not repeat non-idempotent operations,
156 * and may skip idempotent operations, that
157 * were carried out when resources were first
158 * taken for the request when it was initialized
159 * in ptlrpc_nrs_req_initialize().
161 * \retval 0, +ve The level of the returned resource in the resource
162 * hierarchy; currently only 0 (for a non-leaf resource)
163 * and 1 (for a leaf resource) are supported by the
167 * \see ptlrpc_nrs_req_initialize()
168 * \see ptlrpc_nrs_hpreq_add_nolock()
169 * \see ptlrpc_nrs_req_hp_move()
171 int (*op_res_get) (struct ptlrpc_nrs_policy *policy,
172 struct ptlrpc_nrs_request *nrq,
173 const struct ptlrpc_nrs_resource *parent,
174 struct ptlrpc_nrs_resource **resp,
177 * Called when releasing references taken for resources in the resource
178 * hierarchy for the request; this operation is optional.
180 * \param[in,out] policy The policy the resource belongs to
181 * \param[in] res The resource to be freed
183 * \see ptlrpc_nrs_req_finalize()
184 * \see ptlrpc_nrs_hpreq_add_nolock()
185 * \see ptlrpc_nrs_req_hp_move()
187 void (*op_res_put) (struct ptlrpc_nrs_policy *policy,
188 const struct ptlrpc_nrs_resource *res);
191 * Obtains a request for handling from the policy, and optionally
192 * removes the request from the policy; this operation is mandatory.
194 * \param[in,out] policy The policy to poll
195 * \param[in] peek When set, signifies that we just want to
196 * examine the request, and not handle it, so the
197 * request is not removed from the policy.
198 * \param[in] force When set, it will force a policy to return a
199 * request if it has one queued.
201 * \retval NULL No request available for handling
202 * \retval valid-pointer The request polled for handling
204 * \see ptlrpc_nrs_req_get_nolock()
206 struct ptlrpc_nrs_request *
207 (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek,
210 * Called when attempting to add a request to a policy for later
211 * handling; this operation is mandatory.
213 * \param[in,out] policy The policy on which to enqueue \a nrq
214 * \param[in,out] nrq The request to enqueue
219 * \see ptlrpc_nrs_req_add_nolock()
221 int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy,
222 struct ptlrpc_nrs_request *nrq);
224 * Removes a request from the policy's set of pending requests. Normally
225 * called after a request has been polled successfully from the policy
226 * for handling; this operation is mandatory.
228 * \param[in,out] policy The policy the request \a nrq belongs to
229 * \param[in,out] nrq The request to dequeue
231 * \see ptlrpc_nrs_req_del_nolock()
233 void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy,
234 struct ptlrpc_nrs_request *nrq);
236 * Called after the request has been carried out. Could be used for
237 * job/resource control; this operation is optional.
239 * \param[in,out] policy The policy which is stopping to handle request
241 * \param[in,out] nrq The request
243 * \pre assert_spin_locked(&svcpt->scp_req_lock)
245 * \see ptlrpc_nrs_req_stop_nolock()
247 void (*op_req_stop) (struct ptlrpc_nrs_policy *policy,
248 struct ptlrpc_nrs_request *nrq);
250 * Registers the policy's lprocfs interface with a PTLRPC service.
252 * \param[in] svc The service
257 int (*op_lprocfs_init) (struct ptlrpc_service *svc);
259 * Unregisters the policy's lprocfs interface with a PTLRPC service.
261 * In cases of failed policy registration in
262 * \e ptlrpc_nrs_policy_register(), this function may be called for a
263 * service which has not registered the policy successfully, so
264 * implementations of this method should make sure their operations are
265 * safe in such cases.
267 * \param[in] svc The service
269 void (*op_lprocfs_fini) (struct ptlrpc_service *svc);
275 enum nrs_policy_flags {
277 * Fallback policy, use this flag only on a single supported policy per
278 * service. The flag cannot be used on policies that use
279 * \e PTLRPC_NRS_FL_REG_EXTERN
281 PTLRPC_NRS_FL_FALLBACK = BIT(0),
283 * Start policy immediately after registering.
285 PTLRPC_NRS_FL_REG_START = BIT(1),
287 * This is a policy registering from a module different to the one NRS
288 * core ships in (currently ptlrpc).
290 PTLRPC_NRS_FL_REG_EXTERN = BIT(2),
296 * Denotes whether an NRS instance is for handling normal or high-priority
297 * RPCs, or whether an operation pertains to one or both of the NRS instances
300 enum ptlrpc_nrs_queue_type {
301 PTLRPC_NRS_QUEUE_REG = BIT(0),
302 PTLRPC_NRS_QUEUE_HP = BIT(1),
303 PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP)
309 * A PTLRPC service has at least one NRS head instance for handling normal
310 * priority RPCs, and may optionally have a second NRS head instance for
311 * handling high-priority RPCs. Each NRS head maintains a list of available
312 * policies, of which one and only one policy is acting as the fallback policy,
313 * and optionally a different policy may be acting as the primary policy. For
314 * all RPCs handled by this NRS head instance, NRS core will first attempt to
315 * enqueue the RPC using the primary policy (if any). The fallback policy is
316 * used in the following cases:
317 * - when there was no primary policy in the
318 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request
320 * - when the primary policy that was at the
321 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
322 * RPC was initialized, denoted it did not wish, or for some other reason was
323 * not able to handle the request, by returning a non-valid NRS resource
325 * - when the primary policy that was at the
326 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
327 * RPC was initialized, fails later during the request enqueueing stage.
329 * \see nrs_resource_get_safe()
330 * \see nrs_request_enqueue()
334 /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */
336 * List of registered policies
338 struct list_head nrs_policy_list;
340 * List of policies with queued requests. Policies that have any
341 * outstanding requests are queued here, and this list is queried
342 * in a round-robin manner from NRS core when obtaining a request
343 * for handling. This ensures that requests from policies that at some
344 * point transition away from the
345 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained.
347 struct list_head nrs_policy_queued;
349 * Service partition for this NRS head
351 struct ptlrpc_service_part *nrs_svcpt;
353 * Primary policy, which is the preferred policy for handling RPCs
355 struct ptlrpc_nrs_policy *nrs_policy_primary;
357 * Fallback policy, which is the backup policy for handling RPCs
359 struct ptlrpc_nrs_policy *nrs_policy_fallback;
361 * This NRS head handles either HP or regular requests
363 enum ptlrpc_nrs_queue_type nrs_queue_type;
365 * # queued requests from all policies in this NRS head
367 unsigned long nrs_req_queued;
369 * # scheduled requests from all policies in this NRS head
371 unsigned long nrs_req_started;
373 * # policies on this NRS
375 unsigned nrs_num_pols;
377 * This NRS head is in progress of starting a policy
379 unsigned nrs_policy_starting:1;
381 * In progress of shutting down the whole NRS head; used during
384 unsigned nrs_stopping:1;
386 * NRS policy is throttling requests
388 unsigned nrs_throttling:1;
391 #define NRS_POL_NAME_MAX 16
392 #define NRS_POL_ARG_MAX 16
394 struct ptlrpc_nrs_pol_desc;
397 * Service compatibility predicate; this determines whether a policy is adequate
398 * for handling RPCs of a particular PTLRPC service.
400 * XXX:This should give the same result during policy registration and
401 * unregistration, and for all partitions of a service; so the result should not
402 * depend on temporal service or other properties, that may influence the
405 typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc,
406 const struct ptlrpc_nrs_pol_desc *desc);
408 struct ptlrpc_nrs_pol_conf {
410 * Human-readable policy name
412 char nc_name[NRS_POL_NAME_MAX];
414 * NRS operations for this policy
416 const struct ptlrpc_nrs_pol_ops *nc_ops;
418 * Service compatibility predicate
420 nrs_pol_desc_compat_t nc_compat;
422 * Set for policies that support a single ptlrpc service, i.e. ones that
423 * have \a nc_compat set to nrs_policy_compat_one(). The variable value
424 * depicts the name of the single service that such policies are
427 const char *nc_compat_svc_name;
429 * Owner module for this policy descriptor; policies registering from a
430 * different module to the one the NRS framework is held within
431 * (currently ptlrpc), should set this field to THIS_MODULE.
433 struct module *nc_owner;
435 * Policy registration flags; a bitmask of \e nrs_policy_flags
441 * NRS policy registering descriptor
443 * Is used to hold a description of a policy that can be passed to NRS core in
444 * order to register the policy with NRS heads in different PTLRPC services.
446 struct ptlrpc_nrs_pol_desc {
448 * Human-readable policy name
450 char pd_name[NRS_POL_NAME_MAX];
452 * Link into nrs_core::nrs_policies
454 struct list_head pd_list;
456 * NRS operations for this policy
458 const struct ptlrpc_nrs_pol_ops *pd_ops;
460 * Service compatibility predicate
462 nrs_pol_desc_compat_t pd_compat;
464 * Set for policies that are compatible with only one PTLRPC service.
466 * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name
468 const char *pd_compat_svc_name;
470 * Owner module for this policy descriptor.
472 * We need to hold a reference to the module whenever we might make use
473 * of any of the module's contents, i.e.
474 * - If one or more instances of the policy are at a state where they
475 * might be handling a request, i.e.
476 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
477 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to
478 * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference
479 * is taken on the module when
480 * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it
481 * becomes 0, so that we hold only one reference to the module maximum
484 * We do not need to hold a reference to the module, even though we
485 * might use code and data from the module, in the following cases:
486 * - During external policy registration, because this should happen in
487 * the module's init() function, in which case the module is safe from
488 * removal because a reference is being held on the module by the
489 * kernel, and iirc kmod (and I guess module-init-tools also) will
490 * serialize any racing processes properly anyway.
491 * - During external policy unregistration, because this should happen
492 * in a module's exit() function, and any attempts to start a policy
493 * instance would need to take a reference on the module, and this is
494 * not possible once we have reached the point where the exit()
496 * - During service registration and unregistration, as service setup
497 * and cleanup, and policy registration, unregistration and policy
498 * instance starting, are serialized by \e nrs_core::nrs_mutex, so
499 * as long as users adhere to the convention of registering policies
500 * in init() and unregistering them in module exit() functions, there
501 * should not be a race between these operations.
502 * - During any policy-specific lprocfs operations, because a reference
503 * is held by the kernel on a proc entry that has been entered by a
504 * syscall, so as long as proc entries are removed during
505 * unregistration time, then unregistration and lprocfs operations
506 * will be properly serialized.
508 struct module *pd_owner;
510 * Bitmask of \e nrs_policy_flags
514 * # of references on this descriptor
522 * Policies transition from one state to the other during their lifetime
524 enum ptlrpc_nrs_pol_state {
526 * Not a valid policy state.
528 NRS_POL_STATE_INVALID,
530 * Policies are at this state either at the start of their life, or
531 * transition here when the user selects a different policy to act
532 * as the primary one.
534 NRS_POL_STATE_STOPPED,
536 * Policy is in progress of stopping
538 NRS_POL_STATE_STOPPING,
540 * Policy is in progress of starting
542 NRS_POL_STATE_STARTING,
544 * A policy is in this state in two cases:
545 * - it is the fallback policy, which is always in this state.
546 * - it has been activated by the user; i.e. it is the primary policy,
548 NRS_POL_STATE_STARTED,
552 * NRS policy information
554 * Used for obtaining information for the status of a policy via lprocfs
556 struct ptlrpc_nrs_pol_info {
560 char pi_name[NRS_POL_NAME_MAX];
564 char pi_arg[NRS_POL_ARG_MAX];
566 * Current policy state
568 enum ptlrpc_nrs_pol_state pi_state;
570 * # RPCs enqueued for later dispatching by the policy
574 * # RPCs started for dispatch by the policy
578 * Is this a fallback policy?
580 unsigned pi_fallback:1;
586 * There is one instance of this for each policy in each NRS head of each
587 * PTLRPC service partition.
589 struct ptlrpc_nrs_policy {
591 * Linkage into the NRS head's list of policies,
592 * ptlrpc_nrs::nrs_policy_list
594 struct list_head pol_list;
596 * Linkage into the NRS head's list of policies with enqueued
597 * requests, ptlrpc_nrs::nrs_policy_queued
599 struct list_head pol_list_queued;
601 * Current state of this policy
603 enum ptlrpc_nrs_pol_state pol_state;
605 * Bitmask of nrs_policy_flags
609 * # RPCs enqueued for later dispatching by the policy
613 * # RPCs started for dispatch by the policy
615 long pol_req_started;
617 * Usage Reference count taken on the policy instance
621 * Human-readable policy argument
623 char pol_arg[NRS_POL_ARG_MAX];
625 * The NRS head this policy has been created at
627 struct ptlrpc_nrs *pol_nrs;
629 * Private policy data; varies by policy type
633 * Policy descriptor for this policy instance.
635 struct ptlrpc_nrs_pol_desc *pol_desc;
641 * Resources are embedded into two types of NRS entities:
642 * - Inside NRS policies, in the policy's private data in
643 * ptlrpc_nrs_policy::pol_private
644 * - In objects that act as prime-level scheduling entities in different NRS
645 * policies; e.g. on a policy that performs round robin or similar order
646 * scheduling across client NIDs, there would be one NRS resource per unique
647 * client NID. On a policy which performs round robin scheduling across
648 * backend filesystem objects, there would be one resource associated with
649 * each of the backend filesystem objects partaking in the scheduling
650 * performed by the policy.
652 * NRS resources share a parent-child relationship, in which resources embedded
653 * in policy instances are the parent entities, with all scheduling entities
654 * a policy schedules across being the children, thus forming a simple resource
655 * hierarchy. This hierarchy may be extended with one or more levels in the
656 * future if the ability to have more than one primary policy is added.
658 * Upon request initialization, references to the then active NRS policies are
659 * taken and used to later handle the dispatching of the request with one of
662 * \see nrs_resource_get_safe()
663 * \see ptlrpc_nrs_req_add()
665 struct ptlrpc_nrs_resource {
667 * This NRS resource's parent; is NULL for resources embedded in NRS
668 * policy instances; i.e. those are top-level ones.
670 struct ptlrpc_nrs_resource *res_parent;
672 * The policy associated with this resource.
674 struct ptlrpc_nrs_policy *res_policy;
683 #include <lustre_nrs_fifo.h>
687 * Objects of this type are embedded into objects of the ordered set that is to
688 * be maintained by a \e struct binheap instance.
690 struct binheap_node {
691 /** Index into the binary tree */
692 unsigned int chn_index;
694 #ifdef HAVE_SERVER_SUPPORT
695 #include <lustre_nrs_tbf.h>
696 #include <lustre_nrs_crr.h>
697 #include <lustre_nrs_orr.h>
698 #endif /* HAVE_SERVER_SUPPORT */
699 #include <lustre_nrs_delay.h>
704 * Instances of this object exist embedded within ptlrpc_request; the main
705 * purpose of this object is to hold references to the request's resources
706 * for the lifetime of the request, and to hold properties that policies use
707 * for determining the request's scheduling priority.
709 struct ptlrpc_nrs_request {
711 * The request's resource hierarchy.
713 struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX];
715 * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the
716 * policy that was used to enqueue the request.
718 * \see nrs_request_enqueue()
721 unsigned nr_initialized:1;
722 unsigned nr_enqueued:1;
723 unsigned nr_started:1;
724 unsigned nr_finalized:1;
725 struct binheap_node nr_node;
728 * Policy-specific fields, used for determining a request's scheduling
729 * priority, and other supporting functionality.
733 * Fields for the FIFO policy
735 struct nrs_fifo_req fifo;
736 #ifdef HAVE_SERVER_SUPPORT
738 * CRR-N request definition
740 struct nrs_crrn_req crr;
741 /** ORR and TRR share the same request definition */
742 struct nrs_orr_req orr;
744 * TBF request definition
746 struct nrs_tbf_req tbf;
747 #endif /* HAVE_SERVER_SUPPORT */
749 * Fields for the delay policy
751 struct nrs_delay_req delay;
754 * Externally-registering policies may want to use this to allocate
755 * their own request properties.