/* * GPL HEADER START * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 only, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License version 2 for more details. * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ /* * Copyright (c) 2014, Intel Corporation. * * Copyright 2012 Xyratex Technology Limited */ /* * * Network Request Scheduler (NRS) * */ #ifndef _LUSTRE_NRS_H #define _LUSTRE_NRS_H /** * \defgroup nrs Network Request Scheduler * @{ */ struct ptlrpc_nrs_policy; struct ptlrpc_nrs_resource; struct ptlrpc_nrs_request; /** * NRS control operations. * * These are common for all policies. */ enum ptlrpc_nrs_ctl { /** * Not a valid opcode. */ PTLRPC_NRS_CTL_INVALID, /** * Activate the policy. */ PTLRPC_NRS_CTL_START, /** * Reserved for multiple primary policies, which may be a possibility * in the future. */ PTLRPC_NRS_CTL_STOP, /** * Policies can start using opcodes from this value and onwards for * their own purposes; the assigned value itself is arbitrary. */ PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, }; /** * NRS policy operations. * * These determine the behaviour of a policy, and are called in response to * NRS core events. */ struct ptlrpc_nrs_pol_ops { /** * Called during policy registration; this operation is optional. * * \param[in,out] policy The policy being initialized */ int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); /** * Called during policy unregistration; this operation is optional. * * \param[in,out] policy The policy being unregistered/finalized */ void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); /** * Called when activating a policy via lprocfs; policies allocate and * initialize their resources here; this operation is optional. * * \param[in,out] policy The policy being started * \param[in,out] arg A generic char buffer * * \see nrs_policy_start_locked() */ int (*op_policy_start) (struct ptlrpc_nrs_policy *policy, char *arg); /** * Called when deactivating a policy via lprocfs; policies deallocate * their resources here; this operation is optional * * \param[in,out] policy The policy being stopped * * \see nrs_policy_stop0() */ void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy); /** * Used for policy-specific operations; i.e. not generic ones like * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous * to an ioctl; this operation is optional. * * \param[in,out] policy The policy carrying out operation \a opc * \param[in] opc The command operation being carried out * \param[in,out] arg An generic buffer for communication between the * user and the control operation * * \retval -ve error * \retval 0 success * * \see ptlrpc_nrs_policy_control() */ int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy, enum ptlrpc_nrs_ctl opc, void *arg); /** * Called when obtaining references to the resources of the resource * hierarchy for a request that has arrived for handling at the PTLRPC * service. Policies should return -ve for requests they do not wish * to handle. This operation is mandatory. * * \param[in,out] policy The policy we're getting resources for. * \param[in,out] nrq The request we are getting resources for. * \param[in] parent The parent resource of the resource being * requested; set to NULL if none. * \param[out] resp The resource is to be returned here; the * fallback policy in an NRS head should * \e always return a non-NULL pointer value. * \param[in] moving_req When set, signifies that this is an attempt * to obtain resources for a request being moved * to the high-priority NRS head by * ldlm_lock_reorder_req(). * This implies two things: * 1. We are under obd_export::exp_rpc_lock and * so should not sleep. * 2. We should not perform non-idempotent or can * skip performing idempotent operations that * were carried out when resources were first * taken for the request when it was initialized * in ptlrpc_nrs_req_initialize(). * * \retval 0, +ve The level of the returned resource in the resource * hierarchy; currently only 0 (for a non-leaf resource) * and 1 (for a leaf resource) are supported by the * framework. * \retval -ve error * * \see ptlrpc_nrs_req_initialize() * \see ptlrpc_nrs_hpreq_add_nolock() * \see ptlrpc_nrs_req_hp_move() */ int (*op_res_get) (struct ptlrpc_nrs_policy *policy, struct ptlrpc_nrs_request *nrq, const struct ptlrpc_nrs_resource *parent, struct ptlrpc_nrs_resource **resp, bool moving_req); /** * Called when releasing references taken for resources in the resource * hierarchy for the request; this operation is optional. * * \param[in,out] policy The policy the resource belongs to * \param[in] res The resource to be freed * * \see ptlrpc_nrs_req_finalize() * \see ptlrpc_nrs_hpreq_add_nolock() * \see ptlrpc_nrs_req_hp_move() */ void (*op_res_put) (struct ptlrpc_nrs_policy *policy, const struct ptlrpc_nrs_resource *res); /** * Obtains a request for handling from the policy, and optionally * removes the request from the policy; this operation is mandatory. * * \param[in,out] policy The policy to poll * \param[in] peek When set, signifies that we just want to * examine the request, and not handle it, so the * request is not removed from the policy. * \param[in] force When set, it will force a policy to return a * request if it has one queued. * * \retval NULL No request available for handling * \retval valid-pointer The request polled for handling * * \see ptlrpc_nrs_req_get_nolock() */ struct ptlrpc_nrs_request * (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek, bool force); /** * Called when attempting to add a request to a policy for later * handling; this operation is mandatory. * * \param[in,out] policy The policy on which to enqueue \a nrq * \param[in,out] nrq The request to enqueue * * \retval 0 success * \retval != 0 error * * \see ptlrpc_nrs_req_add_nolock() */ int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy, struct ptlrpc_nrs_request *nrq); /** * Removes a request from the policy's set of pending requests. Normally * called after a request has been polled successfully from the policy * for handling; this operation is mandatory. * * \param[in,out] policy The policy the request \a nrq belongs to * \param[in,out] nrq The request to dequeue * * \see ptlrpc_nrs_req_del_nolock() */ void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy, struct ptlrpc_nrs_request *nrq); /** * Called after the request being carried out. Could be used for * job/resource control; this operation is optional. * * \param[in,out] policy The policy which is stopping to handle request * \a nrq * \param[in,out] nrq The request * * \pre assert_spin_locked(&svcpt->scp_req_lock) * * \see ptlrpc_nrs_req_stop_nolock() */ void (*op_req_stop) (struct ptlrpc_nrs_policy *policy, struct ptlrpc_nrs_request *nrq); /** * Registers the policy's lprocfs interface with a PTLRPC service. * * \param[in] svc The service * * \retval 0 success * \retval != 0 error */ int (*op_lprocfs_init) (struct ptlrpc_service *svc); /** * Unegisters the policy's lprocfs interface with a PTLRPC service. * * In cases of failed policy registration in * \e ptlrpc_nrs_policy_register(), this function may be called for a * service which has not registered the policy successfully, so * implementations of this method should make sure their operations are * safe in such cases. * * \param[in] svc The service */ void (*op_lprocfs_fini) (struct ptlrpc_service *svc); }; /** * Policy flags */ enum nrs_policy_flags { /** * Fallback policy, use this flag only on a single supported policy per * service. The flag cannot be used on policies that use * \e PTLRPC_NRS_FL_REG_EXTERN */ PTLRPC_NRS_FL_FALLBACK = (1 << 0), /** * Start policy immediately after registering. */ PTLRPC_NRS_FL_REG_START = (1 << 1), /** * This is a policy registering from a module different to the one NRS * core ships in (currently ptlrpc). */ PTLRPC_NRS_FL_REG_EXTERN = (1 << 2), }; /** * NRS queue type. * * Denotes whether an NRS instance is for handling normal or high-priority * RPCs, or whether an operation pertains to one or both of the NRS instances * in a service. */ enum ptlrpc_nrs_queue_type { PTLRPC_NRS_QUEUE_REG = (1 << 0), PTLRPC_NRS_QUEUE_HP = (1 << 1), PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP) }; /** * NRS head * * A PTLRPC service has at least one NRS head instance for handling normal * priority RPCs, and may optionally have a second NRS head instance for * handling high-priority RPCs. Each NRS head maintains a list of available * policies, of which one and only one policy is acting as the fallback policy, * and optionally a different policy may be acting as the primary policy. For * all RPCs handled by this NRS head instance, NRS core will first attempt to * enqueue the RPC using the primary policy (if any). The fallback policy is * used in the following cases: * - when there was no primary policy in the * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request * was initialized. * - when the primary policy that was at the * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the * RPC was initialized, denoted it did not wish, or for some other reason was * not able to handle the request, by returning a non-valid NRS resource * reference. * - when the primary policy that was at the * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the * RPC was initialized, fails later during the request enqueueing stage. * * \see nrs_resource_get_safe() * \see nrs_request_enqueue() */ struct ptlrpc_nrs { spinlock_t nrs_lock; /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */ /** * List of registered policies */ struct list_head nrs_policy_list; /** * List of policies with queued requests. Policies that have any * outstanding requests are queued here, and this list is queried * in a round-robin manner from NRS core when obtaining a request * for handling. This ensures that requests from policies that at some * point transition away from the * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained. */ struct list_head nrs_policy_queued; /** * Service partition for this NRS head */ struct ptlrpc_service_part *nrs_svcpt; /** * Primary policy, which is the preferred policy for handling RPCs */ struct ptlrpc_nrs_policy *nrs_policy_primary; /** * Fallback policy, which is the backup policy for handling RPCs */ struct ptlrpc_nrs_policy *nrs_policy_fallback; /** * This NRS head handles either HP or regular requests */ enum ptlrpc_nrs_queue_type nrs_queue_type; /** * # queued requests from all policies in this NRS head */ unsigned long nrs_req_queued; /** * # scheduled requests from all policies in this NRS head */ unsigned long nrs_req_started; /** * # policies on this NRS */ unsigned nrs_num_pols; /** * This NRS head is in progress of starting a policy */ unsigned nrs_policy_starting:1; /** * In progress of shutting down the whole NRS head; used during * unregistration */ unsigned nrs_stopping:1; /** * NRS policy is throttling reqeust */ unsigned nrs_throttling:1; }; #define NRS_POL_NAME_MAX 16 #define NRS_POL_ARG_MAX 16 struct ptlrpc_nrs_pol_desc; /** * Service compatibility predicate; this determines whether a policy is adequate * for handling RPCs of a particular PTLRPC service. * * XXX:This should give the same result during policy registration and * unregistration, and for all partitions of a service; so the result should not * depend on temporal service or other properties, that may influence the * result. */ typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc, const struct ptlrpc_nrs_pol_desc *desc); struct ptlrpc_nrs_pol_conf { /** * Human-readable policy name */ char nc_name[NRS_POL_NAME_MAX]; /** * NRS operations for this policy */ const struct ptlrpc_nrs_pol_ops *nc_ops; /** * Service compatibility predicate */ nrs_pol_desc_compat_t nc_compat; /** * Set for policies that support a single ptlrpc service, i.e. ones that * have \a pd_compat set to nrs_policy_compat_one(). The variable value * depicts the name of the single service that such policies are * compatible with. */ const char *nc_compat_svc_name; /** * Owner module for this policy descriptor; policies registering from a * different module to the one the NRS framework is held within * (currently ptlrpc), should set this field to THIS_MODULE. */ struct module *nc_owner; /** * Policy registration flags; a bitmast of \e nrs_policy_flags */ unsigned nc_flags; }; /** * NRS policy registering descriptor * * Is used to hold a description of a policy that can be passed to NRS core in * order to register the policy with NRS heads in different PTLRPC services. */ struct ptlrpc_nrs_pol_desc { /** * Human-readable policy name */ char pd_name[NRS_POL_NAME_MAX]; /** * Link into nrs_core::nrs_policies */ struct list_head pd_list; /** * NRS operations for this policy */ const struct ptlrpc_nrs_pol_ops *pd_ops; /** * Service compatibility predicate */ nrs_pol_desc_compat_t pd_compat; /** * Set for policies that are compatible with only one PTLRPC service. * * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name */ const char *pd_compat_svc_name; /** * Owner module for this policy descriptor. * * We need to hold a reference to the module whenever we might make use * of any of the module's contents, i.e. * - If one or more instances of the policy are at a state where they * might be handling a request, i.e. * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference * is taken on the module when * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it * becomes 0, so that we hold only one reference to the module maximum * at any time. * * We do not need to hold a reference to the module, even though we * might use code and data from the module, in the following cases: * - During external policy registration, because this should happen in * the module's init() function, in which case the module is safe from * removal because a reference is being held on the module by the * kernel, and iirc kmod (and I guess module-init-tools also) will * serialize any racing processes properly anyway. * - During external policy unregistration, because this should happen * in a module's exit() function, and any attempts to start a policy * instance would need to take a reference on the module, and this is * not possible once we have reached the point where the exit() * handler is called. * - During service registration and unregistration, as service setup * and cleanup, and policy registration, unregistration and policy * instance starting, are serialized by \e nrs_core::nrs_mutex, so * as long as users adhere to the convention of registering policies * in init() and unregistering them in module exit() functions, there * should not be a race between these operations. * - During any policy-specific lprocfs operations, because a reference * is held by the kernel on a proc entry that has been entered by a * syscall, so as long as proc entries are removed during * unregistration time, then unregistration and lprocfs operations * will be properly serialized. */ struct module *pd_owner; /** * Bitmask of \e nrs_policy_flags */ unsigned pd_flags; /** * # of references on this descriptor */ atomic_t pd_refs; }; /** * NRS policy state * * Policies transition from one state to the other during their lifetime */ enum ptlrpc_nrs_pol_state { /** * Not a valid policy state. */ NRS_POL_STATE_INVALID, /** * Policies are at this state either at the start of their life, or * transition here when the user selects a different policy to act * as the primary one. */ NRS_POL_STATE_STOPPED, /** * Policy is progress of stopping */ NRS_POL_STATE_STOPPING, /** * Policy is in progress of starting */ NRS_POL_STATE_STARTING, /** * A policy is in this state in two cases: * - it is the fallback policy, which is always in this state. * - it has been activated by the user; i.e. it is the primary policy, */ NRS_POL_STATE_STARTED, }; /** * NRS policy information * * Used for obtaining information for the status of a policy via lprocfs */ struct ptlrpc_nrs_pol_info { /** * Policy name */ char pi_name[NRS_POL_NAME_MAX]; /** * Policy argument */ char pi_arg[NRS_POL_ARG_MAX]; /** * Current policy state */ enum ptlrpc_nrs_pol_state pi_state; /** * # RPCs enqueued for later dispatching by the policy */ long pi_req_queued; /** * # RPCs started for dispatch by the policy */ long pi_req_started; /** * Is this a fallback policy? */ unsigned pi_fallback:1; }; /** * NRS policy * * There is one instance of this for each policy in each NRS head of each * PTLRPC service partition. */ struct ptlrpc_nrs_policy { /** * Linkage into the NRS head's list of policies, * ptlrpc_nrs:nrs_policy_list */ struct list_head pol_list; /** * Linkage into the NRS head's list of policies with enqueued * requests ptlrpc_nrs:nrs_policy_queued */ struct list_head pol_list_queued; /** * Current state of this policy */ enum ptlrpc_nrs_pol_state pol_state; /** * Bitmask of nrs_policy_flags */ unsigned pol_flags; /** * # RPCs enqueued for later dispatching by the policy */ long pol_req_queued; /** * # RPCs started for dispatch by the policy */ long pol_req_started; /** * Usage Reference count taken on the policy instance */ long pol_ref; /** * Human-readable policy argument */ char pol_arg[NRS_POL_ARG_MAX]; /** * The NRS head this policy has been created at */ struct ptlrpc_nrs *pol_nrs; /** * Private policy data; varies by policy type */ void *pol_private; /** * Policy descriptor for this policy instance. */ struct ptlrpc_nrs_pol_desc *pol_desc; }; /** * NRS resource * * Resources are embedded into two types of NRS entities: * - Inside NRS policies, in the policy's private data in * ptlrpc_nrs_policy::pol_private * - In objects that act as prime-level scheduling entities in different NRS * policies; e.g. on a policy that performs round robin or similar order * scheduling across client NIDs, there would be one NRS resource per unique * client NID. On a policy which performs round robin scheduling across * backend filesystem objects, there would be one resource associated with * each of the backend filesystem objects partaking in the scheduling * performed by the policy. * * NRS resources share a parent-child relationship, in which resources embedded * in policy instances are the parent entities, with all scheduling entities * a policy schedules across being the children, thus forming a simple resource * hierarchy. This hierarchy may be extended with one or more levels in the * future if the ability to have more than one primary policy is added. * * Upon request initialization, references to the then active NRS policies are * taken and used to later handle the dispatching of the request with one of * these policies. * * \see nrs_resource_get_safe() * \see ptlrpc_nrs_req_add() */ struct ptlrpc_nrs_resource { /** * This NRS resource's parent; is NULL for resources embedded in NRS * policy instances; i.e. those are top-level ones. */ struct ptlrpc_nrs_resource *res_parent; /** * The policy associated with this resource. */ struct ptlrpc_nrs_policy *res_policy; }; enum { NRS_RES_FALLBACK, NRS_RES_PRIMARY, NRS_RES_MAX }; #include #include #include #include #include /** * NRS request * * Instances of this object exist embedded within ptlrpc_request; the main * purpose of this object is to hold references to the request's resources * for the lifetime of the request, and to hold properties that policies use * use for determining the request's scheduling priority. * */ struct ptlrpc_nrs_request { /** * The request's resource hierarchy. */ struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX]; /** * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the * policy that was used to enqueue the request. * * \see nrs_request_enqueue() */ unsigned nr_res_idx; unsigned nr_initialized:1; unsigned nr_enqueued:1; unsigned nr_started:1; unsigned nr_finalized:1; struct cfs_binheap_node nr_node; /** * Policy-specific fields, used for determining a request's scheduling * priority, and other supporting functionality. */ union { /** * Fields for the FIFO policy */ struct nrs_fifo_req fifo; /** * CRR-N request defintion */ struct nrs_crrn_req crr; /** ORR and TRR share the same request definition */ struct nrs_orr_req orr; /** * TBF request definition */ struct nrs_tbf_req tbf; /** * Fields for the delay policy */ struct nrs_delay_req delay; } nr_u; /** * Externally-registering policies may want to use this to allocate * their own request properties. */ void *ext; }; /** @} nrs */ #endif