X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Finclude%2Flustre_net.h;h=a606ff27e18a0e4016bfcf3baf5518d1dcf31c94;hp=e34a426397c08e8218e89ee91ca84fd98e5b614f;hb=6b8967aa9545fbf5942cc79438d27cd38e919f70;hpb=08aa217ce49aba1ded52e0f7adb8a607035123fd diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index e34a426..a606ff2 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -83,12 +83,39 @@ #define PTLRPC_MD_OPTIONS 0 /** - * Define maxima for bulk I/O - * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) - * these limits are system wide and not interface-local. */ -#define PTLRPC_MAX_BRW_BITS LNET_MTU_BITS -#define PTLRPC_MAX_BRW_SIZE (1<<LNET_MTU_BITS) -#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> CFS_PAGE_SHIFT) + * Max # of bulk operations in one request. + * In order for the client and server to properly negotiate the maximum + * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two + * value. The client is free to limit the actual RPC size for any bulk + * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */ +#define PTLRPC_BULK_OPS_BITS 2 +#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) +/** + * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and + * should not be used on the server at all. Otherwise, it imposes a + * protocol limitation on the maximum RPC size that can be used by any + * RPC sent to that server in the future. Instead, the server should + * use the negotiated per-client ocd_brw_size to determine the bulk + * RPC count. */ +#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) + +/** + * Define maxima for bulk I/O. + * + * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT + * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the + * currently supported maximum between peers at connect via ocd_brw_size. + */ +#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS) +#define PTLRPC_MAX_BRW_SIZE (1 << PTLRPC_MAX_BRW_BITS) +#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> CFS_PAGE_SHIFT) + +#define ONE_MB_BRW_SIZE (1 << LNET_MTU_BITS) +#define MD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) +#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> CFS_PAGE_SHIFT) +#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE +#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> CFS_PAGE_SHIFT) +#define OFD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) /* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */ #ifdef __KERNEL__ @@ -98,10 +125,10 @@ # if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * CFS_PAGE_SIZE)) # error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * CFS_PAGE_SIZE" # endif -# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU) +# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT) # error "PTLRPC_MAX_BRW_SIZE too big" # endif -# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV) +# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) # error "PTLRPC_MAX_BRW_PAGES too big" # endif #endif /* __KERNEL__ */ @@ -241,7 +268,7 @@ #define LDLM_NTHRS_MAX (cfs_num_online_cpus() == 1 ? 
64 : 128) #define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT -#define LDLM_NBUFS (64 * cfs_num_online_cpus()) +#define LDLM_NBUFS 64 #define LDLM_BUFSIZE (8 * 1024) #define LDLM_MAXREQSIZE (5 * 1024) #define LDLM_MAXREPSIZE (1024) @@ -286,11 +313,14 @@ #define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT #define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS -#define MDS_NBUFS (64 * cfs_num_online_cpus()) +#define MDS_NBUFS 64 /** * Assume file name length = FNAME_MAX = 256 (true for ext3). - * path name length = PATH_MAX = 4096 - * LOV MD size max = EA_MAX = 48000 (2000 stripes) + * path name length = PATH_MAX = 4096 + * LOV MD size max = EA_MAX = 24 * 2000 + * (NB: 24 is size of lov_ost_data) + * LOV LOGCOOKIE size max = 32 * 2000 + * (NB: 32 is size of llog_cookie) * symlink: FNAME_MAX + PATH_MAX <- largest * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create) * rename: FNAME_MAX + FNAME_MAX @@ -299,23 +329,75 @@ * MDS_MAXREQSIZE ~= 4736 bytes = * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header - * or, for mds_close() and mds_reint_unlink() on a many-OST filesystem: - * = 9210 bytes = lustre_msg + mdt_body + 160 * (easize + cookiesize) * * Realistic size is about 512 bytes (20 character name + 128 char symlink), * except in the open case where there are a large number of OSTs in a LOV. */ -#define MDS_MAXREPSIZE max(10 * 1024, 362 + LOV_MAX_STRIPE_COUNT * 56) -#define MDS_MAXREQSIZE MDS_MAXREPSIZE +#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ +#define MDS_MAXREPSIZE (9 * 1024) /* >= 8300 */ + +/** + * MDS incoming request with LOV EA + * 24 = sizeof(struct lov_ost_data), i.e. replay of opencreate + */ +#define MDS_LOV_MAXREQSIZE max(MDS_MAXREQSIZE, \ + 362 + LOV_MAX_STRIPE_COUNT * 24) +/** + * MDS outgoing reply with LOV EA + * + * NB: the max reply size a Lustre 2.4+ client can get from an old MDS is: + * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes + * + * but a 2.4 or later MDS will never send a reply with llog_cookie to any + * version of client. This macro is defined for the server-side reply buffer size. + */ +#define MDS_LOV_MAXREPSIZE MDS_LOV_MAXREQSIZE + +/** + * The update request includes all of the updates from the create, which might + * include linkea (4K maximum); together with other updates, we set it to 9K: + * lustre_msg + ptlrpc_body + UPDATE_BUF_SIZE (8K) + */ +#define MDS_OUT_MAXREQSIZE (9 * 1024) +#define MDS_OUT_MAXREPSIZE MDS_MAXREPSIZE + +/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */ +#define MDS_BUFSIZE max_t(int, MDS_MAXREQSIZE + 1024, 8 * 1024) + +/** + * MDS_LOV_BUFSIZE should be at least max_reqsize (with LOV EA) + + * max sptlrpc payload size; however, we need to allocate a much larger buffer + * for it because LNet requires that each MD (rqbd) have at least MDS_LOV_MAXREQSIZE + * bytes left to avoid dropping a maximum-sized incoming request. + * So if MDS_LOV_BUFSIZE is only a little larger than MDS_LOV_MAXREQSIZE, + * then it can only fit one request even if there are 48K bytes left in + * a rqbd, and memory utilization is very low. + * + * Meanwhile, the size of a rqbd can't be too large, because a rqbd can't be + * reused until all requests that fit in it have been processed and released, + * which means one long-blocked request can prevent the rqbd from being reused. + * Now we set the request buffer size to 128K, so even if each rqbd is unlinked + * from LNet with 48K unused, buffer utilization will be about 62%. + * Please check LU-2432 for details. 
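 *
 * As a rough sketch of where that 62% figure comes from (assuming, as
 * described above, that MDS_LOV_MAXREQSIZE is close to 48K and that LNet
 * stops using a rqbd once fewer than MDS_LOV_MAXREQSIZE bytes remain in it):
 *
 *	usable space per rqbd  ~= 128K - 48K = 80K
 *	buffer utilization     ~= 80K / 128K ~= 62%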
+ */ +/** MDS_LOV_BUFSIZE = max_reqsize (w/ LOV EA) + max sptlrpc payload size */ +#define MDS_LOV_BUFSIZE max_t(int, MDS_LOV_MAXREQSIZE + 1024, \ + 128 * 1024) -/** MDS_BUFSIZE = max_reqsize + max sptlrpc payload size */ -#define MDS_BUFSIZE (MDS_MAXREQSIZE + 1024) +/** + * MDS_OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K) which is + * about 10K, for the same reason as MDS_LOV_BUFSIZE, we also give some + * extra bytes to each request buffer to improve buffer utilization rate. + */ +#define MDS_OUT_BUFSIZE max_t(int, MDS_OUT_MAXREQSIZE + 1024, \ + 24 * 1024) /** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */ #define FLD_MAXREQSIZE (160) /** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */ #define FLD_MAXREPSIZE (152) +#define FLD_BUFSIZE (1 << 12) /** * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range + @@ -324,12 +406,13 @@ /** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */ #define SEQ_MAXREPSIZE (152) +#define SEQ_BUFSIZE (1 << 12) /** MGS threads must be >= 3, see bug 22458 comment #28 */ #define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) #define MGS_NTHRS_MAX 32 -#define MGS_NBUFS (64 * cfs_num_online_cpus()) +#define MGS_NBUFS 64 #define MGS_BUFSIZE (8 * 1024) #define MGS_MAXREQSIZE (7 * 1024) #define MGS_MAXREPSIZE (9 * 1024) @@ -371,19 +454,24 @@ #define OSS_CR_NTHRS_BASE 8 #define OSS_CR_NTHRS_MAX 64 -#define OST_NBUFS (64 * cfs_num_online_cpus()) -#define OST_BUFSIZE (8 * 1024) - /** - * OST_MAXREQSIZE ~= 4768 bytes = - * lustre_msg + obdo + 16 * obd_ioobj + 256 * niobuf_remote + * OST_MAXREQSIZE ~= + * lustre_msg + obdo + obd_ioobj + DT_MAX_BRW_PAGES * niobuf_remote * * - single object with 16 pages is 512 bytes * - OST_MAXREQSIZE must be at least 1 page of cookies plus some spillover + * - Must be a multiple of 1024 */ -#define OST_MAXREQSIZE (5 * 1024) +#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + sizeof(struct obdo) + \ + sizeof(struct obd_ioobj) + DT_MAX_BRW_PAGES * \ + sizeof(struct niobuf_remote)) +#define OST_MAXREQSIZE (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1) + #define OST_MAXREPSIZE (9 * 1024) +#define OST_NBUFS 64 +#define OST_BUFSIZE (OST_MAXREQSIZE + 1024) + /* Macro to hide a typecast. */ #define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) @@ -502,6 +590,7 @@ struct ptlrpc_set_cbdata { struct ptlrpc_bulk_desc; struct ptlrpc_service_part; +struct ptlrpc_service; /** * ptlrpc callback & work item stuff @@ -626,6 +715,658 @@ struct lu_env; struct ldlm_lock; /** + * \defgroup nrs Network Request Scheduler + * @{ + */ +struct ptlrpc_nrs_policy; +struct ptlrpc_nrs_resource; +struct ptlrpc_nrs_request; + +/** + * NRS control operations. + * + * These are common for all policies. + */ +enum ptlrpc_nrs_ctl { + /** + * Activate the policy. + */ + PTLRPC_NRS_CTL_START, + /** + * Reserved for multiple primary policies, which may be a possibility + * in the future. + */ + PTLRPC_NRS_CTL_STOP, + /** + * Recycle resources for inactive policies. + */ + PTLRPC_NRS_CTL_SHRINK, + /** + * Not a valid opcode. + */ + PTLRPC_NRS_CTL_INVALID, + /** + * Policies can start using opcodes from this value and onwards for + * their own purposes; the assigned value itself is arbitrary. + */ + PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, +}; + +/** + * NRS policy operations. + * + * These determine the behaviour of a policy, and are called in response to + * NRS core events. + */ +struct ptlrpc_nrs_pol_ops { + /** + * Called during policy registration; this operation is optional. 
+ * + * \param[in] policy The policy being initialized + */ + int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); + /** + * Called during policy unregistration; this operation is optional. + * + * \param[in] policy The policy being unregistered/finalized + */ + void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); + /** + * Called when activating a policy via lprocfs; policies allocate and + * initialize their resources here; this operation is optional. + * + * \param[in] policy The policy being started + * + * \see nrs_policy_start_locked() + */ + int (*op_policy_start) (struct ptlrpc_nrs_policy *policy); + /** + * Called when deactivating a policy via lprocfs; policies deallocate + * their resources here; this operation is optional + * + * \param[in] policy The policy being stopped + * + * \see nrs_policy_stop_final() + */ + void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy); + /** + * Used for policy-specific operations; i.e. not generic ones like + * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous + * to an ioctl; this operation is optional. + * + * \param[in] policy The policy carrying out operation \a opc + * \param[in] opc The command operation being carried out + * \param[in,out] arg An generic buffer for communication between the + * user and the control operation + * + * \retval -ve error + * \retval 0 success + * + * \see ptlrpc_nrs_policy_control() + */ + int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg); + + /** + * Called when obtaining references to the resources of the resource + * hierarchy for a request that has arrived for handling at the PTLRPC + * service. Policies should return -ve for requests they do not wish + * to handle. This operation is mandatory. + * + * \param[in] policy The policy we're getting resources for. + * \param[in] nrq The request we are getting resources for. + * \param[in] parent The parent resource of the resource being + * requested; set to NULL if none. + * \param[out] resp The resource is to be returned here; the + * fallback policy in an NRS head should + * \e always return a non-NULL pointer value. + * \param[in] moving_req When set, signifies that this is an attempt + * to obtain resources for a request being moved + * to the high-priority NRS head by + * ldlm_lock_reorder_req(). + * This implies two things: + * 1. We are under obd_export::exp_rpc_lock and + * so should not sleep. + * 2. We should not perform non-idempotent or can + * skip performing idempotent operations that + * were carried out when resources were first + * taken for the request when it was initialized + * in ptlrpc_nrs_req_initialize(). + * + * \retval 0, +ve The level of the returned resource in the resource + * hierarchy; currently only 0 (for a non-leaf resource) + * and 1 (for a leaf resource) are supported by the + * framework. + * \retval -ve error + * + * \see ptlrpc_nrs_req_initialize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + int (*op_res_get) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req); + /** + * Called when releasing references taken for resources in the resource + * hierarchy for the request; this operation is optional. 
+ * + * \param[in] policy The policy the resource belongs to + * \param[in] res The resource to be freed + * + * \see ptlrpc_nrs_req_finalize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + void (*op_res_put) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_resource *res); + + /** + * Obtain a request for handling from the policy via polling; this + * operation is mandatory. + * + * \param[in] policy The policy to poll + * + * \retval NULL No request available for handling + * \retval valid-pointer The request polled for handling + * + * \see ptlrpc_nrs_req_poll_nolock() + */ + struct ptlrpc_nrs_request * + (*op_req_poll) (struct ptlrpc_nrs_policy *policy); + /** + * Called when attempting to add a request to a policy for later + * handling; this operation is mandatory. + * + * \param[in] policy The policy on which to enqueue \a nrq + * \param[in] nrq The request to enqueue + * + * \retval 0 success + * \retval != 0 error + * + * \see ptlrpc_nrs_req_add_nolock() + */ + int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Removes a request from the policy's set of pending requests. Normally + * called after a request has been polled successfully from the policy + * for handling; this operation is mandatory. + * + * \param[in] policy The policy the request \a nrq belongs to + * \param[in] nrq The request to dequeue + * + * \see ptlrpc_nrs_req_del_nolock() + */ + void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Called before carrying out the request; should not block. Could be + * used for job/resource control; this operation is optional. + * + * \param[in] policy The policy which is starting to handle request + * \a nrq + * \param[in] nrq The request + * + * \pre spin_is_locked(&svcpt->scp_req_lock) + * + * \see ptlrpc_nrs_req_start_nolock() + */ + void (*op_req_start) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Called after the request has been carried out. Could be used for + * job/resource control; this operation is optional. + * + * \param[in] policy The policy which is stopping to handle request + * \a nrq + * \param[in] nrq The request + * + * \pre spin_is_locked(&svcpt->scp_req_lock) + * + * \see ptlrpc_nrs_req_stop_nolock() + */ + void (*op_req_stop) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Registers the policy's lprocfs interface with a PTLRPC service. + * + * \param[in] svc The service + * + * \retval 0 success + * \retval != 0 error + */ + int (*op_lprocfs_init) (struct ptlrpc_service *svc); + /** + * Unregisters the policy's lprocfs interface with a PTLRPC service. + * + * \param[in] svc The service + */ + void (*op_lprocfs_fini) (struct ptlrpc_service *svc); +}; + +/** + * Policy flags + */ +enum nrs_policy_flags { + /** + * Fallback policy; use this flag only on a single supported policy per + * service. Do not use this flag for policies registering using + * ptlrpc_nrs_policy_register() (i.e. ones that are not in + * \e nrs_pols_builtin). + */ + PTLRPC_NRS_FL_FALLBACK = (1 << 0), + /** + * Start policy immediately after registering. + */ + PTLRPC_NRS_FL_REG_START = (1 << 1), + /** + * This is a policy registering externally with NRS core, via + * ptlrpc_nrs_policy_register() (i.e. one that is not in + * \e nrs_pols_builtin). Used to avoid ptlrpc_nrs_policy_register() + * racing with a policy start operation issued by the user via lprocfs. 
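 *
 * As a minimal sketch of how the pieces above fit together (the nrs_foo_*
 * names are hypothetical, and only the operations marked mandatory above
 * are filled in), an external policy would typically tie its
 * ptlrpc_nrs_pol_ops table to a descriptor carrying this flag and then
 * register it with NRS core:
 *
 *	static struct ptlrpc_nrs_pol_ops nrs_foo_ops = {
 *		.op_res_get	= nrs_foo_res_get,
 *		.op_req_poll	= nrs_foo_req_poll,
 *		.op_req_enqueue	= nrs_foo_req_enqueue,
 *		.op_req_dequeue	= nrs_foo_req_dequeue,
 *	};
 *
 *	static struct ptlrpc_nrs_pol_desc nrs_foo_desc = {
 *		.pd_name	= "foo",
 *		.pd_ops		= &nrs_foo_ops,
 *		.pd_compat	= nrs_policy_compat_all,
 *		.pd_flags	= PTLRPC_NRS_FL_REG_EXTERN,
 *	};
 *
 *	rc = ptlrpc_nrs_policy_register(&nrs_foo_desc);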
+ */ + PTLRPC_NRS_FL_REG_EXTERN = (1 << 2), +}; + +/** + * NRS queue type. + * + * Denotes whether an NRS instance is for handling normal or high-priority + * RPCs, or whether an operation pertains to one or both of the NRS instances + * in a service. + */ +enum ptlrpc_nrs_queue_type { + PTLRPC_NRS_QUEUE_REG, + PTLRPC_NRS_QUEUE_HP, + PTLRPC_NRS_QUEUE_BOTH, +}; + +/** + * NRS head + * + * A PTLRPC service has at least one NRS head instance for handling normal + * priority RPCs, and may optionally have a second NRS head instance for + * handling high-priority RPCs. Each NRS head maintains a list of available + * policies, of which one and only one policy is acting as the fallback policy, + * and optionally a different policy may be acting as the primary policy. For + * all RPCs handled by this NRS head instance, NRS core will first attempt to + * enqueue the RPC using the primary policy (if any). The fallback policy is + * used in the following cases: + * - when there was no primary policy in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request + * was initialized. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, denoted it did not wish, or for some other reason was + * not able to handle the request, by returning a non-valid NRS resource + * reference. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, fails later during the request enqueueing stage. + * + * \see nrs_resource_get_safe() + * \see nrs_request_enqueue() + */ +struct ptlrpc_nrs { + spinlock_t nrs_lock; + /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */ + /** + * Linkage into nrs_core_heads_list + */ + cfs_list_t nrs_heads; + /** + * List of registered policies + */ + cfs_list_t nrs_policy_list; + /** + * List of policies with queued requests. Policies that have any + * outstanding requests are queued here, and this list is queried + * in a round-robin manner from NRS core when obtaining a request + * for handling. This ensures that requests from policies that at some + * point transition away from the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained. + */ + cfs_list_t nrs_policy_queued; + /** + * Service partition for this NRS head + */ + struct ptlrpc_service_part *nrs_svcpt; + /** + * Primary policy, which is the preferred policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_primary; + /** + * Fallback policy, which is the backup policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_fallback; + /** + * This NRS head handles either HP or regular requests + */ + enum ptlrpc_nrs_queue_type nrs_queue_type; + /** + * # queued requests from all policies in this NRS head + */ + unsigned long nrs_req_queued; + /** + * # scheduled requests from all policies in this NRS head + */ + unsigned long nrs_req_started; + /** + * # policies on this NRS + * TODO: Can we avoid having this? 
+ */ + unsigned nrs_num_pols; + /** + * This NRS head is in progress of starting a policy + */ + unsigned nrs_policy_starting:1; + /** + * In progress of shutting down the whole NRS head; used during + * unregistration + */ + unsigned nrs_stopping:1; +}; + +#define NRS_POL_NAME_MAX 16 + +/** + * NRS policy registering descriptor + * + * Is used to hold a description of a policy that can be passed to NRS core in + * order to register the policy with NRS heads in different PTLRPC services. + */ +struct ptlrpc_nrs_pol_desc { + /** + * Human-readable policy name + */ + char pd_name[NRS_POL_NAME_MAX]; + /** + * NRS operations for this policy + */ + struct ptlrpc_nrs_pol_ops *pd_ops; + /** + * Service Compatibility function; this determines whether a policy is + * adequate for handling RPCs of a particular PTLRPC service. + * + * XXX:This should give the same result during policy + * registration and unregistration, and for all partitions of a + * service; so the result should not depend on temporal service + * or other properties, that may influence the result. + */ + bool (*pd_compat) (struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc); + /** + * Optionally set for policies that support a single ptlrpc service, + * i.e. ones that have \a pd_compat set to nrs_policy_compat_one() + */ + char *pd_compat_svc_name; + /** + * Bitmask of nrs_policy_flags + */ + unsigned pd_flags; + /** + * Link into nrs_core::nrs_policies + */ + cfs_list_t pd_list; +}; + +/** + * NRS policy state + * + * Policies transition from one state to the other during their lifetime + */ +enum ptlrpc_nrs_pol_state { + /** + * Not a valid policy state. + */ + NRS_POL_STATE_INVALID, + /** + * For now, this state is used exclusively for policies that register + * externally to NRS core, i.e. ones that do so via + * ptlrpc_nrs_policy_register() and are not part of nrs_pols_builtin; + * it is used to prevent a race condition between the policy registering + * with more than one service partition while service is operational, + * and the user starting the policy via lprocfs. + * + * \see nrs_pol_make_avail() + */ + NRS_POL_STATE_UNAVAIL, + /** + * Policies are at this state either at the start of their life, or + * transition here when the user selects a different policy to act + * as the primary one. + */ + NRS_POL_STATE_STOPPED, + /** + * Policy is progress of stopping + */ + NRS_POL_STATE_STOPPING, + /** + * Policy is in progress of starting + */ + NRS_POL_STATE_STARTING, + /** + * A policy is in this state in two cases: + * - it is the fallback policy, which is always in this state. + * - it has been activated by the user; i.e. it is the primary policy, + */ + NRS_POL_STATE_STARTED, +}; + +/** + * NRS policy information + * + * Used for obtaining information for the status of a policy via lprocfs + */ +struct ptlrpc_nrs_pol_info { + /** + * Policy name + */ + char pi_name[NRS_POL_NAME_MAX]; + /** + * Current policy state + */ + enum ptlrpc_nrs_pol_state pi_state; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pi_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pi_req_started; + /** + * Is this a fallback policy? + */ + unsigned pi_fallback:1; +}; + +/** + * NRS policy + * + * There is one instance of this for each policy in each NRS head of each + * PTLRPC service partition. 
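 *
 * (As an orientation-only sketch of the lifecycle described above, a policy
 * normally moves through
 *
 *	NRS_POL_STATE_STOPPED -> NRS_POL_STATE_STARTING -> NRS_POL_STATE_STARTED
 *	NRS_POL_STATE_STARTED -> NRS_POL_STATE_STOPPING -> NRS_POL_STATE_STOPPED
 *
 * with NRS_POL_STATE_UNAVAIL only visited by externally-registered policies
 * before they are made available, and the fallback policy remaining in
 * NRS_POL_STATE_STARTED for its whole lifetime.)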
+ */ +struct ptlrpc_nrs_policy { + /** + * Linkage into the NRS head's list of policies, + * ptlrpc_nrs:nrs_policy_list + */ + cfs_list_t pol_list; + /** + * Linkage into the NRS head's list of policies with enqueued + * requests ptlrpc_nrs:nrs_policy_queued + */ + cfs_list_t pol_list_queued; + /** + * Current state of this policy + */ + enum ptlrpc_nrs_pol_state pol_state; + /** + * Bitmask of nrs_policy_flags + */ + unsigned pol_flags; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pol_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pol_req_started; + /** + * Usage Reference count taken on the policy instance + */ + long pol_ref; + /** + * The NRS head this policy has been created at + */ + struct ptlrpc_nrs *pol_nrs; + /** + * NRS operations for this policy; points to ptlrpc_nrs_pol_desc::pd_ops + */ + struct ptlrpc_nrs_pol_ops *pol_ops; + /** + * Private policy data; varies by policy type + */ + void *pol_private; + /** + * Human-readable policy name; point to ptlrpc_nrs_pol_desc::pd_name + */ + char *pol_name; +}; + +/** + * NRS resource + * + * Resources are embedded into two types of NRS entities: + * - Inside NRS policies, in the policy's private data in + * ptlrpc_nrs_policy::pol_private + * - In objects that act as prime-level scheduling entities in different NRS + * policies; e.g. on a policy that performs round robin or similar order + * scheduling across client NIDs, there would be one NRS resource per unique + * client NID. On a policy which performs round robin scheduling across + * backend filesystem objects, there would be one resource associated with + * each of the backend filesystem objects partaking in the scheduling + * performed by the policy. + * + * NRS resources share a parent-child relationship, in which resources embedded + * in policy instances are the parent entities, with all scheduling entities + * a policy schedules across being the children, thus forming a simple resource + * hierarchy. This hierarchy may be extended with one or more levels in the + * future if the ability to have more than one primary policy is added. + * + * Upon request initialization, references to the then active NRS policies are + * taken and used to later handle the dispatching of the request with one of + * these policies. + * + * \see nrs_resource_get_safe() + * \see ptlrpc_nrs_req_add() + */ +struct ptlrpc_nrs_resource { + /** + * This NRS resource's parent; is NULL for resources embedded in NRS + * policy instances; i.e. those are top-level ones. + */ + struct ptlrpc_nrs_resource *res_parent; + /** + * The policy associated with this resource. + */ + struct ptlrpc_nrs_policy *res_policy; +}; + +enum { + NRS_RES_FALLBACK, + NRS_RES_PRIMARY, + NRS_RES_MAX +}; + +/* \name fifo + * + * FIFO policy + * + * This policy is a logical wrapper around previous, non-NRS functionality. + * It dispatches RPCs in the same order as they arrive from the network. This + * policy is currently used as the fallback policy, and the only enabled policy + * on all NRS heads of all PTLRPC service partitions. + * @{ + */ + +/** + * Private data structure for the FIFO policy + */ +struct nrs_fifo_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource fh_res; + /** + * List of queued requests. + */ + cfs_list_t fh_list; + /** + * For debugging purposes. 
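 *
 * (A minimal sketch of how this counter and fh_list are presumably used on
 * the enqueue path, with "head" and "nrq" standing in for the policy's
 * nrs_fifo_head and an incoming ptlrpc_nrs_request respectively:
 *
 *	nrq->nr_u.fifo.fr_sequence = head->fh_sequence++;
 *	cfs_list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list);
 *
 * so that polling can simply hand back the oldest entry on fh_list, i.e.
 * requests are dispatched in arrival order.)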
+ */ + __u64 fh_sequence; +}; + +struct nrs_fifo_req { + /** request header, must be the first member of structure */ + cfs_list_t fr_list; + __u64 fr_sequence; +}; + +/** @} fifo */ + +/** + * NRS request + * + * Instances of this object exist embedded within ptlrpc_request; the main + * purpose of this object is to hold references to the request's resources + * for the lifetime of the request, and to hold properties that policies use + * use for determining the request's scheduling priority. + * */ +struct ptlrpc_nrs_request { + /** + * The request's resource hierarchy. + */ + struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX]; + /** + * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the + * policy that was used to enqueue the request. + * + * \see nrs_request_enqueue() + */ + unsigned nr_res_idx; + unsigned nr_initialized:1; + unsigned nr_enqueued:1; + unsigned nr_dequeued:1; + unsigned nr_started:1; + unsigned nr_finalized:1; + cfs_binheap_node_t nr_node; + + /** + * Policy-specific fields, used for determining a request's scheduling + * priority, and other supporting functionality. + */ + union { + /** + * Fields for the FIFO policy + */ + struct nrs_fifo_req fifo; + } nr_u; + /** + * Externally-registering policies may want to use this to allocate + * their own request properties. + */ + void *ext; +}; + +/** @} nrs */ + +/** * Basic request prioritization operations structure. * The whole idea is centered around locks and RPCs that might affect locks. * When a lock is contended we try to give priority to RPCs that might lead @@ -686,6 +1427,12 @@ struct ptlrpc_request { /** history sequence # */ __u64 rq_history_seq; + /** \addtogroup nrs + * @{ + */ + /** stub for NRS request */ + struct ptlrpc_nrs_request rq_nrq; + /** @} nrs */ /** the index of service's srv_at_array into which request is linked */ time_t rq_at_index; /** Lock to protect request flags and some other important bits, like @@ -719,7 +1466,10 @@ struct ptlrpc_request { rq_invalid_rqset:1, rq_generation_set:1, /* do not resend request on -EINPROGRESS */ - rq_no_retry_einprogress:1; + rq_no_retry_einprogress:1, + /* allow the req to be sent if the import is in recovery + * status */ + rq_allow_replay:1; unsigned int rq_nr_resend; @@ -924,6 +1674,36 @@ static inline int ptlrpc_req_interpret(const struct lu_env *env, return rc; } +/** \addtogroup nrs + * @{ + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_desc *desc); +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_desc *desc); +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req); +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info); + +/* + * Can the request be moved from the regular NRS head to the high-priority NRS + * head (of the same PTLRPC service partition), if any? + * + * For a reliable result, this should be checked under svcpt->scp_req lock. + */ +static inline bool +ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + + /** + * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the + * request has been enqueued first, and ptlrpc_nrs_request::nr_started + * to make sure it has not been scheduled yet (analogous to previous + * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list). 
+ */ + return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp; +} +/** @} nrs */ + /** * Returns 1 if request buffer at offset \a index was already swabbed */ @@ -1090,7 +1870,7 @@ struct ptlrpc_bulk_page { #define BULK_PUT_SOURCE 3 /** - * Definition of buk descriptor. + * Definition of bulk descriptor. * Bulks are special "Two phase" RPCs where initial request message * is sent first and it is followed bt a transfer (o receiving) of a large * amount of data to be settled into pages referenced from the bulk descriptors. @@ -1100,47 +1880,48 @@ struct ptlrpc_bulk_page { * Another user is readpage for MDT. */ struct ptlrpc_bulk_desc { - /** completed successfully */ - unsigned long bd_success:1; - /** accessible to the network (network io potentially in progress) */ - unsigned long bd_network_rw:1; - /** {put,get}{source,sink} */ - unsigned long bd_type:2; - /** client side */ - unsigned long bd_registered:1; - /** For serialization with callback */ + /** completed with failure */ + unsigned long bd_failure:1; + /** {put,get}{source,sink} */ + unsigned long bd_type:2; + /** client side */ + unsigned long bd_registered:1; + /** For serialization with callback */ spinlock_t bd_lock; - /** Import generation when request for this bulk was sent */ - int bd_import_generation; - /** Server side - export this bulk created for */ - struct obd_export *bd_export; - /** Client side - import this bulk was sent on */ - struct obd_import *bd_import; - /** LNet portal for this bulk */ - __u32 bd_portal; - /** Back pointer to the request */ - struct ptlrpc_request *bd_req; - cfs_waitq_t bd_waitq; /* server side only WQ */ - int bd_iov_count; /* # entries in bd_iov */ - int bd_max_iov; /* allocated size of bd_iov */ - int bd_nob; /* # bytes covered */ - int bd_nob_transferred; /* # bytes GOT/PUT */ - - __u64 bd_last_xid; - - struct ptlrpc_cb_id bd_cbid; /* network callback info */ - lnet_handle_md_t bd_md_h; /* associated MD */ - lnet_nid_t bd_sender; /* stash event::sender */ + /** Import generation when request for this bulk was sent */ + int bd_import_generation; + /** LNet portal for this bulk */ + __u32 bd_portal; + /** Server side - export this bulk created for */ + struct obd_export *bd_export; + /** Client side - import this bulk was sent on */ + struct obd_import *bd_import; + /** Back pointer to the request */ + struct ptlrpc_request *bd_req; + cfs_waitq_t bd_waitq; /* server side only WQ */ + int bd_iov_count; /* # entries in bd_iov */ + int bd_max_iov; /* allocated size of bd_iov */ + int bd_nob; /* # bytes covered */ + int bd_nob_transferred; /* # bytes GOT/PUT */ + + __u64 bd_last_xid; + + struct ptlrpc_cb_id bd_cbid; /* network callback info */ + lnet_nid_t bd_sender; /* stash event::sender */ + int bd_md_count; /* # valid entries in bd_mds */ + int bd_md_max_brw; /* max entries in bd_mds */ + /** array of associated MDs */ + lnet_handle_md_t bd_mds[PTLRPC_BULK_OPS_COUNT]; #if defined(__KERNEL__) - /* - * encrypt iov, size is either 0 or bd_iov_count. - */ - lnet_kiov_t *bd_enc_iov; + /* + * encrypt iov, size is either 0 or bd_iov_count. 
+ */ + lnet_kiov_t *bd_enc_iov; - lnet_kiov_t bd_iov[0]; + lnet_kiov_t bd_iov[0]; #else - lnet_md_iovec_t bd_iov[0]; + lnet_md_iovec_t bd_iov[0]; #endif }; @@ -1460,11 +2241,7 @@ struct ptlrpc_service_part { * sent to this portal */ spinlock_t scp_req_lock __cfs_cacheline_aligned; - /** # reqs in either of the queues below */ - /** reqs waiting for service */ - cfs_list_t scp_req_pending; - /** high priority queue */ - cfs_list_t scp_hreq_pending; + /** # reqs in either of the NRS heads below */ /** # reqs being served */ int scp_nreqs_active; /** # HPreqs being served */ @@ -1472,6 +2249,12 @@ struct ptlrpc_service_part { /** # hp requests handled */ int scp_hreq_count; + /** NRS head for regular requests */ + struct ptlrpc_nrs scp_nrs_reg; + /** NRS head for HP requests; this is only valid for services that can + * handle HP requests */ + struct ptlrpc_nrs *scp_nrs_hp; + /** AT stuff */ /** @{ */ /** @@ -1611,6 +2394,49 @@ enum ptlrpcd_ctl_flags { LIOD_BIND = 1 << 4, }; +/** + * \addtogroup nrs + * @{ + * + * Service compatibility function; policy is compatible with all services. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval true The policy is compatible with the NRS head + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool +nrs_policy_compat_all(struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return true; +} + +/** + * Service compatibility function; policy is compatible with only a specific + * service which is identified by its human-readable name at + * ptlrpc_service::srv_name. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval false The policy is not compatible with the NRS head + * \retval true The policy is compatible with the NRS head + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool +nrs_policy_compat_one(struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + LASSERT(desc->pd_compat_svc_name != NULL); + return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0; +} + +/** @} nrs */ + /* ptlrpc/events.c */ extern lnet_handle_eq_t ptlrpc_eq_h; extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, @@ -1647,7 +2473,8 @@ extern lnet_pid_t ptl_get_pid(void); */ #ifdef HAVE_SERVER_SUPPORT struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, - int npages, int type, int portal); + unsigned npages, unsigned max_brw, + unsigned type, unsigned portal); int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc); void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc); @@ -1658,7 +2485,7 @@ static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc) LASSERT(desc != NULL); spin_lock(&desc->bd_lock); - rc = desc->bd_network_rw; + rc = desc->bd_md_count; spin_unlock(&desc->bd_lock); return rc; } @@ -1669,10 +2496,11 @@ int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) { - struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct ptlrpc_bulk_desc *desc; int rc; LASSERT(req != NULL); + desc = req->rq_bulk; if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && req->rq_bulk_deadline > cfs_time_current_sec()) @@ -1682,7 +2510,7 @@ static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) return 0; spin_lock(&desc->bd_lock); - rc = desc->bd_network_rw; + rc = desc->bd_md_count; spin_unlock(&desc->bd_lock); 
return rc; } @@ -1767,7 +2595,8 @@ void ptlrpc_req_finished(struct ptlrpc_request *request); void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, - int npages, int type, int portal); + unsigned npages, unsigned max_brw, + unsigned type, unsigned portal); void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin); static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk) { @@ -1807,7 +2636,7 @@ int ptlrpcd_queue_work(void *handler); /** @} */ struct ptlrpc_service_buf_conf { - /* nbufs is how many buffers to post */ + /* nbufs is buffers # to allocate when growing the pool */ unsigned int bc_nbufs; /* buffer size to post */ unsigned int bc_buf_size; @@ -1890,7 +2719,6 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service); int liblustre_check_services(void *arg); void ptlrpc_daemonize(char *name); int ptlrpc_service_health_check(struct ptlrpc_service *); -void ptlrpc_hpreq_reorder(struct ptlrpc_request *req); void ptlrpc_server_drop_request(struct ptlrpc_request *req); #ifdef __KERNEL__ @@ -2213,6 +3041,7 @@ int server_disconnect_export(struct obd_export *exp); * Pinger API (client side only) * @{ */ +extern int suppress_pings; enum timeout_event { TIMEOUT_GRANT = 1 }; @@ -2272,6 +3101,7 @@ typedef enum { /* ptlrpc/ptlrpcd.c */ void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); +void ptlrpcd_free(struct ptlrpcd_ctl *pc); void ptlrpcd_wake(struct ptlrpc_request *req); void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx); void ptlrpcd_add_rqset(struct ptlrpc_request_set *set);
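
For reference, the new bulk I/O maxima introduced at the top of this patch work out as follows under the usual assumption that LNET_MTU_BITS is 20 (a 1 MB LNet MTU, as the ONE_MB_BRW_SIZE name suggests) and that CFS_PAGE_SHIFT is 12 (4 KB pages):

	PTLRPC_BULK_OPS_COUNT = 1U << 2    = 4 bulk transfers per BRW request
	PTLRPC_MAX_BRW_BITS   = 20 + 2     = 22
	PTLRPC_MAX_BRW_SIZE   = 1 << 22    = 4 MB (PTLRPC_BULK_OPS_COUNT * LNET_MTU)
	PTLRPC_MAX_BRW_PAGES  = 4 MB >> 12 = 1024 pages

so peers that both understand the new protocol can negotiate an ocd_brw_size of up to 4 MB, while the per-transfer MD size (MD_MAX_BRW_SIZE) stays at the 1 MB LNET_MTU limit that applied to whole requests before this change.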