X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Finclude%2Flustre_net.h;h=fb492883503b00d2a768daf8ef5e446427ee59d5;hp=4d8341d1d32a912108e8e9ecd6150010c8e68c55;hb=be025f5580a0cc4958267d2e4317aac4e2ebc0c3;hpb=020924966d4acd98e96ede094d72a49b762233db

diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index 4d8341d..fb49288 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -106,9 +106,12 @@
 # endif
 #endif /* __KERNEL__ */
 
+#define PTLRPC_NTHRS_INIT	2
+
 /**
- * The following constants determine how memory is used to buffer incoming
- * service requests.
+ * Buffer Constants
+ *
+ * Constants determine how memory is used to buffer incoming service requests.
  *
  * ?_NBUFS              # buffers to allocate when growing the pool
  * ?_BUFSIZE            # bytes in a single request buffer
@@ -120,21 +123,170 @@
  * Messages larger than ?_MAXREQSIZE are dropped.  Request buffers are
  * considered full when less than ?_MAXREQSIZE is left in them.
  */
-#define LDLM_THREADS_AUTO_MIN (2)
-#define LDLM_THREADS_AUTO_MAX min_t(unsigned, cfs_num_online_cpus() * \
-                                  cfs_num_online_cpus() * 32, 128)
-#define LDLM_BL_THREADS  LDLM_THREADS_AUTO_MIN
+/**
+ * Thread Constants
+ *
+ * Constants determine how threads are created for ptlrpc service.
+ *
+ * ?_NTHRS_INIT	        # threads to create for each service partition on
+ *			  initializing. If it's non-affinity service and
+ *			  there is only one partition, it's the overall #
+ *			  threads for the service while initializing.
+ * ?_NTHRS_BASE		# threads should be created at least for each
+ *			  ptlrpc partition to keep the service healthy.
+ *			  It's the low-water mark of threads upper-limit
+ *			  for each partition.
+ * ?_THR_FACTOR         # threads can be added on threads upper-limit for
+ *			  each CPU core. This factor is only for reference,
+ *			  we might decrease value of factor if number of cores
+ *			  per CPT is above a limit.
+ * ?_NTHRS_MAX		# overall threads can be created for a service,
+ *			  it's a soft limit because if service is running
+ *			  on machine with hundreds of cores and tens of
+ *			  CPU partitions, we need to guarantee each partition
+ *			  has ?_NTHRS_BASE threads, which means total threads
+ *			  will be ?_NTHRS_BASE * number_of_cpts which can
+ *			  exceed ?_NTHRS_MAX.
+ *
+ * Examples
+ *
+ * #define MDS_NTHRS_INIT	2
+ * #define MDS_NTHRS_BASE	64
+ * #define MDS_THR_FACTOR	8
+ * #define MDS_NTHRS_MAX	1024
+ *
+ * Example 1):
+ * ---------------------------------------------------------------------
+ * Server(A) has 16 cores, user configured it to 4 partitions so each
+ * partition has 4 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(4) * MDS_THR_FACTOR(8) = 96
+ *
+ * Total number of threads for the service is:
+ *     96 * partitions(4) = 384
+ *
+ * Example 2):
+ * ---------------------------------------------------------------------
+ * Server(B) has 32 cores, user configured it to 4 partitions so each
+ * partition has 8 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(8) * MDS_THR_FACTOR(8) = 128
+ *
+ * Total number of threads for the service is:
+ *     128 * partitions(4) = 512
+ *
+ * Example 3):
+ * ---------------------------------------------------------------------
+ * Server(B) has 96 cores, user configured it to 8 partitions so each
+ * partition has 12 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(12) * MDS_THR_FACTOR(8) = 160
+ *
+ * Total number of threads for the service is:
+ *     160 * partitions(8) = 1280
+ *
+ * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number
+ * as upper limit of threads number for each partition:
+ *     MDS_NTHRS_MAX(1024) / partitions(8) = 128
+ *
+ * Example 4):
+ * ---------------------------------------------------------------------
+ * Server(C) has a thousand cores and user configured it to 32 partitions:
+ *     MDS_NTHRS_BASE(64) * 32 = 2048
+ *
+ * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need
+ * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads
+ * to keep service healthy, so total number of threads will just be 2048.
+ *
+ * NB: we don't suggest to choose server with that many cores because backend
+ *     filesystem itself, buffer cache, or underlying network stack might
+ *     have some SMP scalability issues at that large scale.
+ *
+ *     If user already has a fat machine with hundreds or thousands of cores,
+ *     there are two choices for configuration:
+ *     a) create CPU table from subset of all CPUs and run Lustre on
+ *        top of this subset
+ *     b) bind service threads on a few partitions, see modparameters of
+ *        MDS and OSS for details
+ *
+ * NB: these calculations (and examples below) are simplified to help
+ *     understanding, the real implementation is a little more complex,
+ *     please see ptlrpc_server_nthreads_check() for details.
+ *
+ */
+
+ /*
+  * LDLM threads constants:
+  *
+  * Given 8 as factor and 24 as base threads number
+  *
+  * example 1)
+  * On 4-core machine we will have 24 + 8 * 4 = 56 threads.
+  *
+  * example 2)
+  * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56
+  * threads for each partition and total threads number will be 112.
+  *
+  * example 3)
+  * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24)
+  * threads for each partition to keep service healthy, so total threads
+  * number should be 24 * 8 = 192.
+  *
+  * So with these constants, the number of threads will be at a similar
+  * level to old versions, unless the target machine has over a hundred cores.
+  */
+#define LDLM_THR_FACTOR		8
+#define LDLM_NTHRS_INIT		PTLRPC_NTHRS_INIT
+#define LDLM_NTHRS_BASE		24
+#define LDLM_NTHRS_MAX		(cfs_num_online_cpus() == 1 ? 64 : 128)
+
+#define LDLM_BL_THREADS  LDLM_NTHRS_INIT	/* was LDLM_THREADS_AUTO_MIN (2) */
 #define LDLM_NBUFS      (64 * cfs_num_online_cpus())
 #define LDLM_BUFSIZE    (8 * 1024)
 #define LDLM_MAXREQSIZE (5 * 1024)
 #define LDLM_MAXREPSIZE (1024)
 
-/** Absolute limits */
-#define MDT_MIN_THREADS 2UL
-#ifndef MDT_MAX_THREADS
-#define MDT_MAX_THREADS 512UL
+ /*
+  * MDS threads constants:
+  *
+  * Please see examples in "Thread Constants", MDS threads number will be at
+  * the comparable level of old versions, unless the server has many cores.
+  */
+#ifndef MDS_MAX_THREADS
+#define MDS_MAX_THREADS		1024
+#define MDS_MAX_OTHR_THREADS	256
+
+#else /* MDS_MAX_THREADS */
+#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT
+#undef MDS_MAX_THREADS
+#define MDS_MAX_THREADS	PTLRPC_NTHRS_INIT
 #endif
-#define MDS_NBUFS       (64 * cfs_num_online_cpus())
+#define MDS_MAX_OTHR_THREADS	max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2)
+#endif
+
+/* default service */
+#define MDS_THR_FACTOR		8
+#define MDS_NTHRS_INIT		PTLRPC_NTHRS_INIT
+#define MDS_NTHRS_MAX		MDS_MAX_THREADS
+#define MDS_NTHRS_BASE		min(64, MDS_NTHRS_MAX)
+
+/* read-page service */
+#define MDS_RDPG_THR_FACTOR	4
+#define MDS_RDPG_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_RDPG_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+#define MDS_RDPG_NTHRS_BASE	min(48, MDS_RDPG_NTHRS_MAX)
+
+/* these should be removed when we remove setattr service in the future */
+#define MDS_SETA_THR_FACTOR	4
+#define MDS_SETA_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_SETA_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+#define MDS_SETA_NTHRS_BASE	min(48, MDS_SETA_NTHRS_MAX)
+
+/* non-affinity threads */
+#define MDS_OTHR_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_OTHR_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+
+#define MDS_NBUFS		(64 * cfs_num_online_cpus())
 /**
  * Assume file name length = FNAME_MAX = 256 (true for ext3).
  *        path name length = PATH_MAX = 4096
@@ -174,16 +326,51 @@
 #define SEQ_MAXREPSIZE  (152)
 
 /** MGS threads must be >= 3, see bug 22458 comment #28 */
-#define MGS_THREADS_AUTO_MIN 3
-#define MGS_THREADS_AUTO_MAX 32
+#define MGS_NTHRS_INIT	(PTLRPC_NTHRS_INIT + 1)
+#define MGS_NTHRS_MAX	32
+
 #define MGS_NBUFS       (64 * cfs_num_online_cpus())
 #define MGS_BUFSIZE     (8 * 1024)
 #define MGS_MAXREQSIZE  (7 * 1024)
 #define MGS_MAXREPSIZE  (9 * 1024)
 
-/** Absolute OSS limits */
-#define OSS_THREADS_MIN 3       /* difficult replies, HPQ, others */
-#define OSS_THREADS_MAX 512
+ /*
+  * OSS threads constants:
+  *
+  * Given 8 as factor and 64 as base threads number
+  *
+  * example 1):
+  * On 8-core server configured to 2 partitions, we will have
+  * 64 + 8 * 4 = 96 threads for each partition, 192 total threads.
+  *
+  * example 2):
+  * On 32-core machine configured to 4 partitions, we will have
+  * 64 + 8 * 8 = 128 threads for each partition, so total threads number
+  * will be 128 * 4 = 512.
+  *
+  * example 3):
+  * On 64-core machine configured to 4 partitions, we will have
+  * 64 + 16 * 8 = 192 threads for each partition, so total threads number
+  * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we
+  * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads
+  * for each partition.
+  *
+  * So we can see that with these constants, threads number will be at a
+  * similar level to old versions, unless the server has many cores.
+  */
+ /* reduce the threads factor for VMs with small memory size */
+#define OSS_THR_FACTOR		min_t(int, 8, \
+				CFS_NUM_CACHEPAGES >> (28 - CFS_PAGE_SHIFT))
+#define OSS_NTHRS_INIT		(PTLRPC_NTHRS_INIT + 1)
+#define OSS_NTHRS_BASE		64
+#define OSS_NTHRS_MAX		512
+
+/* threads for handling "create" request */
+#define OSS_CR_THR_FACTOR	1
+#define OSS_CR_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define OSS_CR_NTHRS_BASE	8
+#define OSS_CR_NTHRS_MAX	64
+
 #define OST_NBUFS       (64 * cfs_num_online_cpus())
 #define OST_BUFSIZE     (8 * 1024)
 
@@ -242,7 +429,7 @@ union ptlrpc_async_args {
          * least big enough for that.
          */
         void      *pointer_arg[11];
-        __u64      space[6];
+	__u64      space[7];
 };
 
 struct ptlrpc_request_set;
@@ -281,17 +468,17 @@ struct ptlrpc_request_set {
 	set_interpreter_func  set_interpret;
 	/** opaq argument passed to completion \a set_interpret callback. */
 	void                 *set_arg;
-	/** rq_status of requests that have been freed already */
-	int                   set_rc;
 	/**
 	 * Lock for \a set_new_requests manipulations
 	 * locked so that any old caller can communicate requests to
 	 * the set holder who can then fold them into the lock-free set
 	 */
-	cfs_spinlock_t        set_new_req_lock;
+	spinlock_t		set_new_req_lock;
 	/** List of new yet unsent requests. Only used with ptlrpcd now. */
 	cfs_list_t            set_new_requests;
 
+	/** rq_status of requests that have been freed already */
+	int                   set_rc;
 	/** Additional fields used by the flow control extension */
 	/** Maximum number of RPCs in flight */
 	int                   set_max_inflight;
@@ -314,6 +501,7 @@ struct ptlrpc_set_cbdata {
 };
 
 struct ptlrpc_bulk_desc;
+struct ptlrpc_service_part;
 
 /**
  * ptlrpc callback & work item stuff
@@ -347,7 +535,7 @@ struct ptlrpc_reply_state {
         cfs_list_t             rs_debug_list;
 #endif
         /** A spinlock to protect the reply state flags */
-        cfs_spinlock_t         rs_lock;
+	spinlock_t		rs_lock;
         /** Reply state flags */
         unsigned long          rs_difficult:1;     /* ACK/commit stuff */
         unsigned long          rs_no_ack:1;    /* no ACK, even for
@@ -369,7 +557,7 @@ struct ptlrpc_reply_state {
         /** xid */
         __u64                  rs_xid;
         struct obd_export     *rs_export;
-        struct ptlrpc_service *rs_service;
+	struct ptlrpc_service_part *rs_svcpt;
         /** Lnet metadata handle for the reply */
         lnet_handle_md_t       rs_md_h;
         cfs_atomic_t           rs_refcount;
@@ -422,8 +610,8 @@ typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env,
  * any allocations (to avoid e.g. OOM).
  */
 struct ptlrpc_request_pool {
-        /** Locks the list */
-        cfs_spinlock_t prp_lock;
+	/** Locks the list */
+	spinlock_t prp_lock;
         /** list of ptlrpc_request structs */
         cfs_list_t prp_req_list;
         /** Maximum message size that would fit into a rquest from this pool */
@@ -469,8 +657,10 @@ struct ptlrpc_hpreq_ops {
  * in Lustre.
  */
 struct ptlrpc_request {
-        /* Request type: one of PTL_RPC_MSG_* */
-        int rq_type;
+	/* Request type: one of PTL_RPC_MSG_* */
+	int rq_type;
+	/** Result of request processing */
+	int rq_status;
         /**
          * Linkage item through which this request is included into
          * sending/delayed lists on client and into rqbd list on server
@@ -490,18 +680,20 @@ struct ptlrpc_request {
         cfs_list_t rq_exp_list;
         /** server-side hp handlers */
         struct ptlrpc_hpreq_ops *rq_ops;
+
+	/** initial thread servicing this request */
+	struct ptlrpc_thread *rq_svc_thread;
+
         /** history sequence # */
         __u64 rq_history_seq;
         /** the index of service's srv_at_array into which request is linked */
         time_t rq_at_index;
-        /** Result of request processing */
-        int rq_status;
         /** Lock to protect request flags and some other important bits, like
          * rq_list
          */
-        cfs_spinlock_t rq_lock;
-        /** client-side flags are serialized by rq_lock */
-        unsigned long rq_intr:1, rq_replied:1, rq_err:1,
+	spinlock_t rq_lock;
+	/** client-side flags are serialized by rq_lock */
+	unsigned int rq_intr:1, rq_replied:1, rq_err:1,
                 rq_timedout:1, rq_resend:1, rq_restart:1,
                 /**
                  * when ->rq_replay is set, request is kept by the client even
@@ -516,7 +708,6 @@ struct ptlrpc_request {
                 rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
                 rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1,
                 rq_early:1, rq_must_unlink:1,
-                rq_fake:1,          /* this fake req */
                 rq_memalloc:1,      /* req originated from "kswapd" */
                 /* server-side flags */
                 rq_packed_final:1,  /* packed final reply */
@@ -526,20 +717,21 @@ struct ptlrpc_request {
                 rq_committed:1,
                 /* whether the "rq_set" is a valid one */
                 rq_invalid_rqset:1,
-                rq_generation_set:1;
+		rq_generation_set:1,
+		/* do not resend request on -EINPROGRESS */
+		rq_no_retry_einprogress:1;
+
+	unsigned int rq_nr_resend;
 
         enum rq_phase rq_phase; /* one of RQ_PHASE_* */
         enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */
         cfs_atomic_t rq_refcount;/* client-side refcount for SENT race,
                                     server-side refcounf for multiple replies */
 
-        /** initial thread servicing this request */
-        struct ptlrpc_thread *rq_svc_thread;
-
-        /** Portal to which this request would be sent */
-        int rq_request_portal;  /* XXX FIXME bug 249 */
-        /** Portal where to wait for reply and where reply would be sent */
-        int rq_reply_portal;    /* XXX FIXME bug 249 */
+	/** Portal to which this request would be sent */
+	short rq_request_portal;  /* XXX FIXME bug 249 */
+	/** Portal where to wait for reply and where reply would be sent */
+	short rq_reply_portal;    /* XXX FIXME bug 249 */
 
         /**
          * client-side:
@@ -549,11 +741,10 @@ struct ptlrpc_request {
         int rq_nob_received;
         /** Request length */
         int rq_reqlen;
-         /** Request message - what client sent */
-        struct lustre_msg *rq_reqmsg;
-
         /** Reply length */
         int rq_replen;
+	/** Request message - what client sent */
+	struct lustre_msg *rq_reqmsg;
         /** Reply message - server response */
         struct lustre_msg *rq_repmsg;
         /** Transaction number */
@@ -577,7 +768,8 @@ struct ptlrpc_request {
         struct sptlrpc_flavor    rq_flvr;        /**< for client & server */
         enum lustre_sec_part     rq_sp_from;
 
-        unsigned long            /* client/server security flags */
+	/* client/server security flags */
+	unsigned int
                                  rq_ctx_init:1,      /* context initiation */
                                  rq_ctx_fini:1,      /* context destroy */
                                  rq_bulk_read:1,     /* request bulk read */
@@ -601,21 +793,21 @@ struct ptlrpc_request {
         /* (server side), pointed directly into req buffer */
         struct ptlrpc_user_desc *rq_user_desc;
 
-        /** early replies go to offset 0, regular replies go after that */
-        unsigned int             rq_reply_off;
-
         /* various buffer pointers */
         struct lustre_msg       *rq_reqbuf;      /* req wrapper */
+	char                    *rq_repbuf;      /* rep buffer */
+	struct lustre_msg       *rq_repdata;     /* rep wrapper msg */
+	struct lustre_msg       *rq_clrbuf;      /* only in priv mode */
         int                      rq_reqbuf_len;  /* req wrapper buf len */
         int                      rq_reqdata_len; /* req wrapper msg len */
-        char                    *rq_repbuf;      /* rep buffer */
         int                      rq_repbuf_len;  /* rep buffer len */
-        struct lustre_msg       *rq_repdata;     /* rep wrapper msg */
         int                      rq_repdata_len; /* rep wrapper msg len */
-        struct lustre_msg       *rq_clrbuf;      /* only in priv mode */
         int                      rq_clrbuf_len;  /* only in priv mode */
         int                      rq_clrdata_len; /* only in priv mode */
 
+	/** early replies go to offset 0, regular replies go after that */
+	unsigned int             rq_reply_off;
+
         /** @} */
 
         /** Fields that help to see if request and reply were swabbed or not */
@@ -644,9 +836,6 @@ struct ptlrpc_request {
         struct ptlrpc_reply_state *rq_reply_state;
         /** incoming request buffer */
         struct ptlrpc_request_buffer_desc *rq_rqbd;
-#ifdef CRAY_XT3
-        __u32                rq_uid;            /* peer uid, used in MDS only */
-#endif
 
         /** client-only incoming reply */
         lnet_handle_md_t     rq_reply_md_h;
@@ -698,10 +887,10 @@ struct ptlrpc_request {
         int    rq_timeout;
 
         /** Multi-rpc bits */
-        /** Link item for request set lists */
-        cfs_list_t  rq_set_chain;
         /** Per-request waitq introduced by bug 21938 for recovery waiting */
         cfs_waitq_t rq_set_waitq;
+	/** Link item for request set lists */
+	cfs_list_t  rq_set_chain;
         /** Link back to the request set */
         struct ptlrpc_request_set *rq_set;
         /** Async completion handler, called when reply is received */
@@ -920,7 +1109,7 @@ struct ptlrpc_bulk_desc {
         /** client side */
         unsigned long bd_registered:1;
         /** For serialization with callback */
-        cfs_spinlock_t bd_lock;
+	spinlock_t bd_lock;
         /** Import generation when request for this bulk was sent */
         int bd_import_generation;
         /** Server side - export this bulk created for */
@@ -964,6 +1153,7 @@ enum {
         SVC_SIGNAL      = 1 << 5,
 };
 
+#define PTLRPC_THR_NAME_LEN		32
 /**
  * Definition of server service thread structure
  */
@@ -992,9 +1182,10 @@ struct ptlrpc_thread {
         /**
          * the svc this thread belonged to b=18582
          */
-        struct ptlrpc_service *t_svc;
-        cfs_waitq_t t_ctl_waitq;
-        struct lu_env *t_env;
+	struct ptlrpc_service_part	*t_svcpt;
+	cfs_waitq_t			t_ctl_waitq;
+	struct lu_env			*t_env;
+	char				t_name[PTLRPC_THR_NAME_LEN];
 };
 
 static inline int thread_is_init(struct ptlrpc_thread *thread)
@@ -1070,7 +1261,7 @@ struct ptlrpc_request_buffer_desc {
         /** History of requests for this buffer */
         cfs_list_t             rqbd_reqs;
         /** Back pointer to service for which this buffer is registered */
-        struct ptlrpc_service *rqbd_service;
+	struct ptlrpc_service_part *rqbd_svcpt;
         /** LNet descriptor */
         lnet_handle_md_t       rqbd_md_h;
         int                    rqbd_refcount;
@@ -1128,21 +1319,10 @@ struct ptlrpc_service_ops {
  * The service is listening on a particular portal (like tcp port)
  * and perform actions for a specific server like IO service for OST
  * or general metadata service for MDS.
- *
- * ptlrpc service has four locks:
- * \a srv_lock
- *    serialize operations on rqbd and requests waiting for preprocess
- * \a srv_rq_lock
- *    serialize operations active requests sent to this portal
- * \a srv_at_lock
- *    serialize adaptive timeout stuff
- * \a srv_rs_lock
- *    serialize operations on RS list (reply states)
- *
- * We don't have any use-case to take two or more locks at the same time
- * for now, so there is no lock order issue.
  */
 struct ptlrpc_service {
+	/** serialize /proc operations */
+	spinlock_t			srv_lock;
         /** most often accessed fields */
         /** chain thru all services */
         cfs_list_t                      srv_list;
@@ -1154,17 +1334,10 @@ struct ptlrpc_service {
         char                           *srv_thread_name;
         /** service thread list */
         cfs_list_t                      srv_threads;
-        /** threads to start at beginning of service */
-        int                             srv_threads_min;
-        /** thread upper limit */
-        int                             srv_threads_max;
-        /** always increasing number */
-        unsigned                        srv_threads_next_id;
-        /** # of starting threads */
-        int                             srv_threads_starting;
-        /** # running threads */
-        int                             srv_threads_running;
-
+	/** threads # should be created for each partition on initializing */
+	int				srv_nthrs_cpt_init;
+	/** limit of threads number for each partition */
+	int				srv_nthrs_cpt_limit;
         /** Root of /proc dir tree for this service */
         cfs_proc_dir_entry_t           *srv_procroot;
         /** Pointer to statistic data for this service */
@@ -1190,124 +1363,179 @@ struct ptlrpc_service {
         __u32                           srv_ctx_tags;
         /** soft watchdog timeout multiplier */
         int                             srv_watchdog_factor;
-        /** bind threads to CPUs */
-        unsigned                        srv_cpu_affinity:1;
         /** under unregister_service */
         unsigned                        srv_is_stopping:1;
 
-        /**
-         * serialize the following fields, used for protecting
-         * rqbd list and incoming requests waiting for preprocess
-         */
-        cfs_spinlock_t                  srv_lock  __cfs_cacheline_aligned;
-        /** incoming reqs */
-        cfs_list_t                      srv_req_in_queue;
-        /** total # req buffer descs allocated */
-        int                             srv_nbufs;
-        /** # posted request buffers */
-        int                             srv_nrqbd_receiving;
-        /** timeout before re-posting reqs, in tick */
-        cfs_duration_t                  srv_rqbd_timeout;
-        /** request buffers to be reposted */
-        cfs_list_t                      srv_idle_rqbds;
-        /** req buffers receiving */
-        cfs_list_t                      srv_active_rqbds;
-        /** request buffer history */
-        cfs_list_t                      srv_history_rqbds;
-        /** # request buffers in history */
-        int                             srv_n_history_rqbds;
-        /** max # request buffers in history */
-        int                             srv_max_history_rqbds;
-        /** request history */
-        cfs_list_t                      srv_request_history;
-        /** next request sequence # */
-        __u64                           srv_request_seq;
-        /** highest seq culled from history */
-        __u64                           srv_request_max_cull_seq;
-        /**
-         * all threads sleep on this. This wait-queue is signalled when new
-         * incoming request arrives and when difficult reply has to be handled.
-         */
-        cfs_waitq_t                     srv_waitq;
+	/** max # request buffers in history per partition */
+	int				srv_hist_nrqbds_cpt_max;
+	/** number of CPTs this service bound on */
+	int				srv_ncpts;
+	/** CPTs array this service bound on */
+	__u32				*srv_cpts;
+	/** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */
+	int				srv_cpt_bits;
+	/** CPT table this service is running over */
+	struct cfs_cpt_table		*srv_cptable;
+	/**
+	 * partition data for ptlrpc service
+	 */
+	struct ptlrpc_service_part	*srv_parts[0];
+};
 
-        /**
-         * serialize the following fields, used for processing requests
-         * sent to this portal
-         */
-        cfs_spinlock_t                  srv_rq_lock __cfs_cacheline_aligned;
-        /** # reqs in either of the queues below */
-        /** reqs waiting for service */
-        cfs_list_t                      srv_request_queue;
-        /** high priority queue */
-        cfs_list_t                      srv_request_hpq;
-        /** # incoming reqs */
-        int                             srv_n_queued_reqs;
-        /** # reqs being served */
-        int                             srv_n_active_reqs;
-        /** # HPreqs being served */
-        int                             srv_n_active_hpreq;
-        /** # hp requests handled */
-        int                             srv_hpreq_count;
-
-        /** AT stuff */
-        /** @{ */
-        /**
-         * serialize the following fields, used for changes on
-         * adaptive timeout
-         */
-        cfs_spinlock_t                  srv_at_lock __cfs_cacheline_aligned;
-        /** estimated rpc service time */
-        struct adaptive_timeout         srv_at_estimate;
-        /** reqs waiting for replies */
-        struct ptlrpc_at_array          srv_at_array;
-        /** early reply timer */
-        cfs_timer_t                     srv_at_timer;
-        /** check early replies */
-        unsigned                        srv_at_check;
-        /** debug */
-        cfs_time_t                      srv_at_checktime;
-        /** @} */
+/**
+ * Definition of PortalRPC service partition data.
+ * Although a service only has one instance of it right now, we
+ * will have multiple instances very soon (one instance per CPT).
+ *
+ * it has four locks:
+ * \a scp_lock
+ *    serialize operations on rqbd and requests waiting for preprocess
+ * \a scp_req_lock
+ *    serialize operations on active requests sent to this portal
+ * \a scp_at_lock
+ *    serialize adaptive timeout stuff
+ * \a scp_rep_lock
+ *    serialize operations on RS list (reply states)
+ *
+ * We don't have any use-case to take two or more locks at the same time
+ * for now, so there is no lock order issue.
+ */
+struct ptlrpc_service_part {
+	/** back reference to owner */
+	struct ptlrpc_service		*scp_service __cfs_cacheline_aligned;
+	/* CPT id, reserved */
+	int				scp_cpt;
+	/** always increasing number */
+	int				scp_thr_nextid;
+	/** # of starting threads */
+	int				scp_nthrs_starting;
+	/** # of stopping threads, reserved for shrinking threads */
+	int				scp_nthrs_stopping;
+	/** # running threads */
+	int				scp_nthrs_running;
+	/** service threads list */
+	cfs_list_t			scp_threads;
 
-        /**
-         * serialize the following fields, used for processing
-         * replies for this portal
-         */
-        cfs_spinlock_t                  srv_rs_lock __cfs_cacheline_aligned;
-        /** all the active replies */
-        cfs_list_t                      srv_active_replies;
+	/**
+	 * serialize the following fields, used for protecting
+	 * rqbd list and incoming requests waiting for preprocess,
+	 * threads starting & stopping are also protected by this lock.
+	 */
+	spinlock_t			scp_lock  __cfs_cacheline_aligned;
+	/** total # req buffer descs allocated */
+	int				scp_nrqbds_total;
+	/** # posted request buffers for receiving */
+	int				scp_nrqbds_posted;
+	/** in progress of allocating rqbd */
+	int				scp_rqbd_allocating;
+	/** # incoming reqs */
+	int				scp_nreqs_incoming;
+	/** request buffers to be reposted */
+	cfs_list_t			scp_rqbd_idle;
+	/** req buffers receiving */
+	cfs_list_t			scp_rqbd_posted;
+	/** incoming reqs */
+	cfs_list_t			scp_req_incoming;
+	/** timeout before re-posting reqs, in tick */
+	cfs_duration_t			scp_rqbd_timeout;
+	/**
+	 * all threads sleep on this. This wait-queue is signalled when new
+	 * incoming request arrives and when difficult reply has to be handled.
+	 */
+	cfs_waitq_t			scp_waitq;
+
+	/** request history */
+	cfs_list_t			scp_hist_reqs;
+	/** request buffer history */
+	cfs_list_t			scp_hist_rqbds;
+	/** # request buffers in history */
+	int				scp_hist_nrqbds;
+	/** sequence number for request */
+	__u64				scp_hist_seq;
+	/** highest seq culled from history */
+	__u64				scp_hist_seq_culled;
+
+	/**
+	 * serialize the following fields, used for processing requests
+	 * sent to this portal
+	 */
+	spinlock_t			scp_req_lock __cfs_cacheline_aligned;
+	/** # reqs in either of the queues below */
+	/** reqs waiting for service */
+	cfs_list_t			scp_req_pending;
+	/** high priority queue */
+	cfs_list_t			scp_hreq_pending;
+	/** # reqs being served */
+	int				scp_nreqs_active;
+	/** # HPreqs being served */
+	int				scp_nhreqs_active;
+	/** # hp requests handled */
+	int				scp_hreq_count;
+
+	/** AT stuff */
+	/** @{ */
+	/**
+	 * serialize the following fields, used for changes on
+	 * adaptive timeout
+	 */
+	spinlock_t			scp_at_lock __cfs_cacheline_aligned;
+	/** estimated rpc service time */
+	struct adaptive_timeout		scp_at_estimate;
+	/** reqs waiting for replies */
+	struct ptlrpc_at_array		scp_at_array;
+	/** early reply timer */
+	cfs_timer_t			scp_at_timer;
+	/** debug */
+	cfs_time_t			scp_at_checktime;
+	/** check early replies */
+	unsigned			scp_at_check;
+	/** @} */
+
+	/**
+	 * serialize the following fields, used for processing
+	 * replies for this portal
+	 */
+	spinlock_t			scp_rep_lock __cfs_cacheline_aligned;
+	/** all the active replies */
+	cfs_list_t			scp_rep_active;
 #ifndef __KERNEL__
-        /** replies waiting for service */
-        cfs_list_t                      srv_reply_queue;
+	/** replies waiting for service */
+	cfs_list_t			scp_rep_queue;
 #endif
-        /** List of free reply_states */
-        cfs_list_t                      srv_free_rs_list;
-        /** waitq to run, when adding stuff to srv_free_rs_list */
-        cfs_waitq_t                     srv_free_rs_waitq;
-        /** # 'difficult' replies */
-        cfs_atomic_t                    srv_n_difficult_replies;
-        //struct ptlrpc_srv_ni srv_interfaces[0];
+	/** List of free reply_states */
+	cfs_list_t			scp_rep_idle;
+	/** waitq to run, when adding stuff to scp_rep_idle */
+	cfs_waitq_t			scp_rep_waitq;
+	/** # 'difficult' replies */
+	cfs_atomic_t			scp_nreps_difficult;
 };
 
+/** Iterate \a part over the partitions of \a svc; stop at the first NULL. */
+#define ptlrpc_service_for_each_part(part, i, svc)			\
+	for ((i) = 0;							\
+	     (i) < (svc)->srv_ncpts &&					\
+	     ((part) = (svc)->srv_parts[(i)]) != NULL; (i)++)
+
 /**
  * Declaration of ptlrpcd control structure
  */
 struct ptlrpcd_ctl {
-        /**
-         * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE)
-         */
-        unsigned long               pc_flags;
-        /**
-         * Thread lock protecting structure fields.
-         */
-        cfs_spinlock_t              pc_lock;
-        /**
-         * Start completion.
-         */
-        cfs_completion_t            pc_starting;
-        /**
-         * Stop completion.
-         */
-        cfs_completion_t            pc_finishing;
+	/**
+	 * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE)
+	 */
+	unsigned long			pc_flags;
+	/**
+	 * Thread lock protecting structure fields.
+	 */
+	spinlock_t			pc_lock;
+	/**
+	 * Start completion.
+	 */
+	struct completion		pc_starting;
+	/**
+	 * Stop completion.
+	 */
+	struct completion		pc_finishing;
         /**
          * Thread requests set.
          */
@@ -1425,14 +1653,14 @@ void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc);
 
 static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc)
 {
-        int rc;
+	int rc;
 
-        LASSERT(desc != NULL);
+	LASSERT(desc != NULL);
 
-        cfs_spin_lock(&desc->bd_lock);
-        rc = desc->bd_network_rw;
-        cfs_spin_unlock(&desc->bd_lock);
-        return rc;
+	spin_lock(&desc->bd_lock);
+	rc = desc->bd_network_rw;
+	spin_unlock(&desc->bd_lock);
+	return rc;
 }
 #endif
 
@@ -1453,10 +1681,10 @@ static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req)
         if (!desc)
                 return 0;
 
-        cfs_spin_lock(&desc->bd_lock);
-        rc = desc->bd_network_rw;
-        cfs_spin_unlock(&desc->bd_lock);
-        return rc;
+	spin_lock(&desc->bd_lock);
+	rc = desc->bd_network_rw;
+	spin_unlock(&desc->bd_lock);
+	return rc;
 }
 
 #define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
@@ -1528,11 +1756,6 @@ struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
 int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
                              __u32 version, int opcode, char **bufs,
                              struct ptlrpc_cli_ctx *ctx);
-struct ptlrpc_request *ptlrpc_prep_fakereq(struct obd_import *imp,
-                                           unsigned int timeout,
-                                           ptlrpc_interpterer_t interpreter);
-void ptlrpc_fakereq_finished(struct ptlrpc_request *req);
-
 struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
                                        int opcode, int count, __u32 *lengths,
                                        char **bufs);
@@ -1545,9 +1768,31 @@ void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
 struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
                                               int npages, int type, int portal);
-void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk);
-void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
-                           cfs_page_t *page, int pageoffset, int len);
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin);
+static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk)
+{
+	__ptlrpc_free_bulk(bulk, 1);
+}
+static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk)
+{
+	__ptlrpc_free_bulk(bulk, 0);
+}
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, cfs_page_t *page,
+			     int pageoffset, int len, int pin);
+static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc,
+					     cfs_page_t *page, int pageoffset,
+					     int len)
+{
+	__ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1);
+}
+
+static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc,
+					       cfs_page_t *page, int pageoffset,
+					       int len)
+{
+	__ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0);
+}
+
 void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
                                       struct obd_import *imp);
 __u64 ptlrpc_next_xid(void);
@@ -1579,16 +1824,34 @@ struct ptlrpc_service_buf_conf {
 struct ptlrpc_service_thr_conf {
 	/* threadname should be 8 characters or less - 6 will be added on */
 	char				*tc_thr_name;
-	/* min number of service threads to start */
-	unsigned int			tc_nthrs_min;
-	/* max number of service threads to start */
+	/* threads increasing factor for each CPU */
+	unsigned int			tc_thr_factor;
+	/* service threads # to start on each partition while initializing */
+	unsigned int			tc_nthrs_init;
+	/*
+	 * low-water mark for the upper limit of threads on each partition
+	 * while running; service availability may be impacted if the thread
+	 * count is lower than this value. It can be ZERO if the service
+	 * doesn't require CPU affinity or there is only one partition.
+	 */
+	unsigned int			tc_nthrs_base;
+	/* "soft" limit for total threads number */
 	unsigned int			tc_nthrs_max;
+	/* user-specified number of threads; it will be validated against
+	 * other members of this structure. */
+	unsigned int			tc_nthrs_user;
 	/* set NUMA node affinity for service threads */
 	unsigned int			tc_cpu_affinity;
 	/* Tags for lu_context associated with service thread */
 	__u32				tc_ctx_tags;
 };
 
+struct ptlrpc_service_cpt_conf {
+	struct cfs_cpt_table		*cc_cptable;
+	/* string pattern to describe CPTs for a service */
+	char				*cc_pattern;
+};
+
 struct ptlrpc_service_conf {
 	/* service name */
 	char				*psc_name;
@@ -1598,6 +1861,8 @@ struct ptlrpc_service_conf {
 	struct ptlrpc_service_buf_conf	psc_buf;
 	/* thread information */
 	struct ptlrpc_service_thr_conf	psc_thr;
+	/* CPU partition information */
+	struct ptlrpc_service_cpt_conf	psc_cpt;
 	/* function table */
 	struct ptlrpc_service_ops	psc_ops;
 };
@@ -1614,13 +1879,13 @@ void ptlrpc_save_lock(struct ptlrpc_request *req,
 void ptlrpc_commit_replies(struct obd_export *exp);
 void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs);
 void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs);
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req);
 struct ptlrpc_service *ptlrpc_register_service(
 				struct ptlrpc_service_conf *conf,
 				struct proc_dir_entry *proc_entry);
 void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
 
 int ptlrpc_start_threads(struct ptlrpc_service *svc);
-int ptlrpc_start_thread(struct ptlrpc_service *svc);
 int ptlrpc_unregister_service(struct ptlrpc_service *service);
 int liblustre_check_services(void *arg);
 void ptlrpc_daemonize(char *name);
@@ -1636,11 +1901,6 @@ void ptlrpc_hr_fini(void);
 # define ptlrpc_hr_fini() do {} while(0)
 #endif
 
-struct ptlrpc_svc_data {
-        char *name;
-        struct ptlrpc_service *svc;
-        struct ptlrpc_thread *thread;
-};
 /** @} */
 
 /* ptlrpc/import.c */
@@ -1728,7 +1988,7 @@ __u32 lustre_msg_get_timeout(struct lustre_msg *msg);
 __u32 lustre_msg_get_service_time(struct lustre_msg *msg);
 char *lustre_msg_get_jobid(struct lustre_msg *msg);
 __u32 lustre_msg_get_cksum(struct lustre_msg *msg);
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 0, 0)
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
 __u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18);
 #else
 # warning "remove checksum compatibility support for b1_8"
@@ -1822,17 +2082,17 @@ ptlrpc_client_recv(struct ptlrpc_request *req)
 static inline int
 ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
 {
-        int rc;
-
-        cfs_spin_lock(&req->rq_lock);
-        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
-            req->rq_reply_deadline > cfs_time_current_sec()) {
-                cfs_spin_unlock(&req->rq_lock);
-                return 1;
-        }
-        rc = req->rq_receiving_reply || req->rq_must_unlink;
-        cfs_spin_unlock(&req->rq_lock);
-        return rc;
+	int rc;
+
+	spin_lock(&req->rq_lock);
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+	    req->rq_reply_deadline > cfs_time_current_sec()) {
+		spin_unlock(&req->rq_lock);
+		return 1;
+	}
+	rc = req->rq_receiving_reply || req->rq_must_unlink;
+	spin_unlock(&req->rq_lock);
+	return rc;
 }
 
 static inline void
@@ -1899,12 +2159,28 @@ static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req)
 
 static inline int ptlrpc_no_resend(struct ptlrpc_request *req)
 {
-        if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) {
-                cfs_spin_lock(&req->rq_lock);
-                req->rq_no_resend = 1;
-                cfs_spin_unlock(&req->rq_lock);
-        }
-        return req->rq_no_resend;
+	if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) {
+		spin_lock(&req->rq_lock);
+		req->rq_no_resend = 1;
+		spin_unlock(&req->rq_lock);
+	}
+	return req->rq_no_resend;
+}
+
+static inline int
+ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt)
+{
+	int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate);
+
+	return svcpt->scp_service->srv_watchdog_factor *
+	       max_t(int, at, obd_timeout);
+}
+
+static inline struct ptlrpc_service *
+ptlrpc_req2svc(struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_rqbd != NULL);
+	return req->rq_rqbd->rqbd_svcpt->scp_service;
 }
 
 /* ldlm/ldlm_lib.c */
@@ -2020,14 +2296,13 @@ static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {}
 /** @} */
 
 /* ptlrpc/llog_server.c */
-int llog_origin_handle_create(struct ptlrpc_request *req);
+int llog_origin_handle_open(struct ptlrpc_request *req);
 int llog_origin_handle_destroy(struct ptlrpc_request *req);
 int llog_origin_handle_prev_block(struct ptlrpc_request *req);
 int llog_origin_handle_next_block(struct ptlrpc_request *req);
 int llog_origin_handle_read_header(struct ptlrpc_request *req);
 int llog_origin_handle_close(struct ptlrpc_request *req);
 int llog_origin_handle_cancel(struct ptlrpc_request *req);
-int llog_catinfo(struct ptlrpc_request *req);
 
 /* ptlrpc/llog_client.c */
 extern struct llog_operations llog_client_ops;