X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Finclude%2Flustre_net.h;h=fb492883503b00d2a768daf8ef5e446427ee59d5;hp=4d8341d1d32a912108e8e9ecd6150010c8e68c55;hb=be025f5580a0cc4958267d2e4317aac4e2ebc0c3;hpb=020924966d4acd98e96ede094d72a49b762233db diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 4d8341d..fb49288 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -106,9 +106,12 @@ # endif #endif /* __KERNEL__ */ +#define PTLRPC_NTHRS_INIT 2 + /** - * The following constants determine how memory is used to buffer incoming - * service requests. + * Buffer Constants + * + * Constants determine how memory is used to buffer incoming service requests. * * ?_NBUFS # buffers to allocate when growing the pool * ?_BUFSIZE # bytes in a single request buffer @@ -120,21 +123,170 @@ * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are * considered full when less than ?_MAXREQSIZE is left in them. */ -#define LDLM_THREADS_AUTO_MIN (2) -#define LDLM_THREADS_AUTO_MAX min_t(unsigned, cfs_num_online_cpus() * \ - cfs_num_online_cpus() * 32, 128) -#define LDLM_BL_THREADS LDLM_THREADS_AUTO_MIN +/** + * Thread Constants + * + * Constants determine how threads are created for ptlrpc service. + * + * ?_NTHRS_INIT # threads to create for each service partition on + * initializing. If it's non-affinity service and + * there is only one partition, it's the overall # + * threads for the service while initializing. + * ?_NTHRS_BASE # threads should be created at least for each + * ptlrpc partition to keep the service healthy. + * It's the low-water mark of threads upper-limit + * for each partition. + * ?_THR_FACTOR # threads can be added on threads upper-limit for + * each CPU core. This factor is only for reference, + * we might decrease value of factor if number of cores + * per CPT is above a limit. 
+ * ?_NTHRS_MAX # overall threads can be created for a service, + * it's a soft limit because if service is running + * on machine with hundreds of cores and tens of + * CPU partitions, we need to guarantee each partition + * has ?_NTHRS_BASE threads, which means total threads + * will be ?_NTHRS_BASE * number_of_cpts which can + * exceed ?_NTHRS_MAX. + * + * Examples + * + * #define MDS_NTHRS_INIT 2 + * #define MDS_NTHRS_BASE 64 + * #define MDS_THR_FACTOR 8 + * #define MDS_NTHRS_MAX 1024 + * + * Example 1): + * --------------------------------------------------------------------- + * Server(A) has 16 cores, user configured it to 4 partitions so each + * partition has 4 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(4) * MDS_THR_FACTOR(8) = 96 + * + * Total number of threads for the service is: + * 96 * partitions(4) = 384 + * + * Example 2): + * --------------------------------------------------------------------- + * Server(B) has 32 cores, user configured it to 4 partitions so each + * partition has 8 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(8) * MDS_THR_FACTOR(8) = 128 + * + * Total number of threads for the service is: + * 128 * partitions(4) = 512 + * + * Example 3): + * --------------------------------------------------------------------- + * Server(C) has 96 cores, user configured it to 8 partitions so each + * partition has 12 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(12) * MDS_THR_FACTOR(8) = 160 + * + * Total number of threads for the service is: + * 160 * partitions(8) = 1280 + * + * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number + * as upper limit of threads number for each partition: + * MDS_NTHRS_MAX(1024) / partitions(8) = 128 + * + * Example 4): + * --------------------------------------------------------------------- + * Server(D) has a 
thousand cores and user configured it to 32 partitions + * MDS_NTHRS_BASE(64) * 32 = 2048 + * + * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need + * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads + * to keep service healthy, so total number of threads will just be 2048. + * + * NB: we don't suggest to choose server with that many cores because backend + * filesystem itself, buffer cache, or underlying network stack might + * have some SMP scalability issues at that large scale. + * + * If user already has a fat machine with hundreds or thousands of cores, + * there are two choices for configuration: + * a) create CPU table from subset of all CPUs and run Lustre on + * top of this subset + * b) bind service threads on a few partitions, see modparameters of + * MDS and OSS for details +* + * NB: these calculations (and examples below) are simplified to help + * understanding, the real implementation is a little more complex, + * please see ptlrpc_server_nthreads_check() for details. + * + */ + + /* + * LDLM threads constants: + * + * Given 8 as factor and 24 as base threads number + * + * example 1) + * On 4-core machine we will have 24 + 8 * 4 = 56 threads. + * + * example 2) + * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56 + * threads for each partition and total threads number will be 112. + * + * example 3) + * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24) + * threads for each partition to keep service healthy, so total threads + * number should be 24 * 8 = 192. + * + * So with these constants, threads number will be at the similar level + * of old versions, unless target machine has over a hundred cores + */ +#define LDLM_THR_FACTOR 8 +#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT +#define LDLM_NTHRS_BASE 24 +#define LDLM_NTHRS_MAX (cfs_num_online_cpus() == 1 ? 
64 : 128) + +#define LDLM_BL_THREADS LDLM_NTHRS_INIT #define LDLM_NBUFS (64 * cfs_num_online_cpus()) #define LDLM_BUFSIZE (8 * 1024) #define LDLM_MAXREQSIZE (5 * 1024) #define LDLM_MAXREPSIZE (1024) -/** Absolute limits */ -#define MDT_MIN_THREADS 2UL -#ifndef MDT_MAX_THREADS -#define MDT_MAX_THREADS 512UL + /* + * MDS threads constants: + * + * Please see examples in "Thread Constants", MDS threads number will be at + * the comparable level of old versions, unless the server has many cores. + */ +#ifndef MDS_MAX_THREADS +#define MDS_MAX_THREADS 1024 +#define MDS_MAX_OTHR_THREADS 256 + +#else /* MDS_MAX_THREADS */ +#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT +#undef MDS_MAX_THREADS +#define MDS_MAX_THREADS PTLRPC_NTHRS_INIT #endif -#define MDS_NBUFS (64 * cfs_num_online_cpus()) +#define MDS_MAX_OTHR_THREADS max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2) +#endif + +/* default service */ +#define MDS_THR_FACTOR 8 +#define MDS_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_NTHRS_MAX MDS_MAX_THREADS +#define MDS_NTHRS_BASE min(64, MDS_NTHRS_MAX) + +/* read-page service */ +#define MDS_RDPG_THR_FACTOR 4 +#define MDS_RDPG_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_RDPG_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_RDPG_NTHRS_BASE min(48, MDS_RDPG_NTHRS_MAX) + +/* these should be removed when we remove setattr service in the future */ +#define MDS_SETA_THR_FACTOR 4 +#define MDS_SETA_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_SETA_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_SETA_NTHRS_BASE min(48, MDS_SETA_NTHRS_MAX) + +/* non-affinity threads */ +#define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS + +#define MDS_NBUFS (64 * cfs_num_online_cpus()) /** * Assume file name length = FNAME_MAX = 256 (true for ext3). 
* path name length = PATH_MAX = 4096 @@ -174,16 +326,51 @@ #define SEQ_MAXREPSIZE (152) /** MGS threads must be >= 3, see bug 22458 comment #28 */ -#define MGS_THREADS_AUTO_MIN 3 -#define MGS_THREADS_AUTO_MAX 32 +#define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define MGS_NTHRS_MAX 32 + #define MGS_NBUFS (64 * cfs_num_online_cpus()) #define MGS_BUFSIZE (8 * 1024) #define MGS_MAXREQSIZE (7 * 1024) #define MGS_MAXREPSIZE (9 * 1024) -/** Absolute OSS limits */ -#define OSS_THREADS_MIN 3 /* difficult replies, HPQ, others */ -#define OSS_THREADS_MAX 512 + /* + * OSS threads constants: + * + * Given 8 as factor and 64 as base threads number + * + * example 1): + * On 8-core server configured to 2 partitions, we will have + * 64 + 8 * 4 = 96 threads for each partition, 192 total threads. + * + * example 2): + * On 32-core machine configured to 4 partitions, we will have + * 64 + 8 * 8 = 128 threads for each partition, so total threads number + * will be 128 * 4 = 512. + * + * example 3): + * On 64-core machine configured to 4 partitions, we will have + * 64 + 16 * 8 = 192 threads for each partition, so total threads number + * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we + * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads + * for each partition. + * + * So we can see that with these constants, threads number will be at the + * similar level of old versions, unless the server has many cores. 
+ */ + /* depress threads factor for VM with small memory size */ +#define OSS_THR_FACTOR min_t(int, 8, \ + CFS_NUM_CACHEPAGES >> (28 - CFS_PAGE_SHIFT)) +#define OSS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define OSS_NTHRS_BASE 64 +#define OSS_NTHRS_MAX 512 + +/* threads for handling "create" request */ +#define OSS_CR_THR_FACTOR 1 +#define OSS_CR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define OSS_CR_NTHRS_BASE 8 +#define OSS_CR_NTHRS_MAX 64 + #define OST_NBUFS (64 * cfs_num_online_cpus()) #define OST_BUFSIZE (8 * 1024) @@ -242,7 +429,7 @@ union ptlrpc_async_args { * least big enough for that. */ void *pointer_arg[11]; - __u64 space[6]; + __u64 space[7]; }; struct ptlrpc_request_set; @@ -281,17 +468,17 @@ struct ptlrpc_request_set { set_interpreter_func set_interpret; /** opaq argument passed to completion \a set_interpret callback. */ void *set_arg; - /** rq_status of requests that have been freed already */ - int set_rc; /** * Lock for \a set_new_requests manipulations * locked so that any old caller can communicate requests to * the set holder who can then fold them into the lock-free set */ - cfs_spinlock_t set_new_req_lock; + spinlock_t set_new_req_lock; /** List of new yet unsent requests. Only used with ptlrpcd now. 
*/ cfs_list_t set_new_requests; + /** rq_status of requests that have been freed already */ + int set_rc; /** Additional fields used by the flow control extension */ /** Maximum number of RPCs in flight */ int set_max_inflight; @@ -314,6 +501,7 @@ struct ptlrpc_set_cbdata { }; struct ptlrpc_bulk_desc; +struct ptlrpc_service_part; /** * ptlrpc callback & work item stuff @@ -347,7 +535,7 @@ struct ptlrpc_reply_state { cfs_list_t rs_debug_list; #endif /** A spinlock to protect the reply state flags */ - cfs_spinlock_t rs_lock; + spinlock_t rs_lock; /** Reply state flags */ unsigned long rs_difficult:1; /* ACK/commit stuff */ unsigned long rs_no_ack:1; /* no ACK, even for @@ -369,7 +557,7 @@ struct ptlrpc_reply_state { /** xid */ __u64 rs_xid; struct obd_export *rs_export; - struct ptlrpc_service *rs_service; + struct ptlrpc_service_part *rs_svcpt; /** Lnet metadata handle for the reply */ lnet_handle_md_t rs_md_h; cfs_atomic_t rs_refcount; @@ -422,8 +610,8 @@ typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env, * any allocations (to avoid e.g. OOM). */ struct ptlrpc_request_pool { - /** Locks the list */ - cfs_spinlock_t prp_lock; + /** Locks the list */ + spinlock_t prp_lock; /** list of ptlrpc_request structs */ cfs_list_t prp_req_list; /** Maximum message size that would fit into a rquest from this pool */ @@ -469,8 +657,10 @@ struct ptlrpc_hpreq_ops { * in Lustre. 
*/ struct ptlrpc_request { - /* Request type: one of PTL_RPC_MSG_* */ - int rq_type; + /* Request type: one of PTL_RPC_MSG_* */ + int rq_type; + /** Result of request processing */ + int rq_status; /** * Linkage item through which this request is included into * sending/delayed lists on client and into rqbd list on server @@ -490,18 +680,20 @@ struct ptlrpc_request { cfs_list_t rq_exp_list; /** server-side hp handlers */ struct ptlrpc_hpreq_ops *rq_ops; + + /** initial thread servicing this request */ + struct ptlrpc_thread *rq_svc_thread; + /** history sequence # */ __u64 rq_history_seq; /** the index of service's srv_at_array into which request is linked */ time_t rq_at_index; - /** Result of request processing */ - int rq_status; /** Lock to protect request flags and some other important bits, like * rq_list */ - cfs_spinlock_t rq_lock; - /** client-side flags are serialized by rq_lock */ - unsigned long rq_intr:1, rq_replied:1, rq_err:1, + spinlock_t rq_lock; + /** client-side flags are serialized by rq_lock */ + unsigned int rq_intr:1, rq_replied:1, rq_err:1, rq_timedout:1, rq_resend:1, rq_restart:1, /** * when ->rq_replay is set, request is kept by the client even @@ -516,7 +708,6 @@ struct ptlrpc_request { rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1, rq_early:1, rq_must_unlink:1, - rq_fake:1, /* this fake req */ rq_memalloc:1, /* req originated from "kswapd" */ /* server-side flags */ rq_packed_final:1, /* packed final reply */ @@ -526,20 +717,21 @@ struct ptlrpc_request { rq_committed:1, /* whether the "rq_set" is a valid one */ rq_invalid_rqset:1, - rq_generation_set:1; + rq_generation_set:1, + /* do not resend request on -EINPROGRESS */ + rq_no_retry_einprogress:1; + + unsigned int rq_nr_resend; enum rq_phase rq_phase; /* one of RQ_PHASE_* */ enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */ cfs_atomic_t rq_refcount;/* client-side refcount for SENT race, server-side refcounf for 
multiple replies */ - /** initial thread servicing this request */ - struct ptlrpc_thread *rq_svc_thread; - - /** Portal to which this request would be sent */ - int rq_request_portal; /* XXX FIXME bug 249 */ - /** Portal where to wait for reply and where reply would be sent */ - int rq_reply_portal; /* XXX FIXME bug 249 */ + /** Portal to which this request would be sent */ + short rq_request_portal; /* XXX FIXME bug 249 */ + /** Portal where to wait for reply and where reply would be sent */ + short rq_reply_portal; /* XXX FIXME bug 249 */ /** * client-side: @@ -549,11 +741,10 @@ struct ptlrpc_request { int rq_nob_received; /** Request length */ int rq_reqlen; - /** Request message - what client sent */ - struct lustre_msg *rq_reqmsg; - /** Reply length */ int rq_replen; + /** Request message - what client sent */ + struct lustre_msg *rq_reqmsg; /** Reply message - server response */ struct lustre_msg *rq_repmsg; /** Transaction number */ @@ -577,7 +768,8 @@ struct ptlrpc_request { struct sptlrpc_flavor rq_flvr; /**< for client & server */ enum lustre_sec_part rq_sp_from; - unsigned long /* client/server security flags */ + /* client/server security flags */ + unsigned int rq_ctx_init:1, /* context initiation */ rq_ctx_fini:1, /* context destroy */ rq_bulk_read:1, /* request bulk read */ @@ -601,21 +793,21 @@ struct ptlrpc_request { /* (server side), pointed directly into req buffer */ struct ptlrpc_user_desc *rq_user_desc; - /** early replies go to offset 0, regular replies go after that */ - unsigned int rq_reply_off; - /* various buffer pointers */ struct lustre_msg *rq_reqbuf; /* req wrapper */ + char *rq_repbuf; /* rep buffer */ + struct lustre_msg *rq_repdata; /* rep wrapper msg */ + struct lustre_msg *rq_clrbuf; /* only in priv mode */ int rq_reqbuf_len; /* req wrapper buf len */ int rq_reqdata_len; /* req wrapper msg len */ - char *rq_repbuf; /* rep buffer */ int rq_repbuf_len; /* rep buffer len */ - struct lustre_msg *rq_repdata; /* rep wrapper msg */ 
int rq_repdata_len; /* rep wrapper msg len */ - struct lustre_msg *rq_clrbuf; /* only in priv mode */ int rq_clrbuf_len; /* only in priv mode */ int rq_clrdata_len; /* only in priv mode */ + /** early replies go to offset 0, regular replies go after that */ + unsigned int rq_reply_off; + /** @} */ /** Fields that help to see if request and reply were swabbed or not */ @@ -644,9 +836,6 @@ struct ptlrpc_request { struct ptlrpc_reply_state *rq_reply_state; /** incoming request buffer */ struct ptlrpc_request_buffer_desc *rq_rqbd; -#ifdef CRAY_XT3 - __u32 rq_uid; /* peer uid, used in MDS only */ -#endif /** client-only incoming reply */ lnet_handle_md_t rq_reply_md_h; @@ -698,10 +887,10 @@ struct ptlrpc_request { int rq_timeout; /** Multi-rpc bits */ - /** Link item for request set lists */ - cfs_list_t rq_set_chain; /** Per-request waitq introduced by bug 21938 for recovery waiting */ cfs_waitq_t rq_set_waitq; + /** Link item for request set lists */ + cfs_list_t rq_set_chain; /** Link back to the request set */ struct ptlrpc_request_set *rq_set; /** Async completion handler, called when reply is received */ @@ -920,7 +1109,7 @@ struct ptlrpc_bulk_desc { /** client side */ unsigned long bd_registered:1; /** For serialization with callback */ - cfs_spinlock_t bd_lock; + spinlock_t bd_lock; /** Import generation when request for this bulk was sent */ int bd_import_generation; /** Server side - export this bulk created for */ @@ -964,6 +1153,7 @@ enum { SVC_SIGNAL = 1 << 5, }; +#define PTLRPC_THR_NAME_LEN 32 /** * Definition of server service thread structure */ @@ -992,9 +1182,10 @@ struct ptlrpc_thread { /** * the svc this thread belonged to b=18582 */ - struct ptlrpc_service *t_svc; - cfs_waitq_t t_ctl_waitq; - struct lu_env *t_env; + struct ptlrpc_service_part *t_svcpt; + cfs_waitq_t t_ctl_waitq; + struct lu_env *t_env; + char t_name[PTLRPC_THR_NAME_LEN]; }; static inline int thread_is_init(struct ptlrpc_thread *thread) @@ -1070,7 +1261,7 @@ struct 
ptlrpc_request_buffer_desc { /** History of requests for this buffer */ cfs_list_t rqbd_reqs; /** Back pointer to service for which this buffer is registered */ - struct ptlrpc_service *rqbd_service; + struct ptlrpc_service_part *rqbd_svcpt; /** LNet descriptor */ lnet_handle_md_t rqbd_md_h; int rqbd_refcount; @@ -1128,21 +1319,10 @@ struct ptlrpc_service_ops { * The service is listening on a particular portal (like tcp port) * and perform actions for a specific server like IO service for OST * or general metadata service for MDS. - * - * ptlrpc service has four locks: - * \a srv_lock - * serialize operations on rqbd and requests waiting for preprocess - * \a srv_rq_lock - * serialize operations active requests sent to this portal - * \a srv_at_lock - * serialize adaptive timeout stuff - * \a srv_rs_lock - * serialize operations on RS list (reply states) - * - * We don't have any use-case to take two or more locks at the same time - * for now, so there is no lock order issue. */ struct ptlrpc_service { + /** serialize /proc operations */ + spinlock_t srv_lock; /** most often accessed fields */ /** chain thru all services */ cfs_list_t srv_list; @@ -1154,17 +1334,10 @@ struct ptlrpc_service { char *srv_thread_name; /** service thread list */ cfs_list_t srv_threads; - /** threads to start at beginning of service */ - int srv_threads_min; - /** thread upper limit */ - int srv_threads_max; - /** always increasing number */ - unsigned srv_threads_next_id; - /** # of starting threads */ - int srv_threads_starting; - /** # running threads */ - int srv_threads_running; - + /** threads # should be created for each partition on initializing */ + int srv_nthrs_cpt_init; + /** limit of threads number for each partition */ + int srv_nthrs_cpt_limit; /** Root of /proc dir tree for this service */ cfs_proc_dir_entry_t *srv_procroot; /** Pointer to statistic data for this service */ @@ -1190,124 +1363,179 @@ struct ptlrpc_service { __u32 srv_ctx_tags; /** soft watchdog timeout 
multiplier */ int srv_watchdog_factor; - /** bind threads to CPUs */ - unsigned srv_cpu_affinity:1; /** under unregister_service */ unsigned srv_is_stopping:1; - /** - * serialize the following fields, used for protecting - * rqbd list and incoming requests waiting for preprocess - */ - cfs_spinlock_t srv_lock __cfs_cacheline_aligned; - /** incoming reqs */ - cfs_list_t srv_req_in_queue; - /** total # req buffer descs allocated */ - int srv_nbufs; - /** # posted request buffers */ - int srv_nrqbd_receiving; - /** timeout before re-posting reqs, in tick */ - cfs_duration_t srv_rqbd_timeout; - /** request buffers to be reposted */ - cfs_list_t srv_idle_rqbds; - /** req buffers receiving */ - cfs_list_t srv_active_rqbds; - /** request buffer history */ - cfs_list_t srv_history_rqbds; - /** # request buffers in history */ - int srv_n_history_rqbds; - /** max # request buffers in history */ - int srv_max_history_rqbds; - /** request history */ - cfs_list_t srv_request_history; - /** next request sequence # */ - __u64 srv_request_seq; - /** highest seq culled from history */ - __u64 srv_request_max_cull_seq; - /** - * all threads sleep on this. This wait-queue is signalled when new - * incoming request arrives and when difficult reply has to be handled. 
- */ - cfs_waitq_t srv_waitq; + /** max # request buffers in history per partition */ + int srv_hist_nrqbds_cpt_max; + /** number of CPTs this service bound on */ + int srv_ncpts; + /** CPTs array this service bound on */ + __u32 *srv_cpts; + /** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */ + int srv_cpt_bits; + /** CPT table this service is running over */ + struct cfs_cpt_table *srv_cptable; + /** + * partition data for ptlrpc service + */ + struct ptlrpc_service_part *srv_parts[0]; +}; - /** - * serialize the following fields, used for processing requests - * sent to this portal - */ - cfs_spinlock_t srv_rq_lock __cfs_cacheline_aligned; - /** # reqs in either of the queues below */ - /** reqs waiting for service */ - cfs_list_t srv_request_queue; - /** high priority queue */ - cfs_list_t srv_request_hpq; - /** # incoming reqs */ - int srv_n_queued_reqs; - /** # reqs being served */ - int srv_n_active_reqs; - /** # HPreqs being served */ - int srv_n_active_hpreq; - /** # hp requests handled */ - int srv_hpreq_count; - - /** AT stuff */ - /** @{ */ - /** - * serialize the following fields, used for changes on - * adaptive timeout - */ - cfs_spinlock_t srv_at_lock __cfs_cacheline_aligned; - /** estimated rpc service time */ - struct adaptive_timeout srv_at_estimate; - /** reqs waiting for replies */ - struct ptlrpc_at_array srv_at_array; - /** early reply timer */ - cfs_timer_t srv_at_timer; - /** check early replies */ - unsigned srv_at_check; - /** debug */ - cfs_time_t srv_at_checktime; - /** @} */ +/** + * Definition of PortalRPC service partition data. + * Although a service only has one instance of it right now, we + * will have multiple instances very soon (instance per CPT). 
+ * + * it has four locks: + * \a scp_lock + * serialize operations on rqbd and requests waiting for preprocess + * \a scp_req_lock + * serialize operations active requests sent to this portal + * \a scp_at_lock + * serialize adaptive timeout stuff + * \a scp_rep_lock + * serialize operations on RS list (reply states) + * + * We don't have any use-case to take two or more locks at the same time + * for now, so there is no lock order issue. + */ +struct ptlrpc_service_part { + /** back reference to owner */ + struct ptlrpc_service *scp_service __cfs_cacheline_aligned; + /* CPT id, reserved */ + int scp_cpt; + /** always increasing number */ + int scp_thr_nextid; + /** # of starting threads */ + int scp_nthrs_starting; + /** # of stopping threads, reserved for shrinking threads */ + int scp_nthrs_stopping; + /** # running threads */ + int scp_nthrs_running; + /** service threads list */ + cfs_list_t scp_threads; - /** - * serialize the following fields, used for processing - * replies for this portal - */ - cfs_spinlock_t srv_rs_lock __cfs_cacheline_aligned; - /** all the active replies */ - cfs_list_t srv_active_replies; + /** + * serialize the following fields, used for protecting + * rqbd list and incoming requests waiting for preprocess, + * threads starting & stopping are also protected by this lock. + */ + spinlock_t scp_lock __cfs_cacheline_aligned; + /** total # req buffer descs allocated */ + int scp_nrqbds_total; + /** # posted request buffers for receiving */ + int scp_nrqbds_posted; + /** in progress of allocating rqbd */ + int scp_rqbd_allocating; + /** # incoming reqs */ + int scp_nreqs_incoming; + /** request buffers to be reposted */ + cfs_list_t scp_rqbd_idle; + /** req buffers receiving */ + cfs_list_t scp_rqbd_posted; + /** incoming reqs */ + cfs_list_t scp_req_incoming; + /** timeout before re-posting reqs, in tick */ + cfs_duration_t scp_rqbd_timeout; + /** + * all threads sleep on this. 
This wait-queue is signalled when new + * incoming request arrives and when difficult reply has to be handled. + */ + cfs_waitq_t scp_waitq; + + /** request history */ + cfs_list_t scp_hist_reqs; + /** request buffer history */ + cfs_list_t scp_hist_rqbds; + /** # request buffers in history */ + int scp_hist_nrqbds; + /** sequence number for request */ + __u64 scp_hist_seq; + /** highest seq culled from history */ + __u64 scp_hist_seq_culled; + + /** + * serialize the following fields, used for processing requests + * sent to this portal + */ + spinlock_t scp_req_lock __cfs_cacheline_aligned; + /** # reqs in either of the queues below */ + /** reqs waiting for service */ + cfs_list_t scp_req_pending; + /** high priority queue */ + cfs_list_t scp_hreq_pending; + /** # reqs being served */ + int scp_nreqs_active; + /** # HPreqs being served */ + int scp_nhreqs_active; + /** # hp requests handled */ + int scp_hreq_count; + + /** AT stuff */ + /** @{ */ + /** + * serialize the following fields, used for changes on + * adaptive timeout + */ + spinlock_t scp_at_lock __cfs_cacheline_aligned; + /** estimated rpc service time */ + struct adaptive_timeout scp_at_estimate; + /** reqs waiting for replies */ + struct ptlrpc_at_array scp_at_array; + /** early reply timer */ + cfs_timer_t scp_at_timer; + /** debug */ + cfs_time_t scp_at_checktime; + /** check early replies */ + unsigned scp_at_check; + /** @} */ + + /** + * serialize the following fields, used for processing + * replies for this portal + */ + spinlock_t scp_rep_lock __cfs_cacheline_aligned; + /** all the active replies */ + cfs_list_t scp_rep_active; #ifndef __KERNEL__ - /** replies waiting for service */ - cfs_list_t srv_reply_queue; + /** replies waiting for service */ + cfs_list_t scp_rep_queue; #endif - /** List of free reply_states */ - cfs_list_t srv_free_rs_list; - /** waitq to run, when adding stuff to srv_free_rs_list */ - cfs_waitq_t srv_free_rs_waitq; - /** # 'difficult' replies */ - cfs_atomic_t 
srv_n_difficult_replies; - //struct ptlrpc_srv_ni srv_interfaces[0]; + /** List of free reply_states */ + cfs_list_t scp_rep_idle; + /** waitq to run, when adding stuff to scp_rep_idle */ + cfs_waitq_t scp_rep_waitq; + /** # 'difficult' replies */ + cfs_atomic_t scp_nreps_difficult; }; +#define ptlrpc_service_for_each_part(part, i, svc) \ + for (i = 0; \ + i < (svc)->srv_ncpts && \ + (svc)->srv_parts != NULL && \ + ((part) = (svc)->srv_parts[i]) != NULL; i++) + /** * Declaration of ptlrpcd control structure */ struct ptlrpcd_ctl { - /** - * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) - */ - unsigned long pc_flags; - /** - * Thread lock protecting structure fields. - */ - cfs_spinlock_t pc_lock; - /** - * Start completion. - */ - cfs_completion_t pc_starting; - /** - * Stop completion. - */ - cfs_completion_t pc_finishing; + /** + * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) + */ + unsigned long pc_flags; + /** + * Thread lock protecting structure fields. + */ + spinlock_t pc_lock; + /** + * Start completion. + */ + struct completion pc_starting; + /** + * Stop completion. + */ + struct completion pc_finishing; /** * Thread requests set. 
*/ @@ -1425,14 +1653,14 @@ void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc); static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc) { - int rc; + int rc; - LASSERT(desc != NULL); + LASSERT(desc != NULL); - cfs_spin_lock(&desc->bd_lock); - rc = desc->bd_network_rw; - cfs_spin_unlock(&desc->bd_lock); - return rc; + spin_lock(&desc->bd_lock); + rc = desc->bd_network_rw; + spin_unlock(&desc->bd_lock); + return rc; } #endif @@ -1453,10 +1681,10 @@ static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) if (!desc) return 0; - cfs_spin_lock(&desc->bd_lock); - rc = desc->bd_network_rw; - cfs_spin_unlock(&desc->bd_lock); - return rc; + spin_lock(&desc->bd_lock); + rc = desc->bd_network_rw; + spin_unlock(&desc->bd_lock); + return rc; } #define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 @@ -1528,11 +1756,6 @@ struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, __u32 version, int opcode, char **bufs, struct ptlrpc_cli_ctx *ctx); -struct ptlrpc_request *ptlrpc_prep_fakereq(struct obd_import *imp, - unsigned int timeout, - ptlrpc_interpterer_t interpreter); -void ptlrpc_fakereq_finished(struct ptlrpc_request *req); - struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count, __u32 *lengths, char **bufs); @@ -1545,9 +1768,31 @@ void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, int npages, int type, int portal); -void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk); -void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, - cfs_page_t *page, int pageoffset, int len); +void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin); +static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk) +{ + __ptlrpc_free_bulk(bulk, 1); +} +static inline void 
ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk) +{ + __ptlrpc_free_bulk(bulk, 0); +} +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + cfs_page_t *page, int pageoffset, int len, int); +static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc, + cfs_page_t *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1); +} + +static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc, + cfs_page_t *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0); +} + void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, struct obd_import *imp); __u64 ptlrpc_next_xid(void); @@ -1579,16 +1824,34 @@ struct ptlrpc_service_buf_conf { struct ptlrpc_service_thr_conf { /* threadname should be 8 characters or less - 6 will be added on */ char *tc_thr_name; - /* min number of service threads to start */ - unsigned int tc_nthrs_min; - /* max number of service threads to start */ + /* threads increasing factor for each CPU */ + unsigned int tc_thr_factor; + /* service threads # to start on each partition while initializing */ + unsigned int tc_nthrs_init; + /* + * low water of threads # upper-limit on each partition while running, + * service availability may be impacted if threads number is lower + * than this value. It can be ZERO if the service doesn't require + * CPU affinity or there is only one partition. + */ + unsigned int tc_nthrs_base; + /* "soft" limit for total threads number */ unsigned int tc_nthrs_max; + /* user specified threads number, it will be validated due to + * other members of this structure. 
*/ + unsigned int tc_nthrs_user; /* set NUMA node affinity for service threads */ unsigned int tc_cpu_affinity; /* Tags for lu_context associated with service thread */ __u32 tc_ctx_tags; }; +struct ptlrpc_service_cpt_conf { + struct cfs_cpt_table *cc_cptable; + /* string pattern to describe CPTs for a service */ + char *cc_pattern; +}; + struct ptlrpc_service_conf { /* service name */ char *psc_name; @@ -1598,6 +1861,8 @@ struct ptlrpc_service_conf { struct ptlrpc_service_buf_conf psc_buf; /* thread information */ struct ptlrpc_service_thr_conf psc_thr; + /* CPU partition information */ + struct ptlrpc_service_cpt_conf psc_cpt; /* function table */ struct ptlrpc_service_ops psc_ops; }; @@ -1614,13 +1879,13 @@ void ptlrpc_save_lock(struct ptlrpc_request *req, void ptlrpc_commit_replies(struct obd_export *exp); void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); +int ptlrpc_hpreq_handler(struct ptlrpc_request *req); struct ptlrpc_service *ptlrpc_register_service( struct ptlrpc_service_conf *conf, struct proc_dir_entry *proc_entry); void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); int ptlrpc_start_threads(struct ptlrpc_service *svc); -int ptlrpc_start_thread(struct ptlrpc_service *svc); int ptlrpc_unregister_service(struct ptlrpc_service *service); int liblustre_check_services(void *arg); void ptlrpc_daemonize(char *name); @@ -1636,11 +1901,6 @@ void ptlrpc_hr_fini(void); # define ptlrpc_hr_fini() do {} while(0) #endif -struct ptlrpc_svc_data { - char *name; - struct ptlrpc_service *svc; - struct ptlrpc_thread *thread; -}; /** @} */ /* ptlrpc/import.c */ @@ -1728,7 +1988,7 @@ __u32 lustre_msg_get_timeout(struct lustre_msg *msg); __u32 lustre_msg_get_service_time(struct lustre_msg *msg); char *lustre_msg_get_jobid(struct lustre_msg *msg); __u32 lustre_msg_get_cksum(struct lustre_msg *msg); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 0, 0) +#if LUSTRE_VERSION_CODE < 
OBD_OCD_VERSION(2, 7, 50, 0) __u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18); #else # warning "remove checksum compatibility support for b1_8" @@ -1822,17 +2082,17 @@ ptlrpc_client_recv(struct ptlrpc_request *req) static inline int ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req) { - int rc; - - cfs_spin_lock(&req->rq_lock); - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && - req->rq_reply_deadline > cfs_time_current_sec()) { - cfs_spin_unlock(&req->rq_lock); - return 1; - } - rc = req->rq_receiving_reply || req->rq_must_unlink; - cfs_spin_unlock(&req->rq_lock); - return rc; + int rc; + + spin_lock(&req->rq_lock); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) { + spin_unlock(&req->rq_lock); + return 1; + } + rc = req->rq_receiving_reply || req->rq_must_unlink; + spin_unlock(&req->rq_lock); + return rc; } static inline void @@ -1899,12 +2159,28 @@ static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) static inline int ptlrpc_no_resend(struct ptlrpc_request *req) { - if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { - cfs_spin_lock(&req->rq_lock); - req->rq_no_resend = 1; - cfs_spin_unlock(&req->rq_lock); - } - return req->rq_no_resend; + if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { + spin_lock(&req->rq_lock); + req->rq_no_resend = 1; + spin_unlock(&req->rq_lock); + } + return req->rq_no_resend; +} + +static inline int +ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt) +{ + int at = AT_OFF ? 
0 : at_get(&svcpt->scp_at_estimate); + + return svcpt->scp_service->srv_watchdog_factor * + max_t(int, at, obd_timeout); +} + +static inline struct ptlrpc_service * +ptlrpc_req2svc(struct ptlrpc_request *req) +{ + LASSERT(req->rq_rqbd != NULL); + return req->rq_rqbd->rqbd_svcpt->scp_service; } /* ldlm/ldlm_lib.c */ @@ -2020,14 +2296,13 @@ static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} /** @} */ /* ptlrpc/llog_server.c */ -int llog_origin_handle_create(struct ptlrpc_request *req); +int llog_origin_handle_open(struct ptlrpc_request *req); int llog_origin_handle_destroy(struct ptlrpc_request *req); int llog_origin_handle_prev_block(struct ptlrpc_request *req); int llog_origin_handle_next_block(struct ptlrpc_request *req); int llog_origin_handle_read_header(struct ptlrpc_request *req); int llog_origin_handle_close(struct ptlrpc_request *req); int llog_origin_handle_cancel(struct ptlrpc_request *req); -int llog_catinfo(struct ptlrpc_request *req); /* ptlrpc/llog_client.c */ extern struct llog_operations llog_client_ops;