#define PTLRPC_MD_OPTIONS 0
/**
- * Define maxima for bulk I/O
- * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks)
- * these limits are system wide and not interface-local. */
-#define PTLRPC_MAX_BRW_BITS LNET_MTU_BITS
-#define PTLRPC_MAX_BRW_SIZE (1 << LNET_MTU_BITS)
-#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> CFS_PAGE_SHIFT)
+ * Max # of bulk operations in one request.
+ * In order for the client and server to properly negotiate the maximum
+ * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two
+ * value. The client is free to limit the actual RPC size for any bulk
+ * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */
+#define PTLRPC_BULK_OPS_BITS 2
+#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS)
+/**
+ * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and
+ * should not be used on the server at all. Otherwise, it imposes a
+ * protocol limitation on the maximum RPC size that can be used by any
+ * RPC sent to that server in the future. Instead, the server should
+ * use the negotiated per-client ocd_brw_size to determine the bulk
+ * RPC count. */
+#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1))
+
+/**
+ * Define maxima for bulk I/O.
+ *
+ * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT
+ * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the
+ * currently supported maximum between peers at connect via ocd_brw_size.
+ */
+#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS)
+#define PTLRPC_MAX_BRW_SIZE (1 << PTLRPC_MAX_BRW_BITS)
+#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> CFS_PAGE_SHIFT)
#define ONE_MB_BRW_SIZE (1 << LNET_MTU_BITS)
#define MD_MAX_BRW_SIZE (1 << LNET_MTU_BITS)
#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> CFS_PAGE_SHIFT)
-#define DT_MAX_BRW_SIZE (1 << LNET_MTU_BITS)
+#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE
+#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> CFS_PAGE_SHIFT)
#define OFD_MAX_BRW_SIZE (1 << LNET_MTU_BITS)
/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */
# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * CFS_PAGE_SIZE))
# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * CFS_PAGE_SIZE"
# endif
-# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU)
+# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT)
# error "PTLRPC_MAX_BRW_SIZE too big"
# endif
-# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV)
+# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT)
# error "PTLRPC_MAX_BRW_PAGES too big"
# endif
#endif /* __KERNEL__ */
* threads for each partition to keep service healthy, so total threads
* number should be 24 * 8 = 192.
*
- * So with these constants, threads number wil be at the similar level
+ * So with these constants, threads number will be at the similar level
* of old versions, unless target machine has over a hundred cores
*/
#define LDLM_THR_FACTOR 8
#define LDLM_NTHRS_BASE 24
#define LDLM_NTHRS_MAX (cfs_num_online_cpus() == 1 ? 64 : 128)
-#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT
-#define LDLM_NBUFS (64 * cfs_num_online_cpus())
-#define LDLM_BUFSIZE (8 * 1024)
-#define LDLM_MAXREQSIZE (5 * 1024)
-#define LDLM_MAXREPSIZE (1024)
+#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT
+#define LDLM_CLIENT_NBUFS 1
+#define LDLM_SERVER_NBUFS 64
+#define LDLM_BUFSIZE (8 * 1024)
+#define LDLM_MAXREQSIZE (5 * 1024)
+#define LDLM_MAXREPSIZE (1024)
/*
* MDS threads constants:
#define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT
#define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS
-#define MDS_NBUFS (64 * cfs_num_online_cpus())
+#define MDS_NBUFS 64
/**
* Assume file name length = FNAME_MAX = 256 (true for ext3).
- * path name length = PATH_MAX = 4096
- * LOV MD size max = EA_MAX = 48000 (2000 stripes)
+ * path name length = PATH_MAX = 4096
+ * LOV MD size max = EA_MAX = 24 * 2000
+ * (NB: 24 is size of lov_ost_data)
+ * LOV LOGCOOKIE size max = 32 * 2000
+ * (NB: 32 is size of llog_cookie)
* symlink: FNAME_MAX + PATH_MAX <- largest
* link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create)
* rename: FNAME_MAX + FNAME_MAX
* MDS_MAXREQSIZE ~= 4736 bytes =
* lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX
* MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header
- * or, for mds_close() and mds_reint_unlink() on a many-OST filesystem:
- * = 9210 bytes = lustre_msg + mdt_body + 160 * (easize + cookiesize)
*
* Realistic size is about 512 bytes (20 character name + 128 char symlink),
* except in the open case where there are a large number of OSTs in a LOV.
*/
-#define MDS_MAXREPSIZE max(10 * 1024, 362 + LOV_MAX_STRIPE_COUNT * 56)
-#define MDS_MAXREQSIZE MDS_MAXREPSIZE
+#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */
+#define MDS_MAXREPSIZE (9 * 1024) /* >= 8300 */
+
+/**
+ * MDS incoming request with LOV EA
+ * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate
+ */
+#define MDS_LOV_MAXREQSIZE max(MDS_MAXREQSIZE, \
+ 362 + LOV_MAX_STRIPE_COUNT * 24)
+/**
+ * MDS outgoing reply with LOV EA
+ *
+ * NB: max reply size Lustre 2.4+ client can get from old MDS is:
+ * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes
+ *
+ * but 2.4 or later MDS will never send reply with llog_cookie to any
+ * version client. This macro is defined for server side reply buffer size.
+ */
+#define MDS_LOV_MAXREPSIZE MDS_LOV_MAXREQSIZE
+
+/**
+ * The update request includes all of updates from the create, which might
+ * include linkea (4K maxim), together with other updates, we set it to 9K:
+ * lustre_msg + ptlrpc_body + UPDATE_BUF_SIZE (8K)
+ */
+#define MDS_OUT_MAXREQSIZE (9 * 1024)
+#define MDS_OUT_MAXREPSIZE MDS_MAXREPSIZE
+
+/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */
+#define MDS_BUFSIZE max_t(int, MDS_MAXREQSIZE + 1024, 8 * 1024)
+
+/**
+ * MDS_LOV_BUFSIZE should be at least max_reqsize (with LOV EA) +
+ * max sptlrpc payload size, however, we need to allocate a much larger buffer
+ * for it because LNet requires each MD(rqbd) has at least MDS_LOVE_MAXREQSIZE
+ * bytes left to avoid dropping of maximum-sized incoming request.
+ * So if MDS_LOV_BUFSIZE is only a little larger than MDS_LOV_MAXREQSIZE,
+ * then it can only fit in one request even there are 48K bytes left in
+ * a rqbd, and memory utilization is very low.
+ *
+ * In the meanwhile, size of rqbd can't be too large, because rqbd can't be
+ * reused until all requests fit in it have been processed and released,
+ * which means one long blocked request can prevent the rqbd be reused.
+ * Now we set request buffer size to 128K, so even each rqbd is unlinked
+ * from LNet with unused 48K, buffer utilization will be about 62%.
+ * Please check LU-2432 for details.
+ */
+/** MDS_LOV_BUFSIZE = max_reqsize (w/ LOV EA) + max sptlrpc payload size */
+#define MDS_LOV_BUFSIZE max_t(int, MDS_LOV_MAXREQSIZE + 1024, \
+ 128 * 1024)
-/** MDS_BUFSIZE = max_reqsize + max sptlrpc payload size */
-#define MDS_BUFSIZE (MDS_MAXREQSIZE + 1024)
+/**
+ * MDS_OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K) which is
+ * about 10K, for the same reason as MDS_LOV_BUFSIZE, we also give some
+ * extra bytes to each request buffer to improve buffer utilization rate.
+ */
+#define MDS_OUT_BUFSIZE max_t(int, MDS_OUT_MAXREQSIZE + 1024, \
+ 24 * 1024)
/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */
#define FLD_MAXREQSIZE (160)
/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */
#define FLD_MAXREPSIZE (152)
+#define FLD_BUFSIZE (1 << 12)
/**
* SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range +
/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */
#define SEQ_MAXREPSIZE (152)
+#define SEQ_BUFSIZE (1 << 12)
/** MGS threads must be >= 3, see bug 22458 comment #28 */
#define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1)
#define MGS_NTHRS_MAX 32
-#define MGS_NBUFS (64 * cfs_num_online_cpus())
+#define MGS_NBUFS 64
#define MGS_BUFSIZE (8 * 1024)
#define MGS_MAXREQSIZE (7 * 1024)
#define MGS_MAXREPSIZE (9 * 1024)
#define OSS_CR_NTHRS_BASE 8
#define OSS_CR_NTHRS_MAX 64
-#define OST_NBUFS (64 * cfs_num_online_cpus())
-#define OST_BUFSIZE (8 * 1024)
-
/**
- * OST_MAXREQSIZE ~= 4768 bytes =
- * lustre_msg + obdo + 16 * obd_ioobj + 256 * niobuf_remote
+ * OST_IO_MAXREQSIZE ~=
+ * lustre_msg + ptlrpc_body + obdo + obd_ioobj +
+ * DT_MAX_BRW_PAGES * niobuf_remote
*
* - single object with 16 pages is 512 bytes
- * - OST_MAXREQSIZE must be at least 1 page of cookies plus some spillover
+ * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover
+ * - Must be a multiple of 1024
+ * - actual size is about 18K
+ */
+#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \
+ sizeof(struct ptlrpc_body) + \
+ sizeof(struct obdo) + \
+ sizeof(struct obd_ioobj) + \
+ sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES)
+/**
+ * FIEMAP request can be 4K+ for now
+ */
+#define OST_MAXREQSIZE (5 * 1024)
+#define OST_IO_MAXREQSIZE max_t(int, OST_MAXREQSIZE, \
+ (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1))
+
+#define OST_MAXREPSIZE (9 * 1024)
+#define OST_IO_MAXREPSIZE OST_MAXREPSIZE
+
+#define OST_NBUFS 64
+/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */
+#define OST_BUFSIZE max_t(int, OST_MAXREQSIZE + 1024, 16 * 1024)
+/**
+ * OST_IO_MAXREQSIZE is 18K, giving extra 46K can increase buffer utilization
+ * rate of request buffer, please check comment of MDS_LOV_BUFSIZE for details.
*/
-#define OST_MAXREQSIZE (5 * 1024)
-#define OST_MAXREPSIZE (9 * 1024)
+#define OST_IO_BUFSIZE max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024)
/* Macro to hide a typecast. */
#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args)
#define BULK_PUT_SOURCE 3
/**
- * Definition of buk descriptor.
+ * Definition of bulk descriptor.
* Bulks are special "Two phase" RPCs where initial request message
* is sent first and it is followed bt a transfer (o receiving) of a large
* amount of data to be settled into pages referenced from the bulk descriptors.
* Another user is readpage for MDT.
*/
struct ptlrpc_bulk_desc {
- /** completed successfully */
- unsigned long bd_success:1;
- /** accessible to the network (network io potentially in progress) */
- unsigned long bd_network_rw:1;
- /** {put,get}{source,sink} */
- unsigned long bd_type:2;
- /** client side */
- unsigned long bd_registered:1;
- /** For serialization with callback */
+ /** completed with failure */
+ unsigned long bd_failure:1;
+ /** {put,get}{source,sink} */
+ unsigned long bd_type:2;
+ /** client side */
+ unsigned long bd_registered:1;
+ /** For serialization with callback */
spinlock_t bd_lock;
- /** Import generation when request for this bulk was sent */
- int bd_import_generation;
- /** Server side - export this bulk created for */
- struct obd_export *bd_export;
- /** Client side - import this bulk was sent on */
- struct obd_import *bd_import;
- /** LNet portal for this bulk */
- __u32 bd_portal;
- /** Back pointer to the request */
- struct ptlrpc_request *bd_req;
- cfs_waitq_t bd_waitq; /* server side only WQ */
- int bd_iov_count; /* # entries in bd_iov */
- int bd_max_iov; /* allocated size of bd_iov */
- int bd_nob; /* # bytes covered */
- int bd_nob_transferred; /* # bytes GOT/PUT */
-
- __u64 bd_last_xid;
-
- struct ptlrpc_cb_id bd_cbid; /* network callback info */
- lnet_handle_md_t bd_md_h; /* associated MD */
- lnet_nid_t bd_sender; /* stash event::sender */
+ /** Import generation when request for this bulk was sent */
+ int bd_import_generation;
+ /** LNet portal for this bulk */
+ __u32 bd_portal;
+ /** Server side - export this bulk created for */
+ struct obd_export *bd_export;
+ /** Client side - import this bulk was sent on */
+ struct obd_import *bd_import;
+ /** Back pointer to the request */
+ struct ptlrpc_request *bd_req;
+ cfs_waitq_t bd_waitq; /* server side only WQ */
+ int bd_iov_count; /* # entries in bd_iov */
+ int bd_max_iov; /* allocated size of bd_iov */
+ int bd_nob; /* # bytes covered */
+ int bd_nob_transferred; /* # bytes GOT/PUT */
+
+ __u64 bd_last_xid;
+
+ struct ptlrpc_cb_id bd_cbid; /* network callback info */
+ lnet_nid_t bd_sender; /* stash event::sender */
+ int bd_md_count; /* # valid entries in bd_mds */
+ int bd_md_max_brw; /* max entries in bd_mds */
+ /** array of associated MDs */
+ lnet_handle_md_t bd_mds[PTLRPC_BULK_OPS_COUNT];
#if defined(__KERNEL__)
- /*
- * encrypt iov, size is either 0 or bd_iov_count.
- */
- lnet_kiov_t *bd_enc_iov;
+ /*
+ * encrypt iov, size is either 0 or bd_iov_count.
+ */
+ lnet_kiov_t *bd_enc_iov;
- lnet_kiov_t bd_iov[0];
+ lnet_kiov_t bd_iov[0];
#else
- lnet_md_iovec_t bd_iov[0];
+ lnet_md_iovec_t bd_iov[0];
#endif
};
*/
#ifdef HAVE_SERVER_SUPPORT
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req,
- int npages, int type, int portal);
+ unsigned npages, unsigned max_brw,
+ unsigned type, unsigned portal);
int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc);
void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc);
LASSERT(desc != NULL);
spin_lock(&desc->bd_lock);
- rc = desc->bd_network_rw;
+ rc = desc->bd_md_count;
spin_unlock(&desc->bd_lock);
return rc;
}
return 0;
spin_lock(&desc->bd_lock);
- rc = desc->bd_network_rw;
+ rc = desc->bd_md_count;
spin_unlock(&desc->bd_lock);
return rc;
}
void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
- int npages, int type, int portal);
+ unsigned npages, unsigned max_brw,
+ unsigned type, unsigned portal);
void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin);
static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk)
{
/** @} */
struct ptlrpc_service_buf_conf {
- /* nbufs is how many buffers to post */
+ /* nbufs is buffers # to allocate when growing the pool */
unsigned int bc_nbufs;
/* buffer size to post */
unsigned int bc_buf_size;
* Pinger API (client side only)
* @{
*/
+extern int suppress_pings;
enum timeout_event {
TIMEOUT_GRANT = 1
};
/* ptlrpc/ptlrpcd.c */
void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force);
+void ptlrpcd_free(struct ptlrpcd_ctl *pc);
void ptlrpcd_wake(struct ptlrpc_request *req);
void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx);
void ptlrpcd_add_rqset(struct ptlrpc_request_set *set);