From 6aa10c6de1f1590a7357e179cf6a41773d96fbef Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Tue, 1 Jan 2013 16:44:41 +0800 Subject: [PATCH] LU-2424 ptlrpc: buffer utilization of rqbd This patch covered a few things: - dfferent request buffer size for different MDS service Size of MDS request/reply without LOV EA can be way smaller than request/reply size with LOV EQ This patch defined four different buffer size for different MDS services: MDS_MAXREQSIZE, MDS_MAXREPSIZE, MDS_LOV_MAXREQSIZE and MDS_LOV_MAXREPSIZE - add extra 128K to MDS_LOV_BUFSIZE MDS_LOV_BUFSIZE should be at least (max_reqsize + max sptlrpc payload size which is (MDS_LOV_MAXREQSIZE + 1024)), but if MDS_LOV_BUFSIZE is only a little larger than MDS_LOV_MAXREQSIZE, then it can only fit in one request even there are 48K bytes left in a rqbd, memory utilization is very low. In the meanwhile, size of rqbd can't be too large, because rqbd can't be reused until all requests fit in it have been processed and released, which means one long blocked request can prevent the rqbd bereused. Now we give extra 128K to buffer size, so even each rqbd is unlinked from LNet with unused 48K, buffer utilization will be above 70%. Xyratex-bug-id: MRP-689 Signed-off-by: Liang Zhen Change-Id: I19107918e62f9de59dd88652f3513234c30e56ce Reviewed-on: http://review.whamcloud.com/4940 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Fan Yong --- lustre/include/lustre_net.h | 64 ++++++++++++++++++++++++++++++++++++--------- lustre/mdt/mdt_mds.c | 14 +++++----- lustre/ptlrpc/service.c | 5 ++-- 3 files changed, 60 insertions(+), 23 deletions(-) diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index d3a51f4..f4a2e05 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -268,7 +268,7 @@ #define LDLM_NTHRS_MAX (cfs_num_online_cpus() == 1 ? 64 : 128) #define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT -#define LDLM_NBUFS (64 * cfs_num_online_cpus()) +#define LDLM_NBUFS 64 #define LDLM_BUFSIZE (8 * 1024) #define LDLM_MAXREQSIZE (5 * 1024) #define LDLM_MAXREPSIZE (1024) @@ -313,11 +313,14 @@ #define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT #define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS -#define MDS_NBUFS (64 * cfs_num_online_cpus()) +#define MDS_NBUFS 64 /** * Assume file name length = FNAME_MAX = 256 (true for ext3). - * path name length = PATH_MAX = 4096 - * LOV MD size max = EA_MAX = 48000 (2000 stripes) + * path name length = PATH_MAX = 4096 + * LOV MD size max = EA_MAX = 24 * 2000 + * (NB: 24 is size of lov_ost_data) + * LOV LOGCOOKIE size max = 32 * 2000 + * (NB: 32 is size of llog_cookie) * symlink: FNAME_MAX + PATH_MAX <- largest * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create) * rename: FNAME_MAX + FNAME_MAX @@ -326,23 +329,57 @@ * MDS_MAXREQSIZE ~= 4736 bytes = * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header - * or, for mds_close() and mds_reint_unlink() on a many-OST filesystem: - * = 9210 bytes = lustre_msg + mdt_body + 160 * (easize + cookiesize) * * Realistic size is about 512 bytes (20 character name + 128 char symlink), * except in the open case where there are a large number of OSTs in a LOV. */ -#define MDS_MAXREPSIZE max(10 * 1024, 362 + LOV_MAX_STRIPE_COUNT * 56) -#define MDS_MAXREQSIZE MDS_MAXREPSIZE +#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ +#define MDS_MAXREPSIZE (9 * 1024) /* >= 8300 */ -/** MDS_BUFSIZE = max_reqsize + max sptlrpc payload size */ -#define MDS_BUFSIZE (MDS_MAXREQSIZE + 1024) +/** + * MDS incoming request with LOV EA + * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate + */ +#define MDS_LOV_MAXREQSIZE max(MDS_MAXREQSIZE, \ + 362 + LOV_MAX_STRIPE_COUNT * 24) +/** + * MDS outgoing reply with LOV EA + * + * NB: max reply size Lustre 2.4+ client can get from old MDS is: + * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes + * + * but 2.4 or later MDS will never send reply with llog_cookie to any + * version client. This macro is defined for server side reply buffer size. + */ +#define MDS_LOV_MAXREPSIZE MDS_LOV_MAXREQSIZE + +/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */ +#define MDS_BUFSIZE (MDS_MAXREQSIZE + 1024) +/** + * MDS_LOV_BUFSIZE should be at least max_reqsize (with LOV EA) + + * max sptlrpc payload size, however, we need to allocate a much larger buffer + * for it because LNet requires each MD(rqbd) has at least MDS_LOVE_MAXREQSIZE + * bytes left to avoid dropping of maximum-sized incoming request. + * So if MDS_LOV_BUFSIZE is only a little larger than MDS_LOV_MAXREQSIZE, + * then it can only fit in one request even there are 48K bytes left in + * a rqbd, and memory utilization is very low. + * + * In the meanwhile, size of rqbd can't be too large, because rqbd can't be + * reused until all requests fit in it have been processed and released, + * which means one long blocked request can prevent the rqbd be reused. + * Now we give extra 128K to buffer size, so even each rqbd is unlinked + * from LNet with unused 48K, buffer utilization will be about 72%. + * Please check LU-2432 for details. + */ +/** MDS_LOV_BUFSIZE = max_reqsize (w/ LOV EA) + max sptlrpc payload size */ +#define MDS_LOV_BUFSIZE (MDS_LOV_MAXREQSIZE + (1 << 17)) /** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */ #define FLD_MAXREQSIZE (160) /** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */ #define FLD_MAXREPSIZE (152) +#define FLD_BUFSIZE (1 << 12) /** * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range + @@ -351,12 +388,13 @@ /** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */ #define SEQ_MAXREPSIZE (152) +#define SEQ_BUFSIZE (1 << 12) /** MGS threads must be >= 3, see bug 22458 comment #28 */ #define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) #define MGS_NTHRS_MAX 32 -#define MGS_NBUFS (64 * cfs_num_online_cpus()) +#define MGS_NBUFS 64 #define MGS_BUFSIZE (8 * 1024) #define MGS_MAXREQSIZE (7 * 1024) #define MGS_MAXREPSIZE (9 * 1024) @@ -413,7 +451,7 @@ #define OST_MAXREPSIZE (9 * 1024) -#define OST_NBUFS (64 * cfs_num_online_cpus()) +#define OST_NBUFS 64 #define OST_BUFSIZE (OST_MAXREQSIZE + 1024) /* Macro to hide a typecast. */ @@ -2580,7 +2618,7 @@ int ptlrpcd_queue_work(void *handler); /** @} */ struct ptlrpc_service_buf_conf { - /* nbufs is how many buffers to post */ + /* nbufs is buffers # to allocate when growing the pool */ unsigned int bc_nbufs; /* buffer size to post */ unsigned int bc_buf_size; diff --git a/lustre/mdt/mdt_mds.c b/lustre/mdt/mdt_mds.c index 4e5e7c5..6d5865e 100644 --- a/lustre/mdt/mdt_mds.c +++ b/lustre/mdt/mdt_mds.c @@ -429,9 +429,9 @@ static int mds_start_ptlrpc_service(struct mds_device *m) .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, .psc_buf = { .bc_nbufs = MDS_NBUFS, - .bc_buf_size = MDS_BUFSIZE, - .bc_req_max_size = MDS_MAXREQSIZE, - .bc_rep_max_size = MDS_MAXREPSIZE, + .bc_buf_size = MDS_LOV_BUFSIZE, + .bc_req_max_size = MDS_LOV_MAXREQSIZE, + .bc_rep_max_size = MDS_LOV_MAXREPSIZE, .bc_req_portal = MDS_REQUEST_PORTAL, .bc_rep_portal = MDC_REPLY_PORTAL, }, @@ -525,7 +525,7 @@ static int mds_start_ptlrpc_service(struct mds_device *m) .bc_nbufs = MDS_NBUFS, .bc_buf_size = MDS_BUFSIZE, .bc_req_max_size = MDS_MAXREQSIZE, - .bc_rep_max_size = MDS_MAXREPSIZE, + .bc_rep_max_size = MDS_LOV_MAXREPSIZE, .bc_req_portal = MDS_SETATTR_PORTAL, .bc_rep_portal = MDC_REPLY_PORTAL, }, @@ -609,7 +609,7 @@ static int mds_start_ptlrpc_service(struct mds_device *m) .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, .psc_buf = { .bc_nbufs = MDS_NBUFS, - .bc_buf_size = MDS_BUFSIZE, + .bc_buf_size = SEQ_BUFSIZE, .bc_req_max_size = SEQ_MAXREQSIZE, .bc_rep_max_size = SEQ_MAXREPSIZE, .bc_req_portal = SEQ_CONTROLLER_PORTAL, @@ -645,7 +645,7 @@ static int mds_start_ptlrpc_service(struct mds_device *m) .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, .psc_buf = { .bc_nbufs = MDS_NBUFS, - .bc_buf_size = MDS_BUFSIZE, + .bc_buf_size = SEQ_BUFSIZE, .bc_req_max_size = SEQ_MAXREQSIZE, .bc_rep_max_size = SEQ_MAXREPSIZE, .bc_req_portal = SEQ_METADATA_PORTAL, @@ -679,7 +679,7 @@ static int mds_start_ptlrpc_service(struct mds_device *m) .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, .psc_buf = { .bc_nbufs = MDS_NBUFS, - .bc_buf_size = MDS_BUFSIZE, + .bc_buf_size = FLD_BUFSIZE, .bc_req_max_size = FLD_MAXREQSIZE, .bc_rep_max_size = FLD_MAXREPSIZE, .bc_req_portal = FLD_REQUEST_PORTAL, diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 574cd63..2f0f3a5 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -783,9 +783,8 @@ ptlrpc_register_service(struct ptlrpc_service_conf *conf, CFS_INIT_LIST_HEAD(&service->srv_list); /* for safty of cleanup */ /* buffer configuration */ - service->srv_nbuf_per_group = test_req_buffer_pressure ? 1 : - max(conf->psc_buf.bc_nbufs / - service->srv_ncpts, 1U); + service->srv_nbuf_per_group = test_req_buffer_pressure ? + 1 : conf->psc_buf.bc_nbufs; service->srv_max_req_size = conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD; service->srv_buf_size = conf->psc_buf.bc_buf_size; -- 1.8.3.1