From: Li Xi Date: Mon, 13 Jul 2015 14:29:54 +0000 (+0800) Subject: LU-6770 osc: use global osc_rq_pool to reduce memory usage X-Git-Tag: 2.7.58~9 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=44c4f47c4d1f185831d4629cc9ca5ae5f50a8e07 LU-6770 osc: use global osc_rq_pool to reduce memory usage The per-osc request pools consume a lot of memory if there are hundreds of OSCs on one client. This becomes a critical problem if the client doesn't have sufficient memory for both the OSCs and applications. This patch replaces the per-osc request pools with a global pool, osc_rq_pool. The maximum memory usage of the pool is 5MB by default, and it can be set by a module parameter of OSC: "options osc osc_reqpool_mem_max=POOL_SIZE". The unit of POOL_SIZE is MB. If cl_max_rpcs_in_flight is the same for all OSCs, the memory usage of the OSC pool can be calculated as: Min(POOL_SIZE * 1M, (cl_max_rpcs_in_flight + 2) * OSC number * OST_IO_MAXREQSIZE) Also, this patch changes the allocation logic of OSC write requests. Allocation from osc_rq_pool will only be tried after normal allocation has failed. 
Signed-off-by: Wu Libin Signed-off-by: Wang Shilong Signed-off-by: Li Xi Change-Id: I1b0c522ade01dba11d860ab57f83af53619ce4ba Reviewed-on: http://review.whamcloud.com/15422 Tested-by: Jenkins Reviewed-by: Jinshan Xiong Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 784268c..f5fc224 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -313,8 +313,6 @@ struct obd_import { __u32 imp_msg_magic; __u32 imp_msghdr_flags; /* adjusted based on server capability */ - struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ - struct imp_at imp_at; /* adaptive timeout data */ time_t imp_last_reply_time; /* for health check */ }; diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 64419d0..05ab12e 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -740,7 +740,7 @@ struct ptlrpc_request_pool { /** Maximum message size that would fit into a rquest from this pool */ int prp_rq_size; /** Function to allocate more requests for this pool */ - void (*prp_populate)(struct ptlrpc_request_pool *, int); + int (*prp_populate)(struct ptlrpc_request_pool *, int); }; struct lu_context; @@ -2097,11 +2097,11 @@ void ptlrpc_set_destroy(struct ptlrpc_request_set *); void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); -void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); struct ptlrpc_request_pool * ptlrpc_init_rq_pool(int, int, - void (*populate_pool)(struct ptlrpc_request_pool *, int)); + int (*populate_pool)(struct ptlrpc_request_pool *, int)); void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, @@ -2676,7 +2676,6 @@ int 
llog_origin_handle_close(struct ptlrpc_request *req); /* ptlrpc/llog_client.c */ extern struct llog_operations llog_client_ops; - /** @} net */ #endif diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index e1d776c..d6b84fb 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -650,10 +650,6 @@ static inline void obd_cleanup_client_import(struct obd_device *obd) CDEBUG(D_CONFIG, "%s: client import never connected\n", obd->obd_name); ptlrpc_invalidate_import(imp); - if (imp->imp_rq_pool) { - ptlrpc_free_rq_pool(imp->imp_rq_pool); - imp->imp_rq_pool = NULL; - } client_destroy_import(imp); obd->u.cli.cl_import = NULL; } diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 257cf03..5a05331 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1673,7 +1673,6 @@ static void obd_zombie_export_add(struct obd_export *exp) { */ static void obd_zombie_import_add(struct obd_import *imp) { LASSERT(imp->imp_sec == NULL); - LASSERT(imp->imp_rq_pool == NULL); spin_lock(&obd_zombie_impexp_lock); LASSERT(list_empty(&imp->imp_zombie_chain)); zombies_count++; diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 73ec748..51bb482 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -96,8 +96,8 @@ static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, { struct obd_device *dev = ((struct seq_file *)file->private_data)->private; struct client_obd *cli = &dev->u.cli; - struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool; int val, rc; + int adding, added, req_count; rc = lprocfs_write_helper(buffer, count, &val); if (rc) @@ -107,8 +107,20 @@ static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file, return -ERANGE; LPROCFS_CLIMP_CHECK(dev); - if (pool && val > cli->cl_max_rpcs_in_flight) - pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight); + + adding = val - cli->cl_max_rpcs_in_flight; + req_count = atomic_read(&osc_pool_req_count); + if (adding > 0 
&& req_count < osc_reqpool_maxreqcount) { + /* + * There might be some race which will cause over-limit + * allocation, but it is fine. + */ + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = osc_rq_pool->prp_populate(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } spin_lock(&cli->cl_loi_list_lock); cli->cl_max_rpcs_in_flight = val; diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h index 842d5ac..985df64 100644 --- a/lustre/osc/osc_internal.h +++ b/lustre/osc/osc_internal.h @@ -39,6 +39,10 @@ #define OAP_MAGIC 8675309 +extern atomic_t osc_pool_req_count; +extern unsigned int osc_reqpool_maxreqcount; +extern struct ptlrpc_request_pool *osc_rq_pool; + struct lu_env; enum async_flags { diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 0a1b5f5..c6ccb68 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -49,9 +49,19 @@ #include #include #include +#include +#include #include "osc_internal.h" #include "osc_cl_internal.h" +atomic_t osc_pool_req_count; +unsigned int osc_reqpool_maxreqcount; +struct ptlrpc_request_pool *osc_rq_pool; + +/* max memory used for request pool, unit is MB */ +static unsigned int osc_reqpool_mem_max = 5; +module_param(osc_reqpool_mem_max, uint, 0444); + struct osc_brw_async_args { struct obdo *aa_oa; int aa_requested_nob; @@ -1000,15 +1010,15 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2)) RETURN(-EINVAL); /* Fatal */ - if ((cmd & OBD_BRW_WRITE) != 0) { - opc = OST_WRITE; - req = ptlrpc_request_alloc_pool(cli->cl_import, - cli->cl_import->imp_rq_pool, - &RQF_OST_BRW_WRITE); - } else { - opc = OST_READ; - req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ); - } + if ((cmd & OBD_BRW_WRITE) != 0) { + opc = OST_WRITE; + req = ptlrpc_request_alloc_pool(cli->cl_import, + osc_rq_pool, + &RQF_OST_BRW_WRITE); + } else { + opc = OST_READ; 
+ req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ); + } if (req == NULL) RETURN(-ENOMEM); @@ -2616,6 +2626,9 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) struct obd_type *type; void *handler; int rc; + int adding; + int added; + int req_count; ENTRY; rc = ptlrpcd_addref(); @@ -2672,15 +2685,20 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) ptlrpc_lprocfs_register_obd(obd); } - /* We need to allocate a few requests more, because - * brw_interpret tries to create new requests before freeing - * previous ones, Ideally we want to have 2x max_rpcs_in_flight - * reserved, but I'm afraid that might be too much wasted RAM - * in fact, so 2 is just my guess and still should work. */ - cli->cl_import->imp_rq_pool = - ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2, - OST_MAXREQSIZE, - ptlrpc_add_rqs_to_pool); + /* + * We try to control the total number of requests with an upper limit + * osc_reqpool_maxreqcount. There might be some race which will cause + * over-limit allocation, but it is fine. 
+ */ + req_count = atomic_read(&osc_pool_req_count); + if (req_count < osc_reqpool_maxreqcount) { + adding = cli->cl_max_rpcs_in_flight + 2; + if (req_count + adding > osc_reqpool_maxreqcount) + adding = osc_reqpool_maxreqcount - req_count; + + added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding); + atomic_add(added, &osc_pool_req_count); + } INIT_LIST_HEAD(&cli->cl_grant_shrink_list); ns_register_cancel(obd->obd_namespace, osc_cancel_weight); @@ -2767,12 +2785,12 @@ int osc_cleanup(struct obd_device *obd) } /* free memory of osc quota cache */ - osc_quota_cleanup(obd); + osc_quota_cleanup(obd); - rc = client_obd_cleanup(obd); + rc = client_obd_cleanup(obd); - ptlrpcd_decref(); - RETURN(rc); + ptlrpcd_decref(); + RETURN(rc); } int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) @@ -2813,7 +2831,10 @@ static int __init osc_init(void) { bool enable_proc = true; struct obd_type *type; + unsigned int reqpool_size; + unsigned int reqsize; int rc; + ENTRY; /* print an address of _any_ initialized kernel symbol from this @@ -2831,11 +2852,39 @@ static int __init osc_init(void) rc = class_register_type(&osc_obd_ops, NULL, enable_proc, NULL, LUSTRE_OSC_NAME, &osc_device_type); - if (rc) { - lu_kmem_fini(osc_caches); - RETURN(rc); - } + if (rc) + GOTO(out_kmem, rc); + + /* This is obviously too much memory, only prevent overflow here */ + if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0) + GOTO(out_type, rc = -EINVAL); + + reqpool_size = osc_reqpool_mem_max << 20; + + reqsize = 1; + while (reqsize < OST_IO_MAXREQSIZE) + reqsize = reqsize << 1; + + /* + * We don't enlarge the request count in OSC pool according to + * cl_max_rpcs_in_flight. The allocation from the pool will only be + * tried after normal allocation failed. So a small OSC pool won't + * cause much performance degradation in most cases. 
+ */ + osc_reqpool_maxreqcount = reqpool_size / reqsize; + atomic_set(&osc_pool_req_count, 0); + osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); + + if (osc_rq_pool != NULL) + GOTO(out, rc); + rc = -ENOMEM; +out_type: + class_unregister_type(LUSTRE_OSC_NAME); +out_kmem: + lu_kmem_fini(osc_caches); +out: RETURN(rc); } @@ -2843,6 +2892,7 @@ static void /*__exit*/ osc_exit(void) { class_unregister_type(LUSTRE_OSC_NAME); lu_kmem_fini(osc_caches); + ptlrpc_free_rq_pool(osc_rq_pool); } MODULE_AUTHOR("Sun Microsystems, Inc. "); diff --git a/lustre/osp/lwp_dev.c b/lustre/osp/lwp_dev.c index dccfef4..a5ecc88 100644 --- a/lustre/osp/lwp_dev.c +++ b/lustre/osp/lwp_dev.c @@ -363,11 +363,6 @@ static struct lu_device *lwp_device_fini(const struct lu_env *env, imp = m->lpd_obd->u.cli.cl_import; - if (imp->imp_rq_pool) { - ptlrpc_free_rq_pool(imp->imp_rq_pool); - imp->imp_rq_pool = NULL; - } - LASSERT(m->lpd_obd); ptlrpc_lprocfs_unregister_obd(m->lpd_obd); lprocfs_obd_cleanup(m->lpd_obd); diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c index b9ea13b..dae446f 100644 --- a/lustre/osp/osp_dev.c +++ b/lustre/osp/osp_dev.c @@ -1328,11 +1328,6 @@ static struct lu_device *osp_device_fini(const struct lu_env *env, imp = osp->opd_obd->u.cli.cl_import; - if (imp->imp_rq_pool) { - ptlrpc_free_rq_pool(imp->imp_rq_pool); - imp->imp_rq_pool = NULL; - } - if (osp->opd_symlink) lprocfs_remove(&osp->opd_symlink); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 81d1478..e03fcf2 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -526,7 +526,7 @@ EXPORT_SYMBOL(ptlrpc_free_rq_pool); /** * Allocates, initializes and adds \a num_rq requests to the pool \a pool */ -void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) +int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) { int i; int size = 1; @@ -548,11 +548,11 @@ void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int 
num_rq) spin_unlock(&pool->prp_lock); req = ptlrpc_request_cache_alloc(GFP_NOFS); if (!req) - return; + return i; OBD_ALLOC_LARGE(msg, size); if (!msg) { ptlrpc_request_cache_free(req); - return; + return i; } req->rq_reqbuf = msg; req->rq_reqbuf_len = size; @@ -561,7 +561,7 @@ void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) list_add_tail(&req->rq_list, &pool->prp_req_list); } spin_unlock(&pool->prp_lock); - return; + return num_rq; } EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); @@ -575,7 +575,7 @@ EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); */ struct ptlrpc_request_pool * ptlrpc_init_rq_pool(int num_rq, int msgsize, - void (*populate_pool)(struct ptlrpc_request_pool *, int)) + int (*populate_pool)(struct ptlrpc_request_pool *, int)) { struct ptlrpc_request_pool *pool; @@ -593,11 +593,6 @@ ptlrpc_init_rq_pool(int num_rq, int msgsize, populate_pool(pool, num_rq); - if (list_empty(&pool->prp_req_list)) { - /* have not allocated a single request for the pool */ - OBD_FREE(pool, sizeof(struct ptlrpc_request_pool)); - pool = NULL; - } return pool; } EXPORT_SYMBOL(ptlrpc_init_rq_pool); @@ -770,11 +765,10 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, { struct ptlrpc_request *request = NULL; - if (pool) - request = ptlrpc_prep_req_from_pool(pool); + request = ptlrpc_request_cache_alloc(GFP_NOFS); - if (!request) - request = ptlrpc_request_cache_alloc(GFP_NOFS); + if (!request && pool) + request = ptlrpc_prep_req_from_pool(pool); if (request) { ptlrpc_cli_req_init(request);