From: Li Xi <lixi@ddn.com>
Date: Mon, 13 Jul 2015 14:29:54 +0000 (+0800)
Subject: LU-6770 osc: use global osc_rq_pool to reduce memory usage
X-Git-Tag: 2.7.58~9
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=44c4f47c4d1f185831d4629cc9ca5ae5f50a8e07

LU-6770 osc: use global osc_rq_pool to reduce memory usage

The per-osc request pools consume a lot of memory if there are
hundreds of OSCs on one client. This will be a critical problem
if the client doesn't have sufficient memory for both OSCs and
applications.

This patch replaces the per-OSC request pools with a single global
pool, osc_rq_pool. Its total memory usage is limited to 5MB by
default; the limit can be changed with an OSC module parameter:
"options osc osc_reqpool_mem_max=POOL_SIZE", where POOL_SIZE is in
MB. If cl_max_rpcs_in_flight is the same for all OSCs, the memory
usage of the OSC pool can be calculated as:
Min(POOL_SIZE * 1M,
    (cl_max_rpcs_in_flight + 2) * OSC number * OST_IO_MAXREQSIZE)

This patch also changes the allocation logic for OSC write requests:
allocation from osc_rq_pool is now attempted only after the normal
allocation has failed.

Signed-off-by: Wu Libin <lwu@ddn.com>
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Signed-off-by: Li Xi <lixi@ddn.com>
Change-Id: I1b0c522ade01dba11d860ab57f83af53619ce4ba
Reviewed-on: http://review.whamcloud.com/15422
Tested-by: Jenkins
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---

diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h
index 784268c..f5fc224 100644
--- a/lustre/include/lustre_import.h
+++ b/lustre/include/lustre_import.h
@@ -313,8 +313,6 @@ struct obd_import {
         __u32                     imp_msg_magic;
         __u32                     imp_msghdr_flags;       /* adjusted based on server capability */
 
-        struct ptlrpc_request_pool *imp_rq_pool;          /* emergency request pool */
-
         struct imp_at             imp_at;                 /* adaptive timeout data */
         time_t                    imp_last_reply_time;    /* for health check */
 };
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index 64419d0..05ab12e 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -740,7 +740,7 @@ struct ptlrpc_request_pool {
 	/** Maximum message size that would fit into a rquest from this pool */
 	int			prp_rq_size;
 	/** Function to allocate more requests for this pool */
-	void (*prp_populate)(struct ptlrpc_request_pool *, int);
+	int (*prp_populate)(struct ptlrpc_request_pool *, int);
 };
 
 struct lu_context;
@@ -2097,11 +2097,11 @@ void ptlrpc_set_destroy(struct ptlrpc_request_set *);
 void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
 
 void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
-void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
+int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
 
 struct ptlrpc_request_pool *
 ptlrpc_init_rq_pool(int, int,
-                    void (*populate_pool)(struct ptlrpc_request_pool *, int));
+		    int (*populate_pool)(struct ptlrpc_request_pool *, int));
 
 void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
 struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
@@ -2676,7 +2676,6 @@ int llog_origin_handle_close(struct ptlrpc_request *req);
 
 /* ptlrpc/llog_client.c */
 extern struct llog_operations llog_client_ops;
-
 /** @} net */
 
 #endif
diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h
index e1d776c..d6b84fb 100644
--- a/lustre/include/obd_class.h
+++ b/lustre/include/obd_class.h
@@ -650,10 +650,6 @@ static inline void obd_cleanup_client_import(struct obd_device *obd)
                 CDEBUG(D_CONFIG, "%s: client import never connected\n",
                        obd->obd_name);
                 ptlrpc_invalidate_import(imp);
-                if (imp->imp_rq_pool) {
-                        ptlrpc_free_rq_pool(imp->imp_rq_pool);
-                        imp->imp_rq_pool = NULL;
-                }
                 client_destroy_import(imp);
                 obd->u.cli.cl_import = NULL;
         }
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c
index 257cf03..5a05331 100644
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -1673,7 +1673,6 @@ static void obd_zombie_export_add(struct obd_export *exp) {
  */
 static void obd_zombie_import_add(struct obd_import *imp) {
 	LASSERT(imp->imp_sec == NULL);
-	LASSERT(imp->imp_rq_pool == NULL);
 	spin_lock(&obd_zombie_impexp_lock);
 	LASSERT(list_empty(&imp->imp_zombie_chain));
 	zombies_count++;
diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c
index 73ec748..51bb482 100644
--- a/lustre/osc/lproc_osc.c
+++ b/lustre/osc/lproc_osc.c
@@ -96,8 +96,8 @@ static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file,
 {
 	struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
         struct client_obd *cli = &dev->u.cli;
-        struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool;
         int val, rc;
+	int adding, added, req_count;
 
         rc = lprocfs_write_helper(buffer, count, &val);
         if (rc)
@@ -107,8 +107,20 @@ static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file,
                 return -ERANGE;
 
         LPROCFS_CLIMP_CHECK(dev);
-        if (pool && val > cli->cl_max_rpcs_in_flight)
-                pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight);
+
+	adding = val - cli->cl_max_rpcs_in_flight;
+	req_count = atomic_read(&osc_pool_req_count);
+	if (adding > 0 && req_count < osc_reqpool_maxreqcount) {
+		/*
+		 * There might be some race which will cause over-limit
+		 * allocation, but it is fine.
+		 */
+		if (req_count + adding > osc_reqpool_maxreqcount)
+			adding = osc_reqpool_maxreqcount - req_count;
+
+		added = osc_rq_pool->prp_populate(osc_rq_pool, adding);
+		atomic_add(added, &osc_pool_req_count);
+	}
 
 	spin_lock(&cli->cl_loi_list_lock);
 	cli->cl_max_rpcs_in_flight = val;
diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h
index 842d5ac..985df64 100644
--- a/lustre/osc/osc_internal.h
+++ b/lustre/osc/osc_internal.h
@@ -39,6 +39,10 @@
 
 #define OAP_MAGIC 8675309
 
+extern atomic_t osc_pool_req_count;
+extern unsigned int osc_reqpool_maxreqcount;
+extern struct ptlrpc_request_pool *osc_rq_pool;
+
 struct lu_env;
 
 enum async_flags {
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 0a1b5f5..c6ccb68 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -49,9 +49,19 @@
 #include <lustre_param.h>
 #include <lustre_fid.h>
 #include <obd_class.h>
+#include <obd.h>
+#include <lustre_net.h>
 #include "osc_internal.h"
 #include "osc_cl_internal.h"
 
+atomic_t osc_pool_req_count;
+unsigned int osc_reqpool_maxreqcount;
+struct ptlrpc_request_pool *osc_rq_pool;
+
+/* max memory used for request pool, unit is MB */
+static unsigned int osc_reqpool_mem_max = 5;
+module_param(osc_reqpool_mem_max, uint, 0444);
+
 struct osc_brw_async_args {
 	struct obdo		 *aa_oa;
 	int			  aa_requested_nob;
@@ -1000,15 +1010,15 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa,
         if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
                 RETURN(-EINVAL); /* Fatal */
 
-        if ((cmd & OBD_BRW_WRITE) != 0) {
-                opc = OST_WRITE;
-                req = ptlrpc_request_alloc_pool(cli->cl_import,
-                                                cli->cl_import->imp_rq_pool,
-                                                &RQF_OST_BRW_WRITE);
-        } else {
-                opc = OST_READ;
-                req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
-        }
+	if ((cmd & OBD_BRW_WRITE) != 0) {
+		opc = OST_WRITE;
+		req = ptlrpc_request_alloc_pool(cli->cl_import,
+						osc_rq_pool,
+						&RQF_OST_BRW_WRITE);
+	} else {
+		opc = OST_READ;
+		req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
+	}
         if (req == NULL)
                 RETURN(-ENOMEM);
 
@@ -2616,6 +2626,9 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 	struct obd_type	  *type;
 	void		  *handler;
 	int		   rc;
+	int		   adding;
+	int		   added;
+	int		   req_count;
 	ENTRY;
 
 	rc = ptlrpcd_addref();
@@ -2672,15 +2685,20 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 		ptlrpc_lprocfs_register_obd(obd);
 	}
 
-	/* We need to allocate a few requests more, because
-	 * brw_interpret tries to create new requests before freeing
-	 * previous ones, Ideally we want to have 2x max_rpcs_in_flight
-	 * reserved, but I'm afraid that might be too much wasted RAM
-	 * in fact, so 2 is just my guess and still should work. */
-	cli->cl_import->imp_rq_pool =
-		ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
-				    OST_MAXREQSIZE,
-				    ptlrpc_add_rqs_to_pool);
+	/*
+	 * We try to control the total number of requests with an upper limit
+	 * osc_reqpool_maxreqcount. There might be some race which will cause
+	 * over-limit allocation, but it is fine.
+	 */
+	req_count = atomic_read(&osc_pool_req_count);
+	if (req_count < osc_reqpool_maxreqcount) {
+		adding = cli->cl_max_rpcs_in_flight + 2;
+		if (req_count + adding > osc_reqpool_maxreqcount)
+			adding = osc_reqpool_maxreqcount - req_count;
+
+		added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding);
+		atomic_add(added, &osc_pool_req_count);
+	}
 
 	INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
 	ns_register_cancel(obd->obd_namespace, osc_cancel_weight);
@@ -2767,12 +2785,12 @@ int osc_cleanup(struct obd_device *obd)
 	}
 
         /* free memory of osc quota cache */
-        osc_quota_cleanup(obd);
+	osc_quota_cleanup(obd);
 
-        rc = client_obd_cleanup(obd);
+	rc = client_obd_cleanup(obd);
 
-        ptlrpcd_decref();
-        RETURN(rc);
+	ptlrpcd_decref();
+	RETURN(rc);
 }
 
 int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
@@ -2813,7 +2831,10 @@ static int __init osc_init(void)
 {
 	bool enable_proc = true;
 	struct obd_type *type;
+	unsigned int reqpool_size;
+	unsigned int reqsize;
 	int rc;
+
 	ENTRY;
 
         /* print an address of _any_ initialized kernel symbol from this
@@ -2831,11 +2852,39 @@ static int __init osc_init(void)
 
 	rc = class_register_type(&osc_obd_ops, NULL, enable_proc, NULL,
 				 LUSTRE_OSC_NAME, &osc_device_type);
-        if (rc) {
-                lu_kmem_fini(osc_caches);
-                RETURN(rc);
-        }
+	if (rc)
+		GOTO(out_kmem, rc);
+
+	/* This is obviously too much memory, only prevent overflow here */
+	if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0)
+		GOTO(out_type, rc = -EINVAL);
+
+	reqpool_size = osc_reqpool_mem_max << 20;
+
+	reqsize = 1;
+	while (reqsize < OST_IO_MAXREQSIZE)
+		reqsize = reqsize << 1;
+
+	/*
+	 * We don't enlarge the request count in OSC pool according to
+	 * cl_max_rpcs_in_flight. The allocation from the pool will only be
+	 * tried after normal allocation failed. So a small OSC pool won't
+	 * cause much performance degradation in most cases.
+	 */
+	osc_reqpool_maxreqcount = reqpool_size / reqsize;
 
+	atomic_set(&osc_pool_req_count, 0);
+	osc_rq_pool = ptlrpc_init_rq_pool(0, OST_IO_MAXREQSIZE,
+					  ptlrpc_add_rqs_to_pool);
+
+	if (osc_rq_pool != NULL)
+		GOTO(out, rc);
+	rc = -ENOMEM;
+out_type:
+	class_unregister_type(LUSTRE_OSC_NAME);
+out_kmem:
+	lu_kmem_fini(osc_caches);
+out:
 	RETURN(rc);
 }
 
@@ -2843,6 +2892,7 @@ static void /*__exit*/ osc_exit(void)
 {
 	class_unregister_type(LUSTRE_OSC_NAME);
 	lu_kmem_fini(osc_caches);
+	ptlrpc_free_rq_pool(osc_rq_pool);
 }
 
 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
diff --git a/lustre/osp/lwp_dev.c b/lustre/osp/lwp_dev.c
index dccfef4..a5ecc88 100644
--- a/lustre/osp/lwp_dev.c
+++ b/lustre/osp/lwp_dev.c
@@ -363,11 +363,6 @@ static struct lu_device *lwp_device_fini(const struct lu_env *env,
 
 	imp = m->lpd_obd->u.cli.cl_import;
 
-	if (imp->imp_rq_pool) {
-		ptlrpc_free_rq_pool(imp->imp_rq_pool);
-		imp->imp_rq_pool = NULL;
-	}
-
 	LASSERT(m->lpd_obd);
 	ptlrpc_lprocfs_unregister_obd(m->lpd_obd);
 	lprocfs_obd_cleanup(m->lpd_obd);
diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c
index b9ea13b..dae446f 100644
--- a/lustre/osp/osp_dev.c
+++ b/lustre/osp/osp_dev.c
@@ -1328,11 +1328,6 @@ static struct lu_device *osp_device_fini(const struct lu_env *env,
 
 	imp = osp->opd_obd->u.cli.cl_import;
 
-	if (imp->imp_rq_pool) {
-		ptlrpc_free_rq_pool(imp->imp_rq_pool);
-		imp->imp_rq_pool = NULL;
-	}
-
 	if (osp->opd_symlink)
 		lprocfs_remove(&osp->opd_symlink);
 
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 81d1478..e03fcf2 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -526,7 +526,7 @@ EXPORT_SYMBOL(ptlrpc_free_rq_pool);
 /**
  * Allocates, initializes and adds \a num_rq requests to the pool \a pool
  */
-void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
+int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
 {
         int i;
         int size = 1;
@@ -548,11 +548,11 @@ void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
 		spin_unlock(&pool->prp_lock);
 		req = ptlrpc_request_cache_alloc(GFP_NOFS);
 		if (!req)
-			return;
+			return i;
 		OBD_ALLOC_LARGE(msg, size);
 		if (!msg) {
 			ptlrpc_request_cache_free(req);
-			return;
+			return i;
                 }
                 req->rq_reqbuf = msg;
                 req->rq_reqbuf_len = size;
@@ -561,7 +561,7 @@ void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
 		list_add_tail(&req->rq_list, &pool->prp_req_list);
 	}
 	spin_unlock(&pool->prp_lock);
-	return;
+	return num_rq;
 }
 EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
 
@@ -575,7 +575,7 @@ EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
  */
 struct ptlrpc_request_pool *
 ptlrpc_init_rq_pool(int num_rq, int msgsize,
-		    void (*populate_pool)(struct ptlrpc_request_pool *, int))
+		    int (*populate_pool)(struct ptlrpc_request_pool *, int))
 {
 	struct ptlrpc_request_pool *pool;
 
@@ -593,11 +593,6 @@ ptlrpc_init_rq_pool(int num_rq, int msgsize,
 
 	populate_pool(pool, num_rq);
 
-	if (list_empty(&pool->prp_req_list)) {
-		/* have not allocated a single request for the pool */
-		OBD_FREE(pool, sizeof(struct ptlrpc_request_pool));
-		pool = NULL;
-	}
 	return pool;
 }
 EXPORT_SYMBOL(ptlrpc_init_rq_pool);
@@ -770,11 +765,10 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
 {
 	struct ptlrpc_request *request = NULL;
 
-	if (pool)
-		request = ptlrpc_prep_req_from_pool(pool);
+	request = ptlrpc_request_cache_alloc(GFP_NOFS);
 
-	if (!request)
-		request = ptlrpc_request_cache_alloc(GFP_NOFS);
+	if (!request && pool)
+		request = ptlrpc_prep_req_from_pool(pool);
 
 	if (request) {
 		ptlrpc_cli_req_init(request);