From: Jinshan Xiong <jinshan.xiong@intel.com>
Date: Mon, 12 Sep 2016 18:17:10 +0000 (-0700)
Subject: LU-8135 osc: limits the number of chunks in write RPC
X-Git-Tag: 2.8.59~45
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=7f2aae8d80a73de7408668bbe569d5f4d8553efe

LU-8135 osc: limits the number of chunks in write RPC

OSC has to make sure that it won't issue write RPCs with too many
chunks; otherwise it will cause ZFS to create transactions much
bigger than DMU_MAX_ACCESS in size, which will end up in write
failures.

Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Change-Id: Ib68b09afca35c253ef0a6b569f64f555e08bd11b
Reviewed-on: http://review.whamcloud.com/22369
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Patrick Farrell <paf@cray.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---

diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c
index bc511c3..590f9f9 100644
--- a/lustre/osc/osc_cache.c
+++ b/lustre/osc/osc_cache.c
@@ -1883,6 +1883,21 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
 	EXIT;
 }
 
+struct extent_rpc_data {
+	struct list_head	*erd_rpc_list;
+	unsigned int		erd_page_count;
+	unsigned int		erd_max_pages;
+	unsigned int		erd_max_chunks;
+};
+
+static inline unsigned osc_extent_chunks(const struct osc_extent *ext)
+{
+	struct client_obd *cli = osc_cli(ext->oe_obj);
+	unsigned ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
+
+	return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1;
+}
+
 /**
  * Try to add extent to one RPC. We need to think about the following things:
  * - # of pages must not be over max_pages_per_rpc
@@ -1890,10 +1905,10 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
  */
 static int try_to_add_extent_for_io(struct client_obd *cli,
 				    struct osc_extent *ext,
-				    struct list_head *rpclist,
-				    unsigned int *pc, unsigned int *max_pages)
+				    struct extent_rpc_data *data)
 {
 	struct osc_extent *tmp;
+	unsigned int chunk_count;
 	struct osc_async_page *oap = list_first_entry(&ext->oe_pages,
 						      struct osc_async_page,
 						      oap_pending_item);
@@ -1902,11 +1917,15 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
 	EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
 		ext);
 
-	*max_pages = max(ext->oe_mppr, *max_pages);
-	if (*pc + ext->oe_nr_pages > *max_pages)
+	chunk_count = osc_extent_chunks(ext);
+	if (chunk_count > data->erd_max_chunks)
+		RETURN(0);
+
+	data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages);
+	if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages)
 		RETURN(0);
 
-	list_for_each_entry(tmp, rpclist, oe_link) {
+	list_for_each_entry(tmp, data->erd_rpc_list, oe_link) {
 		struct osc_async_page *oap2;
 		oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page,
 					oap_pending_item);
@@ -1918,8 +1937,8 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
 		}
 #endif
 		if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) {
-			CDEBUG(D_CACHE, "Do not permit different type of IO"
-					" for a same RPC\n");
+			CDEBUG(D_CACHE, "Do not permit different types of IO "
+			       "in one RPC\n");
 			RETURN(0);
 		}
 
@@ -1932,12 +1951,41 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
 		break;
 	}
 
-	*pc += ext->oe_nr_pages;
-	list_move_tail(&ext->oe_link, rpclist);
+	data->erd_max_chunks -= chunk_count;
+	data->erd_page_count += ext->oe_nr_pages;
+	list_move_tail(&ext->oe_link, data->erd_rpc_list);
 	ext->oe_owner = current;
 	RETURN(1);
 }
 
+static inline unsigned osc_max_write_chunks(const struct client_obd *cli)
+{
+	/*
+	 * LU-8135:
+	 *
+	 * The maximum size of a single transaction is about 64MB in ZFS.
+	 * #define DMU_MAX_ACCESS (64 * 1024 * 1024)
+	 *
+	 * Since ZFS is a copy-on-write file system, a single dirty page in
+	 * a chunk will result in the rewrite of the whole chunk, therefore
+	 * an RPC shouldn't be allowed to contain too many chunks otherwise
+	 * it will make transaction size much bigger than 64MB, especially
+	 * with big block size for ZFS.
+	 *
+	 * This piece of code is to make sure that OSC won't send write RPCs
+	 * with too many chunks. The maximum chunk size that an RPC can cover
+	 * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally
+	 * OST should tell the client what the biggest transaction size is,
+	 * but it's good enough for now.
+	 *
+	 * This limitation doesn't apply to ldiskfs, which allows as many
+	 * chunks in one RPC as we want. However, it won't have any benefits
+	 * to have too many discontiguous pages in one RPC. Therefore, it
+	 * can only have 256 chunks at most in one RPC.
+	 */
+	return min(PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits, 256);
+}
+
 /**
  * In order to prevent multiple ptlrpcd from breaking contiguous extents,
  * get_write_extent() takes all appropriate extents in atomic.
@@ -1956,28 +2004,30 @@ static unsigned int get_write_extents(struct osc_object *obj,
 {
 	struct client_obd *cli = osc_cli(obj);
 	struct osc_extent *ext;
-	unsigned int page_count = 0;
-	unsigned int max_pages = cli->cl_max_pages_per_rpc;
+	struct extent_rpc_data data = {
+		.erd_rpc_list	= rpclist,
+		.erd_page_count	= 0,
+		.erd_max_pages	= cli->cl_max_pages_per_rpc,
+		.erd_max_chunks	= osc_max_write_chunks(cli),
+	};
 
 	LASSERT(osc_object_is_locked(obj));
 	while (!list_empty(&obj->oo_hp_exts)) {
 		ext = list_entry(obj->oo_hp_exts.next, struct osc_extent,
 				 oe_link);
 		LASSERT(ext->oe_state == OES_CACHE);
-		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
-					      &max_pages))
-			return page_count;
-		EASSERT(ext->oe_nr_pages <= max_pages, ext);
+		if (!try_to_add_extent_for_io(cli, ext, &data))
+			return data.erd_page_count;
+		EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
 	}
-	if (page_count == max_pages)
-		return page_count;
+	if (data.erd_page_count == data.erd_max_pages)
+		return data.erd_page_count;
 
 	while (!list_empty(&obj->oo_urgent_exts)) {
 		ext = list_entry(obj->oo_urgent_exts.next,
 				 struct osc_extent, oe_link);
-		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
-					      &max_pages))
-			return page_count;
+		if (!try_to_add_extent_for_io(cli, ext, &data))
+			return data.erd_page_count;
 
 		if (!ext->oe_intree)
 			continue;
@@ -1988,13 +2038,12 @@ static unsigned int get_write_extents(struct osc_object *obj,
 			     ext->oe_owner != NULL))
 				continue;
 
-			if (!try_to_add_extent_for_io(cli, ext, rpclist,
-						      &page_count, &max_pages))
-				return page_count;
+			if (!try_to_add_extent_for_io(cli, ext, &data))
+				return data.erd_page_count;
 		}
 	}
-	if (page_count == max_pages)
-		return page_count;
+	if (data.erd_page_count == data.erd_max_pages)
+		return data.erd_page_count;
 
 	ext = first_extent(obj);
 	while (ext != NULL) {
@@ -2005,13 +2054,12 @@ static unsigned int get_write_extents(struct osc_object *obj,
 			continue;
 		}
 
-		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
-					      &max_pages))
-			return page_count;
+		if (!try_to_add_extent_for_io(cli, ext, &data))
+			return data.erd_page_count;
 
 		ext = next_extent(ext);
 	}
-	return page_count;
+	return data.erd_page_count;
 }
 
 static int
@@ -2096,24 +2144,26 @@ __must_hold(osc)
 	struct osc_extent *ext;
 	struct osc_extent *next;
 	struct list_head rpclist = LIST_HEAD_INIT(rpclist);
-	unsigned int page_count = 0;
-	unsigned int max_pages = cli->cl_max_pages_per_rpc;
+	struct extent_rpc_data data = {
+		.erd_rpc_list	= &rpclist,
+		.erd_page_count	= 0,
+		.erd_max_pages	= cli->cl_max_pages_per_rpc,
+		.erd_max_chunks	= UINT_MAX,
+	};
 	int rc = 0;
 	ENTRY;
 
 	LASSERT(osc_object_is_locked(osc));
-	list_for_each_entry_safe(ext, next,
-				     &osc->oo_reading_exts, oe_link) {
+	list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) {
 		EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
-		if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count,
-					      &max_pages))
+		if (!try_to_add_extent_for_io(cli, ext, &data))
 			break;
 		osc_extent_state_set(ext, OES_RPC);
-		EASSERT(ext->oe_nr_pages <= max_pages, ext);
+		EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
 	}
-	LASSERT(page_count <= max_pages);
+	LASSERT(data.erd_page_count <= data.erd_max_pages);
 
-	osc_update_pending(osc, OBD_BRW_READ, -page_count);
+	osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count);
 
 	if (!list_empty(&rpclist)) {
 		osc_object_unlock(osc);