From 783de3e99d0a2185d26bc67a9ca852b08a760cf6 Mon Sep 17 00:00:00 2001
From: Li Wei <wei.g.li@intel.com>
Date: Wed, 13 Mar 2013 23:11:34 +0800
Subject: [PATCH] LU-2951 mdt: Increase bc_req_max_size for MDS_REQUEST_PORTAL

Large EA tests triggered error messages like this on MDSs:

  10:38:21:LNetError: 3022:0:(lib-ptl.c:190:lnet_try_match_md())
  Matching packet from 12345-10.10.17.9@tcp, match 1429230968490588
  length 65928 too big: 117674 left, 49386 allowed

These were the REINT_SETXATTR requests carrying large EA values.  They
were dropped because the MDSs did not expect request buffers larger
than 49386 bytes.  This patch increases bc_req_max_size (and
bc_rep_max_size) for MDS_REQUEST_PORTAL (and MDC_REPLY_PORTAL) to
accommodate REINT_SETXATTR requests (and MDS_GETXATTR replies) with
255-byte names and 65536-byte values (or 65536-byte lists of EA
names).

Change-Id: Ifdcda6d3e91aa6115e5cafd5abd6c89b15485020
Signed-off-by: Li Wei <wei.g.li@intel.com>
Reviewed-on: http://review.whamcloud.com/5703
Tested-by: Hudson
Reviewed-by: Liang Zhen <liang.zhen@intel.com>
Reviewed-by: Nathaniel Clark <nathaniel.l.clark@intel.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---
 lustre/include/lustre_net.h | 53 +++++++++++++++++++++++++++++++--------------
 lustre/mdt/mdt_mds.c        |  6 ++---
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index cd6225a..0c8a806 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -315,6 +315,7 @@
 #define MDS_OTHR_NTHRS_MAX	MDS_MAX_OTHR_THREADS
 
 #define MDS_NBUFS		64
+
 /**
  * Assume file name length = FNAME_MAX = 256 (true for ext3).
  *	  path name length = PATH_MAX = 4096
@@ -355,6 +356,26 @@
 #define MDS_LOV_MAXREPSIZE	MDS_LOV_MAXREQSIZE
 
 /**
+ * This is the size of a maximum REINT_SETXATTR request:
+ *
+ *   lustre_msg		 56 (32 + 4 x 5 + 4)
+ *   ptlrpc_body	184
+ *   mdt_rec_setxattr	136
+ *   lustre_capa	120
+ *   name		256 (XATTR_NAME_MAX)
+ *   value	      65536 (XATTR_SIZE_MAX)
+ */
+#define MDS_EA_MAXREQSIZE	66288
+
+/**
+ * These are the maximum request and reply sizes (rounded up to 1 KB
+ * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDC_REPLY_PORTAL.
+ */
+#define MDS_REG_MAXREQSIZE	(((max(MDS_EA_MAXREQSIZE, \
+				       MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10)
+#define MDS_REG_MAXREPSIZE	MDS_REG_MAXREQSIZE
+
+/**
  * The update request includes all of updates from the create, which might
  * include linkea (4K maxim), together with other updates, we set it to 9K:
  * lustre_msg + ptlrpc_body + UPDATE_BUF_SIZE (8K)
@@ -363,35 +384,35 @@
 #define MDS_OUT_MAXREPSIZE	MDS_MAXREPSIZE
 
 /** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */
-#define MDS_BUFSIZE		max_t(int, MDS_MAXREQSIZE + 1024, 8 * 1024)
+#define MDS_BUFSIZE		max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+				    8 * 1024)
 
 /**
- * MDS_LOV_BUFSIZE should be at least max_reqsize (with LOV EA) +
- * max sptlrpc payload size, however, we need to allocate a much larger buffer
- * for it because LNet requires each MD(rqbd) has at least MDS_LOVE_MAXREQSIZE
- * bytes left to avoid dropping of maximum-sized incoming request.
- * So if MDS_LOV_BUFSIZE is only a little larger than MDS_LOV_MAXREQSIZE,
- * then it can only fit in one request even there are 48K bytes left in
- * a rqbd, and memory utilization is very low.
+ * MDS_REG_BUFSIZE should at least be MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD.
+ * However, we need to allocate a much larger buffer for it because LNet
+ * requires each MD(rqbd) has at least MDS_REG_MAXREQSIZE bytes left to avoid
+ * dropping of maximum-sized incoming request.  So if MDS_REG_BUFSIZE is only a
+ * little larger than MDS_REG_MAXREQSIZE, then it can only fit in one request
+ * even if there are about MDS_REG_MAXREQSIZE bytes left in a rqbd, and memory
+ * utilization is very low.
  *
  * In the meanwhile, size of rqbd can't be too large, because rqbd can't be
  * reused until all requests fit in it have been processed and released,
  * which means one long blocked request can prevent the rqbd be reused.
- * Now we set request buffer size to 128K, so even each rqbd is unlinked
- * from LNet with unused 48K, buffer utilization will be about 62%.
+ * Now we set request buffer size to 160 KB, so even if each rqbd is unlinked
+ * from LNet with 65 KB unused, buffer utilization will be about 59%.
  * Please check LU-2432 for details.
  */
-/** MDS_LOV_BUFSIZE = max_reqsize (w/ LOV EA) + max sptlrpc payload size */
-#define MDS_LOV_BUFSIZE		max_t(int, MDS_LOV_MAXREQSIZE + 1024, \
-					   128 * 1024)
+#define MDS_REG_BUFSIZE		max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+				    160 * 1024)
 
 /**
  * MDS_OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K) which is
- * about 10K, for the same reason as MDS_LOV_BUFSIZE, we also give some
+ * about 10K, for the same reason as MDS_REG_BUFSIZE, we also give some
  * extra bytes to each request buffer to improve buffer utilization rate.
   */
-#define MDS_OUT_BUFSIZE		max_t(int, MDS_OUT_MAXREQSIZE + 1024, \
-					   24 * 1024)
+#define MDS_OUT_BUFSIZE		max(MDS_OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+				    24 * 1024)
 
 /** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */
 #define FLD_MAXREQSIZE  (160)
diff --git a/lustre/mdt/mdt_mds.c b/lustre/mdt/mdt_mds.c
index 08f4f11..4ec8a51 100644
--- a/lustre/mdt/mdt_mds.c
+++ b/lustre/mdt/mdt_mds.c
@@ -429,9 +429,9 @@ static int mds_start_ptlrpc_service(struct mds_device *m)
 		.psc_watchdog_factor	= MDT_SERVICE_WATCHDOG_FACTOR,
 		.psc_buf		= {
 			.bc_nbufs		= MDS_NBUFS,
-			.bc_buf_size		= MDS_LOV_BUFSIZE,
-			.bc_req_max_size	= MDS_LOV_MAXREQSIZE,
-			.bc_rep_max_size	= MDS_LOV_MAXREPSIZE,
+			.bc_buf_size		= MDS_REG_BUFSIZE,
+			.bc_req_max_size	= MDS_REG_MAXREQSIZE,
+			.bc_rep_max_size	= MDS_REG_MAXREPSIZE,
 			.bc_req_portal		= MDS_REQUEST_PORTAL,
 			.bc_rep_portal		= MDC_REPLY_PORTAL,
 		},
-- 
1.8.3.1