From: Qian Yingjin <qian@ddn.com>
Date: Fri, 14 May 2021 14:53:52 +0000 (+0800)
Subject: LU-14139 ptlrpc: grow PtlRPC properly when prepare sub request
X-Git-Tag: 2.15.56~133
X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=5a2dfd36f9c2b6c10ab7ba44b0e9e86372623fde;p=fs%2Flustre-release.git

LU-14139 ptlrpc: grow PtlRPC properly when prepare sub request

This patch prepares and properly grows the PtlRPC reply buffer
for a SUB batch request in req_capsule_server_pack().

It also caps the reply buffer size at
BUT_MAXREPSIZE = (1000 * 1024).

Signed-off-by: Qian Yingjin <qian@ddn.com>
Change-Id: I4277974b3b0e9cd19fd0d18ae7c029cccaa9c558
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/43707
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Mikhail Pershin <mpershin@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---

diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index 42ee5bf..6b700eb 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -373,6 +373,9 @@
 #define OUT_MAXREQSIZE	(1000 * 1024)
 #define OUT_MAXREPSIZE	MDS_MAXREPSIZE
 
+#define BUT_MAXREQSIZE	OUT_MAXREQSIZE
+#define BUT_MAXREPSIZE	BUT_MAXREQSIZE
+
 /** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */
 #define MDS_BUFSIZE		max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
 				    8 * 1024)
diff --git a/lustre/mdc/mdc_batch.c b/lustre/mdc/mdc_batch.c
index d997e42..c2ebd17 100644
--- a/lustre/mdc/mdc_batch.c
+++ b/lustre/mdc/mdc_batch.c
@@ -138,7 +138,7 @@ static int mdc_batch_getattr_pack(struct batch_update_head *head,
 	req_capsule_set_size(&pill, &RMF_ACL, RCL_SERVER,
 			     LUSTRE_POSIX_ACL_MAX_SIZE_OLD);
 	req_capsule_set_size(&pill, &RMF_DEFAULT_MDT_MD, RCL_SERVER,
-			     sizeof(struct lmv_user_md));
+			     /*sizeof(struct lmv_user_md)*/MIN_MD_SIZE);
 
 	if (have_secctx) {
 		char *secctx_name;
diff --git a/lustre/mdt/mdt_batch.c b/lustre/mdt/mdt_batch.c
index cfd6aa1..2dff768 100644
--- a/lustre/mdt/mdt_batch.c
+++ b/lustre/mdt/mdt_batch.c
@@ -154,6 +154,7 @@ int mdt_batch(struct tgt_session_info *tsi)
 	__u32 update_buf_count;
 	__u32 packed_replen;
 	void **update_bufs;
+	bool grown = false;
 	int buh_size;
 	int rc;
 	int i;
@@ -326,7 +327,11 @@ next:
 			 * As @repmsg may be changed if the reply buffer is
 			 * too small to grow, thus it needs to reload it here.
 			 */
-			repmsg = pill->rc_repmsg;
+			if (repmsg != pill->rc_repmsg) {
+				repmsg = pill->rc_repmsg;
+				grown = true;
+			}
+
 			repmsg->lm_result = rc;
 			mdt_thread_info_reset(info);
 
@@ -342,9 +347,17 @@ next:
 		req_capsule_shrink(&req->rq_pill, &RMF_BUT_REPLY,
 				   packed_replen, RCL_SERVER);
 out:
-	if (reply != NULL)
+	if (reply != NULL) {
+		if (grown) {
+			reply = req_capsule_server_get(&req->rq_pill,
+						       &RMF_BUT_REPLY);
+			if (reply == NULL)
+				GOTO(out_free, rc = -EPROTO);
+		}
 		reply->burp_count = handled_update_count;
+	}
 
+out_free:
 	if (update_bufs != NULL) {
 		if (bub != NULL) {
 			for (i = 0; i < update_buf_count; i++, bub++) {
diff --git a/lustre/ptlrpc/batch.c b/lustre/ptlrpc/batch.c
index d678211..6381cc0 100644
--- a/lustre/ptlrpc/batch.c
+++ b/lustre/ptlrpc/batch.c
@@ -365,11 +365,16 @@ static int batch_update_request_fini(struct batch_update_head *head,
 			 */
 			repmsg = NULL;
 			rc1 = -ECANCELED;
+			/*
+			 * TODO: resend the unfinished sub request when the
+			 * return code is -EOVERFLOW.
+			 */
 		}
 
 		if (ouc->ouc_interpret != NULL)
 			ouc->ouc_interpret(req, repmsg, ouc, rc1);
 
+		index++;
 		object_update_callback_fini(ouc);
 		if (rc == 0 && rc1 < 0)
 			rc = rc1;
diff --git a/lustre/ptlrpc/layout.c b/lustre/ptlrpc/layout.c
index d8057b8..4fb7b28 100644
--- a/lustre/ptlrpc/layout.c
+++ b/lustre/ptlrpc/layout.c
@@ -2023,17 +2023,59 @@ int req_capsule_server_pack(struct req_capsule *pill)
 				   count, fmt->rf_name);
 		}
 	} else { /* SUB request */
+		struct ptlrpc_request *req = pill->rc_req;
+		__u32 used_len;
 		__u32 msg_len;
 
 		msg_len = lustre_msg_size_v2(count, pill->rc_area[RCL_SERVER]);
-		if (msg_len > pill->rc_reqmsg->lm_repsize) {
+		used_len = (char *)pill->rc_repmsg - (char *)req->rq_repmsg;
+		/* Overflow the reply buffer */
+		if (used_len + msg_len > req->rq_replen) {
+			__u32 len;
+			__u32 max;
+
+			if (!req_capsule_has_field(&req->rq_pill,
+						   &RMF_BUT_REPLY, RCL_SERVER))
+				return -EINVAL;
+
+			if (!req_capsule_field_present(&req->rq_pill,
+						       &RMF_BUT_REPLY,
+						       RCL_SERVER))
+				return -EINVAL;
+
+			if (used_len + msg_len > BUT_MAXREPSIZE)
+				return -EOVERFLOW;
+
+			len = req_capsule_get_size(&req->rq_pill,
+						   &RMF_BUT_REPLY, RCL_SERVER);
+			/*
+			 * Currently just increase the batch reply buffer
+			 * by 2.
+			 */
+			max = BUT_MAXREPSIZE - req->rq_replen;
+			if (used_len + msg_len > len)
+				len = used_len + msg_len;
+
+			if (len > max)
+				len += max;
+			else
+				len += len;
+			rc = req_capsule_server_grow(&req->rq_pill,
+						     &RMF_BUT_REPLY, len);
+			if (rc)
+				return rc;
+
+			pill->rc_repmsg =
+				(struct lustre_msg *)((char *)req->rq_repmsg +
+						      used_len);
+		}
+		if (msg_len > pill->rc_reqmsg->lm_repsize)
 			/* TODO: Check whether there is enough buffer size */
 			CDEBUG(D_INFO,
 			       "Overflow pack %d fields in format '%s' for "
 			       "the SUB request with message len %u:%u\n",
 			       count, fmt->rf_name, msg_len,
 			       pill->rc_reqmsg->lm_repsize);
-		}
 
 		rc = 0;
 		lustre_init_msg_v2(pill->rc_repmsg, count,
@@ -2684,7 +2726,7 @@ int req_capsule_server_grow(struct req_capsule *pill,
 	struct ptlrpc_reply_state *rs = req->rq_reply_state, *nrs;
 	char *from, *to, *sptr = NULL;
 	__u32 slen = 0, snewlen = 0;
-	__u32 offset, len;
+	__u32 offset, len, max, diff;
 	int rc;
 
 	LASSERT(pill->rc_fmt != NULL);
@@ -2718,13 +2760,23 @@ int req_capsule_server_grow(struct req_capsule *pill,
 		}
 
 		/*
-		 * Currently just increase the reply buffer by 2 * newlen.
+		 * Currently first try to increase the reply buffer by
+		 * 2 * newlen with reply buffer limit of BUT_MAXREPSIZE.
 		 * TODO: Enlarge the reply buffer properly according to the
 		 * left SUB requests in the batch PTLRPC request.
 		 */
 		snewlen = newlen;
+		diff = snewlen - slen;
+		max = BUT_MAXREPSIZE - req->rq_replen;
+		if (diff > max)
+			return -EOVERFLOW;
+
+		if (diff * 2 + len < max)
+			newlen = (len + diff) * 2;
+		else
+			newlen = len + max;
+
 		req_capsule_set_size(pill, field, RCL_SERVER, snewlen);
-		newlen = len + cfs_size_round(2 * snewlen);
 		req_capsule_set_size(&req->rq_pill, &RMF_BUT_REPLY, RCL_SERVER,
 				     newlen);
 		offset = __req_capsule_offset(&req->rq_pill, &RMF_BUT_REPLY,
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 11ba80a..75627fe 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -13771,6 +13771,30 @@ test_123d() {
 }
 run_test 123d "Statahead on striped directories works correctly"
 
+test_123e() {
+	local max
+	local batch_max
+	local dir=$DIR/$tdir
+
+	mkdir $dir || error "mkdir $dir failed"
+	$LFS setstripe -C 32 $dir || error "setstripe $dir failed"
+
+	touch $dir/$tfile.{0..1000} || error "touch 1000 files failed"
+
+	max=$($LCTL get_param -n llite.*.statahead_max | head -n 1)
+	batch_max=$($LCTL get_param -n llite.*.statahead_batch_max | head -n 1)
+	stack_trap "$LCTL set_param llite.*.statahead_max=$max" EXIT
+	stack_trap "$LCTL set_param llite.*.statahead_batch_max=$batch_max" EXIT
+
+	$LCTL set_param llite.*.statahead_max=2048
+	$LCTL set_param llite.*.statahead_batch_max=1024
+
+	ls -l $dir
+	$LCTL get_param mdc.*.batch_stats
+	$LCTL get_param llite.*.statahead_*
+}
+run_test 123e "statahead with large wide striping"
+
 test_124a() {
 	[ $PARALLEL == "yes" ] && skip "skip parallel run"
 	$LCTL get_param -n mdc.*.connect_flags | grep -q lru_resize ||