From 11d55259ab9c1c69d98f2fc487deac8f6dd7e376 Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@whamcloud.com>
Date: Mon, 4 Nov 2024 12:01:56 -0800
Subject: [PATCH] LU-17251 tests: try to fix test_rr_alloc again

Try to fix parallel-scale test_rr_alloc again:
- ensure that the test directory is striped over all MDTs to
  maximize the number of precreated objects available
- ensure the FID SEQ has enough OIDs to not run out during the test,
  since that caused some OSPs to run out of objects during creation

Add debugging to better understand the issue if it continues to fail:
- print lfs df, lfs df -i at start and error to show imbalance
- delete files only in cleanup_rr_alloc() so distribution can be shown

Clean up the test script to ensure the SEQ is large enough to run the test.

Test-Parameters: trivial
Test-Parameters: testgroup=review-dne-part-9 env=RACER_EXCEPT="1 2"
Test-Parameters: testgroup=review-dne-part-9 env=RACER_EXCEPT="1 2"
Test-Parameters: testgroup=review-dne-part-9 env=RACER_EXCEPT="1 2"
Test-Parameters: testgroup=review-dne-part-9 env=RACER_EXCEPT="1 2"
Test-Parameters: testgroup=review-dne-part-9 env=RACER_EXCEPT="1 2"
Test-Parameters: testgroup=review-dne-part-9 env=RACER_EXCEPT="1 2"
Test-Parameters: testgroup=review-dne-part-9 env=RACER_EXCEPT="1 2"
Test-Parameters: testgroup=review-dne-part-9 env=RACER_EXCEPT="1 2"
Test-Parameters: testgroup=review-dne-part-9 env=RACER_EXCEPT="1 2"
Test-Parameters: testgroup=review-dne-part-9 env=RACER_EXCEPT="1 2"
Test-Parameters: testlist=parallel-scale env=ONLY=rr_alloc,ONLY_REPEAT=50
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Change-Id: I636a488e575d27ac235749911f171d5e1e33e310
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56853
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Jian Yu <yujian@whamcloud.com>
Reviewed-by: Li Dongyang <dongyangli@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---
 lustre/tests/functions.sh | 47 +++++++++++++++++++++++++++++------------------
 lustre/tests/racer.sh     |  2 ++
 2 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/lustre/tests/functions.sh b/lustre/tests/functions.sh
index 1993b7b..4f57f73 100644
--- a/lustre/tests/functions.sh
+++ b/lustre/tests/functions.sh
@@ -1027,13 +1027,13 @@ run_statahead () {
 }
 
 cleanup_rr_alloc () {
-	trap 0
 	local clients="$1"
 	local mntpt_root="$2"
 	local rr_alloc_MNTPTS="$3"
 	local mntpt_dir=$(dirname ${mntpt_root})
 
-	for i in $(seq 0 $((rr_alloc_MNTPTS - 1))); do
+	$LFS find $DIR/$tdir -type f | xargs -n1 -P8 unlink
+	for ((i=0; i < rr_alloc_MNTPTS; i++)); do
 		zconf_umount_clients $clients ${mntpt_root}$i ||
 		error_exit "Failed to umount lustre on ${mntpt_root}$i"
 	done
@@ -1043,12 +1043,13 @@ cleanup_rr_alloc () {
 run_rr_alloc() {
 	remote_mds_nodsh && skip "remote MDS with nodsh"
 
+	RR_ALLOC=${RR_ALLOC:-$(which rr_alloc 2> /dev/null || true)}
+	[[ -n "$RR_ALLOC" ]] || skip_env "rr_alloc not found"
+
 	echo "===Test gives more reproduction percentage if number of "
 	echo "   client and ost are more. Test with 44 or more clients "
 	echo "   and 73 or more OSTs gives 100% reproduction rate=="
 
-	RR_ALLOC=${RR_ALLOC:-$(which rr_alloc 2> /dev/null || true)}
-	[ x$RR_ALLOC = x ] && skip_env "rr_alloc not found"
 	declare -a diff_max_min_arr
 	local ost_idx
 	local qos_prec_objs="${TMP}/qos_and_precreated_objects"
@@ -1056,13 +1057,19 @@ run_rr_alloc() {
 	local rr_alloc_MNTPTS=${rr_alloc_MNTPTS:-11}
 	local total_MNTPTS=$((rr_alloc_MNTPTS * num_clients))
 	local mntpt_root="${TMP}/rr_alloc_mntpt/lustre"
-	test_mkdir $DIR/$tdir
+	test_mkdir -c $MDSCOUNT $DIR/$tdir
 	setstripe_getstripe $DIR/$tdir $rr_alloc_STRIPEPARAMS
 
+	ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
+	(( ONLY_REPEAT_ITER == 1 )) || wait_delete_completed
+
+	$LFS df $DIR/$tdir
+	$LFS df -i $DIR/$tdir
 	chmod 0777 $DIR/$tdir
 
-	trap "cleanup_rr_alloc $clients $mntpt_root $rr_alloc_MNTPTS" EXIT ERR
-	for i in $(seq 0 $((rr_alloc_MNTPTS - 1))); do
+	stack_trap "cleanup_rr_alloc $clients $mntpt_root $rr_alloc_MNTPTS"
+	for ((i=0; i < rr_alloc_MNTPTS; i++)); do
 		zconf_mount_clients $clients ${mntpt_root}$i $MOUNT_OPTS ||
 		error_exit "Failed to mount lustre on ${mntpt_root}$i $clients"
 	done
@@ -1075,7 +1082,7 @@ run_rr_alloc() {
 		"osp.$FSNAME-OST*-osc-MDT0000.create_count" >> $qos_prec_objs
 
 	local old_create_count=$(grep -e "create_count" $qos_prec_objs |
-		cut -d'=' -f 2 | sort -nr | head -n1)
+				 cut -d'=' -f 2 | sort -nr | head -n1)
 
 	# Make sure that every osp has enough precreated objects for the file
 	# creation app
@@ -1115,7 +1122,14 @@ run_rr_alloc() {
 				local count=$(precreated_ost_obj_count \
 					$mdt_idx $ost_idx)
 				if ((count < foeo_calc)); then
+					ost=$(ostname_from_index $ost_idx)
+					mdt=$(mdtname_from_index $mdt_idx)
+					t=osp.$ost-osc${mdt#$FSNAME}.create_count
+
+
 					sleep=1
+					do_facet mds$((mdt_idx+1)) \
+						$LCTL set_param $t=$create_count
 				fi
 			done
 		done
@@ -1139,26 +1153,23 @@ run_rr_alloc() {
 
 	diff_max_min_arr=($($LFS getstripe -r $DIR/$tdir/ |
 			    awk '/lmm_stripe_offset:/ {print $2}' |
-			    sort | uniq -c |
+			    sort | uniq -c | tee /dev/stderr |
 			    awk 'NR==1 {min=max=$1} \
 				 { $1<min ? min=$1:min; $1>max ? max=$1:max} \
 				 END {print max-min, max, min}'))
 
-	$LFS find $DIR/$tdir -type f | xargs -n1 -P8 unlink
-
-
-	# In-case of fairly large number of file creation using RR (round-robin)
+	# In case of fairly large number of file creation using RR (round-robin)
 	# there can be two cases in which deviation will occur than the regular
 	# RR algo behaviour-
 	# 1- When rr_alloc does not start right with 'lqr_start_count' reseeded,
 	# 2- When rr_alloc does not finish with 'lqr_start_count == 0'.
-	# So the difference of files b/w any 2 OST should not be more than 2.
-	# In some cases it may be more, but shouldn't be > 0.3% of the files.
-	local max_diff=$((create_count > 600 ? create_count / 300 : 2))
+	# So the difference of files for any 2 OST should not be more than 2-3.
+	# In some cases it may be more, but shouldn't be > 0.5% of the files.
+	local max_diff=$((create_count > 600 ? create_count / 200 : $MDSCOUNT))
 
 	(( ${diff_max_min_arr[0]} <= $max_diff )) || {
-		$LFS getstripe -r $DIR/$tdir |
-			awk '/lmm_stripe_offset:/ {print $2}' | sort | uniq -c
+		$LFS df $DIR/$tdir
+		$LFS df -i $DIR/$tdir
 
 		error "max/min OST objects (${diff_max_min_arr[1]} : ${diff_max_min_arr[2]}) too different"
 	}
diff --git a/lustre/tests/racer.sh b/lustre/tests/racer.sh
index a4248d6..9f1d3c9 100755
--- a/lustre/tests/racer.sh
+++ b/lustre/tests/racer.sh
@@ -18,6 +18,8 @@ LUSTRE=${LUSTRE:-$(dirname $0)/..}
 init_test_env "$@"
 init_logging
 
+ALWAYS_EXCEPT="$RACER_EXCEPT "
+
 build_test_filter
 
 racer=$LUSTRE/tests/racer/racer.sh
-- 
1.8.3.1