From 21501dedf64e027b529a3382d92bf4bf8039638c Mon Sep 17 00:00:00 2001
From: Ashish Maurya
Date: Fri, 16 Feb 2018 20:38:00 +0300
Subject: [PATCH] LU-9780 tests: Testing Round-Robin allocation
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Add a test for the fix in LU-977, which showed that, in the absence of
protection for lqr_start_idx, the round-robin algorithm can allocate
objects unevenly across OSTs. This test checks the protection of
lqr_start_idx by using a new reproducer, rr_alloc, which uses MPI to
create files in parallel, and by verifying that the files are
distributed evenly over the OSTs.

The distribution check formula is adjusted to the implementation, i.e.
factors such as exhaustion of pre-created objects and reseeding of the
'lqr_start_count' counter (lod_qos.c) are taken into account so that
object allocation is not affected by them.

Test-Parameters: trivial osscount=3 clientcount=6 envdefinitions=ONLY=rr_alloc testlist=parallel-scale
Signed-off-by: Ashish Maurya
Signed-off-by: Rahul Deshmukh
Signed-off-by: Elena Gryaznova
Cray-bug-id: MRP-2723
Reviewed-by: Vladimir Saveliev
Reviewed-by: Vikram Jadhav
Change-Id: I55f798c6dc8e607f002365f4a22ccf59a454fe1d
Reviewed-on: https://review.whamcloud.com/28075
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Alexey Lyashkov
Reviewed-by: Oleg Drokin
---
 lustre/tests/functions.sh      | 123 ++++++++++++++++++++++++++++
 lustre/tests/mpi/Makefile.am   |   3 +-
 lustre/tests/mpi/rr_alloc.c    | 182 +++++++++++++++++++++++++++++++++++++++++
 lustre/tests/parallel-scale.sh |   5 ++
 4 files changed, 312 insertions(+), 1 deletion(-)
 create mode 100644 lustre/tests/mpi/rr_alloc.c

diff --git a/lustre/tests/functions.sh b/lustre/tests/functions.sh
index 5fc08ee..92dd853 100644
--- a/lustre/tests/functions.sh
+++ b/lustre/tests/functions.sh
@@ -978,6 +978,129 @@ run_statahead () {
 	cleanup_statahead $clients $mntpt_root $num_mntpts
 }
 
+cleanup_rr_alloc () {
+	trap 0
+	local clients="$1"
+	local mntpt_root="$2"
+	local rr_alloc_MNTPTS="$3"
+	local mntpt_dir=$(dirname ${mntpt_root})
+
+	for i in $(seq 0 $((rr_alloc_MNTPTS - 1))); do
+		zconf_umount_clients $clients ${mntpt_root}$i ||
+			error_exit "Failed to umount lustre on ${mntpt_root}$i"
+	done
+	do_nodes $clients "rm -rf $mntpt_dir"
+}
+
+run_rr_alloc() {
+	remote_mds_nodsh && skip "remote MDS with nodsh" && return
+	echo "=== The test reproduces the problem more reliably with more "\
+		"clients and OSTs; with 44 or more clients and 73 or more "\
+		"OSTs the reproduction rate is 100% ==="
+
+	RR_ALLOC=${RR_ALLOC:-$(which rr_alloc 2> /dev/null || true)}
+	[ x$RR_ALLOC = x ] && { skip_env "rr_alloc not found" && return; }
+	declare -a diff_max_min_arr
+	# foeo = files on each OST. calc = calculated.
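+	# Under a perfectly even round-robin layout each OST should receive
+	# foeo_calc = rr_alloc_NFILES * total_MNTPTS / OSTCOUNT files; the
+	# final check below only tolerates a spread of at most 2 files
+	# between any two OSTs.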
+	local ost_idx
+	local foeo_calc
+	local qos_prec_objs="${TMP}/qos_and_precreated_objects"
+	local rr_alloc_NFILES=${rr_alloc_NFILES:-555}
+	local rr_alloc_MNTPTS=${rr_alloc_MNTPTS:-11}
+	local total_MNTPTS=$((rr_alloc_MNTPTS * num_clients))
+	local mntpt_root="${TMP}/rr_alloc_mntpt/lustre"
+	if [ $MDSCOUNT -lt 2 ]; then
+		[ -e $DIR/$tdir ] || mkdir -p $DIR/$tdir
+	else
+		[ -e $DIR/$tdir ] || $LFS mkdir -i 0 $DIR/$tdir
+	fi
+	chmod 0777 $DIR/$tdir
+	$SETSTRIPE -c 1 $DIR/$tdir
+
+	trap "cleanup_rr_alloc $clients $mntpt_root $rr_alloc_MNTPTS" EXIT ERR
+	for i in $(seq 0 $((rr_alloc_MNTPTS - 1))); do
+		zconf_mount_clients $clients ${mntpt_root}$i $MOUNT_OPTS ||
+			error_exit "Failed to mount lustre on ${mntpt_root}$i $clients"
+	done
+
+	local cmd="$RR_ALLOC $mntpt_root/$tdir/ash $rr_alloc_NFILES \
+		$num_clients"
+
+	# Save the MDT parameters, set qos_threshold_rr to 100% (i.e. always
+	# round-robin), and restore the saved values after the files have
+	# been created.
+	save_lustre_params mds1 \
+		"lov.lustre-MDT0000*.qos_threshold_rr" > $qos_prec_objs
+	save_lustre_params mds1 \
+		"osp.lustre-OST*-osc-MDT0000.create_count" >> $qos_prec_objs
+
+	local old_create_count=$(grep -e "create_count" $qos_prec_objs |
+		cut -d'=' -f 2 | sort -nr | head -n1)
+
+	# Make sure that every OSP has enough precreated objects for the
+	# file creation application.
+
+	# create_count is always rounded to a power of 2, so if the number of
+	# files per OST is not a power of 2 it would be rounded down to the
+	# nearest lower power of 2. Therefore request twice foeo_calc, which
+	# keeps create_count at or above the required value.
+	foeo_calc=$((rr_alloc_NFILES * total_MNTPTS / OSTCOUNT))
+	local create_count=$((2 * foeo_calc))
+	do_facet mds1 "$LCTL set_param -n \
+		lov.lustre-MDT0000*.qos_threshold_rr 100 \
+		osp.lustre-OST*-osc-MDT0000.create_count $create_count" ||
+		error "failed while setting qos_threshold_rr & create_count"
+
+	# Create a few temporary files to raise the number of precreated
+	# objects to the desired value before starting the 'rr_alloc'
+	# application. With the default precreation count of 32
+	# (OST_MIN_PRECREATE=32), only 32 precreated objects are available
+	# initially; they are exhausted very quickly, which causes some OSPs
+	# to be skipped when a very large number of files is created per OST.
+	createmany -o $DIR/$tdir/foo- $(((old_create_count + 1) * OSTCOUNT)) \
+		> /dev/null
+	rm -f $DIR/$tdir/foo*
+
+	# Check for enough precreated objects. We should not fail here,
+	# because the code (osp_precreate.c) also takes care of this, so the
+	# test has a good chance of passing even if this check fails.
+	local mdt_idx=0
+	for ost_idx in $(seq 0 $((OSTCOUNT - 1))); do
+		[[ $(precreated_ost_obj_count $mdt_idx $ost_idx) -ge \
+			$foeo_calc ]] || echo "Warning: test may fail because" \
+			"of lack of precreated objects on OST${ost_idx}"
+	done
+
+	if [[ $total_MNTPTS -ne 0 ]]; then
+		# Now start the actual file creation application.
+		mpi_run "-np $total_MNTPTS" $cmd || return
+	else
+		error "No mount point"
+	fi
+
+	restore_lustre_params < $qos_prec_objs
+	rm -f $qos_prec_objs
+
+	diff_max_min_arr=($($GETSTRIPE -r $DIR/$tdir/ |
+		grep "lmm_stripe_offset:" | awk '{print $2}' | sort -n |
+		uniq -c | awk 'NR==1 {min=max=$1} \
+		{ $1<min ? min=$1 : min; $1>max ? max=$1 : max} \
+		END {print max-min, max, min}'))
+
+	rm -rf $DIR/$tdir
+
+	# When a fairly large number of files is created with RR
+	# (round-robin) allocation, there are two cases in which the
+	# distribution can deviate from the regular RR behaviour:
+	# 1- when rr_alloc does not start right after 'lqr_start_count' is
+	#    reseeded,
+	# 2- when rr_alloc does not finish with 'lqr_start_count == 0'.
+	# So the difference in the number of files between any two OSTs
+	# should not be more than 2.
+	[[ ${diff_max_min_arr[0]} -le 2 ]] ||
+		error "Uneven distribution detected: difference between" \
+			"maximum files per OST (${diff_max_min_arr[1]}) and" \
+			"minimum files per OST (${diff_max_min_arr[2]}) must not be" \
+			"greater than 2"
+}
+
 run_fs_test() {
 	# fs_test.x is the default name for exe
 	FS_TEST=${FS_TEST:=$(which fs_test.x 2> /dev/null || true)}
diff --git a/lustre/tests/mpi/Makefile.am b/lustre/tests/mpi/Makefile.am
index 7beb45e..25f1826 100644
--- a/lustre/tests/mpi/Makefile.am
+++ b/lustre/tests/mpi/Makefile.am
@@ -4,7 +4,7 @@ AM_CFLAGS := -fPIC -D_LARGEFILE64_SOURCE=1 -D_FILE_OFFSET_BITS=64
 CC = @MPICC_WRAPPER@
 
 THETESTS = parallel_grouplock write_append_truncate createmany_mpi \
-	mdsrate write_disjoint cascading_rw
+	mdsrate write_disjoint cascading_rw rr_alloc
 
 if TESTS
 if MPITESTS
@@ -19,6 +19,7 @@ write_append_truncate_SOURCES=write_append_truncate.c
 write_disjoint_SOURCES=write_disjoint.c
 createmany_mpi_SOURCES=createmany-mpi.c
 parallel_grouplock_SOURCES=parallel_grouplock.c lp_utils.c lp_utils.h
+rr_alloc_SOURCES=rr_alloc.c
 cascading_rw_SOURCES=cascading_rw.c lp_utils.c lp_utils.h
 cascading_rw_LDADD=$(top_builddir)/lustre/utils/liblustreapi.la
 
diff --git a/lustre/tests/mpi/rr_alloc.c b/lustre/tests/mpi/rr_alloc.c
new file mode 100644
index 0000000..e65f711
--- /dev/null
+++ b/lustre/tests/mpi/rr_alloc.c
@@ -0,0 +1,182 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses
+ *
+ * Please contact http://www.seagate.com/contacts/ or visit www.seagate.com
+ * if you need additional information or have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Seagate Technology LLC
+ *
+ * Author: Ashish Maurya
+ */
+/*
+ * lustre/tests/mpi/rr_alloc.c
+ *
+ * DESCRIPTION
+ *
+ * This program creates files using MPI processes running on the mounted
+ * clients. The processes run in parallel across all the client nodes in
+ * round-robin fashion, starting with rank 0, and each of them creates
+ * files.
+ *
+ * USE CASE: With 20 mounted clients on 4 client nodes (5 clients per
+ * node), it runs 5 processes on each client node, one per mount point,
+ * and each process creates the number of files given by the user. Each
+ * process rank is mapped to its matching mount point on the client node,
+ * e.g. rank 0 <-> /tmp/mnt/lustre0, rank 1 <-> /tmp/mnt/lustre1, etc.
+ *
+ * NOTE: For simplicity the client on /mnt/lustre is not taken into
+ * account.
+ *
+ * IMPORTANT NOTE: If argv[1] is /mnt/dir/ash, then the program assumes
+ * that /mnt0/dir/, /mnt1/dir/, etc. exist.
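+ *
+ * For example, with argv[1] = /mnt/dir/ash and 2 files per process, a
+ * rank whose per-node index works out to 3 (the index is
+ * rank % (total ranks / client nodes), see below) creates
+ * /mnt3/dir/ash-<rank>-0 and /mnt3/dir/ash-<rank>-1.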
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <libgen.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <mpi.h>
+
+void usage(char *prog)
+{
+	printf("Usage: %s <file path with mount point> ", prog);
+	printf("<number of files per process> <number of client nodes>\n");
+	printf("Ex: mpirun -np <total processes> rr_alloc /tmp/mnt/lustre/ash 512 4\n");
+
+	exit(EXIT_FAILURE);
+}
+
+void perr_exit(int rank, int error, const char *fmt, ...)
+{
+	va_list ap;
+
+	printf("Process rank %d exited with error code %d\n", rank, error);
+	va_start(ap, fmt);
+	vprintf(fmt, ap);
+	va_end(ap);
+
+	MPI_Abort(MPI_COMM_WORLD, error);
+}
+
+int main(int argc, char **argv)
+{
+	int proc_rank = 0;
+	int serial_prank_per_cli = 0;
+	int proc_per_cli_node = 0;
+	int bytes = 0;
+	int file_no = 0;
+	int client_nodes = 0;
+	int nproc = 0;
+	int rc = 0;
+	int fd = 0;
+	int i = 0;
+	char file_path[PATH_MAX] = {0};
+	char mnt_path[PATH_MAX] = {0};
+	char *path1;
+	char *path2;
+	char *path3;
+	char *fname;
+	char *dname;
+
+	if (argc != 4)
+		usage(argv[0]);
+
+	if (!strchr(argv[1], '/')) {
+		fprintf(stderr, "Please enter the filename with a mount point\n");
+		usage(argv[0]);
+	}
+
+	/*
+	 * Separate the filename and the mount point name. This is required
+	 * for mapping processes to particular mount points.
+	 */
+	path1 = strdup(argv[1]);
+	path2 = strdup(argv[1]);
+	path3 = strdup(argv[1]);
+	fname = basename(path1);
+	dname = basename(dirname(path2));
+	/* dirname looping depends on the depth of the file from mount path */
+	strncpy(mnt_path, dirname(dirname(path3)), sizeof(mnt_path) - 1);
+
+	file_no = atoi(argv[2]);
+	if (!file_no) {
+		fprintf(stderr, "Number of files must not be zero\n");
+		usage(argv[0]);
+	}
+	client_nodes = atoi(argv[3]);
+	if (!client_nodes) {
+		fprintf(stderr, "Number of client nodes must not be zero\n");
+		usage(argv[0]);
+	}
+
+	rc = MPI_Init(&argc, &argv);
+	if (rc != MPI_SUCCESS) {
+		fprintf(stderr, "MPI_Init failed: %d\n", rc);
+		exit(EXIT_FAILURE);
+	}
+
+	rc = MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank);
+	if (rc != MPI_SUCCESS)
+		perr_exit(proc_rank, rc, "MPI_Comm_rank failed: %d\n", rc);
+
+	rc = MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+	if (rc != MPI_SUCCESS)
+		perr_exit(proc_rank, rc, "MPI_Comm_size failed: %d\n", rc);
+
+	/*
+	 * Make sure that each rank is processed through its respective mount
+	 * point, e.g. ranks 0 and 1 are handled by /tmp/mnt/lustre0 and
+	 * /tmp/mnt/lustre1, etc. on each client node.
+	 */
+	/* Number of processes on each client node */
+	proc_per_cli_node = nproc / client_nodes;
+
+	/*
+	 * By default, process ranks are allocated in RR fashion across all
+	 * the client nodes, so the processes on a particular client node do
+	 * not have consecutive ranks.
+	 * In order to map each process to a mount point by its rank, we need
+	 * the process rank in serial order on a client node.
+	 */
+	serial_prank_per_cli = proc_rank % proc_per_cli_node;
+
+	rc = MPI_Barrier(MPI_COMM_WORLD);
+	if (rc != MPI_SUCCESS)
+		perr_exit(proc_rank, rc, "Prep MPI_Barrier failed: %d\n", rc);
+
+	for (i = 0; i < file_no; i++) {
+		bytes = snprintf(file_path, sizeof(file_path),
+				 "%s%d/%s/%s-%d-%d", mnt_path,
+				 serial_prank_per_cli, dname, fname,
+				 proc_rank, i);
+		if (bytes >= sizeof(file_path))
+			perr_exit(proc_rank, -ENAMETOOLONG, "Name too long\n");
+		fd = open(file_path, O_CREAT|O_RDWR, 0644);
+		if (fd < 0) {
+			perr_exit(proc_rank, errno, "Cannot open \"%s\": %s\n",
+				  file_path, strerror(errno));
+		}
+		close(fd);
+	}
+	MPI_Finalize();
+	return 0;
+}
diff --git a/lustre/tests/parallel-scale.sh b/lustre/tests/parallel-scale.sh
index aea27cf..840a8b8 100644
--- a/lustre/tests/parallel-scale.sh
+++ b/lustre/tests/parallel-scale.sh
@@ -161,6 +161,11 @@ test_statahead () {
 }
 run_test statahead "statahead test, multiple clients"
 
+test_rr_alloc () {
+	run_rr_alloc
+}
+run_test rr_alloc "Checking even file distribution over OSTs in RR policy"
+
 test_fs_test () {
 	run_fs_test
 }
-- 
1.8.3.1
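
For reference, a minimal sketch of how the new subtest and the rr_alloc
reproducer might be invoked by hand. The mount-point layout, rank count, and
file count below are illustrative assumptions; ONLY= is the standard
test-framework filter used by the Lustre test scripts, and the direct mpirun
form mirrors the usage example in rr_alloc.c.

    # From lustre/tests, run only the new subtest of the parallel-scale
    # suite, matching the Test-Parameters line above.
    ONLY=rr_alloc bash parallel-scale.sh

    # Or drive the reproducer directly: 8 ranks spread over 2 client nodes
    # (e.g. via an MPI hostfile), 4 mount points per node, 512 files per
    # rank; the rank-to-mount mapping assumes /tmp/mnt/lustre0 through
    # /tmp/mnt/lustre3 exist on each node with a "dir" subdirectory in each.
    mpirun -np 8 rr_alloc /tmp/mnt/lustre/dir/ash 512 2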