From 21501dedf64e027b529a3382d92bf4bf8039638c Mon Sep 17 00:00:00 2001
From: Ashish Maurya
Date: Fri, 16 Feb 2018 20:38:00 +0300
Subject: [PATCH] LU-9780 tests: Testing Round-Robin allocation
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Add a test for the fix in LU-977, which showed that, in the absence of
protection for lqr_start_idx, the round-robin algorithm can allocate
objects unevenly across OSTs. This test checks the protection of
lqr_start_idx by using a new reproducer, rr_alloc, which uses MPI to
create files in parallel, and by verifying that the files are
distributed evenly over the OSTs.

The distribution check formula is adjusted to the implementation, i.e.
factors such as exhaustion of pre-created objects and reseeding of the
'lqr_start_count' counter (lod_qos.c) are taken into account so that
object allocation is not affected by them.

Test-Parameters: trivial osscount=3 clientcount=6 envdefinitions=ONLY=rr_alloc testlist=parallel-scale
Signed-off-by: Ashish Maurya
Signed-off-by: Rahul Deshmukh
Signed-off-by: Elena Gryaznova
Cray-bug-id: MRP-2723
Reviewed-by: Vladimir Saveliev
Reviewed-by: Vikram Jadhav
Change-Id: I55f798c6dc8e607f002365f4a22ccf59a454fe1d
Reviewed-on: https://review.whamcloud.com/28075
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Alexey Lyashkov
Reviewed-by: Oleg Drokin
---
 lustre/tests/functions.sh      | 123 ++++++++++++++++++++++++++++
 lustre/tests/mpi/Makefile.am   |   3 +-
 lustre/tests/mpi/rr_alloc.c    | 182 +++++++++++++++++++++++++++++++++++++++++
 lustre/tests/parallel-scale.sh |   5 ++
 4 files changed, 312 insertions(+), 1 deletion(-)
 create mode 100644 lustre/tests/mpi/rr_alloc.c

diff --git a/lustre/tests/functions.sh b/lustre/tests/functions.sh
index 5fc08ee..92dd853 100644
--- a/lustre/tests/functions.sh
+++ b/lustre/tests/functions.sh
@@ -978,6 +978,129 @@ run_statahead () {
 	cleanup_statahead $clients $mntpt_root $num_mntpts
 }
 
+cleanup_rr_alloc () {
+	trap 0
+	local clients="$1"
+	local mntpt_root="$2"
+	local rr_alloc_MNTPTS="$3"
+	local mntpt_dir=$(dirname ${mntpt_root})
+
+	for i in $(seq 0 $((rr_alloc_MNTPTS - 1))); do
+		zconf_umount_clients $clients ${mntpt_root}$i ||
+			error_exit "Failed to umount lustre on ${mntpt_root}$i"
+	done
+	do_nodes $clients "rm -rf $mntpt_dir"
+}
+
+run_rr_alloc() {
+	remote_mds_nodsh && skip "remote MDS with nodsh" && return
+	echo "=== The test reproduces the problem more reliably with more "\
+		"clients and OSTs; with 44 or more clients and 73 or more "\
+		"OSTs the reproduction rate is 100% ==="
+
+	RR_ALLOC=${RR_ALLOC:-$(which rr_alloc 2> /dev/null || true)}
+	[ x$RR_ALLOC = x ] && { skip_env "rr_alloc not found" && return; }
+	declare -a diff_max_min_arr
+	# foeo = files on each OST. calc = calculated.
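+	# Under a perfectly even round-robin layout each OST should receive
+	# foeo_calc = rr_alloc_NFILES * total_MNTPTS / OSTCOUNT files; the
+	# final check below only tolerates a spread of at most 2 files
+	# between any two OSTs.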
+	local ost_idx
+	local foeo_calc
+	local qos_prec_objs="${TMP}/qos_and_precreated_objects"
+	local rr_alloc_NFILES=${rr_alloc_NFILES:-555}
+	local rr_alloc_MNTPTS=${rr_alloc_MNTPTS:-11}
+	local total_MNTPTS=$((rr_alloc_MNTPTS * num_clients))
+	local mntpt_root="${TMP}/rr_alloc_mntpt/lustre"
+	if [ $MDSCOUNT -lt 2 ]; then
+		[ -e $DIR/$tdir ] || mkdir -p $DIR/$tdir
+	else
+		[ -e $DIR/$tdir ] || $LFS mkdir -i 0 $DIR/$tdir
+	fi
+	chmod 0777 $DIR/$tdir
+	$SETSTRIPE -c 1 $DIR/$tdir
+
+	trap "cleanup_rr_alloc $clients $mntpt_root $rr_alloc_MNTPTS" EXIT ERR
+	for i in $(seq 0 $((rr_alloc_MNTPTS - 1))); do
+		zconf_mount_clients $clients ${mntpt_root}$i $MOUNT_OPTS ||
+			error_exit "Failed to mount lustre on ${mntpt_root}$i $clients"
+	done
+
+	local cmd="$RR_ALLOC $mntpt_root/$tdir/ash $rr_alloc_NFILES \
+		$num_clients"
+
+	# Save the MDT parameters, set qos_threshold_rr to 100% (i.e. always
+	# round-robin), and restore the saved values after the files have
+	# been created.
+	save_lustre_params mds1 \
+		"lov.lustre-MDT0000*.qos_threshold_rr" > $qos_prec_objs
+	save_lustre_params mds1 \
+		"osp.lustre-OST*-osc-MDT0000.create_count" >> $qos_prec_objs
+
+	local old_create_count=$(grep -e "create_count" $qos_prec_objs |
+		cut -d'=' -f 2 | sort -nr | head -n1)
+
+	# Make sure that every OSP has enough precreated objects for the
+	# file creation application.
+
+	# create_count is always rounded to a power of 2, so if the number of
+	# files per OST is not a power of 2 it would be rounded down to the
+	# nearest lower power of 2. Therefore request twice foeo_calc, which
+	# keeps create_count at or above the required value.
+	foeo_calc=$((rr_alloc_NFILES * total_MNTPTS / OSTCOUNT))
+	local create_count=$((2 * foeo_calc))
+	do_facet mds1 "$LCTL set_param -n \
+		lov.lustre-MDT0000*.qos_threshold_rr 100 \
+		osp.lustre-OST*-osc-MDT0000.create_count $create_count" ||
+		error "failed while setting qos_threshold_rr & create_count"
+
+	# Create a few temporary files to raise the number of precreated
+	# objects to the desired value before starting the 'rr_alloc'
+	# application. With the default precreation count of 32
+	# (OST_MIN_PRECREATE=32), only 32 precreated objects are available
+	# initially; they are exhausted very quickly, which causes some OSPs
+	# to be skipped when a very large number of files is created per OST.
+	createmany -o $DIR/$tdir/foo- $(((old_create_count + 1) * OSTCOUNT)) \
+		> /dev/null
+	rm -f $DIR/$tdir/foo*
+
+	# Check for enough precreated objects. We should not fail here,
+	# because the code (osp_precreate.c) also takes care of this, so the
+	# test has a good chance of passing even if this check fails.
+	local mdt_idx=0
+	for ost_idx in $(seq 0 $((OSTCOUNT - 1))); do
+		[[ $(precreated_ost_obj_count $mdt_idx $ost_idx) -ge \
+			$foeo_calc ]] || echo "Warning: test may fail because" \
+			"of lack of precreated objects on OST${ost_idx}"
+	done
+
+	if [[ $total_MNTPTS -ne 0 ]]; then
+		# Now start the actual file creation application.
+		mpi_run "-np $total_MNTPTS" $cmd || return
+	else
+		error "No mount point"
+	fi
+
+	restore_lustre_params < $qos_prec_objs
+	rm -f $qos_prec_objs
+
+	diff_max_min_arr=($($GETSTRIPE -r $DIR/$tdir/ |
+		grep "lmm_stripe_offset:" | awk '{print $2}' | sort -n |
+		uniq -c | awk 'NR==1 {min=max=$1} \
+		{ $1<min ? min=$1 : min; $1>max ? max=$1 : max} \
+		END {print max-min, max, min}'))
+
+	rm -rf $DIR/$tdir
+
+	# When a fairly large number of files is created with RR
+	# (round-robin) allocation, there are two cases in which the
+	# distribution can deviate from the regular RR behaviour:
+	# 1- when rr_alloc does not start right after 'lqr_start_count' is
+	#    reseeded,
+	# 2- when rr_alloc does not finish with 'lqr_start_count == 0'.
+	# So the difference in the number of files between any two OSTs
+	# should not be more than 2.
+	[[ ${diff_max_min_arr[0]} -le 2 ]] ||
+		error "Uneven distribution detected: difference between" \
+			"maximum files per OST (${diff_max_min_arr[1]}) and" \
+			"minimum files per OST (${diff_max_min_arr[2]}) must not be" \
+			"greater than 2"
+}
+
 run_fs_test() {
 	# fs_test.x is the default name for exe
 	FS_TEST=${FS_TEST:=$(which fs_test.x 2> /dev/null || true)}
diff --git a/lustre/tests/mpi/Makefile.am b/lustre/tests/mpi/Makefile.am
index 7beb45e..25f1826 100644
--- a/lustre/tests/mpi/Makefile.am
+++ b/lustre/tests/mpi/Makefile.am
@@ -4,7 +4,7 @@ AM_CFLAGS := -fPIC -D_LARGEFILE64_SOURCE=1 -D_FILE_OFFSET_BITS=64
 CC = @MPICC_WRAPPER@
 
 THETESTS = parallel_grouplock write_append_truncate createmany_mpi \
-	mdsrate write_disjoint cascading_rw
+	mdsrate write_disjoint cascading_rw rr_alloc
 
 if TESTS
 if MPITESTS
@@ -19,6 +19,7 @@ write_append_truncate_SOURCES=write_append_truncate.c
 write_disjoint_SOURCES=write_disjoint.c
 createmany_mpi_SOURCES=createmany-mpi.c
 parallel_grouplock_SOURCES=parallel_grouplock.c lp_utils.c lp_utils.h
+rr_alloc_SOURCES=rr_alloc.c
 cascading_rw_SOURCES=cascading_rw.c lp_utils.c lp_utils.h
 cascading_rw_LDADD=$(top_builddir)/lustre/utils/liblustreapi.la
 
diff --git a/lustre/tests/mpi/rr_alloc.c b/lustre/tests/mpi/rr_alloc.c
new file mode 100644
index 0000000..e65f711
--- /dev/null
+++ b/lustre/tests/mpi/rr_alloc.c
@@ -0,0 +1,182 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses
+ *
+ * Please contact http://www.seagate.com/contacts/ or visit www.seagate.com
+ * if you need additional information or have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Seagate Technology LLC
+ *
+ * Author: Ashish Maurya
+ */
+/*
+ * lustre/tests/mpi/rr_alloc.c
+ *
+ * DESCRIPTION
+ *
+ * This program creates files using MPI processes running on the mounted
+ * clients. The processes run in parallel across all the client nodes in
+ * round-robin fashion, starting with rank 0, and each of them creates
+ * files.
+ *
+ * USE CASE: With 20 mounted clients on 4 client nodes (5 clients per
+ * node), it runs 5 processes on each client node, one per mount point,
+ * and each process creates the number of files given by the user. Each
+ * process rank is mapped to its matching mount point on the client node,
+ * e.g. rank 0 <-> /tmp/mnt/lustre0, rank 1 <-> /tmp/mnt/lustre1, etc.
+ *
+ * NOTE: For simplicity the client on /mnt/lustre is not taken into
+ * account.
+ *
+ * IMPORTANT NOTE: If argv[1] is /mnt/dir/ash, then the program assumes
+ * that /mnt0/dir/, /mnt1/dir/, etc. exist.
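+ *
+ * For example, with argv[1] = /mnt/dir/ash and 2 files per process, a
+ * rank whose per-node index works out to 3 (the index is
+ * rank % (total ranks / client nodes), see below) creates
+ * /mnt3/dir/ash-<rank>-0 and /mnt3/dir/ash-<rank>-1.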
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <libgen.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <mpi.h>
+
+void usage(char *prog)
+{
+	printf("Usage: %s <file path with mount point> ", prog);
+	printf("<number of files per process> <number of client nodes>\n");
+	printf("Ex: mpirun -np <total processes> rr_alloc /tmp/mnt/lustre/ash 512 4\n");
+
+	exit(EXIT_FAILURE);
+}
+
+void perr_exit(int rank, int error, const char *fmt, ...)
+{
+	va_list ap;
+
+	printf("Process rank %d exited with error code %d\n", rank, error);
+	va_start(ap, fmt);
+	vprintf(fmt, ap);
+	va_end(ap);
+
+	MPI_Abort(MPI_COMM_WORLD, error);
+}
+
+int main(int argc, char **argv)
+{
+	int proc_rank = 0;
+	int serial_prank_per_cli = 0;
+	int proc_per_cli_node = 0;
+	int bytes = 0;
+	int file_no = 0;
+	int client_nodes = 0;
+	int nproc = 0;
+	int rc = 0;
+	int fd = 0;
+	int i = 0;
+	char file_path[PATH_MAX] = {0};
+	char mnt_path[PATH_MAX] = {0};
+	char *path1;
+	char *path2;
+	char *path3;
+	char *fname;
+	char *dname;
+
+	if (argc != 4)
+		usage(argv[0]);
+
+	if (!strchr(argv[1], '/')) {
+		fprintf(stderr, "Please enter the filename with a mount point\n");
+		usage(argv[0]);
+	}
+
+	/*
+	 * Separate the filename and the mount point name. This is required
+	 * for mapping processes to particular mount points.
+	 */
+	path1 = strdup(argv[1]);
+	path2 = strdup(argv[1]);
+	path3 = strdup(argv[1]);
+	fname = basename(path1);
+	dname = basename(dirname(path2));
+	/* dirname looping depends on the depth of the file from mount path */
+	strncpy(mnt_path, dirname(dirname(path3)), sizeof(mnt_path) - 1);
+
+	file_no = atoi(argv[2]);
+	if (!file_no) {
+		fprintf(stderr, "Number of files must not be zero\n");
+		usage(argv[0]);
+	}
+	client_nodes = atoi(argv[3]);
+	if (!client_nodes) {
+		fprintf(stderr, "Number of client nodes must not be zero\n");
+		usage(argv[0]);
+	}
+
+	rc = MPI_Init(&argc, &argv);
+	if (rc != MPI_SUCCESS) {
+		fprintf(stderr, "MPI_Init failed: %d\n", rc);
+		exit(EXIT_FAILURE);
+	}
+
+	rc = MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank);
+	if (rc != MPI_SUCCESS)
+		perr_exit(proc_rank, rc, "MPI_Comm_rank failed: %d\n", rc);
+
+	rc = MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+	if (rc != MPI_SUCCESS)
+		perr_exit(proc_rank, rc, "MPI_Comm_size failed: %d\n", rc);
+
+	/*
+	 * Make sure that each rank is processed through its respective mount
+	 * point, e.g. ranks 0 and 1 are handled by /tmp/mnt/lustre0 and
+	 * /tmp/mnt/lustre1, etc. on each client node.
+	 */
+	/* Number of processes on each client node */
+	proc_per_cli_node = nproc / client_nodes;
+
+	/*
+	 * By default, process ranks are allocated in RR fashion across all
+	 * the client nodes, so the processes on a particular client node do
+	 * not have consecutive ranks.
+	 * In order to map each process to a mount point by its rank, we need
+	 * the process rank in serial order on a client node.
+	 */
+	serial_prank_per_cli = proc_rank % proc_per_cli_node;
+
+	rc = MPI_Barrier(MPI_COMM_WORLD);
+	if (rc != MPI_SUCCESS)
+		perr_exit(proc_rank, rc, "Prep MPI_Barrier failed: %d\n", rc);
+
+	for (i = 0; i < file_no; i++) {
+		bytes = snprintf(file_path, sizeof(file_path),
+				 "%s%d/%s/%s-%d-%d", mnt_path,
+				 serial_prank_per_cli, dname, fname,
+				 proc_rank, i);
+		if (bytes >= sizeof(file_path))
+			perr_exit(proc_rank, -ENAMETOOLONG, "Name too long\n");
+		fd = open(file_path, O_CREAT|O_RDWR, 0644);
+		if (fd < 0) {
+			perr_exit(proc_rank, errno, "Cannot open \"%s\": %s\n",
+				  file_path, strerror(errno));
+		}
+		close(fd);
+	}
+	MPI_Finalize();
+	return 0;
+}
diff --git a/lustre/tests/parallel-scale.sh b/lustre/tests/parallel-scale.sh
index aea27cf..840a8b8 100644
--- a/lustre/tests/parallel-scale.sh
+++ b/lustre/tests/parallel-scale.sh
@@ -161,6 +161,11 @@ test_statahead () {
 }
 run_test statahead "statahead test, multiple clients"
 
+test_rr_alloc () {
+	run_rr_alloc
+}
+run_test rr_alloc "Checking even file distribution over OSTs in RR policy"
+
 test_fs_test () {
 	run_fs_test
 }
-- 
1.8.3.1
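
For reference, a minimal sketch of how the new subtest and the rr_alloc
reproducer might be invoked by hand. The mount-point layout, rank count, and
file count below are illustrative assumptions; ONLY= is the standard
test-framework filter used by the Lustre test scripts, and the direct mpirun
form mirrors the usage example in rr_alloc.c.

    # From lustre/tests, run only the new subtest of the parallel-scale
    # suite, matching the Test-Parameters line above.
    ONLY=rr_alloc bash parallel-scale.sh

    # Or drive the reproducer directly: 8 ranks spread over 2 client nodes
    # (e.g. via an MPI hostfile), 4 mount points per node, 512 files per
    # rank; the rank-to-mount mapping assumes /tmp/mnt/lustre0 through
    # /tmp/mnt/lustre3 exist on each node with a "dir" subdirectory in each.
    mpirun -np 8 rr_alloc /tmp/mnt/lustre/dir/ash 512 2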