X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;ds=sidebyside;f=lustre%2Ftests%2Flarge-scale.sh;h=65f5d29395fa8a77f405fab46fc5c1c30ae72992;hb=c12fbb19d332be89607462d8a210a8aedf354fa1;hp=6d6664272250c9c2b9716cd942a7aa3c7a180847;hpb=d1a878f71efe2cd08b8475731b747b89558cdd48;p=fs%2Flustre-release.git diff --git a/lustre/tests/large-scale.sh b/lustre/tests/large-scale.sh index 6d66642..65f5d29 100644 --- a/lustre/tests/large-scale.sh +++ b/lustre/tests/large-scale.sh @@ -18,9 +18,9 @@ init_test_env $@ remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0 -[ -n "$CLIENTS" ] || { skip "$0: Need two or more clients" && exit 0; } +[ -n "$CLIENTS" ] || { skip_env "$0: Need two or more clients" && exit 0; } [ $CLIENTCOUNT -ge 2 ] || \ - { skip "$0: Need two or more clients, have $CLIENTCOUNT" && exit 0; } + { skip_env "$0: Need two or more remote clients, have $CLIENTCOUNT" && exit 0; } # [ "$SLOW" = "no" ] && EXCEPT_SLOW="" @@ -39,23 +39,11 @@ check_vbr () { } check_vbr || \ - { skip "$0: no version_recovery" && exit 0; } + { skip_env "$0: no version_recovery" && exit 0; } FAKE_NUM_MAX=${FAKE_NUM_MAX:-1000} [ "$SLOW" = "no" ] && FAKE_NUM_MAX=100 -do_and_time () { - local cmd=$1 - - local start_ts=`date +%s` - - $cmd - - local current_ts=`date +%s` - ELAPSED=`expr $current_ts - $start_ts` - echo "===== START $start_ts CURRENT $current_ts" -} - delete_fake_exports () { NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l") @@ -87,8 +75,8 @@ test_1b() { NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l") [ $NUM -lt $FAKE_NUM ] && error "fake exports $NUM -ne $FAKE_NUM" echo "===== STALE EXPORTS: FAKE_NUM=$FAKE_NUM NUM=$NUM" - do_and_time "zconf_mount_clients $CLIENTS $DIR" - echo "==== $TESTNAME ===== CONNECTION TIME $ELAPSED: FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT" + local elapsed=$(do_and_time "zconf_mount_clients $CLIENTS $DIR") + echo "==== $TESTNAME ===== CONNECTION TIME $elapsed: FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT" # do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1" delete_fake_exports @@ -102,7 +90,7 @@ run_test 1b "VBR: connect $CLIENTCOUNT clients with delayed exports" # fail fn does not do df on all clients fail_mds () { facet_failover mds - client_df + clients_up } test_1c() { @@ -148,7 +136,6 @@ test_1c() { } run_test 1c "VBR: recovery $CLIENTCOUNT clients with delayed exports" - test_1d() { delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; } @@ -175,8 +162,8 @@ test_1d() { EX_NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep -c EXPIRED") [ "$EX_NUM" -eq "$NUM" ] || error "not all exports are expired $EX_NUM != $NUM" - do_and_time "zconf_mount_clients $CLIENTS $DIR" - echo "==== $TESTNAME===== CONNECTION TIME $ELAPSED: expired FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT" + local elapsed=$(do_and_time "zconf_mount_clients $CLIENTS $DIR") + echo "==== $TESTNAME===== CONNECTION TIME $elapsed: expired FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT" do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$OLD_AGE" done @@ -186,6 +173,85 @@ test_1d() { run_test 1d "VBR: expire exports, connect $CLIENTCOUNT clients" # VBR scale tests end +test_3a() { + assert_env CLIENTS MDSRATE MPIRUN + + local -a nodes=(${CLIENTS//,/ }) + + # INCREMENT is a number of clients + # a half of clients by default + increment=${INCREMENT:-$(( CLIENTCOUNT / 2 ))} + + machinefile=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines} + local LOG=$TMP/${TESTSUITE}_$tfile + + local var=mds_svc + local procfile="*.${!var}.recovery_status" + local iters=${ITERS:-3} + local nfiles=${NFILES:-50000} + local nthreads=${THREADS_PER_CLIENT:-3} + + local IFree=$(inodes_available) + [ $IFree -gt $nfiles ] || nfiles=$IFree + + local dir=$DIR/$tdir + mkdir -p $dir + chmod 0777 $dir + + local pid + local list + local -a res + + local num=$increment + + while [ $num -le $CLIENTCOUNT ]; do + list=$(comma_list ${nodes[@]:0:$num}) + + generate_machine_file $list $machinefile || + { error "can not generate machinefile"; exit 1; } + + for i in $(seq $iters); do + mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' --ignore + + COMMAND="${MDSRATE} --create --nfiles $nfiles --dir $dir --filefmt 'f%%d'" + mpi_run -np $((num * nthreads)) -machinefile $machinefile ${COMMAND} | tee ${LOG} & + + pid=$! + echo "pid=$pid" + + # 2 threads 100000 creates 117 secs + sleep 20 + + log "$i : Starting failover on mds" + facet_failover mds + if ! wait_recovery_complete mds $((TIMEOUT * 10)); then + echo "mds recovery is not completed!" + kill -9 $pid + exit 7 + fi + + duration=$(do_facet mds lctl get_param -n $procfile | grep recovery_duration) + + res=( "${res[@]}" "$num" ) + res=( "${res[@]}" "$duration" ) + echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration" + wait $pid + + done + num=$((num + increment)) + done + + mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' --ignore + + i=0 + while [ $i -lt ${#res[@]} ]; do + echo "RECOVERY TIME: NFILES=$nfiles number of clients: ${res[i]} ${res[i+1]}" + i=$((i+2)) + done +} + +run_test 3a "recovery time, $CLIENTCOUNT clients" + equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true