X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Fsanity-hsm.sh;h=954f1952daf132390917bdda311fa0c8629d40f4;hp=d4ba7098639ff13ae990ed33598da8420b944731;hb=c394068ee148595711c661651368b91373bcc78a;hpb=e9a1f308b5359c2de1fda67816ef662ce727d275 diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh old mode 100644 new mode 100755 index d4ba709..954f195 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -11,10 +11,12 @@ SRCDIR=$(dirname $0) export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/utils:$PATH:/sbin:/usr/sbin ONLY=${ONLY:-"$*"} -# bug number for skipped test: 3815 3939 -ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 34 35 36 40" +# bug number for skipped test: 3815 +ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 34 35 36" # bug number for skipped test:4178 4176 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 200 221 223b 31a" +# bug number for skipped test:LU-3852 +ALWAYS_EXCEPT="$ALWAYS_EXCEPT 251" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! 
LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} @@ -30,9 +32,9 @@ MCREATE=${MCREATE:-mcreate} MOUNT_2=${MOUNT_2:-"yes"} FAIL_ON_ERROR=false -if [[ $MDSCOUNT -ge 2 ]]; then - skip_env "Only run with single MDT for now" && exit -fi +# script only handles up to 10 MDTs (because of MDT_PREFIX) +[ $MDSCOUNT -gt 9 ] && + error "script cannot handle more than 9 MDTs, please fix" && exit check_and_setup_lustre @@ -90,15 +92,22 @@ init_agt_vars() { export HSMTOOL=${HSMTOOL:-"lhsmtool_posix"} export HSMTOOL_VERBOSE=${HSMTOOL_VERBOSE:-""} + export HSMTOOL_UPDATE_INTERVAL=${HSMTOOL_UPDATE_INTERVAL:=""} + export HSMTOOL_EVENT_FIFO=${HSMTOOL_EVENT_FIFO:=""} + export HSMTOOL_TESTDIR export HSMTOOL_BASE=$(basename "$HSMTOOL" | cut -f1 -d" ") HSM_ARCHIVE=$(copytool_device $SINGLEAGT) HSM_ARCHIVE_NUMBER=2 - MDT_PARAM="mdt.$FSNAME-MDT0000" - HSM_PARAM="$MDT_PARAM.hsm" + # The test only support up to 10 MDTs + MDT_PREFIX="mdt.$FSNAME-MDT000" + HSM_PARAM="${MDT_PREFIX}0.hsm" # archive is purged at copytool setup HSM_ARCHIVE_PURGE=true + + # Don't allow copytool error upon start/setup + HSMTOOL_NOERROR=false } # Get the backend root path for the given agent facet. @@ -111,11 +120,28 @@ copytool_device() { # Stop copytool and unregister an existing changelog user. 
cleanup() { + copytool_monitor_cleanup copytool_cleanup changelog_cleanup cdt_set_sanity_policy } +get_mdt_devices() { + local mdtno + # get MDT device for each mdc + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + MDT[$idx]=$($LCTL get_param -n \ + mdc.$FSNAME-MDT000${idx}-mdc-*.mds_server_uuid | + awk '{gsub(/_UUID/,""); print $1}' | head -n1) + done +} + +search_copytools() { + local agents=${1:-$(facet_active_host $SINGLEAGT)} + do_nodesv $agents "pgrep -x $HSMTOOL_BASE" +} + search_and_kill_copytool() { local agents=${1:-$(facet_active_host $SINGLEAGT)} @@ -123,11 +149,68 @@ search_and_kill_copytool() { do_nodesv $agents "killall -q $HSMTOOL_BASE" || true } +copytool_monitor_setup() { + local facet=${1:-$SINGLEAGT} + local agent=$(facet_active_host $facet) + + local cmd="mktemp --tmpdir=/tmp -d ${TESTSUITE}.${TESTNAME}.XXXX" + local test_dir=$(do_node $agent "$cmd") || + error "Failed to create tempdir on $agent" + export HSMTOOL_MONITOR_DIR=$test_dir + + # Create the fifo and a monitor (cat dies when copytool dies) + do_node $agent "mkfifo -m 0644 $test_dir/fifo" || + error "failed to create copytool fifo on $agent" + cmd="cat $test_dir/fifo > $test_dir/events &" + cmd+=" echo \\\$! > $test_dir/monitor_pid" + + if [[ $PDSH == *Rmrsh* ]]; then + # This is required for pdsh -Rmrsh and its handling of remote + # shells. + # Regular ssh and pdsh -Rssh work fine without this + # backgrounded subshell nonsense. + (do_node $agent "$cmd") & + export HSMTOOL_MONITOR_PDSH=$! + + # Slightly racy, but just making a best-effort to catch obvious + # problems. + sleep 1 + ps -p $HSMTOOL_MONITOR_PDSH >&- || + error "Failed to start copytool monitor on $agent" + else + do_node $agent "$cmd" + if [ $? 
!= 0 ]; then + error "Failed to start copytool monitor on $agent" + fi + fi +} + +copytool_monitor_cleanup() { + local facet=${1:-$SINGLEAGT} + local agent=$(facet_active_host $facet) + + if [ -n "$HSMTOOL_MONITOR_DIR" ]; then + # Should die when the copytool dies, but just in case. + local cmd="kill \\\$(cat $HSMTOOL_MONITOR_DIR/monitor_pid)" + cmd+=" 2>/dev/null || true" + do_node $agent "$cmd" + do_node $agent "rm -fr $HSMTOOL_MONITOR_DIR" + export HSMTOOL_MONITOR_DIR= + fi + + # The pdsh should die on its own when the monitor dies. Just + # in case, though, try to clean up to avoid any cruft. + if [ -n "$HSMTOOL_MONITOR_PDSH" ]; then + kill $HSMTOOL_MONITOR_PDSH 2>/dev/null + export HSMTOOL_MONITOR_PDSH= + fi +} + copytool_setup() { local facet=${1:-$SINGLEAGT} local lustre_mntpnt=${2:-$MOUNT} local arc_id=$3 - local hsm_root=$(copytool_device $facet) + local hsm_root=${4:-$(copytool_device $facet)} local agent=$(facet_active_host $facet) if [[ -z "$arc_id" ]] && @@ -147,6 +230,10 @@ copytool_setup() { # independent of hardware local cmd="$HSMTOOL $HSMTOOL_VERBOSE --daemon --hsm-root $hsm_root" [[ -z "$arc_id" ]] || cmd+=" --archive $arc_id" + [[ -z "$HSMTOOL_UPDATE_INTERVAL" ]] || + cmd+=" --update-interval $HSMTOOL_UPDATE_INTERVAL" + [[ -z "$HSMTOOL_EVENT_FIFO" ]] || + cmd+=" --event-fifo $HSMTOOL_EVENT_FIFO" cmd+=" --bandwidth 1 $lustre_mntpnt" # Redirect the standard output and error to a log file which @@ -155,18 +242,36 @@ copytool_setup() { [[ -z "$TESTNAME" ]] || prefix=$prefix.$TESTNAME local copytool_log=$prefix.copytool${arc_id}_log.$agent.log - do_facet $facet "$cmd < /dev/null > $copytool_log 2>&1" || - error "start copytool $facet on $agent failed" + do_facet $facet "$cmd < /dev/null > $copytool_log 2>&1" + if [[ $? 
!= 0 ]]; then + [[ $HSMTOOL_NOERROR == true ]] || + error "start copytool $facet on $agent failed" + echo "start copytool $facet on $agent failed" + fi + trap cleanup EXIT } +get_copytool_event_log() { + local facet=${1:-$SINGLEAGT} + local agent=$(facet_active_host $facet) + + [ -z "$HSMTOOL_MONITOR_DIR" ] && + error "Can't get event log: No monitor directory!" + + do_node $agent "cat $HSMTOOL_MONITOR_DIR/events" || + error "Could not collect event log from $agent" +} + copytool_cleanup() { trap - EXIT - local agents=${1:-$(facet_active_host $SINGLEAGT)} + local facet=$SINGLEAGT + local agents=${1:-$(facet_active_host $facet)} local mdtno local idx local oldstate local mdt_hsmctrl + local hsm_root=$(copytool_device $facet) do_nodesv $agents "pkill -INT -x $HSMTOOL_BASE" || return 0 sleep 1 @@ -191,6 +296,7 @@ copytool_cleanup() { "$oldstate" 20 || error "mds${mdtno} cdt state is not $oldstate" done + do_facet $facet "rm -rf $hsm_root" } copytool_suspend() { @@ -227,20 +333,66 @@ copy2archive() { do_facet $SINGLEAGT cp -p $1 $file || error "cannot copy $1 to $file" } +mdts_set_param() { + local arg=$1 + local key=$2 + local value=$3 + local mdtno + local rc=0 + if [[ "$value" != "" ]]; then + value="=$value" + fi + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + local facet=mds${mdtno} + # if $arg include -P option, run 1 set_param per MDT on the MGS + # else, run set_param on each MDT + [[ $arg = *"-P"* ]] && facet=mgs + do_facet $facet $LCTL set_param $arg mdt.${MDT[$idx]}.$key$value + [[ $? 
!= 0 ]] && rc=1 + done + return $rc +} + +mdts_check_param() { + local key="$1" + local target="$2" + local timeout="$3" + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + wait_result mds${mdtno} \ + "$LCTL get_param -n $MDT_PREFIX${idx}.$key" "$target" \ + $timeout || + error "$key state is not '$target' on mds${mdtno}" + done +} + changelog_setup() { - CL_USER=$(do_facet $SINGLEMDS $LCTL --device $MDT0\ - changelog_register -n) - do_facet $SINGLEMDS lctl set_param mdd.$MDT0.changelog_mask="+hsm" - $LFS changelog_clear $MDT0 $CL_USER 0 + CL_USERS=() + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + local cl_user=$(do_facet mds${mdtno} $LCTL \ + --device ${MDT[$idx]} \ + changelog_register -n) + CL_USERS+=($cl_user) + do_facet mds${mdtno} lctl set_param \ + mdd.${MDT[$idx]}.changelog_mask="+hsm" + $LFS changelog_clear ${MDT[$idx]} $cl_user 0 + done } changelog_cleanup() { -# $LFS changelog $MDT0 - [[ -n "$CL_USER" ]] || return 0 - - $LFS changelog_clear $MDT0 $CL_USER 0 - do_facet $SINGLEMDS lctl --device $MDT0 changelog_deregister $CL_USER - CL_USER= + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + [[ -z ${CL_USERS[$idx]} ]] && continue + $LFS changelog_clear ${MDT[$idx]} ${CL_USERS[$idx]} 0 + do_facet mds${mdtno} lctl --device ${MDT[$idx]} \ + changelog_deregister ${CL_USERS[$idx]} + done + CL_USERS=() } changelog_get_flags() { @@ -261,64 +413,57 @@ set_hsm_param() { local param=$1 local value=$2 local opt=$3 - if [[ "$value" != "" ]]; then - value="=$value" - fi - do_facet $SINGLEMDS $LCTL set_param $opt -n $HSM_PARAM.$param$value + mdts_set_param "$opt -n" "hsm.$param" "$value" return $? 
} set_test_state() { local cmd=$1 local target=$2 - do_facet $SINGLEMDS $LCTL set_param $MDT_PARAM.hsm_control=$cmd - wait_result $SINGLEMDS "$LCTL get_param -n $MDT_PARAM.hsm_control"\ - $target 10 || error "cdt state is not $target" + mdts_set_param "" hsm_control "$cmd" + mdts_check_param hsm_control "$target" 10 } cdt_set_sanity_policy() { if [[ "$CDT_POLICY_HAD_CHANGED" ]] then # clear all - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=+NRA - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=-NBR + mdts_set_param "" hsm.policy "+NRA" + mdts_set_param "" hsm.policy "-NBR" CDT_POLICY_HAD_CHANGED= fi } cdt_set_no_retry() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=+NRA + mdts_set_param "" hsm.policy "+NRA" CDT_POLICY_HAD_CHANGED=true } cdt_clear_no_retry() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=-NRA + mdts_set_param "" hsm.policy "-NRA" CDT_POLICY_HAD_CHANGED=true } cdt_set_non_blocking_restore() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=+NBR + mdts_set_param "" hsm.policy "+NBR" CDT_POLICY_HAD_CHANGED=true } cdt_clear_non_blocking_restore() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=-NBR + mdts_set_param "" hsm.policy "-NBR" CDT_POLICY_HAD_CHANGED=true } cdt_clear_mount_state() { - do_facet $SINGLEMDS $LCTL set_param -d -P $MDT_PARAM.hsm_control + mdts_set_param "-P -d" hsm_control "" } cdt_set_mount_state() { - do_facet $SINGLEMDS $LCTL set_param -P $MDT_PARAM.hsm_control=$1 + mdts_set_param "-P" hsm_control "$1" } cdt_check_state() { - local target=$1 - wait_result $SINGLEMDS\ - "$LCTL get_param -n $MDT_PARAM.hsm_control" "$target" 20 || - error "cdt state is not $target" + mdts_check_param hsm_control "$1" 20 } cdt_disable() { @@ -423,6 +568,12 @@ make_small() { path2fid $1 || error "cannot get fid on $1" } +make_small_sync() { + dd if=/dev/urandom of=$1 count=1 bs=1M conv=sync || + error "cannot create $1" + path2fid $1 || error "cannot get fid on $1" +} + cleanup_large_files() { 
local ratio=$(df -P $MOUNT | tail -1 | awk '{print $5}' | sed 's/%//g') @@ -431,7 +582,7 @@ cleanup_large_files() { make_large_for_striping() { local file2=${1/$DIR/$DIR2} - local sz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -1) + local sz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -n1) cleanup_large_files @@ -489,12 +640,15 @@ wait_request_state() { local fid=$1 local request=$2 local state=$3 + # 4th arg (mdt index) is optional + local mdtidx=${4:-0} + local mds=mds$(($mdtidx + 1)) - local cmd="$LCTL get_param -n $HSM_PARAM.actions" + local cmd="$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.actions" cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d=" - wait_result $SINGLEMDS "$cmd" $state 100 || - error "request on $fid is not $state" + wait_result $mds "$cmd" $state 100 || + error "request on $fid is not $state on $mds" } get_request_state() { @@ -515,8 +669,10 @@ get_request_count() { wait_all_done() { local timeout=$1 + local fid=$2 local cmd="$LCTL get_param -n $HSM_PARAM.actions" + [[ -n $fid ]] && cmd+=" | grep '$fid'" cmd+=" | egrep 'WAITING|STARTED'" wait_result $SINGLEMDS "$cmd" "" $timeout || @@ -528,8 +684,20 @@ wait_for_grace_delay() { sleep $val } -MDT0=$($LCTL get_param -n mdc.*.mds_server_uuid | - awk '{gsub(/_UUID/,""); print $1}' | head -1) +parse_json_event() { + local raw_event=$1 + + # python2.6 in EL6 includes an internal json module + local json_parser='import json; import fileinput;' + json_parser+=' print "\n".join(["local %s=\"%s\"" % tuple for tuple in ' + json_parser+='json.loads([line for line in ' + json_parser+='fileinput.input()][0]).items()])' + + echo $raw_event | python -c "$json_parser" +} + +# populate MDT device array +get_mdt_devices # initiate variables init_agt_vars @@ -826,7 +994,7 @@ test_10d() { } run_test 10d "Archive a file on the default archive id" -test_11() { +test_11a() { mkdir -p $DIR/$tdir copy2archive /etc/hosts $tdir/$tfile local f=$DIR/$tdir/$tfile @@ -850,7 
+1018,31 @@ test_11() { local AFILE=$(do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid) || error "fid $fid not in archive $HSM_ARCHIVE" } -run_test 11 "Import a file" +run_test 11a "Import a file" + +test_11b() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + $LFS hsm_archive -a $HSM_ARCHIVE_NUMBER $f || + error "hsm_archive failed" + wait_request_state $fid ARCHIVE SUCCEED + + local FILE_HASH=$(md5sum $f) + rm -f $f + + import_file $fid $f + + echo "$FILE_HASH" | md5sum -c + + [[ $? -eq 0 ]] || error "Restored file differs" + + copytool_cleanup +} +run_test 11b "Import a deleted file using its FID" test_12a() { # test needs a running copytool @@ -861,16 +1053,16 @@ test_12a() { local f=$DIR/$tdir/$tfile import_file $tdir/$tfile $f - local f=$DIR2/$tdir/$tfile + local f2=$DIR2/$tdir/$tfile echo "Verifying released state: " - check_hsm_flags $f "0x0000000d" + check_hsm_flags $f2 "0x0000000d" - local fid=$(path2fid $f) - $LFS hsm_restore $f + local fid=$(path2fid $f2) + $LFS hsm_restore $f2 wait_request_state $fid RESTORE SUCCEED echo "Verifying file state: " - check_hsm_flags $f "0x00000009" + check_hsm_flags $f2 "0x00000009" do_facet $SINGLEAGT diff -q $HSM_ARCHIVE/$tdir/$tfile $f @@ -1090,6 +1282,57 @@ test_12n() { } run_test 12n "Import/implicit restore/release" +test_12o() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "release of $f failed" + +#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 + do_facet $SINGLEMDS lctl set_param fail_loc=0x152 + + # set no retry action mode + cdt_set_no_retry + + diff -q /etc/hosts $f + local st=$? 
+ + # we check we had a restore failure + wait_request_state $fid RESTORE FAILED + + [[ $st -eq 0 ]] && error "Restore must fail" + + # remove no retry action mode + cdt_clear_no_retry + + # check file is still released + check_hsm_flags $f "0x0000000d" + + # retry w/o failure injection + do_facet $SINGLEMDS lctl set_param fail_loc=0 + + # to be sure previous RESTORE result is gone + cdt_purge + wait_for_grace_delay + + diff -q /etc/hosts $f + st=$? + + # we check we had a restore done + wait_request_state $fid RESTORE SUCCEED + + [[ $st -eq 0 ]] || error "Restored file differs" + + copytool_cleanup +} +run_test 12o "Layout-swap failure during Restore leaves file released" + test_13() { # test needs a running copytool copytool_setup @@ -1284,19 +1527,41 @@ test_21() { local fid=$(make_small $f) check_hsm_flags $f "0x00000000" + # LU-4388/LU-4389 - ZFS does not report full number of blocks + # used until file is flushed to disk + if [ $(facet_fstype ost1) == "zfs" ]; then + # this causes an OST_SYNC rpc to be sent + dd if=/dev/zero of=$f bs=512 count=1 oflag=sync conv=notrunc,fsync + # clear locks to reread file data + cancel_lru_locks osc + fi + + local orig_size=$(stat -c "%s" $f) + local orig_blocks=$(stat -c "%b" $f) + + start_full_debug_logging + $LFS hsm_archive $f || error "could not archive file" wait_request_state $fid ARCHIVE SUCCEED - [ $(stat -c "%b" $f) -ne "1" ] || error "wrong block number" - local sz=$(stat -c "%s" $f) - [ $sz -ne "0" ] || error "file size should not be zero" + local blocks=$(stat -c "%b" $f) + [ $blocks -eq $orig_blocks ] || + error "$f: wrong block number after archive: " \ + "$blocks != $orig_blocks" + local size=$(stat -c "%s" $f) + [ $size -eq $orig_size ] || + error "$f: wrong size after archive: $size != $orig_size" # Release and check states $LFS hsm_release $f || error "could not release file" check_hsm_flags $f "0x0000000d" - [ $(stat -c "%b" $f) -eq "1" ] || error "wrong block number" - [ $(stat -c "%s" $f) -eq $sz ] || 
error "wrong file size" + blocks=$(stat -c "%b" $f) + [ $blocks -gt 5 ] && + error "$f: too many blocks after release: $blocks > 5" + size=$(stat -c "%s" $f) + [ $size -ne $orig_size ] && + error "$f: wrong size after release: $size != $orig_size" # Check we can release an file without stripe info f=$f.nolov @@ -1314,6 +1579,8 @@ test_21() { $LFS hsm_release $f || fail "second release should succeed" check_hsm_flags $f "0x0000000d" + stop_full_debug_logging + copytool_cleanup } run_test 21 "Simple release tests" @@ -1995,6 +2262,15 @@ test_33() { wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f + # to be sure wait_all_done will not be mislead by previous tests + # and ops. + cdt_purge + wait_for_grace_delay + # Also raise grace_delay significantly so the Canceled + # Restore action will stay enough long avail. + local old_grace=$(get_hsm_param grace_delay) + set_hsm_param grace_delay 100 + md5sum $f >/dev/null & local pid=$! wait_request_state $fid RESTORE STARTED @@ -2007,8 +2283,29 @@ test_33() { $LFS hsm_cancel $f - wait_request_state $fid RESTORE CANCELED - wait_request_state $fid CANCEL SUCCEED + # instead of waiting+checking both Restore and Cancel ops + # sequentially, wait for both to be finished and then check + # each results. + wait_all_done 100 $fid + local rstate=$(get_request_state $fid RESTORE) + local cstate=$(get_request_state $fid CANCEL) + + # restore orig grace_delay. 
+ set_hsm_param grace_delay $old_grace + + if [[ "$rstate" == "CANCELED" ]] ; then + [[ "$cstate" == "SUCCEED" ]] || + error "Restore state is CANCELED and Cancel state " \ + "is not SUCCEED but $cstate" + echo "Restore state is CANCELED, Cancel state is SUCCEED" + elif [[ "$rstate" == "SUCCEED" ]] ; then + [[ "$cstate" == "FAILED" ]] || + error "Restore state is SUCCEED and Cancel state " \ + "is not FAILED but $cstate" + echo "Restore state is SUCCEED, Cancel state is FAILED" + else + error "Restore state is $rstate and Cancel state is $cstate" + fi [ -z $killed ] || error "Cannot kill process waiting for restore ($killed)" @@ -2145,7 +2442,13 @@ test_40() { fid=$(copy_file /etc/hosts $f.$p.$i) done done - copytool_setup + # force copytool to use a local/temp archive dir to ensure best + # performance vs remote/NFS mounts used in auto-tests + if df --local $HSM_ARCHIVE >/dev/null 2>&1 ; then + copytool_setup + else + copytool_setup $SINGLEAGT $MOUNT $HSM_ARCHIVE_NUMBER $TMP/$tdir + fi # to be sure wait_all_done will not be mislead by previous tests cdt_purge wait_for_grace_delay @@ -2389,10 +2692,288 @@ test_58() { } run_test 58 "Truncate a released file will trigger restore" -test_90() { - file_count=57 +test_60() { + # This test validates the fix for LU-4512. Ensure that the -u + # option changes the progress reporting interval from the default + # (30 seconds) to the user-specified interval. 
+ local interval=5 + local progress_timeout=$((interval * 3)) + + # test needs a new running copytool + copytool_cleanup + HSMTOOL_UPDATE_INTERVAL=$interval copytool_setup + mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile + local fid=$(make_large_for_progress $f) + + local start_at=$(date +%s) + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || + error "could not archive file" + + local mdtidx=0 + local mdt=${MDT_PREFIX}${mdtidx} + local mds=mds$((mdtidx + 1)) + + local cmd="$LCTL get_param -n ${mdt}.hsm.active_requests" + cmd+=" | awk '/'$fid'.*action=ARCHIVE/ {print \\\$12}' | cut -f2 -d=" + + local RESULT + local WAIT=0 + local sleep=1 + + echo -n "Expecting a progress update within $progress_timeout seconds... " + while [ true ]; do + RESULT=$(do_node $(facet_active_host $mds) "$cmd") + if [ $RESULT -gt 0 ]; then + echo "$RESULT bytes copied in $WAIT seconds." + break + elif [ $WAIT -ge $progress_timeout ]; then + error "Timed out waiting for progress update!" + break + fi + WAIT=$((WAIT + sleep)) + sleep $sleep + done + + local finish_at=$(date +%s) + local elapsed=$((finish_at - start_at)) + + # Ensure that the progress update occurred within the expected window. + if [ $elapsed -lt $interval ]; then + error "Expected progress update after at least $interval seconds" + fi + + cdt_clear_no_retry + copytool_cleanup +} +run_test 60 "Changing progress update interval from default" + +test_70() { + # test needs a new running copytool + copytool_cleanup + copytool_monitor_setup + HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup + + # Just start and stop the copytool to generate events. 
+ cdt_clear_no_retry + copytool_cleanup + + local REGISTER_EVENT + local UNREGISTER_EVENT + while read event; do + local parsed=$(parse_json_event "$event") + if [ -z "$parsed" ]; then + error "Copytool sent malformed event: $event" + fi + eval $parsed + + if [ $event_type == "REGISTER" ]; then + REGISTER_EVENT=$event + elif [ $event_type == "UNREGISTER" ]; then + UNREGISTER_EVENT=$event + fi + done < <(echo $"$(get_copytool_event_log)") + + if [ -z "$REGISTER_EVENT" ]; then + error "Copytool failed to send register event to FIFO" + fi + + if [ -z "$UNREGISTER_EVENT" ]; then + error "Copytool failed to send unregister event to FIFO" + fi + + copytool_monitor_cleanup + echo "Register/Unregister events look OK." +} +run_test 70 "Copytool logs JSON register/unregister events to FIFO" + +test_71() { + # Bump progress interval for livelier events. + local interval=5 + + # test needs a new running copytool + copytool_cleanup + copytool_monitor_setup + HSMTOOL_UPDATE_INTERVAL=$interval \ + HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(make_large_for_progress $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || + error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + local expected_fields="event_time data_fid source_fid" + expected_fields+=" total_bytes current_bytes" + + local START_EVENT + local FINISH_EVENT + while read event; do + # Make sure we're not getting anything from previous events. 
+ for field in $expected_fields; do + unset $field + done + + local parsed=$(parse_json_event "$event") + if [ -z "$parsed" ]; then + error "Copytool sent malformed event: $event" + fi + eval $parsed + + if [ $event_type == "ARCHIVE_START" ]; then + START_EVENT=$event + continue + elif [ $event_type == "ARCHIVE_FINISH" ]; then + FINISH_EVENT=$event + continue + elif [ $event_type != "ARCHIVE_RUNNING" ]; then + continue + fi + + # Do some simple checking of the progress update events. + for expected_field in $expected_fields; do + if [ -z ${!expected_field+x} ]; then + error "Missing $expected_field field in event" + fi + done + + if [ $total_bytes -eq 0 ]; then + error "Expected total_bytes to be > 0" + fi + + # These should be identical throughout an archive + # operation. + if [ $source_fid != $data_fid ]; then + error "Expected source_fid to equal data_fid" + fi + done < <(echo $"$(get_copytool_event_log)") + + if [ -z "$START_EVENT" ]; then + error "Copytool failed to send archive start event to FIFO" + fi + + if [ -z "$FINISH_EVENT" ]; then + error "Copytool failed to send archive finish event to FIFO" + fi + + echo "Archive events look OK." + + cdt_clear_no_retry + copytool_cleanup + copytool_monitor_cleanup +} +run_test 71 "Copytool logs JSON archive events to FIFO" + +test_72() { + # Bump progress interval for livelier events. 
+ local interval=5 + + # test needs a new running copytool + copytool_cleanup + copytool_monitor_setup + HSMTOOL_UPDATE_INTERVAL=$interval \ + HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup + local test_file=$HSMTOOL_MONITOR_DIR/file + + local cmd="dd if=/dev/urandom of=$test_file count=16 bs=1000000 " + cmd+="conv=fsync" + do_facet $SINGLEAGT "$cmd" || + error "cannot create $test_file on $SINGLEAGT" + copy2archive $test_file $tdir/$tfile + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + import_file $tdir/$tfile $f + f=$DIR2/$tdir/$tfile + echo "Verifying released state: " + check_hsm_flags $f "0x0000000d" + + local fid=$(path2fid $f) + $LFS hsm_restore $f + wait_request_state $fid RESTORE SUCCEED + + local expected_fields="event_time data_fid source_fid" + expected_fields+=" total_bytes current_bytes" + + local START_EVENT + local FINISH_EVENT + while read event; do + # Make sure we're not getting anything from previous events. + for field in $expected_fields; do + unset $field + done + + local parsed=$(parse_json_event "$event") + if [ -z "$parsed" ]; then + error "Copytool sent malformed event: $event" + fi + eval $parsed + + if [ $event_type == "RESTORE_START" ]; then + START_EVENT=$event + if [ $source_fid != $data_fid ]; then + error "source_fid should == data_fid at start" + fi + continue + elif [ $event_type == "RESTORE_FINISH" ]; then + FINISH_EVENT=$event + if [ $source_fid != $data_fid ]; then + error "source_fid should == data_fid at finish" + fi + continue + elif [ $event_type != "RESTORE_RUNNING" ]; then + continue + fi + + # Do some simple checking of the progress update events. + for expected_field in $expected_fields; do + if [ -z ${!expected_field+x} ]; then + error "Missing $expected_field field in event" + fi + done + + if [ $total_bytes -eq 0 ]; then + error "Expected total_bytes to be > 0" + fi + + # When a restore starts out, the data fid is the same as the + # source fid. 
After the restore has gotten going, we learn + # the new data fid. Once the restore has finished, the source + # fid is set to the new data fid. + # + # We test this because some monitoring software may depend on + # this behavior. If it changes, then the consumers of these + # events may need to be modified. + if [ $source_fid == $data_fid ]; then + error "source_fid should != data_fid during restore" + fi + done < <(echo $"$(get_copytool_event_log)") + + if [ -z "$START_EVENT" ]; then + error "Copytool failed to send restore start event to FIFO" + fi + + if [ -z "$FINISH_EVENT" ]; then + error "Copytool failed to send restore finish event to FIFO" + fi + + echo "Restore events look OK." + + cdt_clear_no_retry + copytool_cleanup + copytool_monitor_cleanup + + rm -rf $test_dir +} +run_test 72 "Copytool logs JSON restore events to FIFO" + +test_90() { + file_count=51 # Max number of files constrained by LNET message size + mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed" + local f=$DIR/$tdir/$tfile local FILELIST=/tmp/filelist.txt local i="" @@ -2529,28 +3110,70 @@ test_105() { } run_test 105 "Restart of coordinator" -test_106() { - # test needs a running copytool - copytool_setup +get_agent_by_uuid_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + do_facet $mds "$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.agents |\ + grep $uuid" +} + +check_agent_registered_by_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + local agent=$(get_agent_by_uuid_mdt $uuid $mdtidx) + if [[ ! 
-z "$agent" ]]; then + echo "found agent $agent on $mds" + else + error "uuid $uuid not found in agent list on $mds" + fi +} + +check_agent_unregistered_by_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + local agent=$(get_agent_by_uuid_mdt $uuid $mdtidx) + if [[ -z "$agent" ]]; then + echo "uuid not found in agent list on $mds" + else + error "uuid found in agent list on $mds: $agent" + fi +} + +check_agent_registered() { + local uuid=$1 + local mdsno + for mdsno in $(seq 1 $MDSCOUNT); do + check_agent_registered_by_mdt $uuid $((mdsno - 1)) + done +} +check_agent_unregistered() { + local uuid=$1 + local mdsno + for mdsno in $(seq 1 $MDSCOUNT); do + check_agent_unregistered_by_mdt $uuid $((mdsno - 1)) + done +} + +test_106() { local uuid=$(do_rpc_nodes $(facet_active_host $SINGLEAGT) \ get_client_uuid $MOUNT | cut -d' ' -f2) - local agent=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.agents | - grep $uuid) + + copytool_setup + check_agent_registered $uuid + + search_copytools || error "No copytool found" + copytool_cleanup - [[ ! -z "$agent" ]] || error "My uuid $uuid not found in agent list" - local agent=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.agents | - grep $uuid) - [[ -z "$agent" ]] || - error "My uuid $uuid still found in agent list,"\ - " after copytool shutdown" + check_agent_unregistered $uuid + copytool_setup - local agent=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.agents | - grep $uuid) + check_agent_registered $uuid + copytool_cleanup - [[ ! 
-z "$agent" ]] || - error "My uuid $uuid not found in agent list after"\ - " copytool restart" } run_test 106 "Copytool register/unregister" @@ -2826,7 +3449,7 @@ test_220() { $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) changelog_cleanup local target=0x0 @@ -2853,7 +3476,7 @@ test_221() { wait_request_state $fid ARCHIVE CANCELED wait_request_state $fid CANCEL SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0x7d [[ $flags == $target ]] || error "Changelog flag is $flags not $target" @@ -2878,7 +3501,7 @@ test_222a() { $LFS hsm_restore $f wait_request_state $fid RESTORE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0x80 [[ $flags == $target ]] || error "Changelog flag is $flags not $target" @@ -2904,7 +3527,7 @@ test_222b() { wait_request_state $fid RESTORE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0x80 [[ $flags == $target ]] || error "Changelog flag is $flags not $target" @@ -2933,7 +3556,7 @@ test_223a() { wait_request_state $fid RESTORE CANCELED wait_request_state $fid CANCEL SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0xfd [[ $flags == $target ]] || @@ -2962,7 +3585,7 @@ test_223b() { wait_request_state $fid RESTORE CANCELED wait_request_state $fid CANCEL SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0xfd [[ $flags == $target ]] || @@ -2988,7 +3611,7 @@ test_224() { $LFS hsm_remove 
$f wait_request_state $fid REMOVE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -n 1) local target=0x200 [[ $flags == $target ]] || @@ -3024,9 +3647,9 @@ test_225() { wait_request_state $fid REMOVE CANCELED wait_request_state $fid CANCEL SUCCEED - flags=$(changelog_get_flags $MDT0 RENME $fid2) - local flags=$($LFS changelog $MDT0 | grep HSM | grep $fid | tail -1 | - awk '{print $5}') + flags=$(changelog_get_flags ${MDT[0]} RENME $fid2) + local flags=$($LFS changelog ${MDT[0]} | grep HSM | grep $fid | + tail -n 1 | awk '{print $5}') local target=0x27d [[ $flags == $target ]] || @@ -3058,7 +3681,7 @@ test_226() { rm $f1 || error "rm $f1 failed" - local flags=$(changelog_get_flags $MDT0 UNLNK $fid1) + local flags=$(changelog_get_flags ${MDT[0]} UNLNK $fid1) local target=0x3 [[ $flags == $target ]] || @@ -3066,7 +3689,7 @@ test_226() { mv $f3 $f2 || error "mv $f3 $f2 failed" - flags=$(changelog_get_flags $MDT0 RENME $fid2) + flags=$(changelog_get_flags ${MDT[0]} RENME $fid2) target=0x3 [[ $flags == $target ]] || @@ -3086,7 +3709,7 @@ check_flags_changes() { local target=0x280 $LFS hsm_set --$hsm_flag $f || error "Cannot set $hsm_flag on $f" - local flags=($(changelog_get_flags $MDT0 HSM $fid)) + local flags=($(changelog_get_flags ${MDT[0]} HSM $fid)) local seen=${#flags[*]} cnt=$((fst + cnt)) [[ $seen == $cnt ]] || @@ -3097,7 +3720,7 @@ check_flags_changes() { $LFS hsm_clear --$hsm_flag $f || error "Cannot clear $hsm_flag on $f" - flags=($(changelog_get_flags $MDT0 HSM $fid)) + flags=($(changelog_get_flags ${MDT[0]} HSM $fid)) seen=${#flags[*]} cnt=$(($cnt + 1)) [[ $cnt == $seen ]] || @@ -3137,10 +3760,9 @@ test_228() { # test needs a running copytool copytool_setup - dd if=/dev/urandom of=$DIR/$tfile bs=1M count=1 conv=sync || - error "creating $DIR/$tfile" + local fid=$(make_small_sync $DIR/$tfile) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $DIR/$tfile - wait_request_state 
$(path2fid $DIR/$tfile) ARCHIVE SUCCEED + wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $DIR/$tfile check_hsm_flags $DIR/$tfile "0x0000000d" @@ -3157,13 +3779,15 @@ test_228() { $LFS hsm_release $DIR/$tfile check_hsm_flags $DIR/$tfile "0x0000000d" - mkdir $DIR/$tdir + mkdir -p $DIR/$tdir || error "mkdir $tdir failed" tar cf - --sparse $DIR/$tfile | tar xvf - -C $DIR/$tdir || error "tar failed" cmp $DIR/$tfile $DIR/$tdir/$DIR/$tfile || error "comparing untarred $DIR/$tfile" + rm -f $DIR/$tfile $DIR/$tfile.2 || + error "rm $DIR/$tfile or $DIR/$tfile.2 failed" copytool_cleanup } run_test 228 "On released file, return extend to FIEMAP. For [cp,tar] --sparse" @@ -3286,7 +3910,11 @@ test_302() { cdt_shutdown set_hsm_param default_archive_id $new -P - fail $SINGLEMDS + + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + fail mds${mdtno} + done # check cdt is on cdt_check_state enabled @@ -3300,6 +3928,168 @@ test_302() { } run_test 302 "HSM tunnable are persistent when CDT is off" +test_400() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + copytool_setup + + mkdir -p $DIR/$tdir + + local dir_mdt0=$DIR/$tdir/mdt0 + local dir_mdt1=$DIR/$tdir/mdt1 + + # create 1 dir per MDT + $LFS mkdir -i 0 $dir_mdt0 || error "lfs mkdir" + $LFS mkdir -i 1 $dir_mdt1 || error "lfs mkdir" + + # create 1 file in each MDT + local fid1=$(make_small $dir_mdt0/$tfile) + local fid2=$(make_small $dir_mdt1/$tfile) + + # check that hsm request on mdt0 is sent to the right MDS + $LFS hsm_archive $dir_mdt0/$tfile || error "lfs hsm_archive" + wait_request_state $fid1 ARCHIVE SUCCEED 0 && + echo "archive successful on mdt0" + + # check that hsm request on mdt1 is sent to the right MDS + $LFS hsm_archive $dir_mdt1/$tfile || error "lfs hsm_archive" + wait_request_state $fid2 ARCHIVE SUCCEED 1 && + echo "archive successful on mdt1" + + copytool_cleanup + # clean test files and directories + rm -rf $dir_mdt0 $dir_mdt1 +} +run_test 400 "Single request is sent to the right MDT" + 
# Issue one hsm_archive command naming a file on MDT0 and a file on MDT1,
# then wait for the ARCHIVE request of each file to succeed on its own MDT.
# Skipped when fewer than 2 MDTs are configured.
test_401() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	copytool_setup

	mkdir -p $DIR/$tdir

	local dir_mdt0=$DIR/$tdir/mdt0
	local dir_mdt1=$DIR/$tdir/mdt1

	# create 1 dir per MDT
	$LFS mkdir -i 0 $dir_mdt0 || error "lfs mkdir"
	$LFS mkdir -i 1 $dir_mdt1 || error "lfs mkdir"

	# create 1 file in each MDT
	local fid1=$(make_small $dir_mdt0/$tfile)
	local fid2=$(make_small $dir_mdt1/$tfile)

	# check that compound requests are dispatched to the right MDTs
	$LFS hsm_archive $dir_mdt0/$tfile $dir_mdt1/$tfile ||
		error "lfs hsm_archive"
	wait_request_state $fid1 ARCHIVE SUCCEED 0 &&
		echo "archive successful on mdt0"
	wait_request_state $fid2 ARCHIVE SUCCEED 1 &&
		echo "archive successful on mdt1"

	copytool_cleanup
	# clean test files and directories
	rm -rf $dir_mdt0 $dir_mdt1
}
run_test 401 "Compound requests split and sent to their respective MDTs"

# Run "lctl --device <dev> <state>" on a facet for every device listed by
# "lctl dl" whose line matches the extended regex "<MDT_pattern>-mdc".
# usage: mdc_change_state facet MDT_pattern activate|deactivate
# Prints one "<state> <device> on <node>" line per device.
# Returns 1 as soon as one device command fails.
mdc_change_state() # facet, MDT_pattern, activate|deactivate
{
	local facet=$1
	local pattern="$2"
	local state=$3
	local node=$(facet_active_host $facet)
	local mdc

	# the device name is the 4th column of the "lctl dl" output
	for mdc in $(do_facet $facet "$LCTL dl | grep -E ${pattern}-mdc" |
			awk '{print $4}'); do
		echo "$state $mdc on $node"
		do_facet $facet "$LCTL --device $mdc $state" || return 1
	done
}

# Deactivate every MDC matching "$FSNAME-MDT000." on $SINGLEAGT, start the
# copytool with HSMTOOL_NOERROR=true, then expect no registered agent and no
# running copytool process. The MDCs are reactivated at the end.
test_402() {
	# "agent" was referenced below without ever being set in this
	# function; resolve it the same way test_403 does.
	local agent=$(facet_active_host $SINGLEAGT)

	# make sure there is no running copytool
	copytool_cleanup

	# deactivate all mdc on agent1
	mdc_change_state $SINGLEAGT "$FSNAME-MDT000." "deactivate"

	HSMTOOL_NOERROR=true copytool_setup $SINGLEAGT

	check_agent_unregistered "uuid" # match any agent

	# no expected running copytool
	search_copytools $agent && error "Copytool start should have failed"

	# reactivate MDCs
	mdc_change_state $SINGLEAGT "$FSNAME-MDT000." "activate"
}
run_test 402 "Copytool start fails if all MDTs are inactive"

# Start the copytool while the MDCs for MDT0001 are deactivated, check it is
# registered on MDT0000 only, then reactivate the MDCs and check the agent is
# registered again. Skipped when fewer than 2 MDTs are configured.
test_403() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	# make sure there is no running copytool
	copytool_cleanup

	local agent=$(facet_active_host $SINGLEAGT)
	local uuid=$(do_rpc_nodes $agent get_client_uuid | cut -d' ' -f2)

	# deactivate all mdc for MDT0001
	mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "deactivate"

	copytool_setup
	# check the agent is registered on MDT0000, and not on MDT0001
	check_agent_registered_by_mdt $uuid 0
	check_agent_unregistered_by_mdt $uuid 1

	# check running copytool process
	search_copytools $agent || error "No running copytools on $agent"

	# reactivate all mdc for MDT0001
	mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "activate"

	# make sure the copytool is now registered to all MDTs
	check_agent_registered $uuid

	copytool_cleanup
}
run_test 403 "Copytool starts with inactive MDT and register on reconnect"

# Archive a file located on MDT0000 while the MDCs for MDT0001 are
# deactivated, and wait for that request to succeed on MDT0000.
# Skipped when fewer than 2 MDTs are configured.
test_404() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	copytool_setup

	# create files on both MDT0000 and MDT0001
	mkdir -p $DIR/$tdir

	local dir_mdt0=$DIR/$tdir/mdt0
	$LFS mkdir -i 0 $dir_mdt0 || error "lfs mkdir"

	# create 1 file on mdt0
	local fid1=$(make_small $dir_mdt0/$tfile)

	# deactivate all mdc for MDT0001
	mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "deactivate"

	# send an HSM request for files in MDT0000
	$LFS hsm_archive $dir_mdt0/$tfile || error "lfs hsm_archive"

	# check for completion of files in MDT0000
	wait_request_state $fid1 ARCHIVE SUCCEED 0 &&
		echo "archive successful on mdt0"

	# reactivate all mdc for MDT0001
	mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "activate"

	copytool_cleanup
	# clean test files and directories
	rm -rf $dir_mdt0
}
run_test 404 "Inactive MDT does not block requests for active MDTs"

copytool_cleanup
complete $SECONDS