X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Fsanity-hsm.sh;h=0ee67b103990e87f6b8eafec33406a983b066a4d;hp=233db87a509c512e536495158410e4df68700347;hb=f625f670afbe954030ff81f0f8522137d6cdd335;hpb=fd6d814e5cebd79dd1bf566a601538ab6f310269 diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh old mode 100644 new mode 100755 index 233db87..0ee67b1 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -11,10 +11,8 @@ SRCDIR=$(dirname $0) export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/utils:$PATH:/sbin:/usr/sbin ONLY=${ONLY:-"$*"} -# bug number for skipped test: 3815 3939 -ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 34 35 36 40" -# bug number for skipped test:4178 4176 -ALWAYS_EXCEPT="$ALWAYS_EXCEPT 200 221 223b 31a" +# bug number for skipped test: LU-3815 +ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 34 35 36" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} @@ -26,7 +24,6 @@ init_logging MULTIOP=${MULTIOP:-multiop} OPENFILE=${OPENFILE:-openfile} -MCREATE=${MCREATE:-mcreate} MOUNT_2=${MOUNT_2:-"yes"} FAIL_ON_ERROR=false @@ -90,6 +87,9 @@ init_agt_vars() { export HSMTOOL=${HSMTOOL:-"lhsmtool_posix"} export HSMTOOL_VERBOSE=${HSMTOOL_VERBOSE:-""} + export HSMTOOL_UPDATE_INTERVAL=${HSMTOOL_UPDATE_INTERVAL:=""} + export HSMTOOL_EVENT_FIFO=${HSMTOOL_EVENT_FIFO:=""} + export HSMTOOL_TESTDIR export HSMTOOL_BASE=$(basename "$HSMTOOL" | cut -f1 -d" ") HSM_ARCHIVE=$(copytool_device $SINGLEAGT) HSM_ARCHIVE_NUMBER=2 @@ -100,6 +100,9 @@ init_agt_vars() { # archive is purged at copytool setup HSM_ARCHIVE_PURGE=true + + # Don't allow copytool error upon start/setup + HSMTOOL_NOERROR=false } # Get the backend root path for the given agent facet. @@ -112,6 +115,7 @@ copytool_device() { # Stop copytool and unregister an existing changelog user. 
cleanup() { + copytool_monitor_cleanup copytool_cleanup changelog_cleanup cdt_set_sanity_policy @@ -124,7 +128,7 @@ get_mdt_devices() { local idx=$(($mdtno - 1)) MDT[$idx]=$($LCTL get_param -n \ mdc.$FSNAME-MDT000${idx}-mdc-*.mds_server_uuid | - awk '{gsub(/_UUID/,""); print $1}' | head -1) + awk '{gsub(/_UUID/,""); print $1}' | head -n1) done } @@ -140,16 +144,73 @@ search_and_kill_copytool() { do_nodesv $agents "killall -q $HSMTOOL_BASE" || true } +copytool_monitor_setup() { + local facet=${1:-$SINGLEAGT} + local agent=$(facet_active_host $facet) + + local cmd="mktemp --tmpdir=/tmp -d ${TESTSUITE}.${TESTNAME}.XXXX" + local test_dir=$(do_node $agent "$cmd") || + error "Failed to create tempdir on $agent" + export HSMTOOL_MONITOR_DIR=$test_dir + + # Create the fifo and a monitor (cat dies when copytool dies) + do_node $agent "mkfifo -m 0644 $test_dir/fifo" || + error "failed to create copytool fifo on $agent" + cmd="cat $test_dir/fifo > $test_dir/events &" + cmd+=" echo \\\$! > $test_dir/monitor_pid" + + if [[ $PDSH == *Rmrsh* ]]; then + # This is required for pdsh -Rmrsh and its handling of remote + # shells. + # Regular ssh and pdsh -Rssh work fine without this + # backgrounded subshell nonsense. + (do_node $agent "$cmd") & + export HSMTOOL_MONITOR_PDSH=$! + + # Slightly racy, but just making a best-effort to catch obvious + # problems. + sleep 1 + ps -p $HSMTOOL_MONITOR_PDSH >&- || + error "Failed to start copytool monitor on $agent" + else + do_node $agent "$cmd" + if [ $? != 0 ]; then + error "Failed to start copytool monitor on $agent" + fi + fi +} + +copytool_monitor_cleanup() { + local facet=${1:-$SINGLEAGT} + local agent=$(facet_active_host $facet) + + if [ -n "$HSMTOOL_MONITOR_DIR" ]; then + # Should die when the copytool dies, but just in case. 
+ local cmd="kill \\\$(cat $HSMTOOL_MONITOR_DIR/monitor_pid)" + cmd+=" 2>/dev/null || true" + do_node $agent "$cmd" + do_node $agent "rm -fr $HSMTOOL_MONITOR_DIR" + export HSMTOOL_MONITOR_DIR= + fi + + # The pdsh should die on its own when the monitor dies. Just + # in case, though, try to clean up to avoid any cruft. + if [ -n "$HSMTOOL_MONITOR_PDSH" ]; then + kill $HSMTOOL_MONITOR_PDSH 2>/dev/null + export HSMTOOL_MONITOR_PDSH= + fi +} + copytool_setup() { local facet=${1:-$SINGLEAGT} local lustre_mntpnt=${2:-$MOUNT} local arc_id=$3 - local hsm_root=$(copytool_device $facet) + local hsm_root=${4:-$(copytool_device $facet)} local agent=$(facet_active_host $facet) if [[ -z "$arc_id" ]] && do_facet $facet "pkill -CONT -x $HSMTOOL_BASE"; then - echo "Wakeup copytool $facet on $agent" + echo "Only wakeup running copytool $facet on $agent" return 0 fi @@ -164,6 +225,10 @@ copytool_setup() { # independent of hardware local cmd="$HSMTOOL $HSMTOOL_VERBOSE --daemon --hsm-root $hsm_root" [[ -z "$arc_id" ]] || cmd+=" --archive $arc_id" + [[ -z "$HSMTOOL_UPDATE_INTERVAL" ]] || + cmd+=" --update-interval $HSMTOOL_UPDATE_INTERVAL" + [[ -z "$HSMTOOL_EVENT_FIFO" ]] || + cmd+=" --event-fifo $HSMTOOL_EVENT_FIFO" cmd+=" --bandwidth 1 $lustre_mntpnt" # Redirect the standard output and error to a log file which @@ -172,22 +237,55 @@ copytool_setup() { [[ -z "$TESTNAME" ]] || prefix=$prefix.$TESTNAME local copytool_log=$prefix.copytool${arc_id}_log.$agent.log - do_facet $facet "$cmd < /dev/null > $copytool_log 2>&1" || - error "start copytool $facet on $agent failed" + do_facet $facet "$cmd < /dev/null > $copytool_log 2>&1" + if [[ $? 
!= 0 ]]; then + [[ $HSMTOOL_NOERROR == true ]] || + error "start copytool $facet on $agent failed" + echo "start copytool $facet on $agent failed" + fi + trap cleanup EXIT } +get_copytool_event_log() { + local facet=${1:-$SINGLEAGT} + local agent=$(facet_active_host $facet) + + [ -z "$HSMTOOL_MONITOR_DIR" ] && + error "Can't get event log: No monitor directory!" + + do_node $agent "cat $HSMTOOL_MONITOR_DIR/events" || + error "Could not collect event log from $agent" +} + copytool_cleanup() { trap - EXIT - local agents=${1:-$(facet_active_host $SINGLEAGT)} + local facet=$SINGLEAGT + local agents=${1:-$(facet_active_host $facet)} local mdtno local idx local oldstate local mdt_hsmctrl + local hsm_root=$(copytool_device $facet) + local end_wait=$(( SECONDS + TIMEOUT )) do_nodesv $agents "pkill -INT -x $HSMTOOL_BASE" || return 0 - sleep 1 - echo "Copytool is stopped on $agents" + + while (( SECONDS < end_wait )); do + sleep 2 + do_nodesv $agents "pgrep -x $HSMTOOL_BASE" + if [ $? -ne 0 ]; then + echo "Copytool is stopped on $agents" + break + fi + echo "Copytool still running on $agents" + done + if do_nodesv $agents "pgrep -x $HSMTOOL_BASE"; then + error "Copytool failed to stop in ${TIMEOUT}s ..." + else + echo "Copytool has stopped in " \ + "$((TIMEOUT - (end_wait - SECONDS)))s." + fi # clean all CDTs orphans requests from previous tests # that would otherwise need to timeout to clear. @@ -208,6 +306,9 @@ copytool_cleanup() { "$oldstate" 20 || error "mds${mdtno} cdt state is not $oldstate" done + if do_facet $facet "df $hsm_root" >/dev/null 2>&1 ; then + do_facet $facet "rm -rf $hsm_root/*" + fi } copytool_suspend() { @@ -235,7 +336,7 @@ make_archive() { local file=$HSM_ARCHIVE/$1 do_facet $SINGLEAGT mkdir -p $(dirname $file) do_facet $SINGLEAGT dd if=/dev/urandom of=$file count=32 bs=1000000 || - error "cannot create $file" + file_creation_failure dd $file $? 
} copy2archive() { @@ -410,17 +511,19 @@ needclients() { path2fid() { $LFS path2fid $1 | tr -d '[]' + return ${PIPESTATUS[0]} } get_hsm_flags() { local f=$1 local u=$2 + local st if [[ $u == "user" ]]; then - local st=$($RUNAS $LFS hsm_state $f) + st=$($RUNAS $LFS hsm_state $f) else - local st=$($LFS hsm_state $f) u=root + st=$($LFS hsm_state $f) fi [[ $? == 0 ]] || error "$LFS hsm_state $f failed (run as $u)" @@ -431,7 +534,8 @@ get_hsm_flags() { get_hsm_archive_id() { local f=$1 - local st=$($LFS hsm_state $f) + local st + st=$($LFS hsm_state $f) [[ $? == 0 ]] || error "$LFS hsm_state $f failed" local ar=$(echo $st | grep "archive_id" | cut -f5 -d" " | @@ -455,6 +559,15 @@ check_hsm_flags_user() { [[ $st == $fl ]] || error "hsm flags on $f are $st != $fl" } +file_creation_failure() { + local cmd=$1 + local f=$2 + local err=$3 + + df $MOUNT $MOUNT2 >&2 + error "cannot create $f with $cmd, status=$err" +} + copy_file() { local f= @@ -468,31 +581,52 @@ copy_file() { f=${f/$DIR/$DIR2} fi rm -f $f - cp $1 $f || error "cannot copy $1 to $f" + cp $1 $f || file_creation_failure cp $f $? + path2fid $f || error "cannot get fid on $f" } make_small() { local file2=${1/$DIR/$DIR2} dd if=/dev/urandom of=$file2 count=2 bs=1M conv=fsync || - error "cannot create $file2" + file_creation_failure dd $file2 $? + path2fid $1 || error "cannot get fid on $1" } +make_small_sync() { + dd if=/dev/urandom of=$1 count=1 bs=1M conv=sync || + file_creation_failure dd $1 $? 
+ path2fid $1 || error "cannot get fid on $1" +} + cleanup_large_files() { local ratio=$(df -P $MOUNT | tail -1 | awk '{print $5}' | sed 's/%//g') [ $ratio -gt 50 ] && find $MOUNT -size +10M -exec rm -f {} \; } +check_enough_free_space() { + local nb=$1 + local unit=$2 + local need=$((nb * unit /1024)) + local free=$(df -kP $MOUNT | tail -1 | awk '{print $4}') + (( $need >= $free )) && return 1 + return 0 +} + make_large_for_striping() { local file2=${1/$DIR/$DIR2} - local sz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -1) + local sz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -n1) cleanup_large_files + check_enough_free_space 5 $sz + [ $? != 0 ] && return $? + dd if=/dev/urandom of=$file2 count=5 bs=$sz conv=fsync || - error "cannot create $file2" + file_creation_failure dd $file2 $? + path2fid $1 || error "cannot get fid on $1" } @@ -501,12 +635,16 @@ make_large_for_progress() { cleanup_large_files + check_enough_free_space 39 1000000 + [ $? != 0 ] && return $? + # big file is large enough, so copy time is > 30s # so copytool make 1 progress # size is not a multiple of 1M to avoid stripe # aligment dd if=/dev/urandom of=$file2 count=39 bs=1000000 conv=fsync || - error "cannot create $file2" + file_creation_failure dd $file2 $? + path2fid $1 || error "cannot get fid on $1" } @@ -515,12 +653,15 @@ make_large_for_progress_aligned() { cleanup_large_files + check_enough_free_space 33 1048576 + [ $? != 0 ] && return $? + # big file is large enough, so copy time is > 30s # so copytool make 1 progress # size is a multiple of 1M to have stripe # aligment dd if=/dev/urandom of=$file2 count=33 bs=1M conv=fsync || - error "cannot create $file2" + file_creation_failure dd $file2 $? path2fid $1 || error "cannot get fid on $1" } @@ -529,9 +670,12 @@ make_large_for_cancel() { cleanup_large_files + check_enough_free_space 103 1048576 + [ $? != 0 ] && return $? + # Copy timeout is 100s. 
105MB => 105s dd if=/dev/urandom of=$file2 count=103 bs=1M conv=fsync || - error "cannot create $file2" + file_creation_failure dd $file2 $? path2fid $1 || error "cannot get fid on $1" } @@ -552,7 +696,7 @@ wait_request_state() { local cmd="$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.actions" cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d=" - wait_result $mds "$cmd" $state 100 || + wait_result $mds "$cmd" $state 200 || error "request on $fid is not $state on $mds" } @@ -574,8 +718,10 @@ get_request_count() { wait_all_done() { local timeout=$1 + local fid=$2 local cmd="$LCTL get_param -n $HSM_PARAM.actions" + [[ -n $fid ]] && cmd+=" | grep '$fid'" cmd+=" | egrep 'WAITING|STARTED'" wait_result $SINGLEMDS "$cmd" "" $timeout || @@ -587,6 +733,18 @@ wait_for_grace_delay() { sleep $val } +parse_json_event() { + local raw_event=$1 + + # python2.6 in EL6 includes an internal json module + local json_parser='import json; import fileinput;' + json_parser+=' print "\n".join(["local %s=\"%s\"" % tuple for tuple in ' + json_parser+='json.loads([line for line in ' + json_parser+='fileinput.input()][0]).items()])' + + echo $raw_event | python -c "$json_parser" +} + # populate MDT device array get_mdt_devices @@ -702,7 +860,7 @@ test_3() { error "user could not change hsm flags" dd if=/etc/passwd of=$f.append bs=1 count=3\ conv=notrunc oflag=append status=noxfer || - error "could not append to test file" + file_creation_failure dd $f.append $? check_hsm_flags $f.append "0x00000003" # Modify a file sets it dirty @@ -711,7 +869,7 @@ test_3() { error "user could not change hsm flags" dd if=/dev/zero of=$f.modify bs=1 count=3\ conv=notrunc status=noxfer || - error "could not modify test file" + file_creation_failure dd $f.modify $? 
check_hsm_flags $f.modify "0x00000003" # Open O_TRUNC sets dirty @@ -772,7 +930,7 @@ test_9() { copytool_cleanup } -run_test 9 "Use of explict archive number, with dedicated copytool" +run_test 9 "Use of explicit archive number, with dedicated copytool" test_9a() { needclients 3 || return 0 @@ -885,7 +1043,7 @@ test_10d() { } run_test 10d "Archive a file on the default archive id" -test_11() { +test_11a() { mkdir -p $DIR/$tdir copy2archive /etc/hosts $tdir/$tfile local f=$DIR/$tdir/$tfile @@ -909,7 +1067,31 @@ test_11() { local AFILE=$(do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid) || error "fid $fid not in archive $HSM_ARCHIVE" } -run_test 11 "Import a file" +run_test 11a "Import a file" + +test_11b() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + $LFS hsm_archive -a $HSM_ARCHIVE_NUMBER $f || + error "hsm_archive failed" + wait_request_state $fid ARCHIVE SUCCEED + + local FILE_HASH=$(md5sum $f) + rm -f $f + + import_file $fid $f + + echo "$FILE_HASH" | md5sum -c + + [[ $? 
-eq 0 ]] || error "Restored file differs" + + copytool_cleanup +} +run_test 11b "Import a deleted file using its FID" test_12a() { # test needs a running copytool @@ -920,16 +1102,16 @@ test_12a() { local f=$DIR/$tdir/$tfile import_file $tdir/$tfile $f - local f=$DIR2/$tdir/$tfile + local f2=$DIR2/$tdir/$tfile echo "Verifying released state: " - check_hsm_flags $f "0x0000000d" + check_hsm_flags $f2 "0x0000000d" - local fid=$(path2fid $f) - $LFS hsm_restore $f + local fid=$(path2fid $f2) + $LFS hsm_restore $f2 wait_request_state $fid RESTORE SUCCEED echo "Verifying file state: " - check_hsm_flags $f "0x00000009" + check_hsm_flags $f2 "0x00000009" do_facet $SINGLEAGT diff -q $HSM_ARCHIVE/$tdir/$tfile $f @@ -973,7 +1155,10 @@ test_12c() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile $LFS setstripe -c 2 $f - local fid=$(make_large_for_striping $f) + local fid + fid=$(make_large_for_striping $f) + [ $? != 0 ] && skip "not enough free space" && return + local FILE_CRC=$(md5sum $f) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -1149,6 +1334,77 @@ test_12n() { } run_test 12n "Import/implicit restore/release" +test_12o() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "release of $f failed" + +#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 + do_facet $SINGLEMDS lctl set_param fail_loc=0x152 + + # set no retry action mode + cdt_set_no_retry + + diff -q /etc/hosts $f + local st=$? 
+ + # we check we had a restore failure + wait_request_state $fid RESTORE FAILED + + [[ $st -eq 0 ]] && error "Restore must fail" + + # remove no retry action mode + cdt_clear_no_retry + + # check file is still released + check_hsm_flags $f "0x0000000d" + + # retry w/o failure injection + do_facet $SINGLEMDS lctl set_param fail_loc=0 + + # to be sure previous RESTORE result is gone + cdt_purge + wait_for_grace_delay + + diff -q /etc/hosts $f + st=$? + + # we check we had a restore done + wait_request_state $fid RESTORE SUCCEED + + [[ $st -eq 0 ]] || error "Restored file differs" + + copytool_cleanup +} +run_test 12o "Layout-swap failure during Restore leaves file released" + +test_12p() { + # test needs a running copytool + copytool_setup + + mkdir $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + do_facet $SINGLEAGT cat $f > /dev/null || error "cannot cat $f" + $LFS hsm_release $f || error "cannot release $f" + do_facet $SINGLEAGT cat $f > /dev/null || error "cannot cat $f" + $LFS hsm_release $f || error "cannot release $f" + do_facet $SINGLEAGT cat $f > /dev/null || error "cannot cat $f" + + copytool_cleanup +} +run_test 12p "implicit restore of a file on copytool mount point" + test_13() { # test needs a running copytool copytool_setup @@ -1578,7 +1834,7 @@ test_24b() { copytool_setup mkdir -p $DIR/$tdir - # Check that root can do HSM actions on a ordinary user's file. + # Check that root can do HSM actions on a regular user's file. rm -f $file fid=$(make_small $file) sum0=$(md5sum $file) @@ -1797,7 +2053,10 @@ test_26() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? 
!= 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED @@ -1834,7 +2093,10 @@ test_27b() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -1853,7 +2115,10 @@ test_28() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED @@ -1942,6 +2207,7 @@ test_30c() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/SLEEP + local slp_sum1=$(md5sum /bin/sleep) local fid=$(copy_file /bin/sleep $f) chmod 755 $f $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -1958,7 +2224,12 @@ test_30c() { wait $pid [[ $? == 0 ]] || error "Execution failed during run" cmp /bin/sleep $f - [[ $? == 0 ]] || error "Binary overwritten during exec" + if [[ $? != 0 ]]; then + local slp_sum2=$(md5sum /bin/sleep) + # in case sleep file is modified during the test + [[ $slp_sum1 == $slp_sum2 ]] && + error "Binary overwritten during exec" + fi # cleanup # remove no try action mode @@ -2031,7 +2302,10 @@ test_31b() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2052,7 +2326,10 @@ test_31c() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress_aligned $f) + local fid + fid=$(make_large_for_progress_aligned $f) + [ $? 
!= 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2073,11 +2350,23 @@ test_33() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f + # to be sure wait_all_done will not be mislead by previous tests + # and ops. + cdt_purge + wait_for_grace_delay + # Also raise grace_delay significantly so the Canceled + # Restore action will stay enough long avail. + local old_grace=$(get_hsm_param grace_delay) + set_hsm_param grace_delay 100 + md5sum $f >/dev/null & local pid=$! wait_request_state $fid RESTORE STARTED @@ -2090,8 +2379,29 @@ test_33() { $LFS hsm_cancel $f - wait_request_state $fid RESTORE CANCELED - wait_request_state $fid CANCEL SUCCEED + # instead of waiting+checking both Restore and Cancel ops + # sequentially, wait for both to be finished and then check + # each results. + wait_all_done 100 $fid + local rstate=$(get_request_state $fid RESTORE) + local cstate=$(get_request_state $fid CANCEL) + + # restore orig grace_delay. 
+ set_hsm_param grace_delay $old_grace + + if [[ "$rstate" == "CANCELED" ]] ; then + [[ "$cstate" == "SUCCEED" ]] || + error "Restore state is CANCELED and Cancel state " \ + "is not SUCCEED but $cstate" + echo "Restore state is CANCELED, Cancel state is SUCCEED" + elif [[ "$rstate" == "SUCCEED" ]] ; then + [[ "$cstate" == "FAILED" ]] || + error "Restore state is SUCCEED and Cancel state " \ + "is not FAILED but $cstate" + echo "Restore state is SUCCEED, Cancel state is FAILED" + else + error "Restore state is $rstate and Cancel state is $cstate" + fi [ -z $killed ] || error "Cannot kill process waiting for restore ($killed)" @@ -2107,7 +2417,10 @@ test_34() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2140,7 +2453,10 @@ test_35() { local f=$DIR/$tdir/$tfile local f1=$DIR/$tdir/$tfile-1 - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + local fid1=$(copy_file /etc/passwd $f1) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED @@ -2176,7 +2492,10 @@ test_36() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? 
!= 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2228,7 +2547,13 @@ test_40() { fid=$(copy_file /etc/hosts $f.$p.$i) done done - copytool_setup + # force copytool to use a local/temp archive dir to ensure best + # performance vs remote/NFS mounts used in auto-tests + if do_facet $SINGLEAGT "df --local $HSM_ARCHIVE" >/dev/null 2>&1 ; then + copytool_setup + else + copytool_setup $SINGLEAGT $MOUNT $HSM_ARCHIVE_NUMBER $TMP/$tdir + fi # to be sure wait_all_done will not be mislead by previous tests cdt_purge wait_for_grace_delay @@ -2363,7 +2688,9 @@ test_56() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" @@ -2472,10 +2799,314 @@ test_58() { } run_test 58 "Truncate a released file will trigger restore" -test_90() { - file_count=57 +test_60() { + # This test validates the fix for LU-4512. Ensure that the -u + # option changes the progress reporting interval from the + # default (30 seconds) to the user-specified interval. + local interval=5 + local progress_timeout=$((interval * 4)) + + # test needs a new running copytool + copytool_cleanup + HSMTOOL_UPDATE_INTERVAL=$interval copytool_setup + mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile + local fid + fid=$(make_large_for_progress $f) + [ $? 
!= 0 ] && skip "not enough free space" && return + + local mdtidx=0 + local mdt=${MDT_PREFIX}${mdtidx} + local mds=mds$((mdtidx + 1)) + + # Wait for copytool to register + wait_update_facet $mds \ + "$LCTL get_param -n ${mdt}.hsm.agents | grep -o ^uuid" \ + uuid 100 || error "coyptool failed to register with $mdt" + + local start_at=$(date +%s) + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || + error "could not archive file" + + local agent=$(facet_active_host $SINGLEAGT) + local prefix=$TESTLOG_PREFIX + [[ -z "$TESTNAME" ]] || prefix=$prefix.$TESTNAME + local copytool_log=$prefix.copytool_log.$agent.log + + + wait_update $agent \ + "grep -o start.copy $copytool_log" "start copy" 100 || + error "copytool failed to start" + + local cmd="$LCTL get_param -n ${mdt}.hsm.active_requests" + cmd+=" | awk '/'$fid'.*action=ARCHIVE/ {print \\\$12}' | cut -f2 -d=" + + local RESULT + local WAIT=0 + local sleep=1 + + echo -n "Expecting a progress update within $progress_timeout seconds... " + while [ true ]; do + RESULT=$(do_node $(facet_active_host $mds) "$cmd") + if [ $RESULT -gt 0 ]; then + echo "$RESULT bytes copied in $WAIT seconds." + break + elif [ $WAIT -ge $progress_timeout ]; then + error "Timed out waiting for progress update!" + break + fi + WAIT=$((WAIT + sleep)) + sleep $sleep + done + + local finish_at=$(date +%s) + local elapsed=$((finish_at - start_at)) + + # Ensure that the progress update occurred within the expected window. + if [ $elapsed -lt $interval ]; then + error "Expected progress update after at least $interval seconds" + fi + + cdt_clear_no_retry + copytool_cleanup +} +run_test 60 "Changing progress update interval from default" + +test_70() { + # test needs a new running copytool + copytool_cleanup + copytool_monitor_setup + HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup + + # Just start and stop the copytool to generate events. + cdt_clear_no_retry + + # Wait for the copytool to register. 
+ wait_update --verbose $(facet_active_host mds1) \ + "$LCTL get_param -n ${MDT_PREFIX}0.hsm.agents | grep -o ^uuid" \ + uuid 100 || + error "copytool failed to register with MDT0000" + + copytool_cleanup + + local REGISTER_EVENT + local UNREGISTER_EVENT + while read event; do + local parsed=$(parse_json_event "$event") + if [ -z "$parsed" ]; then + error "Copytool sent malformed event: $event" + fi + eval $parsed + + if [ $event_type == "REGISTER" ]; then + REGISTER_EVENT=$event + elif [ $event_type == "UNREGISTER" ]; then + UNREGISTER_EVENT=$event + fi + done < <(echo $"$(get_copytool_event_log)") + + if [ -z "$REGISTER_EVENT" ]; then + error "Copytool failed to send register event to FIFO" + fi + + if [ -z "$UNREGISTER_EVENT" ]; then + error "Copytool failed to send unregister event to FIFO" + fi + + copytool_monitor_cleanup + echo "Register/Unregister events look OK." +} +run_test 70 "Copytool logs JSON register/unregister events to FIFO" + +test_71() { + # Bump progress interval for livelier events. + local interval=5 + + # test needs a new running copytool + copytool_cleanup + copytool_monitor_setup + HSMTOOL_UPDATE_INTERVAL=$interval \ + HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || + error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + local expected_fields="event_time data_fid source_fid" + expected_fields+=" total_bytes current_bytes" + + local START_EVENT + local FINISH_EVENT + while read event; do + # Make sure we're not getting anything from previous events. 
+ for field in $expected_fields; do + unset $field + done + + local parsed=$(parse_json_event "$event") + if [ -z "$parsed" ]; then + error "Copytool sent malformed event: $event" + fi + eval $parsed + + if [ $event_type == "ARCHIVE_START" ]; then + START_EVENT=$event + continue + elif [ $event_type == "ARCHIVE_FINISH" ]; then + FINISH_EVENT=$event + continue + elif [ $event_type != "ARCHIVE_RUNNING" ]; then + continue + fi + + # Do some simple checking of the progress update events. + for expected_field in $expected_fields; do + if [ -z ${!expected_field+x} ]; then + error "Missing $expected_field field in event" + fi + done + + if [ $total_bytes -eq 0 ]; then + error "Expected total_bytes to be > 0" + fi + + # These should be identical throughout an archive + # operation. + if [ $source_fid != $data_fid ]; then + error "Expected source_fid to equal data_fid" + fi + done < <(echo $"$(get_copytool_event_log)") + + if [ -z "$START_EVENT" ]; then + error "Copytool failed to send archive start event to FIFO" + fi + + if [ -z "$FINISH_EVENT" ]; then + error "Copytool failed to send archive finish event to FIFO" + fi + + echo "Archive events look OK." + + cdt_clear_no_retry + copytool_cleanup + copytool_monitor_cleanup +} +run_test 71 "Copytool logs JSON archive events to FIFO" + +test_72() { + # Bump progress interval for livelier events. 
+ local interval=5 + + # test needs a new running copytool + copytool_cleanup + copytool_monitor_setup + HSMTOOL_UPDATE_INTERVAL=$interval \ + HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup + local test_file=$HSMTOOL_MONITOR_DIR/file + + local cmd="dd if=/dev/urandom of=$test_file count=16 bs=1000000 " + cmd+="conv=fsync" + do_facet $SINGLEAGT "$cmd" || + error "cannot create $test_file on $SINGLEAGT" + copy2archive $test_file $tdir/$tfile + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + import_file $tdir/$tfile $f + f=$DIR2/$tdir/$tfile + echo "Verifying released state: " + check_hsm_flags $f "0x0000000d" + + local fid=$(path2fid $f) + $LFS hsm_restore $f + wait_request_state $fid RESTORE SUCCEED + + local expected_fields="event_time data_fid source_fid" + expected_fields+=" total_bytes current_bytes" + + local START_EVENT + local FINISH_EVENT + while read event; do + # Make sure we're not getting anything from previous events. + for field in $expected_fields; do + unset $field + done + + local parsed=$(parse_json_event "$event") + if [ -z "$parsed" ]; then + error "Copytool sent malformed event: $event" + fi + eval $parsed + + if [ $event_type == "RESTORE_START" ]; then + START_EVENT=$event + if [ $source_fid != $data_fid ]; then + error "source_fid should == data_fid at start" + fi + continue + elif [ $event_type == "RESTORE_FINISH" ]; then + FINISH_EVENT=$event + if [ $source_fid != $data_fid ]; then + error "source_fid should == data_fid at finish" + fi + continue + elif [ $event_type != "RESTORE_RUNNING" ]; then + continue + fi + + # Do some simple checking of the progress update events. + for expected_field in $expected_fields; do + if [ -z ${!expected_field+x} ]; then + error "Missing $expected_field field in event" + fi + done + + if [ $total_bytes -eq 0 ]; then + error "Expected total_bytes to be > 0" + fi + + # When a restore starts out, the data fid is the same as the + # source fid. 
After the restore has gotten going, we learn + # the new data fid. Once the restore has finished, the source + # fid is set to the new data fid. + # + # We test this because some monitoring software may depend on + # this behavior. If it changes, then the consumers of these + # events may need to be modified. + if [ $source_fid == $data_fid ]; then + error "source_fid should != data_fid during restore" + fi + done < <(echo $"$(get_copytool_event_log)") + + if [ -z "$START_EVENT" ]; then + error "Copytool failed to send restore start event to FIFO" + fi + + if [ -z "$FINISH_EVENT" ]; then + error "Copytool failed to send restore finish event to FIFO" + fi + + echo "Restore events look OK." + + cdt_clear_no_retry + copytool_cleanup + copytool_monitor_cleanup + + rm -rf $test_dir +} +run_test 72 "Copytool logs JSON restore events to FIFO" + +test_90() { + file_count=51 # Max number of files constrained by LNET message size + mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed" + local f=$DIR/$tdir/$tfile local FILELIST=/tmp/filelist.txt local i="" @@ -2484,7 +3115,14 @@ test_90() { fid=$(copy_file /etc/hosts $f.$i) echo $f.$i >> $FILELIST done - copytool_setup + # force copytool to use a local/temp archive dir to ensure best + # performance vs remote/NFS mounts used in auto-tests + if do_facet $SINGLEAGT "df --local $HSM_ARCHIVE" >/dev/null 2>&1 ; then + copytool_setup + else + local dai=$(get_hsm_param default_archive_id) + copytool_setup $SINGLEAGT $MOUNT $dai $TMP/$tdir + fi # to be sure wait_all_done will not be mislead by previous tests cdt_purge wait_for_grace_delay @@ -2571,7 +3209,10 @@ test_104() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? 
!= 0 ] && skip "not enough free space" && return + # if cdt is on, it can serve too quickly the request cdt_disable $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER --data $DATA $f @@ -2882,7 +3523,10 @@ test_200() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_cancel $f) + local fid + fid=$(make_large_for_cancel $f) + [ $? != 0 ] && skip "not enough free space" && return + # test with cdt on is made in test_221 cdt_disable $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2923,7 +3567,10 @@ test_202() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED @@ -2968,7 +3615,9 @@ test_221() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_cancel $f) + local fid + fid=$(make_large_for_cancel $f) + [ $? != 0 ] && skip "not enough free space" && return changelog_setup @@ -3075,7 +3724,9 @@ test_223b() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return changelog_setup $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -3135,7 +3786,9 @@ test_225() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? 
!= 0 ] && skip "not enough free space" && return changelog_setup $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -3262,10 +3915,9 @@ test_228() { # test needs a running copytool copytool_setup - dd if=/dev/urandom of=$DIR/$tfile bs=1M count=1 conv=sync || - error "creating $DIR/$tfile" + local fid=$(make_small_sync $DIR/$tfile) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $DIR/$tfile - wait_request_state $(path2fid $DIR/$tfile) ARCHIVE SUCCEED + wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $DIR/$tfile check_hsm_flags $DIR/$tfile "0x0000000d" @@ -3282,13 +3934,15 @@ test_228() { $LFS hsm_release $DIR/$tfile check_hsm_flags $DIR/$tfile "0x0000000d" - mkdir $DIR/$tdir + mkdir -p $DIR/$tdir || error "mkdir $tdir failed" tar cf - --sparse $DIR/$tfile | tar xvf - -C $DIR/$tdir || error "tar failed" cmp $DIR/$tfile $DIR/$tdir/$DIR/$tfile || error "comparing untarred $DIR/$tfile" + rm -f $DIR/$tfile $DIR/$tfile.2 || + error "rm $DIR/$tfile or $DIR/$tfile.2 failed" copytool_cleanup } run_test 228 "On released file, return extend to FIEMAP. For [cp,tar] --sparse" @@ -3337,7 +3991,9 @@ test_251() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_cancel $f) + local fid + fid=$(make_large_for_cancel $f) + [ $? != 0 ] && skip "not enough free space" && return cdt_disable # to have a short test @@ -3349,6 +4005,10 @@ test_251() { set_hsm_param loop_period 2 cdt_enable + # clear locks to avoid extra delay caused by flush/cancel + # and thus prevent early copytool death to timeout. + cancel_lru_locks osc + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE STARTED sleep 5 @@ -3514,9 +4174,9 @@ test_402() { copytool_cleanup # deactivate all mdc on agent1 - mdc_change_state $SINGLEAGT "MDT000." "deactivate" + mdc_change_state $SINGLEAGT "$FSNAME-MDT000." 
"deactivate" - copytool_setup $SINGLEAGT + HSMTOOL_NOERROR=true copytool_setup $SINGLEAGT check_agent_unregistered "uuid" # match any agent @@ -3524,7 +4184,7 @@ test_402() { search_copytools $agent && error "Copytool start should have failed" # reactivate MDCs - mdc_change_state $SINGLEAGT "MDT000." "activate" + mdc_change_state $SINGLEAGT "$FSNAME-MDT000." "activate" } run_test 402 "Copytool start fails if all MDTs are inactive" @@ -3538,7 +4198,7 @@ test_403() { local uuid=$(do_rpc_nodes $agent get_client_uuid | cut -d' ' -f2) # deactivate all mdc for MDT0001 - mdc_change_state $SINGLEAGT "MDT0001" "deactivate" + mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "deactivate" copytool_setup # check the agent is registered on MDT0000, and not on MDT0001 @@ -3549,7 +4209,7 @@ test_403() { search_copytools $agent || error "No running copytools on $agent" # reactivate all mdc for MDT0001 - mdc_change_state $SINGLEAGT "MDT0001" "activate" + mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "activate" # make sure the copytool is now registered to all MDTs check_agent_registered $uuid @@ -3573,7 +4233,7 @@ test_404() { local fid1=$(make_small $dir_mdt0/$tfile) # deactivate all mdc for MDT0001 - mdc_change_state $SINGLEAGT "MDT0001" "deactivate" + mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "deactivate" # send an HSM request for files in MDT0000 $LFS hsm_archive $dir_mdt0/$tfile || error "lfs hsm_archive" @@ -3583,7 +4243,7 @@ test_404() { echo "archive successful on mdt0" # reactivate all mdc for MDT0001 - mdc_change_state $SINGLEAGT "MDT0001" "activate" + mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "activate" copytool_cleanup # clean test files and directories @@ -3591,6 +4251,69 @@ test_404() { } run_test 404 "Inactive MDT does not block requests for active MDTs" +test_405() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + copytool_setup + + mkdir -p $DIR/$tdir + + local striped_dir=$DIR/$tdir/striped_dir + + # create striped dir on all of MDTs + $LFS mkdir 
-i 0 -c $MDSCOUNT $striped_dir || error "lfs mkdir" + + local fid1=$(make_small_sync $striped_dir/${tfile}_0) + local fid2=$(make_small_sync $striped_dir/${tfile}_1) + local fid3=$(make_small_sync $striped_dir/${tfile}_2) + local fid4=$(make_small_sync $striped_dir/${tfile}_3) + + local idx1=$($LFS getstripe -M $striped_dir/${tfile}_0) + local idx2=$($LFS getstripe -M $striped_dir/${tfile}_1) + local idx3=$($LFS getstripe -M $striped_dir/${tfile}_2) + local idx4=$($LFS getstripe -M $striped_dir/${tfile}_3) + + # check that compound requests are shunted to the right MDTs + $LFS hsm_archive $striped_dir/${tfile}_0 $striped_dir/${tfile}_1 \ + $striped_dir/${tfile}_2 $striped_dir/${tfile}_3 || + error "lfs hsm_archive" + + wait_request_state $fid1 ARCHIVE SUCCEED $idx1 && + echo "archive successful on $fid1" + wait_request_state $fid2 ARCHIVE SUCCEED $idx2 && + echo "archive successful on $fid2" + wait_request_state $fid3 ARCHIVE SUCCEED $idx3 && + echo "archive successful on $fid3" + wait_request_state $fid4 ARCHIVE SUCCEED $idx4 && + echo "archive successful on $fid4" + + $LFS hsm_release $striped_dir/${tfile}_0 || error "lfs hsm_release 1" + $LFS hsm_release $striped_dir/${tfile}_1 || error "lfs hsm_release 2" + $LFS hsm_release $striped_dir/${tfile}_2 || error "lfs hsm_release 3" + $LFS hsm_release $striped_dir/${tfile}_3 || error "lfs hsm_release 4" + + cat $striped_dir/${tfile}_0 > /dev/null || error "cat ${tfile}_0 failed" + cat $striped_dir/${tfile}_1 > /dev/null || error "cat ${tfile}_1 failed" + cat $striped_dir/${tfile}_2 > /dev/null || error "cat ${tfile}_2 failed" + cat $striped_dir/${tfile}_3 > /dev/null || error "cat ${tfile}_3 failed" + + copytool_cleanup +} +run_test 405 "archive and release under striped directory" + +test_500() +{ + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.92) ] && + skip "HSM migrate is not supported" && return + + # Stop the existing copytool + copytool_cleanup + + test_mkdir -p $DIR/$tdir + llapi_hsm_test -d
$DIR/$tdir || error "One llapi HSM test failed" +} +run_test 500 "various LLAPI HSM tests" + copytool_cleanup complete $SECONDS