X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Fsanity-hsm.sh;h=c9704e7015bb28204193bb69cb0314ad06eead0b;hb=32dcb2a411163419fd7ab88d1442ae17d3e501cc;hp=1b34cc52a14f07915f800deba8db82b0f5ec327a;hpb=de0b4231c2b57d1517ea19de5cb224aa275cbdd8;p=fs%2Flustre-release.git diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 1b34cc5..c9704e7 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -11,12 +11,8 @@ SRCDIR=$(dirname $0) export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/utils:$PATH:/sbin:/usr/sbin ONLY=${ONLY:-"$*"} -# bug number for skipped test: 3815 +# bug number for skipped test: LU-3815 ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 34 35 36" -# bug number for skipped test:4178 4176 -ALWAYS_EXCEPT="$ALWAYS_EXCEPT 200 221 223b 31a" -# bug number for skipped test:LU-3852 -ALWAYS_EXCEPT="$ALWAYS_EXCEPT 251" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} @@ -28,6 +24,7 @@ init_logging MULTIOP=${MULTIOP:-multiop} OPENFILE=${OPENFILE:-openfile} +MMAP_CAT=${MMAP_CAT:-mmap_cat} MOUNT_2=${MOUNT_2:-"yes"} FAIL_ON_ERROR=false @@ -49,6 +46,14 @@ check_runas_id $RUNAS_ID $RUNAS_GID $RUNAS build_test_filter +# if there is no CLIENT1 defined, some tests can be ran on localhost +CLIENT1=${CLIENT1:-$HOSTNAME} +# if CLIENT2 doesn't exist then use CLIENT1 instead +# All tests should use CLIENT2 with MOUNT2 only therefore it will work if +# $CLIENT2 == CLIENT1 +# Exception is the test which need two separate nodes +CLIENT2=${CLIENT2:-$CLIENT1} + # # In order to test multiple remote HSM agents, a new facet type named "AGT" and # the following associated variables are added: @@ -77,8 +82,11 @@ init_agt_vars() { exit 0 fi + # We used to put the HSM archive in $SHARED_DIRECTORY but that + # meant NFS issues could hose sanity-hsm sessions. So now we + # use $TMP instead. for n in $(seq $AGTCOUNT); do - eval export AGTDEV$n=\$\{AGTDEV$n:-"$SHARED_DIRECTORY/arc$n"\} + eval export AGTDEV$n=\$\{AGTDEV$n:-"$TMP/arc$n"\} agent=CLIENT$((n + 1)) if [[ -z "${!agent}" ]]; then [[ $CLIENTCOUNT -eq 1 ]] && agent=CLIENT1 || @@ -137,15 +145,40 @@ get_mdt_devices() { } search_copytools() { - local agents=${1:-$(facet_active_host $SINGLEAGT)} - do_nodesv $agents "pgrep -x $HSMTOOL_BASE" + local hosts=${1:-$(facet_active_host $SINGLEAGT)} + do_nodesv $hosts "pgrep -x $HSMTOOL_BASE" } -search_and_kill_copytool() { - local agents=${1:-$(facet_active_host $SINGLEAGT)} +kill_copytools() { + local hosts=${1:-$(facet_active_host $SINGLEAGT)} + + echo "Killing existing copytools on $hosts" + do_nodesv $hosts "killall -q $HSMTOOL_BASE" || true +} + +wait_copytools() { + local hosts=${1:-$(facet_active_host $SINGLEAGT)} + local wait_timeout=200 + local wait_start=$SECONDS + local wait_end=$((wait_start + wait_timeout)) + + while ((SECONDS < wait_end)); do + sleep 2 + if ! search_copytools $hosts; then + echo "copytools stopped in $((SECONDS - wait_start))s" + return 0 + fi + + echo "copytools still running on $hosts" + done + + # try to dump Copytool's stack + do_nodesv $hosts "echo 1 >/proc/sys/kernel/sysrq ; " \ + "echo t >/proc/sysrq-trigger" + + echo "copytools failed to stop in ${wait_timeout}s" - echo "Killing existing copytools on $agents" - do_nodesv $agents "killall -q $HSMTOOL_BASE" || true + return 1 } copytool_monitor_setup() { @@ -174,7 +207,7 @@ copytool_monitor_setup() { # Slightly racy, but just making a best-effort to catch obvious # problems. sleep 1 - ps -p $HSMTOOL_MONITOR_PDSH >&- || + ps -p $HSMTOOL_MONITOR_PDSH > /dev/null || error "Failed to start copytool monitor on $agent" else do_node $agent "$cmd" @@ -207,14 +240,15 @@ copytool_monitor_cleanup() { copytool_setup() { local facet=${1:-$SINGLEAGT} - local lustre_mntpnt=${2:-$MOUNT} + # Use MOUNT2 by default if defined + local lustre_mntpnt=${2:-${MOUNT2:-$MOUNT}} local arc_id=$3 local hsm_root=${4:-$(copytool_device $facet)} local agent=$(facet_active_host $facet) if [[ -z "$arc_id" ]] && do_facet $facet "pkill -CONT -x $HSMTOOL_BASE"; then - echo "Wakeup copytool $facet on $agent" + echo "Only wakeup running copytool $facet on $agent" return 0 fi @@ -264,38 +298,59 @@ get_copytool_event_log() { copytool_cleanup() { trap - EXIT - local facet=$SINGLEAGT - local agents=${1:-$(facet_active_host $facet)} - local mdtno - local idx - local oldstate - local mdt_hsmctrl - local hsm_root=$(copytool_device $facet) + local agt_facet=$SINGLEAGT + local agt_hosts=${1:-$(facet_active_host $agt_facet)} + local hsm_root=$(copytool_device $agt_facet) + local i + local facet + local param + local -a state + + kill_copytools $agt_hosts + wait_copytools $agt_hosts || error "copytools failed to stop" + + # Clean all CDTs orphans requests from previous tests that + # would otherwise need to timeout to clear. + for ((i = 0; i < MDSCOUNT; i++)); do + facet=mds$((i + 1)) + param=$(printf 'mdt.%s-MDT%04x.hsm_control' $FSNAME $i) + state[$i]=$(do_facet $facet "$LCTL get_param -n $param") + + # Skip already stopping or stopped CDTs. + [[ "${state[$i]}" =~ ^stop ]] && continue + + do_facet $facet "$LCTL set_param $param=shutdown" + done - do_nodesv $agents "pkill -INT -x $HSMTOOL_BASE" || return 0 - sleep 1 - echo "Copytool is stopped on $agents" + for ((i = 0; i < MDSCOUNT; i++)); do + # Only check and restore CDTs that we stopped in the first loop. + [[ "${state[$i]}" =~ ^stop ]] && continue - # clean all CDTs orphans requests from previous tests - # that would otherwise need to timeout to clear. - for mdtno in $(seq 1 $MDSCOUNT); do - idx=$(($mdtno - 1)) - mdt_hsmctrl="mdt.$FSNAME-MDT000${idx}.hsm_control" - oldstate=$(do_facet mds${mdtno} "$LCTL get_param -n " \ - "$mdt_hsmctrl") - # skip already stop[ed,ing] CDTs - echo $oldstate | grep stop && continue - - do_facet mds${mdtno} "$LCTL set_param $mdt_hsmctrl=shutdown" - wait_result mds${mdtno} "$LCTL get_param -n $mdt_hsmctrl" \ - "stopped" 20 || - error "mds${mdtno} cdt state is not stopped" - do_facet mds${mdtno} "$LCTL set_param $mdt_hsmctrl=$oldstate" - wait_result mds${mdtno} "$LCTL get_param -n $mdt_hsmctrl" \ - "$oldstate" 20 || - error "mds${mdtno} cdt state is not $oldstate" + facet=mds$((i + 1)) + param=$(printf 'mdt.%s-MDT%04x.hsm_control' $FSNAME $i) + + wait_result $facet "$LCTL get_param -n $param" stopped 20 || + error "$facet CDT state is not stopped" + + # Restore old CDT state. + do_facet $facet "$LCTL set_param $param=${state[$i]}" done - do_facet $facet "rm -rf $hsm_root" + + for ((i = 0; i < MDSCOUNT; i++)); do + # Only check CDTs that we stopped in the first loop. + [[ "${state[$i]}" =~ ^stop ]] && continue + + facet=mds$((i + 1)) + param=$(printf 'mdt.%s-MDT%04x.hsm_control' $FSNAME $i) + + # Check that the old CDT state was restored. + wait_result $facet "$LCTL get_param -n $param" "${state[$i]}" \ + 20 || error "$facet CDT state is not '${state[$i]}'" + done + + if do_facet $agt_facet "df $hsm_root" >/dev/null 2>&1 ; then + do_facet $agt_facet "rm -rf $hsm_root/*" + fi } copytool_suspend() { @@ -323,7 +378,7 @@ make_archive() { local file=$HSM_ARCHIVE/$1 do_facet $SINGLEAGT mkdir -p $(dirname $file) do_facet $SINGLEAGT dd if=/dev/urandom of=$file count=32 bs=1000000 || - error "cannot create $file" + file_creation_failure dd $file $? } copy2archive() { @@ -459,6 +514,13 @@ cdt_clear_mount_state() { cdt_set_mount_state() { mdts_set_param "-P" hsm_control "$1" + # set_param -P is asynchronous operation and could race with set_param. + # In such case configs could be retrieved and applied at mgc after + # set_param -P completion. Sleep here to avoid race with set_param. + # We need at least 20 seconds. 10 for mgc_requeue_thread to wake up + # MGC_TIMEOUT_MIN_SECONDS + MGC_TIMEOUT_RAND_CENTISEC(5 + 5) + # and 10 seconds to retrieve config from server. + sleep 20 } cdt_check_state() { @@ -498,17 +560,19 @@ needclients() { path2fid() { $LFS path2fid $1 | tr -d '[]' + return ${PIPESTATUS[0]} } get_hsm_flags() { local f=$1 local u=$2 + local st if [[ $u == "user" ]]; then - local st=$($RUNAS $LFS hsm_state $f) + st=$($RUNAS $LFS hsm_state $f) else - local st=$($LFS hsm_state $f) u=root + st=$($LFS hsm_state $f) fi [[ $? == 0 ]] || error "$LFS hsm_state $f failed (run as $u)" @@ -519,7 +583,8 @@ get_hsm_flags() { get_hsm_archive_id() { local f=$1 - local st=$($LFS hsm_state $f) + local st + st=$($LFS hsm_state $f) [[ $? == 0 ]] || error "$LFS hsm_state $f failed" local ar=$(echo $st | grep "archive_id" | cut -f5 -d" " | @@ -543,6 +608,15 @@ check_hsm_flags_user() { [[ $st == $fl ]] || error "hsm flags on $f are $st != $fl" } +file_creation_failure() { + local cmd=$1 + local f=$2 + local err=$3 + + df $MOUNT $MOUNT2 >&2 + error "cannot create $f with $cmd, status=$err" +} + copy_file() { local f= @@ -556,20 +630,22 @@ copy_file() { f=${f/$DIR/$DIR2} fi rm -f $f - cp $1 $f || error "cannot copy $1 to $f" + cp $1 $f || file_creation_failure cp $f $? + path2fid $f || error "cannot get fid on $f" } make_small() { local file2=${1/$DIR/$DIR2} dd if=/dev/urandom of=$file2 count=2 bs=1M conv=fsync || - error "cannot create $file2" + file_creation_failure dd $file2 $? + path2fid $1 || error "cannot get fid on $1" } make_small_sync() { dd if=/dev/urandom of=$1 count=1 bs=1M conv=sync || - error "cannot create $1" + file_creation_failure dd $1 $? path2fid $1 || error "cannot get fid on $1" } @@ -579,14 +655,27 @@ cleanup_large_files() { [ $ratio -gt 50 ] && find $MOUNT -size +10M -exec rm -f {} \; } +check_enough_free_space() { + local nb=$1 + local unit=$2 + local need=$((nb * unit /1024)) + local free=$(df -kP $MOUNT | tail -1 | awk '{print $4}') + (( $need >= $free )) && return 1 + return 0 +} + make_large_for_striping() { local file2=${1/$DIR/$DIR2} local sz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -n1) cleanup_large_files + check_enough_free_space 5 $sz + [ $? != 0 ] && return $? + dd if=/dev/urandom of=$file2 count=5 bs=$sz conv=fsync || - error "cannot create $file2" + file_creation_failure dd $file2 $? + path2fid $1 || error "cannot get fid on $1" } @@ -595,12 +684,16 @@ make_large_for_progress() { cleanup_large_files + check_enough_free_space 39 1000000 + [ $? != 0 ] && return $? + # big file is large enough, so copy time is > 30s # so copytool make 1 progress # size is not a multiple of 1M to avoid stripe # aligment dd if=/dev/urandom of=$file2 count=39 bs=1000000 conv=fsync || - error "cannot create $file2" + file_creation_failure dd $file2 $? + path2fid $1 || error "cannot get fid on $1" } @@ -609,12 +702,15 @@ make_large_for_progress_aligned() { cleanup_large_files + check_enough_free_space 33 1048576 + [ $? != 0 ] && return $? + # big file is large enough, so copy time is > 30s # so copytool make 1 progress # size is a multiple of 1M to have stripe # aligment dd if=/dev/urandom of=$file2 count=33 bs=1M conv=fsync || - error "cannot create $file2" + file_creation_failure dd $file2 $? path2fid $1 || error "cannot get fid on $1" } @@ -623,9 +719,12 @@ make_large_for_cancel() { cleanup_large_files + check_enough_free_space 103 1048576 + [ $? != 0 ] && return $? + # Copy timeout is 100s. 105MB => 105s dd if=/dev/urandom of=$file2 count=103 bs=1M conv=fsync || - error "cannot create $file2" + file_creation_failure dd $file2 $? path2fid $1 || error "cannot get fid on $1" } @@ -646,7 +745,7 @@ wait_request_state() { local cmd="$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.actions" cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d=" - wait_result $mds "$cmd" $state 100 || + wait_result $mds "$cmd" $state 200 || error "request on $fid is not $state on $mds" } @@ -702,7 +801,7 @@ get_mdt_devices init_agt_vars # cleanup from previous bad setup -search_and_kill_copytool +kill_copytools # for recovery tests, coordinator needs to be started at mount # so force it @@ -755,6 +854,23 @@ test_1() { } run_test 1 "lfs hsm flags root/non-root access" +test_1a() { + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(make_small $f) + + $LFS hsm_archive $f || error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + # Release and check states + $LFS hsm_release $f || error "could not release file" + echo -n "Verifying released state: " + check_hsm_flags $f "0x0000000d" + + $MMAP_CAT $f > /dev/null || error "failed mmap & cat release file" +} +run_test 1a "mmap & cat a HSM released file" + test_2() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile @@ -810,7 +926,7 @@ test_3() { error "user could not change hsm flags" dd if=/etc/passwd of=$f.append bs=1 count=3\ conv=notrunc oflag=append status=noxfer || - error "could not append to test file" + file_creation_failure dd $f.append $? check_hsm_flags $f.append "0x00000003" # Modify a file sets it dirty @@ -819,7 +935,7 @@ test_3() { error "user could not change hsm flags" dd if=/dev/zero of=$f.modify bs=1 count=3\ conv=notrunc status=noxfer || - error "could not modify test file" + file_creation_failure dd $f.modify $? check_hsm_flags $f.modify "0x00000003" # Open O_TRUNC sets dirty @@ -880,7 +996,7 @@ test_9() { copytool_cleanup } -run_test 9 "Use of explict archive number, with dedicated copytool" +run_test 9 "Use of explicit archive number, with dedicated copytool" test_9a() { needclients 3 || return 0 @@ -1105,7 +1221,10 @@ test_12c() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile $LFS setstripe -c 2 $f - local fid=$(make_large_for_striping $f) + local fid + fid=$(make_large_for_striping $f) + [ $? != 0 ] && skip "not enough free space" && return + local FILE_CRC=$(md5sum $f) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -1332,6 +1451,97 @@ test_12o() { } run_test 12o "Layout-swap failure during Restore leaves file released" +test_12p() { + # test needs a running copytool + copytool_setup + + mkdir $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + do_facet $SINGLEAGT cat $f > /dev/null || error "cannot cat $f" + $LFS hsm_release $f || error "cannot release $f" + do_facet $SINGLEAGT cat $f > /dev/null || error "cannot cat $f" + $LFS hsm_release $f || error "cannot release $f" + do_facet $SINGLEAGT cat $f > /dev/null || error "cannot cat $f" + + copytool_cleanup +} +run_test 12p "implicit restore of a file on copytool mount point" + +cleanup_test_12q() { + trap 0 + zconf_umount $(facet_host $SINGLEAGT) $MOUNT3 || + error "cannot umount $MOUNT3 on $SINGLEAGT" +} + +test_12q() { + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.58) ] && + skip "need MDS version at least 2.7.58" && return 0 + + zconf_mount $(facet_host $SINGLEAGT) $MOUNT3 || + error "cannot mount $MOUNT3 on $SINGLEAGT" + + trap cleanup_test_12q EXIT + + # test needs a running copytool + copytool_setup $SINGLEAGT $MOUNT3 + + mkdir $DIR/$tdir + local f=$DIR/$tdir/$tfile + local f2=$DIR2/$tdir/$tfile + local fid=$(make_small $f) + local orig_size=$(stat -c "%s" $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + $LFS hsm_release $f || error "could not release file" + check_hsm_flags $f "0x0000000d" + + kill_copytools + wait_copytools || error "copytool failed to stop" + + cat $f > /dev/null & + + # wait a bit to allow implicit restore request to be handled. + # if not, next stat would also block on layout-lock. + sleep 5 + + local size=$(stat -c "%s" $f2) + [ $size -eq $orig_size ] || + error "$f2: wrong size after archive: $size != $orig_size" + + HSM_ARCHIVE_PURGE=false copytool_setup $SINGLEAGT /mnt/lustre3 + + wait + + size=$(stat -c "%s" $f) + [ $size -eq $orig_size ] || + error "$f: wrong size after restore: $size != $orig_size" + + size=$(stat -c "%s" $f2) + [ $size -eq $orig_size ] || + error "$f2: wrong size after restore: $size != $orig_size" + + :>$f + + size=$(stat -c "%s" $f) + [ $size -eq 0 ] || + error "$f: wrong size after overwrite: $size != 0" + + size=$(stat -c "%s" $f2) + [ $size -eq 0 ] || + error "$f2: wrong size after overwrite: $size != 0" + + copytool_cleanup + zconf_umount $(facet_host $SINGLEAGT) $MOUNT3 || + error "cannot umount $MOUNT3 on $SINGLEAGT" +} +run_test 12q "file attributes are refreshed after restore" + test_13() { # test needs a running copytool copytool_setup @@ -1726,7 +1936,7 @@ test_24a() { [ $mtime0 -eq $mtime1 ] || error "restore changed mtime from $mtime0 to $mtime1" - [ $ctime0 -le $ctime1 ] || + [ $ctime0 -eq $ctime1 ] || error "restore changed ctime from $ctime0 to $ctime1" copytool_cleanup @@ -1745,7 +1955,7 @@ test_24a() { [ $mtime0 -eq $mtime1 ] || error "remount changed mtime from $mtime0 to $mtime1" - [ $ctime0 -le $ctime1 ] || + [ $ctime0 -eq $ctime1 ] || error "remount changed ctime from $ctime0 to $ctime1" } run_test 24a "Archive, release, and restore does not change a/mtime (i/o)" @@ -1761,7 +1971,7 @@ test_24b() { copytool_setup mkdir -p $DIR/$tdir - # Check that root can do HSM actions on a ordinary user's file. + # Check that root can do HSM actions on a regular user's file. rm -f $file fid=$(make_small $file) sum0=$(md5sum $file) @@ -1927,6 +2137,55 @@ test_24d() { } run_test 24d "check that read-only mounts are respected" +test_24e() { + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + local fid + + fid=$(make_small $f) || error "cannot create $f" + $LFS hsm_archive $f || error "cannot archive $f" + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "cannot release $f" + while ! $LFS hsm_state $f | grep released; do + sleep 1 + done + + tar -cf $TMP/$tfile.tar $DIR/$tdir || error "cannot tar $DIR/$tdir" + + copytool_cleanup +} +run_test 24e "tar succeeds on HSM released files" # LU-6213 + +test_24f() { + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir/d1 + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + sum0=$(md5sum $f) + echo $sum0 + $LFS hsm_archive -a $HSM_ARCHIVE_NUMBER $f || + error "hsm_archive failed" + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "cannot release $f" + tar --xattrs -cvf $f.tar -C $DIR/$tdir $tfile + rm -f $f + sync + tar --xattrs -xvf $f.tar -C $DIR/$tdir || + error "Can not recover the tar contents" + sum1=$(md5sum $f) + echo "Sum0 = $sum0, sum1 = $sum1" + [ "$sum0" == "$sum1" ] || error "md5sum mismatch for '$tfile'" + + copytool_cleanup +} +run_test 24f "root can archive, release, and restore tar files" + test_25a() { # test needs a running copytool copytool_setup @@ -1980,7 +2239,10 @@ test_26() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED @@ -2017,7 +2279,10 @@ test_27b() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2036,7 +2301,10 @@ test_28() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED @@ -2053,6 +2321,76 @@ test_28() { } run_test 28 "Concurrent archive/file remove" +test_29a() { + # Tests --mntpath and --archive options + + local archive_id=7 + copytool_setup $SINGLEAGT $MOUNT $archive_id + + # Bad archive number + $LFS hsm_remove -m $MOUNT -a 33 0x857765760:0x8:0x2 2>&1 | + grep "Invalid argument" || + error "unexpected hsm_remove failure (1)" + + # mntpath is present but file is given + $LFS hsm_remove --mntpath $MOUNT --archive 30 /qwerty/uyt 2>&1 | + grep "hsm: '/qwerty/uyt' is not a valid FID" || + error "unexpected hsm_remove failure (2)" + + copytool_cleanup +} +run_test 29a "Tests --mntpath and --archive options" + +test_29b() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(make_small $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + rm -f $f + + $LFS hsm_remove -m $MOUNT -a $HSM_ARCHIVE_NUMBER $fid + wait_request_state $fid REMOVE SUCCEED + + copytool_cleanup +} +run_test 29b "Archive/delete/remove by FID from the archive." + +test_29c() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local fid1=$(make_small $DIR/$tdir/$tfile-1) + local fid2=$(make_small $DIR/$tdir/$tfile-2) + local fid3=$(make_small $DIR/$tdir/$tfile-3) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $DIR/$tdir/$tfile-[1-3] + wait_request_state $fid1 ARCHIVE SUCCEED + wait_request_state $fid2 ARCHIVE SUCCEED + wait_request_state $fid3 ARCHIVE SUCCEED + + rm -f $DIR/$tdir/$tfile-[1-3] + + echo $fid1 > $DIR/$tdir/list + echo $fid2 >> $DIR/$tdir/list + echo $fid3 >> $DIR/$tdir/list + + $LFS hsm_remove -m $MOUNT -a $HSM_ARCHIVE_NUMBER \ + --filelist $DIR/$tdir/list + wait_request_state $fid1 REMOVE SUCCEED + wait_request_state $fid2 REMOVE SUCCEED + wait_request_state $fid3 REMOVE SUCCEED + + copytool_cleanup +} +run_test 29c "Archive/delete/remove by FID, using a file list." + test_30a() { # restore at exec cannot work on agent node (because of Linux kernel # protection of executables) @@ -2125,6 +2463,7 @@ test_30c() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/SLEEP + local slp_sum1=$(md5sum /bin/sleep) local fid=$(copy_file /bin/sleep $f) chmod 755 $f $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2141,7 +2480,12 @@ test_30c() { wait $pid [[ $? == 0 ]] || error "Execution failed during run" cmp /bin/sleep $f - [[ $? == 0 ]] || error "Binary overwritten during exec" + if [[ $? != 0 ]]; then + local slp_sum2=$(md5sum /bin/sleep) + # in case sleep file is modified during the test + [[ $slp_sum1 == $slp_sum2 ]] && + error "Binary overwritten during exec" + fi # cleanup # remove no try action mode @@ -2214,7 +2558,10 @@ test_31b() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2235,7 +2582,10 @@ test_31c() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress_aligned $f) + local fid + fid=$(make_large_for_progress_aligned $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2256,7 +2606,10 @@ test_33() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2320,7 +2673,10 @@ test_34() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2353,7 +2709,10 @@ test_35() { local f=$DIR/$tdir/$tfile local f1=$DIR/$tdir/$tfile-1 - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + local fid1=$(copy_file /etc/passwd $f1) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED @@ -2389,7 +2748,10 @@ test_36() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2416,6 +2778,31 @@ test_36() { } run_test 36 "Move file during restore" +test_37() { + # LU-5683: check that an archived dirty file can be rearchived. + copytool_cleanup + copytool_setup $SINGLEAGT $MOUNT2 + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid + + fid=$(make_small $f) || error "cannot create small file" + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "cannot release $f" + + # Dirty file. + dd if=/dev/urandom of=$f bs=1M count=1 || error "cannot dirty file" + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + copytool_cleanup +} +run_test 37 "re-archive a dirty file" + multi_archive() { local prefix=$1 local count=$2 @@ -2443,7 +2830,7 @@ test_40() { done # force copytool to use a local/temp archive dir to ensure best # performance vs remote/NFS mounts used in auto-tests - if df --local $HSM_ARCHIVE >/dev/null 2>&1 ; then + if do_facet $SINGLEAGT "df --local $HSM_ARCHIVE" >/dev/null 2>&1 ; then copytool_setup else copytool_setup $SINGLEAGT $MOUNT $HSM_ARCHIVE_NUMBER $TMP/$tdir @@ -2526,7 +2913,7 @@ test_54() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_small $f) + local fid=$(make_large_for_progress $f) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" @@ -2554,7 +2941,7 @@ test_55() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_small $f) + local fid=$(make_large_for_progress $f) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" @@ -2582,7 +2969,9 @@ test_56() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" @@ -2691,12 +3080,29 @@ test_58() { } run_test 58 "Truncate a released file will trigger restore" +test_59() { + local fid + local server_version=$(lustre_version_code $SINGLEMDS) + [[ $server_version -lt $(version_code 2.7.63) ]] && + skip "Need MDS version at least 2.7.63" && return + + copytool_setup + $MCREATE $DIR/$tfile || error "mcreate failed" + $TRUNCATE $DIR/$tfile 42 || error "truncate failed" + $LFS hsm_archive $DIR/$tfile || error "archive request failed" + fid=$(path2fid $DIR/$tfile) + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $DIR/$tfile || error "release failed" + copytool_cleanup +} +run_test 59 "Release stripeless file with non-zero size" + test_60() { # This test validates the fix for LU-4512. Ensure that the -u - # option changes the progress reporting interval from the default - # (30 seconds) to the user-specified interval. + # option changes the progress reporting interval from the + # default (30 seconds) to the user-specified interval. local interval=5 - local progress_timeout=$((interval * 3)) + local progress_timeout=$((interval * 4)) # test needs a new running copytool copytool_cleanup @@ -2704,15 +3110,32 @@ test_60() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + + local mdtidx=0 + local mdt=${MDT_PREFIX}${mdtidx} + local mds=mds$((mdtidx + 1)) + + # Wait for copytool to register + wait_update_facet $mds \ + "$LCTL get_param -n ${mdt}.hsm.agents | grep -o ^uuid" \ + uuid 100 || error "coyptool failed to register with $mdt" local start_at=$(date +%s) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" - local mdtidx=0 - local mdt=${MDT_PREFIX}${mdtidx} - local mds=mds$((mdtidx + 1)) + local agent=$(facet_active_host $SINGLEAGT) + local prefix=$TESTLOG_PREFIX + [[ -z "$TESTNAME" ]] || prefix=$prefix.$TESTNAME + local copytool_log=$prefix.copytool_log.$agent.log + + + wait_update $agent \ + "grep -o start.copy $copytool_log" "start copy" 100 || + error "copytool failed to start" local cmd="$LCTL get_param -n ${mdt}.hsm.active_requests" cmd+=" | awk '/'$fid'.*action=ARCHIVE/ {print \\\$12}' | cut -f2 -d=" @@ -2756,6 +3179,13 @@ test_70() { # Just start and stop the copytool to generate events. cdt_clear_no_retry + + # Wait for the copytool to register. + wait_update --verbose $(facet_active_host mds1) \ + "$LCTL get_param -n ${MDT_PREFIX}0.hsm.agents | grep -o ^uuid" \ + uuid 100 || + error "copytool failed to register with MDT0000" + copytool_cleanup local REGISTER_EVENT @@ -2799,7 +3229,9 @@ test_71() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" @@ -2981,7 +3413,14 @@ test_90() { fid=$(copy_file /etc/hosts $f.$i) echo $f.$i >> $FILELIST done - copytool_setup + # force copytool to use a local/temp archive dir to ensure best + # performance vs remote/NFS mounts used in auto-tests + if do_facet $SINGLEAGT "df --local $HSM_ARCHIVE" >/dev/null 2>&1 ; then + copytool_setup + else + local dai=$(get_hsm_param default_archive_id) + copytool_setup $SINGLEAGT $MOUNT $dai $TMP/$tdir + fi # to be sure wait_all_done will not be mislead by previous tests cdt_purge wait_for_grace_delay @@ -3068,7 +3507,10 @@ test_104() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + # if cdt is on, it can serve too quickly the request cdt_disable $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER --data $DATA $f @@ -3157,11 +3599,24 @@ check_agent_unregistered() { done } -test_106() { - local uuid=$(do_rpc_nodes $(facet_active_host $SINGLEAGT) \ - get_client_uuid $MOUNT | cut -d' ' -f2) +get_agent_uuid() { + local agent=${1:-$(facet_active_host $SINGLEAGT)} + # Lustre mount-point is mandatory and last parameter on + # copytool cmd-line. + local mntpnt=$(do_rpc_nodes $agent pgrep -fl $HSMTOOL_BASE | + grep -v pgrep | awk '{print $NF}') + [ -n "$mntpnt" ] || error "Found no Agent or with no mount-point "\ + "parameter" + do_rpc_nodes $agent get_client_uuid $mntpnt | cut -d' ' -f2 +} + +test_106() { + # test needs a running copytool copytool_setup + + local uuid=$(get_agent_uuid $(facet_active_host $SINGLEAGT)) + check_agent_registered $uuid search_copytools || error "No copytool found" @@ -3170,6 +3625,7 @@ test_106() { check_agent_unregistered $uuid copytool_setup + uuid=$(get_agent_uuid $(facet_active_host $SINGLEAGT)) check_agent_registered $uuid copytool_cleanup @@ -3379,10 +3835,15 @@ test_200() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_cancel $f) + local fid + fid=$(make_large_for_cancel $f) + [ $? != 0 ] && skip "not enough free space" && return + # test with cdt on is made in test_221 cdt_disable $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + # wait archive to register at CDT + wait_request_state $fid ARCHIVE WAITING $LFS hsm_cancel $f cdt_enable wait_request_state $fid ARCHIVE CANCELED @@ -3405,6 +3866,8 @@ test_201() { # test with cdt on is made in test_222 cdt_disable $LFS hsm_restore $f + # wait restore to register at CDT + wait_request_state $fid RESTORE WAITING $LFS hsm_cancel $f cdt_enable wait_request_state $fid RESTORE CANCELED @@ -3420,12 +3883,17 @@ test_202() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED cdt_disable $LFS hsm_remove $f + # wait remove to register at CDT + wait_request_state $fid REMOVE WAITING $LFS hsm_cancel $f cdt_enable wait_request_state $fid REMOVE CANCELED @@ -3465,7 +3933,9 @@ test_221() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_cancel $f) + local fid + fid=$(make_large_for_cancel $f) + [ $? != 0 ] && skip "not enough free space" && return changelog_setup @@ -3572,7 +4042,9 @@ test_223b() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return changelog_setup $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -3632,7 +4104,9 @@ test_225() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_large_for_progress $f) + [ $? != 0 ] && skip "not enough free space" && return changelog_setup $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -3835,7 +4309,9 @@ test_251() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_cancel $f) + local fid + fid=$(make_large_for_cancel $f) + [ $? != 0 ] && skip "not enough free space" && return cdt_disable # to have a short test @@ -3847,6 +4323,10 @@ test_251() { set_hsm_param loop_period 2 cdt_enable + # clear locks to avoid extra delay caused by flush/cancel + # and thus prevent early copytool death to timeout. + cancel_lru_locks osc + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE STARTED sleep 5 @@ -4033,12 +4513,12 @@ test_403() { copytool_cleanup local agent=$(facet_active_host $SINGLEAGT) - local uuid=$(do_rpc_nodes $agent get_client_uuid | cut -d' ' -f2) # deactivate all mdc for MDT0001 mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "deactivate" copytool_setup + local uuid=$(get_agent_uuid $agent) # check the agent is registered on MDT0000, and not on MDT0001 check_agent_registered_by_mdt $uuid 0 check_agent_unregistered_by_mdt $uuid 1 @@ -4139,6 +4619,79 @@ test_405() { } run_test 405 "archive and release under striped directory" +test_406() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] && + skip "need MDS version at least 2.7.64" && return 0 + + local fid + local mdt_index + + copytool_setup + mkdir -p $DIR/$tdir + fid=$(make_small $DIR/$tdir/$tfile) + echo "old fid $fid" + + $LFS hsm_archive $DIR/$tdir/$tfile + wait_request_state "$fid" ARCHIVE SUCCEED + $LFS hsm_release $DIR/$tdir/$tfile + + # Should migrate $tdir but not $tfile. + $LFS mv -M1 $DIR/$tdir && + error "migrating HSM an archived file should fail" + + $LFS hsm_restore $DIR/$tdir/$tfile + wait_request_state "$fid" RESTORE SUCCEED + + $LFS hsm_remove $DIR/$tdir/$tfile + wait_request_state "$fid" REMOVE SUCCEED + + cat $DIR/$tdir/$tfile > /dev/null || + error "cannot read $DIR/$tdir/$tfile" + + $LFS mv -M1 $DIR/$tdir || + error "cannot complete migration after HSM remove" + + mdt_index=$($LFS getstripe -M $DIR/$tdir) + if ((mdt_index != 1)); then + error "expected MDT index 1, got $mdt_index" + fi + + # Refresh fid after migration. + fid=$(path2fid $DIR/$tdir/$tfile) + echo "new fid $fid" + + $LFS hsm_archive $DIR/$tdir/$tfile + wait_request_state "$fid" ARCHIVE SUCCEED 1 + + lctl set_param debug=+trace + $LFS hsm_release $DIR/$tdir/$tfile || + error "cannot release $DIR/$tdir/$tfile" + + $LFS hsm_restore $DIR/$tdir/$tfile + wait_request_state "$fid" RESTORE SUCCEED 1 + + cat $DIR/$tdir/$tfile > /dev/null || + error "cannot read $DIR/$tdir/$tfile" + + copytool_cleanup +} +run_test 406 "attempting to migrate HSM archived files is safe" + +test_500() +{ + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.92) ] && + skip "HSM migrate is not supported" && return + + # Stop the existing copytool + copytool_cleanup + + test_mkdir -p $DIR/$tdir + llapi_hsm_test -d $DIR/$tdir || error "One llapi HSM test failed" +} +run_test 500 "various LLAPI HSM tests" + copytool_cleanup complete $SECONDS