X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Fsanity-hsm.sh;h=25c9cf8a14f21bf93c2119b77382a4a5b690eda9;hb=cc6ef11d2f972ebc440013bddda87a536a09750c;hp=fd2f71121f5ee8a65492ae2314ef59a8c6c1e8aa;hpb=89a668f6c8770c5af8c99984003ffd9844dc2904;p=fs%2Flustre-release.git diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index fd2f711..25c9cf8 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -11,8 +11,8 @@ SRCDIR=$(dirname $0) export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/utils:$PATH:/sbin:/usr/sbin ONLY=${ONLY:-"$*"} -# bug number for skipped test: LU-3815 -ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 34 35 36" +# bug number for skipped test: +ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} @@ -170,15 +170,18 @@ wait_copytools() { local wait_timeout=200 local wait_start=$SECONDS local wait_end=$((wait_start + wait_timeout)) + local sleep_time=100000 # 0.1 second while ((SECONDS < wait_end)); do - sleep 2 if ! search_copytools $hosts; then echo "copytools stopped in $((SECONDS - wait_start))s" return 0 fi echo "copytools still running on $hosts" + usleep $sleep_time + [ $sleep_time -lt 32000000 ] && # 3.2 seconds + sleep_time=$(bc <<< "$sleep_time * 2") done # try to dump Copytool's stack @@ -258,12 +261,6 @@ copytool_setup() { local agent=$(facet_active_host $facet) - if [[ -z "$arc_id" ]] && - do_facet $facet "pkill -CONT -x $HSMTOOL_BASE"; then - echo "Only wakeup running copytool $facet on $agent" - return 0 - fi - if $HSM_ARCHIVE_PURGE; then echo "Purging archive on $agent" do_facet $facet "rm -rf $hsm_root/$HSMTMP/*" @@ -377,6 +374,13 @@ copytool_suspend() { echo "Copytool is suspended on $agents" } +copytool_continue() { + local agents=${1:-$(facet_active_host $SINGLEAGT)} + + do_nodesv $agents "pkill -CONT -x $HSMTOOL_BASE" || return 0 + echo "Copytool is continued on $agents" +} + copytool_remove_backend() { local fid=$1 local be=$(do_facet $SINGLEAGT find $HSM_ARCHIVE -name $fid) @@ -681,66 +685,18 @@ check_enough_free_space() { return 0 } -make_large_for_striping() { +make_custom_file_for_progress() { local file2=${1/$DIR/$DIR2} - local sz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -n1) - - cleanup_large_files - - check_enough_free_space 5 $sz - [ $? != 0 ] && return $? - - dd if=/dev/urandom of=$file2 count=5 bs=$sz conv=fsync || - file_creation_failure dd $file2 $? - - path2fid $1 || error "cannot get fid on $1" -} - -make_large_for_progress() { - local file2=${1/$DIR/$DIR2} - - cleanup_large_files - - check_enough_free_space 39 1000000 - [ $? != 0 ] && return $? - - # big file is large enough, so copy time is > 30s - # so copytool make 1 progress - # size is not a multiple of 1M to avoid stripe - # aligment - dd if=/dev/urandom of=$file2 count=39 bs=1000000 conv=fsync || - file_creation_failure dd $file2 $? - - path2fid $1 || error "cannot get fid on $1" -} - -make_large_for_progress_aligned() { - local file2=${1/$DIR/$DIR2} - - cleanup_large_files + local fsize=${2:-"39"} + local blksz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -n1) + blksz=${3:-$blksz} - check_enough_free_space 33 1048576 - [ $? != 0 ] && return $? - - # big file is large enough, so copy time is > 30s - # so copytool make 1 progress - # size is a multiple of 1M to have stripe - # aligment - dd if=/dev/urandom of=$file2 count=33 bs=1M conv=fsync || - file_creation_failure dd $file2 $? - path2fid $1 || error "cannot get fid on $1" -} - -make_large_for_cancel() { - local file2=${1/$DIR/$DIR2} + [[ $fsize -gt 0 ]] || error "Invalid file size" + [[ $blksz -gt 0 ]] || error "Invalid stripe size" cleanup_large_files - - check_enough_free_space 103 1048576 - [ $? != 0 ] && return $? - - # Copy timeout is 100s. 105MB => 105s - dd if=/dev/urandom of=$file2 count=103 bs=1M conv=fsync || + check_enough_free_space $fsize $blksz || return $? + dd if=/dev/zero of=$file2 count=$fsize bs=$blksz conv=fsync || file_creation_failure dd $file2 $? path2fid $1 || error "cannot get fid on $1" } @@ -782,6 +738,15 @@ get_request_count() { "awk -vn=0 '/'$fid'.*action='$request'/ {n++}; END {print n}'" } +# Ensure the number of HSM request for a given FID is correct +# assert_request_count FID REQUEST_TYPE COUNT [ERROR_MSG] +assert_request_count() { + local request_count=$(get_request_count $1 $2) + local default_error_msg=("expected $3 '$2' request(s) for '$1', found " + "'$request_count'") + [ $request_count -eq $3 ] || error "${4:-"${default_error_msg[@]}"}" +} + wait_all_done() { local timeout=$1 local fid=$2 @@ -799,6 +764,11 @@ wait_for_grace_delay() { sleep $val } +wait_for_loop_period() { + local val=$(get_hsm_param loop_period) + sleep $val +} + parse_json_event() { local raw_event=$1 @@ -887,9 +857,6 @@ echo "Set HSM on and start" cdt_set_mount_state enabled cdt_check_state enabled -echo "Start copytool" -copytool_setup - echo "Set sanity-hsm HSM policy" cdt_set_sanity_policy @@ -936,6 +903,8 @@ test_1a() { local f=$DIR/$tdir/$tfile local fid=$(make_small $f) + copytool_setup + $LFS hsm_archive $f || error "could not archive file" wait_request_state $fid ARCHIVE SUCCEED @@ -945,9 +914,43 @@ test_1a() { check_hsm_flags $f "0x0000000d" $MMAP_CAT $f > /dev/null || error "failed mmap & cat release file" + + copytool_cleanup } run_test 1a "mmap & cat a HSM released file" +test_1b() { + mkdir -p $DIR/$tdir + $LFS setstripe -E 1M -E 64M -c 2 -E -1 -c 4 $DIR/$tdir || + error "failed to set default stripe" + local f=$DIR/$tdir/$tfile + rm -f $f + + dd if=/dev/random of=$f bs=1M count=1 conv=sync || + error "failed to create file" + local fid=$(path2fid $f) + + copytool_setup + + echo "archive $f" + $LFS hsm_archive $f || error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + echo "release $f" + $LFS hsm_release $f || error "could not release file" + echo "verify released state: " + check_hsm_flags $f "0x0000000d" && echo "pass" + + echo "restore $f" + $LFS hsm_restore $f || error "could not restore file" + wait_request_state $fid RESTORE SUCCEED + echo "verify restored state: " + check_hsm_flags $f "0x00000009" && echo "pass" + + copytool_cleanup +} +run_test 1b "Archive, Release & Restore composite file" + test_2() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile @@ -1305,7 +1308,7 @@ test_12c() { local f=$DIR/$tdir/$tfile $LFS setstripe -c 2 $f local fid - fid=$(make_large_for_striping $f) + fid=$(make_custom_file_for_progress $f 5) [ $? != 0 ] && skip "not enough free space" && return local FILE_CRC=$(md5sum $f) @@ -1773,7 +1776,7 @@ test_16() { # Add 1 to account for rounding errors between start and end (LU-8155) local duration=$((end - start + 1)) - [[ $duration -ge $goal ]] || + [[ $duration -ge $((goal - 1)) ]] || error "Transfer is too fast $duration < $goal" copytool_cleanup @@ -2173,6 +2176,7 @@ run_test 24c "check that user,group,other request masks work" cleanup_test_24d() { trap 0 mount -o remount,rw $MOUNT2 + zconf_umount $(facet_host $SINGLEAGT) "$MOUNT3" } test_24d() { @@ -2181,16 +2185,24 @@ test_24d() { local fid1 local fid2 - copytool_setup - mkdir -p $DIR/$tdir rm -f $file1 fid1=$(make_small $file1) + echo $fid1 + $LFS getstripe $file1 + trap cleanup_test_24d EXIT + zconf_mount $(facet_host $SINGLEAGT) "$MOUNT3" || + error "cannot mount '$MOUNT3' on '$SINGLEAGT'" + copytool_setup $SINGLEAGT "$MOUNT3" || + error "unable to setup a copytool for the test" mount -o remount,ro $MOUNT2 + do_nodes $(comma_list $(nodes_list)) $LCTL clear + start_full_debug_logging + fid2=$(path2fid $file2) [ "$fid1" == "$fid2" ] || error "FID mismatch '$fid1' != '$fid2'" @@ -2199,15 +2211,17 @@ test_24d() { error "archive should fail on read-only mount" check_hsm_flags $file1 "0x00000000" - $LFS hsm_archive $file1 + $LFS hsm_archive $file1 || error "Fail to archive $file1" wait_request_state $fid1 ARCHIVE SUCCEED + stop_full_debug_logging + $LFS hsm_release $file1 $LFS hsm_restore $file2 wait_request_state $fid1 RESTORE SUCCEED $LFS hsm_release $file1 || error "cannot release '$file1'" - dd if=$file2 of=/dev/null bs=1M || "cannot read '$file2'" + dd if=$file2 of=/dev/null bs=1M || error "cannot read '$file2'" $LFS hsm_release $file2 && error "release should fail on read-only mount" @@ -2320,7 +2334,7 @@ test_26() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2335,6 +2349,222 @@ test_26() { } run_test 26 "Remove the archive of a valid file" +cleanup_test_26a() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + set_hsm_param loop_period $orig_loop_period + set_hsm_param grace_delay $orig_grace_delay + copytool_cleanup +} + +test_26a() { + local raolu=$(get_hsm_param remove_archive_on_last_unlink) + [[ $raolu -eq 0 ]] || error "RAoLU policy should be off" + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + local f2=$DIR/$tdir/${tfile}_2 + local fid2=$(copy_file /etc/passwd $f2) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f2 + wait_request_state $fid2 ARCHIVE SUCCEED + + local f3=$DIR/$tdir/${tfile}_3 + local fid3=$(copy_file /etc/passwd $f3) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f3 + wait_request_state $fid3 ARCHIVE SUCCEED + + trap cleanup_test_26a EXIT + + # set a long grace_delay vs short loop_period + local orig_loop_period=$(get_hsm_param loop_period) + local orig_grace_delay=$(get_hsm_param grace_delay) + set_hsm_param loop_period 10 + set_hsm_param grace_delay 100 + + rm -f $f + + set_hsm_param remove_archive_on_last_unlink 1 + + ln "$f3" "$f3"_bis || error "Unable to create hard-link" + rm -f $f3 + + rm -f $f2 + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid2 REMOVE SUCCEED + + assert_request_count $fid REMOVE 0 \ + "Unexpected archived data remove request for $f" + assert_request_count $fid3 REMOVE 0 \ + "Unexpected archived data remove request for $f3" + + cleanup_test_26a +} +run_test 26a "Remove Archive On Last Unlink (RAoLU) policy" + +cleanup_test_26b() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + copytool_cleanup +} + +test_26b() { + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + trap cleanup_test_26b EXIT + + set_hsm_param remove_archive_on_last_unlink 1 + + cdt_shutdown + cdt_check_state stopped + + rm -f $f + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid REMOVE WAITING + + cdt_enable + # copytool must re-register + kill_copytools + wait_copytools || error "copytool failed to stop" + HSM_ARCHIVE_PURGE=false copytool_setup + + wait_request_state $fid REMOVE SUCCEED + + cleanup_test_26b +} +run_test 26b "RAoLU policy when CDT off" + +cleanup_test_26c() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + set_hsm_param loop_period $orig_loop_period + set_hsm_param grace_delay $orig_grace_delay + copytool_cleanup +} + +test_26c() { + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + local f2=$DIR/$tdir/${tfile}_2 + local fid2=$(copy_file /etc/passwd $f2) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f2 + wait_request_state $fid2 ARCHIVE SUCCEED + + trap cleanup_test_26c EXIT + + # set a long grace_delay vs short loop_period + local orig_loop_period=$(get_hsm_param loop_period) + local orig_grace_delay=$(get_hsm_param grace_delay) + set_hsm_param loop_period 10 + set_hsm_param grace_delay 100 + + set_hsm_param remove_archive_on_last_unlink 1 + + multiop_bg_pause $f O_c || error "open $f failed" + local pid=$! + + rm -f $f + rm -f $f2 + + wait_request_state $fid2 REMOVE SUCCEED + assert_request_count $fid REMOVE 0 \ + "Unexpected archived data remove request for $f" + + kill -USR1 $pid || error "multiop early exit" + # should reach autotest timeout if multiop fails to trap + # signal, close file, and exit ... + wait $pid || error + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid REMOVE SUCCEED + + cleanup_test_26c +} +run_test 26c "RAoLU effective when file closed" + +cleanup_test_26d() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + set_hsm_param loop_period $orig_loop_period + set_hsm_param grace_delay $orig_grace_delay + copytool_cleanup +} + +test_26d() { + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/motd $f 1) + + $LFS hsm_archive $f || error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + trap cleanup_test_26d EXIT + + # set a long grace_delay vs short loop_period + local orig_loop_period=$(get_hsm_param loop_period) + local orig_grace_delay=$(get_hsm_param grace_delay) + set_hsm_param loop_period 10 + set_hsm_param grace_delay 100 + + set_hsm_param remove_archive_on_last_unlink 1 + + multiop_bg_pause $f O_c || error "multiop failed" + local MULTIPID=$! + + rm -f $f + + mds_evict_client + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid REMOVE SUCCEED + + client_up || client_up || true + + kill -USR1 $MULTIPID + wait $MULTIPID || error "multiop close failed" + + cleanup_test_26d +} +run_test 26d "RAoLU when Client eviction" + test_27a() { # test needs a running copytool copytool_setup @@ -2360,7 +2590,7 @@ test_27b() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2382,7 +2612,7 @@ test_28() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2471,6 +2701,78 @@ test_29c() { } run_test 29c "Archive/delete/remove by FID, using a file list." +test_29d() { + # test needs more than one CT + needclients 3 || return 0 + + local n + local file + local fid + + copytool_cleanup $(comma_list $(agts_nodes)) + + # start all of the copytools + for n in $(seq $AGTCOUNT); do + copytool_setup agt$n $MOUNT2 $n + done + + trap "copytool_cleanup $(comma_list $(agts_nodes))" EXIT + # archive files + mkdir -p $DIR/$tdir + file=$DIR/$tdir/$tfile + fid=$(make_small $file) + + $LFS hsm_archive $file + wait_request_state $fid ARCHIVE SUCCEED + check_hsm_flags $file "0x00000009" + + rm -f $file + + $LFS hsm_remove --mntpath "$MOUNT" -a 0 $fid || + error "cannot hsm_remove '$fid'" + + # give time for CDT to handle remove request and create broadcasted + sleep 2 + + # remove request has been broadcasted ? + local cnt=$(get_request_count $fid REMOVE) + # broadcasted requests + original + [[ $cnt -eq $((AGTCOUNT + 1)) ]] || + error "remove not broadcasted to all CTs" + + # give time for CDT and CTs to handle broadcasted + wait_for_loop_period + + # each agent serves one different archive_id, so broadcasted + # hsm_remove request should only succeed once and fail at all others + local res + local scnt=0 + local fcnt=0 + for n in $(seq $AGTCOUNT); do + res=$(do_facet $SINGLEMDS "$LCTL get_param -n \ + $HSM_PARAM.actions | awk \ + '/'$fid'.*action=REMOVE archive#='$n'/ \ + {print \\\$13}' | cut -f2 -d=") + if [[ "$res" == "SUCCEED" ]]; then + scnt=$((scnt + 1)) + elif [[ "$res" == "FAILED" ]]; then + fcnt=$((fcnt + 1)) + fi + done + + [[ $scnt -ne 1 ]] && + error "one and only CT should have removed successfully" + + [[ $AGTCOUNT -ne $((scnt + fcnt)) ]] && + error "all but one CT should have failed to remove" + + trap - EXIT + copytool_cleanup $(comma_list $(agts_nodes)) + +} +run_test 29d "hsm_remove by FID with archive_id 0 for unlinked file cause "\ + "request to be sent once for each registered archive_id" + test_30a() { # restore at exec cannot work on agent node (because of Linux kernel # protection of executables) @@ -2588,19 +2890,18 @@ restore_and_check_size() { while [[ "$st" != "0x00000009" && $cpt -le 10 ]] do n=$(stat -c "%s" $f) - # we echo in both cases to show stat is not - # hang + # we echo in both cases to show stat is not hang if [[ $n != $s ]]; then echo "size seen is $n != $s" err=1 else echo "size seen is right: $n == $s" fi - st=$(get_hsm_flags $f) sleep 10 cpt=$((cpt + 1)) + st=$(get_hsm_flags $f) done - if [[ $cpt -lt 10 ]]; then + if [[ "$st" = "0x00000009" ]]; then echo " "done else echo " restore is too long" @@ -2639,7 +2940,7 @@ test_31b() { local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2663,7 +2964,7 @@ test_31c() { local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress_aligned $f) + fid=$(make_custom_file_for_progress $f 33 1048576) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2687,7 +2988,7 @@ test_33() { local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2754,7 +3055,7 @@ test_34() { local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2790,7 +3091,7 @@ test_35() { local f=$DIR/$tdir/$tfile local f1=$DIR/$tdir/$tfile-1 local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return local fid1=$(copy_file /etc/passwd $f1) @@ -2829,7 +3130,7 @@ test_36() { local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2873,6 +3174,9 @@ test_37() { wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f || error "cannot release $f" + # Allow previous archive request to expire from the actions log. + wait_for_grace_delay + # Dirty file. dd if=/dev/urandom of=$f bs=1M count=1 || error "cannot dirty file" @@ -2894,6 +3198,12 @@ multi_archive() { echo "$count archive requests submitted" } +cleanup_test_40() { + trap 0 + set_hsm_param max_requests $max_requests + copytool_cleanup +} + test_40() { local stream_count=4 local file_count=100 @@ -2902,6 +3212,17 @@ test_40() { local i="" local p="" local fid="" + local max_requests=$(get_hsm_param max_requests) + + # Increase the number of HSM request that can be performed in + # parallel. With the coordinator running once per second, this + # also limits the number of requests per seconds that can be + # performed, so we pick a decent number. But we also need to keep + # that number low because the copytool has no rate limit and will + # fail some requests if if gets too many at once. + set_hsm_param max_requests 300 + + trap cleanup_test_40 EXIT for i in $(seq 1 $file_count); do for p in $(seq 1 $stream_count); do @@ -2928,7 +3249,8 @@ test_40() { wait ${pids[*]} echo OK wait_all_done 100 - copytool_cleanup + + cleanup_test_40 } run_test 40 "Parallel archive requests" @@ -2993,7 +3315,7 @@ test_54() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid=$(make_custom_file_for_progress $f 39 1000000) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" @@ -3021,7 +3343,7 @@ test_55() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid=$(make_custom_file_for_progress $f 39 1000000) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" @@ -3050,7 +3372,7 @@ test_56() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || @@ -3191,7 +3513,7 @@ test_60() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 10) [ $? != 0 ] && skip "not enough free space" && return local mdtidx=0 @@ -3246,11 +3568,32 @@ test_60() { error "Expected progress update after at least $interval seconds" fi + echo "Wait for on going archive hsm action to complete" + wait_update $agent "grep -o copied $copytool_log" "copied" 10 || + echo "File archiving not completed even after 10 secs" + cdt_clear_no_retry copytool_cleanup } run_test 60 "Changing progress update interval from default" +test_61() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + cdt_disable + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + rm -f $f + cdt_enable + wait_request_state $fid ARCHIVE FAILED + + copytool_cleanup +} +run_test 61 "Waiting archive of a removed file should fail" + test_70() { # test needs a new running copytool copytool_cleanup @@ -3310,7 +3653,7 @@ test_71() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || @@ -3582,34 +3925,45 @@ run_test 103 "Purge all requests" DATA=CEA DATAHEX='[434541]' test_104() { - # test needs a running copytool - copytool_setup mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return - # if cdt is on, it can serve too quickly the request - cdt_disable $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER --data $DATA $f local data1=$(do_facet $SINGLEMDS "$LCTL get_param -n\ $HSM_PARAM.actions |\ grep $fid | cut -f16 -d=") - cdt_enable [[ "$data1" == "$DATAHEX" ]] || error "Data field in records is ($data1) and not ($DATAHEX)" + # archive the file + copytool_setup + + wait_request_state $fid ARCHIVE SUCCEED + copytool_cleanup } run_test 104 "Copy tool data field" +cleanup_test_105() { + trap 0 + set_hsm_param max_requests $max_requests + copytool_cleanup +} + test_105() { + local max_requests=$(get_hsm_param max_requests) mkdir -p $DIR/$tdir local i="" + set_hsm_param max_requests 300 + + trap cleanup_test_105 EXIT + cdt_disable for i in $(seq -w 1 10); do cp /etc/passwd $DIR/$tdir/$i @@ -3619,6 +3973,7 @@ test_105() { $HSM_PARAM.actions |\ grep WAITING | wc -l") cdt_restart + cdt_disable local reqcnt2=$(do_facet $SINGLEMDS "$LCTL get_param -n\ $HSM_PARAM.actions |\ @@ -3628,6 +3983,8 @@ test_105() { [[ "$reqcnt1" == "$reqcnt2" ]] || error "Requests count after shutdown $reqcnt2 != "\ "before shutdown $reqcnt1" + + cleanup_test_105 } run_test 105 "Restart of coordinator" @@ -3856,7 +4213,7 @@ test_200() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_cancel $f) + fid=$(make_custom_file_for_progress $f 103 1048576) [ $? != 0 ] && skip "not enough free space" && return # test with cdt on is made in test_221 @@ -3904,7 +4261,7 @@ test_202() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -3946,6 +4303,43 @@ test_220() { } run_test 220 "Changelog for archive" +test_220a() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + changelog_setup + + # block copytool operations to allow for HSM request to be + # submitted and file be unlinked (CDT will find object removed) + copytool_suspend + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + + # wait request to reach CT + wait_request_state $fid ARCHIVE STARTED + + rm -f $f + + copytool_continue + + wait_request_state $fid ARCHIVE FAILED + + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) + changelog_cleanup + + # HE_ARCHIVE|ENOENT + local target=0x2 + [[ $flags == $target ]] || error "Changelog flag is $flags not $target" + + copytool_cleanup +} +run_test 220a "Changelog for failed archive" + test_221() { # test needs a running copytool copytool_setup @@ -3954,7 +4348,7 @@ test_221() { local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_cancel $f) + fid=$(make_custom_file_for_progress $f 103 1048576) [ $? != 0 ] && skip "not enough free space" && return changelog_setup @@ -4025,6 +4419,72 @@ test_222b() { } run_test 222b "Changelog for implicit restore" +test_222c() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + copy2archive /etc/passwd $tdir/$tfile + + local f=$DIR/$tdir/$tfile + import_file $tdir/$tfile $f + local fid=$(path2fid $f) + + changelog_setup + + # block copytool operations to allow for HSM request to be + # submitted and file be unlinked (CDT will find object removed) + copytool_suspend + + $LFS hsm_restore $f + + # wait request to reach CT + wait_request_state $fid RESTORE STARTED + + rm -f $f + + copytool_continue + + wait_request_state $fid RESTORE FAILED + + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) + + # HE_RESTORE|ENOENT + local target=0x82 + [[ $flags == $target ]] || error "Changelog flag is $flags not $target" + + cleanup +} +run_test 222c "Changelog for failed explicit restore" + +test_222d() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + changelog_setup + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f + + copytool_remove_backend $fid + md5sum $f + + wait_request_state $fid RESTORE FAILED + + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) + + # HE_RESTORE|ENOENT + local target=0x82 + [[ $flags == $target ]] || error "Changelog flag is $flags not $target" + + cleanup +} +run_test 222d "Changelog for failed implicit restore" + test_223a() { # test needs a running copytool copytool_setup @@ -4063,7 +4523,7 @@ test_223b() { local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return changelog_setup @@ -4112,6 +4572,47 @@ test_224() { } run_test 224 "Changelog for remove" +test_224a() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + changelog_setup + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + copytool_remove_backend $fid + + # block copytool operations to allow for HSM request to be + # submitted and file be unlinked (CDT will find object removed) + copytool_suspend + + $LFS hsm_remove $f + + # wait for request to reach CT + wait_request_state $fid REMOVE STARTED + + rm -f $f + + copytool_continue + + wait_request_state $fid REMOVE FAILED + + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -n 1) + + # HE_REMOVE|ENOENT + local target=0x202 + [[ $flags == $target ]] || + error "Changelog flag is $flags not $target" + + cleanup +} +run_test 224a "Changelog for failed remove" + test_225() { # test needs a running copytool copytool_setup @@ -4125,7 +4626,7 @@ test_225() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_progress $f) + fid=$(make_custom_file_for_progress $f 39 1000000) [ $? != 0 ] && skip "not enough free space" && return changelog_setup @@ -4330,7 +4831,7 @@ test_251() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile local fid - fid=$(make_large_for_cancel $f) + fid=$(make_custom_file_for_progress $f 103 1048576) [ $? != 0 ] && skip "not enough free space" && return cdt_disable @@ -4359,6 +4860,43 @@ test_251() { } run_test 251 "Coordinator request timeout" +test_252() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(make_custom_file_for_progress $f 103 1048576) + + cdt_disable + # to have a short test + local old_to=$(get_hsm_param active_request_timeout) + set_hsm_param active_request_timeout 20 + # to be sure the cdt will wake up frequently so + # it will be able to cancel the "old" request + local old_loop=$(get_hsm_param loop_period) + set_hsm_param loop_period 2 + cdt_enable + + # clear locks to avoid extra delay caused by flush/cancel + # and thus prevent early copytool death to timeout. + cancel_lru_locks osc + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE STARTED + rm -f $f + + # wait but less than active_request_timeout+grace_delay + sleep 25 + wait_request_state $fid ARCHIVE CANCELED + + set_hsm_param active_request_timeout $old_to + set_hsm_param loop_period $old_loop + + copytool_cleanup +} +run_test 252 "Timeout'ed running archive of a removed file should be canceled" + test_300() { # the only way to test ondisk conf is to restart MDS ... echo "Stop coordinator and remove coordinator state at mount" @@ -4507,7 +5045,7 @@ mdc_change_state() # facet, MDT_pattern, activate|deactivate done } -test_402() { +test_402a() { # make sure there is no running copytool copytool_cleanup @@ -4524,7 +5062,34 @@ test_402() { # reactivate MDCs mdc_change_state $SINGLEAGT "$FSNAME-MDT000." "activate" } -run_test 402 "Copytool start fails if all MDTs are inactive" +run_test 402a "Copytool start fails if all MDTs are inactive" + +test_402b() { + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + touch $f || error "touch $f failed" + local fid=$(path2fid $f) + +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d + do_facet $SINGLEAGT lctl set_param fail_loc=0x14d + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + + # give time for CDT to send request and to keep it for retry + wait_for_loop_period + + wait_request_state $fid ARCHIVE WAITING + + do_facet $SINGLEAGT lctl set_param fail_loc=0 + + # request should succeed now + wait_request_state $fid ARCHIVE SUCCEED + + copytool_cleanup +} +run_test 402b "CDT must retry request upon slow start of CT" test_403() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return