X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Fsanity-hsm.sh;h=a69302a8d41779dd3e4aa1b354b2c59ac78f58be;hb=cff9f1e7c6a41bfa05d1455b8964860803d12612;hp=dd700c3ee8d0c400704740fc32cfe46c8309292f;hpb=2dbb4d1ae3bcda5c733c512df141837289e03b7a;p=fs%2Flustre-release.git diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index dd700c3..a69302a 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -170,15 +170,18 @@ wait_copytools() { local wait_timeout=200 local wait_start=$SECONDS local wait_end=$((wait_start + wait_timeout)) + local sleep_time=100000 # 0.1 second while ((SECONDS < wait_end)); do - sleep 2 if ! search_copytools $hosts; then echo "copytools stopped in $((SECONDS - wait_start))s" return 0 fi echo "copytools still running on $hosts" + usleep $sleep_time + [ $sleep_time -lt 32000000 ] && # 3.2 seconds + sleep_time=$(bc <<< "$sleep_time * 2") done # try to dump Copytool's stack @@ -258,12 +261,6 @@ copytool_setup() { local agent=$(facet_active_host $facet) - if [[ -z "$arc_id" ]] && - do_facet $facet "pkill -CONT -x $HSMTOOL_BASE"; then - echo "Only wakeup running copytool $facet on $agent" - return 0 - fi - if $HSM_ARCHIVE_PURGE; then echo "Purging archive on $agent" do_facet $facet "rm -rf $hsm_root/$HSMTMP/*" @@ -741,6 +738,15 @@ get_request_count() { "awk -vn=0 '/'$fid'.*action='$request'/ {n++}; END {print n}'" } +# Ensure the number of HSM request for a given FID is correct +# assert_request_count FID REQUEST_TYPE COUNT [ERROR_MSG] +assert_request_count() { + local request_count=$(get_request_count $1 $2) + local default_error_msg=("expected $3 '$2' request(s) for '$1', found " + "'$request_count'") + [ $request_count -eq $3 ] || error "${4:-"${default_error_msg[@]}"}" +} + wait_all_done() { local timeout=$1 local fid=$2 @@ -851,9 +857,6 @@ echo "Set HSM on and start" cdt_set_mount_state enabled cdt_check_state enabled -echo "Start copytool" -copytool_setup - echo "Set sanity-hsm HSM policy" cdt_set_sanity_policy @@ -900,6 +903,8 @@ test_1a() { local f=$DIR/$tdir/$tfile local fid=$(make_small $f) + copytool_setup + $LFS hsm_archive $f || error "could not archive file" wait_request_state $fid ARCHIVE SUCCEED @@ -909,9 +914,43 @@ test_1a() { check_hsm_flags $f "0x0000000d" $MMAP_CAT $f > /dev/null || error "failed mmap & cat release file" + + copytool_cleanup } run_test 1a "mmap & cat a HSM released file" +test_1b() { + mkdir -p $DIR/$tdir + $LFS setstripe -E 1M -E 64M -c 2 -E -1 -c 4 $DIR/$tdir || + error "failed to set default stripe" + local f=$DIR/$tdir/$tfile + rm -f $f + + dd if=/dev/random of=$f bs=1M count=1 conv=sync || + error "failed to create file" + local fid=$(path2fid $f) + + copytool_setup + + echo "archive $f" + $LFS hsm_archive $f || error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + echo "release $f" + $LFS hsm_release $f || error "could not release file" + echo "verify released state: " + check_hsm_flags $f "0x0000000d" && echo "pass" + + echo "restore $f" + $LFS hsm_restore $f || error "could not restore file" + wait_request_state $fid RESTORE SUCCEED + echo "verify restored state: " + check_hsm_flags $f "0x00000009" && echo "pass" + + copytool_cleanup +} +run_test 1b "Archive, Release & Restore composite file" + test_2() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile @@ -2137,6 +2176,7 @@ run_test 24c "check that user,group,other request masks work" cleanup_test_24d() { trap 0 mount -o remount,rw $MOUNT2 + zconf_umount $(facet_host $SINGLEAGT) "$MOUNT3" } test_24d() { @@ -2145,16 +2185,24 @@ test_24d() { local fid1 local fid2 - copytool_setup - mkdir -p $DIR/$tdir rm -f $file1 fid1=$(make_small $file1) + echo $fid1 + $LFS getstripe $file1 + trap cleanup_test_24d EXIT + zconf_mount $(facet_host $SINGLEAGT) "$MOUNT3" || + error "cannot mount '$MOUNT3' on '$SINGLEAGT'" + copytool_setup $SINGLEAGT "$MOUNT3" || + error "unable to setup a copytool for the test" mount -o remount,ro $MOUNT2 + do_nodes $(comma_list $(nodes_list)) $LCTL clear + start_full_debug_logging + fid2=$(path2fid $file2) [ "$fid1" == "$fid2" ] || error "FID mismatch '$fid1' != '$fid2'" @@ -2163,9 +2211,11 @@ test_24d() { error "archive should fail on read-only mount" check_hsm_flags $file1 "0x00000000" - $LFS hsm_archive $file1 + $LFS hsm_archive $file1 || error "Fail to archive $file1" wait_request_state $fid1 ARCHIVE SUCCEED + stop_full_debug_logging + $LFS hsm_release $file1 $LFS hsm_restore $file2 wait_request_state $fid1 RESTORE SUCCEED @@ -2299,6 +2349,222 @@ test_26() { } run_test 26 "Remove the archive of a valid file" +cleanup_test_26a() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + set_hsm_param loop_period $orig_loop_period + set_hsm_param grace_delay $orig_grace_delay + copytool_cleanup +} + +test_26a() { + local raolu=$(get_hsm_param remove_archive_on_last_unlink) + [[ $raolu -eq 0 ]] || error "RAoLU policy should be off" + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + local f2=$DIR/$tdir/${tfile}_2 + local fid2=$(copy_file /etc/passwd $f2) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f2 + wait_request_state $fid2 ARCHIVE SUCCEED + + local f3=$DIR/$tdir/${tfile}_3 + local fid3=$(copy_file /etc/passwd $f3) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f3 + wait_request_state $fid3 ARCHIVE SUCCEED + + trap cleanup_test_26a EXIT + + # set a long grace_delay vs short loop_period + local orig_loop_period=$(get_hsm_param loop_period) + local orig_grace_delay=$(get_hsm_param grace_delay) + set_hsm_param loop_period 10 + set_hsm_param grace_delay 100 + + rm -f $f + + set_hsm_param remove_archive_on_last_unlink 1 + + ln "$f3" "$f3"_bis || error "Unable to create hard-link" + rm -f $f3 + + rm -f $f2 + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid2 REMOVE SUCCEED + + assert_request_count $fid REMOVE 0 \ + "Unexpected archived data remove request for $f" + assert_request_count $fid3 REMOVE 0 \ + "Unexpected archived data remove request for $f3" + + cleanup_test_26a +} +run_test 26a "Remove Archive On Last Unlink (RAoLU) policy" + +cleanup_test_26b() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + copytool_cleanup +} + +test_26b() { + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + trap cleanup_test_26b EXIT + + set_hsm_param remove_archive_on_last_unlink 1 + + cdt_shutdown + cdt_check_state stopped + + rm -f $f + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid REMOVE WAITING + + cdt_enable + # copytool must re-register + kill_copytools + wait_copytools || error "copytool failed to stop" + HSM_ARCHIVE_PURGE=false copytool_setup + + wait_request_state $fid REMOVE SUCCEED + + cleanup_test_26b +} +run_test 26b "RAoLU policy when CDT off" + +cleanup_test_26c() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + set_hsm_param loop_period $orig_loop_period + set_hsm_param grace_delay $orig_grace_delay + copytool_cleanup +} + +test_26c() { + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + local f2=$DIR/$tdir/${tfile}_2 + local fid2=$(copy_file /etc/passwd $f2) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f2 + wait_request_state $fid2 ARCHIVE SUCCEED + + trap cleanup_test_26c EXIT + + # set a long grace_delay vs short loop_period + local orig_loop_period=$(get_hsm_param loop_period) + local orig_grace_delay=$(get_hsm_param grace_delay) + set_hsm_param loop_period 10 + set_hsm_param grace_delay 100 + + set_hsm_param remove_archive_on_last_unlink 1 + + multiop_bg_pause $f O_c || error "open $f failed" + local pid=$! + + rm -f $f + rm -f $f2 + + wait_request_state $fid2 REMOVE SUCCEED + assert_request_count $fid REMOVE 0 \ + "Unexpected archived data remove request for $f" + + kill -USR1 $pid || error "multiop early exit" + # should reach autotest timeout if multiop fails to trap + # signal, close file, and exit ... + wait $pid || error + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid REMOVE SUCCEED + + cleanup_test_26c +} +run_test 26c "RAoLU effective when file closed" + +cleanup_test_26d() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + set_hsm_param loop_period $orig_loop_period + set_hsm_param grace_delay $orig_grace_delay + copytool_cleanup +} + +test_26d() { + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/motd $f 1) + + $LFS hsm_archive $f || error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + trap cleanup_test_26d EXIT + + # set a long grace_delay vs short loop_period + local orig_loop_period=$(get_hsm_param loop_period) + local orig_grace_delay=$(get_hsm_param grace_delay) + set_hsm_param loop_period 10 + set_hsm_param grace_delay 100 + + set_hsm_param remove_archive_on_last_unlink 1 + + multiop_bg_pause $f O_c || error "multiop failed" + local MULTIPID=$! + + rm -f $f + + mds_evict_client + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid REMOVE SUCCEED + + client_up || client_up || true + + kill -USR1 $MULTIPID + wait $MULTIPID || error "multiop close failed" + + cleanup_test_26d +} +run_test 26d "RAoLU when Client eviction" + test_27a() { # test needs a running copytool copytool_setup @@ -2462,7 +2728,8 @@ test_29d() { rm -f $file - $LFS hsm_remove -a 0 $fid + $LFS hsm_remove --mntpath "$MOUNT" -a 0 $fid || + error "cannot hsm_remove '$fid'" # give time for CDT to handle remove request and create broadcasted sleep 2 @@ -3292,6 +3559,23 @@ test_60() { } run_test 60 "Changing progress update interval from default" +test_61() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + cdt_disable + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + rm -f $f + cdt_enable + wait_request_state $fid ARCHIVE FAILED + + copytool_cleanup +} +run_test 61 "Waiting archive of a removed file should fail" + test_70() { # test needs a new running copytool copytool_cleanup @@ -4533,6 +4817,43 @@ test_251() { } run_test 251 "Coordinator request timeout" +test_252() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(make_custom_file_for_progress $f 103 1048576) + + cdt_disable + # to have a short test + local old_to=$(get_hsm_param active_request_timeout) + set_hsm_param active_request_timeout 20 + # to be sure the cdt will wake up frequently so + # it will be able to cancel the "old" request + local old_loop=$(get_hsm_param loop_period) + set_hsm_param loop_period 2 + cdt_enable + + # clear locks to avoid extra delay caused by flush/cancel + # and thus prevent early copytool death to timeout. + cancel_lru_locks osc + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE STARTED + rm -f $f + + # wait but less than active_request_timeout+grace_delay + sleep 25 + wait_request_state $fid ARCHIVE CANCELED + + set_hsm_param active_request_timeout $old_to + set_hsm_param loop_period $old_loop + + copytool_cleanup +} +run_test 252 "Timeout'ed running archive of a removed file should be canceled" + test_300() { # the only way to test ondisk conf is to restart MDS ... echo "Stop coordinator and remove coordinator state at mount" @@ -4681,7 +5002,7 @@ mdc_change_state() # facet, MDT_pattern, activate|deactivate done } -test_402() { +test_402a() { # make sure there is no running copytool copytool_cleanup @@ -4698,7 +5019,34 @@ test_402() { # reactivate MDCs mdc_change_state $SINGLEAGT "$FSNAME-MDT000." "activate" } -run_test 402 "Copytool start fails if all MDTs are inactive" +run_test 402a "Copytool start fails if all MDTs are inactive" + +test_402b() { + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + touch $f || error "touch $f failed" + local fid=$(path2fid $f) + +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d + do_facet $SINGLEAGT lctl set_param fail_loc=0x14d + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + + # give time for CDT to send request and to keep it for retry + wait_for_loop_period + + wait_request_state $fid ARCHIVE WAITING + + do_facet $SINGLEAGT lctl set_param fail_loc=0 + + # request should succeed now + wait_request_state $fid ARCHIVE SUCCEED + + copytool_cleanup +} +run_test 402b "CDT must retry request upon slow start of CT" test_403() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return