X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Fsanity-hsm.sh;h=fd2f71121f5ee8a65492ae2314ef59a8c6c1e8aa;hp=bdabe108ba514eb4e3ce70f6207f61d90505d379;hb=89a668f6c8770c5af8c99984003ffd9844dc2904;hpb=74d92933108dc64b110a843352cf3336dca249d0 diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index bdabe10..fd2f711 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -103,7 +103,16 @@ init_agt_vars() { export HSMTOOL_EVENT_FIFO=${HSMTOOL_EVENT_FIFO:=""} export HSMTOOL_TESTDIR export HSMTOOL_BASE=$(basename "$HSMTOOL" | cut -f1 -d" ") + # $hsm_root/$HSMTMP Makes $hsm_root dir path less generic to ensure + # rm -rf $hsm_root/* is safe even if $hsm_root becomes unset to avoid + # deleting everything in filesystem, independent of any copytool. + export HSMTMP=${HSMTMP:-"shsm"} + HSM_ARCHIVE=$(copytool_device $SINGLEAGT) + + [ -z "${HSM_ARCHIVE// /}" ] && error "HSM_ARCHIVE is empty!" + HSM_ARCHIVE=$HSM_ARCHIVE/$HSMTMP + HSM_ARCHIVE_NUMBER=2 # The test only support up to 10 MDTs @@ -244,6 +253,9 @@ copytool_setup() { local lustre_mntpnt=${2:-${MOUNT2:-$MOUNT}} local arc_id=$3 local hsm_root=${4:-$(copytool_device $facet)} + + [ -z "${hsm_root// /}" ] && error "copytool_setup: hsm_root empty!" + local agent=$(facet_active_host $facet) if [[ -z "$arc_id" ]] && @@ -254,14 +266,16 @@ copytool_setup() { if $HSM_ARCHIVE_PURGE; then echo "Purging archive on $agent" - do_facet $facet "rm -rf $hsm_root/*" + do_facet $facet "rm -rf $hsm_root/$HSMTMP/*" fi echo "Starting copytool $facet on $agent" - do_facet $facet "mkdir -p $hsm_root" || error "mkdir '$hsm_root' failed" + do_facet $facet "mkdir -p $hsm_root/$HSMTMP/" || + error "mkdir '$hsm_root/$HSMTMP' failed" # bandwidth is limited to 1MB/s so the copy time is known and # independent of hardware - local cmd="$HSMTOOL $HSMTOOL_VERBOSE --daemon --hsm-root $hsm_root" + local cmd="$HSMTOOL $HSMTOOL_VERBOSE --daemon" + cmd+=" --hsm-root $hsm_root/$HSMTMP" [[ -z "$arc_id" ]] || cmd+=" --archive $arc_id" [[ -z "$HSMTOOL_UPDATE_INTERVAL" ]] || cmd+=" --update-interval $HSMTOOL_UPDATE_INTERVAL" @@ -301,6 +315,9 @@ copytool_cleanup() { local agt_facet=$SINGLEAGT local agt_hosts=${1:-$(facet_active_host $agt_facet)} local hsm_root=$(copytool_device $agt_facet) + + [ -z "${hsm_root// /}" ] && error "copytool_cleanup: hsm_root empty!" + local i local facet local param @@ -349,7 +366,7 @@ copytool_cleanup() { done if do_facet $agt_facet "df $hsm_root" >/dev/null 2>&1 ; then - do_facet $agt_facet "rm -rf $hsm_root/*" + do_facet $agt_facet "rm -rf $hsm_root/$HSMTMP/*" fi } @@ -794,12 +811,72 @@ parse_json_event() { echo $raw_event | python -c "$json_parser" } -# populate MDT device array -get_mdt_devices +get_agent_by_uuid_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + do_facet $mds "$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.agents |\ + grep $uuid" +} + +check_agent_registered_by_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + local agent=$(get_agent_by_uuid_mdt $uuid $mdtidx) + if [[ ! -z "$agent" ]]; then + echo "found agent $agent on $mds" + else + error "uuid $uuid not found in agent list on $mds" + fi +} + +check_agent_unregistered_by_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + local agent=$(get_agent_by_uuid_mdt $uuid $mdtidx) + if [[ -z "$agent" ]]; then + echo "uuid not found in agent list on $mds" + else + error "uuid found in agent list on $mds: $agent" + fi +} + +check_agent_registered() { + local uuid=$1 + local mdsno + for mdsno in $(seq 1 $MDSCOUNT); do + check_agent_registered_by_mdt $uuid $((mdsno - 1)) + done +} + +check_agent_unregistered() { + local uuid=$1 + local mdsno + for mdsno in $(seq 1 $MDSCOUNT); do + check_agent_unregistered_by_mdt $uuid $((mdsno - 1)) + done +} + +get_agent_uuid() { + local agent=${1:-$(facet_active_host $SINGLEAGT)} + + # Lustre mount-point is mandatory and last parameter on + # copytool cmd-line. + local mntpnt=$(do_rpc_nodes $agent ps -C $HSMTOOL_BASE -o args= | + awk '{print $NF}') + [ -n "$mntpnt" ] || error "Found no Agent or with no mount-point "\ + "parameter" + do_rpc_nodes $agent get_client_uuid $mntpnt | cut -d' ' -f2 +} # initiate variables init_agt_vars +# populate MDT device array +get_mdt_devices + # cleanup from previous bad setup kill_copytools @@ -982,13 +1059,19 @@ test_8() { run_test 8 "Test default archive number" test_9() { - mkdir -p $DIR/$tdir - local f=$DIR/$tdir/$tfile - local fid=$(copy_file /etc/passwd $f) # we do not use the default one to be sure local new_an=$((HSM_ARCHIVE_NUMBER + 1)) copytool_cleanup copytool_setup $SINGLEAGT $MOUNT $new_an + + # give time for CT to register with MDTs + sleep $(($MDSCOUNT*2)) + local uuid=$(get_agent_uuid $(facet_active_host $SINGLEAGT)) + check_agent_registered $uuid + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) $LFS hsm_archive --archive $new_an $f wait_request_state $fid ARCHIVE SUCCEED @@ -1687,7 +1770,8 @@ test_16() { $LFS hsm_archive $f wait_request_state $fid ARCHIVE SUCCEED local end=$(date +%s) - local duration=$((end - start)) + # Add 1 to account for rounding errors between start and end (LU-8155) + local duration=$((end - start + 1)) [[ $duration -ge $goal ]] || error "Transfer is too fast $duration < $goal" @@ -1921,8 +2005,7 @@ test_24a() { [ $ctime0 -eq $ctime1 ] || error "release changed ctime from $ctime0 to $ctime1" - # Restore should not change atime or mtime and should not - # decrease ctime. + # Restore should not change any timestamps. $LFS hsm_restore $file wait_request_state $fid RESTORE SUCCEED @@ -2049,7 +2132,6 @@ test_24c() { chown $RUNAS_ID:nobody $file || error "cannot chown '$file' to '$RUNAS_ID:nobody'" - set_hsm_param user_request_mask "" $RUNAS $LFS hsm_$action $file && error "$action by user should fail" @@ -2063,7 +2145,6 @@ test_24c() { chown nobody:$RUNAS_GID $file || error "cannot chown '$file' to 'nobody:$RUNAS_GID'" - set_hsm_param group_request_mask "" $RUNAS $LFS hsm_$action $file && error "$action by group should fail" @@ -2077,7 +2158,6 @@ test_24c() { chown nobody:nobody $file || error "cannot chown '$file' to 'nobody:nobody'" - set_hsm_param other_request_mask "" $RUNAS $LFS hsm_$action $file && error "$action by other should fail" @@ -2255,270 +2335,6 @@ test_26() { } run_test 26 "Remove the archive of a valid file" -cleanup_test_26a() { - trap 0 - set_hsm_param remove_archive_on_last_unlink 0 - set_hsm_param loop_period $orig_loop_period - set_hsm_param grace_delay $orig_grace_delay - copytool_cleanup -} - -test_26a() { - local raolu=$(get_hsm_param remove_archive_on_last_unlink) - [[ $raolu -eq 0 ]] || error "RAoLU policy should be off" - - # test needs a running copytool - copytool_setup - - mkdir -p $DIR/$tdir - local f=$DIR/$tdir/$tfile - local fid=$(copy_file /etc/passwd $f) - - $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f - wait_request_state $fid ARCHIVE SUCCEED - - rm -f $f - - local f2=$DIR/$tdir/${tfile}_2 - local fid2=$(copy_file /etc/passwd $f2) - - $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f2 - wait_request_state $fid2 ARCHIVE SUCCEED - - cat $f2 > /dev/null - - local f3=$DIR/$tdir/${tfile}_3 - local fid3=$(copy_file /etc/passwd $f3) - - $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f3 - wait_request_state $fid3 ARCHIVE SUCCEED - - local f4=$DIR/$tdir/${tfile}_4 - local fid4=$(copy_file /etc/passwd $f4) - - $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f4 - wait_request_state $fid4 ARCHIVE SUCCEED - - trap cleanup_test_26a EXIT - - # set a long grace_delay vs short loop_period - local orig_loop_period=$(get_hsm_param loop_period) - local orig_grace_delay=$(get_hsm_param grace_delay) - set_hsm_param loop_period 10 - set_hsm_param grace_delay 100 - - set_hsm_param remove_archive_on_last_unlink 1 - - rm -f $f3 - cat $f4 > /dev/null - local f4bis=$DIR/$tdir/${tfile}_4bis - ln $f4 $f4bis - [[ $? -eq 0 ]] || error "Unable to create hard-link" - rm -f $f4 - - # Since CDT is not signaled for RAoLU requests to be sure it - # will wake-up to send remove request and copytool will process - # it, wait for loop_period + some extra-time. - local loop_period=$(get_hsm_param loop_period) - sleep $((loop_period + 5)) - - set_hsm_param remove_archive_on_last_unlink 0 - - do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid - [[ $? -eq 0 ]] || error "File being removed on archive" - - do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid2 - [[ $? -eq 0 ]] || error "File being removed on archive" - - do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid3 - [[ $? -eq 0 ]] && error "File not being removed on archive" - - do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid4 - [[ $? -eq 0 ]] || error "File being removed on archive" - - # previous actions elapsed time should be < grace_delay - wait_request_state $fid3 REMOVE SUCCEED - - cleanup_test_26a -} -run_test 26a "Remove Archive On Last Unlink (RAoLU) policy" - -cleanup_test_26b() { - trap 0 - set_hsm_param remove_archive_on_last_unlink 0 - copytool_cleanup -} - -test_26b() { - - # test needs a running copytool - copytool_setup - - mkdir -p $DIR/$tdir - local f=$DIR/$tdir/$tfile - local fid=$(copy_file /etc/passwd $f) - - $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f - wait_request_state $fid ARCHIVE SUCCEED - - trap cleanup_test_26b EXIT - - set_hsm_param remove_archive_on_last_unlink 1 - - cdt_shutdown - cdt_check_state stopped - - rm -f $f - - set_hsm_param remove_archive_on_last_unlink 0 - - wait_request_state $fid REMOVE WAITING - - cdt_enable - # copytool must re-register - search_and_kill_copytool - sleep 5 - search_copytools && error "Copytool should have stopped" - HSM_ARCHIVE_PURGE=false copytool_setup - - wait_request_state $fid REMOVE SUCCEED - do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid - [[ $? -eq 0 ]] && error "File not being removed on archive" - - cleanup_test_26b -} -run_test 26b "RAoLU policy when CDT off" - -cleanup_test_26c() { - trap 0 - set_hsm_param remove_archive_on_last_unlink 0 - set_hsm_param loop_period $orig_loop_period - set_hsm_param grace_delay $orig_grace_delay - copytool_cleanup -} - -test_26c() { - - # test needs a running copytool - copytool_setup - - mkdir -p $DIR/$tdir - local f=$DIR/$tdir/$tfile - local fid=$(copy_file /etc/passwd $f) - - $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f - wait_request_state $fid ARCHIVE SUCCEED - - trap cleanup_test_26c EXIT - - # set a long grace_delay vs short loop_period - local orig_loop_period=$(get_hsm_param loop_period) - local orig_grace_delay=$(get_hsm_param grace_delay) - set_hsm_param loop_period 10 - set_hsm_param grace_delay 100 - - set_hsm_param remove_archive_on_last_unlink 1 - - multiop_bg_pause $f O_c || error "open $f failed" - local pid=$! - # give multiop a chance to open - sleep 2 - - rm -f $f - - # Since CDT is not signaled for RAoLU requests to be sure it - # will wake-up to send remove request and copytool will process - # it, wait for loop_period + some extra-time. - local loop_period=$(get_hsm_param loop_period) - sleep $((loop_period + 5)) - - do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid - [[ $? -eq 0 ]] || error "File being removed on archive" - - kill -USR1 $pid || error "multiop early exit" - # should reach autotest timeout if multiop fails to trap - # signal, close file, and exit ... - wait $pid || error - - # again, wait for loop_period + some extra-time, to allow - # CDT enough time to handle remove request. - sleep $((loop_period + 5)) - - set_hsm_param remove_archive_on_last_unlink 0 - - do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid - [[ $? -eq 0 ]] && error "File not being removed on archive" - - # previous actions elapsed time should be < grace_delay - wait_request_state $fid REMOVE SUCCEED - - cleanup_test_26c -} -run_test 26c "RAoLU effective when file closed" - -cleanup_test_26d() { - trap 0 - set_hsm_param remove_archive_on_last_unlink 0 - set_hsm_param loop_period $orig_loop_period - set_hsm_param grace_delay $orig_grace_delay - copytool_cleanup -} - -test_26d() { - - # test needs a running copytool - copytool_setup - - mkdir -p $DIR/$tdir - local f=$DIR/$tdir/$tfile - local fid=$(copy_file /etc/motd $f 1) - - $LFS hsm_archive $f || error "could not archive file" - wait_request_state $fid ARCHIVE SUCCEED - check_hsm_flags $f "0x00000009" - - trap cleanup_test_26d EXIT - - # set a long grace_delay vs short loop_period - local orig_loop_period=$(get_hsm_param loop_period) - local orig_grace_delay=$(get_hsm_param grace_delay) - set_hsm_param loop_period 10 - set_hsm_param grace_delay 100 - - set_hsm_param remove_archive_on_last_unlink 1 - - multiop_bg_pause $f O_c || error "multiop failed" - local MULTIPID=$! - # give multiop a chance to open - sleep 2 - - rm -f $f - - mds_evict_client - - # Since CDT is not signaled for RAoLU requests to be sure it - # will wake-up to send remove request and copytool will process - # it, wait for loop_period + some extra-time. - local loop_period=$(get_hsm_param loop_period) - sleep $((loop_period + 5)) - - set_hsm_param remove_archive_on_last_unlink 0 - - do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid - [[ $? -eq 0 ]] && error "File not being removed on archive" - - # previous actions elapsed time should be < grace_delay - wait_request_state $fid REMOVE SUCCEED - - client_up || client_up || true - - kill -USR1 $MULTIPID - wait $MULTIPID || error "multiop close failed" - - cleanup_test_26d -} -run_test 26d "RAoLU when Client eviction" - test_27a() { # test needs a running copytool copytool_setup @@ -3815,66 +3631,6 @@ test_105() { } run_test 105 "Restart of coordinator" -get_agent_by_uuid_mdt() { - local uuid=$1 - local mdtidx=$2 - local mds=mds$(($mdtidx + 1)) - do_facet $mds "$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.agents |\ - grep $uuid" -} - -check_agent_registered_by_mdt() { - local uuid=$1 - local mdtidx=$2 - local mds=mds$(($mdtidx + 1)) - local agent=$(get_agent_by_uuid_mdt $uuid $mdtidx) - if [[ ! -z "$agent" ]]; then - echo "found agent $agent on $mds" - else - error "uuid $uuid not found in agent list on $mds" - fi -} - -check_agent_unregistered_by_mdt() { - local uuid=$1 - local mdtidx=$2 - local mds=mds$(($mdtidx + 1)) - local agent=$(get_agent_by_uuid_mdt $uuid $mdtidx) - if [[ -z "$agent" ]]; then - echo "uuid not found in agent list on $mds" - else - error "uuid found in agent list on $mds: $agent" - fi -} - -check_agent_registered() { - local uuid=$1 - local mdsno - for mdsno in $(seq 1 $MDSCOUNT); do - check_agent_registered_by_mdt $uuid $((mdsno - 1)) - done -} - -check_agent_unregistered() { - local uuid=$1 - local mdsno - for mdsno in $(seq 1 $MDSCOUNT); do - check_agent_unregistered_by_mdt $uuid $((mdsno - 1)) - done -} - -get_agent_uuid() { - local agent=${1:-$(facet_active_host $SINGLEAGT)} - - # Lustre mount-point is mandatory and last parameter on - # copytool cmd-line. - local mntpnt=$(do_rpc_nodes $agent pgrep -fl $HSMTOOL_BASE | - grep -v pgrep | awk '{print $NF}') - [ -n "$mntpnt" ] || error "Found no Agent or with no mount-point "\ - "parameter" - do_rpc_nodes $agent get_client_uuid $mntpnt | cut -d' ' -f2 -} - test_106() { # test needs a running copytool copytool_setup