X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Fsanity-hsm.sh;h=0e4735f9b6e8967379e31e2cc627cff584209461;hb=0bafbd7d8f652997d83b3cc2419894f48833f424;hp=efb42ea0ccefed9d68e20297c4650536a63fd583;hpb=289d11769bde4d5b427a8d28f6a86b9492aed0b3;p=fs%2Flustre-release.git

diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh
index efb42ea0..0e4735f 100755
--- a/lustre/tests/sanity-hsm.sh
+++ b/lustre/tests/sanity-hsm.sh
@@ -15,6 +15,8 @@ ONLY=${ONLY:-"$*"}
 ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 34 35 36"
 # bug number for skipped test:4178 4176
 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 200 221 223b 31a"
+# bug number for skipped test:LU-3852
+ALWAYS_EXCEPT="$ALWAYS_EXCEPT 251"
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
@@ -91,6 +93,8 @@ init_agt_vars() {
 	export HSMTOOL=${HSMTOOL:-"lhsmtool_posix"}
 	export HSMTOOL_VERBOSE=${HSMTOOL_VERBOSE:-""}
 	export HSMTOOL_UPDATE_INTERVAL=${HSMTOOL_UPDATE_INTERVAL:=""}
+	export HSMTOOL_EVENT_FIFO=${HSMTOOL_EVENT_FIFO:=""}
+	export HSMTOOL_TESTDIR
 	export HSMTOOL_BASE=$(basename "$HSMTOOL" | cut -f1 -d" ")
 	HSM_ARCHIVE=$(copytool_device $SINGLEAGT)
 	HSM_ARCHIVE_NUMBER=2
@@ -113,6 +117,7 @@ copytool_device() {
 
 # Stop copytool and unregister an existing changelog user.
 cleanup() {
+	copytool_monitor_cleanup
 	copytool_cleanup
 	changelog_cleanup
 	cdt_set_sanity_policy
@@ -125,7 +130,7 @@ get_mdt_devices() {
 		local idx=$(($mdtno - 1))
 		MDT[$idx]=$($LCTL get_param -n \
 			mdc.$FSNAME-MDT000${idx}-mdc-*.mds_server_uuid |
-			awk '{gsub(/_UUID/,""); print $1}' | head -1)
+			awk '{gsub(/_UUID/,""); print $1}' | head -n1)
 	done
 }
 
@@ -141,6 +146,63 @@ search_and_kill_copytool() {
 	do_nodesv $agents "killall -q $HSMTOOL_BASE" || true
 }
 
+copytool_monitor_setup() {
+	local facet=${1:-$SINGLEAGT}
+	local agent=$(facet_active_host $facet)
+
+	local cmd="mktemp --tmpdir=/tmp -d ${TESTSUITE}.${TESTNAME}.XXXX"
+	local test_dir=$(do_node $agent "$cmd") ||
+		error "Failed to create tempdir on $agent"
+	export HSMTOOL_MONITOR_DIR=$test_dir
+
+	# Create the fifo and a monitor (cat dies when copytool dies)
+	do_node $agent "mkfifo -m 0644 $test_dir/fifo" ||
+		error "failed to create copytool fifo on $agent"
+	cmd="cat $test_dir/fifo > $test_dir/events &"
+	cmd+=" echo \\\$! > $test_dir/monitor_pid"
+
+	if [[ $PDSH == *Rmrsh* ]]; then
+		# This is required for pdsh -Rmrsh and its handling of remote
+		# shells.
+		# Regular ssh and pdsh -Rssh work fine without this
+		# backgrounded subshell nonsense.
+		(do_node $agent "$cmd") &
+		export HSMTOOL_MONITOR_PDSH=$!
+
+		# Slightly racy, but just making a best-effort to catch obvious
+		# problems.
+		sleep 1
+		ps -p $HSMTOOL_MONITOR_PDSH >&- ||
+			error "Failed to start copytool monitor on $agent"
+	else
+		do_node $agent "$cmd"
+		if [ $? != 0 ]; then
+			error "Failed to start copytool monitor on $agent"
+		fi
+	fi
+}
+
+copytool_monitor_cleanup() {
+	local facet=${1:-$SINGLEAGT}
+	local agent=$(facet_active_host $facet)
+
+	if [ -n "$HSMTOOL_MONITOR_DIR" ]; then
+		# Should die when the copytool dies, but just in case.
+		local cmd="kill \\\$(cat $HSMTOOL_MONITOR_DIR/monitor_pid)"
+		cmd+=" 2>/dev/null || true"
+		do_node $agent "$cmd"
+		do_node $agent "rm -fr $HSMTOOL_MONITOR_DIR"
+		export HSMTOOL_MONITOR_DIR=
+	fi
+
+	# The pdsh should die on its own when the monitor dies.  Just
+	# in case, though, try to clean up to avoid any cruft.
+	if [ -n "$HSMTOOL_MONITOR_PDSH" ]; then
+		kill $HSMTOOL_MONITOR_PDSH 2>/dev/null
+		export HSMTOOL_MONITOR_PDSH=
+	fi
+}
+
 copytool_setup() {
 	local facet=${1:-$SINGLEAGT}
 	local lustre_mntpnt=${2:-$MOUNT}
@@ -167,6 +229,8 @@ copytool_setup() {
 	[[ -z "$arc_id" ]] || cmd+=" --archive $arc_id"
 	[[ -z "$HSMTOOL_UPDATE_INTERVAL" ]] ||
 		cmd+=" --update-interval $HSMTOOL_UPDATE_INTERVAL"
+	[[ -z "$HSMTOOL_EVENT_FIFO" ]] ||
+		cmd+=" --event-fifo $HSMTOOL_EVENT_FIFO"
 	cmd+=" --bandwidth 1 $lustre_mntpnt"
 
 	# Redirect the standard output and error to a log file which
@@ -180,6 +244,17 @@ copytool_setup() {
 	trap cleanup EXIT
 }
 
+get_copytool_event_log() {
+	local facet=${1:-$SINGLEAGT}
+	local agent=$(facet_active_host $facet)
+
+	[ -z "$HSMTOOL_MONITOR_DIR" ] &&
+		error "Can't get event log: No monitor directory!"
+
+	do_node $agent "cat $HSMTOOL_MONITOR_DIR/events" ||
+		error "Could not collect event log from $agent"
+}
+
 copytool_cleanup() {
 	trap - EXIT
 	local agents=${1:-$(facet_active_host $SINGLEAGT)}
@@ -496,7 +571,7 @@ cleanup_large_files() {
 
 make_large_for_striping() {
 	local file2=${1/$DIR/$DIR2}
-	local sz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -1)
+	local sz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -n1)
 
 	cleanup_large_files
 
@@ -583,8 +658,10 @@ get_request_count() {
 
 wait_all_done() {
 	local timeout=$1
+	local fid=$2
 
 	local cmd="$LCTL get_param -n $HSM_PARAM.actions"
+	[[ -n $fid ]] && cmd+=" | grep '$fid'"
 	cmd+=" | egrep 'WAITING|STARTED'"
 
 	wait_result $SINGLEMDS "$cmd" "" $timeout ||
@@ -596,6 +673,18 @@ wait_for_grace_delay() {
 	sleep $val
 }
 
+parse_json_event() {
+	local raw_event=$1
+
+	# python2.6 in EL6 includes an internal json module
+	local json_parser='import json; import fileinput;'
+	json_parser+=' print "\n".join(["local %s=\"%s\"" % tuple for tuple in '
+	json_parser+='json.loads([line for line in '
+	json_parser+='fileinput.input()][0]).items()])'
+
+	echo $raw_event | python -c "$json_parser"
+}
+
 # populate MDT device array
 get_mdt_devices
 
@@ -929,16 +1018,16 @@ test_12a() {
 	local f=$DIR/$tdir/$tfile
 	import_file $tdir/$tfile $f
-	local f=$DIR2/$tdir/$tfile
+	local f2=$DIR2/$tdir/$tfile
 	echo "Verifying released state: "
-	check_hsm_flags $f "0x0000000d"
+	check_hsm_flags $f2 "0x0000000d"
 
-	local fid=$(path2fid $f)
-	$LFS hsm_restore $f
+	local fid=$(path2fid $f2)
+	$LFS hsm_restore $f2
 	wait_request_state $fid RESTORE SUCCEED
 
 	echo "Verifying file state: "
-	check_hsm_flags $f "0x00000009"
+	check_hsm_flags $f2 "0x00000009"
 
 	do_facet $SINGLEAGT diff -q $HSM_ARCHIVE/$tdir/$tfile $f
 
@@ -2138,6 +2227,15 @@ test_33() {
 	wait_request_state $fid ARCHIVE SUCCEED
 	$LFS hsm_release $f
 
+	# to be sure wait_all_done will not be misled by previous tests
+	# and ops.
+	cdt_purge
+	wait_for_grace_delay
+	# Also raise grace_delay significantly so the Canceled
+	# Restore action will stay available long enough.
+	local old_grace=$(get_hsm_param grace_delay)
+	set_hsm_param grace_delay 100
+
 	md5sum $f >/dev/null &
 	local pid=$!
 	wait_request_state $fid RESTORE STARTED
@@ -2150,8 +2248,29 @@ test_33() {
 
 	$LFS hsm_cancel $f
 
-	wait_request_state $fid RESTORE CANCELED
-	wait_request_state $fid CANCEL SUCCEED
+	# instead of waiting+checking both Restore and Cancel ops
+	# sequentially, wait for both to be finished and then check
+	# each result.
+	wait_all_done 100 $fid
+	local rstate=$(get_request_state $fid RESTORE)
+	local cstate=$(get_request_state $fid CANCEL)
+
+	# restore the original grace_delay.
+	set_hsm_param grace_delay $old_grace
+
+	if [[ "$rstate" == "CANCELED" ]] ; then
+		[[ "$cstate" == "SUCCEED" ]] ||
+			error "Restore state is CANCELED and Cancel state " \
+			      "is not SUCCEED but $cstate"
+		echo "Restore state is CANCELED, Cancel state is SUCCEED"
+	elif [[ "$rstate" == "SUCCEED" ]] ; then
+		[[ "$cstate" == "FAILED" ]] ||
+			error "Restore state is SUCCEED and Cancel state " \
+			      "is not FAILED but $cstate"
+		echo "Restore state is SUCCEED, Cancel state is FAILED"
+	else
+		error "Restore state is $rstate and Cancel state is $cstate"
+	fi
 
 	[ -z $killed ] ||
 		error "Cannot kill process waiting for restore ($killed)"
@@ -2543,7 +2662,7 @@ test_60() {
 	# option changes the progress reporting interval from the default
 	# (30 seconds) to the user-specified interval.
 	local interval=5
-	local progress_timeout=$((interval * 2))
+	local progress_timeout=$((interval * 3))
 
 	# test needs a new running copytool
 	copytool_cleanup
@@ -2586,9 +2705,7 @@
 	local elapsed=$((finish_at - start_at))
 
 	# Ensure that the progress update occurred within the expected window.
-	if [ $elapsed -gt $progress_timeout ]; then
-		error "Expected progress update within $progress_timeout seconds"
-	elif [ $elapsed -lt $interval ]; then
+	if [ $elapsed -lt $interval ]; then
 		error "Expected progress update after at least $interval seconds"
 	fi
 
@@ -2597,10 +2714,231 @@
 }
 run_test 60 "Changing progress update interval from default"
 
-test_90() {
-	file_count=57
+test_70() {
+	# test needs a new running copytool
+	copytool_cleanup
+	copytool_monitor_setup
+	HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup
+
+	# Just start and stop the copytool to generate events.
+	cdt_clear_no_retry
+	copytool_cleanup
+
+	local REGISTER_EVENT
+	local UNREGISTER_EVENT
+	while read event; do
+		local parsed=$(parse_json_event "$event")
+		if [ -z "$parsed" ]; then
+			error "Copytool sent malformed event: $event"
+		fi
+		eval $parsed
+
+		if [ $event_type == "REGISTER" ]; then
+			REGISTER_EVENT=$event
+		elif [ $event_type == "UNREGISTER" ]; then
+			UNREGISTER_EVENT=$event
+		fi
+	done < <(echo $"$(get_copytool_event_log)")
+
+	if [ -z "$REGISTER_EVENT" ]; then
+		error "Copytool failed to send register event to FIFO"
+	fi
+
+	if [ -z "$UNREGISTER_EVENT" ]; then
+		error "Copytool failed to send unregister event to FIFO"
+	fi
+
+	copytool_monitor_cleanup
+	echo "Register/Unregister events look OK."
+}
+run_test 70 "Copytool logs JSON register/unregister events to FIFO"
+
+test_71() {
+	# Bump progress interval for livelier events.
+	local interval=5
+
+	# test needs a new running copytool
+	copytool_cleanup
+	copytool_monitor_setup
+	HSMTOOL_UPDATE_INTERVAL=$interval \
+	HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup
+	mkdir -p $DIR/$tdir
+	local f=$DIR/$tdir/$tfile
+	local fid=$(make_large_for_progress $f)
+
+	$LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f ||
+		error "could not archive file"
+	wait_request_state $fid ARCHIVE SUCCEED
+
+	local expected_fields="event_time data_fid source_fid"
+	expected_fields+=" total_bytes current_bytes"
+
+	local START_EVENT
+	local FINISH_EVENT
+	while read event; do
+		# Make sure we're not getting anything from previous events.
+		for field in $expected_fields; do
+			unset $field
+		done
+
+		local parsed=$(parse_json_event "$event")
+		if [ -z "$parsed" ]; then
+			error "Copytool sent malformed event: $event"
+		fi
+		eval $parsed
+
+		if [ $event_type == "ARCHIVE_START" ]; then
+			START_EVENT=$event
+			continue
+		elif [ $event_type == "ARCHIVE_FINISH" ]; then
+			FINISH_EVENT=$event
+			continue
+		elif [ $event_type != "ARCHIVE_RUNNING" ]; then
+			continue
+		fi
+
+		# Do some simple checking of the progress update events.
+		for expected_field in $expected_fields; do
+			if [ -z ${!expected_field+x} ]; then
+				error "Missing $expected_field field in event"
+			fi
+		done
+
+		if [ $total_bytes -eq 0 ]; then
+			error "Expected total_bytes to be > 0"
+		fi
+
+		# These should be identical throughout an archive
+		# operation.
+		if [ $source_fid != $data_fid ]; then
+			error "Expected source_fid to equal data_fid"
+		fi
+	done < <(echo $"$(get_copytool_event_log)")
+
+	if [ -z "$START_EVENT" ]; then
+		error "Copytool failed to send archive start event to FIFO"
+	fi
+
+	if [ -z "$FINISH_EVENT" ]; then
+		error "Copytool failed to send archive finish event to FIFO"
+	fi
+
+	echo "Archive events look OK."
+
+	cdt_clear_no_retry
+	copytool_cleanup
+	copytool_monitor_cleanup
+}
+run_test 71 "Copytool logs JSON archive events to FIFO"
+
+test_72() {
+	# Bump progress interval for livelier events.
+	local interval=5
+
+	# test needs a new running copytool
+	copytool_cleanup
+	copytool_monitor_setup
+	HSMTOOL_UPDATE_INTERVAL=$interval \
+	HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup
+	local test_file=$HSMTOOL_MONITOR_DIR/file
+
+	local cmd="dd if=/dev/urandom of=$test_file count=16 bs=1000000 "
+	cmd+="conv=fsync"
+	do_facet $SINGLEAGT "$cmd" ||
+		error "cannot create $test_file on $SINGLEAGT"
+	copy2archive $test_file $tdir/$tfile
+
+	mkdir -p $DIR/$tdir
+	local f=$DIR/$tdir/$tfile
+	import_file $tdir/$tfile $f
+	f=$DIR2/$tdir/$tfile
+	echo "Verifying released state: "
+	check_hsm_flags $f "0x0000000d"
+
+	local fid=$(path2fid $f)
+	$LFS hsm_restore $f
+	wait_request_state $fid RESTORE SUCCEED
+
+	local expected_fields="event_time data_fid source_fid"
+	expected_fields+=" total_bytes current_bytes"
+
+	local START_EVENT
+	local FINISH_EVENT
+	while read event; do
+		# Make sure we're not getting anything from previous events.
+		for field in $expected_fields; do
+			unset $field
+		done
+
+		local parsed=$(parse_json_event "$event")
+		if [ -z "$parsed" ]; then
+			error "Copytool sent malformed event: $event"
+		fi
+		eval $parsed
+
+		if [ $event_type == "RESTORE_START" ]; then
+			START_EVENT=$event
+			if [ $source_fid != $data_fid ]; then
+				error "source_fid should == data_fid at start"
+			fi
+			continue
+		elif [ $event_type == "RESTORE_FINISH" ]; then
+			FINISH_EVENT=$event
+			if [ $source_fid != $data_fid ]; then
+				error "source_fid should == data_fid at finish"
+			fi
+			continue
+		elif [ $event_type != "RESTORE_RUNNING" ]; then
+			continue
+		fi
+
+		# Do some simple checking of the progress update events.
+		for expected_field in $expected_fields; do
+			if [ -z ${!expected_field+x} ]; then
+				error "Missing $expected_field field in event"
+			fi
+		done
+
+		if [ $total_bytes -eq 0 ]; then
+			error "Expected total_bytes to be > 0"
+		fi
+
+		# When a restore starts out, the data fid is the same as the
+		# source fid. After the restore has gotten going, we learn
+		# the new data fid. Once the restore has finished, the source
+		# fid is set to the new data fid.
+		#
+		# We test this because some monitoring software may depend on
+		# this behavior.  If it changes, then the consumers of these
+		# events may need to be modified.
+		if [ $source_fid == $data_fid ]; then
+			error "source_fid should != data_fid during restore"
+		fi
+	done < <(echo $"$(get_copytool_event_log)")
+
+	if [ -z "$START_EVENT" ]; then
+		error "Copytool failed to send restore start event to FIFO"
+	fi
+
+	if [ -z "$FINISH_EVENT" ]; then
+		error "Copytool failed to send restore finish event to FIFO"
+	fi
+
+	echo "Restore events look OK."
+
+	cdt_clear_no_retry
+	copytool_cleanup
+	copytool_monitor_cleanup
+
+	rm -rf $test_dir
+}
+run_test 72 "Copytool logs JSON restore events to FIFO"
+
+test_90() {
+	file_count=51 # Max number of files constrained by LNET message size
+	mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+
 	local f=$DIR/$tdir/$tfile
 	local FILELIST=/tmp/filelist.txt
 	local i=""