X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Fsanity-hsm.sh;h=a69302a8d41779dd3e4aa1b354b2c59ac78f58be;hp=d3c9e89646c0dc4c60bbc0a7971fefc3d4460774;hb=cff9f1e7c6a41bfa05d1455b8964860803d12612;hpb=38695729d61958ab10e9e108175298f8a7d40536;ds=sidebyside diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh old mode 100644 new mode 100755 index d3c9e89..a69302a --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -11,8 +11,8 @@ SRCDIR=$(dirname $0) export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/utils:$PATH:/sbin:/usr/sbin ONLY=${ONLY:-"$*"} -# bug number for skipped test: 3815 3939 -ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 34 35 36 40" +# bug number for skipped test: +ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} @@ -24,13 +24,13 @@ init_logging MULTIOP=${MULTIOP:-multiop} OPENFILE=${OPENFILE:-openfile} -MCREATE=${MCREATE:-mcreate} +MMAP_CAT=${MMAP_CAT:-mmap_cat} MOUNT_2=${MOUNT_2:-"yes"} FAIL_ON_ERROR=false -if [[ $MDSCOUNT -ge 2 ]]; then - skip_env "Only run with single MDT for now" && exit -fi +# script only handles up to 10 MDTs (because of MDT_PREFIX) +[ $MDSCOUNT -gt 9 ] && + error "script cannot handle more than 9 MDTs, please fix" && exit check_and_setup_lustre @@ -46,6 +46,14 @@ check_runas_id $RUNAS_ID $RUNAS_GID $RUNAS build_test_filter +# if there is no CLIENT1 defined, some tests can be ran on localhost +CLIENT1=${CLIENT1:-$HOSTNAME} +# if CLIENT2 doesn't exist then use CLIENT1 instead +# All tests should use CLIENT2 with MOUNT2 only therefore it will work if +# $CLIENT2 == CLIENT1 +# Exception is the test which need two separate nodes +CLIENT2=${CLIENT2:-$CLIENT1} + # # In order to test multiple remote HSM agents, a new facet type named "AGT" and # the following associated variables are added: @@ -74,8 +82,11 @@ init_agt_vars() { exit 0 fi + # We used to put the HSM archive in $SHARED_DIRECTORY but that + # meant NFS issues could hose sanity-hsm sessions. So now we + # use $TMP instead. for n in $(seq $AGTCOUNT); do - eval export AGTDEV$n=\$\{AGTDEV$n:-"$SHARED_DIRECTORY/arc$n"\} + eval export AGTDEV$n=\$\{AGTDEV$n:-"$TMP/arc$n"\} agent=CLIENT$((n + 1)) if [[ -z "${!agent}" ]]; then [[ $CLIENTCOUNT -eq 1 ]] && agent=CLIENT1 || @@ -88,15 +99,31 @@ init_agt_vars() { export HSMTOOL=${HSMTOOL:-"lhsmtool_posix"} export HSMTOOL_VERBOSE=${HSMTOOL_VERBOSE:-""} + export HSMTOOL_UPDATE_INTERVAL=${HSMTOOL_UPDATE_INTERVAL:=""} + export HSMTOOL_EVENT_FIFO=${HSMTOOL_EVENT_FIFO:=""} + export HSMTOOL_TESTDIR export HSMTOOL_BASE=$(basename "$HSMTOOL" | cut -f1 -d" ") + # $hsm_root/$HSMTMP Makes $hsm_root dir path less generic to ensure + # rm -rf $hsm_root/* is safe even if $hsm_root becomes unset to avoid + # deleting everything in filesystem, independent of any copytool. + export HSMTMP=${HSMTMP:-"shsm"} + HSM_ARCHIVE=$(copytool_device $SINGLEAGT) + + [ -z "${HSM_ARCHIVE// /}" ] && error "HSM_ARCHIVE is empty!" + HSM_ARCHIVE=$HSM_ARCHIVE/$HSMTMP + HSM_ARCHIVE_NUMBER=2 - MDT_PARAM="mdt.$FSNAME-MDT0000" - HSM_PARAM="$MDT_PARAM.hsm" + # The test only support up to 10 MDTs + MDT_PREFIX="mdt.$FSNAME-MDT000" + HSM_PARAM="${MDT_PREFIX}0.hsm" # archive is purged at copytool setup HSM_ARCHIVE_PURGE=true + + # Don't allow copytool error upon start/setup + HSMTOOL_NOERROR=false } # Get the backend root path for the given agent facet. @@ -109,42 +136,148 @@ copytool_device() { # Stop copytool and unregister an existing changelog user. cleanup() { + copytool_monitor_cleanup copytool_cleanup changelog_cleanup cdt_set_sanity_policy } -search_and_kill_copytool() { - local agents=${1:-$(facet_active_host $SINGLEAGT)} +get_mdt_devices() { + local mdtno + # get MDT device for each mdc + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + MDT[$idx]=$($LCTL get_param -n \ + mdc.$FSNAME-MDT000${idx}-mdc-*.mds_server_uuid | + awk '{gsub(/_UUID/,""); print $1}' | head -n1) + done +} - echo "Killing existing copytools on $agents" - do_nodesv $agents "killall -q $HSMTOOL_BASE" || true +search_copytools() { + local hosts=${1:-$(facet_active_host $SINGLEAGT)} + do_nodesv $hosts "pgrep -x $HSMTOOL_BASE" } -copytool_setup() { +kill_copytools() { + local hosts=${1:-$(facet_active_host $SINGLEAGT)} + + echo "Killing existing copytools on $hosts" + do_nodesv $hosts "killall -q $HSMTOOL_BASE" || true +} + +wait_copytools() { + local hosts=${1:-$(facet_active_host $SINGLEAGT)} + local wait_timeout=200 + local wait_start=$SECONDS + local wait_end=$((wait_start + wait_timeout)) + local sleep_time=100000 # 0.1 second + + while ((SECONDS < wait_end)); do + if ! search_copytools $hosts; then + echo "copytools stopped in $((SECONDS - wait_start))s" + return 0 + fi + + echo "copytools still running on $hosts" + usleep $sleep_time + [ $sleep_time -lt 32000000 ] && # 3.2 seconds + sleep_time=$(bc <<< "$sleep_time * 2") + done + + # try to dump Copytool's stack + do_nodesv $hosts "echo 1 >/proc/sys/kernel/sysrq ; " \ + "echo t >/proc/sysrq-trigger" + + echo "copytools failed to stop in ${wait_timeout}s" + + return 1 +} + +copytool_monitor_setup() { local facet=${1:-$SINGLEAGT} - local lustre_mntpnt=${2:-$MOUNT} - local arc_id=$3 - local hsm_root=$(copytool_device $facet) local agent=$(facet_active_host $facet) - if [[ -z "$arc_id" ]] && - do_facet $facet "pkill -CONT -x $HSMTOOL_BASE"; then - echo "Wakeup copytool $facet on $agent" - return 0 + local cmd="mktemp --tmpdir=/tmp -d ${TESTSUITE}.${TESTNAME}.XXXX" + local test_dir=$(do_node $agent "$cmd") || + error "Failed to create tempdir on $agent" + export HSMTOOL_MONITOR_DIR=$test_dir + + # Create the fifo and a monitor (cat dies when copytool dies) + do_node $agent "mkfifo -m 0644 $test_dir/fifo" || + error "failed to create copytool fifo on $agent" + cmd="cat $test_dir/fifo > $test_dir/events &" + cmd+=" echo \\\$! > $test_dir/monitor_pid" + + if [[ $PDSH == *Rmrsh* ]]; then + # This is required for pdsh -Rmrsh and its handling of remote + # shells. + # Regular ssh and pdsh -Rssh work fine without this + # backgrounded subshell nonsense. + (do_node $agent "$cmd") & + export HSMTOOL_MONITOR_PDSH=$! + + # Slightly racy, but just making a best-effort to catch obvious + # problems. + sleep 1 + ps -p $HSMTOOL_MONITOR_PDSH > /dev/null || + error "Failed to start copytool monitor on $agent" + else + do_node $agent "$cmd" + if [ $? != 0 ]; then + error "Failed to start copytool monitor on $agent" + fi + fi +} + +copytool_monitor_cleanup() { + local facet=${1:-$SINGLEAGT} + local agent=$(facet_active_host $facet) + + if [ -n "$HSMTOOL_MONITOR_DIR" ]; then + # Should die when the copytool dies, but just in case. + local cmd="kill \\\$(cat $HSMTOOL_MONITOR_DIR/monitor_pid)" + cmd+=" 2>/dev/null || true" + do_node $agent "$cmd" + do_node $agent "rm -fr $HSMTOOL_MONITOR_DIR" + export HSMTOOL_MONITOR_DIR= + fi + + # The pdsh should die on its own when the monitor dies. Just + # in case, though, try to clean up to avoid any cruft. + if [ -n "$HSMTOOL_MONITOR_PDSH" ]; then + kill $HSMTOOL_MONITOR_PDSH 2>/dev/null + export HSMTOOL_MONITOR_PDSH= fi +} + +copytool_setup() { + local facet=${1:-$SINGLEAGT} + # Use MOUNT2 by default if defined + local lustre_mntpnt=${2:-${MOUNT2:-$MOUNT}} + local arc_id=$3 + local hsm_root=${4:-$(copytool_device $facet)} + + [ -z "${hsm_root// /}" ] && error "copytool_setup: hsm_root empty!" + + local agent=$(facet_active_host $facet) if $HSM_ARCHIVE_PURGE; then echo "Purging archive on $agent" - do_facet $facet "rm -rf $hsm_root/*" + do_facet $facet "rm -rf $hsm_root/$HSMTMP/*" fi echo "Starting copytool $facet on $agent" - do_facet $facet "mkdir -p $hsm_root" || error "mkdir '$hsm_root' failed" + do_facet $facet "mkdir -p $hsm_root/$HSMTMP/" || + error "mkdir '$hsm_root/$HSMTMP' failed" # bandwidth is limited to 1MB/s so the copy time is known and # independent of hardware - local cmd="$HSMTOOL $HSMTOOL_VERBOSE --daemon --hsm-root $hsm_root" + local cmd="$HSMTOOL $HSMTOOL_VERBOSE --daemon" + cmd+=" --hsm-root $hsm_root/$HSMTMP" [[ -z "$arc_id" ]] || cmd+=" --archive $arc_id" + [[ -z "$HSMTOOL_UPDATE_INTERVAL" ]] || + cmd+=" --update-interval $HSMTOOL_UPDATE_INTERVAL" + [[ -z "$HSMTOOL_EVENT_FIFO" ]] || + cmd+=" --event-fifo $HSMTOOL_EVENT_FIFO" cmd+=" --bandwidth 1 $lustre_mntpnt" # Redirect the standard output and error to a log file which @@ -153,18 +286,85 @@ copytool_setup() { [[ -z "$TESTNAME" ]] || prefix=$prefix.$TESTNAME local copytool_log=$prefix.copytool${arc_id}_log.$agent.log - do_facet $facet "$cmd < /dev/null > $copytool_log 2>&1" || - error "start copytool $facet on $agent failed" + do_facet $facet "$cmd < /dev/null > $copytool_log 2>&1" + if [[ $? != 0 ]]; then + [[ $HSMTOOL_NOERROR == true ]] || + error "start copytool $facet on $agent failed" + echo "start copytool $facet on $agent failed" + fi + trap cleanup EXIT } +get_copytool_event_log() { + local facet=${1:-$SINGLEAGT} + local agent=$(facet_active_host $facet) + + [ -z "$HSMTOOL_MONITOR_DIR" ] && + error "Can't get event log: No monitor directory!" + + do_node $agent "cat $HSMTOOL_MONITOR_DIR/events" || + error "Could not collect event log from $agent" +} + copytool_cleanup() { trap - EXIT - local agents=${1:-$(facet_active_host $SINGLEAGT)} + local agt_facet=$SINGLEAGT + local agt_hosts=${1:-$(facet_active_host $agt_facet)} + local hsm_root=$(copytool_device $agt_facet) - do_nodesv $agents "pkill -INT -x $HSMTOOL_BASE" || return 0 - sleep 1 - echo "Copytool is stopped on $agents" + [ -z "${hsm_root// /}" ] && error "copytool_cleanup: hsm_root empty!" + + local i + local facet + local param + local -a state + + kill_copytools $agt_hosts + wait_copytools $agt_hosts || error "copytools failed to stop" + + # Clean all CDTs orphans requests from previous tests that + # would otherwise need to timeout to clear. + for ((i = 0; i < MDSCOUNT; i++)); do + facet=mds$((i + 1)) + param=$(printf 'mdt.%s-MDT%04x.hsm_control' $FSNAME $i) + state[$i]=$(do_facet $facet "$LCTL get_param -n $param") + + # Skip already stopping or stopped CDTs. + [[ "${state[$i]}" =~ ^stop ]] && continue + + do_facet $facet "$LCTL set_param $param=shutdown" + done + + for ((i = 0; i < MDSCOUNT; i++)); do + # Only check and restore CDTs that we stopped in the first loop. + [[ "${state[$i]}" =~ ^stop ]] && continue + + facet=mds$((i + 1)) + param=$(printf 'mdt.%s-MDT%04x.hsm_control' $FSNAME $i) + + wait_result $facet "$LCTL get_param -n $param" stopped 20 || + error "$facet CDT state is not stopped" + + # Restore old CDT state. + do_facet $facet "$LCTL set_param $param=${state[$i]}" + done + + for ((i = 0; i < MDSCOUNT; i++)); do + # Only check CDTs that we stopped in the first loop. + [[ "${state[$i]}" =~ ^stop ]] && continue + + facet=mds$((i + 1)) + param=$(printf 'mdt.%s-MDT%04x.hsm_control' $FSNAME $i) + + # Check that the old CDT state was restored. + wait_result $facet "$LCTL get_param -n $param" "${state[$i]}" \ + 20 || error "$facet CDT state is not '${state[$i]}'" + done + + if do_facet $agt_facet "df $hsm_root" >/dev/null 2>&1 ; then + do_facet $agt_facet "rm -rf $hsm_root/$HSMTMP/*" + fi } copytool_suspend() { @@ -174,9 +374,16 @@ copytool_suspend() { echo "Copytool is suspended on $agents" } +copytool_continue() { + local agents=${1:-$(facet_active_host $SINGLEAGT)} + + do_nodesv $agents "pkill -CONT -x $HSMTOOL_BASE" || return 0 + echo "Copytool is continued on $agents" +} + copytool_remove_backend() { local fid=$1 - local be=$(find $HSM_ARCHIVE -name $fid) + local be=$(do_facet $SINGLEAGT find $HSM_ARCHIVE -name $fid) echo "Remove from backend: $fid = $be" do_facet $SINGLEAGT rm -f $be } @@ -192,7 +399,7 @@ make_archive() { local file=$HSM_ARCHIVE/$1 do_facet $SINGLEAGT mkdir -p $(dirname $file) do_facet $SINGLEAGT dd if=/dev/urandom of=$file count=32 bs=1000000 || - error "cannot create $file" + file_creation_failure dd $file $? } copy2archive() { @@ -201,20 +408,66 @@ copy2archive() { do_facet $SINGLEAGT cp -p $1 $file || error "cannot copy $1 to $file" } +mdts_set_param() { + local arg=$1 + local key=$2 + local value=$3 + local mdtno + local rc=0 + if [[ "$value" != "" ]]; then + value="=$value" + fi + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + local facet=mds${mdtno} + # if $arg include -P option, run 1 set_param per MDT on the MGS + # else, run set_param on each MDT + [[ $arg = *"-P"* ]] && facet=mgs + do_facet $facet $LCTL set_param $arg mdt.${MDT[$idx]}.$key$value + [[ $? != 0 ]] && rc=1 + done + return $rc +} + +mdts_check_param() { + local key="$1" + local target="$2" + local timeout="$3" + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + wait_result mds${mdtno} \ + "$LCTL get_param -n $MDT_PREFIX${idx}.$key" "$target" \ + $timeout || + error "$key state is not '$target' on mds${mdtno}" + done +} + changelog_setup() { - CL_USER=$(do_facet $SINGLEMDS $LCTL --device $MDT0\ - changelog_register -n) - do_facet $SINGLEMDS lctl set_param mdd.$MDT0.changelog_mask="+hsm" - $LFS changelog_clear $MDT0 $CL_USER 0 + CL_USERS=() + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + local cl_user=$(do_facet mds${mdtno} $LCTL \ + --device ${MDT[$idx]} \ + changelog_register -n) + CL_USERS+=($cl_user) + do_facet mds${mdtno} lctl set_param \ + mdd.${MDT[$idx]}.changelog_mask="+hsm" + $LFS changelog_clear ${MDT[$idx]} $cl_user 0 + done } changelog_cleanup() { -# $LFS changelog $MDT0 - [[ -n "$CL_USER" ]] || return 0 - - $LFS changelog_clear $MDT0 $CL_USER 0 - do_facet $SINGLEMDS lctl --device $MDT0 changelog_deregister $CL_USER - CL_USER= + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + [[ -z ${CL_USERS[$idx]} ]] && continue + $LFS changelog_clear ${MDT[$idx]} ${CL_USERS[$idx]} 0 + do_facet mds${mdtno} lctl --device ${MDT[$idx]} \ + changelog_deregister ${CL_USERS[$idx]} + done + CL_USERS=() } changelog_get_flags() { @@ -235,64 +488,64 @@ set_hsm_param() { local param=$1 local value=$2 local opt=$3 - if [[ "$value" != "" ]]; then - value="=$value" - fi - do_facet $SINGLEMDS $LCTL set_param $opt -n $HSM_PARAM.$param$value + mdts_set_param "$opt -n" "hsm.$param" "$value" return $? } set_test_state() { local cmd=$1 local target=$2 - do_facet $SINGLEMDS $LCTL set_param $MDT_PARAM.hsm_control=$cmd - wait_result $SINGLEMDS "$LCTL get_param -n $MDT_PARAM.hsm_control"\ - $target 10 || error "cdt state is not $target" + mdts_set_param "" hsm_control "$cmd" + mdts_check_param hsm_control "$target" 10 } cdt_set_sanity_policy() { if [[ "$CDT_POLICY_HAD_CHANGED" ]] then # clear all - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=+NRA - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=-NBR + mdts_set_param "" hsm.policy "+NRA" + mdts_set_param "" hsm.policy "-NBR" CDT_POLICY_HAD_CHANGED= fi } cdt_set_no_retry() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=+NRA + mdts_set_param "" hsm.policy "+NRA" CDT_POLICY_HAD_CHANGED=true } cdt_clear_no_retry() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=-NRA + mdts_set_param "" hsm.policy "-NRA" CDT_POLICY_HAD_CHANGED=true } cdt_set_non_blocking_restore() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=+NBR + mdts_set_param "" hsm.policy "+NBR" CDT_POLICY_HAD_CHANGED=true } cdt_clear_non_blocking_restore() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=-NBR + mdts_set_param "" hsm.policy "-NBR" CDT_POLICY_HAD_CHANGED=true } cdt_clear_mount_state() { - do_facet $SINGLEMDS $LCTL set_param -d -P $MDT_PARAM.hsm_control + mdts_set_param "-P -d" hsm_control "" } cdt_set_mount_state() { - do_facet $SINGLEMDS $LCTL set_param -P $MDT_PARAM.hsm_control=$1 + mdts_set_param "-P" hsm_control "$1" + # set_param -P is asynchronous operation and could race with set_param. + # In such case configs could be retrieved and applied at mgc after + # set_param -P completion. Sleep here to avoid race with set_param. + # We need at least 20 seconds. 10 for mgc_requeue_thread to wake up + # MGC_TIMEOUT_MIN_SECONDS + MGC_TIMEOUT_RAND_CENTISEC(5 + 5) + # and 10 seconds to retrieve config from server. + sleep 20 } cdt_check_state() { - local target=$1 - wait_result $SINGLEMDS\ - "$LCTL get_param -n $MDT_PARAM.hsm_control" "$target" 20 || - error "cdt state is not $target" + mdts_check_param hsm_control "$1" 20 } cdt_disable() { @@ -328,17 +581,19 @@ needclients() { path2fid() { $LFS path2fid $1 | tr -d '[]' + return ${PIPESTATUS[0]} } get_hsm_flags() { local f=$1 local u=$2 + local st if [[ $u == "user" ]]; then - local st=$($RUNAS $LFS hsm_state $f) + st=$($RUNAS $LFS hsm_state $f) else - local st=$($LFS hsm_state $f) u=root + st=$($LFS hsm_state $f) fi [[ $? == 0 ]] || error "$LFS hsm_state $f failed (run as $u)" @@ -349,7 +604,8 @@ get_hsm_flags() { get_hsm_archive_id() { local f=$1 - local st=$($LFS hsm_state $f) + local st + st=$($LFS hsm_state $f) [[ $? == 0 ]] || error "$LFS hsm_state $f failed" local ar=$(echo $st | grep "archive_id" | cut -f5 -d" " | @@ -373,6 +629,15 @@ check_hsm_flags_user() { [[ $st == $fl ]] || error "hsm flags on $f are $st != $fl" } +file_creation_failure() { + local cmd=$1 + local f=$2 + local err=$3 + + df $MOUNT $MOUNT2 >&2 + error "cannot create $f with $cmd, status=$err" +} + copy_file() { local f= @@ -386,69 +651,53 @@ copy_file() { f=${f/$DIR/$DIR2} fi rm -f $f - cp $1 $f || error "cannot copy $1 to $f" + cp $1 $f || file_creation_failure cp $f $? + path2fid $f || error "cannot get fid on $f" } make_small() { local file2=${1/$DIR/$DIR2} dd if=/dev/urandom of=$file2 count=2 bs=1M conv=fsync || - error "cannot create $file2" - path2fid $1 || error "cannot get fid on $1" -} + file_creation_failure dd $file2 $? -cleanup_large_files() { - local ratio=$(df $MOUNT |awk '{print $5}' |sed 's/%//g' |grep -v Use) - [ $ratio -gt 50 ] && find $MOUNT -size +10M -exec rm -f {} \; + path2fid $1 || error "cannot get fid on $1" } -make_large_for_striping() { - local file2=${1/$DIR/$DIR2} - local sz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -1) - - cleanup_large_files - - dd if=/dev/urandom of=$file2 count=5 bs=$sz conv=fsync || - error "cannot create $file2" +make_small_sync() { + dd if=/dev/urandom of=$1 count=1 bs=1M conv=sync || + file_creation_failure dd $1 $? path2fid $1 || error "cannot get fid on $1" } -make_large_for_progress() { - local file2=${1/$DIR/$DIR2} - - cleanup_large_files - - # big file is large enough, so copy time is > 30s - # so copytool make 1 progress - # size is not a multiple of 1M to avoid stripe - # aligment - dd if=/dev/urandom of=$file2 count=39 bs=1000000 conv=fsync || - error "cannot create $file2" - path2fid $1 || error "cannot get fid on $1" +cleanup_large_files() { + local ratio=$(df -P $MOUNT | tail -1 | awk '{print $5}' | + sed 's/%//g') + [ $ratio -gt 50 ] && find $MOUNT -size +10M -exec rm -f {} \; } -make_large_for_progress_aligned() { - local file2=${1/$DIR/$DIR2} - - cleanup_large_files - - # big file is large enough, so copy time is > 30s - # so copytool make 1 progress - # size is a multiple of 1M to have stripe - # aligment - dd if=/dev/urandom of=$file2 count=33 bs=1M conv=fsync || - error "cannot create $file2" - path2fid $1 || error "cannot get fid on $1" +check_enough_free_space() { + local nb=$1 + local unit=$2 + local need=$((nb * unit /1024)) + local free=$(df -kP $MOUNT | tail -1 | awk '{print $4}') + (( $need >= $free )) && return 1 + return 0 } -make_large_for_cancel() { +make_custom_file_for_progress() { local file2=${1/$DIR/$DIR2} + local fsize=${2:-"39"} + local blksz=$($LCTL get_param -n lov.*-clilov-*.stripesize | head -n1) + blksz=${3:-$blksz} - cleanup_large_files + [[ $fsize -gt 0 ]] || error "Invalid file size" + [[ $blksz -gt 0 ]] || error "Invalid stripe size" - # Copy timeout is 100s. 105MB => 105s - dd if=/dev/urandom of=$file2 count=103 bs=1M conv=fsync || - error "cannot create $file2" + cleanup_large_files + check_enough_free_space $fsize $blksz || return $? + dd if=/dev/zero of=$file2 count=$fsize bs=$blksz conv=fsync || + file_creation_failure dd $file2 $? path2fid $1 || error "cannot get fid on $1" } @@ -462,12 +711,15 @@ wait_request_state() { local fid=$1 local request=$2 local state=$3 + # 4th arg (mdt index) is optional + local mdtidx=${4:-0} + local mds=mds$(($mdtidx + 1)) - local cmd="$LCTL get_param -n $HSM_PARAM.actions" + local cmd="$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.actions" cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d=" - wait_result $SINGLEMDS "$cmd" $state 100 || - error "request on $fid is not $state" + wait_result $mds "$cmd" $state 200 || + error "request on $fid is not $state on $mds" } get_request_state() { @@ -486,10 +738,21 @@ get_request_count() { "awk -vn=0 '/'$fid'.*action='$request'/ {n++}; END {print n}'" } +# Ensure the number of HSM request for a given FID is correct +# assert_request_count FID REQUEST_TYPE COUNT [ERROR_MSG] +assert_request_count() { + local request_count=$(get_request_count $1 $2) + local default_error_msg=("expected $3 '$2' request(s) for '$1', found " + "'$request_count'") + [ $request_count -eq $3 ] || error "${4:-"${default_error_msg[@]}"}" +} + wait_all_done() { local timeout=$1 + local fid=$2 local cmd="$LCTL get_param -n $HSM_PARAM.actions" + [[ -n $fid ]] && cmd+=" | grep '$fid'" cmd+=" | egrep 'WAITING|STARTED'" wait_result $SINGLEMDS "$cmd" "" $timeout || @@ -501,14 +764,91 @@ wait_for_grace_delay() { sleep $val } -MDT0=$($LCTL get_param -n mdc.*.mds_server_uuid | - awk '{gsub(/_UUID/,""); print $1}' | head -1) +wait_for_loop_period() { + local val=$(get_hsm_param loop_period) + sleep $val +} + +parse_json_event() { + local raw_event=$1 + + # python2.6 in EL6 includes an internal json module + local json_parser='import json; import fileinput;' + json_parser+=' print "\n".join(["local %s=\"%s\"" % tuple for tuple in ' + json_parser+='json.loads([line for line in ' + json_parser+='fileinput.input()][0]).items()])' + + echo $raw_event | python -c "$json_parser" +} + +get_agent_by_uuid_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + do_facet $mds "$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.agents |\ + grep $uuid" +} + +check_agent_registered_by_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + local agent=$(get_agent_by_uuid_mdt $uuid $mdtidx) + if [[ ! -z "$agent" ]]; then + echo "found agent $agent on $mds" + else + error "uuid $uuid not found in agent list on $mds" + fi +} + +check_agent_unregistered_by_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + local agent=$(get_agent_by_uuid_mdt $uuid $mdtidx) + if [[ -z "$agent" ]]; then + echo "uuid not found in agent list on $mds" + else + error "uuid found in agent list on $mds: $agent" + fi +} + +check_agent_registered() { + local uuid=$1 + local mdsno + for mdsno in $(seq 1 $MDSCOUNT); do + check_agent_registered_by_mdt $uuid $((mdsno - 1)) + done +} + +check_agent_unregistered() { + local uuid=$1 + local mdsno + for mdsno in $(seq 1 $MDSCOUNT); do + check_agent_unregistered_by_mdt $uuid $((mdsno - 1)) + done +} + +get_agent_uuid() { + local agent=${1:-$(facet_active_host $SINGLEAGT)} + + # Lustre mount-point is mandatory and last parameter on + # copytool cmd-line. + local mntpnt=$(do_rpc_nodes $agent ps -C $HSMTOOL_BASE -o args= | + awk '{print $NF}') + [ -n "$mntpnt" ] || error "Found no Agent or with no mount-point "\ + "parameter" + do_rpc_nodes $agent get_client_uuid $mntpnt | cut -d' ' -f2 +} # initiate variables init_agt_vars +# populate MDT device array +get_mdt_devices + # cleanup from previous bad setup -search_and_kill_copytool +kill_copytools # for recovery tests, coordinator needs to be started at mount # so force it @@ -517,9 +857,6 @@ echo "Set HSM on and start" cdt_set_mount_state enabled cdt_check_state enabled -echo "Start copytool" -copytool_setup - echo "Set sanity-hsm HSM policy" cdt_set_sanity_policy @@ -561,6 +898,59 @@ test_1() { } run_test 1 "lfs hsm flags root/non-root access" +test_1a() { + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(make_small $f) + + copytool_setup + + $LFS hsm_archive $f || error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + # Release and check states + $LFS hsm_release $f || error "could not release file" + echo -n "Verifying released state: " + check_hsm_flags $f "0x0000000d" + + $MMAP_CAT $f > /dev/null || error "failed mmap & cat release file" + + copytool_cleanup +} +run_test 1a "mmap & cat a HSM released file" + +test_1b() { + mkdir -p $DIR/$tdir + $LFS setstripe -E 1M -E 64M -c 2 -E -1 -c 4 $DIR/$tdir || + error "failed to set default stripe" + local f=$DIR/$tdir/$tfile + rm -f $f + + dd if=/dev/random of=$f bs=1M count=1 conv=sync || + error "failed to create file" + local fid=$(path2fid $f) + + copytool_setup + + echo "archive $f" + $LFS hsm_archive $f || error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + echo "release $f" + $LFS hsm_release $f || error "could not release file" + echo "verify released state: " + check_hsm_flags $f "0x0000000d" && echo "pass" + + echo "restore $f" + $LFS hsm_restore $f || error "could not restore file" + wait_request_state $fid RESTORE SUCCEED + echo "verify restored state: " + check_hsm_flags $f "0x00000009" && echo "pass" + + copytool_cleanup +} +run_test 1b "Archive, Release & Restore composite file" + test_2() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile @@ -616,7 +1006,7 @@ test_3() { error "user could not change hsm flags" dd if=/etc/passwd of=$f.append bs=1 count=3\ conv=notrunc oflag=append status=noxfer || - error "could not append to test file" + file_creation_failure dd $f.append $? check_hsm_flags $f.append "0x00000003" # Modify a file sets it dirty @@ -625,7 +1015,7 @@ test_3() { error "user could not change hsm flags" dd if=/dev/zero of=$f.modify bs=1 count=3\ conv=notrunc status=noxfer || - error "could not modify test file" + file_creation_failure dd $f.modify $? check_hsm_flags $f.modify "0x00000003" # Open O_TRUNC sets dirty @@ -672,13 +1062,19 @@ test_8() { run_test 8 "Test default archive number" test_9() { - mkdir -p $DIR/$tdir - local f=$DIR/$tdir/$tfile - local fid=$(copy_file /etc/passwd $f) # we do not use the default one to be sure local new_an=$((HSM_ARCHIVE_NUMBER + 1)) copytool_cleanup copytool_setup $SINGLEAGT $MOUNT $new_an + + # give time for CT to register with MDTs + sleep $(($MDSCOUNT*2)) + local uuid=$(get_agent_uuid $(facet_active_host $SINGLEAGT)) + check_agent_registered $uuid + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) $LFS hsm_archive --archive $new_an $f wait_request_state $fid ARCHIVE SUCCEED @@ -686,7 +1082,7 @@ test_9() { copytool_cleanup } -run_test 9 "Use of explict archive number, with dedicated copytool" +run_test 9 "Use of explicit archive number, with dedicated copytool" test_9a() { needclients 3 || return 0 @@ -799,7 +1195,7 @@ test_10d() { } run_test 10d "Archive a file on the default archive id" -test_11() { +test_11a() { mkdir -p $DIR/$tdir copy2archive /etc/hosts $tdir/$tfile local f=$DIR/$tdir/$tfile @@ -823,7 +1219,31 @@ test_11() { local AFILE=$(do_facet $SINGLEAGT ls $HSM_ARCHIVE'/*/*/*/*/*/*/'$fid) || error "fid $fid not in archive $HSM_ARCHIVE" } -run_test 11 "Import a file" +run_test 11a "Import a file" + +test_11b() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + $LFS hsm_archive -a $HSM_ARCHIVE_NUMBER $f || + error "hsm_archive failed" + wait_request_state $fid ARCHIVE SUCCEED + + local FILE_HASH=$(md5sum $f) + rm -f $f + + import_file $fid $f + + echo "$FILE_HASH" | md5sum -c + + [[ $? -eq 0 ]] || error "Restored file differs" + + copytool_cleanup +} +run_test 11b "Import a deleted file using its FID" test_12a() { # test needs a running copytool @@ -834,16 +1254,16 @@ test_12a() { local f=$DIR/$tdir/$tfile import_file $tdir/$tfile $f - local f=$DIR2/$tdir/$tfile + local f2=$DIR2/$tdir/$tfile echo "Verifying released state: " - check_hsm_flags $f "0x0000000d" + check_hsm_flags $f2 "0x0000000d" - local fid=$(path2fid $f) - $LFS hsm_restore $f + local fid=$(path2fid $f2) + $LFS hsm_restore $f2 wait_request_state $fid RESTORE SUCCEED echo "Verifying file state: " - check_hsm_flags $f "0x00000009" + check_hsm_flags $f2 "0x00000009" do_facet $SINGLEAGT diff -q $HSM_ARCHIVE/$tdir/$tfile $f @@ -887,7 +1307,10 @@ test_12c() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile $LFS setstripe -c 2 $f - local fid=$(make_large_for_striping $f) + local fid + fid=$(make_custom_file_for_progress $f 5) + [ $? != 0 ] && skip "not enough free space" && return + local FILE_CRC=$(md5sum $f) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -1063,43 +1486,185 @@ test_12n() { } run_test 12n "Import/implicit restore/release" -test_13() { +test_12o() { # test needs a running copytool copytool_setup - local ARC_SUBDIR="import.orig" - local d="" - local f="" - - # populate directory to be imported - for d in $(seq 1 10); do - local CURR_DIR="$HSM_ARCHIVE/$ARC_SUBDIR/dir.$d" - do_facet $SINGLEAGT mkdir -p "$CURR_DIR" - for f in $(seq 1 10); do - CURR_FILE="$CURR_DIR/$tfile.$f" - # write file-specific data - do_facet $SINGLEAGT \ - echo "d=$d, f=$f, dir=$CURR_DIR, "\ - "file=$CURR_FILE" > $CURR_FILE - done - done - # import to Lustre - import_file "$ARC_SUBDIR" $DIR/$tdir - # diff lustre content and origin (triggers file restoration) - # there must be 10x10 identical files, and no difference - local cnt_ok=$(do_facet $SINGLEAGT diff -rs $HSM_ARCHIVE/$ARC_SUBDIR \ - $DIR/$tdir/$ARC_SUBDIR | grep identical | wc -l) - local cnt_diff=$(do_facet $SINGLEAGT diff -r $HSM_ARCHIVE/$ARC_SUBDIR \ - $DIR/$tdir/$ARC_SUBDIR | wc -l) + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) - [ $cnt_diff -eq 0 ] || - error "$cnt_diff imported files differ from read data" - [ $cnt_ok -eq 100 ] || - error "not enough identical files ($cnt_ok != 100)" + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "release of $f failed" - copytool_cleanup -} -run_test 13 "Recursively import and restore a directory" +#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 + do_facet $SINGLEMDS lctl set_param fail_loc=0x152 + + # set no retry action mode + cdt_set_no_retry + + diff -q /etc/hosts $f + local st=$? + + # we check we had a restore failure + wait_request_state $fid RESTORE FAILED + + [[ $st -eq 0 ]] && error "Restore must fail" + + # remove no retry action mode + cdt_clear_no_retry + + # check file is still released + check_hsm_flags $f "0x0000000d" + + # retry w/o failure injection + do_facet $SINGLEMDS lctl set_param fail_loc=0 + + # to be sure previous RESTORE result is gone + cdt_purge + wait_for_grace_delay + + diff -q /etc/hosts $f + st=$? + + # we check we had a restore done + wait_request_state $fid RESTORE SUCCEED + + [[ $st -eq 0 ]] || error "Restored file differs" + + copytool_cleanup +} +run_test 12o "Layout-swap failure during Restore leaves file released" + +test_12p() { + # test needs a running copytool + copytool_setup + + mkdir $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + do_facet $SINGLEAGT cat $f > /dev/null || error "cannot cat $f" + $LFS hsm_release $f || error "cannot release $f" + do_facet $SINGLEAGT cat $f > /dev/null || error "cannot cat $f" + $LFS hsm_release $f || error "cannot release $f" + do_facet $SINGLEAGT cat $f > /dev/null || error "cannot cat $f" + + copytool_cleanup +} +run_test 12p "implicit restore of a file on copytool mount point" + +cleanup_test_12q() { + trap 0 + zconf_umount $(facet_host $SINGLEAGT) $MOUNT3 || + error "cannot umount $MOUNT3 on $SINGLEAGT" +} + +test_12q() { + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.58) ] && + skip "need MDS version at least 2.7.58" && return 0 + + zconf_mount $(facet_host $SINGLEAGT) $MOUNT3 || + error "cannot mount $MOUNT3 on $SINGLEAGT" + + trap cleanup_test_12q EXIT + + # test needs a running copytool + copytool_setup $SINGLEAGT $MOUNT3 + + mkdir $DIR/$tdir + local f=$DIR/$tdir/$tfile + local f2=$DIR2/$tdir/$tfile + local fid=$(make_small $f) + local orig_size=$(stat -c "%s" $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + $LFS hsm_release $f || error "could not release file" + check_hsm_flags $f "0x0000000d" + + kill_copytools + wait_copytools || error "copytool failed to stop" + + cat $f > /dev/null & + + # wait a bit to allow implicit restore request to be handled. + # if not, next stat would also block on layout-lock. + sleep 5 + + local size=$(stat -c "%s" $f2) + [ $size -eq $orig_size ] || + error "$f2: wrong size after archive: $size != $orig_size" + + HSM_ARCHIVE_PURGE=false copytool_setup $SINGLEAGT /mnt/lustre3 + + wait + + size=$(stat -c "%s" $f) + [ $size -eq $orig_size ] || + error "$f: wrong size after restore: $size != $orig_size" + + size=$(stat -c "%s" $f2) + [ $size -eq $orig_size ] || + error "$f2: wrong size after restore: $size != $orig_size" + + :>$f + + size=$(stat -c "%s" $f) + [ $size -eq 0 ] || + error "$f: wrong size after overwrite: $size != 0" + + size=$(stat -c "%s" $f2) + [ $size -eq 0 ] || + error "$f2: wrong size after overwrite: $size != 0" + + copytool_cleanup + zconf_umount $(facet_host $SINGLEAGT) $MOUNT3 || + error "cannot umount $MOUNT3 on $SINGLEAGT" +} +run_test 12q "file attributes are refreshed after restore" + +test_13() { + # test needs a running copytool + copytool_setup + + local ARC_SUBDIR="import.orig" + local d="" + local f="" + + # populate directory to be imported + for d in $(seq 1 10); do + local CURR_DIR="$HSM_ARCHIVE/$ARC_SUBDIR/dir.$d" + do_facet $SINGLEAGT mkdir -p "$CURR_DIR" + for f in $(seq 1 10); do + CURR_FILE="$CURR_DIR/$tfile.$f" + # write file-specific data + do_facet $SINGLEAGT \ + "echo d=$d, f=$f, dir=$CURR_DIR, "\ + "file=$CURR_FILE > $CURR_FILE" + done + done + # import to Lustre + import_file "$ARC_SUBDIR" $DIR/$tdir + # diff lustre content and origin (triggers file restoration) + # there must be 10x10 identical files, and no difference + local cnt_ok=$(do_facet $SINGLEAGT diff -rs $HSM_ARCHIVE/$ARC_SUBDIR \ + $DIR/$tdir/$ARC_SUBDIR | grep identical | wc -l) + local cnt_diff=$(do_facet $SINGLEAGT diff -r $HSM_ARCHIVE/$ARC_SUBDIR \ + $DIR/$tdir/$ARC_SUBDIR | wc -l) + + [ $cnt_diff -eq 0 ] || + error "$cnt_diff imported files differ from read data" + [ $cnt_ok -eq 100 ] || + error "not enough identical files ($cnt_ok != 100)" + + copytool_cleanup +} +run_test 13 "Recursively import and restore a directory" test_14() { # test needs a running copytool @@ -1208,9 +1773,10 @@ test_16() { $LFS hsm_archive $f wait_request_state $fid ARCHIVE SUCCEED local end=$(date +%s) - local duration=$((end - start)) + # Add 1 to account for rounding errors between start and end (LU-8155) + local duration=$((end - start + 1)) - [[ $duration -ge $goal ]] || + [[ $duration -ge $((goal - 1)) ]] || error "Transfer is too fast $duration < $goal" copytool_cleanup @@ -1257,19 +1823,41 @@ test_21() { local fid=$(make_small $f) check_hsm_flags $f "0x00000000" + # LU-4388/LU-4389 - ZFS does not report full number of blocks + # used until file is flushed to disk + if [ $(facet_fstype ost1) == "zfs" ]; then + # this causes an OST_SYNC rpc to be sent + dd if=/dev/zero of=$f bs=512 count=1 oflag=sync conv=notrunc,fsync + # clear locks to reread file data + cancel_lru_locks osc + fi + + local orig_size=$(stat -c "%s" $f) + local orig_blocks=$(stat -c "%b" $f) + + start_full_debug_logging + $LFS hsm_archive $f || error "could not archive file" wait_request_state $fid ARCHIVE SUCCEED - [ $(stat -c "%b" $f) -ne "1" ] || error "wrong block number" - local sz=$(stat -c "%s" $f) - [ $sz -ne "0" ] || error "file size should not be zero" + local blocks=$(stat -c "%b" $f) + [ $blocks -eq $orig_blocks ] || + error "$f: wrong block number after archive: " \ + "$blocks != $orig_blocks" + local size=$(stat -c "%s" $f) + [ $size -eq $orig_size ] || + error "$f: wrong size after archive: $size != $orig_size" # Release and check states $LFS hsm_release $f || error "could not release file" check_hsm_flags $f "0x0000000d" - [ $(stat -c "%b" $f) -eq "1" ] || error "wrong block number" - [ $(stat -c "%s" $f) -eq $sz ] || error "wrong file size" + blocks=$(stat -c "%b" $f) + [ $blocks -gt 5 ] && + error "$f: too many blocks after release: $blocks > 5" + size=$(stat -c "%s" $f) + [ $size -ne $orig_size ] && + error "$f: wrong size after release: $size != $orig_size" # Check we can release an file without stripe info f=$f.nolov @@ -1287,6 +1875,8 @@ test_21() { $LFS hsm_release $f || fail "second release should succeed" check_hsm_flags $f "0x0000000d" + stop_full_debug_logging + copytool_cleanup } run_test 21 "Simple release tests" @@ -1418,8 +2008,7 @@ test_24a() { [ $ctime0 -eq $ctime1 ] || error "release changed ctime from $ctime0 to $ctime1" - # Restore should not change atime or mtime and should not - # decrease ctime. + # Restore should not change any timestamps. $LFS hsm_restore $file wait_request_state $fid RESTORE SUCCEED @@ -1433,7 +2022,7 @@ test_24a() { [ $mtime0 -eq $mtime1 ] || error "restore changed mtime from $mtime0 to $mtime1" - [ $ctime0 -le $ctime1 ] || + [ $ctime0 -eq $ctime1 ] || error "restore changed ctime from $ctime0 to $ctime1" copytool_cleanup @@ -1452,7 +2041,7 @@ test_24a() { [ $mtime0 -eq $mtime1 ] || error "remount changed mtime from $mtime0 to $mtime1" - [ $ctime0 -le $ctime1 ] || + [ $ctime0 -eq $ctime1 ] || error "remount changed ctime from $ctime0 to $ctime1" } run_test 24a "Archive, release, and restore does not change a/mtime (i/o)" @@ -1468,7 +2057,7 @@ test_24b() { copytool_setup mkdir -p $DIR/$tdir - # Check that root can do HSM actions on a ordinary user's file. + # Check that root can do HSM actions on a regular user's file. rm -f $file fid=$(make_small $file) sum0=$(md5sum $file) @@ -1546,7 +2135,6 @@ test_24c() { chown $RUNAS_ID:nobody $file || error "cannot chown '$file' to '$RUNAS_ID:nobody'" - set_hsm_param user_request_mask "" $RUNAS $LFS hsm_$action $file && error "$action by user should fail" @@ -1560,7 +2148,6 @@ test_24c() { chown nobody:$RUNAS_GID $file || error "cannot chown '$file' to 'nobody:$RUNAS_GID'" - set_hsm_param group_request_mask "" $RUNAS $LFS hsm_$action $file && error "$action by group should fail" @@ -1574,7 +2161,6 @@ test_24c() { chown nobody:nobody $file || error "cannot chown '$file' to 'nobody:nobody'" - set_hsm_param other_request_mask "" $RUNAS $LFS hsm_$action $file && error "$action by other should fail" @@ -1590,6 +2176,7 @@ run_test 24c "check that user,group,other request masks work" cleanup_test_24d() { trap 0 mount -o remount,rw $MOUNT2 + zconf_umount $(facet_host $SINGLEAGT) "$MOUNT3" } test_24d() { @@ -1598,16 +2185,24 @@ test_24d() { local fid1 local fid2 - copytool_setup - mkdir -p $DIR/$tdir rm -f $file1 fid1=$(make_small $file1) + echo $fid1 + $LFS getstripe $file1 + trap cleanup_test_24d EXIT + zconf_mount $(facet_host $SINGLEAGT) "$MOUNT3" || + error "cannot mount '$MOUNT3' on '$SINGLEAGT'" + copytool_setup $SINGLEAGT "$MOUNT3" || + error "unable to setup a copytool for the test" mount -o remount,ro $MOUNT2 + do_nodes $(comma_list $(nodes_list)) $LCTL clear + start_full_debug_logging + fid2=$(path2fid $file2) [ "$fid1" == "$fid2" ] || error "FID mismatch '$fid1' != '$fid2'" @@ -1616,15 +2211,17 @@ test_24d() { error "archive should fail on read-only mount" check_hsm_flags $file1 "0x00000000" - $LFS hsm_archive $file1 + $LFS hsm_archive $file1 || error "Fail to archive $file1" wait_request_state $fid1 ARCHIVE SUCCEED + stop_full_debug_logging + $LFS hsm_release $file1 $LFS hsm_restore $file2 wait_request_state $fid1 RESTORE SUCCEED $LFS hsm_release $file1 || error "cannot release '$file1'" - dd if=$file2 of=/dev/null bs=1M || "cannot read '$file2'" + dd if=$file2 of=/dev/null bs=1M || error "cannot read '$file2'" $LFS hsm_release $file2 && error "release should fail on read-only mount" @@ -1634,6 +2231,55 @@ test_24d() { } run_test 24d "check that read-only mounts are respected" +test_24e() { + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + local fid + + fid=$(make_small $f) || error "cannot create $f" + $LFS hsm_archive $f || error "cannot archive $f" + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "cannot release $f" + while ! $LFS hsm_state $f | grep released; do + sleep 1 + done + + tar -cf $TMP/$tfile.tar $DIR/$tdir || error "cannot tar $DIR/$tdir" + + copytool_cleanup +} +run_test 24e "tar succeeds on HSM released files" # LU-6213 + +test_24f() { + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir/d1 + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + sum0=$(md5sum $f) + echo $sum0 + $LFS hsm_archive -a $HSM_ARCHIVE_NUMBER $f || + error "hsm_archive failed" + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "cannot release $f" + tar --xattrs -cvf $f.tar -C $DIR/$tdir $tfile + rm -f $f + sync + tar --xattrs -xvf $f.tar -C $DIR/$tdir || + error "Can not recover the tar contents" + sum1=$(md5sum $f) + echo "Sum0 = $sum0, sum1 = $sum1" + [ "$sum0" == "$sum1" ] || error "md5sum mismatch for '$tfile'" + + copytool_cleanup +} +run_test 24f "root can archive, release, and restore tar files" + test_25a() { # test needs a running copytool copytool_setup @@ -1687,7 +2333,10 @@ test_26() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED @@ -1700,93 +2349,457 @@ test_26() { } run_test 26 "Remove the archive of a valid file" -test_27a() { +cleanup_test_26a() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + set_hsm_param loop_period $orig_loop_period + set_hsm_param grace_delay $orig_grace_delay + copytool_cleanup +} + +test_26a() { + local raolu=$(get_hsm_param remove_archive_on_last_unlink) + [[ $raolu -eq 0 ]] || error "RAoLU policy should be off" + # test needs a running copytool copytool_setup mkdir -p $DIR/$tdir - make_archive $tdir/$tfile local f=$DIR/$tdir/$tfile - import_file $tdir/$tfile $f - local fid=$(path2fid $f) + local fid=$(copy_file /etc/passwd $f) - $LFS hsm_remove $f + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED - [[ $? != 0 ]] || error "Remove of a released file should fail" + local f2=$DIR/$tdir/${tfile}_2 + local fid2=$(copy_file /etc/passwd $f2) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f2 + wait_request_state $fid2 ARCHIVE SUCCEED + + local f3=$DIR/$tdir/${tfile}_3 + local fid3=$(copy_file /etc/passwd $f3) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f3 + wait_request_state $fid3 ARCHIVE SUCCEED + trap cleanup_test_26a EXIT + + # set a long grace_delay vs short loop_period + local orig_loop_period=$(get_hsm_param loop_period) + local orig_grace_delay=$(get_hsm_param grace_delay) + set_hsm_param loop_period 10 + set_hsm_param grace_delay 100 + + rm -f $f + + set_hsm_param remove_archive_on_last_unlink 1 + + ln "$f3" "$f3"_bis || error "Unable to create hard-link" + rm -f $f3 + + rm -f $f2 + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid2 REMOVE SUCCEED + + assert_request_count $fid REMOVE 0 \ + "Unexpected archived data remove request for $f" + assert_request_count $fid3 REMOVE 0 \ + "Unexpected archived data remove request for $f3" + + cleanup_test_26a +} +run_test 26a "Remove Archive On Last Unlink (RAoLU) policy" + +cleanup_test_26b() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 copytool_cleanup } -run_test 27a "Remove the archive of an imported file (Operation not permitted)" -test_27b() { +test_26b() { + # test needs a running copytool copytool_setup mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid=$(copy_file /etc/passwd $f) + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED - $LFS hsm_release $f - $LFS hsm_remove $f + trap cleanup_test_26b EXIT - [[ $? != 0 ]] || error "Remove of a released file should fail" + set_hsm_param remove_archive_on_last_unlink 1 + + cdt_shutdown + cdt_check_state stopped + + rm -f $f + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid REMOVE WAITING + + cdt_enable + # copytool must re-register + kill_copytools + wait_copytools || error "copytool failed to stop" + HSM_ARCHIVE_PURGE=false copytool_setup + + wait_request_state $fid REMOVE SUCCEED + + cleanup_test_26b +} +run_test 26b "RAoLU policy when CDT off" +cleanup_test_26c() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + set_hsm_param loop_period $orig_loop_period + set_hsm_param grace_delay $orig_grace_delay copytool_cleanup } -run_test 27b "Remove the archive of a relased file (Operation not permitted)" -test_28() { +test_26c() { + # test needs a running copytool copytool_setup mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid=$(copy_file /etc/passwd $f) + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED - cdt_disable - $LFS hsm_remove $f + local f2=$DIR/$tdir/${tfile}_2 + local fid2=$(copy_file /etc/passwd $f2) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f2 + wait_request_state $fid2 ARCHIVE SUCCEED + + trap cleanup_test_26c EXIT + + # set a long grace_delay vs short loop_period + local orig_loop_period=$(get_hsm_param loop_period) + local orig_grace_delay=$(get_hsm_param grace_delay) + set_hsm_param loop_period 10 + set_hsm_param grace_delay 100 + + set_hsm_param remove_archive_on_last_unlink 1 + + multiop_bg_pause $f O_c || error "open $f failed" + local pid=$! rm -f $f + rm -f $f2 - cdt_enable + wait_request_state $fid2 REMOVE SUCCEED + assert_request_count $fid REMOVE 0 \ + "Unexpected archived data remove request for $f" + + kill -USR1 $pid || error "multiop early exit" + # should reach autotest timeout if multiop fails to trap + # signal, close file, and exit ... + wait $pid || error + + set_hsm_param remove_archive_on_last_unlink 0 wait_request_state $fid REMOVE SUCCEED + cleanup_test_26c +} +run_test 26c "RAoLU effective when file closed" + +cleanup_test_26d() { + trap 0 + set_hsm_param remove_archive_on_last_unlink 0 + set_hsm_param loop_period $orig_loop_period + set_hsm_param grace_delay $orig_grace_delay copytool_cleanup } -run_test 28 "Concurrent archive/file remove" -test_30a() { - # restore at exec cannot work on agent node (because of Linux kernel - # protection of executables) - needclients 2 || return 0 +test_26d() { # test needs a running copytool copytool_setup mkdir -p $DIR/$tdir - copy2archive /bin/true $tdir/$tfile + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/motd $f 1) - local f=$DIR/$tdir/true - import_file $tdir/$tfile $f + $LFS hsm_archive $f || error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED - local fid=$(path2fid $f) + trap cleanup_test_26d EXIT - # set no retry action mode - cdt_set_no_retry - do_node $CLIENT2 $f - local st=$? + # set a long grace_delay vs short loop_period + local orig_loop_period=$(get_hsm_param loop_period) + local orig_grace_delay=$(get_hsm_param grace_delay) + set_hsm_param loop_period 10 + set_hsm_param grace_delay 100 - # cleanup - # remove no try action mode - cdt_clear_no_retry - $LFS hsm_state $f + set_hsm_param remove_archive_on_last_unlink 1 - [[ $st == 0 ]] || error "Failed to exec a released file" + multiop_bg_pause $f O_c || error "multiop failed" + local MULTIPID=$! + + rm -f $f + + mds_evict_client + + set_hsm_param remove_archive_on_last_unlink 0 + + wait_request_state $fid REMOVE SUCCEED + + client_up || client_up || true + + kill -USR1 $MULTIPID + wait $MULTIPID || error "multiop close failed" + + cleanup_test_26d +} +run_test 26d "RAoLU when Client eviction" + +test_27a() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + make_archive $tdir/$tfile + local f=$DIR/$tdir/$tfile + import_file $tdir/$tfile $f + local fid=$(path2fid $f) + + $LFS hsm_remove $f + + [[ $? != 0 ]] || error "Remove of a released file should fail" + + copytool_cleanup +} +run_test 27a "Remove the archive of an imported file (Operation not permitted)" + +test_27b() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f + + $LFS hsm_remove $f + + [[ $? != 0 ]] || error "Remove of a released file should fail" + + copytool_cleanup +} +run_test 27b "Remove the archive of a relased file (Operation not permitted)" + +test_28() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + cdt_disable + $LFS hsm_remove $f + + rm -f $f + + cdt_enable + + wait_request_state $fid REMOVE SUCCEED + + copytool_cleanup +} +run_test 28 "Concurrent archive/file remove" + +test_29a() { + # Tests --mntpath and --archive options + + local archive_id=7 + copytool_setup $SINGLEAGT $MOUNT $archive_id + + # Bad archive number + $LFS hsm_remove -m $MOUNT -a 33 0x857765760:0x8:0x2 2>&1 | + grep "Invalid argument" || + error "unexpected hsm_remove failure (1)" + + # mntpath is present but file is given + $LFS hsm_remove --mntpath $MOUNT --archive 30 /qwerty/uyt 2>&1 | + grep "hsm: '/qwerty/uyt' is not a valid FID" || + error "unexpected hsm_remove failure (2)" + + copytool_cleanup +} +run_test 29a "Tests --mntpath and --archive options" + +test_29b() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(make_small $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + rm -f $f + + $LFS hsm_remove -m $MOUNT -a $HSM_ARCHIVE_NUMBER $fid + wait_request_state $fid REMOVE SUCCEED + + copytool_cleanup +} +run_test 29b "Archive/delete/remove by FID from the archive." + +test_29c() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local fid1=$(make_small $DIR/$tdir/$tfile-1) + local fid2=$(make_small $DIR/$tdir/$tfile-2) + local fid3=$(make_small $DIR/$tdir/$tfile-3) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $DIR/$tdir/$tfile-[1-3] + wait_request_state $fid1 ARCHIVE SUCCEED + wait_request_state $fid2 ARCHIVE SUCCEED + wait_request_state $fid3 ARCHIVE SUCCEED + + rm -f $DIR/$tdir/$tfile-[1-3] + + echo $fid1 > $DIR/$tdir/list + echo $fid2 >> $DIR/$tdir/list + echo $fid3 >> $DIR/$tdir/list + + $LFS hsm_remove -m $MOUNT -a $HSM_ARCHIVE_NUMBER \ + --filelist $DIR/$tdir/list + wait_request_state $fid1 REMOVE SUCCEED + wait_request_state $fid2 REMOVE SUCCEED + wait_request_state $fid3 REMOVE SUCCEED + + copytool_cleanup +} +run_test 29c "Archive/delete/remove by FID, using a file list." + +test_29d() { + # test needs more than one CT + needclients 3 || return 0 + + local n + local file + local fid + + copytool_cleanup $(comma_list $(agts_nodes)) + + # start all of the copytools + for n in $(seq $AGTCOUNT); do + copytool_setup agt$n $MOUNT2 $n + done + + trap "copytool_cleanup $(comma_list $(agts_nodes))" EXIT + # archive files + mkdir -p $DIR/$tdir + file=$DIR/$tdir/$tfile + fid=$(make_small $file) + + $LFS hsm_archive $file + wait_request_state $fid ARCHIVE SUCCEED + check_hsm_flags $file "0x00000009" + + rm -f $file + + $LFS hsm_remove --mntpath "$MOUNT" -a 0 $fid || + error "cannot hsm_remove '$fid'" + + # give time for CDT to handle remove request and create broadcasted + sleep 2 + + # remove request has been broadcasted ? + local cnt=$(get_request_count $fid REMOVE) + # broadcasted requests + original + [[ $cnt -eq $((AGTCOUNT + 1)) ]] || + error "remove not broadcasted to all CTs" + + # give time for CDT and CTs to handle broadcasted + wait_for_loop_period + + # each agent serves one different archive_id, so broadcasted + # hsm_remove request should only succeed once and fail at all others + local res + local scnt=0 + local fcnt=0 + for n in $(seq $AGTCOUNT); do + res=$(do_facet $SINGLEMDS "$LCTL get_param -n \ + $HSM_PARAM.actions | awk \ + '/'$fid'.*action=REMOVE archive#='$n'/ \ + {print \\\$13}' | cut -f2 -d=") + if [[ "$res" == "SUCCEED" ]]; then + scnt=$((scnt + 1)) + elif [[ "$res" == "FAILED" ]]; then + fcnt=$((fcnt + 1)) + fi + done + + [[ $scnt -ne 1 ]] && + error "one and only CT should have removed successfully" + + [[ $AGTCOUNT -ne $((scnt + fcnt)) ]] && + error "all but one CT should have failed to remove" + + trap - EXIT + copytool_cleanup $(comma_list $(agts_nodes)) + +} +run_test 29d "hsm_remove by FID with archive_id 0 for unlinked file cause "\ + "request to be sent once for each registered archive_id" + +test_30a() { + # restore at exec cannot work on agent node (because of Linux kernel + # protection of executables) + needclients 2 || return 0 + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + copy2archive /bin/true $tdir/$tfile + + local f=$DIR/$tdir/true + import_file $tdir/$tfile $f + + local fid=$(path2fid $f) + + # set no retry action mode + cdt_set_no_retry + do_node $CLIENT2 $f + local st=$? + + # cleanup + # remove no try action mode + cdt_clear_no_retry + $LFS hsm_state $f + + [[ $st == 0 ]] || error "Failed to exec a released file" copytool_cleanup } @@ -1824,6 +2837,47 @@ test_30b() { } run_test 30b "Restore at exec (release case)" +test_30c() { + needclients 2 || return 0 + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/SLEEP + local slp_sum1=$(md5sum /bin/sleep) + local fid=$(copy_file /bin/sleep $f) + chmod 755 $f + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f + check_hsm_flags $f "0x0000000d" + # set no retry action mode + cdt_set_no_retry + do_node $CLIENT2 "$f 10" & + local pid=$! + sleep 3 + echo 'Hi!' > $f + [[ $? == 0 ]] && error "Update during exec of released file must fail" + wait $pid + [[ $? == 0 ]] || error "Execution failed during run" + cmp /bin/sleep $f + if [[ $? != 0 ]]; then + local slp_sum2=$(md5sum /bin/sleep) + # in case sleep file is modified during the test + [[ $slp_sum1 == $slp_sum2 ]] && + error "Binary overwritten during exec" + fi + + # cleanup + # remove no try action mode + cdt_clear_no_retry + check_hsm_flags $f "0x00000009" + + copytool_cleanup +} +run_test 30c "Update during exec of released file must fail" + restore_and_check_size() { local f=$1 local fid=$2 @@ -1836,24 +2890,23 @@ restore_and_check_size() { while [[ "$st" != "0x00000009" && $cpt -le 10 ]] do n=$(stat -c "%s" $f) - # we echo in both cases to show stat is not - # hang + # we echo in both cases to show stat is not hang if [[ $n != $s ]]; then echo "size seen is $n != $s" err=1 else echo "size seen is right: $n == $s" fi - st=$(get_hsm_flags $f) sleep 10 cpt=$((cpt + 1)) + st=$(get_hsm_flags $f) done - if [[ $cpt -lt 10 ]]; then - echo " restore is too long" - else + if [[ "$st" = "0x00000009" ]]; then echo " "done + else + echo " restore is too long" + wait_request_state $fid RESTORE SUCCEED fi - wait_request_state $fid RESTORE SUCCEED return $err } @@ -1886,7 +2939,10 @@ test_31b() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -1907,7 +2963,10 @@ test_31c() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress_aligned $f) + local fid + fid=$(make_custom_file_for_progress $f 33 1048576) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -1928,11 +2987,23 @@ test_33() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f + # to be sure wait_all_done will not be mislead by previous tests + # and ops. + cdt_purge + wait_for_grace_delay + # Also raise grace_delay significantly so the Canceled + # Restore action will stay enough long avail. + local old_grace=$(get_hsm_param grace_delay) + set_hsm_param grace_delay 100 + md5sum $f >/dev/null & local pid=$! wait_request_state $fid RESTORE STARTED @@ -1945,8 +3016,29 @@ test_33() { $LFS hsm_cancel $f - wait_request_state $fid RESTORE CANCELED - wait_request_state $fid CANCEL SUCCEED + # instead of waiting+checking both Restore and Cancel ops + # sequentially, wait for both to be finished and then check + # each results. + wait_all_done 100 $fid + local rstate=$(get_request_state $fid RESTORE) + local cstate=$(get_request_state $fid CANCEL) + + # restore orig grace_delay. + set_hsm_param grace_delay $old_grace + + if [[ "$rstate" == "CANCELED" ]] ; then + [[ "$cstate" == "SUCCEED" ]] || + error "Restore state is CANCELED and Cancel state " \ + "is not SUCCEED but $cstate" + echo "Restore state is CANCELED, Cancel state is SUCCEED" + elif [[ "$rstate" == "SUCCEED" ]] ; then + [[ "$cstate" == "FAILED" ]] || + error "Restore state is SUCCEED and Cancel state " \ + "is not FAILED but $cstate" + echo "Restore state is SUCCEED, Cancel state is FAILED" + else + error "Restore state is $rstate and Cancel state is $cstate" + fi [ -z $killed ] || error "Cannot kill process waiting for restore ($killed)" @@ -1962,7 +3054,10 @@ test_34() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -1995,7 +3090,10 @@ test_35() { local f=$DIR/$tdir/$tfile local f1=$DIR/$tdir/$tfile-1 - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + local fid1=$(copy_file /etc/passwd $f1) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED @@ -2031,7 +3129,10 @@ test_36() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $f @@ -2058,6 +3159,34 @@ test_36() { } run_test 36 "Move file during restore" +test_37() { + # LU-5683: check that an archived dirty file can be rearchived. + copytool_cleanup + copytool_setup $SINGLEAGT $MOUNT2 + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid + + fid=$(make_small $f) || error "cannot create small file" + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "cannot release $f" + + # Allow previous archive request to expire from the actions log. + wait_for_grace_delay + + # Dirty file. + dd if=/dev/urandom of=$f bs=1M count=1 || error "cannot dirty file" + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + copytool_cleanup +} +run_test 37 "re-archive a dirty file" + multi_archive() { local prefix=$1 local count=$2 @@ -2083,7 +3212,13 @@ test_40() { fid=$(copy_file /etc/hosts $f.$p.$i) done done - copytool_setup + # force copytool to use a local/temp archive dir to ensure best + # performance vs remote/NFS mounts used in auto-tests + if do_facet $SINGLEAGT "df --local $HSM_ARCHIVE" >/dev/null 2>&1 ; then + copytool_setup + else + copytool_setup $SINGLEAGT $MOUNT $HSM_ARCHIVE_NUMBER $TMP/$tdir + fi # to be sure wait_all_done will not be mislead by previous tests cdt_purge wait_for_grace_delay @@ -2162,7 +3297,7 @@ test_54() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_small $f) + local fid=$(make_custom_file_for_progress $f 39 1000000) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" @@ -2190,7 +3325,7 @@ test_55() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_small $f) + local fid=$(make_custom_file_for_progress $f 39 1000000) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" @@ -2218,7 +3353,9 @@ test_56() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || error "could not archive file" @@ -2327,10 +3464,352 @@ test_58() { } run_test 58 "Truncate a released file will trigger restore" -test_90() { - file_count=57 +test_59() { + local fid + local server_version=$(lustre_version_code $SINGLEMDS) + [[ $server_version -lt $(version_code 2.7.63) ]] && + skip "Need MDS version at least 2.7.63" && return + + copytool_setup + $MCREATE $DIR/$tfile || error "mcreate failed" + $TRUNCATE $DIR/$tfile 42 || error "truncate failed" + $LFS hsm_archive $DIR/$tfile || error "archive request failed" + fid=$(path2fid $DIR/$tfile) + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $DIR/$tfile || error "release failed" + copytool_cleanup +} +run_test 59 "Release stripeless file with non-zero size" + +test_60() { + # This test validates the fix for LU-4512. Ensure that the -u + # option changes the progress reporting interval from the + # default (30 seconds) to the user-specified interval. + local interval=5 + local progress_timeout=$((interval * 4)) + + # test needs a new running copytool + copytool_cleanup + HSMTOOL_UPDATE_INTERVAL=$interval copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid + fid=$(make_custom_file_for_progress $f 10) + [ $? != 0 ] && skip "not enough free space" && return + + local mdtidx=0 + local mdt=${MDT_PREFIX}${mdtidx} + local mds=mds$((mdtidx + 1)) + + # Wait for copytool to register + wait_update_facet $mds \ + "$LCTL get_param -n ${mdt}.hsm.agents | grep -o ^uuid" \ + uuid 100 || error "coyptool failed to register with $mdt" + + local start_at=$(date +%s) + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || + error "could not archive file" + + local agent=$(facet_active_host $SINGLEAGT) + local prefix=$TESTLOG_PREFIX + [[ -z "$TESTNAME" ]] || prefix=$prefix.$TESTNAME + local copytool_log=$prefix.copytool_log.$agent.log + + + wait_update $agent \ + "grep -o start.copy $copytool_log" "start copy" 100 || + error "copytool failed to start" + + local cmd="$LCTL get_param -n ${mdt}.hsm.active_requests" + cmd+=" | awk '/'$fid'.*action=ARCHIVE/ {print \\\$12}' | cut -f2 -d=" + + local RESULT + local WAIT=0 + local sleep=1 + + echo -n "Expecting a progress update within $progress_timeout seconds... " + while [ true ]; do + RESULT=$(do_node $(facet_active_host $mds) "$cmd") + if [ $RESULT -gt 0 ]; then + echo "$RESULT bytes copied in $WAIT seconds." + break + elif [ $WAIT -ge $progress_timeout ]; then + error "Timed out waiting for progress update!" + break + fi + WAIT=$((WAIT + sleep)) + sleep $sleep + done + + local finish_at=$(date +%s) + local elapsed=$((finish_at - start_at)) + + # Ensure that the progress update occurred within the expected window. + if [ $elapsed -lt $((interval - 1)) ]; then + error "Expected progress update after at least $interval seconds" + fi + + echo "Wait for on going archive hsm action to complete" + wait_update $agent "grep -o copied $copytool_log" "copied" 10 || + echo "File archiving not completed even after 10 secs" + + cdt_clear_no_retry + copytool_cleanup +} +run_test 60 "Changing progress update interval from default" + +test_61() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + cdt_disable + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + rm -f $f + cdt_enable + wait_request_state $fid ARCHIVE FAILED + + copytool_cleanup +} +run_test 61 "Waiting archive of a removed file should fail" + +test_70() { + # test needs a new running copytool + copytool_cleanup + copytool_monitor_setup + HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup + + # Just start and stop the copytool to generate events. + cdt_clear_no_retry + + # Wait for the copytool to register. + wait_update --verbose $(facet_active_host mds1) \ + "$LCTL get_param -n ${MDT_PREFIX}0.hsm.agents | grep -o ^uuid" \ + uuid 100 || + error "copytool failed to register with MDT0000" + + copytool_cleanup + + local REGISTER_EVENT + local UNREGISTER_EVENT + while read event; do + local parsed=$(parse_json_event "$event") + if [ -z "$parsed" ]; then + error "Copytool sent malformed event: $event" + fi + eval $parsed + + if [ $event_type == "REGISTER" ]; then + REGISTER_EVENT=$event + elif [ $event_type == "UNREGISTER" ]; then + UNREGISTER_EVENT=$event + fi + done < <(echo $"$(get_copytool_event_log)") + + if [ -z "$REGISTER_EVENT" ]; then + error "Copytool failed to send register event to FIFO" + fi + + if [ -z "$UNREGISTER_EVENT" ]; then + error "Copytool failed to send unregister event to FIFO" + fi + + copytool_monitor_cleanup + echo "Register/Unregister events look OK." +} +run_test 70 "Copytool logs JSON register/unregister events to FIFO" + +test_71() { + # Bump progress interval for livelier events. + local interval=5 + + # test needs a new running copytool + copytool_cleanup + copytool_monitor_setup + HSMTOOL_UPDATE_INTERVAL=$interval \ + HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f || + error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + local expected_fields="event_time data_fid source_fid" + expected_fields+=" total_bytes current_bytes" + + local START_EVENT + local FINISH_EVENT + while read event; do + # Make sure we're not getting anything from previous events. + for field in $expected_fields; do + unset $field + done + + local parsed=$(parse_json_event "$event") + if [ -z "$parsed" ]; then + error "Copytool sent malformed event: $event" + fi + eval $parsed + + if [ $event_type == "ARCHIVE_START" ]; then + START_EVENT=$event + continue + elif [ $event_type == "ARCHIVE_FINISH" ]; then + FINISH_EVENT=$event + continue + elif [ $event_type != "ARCHIVE_RUNNING" ]; then + continue + fi + + # Do some simple checking of the progress update events. + for expected_field in $expected_fields; do + if [ -z ${!expected_field+x} ]; then + error "Missing $expected_field field in event" + fi + done + + if [ $total_bytes -eq 0 ]; then + error "Expected total_bytes to be > 0" + fi + + # These should be identical throughout an archive + # operation. + if [ $source_fid != $data_fid ]; then + error "Expected source_fid to equal data_fid" + fi + done < <(echo $"$(get_copytool_event_log)") + + if [ -z "$START_EVENT" ]; then + error "Copytool failed to send archive start event to FIFO" + fi + + if [ -z "$FINISH_EVENT" ]; then + error "Copytool failed to send archive finish event to FIFO" + fi + + echo "Archive events look OK." + + cdt_clear_no_retry + copytool_cleanup + copytool_monitor_cleanup +} +run_test 71 "Copytool logs JSON archive events to FIFO" + +test_72() { + # Bump progress interval for livelier events. + local interval=5 + + # test needs a new running copytool + copytool_cleanup + copytool_monitor_setup + HSMTOOL_UPDATE_INTERVAL=$interval \ + HSMTOOL_EVENT_FIFO=$HSMTOOL_MONITOR_DIR/fifo copytool_setup + local test_file=$HSMTOOL_MONITOR_DIR/file + + local cmd="dd if=/dev/urandom of=$test_file count=16 bs=1000000 " + cmd+="conv=fsync" + do_facet $SINGLEAGT "$cmd" || + error "cannot create $test_file on $SINGLEAGT" + copy2archive $test_file $tdir/$tfile + mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile + import_file $tdir/$tfile $f + f=$DIR2/$tdir/$tfile + echo "Verifying released state: " + check_hsm_flags $f "0x0000000d" + + local fid=$(path2fid $f) + $LFS hsm_restore $f + wait_request_state $fid RESTORE SUCCEED + + local expected_fields="event_time data_fid source_fid" + expected_fields+=" total_bytes current_bytes" + + local START_EVENT + local FINISH_EVENT + while read event; do + # Make sure we're not getting anything from previous events. + for field in $expected_fields; do + unset $field + done + + local parsed=$(parse_json_event "$event") + if [ -z "$parsed" ]; then + error "Copytool sent malformed event: $event" + fi + eval $parsed + + if [ $event_type == "RESTORE_START" ]; then + START_EVENT=$event + if [ $source_fid != $data_fid ]; then + error "source_fid should == data_fid at start" + fi + continue + elif [ $event_type == "RESTORE_FINISH" ]; then + FINISH_EVENT=$event + if [ $source_fid != $data_fid ]; then + error "source_fid should == data_fid at finish" + fi + continue + elif [ $event_type != "RESTORE_RUNNING" ]; then + continue + fi + + # Do some simple checking of the progress update events. + for expected_field in $expected_fields; do + if [ -z ${!expected_field+x} ]; then + error "Missing $expected_field field in event" + fi + done + + if [ $total_bytes -eq 0 ]; then + error "Expected total_bytes to be > 0" + fi + + # When a restore starts out, the data fid is the same as the + # source fid. After the restore has gotten going, we learn + # the new data fid. Once the restore has finished, the source + # fid is set to the new data fid. + # + # We test this because some monitoring software may depend on + # this behavior. If it changes, then the consumers of these + # events may need to be modified. + if [ $source_fid == $data_fid ]; then + error "source_fid should != data_fid during restore" + fi + done < <(echo $"$(get_copytool_event_log)") + + if [ -z "$START_EVENT" ]; then + error "Copytool failed to send restore start event to FIFO" + fi + + if [ -z "$FINISH_EVENT" ]; then + error "Copytool failed to send restore finish event to FIFO" + fi + + echo "Restore events look OK." + + cdt_clear_no_retry + copytool_cleanup + copytool_monitor_cleanup + + rm -rf $test_dir +} +run_test 72 "Copytool logs JSON restore events to FIFO" + +test_90() { + file_count=51 # Max number of files constrained by LNET message size + mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed" + local f=$DIR/$tdir/$tfile local FILELIST=/tmp/filelist.txt local i="" @@ -2339,7 +3818,14 @@ test_90() { fid=$(copy_file /etc/hosts $f.$i) echo $f.$i >> $FILELIST done - copytool_setup + # force copytool to use a local/temp archive dir to ensure best + # performance vs remote/NFS mounts used in auto-tests + if do_facet $SINGLEAGT "df --local $HSM_ARCHIVE" >/dev/null 2>&1 ; then + copytool_setup + else + local dai=$(get_hsm_param default_archive_id) + copytool_setup $SINGLEAGT $MOUNT $dai $TMP/$tdir + fi # to be sure wait_all_done will not be mislead by previous tests cdt_purge wait_for_grace_delay @@ -2426,7 +3912,10 @@ test_104() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + # if cdt is on, it can serve too quickly the request cdt_disable $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER --data $DATA $f @@ -2471,24 +3960,20 @@ test_106() { # test needs a running copytool copytool_setup - local uuid=$(do_rpc_nodes $(facet_active_host $SINGLEAGT) \ - get_client_uuid $MOUNT | cut -d' ' -f2) - local agent=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.agents | - grep $uuid) + local uuid=$(get_agent_uuid $(facet_active_host $SINGLEAGT)) + + check_agent_registered $uuid + + search_copytools || error "No copytool found" + copytool_cleanup - [[ ! -z "$agent" ]] || error "My uuid $uuid not found in agent list" - local agent=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.agents | - grep $uuid) - [[ -z "$agent" ]] || - error "My uuid $uuid still found in agent list,"\ - " after copytool shutdown" + check_agent_unregistered $uuid + copytool_setup - local agent=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.agents | - grep $uuid) + uuid=$(get_agent_uuid $(facet_active_host $SINGLEAGT)) + check_agent_registered $uuid + copytool_cleanup - [[ ! -z "$agent" ]] || - error "My uuid $uuid not found in agent list after"\ - " copytool restart" } run_test 106 "Copytool register/unregister" @@ -2695,10 +4180,15 @@ test_200() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_cancel $f) + local fid + fid=$(make_custom_file_for_progress $f 103 1048576) + [ $? != 0 ] && skip "not enough free space" && return + # test with cdt on is made in test_221 cdt_disable $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + # wait archive to register at CDT + wait_request_state $fid ARCHIVE WAITING $LFS hsm_cancel $f cdt_enable wait_request_state $fid ARCHIVE CANCELED @@ -2721,6 +4211,8 @@ test_201() { # test with cdt on is made in test_222 cdt_disable $LFS hsm_restore $f + # wait restore to register at CDT + wait_request_state $fid RESTORE WAITING $LFS hsm_cancel $f cdt_enable wait_request_state $fid RESTORE CANCELED @@ -2736,21 +4228,50 @@ test_202() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + cdt_disable + $LFS hsm_remove $f + # wait remove to register at CDT + wait_request_state $fid REMOVE WAITING + $LFS hsm_cancel $f + cdt_enable + wait_request_state $fid REMOVE CANCELED + + copytool_cleanup +} +run_test 202 "Register/Cancel remove" + +test_220() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + changelog_setup + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED - cdt_disable - $LFS hsm_remove $f - $LFS hsm_cancel $f - cdt_enable - wait_request_state $fid REMOVE CANCELED + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) + changelog_cleanup + + local target=0x0 + [[ $flags == $target ]] || error "Changelog flag is $flags not $target" copytool_cleanup } -run_test 202 "Register/Cancel remove" +run_test 220 "Changelog for archive" -test_220() { +test_220a() { # test needs a running copytool copytool_setup @@ -2761,18 +4282,28 @@ test_220() { changelog_setup + # block copytool operations to allow for HSM request to be + # submitted and file be unlinked (CDT will find object removed) + copytool_suspend + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f - wait_request_state $fid ARCHIVE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + rm -f $f + + copytool_continue + + wait_request_state $fid ARCHIVE FAILED + + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) changelog_cleanup - local target=0x0 + # HE_ARCHIVE|ENOENT + local target=0x2 [[ $flags == $target ]] || error "Changelog flag is $flags not $target" copytool_cleanup } -run_test 220 "Changelog for archive" +run_test 220a "Changelog for failed archive" test_221() { # test needs a running copytool @@ -2781,7 +4312,9 @@ test_221() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_cancel $f) + local fid + fid=$(make_custom_file_for_progress $f 103 1048576) + [ $? != 0 ] && skip "not enough free space" && return changelog_setup @@ -2791,7 +4324,7 @@ test_221() { wait_request_state $fid ARCHIVE CANCELED wait_request_state $fid CANCEL SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0x7d [[ $flags == $target ]] || error "Changelog flag is $flags not $target" @@ -2816,7 +4349,7 @@ test_222a() { $LFS hsm_restore $f wait_request_state $fid RESTORE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0x80 [[ $flags == $target ]] || error "Changelog flag is $flags not $target" @@ -2842,7 +4375,7 @@ test_222b() { wait_request_state $fid RESTORE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0x80 [[ $flags == $target ]] || error "Changelog flag is $flags not $target" @@ -2851,6 +4384,68 @@ test_222b() { } run_test 222b "Changelog for implicit restore" +test_222c() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + copy2archive /etc/passwd $tdir/$tfile + + local f=$DIR/$tdir/$tfile + import_file $tdir/$tfile $f + local fid=$(path2fid $f) + + changelog_setup + + # block copytool operations to allow for HSM request to be + # submitted and file be unlinked (CDT will find object removed) + copytool_suspend + + $LFS hsm_restore $f + rm -f $f + + copytool_continue + + wait_request_state $fid RESTORE FAILED + + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) + + # HE_RESTORE|ENOENT + local target=0x82 + [[ $flags == $target ]] || error "Changelog flag is $flags not $target" + + cleanup +} +run_test 222c "Changelog for failed explicit restore" + +test_222d() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + changelog_setup + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f + + copytool_remove_backend $fid + md5sum $f + + wait_request_state $fid RESTORE FAILED + + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) + + # HE_RESTORE|ENOENT + local target=0x82 + [[ $flags == $target ]] || error "Changelog flag is $flags not $target" + + cleanup +} +run_test 222d "Changelog for failed implicit restore" + test_223a() { # test needs a running copytool copytool_setup @@ -2871,7 +4466,7 @@ test_223a() { wait_request_state $fid RESTORE CANCELED wait_request_state $fid CANCEL SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0xfd [[ $flags == $target ]] || @@ -2888,7 +4483,9 @@ test_223b() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return changelog_setup $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2900,7 +4497,7 @@ test_223b() { wait_request_state $fid RESTORE CANCELED wait_request_state $fid CANCEL SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0xfd [[ $flags == $target ]] || @@ -2926,7 +4523,7 @@ test_224() { $LFS hsm_remove $f wait_request_state $fid REMOVE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -n 1) local target=0x200 [[ $flags == $target ]] || @@ -2936,6 +4533,43 @@ test_224() { } run_test 224 "Changelog for remove" +test_224a() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/passwd $f) + + changelog_setup + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + copytool_remove_backend $fid + + # block copytool operations to allow for HSM request to be + # submitted and file be unlinked (CDT will find object removed) + copytool_suspend + + $LFS hsm_remove $f + rm -f $f + + copytool_continue + + wait_request_state $fid REMOVE FAILED + + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -n 1) + + # HE_REMOVE|ENOENT + local target=0x202 + [[ $flags == $target ]] || + error "Changelog flag is $flags not $target" + + cleanup +} +run_test 224a "Changelog for failed remove" + test_225() { # test needs a running copytool copytool_setup @@ -2948,7 +4582,9 @@ test_225() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_progress $f) + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return changelog_setup $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f @@ -2962,9 +4598,9 @@ test_225() { wait_request_state $fid REMOVE CANCELED wait_request_state $fid CANCEL SUCCEED - flags=$(changelog_get_flags $MDT0 RENME $fid2) - local flags=$($LFS changelog $MDT0 | grep HSM | grep $fid | tail -1 | - awk '{print $5}') + flags=$(changelog_get_flags ${MDT[0]} RENME $fid2) + local flags=$($LFS changelog ${MDT[0]} | grep HSM | grep $fid | + tail -n 1 | awk '{print $5}') local target=0x27d [[ $flags == $target ]] || @@ -2996,7 +4632,7 @@ test_226() { rm $f1 || error "rm $f1 failed" - local flags=$(changelog_get_flags $MDT0 UNLNK $fid1) + local flags=$(changelog_get_flags ${MDT[0]} UNLNK $fid1) local target=0x3 [[ $flags == $target ]] || @@ -3004,7 +4640,7 @@ test_226() { mv $f3 $f2 || error "mv $f3 $f2 failed" - flags=$(changelog_get_flags $MDT0 RENME $fid2) + flags=$(changelog_get_flags ${MDT[0]} RENME $fid2) target=0x3 [[ $flags == $target ]] || @@ -3024,7 +4660,7 @@ check_flags_changes() { local target=0x280 $LFS hsm_set --$hsm_flag $f || error "Cannot set $hsm_flag on $f" - local flags=($(changelog_get_flags $MDT0 HSM $fid)) + local flags=($(changelog_get_flags ${MDT[0]} HSM $fid)) local seen=${#flags[*]} cnt=$((fst + cnt)) [[ $seen == $cnt ]] || @@ -3035,7 +4671,7 @@ check_flags_changes() { $LFS hsm_clear --$hsm_flag $f || error "Cannot clear $hsm_flag on $f" - flags=($(changelog_get_flags $MDT0 HSM $fid)) + flags=($(changelog_get_flags ${MDT[0]} HSM $fid)) seen=${#flags[*]} cnt=$(($cnt + 1)) [[ $cnt == $seen ]] || @@ -3075,10 +4711,9 @@ test_228() { # test needs a running copytool copytool_setup - dd if=/dev/urandom of=$DIR/$tfile bs=1M count=1 conv=sync || - error "creating $DIR/$tfile" + local fid=$(make_small_sync $DIR/$tfile) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $DIR/$tfile - wait_request_state $(path2fid $DIR/$tfile) ARCHIVE SUCCEED + wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $DIR/$tfile check_hsm_flags $DIR/$tfile "0x0000000d" @@ -3095,13 +4730,15 @@ test_228() { $LFS hsm_release $DIR/$tfile check_hsm_flags $DIR/$tfile "0x0000000d" - mkdir $DIR/$tdir + mkdir -p $DIR/$tdir || error "mkdir $tdir failed" tar cf - --sparse $DIR/$tfile | tar xvf - -C $DIR/$tdir || error "tar failed" cmp $DIR/$tfile $DIR/$tdir/$DIR/$tfile || error "comparing untarred $DIR/$tfile" + rm -f $DIR/$tfile $DIR/$tfile.2 || + error "rm $DIR/$tfile or $DIR/$tfile.2 failed" copytool_cleanup } run_test 228 "On released file, return extend to FIEMAP. For [cp,tar] --sparse" @@ -3150,7 +4787,9 @@ test_251() { mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile - local fid=$(make_large_for_cancel $f) + local fid + fid=$(make_custom_file_for_progress $f 103 1048576) + [ $? != 0 ] && skip "not enough free space" && return cdt_disable # to have a short test @@ -3162,6 +4801,10 @@ test_251() { set_hsm_param loop_period 2 cdt_enable + # clear locks to avoid extra delay caused by flush/cancel + # and thus prevent early copytool death to timeout. + cancel_lru_locks osc + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE STARTED sleep 5 @@ -3174,6 +4817,43 @@ test_251() { } run_test 251 "Coordinator request timeout" +test_252() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(make_custom_file_for_progress $f 103 1048576) + + cdt_disable + # to have a short test + local old_to=$(get_hsm_param active_request_timeout) + set_hsm_param active_request_timeout 20 + # to be sure the cdt will wake up frequently so + # it will be able to cancel the "old" request + local old_loop=$(get_hsm_param loop_period) + set_hsm_param loop_period 2 + cdt_enable + + # clear locks to avoid extra delay caused by flush/cancel + # and thus prevent early copytool death to timeout. + cancel_lru_locks osc + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE STARTED + rm -f $f + + # wait but less than active_request_timeout+grace_delay + sleep 25 + wait_request_state $fid ARCHIVE CANCELED + + set_hsm_param active_request_timeout $old_to + set_hsm_param loop_period $old_loop + + copytool_cleanup +} +run_test 252 "Timeout'ed running archive of a removed file should be canceled" + test_300() { # the only way to test ondisk conf is to restart MDS ... echo "Stop coordinator and remove coordinator state at mount" @@ -3224,7 +4904,11 @@ test_302() { cdt_shutdown set_hsm_param default_archive_id $new -P - fail $SINGLEMDS + + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + fail mds${mdtno} + done # check cdt is on cdt_check_state enabled @@ -3238,6 +4922,318 @@ test_302() { } run_test 302 "HSM tunnable are persistent when CDT is off" +test_400() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + copytool_setup + + mkdir -p $DIR/$tdir + + local dir_mdt0=$DIR/$tdir/mdt0 + local dir_mdt1=$DIR/$tdir/mdt1 + + # create 1 dir per MDT + $LFS mkdir -i 0 $dir_mdt0 || error "lfs mkdir" + $LFS mkdir -i 1 $dir_mdt1 || error "lfs mkdir" + + # create 1 file in each MDT + local fid1=$(make_small $dir_mdt0/$tfile) + local fid2=$(make_small $dir_mdt1/$tfile) + + # check that hsm request on mdt0 is sent to the right MDS + $LFS hsm_archive $dir_mdt0/$tfile || error "lfs hsm_archive" + wait_request_state $fid1 ARCHIVE SUCCEED 0 && + echo "archive successful on mdt0" + + # check that hsm request on mdt1 is sent to the right MDS + $LFS hsm_archive $dir_mdt1/$tfile || error "lfs hsm_archive" + wait_request_state $fid2 ARCHIVE SUCCEED 1 && + echo "archive successful on mdt1" + + copytool_cleanup + # clean test files and directories + rm -rf $dir_mdt0 $dir_mdt1 +} +run_test 400 "Single request is sent to the right MDT" + +test_401() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + copytool_setup + + mkdir -p $DIR/$tdir + + local dir_mdt0=$DIR/$tdir/mdt0 + local dir_mdt1=$DIR/$tdir/mdt1 + + # create 1 dir per MDT + $LFS mkdir -i 0 $dir_mdt0 || error "lfs mkdir" + $LFS mkdir -i 1 $dir_mdt1 || error "lfs mkdir" + + # create 1 file in each MDT + local fid1=$(make_small $dir_mdt0/$tfile) + local fid2=$(make_small $dir_mdt1/$tfile) + + # check that compound requests are shunt to the rights MDTs + $LFS hsm_archive $dir_mdt0/$tfile $dir_mdt1/$tfile || + error "lfs hsm_archive" + wait_request_state $fid1 ARCHIVE SUCCEED 0 && + echo "archive successful on mdt0" + wait_request_state $fid2 ARCHIVE SUCCEED 1 && + echo "archive successful on mdt1" + + copytool_cleanup + # clean test files and directories + rm -rf $dir_mdt0 $dir_mdt1 +} +run_test 401 "Compound requests split and sent to their respective MDTs" + +mdc_change_state() # facet, MDT_pattern, activate|deactivate +{ + local facet=$1 + local pattern="$2" + local state=$3 + local node=$(facet_active_host $facet) + local mdc + for mdc in $(do_facet $facet "$LCTL dl | grep -E ${pattern}-mdc" | + awk '{print $4}'); do + echo "$3 $mdc on $node" + do_facet $facet "$LCTL --device $mdc $state" || return 1 + done +} + +test_402a() { + # make sure there is no running copytool + copytool_cleanup + + # deactivate all mdc on agent1 + mdc_change_state $SINGLEAGT "$FSNAME-MDT000." "deactivate" + + HSMTOOL_NOERROR=true copytool_setup $SINGLEAGT + + check_agent_unregistered "uuid" # match any agent + + # no expected running copytool + search_copytools $agent && error "Copytool start should have failed" + + # reactivate MDCs + mdc_change_state $SINGLEAGT "$FSNAME-MDT000." "activate" +} +run_test 402a "Copytool start fails if all MDTs are inactive" + +test_402b() { + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + touch $f || error "touch $f failed" + local fid=$(path2fid $f) + +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d + do_facet $SINGLEAGT lctl set_param fail_loc=0x14d + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + + # give time for CDT to send request and to keep it for retry + wait_for_loop_period + + wait_request_state $fid ARCHIVE WAITING + + do_facet $SINGLEAGT lctl set_param fail_loc=0 + + # request should succeed now + wait_request_state $fid ARCHIVE SUCCEED + + copytool_cleanup +} +run_test 402b "CDT must retry request upon slow start of CT" + +test_403() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + # make sure there is no running copytool + copytool_cleanup + + local agent=$(facet_active_host $SINGLEAGT) + + # deactivate all mdc for MDT0001 + mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "deactivate" + + copytool_setup + local uuid=$(get_agent_uuid $agent) + # check the agent is registered on MDT0000, and not on MDT0001 + check_agent_registered_by_mdt $uuid 0 + check_agent_unregistered_by_mdt $uuid 1 + + # check running copytool process + search_copytools $agent || error "No running copytools on $agent" + + # reactivate all mdc for MDT0001 + mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "activate" + + # make sure the copytool is now registered to all MDTs + check_agent_registered $uuid + + copytool_cleanup +} +run_test 403 "Copytool starts with inactive MDT and register on reconnect" + +test_404() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + copytool_setup + + # create files on both MDT0000 and MDT0001 + mkdir -p $DIR/$tdir + + local dir_mdt0=$DIR/$tdir/mdt0 + $LFS mkdir -i 0 $dir_mdt0 || error "lfs mkdir" + + # create 1 file on mdt0 + local fid1=$(make_small $dir_mdt0/$tfile) + + # deactivate all mdc for MDT0001 + mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "deactivate" + + # send an HSM request for files in MDT0000 + $LFS hsm_archive $dir_mdt0/$tfile || error "lfs hsm_archive" + + # check for completion of files in MDT0000 + wait_request_state $fid1 ARCHIVE SUCCEED 0 && + echo "archive successful on mdt0" + + # reactivate all mdc for MDT0001 + mdc_change_state $SINGLEAGT "$FSNAME-MDT0001" "activate" + + copytool_cleanup + # clean test files and directories + rm -rf $dir_mdt0 +} +run_test 404 "Inactive MDT does not block requests for active MDTs" + +test_405() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + copytool_setup + + mkdir -p $DIR/$tdir + + local striped_dir=$DIR/$tdir/striped_dir + + # create striped dir on all of MDTs + $LFS mkdir -i 0 -c $MDSCOUNT $striped_dir || error "lfs mkdir" + + local fid1=$(make_small_sync $striped_dir/${tfile}_0) + local fid2=$(make_small_sync $striped_dir/${tfile}_1) + local fid3=$(make_small_sync $striped_dir/${tfile}_2) + local fid4=$(make_small_sync $striped_dir/${tfile}_3) + + local idx1=$($LFS getstripe -M $striped_dir/${tfile}_0) + local idx2=$($LFS getstripe -M $striped_dir/${tfile}_1) + local idx3=$($LFS getstripe -M $striped_dir/${tfile}_2) + local idx4=$($LFS getstripe -M $striped_dir/${tfile}_3) + + # check that compound requests are shunt to the rights MDTs + $LFS hsm_archive $striped_dir/${tfile}_0 $striped_dir/${tfile}_1 \ + $striped_dir/${tfile}_2 $striped_dir/${tfile}_3 || + error "lfs hsm_archive" + + wait_request_state $fid1 ARCHIVE SUCCEED $idx1 && + echo "archive successful on $fid1" + wait_request_state $fid2 ARCHIVE SUCCEED $idx2 && + echo "archive successful on $fid2" + wait_request_state $fid3 ARCHIVE SUCCEED $idx3 && + echo "archive successful on $fid3" + wait_request_state $fid4 ARCHIVE SUCCEED $idx4 && + echo "archive successful on $fid4" + + $LFS hsm_release $striped_dir/${tfile}_0 || error "lfs hsm_release 1" + $LFS hsm_release $striped_dir/${tfile}_1 || error "lfs hsm_release 2" + $LFS hsm_release $striped_dir/${tfile}_2 || error "lfs hsm_release 3" + $LFS hsm_release $striped_dir/${tfile}_3 || error "lfs hsm_release 4" + + cat $striped_dir/${tfile}_0 > /dev/null || error "cat ${tfile}_0 failed" + cat $striped_dir/${tfile}_1 > /dev/null || error "cat ${tfile}_1 failed" + cat $striped_dir/${tfile}_2 > /dev/null || error "cat ${tfile}_2 failed" + cat $striped_dir/${tfile}_3 > /dev/null || error "cat ${tfile}_3 failed" + + copytool_cleanup +} +run_test 405 "archive and release under striped directory" + +test_406() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] && + skip "need MDS version at least 2.7.64" && return 0 + + local fid + local mdt_index + + copytool_setup + mkdir -p $DIR/$tdir + fid=$(make_small $DIR/$tdir/$tfile) + echo "old fid $fid" + + $LFS hsm_archive $DIR/$tdir/$tfile + wait_request_state "$fid" ARCHIVE SUCCEED + $LFS hsm_release $DIR/$tdir/$tfile + + # Should migrate $tdir but not $tfile. + $LFS mv -M1 $DIR/$tdir && + error "migrating HSM an archived file should fail" + + $LFS hsm_restore $DIR/$tdir/$tfile + wait_request_state "$fid" RESTORE SUCCEED + + $LFS hsm_remove $DIR/$tdir/$tfile + wait_request_state "$fid" REMOVE SUCCEED + + cat $DIR/$tdir/$tfile > /dev/null || + error "cannot read $DIR/$tdir/$tfile" + + $LFS mv -M1 $DIR/$tdir || + error "cannot complete migration after HSM remove" + + mdt_index=$($LFS getstripe -M $DIR/$tdir) + if ((mdt_index != 1)); then + error "expected MDT index 1, got $mdt_index" + fi + + # Refresh fid after migration. + fid=$(path2fid $DIR/$tdir/$tfile) + echo "new fid $fid" + + $LFS hsm_archive $DIR/$tdir/$tfile + wait_request_state "$fid" ARCHIVE SUCCEED 1 + + lctl set_param debug=+trace + $LFS hsm_release $DIR/$tdir/$tfile || + error "cannot release $DIR/$tdir/$tfile" + + $LFS hsm_restore $DIR/$tdir/$tfile + wait_request_state "$fid" RESTORE SUCCEED 1 + + cat $DIR/$tdir/$tfile > /dev/null || + error "cannot read $DIR/$tdir/$tfile" + + copytool_cleanup +} +run_test 406 "attempting to migrate HSM archived files is safe" + +test_500() +{ + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.92) ] && + skip "HSM migrate is not supported" && return + + # Stop the existing copytool + copytool_cleanup + + test_mkdir -p $DIR/$tdir + llapi_hsm_test -d $DIR/$tdir || error "One llapi HSM test failed" +} +run_test 500 "various LLAPI HSM tests" + copytool_cleanup complete $SECONDS