X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Fsanity-hsm.sh;h=4ef21eb60461cb05e31635d8acf7aee9361510cd;hb=3f5fc6412d3ae2d3a57f8fdb8a457f35e9d9576e;hp=37a66ce30d69191ee2ce836a51beb190e18ea9fe;hpb=57118830eb55ab43b4b6fc096ad40b2bd83c6de3;p=fs%2Flustre-release.git diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 37a66ce..4ef21eb 100644 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -11,12 +11,11 @@ SRCDIR=$(dirname $0) export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/utils:$PATH:/sbin:/usr/sbin ONLY=${ONLY:-"$*"} -# bug number for skipped test: +# bug number for skipped test: 3815 +ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 34 35 36" +# bug number for skipped test:4178 4176 +ALWAYS_EXCEPT="$ALWAYS_EXCEPT 200 221 223b 31a" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! -# skip test cases failed before landing - Jinshan - -ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 31a 34 35 36" -ALWAYS_EXCEPT="$ALWAYS_EXCEPT 200 201 221 223a 223b 225" LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} @@ -31,9 +30,9 @@ MCREATE=${MCREATE:-mcreate} MOUNT_2=${MOUNT_2:-"yes"} FAIL_ON_ERROR=false -if [[ $MDSCOUNT -ge 2 ]]; then - skip_env "Only run with single MDT for now" && exit -fi +# script only handles up to 10 MDTs (because of MDT_PREFIX) +[ $MDSCOUNT -gt 9 ] && + error "script cannot handle more than 9 MDTs, please fix" && exit check_and_setup_lustre @@ -95,8 +94,9 @@ init_agt_vars() { HSM_ARCHIVE=$(copytool_device $SINGLEAGT) HSM_ARCHIVE_NUMBER=2 - MDT_PARAM="mdt.$FSNAME-MDT0000" - HSM_PARAM="$MDT_PARAM.hsm" + # The test only support up to 10 MDTs + MDT_PREFIX="mdt.$FSNAME-MDT000" + HSM_PARAM="${MDT_PREFIX}0.hsm" # archive is purged at copytool setup HSM_ARCHIVE_PURGE=true @@ -117,6 +117,22 @@ cleanup() { cdt_set_sanity_policy } +get_mdt_devices() { + local mdtno + # get MDT device for each mdc + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + MDT[$idx]=$($LCTL get_param -n \ + mdc.$FSNAME-MDT000${idx}-mdc-*.mds_server_uuid | + awk '{gsub(/_UUID/,""); print $1}' | head -1) + done +} + +search_copytools() { + local agents=${1:-$(facet_active_host $SINGLEAGT)} + do_nodesv $agents "pgrep -x $HSMTOOL_BASE" +} + search_and_kill_copytool() { local agents=${1:-$(facet_active_host $SINGLEAGT)} @@ -128,7 +144,7 @@ copytool_setup() { local facet=${1:-$SINGLEAGT} local lustre_mntpnt=${2:-$MOUNT} local arc_id=$3 - local hsm_root=$(copytool_device $facet) + local hsm_root=${4:-$(copytool_device $facet)} local agent=$(facet_active_host $facet) if [[ -z "$arc_id" ]] && @@ -164,10 +180,34 @@ copytool_setup() { copytool_cleanup() { trap - EXIT local agents=${1:-$(facet_active_host $SINGLEAGT)} + local mdtno + local idx + local oldstate + local mdt_hsmctrl do_nodesv $agents "pkill -INT -x $HSMTOOL_BASE" || return 0 sleep 1 echo "Copytool is stopped on $agents" + + # clean all CDTs orphans requests from previous tests + # that would otherwise need to timeout to clear. + for mdtno in $(seq 1 $MDSCOUNT); do + idx=$(($mdtno - 1)) + mdt_hsmctrl="mdt.$FSNAME-MDT000${idx}.hsm_control" + oldstate=$(do_facet mds${mdtno} "$LCTL get_param -n " \ + "$mdt_hsmctrl") + # skip already stop[ed,ing] CDTs + echo $oldstate | grep stop && continue + + do_facet mds${mdtno} "$LCTL set_param $mdt_hsmctrl=shutdown" + wait_result mds${mdtno} "$LCTL get_param -n $mdt_hsmctrl" \ + "stopped" 20 || + error "mds${mdtno} cdt state is not stopped" + do_facet mds${mdtno} "$LCTL set_param $mdt_hsmctrl=$oldstate" + wait_result mds${mdtno} "$LCTL get_param -n $mdt_hsmctrl" \ + "$oldstate" 20 || + error "mds${mdtno} cdt state is not $oldstate" + done } copytool_suspend() { @@ -179,7 +219,7 @@ copytool_suspend() { copytool_remove_backend() { local fid=$1 - local be=$(find $HSM_ARCHIVE -name $fid) + local be=$(do_facet $SINGLEAGT find $HSM_ARCHIVE -name $fid) echo "Remove from backend: $fid = $be" do_facet $SINGLEAGT rm -f $be } @@ -204,20 +244,66 @@ copy2archive() { do_facet $SINGLEAGT cp -p $1 $file || error "cannot copy $1 to $file" } +mdts_set_param() { + local arg=$1 + local key=$2 + local value=$3 + local mdtno + local rc=0 + if [[ "$value" != "" ]]; then + value="=$value" + fi + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + local facet=mds${mdtno} + # if $arg include -P option, run 1 set_param per MDT on the MGS + # else, run set_param on each MDT + [[ $arg = *"-P"* ]] && facet=mgs + do_facet $facet $LCTL set_param $arg mdt.${MDT[$idx]}.$key$value + [[ $? != 0 ]] && rc=1 + done + return $rc +} + +mdts_check_param() { + local key="$1" + local target="$2" + local timeout="$3" + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + wait_result mds${mdtno} \ + "$LCTL get_param -n $MDT_PREFIX${idx}.$key" "$target" \ + $timeout || + error "$key state is not '$target' on mds${mdtno}" + done +} + changelog_setup() { - CL_USER=$(do_facet $SINGLEMDS $LCTL --device $MDT0\ - changelog_register -n) - do_facet $SINGLEMDS lctl set_param mdd.$MDT0.changelog_mask="+hsm" - $LFS changelog_clear $MDT0 $CL_USER 0 + CL_USERS=() + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + local cl_user=$(do_facet mds${mdtno} $LCTL \ + --device ${MDT[$idx]} \ + changelog_register -n) + CL_USERS+=($cl_user) + do_facet mds${mdtno} lctl set_param \ + mdd.${MDT[$idx]}.changelog_mask="+hsm" + $LFS changelog_clear ${MDT[$idx]} $cl_user 0 + done } changelog_cleanup() { -# $LFS changelog $MDT0 - [[ -n "$CL_USER" ]] || return 0 - - $LFS changelog_clear $MDT0 $CL_USER 0 - do_facet $SINGLEMDS lctl --device $MDT0 changelog_deregister $CL_USER - CL_USER= + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + local idx=$(($mdtno - 1)) + [[ -z ${CL_USERS[$idx]} ]] && continue + $LFS changelog_clear ${MDT[$idx]} ${CL_USERS[$idx]} 0 + do_facet mds${mdtno} lctl --device ${MDT[$idx]} \ + changelog_deregister ${CL_USERS[$idx]} + done + CL_USERS=() } changelog_get_flags() { @@ -238,64 +324,57 @@ set_hsm_param() { local param=$1 local value=$2 local opt=$3 - if [[ "$value" != "" ]]; then - value="=$value" - fi - do_facet $SINGLEMDS $LCTL set_param $opt -n $HSM_PARAM.$param$value + mdts_set_param "$opt -n" "hsm.$param" "$value" return $? } set_test_state() { local cmd=$1 local target=$2 - do_facet $SINGLEMDS $LCTL set_param $MDT_PARAM.hsm_control=$cmd - wait_result $SINGLEMDS "$LCTL get_param -n $MDT_PARAM.hsm_control"\ - $target 10 || error "cdt state is not $target" + mdts_set_param "" hsm_control "$cmd" + mdts_check_param hsm_control "$target" 10 } cdt_set_sanity_policy() { if [[ "$CDT_POLICY_HAD_CHANGED" ]] then # clear all - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=+NRA - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=-NBR + mdts_set_param "" hsm.policy "+NRA" + mdts_set_param "" hsm.policy "-NBR" CDT_POLICY_HAD_CHANGED= fi } cdt_set_no_retry() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=+NRA + mdts_set_param "" hsm.policy "+NRA" CDT_POLICY_HAD_CHANGED=true } cdt_clear_no_retry() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=-NRA + mdts_set_param "" hsm.policy "-NRA" CDT_POLICY_HAD_CHANGED=true } cdt_set_non_blocking_restore() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=+NBR + mdts_set_param "" hsm.policy "+NBR" CDT_POLICY_HAD_CHANGED=true } cdt_clear_non_blocking_restore() { - do_facet $SINGLEMDS $LCTL set_param $HSM_PARAM.policy=-NBR + mdts_set_param "" hsm.policy "-NBR" CDT_POLICY_HAD_CHANGED=true } cdt_clear_mount_state() { - do_facet $SINGLEMDS $LCTL set_param -d -P $MDT_PARAM.hsm_control + mdts_set_param "-P -d" hsm_control "" } cdt_set_mount_state() { - do_facet $SINGLEMDS $LCTL set_param -P $MDT_PARAM.hsm_control=$1 + mdts_set_param "-P" hsm_control "$1" } cdt_check_state() { - local target=$1 - wait_result $SINGLEMDS\ - "$LCTL get_param -n $MDT_PARAM.hsm_control" "$target" 20 || - error "cdt state is not $target" + mdts_check_param hsm_control "$1" 20 } cdt_disable() { @@ -400,8 +479,15 @@ make_small() { path2fid $1 || error "cannot get fid on $1" } +make_small_sync() { + dd if=/dev/urandom of=$1 count=1 bs=1M conv=sync || + error "cannot create $1" + path2fid $1 || error "cannot get fid on $1" +} + cleanup_large_files() { - local ratio=$(df $MOUNT |awk '{print $5}' |sed 's/%//g' |grep -v Use) + local ratio=$(df -P $MOUNT | tail -1 | awk '{print $5}' | + sed 's/%//g') [ $ratio -gt 50 ] && find $MOUNT -size +10M -exec rm -f {} \; } @@ -465,19 +551,22 @@ wait_request_state() { local fid=$1 local request=$2 local state=$3 + # 4th arg (mdt index) is optional + local mdtidx=${4:-0} + local mds=mds$(($mdtidx + 1)) - local cmd="$LCTL get_param -n $HSM_PARAM.agent_actions" + local cmd="$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.actions" cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d=" - wait_result $SINGLEMDS "$cmd" $state 100 || - error "request on $fid is not $state" + wait_result $mds "$cmd" $state 100 || + error "request on $fid is not $state on $mds" } get_request_state() { local fid=$1 local request=$2 - do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.agent_actions |"\ + do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.actions |"\ "awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d=" } @@ -485,14 +574,14 @@ get_request_count() { local fid=$1 local request=$2 - do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.agent_actions |"\ + do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.actions |"\ "awk -vn=0 '/'$fid'.*action='$request'/ {n++}; END {print n}'" } wait_all_done() { local timeout=$1 - local cmd="$LCTL get_param -n $HSM_PARAM.agent_actions" + local cmd="$LCTL get_param -n $HSM_PARAM.actions" cmd+=" | egrep 'WAITING|STARTED'" wait_result $SINGLEMDS "$cmd" "" $timeout || @@ -504,8 +593,8 @@ wait_for_grace_delay() { sleep $val } -MDT0=$($LCTL get_param -n mdc.*.mds_server_uuid | - awk '{gsub(/_UUID/,""); print $1}' | head -1) +# populate MDT device array +get_mdt_devices # initiate variables init_agt_vars @@ -1066,6 +1155,53 @@ test_12n() { } run_test 12n "Import/implicit restore/release" +test_12o() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "release of $f failed" + +#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 + do_facet $SINGLEMDS lctl set_param fail_loc=0x152 + + # set no retry action mode + cdt_set_no_retry + + diff -q /etc/hosts $f + local st=$? + + # we check we had a restore failure + wait_request_state $fid RESTORE FAILED + + [[ $st -eq 0 ]] && error "Restore must fail" + + # remove no retry action mode + cdt_clear_no_retry + + # check file is still released + check_hsm_flags $f "0x0000000d" + + # retry w/o failure injection + do_facet $SINGLEMDS lctl set_param fail_loc=0 + + diff -q /etc/hosts $f + st=$? + + # we check we had a restore done + wait_request_state $fid RESTORE SUCCEED + + [[ $st -eq 0 ]] || error "Restored file differs" + + copytool_cleanup +} +run_test 12o "Layout-swap failure during Restore leaves file released" + test_13() { # test needs a running copytool copytool_setup @@ -1082,8 +1218,8 @@ test_13() { CURR_FILE="$CURR_DIR/$tfile.$f" # write file-specific data do_facet $SINGLEAGT \ - echo "d=$d, f=$f, dir=$CURR_DIR, "\ - "file=$CURR_FILE" > $CURR_FILE + "echo d=$d, f=$f, dir=$CURR_DIR, "\ + "file=$CURR_FILE > $CURR_FILE" done done # import to Lustre @@ -1260,19 +1396,41 @@ test_21() { local fid=$(make_small $f) check_hsm_flags $f "0x00000000" + # LU-4388/LU-4389 - ZFS does not report full number of blocks + # used until file is flushed to disk + if [ $(facet_fstype ost1) == "zfs" ]; then + # this causes an OST_SYNC rpc to be sent + dd if=/dev/zero of=$f bs=512 count=1 oflag=sync conv=notrunc,fsync + # clear locks to reread file data + cancel_lru_locks osc + fi + + local orig_size=$(stat -c "%s" $f) + local orig_blocks=$(stat -c "%b" $f) + + start_full_debug_logging + $LFS hsm_archive $f || error "could not archive file" wait_request_state $fid ARCHIVE SUCCEED - [ $(stat -c "%b" $f) -ne "1" ] || error "wrong block number" - local sz=$(stat -c "%s" $f) - [ $sz -ne "0" ] || error "file size should not be zero" + local blocks=$(stat -c "%b" $f) + [ $blocks -eq $orig_blocks ] || + error "$f: wrong block number after archive: " \ + "$blocks != $orig_blocks" + local size=$(stat -c "%s" $f) + [ $size -eq $orig_size ] || + error "$f: wrong size after archive: $size != $orig_size" # Release and check states $LFS hsm_release $f || error "could not release file" check_hsm_flags $f "0x0000000d" - [ $(stat -c "%b" $f) -eq "1" ] || error "wrong block number" - [ $(stat -c "%s" $f) -eq $sz ] || error "wrong file size" + blocks=$(stat -c "%b" $f) + [ $blocks -gt 5 ] && + error "$f: too many blocks after release: $blocks > 5" + size=$(stat -c "%s" $f) + [ $size -ne $orig_size ] && + error "$f: wrong size after release: $size != $orig_size" # Check we can release an file without stripe info f=$f.nolov @@ -1290,6 +1448,8 @@ test_21() { $LFS hsm_release $f || fail "second release should succeed" check_hsm_flags $f "0x0000000d" + stop_full_debug_logging + copytool_cleanup } run_test 21 "Simple release tests" @@ -1827,6 +1987,41 @@ test_30b() { } run_test 30b "Restore at exec (release case)" +test_30c() { + needclients 2 || return 0 + + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/SLEEP + local fid=$(copy_file /bin/sleep $f) + chmod 755 $f + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f + check_hsm_flags $f "0x0000000d" + # set no retry action mode + cdt_set_no_retry + do_node $CLIENT2 "$f 10" & + local pid=$! + sleep 3 + echo 'Hi!' > $f + [[ $? == 0 ]] && error "Update during exec of released file must fail" + wait $pid + [[ $? == 0 ]] || error "Execution failed during run" + cmp /bin/sleep $f + [[ $? == 0 ]] || error "Binary overwritten during exec" + + # cleanup + # remove no try action mode + cdt_clear_no_retry + check_hsm_flags $f "0x00000009" + + copytool_cleanup +} +run_test 30c "Update during exec of released file must fail" + restore_and_check_size() { local f=$1 local fid=$2 @@ -1852,11 +2047,11 @@ restore_and_check_size() { cpt=$((cpt + 1)) done if [[ $cpt -lt 10 ]]; then - echo " restore is too long" - else echo " "done + else + echo " restore is too long" + wait_request_state $fid RESTORE SUCCEED fi - wait_request_state $fid RESTORE SUCCEED return $err } @@ -2086,7 +2281,13 @@ test_40() { fid=$(copy_file /etc/hosts $f.$p.$i) done done - copytool_setup + # force copytool to use a local/temp archive dir to ensure best + # performance vs remote/NFS mounts used in auto-tests + if df --local $HSM_ARCHIVE >/dev/null 2>&1 ; then + copytool_setup + else + copytool_setup $SINGLEAGT $MOUNT $HSM_ARCHIVE_NUMBER $TMP/$tdir + fi # to be sure wait_all_done will not be mislead by previous tests cdt_purge wait_for_grace_delay @@ -2382,7 +2583,7 @@ double_verify_reset_hsm_param() { test_100() { double_verify_reset_hsm_param loop_period double_verify_reset_hsm_param grace_delay - double_verify_reset_hsm_param request_timeout + double_verify_reset_hsm_param active_request_timeout double_verify_reset_hsm_param max_requests double_verify_reset_hsm_param default_archive_id } @@ -2412,7 +2613,7 @@ test_103() { echo "Current requests" local res=$(do_facet $SINGLEMDS "$LCTL get_param -n\ - $HSM_PARAM.agent_actions |\ + $HSM_PARAM.actions |\ grep -v CANCELED | grep -v SUCCEED | grep -v FAILED") [[ -z "$res" ]] || error "Some request have not been canceled" @@ -2434,7 +2635,7 @@ test_104() { cdt_disable $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER --data $DATA $f local data1=$(do_facet $SINGLEMDS "$LCTL get_param -n\ - $HSM_PARAM.agent_actions |\ + $HSM_PARAM.actions |\ grep $fid | cut -f16 -d=") cdt_enable @@ -2455,12 +2656,12 @@ test_105() { $LFS hsm_archive $DIR/$tdir/$i done local reqcnt1=$(do_facet $SINGLEMDS "$LCTL get_param -n\ - $HSM_PARAM.agent_actions |\ + $HSM_PARAM.actions |\ grep WAITING | wc -l") cdt_restart cdt_disable local reqcnt2=$(do_facet $SINGLEMDS "$LCTL get_param -n\ - $HSM_PARAM.agent_actions |\ + $HSM_PARAM.actions |\ grep WAITING | wc -l") cdt_enable cdt_purge @@ -2470,28 +2671,70 @@ test_105() { } run_test 105 "Restart of coordinator" -test_106() { - # test needs a running copytool - copytool_setup +get_agent_by_uuid_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + do_facet $mds "$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.agents |\ + grep $uuid" +} + +check_agent_registered_by_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + local agent=$(get_agent_by_uuid_mdt $uuid $mdtidx) + if [[ ! -z "$agent" ]]; then + echo "found agent $agent on $mds" + else + error "uuid $uuid not found in agent list on $mds" + fi +} +check_agent_unregistered_by_mdt() { + local uuid=$1 + local mdtidx=$2 + local mds=mds$(($mdtidx + 1)) + local agent=$(get_agent_by_uuid_mdt $uuid $mdtidx) + if [[ -z "$agent" ]]; then + echo "uuid not found in agent list on $mds" + else + error "uuid found in agent list on $mds: $agent" + fi +} + +check_agent_registered() { + local uuid=$1 + local mdsno + for mdsno in $(seq 1 $MDSCOUNT); do + check_agent_registered_by_mdt $uuid $((mdsno - 1)) + done +} + +check_agent_unregistered() { + local uuid=$1 + local mdsno + for mdsno in $(seq 1 $MDSCOUNT); do + check_agent_unregistered_by_mdt $uuid $((mdsno - 1)) + done +} + +test_106() { local uuid=$(do_rpc_nodes $(facet_active_host $SINGLEAGT) \ get_client_uuid $MOUNT | cut -d' ' -f2) - local agent=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.agents | - grep $uuid) + + copytool_setup + check_agent_registered $uuid + + search_copytools || error "No copytool found" + copytool_cleanup - [[ ! -z "$agent" ]] || error "My uuid $uuid not found in agent list" - local agent=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.agents | - grep $uuid) - [[ -z "$agent" ]] || - error "My uuid $uuid still found in agent list,"\ - " after copytool shutdown" + check_agent_unregistered $uuid + copytool_setup - local agent=$(do_facet $SINGLEMDS $LCTL get_param -n $HSM_PARAM.agents | - grep $uuid) + check_agent_registered $uuid + copytool_cleanup - [[ ! -z "$agent" ]] || - error "My uuid $uuid not found in agent list after"\ - " copytool restart" } run_test 106 "Copytool register/unregister" @@ -2767,7 +3010,7 @@ test_220() { $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f wait_request_state $fid ARCHIVE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) changelog_cleanup local target=0x0 @@ -2794,7 +3037,7 @@ test_221() { wait_request_state $fid ARCHIVE CANCELED wait_request_state $fid CANCEL SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0x7d [[ $flags == $target ]] || error "Changelog flag is $flags not $target" @@ -2819,7 +3062,7 @@ test_222a() { $LFS hsm_restore $f wait_request_state $fid RESTORE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0x80 [[ $flags == $target ]] || error "Changelog flag is $flags not $target" @@ -2845,7 +3088,7 @@ test_222b() { wait_request_state $fid RESTORE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0x80 [[ $flags == $target ]] || error "Changelog flag is $flags not $target" @@ -2874,7 +3117,7 @@ test_223a() { wait_request_state $fid RESTORE CANCELED wait_request_state $fid CANCEL SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0xfd [[ $flags == $target ]] || @@ -2903,7 +3146,7 @@ test_223b() { wait_request_state $fid RESTORE CANCELED wait_request_state $fid CANCEL SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -1) local target=0xfd [[ $flags == $target ]] || @@ -2929,7 +3172,7 @@ test_224() { $LFS hsm_remove $f wait_request_state $fid REMOVE SUCCEED - local flags=$(changelog_get_flags $MDT0 HSM $fid | tail -1) + local flags=$(changelog_get_flags ${MDT[0]} HSM $fid | tail -n 1) local target=0x200 [[ $flags == $target ]] || @@ -2965,9 +3208,9 @@ test_225() { wait_request_state $fid REMOVE CANCELED wait_request_state $fid CANCEL SUCCEED - flags=$(changelog_get_flags $MDT0 RENME $fid2) - local flags=$($LFS changelog $MDT0 | grep HSM | grep $fid | tail -1 | - awk '{print $5}') + flags=$(changelog_get_flags ${MDT[0]} RENME $fid2) + local flags=$($LFS changelog ${MDT[0]} | grep HSM | grep $fid | + tail -n 1 | awk '{print $5}') local target=0x27d [[ $flags == $target ]] || @@ -2999,7 +3242,7 @@ test_226() { rm $f1 || error "rm $f1 failed" - local flags=$(changelog_get_flags $MDT0 UNLNK $fid1) + local flags=$(changelog_get_flags ${MDT[0]} UNLNK $fid1) local target=0x3 [[ $flags == $target ]] || @@ -3007,7 +3250,7 @@ test_226() { mv $f3 $f2 || error "mv $f3 $f2 failed" - flags=$(changelog_get_flags $MDT0 RENME $fid2) + flags=$(changelog_get_flags ${MDT[0]} RENME $fid2) target=0x3 [[ $flags == $target ]] || @@ -3027,7 +3270,7 @@ check_flags_changes() { local target=0x280 $LFS hsm_set --$hsm_flag $f || error "Cannot set $hsm_flag on $f" - local flags=($(changelog_get_flags $MDT0 HSM $fid)) + local flags=($(changelog_get_flags ${MDT[0]} HSM $fid)) local seen=${#flags[*]} cnt=$((fst + cnt)) [[ $seen == $cnt ]] || @@ -3038,7 +3281,7 @@ check_flags_changes() { $LFS hsm_clear --$hsm_flag $f || error "Cannot clear $hsm_flag on $f" - flags=($(changelog_get_flags $MDT0 HSM $fid)) + flags=($(changelog_get_flags ${MDT[0]} HSM $fid)) seen=${#flags[*]} cnt=$(($cnt + 1)) [[ $cnt == $seen ]] || @@ -3078,10 +3321,9 @@ test_228() { # test needs a running copytool copytool_setup - dd if=/dev/urandom of=$DIR/$tfile bs=1M count=1 conv=sync || - error "creating $DIR/$tfile" + local fid=$(make_small_sync $DIR/$tfile) $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $DIR/$tfile - wait_request_state $(path2fid $DIR/$tfile) ARCHIVE SUCCEED + wait_request_state $fid ARCHIVE SUCCEED $LFS hsm_release $DIR/$tfile check_hsm_flags $DIR/$tfile "0x0000000d" @@ -3098,13 +3340,15 @@ test_228() { $LFS hsm_release $DIR/$tfile check_hsm_flags $DIR/$tfile "0x0000000d" - mkdir $DIR/$tdir + mkdir -p $DIR/$tdir || error "mkdir $tdir failed" tar cf - --sparse $DIR/$tfile | tar xvf - -C $DIR/$tdir || error "tar failed" cmp $DIR/$tfile $DIR/$tdir/$DIR/$tfile || error "comparing untarred $DIR/$tfile" + rm -f $DIR/$tfile $DIR/$tfile.2 || + error "rm $DIR/$tfile or $DIR/$tfile.2 failed" copytool_cleanup } run_test 228 "On released file, return extend to FIEMAP. For [cp,tar] --sparse" @@ -3133,12 +3377,12 @@ test_250() { while [[ $cnt != 0 || $wt != 0 ]]; do sleep 1 cnt=$(do_facet $SINGLEMDS "$LCTL get_param -n\ - $HSM_PARAM.agent_actions |\ + $HSM_PARAM.actions |\ grep STARTED | grep -v CANCEL | wc -l") [[ $cnt -le $maxrequest ]] || error "$cnt > $maxrequest too many started requests" wt=$(do_facet $SINGLEMDS "$LCTL get_param\ - $HSM_PARAM.agent_actions |\ + $HSM_PARAM.actions |\ grep WAITING | wc -l") echo "max=$maxrequest started=$cnt waiting=$wt" done @@ -3157,8 +3401,8 @@ test_251() { cdt_disable # to have a short test - local old_to=$(get_hsm_param request_timeout) - set_hsm_param request_timeout 4 + local old_to=$(get_hsm_param active_request_timeout) + set_hsm_param active_request_timeout 4 # to be sure the cdt will wake up frequently so # it will be able to cancel the "old" request local old_loop=$(get_hsm_param loop_period) @@ -3170,7 +3414,7 @@ test_251() { sleep 5 wait_request_state $fid ARCHIVE CANCELED - set_hsm_param request_timeout $old_to + set_hsm_param active_request_timeout $old_to set_hsm_param loop_period $old_loop copytool_cleanup @@ -3227,7 +3471,11 @@ test_302() { cdt_shutdown set_hsm_param default_archive_id $new -P - fail $SINGLEMDS + + local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do + fail mds${mdtno} + done # check cdt is on cdt_check_state enabled @@ -3241,6 +3489,168 @@ test_302() { } run_test 302 "HSM tunnable are persistent when CDT is off" +test_400() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + copytool_setup + + mkdir -p $DIR/$tdir + + local dir_mdt0=$DIR/$tdir/mdt0 + local dir_mdt1=$DIR/$tdir/mdt1 + + # create 1 dir per MDT + $LFS mkdir -i 0 $dir_mdt0 || error "lfs mkdir" + $LFS mkdir -i 1 $dir_mdt1 || error "lfs mkdir" + + # create 1 file in each MDT + local fid1=$(make_small $dir_mdt0/$tfile) + local fid2=$(make_small $dir_mdt1/$tfile) + + # check that hsm request on mdt0 is sent to the right MDS + $LFS hsm_archive $dir_mdt0/$tfile || error "lfs hsm_archive" + wait_request_state $fid1 ARCHIVE SUCCEED 0 && + echo "archive successful on mdt0" + + # check that hsm request on mdt1 is sent to the right MDS + $LFS hsm_archive $dir_mdt1/$tfile || error "lfs hsm_archive" + wait_request_state $fid2 ARCHIVE SUCCEED 1 && + echo "archive successful on mdt1" + + copytool_cleanup + # clean test files and directories + rm -rf $dir_mdt0 $dir_mdt1 +} +run_test 400 "Single request is sent to the right MDT" + +test_401() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + copytool_setup + + mkdir -p $DIR/$tdir + + local dir_mdt0=$DIR/$tdir/mdt0 + local dir_mdt1=$DIR/$tdir/mdt1 + + # create 1 dir per MDT + $LFS mkdir -i 0 $dir_mdt0 || error "lfs mkdir" + $LFS mkdir -i 1 $dir_mdt1 || error "lfs mkdir" + + # create 1 file in each MDT + local fid1=$(make_small $dir_mdt0/$tfile) + local fid2=$(make_small $dir_mdt1/$tfile) + + # check that compound requests are shunt to the rights MDTs + $LFS hsm_archive $dir_mdt0/$tfile $dir_mdt1/$tfile || + error "lfs hsm_archive" + wait_request_state $fid1 ARCHIVE SUCCEED 0 && + echo "archive successful on mdt0" + wait_request_state $fid2 ARCHIVE SUCCEED 1 && + echo "archive successful on mdt1" + + copytool_cleanup + # clean test files and directories + rm -rf $dir_mdt0 $dir_mdt1 +} +run_test 401 "Compound requests split and sent to their respective MDTs" + +mdc_change_state() # facet, MDT_pattern, activate|deactivate +{ + local facet=$1 + local pattern="$2" + local state=$3 + local node=$(facet_active_host $facet) + local mdc + for mdc in $(do_facet $facet "$LCTL dl | grep -E ${pattern}-mdc" | + awk '{print $4}'); do + echo "$3 $mdc on $node" + do_facet $facet "$LCTL --device $mdc $state" || return 1 + done +} + +test_402() { + # make sure there is no running copytool + copytool_cleanup + + # deactivate all mdc on agent1 + mdc_change_state $SINGLEAGT "MDT000." "deactivate" + + copytool_setup $SINGLEAGT + + check_agent_unregistered "uuid" # match any agent + + # no expected running copytool + search_copytools $agent && error "Copytool start should have failed" + + # reactivate MDCs + mdc_change_state $SINGLEAGT "MDT000." "activate" +} +run_test 402 "Copytool start fails if all MDTs are inactive" + +test_403() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + # make sure there is no running copytool + copytool_cleanup + + local agent=$(facet_active_host $SINGLEAGT) + local uuid=$(do_rpc_nodes $agent get_client_uuid | cut -d' ' -f2) + + # deactivate all mdc for MDT0001 + mdc_change_state $SINGLEAGT "MDT0001" "deactivate" + + copytool_setup + # check the agent is registered on MDT0000, and not on MDT0001 + check_agent_registered_by_mdt $uuid 0 + check_agent_unregistered_by_mdt $uuid 1 + + # check running copytool process + search_copytools $agent || error "No running copytools on $agent" + + # reactivate all mdc for MDT0001 + mdc_change_state $SINGLEAGT "MDT0001" "activate" + + # make sure the copytool is now registered to all MDTs + check_agent_registered $uuid + + copytool_cleanup +} +run_test 403 "Copytool starts with inactive MDT and register on reconnect" + +test_404() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + + copytool_setup + + # create files on both MDT0000 and MDT0001 + mkdir -p $DIR/$tdir + + local dir_mdt0=$DIR/$tdir/mdt0 + $LFS mkdir -i 0 $dir_mdt0 || error "lfs mkdir" + + # create 1 file on mdt0 + local fid1=$(make_small $dir_mdt0/$tfile) + + # deactivate all mdc for MDT0001 + mdc_change_state $SINGLEAGT "MDT0001" "deactivate" + + # send an HSM request for files in MDT0000 + $LFS hsm_archive $dir_mdt0/$tfile || error "lfs hsm_archive" + + # check for completion of files in MDT0000 + wait_request_state $fid1 ARCHIVE SUCCEED 0 && + echo "archive successful on mdt0" + + # reactivate all mdc for MDT0001 + mdc_change_state $SINGLEAGT "MDT0001" "activate" + + copytool_cleanup + # clean test files and directories + rm -rf $dir_mdt0 +} +run_test 404 "Inactive MDT does not block requests for active MDTs" + copytool_cleanup complete $SECONDS