ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT "
if $SHARED_KEY; then
-# bug number for skipped tests: LU-9795 LU-9795
- ALWAYS_EXCEPT+=" 13 402b "
+# bug number for skipped tests: LU-9795
+ ALWAYS_EXCEPT+=" 402b "
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
fi
[ -n "$FILESET" ] && skip "Not functional for FILESET set"
OPENFILE=${OPENFILE:-openfile}
-MMAP_CAT=${MMAP_CAT:-mmap_cat}
MOUNT_2=${MOUNT_2:-"yes"}
FAIL_ON_ERROR=false
# The exception is a test which needs two separate nodes
CLIENT2=${CLIENT2:-$CLIENT1}
-#
-# In order to test multiple remote HSM agents, a new facet type named "AGT" and
-# the following associated variables are added:
-#
-# AGTCOUNT: number of agents
-# AGTDEV{N}: target HSM mount point (root path of the backend)
-# agt{N}_HOST: hostname of the agent agt{N}
-# SINGLEAGT: facet of the single agent
-#
-# The number of agents is initialized as the number of remote client nodes.
-# By default, only single copytool is started on a remote client/agent. If there
-# was no remote client, then the copytool will be started on the local client.
-#
-init_agt_vars() {
- local n
- local agent
-
- export AGTCOUNT=${AGTCOUNT:-$((CLIENTCOUNT - 1))}
- [[ $AGTCOUNT -gt 0 ]] || AGTCOUNT=1
-
- export SHARED_DIRECTORY=${SHARED_DIRECTORY:-$TMP}
- if [[ $CLIENTCOUNT -gt 1 ]] &&
- ! check_shared_dir $SHARED_DIRECTORY $CLIENTS; then
- skip_env "SHARED_DIRECTORY should be accessible"\
- "on all client nodes"
- exit 0
- fi
-
- # We used to put the HSM archive in $SHARED_DIRECTORY but that
- # meant NFS issues could hose sanity-hsm sessions. So now we
- # use $TMP instead.
- for n in $(seq $AGTCOUNT); do
- eval export AGTDEV$n=\$\{AGTDEV$n:-"$TMP/arc$n"\}
- agent=CLIENT$((n + 1))
- if [[ -z "${!agent}" ]]; then
- [[ $CLIENTCOUNT -eq 1 ]] && agent=CLIENT1 ||
- agent=CLIENT2
- fi
- eval export agt${n}_HOST=\$\{agt${n}_HOST:-${!agent}\}
- done
-
- export SINGLEAGT=${SINGLEAGT:-agt1}
-
- export HSMTOOL=${HSMTOOL:-"lhsmtool_posix"}
- export HSMTOOL_VERBOSE=${HSMTOOL_VERBOSE:-""}
- export HSMTOOL_UPDATE_INTERVAL=${HSMTOOL_UPDATE_INTERVAL:=""}
- export HSMTOOL_EVENT_FIFO=${HSMTOOL_EVENT_FIFO:=""}
- export HSMTOOL_TESTDIR
-
- HSM_ARCHIVE_NUMBER=2
-
- # The test only support up to 10 MDTs
- MDT_PREFIX="mdt.$FSNAME-MDT000"
- HSM_PARAM="${MDT_PREFIX}0.hsm"
-
- # archive is purged at copytool setup
- HSM_ARCHIVE_PURGE=true
-
- # Don't allow copytool error upon start/setup
- HSMTOOL_NOERROR=false
-}
-
-# Get the backend root path for the given agent facet.
-copytool_device() {
- local facet=$1
- local dev=AGTDEV$(facet_number $facet)
-
- echo -n ${!dev}
-}
-
-get_mdt_devices() {
- local mdtno
- # get MDT device for each mdc
- for mdtno in $(seq 1 $MDSCOUNT); do
- local idx=$(($mdtno - 1))
- MDT[$idx]=$($LCTL get_param -n \
- mdc.$FSNAME-MDT000${idx}-mdc-*.mds_server_uuid |
- awk '{gsub(/_UUID/,""); print $1}' | head -n1)
- done
-}
-
search_copytools() {
local hosts=${1:-$(facet_active_host $SINGLEAGT)}
- do_nodesv $hosts "libtool execute pgrep -x $HSMTOOL"
-}
-
-kill_copytools() {
- local hosts=${1:-$(facet_active_host $SINGLEAGT)}
-
- echo "Killing existing copytools on $hosts"
- do_nodesv $hosts "libtool execute killall -q $HSMTOOL" || true
- copytool_continue "$hosts"
+ do_nodesv $hosts "pgrep --pidfile=$HSMTOOL_PID_FILE hsmtool"
}
wait_copytools() {
local wait_timeout=200
local wait_start=$SECONDS
local wait_end=$((wait_start + wait_timeout))
- local sleep_time=100000 # 0.1 second
+ local sleep_time=1
while ((SECONDS < wait_end)); do
if ! search_copytools $hosts; then
fi
echo "copytools still running on $hosts"
- usleep $sleep_time
- [ $sleep_time -lt 32000000 ] && # 3.2 seconds
- sleep_time=$(bc <<< "$sleep_time * 2")
+ sleep $sleep_time
+ [ $sleep_time -lt 5 ] && sleep_time=$((sleep_time + 1))
done
# try to dump Copytool's stack
cmd="cat $test_dir/fifo > $test_dir/events &"
cmd+=" echo \\\$! > $test_dir/monitor_pid"
- if [[ $PDSH == *Rmrsh* ]]; then
- # This is required for pdsh -Rmrsh and its handling of remote
- # shells.
- # Regular ssh and pdsh -Rssh work fine without this
- # backgrounded subshell nonsense.
- (do_node $agent "$cmd") &
- export HSMTOOL_MONITOR_PDSH=$!
+ # This background subshell nonsense is required when pdsh/ssh decides
+ # to wait for the cat process to exit on the remote client
+ (do_node $agent "$cmd") &
+ export HSMTOOL_MONITOR_PDSH=$!
- # Slightly racy, but just making a best-effort to catch obvious
- # problems.
- sleep 1
- ps -p $HSMTOOL_MONITOR_PDSH > /dev/null ||
- error "Failed to start copytool monitor on $agent"
- else
- do_node $agent "$cmd"
- if [ $? != 0 ]; then
- error "Failed to start copytool monitor on $agent"
- fi
+ # Slightly racy, but just making a best-effort to catch obvious
+ # problems.
+ sleep 1
+ do_node $agent "stat $HSMTOOL_MONITOR_DIR/monitor_pid 2>&1 > /dev/null"
+ if [ $? != 0 ]; then
+ error "Failed to start copytool monitor on $agent"
fi
}
{
local fid="$1"
- case "$HSMTOOL" in
- lhsmtool_posix)
- printf "%s" "$(hsm_root)/*/*/*/*/*/*/$fid"
- ;;
+ case "$HSMTOOL_ARCHIVE_FORMAT" in
+ v1)
+ printf "%s" "$(hsm_root)/*/*/*/*/*/*/$fid"
+ ;;
+ v2)
+ printf "%s" "$(hsm_root)/*/$fid"
+ ;;
esac
}
copytool_suspend() {
local agents=${1:-$(facet_active_host $SINGLEAGT)}
- stack_trap \
- "do_nodesv $agents libtool execute pkill -CONT -x '$HSMTOOL' || true" EXIT
- do_nodesv $agents "libtool execute pkill -STOP -x $HSMTOOL" || return 0
+ stack_trap "pkill_copytools $agents CONT || true" EXIT
+ pkill_copytools $agents STOP || return 0
echo "Copytool is suspended on $agents"
}
-copytool_continue() {
- local agents=${1:-$(facet_active_host $SINGLEAGT)}
-
- do_nodesv $agents "libtool execute pkill -CONT -x $HSMTOOL" || return 0
- echo "Copytool is continued on $agents"
-}
-
copytool_remove_backend() {
local fid=$1
local be=$(do_facet $SINGLEAGT find "$(hsm_root)" -name $fid)
[[ -n $fid ]] && cmd+=" | grep '$fid'"
cmd+=" | egrep 'WAITING|STARTED'"
- wait_result $SINGLEMDS "$cmd" "" $timeout ||
+ wait_update_facet --verbose mds1 "$cmd" "" $timeout ||
error "requests did not complete"
}
# Lustre mount-point is mandatory and last parameter on
# copytool cmd-line.
- local mntpnt=$(do_rpc_nodes $agent libtool execute ps -C $HSMTOOL -o args= |
+ local mntpnt=$(do_rpc_nodes $agent \
+ pgrep --pidfile=$HSMTOOL_PID_FILE --list-full hsmtool |
awk '{print $NF}')
[ -n "$mntpnt" ] || error "Found no Agent or with no mount-point "\
"parameter"
error "wrong archive number, $st != $LOCAL_HSM_ARCHIVE_NUMBER"
LOCAL_HSM_ARCHIVE_NUMBER=33
- if [ $(lustre_version_code client) -ge $(version_code 2.11.56) ] &&
- [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.11.56) ]; then
+ if [ "$CLIENT_VERSION" -ge $(version_code 2.11.56) ] &&
+ [ "$MDS1_VERSION" -ge $(version_code 2.11.56) ]; then
		# Lustre 2.11.56 and later supports an unlimited number of
		# archive IDs.
# Test whether setting archive number > 32 is supported
$LFS hsm_set --exists --archive-id $LOCAL_HSM_ARCHIVE_NUMBER $f ||
run_test 1d "Archive, Release and Restore DoM file"
test_1e() {
- [ $(lustre_version_code $SINGLEMDS) -lt $(version_code $SEL_VER) ] &&
+ [ "$MDS1_VERSION" -lt $(version_code $SEL_VER) ] &&
skip "skipped for lustre < $SEL_VER"
mkdir -p $DIR/$tdir
}
run_test 11b "Import a deleted file using its FID"
+test_11c() {
+ pool_add $TESTNAME || error "Pool creation failed"
+ pool_add_targets $TESTNAME 1 1 || error "pool_add_targets failed"
+
+ mkdir -p $DIR/$tdir
+ $LFS setstripe -p "$TESTNAME" $DIR/$tdir
+
+ copy2archive /etc/hosts $tdir/$tfile
+ copytool import $tdir/$tfile $DIR/$tdir/$tfile
+}
+run_test 11c "Import a file to a directory with a pool"
+
test_12a() {
# test needs a running copytool
copytool setup
}
run_test 12q "file attributes are refreshed after restore"
+test_12r() {
+ # test needs a running copytool
+ copytool setup
+
+ mkdir -p $DIR/$tdir
+ local f=$DIR/$tdir/$tfile
+ local fid=$(copy_file /etc/hosts $f)
+
+ $LFS hsm_archive $f || error "archive of $f failed"
+ wait_request_state $fid ARCHIVE SUCCEED
+ $LFS hsm_release $f || error "release of $f failed"
+
+ offset=$(lseek_test -d 7 $f)
+
+ # we check we had a restore done
+ wait_request_state $fid RESTORE SUCCEED
+ [[ $offset == 7 ]] || error "offset $offset != 7"
+}
+run_test 12r "lseek restores released file"
+
test_13() {
local -i i j k=0
for i in {1..10}; do
# LU-4388/LU-4389 - ZFS does not report full number of blocks
# used until file is flushed to disk
- if [ $(facet_fstype ost1) == "zfs" ]; then
+ if [ "$ost1_FSTYPE" == "zfs" ]; then
# this causes an OST_SYNC rpc to be sent
dd if=/dev/zero of=$f bs=512 count=1 oflag=sync conv=notrunc,fsync
# clear locks to reread file data
wait_for_grace_delay
$LFS hsm_archive --filelist $FILELIST ||
error "cannot archive a file list"
- wait_all_done 100
+ wait_all_done 200
$LFS hsm_release --filelist $FILELIST ||
error "cannot release a file list"
$LFS hsm_restore --filelist $FILELIST ||
error "cannot restore a file list"
- wait_all_done 100
+ wait_all_done 200
}
run_test 90 "Archive/restore a file list"
md5sum $f2 &
sleep 2
+ do_facet $SINGLEMDS "$LCTL get_param $HSM_PARAM.actions"
# after umount hsm_actions->O/x/x log shouldn't have
# double RESTORE records like below
#[0x200000401:0x1:0x0]...0x58d03a0d/0x58d03a0c action=RESTORE...WAITING
sleep 30 &&
do_facet $SINGLEMDS "$LCTL get_param $HSM_PARAM.actions"&
fail $SINGLEMDS
+ do_facet $SINGLEMDS $LCTL set_param fail_loc=0
+
+ do_facet $SINGLEMDS "$LCTL get_param $HSM_PARAM.actions"
copytool_continue
- wait_request_state $fid RESTORE SUCCEED
+ wait_all_done 100 $fid
}
run_test 407 "Check for double RESTORE records in llog"
test_500()
{
- [ $MDS1_VERSION -lt $(version_code 2.6.92) ] &&
+ [ "$MDS1_VERSION" -lt $(version_code 2.6.92) ] &&
skip "HSM migrate is not supported"
test_mkdir -p $DIR/$tdir
- if [ $(lustre_version_code client) -lt $(version_code 2.11.56) ] ||
- [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.11.56) ];
+ if [ "$CLIENT_VERSION" -lt $(version_code 2.11.56) ] ||
+ [ "$MDS1_VERSION" -lt $(version_code 2.11.56) ];
then
llapi_hsm_test -d $DIR/$tdir -b ||
error "One llapi HSM test failed"
run_test 500 "various LLAPI HSM tests"
test_600() {
- [ $MDS1_VERSION -lt $(version_code 2.10.58) ] &&
+ [ "$MDS1_VERSION" -lt $(version_code 2.10.58) ] &&
skip "need MDS version at least 2.10.58"
mkdir -p $DIR/$tdir
local llog_reader=$(do_facet mgs "which llog_reader 2> /dev/null")
llog_reader=${llog_reader:-$LUSTRE/utils/llog_reader}
[ -z $(do_facet mgs ls -d $llog_reader 2> /dev/null) ] &&
- skip_env "missing llog_reader" && return
- local fstype=$(facet_fstype mds1)
+ skip_env "missing llog_reader"
mkdir -p $DIR/$tdir
local entry
#remount mds1 as ldiskfs or zfs type
- stack_trap "stop mds1; start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS" EXIT
stop mds1 || error "stop mds1 failed"
+ stack_trap "unmount_fstype mds1; start mds1 $(mdsdevname 1)\
+ $MDS_MOUNT_OPTS" EXIT
mount_fstype mds1 || error "remount mds1 failed"
for ((i = 0; i < 1; i++)); do