Whamcloud - gitweb
LU-4093 tests: prevent zombie requests when stopping CT
[fs/lustre-release.git] / lustre / tests / sanity-hsm.sh
index 37a66ce..8b89a18 100644 (file)
@@ -11,12 +11,11 @@ SRCDIR=$(dirname $0)
 export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/utils:$PATH:/sbin:/usr/sbin
 
 ONLY=${ONLY:-"$*"}
-# bug number for skipped test:
+# bug number for skipped test:    3815     3939
+ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 34 35 36 40"
+# bug number for skipped test:4178         4176
+ALWAYS_EXCEPT="$ALWAYS_EXCEPT 200 221 223b 31a"
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
-# skip test cases failed before landing - Jinshan
-
-ALWAYS_EXCEPT="$SANITY_HSM_EXCEPT 31a 34 35 36"
-ALWAYS_EXCEPT="$ALWAYS_EXCEPT 200 201 221 223a 223b 225"
 
 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
 
@@ -164,10 +163,34 @@ copytool_setup() {
 copytool_cleanup() {
        trap - EXIT
        local agents=${1:-$(facet_active_host $SINGLEAGT)}
+       local mdtno
+       local idx
+       local oldstate
+       local mdt_hsmctrl
 
        do_nodesv $agents "pkill -INT -x $HSMTOOL_BASE" || return 0
        sleep 1
        echo "Copytool is stopped on $agents"
+
+       # clean all CDTs orphans requests from previous tests
+       # that would otherwise need to timeout to clear.
+       for mdtno in $(seq 1 $MDSCOUNT); do
+               idx=$(($mdtno - 1))
+               mdt_hsmctrl="mdt.$FSNAME-MDT000${idx}.hsm_control"
+               oldstate=$(do_facet mds${mdtno} "$LCTL get_param -n " \
+                                  "$MDT_HSMCTRL")
+               # skip already stop[ed,ing] CDTs
+               echo $oldstate | grep stop || continue
+
+               do_facet mds${mdtno} "$LCTL set_param $mdt_hsmctrl=shutdown"
+               wait_result mds${mdtno} "$LCTL get_param -n $mdt_hsmctrl" \
+                       "stopped" 20 ||
+                       error "mds${mdtno} cdt state is not stopped"
+               do_facet mds${mdtno} "$LCTL set_param $mdt_hsmctrl=$oldstate"
+               wait_result mds${mdtno} "$LCTL get_param -n $mdt_hsmctrl" \
+                       "$oldstate" 20 ||
+                       error "mds${mdtno} cdt state is not $oldstate"
+       done
 }
 
 copytool_suspend() {
@@ -179,7 +202,7 @@ copytool_suspend() {
 
 copytool_remove_backend() {
        local fid=$1
-       local be=$(find $HSM_ARCHIVE -name $fid)
+       local be=$(do_facet $SINGLEAGT find $HSM_ARCHIVE -name $fid)
        echo "Remove from backend: $fid = $be"
        do_facet $SINGLEAGT rm -f $be
 }
@@ -401,7 +424,8 @@ make_small() {
 }
 
 cleanup_large_files() {
-       local ratio=$(df $MOUNT |awk '{print $5}' |sed 's/%//g' |grep -v Use)
+       local ratio=$(df -P $MOUNT | tail -1 | awk '{print $5}' |
+                     sed 's/%//g')
        [ $ratio -gt 50 ] && find $MOUNT -size +10M -exec rm -f {} \;
 }
 
@@ -466,7 +490,7 @@ wait_request_state() {
        local request=$2
        local state=$3
 
-       local cmd="$LCTL get_param -n $HSM_PARAM.agent_actions"
+       local cmd="$LCTL get_param -n $HSM_PARAM.actions"
        cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d="
 
        wait_result $SINGLEMDS "$cmd" $state 100 ||
@@ -477,7 +501,7 @@ get_request_state() {
        local fid=$1
        local request=$2
 
-       do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.agent_actions |"\
+       do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.actions |"\
                "awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d="
 }
 
@@ -485,14 +509,14 @@ get_request_count() {
        local fid=$1
        local request=$2
 
-       do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.agent_actions |"\
+       do_facet $SINGLEMDS "$LCTL get_param -n $HSM_PARAM.actions |"\
                "awk -vn=0 '/'$fid'.*action='$request'/ {n++}; END {print n}'"
 }
 
 wait_all_done() {
        local timeout=$1
 
-       local cmd="$LCTL get_param -n $HSM_PARAM.agent_actions"
+       local cmd="$LCTL get_param -n $HSM_PARAM.actions"
        cmd+=" | egrep 'WAITING|STARTED'"
 
        wait_result $SINGLEMDS "$cmd" "" $timeout ||
@@ -1082,8 +1106,8 @@ test_13() {
                        CURR_FILE="$CURR_DIR/$tfile.$f"
                        # write file-specific data
                        do_facet $SINGLEAGT \
-                               echo "d=$d, f=$f, dir=$CURR_DIR, "\
-                                    "file=$CURR_FILE" > $CURR_FILE
+                               "echo d=$d, f=$f, dir=$CURR_DIR, "\
+                                       "file=$CURR_FILE > $CURR_FILE"
                done
        done
        # import to Lustre
@@ -1827,6 +1851,41 @@ test_30b() {
 }
 run_test 30b "Restore at exec (release case)"
 
+test_30c() {
+       needclients 2 || return 0
+
+       # test needs a running copytool
+       copytool_setup
+
+       mkdir -p $DIR/$tdir
+       local f=$DIR/$tdir/SLEEP
+       local fid=$(copy_file /bin/sleep $f)
+       chmod 755 $f
+       $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f
+       wait_request_state $fid ARCHIVE SUCCEED
+       $LFS hsm_release $f
+       check_hsm_flags $f "0x0000000d"
+       # set no retry action mode
+       cdt_set_no_retry
+       do_node $CLIENT2 "$f 10" &    # exec the copied sleep binary on $CLIENT2
+       local pid=$!
+       sleep 3
+       echo 'Hi!' > $f               # try to overwrite it while it is running
+       [[ $? == 0 ]] && error "Update during exec of released file must fail"
+       wait $pid                     # collect the background run's exit status
+       [[ $? == 0 ]] || error "Execution failed during run"
+       cmp /bin/sleep $f
+       [[ $? == 0 ]] || error "Binary overwritten during exec"
+
+       # cleanup
+       # remove no retry action mode
+       cdt_clear_no_retry
+       check_hsm_flags $f "0x00000009"
+
+       copytool_cleanup
+}
+run_test 30c "Update during exec of released file must fail"
+
 restore_and_check_size() {
        local f=$1
        local fid=$2
@@ -2382,7 +2441,7 @@ double_verify_reset_hsm_param() {
 test_100() {
        double_verify_reset_hsm_param loop_period
        double_verify_reset_hsm_param grace_delay
-       double_verify_reset_hsm_param request_timeout
+       double_verify_reset_hsm_param active_request_timeout
        double_verify_reset_hsm_param max_requests
        double_verify_reset_hsm_param default_archive_id
 }
@@ -2412,7 +2471,7 @@ test_103() {
 
        echo "Current requests"
        local res=$(do_facet $SINGLEMDS "$LCTL get_param -n\
-                       $HSM_PARAM.agent_actions |\
+                       $HSM_PARAM.actions |\
                        grep -v CANCELED | grep -v SUCCEED | grep -v FAILED")
 
        [[ -z "$res" ]] || error "Some request have not been canceled"
@@ -2434,7 +2493,7 @@ test_104() {
        cdt_disable
        $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER --data $DATA $f
        local data1=$(do_facet $SINGLEMDS "$LCTL get_param -n\
-                       $HSM_PARAM.agent_actions |\
+                       $HSM_PARAM.actions |\
                        grep $fid | cut -f16 -d=")
        cdt_enable
 
@@ -2455,12 +2514,12 @@ test_105() {
                $LFS hsm_archive $DIR/$tdir/$i
        done
        local reqcnt1=$(do_facet $SINGLEMDS "$LCTL get_param -n\
-                       $HSM_PARAM.agent_actions |\
+                       $HSM_PARAM.actions |\
                        grep WAITING | wc -l")
        cdt_restart
        cdt_disable
        local reqcnt2=$(do_facet $SINGLEMDS "$LCTL get_param -n\
-                       $HSM_PARAM.agent_actions |\
+                       $HSM_PARAM.actions |\
                        grep WAITING | wc -l")
        cdt_enable
        cdt_purge
@@ -3133,12 +3192,12 @@ test_250() {
        while [[ $cnt != 0 || $wt != 0 ]]; do
                sleep 1
                cnt=$(do_facet $SINGLEMDS "$LCTL get_param -n\
-                       $HSM_PARAM.agent_actions |\
+                       $HSM_PARAM.actions |\
                        grep STARTED | grep -v CANCEL | wc -l")
                [[ $cnt -le $maxrequest ]] ||
                        error "$cnt > $maxrequest too many started requests"
                wt=$(do_facet $SINGLEMDS "$LCTL get_param\
-                       $HSM_PARAM.agent_actions |\
+                       $HSM_PARAM.actions |\
                        grep WAITING | wc -l")
                echo "max=$maxrequest started=$cnt waiting=$wt"
        done
@@ -3157,8 +3216,8 @@ test_251() {
 
        cdt_disable
        # to have a short test
-       local old_to=$(get_hsm_param request_timeout)
-       set_hsm_param request_timeout 4
+       local old_to=$(get_hsm_param active_request_timeout)
+       set_hsm_param active_request_timeout 4
        # to be sure the cdt will wake up frequently so
        # it will be able to cancel the "old" request
        local old_loop=$(get_hsm_param loop_period)
@@ -3170,7 +3229,7 @@ test_251() {
        sleep 5
        wait_request_state $fid ARCHIVE CANCELED
 
-       set_hsm_param request_timeout $old_to
+       set_hsm_param active_request_timeout $old_to
        set_hsm_param loop_period $old_loop
 
        copytool_cleanup