set -e
-# bug 5494 7288 5493
-ALWAYS_EXCEPT="24 27 52 $RECOVERY_SMALL_EXCEPT"
+# bug 5494 5493
+ALWAYS_EXCEPT="24 52 $RECOVERY_SMALL_EXCEPT"
PTLDEBUG=${PTLDEBUG:--1}
LUSTRE=${LUSTRE:-`dirname $0`/..}
. $LUSTRE/tests/test-framework.sh
init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+init_logging
-if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
- CONFIG_EXCEPTIONS="52"
- echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
- echo "Except the tests: $CONFIG_EXCEPTIONS"
- ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
-fi
-
-remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+require_dsh_mds || exit 0
# also long tests: 19, 21a, 21e, 21f, 23, 27
# 1 2.5 2.5 4 4 (min)
remote_ost_nodsh && skip "remote OST with nodsh" && return 0
# With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max
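+# lower at_max on ost1 to $TIMEOUT so the bulk timeout fires promptly;
+# the saved value is restored after the resend check below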
- if at_is_valid && at_is_enabled; then
+ if at_is_enabled; then
at_max_saved=$(at_max_get ost1)
at_max_set $TIMEOUT ost1
fi
# OST bulk will time out here, client retries
do_facet ost1 lctl set_param fail_loc=0x80000503
# need to ensure we send an RPC
- do_facet client cp /etc/termcap $DIR/$tfile
+ do_facet client cp $SAMPLE_FILE $DIR/$tfile
sync
# with AT, client will wait adaptive_max*factor+net_latency before resending
do_facet ost1 lctl set_param fail_loc=0
do_facet client "df $DIR"
# expect cmp to succeed, client resent bulk
- do_facet client "cmp /etc/termcap $DIR/$tfile" || return 3
+ do_facet client "cmp $SAMPLE_FILE $DIR/$tfile" || return 3
do_facet client "rm $DIR/$tfile" || return 4
- [ $at_max_saved -ne 0 ] && $(at_max_set $at_max_saved ost1)
+ [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved ost1
return 0
}
run_test 17 "timeout bulk get, don't evict client (2732)"
test_18a() {
- [ -z ${ost2_svc} ] && skip "needs 2 osts" && return 0
+ [ -z "${ost2_svc}" ] && skip_env "needs 2 osts" && return 0
do_facet client mkdir -p $DIR/$tdir
f=$DIR/$tdir/$tfile
remote_ost_nodsh && skip "remote OST with nodsh" && return 0
remote_mds || { skip "local MDS" && return 0; }
+ if [ $(facet_host mgs) = $(facet_host ost1) ]; then
+ skip "msg and ost1 are at the same node"
+ return 0
+ fi
+
check_timeout || return 1
local OST_NEXP=$(do_facet ost1 lctl get_param -n obdfilter.${ost1_svc}.num_exports | cut -d' ' -f2)
test_26b() { # bug 10140 - evict dead exports by pinger
remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+ if [ $(facet_host mgs) = $(facet_host ost1) ]; then
+ skip "msg and ost1 are at the same node"
+ return 0
+ fi
+
check_timeout || return 1
- client_df
- zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2"
+ clients_up
+ zconf_mount `hostname` $MOUNT2 ||
+ { error "Failed to mount $MOUNT2"; return 2; }
sleep 1 # wait for connections to be established
local MDS_NEXP=$(do_facet $SINGLEMDS lctl get_param -n mdt.${mds1_svc}.num_exports | cut -d' ' -f2)
zconf_umount `hostname` $MOUNT2 -f
+ # PING_INTERVAL max(obd_timeout / 4, 1U)
+ # PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
+
# evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.
# But if there's a race to start the evictor from various obds,
# the loser might have to wait for the next ping.
- # PING_INTERVAL max(obd_timeout / 4, 1U)
- # sleep (2*PING_INTERVAL)
-
- local rc=0
- wait_client_evicted ost1 $OST_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4)) || \
+ # = 9 * PING_INTERVAL + PING_INTERVAL
+ # = 10 PING_INTERVAL = 10 obd_timeout / 4 = 2.5 obd_timeout
+ # let's wait $((TIMEOUT * 3)) # bug 19887
+ wait_client_evicted ost1 $OST_NEXP $((TIMEOUT * 3)) || \
	error "Client was not evicted by ost"
- wait_client_evicted $SINGLEMDS $MDS_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4)) || \
+ wait_client_evicted $SINGLEMDS $MDS_NEXP $((TIMEOUT * 3)) || \
error "Client was not evicted by mds"
}
run_test 26b "evict dead exports"
#define OBD_FAIL_OSC_SHUTDOWN 0x407
do_facet $SINGLEMDS lctl set_param fail_loc=0x80000407
# need to wait for reconnect
- echo -n waiting for fail_loc
- while [ $(do_facet $SINGLEMDS lctl get_param -n fail_loc) -eq -2147482617 ]; do
- sleep 1
- echo -n .
- done
- do_facet $SINGLEMDS lctl get_param -n fail_loc
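+ # fail_loc reads back as a signed 32-bit value: 0x80000407 == -2147482617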
+ echo waiting for fail_loc
+ wait_update_facet $SINGLEMDS "lctl get_param -n fail_loc" "-2147482617"
facet_failover $SINGLEMDS
#no crashes allowed!
kill -USR1 $CLIENT_PID
#define OBD_FAIL_MDS_CLIENT_ADD 0x12f
do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000012f"
# fail once (evicted), reconnect fail (fail_loc), ok
- df || (sleep 10; df) || (sleep 10; df) || error "reconnect failed"
+ client_up || (sleep 10; client_up) || (sleep 10; client_up) || error "reconnect failed"
rm -f $DIR/$tfile
fail $SINGLEMDS # verify MDS last_rcvd can be loaded
}
run_test 28 "handle error adding new clients (bug 6086)"
+test_29a() { # bug 22273 - error adding new clients
+ #define OBD_FAIL_TGT_CLIENT_ADD 0x711
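+ # (the 0x80000000 bit is OBD_FAIL_ONCE: the failure triggers only once)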
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000711"
+ # fail abort so client will be new again
+ fail_abort $SINGLEMDS
+ client_up || error "reconnect failed"
+ return 0
+}
+run_test 29a "error adding new clients doesn't cause LBUG (bug 22273)"
+
+test_29b() { # bug 22273 - error adding new clients
+ #define OBD_FAIL_TGT_CLIENT_ADD 0x711
+ do_facet ost1 "lctl set_param fail_loc=0x80000711"
+ # fail abort so client will be new again
+ fail_abort ost1
+ client_up || error "reconnect failed"
+ return 0
+}
+run_test 29b "error adding new clients doesn't cause LBUG (bug 22273)"
+
test_50() {
mkdir -p $DIR/$tdir
# put a load of file creates/writes/deletes
run_test 50 "failover MDS under load"
test_51() {
+ #define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310
+ do_facet ost1 lctl set_param fail_loc=0x00001310
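+ # note: no OBD_FAIL_ONCE (0x80000000) bit, so this fail_loc stays armed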
+
mkdir -p $DIR/$tdir
# put a load of file creates/writes/deletes
writemany -q $DIR/$tdir/$tfile 0 5 &
run_test 59 "Read cancel race on client eviction"
err17935 () {
+ # we assume that all md changes are in the MDT0 changelog
if [ $MDSCOUNT -gt 1 ]; then
error_ignore 17935 $*
else
}
test_60() {
- remote_mds && { skip "remote MDS" && return 0; }
+ MDT0=$($LCTL get_param -n mdc.*.mds_server_uuid | \
+ awk '{gsub(/_UUID/,""); print $1}' | head -1)
NUM_FILES=15000
mkdir -p $DIR/$tdir
- # Enable and clear changelog
- $LCTL conf_param ${mds1_svc}.mdd.changelog=on
- $LCTL set_param -n mdd.*.changelog on
- $LFS changelog_clear $FSNAME 0
+ # Register (and start) changelog
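+ # with -n, changelog_register prints only the new user name (e.g. cl1)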
+ USER=$(do_facet $SINGLEMDS lctl --device $MDT0 changelog_register -n)
+ echo "Registered as $MDT0 changelog user $USER"
- # Create NUM_FILES in the background
+ # Generate a large number of changelog entries
createmany -o $DIR/$tdir/$tfile $NUM_FILES
sync
sleep 5
CLIENT_PID=$!
sleep 1
- # Failover the MDS while creates are happening
+ # Failover the MDS while unlinks are happening
facet_failover $SINGLEMDS
# Wait for unlinkmany to finish
wait $CLIENT_PID
- # Check if NUM_FILES create/unlink events were recorded
+ # Check if all the create/unlink events were recorded
# in the changelog
- $LFS changelog $FSNAME >> $DIR/$tdir/changelog
+ $LFS changelog $MDT0 >> $DIR/$tdir/changelog
local cl_count=$(grep UNLNK $DIR/$tdir/changelog | wc -l)
- echo "$cl_count unlinks in changelog"
-
- [ $cl_count -eq $NUM_FILES ] || err17935 "Recorded ${cl_count} unlinks out
-of $NUM_FILES"
-
- # Also make sure we can clear large changelogs
- lctl set_param -n mdd.*.changelog off
- $LFS changelog_clear $FSNAME 0
-
- cl_count=$($LFS changelog $FSNAME | wc -l)
- [ $cl_count -eq 1 ] || error "Changelog not empty: $cl_count entries"
+ echo "$cl_count unlinks in $MDT0 changelog"
+
+ do_facet $SINGLEMDS lctl --device $MDT0 changelog_deregister $USER
+ USERS=$(( $(do_facet $SINGLEMDS lctl get_param -n \
+ mdd.$MDT0.changelog_users | wc -l) - 2 ))
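+ # (the "- 2" skips the two header lines of the changelog_users output)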
+ if [ $USERS -eq 0 ]; then
+ [ $cl_count -eq $NUM_FILES ] || \
+ err17935 "Recorded ${cl_count} unlinks out of $NUM_FILES"
+ # Also make sure we can clear large changelogs
+ cl_count=$($LFS changelog $FSNAME | wc -l)
+ [ $cl_count -le 2 ] || \
+ error "Changelog not empty: $cl_count entries"
+ else
+ # If there are other users, there may be other unlinks in the log
+ [ $cl_count -ge $NUM_FILES ] || \
+ err17935 "Recorded ${cl_count} unlinks out of $NUM_FILES"
+ echo "$USERS other changelog users; can't verify clear"
+ fi
}
run_test 60 "Add Changelog entries during MDS failover"
+test_61()
+{
+ local mdtosc=$(get_mdtosc_proc_path $SINGLEMDS $FSNAME-OST0000)
+ mdtosc=${mdtosc/-MDT*/-MDT\*}
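+ # wildcard the MDT index so the osc proc path matches on any MDT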
+ local cflags="osc.$mdtosc.connect_flags"
+ do_facet $SINGLEMDS "lctl get_param -n $cflags" |grep -q skip_orphan
+ [ $? -ne 0 ] && skip "don't have skip orphan feature" && return
+
+ mkdir -p $DIR/$tdir || error "mkdir dir $DIR/$tdir failed"
+ # Set the default stripe of $DIR/$tdir to put the files to ost1
+ $LFS setstripe -c 1 --index 0 $DIR/$tdir || error "setstripe $DIR/$tdir failed"
+
+ replay_barrier $SINGLEMDS
+ createmany -o $DIR/$tdir/$tfile-%d 10
+ local oid=`do_facet ost1 "lctl get_param -n obdfilter.${ost1_svc}.last_id"`
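+ # last_id is the highest object id precreated on ost1 before the failure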
+
+ fail_abort $SINGLEMDS
+
+ touch $DIR/$tdir/$tfile
+ local id=`$LFS getstripe $DIR/$tdir/$tfile | awk '($1 == 0 && $2 ~ /^[1-9]+/) {print $2}'`
+ [ $id -le $oid ] && error "the orphan objid was reused: $id <= $oid"
+
+ # Cleanup
+ rm -rf $DIR/$tdir
+}
+run_test 61 "Verify to not reuse orphan objects - bug 17025"
+
equals_msg `basename $0`: test complete, cleaning up
check_and_cleanup_lustre
[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true