Branch b1_4

[fs/lustre-release.git] / lustre / tests / recovery-small.sh
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh

index 91672cd..d7480b6 100755 (executable)
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -2,7 +2,7 @@
  
  set -e
  
-#         bug  2986 
+#         bug  2986
  ALWAYS_EXCEPT="20b"
  
  
@@ -21,7 +21,7 @@ build_test_filter
  # setting SETUP=" " and CLEANUP=" "
  SETUP=${SETUP:-"setup"}
  CLEANUP=${CLEANUP:-"cleanup"}
-
+FORCE=${FORCE:-"--force"}
  
  make_config() {
      rm -f $XMLCONFIG
@@ -66,7 +66,7 @@ fi
  
  if [ "$ONLY" == "cleanup" ]; then
      sysctl -w portals.debug=0 || true
-    cleanup
+    FORCE=--force cleanup
      exit
  fi
  
@@ -181,13 +181,13 @@ run_test 12 "recover from timed out resend in ptlrpcd (b=2494)"
  
  # Bug 113, check that readdir lost recv timeout works.
  test_13() {
-    mkdir /mnt/lustre/readdir
-    touch /mnt/lustre/readdir/newentry
+    mkdir /mnt/lustre/readdir || return 1
+    touch /mnt/lustre/readdir/newentry || return
  # OBD_FAIL_MDS_READPAGE_NET|OBD_FAIL_ONCE
      do_facet mds "sysctl -w lustre.fail_loc=0x80000104"
-    ls /mnt/lustre/readdir || return 1
+    ls /mnt/lustre/readdir || return 3
      do_facet mds "sysctl -w lustre.fail_loc=0"
-    rm -rf /mnt/lustre/readdir
+    rm -rf /mnt/lustre/readdir || return 4
  }
  run_test 13 "mdc_readpage restart test (bug 1138)"
  
@@ -244,8 +244,8 @@ test_17() {
      # OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE
      # client will get evicted here
      sysctl -w lustre.fail_loc=0x80000503
-    do_facet client cp /etc/termcap $DIR/$tfile
-
+    # need to write enough to ensure we send an RPC
+    do_facet client dd if=/dev/zero of=$DIR/$tfile bs=1024k count=2
      sleep $TIMEOUT
      sysctl -w lustre.fail_loc=0
      do_facet client "df $DIR"
@@ -364,19 +364,118 @@ test_20b() {     # bug 2986 - ldlm_handle_enqueue error during open
  }
  run_test 20b "ldlm_handle_enqueue error (should return error)"
  
-test_21() {    # bug 3267 - eviction fails writeback but app doesn't see it
+#b_cray run_test 21a "drop close request while close and open are both in flight"
+#b_cray run_test 21b "drop open request while close and open are both in flight"
+#b_cray run_test 21c "drop both request while close and open are both in flight"
+#b_cray run_test 21d "drop close reply while close and open are both in flight"
+#b_cray run_test 21e "drop open reply while close and open are both in flight"
+#b_cray run_test 21f "drop both reply while close and open are both in flight"
+#b_cray run_test 21g "drop open reply and close request while close and open are both in flight"
+#b_cray run_test 21h "drop open request and close reply while close and open are both in flight"
+#b_cray run_test 22 "drop close request and do mknod"
+#b_cray run_test 23 "client hang when close a file after mds crash"
+
+test_24() {    # bug 2248 - eviction fails writeback but app doesn't see it
         mkdir -p $DIR/$tdir
         cancel_lru_locks OSC
-       multiop $DIR/$tdir/$tfile Owyw_yc &
+       multiop $DIR/$tdir/$tfile Owy_wyc &
         MULTI_PID=$!
         usleep 500
  # OBD_FAIL_PTLRPC_BULK_PUT_NET|OBD_FAIL_ONCE
         sysctl -w lustre.fail_loc=0x80000503
+       usleep 500      # NOTE(review): presumably lets the fail_loc setting settle before the signal - confirm
         kill -USR1 $MULTI_PID
         wait $MULTI_PID
         rc=$?
+       sysctl -w lustre.fail_loc=0x0   # reset fail_loc before evaluating the result
+       client_reconnect        # helper defined outside this hunk; runs before the rc check below
         [ $rc -eq 0 ] && error "multiop didn't fail fsync: rc $rc" || true
  }
-run_test 21 "fsync error (should return error)" 
+run_test 24 "fsync error (should return error)" 
+
+
+test_25a() {
+       mkdir -p $DIR/$tdir
+       # put a load of file creates/writes/deletes for 10 min.
+       do_facet client "writemany -q -a $DIR/$tdir/$tfile 600 5" &
+       CLIENT_PID=$!
+       echo writemany pid $CLIENT_PID
+       sleep 10
+       FAILURE_MODE="SOFT"     # NOTE(review): assigned globally and not restored in this function
+       fail mds
+       # wait for client to reconnect to MDS
+       sleep 60
+       fail mds
+       sleep 60
+       fail mds
+       # client process should see no problems even though MDS went down
+       rc=0
+       wait $CLIENT_PID || rc=$?       # capture the status without tripping "set -e" (same form as test_25c_guts)
+       echo writemany returned $rc
+       return $rc
+}
+run_test 25a "failover MDS under load"
+
+test_25b() {
+       mkdir -p $DIR/$tdir
+       # put a load of file creates/writes/deletes
+       do_facet client "writemany -q -a $DIR/$tdir/$tfile 300 5" &
+       CLIENT_PID=$!
+       echo writemany pid $CLIENT_PID
+       sleep 1
+       FAILURE_MODE="SOFT"     # NOTE(review): assigned globally and not restored in this function
+       facet_failover mds
+       # failover at various points during recovery
+       sleep 1
+       facet_failover mds
+       sleep 5
+       facet_failover mds
+       sleep 10
+       facet_failover mds
+       sleep 20
+       facet_failover mds
+       # client process should see no problems even though MDS went down
+       # and recovery was interrupted
+       rc=0
+       wait $CLIENT_PID || rc=$?       # capture the status without tripping "set -e" (same form as test_25c_guts)
+       echo writemany returned $rc
+       return $rc
+}
+run_test 25b "failover MDS during recovery"
+
+test_25c_guts() {      # one round: start a writemany load, fail the OST, then judge writemany's exit status
+       do_facet client "writemany -q $DIR/$tdir/$tfile 600 5" &
+        CLIENT_PID=$!
+       echo writemany pid $CLIENT_PID
+       sleep 10
+       FAILURE_MODE="SOFT"
+       fail ost
+       rc=0    # not declared local, so the caller's rc is overwritten as well
+       wait $CLIENT_PID || rc=$?       # a non-zero wait status is recorded in rc instead of being an unhandled failure
+       # active client process should see an EIO (checked below as exit status 5) for down OST
+       [ $rc -eq 5 ] && { echo "writemany correctly failed $rc" && return 0; }
+       # but timing or failover setup may allow success
+       [ $rc -eq 0 ] && { echo "writemany succeeded" && return 0; }
+       echo "writemany returned $rc"   # any other status is reported and returned as the failure code
+       return $rc
+}
+
+test_25c() {   # runs test_25c_guts up to three times, 30 s apart, stopping at the first non-zero result
+       mkdir -p $DIR/$tdir
+       test_25c_guts
+       rc=$?
+       [ $rc -ne 0 ] && { return $rc; }        # NOTE(review): early return skips client_reconnect below - confirm intended
+       # wait for client to reconnect to OST
+       sleep 30
+       test_25c_guts
+       rc=$?
+       [ $rc -ne 0 ] && { return $rc; }
+       sleep 30
+       test_25c_guts
+       rc=$?   # saved before client_reconnect so the third round's status is what gets returned
+       client_reconnect
+       return $rc
+}
+run_test 25c "failover OST under load"
  
-$CLEANUP
+FORCE=--force $CLEANUP