+
+# b=2814
+# Make sure that a read to one osc doesn't try to double-unlock its page just
+# because another osc is invalid.  trigger_group_io used to mistakenly return
+# an error if any oscs were invalid, even after having successfully put rpcs
+# on valid oscs.  This was fatal if the caller was ll_readpage, which unlocked
+# the page, guaranteeing that the unlock from the RPC completion would
+# assert on trying to unlock the already-unlocked page.
+test_41() {
+ local f=$MOUNT/$tfile
+ # make sure the start of the file is on ost1
+ lfs setstripe $f $((128 * 1024)) 0 0
+ do_facet client dd if=/dev/zero of=$f bs=4k count=1 || return 3
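+ # cancel cached client locks so the read below must send new RPCs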
+ cancel_lru_locks OSC
+ # make the osc for ost2 invalid, then read back from ost1
+ local osc2_dev=`$LCTL device_list | \
+ awk '(/ost2.*client_facet/){print $4}' `
+ $LCTL --device %$osc2_dev deactivate
+ do_facet client dd if=$f of=/dev/null bs=4k count=1 || return 4
+ $LCTL --device %$osc2_dev activate
+ return 0
+}
+run_test 41 "read from a valid osc while other oscs are invalid"
+
+# test MDS recovery after ost failure
+test_42() {
+ # df -P keeps the output on one line, so $2 is always the total blocks
+ blocks=`df -P $MOUNT | tail -1 | awk '{ print $2 }'`
+ createmany -o $DIR/$tfile-%d 800
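+ # updates after the barrier are not committed on the OST and are lost on failover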
+ replay_barrier ost
+ unlinkmany $DIR/$tfile-%d 0 400
+ facet_failover ost
+
+ # osc is evicted, fs is smaller
+ blocks_after=`df -P $MOUNT | tail -1 | awk '{ print $2 }'`
+ [ $blocks_after -lt $blocks ] || return 1
+ echo "waiting for MDS to time out and recover"
+ sleep $((TIMEOUT * 2))
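+ # remove the remaining files (400-799) now that recovery is complete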
+ unlinkmany $DIR/$tfile-%d 400 400
+ $CHECKSTAT -t file $DIR/$tfile-* && return 2 || true
+}
+run_test 42 "recovery after ost failure"
+
+# b=2530
+# directory orphans can't be unlinked from the PENDING directory
+test_43() {
+ replay_barrier mds
+
+ # OBD_FAIL_OST_CREATE_NET 0x204
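+ # the 0x80000000 bit (OBD_FAIL_ONCE) makes the failure trigger only once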
+ do_facet ost "sysctl -w lustre.fail_loc=0x80000204"
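+ # restart the MDS; its create RPCs to the OST during recovery will be dropped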
+ facet_failover mds
+ df $MOUNT || return 1
+ sleep 10
+ do_facet ost "sysctl -w lustre.fail_loc=0"
+
+ return 0
+}
+run_test 43 "mds osc import failure during recovery; don't LBUG"
+
+test_44() {
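+ # find the device number of the mdc for this mount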
+ mdcdev=`awk '/mds_svc_MNT/ {print $1}' < /proc/fs/lustre/devices`
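+ # 0x701 is assumed to be OBD_FAIL_TGT_CONN_RACE, matching the connect race below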
+ do_facet mds "sysctl -w lustre.fail_loc=0x80000701"
+ $LCTL --device $mdcdev recover
+ df $MOUNT
+ do_facet mds "sysctl -w lustre.fail_loc=0"
+ return 0
+}
+run_test 44 "race in target handle connect"
+
+# Handle failed close
+test_45() {
+ mdcdev=`awk '/mds_svc_MNT/ {print $1}' < /proc/fs/lustre/devices`
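+ # reconnect first so the import starts this test in a known-good state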
+ $LCTL --device $mdcdev recover
+
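+ # open the file and hold it open; multiop closes it on SIGUSR1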
+ multiop $DIR/$tfile O_c &
+ pid=$!
+ sleep 1
+
+ # This will cause the CLOSE to fail before even
+ # allocating a reply buffer
+ $LCTL --device $mdcdev deactivate
+
+ # try the close
+ kill -USR1 $pid
+ wait $pid || return 1
+
+ $LCTL --device $mdcdev activate
+
+ $CHECKSTAT -t file $DIR/$tfile || return 2
+ return 0
+}
+run_test 45 "Handle failed close"
+