b=18150

[fs/lustre-release.git] / lustre / tests / conf-sanity.sh
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh

index 0732fec..95a4c85 100644 (file)
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -592,6 +592,8 @@ test_22() {
         stop_ost
         mount_client $MOUNT
         # check_mount will block trying to contact ost
+       mcreate $DIR/$tfile || return 40
+       rm -f $DIR/$tfile || return 42
         umount_client $MOUNT
         pass
  
@@ -972,8 +974,7 @@ cleanup_32() {
  }
  
  test_32a() {
-       # this test is totally useless on a client-only system
-       [ -n "$CLIENTONLY" -o -n "$CLIENTMODSONLY" ] && skip "client only testing" && return 0
+       client_only && skip "client only testing" && return 0
         [ "$NETTYPE" = "tcp" ] || { skip "NETTYPE != tcp" && return 0; }
         [ -z "$TUNEFS" ] && skip_env "No tunefs" && return 0
  
@@ -1005,16 +1006,21 @@ test_32a() {
  
         local NID=$($LCTL list_nids | head -1)
  
-       echo "OSC changes should return err:"
+       echo "OSC changes should succeed:"
         $LCTL conf_param lustre-OST0000.osc.max_dirty_mb=15 || return 7
         $LCTL conf_param lustre-OST0000.failover.node=$NID || return 8
-
         echo "ok."
+
         echo "MDC changes should succeed:"
         $LCTL conf_param lustre-MDT0000.mdc.max_rpcs_in_flight=9 || return 9
         $LCTL conf_param lustre-MDT0000.failover.node=$NID || return 10
         echo "ok."
  
+       echo "LOV changes should succeed:"
+       $LCTL pool_new lustre.interop || return 11
+       $LCTL conf_param lustre-MDT0000.lov.stripesize=4M || return 12
+       echo "ok."
+
         cleanup_32
  
         # mount a second time to make sure we didnt leave upgrade flag on
@@ -1030,8 +1036,7 @@ test_32a() {
  run_test 32a "Upgrade from 1.8 (not live)"
  
  test_32b() {
-       # this test is totally useless on a client-only system
-       [ -n "$CLIENTONLY" -o -n "$CLIENTMODSONLY" ] && skip "client only testing" && return 0
+       client_only && skip "client only testing" && return 0
         [ "$NETTYPE" = "tcp" ] || { skip "NETTYPE != tcp" && return 0; }
         [ -z "$TUNEFS" ] && skip_env "No tunefs" && return
  
@@ -1064,13 +1069,18 @@ test_32b() {
         local NID=$($LCTL list_nids | head -1)
  
         echo "OSC changes should succeed:"
-
         $LCTL conf_param ${NEWNAME}-OST0000.osc.max_dirty_mb=15 || return 7
         $LCTL conf_param ${NEWNAME}-OST0000.failover.node=$NID || return 8
-
         echo "ok."
+
         echo "MDC changes should succeed:"
         $LCTL conf_param ${NEWNAME}-MDT0000.mdc.max_rpcs_in_flight=9 || return 9
+       $LCTL conf_param ${NEWNAME}-MDT0000.failover.node=$NID || return 10 # address the fs by ${NEWNAME}, like the other conf_param calls in this test, not by a hard-coded "lustre"
+       echo "ok."
+
+       echo "LOV changes should succeed:"
+       $LCTL pool_new ${NEWNAME}.interop || return 11
+       $LCTL conf_param ${NEWNAME}-MDT0000.lov.stripesize=4M || return 12
         echo "ok."
  
         # MDT and OST should have registered with new nids, so we should have
@@ -1189,7 +1199,7 @@ test_34c() {
  }
  run_test 34c "force umount with failed ost should be normal"
  
-test_35() { # bug 12459
+test_35a() { # bug 12459
         setup
  
         DBG_SAVE="`lctl get_param -n debug`"
@@ -1203,7 +1213,7 @@ test_35() { # bug 12459
         log "Wait for RECONNECT_INTERVAL seconds (10s)"
         sleep 10
  
-       MSG="conf-sanity.sh test_35 `date +%F%kh%Mm%Ss`"
+       MSG="conf-sanity.sh test_35a `date +%F%kh%Mm%Ss`"
         $LCTL clear
         log "$MSG"
         log "Stopping the MDT:"
@@ -1234,7 +1244,74 @@ test_35() { # bug 12459
         [ "$NEXTCONN" != "0" ] && log "The client didn't try to reconnect to the last active server (tried ${NEXTCONN} instead)" && return 7
         cleanup
  }
-run_test 35 "Reconnect to the last active server first"
+run_test 35a "Reconnect to the last active server first"
+
+test_35b() { # bug 18674: while the primary MDS is busy the client must keep retrying it, not the failover NID
+       remote_mds || { skip "local MDS" && return 0; } # skipped when the MDS is local
+       setup
+
+       debugsave
+       $LCTL set_param debug="ha"
+       $LCTL clear
+       MSG="conf-sanity.sh test_35b `date +%F%kh%Mm%Ss`"
+       log "$MSG"
+
+       log "Set up a fake failnode for the MDS"
+       FAKENID="127.0.0.2"
+       local device=$(do_facet mds "$LCTL get_param -n devices" | \
+                       awk '($3 ~ "mdt" && $4 ~ "MDT") { print $4 }' | head -1)
+       do_facet mds "$LCTL conf_param ${device}.failover.node=$FAKENID" || \
+               return 1
+
+       local at_max_saved=0
+       # adaptive timeouts may prevent seeing the issue, so at_max is set to 0 on mds and client
+       if at_is_enabled; then
+               at_max_saved=$(at_max_get mds)
+               at_max_set 0 mds client
+       fi
+
+       mkdir -p $MOUNT/testdir
+       touch $MOUNT/testdir/test
+
+       log "Injecting EBUSY on MDS"
+       # Setting OBD_FAIL_MDS_RESEND=0x136 (NOTE(review): the extra 0x80000000 bit in fail_loc presumably means "fire once" - confirm)
+       do_facet mds "$LCTL set_param fail_loc=0x80000136" || return 2
+
+       log "Stat on a test file"
+       stat $MOUNT/testdir/test
+
+       log "Stop injecting EBUSY on MDS"
+       do_facet mds "$LCTL set_param fail_loc=0" || return 3
+       rm -f $MOUNT/testdir/test
+
+       log "done"
+       # restore adaptive timeout, if a non-zero at_max was saved above
+       [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds client
+
+       $LCTL dk $TMP/lustre-log-$TESTNAME.log # dump the debug log to the file scanned by awk below
+
+       # retrieve from the log whether the client has ever tried to contact the fake server after the loss of connection:
+       # FAILCONN is 0 = no "using connection" line logged, 1 = such a line was logged, 2 = such a line ends with $FAKENID
+       FAILCONN=`awk "BEGIN {ret = 0;}
+                      /import_select_connection.*${FSNAME}-MDT0000-mdc.* using connection/ {
+                               ret = 1;
+                               if (\\\$NF ~ /$FAKENID/) {
+                                       ret = 2;
+                                       exit;
+                               }
+                      }
+                      END {print ret}" $TMP/lustre-log-$TESTNAME.log`
+
+       [ "$FAILCONN" == "0" ] && \
+               log "ERROR: The client reconnection has not been triggered" && \
+               return 4
+       [ "$FAILCONN" == "2" ] && \
+               log "ERROR: The client tried to reconnect to the failover server while the primary was busy" && \
+               return 5
+
+        cleanup
+}
+run_test 35b "Continue reconnection retries, if the active server is busy"
  
  test_36() { # 12743
          local rc
@@ -1310,7 +1387,7 @@ test_36() { # 12743
  run_test 36 "df report consistency on OSTs with different block size"
  
  test_37() {
-       [ -n "$CLIENTONLY" -o -n "$CLIENTMODSONLY" ] && skip "client only testing" && return 0
+       client_only && skip "client only testing" && return 0
         LOCAL_MDSDEV="$TMP/mdt.img"
         SYM_MDSDEV="$TMP/sym_mdt.img"
  
@@ -1591,7 +1668,7 @@ run_test 45 "long unlink handling in ptlrpcd"
  cleanup_46a() {
         trap 0
         local rc=0
-       local count=5
+       local count=$1
  
         umount_client $MOUNT2 || rc=$?
         umount_client $MOUNT || rc=$?
@@ -1608,7 +1685,7 @@ cleanup_46a() {
  }
  
  test_46a() {
-       [ $OSTCOUNT -lt 5 ] && skip_env "too few OSTs" && return
+       echo "Testing with $OSTCOUNT OSTs"
         reformat
         start_mds || return 1
         #first client should see only one ost
@@ -1616,18 +1693,19 @@ test_46a() {
          wait_osc_import_state mds ost FULL
         #start_client
         mount_client $MOUNT || return 3
-       trap cleanup_46a EXIT ERR
-
-       start_ost2 || return 4
-       start ost3 `ostdevname 3` $OST_MOUNT_OPTS || return 5
-       start ost4 `ostdevname 4` $OST_MOUNT_OPTS || return 6
-       start ost5 `ostdevname 5` $OST_MOUNT_OPTS || return 7
-       # wait until ost2-5 is sync
-        # ping_interval + 1
-        wait_osc_import_state mds ost2 FULL
-        wait_osc_import_state mds ost3 FULL
-        wait_osc_import_state mds ost4 FULL
-        wait_osc_import_state mds ost5 FULL
+       trap "cleanup_46a $OSTCOUNT" EXIT ERR
+
+       local i 
+       for (( i=2; i<=$OSTCOUNT; i++ )); do
+           start ost$i `ostdevname $i` $OST_MOUNT_OPTS || return $((i+2))
+       done
+
+       # wait until osts in sync
+       for (( i=2; i<=$OSTCOUNT; i++ )); do
+           wait_osc_import_state mds ost$i FULL
+       done
+
+
         #second client see all ost's
  
         mount_client $MOUNT2 || return 8
@@ -1644,7 +1722,7 @@ test_46a() {
         # will be deadlock
         stat $MOUNT/widestripe || return 12
  
-       cleanup_46a || { echo "cleanup_46a failed!" && return 13; }
+       cleanup_46a $OSTCOUNT || { echo "cleanup_46a failed!" && return 13; }
         return 0
  }
  run_test 46a "handle ost additional - wide striped file"
@@ -1951,6 +2029,31 @@ test_50f() {
  }
  run_test 50f "normal statfs one server in down =========================="
  
+test_50g() { # df on a freshly remounted client must not panic while OST0001 is deactivated
+       [ "$OSTCOUNT" -lt "2" ] && skip_env "$OSTCOUNT < 2, skipping" && return # needs at least 2 OSTs
+       setup
+       start_ost2 || error "Unable to start OST2"
+
+       local PARAM="${FSNAME}-OST0001.osc.active" # set to 0 to deactivate OST0001, 1 to activate it again
+
+       $LFS setstripe -c -1 $DIR/$tfile || error "Unable to lfs setstripe" # NOTE(review): -c -1 presumably stripes over all OSTs - confirm
+       do_facet mgs $LCTL conf_param $PARAM=0 || error "Unable to deactivate OST"
+
+       umount_client $MOUNT || error "Unable to unmount client" # remount the client while the OST is deactivated
+       mount_client $MOUNT || error "Unable to mount client"
+       # This df should not cause a panic (its exit status is deliberately not checked)
+       df -k $MOUNT
+
+       do_facet mgs $LCTL conf_param $PARAM=1 || error "Unable to activate OST" # re-activate, then tear everything down
+       rm -f $DIR/$tfile
+       umount_client $MOUNT || error "Unable to unmount client"
+       stop_ost2 || error "Unable to stop OST2"
+       stop_ost || error "Unable to stop OST1"
+       stop_mds || error "Unable to stop MDS"
+       writeconf
+}
+run_test 50g "deactivated OST should not cause panic====================="
+
  test_51() {
         local LOCAL_TIMEOUT=20