Branch HEAD

author tappro <tappro>

Tue, 13 Oct 2009 06:02:55 +0000 (06:02 +0000)

committer tappro <tappro>

Tue, 13 Oct 2009 06:02:55 +0000 (06:02 +0000)
author tappro <tappro>
Tue, 13 Oct 2009 06:02:55 +0000 (06:02 +0000)
committer tappro <tappro>
Tue, 13 Oct 2009 06:02:55 +0000 (06:02 +0000)
diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c

index 35ad7a7..4fe16bc 100644 (file)
--- a/lustre/osc/lproc_osc.c
+++ b/lustre/osc/lproc_osc.c
@@ -581,6 +581,14 @@ static int osc_wr_lockless_truncate(struct file *file, const char *buffer,
                  count;
  }
  
+static int osc_rd_destroys_in_flight(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data)
+{
+        struct obd_device *obd = data; /* entry's private data: the OSC obd */
+        return snprintf(page, count, "%u\n", /* counter printed as one line */
+                        atomic_read(&obd->u.cli.cl_destroy_in_flight));
+}
+
  static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
          { "uuid",            lprocfs_rd_uuid,        0, 0 },
          { "ping",            0, lprocfs_wr_ping,     0, 0, 0222 },
@@ -600,6 +608,7 @@ static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
                                 osc_wr_max_pages_per_rpc, 0 },
          { "max_rpcs_in_flight", osc_rd_max_rpcs_in_flight,
                                  osc_wr_max_rpcs_in_flight, 0 },
+        { "destroys_in_flight", osc_rd_destroys_in_flight, 0, 0 },
          { "max_dirty_mb",    osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 },
          { "cur_dirty_bytes", osc_rd_cur_dirty_bytes, 0, 0 },
          { "cur_grant_bytes", osc_rd_cur_grant_bytes,
diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh

index c47082e..11a3122 100755 (executable)
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -236,7 +236,7 @@ test_13() {
      kill -USR1 $MULTIPID || return 3
      wait $MULTIPID || return 4
  
-    # drop close 
+    # drop close
      do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
      facet_failover $SINGLEMDS
      do_facet $SINGLEMDS lctl set_param fail_loc=0
@@ -279,17 +279,18 @@ test_14b() {
      createmany -o $MOUNT1/$tfile-3- 5
      umount $MOUNT2
  
-    facet_failover $SINGLEMDS
-    # expect recovery don't fail due to VBR
-    df $MOUNT1 || return 1
+    fail $SINGLEMDS
+    wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
  
      # first 25 files should have been replayed
      unlinkmany $MOUNT1/$tfile- 5 || return 2
      unlinkmany $MOUNT1/$tfile-3- 5 || return 3
  
      zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
-    # give ost time to process llogs
-    sleep 3
+
+    wait_mds_ost_sync || return 4
+    wait_destroy_complete || return 5
+
      AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
      log "before $BEFOREUSED, after $AFTERUSED"
      [ $AFTERUSED -ne $BEFOREUSED ] && \
@@ -298,7 +299,7 @@ test_14b() {
  }
  run_test 14b "delete ost orphans if gap occured in objids due to VBR"
  
-test_15a() {   # was test_15
+test_15a() { # was test_15
      replay_barrier $SINGLEMDS
      createmany -o $MOUNT1/$tfile- 25
      createmany -o $MOUNT2/$tfile-2- 1
@@ -318,14 +319,14 @@ run_test 15a "timeout waiting for lost client during replay, 1 client completes"
  test_15c() {
      replay_barrier $SINGLEMDS
      for ((i = 0; i < 2000; i++)); do
-       echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
+        echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
      done
-    
+
      umount $MOUNT2
      facet_failover $SINGLEMDS
  
      df $MOUNT || return 1
-    
+
      zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
      return 0
  }
diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh

index 4d7c5bb..1bdbcbf 100755 (executable)
--- a/lustre/tests/replay-ost-single.sh
+++ b/lustre/tests/replay-ost-single.sh
@@ -124,7 +124,7 @@ iozone_bg () {
      # need to check iozone output  on "complete"
      local iozonelog=$TMP/${TESTSUITE}.iozone.log
      rm -f $iozonelog
-    cat $tmppipe | while read line ; do 
+    cat $tmppipe | while read line ; do
          echo "$line"
          echo "$line" >>$iozonelog
      done;
@@ -138,7 +138,7 @@ iozone_bg () {
      fi
      rm -f $tmppipe
      rm -f $iozonelog
-    return $rc 
+    return $rc
  }
  
  test_5() {
@@ -158,7 +158,7 @@ test_5() {
      local pid=$!
  
      echo iozone bg pid=$pid
-    
+
      sleep 8
      fail ost1
      local rc=0
@@ -187,7 +187,7 @@ test_6() {
      get_stripe_info client $f
  
      sync
-    sleep 2                                    # ensure we have a fresh statfs
+    sleep 2 # ensure we have a fresh statfs
      sync
  #define OBD_FAIL_MDS_REINT_NET_REP       0x119
      do_facet mds "lctl set_param fail_loc=0x80000119"
@@ -196,10 +196,12 @@ test_6() {
      (( $before > $after_dd )) || return 1
      rm -f $f
      fail ost$((stripe_index + 1))
+    wait_recovery_complete ost$((stripe_index + 1)) || error "OST recovery not done"
      $CHECKSTAT -t file $f && return 2 || true
      sync
      # let the delete happen
-    sleep 5
+    wait_mds_ost_sync || return 4
+    wait_destroy_complete || return 5
      after=`kbytesfree`
      log "before: $before after: $after"
      (( $before <= $after + 40 )) || return 3   # take OST logs into account
@@ -213,7 +215,7 @@ test_7() {
      before=`kbytesfree`
      dd if=/dev/urandom bs=4096 count=1280 of=$f || return 4
      sync
-    sleep 2                                    # ensure we have a fresh statfs
+    sleep 2 # ensure we have a fresh statfs
      sync
      after_dd=`kbytesfree`
      log "before: $before after_dd: $after_dd"
@@ -221,10 +223,12 @@ test_7() {
      replay_barrier ost1
      rm -f $f
      fail ost1
+    wait_recovery_complete ost1 || error "OST recovery not done"
      $CHECKSTAT -t file $f && return 2 || true
      sync
      # let the delete happen
-    sleep 5
+    wait_mds_ost_sync || return 4
+    wait_destroy_complete || return 5
      after=`kbytesfree`
      log "before: $before after: $after"
      (( $before <= $after + 40 )) || return 3   # take OST logs into account
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh

index 8a182a4..792176d 100755 (executable)
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -479,27 +479,7 @@ test_20b() { # bug 10480
      fail $SINGLEMDS                            # start orphan recovery
      df -P $DIR || df -P $DIR || true    # reconnect
      wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
-
-    # just because recovery is done doesn't mean we've finished
-    # orphan cleanup. Wait for llogs to get synchronized.
-    echo waiting for orphan cleanup...
-    while [ true ]; do
-            local -a sync=($(do_nodes $(comma_list $(osts_nodes)) \
-                "$LCTL get_param obdfilter.*.mds_sync" | awk -F= ' {print $2}'))
-            local con=1
-            for ((i=0; i<${#sync[@]}; i++)); do
-                    [ ${sync[$i]} -eq 0 ] && continue
-                    # there is a not finished MDS-OST synchronization
-                    con=0
-                    break;
-            done
-            [ ${con} -eq 1 ] && break
-            sleep 1
-    done
-
-    # let the statfs cache to get old enough.
-    sleep 1
-
+    wait_mds_ost_sync || return 3
      AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
      log "before $BEFOREUSED, after $AFTERUSED"
      [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh

index be4ae46..c0c1bd9 100644 (file)
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -1013,12 +1013,13 @@ wait_delete_completed () {
          sleep 1
          TOTAL=`lctl get_param -n osc.*.kbytesavail | \
                 awk 'BEGIN{total=0}; {total+=$1}; END{print total}'`
-        [ "$TOTAL" -eq "$TOTALPREV" ] && break
+        [ "$TOTAL" -eq "$TOTALPREV" ] && return 0
          echo "Waiting delete completed ... prev: $TOTALPREV current: $TOTAL "
          TOTALPREV=$TOTAL
          WAIT=$(( WAIT + 1))
      done
-    echo "Delete completed."
+    echo "Delete is not completed in $MAX_WAIT sec"
+    return 1
  }
  
  wait_for_host() {
@@ -1036,12 +1037,12 @@ wait_for() {
  wait_recovery_complete () {
      local facet=$1
  
-    # Use default policy if $2 is not passed by caller. 
+    # Use default policy if $2 is not passed by caller.
      #define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2)
      # as we are in process of changing obd_timeout in different ways
      # let's set MAX longer than that
      local MAX=${2:-$(( TIMEOUT * 4 ))}
- 
+
      local var_svc=${facet}_svc
      local procfile="*.${!var_svc}.recovery_status"
      local WAIT=0
@@ -1058,6 +1059,57 @@ wait_recovery_complete () {
      return 1
  }
  
+wait_mds_ost_sync () {
+    # Poll obdfilter.*.mds_sync on the OST nodes until all values are 0
+    # (orphan cleanup may lag recovery). Returns 0 when synced, 1 on timeout.
+    echo "Waiting for orphan cleanup..."
+    # MAX value includes time needed for MDS-OST reconnection
+    local MAX=$(( TIMEOUT * 2 ))
+    local WAIT=0 i      # keep the loop index from leaking to callers
+    while [ $WAIT -lt $MAX ]; do
+        local -a sync=($(do_nodes $(comma_list $(osts_nodes)) \
+            "$LCTL get_param -n obdfilter.*.mds_sync"))
+        local con=1
+        for ((i=0; i<${#sync[@]}; i++)); do
+            [ ${sync[$i]} -eq 0 ] && continue
+            # there is a not finished MDS-OST synchronization
+            con=0
+            break;
+        done
+        sleep 2 # increase waiting time and cover statfs cache
+        [ ${con} -eq 1 ] && return 0
+        echo "Waiting $WAIT secs for mds-ost sync done."
+        WAIT=$((WAIT + 2))
+    done
+    echo "mds-ost sync not done in $MAX sec."
+    return 1
+}
+
+wait_destroy_complete () {
+    echo "Waiting for destroy to be done..."
+    # Poll osc.*.destroys_in_flight until all are 0; returns 1 on timeout.
+    # MAX is small on purpose, it reflects server responsiveness: never
+    # raise it just to make a test pass, investigate the slowness instead.
+    local MAX=5
+    local WAIT=0 i      # keep the loop index from leaking to callers
+    while [ $WAIT -lt $MAX ]; do
+        local -a RPCs=($($LCTL get_param -n osc.*.destroys_in_flight))
+        local con=1
+        for ((i=0; i<${#RPCs[@]}; i++)); do
+            [ ${RPCs[$i]} -eq 0 ] && continue
+            # there are still some destroy RPCs in flight
+            con=0
+            break;
+        done
+        sleep 1
+        [ ${con} -eq 1 ] && return 0 # done waiting
+        echo "Waiting $WAIT secs for destroys to be done."
+        WAIT=$((WAIT + 1))
+    done
+    echo "Destroys weren't done in $MAX sec."
+    return 1
+}
+
  wait_exit_ST () {
      local facet=$1
author	tappro <tappro>
	Tue, 13 Oct 2009 06:02:55 +0000 (06:02 +0000)
committer	tappro <tappro>
	Tue, 13 Oct 2009 06:02:55 +0000 (06:02 +0000)
lustre/osc/lproc_osc.c		patch \| blob \| history
lustre/tests/replay-dual.sh		patch \| blob \| history
lustre/tests/replay-ost-single.sh		patch \| blob \| history
lustre/tests/replay-single.sh		patch \| blob \| history
lustre/tests/test-framework.sh		patch \| blob \| history