# specify environment variable containing batch job name for server statistics
export JOBID_VAR=${JOBID_VAR:-"procname_uid"} # or "existing" or "disable"
-# LOAD_LLOOP: LU-409: only load llite_lloop module if kernel < 2.6.32 or
-# LOAD_LLOOP is true. LOAD_LLOOP is false by default.
-export LOAD_LLOOP=${LOAD_LLOOP:-false}
-
#export PDSH="pdsh -S -Rssh -w"
export MOUNT_CMD=${MOUNT_CMD:-"mount -t lustre"}
export UMOUNT=${UMOUNT:-"umount -d"}
fi
export LL_DECODE_FILTER_FID=${LL_DECODE_FILTER_FID:-"$LUSTRE/utils/ll_decode_filter_fid"}
[ ! -f "$LL_DECODE_FILTER_FID" ] && export LL_DECODE_FILTER_FID="ll_decode_filter_fid"
+ export LL_DECODE_LINKEA=${LL_DECODE_LINKEA:-"$LUSTRE/utils/ll_decode_linkea"}
+ [ ! -f "$LL_DECODE_LINKEA" ] && export LL_DECODE_LINKEA="ll_decode_linkea"
export MKFS=${MKFS:-"$LUSTRE/utils/mkfs.lustre"}
[ ! -f "$MKFS" ] && export MKFS="mkfs.lustre"
export TUNEFS=${TUNEFS:-"$LUSTRE/utils/tunefs.lustre"}
fi
}
-llite_lloop_enabled() {
- local n1=$(uname -r | cut -d. -f1)
- local n2=$(uname -r | cut -d. -f2)
- local n3=$(uname -r | cut -d- -f1 | cut -d. -f3)
-
- # load the llite_lloop module for < 2.6.32 kernels
- if [[ $n1 -lt 2 ]] || [[ $n1 -eq 2 && $n2 -lt 6 ]] || \
- [[ $n1 -eq 2 && $n2 -eq 6 && $n3 -lt 32 ]] || \
- $LOAD_LLOOP; then
- return 0
- fi
- return 1
-}
-
load_modules_local() {
if [ -n "$MODPROBE" ]; then
# use modprobe
fi
load_module ../libcfs/libcfs/libcfs
+	# Prevent local MODOPTS_LIBCFS from being passed as part of environment
+ # variable to remote nodes
+ unset MODOPTS_LIBCFS
[ "$PTLDEBUG" ] && lctl set_param debug="$PTLDEBUG"
[ "$SUBSYSTEM" ] && lctl set_param subsystem_debug="${SUBSYSTEM# }"
fi
load_module llite/lustre
- llite_lloop_enabled && load_module llite/llite_lloop
[ -d /r ] && OGDB=${OGDB:-"/r/tmp"}
OGDB=${OGDB:-$TMP}
rm -f $OGDB/ogdb-$HOSTNAME
virt=$(dmidecode -s system-product-name | awk '{print $1}')
case $virt in
- VMware|KVM|VirtualBox|Parallels) echo ${virt,,} ;;
+ VMware|KVM|VirtualBox|Parallels)
+ echo $virt | tr '[A-Z]' '[a-z]' ;;
*) ;;
esac
}
shift 3
local opts=${@:-"-o cachefile=none"}
- do_facet $facet "$ZPOOL list -H $poolname >/dev/null 2>&1 ||
+ do_facet $facet "modprobe zfs;
+ $ZPOOL list -H $poolname >/dev/null 2>&1 ||
$ZPOOL create -f $opts $poolname $vdev"
}
if [[ -n "$poolname" ]]; then
opts+=" -d $(dirname $(facet_vdevice $facet))"
- do_facet $facet "$ZPOOL list -H $poolname >/dev/null 2>&1 ||
+ do_facet $facet "modprobe zfs;
+ $ZPOOL list -H $poolname >/dev/null 2>&1 ||
$ZPOOL import -f $opts $poolname"
fi
}
# commit the device label change to disk
if [[ $devicelabel =~ (:[a-zA-Z]{3}[0-9]{4}) ]]; then
+ echo "Commit the device label on ${!dev}"
do_facet $facet "sync; sync; sync"
+ sleep 5
fi
# only for remote client
check_client_load () {
- local client=$1
- local var=$(node_var_name $client)_load
- local TESTLOAD=run_${!var}.sh
-
- ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1
-
- # bug 18914: try to connect several times not only when
- # check ps, but while check_catastrophe also
- local tries=3
- local RC=254
- while [ $RC = 254 -a $tries -gt 0 ]; do
- let tries=$tries-1
- # assume success
- RC=0
- if ! check_catastrophe $client; then
- RC=${PIPESTATUS[0]}
- if [ $RC -eq 254 ]; then
- # FIXME: not sure how long we shuold sleep here
- sleep 10
- continue
- fi
- echo "check catastrophe failed: RC=$RC "
- return $RC
- fi
- done
- # We can continue try to connect if RC=254
- # Just print the warning about this
- if [ $RC = 254 ]; then
- echo "got a return status of $RC from do_node while checking catastrophe on $client"
- fi
-
- # see if the load is still on the client
- tries=3
- RC=254
- while [ $RC = 254 -a $tries -gt 0 ]; do
- let tries=$tries-1
- # assume success
- RC=0
- if ! do_node $client "ps auxwww | grep -v grep | grep -q $TESTLOAD"; then
- RC=${PIPESTATUS[0]}
- sleep 30
- fi
- done
- if [ $RC = 254 ]; then
- echo "got a return status of $RC from do_node while checking (catastrophe and 'ps') the client load on $client"
- # see if we can diagnose a bit why this is
- fi
+ local client=$1
+ local var=$(node_var_name $client)_load
+ local testload=run_${!var}.sh
+
+ ps auxww | grep -v grep | grep $client | grep -q $testload || return 1
+
+ # bug 18914: try to connect several times not only when
+ # check ps, but while check_node_health also
+
+ local tries=3
+ local RC=254
+ while [ $RC = 254 -a $tries -gt 0 ]; do
+ let tries=$tries-1
+ # assume success
+ RC=0
+ if ! check_node_health $client; then
+ RC=${PIPESTATUS[0]}
+ if [ $RC -eq 254 ]; then
+			# FIXME: not sure how long we should sleep here
+ sleep 10
+ continue
+ fi
+ echo "check node health failed: RC=$RC "
+ return $RC
+ fi
+ done
+	# We can continue trying to connect if RC=254
+ # Just print the warning about this
+ if [ $RC = 254 ]; then
+ echo "got a return status of $RC from do_node while checking " \
+ "node health on $client"
+ fi
+
+ # see if the load is still on the client
+ tries=3
+ RC=254
+ while [ $RC = 254 -a $tries -gt 0 ]; do
+ let tries=$tries-1
+ # assume success
+ RC=0
+ if ! do_node $client \
+ "ps auxwww | grep -v grep | grep -q $testload"; then
+ RC=${PIPESTATUS[0]}
+ sleep 30
+ fi
+ done
+ if [ $RC = 254 ]; then
+ echo "got a return status of $RC from do_node while checking " \
+ "(node health and 'ps') the client load on $client"
+ # see if we can diagnose a bit why this is
+ fi
- return $RC
+ return $RC
}
check_client_loads () {
local clients=${1//,/ }
grep -v 'Found no match'
}
+wait_zfs_commit() {
+ # the occupied disk space will be released
+ # only after DMUs are committed
+ if [[ $(facet_fstype $1) == zfs ]]; then
+ echo "sleep $2 for ZFS OSD"
+ sleep $2
+ fi
+}
+
wait_delete_completed_mds() {
local MAX_WAIT=${1:-20}
# for ZFS, waiting more time for DMUs to be committed
mds2sync="$mds2sync $node"
done
if [ -z "$mds2sync" ]; then
+ wait_zfs_commit $SINGLEMDS $ZFS_WAIT
return
fi
mds2sync=$(comma_list $mds2sync)
"$LCTL get_param -n osc.*MDT*.sync_*" | calc_sum)
#echo "$node: $changes changes on all"
if [[ $changes -eq 0 ]]; then
- etime=$(date +%s)
- #echo "delete took $((etime - stime)) seconds"
-
- # the occupied disk space will be released
- # only after DMUs are committed
- if [[ $(facet_fstype $SINGLEMDS) == zfs ]]; then
- echo "sleep $ZFS_WAIT for ZFS OSD"
- sleep $ZFS_WAIT
- fi
-
+ wait_zfs_commit $SINGLEMDS $ZFS_WAIT
return
fi
sleep 1
return $rc
}
+lfs_df_check() {
+ local clients=${1:-$CLIENTS}
+
+ if [ -z "$clients" ]; then
+ $LFS df $MOUNT
+ else
+ $PDSH $clients "$LFS df $MOUNT" > /dev/null
+ fi
+}
+
+
clients_up() {
- # not every config has many clients
- sleep 1
- if [ ! -z "$CLIENTS" ]; then
- $PDSH $CLIENTS "stat -f $MOUNT" > /dev/null
- else
- stat -f $MOUNT > /dev/null
- fi
+ # not every config has many clients
+ sleep 1
+ lfs_df_check
}
client_up() {
- local client=$1
- # usually checked on particular client or locally
- sleep 1
- if [ ! -z "$client" ]; then
- $PDSH $client "stat -f $MOUNT" > /dev/null
- else
- stat -f $MOUNT > /dev/null
- fi
+ # usually checked on particular client or locally
+ sleep 1
+ lfs_df_check $1
}
client_evicted() {
cd $SAVE_PWD
reset_fail_loc
check_grant ${testnum} || error "check_grant $testnum failed with $?"
- check_catastrophe || error "LBUG/LASSERT detected"
+ check_node_health
check_dmesg_for_errors || error "Error in dmesg detected"
if [ "$PARALLEL" != "yes" ]; then
ps auxww | grep -v grep | grep -q multiop &&
done
}
-check_catastrophe() {
+check_node_health() {
local nodes=${1:-$(comma_list $(nodes_list))}
- do_nodes $nodes "rc=0;
-val=\\\$($LCTL get_param -n catastrophe 2>&1);
-if [[ \\\$? -eq 0 && \\\$val -ne 0 ]]; then
- echo \\\$(hostname -s): \\\$val;
- rc=\\\$val;
-fi;
-exit \\\$rc"
+ for node in ${nodes//,/ }; do
+ check_network "$node" 5
+ if [ $? -eq 0 ]; then
+ do_node $node "rc=0;
+ val=\\\$($LCTL get_param -n catastrophe 2>&1);
+ if [[ \\\$? -eq 0 && \\\$val -ne 0 ]]; then
+ echo \\\$(hostname -s): \\\$val;
+ rc=\\\$val;
+ fi;
+ exit \\\$rc" || error "$node:LBUG/LASSERT detected"
+ fi
+ done
}
mdsrate_cleanup () {
return $OSCFULL
}
-pool_list () {
- do_facet mgs lctl pool_list $1
+list_pool() {
+ echo -e "$(do_facet $SINGLEMDS $LCTL pool_list $1 | sed '1d')"
+}
+
+check_pool_not_exist() {
+ local fsname=${1%%.*}
+ local poolname=${1##$fsname.}
+ [[ $# -ne 1 ]] && return 0
+ [[ x$poolname = x ]] && return 0
+ list_pool $fsname | grep -w $1 && return 1
+ return 0
}
create_pool() {
- local fsname=${1%%.*}
- local poolname=${1##$fsname.}
-
- do_facet mgs lctl pool_new $1
- local RC=$?
- # get param should return err unless pool is created
- [[ $RC -ne 0 ]] && return $RC
-
- wait_update $HOSTNAME "lctl get_param -n lov.$fsname-*.pools.$poolname \
- 2>/dev/null || echo foo" "" || RC=1
- if [[ $RC -eq 0 ]]; then
- add_pool_to_list $1
- else
- error "pool_new failed $1"
- fi
- return $RC
+ local fsname=${1%%.*}
+ local poolname=${1##$fsname.}
+
+ do_facet mgs lctl pool_new $1
+ local RC=$?
+ # get param should return err unless pool is created
+ [[ $RC -ne 0 ]] && return $RC
+
+ for mds_id in $(seq $MDSCOUNT); do
+ local mdt_id=$((mds_id-1))
+ local lodname=$fsname-MDT$(printf "%04x" $mdt_id)-mdtlov
+ wait_update_facet mds$mds_id \
+ "lctl get_param -n lod.$lodname.pools.$poolname \
+ 2>/dev/null || echo foo" "" ||
+ error "mds$mds_id: pool_new failed $1"
+ done
+ wait_update $HOSTNAME "lctl get_param -n lov.$fsname-*.pools.$poolname \
+ 2>/dev/null || echo foo" "" || error "pool_new failed $1"
+
+ add_pool_to_list $1
+ return $RC
}
add_pool_to_list () {
}
destroy_pool_int() {
- local ost
- local OSTS=$(do_facet $SINGLEMDS lctl pool_list $1 | \
- awk '$1 !~ /^Pool:/ {print $1}')
- for ost in $OSTS; do
- do_facet mgs lctl pool_remove $1 $ost
- done
- do_facet mgs lctl pool_destroy $1
+ local ost
+ local OSTS=$(list_pool $1)
+ for ost in $OSTS; do
+ do_facet mgs lctl pool_remove $1 $ost
+ done
+ do_facet mgs lctl pool_destroy $1
}
# <fsname>.<poolname> or <poolname>
destroy_pool() {
- local fsname=${1%%.*}
- local poolname=${1##$fsname.}
+ local fsname=${1%%.*}
+ local poolname=${1##$fsname.}
- [[ x$fsname = x$poolname ]] && fsname=$FSNAME
+ [[ x$fsname = x$poolname ]] && fsname=$FSNAME
- local RC
+ local RC
- pool_list $fsname.$poolname || return $?
+ check_pool_not_exist $fsname.$poolname
+ [[ $? -eq 0 ]] && return 0
- destroy_pool_int $fsname.$poolname
- RC=$?
- [[ $RC -ne 0 ]] && return $RC
+ destroy_pool_int $fsname.$poolname
+ RC=$?
+ [[ $RC -ne 0 ]] && return $RC
+ for mds_id in $(seq $MDSCOUNT); do
+ local mdt_id=$((mds_id-1))
+ local lodname=$fsname-MDT$(printf "%04x" $mdt_id)-mdtlov
+ wait_update_facet mds$mds_id \
+ "lctl get_param -n lod.$lodname.pools.$poolname \
+ 2>/dev/null || echo foo" "foo" ||
+ error "mds$mds_id: destroy pool failed $1"
+ done
+ wait_update $HOSTNAME "lctl get_param -n lov.$fsname-*.pools.$poolname \
+ 2>/dev/null || echo foo" "foo" || error "destroy pool failed $1"
- wait_update $HOSTNAME "lctl get_param -n lov.$fsname-*.pools.$poolname \
- 2>/dev/null || echo foo" "foo" || RC=1
+ remove_pool_from_list $fsname.$poolname
- if [[ $RC -eq 0 ]]; then
- remove_pool_from_list $fsname.$poolname
- else
- error "destroy pool failed $1"
- fi
- return $RC
+ return $RC
}
destroy_pools () {
local poolname
local listvar=${fsname}_CREATED_POOLS
- pool_list $fsname
-
[ x${!listvar} = x ] && return 0
echo destroy the created pools: ${!listvar}
local t=$(for i in $list; do printf "$FSNAME-OST%04x_UUID " $i; done)
do_facet mgs $LCTL pool_add \
$FSNAME.$pool $FSNAME-OST[$first-$last/$step]
+
+ # wait for OSTs to be added to the pool
+ for mds_id in $(seq $MDSCOUNT); do
+ local mdt_id=$((mds_id-1))
+ local lodname=$FSNAME-MDT$(printf "%04x" $mdt_id)-mdtlov
+ wait_update_facet mds$mds_id \
+ "lctl get_param -n lod.$lodname.pools.$pool |
+ sort -u | tr '\n' ' ' " "$t" || {
+ error_noexit "mds$mds_id: Add to pool failed"
+ return 3
+ }
+ done
wait_update $HOSTNAME "lctl get_param -n lov.$FSNAME-*.pools.$pool \
| sort -u | tr '\n' ' ' " "$t" || {
error_noexit "Add to pool failed"
local pname="lov.$FSNAME-*.pools.$pool"
local t=$($LCTL get_param -n $pname | head -1)
do_facet mgs $LCTL pool_remove $FSNAME.$pool $t
+ for mds_id in $(seq $MDSCOUNT); do
+ local mdt_id=$((mds_id-1))
+ local lodname=$FSNAME-MDT$(printf "%04x" $mdt_id)-mdtlov
+ wait_update_facet mds$mds_id \
+ "lctl get_param -n lod.$lodname.pools.$pool |
+ grep $t" "" || {
+ error_noexit "mds$mds_id: $t not removed from" \
+ "$FSNAME.$pool"
+ return 2
+ }
+ done
wait_update $HOSTNAME "lctl get_param -n $pname | grep $t" "" || {
error_noexit "$t not removed from $FSNAME.$pool"
return 1
do
do_facet mgs $LCTL pool_remove $FSNAME.$pool $t
done
+ for mds_id in $(seq $MDSCOUNT); do
+ local mdt_id=$((mds_id-1))
+ local lodname=$FSNAME-MDT$(printf "%04x" $mdt_id)-mdtlov
+ wait_update_facet mds$mds_id "lctl get_param -n \
+ lod.$lodname.pools.$pool" "" || {
+ error_noexit "mds$mds_id: Pool $pool not drained"
+ return 4
+ }
+ done
wait_update $HOSTNAME "lctl get_param -n $pname" "" || {
error_noexit "Pool $FSNAME.$pool cannot be drained"
return 1