Whamcloud - gitweb
* ior-survey taking shape
author eeb <eeb>
Thu, 7 Oct 2004 23:30:35 +0000 (23:30 +0000)
committer eeb <eeb>
Thu, 7 Oct 2004 23:30:35 +0000 (23:30 +0000)
*  minor fixes to obdfilter-survey

lustre-iokit/ior-survey/ior-survey
lustre-iokit/obdfilter-survey/README
lustre-iokit/obdfilter-survey/obdfilter-survey

diff --git a/lustre-iokit/ior-survey/ior-survey b/lustre-iokit/ior-survey/ior-survey
index 764744a..28e91ef 100644
@@ -1,91 +1,67 @@
 #!/bin/bash
 
-cluster=adev
-servers=(4 7)
-server_disks=([4]=sdd [7]=sdd)
-clients=(8-15 0-1)
+# cluster name (expect all node names to be this followed by a number)
+cluster=mdev
 
-min_clients=1
-max_clients=10
+# client node numbers (individual numbers or inclusive ranges)
+clients=(7-8)
 
-per_client_size=4G
-transfer_size=1M
-tasks_per_client=1
+# numbers of clients to survey
+clients_lo=1
+clients_hi=2
+clients_iterator="+=1"
+
+# numbers of tasks per client to survey
+tasks_per_client_lo=1
+tasks_per_client_hi=16
+tasks_per_client_iterator="*=4"
+
+# record sizes to survey
+rsize_lo=1M
+rsize_hi=1M
+rsize_iterator="*=2"
+
+## which tests to run (first must be write)
+# remount)   not really a test; just remount to uncache everything
+# *write*)   write
+# *)         read
+#tests=(write rewrite read reread rewrite_again)
+tests=(write rewrite remount read)
+
+# total # bytes written/read by any client node
+min_per_client_size=75M
+min_total_size=100M
+
+# should each task do I/O to its own file?
 file_per_task=1
 
+# the IOR binary
 IOR="/home/ericb/ior/src/C/IOR"
-script="/home/ericb/eeb_ior_script"
 
-testfile=/mnt/lustre/ior_survey_testfile
+# the pdsh binary
+pdsh=pdsh
 
+# the llmount binary
+llmount=/home/ericb/lustre/utils/llmount
 
-################################################################################
-cat > $script <<EOF
-IOR START
-#                                      -f <this file>
-#   blockSize=<set from cmdline>       -b 8G
-#   transferSize=<set from cmdline>    -t 1M
-#   filePerProc=<set from cmdline>     -F
-#   testFile=<set from cmdline>        -o /mnt/lustre/ior_testfile
-#   uniqueDir=<set from cmdline>       -u
-#   verbose=<set from cmdline>         -v
-#   reorderTasks=<set from cmdline>    -C
-
-# unused options  
-#   collective=0 (MPI only)
-#   individualDataSets=0 [not working]
-#   noFill=0 (HDF5 only)    
-#   preallocate=0 (MPI only)
-#   useSharedFilePointer=0 [not working]
-#   useFileView=<MPI only>
-#   useStridedDataType=0(MPI only)
-#   showHints=0
-#   showHelp=0
-
-# setup
-    api=POSIX
-    fsync=1
-    repetitions=1
-    useO_DIRECT=0
-    interTestDelay=10
-    intraTestBarriers=1
-    storeFileOffset=0
-    quitOnError=1
-    segmentCount=1
-    singleXferAttempt=0
-
-# write
-    readFile=0
-    writeFile=1
-    useExistingTestFile=0
-    keepFile=1
-    RUN
-
-# rewrite
-    useExistingTestFile=1
-    RUN
-
-# read
-    readFile=1
-    writeFile=0
-    useExistingTestFile=1
-    RUN
-
-# reread (bug in ior means it needs each run needs at least 1 directive)
-    readFile=1
-    RUN
-
-# write again
-    readFile=0
-    writeFile=1
-    useExistingTestFile=1
-    keepFile=0
-    RUN
-IOR STOP
-
-EOF
+# where lustre is mounted on the clients
+lustre=/mnt/lustre
+
+# basename of the test file(s)
+testfile=${lustre}/ior_survey_testfile
+
+# how to unmount and remount the F/S on a client (to clear the cache)
+remount="umount $lustre && $llmount -o nettype=elan mdev6:/ll_mds/client $lustre"
+
+# the result file prefix (date/time + hostname makes unique)
+#rslt=/home/ericb/ior_survey_`date +%F@%R`_`uname -n`
+rslt=/home/ericb/ior_survey
+
+#don't spin for MPI completions
+export LIBELAN_WAITTYPE=0
 
 ################################################################################
+# dont change stuff below here
 
 count_range() {
     echo $1 | awk '{ nvals=split($1, vals, "-");\
@@ -172,22 +148,143 @@ pp_number() {
     fi
 }
 
-nservers=`countnodes ${servers[@]}`
+if [ ${#tests[@]} -eq 0 -o "${tests[0]}" != "write" ]; then
+    echo "First test must be 'write'" 1>&2
+    exit 1
+fi
+
+rsltf="${rslt}.summary"
+iorcf="${rslt}.script"
+workf="${rslt}.detail"
+tmpf="${workf}_tmp"
+
+echo -n > $rsltf
+echo -n > $workf
+
+print_summary () {
+    if [ "$1" = "-n" ]; then
+       minusn=$1; shift
+    else
+       minusn=""
+    fi
+    echo $minusn "$*" >> $rsltf
+    echo $minusn "$*"
+}
+
+min_per_client_size=`parse_number $min_per_client_size`
+min_total_size=`parse_number $min_total_size`
+
+rsize_lo=`parse_number $rsize_lo`
+rsize_hi=`parse_number $rsize_hi`
+
 nclients=`countnodes ${clients[@]}`
 
-if ((max_clients > nclients)); then max_clients=$nclients; fi
-if ((file_per_task)); then minusFopt=-F; else minusFopt=""; fi
+if ((clients_hi > nclients)); then clients_hi=$nclients; fi
+
+for ((rsize=rsize_lo; rsize<=rsize_hi; rsize$rsize_iterator)); do
+    pp_rsize=`pp_number $rsize`
+
+    for ((nclnt=clients_lo; nclnt<=clients_hi; nclnt$clients_iterator)); do
+       test_clients="${cluster}`n2noderange $nclnt ${clients[@]}`"
+
+       per_client_size=$((min_total_size/nclnt))
+       if ((per_client_size < min_per_client_size)); then
+           per_client_size=$min_per_client_size
+       fi
+       total_size=`pp_number $((per_client_size * nclnt))`
+
+       for ((ntask=tasks_per_client_lo; ntask <= tasks_per_client_hi; ntask$tasks_per_client_iterator)); do
+           per_task_size=$((per_client_size/ntask))
+           
+           hdrstr=`printf "Total: %5sB rsize: %4s clients: %4d tasks: %3d: " $total_size $rsize $nclnt $ntask`
+           print_summary -n "$hdrstr"
+
+           for ((test_idx=0; test_idx < ${#tests[@]}; test_idx++)); do
+               test=${tests[$test_idx]}
+               
+               print_summary -n "$test "
+               echo "===========> ${hdrstr} on $test_clients doing $test" >> $workf
+               echo -n > $tmpf
+
+               if [ "$test" = "remount" ]; then
+                   echo "=> $remount" >> $tmpf
+                   $pdsh -S -b -w "$test_clients" >> $tmpf 2>&1 \
+                       "$remount"
+                   status=$?
+                   echo "Completion Status: $status" >> $tmpf
+
+                   if ((status)); then
+                       result="ERROR"
+                   else
+                       result="OK"
+                   fi
+               else
+                   cmd="(mount -t lustre; mount -t lustre_lite) | grep $lustre"
+                   echo "=> $cmd" >> $tmpf
+                   $pdsh -S -b -w "$test_clients" >> $tmpf 2>&1 \
+                       "$cmd"
+                   status=$?
+                   echo "Completion Status: $status" >> $tmpf
 
-for ((i=min_clients - 1;i<max_clients;i++)); do
-    tc=`printf "%3d tasks, %4d clients" $tasks_per_client $((i+1))`
-    echo "=================================== $tc ================================================"
+                   if ((status)); then
+                       cat $tmpf >> $workf
+                       rm $tmpf
+                       print_summary "Lustre NOT mounted on $lustre somewhere"
+                       exit 1
+                   fi
 
-    clients="${cluster}`n2noderange $((i+1)) ${clients[@]}`"
-    per_task_size=$((`parse_number $per_client_size`/tasks_per_client))
-    per_task_size=`pp_number $per_task_size`
+                   cmdline=(
+                   $IOR                     # the command
+                   -o${testfile}            # test file prefix
+                   -b${per_task_size}       # bytes per task
+                   -t${rsize}               # record size
+                   -e                       # fsync before close
+                   -q                       # quit on error
+                   )
 
-    pdsh -b -Rmqsh -w "$clients" -n $tasks_per_client \
-         $IOR -b${per_task_size} -t${transfer_size} $minusFopt -o $testfile -f $script
+                   idx=${#cmdline[@]}
 
+                    # keep the test file(s) unless this is the last test
+                   ((test_idx < ${#tests[@]}-1)) && cmdline[$((idx++))]="-k"
+
+                   # use the existing test file(s) unless this is the first test
+                   ((test_idx > 0)) && cmdline[$((idx++))]="-E"
+
+                   # file-per-task
+                   ((file_per_task)) && cmdline[$((idx++))]="-F"
+
+                   case "$test" in
+                   *write*) cmdline[$((idx++))]="-w"
+                            awkstr="Max Write";;
+                    *)       cmdline[$((idx++))]="-r"
+                            awkstr="Max Read";;
+                    esac
+
+                   echo "=> ${cmdline[@]}" >> $tmpf
+       
+                   $pdsh -S -b -Rmqsh -w "$test_clients" -n $ntask >> $tmpf 2>&1 \
+                       "${cmdline[@]}"
+                   status=$?
+
+                   echo "Completion Status: $status" >> $tmpf
+              
+                   if ((status)); then
+                       result="ERROR"
+                   else
+                       result=`awk < $tmpf "/$awkstr/ {print $ 3; found=1; exit}\
+                                            END       {if (!found) print \"ERROR\"}"`
+                   fi
+               fi
+
+               cat $tmpf >> $workf
+               rm $tmpf
+
+               str=`printf "%8s" "$result"`
+               print_summary -n "$str "
+           done
+           print_summary ""
+       done
+    done
 done
 
+# rm $iorcf
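
The block of variables at the top of the new ior-survey is the only part meant to be edited per site. A minimal sketch of a customized header for a small two-client run (the values below largely echo the commit's own defaults; the /tmp result path is illustrative, not part of the commit):

    cluster=mdev                    # nodes are named mdev<N>
    clients=(7-8)                   # client node numbers
    clients_lo=1;           clients_hi=2;           clients_iterator="+=1"
    tasks_per_client_lo=1;  tasks_per_client_hi=2;  tasks_per_client_iterator="*=2"
    rsize_lo=1M;            rsize_hi=1M;            rsize_iterator="*=2"
    tests=(write remount read)      # first entry must be write
    rslt=/tmp/ior_survey            # prefix for the .summary/.detail/.script files
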
diff --git a/lustre-iokit/obdfilter-survey/README b/lustre-iokit/obdfilter-survey/README
index 1ff1eb2..cea897a 100644
@@ -64,7 +64,11 @@ leave 'ost_names' undefined.
 You can optionally prefix any name in 'ost_names' or 'client_names' with
 the hostname that it is running on (e.g. remote_node:ost4) if your
 obdfilters or echo_clients are running on more than one node.  In this
-case, you need to ensure 'custom_remote_shell()' works on your cluster.
+case, you need to ensure...
+
+(a) 'custom_remote_shell()' works on your cluster
+(b) all pathnames you specify in the script are mounted on the node you
+    start the survey from and all the remote nodes.
 
 Use 'lctl device_list' to verify the obdfilter/echo_client instance names
 e.g...
@@ -83,9 +87,10 @@ on node 'ns9' you could simply add 'ost3' to 'ost_names'.
 When the script runs, it creates a number of working files and a pair of
 result files.  All files start with the prefix given by ${rslt}.
 
-${rslt}_<date/time>.summary       same as stdout
-${rslt}_<date/time>.detail_tmp*   tmp files
-${rslt}_<date/time>.detail        collected tmp files for post-mortem
+${rslt}.summary           same as stdout
+${rslt}.script_*          per-host test script files
+${rslt}.detail_tmp*       per-ost result files
+${rslt}.detail            collected result files for post-mortem
 
 The script iterates over the given numbers of threads and objects
 performing all the specified tests and checking that all test processes
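
To make the multi-node case above concrete, a hypothetical obdfilter-survey configuration using the README's own example names (node ns9 with instance ost3, and the README's remote_node:ost4; the result prefix is illustrative only):

    # obdfilter instances on two nodes; every path used must be visible
    # from the node the survey is started on and from the remote nodes
    ost_names=(ns9:ost3 remote_node:ost4)
    rslt=/tmp/obdfilter_survey
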
diff --git a/lustre-iokit/obdfilter-survey/obdfilter-survey b/lustre-iokit/obdfilter-survey/obdfilter-survey
index da0d797..06c26e8 100755
@@ -41,7 +41,7 @@ restart_rsz=
 restart_thr=1
 restart_nobj=1
 
-# machine's page size
+# machine's page size (K)
 PAGE_SIZE=64
 
 # max buffer_mem (total_threads * buffer size)
@@ -58,16 +58,23 @@ custom_remote_shell () {
     # commands. One of the following will probably work.
     ssh $host "cd $here; $cmds"
     #rsh $host "cd $here; $cmds"
-    #pdsh -w $host "cd $here; $cmds"
+    # we have to remove the leading `uname -n`: from pdsh output lines
+    #pdsh -w $host "cd $here; $cmds" | sed 's/^[^:]*://'
 }
 
 #####################################################################
 # leave the rest of this alone unless you know what you're doing...
 
+lsmod="/sbin/lsmod"
+modprobe="/sbin/modprobe"
+insmod="/sbin/insmod"
+rmmod="/sbin/rmmod"
+
 snap=1
 verify=1
 
 rsltf="${rslt}.summary"
+cmdsf="${rslt}.script"
 workf="${rslt}.detail"
 echo -n > $rsltf
 echo -n > $workf
@@ -91,23 +98,23 @@ remote_shell () {
 
 check_obdecho() {
     local host=$1
-    remote_shell $host lsmod | grep obdecho > /dev/null 2>&1
+    remote_shell $host $lsmod | grep obdecho > /dev/null 2>&1
 }
 
 load_obdecho () {
     local host=$1
     if [ -z "$lustre_root" ]; then
-       remote_shell $host modprobe obdecho
+       remote_shell $host $modprobe obdecho
     elif [ -f ${lustre_root}/obdecho/obdecho.ko ]; then
-       remote_shell $host insmod ${lustre_root}/obdecho/obdecho.ko
+       remote_shell $host $insmod ${lustre_root}/obdecho/obdecho.ko
     else
-       remote_shell $host insmod ${lustre_root}/obdecho/obdecho.o
+       remote_shell $host $insmod ${lustre_root}/obdecho/obdecho.o
     fi
 }
 
 unload_obdecho () {
     local host=$1
-    remote_shell $host rmmod obdecho
+    remote_shell $host $rmmod obdecho
 }
 
 get_devno () {
@@ -294,7 +301,7 @@ for host in ${unique_hosts[@]}; do
     fi
     load_obdecho $host
     if check_obdecho $host; then
-       do_unload_obdecho[$host]=0
+       do_unload_obdecho[$host]=1
        continue
     fi
     echo "Can't load obdecho on $host" 1>&2
@@ -366,29 +373,29 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do
            for test in write $tests; do
                print_summary -n "$test "
                for host in ${unique_hosts[@]}; do
-                   echo -n > ${workf}_${host}_script
+                   echo -n > ${cmdsf}_${host}
                done
                for ((idx=0; idx < ndevs; idx++)); do
                    host=${host_names[$idx]}
                    devno=${devnos[$idx]}
                    tmpfi="${tmpf}_$idx"
                    first_obj=${first_objs[$idx]}
-                   echo >> ${workf}_${host}_script \
+                   echo >> ${cmdsf}_${host} \
                        "$lctl > $tmpfi 2>&1 \\
                          --threads $thr -$snap $devno \\
                         test_brw $count `testname2type $test` q $pages ${thr}t${first_obj} &"
                done
                for host in ${unique_hosts[@]}; do
-                   echo "wait" >> ${workf}_${host}_script
+                   echo "wait" >> ${cmdsf}_${host}
                done
                t0=`date +%s.%N`
                for host in ${unique_hosts[@]}; do
-                   remote_shell $host bash ${workf}_${host}_script&
+                   remote_shell $host bash ${cmdsf}_${host}&
                done
                wait
                t1=`date +%s.%N`
                for host in ${unique_hosts[@]}; do
-                   rm ${workf}_${host}_script
+                   rm ${cmdsf}_${host}
                done
                str=`awk "BEGIN {printf \"%7.2f \",\
                         $total_size / (( $t1 - $t0 ) * 1024)}"`
@@ -408,9 +415,9 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do
                rm $tmpf
                if ((stats[0] <= 0)); then
                    if ((stats[0] < 0)); then
-                       str=`printf "%15s " ERROR`
+                       str=`printf "%17s " ERROR`
                    else
-                       str=`printf "%15s " SHORT`
+                       str=`printf "%17s " SHORT`
                    fi
                else
                    str=`awk "BEGIN {printf \"[%7.2f,%7.2f] \",\
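
A note on the custom_remote_shell() change near the top of this file: pdsh prefixes every output line with the target node name and a colon, so the commented-out pdsh alternative now pipes through the added sed filter to strip that prefix before callers such as check_obdecho() parse the output. An illustrative session (the node name ns9 is made up):

    $ pdsh -w ns9 uname -n
    ns9: ns9
    $ pdsh -w ns9 uname -n | sed 's/^[^:]*://'
     ns9

Only the "nodename:" part is removed; the leading space that remains does not affect the grep/awk consumers in this script.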