#!/bin/bash
#* Copyright (C) 2002 Cluster File Systems, Inc.
#* Author: Jitendra Pawar
#*
#* Lustre-iokit is free software; you can redistribute it and/or
#* modify it under the terms of version 2 of the GNU General Public
#* License as published by the Free Software Foundation.
#*
#* Lustre-iokit is distributed in the hope that it will be useful,
#* but WITHOUT ANY WARRANTY; without even the implied warranty of
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#* GNU General Public License for more details.
#*
#* You should have received a copy of the GNU General Public License
#* along with Lustre; if not, write to the Free Software
#* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# Provenance (header of the patch that introduced this file):
#   From: jitendra
#   Date: Fri, 27 Jul 2007 09:42:58 +0000 (+0000)
#   Subject: Improved obdfilter-survey.
#   X-Git-Tag: v1_7_91~123
#   X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=c2ec235e28383022a50c0a85c64844a491569d2e;hp=3b713b9ca7b527805a8a0882e2bd722ac069bf3f
#   Improved obdfilter-survey. added libobd, a common library to setup,
#   echo clients/servers, cleanups and other common lctl related setups
#   and cleanups etc. b=11171 i:nathan
#   New file lustre-iokit/obdfilter-survey/libobd
#   (mode 100644, index 0000000..3beeecf, hunk @@ -0,0 +1,355 @@)

# binaries
lsmod="/sbin/lsmod"
modprobe="/sbin/modprobe"
insmod="/sbin/insmod"
rmmod="/sbin/rmmod"

# Shared state used by the functions in this library.
declare -a ost_names
declare -a host_list
declare -a dev_list
declare -a unique_hosts      # indexed by the "index" that load/unload_obdecho take
declare count
declare -a vmstatpids
declare -a do_unload_echo    # do_unload_echo[i] is 1 iff load_obdecho loaded obdecho on unique_hosts[i]


# This function executes the command sent through parameters to host
# parameters
# 1. hostname
# 2. command to be executed on host (all remaining arguments, joined
#    with single spaces)
# The command runs on the remote side after a chdir to the caller's
# current working directory.
custom_remote_shell () {
    # Keep these local: callers commonly use a variable named "host"
    # themselves and must not have it overwritten by this helper.
    local host=$1
    shift
    local cmds="$*"
    local here
    here=$(pwd)
    # Hop on to the remote node, chdir to 'here' and run the given
    # commands. One of the following will probably work.
    ssh "$host" "cd $here; $cmds"
    #rsh $host "cd $here; $cmds"
    # we have to remove the leading `uname -n`: from pdsh output lines
    #pdsh -w $host "cd $here; $cmds" | sed 's/^[^:]*://'
}

# how to run commands on other nodes
# You need to make this work on your cluster if you have specified
# non-local obd instances above
# parameters
# 1. hostname; "localhost" or this node's `uname -n` means "run here"
# 2. command (all remaining arguments, joined with single spaces)
# The exit status is that of the command (local case) or of
# custom_remote_shell (remote case).
remote_shell () {
    local host=$1
    shift
    local cmds="$*"
    if [[ "$host" == "localhost" || "$host" == "$(uname -n)" ]]; then
        # Local node: evaluate the joined command line in this shell.
        eval "$cmds"
    else
        custom_remote_shell "$host" "$cmds"
    fi
}

# checks whether obdecho module is loaded on given host.
# parameter: 1. hostname
# returns 0 when the host's lsmod output mentions "obdecho".
obdecho_loaded() {
    local host=$1
    remote_shell "$host" "$lsmod" | grep obdecho > /dev/null 2>&1
}

# load obdecho.ko or obdecho.o module on host kernel.
# parameter: 1. index into unique_hosts (not a host name)
# Sets do_unload_echo[index] to 1 only when this call loaded the module,
# so that unload_obdecho leaves a pre-existing module alone.
# returns 0 on success (or if already loaded), 1 if the module is still
# missing after the load attempt.
load_obdecho () {
    local index=$1
    local host=${unique_hosts[$index]}
    do_unload_echo[$index]=0
    if obdecho_loaded "$host"; then
        return 0
    fi
    if [[ -z "$lustre_root" ]]; then
        remote_shell "$host" "$modprobe" obdecho
    # NB: the -f test inspects the local filesystem, while insmod runs on
    # $host; this relies on lustre_root being the same path on both.
    elif [[ -f "${lustre_root}/obdecho/obdecho.ko" ]]; then
        remote_shell "$host" "$insmod" "${lustre_root}/obdecho/obdecho.ko"
    else
        remote_shell "$host" "$insmod" "${lustre_root}/obdecho/obdecho.o"
    fi
    if obdecho_loaded "$host"; then
        do_unload_echo[$index]=1
    else
        echo "Could not install obdecho on $host"
        return 1
    fi
    return 0
}

# load obdecho on every host in unique_hosts; on the first failure
# "cleanup 1" is invoked (cleanup is defined elsewhere in this library).
load_obdechos () {
    local i    # private counter, so the caller's "i" is left alone
    for ((i = 0; i < ${#unique_hosts[@]}; i++)); do
        load_obdecho $i || cleanup 1
    done
}

# unload obdecho module from host kernel.
# parameter: 1. index into unique_hosts (not a host name)
# Does nothing unless load_obdecho recorded that it loaded the module.
unload_obdecho () {
    local index=$1
    local host=${unique_hosts[$index]}
    # An unset flag counts as 0 (nothing to unload).
    if (( ${do_unload_echo[$index]:-0} )); then
        remote_shell "$host" "$rmmod" obdecho
        do_unload_echo[$index]=0
    fi
}

# returns the device number which is displayed in "lctl device_list"
#
# parameter: 1. hostname
#            2. type of device ex: echo_client
#            3.
name of device ex: ECHO_matrix.linsyssoft.com +get_devno () { + local host=$1 + local type=$2 + local name=$3 + remote_shell $host $lctl device_list | \ + awk "{if (\$2 == \"UP\" && \$3 == \"$type\" && \$4 == \"$name\") {\ + print \$1; exit}}" +} + +get_devnos () { + local i=0 + local host + for ((i = 0; i < $count; i++)); do + ost=${ost_names[$i]} + host=${host_list[$i]} + dev=$(get_devno $host obdfilter $ost) + dev_list[$i]=$dev + if [ -z "$dev" ]; then + echo Cant find device for $ost on $host + return 1 + fi + done + return 0 +} + +# do cleanup and exit. +cleanup () { + for ((i = 0; i < ndevs; i++)); do + host=${host_names[$i]} + if [ -n ${do_teardown_ec[$i]} ]; then + teardown_ec_devno $host ${client_names[$i]} + fi + done + pidcount=0 + for host in ${unique_hosts[@]}; do + remote_shell $host "killall vmstat -q" & + pid=$! + kill -term ${vmstatpids[$pidcount]} 2>/dev/null + kill -kill ${vmstatpids[$pidcount]} 2>/dev/null + wait $pid + pidcount=$((pidcount+1)) + if ((${do_unload_obdecho[$host]})); then + unload_obdecho $host + fi + done + if [ $case == "network" ]; then + lctl <&2 + return + fi + client_name=${ost_name}_echo_client + fi + ec=`get_devno $host echo_client $client_name` + if [ -n "$ec" ]; then + echo $ec $client_name $client_name + return + fi + if [ -z "$ost_name" ]; then + echo "no echo client and ost_name not set, client: $client_name, host: $host" 1>&2 + return + fi + ost=`get_devno $host obdfilter $ost_name` + if [ -z "$ost" ]; then + echo "OST $ost_name not setup" 1>&2 + return + fi + remote_shell $host "$lctl <&2 + return + fi + echo $ec $client_name 1 +} + +# Create echo-clients using osc_names and osc_uuid +# It creates echoclients for all osc listed using #lctl device_list command +ec_using_osc () { + local osc_name=$1 + local osc_uuid=$2 + $lctl < $rfile 2>&1 + first=0 + prev=0 + count=0 + error=0 + while read line; do + echo "$line" | grep -q 'is object id' + if [ $? 
-ne 0 ]; then + continue + fi + if [ $first -eq 0 ]; then + first=$(echo $line | awk '{print $6}') + first=$(printf "%d" $first) + prev=$first + count=1 + else + obj=$(echo $line | awk '{print $6}') + obj=$(printf "%d" $obj) + diff=$((obj - (prev+1))) + if [ $diff -ne 0 ]; then + error=1 + fi + prev=$obj + count=$((count+1)) + fi + done < $rfile + if [ $nobj -ne $count ]; then + echo "ERROR: $nobj != $count" >&2 + cat $rfile >&2 + echo "ERROR" + elif [ $error -ne 0 ]; then + echo "ERROR: non contiguous objs found" >&2 + else + echo $first + fi +} + +# destroys all objects created in create_objects routine +# parameter: 3. start obj id. +destroy_objects () { + local host=$1 + local devno=$2 + local obj0=$3 + local nobj=$4 + local rfile=$5 + remote_shell $host $lctl --device $devno destroy $obj0 $nobj > $rfile 2>&1 +} + +get_stats () { + local rfile=$1 + awk < $rfile \ + '/^Selected device [0-9]+$/ {n = 0; next}\ + /error/ {n = -1; exit}\ + /^[0-9]+\/[0-9]+ Total: [0-9]+\.[0-9]+\/second$/ {n++; v=strtonum($3); \ + if (n == 1 || v < min) min = v;\ + if (n == 1 || v > max) max = v;\ + next}\ + {if (n != 0) {n = -1; exit}}\ + END {printf "%d %f %f\n", n, min, max}' +} + +get_global_stats () { + local rfile=$1 + awk < $rfile 'BEGIN {n = 0;}\ + {n++; if (n == 1) {err = $1; min = $2; max = $3} else\ + {if ($1 < err) err = $1;\ + if ($2 < min) min = $2;\ + if ($3 > max) max = $3}}\ + END {if (n == 0) err = 0;\ + printf "%d %f %f\n", err, min, max}' +} + +# enable or disable data check. +# parameter: 1. 
read/write +testname2type () { + # 'x' disables data check + if ((verify)); then + x="" + else + x="x" + fi + case $1 in + *write*) echo "w$x";; + *) echo "r$x";; + esac +} + +print_summary () { + if [ "$1" = "-n" ]; then + minusn=$1; shift + else + minusn="" + fi + echo $minusn "$*" >> $rsltf + echo $minusn "$*" +} + +# Customisation variables +##################################################################### +# One can change variable values in this section as per requirements + +OSTS=${OSTS:-""} +server_nid=${server_nid:-""} +case=${case:-"disk"} +if [ -n "$OSTS" ]; then declare -a ost_names count=0 - for name in $ost_names_str; do + for name in $OSTS; do ost_names[$count]=$name count=$((count+1)) done -else - ost_names=(ost{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}) fi - -#client_names=(ns8:ECHO_ns8 ns9:ECHO_ns9) -client_names_str=${client_names_str:-""} -if [ -n "$client_names_str" ]; then +ECHO_CLIENTS=${ECHO_CLIENTS:-""} +if [ -n "$ECHO_CLIENTS" ]; then # make sure we unset ost_names so that our client_names get noticed... 
unset ost_names declare -a client_names count=0 - for name in $client_names_str; do + for name in $ECHO_CLIENTS; do client_names[$count]=$name count=$((count+1)) done @@ -40,9 +177,6 @@ fi # NB ensure path to it exists rslt=${rslt:-"/tmp/obdfilter_survey_`date +%F@%R`_`uname -n`"} -# lustre root (if running with own source tree) -# lustre_root=${lustre_root:-"/my/directory/lustre"} - # what tests to run (first must be write) tests_str=${tests_str:-""} if [ -n "$tests_str" ]; then @@ -58,7 +192,7 @@ else fi # Set this true to check file contents -verify=${verify:-0} +verify=0 # total size (MBytes) per obd instance # large enough to avoid cache effects @@ -71,11 +205,12 @@ rszhi=${rszhi:-1024} # number of objects per OST nobjlo=${nobjlo:-1} -nobjhi=${nobjhi:-512} +#was nobjhi=${nobjhi:-512} +nobjhi=${nobjhi:-16} # threads per OST (1024 max) thrlo=${thrlo:-1} -thrhi=${thrhi:-64} +thrhi=${thrhi:-16} # restart from here iff all are defined restart_rsz= @@ -92,37 +227,13 @@ PAGE_SIZE=${PAGE_SIZE:-4} # max buffer_mem (total_threads * buffer size) # (to avoid lctl ENOMEM problems) -max_buffer_mem=$((1024*1024)) - -# how to run commands on other nodes -# You need to make this work on your cluster if you have specified -# non-local obd instances above -custom_remote_shell () { - host=$1 - shift - cmds="$*" - here=`pwd` - # Hop on to the remote node, chdir to 'here' and run the given - # commands. One of the following will probably work. - ssh $host "cd $here; $cmds" - #rsh $host "cd $here; $cmds" - # we have to remove the leading `uname -n`: from pdsh output lines - #pdsh -w $host "cd $here; $cmds" | sed 's/^[^:]*://' -} +max_buffer_mem=$((1024 * 1024)) +snap=1 +# Customisation variables ends here. ##################################################################### # leave the rest of this alone unless you know what you're doing... 
-# binaries -lsmod="/sbin/lsmod" -modprobe="/sbin/modprobe" -insmod="/sbin/insmod" -rmmod="/sbin/rmmod" - -# lctl::test_brw bandwidth snapshot interval (seconds) -snap=1 - - if [ ${#tests[@]} -eq 0 -o "${tests[0]}" != "write" ]; then echo "tests: ${tests[@]}" echo "First test must be 'write'" 1>&2 @@ -136,8 +247,6 @@ vmstatf="${rslt}.vmstat" echo -n > $rsltf echo -n > $workf -declare -a vmstatpids - # hide a little trick to unset this from the command line if [ "$lustre_root" == " " ]; then unset lustre_root @@ -149,210 +258,85 @@ else lctl=${lustre_root}/utils/lctl fi -remote_shell () { - host=$1 - shift - cmds="$*" - if [ "$host" = "localhost" -o "$host" = `uname -n` ]; then - eval "$cmds" - else - custom_remote_shell $host "$cmds" +if [ $case == "network" ]; then + if [ -z "$server_nid" ]; then + echo "Specify the server NID" + exit 1; fi -} - -obdecho_loaded() { - local host=$1 - remote_shell $host $lsmod | grep obdecho > /dev/null 2>&1 -} + osc_names_string=`ssh root@"$server_nid" lctl dl` + count=0; + for name in $osc_names_str; do + count=$((count+1)) + done -load_obdecho () { - local host=$1 - if [ -z "$lustre_root" ]; then - remote_shell $host $modprobe obdecho - elif [ -f ${lustre_root}/obdecho/obdecho.ko ]; then - remote_shell $host $insmod ${lustre_root}/obdecho/obdecho.ko - else - remote_shell $host $insmod ${lustre_root}/obdecho/obdecho.o + if [ $count != 0 ]; then + echo "The existing setup must be cleaned"; + exit 0; fi -} - -unload_obdecho () { - local host=$1 - remote_shell $host $rmmod obdecho -} - -get_devno () { - local host=$1 - local type=$2 - local name=$3 - remote_shell $host $lctl device_list | \ - awk "{if (\$2 == \"UP\" && \$3 == \"$type\" && \$4 == \"$name\") {\ - print \$1; exit}}" -} - -get_ec_devno () { - local host=$1 - local client_name="$2" - local ost_name="$3" - if [ -z "$client_name" ]; then - if [ -z "$ost_name" ]; then - echo "client and ost name both null" 1>&2 - return + # Now do the server setup + setup_srv_obd $server_nid 
"ost_testfs" + op_string=`ssh root@"$server_nid" lctl dl` + + obdecho=0 + ost=0 + for name in $op_string; do + if [ "$name" = "obdecho" ]; then + obdecho=1 fi - client_name=${ost_name}_echo_client - fi - ec=`get_devno $host echo_client $client_name` - if [ -n "$ec" ]; then - echo $ec $client_name 0 - return - fi - if [ -z "$ost_name" ]; then - echo "no echo client and ost_name not set, client: $client_name, host: $host" 1>&2 - return - fi - ost=`get_devno $host obdfilter $ost_name` - if [ -z "$ost" ]; then - echo "OST $ost_name not setup" 1>&2 - return - fi - remote_shell $host "$lctl <&2 - return - fi - echo $ec $client_name 1 -} - -teardown_ec_devno () { - local host=$1 - local client_name=$2 - remote_shell $host "$lctl < $rfile 2>&1 - first=0 - prev=0 - count=0 - error=0 - while read line; do - echo "$line" | grep -q 'is object id' - if [ $? -ne 0 ]; then - continue - fi - if [ $first -eq 0 ]; then - first=$(echo $line | awk '{print $6}') - first=$(printf "%d" $first) - prev=$first - count=1 - else - obj=$(echo $line | awk '{print $6}') - obj=$(printf "%d" $obj) - diff=$((obj - (prev+1))) - if [ $diff -ne 0 ]; then - error=1 - fi - prev=$obj - count=$((count+1)) - fi - done < $rfile - if [ $nobj -ne $count ]; then - echo "ERROR: $nobj != $count" >&2 - cat $rfile >&2 - echo "ERROR" - elif [ $error -ne 0 ]; then - echo "ERROR: non contiguous objs found" >&2 - echo "ERROR" - else - echo $first + if (( $obdecho == 0 || $ost == 0 )); then + echo "Server setup not done properly" + exit 1 fi -} - -destroy_objects () { - local host=$1 - local devno=$2 - local obj0=$3 - local nobj=$4 - local rfile=$5 - remote_shell $host $lctl --device $devno destroy $obj0 $nobj > $rfile 2>&1 -} - -get_stats () { - local rfile=$1 - awk < $rfile \ - '/^Selected device [0-9]+$/ {n = 0; next}\ - /error/ {n = -1; exit}\ - /^[0-9]+\/[0-9]+ Total: [0-9]+\.[0-9]+\/second$/ {n++; v=strtonum($3); \ - if (n == 1 || v < min) min = v;\ - if (n == 1 || v > max) max = v;\ - next}\ - {if (n != 0) {n = 
-1; exit}}\ - END {printf "%d %f %f\n", n, min, max}' -} - -get_global_stats () { - local rfile=$1 - awk < $rfile 'BEGIN {n = 0;}\ - {n++; if (n == 1) {err = $1; min = $2; max = $3} else\ - {if ($1 < err) err = $1;\ - if ($2 < min) min = $2;\ - if ($3 > max) max = $3}}\ - END {if (n == 0) err = 0;\ - printf "%d %f %f\n", err, min, max}' -} - -testname2type () { - # 'x' disables data check - if ((verify)); then - x="" - else - x="x" + # Now start client setup + osc_names_str=$(lctl dl) + if [ -n "$osc_names_str" ]; then + echo "The existing setup must be cleaned"; + exit 0; fi - case $1 in - *write*) echo "w$x";; - *) echo "r$x";; - esac -} + ec_using_srv_nid $server_nid "osc_testfs" "test_obdfs" + declare -a client_names + client_names[0]="ECHO_osc_testfs" +fi -print_summary () { - if [ "$1" = "-n" ]; then - minusn=$1; shift - else - minusn="" +if [ -z "$ECHO_CLIENTS" ]; then + if [ $case == "netdisk" ]; then + declare -a osc_names + declare -a osc_uuids + osc_names_str=$(lctl dl |grep osc | awk "{if (\$2 == \"UP\" && \$3 == \"osc\") {print \$4} }") + count=0; + for name in $osc_names_str; do + osc_names[$count]=$name + count=$((count+1)) + done + osc_uuid_str=$(lctl dl |grep osc | awk "{if (\$2 == \"UP\" && \$3 == \"osc\") {print \$5} }") + count=0; + for uuid in $osc_uuid_str; do + osc_uuids[$count]=$uuid + count=$((count+1)) + done + for (( i = 0 ; i < $count; i++ )) + do + ec_using_osc ${osc_names[$i]} ${osc_uuids[$i]} + done + ECHO_CLIENTS=$(lctl dl | grep echo_client | awk "{if (\$2 == \"UP\" && \$3 == \"echo_client\") {print \$4} }") + cnt=0; + for name in $ECHO_CLIENTS; do + client_names[$cnt]=$name + cnt=$((cnt+1)) + done fi - echo $minusn "$*" >> $rsltf - echo $minusn "$*" -} - -unique () { - echo "$@" | xargs -n1 echo | sort -u -} +fi -split_hostname () { - local name=$1 - case $name in - *:*) host=`echo $name | sed 's/:.*$//'` - name=`echo $name | sed 's/[^:]*://'` - ;; - *) host=localhost - ;; - esac - echo "$host $name" -} +if [ -z "$OSTS" ]; then + 
if [ $case == "disk" ]; then + get_targets + fi +fi # split out hostnames from client/ost names ndevs=${#client_names[@]} @@ -361,7 +345,7 @@ if ((ndevs != 0)); then echo "Please specify client_names or ost_names, but not both" 1>&2 exit 1 fi - for ((i=0; i&2 exit 1 fi - for ((i=0; i /proc/sys/lnet/debug" host_vmstatf=${vmstatf}_${host} echo -n > $host_vmstatf remote_shell $host "vmstat 5 >> $host_vmstatf" & @@ -405,7 +388,7 @@ for host in ${unique_hosts[@]}; do done # get all the echo_client device numbers and names -for ((i=0; i Create $nobj on $client_name" >> $workf first_obj=`create_objects $host $devno $nobj $tmpf` cat $tmpf >> $workf - rm -f $tmpf + rm $tmpf if [ $first_obj = "ERROR" ]; then print_summary "created object #s on $client_name not contiguous" exit 1 @@ -478,7 +460,7 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do for host in ${unique_hosts[@]}; do echo -n > ${cmdsf}_${host} done - for ((idx=0; idx < ndevs; idx++)); do + for ((idx = 0; idx < ndevs; idx++)); do host=${host_names[$idx]} devno=${devnos[$idx]} tmpfi="${tmpf}_$idx" @@ -514,7 +496,7 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do t1=`date +%s.%N` # clean up per-host script files for host in ${unique_hosts[@]}; do - rm -f ${cmdsf}_${host} + rm ${cmdsf}_${host} done # compute bandwidth from total data / elapsed time str=`awk "BEGIN {printf \"%7.2f \",\ @@ -522,7 +504,7 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do print_summary -n "$str" # collect/check individual OST stats echo -n > $tmpf - for ((idx=0; idx < ndevs; idx++)); do + for ((idx = 0; idx < ndevs; idx++)); do client_name="${host_names[$idx]}:${client_names[$idx]}" tmpfi="${tmpf}_$idx" echo "=============> $test $client_name" >> $workf @@ -530,13 +512,13 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do scp -q ${host}:$tmpfi $tmpfi > /dev/null cat $tmpfi >> $workf get_stats $tmpfi >> $tmpf - rm -f $tmpfi + rm $tmpfi done # compute/display global min/max stats echo "=============> $test global" >> $workf cat $tmpf >> $workf 
stats=(`get_global_stats $tmpf`) - rm -f $tmpf + rm $tmpf if ((stats[0] <= 0)); then if ((stats[0] < 0)); then str=`printf "%17s " ERROR` @@ -552,7 +534,7 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do done print_summary "" # destroy objects we created - for ((idx=0; idx < ndevs; idx++)); do + for ((idx = 0; idx < ndevs; idx++)); do host=${host_names[$idx]} devno=${devnos[$idx]} client_name="${host}:${client_names[$idx]}" @@ -560,32 +542,11 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do echo "=============> Destroy $nobj on $client_name" >> $workf destroy_objects $host $devno $first_obj $nobj $tmpf cat $tmpf >> $workf - rm -f $tmpf + rm $tmpf done done done done -# tear down any echo clients we created -for ((i=0; i/dev/null - wait $pid - pidcount=$((pidcount+1)) - if ((${do_unload_obdecho[$host]})); then - unload_obdecho $host - fi -done - +cleanup 0 exit 0