#!/bin/bash

######################################################################
# customize per survey
#
# Every setting below is a default: a value already present in the
# environment takes precedence (the ${var:-default} form).

# CHOOSE EITHER scsidevs or rawdevs (exactly one of them must be set)
# scsidevs: the SCSI devices to measure - WARNING: will be erased.
# rawdevs:  the raw devices to use
# rawdevs=${rawdevs:-"/dev/raw/raw1"}
# scsidevs=`ls /dev/sd[a-z] /dev/sd[a-z][a-z]` # all devices, if you use udev

# result file prefix.
# NB ensure the path exists on all servers if it includes subdirs
rslt_loc=${rslt_loc:-"/tmp"}
rslt=${rslt:-"$rslt_loc/sgpdd_survey_$(date +%F@%R)"}

# what to do (read or write)
actions=${actions:-"write read"}

# total size per device (MBytes)
# NB bigger than device cache is good
size=${size:-8192}

# record size (KBytes)
rszlo=${rszlo:-1024}
rszhi=${rszhi:-1024}

# Concurrent regions per device
crglo=${crglo:-1}
crghi=${crghi:-256}

# boundary blocks between concurrent regions per device
boundary=${boundary:-1024}

# threads to share between concurrent regions per device
# multiple threads per region simulates a deeper request queue
# NB survey skips over #thr < #regions and #thr/#regions > SG_MAX_QUEUE
thrlo=${thrlo:-1}
thrhi=${thrhi:-4096}

# NUMA support
# User provided script that returns a cpu list from a specified device.
# Implementation depends on the type of device (scsi/raw, with/without
# multipath, technology fc/sas/ib)
# For example:
#   $ cat bin/dev2cpus
#   #!/bin/bash
#   dev=$(basename $1)
#   pci=$(readlink -f /sys/class/block/$dev | cut -d/ -f1-5)
#   cat ${pci}/local_cpulist
dev2cpus=${dev2cpus:-""}

#####################################################################
# leave the rest of this alone unless you know what you're doing...
# and max # threads one instance will spawn
SG_MAX_QUEUE=16

# numactl command
NUMACTL=${NUMACTL:-"/usr/bin/numactl"}

# Print the distinct arguments, one per line, in sorted order.
unique () {
  echo "$@" | xargs -n1 echo | sort -u
}

# Split a "hostname:device" specification.
# Arguments: $1 - "host:device", or a bare "device"
# Outputs:   "<host> <device>" on stdout; host is "localhost" when $1
#            contains no colon.
split_hostname () {
  local name=$1
  local host

  case $name in
  *:*)
    # everything before the first colon is the host, the rest the device
    host=${name%%:*}
    name=${name#*:}
    ;;
  *)
    host=localhost
    ;;
  esac
  echo "$host $name"
}

# remote shell flavour used by dsh(): "ssh" or "rsh"
DSH=${DSH:-"ssh"}

# Run a command on another node through $DSH.
# Arguments: $1 - node name
#            $2 - user name (may be empty)
#            remaining - command words, joined with spaces
# Returns:   exit status of $DSH, or 1 if $DSH is not ssh or rsh.
dsh () {
  local node="$1"
  local user="$2"
  shift 2
  local command="$*"

  # make sure the sbin directories are searched on the remote side
  command="export PATH=/sbin:/usr/sbin:\$PATH; $command"

  case $DSH in
  ssh)
    if [ -n "$user" ]; then
      user="$user@"
    fi
    $DSH "$user$node" "$command"
    ;;
  rsh)
    if [ -n "$user" ]; then
      user="-l $user"
    fi
    # $user is deliberately unquoted: "-l name" must split into two words,
    # and an empty value must vanish entirely.
    $DSH $user "$node" "$command"
    ;;
  *)
    echo "dsh: unsupported DSH '$DSH' (expected ssh or rsh)" >&2
    return 1
    ;;
  esac
}

# how to run commands on other nodes
# Arguments: $1 - host, optionally "user@host"
#            remaining - command words, joined with spaces
# The command is eval'ed locally when the host is "localhost" or matches
# `uname -n`; otherwise it is handed to dsh().
remote_shell () {
  local host=$1
  shift
  local cmds="$*"

  if [ "$host" = "localhost" ] || [ "$host" = "$(uname -n)" ]; then
    eval "$cmds"
  else
    # split $host into $host and $user
    local user=""
    if [[ $host == *@* ]]; then
      user=${host%@*}
      host=${host#*@}
    fi
    dsh "$host" "$user" "$cmds"
  fi
}

# check either scsidevs or rawdevs is specified
# but only one of them
if [[ ( -n $scsidevs && -n $rawdevs ) || -z $scsidevs$rawdevs ]]; then
  echo "Must either specify scsidevs or rawdevs"
  exit 1
fi

# retrieve host and device if specified as "hostname:device"
ndevs=0
hosts=()
devs=()
for d in $scsidevs $rawdevs; do
  str=($(split_hostname "$d"))
  hosts[ndevs]=${str[0]}
  devs[ndevs]=${str[1]}
  ndevs=$((ndevs+1))
done
unique_hosts=($(unique "${hosts[@]}"))

# get device cpu list
devcpus=()
if [ -n "$dev2cpus" ]; then
  for ((i = 0; i < ndevs; i++)); do
    devcpus[i]=$(remote_shell "${hosts[i]}" "$dev2cpus" "${devs[i]}")
  done
fi

# map given device names into SG device names
if [ -n "$scsidevs" ]; then
  # make sure sg kernel module is loaded
  for host in "${unique_hosts[@]}"; do
    # The grep command is passed as one pre-quoted string: remote_shell
    # joins its arguments and re-parses them, which would otherwise drop
    # the quotes and turn the pattern into a bare ^sg prefix match.
    if ! remote_shell "$host" "grep -q '^sg ' /proc/modules"; then
      echo "loading the sg kernel module on $host"
      remote_shell "$host" modprobe sg
      # remembered so the module can be unloaded again at the end
      sg_was_loaded_on="$sg_was_loaded_on $host"
    fi
  done

  for ((i = 0; i < ndevs; i++)); do
    # resolve symbolic link if any
    devs[i]=$(remote_shell "${hosts[i]}" readlink -f "${devs[i]}")

    # retrieve associated sg device
    # we will test for a LUN, then test for a partition
    # if the partition number is > 9 this will fail
    tmp=$(remote_shell "${hosts[i]}" sg_map |
          awk -v dev="${devs[i]}" '{if ($2 == dev) print $1}')
    if [ -z "$tmp" ]; then
      echo "Can't find SG device for ${hosts[i]}:${devs[i]}, " \
           "testing for partition"
      # strip the trailing digits to get the parent device name
      pt=$(echo "${devs[i]}" | sed 's/[0-9]*$//')
      # Try again
      tmp=$(remote_shell "${hosts[i]}" sg_map |
            awk -v dev="$pt" '{if ($2 == dev) print $1}')
      if [ -z "$tmp" ]; then
        echo -e "Can't find SG device ${hosts[i]}:$pt.\n" \
                "Do you have the sg module configured for your kernel?"
        exit 1
      fi
    fi
    devs[i]=$tmp
  done
elif [ -n "$rawdevs" ]; then
  for ((i = 0; i < ndevs; i++)); do
    # "raw -q" output is captured only to keep it off the terminal
    if ! RES=$(remote_shell "${hosts[i]}" raw -q "${devs[i]}"); then
      echo "Raw device ${hosts[i]}:${devs[i]} not set up"
      exit 1
    fi
  done
fi

# determine block size of each device.
# The sg_readcap probe below should also work for raw devices.
# If it fails, set to 512
for ((i = 0; i < ndevs; i++)); do
  # retrieve device size (in kbytes) and block size (in bytes)
  tmp=( $(remote_shell "${hosts[i]}" sg_readcap -lb "${devs[i]}") )
  bs[i]=$((tmp[1]))
  if (( bs[i] == 0 )); then
    echo "sg_readcap on device ${hosts[i]}:${devs[i]} failed, " \
         "setting block size to 512"
    bs[i]=512
  fi
  devsize=$((tmp[0]*bs[i]/1024))

  # check record size is a multiple of block size
  if (( rszlo*1024 % bs[i] != 0 )); then
    echo "Record size is not a multiple of block size (${bs[i]} bytes) " \
         "for device ${hosts[i]}:${devs[i]}"
    exit 1
  fi

  # check device size
  if (( devsize < size*1024 )); then
    echo -e "device ${hosts[i]}:${devs[i]} not big enough: " \
            "$devsize < $((size*1024)).\nConsider reducing \$size"
    exit 1
  fi
done

# The survey loops double their counter on every pass, so a lower bound
# below 1 would never reach the upper bound (and crglo=0 would divide by
# zero). Reject such settings up front.
for v in rszlo crglo thrlo; do
  if (( ${!v} < 1 )); then
    echo "$v must be at least 1 (got '${!v}')"
    exit 1
  fi
done

rsltf=${rslt}.summary
workf=${rslt}.detail
cmdsf=${rslt}.script
echo -n > "$rsltf"
echo -n > "$workf"

# Write a message both to the summary file and to stdout.
# Arguments: optional leading "-n" (suppress the trailing newline),
#            then the words of the message.
print_summary () {
  local minusn=""

  if [ "$1" = "-n" ]; then
    minusn=$1
    shift
  fi
  # $minusn is unquoted on purpose: when empty it must disappear
  echo $minusn "$*" >> "$rsltf"
  echo $minusn "$*"
}

print_summary "$(date) sgpdd-survey on $rawdevs$scsidevs from $(hostname)"

for ((rsz = rszlo; rsz <= rszhi; rsz *= 2)); do
  for ((crg = crglo; crg <= crghi; crg *= 2)); do
    for ((thr = thrlo; thr <= thrhi; thr *= 2)); do
      # need at least one thread per region and at most SG_MAX_QUEUE
      # threads per region
      if ((thr < crg || thr/crg > SG_MAX_QUEUE)); then
        continue
      fi

      # compute total size (in kbytes)
      total_size=0
      for ((i = 0; i < ndevs; i++)); do
        # integer division rounds each device down to a whole number of
        # blocks per region
        tsize=$((size*1024*1024/bs[i]/crg*crg*bs[i]/1024))
        total_size=$((total_size+tsize))
      done

      # show test parameters
      str=$(printf 'dev %2d sz %8dK rsz %4dK crg %5d thr %5d ' \
            "$ndevs" "$total_size" "$rsz" "$((crg*ndevs))" "$((thr*ndevs))")
      echo "==============> $str" >> "$workf"
      print_summary -n "$str"

      # check memory for each host
      for host in "${unique_hosts[@]}"; do
        numdevs=0
        for ((i = 0; i < ndevs; i++)); do
          if [ "${hosts[i]}" == "$host" ]; then
            numdevs=$((numdevs+1))
          fi
        done
        # NOTE(review): the value read is MemTotal, not free memory,
        # despite the variable name - confirm this is intended.
        freemem=$(remote_shell "$host" cat /proc/meminfo |
                  awk '/^MemTotal:/ {printf "%d\n", $2}')
        if (((rsz*thr/crg + 64)*crg*numdevs > freemem)); then
          echo "ENOMEM on $host" >> "$workf"
          print_summary "ENOMEM"
          # skip to the next thread count
          continue 2
        fi
      done

      # run tests
      for action in $actions; do
        pidarray=()
        print_summary -n "$action "
        echo "=====> $action" >> "$workf"
        tmpf=${workf}_tmp

        # create per-host script files
        for host in "${unique_hosts[@]}"; do
          echo -n > "${cmdsf}_${host}"
        done
        for ((i = 0; i < ndevs; i++)); do
          bpt=$((rsz*1024/bs[i]))
          blocks=$((size*((1024*1024)/bs[i])/crg))
          count=$blocks
          host=${hosts[i]}
          dev=${devs[i]}
          if [ "$action" = read ]; then
            inf="if=$dev"
            outf="of=/dev/null"
            skip=skip
          else
            inf="if=/dev/zero"
            outf="of=$dev"
            skip=seek
          fi
          # NOTE(review): -x tests for numactl on the local machine even
          # when the command will run on another host - confirm.
          if [ -n "${devcpus[i]}" ] && [ -x "$NUMACTL" ]; then
            numacmd="$NUMACTL --physcpubind=${devcpus[i]} --localalloc"
          else
            numacmd=""
          fi
          # NOTE(review): this loop header was lost in the damaged source
          # and is reconstructed from the body (one sgp_dd per region j).
          for ((j = 0; j < crg; j++)); do
            # one background sgp_dd per region; its stderr (which carries
            # the statistics) goes to a per-device, per-region file
            echo >> "${cmdsf}_${host}" \
              "$numacmd " \
              "sgp_dd 2> ${tmpf}_${i}_${j} $inf $outf " \
              "${skip}=$((boundary+j*blocks)) " \
              "thr=$((thr/crg)) count=$count bs=${bs[i]} " \
              "bpt=$bpt time=1&"
          done
        done
        for host in "${unique_hosts[@]}"; do
          echo "wait" >> "${cmdsf}_${host}"
        done

        # run all the per-host script files in parallel and time them
        t0=$(date +%s.%N)
        pidcount=0
        for host in "${unique_hosts[@]}"; do
          remote_shell "$host" bash < "${cmdsf}_${host}" &
          pidarray[pidcount]=$!
          pidcount=$((pidcount+1))
        done
        pidcount=0
        for host in "${unique_hosts[@]}"; do
          wait "${pidarray[pidcount]}"
          pidcount=$((pidcount+1))
        done
        t1=$(date +%s.%N)

        # clean up per-host script files
        for host in "${unique_hosts[@]}"; do
          rm "${cmdsf}_${host}"
        done

        # collect/check individual stats
        # NOTE(review): both loop headers and the fetch into $rtmp were
        # lost in the damaged source; they are reconstructed from the
        # surviving body, and the "_local" suffix is an assumption.
        echo > "$tmpf"
        ok=0
        for ((i = 0; i < ndevs; i++)); do
          for ((j = 0; j < crg; j++)); do
            rtmp=${tmpf}_${i}_${j}_local
            remote_shell "${hosts[i]}" cat "${tmpf}_${i}_${j}" > "$rtmp"
            if grep -q 'error' "$rtmp" 2> /dev/null; then
              echo "Error found in $rtmp"
            elif grep -q 'time to transfer data' "$rtmp" 2> /dev/null; then
              ok=$((ok + 1))
            fi
            cat "$rtmp" >> "$tmpf"
            cat "$rtmp" >> "$workf"
            rm "$rtmp"
            remote_shell "${hosts[i]}" rm "${tmpf}_${i}_${j}"
          done
        done

        if ((ok != ndevs*crg)); then
          print_summary -n "$((ndevs*crg - ok)) failed "
        else
          # compute bandwidth in MiB/s from total data / elapsed time
          bw=$(awk -v kb="$total_size" -v t0="$t0" -v t1="$t1" \
               'BEGIN {printf "%7.2f ", kb / ((t1 - t0) * 1024); exit}')
          # compute global min/max stats
          minmax=$(awk < "$tmpf" '
            /time to transfer data/ {
              mb = $8/1.048576
              if (n == 0 || mb < min) min = mb
              if (n == 0 || mb > max) max = mb
              n++
            }
            END {printf "[ %7.2f, %7.2f] ", min, max}')
          print_summary -n "$bw $minmax "
        fi
        rm "$tmpf"
      done
      print_summary ""
    done
  done
done

# unload sg again on the hosts listed in $sg_was_loaded_on
for host in $sg_was_loaded_on; do
  echo "unloading sg module on $host"
  remote_shell "$host" rmmod sg
done