# rawdevs=${rawdevs:-"/dev/raw/raw1"}
# scsidevs=`ls /dev/sd[a-z] /dev/sd[a-z][a-z]` # all devices, if you use udev
-# result file prefix. date/time+hostname makes unique
-# NB ensure the path exists if it includes subdirs
+# result file prefix.
+# NB ensure the path exists on all servers if it includes subdirs
rslt_loc=${rslt_loc:-"/tmp"}
-rslt=${rslt:-"$rslt_loc/sgpdd_survey_`date +%F@%R`_`uname -n`"}
+rslt=${rslt:-"$rslt_loc/sgpdd_survey_`date +%F@%R`"}
# what to do (read or write)
actions=${actions:-"write read"}
thrlo=${thrlo:-1}
thrhi=${thrhi:-4096}
+# NUMA support
+# User-provided script that prints the cpu list for the device given as
+# its first argument. Implementation depends on the type of device
+# (scsi/raw, with/without multipath) and on the technology (fc/sas/ib).
+# For example:
+# $ cat bin/dev2cpus
+# #!/bin/bash
+# dev=$(basename $1)
+# pci=$(readlink -f /sys/class/block/$dev | cut -d/ -f1-5)
+# cat ${pci}/local_cpulist
+dev2cpus=${dev2cpus:-""}
+
#####################################################################
# leave the rest of this alone unless you know what you're doing...
# and max # threads one instance will spawn
SG_MAX_QUEUE=16
-# is the sg module loaded?
-sg_is_loaded=$(grep -q "^sg " /proc/modules && echo true || echo false)
+# numactl command
+NUMACTL=${NUMACTL:-"/usr/bin/numactl"}
-# did we load it?
-sg_was_loaded=false
+# Print the distinct arguments, one per line, in sort(1) order.
+# Arguments: the words to de-duplicate (each argument is one item)
+# Outputs:   one item per line on stdout; nothing when called without
+#            arguments
+unique () {
+    # an empty list must yield no output at all, not a blank line
+    [ $# -gt 0 ] || return 0
+    # printf emits every argument verbatim, whereas echo would swallow
+    # words that look like its own options (-n, -e)
+    printf '%s\n' "$@" | sort -u
+}
-# map given device names into SG device names
-i=0
+# Split a "[host:]device" specification into its two parts.
+# Arguments: $1 - device spec, e.g. "node1:/dev/sdb" or "/dev/sdb"
+# Outputs:   "<host> <device>" on stdout. The split happens on the FIRST
+#            colon; host is "localhost" when the spec contains no colon.
+split_hostname () {
+    local name=$1
+    # host is local so that a direct (non-subshell) call cannot overwrite
+    # the global $host the script uses as a loop variable
+    local host=localhost
+
+    case $name in
+    *:*)
+        host=${name%%:*}    # text before the first colon
+        name=${name#*:}     # text after the first colon
+        ;;
+    esac
+    echo "$host $name"
+}
+
+DSH=${DSH:-"ssh"}
+
+# Run a command on another node through the remote shell named by $DSH.
+# Arguments: $1    - node name
+#            $2    - remote user, may be empty
+#            $3... - command words, joined with spaces into one string
+# Returns:   the status of $DSH, or 1 when $DSH is neither ssh nor rsh
+dsh () {
+    local node="$1"
+    local user="$2"
+    shift 2
+    # /sbin and /usr/sbin are prepended on the remote side; \$PATH is
+    # escaped so that it is expanded there, not here
+    local command="export PATH=/sbin:/usr/sbin:\$PATH; $*"
+
+    case "$DSH" in
+    ssh)
+        # user@node, or just node when no user was given
+        "$DSH" "${user:+$user@}$node" "$command"
+        ;;
+    rsh)
+        if [ -n "$user" ]; then
+            "$DSH" -l "$user" "$node" "$command"
+        else
+            "$DSH" "$node" "$command"
+        fi
+        ;;
+    *)
+        # previously an unknown $DSH ran nothing and reported success
+        echo "dsh: unsupported remote shell '$DSH' (use ssh or rsh)" >&2
+        return 1
+        ;;
+    esac
+}
+
+# Run a command line on a given node.
+# Arguments: $1    - target, either "host" or "user@host"
+#            $2... - command words, joined with spaces into one string
+# The string is eval'ed in this shell when the target is "localhost" or
+# equals the output of `uname -n`; otherwise it is handed to dsh together
+# with the (possibly empty) user name.
+remote_shell () {
+    local host=$1
+    shift
+    local cmds="$@"
+    local user=""
+
+    # local target: run here and report the command's own status
+    if [ "$host" = "localhost" ] || [ "$host" = "$(uname -n)" ]; then
+        eval "$cmds"
+        return
+    fi
+
+    # peel an optional "user@" prefix off the target
+    case $host in
+    *@*)
+        user=${host%@*}
+        host=${host#*@}
+        ;;
+    esac
+    dsh $host "$user" "$cmds"
+}
+
+
+# exactly one of scsidevs / rawdevs must be given:
+# refuse to run when both are set, or when neither is
+if [[ -n "$scsidevs" && -n "$rawdevs" ]] || [[ -z "$scsidevs$rawdevs" ]]; then
+    echo "Must either specify scsidevs or rawdevs"
+    exit 1
+fi
+
+# retrieve host and device if specified as "hostname:device"
+ndevs=0
devs=()
-if [ "$scsidevs" ]; then
- # we will test for a LUN, the test for a partition
- # if the partition number is > 9 this will fail
+for d in $scsidevs $rawdevs; do
+ str=(`split_hostname $d`)
+ hosts[$ndevs]=${str[0]}
+ devs[$ndevs]=${str[1]}
+ ndevs=$((ndevs+1))
+done
+unique_hosts=(`unique ${hosts[@]}`)
+# get device cpu list
+devcpus=()
+if [ -n "$dev2cpus" ]; then
+ for ((i=0; i < $ndevs; i++)); do
+ devcpus[$i]=$(remote_shell ${hosts[$i]} $dev2cpus ${devs[$i]})
+ done
+fi
+
+# map given device names into SG device names
+if [ "$scsidevs" ]; then
# make sure sg kernel module is loaded
- if ! $sg_is_loaded; then
- echo "loading the sg kernel module"
- modprobe sg && sg_was_loaded=true
- sg_is_loaded=true
- fi
+    for host in "${unique_hosts[@]}"; do
+        # The probe is passed as ONE string: remote_shell flattens its
+        # arguments and has them parsed again (eval locally, the remote
+        # shell otherwise), so a separately quoted "^sg " would lose its
+        # trailing space and match every module whose name starts with sg.
+        if remote_shell "$host" "grep -q '^sg ' /proc/modules"; then
+            sg_is_loaded=true
+        else
+            sg_is_loaded=false
+        fi
+        if ! $sg_is_loaded; then
+            echo "loading the sg kernel module on $host"
+            # remember the host only when the load worked, so the final
+            # cleanup does not rmmod a module this script never loaded
+            if remote_shell "$host" modprobe sg; then
+                sg_was_loaded_on="$sg_was_loaded_on $host"
+            else
+                echo "failed to load the sg kernel module on $host"
+            fi
+        fi
+    done
- for d in $scsidevs; do
- if [[ -L "$d" ]]; then
- echo "Device $d specified by alias. Will 'readlink' for device name"
- d=$(readlink -f $d)
- fi
- devs[$i]=`sg_map | awk "{if (\\\$2 == \"$d\") print \\\$1}"`
- if [ -z "${devs[i]}" ]; then
- echo "Can't find SG device for $d, testing for partition"
- pt=`echo $d | sed 's/[0-9]*$//'`
- # Try again
- devs[$i]=`sg_map | awk "{if (\\\$2 == \"$pt\") print \\\$1}"`
- if [ -z "${devs[i]}" ]; then
- echo -e "Can't find SG device $pt.\nDo you have the sg module configured for your kernel?"
- exit 1
- fi
+ for ((i=0; i < $ndevs; i++)); do
+ # resolve symbolic link if any
+ devs[$i]=$(remote_shell ${hosts[$i]} readlink -f ${devs[$i]})
+
+ # retrieve associated sg device
+ # we will test for a LUN, then test for a partition
+ # if the partition number is > 9 this will fail
+ tmp=$(remote_shell ${hosts[$i]} sg_map | \
+ awk -v dev=${devs[$i]} '{if ($2 == dev) print $1}')
+ if [ -z "$tmp" ]; then
+ echo "Can't find SG device for ${hosts[$i]}:${devs[$i]}, " \
+ "testing for partition"
+ pt=`echo ${devs[$i]} | sed 's/[0-9]*$//'`
+ # Try again
+ tmp=$(remote_shell ${hosts[$i]} sg_map | \
+ awk -v dev=$pt '{if ($2 == dev) print $1}')
+ if [ -z "$tmp" ]; then
+ echo -e "Can't find SG device ${hosts[$i]}:$pt.\n" \
+ "Do you have the sg module configured for your kernel?"
+ exit 1
+ fi
fi
- i=$((i+1))
+ devs[$i]=$tmp
done
elif [ "$rawdevs" ]; then
- for r in $rawdevs; do
- RES=`raw -q $r`
- if [ $? -eq 0 ];then
- devs[$i]=$r
- i=$((i+1))
- else
- echo "Raw device $r not set up"
+ for ((i=0; i < $ndevs; i++)); do
+ RES=$(remote_shell ${hosts[$i]} raw -q ${devs[$i]})
+ if [ $? -ne 0 ];then
+ echo "Raw device ${hosts[$i]}:${devs[$i]} not set up"
exit 1
fi
done
-else
- echo "Must specify scsidevs or rawdevs"
- exit 1
fi
-ndevs=${#devs[@]}
-
-# determine block size. This should also work for raw devices
+# determine block size of each device. This should also work for raw devices
# If it fails, set to 512
-bs=$((`sg_readcap -lb ${devs[0]} | awk '{print $2}'`))
-if [ $bs == 0 ];then
- echo "sg_readcap failed, setting block size to 512"
- bs=512
-fi
+for ((i=0; i < $ndevs; i++)); do
+ # retrieve block count and block size (in bytes); devsize below is in kbytes
+ tmp=( `remote_shell ${hosts[$i]} sg_readcap -lb ${devs[$i]}` )
+ bs[$i]=$((tmp[1]))
+ if [ ${bs[$i]} == 0 ]; then
+ echo "sg_readcap on device ${hosts[$i]}:${devs[$i]} failed, " \
+ "setting block size to 512"
+ bs[$i]=512
+ fi
+ devsize=$((tmp[0]*bs[$i]/1024))
+
+ # check record size is a multiple of block size
+ if [ $((rszlo*1024%bs[$i])) -ne 0 ]; then
+ echo "Record size is not a multiple of block size (${bs[$i]} bytes) " \
+ "for device ${hosts[$i]}:${devs[$i]}"
+ exit 1
+ fi
+
+ # check device size
+ if [ $devsize -lt $((size*1024)) ]; then
+ echo -e "device ${hosts[$i]}:${devs[$i]} not big enough: " \
+ "$devsize < $((size*1024)).\nConsider reducing \$size"
+ exit 1
+ fi
+done
+
rsltf=${rslt}.summary
workf=${rslt}.detail
+cmdsf=${rslt}.script
echo -n > $rsltf
echo -n > $workf
if ((thr < crg || thr/crg > SG_MAX_QUEUE)); then
continue
fi
- # compute parameters
- bpt=$((rsz*1024/bs))
- blocks=$((size*((1024*1024)/bs)/crg))
- count=$blocks
- # show computed parameters
- actual_rsz=$((bpt*bs/1024))
- actual_size=$((bs*count*crg/1024))
- str=`printf 'total_size %8dK rsz %4d crg %5d thr %5d ' \
- $((actual_size*ndevs)) $actual_rsz $((crg*ndevs)) $((thr*ndevs))`
+ # compute total size (in kbytes)
+ total_size=0
+ for ((i=0; i < $ndevs; i++)); do
+ tsize=$((size*1024*1024/bs[$i]/crg*crg*bs[$i]/1024))
+ total_size=$((total_size+tsize))
+ done
+ # show test parameters
+ str=`printf 'dev %2d sz %8dK rsz %4dK crg %5d thr %5d ' \
+ $ndevs $total_size $rsz $((crg*ndevs)) $((thr*ndevs))`
echo "==============> $str" >> $workf
print_summary -n "$str"
- freemem=`awk < /proc/meminfo '/^MemTotal:/ {printf "%d\n", $2}'`
- if (((actual_rsz*thr/crg + 64)*crg*ndevs > freemem)); then
- print_summary "ENOMEM"
- continue
- fi
+
+ # check memory for each host
+ for host in ${unique_hosts[@]}; do
+ numdevs=0
+ for ((i=0; i < $ndevs; i++)); do
+ if [ ${hosts[$i]} == $host ]; then
+ numdevs=$((numdevs+1))
+ fi
+ done
+ freemem=$(remote_shell $host cat /proc/meminfo | \
+ awk '/^MemTotal:/ {printf "%d\n", $2}')
+ if (((rsz*thr/crg + 64)*crg*numdevs > freemem)); then
+ echo "ENOMEM on $host" >> $workf
+ print_summary "ENOMEM"
+ continue 2
+ fi
+ done
+
# run tests
for action in $actions; do
+ declare -a pidarray
print_summary -n "$action "
echo "=====> $action" >> $workf
tmpf=${workf}_tmp
- # start test
- t0=`date +%s.%N`
- for ((i=0;i<ndevs;i++)); do
- dev=${devs[i]}
- devsize=$((bs*`sg_readcap -lb ${dev} | awk '{print $1}'`/1024))
- if [ $devsize -lt $actual_size ]; then
- _dev=$(sg_map | grep $dev | awk '{ print $2; }')
- echo -e "device $_dev not big enough: $devsize <" \
- "$actual_size.\nConsider reducing \$size"
- exit 1
- fi
+
+ # create per-host script files
+ for host in ${unique_hosts[@]}; do
+ echo -n > ${cmdsf}_${host}
+ done
+ for ((i=0; i < $ndevs; i++)); do
+ bpt=$((rsz*1024/bs[$i]))
+ blocks=$((size*((1024*1024)/bs[$i])/crg))
+ count=$blocks
+ host=${hosts[$i]}
+ dev=${devs[$i]}
if [ $action = read ]; then
inf="if=$dev"
outf="of=/dev/null"
outf="of=$dev"
skip=seek
fi
+ if [ -n "${devcpus[$i]}" -a -x "$NUMACTL" ]; then
+ numacmd="$NUMACTL --physcpubind=${devcpus[$i]} --localalloc"
+ else
+ numacmd=""
+ fi
for ((j=0;j<crg;j++)); do
- sgp_dd 2> ${tmpf}_${i}_${j} \
- $inf $outf ${skip}=$((boundary+j*blocks)) \
- thr=$((thr/crg)) count=$count bs=$bs bpt=$bpt time=1&
+ echo >> ${cmdsf}_${host} \
+ "$numacmd " \
+ "sgp_dd 2> ${tmpf}_${i}_${j} $inf $outf " \
+ "${skip}=$((boundary+j*blocks)) " \
+ "thr=$((thr/crg)) count=$count bs=${bs[$i]} " \
+ "bpt=$bpt time=1&"
done
- done
- wait
+ done
+ for host in ${unique_hosts[@]}; do
+ echo "wait" >> ${cmdsf}_${host}
+ done
+
+ # run all the per-host script files
+ t0=`date +%s.%N`
+ pidcount=0
+ for host in ${unique_hosts[@]}; do
+ remote_shell $host bash < ${cmdsf}_${host} &
+ pidarray[$pidcount]=$!
+ pidcount=$((pidcount+1))
+ done
+ pidcount=0
+ for host in ${unique_hosts[@]}; do
+ wait ${pidarray[$pidcount]}
+ pidcount=$((pidcount+1))
+ done
t1=`date +%s.%N`
- # collect/check individual stats
+
+ # clean up per-host script files
+ for host in ${unique_hosts[@]}; do
+ rm ${cmdsf}_${host}
+ done
+
+ # collect/check individual stats
echo > $tmpf
ok=0
for ((i=0;i<ndevs;i++)); do
for ((j=0;j<crg;j++)); do
- rtmp=${tmpf}_${i}_${j}
+ rtmp=${tmpf}_${i}_${j}_local
+ remote_shell ${hosts[$i]} cat ${tmpf}_${i}_${j} > $rtmp
if grep 'error' $rtmp > /dev/null 2>&1; then
- echo "Error found in $rtmp"
+ echo "Error found in $rtmp"
elif grep 'time to transfer data' $rtmp > /dev/null 2>&1; then
ok=$((ok + 1))
fi
cat ${rtmp} >> $tmpf
cat ${rtmp} >> $workf
rm ${rtmp}
+ remote_shell ${hosts[$i]} rm ${tmpf}_${i}_${j}
done
done
if ((ok != ndevs*crg)); then
print_summary -n "$((ndevs*crg - ok)) failed "
else
- # compute MB/sec from elapsed
- bw=`awk "BEGIN {printf \"%7.2f MB/s\", $actual_size * $ndevs / (( $t1 - $t0 ) * 1024); exit}"`
- # compute MB/sec from nregions*slowest
- check=`awk < $tmpf \
- '/time to transfer data/ {mb=$8/1.048576; if (n == 0 || mb < min) min = mb; n++}\
- END {printf "%5d x %6.2f = %7.2f MB/s", n, min, min * n}'`
- print_summary -n "$bw $check "
+ # compute bandwidth in MiB/s from total data / elapsed time
+ bw=`awk "BEGIN {printf \"%7.2f \", \
+ $total_size / (( $t1 - $t0 ) * 1024); exit}"`
+ # compute global min/max stats
+ minmax=`awk < $tmpf \
+ '/time to transfer data/ {mb=$8/1.048576; \
+ if (n == 0 || mb < min) min = mb; \
+ if (n == 0 || mb > max) max = mb; \
+ n++} \
+ END {printf "[ %7.2f, %7.2f] ",min,max;}'`
+ print_summary -n "$bw $minmax "
fi
rm $tmpf
done
done
done
-if $sg_was_loaded; then
- echo "unloading sg module"
- rmmod sg
-fi
+for host in $sg_was_loaded_on; do
+ echo "unloading sg module on $host"
+ remote_shell $host rmmod sg
+done