X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre-iokit%2Fsgpdd-survey%2Fsgpdd-survey;h=8edd1e8e6c020da532dc313cf72608a5da35df94;hp=16709e302bba1c0fa9b171cddca16afb57401432;hb=e0d53481a038828094a80ea1e36030772fcbf4e7;hpb=6246beb119322f5a31872e26ec51652b7a0702b9 diff --git a/lustre-iokit/sgpdd-survey/sgpdd-survey b/lustre-iokit/sgpdd-survey/sgpdd-survey index 16709e3..8edd1e8 100755 --- a/lustre-iokit/sgpdd-survey/sgpdd-survey +++ b/lustre-iokit/sgpdd-survey/sgpdd-survey @@ -3,56 +3,227 @@ ###################################################################### # customize per survey -# the SCSI devices to measure -scsidevs=${scsidevs:-"/dev/sde /dev/sdh"} +# CHOOSE EITHER scsidevs or rawdevs +# the SCSI devices to measure - WARNING: will be erased. +# The raw devices to use +# rawdevs=${rawdevs:-"/dev/raw/raw1"} +# scsidevs=`ls /dev/sd[a-z] /dev/sd[a-z][a-z]` # all devices, if you use udev -# result file prefix. date/time+hostname makes unique -# NB ensure the path exists if it includes subdirs -rslt=${rslt:-"/tmp/sgpdd_survey_`date +%F@%R`_`uname -n`"} +# result file prefix. +# NB ensure the path exists on all servers if it includes subdirs +rslt_loc=${rslt_loc:-"/tmp"} +rslt=${rslt:-"$rslt_loc/sgpdd_survey_`date +%F@%R`"} # what to do (read or write) -actions="write read" +actions=${actions:-"write read"} # total size per device (MBytes) # NB bigger than device cache is good -size=8192 +size=${size:-8192} # record size (KBytes) -rszlo=1024 -rszhi=1024 +rszlo=${rszlo:-1024} +rszhi=${rszhi:-1024} # Concurrent regions per device crglo=${crglo:-1} crghi=${crghi:-256} +# boundary blocks between concurrent regions per device +boundary=${boundary:-1024} + # threads to share between concurrent regions per device +# multiple threads per region simulates a deeper request queue # NB survey skips over #thr < #regions and #thr/#regions > SG_MAX_QUEUE thrlo=${thrlo:-1} thrhi=${thrhi:-4096} +# NUMA support +# User provided script that returns a cpu list from a specified device. +# Implementation depends on the type of device (scsi/raw, with/without +# multipath, technology fc/sas/ib) +# For example: +# $ cat bin/dev2cpus +# #!/bin/bash +# dev=$(basename $1) +# pci=$(readlink -f /sys/class/block/$dev | cut -d/ -f1-5) +# cat ${pci}/local_cpulist +dev2cpus=${dev2cpus:-""} + ##################################################################### # leave the rest of this alone unless you know what you're doing... -# sgp_dd's idea of disk sector size (Bytes) -bs=512 # and max # threads one instance will spawn SG_MAX_QUEUE=16 -# map given device names into SG device names -i=0 +# numactl command +NUMACTL=${NUMACTL:-"/usr/bin/numactl"} + +unique () { + echo "$@" | xargs -n1 echo | sort -u +} + +split_hostname () { + local name=$1 + case $name in + *:*) host=`echo $name | sed 's/:.*$//'` + name=`echo $name | sed 's/[^:]*://'` + ;; + *) host=localhost + ;; + esac + echo "$host $name" +} + +DSH=${DSH:-"ssh"} + +dsh () { + local node="$1" + local user="$2" + shift 2 + local command="$@" + + command="export PATH=/sbin:/usr/sbin:\$PATH; $command" + + case $DSH in + ssh) + if [ -n "$user" ]; then + user="$user@" + fi + $DSH $user$node "$command" + ;; + rsh) + if [ -n "$user" ]; then + user="-l $user" + fi + $DSH $user $node "$command" + ;; + esac +} + +# how to run commands on other nodes +remote_shell () { + local host=$1 + shift + local cmds="$@" + if [ "$host" = "localhost" -o "$host" = `uname -n` ]; then + eval "$cmds" + else + # split $host into $host and $user + local user="" + if [[ $host == *@* ]]; then + user=${host%@*} + host=${host#*@} + fi + dsh $host "$user" "$cmds" + fi +} + + +# check either scsidevs or rawdevs is specified +# but only one of them +if [ -n "$scsidevs" -a -n "$rawdevs" -o -z "$scsidevs$rawdevs" ]; then + echo "Must either specify scsidevs or rawdevs" + exit 1 +fi + +# retrieve host and device if specified as "hostname:device" +ndevs=0 devs=() -for d in $scsidevs; do - devs[$i]=`sg_map | awk "{if ($ 2 == \"$d\") print $ 1}"` - if [ -z "$devs[$i]" ]; then - echo "Can't find SG device for $d" +for d in $scsidevs $rawdevs; do + str=(`split_hostname $d`) + hosts[$ndevs]=${str[0]} + devs[$ndevs]=${str[1]} + ndevs=$((ndevs+1)) +done +unique_hosts=(`unique ${hosts[@]}`) + +# get device cpu list +devcpus=() +if [ -n "$dev2cpus" ]; then + for ((i=0; i < $ndevs; i++)); do + devcpus[$i]=$(remote_shell ${hosts[$i]} $dev2cpus ${devs[$i]}) + done +fi + +# map given device names into SG device names +if [ "$scsidevs" ]; then + # make sure sg kernel module is loaded + for host in ${unique_hosts[@]}; do + sg_is_loaded=$(remote_shell $host grep -q "^sg " /proc/modules \ + && echo true || echo false) + if ! $sg_is_loaded; then + echo "loading the sg kernel module on $host" + remote_shell $host modprobe sg + sg_was_loaded_on="$sg_was_loaded_on $host" + fi + done + + for ((i=0; i < $ndevs; i++)); do + # resolve symbolic link if any + devs[$i]=$(remote_shell ${hosts[$i]} readlink -f ${devs[$i]}) + + # retrieve associated sg device + # we will test for a LUN, the test for a partition + # if the partition number is > 9 this will fail + tmp=$(remote_shell ${hosts[$i]} sg_map | \ + awk -v dev=${devs[$i]} '{if ($2 == dev) print $1}') + if [ -z "$tmp" ]; then + echo "Can't find SG device for ${hosts[$i]}:${devs[$i]}, " \ + "testing for partition" + pt=`echo ${devs[$i]} | sed 's/[0-9]*$//'` + # Try again + tmp=$(remote_shell ${hosts[$i]} sg_map | \ + awk -v dev=$pt '{if ($2 == dev) print $1}') + if [ -z "$tmp" ]; then + echo -e "Can't find SG device ${hosts[$i]}:$pt.\n" \ + "Do you have the sg module configured for your kernel?" + exit 1 + fi + fi + devs[$i]=$tmp + done +elif [ "$rawdevs" ]; then + for ((i=0; i < $ndevs; i++)); do + RES=$(remote_shell ${hosts[$i]} raw -q ${devs[$i]}) + if [ $? -ne 0 ];then + echo "Raw device ${hosts[$i]}:${devs[$i]} not set up" + exit 1 + fi + done +fi + +# determine block size of each device. This should also work for raw devices +# If it fails, set to 512 +for ((i=0; i < $ndevs; i++)); do + # retrieve device size (in kbytes) and block size (in bytes) + tmp=( `remote_shell ${hosts[$i]} sg_readcap -lb ${devs[$i]}` ) + bs[$i]=$((tmp[1])) + if [ ${bs[$i]} == 0 ]; then + echo "sg_readcap on device ${hosts[$i]}:${devs[$i]} failed, " \ + "setting block size to 512" + bs[$i]=512 + fi + devsize=$((tmp[0]*bs[$i]/1024)) + + # check record size is a multiple of block size + if [ $((rszlo*1024%bs[$i])) -ne 0 ]; then + echo "Record size is not a multiple of block size (${bs[$i]} bytes) " \ + "for device ${hosts[$i]}:${devs[$i]}" + exit 1 + fi + + # check device size + if [ $devsize -lt $((size*1024)) ]; then + echo -e "device ${hosts[$i]}:${devs[$i]} not big enough: " \ + "$devsize < $((size*1024)).\nConsider reducing \$size" exit 1 fi - i=$((i+1)) done -ndevs=${#devs[@]} rsltf=${rslt}.summary workf=${rslt}.detail +cmdsf=${rslt}.script echo -n > $rsltf echo -n > $workf @@ -66,37 +237,60 @@ print_summary () { echo $minusn "$*" } +print_summary "$(date) sgpdd-survey on $rawdevs$scsidevs from $(hostname)" + for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do for ((crg=$crglo;crg<=$crghi;crg*=2)); do for ((thr=$thrlo;thr<=$thrhi;thr*=2)); do if ((thr < crg || thr/crg > SG_MAX_QUEUE)); then continue fi - # compute parameters - bpt=$((rsz*1024/bs)) - blocks=$((size*((1024*1024)/bs)/crg)) - count=$blocks - # show computed parameters - actual_rsz=$((bpt*bs/1024)) - actual_size=$((bs*count*crg/1024)) - str=`printf 'total_size %8dK rsz %4d crg %5d thr %5d ' \ - $((actual_size*ndevs)) $actual_rsz $((crg*ndevs)) $((thr*ndevs))` + # compute total size (in kbytes) + total_size=0 + for ((i=0; i < $ndevs; i++)); do + tsize=$((size*1024*1024/bs[$i]/crg*crg*bs[$i]/1024)) + total_size=$((total_size+tsize)) + done + # show test parameters + str=`printf 'dev %2d sz %8dK rsz %4dK crg %5d thr %5d ' \ + $ndevs $total_size $rsz $((crg*ndevs)) $((thr*ndevs))` echo "==============> $str" >> $workf print_summary -n "$str" - freemem=`awk < /proc/meminfo '/^MemTotal:/ {printf "%d\n", $2}'` - if (((actual_rsz*thr/crg + 64)*crg*ndevs > freemem)); then - print_summary "ENOMEM" - continue - fi + + # check memory for each host + for host in ${unique_hosts[@]}; do + numdevs=0 + for ((i=0; i < $ndevs; i++)); do + if [ ${hosts[$i]} == $host ]; then + numdevs=$((numdevs+1)) + fi + done + freemem=$(remote_shell $host cat /proc/meminfo | \ + awk '/^MemTotal:/ {printf "%d\n", $2}') + if (((rsz*thr/crg + 64)*crg*numdevs > freemem)); then + echo "ENOMEM on $host" >> $workf + print_summary "ENOMEM" + continue 2 + fi + done + # run tests for action in $actions; do + declare -a pidarray print_summary -n "$action " echo "=====> $action" >> $workf tmpf=${workf}_tmp - # start test - t0=`date +%s.%N` - for ((i=0;i ${cmdsf}_${host} + done + for ((i=0; i < $ndevs; i++)); do + bpt=$((rsz*1024/bs[$i])) + blocks=$((size*((1024*1024)/bs[$i])/crg)) + count=$blocks + host=${hosts[$i]} + dev=${devs[$i]} if [ $action = read ]; then inf="if=$dev" outf="of=/dev/null" @@ -106,38 +300,76 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do outf="of=$dev" skip=seek fi + if [ -n "${devcpus[$i]}" -a -x "$NUMACTL" ]; then + numacmd="$NUMACTL --physcpubind=${devcpus[$i]} --localalloc" + else + numacmd="" + fi for ((j=0;j ${tmpf}_${i}_${j} \ - $inf $outf ${skip}=$((1024+j*blocks)) \ - thr=$((thr/crg)) count=$count bs=$bs bpt=$bpt time=1& + echo >> ${cmdsf}_${host} \ + "$numacmd " \ + "sgp_dd 2> ${tmpf}_${i}_${j} $inf $outf " \ + "${skip}=$((boundary+j*blocks)) " \ + "thr=$((thr/crg)) count=$count bs=${bs[$i]} " \ + "bpt=$bpt time=1&" done - done - wait + done + for host in ${unique_hosts[@]}; do + echo "wait" >> ${cmdsf}_${host} + done + + # run of all the per-host script files + t0=`date +%s.%N` + pidcount=0 + for host in ${unique_hosts[@]}; do + remote_shell $host bash < ${cmdsf}_${host} & + pidarray[$pidcount]=$! + pidcount=$((pidcount+1)) + done + pidcount=0 + for host in ${unique_hosts[@]}; do + wait ${pidarray[$pidcount]} + pidcount=$((pidcount+1)) + done t1=`date +%s.%N` - # collect/check individual stats + + # clean up per-host script files + for host in ${unique_hosts[@]}; do + rm ${cmdsf}_${host} + done + + # collect/check individual stats echo > $tmpf ok=0 for ((i=0;i /dev/null 2>&1; then + rtmp=${tmpf}_${i}_${j}_local + remote_shell ${hosts[$i]} cat ${tmpf}_${i}_${j} > $rtmp + if grep 'error' $rtmp > /dev/null 2>&1; then + echo "Error found in $rtmp" + elif grep 'time to transfer data' $rtmp > /dev/null 2>&1; then ok=$((ok + 1)) fi cat ${rtmp} >> $tmpf cat ${rtmp} >> $workf rm ${rtmp} + remote_shell ${hosts[$i]} rm ${tmpf}_${i}_${j} done done if ((ok != ndevs*crg)); then print_summary -n "$((ndevs*crg - ok)) failed " else - # compute MB/sec from elapsed - bw=`awk "BEGIN {printf \"%7.2f MB/s\", $actual_size * $ndevs / (( $t1 - $t0 ) * 1024); exit}"` - # compute MB/sec from nregions*slowest - check=`awk < $tmpf \ - '/time to transfer data/ {mb=$8/1.048576; if (n == 0 || mb < min) min = mb; n++}\ - END {printf "%5d x %6.2f = %7.2f MB/s", n, min, min * n}'` - print_summary -n "$bw $check " + # compute bandwidth in MiB/s from total data / elapsed time + bw=`awk "BEGIN {printf \"%7.2f \", \ + $total_size / (( $t1 - $t0 ) * 1024); exit}"` + # compute global min/max stats + minmax=`awk < $tmpf \ + '/time to transfer data/ {mb=$8/1.048576; \ + if (n == 0 || mb < min) min = mb; \ + if (n == 0 || mb > max) max = mb; \ + n++} \ + END {printf "[ %7.2f, %7.2f] ",min,max;}'` + print_summary -n "$bw $minmax " fi rm $tmpf done @@ -145,3 +377,8 @@ for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do done done done + +for host in $sg_was_loaded_on; do + echo "unloading sg module on $host" + remote_shell $host rmmod sg +done