Whamcloud - gitweb
LU-3963 ptlrpc: convert to linux list api
[fs/lustre-release.git] / lustre-iokit / sgpdd-survey / sgpdd-survey
1 #!/bin/bash
2
3 ######################################################################
4 # customize per survey
5
# CHOOSE EITHER scsidevs or rawdevs - exactly one of them must be set.
# scsidevs: the SCSI devices to measure - WARNING: will be erased.
# rawdevs:  the raw devices to use
# Examples:
# rawdevs=${rawdevs:-"/dev/raw/raw1"}
# scsidevs=`ls /dev/sd[a-z] /dev/sd[a-z][a-z]` # all devices, if you use udev

# Every setting below keeps a value inherited from the environment and
# only falls back to the default shown when it is unset or empty.

# Prefix of the result files.
# NB ensure the path exists on all servers if it includes subdirs
: "${rslt_loc:=/tmp}"
: "${rslt:=$rslt_loc/sgpdd_survey_$(date +%F@%R)}"

# Actions to run (read and/or write)
: "${actions:=write read}"

# Total size per device (MBytes)
# NB bigger than device cache is good
: "${size:=8192}"

# Record size range (KBytes)
: "${rszlo:=1024}"
: "${rszhi:=1024}"

# Range of concurrent regions per device
: "${crglo:=1}"
: "${crghi:=256}"

# Boundary blocks between concurrent regions per device
: "${boundary:=1024}"

# Range of threads to share between concurrent regions per device;
# multiple threads per region simulates a deeper request queue
# NB survey skips over #thr < #regions and #thr/#regions > SG_MAX_QUEUE
: "${thrlo:=1}"
: "${thrhi:=4096}"

# NUMA support
# User provided script that returns a cpu list from a specified device.
# Implementation depends on the type of device (scsi/raw, with/without
# multipath, technology fc/sas/ib)
# For example:
#   $ cat bin/dev2cpus
#   #!/bin/bash
#   dev=$(basename $1)
#   pci=$(readlink -f /sys/class/block/$dev | cut -d/ -f1-5)
#   cat ${pci}/local_cpulist
: "${dev2cpus:=}"
52
53 #####################################################################
54 # leave the rest of this alone unless you know what you're doing...
55
# upper bound on the number of threads one sgp_dd instance will spawn
SG_MAX_QUEUE=16

# numactl command (an inherited NUMACTL value takes precedence)
: "${NUMACTL:=/usr/bin/numactl}"
61
# Print every distinct word found in the arguments, one per line, sorted.
# xargs with no explicit utility runs echo, so each word lands on its
# own line before sort -u removes the duplicates.
unique () {
    echo "$@" |
        xargs -n 1 |
        sort -u
}
65
# Split a "[host:]device" specification.
# Arguments: $1 - device name, optionally prefixed with "hostname:"
# Outputs:   "<host> <device>" on stdout; host is "localhost" when the
#            argument contains no colon. Only the first colon separates
#            host from device, so the device part may itself contain colons.
split_hostname () {
    local name=$1
    # local so that the caller's own $host variable is left untouched
    local host=localhost

    if [[ $name == *:* ]]; then
        host=${name%%:*}
        name=${name#*:}
    fi
    echo "$host $name"
}
77
# remote shell program used by dsh: "ssh" (default) or "rsh"
DSH=${DSH:-"ssh"}

# Run a command line on another node through $DSH.
# Arguments: $1   - node to run on
#            $2   - remote user name, or "" to use the default user
#            $3.. - command and arguments, joined into one command line
# Returns:   the exit status of $DSH, or 1 when $DSH is neither ssh nor rsh
dsh () {
    local node="$1"
    local user="$2"
    shift 2
    local command="$*"

    # sg and module tools usually live in the sbin directories, which a
    # non-interactive remote shell may not have in its PATH
    command="export PATH=/sbin:/usr/sbin:\$PATH; $command"

    case $DSH in
        ssh)
            if [ -n "$user" ]; then
                user="$user@"
            fi
            $DSH "$user$node" "$command"
            ;;
        rsh)
            if [ -n "$user" ]; then
                $DSH -l "$user" "$node" "$command"
            else
                $DSH "$node" "$command"
            fi
            ;;
        *)
            # fail loudly instead of silently running nothing
            echo "dsh: unsupported remote shell '$DSH' (expected ssh or rsh)" >&2
            return 1
            ;;
    esac
}
103
# how to run commands on other nodes
#
# Usage: remote_shell [user@]host command [args...]
# Everything after the host is joined into a single command line. That
# line is eval'ed in the current shell when the host is "localhost" or
# equals the output of `uname -n`; otherwise it is passed to dsh together
# with the optional user name split off the host.
remote_shell () {
    local host=$1
    shift
    local cmds="$@"

    # local execution: no remote shell involved
    if [[ $host == localhost || $host == "$(uname -n)" ]]; then
        eval "$cmds"
        return
    fi

    # remote execution: peel an optional "user@" prefix off the host
    local user=""
    case $host in
        *@*)
            user=${host%@*}
            host=${host#*@}
            ;;
    esac
    dsh $host "$user" "$cmds"
}
121
122
# exactly one of scsidevs / rawdevs has to be given:
# reject both "neither" and "both"
if [[ -z "$scsidevs$rawdevs" ]] || [[ -n "$scsidevs" && -n "$rawdevs" ]]; then
    echo "Must either specify scsidevs or rawdevs"
    exit 1
fi

# split every "hostname:device" entry into the parallel arrays
# hosts[] and devs[]; ndevs counts the entries
ndevs=0
devs=()
for d in $scsidevs $rawdevs; do
    str=( $(split_hostname $d) )
    hosts[$ndevs]=${str[0]}
    devs[$ndevs]=${str[1]}
    (( ndevs += 1 ))
done
unique_hosts=( $(unique ${hosts[@]}) )

# ask the user-provided dev2cpus script (if any) for the cpu list
# of each device, running it on the host that owns the device
devcpus=()
if [[ -n "$dev2cpus" ]]; then
    i=0
    while (( i < ndevs )); do
        devcpus[$i]=$(remote_shell ${hosts[$i]} $dev2cpus ${devs[$i]})
        (( i += 1 ))
    done
fi
148
# map given device names into SG device names
if [ "$scsidevs" ]; then
    # make sure sg kernel module is loaded
    for host in ${unique_hosts[@]}; do
        # The grep command is passed as one pre-quoted string: remote_shell
        # joins its arguments and re-parses the result, which would drop
        # the trailing space of the pattern and make any module whose name
        # merely starts with "sg" count as the sg module.
        if ! remote_shell $host "grep -q '^sg ' /proc/modules" > /dev/null; then
            echo "loading the sg kernel module on $host"
            remote_shell $host modprobe sg
            # remember the host so the module can be unloaded at the end
            sg_was_loaded_on="$sg_was_loaded_on $host"
        fi
    done

    for ((i=0; i < $ndevs; i++)); do
        # resolve symbolic link if any
        devs[$i]=$(remote_shell ${hosts[$i]} readlink -f ${devs[$i]})

        # retrieve associated sg device
        # we will test for a LUN, then test for a partition
        # NOTE(review): the original comment said this fails for partition
        # numbers > 9, yet the sed below strips every trailing digit -
        # confirm whether that limitation still applies.
        tmp=$(remote_shell ${hosts[$i]} sg_map | \
              awk -v dev="${devs[$i]}" '{if ($2 == dev) print $1}')
        if [ -z "$tmp" ]; then
            echo "Can't find SG device for ${hosts[$i]}:${devs[$i]}, " \
                 "testing for partition"
            # drop the trailing digits to get the name to retry with
            pt=$(echo ${devs[$i]} | sed 's/[0-9]*$//')
            # Try again
            tmp=$(remote_shell ${hosts[$i]} sg_map | \
                  awk -v dev="$pt" '{if ($2 == dev) print $1}')
            if [ -z "$tmp" ]; then
                echo -e "Can't find SG device ${hosts[$i]}:$pt.\n" \
                        "Do you have the sg module configured for your kernel?"
                exit 1
            fi
        fi
        devs[$i]=$tmp
    done
elif [ "$rawdevs" ]; then
    for ((i=0; i < $ndevs; i++)); do
        # only the exit status of "raw -q" matters; its output is discarded
        if ! remote_shell ${hosts[$i]} raw -q ${devs[$i]} > /dev/null; then
            echo "Raw device ${hosts[$i]}:${devs[$i]} not set up"
            exit 1
        fi
    done
fi
195
# Determine the block size of each device (this should also work for raw
# devices), falling back to 512 bytes when it cannot be read, then check
# the record size and the device size against it.
for ((i = 0; i < $ndevs; i++)); do
    # the first two words printed by sg_readcap are used as the
    # block count and the block size (in bytes)
    tmp=( $(remote_shell ${hosts[$i]} sg_readcap -lb ${devs[$i]}) )
    bs[$i]=$((tmp[1]))
    if (( bs[i] == 0 )); then
        echo "sg_readcap on device ${hosts[$i]}:${devs[$i]} failed, " \
             "setting block size to 512"
        bs[$i]=512
    fi
    # device size in KBytes
    devsize=$(( tmp[0] * bs[i] / 1024 ))

    # the smallest record size must be a multiple of the block size
    if (( rszlo * 1024 % bs[i] != 0 )); then
        echo "Record size is not a multiple of block size (${bs[$i]} bytes) " \
             "for device ${hosts[$i]}:${devs[$i]}"
        exit 1
    fi

    # the device must hold at least $size MBytes
    if (( devsize < size * 1024 )); then
        echo -e "device ${hosts[$i]}:${devs[$i]} not big enough: " \
                "$devsize < $((size*1024)).\nConsider reducing \$size"
        exit 1
    fi
done
223
# output files derived from the result prefix:
# summary table, detailed sgp_dd output, per-host command scripts
rsltf="${rslt}.summary"
workf="${rslt}.detail"
cmdsf="${rslt}.script"
# create (or truncate) the summary and detail files
: > $rsltf
: > $workf
229
# Print a message on stdout and append the same text to the summary file.
# Globals:   rsltf (read) - path of the summary file
# Arguments: -n (optional, first) - suppress the trailing newline
#            remaining arguments  - message words, joined with spaces
print_summary () {
    # local: the option flag must not leak into the caller's namespace
    local minusn=""
    if [ "$1" = "-n" ]; then
        minusn=$1
        shift
    fi
    # $minusn is left unquoted on purpose so that an empty value vanishes;
    # the file name is quoted so a result path with spaces still works
    echo $minusn "$*" >> "$rsltf"
    echo $minusn "$*"
}
239
print_summary "$(date) sgpdd-survey on $rawdevs$scsidevs from $(hostname)"

# Sweep record size, regions per device and threads per device; each
# parameter doubles from its *lo bound up to its *hi bound.
for ((rsz = $rszlo; rsz <= $rszhi; rsz *= 2)); do
    for ((crg = $crglo; crg <= $crghi; crg *= 2)); do
        for ((thr = $thrlo; thr <= $thrhi; thr *= 2)); do
            # skip combinations with fewer threads than regions or with
            # more than SG_MAX_QUEUE threads per region
            if ((thr < crg || thr/crg > SG_MAX_QUEUE)); then
                continue
            fi

            # total size (in kbytes) over all devices; each device's share
            # is rounded down to a whole number of blocks per region
            total_size=0
            for ((i = 0; i < $ndevs; i++)); do
                tsize=$((size*1024*1024/bs[$i]/crg*crg*bs[$i]/1024))
                (( total_size += tsize ))
            done

            # show test parameters
            printf -v str 'dev %2d sz %8dK rsz %4dK crg %5d thr %5d ' \
                   $ndevs $total_size $rsz $((crg*ndevs)) $((thr*ndevs))
            echo "==============> $str" >> $workf
            print_summary -n "$str"

            # check memory for each host: the estimate for all of the
            # host's devices must not exceed its MemTotal
            for host in ${unique_hosts[@]}; do
                numdevs=0
                for ((i = 0; i < $ndevs; i++)); do
                    if [[ ${hosts[$i]} == "$host" ]]; then
                        (( numdevs += 1 ))
                    fi
                done
                freemem=$(remote_shell $host cat /proc/meminfo |
                          awk '/^MemTotal:/ {printf "%d\n", $2}')
                if (((rsz*thr/crg + 64)*crg*numdevs > freemem)); then
                    echo "ENOMEM on $host" >> $workf
                    print_summary "ENOMEM"
                    # give up on this combination, move on to the next thr
                    continue 2
                fi
            done

            # run tests
            for action in $actions; do
                declare -a pidarray
                print_summary -n "$action "
                echo "=====> $action" >> $workf
                tmpf=${workf}_tmp

                # create (empty) per-host script files
                for host in ${unique_hosts[@]}; do
                    : > ${cmdsf}_${host}
                done

                # one backgrounded sgp_dd command line per device and region
                for ((i = 0; i < $ndevs; i++)); do
                    bpt=$((rsz*1024/bs[$i]))
                    blocks=$((size*((1024*1024)/bs[$i])/crg))
                    count=$blocks
                    host=${hosts[$i]}
                    dev=${devs[$i]}
                    case $action in
                        read)
                            inf="if=$dev"
                            outf="of=/dev/null"
                            skip=skip
                            ;;
                        *)
                            inf="if=/dev/zero"
                            outf="of=$dev"
                            skip=seek
                            ;;
                    esac
                    # pin to the device's cpus when a cpu list is known
                    # and numactl is executable
                    if [[ -n ${devcpus[$i]} && -x $NUMACTL ]]; then
                        numacmd="$NUMACTL --physcpubind=${devcpus[$i]} --localalloc"
                    else
                        numacmd=""
                    fi
                    for ((j = 0; j < crg; j++)); do
                        echo "$numacmd " \
                             "sgp_dd 2> ${tmpf}_${i}_${j} $inf $outf " \
                             "${skip}=$((boundary+j*blocks)) " \
                             "thr=$((thr/crg)) count=$count bs=${bs[$i]} " \
                             "bpt=$bpt time=1&" >> ${cmdsf}_${host}
                    done
                done
                # each script ends by waiting for its background commands
                for host in ${unique_hosts[@]}; do
                    echo "wait" >> ${cmdsf}_${host}
                done

                # run all the per-host script files in parallel and wait
                # for every one of them, timing the whole run
                t0=$(date +%s.%N)
                pidcount=0
                for host in ${unique_hosts[@]}; do
                    remote_shell $host bash < ${cmdsf}_${host} &
                    pidarray[$pidcount]=$!
                    (( pidcount += 1 ))
                done
                pidcount=0
                for host in ${unique_hosts[@]}; do
                    wait ${pidarray[$pidcount]}
                    (( pidcount += 1 ))
                done
                t1=$(date +%s.%N)

                # clean up per-host script files
                for host in ${unique_hosts[@]}; do
                    rm ${cmdsf}_${host}
                done

                # collect/check individual stats: fetch each sgp_dd stderr
                # file, count the runs that reported a transfer time
                echo > $tmpf
                ok=0
                for ((i = 0; i < ndevs; i++)); do
                    for ((j = 0; j < crg; j++)); do
                        rtmp=${tmpf}_${i}_${j}_local
                        remote_shell ${hosts[$i]} cat ${tmpf}_${i}_${j} > $rtmp
                        if grep -q 'error' $rtmp 2> /dev/null; then
                            echo "Error found in $rtmp"
                        elif grep -q 'time to transfer data' $rtmp 2> /dev/null; then
                            (( ok += 1 ))
                        fi
                        cat $rtmp >> $tmpf
                        cat $rtmp >> $workf
                        rm  $rtmp
                        remote_shell ${hosts[$i]} rm ${tmpf}_${i}_${j}
                    done
                done

                if ((ok == ndevs*crg)); then
                    # bandwidth: total KBytes / elapsed seconds / 1024
                    bw=$(awk "BEGIN {printf \"%7.2f \", \
                                     $total_size / (( $t1 - $t0 ) * 1024); exit}")
                    # global min/max of the per-run rates (field 8 of the
                    # "time to transfer data" lines, divided by 1.048576)
                    minmax=$(awk '
                        /time to transfer data/ {
                            mb = $8 / 1.048576
                            if (n == 0 || mb < min) min = mb
                            if (n == 0 || mb > max) max = mb
                            n++
                        }
                        END { printf "[ %7.2f, %7.2f] ", min, max }
                    ' < $tmpf)
                    print_summary -n "$bw $minmax "
                else
                    print_summary -n "$((ndevs*crg - ok)) failed "
                fi
                rm $tmpf
            done
            # end the summary line of this parameter combination
            print_summary ""
        done
    done
done
380
# unload the sg module again on every host where this script loaded it
for host in $sg_was_loaded_on; do
    printf '%s\n' "unloading sg module on $host"
    remote_shell $host rmmod sg
done