Whamcloud - gitweb
LU-6142 misc: SPDX for lustre-iokit
[fs/lustre-release.git] / lustre-iokit / sgpdd-survey / sgpdd-survey
1 #!/bin/bash
2 # SPDX-License-Identifier: GPL-2.0
3
4 #
5 # This file is part of Lustre, http://www.lustre.org/
6 #
7
######################################################################
# customize per survey
#
# Every setting below keeps a value already present in the environment
# and only falls back to the default shown when it is unset or empty.

# Devices to measure: set EITHER scsidevs OR rawdevs, never both.
# WARNING: the listed devices will be erased.
# Examples:
# rawdevs=${rawdevs:-"/dev/raw/raw1"}
# scsidevs=`ls /dev/sd[a-z] /dev/sd[a-z][a-z]` # all devices, if you use udev

# result file prefix.
# NB ensure the path exists on all servers if it includes subdirs
: "${rslt_loc:=/tmp}"
: "${rslt:=$rslt_loc/sgpdd_survey_$(date +%F@%R)}"

# what to do (read or write)
: "${actions:=write read}"

# total size per device (MBytes)
# NB bigger than device cache is good
: "${size:=8192}"

# record size (KBytes), swept from rszlo to rszhi
: "${rszlo:=1024}"
: "${rszhi:=1024}"

# Concurrent regions per device, swept from crglo to crghi
: "${crglo:=1}"
: "${crghi:=256}"

# boundary blocks between concurrent regions per device
: "${boundary:=1024}"

# threads to share between concurrent regions per device
# multiple threads per region simulates a deeper request queue
# NB survey skips over #thr < #regions and #thr/#regions > SG_MAX_QUEUE
: "${thrlo:=1}"
: "${thrhi:=4096}"

# NUMA support
# User provided script that returns a cpu list from a specified device.
# Implementation depends on the type of device (scsi/raw, with/without
# multipath, technology fc/sas/ib)
# For example:
#   $ cat bin/dev2cpus
#   #!/bin/bash
#   dev=$(basename $1)
#   pci=$(readlink -f /sys/class/block/$dev | cut -d/ -f1-5)
#   cat ${pci}/local_cpulist
: "${dev2cpus:=}"

#####################################################################
# leave the rest of this alone unless you know what you're doing...

# max # threads one sgp_dd instance will spawn
SG_MAX_QUEUE=16

# numactl command
: "${NUMACTL:=/usr/bin/numactl}"
66
# Print the distinct words found in the arguments, one per line, sorted.
unique () {
    # xargs runs echo when no command is named, so each word still ends
    # up on a line of its own before sort removes the duplicates
    echo "$@" |
        xargs -n 1 |
        sort -u
}
70
# Split a device specification into its host and device parts.
# Arguments: $1 - "device" or "host:device"
# Outputs:   "<host> <device>" on stdout; the host is everything before
#            the first colon, or "localhost" when there is no colon
split_hostname () {
    local name=$1
    # keep host local so a direct call cannot clobber the caller's $host
    local host=localhost

    if [[ $name == *:* ]]; then
        host=${name%%:*}
        name=${name#*:}
    fi
    echo "$host $name"
}
82
# remote shell transport used by dsh: "ssh" (default) or "rsh"
DSH=${DSH:-"ssh"}

# Run a command line on another node through $DSH.
# Arguments: $1  - node name
#            $2  - remote user ("" to let the transport pick its default)
#            $3+ - command line; the words are joined with spaces and
#                  parsed again by the remote shell
# /sbin and /usr/sbin are prepended to the remote PATH.
# Returns:   the transport's exit status, or 1 (with a message on stderr)
#            when $DSH is neither ssh nor rsh
dsh () {
    local node=$1
    local user=$2
    shift 2
    local command="export PATH=/sbin:/usr/sbin:\$PATH; $*"
    local -a target

    case $DSH in
        ssh)
            target=("${user:+$user@}$node")
            ;;
        rsh)
            if [ -n "$user" ]; then
                target=(-l "$user" "$node")
            else
                target=("$node")
            fi
            ;;
        *)
            # previously an unknown transport ran nothing and reported success
            echo "dsh: unsupported remote shell '$DSH' (use ssh or rsh)" >&2
            return 1
            ;;
    esac
    "$DSH" "${target[@]}" "$command"
}
108
# how to run commands on other nodes
#
# Arguments: $1  - "host" or "user@host"; "localhost" or this node's
#                  `uname -n` name runs the command in the current shell
#            $2+ - command line; the words are joined with spaces and
#                  parsed again (by eval locally, by the remote shell via
#                  dsh otherwise), so quoting that must survive has to be
#                  part of the argument text itself
# Returns:   the exit status of the command (or of dsh)
remote_shell () {
    local host=$1
    shift
    local cmds="$*"

    if [[ $host == "localhost" || $host == "$(uname -n)" ]]; then
        eval "$cmds"
        return
    fi

    # split $host into $host and $user
    local user=""
    if [[ $host == *@* ]]; then
        user=${host%@*}
        host=${host#*@}
    fi
    dsh "$host" "$user" "$cmds"
}
126
127
# exactly one of scsidevs / rawdevs has to be given:
# refuse to run when neither is set or when both are
if [[ -z "$scsidevs$rawdevs" ]] || [[ -n "$scsidevs" && -n "$rawdevs" ]]; then
    echo "Must either specify scsidevs or rawdevs"
    exit 1
fi
134
# retrieve host and device if specified as "hostname:device"
# hosts[n] / devs[n] describe the n-th device, ndevs counts them
ndevs=0
devs=()
for d in $scsidevs $rawdevs; do
    str=( $(split_hostname $d) )
    hosts[ndevs]=${str[0]}
    devs[ndevs]=${str[1]}
    ndevs=$((ndevs + 1))
done
# each host once, however many of its devices are under test
unique_hosts=( $(unique ${hosts[@]}) )
145
# get device cpu list
# devcpus[n] is whatever the user's $dev2cpus script prints for device n,
# run on that device's host; it stays empty when no script is configured
devcpus=()
if [[ -n $dev2cpus ]]; then
    for ((i = 0; i < ndevs; i++)); do
        devcpus[i]=$(remote_shell ${hosts[i]} $dev2cpus ${devs[i]})
    done
fi
153
# map given device names into SG device names
if [ "$scsidevs" ]; then
    # make sure sg kernel module is loaded
    for host in ${unique_hosts[@]}; do
        # The pattern is quoted INSIDE a single argument: remote_shell
        # joins its words and has a shell parse them again, so a separate
        # "^sg " word would lose its trailing space and match every module
        # whose name merely starts with "sg".
        if remote_shell $host "grep -q '^sg ' /proc/modules"; then
            sg_is_loaded=true
        else
            sg_is_loaded=false
            echo "loading the sg kernel module on $host"
            remote_shell $host modprobe sg
            # remembered so the module can be unloaded again at the end
            sg_was_loaded_on="$sg_was_loaded_on $host"
        fi
    done

    for ((i = 0; i < ndevs; i++)); do
        # resolve symbolic link if any
        devs[i]=$(remote_shell ${hosts[i]} readlink -f ${devs[i]})

        # retrieve associated sg device
        # first look the name up as given (a LUN); if sg_map does not
        # list it, assume it is a partition, strip the trailing digits
        # and look up the remaining name instead
        tmp=$(remote_shell ${hosts[i]} sg_map |
              awk -v dev="${devs[i]}" '$2 == dev {print $1}')
        if [ -z "$tmp" ]; then
            echo "Can't find SG device for ${hosts[$i]}:${devs[$i]}, " \
                 "testing for partition"
            pt=$(echo ${devs[i]} | sed 's/[0-9]*$//')
            # Try again
            tmp=$(remote_shell ${hosts[i]} sg_map |
                  awk -v dev="$pt" '$2 == dev {print $1}')
            if [ -z "$tmp" ]; then
                echo -e "Can't find SG device ${hosts[$i]}:$pt.\n" \
                        "Do you have the sg module configured for your kernel?"
                exit 1
            fi
        fi
        devs[i]=$tmp
    done
elif [ "$rawdevs" ]; then
    # raw devices are used as given; just make sure each one is bound
    for ((i = 0; i < ndevs; i++)); do
        if ! RES=$(remote_shell ${hosts[i]} raw -q ${devs[i]}); then
            echo "Raw device ${hosts[$i]}:${devs[$i]} not set up"
            exit 1
        fi
    done
fi
200
# determine block size of each device. This should also work for raw devices
# If it fails, set to 512
for ((i = 0; i < ndevs; i++)); do
    # sg_readcap -lb output is split into words: the first is used as the
    # number of blocks, the second as the block size (in bytes)
    tmp=( $(remote_shell ${hosts[i]} sg_readcap -lb ${devs[i]}) )
    bs[i]=$((tmp[1]))
    if ((bs[i] == 0)); then
        echo "sg_readcap on device ${hosts[$i]}:${devs[$i]} failed, " \
             "setting block size to 512"
        bs[i]=512
    fi
    # device size in kbytes
    devsize=$((tmp[0] * bs[i] / 1024))

    # check record size is a multiple of block size
    if ((rszlo * 1024 % bs[i] != 0)); then
        echo "Record size is not a multiple of block size (${bs[$i]} bytes) " \
             "for device ${hosts[$i]}:${devs[$i]}"
        exit 1
    fi

    # check device size ($size is in MBytes, $devsize in kbytes)
    if ((devsize < size * 1024)); then
        echo -e "device ${hosts[$i]}:${devs[$i]} not big enough: " \
                "$devsize < $((size*1024)).\nConsider reducing \$size"
        exit 1
    fi
done
228
# output files, all sharing the $rslt prefix
rsltf="${rslt}.summary"    # one line per tested parameter combination
workf="${rslt}.detail"     # raw per-region output
cmdsf="${rslt}.script"     # prefix of the per-host command scripts
# start from empty summary and detail files
: > $rsltf
: > $workf
234
# Append a message to the summary file ($rsltf) and print it on stdout.
# Arguments: -n  - optional, first: do not end the message with a newline
#            ... - message words, joined with single spaces
print_summary () {
    local text

    if [ "$1" = "-n" ]; then
        shift
        text=$*
    else
        text=$*$'\n'
    fi
    # printf rather than echo: a message such as "-e" must be printed,
    # not taken for an option
    printf '%s' "$text" >> "$rsltf"
    printf '%s' "$text"
}
244
# The sweep loops below double their counter on every pass, so a lower
# bound below 1 would never reach its upper bound (and crg=0 would divide
# by zero): refuse such settings before anything is started.
for bound in rszlo crglo thrlo; do
    if ! ((${!bound} >= 1)); then
        echo "$bound must be a number >= 1"
        exit 1
    fi
done

print_summary "$(date) sgpdd-survey on $rawdevs$scsidevs from $(hostname)"

# Sweep record size, regions per device and threads per device, doubling
# each from its lower to its upper bound; every combination produces one
# summary line holding the results of all $actions.
for ((rsz = rszlo; rsz <= rszhi; rsz *= 2)); do
    for ((crg = crglo; crg <= crghi; crg *= 2)); do
        for ((thr = thrlo; thr <= thrhi; thr *= 2)); do
            # every region needs at least one thread, and one sgp_dd
            # instance takes at most SG_MAX_QUEUE threads
            if ((thr < crg || thr / crg > SG_MAX_QUEUE)); then
                continue
            fi

            # compute total size (in kbytes); the per-device size is
            # rounded down to a whole number of blocks per region
            total_size=0
            for ((i = 0; i < ndevs; i++)); do
                tsize=$((size*1024*1024/bs[i]/crg*crg*bs[i]/1024))
                total_size=$((total_size + tsize))
            done

            # show test parameters
            str=$(printf 'dev %2d sz %8dK rsz %4dK crg %5d thr %5d ' \
                         $ndevs $total_size $rsz $((crg*ndevs)) $((thr*ndevs)))
            echo "==============> $str" >> "$workf"
            print_summary -n "$str"

            # check memory for each host
            for host in ${unique_hosts[@]}; do
                # number of devices under test on this host
                numdevs=0
                for ((i = 0; i < ndevs; i++)); do
                    if [[ ${hosts[i]} == "$host" ]]; then
                        numdevs=$((numdevs + 1))
                    fi
                done
                # NB despite its name, freemem holds MemTotal (kbytes)
                freemem=$(remote_shell $host cat /proc/meminfo |
                          awk '/^MemTotal:/ {printf "%d\n", $2}')
                if (((rsz*thr/crg + 64)*crg*numdevs > freemem)); then
                    echo "ENOMEM on $host" >> "$workf"
                    print_summary "ENOMEM"
                    # give up on this combination: next thread count
                    continue 2
                fi
            done

            # run tests
            for action in $actions; do
                declare -a pidarray
                # start every action with an empty pid list
                pidarray=()
                print_summary -n "$action "
                echo "=====> $action" >> "$workf"
                tmpf=${workf}_tmp

                # create per-host script files
                for host in ${unique_hosts[@]}; do
                    : > "${cmdsf}_${host}"
                done
                for ((i = 0; i < ndevs; i++)); do
                    # blocks per transfer and blocks per region
                    bpt=$((rsz*1024/bs[i]))
                    blocks=$((size*((1024*1024)/bs[i])/crg))
                    count=$blocks
                    host=${hosts[i]}
                    dev=${devs[i]}
                    if [ "$action" = read ]; then
                        inf="if=$dev"
                        outf="of=/dev/null"
                        skip=skip
                    else
                        inf="if=/dev/zero"
                        outf="of=$dev"
                        skip=seek
                    fi
                    # NB $NUMACTL is tested on the node running this
                    # script, not on the device's host
                    if [ -n "${devcpus[i]}" ] && [ -x "$NUMACTL" ]; then
                        numacmd="$NUMACTL --physcpubind=${devcpus[i]} --localalloc"
                    else
                        numacmd=""
                    fi
                    # one background sgp_dd per region; its stderr is kept
                    # in a per-device, per-region file and parsed below
                    for ((j = 0; j < crg; j++)); do
                        echo >> "${cmdsf}_${host}" \
                                "$numacmd " \
                                "sgp_dd 2> ${tmpf}_${i}_${j} $inf $outf " \
                                "${skip}=$((boundary+j*blocks)) " \
                                "thr=$((thr/crg)) count=$count bs=${bs[i]} " \
                                "bpt=$bpt time=1&"
                    done
                done
                for host in ${unique_hosts[@]}; do
                    echo "wait" >> "${cmdsf}_${host}"
                done

                # run all the per-host script files in parallel and time
                # the whole batch
                t0=$(date +%s.%N)
                for host in ${unique_hosts[@]}; do
                    remote_shell $host bash < "${cmdsf}_${host}" &
                    pidarray+=($!)
                done
                wait "${pidarray[@]}"
                t1=$(date +%s.%N)

                # clean up per-host script files
                for host in ${unique_hosts[@]}; do
                    rm "${cmdsf}_${host}"
                done

                # collect/check individual stats
                echo > "$tmpf"
                ok=0
                for ((i = 0; i < ndevs; i++)); do
                    for ((j = 0; j < crg; j++)); do
                        # fetch the region's output into a local copy
                        rtmp=${tmpf}_${i}_${j}_local
                        remote_shell ${hosts[i]} cat ${tmpf}_${i}_${j} > "$rtmp"
                        if grep -q 'error' "$rtmp" 2> /dev/null; then
                            echo "Error found in $rtmp"
                        elif grep -q 'time to transfer data' "$rtmp" 2> /dev/null; then
                            ok=$((ok + 1))
                        fi
                        cat "$rtmp" >> "$tmpf"
                        cat "$rtmp" >> "$workf"
                        rm "$rtmp"
                        remote_shell ${hosts[i]} rm ${tmpf}_${i}_${j}
                    done
                done
                if ((ok != ndevs*crg)); then
                    print_summary -n "$((ndevs*crg - ok)) failed "
                else
                    # compute bandwidth in MiB/s from total data / elapsed time
                    bw=$(awk -v kb="$total_size" -v t0="$t0" -v t1="$t1" \
                             'BEGIN {printf "%7.2f ", kb / ((t1 - t0) * 1024)}')
                    # compute global min/max stats over the per-region
                    # rates (field 8 of the "time to transfer data" lines)
                    minmax=$(awk '
                        /time to transfer data/ {
                            mb = $8 / 1.048576
                            if (n == 0 || mb < min) min = mb
                            if (n == 0 || mb > max) max = mb
                            n++
                        }
                        END { printf "[ %7.2f, %7.2f] ", min, max }
                    ' "$tmpf")
                    print_summary -n "$bw $minmax "
                fi
                rm "$tmpf"
            done
            print_summary ""
        done
    done
done
385
# remove the sg module again from every host listed in sg_was_loaded_on
for host in $sg_was_loaded_on; do
    printf '%s\n' "unloading sg module on $host"
    remote_shell $host rmmod sg
done