# --- Survey parameter configuration (excerpt; intervening lines elided) ---
3 # cluster name (all node names are this followed by the node number)
6 # client node numbers (individual numbers or inclusive ranges)
9 # numbers of clients to survey
# NOTE(review): iterator strings like "+=1" / "*=2" are spliced verbatim into
# C-style for(( ... )) loop headers later in the script (e.g. nclnt$clients_iterator).
12 clients_iterator="+=1"
14 # numbers of tasks per client to survey
17 tasks_per_client_iterator="*=2"
19 # record sizes to survey
24 ## which tests to run (first must be write)
25 # remount) not really a test; just remount to uncache everything
28 #tests=(write rewrite read reread rewrite_again)
# The first element must be "write": later tests reuse the file it creates.
29 tests=(write rewrite remount read reread)
31 # total # bytes written/read by any client node
# Sizes accept a G/M/K suffix; parse_number converts them to bytes later.
32 min_per_client_size=4G
35 # should each task do I/O to its own file?
# Absolute paths to the IOR binary and the lustre mount helper on the clients.
39 IOR="/home/ericb/ior/src/C/IOR"
40 llmount=/home/ericb/lustre/utils/llmount
43 # the result file prefix (date/time + hostname makes unique)
44 #rslt=/home/ericb/ior_survey_`date +%F@%R`_`uname -n`
45 rslt=/home/ericb/ior_survey
47 # where lustre is mounted on the clients
50 # basename of the test file(s)
51 testfile=${lustre}/ior_survey_testfile
53 # how to unmount and remount the F/S on a client (to clear the cache)
54 # change this depending on lustre config (network type, MDS etc)
# NOTE(review): $lustre/$llmount are expanded NOW, at assignment time, so they
# must already be set; the command string is later run remotely via pdsh.
55 remount="umount $lustre && $llmount -o nettype=elan mdev6:/ll_mds/client $lustre"
57 # pdsh args required to instantiate all instances of IOR in parallel
58 # the chosen module must support '-n <procs-per-node>'
59 # -R<module>, -f<fanout> etc
# Don't spin for MPI (Quadrics Elan) completions; sleep in the kernel instead.
62 #don't spin for MPI completions
63 export LIBELAN_WAITTYPE=0
65 ################################################################################
66 # don't change stuff below here unless you know what you're doing...
# Presumably the body of count_range (function header elided in this excerpt):
# "a-b" -> prints b-a+1; a bare number -> prints 1; anything else prints nothing.
69 echo $1 | awk '{ nvals=split($1, vals, "-");\
70 if (nvals == 1) print 1;\
71 else if (nvals == 2) printf "%d\n", vals[2] - vals[1] + 1;}'
# Presumably the body of base_range: prints the low end of "a-b" (or "n" itself).
75 echo $1 | awk '{ split($1, vals, "-"); print vals[1]; }'
# Fragment of a loop that walks the remaining range arguments (enclosing
# function header elided); empty $range terminates the walk.
82 if [ -z "$range" ]; then
85 chunk=`count_range $range`
# Clamp the chunk to the number of nodes still wanted — TODO confirm against
# the elided surrounding lines.
86 if ((chunk > n)); then
87 base=`base_range $range`
# Body fragment of n2noderange (header/closing elided): consumes range args
# one at a time and accumulates a pdsh-style node list "base[-end],..." until
# the requested node count n is satisfied.
100 local range=$1; shift
101 if [ -z "$range" ]; then
104 local base=`base_range $range`
105 local chunk=`count_range $range`
# Never take more nodes from this range than are still needed.
106 if ((chunk > n)); then chunk=n; fi
# $sep is "" on the first range, "," afterwards.
107 local nodes="${nodes}${sep}${base}"; sep=","
# Multi-node chunks are emitted as an inclusive "base-end" span.
108 if ((chunk > 1)); then nodes="${nodes}-$((base+chunk-1))"; fi
# Fragment of what appears to be countnodes: binary-search-like probe — ask
# n2noderange for n+radix nodes with shrinking radix until the request fails,
# to discover how many nodes the ranges actually contain. TODO confirm; the
# loop body and radix update are elided.
117 while ((radix > 0)); do
118 local nodes=`n2noderange $((n+radix)) $@`
119 if [ -n "$nodes" ]; then
# Case arms of parse_number (enclosing case/function elided): strip a
# G/M/K suffix (either case) and print the value scaled to bytes.
130 *G|*g) n=`echo $str | sed 's/[gG]//'`; echo $((n*1024*1024*1024));;
131 *M|*m) n=`echo $str | sed 's/[Mm]//'`; echo $((n*1024*1024));;
132 *K|*k) n=`echo $str | sed 's/[Kk]//'`; echo $((n*1024));;
# Body fragment of pp_number: pretty-print a byte count using the largest
# binary unit (G/M/K) that divides it exactly. The branches that actually
# print are elided from this excerpt.
139 local G=$((1024*1024*1024))
140 local M=$((1024*1024))
142 if ((n%G == 0 && n >= G)); then
144 elif ((n%M == 0 && n >= M)); then
146 elif ((n%K == 0 && n >= K)); then
# Sanity check: the survey depends on test[0] being "write" (it creates the
# file every later test reuses).
153 if [ ${#tests[@]} -eq 0 -o "${tests[0]}" != "write" ]; then
154 echo "First test must be 'write'" 1>&2
# Output files: one-line-per-run summary, plus a verbose detail log.
158 rsltf="${rslt}.summary"
159 workf="${rslt}.detail"
# Fragment of print_summary: a leading -n suppresses the trailing newline
# (passed through to echo via $minusn); the message is appended to the summary.
164 if [ "$1" = "-n" ]; then
169 echo $minusn "$*" >> $rsltf
173 # convert params to actual numbers
174 min_per_client_size=`parse_number $min_per_client_size`
175 min_total_size=`parse_number $min_total_size`
177 rsize_lo=`parse_number $rsize_lo`
178 rsize_hi=`parse_number $rsize_hi`
180 # check on actual numbers of client nodes
# Clamp the surveyed client count to the nodes actually listed in $clients.
181 nclients=`countnodes ${clients[@]}`
182 if ((clients_hi > nclients)); then clients_hi=$nclients; fi
# Main survey: iterate record size x client count x tasks-per-client.
# The third for(()) expression is built by textual splice of the configured
# iterator string (e.g. "rsize*=2"), which is why it looks unusual.
184 for ((rsize=rsize_lo; rsize<=rsize_hi; rsize$rsize_iterator)); do
185 pp_rsize=`pp_number $rsize`
187 for ((nclnt=clients_lo; nclnt<=clients_hi; nclnt$clients_iterator)); do
# pdsh target list: cluster name prefix + the node-number range expression.
188 test_clients="${cluster}`n2noderange $nclnt ${clients[@]}`"
# Split the total evenly across clients, but never go below the per-client floor.
190 per_client_size=$((min_total_size/nclnt))
191 if ((per_client_size < min_per_client_size)); then
192 per_client_size=$min_per_client_size
195 for ((ntask=tasks_per_client_lo; ntask <= tasks_per_client_hi; ntask$tasks_per_client_iterator)); do
196 per_task_size=$((per_client_size/ntask))
# Round the per-task size UP to a whole number of records.
197 if ((per_task_size%rsize != 0)); then
198 per_task_size=$(((per_task_size/rsize + 1)*rsize))
200 total_size=`pp_number $((per_task_size*nclnt*ntask))`
# One header line per parameter combination, prefixed to the per-test results.
202 hdrstr=`printf "Total: %5sB rsize: %4sB clients: %4d tasks: %3d: " \
203 $total_size $pp_rsize $nclnt $ntask`
204 print_summary -n "$hdrstr"
# Run each configured test in order for the current parameter combination.
206 for ((test_idx=0; test_idx < ${#tests[@]}; test_idx++)); do
207 test=${tests[$test_idx]}
209 print_summary -n "$test "
210 echo "===========> ${hdrstr} on $test_clients doing $test" >> $workf
# "remount" pseudo-test: remount lustre on every client to drop caches.
# The actual remote command (presumably $remount fed to pdsh) is on the
# elided continuation line after 216.
214 if [ "$test" = "remount" ]; then
215 echo "=> $remount" >> $tmpf
216 $pdsh -S -b -w "$test_clients" >> $tmpf 2>&1 \
219 echo "Completion Status: $status" >> $tmpf
227 # check lustre is mounted everywhere it's needed
# Works for both old (lustre_lite) and new (lustre) fs type names.
228 cmd="(mount -t lustre; mount -t lustre_lite) | grep $lustre"
229 echo "=> Mount Check: $cmd" >> $tmpf
230 $pdsh -S -b -w "$test_clients" >> $tmpf 2>&1 \
233 echo "Completion Status: $status" >> $tmpf
237 print_summary "Lustre NOT mounted on $lustre somewhere"
# Assemble the IOR argument array (array assignment header elided).
243 -o${testfile} # test file prefix
244 -b${per_task_size} # bytes per task
245 -t${rsize} # record size
246 -e # fsync before close
252 # keep the test file(s) unless this is the last test
253 ((test_idx < ${#tests[@]}-1)) && cmdline[$((idx++))]="-k"
255 # use the existing test file(s) unless this is the first test
256 ((test_idx > 0)) && cmdline[$((idx++))]="-E"
# One file per task (IOR -F) when file_per_task is enabled.
259 ((file_per_task)) && cmdline[$((idx++))]="-F"
# Case arms (elided case header) selecting IOR mode from the test name:
# any *write* test writes (-w), everything else reads (-r).
262 *write*) cmdline[$((idx++))]="-w"
264 *) cmdline[$((idx++))]="-r"
268 echo "=> ${cmdline[@]}" >> $tmpf
# Launch IOR on all clients with $ntask tasks per node; the command being
# run remotely is on the elided continuation line.
270 $pdsh -S -b $pdsh_mpiargs -w "$test_clients" -n $ntask >> $tmpf 2>&1 \
274 echo "Completion Status: $status" >> $tmpf
# Scrape the bandwidth figure (3rd field of the line matching $awkstr) out of
# the captured IOR output; "ERROR" if no such line appeared. Double quotes are
# deliberate so $awkstr expands; "$ 3" keeps the shell from eating awk's $3.
279 result=`awk < $tmpf "/$awkstr/ {print $ 3; found=1; exit}\
280 END {if (!found) print \"ERROR\"}"`
# Right-align the result into an 8-char column for the summary line.
287 str=`printf "%8s" "$result"`
288 print_summary -n "$str "