3 #########################################################################
4 # gather_stats_everywhere:
5 # script on a selection of nodes and collect all the results into a single
8 # Copyright (c) 2007 - Cluster File Systems, Inc.
9 #########################################################################
16 printf $"Usage: gather_stats_everywhere [-help] config_file [start|stop|cleanup] <log_name>\n"
20 The distribution script will run on a single node. It is parameterised
21 with a set of target node names. It may assume ssh/scp to these node
22 names works without requiring a password. It will run in 2 modes...
24 gather_stats_everywhere config_file start
26 ...will copy the script to /tmp everywhere described in
27 config_file running on all the target hosts. And...
29 gather_stats_everywhere config_file stop log_name
31 ...will stop script running on all the hosts it started on and collect
32 all the individual stats files into a single compressed tarball if the log_name is
35 The config file is just a list of shell variable assignments that can be
38 Serveral variables must be set in the config file
40 Targets: the nodes where run the script.
48 options=`getopt -o h --long help:: -- "$@"`
55 eval set -- "$options"
70 if [ $# != 2 -a $# != 3 ] ; then
81 if [ ! -r $CONFIG ]; then
82 error "Config_file: $CONFIG does not exist "
87 if [ -z "$SCRIPT" ]; then
88 error "SCRIPT in ${CONFIG} is empty"
91 if [ -z "$TARGETS" ]; then
92 error "TARGETS in ${CONFIG} is empty"
95 # check that every target node is reachable (responds to ping)
96 Check_nodes_avaible() {
97 local NODES_NOT_AVAIBLE=""
99 for TARGET in $TARGETS; do
100 if ! ping -c 1 -w 3 $TARGET > /dev/null; then
101 NODES_NOT_AVAIBLE=$NODES_NOT_AVAIBLE$TARGET
104 if [ -z "$NODES_NOT_AVAIBLE" ]; then
107 echo "Nodes ${NODES_NOT_AVAIBLE} not respond to ping"
112 if ! Check_nodes_avaible; then
113 error "not all the nodes are availble"
116 Check_nodes_are_clean() {
117 local NODES_NO_CLEAN=""
119 # check whether a copy of the stats script is still running on any target
120 for TARGET in $TARGETS; do
121 ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"`
122 if [ -n "$ps_str" ]; then
123 NODES_NO_CLEAN=${NODES_NO_CLEAN}$TARGET
127 if [ -n "$NODES_NO_CLEAN" ]; then
134 copy_target_script() {
137 # copy the stats script ($SCRIPT) into $TMP on the target
138 copy_cmd="$DCP $SCRIPT ${USER}${target}:$TMP/${SCRIPT}-${target}"
139 ${copy_cmd} 1>/dev/null 2>&1
140 if [ ${PIPESTATUS[0]} != 0 ]; then
141 echo "copy command failed: ${copy_cmd}" 2>&1
144 echo "$SCRIPT copied to ${USER}${target} (into $TMP)"
149 start_target_script() {
152 if ! copy_target_script $target; then
153 echo "copy_target_script $target failed." 2>&1
157 #run the script on the target
158 $DSH ${USER}${target} "VMSTAT_INTERVAL=${VMSTAT_INTERVAL} \
159 SDIO_INTERVAL=${SDIO_INTERVAL} \
160 SERVICE_INTERVAL=${SERVICE_INTERVAL} \
161 BRW_INTERVAL=${BRW_INTERVAL} \
162 JBD_INTERVAL=${JBD_INTERVAL} \
163 IO_INTERVAL=${IO_INTERVAL} \
164 MBALLOC_INTERVAL=${MBALLOC_INTERVAL} \
165 sh ${TMP}/${SCRIPT}-${target} start \
166 1> /dev/null 2>/dev/null </dev/null"
168 if [ ${PIPESTATUS[0]} != 0 ]; then
169 echo "Start the ${SCRIPT} on ${target} failed"
172 echo "Start the ${SCRIPT} on ${target} success"
177 stop_target_script() {
180 #stop the target script first
181 $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} stop" 1>/dev/null 2>&1
182 if [ ${PIPESTATUS[0]} != 0 ]; then
183 echo "stop the collecting stats script on ${target} failed"
186 echo "stop the collecting stats script on ${target} success"
189 # remove the temporary copy of the script from the target
190 $DSH ${USER}${target} "rm -rf $TMP/${SCRIPT}-${target}" 1>/dev/null 2>&1
191 echo "cleanup ${target} tmp file after stop "
195 generate_timestamp() {
196 if [ "X${GLOBAL_TIMESTAMP}" = "X" ]
198 export GLOBAL_TIMESTAMP=`date +%F-%H.%M.%S`
199 echo "Global Timestamp Created: ${GLOBAL_TIMESTAMP}"
206 local date=${GLOBAL_TIMESTAMP}
207 local target_log_name="stats-${target}-${date}"
209 echo "Getting log: ${target_log_name}.tar.gz from ${target}"
210 $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} fetch " \
211 > $TMP/${target_log_name}.tar.gz
212 echo "Got log: ${target_log_name}.tar.gz from ${target}"
214 echo "Moving $TMP/${target_log_name}.tar.gz to $TMP/$log_name"
215 mv $TMP/${target_log_name}.tar.gz $TMP/$log_name
220 local log_name=${GLOBAL_TIMESTAMP}
221 local stat_tar_name=$1
223 local -a clients_array
225 if ! mkdir -p $TMP/$log_name ; then
226 error "can not mkdir $log_name"
229 # retrieve the log tarballs from the remote nodes in the background
231 for TARGET in $TARGETS; do
232 (fetch_target_log ${TARGET}) &
234 clients_array[$n]=$TARGET
239 # wait for all background log fetches to finish
240 for ((n=0; $n < $num_pids; n++)); do
241 wait ${pids_array[$n]}
244 #compress the log tarball
245 cmd="$TAR ${stat_tar_name} $TMP/${log_name}"
246 echo "Creating compressed tar file ${stat_tar_name} from log files in $TMP/${log_name}"
247 ${cmd} 1>/dev/null 2>&1
248 if [ ${PIPESTATUS[0]} == 0 ]; then
249 echo "removing temporary directory $TMP/${log_name}"
250 rm -rf $TMP/${log_name}
252 echo "Compressed logfiles are in $TMP/${stat_tar_name}"
256 stop_targets_script() {
258 local -a clients_array
260 for TARGET in $TARGETS; do
261 (stop_target_script ${TARGET}) &
263 clients_array[$n]=$TARGET
268 # wait for all background stop jobs to finish
269 for ((n=0; $n < $num_pids; n++)); do
270 if ! wait ${pids_array[$n]}; then
271 echo "${clients_array[$n]}: can not stop stats collect"
278 local -a clients_array
281 # check whether the collection script is already running on any target
282 if ! Check_nodes_are_clean ; then
283 error "$SCRIPT already running in some targets, please cleanup first"
286 for TARGET in $TARGETS; do
287 (start_target_script ${TARGET}) &
289 clients_array[$n]=$TARGET
295 # wait for all background start jobs to finish
296 for ((n=0; $n < $num_pids; n++)); do
297 if ! wait ${pids_array[$n]}; then
298 echo "${clients_array[$n]}: can not start stats collect"
303 if [ $RC != 0 ]; then
309 if Check_nodes_are_clean ; then
314 if [ -n "$log" ]; then
321 start) gather_start ;;
322 stop) gather_stop $@;;
323 *) error "Unknown option ${OPTION}"