#!/bin/bash

# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:

#
# lc_hb - script for generating the Heartbeat HA software's
#         configuration files
#
###############################################################################

# Usage
usage() {
        cat >&2 <<EOF

Usage:  `basename $0`   <-r HBver> <-n hostnames> [-v]
                        <-d target device> [-d target device...]

        -r HBver                the version of Heartbeat software
                                The currently supported Heartbeat versions
                                are: hbv1 (Heartbeat version 1) and hbv2
                                (Heartbeat version 2).
        -n hostnames            the nodenames of the primary node and its
                                failover nodes
                                Multiple nodenames are separated by a colon
                                (:) delimiter. The first one is the nodename
                                of the primary node; the others are failover
                                nodenames.
        -v                      verbose mode
        -d target device        the target device name and mount point
                                The device name and mount point are separated
                                by a colon (:) delimiter.

EOF
        exit 1
}
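# Example invocation (hypothetical node and device names):
#   lc_hb -r hbv2 -n oss1:oss2 -d /dev/sdb:/mnt/lustre/ost0 \
#         -d /dev/sdc:/mnt/lustre/ost1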

# Get the library of functions
. @scriptlibdir@/lc_common

#****************************** Global variables ******************************#
# Heartbeat tools
HB_TOOLS_PATH=${HB_TOOLS_PATH:-"/usr/lib64/heartbeat"}  # Heartbeat tools path
CIB_GEN_SCRIPT=${HB_TOOLS_PATH}/haresources2cib.py
CL_STATUS=${CL_STATUS:-"/usr/bin/cl_status"}

# Service directories and names
HARES_DIR=${HARES_DIR:-"${HA_DIR}/resource.d"}          # Heartbeat resources
LUSTRE_SRV=${LUSTRE_SRV:-"Filesystem"}  # Service script provided by Heartbeat

TMP_DIR=${HB_TMP_DIR}                   # Temporary directory
HACF_TEMP=${TMP_DIR}/ha.cf.temp
AUTHKEYS_TEMP=${TMP_DIR}/authkeys${FILE_SUFFIX}

declare -a NODE_NAMES                   # Node names in the failover group

# Lustre target device names, service names and mount points
declare -a TARGET_DEVNAMES TARGET_SRVNAMES TARGET_MNTPNTS
declare -i TARGET_NUM=0                 # Number of targets


# Get and check the positional parameters
VERBOSE_OUTPUT=false
while getopts "r:n:vd:" OPTION; do
        case $OPTION in
        r)
                HBVER_OPT=$OPTARG
                if [ "${HBVER_OPT}" != "${HBVER_HBV1}" ] \
                && [ "${HBVER_OPT}" != "${HBVER_HBV2}" ]; then
                        error_output "Invalid Heartbeat software" \
                                  "version - ${HBVER_OPT}!"
                        usage
                fi
                ;;
        n)
                HOSTNAME_OPT=$OPTARG
                PRIM_NODENAME=`echo ${HOSTNAME_OPT} | awk -F":" '{print $1}'`
                if [ -z "${PRIM_NODENAME}" ]; then
                        error_output "Missing primary nodename!"
                        usage
                fi
                HOSTNAME_NUM=`echo ${HOSTNAME_OPT} | awk -F":" '{print NF}'`
                if [ ${HOSTNAME_NUM} -lt 2 ]; then
                        error_output "Missing failover nodenames!"
                        usage
                fi
                if [ "${HBVER_OPT}" = "${HBVER_HBV1}" -a ${HOSTNAME_NUM} -gt 2 ]
                then
                        error_output "Heartbeat version 1 can" \
                                  "only support 2 nodes!"
                        usage
                fi
                ;;
        v)
                VERBOSE_OUTPUT=true
                ;;
        d)
                DEVICE_OPT=$OPTARG
                TARGET_DEVNAMES[TARGET_NUM]=`echo ${DEVICE_OPT}|awk -F: '{print $1}'`
                TARGET_MNTPNTS[TARGET_NUM]=`echo ${DEVICE_OPT}|awk -F: '{print $2}'`
                if [ -z "${TARGET_DEVNAMES[TARGET_NUM]}" ]; then
                        error_output "Missing target device name!"
                        usage
                fi
                if [ -z "${TARGET_MNTPNTS[TARGET_NUM]}" ]; then
                        error_output "Missing mount point for target"\
                                  "${TARGET_DEVNAMES[TARGET_NUM]}!"
                        usage
                fi
                TARGET_NUM=$(( TARGET_NUM + 1 ))
                ;;
        ?)
                usage
        esac
done

# Check the required parameters
if [ -z "${HBVER_OPT}" ]; then
        error_output "Missing -r option!"
        usage
fi

if [ -z "${HOSTNAME_OPT}" ]; then
        error_output "Missing -n option!"
        usage
fi

if [ -z "${DEVICE_OPT}" ]; then
        error_output "Missing -d option!"
        usage
fi

# get_nodenames
#
# Get all the node names in this failover group
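# For example (hypothetical hostnames), "-n nodeA:nodeB" yields
# NODE_NAMES=(nodeA nodeB), with nodeA as the primary node.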
get_nodenames() {
        declare -i idx
        local nodename_str nodename

        # Split the colon-separated hostname list
        nodename_str=`echo ${HOSTNAME_OPT}|awk '{split($0, a, ":")}\
                      END {for (i in a) print a[i]}'`
        idx=0
        for nodename in ${nodename_str}
        do
                NODE_NAMES[idx]=${nodename}
                idx=$(( idx + 1 ))
        done

        return 0
}

# check_remote_file host_name file
#
# Run remote command to check whether @file exists on @host_name
check_remote_file() {
        local host_name=$1
        local file_name=$2

        if [ -z "${host_name}" ]; then
                error_output "check_remote_file():"\
                         "Missing hostname!"
                return 1
        fi

        if [ -z "${file_name}" ]; then
                error_output "check_remote_file():"\
                         "Missing file name!"
                return 1
        fi

        # Execute remote command to check the file
        ${REMOTE} ${host_name} "[ -e ${file_name} ]"
        if [ $? -ne 0 ]; then
                error_output "check_remote_file():"\
                "${file_name} does not exist on host ${host_name}!"
                return 1
        fi

        return 0
}

# hb_running host_name
#
# Run remote command to check whether the heartbeat service is running on @host_name
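# Returns 0 if heartbeat is running, 1 if it is stopped, and 2 if the
# remote cl_status invocation itself failed.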
hb_running() {
        local host_name=$1
        local ret_str

        ret_str=`${REMOTE} ${host_name} "${CL_STATUS} hbstatus" 2>&1`
        if [ $? -ne 0 ]; then
                if [ "${ret_str}" = "${ret_str#*stop*}" ]; then
                        error_output "hb_running():"\
                        "remote command to ${host_name} error: ${ret_str}!"
                        return 2
                else
                        return 1
                fi
        fi

        return 0
}

# stop_heartbeat host_name
#
# Run remote command to stop the heartbeat service running on @host_name
stop_heartbeat() {
        local host_name=$1
        local ret_str

        ret_str=$(${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin
service heartbeat stop < /dev/null" 2>&1)
        if [ $? -ne 0 ]; then
                error_output "stop_heartbeat():"\
                "remote command to ${host_name} error: ${ret_str}!"
                return 1
        fi

        echo "`basename $0`: Heartbeat service is stopped on node ${host_name}."
        return 0
}

# check_heartbeat
#
# Run remote command to check each node's heartbeat service
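# Returns 0 on success, 1 on error, and 2 if the user chooses not to stop
# a running heartbeat service and regenerate the configurations.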
check_heartbeat() {
        declare -i idx
        local OK

        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                # Check Heartbeat configuration directory
                if ! check_remote_file ${NODE_NAMES[idx]} ${HA_DIR}; then
                        error_output "check_heartbeat():"\
                        "Is the Heartbeat package installed?"
                        return 1
                fi

                if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                        # Check mon configuration directory
                        if ! check_remote_file ${NODE_NAMES[idx]} ${MON_DIR}; then
                                error_output "check_heartbeat():"\
                                "Is the mon package installed?"
                                return 1
                        fi
                fi

                if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
                        # Check crm directory
                        if ! check_remote_file ${NODE_NAMES[idx]} ${CIB_DIR}; then
                                error_output "check_heartbeat():"\
                                "Is the Heartbeat v2 package installed?"
                                return 1
                        fi
                fi

                # Check heartbeat service status
                hb_running ${NODE_NAMES[idx]}
                rc=$?
                if [ "$rc" -eq "2" ]; then
                        return 1
                elif [ "$rc" -eq "1" ]; then
                        verbose_output "Heartbeat service is stopped on"\
                        "node ${NODE_NAMES[idx]}."
                elif [ "$rc" -eq "0" ]; then
                        OK=
                        echo -n "`basename $0`: Heartbeat service is running on"\
                        "${NODE_NAMES[idx]}. Stop the service and"\
                        "generate new configurations? [y/n]:"
                        read OK
                        if [ "${OK}" = "n" ]; then
                                echo "`basename $0`: New Heartbeat configurations"\
                                "are not generated."
                                return 2
                        fi

                        # Stop heartbeat service
                        stop_heartbeat ${NODE_NAMES[idx]}
                fi
        done

        return 0
}

# get_srvname hostname target_devname
#
# Get the Lustre target server name from the node @hostname
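# The name is parsed from the "Target:" line of tunefs.lustre output,
# which looks like this (illustrative):
#   Target:     lustre-OST0000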
get_srvname() {
        local host_name=$1
        local target_devname=$2
        local target_srvname=
        local ret_str

        # Execute remote command to get the target server name
        ret_str=$(${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin
${TUNEFS} --print --verbose ${target_devname} | grep Target:" 2>&1)
        if [ $? -ne 0 ]; then
                echo "`basename $0`: get_srvname() error:" \
                     "from host ${host_name} - ${ret_str}"
                return 1
        fi

        if [ "${ret_str}" != "${ret_str#*Target: }" ]; then
                ret_str=${ret_str#*Target: }
                target_srvname=`echo ${ret_str} | awk '{print $1}'`
        fi

        if [ -z "${target_srvname}" ]; then
                echo "`basename $0`: get_srvname() error: Cannot get the"\
                     "server name of target ${target_devname} on ${host_name}!"
                return 1
        fi

        echo ${target_srvname}
        return 0
}

# get_srvnames
#
# Get server names of all the Lustre targets in this failover group
get_srvnames() {
        declare -i i

        # Initialize the TARGET_SRVNAMES array
        unset TARGET_SRVNAMES

        # Get Lustre target service names
        for ((i = 0; i < ${#TARGET_DEVNAMES[@]}; i++)); do
                TARGET_SRVNAMES[i]=$(get_srvname ${PRIM_NODENAME} \
                                     ${TARGET_DEVNAMES[i]})
                if [ $? -ne 0 ]; then
                        error_output "${TARGET_SRVNAMES[i]}"
                        return 1
                fi
        done

        return 0
}

# create_template
#
# Create the templates for ha.cf and authkeys files
create_template() {
        /bin/mkdir -p ${TMP_DIR}

        # Create the template for ha.cf
        if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                cat >${HACF_TEMP} <<EOF
debugfile /var/log/ha-debug
logfile /var/log/ha-log
logfacility     local0
keepalive 2
deadtime 30
initdead 120

auto_failback off

EOF
        elif [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
                cat >${HACF_TEMP} <<EOF
use_logd        yes
keepalive 1
deadtime 10
initdead 60

crm yes

EOF
        fi

        # Create the template for authkeys
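        # Note: the sha1 key below is a fixed sample shared secret used by
        # Heartbeat to authenticate cluster traffic; a real deployment
        # should replace it with a site-specific value.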
        if [ ! -s ${AUTHKEYS_TEMP} ]; then
                cat >${AUTHKEYS_TEMP} <<EOF
auth 1
1 sha1 HelloLustre!
EOF
        fi

        return 0
}

# create_hacf
#
# Create the ha.cf file and scp it to each node's /etc/ha.d/
create_hacf() {
        HACF_PRIMNODE=${TMP_DIR}$"/ha.cf."${PRIM_NODENAME}
        HACF_LUSTRE=${TMP_DIR}$"/ha.cf"${FILE_SUFFIX}

        declare -i idx

        if [ -e ${HACF_PRIMNODE} ]; then
                # The ha.cf file for the primary node already exists
                verbose_output "${HACF_PRIMNODE} already exists."
                return 0
        fi

        /bin/cp -f ${HACF_TEMP} ${HACF_LUSTRE}

        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                echo "node    ${NODE_NAMES[idx]}" >> ${HACF_LUSTRE}
        done

        # scp the ha.cf file to all the nodes
        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                touch ${TMP_DIR}$"/ha.cf."${NODE_NAMES[idx]}
                scp ${HACF_LUSTRE} ${NODE_NAMES[idx]}:${HA_DIR}/
                if [ $? -ne 0 ]; then
                        error_output "Failed to scp ha.cf file"\
                                 "to node ${NODE_NAMES[idx]}!"
                        return 1
                fi
        done

        return 0
}

# create_haresources
#
# Create the haresources file and scp it to each node's /etc/ha.d/
create_haresources() {
        HARES_PRIMNODE=${TMP_DIR}$"/haresources."${PRIM_NODENAME}
        HARES_LUSTRE=${TMP_DIR}$"/haresources"${FILE_SUFFIX}
        declare -i idx
        local res_line

        if [ -s ${HARES_PRIMNODE} ]; then
                # The haresources file for the primary node already exists
                if [ -n "`/bin/grep ${TARGET_DEVNAMES[0]} ${HARES_PRIMNODE}`" ]; then
                        verbose_output "${HARES_PRIMNODE} already exists."
                        return 0
                fi
        fi

        # Add the resource group line into the haresources file
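        # The generated line has the form (hypothetical names):
        #   nodeA Filesystem::/dev/sdb::/mnt/ost0::lustre [lustre-OST0000-mon]
        # where the trailing "-mon" resource is only appended for Heartbeat v1.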
        res_line=${PRIM_NODENAME}
        for ((idx = 0; idx < ${#TARGET_DEVNAMES[@]}; idx++)); do
                res_line=${res_line}" "${LUSTRE_SRV}::${TARGET_DEVNAMES[idx]}::${TARGET_MNTPNTS[idx]}::${FS_TYPE}

                if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                        res_line=${res_line}" "${TARGET_SRVNAMES[idx]}"-mon"
                fi
        done
        echo "${res_line}" >> ${HARES_LUSTRE}

        # Generate the cib.xml file
        if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
                # Add group haclient and user hacluster
                [ -z "`grep haclient /etc/group`" ] && groupadd haclient
                [ -z "`grep hacluster /etc/passwd`" ] && useradd -g haclient hacluster

                CIB_LUSTRE=${TMP_DIR}$"/cib.xml"${FILE_SUFFIX}
                python ${CIB_GEN_SCRIPT} --stdout \
                ${HARES_LUSTRE} > ${CIB_LUSTRE}
                if [ $? -ne 0 ]; then
                        error_output "Failed to generate cib.xml file"\
                                 "for node ${PRIM_NODENAME}!"
                        return 1
                fi
        fi

        # scp the haresources file or cib.xml file
        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                /bin/cp -f ${HARES_LUSTRE} ${TMP_DIR}$"/haresources."${NODE_NAMES[idx]}
                scp ${HARES_LUSTRE} ${NODE_NAMES[idx]}:${HA_DIR}/
                if [ $? -ne 0 ]; then
                        error_output "Failed to scp haresources file"\
                                 "to node ${NODE_NAMES[idx]}!"
                        return 1
                fi

                if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
                        scp ${CIB_LUSTRE} ${NODE_NAMES[idx]}:${CIB_DIR}/
                        if [ $? -ne 0 ]; then
                                error_output "Failed to scp cib.xml"\
                                         "file to node ${NODE_NAMES[idx]}!"
                                return 1
                        fi
                fi
        done

        return 0
}

# create_authkeys
#
# Create the authkeys file and scp it to each node's /etc/ha.d/
create_authkeys() {
        AUTHKEYS_PRIMNODE=${TMP_DIR}$"/authkeys."${PRIM_NODENAME}
        declare -i idx

        if [ -e ${AUTHKEYS_PRIMNODE} ]; then
                verbose_output "${AUTHKEYS_PRIMNODE} already exists."
                return 0
        fi

        # scp the authkeys file to all the nodes
        chmod 600 ${AUTHKEYS_TEMP}
        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                touch ${TMP_DIR}$"/authkeys."${NODE_NAMES[idx]}
                scp -p ${AUTHKEYS_TEMP} ${NODE_NAMES[idx]}:${HA_DIR}/
                if [ $? -ne 0 ]; then
                        error_output "Failed to scp authkeys file"\
                                 "to node ${NODE_NAMES[idx]}!"
                        return 1
                fi
        done

        return 0
}

# create_moncf
#
# Create the mon.cf file and scp it to each node's /etc/mon/
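# The hostgroup lines parsed from an existing mon.cf look like this
# (hypothetical names):
#   hostgroup lustre-OST0000-mon nodeA nodeB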
create_moncf() {
        MONCF_PRIMNODE=${TMP_DIR}$"/mon.cf."${PRIM_NODENAME}
        MONCF_LUSTRE=${TMP_DIR}$"/mon.cf"${FILE_SUFFIX}
        local srv_name params=
        declare -i idx
        declare -a OLD_TARGET_SRVNAMES          # targets on other nodes
                                                # in this failover group
        # Initialize the OLD_TARGET_SRVNAMES array
        unset OLD_TARGET_SRVNAMES

        if [ -s ${MONCF_PRIMNODE} ]; then
                if [ -n "`/bin/grep ${TARGET_SRVNAMES[0]} ${MONCF_PRIMNODE}`" ]
                then
                        verbose_output "${MONCF_PRIMNODE} already exists."
                        return 0
                else
                        # Get the Lustre target service names
                        # from the previous mon.cf file
                        idx=0
                        for srv_name in `grep hostgroup ${MONCF_PRIMNODE}\
                                        |awk '$2 ~ /-mon/ {print $2}'|xargs`
                        do
                                OLD_TARGET_SRVNAMES[idx]=`echo ${srv_name}\
                                                          |sed 's/-mon//g'`
                                idx=$(( idx + 1 ))
                        done
                fi
        fi

        # Construct the parameters to the mon.cf generation script
        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                params=${params}" -n "${NODE_NAMES[idx]}
        done

        for ((idx = 0; idx < ${#OLD_TARGET_SRVNAMES[@]}; idx++)); do
                params=${params}" -o "${OLD_TARGET_SRVNAMES[idx]}
        done

        for ((idx = 0; idx < ${#TARGET_SRVNAMES[@]}; idx++)); do
                params=${params}" -o "${TARGET_SRVNAMES[idx]}
        done

        ${SCRIPT_GEN_MONCF} ${params}
        if [ $? -ne 0 ]; then
                error_output "Failed to generate mon.cf file"\
                         "by using ${SCRIPT_GEN_MONCF}!"
                return 1
        fi

        /bin/mv *-mon.cfg ${MONCF_LUSTRE}

        # scp the mon.cf file to all the nodes
        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                /bin/cp -f ${MONCF_LUSTRE} ${TMP_DIR}$"/mon.cf."${NODE_NAMES[idx]}

                scp ${MONCF_LUSTRE} ${NODE_NAMES[idx]}:${MON_DIR}/
                if [ $? -ne 0 ]; then
                        error_output "Failed to scp mon.cf file"\
                                 "to node ${NODE_NAMES[idx]}!"
                        return 1
                fi
        done

        return 0
}

# generate_config
#
# Generate the configuration files for Heartbeat and scp them to all the nodes
generate_config() {
        if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                # Get server names of Lustre targets
                if ! get_srvnames; then
                        return 1
                fi
        fi

        if ! create_template; then
                return 1
        fi

        verbose_output "Creating and remote copying ha.cf${FILE_SUFFIX} file to"\
                       "${PRIM_NODENAME} failover group hosts..."
        if ! create_hacf; then
                return 1
        fi
        verbose_output "OK"

        verbose_output "Creating and remote copying haresources${FILE_SUFFIX} file"\
                       "to ${PRIM_NODENAME} failover group hosts..."
        if ! create_haresources; then
                return 1
        fi
        verbose_output "OK"

        verbose_output "Creating and remote copying authkeys${FILE_SUFFIX} file to" \
                       "${PRIM_NODENAME} failover group hosts..."
        if ! create_authkeys; then
                return 1
        fi
        verbose_output "OK"

        if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                verbose_output "Creating and remote copying mon.cf${FILE_SUFFIX} file to" \
                                "${PRIM_NODENAME} failover group hosts..."
                if ! create_moncf; then
                        return 1
                fi
                verbose_output "OK"
        fi

        return 0
}

# Main flow
# Get all the node names
if ! get_nodenames; then
        exit 1
fi

# Check heartbeat services
verbose_output "Checking heartbeat service on the ${PRIM_NODENAME}"\
               "failover group hosts..."
check_heartbeat
rc=$?
if [ "$rc" -eq "2" ]; then
        verbose_output "OK"
        exit 0
elif [ "$rc" -eq "1" ]; then
        exit 1
fi
verbose_output "OK"

# Generate configuration files
if ! generate_config; then
        exit 1
fi

exit 0