#!/bin/bash
#
# lc_hb - script for generating the Heartbeat HA software's
#         configuration files
#
###############################################################################

# Usage
usage() {
        cat >&2 <<EOF

Usage:  `basename $0`   <-r HBver> <-n hostnames> [-v]
                        <-d target device> [-d target device...]

        -r HBver                the version of the Heartbeat software
                                The currently supported Heartbeat versions
                                are: hbv1 (Heartbeat version 1) and hbv2
                                (Heartbeat version 2).
        -n hostnames            the nodenames of the primary node and its
                                failovers
                                Multiple nodenames are separated by a colon
                                (:) delimiter. The first one is the nodename
                                of the primary node; the others are failover
                                nodenames.
        -v                      verbose mode
        -d target device        the target device name and mount point
                                The device name and mount point are separated
                                by a colon (:) delimiter.

EOF
        exit 1
}
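
# Example invocation (for illustration only; the node names, device and
# mount point below are placeholders, not defaults of this script):
#
#   lc_hb -r hbv2 -n oss01:oss02 -d /dev/sdb1:/mnt/ost0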

# Get the library of functions
. @scriptlibdir@/lc_common

#****************************** Global variables ******************************#
# Heartbeat tools
HB_TOOLS_PATH=${HB_TOOLS_PATH:-"/usr/lib64/heartbeat"}  # Heartbeat tools path
CIB_GEN_SCRIPT=${HB_TOOLS_PATH}/haresources2cib.py
CL_STATUS=${CL_STATUS:-"/usr/bin/cl_status"}

# Service directories and names
HARES_DIR=${HARES_DIR:-"${HA_DIR}/resource.d"}          # Heartbeat resources
LUSTRE_SRV=${LUSTRE_SRV:-"Filesystem"}  # Service script provided by Heartbeat

TMP_DIR=${HB_TMP_DIR}                   # Temporary directory
HACF_TEMP=${TMP_DIR}/ha.cf.temp
AUTHKEYS_TEMP=${TMP_DIR}/authkeys${FILE_SUFFIX}

declare -a NODE_NAMES                   # Node names in the failover group

# Lustre target device names, service names and mount points
declare -a TARGET_DEVNAMES TARGET_SRVNAMES TARGET_MNTPNTS
declare -i TARGET_NUM=0                 # Number of targets

# Get and check the positional parameters
VERBOSE_OUTPUT=false
while getopts "r:n:vd:" OPTION; do
        case $OPTION in
        r)
                HBVER_OPT=$OPTARG
                if [ "${HBVER_OPT}" != "${HBVER_HBV1}" ] \
                && [ "${HBVER_OPT}" != "${HBVER_HBV2}" ]; then
                        echo >&2 $"`basename $0`: Invalid Heartbeat software" \
                                  "version - ${HBVER_OPT}!"
                        usage
                fi
                ;;
        n)
                HOSTNAME_OPT=$OPTARG
                PRIM_NODENAME=`echo ${HOSTNAME_OPT} | awk -F":" '{print $1}'`
                if [ -z "${PRIM_NODENAME}" ]; then
                        echo >&2 $"`basename $0`: Missing primary nodename!"
                        usage
                fi
                HOSTNAME_NUM=`echo ${HOSTNAME_OPT} | awk -F":" '{print NF}'`
                if [ ${HOSTNAME_NUM} -lt 2 ]; then
                        echo >&2 $"`basename $0`: Missing failover nodenames!"
                        usage
                fi
                if [ "${HBVER_OPT}" = "${HBVER_HBV1}" -a ${HOSTNAME_NUM} -gt 2 ]
                then
                        echo >&2 $"`basename $0`: Heartbeat version 1 can" \
                                  "only support 2 nodes!"
                        usage
                fi
                ;;
        v)
                VERBOSE_OUTPUT=true
                ;;
        d)
                DEVICE_OPT=$OPTARG
                TARGET_DEVNAMES[TARGET_NUM]=`echo ${DEVICE_OPT} | awk -F: '{print $1}'`
                TARGET_MNTPNTS[TARGET_NUM]=`echo ${DEVICE_OPT} | awk -F: '{print $2}'`
                if [ -z "${TARGET_DEVNAMES[TARGET_NUM]}" ]; then
                        echo >&2 $"`basename $0`: Missing target device name!"
                        usage
                fi
                if [ -z "${TARGET_MNTPNTS[TARGET_NUM]}" ]; then
                        echo >&2 $"`basename $0`: Missing mount point for target"\
                                  "${TARGET_DEVNAMES[TARGET_NUM]}!"
                        usage
                fi
                TARGET_NUM=$(( TARGET_NUM + 1 ))
                ;;
        ?)
                usage
        esac
done
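
# After option parsing: HBVER_OPT holds the Heartbeat version, PRIM_NODENAME
# the primary node, and the TARGET_DEVNAMES/TARGET_MNTPNTS arrays one entry
# per -d option.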

# Check the required parameters
if [ -z "${HBVER_OPT}" ]; then
        echo >&2 $"`basename $0`: Missing -r option!"
        usage
fi

if [ -z "${HOSTNAME_OPT}" ]; then
        echo >&2 $"`basename $0`: Missing -n option!"
        usage
fi

if [ -z "${DEVICE_OPT}" ]; then
        echo >&2 $"`basename $0`: Missing -d option!"
        usage
fi

# get_nodenames
#
# Get all the node names in this failover group
get_nodenames() {
        declare -i idx
        local nodename_str nodename

        # Split the colon-separated nodename list, preserving its order
        nodename_str=`echo ${HOSTNAME_OPT} | awk -F":" \
                      '{for (i = 1; i <= NF; i++) print $i}'`
        idx=0
        for nodename in ${nodename_str}
        do
                NODE_NAMES[idx]=${nodename}
                idx=$(( idx + 1 ))
        done

        return 0
}

# check_remote_file host_name file
#
# Run remote command to check whether @file exists on @host_name
check_remote_file() {
        local host_name=$1
        local file_name=$2

        if [ -z "${host_name}" ]; then
                echo >&2 "`basename $0`: check_remote_file() error:"\
                         "Missing hostname!"
                return 1
        fi

        if [ -z "${file_name}" ]; then
                echo >&2 "`basename $0`: check_remote_file() error:"\
                         "Missing file name!"
                return 1
        fi

        # Execute remote command to check the file
        ${REMOTE} ${host_name} "[ -e ${file_name} ]"
        if [ $? -ne 0 ]; then
                echo >&2 "`basename $0`: check_remote_file() error:"\
                "${file_name} does not exist on host ${host_name}!"
                return 1
        fi

        return 0
}

# hb_running host_name
#
# Run remote command to check whether the heartbeat service is running
# on @host_name
hb_running() {
        local host_name=$1
        local ret_str

        ret_str=`${REMOTE} ${host_name} "${CL_STATUS} hbstatus" 2>&1`
        if [ $? -ne 0 ]; then
                # ${ret_str#*stop*} is unchanged only if "stop" does not
                # appear in the output, i.e. the remote command itself failed
                if [ "${ret_str}" = "${ret_str#*stop*}" ]; then
                        echo >&2 "`basename $0`: hb_running() error:"\
                        "remote command to ${host_name} error: ${ret_str}!"
                        return 2
                else
                        return 1
                fi
        fi

        return 0
}
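
# hb_running() return codes, as consumed by check_heartbeat() below:
#   0 - heartbeat is running on the node
#   1 - heartbeat is stopped (cl_status reported "stop")
#   2 - the remote command itself failed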

# stop_heartbeat host_name
#
# Run remote command to stop the heartbeat service running on @host_name
stop_heartbeat() {
        local host_name=$1
        local ret_str

        ret_str=`${REMOTE} ${host_name} "/sbin/service heartbeat stop" 2>&1`
        if [ $? -ne 0 ]; then
                echo >&2 "`basename $0`: stop_heartbeat() error:"\
                "remote command to ${host_name} error: ${ret_str}!"
                return 1
        fi

        echo "`basename $0`: Heartbeat service is stopped on node ${host_name}."
        return 0
}

# check_heartbeat
#
# Run remote command to check each node's heartbeat service
check_heartbeat() {
        declare -i idx
        local OK

        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                # Check the Heartbeat configuration directory
                if ! check_remote_file ${NODE_NAMES[idx]} ${HA_DIR}; then
                        echo >&2 "`basename $0`: check_heartbeat() error:"\
                        "Is the Heartbeat package installed?"
                        return 1
                fi

                if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                        # Check the mon configuration directory
                        if ! check_remote_file ${NODE_NAMES[idx]} ${MON_DIR}; then
                                echo >&2 "`basename $0`: check_heartbeat()"\
                                "error: Is the mon package installed?"
                                return 1
                        fi
                fi

                if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
                        # Check the crm directory
                        if ! check_remote_file ${NODE_NAMES[idx]} ${CIB_DIR}; then
                                echo >&2 "`basename $0`: check_heartbeat()"\
                                "error: Is the Heartbeat v2 package installed?"
                                return 1
                        fi
                fi

                # Check the heartbeat service status
                hb_running ${NODE_NAMES[idx]}
                rc=$?
                if [ "$rc" -eq "2" ]; then
                        return 1
                elif [ "$rc" -eq "1" ]; then
                        verbose_output "Heartbeat service is stopped on"\
                        "node ${NODE_NAMES[idx]}."
                elif [ "$rc" -eq "0" ]; then
                        OK=
                        echo -n "`basename $0`: Heartbeat service is running on"\
                        "${NODE_NAMES[idx]}. Stop the service and generate"\
                        "new configurations? [y/n]: "
                        read OK
                        if [ "${OK}" = "n" ]; then
                                echo "`basename $0`: New Heartbeat configurations"\
                                "are not generated."
                                return 2
                        fi

                        # Stop the heartbeat service
                        stop_heartbeat ${NODE_NAMES[idx]}
                fi
        done

        return 0
}

# get_srvname hostname target_devname
#
# Get the Lustre target server name from the node @hostname
get_srvname() {
        local host_name=$1
        local target_devname=$2
        local target_srvname=
        local ret_str

        # Execute remote command to get the target server name
        ret_str=`${REMOTE} ${host_name} \
                "${TUNEFS} --print --verbose ${target_devname} | grep Target:" 2>&1`
        if [ $? -ne 0 ]; then
                echo "`basename $0`: get_srvname() error:" \
                     "from host ${host_name} - ${ret_str}"
                return 1
        fi

        if [ "${ret_str}" != "${ret_str#*Target: }" ]; then
                ret_str=${ret_str#*Target: }
                target_srvname=`echo ${ret_str} | awk '{print $1}'`
        fi

        if [ -z "${target_srvname}" ]; then
                echo "`basename $0`: get_srvname() error: Cannot get the"\
                     "server name of target ${target_devname} on ${host_name}!"
                return 1
        fi

        echo ${target_srvname}
        return 0
}

# get_srvnames
#
# Get the server names of all the Lustre targets in this failover group
get_srvnames() {
        declare -i i

        # Initialize the TARGET_SRVNAMES array
        unset TARGET_SRVNAMES

        # Get the Lustre target service names
        for ((i = 0; i < ${#TARGET_DEVNAMES[@]}; i++)); do
                TARGET_SRVNAMES[i]=$(get_srvname ${PRIM_NODENAME} \
                                     ${TARGET_DEVNAMES[i]})
                if [ $? -ne 0 ]; then
                        echo >&2 "${TARGET_SRVNAMES[i]}"
                        return 1
                fi
        done

        return 0
}
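
# Both functions above rely on "${TUNEFS} --print" emitting a line of the
# form "Target: <service name>", e.g. "Target: lustre-OST0000" (the label
# shown here is only illustrative).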

# create_template
#
# Create the templates for the ha.cf and authkeys files
create_template() {
        /bin/mkdir -p ${TMP_DIR}

        # Create the template for ha.cf
        if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                cat >${HACF_TEMP} <<EOF
debugfile /var/log/ha-debug
logfile /var/log/ha-log
logfacility     local0
keepalive 2
deadtime 30
initdead 120

auto_failback off

EOF
        elif [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
                cat >${HACF_TEMP} <<EOF
use_logd        yes
keepalive 1
deadtime 10
initdead 60

crm yes

EOF
        fi

        # Create the template for authkeys
        if [ ! -s ${AUTHKEYS_TEMP} ]; then
                cat >${AUTHKEYS_TEMP} <<EOF
auth 1
1 sha1 HelloLustre!
EOF
        fi

        return 0
}
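
# In the authkeys template, "auth 1" selects the key used to sign Heartbeat
# packets and "1 sha1 HelloLustre!" defines key 1. The shared secret only
# has to match across the failover group, so sites may replace the default
# above with their own.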

# create_hacf
#
# Create the ha.cf file and scp it to each node's /etc/ha.d/
create_hacf() {
        HACF_PRIMNODE=${TMP_DIR}$"/ha.cf."${PRIM_NODENAME}
        HACF_LUSTRE=${TMP_DIR}$"/ha.cf"${FILE_SUFFIX}

        declare -i idx

        if [ -e ${HACF_PRIMNODE} ]; then
                # The ha.cf file for the primary node already exists
                verbose_output "${HACF_PRIMNODE} already exists."
                return 0
        fi

        /bin/cp -f ${HACF_TEMP} ${HACF_LUSTRE}

        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                echo "node    ${NODE_NAMES[idx]}" >> ${HACF_LUSTRE}
        done

        # scp the ha.cf file to all the nodes
        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                touch ${TMP_DIR}$"/ha.cf."${NODE_NAMES[idx]}
                scp ${HACF_LUSTRE} ${NODE_NAMES[idx]}:${HA_DIR}/
                if [ $? -ne 0 ]; then
                        echo >&2 "`basename $0`: Failed to scp ha.cf file"\
                                 "to node ${NODE_NAMES[idx]}!"
                        return 1
                fi
        done

        return 0
}

# create_haresources
#
# Create the haresources file and scp it to each node's /etc/ha.d/
create_haresources() {
        HARES_PRIMNODE=${TMP_DIR}$"/haresources."${PRIM_NODENAME}
        HARES_LUSTRE=${TMP_DIR}$"/haresources"${FILE_SUFFIX}
        declare -i idx
        local res_line

        if [ -s ${HARES_PRIMNODE} ]; then
                # The haresources file for the primary node already exists
                if [ -n "`/bin/grep ${TARGET_DEVNAMES[0]} ${HARES_PRIMNODE}`" ]; then
                        verbose_output "${HARES_PRIMNODE} already exists."
                        return 0
                fi
        fi

        # Add the resource group line into the haresources file
        res_line=${PRIM_NODENAME}
        for ((idx = 0; idx < ${#TARGET_DEVNAMES[@]}; idx++)); do
                res_line=${res_line}" "${LUSTRE_SRV}::${TARGET_DEVNAMES[idx]}::${TARGET_MNTPNTS[idx]}::${FS_TYPE}

                if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                        res_line=${res_line}" "${TARGET_SRVNAMES[idx]}"-mon"
                fi
        done
        echo "${res_line}" >> ${HARES_LUSTRE}
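
        # The line just written has the form (names are illustrative):
        #   <primary node> Filesystem::<device>::<mount point>::${FS_TYPE}
        # with an extra "<server name>-mon" resource per target on
        # Heartbeat v1.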

        # Generate the cib.xml file
        if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
                # Add group haclient and user hacluster
                [ -z "`grep haclient /etc/group`" ] && groupadd haclient
                [ -z "`grep hacluster /etc/passwd`" ] && useradd -g haclient hacluster

                CIB_LUSTRE=${TMP_DIR}$"/cib.xml"${FILE_SUFFIX}
                python ${CIB_GEN_SCRIPT} --stdout \
                ${HARES_LUSTRE} > ${CIB_LUSTRE}
                if [ $? -ne 0 ]; then
                        echo >&2 "`basename $0`: Failed to generate cib.xml file"\
                                 "for node ${PRIM_NODENAME}!"
                        return 1
                fi
        fi

        # scp the haresources file or cib.xml file
        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                /bin/cp -f ${HARES_LUSTRE} ${TMP_DIR}$"/haresources."${NODE_NAMES[idx]}
                scp ${HARES_LUSTRE} ${NODE_NAMES[idx]}:${HA_DIR}/
                if [ $? -ne 0 ]; then
                        echo >&2 "`basename $0`: Failed to scp haresources file"\
                                 "to node ${NODE_NAMES[idx]}!"
                        return 1
                fi

                if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
                        scp ${CIB_LUSTRE} ${NODE_NAMES[idx]}:${CIB_DIR}/
                        if [ $? -ne 0 ]; then
                                echo >&2 "`basename $0`: Failed to scp cib.xml"\
                                         "file to node ${NODE_NAMES[idx]}!"
                                return 1
                        fi
                fi
        done

        return 0
}

# create_authkeys
#
# Create the authkeys file and scp it to each node's /etc/ha.d/
create_authkeys() {
        AUTHKEYS_PRIMNODE=${TMP_DIR}$"/authkeys."${PRIM_NODENAME}
        declare -i idx

        if [ -e ${AUTHKEYS_PRIMNODE} ]; then
                verbose_output "${AUTHKEYS_PRIMNODE} already exists."
                return 0
        fi

        # scp the authkeys file to all the nodes
        chmod 600 ${AUTHKEYS_TEMP}
        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                touch ${TMP_DIR}$"/authkeys."${NODE_NAMES[idx]}
                scp -p ${AUTHKEYS_TEMP} ${NODE_NAMES[idx]}:${HA_DIR}/
                if [ $? -ne 0 ]; then
                        echo >&2 "`basename $0`: Failed to scp authkeys file"\
                                 "to node ${NODE_NAMES[idx]}!"
                        return 1
                fi
        done

        return 0
}
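
# Heartbeat refuses to start if authkeys is readable by anyone but root,
# hence the chmod 600 above and "scp -p" to preserve the mode on the
# remote nodes.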

# create_moncf
#
# Create the mon.cf file and scp it to each node's /etc/mon/
create_moncf() {
        MONCF_PRIMNODE=${TMP_DIR}$"/mon.cf."${PRIM_NODENAME}
        MONCF_LUSTRE=${TMP_DIR}$"/mon.cf"${FILE_SUFFIX}
        local srv_name params=
        declare -i idx
        declare -a OLD_TARGET_SRVNAMES          # targets on other nodes
                                                # in this failover group

        # Initialize the OLD_TARGET_SRVNAMES array
        unset OLD_TARGET_SRVNAMES

        if [ -s ${MONCF_PRIMNODE} ]; then
                if [ -n "`/bin/grep ${TARGET_SRVNAMES[0]} ${MONCF_PRIMNODE}`" ]
                then
                        verbose_output "${MONCF_PRIMNODE} already exists."
                        return 0
                else
                        # Get the Lustre target service names
                        # from the previous mon.cf file
                        idx=0
                        for srv_name in `grep hostgroup ${MONCF_PRIMNODE}\
                                        |awk '$2 ~ /-mon/ {print $2}'|xargs`
                        do
                                OLD_TARGET_SRVNAMES[idx]=`echo ${srv_name}\
                                                          |sed 's/-mon//g'`
                                idx=$(( idx + 1 ))
                        done
                fi
        fi

        # Construct the parameters for the mon.cf generation script
        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                params=${params}" -n "${NODE_NAMES[idx]}
        done

        for ((idx = 0; idx < ${#OLD_TARGET_SRVNAMES[@]}; idx++)); do
                params=${params}" -o "${OLD_TARGET_SRVNAMES[idx]}
        done

        for ((idx = 0; idx < ${#TARGET_SRVNAMES[@]}; idx++)); do
                params=${params}" -o "${TARGET_SRVNAMES[idx]}
        done

        ${SCRIPT_GEN_MONCF} ${params}
        if [ $? -ne 0 ]; then
                echo >&2 "`basename $0`: Failed to generate mon.cf file"\
                         "by using ${SCRIPT_GEN_MONCF}!"
                return 1
        fi

        /bin/mv *-mon.cfg ${MONCF_LUSTRE}

        # scp the mon.cf file to all the nodes
        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                /bin/cp -f ${MONCF_LUSTRE} ${TMP_DIR}$"/mon.cf."${NODE_NAMES[idx]}

                scp ${MONCF_LUSTRE} ${NODE_NAMES[idx]}:${MON_DIR}/
                if [ $? -ne 0 ]; then
                        echo >&2 "`basename $0`: Failed to scp mon.cf file"\
                                 "to node ${NODE_NAMES[idx]}!"
                        return 1
                fi
        done

        return 0
}

# generate_config
#
# Generate the configuration files for Heartbeat and scp them to all the nodes
generate_config() {
        if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                # Get the server names of the Lustre targets
                if ! get_srvnames; then
                        return 1
                fi
        fi

        if ! create_template; then
                return 1
        fi

        verbose_output "Creating and remote copying ha.cf${FILE_SUFFIX} file to"\
                       "${PRIM_NODENAME} failover group hosts..."
        if ! create_hacf; then
                return 1
        fi
        verbose_output "OK"

        verbose_output "Creating and remote copying haresources${FILE_SUFFIX} file"\
                       "to ${PRIM_NODENAME} failover group hosts..."
        if ! create_haresources; then
                return 1
        fi
        verbose_output "OK"

        verbose_output "Creating and remote copying authkeys${FILE_SUFFIX} file to" \
                       "${PRIM_NODENAME} failover group hosts..."
        if ! create_authkeys; then
                return 1
        fi
        verbose_output "OK"

        if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                verbose_output "Creating and remote copying mon.cf${FILE_SUFFIX} file to" \
                                "${PRIM_NODENAME} failover group hosts..."
                if ! create_moncf; then
                        return 1
                fi
                verbose_output "OK"
        fi

        return 0
}

# Main flow
# Get all the node names
if ! get_nodenames; then
        exit 1
fi

# Check the heartbeat services
verbose_output "Checking the heartbeat service on the ${PRIM_NODENAME}"\
               "failover group hosts..."
check_heartbeat
rc=$?
if [ "$rc" -eq "2" ]; then
        verbose_output "OK"
        exit 0
elif [ "$rc" -eq "1" ]; then
        exit 1
fi
verbose_output "OK"

# Generate the configuration files
if ! generate_config; then
        exit 1
fi

exit 0