Whamcloud - gitweb
LU-107 Add scripts for implementing heartbeat v1 failover
authorNed Bass <bass6@llnl.gov>
Thu, 13 Oct 2011 17:56:03 +0000 (10:56 -0700)
committerOleg Drokin <green@whamcloud.com>
Wed, 11 Jul 2012 23:38:55 +0000 (19:38 -0400)
/usr/sbin/ldev - list devices, determine validity, etc.
/usr/sbin/lhbadm - wrapper for heartbeat utils for failover/failback/status
/etc/ha.d/resource.d/Lustre - heartbeat resource agent (wraps init script)
/etc/init.d/lustre - lustre init script
/etc/init.d/lnet - lnet init script
/usr/sbin/haconfig - helper script for building heartbeat config files

The scripts use two configuration files:
 /etc/ldev.conf - maps hostnames to failover partners, devices, and labels
 /etc/nids - hostnames to lustre NIDS

In addition to heartbeat support, the ldev script enables parallel
execution of commands against all luns configured on a server.  The
lustre init script supports devices backed by Linux software RAID, ZFS,
or traditional block devices.

NOTE: these scripts presume the udev rules for persistent block device
naming are in place, in particular that lustre labels can be mapped to
block devices in /dev/disk/by-id.

Change-Id: I8391744ce6eed989c061f131aca4a2da7b5d51b2
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-on: http://review.whamcloud.com/290
Reviewed-by: Doug Oucharek <doug@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Tested-by: Hudson
Reviewed-by: Oleg Drokin <green@whamcloud.com>
19 files changed:
lustre.spec.in
lustre/autoconf/lustre-core.m4
lustre/conf/Makefile.am
lustre/conf/ldev.conf [new file with mode: 0644]
lustre/conf/lustre [new file with mode: 0644]
lustre/doc/Makefile.am
lustre/doc/ldev.8 [new file with mode: 0644]
lustre/doc/ldev.conf.5 [new file with mode: 0644]
lustre/doc/lhbadm.8 [new file with mode: 0644]
lustre/doc/nids.5 [new file with mode: 0644]
lustre/scripts/.gitignore
lustre/scripts/Lustre [new file with mode: 0644]
lustre/scripts/Makefile.am
lustre/scripts/haconfig [new file with mode: 0644]
lustre/scripts/ldev [new file with mode: 0644]
lustre/scripts/lhbadm [new file with mode: 0644]
lustre/scripts/lnet [new file with mode: 0644]
lustre/scripts/lustre [deleted file]
lustre/scripts/lustre.in [new file with mode: 0644]

index cd914af..a2e52fd 100644 (file)
@@ -233,6 +233,13 @@ cat >lustre.files <<EOF
 %attr(-, root, root) %{_libexecdir}/lustre/lc_common
 
 %attr(-, root, root) %{_sysconfdir}/udev/rules.d/99-lustre.rules
 %attr(-, root, root) %{_libexecdir}/lustre/lc_common
 
 %attr(-, root, root) %{_sysconfdir}/udev/rules.d/99-lustre.rules
+
+%attr(-, root, root) %{_sysconfdir}/init.d/lnet
+%attr(-, root, root) %{_sysconfdir}/init.d/lustre
+%attr(-, root, root) %{_sysconfdir}/ldev.conf
+%attr(-, root, root) %{_sysconfdir}/sysconfig/lustre
+%attr(-, root, root) %{_libexecdir}/lustre/haconfig
+%attr(-, root, root) %{_sysconfdir}/ha.d/resource.d/Lustre
 EOF
 
 if [ -f $RPM_BUILD_ROOT%{_libdir}/libcfsutil.a ] ; then
 EOF
 
 if [ -f $RPM_BUILD_ROOT%{_libdir}/libcfsutil.a ] ; then
index f6d3433..daed428 100644 (file)
@@ -2707,6 +2707,7 @@ lustre/ptlrpc/gss/autoMakefile
 lustre/quota/Makefile
 lustre/quota/autoMakefile
 lustre/scripts/Makefile
 lustre/quota/Makefile
 lustre/quota/autoMakefile
 lustre/scripts/Makefile
+lustre/scripts/lustre
 lustre/tests/Makefile
 lustre/tests/mpi/Makefile
 lustre/utils/Makefile
 lustre/tests/Makefile
 lustre/tests/mpi/Makefile
 lustre/utils/Makefile
index 6d6b7fe..2f25ba2 100644 (file)
@@ -35,7 +35,7 @@
 #
 
 EXTRA_DIST = lustre.dtd slapd-lustre.conf lustre2ldif.xsl top.ldif \
 #
 
 EXTRA_DIST = lustre.dtd slapd-lustre.conf lustre2ldif.xsl top.ldif \
-             99-lustre.rules
+             99-lustre.rules lustre ldev.conf
 ldapconfdir = $(sysconfdir)/openldap
 
 if UTILS
 ldapconfdir = $(sysconfdir)/openldap
 
 if UTILS
@@ -45,3 +45,8 @@ endif
 
 udevrulesdir = $(sysconfdir)/udev/rules.d
 udevrules_DATA = 99-lustre.rules
 
 udevrulesdir = $(sysconfdir)/udev/rules.d
 udevrules_DATA = 99-lustre.rules
+
+sysconfigdir = $(sysconfdir)/sysconfig
+sysconfig_DATA = lustre
+
+sysconf_DATA = ldev.conf
diff --git a/lustre/conf/ldev.conf b/lustre/conf/ldev.conf
new file mode 100644 (file)
index 0000000..a4efb0c
--- /dev/null
@@ -0,0 +1,14 @@
+# example /etc/ldev.conf
+#
+#local  foreign/-  label       [md:|zfs:]device-path  [journal-path]/- [raidtab]
+#
+#zeno-mds1 - zeno-MDT0000 zfs:lustre-zeno-mds1/mdt1
+#
+#zeno1 zeno5 zeno-OST0000 zfs:lustre-zeno1/ost1
+#zeno2 zeno6 zeno-OST0001 zfs:lustre-zeno2/ost1
+#zeno3 zeno7 zeno-OST0002 zfs:lustre-zeno3/ost1
+#zeno4 zeno8 zeno-OST0003 zfs:lustre-zeno4/ost1
+#zeno5 zeno1 zeno-OST0004 zfs:lustre-zeno5/ost1
+#zeno6 zeno2 zeno-OST0005 zfs:lustre-zeno6/ost1
+#zeno7 zeno3 zeno-OST0006 zfs:lustre-zeno7/ost1
+#zeno8 zeno4 zeno-OST0007 zfs:lustre-zeno8/ost1
diff --git a/lustre/conf/lustre b/lustre/conf/lustre
new file mode 100644 (file)
index 0000000..3f894c3
--- /dev/null
@@ -0,0 +1,70 @@
+# Configuration options for /etc/init.d/lustre
+
+# The command in PREEXEC_SCRIPT is run before starting services.  Its first
+# parameter is mode of the init script (start|restart|condrestart).  If the
+# command has a non-zero return code, then the init script will abort without
+# taking any action.
+#PREEXEC_SCRIPT="/usr/bin/somescript"
+
+# The command in PREEXEC_CHECK is run before starting services.  It is not
+# passed any arguments.  If the command has a non-zero return code, then the
+# init script will abort without taking any action.
+#PREEXEC_CHECK="command"
+
+# The commands in POSTEXEC_SCRIPT and/or POSTEXEC_CHECK are run after starting
+# services.  If the command has a non-zero return code, then the init script
+# will terminate with an exit status of 1.
+#POSTEXEC_SCRIPT="/usr/bin/somescript"
+#POSTEXEC_CHECK="command"
+
+# If SCSI_DEVICE_TIMEOUT is set, its value is echoed into
+#  /sys/block/sdXX/device/timeout
+# before checking file systems or starting Lustre
+#SCSI_DEVICE_TIMEOUT=60
+
+# LOCAL_SRV or FOREIGN_SRV can be set to a space-delimited list of
+# labels that will be mounted as local and foreign (failover) lustre services.
+# If unset or null, /etc/ldev.conf establishes the labels for these services.
+#LOCAL_SRV="`shopt -s nullglob && cd /dev/disk/by-label 2>/dev/null && echo *-OST* *-MDT* *MGS* *MGT*`"
+
+# Before mounting any lustre backend devices, the init script will
+# run pfsck.ldiskfs only if the following FSCK_ARGS variable is a
+# non-empty string.  There are no default options for this fsck.
+# The command takes the form:
+#
+#     /sbin/pfsck.ldiskfs $devices -- ${FSCK_ARGS}
+#
+#FSCK_ARGS="-p"
+
+# Uncomment to insert server mount options - see mount.lustre(8)
+#MOUNT_OPTIONS="-o abort_recov"
+
+# Stagger mounts by MOUNT_DELAY seconds to avoid possible module loading races
+# due to multiple mount commands running in parallel.  This obviously does not
+# eliminate the race but provides a safety buffer.  The default is 2 seconds.
+# Set to 0 or empty string to disable staggering of mounts.
+#MOUNT_DELAY=0
+
+# Uncomment to disable the check for the mmp ldiskfs feature (only
+# applies if foreign devices are configured).
+# REQUIRE_MMP_FEATURE=no
+
+# Override default mount points for lustre services
+#LOCAL_MOUNT_DIR=/mnt/lustre/local
+#FOREIGN_MOUNT_DIR=/mnt/lustre/foreign
+
+# Uncomment to cause the lustre init scripts to explicitly modprobe the zfs
+# module when starting services.  The zfs module is normally loaded
+# automatically by the zfs command line utilities, for example when the zpool
+# is imported.
+#LOAD_ZFS="yes"
+
+# Uncomment to pass additional arguments to 'zpool import'.  For example,
+# the -m option can be used to allow the pool to be imported even if it is
+# missing a non-critical log device.
+#ZPOOL_IMPORT_ARGS="-m"
+
+# Uncomment to force ZFS to import the pool using the device names in the
+# given directory.  By default, the /dev/disk/by-vdev/ device names will be
+# used if they are configured followed by the /dev/mapper device names.
+#ZPOOL_IMPORT_DIR="/dev/disk/by-id"
index 15af7da..26a1753 100644 (file)
@@ -53,7 +53,7 @@ MANFILES = lustre.7 lfs.1 mount.lustre.8 mkfs.lustre.8 tunefs.lustre.8 lctl.8 \
        plot-llstat.8 l_getgroups.8 lst.8 routerstat.8 lshowmount.8 \
        ll_recover_lost_found_objs.8 llog_reader.8 llapi_file_open.3 \
        llapi_file_create.3 llapi_file_get_stripe.3 liblustreapi.7 \
        plot-llstat.8 l_getgroups.8 lst.8 routerstat.8 lshowmount.8 \
        ll_recover_lost_found_objs.8 llog_reader.8 llapi_file_open.3 \
        llapi_file_create.3 llapi_file_get_stripe.3 liblustreapi.7 \
-       lustre_rsync.8 lfs_migrate.1
+       lustre_rsync.8 lfs_migrate.1 lhbadm.8 ldev.8 ldev.conf.5 nids.5
 
 if UTILS
 man_MANS = $(MANFILES)
 
 if UTILS
 man_MANS = $(MANFILES)
diff --git a/lustre/doc/ldev.8 b/lustre/doc/ldev.8
new file mode 100644 (file)
index 0000000..438ffd3
--- /dev/null
@@ -0,0 +1,112 @@
+.TH ldev 8 Lustre ldev ldev
+.SH NAME
+ldev \- lustre device utility
+.SH SYNOPSIS
+.B "ldev [OPTIONS]"
+.br
+.SH DESCRIPTION
+.B ldev
+can be used to query information about lustre devices configured in
+/etc/ldev.conf.  It is used by the lustre init script.
+.SH OPTIONS
+.B ldev
+accepts the following options:
+.TP
+.I "-h, --help"
+Display help message.
+.TP
+.I "-c, --config FILE"
+Set path to config file.
+.TP
+.I "-H, --hostname NAME"
+Use NAME instead of local hostname for queries.
+.TP
+.I "-p, --partner"
+Print hostname of failover partner.
+.TP
+.I "-l, --local"
+Print labels for local devices.
+.TP
+.I "-f, --foreign"
+Print labels for foreign devices.
+.TP
+.I "-a, --all"
+Print labels for local and foreign devices.
+.TP
+.I "-s, --sanity"
+Sanity check config on this node.
+If any expected local or foreign devices are not present, print an error.
+If devices do not contain the expected labels, print an error.
+.TP
+.I "-d, --device LABEL"
+Print storage device of label.
+.TP
+.I "-j, --journal LABEL"
+Print journal device corresponding to label if defined.
+.TP
+.I "-r, --raidtab LABEL"
+Print Linux software raid configuration file or ZFS cache file associated with
+LABEL, if any.  Using non-default names for these files may help prevent arrays
+from being automatically started by the system.  This is important in failover
+configurations where the timing of device initialization must be strictly
+controlled.
+.TP
+.I "-t, --type LABEL"
+Print device type of LABEL, i.e. "zfs" or "md".
+.TP
+.I "-z, --zpool LABEL"
+Print zpool containing LABEL.
+.TP
+.I "CMD [ARGS...]"
+Run one instance of \fICMD [ARGS]\fR for each label in parallel.
+Only the local labels are used by default, but foreign or all labels
+may be selected by adding the \fI--foreign\fR or \fI--all\fR options.
+The following substitutions are made:
+%f=fsname, %d=device, %j=journal, %i=index, %I=hex-index, %t=type, %l=label,
+%n=nid, %N=failnid.  On failure of any child processes, \fBldev\fR will
+return a non-zero exit code.
+.LP
+It is an error if %n or %N is used in a command and /etc/nids does not
+contain appropriate host to NID mappings.
+.SH EXAMPLES
+To run a preen check on all devices in a cluster in parallel:
+.IP
+.nf
+pdsh -S -g ost ldev fsck.ldiskfs -p %d
+.fi
+.LP
+To re-format an entire file system:
+.IP
+.nf
+#!/bin/bash -xe
+export FANOUT=64
+
+# MDTs
+pdsh -S -g mds service lustre stop
+pdsh -S -g mds ldev "yes \\| mkfs.ldiskfs -q -b4096 \\
+         -Ojournal_dev %j"
+pdsh -S -g mds ldev dd if=/dev/zero of=%d count=8
+pdsh -S -g mds ldev mkfs.lustre --mdt --mgs --fsname=%f \\
+         --index=%i --mkfsoptions=-Jdevice=%j \\
+         --mkfsoptions=-i2048 \\
+         --mountfsoptions=errors=panic,iopen_nopriv,user_xattr,\\
+                          maxdirsize=20000000 %d
+pdsh -S -g mds ldev tune.ldiskfs -i0 -m0 -c0 %d
+
+# OSTs
+mgs=172.16.2.200@tcp0
+pdsh -S -g ost service heartbeat stop
+pdsh -S -g ost service lustre stop
+pdsh -S -g ost ldev dd if=/dev/zero of=%d count=8
+pdsh -S -g ost ldev mkfs.lustre --ost --mgsnode=$mgs --fsname=%f \\
+         --index=%i --param=lov.stripecount=2 --failnode=%N \\
+         --mountfsoptions=errors=panic,extents,mballoc %d
+pdsh -S -g ost ldev tune.ldiskfs -epanic -i0 -m0 -c0 %d
+.fi
+.SH FILES
+/etc/ldev.conf
+.br
+/etc/nids
+.SH "SEE ALSO"
+.BR ldev.conf (5)
+.BR nids (5)
diff --git a/lustre/doc/ldev.conf.5 b/lustre/doc/ldev.conf.5
new file mode 100644 (file)
index 0000000..f108785
--- /dev/null
@@ -0,0 +1,67 @@
+.TH ldev.conf 5 Lustre ldev.conf /etc/ldev.conf
+.SH NAME
+/etc/ldev.conf \- lustre device configuration file
+.SH DESCRIPTION
+The ldev.conf file contains a list of Lustre devices used by the
+\fBldev\fR utility.
+.SH FORMAT
+Comments beginning with a hash (#) are ignored.  Each line represents one
+device and includes the following information separated by white space:
+.TP
+.I "local hostname"
+The name of the host where the device normally runs.
+.TP
+.I "foreign hostname"
+The name of the host where the device runs when failed over.
+If failover is not used, insert a hyphen as a placeholder.
+.TP
+.I "label"
+The Lustre label associated with the device in the form \fIfsname-SRVnnnn\fR
+where \fIfsname\fR is the file system name, \fISRV\fR is \fBOST\fR or
+\fBMDT\fR, and \fInnnn\fR is the four-digit hex index of the device.
+.TP
+.I "path"
+The path name of the device.  In failover configurations it should be available
+on both local and foreign hosts, e.g. use the symlinks maintained by udev
+in \fI/dev/disk/by-id\fR.
+.TP
+.I "journal-path"
+The path name of the journal device, if any.  This field may be omitted unless
+the raidtab field is present.  If a journal device is not used a hyphen may be
+inserted as a placeholder.
+.TP
+.I "raidtab"
+The path name of a Linux software raid configuration file or ZFS cache file.
+Using non-default names for these files may help prevent arrays from being
+automatically started by the system.  This is important in failover
+configurations where the timing of device initialization must be strictly
+controlled.  This field may be omitted.
+.SH EXAMPLES
+.nf
+
+#local  foreign/-  label    [md:|zfs:]device-path   [journal-path]/- [raidtab]
+
+# ldiskfs on block device example
+tycho-mds1 -    lc1-MDT0000 /dev/sda                /dev/sdc
+tycho1  tycho5  lc1-OST0000 /dev/disk/by-id/scsi-10103a262891d340100
+tycho1  tycho5  lc1-OST0008 /dev/disk/by-id/scsi-10103a262681d340200
+tycho1  tycho5  lc1-OST0010 /dev/disk/by-id/scsi-10103a2629e1d340300
+tycho5  tycho1  lc1-OST0004 /dev/disk/by-id/scsi-101046e6b401d341100
+tycho5  tycho1  lc1-OST000c /dev/disk/by-id/scsi-101046e6b591d341200
+tycho5  tycho1  lc1-OST0014 /dev/disk/by-id/scsi-101046e6bb41d341300
+
+# ldiskfs on Linux software RAID example
+#local  foreign/-  label    [md:|zfs:]device-path   [journal-path]/- [raidtab]
+zwicky-mds1  -    zwicky-MDT0000 md:/dev/md0 -         /etc/mdadm.conf.mds
+zwicky1  zwicky2  zwicky-OST0000 md:/dev/md0 /dev/md10 /etc/mdadm.conf.oss
+zwicky2  zwicky1  zwicky-OST0001 md:/dev/md1 /dev/md20 /etc/mdadm.conf.oss
+
+# ZFS example
+#local  foreign/-  label    [md:|zfs:]device-path   [journal-path]/- [raidtab]
+zeno-mds1 -   zeno-MDT0000 zfs:lustre-zeno-mds1/mdt1 - /etc/zfs/zpool.cache.zeno
+zeno1  zeno5  zeno-OST0000 zfs:lustre-zeno1/ost1     - /etc/zfs/zpool.cache.zeno
+zeno5  zeno1  zeno-OST0001 zfs:lustre-zeno5/ost1     - /etc/zfs/zpool.cache.zeno
+
+.fi
+.SH "SEE ALSO"
+.BR ldev (8)
diff --git a/lustre/doc/lhbadm.8 b/lustre/doc/lhbadm.8
new file mode 100644 (file)
index 0000000..c37dc89
--- /dev/null
@@ -0,0 +1,82 @@
+.TH lhbadm 8 "2009 Apr 29" Lustre "System Administration Utilities"
+.SH NAME
+lhbadm \- Lustre failover utility
+.SH SYNOPSIS
+.B lhbadm {failover|failback} reason ...
+.br
+.B pdsh -g lustre lhbadm status | dshbak -c
+.br
+.SH DESCRIPTION
+.B lhbadm
+simplifies heartbeat administration on Lustre clusters.
+It offers the following operations:
+.TP
+.B status
+Print a single line status message consisting of the heartbeat resource
+status, a hyphen, and lustre status.
+Under normal circumstances, server status should be \fIlocal-local\fR.
+.TP
+.B failover \fIreason ...\fR
+Initiate failover of local services (and foreign if active) to the
+failover partner.
+The command blocks until the transition is complete, which includes
+starting lustre on the partner node.
+Initiation and completion of failover is logged to the syslog
+\fIuser.err\fR facility.
+.TP
+.B failback \fIreason ...\fR
+Initiate failback of the local services from the failover
+partner.  The command blocks until the transition is complete, which
+includes starting lustre.
+Initiation and completion of failback is logged to the syslog
+\fIuser.err\fR facility.
+.SH "HEARTBEAT STATUS VALUES"
+The heartbeat resource status values returned by \fBlhbadm status\fR
+may be one of the following:
+.TP
+.B local
+Heartbeat expects only the local services to be running.
+.TP
+.B none
+Heartbeat expects no services to be running.
+.TP
+.B all
+Heartbeat expects local and foreign services to be running.
+.TP
+.B foreign
+Heartbeat expects only the foreign services to be running.
+.TP
+.B transition
+Resources are in transition.
+.SH "LUSTRE STATUS VALUES"
+The lustre status values returned by \fBlhbadm status\fR
+may be one of the following:
+.TP
+.B loaded
+Lustre modules are loaded but that's about it.
+.TP
+.B recovery
+One or more Lustre services is in recovery.
+.TP
+.B unhealthy
+Lustre is not healthy.
+.TP
+.B none
+Lustre is not running any services.
+.TP
+.B local
+Lustre is running only the local services.
+.TP
+.B foreign
+Lustre is running only the foreign services.
+.TP
+.B all
+Lustre is running both the local and foreign services.
+.TP
+.B partial
+Lustre is partially started and may be running one or more services,
+but not exactly the local, foreign, or all sets.
+.SH SEE ALSO
+.BR cl_status (1)
+.BR hb_takeover (1)
+.BR hb_standby (1)
diff --git a/lustre/doc/nids.5 b/lustre/doc/nids.5
new file mode 100644 (file)
index 0000000..fc47ca5
--- /dev/null
@@ -0,0 +1,33 @@
+.TH nids 5 Lustre nids /etc/nids
+.SH NAME
+/etc/nids \- The static lookup table for Lustre NIDs
+.SH DESCRIPTION
+The nids file maps host names to NIDs and vice-versa.
+.SH FORMAT
+Comments beginning with a hash (#) are ignored.  Each line represents one
+host and includes the following information separated by white space:
+.TP
+.I "hostname"
+The primary hostname of the node, e.g. \fIuname -n\fR.
+.TP
+.I "primary nid"
+The primary NID of the node.
+.TP
+.I "other nid ..."
+Any additional NIDs.
+.SH EXAMPLE
+.nf
+## Tycho
+tycho-mds1 172.16.2.200@tcp 172.16.10.200@tcp
+tycho1     172.16.2.1@tcp   172.16.10.1@tcp
+tycho2     172.16.2.2@tcp   172.16.10.2@tcp
+tycho3     172.16.2.3@tcp   172.16.10.3@tcp
+tycho4     172.16.2.4@tcp   172.16.10.4@tcp
+tycho5     172.16.2.5@tcp   172.16.10.5@tcp
+tycho6     172.16.2.6@tcp   172.16.10.6@tcp
+tycho7     172.16.2.7@tcp   172.16.10.7@tcp
+tycho8     172.16.2.8@tcp   172.16.10.8@tcp
+.fi
+
+.SH FILES
+/etc/nids
index 498ec14..fe6b49d 100644 (file)
@@ -24,5 +24,6 @@
 /lc_net
 /lustre_config
 /lustre_createcsv
 /lc_net
 /lustre_config
 /lustre_createcsv
+/lustre
 /lustre_start
 /tree_status.pl
 /lustre_start
 /tree_status.pl
diff --git a/lustre/scripts/Lustre b/lustre/scripts/Lustre
new file mode 100644 (file)
index 0000000..19c7875
--- /dev/null
@@ -0,0 +1,73 @@
+#!/bin/bash
+#
+# Lustre - Heartbeat R1 Resource Agent for the Lustre file system
+#
+# Usage: Lustre <resource-name> start|stop|status
+#  where <resource-name> has the form "<hostname>-targets"
+#
+
+# warn MESSAGE...
+#
+# Emit MESSAGE, prefixed with "Lustre: ", through the first usable channel:
+#   1. /usr/sbin/ha_logger, when it is executable and /etc/logd.cf exists;
+#   2. /usr/bin/logger, then /bin/logger, whichever is executable first;
+#   3. plain echo to stdout when none of the above is available.
+# The logger commands are invoked with the tag "heartbeat".
+warn ()
+{
+    local msg="Lustre: $*"
+    local logger_bin
+
+    if [ -e /etc/logd.cf ] && [ -x /usr/sbin/ha_logger ]; then
+        /usr/sbin/ha_logger -t heartbeat "$msg"
+        return
+    fi
+
+    for logger_bin in /usr/bin/logger /bin/logger; do
+        if [ -x "$logger_bin" ]; then
+            "$logger_bin" -t heartbeat "$msg"
+            return
+        fi
+    done
+
+    echo "$msg"
+}
+
+# die MESSAGE...
+#
+# Report MESSAGE via warn, then terminate the script with exit status 1.
+die ()
+{
+    warn "$*"; exit 1
+}
+
+
+if [ $# != 2 ]; then
+    die "wrong number of arguments: $*"
+fi
+if ! [ "$2" == "start" -o "$2" == "stop" -o "$2" == "status" ]; then
+    die "bad action arg[2]: $*"
+fi
+
+if ! [ -x /usr/sbin/ldev ]; then
+    die "/usr/sbin/ldev is missing or not executable"
+fi
+if ! [ -x /etc/init.d/lustre ]; then
+    die "/etc/init.d/lustre is missing or not executable"
+fi
+
+action=$2
+if [ "`uname -n`-targets" == "$1" ]; then
+    service=local
+elif [ "`/usr/sbin/ldev -p`-targets" == "$1" ]; then
+    service=foreign
+else
+    die: "bad service arg[1]: $*"
+fi
+
+# Until multi-mount protect is implemented for ZFS we allow heartbeat to
+# force import a pool.  This is required because ZFS will not allow you to
+# import a pool on a new host unless you have cleanly exported it.
+export ZPOOL_IMPORT_ARGS='-f'
+
+# N.B. If status action reports "running", this must pass through to
+# heartbeat unmodified.  Otherwise, stdout/stderr is discarded by heartbeat,
+# so if we want to log diagnostic output from init scripts, we have to
+# redirect it here.
+
+warn /etc/init.d/lustre $action $service
+
+tmpout=`mktemp` || die "mktemp failed"
+/etc/init.d/lustre $action $service >$tmpout
+result=$?
+cat $tmpout | while read line; do
+    echo "$line"
+    warn "$line"
+done
+rm -f $tmpout
+
+exit $result
index 9b00685..19fc744 100644 (file)
 # Lustre is a trademark of Sun Microsystems, Inc.
 #
 
 # Lustre is a trademark of Sun Microsystems, Inc.
 #
 
-sbinscripts = lc_servip lustre_up14 lustre_rmmod
+sbinscripts = lc_servip lustre_up14 lustre_rmmod lhbadm ldev
 
 # These are scripts that are generated from .in files
 genscripts = lustre_config lc_modprobe lc_net lc_hb lc_cluman lustre_createcsv \
 
 # These are scripts that are generated from .in files
 genscripts = lustre_config lc_modprobe lc_net lc_hb lc_cluman lustre_createcsv \
-    lc_md lc_lvm lustre_start
+    lc_md lc_lvm lustre_start lustre
+
+initdir = $(sysconfdir)/init.d
+init_SCRIPTS = lustre lnet
+
+hadir = $(sysconfdir)/ha.d/resource.d
+ha_SCRIPTS = Lustre
 
 sbin_SCRIPTS = $(genscripts) $(sbinscripts)
 bin_SCRIPTS = lustre_req_history lfs_migrate
 
 sbin_SCRIPTS = $(genscripts) $(sbinscripts)
 bin_SCRIPTS = lustre_req_history lfs_migrate
@@ -46,9 +52,11 @@ bin_SCRIPTS = lustre_req_history lfs_migrate
 EXTRA_DIST = license-status maketags.sh version_tag.pl version_tag-git.pl \
             version_tag-cvs.pl version_tag-none.pl lc_common \
             $(addsuffix .in,$(genscripts)) lc_mon $(sbinscripts) \
 EXTRA_DIST = license-status maketags.sh version_tag.pl version_tag-git.pl \
             version_tag-cvs.pl version_tag-none.pl lc_common \
             $(addsuffix .in,$(genscripts)) lc_mon $(sbinscripts) \
-            $(bin_SCRIPTS) make_META.pl
+            $(bin_SCRIPTS) make_META.pl lustre.in lnet lhbadm \
+            haconfig ldev Lustre
 
 scriptlibdir = @libexecdir@/@PACKAGE@
 
 scriptlibdir = @libexecdir@/@PACKAGE@
+scriptlib_SCRIPTS = haconfig
 scriptlib_DATA = lc_common
 
 CLEANFILES = $(genscripts)
 scriptlib_DATA = lc_common
 
 CLEANFILES = $(genscripts)
diff --git a/lustre/scripts/haconfig b/lustre/scripts/haconfig
new file mode 100644 (file)
index 0000000..5869ea2
--- /dev/null
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# haconfig - config helper to process heartbeat V1 config skel files
+
+local=`uname -n`
+
+[ -x /usr/sbin/ldev ] || exit 0
+foreign=`/usr/sbin/ldev -p`
+[ -n "$foreign" ] || exit 0
+
+
+umask 022
+
+for file in /etc/ha.d/haresources /etc/ha.d/ha.cf; do
+    if [ -r ${file}.in ]; then
+        sed -e "s!@LOCAL@!$local!g" -e "s!@FOREIGN@!$foreign!g" \
+           < ${file}.in >${file}
+     fi
+done
+
+exit 0
diff --git a/lustre/scripts/ldev b/lustre/scripts/ldev
new file mode 100644 (file)
index 0000000..4b7b008
--- /dev/null
@@ -0,0 +1,459 @@
+#!/usr/bin/perl
+#
+# ldev - parser for /etc/ldev.conf
+#
+use strict;
+use File::Basename;
+use Getopt::Long qw/ :config posix_default no_ignore_case/;
+
+# Use a fixed search path for the external commands this script runs
+# (uname, blockdev, readlink, dd, mktemp, ...).
+$ENV{PATH} = "/sbin:/usr/sbin:/bin:/usr/bin";
+
+# Program name as invoked, shown in the usage text.
+my $prog = basename($0);
+
+# Help text (presumably printed by usage()).  Keep it in sync with the
+# option table passed to GetOptions in parse_cmdline; -n/--nidsfile is
+# accepted there and is therefore listed here as well.
+my $usage = <<EOF;
+Usage: $prog [OPTIONS]...
+
+Parse ldev.conf and answer the following queries:
+
+  -h, --help          Display this help.
+  -c, --config FILE   Set path to config file.
+  -n, --nidsfile FILE Set path to nids file.
+  -H, --hostname NAME Use NAME instead of local hostname for queries.
+  -p, --partner       Print hostname of failover partner.
+  -l, --local         Print labels for local devices.
+  -f, --foreign       Print labels for foreign devices.
+  -a, --all           Print labels for local and foreign devices.
+  -s, --sanity        Sanity check config on this node.
+  -d, --device LABEL  Print storage device of LABEL.
+  -j, --journal LABEL Print journal device of LABEL if it exists.
+  -r, --raidtab LABEL Print raidtab of LABEL if it exists.
+  -t, --type LABEL    Print device type of LABEL, i.e. "zfs" or "md".
+  -z, --zpool LABEL   Print zpool containing LABEL.
+  CMD [ARGS] ...      Run CMD in parallel for each device substituting:
+                      %f=fsname  %d=device  %i=dec-index %n=main-nid %l=label
+                      %t=srvtype %j=journal %I=hex-index %N=fail-nid
+                      May be used in combination with -l, -f, -a options.
+EOF
+
+# Human-readable messages for config-file parse errors, keyed by the error
+# name that is passed to eparse_line / eparse_str.
+my %eparse = (
+   elabel_uniq  =>    "label used more than once",
+   epairwise    =>    "local and foreign host not mapped to each other",
+   efieldcount  =>    "line has less than the minimum number of fields (4)",
+   ekeyval      =>    "malformed id=name",
+);
+
+# Global program state: command-line settings (filled by parse_cmdline) and
+# the lookup tables built from the config files (filled by parse_config and
+# parse_nids).
+my %conf = ();
+
+#
+# Main
+#
+
+parse_cmdline ();
+
+parse_config ();
+
+# Run every action whose option was supplied, in this fixed order.
+# sanity () ends by calling exit, so nothing after it runs when --sanity
+# is given.
+sanity ()         if $conf{sanity};
+exec_cmd ()       if $conf{execcmd};
+query_partner ()  if $conf{partner};
+query_local ()    if $conf{local};
+query_foreign ()  if $conf{foreign};
+query_all ()      if $conf{all};
+query_device ()   if $conf{device};
+query_journal ()  if $conf{journal};
+query_raidtab ()  if $conf{raidtab};
+query_type ()     if $conf{type};
+query_zpool ()    if $conf{zpool};
+
+exit(0);
+
+#
+# Subroutines
+#
+
+# Parse the command line into the global %conf.
+#
+# Defaults are installed first (config file /etc/ldev.conf, nids file
+# /etc/nids, hostname from `uname -n`, all flags off, all LABEL options
+# empty) and then overridden by GetOptions.  Any arguments left over are
+# joined into the command template $conf{execcmd}, with a leading space.
+#
+# Calls usage () for --help or on an option parsing error, and log_fatal ()
+# when the config file is not readable.  The nids file is only loaded when
+# the command template contains %n or %N.
+sub parse_cmdline
+{
+    my $help = 0;
+
+    $conf{partner} = 0;
+    $conf{all} = 0;
+    $conf{local} = 0;
+    $conf{foreign} = 0;
+    $conf{config} = "/etc/ldev.conf";
+    $conf{nidsfile} = "/etc/nids";
+    $conf{hostname} = `uname -n`; chomp $conf{hostname};
+    $conf{device} = "";
+    $conf{sanity} = 0;
+    $conf{execcmd} = "";
+    $conf{journal} = "";
+    # Same empty default as the other LABEL-valued options, so every key
+    # tested by the main dispatch is always defined.
+    $conf{raidtab} = "";
+    $conf{type} = "";
+    $conf{zpool} = "";
+
+    my $rc = GetOptions (
+        "help|h!"         => \$help,
+        "partner|p!"      => \$conf{partner},
+        "all|a!"          => \$conf{all},
+        "local|l!"        => \$conf{local},
+        "foreign|f!"      => \$conf{foreign},
+        "config|c=s"      => \$conf{config},
+        "nidsfile|n=s"    => \$conf{nidsfile},
+        "hostname|H=s"    => \$conf{hostname},
+        "sanity|s!"       => \$conf{sanity},
+        "device|d=s"      => \$conf{device},
+        "journal|j=s"     => \$conf{journal},
+        "raidtab|r=s"     => \$conf{raidtab},
+        "type|t=s"        => \$conf{type},
+        "zpool|z=s"       => \$conf{zpool},
+    );
+
+    usage() if $help || !$rc;
+
+    log_fatal ("cannot read config file\n") if (! -r $conf{config});
+
+    # Everything GetOptions left in @ARGV is the command to run per device.
+    if (@ARGV) {
+        $conf{execcmd} = " " . join " ", @ARGV;
+    }
+
+    # NID substitutions need the host <-> NID tables from the nids file.
+    parse_nids () if ($conf{execcmd} =~ /(%n|%N)/);
+}
+
+# Read the config file named by $conf{config} and build the lookup tables
+# used by the query subs, storing them in the global %conf:
+#   local_labels / foreign_labels - labels whose local / foreign host is
+#                                   $conf{hostname}
+#   l2f           - local host  -> foreign host
+#   label2dev     - label -> device path (any "type:" prefix removed)
+#   label2local   - label -> local host
+#   label2journal - label -> journal field, when present
+#   label2raidtab - label -> raidtab field, when present
+#   label2type    - label -> device type prefix, when present
+#   label2zpool   - label -> pool name, for zfs "pool/dataset" devices
+# Problems are reported through eparse_line / eparse_str; an unreadable
+# file is reported through log_fatal.
+sub parse_config
+{
+    my $line = 0;
+    my %l2f = ();
+    my %label2local = ();
+    my %label2dev = ();
+    my %label2journal = ();
+    my %label2raidtab = ();
+    my %label2type = ();
+    my %label2zpool = ();
+    my @local_labels = ();
+    my @foreign_labels = ();
+
+    open (CONF, "< $conf{config}") or log_fatal ("$conf{config}: $!\n");
+
+    while (<CONF>) {
+        my $type;
+        $line++;
+        # Drop comments and trailing whitespace, then skip empty lines.
+        s/#.*//;
+        s/(\s)*$//;
+        next if (/^(\s)*$/);
+        chomp;
+        # Whitespace-separated fields; the last two are optional.
+        my ($local, $foreign, $label, $dev, $j, $raidtab) = split;
+        # A device that is not an absolute path and looks like "type:rest"
+        # carries a type prefix; split it off.
+        if ($dev !~ /^\// && $dev =~ /^([^:]+):(.+)$/) {
+            $type = $1;
+            $dev = $2;
+        }
+        eparse_line ($line, "efieldcount") if (!defined $dev);
+        # A given local host must name the same foreign host on every line.
+        eparse_line ($line, "epairwise") if (exists $l2f{$local}
+                                         && $l2f{$local} ne $foreign);
+        $l2f{$local} = $foreign;
+
+        # Each label may appear only once in the file.
+        eparse_line ($line, "elabel_uniq") if (exists $label2dev{$label}
+                                         || exists $label2local{$label});
+        $label2dev{$label} = $dev;
+        $label2local{$label} = $local;
+        $label2journal{$label} = $j if defined $j;
+        $label2raidtab{$label} = $raidtab if defined $raidtab;
+        if (defined $type) {
+            $label2type{$label} = $type;
+            # For zfs, a device of the form "pool/dataset" (exactly one
+            # slash) yields the pool name.
+            if ($type eq "zfs" && $dev =~ m{^([^/]+)/[^/]+$}) {
+                $label2zpool{$label} = $1;
+            }
+        }
+
+        # Classify the label relative to the host being queried.
+        if ($local eq $conf{hostname}) {
+            push @local_labels, $label;
+        } elsif ($foreign eq $conf{hostname}) {
+            push @foreign_labels, $label;
+        }
+    }
+    close CONF;
+
+    # Cross-check the pairing: when a host's foreign partner also appears
+    # as a local host, that partner must point back at the host.
+    foreach (keys %l2f) {
+        my $foreign = $l2f{$_};
+        next if ($foreign eq "-");
+        eparse_str ($_, "epairwise")
+                    unless (!exists $l2f{$foreign} or $l2f{$foreign} eq $_);
+    }
+
+    # Publish the tables in the global %conf.
+    @{$conf{local_labels}} = @local_labels;
+    @{$conf{foreign_labels}} = @foreign_labels;
+    %{$conf{l2f}} = %l2f;
+    %{$conf{label2dev}} = %label2dev;
+    %{$conf{label2local}} = %label2local;
+    %{$conf{label2journal}} = %label2journal;
+    %{$conf{label2raidtab}} = %label2raidtab;
+    %{$conf{label2type}} = %label2type;
+    %{$conf{label2zpool}} = %label2zpool;
+}
+
+# Read the nids file named by $conf{nidsfile} and store two tables in the
+# global %conf:
+#   host2nid - hostname -> primary NID (second field of the line)
+#   nid2host - every NID on the line (primary and additional) -> hostname
+# Comments (# to end of line) and blank lines are ignored.  A line without
+# a NID, or an unreadable file, is reported through log_fatal.
+sub parse_nids ()
+{
+    my $line = 0;
+    my %host2nid = ();
+    my %nid2host = ();
+
+    open (NIDS, "< $conf{nidsfile}") or log_fatal ("$conf{nidsfile}: $!\n");
+
+    while (<NIDS>) {
+        $line++;
+        s/#.*//;
+        next if (/^(\s)*$/);
+        chomp;
+        # split ' ' (unlike /\s+/) skips leading whitespace, so an indented
+        # line does not produce an empty hostname field.
+        my ($host, $nid, $morenids) = split (' ', $_, 3);
+        if (!defined $nid) {
+            log_fatal ("$conf{nidsfile} line $line: incomplete line\n");
+        }
+        $host2nid{$host} = $nid;
+        $nid2host{$nid} = $host;
+        # Additional NIDs are optional; $morenids is undefined without them.
+        if (defined $morenids) {
+            foreach my $extra (split (' ', $morenids)) {
+                $nid2host{$extra} = $host;
+            }
+        }
+    }
+    close NIDS;
+
+    %{$conf{host2nid}} = %host2nid;
+    %{$conf{nid2host}} = %nid2host;
+}
+
+# Print the failover partner of $conf{hostname}, one line, when the host is
+# listed as a local host and its partner is not the "-" placeholder.
+sub query_partner
+{
+    my $partners = $conf{l2f};
+    my $me = $conf{hostname};
+
+    return unless exists $partners->{$me};
+    return if $partners->{$me} eq "-";
+    print "$partners->{$me}\n";
+}
+
+# Print the labels of the local devices, one per line.
+sub query_local
+{
+    foreach my $label (@{$conf{local_labels}}) {
+        print "$label\n";
+    }
+}
+
+# Print the labels of the foreign devices, one per line.
+sub query_foreign
+{
+    foreach my $label (@{$conf{foreign_labels}}) {
+        print "$label\n";
+    }
+}
+
+# Print the local labels followed by the foreign labels, one per line.
+sub query_all
+{
+    print "$_\n" foreach (@{$conf{local_labels}}, @{$conf{foreign_labels}});
+}
+
+# Print the storage device configured for the label given with --device;
+# print nothing when the label is unknown.
+sub query_device
+{
+    my $label = $conf{device};
+    my $devices = $conf{label2dev};
+
+    print "$devices->{$label}\n" if exists $devices->{$label};
+}
+
+# Print the raidtab configured for the label given with --raidtab; print
+# nothing when the label has none.
+sub query_raidtab
+{
+    my $label = $conf{raidtab};
+    my $raidtabs = $conf{label2raidtab};
+
+    print "$raidtabs->{$label}\n" if exists $raidtabs->{$label};
+}
+
+# Print the journal device configured for the label given with --journal.
+# Nothing is printed when the label has no journal field or the field is
+# the "-" placeholder.
+sub query_journal
+{
+    my $label = $conf{journal};
+    my $journals = $conf{label2journal};
+
+    return unless exists $journals->{$label};
+
+    my $jdev = $journals->{$label};
+    print "$jdev\n" if $jdev ne "-";
+}
+
+# Print the device type recorded for the label given with --type; print
+# nothing when the label has no type prefix.
+sub query_type
+{
+    my $label = $conf{type};
+    my $types = $conf{label2type};
+
+    print "$types->{$label}\n" if exists $types->{$label};
+}
+
+# Print the zpool recorded for the label given with --zpool; print nothing
+# when no pool is recorded for it.
+sub query_zpool
+{
+    my $label = $conf{zpool};
+    my $pools = $conf{label2zpool};
+
+    print "$pools->{$label}\n" if exists $pools->{$label};
+}
+
+# Read test for a block device: read up to the first 10 MB of $dpath with
+# dd, in units of the sector size reported by blockdev.
+#
+# Returns 0 when blockdev cannot report a positive sector size and device
+# size; otherwise returns true when dd exits with status 0 and false when
+# it does not.
+sub dd_test
+{
+    my ($dpath) = @_;
+
+    my $sector_size = `blockdev --getss   $dpath 2>/dev/null`;
+    chomp $sector_size;
+    my $nsect512 = `blockdev --getsize $dpath 2>/dev/null`;
+    chomp $nsect512;
+
+    # $? at this point is the status of the second blockdev call only.
+    return 0 unless ($? == 0 && $sector_size > 0 && $nsect512 > 0);
+
+    # Device size expressed in units of $sector_size.
+    my $dev_blocks = ($nsect512 / $sector_size) * 512;
+    my $nblocks = 10 * 1024 * 1024 / $sector_size;  # read first 10mb
+    $nblocks = $dev_blocks if ($nblocks > $dev_blocks);
+
+    my $realdev = `readlink -f $dpath`;
+    chomp $realdev;
+
+    `dd if=$realdev of=/dev/null bs=$sector_size count=$nblocks >/dev/null 2>&1`;
+    return ($? == 0);
+}
+
+sub sanity
+{
+    # Check the devices behind every configured label (local and foreign)
+    # and log an error for each problem found:
+    #   - /dev/disk/by-label/<label> and the configured device must exist
+    #   - the device must pass dd_test
+    #   - both paths must resolve (readlink -f) to the same target
+    #   - a configured journal device must exist and pass dd_test
+    # Never returns: exits with status 0 when every check passed, 1 otherwise.
+    my $exit_val = 0;
+
+    my @local_labels = @{$conf{local_labels}};
+    my @foreign_labels = @{$conf{foreign_labels}};
+    my %label2dev = %{$conf{label2dev}};
+    my %label2journal = %{$conf{label2journal}};
+
+    foreach my $label (@local_labels, @foreign_labels) {
+        my $lpath = "/dev/disk/by-label/$label";
+        my $dpath = $label2dev{$label};
+        my $jpath = $label2journal{$label};
+        if (! -e $lpath) {
+            log_error ("$lpath does not exist\n");
+            $exit_val = 1;
+        }
+        if (! -e $dpath) {
+            log_error ("$dpath does not exist\n");
+            $exit_val = 1;
+        } elsif (!dd_test ($dpath)) {
+            log_error ("$dpath failed dd test\n");
+            $exit_val = 1;
+        }
+        if (`readlink -f $lpath` ne `readlink -f $dpath`) {
+            log_error ("$lpath and $dpath point to different things\n");
+            $exit_val = 1;
+        }
+        # query_journal treats a journal of "-" as "none"; do the same here
+        # instead of testing "-" as if it were a device path.
+        if ($jpath && $jpath ne "-") {
+            if (! -e $jpath) {
+                log_error ("$jpath (journal for $label) does not exist\n");
+                $exit_val = 1;
+            } elsif (!dd_test ($jpath)) {
+                log_error ("$jpath failed dd test\n");
+                $exit_val = 1;
+            }
+        }
+    }
+    exit($exit_val);
+}
+
+sub par_exec
+{
+    # Run commands in parallel, one forked child per argument.
+    # Each argument is a string "<label> <command>"; the command is run by
+    # the shell and every line of its output is prefixed with "<label>: ".
+    # A temporary file serves as a success sentinel: any command that fails
+    # removes it.  Returns 1 when the sentinel survived (no errors), else 0.
+    my %pid2label = ();
+    my %pid2cmd = ();
+    my $pid;
+    my $result = 0;
+
+    my $tmpfile = `mktemp \${TMPDIR:-/tmp}/ldev.XXXXXXXXXX`; chomp $tmpfile;
+    log_fatal ("failed to create $tmpfile\n") if (! -e $tmpfile);
+
+    foreach (@_) {
+        my ($label, $cmd) = split (/\s+/, $_, 2);
+        my ($basecmd) = split (/\s+/, $cmd);
+        if (($pid = fork)) {       # parent
+            $pid2label{$pid} = $label;
+            $pid2cmd{$pid} = $basecmd;
+        } elsif (defined $pid) {   # child
+            #print STDERR "$label: running $cmd\n";
+            exec "($cmd 2>&1 || rm -f $tmpfile) | sed -e 's/^/$label: /'";
+            # Only reached when exec itself failed.  The child must exit
+            # here, otherwise it would continue the parent's fork loop.
+            print STDERR "$label: exec $basecmd: $!\n";
+            unlink $tmpfile;
+            exit 1;
+        } else {                   # error
+            # log_fatal exits, so remove the sentinel first; save the fork
+            # error because unlink may overwrite $!.
+            my $fork_err = "$!";
+            unlink $tmpfile;
+            log_fatal ("$label: fork: $fork_err\n");
+        }
+    }
+    while (($pid = wait) != -1) {
+        #print STDERR "$pid2label{$pid}: completed\n";
+    }
+
+    # sentinel is intact, so there were no errors
+    if (-e $tmpfile) {
+        unlink $tmpfile;
+        $result = 1;
+    }
+
+    return $result;
+}
+
+sub exec_cmd
+{
+    # Expand the command template in $conf{execcmd} once per selected label
+    # and run the resulting commands in parallel through par_exec.
+    # Labels are selected by $conf{foreign} / $conf{local} / $conf{all};
+    # the default is the local labels.
+    # Never returns: exits 0 on success, any failure is fatal (exit 1).
+    my @labels = ();
+    my @cmds = ();
+    my %label2dev = %{$conf{label2dev}};
+    my %label2journal = %{$conf{label2journal}};
+    my %l2f = %{$conf{l2f}};
+    my ($nid, $failnid);
+
+    if ($conf{execcmd} =~ /%n/) {
+        my %host2nid = %{$conf{host2nid}};
+        if (!defined $host2nid{$conf{hostname}}) {
+            log_fatal ("%n used but no nid defined for this host\n");
+        }
+        $nid = $host2nid{$conf{hostname}};
+    }
+    if ($conf{execcmd} =~ /%N/) {
+        if (!defined $l2f{$conf{hostname}}) {
+            log_fatal ("%N used but foreign host is undefined\n");
+        }
+        my %host2nid = %{$conf{host2nid}};
+        if (!defined $host2nid{$l2f{$conf{hostname}}}) {
+            log_fatal ("%N used but foreign nid is undefined\n");
+        }
+        $failnid = $host2nid{$l2f{$conf{hostname}}};
+    }
+
+    if ($conf{foreign} and !$conf{local} and !$conf{all}) {
+        @labels = @{$conf{foreign_labels}};
+    } elsif (!$conf{foreign} and !$conf{all}) {
+        @labels = @{$conf{local_labels}};
+    } else {
+        @labels = (@{$conf{local_labels}}, @{$conf{foreign_labels}});
+    }
+    foreach my $label (@labels) {
+        # Split the label into fsname, server type and index.  The match
+        # result must be checked: after a failed match $1..$3 would still
+        # hold the values of the previous label (or be undefined).
+        my ($fsname, $type, $hexindex) = ("", "", "");
+        if ($label =~ /(\w+)-(OST|MDT|MGT)([0-9a-fA-F]{4})/) {
+            ($fsname, $type, $hexindex) = ($1, lc ($2), $3);
+        } elsif ($conf{execcmd} =~ /%[ftIi]/) {
+            log_fatal ("%f, %t, %I or %i used but $label cannot be parsed\n");
+        }
+        my $decindex = hex($hexindex);
+        my $cmd = $conf{execcmd};
+        my $device = $label2dev{$label};
+        my $journal = $label2journal{$label};
+        # query_journal treats "-" as "no journal"; do the same for %j
+        if ($conf{execcmd} =~ /%j/ &&
+            (!defined $journal || $journal eq "-")) {
+            log_fatal ("%j used but no journal defined for $label\n");
+        }
+
+        $cmd =~ s/%f/$fsname/g;  # %f = fsname
+        $cmd =~ s/%t/$type/g;    # %t = server type
+        $cmd =~ s/%I/$hexindex/g;# %I = index (hex)
+        $cmd =~ s/%i/$decindex/g;# %i = index (dec)
+        $cmd =~ s/%l/$label/g;   # %l = label
+        $cmd =~ s/%d/$device/g;  # %d = device
+        $cmd =~ s/%j/$journal/g; # %j = journal device
+        $cmd =~ s/%n/$nid/g;     # %n = nid
+        $cmd =~ s/%N/$failnid/g; # %N = failnid
+
+        push @cmds, "$label $cmd";
+    }
+
+    par_exec (@cmds) or log_fatal ("parallel command execution failed\n");
+    exit 0;
+}
+
+sub usage
+{
+    # Write the usage text to STDERR and terminate with exit status 0.
+    print STDERR $usage;
+    exit 0;
+}
+
+# Diagnostic helpers.  All messages go to STDERR, prefixed with "$prog: ".
+sub log_msg
+{
+    print STDERR "$prog: ", @_;
+}
+
+# Report a non-fatal error.
+sub log_error
+{
+    log_msg ("Error: ", @_);
+}
+
+# Report a fatal error and terminate with exit status 1.
+sub log_fatal
+{
+    log_msg ("Fatal: ", @_);
+    exit 1;
+}
+
+# Fatal config error at a line number; the second argument is a key of %eparse.
+sub eparse_line
+{
+    my ($lineno, $errkey) = @_;
+    log_fatal ("$conf{config} line $lineno: $eparse{$errkey}\n");
+}
+
+# Fatal config error about a string; the second argument is a key of %eparse.
+sub eparse_str
+{
+    my ($what, $errkey) = @_;
+    log_fatal ("$conf{config}: $what: $eparse{$errkey}\n");
+}
diff --git a/lustre/scripts/lhbadm b/lustre/scripts/lhbadm
new file mode 100644 (file)
index 0000000..e8a0a0a
--- /dev/null
@@ -0,0 +1,137 @@
+#!/bin/bash
+
+# lhbadm - handle some common heartbeat/lustre failover ops
+
+PATH=/sbin:/usr/sbin:/usr/bin:$PATH:/usr/lib64/heartbeat:/usr/lib/heartbeat
+
+declare -r prog=lhbadm
+
+# Print "<prog>: <message>" and exit with status 1.
+die ()
+{
+    printf '%s\n' "$prog: $*"
+    exit 1
+}
+
+# Print "<prog>: <message>" and carry on.
+warn ()
+{
+    printf '%s\n' "$prog: $*"
+}
+
+# Print the command synopsis and exit with status 1.
+usage ()
+{
+    printf '%s\n' \
+        "Usage: $prog status|lstatus|failback|failover" \
+        "  status -   print one-line heartbeat-lustre status" \
+        "  failover - fail all my active resources over to partner" \
+        "  failback - fail my normal resources back"
+    exit 1
+}
+
+test_mounts ()
+{
+    local label
+    local lcount=0
+    local fcount=0
+    local ltot=0
+    local ftot=0
+
+    for label in $(ldev -l); do
+        ltot=$((ltot + 1))
+        if [ "$(service lustre status $label)" == "running" ]; then
+            lcount=$((lcount + 1))
+        fi
+    done
+    for label in $(ldev -f); do
+        ftot=$((ftot+ 1))
+        if [ "$(service lustre status $label)" == "running" ]; then
+            fcount=$((fcount + 1))
+        fi
+    done
+
+    if [ $(($lcount + $fcount)) == 0 ]; then
+        echo none
+    elif [ $lcount == $ltot -a $fcount == 0 ]; then
+        echo local
+    elif [ $lcount == 0     -a $fcount == $ftot ]; then
+        echo foreign
+    elif [ $lcount == $ltot -a $fcount == $ftot ]; then
+        echo all
+    else
+        echo partial
+    fi
+}
+
+# Print the one-line status "<heartbeat resource status>-<lustre status>".
+# When lustre reports "running", the lustre part is refined by test_mounts.
+status ()
+{
+    local rstat fstat
+
+    if ! rstat=$(cl_status rscstatus); then
+        die "cl_status rscstatus failed"
+    fi
+
+    fstat=$(service lustre status)
+    [ "$fstat" != "running" ] || fstat=$(test_mounts)
+
+    echo $rstat-$fstat
+}
+
+# Poll the heartbeat resource status every 5 seconds and return as soon as
+# it is no longer "transition".  Dies when cl_status fails.
+wait_for_transition ()
+{
+    # keep the loop variable out of the global scope
+    local state
+
+    while sleep 5; do
+        state=$(cl_status rscstatus) || die "cl_status rscstatus failed"
+        [ "$state" == "transition" ] || break
+    done
+}
+
+failover ()
+{
+    local s
+
+    [ "$(id -un)" == "root" ] || die "failover requires root privileges"
+    [ $# -gt 0 ] || die "please include a descriptive reason for the logs"
+
+    s=$(status)
+    logger -s -t Lustre-ha -p user.err "failover start, status=$s, reason: $*"
+
+    hb_standby all 2>/dev/null 1>&2 || die "hb_standby all failed"
+    wait_for_transition
+
+    s=$(status)
+    logger -s -t Lustre-ha -p user.err "failover complete, status=$s"
+}
+
+failback ()
+{
+    local s
+
+    [ "$(id -un)" == "root" ] || die "failback requires root privileges"
+    [ $# -gt 0 ] || die "please include a descriptive reason for the logs"
+
+    s=$(status)
+    logger -s -t Lustre-ha -p user.err "failback start, status=$s, reason: $*"
+
+    hb_takeover local || die "hb_takeover local failed"
+    wait_for_transition
+
+    s=$(status)
+    logger -s -t Lustre-ha -p user.err "failover complete, status=$s"
+}
+
+
+#
+# MAIN
+#
+
+# A sub-command is required, and heartbeat must be installed and answering.
+[ $# == 0 ] && usage
+[ -x /usr/bin/cl_status ] || die "Heartbeat is not installed"
+hstat=$(cl_status hbstatus) || die "$hstat"
+
+case "$1" in
+    status)   status ;;
+    lstatus)
+        # NOTE(review): no lstatus function is defined in this script.
+        # Fail with a clear message instead of "command not found".
+        declare -F lstatus >/dev/null || die "lstatus is not implemented"
+        lstatus ;;
+    # "$@" hands the reason words over unchanged (no re-splitting/globbing)
+    failback) shift; failback "$@";;
+    failover) shift; failover "$@";;
+    *) usage ;;
+esac
+
+#  vi: ts=4 sw=4 expandtab
diff --git a/lustre/scripts/lnet b/lustre/scripts/lnet
new file mode 100644 (file)
index 0000000..7033891
--- /dev/null
@@ -0,0 +1,214 @@
+#!/bin/bash
+#
+# lnet This shell script takes care of starting and stopping
+#       the lnet (Lustre networking) services.
+#
+# chkconfig: - 59 76
+# description:  Part of the lustre file system.
+# probe: true
+# config: /etc/sysconfig/lustre
+
+# Source function library.
+[ -f /etc/rc.d/init.d/functions ] && . /etc/rc.d/init.d/functions
+
+# Source networking configuration and check that networking is up.
+[ -f /etc/sysconfig/network ] && . /etc/sysconfig/network && \
+[ "${NETWORKING}" = "no" ] && exit 0
+
+# Check for and source configuration file otherwise set defaults
+[ -f /etc/sysconfig/lnet ] && . /etc/sysconfig/lnet
+
+declare -r TOP_MODULES=(       \
+       obdecho                 \
+       llite                   \
+       lustre                  \
+       osc                     \
+       lov                     \
+       mds                     \
+       mdc                     \
+       mgs                     \
+       mgc                     \
+       ost                     \
+       obdfilter               \
+       lquota                  \
+       ptlrpc                  \
+)
+declare -r BOTTOM_MODULES=(    \
+       ksocklnd                \
+       kqswlnd                 \
+       ko2iblnd                \
+       fsfilt_ldiskfs          \
+       obdclass                \
+       lnet                    \
+       lvfs                    \
+       libcfs                  \
+       ldiskfs                 \
+)
+
+declare -r awkprog='BEGIN { rc = -1 }
+                      { if ( $1 == module_name ) { rc = $3; exit; } }
+                   END { print rc }'
+
+# Usage: run_preexec_check [ start | restart | condrestart ]
+# The single parameter will be passed to the PREEXEC_SCRIPT
+run_preexec_check ()
+{
+       if [ -n "$PREEXEC_CHECK" ] && ! $PREEXEC_CHECK ; then
+               echo "Pre-exec check \"$PREEXEC_CHECK\" failed.  Aborting."
+               exit 1
+       fi
+
+       if [ -n "$PREEXEC_SCRIPT" ] && ! "$PREEXEC_SCRIPT" "$1" ; then
+               echo "Pre-exec script \"$PREEXEC_SCRIPT\" failed.  Aborting."
+               exit 1
+       fi
+}
+
+# Usage: run_postexec_check [ start | restart | condrestart ]
+# The single parameter will be passed to the POSTEXEC_SCRIPT
+run_postexec_check ()
+{
+       if [ -n "$POSTEXEC_CHECK" ] && ! $POSTEXEC_CHECK ; then
+               echo "Post-exec check \"$POSTEXEC_CHECK\" failed.  Aborting."
+               exit 1
+       fi
+
+       if [ -n "$POSTEXEC_SCRIPT" ] && ! "$POSTEXEC_SCRIPT" "$1" ; then
+               echo "Post-exec script \"$POSTEXEC_SCRIPT\" failed.  Aborting."
+               exit 1
+       fi
+}
+
+# Usage: remove_modules <module> [ <module> ... ]
+# Unload the listed kernel modules in the given order.  Modules that are
+# not loaded are skipped.  Returns 1 as soon as a module is still in use
+# after a 5 second grace period or rmmod fails, 0 otherwise.
+remove_modules ()
+{
+       local modules="${@}"
+       local mod
+       local ref_cnt
+
+       for mod in $modules; do
+               ref_cnt=$(/sbin/lsmod | awk "$awkprog" "module_name=$mod")
+               if [ "$ref_cnt" -lt 0 ]; then
+                       # module not loaded, skip it
+                       continue
+               fi
+               if [ "$ref_cnt" -gt 0 ]; then
+                       # module in use.  maybe it just needs a few seconds
+                       # after removal of previous modules.
+                       sleep 5
+                       ref_cnt=$(/sbin/lsmod | awk "$awkprog" "module_name=$mod")
+               fi
+               if [ "$ref_cnt" -eq 0 ]; then
+                       # unload the module
+                       echo "Removing module $mod"
+                       /sbin/rmmod "$mod"
+                       if [ $? -ne 0 ]; then
+                               echo "ERROR: Failed to remove module $mod."
+                               return 1
+                       fi
+               else
+                       # boo!  module still in use.
+                       echo "ERROR: Module $mod has non-zero reference count."
+                       return 1
+               fi
+       done
+
+       return 0
+}
+
+# Unconfigure the LNet network.
+# Returns 0 when lctl succeeded or LNet was already unconfigured; otherwise
+# prints the lctl output and returns 1.
+stop_lnet ()
+{
+       local errmsg
+
+       # Declare and assign separately: with "local errmsg=$(...)" the
+       # status tested afterwards is that of "local", which is always 0.
+       errmsg=$(/usr/sbin/lctl network unconfigure 2>&1) && return 0
+
+       # The following error message means that lnet is already
+       # unconfigured, and the modules are not loaded.
+       if echo "$errmsg" | grep -q "LNET unconfigure error 19"; then
+               return 0
+       fi
+
+       echo "$errmsg"
+       return 1
+}
+
+# Print the LNet state: stopped, loaded, running, unhealthy or LBUG.
+# The global RETVAL is set alongside STATE but is not returned from here.
+status ()
+{
+       old_nullglob="$(shopt -p nullglob)"
+       shopt -u nullglob
+
+       # RETVAL values assigned below:
+       #   3   - not running (LSB)
+       #   0   - running
+       #   1   - health_check reports NOT HEALTHY
+       #   152 - LBUG
+       STATE="stopped"
+       RETVAL=3
+       if egrep -q "lnet" /proc/modules; then
+               STATE="loaded"
+       fi
+
+       # check for any routes - on a portals router this is the only thing
+       if [ "$(cat /proc/sys/lnet/routes 2> /dev/null)" ]; then
+               STATE="running"
+               RETVAL=0
+       fi
+
+       # check if this is a router
+       if [ -d /proc/sys/lnet ]; then
+               ROUTER="$(cat /proc/sys/lnet/routes | head -1 | grep -i -c "Routing enabled")"
+               if [[ -n ${ROUTER} && ${ROUTER} -ge 1 ]]; then
+                       STATE="running"
+                       RETVAL=0
+               fi
+       fi
+
+       HEALTH="/proc/fs/lustre/health_check"
+       if [ -f "$HEALTH" ]; then
+               # check for error in health_check
+               if grep -q "NOT HEALTHY" $HEALTH; then
+                       STATE="unhealthy"
+                       RETVAL=1
+               fi
+               # check for LBUG
+               if grep -q "LBUG" $HEALTH; then
+                       STATE="LBUG"
+                       RETVAL=152
+               fi
+       fi
+
+       echo $STATE
+       eval $old_nullglob
+}
+
+# See how we were called.
+case "$1" in
+  start)
+       run_preexec_check "start"
+       touch /var/lock/subsys/lnet
+       modprobe lnet || exit 1
+       lctl network up || exit 1
+       run_postexec_check "start"
+       ;;
+  stop)
+       # upper-layer modules first, then unconfigure LNet, then the rest
+       run_preexec_check "stop"
+       remove_modules "${TOP_MODULES[@]}" || exit 1
+       stop_lnet || exit 1
+       remove_modules "${BOTTOM_MODULES[@]}" || exit 1
+       rm -f /var/lock/subsys/lnet
+       run_postexec_check "stop"
+       ;;
+  status)
+       status
+       ;;
+  restart)
+       $0 stop
+       $0 start
+       ;;
+  reload)
+       touch /var/lock/subsys/lnet
+       ;;
+  probe)
+       # suggest "start" when the lock file is missing
+       if [ ! -f /var/lock/subsys/lnet ] ; then
+         echo $"start"; exit 0
+       fi
+       ;;
+  condrestart)
+       # restart only if the lock file says we were started
+       [ -f /var/lock/subsys/lnet ] && {
+               $0 stop
+               $0 start
+       }
+       ;;
+  *)
+       # this is the lnet script, so name it (and every action) correctly
+       echo $"Usage: lnet {start|stop|status|restart|reload|probe|condrestart}"
+       exit 1
+esac
+
+exit 0
diff --git a/lustre/scripts/lustre b/lustre/scripts/lustre
deleted file mode 100755 (executable)
index 73c5b22..0000000
+++ /dev/null
@@ -1,243 +0,0 @@
-#!/bin/sh
-#
-# lustre   This shell script takes care of starting and stopping Lustre
-#
-# chkconfig: - 99 1
-# description: Lustre Lite network File System.
-#              This starts both Lustre client and server functions.
-# processname: lconf
-# config: /etc/lustre/config.xml
-# pidfile: /var/run/lustre.pid
-### BEGIN INIT INFO
-# Provides: lustre
-# Required-Start: $network +sshd
-# Required-Stop: $network
-# Should-Start:
-# Should-Stop:
-# Default-Start: 
-# Default-Stop: 0 1 2 3 4 5 6
-# Short-Description: Lustre Lite network File System.
-# Description: This starts both Lustre client and server functions.
-### END INIT INFO
-
-
-SERVICE=${0##*/}
-
-: ${LUSTRE_CFG:=/etc/lustre/lustre.cfg}
-[ -f ${LUSTRE_CFG} ] && . ${LUSTRE_CFG}
-[ -f /etc/sysconfig/lustre ] && . /etc/sysconfig/lustre
-
-: ${LUSTRE_CONFIG_XML:=/etc/lustre/config.xml}
-: ${LCONF:=lconf}
-: ${LCTL:=lctl}
-# Some distros use modprobe.conf.local
-if [ -f /etc/modprobe.conf.local ]; then
-   : ${MODPROBE_CONF:=/etc/modprobe.conf.local}
-else
-   : ${MODPROBE_CONF:=/etc/modprobe.conf}
-fi
-# Be sure the proper directories are in PATH. 
-export PATH="/sbin:$PATH"
-
-case "$SERVICE" in
-    [SK][[:digit:]][[:digit:]]lustre | lustre)
-        SERVICE="lustre"
-       : ${LCONF_START_ARGS:="${LUSTRE_CONFIG_XML}"}
-       : ${LCONF_STOP_ARGS:="--force --cleanup ${LUSTRE_CONFIG_XML}"}
-       ;;
-    *)
-       : ${LCONF_START_ARGS:="--group ${SERVICE} --select ${SERVICE}=${HOSTNAME} ${LUSTRE_CONFIG_XML}"}
-       : ${LCONF_STOP_ARGS:="--group ${SERVICE} --select ${SERVICE}=${HOSTNAME} --failover --cleanup ${LUSTRE_CONFIG_XML}"}
-       ;;
-esac
-LOCK=/var/lock/subsys/$SERVICE
-
-# Source function library.
-if [ -f /etc/init.d/functions ] ; then
-       . /etc/init.d/functions
-fi
-
-# Source networking configuration.
-if [ -f /etc/sysconfig/network ] ; then
-       . /etc/sysconfig/network
-fi
-
-check_start_stop() {
-       # Exit codes now LSB compliant
-       # Check that networking is up. - exit 'not running'
-       [ "${NETWORKING}" = "no" ] && exit 7 
-
-       # exit 'not installed' 
-       [ -x ${LCONF} -a -x ${LCTL} ] || exit 5
-
-       if [ ${LUSTRE_CONFIG_XML:0:1} = "/" ] ; then
-                       if [ ! -f ${LUSTRE_CONFIG_XML} ] ; then
-                       echo "${0##*/}: Configuration file ${LUSTRE_CONFIG_XML} not found; skipping."
-                       # exit 'not configured'
-                       exit 6
-               fi
-       fi
-
-       # Create /var/lustre directory 
-       # This is used by snmp agent for checking lustre services
-       #    status online/offline/online pending/offline pending.
-
-       [ -d ${STATUS_DIR:=/var/lustre} ] || mkdir -p $STATUS_DIR
-       STATUS=${STATUS_DIR}/sysStatus
-}
-
-start() {
-       if [ -x "/usr/sbin/clustat" -a "${SERVICE}" = "lustre" ] ; then
-               if [ ! -f "/etc/lustre/start-despite-clumanager" ] ; then
-               cat >&2 <<EOF
-This script was run directly, which can be dangerous if you are using
-clumanager to manage Lustre services.
-
-If you are not using clumanager for Lustre services, run the following
-command to have this script start Lustre instead:
-
-touch /etc/lustre/start-despite-clumanager
-EOF
-               RETVAL=6  # program not configured
-               return
-           fi
-       fi
-       check_start_stop
-       echo -n "Starting $SERVICE: "
-       if [ $UID -ne 0 ]; then
-               echo "Lustre should be started as root"
-               RETVAL=4 # insufficent privileges
-               return
-       fi
-       # Cat the modprobe file and place all lines that follow a trailing backslash on the same line
-       ROUTER=`cat ${MODPROBE_CONF} | sed ':a;N;$!ba;s#\\\[:space:]*\\n##g' | grep lnet | grep forwarding=\"enabled\"`
-       if [[ ! -z ${ROUTER} ]]; then
-               modprobe lnet
-               ${LCTL} network configure
-       else
-               ${LCONF} ${LCONF_START_ARGS}
-       fi
-       RETVAL=$?
-       echo $SERVICE
-       if [ $RETVAL -eq 0 ]; then
-               touch $LOCK
-               echo "online" >$STATUS
-       else
-               echo "online pending" >$STATUS
-       fi
-}
-
-stop() {
-       check_start_stop
-       echo -n "Shutting down $SERVICE: "
-       if [ $UID -ne 0 ]; then
-               echo "Lustre should be stopped as root"
-               RETVAL=4 # insufficent privileges
-               return
-       fi
-       # Cat the modprobe file and place all lines that follow a trailing backslash on the same line
-+      ROUTER=`cat ${MODPROBE_CONF} | sed ':a;N;$!ba;s#\\\[:space:]*\\n##g' | grep lnet | grep forwarding=\"enabled\"`
-       if [[ ! -z ${ROUTER} ]]; then
-               MODULE_LOADED=`lsmod | awk ' { print $1 } ' | grep lnet`
-               if [[ ! -z ${MODULE_LOADED} ]]; then
-                       ${LCTL} network unconfigure
-               fi
-               ${LCTL} modules | awk '{ print $2 }' | xargs rmmod >/dev/null 2>&1
-               # do it again, in case we tried to unload ksocklnd too early
-               ${LCTL} modules | awk '{ print $2 }' | xargs rmmod
-
-       else
-               ${LCONF} ${LCONF_STOP_ARGS}
-       fi
-       RETVAL=$?
-       echo $SERVICE
-       rm -f $LOCK 
-       if [ $RETVAL -eq 0 ]; then
-               echo "offline" >$STATUS
-       else
-               echo "offline pending" >$STATUS
-       fi
-}
-
-restart() {
-       stop
-       start
-}
-
-status() {
-       STATE="stopped"
-       # LSB compliance - return 3 if service is not running
-       # Lustre-specific returns
-       # 150 - partial startup
-       # 151 - health_check unhealthy
-       # 152 - LBUG
-       RETVAL=3
-       egrep -q "libcfs|lvfs|portals" /proc/modules && STATE="loaded"
-
-       # check for any routes - on a portals router this is the only thing
-       [ "`cat /proc/sys/lnet/routes 2> /dev/null`" ] && STATE="running" && RETVAL=0
-       
-       # check for any configured devices (may indicate partial startup)
-       if [ -d /proc/fs/lustre ]; then
-               [ "`cat /proc/fs/lustre/devices 2> /dev/null`" ] && STATE="partial" && RETVAL=150
-
-               # check for either a server or a client filesystem
-               MDS="`ls /proc/fs/lustre/mds/*/recovery_status 2> /dev/null`"
-               OST="`ls /proc/fs/lustre/obdfilter/*/recovery_status 2> /dev/null`"
-               LLITE="`ls /proc/fs/lustre/llite/fs* 2> /dev/null`"
-               [ "$MDS" -o "$OST" -o "$LLITE" ] && STATE="running" && RETVAL=0
-       else
-               # check if this is a router
-               if [ -d /proc/sys/lnet ]; then
-                       ROUTER="`cat /proc/sys/lnet/routes | head -1 | grep -i -c \"Routing enabled\"`"
-                       if [[ ! -z ${ROUTER} && ${ROUTER} -ge 1 ]]; then
-                               STATE="running"
-                               RETVAL=0
-                       fi
-               fi
-       fi
-
-       # check for server disconnections 
-       DISCON="`grep -v FULL /proc/fs/lustre/*c/*/*server_uuid 2> /dev/null`"
-       [ "$DISCON" ] && STATE="disconnected" && RETVAL=0
-
-       # check for servers in recovery
-       [ "$MDS$OST" ] && grep -q RECOV $MDS $OST && STATE="recovery" && RETVAL=0
-
-       # check for error in health_check
-       HEALTH="/proc/fs/lustre/health_check"
-       [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH && STATE="unhealthy" && RETVAL=151
-
-       # check for LBUG
-       [ -f  "$HEALTH" ] && grep -q "LBUG" $HEALTH && STATE="LBUG" && RETVAL=152
-
-       # If Lustre is up , check if the service really exists
-        # Skip this is we are not checking a specific service
-       if [ $RETVAL -eq 0 ] && [ $SERVICE != 'lustre' ]; then
-               DUMMY=$( $LCTL dl | grep "$SERVICE")
-               [ $? -ne 0 ] && STATE="not_found" && RETVAL=3
-       fi
-
-       echo $STATE
-}
-
-# See how we were called.
-case "$1" in
-  start)
-       start
-       ;;
-  stop)
-       stop
-       ;;
-  restart)
-       restart
-       ;;
-  status)
-       status $SERVICE
-       ;;
-  *)
-       echo "Usage: $SERVICE {start|stop|restart|status}"
-       exit 1
-esac
-
-exit $RETVAL
diff --git a/lustre/scripts/lustre.in b/lustre/scripts/lustre.in
new file mode 100644 (file)
index 0000000..ea8ac39
--- /dev/null
@@ -0,0 +1,740 @@
+#!/bin/bash
+#
+# lustre       This shell script takes care of starting and stopping
+#             the lustre services.
+#
+# chkconfig: - 60 20
+# description:  Part of the lustre file system.
+# probe: true
+# config: /etc/sysconfig/lustre
+
+# Source function library.
+. /etc/rc.d/init.d/functions
+
+# Source networking configuration.
+if [ ! -f /etc/sysconfig/network ]; then
+       exit 0
+fi
+
+. /etc/sysconfig/network
+
+LDEV=${LDEV:-"/usr/sbin/ldev"}
+ZPOOL_LAYOUT=/usr/bin/zpool_layout
+UDEVADM=${UDEVADM:-/sbin/udevadm}
+
+# Check that networking is up.
+[ "${NETWORKING}" = "no" ] && exit 0
+
+# Check for and source configuration file otherwise set defaults
+[ -f /etc/sysconfig/lustre ] && . /etc/sysconfig/lustre
+FSCK_ARGS=${FSCK_ARGS:-""}
+MOUNT_OPTIONS=${MOUNT_OPTIONS:-""}
+LOCAL_SRV=${LOCAL_SRV:-"`$LDEV -l 2>/dev/null`"}
+FOREIGN_SRV=${FOREIGN_SRV:-"`$LDEV -f 2>/dev/null`"}
+REQUIRE_MMP_FEATURE=${REQUIRE_MMP_FEATURE:-${FOREIGN_SRV:+"yes"}}
+LOCAL_MOUNT_DIR=${LOCAL_MOUNT_DIR:-"/mnt/lustre/local"}
+FOREIGN_MOUNT_DIR=${FOREIGN_MOUNT_DIR:-"/mnt/lustre/foreign"}
+SETUP_DEVICES=${SETUP_DEVICES:-""}
+ZPOOL_LAYOUT_BUSES=${ZPOOL_LAYOUT_BUSES:-""}
+ZPOOL_LAYOUT_PORTS=${ZPOOL_LAYOUT_PORTS:-""}
+ZPOOL_LAYOUT_MAP=${ZPOOL_LAYOUT_MAP:-""}
+MOUNT_DELAY=${MOUNT_DELAY:-2}
+LOAD_ZFS=${LOAD_ZFS:-""}
+
+shopt -s nullglob
+
+start_zfs_services ()
+{
+       if [ -n "$ZPOOL_LAYOUT_BUSES" -a -n "$ZPOOL_LAYOUT_PORTS" ] ; then
+               MAP_ARG=${ZPOOL_LAYOUT_MAP:+"-m $ZPOOL_LAYOUT_MAP"}
+               $ZPOOL_LAYOUT -t -b "$ZPOOL_LAYOUT_BUSES" \
+                       -p "$ZPOOL_LAYOUT_PORTS" $MAP_ARG
+       fi
+       if [ "$LOAD_ZFS" = "yes" ] && ! modprobe zfs ; then
+               echo "Failed to load zfs module.  Aborting."
+               exit 1
+       fi
+}
+
+# Usage: stop_devices <label> [ <label> ... ]
+# Release the device behind each label according to its type as reported
+# by "$LDEV -t": export the pool for "zfs", stop the md array and its
+# journal array for "md".  Other types are left alone.
+stop_devices ()
+{
+       local labels=$*
+       local label devtype
+       # keep the per-label values out of the global scope
+       local dev journal
+
+       for label in $labels; do
+               devtype=$($LDEV -t $label)
+               if [ "$devtype" = "zfs" ] ; then
+                       export_zpool $label
+               elif [ "$devtype" = "md" ] ; then
+                       dev=$(label_to_device $label)
+                       journal=$($LDEV -j $label)
+                       stop_md_device $dev
+                       # only stop a journal array when the label has one
+                       if [ -n "$journal" ] ; then
+                               stop_md_device $journal
+                       fi
+               fi
+       done
+}
+
+import_zpool ()
+{
+       local result=1
+       local label=$1
+       local pool=`$LDEV -z $label`
+       local args="-N $ZPOOL_IMPORT_ARGS"
+       local cache=`$LDEV -r $label`
+       # -c is incompatible with -d
+       if [ -n "$cache" ] ; then
+               args="$args -c $cache"
+       elif [ -n "$ZPOOL_IMPORT_DIR" ] ; then
+               args="$args -d $ZPOOL_IMPORT_DIR"
+       elif [ -d "/dev/disk/by-vdev" ] ; then
+               args="$args -d /dev/disk/by-vdev"
+       elif [ -d "/dev/mapper" ] ; then
+               args="$args -d /dev/mapper"
+       fi
+
+       if zpool status $pool >/dev/null 2>&1 ; then
+               result=0
+       elif [ -n "$pool" ] ; then
+               zpool import $pool $args 2>/dev/null
+               result=$?
+       fi
+       return $result
+}
+
+export_zpool ()
+{
+       local label=$1
+       local pool=`$LDEV -z $label`
+       zpool export $pool 2>/dev/null
+}
+
+# Re-trigger udev and wait until its event queue has settled.  Uses
+# $UDEVADM when it is executable, the older udevtrigger/udevsettle
+# commands otherwise.
+udev_trigger()
+{
+       if [ ! -x ${UDEVADM} ]; then
+               /sbin/udevtrigger
+               /sbin/udevsettle
+               return
+       fi
+
+       ${UDEVADM} trigger --action=change --subsystem-match=block
+       ${UDEVADM} settle
+}
+
+# Usage: run_preexec_check [ start | restart | condrestart ]
+# The single parameter will be passed to the PREEXEC_SCRIPT
+run_preexec_check ()
+{
+       if [ -n "$PREEXEC_CHECK" ] && ! $PREEXEC_CHECK ; then
+               echo "Pre-exec check \"$PREEXEC_CHECK\" failed.  Aborting."
+               exit 1
+       fi
+
+       if [ -n "$PREEXEC_SCRIPT" ] && ! "$PREEXEC_SCRIPT" "$1" ; then
+               echo "Pre-exec script \"$PREEXEC_SCRIPT\" failed.  Aborting."
+               exit 1
+       fi
+}
+
+# Usage: run_postexec_check [ start | restart | condrestart ]
+# The single parameter will be passed to the PREEXEC_SCRIPT
+run_postexec_check ()
+{
+       if [ -n "$POSTEXEC_CHECK" ] && ! $POSTEXEC_CHECK ; then
+               echo "Post-exec check \"$POSTEXEC_CHECK\" failed.  Aborting."
+               exit 1
+       fi
+
+       if [ -n "$POSTEXEC_SCRIPT" ] && ! "$POSTEXEC_SCRIPT" "$1" ; then
+               echo "Post-exec script \"$POSTEXEC_SCRIPT\" failed.  Aborting."
+               exit 1
+       fi
+}
+
+# Usage: adjust_scsi_timeout <dev>
+# When SCSI_DEVICE_TIMEOUT is set and <dev> is driven by the "sd" driver,
+# write the timeout to the device's sysfs timeout file.
+# Returns 1 only if that write fails, 0 in every other case.
+adjust_scsi_timeout ()
+{
+       local dev=$1
+       local name proc driver
+
+       [ -n "$SCSI_DEVICE_TIMEOUT" ] || return 0
+
+       name=$(basename $dev)
+       proc=/sys/block/${name}/device/timeout
+       driver=$(readlink /sys/block/${name}/device/driver)
+
+       # make sure that it is actually a SCSI (sd) device
+       if [ -z "$driver" ] || [ "$(basename $driver)" != "sd" ]; then
+               return 0
+       fi
+
+       if ! echo $SCSI_DEVICE_TIMEOUT >$proc; then
+               echo "FAILED: could not adjust ${dev} timeout"
+               return 1
+       fi
+       return 0
+}
+
+# Usage: fsck_test <dev> [ <dev> ... ]
+# Checks all devices in parallel if FSCK_ARGS is set.
+fsck_test ()
+{
+       local devices="$*"
+
+       # Filter out non-absolute paths, which are probably ZFS datasets
+       devices=`echo $devices |xargs -n 1|grep '^/'|xargs`
+
+       if [ -n "${FSCK_ARGS}" -a -n "$devices" ]; then
+               if [ -x /sbin/@PFSCK@ ] ; then
+                       echo "@PFSCK@ $devices -- ${FSCK_ARGS}"
+                       /sbin/@PFSCK@ $devices -- ${FSCK_ARGS}
+                       if [ $? -ne 0 -a $? -ne 1 ] ; then
+                               echo "FAILED: @PFSCK@ -- ${FSCK_ARGS}: $?"
+                               return 1
+                       fi
+               else
+                       echo "/sbin/@PFSCK@ not found"
+                       return 1
+               fi
+       fi
+       return 0
+}
+
+# Usage: test_feature_flag <dev> <flag>
+test_feature_flag()
+{
+       local dev=$1
+       local flag=$2
+       local result=1
+       local feature
+
+       for feature in `/sbin/@TUNE2FS@ -l $dev 2>/dev/null \
+                               | grep features: | sed -e 's/^.*: //'`; do
+               if [ "$feature" == "$flag" ]; then
+                       result=0
+                       break
+               fi
+       done
+
+       return $result
+}
+
+# Usage: mmp_test <dev>
+# Returns 0 if the mmp feature is set on <dev> or REQUIRE_MMP_FEATURE is
+# not "yes"; returns 1 if it is required but unset, or @TUNE2FS@ is missing.
+mmp_test ()
+{
+       local dev=$1
+
+       [ "$REQUIRE_MMP_FEATURE" == "yes" ] || return 0
+
+       if [ ! -x /sbin/@TUNE2FS@ ]; then
+               echo "/sbin/@TUNE2FS@ not found"
+               return 1
+       fi
+       if ! test_feature_flag $dev "mmp"; then
+               echo "mmp feature flag is not set on $dev"
+               return 1
+       fi
+
+       return 0
+}
+
+# Usage: label_to_mountpt <label>
+# Prints "<mount dir>/<label>" when <label> is listed in LOCAL_SRV (checked
+# first) or FOREIGN_SRV; prints nothing for an unknown label.
+label_to_mountpt ()
+{
+       local label=$1
+       local serv
+
+       for serv in $LOCAL_SRV; do
+               [ "$serv" == "$label" ] || continue
+               echo "$LOCAL_MOUNT_DIR/$label"
+               return
+       done
+       for serv in $FOREIGN_SRV; do
+               [ "$serv" == "$label" ] || continue
+               echo "$FOREIGN_MOUNT_DIR/$label"
+               return
+       done
+}
+
+# Usage: label_to_device <label>
+# Prints the canonical path behind /dev/disk/by-label/<label> when that
+# symlink exists; otherwise prints whatever "$LDEV -d <label>" reports.
+label_to_device ()
+{
+       local label=$1
+       local path=/dev/disk/by-label/$label
+
+       if [ ! -h $path ] ; then
+               $LDEV -d $label
+               return
+       fi
+       readlink --canonicalize $path
+}
+
+# helper for mountpt_is_active() and device_is_active()
+# awk program: exits 0 if any input line has column number "field" equal
+# to "path", 1 otherwise.  Both names are awk variables that the callers
+# set on the awk command line.
+declare -r awkprog='BEGIN {rc = 1;}
+                       { if ($field == path) {rc = 0;} }
+                   END { exit rc;}'
+
+# Usage: mountpt_is_active <label>
+# Return 0 when the mount point for <label> shows up in the second column
+# of /proc/mounts, 1 when it does not.
+# Return 1 (inactive) on invalid label.
+mountpt_is_active ()
+{
+       local dir=`label_to_mountpt $1`
+
+       # Unknown label: there is no mount point to look for.
+       [ -n "$dir" ] || return 1
+
+       # The awk exit status is the function's result.
+       awk "$awkprog" field=2 path=$dir < /proc/mounts
+}
+
+# Usage: device_is_active <label>
+# Return 0 when the device backing <label> shows up in the first column
+# of /proc/mounts, 1 when it does not.
+# Return 1 (inactive) on invalid label.
+device_is_active ()
+{
+       local dev=`label_to_device $1`
+       local result=1
+
+       if [ -n "$dev" ]; then
+               # Compare against $dev, not $dir: "dir" is not defined in
+               # this function, so it would resolve to whatever the caller
+               # last stored in its own "dir" (usually a mount point).
+               awk "$awkprog" field=1 path="$dev" < /proc/mounts
+               result=$?
+       fi
+       return $result
+}
+
+# Usage: mount_one_device <label> <successflag> [devtype]
+# Remove <succesflag> on error (trick to detect errors after parallel runs).
+mount_one_device ()
+{
+       local label=$1
+       local successflag=$2
+       local devtype=$3
+       local dev=`label_to_device $label`
+       local dir=`label_to_mountpt $label`
+
+       # $dir and $dev have already been checked at ths point
+       if [ ! -d $dir ] && ! mkdir -p $dir; then
+               rm -f $successflag
+               return
+       fi
+       echo "Mounting $dev on $dir"
+       if ! mount -t lustre $MOUNT_OPTIONS $dev $dir; then
+               rm -f $successflag
+               return
+       fi
+}
+
+# Usage: assemble_md_device <device>
+# Assemble the md device backing device.
+# Return 0 if the array is assembled successfully or was already active,
+# otherwise return error code from mdadm.
+assemble_md_device ()
+{
+       local dev=$1
+       local raidtab=$2
+       local args="-Aq"
+       local result=0
+
+       if [ -n "$raidtab" ] ; then
+               args="$args -c $raidtab"
+       fi
+
+       if ! md_array_is_active $dev ; then
+               mdadm $args $dev
+               result=$?
+       fi
+
+       udev_trigger
+       return $result
+}
+
+# Usage: stop_md_device <device>
+# Stop the md device backing device.
+# Return 0 if the array is stopped successfully or was not active,
+# otherwise return error code from mdadm.
+stop_md_device ()
+{
+       local dev=$1
+       local raidtab=$2
+       local args="-Sq"
+       local result=0
+
+       if [ -n "$raidtab" ] ; then
+               args="$args -c $raidtab"
+       fi
+
+       if [ -e $dev ] && md_array_is_active $dev ; then
+               mdadm $args $dev
+               result=$?
+       fi
+
+       return $result
+}
+
+# Usage: md_array_is_active <device>
+# Return 1 when <device> does not exist or "mdadm --detail -t" exits with
+# status 4; return 0 for every other mdadm exit status.
+md_array_is_active ()
+{
+       local device=$1
+
+       [ -e "$device" ] || return 1
+
+       mdadm --detail -t $device > /dev/null 2>&1
+       # Only status 4 is treated as "not an active array".
+       [ $? -ne 4 ]
+}
+
+# Usage: start_services <label> [ <label> ... ]
+# fsck and mount any devices listed as arguments (in parallel).
+# Attempt to assemble software raid arrays or zfs pools backing
+# Lustre devices.
+start_services ()
+{
+       local result=0
+       local devices=""
+       local dir dev label
+       local successflag
+       local labels
+
+       start_zfs_services
+       for label in $*; do
+               dir=`label_to_mountpt $label`
+               devtype=`$LDEV -t $label`
+               dev=`label_to_device $label`
+               journal=`$LDEV -j $label`
+               raidtab=`$LDEV -r $label`
+
+               if [ -z "$dir" ] || [ -z "$dev" ]; then
+                       echo "$label is not a valid lustre label on this node"
+                       result=2
+                       continue
+               fi
+
+               if [ "$devtype" = "md" ] ; then
+                       if ! assemble_md_device $dev $raidtab ; then
+                               echo "failed to assemble array $dev backing $label"
+                               result=2
+                               continue
+                       fi
+               elif [ "$devtype" = "zfs" ] ; then
+                       if ! import_zpool $label ; then
+                               result=2
+                       fi
+               fi
+
+               # Journal device field in ldev.conf may be "-" or empty,
+               # so only attempt to assemble if its an absolute path.
+               # Ignore errors since the journal device may not be an
+               # md device.
+               if echo $journal | grep -q ^/ ; then
+                       assemble_md_device $journal $raidtab 2>/dev/null
+               fi
+
+               if [ "x$devtype" != "xzfs" ] ; then
+                       if mountpt_is_active $label || \
+                          device_is_active $label; then
+                               echo "$label is already mounted"
+                               # no error
+                               continue
+                       fi
+                       if ! mmp_test $dev; then
+                               result=2
+                               continue
+                       fi
+                       if ! adjust_scsi_timeout $dev; then
+                               result=2
+                               continue
+                       fi
+               fi
+               devices="$devices $dev"
+               labels="$labels $label"
+       done
+       if [ $result == 0 ]; then
+               fsck_test $devices || return 2
+
+               # Fork to handle multiple mount_one_device()'s in parallel.
+               # Errors occurred if $successflag comes up missing afterwards.
+               successflag=`mktemp`
+               [ -e $successflag ] || return 2
+               for label in $labels; do
+                       mount_one_device $label $successflag `$LDEV -t $label` &
+                       # stagger to avoid module loading races
+                       if [[ -n $MOUNT_DELAY && $MOUNT_DELAY -gt 0 ]] ; then
+                               sleep $MOUNT_DELAY
+                       fi
+               done
+               for label in $labels; do
+                       wait
+               done
+               [ -e $successflag ] || return 2
+               rm -f $successflag
+       fi
+
+       return $result
+}
+
+# Usage: stop_services <label> [ <label> ... ]
+# Unmount any devices listed as arguments (serially).
+# Any devices which are not mounted or don't exist are skipped with no error.
+stop_services ()
+{
+       local labels=$*
+       local result=0
+       local dir dev label
+
+       for label in $labels; do
+               dir=`label_to_mountpt $label`
+               if [ -z "$dir" ]; then
+                       echo "$label is not a valid lustre label on this node"
+                       result=2
+                       continue
+               fi
+               if ! mountpt_is_active $label; then
+                       #echo "$label is not mounted"
+                       # no error
+                       continue
+               fi
+               echo "Unmounting $dir"
+               umount $dir || result=2
+       done
+       # double check!
+       for label in $labels; do
+               if mountpt_is_active $label; then
+                       dir=`label_to_mountpt $label`
+                       echo "Mount point $dir is still active"
+                       result=2
+               fi
+               if device_is_active $label; then
+                       dev=`label_to_device $label`
+                       echo "Device $dev is still active"
+                       result=2
+               fi
+       done
+       stop_devices $labels
+
+       return $result
+}
+
+# Usage: start_lustre_services [local|foreign|all|<label>]
+# Translate the target keyword into a label list and start it.
+# With no parameter the local devices are started.
+# Exits the script with status 2 if start_services fails.
+start_lustre_services ()
+{
+       local labels=""
+
+       case "$1" in
+               all)
+                       labels="$LOCAL_SRV $FOREIGN_SRV"
+                       ;;
+               foreign)
+                       labels=$FOREIGN_SRV
+                       ;;
+               local|"")
+                       labels=$LOCAL_SRV
+                       ;;
+               *)
+                       labels="$1"
+                       ;;
+       esac
+
+       # for use by heartbeat V1 resource agent:
+       # starting an already-started service must not be an error
+       if ! start_services $labels; then
+               exit 2
+       fi
+}
+
+# Usage: stop_lustre_services [local|foreign|all|<label>]
+# If no parameter is specified all devices will be stopped.
+stop_lustre_services ()
+{
+       local labels=""
+
+       case "$1" in
+               local) labels=$LOCAL_SRV
+                       ;;
+               foreign)
+                       labels=$FOREIGN_SRV
+                       ;;
+               ""|all) labels="$LOCAL_SRV $FOREIGN_SRV"
+                       ;;
+               *)      labels="$1"
+                       ;;
+       esac
+       # for use by heartbeat V1 resource agent:
+       # stopping already-stopped service must not be an error
+       stop_services $labels || exit 2
+}
+
+# General lustre health check - not device specific.
+# Prints a single state word on stdout (stopped, loaded, partial, running,
+# disconnected, recovery, unhealthy or LBUG) and returns the matching
+# status code.  The checks run in order and each later match overwrites
+# the STATE/RETVAL chosen by an earlier one.
+# STATE, RETVAL and the other upper-case variables assigned here are not
+# declared local, so they remain set in the calling shell.
+health_check ()
+{
+       # Remember the caller's nullglob setting and turn it off for the
+       # duration; it is restored by the eval just before returning.
+       old_nullglob="`shopt -p nullglob`"
+       shopt -u nullglob
+
+       STATE="stopped"
+       # LSB compliance - return 3 if service is not running
+       # Lustre-specific returns
+       # 150 - partial startup
+       # 151 - health_check unhealthy
+       # 152 - LBUG
+       RETVAL=3
+       # Changes STATE only; RETVAL stays 3 when just the modules are loaded.
+       egrep -q "libcfs|lvfs|portals" /proc/modules && STATE="loaded"
+
+       # check for any configured devices (may indicate partial startup)
+       if [ -d /proc/fs/lustre ]; then
+               if [ -n "`cat /proc/fs/lustre/devices 2> /dev/null`" ] ; then
+                       STATE="partial"
+                       RETVAL=150
+               fi
+
+               # check for either a server or a client filesystem
+               # Each variable ends up empty when its glob matches nothing,
+               # because ls fails and its error output is discarded.
+               MDT="`ls /proc/fs/lustre/mdt/*/recovery_status 2> /dev/null`"
+               OST="`ls /proc/fs/lustre/obdfilter/*/recovery_status \
+                       2> /dev/null`"
+               LLITE="`ls /proc/fs/lustre/llite/fs* 2> /dev/null`"
+               if [ "$MDT" -o "$OST" -o "$LLITE" ]; then
+                       STATE="running"
+                       RETVAL=0
+               fi
+       else
+               # check if this is a router
+               if [ -d /proc/sys/lnet ]; then
+                       # ROUTER is 1 when the first line of the routes file
+                       # contains "Routing enabled" (case-insensitive), else 0.
+                       ROUTER="`cat /proc/sys/lnet/routes | head -1 |
+                               grep -i -c \"Routing enabled\"`"
+                       if [[ ! -z ${ROUTER} && ${ROUTER} -ge 1 ]]; then
+                               STATE="running"
+                               RETVAL=0
+                       fi
+               fi
+       fi
+
+       # check for server disconnections
+       # Any *server_uuid line that does not contain "FULL" counts.
+       DISCON="`grep -v FULL /proc/fs/lustre/*c/*/*server_uuid 2> /dev/null`"
+       if [ -n "$DISCON" ] ; then
+               STATE="disconnected"
+               RETVAL=0
+       fi
+
+       # check for servers in recovery
+       # $MDT and $OST are the recovery_status file lists gathered above.
+       if [ -n "$MDT$OST" ] && grep -q RECOV $MDT $OST ; then
+               STATE="recovery"
+               RETVAL=0
+       fi
+
+       # check for error in health_check
+       # NOTE(review): the return-code list above documents 151 for
+       # "unhealthy", but the code returns 1 - confirm which is intended.
+       HEALTH="/proc/fs/lustre/health_check"
+       if [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH ; then
+               STATE="unhealthy"
+               RETVAL=1
+       fi
+
+       # check for LBUG
+       if [ -f  "$HEALTH" ] && grep -q "LBUG" $HEALTH ; then
+               STATE="LBUG"
+               RETVAL=152
+       fi
+
+       echo $STATE
+       # Restore the nullglob setting saved on entry.
+       eval $old_nullglob
+       return $RETVAL
+}
+
+# Usage: status [local|foreign|all|<label>]
+# If no parameter is specified, general lustre health status will be reported.
+status ()
+{
+       local labels=""
+       local label dir
+       local valid_devs=0
+
+       case "$1" in
+               local) labels=$LOCAL_SRV;
+                       ;;
+               foreign)
+                       labels=$FOREIGN_SRV;
+                       ;;
+               all)    labels="$LOCAL_SRV $FOREIGN_SRV"
+                       ;;
+               "")     # ASSUMPTION: this is not the heartbeat res agent
+                       health_check
+                       exit $?
+                       ;;
+               *)      labels=$1
+                       ;;
+       esac
+       # for use by heartbeat V1 resource agent:
+       # print "running" if *anything* is running.
+       for label in $labels; do
+               dir=`label_to_device $label`
+               if [ -z "$dir" ]; then
+                       echo "$label is not a valid lustre label on this node"
+                       # no error
+                       continue
+               fi
+               valid_devs=1
+               if mountpt_is_active $label || device_is_active $label; then
+                       echo "running"
+                       exit 0
+               fi
+       done
+       [ $valid_devs == 1 ] && echo "stopped"
+       exit 3
+}
+
+# Print command-line help on stdout and exit the script with status 1.
+# The optional target of start, stop and status may also be "all", which
+# selects both the local and the foreign services.
+usage ()
+{
+       cat <<EOF
+Usage: lustre {start|stop|status|restart|reload|condrestart}
+
+       lustre start  [local|foreign|all|<label>]
+       lustre stop   [local|foreign|all|<label>]
+       lustre status [local|foreign|all|<label>]
+EOF
+       exit 1
+}
+
+# See how we were called.
+case "$1" in
+  start)
+       if [ $# -gt 2 ] ; then
+               echo "ERROR: Too many arguments."
+               usage
+       fi
+       run_preexec_check "start"
+       start_lustre_services $2
+       run_postexec_check "start"
+       ;;
+  stop)
+       if [ $# -gt 2 ] ; then
+               echo "ERROR: Too many arguments."
+               usage
+       fi
+       run_preexec_check "stop"
+       stop_lustre_services $2
+       run_postexec_check "stop"
+       ;;
+  status)
+       if [ $# -gt 2 ] ; then
+               echo "ERROR: Too many arguments."
+               usage
+       fi
+       status $2
+       ;;
+  restart)
+       $0 stop
+       $0 start
+       ;;
+  reload)
+       ;;
+  probe)
+       ;;
+  condrestart)
+       if grep lustre /proc/mounts ; then
+               $0 stop
+               $0 start
+       fi
+       ;;
+  *)
+       usage
+esac
+
+exit 0