Whamcloud - gitweb
LU-8458 pacemaker: Script to monitor Server status 97/22297/7
authorGabriele Paciucci <gabriele.paciucci@intel.com>
Mon, 8 Aug 2016 16:29:24 +0000 (17:29 +0100)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 6 Apr 2017 00:59:57 +0000 (00:59 +0000)
A new script to be used in Pacemaker to monitor
the Lustre Servers status compatible with ZFS and
LDISKFS based Lustre server installations.

This RA is able to monitor a Lustre Server
using the Pacemaker's clone technology.

pcs resource create [Resource Name] ocf:lustre:healthLUSTRE
dampen=[seconds 5s]
--clone

where:
* dampen The time to wait (dampening) further changes occur

This script should be located in /usr/lib/ocf/resource.d/lustre/
of both the Lustre servers with permission 755.

Test-Parameters: trivial
Signed-off-by: Gabriele Paciucci <gabriele.paciucci@intel.com>
Change-Id: Ibfbad748e8c1b0c7faecc91984def87002070033
Reviewed-on: https://review.whamcloud.com/22297
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Nathaniel Clark <nathaniel.l.clark@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
contrib/scripts/pacemaker/healthLUSTRE [new file with mode: 0755]

diff --git a/contrib/scripts/pacemaker/healthLUSTRE b/contrib/scripts/pacemaker/healthLUSTRE
new file mode 100755 (executable)
index 0000000..07e0bad
--- /dev/null
@@ -0,0 +1,220 @@
+#!/bin/sh
+#
+#
+#      HealthLUSTRE OCF RA
+#
+
+# License:      GNU General Public License (GPL)v2
+# Description:  Manages ZFS and Lustre on a shared storage
+# Written by:   Gabriele Paciucci
+# Release Date: 01 November 2016
+# Release Version: 0.99.3
+# Copyright (c) 2009 Andrew Beekhof
+# Copyright (c) 2016, Intel Corporation
+
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.  Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.
+# If not, see <http://www.gnu.org/licenses/gpl-2.0.html>
+#
+
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
+. ${OCF_FUNCTIONS}
+: ${__OCF_ACTION=$1}
+
+#######################################################################
+
+meta_data() {
+       cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="healthLUSTRE">
+<version>0.99.3</version>
+
+<longdesc lang="en">
+Every time the monitor action is run, this resource agent
+records (in the CIB) the current number of healthy lustre server
+</longdesc>
+<shortdesc lang="en">lustre servers healthy</shortdesc>
+
+<parameters>
+
+<parameter name="pidfile" unique="0">
+<longdesc lang="en">PID file</longdesc>
+<shortdesc lang="en">PID file</shortdesc>
+<content type="string" default="$HA_VARRUN/healthLUSTRE-${OCF_RESOURCE_INSTANCE}" />
+</parameter>
+
+<parameter name="dampen" unique="0">
+<longdesc lang="en">
+The time to wait (dampening) further changes occur
+</longdesc>
+<shortdesc lang="en">Dampening interval</shortdesc>
+<content type="integer" default="5s"/>
+</parameter>
+
+
+<parameter name="name" unique="0">
+<longdesc lang="en">
+The name of the attributes to set.
+This is the name to be used in the constraints.
+</longdesc>
+<shortdesc lang="en">Attribute name</shortdesc>
+<content type="string" default="lustred"/>
+</parameter>
+
+
+
+<parameter name="debug" unique="0">
+<longdesc lang="en">
+Enables to use default attrd_updater verbose logging on every call.
+</longdesc>
+<shortdesc lang="en">Verbose logging</shortdesc>
+<content type="string" default="false"/>
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start"   timeout="60" />
+<action name="stop"    timeout="20" />
+<action name="reload"  timeout="100" />
+<action name="monitor" depth="0"  timeout="60" interval="10"/>
+<action name="meta-data"  timeout="5" />
+<action name="validate-all"  timeout="30" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+
+lustre_conditional_log() {
+       level=$1; shift
+       if [ ${OCF_RESKEY_debug} = "true" ]; then
+               ocf_log $level "$*"
+       fi
+}
+
+lustre_usage() {
+       cat <<END
+usage: $0 {start|stop|monitor|migrate_to|migrate_from|validate-all|meta-data}
+
+Expects to have a fully populated OCF RA-compliant environment set.
+END
+}
+
+lustre_start() {
+       lustre_monitor
+       if [ $? =  $OCF_SUCCESS ]; then
+               return $OCF_SUCCESS
+       fi
+       touch ${OCF_RESKEY_pidfile}
+       lustre_update
+}
+
+lustre_stop() {
+       rm -f ${OCF_RESKEY_pidfile}
+       attrd_updater -D -n $OCF_RESKEY_name -d $OCF_RESKEY_dampen $attrd_options
+       return $OCF_SUCCESS
+}
+
+lustre_monitor() {
+       if [ -f ${OCF_RESKEY_pidfile} ]; then
+               lustre_update
+               if [ $? -eq 0 ]; then
+                   return $OCF_SUCCESS
+               fi
+               return $OCF_ERR_GENERIC
+       fi
+       return $OCF_NOT_RUNNING
+}
+
+
+lustre_check() {
+       active=0
+
+       # added head -1 due the LU-7486
+       l_out=`cat /proc/fs/lustre/health_check | head -1 |grep -w healthy 2>&1`; rc=$?
+
+       case $rc in
+           0) active=`expr $active + 1`;;
+           1) lustre_conditional_log warn "Lustre is not healthy: $l_out";;
+           *) ocf_log err "Unexpected result for '/proc/fs/lustre/health_check' $rc: $l_out";;
+       esac
+       return $active
+}
+
+lustre_update() {
+       lustre_check
+       active=$?
+
+       attrd_updater -n $OCF_RESKEY_name -v $active -d $OCF_RESKEY_dampen $attrd_options
+       rc=$?
+       case $rc in
+               0) lustre_conditional_log debug "Updated $OCF_RESKEY_name = $active" ;;
+               *) ocf_log warn "Could not update $OCF_RESKEY_name = $active: rc=$rc";;
+       esac
+       if [ $rc -ne 0 ]; then
+               return $rc
+       fi
+       return 0
+}
+
+
+if [ ! -f /proc/fs/lustre/health_check ]; then
+       ocf_log warn "Attention Health_Check file doesn't exist. Lustre will be loaded"
+       modprobe lustre
+fi
+
+
+if [ ${OCF_RESKEY_CRM_meta_globally_unique} = "false" ]; then
+       : ${OCF_RESKEY_pidfile:="$HA_VARRUN/healthLUSTRE-${OCF_RESKEY_name}"}
+else
+       : ${OCF_RESKEY_pidfile:="$HA_VARRUN/healthLUSTRE-${OCF_RESOURCE_INSTANCE}"}
+fi
+
+attrd_options='-q'
+if ocf_is_true ${OCF_RESKEY_debug} ; then
+    attrd_options=''
+fi
+
+: ${OCF_RESKEY_name:="lustred"}
+: ${OCF_RESKEY_debug:="false"}
+
+case $__OCF_ACTION in
+meta-data)     meta_data
+               exit $OCF_SUCCESS
+               ;;
+start)         lustre_start;;
+stop)          lustre_stop;;
+monitor)       lustre_monitor;;
+reload)                lustre_start;;
+validate-all)  lustre_usage
+               exit $OCF_SUCCESS
+               ;;
+usage|help)    lustre_usage
+               exit $OCF_SUCCESS
+               ;;
+*)             lustre_usage
+               exit $OCF_ERR_UNIMPLEMENTED
+               ;;
+esac