8 # License: GNU General Public License (GPL)v2
9 # Description: Manages ZFS and Lustre on a shared storage
10 # Written by: Gabriele Paciucci
11 # Release Date: 01 November 2016
12 # Release Version: 0.99.4
14 # Copyright (c) 2009 Andrew Beekhof
15 # Copyright (c) 2016, Intel Corporation
19 # This program is free software; you can redistribute it and/or modify
20 # it under the terms of version 2 of the GNU General Public License as
21 # published by the Free Software Foundation.
23 # This program is distributed in the hope that it would be useful, but
24 # WITHOUT ANY WARRANTY; without even the implied warranty of
25 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
27 # Further, this software is distributed without any warranty that it is
28 # free of the rightful claim of any third person regarding infringement
29 # or the like. Any license provided herein, whether implied or
30 # otherwise, applies only to this software file. Patent licenses, if
31 # any, provided herein do not apply to combinations of this program with
32 # other software, or any other product whatsoever.
34 # You should have received a copy of the GNU General Public License
35 # along with this program. If not, see <http://www.gnu.org/licenses/>.
38 #######################################################################
41 : ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
45 #######################################################################
50 <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
51 <resource-agent name="healthLNET">
52 <version>0.99.4</version>
55 Every time the monitor action is run, this resource agent records (in the CIB)
56 the current number of lctl ping nodes the host can connect to.
58 <shortdesc lang="en">LNet connectivity</shortdesc>
62 <parameter name="pidfile" unique="0">
63 <longdesc lang="en">PID file</longdesc>
64 <shortdesc lang="en">PID file</shortdesc>
65 <content type="string" default="$HA_VARRUN/ping-${OCF_RESOURCE_INSTANCE}" />
68 <parameter name="dampen" unique="0">
70 The time to wait (dampening) further changes occur
72 <shortdesc lang="en">Dampening interval</shortdesc>
73 <content type="integer" default="5s"/>
76 <parameter name="name" unique="0">
78 The name of the attributes to set. This is the name to be used in the constraints.
80 <shortdesc lang="en">Attribute name</shortdesc>
81 <content type="string" default="pingd"/>
84 <parameter name="multiplier" unique="0">
86 The number by which to multiply the number of connected ping nodes by
88 <shortdesc lang="en">Value multiplier</shortdesc>
89 <content type="integer" default=""/>
92 <parameter name="host_list" unique="0" required="1">
94 The list of ping nodes to count.
96 <shortdesc lang="en">Host list</shortdesc>
97 <content type="string" default=""/>
100 <parameter name="attempts" unique="0">
102 Number of ping attempts, per host, before declaring it dead
104 <shortdesc lang="en">no. of ping attempts</shortdesc>
105 <content type="integer" default="2"/>
108 <parameter name="timeout" unique="0">
110 How long, in seconds, to wait before declaring a ping lost
112 <shortdesc lang="en">ping timeout in seconds</shortdesc>
113 <content type="integer" default="2"/>
116 <parameter name="lctl" unique="0">
118 Option to enable lctl ping. The default is true
120 <shortdesc lang="en">Extra Options</shortdesc>
121 <content type="string" default="true"/>
124 <parameter name="device" unique="0">
126 Device used for the LNET network. We assume the same device accross the cluster
128 <shortdesc lang="en">LNET device</shortdesc>
129 <content type="string" default=""/>
133 <parameter name="options" unique="0">
135 A catch all for any other options that need to be passed to ping.
137 <shortdesc lang="en">Extra Options</shortdesc>
138 <content type="string" default=""/>
141 <parameter name="failure_score" unique="0">
143 Resource is failed if the score is less than failure_score.
146 <shortdesc lang="en">failure_score</shortdesc>
147 <content type="integer" default=""/>
150 <parameter name="debug" unique="0">
152 Enables to use default attrd_updater verbose logging on every call.
154 <shortdesc lang="en">Verbose logging</shortdesc>
155 <content type="string" default="false"/>
161 <action name="start" timeout="300s" />
162 <action name="stop" timeout="300s" />
163 <action name="reload" timeout="300s" />
164 <action name="monitor" depth="0" timeout="300s" interval="20s"/>
165 <action name="meta-data" timeout="5" />
166 <action name="validate-all" timeout="30" />
172 #######################################################################
174 ping_conditional_log() {
176 if [ ${OCF_RESKEY_debug} = "true" ]; then
183 usage: $0 {start|stop|monitor|migrate_to|migrate_from|validate-all|meta-data}
185 Expects to have a fully populated OCF RA-compliant environment set.
191 if [ $? = $OCF_SUCCESS ]; then
194 touch ${OCF_RESKEY_pidfile}
200 rm -f ${OCF_RESKEY_pidfile}
201 attrd_updater -D -n $OCF_RESKEY_name -d $OCF_RESKEY_dampen $attrd_options
206 if [ -f ${OCF_RESKEY_pidfile} ]; then
208 if [ $? -eq 0 ]; then
211 return $OCF_ERR_GENERIC
213 return $OCF_NOT_RUNNING
217 # Is the state directory writable?
218 state_dir=`dirname "$OCF_RESKEY_pidfile"`
219 touch "$state_dir/$$"
221 ocf_log err "Invalid location for 'state': $state_dir is not writable"
226 # The pidfile should be an absolute path
227 case $OCF_RESKEY_pidfile in
229 *) ocf_log warn "You should use an absolute path for pidfile not: $OCF_RESKEY_pidfile" ;;
232 # Check the host list
233 if [ "x" = "x$OCF_RESKEY_host_list" ]; then
234 ocf_log err "Empty host_list. Please specify some nodes to ping"
235 exit $OCF_ERR_CONFIGURED
245 for host in $OCF_RESKEY_host_list; do
248 lctl_out=`$lctl_exe $host $OCF_RESKEY_timeout 2>&1`; rc=$?
250 # ocf_log info "$lctl_exe $host $OCF_RESKEY_timeout"
253 0) active=`expr $active + 1`;;
254 1) ping_conditional_log warn "$host is inactive: $lctl_out";;
255 *) ocf_log err "Unexpected result for '$lctl_exe $host $OCF_RESKEY_timeout' $rc: $p_out";;
267 for host in $OCF_RESKEY_host_list; do
271 Linux) p_args="-n -q -W $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts";;
272 Darwin) p_args="-n -q -t $OCF_RESKEY_timeout -c $OCF_RESKEY_attempts -o";;
273 *) ocf_log err "Unknown host type: `uname`"; exit $OCF_ERR_INSTALLED;;
280 p_out=`$p_exe $p_args $OCF_RESKEY_options $host 2>&1`; rc=$?
283 0) active=`expr $active + 1`;;
284 1) ping_conditional_log warn "$host is inactive: $p_out";;
285 *) ocf_log err "Unexpected result for '$p_exe $p_args $OCF_RESKEY_options $host' $rc: $p_out";;
292 # First, test whether the physical link is up.
293 # If it is not, give up without running any additional tests.
294 # Before that, we need to determine which device is in use on the local host.
296 CARRIER=/sys/class/net/$OCF_RESKEY_device/carrier
297 OPERSTATE=/sys/class/net/$OCF_RESKEY_device/operstate
299 CAR_STAT=$(cat $CARRIER)
300 OPER_STAT=$(cat $OPERSTATE)
303 # ocf_log info "$CAR_STAT - $OPER_STAT"
306 if [ "$CAR_STAT" == "1" ] && [ "$OPER_STAT" == "up" ]; then
307 if [ ${OCF_RESKEY_lctl} = "true" ]; then
319 # ocf_log info "$active"
321 score=`expr $active \* $OCF_RESKEY_multiplier`
322 attrd_updater -n $OCF_RESKEY_name -v $score -d $OCF_RESKEY_dampen $attrd_options
325 0) ping_conditional_log debug "Updated $OCF_RESKEY_name = $score" ;;
326 *) ocf_log warn "Could not update $OCF_RESKEY_name = $score: rc=$rc";;
328 if [ $rc -ne 0 ]; then
331 if [ $score -eq 0 ]; then
332 ocf_log err "LNet connection failed please check"
334 if [ -n "$OCF_RESKEY_failure_score" -a "$score" -lt "$OCF_RESKEY_failure_score" ]; then
335 ocf_log warn "$OCF_RESKEY_name is less than failure_score($OCF_RESKEY_failure_score)"
341 : ${OCF_RESKEY_name:="pingd"}
342 : ${OCF_RESKEY_dampen:="5s"}
343 : ${OCF_RESKEY_attempts:="3"}
344 : ${OCF_RESKEY_multiplier:="1"}
345 : ${OCF_RESKEY_debug:="false"}
346 : ${OCF_RESKEY_lctl:="true"}
347 #: ${OCF_RESKEY_device:="eth1"}
348 : ${OCF_RESKEY_failure_score:="0"}
350 : ${OCF_RESKEY_CRM_meta_timeout:="20000"}
351 : ${OCF_RESKEY_CRM_meta_globally_unique:="true"}
353 integer=`echo ${OCF_RESKEY_timeout} | egrep -o '[0-9]*'`
354 case ${OCF_RESKEY_timeout} in
355 *[0-9]ms|*[0-9]msec) OCF_RESKEY_timeout=`expr $integer / 1000`;;
356 *[0-9]m|*[0-9]min) OCF_RESKEY_timeout=`expr $integer \* 60`;;
357 *[0-9]h|*[0-9]hr) OCF_RESKEY_timeout=`expr $integer \* 60 \* 60`;;
358 *) OCF_RESKEY_timeout=$integer;;
361 if [ -z ${OCF_RESKEY_timeout} ]; then
362 if [ x"$OCF_RESKEY_host_list" != x ]; then
363 host_count=`echo $OCF_RESKEY_host_list | awk '{print NF}'`
364 OCF_RESKEY_timeout=`expr $OCF_RESKEY_CRM_meta_timeout / $host_count / $OCF_RESKEY_attempts`
365 OCF_RESKEY_timeout=`expr $OCF_RESKEY_timeout / 1100` # Convert to seconds and finish 10% early
371 if [ ${OCF_RESKEY_timeout} -lt 1 ]; then
373 elif [ ${OCF_RESKEY_timeout} -gt 1000 ]; then
374 # ping complains if this value is too high; 5 minutes is plenty
375 OCF_RESKEY_timeout=300
378 if [ ${OCF_RESKEY_CRM_meta_globally_unique} = "false" ]; then
379 : ${OCF_RESKEY_pidfile:="$HA_VARRUN/ping-${OCF_RESKEY_name}"}
381 : ${OCF_RESKEY_pidfile:="$HA_VARRUN/ping-${OCF_RESOURCE_INSTANCE}"}
385 if ocf_is_true ${OCF_RESKEY_debug} ; then
389 # Check the debug option
390 case "${OCF_RESKEY_debug}" in
391 true|True|TRUE|1) OCF_RESKEY_debug=true;;
392 false|False|FALSE|0) OCF_RESKEY_debug=false;;
394 ocf_log warn "Value for 'debug' is incorrect. Please specify 'true' or 'false' not: ${OCF_RESKEY_debug}"
395 OCF_RESKEY_debug=false
399 case $__OCF_ACTION in
405 monitor) ping_monitor;;
407 validate-all) ping_validate;;
408 usage|help) ping_usage
412 exit $OCF_ERR_UNIMPLEMENTED