Linux Cluster Node Status

Linux.ClusterMonitoring.NodeStatus.Monitor (UnitMonitor)

Monitor that alerts if a Cluster node does not report an Online status.

Element properties:

Target: Linux.ClusterMonitoring.Node
Parent Monitor: System.Health.AvailabilityState
Category: AvailabilityHealth
Enabled: True
Alert Generate: True
Alert Severity: MatchMonitorHealth
Alert Priority: Normal
Alert Auto Resolve: True
Monitor Type: Unix.Authoring.ShellScript.MatchesRegExp.TwoState.MonitorType
Remotable: True
Accessibility: Internal
Alert Message:
Linux Cluster Node Status
A cluster node of the cluster: {2} is not Online. The output is: {1}
RunAs: Default

Source Code:

<!--
  Two-state availability monitor for Linux cluster nodes.
  It runs the shell script below on the target's host and turns unhealthy
  (Error) when the script output matches the ErrorRegExp, i.e. starts with
  "ERROR". An alert is raised on the Error state and auto-resolved afterwards.
-->
<UnitMonitor ID="Linux.ClusterMonitoring.NodeStatus.Monitor"
             Accessibility="Internal"
             Enabled="true"
             Target="Linux.ClusterMonitoring.Node"
             ParentMonitorID="Health!System.Health.AvailabilityState"
             Remotable="true"
             Priority="Normal"
             TypeID="UnixAuth!Unix.Authoring.ShellScript.MatchesRegExp.TwoState.MonitorType"
             ConfirmDelivery="false">
  <Category>AvailabilityHealth</Category>
  <AlertSettings AlertMessage="Linux.ClusterMonitoring.NodeStatus.Monitor.AlertMessage">
    <AlertOnState>Error</AlertOnState>
    <AutoResolve>true</AutoResolve>
    <AlertPriority>Normal</AlertPriority>
    <AlertSeverity>MatchMonitorHealth</AlertSeverity>
    <!--
      Values offered to the alert message: host network name, script
      standard output, and cluster name. NOTE(review): the message text uses
      {1} for the output and {2} for the cluster name, which matches these
      parameters if placeholders are zero-based; confirm against the
      alert message string resource.
    -->
    <AlertParameters>
      <!-- Kept on one line so no surrounding whitespace ends up in the value. -->
      <AlertParameter1>$Target/Host/Property[Type="Unix!Microsoft.Unix.Computer"]/NetworkName$</AlertParameter1>
      <AlertParameter2>$Data/Context///*[local-name()="StdOut"]$</AlertParameter2>
      <AlertParameter3>$Target/Property[Type="Linux.ClusterMonitoring.Node"]/ClusterName$</AlertParameter3>
    </AlertParameters>
  </AlertSettings>
  <!-- Maps the two states of the monitor type onto health states. -->
  <OperationalStates>
    <OperationalState ID="StatusOK" MonitorTypeStateID="StatusOK" HealthState="Success"/>
    <OperationalState ID="StatusError" MonitorTypeStateID="StatusError" HealthState="Error"/>
  </OperationalStates>
  <Configuration>
    <!-- Polling interval and (below) script timeout; presumably seconds, confirm against the monitor type. -->
    <Interval>300</Interval>
    <!-- Kept on one line so no surrounding whitespace ends up in the host name. -->
    <TargetSystem>$Target/Host/Property[Type="Unix!Microsoft.Unix.Computer"]/NetworkName$</TargetSystem>
    <ShellScript>

#!/bin/bash
# Report the state of the cluster members listed by clustat.
# The first output line is "ERROR:" when at least one member row contains
# "Offline" or when the member table cannot be located, otherwise "OK:".
# The member rows (or the raw clustat output on failure) follow.

cluster_status=$(clustat)

# Line numbers of the " Member Name" and " Service" table headers.
# printf '%s\n' is used so the clustat text is never treated as a format string.
member_header_line=$(printf '%s\n' "${cluster_status}" | grep -n ' Member Name' | head -n 1 | cut -f1 -d:)
service_header_line=$(printf '%s\n' "${cluster_status}" | grep -n ' Service' | head -n 1 | cut -f1 -d:)

if [ -z "${member_header_line}" ]
then
    # clustat failed or printed something unexpected: the node state is
    # unknown, so it must not be reported as healthy.
    printf 'ERROR:\n Unable to find the member table in the clustat output: %s\n' "${cluster_status}"
else
    # Member rows start two lines below the header (header + separator line).
    first_row=$((member_header_line + 2))

    if [ -n "${service_header_line}" ]
    then
        # Member rows end two lines above the service header (blank line + header).
        last_row=$((service_header_line - 2))
        node_rows=$(printf '%s\n' "${cluster_status}" | sed -n "${first_row},${last_row}p")
    else
        # No service table in the output: the member rows run to the end.
        node_rows=$(printf '%s\n' "${cluster_status}" | tail -n "+${first_row}")
    fi

    # grep -c prints 0 when nothing matches, so the count is always numeric.
    offline_count=$(printf '%s\n' "${node_rows}" | grep -c 'Offline')

    if [ "${offline_count}" -ge 1 ]
    then
        printf 'ERROR:\n %s\n' "${node_rows}"
    else
        printf 'OK:\n %s\n' "${node_rows}"
    fi
fi

</ShellScript>
    <ScriptArguments/>
    <Timeout>60</Timeout>
    <!-- Credentials come from the Linux.ClusterMonitoring.Account Run As profile. -->
    <UserName>$RunAs[Name="Linux.ClusterMonitoring.Account"]/UserName$</UserName>
    <Password>$RunAs[Name="Linux.ClusterMonitoring.Account"]/Password$</Password>
    <!-- The monitor goes to StatusError when the script output starts with "ERROR". -->
    <ErrorRegExp>^ERROR</ErrorRegExp>
  </Configuration>
</UnitMonitor>