Script for Components Health State Data Source

Fujitsu.Servers.PRIMERGY.Linux.HealthState.DataSource.Script (DataSourceModuleType)

Element properties:

Type: DataSourceModuleType
Isolation: Any
Accessibility: Public
RunAs: Default
OutputType: System.PropertyBagData

Member Modules:

ID Module Type TypeId RunAs 
Scheduler DataSource System.Scheduler Default
InvokeProbe ProbeAction Fujitsu.Servers.PRIMERGY.Linux.PythonScript.Generic.PropertyBag.ProbeAction Default

Overrideable Parameters:

ID - ParameterType - Selector - Display Name - Description
IntervalSeconds - int - $Config/IntervalSeconds$ - Interval Seconds - IntervalSeconds for Components Health State Data Source Script
CIMPort - int - $Config/CIMPort$ - CIM Port - CIM port number passed to script for Components Health State Data Source

Source Code:

<DataSourceModuleType ID="Fujitsu.Servers.PRIMERGY.Linux.HealthState.DataSource.Script" Accessibility="Public" Batching="false">
<Configuration>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" minOccurs="1" name="IntervalSeconds" type="xsd:integer"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" minOccurs="1" name="NetworkName" type="xsd:string"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" minOccurs="1" name="CIMPort" type="xsd:integer"/>
</Configuration>
<OverrideableParameters>
<OverrideableParameter ID="IntervalSeconds" Selector="$Config/IntervalSeconds$" ParameterType="int"/>
<OverrideableParameter ID="CIMPort" Selector="$Config/CIMPort$" ParameterType="int"/>
</OverrideableParameters>
<ModuleImplementation Isolation="Any">
<Composite>
<MemberModules>
<DataSource ID="Scheduler" TypeID="System!System.Scheduler">
<Scheduler>
<SimpleReccuringSchedule>
<Interval>$Config/IntervalSeconds$</Interval>
<SyncTime/>
</SimpleReccuringSchedule>
<ExcludeDates/>
</Scheduler>
</DataSource>
<ProbeAction ID="InvokeProbe" TypeID="Fujitsu.Servers.PRIMERGY.Linux.PythonScript.Generic.PropertyBag.ProbeAction">
<NetworkName>$Config/NetworkName$</NetworkName>
<UserName>$RunAs[Name="Unix!Microsoft.Unix.ActionAccount"]/UserName$</UserName>
<Password>$RunAs[Name="Unix!Microsoft.Unix.ActionAccount"]/Password$</Password>
<PyScriptName>ComponentsMonitor.py</PyScriptName>
<PyScriptBody>
# Fujitsu
# Copyright 2014-2018 FUJITSU LIMITED

import sys, commands
from FTSModule import *

if __name__ == '__main__':
if len(sys.argv) &lt; 7: exit(1)
if fts_version() != "__MP_VERSION__":
print "FTSModule version mismatch. Wait for next discovery run."
exit(1)

use_compression = True
raid_present = False
health_list = None
components_health = None # worst status of all components
username = ""
password = ""
network_name = "localhost"
cim_port = 5989
check_component = ALL_COMPONENTS
snmp_community = DEF_COMMSTRING
cim_target = "localhost"
raid_info = { 'daemon': False, 'avail': False, 'version': NOT_AVAIL, 'protocol': 'SNMP', 'controllers': None }

# Functions
# ---------------------------------------------------------------

def main():
global health_list, components_health, log, scom, cim, snmp

setup_argv()

logname = 'monitoring.log' if check_component.lower() == 'all' else 'monitoring_' + check_component.lower() + '.log'
log = Logger(username, logname, MONITORING_SCRIPT)
scom = SCOMMonitor(log, use_compression)

check_sv_agent(scom)

cim = CIMClient(scom, username, password, cim_target, cim_port)
snmp = SNMPClient(log, username, password, snmp_community, SNMP_HOST, True)

get_raid_info()

log.debug('Getting monitoring data')
health_list = cim.enumerate_instances("SVS_PGYHealthStateComponent")


snmp.cache_devices(check_component)

# Monitoring of the components
for i in range(15): # Enumerate through all components
if check_component == ALL_COMPONENTS or check_component == components_name[i]:
if i in [RAIDSUBSYSTEM, RAIDCONTROLLER, PHYSICALDISK, LOGICALDRIVE] \
and (raid_info['controllers'] == None or len(raid_info['controllers']) == 0):
continue
if i in [STORAGE, NETWORK, RAIDSUBSYSTEM, RAIDCONTROLLER, PHYSICALDISK, LOGICALDRIVE]:
status = snmp_components_health_status(i)
else:
status = cim_components_health_status(i)
# Save worst state for Other status
if status != STATUS_OK:
if status == STATUS_ERROR:
components_health = status
else:
if components_health != STATUS_ERROR:
components_health = status

scom.output_objects()

def setup_argv():
global username, password, network_name, cim_port, check_component, snmp_community, use_compression, cim_target
username = sys.argv[1]
password = sys.argv[2]
network_name = sys.argv[3]
cim_port = sys.argv[4]
check_component = sys.argv[5]
snmp_community = sys.argv[6]
if len(sys.argv) &gt; 7: use_compression = bool(int(sys.argv[7]))
cim_target = commands.getoutput('hostname -f 2&gt;/dev/null')
if cim_target == '': cim_target = network_name

def cim_components_health_status(subsystem_number):
component_name, component_status, failure_reasons = [], [], []
component = components_name[subsystem_number]
subsystem_name = cim_health_subsystem[subsystem_number]
overall_health = STATUS_OK
if subsystem_number == MGMTCTRL:
return get_management_health()
if subsystem_number == POWERCONSUMP:
return get_power_consumption_heatlh()
if subsystem_number == PYOVERALLSTATE:
for instance in health_list:
if str(instance['elementname']) == subsystem_name:
component_name.append('ServerView Health State')
if components_health != None:
tmpstatus = get_health(instance['healthstate'])
if tmpstatus == components_health:
component_status.append(STATUS_OK) # One of components has the same error status, user need to solve that error first
failure_reasons.append("")
else:
component_status.append(tmpstatus)
log.debug("Trying to get reason for overall status %s..." % tmpstatus)
reason = get_other_failure_reason(health_list)
log.debug("Reason: %s." % reason)
failure_reasons.append(reason)
else:
component_status.append(get_health(instance['healthstate']))
component_name.append('ServerView Agents Version')
component_status.append(STATUS_OK) # Assume OK
if 'BIOS Selftest' in str(instance['elementname']):
component_name.append('BIOS Selftest')
component_status.append(get_health(instance['healthstate']))
else:
for instance in health_list:
if str(instance['elementname']) == subsystem_name:
overall_health = get_health(instance['healthstate'])
instance_id = instance['instanceid']
for subinstance in health_list:
if instance_id+"-" in subinstance['instanceid'] and int(subinstance['presence']) != 2:
component_name.append(scom.get_component_name(subsystem_number, \
str(subinstance['elementname']), str(subinstance['associationkey']), str(subinstance['instanceid'])))
status = get_health(subinstance['healthstate'])
if status == STATUS_OK:
failure_reasons.append("")
else:
log.debug("Trying to get reason for status %s..." % status)
reason = get_failure_reason(cim, str(subinstance['elementname']), subsystem_number)
log.debug("Reason: %s." % reason)
failure_reasons.append(reason)
if int(subinstance['presence']) == 3 and status != STATUS_ERROR:
status = STATUS_DEGRADED
failure_reasons.append("Presence is set to 'not manageable'")
if int(subinstance['presence']) != 1 and int(subinstance['presence']) != 3:
scom.communication_errors.append("Invalid presence value (%d) for '%s'" % \
(int(subinstance['presence']), str(subinstance['elementname'])))
component_status.append(status)
return scom.append_status(component, component_name, component_status, subsystem_number, failure_reasons)

def snmp_components_health_status(subsystem_number):
try:
if subsystem_number == STORAGE:
return get_storage_health()
if subsystem_number == NETWORK:
return get_network_health()
if subsystem_number == RAIDSUBSYSTEM:
return get_overall_raid_health()
if subsystem_number == RAIDCONTROLLER:
return get_raid_health()
if subsystem_number == PHYSICALDISK:
return get_raid_phys_dev_health()
if subsystem_number == LOGICALDRIVE:
return get_raid_log_drv_health()
except:
log.scom_alert(SCOM_INFO, '260 Error gathering %s component info from SNMP: %s' % (cim_health_subsystem[subsystem_number], sys.exc_info()))
return STATUS_UNKNOWN

def get_power_consumption_heatlh():
component_name, component_status = [], []
for instance in health_list:
if str(instance['elementname']) == cim_health_subsystem[POWERCONSUMP]:
component_name.append(components_name[POWERCONSUMP])
component_status.append(get_health(instance['healthstate']))
return scom.append_status(components_name[POWERCONSUMP], component_name, component_status, POWERCONSUMP)

def get_management_health():
component_name, component_status = [], []
instance_list = cim.enumerate_instances("SVS_PGYManagementController")
for instance in instance_list:
component_name.append(str(instance['elementname']))
component_status.append(get_health(instance['healthstate']))
return scom.append_status(components_name[MGMTCTRL], component_name, component_status, MGMTCTRL)

def get_storage_health():
component_name, component_status = [], []
# if (not snmp.enabled) or (raid_present): return STATUS_UNKNOWN # we should monitor both Storage and RAID subsystems
if not snmp.enabled: return STATUS_UNKNOWN
for idx in snmp.dev_idxs: # cached
if "DiskStorage" in snmp.get_var(SNMP_DEVICE_TYPE, idx):
descr = snmp.get_var(SNMP_DEVICE_DESCR, idx)
status = SNMP_StorState[snmp.get_var_i(SNMP_DEVICE_STATUS, idx)]
component_name.append(descr)
component_status.append(status)
return scom.append_status(components_name[STORAGE], component_name, component_status, STORAGE)

def get_network_health():
if not snmp.enabled: return STATUS_UNKNOWN
component_name, component_status = [], []
idxs = snmp.get_idx(SNMP_NET_IF_INDEX, SNMP_NET_IF_INDEX_LEN)
for idx in idxs:
if 'ethernet' in snmp.get_var(SNMP_NET_IF_TYPE, idx):
descr = snmp.get_var(SNMP_NET_IF_DESCR, idx)
admstat = int(snmp.get_var(SNMP_NET_IF_ADMIN_STATUS, idx).split('(')[1][:-1])
oprstat = int(snmp.get_var(SNMP_NET_IF_OPER_STATUS, idx).split('(')[1][:-1])
status = STATUS_OK # we assume everything is ok
if oprstat == 4 or oprstat == 6: # ifOperStatus=unknown or notPresent
status = STATUS_DEGRADED
if admstat == 2 and oprstat == 1: # ifAdminStatus=down and ifOperStatus=up then something is wrong
status = STATUS_ERROR
# status from device
for d_idx in snmp.dev_idxs: # cached
if "Network" in snmp.get_var(SNMP_DEVICE_TYPE, d_idx):
if descr in snmp.get_var(SNMP_DEVICE_DESCR, d_idx):
dstatus = SNMP_NetState[snmp.get_var_i(SNMP_DEVICE_STATUS, d_idx)]
status = int_to_status[max(status_to_int[status], status_to_int[dstatus])]
component_name.append(descr)
component_status.append(status)
return scom.append_status(components_name[NETWORK], component_name, component_status, NETWORK)

def get_raid_info():
global raid_info
log.debug('Getting RAID info')
if check_amDaemon():
raid_info['daemon'] = True
cimControllers = cim.enumerate_instances("SVS_PGYHostRaidController", True)
if cimControllers == None and snmp.enabled:
raid_info['version'] = snmp.get_var(SNMP_SV_RAID_AGENT_VER)
raid_info['avail'] = True
raid_info['controllers'] = snmp.get_idx(SNMP_CONTROL_CTRLNR, SNMP_CONTROL_INDEX_LEN)
elif cimControllers != None:
software = cim.enumerate_instance_names("SVS_PGYSoftwareIdentity")
for instance in software:
if 'RAID Manager' in instance['InstanceID'][1]:
inst = cim.get_instance(instance)
raid_info['version'] = inst[0]['versionstring']
break
raid_info['avail'] = True
raid_info['protocol'] = 'CIM'
raid_info['controllers'] = cimControllers
log.debug(" gathered RAID info: daemon %s, avail %s, version %s, protocol %s" % (raid_info['daemon'], raid_info['avail'], raid_info['version'], raid_info['protocol']))

def get_raid_version():
if not raid_info['avail']: return STATUS_UNKNOWN
component_name, component_status = [], []
svstatus = STATUS_OK
if raid_info['version'] &lt; MIN_RD_VER:
svstatus = STATUS_ERROR
if raid_info['version'] &lt; WAR_RD_VER:
svstatus = STATUS_DEGRADED
component_name.append("ServerView RAID Version")
component_status.append(svstatus)
return scom.append_status("ServerView RAID Version", component_name, component_status, -1)

def get_overall_raid_health():
if not raid_info['avail']: return STATUS_UNKNOWN
component_name, component_status = [], []
if raid_info['protocol'] == 'CIM':
svstatus = STATUS_UNKNOWN
for item in health_list:
if item['elementname'] != None and 'ServerView RAID System' in item['elementname']:
svstatus = get_health(item['healthstate'])
else:
svstatus = SNMP_CompState[snmp.get_var_i(SNMP_SV_RAID_STATUS)]
component_name.append(components_name[RAIDSUBSYSTEM])
component_status.append(svstatus)
return scom.append_status(components_name[RAIDSUBSYSTEM], component_name, component_status, RAIDSUBSYSTEM)

def get_raid_health():
def get_cim():
if raid_info['controllers'] != None:
component_name, component_status = [], []
for controller in raid_info['controllers']:
component_name.append(controller['elementname'])
component_status.append(get_health(controller['healthstate']))
if len(raid_info['controllers']) &gt; 0:
return scom.append_status(components_name[RAIDCONTROLLER], component_name, component_status, RAIDCONTROLLER)
return STATUS_UNKNOWN
def get_snmp():
component_name, component_status, index = [], [], 1
if raid_info['controllers'] == None or len(raid_info['controllers']) == 0:
return STATUS_UNKNOWN
idxs = raid_info['controllers']
svstatus = STATUS_OK
if len(idxs) &gt; 0:
svstatus = SNMP_CompState[snmp.get_var_i(SNMP_CONTROL_SV_STATUS)]
for idx in idxs:
driver = snmp.get_var(SNMP_CONTROL_DRIVER_NAME, idx)
if driver.lower() != 'md':
descr = snmp.get_var(SNMP_CONTROL_DESCR, idx)
status = STATUS_OK
if svstatus != STATUS_OK:
status = SNMP_CompState[snmp.get_var_i(SNMP_CONTROL_STATUS, idx)]
component_name.append(descr)
component_status.append(status)
index += 1
if index &gt; 1:
return scom.append_status(components_name[RAIDCONTROLLER], component_name, component_status, RAIDCONTROLLER)
return STATUS_UNKNOWN
if not raid_info['avail']: return STATUS_UNKNOWN
if get_raid_version() == STATUS_ERROR: return STATUS_UNKNOWN # checks if we can monitor raid
if raid_info['protocol'] == 'SNMP':
return get_snmp()
else:
return get_cim()

def get_raid_drive_health(cimclass, component_no):
if not raid_info['avail']: return STATUS_UNKNOWN
drives = cim.enumerate_instances(cimclass)
if len(drives) &gt; 0:
component_name, component_status = [], []
for drive in drives:
component_name.append("%s (%s)" % (drive['elementname'], drive['deviceid']))
component_status.append(get_health(drive['healthstate']))
return scom.append_status(components_name[component_no], component_name, component_status, component_no)
return STATUS_UNKNOWN

def get_raid_phys_dev_health():
def get_snmp():
component_name, component_status, index = [], [], 0
idxs = snmp.get_idx(SNMP_PHYS_DEV_CTRLNR, SNMP_PHYS_DEV_INDEX_LEN)
svstatus = STATUS_OK
if len(idxs) &gt; 0:
svstatus = SNMP_CompState[snmp.get_var_i(SNMP_PHYS_DEV_SV_STATUS)]
for idx in idxs:
status = STATUS_OK
descr = add_controller_number(snmp.get_var(SNMP_PHYS_DEV_CTRLNR, idx), snmp.get_var(SNMP_PHYS_DEV_DESCR, idx))
if svstatus != STATUS_OK:
pdstatus = SNMP_DevState[snmp.get_var_i(SNMP_PHYS_DEV_STATUS, idx)]
if status_to_int[pdstatus] &gt; status_to_int[STATUS_OK]:
status = svstatus
component_name.append(descr)
component_status.append(status)
index += 1
if index &gt; 0:
return scom.append_status(components_name[PHYSICALDISK], component_name, component_status, PHYSICALDISK)
return STATUS_UNKNOWN
if not raid_info['avail']: return STATUS_UNKNOWN
if raid_info['protocol'] == 'SNMP':
return get_snmp()
else:
return get_raid_drive_health("SVS_PGYDiskDrive", PHYSICALDISK)

def get_raid_log_drv_health():
def get_snmp():
component_name, component_status, index = [], [], 0
idxs = snmp.get_idx(SNMP_LOG_DRV_CTRLNR, SNMP_LOG_DRV_INDEX_LEN)
svstatus = STATUS_OK
if len(idxs) &gt; 0:
svstatus = SNMP_CompState[snmp.get_var_i(SNMP_LOG_DRV_SV_STATUS)]
for idx in idxs:
descr = add_controller_number(snmp.get_var(SNMP_LOG_DRV_CTRLNR, idx), snmp.get_var(SNMP_LOG_DRV_NAME, idx))
status = STATUS_OK
if svstatus != STATUS_OK:
ldstatus = SNMP_DrvState[snmp.get_var_i(SNMP_LOG_DRV_STATUS, idx)]
if status_to_int[ldstatus] &gt; status_to_int[STATUS_OK]:
status = svstatus
component_name.append(descr)
component_status.append(status)
index += 1
if index &gt; 0:
return scom.append_status(components_name[LOGICALDRIVE], component_name, component_status, LOGICALDRIVE)
return STATUS_UNKNOWN
if not raid_info['avail']: return STATUS_UNKNOWN
if raid_info['protocol'] == 'SNMP':
return get_snmp()
else:
return get_raid_drive_health("SVS_PGYStoragePoolCompositeExtent", LOGICALDRIVE)

# ---------------------------------------------------------------

if __name__ == '__main__':
main()
</PyScriptBody>
<!-- We always check ALL components because of cookdown feature -->
<ScriptArguments>'$RunAs[Name="Unix!Microsoft.Unix.ActionAccount"]/Password$' "$Config/NetworkName$" $Config/CIMPort$ "$RunAs[Name="Fujitsu.Servers.PRIMERGY.Linux.SNMPAccount"]/CommunityString$"</ScriptArguments>
<ArgumentsMap>"`whoami`" "$1" "$2" $3 "ALL" "$4"</ArgumentsMap>
<Timeout>300</Timeout>
<Context>ComponentsMonitor;HostsForMonitoring</Context>
<!-- Format: 'Section;TagHosts' -->
</ProbeAction>
</MemberModules>
<Composition>
<Node ID="InvokeProbe">
<Node ID="Scheduler"/>
</Node>
</Composition>
</Composite>
</ModuleImplementation>
<OutputType>System!System.PropertyBagData</OutputType>
</DataSourceModuleType>