Script for Components Health State Data Source

Fujitsu.Servers.PRIMERGY.Linux.HealthState.DataSource.Script (DataSourceModuleType)

Element properties:

Type: DataSourceModuleType
Isolation: Any
Accessibility: Public
RunAs: Default
OutputType: System.PropertyBagData

Member Modules:

ID Module Type TypeId RunAs 
Scheduler DataSource System.Scheduler Default
InvokeProbe ProbeAction Fujitsu.Servers.PRIMERGY.Linux.PythonScript.Generic.PropertyBag.ProbeAction Default

Overrideable Parameters:

ID - ParameterType - Selector - Display Name - Description
IntervalSeconds - int - $Config/IntervalSeconds$ - Interval Seconds - IntervalSeconds for Components Health State Data Source Script
CIMPort - int - $Config/CIMPort$ - CIM Port - CIM port number passed to script for Components Health State Data Source

Source Code:

<DataSourceModuleType ID="Fujitsu.Servers.PRIMERGY.Linux.HealthState.DataSource.Script" Accessibility="Public" Batching="false">
<Configuration>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" minOccurs="1" name="IntervalSeconds" type="xsd:integer"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" minOccurs="1" name="NetworkName" type="xsd:string"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" minOccurs="1" name="CIMPort" type="xsd:integer"/>
</Configuration>
<OverrideableParameters>
<OverrideableParameter ID="IntervalSeconds" Selector="$Config/IntervalSeconds$" ParameterType="int"/>
<OverrideableParameter ID="CIMPort" Selector="$Config/CIMPort$" ParameterType="int"/>
</OverrideableParameters>
<ModuleImplementation Isolation="Any">
<Composite>
<MemberModules>
<DataSource ID="Scheduler" TypeID="System!System.Scheduler">
<Scheduler>
<SimpleReccuringSchedule>
<Interval>$Config/IntervalSeconds$</Interval>
<SyncTime/>
</SimpleReccuringSchedule>
<ExcludeDates/>
</Scheduler>
</DataSource>
<ProbeAction ID="InvokeProbe" TypeID="Fujitsu.Servers.PRIMERGY.Linux.PythonScript.Generic.PropertyBag.ProbeAction">
<NetworkName>$Config/NetworkName$</NetworkName>
<UserName>$RunAs[Name="Unix!Microsoft.Unix.ActionAccount"]/UserName$</UserName>
<Password>$RunAs[Name="Unix!Microsoft.Unix.ActionAccount"]/Password$</Password>
<PyScriptName>ComponentsMonitor.py</PyScriptName>
<PyScriptBody>
# Fujitsu
# Copyright 2014-2018 FUJITSU LIMITED

import sys, commands
from FTSModule import *

if __name__ == '__main__':
if len(sys.argv) &lt; 7: exit(1)
if fts_version() != "__MP_VERSION__":
print "FTSModule version mismatch. Wait for next discovery run."
exit(1)

use_compression = True
raid_present = False
health_list = None
components_health = None # worst status of all components
username = ""
password = ""
network_name = "localhost"
cim_port = 5989
check_component = ALL_COMPONENTS
snmp_community = DEF_COMMSTRING
cim_target = "localhost"
raid_info = { 'daemon': False, 'avail': False, 'version': NOT_AVAIL, 'protocol': 'SNMP', 'controllers': None }

# Functions
# ---------------------------------------------------------------

def main():
global health_list, components_health, log, scom, cim, snmp

setup_argv()

logname = 'monitoring.log' if check_component.lower() == 'all' else 'monitoring_' + check_component.lower() + '.log'
log = Logger(username, logname, MONITORING_SCRIPT)
scom = SCOMMonitor(log, use_compression)

check_sv_agent(scom)

cim = CIMClient(scom, username, password, cim_target, cim_port)
snmp = SNMPClient(log, username, password, snmp_community, SNMP_HOST, True)

get_raid_info()

log.debug('Getting monitoring data')
health_list = cim.enumerate_instances("SVS_PGYHealthStateComponent")


snmp.cache_devices(check_component)

# Monitoring of the components
for i in range(15): # Enumerate through all components
if check_component == ALL_COMPONENTS or check_component == components_name[i]:
if i in [RAIDSUBSYSTEM, RAIDCONTROLLER, PHYSICALDISK, LOGICALDRIVE] \
and (raid_info['controllers'] == None or len(raid_info['controllers']) == 0):
continue
if i in [STORAGE, NETWORK, RAIDSUBSYSTEM, RAIDCONTROLLER, PHYSICALDISK, LOGICALDRIVE]:
status = snmp_components_health_status(i)
else:
status = cim_components_health_status(i)
# Save worst state for Other status
if status != STATUS_OK:
if status == STATUS_ERROR:
components_health = status
else:
if components_health != STATUS_ERROR:
components_health = status

scom.output_objects()

def setup_argv():
global username, password, network_name, cim_port, check_component, snmp_community, use_compression, cim_target
username = sys.argv[1]
password = sys.argv[2]
network_name = sys.argv[3]
cim_port = sys.argv[4]
check_component = sys.argv[5]
snmp_community = sys.argv[6]
if len(sys.argv) &gt; 7: use_compression = bool(int(sys.argv[7]))
cim_target = commands.getoutput('hostname -f 2&gt;/dev/null')
if cim_target == '': cim_target = network_name

def cim_components_health_status(subsystem_number):
component_name, component_status, failure_reasons = [], [], []
component = components_name[subsystem_number]
subsystem_name = cim_health_subsystem[subsystem_number]
overall_health = STATUS_OK
if subsystem_number == MGMTCTRL:
return get_management_health()
if subsystem_number == POWERCONSUMP:
return get_power_consumption_heatlh()
if subsystem_number == PYOVERALLSTATE:
for instance in health_list:
if str(instance['elementname']) == subsystem_name:
component_name.append('ServerView Health State')
if components_health != None:
tmpstatus = get_health(instance['healthstate'])
if tmpstatus == components_health:
component_status.append(STATUS_OK) # One of components has the same error status, user need to solve that error first
failure_reasons.append("")
else:
component_status.append(tmpstatus)
log.debug("Trying to get reason for overall status %s..." % tmpstatus)
reason = get_other_failure_reason(health_list)
log.debug("Reason: %s." % reason)
failure_reasons.append(reason)
else:
component_status.append(get_health(instance['healthstate']))
component_name.append('ServerView Agents Version')
component_status.append(STATUS_OK) # Assume OK
if 'BIOS Selftest' in str(instance['elementname']):
component_name.append('BIOS Selftest')
component_status.append(get_health(instance['healthstate']))
else:
for instance in health_list:
if str(instance['elementname']) == subsystem_name:
overall_health = get_health(instance['healthstate'])
instance_id = instance['instanceid']
for subinstance in health_list:
if instance_id+"-" in subinstance['instanceid'] and int(subinstance['presence']) != 2:
component_name.append(scom.get_component_name(subsystem_number, \
str(subinstance['elementname']), str(subinstance['associationkey']), str(subinstance['instanceid'])))
status = get_health(subinstance['healthstate'])
if status == STATUS_OK:
failure_reasons.append("")
else:
log.debug("Trying to get reason for status %s..." % status)
reason = get_failure_reason(cim, str(subinstance['elementname']), subsystem_number)
log.debug("Reason: %s." % reason)
failure_reasons.append(reason)
if int(subinstance['presence']) == 3 and status != STATUS_ERROR:
status = STATUS_DEGRADED
failure_reasons.append("Presence is set to 'not manageable'")
if int(subinstance['presence']) != 1 and int(subinstance['presence']) != 3:
scom.communication_errors.append("Invalid presence value (%d) for '%s'" % \
(int(subinstance['presence']), str(subinstance['elementname'])))
component_status.append(status)
return scom.append_status(component, component_name, component_status, subsystem_number, failure_reasons)

def snmp_components_health_status(subsystem_number):
try:
if subsystem_number == STORAGE:
return get_storage_health()
if subsystem_number == NETWORK:
return get_network_health()
if subsystem_number == RAIDSUBSYSTEM:
return get_overall_raid_health()
if subsystem_number == RAIDCONTROLLER:
return get_raid_health()
if subsystem_number == PHYSICALDISK:
return get_raid_phys_dev_health()
if subsystem_number == LOGICALDRIVE:
return get_raid_log_drv_health()
except:
log.scom_alert(SCOM_INFO, '260 Error gathering %s component info from SNMP: %s' % (cim_health_subsystem[subsystem_number], sys.exc_info()))
return STATUS_UNKNOWN

def get_power_consumption_heatlh():
component_name, component_status = [], []
for instance in health_list:
if str(instance['elementname']) == cim_health_subsystem[POWERCONSUMP]:
component_name.append(components_name[POWERCONSUMP])
component_status.append(get_health(instance['healthstate']))
return scom.append_status(components_name[POWERCONSUMP], component_name, component_status, POWERCONSUMP)

def get_management_health():
component_name, component_status = [], []
instance_list = cim.enumerate_instances("SVS_PGYManagementController")
for instance in instance_list:
component_name.append(str(instance['elementname']))
component_status.append(get_health(instance['healthstate']))
return scom.append_status(components_name[MGMTCTRL], component_name, component_status, MGMTCTRL)

def get_storage_health():
component_name, component_status = [], []
# if (not snmp.enabled) or (raid_present): return STATUS_UNKNOWN # we should monitor both Storage and RAID subsystems
if not snmp.enabled: return STATUS_UNKNOWN
for idx in snmp.dev_idxs: # cached
if "DiskStorage" in snmp.get_var(SNMP_DEVICE_TYPE, idx):
descr = snmp.get_var(SNMP_DEVICE_DESCR, idx)
status = SNMP_StorState[snmp.get_var_i(SNMP_DEVICE_STATUS, idx)]
component_name.append(descr)
component_status.append(status)
return scom.append_status(components_name[STORAGE], component_name, component_status, STORAGE)

def get_network_health():
if not snmp.enabled: return STATUS_UNKNOWN
component_name, component_status = [], []
idxs = snmp.get_idx(SNMP_NET_IF_INDEX, SNMP_NET_IF_INDEX_LEN)
for idx in idxs:
if 'ethernet' in snmp.get_var(SNMP_NET_IF_TYPE, idx):
descr = snmp.get_var(SNMP_NET_IF_DESCR, idx)
admstat = int(snmp.get_var(SNMP_NET_IF_ADMIN_STATUS, idx).split('(')[1][:-1])
oprstat = int(snmp.get_var(SNMP_NET_IF_OPER_STATUS, idx).split('(')[1][:-1])
status = STATUS_OK # we assume everything is ok
if oprstat == 4 or oprstat == 6: # ifOperStatus=unknown or notPresent
status = STATUS_DEGRADED
if admstat == 2 and oprstat == 1: # ifAdminStatus=down and ifOperStatus=up then something is wrong
status = STATUS_ERROR
# status from device
for d_idx in snmp.dev_idxs: # cached
if "Network" in snmp.get_var(SNMP_DEVICE_TYPE, d_idx):
if descr in snmp.get_var(SNMP_DEVICE_DESCR, d_idx):
dstatus = SNMP_NetState[snmp.get_var_i(SNMP_DEVICE_STATUS, d_idx)]
status = int_to_status[max(status_to_int[status], status_to_int[dstatus])]
component_name.append(descr)
component_status.append(status)
return scom.append_status(components_name[NETWORK], component_name, component_status, NETWORK)

def get_raid_info():
global raid_info
log.debug('Getting RAID info')
if check_amDaemon():
raid_info['daemon'] = True
cimControllers = cim.enumerate_instances("SVS_PGYHostRaidController", True)
if cimControllers == None and snmp.enabled:
raid_info['version'] = snmp.get_var(SNMP_SV_RAID_AGENT_VER)
raid_info['avail'] = True
raid_info['controllers'] = snmp.get_idx(SNMP_CONTROL_CTRLNR, SNMP_CONTROL_INDEX_LEN)
elif cimControllers != None:
software = cim.enumerate_instance_names("SVS_PGYSoftwareIdentity")
for instance in software:
if 'RAID Manager' in instance['InstanceID'][1]:
inst = cim.get_instance(instance)
raid_info['version'] = inst[0]['versionstring']
break
raid_info['avail'] = True
raid_info['protocol'] = 'CIM'
raid_info['controllers'] = cimControllers
log.debug(" gathered RAID info: daemon %s, avail %s, version %s, protocol %s" % (raid_info['daemon'], raid_info['avail'], raid_info['version'], raid_info['protocol']))

def get_raid_version():
if not raid_info['avail']: return STATUS_UNKNOWN
component_name, component_status = [], []
svstatus = STATUS_OK
if raid_info['version'] &lt; MIN_RD_VER:
svstatus = STATUS_ERROR
if raid_info['version'] &lt; WAR_RD_VER:
svstatus = STATUS_DEGRADED
component_name.append("ServerView RAID Version")
component_status.append(svstatus)
return scom.append_status("ServerView RAID Version", component_name, component_status, -1)

def get_overall_raid_health():
if not raid_info['avail']: return STATUS_UNKNOWN
component_name, component_status = [], []
if raid_info['protocol'] == 'CIM':
svstatus = STATUS_UNKNOWN
for item in health_list:
if item['elementname'] != None and 'ServerView RAID System' in item['elementname']:
svstatus = get_health(item['healthstate'])
else:
svstatus = SNMP_CompState[snmp.get_var_i(SNMP_SV_RAID_STATUS)]
component_name.append(components_name[RAIDSUBSYSTEM])
component_status.append(svstatus)
return scom.append_status(components_name[RAIDSUBSYSTEM], component_name, component_status, RAIDSUBSYSTEM)

def get_raid_health():
def get_cim():
if raid_info['controllers'] != None:
component_name, component_status = [], []
for controller in raid_info['controllers']:
component_name.append(controller['elementname'])
component_status.append(get_health(controller['healthstate']))
if len(raid_info['controllers']) &gt; 0:
return scom.append_status(components_name[RAIDCONTROLLER], component_name, component_status, RAIDCONTROLLER)
return STATUS_UNKNOWN
def get_snmp():
component_name, component_status, index = [], [], 1
if raid_info['controllers'] == None or len(raid_info['controllers']) == 0:
return STATUS_UNKNOWN
idxs = raid_info['controllers']
svstatus = STATUS_OK
if len(idxs) &gt; 0:
svstatus = SNMP_CompState[snmp.get_var_i(SNMP_CONTROL_SV_STATUS)]
for idx in idxs:
driver = snmp.get_var(SNMP_CONTROL_DRIVER_NAME, idx)
if driver.lower() != 'md':
descr = snmp.get_var(SNMP_CONTROL_DESCR, idx)
status = STATUS_OK
if svstatus != STATUS_OK:
status = SNMP_CompState[snmp.get_var_i(SNMP_CONTROL_STATUS, idx)]
component_name.append(descr)
component_status.append(status)
index += 1
if index &gt; 1:
return scom.append_status(components_name[RAIDCONTROLLER], component_name, component_status, RAIDCONTROLLER)
return STATUS_UNKNOWN
if not raid_info['avail']: return STATUS_UNKNOWN
if get_raid_version() == STATUS_ERROR: return STATUS_UNKNOWN # checks if we can monitor raid
if raid_info['protocol'] == 'SNMP':
return get_snmp()
else:
return get_cim()

def get_raid_drive_health(cimclass, component_no):
if not raid_info['avail']: return STATUS_UNKNOWN
drives = cim.enumerate_instances(cimclass)
if len(drives) &gt; 0:
component_name, component_status = [], []
for drive in drives:
component_name.append("%s (%s)" % (drive['elementname'], drive['deviceid']))
component_status.append(get_health(drive['healthstate']))
return scom.append_status(components_name[component_no], component_name, component_status, component_no)
return STATUS_UNKNOWN

def get_raid_phys_dev_health():
def get_snmp():
component_name, component_status, index = [], [], 0
idxs = snmp.get_idx(SNMP_PHYS_DEV_CTRLNR, SNMP_PHYS_DEV_INDEX_LEN)
svstatus = STATUS_OK
if len(idxs) &gt; 0:
svstatus = SNMP_CompState[snmp.get_var_i(SNMP_PHYS_DEV_SV_STATUS)]
for idx in idxs:
status = STATUS_OK
descr = add_controller_number(snmp.get_var(SNMP_PHYS_DEV_CTRLNR, idx), snmp.get_var(SNMP_PHYS_DEV_DESCR, idx))
if svstatus != STATUS_OK:
pdstatus = SNMP_DevState[snmp.get_var_i(SNMP_PHYS_DEV_STATUS, idx)]
if status_to_int[pdstatus] &gt; status_to_int[STATUS_OK]:
status = svstatus
component_name.append(descr)
component_status.append(status)
index += 1
if index &gt; 0:
return scom.append_status(components_name[PHYSICALDISK], component_name, component_status, PHYSICALDISK)
return STATUS_UNKNOWN
if not raid_info['avail']: return STATUS_UNKNOWN
if raid_info['protocol'] == 'SNMP':
return get_snmp()
else:
return get_raid_drive_health("SVS_PGYDiskDrive", PHYSICALDISK)

def get_raid_log_drv_health():
def get_snmp():
component_name, component_status, index = [], [], 0
idxs = snmp.get_idx(SNMP_LOG_DRV_CTRLNR, SNMP_LOG_DRV_INDEX_LEN)
svstatus = STATUS_OK
if len(idxs) &gt; 0:
svstatus = SNMP_CompState[snmp.get_var_i(SNMP_LOG_DRV_SV_STATUS)]
for idx in idxs:
descr = add_controller_number(snmp.get_var(SNMP_LOG_DRV_CTRLNR, idx), snmp.get_var(SNMP_LOG_DRV_NAME, idx))
status = STATUS_OK
if svstatus != STATUS_OK:
ldstatus = SNMP_DrvState[snmp.get_var_i(SNMP_LOG_DRV_STATUS, idx)]
if status_to_int[ldstatus] &gt; status_to_int[STATUS_OK]:
status = svstatus
component_name.append(descr)
component_status.append(status)
index += 1
if index &gt; 0:
return scom.append_status(components_name[LOGICALDRIVE], component_name, component_status, LOGICALDRIVE)
return STATUS_UNKNOWN
if not raid_info['avail']: return STATUS_UNKNOWN
if raid_info['protocol'] == 'SNMP':
return get_snmp()
else:
return get_raid_drive_health("SVS_PGYStoragePoolCompositeExtent", LOGICALDRIVE)

# ---------------------------------------------------------------

if __name__ == '__main__':
main()
</PyScriptBody>
<!-- We always check ALL components because of cookdown feature -->
<ScriptArguments>'$RunAs[Name="Unix!Microsoft.Unix.ActionAccount"]/Password$' "$Config/NetworkName$" $Config/CIMPort$ "$RunAs[Name="Fujitsu.Servers.PRIMERGY.Linux.SNMPAccount"]/CommunityString$"</ScriptArguments>
<ArgumentsMap>"`whoami`" "$1" "$2" $3 "ALL" "$4"</ArgumentsMap>
<Timeout>300</Timeout>
<Context>ComponentsMonitor;HostsForMonitoring</Context>
<!-- Format: 'Section;TagHosts' -->
</ProbeAction>
</MemberModules>
<Composition>
<Node ID="InvokeProbe">
<Node ID="Scheduler"/>
</Node>
</Composition>
</Composite>
</ModuleImplementation>
<OutputType>System!System.PropertyBagData</OutputType>
</DataSourceModuleType>