<DataSourceModuleType ID="Microsoft.Exchange.2010.TimedCITroubleshooterScript.PropertyBagProvider" Accessibility="Internal" Batching="false">
<Configuration>
<IncludeSchemaTypes>
<SchemaType>System!System.ExpressionEvaluatorSchema</SchemaType>
<SchemaType>System!System.CommandExecuterSchema</SchemaType>
</IncludeSchemaTypes>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="IntervalSeconds" type="xsd:int"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="SyncTime" type="xsd:string"/>
<!-- Diagnostic script common library -->
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="CommonLibraryScriptName" type="xsd:string"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="CommonLibraryScriptBody" type="xsd:string"/>
<!-- Wrapper script to execute the diagnostic script -->
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="ExecutionScriptName" type="xsd:string"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="ExecutionArguments" type="xsd:string"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="ExecutionScriptBody" type="xsd:string"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="ScriptName" type="xsd:string"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="Arguments" type="xsd:string"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="ScriptBody" type="xsd:string"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="SecureInput" minOccurs="0" maxOccurs="1">
<xsd:simpleType>
<xsd:restriction base="xsd:string">
<xsd:maxLength value="256"/>
</xsd:restriction>
</xsd:simpleType>
</xsd:element>
<!-- xsd:int (not xsd:integer) so the schema type matches the ParameterType="int" overrideable parameter and the IntervalSeconds declaration -->
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" name="TimeoutSeconds" type="xsd:int"/>
<xsd:element xmlns:xsd="http://www.w3.org/2001/XMLSchema" minOccurs="0" maxOccurs="1" name="EventPolicy" type="CommandExecuterEventPolicyType"/>
</Configuration>
<OverrideableParameters>
<OverrideableParameter ID="IntervalSeconds" Selector="$Config/IntervalSeconds$" ParameterType="int"/>
<OverrideableParameter ID="SyncTime" Selector="$Config/SyncTime$" ParameterType="string"/>
<OverrideableParameter ID="ExecutionArguments" Selector="$Config/ExecutionArguments$" ParameterType="string"/>
<OverrideableParameter ID="Arguments" Selector="$Config/Arguments$" ParameterType="string"/>
<OverrideableParameter ID="TimeoutSeconds" Selector="$Config/TimeoutSeconds$" ParameterType="int"/>
</OverrideableParameters>
<ModuleImplementation Isolation="Any">
<Composite>
<MemberModules>
<DataSource ID="DS1" TypeID="System!System.SimpleScheduler">
<IntervalSeconds>$Config/IntervalSeconds$</IntervalSeconds>
<SyncTime>$Config/SyncTime$</SyncTime>
</DataSource>
<ProbeAction ID="Script" TypeID="Microsoft.Exchange.2010.CITroubleshooterScriptPropertyBagProbe">
<!-- Diagnostic script common library -->
<CommonLibraryScriptName>$Config/CommonLibraryScriptName$</CommonLibraryScriptName>
<CommonLibraryScriptBody>$Config/CommonLibraryScriptBody$</CommonLibraryScriptBody>
<!-- Wrapper script to execute the diagnostic script -->
<ExecutionScriptName>$Config/ExecutionScriptName$</ExecutionScriptName>
<ExecutionArguments>$Config/ExecutionArguments$</ExecutionArguments>
<ExecutionScriptBody>$Config/ExecutionScriptBody$</ExecutionScriptBody>
<!-- Diagnostic script dependencies -->
<TypesScriptName>CITSTypes.ps1</TypesScriptName>
<TypesScriptBody>
# Copyright (c) 2009 Microsoft Corporation. All rights reserved.
#
# THIS CODE IS MADE AVAILABLE AS IS, WITHOUT WARRANTY OF ANY KIND. THE ENTIRE RISK
# OF THE USE OR THE RESULTS FROM THE USE OF THIS CODE REMAINS WITH THE USER.
#
# This file contains additional types used by CI troubleshooter library
#
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class HaDbFailureItemHelper
{
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct HaDbFailureItem
{
public int m_cbSize;
public int m_nameSpace;
public int m_tag;
public Guid m_guid;
[MarshalAs(UnmanagedType.LPWStr)]
public string m_instanceName;
[MarshalAs(UnmanagedType.LPWStr)]
public string m_componentName;
public IntPtr m_ioError;
public IntPtr m_notificationEventInfo;
}
public class Arguments
{
public string Server;
public string Database;
public string Symptom;
public string Action;
public string InstanceName;
public bool MonitoringContext;
public bool WriteApplicationEvent;
public bool TroubleshooterDisabled;
public int FailureCountBeforeAlert;
public int FailureTimeSpanMinutes;
public long MaxCumulativeMsftefdMemoryConsumption;
public bool CanTakeProcessCrashDumps;
}
public class CatalogHealth
{
public string ErrorCode;
public System.DateTime Timestamp;
}
public class CIStatus
{
public string Name;
public string DatabaseName;
public Guid DatabaseGuid;
public long BacklogCounter;
public long NumberOfItemsInRetryQueue;
public long NumberOfRetryItemsProcessed;
public long StallCounter;
public long NumberOfMailboxesLeftToCrawl;
public int PercentageCatalogSize;
public string Health;
public string HealthReason;
public System.DateTime HealthTimestamp;
public bool IsStalled;
public bool IsStalledExtendedPeriod;
public bool IsBacklogged;
public bool IsCorrupted;
public bool IsHealthStale;
public bool IsLargeCatalog;
public bool HasBadDiskBlock;
public bool HasRetryQueueIssues;
public bool BadDiskBlockMasterMerge;
public bool InReseedLoop;
}
public class ServerStatus
{
public string Name;
public bool IsDeadLocked;
public bool IsRTMServer;
public bool IsMsftefdMemoryConsumptionOverLimit;
public long CumulativeMsftefdMemoryConsumption;
public CIStatus[] CatalogStatusArray;
public string[] BadIFilters;
public string[] IFiltersToEnable;
}
'@
# Add the assembly System.ServiceProcess.dll to
# get the ServiceController class.
# This is used for starting/stopping search services
#
Add-Type -AssemblyName "System.ServiceProcess"
</TypesScriptBody>
<ConstantsScriptName>CITSConstants.ps1</ConstantsScriptName>
<ConstantsScriptBody>
# Copyright (c) 2009 Microsoft Corporation. All rights reserved.
#
# THIS CODE IS MADE AVAILABLE AS IS, WITHOUT WARRANTY OF ANY KIND. THE ENTIRE RISK
# OF THE USE OR THE RESULTS FROM THE USE OF THIS CODE REMAINS WITH THE USER.
#
# This file contains global constants used by CI Troubleshooter library.
#
# Global constants for performance counter names
#
$searchIndicesCounterObjectName = "MSExchange Search Indices"
$aolniCounterName = "age of the last notification indexed"
$tslniCounterName = "time since last notification was indexed"
$noirtCounterName = "number of items in retry table"
$nomltcCounterName = "number of mailboxes left to crawl"
$noirfrtCounterName = "number of items removed from retry table"
# counter array used for getting performance counter values
#
$counters = (
"$searchIndicesCounterObjectName(*)\$aolniCounterName",
"$searchIndicesCounterObjectName(*)\$tslniCounterName",
"$searchIndicesCounterObjectName(*)\$noirtCounterName",
"$searchIndicesCounterObjectName(*)\$nomltcCounterName",
"$searchIndicesCounterObjectName(*)\$noirfrtCounterName"
)
# Hash table used to store performance counter values for catalogs
#
$counterHashTable = @{}
# Hash table used to store the corresponding mailbox database objects values of catalogs
#
$mailboxDatabaseHashTable = @{}
# Name of registry hive where exchange setup information is stored
#
$exchangeSetupRegKey= "HKLM:$exchangeRegKey\Setup"
# Name of registry hive where TS diagnostic information is stored
#
$troubleshooterRegKey= "$exchangeRegKey\CITroubleshooter\"
# Name of registry hive where contentIndex state is stored
#
$contentIndexKeyPath = "$exchangeRegKey\ContentIndex"
# Name of registry hive where catalog health values are stored
#
$copyHealthKeyPath = "$contentIndexKeyPath\CatalogHealth\"
# Last known Filters key Name
#
$lastKnownFiltersKeyName = "$contentIndexKeyPath\LastKnownFilters"
# Need Reindex Databases Key Name
#
$needReindexDatabasesKeyName = "$contentIndexKeyPath\NeedReindexDatabases"
# Name of the registry hive where the MSSearch IFilters are defined
#
$MsSearchFilterPath="$exchangeRegKey\MSSearch\Filters"
# Value to determine if we should automatically disable bad IFilters
# If the value is set 0, the troubleshooter will skip the logic of disabling IFilters
$disableBadIFilters = 1
# String used to report catalog corruption or other reset
# conditions in catalog health registry hive
#
$corruptionIndicator = "CatalogNeedReset"
# The minimum number of times the number of items in the retry table is found to be above the set threshold before
# checking if the retry feeder is stalled.
#
$minRetryTableIssueThreshold = 2
# The minimum number of items in the retry table that should be drained over the past hour for a catalog before assuming
# the retry feeder for that catalog is stalled.
#
$retryTableDrainThreshold = 500
# MSFTESQL service name
#
$msftesqlServiceName = "msftesql-exchange"
# MSFTESQL process name
#
$msftesqlProcessName = "msftesql"
# MSFTEFD process name
#
$msftefdProcessName = "msftefd"
# ExSearch service name
#
$exsearchServiceName = "MSExchangeSearch"
# ExSearch process name
#
$exsearchProcessName = "Microsoft.Exchange.Search.ExSearch"
# Timespan to wait before taking the next crash dump
#
$minTimeBetweenCrashDumps = New-TimeSpan -Days 2
# name used for application log
#
$appLogName = "Application"
# Event log source name for application log
#
$appLogSourceName = "CI Troubleshooter"
# name used for crimson log
#
$crimsonLogName = "Microsoft-Exchange-Troubleshooters/Operational"
# Event log source name for crimson log
#
$crimsonLogSourceName = "Content Index"
# Indicates the number of minutes we would go back from current time
# to start checking events for bad disk blocks/msftesql crashes
#
$badDiskBlockCheckIntervalInMinutes = 30
# Indicates the number of minutes we would go back from current time
# to start checking events for IFilter error messages
#
$badIFilterCheckIntervalInMinutes = 60
# Event Id that indicates msftesql-exchange service found a bad Ifilter
#
$msftesqlBadIFilterEventIdEventId = 130
# Min number of Events required in the eventLog before an Ifilter is disabled
#
$msftesqlBadIFilterEventThreshold = 25
# CPU Affinity count for the msftesql process
#
$msftesqlCPUAffinityCount = 2
# Event Id that indicates msftesql-exchange service has crashed
#
$msftesqlCrashEventId = 1053
# Event Id that indicates msftesql-exchange master merge failed
#
$msftesqlMasterMergeFailedBadDiskEventId = 4104
# Source name to use for finding bad disk events in system log
#
$diskSourceName = "Disk"
# Event Id that indicates bad disk block error in system log
#
$badDiskEventId = 7
# Suffix that should be added by the TS to an Ifilter when it needs to disabled it
#
$disabledIFilterSuffix = "_DisabledByCITroubleshooter"
# The number of consecutive runs for which the "number of mailboxes left to crawl" count must remain the same before the TS will assume that the service is stalled
# and restart it
$stallDuringCrawlThreshold = 6
# Max threshold for the catalog size as a percentage of the overall database size before the
# troubleshooter raises an alert
#
$maxPercentageCatalogSize = 20
# To avoid the MSFTESQL Process from consuming a lot of CPU we limit processor affinity.
# When the value is missing, it will use 33% of available logical CPU.
#
$affinityValue = $null
# Maximum msftefd cumulative memory consumption
#
$MaxMsftefdMemoryConsumption = 6144
</ConstantsScriptBody>
<LibraryScriptName>CITSLibrary.ps1</LibraryScriptName>
<LibraryScriptBody>
# Copyright (c) 2009 Microsoft Corporation. All rights reserved.
#
# THIS CODE IS MADE AVAILABLE AS IS, WITHOUT WARRANTY OF ANY KIND. THE ENTIRE RISK
# OF THE USE OR THE RESULTS FROM THE USE OF THIS CODE REMAINS WITH THE USER.
#
# This file contains Content Index Troubleshooter functions
#
#####################################################################################
#
#
# THIS FILE EXISTS IN TWO LOCATIONS. MAKE SURE THAT BOTH COPIES OF THE FILE ARE UPDATED
# WHEN EITHER COPY IS CHANGED
# <DEPOT>\Sources\dev\management\src\management\scripts\troubleshooter\CITSLibrary.ps1
# <DEPOT>\Sources\dev\mgmtpack\src\HealthMainfests\scripts\troubleshooter\CITSLibrary.ps1
# The management version of the library gets deployed during exchange setup and the
# mgmtpack version of the library only gets deployed when the management pack is installed
#
######################################################################################
# Include the global constants and types
#
$scriptPath = [System.IO.Path]::GetDirectoryName($MyInvocation.MyCommand.Path)
. $scriptPath\CITSConstants.ps1
. $scriptPath\CITSTypes.ps1
<#
.DESCRIPTION
Validate-Arguments is called by Troubleshoot-CI.ps1 script to
perform additional validation of command-line arguments.
If validation fails, this function throws an ArgumentException
with specific information.
.PARAMETER Server
The simple NETBIOS name of mailbox server on which troubleshooting
should be attempted for CI catalogs. If this optional parameter is
not specified, local server is assumed.
.PARAMETER Database
The name of database to troubleshoot. If this optional parameter is
not specified, catalogs for all databases on the server specified
by the Server parameter are troubleshooted.
.PARAMETER Symptom
Specifies the symptom to detect. Possible values are:
'Deadlock', 'Corruption', 'Stall', 'Backlog' and 'All'.
When 'All' is specified, all the first four symptoms in
the list are checked.
If this optional parameter is not specified, 'All' is assumed.
.PARAMETER Action
Specifies the action to be performed to resolve a symptom. The
possible values are 'Detect', 'DetectAndResolve', 'Resolve'.
The default value is 'Detect'
.PARAMETER MonitoringContext
Specifies if the command is being run in a monitoring context.
The possible values are $true and $false. Default is $false.
If the value is $true, warning/failure events are logged to the
application event log. Otherwise, they are not logged.
.PARAMETER FailureCountBeforeAlert
Specifies the number of failures the troubleshooter will allow
before raising an Error in the event log, leading to a SCOM alert.
The allowed range for this parameter is [1,100], both inclusive.
.PARAMETER FailureTimeSpanMinutes
Specifies the number of minutes in the time span during which
the troubleshooter will check the history of failures to count
the failures and alert. If the failure count during this time
span exceeds the value for FailureCountBeforeAlert, an alert
is raised. No alerts are raised if MonitoringContext is $false.
The default value for this parameter is 600, which is equivalent
to 10 hours.
.INPUTS
None. You cannot pipe objects to Troubleshoot-CI.ps1.
.OUTPUTS
Returns an object of type Arguments
#>
function Validate-Arguments
{
Param(
[String]
$Server,
[String]
$Database,
[String]
$Symptom,
[String]
$Action,
[Switch]
$MonitoringContext,
[Int32]
$FailureCountBeforeAlert,
[Int32]
$FailureTimeSpanMinutes,
[bool]
$CanTakeProcessCrashDumps
)
# if resolution is requested, only a specific
# symptom is allowed, 'All' is not allowed.
#
if ($Action -ieq "Resolve" -and $Symptom -ieq "All" )
{
$argError = new-object System.ArgumentException ($LocStrings.AllNotAllowedForResolve)
throw $argError
}
$Arguments = new-object -typename Arguments
# If server name wasn't supplied, default to
# local server name
#
if ([System.String]::IsNullOrEmpty($Server))
{
$Arguments.Server = $env:computername
}
else
{
$Arguments.Server = $Server
}
<#
.DESCRIPTION
Detects problems with catalog copies specified by the $Server and
$Database parameters
.PARAMETER Server
The simple NETBIOS name of mailbox server.
.PARAMETER Database
The name of database.
.PARAMETER After
The start time from which to scan for issues. Specifically, this is
used for checking bad disk block issues in event log. Troubleshoot-CI.ps1
uses a default of 30 min before the script is run. This parameter
is added mostly for use by tests.
.PARAMETER Symptom
Symptom to detect.
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
An instance of the ServerStatus object.
#>
function Detect-Problems
{
Param(
[String]
[ValidateNotNullOrEmpty()]
$Server,
[String]
$Database,
[DateTime]
$After,
[string]
[ValidateNotNullOrEmpty()]
$Symptom
)
# Run the memory check only if 'All' or 'MsftefdHealth' is specified as a symptom
#
if ($Symptom -ieq "MsftefdHealth" -or $Symptom -ieq "All")
{
$serverStatus.CumulativeMsftefdMemoryConsumption = Get-MsftefdMemoryUsage $server
$serverStatus.CatalogStatusArray = @()
$serverStatus.IsMsftefdMemoryConsumptionOverLimit = ($MaxMsftefdMemoryConsumption -gt 0) -and ($serverStatus.CumulativeMsftefdMemoryConsumption -gt $MaxMsftefdMemoryConsumption)
}
# Run these checks unless only msftefd health is being checked
if ($Symptom -ine "MsftefdHealth")
{
$copyArray = @(Get-Copies -Server $Server -Database $Database)
if (IsCatalogHealthStale -CIStatus $status -StaleThresholdInSeconds $staleThresholdInSeconds)
{
write-verbose ("Catalog health for " + $status.Name + " is stale.")
$status.IsHealthStale = $true
}
if (IsLargeCatalog -CIStatus $status -PercentThreshold $maxPercentageCatalogSize)
{
write-verbose ("Percentage Catalog size for " + $status.Name + " is greater than the allowed threshold.")
$status.IsLargeCatalog = $true
}
if (IsCatalogCorrupted -CIStatus $status)
{
write-verbose ("Catalog " + $status.Name + " is corrupted.")
$status.IsCorrupted = $true
}
# If the catalog has a bad block on the disk, set the IsCorrupted to true
# so that the corresponding resolution action (reseed) can be taken.
#
if ($status.HasBadDiskBlock)
{
write-verbose ("Catalog " + $status.Name + " has bad disk block.")
$status.IsCorrupted = $true
}
# If the catalog has a bad block on the disk, set the IsCorrupted to true
# so that the corresponding resolution action (reseed) can be taken.
#
if ($status.BadDiskBlockMasterMerge)
{
write-verbose ("Catalog " + $status.Name + " has bad disk block during master merge.")
$status.IsCorrupted = $true
}
# If the catalog associated with passive is in a crawling state, set the IsCorrupted to true
# so that the corresponding resolution action (reseed) can be taken.
#
if(IsCrawling -Copies $copyArray -CIStatus $status)
{
write-verbose ("Catalog " + $status.Name + " is in crawling state.")
$status.IsCorrupted = $true
}
}
# if a database was specified in the
# argument list, it doesn't make sense to check
# if the entire set of catalogs is deadlocked.
# Otherwise, check for deadlock using all catalogs
# in the status array
#
if ([System.String]::IsNullOrEmpty($Database))
{
if ((IsDeadlocked $CIStatusArray))
{
$serverStatus.IsDeadlocked = $true
}
}
}
return $serverStatus
}
<#
.DESCRIPTION
Builds a fake ServerStatus object for a specified
Symptom. Used when a specific resolution action
was requested, overriding any real status.
.PARAMETER Server
The simple NETBIOS name of mailbox server.
.PARAMETER Database
The name of database.
.PARAMETER Symptom
The symptom to use in building the server status object.
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
An instance of the ServerStatus object.
#>
function Build-ServerStatus
{
Param(
[String]
[ValidateNotNullOrEmpty()]
$Server,
[String]
[AllowNull()]
$Database,
[String]
[ValidateNotNullOrEmpty()]
[ValidateSet("Deadlock", "Corruption", "Stall", "MsftefdHealth")]
$Symptom
)
# If the server status is deadlocked, log it.
#
if ($ServerStatus.IsDeadlocked)
{
$issuesFound = $True
Log-Event -Arguments $Arguments -EventInfo $LogEntries.DetectedDeadlock -Parameters @("SomeString")
}
if ($ServerStatus.IsMsftefdMemoryConsumptionOverLimit)
{
$ProcessInstances = ''
foreach($process in @(Get-Process -name $msftefdProcessName -ComputerName $Arguments.Server -ErrorAction:SilentlyContinue))
{
if ($process -eq $null)
{
continue
}
if ($serverStatus.BadIFilters -ne $null -and $serverStatus.BadIFilters.Length -gt 0)
{
[string]$filterNames = ""
foreach($filterName in $serverStatus.BadIFilters)
{
$filterNames += "'$filterName',"
}
# Remove the additional ',' from the end of the filterNames string
#
$filterNames = $filterNames.SubString(0, $filterNames.Length - 1)
$issuesFound = $true
Log-Event -Arguments $Arguments -EventInfo $LogEntries.FoundBadIFiltersEnabled -Parameters @($filterNames)
}
if ($serverStatus.IFiltersToEnable -ne $null -and $serverStatus.IFiltersToEnable.Length -gt 0)
{
[string]$filterNames = ""
foreach($filterName in $serverStatus.IFiltersToEnable)
{
$filterNames += "'$filterName',"
}
# Remove the additional ',' from the end of the filterNames string
#
$filterNames = $filterNames.SubString(0, $filterNames.Length - 1)
$issuesFound = $true
Log-Event -Arguments $Arguments -EventInfo $LogEntries.IFiltersToEnable -Parameters @($filterNames)
}
if ($catalog.HasRetryQueueIssues)
{
# Skipping logging of events per catalog. Will be logging one event for all the catalogs on the server
# [string[]]$parameters = ($Catalog.DatabaseName, $catalog.NumberOfRetryItemsProcessed, 0)
# Log-Event -Arguments $Arguments -EventInfo $LogEntries.ItemsStuckInRetryQueue -Parameters $parameters
$catalogsWithRetryQueueIssues = $catalogsWithRetryQueueIssues + $catalogStatus
}
}
elseif ($catalog.IsLargeCatalog)
{
$issuesFound = $True
Log-Event `
-Arguments $Arguments `
-EventInfo $LogEntries.CatalogSizeGreaterThanExpectedDBLimit `
-Parameters @($catalog.DatabaseName, $maxPercentageCatalogSize, $catalog.PercentageCatalogSize)
}
else
{
if ($catalog.IsCorrupted -eq $false)
{
# This particular copy was found healthy in this troubleshooter run. Clear off any state stored by the TS associated with catalog
#
Reset-EventRetryCounter -Server $Arguments.Server -EventId $LogEntries.ActiveCatalogCopyCorrupt[0] -OptionalComponent $Catalog.DatabaseName
Reset-EventRetryCounter -Server $Arguments.Server -EventId $LogEntries.CatalogReseedLoop[0] -OptionalComponent $Catalog.DatabaseName
Reset-EventRetryCounter -Server $Arguments.Server -EventId $LogEntries.ReseedFailure[0] -OptionalComponent $Catalog.DatabaseName
}
# If no issues are found, log the fact.
# This will help turn previous alerts
# green.
#
if (!($issuesFound))
{
Log-Event -Arguments $Arguments -EventInfo $LogEntries.DetectedNoIssues -Parameters @("SomeString")
}
}
<#
.DESCRIPTION
Gets the memory usage of all the msftefd filter daemon processes
.PARAMETER Server
Name of the server to monitor the process
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
None.
#>
function Get-MsftefdMemoryUsage
{
Param(
[string]
$Server = $env:ComputerName
)
if ($Server -ieq "localhost")
{
$Server = $env:ComputerName
}
<#
.DESCRIPTION
Attempts resolution of problems with CI catalogs
.PARAMETER Arguments
Object of type Arguments, containing command-line
arguments
.PARAMETER ServerStatus
Object of type ServerStatus, indicating the current
status of catalogs.
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
A modified server status object with resolution
status added to each catalog status object.
#>
function Resolve-Problems
{
Param(
[Object]
[ValidateNotNullOrEmpty()]
$Arguments,
[Object]
[ValidateNotNullOrEmpty()]
$ServerStatus
)
# $todo$ if resolution is already
# in progress on the target server
# initiated by some other instance of
# troubleshooter, log error and return
#
if ($disableBadIFilters -gt 0)
{
if ($serverStatus.BadIFilters -ne $null -and $serverStatus.BadIFilters.Length -gt 0)
{
foreach($filterName in $serverStatus.BadIFilters)
{
Disable-BadIFilter -Server $Arguments.Server -FilterName $filterName
}
$restartServices = $true
}
if ($serverStatus.IFiltersToEnable -ne $null -and $serverStatus.IFiltersToEnable.Length -gt 0)
{
foreach($filterName in $serverStatus.IFiltersToEnable)
{
if ((Enable-IFilter -Server $Arguments.Server -FilterName $filterName) -eq $true)
{
$restartServices = $true
# Once the disabled IFilters have been enabled we should reset the Retry counter
# This will ensure that the TS does not enable an IFilter for the next 6 runs (6 Hours)
# If enabling any one of the IFilters fails in this pass the TS will try again in the next
# scheduled window (6 hours later) and not in the next run to avoid restarting services
# frequently in Datacenter. (Enabling an IFilter is low priority compared to other operations)
#
Reset-EventRetryCounter -Server $Arguments.Server -EventId $msftesqlBadIFilterEventIdEventId
}
}
}
}
if ($restartServices)
{
if ($includeCrashDump)
{
Get-ProcessDump -Arguments $Arguments -crashDumpProcessId $crashDumpProcessId -crashDumpProcessNames $crashDumpProcessNames
# Only log this error if we took a crash dump for the FD memory consumption issue
#
if ($ServerStatus.IsMsftefdMemoryConsumptionOverLimit)
{
Log-Event -Arguments $Arguments `
-EventInfo $LogEntries.MsftefdMemoryUsageHighWithCrashDump `
-Parameters @($Arguments.MaxCumulativeMsftefdMemoryConsumption, $ServerStatus.CumulativeMsftefdMemoryConsumption)
}
}
# now look for corruptions and start reseeding
# each corrupted catalog
#
foreach ($catalog in $ServerStatus.CatalogStatusArray)
{
if ($catalog -eq $null)
{
continue
}
if ($catalog.IsCorrupted -eq $True)
{
write-verbose ($catalog.Name + " seems to be corrupted. Reseeding the catalog..")
Reseed-Catalog -Arguments $Arguments -Catalog $catalog
}
}
# To avoid the MSFTESQL Process from consuming a lot of CPU we limit processor affinity.
# The processor affinity count will be read from a registry.
# Make sure that this line appears after the restart services call otherwise the changes would be lost once the process restarts
#
[void](Set-ProcessorAffinity -ProcessName $msftesqlProcessName -NumberOfCPU $affinityValue)
<#
.DESCRIPTION
Checks if Recovery actions for the catalog of a mailbox database should be ignored on this server
.PARAMETER DatabaseName
Name of the database being checked
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
$true if the recovery should be ignored. $False otherwise
#>
function ShouldIgnoreRecovery
{
    Param($DatabaseName)

    # Recovery can only be ignored when a database name was supplied and an
    # ignore-list (script-scope $disableRecoveryForDatabasesList) exists.
    if ([String]::IsNullOrEmpty($DatabaseName) -or $disableRecoveryForDatabasesList -eq $null)
    {
        return $false
    }

    # Case-insensitive membership test against the ignore-list.
    foreach ($ignoredDatabase in $disableRecoveryForDatabasesList)
    {
        if ($ignoredDatabase -ne $null -and $ignoredDatabase -ieq $DatabaseName)
        {
            return $true
        }
    }

    return $false
}
<#
.DESCRIPTION
Sets the affinity of a process to the last N number of CPU's in the system.
The reason why we pick "last" is - usually the first a few CPUs are heavily used already.
.PARAMETER ProcessName
Name of the process
.PARAMETER NumberOfCPU
Total Number of CPU's
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
$true if the value was set successfully or $false otherwise
#>
function Set-ProcessorAffinity
{
Param(
[Object]
[ValidateNotNullOrEmpty()]
$ProcessName,
[int]
$NumberOfCPU
)
# Get the number of logical processor count
$logicalProcessorCount = 0
$processors = @(Get-WmiObject Win32_Processor)
foreach ($processor in $processors)
{
$logicalProcessorCount += $processor.NumberOfLogicalProcessors
$processor.Dispose()
}
# If we have only 1 processor, there is no point setting affinity.
if ($logicalProcessorCount -le 1)
{
return $false
}
# If we don't have a meaningful value from caller for $NumberOfCPU, we will use 1/3 of total logical processors
# If there are only 2 logical processors, we will use 1.
if ($NumberOfCPU -lt 1)
{
$NumberOfCPU = [Math]::Floor($logicalProcessorCount / 3)
if ($NumberOfCPU -eq 0)
{
$NumberOfCPU = 1
}
}
$processorAffinityValue = 0
for($power = 0; $power -lt $NumberOfCPU; $power++)
{
# Set the mask to use the last N processor.
$processorAffinityValue += [Math]::Pow(2, $logicalProcessorCount - 1 - $Power)
}
if ($processorAffinityValue -gt 0)
{
try
{
if ([int]$process.ProcessorAffinity -ne $processorAffinityValue)
{
$process.ProcessorAffinity = new-object IntPtr $processorAffinityValue
}
try
{
if ((Check-CanTakeCrashDump -Server $Arguments.Server) -eq $true)
{
if ($crashDumpProcessId -gt 0)
{
Update-LastCrashDumpTime -Server $Arguments.Server
.\dump-process.ps1 -uniquePid $crashDumpProcessId -Alias [email protected] -numDumps 1 -dfs -Full
}
else
{
foreach($processName in $crashDumpProcessNames)
{
Update-LastCrashDumpTime -Server $Arguments.Server
.\dump-process.ps1 -processname $processName -Alias [email protected] -numDumps 1 -dfs
}
}
}
}
catch [System.Exception]
{
# Catch any exceptions thrown by the dump process script
# and ignore them. Process dump collection is not a critical piece
#
$message=($error[0].Exception.ToString() + $error[0].InvocationInfo.PositionMessage)
write-verbose ("Caught Exception: $message")
}
}
<#
.DESCRIPTION
Checks if the TS can take a crash dump of a process. Function will return true only if $minTimeBetweenCrashDumps
time criteria is met
.PARAMETER Server
The simple NETBIOS name of mailbox server.
.INPUTS
None. You cannot pipe objects to this function.
<#
.DESCRIPTION
Gets all copies of given database on the given server.
If server parameter is null or empty, local server is assumed.
If database is non-null and non-empty, just that copy
on the given server or local server is returned.
.PARAMETER Server
The simple NETBIOS name of mailbox server.
.PARAMETER Database
The name of database.
.INPUTS
None. You cannot pipe objects to Troubleshoot-CI.ps1.
.OUTPUTS
An array of database copy objects.
#>
function Get-Copies
{
Param(
[String] [AllowNull()]
$Server,
[String] [AllowNull()]
$Database
)
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
Number of documents processed since last run
#>
function Get-RetryDocumentsProcessedSinceLastRun
{
Param(
[Object]
[ValidateNotNull()]
$DatabaseCopy,
[int]
$CurrentRetryTableItemsProcessed
)
<#
.DESCRIPTION
Tries to gets the management pack version deployed on the current server
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
Management pack version deployed on the server
#>
function Get-ManagementPackVersion
{
$location = Get-Location
$version = "MP version not found"
try
{
# Check if the Operations Manager Client snapin is present on the server
#
$operationsManagerClientSnapin = Get-PSSnapin -Registered | ?{$_.Name -ieq 'Microsoft.EnterpriseManagement.OperationsManager.Client'}
if ($operationsManagerClientSnapin -ne $null)
{
# TODO the ideal way would be to get the current server from the OpsMgrConnector config file.
#
$managementGroupServerList = @()
# Get the Name of the management group server that this computer is a part of
#
$OperationsManagerRegistryPath = "SOFTWARE\Microsoft\Microsoft Operations Manager\3.0"
foreach($managementGroupServer in $managementGroupServerList)
{
# Try connecting to the Management group server to get the exchange management pack version
#
Add-PSSnapin 'Microsoft.EnterpriseManagement.OperationsManager.Client' -ErrorAction:SilentlyContinue
Set-Location "OperationsManagerMonitoring::"
$Script:MG = New-ManagementGroupConnection -ConnectionString $managementGroupServer
if ($Script:MG -eq $null)
{
continue
}
<#
.DESCRIPTION
Gets the catalog size as a percentage of the overall database size
.PARAMETER databaseCopy
Database copy
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
Catalog size as a percentage of the overall database size
#>
function Get-PercentageCatalogSize
{
    # Gets the content index catalog size as a percentage of the overall
    # database (EDB) size. Best effort: stays 0 for small databases
    # (EDB < 1 GB) and returns -1 when any part of the calculation fails.
    # .PARAMETER DatabaseCopy - database copy object (DatabaseName is read)
    Param(
    [Object]
    [ValidateNotNull()]
    $DatabaseCopy
    )
    $percentCatalogSize = 0
    # Percentage catalog size is a best effort calculation. The troubleshooter will fail if it encounters any problems getting the catalog data
    try
    {
        $mailboxDatabase = Get-CachedMailboxDatabase -DatabaseName $DatabaseCopy.DatabaseName
        # The catalog folder lives next to the EDB file: CatalogData-<DbGuid>-<InstanceGuid>
        $catalogDirectory = [System.IO.Path]::Combine([System.IO.Path]::GetDirectoryName($mailboxDatabase.EdbFilePath), "CatalogData-" + $mailboxDatabase.Guid.ToString() + "-*");
        # FIX: divisor was (1024 * 1204) - a digit-transposition typo for 1 MB.
        $catalogSizeMb = ((gci $catalogDirectory -recurse | measure-object Length -sum).Sum / (1024 * 1024))
        $edbSizeMb = ((gci $mailboxDatabase.EdbFilePath).Length / (1024 * 1024))
        # Don't bother with size checks if the MDB is really small
        if ($edbSizeMb -ge 1024)
        {
            # FIX: original referenced undefined $catalogSize/$edbSize here,
            # which made the computed percentage meaningless.
            $percentCatalogSize = ($catalogSizeMb * 100) / $edbSizeMb
        }
    }
    catch [System.Exception]
    {
        $percentCatalogSize = -1
    }
    return $percentCatalogSize
}
<#
.DESCRIPTION
Checks the application event log for MSSearch crashes and
then checks the System event log for any bad block errors.
Then the function maps the disk in the error log to the catalog
and sets the status.HasBadDiskBlock to true/false.
.PARAMETER Server
The simple NETBIOS name of mailbox server.
.PARAMETER StartTime
The time from which to check bad disk/msftesql crash events.
.PARAMETER CIStatusArray
An array of CIStatus objects
.INPUTS
None. You cannot pipe objects to this function.
if ($CIStatusArray.Length -eq 0)
{
return $CIStatusArray
}
# Initialize HasBadDiskBlock member of all status objects
#
foreach ($status in $CIStatusArray)
{
$status.HasBadDiskBlock = $false
}
# Check if we have any msftesql crashes in the past N minutes
#
try
{
$msftesqlCrashes = @(get-eventlog -computername $Server -after $StartTime -logname "Application" -source $msftesqlServiceName -ErrorAction:SilentlyContinue) | where {$_.eventId -eq $msftesqlCrashEventId}
}
catch [System.Exception]
{
$msftesqlCrashes = $null
}
if (($msftesqlCrashes -eq $null) -or ($msftesqlCrashes.Count -eq 0))
{
write-verbose "Check-BadDiskBlocks did not find any msftesql crashes in event log since $startTime"
return $CIStatusArray
}
# Check if we have any bad disk block errors after $startTime.
# If yes, map each disk in the error log to a catalog and
# set the HasBadDiskBlock member of the corresponding status object
# to true.
#
try
{
$badDiskEvents = @(get-eventlog -computername $Server -after $StartTime -logname "System" -source $diskSourceName -ErrorAction:SilentlyContinue) | where {$_.eventId -eq $badDiskEventId}
}
catch [System.Exception]
{
$badDiskEvents = $null
}
if (($badDiskEvents -eq $null) -or ($badDiskEvents.Count -eq 0))
{
write-verbose "Check-BadDiskBlocks did not find any bad disk block events in event log after $startTime"
return $CIStatusArray
}
# Scan bad disk events, and get the unique bad disk names.
#
$badDiskNames=@{}
$i = 0
foreach ($event in $badDiskEvents)
{
if (($event.ReplacementStrings -eq $null) -or ($event.ReplacementStrings.Length -eq 0))
{
continue;
}
$diskName = ($event.ReplacementStrings[0]).ToLower()
if (!$badDiskNames.Contains($diskName))
{
$badDiskNames.Add($diskName,$i)
$i++
}
}
if ($badDiskNames.Keys -eq $null -or $badDiskNames.Keys.Count -eq 0)
{
write-verbose "No bad disk names found in Disk event logs"
return $CIStatusArray
}
foreach ($diskName in $badDiskNames.Keys)
{
# $diskName is in the format "\device\harddisk3\dr3"
# we need to extract the disk number i.e., 3 from it.
#
$parts = $diskName.Split("\")
$prefix = "harddisk"
foreach ($part in $parts)
{
if (($part -eq $null) -or ($part.Length -eq 0))
{
continue
}
if ($part.StartsWith($prefix))
{
$number = $part.Substring($prefix.Length)
$diskNumber = [int]$number
write-verbose "Extracted disk number $diskNumber from $part"
$databaseName = Map-DiskNumberToDatabase -DiskNumber $diskNumber -Server $server
if ($databaseName -eq $null)
{
write-verbose "$diskNumber could not be mapped to any database"
continue
}
<#
.DESCRIPTION
Returns the name of database, given a physical disk number
.PARAMETER Server
Name of the server to lookup. This function does not work for a remote server yet because it uses the
diskpart utility which works only on local computers
.PARAMETER DiskNumber
physical disk number
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
name of database hosted on that disk.
#>
function Map-DiskNumberToDatabase
{
Param(
[String]
[ValidateNotNullOrEmpty()]
$Server,
[Int32]
$DiskNumber
)
if ($server -ine $env:ComputerName)
{
write-verbose "Map-DiskNumberToDatabase does not work for remote computers"
return $null
}
# The block below to parse the DiskPart output was
# provided by Daniel Joiner.
# It tries to find the volume name
#
$diskdetails = "select disk $DiskNumber`ndetail disk" | DiskPart
for($j=0; $j -lt $diskdetails.Count; $j++)
{
if($diskdetails[$j] -match "^ Volume [0-9]+ +")
{
if($diskdetails[$j+1] -match "^ [a-zA-Z0-9_-]")
{
$MountPoint = $diskdetails[$j+1] -replace "^ (.*)",'$1'
break
}
else
{
$MountPoint = $diskdetails[$j] -replace "^ Volume [0-9]+ +([A-Z]).*",'$1'
}
}
}
write-verbose "Looking for database with the mount point: $MountPoint"
# Scan bad disk events, and get the unique bad disk names.
#
$badCatalogDatabaseGuids=@()
foreach ($event in $masterMergeErrors)
{
$mdbGuid = Get-DatabaseGuidFromCatalogName -EventLogMessage $event.Message
# If the mdb guid is null then skip this event message
#
if ($mdbGuid -eq $null)
{
continue
}
$found = $false
foreach($badCatalogDbGuid in $badCatalogDatabaseGuids)
{
if ($badCatalogDbGuid -ieq $mdbGuid)
{
$found = $true
}
}
if ($found -eq $false)
{
$badCatalogDatabaseGuids += $mdbGuid
}
}
if ($badCatalogDatabaseGuids.Length -eq 0)
{
return
}
foreach($catalog in $CIStatusArray)
{
foreach($badCatalogDbGuid in $badCatalogDatabaseGuids)
{
if($badCatalogDbGuid -ieq $catalog.DatabaseGuid.ToString())
{
# Mark the Catalog for that database corrupt
#
$catalog.BadDiskBlockMasterMerge = $true
}
}
}
}
function Get-DatabaseGuidFromCatalogName
{
    # Extracts the mailbox database GUID from a "master merge has been paused"
    # event message. The catalog name embedded in the message has the form
    # ExSearch-{MDBGUID}-{InstanceGuid}; only messages citing I/O device error
    # 0x8007045d are considered (see bug# 349577 for a sample message).
    # Returns the GUID string, or $null when the message does not match.
    Param(
    [String]
    [ValidateNotNullOrEmpty()]
    $EventLogMessage
    )
    $mdbguidExtractorRegexString = "A master merge has been paused for catalog ExSearch-(?<MdbGuid>[0-9a-fA-F]{8}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{12}).*The request could not be performed because of an I/O device error..*0x8007045d."
    $guidMatch = [System.Text.RegularExpressions.Regex]::Match(
        $EventLogMessage,
        $mdbguidExtractorRegexString,
        [System.Text.RegularExpressions.RegexOptions]::IgnoreCase)
    if (-not $guidMatch.Success)
    {
        return $null
    }
    return $guidMatch.Groups["MdbGuid"].Value
}
<#
.DESCRIPTION
Returns an entry from the perf counter cache hash.
The purpose of the function is
to provide a point for injection during testing
.PARAMETER Value
The key of to look up
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
the value of the array at the key.
#>
function Get-CachedCounter
{
    # Looks up one entry in the script-level performance counter cache.
    # Kept as a separate function so tests can inject/override the lookup.
    Param(
    [String]
    [ValidateNotNullOrEmpty()]
    $Value
    )
    $cachedEntry = $counterHashTable[$Value]
    return $cachedEntry
}
<#
.DESCRIPTION
Gets one or more databasecopystatus objects
.PARAMETER Server
The simple NETBIOS name of mailbox server.
.PARAMETER Database
The name of database.
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
An array of database copy objects.
#>
function Get-MatchingDatabaseCopyStatusObjects
{
Param(
[String] [AllowNull()]
$Server,
[String] [AllowNull()]
$Database
)
# NOTE(review): $db, $mailboxDatabaseCopyStatus and $copies are not defined
# in the visible body or parameters - they presumably belong to an enclosing
# foreach over databases that appears to be missing from this copy of the
# script. The orphan '}' before 'return $copies' below also suggests a lost
# loop wrapper; verify against the original source before editing.
$copyName = "$db\$Server"
# Check if the database has a mounted copy on any server in the DAG
foreach($copyId in $mailboxDatabaseCopyStatus)
{
if ($copyId.Status -ieq 'Mounted')
{
# At least one copy is mounted somewhere, so the database qualifies for analysis.
$includeMdbForAnalysis = $true
break;
}
}
if ($includeMdbForAnalysis)
{
foreach($copyId in $mailboxDatabaseCopyStatus)
{
# Check if the copy of the database on the server is either Healthy or Mounted
if ($copyId.Name -ieq $copyName -and ($copyId.Status -ieq 'Mounted' -or $copyId.Status -ieq 'Healthy'))
{
# Collect the copy hosted on $Server when it is in a usable state.
$copies += $copyId
}
}
}
else
{
write-verbose "The troubleshooter will ignore $db because does not have any Active mounted copies"
}
}
return $copies
}
<#
.DESCRIPTION
Gets the cached copy of the mailbox database object
.PARAMETER DatabaseName
Mailbox database Name
.Parameter ServerName
NetBiosName of the Mailbox server
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
Cached copy of the mailbox database
#>
function Get-CachedMailboxDatabase
{
[CmdletBinding(DefaultParameterSetName='Default')]
Param(
[String]
[Parameter(ParameterSetName='Default')]
$DatabaseName,
[String]
$Server
)
if ([System.String]::IsNullOrEmpty($DatabaseName))
{
$databaseList = @(Get-MailboxDatabasesOnServer -Server $Server)
foreach ($status in $CIStatusArray)
{
if ($status.IsHealthStale)
{
$staleCatalogCount = $staleCatalogCount + 1
}
if ($status.IsStalled)
{
$stalledCatalogCount = $stalledCatalogCount + 1
}
}
if ($staleCatalogCount -ge $CIStatusArray.Length)
{
write-verbose ("Health status in registry is stale for all catalogs")
$allTimestampsStale = $true
}
if ($stalledCatalogCount -ge $CIStatusArray.Length)
{
write-verbose ("Indexing stalled for all catalogs")
$allCatalogsStalled = $true
}
<#
.DESCRIPTION
Determines if indexing is stalled for a catalog
.PARAMETER CIStatus
Catalog status object for the catalog
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
$true if a stall was detected, $false otherwise
#>
function IsCatalogStalled
{
Param(
[Object]
[ValidateNotNull()]
$CIStatus,
[Int32]
$StallThresholdInSeconds
)
# Determines whether a crawl has stalled by comparing the current
# "mailboxes left to crawl" counter against the value persisted in the
# registry by the previous troubleshooter run.
# NOTE(review): $troubleshooterRegKey, $server, $Arguments and
# $stallDuringCrawlThreshold are read from script scope - confirm they are
# initialized before this is called. $StallThresholdInSeconds is accepted
# but not consulted by the visible logic.
$numberOfMailboxesLeftToCrawlRegkeyPath = $troubleshooterRegKey + 'MailboxesLeftToCrawl'
# If the catalog is in the crawling state check if crawl has not stalled
#
if ($CIStatus.NumberOfMailboxesLeftToCrawl -gt 0)
{
# Get the last value of the Mailboxes left to crawl counter
#
$lastCrawlMailboxCount = Get-RegKeyValue -Server $server -Path $numberOfMailboxesLeftToCrawlRegkeyPath -Name $CIStatus.DatabaseName
# Check if the stored value is greater than 0 and is not equal to the current value of NumberOfMailboxesLeftToCrawl.
# We should only do the equality check, because it's possible for the service to start re-crawling all the mailboxes on the server between two runs
# in which case the new counter value will be greater than the saved registry counter
#
if ($lastCrawlMailboxCount -gt 0 -and $CIStatus.NumberOfMailboxesLeftToCrawl -eq $lastCrawlMailboxCount)
{
[REF]$outMessage = ""
$alertCritical = Check-EventThresholdReached -Server $Arguments.Server -EventId $stallDuringCrawlThreshold -Parameters @($CIStatus.DatabaseName) -Message ([REF]$outMessage)
if ($alertCritical)
{
# The number of mailboxes left to crawl has not changed for $stallDuringCrawlThreshold TS runs. Assume that the Service has stalled and restart it
#
write-verbose ("Indexing stalled detected. The number of mailboxes left to crawl counter has not reduced between two consequetive TS runs")
$isStalled = $true
}
}
else
{
# The mailboxes left to crawl counters are different; reset the StallDuringCrawlThreshold counter for that database
#
Reset-EventRetryCounter -EventId $stallDuringCrawlThreshold -OptionalComponent $CIStatus.DatabaseName
}
}
# Save the NumberOfMailboxesLeftToCrawl counter value to the registry for future comparison
#
Set-RegKeyValue -Server $server -Path $numberOfMailboxesLeftToCrawlRegkeyPath -Name $CIStatus.DatabaseName -Value $CIStatus.NumberOfMailboxesLeftToCrawl
# $isStalled is only assigned in the stall branch; otherwise $null (falsy) is returned.
return $isStalled
}
<#
.DESCRIPTION
Determines if status data indicates corruption for a catalog.
.PARAMETER CIStatus
Catalog status object for the catalog
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
$true if a corruption was indicated, $false otherwise
#>
function IsCatalogCorrupted
{
    # A catalog counts as corrupted when its HealthReason matches the
    # script-level $corruptionIndicator string (case-insensitive).
    # Returns $true on a match, $false otherwise.
    Param(
    [Object]
    [ValidateNotNull()]
    $CIStatus
    )
    $isCorrupted = $false
    if ($CIStatus.HealthReason -ieq $corruptionIndicator)
    {
        $isCorrupted = $true
    }
    return $isCorrupted
}
<#
.DESCRIPTION
Determines if indexing is backlogged for a catalog
.PARAMETER CIStatus
Catalog status object for the catalog
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
$true if a backlog was detected, $false otherwise
#>
function IsCatalogBacklogged
{
    # Reports a backlog when the catalog's retry queue has at least
    # $RetryItemsThreshold items.
    # NOTE: $BacklogThresholdInSeconds is accepted but not consulted by the
    # visible logic; only the retry-queue length drives the decision.
    Param(
    [Object]
    [ValidateNotNull()]
    $CIStatus,
    [Int32]
    $BacklogThresholdInSeconds,
    [Int32]
    $RetryItemsThreshold
    )
    # Mirror the original's contract: $true when at/over threshold, otherwise
    # the unassigned ($null, i.e. falsy) value is returned.
    $backlogDetected = $null
    if ($CIStatus.NumberOfItemsInRetryQueue -ge $RetryItemsThreshold)
    {
        $backlogDetected = $true
    }
    return $backlogDetected
}
<#
.DESCRIPTION
Determines if the retry queues for a catalog are draining over time.
.PARAMETER CatalogStatusArray
Catalog status objects for all the catalog
.PARAMETER RetryItemsThreshold
The minimum number of items that should be present in the retry queue before the troubleshooter assumes there are issues with the retry queue
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
$true if a backlog was detected, $false otherwise
#>
function Check-ForRetryQueueIssues
{
Param($CatalogStatusArray,
[Int32]
$RetryItemsThreshold)
# Get the total number of retry documents processed on the server
foreach($catalogStatus in $CatalogStatusArray)
{
$totalRetryDocumentsProcessed = $totalRetryDocumentsProcessed + $catalogStatus.NumberOfRetryItemsProcessed
# If any one of the catalogs has retry queues greater than the threshold value we should check if the service is processing events in the retry table
# or ignore this check completly
if ($catalogStatus.NumberOfItemsInRetryQueue -ge $RetryItemsThreshold)
{
$hasCatalogsWithLargeRetryQueues = $true
}
}
if ($hasCatalogsWithLargeRetryQueues -and ($totalRetryDocumentsProcessed -le 0))
{
# If the number of documents processed by the retry feeder on the server is 0, check if the feeder
# has been stalled consecutively for $minRetryTableIssueThreshold before assuming issues with the retry queues
$lastRetryIssuesCount = [int](Get-RegKeyValue `
-Server $server `
-Path $RetryIssuesCatalogsRegKeyPath `
-Name $RetryIssuesCatalogsRegValueName `
-DefaultValue -1)
$anyCatalogHasRetryQueueIssues = $false
$minRetryTableThresholdReached = $lastRetryIssuesCount -ge $minRetryTableIssueThreshold
if ($minRetryTableThresholdReached)
{
foreach($catalogStatus in $CatalogStatusArray)
{
$catalogStatus.HasRetryQueueIssues = $catalogStatus.isBacklogged
}
}
<#
.DESCRIPTION
Determines if the size of a catalog is greater than the allowed threshold
.PARAMETER CIStatus
Catalog status object for the catalog
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
$true if the catalog size exceeds the allowed threshold, $false otherwise
#>
function IsLargeCatalog
{
Param(
[Object]
[ValidateNotNull()]
$CIStatus,
[Int32]
$PercentThreshold
)
<#
.DESCRIPTION
Determines if health status in registry is stale for a catalog
Returns true if catalog health timestamp is older than the
threshold time span for the given catalog
.PARAMETER CIStatus
Catalog status object for the catalog
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
$true if health status was stale, $false otherwise
#>
function IsCatalogHealthStale
{
    # A catalog's health data is stale when its registry timestamp is older
    # than the allowed threshold relative to the current clock.
    # Returns $true when stale, $false otherwise.
    Param(
    [Object]
    [ValidateNotNull()]
    $CIStatus,
    [Int32]
    $StaleThresholdInSeconds
    )
    $allowedAge = New-TimeSpan -Seconds $StaleThresholdInSeconds
    $currentTime = Get-Date
    write-verbose ("current time = " + $currentTime)
    $healthWrittenAt = Get-Date $CIStatus.HealthTimestamp
    write-verbose ("Health status for " + $CIStatus.Name + " last modified at " + $healthWrittenAt)
    $age = New-TimeSpan -Start $healthWrittenAt -End $currentTime
    if ($age -le $allowedAge)
    {
        return $false
    }
    write-verbose ("Health status in registry for " + $CIStatus.Name + " is stale")
    return $true
}
<#
.DESCRIPTION
Returns true if the catalog for a passive copy is crawling
.PARAMETER Copies
An array of mailboxdatabasecopy objects
.PARAMETER CIStatus
Catalog status object for the catalog
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
$true if catalog for a passive is crawling, $false otherwise
#>
function IsCrawling
{
Param(
[Object[]]
[ValidateNotNullOrEmpty()]
$Copies,
<#
.DESCRIPTION
Reseeds a catalog from the active instance
.PARAMETER Arguments
The Arguments object constructed with script args
.PARAMETER Catalog
The name of mailbox database copy corresponding
to the catalog that needs to be reseeded
.OUTPUTS
None. Throws exception if not successful.
#>
function Reseed-Catalog
{
Param(
[Object]
[ValidateNotNull()]
$Arguments,
[Object]
[ValidateNotNull()]
$Catalog
)
$errorPref = $ErrorActionPreference
# Change the error action preference so that any error during reseed
# is reported as a failure. Without this, an error in
# update-mailboxdatabasecopy is not thrown as an exception.
#
$ErrorActionPreference="Stop"
try
{
$problemdb = Get-CachedMailboxDatabase -DatabaseName $Catalog.DatabaseName
if ($problemdb.Mounted)
{
$sourceServer = $null
$allCopiesCorrupt = $true
# Before attempting to reseed check if the catalog is not mounted
#
$databaseCopies = @(Get-MailboxDatabaseCopyStatus $Catalog.DatabaseName)
foreach($databaseCopy in $databaseCopies)
{
if (($databaseCopy.Status -ieq 'Healthy' -or $databaseCopy.Status -ieq 'Mounted') -and $databaseCopy.ContentIndexState -ieq 'Healthy')
{
# Found at least one healthy catalog. Use that as the source of the reseed
#
$allCopiesCorrupt = $false
if ($databaseCopy.MailboxServer -ine $Arguments.Server)
{
$sourceServer = $databaseCopy.MailboxServer
}
}
if ($databaseCopy.MailboxServer -ieq $Arguments.Server)
{
$catalogCopy = $databaseCopy
}
}
if ($isPassiveCopy)
{
if ($allCopiesCorrupt -and (IsCatalogCorrupted -CIStatus $Catalog) -eq $false)
{
# If we do not have any healthy copies and this copy is crawling then do not bother with the reseed. Let the passive catalog crawl
# But make sure the TS logs a Reseed Failure Error log with an Exception explaining that all catalogs are corrupt.
#
throw (new-object -typename System.InvalidOperationException("Cannot reseed a catalog that does not have any healthy copies"))
}
# If the TS reached here then at this point we have a catalog that needs to be deleted and reseeded. If all the copies are corrupt
# this operation will force the catalog to start recrawling.
#
Update-CatalogCopy -CatalogName $Catalog.Name -SourceServer $sourceServer
# Reseed succeeded Check if this particular catalog was found to be corrupted in the last troubleshooter run
#
[REF]$outMessage = ""
$Catalog.InReseedLoop = Check-EventThresholdReached `
-Server $Server `
-EventId $TSRetrySettings.CatalogReseedLoop[0] `
-Parameters @($Catalog.DatabaseName) `
-Message ([REF]$outMessage)
<#
.DESCRIPTION
Handles exceptions thrown at the time of reseed and logs appropriate error messages
.PARAMETER Arguments
The Arguments object constructed with script args
.PARAMETER ErrorMessage
Actual error message
.PARAMETER DataBaseName
The name of mailbox database copy corresponding
to the catalog that failed to reseeded
.PARAMETER AdditionalContextString
Additional context information that would get logged in the
reseed failed event
.OUTPUTS
None. Throws exception if not successful.
#>
function Handle-ReseedFailureError
{
Param
(
[Object]
[ValidateNotNull()]
$Arguments,
[string]
[ValidateNotNullOrEmpty()]
$ErrorMessage,
[string]
[ValidateNotNullOrEmpty()]
$DatabaseName,
[string]
[ValidateNotNull()]
$AdditionalContextString
)
if ($ErrorMessage.ToLower().Contains("microsoft.exchange.cluster.replay.seedinprogressexception"))
{
# Do nothing a reseed is in progress already
return
}
.PARAMETER Arguments
The Arguments object constructed from script args
.PARAMETER Timeout
Time span to wait for stopping/terminating
services. If processes could not be stopped within
the timeout, a TimeoutException is raised.
.PARAMETER AdditionalContext
Additional context to be logged if the method fails to restart the service
.PARAMETER ShouldRetry
Bool indicating if the service restart should be retried
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
None. Throws exception if not successful.
#>
function Restart-SearchServices
{
Param(
[Object]
[ValidateNotNull()]
$Arguments,
[TimeSpan]
$Timeout,
[string]
$AdditionalContext,
[bool]
$ShouldRetry=$true
)
# At this point, we expect msftesql service
# is also started. Otherwise, start it.
#
$newTimeout = $deadline - (Get-Date)
Start-SearchService -Server $Arguments.Server -ServiceName $msftesqlServiceName -ProcessName $msftesqlProcessName -Timeout $newTimeout
# Wait until we see an event saying exsearch service started successfully
#
$newTimeout = $deadline - (Get-Date)
Wait-ForEvent `
-Server $Arguments.Server `
-LogName "Application" `
-EventSource $exsearchEventSource `
-EventId 100 `
-StartTime $startTime `
-Timeout $newTimeout
# Log Success
#
Log-Event `
-Arguments $Arguments `
-EventInfo $LogEntries.RestartSuccess `
-Parameters @("SomeString")
}
catch
{
# In case something goes wrong, always attempt to set the service start mode back.
Set-SearchServiceStartMode -Server $Arguments.Server -ServiceName $exsearchServiceName -StartMode 'Automatic'
$reason = $error[0].Exception.ToString() + $error[0].InvocationInfo.PositionMessage
write-verbose ("Restart-SearchServices failed. Reason: " + $reason)
<#
.DESCRIPTION
Gets a string formatted server status object
.PARAMETER ServerStatus
The server status object
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
String formatted server status object
#>
function Get-ServerStatusString
{
    # Renders a server status object - followed by each entry of its
    # CatalogStatusArray - as one human-readable string via Out-String.
    # A $null input yields an empty string.
    Param(
    [Object]
    $ServerStatus)
    if ($ServerStatus -eq $null)
    {
        return ""
    }
    $statusText = $ServerStatus | Out-String
    foreach ($catalogEntry in $ServerStatus.CatalogStatusArray)
    {
        $statusText += $catalogEntry | Out-String
    }
    return $statusText
}
<#
.DESCRIPTION
Stops a service on a local/remote server.
This function stops the service, makes sure
the process has been terminated. If not,
it tries to kill the process.
.PARAMETER Server
The name of mailbox server
.PARAMETER ServiceName
The name of the service
.PARAMETER ProcessName
The name of the process behind the service
.PARAMETER TimeoutMinutes
Time in minutes to wait for stopping/terminating
services. If processes could not be stopped within
the timeout, a TimeoutException is raised.
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
None. Throws exception if not successful.
#>
function Stop-SearchService
{
Param(
[String]
[ValidateNotNullOrEmpty()]
$Server,
[String]
$ServiceName,
[String]
$ProcessName,
[TimeSpan]
$Timeout
)
# Stops a search-related service and, if its process survives the service
# controller stop, force-kills it (plus msftesql/msftefd helpers that can
# deadlock it).
# NOTE(review): $deadline, $stopServiceTimeout, $stopProcessTimeout,
# $msftesqlProcessName and $msftefdProcessName are not defined in the
# visible body, and $Timeout is never consulted - an initialization such as
# "$deadline = (Get-Date) + $Timeout" appears to be missing from this copy
# of the script; confirm against the original source.
write-verbose ("Stop-SearchService, deadline = " + $deadline)
# Capture the current process id so a restarted instance (e.g. via SCM
# recovery action) is not mistaken for the one we are stopping.
$process = Get-WMIProcess -ProcessName $ProcessName -Server $Server
$processId = $process.ProcessId
while ($deadline -gt (Get-Date))
{
# Keep stopping and terminating the process
# until the process dies or we reach timeout
#
Stop-UsingServiceController -Server $Server -ServiceName $ServiceName -Timeout $stopServiceTimeout
# Once we call stop using the service controller check if the process has really exited
# and if not kill the process
$processStopped = VerifyAndStop-LocalOrRemoteProcess -Server $Server -ProcessName $ProcessName -Timeout $stopProcessTimeout -ProcessId $processId
write-Verbose ("Process stopped " + $processStopped)
if ($processStopped)
{
return
}
else
{
# E14 585609 - ExSearch could be waiting on msftesql process - Terminate it first
VerifyAndStop-LocalOrRemoteProcess -Server $Server -ProcessName $msftesqlProcessName -Timeout $stopProcessTimeout
# When there's a deadlock, killing msftefd
# seems to stop services faster.
#
$msftefdStopped = VerifyAndStop-LocalOrRemoteProcess -Server $Server -ProcessName $msftefdProcessName -Timeout $stopProcessTimeout
}
}
$currentTime = Get-Date
# NOTE(review): $newTimeout computed below is never used in the visible
# body; the Wait-ForProcessToStop call the comment refers to appears to
# have been lost from this copy of the script.
if ($currentTime.AddSeconds(30) -gt $deadline)
{
# No point in calling Wait-ForProcessToStop with a timeout value less than 30 seconds
#
$newTimeout = $currentTime.AddSeconds(30) - $currentTime
}
else
{
$newTimeout = $deadline - $currentTime
}
# If we are here, there was no timeout exception.
#
write-verbose ("Successfully stopped service " + $ServiceName + " within the timeout period.")
}
<#
.DESCRIPTION
Starts a service on a local/remote server.
This function starts the service, makes sure
the process has been running.
.PARAMETER Server
The name of mailbox server
.PARAMETER ServiceName
The name of the service
.PARAMETER ProcessName
The name of the process behind the service
.PARAMETER TimeoutMinutes
Time in minutes to wait for starting the
service. If processes could not be started within
the timeout, a TimeoutException is raised.
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
None. Throws exception if not successful.
#>
function Start-SearchService
{
Param(
[String]
[ValidateNotNullOrEmpty()]
$Server,
[String]
[ValidateNotNullOrEmpty()]
$ServiceName,
[String]
[ValidateNotNullOrEmpty()]
$ProcessName,
[TimeSpan]
[ValidateNotNullOrEmpty()]
$Timeout
)
# NOTE(review): despite the function name, everything inside the try block
# below STOPS the service via WMI - it matches the Stop-UsingServiceController
# helper referenced elsewhere in this script and appears to have been
# spliced into this function when the file was extracted. Verify against
# the original source; do not assume this body starts the service.
Validate-Timeout $Timeout "Start-SearchService"
$deadline = (Get-Date) + $Timeout
# Find out if the service is already started.
#
$p = Get-WMIProcess -ProcessName $ProcessName -Server $Server
if (!($p -eq $null))
{
write-verbose ("Service " + $ServiceName + " is already started.")
write-verbose ("process : $p")
return
}
# Save the original value for
# error preference
#
$errorPref = $ErrorActionPreference
try
{
Validate-Timeout $Timeout "Stop-UsingServiceController"
$serviceFilter = ("name='" + $ServiceName + "'")
write-verbose ("Service filter used for get-wmiobject = " + $serviceFilter)
$ErrorActionPreference = "Continue"
$service = get-wmiobject win32_service -filter $serviceFilter -ComputerName $Server
if ($service -ne $null)
{
write-verbose ("Stopping service " + $ServiceName + " on " + $Server + " using service controller")
[void]$service.StopService()
# Check service status often
# until we timeout or service
# is stopped.
#
while (($service.State -ine "Stopped") -and
($deadline -gt (get-date)))
{
# Dispose the stale WMI handle, wait, then re-query for fresh state.
$service.Dispose()
Start-Sleep -seconds $statusCheckIntervalSeconds
$service = get-wmiobject win32_service -filter $serviceFilter -ComputerName $Server
write-verbose ("Service state of " + $ServiceName + " on " + $Server + ":" + $service.State)
[void]$service.StopService()
}
if ($service.State -ieq "Stopped")
{
write-verbose ("Stopped service " + $ServiceName + " on " + $Server + " using service controller")
$service.Dispose()
}
}
else
{
write-verbose ("Could not get service object for service " + $ServiceName + " on computer " + $Server)
}
}
finally
{
# Always restore the caller's error preference.
$ErrorActionPreference = $errorPref
}
}
<#
.DESCRIPTION
Checks if a remote process is running and terminates the process
.PARAMETER Server
The name of computer
.PARAMETER ProcessName
The name of the process
.PARAMETER Timeout
Time to wait for the task to complete.
A timeout exception is thrown if it does not complete in time
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
Returns $true if process was stopped, $false otherwise
#>
function VerifyAndStop-LocalOrRemoteProcess
{
Param(
[String]
[ValidateNotNullOrEmpty()]
$Server,
[String]
[ValidateNotNullOrEmpty()]
$ProcessName,
[TimeSpan]
[ValidateNotNullOrEmpty()]
$Timeout,
[int]
$processId = 0
)
# Checks whether the named process is running on $Server and, if so,
# repeatedly terminates it via WMI until it exits or the deadline passes.
# Returns $true when the process is gone (or was replaced by a new
# instance with a different PID), $false otherwise.
# NOTE(review): $deadline and $statusCheckIntervalSeconds come from
# enclosing scope; $Timeout itself is not consulted in the visible body -
# confirm the caller derives $deadline from it.
# Save the original value for
# error preference
#
$errorPref = $ErrorActionPreference
try
{
$ErrorActionPreference = "Continue"
$process = Get-WMIProcess -ProcessName $ProcessName -Server $Server
if ($process -ne $null)
{
write-verbose ("Stopping process " + $ProcessName + " on " + $Server + " using remote WMI object")
# If the calling function has not passed in the process ID then use the ID returned by the first call to Get-WMIProcess
# The reason we check the processID is that if a service is terminated unexpectedly the service control manager might start the
# service automatically based on the recovery action set for the service.
if ($processId -eq 0 )
{
$processId = $process.ProcessId
}
if ($process.ProcessId -eq $processID -and $process.ProcessId -ne 0)
{
# Keep checking the status of the process
# until it is the expected value or we
# timeout.
#
while (($process -ne $null) -and
($process.ProcessId -eq $processId) -and
($deadline -gt (get-date)))
{
# Terminate, release the WMI handle, wait, then re-query.
[void]$process.Terminate(1)
$process.Dispose()
Start-Sleep -seconds $statusCheckIntervalSeconds
$process = Get-WMIProcess -ProcessName $ProcessName -Server $Server
}
}
}
if ($process -eq $null -or $process.ProcessId -eq 0 -or $process.ProcessId -ne $processId)
{
write-verbose ("Process " + $ProcessName + " on computer " + $Server + " has stopped.")
return $true
}
else
{
write-verbose ("Could not stop process " + $ProcessName + " on computer " + $Server)
$process.Dispose()
return $false
}
}
finally
{
# Always restore the caller's error preference.
$ErrorActionPreference = $errorPref
}
# Unreachable in practice (both branches above return); kept as a safety net.
return $false
}
<#
.DESCRIPTION
Gets a WMI process object on a local/remote computer
.PARAMETER ProcessName
The name of the process without the file extension
.PARAMETER Server
The name of the server
.INPUTS
None. You cannot pipe objects to this function
.OUTPUTS
Returns the first process object that matches given filter
#>
function Get-WMIProcess
{
    # Gets a WMI Win32_Process object for a named process on a local/remote
    # computer. Returns the first matching process object (disposing handles
    # to any additional matches), or $null when no match exists.
    Param(
    [String]
    [ValidateNotNullOrEmpty()]
    $ProcessName,
    [String]
    [ValidateNotNullOrEmpty()]
    $Server
    )
    # Note: When using WMI objects, we need to append '.exe'
    # for process names
    #
    $filter = ("name='" + $ProcessName + ".exe'")
    # FIX: the original body built $filter but never executed the WMI query,
    # so $processList was always $null and the function always returned $null.
    $processList = get-wmiobject win32_process -filter $filter -ComputerName $Server
    if ($processList -is [Array])
    {
        # Multiple instances: keep the first, release the rest.
        $process = $processList[0]
        for($index = 1; $index -lt $processList.Length; $index++)
        {
            # Dispose handles to any other wmi process objects
            #
            $processList[$index].Dispose()
        }
    }
    else
    {
        $process = $processList
    }
    return $process
}
<#
.DESCRIPTION
Translates an event type string into an event type enum.
.PARAMETER Type
String representing the event type to translate
.INPUTS
None. You cannot pipe objects to this function
.OUTPUTS
Enum representing the event type to be used by a monitoring event.
#>
function Get-EventTypeEnum
{
    Param(
        [String]
        [ValidateSet("Error", "Warning", "Information")]
        $Type
    )

    # Map the validated type name onto the corresponding event type constant
    # ($EVENT_TYPE_* are script-level values defined elsewhere in this library).
    # ValidateSet guarantees $Type is one of the three names below.
    switch ($Type)
    {
        "Error"       { return $EVENT_TYPE_ERROR }
        "Warning"     { return $EVENT_TYPE_WARNING }
        "Information" { return $EVENT_TYPE_INFORMATION }
    }
}
<#
.DESCRIPTION
Logs an event in the crimson log. In addition,
if the event is not crimson-specific and monitoring
context is $True, the event is logged to the
application log.
"CI Troubleshooter" as the event source for application
log and "CITS Operational" used for crimson log.
.PARAMETER Arguments
The Arguments object constructed from script args
.PARAMETER EventInfo
An object containing event id, type and message
.PARAMETER Parameters
Array of strings that comprise the parameters for the event
These parameters must match the order and count specified
in the message in the EventInfo parameter
.INPUTS
None. You cannot pipe objects to this function
# NOTE(review): interior of Log-Event; its function header and Param block are
# missing from this copy of the file, so $id/$type/$message/$alertCritical are
# set on earlier, unseen lines.
if ($alertCritical -eq $false)
{
# Instead of suppressing the event completely we just change the EventId to be a warning type
# Warning messages logged by the TS start in the range of 5300 to 5599. All transformed events
# will start from 5400
$type = "Warning"
$id = 5400 + ($id % 100)
$message = $message + $outMessage.Value
}
write-verbose ("Log-Event called with id=" + $id + " type=" + $type + " message=" + $message)
# Replace all parameter substrings (%1, %2, etc)
# with the real parameters
# Start backwards so that %10 is replaced with the 10th parameter and not the first one.
#
# NOTE(review): the loop starts at Length + 1, so its first iteration reads
# $Parameters[Length] (out of range, $null). Harmless unless the message text
# contains that extra %N token, but a start of Length looks intended — confirm.
for ($i = $Parameters.Length + 1; $i -gt 0; $i--)
{
$substring = "%$i"
if ($message.Contains($substring))
{
$message = $message.Replace($substring, $Parameters[$i - 1])
}
}
# Log the event to application log
# only if the id is not a crimson-specific
# event and monitoring context is $True
#
if ($id -lt 6000)
{
if ($Arguments.MonitoringContext -eq $True)
{
# Add the event into the pipeline, at the end of the TS
# call Write-MonitoringEvents to flush the events.
$eventType = Get-EventTypeEnum -Type $type
Add-MonitoringEvent `
-Id $id `
-Type $eventType `
-Message $message `
-InstanceName $Arguments.InstanceName
}
if ($Arguments.WriteApplicationEvent -eq $True)
{
# NOTE(review): $category and $messageAndParams are defined on lines missing
# from this copy — presumably built from $Parameters earlier in Log-Event.
Write-AppLogEntry `
-Server $Arguments.Server `
-EventSource $AppLogSourceName `
-EventId $id `
-Category $category `
-Type $type `
-MessageAndParams $messageAndParams
}
}
else
{
write-verbose "MonitoringContext was false or the event is crimson-only, so skipped logging event"
}
# NOTE(review): this fragment registers an event source and then sets up a
# Wait-ForEvent polling query; the enclosing function headers are missing from
# this copy of the file.
# First, make sure the event source is registered
#
if (!([System.Diagnostics.EventLog]::SourceExists($EventSource, $Server)))
{
write-verbose "Event source $EventSource doesn't exist on $Server. Creating it."
try
{
$creationData =
new-object System.Diagnostics.EventSourceCreationData($EventSource, $LogName)
$creationData.MachineName = $Server
[System.Diagnostics.EventLog]::CreateEventSource($creationData)
}
catch [System.Management.Automation.MethodInvocationException]
{
# Due to race condition between multiple invocations, if event source
# gets registered by one invocation, the other may fail with this error
# trying to re-register the event source. In this case, let's clear
# the error and continue the execution.
#
if ($error[0].InvocationInfo.InvocationName -eq "CreateEventSource")
{
$exception = $error[0].Exception
write-verbose ("Expected exception when event source already exists: $exception")
$error.Clear()
}
}
}
# Create the framework object for EventLog
#
$log = new-object System.Diagnostics.EventLog($LogName, $Server)
$log.Source = $EventSource
# Build an XPath filter matching events with id $EventId from $EventSource
# created after $StartTime (round-trip "o" format, UTC).
$start = $StartTime.ToUniversalTime().ToString("o")
$query = "*[System[TimeCreated[@SystemTime > '$start'] and EventID='$EventId' and Provider[@Name='$EventSource']]]"
$deadline = (Get-Date) + $Timeout
write-verbose ("Wait-ForEvent deadline: " + $deadline)
write-verbose ("Wait-ForEvent is using query: $query")
$found = $False
# Keep checking for the event until at least
# one is found or timeout expires
while ($deadline -gt (Get-Date))
{
try
{
# -ErrorAction Stop turns "no matching events" into a terminating error,
# which the catch below treats as "not found yet".
$events = get-winevent -LogName $LogName -ComputerName $Server -FilterXPath $query -ErrorAction "Stop"
# we won't reach here if the above
# didn't find any events
#
$found = $True
write-verbose "Found the following events:"
if ($events -is [Array])
{
foreach ($event in $events)
{
write-verbose $event.Message
}
}
else
{
write-verbose $events.Message
}
}
catch
{
$exception = $error[0].Exception
write-verbose ("Error retrieving requested event from event log: $exception")
}
if (!($found))
{
# Poll every 5 seconds until the deadline computed above expires.
Start-Sleep -Seconds 5
write-verbose "Did not find event. checking again.."
}
else
{
break
}
}
<#
.DESCRIPTION
Checks if a particular event threshold is reached.
.PARAMETER EventInfo
An object containing event id, type and message
.PARAMETER Parameters
Array of strings that comprise the parameters for the event
These parameters must match the order and count specified
in the message in the EventInfo parameter
.PARAMETER Message
Reference parameter which is used to get more details about the actual and expected count
.INPUTS
None. You cannot pipe objects to this function
.OUTPUTS
Returns true if the Event is critical. False if its not critical and Null if its not to be considered
#>
function Check-EventThresholdReached
{
    Param(
        [string]
        $Server = $env:ComputerName,

        [int]
        [ValidateNotNull()]
        $EventId,

        [String[]]
        $Parameters,

        [REF]
        $Message
    )

    foreach ($event in $TSRetrySettings.Keys)
    {
        # Check if the user defined the events correctly in the CiTSLibrary.ps1 or ignore that entry
        #
        if ($TSRetrySettings[$event].Length -ge 2)
        {
            # Find a matching entry
            #
            if ($TSRetrySettings[$event][0] -ne $EventId)
            {
                continue
            }
            [int]$retryCount = $TSRetrySettings[$event][1]
            [string]$optionalComponent = 'CurrentCount'
            if ($TSRetrySettings[$event].Length -gt 2)
            {
                # Read the optional component if specified from the replacement strings.
                # FIX: the optional position is the THIRD element of the settings
                # array (index 2), per the $TSRetrySettings contract; the original
                # read index 3, which is past the end of the array and always
                # coerced to parameter index 0.
                [string]$optionalComponent = $Parameters[[int]$TSRetrySettings[$event][2]]
            }
            try
            {
                # Check if the Parent regkey is present. If not create it
                #
                $defaultValue = Get-RegKeyValue -Server $server -Path $troubleshooterRegKey -Name ""
                if ($defaultValue -eq $null)
                {
                    Set-RegKeyValue -Server $Server -Path $troubleshooterRegKey -Name "" -Value "."
                }
                # Get the retry count of the alert
                #
                $regkeyPath = $troubleshooterRegKey + $EventId.ToString()
                [int]$currentCountValue = 1
                $currentCountValueKey = Get-RegKeyValue -Server $server -Path $regKeyPath -Name $optionalComponent
                if ($currentCountValueKey -ne $null)
                {
                    # Increment the counter value by 1
                    #
                    [int]$currentCountValue = $currentCountValueKey + 1
                }
                # Set the current counter value in the registry
                #
                Set-RegKeyValue -Server $Server -Path $regKeyPath -Name $optionalComponent -Value ([int]$currentCountValue)
                # Verify if the counter was set correctly.
                #
                $currentCountValueKey = Get-RegKeyValue -Server $server -Path $regKeyPath -Name $optionalComponent
                if (($currentCountValueKey -eq $null) -or ([int]$currentCountValueKey -ne $currentCountValue))
                {
                    # Could not save the registry setting.
                    # We should fail safe so we return the alert as critical
                    #
                    return $true;
                }
                # Critical only once the persisted failure count reaches the configured retry count.
                $isCritical = $currentCountValue -ge $retryCount
                if ($isCritical -eq $false)
                {
                    # Message containing additional details why the event was suppressed
                    #
                    $Message.Value = " Converting error to warning because the current error count '$currentCountValue' is below the critical retry count of '$retryCount'."
                }
                return $isCritical
            }
            catch
            {
                # Handle the error and do nothing. But we fail safe. So return the alert as critical (Existing behavior)
                Write-Verbose $error[0]
                $message.Value = $error[0].ToString()
                return $true
            }
        }
    }

    # No retry setting matched this event id: signal "not subject to thresholding".
    return $null
}
<#
.DESCRIPTION
Resets the error count for a particular event
.PARAMETER EventId
An object containing event id, type and message
.PARAMETER OptionalComponent
Optional component associated with the event
.INPUTS
None. You cannot pipe objects to this function
.OUTPUTS
Returns true if the Event is not critical.
#>
function Reset-EventRetryCounter
{
Param(
[string]
$Server = $env:ComputerName,
[int]
$EventId,
[String]
$OptionalComponent
)
# NOTE(review): $regkeyPath and $componentsToReset are used below but never
# initialized here, and the catch at the end has no matching try. Lines —
# presumably "$regkeyPath = $troubleshooterRegKey + $EventId", an empty-array
# initializer for $componentsToReset, and a "try {" — appear to be missing
# from this copy of the file.
#If the optional component specified in null or string.Empty we update all the values in the reg key
if ([System.String]::IsNullOrEmpty($OptionalComponent))
{
foreach($keyValueName in (Get-RegKeyValueNames -Server $server -Path $regkeyPath))
{
$componentsToReset += $keyValueName
}
}
else
{
$componentsToReset += $OptionalComponent;
}
# Zero out the persisted retry counter for every selected component.
foreach($component in $componentsToReset)
{
Set-RegKeyValue -Server $Server -Path $regKeyPath -Name $component -Value ([int]0)
}
}
catch
{
# Do nothing. Just write to the debug trace
Write-Verbose $error[0].ToString()
}
}
<#
.DESCRIPTION
Gets the list of Bad IFilters that caused problems on the servers. Reads the event log to extract
the Filter GUID
.PARAMETER Server
The simple NETBIOS name of mailbox server.
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
List of Bad Ifilters that are currently enabled
#>
function Get-BadIfilterGuidsFromEventLog
{
Param(
[string]
$Server = $env:ComputerName
)
# NOTE(review): the body of this function — reading the event log and building
# the GUID-to-count table consumed by Get-BadIFilters — is missing from this
# copy of the file, along with the function's closing brace.
<#
.DESCRIPTION
Gets the list of IFilters that should be enabled by the TS, that are currently disabled
.PARAMETER Server
The simple NETBIOS name of mailbox server.
.PARAMETER IsRTMServer
Bool indicating if the server is an RTM machine.
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
List of Bad Ifilters that are currently enabled
#>
function Get-IFiltersToEnable
{
    Param(
        [string]
        $Server = $env:ComputerName,

        [bool]
        $IsRTMServer = $false
    )

    # Collect the registry subkey names of IFilters that carry the "disabled"
    # suffix and are therefore candidates for re-enabling.
    write-verbose "Get-IFiltersToEnable $Server"
    $returnValue = @()

    # For E14RTM builds we disable the default Office 2007 IFilters because of a known bug and do not enable them
    #
    if ($IsRTMServer -eq $false)
    {
        $outMessage = ""

        # Scan every filter subkey for the disabled-name suffix (case-insensitive).
        foreach ($subKey in @(Get-RegKeySubKeyNames -Server $server -Path $MsSearchFilterPath))
        {
            if ($subKey.ToLower().Contains($disabledIFilterSuffix.ToLower()))
            {
                Write-Verbose "Found Ifilter $subKey that should be enabled"
                $returnValue += $subKey
            }
        }

        if ($returnValue.Count -gt 0)
        {
            # Found Disabled IFilters. Now check if we should enable them
            # If not then return an empty array
            #
            $thresholdReached = Check-EventThresholdReached -Server $server -EventId $msftesqlBadIFilterEventIdEventId -Message ([REF]$outMessage)
            if ($thresholdReached -eq $false)
            {
                $returnValue = @()
            }
        }
    }

    return $returnValue
}
<#
.DESCRIPTION
Gets the list of Bad IFilters that are currently enabled on a server
.PARAMETER Server
The simple NETBIOS name of mailbox server.
.PARAMETER IsRTMServer
Bool indicating if the server is an RTM machine
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
List of Bad Ifilters that are currently enabled
#>
function Get-BadIFilters
{
    Param(
        [string]
        $Server = $env:ComputerName,

        [bool]
        $IsRTMServer = $false
    )

    # Collect the registry subkey names of IFilters considered "bad":
    # post-RTM, those whose GUID was reported in the event log more often than
    # the threshold; on RTM builds, the fixed known-bad name list.
    write-verbose "Get-BadIFilters $Server"
    $returnValue = @()

    # For E14RTM builds we disable the default Office 2007 IFilters because of a known bug
    #
    if ($IsRTMServer -eq $false)
    {
        $badIFilterGuids = Get-BadIfilterGuidsFromEventLog -Server $server
        if ($badIFilterGuids.Count -gt 0)
        {
            foreach ($subKey in @(Get-RegKeySubKeyNames -Server $server -Path $MsSearchFilterPath))
            {
                # The subkey's default (unnamed) registry value holds the filter's GUID.
                $currentFilterGuid = Get-RegKeyValue -Server $server -Path "$MsSearchFilterPath\$subKey" -Name $null
                foreach ($badIfilterguid in $badIFilterGuids.Keys)
                {
                    # Flag only GUIDs reported more often than the event threshold
                    # that also match this subkey's GUID (case-insensitive).
                    if (($badIFilterGuids[$badIfilterguid] -gt $msftesqlBadIFilterEventThreshold) -and
                        ($badIfilterguid -ieq $currentFilterGuid))
                    {
                        Write-Verbose "Found bad IFilter: $subKey"
                        $returnValue += $subKey
                    }
                }
            }
        }
    }
    else
    {
        # RTM build: match subkeys against the static known-bad name list.
        foreach ($subKey in @(Get-RegKeySubKeyNames -Server $server -Path $MsSearchFilterPath))
        {
            foreach ($badIfilterName in $badIfilterNames)
            {
                if ($badIfilterName -ieq $subKey)
                {
                    Write-Verbose "Found bad IFilter: $subKey"
                    $returnValue += $subKey
                }
            }
        }
    }

    return $returnValue
}
<#
.DESCRIPTION
Gets if the current server build is an RTM server
.PARAMETER Server
The simple NETBIOS name of mailbox server.
.INPUTS
None. You cannot pipe objects to this function.
.OUTPUTS
True if the current server build is an RTM server
#>
function Get-IsRTMServer
{
# NOTE(review): hardcoded — every server is reported as RTM. The help above
# describes a build check; confirm whether the detection logic was removed
# deliberately or lost from this copy of the file.
return $true
}
# NOTE(review): orphaned catch — the matching try (the body of an
# Enable-IFilter-style routine that defines $filterName) is missing from this
# copy of the file.
catch
{
# Do nothing. Enabling an IFilter is not a critical operation. So if it fails we just log the error message and continue
#
$message=($error[0].Exception.ToString() + $error[0].InvocationInfo.PositionMessage)
write-verbose ("Caught Exception: $message")
Log-Event `
-Arguments $Arguments `
-EventInfo $LogEntries.EnablingIFilterFailed `
-Parameters @($filterName, $message)
return $false
}
}
<#
.DESCRIPTION
Once an IFilter is Enabled/Disabled by the troubleshooter. The exchange search service will attempt to re-process every document in the retry table
to pick up the IFilter changes. To avoid this situation we delete the NewIFilterMonitor state from the registry
.PARAMETER Server
The simple NETBIOS name of mailbox server.
#>
<#
.DESCRIPTION
Loads Exchange Powershell Snapin,
if not already loaded.
.INPUTS
None. You cannot pipe objects to this function
.OUTPUTS
None.
#>
function Load-ExchangeSnapin
{
    # Register the Exchange 2010 management snap-in unless it is already loaded.
    $snapinName = "Microsoft.Exchange.Management.PowerShell.E2010"
    $loaded = Get-PSSnapin $snapinName -ErrorAction:SilentlyContinue
    if ($loaded -eq $null)
    {
        Add-PSSnapin $snapinName
    }
}
#
# Get the default EN strings only.
# This should also avoid the problem of mismatched versions between the MP and the installed exchange build.
# In those cases the TS would pick up the old localized strings from the exchange directory versus the latest strings from
# the deployed MP version.
#
Import-LocalizedData -BindingVariable LocStrings -FileName CITSLibrary.strings.psd1
# Dictionary containing errors which are retriable
# The first element is the eventId of the message
# The second element is the max retry count before the TS will log the error in the event log
# The third element is optional and denotes the position of the 'Component' in the eventLog parameter strings
#
$TSRetrySettings = @{
# TS failed are usually transient errors because of AD issues. Log the error message only if the TS fails
# twice consecutively
TSFailed=($LogEntries.TSFailed[0], 2)
# Once a bad Ifilter is found the TS should keep it disabled for 6 runs (6 Hours because the TS runs every hour)
# of the troubleshooter
#
BadIFiltersReset=($msftesqlBadIFilterEventIdEventId, 6)
# Reseed failed events are handled per database. If the troubleshooter logs two reseed
# failed events for the same MDB the Event will be logged as an error by the troubleshooter
# The third element in the array specifies that the databasename should be picked up from
# index position 0
ReseedFailed=($LogEntries.ReseedFailure[0], 2, 0)
# Reseed failed because the active catalog copy is corrupt. On the first occurrence of this problem
# HA should cause a failover. Only alert if the catalog fails to reseed twice for the same database.
# The third element in the array specifies that the databasename should be picked up from
# index position 0
ActiveCatalogCopyCorrupt=($LogEntries.ActiveCatalogCopyCorrupt[0], 2, 0)
# If the number of mailboxes left to crawl remains the same for $stallDuringCrawlThreshold consecutive runs ($stallDuringCrawlThreshold hours) assume that the service is stalled
# and restart it
# NOTE(review): $stallDuringCrawlThreshold is used here both as the event id
# and as the retry count, and it is re-read from the registry only AFTER this
# table is built (see the try block below) — so a registry override does not
# affect this entry. Confirm this ordering is intentional.
MailboxCrawlStalled=($stallDuringCrawlThreshold, $stallDuringCrawlThreshold, 0)
# If the TS reseeds a catalog every time in consecutive runs then raise an alert
#
CatalogReseedLoop=($LogEntries.CatalogReseedLoop[0], 3, 0)
# If the TS restarts the service every time in consecutive runs then raise an alert
#
ServiceRestartAttempt=($LogEntries.ServiceRestartAttempt[0], 3)
}
# List of IFilters that need to be disabled on a server
#
$badIfilterNames=@(
".docx"
".pptx"
".xlsx"
".xml"
)
# Read optional registry overrides for the troubleshooter's tuning knobs;
# each falls back to the default value assigned earlier in the library.
try
{
$affinityValue = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'MsftesqlProcessorAffinityCount' -DefaultValue $affinityValue)
$msftefdAffinityValue = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'MsftefdProcessorAffinityCount' -DefaultValue $msftefdAffinityValue)
$disableBadIFilters = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'DisableBadIFilters' -DefaultValue $disableBadIFilters)
$retryItemsThreshold = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'RetryItemsThreshold' -DefaultValue $retryItemsThreshold)
$staleThresholdInSeconds = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'StaleThresholdInSeconds' -DefaultValue $staleThresholdInSeconds)
$stallThresholdInSeconds = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'StallThresholdInSeconds' -DefaultValue $stallThresholdInSeconds)
$maxPercentageCatalogSize = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'MaxPercentageCatalogSize' -DefaultValue $maxPercentageCatalogSize)
$stallDuringCrawlThreshold = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'StallDuringCrawlThreshold' -DefaultValue $stallDuringCrawlThreshold)
$backlogThresholdInSeconds = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'BacklogThresholdInSeconds' -DefaultValue $backlogThresholdInSeconds)
$minRetryTableIssueThreshold = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'MinRetryTableIssueThreshold' -DefaultValue $minRetryTableIssueThreshold)
$MaxMsftefdMemoryConsumption = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'MaxMsftefdMemoryConsumption' -DefaultValue $MaxMsftefdMemoryConsumption)
$ExtendedStallThresholdInSeconds = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'ExtendedStallThresholdInSeconds' -DefaultValue $extendedStallThresholdInSeconds)
$msftesqlBadIFilterEventThreshold = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'MsftesqlBadIFilterEventThreshold' -DefaultValue $MsftesqlBadIFilterEventThreshold)
$badIFilterCheckIntervalInMinutes = [int](Get-RegKeyValue -Path $troubleshooterRegKey -Name 'BadIFilterCheckIntervalInMinutes' -DefaultValue $badIFilterCheckIntervalInMinutes)
$disableRecoveryForDatabases = (Get-RegKeyValue -Path $troubleshooterRegKey -Name 'DisableRecoveryForDatabases' -DefaultValue "").ToString()
$disableRecoveryForDatabasesList = $disableRecoveryForDatabases.Split((@(',',';')), [StringSplitOptions]::RemoveEmptyEntries)
$disabledIFilterSuffix = Get-RegKeyValue -Path $troubleshooterRegKey -Name 'DisabledIFilterSuffix' -DefaultValue $disabledIFilterSuffix
}
catch
{
# Do nothing
# (Overrides are best-effort: on any registry failure the defaults remain in effect.)
}
</LibraryScriptBody>
<StringsFileName>CITSLibrary.strings.psd1</StringsFileName>
<StringsFileBody>
#####################################################################################
#
#
# THIS FILE EXISTS IN TWO LOCATIONS. MAKE SURE THAT BOTH COPIES OF THE FILE ARE UPDATED
# WHEN EITHER COPY IS CHANGED.
# <DEPOT>\Sources\dev\management\src\management\scripts\troubleshooter\CITSLibrary.strings.psd1
# <DEPOT>\Sources\dev\mgmtpack\src\HealthMainfests\scripts\troubleshooter\CITSLibrary.strings.psd1
# The management version of the library gets deployed during exchange setup and the
# mgmtpack version of the library only gets deployed when the management pack is installed
#
######################################################################################
ConvertFrom-StringData @'
###PSLOC
# English strings
AllNotAllowedForResolve = When 'Resolve' is specified for the Action parameter, 'All' is not allowed for the Symptom parameter.
TroubleshooterFailed = The troubleshooter failed with error:
RegistryOpenError = Could not open the registry on server:
RegistryReadError = Couldn't read registry key:
TimeoutWaitingForEvent = Event couldn't be found within the timeout period:
TimeoutWaitingForProcessToStop = The process didn't stop within the time-out period:
TimeoutZeroOrNegative = Timeout reached zero or negative value entering function:
# Event Log strings
# Logged both to application and crimson logs
#
TSStarted=The troubleshooter started successfully. Version: %1
DetectedDeadlock=Detected search service deadlock.
DetectedCatalogCorruption=Detected catalog corruption for database %1
DetectedIndexingStall=Detected indexing stall for database %1. Stall counter value %2. Stall threshold value is %3 seconds
DetectedIndexingStallExtendedPeriod=Detected indexing stall for database %1 for an extended duration. Stall counter value %2. Stall threshold value is %3 seconds
DetectedIndexingBacklog=Indexing backlog reached a critical limit of %2 hours or the number of items in the retry queue is greater than %3 for database: %1
DetectedIndexingBacklogOrLargeRetryQueuesOnMultipleDatabases=Indexing backlog reached a critical limit of %2 hours or the number of items in the retry queue is greater than %3 for one or more databases: %1
TSFailed=The troubleshooter failed with exception %1.
TSSuccess=The troubleshooter finished successfully.
RestartSuccess=Restart of search services succeeded.
RestartFailure=Search services failed to restart. Reason: %1. Current server status: %2
DetectedNoIssues=The troubleshooter didn't find any issues for any catalog.
CatalogHasNoIssues=The troubleshooter didn't find any catalog issues for database %1.
DetectedSameSymptomTooManyTimes=The troubleshooter detected the symptom %1 %2 times in the past %3 hours for catalog %4. This exceeded the allowed limit for failures.
ReseedSuccess=Reseeding succeeded for the catalog of database %1.
ReseedFailure=Reseeding failed for the content index catalog of mailbox database %1. Reason: %2. Database copy states: %3
ActiveCatalogCopyCorrupt=The active catalog of mailbox database '%1' is corrupt. Database copy states: %2
AnotherInstanceRunning=Another instance of the troubleshooter is already running on this machine. Two or more instances cannot be run simultaneously.
MsftefdMemoryUsageHigh=The memory usage of the Msftefd processes is above the set limit of %1 Mb. Current value %2 Mb. Process instances %3
MsftefdMemoryUsageHighWithCrashDump=The memory usage of the Msftefd processes is above the set limit of %1 Mb. Current value %2 Mb. A crash dump of the process has also been taken.
FoundBadIFiltersEnabled=The troubleshooter found the following %1 Filters which should be disabled.
EnablingIFilterFailed=The troubleshooter failed to enable IFilter %1. Reason %2
IFiltersToEnable=The troubleshooter found the following IFilters to enable: %1
CatalogSizeGreaterThanExpectedDBLimit=The percentage catalog size for mailbox database %1 is greater than the threshold value of %2. Current value %3
CatalogReseedLoop=The catalog for mailbox database %1 has been reseeded %2 consecutive times by the CI troubleshooter.
TroubleshooterDisabled=The troubleshooter has been disabled on server %1
ItemsStuckInRetryQueue=The search service is not processing items in the retry queue for mailbox database %1. Documents Processed since last run %2.
ServiceRestartNotNeeded=The CI troubleshooter did not detect any issue that would require a restart of the search service.
ServiceRestartAttempt=CI troubleshooter exchange search service restart attempt %1.
CatalogRecoveryDisabled=Recovery actions for search catalog %1 has been disabled.
RetryQueuesStagnant=The search service is not processing items in the retry queue for mailbox databases [CatalogName (AgeOfLastNotification, NumberOfItemsInRetryQueue, NumberOfRetryQueueItemsProcessed)]: %1.
#
# Events logged only to crimson (windows) event log
#
TSDetectionStarted=The troubleshooter started detection.
TSDetectionFinished=The troubleshooter finished detection.
TSDetectionFailed=The troubleshooter failed during detection.