-
Notifications
You must be signed in to change notification settings - Fork 52
[BED-5972] Log timeout-wrapped execution time stats, add adaptive timeout logic #222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
20 commits
Select commit
Hold shift + click to select a range
b7a70cc
feat: Log timeout-wrapped execution time stats, add adaptive timeout …
definitelynotagoblin e8e7bbe
feat: Polish out AdaptiveTimeout, replace instances of ExecuteWithTim…
definitelynotagoblin 8712f11
feat: Finish replacing Timeout.ExecuteWithTimeout calls; polish up Ad…
definitelynotagoblin 7800926
chore: Add AdaptiveTimeout unit tests
definitelynotagoblin 74ba553
chore: Removing now-unused timeout params
definitelynotagoblin 475e986
chore: Remove obsolete timeout tests - should now be covered by Adapt…
definitelynotagoblin 0cbd7bf
fix: More aggressive time spiking for TimeSpikeSafetyValve test
definitelynotagoblin 5bda0ff
chore: Address some coderabbit comments
definitelynotagoblin 7694fd6
fix: TimeSpikeSafetyValveTest - I played myself
definitelynotagoblin 8de6101
fix: This time spike safety valve is too flexible
definitelynotagoblin 76f90db
feat: Wrapping ldap queries in AdaptiveTimeout for analytics, experim…
definitelynotagoblin 439a6a7
chore: Simplify AdaptiveTimeout constructor with default params; log …
definitelynotagoblin 54cadda
chore: Polishing up
definitelynotagoblin 9480788
fix: DCLdapProcessorTest - specify positive-value timeout args for Ld…
definitelynotagoblin a7c7f50
fix: It.Is can't be used outside of Moq setups - these were always re…
definitelynotagoblin df2d80e
feat: Log time spike events when max timeout is in effect
definitelynotagoblin 36a8120
chore: Just adding a little more transparency to Timeout fail results…
definitelynotagoblin 74def33
fix: Timeout tests check response message StartsWith 'Timeout' instea…
definitelynotagoblin 3e9ed5f
chore: Let's add an UseAdaptiveTimeout() to simplify things a bit, cl…
definitelynotagoblin d39465c
feat: Log full fail message for CheckPort timeouts
definitelynotagoblin File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,226 @@ | ||
| using System; | ||
| using System.Threading; | ||
| using System.Threading.Tasks; | ||
| using Microsoft.Extensions.Logging; | ||
| using SharpHoundRPC.NetAPINative; | ||
|
|
||
| namespace SharpHoundCommonLib; | ||
|
|
||
| public sealed class AdaptiveTimeout : IDisposable { | ||
| private readonly ExecutionTimeSampler _sampler; | ||
| private readonly ILogger _log; | ||
| private readonly TimeSpan _maxTimeout; | ||
| private readonly bool _useAdaptiveTimeout; | ||
| private readonly int _minSamplesForAdaptiveTimeout; | ||
| private int _clearSamplesDecay; | ||
| private const int TimeSpikePenalty = 2; | ||
| private const int TimeSpikeForgiveness = 1; | ||
| private const int ClearSamplesThreshold = 5; | ||
| private const int StdDevMultiplier = 5; | ||
|
|
||
| public AdaptiveTimeout(TimeSpan maxTimeout, ILogger log, int sampleCount = 100, int logFrequency = 1000, int minSamplesForAdaptiveTimeout = 30, bool useAdaptiveTimeout = true) { | ||
| if (maxTimeout <= TimeSpan.Zero) | ||
| throw new ArgumentException("maxTimeout must be positive", nameof(maxTimeout)); | ||
| if (sampleCount <= 0) | ||
| throw new ArgumentException("sampleCount must be positive", nameof(sampleCount)); | ||
| if (logFrequency <= 0) | ||
| throw new ArgumentException("logFrequency must be positive", nameof(logFrequency)); | ||
| if (minSamplesForAdaptiveTimeout <= 0) | ||
| throw new ArgumentException("minSamplesForAdaptiveTimeout must be positive", nameof(minSamplesForAdaptiveTimeout)); | ||
| if (log == null) | ||
| throw new ArgumentNullException(nameof(log)); | ||
|
|
||
| _sampler = new ExecutionTimeSampler(log, sampleCount, logFrequency); | ||
| _log = log; | ||
| _maxTimeout = maxTimeout; | ||
| _useAdaptiveTimeout = useAdaptiveTimeout; | ||
| _minSamplesForAdaptiveTimeout = minSamplesForAdaptiveTimeout; | ||
| } | ||
|
|
||
| public void ClearSamples() { | ||
| _clearSamplesDecay = 0; | ||
| _sampler.ClearSamples(); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller. | ||
| /// Logs aggregate execution time data. | ||
| /// Manages its own timeout. | ||
| /// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached. | ||
| /// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better. | ||
| /// DO NOT use a single AdaptiveTimeout for multiple functions. | ||
| /// </summary> | ||
| /// <typeparam name="T"></typeparam> | ||
| /// <param name="func"></param> | ||
| /// <param name="parentToken"></param> | ||
| /// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns> | ||
| public async Task<Result<T>> ExecuteWithTimeout<T>(Func<CancellationToken, T> func, CancellationToken parentToken = default) { | ||
| var result = await Timeout.ExecuteWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken); | ||
| TimeSpikeSafetyValve(result.IsSuccess); | ||
| return result; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller. | ||
| /// Logs aggregate execution time data. | ||
| /// Manages its own timeout. | ||
| /// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached. | ||
| /// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better. | ||
| /// DO NOT use a single AdaptiveTimeout for multiple functions. | ||
| /// </summary> | ||
| /// <param name="func"></param> | ||
| /// <param name="parentToken"></param> | ||
| /// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns> | ||
| public async Task<Result> ExecuteWithTimeout(Action<CancellationToken> func, CancellationToken parentToken = default) { | ||
| var result = await Timeout.ExecuteWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken); | ||
| TimeSpikeSafetyValve(result.IsSuccess); | ||
| return result; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller. | ||
| /// Logs aggregate execution time data. | ||
| /// Manages its own timeout. | ||
| /// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached. | ||
| /// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better. | ||
| /// DO NOT use a single AdaptiveTimeout for multiple functions. | ||
| /// </summary> | ||
| /// <typeparam name="T"></typeparam> | ||
| /// <param name="func"></param> | ||
| /// <param name="parentToken"></param> | ||
| /// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns> | ||
| public async Task<Result<T>> ExecuteWithTimeout<T>(Func<CancellationToken, Task<T>> func, CancellationToken parentToken = default) { | ||
| var result = await Timeout.ExecuteWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken); | ||
| TimeSpikeSafetyValve(result.IsSuccess); | ||
| return result; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller. | ||
| /// Logs aggregate execution time data. | ||
| /// Manages its own timeout. | ||
| /// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached. | ||
| /// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better. | ||
| /// DO NOT use a single AdaptiveTimeout for multiple functions. | ||
| /// </summary> | ||
| /// <param name="func"></param> | ||
| /// <param name="parentToken"></param> | ||
| /// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns> | ||
| public async Task<Result> ExecuteWithTimeout(Func<CancellationToken, Task> func, CancellationToken parentToken = default) { | ||
| var result = await Timeout.ExecuteWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken); | ||
| TimeSpikeSafetyValve(result.IsSuccess); | ||
| return result; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller. | ||
| /// Logs aggregate execution time data. | ||
| /// Manages its own timeout. | ||
| /// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached. | ||
| /// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better. | ||
| /// DO NOT use a single AdaptiveTimeout for multiple functions. | ||
| /// </summary> | ||
| /// <typeparam name="T"></typeparam> | ||
| /// <param name="func"></param> | ||
| /// <param name="parentToken"></param> | ||
| /// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns> | ||
| public async Task<NetAPIResult<T>> ExecuteNetAPIWithTimeout<T>(Func<CancellationToken, NetAPIResult<T>> func, CancellationToken parentToken = default) { | ||
| var result = await Timeout.ExecuteNetAPIWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken); | ||
| TimeSpikeSafetyValve(result.IsSuccess); | ||
| return result; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller. | ||
| /// Logs aggregate execution time data. | ||
| /// Manages its own timeout. | ||
| /// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached. | ||
| /// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better. | ||
| /// DO NOT use a single AdaptiveTimeout for multiple functions. | ||
| /// </summary> | ||
| /// <typeparam name="T"></typeparam> | ||
| /// <param name="func"></param> | ||
| /// <param name="parentToken"></param> | ||
| /// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns> | ||
| public async Task<SharpHoundRPC.Result<T>> ExecuteRPCWithTimeout<T>(Func<CancellationToken, SharpHoundRPC.Result<T>> func, CancellationToken parentToken = default) { | ||
| var result = await Timeout.ExecuteRPCWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken); | ||
| TimeSpikeSafetyValve(result.IsSuccess); | ||
| return result; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Ignores the result of a function if it runs longer than a budgeted time, unblocking the caller. | ||
| /// Logs aggregate execution time data. | ||
| /// Manages its own timeout. | ||
| /// A cancellation token is passed to the executing function so it may exit cleanly if timeout is reached. | ||
| /// Please don't wrap a cached function in this timeout if adaptive timeouts enabled, normal distributions are better. | ||
| /// DO NOT use a single AdaptiveTimeout for multiple functions. | ||
| /// </summary> | ||
| /// <typeparam name="T"></typeparam> | ||
| /// <param name="func"></param> | ||
| /// <param name="parentToken"></param> | ||
| /// <returns>Returns a Fail result if a task runs longer than its budgeted time.</returns> | ||
| public async Task<SharpHoundRPC.Result<T>> ExecuteRPCWithTimeout<T>(Func<CancellationToken, Task<SharpHoundRPC.Result<T>>> func, CancellationToken parentToken = default) { | ||
| var result = await Timeout.ExecuteRPCWithTimeout(GetAdaptiveTimeout(), (timeoutToken) => _sampler.SampleExecutionTime(() => func(timeoutToken)), parentToken); | ||
| TimeSpikeSafetyValve(result.IsSuccess); | ||
| return result; | ||
| } | ||
|
|
||
| public void Dispose() { | ||
| _sampler.Dispose(); | ||
| } | ||
|
|
||
| // Within 5 standard deviations will have a conservative lower bound of catching 96% of executions (1 - 1/5^2), | ||
| // regardless of sample shape | ||
| // so long as those samples are independent and identically distributed | ||
| // (and if they're not, our TimeSpikeSafetyValve should provide us with some adaptability) | ||
| // But the effective collection rate is probably closer to 98+% | ||
| // (in part because we don't need to filter out "too fast" outliers) | ||
| // But we'll cap at configured maximum timeout | ||
| // https://modelassist.epixanalytics.com/space/EA/26574957/Tchebysheffs+Rule | ||
| // https://en.wikipedia.org/wiki/Independent_and_identically_distributed_random_variables | ||
| public TimeSpan GetAdaptiveTimeout() { | ||
| if (!UseAdaptiveTimeout()) | ||
| return _maxTimeout; | ||
|
|
||
| try { | ||
| var stdDev = _sampler.StandardDeviation(); | ||
| var adaptiveTimeoutMs = _sampler.Average() + (stdDev * StdDevMultiplier); | ||
| var cappedTimeoutMS = Math.Min(adaptiveTimeoutMs, _maxTimeout.TotalMilliseconds); | ||
| return TimeSpan.FromMilliseconds(cappedTimeoutMS); | ||
| } | ||
| catch (Exception ex) { | ||
| _log.LogError(ex, "Error calculating adaptive timeout, defaulting to max timeout."); | ||
| return _maxTimeout; | ||
| } | ||
| } | ||
|
|
||
| // AdaptiveTimeout will not respond well to rapid spikes in execution time | ||
| // imagine the wrapped function very regularly executes in 10ms | ||
| // then suddenly starts taking a regular 100ms | ||
| // this is fine (if it fits in our max timeout budget), and we shouldn't block | ||
| // so we should create a safety valve in case this happens to reset our data samples | ||
| private void TimeSpikeSafetyValve(bool isSuccess) { | ||
| if (isSuccess) { | ||
| _clearSamplesDecay -= TimeSpikeForgiveness; | ||
| _clearSamplesDecay = Math.Max(0, _clearSamplesDecay); | ||
| } | ||
| else | ||
| _clearSamplesDecay += TimeSpikePenalty; | ||
|
|
||
|
|
||
| if (_clearSamplesDecay >= ClearSamplesThreshold) { | ||
| if (UseAdaptiveTimeout()) { | ||
| ClearSamples(); | ||
| _log.LogTrace("Time spike safety valve event at timeout {CurrentTimeout}.", GetAdaptiveTimeout()); | ||
| } | ||
| else { | ||
| _log.LogWarning("This call is frequently running over the maximum allowed timeout of {MaxTimeout}.", _maxTimeout); | ||
definitelynotagoblin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
| } | ||
|
|
||
| private bool UseAdaptiveTimeout() { | ||
| return _useAdaptiveTimeout && _sampler.Count >= _minSamplesForAdaptiveTimeout; | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| using System; | ||
| using System.Collections.Concurrent; | ||
| using System.Diagnostics; | ||
| using System.Linq; | ||
| using System.Threading.Tasks; | ||
| using Microsoft.Extensions.Logging; | ||
|
|
||
| namespace SharpHoundCommonLib; | ||
|
|
||
| /// <summary> | ||
| /// Holds a rolling sample of execution times on a function, providing and logging data aggregates. | ||
| /// </summary> | ||
| public class ExecutionTimeSampler : IDisposable { | ||
| private readonly ILogger _log; | ||
| private readonly int _sampleCount; | ||
| private readonly int _logFrequency; | ||
| private int _samplesSinceLastLog; | ||
| private ConcurrentQueue<double> _samples; | ||
|
|
||
| public int Count => _samples.Count; | ||
|
|
||
| public ExecutionTimeSampler(ILogger log, int sampleCount, int logFrequency) { | ||
| _log = log; | ||
| _sampleCount = sampleCount; | ||
| _logFrequency = logFrequency; | ||
| _samplesSinceLastLog = 0; | ||
| _samples = new ConcurrentQueue<double>(); | ||
| } | ||
|
|
||
| public void ClearSamples() { | ||
| Log(flush: true); | ||
| _samples = new ConcurrentQueue<double>(); | ||
| } | ||
|
|
||
| public double StandardDeviation() { | ||
| double average = _samples.Average(); | ||
| double sumOfSquaresOfDifferences = _samples.Select(val => (val - average) * (val - average)).Sum(); | ||
| double stddiv = Math.Sqrt(sumOfSquaresOfDifferences / _samples.Count); | ||
|
|
||
| return stddiv; | ||
definitelynotagoblin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| public double Average() => _samples.Average(); | ||
definitelynotagoblin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| public async Task<T> SampleExecutionTime<T>(Func<Task<T>> func) { | ||
| var stopwatch = Stopwatch.StartNew(); | ||
| var result = await func.Invoke(); | ||
| stopwatch.Stop(); | ||
| AddTimeSample(stopwatch.Elapsed); | ||
|
|
||
| return result; | ||
| } | ||
|
|
||
| public async Task SampleExecutionTime(Func<Task> func) { | ||
| var stopwatch = Stopwatch.StartNew(); | ||
| await func.Invoke(); | ||
| stopwatch.Stop(); | ||
| AddTimeSample(stopwatch.Elapsed); | ||
| } | ||
|
|
||
| public T SampleExecutionTime<T>(Func<T> func) { | ||
| var stopwatch = Stopwatch.StartNew(); | ||
| var result = func.Invoke(); | ||
| stopwatch.Stop(); | ||
| AddTimeSample(stopwatch.Elapsed); | ||
|
|
||
| return result; | ||
| } | ||
|
|
||
| public void SampleExecutionTime(Action func) { | ||
| var stopwatch = Stopwatch.StartNew(); | ||
| func.Invoke(); | ||
| stopwatch.Stop(); | ||
| AddTimeSample(stopwatch.Elapsed); | ||
| } | ||
|
|
||
| public void Dispose() { | ||
| Log(flush: true); | ||
| } | ||
|
|
||
| private void AddTimeSample(TimeSpan timeSpan) { | ||
| while (_samples.Count >= _sampleCount) { | ||
| _samples.TryDequeue(out _); | ||
| } | ||
|
|
||
| _samples.Enqueue(timeSpan.TotalMilliseconds); | ||
| _samplesSinceLastLog++; | ||
|
|
||
| Log(); | ||
| } | ||
|
|
||
| private void Log(bool flush = false) { | ||
| if ((flush || _samplesSinceLastLog >= _logFrequency) && _samples.Count > 0) { | ||
| try { | ||
| _log.LogInformation("Execution time Average: {Average}ms, StdDiv: {StandardDeviation}ms", _samples.Average(), StandardDeviation()); | ||
| } | ||
| catch (Exception ex) { | ||
| _log.LogWarning("Failed to calculate execution time statistics: {Error}", ex.Message); | ||
| } | ||
|
|
||
| _samplesSinceLastLog = 0; | ||
| } | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.