From f399e1550d00e858793f7a5b0b2d4dbc85a36566 Mon Sep 17 00:00:00 2001 From: dassayantan24 Date: Thu, 4 Sep 2025 14:09:36 +0530 Subject: [PATCH 1/5] Update logging --- src/Agent.Listener/JobDispatcher.cs | 113 +++++- src/Agent.Sdk/Knob/AgentKnobs.cs | 2 +- src/Agent.Sdk/Util/VssUtil.cs | 157 ++++++++ src/Agent.Worker/JobRunner.cs | 32 ++ .../AgentServer.cs | 367 ++++++++++++++---- 5 files changed, 569 insertions(+), 102 deletions(-) diff --git a/src/Agent.Listener/JobDispatcher.cs b/src/Agent.Listener/JobDispatcher.cs index a0dccaf8c1..9642777e46 100644 --- a/src/Agent.Listener/JobDispatcher.cs +++ b/src/Agent.Listener/JobDispatcher.cs @@ -8,6 +8,7 @@ using System.Collections.Concurrent; using System.Collections.Generic; using System.IO; +using System.Net.NetworkInformation; using System.Net.Sockets; using System.Threading; using System.Threading.Tasks; @@ -798,29 +799,14 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke { try { + Trace.Info($"=== Job Renewal Request Starting === [RequestId: {requestId}, Attempt: {6 - firstRenewRetryLimit}, Time: {DateTime.UtcNow:HH:mm:ss.fff}]"); + var renewalStartTime = DateTime.UtcNow; + request = await agentServer.RenewAgentRequestAsync(poolId, requestId, lockToken, token); - + + var renewalDuration = DateTime.UtcNow - renewalStartTime; Trace.Info($"Successfully renew job request {requestId}, job is valid till {request.LockedUntil.Value}"); - - if (!firstJobRequestRenewed.Task.IsCompleted) - { - // fire first renew succeed event. - firstJobRequestRenewed.TrySetResult(0); - Trace.Info(StringUtil.Format("First job request renewal completed successfully [RequestId:{0}, InitialRenewal:True, Status:Confirmed]", - requestId)); - } - - if (encounteringError > 0) - { - encounteringError = 0; - agentServer.SetConnectionTimeout(AgentConnectionType.JobRequest, TimeSpan.FromSeconds(60)); - HostContext.WritePerfCounter("JobRenewRecovered"); - Trace.Info(StringUtil.Format("Job renewal error recovery completed [RequestId:{0}, ErrorsCleared:True, ConnectionTimeout:60s, Status:Recovered]", - requestId)); - } - - // renew again after 60 sec delay - await HostContext.Delay(TimeSpan.FromSeconds(60), token); + Trace.Info($"=== Job Renewal Completed Successfully === [Duration: {renewalDuration.TotalMilliseconds:F0}ms, ValidUntil: {request.LockedUntil.Value:HH:mm:ss}]"); } catch (TaskAgentJobNotFoundException) { @@ -844,6 +830,64 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke catch (Exception ex) { Trace.Error($"Catch exception during renew agent jobrequest {requestId}."); + + // Enhanced error logging for job renewal failures + var errorTime = DateTime.UtcNow; + + if (ex is SocketException sockEx) + { + Trace.Error($"*** SOCKET ERROR during job renewal *** Socket Error: {sockEx.SocketErrorCode} ({sockEx.ErrorCode}) - {sockEx.Message}"); + Trace.Error($"This may indicate socket exhaustion or network connectivity issues"); + Trace.Error($"Check if PowerShellOnTargetMachine or similar tasks are consuming many connections"); + + if (sockEx.SocketErrorCode == SocketError.AddressAlreadyInUse || + sockEx.SocketErrorCode == SocketError.AddressNotAvailable || + sockEx.SocketErrorCode == SocketError.TooManyOpenSockets) + { + Trace.Error("*** POTENTIAL SOCKET EXHAUSTION DETECTED ***"); + Trace.Error("Consider enabling legacy HTTP handler to isolate connection pools"); + } + } + else if (ex is System.Net.Http.HttpRequestException httpEx) + { + Trace.Error($"HTTP request exception during job renewal: {httpEx.Message}"); + if (httpEx.InnerException is SocketException innerSock) + { + Trace.Error($"Inner socket exception: {innerSock.SocketErrorCode} - {innerSock.Message}"); + } + else if (httpEx.InnerException != null) + { + Trace.Error($"Inner exception: {httpEx.InnerException.GetType().Name}: {httpEx.InnerException.Message}"); + } + } + else if (ex is VssUnauthorizedException authEx) + { + Trace.Error($"*** AUTHENTICATION FAILURE during job renewal *** {authEx.Message}"); + Trace.Error("This typically indicates token expiry or invalid credentials"); + Trace.Error("Connection refresh will attempt to re-authenticate"); + + // Additional context for authentication failures + if (authEx.Message.Contains("401") || authEx.Message.Contains("Unauthorized")) + { + Trace.Error("HTTP 401 Unauthorized response - OAuth token likely expired"); + Trace.Error("Check token expiry timing in previous VssConnection logs"); + } + + // Log timing context for token expiry analysis + Trace.Error($"Authentication failure occurred at: {errorTime:yyyy-MM-dd HH:mm:ss.fff} UTC"); + Trace.Error("Review connection creation logs for token expiry warnings"); + } + else if (ex is TimeoutException timeoutEx) + { + Trace.Error($"*** TIMEOUT during job renewal *** {timeoutEx.Message}"); + Trace.Error("This may indicate server load, network issues, or connection pool exhaustion"); + } + else + { + Trace.Error($"Unexpected exception type during job renewal: {ex.GetType().Name}"); + } + + Trace.Error($"Job renewal error occurred at: {errorTime:yyyy-MM-dd HH:mm:ss.fff} UTC"); Trace.Error(ex); encounteringError++; @@ -893,6 +937,12 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke try { + // Log diagnostics periodically during job renewal delays + if (delayTime.TotalMinutes >= 1) // Only for longer delays + { + LogSimpleDiagnostics(requestId); + } + // back-off before next retry. await HostContext.Delay(delayTime, token); } @@ -1153,5 +1203,26 @@ private void Dispose(bool disposing) } } } + + private void LogSimpleDiagnostics(long requestId) + { + try + { + // Simple network and memory diagnostics using IPGlobalProperties + var ipProperties = IPGlobalProperties.GetIPGlobalProperties(); + var tcpStats = ipProperties.GetTcpIPv4Statistics(); + + // Memory usage from current process + var process = Process.GetCurrentProcess(); + var gcMemory = GC.GetTotalMemory(false); + + Trace.Info($"[RENEWAL-DIAGNOSTICS] RequestId:{requestId} TCP:{tcpStats.CurrentConnections} Memory:WS={process.WorkingSet64 / 1024 / 1024}MB,GC={gcMemory / 1024 / 1024}MB"); + } + catch (Exception ex) + { + // Don't let diagnostics crash job renewal + Trace.Warning($"Failed to log renewal diagnostics: {ex.Message}"); + } + } } } diff --git a/src/Agent.Sdk/Knob/AgentKnobs.cs b/src/Agent.Sdk/Knob/AgentKnobs.cs index 497aae1d0e..840d3ee3ff 100644 --- a/src/Agent.Sdk/Knob/AgentKnobs.cs +++ b/src/Agent.Sdk/Knob/AgentKnobs.cs @@ -324,7 +324,7 @@ public class AgentKnobs nameof(HttpTrace), "Enable http trace if true", new EnvironmentKnobSource("VSTS_AGENT_HTTPTRACE"), - new BuiltInDefaultKnobSource("false")); + new BuiltInDefaultKnobSource("true")); public static readonly Knob NoProxy = new Knob( nameof(NoProxy), diff --git a/src/Agent.Sdk/Util/VssUtil.cs b/src/Agent.Sdk/Util/VssUtil.cs index e2bbeeedf0..22683e9a1f 100644 --- a/src/Agent.Sdk/Util/VssUtil.cs +++ b/src/Agent.Sdk/Util/VssUtil.cs @@ -109,6 +109,142 @@ public static VssConnection CreateConnection( int httpRequestTimeoutSeconds = AgentKnobs.HttpTimeout.GetValue(_knobContext).AsInt(); settings.SendTimeout = timeout ?? TimeSpan.FromSeconds(Math.Min(Math.Max(httpRequestTimeoutSeconds, 100), 1200)); + // Enhanced logging for connection diagnostics + if (trace != null) + { + trace.Info($"Creating VssConnection to {serverUri}"); + trace.Info($"Connection settings - MaxRetryRequest: {settings.MaxRetryRequest}, SendTimeout: {settings.SendTimeout.TotalSeconds}s"); + trace.Info($"Using legacy HTTP handler: {PlatformUtil.UseLegacyHttpHandler}"); + + // Log credential type for diagnostics + string credType = "Unknown"; + if (credentials?.Federated is Microsoft.VisualStudio.Services.OAuth.VssOAuthAccessTokenCredential oauthCred) + { + credType = "OAuth AccessToken"; + // Try to extract token information for expiry diagnostics without exposing the token + try + { + var tokenField = oauthCred.GetType().GetField("m_accessToken", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); + if (tokenField != null) + { + var tokenValue = tokenField.GetValue(oauthCred) as string; + if (!string.IsNullOrEmpty(tokenValue)) + { + // Parse JWT token to get expiry without exposing the token content + try + { + var tokenParts = tokenValue.Split('.'); + if (tokenParts.Length >= 2) + { + var payload = tokenParts[1]; + // Add padding if needed for base64 decoding + while (payload.Length % 4 != 0) payload += "="; + var decodedBytes = Convert.FromBase64String(payload); + var decodedString = System.Text.Encoding.UTF8.GetString(decodedBytes); + if (decodedString.Contains("\"exp\":")) + { + var expMatch = System.Text.RegularExpressions.Regex.Match(decodedString, @"""exp"":(\d+)"); + if (expMatch.Success && long.TryParse(expMatch.Groups[1].Value, out var expUnix)) + { + var expiry = DateTimeOffset.FromUnixTimeSeconds(expUnix); + var timeUntilExpiry = expiry - DateTimeOffset.UtcNow; + trace.Info($"OAuth token expiry: {expiry:yyyy-MM-dd HH:mm:ss} UTC (in {timeUntilExpiry.TotalMinutes:F1} minutes)"); + + if (timeUntilExpiry.TotalMinutes < 5) + { + trace.Info($"*** WARNING: OAuth token expires soon: {timeUntilExpiry.TotalMinutes:F1} minutes remaining ***"); + } + else if (timeUntilExpiry.TotalSeconds < 0) + { + trace.Info($"*** ERROR: OAuth token is EXPIRED by {Math.Abs(timeUntilExpiry.TotalMinutes):F1} minutes ***"); + } + } + } + } + } + catch (Exception tokenParseEx) + { + trace.Verbose($"Could not parse token expiry information: {tokenParseEx.Message}"); + } + } + } + } + catch (Exception ex) + { + trace.Verbose($"Could not access token metadata for expiry check: {ex.Message}"); + } + } + else if (credentials?.Federated != null && credentials.Federated.GetType().Name.Contains("VssAad")) + credType = "AAD Credential"; + else if (credentials?.Storage != null) + credType = "Basic/PAT Credential"; + else if (credentials?.Windows != null) + credType = "Windows Credential"; + + trace.Info($"Credential type: {credType}"); + + // Log additional delegating handlers + if (additionalDelegatingHandler != null) + { + int handlerCount = 0; + foreach (var handler in additionalDelegatingHandler) + { + handlerCount++; + trace.Info($"Additional HTTP handler {handlerCount}: {handler.GetType().Name}"); + } + if (handlerCount == 0) + { + trace.Info("No additional HTTP handlers"); + } + } + + // Log socket and connection pool information + try + { + // Get system-level connection information for socket exhaustion diagnostics + if (PlatformUtil.RunningOnWindows) + { + var tcpConnections = System.Net.NetworkInformation.IPGlobalProperties.GetIPGlobalProperties().GetActiveTcpConnections(); + int establishedConnections = 0; + int totalConnections = tcpConnections.Length; + + foreach (var conn in tcpConnections) + { + if (conn.State == System.Net.NetworkInformation.TcpState.Established) + establishedConnections++; + } + + trace.Info($"System TCP connections: {establishedConnections} established, {totalConnections} total"); + + if (establishedConnections > 1000) + { + trace.Info($"*** WARNING: High number of established TCP connections ({establishedConnections}) - possible socket exhaustion ***"); + } + + // Log connections to the target server specifically + var serverHost = serverUri.Host; + int serverConnections = 0; + foreach (var conn in tcpConnections) + { + if (conn.RemoteEndPoint.Address.ToString().Equals(serverHost, StringComparison.OrdinalIgnoreCase) || + (System.Net.IPAddress.TryParse(serverHost, out var serverIp) && conn.RemoteEndPoint.Address.Equals(serverIp))) + { + serverConnections++; + } + } + + if (serverConnections > 0) + { + trace.Info($"Existing connections to {serverHost}: {serverConnections}"); + } + } + } + catch (Exception ex) + { + trace.Verbose($"Could not retrieve system connection information: {ex.Message}"); + } + } + // Remove Invariant from the list of accepted languages. // // The constructor of VssHttpRequestSettings (base class of VssClientHttpRequestSettings) adds the current @@ -126,6 +262,27 @@ public static VssConnection CreateConnection( } VssConnection connection = new VssConnection(serverUri, new VssHttpMessageHandler(credentials, settings), additionalDelegatingHandler); + + // Enhanced logging for connection creation and socket diagnostics + if (trace != null) + { + trace.Info($"VssConnection created for {serverUri.Host}:{serverUri.Port}"); + + // Log handler type being used for socket pool diagnostics + if (PlatformUtil.UseLegacyHttpHandler) + { + trace.Info("Using LEGACY HTTP handler - isolated connection pools"); + } + else + { + trace.Info("Using MODERN SocketsHttpHandler - shared global connection pool"); + trace.Info("*** SHARED POOL WARNING *** Heavy task connections may exhaust sockets for agent-server communication"); + } + + // Log timing for connection creation + trace.Info($"Connection created at: {DateTime.UtcNow:yyyy-MM-dd HH:mm:ss.fff} UTC"); + } + return connection; } diff --git a/src/Agent.Worker/JobRunner.cs b/src/Agent.Worker/JobRunner.cs index be7cb93265..be8478e7be 100644 --- a/src/Agent.Worker/JobRunner.cs +++ b/src/Agent.Worker/JobRunner.cs @@ -11,9 +11,11 @@ using Microsoft.VisualStudio.Services.WebApi; using System; using System.Collections.Generic; +using System.Diagnostics; using System.Globalization; using System.IO; using System.Linq; +using System.Net.NetworkInformation; using System.Threading; using System.Threading.Tasks; using System.Net.Http; @@ -392,6 +394,10 @@ public async Task RunAsync(Pipelines.AgentJobRequestMessage message, // Run all job steps Trace.Info("Run all job steps."); var stepsRunner = HostContext.GetService(); + + // Start periodic diagnostics logging during job execution + var diagnosticsTimer = new Timer(LogDiagnostics, jobContext, (int)TimeSpan.FromMinutes(10).TotalMilliseconds, (int)TimeSpan.FromMinutes(10).TotalMilliseconds); + try { Trace.Info("Step execution pipeline initiated - beginning job steps execution with StepsRunner"); @@ -409,6 +415,8 @@ public async Task RunAsync(Pipelines.AgentJobRequestMessage message, } finally { + // Stop periodic diagnostics logging + diagnosticsTimer?.Dispose(); Trace.Info("Finalize job."); await jobExtension.FinalizeJob(jobContext); } @@ -705,5 +713,29 @@ private void PublishTelemetry(IExecutionContext context, string area, String fea Trace.Warning($"Unable to publish agent shutdown telemetry data. Exception: {ex}"); } } + + private void LogDiagnostics(object state) + { + try + { + var jobContext = state as IExecutionContext; + if (jobContext == null) return; + + // Log network statistics using IPGlobalProperties + var ipProperties = IPGlobalProperties.GetIPGlobalProperties(); + var tcpStats = ipProperties.GetTcpIPv4Statistics(); + + // Log memory usage + var process = Process.GetCurrentProcess(); + var gcMemory = GC.GetTotalMemory(false); + + jobContext.Output($"[DIAGNOSTICS] TCP Connections: {tcpStats.CurrentConnections}, Memory: WorkingSet={process.WorkingSet64 / 1024 / 1024}MB, GC={gcMemory / 1024 / 1024}MB"); + } + catch (Exception ex) + { + // Don't let diagnostics logging crash the job + Trace.Warning($"Failed to log diagnostics: {ex.Message}"); + } + } } } diff --git a/src/Microsoft.VisualStudio.Services.Agent/AgentServer.cs b/src/Microsoft.VisualStudio.Services.Agent/AgentServer.cs index 336afd358f..7c7717444e 100644 --- a/src/Microsoft.VisualStudio.Services.Agent/AgentServer.cs +++ b/src/Microsoft.VisualStudio.Services.Agent/AgentServer.cs @@ -10,6 +10,7 @@ using Microsoft.VisualStudio.Services.Agent.Util; using Microsoft.VisualStudio.Services.WebApi; using Microsoft.VisualStudio.Services.Common; +using Microsoft.VisualStudio.Services.OAuth; using Agent.Sdk.Util; namespace Microsoft.VisualStudio.Services.Agent @@ -98,87 +99,168 @@ public async Task ConnectAsync(Uri serverUrl, VssCredentials credentials) public async Task RefreshConnectionAsync(AgentConnectionType connectionType, TimeSpan timeout) { Trace.Info($"Refresh {connectionType} VssConnection to get on a different AFD node."); + + // Enhanced logging for connection refresh diagnostics + Trace.Info($"=== Connection Refresh Started ==="); + Trace.Info($"Connection type: {connectionType}"); + Trace.Info($"Refresh timeout: {timeout.TotalSeconds}s"); + Trace.Info($"Current time: {DateTime.UtcNow:yyyy-MM-dd HH:mm:ss.fff} UTC"); + + var refreshStartTime = DateTime.UtcNow; VssConnection newConnection = null; - switch (connectionType) + VssConnection oldConnection = null; + + try { - case AgentConnectionType.MessageQueue: - try - { - _hasMessageConnection = false; - newConnection = await EstablishVssConnection(_messageConnection.Uri, _messageConnection.Credentials, timeout); - var client = newConnection.GetClient(); - _messageConnection = newConnection; - _messageTaskAgentClient = client; - } - catch (SocketException ex) - { - ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); - newConnection?.Dispose(); - } - catch (Exception ex) - { - Trace.Error($"Catch exception during reset {connectionType} connection."); - Trace.Error(ex); - newConnection?.Dispose(); - } - finally - { - _hasMessageConnection = true; - } - break; - case AgentConnectionType.JobRequest: - try - { - _hasRequestConnection = false; - newConnection = await EstablishVssConnection(_requestConnection.Uri, _requestConnection.Credentials, timeout); - var client = newConnection.GetClient(); - _requestConnection = newConnection; - _requestTaskAgentClient = client; - } - catch (SocketException ex) - { - ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); - newConnection?.Dispose(); - } - catch (Exception ex) - { - Trace.Error($"Catch exception during reset {connectionType} connection."); - Trace.Error(ex); - newConnection?.Dispose(); - } - finally - { - _hasRequestConnection = true; - } - break; - case AgentConnectionType.Generic: - try - { - _hasGenericConnection = false; - newConnection = await EstablishVssConnection(_genericConnection.Uri, _genericConnection.Credentials, timeout); - var client = newConnection.GetClient(); - _genericConnection = newConnection; - _genericTaskAgentClient = client; - } - catch (SocketException ex) - { - ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); - newConnection?.Dispose(); - } - catch (Exception ex) - { - Trace.Error($"Catch exception during reset {connectionType} connection."); - Trace.Error(ex); - newConnection?.Dispose(); - } - finally - { - _hasGenericConnection = true; - } - break; - default: - Trace.Error($"Unexpected connection type: {connectionType}."); - break; + switch (connectionType) + { + case AgentConnectionType.MessageQueue: + oldConnection = _messageConnection; + Trace.Info($"Refreshing MessageQueue connection (old connection auth state: {oldConnection?.HasAuthenticated})"); + try + { + _hasMessageConnection = false; + newConnection = await EstablishVssConnection(_messageConnection.Uri, _messageConnection.Credentials, timeout); + var client = newConnection.GetClient(); + _messageConnection = newConnection; + _messageTaskAgentClient = client; + + var refreshDuration = DateTime.UtcNow - refreshStartTime; + Trace.Info($"MessageQueue connection refresh completed successfully in {refreshDuration.TotalMilliseconds:F0}ms"); + } + catch (SocketException ex) + { + Trace.Warning($"Socket error during MessageQueue connection refresh: {ex.SocketErrorCode} - {ex.Message}"); + ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); + newConnection?.Dispose(); + } + catch (Exception ex) + { + var refreshDuration = DateTime.UtcNow - refreshStartTime; + Trace.Error($"Failed to refresh MessageQueue connection after {refreshDuration.TotalMilliseconds:F0}ms"); + + // Enhanced logging for connection refresh failures + if (ex is VssUnauthorizedException authEx) + { + Trace.Error($"*** AUTHENTICATION FAILURE during MessageQueue refresh *** {authEx.Message}"); + Trace.Error("This typically indicates token expiry - credentials may need to be refreshed"); + + if (authEx.Message.Contains("401")) + { + Trace.Error("HTTP 401 during refresh suggests OAuth token has expired"); + } + } + else if (ex is System.Net.Http.HttpRequestException httpRefreshEx) + { + Trace.Error($"HTTP error during MessageQueue refresh: {httpRefreshEx.Message}"); + if (httpRefreshEx.InnerException is System.Net.Sockets.SocketException sockRefreshEx) + { + Trace.Error($"Socket error during refresh: {sockRefreshEx.SocketErrorCode} - suggests connection pool exhaustion"); + } + } + else if (ex is System.Net.Sockets.SocketException sockEx) + { + Trace.Error($"Socket error during MessageQueue refresh: {sockEx.SocketErrorCode} - {sockEx.Message}"); + if (sockEx.SocketErrorCode == System.Net.Sockets.SocketError.AddressAlreadyInUse || + sockEx.SocketErrorCode == System.Net.Sockets.SocketError.AddressNotAvailable) + { + Trace.Error("*** SOCKET EXHAUSTION during refresh *** Consider legacy HTTP handler"); + } + } + + Trace.Error($"Catch exception during reset {connectionType} connection."); + Trace.Error(ex); + newConnection?.Dispose(); + } + finally + { + _hasMessageConnection = true; + } + break; + case AgentConnectionType.JobRequest: + oldConnection = _requestConnection; + Trace.Info($"Refreshing JobRequest connection (old connection auth state: {oldConnection?.HasAuthenticated})"); + try + { + _hasRequestConnection = false; + newConnection = await EstablishVssConnection(_requestConnection.Uri, _requestConnection.Credentials, timeout); + var client = newConnection.GetClient(); + _requestConnection = newConnection; + _requestTaskAgentClient = client; + + var refreshDuration = DateTime.UtcNow - refreshStartTime; + Trace.Info($"JobRequest connection refresh completed successfully in {refreshDuration.TotalMilliseconds:F0}ms"); + } + catch (SocketException ex) + { + Trace.Warning($"Socket error during JobRequest connection refresh: {ex.SocketErrorCode} - {ex.Message}"); + ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); + newConnection?.Dispose(); + } + catch (Exception ex) + { + var refreshDuration = DateTime.UtcNow - refreshStartTime; + Trace.Error($"Failed to refresh JobRequest connection after {refreshDuration.TotalMilliseconds:F0}ms"); + Trace.Error($"Catch exception during reset {connectionType} connection."); + Trace.Error(ex); + newConnection?.Dispose(); + } + finally + { + _hasRequestConnection = true; + } + break; + case AgentConnectionType.Generic: + oldConnection = _genericConnection; + Trace.Info($"Refreshing Generic connection (old connection auth state: {oldConnection?.HasAuthenticated})"); + try + { + _hasGenericConnection = false; + newConnection = await EstablishVssConnection(_genericConnection.Uri, _genericConnection.Credentials, timeout); + var client = newConnection.GetClient(); + _genericConnection = newConnection; + _genericTaskAgentClient = client; + + var refreshDuration = DateTime.UtcNow - refreshStartTime; + Trace.Info($"Generic connection refresh completed successfully in {refreshDuration.TotalMilliseconds:F0}ms"); + Trace.Info("*** Generic connection refresh successful - token refresh likely completed ***"); + } + catch (SocketException ex) + { + Trace.Warning($"Socket error during Generic connection refresh: {ex.SocketErrorCode} - {ex.Message}"); + ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); + newConnection?.Dispose(); + } + catch (Exception ex) + { + var refreshDuration = DateTime.UtcNow - refreshStartTime; + Trace.Error($"Failed to refresh Generic connection after {refreshDuration.TotalMilliseconds:F0}ms"); + Trace.Error($"Catch exception during reset {connectionType} connection."); + Trace.Error(ex); + newConnection?.Dispose(); + } + finally + { + _hasGenericConnection = true; + } + break; + default: + Trace.Error($"Unexpected connection type: {connectionType}."); + break; + } + } + finally + { + var totalRefreshDuration = DateTime.UtcNow - refreshStartTime; + Trace.Info($"=== Connection Refresh Completed ==="); + Trace.Info($"Total refresh duration: {totalRefreshDuration.TotalMilliseconds:F0}ms"); + Trace.Info($"Refresh result: {(newConnection != null ? "Success" : "Failed")}"); + + if (oldConnection != null && newConnection != null && oldConnection != newConnection) + { + Trace.Info("Old connection will be disposed as new connection established"); + // Note: The old connection will be disposed by garbage collection + } } } @@ -205,23 +287,148 @@ public void SetConnectionTimeout(AgentConnectionType connectionType, TimeSpan ti private async Task EstablishVssConnection(Uri serverUrl, VssCredentials credentials, TimeSpan timeout) { Trace.Info($"Establish connection with {timeout.TotalSeconds} seconds timeout."); + + // Enhanced logging for connection diagnostics + Trace.Info($"Establishing connection to: {serverUrl}"); + Trace.Info($"Connection timeout: {timeout.TotalSeconds}s"); + + // Log credential information for debugging + if (credentials?.Federated is VssOAuthAccessTokenCredential oauthCred) + { + // Don't log the actual token, just metadata about it + Trace.Info("Using OAuth access token credential"); + // Note: We cannot safely access token content or expiry without risking exposing secrets + // The VssOAuthAccessTokenCredential doesn't expose expiry information publicly + } + else if (credentials?.Federated != null) + { + var credType = credentials.Federated.GetType().Name; + Trace.Info($"Using federated credential type: {credType}"); + } + else if (credentials?.Storage != null) + { + Trace.Info("Using stored credential (PAT/Basic)"); + } + else + { + Trace.Warning("Unknown credential type or null credentials"); + } + int attemptCount = 5; var agentCertManager = HostContext.GetService(); while (attemptCount-- > 0) { var connection = VssUtil.CreateConnection(serverUrl, credentials, timeout: timeout, trace: Trace, skipServerCertificateValidation: agentCertManager.SkipServerCertificateValidation); + var connectStartTime = DateTime.UtcNow; try { + Trace.Info($"Attempting connection (attempt {5 - attemptCount}/5)..."); + await connection.ConnectAsync(); + + var connectDuration = DateTime.UtcNow - connectStartTime; + Trace.Info($"Connection established successfully in {connectDuration.TotalMilliseconds:F0}ms"); + + // Log authentication state + if (connection.HasAuthenticated) + { + Trace.Info("Connection authenticated successfully"); + try + { + var authIdentity = connection.AuthorizedIdentity; + if (authIdentity != null) + { + Trace.Info($"Authenticated as: {authIdentity.DisplayName} (ID: {authIdentity.Id})"); + } + } + catch (Exception ex) + { + Trace.Warning($"Could not retrieve authenticated identity info: {ex.Message}"); + } + } + else + { + Trace.Warning("Connection created but authentication status unclear"); + } + return connection; } catch (Exception ex) when (attemptCount > 0) { - Trace.Info($"Catch exception during connect. {attemptCount} attempt left."); + var connectDuration = DateTime.UtcNow - connectStartTime; + Trace.Info($"Connection attempt failed after {connectDuration.TotalMilliseconds:F0}ms. {attemptCount} attempt(s) left."); + + // Enhanced error logging for different types of failures + if (ex is System.Net.Http.HttpRequestException httpEx) + { + Trace.Warning($"HTTP request exception during connection: {httpEx.Message}"); + if (httpEx.InnerException != null) + { + Trace.Warning($"Inner exception: {httpEx.InnerException.GetType().Name}: {httpEx.InnerException.Message}"); + + // Check for socket-specific issues in inner exceptions + if (httpEx.InnerException is System.Net.Sockets.SocketException sockInnerEx) + { + Trace.Warning($"*** SOCKET ERROR in HTTP request *** Error: {sockInnerEx.SocketErrorCode} ({sockInnerEx.ErrorCode})"); + + if (sockInnerEx.SocketErrorCode == System.Net.Sockets.SocketError.AddressAlreadyInUse || + sockInnerEx.SocketErrorCode == System.Net.Sockets.SocketError.AddressNotAvailable || + sockInnerEx.SocketErrorCode == System.Net.Sockets.SocketError.TooManyOpenSockets) + { + Trace.Warning("*** SOCKET EXHAUSTION DETECTED *** Consider enabling legacy HTTP handler for connection pool isolation"); + } + } + } + } + else if (ex is System.Net.Sockets.SocketException sockEx) + { + Trace.Warning($"Socket exception during connection: Error {sockEx.ErrorCode} - {sockEx.Message}"); + Trace.Warning($"Socket error type: {sockEx.SocketErrorCode}"); + + // Provide specific guidance for socket exhaustion + if (sockEx.SocketErrorCode == System.Net.Sockets.SocketError.AddressAlreadyInUse || + sockEx.SocketErrorCode == System.Net.Sockets.SocketError.AddressNotAvailable || + sockEx.SocketErrorCode == System.Net.Sockets.SocketError.TooManyOpenSockets) + { + Trace.Warning("*** SOCKET EXHAUSTION DETECTED *** This may be caused by PowerShellOnTargetMachine or similar tasks using many connections"); + } + } + else if (ex is VssUnauthorizedException authEx) + { + Trace.Warning($"Authentication failed during connection: {authEx.Message}"); + // This could indicate token expiry or invalid credentials + Trace.Warning("*** AUTHENTICATION FAILURE *** This may indicate expired or invalid authentication credentials"); + + // Log additional context for authentication failures + if (authEx.Message.Contains("401")) + { + Trace.Warning("HTTP 401 Unauthorized - likely token expiry or invalid credentials"); + } + + // Try to log credential state for debugging + if (credentials?.Federated is VssOAuthAccessTokenCredential) + { + Trace.Warning("OAuth credential may be expired - token refresh will be attempted"); + } + } + else if (ex is System.Threading.Tasks.TaskCanceledException timeoutEx) + { + Trace.Warning($"Connection timeout: {timeoutEx.Message}"); + if (timeoutEx.InnerException is System.TimeoutException) + { + Trace.Warning("Connection timed out - this may indicate server load or network issues"); + } + } + else + { + Trace.Warning($"Unexpected connection error: {ex.GetType().Name}: {ex.Message}"); + } + Trace.Error(ex); await HostContext.Delay(TimeSpan.FromMilliseconds(100), CancellationToken.None); + connection?.Dispose(); } } From f1feb51fe0bf4368eb1423c3904fe6184f64d179 Mon Sep 17 00:00:00 2001 From: dassayantan24 Date: Fri, 5 Sep 2025 10:46:54 +0530 Subject: [PATCH 2/5] Update#2 --- src/Agent.Listener/JobDispatcher.cs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/Agent.Listener/JobDispatcher.cs b/src/Agent.Listener/JobDispatcher.cs index 9642777e46..b607f5de86 100644 --- a/src/Agent.Listener/JobDispatcher.cs +++ b/src/Agent.Listener/JobDispatcher.cs @@ -807,6 +807,26 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke var renewalDuration = DateTime.UtcNow - renewalStartTime; Trace.Info($"Successfully renew job request {requestId}, job is valid till {request.LockedUntil.Value}"); Trace.Info($"=== Job Renewal Completed Successfully === [Duration: {renewalDuration.TotalMilliseconds:F0}ms, ValidUntil: {request.LockedUntil.Value:HH:mm:ss}]"); + + if (!firstJobRequestRenewed.Task.IsCompleted) + { + // fire first renew succeed event. + firstJobRequestRenewed.TrySetResult(0); + Trace.Info(StringUtil.Format("First job request renewal completed successfully [RequestId:{0}, InitialRenewal:True, Status:Confirmed]", + requestId)); + } + + if (encounteringError > 0) + { + encounteringError = 0; + agentServer.SetConnectionTimeout(AgentConnectionType.JobRequest, TimeSpan.FromSeconds(60)); + HostContext.WritePerfCounter("JobRenewRecovered"); + Trace.Info(StringUtil.Format("Job renewal error recovery completed [RequestId:{0}, ErrorsCleared:True, ConnectionTimeout:60s, Status:Recovered]", + requestId)); + } + + // renew again after 60 sec delay + await HostContext.Delay(TimeSpan.FromSeconds(60), token); } catch (TaskAgentJobNotFoundException) { From 8dd5f8b5aba3e35b74aae2655607a64a162125c3 Mon Sep 17 00:00:00 2001 From: dassayantan24 Date: Fri, 5 Sep 2025 11:22:01 +0530 Subject: [PATCH 3/5] Update#3 --- src/Agent.Listener/JobDispatcher.cs | 89 +++-------------------------- 1 file changed, 7 insertions(+), 82 deletions(-) diff --git a/src/Agent.Listener/JobDispatcher.cs b/src/Agent.Listener/JobDispatcher.cs index b607f5de86..72c0d2fd07 100644 --- a/src/Agent.Listener/JobDispatcher.cs +++ b/src/Agent.Listener/JobDispatcher.cs @@ -8,7 +8,6 @@ using System.Collections.Concurrent; using System.Collections.Generic; using System.IO; -using System.Net.NetworkInformation; using System.Net.Sockets; using System.Threading; using System.Threading.Tasks; @@ -851,63 +850,9 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke { Trace.Error($"Catch exception during renew agent jobrequest {requestId}."); - // Enhanced error logging for job renewal failures - var errorTime = DateTime.UtcNow; + // Simple diagnostic info for troubleshooting + Trace.Error($"DIAGNOSTIC: {ex.GetType().Name} - ErrorCode: {GetErrorCode(ex)} - {ex.Message}"); - if (ex is SocketException sockEx) - { - Trace.Error($"*** SOCKET ERROR during job renewal *** Socket Error: {sockEx.SocketErrorCode} ({sockEx.ErrorCode}) - {sockEx.Message}"); - Trace.Error($"This may indicate socket exhaustion or network connectivity issues"); - Trace.Error($"Check if PowerShellOnTargetMachine or similar tasks are consuming many connections"); - - if (sockEx.SocketErrorCode == SocketError.AddressAlreadyInUse || - sockEx.SocketErrorCode == SocketError.AddressNotAvailable || - sockEx.SocketErrorCode == SocketError.TooManyOpenSockets) - { - Trace.Error("*** POTENTIAL SOCKET EXHAUSTION DETECTED ***"); - Trace.Error("Consider enabling legacy HTTP handler to isolate connection pools"); - } - } - else if (ex is System.Net.Http.HttpRequestException httpEx) - { - Trace.Error($"HTTP request exception during job renewal: {httpEx.Message}"); - if (httpEx.InnerException is SocketException innerSock) - { - Trace.Error($"Inner socket exception: {innerSock.SocketErrorCode} - {innerSock.Message}"); - } - else if (httpEx.InnerException != null) - { - Trace.Error($"Inner exception: {httpEx.InnerException.GetType().Name}: {httpEx.InnerException.Message}"); - } - } - else if (ex is VssUnauthorizedException authEx) - { - Trace.Error($"*** AUTHENTICATION FAILURE during job renewal *** {authEx.Message}"); - Trace.Error("This typically indicates token expiry or invalid credentials"); - Trace.Error("Connection refresh will attempt to re-authenticate"); - - // Additional context for authentication failures - if (authEx.Message.Contains("401") || authEx.Message.Contains("Unauthorized")) - { - Trace.Error("HTTP 401 Unauthorized response - OAuth token likely expired"); - Trace.Error("Check token expiry timing in previous VssConnection logs"); - } - - // Log timing context for token expiry analysis - Trace.Error($"Authentication failure occurred at: {errorTime:yyyy-MM-dd HH:mm:ss.fff} UTC"); - Trace.Error("Review connection creation logs for token expiry warnings"); - } - else if (ex is TimeoutException timeoutEx) - { - Trace.Error($"*** TIMEOUT during job renewal *** {timeoutEx.Message}"); - Trace.Error("This may indicate server load, network issues, or connection pool exhaustion"); - } - else - { - Trace.Error($"Unexpected exception type during job renewal: {ex.GetType().Name}"); - } - - Trace.Error($"Job renewal error occurred at: {errorTime:yyyy-MM-dd HH:mm:ss.fff} UTC"); Trace.Error(ex); encounteringError++; @@ -954,15 +899,9 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke Trace.Info(StringUtil.Format("Job renewal connection refresh initiated [RequestId:{0}, Timeout:30s, Reason:RetryRecovery, ErrorCount:{1}]", requestId, encounteringError)); await agentServer.RefreshConnectionAsync(AgentConnectionType.JobRequest, TimeSpan.FromSeconds(30)); - + try { - // Log diagnostics periodically during job renewal delays - if (delayTime.TotalMinutes >= 1) // Only for longer delays - { - LogSimpleDiagnostics(requestId); - } - // back-off before next retry. await HostContext.Delay(delayTime, token); } @@ -1224,25 +1163,11 @@ private void Dispose(bool disposing) } } - private void LogSimpleDiagnostics(long requestId) + private string GetErrorCode(Exception ex) { - try - { - // Simple network and memory diagnostics using IPGlobalProperties - var ipProperties = IPGlobalProperties.GetIPGlobalProperties(); - var tcpStats = ipProperties.GetTcpIPv4Statistics(); - - // Memory usage from current process - var process = Process.GetCurrentProcess(); - var gcMemory = GC.GetTotalMemory(false); - - Trace.Info($"[RENEWAL-DIAGNOSTICS] RequestId:{requestId} TCP:{tcpStats.CurrentConnections} Memory:WS={process.WorkingSet64 / 1024 / 1024}MB,GC={gcMemory / 1024 / 1024}MB"); - } - catch (Exception ex) - { - // Don't let diagnostics crash job renewal - Trace.Warning($"Failed to log renewal diagnostics: {ex.Message}"); - } + return ex is SocketException sockEx ? $"Socket:{sockEx.SocketErrorCode}" : + ex is System.Net.Http.HttpRequestException ? "HTTP" : + ex is VssUnauthorizedException ? "Auth" : "Other"; } } } From b4ba7d46432360580e0ca73030969eaacb9a3bb2 Mon Sep 17 00:00:00 2001 From: dassayantan24 Date: Mon, 8 Sep 2025 10:05:25 +0530 Subject: [PATCH 4/5] Update#3 --- .../Configuration/CredentialProvider.cs | 33 ++- .../Configuration/OAuthCredential.cs | 8 + src/Agent.Listener/JobDispatcher.cs | 24 +- .../AgentServer.cs | 263 +++++++----------- 4 files changed, 143 insertions(+), 185 deletions(-) diff --git a/src/Agent.Listener/Configuration/CredentialProvider.cs b/src/Agent.Listener/Configuration/CredentialProvider.cs index 7c2c0c0039..92881158b2 100644 --- a/src/Agent.Listener/Configuration/CredentialProvider.cs +++ b/src/Agent.Listener/Configuration/CredentialProvider.cs @@ -91,24 +91,35 @@ public override void EnsureCredential(IHostContext context, CommandSettings comm private async Task AcquireATokenFromCacheOrDeviceCodeFlowAsync(IHostContext context, IPublicClientApplication app, IEnumerable scopes) { + var trace = context.GetTrace(nameof(AadDeviceCodeAccessToken)); + trace.Info("Starting AAD token acquisition"); + AuthenticationResult result = null; var accounts = await app.GetAccountsAsync().ConfigureAwait(false); if (accounts.Any()) { + trace.Info("Attempting silent token acquisition from cache"); // Attempt to get a token from the cache (or refresh it silently if needed) result = await app.AcquireTokenSilent(scopes, accounts.FirstOrDefault()) .ExecuteAsync().ConfigureAwait(false); + trace.Info("Silent token acquisition completed"); } // Cache empty or no token for account in the cache, attempt by device code flow if (result == null) { + trace.Info("Attempting device code flow token acquisition"); + result = await GetTokenUsingDeviceCodeFlowAsync(context, app, scopes).ConfigureAwait(false); + + trace.Info("Device code flow token acquisition completed"); } + trace.Info("AAD token acquisition completed"); + return result; } @@ -162,20 +173,34 @@ private Uri GetTenantAuthorityUrl(IHostContext context, string serverUrl) private async Task GetTokenUsingDeviceCodeFlowAsync(IHostContext context, IPublicClientApplication app, IEnumerable scopes) { Tracing trace = context.GetTrace(nameof(AadDeviceCodeAccessToken)); + trace.Info("Starting device code flow token acquisition"); + AuthenticationResult result; try { + trace.Info("Initiating device code flow"); + result = await app.AcquireTokenWithDeviceCode(scopes, deviceCodeCallback => { + trace.Info("Device code callback received"); + // This will print the message on the console which tells the user where to go sign-in using // a separate browser and the code to enter once they sign in. var term = context.GetService(); - term.WriteLine($"Please finish AAD device code flow in browser ({deviceCodeCallback.VerificationUrl}), user code: {deviceCodeCallback.UserCode}"); return Task.FromResult(0); + term.WriteLine($"Please finish AAD device code flow in browser ({deviceCodeCallback.VerificationUrl}), user code: {deviceCodeCallback.UserCode}"); + + trace.Info($"Waiting for user authentication at {deviceCodeCallback.VerificationUrl}"); + + return Task.FromResult(0); }).ExecuteAsync().ConfigureAwait(false); + + trace.Info("Device code flow authentication completed successfully"); } - catch (MsalServiceException) + catch (MsalServiceException msalEx) { + trace.Error($"Device code flow failed with MSAL service exception: {msalEx.Message}"); + // AADSTS50059: No tenant-identifying information found in either the request or implied by any provided credentials. // AADSTS90133: Device Code flow is not supported under /common or /consumers endpoint. // AADSTS90002: Tenant not found. This may happen if there are @@ -184,12 +209,12 @@ private async Task GetTokenUsingDeviceCodeFlowAsync(IHostC } catch (OperationCanceledException ex) { - trace.Warning(ex.Message); + trace.Warning($"Device code flow cancelled: {ex.Message}"); result = null; } catch (MsalClientException ex) { - trace.Warning(ex.Message); + trace.Warning($"Device code flow failed with MSAL client exception: {ex.Message}"); result = null; } return result; diff --git a/src/Agent.Listener/Configuration/OAuthCredential.cs b/src/Agent.Listener/Configuration/OAuthCredential.cs index 82dc71780c..9d7c341768 100644 --- a/src/Agent.Listener/Configuration/OAuthCredential.cs +++ b/src/Agent.Listener/Configuration/OAuthCredential.cs @@ -29,6 +29,9 @@ public override void EnsureCredential( public override VssCredentials GetVssCredentials(IHostContext context) { ArgUtil.NotNull(context, nameof(context)); + + var trace = context.GetTrace(nameof(OAuthCredential)); + trace.Info("Starting OAuth credential creation"); var clientId = this.CredentialData.Data.GetValueOrDefault("clientId", null); var authorizationUrl = this.CredentialData.Data.GetValueOrDefault("authorizationUrl", null); @@ -39,6 +42,9 @@ public override VssCredentials GetVssCredentials(IHostContext context) ArgUtil.NotNullOrEmpty(clientId, nameof(clientId)); ArgUtil.NotNullOrEmpty(authorizationUrl, nameof(authorizationUrl)); + trace.Info($"OAuth endpoint: {oathEndpointUrl}"); + trace.Info($"OAuth authorization URL: {authorizationUrl}"); + // We expect the key to be in the machine store at this point. Configuration should have set all of // this up correctly so we can use the key to generate access tokens. var keyManager = context.GetService(); @@ -46,6 +52,8 @@ public override VssCredentials GetVssCredentials(IHostContext context) var clientCredential = new VssOAuthJwtBearerClientCredential(clientId, authorizationUrl, signingCredentials); var agentCredential = new VssOAuthCredential(new Uri(oathEndpointUrl, UriKind.Absolute), VssOAuthGrant.ClientCredentials, clientCredential); + trace.Info("OAuth credential creation completed"); + // Construct a credentials cache with a single OAuth credential for communication. The windows credential // is explicitly set to null to ensure we never do that negotiation. return new VssCredentials(null, agentCredential, CredentialPromptType.DoNotPrompt); diff --git a/src/Agent.Listener/JobDispatcher.cs b/src/Agent.Listener/JobDispatcher.cs index 72c0d2fd07..a4bd0e18af 100644 --- a/src/Agent.Listener/JobDispatcher.cs +++ b/src/Agent.Listener/JobDispatcher.cs @@ -533,7 +533,6 @@ private async Task RunAsync(Pipelines.AgentJobRequestMessage message, WorkerDisp Trace.Info(StringUtil.Format("Sending job request message to worker process - job: {0}, size: {1}", message.JobId, numBytesString)); HostContext.WritePerfCounter($"AgentSendingJobToWorker_{message.JobId}"); - var stopWatch = Stopwatch.StartNew(); using (var csSendJobRequest = new CancellationTokenSource(_channelTimeout)) { await processChannel.SendAsync( @@ -541,8 +540,7 @@ await processChannel.SendAsync( body: body, cancellationToken: csSendJobRequest.Token); } - stopWatch.Stop(); - Trace.Info($"Took {stopWatch.ElapsedMilliseconds} ms to send job message to worker"); + Trace.Info("Job message sent to worker successfully"); } catch (OperationCanceledException) { @@ -798,14 +796,9 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke { try { - Trace.Info($"=== Job Renewal Request Starting === [RequestId: {requestId}, Attempt: {6 - firstRenewRetryLimit}, Time: {DateTime.UtcNow:HH:mm:ss.fff}]"); - var renewalStartTime = DateTime.UtcNow; - request = await agentServer.RenewAgentRequestAsync(poolId, requestId, lockToken, token); - - var renewalDuration = DateTime.UtcNow - renewalStartTime; + Trace.Info($"Successfully renew job request {requestId}, job is valid till {request.LockedUntil.Value}"); - Trace.Info($"=== Job Renewal Completed Successfully === [Duration: {renewalDuration.TotalMilliseconds:F0}ms, ValidUntil: {request.LockedUntil.Value:HH:mm:ss}]"); if (!firstJobRequestRenewed.Task.IsCompleted) { @@ -849,10 +842,6 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke catch (Exception ex) { Trace.Error($"Catch exception during renew agent jobrequest {requestId}."); - - // Simple diagnostic info for troubleshooting - Trace.Error($"DIAGNOSTIC: {ex.GetType().Name} - ErrorCode: {GetErrorCode(ex)} - {ex.Message}"); - Trace.Error(ex); encounteringError++; @@ -899,7 +888,7 @@ public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToke Trace.Info(StringUtil.Format("Job renewal connection refresh initiated [RequestId:{0}, Timeout:30s, Reason:RetryRecovery, ErrorCount:{1}]", requestId, encounteringError)); await agentServer.RefreshConnectionAsync(AgentConnectionType.JobRequest, TimeSpan.FromSeconds(30)); - + try { // back-off before next retry. @@ -1162,12 +1151,5 @@ private void Dispose(bool disposing) } } } - - private string GetErrorCode(Exception ex) - { - return ex is SocketException sockEx ? $"Socket:{sockEx.SocketErrorCode}" : - ex is System.Net.Http.HttpRequestException ? "HTTP" : - ex is VssUnauthorizedException ? "Auth" : "Other"; - } } } diff --git a/src/Microsoft.VisualStudio.Services.Agent/AgentServer.cs b/src/Microsoft.VisualStudio.Services.Agent/AgentServer.cs index 7c7717444e..16dfa51cca 100644 --- a/src/Microsoft.VisualStudio.Services.Agent/AgentServer.cs +++ b/src/Microsoft.VisualStudio.Services.Agent/AgentServer.cs @@ -99,168 +99,121 @@ public async Task ConnectAsync(Uri serverUrl, VssCredentials credentials) public async Task RefreshConnectionAsync(AgentConnectionType connectionType, TimeSpan timeout) { Trace.Info($"Refresh {connectionType} VssConnection to get on a different AFD node."); - - // Enhanced logging for connection refresh diagnostics - Trace.Info($"=== Connection Refresh Started ==="); - Trace.Info($"Connection type: {connectionType}"); - Trace.Info($"Refresh timeout: {timeout.TotalSeconds}s"); - Trace.Info($"Current time: {DateTime.UtcNow:yyyy-MM-dd HH:mm:ss.fff} UTC"); - - var refreshStartTime = DateTime.UtcNow; VssConnection newConnection = null; - VssConnection oldConnection = null; - try + switch (connectionType) { - switch (connectionType) - { - case AgentConnectionType.MessageQueue: - oldConnection = _messageConnection; - Trace.Info($"Refreshing MessageQueue connection (old connection auth state: {oldConnection?.HasAuthenticated})"); - try - { - _hasMessageConnection = false; - newConnection = await EstablishVssConnection(_messageConnection.Uri, _messageConnection.Credentials, timeout); - var client = newConnection.GetClient(); - _messageConnection = newConnection; - _messageTaskAgentClient = client; - - var refreshDuration = DateTime.UtcNow - refreshStartTime; - Trace.Info($"MessageQueue connection refresh completed successfully in {refreshDuration.TotalMilliseconds:F0}ms"); - } - catch (SocketException ex) - { - Trace.Warning($"Socket error during MessageQueue connection refresh: {ex.SocketErrorCode} - {ex.Message}"); - ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); - newConnection?.Dispose(); - } - catch (Exception ex) - { - var refreshDuration = DateTime.UtcNow - refreshStartTime; - Trace.Error($"Failed to refresh MessageQueue connection after {refreshDuration.TotalMilliseconds:F0}ms"); - - // Enhanced logging for connection refresh failures - if (ex is VssUnauthorizedException authEx) - { - Trace.Error($"*** AUTHENTICATION FAILURE during MessageQueue refresh *** {authEx.Message}"); - Trace.Error("This typically indicates token expiry - credentials may need to be refreshed"); - - if (authEx.Message.Contains("401")) - { - Trace.Error("HTTP 401 during refresh suggests OAuth token has expired"); - } - } - else if (ex is System.Net.Http.HttpRequestException httpRefreshEx) - { - Trace.Error($"HTTP error during MessageQueue refresh: {httpRefreshEx.Message}"); - if (httpRefreshEx.InnerException is System.Net.Sockets.SocketException sockRefreshEx) - { - Trace.Error($"Socket error during refresh: {sockRefreshEx.SocketErrorCode} - suggests connection pool exhaustion"); - } - } - else if (ex is System.Net.Sockets.SocketException sockEx) - { - Trace.Error($"Socket error during MessageQueue refresh: {sockEx.SocketErrorCode} - {sockEx.Message}"); - if (sockEx.SocketErrorCode == System.Net.Sockets.SocketError.AddressAlreadyInUse || - sockEx.SocketErrorCode == System.Net.Sockets.SocketError.AddressNotAvailable) - { - Trace.Error("*** SOCKET EXHAUSTION during refresh *** Consider legacy HTTP handler"); - } - } - - Trace.Error($"Catch exception during reset {connectionType} connection."); - Trace.Error(ex); - newConnection?.Dispose(); - } - finally - { - _hasMessageConnection = true; - } - break; - case AgentConnectionType.JobRequest: - oldConnection = _requestConnection; - Trace.Info($"Refreshing JobRequest connection (old connection auth state: {oldConnection?.HasAuthenticated})"); - try - { - _hasRequestConnection = false; - newConnection = await EstablishVssConnection(_requestConnection.Uri, _requestConnection.Credentials, timeout); - var client = newConnection.GetClient(); - _requestConnection = newConnection; - _requestTaskAgentClient = client; - - var refreshDuration = DateTime.UtcNow - refreshStartTime; - Trace.Info($"JobRequest connection refresh completed successfully in {refreshDuration.TotalMilliseconds:F0}ms"); - } - catch (SocketException ex) - { - Trace.Warning($"Socket error during JobRequest connection refresh: {ex.SocketErrorCode} - {ex.Message}"); - ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); - newConnection?.Dispose(); - } - catch (Exception ex) + case AgentConnectionType.MessageQueue: + try + { + _hasMessageConnection = false; + newConnection = await EstablishVssConnection(_messageConnection.Uri, _messageConnection.Credentials, timeout); + var client = newConnection.GetClient(); + _messageConnection = newConnection; + _messageTaskAgentClient = client; + } + catch (SocketException ex) + { + ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); + newConnection?.Dispose(); + } + catch (Exception ex) + { + // Enhanced diagnostics for connection failures + if (ex is VssUnauthorizedException) { - var refreshDuration = DateTime.UtcNow - refreshStartTime; - Trace.Error($"Failed to refresh JobRequest connection after {refreshDuration.TotalMilliseconds:F0}ms"); - Trace.Error($"Catch exception during reset {connectionType} connection."); - Trace.Error(ex); - newConnection?.Dispose(); + Trace.Error($"Authentication failure during {connectionType} refresh - token may need renewal"); } - finally + else if (ex is System.Net.Http.HttpRequestException httpEx && httpEx.InnerException is SocketException sockEx && + (sockEx.SocketErrorCode == SocketError.AddressAlreadyInUse || sockEx.SocketErrorCode == SocketError.AddressNotAvailable)) { - _hasRequestConnection = true; + Trace.Error($"Socket exhaustion detected during {connectionType} refresh: {sockEx.SocketErrorCode}"); } - break; - case AgentConnectionType.Generic: - oldConnection = _genericConnection; - Trace.Info($"Refreshing Generic connection (old connection auth state: {oldConnection?.HasAuthenticated})"); - try + + Trace.Error($"Catch exception during reset {connectionType} connection."); + Trace.Error(ex); + newConnection?.Dispose(); + } + finally + { + _hasMessageConnection = true; + } + break; + case AgentConnectionType.JobRequest: + try + { + _hasRequestConnection = false; + newConnection = await EstablishVssConnection(_requestConnection.Uri, _requestConnection.Credentials, timeout); + var client = newConnection.GetClient(); + _requestConnection = newConnection; + _requestTaskAgentClient = client; + } + catch (SocketException ex) + { + ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); + newConnection?.Dispose(); + } + catch (Exception ex) + { + // Enhanced diagnostics for connection failures + if (ex is VssUnauthorizedException) { - _hasGenericConnection = false; - newConnection = await EstablishVssConnection(_genericConnection.Uri, _genericConnection.Credentials, timeout); - var client = newConnection.GetClient(); - _genericConnection = newConnection; - _genericTaskAgentClient = client; - - var refreshDuration = DateTime.UtcNow - refreshStartTime; - Trace.Info($"Generic connection refresh completed successfully in {refreshDuration.TotalMilliseconds:F0}ms"); - Trace.Info("*** Generic connection refresh successful - token refresh likely completed ***"); + Trace.Error($"Authentication failure during {connectionType} refresh - token may need renewal"); } - catch (SocketException ex) + else if (ex is System.Net.Http.HttpRequestException httpEx && httpEx.InnerException is SocketException sockEx && + (sockEx.SocketErrorCode == SocketError.AddressAlreadyInUse || sockEx.SocketErrorCode == SocketError.AddressNotAvailable)) { - Trace.Warning($"Socket error during Generic connection refresh: {ex.SocketErrorCode} - {ex.Message}"); - ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); - newConnection?.Dispose(); + Trace.Error($"Socket exhaustion detected during {connectionType} refresh: {sockEx.SocketErrorCode}"); } - catch (Exception ex) + + Trace.Error($"Catch exception during reset {connectionType} connection."); + Trace.Error(ex); + newConnection?.Dispose(); + } + finally + { + _hasRequestConnection = true; + } + break; + case AgentConnectionType.Generic: + try + { + _hasGenericConnection = false; + newConnection = await EstablishVssConnection(_genericConnection.Uri, _genericConnection.Credentials, timeout); + var client = newConnection.GetClient(); + _genericConnection = newConnection; + _genericTaskAgentClient = client; + } + catch (SocketException ex) + { + ExceptionsUtil.HandleSocketException(ex, _requestConnection.Uri.ToString(), (msg) => Trace.Error(msg)); + newConnection?.Dispose(); + } + catch (Exception ex) + { + // Enhanced diagnostics for connection failures + if (ex is VssUnauthorizedException) { - var refreshDuration = DateTime.UtcNow - refreshStartTime; - Trace.Error($"Failed to refresh Generic connection after {refreshDuration.TotalMilliseconds:F0}ms"); - Trace.Error($"Catch exception during reset {connectionType} connection."); - Trace.Error(ex); - newConnection?.Dispose(); + Trace.Error($"Authentication failure during {connectionType} refresh - token may need renewal"); } - finally + else if (ex is System.Net.Http.HttpRequestException httpEx && httpEx.InnerException is SocketException sockEx && + (sockEx.SocketErrorCode == SocketError.AddressAlreadyInUse || sockEx.SocketErrorCode == SocketError.AddressNotAvailable)) { - _hasGenericConnection = true; + Trace.Error($"Socket exhaustion detected during {connectionType} refresh: {sockEx.SocketErrorCode}"); } - break; - default: - Trace.Error($"Unexpected connection type: {connectionType}."); - break; - } - } - finally - { - var totalRefreshDuration = DateTime.UtcNow - refreshStartTime; - Trace.Info($"=== Connection Refresh Completed ==="); - Trace.Info($"Total refresh duration: {totalRefreshDuration.TotalMilliseconds:F0}ms"); - Trace.Info($"Refresh result: {(newConnection != null ? "Success" : "Failed")}"); - - if (oldConnection != null && newConnection != null && oldConnection != newConnection) - { - Trace.Info("Old connection will be disposed as new connection established"); - // Note: The old connection will be disposed by garbage collection - } + + Trace.Error($"Catch exception during reset {connectionType} connection."); + Trace.Error(ex); + newConnection?.Dispose(); + } + finally + { + _hasGenericConnection = true; + } + break; + default: + Trace.Error($"Unexpected connection type: {connectionType}."); + break; } } @@ -286,7 +239,7 @@ public void SetConnectionTimeout(AgentConnectionType connectionType, TimeSpan ti private async Task EstablishVssConnection(Uri serverUrl, VssCredentials credentials, TimeSpan timeout) { - Trace.Info($"Establish connection with {timeout.TotalSeconds} seconds timeout."); + Trace.Info($"Starting VssConnection establishment with {timeout.TotalSeconds} seconds timeout."); // Enhanced logging for connection diagnostics Trace.Info($"Establishing connection to: {serverUrl}"); @@ -352,6 +305,8 @@ private async Task EstablishVssConnection(Uri serverUrl, VssCrede Trace.Warning("Connection created but authentication status unclear"); } + Trace.Info("VssConnection establishment completed successfully"); + return connection; } catch (Exception ex) when (attemptCount > 0) @@ -399,18 +354,6 @@ private async Task EstablishVssConnection(Uri serverUrl, VssCrede Trace.Warning($"Authentication failed during connection: {authEx.Message}"); // This could indicate token expiry or invalid credentials Trace.Warning("*** AUTHENTICATION FAILURE *** This may indicate expired or invalid authentication credentials"); - - // Log additional context for authentication failures - if (authEx.Message.Contains("401")) - { - Trace.Warning("HTTP 401 Unauthorized - likely token expiry or invalid credentials"); - } - - // Try to log credential state for debugging - if (credentials?.Federated is VssOAuthAccessTokenCredential) - { - Trace.Warning("OAuth credential may be expired - token refresh will be attempted"); - } } else if (ex is System.Threading.Tasks.TaskCanceledException timeoutEx) { From d1db9c6f4d4071fb93d62d37fc281873bec3df47 Mon Sep 17 00:00:00 2001 From: dassayantan24 Date: Mon, 8 Sep 2025 10:14:05 +0530 Subject: [PATCH 5/5] Update#4 --- src/Agent.Listener/JobDispatcher.cs | 4 +- src/Agent.Sdk/Util/VssUtil.cs | 155 ++-------------------------- 2 files changed, 10 insertions(+), 149 deletions(-) diff --git a/src/Agent.Listener/JobDispatcher.cs b/src/Agent.Listener/JobDispatcher.cs index a4bd0e18af..a0dccaf8c1 100644 --- a/src/Agent.Listener/JobDispatcher.cs +++ b/src/Agent.Listener/JobDispatcher.cs @@ -533,6 +533,7 @@ private async Task RunAsync(Pipelines.AgentJobRequestMessage message, WorkerDisp Trace.Info(StringUtil.Format("Sending job request message to worker process - job: {0}, size: {1}", message.JobId, numBytesString)); HostContext.WritePerfCounter($"AgentSendingJobToWorker_{message.JobId}"); + var stopWatch = Stopwatch.StartNew(); using (var csSendJobRequest = new CancellationTokenSource(_channelTimeout)) { await processChannel.SendAsync( @@ -540,7 +541,8 @@ await processChannel.SendAsync( body: body, cancellationToken: csSendJobRequest.Token); } - Trace.Info("Job message sent to worker successfully"); + stopWatch.Stop(); + Trace.Info($"Took {stopWatch.ElapsedMilliseconds} ms to send job message to worker"); } catch (OperationCanceledException) { diff --git a/src/Agent.Sdk/Util/VssUtil.cs b/src/Agent.Sdk/Util/VssUtil.cs index 22683e9a1f..f4a2b83057 100644 --- a/src/Agent.Sdk/Util/VssUtil.cs +++ b/src/Agent.Sdk/Util/VssUtil.cs @@ -109,139 +109,19 @@ public static VssConnection CreateConnection( int httpRequestTimeoutSeconds = AgentKnobs.HttpTimeout.GetValue(_knobContext).AsInt(); settings.SendTimeout = timeout ?? TimeSpan.FromSeconds(Math.Min(Math.Max(httpRequestTimeoutSeconds, 100), 1200)); - // Enhanced logging for connection diagnostics + // Basic connection diagnostics if (trace != null) { - trace.Info($"Creating VssConnection to {serverUri}"); - trace.Info($"Connection settings - MaxRetryRequest: {settings.MaxRetryRequest}, SendTimeout: {settings.SendTimeout.TotalSeconds}s"); - trace.Info($"Using legacy HTTP handler: {PlatformUtil.UseLegacyHttpHandler}"); + trace.Info($"Creating VssConnection to {serverUri} (timeout: {settings.SendTimeout.TotalSeconds}s)"); - // Log credential type for diagnostics - string credType = "Unknown"; - if (credentials?.Federated is Microsoft.VisualStudio.Services.OAuth.VssOAuthAccessTokenCredential oauthCred) - { - credType = "OAuth AccessToken"; - // Try to extract token information for expiry diagnostics without exposing the token - try - { - var tokenField = oauthCred.GetType().GetField("m_accessToken", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); - if (tokenField != null) - { - var tokenValue = tokenField.GetValue(oauthCred) as string; - if (!string.IsNullOrEmpty(tokenValue)) - { - // Parse JWT token to get expiry without exposing the token content - try - { - var tokenParts = tokenValue.Split('.'); - if (tokenParts.Length >= 2) - { - var payload = tokenParts[1]; - // Add padding if needed for base64 decoding - while (payload.Length % 4 != 0) payload += "="; - var decodedBytes = Convert.FromBase64String(payload); - var decodedString = System.Text.Encoding.UTF8.GetString(decodedBytes); - if (decodedString.Contains("\"exp\":")) - { - var expMatch = System.Text.RegularExpressions.Regex.Match(decodedString, @"""exp"":(\d+)"); - if (expMatch.Success && long.TryParse(expMatch.Groups[1].Value, out var expUnix)) - { - var expiry = DateTimeOffset.FromUnixTimeSeconds(expUnix); - var timeUntilExpiry = expiry - DateTimeOffset.UtcNow; - trace.Info($"OAuth token expiry: {expiry:yyyy-MM-dd HH:mm:ss} UTC (in {timeUntilExpiry.TotalMinutes:F1} minutes)"); - - if (timeUntilExpiry.TotalMinutes < 5) - { - trace.Info($"*** WARNING: OAuth token expires soon: {timeUntilExpiry.TotalMinutes:F1} minutes remaining ***"); - } - else if (timeUntilExpiry.TotalSeconds < 0) - { - trace.Info($"*** ERROR: OAuth token is EXPIRED by {Math.Abs(timeUntilExpiry.TotalMinutes):F1} minutes ***"); - } - } - } - } - } - catch (Exception tokenParseEx) - { - trace.Verbose($"Could not parse token expiry information: {tokenParseEx.Message}"); - } - } - } - } - catch (Exception ex) - { - trace.Verbose($"Could not access token metadata for expiry check: {ex.Message}"); - } - } - else if (credentials?.Federated != null && credentials.Federated.GetType().Name.Contains("VssAad")) - credType = "AAD Credential"; - else if (credentials?.Storage != null) - credType = "Basic/PAT Credential"; - else if (credentials?.Windows != null) - credType = "Windows Credential"; - - trace.Info($"Credential type: {credType}"); - - // Log additional delegating handlers - if (additionalDelegatingHandler != null) - { - int handlerCount = 0; - foreach (var handler in additionalDelegatingHandler) - { - handlerCount++; - trace.Info($"Additional HTTP handler {handlerCount}: {handler.GetType().Name}"); - } - if (handlerCount == 0) - { - trace.Info("No additional HTTP handlers"); - } - } - - // Log socket and connection pool information - try + // Log critical socket exhaustion indicators + if (PlatformUtil.UseLegacyHttpHandler) { - // Get system-level connection information for socket exhaustion diagnostics - if (PlatformUtil.RunningOnWindows) - { - var tcpConnections = System.Net.NetworkInformation.IPGlobalProperties.GetIPGlobalProperties().GetActiveTcpConnections(); - int establishedConnections = 0; - int totalConnections = tcpConnections.Length; - - foreach (var conn in tcpConnections) - { - if (conn.State == System.Net.NetworkInformation.TcpState.Established) - establishedConnections++; - } - - trace.Info($"System TCP connections: {establishedConnections} established, {totalConnections} total"); - - if (establishedConnections > 1000) - { - trace.Info($"*** WARNING: High number of established TCP connections ({establishedConnections}) - possible socket exhaustion ***"); - } - - // Log connections to the target server specifically - var serverHost = serverUri.Host; - int serverConnections = 0; - foreach (var conn in tcpConnections) - { - if (conn.RemoteEndPoint.Address.ToString().Equals(serverHost, StringComparison.OrdinalIgnoreCase) || - (System.Net.IPAddress.TryParse(serverHost, out var serverIp) && conn.RemoteEndPoint.Address.Equals(serverIp))) - { - serverConnections++; - } - } - - if (serverConnections > 0) - { - trace.Info($"Existing connections to {serverHost}: {serverConnections}"); - } - } + trace.Info("Using legacy HTTP handler - isolated connection pools"); } - catch (Exception ex) + else { - trace.Verbose($"Could not retrieve system connection information: {ex.Message}"); + trace.Info("Using modern SocketsHttpHandler - shared connection pool"); } } @@ -262,27 +142,6 @@ public static VssConnection CreateConnection( } VssConnection connection = new VssConnection(serverUri, new VssHttpMessageHandler(credentials, settings), additionalDelegatingHandler); - - // Enhanced logging for connection creation and socket diagnostics - if (trace != null) - { - trace.Info($"VssConnection created for {serverUri.Host}:{serverUri.Port}"); - - // Log handler type being used for socket pool diagnostics - if (PlatformUtil.UseLegacyHttpHandler) - { - trace.Info("Using LEGACY HTTP handler - isolated connection pools"); - } - else - { - trace.Info("Using MODERN SocketsHttpHandler - shared global connection pool"); - trace.Info("*** SHARED POOL WARNING *** Heavy task connections may exhaust sockets for agent-server communication"); - } - - // Log timing for connection creation - trace.Info($"Connection created at: {DateTime.UtcNow:yyyy-MM-dd HH:mm:ss.fff} UTC"); - } - return connection; }