fix: connection stability — stop node reconnect storms, fix bootstrap token handling (#287)
Some checks are pending
Build and Test / release (push) Blocked by required conditions
Build and Test / test (push) Waiting to run
Build and Test / build (win-arm64) (push) Blocked by required conditions
Build and Test / build (win-x64) (push) Blocked by required conditions
Build and Test / build-msix (ARM64, win-arm64) (push) Blocked by required conditions
Build and Test / build-msix (x64, win-x64) (push) Blocked by required conditions
Build and Test / build-extension (arm64) (push) Blocked by required conditions
Build and Test / build-extension (x64) (push) Blocked by required conditions
Some checks are pending
Build and Test / release (push) Blocked by required conditions
Build and Test / test (push) Waiting to run
Build and Test / build (win-arm64) (push) Blocked by required conditions
Build and Test / build (win-x64) (push) Blocked by required conditions
Build and Test / build-msix (ARM64, win-arm64) (push) Blocked by required conditions
Build and Test / build-msix (x64, win-x64) (push) Blocked by required conditions
Build and Test / build-extension (arm64) (push) Blocked by required conditions
Build and Test / build-extension (x64) (push) Blocked by required conditions
* fix: connection stability — stop node reconnect storms, fix bootstrap token handling Critical fixes for connection management bugs introduced in PR #272: 1. Node reconnect storm during pairing (WindowsNodeClient) - Added ShouldAutoReconnect() override with _pairingBlocked flag - Flag survives OnDisconnected() (which clears _isPendingApproval) - Added rate-limit detection for terminal auth errors - Marked _pairingBlocked/_rateLimited as volatile for thread safety - Clear _rateLimited on successful hello-ok (transient, not permanent) 2. Backoff jitter (WebSocketClientBase) - Added 0-25% random jitter to prevent thundering herd when operator + node clients reconnect simultaneously 3. Client leak on reinitialize (App.xaml.cs) - Added _gatewayClient?.Dispose() before creating new client - Old clients were keeping reconnect loops alive as zombies 4. Bootstrap token not saved as Settings.Token - Setup code decoder no longer persists bootstrap to Settings.Token - Prevents reconnect storms on app restart with stale bootstrap token - TestConnection skips writing bootstrap value to Settings.Token - InitializeGatewayClient falls back to BootstrapToken for bootstrap flow 5. Token PasswordBox → TextBox - Users can see what they pasted (SetupWizardWindow + ConnectionPage) 6. Clear stale tray data on disconnect - Sessions/channels/nodes/models cleared when disconnected/error - Tray menu no longer shows old data alongside 'Disconnected' 7. Onboarding UX fixes - Removed disruptive auto-paste-on-focus from setup code field - Setup code state only updates on valid decode (prevents focus loss) - Added 'Relaunch First-Run Setup' button to Debug page Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * test: increase PowerShell echo test timeout to 30s for slow CI runners Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
parent
584a19fadd
commit
2fcfe76abc
@ -251,6 +251,10 @@ public abstract class WebSocketClientBase : IDisposable
|
||||
while (!_disposed && !_cts.Token.IsCancellationRequested && ShouldAutoReconnect())
|
||||
{
|
||||
var delay = BackoffMs[Math.Min(_reconnectAttempts, BackoffMs.Length - 1)];
|
||||
// Add 0-25% jitter to prevent thundering herd when multiple clients
|
||||
// (operator + node) reconnect on the same schedule
|
||||
var jitter = Random.Shared.Next(0, delay / 4);
|
||||
delay += jitter;
|
||||
_reconnectAttempts++;
|
||||
_logger.Warn($"{ClientRole} reconnecting in {delay}ms (attempt {_reconnectAttempts})");
|
||||
RaiseStatusChanged(ConnectionStatus.Connecting);
|
||||
|
||||
@ -30,6 +30,10 @@ public class WindowsNodeClient : WebSocketClientBase
|
||||
private bool _isPaired;
|
||||
// Bridges the gap between an approval event and the next hello-ok when the gateway omits auth.deviceToken.
|
||||
private bool _pairingApprovedAwaitingReconnect;
|
||||
// Persists across disconnect/error so ShouldAutoReconnect can block reconnect
|
||||
// even after OnDisconnected clears _isPendingApproval.
|
||||
private volatile bool _pairingBlocked;
|
||||
private volatile bool _rateLimited;
|
||||
private readonly string _gatewayToken;
|
||||
private readonly string? _bootstrapToken;
|
||||
|
||||
@ -277,6 +281,7 @@ public class WindowsNodeClient : WebSocketClientBase
|
||||
|
||||
_isPendingApproval = true;
|
||||
_isPaired = false;
|
||||
_pairingBlocked = true;
|
||||
_pairingApprovedAwaitingReconnect = false;
|
||||
|
||||
_logger.Info($"[NODE] Pairing requested for this device via {eventType}");
|
||||
@ -310,6 +315,7 @@ public class WindowsNodeClient : WebSocketClientBase
|
||||
{
|
||||
_isPendingApproval = false;
|
||||
_isPaired = true;
|
||||
_pairingBlocked = false; // Allow reconnect after approval
|
||||
_pairingApprovedAwaitingReconnect = true;
|
||||
|
||||
PairingStatusChanged?.Invoke(this, new PairingStatusEventArgs(
|
||||
@ -603,6 +609,7 @@ public class WindowsNodeClient : WebSocketClientBase
|
||||
PublishGatewaySelf(GatewaySelfInfo.FromHelloOk(payload));
|
||||
var reconnectingAfterApproval = _pairingApprovedAwaitingReconnect;
|
||||
_isConnected = true;
|
||||
_rateLimited = false; // Clear transient rate-limit on successful connect
|
||||
ResetReconnectAttempts();
|
||||
|
||||
// Extract node ID if returned
|
||||
@ -654,6 +661,7 @@ public class WindowsNodeClient : WebSocketClientBase
|
||||
{
|
||||
_isPendingApproval = true;
|
||||
_isPaired = false;
|
||||
_pairingBlocked = true;
|
||||
_logger.Info("Not yet paired - check 'openclaw devices list' for pending approval");
|
||||
_logger.Info($"To approve, run: openclaw devices approve {_deviceIdentity.DeviceId}");
|
||||
PairingStatusChanged?.Invoke(this, new PairingStatusEventArgs(
|
||||
@ -717,6 +725,7 @@ public class WindowsNodeClient : WebSocketClientBase
|
||||
|
||||
_isPendingApproval = true;
|
||||
_isPaired = false;
|
||||
_pairingBlocked = true;
|
||||
_pairingApprovedAwaitingReconnect = false;
|
||||
|
||||
var detail = !string.IsNullOrWhiteSpace(pairingRequestId)
|
||||
@ -731,6 +740,18 @@ public class WindowsNodeClient : WebSocketClientBase
|
||||
return;
|
||||
}
|
||||
|
||||
// Rate-limit / terminal auth errors — stop reconnecting
|
||||
if (error.Contains("too many failed", StringComparison.OrdinalIgnoreCase) ||
|
||||
error.Contains("rate limit", StringComparison.OrdinalIgnoreCase) ||
|
||||
error.Contains("origin not allowed", StringComparison.OrdinalIgnoreCase) ||
|
||||
error.Contains("token mismatch", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
_rateLimited = true;
|
||||
_logger.Warn($"[NODE] Terminal auth error; stopping reconnect. Error: {error}");
|
||||
RaiseStatusChanged(ConnectionStatus.Error);
|
||||
return;
|
||||
}
|
||||
|
||||
_logger.Error($"Node registration failed: {error} (code: {errorCode})");
|
||||
RaiseStatusChanged(ConnectionStatus.Error);
|
||||
}
|
||||
@ -997,6 +1018,20 @@ public class WindowsNodeClient : WebSocketClientBase
|
||||
GatewaySelfUpdated?.Invoke(this, info);
|
||||
}
|
||||
|
||||
/// <summary>
/// Tells the reconnect loop whether another automatic attempt may be scheduled.
/// </summary>
/// <returns>
/// <c>false</c> while <c>_pairingBlocked</c> or <c>_rateLimited</c> is set; otherwise <c>true</c>.
/// </returns>
protected override bool ShouldAutoReconnect()
{
    // Pairing gate: every reconnect made while approval is still pending creates
    // another pairing request on the gateway, which snowballs into a storm.
    // _pairingBlocked is consulted (rather than _isPendingApproval) because it
    // survives OnDisconnected, which clears _isPendingApproval.
    //
    // The second flag, _rateLimited, also suppresses automatic reconnects.
    var reconnectSuppressed = _pairingBlocked || _rateLimited;

    return !reconnectSuppressed;
}
|
||||
|
||||
protected override void OnDisconnected()
|
||||
{
|
||||
_isConnected = false;
|
||||
|
||||
@ -1494,19 +1494,30 @@ public partial class App : Application
|
||||
return;
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(_settings.Token))
|
||||
// Need either a regular token or a bootstrap token to connect
|
||||
var effectiveToken = _settings.Token;
|
||||
if (string.IsNullOrWhiteSpace(effectiveToken))
|
||||
{
|
||||
Logger.Info("Gateway token not configured — skipping operator client initialization");
|
||||
return;
|
||||
if (useBootstrapHandoffAuth && !string.IsNullOrWhiteSpace(_settings.BootstrapToken))
|
||||
{
|
||||
// Bootstrap-only flow (setup code / QR): use bootstrap token for initial pairing
|
||||
effectiveToken = _settings.BootstrapToken;
|
||||
}
|
||||
else
|
||||
{
|
||||
Logger.Info("Gateway token not configured — skipping operator client initialization");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Unsubscribe from old client if exists
|
||||
UnsubscribeGatewayEvents();
|
||||
_gatewayClient?.Dispose();
|
||||
_lastGatewaySelf = null;
|
||||
|
||||
_gatewayClient = new OpenClawGatewayClient(
|
||||
gatewayUrl,
|
||||
_settings.Token,
|
||||
effectiveToken,
|
||||
new AppLogger(),
|
||||
useBootstrapHandoffAuth);
|
||||
_gatewayClient.SetUserRules(_settings.UserRules.Count > 0 ? _settings.UserRules : null);
|
||||
@ -1947,6 +1958,19 @@ public partial class App : Application
|
||||
if (_hubWindow != null && !_hubWindow.IsClosed)
|
||||
_hubWindow.LastAuthError = null;
|
||||
}
|
||||
|
||||
// Clear stale data when disconnected so tray menu doesn't show old sessions/nodes
|
||||
if (status == ConnectionStatus.Disconnected || status == ConnectionStatus.Error)
|
||||
{
|
||||
_lastSessions = Array.Empty<SessionInfo>();
|
||||
_lastChannels = Array.Empty<ChannelHealth>();
|
||||
_lastNodes = Array.Empty<GatewayNodeInfo>();
|
||||
_lastNodePairList = null;
|
||||
_lastDevicePairList = null;
|
||||
_lastModelsList = null;
|
||||
_lastGatewaySelf = null;
|
||||
}
|
||||
|
||||
UpdateTrayIcon();
|
||||
_dispatcherQueue?.TryEnqueue(UpdateStatusDetailWindow);
|
||||
|
||||
@ -2506,6 +2530,7 @@ public partial class App : Application
|
||||
_hubWindow.OpenDashboardAction = OpenDashboard;
|
||||
_hubWindow.CheckForUpdatesAction = () => _ = CheckForUpdatesUserInitiatedAsync();
|
||||
_hubWindow.QuickSendAction = () => ShowQuickSend();
|
||||
_hubWindow.OpenSetupAction = () => _ = ShowOnboardingAsync();
|
||||
_hubWindow.ConnectAction = () =>
|
||||
{
|
||||
InitializeGatewayClient();
|
||||
|
||||
@ -136,14 +136,14 @@ public sealed class ConnectionPage : Component<OnboardingState>
|
||||
|
||||
void OnSetupCodeChanged(string code)
|
||||
{
|
||||
setSetupCode(code);
|
||||
if (string.IsNullOrWhiteSpace(code)) return;
|
||||
|
||||
var result = SetupCodeDecoder.Decode(code);
|
||||
|
||||
if (!result.Success)
|
||||
{
|
||||
// Not a valid setup code — user might be still typing
|
||||
// Not a valid setup code — user might be still typing.
|
||||
// Don't call setSetupCode here to avoid re-render that steals focus.
|
||||
if (code.Length > 2048)
|
||||
Logger.Warn("[Connection] Setup code rejected: exceeds 2048 character limit");
|
||||
else
|
||||
@ -151,6 +151,8 @@ public sealed class ConnectionPage : Component<OnboardingState>
|
||||
return;
|
||||
}
|
||||
|
||||
// Valid setup code decoded — now update state (will re-render)
|
||||
setSetupCode(code);
|
||||
if (result.Url != null)
|
||||
{
|
||||
setUrl(result.Url);
|
||||
@ -159,7 +161,8 @@ public sealed class ConnectionPage : Component<OnboardingState>
|
||||
if (result.Token != null)
|
||||
{
|
||||
setToken(result.Token);
|
||||
Props.Settings.Token = result.Token;
|
||||
// Bootstrap token goes to BootstrapToken only — it's single-use for pairing.
|
||||
// Don't save as Settings.Token (causes reconnect storms on restart).
|
||||
Props.Settings.BootstrapToken = result.Token;
|
||||
}
|
||||
setStatusMsg($"✅ {LocalizationHelper.GetString("Onboarding_Connection_StatusDecoded")}");
|
||||
@ -205,7 +208,13 @@ public sealed class ConnectionPage : Component<OnboardingState>
|
||||
async void TestConnection()
|
||||
{
|
||||
Props.Settings.GatewayUrl = url;
|
||||
Props.Settings.Token = token;
|
||||
// Only save to Settings.Token if the user entered a manual token,
|
||||
// not a decoded bootstrap token (which belongs in BootstrapToken only).
|
||||
if (string.IsNullOrWhiteSpace(Props.Settings.BootstrapToken) ||
|
||||
!string.Equals(token, Props.Settings.BootstrapToken, StringComparison.Ordinal))
|
||||
{
|
||||
Props.Settings.Token = token;
|
||||
}
|
||||
|
||||
// When SSH mode, start the managed tunnel before health-checking the local URL.
|
||||
if (mode == ConnectionMode.Ssh)
|
||||
@ -473,40 +482,14 @@ public sealed class ConnectionPage : Component<OnboardingState>
|
||||
catch { /* clipboard unavailable — ignore */ }
|
||||
}
|
||||
|
||||
// Setup code row: TextField + Paste + QR buttons (Grid keeps the field expanding)
|
||||
// Setup code row: TextField + Paste + QR buttons
|
||||
cardChildren.Add(
|
||||
Grid(["1*", "Auto", "Auto"], ["Auto"],
|
||||
TextField(setupCode, OnSetupCodeChanged,
|
||||
placeholder: LocalizationHelper.GetString("Onboarding_Connection_SetupCodePlaceholder"),
|
||||
header: LocalizationHelper.GetString("Onboarding_Connection_SetupCode"))
|
||||
.OnGotFocus((sender, _) =>
|
||||
{
|
||||
if (sender is Microsoft.UI.Xaml.Controls.TextBox tb && string.IsNullOrEmpty(tb.Text))
|
||||
{
|
||||
try
|
||||
{
|
||||
var content = global::Windows.ApplicationModel.DataTransfer.Clipboard.GetContent();
|
||||
if (content.Contains(global::Windows.ApplicationModel.DataTransfer.StandardDataFormats.Text))
|
||||
{
|
||||
var task = content.GetTextAsync();
|
||||
task.Completed = (op, status) =>
|
||||
{
|
||||
if (status == global::Windows.Foundation.AsyncStatus.Completed)
|
||||
{
|
||||
var text = op.GetResults();
|
||||
tb.DispatcherQueue.TryEnqueue(() =>
|
||||
{
|
||||
tb.Text = text;
|
||||
OnSetupCodeChanged(text);
|
||||
});
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
catch { }
|
||||
}
|
||||
})
|
||||
.Grid(row: 0, column: 0),
|
||||
.Grid(row: 0, column: 0)
|
||||
.Set(tb => Microsoft.UI.Xaml.Automation.AutomationProperties.SetAutomationId(tb, "OnboardingSetupCode")),
|
||||
Button(LocalizationHelper.GetString("Onboarding_Connection_PasteSetup"), PasteSetupCode)
|
||||
.VAlign(VerticalAlignment.Bottom)
|
||||
.Margin(6, 0, 0, 0)
|
||||
|
||||
@ -100,7 +100,7 @@
|
||||
<ColumnDefinition Width="Auto"/>
|
||||
<ColumnDefinition Width="Auto"/>
|
||||
</Grid.ColumnDefinitions>
|
||||
<PasswordBox x:Uid="TokenPromptBox" x:Name="TokenPromptBox" Grid.Column="0"
|
||||
<TextBox x:Uid="TokenPromptBox" x:Name="TokenPromptBox" Grid.Column="0"
|
||||
PlaceholderText="Gateway token" Header="Token"/>
|
||||
<Button x:Uid="ConnectionPage_Button_105" Grid.Column="1" Content="Cancel"
|
||||
VerticalAlignment="Bottom" Click="OnCancelTokenPrompt"/>
|
||||
|
||||
@ -473,14 +473,14 @@ public sealed partial class ConnectionPage : Page
|
||||
_pendingGatewayUrl = gw.ConnectionUrl;
|
||||
_pendingGatewayId = gw.Id;
|
||||
TokenPromptText.Text = $"Connect to gateway at {gw.Host}:{gw.Port}";
|
||||
TokenPromptBox.Password = _hub.Settings.Token ?? "";
|
||||
TokenPromptBox.Text = _hub.Settings.Token ?? "";
|
||||
TokenPromptPanel.Visibility = Visibility.Visible;
|
||||
TokenPromptBox.Focus(Microsoft.UI.Xaml.FocusState.Programmatic);
|
||||
}
|
||||
|
||||
private void OnConnectWithToken(object sender, RoutedEventArgs e)
|
||||
{
|
||||
var token = TokenPromptBox.Password?.Trim();
|
||||
var token = TokenPromptBox.Text?.Trim();
|
||||
if (string.IsNullOrEmpty(token) || _hub?.Settings == null || string.IsNullOrEmpty(_pendingGatewayUrl))
|
||||
return;
|
||||
|
||||
@ -535,10 +535,9 @@ public sealed partial class ConnectionPage : Page
|
||||
settings.GatewayUrl = result.Url;
|
||||
if (!string.IsNullOrEmpty(result.Token))
|
||||
{
|
||||
// Bootstrap token goes to BootstrapToken only — it's single-use for pairing.
|
||||
// Don't save it as Settings.Token, which would cause reconnect storms on restart.
|
||||
settings.BootstrapToken = result.Token;
|
||||
// Also set as the operator token so InitializeGatewayClient can connect
|
||||
if (string.IsNullOrWhiteSpace(settings.Token))
|
||||
settings.Token = result.Token;
|
||||
}
|
||||
|
||||
settings.Save();
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<Button x:Uid="DebugPage_Button_113" Content="📁 Open Diagnostics Folder" Click="OnOpenDiagnosticsFolder" HorizontalAlignment="Stretch"/>
|
||||
<Button x:Uid="DebugPage_Button_114" Content="📋 Copy Support Context" Click="OnCopySupportContext"
|
||||
Style="{ThemeResource AccentButtonStyle}" HorizontalAlignment="Stretch"/>
|
||||
<Button Content="🔄 Relaunch First-Run Setup" Click="OnRelaunchOnboarding" HorizontalAlignment="Stretch"/>
|
||||
</StackPanel>
|
||||
</Expander>
|
||||
|
||||
|
||||
@ -196,4 +196,9 @@ public sealed partial class DebugPage : Page
|
||||
timer.Start();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Click handler that invokes the hub's <c>OpenSetupAction</c> callback.
/// Does nothing when the hub reference or the callback is not set.
/// </summary>
/// <param name="sender">Control that raised the click (unused).</param>
/// <param name="e">Routed event data (unused).</param>
private void OnRelaunchOnboarding(object sender, RoutedEventArgs e)
{
    var openSetup = _hub?.OpenSetupAction;
    if (openSetup is null)
    {
        return;
    }

    openSetup();
}
|
||||
}
|
||||
|
||||
@ -30,6 +30,7 @@ public sealed partial class HubWindow : WindowEx
|
||||
public Action? ConnectAction { get; set; }
|
||||
public Action? DisconnectAction { get; set; }
|
||||
public Action? ReconnectAction { get; set; }
|
||||
public Action? OpenSetupAction { get; set; }
|
||||
|
||||
// Node service state (set by App.xaml.cs in ShowHub)
|
||||
public bool NodeIsConnected { get; set; }
|
||||
|
||||
@ -51,7 +51,7 @@ public sealed class SetupWizardWindow : WindowEx
|
||||
// Step 0: Setup code + manual entry
|
||||
private readonly TextBox _setupCodeBox;
|
||||
private readonly TextBox _gatewayUrlBox;
|
||||
private readonly PasswordBox _tokenBox;
|
||||
private readonly TextBox _tokenBox;
|
||||
private readonly TextBlock _testStatusLabel;
|
||||
private readonly Button _testButton;
|
||||
private readonly StackPanel _manualEntryPanel;
|
||||
@ -196,17 +196,17 @@ public sealed class SetupWizardWindow : WindowEx
|
||||
Style = (Style)Application.Current.Resources["CaptionTextBlockStyle"],
|
||||
Foreground = (SolidColorBrush)Application.Current.Resources["TextFillColorSecondaryBrush"]
|
||||
});
|
||||
_tokenBox = new PasswordBox
|
||||
_tokenBox = new TextBox
|
||||
{
|
||||
Header = LocalizationHelper.GetString("Setup_TokenHeader"),
|
||||
PlaceholderText = LocalizationHelper.GetString("Setup_TokenPlaceholder"),
|
||||
Password = _draftToken
|
||||
Text = _draftToken
|
||||
};
|
||||
AutomationProperties.SetAutomationId(_tokenBox, "TokenBox");
|
||||
_tokenBox.PasswordChanged += (s, e) => _connectionTested = false;
|
||||
_tokenBox.PasswordChanged += (s, e) =>
|
||||
_tokenBox.TextChanged += (s, e) => _connectionTested = false;
|
||||
_tokenBox.TextChanged += (s, e) =>
|
||||
{
|
||||
_draftToken = _tokenBox.Password;
|
||||
_draftToken = _tokenBox.Text;
|
||||
UpdatePairingStatusText();
|
||||
};
|
||||
_manualEntryPanel.Children.Add(_tokenBox);
|
||||
@ -718,7 +718,7 @@ public sealed class SetupWizardWindow : WindowEx
|
||||
private async void OnTestConnection(object sender, RoutedEventArgs e)
|
||||
{
|
||||
_draftGatewayUrl = _gatewayUrlBox.Text.Trim();
|
||||
_draftToken = _tokenBox.Password;
|
||||
_draftToken = _tokenBox.Text;
|
||||
UpdatePairingStatusText();
|
||||
|
||||
if (!GatewayUrlHelper.IsValidGatewayUrl(_draftGatewayUrl))
|
||||
|
||||
@ -546,7 +546,7 @@ public class LocalCommandRunnerIntegrationTests
|
||||
{
|
||||
Command = "Write-Output 'hello world'",
|
||||
Shell = "powershell",
|
||||
TimeoutMs = 10000
|
||||
TimeoutMs = 30000
|
||||
});
|
||||
|
||||
Assert.Equal(0, result.ExitCode);
|
||||
|
||||
@ -251,7 +251,7 @@ public class WebSocketClientBaseTests
|
||||
|
||||
Assert.Contains(ConnectionStatus.Error, statuses);
|
||||
Assert.True(statuses.Count(s => s == ConnectionStatus.Connecting) >= 2);
|
||||
Assert.Contains(_logger.Logs, line => line.Contains("reconnecting in 1000ms", StringComparison.OrdinalIgnoreCase));
|
||||
Assert.Contains(_logger.Logs, line => line.Contains("reconnecting in 1", StringComparison.OrdinalIgnoreCase) && line.Contains("ms (attempt 1)", StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
client.Dispose();
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user