fix: connection stability — stop node reconnect storms, fix bootstrap token handling (#287)
Some checks are pending
Build and Test / release (push) Blocked by required conditions
Build and Test / test (push) Waiting to run
Build and Test / build (win-arm64) (push) Blocked by required conditions
Build and Test / build (win-x64) (push) Blocked by required conditions
Build and Test / build-msix (ARM64, win-arm64) (push) Blocked by required conditions
Build and Test / build-msix (x64, win-x64) (push) Blocked by required conditions
Build and Test / build-extension (arm64) (push) Blocked by required conditions
Build and Test / build-extension (x64) (push) Blocked by required conditions

* fix: connection stability — stop node reconnect storms, fix bootstrap token handling

Critical fixes for connection management bugs introduced in PR #272:

1. Node reconnect storm during pairing (WindowsNodeClient)
   - Added ShouldAutoReconnect() override with _pairingBlocked flag
   - Flag survives OnDisconnected() (which clears _isPendingApproval)
   - Added detection of rate-limit and other terminal auth errors (stops auto-reconnect)
   - Marked _pairingBlocked/_rateLimited as volatile for thread safety
   - Clear _rateLimited on successful hello-ok (transient, not permanent)

2. Backoff jitter (WebSocketClientBase)
   - Added 0-25% random jitter to prevent thundering herd when
     operator + node clients reconnect simultaneously

3. Client leak on reinitialize (App.xaml.cs)
   - Added _gatewayClient?.Dispose() before creating new client
   - Old clients were keeping reconnect loops alive as zombies

4. Bootstrap token no longer saved as Settings.Token
   - Setup code decoder no longer persists bootstrap to Settings.Token
   - Prevents reconnect storms on app restart with stale bootstrap token
   - TestConnection skips writing bootstrap value to Settings.Token
   - InitializeGatewayClient falls back to BootstrapToken for bootstrap flow

5. Token PasswordBox → TextBox
   - Users can see what they pasted (SetupWizardWindow + ConnectionPage)

6. Clear stale tray data on disconnect
   - Sessions/channels/nodes/models cleared when disconnected/error
   - Tray menu no longer shows old data alongside 'Disconnected'

7. Onboarding UX fixes
   - Removed disruptive auto-paste-on-focus from setup code field
   - Setup code state only updates on valid decode (prevents focus loss)
   - Added 'Relaunch First-Run Setup' button to Debug page

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* test: increase PowerShell echo test timeout to 30s for slow CI runners

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Ranjesh 2026-05-06 08:13:55 -07:00 committed by GitHub
parent 584a19fadd
commit 2fcfe76abc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 105 additions and 52 deletions

View File

@ -251,6 +251,10 @@ public abstract class WebSocketClientBase : IDisposable
while (!_disposed && !_cts.Token.IsCancellationRequested && ShouldAutoReconnect())
{
var delay = BackoffMs[Math.Min(_reconnectAttempts, BackoffMs.Length - 1)];
// Add 0-25% jitter to prevent thundering herd when multiple clients
// (operator + node) reconnect on the same schedule
var jitter = Random.Shared.Next(0, delay / 4);
delay += jitter;
_reconnectAttempts++;
_logger.Warn($"{ClientRole} reconnecting in {delay}ms (attempt {_reconnectAttempts})");
RaiseStatusChanged(ConnectionStatus.Connecting);

View File

@ -30,6 +30,10 @@ public class WindowsNodeClient : WebSocketClientBase
private bool _isPaired;
// Bridges the gap between an approval event and the next hello-ok when the gateway omits auth.deviceToken.
private bool _pairingApprovedAwaitingReconnect;
// Persists across disconnect/error so ShouldAutoReconnect can block reconnect
// even after OnDisconnected clears _isPendingApproval.
private volatile bool _pairingBlocked;
private volatile bool _rateLimited;
private readonly string _gatewayToken;
private readonly string? _bootstrapToken;
@ -277,6 +281,7 @@ public class WindowsNodeClient : WebSocketClientBase
_isPendingApproval = true;
_isPaired = false;
_pairingBlocked = true;
_pairingApprovedAwaitingReconnect = false;
_logger.Info($"[NODE] Pairing requested for this device via {eventType}");
@ -310,6 +315,7 @@ public class WindowsNodeClient : WebSocketClientBase
{
_isPendingApproval = false;
_isPaired = true;
_pairingBlocked = false; // Allow reconnect after approval
_pairingApprovedAwaitingReconnect = true;
PairingStatusChanged?.Invoke(this, new PairingStatusEventArgs(
@ -603,6 +609,7 @@ public class WindowsNodeClient : WebSocketClientBase
PublishGatewaySelf(GatewaySelfInfo.FromHelloOk(payload));
var reconnectingAfterApproval = _pairingApprovedAwaitingReconnect;
_isConnected = true;
_rateLimited = false; // Clear transient rate-limit on successful connect
ResetReconnectAttempts();
// Extract node ID if returned
@ -654,6 +661,7 @@ public class WindowsNodeClient : WebSocketClientBase
{
_isPendingApproval = true;
_isPaired = false;
_pairingBlocked = true;
_logger.Info("Not yet paired - check 'openclaw devices list' for pending approval");
_logger.Info($"To approve, run: openclaw devices approve {_deviceIdentity.DeviceId}");
PairingStatusChanged?.Invoke(this, new PairingStatusEventArgs(
@ -717,6 +725,7 @@ public class WindowsNodeClient : WebSocketClientBase
_isPendingApproval = true;
_isPaired = false;
_pairingBlocked = true;
_pairingApprovedAwaitingReconnect = false;
var detail = !string.IsNullOrWhiteSpace(pairingRequestId)
@ -731,6 +740,18 @@ public class WindowsNodeClient : WebSocketClientBase
return;
}
// Rate-limit / terminal auth errors — stop reconnecting
if (error.Contains("too many failed", StringComparison.OrdinalIgnoreCase) ||
error.Contains("rate limit", StringComparison.OrdinalIgnoreCase) ||
error.Contains("origin not allowed", StringComparison.OrdinalIgnoreCase) ||
error.Contains("token mismatch", StringComparison.OrdinalIgnoreCase))
{
_rateLimited = true;
_logger.Warn($"[NODE] Terminal auth error; stopping reconnect. Error: {error}");
RaiseStatusChanged(ConnectionStatus.Error);
return;
}
_logger.Error($"Node registration failed: {error} (code: {errorCode})");
RaiseStatusChanged(ConnectionStatus.Error);
}
@ -997,6 +1018,20 @@ public class WindowsNodeClient : WebSocketClientBase
GatewaySelfUpdated?.Invoke(this, info);
}
/// <summary>
/// Tells the reconnect loop whether another connection attempt is allowed.
/// </summary>
/// <returns>
/// <c>false</c> while either <c>_pairingBlocked</c> or <c>_rateLimited</c> is set;
/// <c>true</c> otherwise.
/// </returns>
protected override bool ShouldAutoReconnect()
{
    // While a pairing approval is outstanding, every reconnect files a fresh
    // pairing request on the gateway and snowballs into a storm, so stay down.
    // _pairingBlocked is consulted instead of _isPendingApproval because
    // OnDisconnected clears the latter.
    // _rateLimited is raised for terminal auth errors and also halts reconnects.
    return !_pairingBlocked && !_rateLimited;
}
protected override void OnDisconnected()
{
_isConnected = false;

View File

@ -1494,19 +1494,30 @@ public partial class App : Application
return;
}
if (string.IsNullOrWhiteSpace(_settings.Token))
// Need either a regular token or a bootstrap token to connect
var effectiveToken = _settings.Token;
if (string.IsNullOrWhiteSpace(effectiveToken))
{
Logger.Info("Gateway token not configured — skipping operator client initialization");
return;
if (useBootstrapHandoffAuth && !string.IsNullOrWhiteSpace(_settings.BootstrapToken))
{
// Bootstrap-only flow (setup code / QR): use bootstrap token for initial pairing
effectiveToken = _settings.BootstrapToken;
}
else
{
Logger.Info("Gateway token not configured — skipping operator client initialization");
return;
}
}
// Unsubscribe from old client if exists
UnsubscribeGatewayEvents();
_gatewayClient?.Dispose();
_lastGatewaySelf = null;
_gatewayClient = new OpenClawGatewayClient(
gatewayUrl,
_settings.Token,
effectiveToken,
new AppLogger(),
useBootstrapHandoffAuth);
_gatewayClient.SetUserRules(_settings.UserRules.Count > 0 ? _settings.UserRules : null);
@ -1947,6 +1958,19 @@ public partial class App : Application
if (_hubWindow != null && !_hubWindow.IsClosed)
_hubWindow.LastAuthError = null;
}
// Clear stale data when disconnected so tray menu doesn't show old sessions/nodes
if (status == ConnectionStatus.Disconnected || status == ConnectionStatus.Error)
{
_lastSessions = Array.Empty<SessionInfo>();
_lastChannels = Array.Empty<ChannelHealth>();
_lastNodes = Array.Empty<GatewayNodeInfo>();
_lastNodePairList = null;
_lastDevicePairList = null;
_lastModelsList = null;
_lastGatewaySelf = null;
}
UpdateTrayIcon();
_dispatcherQueue?.TryEnqueue(UpdateStatusDetailWindow);
@ -2506,6 +2530,7 @@ public partial class App : Application
_hubWindow.OpenDashboardAction = OpenDashboard;
_hubWindow.CheckForUpdatesAction = () => _ = CheckForUpdatesUserInitiatedAsync();
_hubWindow.QuickSendAction = () => ShowQuickSend();
_hubWindow.OpenSetupAction = () => _ = ShowOnboardingAsync();
_hubWindow.ConnectAction = () =>
{
InitializeGatewayClient();

View File

@ -136,14 +136,14 @@ public sealed class ConnectionPage : Component<OnboardingState>
void OnSetupCodeChanged(string code)
{
setSetupCode(code);
if (string.IsNullOrWhiteSpace(code)) return;
var result = SetupCodeDecoder.Decode(code);
if (!result.Success)
{
// Not a valid setup code — user might be still typing
// Not a valid setup code — user might be still typing.
// Don't call setSetupCode here to avoid re-render that steals focus.
if (code.Length > 2048)
Logger.Warn("[Connection] Setup code rejected: exceeds 2048 character limit");
else
@ -151,6 +151,8 @@ public sealed class ConnectionPage : Component<OnboardingState>
return;
}
// Valid setup code decoded — now update state (will re-render)
setSetupCode(code);
if (result.Url != null)
{
setUrl(result.Url);
@ -159,7 +161,8 @@ public sealed class ConnectionPage : Component<OnboardingState>
if (result.Token != null)
{
setToken(result.Token);
Props.Settings.Token = result.Token;
// Bootstrap token goes to BootstrapToken only — it's single-use for pairing.
// Don't save as Settings.Token (causes reconnect storms on restart).
Props.Settings.BootstrapToken = result.Token;
}
setStatusMsg($"✅ {LocalizationHelper.GetString("Onboarding_Connection_StatusDecoded")}");
@ -205,7 +208,13 @@ public sealed class ConnectionPage : Component<OnboardingState>
async void TestConnection()
{
Props.Settings.GatewayUrl = url;
Props.Settings.Token = token;
// Only save to Settings.Token if the user entered a manual token,
// not a decoded bootstrap token (which belongs in BootstrapToken only).
if (string.IsNullOrWhiteSpace(Props.Settings.BootstrapToken) ||
!string.Equals(token, Props.Settings.BootstrapToken, StringComparison.Ordinal))
{
Props.Settings.Token = token;
}
// When SSH mode, start the managed tunnel before health-checking the local URL.
if (mode == ConnectionMode.Ssh)
@ -473,40 +482,14 @@ public sealed class ConnectionPage : Component<OnboardingState>
catch { /* clipboard unavailable — ignore */ }
}
// Setup code row: TextField + Paste + QR buttons (Grid keeps the field expanding)
// Setup code row: TextField + Paste + QR buttons
cardChildren.Add(
Grid(["1*", "Auto", "Auto"], ["Auto"],
TextField(setupCode, OnSetupCodeChanged,
placeholder: LocalizationHelper.GetString("Onboarding_Connection_SetupCodePlaceholder"),
header: LocalizationHelper.GetString("Onboarding_Connection_SetupCode"))
.OnGotFocus((sender, _) =>
{
if (sender is Microsoft.UI.Xaml.Controls.TextBox tb && string.IsNullOrEmpty(tb.Text))
{
try
{
var content = global::Windows.ApplicationModel.DataTransfer.Clipboard.GetContent();
if (content.Contains(global::Windows.ApplicationModel.DataTransfer.StandardDataFormats.Text))
{
var task = content.GetTextAsync();
task.Completed = (op, status) =>
{
if (status == global::Windows.Foundation.AsyncStatus.Completed)
{
var text = op.GetResults();
tb.DispatcherQueue.TryEnqueue(() =>
{
tb.Text = text;
OnSetupCodeChanged(text);
});
}
};
}
}
catch { }
}
})
.Grid(row: 0, column: 0),
.Grid(row: 0, column: 0)
.Set(tb => Microsoft.UI.Xaml.Automation.AutomationProperties.SetAutomationId(tb, "OnboardingSetupCode")),
Button(LocalizationHelper.GetString("Onboarding_Connection_PasteSetup"), PasteSetupCode)
.VAlign(VerticalAlignment.Bottom)
.Margin(6, 0, 0, 0)

View File

@ -100,7 +100,7 @@
<ColumnDefinition Width="Auto"/>
<ColumnDefinition Width="Auto"/>
</Grid.ColumnDefinitions>
<PasswordBox x:Uid="TokenPromptBox" x:Name="TokenPromptBox" Grid.Column="0"
<TextBox x:Uid="TokenPromptBox" x:Name="TokenPromptBox" Grid.Column="0"
PlaceholderText="Gateway token" Header="Token"/>
<Button x:Uid="ConnectionPage_Button_105" Grid.Column="1" Content="Cancel"
VerticalAlignment="Bottom" Click="OnCancelTokenPrompt"/>

View File

@ -473,14 +473,14 @@ public sealed partial class ConnectionPage : Page
_pendingGatewayUrl = gw.ConnectionUrl;
_pendingGatewayId = gw.Id;
TokenPromptText.Text = $"Connect to gateway at {gw.Host}:{gw.Port}";
TokenPromptBox.Password = _hub.Settings.Token ?? "";
TokenPromptBox.Text = _hub.Settings.Token ?? "";
TokenPromptPanel.Visibility = Visibility.Visible;
TokenPromptBox.Focus(Microsoft.UI.Xaml.FocusState.Programmatic);
}
private void OnConnectWithToken(object sender, RoutedEventArgs e)
{
var token = TokenPromptBox.Password?.Trim();
var token = TokenPromptBox.Text?.Trim();
if (string.IsNullOrEmpty(token) || _hub?.Settings == null || string.IsNullOrEmpty(_pendingGatewayUrl))
return;
@ -535,10 +535,9 @@ public sealed partial class ConnectionPage : Page
settings.GatewayUrl = result.Url;
if (!string.IsNullOrEmpty(result.Token))
{
// Bootstrap token goes to BootstrapToken only — it's single-use for pairing.
// Don't save it as Settings.Token, which would cause reconnect storms on restart.
settings.BootstrapToken = result.Token;
// Also set as the operator token so InitializeGatewayClient can connect
if (string.IsNullOrWhiteSpace(settings.Token))
settings.Token = result.Token;
}
settings.Save();

View File

@ -113,6 +113,7 @@
<Button x:Uid="DebugPage_Button_113" Content="📁 Open Diagnostics Folder" Click="OnOpenDiagnosticsFolder" HorizontalAlignment="Stretch"/>
<Button x:Uid="DebugPage_Button_114" Content="📋 Copy Support Context" Click="OnCopySupportContext"
Style="{ThemeResource AccentButtonStyle}" HorizontalAlignment="Stretch"/>
<Button Content="🔄 Relaunch First-Run Setup" Click="OnRelaunchOnboarding" HorizontalAlignment="Stretch"/>
</StackPanel>
</Expander>

View File

@ -196,4 +196,9 @@ public sealed partial class DebugPage : Page
timer.Start();
}
}
/// <summary>
/// Click handler for the "Relaunch First-Run Setup" button on the Debug page.
/// Invokes the hub's <c>OpenSetupAction</c> callback; does nothing when either
/// the hub or the callback is null.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Routed event data (unused).</param>
private void OnRelaunchOnboarding(object sender, RoutedEventArgs e)
    => _hub?.OpenSetupAction?.Invoke();
}

View File

@ -30,6 +30,7 @@ public sealed partial class HubWindow : WindowEx
public Action? ConnectAction { get; set; }
public Action? DisconnectAction { get; set; }
public Action? ReconnectAction { get; set; }
public Action? OpenSetupAction { get; set; }
// Node service state (set by App.xaml.cs in ShowHub)
public bool NodeIsConnected { get; set; }

View File

@ -51,7 +51,7 @@ public sealed class SetupWizardWindow : WindowEx
// Step 0: Setup code + manual entry
private readonly TextBox _setupCodeBox;
private readonly TextBox _gatewayUrlBox;
private readonly PasswordBox _tokenBox;
private readonly TextBox _tokenBox;
private readonly TextBlock _testStatusLabel;
private readonly Button _testButton;
private readonly StackPanel _manualEntryPanel;
@ -196,17 +196,17 @@ public sealed class SetupWizardWindow : WindowEx
Style = (Style)Application.Current.Resources["CaptionTextBlockStyle"],
Foreground = (SolidColorBrush)Application.Current.Resources["TextFillColorSecondaryBrush"]
});
_tokenBox = new PasswordBox
_tokenBox = new TextBox
{
Header = LocalizationHelper.GetString("Setup_TokenHeader"),
PlaceholderText = LocalizationHelper.GetString("Setup_TokenPlaceholder"),
Password = _draftToken
Text = _draftToken
};
AutomationProperties.SetAutomationId(_tokenBox, "TokenBox");
_tokenBox.PasswordChanged += (s, e) => _connectionTested = false;
_tokenBox.PasswordChanged += (s, e) =>
_tokenBox.TextChanged += (s, e) => _connectionTested = false;
_tokenBox.TextChanged += (s, e) =>
{
_draftToken = _tokenBox.Password;
_draftToken = _tokenBox.Text;
UpdatePairingStatusText();
};
_manualEntryPanel.Children.Add(_tokenBox);
@ -718,7 +718,7 @@ public sealed class SetupWizardWindow : WindowEx
private async void OnTestConnection(object sender, RoutedEventArgs e)
{
_draftGatewayUrl = _gatewayUrlBox.Text.Trim();
_draftToken = _tokenBox.Password;
_draftToken = _tokenBox.Text;
UpdatePairingStatusText();
if (!GatewayUrlHelper.IsValidGatewayUrl(_draftGatewayUrl))

View File

@ -546,7 +546,7 @@ public class LocalCommandRunnerIntegrationTests
{
Command = "Write-Output 'hello world'",
Shell = "powershell",
TimeoutMs = 10000
TimeoutMs = 30000
});
Assert.Equal(0, result.ExitCode);

View File

@ -251,7 +251,7 @@ public class WebSocketClientBaseTests
Assert.Contains(ConnectionStatus.Error, statuses);
Assert.True(statuses.Count(s => s == ConnectionStatus.Connecting) >= 2);
Assert.Contains(_logger.Logs, line => line.Contains("reconnecting in 1000ms", StringComparison.OrdinalIgnoreCase));
Assert.Contains(_logger.Logs, line => line.Contains("reconnecting in 1", StringComparison.OrdinalIgnoreCase) && line.Contains("ms (attempt 1)", StringComparison.OrdinalIgnoreCase));
client.Dispose();
}