From 844b162551a5276f9b869928ae91cdf6270596a4 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Wed, 6 Aug 2025 08:37:02 +0200 Subject: [PATCH] integration: Eventually, debug output, lint and format Signed-off-by: Kristoffer Dalby --- integration/acl_test.go | 259 +++- integration/auth_key_test.go | 17 +- integration/auth_oidc_test.go | 80 +- integration/auth_web_flow_test.go | 2 + integration/control.go | 5 + integration/dockertestutil/execute.go | 2 +- integration/dockertestutil/network.go | 2 +- integration/general_test.go | 1 + integration/hsic/hsic.go | 96 +- integration/route_test.go | 1745 +++++++++++++++++-------- integration/scenario.go | 41 + integration/ssh_test.go | 2 +- integration/tailscale.go | 2 + integration/tsic/tsic.go | 46 +- 14 files changed, 1719 insertions(+), 581 deletions(-) diff --git a/integration/acl_test.go b/integration/acl_test.go index d204d1f4..6a6d245c 100644 --- a/integration/acl_test.go +++ b/integration/acl_test.go @@ -5,6 +5,7 @@ import ( "net/netip" "strings" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -13,6 +14,7 @@ import ( "github.com/juanfont/headscale/integration/hsic" "github.com/juanfont/headscale/integration/integrationutil" "github.com/juanfont/headscale/integration/tsic" + "github.com/ory/dockertest/v3" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "tailscale.com/tailcfg" @@ -1271,57 +1273,262 @@ func TestACLAutogroupMember(t *testing.T) { func TestACLAutogroupTagged(t *testing.T) { IntegrationSkip(t) - scenario := aclScenario(t, - &policyv2.Policy{ - ACLs: []policyv2.ACL{ - { - Action: "accept", - Sources: []policyv2.Alias{ptr.To(policyv2.AutoGroupTagged)}, - Destinations: []policyv2.AliasWithPorts{ - aliasWithPorts(ptr.To(policyv2.AutoGroupTagged), tailcfg.PortRangeAny), - }, + // Create a custom scenario for testing autogroup:tagged + spec := ScenarioSpec{ + NodesPerUser: 2, // 2 nodes per user - one tagged, one untagged + Users: []string{"user1", "user2"}, + } + + scenario, err := NewScenario(spec) + require.NoError(t, err) + defer scenario.ShutdownAssertNoPanics(t) + + policy := &policyv2.Policy{ + TagOwners: policyv2.TagOwners{ + "tag:test": policyv2.Owners{usernameOwner("user1@"), usernameOwner("user2@")}, + }, + ACLs: []policyv2.ACL{ + { + Action: "accept", + Sources: []policyv2.Alias{ptr.To(policyv2.AutoGroupTagged)}, + Destinations: []policyv2.AliasWithPorts{ + aliasWithPorts(ptr.To(policyv2.AutoGroupTagged), tailcfg.PortRangeAny), }, }, }, + } - 2, + // Create only the headscale server (not the full environment with users/nodes) + headscale, err := scenario.Headscale( + hsic.WithACLPolicy(policy), + hsic.WithTestName("acl-autogroup-tagged"), + hsic.WithEmbeddedDERPServerOnly(), + hsic.WithTLS(), ) - defer scenario.ShutdownAssertNoPanics(t) + require.NoError(t, err) + + // Create users and nodes manually with specific tags + for _, userStr := range spec.Users { + user, err := scenario.CreateUser(userStr) + require.NoError(t, err) + + // Create a single pre-auth key per user + authKey, err := scenario.CreatePreAuthKey(user.GetId(), true, false) + require.NoError(t, err) + + // Create nodes with proper naming + for i := range spec.NodesPerUser { + var tags []string + var version string + + if i == 0 { + // First node is tagged + tags = []string{"tag:test"} + version = "head" + t.Logf("Creating tagged node for %s", userStr) + } else { + // Second node is untagged + tags = nil + version = "unstable" + t.Logf("Creating untagged node for %s", userStr) + } + + // Get the network for this scenario + networks := scenario.Networks() + var network *dockertest.Network + if len(networks) > 0 { + network = networks[0] + } + + // Create the tailscale node with appropriate options + opts := []tsic.Option{ + tsic.WithCACert(headscale.GetCert()), + tsic.WithHeadscaleName(headscale.GetHostname()), + tsic.WithNetwork(network), + tsic.WithNetfilter("off"), + tsic.WithDockerEntrypoint([]string{ + "/bin/sh", + "-c", + "/bin/sleep 3 ; apk add python3 curl ; update-ca-certificates ; python3 -m http.server --bind :: 80 & tailscaled --tun=tsdev", + }), + tsic.WithDockerWorkdir("/"), + } + + // Add tags if this is a tagged node + if len(tags) > 0 { + opts = append(opts, tsic.WithTags(tags)) + } + + tsClient, err := tsic.New( + scenario.Pool(), + version, + opts..., + ) + require.NoError(t, err) + + err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout()) + require.NoError(t, err) + + // Login with the auth key + err = tsClient.Login(headscale.GetEndpoint(), authKey.GetKey()) + require.NoError(t, err) + + err = tsClient.WaitForRunning(integrationutil.PeerSyncTimeout()) + require.NoError(t, err) + + // Add client to user + userObj := scenario.GetOrCreateUser(userStr) + userObj.Clients[tsClient.Hostname()] = tsClient + } + } allClients, err := scenario.ListTailscaleClients() require.NoError(t, err) + require.Len(t, allClients, 4) // 2 users * 2 nodes each - err = scenario.WaitForTailscaleSync() - require.NoError(t, err) + // Wait for nodes to see only their allowed peers + // Tagged nodes should see each other (2 tagged nodes total) + // Untagged nodes should see no one + var taggedClients []TailscaleClient + var untaggedClients []TailscaleClient - // Test that tagged nodes can access each other + // First, categorize nodes by checking their tags for _, client := range allClients { + hostname := client.Hostname() + + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + status, err := client.Status() + assert.NoError(ct, err) + + if status.Self.Tags != nil && status.Self.Tags.Len() > 0 { + // This is a tagged node + assert.Len(ct, status.Peers(), 1, "tagged node %s should see exactly 1 peer", hostname) + + // Add to tagged list only once we've verified it + found := false + for _, tc := range taggedClients { + if tc.Hostname() == hostname { + found = true + break + } + } + if !found { + taggedClients = append(taggedClients, client) + } + } else { + // This is an untagged node + assert.Empty(ct, status.Peers(), "untagged node %s should see 0 peers", hostname) + + // Add to untagged list only once we've verified it + found := false + for _, uc := range untaggedClients { + if uc.Hostname() == hostname { + found = true + break + } + } + if !found { + untaggedClients = append(untaggedClients, client) + } + } + }, 30*time.Second, 1*time.Second, "verifying peer visibility for node %s", hostname) + } + + // Verify we have the expected number of tagged and untagged nodes + require.Len(t, taggedClients, 2, "should have exactly 2 tagged nodes") + require.Len(t, untaggedClients, 2, "should have exactly 2 untagged nodes") + + // Explicitly verify tags on tagged nodes + for _, client := range taggedClients { status, err := client.Status() require.NoError(t, err) - if status.Self.Tags == nil || status.Self.Tags.Len() == 0 { - continue + require.NotNil(t, status.Self.Tags, "tagged node %s should have tags", client.Hostname()) + require.Positive(t, status.Self.Tags.Len(), "tagged node %s should have at least one tag", client.Hostname()) + t.Logf("Tagged node %s has tags: %v", client.Hostname(), status.Self.Tags) + } + + // Verify untagged nodes have no tags + for _, client := range untaggedClients { + status, err := client.Status() + require.NoError(t, err) + if status.Self.Tags != nil { + require.Equal(t, 0, status.Self.Tags.Len(), "untagged node %s should have no tags", client.Hostname()) } + t.Logf("Untagged node %s has no tags", client.Hostname()) + } - for _, peer := range allClients { + // Test that tagged nodes can communicate with each other + for _, client := range taggedClients { + for _, peer := range taggedClients { if client.Hostname() == peer.Hostname() { continue } - status, err := peer.Status() - require.NoError(t, err) - if status.Self.Tags == nil || status.Self.Tags.Len() == 0 { - continue - } - fqdn, err := peer.FQDN() require.NoError(t, err) url := fmt.Sprintf("http://%s/etc/hostname", fqdn) - t.Logf("url from %s to %s", client.Hostname(), url) + t.Logf("Testing connection from tagged node %s to tagged node %s", client.Hostname(), peer.Hostname()) - result, err := client.Curl(url) - assert.Len(t, result, 13) + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + result, err := client.Curl(url) + assert.NoError(ct, err) + assert.Len(ct, result, 13) + }, 15*time.Second, 500*time.Millisecond, "tagged nodes should be able to communicate") + } + } + + // Test that untagged nodes cannot communicate with anyone + for _, client := range untaggedClients { + // Try to reach tagged nodes (should fail) + for _, peer := range taggedClients { + fqdn, err := peer.FQDN() require.NoError(t, err) + + url := fmt.Sprintf("http://%s/etc/hostname", fqdn) + t.Logf("Testing connection from untagged node %s to tagged node %s (should fail)", client.Hostname(), peer.Hostname()) + + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + result, err := client.CurlFailFast(url) + assert.Empty(ct, result) + assert.Error(ct, err) + }, 5*time.Second, 200*time.Millisecond, "untagged nodes should not be able to reach tagged nodes") + } + + // Try to reach other untagged nodes (should also fail) + for _, peer := range untaggedClients { + if client.Hostname() == peer.Hostname() { + continue + } + + fqdn, err := peer.FQDN() + require.NoError(t, err) + + url := fmt.Sprintf("http://%s/etc/hostname", fqdn) + t.Logf("Testing connection from untagged node %s to untagged node %s (should fail)", client.Hostname(), peer.Hostname()) + + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + result, err := client.CurlFailFast(url) + assert.Empty(ct, result) + assert.Error(ct, err) + }, 5*time.Second, 200*time.Millisecond, "untagged nodes should not be able to reach other untagged nodes") + } + } + + // Test that tagged nodes cannot reach untagged nodes + for _, client := range taggedClients { + for _, peer := range untaggedClients { + fqdn, err := peer.FQDN() + require.NoError(t, err) + + url := fmt.Sprintf("http://%s/etc/hostname", fqdn) + t.Logf("Testing connection from tagged node %s to untagged node %s (should fail)", client.Hostname(), peer.Hostname()) + + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + result, err := client.CurlFailFast(url) + assert.Empty(ct, result) + assert.Error(ct, err) + }, 5*time.Second, 200*time.Millisecond, "tagged nodes should not be able to reach untagged nodes") } } } diff --git a/integration/auth_key_test.go b/integration/auth_key_test.go index 8050f6e7..019b85f4 100644 --- a/integration/auth_key_test.go +++ b/integration/auth_key_test.go @@ -30,7 +30,11 @@ func TestAuthKeyLogoutAndReloginSameUser(t *testing.T) { assertNoErr(t, err) defer scenario.ShutdownAssertNoPanics(t) - opts := []hsic.Option{hsic.WithTestName("pingallbyip")} + opts := []hsic.Option{ + hsic.WithTestName("pingallbyip"), + hsic.WithEmbeddedDERPServerOnly(), + hsic.WithDERPAsIP(), + } if https { opts = append(opts, []hsic.Option{ hsic.WithTLS(), @@ -130,6 +134,11 @@ func TestAuthKeyLogoutAndReloginSameUser(t *testing.T) { assertLastSeenSet(t, node) } + requireAllClientsOnline(t, headscale, expectedNodes, true, "all clients should be connected to batcher", 120*time.Second) + + err = scenario.WaitForTailscaleSync() + assertNoErrSync(t, err) + allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { return x.String() }) @@ -193,6 +202,7 @@ func TestAuthKeyLogoutAndReloginNewUser(t *testing.T) { err = scenario.CreateHeadscaleEnv([]tsic.Option{}, hsic.WithTestName("keyrelognewuser"), hsic.WithTLS(), + hsic.WithDERPAsIP(), ) assertNoErrHeadscaleEnv(t, err) @@ -282,7 +292,10 @@ func TestAuthKeyLogoutAndReloginSameUserExpiredKey(t *testing.T) { assertNoErr(t, err) defer scenario.ShutdownAssertNoPanics(t) - opts := []hsic.Option{hsic.WithTestName("pingallbyip")} + opts := []hsic.Option{ + hsic.WithTestName("pingallbyip"), + hsic.WithDERPAsIP(), + } if https { opts = append(opts, []hsic.Option{ hsic.WithTLS(), diff --git a/integration/auth_oidc_test.go b/integration/auth_oidc_test.go index 394d219b..6c784586 100644 --- a/integration/auth_oidc_test.go +++ b/integration/auth_oidc_test.go @@ -113,7 +113,18 @@ func TestOIDCAuthenticationPingAll(t *testing.T) { } } -// This test is really flaky. +// TestOIDCExpireNodesBasedOnTokenExpiry validates that nodes correctly transition to NeedsLogin +// state when their OIDC tokens expire. This test uses a short token TTL to validate the +// expiration behavior without waiting for production-length timeouts. +// +// The test verifies: +// - Nodes can successfully authenticate via OIDC and establish connectivity +// - When OIDC tokens expire, nodes transition to NeedsLogin state +// - The expiration is based on individual token issue times, not a global timer +// +// Known timing considerations: +// - Nodes may expire at different times due to sequential login processing +// - The test must account for login time spread between first and last node. func TestOIDCExpireNodesBasedOnTokenExpiry(t *testing.T) { IntegrationSkip(t) @@ -153,8 +164,12 @@ func TestOIDCExpireNodesBasedOnTokenExpiry(t *testing.T) { allIps, err := scenario.ListTailscaleClientsIPs() assertNoErrListClientIPs(t, err) + // Record when sync completes to better estimate token expiry timing + syncCompleteTime := time.Now() err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) + loginDuration := time.Since(syncCompleteTime) + t.Logf("Login and sync completed in %v", loginDuration) // assertClientsState(t, allClients) @@ -165,19 +180,49 @@ func TestOIDCExpireNodesBasedOnTokenExpiry(t *testing.T) { success := pingAllHelper(t, allClients, allAddrs) t.Logf("%d successful pings out of %d (before expiry)", success, len(allClients)*len(allIps)) - // This is not great, but this sadly is a time dependent test, so the - // safe thing to do is wait out the whole TTL time (and a bit more out - // of safety reasons) before checking if the clients have logged out. - // The Wait function can't do it itself as it has an upper bound of 1 - // min. + // Wait for OIDC token expiry and verify all nodes transition to NeedsLogin. + // We add extra time to account for: + // - Sequential login processing causing different token issue times + // - Network and processing delays + // - Safety margin for test reliability + loginTimeSpread := 1 * time.Minute // Account for sequential login delays + safetyBuffer := 30 * time.Second // Additional safety margin + totalWaitTime := shortAccessTTL + loginTimeSpread + safetyBuffer + + t.Logf("Waiting %v for OIDC tokens to expire (TTL: %v, spread: %v, buffer: %v)", + totalWaitTime, shortAccessTTL, loginTimeSpread, safetyBuffer) + + // EventuallyWithT retries the test function until it passes or times out. + // IMPORTANT: Use 'ct' (CollectT) for all assertions inside the function, not 't'. + // Using 't' would cause immediate test failure without retries, defeating the purpose + // of EventuallyWithT which is designed to handle timing-dependent conditions. assert.EventuallyWithT(t, func(ct *assert.CollectT) { + // Check each client's status individually to provide better diagnostics + expiredCount := 0 for _, client := range allClients { status, err := client.Status() - assert.NoError(ct, err) - assert.Equal(ct, "NeedsLogin", status.BackendState) + if assert.NoError(ct, err, "failed to get status for client %s", client.Hostname()) { + if status.BackendState == "NeedsLogin" { + expiredCount++ + } + } } - assertTailscaleNodesLogout(t, allClients) - }, shortAccessTTL+10*time.Second, 5*time.Second) + + // Log progress for debugging + if expiredCount < len(allClients) { + t.Logf("Token expiry progress: %d/%d clients in NeedsLogin state", expiredCount, len(allClients)) + } + + // All clients must be in NeedsLogin state + assert.Equal(ct, len(allClients), expiredCount, + "expected all %d clients to be in NeedsLogin state, but only %d are", + len(allClients), expiredCount) + + // Only check detailed logout state if all clients are expired + if expiredCount == len(allClients) { + assertTailscaleNodesLogout(ct, allClients) + } + }, totalWaitTime, 5*time.Second) } func TestOIDC024UserCreation(t *testing.T) { @@ -429,6 +474,7 @@ func TestOIDCReloginSameNodeNewUser(t *testing.T) { hsic.WithTLS(), hsic.WithFileInContainer("/tmp/hs_client_oidc_secret", []byte(scenario.mockOIDC.ClientSecret())), hsic.WithEmbeddedDERPServerOnly(), + hsic.WithDERPAsIP(), ) assertNoErrHeadscaleEnv(t, err) @@ -617,14 +663,18 @@ func TestOIDCReloginSameNodeNewUser(t *testing.T) { assert.NotEqual(t, listNodesAfterLoggingBackIn[0].GetNodeKey(), listNodesAfterLoggingBackIn[1].GetNodeKey()) } -func assertTailscaleNodesLogout(t *testing.T, clients []TailscaleClient) { - t.Helper() +// assertTailscaleNodesLogout verifies that all provided Tailscale clients +// are in the logged-out state (NeedsLogin). +func assertTailscaleNodesLogout(t assert.TestingT, clients []TailscaleClient) { + if h, ok := t.(interface{ Helper() }); ok { + h.Helper() + } for _, client := range clients { status, err := client.Status() - assertNoErr(t, err) - - assert.Equal(t, "NeedsLogin", status.BackendState) + assert.NoError(t, err, "failed to get status for client %s", client.Hostname()) + assert.Equal(t, "NeedsLogin", status.BackendState, + "client %s should be logged out", client.Hostname()) } } diff --git a/integration/auth_web_flow_test.go b/integration/auth_web_flow_test.go index 56c05e62..ff190142 100644 --- a/integration/auth_web_flow_test.go +++ b/integration/auth_web_flow_test.go @@ -30,6 +30,7 @@ func TestAuthWebFlowAuthenticationPingAll(t *testing.T) { nil, hsic.WithTestName("webauthping"), hsic.WithEmbeddedDERPServerOnly(), + hsic.WithDERPAsIP(), hsic.WithTLS(), ) assertNoErrHeadscaleEnv(t, err) @@ -68,6 +69,7 @@ func TestAuthWebFlowLogoutAndRelogin(t *testing.T) { err = scenario.CreateHeadscaleEnvWithLoginURL( nil, hsic.WithTestName("weblogout"), + hsic.WithDERPAsIP(), hsic.WithTLS(), ) assertNoErrHeadscaleEnv(t, err) diff --git a/integration/control.go b/integration/control.go index e3cb17bd..3994a4a5 100644 --- a/integration/control.go +++ b/integration/control.go @@ -5,6 +5,7 @@ import ( v1 "github.com/juanfont/headscale/gen/go/headscale/v1" policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2" + "github.com/juanfont/headscale/hscontrol/routes" "github.com/juanfont/headscale/hscontrol/types" "github.com/ory/dockertest/v3" "tailscale.com/tailcfg" @@ -30,6 +31,10 @@ type ControlServer interface { ApproveRoutes(uint64, []netip.Prefix) (*v1.Node, error) GetCert() []byte GetHostname() string + GetIPInNetwork(network *dockertest.Network) string SetPolicy(*policyv2.Policy) error GetAllMapReponses() (map[types.NodeID][]tailcfg.MapResponse, error) + PrimaryRoutes() (*routes.DebugRoutes, error) + DebugBatcher() (*hscontrol.DebugBatcherInfo, error) + DebugNodeStore() (map[types.NodeID]types.Node, error) } diff --git a/integration/dockertestutil/execute.go b/integration/dockertestutil/execute.go index e4b39efb..b09e0d40 100644 --- a/integration/dockertestutil/execute.go +++ b/integration/dockertestutil/execute.go @@ -10,7 +10,7 @@ import ( "github.com/ory/dockertest/v3" ) -const dockerExecuteTimeout = time.Second * 30 +const dockerExecuteTimeout = time.Second * 10 var ( ErrDockertestCommandFailed = errors.New("dockertest command failed") diff --git a/integration/dockertestutil/network.go b/integration/dockertestutil/network.go index 799d70f3..0ec6a69b 100644 --- a/integration/dockertestutil/network.go +++ b/integration/dockertestutil/network.go @@ -96,7 +96,7 @@ func CleanUnreferencedNetworks(pool *dockertest.Pool) error { } for _, network := range networks { - if network.Network.Containers == nil || len(network.Network.Containers) == 0 { + if len(network.Network.Containers) == 0 { err := pool.RemoveNetwork(&network) if err != nil { log.Printf("removing network %s: %s", network.Network.Name, err) diff --git a/integration/general_test.go b/integration/general_test.go index 9da61958..0610ec36 100644 --- a/integration/general_test.go +++ b/integration/general_test.go @@ -957,6 +957,7 @@ func TestPingAllByIPManyUpDown(t *testing.T) { []tsic.Option{}, hsic.WithTestName("pingallbyipmany"), hsic.WithEmbeddedDERPServerOnly(), + hsic.WithDERPAsIP(), hsic.WithTLS(), ) assertNoErrHeadscaleEnv(t, err) diff --git a/integration/hsic/hsic.go b/integration/hsic/hsic.go index 14999bc6..b38677b4 100644 --- a/integration/hsic/hsic.go +++ b/integration/hsic/hsic.go @@ -23,6 +23,7 @@ import ( "github.com/davecgh/go-spew/spew" v1 "github.com/juanfont/headscale/gen/go/headscale/v1" policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2" + "github.com/juanfont/headscale/hscontrol/routes" "github.com/juanfont/headscale/hscontrol/types" "github.com/juanfont/headscale/hscontrol/util" "github.com/juanfont/headscale/integration/dockertestutil" @@ -272,6 +273,14 @@ func WithTimezone(timezone string) Option { } } +// WithDERPAsIP enables using IP address instead of hostname for DERP server. +// This is useful for integration tests where DNS resolution may be unreliable. +func WithDERPAsIP() Option { + return func(hsic *HeadscaleInContainer) { + hsic.env["HEADSCALE_DEBUG_DERP_USE_IP"] = "1" + } +} + // WithDebugPort sets the debug port for delve debugging. func WithDebugPort(port int) Option { return func(hsic *HeadscaleInContainer) { @@ -867,9 +876,25 @@ func (t *HeadscaleInContainer) GetHealthEndpoint() string { // GetEndpoint returns the Headscale endpoint for the HeadscaleInContainer. func (t *HeadscaleInContainer) GetEndpoint() string { - hostEndpoint := fmt.Sprintf("%s:%d", - t.GetHostname(), - t.port) + return t.getEndpoint(false) +} + +// GetIPEndpoint returns the Headscale endpoint using IP address instead of hostname. +func (t *HeadscaleInContainer) GetIPEndpoint() string { + return t.getEndpoint(true) +} + +// getEndpoint returns the Headscale endpoint, optionally using IP address instead of hostname. +func (t *HeadscaleInContainer) getEndpoint(useIP bool) string { + var host string + if useIP && len(t.networks) > 0 { + // Use IP address from the first network + host = t.GetIPInNetwork(t.networks[0]) + } else { + host = t.GetHostname() + } + + hostEndpoint := fmt.Sprintf("%s:%d", host, t.port) if t.hasTLS() { return "https://" + hostEndpoint @@ -888,6 +913,11 @@ func (t *HeadscaleInContainer) GetHostname() string { return t.hostname } +// GetIPInNetwork returns the IP address of the HeadscaleInContainer in the given network. +func (t *HeadscaleInContainer) GetIPInNetwork(network *dockertest.Network) string { + return t.container.GetIPInNetwork(network) +} + // WaitForRunning blocks until the Headscale instance is ready to // serve clients. func (t *HeadscaleInContainer) WaitForRunning() error { @@ -1300,3 +1330,63 @@ func (t *HeadscaleInContainer) GetAllMapReponses() (map[types.NodeID][]tailcfg.M return res, nil } + +// PrimaryRoutes fetches the primary routes from the debug endpoint. +func (t *HeadscaleInContainer) PrimaryRoutes() (*routes.DebugRoutes, error) { + // Execute curl inside the container to access the debug endpoint locally + command := []string{ + "curl", "-s", "-H", "Accept: application/json", "http://localhost:9090/debug/routes", + } + + result, err := t.Execute(command) + if err != nil { + return nil, fmt.Errorf("fetching routes from debug endpoint: %w", err) + } + + var debugRoutes routes.DebugRoutes + if err := json.Unmarshal([]byte(result), &debugRoutes); err != nil { + return nil, fmt.Errorf("decoding routes response: %w", err) + } + + return &debugRoutes, nil +} + +// DebugBatcher fetches the batcher debug information from the debug endpoint. +func (t *HeadscaleInContainer) DebugBatcher() (*hscontrol.DebugBatcherInfo, error) { + // Execute curl inside the container to access the debug endpoint locally + command := []string{ + "curl", "-s", "-H", "Accept: application/json", "http://localhost:9090/debug/batcher", + } + + result, err := t.Execute(command) + if err != nil { + return nil, fmt.Errorf("fetching batcher debug info: %w", err) + } + + var debugInfo hscontrol.DebugBatcherInfo + if err := json.Unmarshal([]byte(result), &debugInfo); err != nil { + return nil, fmt.Errorf("decoding batcher debug response: %w", err) + } + + return &debugInfo, nil +} + +// DebugNodeStore fetches the NodeStore data from the debug endpoint. +func (t *HeadscaleInContainer) DebugNodeStore() (map[types.NodeID]types.Node, error) { + // Execute curl inside the container to access the debug endpoint locally + command := []string{ + "curl", "-s", "-H", "Accept: application/json", "http://localhost:9090/debug/nodestore", + } + + result, err := t.Execute(command) + if err != nil { + return nil, fmt.Errorf("fetching nodestore debug info: %w", err) + } + + var nodeStore map[types.NodeID]types.Node + if err := json.Unmarshal([]byte(result), &nodeStore); err != nil { + return nil, fmt.Errorf("decoding nodestore debug response: %w", err) + } + + return nodeStore, nil +} diff --git a/integration/route_test.go b/integration/route_test.go index bb13a47f..66db271d 100644 --- a/integration/route_test.go +++ b/integration/route_test.go @@ -1,11 +1,13 @@ package integration import ( + "cmp" "encoding/json" "fmt" "net/netip" "slices" "sort" + "strconv" "strings" "testing" "time" @@ -14,12 +16,14 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" v1 "github.com/juanfont/headscale/gen/go/headscale/v1" policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2" + "github.com/juanfont/headscale/hscontrol/routes" "github.com/juanfont/headscale/hscontrol/types" "github.com/juanfont/headscale/hscontrol/util" "github.com/juanfont/headscale/integration/hsic" "github.com/juanfont/headscale/integration/tsic" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + xmaps "golang.org/x/exp/maps" "tailscale.com/ipn/ipnstate" "tailscale.com/net/tsaddr" "tailscale.com/tailcfg" @@ -30,6 +34,8 @@ import ( "tailscale.com/wgengine/filter" ) +const timestampFormat = "15:04:05.000" + var allPorts = filter.PortRange{First: 0, Last: 0xffff} // This test is both testing the routes command and the propagation of @@ -68,9 +74,7 @@ func TestEnablingRoutes(t *testing.T) { // advertise routes using the up command for _, client := range allClients { - status, err := client.Status() - require.NoError(t, err) - + status := client.MustStatus() command := []string{ "tailscale", "set", @@ -83,26 +87,33 @@ func TestEnablingRoutes(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - nodes, err := headscale.ListNodes() - require.NoError(t, err) + var nodes []*v1.Node + // Wait for route advertisements to propagate to NodeStore + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + var err error + nodes, err = headscale.ListNodes() + assert.NoError(ct, err) - for _, node := range nodes { - assert.Len(t, node.GetAvailableRoutes(), 1) - assert.Empty(t, node.GetApprovedRoutes()) - assert.Empty(t, node.GetSubnetRoutes()) - } + for _, node := range nodes { + assert.Len(ct, node.GetAvailableRoutes(), 1) + assert.Empty(ct, node.GetApprovedRoutes()) + assert.Empty(ct, node.GetSubnetRoutes()) + } + }, 10*time.Second, 100*time.Millisecond, "route advertisements should propagate to all nodes") // Verify that no routes has been sent to the client, // they are not yet enabled. for _, client := range allClients { - status, err := client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - assert.Nil(t, peerStatus.PrimaryRoutes) - } + assert.Nil(c, peerStatus.PrimaryRoutes) + } + }, 5*time.Second, 200*time.Millisecond, "Verifying no routes are active before approval") } for _, node := range nodes { @@ -113,14 +124,18 @@ func TestEnablingRoutes(t *testing.T) { require.NoError(t, err) } - nodes, err = headscale.ListNodes() - require.NoError(t, err) + // Wait for route approvals to propagate to NodeStore + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + var err error + nodes, err = headscale.ListNodes() + assert.NoError(ct, err) - for _, node := range nodes { - assert.Len(t, node.GetAvailableRoutes(), 1) - assert.Len(t, node.GetApprovedRoutes(), 1) - assert.Len(t, node.GetSubnetRoutes(), 1) - } + for _, node := range nodes { + assert.Len(ct, node.GetAvailableRoutes(), 1) + assert.Len(ct, node.GetApprovedRoutes(), 1) + assert.Len(ct, node.GetSubnetRoutes(), 1) + } + }, 10*time.Second, 100*time.Millisecond, "route approvals should propagate to all nodes") // Wait for route state changes to propagate to clients assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -133,7 +148,10 @@ func TestEnablingRoutes(t *testing.T) { peerStatus := status.Peer[peerKey] assert.NotNil(c, peerStatus.PrimaryRoutes) - assert.Len(c, peerStatus.AllowedIPs.AsSlice(), 3) + assert.NotNil(c, peerStatus.AllowedIPs) + if peerStatus.AllowedIPs != nil { + assert.Len(c, peerStatus.AllowedIPs.AsSlice(), 3) + } requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{netip.MustParsePrefix(expectedRoutes[string(peerStatus.ID)])}) } } @@ -153,6 +171,7 @@ func TestEnablingRoutes(t *testing.T) { // Wait for route state changes to propagate to nodes assert.EventuallyWithT(t, func(c *assert.CollectT) { + var err error nodes, err = headscale.ListNodes() assert.NoError(c, err) @@ -175,27 +194,45 @@ func TestEnablingRoutes(t *testing.T) { // Verify that the clients can see the new routes for _, client := range allClients { - status, err := client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - switch peerStatus.ID { - case "1": - requirePeerSubnetRoutes(t, peerStatus, nil) - case "2": - requirePeerSubnetRoutes(t, peerStatus, nil) - default: - requirePeerSubnetRoutes(t, peerStatus, []netip.Prefix{netip.MustParsePrefix("10.0.2.0/24")}) + switch peerStatus.ID { + case "1": + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + case "2": + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + default: + requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{netip.MustParsePrefix("10.0.2.0/24")}) + } } - } + }, 5*time.Second, 200*time.Millisecond, "Verifying final route state visible to clients") } } func TestHASubnetRouterFailover(t *testing.T) { IntegrationSkip(t) + propagationTime := 60 * time.Second + + // Helper function to validate primary routes table state + validatePrimaryRoutes := func(t *testing.T, headscale ControlServer, expectedRoutes *routes.DebugRoutes, message string) { + t.Helper() + assert.EventuallyWithT(t, func(c *assert.CollectT) { + primaryRoutesState, err := headscale.PrimaryRoutes() + assert.NoError(c, err) + + if diff := cmpdiff.Diff(expectedRoutes, primaryRoutesState, util.PrefixComparer); diff != "" { + t.Log(message) + t.Errorf("validatePrimaryRoutes mismatch (-want +got):\n%s", diff) + } + }, propagationTime, 200*time.Millisecond, "Validating primary routes table") + } + spec := ScenarioSpec{ NodesPerUser: 3, Users: []string{"user1", "user2"}, @@ -213,7 +250,7 @@ func TestHASubnetRouterFailover(t *testing.T) { scenario, err := NewScenario(spec) require.NoErrorf(t, err, "failed to create scenario: %s", err) - defer scenario.ShutdownAssertNoPanics(t) + // defer scenario.ShutdownAssertNoPanics(t) err = scenario.CreateHeadscaleEnv( []tsic.Option{tsic.WithAcceptRoutes()}, @@ -266,11 +303,13 @@ func TestHASubnetRouterFailover(t *testing.T) { client := allClients[3] - t.Logf("Advertise route from r1 (%s), r2 (%s), r3 (%s), making it HA, n1 is primary", subRouter1.Hostname(), subRouter2.Hostname(), subRouter3.Hostname()) - // advertise HA route on node 1, 2, 3 - // ID 1 will be primary - // ID 2 will be standby - // ID 3 will be standby + t.Logf("%s (%s) picked as client", client.Hostname(), client.MustID()) + t.Logf("=== Initial Route Advertisement - Setting up HA configuration with 3 routers ===") + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" - Router 1 (%s): Advertising route %s - will become PRIMARY when approved", subRouter1.Hostname(), pref.String()) + t.Logf(" - Router 2 (%s): Advertising route %s - will be STANDBY when approved", subRouter2.Hostname(), pref.String()) + t.Logf(" - Router 3 (%s): Advertising route %s - will be STANDBY when approved", subRouter3.Hostname(), pref.String()) + t.Logf(" Expected: All 3 routers advertise the same route for redundancy, but only one will be primary at a time") for _, client := range allClients[:3] { command := []string{ "tailscale", @@ -290,28 +329,63 @@ func TestHASubnetRouterFailover(t *testing.T) { nodes, err = headscale.ListNodes() assert.NoError(c, err) assert.Len(c, nodes, 6) - + require.GreaterOrEqual(t, len(nodes), 3, "need at least 3 nodes to avoid panic") requireNodeRouteCountWithCollect(c, nodes[0], 1, 0, 0) requireNodeRouteCountWithCollect(c, nodes[1], 1, 0, 0) requireNodeRouteCountWithCollect(c, nodes[2], 1, 0, 0) - }, 3*time.Second, 200*time.Millisecond, "all routes should be available but not yet approved") + }, propagationTime, 200*time.Millisecond, "Waiting for route advertisements: All 3 routers should have advertised routes (available=1) but none approved yet (approved=0, subnet=0)") // Verify that no routes has been sent to the client, // they are not yet enabled. for _, client := range allClients { - status, err := client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - assert.Nil(t, peerStatus.PrimaryRoutes) - requirePeerSubnetRoutes(t, peerStatus, nil) + assert.Nil(c, peerStatus.PrimaryRoutes) + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } + }, propagationTime, 200*time.Millisecond, "Verifying no routes are active before approval") + } + + // Declare variables that will be used across multiple EventuallyWithT blocks + var ( + srs1, srs2, srs3 *ipnstate.Status + clientStatus *ipnstate.Status + srs1PeerStatus *ipnstate.PeerStatus + srs2PeerStatus *ipnstate.PeerStatus + srs3PeerStatus *ipnstate.PeerStatus + ) + + // Helper function to check test failure and print route map if needed + checkFailureAndPrintRoutes := func(t *testing.T, client TailscaleClient) { + if t.Failed() { + t.Logf("[%s] Test failed at this checkpoint", time.Now().Format(timestampFormat)) + status, err := client.Status() + if err == nil { + printCurrentRouteMap(t, xmaps.Values(status.Peer)...) + } + t.FailNow() } } + // Validate primary routes table state - no routes approved yet + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{}, + PrimaryRoutes: map[string]types.NodeID{}, // No primary routes yet + }, "Primary routes table should be empty (no approved routes yet)") + + checkFailureAndPrintRoutes(t, client) + // Enable route on node 1 - t.Logf("Enabling route on subnet router 1, no HA") + t.Logf("=== Approving route on router 1 (%s) - Single router mode (no HA yet) ===", subRouter1.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Expected: Router 1 becomes PRIMARY with route %s active", pref.String()) + t.Logf(" Expected: Routers 2 & 3 remain with advertised but unapproved routes") + t.Logf(" Expected: Client can access webservice through router 1 only") _, err = headscale.ApproveRoutes( MustFindNode(subRouter1.Hostname(), nodes).GetId(), []netip.Prefix{pref}, @@ -323,52 +397,92 @@ func TestHASubnetRouterFailover(t *testing.T) { nodes, err = headscale.ListNodes() assert.NoError(c, err) assert.Len(c, nodes, 6) - + require.GreaterOrEqual(t, len(nodes), 3, "need at least 3 nodes to avoid panic") requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1) requireNodeRouteCountWithCollect(c, nodes[1], 1, 0, 0) requireNodeRouteCountWithCollect(c, nodes[2], 1, 0, 0) - }, 3*time.Second, 200*time.Millisecond, "first subnet router should have approved route") + }, propagationTime, 200*time.Millisecond, "Router 1 approval verification: Should be PRIMARY (available=1, approved=1, subnet=1), others still unapproved (available=1, approved=0, subnet=0)") // Verify that the client has routes from the primary machine and can access // the webservice. - srs1 := subRouter1.MustStatus() - srs2 := subRouter2.MustStatus() - srs3 := subRouter3.MustStatus() - clientStatus := client.MustStatus() + assert.EventuallyWithT(t, func(c *assert.CollectT) { + srs1 = subRouter1.MustStatus() + srs2 = subRouter2.MustStatus() + srs3 = subRouter3.MustStatus() + clientStatus = client.MustStatus() - srs1PeerStatus := clientStatus.Peer[srs1.Self.PublicKey] - srs2PeerStatus := clientStatus.Peer[srs2.Self.PublicKey] - srs3PeerStatus := clientStatus.Peer[srs3.Self.PublicKey] + srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] + srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] + srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] - assert.True(t, srs1PeerStatus.Online, "r1 up, r2 up") - assert.True(t, srs2PeerStatus.Online, "r1 up, r2 up") - assert.True(t, srs3PeerStatus.Online, "r1 up, r2 up") + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist") + assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist") - assert.Nil(t, srs2PeerStatus.PrimaryRoutes) - assert.Nil(t, srs3PeerStatus.PrimaryRoutes) - require.NotNil(t, srs1PeerStatus.PrimaryRoutes) + if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil { + return + } - requirePeerSubnetRoutes(t, srs1PeerStatus, []netip.Prefix{pref}) - requirePeerSubnetRoutes(t, srs2PeerStatus, nil) - requirePeerSubnetRoutes(t, srs3PeerStatus, nil) + assert.True(c, srs1PeerStatus.Online, "Router 1 should be online and serving as PRIMARY") + assert.True(c, srs2PeerStatus.Online, "Router 2 should be online but NOT serving routes (unapproved)") + assert.True(c, srs3PeerStatus.Online, "Router 3 should be online but NOT serving routes (unapproved)") - t.Logf("got list: %v, want in: %v", srs1PeerStatus.PrimaryRoutes.AsSlice(), pref) - assert.Contains(t, - srs1PeerStatus.PrimaryRoutes.AsSlice(), - pref, - ) + assert.Nil(c, srs2PeerStatus.PrimaryRoutes) + assert.Nil(c, srs3PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs1PeerStatus.PrimaryRoutes) - t.Logf("Validating access via subnetrouter(%s) to %s, no HA", subRouter1.MustIPv4().String(), webip.String()) - result, err := client.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{pref}) + requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil) - tr, err := client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, subRouter1.MustIPv4()) + if srs1PeerStatus.PrimaryRoutes != nil { + t.Logf("got list: %v, want in: %v", srs1PeerStatus.PrimaryRoutes.AsSlice(), pref) + assert.Contains(c, + srs1PeerStatus.PrimaryRoutes.AsSlice(), + pref, + ) + } + }, propagationTime, 200*time.Millisecond, "Verifying Router 1 is PRIMARY with routes after approval") + + t.Logf("=== Validating connectivity through PRIMARY router 1 (%s) to webservice at %s ===", must.Get(subRouter1.IPv4()).String(), webip.String()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Expected: Traffic flows through router 1 as it's the only approved route") + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := subRouter1.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for subRouter1") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 1") + + // Validate primary routes table state - router 1 is primary + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref}, + // Note: Router 2 and 3 are available but not approved + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()), + }, + }, "Router 1 should be primary for route "+pref.String()) + + checkFailureAndPrintRoutes(t, client) // Enable route on node 2, now we will have a HA subnet router - t.Logf("Enabling route on subnet router 2, now HA, subnetrouter 1 is primary, 2 is standby") + t.Logf("=== Enabling High Availability by approving route on router 2 (%s) ===", subRouter2.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current state: Router 1 is PRIMARY and actively serving traffic") + t.Logf(" Expected: Router 2 becomes STANDBY (approved but not primary)") + t.Logf(" Expected: Router 1 remains PRIMARY (no flapping - stability preferred)") + t.Logf(" Expected: HA is now active - if router 1 fails, router 2 can take over") _, err = headscale.ApproveRoutes( MustFindNode(subRouter2.Hostname(), nodes).GetId(), []netip.Prefix{pref}, @@ -380,52 +494,110 @@ func TestHASubnetRouterFailover(t *testing.T) { nodes, err = headscale.ListNodes() assert.NoError(c, err) assert.Len(c, nodes, 6) - - requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1) - requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0) - requireNodeRouteCountWithCollect(c, nodes[2], 1, 0, 0) - }, 3*time.Second, 200*time.Millisecond, "second subnet router should have approved route") + if len(nodes) >= 3 { + requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1) + requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0) + requireNodeRouteCountWithCollect(c, nodes[2], 1, 0, 0) + } + }, 3*time.Second, 200*time.Millisecond, "HA setup verification: Router 2 approved as STANDBY (available=1, approved=1, subnet=0), Router 1 stays PRIMARY (subnet=1)") // Verify that the client has routes from the primary machine - srs1 = subRouter1.MustStatus() - srs2 = subRouter2.MustStatus() - srs3 = subRouter3.MustStatus() - clientStatus = client.MustStatus() + assert.EventuallyWithT(t, func(c *assert.CollectT) { + srs1 = subRouter1.MustStatus() + srs2 = subRouter2.MustStatus() + srs3 = subRouter3.MustStatus() + clientStatus = client.MustStatus() - srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] - srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] - srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] + srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] + srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] + srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] - assert.True(t, srs1PeerStatus.Online, "r1 up, r2 up") - assert.True(t, srs2PeerStatus.Online, "r1 up, r2 up") - assert.True(t, srs3PeerStatus.Online, "r1 up, r2 up") + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist") + assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist") - assert.Nil(t, srs2PeerStatus.PrimaryRoutes) - assert.Nil(t, srs3PeerStatus.PrimaryRoutes) - require.NotNil(t, srs1PeerStatus.PrimaryRoutes) + if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil { + return + } - requirePeerSubnetRoutes(t, srs1PeerStatus, []netip.Prefix{pref}) - requirePeerSubnetRoutes(t, srs2PeerStatus, nil) - requirePeerSubnetRoutes(t, srs3PeerStatus, nil) + assert.True(c, srs1PeerStatus.Online, "Router 1 should be online and remain PRIMARY") + assert.True(c, srs2PeerStatus.Online, "Router 2 should be online and now approved as STANDBY") + assert.True(c, srs3PeerStatus.Online, "Router 3 should be online but still unapproved") - t.Logf("got list: %v, want in: %v", srs1PeerStatus.PrimaryRoutes.AsSlice(), pref) - assert.Contains(t, - srs1PeerStatus.PrimaryRoutes.AsSlice(), - pref, - ) + assert.Nil(c, srs2PeerStatus.PrimaryRoutes) + assert.Nil(c, srs3PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs1PeerStatus.PrimaryRoutes) - t.Logf("Validating access via subnetrouter(%s) to %s, 2 is standby", subRouter1.MustIPv4().String(), webip.String()) - result, err = client.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{pref}) + requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil) - tr, err = client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, subRouter1.MustIPv4()) + if srs1PeerStatus.PrimaryRoutes != nil { + t.Logf("got list: %v, want in: %v", srs1PeerStatus.PrimaryRoutes.AsSlice(), pref) + assert.Contains(c, + srs1PeerStatus.PrimaryRoutes.AsSlice(), + pref, + ) + } + }, propagationTime, 200*time.Millisecond, "Verifying Router 1 remains PRIMARY after Router 2 approval") + + // Validate primary routes table state - router 1 still primary, router 2 approved but standby + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref}, + // Note: Router 3 is available but not approved + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()), + }, + }, "Router 1 should remain primary after router 2 approval") + + checkFailureAndPrintRoutes(t, client) + + t.Logf("=== Validating HA configuration - Router 1 PRIMARY, Router 2 STANDBY ===") + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current routing: Traffic through router 1 (%s) to %s", must.Get(subRouter1.IPv4()), webip.String()) + t.Logf(" Expected: Router 1 continues to handle all traffic (no change from before)") + t.Logf(" Expected: Router 2 is ready to take over if router 1 fails") + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 in HA mode") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := subRouter1.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for subRouter1") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, propagationTime, 200*time.Millisecond, "Verifying traceroute still goes through router 1 in HA mode") + + // Validate primary routes table state - router 1 primary, router 2 approved (standby) + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref}, + // Note: Router 3 is available but not approved + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()), + }, + }, "Router 1 primary with router 2 as standby") + + checkFailureAndPrintRoutes(t, client) // Enable route on node 3, now we will have a second standby and all will // be enabled. - t.Logf("Enabling route on subnet router 3, now HA, subnetrouter 1 is primary, 2 and 3 is standby") + t.Logf("=== Adding second STANDBY router by approving route on router 3 (%s) ===", subRouter3.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current state: Router 1 PRIMARY, Router 2 STANDBY") + t.Logf(" Expected: Router 3 becomes second STANDBY (approved but not primary)") + t.Logf(" Expected: Router 1 remains PRIMARY, Router 2 remains first STANDBY") + t.Logf(" Expected: Full HA configuration with 1 PRIMARY + 2 STANDBY routers") _, err = headscale.ApproveRoutes( MustFindNode(subRouter3.Hostname(), nodes).GetId(), []netip.Prefix{pref}, @@ -437,43 +609,57 @@ func TestHASubnetRouterFailover(t *testing.T) { nodes, err = headscale.ListNodes() assert.NoError(c, err) assert.Len(c, nodes, 6) - + require.GreaterOrEqual(t, len(nodes), 3, "need at least 3 nodes to avoid panic") requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1) requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0) requireNodeRouteCountWithCollect(c, nodes[2], 1, 1, 0) - }, 3*time.Second, 200*time.Millisecond, "third subnet router should have approved route") + }, 3*time.Second, 200*time.Millisecond, "Full HA verification: Router 3 approved as second STANDBY (available=1, approved=1, subnet=0), Router 1 PRIMARY, Router 2 first STANDBY") // Verify that the client has routes from the primary machine - srs1 = subRouter1.MustStatus() - srs2 = subRouter2.MustStatus() - srs3 = subRouter3.MustStatus() - clientStatus = client.MustStatus() + assert.EventuallyWithT(t, func(c *assert.CollectT) { + srs1 = subRouter1.MustStatus() + srs2 = subRouter2.MustStatus() + srs3 = subRouter3.MustStatus() + clientStatus = client.MustStatus() - srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] - srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] - srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] + srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] + srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] + srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] - assert.True(t, srs1PeerStatus.Online, "r1 up, r2 up") - assert.True(t, srs2PeerStatus.Online, "r1 up, r2 up") - assert.True(t, srs3PeerStatus.Online, "r1 up, r2 up") + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist") + assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist") - assert.Nil(t, srs2PeerStatus.PrimaryRoutes) - assert.Nil(t, srs3PeerStatus.PrimaryRoutes) - require.NotNil(t, srs1PeerStatus.PrimaryRoutes) + if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil { + return + } - requirePeerSubnetRoutes(t, srs1PeerStatus, []netip.Prefix{pref}) - requirePeerSubnetRoutes(t, srs2PeerStatus, nil) - requirePeerSubnetRoutes(t, srs3PeerStatus, nil) + assert.True(c, srs1PeerStatus.Online, "Router 1 should be online and remain PRIMARY") + assert.True(c, srs2PeerStatus.Online, "Router 2 should be online as first STANDBY") + assert.True(c, srs3PeerStatus.Online, "Router 3 should be online as second STANDBY") - t.Logf("got list: %v, want in: %v", srs1PeerStatus.PrimaryRoutes.AsSlice(), pref) - assert.Contains(t, - srs1PeerStatus.PrimaryRoutes.AsSlice(), - pref, - ) + assert.Nil(c, srs2PeerStatus.PrimaryRoutes) + assert.Nil(c, srs3PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs1PeerStatus.PrimaryRoutes) - result, err = client.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{pref}) + requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil) + + if srs1PeerStatus.PrimaryRoutes != nil { + t.Logf("got list: %v, want in: %v", srs1PeerStatus.PrimaryRoutes.AsSlice(), pref) + assert.Contains(c, + srs1PeerStatus.PrimaryRoutes.AsSlice(), + pref, + ) + } + }, propagationTime, 200*time.Millisecond, "Verifying full HA with 3 routers: Router 1 PRIMARY, Routers 2 & 3 STANDBY") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 with full HA") // Wait for traceroute to work correctly through the expected router assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -495,11 +681,30 @@ func TestHASubnetRouterFailover(t *testing.T) { assert.True(c, expectedIP.IsValid(), "subRouter1 should have a valid IPv4 address") assertTracerouteViaIPWithCollect(c, tr, expectedIP) - }, 10*time.Second, 500*time.Millisecond, "traceroute should go through subRouter1") + }, 10*time.Second, 500*time.Millisecond, "Verifying traffic still flows through PRIMARY router 1 with full HA setup active") + + // Validate primary routes table state - all 3 routers approved, router 1 still primary + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref}, + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()), + }, + }, "Router 1 primary with all 3 routers approved") + + checkFailureAndPrintRoutes(t, client) // Take down the current primary - t.Logf("taking down subnet router r1 (%s)", subRouter1.Hostname()) - t.Logf("expecting r2 (%s) to take over as primary", subRouter2.Hostname()) + t.Logf("=== FAILOVER TEST: Taking down PRIMARY router 1 (%s) ===", subRouter1.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current state: Router 1 PRIMARY (serving traffic), Router 2 & 3 STANDBY") + t.Logf(" Action: Shutting down router 1 to simulate failure") + t.Logf(" Expected: Router 2 (%s) should automatically become new PRIMARY", subRouter2.Hostname()) + t.Logf(" Expected: Router 3 remains STANDBY") + t.Logf(" Expected: Traffic seamlessly fails over to router 2") err = subRouter1.Down() require.NoError(t, err) @@ -512,36 +717,72 @@ func TestHASubnetRouterFailover(t *testing.T) { srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist") + assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist") + + if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil { + return + } + assert.False(c, srs1PeerStatus.Online, "r1 should be offline") assert.True(c, srs2PeerStatus.Online, "r2 should be online") assert.True(c, srs3PeerStatus.Online, "r3 should be online") - }, 5*time.Second, 200*time.Millisecond, "router status should update after r1 goes down") - assert.Nil(t, srs1PeerStatus.PrimaryRoutes) - require.NotNil(t, srs2PeerStatus.PrimaryRoutes) - assert.Nil(t, srs3PeerStatus.PrimaryRoutes) + assert.Nil(c, srs1PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs2PeerStatus.PrimaryRoutes) + assert.Nil(c, srs3PeerStatus.PrimaryRoutes) - requirePeerSubnetRoutes(t, srs1PeerStatus, nil) - requirePeerSubnetRoutes(t, srs2PeerStatus, []netip.Prefix{pref}) - requirePeerSubnetRoutes(t, srs3PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, []netip.Prefix{pref}) + requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil) - assert.Contains( - t, - srs2PeerStatus.PrimaryRoutes.AsSlice(), - pref, - ) + if srs2PeerStatus.PrimaryRoutes != nil { + assert.Contains(c, + srs2PeerStatus.PrimaryRoutes.AsSlice(), + pref, + ) + } + }, propagationTime, 200*time.Millisecond, "Failover verification: Router 1 offline, Router 2 should be new PRIMARY with routes, Router 3 still STANDBY") - result, err = client.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after failover") - tr, err = client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, subRouter2.MustIPv4()) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := subRouter2.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for subRouter2") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 2 after failover") + + // Validate primary routes table state - router 2 is now primary after router 1 failure + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + // Router 1 is disconnected, so not in AvailableRoutes + types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref}, + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()), + }, + }, "Router 2 should be primary after router 1 failure") + + checkFailureAndPrintRoutes(t, client) // Take down subnet router 2, leaving none available - t.Logf("taking down subnet router r2 (%s)", subRouter2.Hostname()) - t.Logf("expecting no primary, r3 available, but no HA so no primary") + t.Logf("=== FAILOVER TEST: Taking down NEW PRIMARY router 2 (%s) ===", subRouter2.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current state: Router 1 OFFLINE, Router 2 PRIMARY (serving traffic), Router 3 STANDBY") + t.Logf(" Action: Shutting down router 2 to simulate cascading failure") + t.Logf(" Expected: Router 3 (%s) should become new PRIMARY (last remaining router)", subRouter3.Hostname()) + t.Logf(" Expected: With only 1 router left, HA is effectively disabled") + t.Logf(" Expected: Traffic continues through router 3") err = subRouter2.Down() require.NoError(t, err) @@ -554,30 +795,64 @@ func TestHASubnetRouterFailover(t *testing.T) { srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] - assert.False(c, srs1PeerStatus.Online, "r1 should be offline") - assert.False(c, srs2PeerStatus.Online, "r2 should be offline") - assert.True(c, srs3PeerStatus.Online, "r3 should be online") - }, 5*time.Second, 200*time.Millisecond, "router status should update after r2 goes down") + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist") + assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist") - assert.Nil(t, srs1PeerStatus.PrimaryRoutes) - assert.Nil(t, srs2PeerStatus.PrimaryRoutes) - require.NotNil(t, srs3PeerStatus.PrimaryRoutes) + if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil { + return + } - requirePeerSubnetRoutes(t, srs1PeerStatus, nil) - requirePeerSubnetRoutes(t, srs2PeerStatus, nil) - requirePeerSubnetRoutes(t, srs3PeerStatus, []netip.Prefix{pref}) + assert.False(c, srs1PeerStatus.Online, "Router 1 should still be offline") + assert.False(c, srs2PeerStatus.Online, "Router 2 should now be offline after failure") + assert.True(c, srs3PeerStatus.Online, "Router 3 should be online and taking over as PRIMARY") - result, err = client.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + assert.Nil(c, srs1PeerStatus.PrimaryRoutes) + assert.Nil(c, srs2PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs3PeerStatus.PrimaryRoutes) - tr, err = client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, subRouter3.MustIPv4()) + requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, []netip.Prefix{pref}) + }, propagationTime, 200*time.Millisecond, "Second failover verification: Router 1 & 2 offline, Router 3 should be new PRIMARY (last router standing) with routes") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 3 after second failover") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := subRouter3.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for subRouter3") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 3 after second failover") + + // Validate primary routes table state - router 3 is now primary after router 2 failure + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + // Routers 1 and 2 are disconnected, so not in AvailableRoutes + types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref}, + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()), + }, + }, "Router 3 should be primary after router 2 failure") + + checkFailureAndPrintRoutes(t, client) // Bring up subnet router 1, making the route available from there. - t.Logf("bringing up subnet router r1 (%s)", subRouter1.Hostname()) - t.Logf("expecting r1 (%s) to take over as primary, r1 and r3 available", subRouter1.Hostname()) + t.Logf("=== RECOVERY TEST: Bringing router 1 (%s) back online ===", subRouter1.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current state: Router 1 OFFLINE, Router 2 OFFLINE, Router 3 PRIMARY (only router)") + t.Logf(" Action: Starting router 1 to restore HA capability") + t.Logf(" Expected: Router 3 remains PRIMARY (stability - no unnecessary failover)") + t.Logf(" Expected: Router 1 becomes STANDBY (ready for HA)") + t.Logf(" Expected: HA is restored with 2 routers available") err = subRouter1.Up() require.NoError(t, err) @@ -590,36 +865,73 @@ func TestHASubnetRouterFailover(t *testing.T) { srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] - assert.True(c, srs1PeerStatus.Online, "r1 should be back online") - assert.False(c, srs2PeerStatus.Online, "r2 should still be offline") - assert.True(c, srs3PeerStatus.Online, "r3 should still be online") - }, 5*time.Second, 200*time.Millisecond, "router status should update after r1 comes back up") + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist") + assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist") - assert.Nil(t, srs1PeerStatus.PrimaryRoutes) - assert.Nil(t, srs2PeerStatus.PrimaryRoutes) - require.NotNil(t, srs3PeerStatus.PrimaryRoutes) + if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil { + return + } - requirePeerSubnetRoutes(t, srs1PeerStatus, nil) - requirePeerSubnetRoutes(t, srs2PeerStatus, nil) - requirePeerSubnetRoutes(t, srs3PeerStatus, []netip.Prefix{pref}) + assert.True(c, srs1PeerStatus.Online, "Router 1 should be back online as STANDBY") + assert.False(c, srs2PeerStatus.Online, "Router 2 should still be offline") + assert.True(c, srs3PeerStatus.Online, "Router 3 should remain online as PRIMARY") - assert.Contains( - t, - srs3PeerStatus.PrimaryRoutes.AsSlice(), - pref, - ) + assert.Nil(c, srs1PeerStatus.PrimaryRoutes) + assert.Nil(c, srs2PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs3PeerStatus.PrimaryRoutes) - result, err = client.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, []netip.Prefix{pref}) - tr, err = client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, subRouter3.MustIPv4()) + if srs3PeerStatus.PrimaryRoutes != nil { + assert.Contains(c, + srs3PeerStatus.PrimaryRoutes.AsSlice(), + pref, + ) + } + }, propagationTime, 200*time.Millisecond, "Recovery verification: Router 1 back online as STANDBY, Router 3 remains PRIMARY (no flapping) with routes") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 200*time.Millisecond, "Verifying client can still reach webservice through router 3 after router 1 recovery") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := subRouter3.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for subRouter3") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, propagationTime, 200*time.Millisecond, "Verifying traceroute still goes through router 3 after router 1 recovery") + + // Validate primary routes table state - router 3 remains primary after router 1 comes back + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref}, + // Router 2 is still disconnected + types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref}, + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()), + }, + }, "Router 3 should remain primary after router 1 recovery") + + checkFailureAndPrintRoutes(t, client) // Bring up subnet router 2, should result in no change. - t.Logf("bringing up subnet router r2 (%s)", subRouter2.Hostname()) - t.Logf("all online, expecting r1 (%s) to still be primary (no flapping)", subRouter1.Hostname()) + t.Logf("=== FULL RECOVERY TEST: Bringing router 2 (%s) back online ===", subRouter2.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current state: Router 1 STANDBY, Router 2 OFFLINE, Router 3 PRIMARY") + t.Logf(" Action: Starting router 2 to restore full HA (3 routers)") + t.Logf(" Expected: Router 3 (%s) remains PRIMARY (stability - avoid unnecessary failovers)", subRouter3.Hostname()) + t.Logf(" Expected: Router 1 (%s) remains first STANDBY", subRouter1.Hostname()) + t.Logf(" Expected: Router 2 (%s) becomes second STANDBY", subRouter2.Hostname()) + t.Logf(" Expected: Full HA restored with all 3 routers online") err = subRouter2.Up() require.NoError(t, err) @@ -633,35 +945,71 @@ func TestHASubnetRouterFailover(t *testing.T) { srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] - assert.True(c, srs1PeerStatus.Online, "r1 should be online") - assert.True(c, srs2PeerStatus.Online, "r2 should be online") - assert.True(c, srs3PeerStatus.Online, "r3 should be online") - }, 10*time.Second, 500*time.Millisecond, "all routers should be online after bringing up r2") + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist") + assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist") - assert.Nil(t, srs1PeerStatus.PrimaryRoutes) - assert.Nil(t, srs2PeerStatus.PrimaryRoutes) - require.NotNil(t, srs3PeerStatus.PrimaryRoutes) + if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil { + return + } - requirePeerSubnetRoutes(t, srs1PeerStatus, nil) - requirePeerSubnetRoutes(t, srs2PeerStatus, nil) - requirePeerSubnetRoutes(t, srs3PeerStatus, []netip.Prefix{pref}) + assert.True(c, srs1PeerStatus.Online, "Router 1 should be online as STANDBY") + assert.True(c, srs2PeerStatus.Online, "Router 2 should be back online as STANDBY") + assert.True(c, srs3PeerStatus.Online, "Router 3 should remain online as PRIMARY") - assert.Contains( - t, - srs3PeerStatus.PrimaryRoutes.AsSlice(), - pref, - ) + assert.Nil(c, srs1PeerStatus.PrimaryRoutes) + assert.Nil(c, srs2PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs3PeerStatus.PrimaryRoutes) - result, err = client.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, []netip.Prefix{pref}) - tr, err = client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, subRouter3.MustIPv4()) + if srs3PeerStatus.PrimaryRoutes != nil { + assert.Contains(c, + srs3PeerStatus.PrimaryRoutes.AsSlice(), + pref, + ) + } + }, 10*time.Second, 500*time.Millisecond, "Full recovery verification: All 3 routers online, Router 3 remains PRIMARY (no flapping) with routes") - t.Logf("disabling route in subnet router r3 (%s)", subRouter3.Hostname()) - t.Logf("expecting route to failover to r1 (%s), which is still available with r2", subRouter1.Hostname()) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 3 after full recovery") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := subRouter3.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for subRouter3") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 3 after full recovery") + + // Validate primary routes table state - router 3 remains primary after all routers back online + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref}, + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()), + }, + }, "Router 3 should remain primary after full recovery") + + checkFailureAndPrintRoutes(t, client) + + t.Logf("=== ROUTE DISABLE TEST: Removing approved route from PRIMARY router 3 (%s) ===", subRouter3.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current state: Router 1 STANDBY, Router 2 STANDBY, Router 3 PRIMARY") + t.Logf(" Action: Disabling route approval on router 3 (route still advertised but not approved)") + t.Logf(" Expected: Router 1 (%s) should become new PRIMARY (lowest ID with approved route)", subRouter1.Hostname()) + t.Logf(" Expected: Router 2 (%s) remains STANDBY", subRouter2.Hostname()) + t.Logf(" Expected: Router 3 (%s) goes to advertised-only state (no longer serving)", subRouter3.Hostname()) _, err = headscale.ApproveRoutes(MustFindNode(subRouter3.Hostname(), nodes).GetId(), []netip.Prefix{}) // Wait for nodestore batch processing and route state changes to complete @@ -675,41 +1023,79 @@ func TestHASubnetRouterFailover(t *testing.T) { requireNodeRouteCountWithCollect(c, MustFindNode(subRouter1.Hostname(), nodes), 1, 1, 1) requireNodeRouteCountWithCollect(c, MustFindNode(subRouter2.Hostname(), nodes), 1, 1, 0) requireNodeRouteCountWithCollect(c, MustFindNode(subRouter3.Hostname(), nodes), 1, 0, 0) - }, 10*time.Second, 500*time.Millisecond, "route should failover to r1 after disabling r3") + }, 10*time.Second, 500*time.Millisecond, "Route disable verification: Router 3 route disabled, Router 1 should be new PRIMARY, Router 2 STANDBY") // Verify that the route is announced from subnet router 1 - clientStatus, err = client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + clientStatus, err = client.Status() + assert.NoError(c, err) - srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] - srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] - srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] + srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] + srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] + srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] - require.NotNil(t, srs1PeerStatus.PrimaryRoutes) - assert.Nil(t, srs2PeerStatus.PrimaryRoutes) - assert.Nil(t, srs3PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist") + assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist") - requirePeerSubnetRoutes(t, srs1PeerStatus, []netip.Prefix{pref}) - requirePeerSubnetRoutes(t, srs2PeerStatus, nil) - requirePeerSubnetRoutes(t, srs3PeerStatus, nil) + if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil { + return + } - assert.Contains( - t, - srs1PeerStatus.PrimaryRoutes.AsSlice(), - pref, - ) + assert.NotNil(c, srs1PeerStatus.PrimaryRoutes) + assert.Nil(c, srs2PeerStatus.PrimaryRoutes) + assert.Nil(c, srs3PeerStatus.PrimaryRoutes) - result, err = client.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{pref}) + requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil) - tr, err = client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, subRouter1.MustIPv4()) + if srs1PeerStatus.PrimaryRoutes != nil { + assert.Contains(c, + srs1PeerStatus.PrimaryRoutes.AsSlice(), + pref, + ) + } + }, propagationTime, 200*time.Millisecond, "Verifying Router 1 becomes PRIMARY after Router 3 route disabled") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 after route disable") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := subRouter1.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for subRouter1") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 1 after route disable") + + // Validate primary routes table state - router 1 is primary after router 3 route disabled + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref}, + // Router 3's route is no longer approved, so not in AvailableRoutes + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()), + }, + }, "Router 1 should be primary after router 3 route disabled") + + checkFailureAndPrintRoutes(t, client) // Disable the route of subnet router 1, making it failover to 2 - t.Logf("disabling route in subnet router r1 (%s)", subRouter1.Hostname()) - t.Logf("expecting route to failover to r2 (%s)", subRouter2.Hostname()) + t.Logf("=== ROUTE DISABLE TEST: Removing approved route from NEW PRIMARY router 1 (%s) ===", subRouter1.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current state: Router 1 PRIMARY, Router 2 STANDBY, Router 3 advertised-only") + t.Logf(" Action: Disabling route approval on router 1") + t.Logf(" Expected: Router 2 (%s) should become new PRIMARY (only remaining approved route)", subRouter2.Hostname()) + t.Logf(" Expected: Router 1 (%s) goes to advertised-only state", subRouter1.Hostname()) + t.Logf(" Expected: Router 3 (%s) remains advertised-only", subRouter3.Hostname()) _, err = headscale.ApproveRoutes(MustFindNode(subRouter1.Hostname(), nodes).GetId(), []netip.Prefix{}) // Wait for nodestore batch processing and route state changes to complete @@ -723,41 +1109,79 @@ func TestHASubnetRouterFailover(t *testing.T) { requireNodeRouteCountWithCollect(c, MustFindNode(subRouter1.Hostname(), nodes), 1, 0, 0) requireNodeRouteCountWithCollect(c, MustFindNode(subRouter2.Hostname(), nodes), 1, 1, 1) requireNodeRouteCountWithCollect(c, MustFindNode(subRouter3.Hostname(), nodes), 1, 0, 0) - }, 10*time.Second, 500*time.Millisecond, "route should failover to r2 after disabling r1") + }, 10*time.Second, 500*time.Millisecond, "Second route disable verification: Router 1 route disabled, Router 2 should be new PRIMARY") // Verify that the route is announced from subnet router 1 - clientStatus, err = client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + clientStatus, err = client.Status() + assert.NoError(c, err) - srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] - srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] - srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] + srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] + srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] + srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] - assert.Nil(t, srs1PeerStatus.PrimaryRoutes) - require.NotNil(t, srs2PeerStatus.PrimaryRoutes) - assert.Nil(t, srs3PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist") + assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist") - requirePeerSubnetRoutes(t, srs1PeerStatus, nil) - requirePeerSubnetRoutes(t, srs2PeerStatus, []netip.Prefix{pref}) - requirePeerSubnetRoutes(t, srs3PeerStatus, nil) + if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil { + return + } - assert.Contains( - t, - srs2PeerStatus.PrimaryRoutes.AsSlice(), - pref, - ) + assert.Nil(c, srs1PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs2PeerStatus.PrimaryRoutes) + assert.Nil(c, srs3PeerStatus.PrimaryRoutes) - result, err = client.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, nil) + requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, []netip.Prefix{pref}) + requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil) - tr, err = client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, subRouter2.MustIPv4()) + if srs2PeerStatus.PrimaryRoutes != nil { + assert.Contains(c, + srs2PeerStatus.PrimaryRoutes.AsSlice(), + pref, + ) + } + }, propagationTime, 200*time.Millisecond, "Verifying Router 2 becomes PRIMARY after Router 1 route disabled") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after second route disable") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := subRouter2.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for subRouter2") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 2 after second route disable") + + // Validate primary routes table state - router 2 is primary after router 1 route disabled + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + // Router 1's route is no longer approved, so not in AvailableRoutes + types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref}, + // Router 3's route is still not approved + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()), + }, + }, "Router 2 should be primary after router 1 route disabled") + + checkFailureAndPrintRoutes(t, client) // enable the route of subnet router 1, no change expected - t.Logf("enabling route in subnet router 1 (%s)", subRouter1.Hostname()) - t.Logf("both online, expecting r2 (%s) to still be primary (no flapping)", subRouter2.Hostname()) + t.Logf("=== ROUTE RE-ENABLE TEST: Re-approving route on router 1 (%s) ===", subRouter1.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current state: Router 1 advertised-only, Router 2 PRIMARY, Router 3 advertised-only") + t.Logf(" Action: Re-enabling route approval on router 1") + t.Logf(" Expected: Router 2 (%s) remains PRIMARY (stability - no unnecessary flapping)", subRouter2.Hostname()) + t.Logf(" Expected: Router 1 (%s) becomes STANDBY (approved but not primary)", subRouter1.Hostname()) + t.Logf(" Expected: HA fully restored with Router 2 PRIMARY and Router 1 STANDBY") r1Node := MustFindNode(subRouter1.Hostname(), nodes) _, err = headscale.ApproveRoutes( r1Node.GetId(), @@ -773,33 +1197,107 @@ func TestHASubnetRouterFailover(t *testing.T) { requireNodeRouteCountWithCollect(c, MustFindNode(subRouter1.Hostname(), nodes), 1, 1, 0) requireNodeRouteCountWithCollect(c, MustFindNode(subRouter2.Hostname(), nodes), 1, 1, 1) requireNodeRouteCountWithCollect(c, MustFindNode(subRouter3.Hostname(), nodes), 1, 0, 0) - }, 5*time.Second, 200*time.Millisecond, "route state should stabilize after re-enabling r1, expecting r2 to still be primary to avoid flapping") + }, propagationTime, 200*time.Millisecond, "Re-enable verification: Router 1 approved as STANDBY, Router 2 remains PRIMARY (no flapping), full HA restored") // Verify that the route is announced from subnet router 1 - clientStatus, err = client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + clientStatus, err = client.Status() + assert.NoError(c, err) - srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] - srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] - srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] + srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] + srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] + srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey] - assert.Nil(t, srs1PeerStatus.PrimaryRoutes) - require.NotNil(t, srs2PeerStatus.PrimaryRoutes) - assert.Nil(t, srs3PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist") + assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist") - assert.Contains( - t, - srs2PeerStatus.PrimaryRoutes.AsSlice(), - pref, + if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil { + return + } + + assert.Nil(c, srs1PeerStatus.PrimaryRoutes) + assert.NotNil(c, srs2PeerStatus.PrimaryRoutes) + assert.Nil(c, srs3PeerStatus.PrimaryRoutes) + + if srs2PeerStatus.PrimaryRoutes != nil { + assert.Contains(c, + srs2PeerStatus.PrimaryRoutes.AsSlice(), + pref, + ) + } + }, propagationTime, 200*time.Millisecond, "Verifying Router 2 remains PRIMARY after Router 1 route re-enabled") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after route re-enable") + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := subRouter2.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for subRouter2") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, propagationTime, 200*time.Millisecond, "Verifying traceroute still goes through router 2 after route re-enable") + + // Validate primary routes table state after router 1 re-approval + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref}, + // Router 3 route is still not approved + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()), + }, + }, "Router 2 should remain primary after router 1 re-approval") + + checkFailureAndPrintRoutes(t, client) + + // Enable route on node 3, we now have all routes re-enabled + t.Logf("=== ROUTE RE-ENABLE TEST: Re-approving route on router 3 (%s) - Full HA Restoration ===", subRouter3.Hostname()) + t.Logf("[%s] Starting test section", time.Now().Format(timestampFormat)) + t.Logf(" Current state: Router 1 STANDBY, Router 2 PRIMARY, Router 3 advertised-only") + t.Logf(" Action: Re-enabling route approval on router 3") + t.Logf(" Expected: Router 2 (%s) remains PRIMARY (stability preferred)", subRouter2.Hostname()) + t.Logf(" Expected: Routers 1 & 3 are both STANDBY") + t.Logf(" Expected: Full HA restored with all 3 routers available") + r3Node := MustFindNode(subRouter3.Hostname(), nodes) + _, err = headscale.ApproveRoutes( + r3Node.GetId(), + util.MustStringsToPrefixes(r3Node.GetAvailableRoutes()), ) - result, err = client.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + // Wait for route state changes after re-enabling r3 + assert.EventuallyWithT(t, func(c *assert.CollectT) { + nodes, err = headscale.ListNodes() + assert.NoError(c, err) + assert.Len(c, nodes, 6) + require.GreaterOrEqual(t, len(nodes), 3, "need at least 3 nodes to avoid panic") + // After router 3 re-approval: Router 2 remains PRIMARY, Routers 1&3 are STANDBY + // SubnetRoutes should only show routes for PRIMARY node (actively serving) + requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 0) // Router 1: STANDBY (available, approved, but not serving) + requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 1) // Router 2: PRIMARY (available, approved, and serving) + requireNodeRouteCountWithCollect(c, nodes[2], 1, 1, 0) // Router 3: STANDBY (available, approved, but not serving) + }, propagationTime, 200*time.Millisecond, "Waiting for route state after router 3 re-approval") - tr, err = client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, subRouter2.MustIPv4()) + // Validate primary routes table state after router 3 re-approval + validatePrimaryRoutes(t, headscale, &routes.DebugRoutes{ + AvailableRoutes: map[types.NodeID][]netip.Prefix{ + types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref}, + types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref}, + }, + PrimaryRoutes: map[string]types.NodeID{ + pref.String(): types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()), + }, + }, "Router 2 should remain primary after router 3 re-approval") + + checkFailureAndPrintRoutes(t, client) } // TestSubnetRouteACL verifies that Subnet routes are distributed @@ -880,42 +1378,69 @@ func TestSubnetRouteACL(t *testing.T) { client := allClients[1] for _, client := range allClients { - status, err := client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - if route, ok := expectedRoutes[string(status.Self.ID)]; ok { - command := []string{ - "tailscale", - "set", - "--advertise-routes=" + route, + if route, ok := expectedRoutes[string(status.Self.ID)]; ok { + command := []string{ + "tailscale", + "set", + "--advertise-routes=" + route, + } + _, _, err = client.Execute(command) + assert.NoErrorf(c, err, "failed to advertise route: %s", err) } - _, _, err = client.Execute(command) - require.NoErrorf(t, err, "failed to advertise route: %s", err) - } + }, 5*time.Second, 200*time.Millisecond, "Configuring route advertisements") } err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - nodes, err := headscale.ListNodes() - require.NoError(t, err) - require.Len(t, nodes, 2) + // Wait for route advertisements to propagate to the server + var nodes []*v1.Node + require.EventuallyWithT(t, func(c *assert.CollectT) { + var err error + nodes, err = headscale.ListNodes() + assert.NoError(c, err) + assert.Len(c, nodes, 2) - requireNodeRouteCount(t, nodes[0], 1, 0, 0) - requireNodeRouteCount(t, nodes[1], 0, 0, 0) + // Find the node that should have the route by checking node IDs + var routeNode *v1.Node + var otherNode *v1.Node + for _, node := range nodes { + nodeIDStr := strconv.FormatUint(node.GetId(), 10) + if _, shouldHaveRoute := expectedRoutes[nodeIDStr]; shouldHaveRoute { + routeNode = node + } else { + otherNode = node + } + } + + assert.NotNil(c, routeNode, "could not find node that should have route") + assert.NotNil(c, otherNode, "could not find node that should not have route") + + // After NodeStore fix: routes are properly tracked in route manager + // This test uses a policy with NO auto-approvers, so routes should be: + // announced=1, approved=0, subnet=0 (routes announced but not approved) + requireNodeRouteCountWithCollect(c, routeNode, 1, 0, 0) + requireNodeRouteCountWithCollect(c, otherNode, 0, 0, 0) + }, 10*time.Second, 100*time.Millisecond, "route advertisements should propagate to server") // Verify that no routes has been sent to the client, // they are not yet enabled. for _, client := range allClients { - status, err := client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - assert.Nil(t, peerStatus.PrimaryRoutes) - requirePeerSubnetRoutes(t, peerStatus, nil) - } + assert.Nil(c, peerStatus.PrimaryRoutes) + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } + }, 5*time.Second, 200*time.Millisecond, "Verifying no routes are active before approval") } _, err = headscale.ApproveRoutes( @@ -935,14 +1460,22 @@ func TestSubnetRouteACL(t *testing.T) { }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate to nodes") // Verify that the client has routes from the primary machine - srs1, _ := subRouter1.Status() + assert.EventuallyWithT(t, func(c *assert.CollectT) { + srs1, err := subRouter1.Status() + assert.NoError(c, err) - clientStatus, err := client.Status() - require.NoError(t, err) + clientStatus, err := client.Status() + assert.NoError(c, err) - srs1PeerStatus := clientStatus.Peer[srs1.Self.PublicKey] + srs1PeerStatus := clientStatus.Peer[srs1.Self.PublicKey] - requirePeerSubnetRoutes(t, srs1PeerStatus, []netip.Prefix{netip.MustParsePrefix(expectedRoutes["1"])}) + assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist") + if srs1PeerStatus == nil { + return + } + + requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{netip.MustParsePrefix(expectedRoutes["1"])}) + }, 5*time.Second, 200*time.Millisecond, "Verifying client can see subnet routes from router") clientNm, err := client.Netmap() require.NoError(t, err) @@ -1071,14 +1604,16 @@ func TestEnablingExitRoutes(t *testing.T) { // Verify that no routes has been sent to the client, // they are not yet enabled. for _, client := range allClients { - status, err := client.Status() - assertNoErr(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - assert.Nil(t, peerStatus.PrimaryRoutes) - } + assert.Nil(c, peerStatus.PrimaryRoutes) + } + }, 5*time.Second, 200*time.Millisecond, "Verifying no exit routes are active before approval") } // Enable all routes, but do v4 on one and v6 on other to ensure they @@ -1094,12 +1629,15 @@ func TestEnablingExitRoutes(t *testing.T) { ) require.NoError(t, err) - nodes, err = headscale.ListNodes() - require.NoError(t, err) - require.Len(t, nodes, 2) + // Wait for route state changes to propagate + assert.EventuallyWithT(t, func(c *assert.CollectT) { + nodes, err = headscale.ListNodes() + assert.NoError(c, err) + assert.Len(c, nodes, 2) - requireNodeRouteCount(t, nodes[0], 2, 2, 2) - requireNodeRouteCount(t, nodes[1], 2, 2, 2) + requireNodeRouteCountWithCollect(c, nodes[0], 2, 2, 2) + requireNodeRouteCountWithCollect(c, nodes[1], 2, 2, 2) + }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate to both nodes") // Wait for route state changes to propagate to clients assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1112,9 +1650,11 @@ func TestEnablingExitRoutes(t *testing.T) { peerStatus := status.Peer[peerKey] assert.NotNil(c, peerStatus.AllowedIPs) - assert.Len(c, peerStatus.AllowedIPs.AsSlice(), 4) - assert.Contains(c, peerStatus.AllowedIPs.AsSlice(), tsaddr.AllIPv4()) - assert.Contains(c, peerStatus.AllowedIPs.AsSlice(), tsaddr.AllIPv6()) + if peerStatus.AllowedIPs != nil { + assert.Len(c, peerStatus.AllowedIPs.AsSlice(), 4) + assert.Contains(c, peerStatus.AllowedIPs.AsSlice(), tsaddr.AllIPv4()) + assert.Contains(c, peerStatus.AllowedIPs.AsSlice(), tsaddr.AllIPv6()) + } } } }, 10*time.Second, 500*time.Millisecond, "clients should see new routes") @@ -1186,22 +1726,29 @@ func TestSubnetRouterMultiNetwork(t *testing.T) { _, _, err = user1c.Execute(command) require.NoErrorf(t, err, "failed to advertise route: %s", err) - nodes, err := headscale.ListNodes() - require.NoError(t, err) - assert.Len(t, nodes, 2) - requireNodeRouteCount(t, nodes[0], 1, 0, 0) + var nodes []*v1.Node + // Wait for route advertisements to propagate to NodeStore + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + var err error + nodes, err = headscale.ListNodes() + assert.NoError(ct, err) + assert.Len(ct, nodes, 2) + requireNodeRouteCountWithCollect(ct, nodes[0], 1, 0, 0) + }, 10*time.Second, 100*time.Millisecond, "route advertisements should propagate") // Verify that no routes has been sent to the client, // they are not yet enabled. - status, err := user1c.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := user1c.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - assert.Nil(t, peerStatus.PrimaryRoutes) - requirePeerSubnetRoutes(t, peerStatus, nil) - } + assert.Nil(c, peerStatus.PrimaryRoutes) + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } + }, 5*time.Second, 200*time.Millisecond, "Verifying no routes are active before approval") // Enable route _, err = headscale.ApproveRoutes( @@ -1210,24 +1757,29 @@ func TestSubnetRouterMultiNetwork(t *testing.T) { ) require.NoError(t, err) - // Wait for route state changes to propagate to nodes and clients + // Wait for route state changes to propagate to nodes assert.EventuallyWithT(t, func(c *assert.CollectT) { + var err error nodes, err = headscale.ListNodes() assert.NoError(c, err) assert.Len(c, nodes, 2) requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1) + }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate to nodes") - // Verify that the routes have been sent to the client - status, err = user2c.Status() + // Verify that the routes have been sent to the client + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := user2c.Status() assert.NoError(c, err) for _, peerKey := range status.Peers() { peerStatus := status.Peer[peerKey] - assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *pref) + if peerStatus.PrimaryRoutes != nil { + assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *pref) + } requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*pref}) } - }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate to nodes and clients") + }, 10*time.Second, 500*time.Millisecond, "routes should be visible to client") usernet1, err := scenario.Network("usernet1") require.NoError(t, err) @@ -1242,13 +1794,21 @@ func TestSubnetRouterMultiNetwork(t *testing.T) { url := fmt.Sprintf("http://%s/etc/hostname", webip) t.Logf("url from %s to %s", user2c.Hostname(), url) - result, err := user2c.Curl(url) - require.NoError(t, err) - assert.Len(t, result, 13) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := user2c.Curl(url) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, 5*time.Second, 200*time.Millisecond, "Verifying client can reach webservice through subnet route") - tr, err := user2c.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, user1c.MustIPv4()) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := user2c.Traceroute(webip) + assert.NoError(c, err) + ip, err := user1c.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for user1c") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, 5*time.Second, 200*time.Millisecond, "Verifying traceroute goes through subnet router") } func TestSubnetRouterMultiNetworkExitNode(t *testing.T) { @@ -1310,36 +1870,45 @@ func TestSubnetRouterMultiNetworkExitNode(t *testing.T) { _, _, err = user1c.Execute(command) require.NoErrorf(t, err, "failed to advertise route: %s", err) - nodes, err := headscale.ListNodes() - require.NoError(t, err) - assert.Len(t, nodes, 2) - requireNodeRouteCount(t, nodes[0], 2, 0, 0) + var nodes []*v1.Node + // Wait for route advertisements to propagate to NodeStore + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + var err error + nodes, err = headscale.ListNodes() + assert.NoError(ct, err) + assert.Len(ct, nodes, 2) + requireNodeRouteCountWithCollect(ct, nodes[0], 2, 0, 0) + }, 10*time.Second, 100*time.Millisecond, "route advertisements should propagate") // Verify that no routes has been sent to the client, // they are not yet enabled. - status, err := user1c.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := user1c.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - assert.Nil(t, peerStatus.PrimaryRoutes) - requirePeerSubnetRoutes(t, peerStatus, nil) - } + assert.Nil(c, peerStatus.PrimaryRoutes) + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } + }, 5*time.Second, 200*time.Millisecond, "Verifying no routes sent to client before approval") // Enable route _, err = headscale.ApproveRoutes(nodes[0].GetId(), []netip.Prefix{tsaddr.AllIPv4()}) require.NoError(t, err) - // Wait for route state changes to propagate to nodes and clients + // Wait for route state changes to propagate to nodes assert.EventuallyWithT(t, func(c *assert.CollectT) { nodes, err = headscale.ListNodes() assert.NoError(c, err) assert.Len(c, nodes, 2) requireNodeRouteCountWithCollect(c, nodes[0], 2, 2, 2) + }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate to nodes") - // Verify that the routes have been sent to the client - status, err = user2c.Status() + // Verify that the routes have been sent to the client + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := user2c.Status() assert.NoError(c, err) for _, peerKey := range status.Peers() { @@ -1347,7 +1916,7 @@ func TestSubnetRouterMultiNetworkExitNode(t *testing.T) { requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{tsaddr.AllIPv4(), tsaddr.AllIPv6()}) } - }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate to nodes and clients") + }, 10*time.Second, 500*time.Millisecond, "routes should be visible to client") // Tell user2c to use user1c as an exit node. command = []string{ @@ -1699,7 +2268,8 @@ func TestAutoApproveMultiNetwork(t *testing.T) { assertNoErrGetHeadscale(t, err) assert.NotNil(t, headscale) - // Set the route of usernet1 to be autoapproved + // Add the Docker network route to the auto-approvers + // Keep existing auto-approvers (like bigRoute) in place var approvers policyv2.AutoApprovers switch { case strings.HasPrefix(tt.approver, "tag:"): @@ -1794,75 +2364,130 @@ func TestAutoApproveMultiNetwork(t *testing.T) { // for all counts. nodes, err := headscale.ListNodes() assert.NoError(c, err) - requireNodeRouteCountWithCollect(c, MustFindNode(routerUsernet1.Hostname(), nodes), 1, 1, 1) - }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate") + + routerNode := MustFindNode(routerUsernet1.Hostname(), nodes) + t.Logf("Initial auto-approval check - Router node %s: announced=%v, approved=%v, subnet=%v", + routerNode.GetName(), + routerNode.GetAvailableRoutes(), + routerNode.GetApprovedRoutes(), + routerNode.GetSubnetRoutes()) + + requireNodeRouteCountWithCollect(c, routerNode, 1, 1, 1) + }, 10*time.Second, 500*time.Millisecond, "Initial route auto-approval: Route should be approved via policy") // Verify that the routes have been sent to the client. - status, err := client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + // Debug output to understand peer visibility + t.Logf("Client %s sees %d peers", client.Hostname(), len(status.Peers())) - if peerStatus.ID == routerUsernet1ID.StableID() { - assert.Contains(t, peerStatus.PrimaryRoutes.AsSlice(), *route) - requirePeerSubnetRoutes(t, peerStatus, []netip.Prefix{*route}) - } else { - requirePeerSubnetRoutes(t, peerStatus, nil) + routerPeerFound := false + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] + + if peerStatus.ID == routerUsernet1ID.StableID() { + routerPeerFound = true + t.Logf("Client sees router peer %s (ID=%s): AllowedIPs=%v, PrimaryRoutes=%v", + peerStatus.HostName, + peerStatus.ID, + peerStatus.AllowedIPs, + peerStatus.PrimaryRoutes) + + assert.NotNil(c, peerStatus.PrimaryRoutes) + if peerStatus.PrimaryRoutes != nil { + assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route) + } + requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route}) + } else { + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } } - } + + assert.True(c, routerPeerFound, "Client should see the router peer") + }, 5*time.Second, 200*time.Millisecond, "Verifying routes sent to client after auto-approval") url := fmt.Sprintf("http://%s/etc/hostname", webip) t.Logf("url from %s to %s", client.Hostname(), url) - result, err := client.Curl(url) - require.NoError(t, err) - assert.Len(t, result, 13) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(url) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, 5*time.Second, 200*time.Millisecond, "Verifying client can reach webservice through auto-approved route") - tr, err := client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, routerUsernet1.MustIPv4()) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := routerUsernet1.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for routerUsernet1") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, 5*time.Second, 200*time.Millisecond, "Verifying traceroute goes through auto-approved router") // Remove the auto approval from the policy, any routes already enabled should be allowed. prefix = *route delete(tt.pol.AutoApprovers.Routes, prefix) err = headscale.SetPolicy(tt.pol) require.NoError(t, err) + t.Logf("Policy updated: removed auto-approver for route %s", prefix) // Wait for route state changes to propagate assert.EventuallyWithT(t, func(c *assert.CollectT) { - // These route should auto approve, so the node is expected to have a route - // for all counts. + // Routes already approved should remain approved even after policy change nodes, err = headscale.ListNodes() assert.NoError(c, err) - requireNodeRouteCountWithCollect(c, MustFindNode(routerUsernet1.Hostname(), nodes), 1, 1, 1) - }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate") + + routerNode := MustFindNode(routerUsernet1.Hostname(), nodes) + t.Logf("After policy removal - Router node %s: announced=%v, approved=%v, subnet=%v", + routerNode.GetName(), + routerNode.GetAvailableRoutes(), + routerNode.GetApprovedRoutes(), + routerNode.GetSubnetRoutes()) + + requireNodeRouteCountWithCollect(c, routerNode, 1, 1, 1) + }, 10*time.Second, 500*time.Millisecond, "Routes should remain approved after auto-approver removal") // Verify that the routes have been sent to the client. - status, err = client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - if peerStatus.ID == routerUsernet1ID.StableID() { - assert.Contains(t, peerStatus.PrimaryRoutes.AsSlice(), *route) - requirePeerSubnetRoutes(t, peerStatus, []netip.Prefix{*route}) - } else { - requirePeerSubnetRoutes(t, peerStatus, nil) + if peerStatus.ID == routerUsernet1ID.StableID() { + assert.NotNil(c, peerStatus.PrimaryRoutes) + if peerStatus.PrimaryRoutes != nil { + assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route) + } + requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route}) + } else { + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } } - } + }, 5*time.Second, 200*time.Millisecond, "Verifying routes remain after policy change") url = fmt.Sprintf("http://%s/etc/hostname", webip) t.Logf("url from %s to %s", client.Hostname(), url) - result, err = client.Curl(url) - require.NoError(t, err) - assert.Len(t, result, 13) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(url) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, 5*time.Second, 200*time.Millisecond, "Verifying client can still reach webservice after policy change") - tr, err = client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, routerUsernet1.MustIPv4()) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := routerUsernet1.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for routerUsernet1") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, 5*time.Second, 200*time.Millisecond, "Verifying traceroute still goes through router after policy change") // Disable the route, making it unavailable since it is no longer auto-approved _, err = headscale.ApproveRoutes( @@ -1881,13 +2506,15 @@ func TestAutoApproveMultiNetwork(t *testing.T) { }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate") // Verify that the routes have been sent to the client. - status, err = client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] - requirePeerSubnetRoutes(t, peerStatus, nil) - } + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } + }, 5*time.Second, 200*time.Millisecond, "Verifying routes disabled after route removal") // Add the route back to the auto approver in the policy, the route should // now become available again. @@ -1918,31 +2545,43 @@ func TestAutoApproveMultiNetwork(t *testing.T) { }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate") // Verify that the routes have been sent to the client. - status, err = client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - if peerStatus.ID == routerUsernet1ID.StableID() { - require.NotNil(t, peerStatus.PrimaryRoutes) - assert.Contains(t, peerStatus.PrimaryRoutes.AsSlice(), *route) - requirePeerSubnetRoutes(t, peerStatus, []netip.Prefix{*route}) - } else { - requirePeerSubnetRoutes(t, peerStatus, nil) + if peerStatus.ID == routerUsernet1ID.StableID() { + assert.NotNil(c, peerStatus.PrimaryRoutes) + if peerStatus.PrimaryRoutes != nil { + assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route) + } + requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route}) + } else { + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } } - } + }, 5*time.Second, 200*time.Millisecond, "Verifying routes re-enabled after policy re-approval") url = fmt.Sprintf("http://%s/etc/hostname", webip) t.Logf("url from %s to %s", client.Hostname(), url) - result, err = client.Curl(url) - require.NoError(t, err) - assert.Len(t, result, 13) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := client.Curl(url) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, 5*time.Second, 200*time.Millisecond, "Verifying client can reach webservice after route re-approval") - tr, err = client.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, routerUsernet1.MustIPv4()) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := client.Traceroute(webip) + assert.NoError(c, err) + ip, err := routerUsernet1.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for routerUsernet1") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, 5*time.Second, 200*time.Millisecond, "Verifying traceroute goes through router after re-approval") // Advertise and validate a subnet of an auto approved route, /24 inside the // auto approved /16. @@ -1961,26 +2600,32 @@ func TestAutoApproveMultiNetwork(t *testing.T) { nodes, err = headscale.ListNodes() assert.NoError(c, err) requireNodeRouteCountWithCollect(c, MustFindNode(routerUsernet1.Hostname(), nodes), 1, 1, 1) + requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 1) }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate") - requireNodeRouteCount(t, nodes[1], 1, 1, 1) // Verify that the routes have been sent to the client. - status, err = client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - if peerStatus.ID == routerUsernet1ID.StableID() { - assert.Contains(t, peerStatus.PrimaryRoutes.AsSlice(), *route) - requirePeerSubnetRoutes(t, peerStatus, []netip.Prefix{*route}) - } else if peerStatus.ID == "2" { - assert.Contains(t, peerStatus.PrimaryRoutes.AsSlice(), subRoute) - requirePeerSubnetRoutes(t, peerStatus, []netip.Prefix{subRoute}) - } else { - requirePeerSubnetRoutes(t, peerStatus, nil) + if peerStatus.ID == routerUsernet1ID.StableID() { + if peerStatus.PrimaryRoutes != nil { + assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route) + } + requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route}) + } else if peerStatus.ID == "2" { + if peerStatus.PrimaryRoutes != nil { + assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), subRoute) + } + requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{subRoute}) + } else { + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } } - } + }, 5*time.Second, 200*time.Millisecond, "Verifying sub-route propagated to client") // Advertise a not approved route will not end up anywhere command = []string{ @@ -1998,24 +2643,29 @@ func TestAutoApproveMultiNetwork(t *testing.T) { nodes, err = headscale.ListNodes() assert.NoError(c, err) requireNodeRouteCountWithCollect(c, MustFindNode(routerUsernet1.Hostname(), nodes), 1, 1, 1) + requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0) + requireNodeRouteCountWithCollect(c, nodes[2], 0, 0, 0) }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate") - requireNodeRouteCount(t, nodes[1], 1, 1, 0) - requireNodeRouteCount(t, nodes[2], 0, 0, 0) // Verify that the routes have been sent to the client. - status, err = client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - if peerStatus.ID == routerUsernet1ID.StableID() { - assert.Contains(t, peerStatus.PrimaryRoutes.AsSlice(), *route) - requirePeerSubnetRoutes(t, peerStatus, []netip.Prefix{*route}) - } else { - requirePeerSubnetRoutes(t, peerStatus, nil) + if peerStatus.ID == routerUsernet1ID.StableID() { + assert.NotNil(c, peerStatus.PrimaryRoutes) + if peerStatus.PrimaryRoutes != nil { + assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route) + } + requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route}) + } else { + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } } - } + }, 5*time.Second, 200*time.Millisecond, "Verifying unapproved route not propagated") // Exit routes are also automatically approved command = []string{ @@ -2036,21 +2686,25 @@ func TestAutoApproveMultiNetwork(t *testing.T) { }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate") // Verify that the routes have been sent to the client. - status, err = client.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + status, err := client.Status() + assert.NoError(c, err) - for _, peerKey := range status.Peers() { - peerStatus := status.Peer[peerKey] + for _, peerKey := range status.Peers() { + peerStatus := status.Peer[peerKey] - if peerStatus.ID == routerUsernet1ID.StableID() { - assert.Contains(t, peerStatus.PrimaryRoutes.AsSlice(), *route) - requirePeerSubnetRoutes(t, peerStatus, []netip.Prefix{*route}) - } else if peerStatus.ID == "3" { - requirePeerSubnetRoutes(t, peerStatus, []netip.Prefix{tsaddr.AllIPv4(), tsaddr.AllIPv6()}) - } else { - requirePeerSubnetRoutes(t, peerStatus, nil) + if peerStatus.ID == routerUsernet1ID.StableID() { + if peerStatus.PrimaryRoutes != nil { + assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route) + } + requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route}) + } else if peerStatus.ID == "3" { + requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{tsaddr.AllIPv4(), tsaddr.AllIPv6()}) + } else { + requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) + } } - } + }, 5*time.Second, 200*time.Millisecond, "Verifying exit node routes propagated to client") }) } } @@ -2067,13 +2721,17 @@ func assertTracerouteViaIP(t *testing.T, tr util.Traceroute, ip netip.Addr) { require.Equal(t, tr.Route[0].IP, ip) } -// assertTracerouteViaIPWithCollect is a version of assertTracerouteViaIP that works with assert.CollectT +// assertTracerouteViaIPWithCollect is a version of assertTracerouteViaIP that works with assert.CollectT. func assertTracerouteViaIPWithCollect(c *assert.CollectT, tr util.Traceroute, ip netip.Addr) { assert.NotNil(c, tr) assert.True(c, tr.Success) assert.NoError(c, tr.Err) assert.NotEmpty(c, tr.Route) - assert.Equal(c, tr.Route[0].IP, ip) + // Since we're inside EventuallyWithT, we can't use require.Greater with t + // but assert.NotEmpty above ensures len(tr.Route) > 0 + if len(tr.Route) > 0 { + assert.Equal(c, tr.Route[0].IP.String(), ip.String()) + } } // requirePeerSubnetRoutes asserts that the peer has the expected subnet routes. @@ -2100,6 +2758,33 @@ func requirePeerSubnetRoutes(t *testing.T, status *ipnstate.PeerStatus, expected } } +func SortPeerStatus(a, b *ipnstate.PeerStatus) int { + return cmp.Compare(a.ID, b.ID) +} + +func printCurrentRouteMap(t *testing.T, routers ...*ipnstate.PeerStatus) { + t.Logf("== Current routing map ==") + slices.SortFunc(routers, SortPeerStatus) + for _, router := range routers { + got := filterNonRoutes(router) + t.Logf(" Router %s (%s) is serving:", router.HostName, router.ID) + t.Logf(" AllowedIPs: %v", got) + if router.PrimaryRoutes != nil { + t.Logf(" PrimaryRoutes: %v", router.PrimaryRoutes.AsSlice()) + } + } +} + +// filterNonRoutes returns the list of routes that a [ipnstate.PeerStatus] is serving. +func filterNonRoutes(status *ipnstate.PeerStatus) []netip.Prefix { + return slicesx.Filter(nil, status.AllowedIPs.AsSlice(), func(p netip.Prefix) bool { + if tsaddr.IsExitRoute(p) { + return true + } + return !slices.ContainsFunc(status.TailscaleIPs, p.Contains) + }) +} + func requirePeerSubnetRoutesWithCollect(c *assert.CollectT, status *ipnstate.PeerStatus, expected []netip.Prefix) { if status.AllowedIPs.Len() <= 2 && len(expected) != 0 { assert.Fail(c, fmt.Sprintf("peer %s (%s) has no subnet routes, expected %v", status.HostName, status.ID, expected)) @@ -2110,12 +2795,7 @@ func requirePeerSubnetRoutesWithCollect(c *assert.CollectT, status *ipnstate.Pee expected = []netip.Prefix{} } - got := slicesx.Filter(nil, status.AllowedIPs.AsSlice(), func(p netip.Prefix) bool { - if tsaddr.IsExitRoute(p) { - return true - } - return !slices.ContainsFunc(status.TailscaleIPs, p.Contains) - }) + got := filterNonRoutes(status) if diff := cmpdiff.Diff(expected, got, util.PrefixComparer, cmpopts.EquateEmpty()); diff != "" { assert.Fail(c, fmt.Sprintf("peer %s (%s) subnet routes, unexpected result (-want +got):\n%s", status.HostName, status.ID, diff)) @@ -2217,27 +2897,31 @@ func TestSubnetRouteACLFiltering(t *testing.T) { ) assertNoErrHeadscaleEnv(t, err) - allClients, err := scenario.ListTailscaleClients() - assertNoErrListClients(t, err) - err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) headscale, err := scenario.Headscale() assertNoErrGetHeadscale(t, err) - // Sort clients by ID for consistent order - slices.SortFunc(allClients, func(a, b TailscaleClient) int { - return b.MustIPv4().Compare(a.MustIPv4()) - }) + // Get the router and node clients by user + routerClients, err := scenario.ListTailscaleClients(routerUser) + require.NoError(t, err) + require.Len(t, routerClients, 1) + routerClient := routerClients[0] - // Get the router and node clients - routerClient := allClients[0] - nodeClient := allClients[1] + nodeClients, err := scenario.ListTailscaleClients(nodeUser) + require.NoError(t, err) + require.Len(t, nodeClients, 1) + nodeClient := nodeClients[0] + + routerIP, err := routerClient.IPv4() + require.NoError(t, err, "failed to get router IPv4") + nodeIP, err := nodeClient.IPv4() + require.NoError(t, err, "failed to get node IPv4") aclPolicy.Hosts = policyv2.Hosts{ - policyv2.Host(routerUser): policyv2.Prefix(must.Get(routerClient.MustIPv4().Prefix(32))), - policyv2.Host(nodeUser): policyv2.Prefix(must.Get(nodeClient.MustIPv4().Prefix(32))), + policyv2.Host(routerUser): policyv2.Prefix(must.Get(routerIP.Prefix(32))), + policyv2.Host(nodeUser): policyv2.Prefix(must.Get(nodeIP.Prefix(32))), } aclPolicy.ACLs[1].Destinations = []policyv2.AliasWithPorts{ aliasWithPorts(prefixp(route.String()), tailcfg.PortRangeAny), @@ -2264,21 +2948,25 @@ func TestSubnetRouteACLFiltering(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - // List nodes and verify the router has 3 available routes - nodes, err := headscale.NodesByUser() - require.NoError(t, err) - require.Len(t, nodes, 2) + var routerNode, nodeNode *v1.Node + // Wait for route advertisements to propagate to NodeStore + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + // List nodes and verify the router has 3 available routes + nodes, err := headscale.NodesByUser() + assert.NoError(ct, err) + assert.Len(ct, nodes, 2) - // Find the router node - routerNode := nodes[routerUser][0] - nodeNode := nodes[nodeUser][0] + // Find the router node + routerNode = nodes[routerUser][0] + nodeNode = nodes[nodeUser][0] - require.NotNil(t, routerNode, "Router node not found") - require.NotNil(t, nodeNode, "Client node not found") + assert.NotNil(ct, routerNode, "Router node not found") + assert.NotNil(ct, nodeNode, "Client node not found") - // Check that the router has 3 routes available but not approved yet - requireNodeRouteCount(t, routerNode, 3, 0, 0) - requireNodeRouteCount(t, nodeNode, 0, 0, 0) + // Check that the router has 3 routes available but not approved yet + requireNodeRouteCountWithCollect(ct, routerNode, 3, 0, 0) + requireNodeRouteCountWithCollect(ct, nodeNode, 0, 0, 0) + }, 10*time.Second, 100*time.Millisecond, "route advertisements should propagate to router node") // Approve all routes for the router _, err = headscale.ApproveRoutes( @@ -2290,7 +2978,8 @@ func TestSubnetRouteACLFiltering(t *testing.T) { // Wait for route state changes to propagate assert.EventuallyWithT(t, func(c *assert.CollectT) { // List nodes and verify the router has 3 available routes - nodes, err = headscale.NodesByUser() + var err error + nodes, err := headscale.NodesByUser() assert.NoError(c, err) assert.Len(c, nodes, 2) @@ -2302,23 +2991,33 @@ func TestSubnetRouteACLFiltering(t *testing.T) { }, 10*time.Second, 500*time.Millisecond, "route state changes should propagate") // Now check the client node status - nodeStatus, err := nodeClient.Status() - require.NoError(t, err) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + nodeStatus, err := nodeClient.Status() + assert.NoError(c, err) - routerStatus, err := routerClient.Status() - require.NoError(t, err) + routerStatus, err := routerClient.Status() + assert.NoError(c, err) - // Check that the node can see the subnet routes from the router - routerPeerStatus := nodeStatus.Peer[routerStatus.Self.PublicKey] + // Check that the node can see the subnet routes from the router + routerPeerStatus := nodeStatus.Peer[routerStatus.Self.PublicKey] - // The node should only have 1 subnet route - requirePeerSubnetRoutes(t, routerPeerStatus, []netip.Prefix{*route}) + // The node should only have 1 subnet route + requirePeerSubnetRoutesWithCollect(c, routerPeerStatus, []netip.Prefix{*route}) + }, 5*time.Second, 200*time.Millisecond, "Verifying node sees filtered subnet routes") - result, err := nodeClient.Curl(weburl) - require.NoError(t, err) - assert.Len(t, result, 13) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + result, err := nodeClient.Curl(weburl) + assert.NoError(c, err) + assert.Len(c, result, 13) + }, 5*time.Second, 200*time.Millisecond, "Verifying node can reach webservice through allowed route") - tr, err := nodeClient.Traceroute(webip) - require.NoError(t, err) - assertTracerouteViaIP(t, tr, routerClient.MustIPv4()) + assert.EventuallyWithT(t, func(c *assert.CollectT) { + tr, err := nodeClient.Traceroute(webip) + assert.NoError(c, err) + ip, err := routerClient.IPv4() + if !assert.NoError(c, err, "failed to get IPv4 for routerClient") { + return + } + assertTracerouteViaIPWithCollect(c, tr, ip) + }, 5*time.Second, 200*time.Millisecond, "Verifying traceroute goes through router") } diff --git a/integration/scenario.go b/integration/scenario.go index c7facf20..8382d6a8 100644 --- a/integration/scenario.go +++ b/integration/scenario.go @@ -327,6 +327,7 @@ func (s *Scenario) ShutdownAssertNoPanics(t *testing.T) { return true }) + s.mu.Lock() for userName, user := range s.users { for _, client := range user.Clients { log.Printf("removing client %s in user %s", client.Hostname(), userName) @@ -346,6 +347,7 @@ func (s *Scenario) ShutdownAssertNoPanics(t *testing.T) { } } } + s.mu.Unlock() for _, derp := range s.derpServers { err := derp.Shutdown() @@ -429,6 +431,28 @@ func (s *Scenario) Headscale(opts ...hsic.Option) (ControlServer, error) { return headscale, nil } +// Pool returns the dockertest pool for the scenario. +func (s *Scenario) Pool() *dockertest.Pool { + return s.pool +} + +// GetOrCreateUser gets or creates a user in the scenario. +func (s *Scenario) GetOrCreateUser(userStr string) *User { + s.mu.Lock() + defer s.mu.Unlock() + + if user, ok := s.users[userStr]; ok { + return user + } + + user := &User{ + Clients: make(map[string]TailscaleClient), + } + s.users[userStr] = user + + return user +} + // CreatePreAuthKey creates a "pre authentorised key" to be created in the // Headscale instance on behalf of the Scenario. func (s *Scenario) CreatePreAuthKey( @@ -457,9 +481,11 @@ func (s *Scenario) CreateUser(user string) (*v1.User, error) { return nil, fmt.Errorf("failed to create user: %w", err) } + s.mu.Lock() s.users[user] = &User{ Clients: make(map[string]TailscaleClient), } + s.mu.Unlock() return u, nil } @@ -541,11 +567,25 @@ func (s *Scenario) CreateTailscaleNodesInUser( cert := headscale.GetCert() hostname := headscale.GetHostname() + // Determine which network this tailscale client will be in + var network *dockertest.Network + if s.userToNetwork != nil && s.userToNetwork[userStr] != nil { + network = s.userToNetwork[userStr] + } else { + network = s.networks[s.testDefaultNetwork] + } + + // Get headscale IP in this network for /etc/hosts fallback DNS + headscaleIP := headscale.GetIPInNetwork(network) + extraHosts := []string{hostname + ":" + headscaleIP} + s.mu.Lock() opts = append(opts, tsic.WithCACert(cert), tsic.WithHeadscaleName(hostname), + tsic.WithExtraHosts(extraHosts), ) + s.mu.Unlock() user.createWaitGroup.Go(func() error { @@ -673,6 +713,7 @@ func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int, timeout, ret if len(allErrors) > 0 { return multierr.New(allErrors...) } + return nil } diff --git a/integration/ssh_test.go b/integration/ssh_test.go index 3015503f..a5975eb4 100644 --- a/integration/ssh_test.go +++ b/integration/ssh_test.go @@ -409,7 +409,7 @@ func doSSHWithRetry(t *testing.T, client TailscaleClient, peer TailscaleClient, // For all other errors, assert no error to trigger retry assert.NoError(ct, err) - }, 10*time.Second, 1*time.Second) + }, 10*time.Second, 200*time.Millisecond) } else { // For failure cases, just execute once result, stderr, err = client.Execute(command) diff --git a/integration/tailscale.go b/integration/tailscale.go index cc895a81..07573e6f 100644 --- a/integration/tailscale.go +++ b/integration/tailscale.go @@ -32,6 +32,7 @@ type TailscaleClient interface { Down() error IPs() ([]netip.Addr, error) MustIPs() []netip.Addr + IPv4() (netip.Addr, error) MustIPv4() netip.Addr MustIPv6() netip.Addr FQDN() (string, error) @@ -46,6 +47,7 @@ type TailscaleClient interface { WaitForPeers(expected int, timeout, retryInterval time.Duration) error Ping(hostnameOrIP string, opts ...tsic.PingOption) error Curl(url string, opts ...tsic.CurlOption) (string, error) + CurlFailFast(url string) (string, error) Traceroute(netip.Addr) (util.Traceroute, error) ContainerID() string MustID() types.NodeID diff --git a/integration/tsic/tsic.go b/integration/tsic/tsic.go index 90b6858f..665fd670 100644 --- a/integration/tsic/tsic.go +++ b/integration/tsic/tsic.go @@ -36,8 +36,8 @@ import ( const ( tsicHashLength = 6 - defaultPingTimeout = 300 * time.Millisecond - defaultPingCount = 10 + defaultPingTimeout = 200 * time.Millisecond + defaultPingCount = 5 dockerContextPath = "../." caCertRoot = "/usr/local/share/ca-certificates" dockerExecuteTimeout = 60 * time.Second @@ -573,7 +573,7 @@ func (t *TailscaleInContainer) Down() error { // IPs returns the netip.Addr of the Tailscale instance. func (t *TailscaleInContainer) IPs() ([]netip.Addr, error) { - if t.ips != nil && len(t.ips) != 0 { + if len(t.ips) != 0 { return t.ips, nil } @@ -589,7 +589,7 @@ func (t *TailscaleInContainer) IPs() ([]netip.Addr, error) { return []netip.Addr{}, fmt.Errorf("%s failed to join tailscale client: %w", t.hostname, err) } - for _, address := range strings.Split(result, "\n") { + for address := range strings.SplitSeq(result, "\n") { address = strings.TrimSuffix(address, "\n") if len(address) < 1 { continue @@ -613,6 +613,22 @@ func (t *TailscaleInContainer) MustIPs() []netip.Addr { return ips } +// IPv4 returns the IPv4 address of the Tailscale instance. +func (t *TailscaleInContainer) IPv4() (netip.Addr, error) { + ips, err := t.IPs() + if err != nil { + return netip.Addr{}, err + } + + for _, ip := range ips { + if ip.Is4() { + return ip, nil + } + } + + return netip.Addr{}, errors.New("no IPv4 address found") +} + func (t *TailscaleInContainer) MustIPv4() netip.Addr { for _, ip := range t.MustIPs() { if ip.Is4() { @@ -984,6 +1000,7 @@ func (t *TailscaleInContainer) WaitForPeers(expected int, timeout, retryInterval expected, len(peers), )} + continue } @@ -1149,11 +1166,11 @@ func WithCurlRetry(ret int) CurlOption { } const ( - defaultConnectionTimeout = 3 * time.Second - defaultMaxTime = 10 * time.Second - defaultRetry = 5 - defaultRetryDelay = 0 * time.Second - defaultRetryMaxTime = 50 * time.Second + defaultConnectionTimeout = 1 * time.Second + defaultMaxTime = 3 * time.Second + defaultRetry = 3 + defaultRetryDelay = 200 * time.Millisecond + defaultRetryMaxTime = 5 * time.Second ) // Curl executes the Tailscale curl command and curls a hostname @@ -1198,6 +1215,17 @@ func (t *TailscaleInContainer) Curl(url string, opts ...CurlOption) (string, err return result, nil } +// CurlFailFast executes the Tailscale curl command with aggressive timeouts +// optimized for testing expected connection failures. It uses minimal timeouts +// to quickly detect blocked connections without waiting for multiple retries. +func (t *TailscaleInContainer) CurlFailFast(url string) (string, error) { + // Use aggressive timeouts for fast failure detection + return t.Curl(url, + WithCurlConnectionTimeout(1*time.Second), + WithCurlMaxTime(2*time.Second), + WithCurlRetry(1)) +} + func (t *TailscaleInContainer) Traceroute(ip netip.Addr) (util.Traceroute, error) { command := []string{ "traceroute",