From 66eda92ec03bf3077731f4f71c54ddfdb4bc09b3 Mon Sep 17 00:00:00 2001
From: Kristoffer Dalby
Date: Wed, 23 Jul 2025 16:03:58 +0200
Subject: [PATCH] integration: rework retry for waiting for node sync

Replace the dockertest pool.Retry based waiting with explicit
timeout/ticker loops. WaitForNeedsLogin, WaitForRunning and WaitForPeers
now take the timeout (and, for WaitForPeers, the retry interval) as
arguments; callers pass PeerSyncTimeout (60s locally, 120s in CI) and
PeerSyncRetryInterval (100ms). Peer validation errors are collected with
multierr instead of aborting on the first failure. Also drop the
AllocateNextIPs helper from hscontrol/state.

Signed-off-by: Kristoffer Dalby
---
 hscontrol/state/state.go            |   5 --
 integration/acl_test.go             |   3 +-
 integration/integrationutil/util.go |  15 ++++
 integration/scenario.go             |  25 ++++--
 integration/tailscale.go            |   7 +-
 integration/tsic/tsic.go            | 127 ++++++++++++++++++----------
 6 files changed, 121 insertions(+), 61 deletions(-)

diff --git a/hscontrol/state/state.go b/hscontrol/state/state.go
index 1768ea97..a63dad22 100644
--- a/hscontrol/state/state.go
+++ b/hscontrol/state/state.go
@@ -969,11 +969,6 @@ func (s *State) HandleNodeFromPreAuthKey(
 	return node.View(), c, nil
 }
 
-// AllocateNextIPs allocates the next available IPv4 and IPv6 addresses.
-func (s *State) AllocateNextIPs() (*netip.Addr, *netip.Addr, error) {
-	return s.ipAlloc.Next()
-}
-
 // updatePolicyManagerUsers updates the policy manager with current users.
 // Returns true if the policy changed and notifications should be sent.
 // TODO(kradalby): This is a temporary stepping stone, ultimately we should
diff --git a/integration/acl_test.go b/integration/acl_test.go
index 3aef521e..d204d1f4 100644
--- a/integration/acl_test.go
+++ b/integration/acl_test.go
@@ -11,6 +11,7 @@ import (
 	policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2"
 	"github.com/juanfont/headscale/hscontrol/types"
 	"github.com/juanfont/headscale/integration/hsic"
+	"github.com/juanfont/headscale/integration/integrationutil"
 	"github.com/juanfont/headscale/integration/tsic"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -312,7 +313,7 @@ func TestACLHostsInNetMapTable(t *testing.T) {
 			allClients, err := scenario.ListTailscaleClients()
 			require.NoError(t, err)
 
-			err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"])
+			err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"], integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
 			require.NoError(t, err)
 
 			for _, client := range allClients {
diff --git a/integration/integrationutil/util.go b/integration/integrationutil/util.go
index 7b9b63b5..0bf3545d 100644
--- a/integration/integrationutil/util.go
+++ b/integration/integrationutil/util.go
@@ -14,11 +14,26 @@ import (
 	"path/filepath"
 	"time"
 
+	"github.com/juanfont/headscale/hscontrol/util"
 	"github.com/juanfont/headscale/integration/dockertestutil"
 	"github.com/ory/dockertest/v3"
 	"github.com/ory/dockertest/v3/docker"
 )
 
+// PeerSyncTimeout returns the timeout for peer synchronization based on the
+// environment: 60s for dev, 120s for CI.
+func PeerSyncTimeout() time.Duration {
+	if util.IsCI() {
+		return 120 * time.Second
+	}
+	return 60 * time.Second
+}
+
+// PeerSyncRetryInterval returns the retry interval for peer synchronization checks.
+func PeerSyncRetryInterval() time.Duration {
+	return 100 * time.Millisecond
+}
+
 func WriteFileToContainer(
 	pool *dockertest.Pool,
 	container *dockertest.Resource,
diff --git a/integration/scenario.go b/integration/scenario.go
index 8ce54b89..c7facf20 100644
--- a/integration/scenario.go
+++ b/integration/scenario.go
@@ -27,6 +27,7 @@ import (
 	"github.com/juanfont/headscale/integration/dockertestutil"
 	"github.com/juanfont/headscale/integration/dsic"
 	"github.com/juanfont/headscale/integration/hsic"
+	"github.com/juanfont/headscale/integration/integrationutil"
 	"github.com/juanfont/headscale/integration/tsic"
 	"github.com/oauth2-proxy/mockoidc"
 	"github.com/ory/dockertest/v3"
@@ -39,6 +40,7 @@ import (
 	"golang.org/x/sync/errgroup"
 	"tailscale.com/envknob"
 	"tailscale.com/util/mak"
+	"tailscale.com/util/multierr"
 )
 
 const (
@@ -498,7 +500,7 @@ func (s *Scenario) CreateTailscaleNode(
 		)
 	}
 
-	err = tsClient.WaitForNeedsLogin()
+	err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
 	if err != nil {
 		return nil, fmt.Errorf(
 			"failed to wait for tailscaled (%s) to need login: %w",
@@ -561,7 +563,7 @@ func (s *Scenario) CreateTailscaleNodesInUser(
 				)
 			}
 
-			err = tsClient.WaitForNeedsLogin()
+			err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
 			if err != nil {
 				return fmt.Errorf(
 					"failed to wait for tailscaled (%s) to need login: %w",
@@ -607,7 +609,7 @@ func (s *Scenario) RunTailscaleUp(
 		}
 
 		for _, client := range user.Clients {
-			err := client.WaitForRunning()
+			err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
 			if err != nil {
 				return fmt.Errorf("%s failed to up tailscale node: %w", client.Hostname(), err)
 			}
@@ -636,7 +638,7 @@ func (s *Scenario) CountTailscale() int {
 func (s *Scenario) WaitForTailscaleSync() error {
 	tsCount := s.CountTailscale()
 
-	err := s.WaitForTailscaleSyncWithPeerCount(tsCount - 1)
+	err := s.WaitForTailscaleSyncWithPeerCount(tsCount-1, integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
 	if err != nil {
 		for _, user := range s.users {
 			for _, client := range user.Clients {
@@ -653,19 +655,24 @@ func (s *Scenario) WaitForTailscaleSync() error {
 
 // WaitForTailscaleSyncWithPeerCount blocks execution until all the TailscaleClient reports
 // to have all other TailscaleClients present in their netmap.NetworkMap.
-func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int) error {
+func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int, timeout, retryInterval time.Duration) error {
+	var allErrors []error
+
 	for _, user := range s.users {
 		for _, client := range user.Clients {
 			c := client
 			user.syncWaitGroup.Go(func() error {
-				return c.WaitForPeers(peerCount)
+				return c.WaitForPeers(peerCount, timeout, retryInterval)
 			})
 		}
 		if err := user.syncWaitGroup.Wait(); err != nil {
-			return err
+			allErrors = append(allErrors, err)
 		}
 	}
 
+	if len(allErrors) > 0 {
+		return multierr.New(allErrors...)
+	}
+
 	return nil
 }
 
@@ -767,7 +774,7 @@ func (s *Scenario) RunTailscaleUpWithURL(userStr, loginServer string) error {
 	}
 
 	for _, client := range user.Clients {
-		err := client.WaitForRunning()
+		err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
 		if err != nil {
 			return fmt.Errorf(
 				"%s tailscale node has not reached running: %w",
@@ -1001,7 +1008,7 @@ func (s *Scenario) WaitForTailscaleLogout() error {
 		for _, client := range user.Clients {
 			c := client
 			user.syncWaitGroup.Go(func() error {
-				return c.WaitForNeedsLogin()
+				return c.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
 			})
 		}
 		if err := user.syncWaitGroup.Wait(); err != nil {
diff --git a/integration/tailscale.go b/integration/tailscale.go
index e8a93b45..cc895a81 100644
--- a/integration/tailscale.go
+++ b/integration/tailscale.go
@@ -4,6 +4,7 @@ import (
 	"io"
 	"net/netip"
 	"net/url"
+	"time"
 
 	"github.com/juanfont/headscale/hscontrol/types"
 	"github.com/juanfont/headscale/hscontrol/util"
@@ -40,9 +41,9 @@ type TailscaleClient interface {
 	DebugDERPRegion(region string) (*ipnstate.DebugDERPRegionReport, error)
 	GetNodePrivateKey() (*key.NodePrivate, error)
 	Netcheck() (*netcheck.Report, error)
-	WaitForNeedsLogin() error
-	WaitForRunning() error
-	WaitForPeers(expected int) error
+	WaitForNeedsLogin(timeout time.Duration) error
+	WaitForRunning(timeout time.Duration) error
+	WaitForPeers(expected int, timeout, retryInterval time.Duration) error
 	Ping(hostnameOrIP string, opts ...tsic.PingOption) error
 	Curl(url string, opts ...tsic.CurlOption) (string, error)
 	Traceroute(netip.Addr) (util.Traceroute, error)
diff --git a/integration/tsic/tsic.go b/integration/tsic/tsic.go
index 01603512..90b6858f 100644
--- a/integration/tsic/tsic.go
+++ b/integration/tsic/tsic.go
@@ -31,6 +31,7 @@ import (
 	"tailscale.com/paths"
 	"tailscale.com/types/key"
 	"tailscale.com/types/netmap"
+	"tailscale.com/util/multierr"
 )
 
 const (
@@ -529,7 +530,7 @@ func (t *TailscaleInContainer) Logout() error {
 		return fmt.Errorf("failed to logout, stdout: %s, stderr: %s", stdout, stderr)
 	}
 
-	return t.waitForBackendState("NeedsLogin")
+	return t.waitForBackendState("NeedsLogin", integrationutil.PeerSyncTimeout())
 }
 
 // Helper that runs `tailscale up` with no arguments.
@@ -904,75 +905,115 @@ func (t *TailscaleInContainer) FailingPeersAsString() (string, bool, error) {
 
 // WaitForNeedsLogin blocks until the Tailscale (tailscaled) instance has
 // started and needs to be logged into.
-func (t *TailscaleInContainer) WaitForNeedsLogin() error {
-	return t.waitForBackendState("NeedsLogin")
+func (t *TailscaleInContainer) WaitForNeedsLogin(timeout time.Duration) error {
+	return t.waitForBackendState("NeedsLogin", timeout)
 }
 
 // WaitForRunning blocks until the Tailscale (tailscaled) instance is logged in
 // and ready to be used.
-func (t *TailscaleInContainer) WaitForRunning() error {
-	return t.waitForBackendState("Running")
+func (t *TailscaleInContainer) WaitForRunning(timeout time.Duration) error {
+	return t.waitForBackendState("Running", timeout)
 }
 
-func (t *TailscaleInContainer) waitForBackendState(state string) error {
-	return t.pool.Retry(func() error {
-		status, err := t.Status()
-		if err != nil {
-			return errTailscaleStatus(t.hostname, err)
-		}
+func (t *TailscaleInContainer) waitForBackendState(state string, timeout time.Duration) error {
+	ticker := time.NewTicker(integrationutil.PeerSyncRetryInterval())
+	defer ticker.Stop()
 
-		// ipnstate.Status.CurrentTailnet was added in Tailscale 1.22.0
-		// https://github.com/tailscale/tailscale/pull/3865
-		//
-		// Before that, we can check the BackendState to see if the
-		// tailscaled daemon is connected to the control system.
-		if status.BackendState == state {
-			return nil
-		}
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
 
-		return errTailscaleNotConnected
-	})
+	for {
+		select {
+		case <-ctx.Done():
+			return fmt.Errorf("timeout waiting for backend state %s on %s after %v", state, t.hostname, timeout)
+		case <-ticker.C:
+			status, err := t.Status()
+			if err != nil {
+				continue // Keep retrying on status errors
+			}
+
+			// ipnstate.Status.CurrentTailnet was added in Tailscale 1.22.0
+			// https://github.com/tailscale/tailscale/pull/3865
+			//
+			// Before that, we can check the BackendState to see if the
+			// tailscaled daemon is connected to the control system.
+			if status.BackendState == state {
+				return nil
+			}
+		}
+	}
 }
 
 // WaitForPeers blocks until N number of peers is present in the
 // Peer list of the Tailscale instance and is reporting Online.
-func (t *TailscaleInContainer) WaitForPeers(expected int) error {
-	return t.pool.Retry(func() error {
-		status, err := t.Status()
-		if err != nil {
-			return errTailscaleStatus(t.hostname, err)
-		}
+//
+// The method verifies that:
+//   - The expected number of peers is present
+//   - All peers are Online
+//   - All peers have a hostname
+//   - All peers have a DERP relay assigned
+//
+// Uses multierr to collect all validation errors.
+func (t *TailscaleInContainer) WaitForPeers(expected int, timeout, retryInterval time.Duration) error {
+	ticker := time.NewTicker(retryInterval)
+	defer ticker.Stop()
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	var lastErrs []error
+	for {
+		select {
+		case <-ctx.Done():
+			if len(lastErrs) > 0 {
+				return fmt.Errorf("timeout waiting for %d peers on %s after %v, errors: %w", expected, t.hostname, timeout, multierr.New(lastErrs...))
+			}
+			return fmt.Errorf("timeout waiting for %d peers on %s after %v", expected, t.hostname, timeout)
+		case <-ticker.C:
+			status, err := t.Status()
+			if err != nil {
+				lastErrs = []error{errTailscaleStatus(t.hostname, err)}
+				continue // Keep retrying on status errors
+			}
+
+			if peers := status.Peers(); len(peers) != expected {
+				lastErrs = []error{fmt.Errorf(
+					"%s err: %w expected %d, got %d",
+					t.hostname,
+					errTailscaleWrongPeerCount,
+					expected,
+					len(peers),
+				)}
+				continue
+			}
 
-		if peers := status.Peers(); len(peers) != expected {
-			return fmt.Errorf(
-				"%s err: %w expected %d, got %d",
-				t.hostname,
-				errTailscaleWrongPeerCount,
-				expected,
-				len(peers),
-			)
-		} else {
 			// Verify that the peers of a given node is Online
 			// has a hostname and a DERP relay.
-			for _, peerKey := range peers {
+			var peerErrors []error
+			for _, peerKey := range status.Peers() {
 				peer := status.Peer[peerKey]
 
 				if !peer.Online {
-					return fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName)
+					peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName))
 				}
 
 				if peer.HostName == "" {
-					return fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName)
+					peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName))
 				}
 
 				if peer.Relay == "" {
-					return fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName)
+					peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName))
 				}
 			}
-		}
 
-		return nil
-	})
+			if len(peerErrors) > 0 {
+				lastErrs = peerErrors
+				continue
+			}
+
+			return nil
+		}
+	}
 }
 
 type (