
integration: rework retry for waiting for node sync

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
Kristoffer Dalby 2025-07-23 16:03:58 +02:00
parent 112438219b
commit 66eda92ec0
6 changed files with 121 additions and 61 deletions
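
In short: the commit threads explicit timeout and retry-interval parameters through the integration helpers (PeerSyncTimeout, PeerSyncRetryInterval), replaces dockertest's pool.Retry with ticker-driven polling loops bounded by context.WithTimeout in waitForBackendState and WaitForPeers, collects validation errors with tailscale.com/util/multierr instead of returning on the first failure, and removes the State.AllocateNextIPs helper.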


@@ -969,11 +969,6 @@ func (s *State) HandleNodeFromPreAuthKey(
 	return node.View(), c, nil
 }
 
-// AllocateNextIPs allocates the next available IPv4 and IPv6 addresses.
-func (s *State) AllocateNextIPs() (*netip.Addr, *netip.Addr, error) {
-	return s.ipAlloc.Next()
-}
-
 // updatePolicyManagerUsers updates the policy manager with current users.
 // Returns true if the policy changed and notifications should be sent.
 // TODO(kradalby): This is a temporary stepping stone, ultimately we should


@@ -11,6 +11,7 @@ import (
 	policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2"
 	"github.com/juanfont/headscale/hscontrol/types"
 	"github.com/juanfont/headscale/integration/hsic"
+	"github.com/juanfont/headscale/integration/integrationutil"
 	"github.com/juanfont/headscale/integration/tsic"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -312,7 +313,7 @@ func TestACLHostsInNetMapTable(t *testing.T) {
 	allClients, err := scenario.ListTailscaleClients()
 	require.NoError(t, err)
 
-	err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"])
+	err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"], integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
 	require.NoError(t, err)
 
 	for _, client := range allClients {


@@ -14,11 +14,26 @@ import (
 	"path/filepath"
 	"time"
 
+	"github.com/juanfont/headscale/hscontrol/util"
 	"github.com/juanfont/headscale/integration/dockertestutil"
 	"github.com/ory/dockertest/v3"
 	"github.com/ory/dockertest/v3/docker"
 )
 
+// PeerSyncTimeout returns the timeout for peer synchronization based on environment:
+// 60s for dev, 120s for CI.
+func PeerSyncTimeout() time.Duration {
+	if util.IsCI() {
+		return 120 * time.Second
+	}
+
+	return 60 * time.Second
+}
+
+// PeerSyncRetryInterval returns the retry interval for peer synchronization checks.
+func PeerSyncRetryInterval() time.Duration {
+	return 100 * time.Millisecond
+}
+
 func WriteFileToContainer(
 	pool *dockertest.Pool,
 	container *dockertest.Resource,
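
These two helpers centralize the tuning knobs that the rest of the diff threads through. To make the intended consumption concrete, here is a minimal, self-contained sketch of the timeout-plus-interval polling shape they feed; waitUntil and its demo condition are hypothetical, written for illustration rather than taken from the commit.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// waitUntil polls cond at the given interval until it returns nil or the
// timeout elapses, mirroring the ticker+context shape of the reworked
// wait functions below.
func waitUntil(timeout, interval time.Duration, cond func() error) error {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	var lastErr error
	for {
		select {
		case <-ctx.Done():
			if lastErr != nil {
				return fmt.Errorf("timeout after %v: %w", timeout, lastErr)
			}
			return fmt.Errorf("timeout after %v", timeout)
		case <-ticker.C:
			if lastErr = cond(); lastErr == nil {
				return nil
			}
		}
	}
}

func main() {
	ready := time.Now().Add(250 * time.Millisecond)

	// Succeeds once the (fake) readiness condition flips, well inside the budget.
	err := waitUntil(2*time.Second, 100*time.Millisecond, func() error {
		if time.Now().Before(ready) {
			return errors.New("not synced yet")
		}
		return nil
	})
	fmt.Println("result:", err)
}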


@@ -27,6 +27,7 @@ import (
 	"github.com/juanfont/headscale/integration/dockertestutil"
 	"github.com/juanfont/headscale/integration/dsic"
 	"github.com/juanfont/headscale/integration/hsic"
+	"github.com/juanfont/headscale/integration/integrationutil"
 	"github.com/juanfont/headscale/integration/tsic"
 	"github.com/oauth2-proxy/mockoidc"
 	"github.com/ory/dockertest/v3"
@@ -39,6 +40,7 @@ import (
 	"golang.org/x/sync/errgroup"
 	"tailscale.com/envknob"
 	"tailscale.com/util/mak"
+	"tailscale.com/util/multierr"
 )
 
 const (
@@ -498,7 +500,7 @@ func (s *Scenario) CreateTailscaleNode(
 		)
 	}
 
-	err = tsClient.WaitForNeedsLogin()
+	err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
 	if err != nil {
 		return nil, fmt.Errorf(
 			"failed to wait for tailscaled (%s) to need login: %w",
@@ -561,7 +563,7 @@ func (s *Scenario) CreateTailscaleNodesInUser(
 			)
 		}
 
-		err = tsClient.WaitForNeedsLogin()
+		err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
 		if err != nil {
 			return fmt.Errorf(
 				"failed to wait for tailscaled (%s) to need login: %w",
@@ -607,7 +609,7 @@ func (s *Scenario) RunTailscaleUp(
 	}
 
 	for _, client := range user.Clients {
-		err := client.WaitForRunning()
+		err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
 		if err != nil {
 			return fmt.Errorf("%s failed to up tailscale node: %w", client.Hostname(), err)
 		}
@@ -636,7 +638,7 @@ func (s *Scenario) CountTailscale() int {
 func (s *Scenario) WaitForTailscaleSync() error {
 	tsCount := s.CountTailscale()
 
-	err := s.WaitForTailscaleSyncWithPeerCount(tsCount - 1)
+	err := s.WaitForTailscaleSyncWithPeerCount(tsCount-1, integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
 	if err != nil {
 		for _, user := range s.users {
 			for _, client := range user.Clients {
@@ -653,19 +655,24 @@ func (s *Scenario) WaitForTailscaleSync() error {
 
 // WaitForTailscaleSyncWithPeerCount blocks execution until all the TailscaleClient reports
 // to have all other TailscaleClients present in their netmap.NetworkMap.
-func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int) error {
+func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int, timeout, retryInterval time.Duration) error {
+	var allErrors []error
+
 	for _, user := range s.users {
 		for _, client := range user.Clients {
 			c := client
 			user.syncWaitGroup.Go(func() error {
-				return c.WaitForPeers(peerCount)
+				return c.WaitForPeers(peerCount, timeout, retryInterval)
 			})
 		}
 		if err := user.syncWaitGroup.Wait(); err != nil {
-			return err
+			allErrors = append(allErrors, err)
 		}
 	}
 
+	if len(allErrors) > 0 {
+		return multierr.New(allErrors...)
+	}
+
 	return nil
 }
 
@@ -767,7 +774,7 @@ func (s *Scenario) RunTailscaleUpWithURL(userStr, loginServer string) error {
 	}
 
 	for _, client := range user.Clients {
-		err := client.WaitForRunning()
+		err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
 		if err != nil {
 			return fmt.Errorf(
 				"%s tailscale node has not reached running: %w",
@@ -1001,7 +1008,7 @@ func (s *Scenario) WaitForTailscaleLogout() error {
 		for _, client := range user.Clients {
 			c := client
 			user.syncWaitGroup.Go(func() error {
-				return c.WaitForNeedsLogin()
+				return c.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
 			})
 		}
 		if err := user.syncWaitGroup.Wait(); err != nil {
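
A notable behavioral change above: WaitForTailscaleSyncWithPeerCount now keeps iterating after a user's wait group fails and reports everything at once via multierr.New. A small self-contained sketch of that collect-then-combine pattern, using the same tailscale.com/util/multierr package (the user names and failures are invented for the demo):

package main

import (
	"errors"
	"fmt"

	"tailscale.com/util/multierr"
)

func main() {
	// Per-user sync results; in the real scenario these come from
	// each user's syncWaitGroup.Wait().
	results := map[string]error{
		"user1": nil,
		"user2": errors.New("peer count mismatch"),
		"user3": errors.New("peer offline"),
	}

	var allErrors []error
	for name, err := range results {
		if err != nil {
			// Collect instead of returning early, so one report
			// covers every user that failed to sync.
			allErrors = append(allErrors, fmt.Errorf("%s: %w", name, err))
		}
	}

	if len(allErrors) > 0 {
		fmt.Println(multierr.New(allErrors...))
	}
}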


@@ -4,6 +4,7 @@ import (
 	"io"
 	"net/netip"
 	"net/url"
+	"time"
 
 	"github.com/juanfont/headscale/hscontrol/types"
 	"github.com/juanfont/headscale/hscontrol/util"
@@ -40,9 +41,9 @@ type TailscaleClient interface {
 	DebugDERPRegion(region string) (*ipnstate.DebugDERPRegionReport, error)
 	GetNodePrivateKey() (*key.NodePrivate, error)
 	Netcheck() (*netcheck.Report, error)
-	WaitForNeedsLogin() error
-	WaitForRunning() error
-	WaitForPeers(expected int) error
+	WaitForNeedsLogin(timeout time.Duration) error
+	WaitForRunning(timeout time.Duration) error
+	WaitForPeers(expected int, timeout, retryInterval time.Duration) error
 	Ping(hostnameOrIP string, opts ...tsic.PingOption) error
 	Curl(url string, opts ...tsic.CurlOption) (string, error)
 	Traceroute(netip.Addr) (util.Traceroute, error)


@@ -31,6 +31,7 @@ import (
 	"tailscale.com/paths"
 	"tailscale.com/types/key"
 	"tailscale.com/types/netmap"
+	"tailscale.com/util/multierr"
 )
 
 const (
@@ -529,7 +530,7 @@ func (t *TailscaleInContainer) Logout() error {
 		return fmt.Errorf("failed to logout, stdout: %s, stderr: %s", stdout, stderr)
 	}
 
-	return t.waitForBackendState("NeedsLogin")
+	return t.waitForBackendState("NeedsLogin", integrationutil.PeerSyncTimeout())
 }
 
 // Helper that runs `tailscale up` with no arguments.
@@ -904,75 +905,115 @@ func (t *TailscaleInContainer) FailingPeersAsString() (string, bool, error) {
 
 // WaitForNeedsLogin blocks until the Tailscale (tailscaled) instance has
 // started and needs to be logged into.
-func (t *TailscaleInContainer) WaitForNeedsLogin() error {
-	return t.waitForBackendState("NeedsLogin")
+func (t *TailscaleInContainer) WaitForNeedsLogin(timeout time.Duration) error {
+	return t.waitForBackendState("NeedsLogin", timeout)
 }
 
 // WaitForRunning blocks until the Tailscale (tailscaled) instance is logged in
 // and ready to be used.
-func (t *TailscaleInContainer) WaitForRunning() error {
-	return t.waitForBackendState("Running")
+func (t *TailscaleInContainer) WaitForRunning(timeout time.Duration) error {
+	return t.waitForBackendState("Running", timeout)
 }
 
-func (t *TailscaleInContainer) waitForBackendState(state string) error {
-	return t.pool.Retry(func() error {
-		status, err := t.Status()
-		if err != nil {
-			return errTailscaleStatus(t.hostname, err)
-		}
-
-		// ipnstate.Status.CurrentTailnet was added in Tailscale 1.22.0
-		// https://github.com/tailscale/tailscale/pull/3865
-		//
-		// Before that, we can check the BackendState to see if the
-		// tailscaled daemon is connected to the control system.
-		if status.BackendState == state {
-			return nil
-		}
-
-		return errTailscaleNotConnected
-	})
+func (t *TailscaleInContainer) waitForBackendState(state string, timeout time.Duration) error {
+	ticker := time.NewTicker(integrationutil.PeerSyncRetryInterval())
+	defer ticker.Stop()
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return fmt.Errorf("timeout waiting for backend state %s on %s after %v", state, t.hostname, timeout)
+		case <-ticker.C:
+			status, err := t.Status()
+			if err != nil {
+				continue // Keep retrying on status errors
+			}
+
+			// ipnstate.Status.CurrentTailnet was added in Tailscale 1.22.0
+			// https://github.com/tailscale/tailscale/pull/3865
+			//
+			// Before that, we can check the BackendState to see if the
+			// tailscaled daemon is connected to the control system.
+			if status.BackendState == state {
+				return nil
+			}
+		}
+	}
 }
 
 // WaitForPeers blocks until N number of peers is present in the
 // Peer list of the Tailscale instance and is reporting Online.
-func (t *TailscaleInContainer) WaitForPeers(expected int) error {
-	return t.pool.Retry(func() error {
-		status, err := t.Status()
-		if err != nil {
-			return errTailscaleStatus(t.hostname, err)
-		}
-
-		if peers := status.Peers(); len(peers) != expected {
-			return fmt.Errorf(
-				"%s err: %w expected %d, got %d",
-				t.hostname,
-				errTailscaleWrongPeerCount,
-				expected,
-				len(peers),
-			)
-		} else {
-			// Verify that the peers of a given node is Online
-			// has a hostname and a DERP relay.
-			for _, peerKey := range peers {
-				peer := status.Peer[peerKey]
-
-				if !peer.Online {
-					return fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName)
-				}
-
-				if peer.HostName == "" {
-					return fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName)
-				}
-
-				if peer.Relay == "" {
-					return fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName)
-				}
-			}
-		}
-
-		return nil
-	})
+//
+// The method verifies that each peer:
+// - Has the expected peer count
+// - All peers are Online
+// - All peers have a hostname
+// - All peers have a DERP relay assigned
+//
+// Uses multierr to collect all validation errors.
+func (t *TailscaleInContainer) WaitForPeers(expected int, timeout, retryInterval time.Duration) error {
+	ticker := time.NewTicker(retryInterval)
+	defer ticker.Stop()
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	var lastErrs []error
+
+	for {
+		select {
+		case <-ctx.Done():
+			if len(lastErrs) > 0 {
+				return fmt.Errorf("timeout waiting for %d peers on %s after %v, errors: %w", expected, t.hostname, timeout, multierr.New(lastErrs...))
+			}
+			return fmt.Errorf("timeout waiting for %d peers on %s after %v", expected, t.hostname, timeout)
+		case <-ticker.C:
+			status, err := t.Status()
+			if err != nil {
+				lastErrs = []error{errTailscaleStatus(t.hostname, err)}
+				continue // Keep retrying on status errors
+			}
+
+			if peers := status.Peers(); len(peers) != expected {
+				lastErrs = []error{fmt.Errorf(
+					"%s err: %w expected %d, got %d",
+					t.hostname,
+					errTailscaleWrongPeerCount,
+					expected,
+					len(peers),
+				)}
+				continue
+			}
+
+			// Verify that the peers of a given node is Online
+			// has a hostname and a DERP relay.
+			var peerErrors []error
+			for _, peerKey := range status.Peers() {
+				peer := status.Peer[peerKey]
+
+				if !peer.Online {
+					peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName))
+				}
+
+				if peer.HostName == "" {
+					peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName))
+				}
+
+				if peer.Relay == "" {
+					peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName))
+				}
+			}
+
+			if len(peerErrors) > 0 {
+				lastErrs = peerErrors
+				continue
+			}
+
+			return nil
+		}
+	}
 }
 
 type (
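
Two design notes on the reworked loops. Swapping pool.Retry for an explicit ticker bounded by context.WithTimeout makes the budget and cadence caller-controlled per call site, rather than governed by the dockertest pool's global retry policy. And WaitForPeers deliberately overwrites lastErrs on every tick instead of appending, so a timeout reports what was still wrong on the final check rather than every transient failure seen along the way. A standalone sketch of that snapshot behavior (loop timings and error text invented for the demo):

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 350*time.Millisecond)
	defer cancel()

	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()

	attempt := 0
	var lastErrs []error

	for {
		select {
		case <-ctx.Done():
			// Only the final snapshot is reported.
			fmt.Printf("timeout, last errors: %v\n", lastErrs)
			return
		case <-ticker.C:
			attempt++
			// Overwrite, not append: stale failures from earlier ticks vanish.
			lastErrs = []error{fmt.Errorf("attempt %d: peer offline", attempt)}
		}
	}
}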