
integration: rework retry for waiting for node sync

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
Kristoffer Dalby 2025-07-23 16:03:58 +02:00
parent 112438219b
commit 66eda92ec0
6 changed files with 121 additions and 61 deletions
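
At its core, the rework replaces dockertest's pool.Retry, which bakes in its own deadline, with an explicit ticker-plus-context loop whose timeout and poll interval are supplied by the caller. A minimal, self-contained sketch of that pattern (waitFor and its condition are illustrative, not code from this commit):

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// waitFor polls cond every retryInterval until it succeeds or timeout
// elapses, mirroring the loop this commit introduces in
// waitForBackendState and WaitForPeers.
func waitFor(timeout, retryInterval time.Duration, cond func() error) error {
	ticker := time.NewTicker(retryInterval)
	defer ticker.Stop()

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	var lastErr error
	for {
		select {
		case <-ctx.Done():
			if lastErr != nil {
				// Surface the last failure so the timeout explains itself.
				return fmt.Errorf("timeout after %v: %w", timeout, lastErr)
			}
			return fmt.Errorf("timeout after %v", timeout)
		case <-ticker.C:
			if lastErr = cond(); lastErr == nil {
				return nil
			}
		}
	}
}

func main() {
	start := time.Now()
	err := waitFor(500*time.Millisecond, 100*time.Millisecond, func() error {
		if time.Since(start) < 300*time.Millisecond {
			return errors.New("not ready yet")
		}
		return nil
	})
	fmt.Println(err) // <nil>: the condition passed before the deadline
}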

View File

@@ -969,11 +969,6 @@ func (s *State) HandleNodeFromPreAuthKey(
return node.View(), c, nil
}
// AllocateNextIPs allocates the next available IPv4 and IPv6 addresses.
func (s *State) AllocateNextIPs() (*netip.Addr, *netip.Addr, error) {
return s.ipAlloc.Next()
}
// updatePolicyManagerUsers updates the policy manager with current users.
// Returns true if the policy changed and notifications should be sent.
// TODO(kradalby): This is a temporary stepping stone, ultimately we should

View File

@@ -11,6 +11,7 @@ import (
policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/juanfont/headscale/integration/hsic"
"github.com/juanfont/headscale/integration/integrationutil"
"github.com/juanfont/headscale/integration/tsic"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -312,7 +313,7 @@ func TestACLHostsInNetMapTable(t *testing.T) {
allClients, err := scenario.ListTailscaleClients()
require.NoError(t, err)
err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"])
err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"], integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
require.NoError(t, err)
for _, client := range allClients {

View File

@@ -14,11 +14,26 @@ import (
"path/filepath"
"time"
"github.com/juanfont/headscale/hscontrol/util"
"github.com/juanfont/headscale/integration/dockertestutil"
"github.com/ory/dockertest/v3"
"github.com/ory/dockertest/v3/docker"
)
// PeerSyncTimeout returns the timeout for peer synchronization based on the
// environment: 60s for dev, 120s for CI.
func PeerSyncTimeout() time.Duration {
if util.IsCI() {
return 120 * time.Second
}
return 60 * time.Second
}
// PeerSyncRetryInterval returns the retry interval for peer synchronization checks.
func PeerSyncRetryInterval() time.Duration {
return 100 * time.Millisecond
}
func WriteFileToContainer(
pool *dockertest.Pool,
container *dockertest.Resource,

View File

@@ -27,6 +27,7 @@ import (
"github.com/juanfont/headscale/integration/dockertestutil"
"github.com/juanfont/headscale/integration/dsic"
"github.com/juanfont/headscale/integration/hsic"
"github.com/juanfont/headscale/integration/integrationutil"
"github.com/juanfont/headscale/integration/tsic"
"github.com/oauth2-proxy/mockoidc"
"github.com/ory/dockertest/v3"
@@ -39,6 +40,7 @@ import (
"golang.org/x/sync/errgroup"
"tailscale.com/envknob"
"tailscale.com/util/mak"
"tailscale.com/util/multierr"
)
const (
@@ -498,7 +500,7 @@ func (s *Scenario) CreateTailscaleNode(
)
}
err = tsClient.WaitForNeedsLogin()
err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
if err != nil {
return nil, fmt.Errorf(
"failed to wait for tailscaled (%s) to need login: %w",
@@ -561,7 +563,7 @@ func (s *Scenario) CreateTailscaleNodesInUser(
)
}
err = tsClient.WaitForNeedsLogin()
err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
if err != nil {
return fmt.Errorf(
"failed to wait for tailscaled (%s) to need login: %w",
@@ -607,7 +609,7 @@ func (s *Scenario) RunTailscaleUp(
}
for _, client := range user.Clients {
err := client.WaitForRunning()
err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
if err != nil {
return fmt.Errorf("%s failed to up tailscale node: %w", client.Hostname(), err)
}
@@ -636,7 +638,7 @@ func (s *Scenario) CountTailscale() int {
func (s *Scenario) WaitForTailscaleSync() error {
tsCount := s.CountTailscale()
err := s.WaitForTailscaleSyncWithPeerCount(tsCount - 1)
err := s.WaitForTailscaleSyncWithPeerCount(tsCount-1, integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
if err != nil {
for _, user := range s.users {
for _, client := range user.Clients {
@@ -653,19 +655,24 @@ func (s *Scenario) WaitForTailscaleSync() error {
// WaitForTailscaleSyncWithPeerCount blocks execution until every TailscaleClient
// reports all other TailscaleClients as present in its netmap.NetworkMap.
func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int) error {
func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int, timeout, retryInterval time.Duration) error {
var allErrors []error
for _, user := range s.users {
for _, client := range user.Clients {
c := client
user.syncWaitGroup.Go(func() error {
return c.WaitForPeers(peerCount)
return c.WaitForPeers(peerCount, timeout, retryInterval)
})
}
if err := user.syncWaitGroup.Wait(); err != nil {
return err
allErrors = append(allErrors, err)
}
}
if len(allErrors) > 0 {
return multierr.New(allErrors...)
}
return nil
}
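
The other notable change above is the shift from returning on the first failed user to collecting every failure and combining them. A standalone sketch of that collect-then-combine shape using tailscale.com/util/multierr (the named checks are placeholders):

package main

import (
	"errors"
	"fmt"

	"tailscale.com/util/multierr"
)

// checkAll runs every check and reports all failures together instead
// of stopping at the first one.
func checkAll(checks map[string]func() error) error {
	var allErrors []error
	for name, check := range checks {
		if err := check(); err != nil {
			allErrors = append(allErrors, fmt.Errorf("%s: %w", name, err))
		}
	}
	if len(allErrors) > 0 {
		return multierr.New(allErrors...)
	}
	return nil
}

func main() {
	err := checkAll(map[string]func() error{
		"user1": func() error { return nil },
		"user2": func() error { return errors.New("2 peers missing") },
		"user3": func() error { return errors.New("node offline") },
	})
	fmt.Println(err) // both failures are reported, not just the first
}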
@@ -767,7 +774,7 @@ func (s *Scenario) RunTailscaleUpWithURL(userStr, loginServer string) error {
}
for _, client := range user.Clients {
err := client.WaitForRunning()
err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
if err != nil {
return fmt.Errorf(
"%s tailscale node has not reached running: %w",
@@ -1001,7 +1008,7 @@ func (s *Scenario) WaitForTailscaleLogout() error {
for _, client := range user.Clients {
c := client
user.syncWaitGroup.Go(func() error {
return c.WaitForNeedsLogin()
return c.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
})
}
if err := user.syncWaitGroup.Wait(); err != nil {

View File

@@ -4,6 +4,7 @@ import (
"io"
"net/netip"
"net/url"
"time"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/juanfont/headscale/hscontrol/util"
@@ -40,9 +41,9 @@ type TailscaleClient interface {
DebugDERPRegion(region string) (*ipnstate.DebugDERPRegionReport, error)
GetNodePrivateKey() (*key.NodePrivate, error)
Netcheck() (*netcheck.Report, error)
WaitForNeedsLogin() error
WaitForRunning() error
WaitForPeers(expected int) error
WaitForNeedsLogin(timeout time.Duration) error
WaitForRunning(timeout time.Duration) error
WaitForPeers(expected int, timeout, retryInterval time.Duration) error
Ping(hostnameOrIP string, opts ...tsic.PingOption) error
Curl(url string, opts ...tsic.CurlOption) (string, error)
Traceroute(netip.Addr) (util.Traceroute, error)
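
Because the interface widened, every implementation and test double has to adopt the new parameters; a minimal hypothetical stub (FakeClient is not part of the commit) showing only the three changed methods:

package fake

import "time"

// FakeClient is a hypothetical stub illustrating the widened
// signatures; a real TailscaleClient has many more methods.
type FakeClient struct{}

func (f *FakeClient) WaitForNeedsLogin(timeout time.Duration) error { return nil }

func (f *FakeClient) WaitForRunning(timeout time.Duration) error { return nil }

func (f *FakeClient) WaitForPeers(expected int, timeout, retryInterval time.Duration) error {
	return nil
}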

View File

@@ -31,6 +31,7 @@ import (
"tailscale.com/paths"
"tailscale.com/types/key"
"tailscale.com/types/netmap"
"tailscale.com/util/multierr"
)
const (
@@ -529,7 +530,7 @@ func (t *TailscaleInContainer) Logout() error {
return fmt.Errorf("failed to logout, stdout: %s, stderr: %s", stdout, stderr)
}
return t.waitForBackendState("NeedsLogin")
return t.waitForBackendState("NeedsLogin", integrationutil.PeerSyncTimeout())
}
// Helper that runs `tailscale up` with no arguments.
@@ -904,75 +905,115 @@ func (t *TailscaleInContainer) FailingPeersAsString() (string, bool, error) {
// WaitForNeedsLogin blocks until the Tailscale (tailscaled) instance has
// started and needs to be logged into.
func (t *TailscaleInContainer) WaitForNeedsLogin() error {
return t.waitForBackendState("NeedsLogin")
func (t *TailscaleInContainer) WaitForNeedsLogin(timeout time.Duration) error {
return t.waitForBackendState("NeedsLogin", timeout)
}
// WaitForRunning blocks until the Tailscale (tailscaled) instance is logged in
// and ready to be used.
func (t *TailscaleInContainer) WaitForRunning() error {
return t.waitForBackendState("Running")
func (t *TailscaleInContainer) WaitForRunning(timeout time.Duration) error {
return t.waitForBackendState("Running", timeout)
}
func (t *TailscaleInContainer) waitForBackendState(state string) error {
return t.pool.Retry(func() error {
status, err := t.Status()
if err != nil {
return errTailscaleStatus(t.hostname, err)
}
func (t *TailscaleInContainer) waitForBackendState(state string, timeout time.Duration) error {
ticker := time.NewTicker(integrationutil.PeerSyncRetryInterval())
defer ticker.Stop()
// ipnstate.Status.CurrentTailnet was added in Tailscale 1.22.0
// https://github.com/tailscale/tailscale/pull/3865
//
// Before that, we can check the BackendState to see if the
// tailscaled daemon is connected to the control system.
if status.BackendState == state {
return nil
}
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
return errTailscaleNotConnected
})
for {
select {
case <-ctx.Done():
return fmt.Errorf("timeout waiting for backend state %s on %s after %v", state, t.hostname, timeout)
case <-ticker.C:
status, err := t.Status()
if err != nil {
continue // Keep retrying on status errors
}
// ipnstate.Status.CurrentTailnet was added in Tailscale 1.22.0
// https://github.com/tailscale/tailscale/pull/3865
//
// Before that, we can check the BackendState to see if the
// tailscaled daemon is connected to the control system.
if status.BackendState == state {
return nil
}
}
}
}
// WaitForPeers blocks until the expected number of peers is present in the
// Peer list of the Tailscale instance and all of them report as Online.
func (t *TailscaleInContainer) WaitForPeers(expected int) error {
return t.pool.Retry(func() error {
status, err := t.Status()
if err != nil {
return errTailscaleStatus(t.hostname, err)
}
//
// The method verifies that:
// - The expected peer count is present
// - All peers are Online
// - All peers have a hostname
// - All peers have a DERP relay assigned
//
// Uses multierr to collect all validation errors.
func (t *TailscaleInContainer) WaitForPeers(expected int, timeout, retryInterval time.Duration) error {
ticker := time.NewTicker(retryInterval)
defer ticker.Stop()
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
var lastErrs []error
for {
select {
case <-ctx.Done():
if len(lastErrs) > 0 {
return fmt.Errorf("timeout waiting for %d peers on %s after %v, errors: %w", expected, t.hostname, timeout, multierr.New(lastErrs...))
}
return fmt.Errorf("timeout waiting for %d peers on %s after %v", expected, t.hostname, timeout)
case <-ticker.C:
status, err := t.Status()
if err != nil {
lastErrs = []error{errTailscaleStatus(t.hostname, err)}
continue // Keep retrying on status errors
}
if peers := status.Peers(); len(peers) != expected {
lastErrs = []error{fmt.Errorf(
"%s err: %w expected %d, got %d",
t.hostname,
errTailscaleWrongPeerCount,
expected,
len(peers),
)}
continue
}
if peers := status.Peers(); len(peers) != expected {
return fmt.Errorf(
"%s err: %w expected %d, got %d",
t.hostname,
errTailscaleWrongPeerCount,
expected,
len(peers),
)
} else {
// Verify that the peers of a given node is Online
// has a hostname and a DERP relay.
for _, peerKey := range peers {
var peerErrors []error
for _, peerKey := range status.Peers() {
peer := status.Peer[peerKey]
if !peer.Online {
return fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName)
peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName))
}
if peer.HostName == "" {
return fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName)
peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName))
}
if peer.Relay == "" {
return fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName)
peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName))
}
}
}
return nil
})
if len(peerErrors) > 0 {
lastErrs = peerErrors
continue
}
return nil
}
}
}
type (
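
One detail worth noting in the rework above: WaitForPeers remembers the most recent round of validation errors and wraps them into the deadline error, so a timeout reports why it happened rather than just that it did. An illustrative sketch of that reporting shape (the error strings are invented):

package main

import (
	"errors"
	"fmt"
	"time"

	"tailscale.com/util/multierr"
)

func main() {
	timeout := 60 * time.Second
	lastErrs := []error{
		errors.New("[ts-a] peer count correct, but ts-b is not online"),
		errors.New("[ts-a] peer count correct, but ts-c does not have a DERP"),
	}

	// The deadline error wraps the combined last-seen failures.
	err := fmt.Errorf("timeout waiting for 2 peers on ts-a after %v, errors: %w",
		timeout, multierr.New(lastErrs...))
	fmt.Println(err)
}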