integration: rework retry for waiting for node sync
Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
commit 66eda92ec0
parent 112438219b
@@ -969,11 +969,6 @@ func (s *State) HandleNodeFromPreAuthKey(
     return node.View(), c, nil
 }
 
-// AllocateNextIPs allocates the next available IPv4 and IPv6 addresses.
-func (s *State) AllocateNextIPs() (*netip.Addr, *netip.Addr, error) {
-    return s.ipAlloc.Next()
-}
-
 // updatePolicyManagerUsers updates the policy manager with current users.
 // Returns true if the policy changed and notifications should be sent.
 // TODO(kradalby): This is a temporary stepping stone, ultimately we should
@@ -11,6 +11,7 @@ import (
     policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2"
     "github.com/juanfont/headscale/hscontrol/types"
     "github.com/juanfont/headscale/integration/hsic"
+    "github.com/juanfont/headscale/integration/integrationutil"
     "github.com/juanfont/headscale/integration/tsic"
     "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"
@@ -312,7 +313,7 @@ func TestACLHostsInNetMapTable(t *testing.T) {
             allClients, err := scenario.ListTailscaleClients()
             require.NoError(t, err)
 
-            err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"])
+            err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"], integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
             require.NoError(t, err)
 
             for _, client := range allClients {
@@ -14,11 +14,26 @@ import (
     "path/filepath"
     "time"
 
+    "github.com/juanfont/headscale/hscontrol/util"
     "github.com/juanfont/headscale/integration/dockertestutil"
     "github.com/ory/dockertest/v3"
     "github.com/ory/dockertest/v3/docker"
 )
 
+// PeerSyncTimeout returns the timeout for peer synchronization based on environment:
+// 60s for dev, 120s for CI.
+func PeerSyncTimeout() time.Duration {
+    if util.IsCI() {
+        return 120 * time.Second
+    }
+    return 60 * time.Second
+}
+
+// PeerSyncRetryInterval returns the retry interval for peer synchronization checks.
+func PeerSyncRetryInterval() time.Duration {
+    return 100 * time.Millisecond
+}
+
 func WriteFileToContainer(
     pool *dockertest.Pool,
     container *dockertest.Resource,
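The two helpers above are meant to be passed straight into the reworked wait calls, as the test change earlier in this diff does. A minimal sketch of the intended call pattern (the scenario value, peer count, and surrounding test scaffolding are assumptions, not part of this commit):

	// Wait for every client to see the expected peers, polling every 100ms
	// until the environment-aware deadline (60s locally, 120s in CI) expires.
	err := scenario.WaitForTailscaleSyncWithPeerCount(
		expectedPeers,
		integrationutil.PeerSyncTimeout(),
		integrationutil.PeerSyncRetryInterval(),
	)
	require.NoError(t, err)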
@@ -27,6 +27,7 @@ import (
     "github.com/juanfont/headscale/integration/dockertestutil"
     "github.com/juanfont/headscale/integration/dsic"
     "github.com/juanfont/headscale/integration/hsic"
+    "github.com/juanfont/headscale/integration/integrationutil"
     "github.com/juanfont/headscale/integration/tsic"
     "github.com/oauth2-proxy/mockoidc"
     "github.com/ory/dockertest/v3"
@@ -39,6 +40,7 @@ import (
     "golang.org/x/sync/errgroup"
     "tailscale.com/envknob"
     "tailscale.com/util/mak"
+    "tailscale.com/util/multierr"
 )
 
 const (
@@ -498,7 +500,7 @@ func (s *Scenario) CreateTailscaleNode(
         )
     }
 
-    err = tsClient.WaitForNeedsLogin()
+    err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
     if err != nil {
         return nil, fmt.Errorf(
             "failed to wait for tailscaled (%s) to need login: %w",
@@ -561,7 +563,7 @@ func (s *Scenario) CreateTailscaleNodesInUser(
         )
     }
 
-    err = tsClient.WaitForNeedsLogin()
+    err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
     if err != nil {
         return fmt.Errorf(
             "failed to wait for tailscaled (%s) to need login: %w",
@@ -607,7 +609,7 @@ func (s *Scenario) RunTailscaleUp(
     }
 
     for _, client := range user.Clients {
-        err := client.WaitForRunning()
+        err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
         if err != nil {
             return fmt.Errorf("%s failed to up tailscale node: %w", client.Hostname(), err)
         }
@@ -636,7 +638,7 @@ func (s *Scenario) CountTailscale() int {
 func (s *Scenario) WaitForTailscaleSync() error {
     tsCount := s.CountTailscale()
 
-    err := s.WaitForTailscaleSyncWithPeerCount(tsCount - 1)
+    err := s.WaitForTailscaleSyncWithPeerCount(tsCount-1, integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
     if err != nil {
         for _, user := range s.users {
             for _, client := range user.Clients {
@@ -653,19 +655,24 @@ func (s *Scenario) WaitForTailscaleSync() error {
 
 // WaitForTailscaleSyncWithPeerCount blocks execution until all the TailscaleClient reports
 // to have all other TailscaleClients present in their netmap.NetworkMap.
-func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int) error {
+func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int, timeout, retryInterval time.Duration) error {
+    var allErrors []error
+
     for _, user := range s.users {
         for _, client := range user.Clients {
             c := client
             user.syncWaitGroup.Go(func() error {
-                return c.WaitForPeers(peerCount)
+                return c.WaitForPeers(peerCount, timeout, retryInterval)
             })
         }
         if err := user.syncWaitGroup.Wait(); err != nil {
-            return err
+            allErrors = append(allErrors, err)
         }
     }
 
+    if len(allErrors) > 0 {
+        return multierr.New(allErrors...)
+    }
     return nil
 }
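With this change the scenario no longer aborts on the first user whose clients fail to sync; it records every per-user error and reports them together. A standalone toy of that collect-then-combine pattern, assuming tailscale.com/util/multierr is available on the module path (the sample errors are invented for illustration):

	package main

	import (
		"errors"
		"fmt"

		"tailscale.com/util/multierr"
	)

	func main() {
		// Collect failures from several groups instead of returning on the first one.
		var allErrors []error
		for _, err := range []error{nil, errors.New("user2: peer missing"), errors.New("user3: peer offline")} {
			if err != nil {
				allErrors = append(allErrors, err)
			}
		}
		if len(allErrors) > 0 {
			// multierr.New combines the collected errors into a single error value.
			fmt.Println(multierr.New(allErrors...))
		}
	}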
@@ -767,7 +774,7 @@ func (s *Scenario) RunTailscaleUpWithURL(userStr, loginServer string) error {
     }
 
     for _, client := range user.Clients {
-        err := client.WaitForRunning()
+        err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
         if err != nil {
             return fmt.Errorf(
                 "%s tailscale node has not reached running: %w",
@@ -1001,7 +1008,7 @@ func (s *Scenario) WaitForTailscaleLogout() error {
         for _, client := range user.Clients {
             c := client
             user.syncWaitGroup.Go(func() error {
-                return c.WaitForNeedsLogin()
+                return c.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
             })
         }
         if err := user.syncWaitGroup.Wait(); err != nil {
@@ -4,6 +4,7 @@ import (
     "io"
     "net/netip"
     "net/url"
+    "time"
 
     "github.com/juanfont/headscale/hscontrol/types"
     "github.com/juanfont/headscale/hscontrol/util"
@@ -40,9 +41,9 @@ type TailscaleClient interface {
     DebugDERPRegion(region string) (*ipnstate.DebugDERPRegionReport, error)
     GetNodePrivateKey() (*key.NodePrivate, error)
     Netcheck() (*netcheck.Report, error)
-    WaitForNeedsLogin() error
-    WaitForRunning() error
-    WaitForPeers(expected int) error
+    WaitForNeedsLogin(timeout time.Duration) error
+    WaitForRunning(timeout time.Duration) error
+    WaitForPeers(expected int, timeout, retryInterval time.Duration) error
     Ping(hostnameOrIP string, opts ...tsic.PingOption) error
     Curl(url string, opts ...tsic.CurlOption) (string, error)
     Traceroute(netip.Addr) (util.Traceroute, error)
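The interface now threads timeouts and poll intervals through explicitly instead of relying on implicit dockertest retry defaults. Call sites that want to keep the shared defaults can use thin wrappers; a hypothetical sketch (these helpers are not part of the commit, and the package/import layout is an assumption based on the imports shown in this diff):

	package integration

	import (
		"github.com/juanfont/headscale/integration/integrationutil"
	)

	// waitRunningDefault waits for a client to reach Running with the shared default timeout.
	func waitRunningDefault(c TailscaleClient) error {
		return c.WaitForRunning(integrationutil.PeerSyncTimeout())
	}

	// waitPeersDefault waits for the expected peer count with the default timeout and interval.
	func waitPeersDefault(c TailscaleClient, expected int) error {
		return c.WaitForPeers(
			expected,
			integrationutil.PeerSyncTimeout(),
			integrationutil.PeerSyncRetryInterval(),
		)
	}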
@@ -31,6 +31,7 @@ import (
     "tailscale.com/paths"
     "tailscale.com/types/key"
     "tailscale.com/types/netmap"
+    "tailscale.com/util/multierr"
 )
 
 const (
@@ -529,7 +530,7 @@ func (t *TailscaleInContainer) Logout() error {
         return fmt.Errorf("failed to logout, stdout: %s, stderr: %s", stdout, stderr)
     }
 
-    return t.waitForBackendState("NeedsLogin")
+    return t.waitForBackendState("NeedsLogin", integrationutil.PeerSyncTimeout())
 }
 
 // Helper that runs `tailscale up` with no arguments.
@@ -904,21 +905,31 @@ func (t *TailscaleInContainer) FailingPeersAsString() (string, bool, error) {
 
 // WaitForNeedsLogin blocks until the Tailscale (tailscaled) instance has
 // started and needs to be logged into.
-func (t *TailscaleInContainer) WaitForNeedsLogin() error {
-    return t.waitForBackendState("NeedsLogin")
+func (t *TailscaleInContainer) WaitForNeedsLogin(timeout time.Duration) error {
+    return t.waitForBackendState("NeedsLogin", timeout)
 }
 
 // WaitForRunning blocks until the Tailscale (tailscaled) instance is logged in
 // and ready to be used.
-func (t *TailscaleInContainer) WaitForRunning() error {
-    return t.waitForBackendState("Running")
+func (t *TailscaleInContainer) WaitForRunning(timeout time.Duration) error {
+    return t.waitForBackendState("Running", timeout)
 }
 
-func (t *TailscaleInContainer) waitForBackendState(state string) error {
-    return t.pool.Retry(func() error {
+func (t *TailscaleInContainer) waitForBackendState(state string, timeout time.Duration) error {
+    ticker := time.NewTicker(integrationutil.PeerSyncRetryInterval())
+    defer ticker.Stop()
+
+    ctx, cancel := context.WithTimeout(context.Background(), timeout)
+    defer cancel()
+
+    for {
+        select {
+        case <-ctx.Done():
+            return fmt.Errorf("timeout waiting for backend state %s on %s after %v", state, t.hostname, timeout)
+        case <-ticker.C:
             status, err := t.Status()
             if err != nil {
-                return errTailscaleStatus(t.hostname, err)
+                continue // Keep retrying on status errors
             }
 
             // ipnstate.Status.CurrentTailnet was added in Tailscale 1.22.0
@@ -929,50 +940,80 @@ func (t *TailscaleInContainer) waitForBackendState(state string) error {
             if status.BackendState == state {
                 return nil
             }
 
-        return errTailscaleNotConnected
-    })
+        }
+    }
 }
 
 // WaitForPeers blocks until N number of peers is present in the
 // Peer list of the Tailscale instance and is reporting Online.
-func (t *TailscaleInContainer) WaitForPeers(expected int) error {
-    return t.pool.Retry(func() error {
+//
+// The method verifies that each peer:
+// - Has the expected peer count
+// - All peers are Online
+// - All peers have a hostname
+// - All peers have a DERP relay assigned
+//
+// Uses multierr to collect all validation errors.
+func (t *TailscaleInContainer) WaitForPeers(expected int, timeout, retryInterval time.Duration) error {
+    ticker := time.NewTicker(retryInterval)
+    defer ticker.Stop()
+
+    ctx, cancel := context.WithTimeout(context.Background(), timeout)
+    defer cancel()
+
+    var lastErrs []error
+    for {
+        select {
+        case <-ctx.Done():
+            if len(lastErrs) > 0 {
+                return fmt.Errorf("timeout waiting for %d peers on %s after %v, errors: %w", expected, t.hostname, timeout, multierr.New(lastErrs...))
+            }
+            return fmt.Errorf("timeout waiting for %d peers on %s after %v", expected, t.hostname, timeout)
+        case <-ticker.C:
             status, err := t.Status()
             if err != nil {
-                return errTailscaleStatus(t.hostname, err)
+                lastErrs = []error{errTailscaleStatus(t.hostname, err)}
+                continue // Keep retrying on status errors
             }
 
             if peers := status.Peers(); len(peers) != expected {
-                return fmt.Errorf(
+                lastErrs = []error{fmt.Errorf(
                     "%s err: %w expected %d, got %d",
                     t.hostname,
                     errTailscaleWrongPeerCount,
                     expected,
                     len(peers),
-                )
-            } else {
+                )}
+                continue
             }
 
             // Verify that the peers of a given node is Online
             // has a hostname and a DERP relay.
-            for _, peerKey := range peers {
+            var peerErrors []error
+            for _, peerKey := range status.Peers() {
                 peer := status.Peer[peerKey]
 
                 if !peer.Online {
-                    return fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName)
+                    peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName))
                 }
 
                 if peer.HostName == "" {
-                    return fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName)
+                    peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName))
                 }
 
                 if peer.Relay == "" {
-                    return fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName)
+                    peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName))
                 }
             }
-        }
 
+            if len(peerErrors) > 0 {
+                lastErrs = peerErrors
+                continue
+            }
+
             return nil
-    })
+        }
+    }
 }
 
 type (
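Both waitForBackendState and WaitForPeers now share the same shape: dockertest's pool.Retry is replaced by an explicit ticker plus a context deadline, so the timeout and poll interval are configurable per call and the last observed errors can be reported when the deadline expires. A self-contained distillation of that loop, with all names invented for illustration rather than taken from the repository:

	package main

	import (
		"context"
		"errors"
		"fmt"
		"time"
	)

	// waitFor polls check on a fixed interval until it succeeds or the timeout
	// expires, returning the last observed error on timeout.
	func waitFor(timeout, interval time.Duration, check func() error) error {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()

		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		defer cancel()

		var lastErr error
		for {
			select {
			case <-ctx.Done():
				if lastErr != nil {
					return fmt.Errorf("timeout after %v: %w", timeout, lastErr)
				}
				return fmt.Errorf("timeout after %v", timeout)
			case <-ticker.C:
				if err := check(); err != nil {
					lastErr = err
					continue // keep polling until the deadline
				}
				return nil
			}
		}
	}

	func main() {
		start := time.Now()
		// Succeeds once ~200ms have passed, well before the 1s deadline.
		err := waitFor(time.Second, 50*time.Millisecond, func() error {
			if time.Since(start) < 200*time.Millisecond {
				return errors.New("not ready yet")
			}
			return nil
		})
		fmt.Println("result:", err)
	}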