mirror of
https://github.com/juanfont/headscale.git
synced 2025-08-24 13:46:53 +02:00
integration: rework retry for waiting for node sync
Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
parent
112438219b
commit
66eda92ec0
@ -969,11 +969,6 @@ func (s *State) HandleNodeFromPreAuthKey(
|
|||||||
return node.View(), c, nil
|
return node.View(), c, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// AllocateNextIPs allocates the next available IPv4 and IPv6 addresses.
|
|
||||||
func (s *State) AllocateNextIPs() (*netip.Addr, *netip.Addr, error) {
|
|
||||||
return s.ipAlloc.Next()
|
|
||||||
}
|
|
||||||
|
|
||||||
// updatePolicyManagerUsers updates the policy manager with current users.
|
// updatePolicyManagerUsers updates the policy manager with current users.
|
||||||
// Returns true if the policy changed and notifications should be sent.
|
// Returns true if the policy changed and notifications should be sent.
|
||||||
// TODO(kradalby): This is a temporary stepping stone, ultimately we should
|
// TODO(kradalby): This is a temporary stepping stone, ultimately we should
|
||||||
|
@ -11,6 +11,7 @@ import (
|
|||||||
policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2"
|
policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2"
|
||||||
"github.com/juanfont/headscale/hscontrol/types"
|
"github.com/juanfont/headscale/hscontrol/types"
|
||||||
"github.com/juanfont/headscale/integration/hsic"
|
"github.com/juanfont/headscale/integration/hsic"
|
||||||
|
"github.com/juanfont/headscale/integration/integrationutil"
|
||||||
"github.com/juanfont/headscale/integration/tsic"
|
"github.com/juanfont/headscale/integration/tsic"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
@ -312,7 +313,7 @@ func TestACLHostsInNetMapTable(t *testing.T) {
|
|||||||
allClients, err := scenario.ListTailscaleClients()
|
allClients, err := scenario.ListTailscaleClients()
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"])
|
err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"], integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
for _, client := range allClients {
|
for _, client := range allClients {
|
||||||
|
@ -14,11 +14,26 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/juanfont/headscale/hscontrol/util"
|
||||||
"github.com/juanfont/headscale/integration/dockertestutil"
|
"github.com/juanfont/headscale/integration/dockertestutil"
|
||||||
"github.com/ory/dockertest/v3"
|
"github.com/ory/dockertest/v3"
|
||||||
"github.com/ory/dockertest/v3/docker"
|
"github.com/ory/dockertest/v3/docker"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// PeerSyncTimeout returns the timeout for peer synchronization based on environment:
|
||||||
|
// 60s for dev, 120s for CI.
|
||||||
|
func PeerSyncTimeout() time.Duration {
|
||||||
|
if util.IsCI() {
|
||||||
|
return 120 * time.Second
|
||||||
|
}
|
||||||
|
return 60 * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
|
// PeerSyncRetryInterval returns the retry interval for peer synchronization checks.
|
||||||
|
func PeerSyncRetryInterval() time.Duration {
|
||||||
|
return 100 * time.Millisecond
|
||||||
|
}
|
||||||
|
|
||||||
func WriteFileToContainer(
|
func WriteFileToContainer(
|
||||||
pool *dockertest.Pool,
|
pool *dockertest.Pool,
|
||||||
container *dockertest.Resource,
|
container *dockertest.Resource,
|
||||||
|
@ -27,6 +27,7 @@ import (
|
|||||||
"github.com/juanfont/headscale/integration/dockertestutil"
|
"github.com/juanfont/headscale/integration/dockertestutil"
|
||||||
"github.com/juanfont/headscale/integration/dsic"
|
"github.com/juanfont/headscale/integration/dsic"
|
||||||
"github.com/juanfont/headscale/integration/hsic"
|
"github.com/juanfont/headscale/integration/hsic"
|
||||||
|
"github.com/juanfont/headscale/integration/integrationutil"
|
||||||
"github.com/juanfont/headscale/integration/tsic"
|
"github.com/juanfont/headscale/integration/tsic"
|
||||||
"github.com/oauth2-proxy/mockoidc"
|
"github.com/oauth2-proxy/mockoidc"
|
||||||
"github.com/ory/dockertest/v3"
|
"github.com/ory/dockertest/v3"
|
||||||
@ -39,6 +40,7 @@ import (
|
|||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
"tailscale.com/envknob"
|
"tailscale.com/envknob"
|
||||||
"tailscale.com/util/mak"
|
"tailscale.com/util/mak"
|
||||||
|
"tailscale.com/util/multierr"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -498,7 +500,7 @@ func (s *Scenario) CreateTailscaleNode(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = tsClient.WaitForNeedsLogin()
|
err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf(
|
return nil, fmt.Errorf(
|
||||||
"failed to wait for tailscaled (%s) to need login: %w",
|
"failed to wait for tailscaled (%s) to need login: %w",
|
||||||
@ -561,7 +563,7 @@ func (s *Scenario) CreateTailscaleNodesInUser(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = tsClient.WaitForNeedsLogin()
|
err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf(
|
return fmt.Errorf(
|
||||||
"failed to wait for tailscaled (%s) to need login: %w",
|
"failed to wait for tailscaled (%s) to need login: %w",
|
||||||
@ -607,7 +609,7 @@ func (s *Scenario) RunTailscaleUp(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, client := range user.Clients {
|
for _, client := range user.Clients {
|
||||||
err := client.WaitForRunning()
|
err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s failed to up tailscale node: %w", client.Hostname(), err)
|
return fmt.Errorf("%s failed to up tailscale node: %w", client.Hostname(), err)
|
||||||
}
|
}
|
||||||
@ -636,7 +638,7 @@ func (s *Scenario) CountTailscale() int {
|
|||||||
func (s *Scenario) WaitForTailscaleSync() error {
|
func (s *Scenario) WaitForTailscaleSync() error {
|
||||||
tsCount := s.CountTailscale()
|
tsCount := s.CountTailscale()
|
||||||
|
|
||||||
err := s.WaitForTailscaleSyncWithPeerCount(tsCount - 1)
|
err := s.WaitForTailscaleSyncWithPeerCount(tsCount-1, integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
for _, user := range s.users {
|
for _, user := range s.users {
|
||||||
for _, client := range user.Clients {
|
for _, client := range user.Clients {
|
||||||
@ -653,19 +655,24 @@ func (s *Scenario) WaitForTailscaleSync() error {
|
|||||||
|
|
||||||
// WaitForTailscaleSyncWithPeerCount blocks execution until every TailscaleClient reports
|
// WaitForTailscaleSyncWithPeerCount blocks execution until every TailscaleClient reports
|
||||||
// having all other TailscaleClients present in its netmap.NetworkMap.
|
// having all other TailscaleClients present in its netmap.NetworkMap.
|
||||||
func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int) error {
|
func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int, timeout, retryInterval time.Duration) error {
|
||||||
|
var allErrors []error
|
||||||
|
|
||||||
for _, user := range s.users {
|
for _, user := range s.users {
|
||||||
for _, client := range user.Clients {
|
for _, client := range user.Clients {
|
||||||
c := client
|
c := client
|
||||||
user.syncWaitGroup.Go(func() error {
|
user.syncWaitGroup.Go(func() error {
|
||||||
return c.WaitForPeers(peerCount)
|
return c.WaitForPeers(peerCount, timeout, retryInterval)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
if err := user.syncWaitGroup.Wait(); err != nil {
|
if err := user.syncWaitGroup.Wait(); err != nil {
|
||||||
return err
|
allErrors = append(allErrors, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(allErrors) > 0 {
|
||||||
|
return multierr.New(allErrors...)
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -767,7 +774,7 @@ func (s *Scenario) RunTailscaleUpWithURL(userStr, loginServer string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, client := range user.Clients {
|
for _, client := range user.Clients {
|
||||||
err := client.WaitForRunning()
|
err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf(
|
return fmt.Errorf(
|
||||||
"%s tailscale node has not reached running: %w",
|
"%s tailscale node has not reached running: %w",
|
||||||
@ -1001,7 +1008,7 @@ func (s *Scenario) WaitForTailscaleLogout() error {
|
|||||||
for _, client := range user.Clients {
|
for _, client := range user.Clients {
|
||||||
c := client
|
c := client
|
||||||
user.syncWaitGroup.Go(func() error {
|
user.syncWaitGroup.Go(func() error {
|
||||||
return c.WaitForNeedsLogin()
|
return c.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
if err := user.syncWaitGroup.Wait(); err != nil {
|
if err := user.syncWaitGroup.Wait(); err != nil {
|
||||||
|
@ -4,6 +4,7 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"net/netip"
|
"net/netip"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/juanfont/headscale/hscontrol/types"
|
"github.com/juanfont/headscale/hscontrol/types"
|
||||||
"github.com/juanfont/headscale/hscontrol/util"
|
"github.com/juanfont/headscale/hscontrol/util"
|
||||||
@ -40,9 +41,9 @@ type TailscaleClient interface {
|
|||||||
DebugDERPRegion(region string) (*ipnstate.DebugDERPRegionReport, error)
|
DebugDERPRegion(region string) (*ipnstate.DebugDERPRegionReport, error)
|
||||||
GetNodePrivateKey() (*key.NodePrivate, error)
|
GetNodePrivateKey() (*key.NodePrivate, error)
|
||||||
Netcheck() (*netcheck.Report, error)
|
Netcheck() (*netcheck.Report, error)
|
||||||
WaitForNeedsLogin() error
|
WaitForNeedsLogin(timeout time.Duration) error
|
||||||
WaitForRunning() error
|
WaitForRunning(timeout time.Duration) error
|
||||||
WaitForPeers(expected int) error
|
WaitForPeers(expected int, timeout, retryInterval time.Duration) error
|
||||||
Ping(hostnameOrIP string, opts ...tsic.PingOption) error
|
Ping(hostnameOrIP string, opts ...tsic.PingOption) error
|
||||||
Curl(url string, opts ...tsic.CurlOption) (string, error)
|
Curl(url string, opts ...tsic.CurlOption) (string, error)
|
||||||
Traceroute(netip.Addr) (util.Traceroute, error)
|
Traceroute(netip.Addr) (util.Traceroute, error)
|
||||||
|
@ -31,6 +31,7 @@ import (
|
|||||||
"tailscale.com/paths"
|
"tailscale.com/paths"
|
||||||
"tailscale.com/types/key"
|
"tailscale.com/types/key"
|
||||||
"tailscale.com/types/netmap"
|
"tailscale.com/types/netmap"
|
||||||
|
"tailscale.com/util/multierr"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -529,7 +530,7 @@ func (t *TailscaleInContainer) Logout() error {
|
|||||||
return fmt.Errorf("failed to logout, stdout: %s, stderr: %s", stdout, stderr)
|
return fmt.Errorf("failed to logout, stdout: %s, stderr: %s", stdout, stderr)
|
||||||
}
|
}
|
||||||
|
|
||||||
return t.waitForBackendState("NeedsLogin")
|
return t.waitForBackendState("NeedsLogin", integrationutil.PeerSyncTimeout())
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper that runs `tailscale up` with no arguments.
|
// Helper that runs `tailscale up` with no arguments.
|
||||||
@ -904,75 +905,115 @@ func (t *TailscaleInContainer) FailingPeersAsString() (string, bool, error) {
|
|||||||
|
|
||||||
// WaitForNeedsLogin blocks until the Tailscale (tailscaled) instance has
|
// WaitForNeedsLogin blocks until the Tailscale (tailscaled) instance has
|
||||||
// started and needs to be logged into.
|
// started and needs to be logged into.
|
||||||
func (t *TailscaleInContainer) WaitForNeedsLogin() error {
|
func (t *TailscaleInContainer) WaitForNeedsLogin(timeout time.Duration) error {
|
||||||
return t.waitForBackendState("NeedsLogin")
|
return t.waitForBackendState("NeedsLogin", timeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
// WaitForRunning blocks until the Tailscale (tailscaled) instance is logged in
|
// WaitForRunning blocks until the Tailscale (tailscaled) instance is logged in
|
||||||
// and ready to be used.
|
// and ready to be used.
|
||||||
func (t *TailscaleInContainer) WaitForRunning() error {
|
func (t *TailscaleInContainer) WaitForRunning(timeout time.Duration) error {
|
||||||
return t.waitForBackendState("Running")
|
return t.waitForBackendState("Running", timeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *TailscaleInContainer) waitForBackendState(state string) error {
|
func (t *TailscaleInContainer) waitForBackendState(state string, timeout time.Duration) error {
|
||||||
return t.pool.Retry(func() error {
|
ticker := time.NewTicker(integrationutil.PeerSyncRetryInterval())
|
||||||
status, err := t.Status()
|
defer ticker.Stop()
|
||||||
if err != nil {
|
|
||||||
return errTailscaleStatus(t.hostname, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ipnstate.Status.CurrentTailnet was added in Tailscale 1.22.0
|
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||||
// https://github.com/tailscale/tailscale/pull/3865
|
defer cancel()
|
||||||
//
|
|
||||||
// Before that, we can check the BackendState to see if the
|
|
||||||
// tailscaled daemon is connected to the control system.
|
|
||||||
if status.BackendState == state {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return errTailscaleNotConnected
|
for {
|
||||||
})
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return fmt.Errorf("timeout waiting for backend state %s on %s after %v", state, t.hostname, timeout)
|
||||||
|
case <-ticker.C:
|
||||||
|
status, err := t.Status()
|
||||||
|
if err != nil {
|
||||||
|
continue // Keep retrying on status errors
|
||||||
|
}
|
||||||
|
|
||||||
|
// ipnstate.Status.CurrentTailnet was added in Tailscale 1.22.0
|
||||||
|
// https://github.com/tailscale/tailscale/pull/3865
|
||||||
|
//
|
||||||
|
// Before that, we can check the BackendState to see if the
|
||||||
|
// tailscaled daemon is connected to the control system.
|
||||||
|
if status.BackendState == state {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// WaitForPeers blocks until the expected number of peers are present in the
|
// WaitForPeers blocks until the expected number of peers are present in the
|
||||||
// Peer list of the Tailscale instance and are reporting Online.
|
// Peer list of the Tailscale instance and are reporting Online.
|
||||||
func (t *TailscaleInContainer) WaitForPeers(expected int) error {
|
//
|
||||||
return t.pool.Retry(func() error {
|
// The method verifies that:
|
||||||
status, err := t.Status()
|
// - The peer count matches the expected value
|
||||||
if err != nil {
|
// - All peers are Online
|
||||||
return errTailscaleStatus(t.hostname, err)
|
// - All peers have a hostname
|
||||||
}
|
// - All peers have a DERP relay assigned
|
||||||
|
//
|
||||||
|
// Uses multierr to collect all validation errors.
|
||||||
|
func (t *TailscaleInContainer) WaitForPeers(expected int, timeout, retryInterval time.Duration) error {
|
||||||
|
ticker := time.NewTicker(retryInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
var lastErrs []error
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
if len(lastErrs) > 0 {
|
||||||
|
return fmt.Errorf("timeout waiting for %d peers on %s after %v, errors: %w", expected, t.hostname, timeout, multierr.New(lastErrs...))
|
||||||
|
}
|
||||||
|
return fmt.Errorf("timeout waiting for %d peers on %s after %v", expected, t.hostname, timeout)
|
||||||
|
case <-ticker.C:
|
||||||
|
status, err := t.Status()
|
||||||
|
if err != nil {
|
||||||
|
lastErrs = []error{errTailscaleStatus(t.hostname, err)}
|
||||||
|
continue // Keep retrying on status errors
|
||||||
|
}
|
||||||
|
|
||||||
|
if peers := status.Peers(); len(peers) != expected {
|
||||||
|
lastErrs = []error{fmt.Errorf(
|
||||||
|
"%s err: %w expected %d, got %d",
|
||||||
|
t.hostname,
|
||||||
|
errTailscaleWrongPeerCount,
|
||||||
|
expected,
|
||||||
|
len(peers),
|
||||||
|
)}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
if peers := status.Peers(); len(peers) != expected {
|
|
||||||
return fmt.Errorf(
|
|
||||||
"%s err: %w expected %d, got %d",
|
|
||||||
t.hostname,
|
|
||||||
errTailscaleWrongPeerCount,
|
|
||||||
expected,
|
|
||||||
len(peers),
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
// Verify that each peer of a given node is Online,
|
// Verify that each peer of a given node is Online,
|
||||||
// has a hostname and a DERP relay.
|
// has a hostname and a DERP relay.
|
||||||
for _, peerKey := range peers {
|
var peerErrors []error
|
||||||
|
for _, peerKey := range status.Peers() {
|
||||||
peer := status.Peer[peerKey]
|
peer := status.Peer[peerKey]
|
||||||
|
|
||||||
if !peer.Online {
|
if !peer.Online {
|
||||||
return fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName)
|
peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName))
|
||||||
}
|
}
|
||||||
|
|
||||||
if peer.HostName == "" {
|
if peer.HostName == "" {
|
||||||
return fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName)
|
peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName))
|
||||||
}
|
}
|
||||||
|
|
||||||
if peer.Relay == "" {
|
if peer.Relay == "" {
|
||||||
return fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName)
|
peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
if len(peerErrors) > 0 {
|
||||||
})
|
lastErrs = peerErrors
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
type (
|
type (
|
||||||
|
Loading…
Reference in New Issue
Block a user