Mirror of https://github.com/juanfont/headscale.git
	integration: rework retry for waiting for node sync
Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
parent 9d236571f4
commit 3b16b75fe6
@@ -1418,11 +1418,6 @@ func (s *State) HandleNodeFromPreAuthKey(
 	return savedNode.View(), c, nil
 }

-// AllocateNextIPs allocates the next available IPv4 and IPv6 addresses.
-func (s *State) AllocateNextIPs() (*netip.Addr, *netip.Addr, error) {
-	return s.ipAlloc.Next()
-}
-
 // updatePolicyManagerUsers updates the policy manager with current users.
 // Returns true if the policy changed and notifications should be sent.
 // TODO(kradalby): This is a temporary stepping stone, ultimately we should
@@ -11,6 +11,7 @@ import (
 	policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2"
 	"github.com/juanfont/headscale/hscontrol/types"
 	"github.com/juanfont/headscale/integration/hsic"
+	"github.com/juanfont/headscale/integration/integrationutil"
 	"github.com/juanfont/headscale/integration/tsic"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -312,7 +313,7 @@ func TestACLHostsInNetMapTable(t *testing.T) {
 			allClients, err := scenario.ListTailscaleClients()
 			require.NoError(t, err)

-			err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"])
+			err = scenario.WaitForTailscaleSyncWithPeerCount(testCase.want["user1@test.no"], integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
 			require.NoError(t, err)

 			for _, client := range allClients {
@@ -354,7 +354,11 @@ func TestPreAuthKeyCommand(t *testing.T) {
 			continue
 		}

-		assert.Equal(t, []string{"tag:test1", "tag:test2"}, listedPreAuthKeys[index].GetAclTags())
+		assert.Equal(
+			t,
+			[]string{"tag:test1", "tag:test2"},
+			listedPreAuthKeys[index].GetAclTags(),
+		)
 	}

 	// Test key expiry
@@ -14,11 +14,27 @@ import (
 	"path/filepath"
 	"time"

 	"github.com/juanfont/headscale/hscontrol/types"
+	"github.com/juanfont/headscale/hscontrol/util"
 	"github.com/juanfont/headscale/integration/dockertestutil"
 	"github.com/ory/dockertest/v3"
 	"github.com/ory/dockertest/v3/docker"
 )

+// PeerSyncTimeout returns the timeout for peer synchronization based on environment:
+// 60s for dev, 120s for CI.
+func PeerSyncTimeout() time.Duration {
+	if util.IsCI() {
+		return 120 * time.Second
+	}
+	return 60 * time.Second
+}
+
+// PeerSyncRetryInterval returns the retry interval for peer synchronization checks.
+func PeerSyncRetryInterval() time.Duration {
+	return 100 * time.Millisecond
+}
+
 func WriteFileToContainer(
 	pool *dockertest.Pool,
 	container *dockertest.Resource,
@@ -27,6 +27,7 @@ import (
 	"github.com/juanfont/headscale/integration/dockertestutil"
 	"github.com/juanfont/headscale/integration/dsic"
 	"github.com/juanfont/headscale/integration/hsic"
+	"github.com/juanfont/headscale/integration/integrationutil"
 	"github.com/juanfont/headscale/integration/tsic"
 	"github.com/oauth2-proxy/mockoidc"
 	"github.com/ory/dockertest/v3"
@@ -39,6 +40,7 @@ import (
 	"golang.org/x/sync/errgroup"
 	"tailscale.com/envknob"
 	"tailscale.com/util/mak"
+	"tailscale.com/util/multierr"
 )

 const (
@@ -498,7 +500,7 @@ func (s *Scenario) CreateTailscaleNode(
 		)
 	}

-	err = tsClient.WaitForNeedsLogin()
+	err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
 	if err != nil {
 		return nil, fmt.Errorf(
 			"failed to wait for tailscaled (%s) to need login: %w",
@@ -561,7 +563,7 @@ func (s *Scenario) CreateTailscaleNodesInUser(
 					)
 				}

-				err = tsClient.WaitForNeedsLogin()
+				err = tsClient.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
 				if err != nil {
 					return fmt.Errorf(
 						"failed to wait for tailscaled (%s) to need login: %w",
@@ -607,7 +609,7 @@ func (s *Scenario) RunTailscaleUp(
 		}

 		for _, client := range user.Clients {
-			err := client.WaitForRunning()
+			err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
 			if err != nil {
 				return fmt.Errorf("%s failed to up tailscale node: %w", client.Hostname(), err)
 			}
@@ -636,7 +638,7 @@ func (s *Scenario) CountTailscale() int {
 func (s *Scenario) WaitForTailscaleSync() error {
 	tsCount := s.CountTailscale()

-	err := s.WaitForTailscaleSyncWithPeerCount(tsCount - 1)
+	err := s.WaitForTailscaleSyncWithPeerCount(tsCount-1, integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
 	if err != nil {
 		for _, user := range s.users {
 			for _, client := range user.Clients {
@@ -653,19 +655,24 @@ func (s *Scenario) WaitForTailscaleSync() error {

 // WaitForTailscaleSyncWithPeerCount blocks execution until all the TailscaleClient reports
 // to have all other TailscaleClients present in their netmap.NetworkMap.
-func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int) error {
+func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int, timeout, retryInterval time.Duration) error {
+	var allErrors []error
+
 	for _, user := range s.users {
 		for _, client := range user.Clients {
 			c := client
 			user.syncWaitGroup.Go(func() error {
-				return c.WaitForPeers(peerCount)
+				return c.WaitForPeers(peerCount, timeout, retryInterval)
 			})
 		}
 		if err := user.syncWaitGroup.Wait(); err != nil {
-			return err
+			allErrors = append(allErrors, err)
 		}
 	}

+	if len(allErrors) > 0 {
+		return multierr.New(allErrors...)
+	}
 	return nil
 }

@@ -767,7 +774,7 @@ func (s *Scenario) RunTailscaleUpWithURL(userStr, loginServer string) error {
 		}

 		for _, client := range user.Clients {
-			err := client.WaitForRunning()
+			err := client.WaitForRunning(integrationutil.PeerSyncTimeout())
 			if err != nil {
 				return fmt.Errorf(
 					"%s tailscale node has not reached running: %w",
@@ -1001,7 +1008,7 @@ func (s *Scenario) WaitForTailscaleLogout() error {
 		for _, client := range user.Clients {
 			c := client
 			user.syncWaitGroup.Go(func() error {
-				return c.WaitForNeedsLogin()
+				return c.WaitForNeedsLogin(integrationutil.PeerSyncTimeout())
 			})
 		}
 		if err := user.syncWaitGroup.Wait(); err != nil {
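The WaitForTailscaleSyncWithPeerCount change above collects every per-user failure and combines them with tailscale.com/util/multierr instead of returning on the first error. A minimal standalone sketch of that aggregation pattern, with made-up error values for illustration:

```go
package main

import (
	"errors"
	"fmt"

	"tailscale.com/util/multierr"
)

func main() {
	// Collect every failure instead of bailing out on the first one,
	// mirroring how the scenario gathers per-user sync errors above.
	var allErrors []error
	allErrors = append(allErrors, errors.New("user1: expected 3 peers, got 2"))
	allErrors = append(allErrors, errors.New("user2: peer ts-node-1 is not online"))

	if len(allErrors) > 0 {
		// multierr.New combines the slice into a single error value
		// that reports all of the underlying errors.
		fmt.Println(multierr.New(allErrors...))
	}
}
```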
@@ -4,6 +4,7 @@ import (
 	"io"
 	"net/netip"
 	"net/url"
+	"time"

 	"github.com/juanfont/headscale/hscontrol/types"
 	"github.com/juanfont/headscale/hscontrol/util"
@@ -40,9 +41,9 @@ type TailscaleClient interface {
 	DebugDERPRegion(region string) (*ipnstate.DebugDERPRegionReport, error)
 	GetNodePrivateKey() (*key.NodePrivate, error)
 	Netcheck() (*netcheck.Report, error)
-	WaitForNeedsLogin() error
-	WaitForRunning() error
-	WaitForPeers(expected int) error
+	WaitForNeedsLogin(timeout time.Duration) error
+	WaitForRunning(timeout time.Duration) error
+	WaitForPeers(expected int, timeout, retryInterval time.Duration) error
 	Ping(hostnameOrIP string, opts ...tsic.PingOption) error
 	Curl(url string, opts ...tsic.CurlOption) (string, error)
 	Traceroute(netip.Addr) (util.Traceroute, error)
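The hunks below drop dockertest's pool.Retry in favour of an explicit poll loop: a ticker for the retry interval and a context deadline for the overall timeout. A minimal sketch of that shape, using an illustrative waitFor helper and condition callback that are not part of the diff:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// waitFor polls condition on every tick until it succeeds or the deadline
// expires, keeping the last failure for the timeout message. This mirrors
// the structure of waitForBackendState and WaitForPeers in the hunks below.
func waitFor(timeout, retryInterval time.Duration, condition func() error) error {
	ticker := time.NewTicker(retryInterval)
	defer ticker.Stop()

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	var lastErr error
	for {
		select {
		case <-ctx.Done():
			if lastErr != nil {
				return fmt.Errorf("timeout after %v: %w", timeout, lastErr)
			}
			return fmt.Errorf("timeout after %v", timeout)
		case <-ticker.C:
			if err := condition(); err != nil {
				lastErr = err
				continue // keep retrying until the deadline
			}
			return nil
		}
	}
}

func main() {
	start := time.Now()
	err := waitFor(2*time.Second, 100*time.Millisecond, func() error {
		if time.Since(start) < 500*time.Millisecond {
			return errors.New("not ready yet")
		}
		return nil
	})
	fmt.Println(err) // <nil> once the condition holds before the deadline
}
```

Compared to pool.Retry, the timeout and poll interval are explicit parameters, which is what lets PeerSyncTimeout() give CI a larger budget than local runs.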
@@ -31,6 +31,7 @@ import (
 	"tailscale.com/paths"
 	"tailscale.com/types/key"
 	"tailscale.com/types/netmap"
+	"tailscale.com/util/multierr"
 )

 const (
@@ -529,7 +530,7 @@ func (t *TailscaleInContainer) Logout() error {
 		return fmt.Errorf("failed to logout, stdout: %s, stderr: %s", stdout, stderr)
 	}

-	return t.waitForBackendState("NeedsLogin")
+	return t.waitForBackendState("NeedsLogin", integrationutil.PeerSyncTimeout())
 }

 // Helper that runs `tailscale up` with no arguments.
@@ -904,21 +905,31 @@ func (t *TailscaleInContainer) FailingPeersAsString() (string, bool, error) {

 // WaitForNeedsLogin blocks until the Tailscale (tailscaled) instance has
 // started and needs to be logged into.
-func (t *TailscaleInContainer) WaitForNeedsLogin() error {
-	return t.waitForBackendState("NeedsLogin")
+func (t *TailscaleInContainer) WaitForNeedsLogin(timeout time.Duration) error {
+	return t.waitForBackendState("NeedsLogin", timeout)
 }

 // WaitForRunning blocks until the Tailscale (tailscaled) instance is logged in
 // and ready to be used.
-func (t *TailscaleInContainer) WaitForRunning() error {
-	return t.waitForBackendState("Running")
+func (t *TailscaleInContainer) WaitForRunning(timeout time.Duration) error {
+	return t.waitForBackendState("Running", timeout)
 }

-func (t *TailscaleInContainer) waitForBackendState(state string) error {
-	return t.pool.Retry(func() error {
+func (t *TailscaleInContainer) waitForBackendState(state string, timeout time.Duration) error {
+	ticker := time.NewTicker(integrationutil.PeerSyncRetryInterval())
+	defer ticker.Stop()
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return fmt.Errorf("timeout waiting for backend state %s on %s after %v", state, t.hostname, timeout)
+		case <-ticker.C:
 			status, err := t.Status()
 			if err != nil {
-				return errTailscaleStatus(t.hostname, err)
+				continue // Keep retrying on status errors
 			}

 			// ipnstate.Status.CurrentTailnet was added in Tailscale 1.22.0
@@ -929,50 +940,80 @@ func (t *TailscaleInContainer) waitForBackendState(state string) error {
 			if status.BackendState == state {
 				return nil
 			}

-		return errTailscaleNotConnected
-	})
+		}
+	}
 }

 // WaitForPeers blocks until N number of peers is present in the
 // Peer list of the Tailscale instance and is reporting Online.
-func (t *TailscaleInContainer) WaitForPeers(expected int) error {
-	return t.pool.Retry(func() error {
+//
+// The method verifies that each peer:
+// - Has the expected peer count
+// - All peers are Online
+// - All peers have a hostname
+// - All peers have a DERP relay assigned
+//
+// Uses multierr to collect all validation errors.
+func (t *TailscaleInContainer) WaitForPeers(expected int, timeout, retryInterval time.Duration) error {
+	ticker := time.NewTicker(retryInterval)
+	defer ticker.Stop()
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	var lastErrs []error
+	for {
+		select {
+		case <-ctx.Done():
+			if len(lastErrs) > 0 {
+				return fmt.Errorf("timeout waiting for %d peers on %s after %v, errors: %w", expected, t.hostname, timeout, multierr.New(lastErrs...))
+			}
+			return fmt.Errorf("timeout waiting for %d peers on %s after %v", expected, t.hostname, timeout)
+		case <-ticker.C:
 			status, err := t.Status()
 			if err != nil {
-				return errTailscaleStatus(t.hostname, err)
+				lastErrs = []error{errTailscaleStatus(t.hostname, err)}
+				continue // Keep retrying on status errors
 			}

 			if peers := status.Peers(); len(peers) != expected {
-				return fmt.Errorf(
+				lastErrs = []error{fmt.Errorf(
 					"%s err: %w expected %d, got %d",
 					t.hostname,
 					errTailscaleWrongPeerCount,
 					expected,
 					len(peers),
-				)
-			} else {
+				)}
+				continue
+			}
+
 			// Verify that the peers of a given node is Online
 			// has a hostname and a DERP relay.
-			for _, peerKey := range peers {
+			var peerErrors []error
+			for _, peerKey := range status.Peers() {
 				peer := status.Peer[peerKey]

 				if !peer.Online {
-					return fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName)
+					peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s is not online", t.hostname, peer.HostName))
 				}

 				if peer.HostName == "" {
-					return fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName)
+					peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a Hostname", t.hostname, peer.HostName))
 				}

 				if peer.Relay == "" {
-					return fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName)
-				}
+					peerErrors = append(peerErrors, fmt.Errorf("[%s] peer count correct, but %s does not have a DERP", t.hostname, peer.HostName))
+				}
 			}

+			if len(peerErrors) > 0 {
+				lastErrs = peerErrors
+				continue
+			}
+
 			return nil
-	})
+		}
+	}
 }

 type (