From cce63a27d61b915b32be638bac354c6a1a041cfc Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Thu, 21 Aug 2025 12:17:43 +0200 Subject: [PATCH] derp: add retry, do not replace if fails This commit makes updating of the DERP map from file and URL more robust by retrying with exponential backoff if it fails and, upon failure, keeping the old DERP map if we cannot successfully build a new one. Fixes #2694 Signed-off-by: Kristoffer Dalby --- hscontrol/app.go | 45 ++++++++++++++++++---------- hscontrol/derp/derp.go | 32 ++++---------------- hscontrol/derp/server/derp_server.go | 12 ++++---- hscontrol/mapper/builder.go | 2 +- hscontrol/state/state.go | 28 +++++++++-------- 5 files changed, 57 insertions(+), 62 deletions(-) diff --git a/hscontrol/app.go b/hscontrol/app.go index ec8e2550..774aec46 100644 --- a/hscontrol/app.go +++ b/hscontrol/app.go @@ -17,6 +17,7 @@ import ( "syscall" "time" + "github.com/cenkalti/backoff/v5" "github.com/davecgh/go-spew/spew" "github.com/gorilla/mux" grpcRuntime "github.com/grpc-ecosystem/grpc-gateway/v2/runtime" @@ -284,11 +285,23 @@ func (h *Headscale) scheduledTasks(ctx context.Context) { case <-derpTickerChan: log.Info().Msg("Fetching DERPMap updates") - derpMap := derp.GetDERPMap(h.cfg.DERP) - if h.cfg.DERP.ServerEnabled && h.cfg.DERP.AutomaticallyAddEmbeddedDerpRegion { - region, _ := h.DERPServer.GenerateRegion() - derpMap.Regions[region.RegionID] = &region + derpMap, err := backoff.Retry(ctx, func() (*tailcfg.DERPMap, error) { + derpMap, err := derp.GetDERPMap(h.cfg.DERP) + if err != nil { + return nil, err + } + if h.cfg.DERP.ServerEnabled && h.cfg.DERP.AutomaticallyAddEmbeddedDerpRegion { + region, _ := h.DERPServer.GenerateRegion() + derpMap.Regions[region.RegionID] = &region + } + + return derpMap, nil + }, backoff.WithBackOff(backoff.NewExponentialBackOff())) + if err != nil { + log.Error().Err(err).Msg("failed to build new DERPMap, retrying later") + continue } + h.state.SetDERPMap(derpMap) h.Change(change.DERPSet) @@ -516,29 
+529,31 @@ func (h *Headscale) Serve() error { h.mapBatcher.Start() defer h.mapBatcher.Close() - // TODO(kradalby): fix state part. if h.cfg.DERP.ServerEnabled { // When embedded DERP is enabled we always need a STUN server if h.cfg.DERP.STUNAddr == "" { return errSTUNAddressNotSet } - region, err := h.DERPServer.GenerateRegion() - if err != nil { - return fmt.Errorf("generating DERP region for embedded server: %w", err) - } - - if h.cfg.DERP.AutomaticallyAddEmbeddedDerpRegion { - h.state.DERPMap().Regions[region.RegionID] = &region - } - go h.DERPServer.ServeSTUN() } - if len(h.state.DERPMap().Regions) == 0 { + derpMap, err := derp.GetDERPMap(h.cfg.DERP) + if err != nil { + return fmt.Errorf("failed to get DERPMap: %w", err) + } + + if h.cfg.DERP.ServerEnabled && h.cfg.DERP.AutomaticallyAddEmbeddedDerpRegion { + region, _ := h.DERPServer.GenerateRegion() + derpMap.Regions[region.RegionID] = &region + } + + if len(derpMap.Regions) == 0 { return errEmptyInitialDERPMap } + h.state.SetDERPMap(derpMap) + // Start ephemeral node garbage collector and schedule all nodes // that are already in the database and ephemeral. If they are still // around between restarts, they will reconnect and the GC will diff --git a/hscontrol/derp/derp.go b/hscontrol/derp/derp.go index 1ed619ec..c6eb52bb 100644 --- a/hscontrol/derp/derp.go +++ b/hscontrol/derp/derp.go @@ -10,7 +10,7 @@ import ( "os" "github.com/juanfont/headscale/hscontrol/types" - "github.com/rs/zerolog/log" + "github.com/spf13/viper" "gopkg.in/yaml.v3" "tailscale.com/tailcfg" ) @@ -79,26 +79,16 @@ func mergeDERPMaps(derpMaps []*tailcfg.DERPMap) *tailcfg.DERPMap { return &result } -func GetDERPMap(cfg types.DERPConfig) *tailcfg.DERPMap { +func GetDERPMap(cfg types.DERPConfig) (*tailcfg.DERPMap, error) { var derpMaps []*tailcfg.DERPMap if cfg.DERPMap != nil { derpMaps = append(derpMaps, cfg.DERPMap) } for _, path := range cfg.Paths { - log.Debug(). - Str("func", "GetDERPMap"). - Str("path", path). 
- Msg("Loading DERPMap from path") derpMap, err := loadDERPMapFromPath(path) if err != nil { - log.Error(). - Str("func", "GetDERPMap"). - Str("path", path). - Err(err). - Msg("Could not load DERP map from path") - - break + return nil, err } derpMaps = append(derpMaps, derpMap) @@ -106,18 +96,8 @@ func GetDERPMap(cfg types.DERPConfig) *tailcfg.DERPMap { for _, addr := range cfg.URLs { derpMap, err := loadDERPMapFromURL(addr) - log.Debug(). - Str("func", "GetDERPMap"). - Str("url", addr.String()). - Msg("Loading DERPMap from path") if err != nil { - log.Error(). - Str("func", "GetDERPMap"). - Str("url", addr.String()). - Err(err). - Msg("Could not load DERP map from path") - - break + return nil, err } derpMaps = append(derpMaps, derpMap) @@ -125,7 +105,5 @@ func GetDERPMap(cfg types.DERPConfig) *tailcfg.DERPMap { derpMap := mergeDERPMaps(derpMaps) - log.Trace().Interface("derpMap", derpMap).Msg("DERPMap loaded") - - return derpMap + return derpMap, nil } diff --git a/hscontrol/derp/server/derp_server.go b/hscontrol/derp/server/derp_server.go index fee395f1..b8f892be 100644 --- a/hscontrol/derp/server/derp_server.go +++ b/hscontrol/derp/server/derp_server.go @@ -276,7 +276,7 @@ func DERPProbeHandler( // An example implementation is found here https://derp.tailscale.com/bootstrap-dns // Coordination server is included automatically, since local DERP is using the same DNS Name in d.serverURL. 
func DERPBootstrapDNSHandler( - derpMap *tailcfg.DERPMap, + derpMap tailcfg.DERPMapView, ) func(http.ResponseWriter, *http.Request) { return func( writer http.ResponseWriter, @@ -287,18 +287,18 @@ func DERPBootstrapDNSHandler( resolvCtx, cancel := context.WithTimeout(req.Context(), time.Minute) defer cancel() var resolver net.Resolver - for _, region := range derpMap.Regions { - for _, node := range region.Nodes { // we don't care if we override some nodes - addrs, err := resolver.LookupIP(resolvCtx, "ip", node.HostName) + for _, region := range derpMap.Regions().All() { + for _, node := range region.Nodes().All() { // we don't care if we override some nodes + addrs, err := resolver.LookupIP(resolvCtx, "ip", node.HostName()) if err != nil { log.Trace(). Caller(). Err(err). - Msgf("bootstrap DNS lookup failed %q", node.HostName) + Msgf("bootstrap DNS lookup failed %q", node.HostName()) continue } - dnsEntries[node.HostName] = addrs + dnsEntries[node.HostName()] = addrs } } writer.Header().Set("Content-Type", "application/json") diff --git a/hscontrol/mapper/builder.go b/hscontrol/mapper/builder.go index b6102c01..111724bc 100644 --- a/hscontrol/mapper/builder.go +++ b/hscontrol/mapper/builder.go @@ -79,7 +79,7 @@ func (b *MapResponseBuilder) WithSelfNode() *MapResponseBuilder { // WithDERPMap adds the DERP map to the response func (b *MapResponseBuilder) WithDERPMap() *MapResponseBuilder { - b.resp.DERPMap = b.mapper.state.DERPMap() + b.resp.DERPMap = b.mapper.state.DERPMap().AsStruct() return b } diff --git a/hscontrol/state/state.go b/hscontrol/state/state.go index 02d5d3cd..0a743184 100644 --- a/hscontrol/state/state.go +++ b/hscontrol/state/state.go @@ -9,10 +9,10 @@ import ( "io" "net/netip" "os" + "sync/atomic" "time" hsdb "github.com/juanfont/headscale/hscontrol/db" - "github.com/juanfont/headscale/hscontrol/derp" "github.com/juanfont/headscale/hscontrol/policy" "github.com/juanfont/headscale/hscontrol/policy/matcher" 
"github.com/juanfont/headscale/hscontrol/routes" @@ -55,7 +55,7 @@ type State struct { // ipAlloc manages IP address allocation for nodes ipAlloc *hsdb.IPAllocator // derpMap contains the current DERP relay configuration - derpMap *tailcfg.DERPMap + derpMap atomic.Pointer[tailcfg.DERPMap] // polMan handles policy evaluation and management polMan policy.PolicyManager // registrationCache caches node registration data to reduce database load @@ -86,8 +86,6 @@ func NewState(cfg *types.Config) (*State, error) { return nil, fmt.Errorf("init ip allocatior: %w", err) } - derpMap := derp.GetDERPMap(cfg.DERP) - nodes, err := db.ListNodes() if err != nil { return nil, fmt.Errorf("loading nodes: %w", err) @@ -107,17 +105,17 @@ func NewState(cfg *types.Config) (*State, error) { return nil, fmt.Errorf("init policy manager: %w", err) } - return &State{ + s := &State{ cfg: cfg, - db: db, - ipAlloc: ipAlloc, - // TODO(kradalby): Update DERPMap - derpMap: derpMap, + db: db, + ipAlloc: ipAlloc, polMan: polMan, registrationCache: registrationCache, primaryRoutes: routes.New(), - }, nil + } + + return s, nil } // Close gracefully shuts down the State instance and releases all resources. @@ -170,9 +168,14 @@ func policyBytes(db *hsdb.HSDatabase, cfg *types.Config) ([]byte, error) { return nil, fmt.Errorf("%w: %s", ErrUnsupportedPolicyMode, cfg.Policy.Mode) } +// SetDERPMap updates the DERP relay configuration. +func (s *State) SetDERPMap(dm *tailcfg.DERPMap) { + s.derpMap.Store(dm) +} + // DERPMap returns the current DERP relay configuration for peer-to-peer connectivity. -func (s *State) DERPMap() *tailcfg.DERPMap { - return s.derpMap +func (s *State) DERPMap() tailcfg.DERPMapView { + return s.derpMap.Load().View() } // ReloadPolicy reloads the access control policy and triggers auto-approval if changed. 
@@ -209,7 +212,6 @@ func (s *State) CreateUser(user types.User) (*types.User, bool, error) { s.mu.Lock() defer s.mu.Unlock() - if err := s.db.DB.Save(&user).Error; err != nil { return nil, false, fmt.Errorf("creating user: %w", err) }