From 9b9f481c9c487e461079311a05fc5e1cf25776e2 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Tue, 19 Aug 2025 10:14:43 +0000 Subject: [PATCH] fix(state): resolve NodeStore synchronization race conditions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes CLI timeout issues in TestNodeMoveCommand by ensuring NodeStore remains the single source of truth for all node operations. - Fix updateNodeTx to only return errors, not stale database NodeViews - All callers now read updated NodeView from NodeStore after DB updates - Add nil check for IsOnline in node.Proto() to prevent panics - Add TODO comments to validate NodeStore read patterns This eliminates race conditions where database reads could return partially initialized nodes with nil IsOnline pointers, causing gRPC server panics and CLI command timeouts. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- hscontrol/state/state.go | 60 ++++++++++++++++++++++++++++++---------- hscontrol/types/node.go | 2 +- 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/hscontrol/state/state.go b/hscontrol/state/state.go index 27c72d75..2468d1ba 100644 --- a/hscontrol/state/state.go +++ b/hscontrol/state/state.go @@ -340,11 +340,12 @@ func (s *State) ListAllUsers() ([]types.User, error) { // IMPORTANT: This function does NOT update the NodeStore. The caller MUST update the NodeStore // BEFORE calling this function with the EXACT same changes that the database update will make. // This ensures the NodeStore is the source of truth for the batcher and maintains consistency. -func (s *State) updateNodeTx(nodeID types.NodeID, updateFn func(tx *gorm.DB) error) (types.NodeView, error) { +// Returns error only; callers should get the updated NodeView from NodeStore to maintain consistency. +func (s *State) updateNodeTx(nodeID types.NodeID, updateFn func(tx *gorm.DB) error) error { s.mu.Lock() defer s.mu.Unlock() - node, err := hsdb.Write(s.db.DB, func(tx *gorm.DB) (*types.Node, error) { + _, err := hsdb.Write(s.db.DB, func(tx *gorm.DB) (*types.Node, error) { if err := updateFn(tx); err != nil { return nil, err } @@ -360,11 +361,7 @@ func (s *State) updateNodeTx(nodeID types.NodeID, updateFn func(tx *gorm.DB) err return node, nil }) - if err != nil { - return types.NodeView{}, err - } - - return node.View(), nil + return err } // persistNodeToDB saves the current state of a node from NodeStore to the database. @@ -446,7 +443,7 @@ func (s *State) Connect(id types.NodeID) change.ChangeSet { // Also persist the last seen time to the database // Note: IsOnline is managed only in NodeStore (marked with gorm:"-"), not persisted to database - _, err := s.updateNodeTx(id, func(tx *gorm.DB) error { + err := s.updateNodeTx(id, func(tx *gorm.DB) error { // Update last_seen in the database return hsdb.SetLastSeen(tx, id, now) }) @@ -486,7 +483,7 @@ func (s *State) Disconnect(id types.NodeID) (change.ChangeSet, error) { n.IsOnline = ptr.To(false) }) - _, err := s.updateNodeTx(id, func(tx *gorm.DB) error { + err := s.updateNodeTx(id, func(tx *gorm.DB) error { // Update last_seen in the database // Note: IsOnline is managed only in NodeStore (marked with gorm:"-"), not persisted to database return hsdb.SetLastSeen(tx, id, now) @@ -623,13 +620,20 @@ func (s *State) SetNodeExpiry(nodeID types.NodeID, expiry time.Time) (types.Node node.Expiry = &expiryPtr }) - n, err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error { + err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error { return hsdb.NodeSetExpiry(tx, nodeID, expiry) }) if err != nil { return types.NodeView{}, change.EmptySet, fmt.Errorf("setting node expiry: %w", err) } + // Get the updated node from NodeStore to ensure consistency + // TODO(kradalby): Validate if this NodeStore read makes sense after database update + n, found := s.GetNodeByID(nodeID) + if !found { + return types.NodeView{}, change.EmptySet, fmt.Errorf("node not found in NodeStore: %d", nodeID) + } + // Check if policy manager needs updating c, err := s.updatePolicyManagerNodes() if err != nil { @@ -652,13 +656,20 @@ func (s *State) SetNodeTags(nodeID types.NodeID, tags []string) (types.NodeView, node.ForcedTags = tags }) - n, err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error { + err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error { return hsdb.SetTags(tx, nodeID, tags) }) if err != nil { return types.NodeView{}, change.EmptySet, fmt.Errorf("setting node tags: %w", err) } + // Get the updated node from NodeStore to ensure consistency + // TODO(kradalby): Validate if this NodeStore read makes sense after database update + n, found := s.GetNodeByID(nodeID) + if !found { + return types.NodeView{}, change.EmptySet, fmt.Errorf("node not found in NodeStore: %d", nodeID) + } + // Check if policy manager needs updating c, err := s.updatePolicyManagerNodes() if err != nil { @@ -681,13 +692,20 @@ func (s *State) SetApprovedRoutes(nodeID types.NodeID, routes []netip.Prefix) (t node.ApprovedRoutes = routes }) - n, err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error { + err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error { return hsdb.SetApprovedRoutes(tx, nodeID, routes) }) if err != nil { return types.NodeView{}, change.EmptySet, fmt.Errorf("setting approved routes: %w", err) } + // Get the updated node from NodeStore to ensure consistency + // TODO(kradalby): Validate if this NodeStore read makes sense after database update + n, found := s.GetNodeByID(nodeID) + if !found { + return types.NodeView{}, change.EmptySet, fmt.Errorf("node not found in NodeStore: %d", nodeID) + } + // Check if policy manager needs updating c, err := s.updatePolicyManagerNodes() if err != nil { @@ -735,13 +753,20 @@ func (s *State) RenameNode(nodeID types.NodeID, newName string) (types.NodeView, node.GivenName = newName }) - n, err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error { + err = s.updateNodeTx(nodeID, func(tx *gorm.DB) error { return hsdb.RenameNode(tx, nodeID, newName) }) if err != nil { return types.NodeView{}, change.EmptySet, fmt.Errorf("renaming node: %w", err) } + // Get the updated node from NodeStore to ensure consistency + // TODO(kradalby): Validate if this NodeStore read makes sense after database update + n, found := s.GetNodeByID(nodeID) + if !found { + return types.NodeView{}, change.EmptySet, fmt.Errorf("node not found in NodeStore: %d", nodeID) + } + // Check if policy manager needs updating c, err := s.updatePolicyManagerNodes() if err != nil { @@ -776,13 +801,20 @@ func (s *State) AssignNodeToUser(nodeID types.NodeID, userID types.UserID) (type n.UserID = uint(userID) }) - n, err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error { + err = s.updateNodeTx(nodeID, func(tx *gorm.DB) error { return hsdb.AssignNodeToUser(tx, nodeID, userID) }) if err != nil { return types.NodeView{}, change.EmptySet, err } + // Get the updated node from NodeStore to ensure consistency + // TODO(kradalby): Validate if this NodeStore read makes sense after database update + n, found := s.GetNodeByID(nodeID) + if !found { + return types.NodeView{}, change.EmptySet, fmt.Errorf("node not found in NodeStore: %d", nodeID) + } + // Check if policy manager needs updating c, err := s.updatePolicyManagerNodes() if err != nil { diff --git a/hscontrol/types/node.go b/hscontrol/types/node.go index d27a0c5b..5dbfe148 100644 --- a/hscontrol/types/node.go +++ b/hscontrol/types/node.go @@ -356,7 +356,7 @@ func (node *Node) Proto() *v1.Node { GivenName: node.GivenName, User: node.User.Proto(), ForcedTags: node.ForcedTags, - Online: *node.IsOnline, + Online: node.IsOnline != nil && *node.IsOnline, // Only ApprovedRoutes and AvailableRoutes is set here. SubnetRoutes has // to be populated manually with PrimaryRoute, to ensure it includes the