1
0
mirror of https://github.com/juanfont/headscale.git synced 2025-08-24 13:46:53 +02:00

fix(state): resolve NodeStore synchronization race conditions

Fixes CLI timeout issues in TestNodeMoveCommand by ensuring NodeStore
remains the single source of truth for all node operations.

- Fix updateNodeTx to only return errors, not stale database NodeViews
- All callers now read updated NodeView from NodeStore after DB updates
- Add nil check for IsOnline in node.Proto() to prevent panics
- Add TODO comments to validate NodeStore read patterns

This eliminates race conditions where database reads could return
partially initialized nodes with nil IsOnline pointers, causing
gRPC server panics and CLI command timeouts.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kristoffer Dalby 2025-08-19 10:14:43 +00:00
parent 3a92b14c1a
commit 9b9f481c9c
2 changed files with 47 additions and 15 deletions

View File

@ -340,11 +340,12 @@ func (s *State) ListAllUsers() ([]types.User, error) {
// IMPORTANT: This function does NOT update the NodeStore. The caller MUST update the NodeStore
// BEFORE calling this function with the EXACT same changes that the database update will make.
// This ensures the NodeStore is the source of truth for the batcher and maintains consistency.
func (s *State) updateNodeTx(nodeID types.NodeID, updateFn func(tx *gorm.DB) error) (types.NodeView, error) {
// Returns error only; callers should get the updated NodeView from NodeStore to maintain consistency.
func (s *State) updateNodeTx(nodeID types.NodeID, updateFn func(tx *gorm.DB) error) error {
s.mu.Lock()
defer s.mu.Unlock()
node, err := hsdb.Write(s.db.DB, func(tx *gorm.DB) (*types.Node, error) {
_, err := hsdb.Write(s.db.DB, func(tx *gorm.DB) (*types.Node, error) {
if err := updateFn(tx); err != nil {
return nil, err
}
@ -360,11 +361,7 @@ func (s *State) updateNodeTx(nodeID types.NodeID, updateFn func(tx *gorm.DB) err
return node, nil
})
if err != nil {
return types.NodeView{}, err
}
return node.View(), nil
return err
}
// persistNodeToDB saves the current state of a node from NodeStore to the database.
@ -446,7 +443,7 @@ func (s *State) Connect(id types.NodeID) change.ChangeSet {
// Also persist the last seen time to the database
// Note: IsOnline is managed only in NodeStore (marked with gorm:"-"), not persisted to database
_, err := s.updateNodeTx(id, func(tx *gorm.DB) error {
err := s.updateNodeTx(id, func(tx *gorm.DB) error {
// Update last_seen in the database
return hsdb.SetLastSeen(tx, id, now)
})
@ -486,7 +483,7 @@ func (s *State) Disconnect(id types.NodeID) (change.ChangeSet, error) {
n.IsOnline = ptr.To(false)
})
_, err := s.updateNodeTx(id, func(tx *gorm.DB) error {
err := s.updateNodeTx(id, func(tx *gorm.DB) error {
// Update last_seen in the database
// Note: IsOnline is managed only in NodeStore (marked with gorm:"-"), not persisted to database
return hsdb.SetLastSeen(tx, id, now)
@ -623,13 +620,20 @@ func (s *State) SetNodeExpiry(nodeID types.NodeID, expiry time.Time) (types.Node
node.Expiry = &expiryPtr
})
n, err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error {
err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error {
return hsdb.NodeSetExpiry(tx, nodeID, expiry)
})
if err != nil {
return types.NodeView{}, change.EmptySet, fmt.Errorf("setting node expiry: %w", err)
}
// Get the updated node from NodeStore to ensure consistency
// TODO(kradalby): Validate if this NodeStore read makes sense after database update
n, found := s.GetNodeByID(nodeID)
if !found {
return types.NodeView{}, change.EmptySet, fmt.Errorf("node not found in NodeStore: %d", nodeID)
}
// Check if policy manager needs updating
c, err := s.updatePolicyManagerNodes()
if err != nil {
@ -652,13 +656,20 @@ func (s *State) SetNodeTags(nodeID types.NodeID, tags []string) (types.NodeView,
node.ForcedTags = tags
})
n, err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error {
err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error {
return hsdb.SetTags(tx, nodeID, tags)
})
if err != nil {
return types.NodeView{}, change.EmptySet, fmt.Errorf("setting node tags: %w", err)
}
// Get the updated node from NodeStore to ensure consistency
// TODO(kradalby): Validate if this NodeStore read makes sense after database update
n, found := s.GetNodeByID(nodeID)
if !found {
return types.NodeView{}, change.EmptySet, fmt.Errorf("node not found in NodeStore: %d", nodeID)
}
// Check if policy manager needs updating
c, err := s.updatePolicyManagerNodes()
if err != nil {
@ -681,13 +692,20 @@ func (s *State) SetApprovedRoutes(nodeID types.NodeID, routes []netip.Prefix) (t
node.ApprovedRoutes = routes
})
n, err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error {
err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error {
return hsdb.SetApprovedRoutes(tx, nodeID, routes)
})
if err != nil {
return types.NodeView{}, change.EmptySet, fmt.Errorf("setting approved routes: %w", err)
}
// Get the updated node from NodeStore to ensure consistency
// TODO(kradalby): Validate if this NodeStore read makes sense after database update
n, found := s.GetNodeByID(nodeID)
if !found {
return types.NodeView{}, change.EmptySet, fmt.Errorf("node not found in NodeStore: %d", nodeID)
}
// Check if policy manager needs updating
c, err := s.updatePolicyManagerNodes()
if err != nil {
@ -735,13 +753,20 @@ func (s *State) RenameNode(nodeID types.NodeID, newName string) (types.NodeView,
node.GivenName = newName
})
n, err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error {
err = s.updateNodeTx(nodeID, func(tx *gorm.DB) error {
return hsdb.RenameNode(tx, nodeID, newName)
})
if err != nil {
return types.NodeView{}, change.EmptySet, fmt.Errorf("renaming node: %w", err)
}
// Get the updated node from NodeStore to ensure consistency
// TODO(kradalby): Validate if this NodeStore read makes sense after database update
n, found := s.GetNodeByID(nodeID)
if !found {
return types.NodeView{}, change.EmptySet, fmt.Errorf("node not found in NodeStore: %d", nodeID)
}
// Check if policy manager needs updating
c, err := s.updatePolicyManagerNodes()
if err != nil {
@ -776,13 +801,20 @@ func (s *State) AssignNodeToUser(nodeID types.NodeID, userID types.UserID) (type
n.UserID = uint(userID)
})
n, err := s.updateNodeTx(nodeID, func(tx *gorm.DB) error {
err = s.updateNodeTx(nodeID, func(tx *gorm.DB) error {
return hsdb.AssignNodeToUser(tx, nodeID, userID)
})
if err != nil {
return types.NodeView{}, change.EmptySet, err
}
// Get the updated node from NodeStore to ensure consistency
// TODO(kradalby): Validate if this NodeStore read makes sense after database update
n, found := s.GetNodeByID(nodeID)
if !found {
return types.NodeView{}, change.EmptySet, fmt.Errorf("node not found in NodeStore: %d", nodeID)
}
// Check if policy manager needs updating
c, err := s.updatePolicyManagerNodes()
if err != nil {

View File

@ -356,7 +356,7 @@ func (node *Node) Proto() *v1.Node {
GivenName: node.GivenName,
User: node.User.Proto(),
ForcedTags: node.ForcedTags,
Online: *node.IsOnline,
Online: node.IsOnline != nil && *node.IsOnline,
// Only ApprovedRoutes and AvailableRoutes is set here. SubnetRoutes has
// to be populated manually with PrimaryRoute, to ensure it includes the