From 3c8e194d8bead1a74f73e49f59d88e51328a0c3a Mon Sep 17 00:00:00 2001
From: Kristoffer Dalby <kristoffer@tailscale.com>
Date: Thu, 19 Jun 2025 15:19:52 +0200
Subject: [PATCH] cmd/hi: improve reliability and remove backwards
 compatibility

- Add retry logic for container removal with exponential backoff
- Replace arbitrary sleep with deterministic container finalization checking
- Use label-based container discovery exclusively (remove time-based fallbacks)
- Add random hash to run IDs to prevent collisions (YYYYMMDD-HHMMSS-HASH)
- Create shared utility functions for consistent run ID handling
- Distinguish between expected and unexpected extraction errors
- Consolidate duplicate labeling calls in Tailscale container creation
- Remove backwards compatibility code for cleaner behavior
---
 cmd/hi/cleanup.go                    |  41 ++++++-
 cmd/hi/docker.go                     | 161 ++++++++++++++-------------
 cmd/hi/tar_utils.go                  |   8 +-
 integration/dockertestutil/config.go |  30 ++++-
 integration/tsic/tsic.go             |  12 +-
 5 files changed, 158 insertions(+), 94 deletions(-)

diff --git a/cmd/hi/cleanup.go b/cmd/hi/cleanup.go
index 17fdb594..080266d8 100644
--- a/cmd/hi/cleanup.go
+++ b/cmd/hi/cleanup.go
@@ -10,6 +10,7 @@ import (
 	"github.com/docker/docker/api/types/filters"
 	"github.com/docker/docker/api/types/image"
 	"github.com/docker/docker/client"
+	"github.com/docker/docker/errdefs"
 )
 
 // cleanupBeforeTest performs cleanup operations before running tests.
@@ -66,10 +67,8 @@ func killTestContainers(ctx context.Context) error {
 				_ = cli.ContainerKill(ctx, cont.ID, "KILL")
 			}
 			
-			// Then remove the container
-			if err := cli.ContainerRemove(ctx, cont.ID, container.RemoveOptions{
-				Force: true,
-			}); err == nil {
+			// Then remove the container with retry logic
+			if removeContainerWithRetry(ctx, cli, cont.ID) {
 				removed++
 			}
 		}
@@ -84,6 +83,32 @@ func killTestContainers(ctx context.Context) error {
 	return nil
 }
 
+// removeContainerWithRetry attempts to remove a container with exponential backoff retry logic.
+func removeContainerWithRetry(ctx context.Context, cli *client.Client, containerID string) bool {
+	maxRetries := 3
+	baseDelay := 100 * time.Millisecond
+	
+	for attempt := 0; attempt < maxRetries; attempt++ {
+		err := cli.ContainerRemove(ctx, containerID, container.RemoveOptions{
+			Force: true,
+		})
+		if err == nil {
+			return true
+		}
+		
+		// If this is the last attempt, don't wait
+		if attempt == maxRetries-1 {
+			break
+		}
+		
+		// Wait with exponential backoff
+		delay := baseDelay * time.Duration(1<<attempt)
+		time.Sleep(delay)
+	}
+	
+	return false
+}
+
 // pruneDockerNetworks removes unused Docker networks.
 func pruneDockerNetworks(ctx context.Context) error {
 	cli, err := createDockerClient()
@@ -167,7 +192,13 @@ func cleanCacheVolume(ctx context.Context) error {
 	volumeName := "hs-integration-go-cache"
 	err = cli.VolumeRemove(ctx, volumeName, true)
 	if err != nil {
-		fmt.Printf("Go module cache volume not found or already removed\n")
+		if errdefs.IsNotFound(err) {
+			fmt.Printf("Go module cache volume not found: %s\n", volumeName)
+		} else if errdefs.IsConflict(err) {
+			fmt.Printf("Go module cache volume is in use and cannot be removed: %s\n", volumeName)
+		} else {
+			fmt.Printf("Failed to remove Go module cache volume %s: %v\n", volumeName, err)
+		}
 	} else {
 		fmt.Printf("Removed Go module cache volume: %s\n", volumeName)
 	}
diff --git a/cmd/hi/docker.go b/cmd/hi/docker.go
index 821ea11f..2fab993b 100644
--- a/cmd/hi/docker.go
+++ b/cmd/hi/docker.go
@@ -19,6 +19,7 @@ import (
 	"github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/client"
 	"github.com/docker/docker/pkg/stdcopy"
+	"github.com/juanfont/headscale/integration/dockertestutil"
 )
 
 var (
@@ -35,7 +36,7 @@ func runTestContainer(ctx context.Context, config *RunConfig) error {
 	}
 	defer cli.Close()
 
-	runID := generateRunID()
+	runID := dockertestutil.GenerateRunID()
 	containerName := "headscale-test-suite-" + runID
 	logsDir := filepath.Join(config.LogsDir, runID)
 
@@ -91,8 +92,10 @@ func runTestContainer(ctx context.Context, config *RunConfig) error {
 
 	exitCode, err := streamAndWait(ctx, cli, resp.ID)
 
-	// Give the container a moment to flush any final artifacts
-	time.Sleep(2 * time.Second)
+	// Ensure all containers have finished and logs are flushed before extracting artifacts
+	if waitErr := waitForContainerFinalization(ctx, cli, resp.ID, config.Verbose); waitErr != nil && config.Verbose {
+		log.Printf("Warning: failed to wait for container finalization: %v", waitErr)
+	}
 
 	// Extract artifacts from test containers before cleanup
 	if err := extractArtifactsFromContainers(ctx, resp.ID, logsDir, config.Verbose); err != nil && config.Verbose {
@@ -152,7 +155,7 @@ func createGoTestContainer(ctx context.Context, cli *client.Client, config *RunC
 
 	projectRoot := findProjectRoot(pwd)
 
-	runID := generateRunIDFromContainerName(containerName)
+	runID := dockertestutil.ExtractRunIDFromContainerName(containerName)
 	
 	env := []string{
 		fmt.Sprintf("HEADSCALE_INTEGRATION_POSTGRES=%d", boolToInt(config.UsePostgres)),
@@ -225,23 +228,69 @@ func streamAndWait(ctx context.Context, cli *client.Client, containerID string)
 	return -1, ErrUnexpectedContainerWait
 }
 
-// generateRunID creates a unique timestamp-based run identifier.
-func generateRunID() string {
-	now := time.Now()
-	timestamp := now.Format("20060102-150405")
-	return timestamp
+// waitForContainerFinalization ensures all test containers have properly finished and flushed their output.
+func waitForContainerFinalization(ctx context.Context, cli *client.Client, testContainerID string, verbose bool) error {
+	// First, get all related test containers
+	containers, err := cli.ContainerList(ctx, container.ListOptions{All: true})
+	if err != nil {
+		return fmt.Errorf("failed to list containers: %w", err)
+	}
+
+	testContainers := getCurrentTestContainers(containers, testContainerID, verbose)
+	
+	// Wait for all test containers to reach a final state
+	maxWaitTime := 10 * time.Second
+	checkInterval := 500 * time.Millisecond
+	timeout := time.After(maxWaitTime)
+	ticker := time.NewTicker(checkInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-timeout:
+			if verbose {
+				log.Printf("Timeout waiting for container finalization, proceeding with artifact extraction")
+			}
+			return nil
+		case <-ticker.C:
+			allFinalized := true
+			
+			for _, testCont := range testContainers {
+				inspect, err := cli.ContainerInspect(ctx, testCont.ID)
+				if err != nil {
+					if verbose {
+						log.Printf("Warning: failed to inspect container %s: %v", testCont.name, err)
+					}
+					continue
+				}
+				
+				// Check if container is in a final state
+				if !isContainerFinalized(inspect.State) {
+					allFinalized = false
+					if verbose {
+						log.Printf("Container %s still finalizing (state: %s)", testCont.name, inspect.State.Status)
+					}
+					break
+				}
+			}
+			
+			if allFinalized {
+				if verbose {
+					log.Printf("All test containers finalized, ready for artifact extraction")
+				}
+				return nil
+			}
+		}
+	}
 }
 
-// generateRunIDFromContainerName extracts the run ID from container name.
-func generateRunIDFromContainerName(containerName string) string {
-	// Extract run ID from container name like "headscale-test-suite-20250618-143802"
-	parts := strings.Split(containerName, "-")
-	if len(parts) >= 2 {
-		return strings.Join(parts[len(parts)-2:], "-")
-	}
-	return containerName
+// isContainerFinalized checks if a container has reached a final state where logs are flushed.
+func isContainerFinalized(state *container.State) bool {
+	// Container is finalized if it's not running and has a finish time
+	return !state.Running && state.FinishedAt != ""
 }
 
+
 // findProjectRoot locates the project root by finding the directory containing go.mod.
 func findProjectRoot(startPath string) string {
 	current := startPath
@@ -466,11 +515,8 @@ func getCurrentTestContainers(containers []container.Summary, testContainerID st
 	}
 	
 	if runID == "" {
-		if verbose {
-			log.Printf("Warning: could not find run ID for test container %s, falling back to time-based filtering", testContainerID[:12])
-		}
-		// Fallback to time-based filtering for backward compatibility
-		return getCurrentTestContainersByTime(containers, testContainerID, verbose)
+		log.Printf("Error: test container %s missing required hi.run-id label", testContainerID[:12])
+		return testRunContainers
 	}
 	
 	if verbose {
@@ -570,78 +616,39 @@ func extractContainerLogs(ctx context.Context, cli *client.Client, containerID,
 	return nil
 }
 
-// getCurrentTestContainersByTime is a fallback method for containers without labels.
-func getCurrentTestContainersByTime(containers []container.Summary, testContainerID string, verbose bool) []testContainer {
-	var testRunContainers []testContainer
-	
-	// Find the test container to get its creation time
-	var testContainerCreated time.Time
-	for _, cont := range containers {
-		if cont.ID == testContainerID {
-			testContainerCreated = time.Unix(cont.Created, 0)
-			break
-		}
-	}
-	
-	if testContainerCreated.IsZero() {
-		if verbose {
-			log.Printf("Warning: could not find test container %s", testContainerID[:12])
-		}
-		return testRunContainers
-	}
-	
-	// Find containers created within a small time window after the test container
-	startTime := testContainerCreated
-	endTime := testContainerCreated.Add(5 * time.Minute)
-	
-	for _, cont := range containers {
-		for _, name := range cont.Names {
-			containerName := strings.TrimPrefix(name, "/")
-			if strings.HasPrefix(containerName, "hs-") || strings.HasPrefix(containerName, "ts-") {
-				createdTime := time.Unix(cont.Created, 0)
-				if createdTime.After(startTime) && createdTime.Before(endTime) {
-					testRunContainers = append(testRunContainers, testContainer{
-						ID:   cont.ID,
-						name: containerName,
-					})
-					if verbose {
-						log.Printf("Including container %s (created %s)", containerName, createdTime.Format("15:04:05"))
-					}
-				}
-				break
-			}
-		}
-	}
-	
-	return testRunContainers
-}
-
 // extractContainerFiles extracts database file and directories from headscale containers.
 func extractContainerFiles(ctx context.Context, cli *client.Client, containerID, containerName, logsDir string, verbose bool) error {
 	// Extract database file
 	if err := extractSingleFile(ctx, cli, containerID, "/tmp/integration_test_db.sqlite3", containerName+".db", logsDir, verbose); err != nil {
-		if verbose {
-			log.Printf("Warning: failed to extract database from %s: %v", containerName, err)
-		}
+		logExtractionError("database", containerName, err, verbose)
 	}
 
 	// Extract profile directory
 	if err := extractDirectory(ctx, cli, containerID, "/tmp/profile", containerName+".pprof", logsDir, verbose); err != nil {
-		if verbose {
-			log.Printf("Warning: failed to extract profile from %s: %v", containerName, err)
-		}
+		logExtractionError("profile directory", containerName, err, verbose)
 	}
 
 	// Extract map responses directory
 	if err := extractDirectory(ctx, cli, containerID, "/tmp/mapresponses", containerName+".mapresp", logsDir, verbose); err != nil {
-		if verbose {
-			log.Printf("Warning: failed to extract mapresponses from %s: %v", containerName, err)
-		}
+		logExtractionError("mapresponses directory", containerName, err, verbose)
 	}
 
 	return nil
 }
 
+// logExtractionError logs extraction errors with appropriate level based on error type.
+func logExtractionError(artifactType, containerName string, err error, verbose bool) {
+	if errors.Is(err, ErrFileNotFoundInTar) {
+		// File not found is expected and only logged in verbose mode
+		if verbose {
+			log.Printf("No %s found in container %s", artifactType, containerName)
+		}
+	} else {
+		// Other errors are actual failures and should be logged as warnings
+		log.Printf("Warning: failed to extract %s from %s: %v", artifactType, containerName, err)
+	}
+}
+
 // extractSingleFile copies a single file from a container.
 func extractSingleFile(ctx context.Context, cli *client.Client, containerID, sourcePath, fileName, logsDir string, verbose bool) error {
 	tarReader, _, err := cli.CopyFromContainer(ctx, containerID, sourcePath)
diff --git a/cmd/hi/tar_utils.go b/cmd/hi/tar_utils.go
index 46f21987..16fb8793 100644
--- a/cmd/hi/tar_utils.go
+++ b/cmd/hi/tar_utils.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"archive/tar"
+	"errors"
 	"fmt"
 	"io"
 	"os"
@@ -9,6 +10,11 @@ import (
 	"strings"
 )
 
+var (
+	// ErrFileNotFoundInTar indicates a file was not found in the tar archive.
+	ErrFileNotFoundInTar = errors.New("file not found in tar")
+)
+
 // extractFileFromTar extracts a single file from a tar reader.
 func extractFileFromTar(tarReader io.Reader, fileName, outputPath string) error {
 	tr := tar.NewReader(tarReader)
@@ -41,7 +47,7 @@ func extractFileFromTar(tarReader io.Reader, fileName, outputPath string) error
 		}
 	}
 
-	return fmt.Errorf("file %s not found in tar", fileName)
+	return fmt.Errorf("%w: %s", ErrFileNotFoundInTar, fileName)
 }
 
 // extractDirectoryFromTar extracts all files from a tar reader to a target directory.
diff --git a/integration/dockertestutil/config.go b/integration/dockertestutil/config.go
index 269f103a..f8bbde5f 100644
--- a/integration/dockertestutil/config.go
+++ b/integration/dockertestutil/config.go
@@ -1,8 +1,12 @@
 package dockertestutil
 
 import (
+	"fmt"
 	"os"
+	"strings"
+	"time"
 
+	"github.com/juanfont/headscale/hscontrol/util"
 	"github.com/ory/dockertest/v3"
 )
 
@@ -18,8 +22,7 @@ func GetIntegrationRunID() string {
 func DockerAddIntegrationLabels(opts *dockertest.RunOptions, testType string) {
 	runID := GetIntegrationRunID()
 	if runID == "" {
-		// If no run ID is set, do nothing for backward compatibility
-		return
+		panic("HEADSCALE_INTEGRATION_RUN_ID environment variable is required")
 	}
 
 	if opts.Labels == nil {
@@ -29,6 +32,29 @@ func DockerAddIntegrationLabels(opts *dockertest.RunOptions, testType string) {
 	opts.Labels["hi.test-type"] = testType
 }
 
+// GenerateRunID creates a unique run identifier with timestamp and random hash.
+// Format: YYYYMMDD-HHMMSS-HASH (e.g., 20250619-143052-a1b2c3)
+func GenerateRunID() string {
+	now := time.Now()
+	timestamp := now.Format("20060102-150405")
+	
+	// Add a short random hash to ensure uniqueness
+	randomHash := util.MustGenerateRandomStringDNSSafe(6)
+	return fmt.Sprintf("%s-%s", timestamp, randomHash)
+}
+
+// ExtractRunIDFromContainerName extracts the run ID from container name.
+// Expects format: "prefix-YYYYMMDD-HHMMSS-HASH"
+func ExtractRunIDFromContainerName(containerName string) string {
+	parts := strings.Split(containerName, "-")
+	if len(parts) >= 3 {
+		// Return the last three parts as the run ID (YYYYMMDD-HHMMSS-HASH)
+		return strings.Join(parts[len(parts)-3:], "-")
+	}
+	
+	panic(fmt.Sprintf("unexpected container name format: %s", containerName))
+}
+
 // IsRunningInContainer checks if the current process is running inside a Docker container.
 // This is used by tests to determine if they should run integration tests.
 func IsRunningInContainer() bool {
diff --git a/integration/tsic/tsic.go b/integration/tsic/tsic.go
index cad60816..d2738c55 100644
--- a/integration/tsic/tsic.go
+++ b/integration/tsic/tsic.go
@@ -280,6 +280,9 @@ func New(
 		return nil, err
 	}
 
+	// Add integration test labels if running under hi tool
+	dockertestutil.DockerAddIntegrationLabels(tailscaleOptions, "tailscale")
+
 	var container *dockertest.Resource
 
 	if version != VersionHead {
@@ -311,9 +314,6 @@ func New(
 			)
 		}
 
-		// Add integration test labels if running under hi tool
-		dockertestutil.DockerAddIntegrationLabels(tailscaleOptions, "tailscale")
-		
 		container, err = pool.BuildAndRunWithBuildOptions(
 			buildOptions,
 			tailscaleOptions,
@@ -325,9 +325,6 @@ func New(
 		tailscaleOptions.Repository = "tailscale/tailscale"
 		tailscaleOptions.Tag = version
 
-		// Add integration test labels if running under hi tool
-		dockertestutil.DockerAddIntegrationLabels(tailscaleOptions, "tailscale")
-		
 		container, err = pool.RunWithOptions(
 			tailscaleOptions,
 			dockertestutil.DockerRestartPolicy,
@@ -338,9 +335,6 @@ func New(
 		tailscaleOptions.Repository = "tailscale/tailscale"
 		tailscaleOptions.Tag = "v" + version
 
-		// Add integration test labels if running under hi tool
-		dockertestutil.DockerAddIntegrationLabels(tailscaleOptions, "tailscale")
-		
 		container, err = pool.RunWithOptions(
 			tailscaleOptions,
 			dockertestutil.DockerRestartPolicy,