
cmd/hi: improve reliability and remove backwards compatibility

- Add retry logic for container removal with exponential backoff
- Replace arbitrary sleep with deterministic container finalization checking
- Use label-based container discovery exclusively (remove time-based fallbacks)
- Add random hash to run IDs to prevent collisions (YYYYMMDD-HHMMSS-HASH)
- Create shared utility functions for consistent run ID handling
- Distinguish between expected and unexpected extraction errors
- Consolidate duplicate labeling calls in Tailscale container creation
- Remove backwards compatibility code for cleaner behavior
Kristoffer Dalby 2025-06-19 15:19:52 +02:00
parent 8fae7edd60
commit 3c8e194d8b
5 changed files with 158 additions and 94 deletions
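
Before the per-file diffs, a quick illustration of the retry behavior summarized above. This is not part of the commit; it is a minimal standalone sketch of the backoff schedule used by the new removeContainerWithRetry helper (first file below): three attempts with a 100ms base delay, doubling between attempts.

	package main

	import (
		"fmt"
		"time"
	)

	func main() {
		maxRetries := 3
		baseDelay := 100 * time.Millisecond
		// Only the waits between attempts are shown; there is no wait after the last attempt.
		for attempt := 0; attempt < maxRetries-1; attempt++ {
			delay := baseDelay * time.Duration(1<<attempt)
			fmt.Printf("wait before attempt %d: %v\n", attempt+2, delay) // 100ms, then 200ms
		}
	}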


@@ -10,6 +10,7 @@ import (
 	"github.com/docker/docker/api/types/filters"
 	"github.com/docker/docker/api/types/image"
 	"github.com/docker/docker/client"
+	"github.com/docker/docker/errdefs"
 )

 // cleanupBeforeTest performs cleanup operations before running tests.
@@ -66,10 +67,8 @@ func killTestContainers(ctx context.Context) error {
 			_ = cli.ContainerKill(ctx, cont.ID, "KILL")
 		}

-		// Then remove the container
-		if err := cli.ContainerRemove(ctx, cont.ID, container.RemoveOptions{
-			Force: true,
-		}); err == nil {
+		// Then remove the container with retry logic
+		if removeContainerWithRetry(ctx, cli, cont.ID) {
 			removed++
 		}
 	}
@@ -84,6 +83,32 @@ func killTestContainers(ctx context.Context) error {
 	return nil
 }

+// removeContainerWithRetry attempts to remove a container with exponential backoff retry logic.
+func removeContainerWithRetry(ctx context.Context, cli *client.Client, containerID string) bool {
+	maxRetries := 3
+	baseDelay := 100 * time.Millisecond
+
+	for attempt := 0; attempt < maxRetries; attempt++ {
+		err := cli.ContainerRemove(ctx, containerID, container.RemoveOptions{
+			Force: true,
+		})
+		if err == nil {
+			return true
+		}
+
+		// If this is the last attempt, don't wait
+		if attempt == maxRetries-1 {
+			break
+		}
+
+		// Wait with exponential backoff
+		delay := baseDelay * time.Duration(1<<attempt)
+		time.Sleep(delay)
+	}
+
+	return false
+}
+
 // pruneDockerNetworks removes unused Docker networks.
 func pruneDockerNetworks(ctx context.Context) error {
 	cli, err := createDockerClient()
@@ -167,7 +192,13 @@ func cleanCacheVolume(ctx context.Context) error {
 	volumeName := "hs-integration-go-cache"
 	err = cli.VolumeRemove(ctx, volumeName, true)
 	if err != nil {
-		fmt.Printf("Go module cache volume not found or already removed\n")
+		if errdefs.IsNotFound(err) {
+			fmt.Printf("Go module cache volume not found: %s\n", volumeName)
+		} else if errdefs.IsConflict(err) {
+			fmt.Printf("Go module cache volume is in use and cannot be removed: %s\n", volumeName)
+		} else {
+			fmt.Printf("Failed to remove Go module cache volume %s: %v\n", volumeName, err)
+		}
 	} else {
 		fmt.Printf("Removed Go module cache volume: %s\n", volumeName)
 	}


@@ -19,6 +19,7 @@ import (
 	"github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/client"
 	"github.com/docker/docker/pkg/stdcopy"
+	"github.com/juanfont/headscale/integration/dockertestutil"
 )

 var (
@@ -35,7 +36,7 @@ func runTestContainer(ctx context.Context, config *RunConfig) error {
 	}
 	defer cli.Close()

-	runID := generateRunID()
+	runID := dockertestutil.GenerateRunID()
 	containerName := "headscale-test-suite-" + runID
 	logsDir := filepath.Join(config.LogsDir, runID)
@@ -91,8 +92,10 @@ func runTestContainer(ctx context.Context, config *RunConfig) error {
 	exitCode, err := streamAndWait(ctx, cli, resp.ID)

-	// Give the container a moment to flush any final artifacts
-	time.Sleep(2 * time.Second)
+	// Ensure all containers have finished and logs are flushed before extracting artifacts
+	if waitErr := waitForContainerFinalization(ctx, cli, resp.ID, config.Verbose); waitErr != nil && config.Verbose {
+		log.Printf("Warning: failed to wait for container finalization: %v", waitErr)
+	}

 	// Extract artifacts from test containers before cleanup
 	if err := extractArtifactsFromContainers(ctx, resp.ID, logsDir, config.Verbose); err != nil && config.Verbose {
@@ -152,7 +155,7 @@ func createGoTestContainer(ctx context.Context, cli *client.Client, config *RunConfig
 	projectRoot := findProjectRoot(pwd)

-	runID := generateRunIDFromContainerName(containerName)
+	runID := dockertestutil.ExtractRunIDFromContainerName(containerName)

 	env := []string{
 		fmt.Sprintf("HEADSCALE_INTEGRATION_POSTGRES=%d", boolToInt(config.UsePostgres)),
@@ -225,23 +228,69 @@ func streamAndWait(ctx context.Context, cli *client.Client, containerID string) (int, error) {
 	return -1, ErrUnexpectedContainerWait
 }

-// generateRunID creates a unique timestamp-based run identifier.
-func generateRunID() string {
-	now := time.Now()
-	timestamp := now.Format("20060102-150405")
-	return timestamp
+// waitForContainerFinalization ensures all test containers have properly finished and flushed their output.
+func waitForContainerFinalization(ctx context.Context, cli *client.Client, testContainerID string, verbose bool) error {
+	// First, get all related test containers
+	containers, err := cli.ContainerList(ctx, container.ListOptions{All: true})
+	if err != nil {
+		return fmt.Errorf("failed to list containers: %w", err)
+	}
+
+	testContainers := getCurrentTestContainers(containers, testContainerID, verbose)
+
+	// Wait for all test containers to reach a final state
+	maxWaitTime := 10 * time.Second
+	checkInterval := 500 * time.Millisecond
+	timeout := time.After(maxWaitTime)
+	ticker := time.NewTicker(checkInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-timeout:
+			if verbose {
+				log.Printf("Timeout waiting for container finalization, proceeding with artifact extraction")
+			}
+			return nil
+		case <-ticker.C:
+			allFinalized := true
+
+			for _, testCont := range testContainers {
+				inspect, err := cli.ContainerInspect(ctx, testCont.ID)
+				if err != nil {
+					if verbose {
+						log.Printf("Warning: failed to inspect container %s: %v", testCont.name, err)
+					}
+					continue
+				}
+
+				// Check if container is in a final state
+				if !isContainerFinalized(inspect.State) {
+					allFinalized = false
+					if verbose {
+						log.Printf("Container %s still finalizing (state: %s)", testCont.name, inspect.State.Status)
+					}
+					break
+				}
+			}
+
+			if allFinalized {
+				if verbose {
+					log.Printf("All test containers finalized, ready for artifact extraction")
+				}
+				return nil
+			}
+		}
+	}
 }

-// generateRunIDFromContainerName extracts the run ID from container name.
-func generateRunIDFromContainerName(containerName string) string {
-	// Extract run ID from container name like "headscale-test-suite-20250618-143802"
-	parts := strings.Split(containerName, "-")
-	if len(parts) >= 2 {
-		return strings.Join(parts[len(parts)-2:], "-")
-	}
-	return containerName
+// isContainerFinalized checks if a container has reached a final state where logs are flushed.
+func isContainerFinalized(state *container.State) bool {
+	// Container is finalized if it's not running and has a finish time
+	return !state.Running && state.FinishedAt != ""
 }

 // findProjectRoot locates the project root by finding the directory containing go.mod.
 func findProjectRoot(startPath string) string {
 	current := startPath
@@ -466,11 +515,8 @@ func getCurrentTestContainers(containers []container.Summary, testContainerID string, verbose bool) []testContainer {
 	}

 	if runID == "" {
-		if verbose {
-			log.Printf("Warning: could not find run ID for test container %s, falling back to time-based filtering", testContainerID[:12])
-		}
-		// Fallback to time-based filtering for backward compatibility
-		return getCurrentTestContainersByTime(containers, testContainerID, verbose)
+		log.Printf("Error: test container %s missing required hi.run-id label", testContainerID[:12])
+		return testRunContainers
 	}

 	if verbose {
@@ -570,78 +616,39 @@ func extractContainerLogs(ctx context.Context, cli *client.Client, containerID,
 	return nil
 }

-// getCurrentTestContainersByTime is a fallback method for containers without labels.
-func getCurrentTestContainersByTime(containers []container.Summary, testContainerID string, verbose bool) []testContainer {
-	var testRunContainers []testContainer
-
-	// Find the test container to get its creation time
-	var testContainerCreated time.Time
-	for _, cont := range containers {
-		if cont.ID == testContainerID {
-			testContainerCreated = time.Unix(cont.Created, 0)
-			break
-		}
-	}
-
-	if testContainerCreated.IsZero() {
-		if verbose {
-			log.Printf("Warning: could not find test container %s", testContainerID[:12])
-		}
-		return testRunContainers
-	}
-
-	// Find containers created within a small time window after the test container
-	startTime := testContainerCreated
-	endTime := testContainerCreated.Add(5 * time.Minute)
-
-	for _, cont := range containers {
-		for _, name := range cont.Names {
-			containerName := strings.TrimPrefix(name, "/")
-			if strings.HasPrefix(containerName, "hs-") || strings.HasPrefix(containerName, "ts-") {
-				createdTime := time.Unix(cont.Created, 0)
-				if createdTime.After(startTime) && createdTime.Before(endTime) {
-					testRunContainers = append(testRunContainers, testContainer{
-						ID:   cont.ID,
-						name: containerName,
-					})
-					if verbose {
-						log.Printf("Including container %s (created %s)", containerName, createdTime.Format("15:04:05"))
-					}
-				}
-				break
-			}
-		}
-	}
-
-	return testRunContainers
-}
-
 // extractContainerFiles extracts database file and directories from headscale containers.
 func extractContainerFiles(ctx context.Context, cli *client.Client, containerID, containerName, logsDir string, verbose bool) error {
 	// Extract database file
 	if err := extractSingleFile(ctx, cli, containerID, "/tmp/integration_test_db.sqlite3", containerName+".db", logsDir, verbose); err != nil {
-		if verbose {
-			log.Printf("Warning: failed to extract database from %s: %v", containerName, err)
-		}
+		logExtractionError("database", containerName, err, verbose)
 	}

 	// Extract profile directory
 	if err := extractDirectory(ctx, cli, containerID, "/tmp/profile", containerName+".pprof", logsDir, verbose); err != nil {
-		if verbose {
-			log.Printf("Warning: failed to extract profile from %s: %v", containerName, err)
-		}
+		logExtractionError("profile directory", containerName, err, verbose)
 	}

 	// Extract map responses directory
 	if err := extractDirectory(ctx, cli, containerID, "/tmp/mapresponses", containerName+".mapresp", logsDir, verbose); err != nil {
-		if verbose {
-			log.Printf("Warning: failed to extract mapresponses from %s: %v", containerName, err)
-		}
+		logExtractionError("mapresponses directory", containerName, err, verbose)
 	}

 	return nil
 }

+// logExtractionError logs extraction errors with appropriate level based on error type.
+func logExtractionError(artifactType, containerName string, err error, verbose bool) {
+	if errors.Is(err, ErrFileNotFoundInTar) {
+		// File not found is expected and only logged in verbose mode
+		if verbose {
+			log.Printf("No %s found in container %s", artifactType, containerName)
+		}
+	} else {
+		// Other errors are actual failures and should be logged as warnings
+		log.Printf("Warning: failed to extract %s from %s: %v", artifactType, containerName, err)
+	}
+}
+
 // extractSingleFile copies a single file from a container.
 func extractSingleFile(ctx context.Context, cli *client.Client, containerID, sourcePath, fileName, logsDir string, verbose bool) error {
 	tarReader, _, err := cli.CopyFromContainer(ctx, containerID, sourcePath)
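
The label-only discovery above matches containers by their hi.run-id label after listing everything. The same selection could instead be pushed down to the Docker daemon with a label filter; this is a hedged sketch of that alternative using the filters package already imported in the cleanup file, not what the commit ships:

	// Hypothetical daemon-side equivalent of getCurrentTestContainers' label match.
	f := filters.NewArgs()
	f.Add("label", "hi.run-id="+runID)
	matches, err := cli.ContainerList(ctx, container.ListOptions{All: true, Filters: f})
	if err != nil {
		return fmt.Errorf("failed to list containers: %w", err)
	}
	_ = matches // each entry carries the hi.run-id label for this run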


@@ -2,6 +2,7 @@ package main

 import (
 	"archive/tar"
+	"errors"
 	"fmt"
 	"io"
 	"os"
@@ -9,6 +10,11 @@ import (
 	"strings"
 )

+var (
+	// ErrFileNotFoundInTar indicates a file was not found in the tar archive.
+	ErrFileNotFoundInTar = errors.New("file not found in tar")
+)
+
 // extractFileFromTar extracts a single file from a tar reader.
 func extractFileFromTar(tarReader io.Reader, fileName, outputPath string) error {
 	tr := tar.NewReader(tarReader)
@@ -41,7 +47,7 @@ func extractFileFromTar(tarReader io.Reader, fileName, outputPath string) error {
 		}
 	}

-	return fmt.Errorf("file %s not found in tar", fileName)
+	return fmt.Errorf("%w: %s", ErrFileNotFoundInTar, fileName)
 }

 // extractDirectoryFromTar extracts all files from a tar reader to a target directory.
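
Because the sentinel is wrapped with %w, callers can classify failures with errors.Is — which is exactly how the new logExtractionError in the previous file separates "artifact was never produced" from real extraction errors. A minimal caller-side sketch (the reader and paths are placeholders):

	if err := extractFileFromTar(tarReader, "integration_test_db.sqlite3", outputPath); err != nil {
		if errors.Is(err, ErrFileNotFoundInTar) {
			// Expected: the container never produced this artifact.
		} else {
			// A genuine extraction failure worth a warning.
		}
	}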


@@ -1,8 +1,12 @@
 package dockertestutil

 import (
+	"fmt"
 	"os"
+	"strings"
+	"time"

+	"github.com/juanfont/headscale/hscontrol/util"
 	"github.com/ory/dockertest/v3"
 )
@@ -18,8 +22,7 @@ func GetIntegrationRunID() string {
 func DockerAddIntegrationLabels(opts *dockertest.RunOptions, testType string) {
 	runID := GetIntegrationRunID()
 	if runID == "" {
-		// If no run ID is set, do nothing for backward compatibility
-		return
+		panic("HEADSCALE_INTEGRATION_RUN_ID environment variable is required")
 	}

 	if opts.Labels == nil {
@@ -29,6 +32,29 @@ func DockerAddIntegrationLabels(opts *dockertest.RunOptions, testType string) {
 	opts.Labels["hi.test-type"] = testType
 }

+// GenerateRunID creates a unique run identifier with timestamp and random hash.
+// Format: YYYYMMDD-HHMMSS-HASH (e.g., 20250619-143052-a1b2c3)
+func GenerateRunID() string {
+	now := time.Now()
+	timestamp := now.Format("20060102-150405")
+
+	// Add a short random hash to ensure uniqueness
+	randomHash := util.MustGenerateRandomStringDNSSafe(6)
+
+	return fmt.Sprintf("%s-%s", timestamp, randomHash)
+}
+
+// ExtractRunIDFromContainerName extracts the run ID from container name.
+// Expects format: "prefix-YYYYMMDD-HHMMSS-HASH"
+func ExtractRunIDFromContainerName(containerName string) string {
+	parts := strings.Split(containerName, "-")
+	if len(parts) >= 3 {
+		// Return the last three parts as the run ID (YYYYMMDD-HHMMSS-HASH)
+		return strings.Join(parts[len(parts)-3:], "-")
+	}
+
+	panic(fmt.Sprintf("unexpected container name format: %s", containerName))
+}
+
 // IsRunningInContainer checks if the current process is running inside a Docker container.
 // This is used by tests to determine if they should run integration tests.
 func IsRunningInContainer() bool {
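
The two helpers added above are meant to round-trip through a container name; a small usage sketch (the concrete ID shown is illustrative):

	runID := dockertestutil.GenerateRunID() // e.g. "20250619-143052-a1b2c3"
	name := "headscale-test-suite-" + runID
	got := dockertestutil.ExtractRunIDFromContainerName(name)
	fmt.Println(got == runID) // true: the last three "-"-separated parts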


@@ -280,6 +280,9 @@ func New(
 		return nil, err
 	}

+	// Add integration test labels if running under hi tool
+	dockertestutil.DockerAddIntegrationLabels(tailscaleOptions, "tailscale")
+
 	var container *dockertest.Resource

 	if version != VersionHead {
@@ -311,9 +314,6 @@ func New(
 		)
 	}

-	// Add integration test labels if running under hi tool
-	dockertestutil.DockerAddIntegrationLabels(tailscaleOptions, "tailscale")
-
 	container, err = pool.BuildAndRunWithBuildOptions(
 		buildOptions,
 		tailscaleOptions,
@@ -325,9 +325,6 @@ func New(
 		tailscaleOptions.Repository = "tailscale/tailscale"
 		tailscaleOptions.Tag = version

-		// Add integration test labels if running under hi tool
-		dockertestutil.DockerAddIntegrationLabels(tailscaleOptions, "tailscale")
-
 		container, err = pool.RunWithOptions(
 			tailscaleOptions,
 			dockertestutil.DockerRestartPolicy,
@@ -338,9 +335,6 @@ func New(
 		tailscaleOptions.Repository = "tailscale/tailscale"
 		tailscaleOptions.Tag = "v" + version

-		// Add integration test labels if running under hi tool
-		dockertestutil.DockerAddIntegrationLabels(tailscaleOptions, "tailscale")
-
 		container, err = pool.RunWithOptions(
 			tailscaleOptions,
 			dockertestutil.DockerRestartPolicy,