diff --git a/pkg/process/kill_unix.go b/pkg/process/kill_unix.go index ab2d57b240..6f31642b18 100644 --- a/pkg/process/kill_unix.go +++ b/pkg/process/kill_unix.go @@ -6,23 +6,50 @@ package process import ( + "errors" "fmt" "os" "syscall" + "time" ) -// KillProcess kills a process by its ID +// KillProcess kills a process by its ID. +// It first sends SIGTERM for graceful shutdown, waits briefly, then sends SIGKILL +// if the process is still running. This handles zombie processes that may survive +// laptop sleep/wake cycles and don't respond to SIGTERM. func KillProcess(pid int) error { - // Check if the process exists - process, err := os.FindProcess(pid) + proc, err := os.FindProcess(pid) if err != nil { return fmt.Errorf("failed to find process: %w", err) } - // Send a SIGTERM signal to the process - if err := process.Signal(syscall.SIGTERM); err != nil { + // First try graceful termination with SIGTERM + if err := proc.Signal(syscall.SIGTERM); err != nil { + // Process might already be dead + if errors.Is(err, os.ErrProcessDone) { + return nil + } return fmt.Errorf("failed to send SIGTERM to process: %w", err) } + // Wait briefly for graceful shutdown + time.Sleep(500 * time.Millisecond) + + // Check if process is still alive + alive, err := FindProcess(pid) + if err != nil || !alive { + // Process terminated gracefully + return nil + } + + // Process didn't respond to SIGTERM - force kill with SIGKILL + // This handles zombie processes that may survive laptop sleep + if err := proc.Signal(syscall.SIGKILL); err != nil { + if errors.Is(err, os.ErrProcessDone) { + return nil + } + return fmt.Errorf("failed to send SIGKILL to process: %w", err) + } + return nil } diff --git a/pkg/workloads/manager.go b/pkg/workloads/manager.go index abe260c4de..d3b9614223 100644 --- a/pkg/workloads/manager.go +++ b/pkg/workloads/manager.go @@ -91,11 +91,16 @@ type Manager interface { DoesWorkloadExist(ctx context.Context, workloadName string) (bool, error) } +// ProcessFinder is a function type for checking if a process exists. +// This allows dependency injection for testing. +type ProcessFinder func(pid int) (bool, error) + // DefaultManager is the default implementation of the Manager interface. type DefaultManager struct { runtime rt.Runtime statuses statuses.StatusManager configProvider config.Provider + findProcess ProcessFinder // defaults to process.FindProcess } // ErrWorkloadNotRunning is returned when a container cannot be found by name. @@ -810,24 +815,43 @@ func (d *DefaultManager) getWorkloadContainer(ctx context.Context, name string) } // isSupervisorProcessAlive checks if the supervisor process for a workload is alive -// by checking if a PID exists. If a PID exists, we assume the supervisor is running. -// This is a reasonable assumption because: -// - If the supervisor exits cleanly, it cleans up the PID -// - If killed unexpectedly, the PID remains but stopProcess will handle it gracefully -// - The main issue we're preventing is accumulating zombie supervisors from repeated restarts +// by checking if a PID exists AND the process is actually running. +// This handles scenarios where: +// - The supervisor exits cleanly and removes the PID file +// - The supervisor is killed but PID file remains (zombie) +// - The process survives laptop sleep but becomes unresponsive func (d *DefaultManager) isSupervisorProcessAlive(ctx context.Context, name string) bool { if name == "" { return false } - // Try to read the PID - if it exists, assume supervisor is running - _, err := d.statuses.GetWorkloadPID(ctx, name) + // Try to read the PID + pid, err := d.statuses.GetWorkloadPID(ctx, name) if err != nil { // No PID found, supervisor is not running return false } - // PID exists, assume supervisor is alive + // Actually check if the process is running (not just if PID file exists) + findProcess := d.findProcess + if findProcess == nil { + findProcess = process.FindProcess + } + alive, err := findProcess(pid) + if err != nil { + logger.Debugf("Error checking if supervisor process %d is alive: %v", pid, err) + return false + } + + if !alive { + // Process is dead but PID file exists - clean up the stale PID file + logger.Debugf("Supervisor process %d is dead, cleaning up stale PID file for %s", pid, name) + if err := process.RemovePIDFile(name); err != nil { + logger.Warnf("Failed to remove stale PID file for %s: %v", name, err) + } + return false + } + return true } diff --git a/pkg/workloads/manager_test.go b/pkg/workloads/manager_test.go index 800a9982eb..e456fac4e1 100644 --- a/pkg/workloads/manager_test.go +++ b/pkg/workloads/manager_test.go @@ -638,13 +638,14 @@ func TestDefaultManager_restartRemoteWorkload(t *testing.T) { t.Parallel() tests := []struct { - name string - workloadName string - runConfig *runner.RunConfig - foreground bool - setupMocks func(*statusMocks.MockStatusManager) - expectError bool - errorMsg string + name string + workloadName string + runConfig *runner.RunConfig + foreground bool + setupMocks func(*statusMocks.MockStatusManager) + mockFindProcess func(int) (bool, error) // optional: mock for findProcessFunc + expectError bool + errorMsg string }{ { name: "remote workload already running with healthy supervisor", @@ -662,6 +663,8 @@ func TestDefaultManager_restartRemoteWorkload(t *testing.T) { // Check if supervisor is alive - return valid PID (supervisor is healthy) sm.EXPECT().GetWorkloadPID(gomock.Any(), "remote-base").Return(12345, nil) }, + // Mock process as alive for healthy supervisor test + mockFindProcess: func(_ int) (bool, error) { return true, nil }, // With healthy supervisor, restart should return early (no-op) expectError: false, }, @@ -691,6 +694,34 @@ func TestDefaultManager_restartRemoteWorkload(t *testing.T) { expectError: true, errorMsg: "failed to load state", }, + { + name: "remote workload with zombie supervisor (PID exists but process dead)", + workloadName: "remote-workload", + runConfig: &runner.RunConfig{ + BaseName: "remote-base", + RemoteURL: "http://example.com", + }, + foreground: false, + setupMocks: func(sm *statusMocks.MockStatusManager) { + sm.EXPECT().GetWorkload(gomock.Any(), "remote-workload").Return(core.Workload{ + Name: "remote-workload", + Status: runtime.WorkloadStatusRunning, + }, nil) + // PID file exists with a valid PID, but process is dead (zombie scenario) + sm.EXPECT().GetWorkloadPID(gomock.Any(), "remote-base").Return(12345, nil) + // With zombie supervisor detected, restart proceeds with cleanup and restart + sm.EXPECT().SetWorkloadStatus(gomock.Any(), "remote-workload", runtime.WorkloadStatusStopping, "").Return(nil) + sm.EXPECT().GetWorkloadPID(gomock.Any(), "remote-base").Return(12345, nil) + // Allow any subsequent status updates + sm.EXPECT().SetWorkloadStatus(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes().Return(nil) + sm.EXPECT().SetWorkloadPID(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes().Return(nil) + }, + // Mock process as dead - this is the zombie scenario where PID file exists but process is gone + mockFindProcess: func(_ int) (bool, error) { return false, nil }, + // Restart proceeds to load state which fails in tests + expectError: true, + errorMsg: "failed to load state", + }, { name: "status manager error", workloadName: "remote-workload", @@ -718,7 +749,8 @@ func TestDefaultManager_restartRemoteWorkload(t *testing.T) { tt.setupMocks(statusMgr) manager := &DefaultManager{ - statuses: statusMgr, + statuses: statusMgr, + findProcess: tt.mockFindProcess, // inject mock (nil uses default) } err := manager.restartRemoteWorkload(context.Background(), tt.workloadName, tt.runConfig, tt.foreground) @@ -739,12 +771,13 @@ func TestDefaultManager_restartContainerWorkload(t *testing.T) { t.Parallel() tests := []struct { - name string - workloadName string - foreground bool - setupMocks func(*statusMocks.MockStatusManager, *runtimeMocks.MockRuntime) - expectError bool - errorMsg string + name string + workloadName string + foreground bool + setupMocks func(*statusMocks.MockStatusManager, *runtimeMocks.MockRuntime) + mockFindProcess func(int) (bool, error) // optional: mock for findProcessFunc + expectError bool + errorMsg string }{ { name: "container workload already running with healthy supervisor", @@ -766,6 +799,8 @@ func TestDefaultManager_restartContainerWorkload(t *testing.T) { // Check if supervisor is alive - return valid PID (supervisor is healthy) sm.EXPECT().GetWorkloadPID(gomock.Any(), "container-workload").Return(12345, nil) }, + // Mock process as alive for healthy supervisor test + mockFindProcess: func(_ int) (bool, error) { return true, nil }, // With healthy supervisor, restart should return early (no-op) expectError: false, }, @@ -833,8 +868,9 @@ func TestDefaultManager_restartContainerWorkload(t *testing.T) { tt.setupMocks(statusMgr, runtimeMgr) manager := &DefaultManager{ - statuses: statusMgr, - runtime: runtimeMgr, + statuses: statusMgr, + runtime: runtimeMgr, + findProcess: tt.mockFindProcess, // inject mock (nil uses default) } err := manager.restartContainerWorkload(context.Background(), tt.workloadName, tt.foreground) @@ -871,7 +907,8 @@ func TestDefaultManager_restartLogicConsistency(t *testing.T) { statusMgr.EXPECT().GetWorkloadPID(gomock.Any(), "test-base").Return(12345, nil) manager := &DefaultManager{ - statuses: statusMgr, + statuses: statusMgr, + findProcess: func(_ int) (bool, error) { return true, nil }, // mock: process alive } runConfig := &runner.RunConfig{ @@ -949,8 +986,9 @@ func TestDefaultManager_restartLogicConsistency(t *testing.T) { statusMgr.EXPECT().GetWorkloadPID(gomock.Any(), "test-workload").Return(12345, nil) manager := &DefaultManager{ - statuses: statusMgr, - runtime: runtimeMgr, + statuses: statusMgr, + runtime: runtimeMgr, + findProcess: func(_ int) (bool, error) { return true, nil }, // mock: process alive } err := manager.restartContainerWorkload(context.Background(), "test-workload", false)