Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions pkg/process/kill_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,50 @@
package process

import (
"errors"
"fmt"
"os"
"syscall"
"time"
)

// KillProcess kills a process by its ID
// KillProcess kills a process by its ID.
// It first sends SIGTERM for graceful shutdown, waits briefly, then sends SIGKILL
// if the process is still running. This handles zombie processes that may survive
// laptop sleep/wake cycles and don't respond to SIGTERM.
func KillProcess(pid int) error {
// Check if the process exists
process, err := os.FindProcess(pid)
proc, err := os.FindProcess(pid)
if err != nil {
return fmt.Errorf("failed to find process: %w", err)
}

// Send a SIGTERM signal to the process
if err := process.Signal(syscall.SIGTERM); err != nil {
// First try graceful termination with SIGTERM
if err := proc.Signal(syscall.SIGTERM); err != nil {
// Process might already be dead
if errors.Is(err, os.ErrProcessDone) {
return nil
}
return fmt.Errorf("failed to send SIGTERM to process: %w", err)
}

// Wait briefly for graceful shutdown
time.Sleep(500 * time.Millisecond)

// Check if process is still alive
alive, err := FindProcess(pid)
if err != nil || !alive {
// Process terminated gracefully
return nil
}

// Process didn't respond to SIGTERM - force kill with SIGKILL
// This handles zombie processes that may survive laptop sleep
if err := proc.Signal(syscall.SIGKILL); err != nil {
if errors.Is(err, os.ErrProcessDone) {
return nil
}
return fmt.Errorf("failed to send SIGKILL to process: %w", err)
}

return nil
}
40 changes: 32 additions & 8 deletions pkg/workloads/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,16 @@ type Manager interface {
DoesWorkloadExist(ctx context.Context, workloadName string) (bool, error)
}

// ProcessFinder is a function type for checking whether a process with the
// given PID exists and is running. It exists as a seam so tests can inject
// a fake in place of process.FindProcess.
type ProcessFinder func(pid int) (bool, error)

// DefaultManager is the default implementation of the Manager interface.
type DefaultManager struct {
	runtime        rt.Runtime             // container runtime used to manage workloads
	statuses       statuses.StatusManager // source of workload status and supervisor PIDs
	configProvider config.Provider
	findProcess    ProcessFinder // nil means fall back to process.FindProcess
}

// ErrWorkloadNotRunning is returned when a container cannot be found by name.
Expand Down Expand Up @@ -810,24 +815,43 @@ func (d *DefaultManager) getWorkloadContainer(ctx context.Context, name string)
}

// isSupervisorProcessAlive reports whether the supervisor process for a
// workload is alive by checking both that a PID is recorded AND that the
// process is actually running. This distinguishes three scenarios:
//   - the supervisor exited cleanly and removed its PID file (not alive)
//   - the supervisor died but its PID file remains (stale PID — cleaned up here)
//   - the process survived a laptop sleep and is still running (alive)
func (d *DefaultManager) isSupervisorProcessAlive(ctx context.Context, name string) bool {
	if name == "" {
		return false
	}

	// Read the recorded PID; without one the supervisor cannot be running.
	pid, err := d.statuses.GetWorkloadPID(ctx, name)
	if err != nil {
		// No PID found, supervisor is not running.
		return false
	}

	// Check the process itself, not just that the PID file exists.
	// d.findProcess is an injectable test seam; default to the real check.
	findProcess := d.findProcess
	if findProcess == nil {
		findProcess = process.FindProcess
	}
	alive, err := findProcess(pid)
	if err != nil {
		logger.Debugf("Error checking if supervisor process %d is alive: %v", pid, err)
		return false
	}

	if !alive {
		// Process is dead but its PID file remains — remove the stale file
		// so subsequent checks and restarts do not keep tripping over it.
		logger.Debugf("Supervisor process %d is dead, cleaning up stale PID file for %s", pid, name)
		if err := process.RemovePIDFile(name); err != nil {
			logger.Warnf("Failed to remove stale PID file for %s: %v", name, err)
		}
		return false
	}

	return true
}

Expand Down
76 changes: 57 additions & 19 deletions pkg/workloads/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -638,13 +638,14 @@ func TestDefaultManager_restartRemoteWorkload(t *testing.T) {
t.Parallel()

tests := []struct {
name string
workloadName string
runConfig *runner.RunConfig
foreground bool
setupMocks func(*statusMocks.MockStatusManager)
expectError bool
errorMsg string
name string
workloadName string
runConfig *runner.RunConfig
foreground bool
setupMocks func(*statusMocks.MockStatusManager)
mockFindProcess func(int) (bool, error) // optional: mock for findProcessFunc
expectError bool
errorMsg string
}{
{
name: "remote workload already running with healthy supervisor",
Expand All @@ -662,6 +663,8 @@ func TestDefaultManager_restartRemoteWorkload(t *testing.T) {
// Check if supervisor is alive - return valid PID (supervisor is healthy)
sm.EXPECT().GetWorkloadPID(gomock.Any(), "remote-base").Return(12345, nil)
},
// Mock process as alive for healthy supervisor test
mockFindProcess: func(_ int) (bool, error) { return true, nil },
// With healthy supervisor, restart should return early (no-op)
expectError: false,
},
Expand Down Expand Up @@ -691,6 +694,34 @@ func TestDefaultManager_restartRemoteWorkload(t *testing.T) {
expectError: true,
errorMsg: "failed to load state",
},
{
name: "remote workload with zombie supervisor (PID exists but process dead)",
workloadName: "remote-workload",
runConfig: &runner.RunConfig{
BaseName: "remote-base",
RemoteURL: "http://example.com",
},
foreground: false,
setupMocks: func(sm *statusMocks.MockStatusManager) {
sm.EXPECT().GetWorkload(gomock.Any(), "remote-workload").Return(core.Workload{
Name: "remote-workload",
Status: runtime.WorkloadStatusRunning,
}, nil)
// PID file exists with a valid PID, but process is dead (zombie scenario)
sm.EXPECT().GetWorkloadPID(gomock.Any(), "remote-base").Return(12345, nil)
// With zombie supervisor detected, restart proceeds with cleanup and restart
sm.EXPECT().SetWorkloadStatus(gomock.Any(), "remote-workload", runtime.WorkloadStatusStopping, "").Return(nil)
sm.EXPECT().GetWorkloadPID(gomock.Any(), "remote-base").Return(12345, nil)
// Allow any subsequent status updates
sm.EXPECT().SetWorkloadStatus(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes().Return(nil)
sm.EXPECT().SetWorkloadPID(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes().Return(nil)
},
// Mock process as dead - this is the zombie scenario where PID file exists but process is gone
mockFindProcess: func(_ int) (bool, error) { return false, nil },
// Restart proceeds to load state which fails in tests
expectError: true,
errorMsg: "failed to load state",
},
{
name: "status manager error",
workloadName: "remote-workload",
Expand Down Expand Up @@ -718,7 +749,8 @@ func TestDefaultManager_restartRemoteWorkload(t *testing.T) {
tt.setupMocks(statusMgr)

manager := &DefaultManager{
statuses: statusMgr,
statuses: statusMgr,
findProcess: tt.mockFindProcess, // inject mock (nil uses default)
}

err := manager.restartRemoteWorkload(context.Background(), tt.workloadName, tt.runConfig, tt.foreground)
Expand All @@ -739,12 +771,13 @@ func TestDefaultManager_restartContainerWorkload(t *testing.T) {
t.Parallel()

tests := []struct {
name string
workloadName string
foreground bool
setupMocks func(*statusMocks.MockStatusManager, *runtimeMocks.MockRuntime)
expectError bool
errorMsg string
name string
workloadName string
foreground bool
setupMocks func(*statusMocks.MockStatusManager, *runtimeMocks.MockRuntime)
mockFindProcess func(int) (bool, error) // optional: mock for findProcessFunc
expectError bool
errorMsg string
}{
{
name: "container workload already running with healthy supervisor",
Expand All @@ -766,6 +799,8 @@ func TestDefaultManager_restartContainerWorkload(t *testing.T) {
// Check if supervisor is alive - return valid PID (supervisor is healthy)
sm.EXPECT().GetWorkloadPID(gomock.Any(), "container-workload").Return(12345, nil)
},
// Mock process as alive for healthy supervisor test
mockFindProcess: func(_ int) (bool, error) { return true, nil },
// With healthy supervisor, restart should return early (no-op)
expectError: false,
},
Expand Down Expand Up @@ -833,8 +868,9 @@ func TestDefaultManager_restartContainerWorkload(t *testing.T) {
tt.setupMocks(statusMgr, runtimeMgr)

manager := &DefaultManager{
statuses: statusMgr,
runtime: runtimeMgr,
statuses: statusMgr,
runtime: runtimeMgr,
findProcess: tt.mockFindProcess, // inject mock (nil uses default)
}

err := manager.restartContainerWorkload(context.Background(), tt.workloadName, tt.foreground)
Expand Down Expand Up @@ -871,7 +907,8 @@ func TestDefaultManager_restartLogicConsistency(t *testing.T) {
statusMgr.EXPECT().GetWorkloadPID(gomock.Any(), "test-base").Return(12345, nil)

manager := &DefaultManager{
statuses: statusMgr,
statuses: statusMgr,
findProcess: func(_ int) (bool, error) { return true, nil }, // mock: process alive
}

runConfig := &runner.RunConfig{
Expand Down Expand Up @@ -949,8 +986,9 @@ func TestDefaultManager_restartLogicConsistency(t *testing.T) {
statusMgr.EXPECT().GetWorkloadPID(gomock.Any(), "test-workload").Return(12345, nil)

manager := &DefaultManager{
statuses: statusMgr,
runtime: runtimeMgr,
statuses: statusMgr,
runtime: runtimeMgr,
findProcess: func(_ int) (bool, error) { return true, nil }, // mock: process alive
}

err := manager.restartContainerWorkload(context.Background(), "test-workload", false)
Expand Down
Loading