fix(agents): improve process termination with multi-layer kill strategy

Resolves #87 and #9 by implementing a robust three-tier process termination approach:

1. ProcessRegistry kill - primary method using run_id tracking
2. ClaudeProcessState kill - fallback via stored process handle
3. System kill command - last resort using PID and OS commands

Key improvements:

- Enhanced logging throughout termination flow for better debugging
- Graceful fallback between termination methods
- Proper UI state management even when backend termination fails
- Track run_id in AgentExecution component for targeted process killing
- Comprehensive error handling with user-friendly feedback
- Consistent event emission for UI synchronization

This ensures agents can be properly stopped without requiring application restart, addressing the core issue where STOP requests were ignored and processes continued running.
2025-07-02 18:17:05 +05:30
parent e8c54d7fad
commit a7e17f16ec
4 changed files with 174 additions and 46 deletions
--- a/src-tauri/src/commands/claude.rs
+++ b/src-tauri/src/commands/claude.rs
@@ -917,25 +917,41 @@ pub async fn cancel_claude_execution(
        session_id
    );

-    let killed = if let Some(sid) = &session_id {
-        // Try to find and kill via ProcessRegistry first
-        let registry = app.state::<crate::process::ProcessRegistryState>();
-        if let Ok(Some(process_info)) = registry.0.get_claude_session_by_id(sid) {
-            match registry.0.kill_process(process_info.run_id).await {
-                Ok(success) => success,
-                Err(e) => {
-                    log::warn!("Failed to kill via registry: {}", e);
-                    false
-                }
-            }
-        } else {
-            false
-        }
-    } else {
-        false
-    };
+    let mut killed = false;
+    let mut attempted_methods = Vec::new();

-    // If registry kill didn't work, try the legacy approach
+    // Method 1: Try to find and kill via ProcessRegistry using session ID
+    if let Some(sid) = &session_id {
+        let registry = app.state::<crate::process::ProcessRegistryState>();
+        match registry.0.get_claude_session_by_id(sid) {
+            Ok(Some(process_info)) => {
+                log::info!("Found process in registry for session {}: run_id={}, PID={}", 
+                    sid, process_info.run_id, process_info.pid);
+                match registry.0.kill_process(process_info.run_id).await {
+                    Ok(success) => {
+                        if success {
+                            log::info!("Successfully killed process via registry");
+                            killed = true;
+                        } else {
+                            log::warn!("Registry kill returned false");
+                        }
+                    }
+                    Err(e) => {
+                        log::warn!("Failed to kill via registry: {}", e);
+                    }
+                }
+                attempted_methods.push("registry");
+            }
+            Ok(None) => {
+                log::warn!("Session {} not found in ProcessRegistry", sid);
+            }
+            Err(e) => {
+                log::error!("Error querying ProcessRegistry: {}", e);
+            }
+        }
+    }
+
+    // Method 2: Try the legacy approach via ClaudeProcessState
    if !killed {
        let claude_state = app.state::<ClaudeProcessState>();
        let mut current_process = claude_state.current_process.lock().await;
@@ -943,24 +959,57 @@ pub async fn cancel_claude_execution(
        if let Some(mut child) = current_process.take() {
            // Try to get the PID before killing
            let pid = child.id();
-            log::info!("Attempting to kill Claude process with PID: {:?}", pid);
+            log::info!("Attempting to kill Claude process via ClaudeProcessState with PID: {:?}", pid);

            // Kill the process
            match child.kill().await {
                Ok(_) => {
-                    log::info!("Successfully killed Claude process");
+                    log::info!("Successfully killed Claude process via ClaudeProcessState");
+                    killed = true;
                }
                Err(e) => {
-                    log::error!("Failed to kill Claude process: {}", e);
-                    return Err(format!("Failed to kill Claude process: {}", e));
+                    log::error!("Failed to kill Claude process via ClaudeProcessState: {}", e);
+                    
+                    // Method 3: If we have a PID, try system kill as last resort
+                    if let Some(pid) = pid {
+                        log::info!("Attempting system kill as last resort for PID: {}", pid);
+                        let kill_result = if cfg!(target_os = "windows") {
+                            std::process::Command::new("taskkill")
+                                .args(["/F", "/PID", &pid.to_string()])
+                                .output()
+                        } else {
+                            std::process::Command::new("kill")
+                                .args(["-KILL", &pid.to_string()])
+                                .output()
+                        };
+                        
+                        match kill_result {
+                            Ok(output) if output.status.success() => {
+                                log::info!("Successfully killed process via system command");
+                                killed = true;
+                            }
+                            Ok(output) => {
+                                let stderr = String::from_utf8_lossy(&output.stderr);
+                                log::error!("System kill failed: {}", stderr);
+                            }
+                            Err(e) => {
+                                log::error!("Failed to execute system kill command: {}", e);
+                            }
+                        }
+                    }
                }
            }
+            attempted_methods.push("claude_state");
        } else {
-            log::warn!("No active Claude process to cancel");
+            log::warn!("No active Claude process in ClaudeProcessState");
        }
    }

-    // Emit cancellation events
+    if !killed && attempted_methods.is_empty() {
+        log::warn!("No active Claude process found to cancel");
+    }
+
+    // Always emit cancellation events for UI consistency
    if let Some(sid) = session_id {
        let _ = app.emit(&format!("claude-cancelled:{}", sid), true);
        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
@@ -972,6 +1021,12 @@ pub async fn cancel_claude_execution(
    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
    let _ = app.emit("claude-complete", false);
    
+    if killed {
+        log::info!("Claude process cancellation completed successfully");
+    } else if !attempted_methods.is_empty() {
+        log::warn!("Claude process cancellation attempted but process may have already exited. Attempted methods: {:?}", attempted_methods);
+    }
+    
    Ok(())
 }

@@ -2063,3 +2118,4 @@ pub async fn track_session_messages(
    }
    Ok(())
 }
+
--- a/src-tauri/src/process/registry.rs
+++ b/src-tauri/src/process/registry.rs
@@ -213,6 +213,7 @@ impl ProcessRegistry {
            if let Some(handle) = processes.get(&run_id) {
                (handle.info.pid, handle.child.clone())
            } else {
+                warn!("Process {} not found in registry", run_id);
                return Ok(false); // Process not found
            }
        };
@@ -233,16 +234,25 @@ impl ProcessRegistry {
                    }
                    Err(e) => {
                        error!("Failed to send kill signal to process {}: {}", run_id, e);
-                        return Err(format!("Failed to kill process: {}", e));
+                        // Don't return error here, try fallback method
+                        false
                    }
                }
            } else {
-                false // Process already killed
+                warn!("No child handle available for process {} (PID: {}), attempting system kill", run_id, pid);
+                false // Process handle not available, try fallback
            }
        };

+        // If direct kill didn't work, try system command as fallback
        if !kill_sent {
-            return Ok(false);
+            info!("Attempting fallback kill for process {} (PID: {})", run_id, pid);
+            match self.kill_process_by_pid(run_id, pid) {
+                Ok(true) => return Ok(true),
+                Ok(false) => warn!("Fallback kill also failed for process {} (PID: {})", run_id, pid),
+                Err(e) => error!("Error during fallback kill: {}", e),
+            }
+            // Continue with the rest of the cleanup even if fallback failed
        }

        // Wait for the process to exit (with timeout)
@@ -297,6 +307,8 @@ impl ProcessRegistry {
                if let Ok(mut child_guard) = child_arc.lock() {
                    *child_guard = None;
                }
+                // One more attempt with system kill
+                let _ = self.kill_process_by_pid(run_id, pid);
            }
        }

--- a/src/components/AgentExecution.tsx
+++ b/src/components/AgentExecution.tsx
@@ -92,6 +92,7 @@ export const AgentExecution: React.FC<AgentExecutionProps> = ({
  const fullscreenMessagesEndRef = useRef<HTMLDivElement>(null);
  const unlistenRefs = useRef<UnlistenFn[]>([]);
  const elapsedTimeIntervalRef = useRef<NodeJS.Timeout | null>(null);
+  const [runId, setRunId] = useState<number | null>(null);

  // Filter out messages that shouldn't be displayed
  const displayableMessages = React.useMemo(() => {
@@ -266,24 +267,24 @@ export const AgentExecution: React.FC<AgentExecutionProps> = ({
  };

  const handleExecute = async () => {
-    if (!projectPath || !task.trim()) return;
-
-    let runId: number | null = null;
-    
    try {
      setIsRunning(true);
-      setError(null);
+      setExecutionStartTime(Date.now());
      setMessages([]);
      setRawJsonlOutput([]);
-      setExecutionStartTime(Date.now());
-      setElapsedTime(0);
-      setTotalTokens(0);
-
-      // Execute the agent with model override and get run ID
-      runId = await api.executeAgent(agent.id!, projectPath, task, model);
+      setRunId(null);
+      
+      // Clear any existing listeners
+      unlistenRefs.current.forEach(unlisten => unlisten());
+      unlistenRefs.current = [];
+      
+      // Execute the agent and get the run ID
+      const executionRunId = await api.executeAgent(agent.id!, projectPath, task, model);
+      console.log("Agent execution started with run ID:", executionRunId);
+      setRunId(executionRunId);
      
      // Set up event listeners with run ID isolation
-      const outputUnlisten = await listen<string>(`agent-output:${runId}`, (event) => {
+      const outputUnlisten = await listen<string>(`agent-output:${executionRunId}`, (event) => {
        try {
          // Store raw JSONL
          setRawJsonlOutput(prev => [...prev, event.payload]);
@@ -296,12 +297,12 @@ export const AgentExecution: React.FC<AgentExecutionProps> = ({
        }
      });

-      const errorUnlisten = await listen<string>(`agent-error:${runId}`, (event) => {
+      const errorUnlisten = await listen<string>(`agent-error:${executionRunId}`, (event) => {
        console.error("Agent error:", event.payload);
        setError(event.payload);
      });

-      const completeUnlisten = await listen<boolean>(`agent-complete:${runId}`, (event) => {
+      const completeUnlisten = await listen<boolean>(`agent-complete:${executionRunId}`, (event) => {
        setIsRunning(false);
        setExecutionStartTime(null);
        if (!event.payload) {
@@ -309,7 +310,7 @@ export const AgentExecution: React.FC<AgentExecutionProps> = ({
        }
      });

-      const cancelUnlisten = await listen<boolean>(`agent-cancelled:${runId}`, () => {
+      const cancelUnlisten = await listen<boolean>(`agent-cancelled:${executionRunId}`, () => {
        setIsRunning(false);
        setExecutionStartTime(null);
        setError("Agent execution was cancelled");
@@ -318,16 +319,41 @@ export const AgentExecution: React.FC<AgentExecutionProps> = ({
      unlistenRefs.current = [outputUnlisten, errorUnlisten, completeUnlisten, cancelUnlisten];
    } catch (err) {
      console.error("Failed to execute agent:", err);
-      setError("Failed to execute agent");
      setIsRunning(false);
      setExecutionStartTime(null);
+      setRunId(null);
+      // Show error in messages
+      setMessages(prev => [...prev, {
+        type: "result",
+        subtype: "error",
+        is_error: true,
+        result: `Failed to execute agent: ${err instanceof Error ? err.message : 'Unknown error'}`,
+        duration_ms: 0,
+        usage: {
+          input_tokens: 0,
+          output_tokens: 0
+        }
+      }]);
    }
  };

  const handleStop = async () => {
    try {
-      // TODO: Implement actual stop functionality via API
-      // For now, just update the UI state
+      if (!runId) {
+        console.error("No run ID available to stop");
+        return;
+      }
+
+      // Call the API to kill the agent session
+      const success = await api.killAgentSession(runId);
+      
+      if (success) {
+        console.log(`Successfully stopped agent session ${runId}`);
+      } else {
+        console.warn(`Failed to stop agent session ${runId} - it may have already finished`);
+      }
+      
+      // Update UI state
      setIsRunning(false);
      setExecutionStartTime(null);
      
@@ -349,6 +375,22 @@ export const AgentExecution: React.FC<AgentExecutionProps> = ({
      }]);
    } catch (err) {
      console.error("Failed to stop agent:", err);
+      // Still update UI state even if the backend call failed
+      setIsRunning(false);
+      setExecutionStartTime(null);
+      
+      // Show error message
+      setMessages(prev => [...prev, {
+        type: "result",
+        subtype: "error",
+        is_error: true,
+        result: `Failed to stop execution: ${err instanceof Error ? err.message : 'Unknown error'}`,
+        duration_ms: elapsedTime * 1000,
+        usage: {
+          input_tokens: totalTokens,
+          output_tokens: 0
+        }
+      }]);
    }
  };

--- a/src/components/ClaudeCodeSession.tsx
+++ b/src/components/ClaudeCodeSession.tsx
@@ -606,7 +606,25 @@ export const ClaudeCodeSession: React.FC<ClaudeCodeSessionProps> = ({
      setError(null);
    } catch (err) {
      console.error("Failed to cancel execution:", err);
-      setError("Failed to cancel execution");
+      
+      // Even if backend fails, we should update UI to reflect stopped state
+      // Add error message but still stop the UI loading state
+      const errorMessage: ClaudeStreamMessage = {
+        type: "system",
+        subtype: "error",
+        result: `Failed to cancel execution: ${err instanceof Error ? err.message : 'Unknown error'}. The process may still be running in the background.`,
+        timestamp: new Date().toISOString()
+      };
+      setMessages(prev => [...prev, errorMessage]);
+      
+      // Clean up listeners anyway
+      unlistenRefs.current.forEach(unlisten => unlisten());
+      unlistenRefs.current = [];
+      
+      // Reset states to allow user to continue
+      setIsLoading(false);
+      hasActiveSessionRef.current = false;
+      setError(null);
    } finally {
      setIsCancelling(false);
    }