diff --git a/agent.py b/agent.py
index 710edb1..e007c7e 100644
--- a/agent.py
+++ b/agent.py
@@ -3,13 +3,22 @@
 SuperAgent for Term Challenge - Entry Point (SDK 3.0 Compatible).
 
 This agent accepts --instruction from the validator and runs autonomously.
-Uses litellm for LLM calls instead of term_sdk.
+Supports multiple LLM providers:
+- Chutes API (default): Uses moonshotai/Kimi-K2.5-TEE with thinking mode
+- OpenRouter/litellm: Fallback to other providers
 
 Installation:
     pip install .                    # via pyproject.toml
     pip install -r requirements.txt  # via requirements.txt
 
 Usage:
+    # With Chutes API (default - requires CHUTES_API_TOKEN)
+    export CHUTES_API_TOKEN="your-token"
+    python agent.py --instruction "Your task description here..."
+
+    # With OpenRouter (fallback)
+    export LLM_PROVIDER="openrouter"
+    export OPENROUTER_API_KEY="your-key"
     python agent.py --instruction "Your task description here..."
 """
 
@@ -29,7 +38,7 @@ def ensure_dependencies():
     """Install dependencies if not present."""
     try:
-        import litellm
+        import openai
         import httpx
         import pydantic
     except ImportError:
@@ -48,7 +57,7 @@ def ensure_dependencies():
 from src.core.loop import run_agent_loop
 from src.tools.registry import ToolRegistry
 from src.output.jsonl import emit, ErrorEvent
-from src.llm.client import LiteLLMClient, CostLimitExceeded
+from src.llm.client import get_llm_client, CostLimitExceeded, ChutesClient, LiteLLMClient
 
 
 class AgentContext:
@@ -130,21 +139,30 @@ def main():
     parser.add_argument("--instruction", required=True, help="Task instruction from validator")
     args = parser.parse_args()
 
+    provider = CONFIG.get("provider", "chutes")
+
     _log("=" * 60)
-    _log("SuperAgent Starting (SDK 3.0 - litellm)")
+    _log(f"SuperAgent Starting (SDK 3.0 - {provider})")
     _log("=" * 60)
+    _log(f"Provider: {provider}")
     _log(f"Model: {CONFIG['model']}")
-    _log(f"Reasoning effort: {CONFIG.get('reasoning_effort', 'default')}")
+    _log(f"Thinking mode: {CONFIG.get('enable_thinking', True)}")
     _log(f"Instruction: {args.instruction[:200]}...")
     _log("-" * 60)
 
     # Initialize components
     start_time = time.time()
-    llm = LiteLLMClient(
+    # Use factory function to get appropriate client based on provider
+    llm = get_llm_client(
+        provider=provider,
         model=CONFIG["model"],
         temperature=CONFIG.get("temperature"),
         max_tokens=CONFIG.get("max_tokens", 16384),
+        cost_limit=CONFIG.get("cost_limit", 100.0),
+        enable_thinking=CONFIG.get("enable_thinking", True),
+        cache_extended_retention=CONFIG.get("cache_extended_retention", True),
+        cache_key=CONFIG.get("cache_key"),
     )
     tools = ToolRegistry()
diff --git a/pyproject.toml b/pyproject.toml
index 864644a..6c764a1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "rich>=13.0",
     "typer>=0.12.0",
     "litellm>=1.50.0",
+    "openai>=1.0.0",
 ]
 
 [project.optional-dependencies]
diff --git a/requirements.txt b/requirements.txt
index 02cebfd..465d61c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ tomli-w>=1.0
 rich>=13.0
 typer>=0.12.0
 litellm>=1.50.0
+openai>=1.0.0
diff --git a/src/config/defaults.py b/src/config/defaults.py
index da7615f..22d8796 100644
--- a/src/config/defaults.py
+++ b/src/config/defaults.py
@@ -1,13 +1,12 @@
 """
 Hardcoded benchmark configuration for SuperAgent.
 
-Simulates Codex exec with these flags:
-- --model gpt-5.2
-- -c model_reasoning_effort=xhigh
-- --dangerously-bypass-approvals-and-sandbox
-- --skip-git-repo-check
-- --enable unified_exec
-- --json
+Default provider: Chutes API with Kimi K2.5-TEE model.
+Supports thinking mode with <think>...</think> reasoning blocks.
+ +Alternative providers available via LLM_PROVIDER environment variable: +- "chutes" (default): Chutes API with Kimi K2.5-TEE +- "openrouter": OpenRouter with Claude or other models All settings are hardcoded - no CLI arguments needed. """ @@ -18,33 +17,40 @@ from typing import Any, Dict -# Main configuration - simulates Codex exec benchmark mode +# Main configuration - default to Chutes API with Kimi K2.5-TEE CONFIG: Dict[str, Any] = { # ========================================================================== - # Model Settings (simulates --model gpt-5.2 -c model_reasoning_effort=xhigh) + # Model Settings - Chutes API with Kimi K2.5-TEE # ========================================================================== - # Model to use via OpenRouter (prefix with openrouter/ for litellm) - "model": os.environ.get("LLM_MODEL", "openrouter/anthropic/claude-sonnet-4-20250514"), + # Model to use via Chutes API + # Kimi K2.5-TEE: 1T params (32B activated), 256K context window + # Supports thinking mode with reasoning_content + "model": os.environ.get("LLM_MODEL", "moonshotai/Kimi-K2.5-TEE"), - # Provider - "provider": "openrouter", + # Provider: "chutes" for Chutes API, "openrouter" for litellm/OpenRouter + "provider": os.environ.get("LLM_PROVIDER", "chutes"), - # Reasoning effort: none, minimal, low, medium, high, xhigh (not used for Claude) - "reasoning_effort": "none", + # Enable Kimi K2.5 thinking mode (reasoning in thinking blocks) + "enable_thinking": True, - # Token limits + # Token limits (Kimi K2.5 supports up to 32K output) "max_tokens": 16384, - # Temperature (0 = deterministic) - "temperature": 0.0, + # Temperature - Kimi K2.5 best practices: + # - Thinking mode: 1.0 (with top_p=0.95) + # - Instant mode: 0.6 (with top_p=0.95) + "temperature": 1.0, + + # Cost limit in USD + "cost_limit": 100.0, # ========================================================================== # Agent Execution Settings # ========================================================================== # Maximum iterations before stopping - "max_iterations": 200, + "max_iterations": 350, # Maximum tokens for tool output truncation (middle-out strategy) "max_output_tokens": 2500, # ~10KB @@ -56,10 +62,10 @@ # Context Management (like OpenCode/Codex) # ========================================================================== - # Model context window (Claude Opus 4.5 = 200K) - "model_context_limit": 200_000, + # Model context window (Kimi K2.5 = 256K) + "model_context_limit": 256_000, - # Reserved tokens for output + # Reserved tokens for output (Kimi K2.5 can output up to 32K) "output_token_max": 32_000, # Trigger compaction at this % of usable context (85%) @@ -70,16 +76,17 @@ "prune_minimum": 20_000, # Only prune if we can recover at least this many # ========================================================================== - # Prompt Caching (Anthropic via OpenRouter/Bedrock) + # Prompt Caching # ========================================================================== - # Enable prompt caching + # Enable prompt caching (Chutes may support server-side caching) "cache_enabled": True, - # Note: Anthropic caching requires minimum tokens per breakpoint: - # - Claude Opus 4.5 on Bedrock: 4096 tokens minimum - # - Claude Sonnet/other: 1024 tokens minimum - # System prompt should be large enough to meet this threshold + # Chutes API caching notes: + # - Kimi K2.5 on Chutes uses server-side optimization + # - Keep system prompt stable for best performance + "cache_extended_retention": True, + "cache_key": None, # 
========================================================================== # Simulated Codex Flags (all enabled/bypassed for benchmark) diff --git a/src/llm/client.py b/src/llm/client.py index 72e048b..8636137 100644 --- a/src/llm/client.py +++ b/src/llm/client.py @@ -1,13 +1,44 @@ -"""LLM Client using litellm - replaces term_sdk dependency.""" +"""LLM Client supporting Chutes API and litellm providers. + +Supports: +- Chutes API (https://llm.chutes.ai/v1) with Kimi K2.5-TEE +- OpenRouter and other litellm-compatible providers (fallback) + +Chutes API: +- OpenAI-compatible endpoint +- Requires CHUTES_API_TOKEN environment variable +- Default model: moonshotai/Kimi-K2.5-TEE + +Kimi K2.5 Best Practices: +- Thinking mode: temperature=1.0, top_p=0.95 +- Instant mode: temperature=0.6, top_p=0.95 +- Context window: 256K tokens +""" from __future__ import annotations import json import os +import re import sys import time from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple + +# Chutes API configuration +CHUTES_API_BASE = "https://llm.chutes.ai/v1" +CHUTES_DEFAULT_MODEL = "moonshotai/Kimi-K2.5-TEE" + +# Kimi K2.5 recommended parameters +KIMI_K25_THINKING_PARAMS = { + "temperature": 1.0, # Use 1.0 for thinking mode + "top_p": 0.95, +} + +KIMI_K25_INSTANT_PARAMS = { + "temperature": 0.6, # Use 0.6 for instant mode + "top_p": 0.95, +} class CostLimitExceeded(Exception): @@ -55,19 +86,283 @@ def from_openai(cls, call: Dict[str, Any]) -> "FunctionCall": class LLMResponse: """Response from the LLM.""" text: str = "" + thinking: str = "" # Thinking/reasoning content (for models supporting thinking mode) function_calls: List[FunctionCall] = field(default_factory=list) tokens: Optional[Dict[str, int]] = None model: str = "" finish_reason: str = "" raw: Optional[Dict[str, Any]] = None + cost: float = 0.0 def has_function_calls(self) -> bool: """Check if response contains function calls.""" return len(self.function_calls) > 0 +class ChutesClient: + """LLM Client for Chutes API with Kimi K2.5-TEE. + + Chutes API is OpenAI-compatible, hosted at https://llm.chutes.ai/v1 + Default model: moonshotai/Kimi-K2.5-TEE with thinking mode enabled. + + Environment variable: CHUTES_API_TOKEN + + Kimi K2.5 parameters: + - Thinking mode: temperature=1.0, top_p=0.95 + - Instant mode: temperature=0.6, top_p=0.95 + - Context window: 256K tokens + """ + + def __init__( + self, + model: str = CHUTES_DEFAULT_MODEL, + temperature: Optional[float] = None, + max_tokens: int = 16384, + cost_limit: Optional[float] = None, + enable_thinking: bool = True, + # Legacy params (kept for compatibility) + cache_extended_retention: bool = True, + cache_key: Optional[str] = None, + ): + self.model = model + self.max_tokens = max_tokens + self.cost_limit = cost_limit or float(os.environ.get("LLM_COST_LIMIT", "100.0")) + self.enable_thinking = enable_thinking + + # Set temperature based on thinking mode if not explicitly provided + if temperature is None: + params = KIMI_K25_THINKING_PARAMS if enable_thinking else KIMI_K25_INSTANT_PARAMS + self.temperature = params["temperature"] + else: + self.temperature = temperature + + self._total_cost = 0.0 + self._total_tokens = 0 + self._request_count = 0 + self._input_tokens = 0 + self._output_tokens = 0 + self._cached_tokens = 0 + + # Get API token + self._api_token = os.environ.get("CHUTES_API_TOKEN") + if not self._api_token: + raise LLMError( + "CHUTES_API_TOKEN environment variable not set. 
" + "Get your API token at https://chutes.ai", + code="authentication_error" + ) + + # Import and configure OpenAI client for Chutes API + try: + from openai import OpenAI + self._client = OpenAI( + api_key=self._api_token, + base_url=CHUTES_API_BASE, + ) + except ImportError: + raise ImportError("openai not installed. Run: pip install openai") + + def _build_tools(self, tools: Optional[List[Dict[str, Any]]]) -> Optional[List[Dict[str, Any]]]: + """Build tools in OpenAI format.""" + if not tools: + return None + + result = [] + for tool in tools: + result.append({ + "type": "function", + "function": { + "name": tool["name"], + "description": tool.get("description", ""), + "parameters": tool.get("parameters", {"type": "object", "properties": {}}), + }, + }) + return result + + def _parse_thinking_content(self, text: str) -> Tuple[str, str]: + """Parse thinking content from response. + + Kimi K2.5 can return thinking content in: + 1. ... tags (for some deployments) + 2. reasoning_content field (official API) + + Returns (thinking_content, final_response). + """ + if not text: + return "", "" + + # Check for ... pattern + think_pattern = r"(.*?)" + match = re.search(think_pattern, text, re.DOTALL) + + if match: + thinking = match.group(1).strip() + # Remove the think block from the response + response = re.sub(think_pattern, "", text, flags=re.DOTALL).strip() + return thinking, response + + return "", text + + def chat( + self, + messages: List[Dict[str, Any]], + tools: Optional[List[Dict[str, Any]]] = None, + max_tokens: Optional[int] = None, + extra_body: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = None, + ) -> LLMResponse: + """Send a chat request to Chutes API. + + Args: + messages: List of message dicts with 'role' and 'content' + tools: Optional list of tool definitions + max_tokens: Max tokens to generate (default: self.max_tokens) + extra_body: Additional parameters to pass to the API + temperature: Override temperature (default: self.temperature) + + Returns: + LLMResponse with text, thinking content, and any tool calls + """ + # Check cost limit + if self._total_cost >= self.cost_limit: + raise CostLimitExceeded( + f"Cost limit exceeded: ${self._total_cost:.4f} >= ${self.cost_limit:.4f}", + used=self._total_cost, + limit=self.cost_limit, + ) + + # Use provided temperature or default + temp = temperature if temperature is not None else self.temperature + + # Get appropriate params based on thinking mode + params = KIMI_K25_THINKING_PARAMS if self.enable_thinking else KIMI_K25_INSTANT_PARAMS + + # Build request kwargs + kwargs: Dict[str, Any] = { + "model": self.model, + "messages": messages, + "max_tokens": max_tokens or self.max_tokens, + "temperature": temp, + "top_p": params["top_p"], + } + + if tools: + kwargs["tools"] = self._build_tools(tools) + kwargs["tool_choice"] = "auto" + + # Add extra body params + if extra_body: + kwargs.update(extra_body) + + try: + response = self._client.chat.completions.create(**kwargs) + self._request_count += 1 + except Exception as e: + error_msg = str(e) + if "authentication" in error_msg.lower() or "api_key" in error_msg.lower() or "unauthorized" in error_msg.lower(): + raise LLMError(error_msg, code="authentication_error") + elif "rate" in error_msg.lower() or "limit" in error_msg.lower(): + raise LLMError(error_msg, code="rate_limit") + else: + raise LLMError(error_msg, code="api_error") + + # Parse response + result = LLMResponse(raw=response.model_dump() if hasattr(response, "model_dump") else None) + + # Extract usage 
+ if hasattr(response, "usage") and response.usage: + usage = response.usage + input_tokens = getattr(usage, "prompt_tokens", 0) or 0 + output_tokens = getattr(usage, "completion_tokens", 0) or 0 + cached_tokens = 0 + + # Check for cached tokens + if hasattr(usage, "prompt_tokens_details"): + details = usage.prompt_tokens_details + if details and hasattr(details, "cached_tokens"): + cached_tokens = details.cached_tokens or 0 + + self._input_tokens += input_tokens + self._output_tokens += output_tokens + self._cached_tokens += cached_tokens + self._total_tokens += input_tokens + output_tokens + + result.tokens = { + "input": input_tokens, + "output": output_tokens, + "cached": cached_tokens, + } + + # Estimate cost (Kimi K2.5 pricing via Chutes - approximate) + # $0.60 per million input tokens, $2.50 per million output tokens + input_cost_per_1k = 0.0006 # $0.60 / 1000 + output_cost_per_1k = 0.0025 # $2.50 / 1000 + if result.tokens: + cost = (result.tokens["input"] / 1000 * input_cost_per_1k + + result.tokens["output"] / 1000 * output_cost_per_1k) + self._total_cost += cost + result.cost = cost + + # Extract model + result.model = getattr(response, "model", self.model) + + # Extract choices + if hasattr(response, "choices") and response.choices: + choice = response.choices[0] + message = choice.message + + result.finish_reason = getattr(choice, "finish_reason", "") or "" + raw_text = getattr(message, "content", "") or "" + + # Extract reasoning_content if available (official Kimi API) + if hasattr(message, "reasoning_content") and message.reasoning_content: + result.thinking = message.reasoning_content + result.text = raw_text + elif self.enable_thinking: + # Parse thinking content from tags + result.thinking, result.text = self._parse_thinking_content(raw_text) + else: + result.text = raw_text + + # Extract function calls + tool_calls = getattr(message, "tool_calls", None) + if tool_calls: + for call in tool_calls: + if hasattr(call, "function"): + func = call.function + args_str = getattr(func, "arguments", "{}") + try: + args = json.loads(args_str) if isinstance(args_str, str) else args_str + except json.JSONDecodeError: + args = {"raw": args_str} + + result.function_calls.append(FunctionCall( + id=getattr(call, "id", "") or "", + name=getattr(func, "name", "") or "", + arguments=args if isinstance(args, dict) else {}, + )) + + return result + + def get_stats(self) -> Dict[str, Any]: + """Get usage statistics.""" + return { + "total_tokens": self._total_tokens, + "input_tokens": self._input_tokens, + "output_tokens": self._output_tokens, + "cached_tokens": self._cached_tokens, + "total_cost": self._total_cost, + "request_count": self._request_count, + } + + def close(self): + """Close client.""" + if hasattr(self, "_client"): + self._client.close() + + class LiteLLMClient: - """LLM Client using litellm.""" + """LLM Client using litellm (fallback for non-Chutes providers).""" def __init__( self, @@ -75,11 +370,16 @@ def __init__( temperature: Optional[float] = None, max_tokens: int = 16384, cost_limit: Optional[float] = None, + # Legacy params for compatibility + enable_thinking: bool = False, + cache_extended_retention: bool = True, + cache_key: Optional[str] = None, ): self.model = model self.temperature = temperature self.max_tokens = max_tokens self.cost_limit = cost_limit or float(os.environ.get("LLM_COST_LIMIT", "10.0")) + self.enable_thinking = enable_thinking self._total_cost = 0.0 self._total_tokens = 0 @@ -128,6 +428,7 @@ def chat( tools: Optional[List[Dict[str, Any]]] = None, 
max_tokens: Optional[int] = None, extra_body: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = None, ) -> LLMResponse: """Send a chat request.""" # Check cost limit @@ -138,6 +439,9 @@ def chat( limit=self.cost_limit, ) + # Use provided temperature or default + temp = temperature if temperature is not None else self.temperature + # Build request kwargs: Dict[str, Any] = { "model": self.model, @@ -145,8 +449,8 @@ def chat( "max_tokens": max_tokens or self.max_tokens, } - if self._supports_temperature(self.model) and self.temperature is not None: - kwargs["temperature"] = self.temperature + if self._supports_temperature(self.model) and temp is not None: + kwargs["temperature"] = temp if tools: kwargs["tools"] = self._build_tools(tools) @@ -199,6 +503,7 @@ def chat( try: cost = self._litellm.completion_cost(completion_response=response) self._total_cost += cost + result.cost = cost except Exception: pass # Cost calculation may fail for some models @@ -247,3 +552,46 @@ def get_stats(self) -> Dict[str, Any]: def close(self): """Close client (no-op for litellm).""" pass + + +def get_llm_client( + provider: str = "chutes", + model: Optional[str] = None, + temperature: Optional[float] = None, + max_tokens: int = 16384, + cost_limit: Optional[float] = None, + enable_thinking: bool = True, + **kwargs, +): + """Factory function to get appropriate LLM client based on provider. + + Args: + provider: "chutes" for Chutes API, "openrouter" or others for litellm + model: Model name (default depends on provider) + temperature: Temperature setting (default based on thinking mode) + max_tokens: Max tokens to generate + cost_limit: Cost limit in USD + enable_thinking: Enable thinking mode (for Chutes/Kimi K2.5) + **kwargs: Additional arguments passed to client + + Returns: + ChutesClient or LiteLLMClient instance + """ + if provider == "chutes": + return ChutesClient( + model=model or CHUTES_DEFAULT_MODEL, + temperature=temperature, + max_tokens=max_tokens, + cost_limit=cost_limit, + enable_thinking=enable_thinking, + **kwargs, + ) + else: + return LiteLLMClient( + model=model or "openrouter/anthropic/claude-sonnet-4-20250514", + temperature=temperature, + max_tokens=max_tokens, + cost_limit=cost_limit, + enable_thinking=enable_thinking, + **kwargs, + )
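
For reference, a minimal usage sketch of the new get_llm_client factory and the ChutesClient.chat path added above (not part of the patch). It assumes CHUTES_API_TOKEN is exported, the repository root is on sys.path, and the prompt text and cost_limit value are placeholders.

    # Usage sketch (illustrative): build a client via the factory and run one chat turn.
    from src.llm.client import get_llm_client, CostLimitExceeded

    llm = get_llm_client(provider="chutes", enable_thinking=True, cost_limit=5.0)
    try:
        resp = llm.chat(messages=[{"role": "user", "content": "Summarize the repo layout."}])
        print("thinking:", resp.thinking[:200])  # reasoning recovered from <think> blocks or reasoning_content
        print("answer:", resp.text)
        print("tokens:", resp.tokens, "estimated cost:", resp.cost)
    except CostLimitExceeded as exc:
        print(f"aborting: {exc}")
    finally:
        llm.close()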