diff --git a/agent.py b/agent.py
index 710edb1..e007c7e 100644
--- a/agent.py
+++ b/agent.py
@@ -3,13 +3,22 @@
SuperAgent for Term Challenge - Entry Point (SDK 3.0 Compatible).
This agent accepts --instruction from the validator and runs autonomously.
-Uses litellm for LLM calls instead of term_sdk.
+Supports multiple LLM providers:
+- Chutes API (default): Uses moonshotai/Kimi-K2.5-TEE with thinking mode
+- OpenRouter/litellm: Fallback to other providers
Installation:
pip install . # via pyproject.toml
pip install -r requirements.txt # via requirements.txt
Usage:
+ # With Chutes API (default - requires CHUTES_API_TOKEN)
+ export CHUTES_API_TOKEN="your-token"
+ python agent.py --instruction "Your task description here..."
+
+ # With OpenRouter (fallback)
+ export LLM_PROVIDER="openrouter"
+ export OPENROUTER_API_KEY="your-key"
python agent.py --instruction "Your task description here..."
"""
@@ -29,7 +38,7 @@
def ensure_dependencies():
"""Install dependencies if not present."""
try:
- import litellm
+ import openai
import httpx
import pydantic
except ImportError:
@@ -48,7 +57,7 @@ def ensure_dependencies():
from src.core.loop import run_agent_loop
from src.tools.registry import ToolRegistry
from src.output.jsonl import emit, ErrorEvent
-from src.llm.client import LiteLLMClient, CostLimitExceeded
+from src.llm.client import get_llm_client, CostLimitExceeded, ChutesClient, LiteLLMClient
class AgentContext:
@@ -130,21 +139,30 @@ def main():
parser.add_argument("--instruction", required=True, help="Task instruction from validator")
args = parser.parse_args()
+ provider = CONFIG.get("provider", "chutes")
+
_log("=" * 60)
- _log("SuperAgent Starting (SDK 3.0 - litellm)")
+ _log(f"SuperAgent Starting (SDK 3.0 - {provider})")
_log("=" * 60)
+ _log(f"Provider: {provider}")
_log(f"Model: {CONFIG['model']}")
- _log(f"Reasoning effort: {CONFIG.get('reasoning_effort', 'default')}")
+ _log(f"Thinking mode: {CONFIG.get('enable_thinking', True)}")
_log(f"Instruction: {args.instruction[:200]}...")
_log("-" * 60)
# Initialize components
start_time = time.time()
- llm = LiteLLMClient(
+ # Use factory function to get appropriate client based on provider
+ llm = get_llm_client(
+ provider=provider,
model=CONFIG["model"],
temperature=CONFIG.get("temperature"),
max_tokens=CONFIG.get("max_tokens", 16384),
+ cost_limit=CONFIG.get("cost_limit", 100.0),
+ enable_thinking=CONFIG.get("enable_thinking", True),
+ cache_extended_retention=CONFIG.get("cache_extended_retention", True),
+ cache_key=CONFIG.get("cache_key"),
)
tools = ToolRegistry()
diff --git a/pyproject.toml b/pyproject.toml
index 864644a..6c764a1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
"rich>=13.0",
"typer>=0.12.0",
"litellm>=1.50.0",
+ "openai>=1.0.0",
]
[project.optional-dependencies]
diff --git a/requirements.txt b/requirements.txt
index 02cebfd..465d61c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ tomli-w>=1.0
rich>=13.0
typer>=0.12.0
litellm>=1.50.0
+openai>=1.0.0
diff --git a/src/config/defaults.py b/src/config/defaults.py
index da7615f..22d8796 100644
--- a/src/config/defaults.py
+++ b/src/config/defaults.py
@@ -1,13 +1,12 @@
"""
Hardcoded benchmark configuration for SuperAgent.
-Simulates Codex exec with these flags:
-- --model gpt-5.2
-- -c model_reasoning_effort=xhigh
-- --dangerously-bypass-approvals-and-sandbox
-- --skip-git-repo-check
-- --enable unified_exec
-- --json
+Default provider: Chutes API with Kimi K2.5-TEE model.
+Supports thinking mode with <think>...</think> reasoning blocks.
+
+Alternative providers available via LLM_PROVIDER environment variable:
+- "chutes" (default): Chutes API with Kimi K2.5-TEE
+- "openrouter": OpenRouter with Claude or other models
All settings are hardcoded - no CLI arguments needed.
"""
@@ -18,33 +17,40 @@
from typing import Any, Dict
-# Main configuration - simulates Codex exec benchmark mode
+# Main configuration - default to Chutes API with Kimi K2.5-TEE
CONFIG: Dict[str, Any] = {
# ==========================================================================
- # Model Settings (simulates --model gpt-5.2 -c model_reasoning_effort=xhigh)
+ # Model Settings - Chutes API with Kimi K2.5-TEE
# ==========================================================================
- # Model to use via OpenRouter (prefix with openrouter/ for litellm)
- "model": os.environ.get("LLM_MODEL", "openrouter/anthropic/claude-sonnet-4-20250514"),
+ # Model to use via Chutes API
+ # Kimi K2.5-TEE: 1T params (32B activated), 256K context window
+ # Supports thinking mode with reasoning_content
+ "model": os.environ.get("LLM_MODEL", "moonshotai/Kimi-K2.5-TEE"),
- # Provider
- "provider": "openrouter",
+ # Provider: "chutes" for Chutes API, "openrouter" for litellm/OpenRouter
+ "provider": os.environ.get("LLM_PROVIDER", "chutes"),
- # Reasoning effort: none, minimal, low, medium, high, xhigh (not used for Claude)
- "reasoning_effort": "none",
+ # Enable Kimi K2.5 thinking mode (reasoning in thinking blocks)
+ "enable_thinking": True,
- # Token limits
+ # Token limits (Kimi K2.5 supports up to 32K output)
"max_tokens": 16384,
- # Temperature (0 = deterministic)
- "temperature": 0.0,
+ # Temperature - Kimi K2.5 best practices:
+ # - Thinking mode: 1.0 (with top_p=0.95)
+ # - Instant mode: 0.6 (with top_p=0.95)
+ "temperature": 1.0,
+
+ # Cost limit in USD
+ "cost_limit": 100.0,
# ==========================================================================
# Agent Execution Settings
# ==========================================================================
# Maximum iterations before stopping
- "max_iterations": 200,
+ "max_iterations": 350,
# Maximum tokens for tool output truncation (middle-out strategy)
"max_output_tokens": 2500, # ~10KB
@@ -56,10 +62,10 @@
# Context Management (like OpenCode/Codex)
# ==========================================================================
- # Model context window (Claude Opus 4.5 = 200K)
- "model_context_limit": 200_000,
+ # Model context window (Kimi K2.5 = 256K)
+ "model_context_limit": 256_000,
- # Reserved tokens for output
+ # Reserved tokens for output (Kimi K2.5 can output up to 32K)
"output_token_max": 32_000,
# Trigger compaction at this % of usable context (85%)
@@ -70,16 +76,17 @@
"prune_minimum": 20_000, # Only prune if we can recover at least this many
# ==========================================================================
- # Prompt Caching (Anthropic via OpenRouter/Bedrock)
+ # Prompt Caching
# ==========================================================================
- # Enable prompt caching
+ # Enable prompt caching (Chutes may support server-side caching)
"cache_enabled": True,
- # Note: Anthropic caching requires minimum tokens per breakpoint:
- # - Claude Opus 4.5 on Bedrock: 4096 tokens minimum
- # - Claude Sonnet/other: 1024 tokens minimum
- # System prompt should be large enough to meet this threshold
+ # Chutes API caching notes:
+ # - Kimi K2.5 on Chutes uses server-side optimization
+ # - Keep system prompt stable for best performance
+ "cache_extended_retention": True,
+ "cache_key": None,
# ==========================================================================
# Simulated Codex Flags (all enabled/bypassed for benchmark)
diff --git a/src/llm/client.py b/src/llm/client.py
index 72e048b..8636137 100644
--- a/src/llm/client.py
+++ b/src/llm/client.py
@@ -1,13 +1,44 @@
-"""LLM Client using litellm - replaces term_sdk dependency."""
+"""LLM Client supporting Chutes API and litellm providers.
+
+Supports:
+- Chutes API (https://llm.chutes.ai/v1) with Kimi K2.5-TEE
+- OpenRouter and other litellm-compatible providers (fallback)
+
+Chutes API:
+- OpenAI-compatible endpoint
+- Requires CHUTES_API_TOKEN environment variable
+- Default model: moonshotai/Kimi-K2.5-TEE
+
+Kimi K2.5 Best Practices:
+- Thinking mode: temperature=1.0, top_p=0.95
+- Instant mode: temperature=0.6, top_p=0.95
+- Context window: 256K tokens
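+
+Minimal usage sketch (assumes CHUTES_API_TOKEN is exported; illustration only):
+
+    client = ChutesClient()  # defaults to moonshotai/Kimi-K2.5-TEE, thinking mode on
+    resp = client.chat([{"role": "user", "content": "Summarise the task."}])
+    print(resp.thinking)  # reasoning content, if the model returned any
+    print(resp.text)      # final answer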
+"""
from __future__ import annotations
import json
import os
+import re
import sys
import time
from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
+
+# Chutes API configuration
+CHUTES_API_BASE = "https://llm.chutes.ai/v1"
+CHUTES_DEFAULT_MODEL = "moonshotai/Kimi-K2.5-TEE"
+
+# Kimi K2.5 recommended parameters
+KIMI_K25_THINKING_PARAMS = {
+ "temperature": 1.0, # Use 1.0 for thinking mode
+ "top_p": 0.95,
+}
+
+KIMI_K25_INSTANT_PARAMS = {
+ "temperature": 0.6, # Use 0.6 for instant mode
+ "top_p": 0.95,
+}
class CostLimitExceeded(Exception):
@@ -55,19 +86,283 @@ def from_openai(cls, call: Dict[str, Any]) -> "FunctionCall":
class LLMResponse:
"""Response from the LLM."""
text: str = ""
+ thinking: str = "" # Thinking/reasoning content (for models supporting thinking mode)
function_calls: List[FunctionCall] = field(default_factory=list)
tokens: Optional[Dict[str, int]] = None
model: str = ""
finish_reason: str = ""
raw: Optional[Dict[str, Any]] = None
+ cost: float = 0.0
def has_function_calls(self) -> bool:
"""Check if response contains function calls."""
return len(self.function_calls) > 0
+class ChutesClient:
+ """LLM Client for Chutes API with Kimi K2.5-TEE.
+
+ Chutes API is OpenAI-compatible, hosted at https://llm.chutes.ai/v1
+ Default model: moonshotai/Kimi-K2.5-TEE with thinking mode enabled.
+
+ Environment variable: CHUTES_API_TOKEN
+
+ Kimi K2.5 parameters:
+ - Thinking mode: temperature=1.0, top_p=0.95
+ - Instant mode: temperature=0.6, top_p=0.95
+ - Context window: 256K tokens
+ """
+
+ def __init__(
+ self,
+ model: str = CHUTES_DEFAULT_MODEL,
+ temperature: Optional[float] = None,
+ max_tokens: int = 16384,
+ cost_limit: Optional[float] = None,
+ enable_thinking: bool = True,
+ # Legacy params (kept for compatibility)
+ cache_extended_retention: bool = True,
+ cache_key: Optional[str] = None,
+ ):
+ self.model = model
+ self.max_tokens = max_tokens
+ self.cost_limit = cost_limit or float(os.environ.get("LLM_COST_LIMIT", "100.0"))
+ self.enable_thinking = enable_thinking
+
+ # Set temperature based on thinking mode if not explicitly provided
+ if temperature is None:
+ params = KIMI_K25_THINKING_PARAMS if enable_thinking else KIMI_K25_INSTANT_PARAMS
+ self.temperature = params["temperature"]
+ else:
+ self.temperature = temperature
+
+ self._total_cost = 0.0
+ self._total_tokens = 0
+ self._request_count = 0
+ self._input_tokens = 0
+ self._output_tokens = 0
+ self._cached_tokens = 0
+
+ # Get API token
+ self._api_token = os.environ.get("CHUTES_API_TOKEN")
+ if not self._api_token:
+ raise LLMError(
+ "CHUTES_API_TOKEN environment variable not set. "
+ "Get your API token at https://chutes.ai",
+ code="authentication_error"
+ )
+
+ # Import and configure OpenAI client for Chutes API
+ try:
+ from openai import OpenAI
+ self._client = OpenAI(
+ api_key=self._api_token,
+ base_url=CHUTES_API_BASE,
+ )
+ except ImportError:
+ raise ImportError("openai not installed. Run: pip install openai")
+
+ def _build_tools(self, tools: Optional[List[Dict[str, Any]]]) -> Optional[List[Dict[str, Any]]]:
+ """Build tools in OpenAI format."""
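+        # Example mapping (illustrative): {"name": "bash", "description": "...",
+        # "parameters": {...}} becomes {"type": "function", "function": {...}},
+        # i.e. the OpenAI-compatible tools schema the Chutes endpoint accepts.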
+ if not tools:
+ return None
+
+ result = []
+ for tool in tools:
+ result.append({
+ "type": "function",
+ "function": {
+ "name": tool["name"],
+ "description": tool.get("description", ""),
+ "parameters": tool.get("parameters", {"type": "object", "properties": {}}),
+ },
+ })
+ return result
+
+ def _parse_thinking_content(self, text: str) -> Tuple[str, str]:
+ """Parse thinking content from response.
+
+ Kimi K2.5 can return thinking content in:
+        1. <think>...</think> tags (for some deployments)
+ 2. reasoning_content field (official API)
+
+ Returns (thinking_content, final_response).
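+
+        Illustrative example (assumes the deployment emits literal tags):
+            "<think>outline the patch</think>Here is the fix."
+            -> ("outline the patch", "Here is the fix.")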
+ """
+ if not text:
+ return "", ""
+
+        # Check for <think>...</think> pattern
+        think_pattern = r"<think>(.*?)</think>"
+ match = re.search(think_pattern, text, re.DOTALL)
+
+ if match:
+ thinking = match.group(1).strip()
+ # Remove the think block from the response
+ response = re.sub(think_pattern, "", text, flags=re.DOTALL).strip()
+ return thinking, response
+
+ return "", text
+
+ def chat(
+ self,
+ messages: List[Dict[str, Any]],
+ tools: Optional[List[Dict[str, Any]]] = None,
+ max_tokens: Optional[int] = None,
+ extra_body: Optional[Dict[str, Any]] = None,
+ temperature: Optional[float] = None,
+ ) -> LLMResponse:
+ """Send a chat request to Chutes API.
+
+ Args:
+ messages: List of message dicts with 'role' and 'content'
+ tools: Optional list of tool definitions
+ max_tokens: Max tokens to generate (default: self.max_tokens)
+ extra_body: Additional parameters to pass to the API
+ temperature: Override temperature (default: self.temperature)
+
+ Returns:
+ LLMResponse with text, thinking content, and any tool calls
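+
+        Illustrative call (the tool name and schema below are hypothetical):
+            resp = client.chat(
+                messages=[{"role": "user", "content": "List files"}],
+                tools=[{
+                    "name": "bash",
+                    "description": "Run a shell command",
+                    "parameters": {"type": "object",
+                                   "properties": {"cmd": {"type": "string"}}},
+                }],
+            )
+            if resp.has_function_calls():
+                call = resp.function_calls[0]  # e.g. name="bash", arguments={"cmd": "ls"}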
+ """
+ # Check cost limit
+ if self._total_cost >= self.cost_limit:
+ raise CostLimitExceeded(
+ f"Cost limit exceeded: ${self._total_cost:.4f} >= ${self.cost_limit:.4f}",
+ used=self._total_cost,
+ limit=self.cost_limit,
+ )
+
+ # Use provided temperature or default
+ temp = temperature if temperature is not None else self.temperature
+
+ # Get appropriate params based on thinking mode
+ params = KIMI_K25_THINKING_PARAMS if self.enable_thinking else KIMI_K25_INSTANT_PARAMS
+
+ # Build request kwargs
+ kwargs: Dict[str, Any] = {
+ "model": self.model,
+ "messages": messages,
+ "max_tokens": max_tokens or self.max_tokens,
+ "temperature": temp,
+ "top_p": params["top_p"],
+ }
+
+ if tools:
+ kwargs["tools"] = self._build_tools(tools)
+ kwargs["tool_choice"] = "auto"
+
+ # Add extra body params
+ if extra_body:
+ kwargs.update(extra_body)
+
+ try:
+ response = self._client.chat.completions.create(**kwargs)
+ self._request_count += 1
+ except Exception as e:
+ error_msg = str(e)
+ if "authentication" in error_msg.lower() or "api_key" in error_msg.lower() or "unauthorized" in error_msg.lower():
+ raise LLMError(error_msg, code="authentication_error")
+ elif "rate" in error_msg.lower() or "limit" in error_msg.lower():
+ raise LLMError(error_msg, code="rate_limit")
+ else:
+ raise LLMError(error_msg, code="api_error")
+
+ # Parse response
+ result = LLMResponse(raw=response.model_dump() if hasattr(response, "model_dump") else None)
+
+ # Extract usage
+ if hasattr(response, "usage") and response.usage:
+ usage = response.usage
+ input_tokens = getattr(usage, "prompt_tokens", 0) or 0
+ output_tokens = getattr(usage, "completion_tokens", 0) or 0
+ cached_tokens = 0
+
+ # Check for cached tokens
+ if hasattr(usage, "prompt_tokens_details"):
+ details = usage.prompt_tokens_details
+ if details and hasattr(details, "cached_tokens"):
+ cached_tokens = details.cached_tokens or 0
+
+ self._input_tokens += input_tokens
+ self._output_tokens += output_tokens
+ self._cached_tokens += cached_tokens
+ self._total_tokens += input_tokens + output_tokens
+
+ result.tokens = {
+ "input": input_tokens,
+ "output": output_tokens,
+ "cached": cached_tokens,
+ }
+
+ # Estimate cost (Kimi K2.5 pricing via Chutes - approximate)
+ # $0.60 per million input tokens, $2.50 per million output tokens
+        input_cost_per_1k = 0.0006   # $0.60 per 1M tokens
+        output_cost_per_1k = 0.0025  # $2.50 per 1M tokens
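+        # Worked example (illustrative): 10,000 input + 2,000 output tokens
+        # -> 10 * 0.0006 + 2 * 0.0025 = ~$0.011 for the request.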
+ if result.tokens:
+ cost = (result.tokens["input"] / 1000 * input_cost_per_1k +
+ result.tokens["output"] / 1000 * output_cost_per_1k)
+ self._total_cost += cost
+ result.cost = cost
+
+ # Extract model
+ result.model = getattr(response, "model", self.model)
+
+ # Extract choices
+ if hasattr(response, "choices") and response.choices:
+ choice = response.choices[0]
+ message = choice.message
+
+ result.finish_reason = getattr(choice, "finish_reason", "") or ""
+ raw_text = getattr(message, "content", "") or ""
+
+ # Extract reasoning_content if available (official Kimi API)
+ if hasattr(message, "reasoning_content") and message.reasoning_content:
+ result.thinking = message.reasoning_content
+ result.text = raw_text
+ elif self.enable_thinking:
+ # Parse thinking content from tags
+ result.thinking, result.text = self._parse_thinking_content(raw_text)
+ else:
+ result.text = raw_text
+
+ # Extract function calls
+ tool_calls = getattr(message, "tool_calls", None)
+ if tool_calls:
+ for call in tool_calls:
+ if hasattr(call, "function"):
+ func = call.function
+ args_str = getattr(func, "arguments", "{}")
+ try:
+ args = json.loads(args_str) if isinstance(args_str, str) else args_str
+ except json.JSONDecodeError:
+ args = {"raw": args_str}
+
+ result.function_calls.append(FunctionCall(
+ id=getattr(call, "id", "") or "",
+ name=getattr(func, "name", "") or "",
+ arguments=args if isinstance(args, dict) else {},
+ ))
+
+ return result
+
+ def get_stats(self) -> Dict[str, Any]:
+ """Get usage statistics."""
+ return {
+ "total_tokens": self._total_tokens,
+ "input_tokens": self._input_tokens,
+ "output_tokens": self._output_tokens,
+ "cached_tokens": self._cached_tokens,
+ "total_cost": self._total_cost,
+ "request_count": self._request_count,
+ }
+
+ def close(self):
+ """Close client."""
+ if hasattr(self, "_client"):
+ self._client.close()
+
+
class LiteLLMClient:
- """LLM Client using litellm."""
+ """LLM Client using litellm (fallback for non-Chutes providers)."""
def __init__(
self,
@@ -75,11 +370,16 @@ def __init__(
temperature: Optional[float] = None,
max_tokens: int = 16384,
cost_limit: Optional[float] = None,
+ # Legacy params for compatibility
+ enable_thinking: bool = False,
+ cache_extended_retention: bool = True,
+ cache_key: Optional[str] = None,
):
self.model = model
self.temperature = temperature
self.max_tokens = max_tokens
self.cost_limit = cost_limit or float(os.environ.get("LLM_COST_LIMIT", "10.0"))
+ self.enable_thinking = enable_thinking
self._total_cost = 0.0
self._total_tokens = 0
@@ -128,6 +428,7 @@ def chat(
tools: Optional[List[Dict[str, Any]]] = None,
max_tokens: Optional[int] = None,
extra_body: Optional[Dict[str, Any]] = None,
+ temperature: Optional[float] = None,
) -> LLMResponse:
"""Send a chat request."""
# Check cost limit
@@ -138,6 +439,9 @@ def chat(
limit=self.cost_limit,
)
+ # Use provided temperature or default
+ temp = temperature if temperature is not None else self.temperature
+
# Build request
kwargs: Dict[str, Any] = {
"model": self.model,
@@ -145,8 +449,8 @@ def chat(
"max_tokens": max_tokens or self.max_tokens,
}
- if self._supports_temperature(self.model) and self.temperature is not None:
- kwargs["temperature"] = self.temperature
+ if self._supports_temperature(self.model) and temp is not None:
+ kwargs["temperature"] = temp
if tools:
kwargs["tools"] = self._build_tools(tools)
@@ -199,6 +503,7 @@ def chat(
try:
cost = self._litellm.completion_cost(completion_response=response)
self._total_cost += cost
+ result.cost = cost
except Exception:
pass # Cost calculation may fail for some models
@@ -247,3 +552,46 @@ def get_stats(self) -> Dict[str, Any]:
def close(self):
"""Close client (no-op for litellm)."""
pass
+
+
+def get_llm_client(
+ provider: str = "chutes",
+ model: Optional[str] = None,
+ temperature: Optional[float] = None,
+ max_tokens: int = 16384,
+ cost_limit: Optional[float] = None,
+ enable_thinking: bool = True,
+ **kwargs,
+):
+ """Factory function to get appropriate LLM client based on provider.
+
+ Args:
+ provider: "chutes" for Chutes API, "openrouter" or others for litellm
+ model: Model name (default depends on provider)
+ temperature: Temperature setting (default based on thinking mode)
+ max_tokens: Max tokens to generate
+ cost_limit: Cost limit in USD
+ enable_thinking: Enable thinking mode (for Chutes/Kimi K2.5)
+ **kwargs: Additional arguments passed to client
+
+ Returns:
+ ChutesClient or LiteLLMClient instance
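+
+    Usage sketch (assumes CHUTES_API_TOKEN is set; illustration only):
+        llm = get_llm_client(provider="chutes", enable_thinking=True)
+        resp = llm.chat([{"role": "user", "content": "Hello"}])
+        print(resp.text, llm.get_stats()["total_cost"])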
+ """
+ if provider == "chutes":
+ return ChutesClient(
+ model=model or CHUTES_DEFAULT_MODEL,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ cost_limit=cost_limit,
+ enable_thinking=enable_thinking,
+ **kwargs,
+ )
+ else:
+ return LiteLLMClient(
+ model=model or "openrouter/anthropic/claude-sonnet-4-20250514",
+ temperature=temperature,
+ max_tokens=max_tokens,
+ cost_limit=cost_limit,
+ enable_thinking=enable_thinking,
+ **kwargs,
+ )