diff --git a/agent.py b/agent.py
index 710edb1..e007c7e 100644
--- a/agent.py
+++ b/agent.py
@@ -3,13 +3,22 @@
 SuperAgent for Term Challenge - Entry Point (SDK 3.0 Compatible).
 
 This agent accepts --instruction from the validator and runs autonomously.
-Uses litellm for LLM calls instead of term_sdk.
+Supports multiple LLM providers:
+- Chutes API (default): Uses moonshotai/Kimi-K2.5-TEE with thinking mode
+- OpenRouter/litellm: Fallback to other providers
 
 Installation:
     pip install .                    # via pyproject.toml
     pip install -r requirements.txt  # via requirements.txt
 
 Usage:
+    # With Chutes API (default - requires CHUTES_API_TOKEN)
+    export CHUTES_API_TOKEN="your-token"
+    python agent.py --instruction "Your task description here..."
+
+    # With OpenRouter (fallback)
+    export LLM_PROVIDER="openrouter"
+    export OPENROUTER_API_KEY="your-key"
     python agent.py --instruction "Your task description here..."
 """
 
@@ -29,7 +38,7 @@ def ensure_dependencies():
     """Install dependencies if not present."""
     try:
-        import litellm
+        import openai
         import httpx
         import pydantic
     except ImportError:
@@ -48,7 +57,7 @@ def ensure_dependencies():
 from src.core.loop import run_agent_loop
 from src.tools.registry import ToolRegistry
 from src.output.jsonl import emit, ErrorEvent
-from src.llm.client import LiteLLMClient, CostLimitExceeded
+from src.llm.client import get_llm_client, CostLimitExceeded, ChutesClient, LiteLLMClient
 
 
 class AgentContext:
@@ -130,21 +139,30 @@ def main():
     parser.add_argument("--instruction", required=True, help="Task instruction from validator")
     args = parser.parse_args()
 
+    provider = CONFIG.get("provider", "chutes")
+
     _log("=" * 60)
-    _log("SuperAgent Starting (SDK 3.0 - litellm)")
+    _log(f"SuperAgent Starting (SDK 3.0 - {provider})")
     _log("=" * 60)
+    _log(f"Provider: {provider}")
     _log(f"Model: {CONFIG['model']}")
-    _log(f"Reasoning effort: {CONFIG.get('reasoning_effort', 'default')}")
+    _log(f"Thinking mode: {CONFIG.get('enable_thinking', True)}")
     _log(f"Instruction: {args.instruction[:200]}...")
     _log("-" * 60)
 
     # Initialize components
     start_time = time.time()
-    llm = LiteLLMClient(
+    # Use factory function to get appropriate client based on provider
+    llm = get_llm_client(
+        provider=provider,
         model=CONFIG["model"],
         temperature=CONFIG.get("temperature"),
         max_tokens=CONFIG.get("max_tokens", 16384),
+        cost_limit=CONFIG.get("cost_limit", 100.0),
+        enable_thinking=CONFIG.get("enable_thinking", True),
+        cache_extended_retention=CONFIG.get("cache_extended_retention", True),
+        cache_key=CONFIG.get("cache_key"),
     )
     tools = ToolRegistry()
diff --git a/pyproject.toml b/pyproject.toml
index 864644a..6c764a1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "rich>=13.0",
     "typer>=0.12.0",
     "litellm>=1.50.0",
+    "openai>=1.0.0",
 ]
 
 [project.optional-dependencies]
diff --git a/requirements.txt b/requirements.txt
index 02cebfd..465d61c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ tomli-w>=1.0
 rich>=13.0
 typer>=0.12.0
 litellm>=1.50.0
+openai>=1.0.0
diff --git a/src/config/defaults.py b/src/config/defaults.py
index da7615f..22d8796 100644
--- a/src/config/defaults.py
+++ b/src/config/defaults.py
@@ -1,13 +1,12 @@
 """
 Hardcoded benchmark configuration for SuperAgent.
 
-Simulates Codex exec with these flags:
-- --model gpt-5.2
-- -c model_reasoning_effort=xhigh
-- --dangerously-bypass-approvals-and-sandbox
-- --skip-git-repo-check
-- --enable unified_exec
-- --json
+Default provider: Chutes API with Kimi K2.5-TEE model.
+Supports thinking mode with <think>...</think> reasoning blocks.
+ +Alternative providers available via LLM_PROVIDER environment variable: +- "chutes" (default): Chutes API with Kimi K2.5-TEE +- "openrouter": OpenRouter with Claude or other models All settings are hardcoded - no CLI arguments needed. """ @@ -18,33 +17,40 @@ from typing import Any, Dict -# Main configuration - simulates Codex exec benchmark mode +# Main configuration - default to Chutes API with Kimi K2.5-TEE CONFIG: Dict[str, Any] = { # ========================================================================== - # Model Settings (simulates --model gpt-5.2 -c model_reasoning_effort=xhigh) + # Model Settings - Chutes API with Kimi K2.5-TEE # ========================================================================== - # Model to use via OpenRouter (prefix with openrouter/ for litellm) - "model": os.environ.get("LLM_MODEL", "openrouter/anthropic/claude-sonnet-4-20250514"), + # Model to use via Chutes API + # Kimi K2.5-TEE: 1T params (32B activated), 256K context window + # Supports thinking mode with reasoning_content + "model": os.environ.get("LLM_MODEL", "moonshotai/Kimi-K2.5-TEE"), - # Provider - "provider": "openrouter", + # Provider: "chutes" for Chutes API, "openrouter" for litellm/OpenRouter + "provider": os.environ.get("LLM_PROVIDER", "chutes"), - # Reasoning effort: none, minimal, low, medium, high, xhigh (not used for Claude) - "reasoning_effort": "none", + # Enable Kimi K2.5 thinking mode (reasoning in thinking blocks) + "enable_thinking": True, - # Token limits + # Token limits (Kimi K2.5 supports up to 32K output) "max_tokens": 16384, - # Temperature (0 = deterministic) - "temperature": 0.0, + # Temperature - Kimi K2.5 best practices: + # - Thinking mode: 1.0 (with top_p=0.95) + # - Instant mode: 0.6 (with top_p=0.95) + "temperature": 1.0, + + # Cost limit in USD + "cost_limit": 100.0, # ========================================================================== # Agent Execution Settings # ========================================================================== # Maximum iterations before stopping - "max_iterations": 200, + "max_iterations": 350, # Maximum tokens for tool output truncation (middle-out strategy) "max_output_tokens": 2500, # ~10KB @@ -56,10 +62,10 @@ # Context Management (like OpenCode/Codex) # ========================================================================== - # Model context window (Claude Opus 4.5 = 200K) - "model_context_limit": 200_000, + # Model context window (Kimi K2.5 = 256K) + "model_context_limit": 256_000, - # Reserved tokens for output + # Reserved tokens for output (Kimi K2.5 can output up to 32K) "output_token_max": 32_000, # Trigger compaction at this % of usable context (85%) @@ -70,16 +76,17 @@ "prune_minimum": 20_000, # Only prune if we can recover at least this many # ========================================================================== - # Prompt Caching (Anthropic via OpenRouter/Bedrock) + # Prompt Caching # ========================================================================== - # Enable prompt caching + # Enable prompt caching (Chutes may support server-side caching) "cache_enabled": True, - # Note: Anthropic caching requires minimum tokens per breakpoint: - # - Claude Opus 4.5 on Bedrock: 4096 tokens minimum - # - Claude Sonnet/other: 1024 tokens minimum - # System prompt should be large enough to meet this threshold + # Chutes API caching notes: + # - Kimi K2.5 on Chutes uses server-side optimization + # - Keep system prompt stable for best performance + "cache_extended_retention": True, + "cache_key": None, # 
========================================================================== # Simulated Codex Flags (all enabled/bypassed for benchmark) diff --git a/src/llm/client.py b/src/llm/client.py index 72e048b..8636137 100644 --- a/src/llm/client.py +++ b/src/llm/client.py @@ -1,13 +1,44 @@ -"""LLM Client using litellm - replaces term_sdk dependency.""" +"""LLM Client supporting Chutes API and litellm providers. + +Supports: +- Chutes API (https://llm.chutes.ai/v1) with Kimi K2.5-TEE +- OpenRouter and other litellm-compatible providers (fallback) + +Chutes API: +- OpenAI-compatible endpoint +- Requires CHUTES_API_TOKEN environment variable +- Default model: moonshotai/Kimi-K2.5-TEE + +Kimi K2.5 Best Practices: +- Thinking mode: temperature=1.0, top_p=0.95 +- Instant mode: temperature=0.6, top_p=0.95 +- Context window: 256K tokens +""" from __future__ import annotations import json import os +import re import sys import time from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple + +# Chutes API configuration +CHUTES_API_BASE = "https://llm.chutes.ai/v1" +CHUTES_DEFAULT_MODEL = "moonshotai/Kimi-K2.5-TEE" + +# Kimi K2.5 recommended parameters +KIMI_K25_THINKING_PARAMS = { + "temperature": 1.0, # Use 1.0 for thinking mode + "top_p": 0.95, +} + +KIMI_K25_INSTANT_PARAMS = { + "temperature": 0.6, # Use 0.6 for instant mode + "top_p": 0.95, +} class CostLimitExceeded(Exception): @@ -55,19 +86,283 @@ def from_openai(cls, call: Dict[str, Any]) -> "FunctionCall": class LLMResponse: """Response from the LLM.""" text: str = "" + thinking: str = "" # Thinking/reasoning content (for models supporting thinking mode) function_calls: List[FunctionCall] = field(default_factory=list) tokens: Optional[Dict[str, int]] = None model: str = "" finish_reason: str = "" raw: Optional[Dict[str, Any]] = None + cost: float = 0.0 def has_function_calls(self) -> bool: """Check if response contains function calls.""" return len(self.function_calls) > 0 +class ChutesClient: + """LLM Client for Chutes API with Kimi K2.5-TEE. + + Chutes API is OpenAI-compatible, hosted at https://llm.chutes.ai/v1 + Default model: moonshotai/Kimi-K2.5-TEE with thinking mode enabled. + + Environment variable: CHUTES_API_TOKEN + + Kimi K2.5 parameters: + - Thinking mode: temperature=1.0, top_p=0.95 + - Instant mode: temperature=0.6, top_p=0.95 + - Context window: 256K tokens + """ + + def __init__( + self, + model: str = CHUTES_DEFAULT_MODEL, + temperature: Optional[float] = None, + max_tokens: int = 16384, + cost_limit: Optional[float] = None, + enable_thinking: bool = True, + # Legacy params (kept for compatibility) + cache_extended_retention: bool = True, + cache_key: Optional[str] = None, + ): + self.model = model + self.max_tokens = max_tokens + self.cost_limit = cost_limit or float(os.environ.get("LLM_COST_LIMIT", "100.0")) + self.enable_thinking = enable_thinking + + # Set temperature based on thinking mode if not explicitly provided + if temperature is None: + params = KIMI_K25_THINKING_PARAMS if enable_thinking else KIMI_K25_INSTANT_PARAMS + self.temperature = params["temperature"] + else: + self.temperature = temperature + + self._total_cost = 0.0 + self._total_tokens = 0 + self._request_count = 0 + self._input_tokens = 0 + self._output_tokens = 0 + self._cached_tokens = 0 + + # Get API token + self._api_token = os.environ.get("CHUTES_API_TOKEN") + if not self._api_token: + raise LLMError( + "CHUTES_API_TOKEN environment variable not set. 
" + "Get your API token at https://chutes.ai", + code="authentication_error" + ) + + # Import and configure OpenAI client for Chutes API + try: + from openai import OpenAI + self._client = OpenAI( + api_key=self._api_token, + base_url=CHUTES_API_BASE, + ) + except ImportError: + raise ImportError("openai not installed. Run: pip install openai") + + def _build_tools(self, tools: Optional[List[Dict[str, Any]]]) -> Optional[List[Dict[str, Any]]]: + """Build tools in OpenAI format.""" + if not tools: + return None + + result = [] + for tool in tools: + result.append({ + "type": "function", + "function": { + "name": tool["name"], + "description": tool.get("description", ""), + "parameters": tool.get("parameters", {"type": "object", "properties": {}}), + }, + }) + return result + + def _parse_thinking_content(self, text: str) -> Tuple[str, str]: + """Parse thinking content from response. + + Kimi K2.5 can return thinking content in: + 1. ... tags (for some deployments) + 2. reasoning_content field (official API) + + Returns (thinking_content, final_response). + """ + if not text: + return "", "" + + # Check for ... pattern + think_pattern = r"(.*?)" + match = re.search(think_pattern, text, re.DOTALL) + + if match: + thinking = match.group(1).strip() + # Remove the think block from the response + response = re.sub(think_pattern, "", text, flags=re.DOTALL).strip() + return thinking, response + + return "", text + + def chat( + self, + messages: List[Dict[str, Any]], + tools: Optional[List[Dict[str, Any]]] = None, + max_tokens: Optional[int] = None, + extra_body: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = None, + ) -> LLMResponse: + """Send a chat request to Chutes API. + + Args: + messages: List of message dicts with 'role' and 'content' + tools: Optional list of tool definitions + max_tokens: Max tokens to generate (default: self.max_tokens) + extra_body: Additional parameters to pass to the API + temperature: Override temperature (default: self.temperature) + + Returns: + LLMResponse with text, thinking content, and any tool calls + """ + # Check cost limit + if self._total_cost >= self.cost_limit: + raise CostLimitExceeded( + f"Cost limit exceeded: ${self._total_cost:.4f} >= ${self.cost_limit:.4f}", + used=self._total_cost, + limit=self.cost_limit, + ) + + # Use provided temperature or default + temp = temperature if temperature is not None else self.temperature + + # Get appropriate params based on thinking mode + params = KIMI_K25_THINKING_PARAMS if self.enable_thinking else KIMI_K25_INSTANT_PARAMS + + # Build request kwargs + kwargs: Dict[str, Any] = { + "model": self.model, + "messages": messages, + "max_tokens": max_tokens or self.max_tokens, + "temperature": temp, + "top_p": params["top_p"], + } + + if tools: + kwargs["tools"] = self._build_tools(tools) + kwargs["tool_choice"] = "auto" + + # Add extra body params + if extra_body: + kwargs.update(extra_body) + + try: + response = self._client.chat.completions.create(**kwargs) + self._request_count += 1 + except Exception as e: + error_msg = str(e) + if "authentication" in error_msg.lower() or "api_key" in error_msg.lower() or "unauthorized" in error_msg.lower(): + raise LLMError(error_msg, code="authentication_error") + elif "rate" in error_msg.lower() or "limit" in error_msg.lower(): + raise LLMError(error_msg, code="rate_limit") + else: + raise LLMError(error_msg, code="api_error") + + # Parse response + result = LLMResponse(raw=response.model_dump() if hasattr(response, "model_dump") else None) + + # Extract usage 
+ if hasattr(response, "usage") and response.usage: + usage = response.usage + input_tokens = getattr(usage, "prompt_tokens", 0) or 0 + output_tokens = getattr(usage, "completion_tokens", 0) or 0 + cached_tokens = 0 + + # Check for cached tokens + if hasattr(usage, "prompt_tokens_details"): + details = usage.prompt_tokens_details + if details and hasattr(details, "cached_tokens"): + cached_tokens = details.cached_tokens or 0 + + self._input_tokens += input_tokens + self._output_tokens += output_tokens + self._cached_tokens += cached_tokens + self._total_tokens += input_tokens + output_tokens + + result.tokens = { + "input": input_tokens, + "output": output_tokens, + "cached": cached_tokens, + } + + # Estimate cost (Kimi K2.5 pricing via Chutes - approximate) + # $0.60 per million input tokens, $2.50 per million output tokens + input_cost_per_1k = 0.0006 # $0.60 / 1000 + output_cost_per_1k = 0.0025 # $2.50 / 1000 + if result.tokens: + cost = (result.tokens["input"] / 1000 * input_cost_per_1k + + result.tokens["output"] / 1000 * output_cost_per_1k) + self._total_cost += cost + result.cost = cost + + # Extract model + result.model = getattr(response, "model", self.model) + + # Extract choices + if hasattr(response, "choices") and response.choices: + choice = response.choices[0] + message = choice.message + + result.finish_reason = getattr(choice, "finish_reason", "") or "" + raw_text = getattr(message, "content", "") or "" + + # Extract reasoning_content if available (official Kimi API) + if hasattr(message, "reasoning_content") and message.reasoning_content: + result.thinking = message.reasoning_content + result.text = raw_text + elif self.enable_thinking: + # Parse thinking content from tags + result.thinking, result.text = self._parse_thinking_content(raw_text) + else: + result.text = raw_text + + # Extract function calls + tool_calls = getattr(message, "tool_calls", None) + if tool_calls: + for call in tool_calls: + if hasattr(call, "function"): + func = call.function + args_str = getattr(func, "arguments", "{}") + try: + args = json.loads(args_str) if isinstance(args_str, str) else args_str + except json.JSONDecodeError: + args = {"raw": args_str} + + result.function_calls.append(FunctionCall( + id=getattr(call, "id", "") or "", + name=getattr(func, "name", "") or "", + arguments=args if isinstance(args, dict) else {}, + )) + + return result + + def get_stats(self) -> Dict[str, Any]: + """Get usage statistics.""" + return { + "total_tokens": self._total_tokens, + "input_tokens": self._input_tokens, + "output_tokens": self._output_tokens, + "cached_tokens": self._cached_tokens, + "total_cost": self._total_cost, + "request_count": self._request_count, + } + + def close(self): + """Close client.""" + if hasattr(self, "_client"): + self._client.close() + + class LiteLLMClient: - """LLM Client using litellm.""" + """LLM Client using litellm (fallback for non-Chutes providers).""" def __init__( self, @@ -75,11 +370,16 @@ def __init__( temperature: Optional[float] = None, max_tokens: int = 16384, cost_limit: Optional[float] = None, + # Legacy params for compatibility + enable_thinking: bool = False, + cache_extended_retention: bool = True, + cache_key: Optional[str] = None, ): self.model = model self.temperature = temperature self.max_tokens = max_tokens self.cost_limit = cost_limit or float(os.environ.get("LLM_COST_LIMIT", "10.0")) + self.enable_thinking = enable_thinking self._total_cost = 0.0 self._total_tokens = 0 @@ -128,6 +428,7 @@ def chat( tools: Optional[List[Dict[str, Any]]] = None, 
max_tokens: Optional[int] = None, extra_body: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = None, ) -> LLMResponse: """Send a chat request.""" # Check cost limit @@ -138,6 +439,9 @@ def chat( limit=self.cost_limit, ) + # Use provided temperature or default + temp = temperature if temperature is not None else self.temperature + # Build request kwargs: Dict[str, Any] = { "model": self.model, @@ -145,8 +449,8 @@ def chat( "max_tokens": max_tokens or self.max_tokens, } - if self._supports_temperature(self.model) and self.temperature is not None: - kwargs["temperature"] = self.temperature + if self._supports_temperature(self.model) and temp is not None: + kwargs["temperature"] = temp if tools: kwargs["tools"] = self._build_tools(tools) @@ -199,6 +503,7 @@ def chat( try: cost = self._litellm.completion_cost(completion_response=response) self._total_cost += cost + result.cost = cost except Exception: pass # Cost calculation may fail for some models @@ -247,3 +552,46 @@ def get_stats(self) -> Dict[str, Any]: def close(self): """Close client (no-op for litellm).""" pass + + +def get_llm_client( + provider: str = "chutes", + model: Optional[str] = None, + temperature: Optional[float] = None, + max_tokens: int = 16384, + cost_limit: Optional[float] = None, + enable_thinking: bool = True, + **kwargs, +): + """Factory function to get appropriate LLM client based on provider. + + Args: + provider: "chutes" for Chutes API, "openrouter" or others for litellm + model: Model name (default depends on provider) + temperature: Temperature setting (default based on thinking mode) + max_tokens: Max tokens to generate + cost_limit: Cost limit in USD + enable_thinking: Enable thinking mode (for Chutes/Kimi K2.5) + **kwargs: Additional arguments passed to client + + Returns: + ChutesClient or LiteLLMClient instance + """ + if provider == "chutes": + return ChutesClient( + model=model or CHUTES_DEFAULT_MODEL, + temperature=temperature, + max_tokens=max_tokens, + cost_limit=cost_limit, + enable_thinking=enable_thinking, + **kwargs, + ) + else: + return LiteLLMClient( + model=model or "openrouter/anthropic/claude-sonnet-4-20250514", + temperature=temperature, + max_tokens=max_tokens, + cost_limit=cost_limit, + enable_thinking=enable_thinking, + **kwargs, + )
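
For reference, a minimal usage sketch of the new get_llm_client factory and the ChutesClient.chat path added above (not part of the patch). It assumes CHUTES_API_TOKEN is exported, the repository root is on sys.path, and the prompt text and cost_limit value are placeholders.

    # Usage sketch (illustrative): build a client via the factory and run one chat turn.
    from src.llm.client import get_llm_client, CostLimitExceeded

    llm = get_llm_client(provider="chutes", enable_thinking=True, cost_limit=5.0)
    try:
        resp = llm.chat(messages=[{"role": "user", "content": "Summarize the repo layout."}])
        print("thinking:", resp.thinking[:200])  # reasoning recovered from <think> blocks or reasoning_content
        print("answer:", resp.text)
        print("tokens:", resp.tokens, "estimated cost:", resp.cost)
    except CostLimitExceeded as exc:
        print(f"aborting: {exc}")
    finally:
        llm.close()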