Tutorial

Claude Sonnet 4.6 Production Best Practices: Complete Guide

Production-ready best practices for Claude Sonnet 4.6: error handling, rate limiting, prompt optimization, monitoring, and reliability patterns.

February 2026

TL;DR

Production-ready Sonnet 4.6 requires: robust error handling with exponential backoff, prompt caching for cost reduction, structured output validation, comprehensive monitoring, and graceful degradation. This guide covers battle-tested patterns from high-scale deployments.

Error Handling

import anthropic

from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type

_RETRYABLE_ERRORS = (
    anthropic.RateLimitError,
    anthropic.APIConnectionError,
    anthropic.InternalServerError,
)


@retry(
    retry=retry_if_exception_type(_RETRYABLE_ERRORS),
    wait=wait_exponential(multiplier=1, min=2, max=60),
    stop=stop_after_attempt(5),
)
def call_claude(messages: list, max_tokens: int = 4096) -> str:
    """Send *messages* to Claude and return the first content block's text.

    Transient failures (rate limits, dropped connections, 5xx) are retried
    with exponential backoff (2s min, 60s cap, 5 attempts). Client-side
    errors are logged and re-raised immediately: retrying a malformed
    request or bad credentials can never succeed.
    """
    try:
        response = client.messages.create(
            model="claude-sonnet-4-6-20260217",
            max_tokens=max_tokens,
            messages=messages,
        )
    except anthropic.BadRequestError as e:
        # Don't retry - fix the request
        logger.error(f"Bad request: {e}")
        raise
    except anthropic.AuthenticationError as e:
        # Don't retry - fix credentials
        logger.critical(f"Auth failed: {e}")
        raise
    return response.content[0].text

Rate Limiting

import asyncio

from collections import deque

import time

class RateLimiter:
    """Sliding-window rate limiter for async API calls.

    Request timestamps are kept in a deque; ``acquire`` blocks until the
    number of requests in the trailing 60-second window drops below the
    configured limit. The lock serializes acquirers, so waiting callers
    queue up behind the sleeping one - acceptable for an API client.
    """

    def __init__(self, requests_per_minute: int = 1000):
        self.rpm = requests_per_minute
        self.window = deque()  # timestamps of requests within the last minute
        self.lock = asyncio.Lock()

    async def acquire(self):
        """Block until a request slot is free, then claim it."""
        async with self.lock:
            self._prune(time.time())

            if len(self.window) >= self.rpm:
                # Sleep until the oldest timestamp ages out of the window.
                # max(0, ...) guards against a negative sleep if the clock
                # moved between the prune and this computation.
                sleep_time = max(0.0, 60 - (time.time() - self.window[0]))
                await asyncio.sleep(sleep_time)
                # Re-prune after sleeping: the entries we slept past are now
                # expired. Without this the deque could grow beyond rpm.
                self._prune(time.time())

            self.window.append(time.time())

    def _prune(self, now: float) -> None:
        # Drop timestamps older than the 60-second sliding window.
        while self.window and self.window[0] < now - 60:
            self.window.popleft()

# Stay 10% below the 1000 RPM quota so bursts don't trip server-side limits.
limiter = RateLimiter(requests_per_minute=900)


async def rate_limited_call(messages):
    """Wait for a rate-limit slot, then forward *messages* to Claude."""
    await limiter.acquire()
    response = await async_call_claude(messages)
    return response

Prompt Caching

# Static system prompt, marked for prompt caching at the call site
# (see cached_code_review): cached input tokens are billed at a reduced
# rate, quoted here as ~90% cheaper on repeated queries.
SYSTEM_CONTEXT = """

You are a code review assistant. You analyze code for:

  • Security vulnerabilities
  • Performance issues
  • Best practice violations

... (large context)

"""

def cached_code_review(code: str) -> str:
    """Review *code* with a cached system prompt and return the review text.

    The system block carries ``cache_control: {"type": "ephemeral"}`` so the
    large static context is written to the prompt cache on the first call and
    read back (at the cheaper cached-input rate) on subsequent calls.
    """
    response = client.messages.create(
        model="claude-sonnet-4-6-20260217",
        max_tokens=4096,
        system=[{
            "type": "text",
            "text": SYSTEM_CONTEXT,
            "cache_control": {"type": "ephemeral"},  # Cache this
        }],
        messages=[{"role": "user", "content": f"Review this code:\n{code}"}],
    )

    # Log cache performance. The original hasattr() only guarded `usage`
    # itself; the cache counters can also be absent (SDK/API versions
    # without prompt-caching support), so read them with getattr defaults.
    usage = getattr(response, "usage", None)
    if usage is not None:
        cache_hit = getattr(usage, "cache_read_input_tokens", 0)
        cache_miss = getattr(usage, "cache_creation_input_tokens", 0)
        logger.info(f"Cache hit: {cache_hit}, miss: {cache_miss}")

    return response.content[0].text

Structured Output Validation

import json
import re

from pydantic import BaseModel, ValidationError

class CodeReviewResult(BaseModel):
    """Validated shape of a structured code-review response.

    Instantiated from the model's JSON output; pydantic raises
    ValidationError when a field is missing or has the wrong type.
    """

    # One dict per finding; the prompt asks for
    # {"line": int, "type": str, "description": str}.
    issues: list[dict]

    # Overall severity; the prompt constrains it to
    # "low" | "medium" | "high" | "critical" (not enforced here).
    severity: str

    summary: str

    suggestions: list[str]

def get_structured_review(code: str) -> CodeReviewResult:
    """Ask Claude for a JSON-formatted review of *code* and validate it.

    Returns:
        A CodeReviewResult parsed and validated from the model's output.

    Raises:
        ValueError: if no JSON object can be found in the response text.
        json.JSONDecodeError: if the extracted text is not valid JSON.
        ValidationError: if the JSON does not match CodeReviewResult.

    Note: the original version used ``re.search`` without ever importing
    ``re`` - that NameError is fixed by the top-of-file import.
    """
    response = client.messages.create(
        model="claude-sonnet-4-6-20260217",
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": f"""Review this code and respond in JSON format:

{{

"issues": [{{"line": int, "type": str, "description": str}}],

"severity": "low|medium|high|critical",

"summary": "brief summary",

"suggestions": ["suggestion1", "suggestion2"]

}}

Code:

{code}"""
        }]
    )

    # Extract the first {...} span: the model may wrap the JSON in prose.
    text = response.content[0].text
    json_match = re.search(r'\{[\s\S]*\}', text)
    if not json_match:
        raise ValueError("No JSON found in response")

    try:
        data = json.loads(json_match.group())
        return CodeReviewResult(**data)
    except (json.JSONDecodeError, ValidationError) as e:
        logger.error(f"Failed to parse response: {e}")
        raise

Monitoring & Observability

import time

from datadog import statsd

class ClaudeMetrics:
    """Emits per-request Datadog metrics: counts, latency, tokens, cost."""

    # Pricing in USD per million tokens. Named constants instead of the
    # previous inline magic numbers, so a price change is a one-line edit.
    INPUT_COST_PER_MTOK = 3
    OUTPUT_COST_PER_MTOK = 15

    @staticmethod
    def record_request(response, latency: float, success: bool):
        """Record metrics for one completed API call.

        Args:
            response: SDK response; must expose .model, .stop_reason and
                .usage (input_tokens / output_tokens).
            latency: wall-clock duration of the call in seconds.
            success: whether the call returned without raising.
        """
        tags = [
            f"model:{response.model}",
            f"stop_reason:{response.stop_reason}",
            f"success:{success}",
        ]

        statsd.increment("claude.requests", tags=tags)
        statsd.histogram("claude.latency", latency, tags=tags)
        statsd.histogram("claude.input_tokens", response.usage.input_tokens, tags=tags)
        statsd.histogram("claude.output_tokens", response.usage.output_tokens, tags=tags)

        # Cost tracking
        input_cost = response.usage.input_tokens * ClaudeMetrics.INPUT_COST_PER_MTOK / 1_000_000
        output_cost = response.usage.output_tokens * ClaudeMetrics.OUTPUT_COST_PER_MTOK / 1_000_000
        statsd.histogram("claude.cost_usd", input_cost + output_cost, tags=tags)

def monitored_call(messages: list, max_tokens: int = 4096) -> str:
    """Call Claude and record latency/token/cost metrics for the request.

    Args:
        messages: chat messages to send.
        max_tokens: response token cap; the API requires this argument,
            which the original version omitted. Defaulted for
            backward-compatible callers.

    Returns:
        The first content block's text.
    """
    start = time.time()
    success = False
    # Initialize before the try: if create() raises, the original code hit
    # an UnboundLocalError on `response` in the finally clause, masking the
    # real exception.
    response = None
    try:
        response = client.messages.create(
            model="claude-sonnet-4-6-20260217",
            max_tokens=max_tokens,
            messages=messages
        )
        success = True
        return response.content[0].text
    finally:
        # Only emit metrics when a response object exists; failures before
        # that point have no model/stop_reason/usage to record.
        if response is not None:
            ClaudeMetrics.record_request(response, time.time() - start, success)

Graceful Degradation

async def resilient_call(messages: list, fallback_response: str = None):
    """Try each provider in order; return a canned fallback if all fail.

    Args:
        messages: chat messages to send.
        fallback_response: returned when every provider fails. Checked with
            ``is not None`` (the original truthiness check silently skipped
            an intentionally empty-string fallback).

    Raises:
        RuntimeError: when every provider fails and no fallback was given.
            (RuntimeError subclasses Exception, so existing callers that
            caught the original bare Exception are unaffected.)
    """
    providers = [
        ("anthropic", call_anthropic),
        ("bedrock", call_bedrock),
        ("vertex", call_vertex),
    ]

    for provider_name, provider_func in providers:
        try:
            return await provider_func(messages)
        except Exception as e:
            # Broad catch is deliberate: any provider error should trigger
            # failover to the next provider, not abort the request.
            logger.warning(f"{provider_name} failed: {e}")
            continue

    # All providers failed
    if fallback_response is not None:
        logger.error("All providers failed, using fallback")
        return fallback_response

    raise RuntimeError("All AI providers unavailable")

Token Management

import tiktoken

def estimate_tokens(text: str) -> int:
    """Rough token count for *text*.

    NOTE(review): uses tiktoken's cl100k_base as a stand-in - Claude's
    actual tokenizer differs, so treat the result as an estimate only.
    """
    # Claude uses similar tokenization to GPT
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def truncate_to_limit(text: str, max_tokens: int = 180000) -> str:
    """Return *text* unchanged if it fits within max_tokens, else cut it.

    Truncation happens at a token boundary (re-decoded from the token ids)
    and is logged as a warning so silent data loss is visible in logs.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    token_ids = encoding.encode(text)
    total = len(token_ids)

    if total <= max_tokens:
        return text

    clipped = encoding.decode(token_ids[:max_tokens])
    logger.warning(f"Truncated input from {total} to {max_tokens} tokens")
    return clipped

Production Checklist

• [ ] Implement exponential backoff for retries
• [ ] Add rate limiting with headroom
• [ ] Enable prompt caching for static content
• [ ] Validate structured outputs with Pydantic
• [ ] Set up comprehensive monitoring (latency, tokens, cost)
• [ ] Implement multi-provider fallback
• [ ] Add request timeouts
• [ ] Log all requests for debugging
• [ ] Implement circuit breakers for cascading failure prevention
• [ ] Set up alerting for error rate spikes

Conclusion

Production Sonnet 4.6 deployments require defensive programming at every layer. The patterns in this guide—battle-tested at scale—ensure reliability, cost efficiency, and observability. Start with error handling and monitoring; add caching and fallbacks as you scale.

Ready to Experience Claude Sonnet 4.6?

Try Now