Tutorial

Claude Sonnet 4.6 Production Best Practices: Complete Guide

Production-ready best practices for Claude Sonnet 4.6: error handling, rate limiting, prompt optimization, monitoring, and reliability patterns.

February 2026

TL;DR

Production-ready Sonnet 4.6 requires: robust error handling with exponential backoff, prompt caching for cost reduction, structured output validation, comprehensive monitoring, and graceful degradation. This guide covers battle-tested patterns from high-scale deployments.

Error Handling

import anthropic

from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type

_RETRYABLE_ERRORS = (
    anthropic.RateLimitError,
    anthropic.APIConnectionError,
    anthropic.InternalServerError,
)


@retry(
    retry=retry_if_exception_type(_RETRYABLE_ERRORS),
    wait=wait_exponential(multiplier=1, min=2, max=60),
    stop=stop_after_attempt(5),
)
def call_claude(messages: list, max_tokens: int = 4096) -> str:
    """Send *messages* to Claude and return the first content block's text.

    Transient failures (rate limits, dropped connections, 5xx) are retried
    with exponential backoff (2s min, 60s cap, 5 attempts). Client-side
    errors are logged and re-raised immediately: retrying a malformed
    request or bad credentials can never succeed.
    """
    try:
        response = client.messages.create(
            model="claude-sonnet-4-6-20260217",
            max_tokens=max_tokens,
            messages=messages,
        )
    except anthropic.BadRequestError as e:
        # Don't retry - fix the request
        logger.error(f"Bad request: {e}")
        raise
    except anthropic.AuthenticationError as e:
        # Don't retry - fix credentials
        logger.critical(f"Auth failed: {e}")
        raise
    return response.content[0].text

Rate Limiting

import asyncio

from collections import deque

import time

class RateLimiter:
    """Sliding-window rate limiter for async API calls.

    Request timestamps are kept in a deque; ``acquire`` blocks until the
    number of requests in the trailing 60-second window drops below the
    configured limit. The lock serializes acquirers, so waiting callers
    queue up behind the sleeping one - acceptable for an API client.
    """

    def __init__(self, requests_per_minute: int = 1000):
        self.rpm = requests_per_minute
        self.window = deque()  # timestamps of requests within the last minute
        self.lock = asyncio.Lock()

    async def acquire(self):
        """Block until a request slot is free, then claim it."""
        async with self.lock:
            self._prune(time.time())

            if len(self.window) >= self.rpm:
                # Sleep until the oldest timestamp ages out of the window.
                # max(0, ...) guards against a negative sleep if the clock
                # moved between the prune and this computation.
                sleep_time = max(0.0, 60 - (time.time() - self.window[0]))
                await asyncio.sleep(sleep_time)
                # Re-prune after sleeping: the entries we slept past are now
                # expired. Without this the deque could grow beyond rpm.
                self._prune(time.time())

            self.window.append(time.time())

    def _prune(self, now: float) -> None:
        # Drop timestamps older than the 60-second sliding window.
        while self.window and self.window[0] < now - 60:
            self.window.popleft()

# Stay 10% below the 1000 RPM quota so bursts don't trip server-side limits.
limiter = RateLimiter(requests_per_minute=900)


async def rate_limited_call(messages):
    """Wait for a rate-limit slot, then forward *messages* to Claude."""
    await limiter.acquire()
    response = await async_call_claude(messages)
    return response

Prompt Caching

# Static system prompt, marked for prompt caching at the call site
# (see cached_code_review): cached input tokens are billed at a reduced
# rate, quoted here as ~90% cheaper on repeated queries.
SYSTEM_CONTEXT = """

You are a code review assistant. You analyze code for:

  • Security vulnerabilities
  • Performance issues
  • Best practice violations

... (large context)

"""

def cached_code_review(code: str) -> str:
    """Review *code* with a cached system prompt and return the review text.

    The system block carries ``cache_control: {"type": "ephemeral"}`` so the
    large static context is written to the prompt cache on the first call and
    read back (at the cheaper cached-input rate) on subsequent calls.
    """
    response = client.messages.create(
        model="claude-sonnet-4-6-20260217",
        max_tokens=4096,
        system=[{
            "type": "text",
            "text": SYSTEM_CONTEXT,
            "cache_control": {"type": "ephemeral"},  # Cache this
        }],
        messages=[{"role": "user", "content": f"Review this code:\n{code}"}],
    )

    # Log cache performance. The original hasattr() only guarded `usage`
    # itself; the cache counters can also be absent (SDK/API versions
    # without prompt-caching support), so read them with getattr defaults.
    usage = getattr(response, "usage", None)
    if usage is not None:
        cache_hit = getattr(usage, "cache_read_input_tokens", 0)
        cache_miss = getattr(usage, "cache_creation_input_tokens", 0)
        logger.info(f"Cache hit: {cache_hit}, miss: {cache_miss}")

    return response.content[0].text

Structured Output Validation

import json
import re

from pydantic import BaseModel, ValidationError

class CodeReviewResult(BaseModel):
    """Validated shape of a structured code-review response.

    Instantiated from the model's JSON output; pydantic raises
    ValidationError when a field is missing or has the wrong type.
    """

    # One dict per finding; the prompt asks for
    # {"line": int, "type": str, "description": str}.
    issues: list[dict]

    # Overall severity; the prompt constrains it to
    # "low" | "medium" | "high" | "critical" (not enforced here).
    severity: str

    summary: str

    suggestions: list[str]

def get_structured_review(code: str) -> CodeReviewResult:
    """Ask Claude for a JSON-formatted review of *code* and validate it.

    Returns:
        A CodeReviewResult parsed and validated from the model's output.

    Raises:
        ValueError: if no JSON object can be found in the response text.
        json.JSONDecodeError: if the extracted text is not valid JSON.
        ValidationError: if the JSON does not match CodeReviewResult.

    Note: the original version used ``re.search`` without ever importing
    ``re`` - that NameError is fixed by the top-of-file import.
    """
    response = client.messages.create(
        model="claude-sonnet-4-6-20260217",
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": f"""Review this code and respond in JSON format:

{{

"issues": [{{"line": int, "type": str, "description": str}}],

"severity": "low|medium|high|critical",

"summary": "brief summary",

"suggestions": ["suggestion1", "suggestion2"]

}}

Code:

{code}"""
        }]
    )

    # Extract the first {...} span: the model may wrap the JSON in prose.
    text = response.content[0].text
    json_match = re.search(r'\{[\s\S]*\}', text)
    if not json_match:
        raise ValueError("No JSON found in response")

    try:
        data = json.loads(json_match.group())
        return CodeReviewResult(**data)
    except (json.JSONDecodeError, ValidationError) as e:
        logger.error(f"Failed to parse response: {e}")
        raise

Monitoring & Observability

import time

from datadog import statsd

class ClaudeMetrics:
    """Emits per-request Datadog metrics: counts, latency, tokens, cost."""

    # Pricing in USD per million tokens. Named constants instead of the
    # previous inline magic numbers, so a price change is a one-line edit.
    INPUT_COST_PER_MTOK = 3
    OUTPUT_COST_PER_MTOK = 15

    @staticmethod
    def record_request(response, latency: float, success: bool):
        """Record metrics for one completed API call.

        Args:
            response: SDK response; must expose .model, .stop_reason and
                .usage (input_tokens / output_tokens).
            latency: wall-clock duration of the call in seconds.
            success: whether the call returned without raising.
        """
        tags = [
            f"model:{response.model}",
            f"stop_reason:{response.stop_reason}",
            f"success:{success}",
        ]

        statsd.increment("claude.requests", tags=tags)
        statsd.histogram("claude.latency", latency, tags=tags)
        statsd.histogram("claude.input_tokens", response.usage.input_tokens, tags=tags)
        statsd.histogram("claude.output_tokens", response.usage.output_tokens, tags=tags)

        # Cost tracking
        input_cost = response.usage.input_tokens * ClaudeMetrics.INPUT_COST_PER_MTOK / 1_000_000
        output_cost = response.usage.output_tokens * ClaudeMetrics.OUTPUT_COST_PER_MTOK / 1_000_000
        statsd.histogram("claude.cost_usd", input_cost + output_cost, tags=tags)

def monitored_call(messages: list, max_tokens: int = 4096) -> str:
    """Call Claude and record latency/token/cost metrics for the request.

    Args:
        messages: chat messages to send.
        max_tokens: response token cap; the API requires this argument,
            which the original version omitted. Defaulted for
            backward-compatible callers.

    Returns:
        The first content block's text.
    """
    start = time.time()
    success = False
    # Initialize before the try: if create() raises, the original code hit
    # an UnboundLocalError on `response` in the finally clause, masking the
    # real exception.
    response = None
    try:
        response = client.messages.create(
            model="claude-sonnet-4-6-20260217",
            max_tokens=max_tokens,
            messages=messages
        )
        success = True
        return response.content[0].text
    finally:
        # Only emit metrics when a response object exists; failures before
        # that point have no model/stop_reason/usage to record.
        if response is not None:
            ClaudeMetrics.record_request(response, time.time() - start, success)

Graceful Degradation

async def resilient_call(messages: list, fallback_response: str = None):
    """Try each provider in order; return a canned fallback if all fail.

    Args:
        messages: chat messages to send.
        fallback_response: returned when every provider fails. Checked with
            ``is not None`` (the original truthiness check silently skipped
            an intentionally empty-string fallback).

    Raises:
        RuntimeError: when every provider fails and no fallback was given.
            (RuntimeError subclasses Exception, so existing callers that
            caught the original bare Exception are unaffected.)
    """
    providers = [
        ("anthropic", call_anthropic),
        ("bedrock", call_bedrock),
        ("vertex", call_vertex),
    ]

    for provider_name, provider_func in providers:
        try:
            return await provider_func(messages)
        except Exception as e:
            # Broad catch is deliberate: any provider error should trigger
            # failover to the next provider, not abort the request.
            logger.warning(f"{provider_name} failed: {e}")
            continue

    # All providers failed
    if fallback_response is not None:
        logger.error("All providers failed, using fallback")
        return fallback_response

    raise RuntimeError("All AI providers unavailable")

Token Management

import tiktoken

def estimate_tokens(text: str) -> int:
    """Rough token count for *text*.

    NOTE(review): uses tiktoken's cl100k_base as a stand-in - Claude's
    actual tokenizer differs, so treat the result as an estimate only.
    """
    # Claude uses similar tokenization to GPT
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def truncate_to_limit(text: str, max_tokens: int = 180000) -> str:
    """Return *text* unchanged if it fits within max_tokens, else cut it.

    Truncation happens at a token boundary (re-decoded from the token ids)
    and is logged as a warning so silent data loss is visible in logs.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    token_ids = encoding.encode(text)
    total = len(token_ids)

    if total <= max_tokens:
        return text

    clipped = encoding.decode(token_ids[:max_tokens])
    logger.warning(f"Truncated input from {total} to {max_tokens} tokens")
    return clipped

Production Checklist

• [ ] Implement exponential backoff for retries
• [ ] Add rate limiting with headroom
• [ ] Enable prompt caching for static content
• [ ] Validate structured outputs with Pydantic
• [ ] Set up comprehensive monitoring (latency, tokens, cost)
• [ ] Implement multi-provider fallback
• [ ] Add request timeouts
• [ ] Log all requests for debugging
• [ ] Implement circuit breakers for cascading failure prevention
• [ ] Set up alerting for error rate spikes

Conclusion

Production Sonnet 4.6 deployments require defensive programming at every layer. The patterns in this guide—battle-tested at scale—ensure reliability, cost efficiency, and observability. Start with error handling and monitoring; add caching and fallbacks as you scale.

Ready to Experience Claude Sonnet 4.6?

Try Now