Claude Sonnet 4.6 Production Best Practices: Complete Guide
Production-ready best practices for Claude Sonnet 4.6: error handling, rate limiting, prompt optimization, monitoring, and reliability patterns.
TL;DR
Production-ready Sonnet 4.6 requires: robust error handling with exponential backoff, prompt caching for cost reduction, structured output validation, comprehensive monitoring, and graceful degradation. This guide covers battle-tested patterns from high-scale deployments.
Error Handling
import anthropic
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type


@retry(
    retry=retry_if_exception_type((
        anthropic.RateLimitError,
        anthropic.APIConnectionError,
        anthropic.InternalServerError
    )),
    wait=wait_exponential(multiplier=1, min=2, max=60),
    stop=stop_after_attempt(5)
)
def call_claude(messages: list, max_tokens: int = 4096) -> str:
    """Call Claude with automatic retries on transient failures.

    Retries with exponential backoff (2s min, 60s cap, 5 attempts) only
    on transient errors: rate limits, connection failures, and 5xx
    server errors. Client-side errors are logged and re-raised
    immediately since retrying cannot fix them.

    Args:
        messages: Conversation messages in the Anthropic Messages format.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The text of the first content block in the response.

    Raises:
        anthropic.BadRequestError: Malformed request — not retried.
        anthropic.AuthenticationError: Bad credentials — not retried.
    """
    try:
        response = client.messages.create(
            model="claude-sonnet-4-6-20260217",
            max_tokens=max_tokens,
            messages=messages
        )
        return response.content[0].text
    except anthropic.BadRequestError as e:
        # Don't retry - fix the request
        logger.error(f"Bad request: {e}")
        raise
    except anthropic.AuthenticationError as e:
        # Don't retry - fix credentials
        logger.critical(f"Auth failed: {e}")
        raise
Rate Limiting
import asyncio
import time
from collections import deque


class RateLimiter:
    """Client-side sliding-window rate limiter for async callers.

    Keeps the timestamps of the last minute's requests in a deque and
    sleeps (while holding the lock, so later callers queue behind the
    sleeper) whenever the window is full.
    """

    def __init__(self, requests_per_minute: int = 1000):
        self.rpm = requests_per_minute
        self.window = deque()  # timestamps of requests made in the last 60s
        self.lock = asyncio.Lock()

    async def acquire(self):
        """Block until a request slot is available, then claim it."""
        async with self.lock:
            now = time.time()
            # Remove requests older than 1 minute
            while self.window and self.window[0] < now - 60:
                self.window.popleft()
            if len(self.window) >= self.rpm:
                # Sleep exactly until the oldest request leaves the window.
                sleep_time = 60 - (now - self.window[0])
                await asyncio.sleep(sleep_time)
                # Bug fix: re-purge after sleeping — the entries that have
                # now expired were previously left in place, letting the
                # window grow beyond self.rpm.
                now = time.time()
                while self.window and self.window[0] < now - 60:
                    self.window.popleft()
            self.window.append(time.time())


limiter = RateLimiter(requests_per_minute=900)  # Leave 10% headroom


async def rate_limited_call(messages):
    """Acquire a rate-limit slot, then forward to the async Claude call."""
    await limiter.acquire()
    return await async_call_claude(messages)
Prompt Caching
# Static context caching - saves 90% on repeated queries
SYSTEM_CONTEXT = """
You are a code review assistant. You analyze code for:
- Security vulnerabilities
- Performance issues
- Best practice violations
... (large context)
"""


def cached_code_review(code: str) -> str:
    """Review *code* using a cached static system prompt.

    The large system prompt is marked with ``cache_control`` so repeated
    calls reuse the cached prefix instead of re-processing it on every
    request. Cache hit/miss token counts are logged when the response
    exposes usage data.

    Args:
        code: Source code to review.

    Returns:
        The review text from the model's first content block.
    """
    response = client.messages.create(
        model="claude-sonnet-4-6-20260217",
        max_tokens=4096,
        system=[{
            "type": "text",
            "text": SYSTEM_CONTEXT,
            "cache_control": {"type": "ephemeral"}  # Cache this
        }],
        messages=[{"role": "user", "content": f"Review this code:\n{code}"}]
    )
    # Log cache performance
    if hasattr(response, 'usage'):
        cache_hit = response.usage.cache_read_input_tokens
        cache_miss = response.usage.cache_creation_input_tokens
        logger.info(f"Cache hit: {cache_hit}, miss: {cache_miss}")
    return response.content[0].text
Structured Output Validation
import json
import re

from pydantic import BaseModel, ValidationError


class CodeReviewResult(BaseModel):
    # Schema for a structured code-review response.
    issues: list[dict]
    severity: str
    summary: str
    suggestions: list[str]


def get_structured_review(code: str) -> CodeReviewResult:
    """Ask the model for a JSON-formatted review and validate it.

    The prompt pins the expected JSON shape; the response is then
    extracted with a greedy brace match and validated against
    CodeReviewResult via Pydantic.

    Args:
        code: Source code to review.

    Returns:
        A validated CodeReviewResult.

    Raises:
        ValueError: No JSON object found in the model's response.
        json.JSONDecodeError: The extracted text is not valid JSON.
        pydantic.ValidationError: The JSON does not match the schema.
    """
    response = client.messages.create(
        model="claude-sonnet-4-6-20260217",
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": f"""Review this code and respond in JSON format:
{{
"issues": [{{"line": int, "type": str, "description": str}}],
"severity": "low|medium|high|critical",
"summary": "brief summary",
"suggestions": ["suggestion1", "suggestion2"]
}}
Code:
{code}"""
        }]
    )
    # Extract JSON from response. Bug fix: `re` was used here without
    # ever being imported, which raised NameError at runtime.
    text = response.content[0].text
    json_match = re.search(r'\{[\s\S]*\}', text)
    if not json_match:
        raise ValueError("No JSON found in response")
    try:
        data = json.loads(json_match.group())
        return CodeReviewResult(**data)
    except (json.JSONDecodeError, ValidationError) as e:
        logger.error(f"Failed to parse response: {e}")
        raise
Monitoring & Observability
import time

from datadog import statsd

# Sonnet pricing in USD per million tokens — update if the price changes.
_INPUT_COST_PER_MTOK = 3
_OUTPUT_COST_PER_MTOK = 15


class ClaudeMetrics:
    """Emits per-request count, latency, token, and cost metrics to Datadog."""

    @staticmethod
    def record_request(response, latency: float, success: bool):
        """Record one API call's metrics.

        Args:
            response: Anthropic Messages API response; must expose
                ``.model``, ``.stop_reason``, and ``.usage`` with
                ``input_tokens``/``output_tokens``.
            latency: Wall-clock duration of the call in seconds.
            success: Whether the call completed without raising.
        """
        tags = [
            f"model:{response.model}",
            f"stop_reason:{response.stop_reason}",
            f"success:{success}"
        ]
        statsd.increment("claude.requests", tags=tags)
        statsd.histogram("claude.latency", latency, tags=tags)
        statsd.histogram("claude.input_tokens", response.usage.input_tokens, tags=tags)
        statsd.histogram("claude.output_tokens", response.usage.output_tokens, tags=tags)
        # Cost tracking
        input_cost = response.usage.input_tokens * _INPUT_COST_PER_MTOK / 1_000_000
        output_cost = response.usage.output_tokens * _OUTPUT_COST_PER_MTOK / 1_000_000
        statsd.histogram("claude.cost_usd", input_cost + output_cost, tags=tags)
def monitored_call(messages: list) -> str:
    """Call Claude and record latency/token/cost metrics for the attempt.

    Args:
        messages: Conversation messages in the Anthropic Messages format.

    Returns:
        The text of the first content block.
    """
    start = time.time()
    success = False
    # Bug fix: `response` must exist before the try block — if create()
    # raises, the original `finally` hit an unbound-local NameError that
    # masked the real API exception.
    response = None
    try:
        response = client.messages.create(
            model="claude-sonnet-4-6-20260217",
            # max_tokens is a required parameter of the Messages API;
            # the original request would be rejected without it.
            max_tokens=4096,
            messages=messages
        )
        success = True
        return response.content[0].text
    finally:
        # Only emit metrics when a response exists; failed calls have
        # nothing to read usage/model from.
        if response is not None:
            ClaudeMetrics.record_request(response, time.time() - start, success)
Graceful Degradation
async def resilient_call(messages: list, fallback_response: str = None):
    """Try each provider in order; return the first success or a fallback.

    Args:
        messages: Conversation messages to send.
        fallback_response: Canned response returned when every provider
            fails. If None, a RuntimeError is raised instead.

    Returns:
        The first successful provider response, or ``fallback_response``.

    Raises:
        RuntimeError: All providers failed and no fallback was supplied.
    """
    providers = [
        ("anthropic", call_anthropic),
        ("bedrock", call_bedrock),
        ("vertex", call_vertex)
    ]
    for provider_name, provider_func in providers:
        try:
            return await provider_func(messages)
        except Exception as e:
            logger.warning(f"{provider_name} failed: {e}")
            continue
    # All providers failed.
    # Bug fix: use an identity check — the original truthiness test
    # silently discarded an explicitly-passed empty-string fallback.
    if fallback_response is not None:
        logger.error("All providers failed, using fallback")
        return fallback_response
    # RuntimeError instead of bare Exception: callers catching Exception
    # still work, but the error type is no longer a catch-all.
    raise RuntimeError("All AI providers unavailable")
Token Management
import tiktoken


def estimate_tokens(text: str) -> int:
    """Return an approximate token count for *text*.

    NOTE(review): cl100k_base is a GPT tokenizer, not Claude's — counts
    are an estimate only; confirm against the API's reported usage.
    """
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))


def truncate_to_limit(text: str, max_tokens: int = 180000) -> str:
    """Truncate *text* so its estimated token count fits *max_tokens*.

    Args:
        text: Input text to bound.
        max_tokens: Maximum estimated tokens to keep.

    Returns:
        The original text if it fits, otherwise a decoded prefix of at
        most *max_tokens* tokens (truncation is logged).
    """
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    if len(tokens) <= max_tokens:
        return text
    truncated = enc.decode(tokens[:max_tokens])
    logger.warning(f"Truncated input from {len(tokens)} to {max_tokens} tokens")
    return truncated
Production Checklist
- [ ] Implement exponential backoff for retries
- [ ] Add rate limiting with headroom
- [ ] Enable prompt caching for static content
- [ ] Validate structured outputs with Pydantic
- [ ] Set up comprehensive monitoring (latency, tokens, cost)
- [ ] Implement multi-provider fallback
- [ ] Add request timeouts
- [ ] Log all requests for debugging
- [ ] Implement circuit breakers for cascading failure prevention
- [ ] Set up alerting for error rate spikes
Conclusion
Production Sonnet 4.6 deployments require defensive programming at every layer. The patterns in this guide—battle-tested at scale—ensure reliability, cost efficiency, and observability. Start with error handling and monitoring; add caching and fallbacks as you scale.