Cost Optimization Strategies
Optimizing API usage costs while maintaining performance and quality is crucial for scalable AI applications. This guide provides comprehensive strategies for reducing costs through intelligent caching, efficient request patterns, and smart resource management.
Smart Caching System
```python
import hashlib
import json
import time
from typing import Optional, Dict, Any

from openai import OpenAI


class AIResponseCache:
    def __init__(self, ttl_seconds=3600):
        self.cache = {}
        self.ttl = ttl_seconds

    def _generate_key(self, model, messages, **params):
        """Generate a deterministic cache key for a request."""
        # Build a standardized representation of the request
        cache_data = {
            "model": model,
            "messages": messages,
            "params": {k: v for k, v in params.items()
                       if k not in ['stream', 'user']}  # Exclude parameters that don't affect results
        }
        cache_str = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
        return hashlib.md5(cache_str.encode()).hexdigest()

    def get(self, model, messages, **params) -> Optional[Dict[Any, Any]]:
        """Return a cached response, or None if missing or expired."""
        key = self._generate_key(model, messages, **params)
        if key in self.cache:
            cached_data, timestamp = self.cache[key]
            # Check whether the entry has expired
            if time.time() - timestamp < self.ttl:
                print("✅ Cache hit")
                return cached_data
            else:
                # Remove the expired entry
                del self.cache[key]
        return None

    def set(self, model, messages, response, **params):
        """Cache a response."""
        key = self._generate_key(model, messages, **params)
        self.cache[key] = (response, time.time())

    def clear_expired(self):
        """Remove all expired entries and return how many were removed."""
        current_time = time.time()
        expired_keys = [
            key for key, (_, timestamp) in self.cache.items()
            if current_time - timestamp >= self.ttl
        ]
        for key in expired_keys:
            del self.cache[key]
        return len(expired_keys)

    def get_stats(self):
        """Get cache statistics."""
        return {
            "total_entries": len(self.cache),
            # Rough estimate based on the string representation of the cache
            "memory_usage_kb": len(str(self.cache)) / 1024,
            # Age in seconds of the oldest entry
            "oldest_entry_age": max([time.time() - ts for _, ts in self.cache.values()], default=0)
        }
# AI client with an integrated cache
class CachedAIClient:
    def __init__(self, api_key, base_url):
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.cache = AIResponseCache(ttl_seconds=1800)  # 30-minute cache
        self.stats = {"cache_hits": 0, "cache_misses": 0, "total_savings": 0.0}

    def chat(self, model, messages, use_cache=True, **kwargs):
        """Chat interface with caching."""
        # Try the cache first
        if use_cache:
            cached_response = self.cache.get(model, messages, **kwargs)
            if cached_response:
                self.stats["cache_hits"] += 1
                # Estimate cost savings (rough approximation: ~4 characters per token)
                estimated_tokens = sum(len(msg.get("content", "")) for msg in messages) / 4
                self.stats["total_savings"] += (estimated_tokens / 1000) * 0.0015  # approximate price per 1K tokens
                return cached_response

        # Call the actual API
        self.stats["cache_misses"] += 1
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs
        )

        # Convert to a plain dictionary so the response can be cached
        response_dict = {
            "id": response.id,
            "choices": [{
                "index": choice.index,
                "message": {
                    "role": choice.message.role,
                    "content": choice.message.content
                },
                "finish_reason": choice.finish_reason
            } for choice in response.choices],
            "usage": {
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens,
                "total_tokens": response.usage.total_tokens
            } if response.usage else None
        }

        # Cache the response
        if use_cache:
            self.cache.set(model, messages, response_dict, **kwargs)

        return response_dict

    def get_cache_stats(self):
        """Get comprehensive cache statistics."""
        cache_stats = self.cache.get_stats()
        total_requests = self.stats["cache_hits"] + self.stats["cache_misses"]
        hit_rate = (self.stats["cache_hits"] / total_requests * 100) if total_requests else 0.0
        return {
            **cache_stats,
            **self.stats,
            "cache_hit_rate": f"{hit_rate:.1f}%"
        }
# Usage example
cached_client = CachedAIClient("your-api-key", "https://ai.machinefi.com/v1")

# First call
response1 = cached_client.chat(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What is Python?"}]
)

# Second identical call (will hit the cache)
response2 = cached_client.chat(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What is Python?"}]
)

print(cached_client.get_cache_stats())
```

Advanced Caching Strategies
Semantic Similarity Caching
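The exact-match cache above only helps when two requests are byte-for-byte identical. A semantic cache can also reuse answers for questions that are merely phrased differently by comparing embedding vectors. The sketch below is a minimal illustration, assuming an embeddings endpoint compatible with the OpenAI `embeddings.create` API; the `SemanticCache` class, the `text-embedding-3-small` model choice, and the 0.92 similarity threshold are illustrative placeholders rather than part of the platform's API.

```python
import numpy as np
from openai import OpenAI

class SemanticCache:
    """Sketch: reuse cached answers for semantically similar prompts."""

    def __init__(self, client: OpenAI, embedding_model="text-embedding-3-small",
                 similarity_threshold=0.92):
        self.client = client
        self.embedding_model = embedding_model
        self.similarity_threshold = similarity_threshold
        self.entries = []  # list of (embedding, cached response) pairs

    def _embed(self, text):
        # Embedding calls are far cheaper than chat completions
        result = self.client.embeddings.create(model=self.embedding_model, input=text)
        return np.array(result.data[0].embedding)

    def get(self, prompt):
        """Return a cached response whose prompt is similar enough, else None."""
        if not self.entries:
            return None
        query = self._embed(prompt)
        for embedding, response in self.entries:
            similarity = float(np.dot(query, embedding) /
                               (np.linalg.norm(query) * np.linalg.norm(embedding)))
            if similarity >= self.similarity_threshold:
                return response
        return None

    def set(self, prompt, response):
        self.entries.append((self._embed(prompt), response))
```

In production you would store the embeddings in a vector index instead of scanning a Python list, and reuse the query embedding between `get` and `set` so you do not pay for it twice.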
Intelligent Token Management
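A common building block here is a token budgeter that counts prompt tokens before a request is sent and trims the oldest conversation turns until the prompt fits a limit. The sketch below assumes the `tiktoken` package is installed; the 3,000-token budget is a placeholder.

```python
import tiktoken

def count_tokens(text, model="gpt-3.5-turbo"):
    """Count tokens with tiktoken, falling back to a rough 4-characters-per-token estimate."""
    try:
        encoding = tiktoken.encoding_for_model(model)
        return len(encoding.encode(text))
    except Exception:
        return len(text) // 4

def trim_history(messages, max_prompt_tokens=3000, model="gpt-3.5-turbo"):
    """Drop the oldest non-system messages until the prompt fits the token budget."""
    system_msgs = [m for m in messages if m["role"] == "system"]
    dialogue = [m for m in messages if m["role"] != "system"]

    def total(msgs):
        return sum(count_tokens(m.get("content", ""), model) for m in msgs)

    while dialogue and total(system_msgs + dialogue) > max_prompt_tokens:
        dialogue.pop(0)  # remove the oldest turn first
    return system_msgs + dialogue
```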
Prompt Optimization for Cost Efficiency
Prompt Compression Techniques
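As a simple illustration, the helper below collapses redundant whitespace and strips a few filler phrases before a prompt is sent; the filler list is only an example, and heavier techniques (summarizing long context, deduplicating retrieved passages) trade some quality for larger savings.

```python
import re

# Example filler phrases that rarely change the answer; tune this list for your prompts
FILLER_PHRASES = [
    "please kindly", "i would like you to", "in order to", "it should be noted that",
]

def compress_prompt(prompt: str) -> str:
    """Lightweight prompt compression: collapse whitespace and drop filler phrases."""
    compressed = re.sub(r"\s+", " ", prompt).strip()
    for phrase in FILLER_PHRASES:
        compressed = re.sub(re.escape(phrase), "", compressed, flags=re.IGNORECASE)
    return re.sub(r"\s{2,}", " ", compressed).strip()

original = "Please kindly explain, in order to help me understand,   what a Python decorator is."
shortened = compress_prompt(original)
print(f"{len(original)} -> {len(shortened)} characters")
```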
Batching and Bulk Operations
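Many small, related questions can be combined into a single request so the fixed instruction overhead is paid once rather than per question. A minimal sketch, reusing the `CachedAIClient` defined above and a numbered-answer convention for splitting the reply; the per-question token cap is a placeholder.

```python
def batch_questions(client, questions, model="gpt-3.5-turbo"):
    """Answer several short questions in one request instead of one request each."""
    numbered = "\n".join(f"{i + 1}. {q}" for i, q in enumerate(questions))
    prompt = ("Answer each question concisely. "
              "Reply with one numbered line per question.\n\n" + numbered)
    response = client.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=80 * len(questions),  # cap output at roughly 80 tokens per question
    )
    return response["choices"][0]["message"]["content"]

answers = batch_questions(cached_client, [
    "What is a Python list comprehension?",
    "What does the 'yield' keyword do?",
    "What is the GIL?",
])
print(answers)
```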
Cost Monitoring and Analytics
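A lightweight cost tracker can be built on the `usage` data returned with each response. The per-1K-token prices below are placeholders; substitute the current rates for the models you actually use.

```python
from collections import defaultdict

# Placeholder prices in USD per 1K tokens; replace with your provider's current rates
PRICING = {
    "gpt-3.5-turbo": {"prompt": 0.0005, "completion": 0.0015},
    "gpt-4": {"prompt": 0.03, "completion": 0.06},
}

class CostTracker:
    def __init__(self):
        self.totals = defaultdict(lambda: {"requests": 0, "tokens": 0, "cost_usd": 0.0})

    def record(self, model, usage):
        """Record one response's usage dict (prompt_tokens / completion_tokens / total_tokens)."""
        if not usage:
            return
        prices = PRICING.get(model, {"prompt": 0.0, "completion": 0.0})
        cost = (usage["prompt_tokens"] / 1000 * prices["prompt"] +
                usage["completion_tokens"] / 1000 * prices["completion"])
        entry = self.totals[model]
        entry["requests"] += 1
        entry["tokens"] += usage["total_tokens"]
        entry["cost_usd"] += cost

    def report(self):
        return dict(self.totals)

tracker = CostTracker()
tracker.record("gpt-3.5-turbo", response1["usage"])
print(tracker.report())
```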
Cost Optimization Best Practices
Implement Intelligent Caching: Cache responses for repeated or similar queries
Use Appropriate Models: Don't use GPT-4 for tasks that GPT-3.5-turbo can handle
Optimize Prompt Length: Remove unnecessary words and use compression techniques
Batch Similar Requests: Group related queries into single requests when possible
Set Token Limits: Use appropriate max_tokens settings to avoid over-generation
Monitor Usage Patterns: Track costs and identify optimization opportunities
Implement Rate Limiting: Control request frequency to manage costs
Use Streaming Wisely: Stream for user experience, but be aware of potential increased costs
Leverage Model-Specific Features: Use each model's strengths efficiently
Implement Budget Controls: Set daily/monthly limits and automated alerts (a minimal budget-guard sketch follows this list)
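
To make the last item concrete, here is a minimal sketch of a daily budget guard; the daily limit, alert threshold, and print-based alert are all placeholders, and in practice you would persist the spend counter and wire alerts into your monitoring stack.

```python
import datetime

class BudgetGuard:
    """Warn near a daily spending limit and block requests once it is exceeded."""

    def __init__(self, daily_limit_usd=10.0, alert_ratio=0.8):
        self.daily_limit = daily_limit_usd
        self.alert_ratio = alert_ratio
        self.spent_today = 0.0
        self.day = datetime.date.today()

    def _roll_over(self):
        # Reset the counter at the start of a new day
        today = datetime.date.today()
        if today != self.day:
            self.day = today
            self.spent_today = 0.0

    def add_cost(self, cost_usd):
        """Record spend; warn near the limit and raise once it is exceeded."""
        self._roll_over()
        self.spent_today += cost_usd
        if self.spent_today >= self.daily_limit:
            raise RuntimeError(f"Daily budget of ${self.daily_limit:.2f} exceeded")
        if self.spent_today >= self.daily_limit * self.alert_ratio:
            print(f"⚠️ {self.spent_today / self.daily_limit:.0%} of today's budget used")

guard = BudgetGuard(daily_limit_usd=5.0)
guard.add_cost(0.12)
```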

