Cost Optimization Strategies

Optimizing API usage costs while maintaining performance and quality is crucial for scalable AI applications. This guide provides comprehensive strategies for reducing costs through intelligent caching, efficient request patterns, and smart resource management.

Smart Caching System

import hashlib
import json
import time
from typing import Optional, Dict, Any
from openai import OpenAI

class AIResponseCache:
    def __init__(self, ttl_seconds=3600):
        self.cache = {}
        self.ttl = ttl_seconds
    
    def _generate_key(self, model, messages, **params):
        """Generate cache key"""
        # Create standardized request representation
        cache_data = {
            "model": model,
            "messages": messages,
            "params": {k: v for k, v in params.items() 
                     if k not in ['stream', 'user']}  # Exclude parameters that don't affect results
        }
        
        cache_str = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
        return hashlib.md5(cache_str.encode()).hexdigest()
    
    def get(self, model, messages, **params) -> Optional[Dict[Any, Any]]:
        """Get cached response"""
        key = self._generate_key(model, messages, **params)
        
        if key in self.cache:
            cached_data, timestamp = self.cache[key]
            
            # Check if expired
            if time.time() - timestamp < self.ttl:
                print("✅ Cache hit")
                return cached_data
            else:
                # Clean expired cache
                del self.cache[key]
        
        return None
    
    def set(self, model, messages, response, **params):
        """Cache response"""
        key = self._generate_key(model, messages, **params)
        self.cache[key] = (response, time.time())
    
    def clear_expired(self):
        """Clean expired cache"""
        current_time = time.time()
        expired_keys = [
            key for key, (_, timestamp) in self.cache.items()
            if current_time - timestamp >= self.ttl
        ]
        
        for key in expired_keys:
            del self.cache[key]
        
        return len(expired_keys)
    
    def get_stats(self):
        """Get cache statistics"""
        return {
            "total_entries": len(self.cache),
            "memory_usage_kb": len(str(self.cache)) / 1024,
            "oldest_entry_age": min([time.time() - ts for _, ts in self.cache.values()], default=0)
        }

# AI Client with integrated cache
class CachedAIClient:
    def __init__(self, api_key, base_url):
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.cache = AIResponseCache(ttl_seconds=1800)  # 30-minute cache
        self.stats = {"cache_hits": 0, "cache_misses": 0, "total_savings": 0}
    
    def chat(self, model, messages, use_cache=True, **kwargs):
        """Chat interface with cache"""
        # Try to get from cache
        if use_cache:
            cached_response = self.cache.get(model, messages, **kwargs)
            if cached_response:
                self.stats["cache_hits"] += 1
                # Estimate cost savings: prefer the cached usage data, otherwise
                # fall back to a rough 4-characters-per-token heuristic
                usage = cached_response.get("usage") or {}
                saved_tokens = usage.get("total_tokens") or (
                    sum(len(msg.get("content", "")) for msg in messages) / 4
                )
                self.stats["total_savings"] += (saved_tokens / 1000) * 0.0015  # Assumes ~$0.0015 per 1K tokens
                return cached_response
        
        # Call actual API
        self.stats["cache_misses"] += 1
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs
        )
        
        # Convert to dictionary format for caching
        response_dict = {
            "id": response.id,
            "choices": [{
                "index": choice.index,
                "message": {
                    "role": choice.message.role,
                    "content": choice.message.content
                },
                "finish_reason": choice.finish_reason
            } for choice in response.choices],
            "usage": {
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens,
                "total_tokens": response.usage.total_tokens
            } if response.usage else None
        }
        
        # Cache response
        if use_cache:
            self.cache.set(model, messages, response_dict, **kwargs)
        
        return response_dict
    
    def get_cache_stats(self):
        """Get comprehensive cache statistics"""
        cache_stats = self.cache.get_stats()
        total_requests = self.stats["cache_hits"] + self.stats["cache_misses"]
        hit_rate = (self.stats["cache_hits"] / total_requests * 100) if total_requests else 0.0
        
        return {
            **cache_stats,
            **self.stats,
            "cache_hit_rate": f"{hit_rate:.1f}%"
        }

# Usage example
cached_client = CachedAIClient("your-api-key", "https://ai.machinefi.com/v1")

# First call
response1 = cached_client.chat(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What is Python?"}]
)

# Second identical call (will hit cache)
response2 = cached_client.chat(
    model="gpt-3.5-turbo", 
    messages=[{"role": "user", "content": "What is Python?"}]
)

print(cached_client.get_cache_stats())

Advanced Caching Strategies

Semantic Similarity Caching
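
The exact-match cache above misses paraphrases such as "What is Python?" versus "Explain Python". A semantic cache embeds each prompt and reuses a stored answer when cosine similarity clears a threshold. Below is a minimal sketch, assuming the same OpenAI-compatible endpoint exposes an embeddings API with a model named text-embedding-3-small; the 0.92 threshold and the linear scan over entries are illustrative simplifications (a vector index would be needed at scale), and each lookup still pays for one comparatively cheap embedding call.

import numpy as np
from openai import OpenAI

class SemanticCache:
    def __init__(self, client: OpenAI, threshold=0.92, embed_model="text-embedding-3-small"):
        self.client = client
        self.threshold = threshold
        self.embed_model = embed_model
        self.entries = []  # list of (embedding, cached_response) pairs

    def _embed(self, text):
        """Embed a prompt with the provider's embeddings endpoint."""
        result = self.client.embeddings.create(model=self.embed_model, input=text)
        return np.array(result.data[0].embedding)

    def get(self, prompt):
        """Return a cached response whose prompt is semantically close enough."""
        if not self.entries:
            return None
        query = self._embed(prompt)
        for embedding, response in self.entries:
            similarity = float(np.dot(query, embedding) /
                               (np.linalg.norm(query) * np.linalg.norm(embedding)))
            if similarity >= self.threshold:
                return response
        return None

    def set(self, prompt, response):
        self.entries.append((self._embed(prompt), response))

Threshold tuning matters: set it too low and users receive answers to different questions; set it too high and the cache degenerates into exact matching.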

Intelligent Token Management
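
Because the full message history is re-sent with every request, long conversations are a common source of silent cost growth. A simple mitigation is to trim the history to a token budget before each call. The sketch below assumes the tiktoken library is installed and falls back to the cl100k_base encoding; the 3000-token budget and the 4-token per-message overhead are rough, illustrative values.

import tiktoken

def trim_messages(messages, max_tokens=3000, model="gpt-3.5-turbo"):
    """Drop the oldest non-system turns until the history fits the token budget."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    def count(msgs):
        # Content tokens plus a small per-message overhead (approximate)
        return sum(len(encoding.encode(m.get("content", ""))) + 4 for m in msgs)

    system = [m for m in messages if m["role"] == "system"]
    turns = [m for m in messages if m["role"] != "system"]

    while turns and count(system + turns) > max_tokens:
        turns.pop(0)  # discard the oldest turn first

    return system + turns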

Prompt Optimization for Cost Efficiency

Prompt Compression Techniques
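
Whitespace runs, boilerplate courtesies, and filler phrases cost tokens without adding signal. A purely lexical compressor is a safe first step; the phrase list below is illustrative and should be tuned against your own prompts (more aggressive techniques, such as LLM-based summarization of context, trade extra calls for larger savings).

import re

# Illustrative filler phrases; extend with patterns common in your own prompts
FILLER_PHRASES = [
    "please note that",
    "it is important to",
    "in order to",
    "i would like you to",
    "kindly",
]

def compress_prompt(prompt: str) -> str:
    """Strip redundant whitespace and common filler phrases from a prompt."""
    compressed = re.sub(r"\s+", " ", prompt).strip()  # collapse whitespace runs
    for phrase in FILLER_PHRASES:
        compressed = re.sub(re.escape(phrase), "", compressed, flags=re.IGNORECASE)
    return re.sub(r"\s{2,}", " ", compressed).strip()

print(compress_prompt("Please note that   I would like you to summarize this report."))
# -> "summarize this report."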

Batching and Bulk Operations
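
Many small, independent requests each pay for the shared instructions and a full round trip. Packing several short questions into one numbered prompt amortizes that overhead. The helper below is a sketch: the numbered-answer format and the naive parsing are assumptions, and for large offline jobs a provider's dedicated batch endpoint (where available) is usually the better tool.

def batch_questions(client, model, questions):
    """Ask several short questions in one request and split the numbered answers."""
    numbered = "\n".join(f"{i + 1}. {q}" for i, q in enumerate(questions))
    prompt = ("Answer each question below on its own line, "
              "prefixed with its number:\n" + numbered)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    text = response.choices[0].message.content
    # Naive split on numbered lines; production code needs more robust parsing
    return [line.split(".", 1)[-1].strip()
            for line in text.splitlines()
            if line.strip()[:1].isdigit()]

answers = batch_questions(
    cached_client.client, "gpt-3.5-turbo",
    ["What is Python?", "What is Rust?", "What is Go?"]
)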

Cost Monitoring and Analytics
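
You cannot optimize spending you do not measure. Every chat response already carries a usage block, so a small per-model ledger is enough to start. The prices in the sketch below are placeholders; substitute your provider's actual per-1K-token rates.

from collections import defaultdict

class CostTracker:
    # Placeholder $ per 1K tokens; replace with your provider's actual pricing
    PRICES = {
        "gpt-3.5-turbo": {"prompt": 0.0015, "completion": 0.002},
        "gpt-4": {"prompt": 0.03, "completion": 0.06},
    }

    def __init__(self):
        self.usage = defaultdict(lambda: {"prompt_tokens": 0, "completion_tokens": 0, "cost_usd": 0.0})

    def record(self, model, usage):
        """Record the usage dict from a chat response (cached or live)."""
        if not usage:
            return
        prices = self.PRICES.get(model, {"prompt": 0.0, "completion": 0.0})
        entry = self.usage[model]
        entry["prompt_tokens"] += usage["prompt_tokens"]
        entry["completion_tokens"] += usage["completion_tokens"]
        entry["cost_usd"] += (
            (usage["prompt_tokens"] / 1000) * prices["prompt"]
            + (usage["completion_tokens"] / 1000) * prices["completion"]
        )

    def report(self):
        return {model: dict(stats) for model, stats in self.usage.items()}

tracker = CostTracker()
response = cached_client.chat(model="gpt-3.5-turbo",
                              messages=[{"role": "user", "content": "What is Python?"}])
tracker.record("gpt-3.5-turbo", response["usage"])
print(tracker.report())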

Cost Optimization Best Practices

  1. Implement Intelligent Caching: Cache responses for repeated or similar queries

  2. Use Appropriate Models: Don't use GPT-4 for tasks that GPT-3.5-turbo can handle

  3. Optimize Prompt Length: Remove unnecessary words and use compression techniques

  4. Batch Similar Requests: Group related queries into single requests when possible

  5. Set Token Limits: Use appropriate max_tokens settings to avoid over-generation

  6. Monitor Usage Patterns: Track costs and identify optimization opportunities

  7. Implement Rate Limiting: Control request frequency to manage costs

  8. Use Streaming Wisely: Stream to improve perceived latency; streamed tokens are billed the same as non-streamed ones, and a stream aborted early may still be charged for tokens already generated

  9. Leverage Model-Specific Features: Use each model's strengths efficiently

  10. Implement Budget Controls: Set daily/monthly limits and automated alerts (a minimal sketch follows this list)
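
To make point 10 concrete, here is a minimal sketch of a daily budget guard. The limit, alert threshold, and in-process counter are all assumptions; a real deployment would persist spend across processes and wire the alert into your monitoring system.

import time

class BudgetGuard:
    """Refuse new requests once an estimated daily spend limit is reached."""

    def __init__(self, daily_limit_usd=10.0, alert_threshold=0.8):
        self.daily_limit = daily_limit_usd
        self.alert_threshold = alert_threshold
        self.spent_today = 0.0
        self.day = time.strftime("%Y-%m-%d")

    def _roll_over(self):
        # Reset the counter when the calendar day changes
        today = time.strftime("%Y-%m-%d")
        if today != self.day:
            self.day, self.spent_today = today, 0.0

    def charge(self, estimated_cost_usd):
        """Record an estimated cost; raise if the daily budget would be exceeded."""
        self._roll_over()
        if self.spent_today + estimated_cost_usd > self.daily_limit:
            raise RuntimeError(f"Daily budget of ${self.daily_limit:.2f} exhausted")
        self.spent_today += estimated_cost_usd
        if self.spent_today >= self.daily_limit * self.alert_threshold:
            print(f"⚠️ Spent ${self.spent_today:.2f} of ${self.daily_limit:.2f} today")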