AI infrastructure optimization is the 2026 reckoning. Deloitte calls it an “AI infrastructure reckoning” — organizations moving past the “just buy GPUs” phase into balancing model choice, inference cost, deployment architecture, and token economics. NVIDIA emphasizes cost-efficient token production as the key metric.
This guide shows how to use Terraform to deploy cost-optimized AI inference infrastructure on AWS.
The Cost Problem
| Model | Input $/1M tokens | Output $/1M tokens | 1M requests/month* |
|---|---|---|---|
| Claude 3.5 Sonnet (Bedrock) | $3.00 | $15.00 | ~$18,000 |
| Claude 3 Haiku (Bedrock) | $0.25 | $1.25 | ~$1,500 |
| Llama 3 70B (SageMaker) | Self-hosted | Self-hosted | ~$3,000-5,000 |
| Llama 3 8B (SageMaker) | Self-hosted | Self-hosted | ~$800-1,500 |
*Estimated 500 input + 500 output tokens per request
Strategy 1: Model Routing — Right Model for the Task
# Deploy multiple model endpoints at different cost tiers
# Premium-tier real-time endpoint serving the 70B model; the router Lambda
# sends complex queries here. Tags are informational (cost reporting) and
# have no effect on routing or scaling.
resource "aws_sagemaker_endpoint" "large_model" {
name = "llama-70b-complex"
endpoint_config_name = aws_sagemaker_endpoint_configuration.large.name
tags = { ModelTier = "premium", CostPerRequest = "high" }
}
# Endpoint configuration for the premium tier: one GPU-heavy instance
# (ml.g5.12xlarge, 4x A10G) hosting the 70B model as a single variant.
# Auto scaling (defined separately) can raise the instance count later.
resource "aws_sagemaker_endpoint_configuration" "large" {
name = "llama-70b-config"
production_variants {
variant_name = "primary"
model_name = aws_sagemaker_model.llama_70b.name
instance_type = "ml.g5.12xlarge"
initial_instance_count = 1
}
}
# Economy-tier endpoint serving the 8B model; the router Lambda sends
# simple queries here. Tags mirror the premium endpoint's for cost reports.
resource "aws_sagemaker_endpoint" "small_model" {
name = "llama-8b-simple"
endpoint_config_name = aws_sagemaker_endpoint_configuration.small.name
tags = { ModelTier = "economy", CostPerRequest = "low" }
}
# Endpoint configuration for the economy tier: a single-GPU ml.g5.xlarge
# is sufficient for the 8B model, at a fraction of the premium tier's cost.
resource "aws_sagemaker_endpoint_configuration" "small" {
name = "llama-8b-config"
production_variants {
variant_name = "primary"
model_name = aws_sagemaker_model.llama_8b.name
instance_type = "ml.g5.xlarge"
initial_instance_count = 1
}
}
# Router Lambda — sends simple queries to small model, complex to large
# Router Lambda: classifies each incoming request and forwards it to the
# cheapest endpoint that can handle it — small model for simple queries,
# large model for complex ones, Bedrock Claude as the fallback for the
# hardest tasks. Endpoint names are injected via environment variables so
# the routing code stays decoupled from infrastructure names.
resource "aws_lambda_function" "model_router" {
  function_name = "ai-model-router"
  runtime       = "python3.12"
  handler       = "router.handler" # router.py must expose handler(event, context)
  timeout       = 30               # seconds; bounds a single synchronous model invoke
  memory_size   = 256              # MB; routing logic itself is lightweight

  filename         = "lambda/router/deployment.zip"
  source_code_hash = filebase64sha256("lambda/router/deployment.zip")
  role             = aws_iam_role.router.arn

  environment {
    variables = {
      LARGE_ENDPOINT = aws_sagemaker_endpoint.large_model.name
      SMALL_ENDPOINT = aws_sagemaker_endpoint.small_model.name
      # Bedrock InvokeModel requires the full versioned model ID; the bare
      # family name "anthropic.claude-3-5-sonnet" is not a valid ModelId.
      BEDROCK_MODEL = "anthropic.claude-3-5-sonnet-20240620-v1:0" # Fallback for hardest tasks
    }
  }
}
Strategy 2: Auto-Scaling by Demand
# Scale SageMaker endpoints based on invocations
# Registers the premium endpoint's "primary" variant with Application Auto
# Scaling so its instance count can track demand between 1 and 4 instances.
# The resource_id format "endpoint/<name>/variant/<variant>" is required by
# the SageMaker scaling integration.
resource "aws_appautoscaling_target" "sagemaker" {
max_capacity = 4
min_capacity = 1
resource_id = "endpoint/${aws_sagemaker_endpoint.large_model.name}/variant/primary"
scalable_dimension = "sagemaker:variant:DesiredInstanceCount"
service_namespace = "sagemaker"
}
# Target-tracking policy: Application Auto Scaling adds/removes instances
# to keep per-instance invocations near target_value. Asymmetric cooldowns
# make the fleet scale out quickly under load but scale in cautiously,
# avoiding flapping on bursty traffic.
resource "aws_appautoscaling_policy" "scale_on_invocations" {
name = "scale-on-invocations"
policy_type = "TargetTrackingScaling"
resource_id = aws_appautoscaling_target.sagemaker.resource_id
scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension
service_namespace = aws_appautoscaling_target.sagemaker.service_namespace
target_tracking_scaling_policy_configuration {
predefined_metric_specification {
predefined_metric_type = "SageMakerVariantInvocationsPerInstance"
}
target_value = 100 # Target invocations per instance per minute
scale_in_cooldown = 600 # Wait 10 min before scaling down
scale_out_cooldown = 120 # Scale up faster
}
}
# Scale to zero during off-hours (schedule-based)
# Off-hours floor reduction: every day at 22:00 UTC, lower the scaling
# bounds so the fleet can shrink overnight.
# NOTE(review): min_capacity = 0 assumes this endpoint type supports
# scaling to zero instances — classic real-time endpoint variants require
# at least 1; scale-to-zero needs inference-component style endpoints.
# Confirm against the endpoint configuration before relying on this.
resource "aws_appautoscaling_scheduled_action" "night_scale_down" {
name = "night-scale-down"
service_namespace = aws_appautoscaling_target.sagemaker.service_namespace
resource_id = aws_appautoscaling_target.sagemaker.resource_id
scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension
schedule = "cron(0 22 * * ? *)" # 10 PM UTC
scalable_target_action {
min_capacity = 0
max_capacity = 1
}
}
# Business-hours restore: every day at 07:00 UTC, raise the bounds back to
# the normal 1–4 range so the target-tracking policy can scale with demand.
resource "aws_appautoscaling_scheduled_action" "morning_scale_up" {
name = "morning-scale-up"
service_namespace = aws_appautoscaling_target.sagemaker.service_namespace
resource_id = aws_appautoscaling_target.sagemaker.resource_id
scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension
schedule = "cron(0 7 * * ? *)" # 7 AM UTC
scalable_target_action {
min_capacity = 1
max_capacity = 4
}
}
Strategy 3: Spot Instances for Batch Inference
# Elastic capacity for batch/async inference. Note: SageMaker real-time endpoints do not run on Spot — the savings here come from scaling the fleet to zero when idle; Spot discounts (typically 60-90%) apply to SageMaker training and batch transform jobs.
# Elastic configuration for bursty batch-style traffic on the 70B model.
# NOTE(review): despite the "spot" naming, nothing here requests Spot
# capacity — SageMaker real-time endpoints run on on-demand instances.
# The cost saving in this block comes from managed_instance_scaling
# letting the fleet shrink to zero when idle. Confirm whether Spot-backed
# batch transform / training was actually intended.
resource "aws_sagemaker_endpoint_configuration" "batch_spot" {
name = "batch-inference-spot"
production_variants {
variant_name = "spot"
model_name = aws_sagemaker_model.llama_70b.name
instance_type = "ml.g5.12xlarge"
initial_instance_count = 1
# SageMaker-managed scaling: instance count may vary between 0 and 8.
managed_instance_scaling {
status = "ENABLED"
min_instance_count = 0
max_instance_count = 8
}
# Send each request to the instance with the fewest requests in flight.
routing_config {
routing_strategy = "LEAST_OUTSTANDING_REQUESTS"
}
}
}
# SageMaker batch transform for large-scale inference
# NOTE(review): there is no "aws_sagemaker_transform_job" resource in the
# Terraform AWS provider — batch transform jobs are short-lived and are
# started through the CreateTransformJob API, typically from a Step
# Functions state machine or an EventBridge-scheduled Lambda. The invalid
# resource stub is removed so the configuration passes `terraform validate`.
# To implement the weekly batch (S3 input → S3 output, no persistent
# endpoint), define an aws_sfn_state_machine with a
# "arn:aws:states:::sagemaker:createTransformJob.sync" task and trigger it
# with an aws_scheduler_schedule (or aws_cloudwatch_event_rule).
Strategy 4: Caching Layer
# ElastiCache for caching repeated prompts/responses
# Redis replication group used as a response cache: identical prompts can
# be answered from cache instead of re-invoking a model. Two nodes give a
# primary plus one replica for availability; encryption is enabled both at
# rest and in transit since cached responses may contain user data.
resource "aws_elasticache_replication_group" "ai_cache" {
replication_group_id = "ai-response-cache"
description = "Cache for AI model responses"
node_type = "cache.r6g.large"
num_cache_clusters = 2
engine = "redis"
engine_version = "7.1"
# Cache repeated queries — same question = same answer
parameter_group_name = aws_elasticache_parameter_group.ai.name
at_rest_encryption_enabled = true
transit_encryption_enabled = true
tags = { Component = "ai-optimization" }
}
# Redis parameter group for the response cache.
resource "aws_elasticache_parameter_group" "ai" {
name = "ai-cache-params"
family = "redis7"
# maxmemory-policy controls eviction under memory pressure, not TTL:
# when memory fills, the least-recently-used keys are evicted across the
# whole keyspace. Per-entry TTLs are set by the application on write.
parameter {
name = "maxmemory-policy"
value = "allkeys-lru" # Evict least recently used
}
}
Cost Monitoring Dashboard
# Cost-observability dashboard: Bedrock token consumption per model,
# SageMaker endpoint utilization, and cache effectiveness on one screen.
resource "aws_cloudwatch_dashboard" "ai_costs" {
  dashboard_name = "ai-cost-optimization"
  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          title = "Token Usage by Model"
          # Bedrock publishes per-model token counts under AWS/Bedrock; the
          # ModelId dimension must be the full versioned model ID — the bare
          # family name (e.g. "anthropic.claude-3-5-sonnet") matches nothing.
          metrics = [
            ["AWS/Bedrock", "InputTokenCount", "ModelId", "anthropic.claude-3-5-sonnet-20240620-v1:0"],
            ["AWS/Bedrock", "OutputTokenCount", "ModelId", "anthropic.claude-3-5-sonnet-20240620-v1:0"],
            ["AWS/Bedrock", "InputTokenCount", "ModelId", "anthropic.claude-3-haiku-20240307-v1:0"],
            ["AWS/Bedrock", "OutputTokenCount", "ModelId", "anthropic.claude-3-haiku-20240307-v1:0"]
          ]
          period = 3600
          stat   = "Sum"
        }
      },
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          # Renamed from "Instance Hours": these metrics measure utilization,
          # not billed hours (instance-hour cost lives in Cost Explorer).
          title = "SageMaker Endpoint Utilization"
          # Instance-level metrics (CPU/GPU/memory utilization) are published
          # under the "/aws/sagemaker/Endpoints" namespace — not AWS/SageMaker,
          # which holds invocation metrics — and require both the EndpointName
          # and VariantName dimensions.
          metrics = [
            ["/aws/sagemaker/Endpoints", "CPUUtilization", "EndpointName", "llama-70b-complex", "VariantName", "primary"],
            ["/aws/sagemaker/Endpoints", "GPUUtilization", "EndpointName", "llama-70b-complex", "VariantName", "primary"],
            ["/aws/sagemaker/Endpoints", "GPUMemoryUtilization", "EndpointName", "llama-70b-complex", "VariantName", "primary"]
          ]
          period = 300
          stat   = "Average"
        }
      },
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          title = "Cache Hit Rate"
          # Member clusters of replication group "ai-response-cache" are named
          # with the -001, -002 suffix convention.
          metrics = [
            ["AWS/ElastiCache", "CacheHitRate", "CacheClusterId", "ai-response-cache-001"]
          ]
          period = 300
          stat   = "Average"
        }
      }
    ]
  })
}
Hands-On Courses
- Terraform for Beginners on CopyPasteLearn
- Terraform By Example — practical code examples
Conclusion
AI infrastructure optimization is about deploying the right model at the right cost: route simple queries to small models, auto-scale endpoints by demand, use Spot for batch inference, and cache repeated responses. Terraform makes this architecture reproducible and tunable — adjust instance types, scaling thresholds, and model routing as your usage patterns evolve. The goal isn’t minimum cost; it’s maximum value per token.
