Terraform for AI-Native Development Platforms on AWS
Provision AI-native developer platforms with Terraform: sandboxes, CI/CD runners, model-serving environments, secrets, VPCs, and preview environments.
DevOps
Optimize AI infrastructure costs with Terraform. Deploy right-sized inference endpoints, auto-scale based on token throughput, use Spot instances
AI infrastructure optimization is the defining infrastructure challenge of 2026. Deloitte calls it an "AI infrastructure reckoning": organizations are moving past the "just buy GPUs" phase and into balancing model choice, inference cost, deployment architecture, and token economics. NVIDIA emphasizes cost-efficient token production as the key metric.
This guide shows how to use Terraform to deploy cost-optimized AI inference infrastructure on AWS.
| Model | Input $/1M tokens | Output $/1M tokens | 1M requests/month* |
|---|---|---|---|
| Claude 3.5 Sonnet (Bedrock) | $3.00 | $15.00 | ~$18,000 |
| Claude 3 Haiku (Bedrock) | $0.25 | $1.25 | ~$1,500 |
| Llama 3 70B (SageMaker) | Self-hosted | Self-hosted | ~$3,000-5,000 |
| Llama 3 8B (SageMaker) | Self-hosted | Self-hosted | ~$800-1,500 |
*Estimated 1,000 input + 1,000 output tokens per request (e.g. Claude 3.5 Sonnet: 1B input tokens × $3.00/1M + 1B output tokens × $15.00/1M ≈ $18,000)
# Model endpoints at different cost tiers.
# Premium tier: the endpoint that fronts the Llama 70B endpoint configuration.
resource "aws_sagemaker_endpoint" "large_model" {
  name                 = "llama-70b-complex"
  endpoint_config_name = aws_sagemaker_endpoint_configuration.large.name

  # Tier labels attached to the endpoint as tags.
  tags = {
    ModelTier      = "premium"
    CostPerRequest = "high"
  }
}
# Endpoint configuration used by the premium endpoint: a single variant named
# "primary" that runs the Llama 70B model on one ml.g5.12xlarge instance.
resource "aws_sagemaker_endpoint_configuration" "large" {
  name = "llama-70b-config"

  production_variants {
    model_name             = aws_sagemaker_model.llama_70b.name
    variant_name           = "primary"
    initial_instance_count = 1
    instance_type          = "ml.g5.12xlarge"
  }
}
# Economy tier: the endpoint that fronts the Llama 8B endpoint configuration.
resource "aws_sagemaker_endpoint" "small_model" {
  name                 = "llama-8b-simple"
  endpoint_config_name = aws_sagemaker_endpoint_configuration.small.name

  # Tier labels attached to the endpoint as tags.
  tags = {
    ModelTier      = "economy"
    CostPerRequest = "low"
  }
}
# Endpoint configuration used by the economy endpoint: a single variant named
# "primary" that runs the Llama 8B model on one ml.g5.xlarge instance.
resource "aws_sagemaker_endpoint_configuration" "small" {
  name = "llama-8b-config"

  production_variants {
    model_name             = aws_sagemaker_model.llama_8b.name
    variant_name           = "primary"
    initial_instance_count = 1
    instance_type          = "ml.g5.xlarge"
  }
}
# Router Lambda — sends simple queries to the small model, complex ones to the large model.
# The function learns its downstream targets from environment variables: the two
# SageMaker endpoint names and the Bedrock model used as a fallback.
resource "aws_lambda_function" "model_router" {
  function_name = "ai-model-router"
  role          = aws_iam_role.router.arn

  runtime     = "python3.12"
  handler     = "router.handler"
  timeout     = 30
  memory_size = 256

  # Deployment package; the hash makes Terraform redeploy when the zip changes.
  filename         = "lambda/router/deployment.zip"
  source_code_hash = filebase64sha256("lambda/router/deployment.zip")

  environment {
    variables = {
      LARGE_ENDPOINT = aws_sagemaker_endpoint.large_model.name
      SMALL_ENDPOINT = aws_sagemaker_endpoint.small_model.name

      # Bedrock only accepts the full, versioned model ID; the bare family name
      # "anthropic.claude-3-5-sonnet" is rejected when the model is invoked.
      BEDROCK_MODEL = "anthropic.claude-3-5-sonnet-20240620-v1:0" # Fallback for hardest tasks
    }
  }
}

# Scale SageMaker endpoints based on invocations
# Registers the "primary" variant of the premium endpoint with Application Auto
# Scaling, allowing its instance count to move between 1 and 4.
resource "aws_appautoscaling_target" "sagemaker" {
  service_namespace  = "sagemaker"
  scalable_dimension = "sagemaker:variant:DesiredInstanceCount"
  resource_id        = "endpoint/${aws_sagemaker_endpoint.large_model.name}/variant/primary"

  min_capacity = 1
  max_capacity = 4
}
# Target-tracking policy for the scalable target above: keeps the per-instance
# invocation rate near the target value.
resource "aws_appautoscaling_policy" "scale_on_invocations" {
  name        = "scale-on-invocations"
  policy_type = "TargetTrackingScaling"

  # Identify the scalable target by reusing its own attributes.
  service_namespace  = aws_appautoscaling_target.sagemaker.service_namespace
  scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension
  resource_id        = aws_appautoscaling_target.sagemaker.resource_id

  target_tracking_scaling_policy_configuration {
    target_value       = 100 # Target invocations per instance per minute
    scale_out_cooldown = 120 # Scale up faster
    scale_in_cooldown  = 600 # Wait 10 min before scaling down

    predefined_metric_specification {
      predefined_metric_type = "SageMakerVariantInvocationsPerInstance"
    }
  }
}
# Scale down to a single instance during off-hours (schedule-based).
# A real-time endpoint variant cannot be scaled to zero instances through
# sagemaker:variant:DesiredInstanceCount — scale-to-zero needs an asynchronous
# endpoint or an inference-component-based endpoint — so the overnight floor is 1.
# Pinning min and max to 1 also stops target tracking from scaling out overnight.
resource "aws_appautoscaling_scheduled_action" "night_scale_down" {
  name               = "night-scale-down"
  service_namespace  = aws_appautoscaling_target.sagemaker.service_namespace
  resource_id        = aws_appautoscaling_target.sagemaker.resource_id
  scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension
  schedule           = "cron(0 22 * * ? *)" # 10 PM UTC

  scalable_target_action {
    min_capacity = 1
    max_capacity = 1
  }
}
# Daytime schedule: restores the 1–4 instance range on the scalable target
# every day at 07:00 UTC.
resource "aws_appautoscaling_scheduled_action" "morning_scale_up" {
  name     = "morning-scale-up"
  schedule = "cron(0 7 * * ? *)" # 7 AM UTC

  # Same scalable target as the scaling policy.
  service_namespace  = aws_appautoscaling_target.sagemaker.service_namespace
  scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension
  resource_id        = aws_appautoscaling_target.sagemaker.resource_id

  scalable_target_action {
    min_capacity = 1
    max_capacity = 4
  }
}

# Use Spot for batch/async inference — 60-70% cheaper
# Endpoint configuration for batch/async traffic: one Llama 70B variant with
# managed instance scaling between 0 and 8 instances and
# least-outstanding-requests routing.
# NOTE(review): no attribute in this block requests Spot capacity, despite the
# resource and variant names — confirm how the Spot saving is meant to be obtained.
resource "aws_sagemaker_endpoint_configuration" "batch_spot" {
  name = "batch-inference-spot"

  production_variants {
    model_name             = aws_sagemaker_model.llama_70b.name
    variant_name           = "spot"
    initial_instance_count = 1
    instance_type          = "ml.g5.12xlarge"

    # Send each request to the instance with the fewest requests in flight.
    routing_config {
      routing_strategy = "LEAST_OUTSTANDING_REQUESTS"
    }

    # Let SageMaker adjust the instance count within these bounds.
    managed_instance_scaling {
      status             = "ENABLED"
      min_instance_count = 0
      max_instance_count = 8
    }
  }
}
# SageMaker batch transform for large-scale inference
#
# A batch transform job is a one-off API call (CreateTransformJob), not standing
# infrastructure, and the AWS provider has no `aws_sagemaker_transform_job`
# resource type — declaring one makes `terraform validate` fail. Instead of a
# Terraform resource, start the weekly job from the orchestration layer:
#   - Triggered by Step Functions (the SageMaker CreateTransformJob task) or an
#     EventBridge schedule
#   - Processes S3 input → S3 output without a persistent endpoint
# Terraform's part is provisioning the model, S3 buckets, IAM roles, and the
# state machine or schedule that launches the job.

# ElastiCache for caching repeated prompts/responses
# Redis replication group that stores model responses so repeated prompts can be
# answered from the cache.
resource "aws_elasticache_replication_group" "ai_cache" {
  replication_group_id = "ai-response-cache"
  description          = "Cache for AI model responses"

  # Engine and sizing: Redis 7.1 on two cache.r6g.large nodes.
  engine             = "redis"
  engine_version     = "7.1"
  node_type          = "cache.r6g.large"
  num_cache_clusters = 2

  # Cache repeated queries — same question = same answer
  parameter_group_name = aws_elasticache_parameter_group.ai.name

  # Encrypt cached data both on disk and on the wire.
  at_rest_encryption_enabled = true
  transit_encryption_enabled = true

  tags = {
    Component = "ai-optimization"
  }
}
# Parameter group for the response cache.
resource "aws_elasticache_parameter_group" "ai" {
  name   = "ai-cache-params"
  family = "redis7"

  # Eviction policy (this is not a TTL): when memory fills up, Redis evicts the
  # least recently used keys. Per-response TTLs must be set by the client when it
  # writes each cache entry.
  parameter {
    name  = "maxmemory-policy"
    value = "allkeys-lru" # Evict least recently used
  }
}

# Region the dashboard widgets read their metrics from (the provider's region).
# NOTE(review): on AWS provider v6+ `.name` is reported as deprecated in favour of
# `.region` — switch if the project pins v6 or later.
data "aws_region" "dashboard" {}

# CloudWatch dashboard with three widgets: Bedrock token usage per model,
# utilization of the premium SageMaker endpoint, and the response-cache hit rate.
resource "aws_cloudwatch_dashboard" "ai_costs" {
  dashboard_name = "ai-cost-optimization"

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          title = "Token Usage by Model"
          # Metric widgets must name the region their metrics live in.
          region = data.aws_region.dashboard.name
          # The ModelId dimension carries the full, versioned Bedrock model ID;
          # the bare family name matches no datapoints.
          metrics = [
            ["AWS/Bedrock", "InputTokenCount", "ModelId", "anthropic.claude-3-5-sonnet-20240620-v1:0"],
            ["AWS/Bedrock", "OutputTokenCount", "ModelId", "anthropic.claude-3-5-sonnet-20240620-v1:0"],
            ["AWS/Bedrock", "InputTokenCount", "ModelId", "anthropic.claude-3-haiku-20240307-v1:0"],
            ["AWS/Bedrock", "OutputTokenCount", "ModelId", "anthropic.claude-3-haiku-20240307-v1:0"]
          ]
          period = 3600
          stat   = "Sum"
        }
      },
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          # These are utilization percentages, not instance hours.
          title  = "SageMaker Endpoint Utilization"
          region = data.aws_region.dashboard.name
          # Instance-level metrics are published under /aws/sagemaker/Endpoints
          # (AWS/SageMaker holds the invocation metrics) and are keyed by both
          # EndpointName and VariantName.
          metrics = [
            ["/aws/sagemaker/Endpoints", "CPUUtilization", "EndpointName", aws_sagemaker_endpoint.large_model.name, "VariantName", "primary"],
            ["/aws/sagemaker/Endpoints", "GPUUtilization", "EndpointName", aws_sagemaker_endpoint.large_model.name, "VariantName", "primary"],
            ["/aws/sagemaker/Endpoints", "GPUMemoryUtilization", "EndpointName", aws_sagemaker_endpoint.large_model.name, "VariantName", "primary"]
          ]
          period = 300
          stat   = "Average"
        }
      },
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          title  = "Cache Hit Rate"
          region = data.aws_region.dashboard.name
          # First member node of the replication group: "<replication_group_id>-001".
          metrics = [
            ["AWS/ElastiCache", "CacheHitRate", "CacheClusterId", "${aws_elasticache_replication_group.ai_cache.replication_group_id}-001"]
          ]
          period = 300
          stat   = "Average"
        }
      }
    ]
  })
}

AI infrastructure optimization is about deploying the right model at the right cost: route simple queries to small models, auto-scale endpoints by demand, use Spot for batch inference, and cache repeated responses. Terraform makes this architecture reproducible and tunable — adjust instance types, scaling thresholds, and model routing as your usage patterns evolve. The goal isn't minimum cost; it's maximum value per token.
Provision AI-native developer platforms with Terraform: sandboxes, CI/CD runners, model-serving environments, secrets, VPCs, and preview environments.
Deploy agentic AI and multi-agent systems with Terraform on AWS. Provision SQS queues, Lambda functions, Step Functions orchestration
Secure AI workloads with Terraform. Deploy Bedrock guardrails, model access IAM policies, prompt injection detection
Provision AI supercomputing infrastructure with Terraform. Deploy GPU clusters with p5.48xlarge, EFA networking, FSx Lustre storage