TerraformPilot

DevOps

Terraform for AI Infrastructure Optimization: Cost-Efficient Model Deployment on AWS

Optimize AI infrastructure costs with Terraform: deploy right-sized inference endpoints, auto-scale based on token throughput, and use Spot instances for batch workloads.

Luca Berton · 1 min read

AI infrastructure optimization is the 2026 reckoning. Deloitte calls it an "AI infrastructure reckoning" — organizations moving past the "just buy GPUs" phase into balancing model choice, inference cost, deployment architecture, and token economics. NVIDIA emphasizes cost-efficient token production as the key metric.

This guide shows how to use Terraform to deploy cost-optimized AI inference infrastructure on AWS.

The Cost Problem

#
| Model | Input $/1M tokens | Output $/1M tokens | 1M requests/month* |
| --- | --- | --- | --- |
| Claude 3.5 Sonnet (Bedrock) | $3.00 | $15.00 | ~$18,000 |
| Claude 3 Haiku (Bedrock) | $0.25 | $1.25 | ~$1,500 |
| Llama 3 70B (SageMaker) | Self-hosted | Self-hosted | ~$3,000-5,000 |
| Llama 3 8B (SageMaker) | Self-hosted | Self-hosted | ~$800-1,500 |

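*Estimated 1,000 input + 1,000 output tokens per request (1B input + 1B output tokens per month; e.g. Claude 3.5 Sonnet: 1,000 × $3.00 + 1,000 × $15.00 = $18,000)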

Strategy 1: Model Routing — Right Model for the Task

#
# Multiple model endpoints are deployed at different cost tiers.
# This one is the premium tier: the "llama-70b-complex" endpoint, served from
# the "large" endpoint configuration.
resource "aws_sagemaker_endpoint" "large_model" {
  endpoint_config_name = aws_sagemaker_endpoint_configuration.large.name
  name                 = "llama-70b-complex"

  # Tags record the tier and relative per-request cost of this endpoint.
  tags = {
    CostPerRequest = "high"
    ModelTier      = "premium"
  }
}
 
# Endpoint configuration behind the premium endpoint: a single production
# variant named "primary" hosting the llama_70b model on one ml.g5.12xlarge.
resource "aws_sagemaker_endpoint_configuration" "large" {
  name = "llama-70b-config"

  production_variants {
    model_name   = aws_sagemaker_model.llama_70b.name
    variant_name = "primary"

    # Starting fleet: one instance of the chosen type.
    initial_instance_count = 1
    instance_type          = "ml.g5.12xlarge"
  }
}
 
# Economy tier: the "llama-8b-simple" endpoint, served from the "small"
# endpoint configuration.
resource "aws_sagemaker_endpoint" "small_model" {
  endpoint_config_name = aws_sagemaker_endpoint_configuration.small.name
  name                 = "llama-8b-simple"

  # Tags record the tier and relative per-request cost of this endpoint.
  tags = {
    CostPerRequest = "low"
    ModelTier      = "economy"
  }
}
 
# Endpoint configuration behind the economy endpoint: a single production
# variant named "primary" hosting the llama_8b model on one ml.g5.xlarge.
resource "aws_sagemaker_endpoint_configuration" "small" {
  name = "llama-8b-config"

  production_variants {
    model_name   = aws_sagemaker_model.llama_8b.name
    variant_name = "primary"

    # Starting fleet: one instance of the chosen type.
    initial_instance_count = 1
    instance_type          = "ml.g5.xlarge"
  }
}
 
# Router Lambda — sends simple queries to the small model and complex ones to
# the large model. The routing logic itself lives in the deployment package
# (handler "router.handler"); this resource only wires in the targets through
# environment variables.
resource "aws_lambda_function" "model_router" {
  function_name = "ai-model-router"
  runtime       = "python3.12"
  handler       = "router.handler"
  timeout       = 30
  memory_size   = 256

  # The hash is computed from the same archive that is uploaded, so a rebuilt
  # zip is detected as a change to the function code.
  filename         = "lambda/router/deployment.zip"
  source_code_hash = filebase64sha256("lambda/router/deployment.zip")
  role             = aws_iam_role.router.arn

  environment {
    variables = {
      LARGE_ENDPOINT = aws_sagemaker_endpoint.large_model.name
      SMALL_ENDPOINT = aws_sagemaker_endpoint.small_model.name

      # Fallback for hardest tasks. Bedrock expects the complete model ID,
      # including the date/version suffix; the bare family name is not a
      # valid identifier.
      # NOTE(review): confirm this is the version enabled in the target
      # account/region (or substitute an inference profile ID).
      BEDROCK_MODEL = "anthropic.claude-3-5-sonnet-20240620-v1:0"
    }
  }
}

Strategy 2: Auto-Scaling by Demand

#
# Scale SageMaker endpoints based on invocations.
# Registers the "primary" variant of the large-model endpoint as a scalable
# target whose instance count may range from 1 to 4.
resource "aws_appautoscaling_target" "sagemaker" {
  service_namespace  = "sagemaker"
  scalable_dimension = "sagemaker:variant:DesiredInstanceCount"
  resource_id        = "endpoint/${aws_sagemaker_endpoint.large_model.name}/variant/primary"

  # Capacity bounds for the variant.
  min_capacity = 1
  max_capacity = 4
}
 
# Target-tracking policy attached to the scalable target above: keeps the
# predefined per-instance invocation metric near the configured target value.
resource "aws_appautoscaling_policy" "scale_on_invocations" {
  name        = "scale-on-invocations"
  policy_type = "TargetTrackingScaling"

  # Identify the scalable target by reusing its own attributes.
  service_namespace  = aws_appautoscaling_target.sagemaker.service_namespace
  scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension
  resource_id        = aws_appautoscaling_target.sagemaker.resource_id

  target_tracking_scaling_policy_configuration {
    # Target invocations per instance per minute.
    target_value = 100

    # Asymmetric cooldowns: scale out after 2 minutes, but wait 10 minutes
    # before scaling back in.
    scale_out_cooldown = 120
    scale_in_cooldown  = 600

    predefined_metric_specification {
      predefined_metric_type = "SageMakerVariantInvocationsPerInstance"
    }
  }
}
 
# Scale to zero during off-hours (schedule-based).
# Every day at 22:00 UTC the scalable target's bounds are lowered to 0-1.
# NOTE(review): confirm that a minimum of 0 is accepted for this endpoint
# type, and that the invocation-based policy can scale back out from zero
# before the morning action runs.
resource "aws_appautoscaling_scheduled_action" "night_scale_down" {
  name     = "night-scale-down"
  schedule = "cron(0 22 * * ? *)" # 10 PM UTC

  # Same scalable target as the scaling policy.
  resource_id        = aws_appautoscaling_target.sagemaker.resource_id
  scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension
  service_namespace  = aws_appautoscaling_target.sagemaker.service_namespace

  scalable_target_action {
    max_capacity = 1
    min_capacity = 0
  }
}
 
# Counterpart to the nightly scale-down: every day at 07:00 UTC the scalable
# target's bounds are restored to 1-4.
resource "aws_appautoscaling_scheduled_action" "morning_scale_up" {
  name     = "morning-scale-up"
  schedule = "cron(0 7 * * ? *)" # 7 AM UTC

  # Same scalable target as the scaling policy.
  resource_id        = aws_appautoscaling_target.sagemaker.resource_id
  scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension
  service_namespace  = aws_appautoscaling_target.sagemaker.service_namespace

  scalable_target_action {
    max_capacity = 4
    min_capacity = 1
  }
}

Strategy 3: Spot Instances for Batch Inference

#
# Endpoint configuration intended for batch/async inference; the article
# presents it as the Spot tier (60-70% cheaper).
# NOTE(review): no argument in this block requests Spot capacity — confirm how
# Spot pricing is meant to apply to this variant.
resource "aws_sagemaker_endpoint_configuration" "batch_spot" {
  name = "batch-inference-spot"

  production_variants {
    model_name   = aws_sagemaker_model.llama_70b.name
    variant_name = "spot"

    # Starts with one ml.g5.12xlarge instance.
    initial_instance_count = 1
    instance_type          = "ml.g5.12xlarge"

    # Managed scaling is enabled with bounds of 0 to 8 instances.
    managed_instance_scaling {
      status             = "ENABLED"
      max_instance_count = 8
      min_instance_count = 0
    }

    # Requests are routed with the least-outstanding-requests strategy.
    routing_config {
      routing_strategy = "LEAST_OUTSTANDING_REQUESTS"
    }
  }
}
 
# SageMaker batch transform for large-scale inference
# NOTE(review): this block is an empty placeholder — it sets no arguments.
# Confirm that the "aws_sagemaker_transform_job" resource type exists in the
# AWS provider version in use; if not, the job must be started by the
# orchestrator mentioned below instead of being declared here.
resource "aws_sagemaker_transform_job" "weekly_batch" {
  # Triggered by Step Functions or EventBridge
  # Processes S3 input → S3 output without persistent endpoint
}

Strategy 4: Caching Layer

#
# ElastiCache for caching repeated prompts/responses.
# A two-node Redis 7.1 replication group on cache.r6g.large, encrypted both at
# rest and in transit.
resource "aws_elasticache_replication_group" "ai_cache" {
  replication_group_id = "ai-response-cache"
  description          = "Cache for AI model responses"

  engine         = "redis"
  engine_version = "7.1"

  node_type          = "cache.r6g.large"
  num_cache_clusters = 2

  # Cache repeated queries — same question = same answer.
  # Engine parameters come from the "ai" parameter group.
  parameter_group_name = aws_elasticache_parameter_group.ai.name

  transit_encryption_enabled = true
  at_rest_encryption_enabled = true

  tags = {
    Component = "ai-optimization"
  }
}
 
# Parameter group for the AI response cache (Redis 7 family).
resource "aws_elasticache_parameter_group" "ai" {
  family = "redis7"
  name   = "ai-cache-params"

  # Eviction policy: evict least recently used keys across all keys.
  # This sets no TTL; any expiry on cached responses would have to be set by
  # the writer of each key.
  parameter {
    name  = "maxmemory-policy"
    value = "allkeys-lru"
  }
}

Cost Monitoring Dashboard

#
# Region used by the dashboard widgets below. Metric widgets carry an explicit
# region in their properties; it is taken from the provider configuration so
# the dashboard follows wherever this stack is deployed.
# (On AWS provider v6+ the `name` attribute is deprecated in favor of `region`.)
data "aws_region" "dashboard" {}

# CloudWatch dashboard with three metric widgets: Bedrock token usage,
# utilization of the large SageMaker endpoint, and cache hit rate.
resource "aws_cloudwatch_dashboard" "ai_costs" {
  dashboard_name = "ai-cost-optimization"

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          title  = "Token Usage by Model"
          region = data.aws_region.dashboard.name
          # The ModelId dimension must equal the full model ID used at
          # invocation time, including the date/version suffix.
          # NOTE(review): confirm these match the versions actually invoked.
          metrics = [
            ["AWS/Bedrock", "InputTokenCount", "ModelId", "anthropic.claude-3-5-sonnet-20240620-v1:0"],
            ["AWS/Bedrock", "OutputTokenCount", "ModelId", "anthropic.claude-3-5-sonnet-20240620-v1:0"],
            ["AWS/Bedrock", "InputTokenCount", "ModelId", "anthropic.claude-3-haiku-20240307-v1:0"],
            ["AWS/Bedrock", "OutputTokenCount", "ModelId", "anthropic.claude-3-haiku-20240307-v1:0"]
          ]
          period = 3600
          stat   = "Sum"
        }
      },
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          # These are utilization percentages, not instance hours.
          title  = "SageMaker Instance Utilization"
          region = data.aws_region.dashboard.name
          # Endpoint instance metrics are published under
          # "/aws/sagemaker/Endpoints" and are keyed by both EndpointName and
          # VariantName; "primary" is the variant name set in the "large"
          # endpoint configuration.
          metrics = [
            ["/aws/sagemaker/Endpoints", "CPUUtilization", "EndpointName", aws_sagemaker_endpoint.large_model.name, "VariantName", "primary"],
            ["/aws/sagemaker/Endpoints", "GPUUtilization", "EndpointName", aws_sagemaker_endpoint.large_model.name, "VariantName", "primary"],
            ["/aws/sagemaker/Endpoints", "GPUMemoryUtilization", "EndpointName", aws_sagemaker_endpoint.large_model.name, "VariantName", "primary"]
          ]
          period = 300
          stat   = "Average"
        }
      },
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          title  = "Cache Hit Rate"
          region = data.aws_region.dashboard.name
          # NOTE(review): "ai-response-cache-001" presumes the first member of
          # the "ai-response-cache" replication group gets the "-001" suffix —
          # confirm against the created cluster IDs.
          metrics = [
            ["AWS/ElastiCache", "CacheHitRate", "CacheClusterId", "ai-response-cache-001"]
          ]
          period = 300
          stat   = "Average"
        }
      }
    ]
  })
}

Hands-On Courses

#

Conclusion

#

AI infrastructure optimization is about deploying the right model at the right cost: route simple queries to small models, auto-scale endpoints by demand, use Spot for batch inference, and cache repeated responses. Terraform makes this architecture reproducible and tunable — adjust instance types, scaling thresholds, and model routing as your usage patterns evolve. The goal isn't minimum cost; it's maximum value per token.

#Terraform #AI #AWS #FinOps #DevOps #CostOptimization

Share this article