Skip to main content

Terraform for AI Infrastructure Optimization: Cost-Efficient Model Deployment on AWS

Key Takeaway

Optimize AI infrastructure costs with Terraform. Deploy right-sized inference endpoints, auto-scale based on token throughput, use Spot instances, and implement model routing for cost efficiency.

Table of Contents

AI infrastructure optimization has become the industry's 2026 reckoning. Deloitte calls it an “AI infrastructure reckoning” — organizations are moving past the “just buy GPUs” phase and into balancing model choice, inference cost, deployment architecture, and token economics. NVIDIA emphasizes cost-efficient token production as the key metric.

This guide shows how to use Terraform to deploy cost-optimized AI inference infrastructure on AWS.

The Cost Problem

| Model | Input $/1M tokens | Output $/1M tokens | 1M requests/month* |
|---|---|---|---|
| Claude 3.5 Sonnet (Bedrock) | $3.00 | $15.00 | ~$18,000 |
| Claude 3 Haiku (Bedrock) | $0.25 | $1.25 | ~$1,500 |
| Llama 3 70B (SageMaker) | Self-hosted | Self-hosted | ~$3,000–5,000 |
| Llama 3 8B (SageMaker) | Self-hosted | Self-hosted | ~$800–1,500 |

*Estimated 1,000 input + 1,000 output tokens per request (the per-token prices above times this volume produce the monthly figures shown).

Strategy 1: Model Routing — Right Model for the Task

# Premium tier: the router sends complex queries here. Tagged so cost
# reports can break spend down by model tier.
resource "aws_sagemaker_endpoint" "large_model" {
  name                 = "llama-70b-complex"
  endpoint_config_name = aws_sagemaker_endpoint_configuration.large.name

  tags = {
    ModelTier      = "premium"
    CostPerRequest = "high"
  }
}

# Endpoint config for the 70B model — a single 4-GPU ml.g5.12xlarge to
# start; the auto-scaling target below adjusts the count with demand.
resource "aws_sagemaker_endpoint_configuration" "large" {
  name = "llama-70b-config"

  production_variants {
    variant_name           = "primary"
    initial_instance_count = 1
    instance_type          = "ml.g5.12xlarge"
    model_name             = aws_sagemaker_model.llama_70b.name
  }
}

# Economy tier: the router sends simple queries here. Tagged so cost
# reports can break spend down by model tier.
resource "aws_sagemaker_endpoint" "small_model" {
  name                 = "llama-8b-simple"
  endpoint_config_name = aws_sagemaker_endpoint_configuration.small.name

  tags = {
    ModelTier      = "economy"
    CostPerRequest = "low"
  }
}

# Endpoint config for the 8B model — a single-GPU ml.g5.xlarge is enough
# for the lightweight tier.
resource "aws_sagemaker_endpoint_configuration" "small" {
  name = "llama-8b-config"

  production_variants {
    variant_name           = "primary"
    initial_instance_count = 1
    instance_type          = "ml.g5.xlarge"
    model_name             = aws_sagemaker_model.llama_8b.name
  }
}

# Router Lambda — classifies each incoming request and forwards simple
# queries to the small endpoint, complex ones to the large endpoint
# (classification logic lives in lambda/router/router.py).
resource "aws_lambda_function" "model_router" {
  function_name = "ai-model-router"
  runtime       = "python3.12"
  handler       = "router.handler"
  timeout       = 30  # seconds — downstream inference calls can be slow
  memory_size   = 256

  filename = "lambda/router/deployment.zip"
  # Hash of the zip so Terraform redeploys when the code changes.
  source_code_hash = filebase64sha256("lambda/router/deployment.zip")
  role             = aws_iam_role.router.arn

  environment {
    variables = {
      LARGE_ENDPOINT = aws_sagemaker_endpoint.large_model.name
      SMALL_ENDPOINT = aws_sagemaker_endpoint.small_model.name
      # Fallback for the hardest tasks. Bedrock's InvokeModel requires the
      # full versioned model ID — the bare "anthropic.claude-3-5-sonnet"
      # is not a valid modelId and would fail at runtime.
      BEDROCK_MODEL = "anthropic.claude-3-5-sonnet-20240620-v1:0"
    }
  }
}

Strategy 2: Auto-Scaling by Demand

# Register the premium endpoint's primary variant with Application Auto
# Scaling so the policies below can resize it between 1 and 4 instances.
resource "aws_appautoscaling_target" "sagemaker" {
  service_namespace  = "sagemaker"
  scalable_dimension = "sagemaker:variant:DesiredInstanceCount"
  resource_id        = "endpoint/${aws_sagemaker_endpoint.large_model.name}/variant/primary"
  min_capacity       = 1
  max_capacity       = 4
}

# Target-tracking policy: keep each instance near 100 invocations per
# minute. Scale out quickly (2 min cooldown) but scale in conservatively
# (10 min) to avoid flapping on bursty traffic.
resource "aws_appautoscaling_policy" "scale_on_invocations" {
  name               = "scale-on-invocations"
  policy_type        = "TargetTrackingScaling"
  service_namespace  = aws_appautoscaling_target.sagemaker.service_namespace
  resource_id        = aws_appautoscaling_target.sagemaker.resource_id
  scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension

  target_tracking_scaling_policy_configuration {
    target_value       = 100
    scale_out_cooldown = 120
    scale_in_cooldown  = 600

    predefined_metric_specification {
      predefined_metric_type = "SageMakerVariantInvocationsPerInstance"
    }
  }
}

# Scheduled scale-down for off-hours (10 PM UTC nightly).
# NOTE(review): min_capacity = 0 — classic real-time SageMaker endpoint
# variants cannot scale to zero instances (the platform minimum is 1);
# scale-to-zero is only available for inference-component-based endpoints.
# Verify which endpoint type is in use before relying on a $0 overnight
# bill from this action.
resource "aws_appautoscaling_scheduled_action" "night_scale_down" {
  name               = "night-scale-down"
  service_namespace  = aws_appautoscaling_target.sagemaker.service_namespace
  resource_id        = aws_appautoscaling_target.sagemaker.resource_id
  scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension
  schedule           = "cron(0 22 * * ? *)"  # 10 PM UTC


  scalable_target_action {
    min_capacity = 0
    max_capacity = 1
  }
}

# Scheduled scale-up before business hours (7 AM UTC daily): restore the
# normal 1–4 instance band ahead of daytime traffic.
resource "aws_appautoscaling_scheduled_action" "morning_scale_up" {
  name               = "morning-scale-up"
  schedule           = "cron(0 7 * * ? *)"  # 7 AM UTC
  service_namespace  = aws_appautoscaling_target.sagemaker.service_namespace
  resource_id        = aws_appautoscaling_target.sagemaker.resource_id
  scalable_dimension = aws_appautoscaling_target.sagemaker.scalable_dimension

  scalable_target_action {
    min_capacity = 1
    max_capacity = 4
  }
}

Strategy 3: Spot Instances for Batch Inference

# NOTE(review): despite the name, this is a standard on-demand real-time
# endpoint configuration — SageMaker real-time endpoints do not run on
# Spot capacity (managed Spot applies to training jobs). Any savings here
# come from managed_instance_scaling letting the fleet shrink toward zero
# between batches, not from Spot pricing. Rename or verify before quoting
# a "60-70% cheaper" figure.
resource "aws_sagemaker_endpoint_configuration" "batch_spot" {
  name = "batch-inference-spot"

  production_variants {
    variant_name           = "spot"
    model_name             = aws_sagemaker_model.llama_70b.name
    instance_type          = "ml.g5.12xlarge"
    initial_instance_count = 1

    # Let SageMaker itself grow/shrink the fleet within these bounds.
    managed_instance_scaling {
      status           = "ENABLED"
      min_instance_count = 0
      max_instance_count = 8
    }

    # Send each request to the instance with the fewest in-flight requests.
    routing_config {
      routing_strategy = "LEAST_OUTSTANDING_REQUESTS"
    }
  }
}

# Large-scale offline inference: SageMaker Batch Transform reads input
# from S3 and writes results to S3 with no persistent endpoint to pay for.
#
# NOTE: the Terraform AWS provider has no `aws_sagemaker_transform_job`
# resource — transform jobs are short-lived API operations, not managed
# infrastructure, so they cannot be declared directly in Terraform.
# Instead, manage the *launcher* in Terraform: an EventBridge schedule
# that triggers a Step Functions state machine using the
# `arn:aws:states:::sagemaker:createTransformJob.sync` integration
# (resources: aws_scheduler_schedule / aws_sfn_state_machine).

Strategy 4: Caching Layer

# ElastiCache (Redis) layer caching repeated prompt/response pairs —
# an identical prompt skips the model call entirely.
resource "aws_elasticache_replication_group" "ai_cache" {
  replication_group_id = "ai-response-cache"
  description          = "Cache for AI model responses"
  node_type            = "cache.r6g.large"
  num_cache_clusters   = 2

  engine         = "redis"
  engine_version = "7.1"

  # With a replica already provisioned, enable automatic failover so the
  # replica is actually promoted if the primary fails (otherwise the
  # second node adds cost without adding availability).
  automatic_failover_enabled = true

  parameter_group_name = aws_elasticache_parameter_group.ai.name

  # Cached responses may contain user data — encrypt at rest and in transit.
  at_rest_encryption_enabled = true
  transit_encryption_enabled = true

  tags = { Component = "ai-optimization" }
}

# Parameter group for the AI response cache.
resource "aws_elasticache_parameter_group" "ai" {
  name   = "ai-cache-params"
  family = "redis7"

  # Eviction policy — NOT a TTL: when memory fills, Redis evicts the
  # least-recently-used keys across the whole keyspace. Per-entry TTLs
  # must be set by the application (e.g. SET ... EX) when it writes a
  # cached response.
  parameter {
    name  = "maxmemory-policy"
    value = "allkeys-lru"  # Evict least recently used
  }
}

Cost Monitoring Dashboard

# CloudWatch dashboard for the three cost drivers: Bedrock token volume,
# SageMaker GPU utilization, and cache effectiveness.
resource "aws_cloudwatch_dashboard" "ai_costs" {
  dashboard_name = "ai-cost-optimization"

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          title = "Token Usage by Model"
          # Bedrock emits metrics under the full versioned model ID;
          # the bare "anthropic.claude-3-5-sonnet" dimension would never
          # match any datapoints.
          metrics = [
            ["AWS/Bedrock", "InputTokenCount", "ModelId", "anthropic.claude-3-5-sonnet-20240620-v1:0"],
            ["AWS/Bedrock", "OutputTokenCount", "ModelId", "anthropic.claude-3-5-sonnet-20240620-v1:0"],
            ["AWS/Bedrock", "InputTokenCount", "ModelId", "anthropic.claude-3-haiku-20240307-v1:0"],
            ["AWS/Bedrock", "OutputTokenCount", "ModelId", "anthropic.claude-3-haiku-20240307-v1:0"]
          ]
          period = 3600
          stat   = "Sum"
        }
      },
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          # These metrics are utilization percentages, not instance-hours,
          # so the title now says what the widget actually shows.
          title   = "SageMaker Endpoint Utilization"
          metrics = [
            ["AWS/SageMaker", "CPUUtilization", "EndpointName", "llama-70b-complex"],
            ["AWS/SageMaker", "GPUUtilization", "EndpointName", "llama-70b-complex"],
            ["AWS/SageMaker", "GPUMemoryUtilization", "EndpointName", "llama-70b-complex"]
          ]
          period = 300
          stat   = "Average"
        }
      },
      {
        type   = "metric"
        width  = 12
        height = 6
        properties = {
          title = "Cache Hit Rate"
          # NOTE(review): member node IDs for a replication group are
          # typically "<replication-group-id>-001" — confirm against the
          # deployed cluster before trusting this widget.
          metrics = [
            ["AWS/ElastiCache", "CacheHitRate", "CacheClusterId", "ai-response-cache-001"]
          ]
          period = 300
        }
      }
    ]
  })
}

Hands-On Courses

Conclusion

AI infrastructure optimization is about deploying the right model at the right cost: route simple queries to small models, auto-scale endpoints by demand, use Spot for batch inference, and cache repeated responses. Terraform makes this architecture reproducible and tunable — adjust instance types, scaling thresholds, and model routing as your usage patterns evolve. The goal isn’t minimum cost; it’s maximum value per token.

🚀

Level Up Your Terraform Skills

Hands-on courses, books, and resources from Luca Berton

Luca Berton
Written by

Luca Berton

DevOps Engineer, AWS Partner, Terraform expert, and author. Creator of Ansible Pilot, Terraform Pilot, and CopyPasteLearn.