Terraform for AI-Native Development Platforms on AWS
Provision AI-native developer platforms with Terraform: sandboxes, CI/CD runners, model-serving environments, secrets, VPCs, and preview environments.
DevOps
Deploy agentic AI and multi-agent systems with Terraform on AWS. Provision SQS queues, Lambda functions, and Step Functions orchestration.
Agentic AI is the biggest infrastructure trend of 2026. AI is moving from chat interfaces to autonomous agents that execute multi-step tasks across workflows — and those agents need infrastructure. Gartner lists multiagent systems in its 2026 top 10 strategic trends.
This guide shows how to provision the infrastructure for agentic AI systems using Terraform on AWS.
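Unlike a simple LLM API call, agentic systems need several cooperating components: an orchestrator to sequence multi-step work, message queues for asynchronous agent communication, a vector store for memory, a durable store for task state and history, and guardrails on cost. The overall architecture looks like this: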
    User Request
          │
          ▼
 ┌─────────────────┐
 │   API Gateway   │
 └────────┬────────┘
          ▼
 ┌─────────────────┐     ┌──────────────┐
 │  Orchestrator   │────▶│ Agent Queue  │
 │ (Step Functions)│     │    (SQS)     │
 └────────┬────────┘     └──────┬───────┘
          │                     │
     ┌────┴────┐          ┌─────┴─────┐
     ▼         ▼          ▼           ▼
┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐
│Research│ │Planning│ │Executor│ │Reviewer│
│ Agent  │ │ Agent  │ │ Agent  │ │ Agent  │
│(Lambda)│ │(Lambda)│ │(Lambda)│ │(Lambda)│
└────┬───┘ └────┬───┘ └────┬───┘ └────┬───┘
     │          │          │          │
     └──────────┴──────────┴──────────┘
                     │
               ┌─────┴─────┐
               │  Bedrock  │ ← LLM API
               │ OpenSearch│ ← Vector memory
               │ DynamoDB  │ ← State/history
               └───────────┘

Each agent gets its own input queue for asynchronous communication:
# Names of the agents to provision. Each name is used as a for_each key, so it
# yields one input queue, one dead-letter queue, one Lambda function, and one
# SQS trigger. The orchestrator state machine looks up the "research",
# "planning", "executor", and "reviewer" entries by name, so those four must
# remain in the list.
variable "agents" {
  description = "Agent names; each one gets its own SQS queue, DLQ, Lambda function, and SQS trigger."
  type        = list(string)
  default     = ["research", "planning", "executor", "reviewer"]
}
# Per-agent input queue: one SQS queue for every entry in var.agents.
resource "aws_sqs_queue" "agent_queue" {
  for_each = toset(var.agents)

  name = "agent-${each.key}-queue"

  tags = {
    Agent     = each.key
    Component = "agentic-ai"
  }

  # Consumers long-poll for up to 20 seconds per receive call.
  receive_wait_time_seconds = 20

  # Unconsumed tasks are dropped after 24 hours.
  message_retention_seconds = 86400

  # A received message stays hidden for 5 minutes while the agent works on it.
  # NOTE(review): this equals the agent Lambda timeout (300 s). AWS guidance
  # for SQS event sources is a visibility timeout of at least 6x the function
  # timeout; raising it also delays redelivery past the Step Functions task
  # timeouts, so confirm the intended retry behaviour before changing it.
  visibility_timeout_seconds = 300

  # After three receives, a message moves to this agent's dead-letter queue.
  redrive_policy = jsonencode({
    maxReceiveCount     = 3
    deadLetterTargetArn = aws_sqs_queue.agent_dlq[each.key].arn
  })
}
# Dead-letter queue for each agent; the agent input queue's redrive policy
# sends messages here after repeated failed receives.
resource "aws_sqs_queue" "agent_dlq" {
  for_each = toset(var.agents)
  name     = "agent-${each.key}-dlq"

  # Tagged like the agent input queues so tag-based cost filters
  # (Component = agentic-ai) also see the dead-letter queues.
  tags = {
    Component = "agentic-ai"
    Agent     = each.key
  }
}

# One Lambda function per agent, packaged from lambda/<agent>/deployment.zip.
resource "aws_lambda_function" "agent" {
  for_each = toset(var.agents)

  function_name = "agent-${each.key}"
  runtime       = "python3.12"
  handler       = "handler.main"
  timeout       = 300 # 5 min max for complex reasoning
  memory_size   = 512

  # source_code_hash changes whenever the archive changes, which forces a
  # redeploy of the function code.
  filename         = "lambda/${each.key}/deployment.zip"
  source_code_hash = filebase64sha256("lambda/${each.key}/deployment.zip")

  role = aws_iam_role.agent_role[each.key].arn

  environment {
    variables = {
      AGENT_NAME      = each.key
      BEDROCK_MODEL   = var.bedrock_model_id
      OPENSEARCH_HOST = aws_opensearch_domain.memory.endpoint
      STATE_TABLE     = aws_dynamodb_table.agent_state.name
      OUTPUT_QUEUE    = aws_sqs_queue.orchestrator_results.url
    }
  }

  # Same tags as the agent input queues; without them the functions' cost is
  # excluded from the budget that filters on Component = agentic-ai.
  tags = {
    Component = "agentic-ai"
    Agent     = each.key
  }
}
# SQS triggers each agent
resource "aws_lambda_event_source_mapping" "agent_trigger" {
  for_each = toset(var.agents)

  event_source_arn = aws_sqs_queue.agent_queue[each.key].arn
  function_name    = aws_lambda_function.agent[each.key].arn

  # Deliver one message per invocation. This does not cap concurrency: Lambda
  # can still run several invocations of the same agent in parallel. Limit
  # concurrency separately if agents must process strictly one task at a time.
  batch_size = 1
}

# Orchestrates one task through Research -> Planning -> Execute -> Review.
# Each step sends a message to the agent's queue and pauses until the agent
# reports back with the task token (the .waitForTaskToken integration).
# NOTE(review): no ResultPath is set, so each state's input is whatever the
# previous agent returned with the task token — presumably agents pass `task`
# and their own output fields forward; verify against the agent handlers.
resource "aws_sfn_state_machine" "agent_orchestrator" {
  name     = "agent-orchestrator"
  role_arn = aws_iam_role.sfn_role.arn

  definition = jsonencode({
    Comment = "Multi-agent task orchestration"
    StartAt = "Research"
    States = {
      Research = {
        Type     = "Task"
        Resource = "arn:aws:states:::sqs:sendMessage.waitForTaskToken"
        Parameters = {
          QueueUrl = aws_sqs_queue.agent_queue["research"].url
          MessageBody = {
            "task.$"      = "$.task"
            "taskToken.$" = "$$.Task.Token"
          }
        }
        TimeoutSeconds = 600
        Next           = "Planning"
        Catch = [{
          ErrorEquals = ["States.TaskFailed", "States.Timeout"]
          Next        = "HandleError"
        }]
      }

      Planning = {
        Type     = "Task"
        Resource = "arn:aws:states:::sqs:sendMessage.waitForTaskToken"
        Parameters = {
          QueueUrl = aws_sqs_queue.agent_queue["planning"].url
          MessageBody = {
            "task.$"            = "$.task"
            "research_output.$" = "$.research_output"
            "taskToken.$"       = "$$.Task.Token"
          }
        }
        TimeoutSeconds = 600
        Next           = "Execute"
        # States.TaskFailed does not match States.Timeout, so the timeout must
        # be listed explicitly or an agent that never answers fails the whole
        # execution without reaching HandleError.
        Catch = [{
          ErrorEquals = ["States.TaskFailed", "States.Timeout"]
          Next        = "HandleError"
        }]
      }

      Execute = {
        Type     = "Task"
        Resource = "arn:aws:states:::sqs:sendMessage.waitForTaskToken"
        Parameters = {
          QueueUrl = aws_sqs_queue.agent_queue["executor"].url
          MessageBody = {
            "task.$"      = "$.task"
            "plan.$"      = "$.plan"
            "taskToken.$" = "$$.Task.Token"
          }
        }
        TimeoutSeconds = 900
        Next           = "Review"
        Catch = [{
          ErrorEquals = ["States.TaskFailed", "States.Timeout"]
          Next        = "HandleError"
        }]
      }

      Review = {
        Type     = "Task"
        Resource = "arn:aws:states:::sqs:sendMessage.waitForTaskToken"
        Parameters = {
          QueueUrl = aws_sqs_queue.agent_queue["reviewer"].url
          MessageBody = {
            "task.$"      = "$.task"
            "result.$"    = "$.result"
            "taskToken.$" = "$$.Task.Token"
          }
        }
        TimeoutSeconds = 600
        Next           = "CheckApproval"
        Catch = [{
          ErrorEquals = ["States.TaskFailed", "States.Timeout"]
          Next        = "HandleError"
        }]
      }

      # Approved results finish the execution; anything else goes back to
      # Planning.
      # NOTE(review): the reject loop has no iteration limit, so a reviewer
      # that never approves keeps the execution (and LLM spend) running —
      # consider adding a retry counter.
      CheckApproval = {
        Type = "Choice"
        Choices = [{
          Variable      = "$.approved"
          BooleanEquals = true
          Next          = "Success"
        }]
        Default = "Planning" # Loop back if rejected
      }

      Success = {
        Type = "Succeed"
      }

      # NOTE(review): errors are routed to the reviewer agent's Lambda, which
      # is otherwise invoked by its SQS trigger — confirm its handler accepts
      # a direct Step Functions invocation.
      HandleError = {
        Type     = "Task"
        Resource = aws_lambda_function.agent["reviewer"].arn
        End      = true
      }
    }
  })
}

# OpenSearch domain used as the agents' vector memory; its endpoint is passed
# to every agent Lambda as OPENSEARCH_HOST.
resource "aws_opensearch_domain" "memory" {
  domain_name    = "agent-memory"
  engine_version = "OpenSearch_2.13"

  cluster_config {
    instance_type  = "r6g.large.search"
    instance_count = 2
  }

  # gp3 EBS storage, 100 GiB.
  ebs_options {
    ebs_enabled = true
    volume_size = 100
    volume_type = "gp3"
  }

  encrypt_at_rest {
    enabled = true
  }

  node_to_node_encryption {
    enabled = true
  }

  tags = {
    Component = "agent-memory"
  }
}

# On-demand DynamoDB table for agent state, keyed by task_id (partition key)
# and agent_name (sort key). Items expire through TTL on `expires_at`.
resource "aws_dynamodb_table" "agent_state" {
  name         = "agent-state"
  billing_mode = "PAY_PER_REQUEST"
  hash_key     = "task_id"
  range_key    = "agent_name"

  attribute {
    name = "task_id"
    type = "S"
  }

  attribute {
    name = "agent_name"
    type = "S"
  }

  ttl {
    attribute_name = "expires_at"
    enabled        = true
  }

  tags = {
    Component = "agent-state"
  }
}

# Budget alarm for AI spend
# Emails var.alert_email when actual monthly cost for resources tagged
# Component = agentic-ai passes 80% of the 500 USD limit.
# NOTE(review): the filter matches only the Component = agentic-ai tag value.
# The OpenSearch domain and DynamoDB table use other Component values, and
# Bedrock model usage is not covered by these resource tags — confirm the
# budget scope. The Component tag must also be activated as a cost allocation
# tag in Billing before the filter returns data.
resource "aws_budgets_budget" "ai_spend" {
  name         = "agentic-ai-monthly"
  budget_type  = "COST"
  limit_amount = "500"
  limit_unit   = "USD"
  time_unit    = "MONTHLY"

  cost_filter {
    name   = "TagKeyValue"
    values = ["user:Component$agentic-ai"]
  }

  notification {
    comparison_operator        = "GREATER_THAN"
    threshold                  = 80
    threshold_type             = "PERCENTAGE"
    notification_type          = "ACTUAL"
    subscriber_email_addresses = [var.alert_email]
  }
}
# CloudWatch alarm for runaway agents
# Fires when the agent Lambda functions together exceed 1000 invocations in a
# 5-minute period. Each agent contributes one metric query scoped to its own
# function; without a FunctionName dimension the AWS/Lambda Invocations metric
# covers every function in the account and region, not just the agents.
resource "aws_cloudwatch_metric_alarm" "agent_invocations" {
  alarm_name          = "agent-high-invocations"
  alarm_description   = "Agent invocations exceeded safety threshold"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  threshold           = 1000
  alarm_actions       = [aws_sns_topic.alerts.arn]

  # One input series per agent function (not returned on its own).
  dynamic "metric_query" {
    for_each = toset(var.agents)
    content {
      # Metric query ids may only contain letters, digits, and underscores.
      id = "m_${replace(metric_query.key, "-", "_")}"

      metric {
        namespace   = "AWS/Lambda"
        metric_name = "Invocations"
        period      = 300
        stat        = "Sum"
        dimensions = {
          FunctionName = aws_lambda_function.agent[metric_query.key].function_name
        }
      }
    }
  }

  # The alarm evaluates the sum over all agent functions.
  metric_query {
    id          = "total"
    expression  = "SUM(METRICS())"
    label       = "Total agent invocations"
    return_data = true
  }
}

Agentic AI systems need purpose-built infrastructure: message queues for agent communication, Step Functions for orchestration, vector databases for memory, and budget guardrails for cost control. Terraform makes this repeatable — deploy the same multi-agent architecture across dev, staging, and production with consistent configuration. As agents move from experiments to production workloads in 2026, infrastructure-as-code becomes essential for managing their complexity.
Provision AI-native developer platforms with Terraform: sandboxes, CI/CD runners, model-serving environments, secrets, VPCs, and preview environments.
Optimize AI infrastructure costs with Terraform. Deploy right-sized inference endpoints, auto-scale based on token throughput, use Spot instances
Secure AI workloads with Terraform. Deploy Bedrock guardrails, model access IAM policies, prompt injection detection
Provision AI supercomputing infrastructure with Terraform. Deploy GPU clusters with p5.48xlarge, EFA networking, FSx Lustre storage