Tools: Terraform at Scale: Lessons from Managing 500+ Resources (2026)
When Terraform Gets Slow
Problem 1: Monolithic State
Solution: State Decomposition
Problem 2: Environment Drift
Solution: Modules + Terragrunt
Problem 3: Dangerous Applies
Solution: CI/CD Only
Problem 4: State Locks
Solution: Remote State with DynamoDB Locking
Results
Our Terraform state file grew to 500+ resources. Plans took 8 minutes. Applies timed out. State locking conflicts were a daily occurrence. Something had to change. Here's how we tamed it. Everything was in one state file: VPCs, databases, Kubernetes clusters, DNS, and IAM, all in one giant blob. Each directory = separate state file. Use data sources to reference across boundaries. Result: 6 state files, 60-100 resources each. Plan time: 45 seconds. Dev, staging, and prod drifted constantly because each was copy-pasted. Anyone could run terraform apply against production from their laptop. Multiple engineers running plan simultaneously caused state lock conflicts. Plus: only CI/CD runs apply. Humans run plan locally with -lock=false for quick checks. If you want AI-powered infrastructure management that catches drift before it causes outages, check out what we're building at Nova AI Ops. Written by Dr. Samson Tanimawo
BSc · MSc · MBA · PhD
Founder & CEO, Nova AI Ops. https://novaaiops.com Templates let you quickly answer FAQs or store snippets for re-use. Hide child comments as well For further actions, you may consider blocking this person and/or reporting abuse
Before: 1 state file, 500+ resources
terraform plan: 8 minutes
terraform apply: timeout risk
blast radius: everything
Before: 1 state file, 500+ resources
terraform plan: 8 minutes
terraform apply: timeout risk
blast radius: everything
Before: 1 state file, 500+ resources
terraform plan: 8 minutes
terraform apply: timeout risk
blast radius: everything
infrastructure/
├── network/ # VPCs, subnets, security groups
├── data/ # RDS, ElastiCache, S3
├── compute/ # EKS, ASGs, Launch templates
├── dns/ # Route53 zones and records
├── iam/ # Roles, policies, users
└── monitoring/ # CloudWatch, SNS topics
infrastructure/
├── network/ # VPCs, subnets, security groups
├── data/ # RDS, ElastiCache, S3
├── compute/ # EKS, ASGs, Launch templates
├── dns/ # Route53 zones and records
├── iam/ # Roles, policies, users
└── monitoring/ # CloudWatch, SNS topics
infrastructure/
├── network/ # VPCs, subnets, security groups
├── data/ # RDS, ElastiCache, S3
├── compute/ # EKS, ASGs, Launch templates
├── dns/ # Route53 zones and records
├── iam/ # Roles, policies, users
└── monitoring/ # CloudWatch, SNS topics
# compute/main.tf

# Read the outputs published by the separately managed network state so this
# stack can reference them without owning the network resources.
data "terraform_remote_state" "network" {
  backend = "s3"

  config = {
    bucket = "terraform-state"
    key    = "network/terraform.tfstate"
    region = "us-east-1"
  }
}

# NOTE(review): excerpt only; the cluster's remaining arguments are not shown.
resource "aws_eks_cluster" "main" {
  vpc_config {
    # Subnet IDs come from the network state's `private_subnet_ids` output.
    subnet_ids = data.terraform_remote_state.network.outputs.private_subnet_ids
  }
}
# compute/main.tf

# Read the outputs published by the separately managed network state so this
# stack can reference them without owning the network resources.
data "terraform_remote_state" "network" {
  backend = "s3"

  config = {
    bucket = "terraform-state"
    key    = "network/terraform.tfstate"
    region = "us-east-1"
  }
}

# NOTE(review): excerpt only; the cluster's remaining arguments are not shown.
resource "aws_eks_cluster" "main" {
  vpc_config {
    # Subnet IDs come from the network state's `private_subnet_ids` output.
    subnet_ids = data.terraform_remote_state.network.outputs.private_subnet_ids
  }
}
# compute/main.tf

# Read the outputs published by the separately managed network state so this
# stack can reference them without owning the network resources.
data "terraform_remote_state" "network" {
  backend = "s3"

  config = {
    bucket = "terraform-state"
    key    = "network/terraform.tfstate"
    region = "us-east-1"
  }
}

# NOTE(review): excerpt only; the cluster's remaining arguments are not shown.
resource "aws_eks_cluster" "main" {
  vpc_config {
    # Subnet IDs come from the network state's `private_subnet_ids` output.
    subnet_ids = data.terraform_remote_state.network.outputs.private_subnet_ids
  }
}
modules/
├── eks-cluster/
│   ├── main.tf
│   ├── variables.tf
│   └── outputs.tf
└── rds-instance/
    ├── main.tf
    ├── variables.tf
    └── outputs.tf

environments/
├── dev/
│   └── terragrunt.hcl
├── staging/
│   └── terragrunt.hcl
└── prod/
    └── terragrunt.hcl
modules/
├── eks-cluster/
│   ├── main.tf
│   ├── variables.tf
│   └── outputs.tf
└── rds-instance/
    ├── main.tf
    ├── variables.tf
    └── outputs.tf

environments/
├── dev/
│   └── terragrunt.hcl
├── staging/
│   └── terragrunt.hcl
└── prod/
    └── terragrunt.hcl
modules/
├── eks-cluster/
│   ├── main.tf
│   ├── variables.tf
│   └── outputs.tf
└── rds-instance/
    ├── main.tf
    ├── variables.tf
    └── outputs.tf

environments/
├── dev/
│   └── terragrunt.hcl
├── staging/
│   └── terragrunt.hcl
└── prod/
    └── terragrunt.hcl
# environments/prod/terragrunt.hcl

# Point this environment at the shared EKS module; only the inputs below
# differ per environment.
terraform {
  source = "../../modules/eks-cluster"
}

# Production values passed to the module as input variables.
inputs = {
  cluster_name  = "prod-main"
  node_count    = 10
  instance_type = "m5.2xlarge"
  multi_az      = true
}
# environments/prod/terragrunt.hcl

# Point this environment at the shared EKS module; only the inputs below
# differ per environment.
terraform {
  source = "../../modules/eks-cluster"
}

# Production values passed to the module as input variables.
inputs = {
  cluster_name  = "prod-main"
  node_count    = 10
  instance_type = "m5.2xlarge"
  multi_az      = true
}
# environments/prod/terragrunt.hcl

# Point this environment at the shared EKS module; only the inputs below
# differ per environment.
terraform {
  source = "../../modules/eks-cluster"
}

# Production values passed to the module as input variables.
inputs = {
  cluster_name  = "prod-main"
  node_count    = 10
  instance_type = "m5.2xlarge"
  multi_az      = true
}
# .github/workflows/terraform.yml
# Plans every change under infrastructure/ and, on main, applies the exact
# plan file produced by the `plan` job behind the `production` environment.
name: Terraform
on:
  pull_request:
    paths: ['infrastructure/**']
  push:
    branches: [main]
    paths: ['infrastructure/**']

jobs:
  plan:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Lets the default token comment on the pull request.
      pull-requests: write
    steps:
      # Runners start with an empty workspace, so fetch the configuration first.
      - uses: actions/checkout@v4
      - uses: hashicorp/setup-terraform@v3
      - run: terraform init
      - run: terraform plan -out=plan.tfplan
      - run: terraform show -json plan.tfplan > plan.json
      # Each job gets a fresh runner; publish the plan so `apply` can use it.
      - uses: actions/upload-artifact@v4
        with:
          name: tfplan
          path: plan.tfplan
      # Post plan as PR comment
      - uses: actions/github-script@v7
        # Push events have no pull request to comment on.
        if: github.event_name == 'pull_request'
        with:
          script: |
            const plan = require('./plan.json');
            // resource_changes can be missing from the JSON when nothing is planned.
            const resourceChanges = plan.resource_changes ?? [];
            const countAction = (action) =>
              resourceChanges.filter(c => c.change.actions.includes(action)).length;
            const adds = countAction('create');
            const changes = countAction('update');
            const deletes = countAction('delete');
            // owner and repo are required by the REST endpoint.
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: `## Terraform Plan\n+${adds} ~${changes} -${deletes}\n\n${deletes > 0 ? '⚠ RESOURCES WILL BE DESTROYED' : ''}`
            });

  apply:
    needs: plan
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    environment: production  # Requires approval
    steps:
      - uses: actions/checkout@v4
      - uses: hashicorp/setup-terraform@v3
      - run: terraform init
      # Retrieve the plan file reviewed in the `plan` job.
      - uses: actions/download-artifact@v4
        with:
          name: tfplan
      - run: terraform apply plan.tfplan
# .github/workflows/terraform.yml
# Plans every change under infrastructure/ and, on main, applies the exact
# plan file produced by the `plan` job behind the `production` environment.
name: Terraform
on:
  pull_request:
    paths: ['infrastructure/**']
  push:
    branches: [main]
    paths: ['infrastructure/**']

jobs:
  plan:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Lets the default token comment on the pull request.
      pull-requests: write
    steps:
      # Runners start with an empty workspace, so fetch the configuration first.
      - uses: actions/checkout@v4
      - uses: hashicorp/setup-terraform@v3
      - run: terraform init
      - run: terraform plan -out=plan.tfplan
      - run: terraform show -json plan.tfplan > plan.json
      # Each job gets a fresh runner; publish the plan so `apply` can use it.
      - uses: actions/upload-artifact@v4
        with:
          name: tfplan
          path: plan.tfplan
      # Post plan as PR comment
      - uses: actions/github-script@v7
        # Push events have no pull request to comment on.
        if: github.event_name == 'pull_request'
        with:
          script: |
            const plan = require('./plan.json');
            // resource_changes can be missing from the JSON when nothing is planned.
            const resourceChanges = plan.resource_changes ?? [];
            const countAction = (action) =>
              resourceChanges.filter(c => c.change.actions.includes(action)).length;
            const adds = countAction('create');
            const changes = countAction('update');
            const deletes = countAction('delete');
            // owner and repo are required by the REST endpoint.
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: `## Terraform Plan\n+${adds} ~${changes} -${deletes}\n\n${deletes > 0 ? '⚠ RESOURCES WILL BE DESTROYED' : ''}`
            });

  apply:
    needs: plan
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    environment: production  # Requires approval
    steps:
      - uses: actions/checkout@v4
      - uses: hashicorp/setup-terraform@v3
      - run: terraform init
      # Retrieve the plan file reviewed in the `plan` job.
      - uses: actions/download-artifact@v4
        with:
          name: tfplan
      - run: terraform apply plan.tfplan
# .github/workflows/terraform.yml
# Plans every change under infrastructure/ and, on main, applies the exact
# plan file produced by the `plan` job behind the `production` environment.
name: Terraform
on:
  pull_request:
    paths: ['infrastructure/**']
  push:
    branches: [main]
    paths: ['infrastructure/**']

jobs:
  plan:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Lets the default token comment on the pull request.
      pull-requests: write
    steps:
      # Runners start with an empty workspace, so fetch the configuration first.
      - uses: actions/checkout@v4
      - uses: hashicorp/setup-terraform@v3
      - run: terraform init
      - run: terraform plan -out=plan.tfplan
      - run: terraform show -json plan.tfplan > plan.json
      # Each job gets a fresh runner; publish the plan so `apply` can use it.
      - uses: actions/upload-artifact@v4
        with:
          name: tfplan
          path: plan.tfplan
      # Post plan as PR comment
      - uses: actions/github-script@v7
        # Push events have no pull request to comment on.
        if: github.event_name == 'pull_request'
        with:
          script: |
            const plan = require('./plan.json');
            // resource_changes can be missing from the JSON when nothing is planned.
            const resourceChanges = plan.resource_changes ?? [];
            const countAction = (action) =>
              resourceChanges.filter(c => c.change.actions.includes(action)).length;
            const adds = countAction('create');
            const changes = countAction('update');
            const deletes = countAction('delete');
            // owner and repo are required by the REST endpoint.
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: `## Terraform Plan\n+${adds} ~${changes} -${deletes}\n\n${deletes > 0 ? '⚠ RESOURCES WILL BE DESTROYED' : ''}`
            });

  apply:
    needs: plan
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    environment: production  # Requires approval
    steps:
      - uses: actions/checkout@v4
      - uses: hashicorp/setup-terraform@v3
      - run: terraform init
      # Retrieve the plan file reviewed in the `plan` job.
      - uses: actions/download-artifact@v4
        with:
          name: tfplan
      - run: terraform apply plan.tfplan
# Remote state for the network stack: stored in S3, with a DynamoDB table
# named for locking so concurrent runs cannot write the same state.
# NOTE(review): newer Terraform releases also offer S3-native locking
# (`use_lockfile`); confirm which Terraform version this targets.
terraform {
  backend "s3" {
    region         = "us-east-1"
    bucket         = "terraform-state"
    key            = "network/terraform.tfstate"
    encrypt        = true
    dynamodb_table = "terraform-locks"
  }
}
# Remote state for the network stack: stored in S3, with a DynamoDB table
# named for locking so concurrent runs cannot write the same state.
# NOTE(review): newer Terraform releases also offer S3-native locking
# (`use_lockfile`); confirm which Terraform version this targets.
terraform {
  backend "s3" {
    region         = "us-east-1"
    bucket         = "terraform-state"
    key            = "network/terraform.tfstate"
    encrypt        = true
    dynamodb_table = "terraform-locks"
  }
}
# Remote state for the network stack: stored in S3, with a DynamoDB table
# named for locking so concurrent runs cannot write the same state.
# NOTE(review): newer Terraform releases also offer S3-native locking
# (`use_lockfile`); confirm which Terraform version this targets.
terraform {
  backend "s3" {
    region         = "us-east-1"
    bucket         = "terraform-state"
    key            = "network/terraform.tfstate"
    encrypt        = true
    dynamodb_table = "terraform-locks"
  }
}