Tools: Terraform at Scale: Lessons from Managing 500+ Resources (2026)

Tools: Terraform at Scale: Lessons from Managing 500+ Resources (2026)

When Terraform Gets Slow

Problem 1: Monolithic State

Solution: State Decomposition

Problem 2: Environment Drift

Solution: Modules + Terragrunt

Problem 3: Dangerous Applies

Solution: CI/CD Only

Problem 4: State Locks

Solution: Remote State with DynamoDB Locking

Results

Our Terraform state file grew to 500+ resources. Plans took 8 minutes. Applies timed out. State-lock conflicts happened daily. Something had to change. Here's how we tamed it. Everything was in one state file: VPCs, databases, Kubernetes clusters, DNS, and IAM, all in one giant blob. With state decomposition, each directory gets its own state file. Use `terraform_remote_state` data sources to reference outputs across state boundaries. Result: 6 state files, 60-100 resources each. Plan time: 45 seconds. Dev, staging, and prod drifted constantly because each environment was copy-pasted from the others. Anyone could run `terraform apply` against production from their laptop. Multiple engineers running `terraform plan` simultaneously caused state-lock conflicts. In addition to remote state with DynamoDB locking, only CI/CD runs `apply`; humans run `plan` locally with `-lock=false` for quick checks. If you want AI-powered infrastructure management that catches drift before it causes outages, check out what we're building at Nova AI Ops. Written by Dr. Samson Tanimawo

BSc · MSc · MBA · PhD

Founder & CEO, Nova AI Ops. https://novaaiops.com Templates let you quickly answer FAQs or store snippets for re-use. Hide child comments as well. For further actions, you may consider blocking this person and/or reporting abuse.

Code Block

Copy

Before: 1 state file, 500+ resources terraform plan: 8 minutes terraform apply: timeout risk blast radius: everything Before: 1 state file, 500+ resources terraform plan: 8 minutes terraform apply: timeout risk blast radius: everything Before: 1 state file, 500+ resources terraform plan: 8 minutes terraform apply: timeout risk blast radius: everything infrastructure/ ├── network/ # VPCs, subnets, security groups ├── data/ # RDS, ElastiCache, S3 ├── compute/ # EKS, ASGs, Launch templates ├── dns/ # Route53 zones and records ├── iam/ # Roles, policies, users └── monitoring/ # CloudWatch, SNS topics infrastructure/ ├── network/ # VPCs, subnets, security groups ├── data/ # RDS, ElastiCache, S3 ├── compute/ # EKS, ASGs, Launch templates ├── dns/ # Route53 zones and records ├── iam/ # Roles, policies, users └── monitoring/ # CloudWatch, SNS topics infrastructure/ ├── network/ # VPCs, subnets, security groups ├── data/ # RDS, ElastiCache, S3 ├── compute/ # EKS, ASGs, Launch templates ├── dns/ # Route53 zones and records ├── iam/ # Roles, policies, users └── monitoring/ # CloudWatch, SNS topics # compute/main.tf data "terraform_remote_state" "network" { backend = "s3" config = { bucket = "terraform-state" key = "network/terraform.tfstate" region = "us-east-1" } } resource "aws_eks_cluster" "main" { vpc_config { subnet_ids = data.terraform_remote_state.network.outputs.private_subnet_ids } } # compute/main.tf data "terraform_remote_state" "network" { backend = "s3" config = { bucket = "terraform-state" key = "network/terraform.tfstate" region = "us-east-1" } } resource "aws_eks_cluster" "main" { vpc_config { subnet_ids = data.terraform_remote_state.network.outputs.private_subnet_ids } } # compute/main.tf data "terraform_remote_state" "network" { backend = "s3" config = { bucket = "terraform-state" key = "network/terraform.tfstate" region = "us-east-1" } } resource "aws_eks_cluster" "main" { vpc_config { subnet_ids = 
data.terraform_remote_state.network.outputs.private_subnet_ids } } modules/ ├── eks-cluster/ │ ├── main.tf │ ├── variables.tf │ └── outputs.tf └── rds-instance/ ├── main.tf ├── variables.tf └── outputs.tf environments/ ├── dev/ │ └── terragrunt.hcl ├── staging/ │ └── terragrunt.hcl └── prod/ └── terragrunt.hcl modules/ ├── eks-cluster/ │ ├── main.tf │ ├── variables.tf │ └── outputs.tf └── rds-instance/ ├── main.tf ├── variables.tf └── outputs.tf environments/ ├── dev/ │ └── terragrunt.hcl ├── staging/ │ └── terragrunt.hcl └── prod/ └── terragrunt.hcl modules/ ├── eks-cluster/ │ ├── main.tf │ ├── variables.tf │ └── outputs.tf └── rds-instance/ ├── main.tf ├── variables.tf └── outputs.tf environments/ ├── dev/ │ └── terragrunt.hcl ├── staging/ │ └── terragrunt.hcl └── prod/ └── terragrunt.hcl # environments/prod/terragrunt.hcl terraform { source = "../../modules/eks-cluster" } inputs = { cluster_name = "prod-main" node_count = 10 instance_type = "m5.2xlarge" multi_az = true } # environments/prod/terragrunt.hcl terraform { source = "../../modules/eks-cluster" } inputs = { cluster_name = "prod-main" node_count = 10 instance_type = "m5.2xlarge" multi_az = true } # environments/prod/terragrunt.hcl terraform { source = "../../modules/eks-cluster" } inputs = { cluster_name = "prod-main" node_count = 10 instance_type = "m5.2xlarge" multi_az = true } #.github/workflows/terraform.yml name: Terraform on: pull_request: paths: ['infrastructure/**'] push: branches: [main] paths: ['infrastructure/**'] jobs: plan: runs-on: ubuntu-latest steps: - uses: hashicorp/setup-terraform@v3 - run: terraform init - run: terraform plan -out=plan.tfplan - run: terraform show -json plan.tfplan > plan.json # Post plan as PR comment - uses: actions/github-script@v7 with: script: | const plan = require('./plan.json'); const adds = plan.resource_changes.filter(c => c.change.actions.includes('create')).length; const changes = plan.resource_changes.filter(c => 
c.change.actions.includes('update')).length; const deletes = plan.resource_changes.filter(c => c.change.actions.includes('delete')).length; github.rest.issues.createComment({ issue_number: context.issue.number, body: `## Terraform Plan\n+${adds} ~${changes} -${deletes}\n\n${deletes > 0? '⚠ RESOURCES WILL BE DESTROYED': ''}` }); apply: needs: plan if: github.ref == 'refs/heads/main' environment: production # Requires approval steps: - run: terraform apply plan.tfplan #.github/workflows/terraform.yml name: Terraform on: pull_request: paths: ['infrastructure/**'] push: branches: [main] paths: ['infrastructure/**'] jobs: plan: runs-on: ubuntu-latest steps: - uses: hashicorp/setup-terraform@v3 - run: terraform init - run: terraform plan -out=plan.tfplan - run: terraform show -json plan.tfplan > plan.json # Post plan as PR comment - uses: actions/github-script@v7 with: script: | const plan = require('./plan.json'); const adds = plan.resource_changes.filter(c => c.change.actions.includes('create')).length; const changes = plan.resource_changes.filter(c => c.change.actions.includes('update')).length; const deletes = plan.resource_changes.filter(c => c.change.actions.includes('delete')).length; github.rest.issues.createComment({ issue_number: context.issue.number, body: `## Terraform Plan\n+${adds} ~${changes} -${deletes}\n\n${deletes > 0? 
'⚠ RESOURCES WILL BE DESTROYED': ''}` }); apply: needs: plan if: github.ref == 'refs/heads/main' environment: production # Requires approval steps: - run: terraform apply plan.tfplan #.github/workflows/terraform.yml name: Terraform on: pull_request: paths: ['infrastructure/**'] push: branches: [main] paths: ['infrastructure/**'] jobs: plan: runs-on: ubuntu-latest steps: - uses: hashicorp/setup-terraform@v3 - run: terraform init - run: terraform plan -out=plan.tfplan - run: terraform show -json plan.tfplan > plan.json # Post plan as PR comment - uses: actions/github-script@v7 with: script: | const plan = require('./plan.json'); const adds = plan.resource_changes.filter(c => c.change.actions.includes('create')).length; const changes = plan.resource_changes.filter(c => c.change.actions.includes('update')).length; const deletes = plan.resource_changes.filter(c => c.change.actions.includes('delete')).length; github.rest.issues.createComment({ issue_number: context.issue.number, body: `## Terraform Plan\n+${adds} ~${changes} -${deletes}\n\n${deletes > 0? '⚠ RESOURCES WILL BE DESTROYED': ''}` }); apply: needs: plan if: github.ref == 'refs/heads/main' environment: production # Requires approval steps: - run: terraform apply plan.tfplan terraform { backend "s3" { bucket = "terraform-state" key = "network/terraform.tfstate" region = "us-east-1" dynamodb_table = "terraform-locks" encrypt = true } } terraform { backend "s3" { bucket = "terraform-state" key = "network/terraform.tfstate" region = "us-east-1" dynamodb_table = "terraform-locks" encrypt = true } } terraform { backend "s3" { bucket = "terraform-state" key = "network/terraform.tfstate" region = "us-east-1" dynamodb_table = "terraform-locks" encrypt = true } }