apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-app
spec:
  replicas: 10
  selector:
    matchLabels:
      app: my-app
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: my-app
    spec:
      containers:
        - name: app
          image: my-app:v1
          ports:
            - containerPort: 8080
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-app
spec:
  replicas: 10
  selector:
    matchLabels:
      app: my-app
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: my-app
    spec:
      containers:
        - name: app
          image: my-app:v1
          ports:
            - containerPort: 8080
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-app
spec:
  replicas: 10
  selector:
    matchLabels:
      app: my-app
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: my-app
    spec:
      containers:
        - name: app
          image: my-app:v1
          ports:
            - containerPort: 8080
apiVersion: argoproj.io/v1alpha1 # <-- changed
kind: Rollout # <-- changed
metadata:
  name: my-app
spec:
  replicas: 10
  selector:
    matchLabels:
      app: my-app
  strategy: # <-- this whole block changes
    canary:
      steps:
        - setWeight: 10
        - pause:
            duration: 10m
        - setWeight: 50
        - pause:
            duration: 10m
        - setWeight: 100
      analysis:
        templates:
          - templateName: standard-health-check
        args:
          - name: service
            value: my-app.default.svc.cluster.local
  template:
    metadata:
      labels:
        app: my-app
    spec:
      containers:
        - name: app
          image: my-app:v1
          ports:
            - containerPort: 8080
apiVersion: argoproj.io/v1alpha1 # <-- changed
kind: Rollout # <-- changed
metadata:
  name: my-app
spec:
  replicas: 10
  selector:
    matchLabels:
      app: my-app
  strategy: # <-- this whole block changes
    canary:
      steps:
        - setWeight: 10
        - pause:
            duration: 10m
        - setWeight: 50
        - pause:
            duration: 10m
        - setWeight: 100
      analysis:
        templates:
          - templateName: standard-health-check
        args:
          - name: service
            value: my-app.default.svc.cluster.local
  template:
    metadata:
      labels:
        app: my-app
    spec:
      containers:
        - name: app
          image: my-app:v1
          ports:
            - containerPort: 8080
apiVersion: argoproj.io/v1alpha1 # <-- changed
kind: Rollout # <-- changed
metadata:
  name: my-app
spec:
  replicas: 10
  selector:
    matchLabels:
      app: my-app
  strategy: # <-- this whole block changes
    canary:
      steps:
        - setWeight: 10
        - pause:
            duration: 10m
        - setWeight: 50
        - pause:
            duration: 10m
        - setWeight: 100
      analysis:
        templates:
          - templateName: standard-health-check
        args:
          - name: service
            value: my-app.default.svc.cluster.local
  template:
    metadata:
      labels:
        app: my-app
    spec:
      containers:
        - name: app
          image: my-app:v1
          ports:
            - containerPort: 8080
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: my-app
spec:
  strategy:
    canary:
      canaryService: my-app-canary # separate Service for canary pods
      stableService: my-app-stable # separate Service for stable pods
      trafficRouting:
        nginx:
          stableIngress: my-app-ingress
          annotationPrefix: nginx.ingress.kubernetes.io
          additionalIngressAnnotations:
            canary-by-header: X-Canary-User
            canary-by-header-value: "true"
      steps:
        - setWeight: 10
        - pause:
            duration: 10m
        - setWeight: 50
        - pause:
            duration: 10m
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: my-app
spec:
  strategy:
    canary:
      canaryService: my-app-canary # separate Service for canary pods
      stableService: my-app-stable # separate Service for stable pods
      trafficRouting:
        nginx:
          stableIngress: my-app-ingress
          annotationPrefix: nginx.ingress.kubernetes.io
          additionalIngressAnnotations:
            canary-by-header: X-Canary-User
            canary-by-header-value: "true"
      steps:
        - setWeight: 10
        - pause:
            duration: 10m
        - setWeight: 50
        - pause:
            duration: 10m
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: my-app
spec:
  strategy:
    canary:
      canaryService: my-app-canary # separate Service for canary pods
      stableService: my-app-stable # separate Service for stable pods
      trafficRouting:
        nginx:
          stableIngress: my-app-ingress
          annotationPrefix: nginx.ingress.kubernetes.io
          additionalIngressAnnotations:
            canary-by-header: X-Canary-User
            canary-by-header-value: "true"
      steps:
        - setWeight: 10
        - pause:
            duration: 10m
        - setWeight: 50
        - pause:
            duration: 10m
steps:
  - setWeight: 10
  - setCanaryScale:
      replicas: 3 # always keep 3 canary pods regardless of weight
  - pause:
      duration: 10m
  - setWeight: 50
  - setCanaryScale:
      matchTrafficWeight: true # now scale proportionally
steps:
  - setWeight: 10
  - setCanaryScale:
      replicas: 3 # always keep 3 canary pods regardless of weight
  - pause:
      duration: 10m
  - setWeight: 50
  - setCanaryScale:
      matchTrafficWeight: true # now scale proportionally
steps:
  - setWeight: 10
  - setCanaryScale:
      replicas: 3 # always keep 3 canary pods regardless of weight
  - pause:
      duration: 10m
  - setWeight: 50
  - setCanaryScale:
      matchTrafficWeight: true # now scale proportionally
steps:
  - setWeight: 10
  - pause:
      duration: 10m # timed pause — auto-advances
  - setWeight: 30
  - pause: {} # indefinite pause — REQUIRES manual promotion
  - setWeight: 100
steps:
  - setWeight: 10
  - pause:
      duration: 10m # timed pause — auto-advances
  - setWeight: 30
  - pause: {} # indefinite pause — REQUIRES manual promotion
  - setWeight: 100
steps:
  - setWeight: 10
  - pause:
      duration: 10m # timed pause — auto-advances
  - setWeight: 30
  - pause: {} # indefinite pause — REQUIRES manual promotion
  - setWeight: 100
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
  name: error-rate-check
spec:
  args:
    - name: service-name
  metrics:
    - name: success-rate
      interval: 5m
      successCondition: result[0] >= 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus:9090
          query: |
            sum(rate(
              requests_total{service="{{args.service-name}}",status!~"5.."}[5m]
            )) /
            sum(rate(
              requests_total{service="{{args.service-name}}"}[5m]
            ))
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
  name: error-rate-check
spec:
  args:
    - name: service-name
  metrics:
    - name: success-rate
      interval: 5m
      successCondition: result[0] >= 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus:9090
          query: |
            sum(rate(
              requests_total{service="{{args.service-name}}",status!~"5.."}[5m]
            )) /
            sum(rate(
              requests_total{service="{{args.service-name}}"}[5m]
            ))
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
  name: error-rate-check
spec:
  args:
    - name: service-name
  metrics:
    - name: success-rate
      interval: 5m
      successCondition: result[0] >= 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus:9090
          query: |
            sum(rate(
              requests_total{service="{{args.service-name}}",status!~"5.."}[5m]
            )) /
            sum(rate(
              requests_total{service="{{args.service-name}}"}[5m]
            ))
metrics:
  - name: integration-test
    provider:
      job:
        spec:
          backoffLimit: 0 # Job-level: fail fast, no retries
          template:
            spec:
              containers:
                - name: test-runner
                  image: my-test-runner:latest
                  command: ["pytest", "tests/smoke/", "-v"]
              restartPolicy: Never
metrics:
  - name: integration-test
    provider:
      job:
        spec:
          backoffLimit: 0 # Job-level: fail fast, no retries
          template:
            spec:
              containers:
                - name: test-runner
                  image: my-test-runner:latest
                  command: ["pytest", "tests/smoke/", "-v"]
              restartPolicy: Never
metrics:
  - name: integration-test
    provider:
      job:
        spec:
          backoffLimit: 0 # Job-level: fail fast, no retries
          template:
            spec:
              containers:
                - name: test-runner
                  image: my-test-runner:latest
                  command: ["pytest", "tests/smoke/", "-v"]
              restartPolicy: Never
apiVersion: argoproj.io/v1alpha1
kind: ClusterAnalysisTemplate # <-- cluster-scoped
metadata:
  name: standard-health-check
spec:
  args:
    - name: service
  metrics:
    - name: success-rate
      interval: 5m
      successCondition: result[0] >= 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus.monitoring:9090
          query: |
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}",
                response_code!~"5.*"
              }[5m]
            )) /
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}"
              }[5m]
            ))
apiVersion: argoproj.io/v1alpha1
kind: ClusterAnalysisTemplate # <-- cluster-scoped
metadata:
  name: standard-health-check
spec:
  args:
    - name: service
  metrics:
    - name: success-rate
      interval: 5m
      successCondition: result[0] >= 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus.monitoring:9090
          query: |
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}",
                response_code!~"5.*"
              }[5m]
            )) /
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}"
              }[5m]
            ))
apiVersion: argoproj.io/v1alpha1
kind: ClusterAnalysisTemplate # <-- cluster-scoped
metadata:
  name: standard-health-check
spec:
  args:
    - name: service
  metrics:
    - name: success-rate
      interval: 5m
      successCondition: result[0] >= 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus.monitoring:9090
          query: |
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}",
                response_code!~"5.*"
              }[5m]
            )) /
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}"
              }[5m]
            ))
# See all analysis runs for a rollout
kubectl argo rollouts get rollout my-app

# Detailed view of a specific analysis run
kubectl describe analysisrun my-app-<hash>

# Watch it live
kubectl argo rollouts get rollout my-app --watch
# See all analysis runs for a rollout
kubectl argo rollouts get rollout my-app

# Detailed view of a specific analysis run
kubectl describe analysisrun my-app-<hash>

# Watch it live
kubectl argo rollouts get rollout my-app --watch
# See all analysis runs for a rollout
kubectl argo rollouts get rollout my-app

# Detailed view of a specific analysis run
kubectl describe analysisrun my-app-<hash>

# Watch it live
kubectl argo rollouts get rollout my-app --watch
apiVersion: argoproj.io/v1alpha1
kind: AnalysisRun
metadata:
  name: dry-run-health-check
spec:
  args:
    - name: service-name
      value: my-app.default.svc.cluster.local
  templates:
    - templateName: error-rate-check
apiVersion: argoproj.io/v1alpha1
kind: AnalysisRun
metadata:
  name: dry-run-health-check
spec:
  args:
    - name: service-name
      value: my-app.default.svc.cluster.local
  templates:
    - templateName: error-rate-check
apiVersion: argoproj.io/v1alpha1
kind: AnalysisRun
metadata:
  name: dry-run-health-check
spec:
  args:
    - name: service-name
      value: my-app.default.svc.cluster.local
  templates:
    - templateName: error-rate-check
# Inside a Rollout's canary steps
steps:
  - experiment:
      duration: 30m
      templates:
        - name: baseline
          specRef: stable # uses the current stable spec
        - name: canary
          specRef: canary # uses the new canary spec
      analyses:
        - name: compare-latency
          templateName: p95-latency-comparison
          args:
            - name: baseline-service
              value: "{{templates.baseline.service.name}}"
            - name: canary-service
              value: "{{templates.canary.service.name}}"
# Inside a Rollout's canary steps
steps:
  - experiment:
      duration: 30m
      templates:
        - name: baseline
          specRef: stable # uses the current stable spec
        - name: canary
          specRef: canary # uses the new canary spec
      analyses:
        - name: compare-latency
          templateName: p95-latency-comparison
          args:
            - name: baseline-service
              value: "{{templates.baseline.service.name}}"
            - name: canary-service
              value: "{{templates.canary.service.name}}"
# Inside a Rollout's canary steps
steps:
  - experiment:
      duration: 30m
      templates:
        - name: baseline
          specRef: stable # uses the current stable spec
        - name: canary
          specRef: canary # uses the new canary spec
      analyses:
        - name: compare-latency
          templateName: p95-latency-comparison
          args:
            - name: baseline-service
              value: "{{templates.baseline.service.name}}"
            - name: canary-service
              value: "{{templates.canary.service.name}}"
# macOS
brew install argoproj/tap/kubectl-argo-rollouts

# Linux
curl -LO https://github.com/argoproj/argo-rollouts/releases/latest/download/kubectl-argo-rollouts-linux-amd64
chmod +x kubectl-argo-rollouts-linux-amd64
# /usr/local/bin is root-owned on most distros — sudo required
sudo mv kubectl-argo-rollouts-linux-amd64 /usr/local/bin/kubectl-argo-rollouts
# macOS
brew install argoproj/tap/kubectl-argo-rollouts

# Linux
curl -LO https://github.com/argoproj/argo-rollouts/releases/latest/download/kubectl-argo-rollouts-linux-amd64
chmod +x kubectl-argo-rollouts-linux-amd64
# /usr/local/bin is root-owned on most distros — sudo required
sudo mv kubectl-argo-rollouts-linux-amd64 /usr/local/bin/kubectl-argo-rollouts
# macOS
brew install argoproj/tap/kubectl-argo-rollouts

# Linux
curl -LO https://github.com/argoproj/argo-rollouts/releases/latest/download/kubectl-argo-rollouts-linux-amd64
chmod +x kubectl-argo-rollouts-linux-amd64
# /usr/local/bin is root-owned on most distros — sudo required
sudo mv kubectl-argo-rollouts-linux-amd64 /usr/local/bin/kubectl-argo-rollouts
# Launch the local Argo Rollouts dashboard UI in the browser
kubectl argo rollouts dashboard
kubectl argo rollouts dashboard
kubectl argo rollouts dashboard
# Store the Slack bot token where the notifications engine expects it
kubectl create secret generic argo-rollouts-notification-secret \
  --from-literal=slack-token=xoxb-your-slack-bot-token \
  -n argo-rollouts
# Store the Slack bot token where the notifications engine expects it
kubectl create secret generic argo-rollouts-notification-secret \
  --from-literal=slack-token=xoxb-your-slack-bot-token \
  -n argo-rollouts
# Store the Slack bot token where the notifications engine expects it
kubectl create secret generic argo-rollouts-notification-secret \
  --from-literal=slack-token=xoxb-your-slack-bot-token \
  -n argo-rollouts
apiVersion: v1
kind: ConfigMap
metadata:
  name: argo-rollouts-notification-cm
  namespace: argo-rollouts
data:
  # Slack integration
  service.slack: |
    token: $slack-token

  # Message templates
  template.rollout-aborted: |
    message: |
      :red_circle: Rollout *{{.rollout.metadata.name}}* aborted in namespace *{{.rollout.metadata.namespace}}*
      Reason: {{.rollout.status.message}}
      Canary weight at time of abort: {{.rollout.status.currentPodHash}}
  template.analysis-run-failed: |
    message: |
      :warning: Analysis failed for *{{.rollout.metadata.name}}*
      Failed metric: {{range .analysisRun.status.metricResults}}{{if eq .phase "Failed"}}{{.name}}{{end}}{{end}}
      Initiating automatic rollback.
  template.rollout-completed: |
    message: |
      :white_check_mark: Rollout *{{.rollout.metadata.name}}* completed successfully.
      New stable image: {{range .rollout.spec.template.spec.containers}}{{.image}}{{end}}
  template.rollout-paused: |
    message: |
      :pause_button: Rollout *{{.rollout.metadata.name}}* paused — awaiting manual promotion.
      Promote with: `kubectl argo rollouts promote {{.rollout.metadata.name}} -n {{.rollout.metadata.namespace}}`

  # Triggers — maps events to templates
  trigger.on-rollout-aborted: |
    - send: [rollout-aborted]
  trigger.on-analysis-run-failed: |
    - send: [analysis-run-failed]
  trigger.on-rollout-completed: |
    - send: [rollout-completed]
  trigger.on-rollout-paused: |
    - send: [rollout-paused]
apiVersion: v1
kind: ConfigMap
metadata:
  name: argo-rollouts-notification-cm
  namespace: argo-rollouts
data:
  # Slack integration
  service.slack: |
    token: $slack-token

  # Message templates
  template.rollout-aborted: |
    message: |
      :red_circle: Rollout *{{.rollout.metadata.name}}* aborted in namespace *{{.rollout.metadata.namespace}}*
      Reason: {{.rollout.status.message}}
      Canary weight at time of abort: {{.rollout.status.currentPodHash}}
  template.analysis-run-failed: |
    message: |
      :warning: Analysis failed for *{{.rollout.metadata.name}}*
      Failed metric: {{range .analysisRun.status.metricResults}}{{if eq .phase "Failed"}}{{.name}}{{end}}{{end}}
      Initiating automatic rollback.
  template.rollout-completed: |
    message: |
      :white_check_mark: Rollout *{{.rollout.metadata.name}}* completed successfully.
      New stable image: {{range .rollout.spec.template.spec.containers}}{{.image}}{{end}}
  template.rollout-paused: |
    message: |
      :pause_button: Rollout *{{.rollout.metadata.name}}* paused — awaiting manual promotion.
      Promote with: `kubectl argo rollouts promote {{.rollout.metadata.name}} -n {{.rollout.metadata.namespace}}`

  # Triggers — maps events to templates
  trigger.on-rollout-aborted: |
    - send: [rollout-aborted]
  trigger.on-analysis-run-failed: |
    - send: [analysis-run-failed]
  trigger.on-rollout-completed: |
    - send: [rollout-completed]
  trigger.on-rollout-paused: |
    - send: [rollout-paused]
apiVersion: v1
kind: ConfigMap
metadata:
  name: argo-rollouts-notification-cm
  namespace: argo-rollouts
data:
  # Slack integration
  service.slack: |
    token: $slack-token

  # Message templates
  template.rollout-aborted: |
    message: |
      :red_circle: Rollout *{{.rollout.metadata.name}}* aborted in namespace *{{.rollout.metadata.namespace}}*
      Reason: {{.rollout.status.message}}
      Canary weight at time of abort: {{.rollout.status.currentPodHash}}
  template.analysis-run-failed: |
    message: |
      :warning: Analysis failed for *{{.rollout.metadata.name}}*
      Failed metric: {{range .analysisRun.status.metricResults}}{{if eq .phase "Failed"}}{{.name}}{{end}}{{end}}
      Initiating automatic rollback.
  template.rollout-completed: |
    message: |
      :white_check_mark: Rollout *{{.rollout.metadata.name}}* completed successfully.
      New stable image: {{range .rollout.spec.template.spec.containers}}{{.image}}{{end}}
  template.rollout-paused: |
    message: |
      :pause_button: Rollout *{{.rollout.metadata.name}}* paused — awaiting manual promotion.
      Promote with: `kubectl argo rollouts promote {{.rollout.metadata.name}} -n {{.rollout.metadata.namespace}}`

  # Triggers — maps events to templates
  trigger.on-rollout-aborted: |
    - send: [rollout-aborted]
  trigger.on-analysis-run-failed: |
    - send: [analysis-run-failed]
  trigger.on-rollout-completed: |
    - send: [rollout-completed]
  trigger.on-rollout-paused: |
    - send: [rollout-paused]
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: my-app
  annotations:
    # Alert on abort and analysis failure
    notifications.argoproj.io/subscribe.on-rollout-aborted.slack: "#alerts-team-a"
    notifications.argoproj.io/subscribe.on-analysis-run-failed.slack: "#alerts-team-a"
    # Notify on success too — close the loop
    notifications.argoproj.io/subscribe.on-rollout-completed.slack: "#deploys-team-a"
    # Alert when a manual gate is waiting for promotion
    notifications.argoproj.io/subscribe.on-rollout-paused.slack: "#deploys-team-a"
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: my-app
  annotations:
    # Alert on abort and analysis failure
    notifications.argoproj.io/subscribe.on-rollout-aborted.slack: "#alerts-team-a"
    notifications.argoproj.io/subscribe.on-analysis-run-failed.slack: "#alerts-team-a"
    # Notify on success too — close the loop
    notifications.argoproj.io/subscribe.on-rollout-completed.slack: "#deploys-team-a"
    # Alert when a manual gate is waiting for promotion
    notifications.argoproj.io/subscribe.on-rollout-paused.slack: "#deploys-team-a"
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: my-app
  annotations:
    # Alert on abort and analysis failure
    notifications.argoproj.io/subscribe.on-rollout-aborted.slack: "#alerts-team-a"
    notifications.argoproj.io/subscribe.on-analysis-run-failed.slack: "#alerts-team-a"
    # Notify on success too — close the loop
    notifications.argoproj.io/subscribe.on-rollout-completed.slack: "#deploys-team-a"
    # Alert when a manual gate is waiting for promotion
    notifications.argoproj.io/subscribe.on-rollout-paused.slack: "#deploys-team-a"
Developer pushes new image tag to Git
  ↓
Argo CD detects the diff and syncs the Rollout spec
  ↓
Argo Rollouts controller picks up the new spec
  ↓
Canary step begins: 10% traffic → AnalysisRun starts
  ↓
Analysis passes → 50% → analysis passes → 100%
  ↓
New version is stable. Argo CD shows "Synced + Healthy"
Developer pushes new image tag to Git
  ↓
Argo CD detects the diff and syncs the Rollout spec
  ↓
Argo Rollouts controller picks up the new spec
  ↓
Canary step begins: 10% traffic → AnalysisRun starts
  ↓
Analysis passes → 50% → analysis passes → 100%
  ↓
New version is stable. Argo CD shows "Synced + Healthy"
Developer pushes new image tag to Git
  ↓
Argo CD detects the diff and syncs the Rollout spec
  ↓
Argo Rollouts controller picks up the new spec
  ↓
Canary step begins: 10% traffic → AnalysisRun starts
  ↓
Analysis passes → 50% → analysis passes → 100%
  ↓
New version is stable. Argo CD shows "Synced + Healthy"
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: my-app
spec:
  # syncOptions live under syncPolicy in the Application spec
  syncPolicy:
    syncOptions:
      - RespectIgnoreDifferences=true
  ignoreDifferences:
    - group: argoproj.io
      kind: Rollout
      jsonPointers:
        - /spec/replicas # Argo Rollouts manages this during canary
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: my-app
spec:
  # syncOptions live under syncPolicy in the Application spec
  syncPolicy:
    syncOptions:
      - RespectIgnoreDifferences=true
  ignoreDifferences:
    - group: argoproj.io
      kind: Rollout
      jsonPointers:
        - /spec/replicas # Argo Rollouts manages this during canary
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: my-app
spec:
  # syncOptions live under syncPolicy in the Application spec
  syncPolicy:
    syncOptions:
      - RespectIgnoreDifferences=true
  ignoreDifferences:
    - group: argoproj.io
      kind: Rollout
      jsonPointers:
        - /spec/replicas # Argo Rollouts manages this during canary
# 1. ClusterAnalysisTemplate — define once, use everywhere
apiVersion: argoproj.io/v1alpha1
kind: ClusterAnalysisTemplate
metadata:
  name: standard-health-check
spec:
  args:
    - name: service
  metrics:
    - name: success-rate
      interval: 5m
      successCondition: result[0] >= 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus.monitoring:9090
          query: |
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}",
                response_code!~"5.*"
              }[5m]
            )) /
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}"
              }[5m]
            ))
    - name: p95-latency
      interval: 5m
      successCondition: result[0] <= 500 # ms
      failureLimit: 2
      provider:
        prometheus:
          address: http://prometheus.monitoring:9090
          query: |
            histogram_quantile(0.95,
              sum(rate(
                istio_request_duration_milliseconds_bucket{
                  destination_service=~"{{args.service}}"
                }[5m]
              )) by (le)
            )
---
# 2. The Rollout
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: my-app
  annotations:
    notifications.argoproj.io/subscribe.on-rollout-aborted.slack: "#alerts-my-team"
    notifications.argoproj.io/subscribe.on-analysis-run-failed.slack: "#alerts-my-team"
    notifications.argoproj.io/subscribe.on-rollout-completed.slack: "#deploys-my-team"
    notifications.argoproj.io/subscribe.on-rollout-paused.slack: "#deploys-my-team"
spec:
  replicas: 10
  selector:
    matchLabels:
      app: my-app
  template:
    metadata:
      labels:
        app: my-app
    spec:
      containers:
        - name: app
          image: my-app:v2
          ports:
            - containerPort: 8080
  strategy:
    canary:
      canaryService: my-app-canary
      stableService: my-app-stable
      trafficRouting:
        # Can use ALB, Istio, Traefik (Gateway API is supported via plugins; haven't explored it yet)
        nginx:
          stableIngress: my-app-ingress
      steps:
        - setWeight: 10
        - setCanaryScale:
            replicas: 3 # stable replica count regardless of weight
        - pause:
            duration: 10m # timed: auto-advances after 10m
        - setWeight: 30
        - pause: {} # manual gate: requires explicit promotion
        - setWeight: 60
        - pause:
            duration: 10m
        - setWeight: 100
      analysis:
        startingStep: 1 # analysis starts after first setWeight
        templates:
          - templateName: standard-health-check
            clusterScope: true # use ClusterAnalysisTemplate
        args:
          - name: service
            value: my-app.default.svc.cluster.local
# 1. ClusterAnalysisTemplate — define once, use everywhere
apiVersion: argoproj.io/v1alpha1
kind: ClusterAnalysisTemplate
metadata:
  name: standard-health-check
spec:
  args:
    - name: service
  metrics:
    - name: success-rate
      interval: 5m
      successCondition: result[0] >= 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus.monitoring:9090
          query: |
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}",
                response_code!~"5.*"
              }[5m]
            )) /
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}"
              }[5m]
            ))
    - name: p95-latency
      interval: 5m
      successCondition: result[0] <= 500 # ms
      failureLimit: 2
      provider:
        prometheus:
          address: http://prometheus.monitoring:9090
          query: |
            histogram_quantile(0.95,
              sum(rate(
                istio_request_duration_milliseconds_bucket{
                  destination_service=~"{{args.service}}"
                }[5m]
              )) by (le)
            )
---
# 2. The Rollout
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: my-app
  annotations:
    notifications.argoproj.io/subscribe.on-rollout-aborted.slack: "#alerts-my-team"
    notifications.argoproj.io/subscribe.on-analysis-run-failed.slack: "#alerts-my-team"
    notifications.argoproj.io/subscribe.on-rollout-completed.slack: "#deploys-my-team"
    notifications.argoproj.io/subscribe.on-rollout-paused.slack: "#deploys-my-team"
spec:
  replicas: 10
  selector:
    matchLabels:
      app: my-app
  template:
    metadata:
      labels:
        app: my-app
    spec:
      containers:
        - name: app
          image: my-app:v2
          ports:
            - containerPort: 8080
  strategy:
    canary:
      canaryService: my-app-canary
      stableService: my-app-stable
      trafficRouting:
        # Can use ALB, Istio, Traefik (Gateway API is supported via plugins; haven't explored it yet)
        nginx:
          stableIngress: my-app-ingress
      steps:
        - setWeight: 10
        - setCanaryScale:
            replicas: 3 # stable replica count regardless of weight
        - pause:
            duration: 10m # timed: auto-advances after 10m
        - setWeight: 30
        - pause: {} # manual gate: requires explicit promotion
        - setWeight: 60
        - pause:
            duration: 10m
        - setWeight: 100
      analysis:
        startingStep: 1 # analysis starts after first setWeight
        templates:
          - templateName: standard-health-check
            clusterScope: true # use ClusterAnalysisTemplate
        args:
          - name: service
            value: my-app.default.svc.cluster.local
# 1. ClusterAnalysisTemplate — define once, use everywhere
apiVersion: argoproj.io/v1alpha1
kind: ClusterAnalysisTemplate
metadata:
  name: standard-health-check
spec:
  args:
    - name: service
  metrics:
    - name: success-rate
      interval: 5m
      successCondition: result[0] >= 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus.monitoring:9090
          query: |
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}",
                response_code!~"5.*"
              }[5m]
            )) /
            sum(rate(
              istio_requests_total{
                destination_service=~"{{args.service}}"
              }[5m]
            ))
    - name: p95-latency
      interval: 5m
      successCondition: result[0] <= 500 # ms
      failureLimit: 2
      provider:
        prometheus:
          address: http://prometheus.monitoring:9090
          query: |
            histogram_quantile(0.95,
              sum(rate(
                istio_request_duration_milliseconds_bucket{
                  destination_service=~"{{args.service}}"
                }[5m]
              )) by (le)
            )
---
# 2. The Rollout
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: my-app
  annotations:
    notifications.argoproj.io/subscribe.on-rollout-aborted.slack: "#alerts-my-team"
    notifications.argoproj.io/subscribe.on-analysis-run-failed.slack: "#alerts-my-team"
    notifications.argoproj.io/subscribe.on-rollout-completed.slack: "#deploys-my-team"
    notifications.argoproj.io/subscribe.on-rollout-paused.slack: "#deploys-my-team"
spec:
  replicas: 10
  selector:
    matchLabels:
      app: my-app
  template:
    metadata:
      labels:
        app: my-app
    spec:
      containers:
        - name: app
          image: my-app:v2
          ports:
            - containerPort: 8080
  strategy:
    canary:
      canaryService: my-app-canary
      stableService: my-app-stable
      trafficRouting:
        # Can use ALB, Istio, Traefik (Gateway API is supported via plugins; haven't explored it yet)
        nginx:
          stableIngress: my-app-ingress
      steps:
        - setWeight: 10
        - setCanaryScale:
            replicas: 3 # stable replica count regardless of weight
        - pause:
            duration: 10m # timed: auto-advances after 10m
        - setWeight: 30
        - pause: {} # manual gate: requires explicit promotion
        - setWeight: 60
        - pause:
            duration: 10m
        - setWeight: 100
      analysis:
        startingStep: 1 # analysis starts after first setWeight
        templates:
          - templateName: standard-health-check
            clusterScope: true # use ClusterAnalysisTemplate
        args:
          - name: service
            value: my-app.default.svc.cluster.local

- Readiness probes — checks if a pod is ready, not if your release is healthy
- Rolling updates — controls speed, not safety
- Pause support — you can halt, but there's no automated rollback on failure
- Umm, that's pretty much it, not counting the pre-stop hooks and stuff
- Successful → Argo Rollouts advances to the next step
- Failed → Rollout aborts, traffic snaps back to stable, canary scales to zero
- Inconclusive → Rollout pauses, waits for manual judgment (useful when metrics are ambiguous)
- Live rollout status — step progression, current traffic weights, active canary vs stable pod counts
- AnalysisRun status — each metric check, pass/fail, consecutive failures, timestamps
- One-click controls — Promote, Abort, Retry directly from the UI without touching kubectl
- Rollout history — every revision with its status and timestamp
- Argo CD ensures your cluster matches the desired state in Git. It's a reconciliation engine. It sees your `kind: Rollout` manifest in Git and syncs it to the cluster.
- Argo Rollouts controls how the transition from old to new happens once that manifest lands. It manages the traffic shifting, analysis, and promotion/rollback logic.
- Infrastructure controllers — cert-manager, nginx, coredns, sealed-secrets. These aren't application deployments; they're cluster plumbing. A canary of your ingress controller is chaos.
- Applications with shared mutable state — if your app writes to a shared file, a shared queue, or a shared database schema without backward compatibility, running two versions simultaneously will corrupt data.
- Worker/queue consumers — apps that pull from a queue typically can't handle two versions processing the same messages. Argo Rollouts doesn't control queue routing.
- Long-lived parallel versions — Argo Rollouts assumes a brief deployment window (15–60 minutes typically, 1–2 hours max). Running canary for days or weeks before deciding to promote creates operational complexity and rollback ambiguity.
- Multi-cluster rollouts — Argo Rollouts operates within a single cluster. If you need coordinated rollouts across clusters, look at Argo CD ApplicationSets or multi-cluster progressive delivery tools.
- Legacy apps that can't run multiple versions concurrently — some apps hold exclusive locks, bind to fixed ports, or have singleton assumptions. For these, Blue-Green (not canary) is your only option, and even that requires validation.