┌─────────────────────────────────────────────────┐
│ Federation Manager │
│ implements engine.Collector │
├─────────────────────────────────────────────────┤
│ │
│ ┌──────────────┐ ┌──────────────┐ │
│ │ us-east-prod │ │ eu-west-prod │ ... │
│ │ K8s Collector│ │ K8s Collector│ │
│ └──────┬───────┘ └──────┬───────┘ │
│ │ │ │
│ └──────┬───────────┘ │
│ ▼ │
│ ┌──────────────────────────┐ │
│ │ Aggregator │ │
│ │ SHA256 dedup (30s window)│ │
│ │ Cross-cluster metrics │ │
│ └──────────────────────────┘ │
│ │
└─────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────┐
│ Federation Manager │
│ implements engine.Collector │
├─────────────────────────────────────────────────┤
│ │
│ ┌──────────────┐ ┌──────────────┐ │
│ │ us-east-prod │ │ eu-west-prod │ ... │
│ │ K8s Collector│ │ K8s Collector│ │
│ └──────┬───────┘ └──────┬───────┘ │
│ │ │ │
│ └──────┬───────────┘ │
│ ▼ │
│ ┌──────────────────────────┐ │
│ │ Aggregator │ │
│ │ SHA256 dedup (30s window)│ │
│ │ Cross-cluster metrics │ │
│ └──────────────────────────┘ │
│ │
└─────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────┐
│ Federation Manager │
│ implements engine.Collector │
├─────────────────────────────────────────────────┤
│ │
│ ┌──────────────┐ ┌──────────────┐ │
│ │ us-east-prod │ │ eu-west-prod │ ... │
│ │ K8s Collector│ │ K8s Collector│ │
│ └──────┬───────┘ └──────┬───────┘ │
│ │ │ │
│ └──────┬───────────┘ │
│ ▼ │
│ ┌──────────────────────────┐ │
│ │ Aggregator │ │
│ │ SHA256 dedup (30s window)│ │
│ │ Cross-cluster metrics │ │
│ └──────────────────────────┘ │
│ │
└─────────────────────────────────────────────────┘
collectors:
  kubernetes:
    clusters:
      - name: us-east-prod
        kubeconfig_path: ~/.kube/us-east
        context: prod-context
        namespaces: ["default", "payments", "auth"]
        poll_interval: 15s
      - name: eu-west-prod
        kubeconfig_path: ~/.kube/eu-west
        context: prod-context
        namespaces: ["default", "payments"]
        poll_interval: 15s
collectors:
  kubernetes:
    clusters:
      - name: us-east-prod
        kubeconfig_path: ~/.kube/us-east
        context: prod-context
        namespaces: ["default", "payments", "auth"]
        poll_interval: 15s
      - name: eu-west-prod
        kubeconfig_path: ~/.kube/eu-west
        context: prod-context
        namespaces: ["default", "payments"]
        poll_interval: 15s
collectors:
  kubernetes:
    clusters:
      - name: us-east-prod
        kubeconfig_path: ~/.kube/us-east
        context: prod-context
        namespaces: ["default", "payments", "auth"]
        poll_interval: 15s
      - name: eu-west-prod
        kubeconfig_path: ~/.kube/eu-west
        context: prod-context
        namespaces: ["default", "payments"]
        poll_interval: 15s
type Plugin interface { Name() string Collect(ctx context.Context) ([]*collector.Event, error) Healthcheck(ctx context.Context) error
}
type Plugin interface { Name() string Collect(ctx context.Context) ([]*collector.Event, error) Healthcheck(ctx context.Context) error
}
type Plugin interface { Name() string Collect(ctx context.Context) ([]*collector.Event, error) Healthcheck(ctx context.Context) error
}
// Wrap the plugin in an SDK-managed collector: the builder supplies the
// polling loop, buffered event channel, and structured logging.
col := collector.NewBuilder(&myPlugin{}).
	WithPollInterval(10 * time.Second).
	WithBufferSize(128).
	WithLogger(slog.Default()).
	Build()

// col implements engine.Collector — register it like any built-in collector.
registry.RegisterCollector(col)
// Wrap the plugin in an SDK-managed collector: the builder supplies the
// polling loop, buffered event channel, and structured logging.
col := collector.NewBuilder(&myPlugin{}).
	WithPollInterval(10 * time.Second).
	WithBufferSize(128).
	WithLogger(slog.Default()).
	Build()

// col implements engine.Collector — register it like any built-in collector.
registry.RegisterCollector(col)
// Wrap the plugin in an SDK-managed collector: the builder supplies the
// polling loop, buffered event channel, and structured logging.
col := collector.NewBuilder(&myPlugin{}).
	WithPollInterval(10 * time.Second).
	WithBufferSize(128).
	WithLogger(slog.Default()).
	Build()

// col implements engine.Collector — register it like any built-in collector.
registry.RegisterCollector(col)
type HTTPChecker struct { url string
} func (h *HTTPChecker) Name() string { return "http-checker" } func (h *HTTPChecker) Collect(ctx context.Context) ([]*collector.Event, error) { start := time.Now() resp, err := http.Get(h.url) if err != nil { return []*collector.Event{{ Type: "http_check_failed", Severity: "high", Payload: map[string]interface{}{"url": h.url, "error": err.Error()}, }}, nil } defer resp.Body.Close() return []*collector.Event{{ Type: "http_check", Payload: map[string]interface{}{ "url": h.url, "status": resp.StatusCode, "latency_ms": time.Since(start).Milliseconds(), }, }}, nil
} func (h *HTTPChecker) Healthcheck(ctx context.Context) error { return nil }
type HTTPChecker struct { url string
} func (h *HTTPChecker) Name() string { return "http-checker" } func (h *HTTPChecker) Collect(ctx context.Context) ([]*collector.Event, error) { start := time.Now() resp, err := http.Get(h.url) if err != nil { return []*collector.Event{{ Type: "http_check_failed", Severity: "high", Payload: map[string]interface{}{"url": h.url, "error": err.Error()}, }}, nil } defer resp.Body.Close() return []*collector.Event{{ Type: "http_check", Payload: map[string]interface{}{ "url": h.url, "status": resp.StatusCode, "latency_ms": time.Since(start).Milliseconds(), }, }}, nil
} func (h *HTTPChecker) Healthcheck(ctx context.Context) error { return nil }
type HTTPChecker struct { url string
} func (h *HTTPChecker) Name() string { return "http-checker" } func (h *HTTPChecker) Collect(ctx context.Context) ([]*collector.Event, error) { start := time.Now() resp, err := http.Get(h.url) if err != nil { return []*collector.Event{{ Type: "http_check_failed", Severity: "high", Payload: map[string]interface{}{"url": h.url, "error": err.Error()}, }}, nil } defer resp.Body.Close() return []*collector.Event{{ Type: "http_check", Payload: map[string]interface{}{ "url": h.url, "status": resp.StatusCode, "latency_ms": time.Since(start).Milliseconds(), }, }}, nil
} func (h *HTTPChecker) Healthcheck(ctx context.Context) error { return nil }
Incident Detected │ ▼ FindRunbooks(incidentType) │ ▼ For each matching runbook: ├── autoExecute=true → Execute immediately └── autoExecute=false → Queue for approval │ ▼ Execute each Step sequentially: ├── kubectl_scale → Scale deployment replicas ├── restart_pod → Delete pod for controller restart ├── notify_oncall → Slack/PagerDuty notification ├── run_diagnostic → Execute diagnostic command └── custom_script → Run remediation script │ ▼ Record ExecutionResult (timing, step results, success/failure)
Incident Detected │ ▼ FindRunbooks(incidentType) │ ▼ For each matching runbook: ├── autoExecute=true → Execute immediately └── autoExecute=false → Queue for approval │ ▼ Execute each Step sequentially: ├── kubectl_scale → Scale deployment replicas ├── restart_pod → Delete pod for controller restart ├── notify_oncall → Slack/PagerDuty notification ├── run_diagnostic → Execute diagnostic command └── custom_script → Run remediation script │ ▼ Record ExecutionResult (timing, step results, success/failure)
Incident Detected │ ▼ FindRunbooks(incidentType) │ ▼ For each matching runbook: ├── autoExecute=true → Execute immediately └── autoExecute=false → Queue for approval │ ▼ Execute each Step sequentially: ├── kubectl_scale → Scale deployment replicas ├── restart_pod → Delete pod for controller restart ├── notify_oncall → Slack/PagerDuty notification ├── run_diagnostic → Execute diagnostic command └── custom_script → Run remediation script │ ▼ Record ExecutionResult (timing, step results, success/failure)
executor.RegisterRunbook(runbook.Runbook{ ID: "custom-db-failover", Name: "Database Failover", IncidentTypes: []string{"DatabaseDown", "ReplicationLag"}, AutoExecute: false, Steps: []runbook.Step{ {Name: "Check replication", Action: "run_diagnostic", Config: map[string]string{"command": "pg_stat_replication"}}, {Name: "Promote standby", Action: "custom_script", Config: map[string]string{"script": "/opt/scripts/promote-standby.sh"}}, {Name: "Notify DBA team", Action: "notify_oncall", Config: map[string]string{"channel": "#dba-oncall"}}, },
})
executor.RegisterRunbook(runbook.Runbook{ ID: "custom-db-failover", Name: "Database Failover", IncidentTypes: []string{"DatabaseDown", "ReplicationLag"}, AutoExecute: false, Steps: []runbook.Step{ {Name: "Check replication", Action: "run_diagnostic", Config: map[string]string{"command": "pg_stat_replication"}}, {Name: "Promote standby", Action: "custom_script", Config: map[string]string{"script": "/opt/scripts/promote-standby.sh"}}, {Name: "Notify DBA team", Action: "notify_oncall", Config: map[string]string{"channel": "#dba-oncall"}}, },
})
executor.RegisterRunbook(runbook.Runbook{ ID: "custom-db-failover", Name: "Database Failover", IncidentTypes: []string{"DatabaseDown", "ReplicationLag"}, AutoExecute: false, Steps: []runbook.Step{ {Name: "Check replication", Action: "run_diagnostic", Config: map[string]string{"command": "pg_stat_replication"}}, {Name: "Promote standby", Action: "custom_script", Config: map[string]string{"script": "/opt/scripts/promote-standby.sh"}}, {Name: "Notify DBA team", Action: "notify_oncall", Config: map[string]string{"channel": "#dba-oncall"}}, },
})
collectors:
  cicd:
    github_token: "ghp_..."
    repo_filters:
      - "your-org/your-repo"
      - "your-org/another-repo"
    poll_interval: 60s
collectors:
  cicd:
    github_token: "ghp_..."
    repo_filters:
      - "your-org/your-repo"
      - "your-org/another-repo"
    poll_interval: 60s
collectors:
  cicd:
    github_token: "ghp_..."
    repo_filters:
      - "your-org/your-repo"
      - "your-org/another-repo"
    poll_interval: 60s
Client connects → wsHub.add(conn) │
Broadcaster (2s) ───┤──→ JSON to all clients │
Client disconnects → wsHub.remove(conn)
Client connects → wsHub.add(conn) │
Broadcaster (2s) ───┤──→ JSON to all clients │
Client disconnects → wsHub.remove(conn)
Client connects → wsHub.add(conn) │
Broadcaster (2s) ───┤──→ JSON to all clients │
Client disconnects → wsHub.remove(conn)
┌───────────────────────────────────────────────────────┐
│ Dashboard (React) │
│ WebSocket <── REST API (Go) ──> │
├───────────────────────────────────────────────────────┤
│ Engine Core │
│ ┌────────────┐ ┌──────────┐ ┌────────────────┐ │
│ │ Federation │ │ Runbook │ │ AI Intelligence│ │
│ │ Manager │ │ Executor │ │ (AWS Bedrock) │ │
│ └────────────┘ └──────────┘ └────────────────┘ │
├───────────────────────────────────────────────────────┤
│ Collectors │
│ ┌─────┐ ┌─────┐ ┌──────┐ ┌─────┐ ┌───┐ ┌────────┐ │
│ │ K8s │ │Kafka│ │CI/CD │ │Azure│ │GCP│ │Custom │ │
│ │ │ │ │ │GitHub│ │ │ │ │ │ (SDK) │ │
│ └─────┘ └─────┘ └──────┘ └─────┘ └───┘ └────────┘ │
├───────────────────────────────────────────────────────┤
│ Integrations & Export │
│ Slack · PagerDuty · Vault · Prometheus · OTel │
└───────────────────────────────────────────────────────┘
┌───────────────────────────────────────────────────────┐
│ Dashboard (React) │
│ WebSocket <── REST API (Go) ──> │
├───────────────────────────────────────────────────────┤
│ Engine Core │
│ ┌────────────┐ ┌──────────┐ ┌────────────────┐ │
│ │ Federation │ │ Runbook │ │ AI Intelligence│ │
│ │ Manager │ │ Executor │ │ (AWS Bedrock) │ │
│ └────────────┘ └──────────┘ └────────────────┘ │
├───────────────────────────────────────────────────────┤
│ Collectors │
│ ┌─────┐ ┌─────┐ ┌──────┐ ┌─────┐ ┌───┐ ┌────────┐ │
│ │ K8s │ │Kafka│ │CI/CD │ │Azure│ │GCP│ │Custom │ │
│ │ │ │ │ │GitHub│ │ │ │ │ │ (SDK) │ │
│ └─────┘ └─────┘ └──────┘ └─────┘ └───┘ └────────┘ │
├───────────────────────────────────────────────────────┤
│ Integrations & Export │
│ Slack · PagerDuty · Vault · Prometheus · OTel │
└───────────────────────────────────────────────────────┘
┌───────────────────────────────────────────────────────┐
│ Dashboard (React) │
│ WebSocket <── REST API (Go) ──> │
├───────────────────────────────────────────────────────┤
│ Engine Core │
│ ┌────────────┐ ┌──────────┐ ┌────────────────┐ │
│ │ Federation │ │ Runbook │ │ AI Intelligence│ │
│ │ Manager │ │ Executor │ │ (AWS Bedrock) │ │
│ └────────────┘ └──────────┘ └────────────────┘ │
├───────────────────────────────────────────────────────┤
│ Collectors │
│ ┌─────┐ ┌─────┐ ┌──────┐ ┌─────┐ ┌───┐ ┌────────┐ │
│ │ K8s │ │Kafka│ │CI/CD │ │Azure│ │GCP│ │Custom │ │
│ │ │ │ │ │GitHub│ │ │ │ │ │ (SDK) │ │
│ └─────┘ └─────┘ └──────┘ └─────┘ └───┘ └────────┘ │
├───────────────────────────────────────────────────────┤
│ Integrations & Export │
│ Slack · PagerDuty · Vault · Prometheus · OTel │
└───────────────────────────────────────────────────────┘
git clone https://github.com/kronveil/kronveil.git
cd kronveil
docker compose up --build
git clone https://github.com/kronveil/kronveil.git
cd kronveil
docker compose up --build
git clone https://github.com/kronveil/kronveil.git
cd kronveil
docker compose up --build
curl http://localhost:8080/api/v1/health | jq .
curl http://localhost:8080/api/v1/health | jq .
curl http://localhost:8080/api/v1/health | jq .
{
  "status": "healthy",
  "components": [
    {"name": "kubernetes", "status": "healthy"},
    {"name": "kafka", "status": "healthy"},
    {"name": "cicd-collector", "status": "healthy"},
    {"name": "cloud-aws", "status": "healthy"}
  ],
  "uptime": "2m30s"
}
{
  "status": "healthy",
  "components": [
    {"name": "kubernetes", "status": "healthy"},
    {"name": "kafka", "status": "healthy"},
    {"name": "cicd-collector", "status": "healthy"},
    {"name": "cloud-aws", "status": "healthy"}
  ],
  "uptime": "2m30s"
}
{
  "status": "healthy",
  "components": [
    {"name": "kubernetes", "status": "healthy"},
    {"name": "kafka", "status": "healthy"},
    {"name": "cicd-collector", "status": "healthy"},
    {"name": "cloud-aws", "status": "healthy"}
  ],
  "uptime": "2m30s"
}
# Build and push to ECR
docker build -f deploy/Dockerfile.agent -t <account>.dkr.ecr.<region>.amazonaws.com/kronveil/agent:v0.3 .
docker push <account>.dkr.ecr.<region>.amazonaws.com/kronveil/agent:v0.3

# Deploy with Helm
helm install kronveil helm/kronveil/ \
  -n kronveil --create-namespace \
  -f values-prod.yaml
# Build and push to ECR
docker build -f deploy/Dockerfile.agent -t <account>.dkr.ecr.<region>.amazonaws.com/kronveil/agent:v0.3 .
docker push <account>.dkr.ecr.<region>.amazonaws.com/kronveil/agent:v0.3

# Deploy with Helm
helm install kronveil helm/kronveil/ \
  -n kronveil --create-namespace \
  -f values-prod.yaml
# Build and push to ECR
docker build -f deploy/Dockerfile.agent -t <account>.dkr.ecr.<region>.amazonaws.com/kronveil/agent:v0.3 .
docker push <account>.dkr.ecr.<region>.amazonaws.com/kronveil/agent:v0.3

# Deploy with Helm
helm install kronveil helm/kronveil/ \
  -n kronveil --create-namespace \
  -f values-prod.yaml

- Polling loop at your configured interval
- Immediate first collect (no waiting for the first tick)
- Buffered channel with drop + warn when full
- Health status combining Healthcheck() result with recent collect errors
- Thread-safe start/stop lifecycle

- Auth: azidentity.DefaultAzureCredential — supports managed identity, CLI, environment variables
- Metrics: Azure Monitor azquery.MetricsClient queries CPU, memory, disk, and network
- Resources: ARM armresources.Client with full pagination support
- Config: Set AZURE_SUBSCRIPTION_ID and standard Azure credentials

- Auth: Application Default Credentials (ADC)
- Metrics: Cloud Monitoring ListTimeSeries with 5-minute lookback window
- Resources: Cloud Asset SearchAllResources for inventory
- Config: Set GCP_PROJECT_ID or GOOGLE_CLOUD_PROJECT

- useWebSocket — generic hook with auto-reconnect and exponential backoff (1s to 30s)
- useEventStream — wraps WebSocket for the events endpoint, maintains a 100-event rolling buffer, provides memoized filtered views for incidents, anomalies, and all events

- Total runbooks
- Auto-execute count
- Executions in last 24 hours
- Average success rate

- Name and description
- Auto/manual execution badge (green dot for auto, gray for manual)
- Incident type tags
- Step count, last run time, success rate
- Recent run indicators — green and red dots for the last 3 executions

- Live runbook execution — move from dry-run to real kubectl and script execution with approval gates
- Collector marketplace — share and install community-built collectors via the SDK
- Cross-cluster incident correlation — AI-powered correlation across federated clusters
- Dashboard runbook triggers — execute runbooks directly from the UI
- Grafana plugin — embed Kronveil panels in existing Grafana dashboards

- GitHub: github.com/kronveil/kronveil
- v0.1 post: I Built an AI-Powered Infrastructure Observability Agent from Scratch
- v0.2 post: Kronveil v0.2: Dashboard, gRPC, Secret Management, and Local Deployment