// meta-microservice-exporter.go
// Prometheus 3.0 compatible exporter for Meta's internal microservice fleet
// Implements custom metrics for RPC latency, queue depth, and error rates
package main import ( "context" "encoding/json" "errors" "fmt" "log" "net/http" "os" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" metav1 "github.com/prometheus/prometheus/model/v3/pkg/apis/meta/v1" "github.com/prometheus/prometheus/pkg/v3/ebpf/discovery"
) // Define custom metrics
var ( rpcLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "meta_microservice_rpc_latency_ms", Help: "RPC latency in milliseconds for Meta internal microservices", Buckets: prometheus.DefBuckets, }, []string{"-weight: 500;">service", "endpoint", "region"}) queueDepth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "meta_microservice_queue_depth", Help: "Current depth of task queues per microservice instance", }, []string{"-weight: 500;">service", "queue_name", "instance_id"}) errorRate = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "meta_microservice_error_total", Help: "Total number of errors per microservice endpoint", }, []string{"-weight: 500;">service", "endpoint", "error_code"})
) // serviceDiscovery uses Prometheus 3.0's eBPF discovery to find microservice instances
type serviceDiscovery struct { discoverer *discovery.EBPFDiscoverer cache map[string][]string // -weight: 500;">service name -> instance IDs
} // newServiceDiscovery initializes eBPF-based -weight: 500;">service discovery for Prometheus 3.0
func newServiceDiscovery() (*serviceDiscovery, error) { discoverer, err := discovery.NewEBPFDiscoverer(discovery.EBPFConfig{ EnableTLS: true, CertPath: "/etc/meta/certs/ebpf.pem", KeyPath: "/etc/meta/certs/ebpf-key.pem", CacheTimeout: 30 * time.Second, }) if err != nil { return nil, fmt.Errorf("failed to initialize eBPF discoverer: %w", err) } return &serviceDiscovery{ discoverer: discoverer, cache: make(map[string][]string), }, nil
} // scrapeMetrics fetches metrics from discovered microservice instances
func scrapeMetrics(ctx context.Context, sd *serviceDiscovery) error { instances, err := sd.discoverer.Discover(ctx) if err != nil { return fmt.Errorf("-weight: 500;">service discovery failed: %w", err) } for _, inst := range instances { // Skip instances in maintenance mode if inst.Labels["maintenance"] == "true" { log.Printf("Skipping instance %s in maintenance", inst.ID) continue } // Fetch RPC latency metrics latency, err := fetchRPCLatency(inst) if err != nil { log.Printf("Failed to fetch RPC latency for %s: %v", inst.ID, err) continue } rpcLatency.WithLabelValues(inst.Labels["-weight: 500;">service"], inst.Labels["endpoint"], inst.Labels["region"]).Observe(latency) // Fetch queue depth metrics depth, err := fetchQueueDepth(inst) if err != nil { log.Printf("Failed to fetch queue depth for %s: %v", inst.ID, err) continue } queueDepth.WithLabelValues(inst.Labels["-weight: 500;">service"], inst.Labels["queue_name"], inst.ID).Set(depth) // Fetch error rate metrics errCount, err := fetchErrorCount(inst) if err != nil { log.Printf("Failed to fetch error count for %s: %v", inst.ID, err) continue } errorRate.WithLabelValues(inst.Labels["-weight: 500;">service"], inst.Labels["endpoint"], inst.Labels["error_code"]).Add(errCount) } return nil
} // fetchRPCLatency mocks a real RPC call to a microservice instance
// In production, this would hit the instance's /metrics endpoint
func fetchRPCLatency(inst *discovery.Instance) (float64, error) { // Simulate network error 1% of the time if time.Now().UnixNano()%100 == 0 { return 0, errors.New("simulated network timeout") } // Mock latency between 10ms and 500ms return 10 + float64(time.Now().UnixNano()%490), nil
} // fetchQueueDepth mocks queue depth fetch
func fetchQueueDepth(inst *discovery.Instance) (float64, error) { // Mock queue depth between 0 and 1000 return float64(time.Now().UnixNano() % 1000), nil
} // fetchErrorCount mocks error count fetch
func fetchErrorCount(inst *discovery.Instance) (float64, error) { // Mock 0-5 errors per scrape return float64(time.Now().UnixNano() % 5), nil
} func main() { // Register metrics with Prometheus prometheus.MustRegister(rpcLatency, queueDepth, errorRate) // Initialize -weight: 500;">service discovery sd, err := newServiceDiscovery() if err != nil { log.Fatalf("Failed to initialize -weight: 500;">service discovery: %v", err) } // Start metrics scraping goroutine go func() { ticker := time.NewTicker(15 * time.Second) defer ticker.Stop() for { select { case <-ticker.C: ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := scrapeMetrics(ctx, sd); err != nil { log.Printf("Metrics scrape failed: %v", err) } } } }() // Expose metrics endpoint http.Handle("/metrics", promhttp.Handler()) log.Println("Starting exporter on :9090") if err := http.ListenAndServe(":9090", nil); err != nil { log.Fatalf("HTTP server failed: %v", err) }
}
// meta-microservice-exporter.go
// Prometheus 3.0 compatible exporter for Meta's internal microservice fleet
// Implements custom metrics for RPC latency, queue depth, and error rates
package main import ( "context" "encoding/json" "errors" "fmt" "log" "net/http" "os" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" metav1 "github.com/prometheus/prometheus/model/v3/pkg/apis/meta/v1" "github.com/prometheus/prometheus/pkg/v3/ebpf/discovery"
) // Define custom metrics
var ( rpcLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "meta_microservice_rpc_latency_ms", Help: "RPC latency in milliseconds for Meta internal microservices", Buckets: prometheus.DefBuckets, }, []string{"-weight: 500;">service", "endpoint", "region"}) queueDepth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "meta_microservice_queue_depth", Help: "Current depth of task queues per microservice instance", }, []string{"-weight: 500;">service", "queue_name", "instance_id"}) errorRate = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "meta_microservice_error_total", Help: "Total number of errors per microservice endpoint", }, []string{"-weight: 500;">service", "endpoint", "error_code"})
) // serviceDiscovery uses Prometheus 3.0's eBPF discovery to find microservice instances
type serviceDiscovery struct { discoverer *discovery.EBPFDiscoverer cache map[string][]string // -weight: 500;">service name -> instance IDs
} // newServiceDiscovery initializes eBPF-based -weight: 500;">service discovery for Prometheus 3.0
func newServiceDiscovery() (*serviceDiscovery, error) { discoverer, err := discovery.NewEBPFDiscoverer(discovery.EBPFConfig{ EnableTLS: true, CertPath: "/etc/meta/certs/ebpf.pem", KeyPath: "/etc/meta/certs/ebpf-key.pem", CacheTimeout: 30 * time.Second, }) if err != nil { return nil, fmt.Errorf("failed to initialize eBPF discoverer: %w", err) } return &serviceDiscovery{ discoverer: discoverer, cache: make(map[string][]string), }, nil
} // scrapeMetrics fetches metrics from discovered microservice instances
func scrapeMetrics(ctx context.Context, sd *serviceDiscovery) error { instances, err := sd.discoverer.Discover(ctx) if err != nil { return fmt.Errorf("-weight: 500;">service discovery failed: %w", err) } for _, inst := range instances { // Skip instances in maintenance mode if inst.Labels["maintenance"] == "true" { log.Printf("Skipping instance %s in maintenance", inst.ID) continue } // Fetch RPC latency metrics latency, err := fetchRPCLatency(inst) if err != nil { log.Printf("Failed to fetch RPC latency for %s: %v", inst.ID, err) continue } rpcLatency.WithLabelValues(inst.Labels["-weight: 500;">service"], inst.Labels["endpoint"], inst.Labels["region"]).Observe(latency) // Fetch queue depth metrics depth, err := fetchQueueDepth(inst) if err != nil { log.Printf("Failed to fetch queue depth for %s: %v", inst.ID, err) continue } queueDepth.WithLabelValues(inst.Labels["-weight: 500;">service"], inst.Labels["queue_name"], inst.ID).Set(depth) // Fetch error rate metrics errCount, err := fetchErrorCount(inst) if err != nil { log.Printf("Failed to fetch error count for %s: %v", inst.ID, err) continue } errorRate.WithLabelValues(inst.Labels["-weight: 500;">service"], inst.Labels["endpoint"], inst.Labels["error_code"]).Add(errCount) } return nil
} // fetchRPCLatency mocks a real RPC call to a microservice instance
// In production, this would hit the instance's /metrics endpoint
func fetchRPCLatency(inst *discovery.Instance) (float64, error) { // Simulate network error 1% of the time if time.Now().UnixNano()%100 == 0 { return 0, errors.New("simulated network timeout") } // Mock latency between 10ms and 500ms return 10 + float64(time.Now().UnixNano()%490), nil
} // fetchQueueDepth mocks queue depth fetch
func fetchQueueDepth(inst *discovery.Instance) (float64, error) { // Mock queue depth between 0 and 1000 return float64(time.Now().UnixNano() % 1000), nil
} // fetchErrorCount mocks error count fetch
func fetchErrorCount(inst *discovery.Instance) (float64, error) { // Mock 0-5 errors per scrape return float64(time.Now().UnixNano() % 5), nil
} func main() { // Register metrics with Prometheus prometheus.MustRegister(rpcLatency, queueDepth, errorRate) // Initialize -weight: 500;">service discovery sd, err := newServiceDiscovery() if err != nil { log.Fatalf("Failed to initialize -weight: 500;">service discovery: %v", err) } // Start metrics scraping goroutine go func() { ticker := time.NewTicker(15 * time.Second) defer ticker.Stop() for { select { case <-ticker.C: ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := scrapeMetrics(ctx, sd); err != nil { log.Printf("Metrics scrape failed: %v", err) } } } }() // Expose metrics endpoint http.Handle("/metrics", promhttp.Handler()) log.Println("Starting exporter on :9090") if err := http.ListenAndServe(":9090", nil); err != nil { log.Fatalf("HTTP server failed: %v", err) }
}
// meta-microservice-exporter.go
// Prometheus 3.0 compatible exporter for Meta's internal microservice fleet
// Implements custom metrics for RPC latency, queue depth, and error rates
package main import ( "context" "encoding/json" "errors" "fmt" "log" "net/http" "os" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" metav1 "github.com/prometheus/prometheus/model/v3/pkg/apis/meta/v1" "github.com/prometheus/prometheus/pkg/v3/ebpf/discovery"
) // Define custom metrics
var ( rpcLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "meta_microservice_rpc_latency_ms", Help: "RPC latency in milliseconds for Meta internal microservices", Buckets: prometheus.DefBuckets, }, []string{"-weight: 500;">service", "endpoint", "region"}) queueDepth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "meta_microservice_queue_depth", Help: "Current depth of task queues per microservice instance", }, []string{"-weight: 500;">service", "queue_name", "instance_id"}) errorRate = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "meta_microservice_error_total", Help: "Total number of errors per microservice endpoint", }, []string{"-weight: 500;">service", "endpoint", "error_code"})
) // serviceDiscovery uses Prometheus 3.0's eBPF discovery to find microservice instances
type serviceDiscovery struct { discoverer *discovery.EBPFDiscoverer cache map[string][]string // -weight: 500;">service name -> instance IDs
} // newServiceDiscovery initializes eBPF-based -weight: 500;">service discovery for Prometheus 3.0
func newServiceDiscovery() (*serviceDiscovery, error) { discoverer, err := discovery.NewEBPFDiscoverer(discovery.EBPFConfig{ EnableTLS: true, CertPath: "/etc/meta/certs/ebpf.pem", KeyPath: "/etc/meta/certs/ebpf-key.pem", CacheTimeout: 30 * time.Second, }) if err != nil { return nil, fmt.Errorf("failed to initialize eBPF discoverer: %w", err) } return &serviceDiscovery{ discoverer: discoverer, cache: make(map[string][]string), }, nil
} // scrapeMetrics fetches metrics from discovered microservice instances
func scrapeMetrics(ctx context.Context, sd *serviceDiscovery) error { instances, err := sd.discoverer.Discover(ctx) if err != nil { return fmt.Errorf("-weight: 500;">service discovery failed: %w", err) } for _, inst := range instances { // Skip instances in maintenance mode if inst.Labels["maintenance"] == "true" { log.Printf("Skipping instance %s in maintenance", inst.ID) continue } // Fetch RPC latency metrics latency, err := fetchRPCLatency(inst) if err != nil { log.Printf("Failed to fetch RPC latency for %s: %v", inst.ID, err) continue } rpcLatency.WithLabelValues(inst.Labels["-weight: 500;">service"], inst.Labels["endpoint"], inst.Labels["region"]).Observe(latency) // Fetch queue depth metrics depth, err := fetchQueueDepth(inst) if err != nil { log.Printf("Failed to fetch queue depth for %s: %v", inst.ID, err) continue } queueDepth.WithLabelValues(inst.Labels["-weight: 500;">service"], inst.Labels["queue_name"], inst.ID).Set(depth) // Fetch error rate metrics errCount, err := fetchErrorCount(inst) if err != nil { log.Printf("Failed to fetch error count for %s: %v", inst.ID, err) continue } errorRate.WithLabelValues(inst.Labels["-weight: 500;">service"], inst.Labels["endpoint"], inst.Labels["error_code"]).Add(errCount) } return nil
} // fetchRPCLatency mocks a real RPC call to a microservice instance
// In production, this would hit the instance's /metrics endpoint
func fetchRPCLatency(inst *discovery.Instance) (float64, error) { // Simulate network error 1% of the time if time.Now().UnixNano()%100 == 0 { return 0, errors.New("simulated network timeout") } // Mock latency between 10ms and 500ms return 10 + float64(time.Now().UnixNano()%490), nil
} // fetchQueueDepth mocks queue depth fetch
func fetchQueueDepth(inst *discovery.Instance) (float64, error) { // Mock queue depth between 0 and 1000 return float64(time.Now().UnixNano() % 1000), nil
} // fetchErrorCount mocks error count fetch
func fetchErrorCount(inst *discovery.Instance) (float64, error) { // Mock 0-5 errors per scrape return float64(time.Now().UnixNano() % 5), nil
} func main() { // Register metrics with Prometheus prometheus.MustRegister(rpcLatency, queueDepth, errorRate) // Initialize -weight: 500;">service discovery sd, err := newServiceDiscovery() if err != nil { log.Fatalf("Failed to initialize -weight: 500;">service discovery: %v", err) } // Start metrics scraping goroutine go func() { ticker := time.NewTicker(15 * time.Second) defer ticker.Stop() for { select { case <-ticker.C: ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if err := scrapeMetrics(ctx, sd); err != nil { log.Printf("Metrics scrape failed: %v", err) } } } }() // Expose metrics endpoint http.Handle("/metrics", promhttp.Handler()) log.Println("Starting exporter on :9090") if err := http.ListenAndServe(":9090", nil); err != nil { log.Fatalf("HTTP server failed: %v", err) }
}
"""
grafana_provision.py
Provision Meta's DevOps dashboard in Grafana 12.0 via API
Includes data source configuration, dashboard JSON, and alert rules
""" import json
import logging
import os
import sys
import time
from typing import Dict, List, Optional import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry # Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Grafana 12.0 API configuration
GRAFANA_URL = os.getenv("GRAFANA_URL", "https://grafana.meta.internal")
GRAFANA_API_KEY = os.getenv("GRAFANA_API_KEY")
if not GRAFANA_API_KEY:
    logger.error("GRAFANA_API_KEY environment variable not set")
    sys.exit(1)

# Prometheus 3.0 data source configuration
PROMETHEUS_URL = os.getenv("PROMETHEUS_URL", "https://prometheus.meta.internal:9090")


def create_session() -> requests.Session:
    """Create a requests session with retry logic for transient errors."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST", "PUT", "DELETE"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    # NOTE: the attribute name here was corrupted by an HTML-extraction
    # artifact ("headers.-weight: 500;\">update"); restored to headers.update.
    session.headers.update({
        "Authorization": f"Bearer {GRAFANA_API_KEY}",
        "Content-Type": "application/json",
    })
    return session


def provision_prometheus_datasource(session: requests.Session) -> Optional[str]:
    """Provision Prometheus 3.0 as a data source in Grafana 12.0.

    Returns the data source UID on success (existing or newly created),
    or None on failure.
    """
    datasource_payload = {
        "name": "Meta-Prometheus-3.0",
        "type": "prometheus",
        "url": PROMETHEUS_URL,
        "access": "proxy",
        "basicAuth": False,
        "jsonData": {
            "httpMethod": "POST",
            "prometheusVersion": "3.0.0",
            "enableZoom": True,
            "retentionPeriod": "30d",
            "ebpfDiscoveryEnabled": True,
        },
        "secureJsonData": {
            "tlsCACert": os.getenv("PROMETHEUS_CA_CERT", ""),
            "tlsClientCert": os.getenv("PROMETHEUS_CLIENT_CERT", ""),
            "tlsClientKey": os.getenv("PROMETHEUS_CLIENT_KEY", ""),
        },
    }
    try:
        # Check if data source already exists (keeps re-runs idempotent)
        resp = session.get(f"{GRAFANA_URL}/api/datasources/name/Meta-Prometheus-3.0")
        if resp.status_code == 200:
            ds = resp.json()
            logger.info(f"Prometheus data source already exists with UID: {ds['uid']}")
            return ds["uid"]
        # Create new data source
        resp = session.post(f"{GRAFANA_URL}/api/datasources", json=datasource_payload)
        resp.raise_for_status()
        ds = resp.json()
        logger.info(f"Provisioned Prometheus data source with UID: {ds['datasource']['uid']}")
        return ds["datasource"]["uid"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to provision Prometheus data source: {e}")
        return None


def provision_dashboard(session: requests.Session, datasource_uid: str) -> Optional[str]:
    """Provision the main DevOps dashboard in Grafana 12.0.

    Returns the dashboard UID, or None on failure. PromQL expressions and
    legend templates had extraction garbage spliced into the "service"
    identifier; they are restored here.
    """
    dashboard_json = {
        "dashboard": {
            "id": None,
            "uid": "meta-devops-dashboard",
            "title": "Meta DevOps Overview",
            "tags": ["meta", "devops", "prometheus-3.0", "grafana-12.0"],
            "timezone": "utc",
            "refresh": "30s",
            "panels": [
                {
                    "id": 1,
                    "title": "RPC Latency (p99)",
                    "type": "timeseries",
                    "datasource": {"uid": datasource_uid},
                    "targets": [{
                        "expr": "histogram_quantile(0.99, sum(rate(meta_microservice_rpc_latency_ms_bucket[5m])) by (le, service))",
                        "legendFormat": "{{service}}",
                        "refId": "A",
                    }],
                    "fieldConfig": {
                        "defaults": {
                            "unit": "ms",
                            "thresholds": {
                                "steps": [
                                    {"color": "green", "value": None},
                                    {"color": "yellow", "value": 100},
                                    {"color": "red", "value": 500},
                                ]
                            },
                        }
                    },
                },
                {
                    "id": 2,
                    "title": "Queue Depth (Total)",
                    "type": "stat",
                    "datasource": {"uid": datasource_uid},
                    "targets": [{
                        "expr": "sum(meta_microservice_queue_depth) by (service)",
                        "legendFormat": "{{service}}",
                        "refId": "A",
                    }],
                },
                {
                    "id": 3,
                    "title": "Error Rate (1m Rate)",
                    "type": "timeseries",
                    "datasource": {"uid": datasource_uid},
                    "targets": [{
                        "expr": "sum(rate(meta_microservice_error_total[1m])) by (service, error_code)",
                        "legendFormat": "{{service}} - {{error_code}}",
                        "refId": "A",
                    }],
                },
            ],
        },
        "overwrite": True,
    }
    try:
        resp = session.post(f"{GRAFANA_URL}/api/dashboards/db", json=dashboard_json)
        resp.raise_for_status()
        result = resp.json()
        logger.info(f"Provisioned dashboard with UID: {result['uid']}")
        return result["uid"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to provision dashboard: {e}")
        return None


def provision_alert_rules(session: requests.Session, datasource_uid: str) -> bool:
    """Provision Grafana 12.0 unified alerting rules for the dashboard.

    Returns True on success, False on failure.
    """
    alert_rules = {
        "name": "Meta-DevOps-Alerts",
        "interval": "30s",
        "rules": [
            {
                "uid": "meta-rpc-latency-alert",
                "title": "High RPC Latency (p99 > 500ms)",
                "condition": "A",
                "data": [{
                    "refId": "A",
                    "datasourceUid": datasource_uid,
                    "model": {
                        "expr": "histogram_quantile(0.99, sum(rate(meta_microservice_rpc_latency_ms_bucket[5m])) by (le, service)) > 500",
                        "refId": "A",
                    },
                }],
                "for": "2m",
                "annotations": {
                    "summary": "High RPC latency detected for service {{ $labels.service }}",
                    "description": "p99 RPC latency for {{ $labels.service }} is {{ $values.A.Value }}ms, exceeding threshold of 500ms",
                },
                "labels": {
                    "severity": "critical",
                    # Single-quoted so the double quotes inside the Grafana
                    # template are valid Python (the original nested unescaped
                    # double quotes inside a double-quoted string).
                    "team": '{{ $labels.service | regexReplaceAll "^meta-(.*)-service$" "$1" }}',
                },
            }
        ],
    }
    try:
        resp = session.put(f"{GRAFANA_URL}/api/v1/provisioning/alert-rules", json=alert_rules)
        resp.raise_for_status()
        logger.info("Provisioned Grafana 12.0 alert rules successfully")
        return True
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to provision alert rules: {e}")
        return False


def main() -> None:
    """Provision data source, dashboard, and alert rules in sequence.

    Exits with status 1 on the first failed step.
    """
    session = create_session()

    ds_uid = provision_prometheus_datasource(session)
    if not ds_uid:
        logger.error("Failed to provision data source, exiting")
        sys.exit(1)

    dashboard_uid = provision_dashboard(session, ds_uid)
    if not dashboard_uid:
        logger.error("Failed to provision dashboard, exiting")
        sys.exit(1)

    if not provision_alert_rules(session, ds_uid):
        logger.error("Failed to provision alert rules, exiting")
        sys.exit(1)

    logger.info("All Grafana 12.0 resources provisioned successfully")


if __name__ == "__main__":
    main()
"""
grafana_provision.py
Provision Meta's DevOps dashboard in Grafana 12.0 via API
Includes data source configuration, dashboard JSON, and alert rules
""" import json
import logging
import os
import sys
import time
from typing import Dict, List, Optional import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry # Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Grafana 12.0 API configuration
GRAFANA_URL = os.getenv("GRAFANA_URL", "https://grafana.meta.internal")
GRAFANA_API_KEY = os.getenv("GRAFANA_API_KEY")
if not GRAFANA_API_KEY:
    logger.error("GRAFANA_API_KEY environment variable not set")
    sys.exit(1)

# Prometheus 3.0 data source configuration
PROMETHEUS_URL = os.getenv("PROMETHEUS_URL", "https://prometheus.meta.internal:9090")


def create_session() -> requests.Session:
    """Create a requests session with retry logic for transient errors."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST", "PUT", "DELETE"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    # NOTE: the attribute name here was corrupted by an HTML-extraction
    # artifact ("headers.-weight: 500;\">update"); restored to headers.update.
    session.headers.update({
        "Authorization": f"Bearer {GRAFANA_API_KEY}",
        "Content-Type": "application/json",
    })
    return session


def provision_prometheus_datasource(session: requests.Session) -> Optional[str]:
    """Provision Prometheus 3.0 as a data source in Grafana 12.0.

    Returns the data source UID on success (existing or newly created),
    or None on failure.
    """
    datasource_payload = {
        "name": "Meta-Prometheus-3.0",
        "type": "prometheus",
        "url": PROMETHEUS_URL,
        "access": "proxy",
        "basicAuth": False,
        "jsonData": {
            "httpMethod": "POST",
            "prometheusVersion": "3.0.0",
            "enableZoom": True,
            "retentionPeriod": "30d",
            "ebpfDiscoveryEnabled": True,
        },
        "secureJsonData": {
            "tlsCACert": os.getenv("PROMETHEUS_CA_CERT", ""),
            "tlsClientCert": os.getenv("PROMETHEUS_CLIENT_CERT", ""),
            "tlsClientKey": os.getenv("PROMETHEUS_CLIENT_KEY", ""),
        },
    }
    try:
        # Check if data source already exists (keeps re-runs idempotent)
        resp = session.get(f"{GRAFANA_URL}/api/datasources/name/Meta-Prometheus-3.0")
        if resp.status_code == 200:
            ds = resp.json()
            logger.info(f"Prometheus data source already exists with UID: {ds['uid']}")
            return ds["uid"]
        # Create new data source
        resp = session.post(f"{GRAFANA_URL}/api/datasources", json=datasource_payload)
        resp.raise_for_status()
        ds = resp.json()
        logger.info(f"Provisioned Prometheus data source with UID: {ds['datasource']['uid']}")
        return ds["datasource"]["uid"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to provision Prometheus data source: {e}")
        return None


def provision_dashboard(session: requests.Session, datasource_uid: str) -> Optional[str]:
    """Provision the main DevOps dashboard in Grafana 12.0.

    Returns the dashboard UID, or None on failure. PromQL expressions and
    legend templates had extraction garbage spliced into the "service"
    identifier; they are restored here.
    """
    dashboard_json = {
        "dashboard": {
            "id": None,
            "uid": "meta-devops-dashboard",
            "title": "Meta DevOps Overview",
            "tags": ["meta", "devops", "prometheus-3.0", "grafana-12.0"],
            "timezone": "utc",
            "refresh": "30s",
            "panels": [
                {
                    "id": 1,
                    "title": "RPC Latency (p99)",
                    "type": "timeseries",
                    "datasource": {"uid": datasource_uid},
                    "targets": [{
                        "expr": "histogram_quantile(0.99, sum(rate(meta_microservice_rpc_latency_ms_bucket[5m])) by (le, service))",
                        "legendFormat": "{{service}}",
                        "refId": "A",
                    }],
                    "fieldConfig": {
                        "defaults": {
                            "unit": "ms",
                            "thresholds": {
                                "steps": [
                                    {"color": "green", "value": None},
                                    {"color": "yellow", "value": 100},
                                    {"color": "red", "value": 500},
                                ]
                            },
                        }
                    },
                },
                {
                    "id": 2,
                    "title": "Queue Depth (Total)",
                    "type": "stat",
                    "datasource": {"uid": datasource_uid},
                    "targets": [{
                        "expr": "sum(meta_microservice_queue_depth) by (service)",
                        "legendFormat": "{{service}}",
                        "refId": "A",
                    }],
                },
                {
                    "id": 3,
                    "title": "Error Rate (1m Rate)",
                    "type": "timeseries",
                    "datasource": {"uid": datasource_uid},
                    "targets": [{
                        "expr": "sum(rate(meta_microservice_error_total[1m])) by (service, error_code)",
                        "legendFormat": "{{service}} - {{error_code}}",
                        "refId": "A",
                    }],
                },
            ],
        },
        "overwrite": True,
    }
    try:
        resp = session.post(f"{GRAFANA_URL}/api/dashboards/db", json=dashboard_json)
        resp.raise_for_status()
        result = resp.json()
        logger.info(f"Provisioned dashboard with UID: {result['uid']}")
        return result["uid"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to provision dashboard: {e}")
        return None


def provision_alert_rules(session: requests.Session, datasource_uid: str) -> bool:
    """Provision Grafana 12.0 unified alerting rules for the dashboard.

    Returns True on success, False on failure.
    """
    alert_rules = {
        "name": "Meta-DevOps-Alerts",
        "interval": "30s",
        "rules": [
            {
                "uid": "meta-rpc-latency-alert",
                "title": "High RPC Latency (p99 > 500ms)",
                "condition": "A",
                "data": [{
                    "refId": "A",
                    "datasourceUid": datasource_uid,
                    "model": {
                        "expr": "histogram_quantile(0.99, sum(rate(meta_microservice_rpc_latency_ms_bucket[5m])) by (le, service)) > 500",
                        "refId": "A",
                    },
                }],
                "for": "2m",
                "annotations": {
                    "summary": "High RPC latency detected for service {{ $labels.service }}",
                    "description": "p99 RPC latency for {{ $labels.service }} is {{ $values.A.Value }}ms, exceeding threshold of 500ms",
                },
                "labels": {
                    "severity": "critical",
                    # Single-quoted so the double quotes inside the Grafana
                    # template are valid Python (the original nested unescaped
                    # double quotes inside a double-quoted string).
                    "team": '{{ $labels.service | regexReplaceAll "^meta-(.*)-service$" "$1" }}',
                },
            }
        ],
    }
    try:
        resp = session.put(f"{GRAFANA_URL}/api/v1/provisioning/alert-rules", json=alert_rules)
        resp.raise_for_status()
        logger.info("Provisioned Grafana 12.0 alert rules successfully")
        return True
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to provision alert rules: {e}")
        return False


def main() -> None:
    """Provision data source, dashboard, and alert rules in sequence.

    Exits with status 1 on the first failed step.
    """
    session = create_session()

    ds_uid = provision_prometheus_datasource(session)
    if not ds_uid:
        logger.error("Failed to provision data source, exiting")
        sys.exit(1)

    dashboard_uid = provision_dashboard(session, ds_uid)
    if not dashboard_uid:
        logger.error("Failed to provision dashboard, exiting")
        sys.exit(1)

    if not provision_alert_rules(session, ds_uid):
        logger.error("Failed to provision alert rules, exiting")
        sys.exit(1)

    logger.info("All Grafana 12.0 resources provisioned successfully")


if __name__ == "__main__":
    main()
"""
grafana_provision.py
Provision Meta's DevOps dashboard in Grafana 12.0 via API
Includes data source configuration, dashboard JSON, and alert rules
""" import json
import logging
import os
import sys
import time
from typing import Dict, List, Optional import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry # Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Grafana 12.0 API configuration
GRAFANA_URL = os.getenv("GRAFANA_URL", "https://grafana.meta.internal")
GRAFANA_API_KEY = os.getenv("GRAFANA_API_KEY")
if not GRAFANA_API_KEY:
    logger.error("GRAFANA_API_KEY environment variable not set")
    sys.exit(1)

# Prometheus 3.0 data source configuration
PROMETHEUS_URL = os.getenv("PROMETHEUS_URL", "https://prometheus.meta.internal:9090")


def create_session() -> requests.Session:
    """Create a requests session with retry logic for transient errors."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST", "PUT", "DELETE"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    # NOTE: the attribute name here was corrupted by an HTML-extraction
    # artifact ("headers.-weight: 500;\">update"); restored to headers.update.
    session.headers.update({
        "Authorization": f"Bearer {GRAFANA_API_KEY}",
        "Content-Type": "application/json",
    })
    return session


def provision_prometheus_datasource(session: requests.Session) -> Optional[str]:
    """Provision Prometheus 3.0 as a data source in Grafana 12.0.

    Returns the data source UID on success (existing or newly created),
    or None on failure.
    """
    datasource_payload = {
        "name": "Meta-Prometheus-3.0",
        "type": "prometheus",
        "url": PROMETHEUS_URL,
        "access": "proxy",
        "basicAuth": False,
        "jsonData": {
            "httpMethod": "POST",
            "prometheusVersion": "3.0.0",
            "enableZoom": True,
            "retentionPeriod": "30d",
            "ebpfDiscoveryEnabled": True,
        },
        "secureJsonData": {
            "tlsCACert": os.getenv("PROMETHEUS_CA_CERT", ""),
            "tlsClientCert": os.getenv("PROMETHEUS_CLIENT_CERT", ""),
            "tlsClientKey": os.getenv("PROMETHEUS_CLIENT_KEY", ""),
        },
    }
    try:
        # Check if data source already exists (keeps re-runs idempotent)
        resp = session.get(f"{GRAFANA_URL}/api/datasources/name/Meta-Prometheus-3.0")
        if resp.status_code == 200:
            ds = resp.json()
            logger.info(f"Prometheus data source already exists with UID: {ds['uid']}")
            return ds["uid"]
        # Create new data source
        resp = session.post(f"{GRAFANA_URL}/api/datasources", json=datasource_payload)
        resp.raise_for_status()
        ds = resp.json()
        logger.info(f"Provisioned Prometheus data source with UID: {ds['datasource']['uid']}")
        return ds["datasource"]["uid"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to provision Prometheus data source: {e}")
        return None


def provision_dashboard(session: requests.Session, datasource_uid: str) -> Optional[str]:
    """Provision the main DevOps dashboard in Grafana 12.0.

    Returns the dashboard UID, or None on failure. PromQL expressions and
    legend templates had extraction garbage spliced into the "service"
    identifier; they are restored here.
    """
    dashboard_json = {
        "dashboard": {
            "id": None,
            "uid": "meta-devops-dashboard",
            "title": "Meta DevOps Overview",
            "tags": ["meta", "devops", "prometheus-3.0", "grafana-12.0"],
            "timezone": "utc",
            "refresh": "30s",
            "panels": [
                {
                    "id": 1,
                    "title": "RPC Latency (p99)",
                    "type": "timeseries",
                    "datasource": {"uid": datasource_uid},
                    "targets": [{
                        "expr": "histogram_quantile(0.99, sum(rate(meta_microservice_rpc_latency_ms_bucket[5m])) by (le, service))",
                        "legendFormat": "{{service}}",
                        "refId": "A",
                    }],
                    "fieldConfig": {
                        "defaults": {
                            "unit": "ms",
                            "thresholds": {
                                "steps": [
                                    {"color": "green", "value": None},
                                    {"color": "yellow", "value": 100},
                                    {"color": "red", "value": 500},
                                ]
                            },
                        }
                    },
                },
                {
                    "id": 2,
                    "title": "Queue Depth (Total)",
                    "type": "stat",
                    "datasource": {"uid": datasource_uid},
                    "targets": [{
                        "expr": "sum(meta_microservice_queue_depth) by (service)",
                        "legendFormat": "{{service}}",
                        "refId": "A",
                    }],
                },
                {
                    "id": 3,
                    "title": "Error Rate (1m Rate)",
                    "type": "timeseries",
                    "datasource": {"uid": datasource_uid},
                    "targets": [{
                        "expr": "sum(rate(meta_microservice_error_total[1m])) by (service, error_code)",
                        "legendFormat": "{{service}} - {{error_code}}",
                        "refId": "A",
                    }],
                },
            ],
        },
        "overwrite": True,
    }
    try:
        resp = session.post(f"{GRAFANA_URL}/api/dashboards/db", json=dashboard_json)
        resp.raise_for_status()
        result = resp.json()
        logger.info(f"Provisioned dashboard with UID: {result['uid']}")
        return result["uid"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to provision dashboard: {e}")
        return None


def provision_alert_rules(session: requests.Session, datasource_uid: str) -> bool:
    """Provision Grafana 12.0 unified alerting rules for the dashboard.

    Returns True on success, False on failure.
    """
    alert_rules = {
        "name": "Meta-DevOps-Alerts",
        "interval": "30s",
        "rules": [
            {
                "uid": "meta-rpc-latency-alert",
                "title": "High RPC Latency (p99 > 500ms)",
                "condition": "A",
                "data": [{
                    "refId": "A",
                    "datasourceUid": datasource_uid,
                    "model": {
                        "expr": "histogram_quantile(0.99, sum(rate(meta_microservice_rpc_latency_ms_bucket[5m])) by (le, service)) > 500",
                        "refId": "A",
                    },
                }],
                "for": "2m",
                "annotations": {
                    "summary": "High RPC latency detected for service {{ $labels.service }}",
                    "description": "p99 RPC latency for {{ $labels.service }} is {{ $values.A.Value }}ms, exceeding threshold of 500ms",
                },
                "labels": {
                    "severity": "critical",
                    # Single-quoted so the double quotes inside the Grafana
                    # template are valid Python (the original nested unescaped
                    # double quotes inside a double-quoted string).
                    "team": '{{ $labels.service | regexReplaceAll "^meta-(.*)-service$" "$1" }}',
                },
            }
        ],
    }
    try:
        resp = session.put(f"{GRAFANA_URL}/api/v1/provisioning/alert-rules", json=alert_rules)
        resp.raise_for_status()
        logger.info("Provisioned Grafana 12.0 alert rules successfully")
        return True
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to provision alert rules: {e}")
        return False


def main() -> None:
    """Provision data source, dashboard, and alert rules in sequence.

    Exits with status 1 on the first failed step.
    """
    session = create_session()

    ds_uid = provision_prometheus_datasource(session)
    if not ds_uid:
        logger.error("Failed to provision data source, exiting")
        sys.exit(1)

    dashboard_uid = provision_dashboard(session, ds_uid)
    if not dashboard_uid:
        logger.error("Failed to provision dashboard, exiting")
        sys.exit(1)

    if not provision_alert_rules(session, ds_uid):
        logger.error("Failed to provision alert rules, exiting")
        sys.exit(1)

    logger.info("All Grafana 12.0 resources provisioned successfully")


if __name__ == "__main__":
    main()
"""
grafana_dashboard_validator.py
Validates Grafana 12.0 dashboard JSON against Meta's internal governance policies
Ensures compliance with data source usage, retention, and alerting rules
""" import json
import logging
import os
import sys
from typing import Dict, List, Tuple import requests # Configure logging
logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__) # Meta governance policies for Grafana dashboards
MAX_PANELS_PER_DASHBOARD = 20  # hard cap enforced by validate_panels
REQUIRED_TAGS = ["meta", "cost-center"]  # every dashboard must carry these tags (validate_tags)
ALLOWED_DATA_SOURCES = ["Meta-Prometheus-3.0", "Meta-Elasticsearch-8.0"]  # approved data sources for panels and alert rules
MAX_RETENTION_DAYS = 30  # retention ceiling referenced by the alert-rule "for" duration check
MIN_REFRESH_INTERVAL = "30s"  # slowest refresh interval a dashboard may use

# NOTE(review): this module's typing import omits Optional, which the
# fetch_dashboard annotation below needs; import it here so the annotation
# resolves at definition time.
from typing import Optional


def fetch_dashboard(session: requests.Session, grafana_url: str, dashboard_uid: str) -> Optional[Dict]:
    """Fetch dashboard JSON from the Grafana 12.0 API.

    Returns the dashboard dict, or None when the request fails.
    """
    try:
        resp = session.get(f"{grafana_url}/api/dashboards/uid/{dashboard_uid}")
        resp.raise_for_status()
        return resp.json()["dashboard"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to fetch dashboard {dashboard_uid}: {e}")
        return None


def validate_tags(dashboard: Dict) -> Tuple[bool, List[str]]:
    """Validate that the dashboard carries every tag in REQUIRED_TAGS."""
    tags = dashboard.get("tags", [])
    missing = [tag for tag in REQUIRED_TAGS if tag not in tags]
    if missing:
        return False, [f"Missing required tags: {missing}"]
    return True, []


def validate_panels(dashboard: Dict) -> Tuple[bool, List[str]]:
    """Validate panel count and that each panel uses an allowed data source."""
    panels = dashboard.get("panels", [])
    if len(panels) > MAX_PANELS_PER_DASHBOARD:
        return False, [f"Dashboard has {len(panels)} panels, max allowed is {MAX_PANELS_PER_DASHBOARD}"]
    errors = []
    for panel in panels:
        ds = panel.get("datasource", {})
        # "datasource" may be a dict ({"name": ...}) or a bare string,
        # depending on the dashboard schema version.
        ds_name = ds.get("name") if isinstance(ds, dict) else ds
        if ds_name and ds_name not in ALLOWED_DATA_SOURCES:
            errors.append(f"Panel {panel.get('title', 'Untitled')} uses disallowed data source: {ds_name}")
    return len(errors) == 0, errors


def validate_refresh_interval(dashboard: Dict) -> Tuple[bool, List[str]]:
    """Validate the refresh interval is no faster than MIN_REFRESH_INTERVAL."""
    refresh = dashboard.get("refresh", "")
    if not refresh:
        return False, ["No refresh interval set"]
    # Parse refresh interval strings of the form "<int><unit>", e.g. 30s, 1m.
    try:
        interval = int(refresh[:-1])
        unit = refresh[-1]
        if unit == "s":
            total_seconds = interval
        elif unit == "m":
            total_seconds = interval * 60
        else:
            return False, [f"Invalid refresh interval unit: {unit}"]
        min_interval = int(MIN_REFRESH_INTERVAL[:-1])
        if total_seconds < min_interval:
            return False, [f"Refresh interval {refresh} is less than minimum {MIN_REFRESH_INTERVAL}"]
    except ValueError:
        return False, [f"Invalid refresh interval format: {refresh}"]
    return True, []


def _duration_to_minutes(duration: str) -> int:
    """Convert a Grafana duration string ('30s', '2m', '1h', '7d') to whole minutes.

    Raises ValueError for an unparseable value or unsupported unit.
    """
    value, unit = int(duration[:-1]), duration[-1]
    seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400}
    if unit not in seconds_per_unit:
        raise ValueError(f"unsupported duration unit: {unit!r}")
    return value * seconds_per_unit[unit] // 60


def validate_alert_rules(session: requests.Session, grafana_url: str, dashboard_uid: str) -> Tuple[bool, List[str]]:
    """Validate alert rules linked to the dashboard comply with policies."""
    try:
        resp = session.get(f"{grafana_url}/api/v1/provisioning/alert-rules")
        resp.raise_for_status()
        rules = resp.json()
        errors = []
        for rule in rules:
            if rule.get("dashboardUid") != dashboard_uid:
                continue
            # Every query in the rule must point at an allowed data source.
            for data in rule.get("data", []):
                ds_uid = data.get("datasourceUid")
                if ds_uid:
                    ds_resp = session.get(f"{grafana_url}/api/datasources/uid/{ds_uid}")
                    if ds_resp.status_code == 200:
                        ds_name = ds_resp.json().get("name")
                        if ds_name not in ALLOWED_DATA_SOURCES:
                            errors.append(f"Alert rule {rule.get('title')} uses disallowed data source: {ds_name}")
            # FIX: the original compared the bare numeric prefix of the "for"
            # duration (ignoring its unit, so "2h" parsed as 2) against a
            # minutes threshold; convert the duration to minutes first.
            if rule.get("for"):
                try:
                    for_minutes = _duration_to_minutes(rule["for"])
                except ValueError:
                    errors.append(f"Alert rule {rule.get('title')} has unparseable for duration: {rule['for']}")
                else:
                    if for_minutes > MAX_RETENTION_DAYS * 24 * 60:
                        errors.append(f"Alert rule {rule.get('title')} has for duration longer than max retention")
        return len(errors) == 0, errors
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to validate alert rules: {e}")
        return False, [str(e)]


def main() -> None:
    """Fetch the configured dashboard and run every governance validation.

    Exits 0 when all validations pass, 1 otherwise.
    """
    grafana_url = os.getenv("GRAFANA_URL", "https://grafana.meta.internal")
    grafana_api_key = os.getenv("GRAFANA_API_KEY")
    dashboard_uid = os.getenv("DASHBOARD_UID", "meta-devops-dashboard")
    if not grafana_api_key:
        logger.error("GRAFANA_API_KEY not set")
        sys.exit(1)
    session = requests.Session()
    # FIX: the extracted source was corrupted here ('headers.-weight: 500;">update');
    # restore the intended headers.update call.
    session.headers.update({"Authorization": f"Bearer {grafana_api_key}"})
    # Fetch dashboard
    dashboard = fetch_dashboard(session, grafana_url, dashboard_uid)
    if not dashboard:
        sys.exit(1)
    # Run all validations; the alert-rule check ignores the dashboard arg and
    # queries the provisioning API directly.
    validations = [
        ("Tags", validate_tags),
        ("Panels", validate_panels),
        ("Refresh Interval", validate_refresh_interval),
        ("Alert Rules", lambda d: validate_alert_rules(session, grafana_url, dashboard_uid)),
    ]
    all_passed = True
    for name, validation_func in validations:
        passed, errors = validation_func(dashboard)
        if passed:
            logger.info(f"✅ {name} validation passed")
        else:
            logger.error(f"❌ {name} validation failed: {errors}")
            all_passed = False
    if all_passed:
        logger.info("All dashboard validations passed!")
        sys.exit(0)
    else:
        logger.error("Dashboard validation failed")
        sys.exit(1)


if __name__ == "__main__":
    main()
"""
grafana_dashboard_validator.py
Validates Grafana 12.0 dashboard JSON against Meta's internal governance policies
Ensures compliance with data source usage, retention, and alerting rules
""" import json
import logging
import os
import sys
from typing import Dict, List, Tuple import requests # Configure logging
logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__) # Meta governance policies for Grafana dashboards
MAX_PANELS_PER_DASHBOARD = 20  # hard cap enforced by validate_panels
REQUIRED_TAGS = ["meta", "cost-center"]  # every dashboard must carry these tags (validate_tags)
ALLOWED_DATA_SOURCES = ["Meta-Prometheus-3.0", "Meta-Elasticsearch-8.0"]  # approved data sources for panels and alert rules
MAX_RETENTION_DAYS = 30  # retention ceiling referenced by the alert-rule "for" duration check
MIN_REFRESH_INTERVAL = "30s"  # slowest refresh interval a dashboard may use

# NOTE(review): this module's typing import omits Optional, which the
# fetch_dashboard annotation below needs; import it here so the annotation
# resolves at definition time.
from typing import Optional


def fetch_dashboard(session: requests.Session, grafana_url: str, dashboard_uid: str) -> Optional[Dict]:
    """Fetch dashboard JSON from the Grafana 12.0 API.

    Returns the dashboard dict, or None when the request fails.
    """
    try:
        resp = session.get(f"{grafana_url}/api/dashboards/uid/{dashboard_uid}")
        resp.raise_for_status()
        return resp.json()["dashboard"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to fetch dashboard {dashboard_uid}: {e}")
        return None


def validate_tags(dashboard: Dict) -> Tuple[bool, List[str]]:
    """Validate that the dashboard carries every tag in REQUIRED_TAGS."""
    tags = dashboard.get("tags", [])
    missing = [tag for tag in REQUIRED_TAGS if tag not in tags]
    if missing:
        return False, [f"Missing required tags: {missing}"]
    return True, []


def validate_panels(dashboard: Dict) -> Tuple[bool, List[str]]:
    """Validate panel count and that each panel uses an allowed data source."""
    panels = dashboard.get("panels", [])
    if len(panels) > MAX_PANELS_PER_DASHBOARD:
        return False, [f"Dashboard has {len(panels)} panels, max allowed is {MAX_PANELS_PER_DASHBOARD}"]
    errors = []
    for panel in panels:
        ds = panel.get("datasource", {})
        # "datasource" may be a dict ({"name": ...}) or a bare string,
        # depending on the dashboard schema version.
        ds_name = ds.get("name") if isinstance(ds, dict) else ds
        if ds_name and ds_name not in ALLOWED_DATA_SOURCES:
            errors.append(f"Panel {panel.get('title', 'Untitled')} uses disallowed data source: {ds_name}")
    return len(errors) == 0, errors


def validate_refresh_interval(dashboard: Dict) -> Tuple[bool, List[str]]:
    """Validate the refresh interval is no faster than MIN_REFRESH_INTERVAL."""
    refresh = dashboard.get("refresh", "")
    if not refresh:
        return False, ["No refresh interval set"]
    # Parse refresh interval strings of the form "<int><unit>", e.g. 30s, 1m.
    try:
        interval = int(refresh[:-1])
        unit = refresh[-1]
        if unit == "s":
            total_seconds = interval
        elif unit == "m":
            total_seconds = interval * 60
        else:
            return False, [f"Invalid refresh interval unit: {unit}"]
        min_interval = int(MIN_REFRESH_INTERVAL[:-1])
        if total_seconds < min_interval:
            return False, [f"Refresh interval {refresh} is less than minimum {MIN_REFRESH_INTERVAL}"]
    except ValueError:
        return False, [f"Invalid refresh interval format: {refresh}"]
    return True, []


def _duration_to_minutes(duration: str) -> int:
    """Convert a Grafana duration string ('30s', '2m', '1h', '7d') to whole minutes.

    Raises ValueError for an unparseable value or unsupported unit.
    """
    value, unit = int(duration[:-1]), duration[-1]
    seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400}
    if unit not in seconds_per_unit:
        raise ValueError(f"unsupported duration unit: {unit!r}")
    return value * seconds_per_unit[unit] // 60


def validate_alert_rules(session: requests.Session, grafana_url: str, dashboard_uid: str) -> Tuple[bool, List[str]]:
    """Validate alert rules linked to the dashboard comply with policies."""
    try:
        resp = session.get(f"{grafana_url}/api/v1/provisioning/alert-rules")
        resp.raise_for_status()
        rules = resp.json()
        errors = []
        for rule in rules:
            if rule.get("dashboardUid") != dashboard_uid:
                continue
            # Every query in the rule must point at an allowed data source.
            for data in rule.get("data", []):
                ds_uid = data.get("datasourceUid")
                if ds_uid:
                    ds_resp = session.get(f"{grafana_url}/api/datasources/uid/{ds_uid}")
                    if ds_resp.status_code == 200:
                        ds_name = ds_resp.json().get("name")
                        if ds_name not in ALLOWED_DATA_SOURCES:
                            errors.append(f"Alert rule {rule.get('title')} uses disallowed data source: {ds_name}")
            # FIX: the original compared the bare numeric prefix of the "for"
            # duration (ignoring its unit, so "2h" parsed as 2) against a
            # minutes threshold; convert the duration to minutes first.
            if rule.get("for"):
                try:
                    for_minutes = _duration_to_minutes(rule["for"])
                except ValueError:
                    errors.append(f"Alert rule {rule.get('title')} has unparseable for duration: {rule['for']}")
                else:
                    if for_minutes > MAX_RETENTION_DAYS * 24 * 60:
                        errors.append(f"Alert rule {rule.get('title')} has for duration longer than max retention")
        return len(errors) == 0, errors
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to validate alert rules: {e}")
        return False, [str(e)]


def main() -> None:
    """Fetch the configured dashboard and run every governance validation.

    Exits 0 when all validations pass, 1 otherwise.
    """
    grafana_url = os.getenv("GRAFANA_URL", "https://grafana.meta.internal")
    grafana_api_key = os.getenv("GRAFANA_API_KEY")
    dashboard_uid = os.getenv("DASHBOARD_UID", "meta-devops-dashboard")
    if not grafana_api_key:
        logger.error("GRAFANA_API_KEY not set")
        sys.exit(1)
    session = requests.Session()
    # FIX: the extracted source was corrupted here ('headers.-weight: 500;">update');
    # restore the intended headers.update call.
    session.headers.update({"Authorization": f"Bearer {grafana_api_key}"})
    # Fetch dashboard
    dashboard = fetch_dashboard(session, grafana_url, dashboard_uid)
    if not dashboard:
        sys.exit(1)
    # Run all validations; the alert-rule check ignores the dashboard arg and
    # queries the provisioning API directly.
    validations = [
        ("Tags", validate_tags),
        ("Panels", validate_panels),
        ("Refresh Interval", validate_refresh_interval),
        ("Alert Rules", lambda d: validate_alert_rules(session, grafana_url, dashboard_uid)),
    ]
    all_passed = True
    for name, validation_func in validations:
        passed, errors = validation_func(dashboard)
        if passed:
            logger.info(f"✅ {name} validation passed")
        else:
            logger.error(f"❌ {name} validation failed: {errors}")
            all_passed = False
    if all_passed:
        logger.info("All dashboard validations passed!")
        sys.exit(0)
    else:
        logger.error("Dashboard validation failed")
        sys.exit(1)


if __name__ == "__main__":
    main()
"""
grafana_dashboard_validator.py
Validates Grafana 12.0 dashboard JSON against Meta's internal governance policies
Ensures compliance with data source usage, retention, and alerting rules
""" import json
import logging
import os
import sys
from typing import Dict, List, Tuple import requests # Configure logging
logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__) # Meta governance policies for Grafana dashboards
MAX_PANELS_PER_DASHBOARD = 20  # hard cap enforced by validate_panels
REQUIRED_TAGS = ["meta", "cost-center"]  # every dashboard must carry these tags (validate_tags)
ALLOWED_DATA_SOURCES = ["Meta-Prometheus-3.0", "Meta-Elasticsearch-8.0"]  # approved data sources for panels and alert rules
MAX_RETENTION_DAYS = 30  # retention ceiling referenced by the alert-rule "for" duration check
MIN_REFRESH_INTERVAL = "30s"  # slowest refresh interval a dashboard may use

# NOTE(review): this module's typing import omits Optional, which the
# fetch_dashboard annotation below needs; import it here so the annotation
# resolves at definition time.
from typing import Optional


def fetch_dashboard(session: requests.Session, grafana_url: str, dashboard_uid: str) -> Optional[Dict]:
    """Fetch dashboard JSON from the Grafana 12.0 API.

    Returns the dashboard dict, or None when the request fails.
    """
    try:
        resp = session.get(f"{grafana_url}/api/dashboards/uid/{dashboard_uid}")
        resp.raise_for_status()
        return resp.json()["dashboard"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to fetch dashboard {dashboard_uid}: {e}")
        return None


def validate_tags(dashboard: Dict) -> Tuple[bool, List[str]]:
    """Validate that the dashboard carries every tag in REQUIRED_TAGS."""
    tags = dashboard.get("tags", [])
    missing = [tag for tag in REQUIRED_TAGS if tag not in tags]
    if missing:
        return False, [f"Missing required tags: {missing}"]
    return True, []


def validate_panels(dashboard: Dict) -> Tuple[bool, List[str]]:
    """Validate panel count and that each panel uses an allowed data source."""
    panels = dashboard.get("panels", [])
    if len(panels) > MAX_PANELS_PER_DASHBOARD:
        return False, [f"Dashboard has {len(panels)} panels, max allowed is {MAX_PANELS_PER_DASHBOARD}"]
    errors = []
    for panel in panels:
        ds = panel.get("datasource", {})
        # "datasource" may be a dict ({"name": ...}) or a bare string,
        # depending on the dashboard schema version.
        ds_name = ds.get("name") if isinstance(ds, dict) else ds
        if ds_name and ds_name not in ALLOWED_DATA_SOURCES:
            errors.append(f"Panel {panel.get('title', 'Untitled')} uses disallowed data source: {ds_name}")
    return len(errors) == 0, errors


def validate_refresh_interval(dashboard: Dict) -> Tuple[bool, List[str]]:
    """Validate the refresh interval is no faster than MIN_REFRESH_INTERVAL."""
    refresh = dashboard.get("refresh", "")
    if not refresh:
        return False, ["No refresh interval set"]
    # Parse refresh interval strings of the form "<int><unit>", e.g. 30s, 1m.
    try:
        interval = int(refresh[:-1])
        unit = refresh[-1]
        if unit == "s":
            total_seconds = interval
        elif unit == "m":
            total_seconds = interval * 60
        else:
            return False, [f"Invalid refresh interval unit: {unit}"]
        min_interval = int(MIN_REFRESH_INTERVAL[:-1])
        if total_seconds < min_interval:
            return False, [f"Refresh interval {refresh} is less than minimum {MIN_REFRESH_INTERVAL}"]
    except ValueError:
        return False, [f"Invalid refresh interval format: {refresh}"]
    return True, []


def _duration_to_minutes(duration: str) -> int:
    """Convert a Grafana duration string ('30s', '2m', '1h', '7d') to whole minutes.

    Raises ValueError for an unparseable value or unsupported unit.
    """
    value, unit = int(duration[:-1]), duration[-1]
    seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400}
    if unit not in seconds_per_unit:
        raise ValueError(f"unsupported duration unit: {unit!r}")
    return value * seconds_per_unit[unit] // 60


def validate_alert_rules(session: requests.Session, grafana_url: str, dashboard_uid: str) -> Tuple[bool, List[str]]:
    """Validate alert rules linked to the dashboard comply with policies."""
    try:
        resp = session.get(f"{grafana_url}/api/v1/provisioning/alert-rules")
        resp.raise_for_status()
        rules = resp.json()
        errors = []
        for rule in rules:
            if rule.get("dashboardUid") != dashboard_uid:
                continue
            # Every query in the rule must point at an allowed data source.
            for data in rule.get("data", []):
                ds_uid = data.get("datasourceUid")
                if ds_uid:
                    ds_resp = session.get(f"{grafana_url}/api/datasources/uid/{ds_uid}")
                    if ds_resp.status_code == 200:
                        ds_name = ds_resp.json().get("name")
                        if ds_name not in ALLOWED_DATA_SOURCES:
                            errors.append(f"Alert rule {rule.get('title')} uses disallowed data source: {ds_name}")
            # FIX: the original compared the bare numeric prefix of the "for"
            # duration (ignoring its unit, so "2h" parsed as 2) against a
            # minutes threshold; convert the duration to minutes first.
            if rule.get("for"):
                try:
                    for_minutes = _duration_to_minutes(rule["for"])
                except ValueError:
                    errors.append(f"Alert rule {rule.get('title')} has unparseable for duration: {rule['for']}")
                else:
                    if for_minutes > MAX_RETENTION_DAYS * 24 * 60:
                        errors.append(f"Alert rule {rule.get('title')} has for duration longer than max retention")
        return len(errors) == 0, errors
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to validate alert rules: {e}")
        return False, [str(e)]


def main() -> None:
    """Fetch the configured dashboard and run every governance validation.

    Exits 0 when all validations pass, 1 otherwise.
    """
    grafana_url = os.getenv("GRAFANA_URL", "https://grafana.meta.internal")
    grafana_api_key = os.getenv("GRAFANA_API_KEY")
    dashboard_uid = os.getenv("DASHBOARD_UID", "meta-devops-dashboard")
    if not grafana_api_key:
        logger.error("GRAFANA_API_KEY not set")
        sys.exit(1)
    session = requests.Session()
    # FIX: the extracted source was corrupted here ('headers.-weight: 500;">update');
    # restore the intended headers.update call.
    session.headers.update({"Authorization": f"Bearer {grafana_api_key}"})
    # Fetch dashboard
    dashboard = fetch_dashboard(session, grafana_url, dashboard_uid)
    if not dashboard:
        sys.exit(1)
    # Run all validations; the alert-rule check ignores the dashboard arg and
    # queries the provisioning API directly.
    validations = [
        ("Tags", validate_tags),
        ("Panels", validate_panels),
        ("Refresh Interval", validate_refresh_interval),
        ("Alert Rules", lambda d: validate_alert_rules(session, grafana_url, dashboard_uid)),
    ]
    all_passed = True
    for name, validation_func in validations:
        passed, errors = validation_func(dashboard)
        if passed:
            logger.info(f"✅ {name} validation passed")
        else:
            logger.error(f"❌ {name} validation failed: {errors}")
            all_passed = False
    if all_passed:
        logger.info("All dashboard validations passed!")
        sys.exit(0)
    else:
        logger.error("Dashboard validation failed")
        sys.exit(1)


if __name__ == "__main__":
    main()
// NOTE(review): orphaned statement fragment — duplicates the discoverer
// initialization inside newServiceDiscovery near the top of this file, and is
// not valid at top level in Go. Looks like paste/extraction residue; TODO
// confirm it can be deleted.
discoverer, err := discovery.NewEBPFDiscoverer(discovery.EBPFConfig{ EnableTLS: true, CertPath: "/etc/meta/certs/ebpf.pem", KeyPath: "/etc/meta/certs/ebpf-key.pem", CacheTimeout: 30 * time.Second,
})
// NOTE(review): orphaned statement fragment — duplicates the discoverer
// initialization inside newServiceDiscovery near the top of this file, and is
// not valid at top level in Go. Looks like paste/extraction residue; TODO
// confirm it can be deleted.
discoverer, err := discovery.NewEBPFDiscoverer(discovery.EBPFConfig{ EnableTLS: true, CertPath: "/etc/meta/certs/ebpf.pem", KeyPath: "/etc/meta/certs/ebpf-key.pem", CacheTimeout: 30 * time.Second,
})
// NOTE(review): orphaned statement fragment — duplicates the discoverer
// initialization inside newServiceDiscovery near the top of this file, and is
// not valid at top level in Go. Looks like paste/extraction residue; TODO
// confirm it can be deleted.
discoverer, err := discovery.NewEBPFDiscoverer(discovery.EBPFConfig{ EnableTLS: true, CertPath: "/etc/meta/certs/ebpf.pem", KeyPath: "/etc/meta/certs/ebpf-key.pem", CacheTimeout: 30 * time.Second,
})
# NOTE(review): orphaned snippet — posts the Meta-Prometheus-3.0 data source
# definition to the Grafana API, but depends on `session`, `GRAFANA_URL`, and
# `PROMETHEUS_URL` that are not in scope here. Presumably extraction residue
# from the provisioning script earlier in this file; TODO confirm removal.
resp = session.post(f"{GRAFANA_URL}/api/datasources", json={ "name": "Meta-Prometheus-3.0", "type": "prometheus", "url": PROMETHEUS_URL, "jsonData": {"prometheusVersion": "3.0.0", "ebpfDiscoveryEnabled": True}
})
# NOTE(review): orphaned snippet — posts the Meta-Prometheus-3.0 data source
# definition to the Grafana API, but depends on `session`, `GRAFANA_URL`, and
# `PROMETHEUS_URL` that are not in scope here. Presumably extraction residue
# from the provisioning script earlier in this file; TODO confirm removal.
resp = session.post(f"{GRAFANA_URL}/api/datasources", json={ "name": "Meta-Prometheus-3.0", "type": "prometheus", "url": PROMETHEUS_URL, "jsonData": {"prometheusVersion": "3.0.0", "ebpfDiscoveryEnabled": True}
})
# NOTE(review): orphaned snippet — posts the Meta-Prometheus-3.0 data source
# definition to the Grafana API, but depends on `session`, `GRAFANA_URL`, and
# `PROMETHEUS_URL` that are not in scope here. Presumably extraction residue
# from the provisioning script earlier in this file; TODO confirm removal.
resp = session.post(f"{GRAFANA_URL}/api/datasources", json={ "name": "Meta-Prometheus-3.0", "type": "prometheus", "url": PROMETHEUS_URL, "jsonData": {"prometheusVersion": "3.0.0", "ebpfDiscoveryEnabled": True}
})
# Prometheus recording rule: precompute the p99 RPC latency per service/region.
groups:
  - name: meta-rpc-latency
    interval: 1m
    rules:
      - record: meta_microservice_rpc_latency_p99
        # FIX: the extracted source corrupted the "service" label name with
        # HTML style residue ('-weight: 500;">service'); the exporter declares
        # labels service/endpoint/region on this histogram.
        expr: histogram_quantile(0.99, sum(rate(meta_microservice_rpc_latency_ms_bucket[5m])) by (le, service, region))
# Prometheus recording rule: precompute the p99 RPC latency per service/region.
groups:
  - name: meta-rpc-latency
    interval: 1m
    rules:
      - record: meta_microservice_rpc_latency_p99
        # FIX: the extracted source corrupted the "service" label name with
        # HTML style residue ('-weight: 500;">service'); the exporter declares
        # labels service/endpoint/region on this histogram.
        expr: histogram_quantile(0.99, sum(rate(meta_microservice_rpc_latency_ms_bucket[5m])) by (le, service, region))
groups:
- name: meta-rpc-latency interval: 1m rules: - record: meta_microservice_rpc_latency_p99 expr: histogram_quantile(0.99, sum(rate(meta_microservice_rpc_latency_ms_bucket[5m])) by (le, service, region))
- For thirty years I programmed with Phish on, every day (56 points)
- Mercedes-Benz commits to bringing back physical buttons (232 points)
- Alert-Driven Monitoring (40 points)
- Porsche will contest Laguna Seca in historic colors of the Apple Computer livery (36 points)
- I rebuilt my blog's cache. Bots are the audience now (27 points)
- Grafana 12.0’s unified alerting engine reduced alert fatigue by 68% compared to our legacy PagerDuty + Nagios setup
- Prometheus 3.0’s native eBPF-based service discovery cut metric scrape overhead by 42% for our 12,000+ microservice fleet
- Total cost of ownership for the new dashboard stack is $0.03 per container per month, 79% lower than our previous New Relic contract
- By 2026, 70% of Meta’s internal dashboards will migrate to Grafana 12.0’s embedded widget API for custom tooling integration
- Team size: 6 backend engineers, 2 SREs, 1 frontend engineer
- Stack & Versions: Grafana 12.0.1, Prometheus 3.0.2, Go 1.22, Python 3.11, Kubernetes 1.30
- Problem: p99 latency for dashboard loads was 2.4s, with 12 legacy tools leading to 47min MTTD, $210k/month observability spend, 12k alerts/day causing 68% of on-call engineers to mute alerts weekly
- Solution & Implementation: Replaced all legacy tools with unified Grafana 12.0 dashboard backed by Prometheus 3.0 for metrics, implemented eBPF-based service discovery, unified alerting, provisioned dashboards as code, trained 120+ engineers on the new stack
- Outcome: p99 dashboard load latency dropped to 120ms, MTTD reduced to 92 seconds, observability spend dropped to $44k/month ($2.1M annual savings), alert volume reduced to 3.8k/day, 92% of on-call engineers report improved workflow
- With Grafana 12.0’s embedded widget API, do you think we’ll see a shift away from standalone observability tools toward embedded dashboard widgets in internal developer portals by 2026?
- Prometheus 3.0’s eBPF discovery adds kernel-level overhead: would you trade 5% additional kernel CPU usage for 42% lower Prometheus scrape overhead in your production environment?
- Grafana 12.0’s unified alerting competes with tools like PagerDuty and Opsgenie: what feature would Grafana need to add to replace your current alerting tool completely?