opentelemetry-api>=1.20.0 opentelemetry-sdk>=1.20.0 opentelemetry-instrumentation-requests>=0.41b0 opentelemetry-exporter-otlp-proto-http>=1.20.0 requests>=2.28.0 python-dotenv>=1.0.0
opentelemetry-api>=1.20.0 opentelemetry-sdk>=1.20.0 opentelemetry-instrumentation-requests>=0.41b0 opentelemetry-exporter-otlp-proto-http>=1.20.0 requests>=2.28.0 python-dotenv>=1.0.0
opentelemetry-api>=1.20.0 opentelemetry-sdk>=1.20.0 opentelemetry-instrumentation-requests>=0.41b0 opentelemetry-exporter-otlp-proto-http>=1.20.0 requests>=2.28.0 python-dotenv>=1.0.0
BRIGHT_DATA_API_KEY=your_api_key BRIGHT_DATA_ZONE=serp # or your SERP zone name from Bright Data dashboard BRIGHT_DATA_COUNTRY=us # optional OTEL_EXPORTER=console # set to "jaeger" to send traces to Jaeger (must be running)
BRIGHT_DATA_API_KEY=your_api_key BRIGHT_DATA_ZONE=serp # or your SERP zone name from Bright Data dashboard BRIGHT_DATA_COUNTRY=us # optional OTEL_EXPORTER=console # set to "jaeger" to send traces to Jaeger (must be running)
BRIGHT_DATA_API_KEY=your_api_key BRIGHT_DATA_ZONE=serp # or your SERP zone name from Bright Data dashboard BRIGHT_DATA_COUNTRY=us # optional OTEL_EXPORTER=console # set to "jaeger" to send traces to Jaeger (must be running)
import os

from opentelemetry import trace
from opentelemetry.instrumentation.requests import RequestsInstrumentor
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter


def init_otel(exporter: str = "console", service_name: str = "bd-scraper"):
    """Configure the global OpenTelemetry tracer provider and return a tracer.

    exporter: "console" prints spans to stdout; "jaeger" ships them over
    OTLP/HTTP to OTEL_EXPORTER_OTLP_ENDPOINT (default http://localhost:4318).
    """
    provider = TracerProvider(
        resource=Resource.create(attributes={SERVICE_NAME: service_name})
    )

    if exporter == "jaeger":
        # Imported lazily so the console path works even without the OTLP exporter installed.
        from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter

        base = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
        span_exporter = OTLPSpanExporter(endpoint=f"{base}/v1/traces")
    else:
        span_exporter = ConsoleSpanExporter()

    provider.add_span_processor(BatchSpanProcessor(span_exporter))
    trace.set_tracer_provider(provider)

    RequestsInstrumentor().instrument()  # this is where the magic happens!
    return trace.get_tracer(service_name, "1.0.0")
import json
import os
import time
from typing import Optional

import requests
from dotenv import load_dotenv

load_dotenv()


class BrightDataClient:
    """Thin client for the Bright Data SERP API with OpenTelemetry tracing.

    Credentials come from the constructor or from the BRIGHT_DATA_API_KEY /
    BRIGHT_DATA_ZONE / BRIGHT_DATA_COUNTRY environment variables.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        zone: Optional[str] = None,
        country: Optional[str] = None,
    ):
        self.api_key = api_key or os.getenv("BRIGHT_DATA_API_KEY")
        self.zone = zone or os.getenv("BRIGHT_DATA_ZONE")
        self.country = country or os.getenv("BRIGHT_DATA_COUNTRY")
        self.api_endpoint = "https://api.brightdata.com/request"
        if not self.api_key or not self.zone:
            raise ValueError("BRIGHT_DATA_API_KEY and BRIGHT_DATA_ZONE required (env or constructor)")
        # One session so the auth header is set once and connections are reused.
        self.session = requests.Session()
        self.session.headers.update({
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        })

    def search(
        self,
        query: str,
        num_results: int = 10,
        language: Optional[str] = None,
        country: Optional[str] = None,
        max_retries: int = 2,
    ) -> dict:
        """Run a Google search through Bright Data, retrying on failure.

        Emits a `bright_data.search` span carrying query, latency and retry
        attributes. Re-raises the last error once retries are exhausted.
        """
        from opentelemetry import trace
        from opentelemetry.trace import StatusCode

        tracer = trace.get_tracer(__name__, "1.0.0")
        target_domain = "google.com"
        with tracer.start_as_current_span("bright_data.search") as span:
            span.set_attribute("scraper.query", query)
            span.set_attribute("scraper.target_domain", target_domain)
            span.set_attribute("scraper.num_results", num_results)
            start = time.perf_counter()
            last_err = None
            for attempt in range(max_retries + 1):
                try:
                    result = self._do_search(query, num_results, language, country)
                    latency_ms = (time.perf_counter() - start) * 1000
                    span.set_attribute("scraper.latency_ms", round(latency_ms, 2))
                    span.set_attribute("scraper.retries", attempt)
                    # clean success — no retries needed
                    if attempt == 0:
                        span.set_status(StatusCode.OK)
                    else:
                        # recovered, but we want this surfaced in Jaeger
                        span.set_status(StatusCode.ERROR, "Recovered after retry")
                    return result
                except Exception as e:
                    last_err = e
                    span.set_attribute("scraper.retries", attempt + 1)
                    if attempt < max_retries:
                        # linear backoff: 0.5s, 1.0s, ...
                        time.sleep(0.5 * (attempt + 1))
            # all retries exhausted
            span.set_attribute("scraper.error", str(last_err))
            span.set_status(StatusCode.ERROR, str(last_err))
            span.record_exception(last_err)
            raise last_err

    def _do_search(
        self,
        query: str,
        num_results: int,
        language: Optional[str],
        country: Optional[str],
    ) -> dict:
        """Issue one request to the Bright Data API and return the parsed body."""
        search_url = (
            f"https://www.google.com/search"
            f"?q={requests.utils.quote(query)}&num={num_results}&brd_json=1"
        )
        if language:
            search_url += f"&hl={language}&lr=lang_{language}"
        target_country = country or self.country
        payload = {"zone": self.zone, "url": search_url, "format": "json"}
        if target_country:
            payload["country"] = target_country
        response = self.session.post(self.api_endpoint, json=payload, timeout=30)
        response.raise_for_status()
        result = response.json()
        # Bright Data may return body as JSON string — unpack it
        if isinstance(result, dict) and "body" in result:
            body = result["body"]
            result = json.loads(body) if isinstance(body, str) else body
        return result
import argparse
import os
import time

from dotenv import load_dotenv

load_dotenv()

_exporter = os.getenv("OTEL_EXPORTER", "console")  # console fallback as a default

from otel_config import init_otel

init_otel(exporter=_exporter)  # must come before BrightDataClient import

from bright_data_otel import BrightDataClient


def run(calls: int = 10, delay: float = 0.5):
    """Fire `calls` searches through BrightDataClient, pausing `delay`s between them.

    Cycles through a fixed query list and prints a per-call result count;
    errors are printed but do not stop the run.
    """
    queries = [
        "python programming",
        "machine learning",
        "web development",
        "data science",
        "cloud computing",
    ]
    client = BrightDataClient()
    start = time.time()
    for i in range(calls):
        q = queries[i % len(queries)]
        try:
            data = client.search(q, num_results=5)
            n = len(data.get("organic", [])) if isinstance(data, dict) else 0
            print(f" [{i+1}/{calls}] {q}: {n} results")
        except Exception as e:
            print(f" [{i+1}/{calls}] {q}: error — {e}")
        if i < calls - 1:
            time.sleep(delay)
    print(f"Done in {time.time() - start:.1f}s")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--count", type=int, default=10)
    p.add_argument("--delay", type=float, default=0.5)
    args = p.parse_args()
    run(calls=args.count, delay=args.delay)
> python scraper.py --count 5
> python scraper.py --count 5
> python scraper.py --count 5
# to be honest, just set OTEL_EXPORTER in .env > OTEL_EXPORTER=jaeger python scraper.py --count 5
# to be honest, just set OTEL_EXPORTER in .env > OTEL_EXPORTER=jaeger python scraper.py --count 5
# to be honest, just set OTEL_EXPORTER in .env > OTEL_EXPORTER=jaeger python scraper.py --count 5
[1/5] python programming: 6 results [2/5] machine learning: 8 results [3/5] web development: 9 results [4/5] data science: 9 results [5/5] cloud computing: 9 results Done in 75.1s
[1/5] python programming: 6 results [2/5] machine learning: 8 results [3/5] web development: 9 results [4/5] data science: 9 results [5/5] cloud computing: 9 results Done in 75.1s
[1/5] python programming: 6 results [2/5] machine learning: 8 results [3/5] web development: 9 results [4/5] data science: 9 results [5/5] cloud computing: 9 results Done in 75.1s
{ "name": "bright_data.search", "attributes": { "scraper.query": "python programming", "scraper.target_domain": "google.com", "scraper.latency_ms": 3686.01, "scraper.retries": 0 }
{ "name": "bright_data.search", "attributes": { "scraper.query": "python programming", "scraper.target_domain": "google.com", "scraper.latency_ms": 3686.01, "scraper.retries": 0 }
{ "name": "bright_data.search", "attributes": { "scraper.query": "python programming", "scraper.target_domain": "google.com", "scraper.latency_ms": 3686.01, "scraper.retries": 0 }
{ "name": "POST", "status": { "status_code": "ERROR", "description": "ReadTimeout: ...Read timed out. (read timeout=30)" }, "start_time": "2026-03-20T21:18:24.986273Z", "end_time": "2026-03-20T21:18:54.999505Z", "events": [{ "name": "exception", "attributes": { "exception.type": "requests.exceptions.ReadTimeout", "exception.stacktrace": "..." } }] } { "name": "POST", "status": { "status_code": "UNSET" }, "start_time": "2026-03-20T21:18:55.505186Z", "end_time": "2026-03-20T21:19:20.097874Z" } { "name": "bright_data.search", "attributes": { "scraper.query": "data science", "scraper.latency_ms": 55113.46, "scraper.retries": 1 } }
{ "name": "POST", "status": { "status_code": "ERROR", "description": "ReadTimeout: ...Read timed out. (read timeout=30)" }, "start_time": "2026-03-20T21:18:24.986273Z", "end_time": "2026-03-20T21:18:54.999505Z", "events": [{ "name": "exception", "attributes": { "exception.type": "requests.exceptions.ReadTimeout", "exception.stacktrace": "..." } }] } { "name": "POST", "status": { "status_code": "UNSET" }, "start_time": "2026-03-20T21:18:55.505186Z", "end_time": "2026-03-20T21:19:20.097874Z" } { "name": "bright_data.search", "attributes": { "scraper.query": "data science", "scraper.latency_ms": 55113.46, "scraper.retries": 1 } }
{ "name": "POST", "status": { "status_code": "ERROR", "description": "ReadTimeout: ...Read timed out. (read timeout=30)" }, "start_time": "2026-03-20T21:18:24.986273Z", "end_time": "2026-03-20T21:18:54.999505Z", "events": [{ "name": "exception", "attributes": { "exception.type": "requests.exceptions.ReadTimeout", "exception.stacktrace": "..." } }] } { "name": "POST", "status": { "status_code": "UNSET" }, "start_time": "2026-03-20T21:18:55.505186Z", "end_time": "2026-03-20T21:19:20.097874Z" } { "name": "bright_data.search", "attributes": { "scraper.query": "data science", "scraper.latency_ms": 55113.46, "scraper.retries": 1 } }
docker run -d --name jaeger \ -p 16686:16686 \ -p 4318:4318 \ jaegertracing/all-in-one:latest
docker run -d --name jaeger \ -p 16686:16686 \ -p 4318:4318 \ jaegertracing/all-in-one:latest
docker run -d --name jaeger \ -p 16686:16686 \ -p 4318:4318 \ jaegertracing/all-in-one:latest
> python scraper.py --count 10
> python scraper.py --count 10
> python scraper.py --count 10 - Retry storm detection. If a domain starts blocking aggressively, you won’t see it as hard errors anymore, but as a creeping spike in scraper.retries > 0 spans. That’s your early warning before you trigger a full ban or blow past your proxy quota for the month.
- Actual cost visibility. Every retry is another proxy request. If you’re paying per request or per GB, scraper.retries on your spans maps directly to a line item on your invoice. You can aggregate this and alert on it — I hadn’t been doing this before adding OTel, and most likely, neither had you. 😅
- Per-query latency profiling. Some queries are just structurally slower — more competitive terms, heavier result pages, more contention in the proxy pool. Traces let you see this per-query instead of as a blended average that makes everything look fine. Once you can see the outliers, you can do something about them. - opentelemetry-instrumentation-requests — this gives us automatic HTTP tracing. Zero manual work.
- opentelemetry-exporter-otlp-proto-http — for when we want to send traces somewhere real, like Jaeger.