┌─────────────────────────────────────────────────┐
│ Prompt Engineering System │
├────────────┬────────────┬────────────┬──────────┤
│ Registry │ Testing │ Deploy │ Monitor │
│ │ │ │ │
├────────────┼────────────┼────────────┼──────────┤
│ Storage │ Pre-deploy │ Canary / │ Metrics │
│ + versions │ eval │ A/B rollout│ + alerts │
└────────────┴────────────┴────────────┴──────────┘
from langfuse import Langfuse

langfuse = Langfuse()

# Get the production version of a prompt
prompt = langfuse.get_prompt(
    name="ticket-classifier",
    label="production",  # or "staging", "latest"
)

# Prompt with variables filled in
system_message = prompt.compile(
    categories="billing,technical,general,urgent",
    language="en",
)
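The compiled system_message then goes straight into the model call. A minimal sketch of that hand-off, assuming the OpenAI Python client and reusing system_message from the snippet above; the sample ticket text is purely illustrative:

from openai import OpenAI

client = OpenAI()

# A sample ticket; in production this would come from the incoming request
ticket_text = "My card keeps getting declined at checkout."

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": ticket_text},
    ],
    temperature=0,
)
print(response.choices[0].message.content)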
prompts/
├── ticket-classifier/
│ ├── prompt.yaml
│ ├── config.yaml
│ └── tests/
│ ├── dataset.jsonl
│ └── eval.py
├── summarizer/
│ ├── prompt.yaml
│ ├── config.yaml
│ └── tests/
└── prompt_registry.py
# prompts/ticket-classifier/prompt.yaml
name: ticket-classifier
type: chat
model: gpt-4o-mini
temperature: 0
messages:
  - role: system
    content: |
      You are a support ticket classifier.
      Categories: {{categories}}.
      Return JSON: {"category": "...", "confidence": 0.0-1.0, "reasoning": "..."}
      Response language: {{language}}.
  - role: user
    content: "{{ticket_text}}"
variables:
  categories: "billing,technical,general,urgent"
  language: "en"
# prompt_registry.py
import yaml
from pathlib import Path


class PromptRegistry:
    def __init__(self, prompts_dir: str = "prompts"):
        self.prompts_dir = Path(prompts_dir)
        self._cache = {}

    def get(self, name: str) -> dict:
        if name not in self._cache:
            prompt_path = self.prompts_dir / name / "prompt.yaml"
            with open(prompt_path) as f:
                self._cache[name] = yaml.safe_load(f)
        return self._cache[name]

    def compile(self, name: str, **variables) -> list[dict]:
        prompt = self.get(name)
        messages = []
        for msg in prompt["messages"]:
            content = msg["content"]
            # Caller-supplied variables override the defaults from prompt.yaml
            for key, value in {**prompt.get("variables", {}), **variables}.items():
                content = content.replace(f"{{{{{key}}}}}", str(value))
            messages.append({"role": msg["role"], "content": content})
        return messages
# ci/sync_prompts.py — called in CI pipeline
from langfuse import Langfuse
from prompt_registry import PromptRegistry

langfuse = Langfuse()
registry = PromptRegistry()

for prompt_name in ["ticket-classifier", "summarizer", "response-generator"]:
    prompt_data = registry.get(prompt_name)
    langfuse.create_prompt(
        name=prompt_name,
        type=prompt_data["type"],  # "chat", from prompt.yaml
        prompt=prompt_data["messages"],
        config={"model": prompt_data["model"], "temperature": prompt_data["temperature"]},
        labels=["production"],
    )
{"input": "Can't process payment, card is being declined", "expected": {"category": "billing", "confidence_min": 0.8}}
{"input": "App crashes when opening the chat", "expected": {"category": "technical", "confidence_min": 0.8}}
{"input": "I want to delete my account and all my data", "expected": {"category": "general", "confidence_min": 0.7}}
{"input": "URGENT! Server is down, customers can't log in", "expected": {"category": "urgent", "confidence_min": 0.9}}
{"input": "Can't process payment, card is being declined", "expected": {"category": "billing", "confidence_min": 0.8}}
{"input": "App crashes when opening the chat", "expected": {"category": "technical", "confidence_min": 0.8}}
{"input": "I want to delete my account and all my data", "expected": {"category": "general", "confidence_min": 0.7}}
{"input": "URGENT! Server is down, customers can't log in", "expected": {"category": "urgent", "confidence_min": 0.9}}
{"input": "Can't process payment, card is being declined", "expected": {"category": "billing", "confidence_min": 0.8}}
{"input": "App crashes when opening the chat", "expected": {"category": "technical", "confidence_min": 0.8}}
{"input": "I want to delete my account and all my data", "expected": {"category": "general", "confidence_min": 0.7}}
{"input": "URGENT! Server is down, customers can't log in", "expected": {"category": "urgent", "confidence_min": 0.9}}
import json

from openai import OpenAI
from prompt_registry import PromptRegistry

client = OpenAI()
registry = PromptRegistry()


def evaluate_prompt(prompt_name: str, dataset_path: str, threshold: float = 0.85):
    """Evaluate a prompt against a dataset. Return pass/fail."""
    with open(dataset_path) as f:
        examples = [json.loads(line) for line in f]

    correct = 0
    total = len(examples)
    failures = []

    for example in examples:
        messages = registry.compile(prompt_name, ticket_text=example["input"])
        response = client.chat.completions.create(
            model=registry.get(prompt_name)["model"],
            messages=messages,
            temperature=0,
        )
        result = json.loads(response.choices[0].message.content)

        if result["category"] == example["expected"]["category"]:
            if result["confidence"] >= example["expected"]["confidence_min"]:
                correct += 1
            else:
                failures.append({
                    "input": example["input"],
                    "reason": f"low confidence: {result['confidence']}",
                })
        else:
            failures.append({
                "input": example["input"],
                "reason": f"wrong category: {result['category']}",
            })

    accuracy = correct / total
    passed = accuracy >= threshold
    return {
        "accuracy": accuracy,
        "threshold": threshold,
        "passed": passed,
        "failures": failures,
    }
# .github/workflows/prompt-eval.yml
name: Prompt Evaluation

on:
  pull_request:
    paths:
      - 'prompts/**'

jobs:
  eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install openai langfuse pyyaml
      - name: Run prompt evaluations
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: python ci/eval_prompts.py --changed-only
      - name: Comment PR with results
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const results = JSON.parse(fs.readFileSync('eval_results.json'));
            let body = '## Prompt Eval Results\n\n';
            body += '| Prompt | Status | Accuracy | Threshold |\n| --- | --- | --- | --- |\n';
            for (const [name, result] of Object.entries(results)) {
              const status = result.passed ? '✅' : '❌';
              body += `| ${name} | ${status} | ${result.accuracy.toFixed(2)} | ${result.threshold} |\n`;
            }
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body,
            });
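The workflow calls a ci/eval_prompts.py entry point that is not shown here. A minimal sketch of what it could look like, reusing evaluate_prompt and writing the eval_results.json file the PR-comment step reads; the import path and the simplified handling (it evaluates every prompt that has a test dataset rather than only changed ones) are assumptions:

# ci/eval_prompts.py (hypothetical entry point assumed by the workflow above)
import json
import sys
from pathlib import Path

from eval_lib import evaluate_prompt  # assumed module; adjust to wherever evaluate_prompt lives


def main() -> int:
    results = {}
    for dataset in Path("prompts").glob("*/tests/dataset.jsonl"):
        prompt_name = dataset.parts[1]
        results[prompt_name] = evaluate_prompt(prompt_name, str(dataset))

    Path("eval_results.json").write_text(json.dumps(results, indent=2))
    # Non-zero exit fails the CI job if any prompt is below its threshold
    return 0 if all(r["passed"] for r in results.values()) else 1


if __name__ == "__main__":
    sys.exit(main())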
# In the Langfuse UI: assign the label "production" to prompt v14.
# The app picks it up automatically on the next request.
prompt = langfuse.get_prompt(
    name="ticket-classifier",
    label="production",
    cache_ttl_seconds=300,  # 5-minute cache
)
import random


def get_prompt_with_canary(
    name: str,
    canary_percentage: int = 10,
) -> tuple[dict, str]:
    """Return a prompt and its version (production or canary)."""
    if random.randint(1, 100) <= canary_percentage:
        prompt = langfuse.get_prompt(name=name, label="canary")
        return prompt, "canary"
    else:
        prompt = langfuse.get_prompt(name=name, label="production")
        return prompt, "production"
def get_prompt_version(name: str, user_id: str) -> str:
    """Determine the prompt version via feature flag."""
    flag = feature_flags.get(f"prompt_{name}_version")
    if flag.is_enabled(user_id):
        return flag.get_variant(user_id)  # "v14", "v15"
    return "production"
trace = langfuse.trace(
    name="ticket-classification",
    metadata={
        "prompt_name": "ticket-classifier",
        "prompt_version": prompt.version,  # 14
        "prompt_label": "production",
        "model": "gpt-4o-mini",
    },
)

generation = trace.generation(
    name="classify",
    model="gpt-4o-mini",
    prompt=prompt,  # Langfuse automatically links the version
    input=messages,
    output=response,
)
# Example: automatic comparison of two prompt versions
def compare_prompt_versions(
    prompt_name: str,
    version_a: int,
    version_b: int,
    metric: str = "accuracy",
) -> dict:
    """Compare metrics for two prompt versions from Langfuse."""
    traces_a = langfuse.fetch_traces(
        name=f"{prompt_name}-eval",
        metadata={"prompt_version": version_a},
        limit=1000,
    )
    traces_b = langfuse.fetch_traces(
        name=f"{prompt_name}-eval",
        metadata={"prompt_version": version_b},
        limit=1000,
    )

    scores_a = [t.scores[metric] for t in traces_a if metric in t.scores]
    scores_b = [t.scores[metric] for t in traces_b if metric in t.scores]

    return {
        "version_a": {"version": version_a, "mean": sum(scores_a) / len(scores_a)},
        "version_b": {"version": version_b, "mean": sum(scores_b) / len(scores_b)},
        "diff": (sum(scores_b) / len(scores_b)) - (sum(scores_a) / len(scores_a)),
    }
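A raw difference in means says little about whether the change is real or noise, especially when the canary side has seen only a fraction of traffic. A quick significance check, sketched here with scipy (an assumed extra dependency, not part of the setup above), makes the comparison more trustworthy:

from scipy import stats  # assumed dependency


def is_significant(scores_a: list[float], scores_b: list[float], alpha: float = 0.05) -> bool:
    """Welch's t-test on per-trace scores; True if the difference is unlikely to be noise."""
    _, p_value = stats.ttest_ind(scores_a, scores_b, equal_var=False)
    return p_value < alpha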
# Check metrics every 15 minutes (cron job or Langfuse webhook)
def check_prompt_regression(prompt_name: str):
    current_version = langfuse.get_prompt(name=prompt_name, label="production").version
    recent_scores = get_recent_scores(prompt_name, current_version, hours=1)
    baseline = get_baseline_scores(prompt_name, current_version)

    if recent_scores["accuracy"] < baseline["accuracy"] * 0.9:  # > 10% degradation
        alert(
            channel="slack",
            message=f"Regression detected: {prompt_name} v{current_version}. "
                    f"Accuracy: {recent_scores['accuracy']:.2f} "
                    f"(baseline: {baseline['accuracy']:.2f})",
        )
        # Automatic rollback to previous version
        rollback_prompt(prompt_name, to_version=current_version - 1)
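rollback_prompt is left undefined above. One way to implement it is to move the "production" label back to the earlier version, so the app's next get_prompt call resolves to it. The sketch below assumes the Langfuse SDK's update_prompt call for re-labeling existing versions; if your SDK version does not expose it, move the label through the UI or API instead:

def rollback_prompt(prompt_name: str, to_version: int) -> None:
    """Point the "production" label back at an earlier prompt version."""
    # Assumption: the installed Langfuse SDK supports re-labeling via update_prompt
    langfuse.update_prompt(
        name=prompt_name,
        version=to_version,
        new_labels=["production"],
    )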
# prompts/components/output-format.yaml
name: output-format-json
content: |
  Respond STRICTLY in JSON. No text before or after the JSON.
  If you cannot determine the answer, return {"error": "unable to classify"}.

# prompts/components/language-rules.yaml
name: language-rules
content: |
  Response language: {{language}}.
  Do not translate proper nouns or technical terms.
def compose_prompt(*component_names: str, **variables) -> str:
    """Assemble a prompt from components."""
    parts = []
    for name in component_names:
        component = registry.get(f"components/{name}")
        content = component["content"]
        for key, value in variables.items():
            content = content.replace(f"{{{{{key}}}}}", str(value))
        parts.append(content)
    return "\n\n".join(parts)


# Usage
system_prompt = compose_prompt(
    "ticket-classifier-core",
    "output-format-json",
    "language-rules",
    categories="billing,technical,general",
    language="en",
)
{domain}-{task}-{variant}

ticket-classifier-v2
ticket-classifier-multilingual
order-summarizer-short
order-summarizer-detailed
response-generator-formal
response-generator-casual
quality-judge-relevance
quality-judge-toxicity
name: ticket-classifier
metadata:
  owner: ml-team
  created: 2026-01-15
  last_tested: 2026-03-20
  model_compatibility:
    - gpt-4o-mini
    - claude-3-5-sonnet-20241022
  avg_tokens: 450
  cost_per_call_usd: 0.002
  test_accuracy: 0.92
  dataset_size: 150
- Production logs. Real requests with labeled responses. The most valuable source.
- Manual labeling. For new prompts with no production data yet.
- Synthetic data. An LLM generates variations of existing examples. Useful for expanding edge case coverage (see the sketch after these lists).

- Variables instead of hardcoded values. Anything that might change (categories, languages, formats) goes into variables. The prompt stays stable.
- Few-shot examples at the end. Models "see" the end of the context more clearly. Placing examples after instructions improves accuracy.
- Minimal context. Every extra token in the prompt dilutes the model's attention. If an instruction doesn't affect quality, remove it.
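For the synthetic-data point above, the expansion can be as simple as asking a model to paraphrase existing labeled examples into new dataset rows. A minimal sketch, assuming the same OpenAI client used earlier and the dataset.jsonl format from the testing section; the prompt wording is illustrative, not part of the original setup:

import json

from openai import OpenAI

client = OpenAI()


def expand_example(example: dict, n_variants: int = 3) -> list[dict]:
    """Generate paraphrased variants of a labeled example, keeping its expected label."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=1.0,
        messages=[
            {
                "role": "user",
                "content": (
                    f"Rewrite this support ticket in {n_variants} different ways, "
                    f"one per line, preserving the intent:\n{example['input']}"
                ),
            }
        ],
    )
    variants = response.choices[0].message.content.strip().split("\n")
    # Each variant inherits the original's expected label for the eval dataset
    return [{"input": v.strip(), "expected": example["expected"]} for v in variants if v.strip()]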