```ts
// What I recommend tracking for each early user
interface EarlyUserContext {
  userId: string;
  role: string;            // "support", "ops", "sales"
  primaryUseCase: string;  // "answer customer questions"
  feedbackChannel: string; // direct line to eng team
}
```

```ts
// Minimum viable trace structure
interface AgentTrace {
  runId: string;
  userId: string;
  query: string;
  toolsConsidered: string[];
  toolSelected: string;
  contextSummary: string;
  response: string;
  userFeedback: "accepted" | "edited" | "rejected" | null;
  latencyMs: number;
}
```

```ts
// Before: I see this constantly
const tool = {
  name: "handleData",
  description: "Handles data operations"
}

// After: Clear enough for the model to reason about
const tool = {
  name: "createShipmentFromOrder",
  description: "Creates a new shipment record from an existing order. Requires orderId. Returns shipmentId and tracking number."
}
```

```ts
// Example eval case from a real client failure
const evalCase = {
  id: "shipment-status-check",
  query: "What's the status of order 12345?",
  expectedTool: "getShipmentByOrderId",
  expectedBehavior: "Return actual status from database",
  failureWeObserved: "Agent said 'delivered' without checking",
  groundTruth: "in_transit"
}
```

```ts
interface AgentMetrics {
  // Did we pick the right tool?
  toolSelectionAccuracy: number;
  // Did we retrieve relevant docs?
  retrievalRecall: number;
  // Did the final answer match ground truth?
  answerCorrectness: number;
  // Did we cite the right sources?
  groundingAccuracy: number;
  // Did the user accept the response?
  userAcceptanceRate: number;
}
```

```ts
const canUseAgent = (user: User): boolean => {
  // Phase 1: Named early adopters
  if (ROLLOUT_PHASE === 1) {
    return earlyAdopters.includes(user.id);
  }
  // Phase 2: Specific teams
  if (ROLLOUT_PHASE === 2) {
    return user.team === "support" || user.team === "ops";
  }
  // Phase 3: Everyone
  return true;
}
```

```text
Week 1: 87% tool accuracy, 72% answer correctness
Week 2: 85% tool accuracy, 75% answer correctness
Week 3: 83% tool accuracy, 71% answer correctness
Week 4: 79% tool accuracy, 68% answer correctness  ← investigate
```

- 3 people who actually need the tool for real work
- Different roles (support, ops, sales)
- Direct channel to the eng team
- What query did the user send?
- What tools did the agent consider?
- Which tool did it pick and why?
- What context was in the window?
- What was the final response?
- Did the user accept, edit, or reject it?
- Wrong tool selection: Agent picked `searchOrders` when it should have picked `searchShipments`
- Missing context: Agent couldn't answer because the right doc wasn't retrieved
- Hallucinations: Agent made up data that doesn't exist
- Premature stopping: Agent gave up too early
- Slow responses: Anything over 10 seconds feels broken
- Bad tool names and descriptions
- Missing or wrong context
- Retrieval pulling irrelevant docs
- Ship with confidence, not hope
- Have real data to show leadership
- Know exactly where to focus engineering effort
- Build user trust instead of destroying it
- Week 0: Instrument everything
- Week 1: 3 users, review every trace, build failure spreadsheet
- Week 2: Fix perception issues (tools, context, retrieval)
- Week 3: Build evals from failures, establish baselines
- Week 4: Expand to 10 users, new roles, new use cases
- Week 5: Fix new failures, update evals
- Week 6: Expand to full internal team
- Week 7+: Monitor drift, harden edge cases
- When metrics stabilize: Consider external rollout