{
  "@context": "https://schema.org",
  "@type": "Dataset",
  "name": "AI Evaluation Methodologies Library — 18 named methods for production LLM systems",
  "description": "Named, structured evaluation methodologies for production AI / LLM systems. Each entry: category (offline / online / human / LLM-judge / hybrid), what-it-measures, protocol, tooling (named tools), when-to-use, when-to-avoid, implementation cost, example metric output. Companion to the AI Architecture Patterns Library — patterns + evaluation methods together cover the full production lifecycle.",
  "version": "1.0.0",
  "datePublished": "2026-06-20",
  "dateModified": "2026-06-20",
  "license": "https://creativecommons.org/licenses/by/4.0/",
  "publisher": { "@type": "Organization", "name": "Slavin AI (SLAtech LTD)", "url": "https://www.slavin.ai" },
  "isAccessibleForFree": true,
  "keywords": ["AI evaluation", "LLM eval", "RAG eval", "RAGAS", "golden dataset", "LLM-as-judge", "A/B test", "drift", "observability"],
  "methodology": "Methods named + categorized by SLAtech from 30+ production AI deployments (2022-2026). Cost ranges, latency, and effectiveness figures are observed medians; per-domain variation is real. Tooling references are illustrative — many methods can be implemented with several tools.",
  "totalMethods": 18,
  "categories": ["offline", "online", "human-in-loop", "llm-as-judge", "drift-detection", "infrastructure"],
  "methods": [
    {
      "id": "golden-dataset",
      "name": "Golden Dataset Eval",
      "category": "offline",
      "measures": "Output correctness against curated ground-truth (input → expected output) pairs.",
      "protocol": "Curate 100-1000 representative input/output pairs from domain experts. Run system on each input; compare output to expected via exact match, F1, BLEU, or LLM-as-judge. Track pass rate over time. Update dataset quarterly as the domain evolves.",
      "tooling": ["promptfoo", "deepeval", "Langfuse evaluations", "OpenAI Evals", "custom pytest"],
      "whenToUse": "Any production AI system with definable correct answers. Mandatory before every model/prompt change. Foundation of every other eval.",
      "whenToAvoid": "Generative tasks without single correct answer (creative writing, brainstorming).",
      "implementationCost": "1-3 person-weeks initial curation; 2-4 hours per month maintenance.",
      "exampleOutput": "pass_rate=0.87, f1=0.81, regressions=3/156"
    },
    {
      "id": "llm-as-judge-rubric",
      "name": "LLM-as-Judge with Rubric",
      "category": "llm-as-judge",
      "measures": "Subjective quality dimensions (helpfulness, factuality, tone, safety) at scale.",
      "protocol": "Define a 1-5 rubric per dimension with concrete anchors. Send (input, output, rubric) to a different LLM family (e.g. Claude judges GPT outputs) with chain-of-thought reasoning. Aggregate across 100-500 samples. Calibrate against human spot-checks quarterly.",
      "tooling": ["Langfuse evaluators", "promptfoo LLM rubric", "OpenAI Evals", "DeepEval GEval"],
      "whenToUse": "Subjective quality at production scale. Drift detection. Pre-deployment quality gating.",
      "whenToAvoid": "Mathematical / objective correctness (use golden dataset). When judge LLM shares biases with the model under test.",
      "implementationCost": "1-2 person-weeks rubric design; cost ~5-10% of generation cost.",
      "exampleOutput": "helpfulness=4.2, factuality=4.6, safety=4.9, tone=3.8"
    },
    {
      "id": "ragas",
      "name": "RAGAS (RAG-specific metrics)",
      "category": "offline",
      "measures": "RAG-specific failure modes: faithfulness, answer-relevance, context-relevance, context-recall.",
      "protocol": "For each (query, retrieved-context, answer) tuple compute: (a) faithfulness — is the answer supported by context; (b) answer-relevance — does answer address the query; (c) context-precision/recall — are retrieved chunks relevant. Aggregate over evaluation set.",
      "tooling": ["RAGAS framework (Python)", "TruLens", "DeepEval RagasMetric"],
      "whenToUse": "Any RAG system in production or pre-launch. Diagnoses whether failures are retrieval-side or generation-side.",
      "whenToAvoid": "Non-RAG systems. Single-shot Q&A without external context.",
      "implementationCost": "2-3 days to wire up; ongoing cost ~10-15% of generation cost.",
      "exampleOutput": "faithfulness=0.92, answer_relevance=0.88, context_precision=0.76, context_recall=0.81"
    },
    {
      "id": "human-spot-check",
      "name": "Human Spot-Check Sampling",
      "category": "human-in-loop",
      "measures": "Ground-truth quality on real production traffic. Calibration source for all automated evals.",
      "protocol": "Sample 1-5% of production responses daily/weekly. Domain expert rates each on a fixed rubric. Track inter-rater agreement (Cohen's kappa) across 2+ reviewers monthly. Use as ground-truth to validate / recalibrate LLM-as-judge.",
      "tooling": ["Argilla", "Label Studio", "Prodigy", "in-house labeling UI"],
      "whenToUse": "Always pair with LLM-as-judge as calibration source. Mandatory in regulated domains (medical, legal, financial).",
      "whenToAvoid": "Low-stakes prototypes. When traffic volume is too low to be meaningful (< 100 responses/week).",
      "implementationCost": "4-20 person-hours per week ongoing per domain expert.",
      "exampleOutput": "human_quality=4.1±0.3, judge_correlation=0.79, samples_reviewed=87"
    },
    {
      "id": "ab-test",
      "name": "Production A/B Test",
      "category": "online",
      "measures": "Real-world business outcomes (conversion, task completion, time-to-resolution, user satisfaction) caused by AI change.",
      "protocol": "Randomize users into control (current) + variant (new model/prompt). Define primary metric (business KPI) + guardrails (safety, cost, latency). Run until statistical significance — typically 1-4 weeks. Use Bayesian or frequentist analysis.",
      "tooling": ["GrowthBook", "Optimizely", "Statsig", "in-house feature flags"],
      "whenToUse": "Any consequential AI change touching real users. Final validation gate before full rollout.",
      "whenToAvoid": "Backend-only AI (use shadow eval). When user behavior change can't be ethically randomized (e.g. medical triage).",
      "implementationCost": "1-2 person-weeks per test setup; ongoing experiment infra.",
      "exampleOutput": "conversion_lift=+4.2% (95% CI [+1.8, +6.6]), latency_p95=+45ms, safety_violations=0"
    },
    {
      "id": "shadow-eval",
      "name": "Shadow Evaluation",
      "category": "online",
      "measures": "Quality of a candidate model/prompt on real production traffic, without affecting user experience.",
      "protocol": "Run new variant in parallel with production model on real traffic. User sees only production output. Compare candidate vs production via LLM-as-judge + spot-check. Useful for backend changes where A/B isn't feasible.",
      "tooling": ["LiteLLM with shadow mode", "in-house dual-call wrapper", "Helicone"],
      "whenToUse": "Backend AI changes. Pre-A/B validation. When user-facing variation has ethical / compliance constraints.",
      "whenToAvoid": "When cost of double-calling production scale is prohibitive (large enterprise volumes can mean $$$).",
      "implementationCost": "1-2 person-weeks wiring; ongoing cost = 2x generation cost while shadow runs.",
      "exampleOutput": "candidate_judge_score=4.3 vs production=4.1; agreement_rate=82%; cost_delta=+18%"
    },
    {
      "id": "self-consistency-check",
      "name": "Self-Consistency Eval",
      "category": "offline",
      "measures": "Output stability — does the model give the same answer when asked the same question multiple ways?",
      "protocol": "Generate N=3-5 paraphrases of each evaluation query. Run all through system. Measure agreement rate (exact match for factual, semantic similarity for generative). Low consistency = brittle output.",
      "tooling": ["custom pytest", "paraphrase-mining via embeddings"],
      "whenToUse": "Reasoning + factual tasks where stability matters. Detecting prompt-fragility before production.",
      "whenToAvoid": "Inherently creative tasks where variation is desirable.",
      "implementationCost": "1-2 days setup; cost = 3-5x generation per eval run.",
      "exampleOutput": "consistency_rate=0.71, brittle_cases=23/100"
    },
    {
      "id": "drift-monitoring",
      "name": "Drift Monitoring",
      "category": "drift-detection",
      "measures": "Distribution shift over time in inputs (queries) or outputs (responses) — early warning of degradation.",
      "protocol": "Daily aggregation: topic distribution (LDA / KMeans on embeddings), query length distribution, refusal rate, response length, vocabulary entropy. Compare to baseline (first 30 days). Alert at PSI > 0.2 or KL divergence > threshold.",
      "tooling": ["Evidently AI", "Whylogs", "in-house Pandas", "Langfuse trends"],
      "whenToUse": "Production systems > 3 months old. Critical when corpus or user-base evolves rapidly.",
      "whenToAvoid": "Prototypes. Static-distribution systems.",
      "implementationCost": "1-2 person-weeks setup; ongoing storage + compute (~$50-500/month at moderate volume).",
      "exampleOutput": "topic_drift PSI=0.15 (yellow), refusal_rate=+3% MoM, response_length stable"
    },
    {
      "id": "embedding-drift",
      "name": "Embedding Drift Detection",
      "category": "drift-detection",
      "measures": "Whether the embedding model + corpus continue to represent input semantics as expected.",
      "protocol": "Maintain a fixed reference set of N=100 queries with known top-K retrieval results. Re-run embeddings + retrieval daily. Alert when top-K composition for any reference query changes by > 30%.",
      "tooling": ["custom Python + vector store", "Pinecone Vector Eval"],
      "whenToUse": "RAG systems where corpus updates frequently, or where embedding model version changes.",
      "whenToAvoid": "Static corpora with pinned embedding versions.",
      "implementationCost": "3-5 days setup; cost = 1 small embedding job per day.",
      "exampleOutput": "stability_rate=0.94, drifted_queries=6/100, root_cause=corpus_growth"
    },
    {
      "id": "red-team-eval",
      "name": "Adversarial / Red-Team Eval",
      "category": "offline",
      "measures": "Robustness against adversarial inputs: prompt injection, jailbreak attempts, off-topic distractions, encoded payloads.",
      "protocol": "Curate adversarial test suite (100-1000 prompts targeting known vulnerabilities). Run periodically; track success rate of attacks. Required by EU AI Act for GPAI systemic-risk models.",
      "tooling": ["PromptInject", "Garak (NVIDIA)", "PyRIT (Microsoft)", "custom suite"],
      "whenToUse": "User-facing LLM systems. Pre-launch hard requirement for regulated domains.",
      "whenToAvoid": "Closed internal-only systems with trusted inputs.",
      "implementationCost": "2-4 person-weeks initial; quarterly refresh + new threats.",
      "exampleOutput": "attack_success_rate=0.08, regressions vs last quarter=+0.02 (yellow)"
    },
    {
      "id": "factuality-citation-check",
      "name": "Factuality + Citation Check",
      "category": "llm-as-judge",
      "measures": "Whether stated facts in long-form output are supported by retrieved or training-data evidence.",
      "protocol": "Extract atomic factual claims from output (via secondary LLM). For each claim: search retrieved context for support, or query authoritative source. Annotate response: verified / unverified / contradicted. Aggregate per response.",
      "tooling": ["custom claim extractor + retriever", "FActScore (research)", "DeepEval FactualityMetric"],
      "whenToUse": "Long-form generation with claims (research summaries, news writing, legal opinions).",
      "whenToAvoid": "Short answers, code, structured outputs.",
      "implementationCost": "1-3 person-weeks; cost = 2-3x generation per eval run.",
      "exampleOutput": "factual_claims=12, verified=9, unverified=2, contradicted=1"
    },
    {
      "id": "latency-cost-eval",
      "name": "Latency + Cost Tracking",
      "category": "infrastructure",
      "measures": "p50/p95/p99 latency, tokens-per-request, $/request — by route, user-segment, model version.",
      "protocol": "Instrument every LLM call with tracing (request, prompt-tokens, completion-tokens, latency, model-id, route, tenant). Aggregate hourly. Alert on p95 > threshold or cost-per-day > budget.",
      "tooling": ["OpenTelemetry + Honeycomb / Datadog", "Helicone", "Langfuse", "Phoenix"],
      "whenToUse": "Always. Mandatory ops hygiene for any production system.",
      "whenToAvoid": "Never.",
      "implementationCost": "2-5 days wiring; ongoing tracing infra (~$50-2000/month).",
      "exampleOutput": "p95=1.8s, p99=4.2s, $/req=$0.012, top_route=/chat/search"
    },
    {
      "id": "user-feedback",
      "name": "Inline User Feedback (👍 / 👎 + comment)",
      "category": "online",
      "measures": "User-perceived quality. Sparse but high-value signal.",
      "protocol": "Add thumbs-up/down + optional comment after every AI response. Aggregate per route, model, time window. Investigate clusters of negatives via embeddings (similar complaint patterns).",
      "tooling": ["Helicone feedback widget", "Langfuse user feedback", "custom UI"],
      "whenToUse": "Any user-facing AI. Earliest production-quality signal you'll get.",
      "whenToAvoid": "Backend-only AI. When feedback rate is below useful threshold.",
      "implementationCost": "1-3 days UI + aggregation.",
      "exampleOutput": "positive_rate=0.84, negative_rate=0.11, abstain=0.05, top_complaint='wrong sector'"
    },
    {
      "id": "tool-call-eval",
      "name": "Tool / Function Call Eval",
      "category": "offline",
      "measures": "For tool-use agents: did the model pick the right tool, with the right arguments, at the right time?",
      "protocol": "Curate scenarios where each requires specific tool + arguments. Run agent; compare invoked tool call to expected. Measure: tool selection accuracy, argument extraction accuracy, plan completion rate, hallucinated tool rate.",
      "tooling": ["custom test harness", "DeepEval ToolCorrectness", "LangChain test utilities"],
      "whenToUse": "Any agent with function-calling. Critical before deploying agents that act on external systems.",
      "whenToAvoid": "Pure conversational agents without tools.",
      "implementationCost": "2-4 person-weeks initial scenarios; ongoing as tool surface grows.",
      "exampleOutput": "tool_correctness=0.91, arg_exact_match=0.78, plan_completion=0.83"
    },
    {
      "id": "format-adherence",
      "name": "Structured Output Adherence Eval",
      "category": "offline",
      "measures": "Compliance with declared JSON Schema, XML structure, or fixed format.",
      "protocol": "Run system on N=100-500 inputs requiring structured output. Validate each output against schema (or parse-success rate for less strict formats). Track schema-fail rate over time.",
      "tooling": ["jsonschema (Python)", "Pydantic", "Outlines", "Instructor library"],
      "whenToUse": "Any system using JSON-mode / function-calling / structured generation.",
      "whenToAvoid": "Free-form text outputs.",
      "implementationCost": "1-2 days wiring.",
      "exampleOutput": "schema_pass_rate=0.98, regenerate_rate=0.02, slowest_field=description"
    },
    {
      "id": "safety-filter-eval",
      "name": "Safety / Policy Filter Eval",
      "category": "offline",
      "measures": "Whether outputs comply with defined safety policy (PII, profanity, off-topic, brand-voice).",
      "protocol": "Run safety classifier (provider or custom) over evaluation set. Measure false-positive rate (legitimate outputs blocked) and false-negative rate (unsafe outputs passing). Calibrate threshold quarterly.",
      "tooling": ["OpenAI Moderation API", "Llama Guard 3", "NeMo Guardrails", "in-house classifier"],
      "whenToUse": "User-facing AI in regulated/branded contexts.",
      "whenToAvoid": "Internal-only / trusted-user systems.",
      "implementationCost": "2-5 days setup; quarterly recalibration.",
      "exampleOutput": "false_positive=0.04, false_negative=0.01, p95_filter_latency=80ms"
    },
    {
      "id": "regression-suite",
      "name": "Regression Suite (Pre-Deploy Gate)",
      "category": "offline",
      "measures": "Does the new version preserve previous quality on a fixed test set?",
      "protocol": "Maintain pinned regression suite (golden dataset + adversarial + format + tool calls). CI gate runs full suite on every prompt/model PR. Block merge if pass rate drops > 2pp on any sub-suite.",
      "tooling": ["GitHub Actions + promptfoo CI", "DeepEval CI integration"],
      "whenToUse": "Any production AI with version control (prompts, models, retrieval).",
      "whenToAvoid": "Pure-exploration prototypes.",
      "implementationCost": "1-2 person-weeks initial; ongoing maintenance with each new feature.",
      "exampleOutput": "all_suites_pass: yes; quality_delta: +1.2pp; cost_delta: -8%; latency_delta: +50ms"
    },
    {
      "id": "long-context-recall",
      "name": "Long-Context Recall (Needle-in-Haystack)",
      "category": "offline",
      "measures": "Whether the model can retrieve specific facts placed deep inside a long context window.",
      "protocol": "Insert known facts ('needles') at varying depths in 4K, 32K, 128K, 1M token contexts. Ask retrieval questions. Plot recall vs depth + context-length. Identify the model's effective context window.",
      "tooling": ["LangChain needle-in-haystack utilities", "custom benchmarks"],
      "whenToUse": "Systems pushing context-window limits (long PDFs, code repos, transcripts).",
      "whenToAvoid": "Systems comfortably under model's claimed context.",
      "implementationCost": "1-2 days setup; cost moderate (depends on context lengths tested).",
      "exampleOutput": "recall_at_128K=0.91, recall_at_500K=0.62, effective_ctx=~200K"
    }
  ],
  "lastReview": "2026-06-20",
  "nextReviewDate": "2026-12-20",
  "feedback": "Method missing or framing off? Contact info@slavin.ai"
}