{
  "@context": "https://schema.org",
  "@type": "Dataset",
  "name": "AI Hallucination Defense Patterns — 16 production-tested mitigations",
  "description": "Defensive design patterns specifically targeting LLM hallucination in production systems. Each pattern includes problem signature (when hallucination shows up), mechanism (how the pattern works), implementation outline (concrete steps), measured effectiveness from real deployments, and known limitations/tradeoffs. Categories: retrieval-side, generation-side, validation-side, surfacing-side, organizational.",
  "version": "1.0.0",
  "datePublished": "2026-06-20",
  "dateModified": "2026-06-20",
  "license": "https://creativecommons.org/licenses/by/4.0/",
  "publisher": { "@type": "Organization", "name": "Slavin AI (SLAtech LTD)", "url": "https://www.slavin.ai" },
  "isAccessibleForFree": true,
  "keywords": ["hallucination", "AI safety", "RAG", "defensive patterns", "LLM reliability"],
  "methodology": "Patterns derived from production observations across 30+ AI deployments in SLAtech client portfolio 2024-2026. Effectiveness ranges are observed reductions in hallucination rate (measured via LLM-as-judge sampling + user thumbs-down rate). Real-world ranges vary by domain; figures here are SLAtech's median observation.",
  "patterns": [
    {
      "id": "citation-enforcement",
      "name": "Citation Enforcement",
      "category": "generation-side",
      "problemSignature": "Model makes claims without source. User can't verify, fact-checking fails silently.",
      "mechanism": "System prompt requires every factual claim to include an inline citation referring to a retrieved chunk. Output validator rejects responses without citations.",
      "implementation": "1. Tag retrieved chunks with stable IDs (e.g. [doc-7-p3]). 2. System prompt: 'Every factual claim MUST include the chunk ID it came from in brackets'. 3. Post-process: regex check for [doc-X] presence; if absent, regenerate with stricter prompt.",
      "effectivenessRange": "40-65% hallucination reduction",
      "limitations": "Doesn't prevent the model from fabricating a citation. Pair with chunk-ID-validity check (does [doc-7-p3] actually exist?). Slower responses due to validation loop.",
      "useWhen": "RAG systems with retrievable corpus, regulated domains (medical, legal, financial)"
    },
    {
      "id": "abstention-permission",
      "name": "Abstention Permission",
      "category": "generation-side",
      "problemSignature": "Model forced to answer even when retrieved context is insufficient, fabricates to fill the gap.",
      "mechanism": "System prompt explicitly grants permission to refuse: 'If the retrieved context does not contain the answer, say \"I don't have enough information to answer that. Please try a more specific question.\"'",
      "implementation": "Add a 'refusal output' template to system prompt. Track refusal rate as a KPI — too low = model isn't refusing when it should; too high = retrieval is weak.",
      "effectivenessRange": "25-40% hallucination reduction, but raises refusal rate by 15-30%",
      "limitations": "Some users prefer wrong answers to refusals. Need product-side messaging that refusal is a quality signal.",
      "useWhen": "Production-grade RAG. Always pair with retrieval improvements to avoid runaway refusals."
    },
    {
      "id": "llm-as-judge-sampling",
      "name": "LLM-as-Judge Sampling",
      "category": "validation-side",
      "problemSignature": "Hallucinations slip through to production undetected, found only via user complaints.",
      "mechanism": "Sample 5-10% of responses, pass to a different LLM with an evaluation prompt assessing factuality. Alerts on quality drops.",
      "implementation": "1. Async pipeline: log {input, retrieved_context, response} → sample → judge LLM. 2. Judge prompt: 'Rate factuality 1-5 given the context'. 3. Dashboard tracking judge score over time.",
      "effectivenessRange": "Detection only — doesn't prevent, but catches degradation 2-7 days earlier than user reports",
      "limitations": "Judge LLM has its own biases. Use a different model family (e.g. Claude judging GPT outputs) to reduce overlap. Cost overhead: 5-10% of generation cost.",
      "useWhen": "Production deployments where quality regression detection matters more than prevention. Pair with offline eval sets."
    },
    {
      "id": "hybrid-retrieval",
      "name": "Hybrid Retrieval (sparse + dense)",
      "category": "retrieval-side",
      "problemSignature": "Pure vector search misses exact-match terms (names, codes, numbers); model fills in with hallucinated alternatives.",
      "mechanism": "Combine BM25 (lexical) with dense embedding search. Reciprocal Rank Fusion or weighted sum of scores. Recovers acronyms, codes, identifiers that pure embeddings miss.",
      "implementation": "1. Pinecone/Weaviate/PGVector for dense. 2. Elasticsearch/OpenSearch/PostgreSQL FTS for sparse. 3. RRF fusion: score = sum(1/(k + rank)) with k=60. 4. Top-N from fused.",
      "effectivenessRange": "20-35% reduction in name/code hallucinations",
      "limitations": "Two indexes to maintain. Latency overhead 50-150ms. Tuning the fusion weight is empirical.",
      "useWhen": "Domains heavy with exact-match content: technical docs, legal codes, medical codes, product catalogs."
    },
    {
      "id": "reranking",
      "name": "Cross-Encoder Reranking",
      "category": "retrieval-side",
      "problemSignature": "Top-k retrieved chunks are semantically related but not directly relevant, model synthesizes from wrong context.",
      "mechanism": "Retrieve top-50 from vector DB, then re-rank with a cross-encoder model on (query, chunk) pairs. Take top-5 for generation.",
      "implementation": "1. Vector retrieval k=50. 2. Cross-encoder (Cohere Rerank, BGE-reranker, Voyage rerank) scores all 50. 3. Pass top-5 to generation. 4. Track 'reranker moved-up rate' as a quality KPI.",
      "effectivenessRange": "25-50% reduction in irrelevant-context hallucinations",
      "limitations": "Reranker latency: 100-400ms. Cost: ~$1-5 per 1000 reranks. Worth it for high-value queries, not for cheap chat.",
      "useWhen": "RAG systems where retrieval recall is OK but precision is the bottleneck."
    },
    {
      "id": "chunk-quality-filtering",
      "name": "Chunk Quality Filtering",
      "category": "retrieval-side",
      "problemSignature": "Garbage chunks (boilerplate, navigation, broken OCR) make it into context, model hallucinates around them.",
      "mechanism": "Pre-ingestion: filter chunks by heuristics (min word count, lexical diversity, language ID, boilerplate match). Optionally: LLM judge during ingestion.",
      "implementation": "Pipeline check before embedding: word_count > 50, lang_id = expected, not boilerplate (regex against known templates), lexical_diversity > 0.4. Reject chunks failing 2+ checks.",
      "effectivenessRange": "15-30% reduction in 'context noise' hallucinations",
      "limitations": "Aggressive filtering loses recall. Tune thresholds per corpus.",
      "useWhen": "Web-scraped corpora, PDFs with OCR, mixed-quality documents."
    },
    {
      "id": "structured-output-schema",
      "name": "Structured Output Schema",
      "category": "generation-side",
      "problemSignature": "Free-form text outputs invent fields, dates, IDs that don't match schema constraints.",
      "mechanism": "Constrain generation to a JSON Schema. Use vendor-side strict mode (OpenAI response_format=json_schema, Anthropic via prompted output + retries, Gemini response_mime_type=application/json).",
      "implementation": "1. Define JSON Schema with required fields, enums, format constraints. 2. Pass to generation API in strict mode. 3. Post-validate; on schema fail, retry with error feedback.",
      "effectivenessRange": "60-85% reduction in schema-shape hallucinations, but doesn't prevent semantic hallucination (correct shape, wrong values)",
      "limitations": "Structured mode adds 100-300ms latency. Some models don't support strict mode for all schemas.",
      "useWhen": "Outputs feeding downstream code: APIs, data extraction, form filling."
    },
    {
      "id": "self-consistency-voting",
      "name": "Self-Consistency Voting",
      "category": "generation-side",
      "problemSignature": "Single sampled response is randomly good or bad, especially for reasoning tasks.",
      "mechanism": "Generate N responses with temperature > 0, vote on majority. For factual outputs, take the most-common answer; for nuanced outputs, take the median.",
      "implementation": "1. Generate 3-5 responses, temperature 0.5-0.7. 2. Extract key claims from each. 3. Vote: keep claims present in 2+ samples. 4. Format final response from kept claims.",
      "effectivenessRange": "15-35% reduction in random-flip hallucinations on reasoning tasks",
      "limitations": "Cost 3-5× per query. Doesn't help if the model is consistently wrong. Best for arithmetic, multi-step reasoning, code.",
      "useWhen": "High-stakes single-shot decisions: medical triage suggestions, legal interpretation, financial calculations."
    },
    {
      "id": "knowledge-cutoff-disclosure",
      "name": "Knowledge Cutoff Disclosure",
      "category": "surfacing-side",
      "problemSignature": "Users ask about recent events; model answers with stale training data presented as current.",
      "mechanism": "System prompt declares model's training cutoff date. Output template includes a 'last updated' indicator when the answer relies on training-data knowledge vs. retrieved real-time data.",
      "implementation": "1. Inject 'Your training data has a cutoff of YYYY-MM' in system prompt. 2. For each fact in output, tag source as [training] or [retrieved-YYYY-MM]. 3. UI surfaces these tags.",
      "effectivenessRange": "Prevents user-side misuse rather than reducing model error rate; shifts the burden appropriately",
      "limitations": "Users may ignore the disclosure. Doesn't prevent the model from confidently stating outdated information.",
      "useWhen": "Conversational AI on topics with high temporal drift (news, prices, regulations, sports, politics)."
    },
    {
      "id": "negative-example-injection",
      "name": "Negative Example Injection",
      "category": "generation-side",
      "problemSignature": "Model consistently makes the same class of error (e.g. confuses entity X for similar entity Y).",
      "mechanism": "System prompt includes specific negative examples: 'You will sometimes confuse X and Y. They are different — X is [description], Y is [description]. Always distinguish them explicitly.'",
      "implementation": "Maintain a 'known confusions' table from error logs. Top-5 by frequency are injected into system prompt. Rotate quarterly based on production data.",
      "effectivenessRange": "30-70% reduction in the specific confusion targeted",
      "limitations": "Doesn't generalize beyond the listed confusions. Prompt-bloat: keep negative examples concise (2-3 per category).",
      "useWhen": "Domain with recurring confusion patterns identified from production logs."
    },
    {
      "id": "tool-use-grounding",
      "name": "Tool-Use Grounding",
      "category": "generation-side",
      "problemSignature": "Model answers from training data when it should query a system of record (database, calculator, API).",
      "mechanism": "Provide tools for known fact classes (current date, user data lookup, calculation). System prompt: 'For X type of question, always call the get_X tool first.'",
      "implementation": "1. Define tools for each fact class: get_current_date, lookup_user, calculate_X. 2. System prompt routes question types to specific tools. 3. Function calling enforced via response_format or tools required.",
      "effectivenessRange": "50-90% reduction for fact classes covered by tools",
      "limitations": "Tool overhead: 200-800ms per call. Need solid backend for tool implementations. Doesn't cover unknown fact classes.",
      "useWhen": "Production agents with access to backend systems. Pair with citation enforcement for retrieved data."
    },
    {
      "id": "fact-extraction-validation",
      "name": "Fact Extraction + Validation",
      "category": "validation-side",
      "problemSignature": "Long-form responses contain mixed accurate and hallucinated claims; users can't tell which is which.",
      "mechanism": "Extract atomic factual claims from generated response, validate each against retrieved context or external source. Flag or remove un-validated claims.",
      "implementation": "1. Generate response. 2. Extract claims via secondary LLM call. 3. For each claim: search retrieved context for support. 4. Annotate response: [verified] or [unverified]. 5. Optional: strip unverified.",
      "effectivenessRange": "40-70% reduction in surface hallucinations",
      "limitations": "Cost 2-3× per query. Verification may itself hallucinate. Slow (multiple LLM calls).",
      "useWhen": "Output destined for publication, legal/medical context, summarization with attribution requirements."
    },
    {
      "id": "human-in-loop-gates",
      "name": "Human-in-Loop Gates",
      "category": "organizational",
      "problemSignature": "AI output drives consequential decisions; hallucination becomes a harm event.",
      "mechanism": "Insert human review at specific gates: high-confidence AI suggestion goes through automatically; below confidence threshold requires human approval before action.",
      "implementation": "1. Confidence score per AI output (from model self-rating + judge LLM). 2. Threshold defined per use case. 3. Approval queue UI for below-threshold. 4. Track override rates as KPI.",
      "effectivenessRange": "Prevents harm rather than reducing hallucination rate; shifts cost to operations",
      "limitations": "Operational overhead. Throughput cap = human reviewer capacity. Reviewer drift over time.",
      "useWhen": "Regulated domains (medical, legal, financial), high-cost decisions, irreversible actions."
    },
    {
      "id": "audit-log-with-context",
      "name": "Audit Log with Context",
      "category": "organizational",
      "problemSignature": "Hallucination causes harm; post-incident review can't reproduce because retrieved context isn't logged.",
      "mechanism": "Log every AI interaction with: input, system prompt version, retrieved chunks (full text), model version, response, downstream action. Immutable storage with retention per regulator requirements.",
      "implementation": "1. Wrapper around generation call captures full context. 2. Persist to append-only log (S3 versioned bucket, or DB with no-update policy). 3. Retention per jurisdiction (EU AI Act: high-risk = full lifecycle).",
      "effectivenessRange": "Enables root-cause analysis and regulatory response, doesn't reduce hallucination directly",
      "limitations": "Storage cost. Privacy implications if logs contain PII (encrypt + access control).",
      "useWhen": "EU AI Act high-risk, US sectoral regulation (HIPAA, finserv), any deployment where 'how did this happen' matters."
    },
    {
      "id": "drift-monitoring",
      "name": "Drift Monitoring",
      "category": "validation-side",
      "problemSignature": "Model + retrieval system worked well at launch, slowly degrades as data and queries change.",
      "mechanism": "Track input distribution (topics, query length, vocab), retrieved-chunk distribution, output quality KPI (refusal rate, judge score). Alert on significant drift.",
      "implementation": "1. Daily aggregation of input/output features. 2. Compare to baseline (first 30 days). 3. PSI (Population Stability Index) or KL divergence for distribution shift. 4. Alert at PSI > 0.2.",
      "effectivenessRange": "Detection only — flags degradation 1-4 weeks earlier than user-reported quality drop",
      "limitations": "False alarms from legitimate domain shift. Tuning thresholds is empirical.",
      "useWhen": "Production deployments operating >3 months."
    },
    {
      "id": "version-pinning-everything",
      "name": "Version-Pin Everything",
      "category": "organizational",
      "problemSignature": "Vendor silently updates model; behavior changes overnight; hard-won prompts no longer work.",
      "mechanism": "Pin model version (e.g. gpt-4o-2024-08-06, not gpt-4o), embedding model version, prompt version, retrieval config version. Track in audit log per request.",
      "implementation": "1. Config: explicit version everywhere. 2. CI gate: changing pin requires PR + eval pass. 3. Audit log records version in use per request.",
      "effectivenessRange": "Prevents surprise regression rather than reducing hallucination rate",
      "limitations": "Pinned versions get deprecated; must plan migrations. New models often available only via pinned versions for stability vs. 'latest' for capability.",
      "useWhen": "Any production deployment. This is hygiene, not a fancy pattern."
    }
  ],
  "totalPatterns": 16,
  "categories": ["retrieval-side", "generation-side", "validation-side", "surfacing-side", "organizational"],
  "lastReview": "2026-06-20",
  "nextReviewDate": "2026-12-20",
  "feedback": "Pattern missing or effectiveness range off? Contact info@slavin.ai"
}
