{
  "@context": "https://schema.org",
  "@type": "Dataset",
  "name": "AI Architecture Patterns Library — 18 named production patterns",
  "description": "Named, structured architectural patterns for production AI systems. Each entry: id (slug referenced by relatedPatterns), name, category, intent (the problem it solves), structure (how components fit), participants (which components play which role), consequences (what you gain + what you trade), when-to-use (concrete trigger conditions), when-to-avoid, related patterns. Patterns are derived from SLAtech production deployments + cross-validated against published practitioner literature.",
  "version": "1.0.0",
  "datePublished": "2026-06-20",
  "dateModified": "2026-06-20",
  "license": "https://creativecommons.org/licenses/by/4.0/",
  "publisher": { "@type": "Organization", "name": "Slavin AI (SLAtech LTD)", "url": "https://www.slavin.ai" },
  "isAccessibleForFree": true,
  "keywords": ["AI architecture", "design patterns", "RAG patterns", "agent patterns", "production AI", "AI system design"],
  "methodology": "Patterns named + categorized by SLAtech across 30+ production AI deployments (2022-2026). Each pattern was applied in 2+ independent deployments before inclusion. Consequence claims (latency, cost, quality) are empirical from these deployments and may vary by domain. Pattern names follow the Gang-of-Four convention (intent + structure) where viable.",
  "totalPatterns": 18,
  "categories": ["retrieval", "generation", "agent", "evaluation", "ops", "safety"],
  "patterns": [
    {
      "id": "citation-grounded-rag",
      "name": "Citation-Grounded RAG",
      "category": "retrieval",
      "intent": "Generate LLM responses that include verifiable citations to retrieved source documents so users (or downstream validators) can confirm the claim.",
      "structure": "Retriever → context with stable chunk IDs → LLM with system prompt requiring inline citations → output validator that rejects responses without [doc-X] markers → optional auto-retry with stricter prompt.",
      "participants": ["Retriever", "Citation-aware system prompt", "LLM", "Output validator", "Retry orchestrator"],
      "consequences": "+40-65% hallucination reduction (per SLAtech hallucination defense dataset). +200-400ms latency for validation + retry. ~10% of responses regenerated. Users see auditable references; legal / regulated domains satisfied.",
      "whenToUse": "RAG systems where users could harm themselves or others by acting on fabricated information (medical, legal, financial, compliance). Also: any context requiring 'show your work' justification.",
      "whenToAvoid": "Casual conversational UX where citation overhead disrupts flow. Domains where source documents are not citation-grade themselves (user-generated content corpora).",
      "relatedPatterns": ["hybrid-retrieval", "abstention-permission"]
    },
    {
      "id": "hybrid-retrieval",
      "name": "Hybrid Retrieval (Sparse + Dense)",
      "category": "retrieval",
      "intent": "Retrieve relevant chunks for queries containing BOTH semantic concepts and exact-match tokens (codes, names, identifiers).",
      "structure": "Query → parallel paths: (a) BM25 / lexical index returns top-N, (b) dense vector index returns top-N → reciprocal rank fusion (RRF) merges to top-K → cross-encoder reranking (optional) → context for LLM.",
      "participants": ["Sparse index (Elasticsearch / Postgres FTS)", "Dense index (Pinecone / PGVector / Weaviate)", "RRF fusion", "Reranker"],
      "consequences": "+20-35% recall on queries with exact-match terms. Two indexes to maintain. +50-150ms latency overhead from parallel queries.",
      "whenToUse": "Corpora heavy with technical IDs (medical codes, legal citations, product SKUs, error codes, configuration keys). Multi-language corpora where one language tokenizes poorly in embeddings.",
      "whenToAvoid": "Pure conversational corpora with no exact-match value. Single-language Western text with strong embedding models.",
      "relatedPatterns": ["reranking-stage", "chunk-quality-filter"]
    },
    {
      "id": "reranking-stage",
      "name": "Cross-Encoder Reranking Stage",
      "category": "retrieval",
      "intent": "Reduce noise in retrieved context by re-scoring top-N candidates with a precision-focused model before feeding to LLM.",
      "structure": "Vector retriever k=50 → cross-encoder reranks all 50 on (query, chunk) pairs → top-5 to LLM.",
      "participants": ["Vector retriever (recall optimized)", "Cross-encoder (precision optimized: Cohere Rerank, BGE-reranker, Voyage)", "LLM"],
      "consequences": "+25-50% reduction in irrelevant-context hallucinations. +100-400ms reranking latency. Cost: ~$1-5 per 1K reranks.",
      "whenToUse": "Retrieval recall is OK (top-50 contains the answer) but precision is the bottleneck (LLM gets distracted by noise). High-value queries where latency overhead is acceptable.",
      "whenToAvoid": "Cheap chat where per-call cost matters. Very narrow corpora where retrieval is already precise.",
      "relatedPatterns": ["hybrid-retrieval", "chunk-quality-filter"]
    },
    {
      "id": "chunk-quality-filter",
      "name": "Chunk Quality Filter",
      "category": "retrieval",
      "intent": "Prevent low-quality chunks (boilerplate, navigation, broken OCR) from reaching the LLM context.",
      "structure": "Ingestion pipeline → quality heuristics (min word count, lexical diversity, language ID, boilerplate match) → reject failing chunks before embedding. Optionally: LLM-as-judge for borderline chunks.",
      "participants": ["Quality scorer (rules-based)", "Optional LLM judge for borderline cases", "Vector index"],
      "consequences": "+15-30% reduction in context-noise hallucinations. Aggressive filtering loses recall — tune thresholds per corpus.",
      "whenToUse": "Web-scraped corpora, OCR'd PDFs, mixed-quality documents. Especially when chunk count is large (>1M) and quality varies widely.",
      "whenToAvoid": "Curated, hand-edited corpora. Small corpora where every chunk matters.",
      "relatedPatterns": ["hybrid-retrieval"]
    },
    {
      "id": "tool-use-agent",
      "name": "Tool-Use Agent",
      "category": "agent",
      "intent": "Let an LLM perform actions on external systems (databases, APIs, file system) by structured function calls.",
      "structure": "LLM with function-calling enabled → JSON tool descriptions → tool executor (sandboxed) → result back to LLM → continue or terminate. State maintained in conversation context.",
      "participants": ["Function-calling LLM", "Tool registry with JSON Schema", "Sandboxed tool executor", "Result formatter"],
      "consequences": "Unlocks operational capability beyond pure text generation. Higher risk: tool calls can have side effects. Latency 200-800ms per tool call. Requires sandboxing.",
      "whenToUse": "Workflows requiring data lookup + computation + action (book a flight, query DB + summarize, run code + interpret).",
      "whenToAvoid": "Casual chat. Compliance-critical writes without human approval (use Human-in-Loop Gates instead).",
      "relatedPatterns": ["human-in-loop-gates", "model-sandboxing", "planner-executor"]
    },
    {
      "id": "planner-executor",
      "name": "Planner-Executor",
      "category": "agent",
      "intent": "Decompose a complex user goal into a plan (sequence of subtasks) then execute each subtask, possibly with different specialized models.",
      "structure": "Goal → Planner LLM produces plan (typed steps) → Executor for each step (may be specialized model / tool / sub-agent) → Synthesizer combines results.",
      "participants": ["Planner LLM (often larger model)", "Step executors (specialized models / tools)", "Synthesizer LLM", "Plan validator (optional)"],
      "consequences": "Higher quality on complex multi-step tasks. Higher cost (multiple LLM calls). Plan errors compound — needs plan validator.",
      "whenToUse": "Multi-step tasks with clear decomposition (research reports, complex data analysis, multi-file code refactors).",
      "whenToAvoid": "Simple one-shot queries. Tasks where the plan space is too open-ended to validate.",
      "relatedPatterns": ["tool-use-agent", "self-consistency-voting"]
    },
    {
      "id": "self-consistency-voting",
      "name": "Self-Consistency Voting",
      "category": "generation",
      "intent": "Improve accuracy on reasoning tasks by sampling multiple LLM responses and voting on the answer.",
      "structure": "Same prompt → N independent completions (temperature 0.5-0.7) → extract key claims from each → majority vote OR median for numeric → final response.",
      "participants": ["LLM (sampled N times)", "Claim extractor", "Voter / aggregator"],
      "consequences": "+15-35% reduction in random-flip hallucinations on reasoning tasks. Cost N× per query. Doesn't help if model is consistently wrong.",
      "whenToUse": "High-stakes single-shot decisions: arithmetic, multi-step reasoning, code review, medical triage suggestions.",
      "whenToAvoid": "Cheap chat. Tasks where the model has stable consistent output regardless of sampling.",
      "relatedPatterns": ["planner-executor", "llm-as-judge"]
    },
    {
      "id": "structured-output-schema",
      "name": "Structured Output Schema (Strict)",
      "category": "generation",
      "intent": "Force LLM output to match a strict JSON Schema for downstream code consumption.",
      "structure": "Request → JSON Schema attached to LLM call (OpenAI response_format=json_schema with strict=true, Anthropic via prompting + validators, Gemini response_mime_type=application/json + response_schema) → post-validate → on schema fail, retry with error feedback.",
      "participants": ["LLM with structured output mode", "JSON Schema definition", "Validator", "Retry orchestrator"],
      "consequences": "+60-85% reduction in schema-shape hallucinations (shape only: a well-formed response may still contain wrong values). +100-300ms latency.",
      "whenToUse": "LLM output feeds downstream code: APIs, data extraction, form filling, function calls.",
      "whenToAvoid": "Free-form creative text output. Cases where over-constraining damages quality.",
      "relatedPatterns": ["citation-grounded-rag", "tool-use-agent"]
    },
    {
      "id": "abstention-permission",
      "name": "Abstention Permission",
      "category": "generation",
      "intent": "Allow (encourage) the LLM to refuse to answer when retrieved context is insufficient instead of fabricating.",
      "structure": "System prompt explicitly grants permission: \"If the retrieved context does not contain the answer, say: 'I don't have enough information to answer that.'\" Track refusal rate as KPI.",
      "participants": ["LLM with permission-granting system prompt", "Refusal-rate monitor", "Optional retrieval improvement loop"],
      "consequences": "+25-40% hallucination reduction. +15-30% refusal rate. Some users prefer answers to refusals — product messaging matters.",
      "whenToUse": "RAG production systems where wrong answers are worse than no answers (medical, legal, financial, compliance).",
      "whenToAvoid": "Conversational UX where refusal kills engagement. Brainstorming / creative use cases.",
      "relatedPatterns": ["citation-grounded-rag"]
    },
    {
      "id": "llm-as-judge",
      "name": "LLM-as-Judge Sampling",
      "category": "evaluation",
      "intent": "Quality-check production LLM outputs at low cost without human review of every response.",
      "structure": "Production response → 5-10% sampled → judge LLM with evaluation rubric scores factuality / relevance / safety → dashboard tracking judge score over time. Optional: judge LLM can flag for human review.",
      "participants": ["Production LLM", "Sample selector", "Judge LLM (different model family preferred)", "Evaluation dashboard"],
      "consequences": "Detection (not prevention) of degradation 2-7 days earlier than user complaints. Judge has its own biases. +5-10% of generation cost.",
      "whenToUse": "Production deployments where quality regression detection matters. Pair with offline eval sets for ground truth.",
      "whenToAvoid": "Single-user / hobby projects. Cases where you can review 100% of responses cheaply.",
      "relatedPatterns": ["drift-monitoring", "fact-extraction-validation"]
    },
    {
      "id": "fact-extraction-validation",
      "name": "Fact Extraction + Validation",
      "category": "evaluation",
      "intent": "For long-form outputs, extract individual factual claims and validate each separately before presenting.",
      "structure": "LLM response → secondary LLM extracts atomic claims → for each claim, search retrieved context for support → annotate response: [verified] / [unverified] / [partial] → optionally strip unverified before presenting.",
      "participants": ["Generator LLM", "Claim extractor (secondary LLM)", "Verifier", "Annotator / stripper"],
      "consequences": "+40-70% surface-hallucination reduction. Cost 2-3× per query. Verification may itself hallucinate.",
      "whenToUse": "Output destined for publication, legal/medical context, summarization with attribution requirements.",
      "whenToAvoid": "Casual chat. Tasks where the response is structurally not decomposable into claims.",
      "relatedPatterns": ["citation-grounded-rag", "llm-as-judge"]
    },
    {
      "id": "drift-monitoring",
      "name": "Drift Monitoring",
      "category": "ops",
      "intent": "Detect slow degradation in AI system quality before users complain.",
      "structure": "Daily aggregation of input features (topic, query length, vocab) + output features (refusal rate, judge score, response length) → compare to baseline window (first 30 days) → PSI or KL divergence → alert at PSI > 0.2.",
      "participants": ["Feature aggregator", "Baseline snapshot", "Drift detector (PSI / KL)", "Alerting"],
      "consequences": "Detection only. Flags 1-4 weeks earlier than user-reported quality drop. False alarms from legitimate domain shift.",
      "whenToUse": "Production AI systems running >3 months. Critical-path systems where regression has business cost.",
      "whenToAvoid": "Prototypes. Systems with extremely stable input distributions.",
      "relatedPatterns": ["llm-as-judge", "version-pinning"]
    },
    {
      "id": "version-pinning",
      "name": "Version-Pin Everything",
      "category": "ops",
      "intent": "Prevent silent vendor model updates from breaking production behavior overnight.",
      "structure": "Config-as-code: pin model version (gpt-4o-2024-08-06, not gpt-4o), embedding model version, prompt version, retrieval config. CI gate: any pin change requires PR + eval pass. Audit log records version per request.",
      "participants": ["Config repository", "CI eval gate", "Audit log"],
      "consequences": "Prevents surprise regression. Pinned versions get deprecated; must plan migrations.",
      "whenToUse": "Any production AI deployment. This is hygiene, not optional.",
      "whenToAvoid": "Never avoid this pattern — but stay informed about vendor deprecation schedules.",
      "relatedPatterns": ["drift-monitoring", "human-in-loop-gates"]
    },
    {
      "id": "human-in-loop-gates",
      "name": "Human-in-Loop Gates",
      "category": "safety",
      "intent": "Insert human review at specific points when AI output drives consequential decisions.",
      "structure": "AI output + confidence score → if confidence > threshold, automatic action; otherwise (at or below threshold), queue for human approval. Track override rates + approval latency.",
      "participants": ["AI generator + confidence scorer", "Approval queue UI", "Human reviewer pool", "Override-rate dashboard"],
      "consequences": "Prevents harm. Operational overhead (reviewer capacity = throughput cap). Reviewer drift over time.",
      "whenToUse": "Regulated domains (medical, legal, financial), high-cost decisions, irreversible actions.",
      "whenToAvoid": "Latency-sensitive UX. High-throughput low-stakes decisions (use automatic action + sampled review instead).",
      "relatedPatterns": ["tool-use-agent", "audit-log-with-context"]
    },
    {
      "id": "audit-log-with-context",
      "name": "Audit Log with Full Context",
      "category": "safety",
      "intent": "After-the-fact reproducibility of any AI decision — required for post-incident review + regulatory response.",
      "structure": "Wrapper around generation call captures: input, system prompt version, retrieved chunks (full text), model + embedding versions, response, downstream action. Persist to immutable storage (S3 versioned bucket / append-only DB).",
      "participants": ["Generation wrapper", "Immutable storage", "Retention policy enforcer", "Audit query UI"],
      "consequences": "Enables root-cause analysis and regulatory response. Storage cost. PII implications if logs contain personal data (encrypt + access control).",
      "whenToUse": "EU AI Act high-risk systems. US sectoral regulation (HIPAA, finserv). Any deployment where 'how did this happen' has business / legal value.",
      "whenToAvoid": "Hobby projects. Cases with strict no-log requirements (some private medical workflows).",
      "relatedPatterns": ["human-in-loop-gates", "version-pinning"]
    },
    {
      "id": "model-sandboxing",
      "name": "Model Sandboxing for Agents",
      "category": "safety",
      "intent": "Constrain what an LLM agent can actually DO when given code-execution or browser-use tools.",
      "structure": "Tool calls run in sandboxed environment: file-system isolation (jail), network egress rules (deny-by-default + explicit allow-list), capability allow-list (no fork, no shell escape). Container per session.",
      "participants": ["Sandbox runtime (Firecracker / gVisor / WASM)", "Egress firewall", "Capability allow-list", "Session lifecycle manager"],
      "consequences": "Bounds blast radius of malicious / hallucinated tool calls. +500ms-1s overhead per sandbox start. Some legitimate tool uses blocked.",
      "whenToUse": "Agents with code-execution, browser, or file-system tools. Anywhere users can craft inputs that influence tool selection.",
      "whenToAvoid": "Pure conversational agents with no consequential tools. Single-user trusted-internal workflows.",
      "relatedPatterns": ["tool-use-agent", "human-in-loop-gates"]
    },
    {
      "id": "tenant-isolation-rag",
      "name": "Tenant-Isolation RAG",
      "category": "retrieval",
      "intent": "Multi-tenant SaaS: ensure tenant A's data never appears in tenant B's retrieved context, even with shared infrastructure.",
      "structure": "Single vector index with tenant_id metadata filter at query time. Per-tenant embedding namespaces (logical separation). Application-layer enforcement: tenant_id from auth token, never user input. Optional: separate physical indexes for highest-tier customers.",
      "participants": ["Vector index with metadata filter", "Auth-derived tenant_id", "Query builder enforcing tenant_id"],
      "consequences": "Cost-effective vs separate indexes per tenant. Risk: filter-bypass bug = catastrophic data leak. Requires test-suite specifically targeting tenant boundaries.",
      "whenToUse": "Multi-tenant SaaS AI features. Customer-facing AI where data confidentiality is contractual.",
      "whenToAvoid": "Single-tenant deployments. Cases where physical isolation is regulator-mandated.",
      "relatedPatterns": ["audit-log-with-context", "human-in-loop-gates"]
    },
    {
      "id": "prompt-cache-warming",
      "name": "Prompt Cache Warming",
      "category": "ops",
      "intent": "Reduce LLM cost + latency on workflows with repeated long system prompts or retrieved context.",
      "structure": "Identify cacheable prefix (system prompt + frequently-recurring context blocks) → use vendor cache primitive (OpenAI prompt cache, Anthropic cache_control, Gemini prefix cache) → structure prompt as cache-friendly: stable prefix → varying suffix.",
      "participants": ["Prompt template with stable prefix", "Vendor cache API", "Cache hit-rate monitor"],
      "consequences": "10-75% cost discount on cached portion (vendor-dependent). 30-60% latency reduction on cache hit. Cache TTL is short (typically minutes), so high-frequency workflows benefit most.",
      "whenToUse": "RAG with frequently-repeated context. Agent loops with persistent system prompts. High-volume API workflows.",
      "whenToAvoid": "Low-volume workflows (cache TTL expires before reuse). Highly variable prompts.",
      "relatedPatterns": ["citation-grounded-rag", "tool-use-agent"]
    }
  ],
  "lastReview": "2026-06-20",
  "nextReviewDate": "2026-12-20",
  "feedback": "Pattern missing or framing off? Contact info@slavin.ai"
}
