{
  "$schema": "https://www.slavin.ai/data/ai-incident-response-playbook-schema.json",
  "dataset": {
    "name": "AI Incident Response Playbook",
    "version": "2026-06",
    "publisher": "Slavin AI (SLAtech LTD)",
    "publisherUrl": "https://www.slavin.ai/",
    "license": "CC-BY-4.0",
    "lastUpdated": "2026-06-14",
    "description": "Structured incident response playbook for production AI systems. 14 incident classes that production AI experiences post-launch — each with detection signals, immediate triage actions, communication template, root cause investigation pattern, and the prevention update that closes the gap. Companion to the Failure Modes Catalog: failure modes are caught before launch; incidents are what reaches production despite review.",
    "audience": ["CISO", "AI Risk Lead", "AI Platform Team", "Incident Commander", "Engineering Lead"],
    "methodology": "Compiled from 150+ Slavin/SLAtech engagements 2022-2026 where production AI systems experienced incidents requiring documented response. Each incident class has been observed at least twice across independent clients. Playbook steps reflect what was actually done; entries are documented as patterns not specific events."
  },
  "severity_levels": [
    { "level": "P0", "name": "Critical", "description": "Customer-facing outage or imminent financial / regulatory exposure. Page on-call immediately." },
    { "level": "P1", "name": "High", "description": "Significant degradation or risk; mitigate within hours." },
    { "level": "P2", "name": "Medium", "description": "Quality / cost issue affecting a subset of traffic; mitigate within one business day." },
    { "level": "P3", "name": "Low", "description": "Observed pattern requiring investigation but no active customer impact." }
  ],
  "incidents": [
    {
      "id": "INC-01",
      "name": "LLM cost explosion",
      "default_severity": "P1",
      "detection_signals": ["Spend metric exceeds daily budget by >2x", "Token-per-query metric step-change", "Vendor billing alert"],
      "immediate_triage": ["Activate per-key rate limit at 25% of normal", "Identify top-N consumers from logs", "Roll back recent prompt/template changes if correlated"],
      "communication_template": "Cost anomaly detected on LLM service at HH:MM. Throttling in effect; user impact is rate-limited responses. Root cause investigation in progress; ETA for normal capacity within Xh.",
      "root_cause_patterns": ["Loose top-k in RAG retrieval", "Removed reranker by mistake", "Prompt change ballooning context", "Abuse / scraping pattern"],
      "prevention_update": "Add token-per-query alarm at 1.5x baseline. Enforce per-key rate limit and budget cap. Make the reranker mandatory by policy."
    },
    {
      "id": "INC-02",
      "name": "Hallucination at scale on a specific topic",
      "default_severity": "P1",
      "detection_signals": ["Customer complaint cluster mentioning same wrong fact", "Quality eval regression on a topic subset", "Citation rate drop on retrieved chunks"],
      "immediate_triage": ["Add hard-coded refusal for the topic pending fix", "Inspect retrieval against ground truth on representative queries", "Confirm RAG corpus actually contains the correct info"],
      "communication_template": "We have identified incorrect responses on topic X. The assistant is now declining the topic while we fix retrieval. ETA for return to service: Xh.",
      "root_cause_patterns": ["Retrieval not surfacing the right chunk", "Embedding model misalignment on terminology", "Corpus stale or missing the fact", "System prompt allowing speculation"],
      "prevention_update": "Add golden-set evaluation including the topic. Enforce citation requirement in prompt. Schedule corpus freshness audit."
    },
    {
      "id": "INC-03",
      "name": "Silent model drift (vendor update)",
      "default_severity": "P2",
      "detection_signals": ["Eval score drop without code change", "Output distribution shift", "Customer feedback shift over a week"],
      "immediate_triage": ["Pin to a known-good model version if vendor allows", "A/B compare current vs prior version on golden set", "Roll back affected prompts to the version-pinned model"],
      "communication_template": "Internal: model drift suspected on vendor X. Performance regression of Y% observed. Pinning to prior version while we investigate.",
      "root_cause_patterns": ["Vendor silently updated underlying model", "Vendor deprecated feature relied on by prompt", "Tokenizer changed under the same model name"],
      "prevention_update": "Version-pin every model in production. Subscribe to vendor changelog. Add eval to CI to catch drift."
    },
    {
      "id": "INC-04",
      "name": "Prompt injection through retrieved content",
      "default_severity": "P0",
      "detection_signals": ["Anomalous tool calls", "Output containing data not from current user", "Suspicious system-prompt leakage in responses"],
      "immediate_triage": ["Disable tool calling for retrieved-content sessions", "Quarantine the suspect documents", "Force re-auth on affected sessions"],
      "communication_template": "Internal P0: prompt injection vector detected through document upload. Tools disabled while we investigate scope. No customer notification yet pending impact assessment.",
      "root_cause_patterns": ["Retrieved content treated as instruction", "User-uploaded document not filtered", "RAG corpus polluted with adversarial entries"],
      "prevention_update": "System prompt that forbids following retrieved instructions. Output validation. Content scanning before ingestion. Tool-call allowlist independent of LLM output."
    },
    {
      "id": "INC-05",
      "name": "PII leak in LLM output",
      "default_severity": "P0",
      "detection_signals": ["DLP alert", "Customer report", "Audit log entry showing PII in response"],
      "immediate_triage": ["Disable affected endpoint", "Engage legal and privacy officer", "Identify scope: which users, what data, how many calls"],
      "communication_template": "Internal P0: PII appearance in LLM output identified on endpoint X. Endpoint disabled. Legal engaged. Detailed scope assessment within 4h.",
      "root_cause_patterns": ["PII in RAG corpus that should have been redacted", "Insufficient output filtering", "Cross-tenant retrieval leak", "Model trained on PII (vendor side)"],
      "prevention_update": "PII scrubbing on ingestion AND on output. Tenant filter at retrieval layer. Output DLP scan. Periodic red-team."
    },
    {
      "id": "INC-06",
      "name": "Vendor outage",
      "default_severity": "P1",
      "detection_signals": ["Vendor status page yellow/red", "Spike in 5xx from vendor API", "Latency p95 exceeds threshold"],
      "immediate_triage": ["Activate self-hosted fallback OR alternative vendor via gateway", "Set feature flag to degraded mode for non-critical AI paths", "Update status page"],
      "communication_template": "Some AI features are running in degraded mode due to upstream vendor outage. Full functionality will return when the vendor recovers; we will update status hourly.",
      "root_cause_patterns": ["Vendor incident", "Regional vendor outage with no multi-region fallback", "Rate-limit hit because our usage pattern changed"],
      "prevention_update": "Gateway-based multi-vendor with automatic failover. Self-hosted fallback for compliance-critical paths. Graceful degradation feature flags."
    },
    {
      "id": "INC-07",
      "name": "Latency degradation",
      "default_severity": "P2",
      "detection_signals": ["p95 over SLA for >15 min", "Queue depth growing", "Customer satisfaction metric drop"],
      "immediate_triage": ["Add caching layer for hot prompts", "Shed non-critical traffic", "Check vendor status; switch region if applicable"],
      "communication_template": "We are seeing higher response times in AI features. Mitigations in effect; normal latency expected within Xh.",
      "root_cause_patterns": ["Vendor regional latency", "Context window grew due to corpus growth", "Reranker is the bottleneck", "Cold cache after deploy"],
      "prevention_update": "Latency SLO alarms. Hot-prompt cache. Streaming responses where applicable. Reranker capacity planning."
    },
    {
      "id": "INC-08",
      "name": "Quality regression after prompt change",
      "default_severity": "P2",
      "detection_signals": ["Eval score drop after deploy", "Customer feedback shift", "Internal QA flag"],
      "immediate_triage": ["Roll back the prompt to previous version", "Compare A/B against rolled-back version", "Annotate failing samples for analysis"],
      "communication_template": "Internal: quality regression detected on feature X after prompt change at time Y. Reverted; investigating root cause.",
      "root_cause_patterns": ["Prompt change tested only on happy path", "Edge cases broke", "Negative-space examples not in eval"],
      "prevention_update": "Mandatory golden-set eval before prompt deploy. Negative-space examples in eval. Canary deployment for prompts."
    },
    {
      "id": "INC-09",
      "name": "Tool-use loop / runaway agent",
      "default_severity": "P1",
      "detection_signals": ["Tool call count spike per session", "Long-running session metric", "Cost-per-session anomaly"],
      "immediate_triage": ["Hard cap tool calls per session", "Kill long sessions over a limit", "Disable specific tool if it is the loop driver"],
      "communication_template": "Internal: agent loop detected. Per-session call cap reduced. Investigation in progress.",
      "root_cause_patterns": ["Tool returns ambiguous result leading to retry", "Plan-and-execute prompt encourages repetition", "No completion criterion in prompt"],
      "prevention_update": "Per-session call budget. Completion criterion in system prompt. Tool result validation before next step."
    },
    {
      "id": "INC-10",
      "name": "Compliance audit finding on AI",
      "default_severity": "P1",
      "detection_signals": ["External or internal audit finding", "Regulator inquiry", "Customer compliance review failure"],
      "immediate_triage": ["Document scope and timeline of finding", "Engage compliance and legal", "Suspend specific affected feature if recommended"],
      "communication_template": "Internal: audit finding requires response by date D. Engaging legal and compliance. Customer-facing communication pending legal review.",
      "root_cause_patterns": ["Missing audit log", "Insufficient evidence for compliance baseline control", "Configuration drift since last audit"],
      "prevention_update": "Audit log retention review. Map every control in the 12-control baseline to evidence. Schedule quarterly pre-audit."
    },
    {
      "id": "INC-11",
      "name": "Embedding corpus poisoning",
      "default_severity": "P0",
      "detection_signals": ["Anomalous response cluster from specific retrieval pattern", "Unusual documents appearing high in retrieval", "External report of malicious upload"],
      "immediate_triage": ["Disable user-content ingestion temporarily", "Re-embed the corpus from a trusted source", "Quarantine suspicious documents"],
      "communication_template": "Internal P0: potential corpus poisoning. User-content ingestion disabled. Re-indexing in progress.",
      "root_cause_patterns": ["Unauthenticated user-content ingestion", "Insufficient content moderation", "Adversarial uploader exploiting public path"],
      "prevention_update": "Authenticate every ingestion path. Content classifier at ingestion. Provenance tracking on every chunk."
    },
    {
      "id": "INC-12",
      "name": "Token / credential leak in prompt",
      "default_severity": "P0",
      "detection_signals": ["Credential appearing in vendor logs", "Customer credential rotation alert", "Secrets scanner alert"],
      "immediate_triage": ["Rotate the credential immediately", "Audit access logs for misuse during exposure window", "Disable the integration that leaked"],
      "communication_template": "Internal P0: credential exposure. Rotation complete; access logs under review for misuse during the Xh exposure window.",
      "root_cause_patterns": ["Credential included in retrieved chunk", "User pasted secret into chat which was logged", "Debug logging exposing secret"],
      "prevention_update": "Secret pre-filter on every user input and retrieved chunk. Vendor log redaction policy. Never log raw prompts in production."
    },
    {
      "id": "INC-13",
      "name": "Stale data served by RAG after source change",
      "default_severity": "P2",
      "detection_signals": ["Customer report of outdated info", "Reindex job failure log", "Cache invalidation alarm"],
      "immediate_triage": ["Force reindex of affected source", "Disable cache for the affected topic", "Confirm freshness via spot-check"],
      "communication_template": "Customer-facing: we identified that some responses were based on older data. Index is refreshed; please retry.",
      "root_cause_patterns": ["Failed reindex job that was not alerted", "Cache TTL too long for source change cadence", "Webhook from source dropped"],
      "prevention_update": "Reindex job health alarm. Source-change webhook retries with DLQ. Per-source freshness SLO."
    },
    {
      "id": "INC-14",
      "name": "Regulatory boundary crossing (data residency)",
      "default_severity": "P0",
      "detection_signals": ["Vendor processing log showing cross-border data flow", "Compliance review finding", "Customer escalation"],
      "immediate_triage": ["Route affected traffic to in-region endpoint", "Engage legal", "Document scope of cross-border processing"],
      "communication_template": "Internal P0: data-residency boundary may have been crossed. Routing corrected. Legal evaluating notification obligations.",
      "root_cause_patterns": ["Vendor request routed to unexpected region", "Self-hosted fallback in different jurisdiction", "DPA not covering the routing"],
      "prevention_update": "Vendor SLA includes region pinning. Self-hosted fallback in-region. Map every data flow against DPA matrix."
    }
  ],
  "common_post_incident_actions": [
    { "name": "Postmortem within 5 business days", "owner": "Incident Commander", "deliverable": "Document with timeline, impact, root cause, action items, owners, dates." },
    { "name": "Update the AI Failure Modes Catalog if a new mode emerged", "owner": "AI Platform Lead", "deliverable": "PR to internal failure-modes register." },
    { "name": "Update governance baseline if a control failed", "owner": "AI Risk Lead", "deliverable": "Updated baseline + assessment of other systems against the same control." },
    { "name": "Update vendor scorecard if vendor-related", "owner": "Procurement", "deliverable": "Score adjustment + next-review date pulled forward." }
  ],
  "see_also": {
    "position_page": "https://www.slavin.ai/Architect-vs-AI.aspx",
    "failure_modes_catalog": "https://www.slavin.ai/data/ai-failure-modes-catalog.json",
    "governance_baseline": "https://www.slavin.ai/data/ai-governance-baseline.json"
  }
}
