# donna_models.yaml
# Source: config/donna_models.yaml

# Model routing configuration
# See docs/model-layer.md for full documentation

# Model alias registry. Each key under `models:` is an alias that the
# `routing:` entries (model / fallback / shadow fields) and
# `quality_monitoring.judge_model` in this file refer to by name.
models:
  # --- Cloud aliases (provider: anthropic) ---
  # parser, reasoner and fallback currently pin the same model ID and the same
  # prices. Cost fields are USD per single token (0.000003 == $3 per million).
  parser:
    provider: anthropic
    model: claude-sonnet-4-20250514
    input_cost_per_token_usd: 0.000003    # $3/M input tokens (Sonnet pricing)
    output_cost_per_token_usd: 0.000015   # $15/M output tokens (Sonnet pricing)
  reasoner:
    provider: anthropic
    model: claude-sonnet-4-20250514
    input_cost_per_token_usd: 0.000003
    output_cost_per_token_usd: 0.000015
  # NOTE(review): no routing entry in this file names the `fallback` alias
  # (routing `fallback:` fields point at `reasoner`, `parser`, or null) —
  # confirm whether it is consumed elsewhere.
  fallback:
    provider: anthropic
    model: claude-sonnet-4-20250514
    input_cost_per_token_usd: 0.000003
    output_cost_per_token_usd: 0.000015
  # --- Local aliases (provider: ollama) ---
  # Cost here is an estimate per 1k tokens, a different unit from the per-token
  # prices above. NOTE(review): the key carries no currency suffix; presumably
  # USD — confirm against the cost tracker.
  local_parser:
    provider: ollama
    model: qwen2.5:32b-instruct-q4_K_M
    estimated_cost_per_1k_tokens: 0.0001  # hardware amortization
    # presumably the Ollama context-window size in tokens for this alias
    # (cf. ollama.default_num_ctx) — TODO confirm
    num_ctx: 8192
  local_vision:
    provider: ollama
    model: qwen2.5-vl:7b-instruct-q6_K
    estimated_cost_per_1k_tokens: 0.0001
    # Half the local_parser value above.
    num_ctx: 4096

# Ollama local LLM connection settings (RTX 3090)
ollama:
  # Endpoint of the Ollama server (host `donna-ollama`, port 11434).
  base_url: http://donna-ollama:11434
  # Request timeout; seconds, going by the `_s` suffix.
  timeout_s: 120
  # presumably forwarded as Ollama's keep-alive duration (how long a model
  # stays loaded after a request) — TODO confirm
  keepalive: 5m
  # presumably the context size used when a model alias sets no `num_ctx` of
  # its own — TODO confirm
  default_num_ctx: 8192
  # presumably the number of tokens held back from the context window for the
  # model's output — TODO confirm
  default_output_reserve: 1024
  # Self-calibrating prompt-token estimation (design A, spec_v3.md §4.2).
  # The router estimates tokens_in from len(prompt)/divisor before the budget
  # gate; a constant divisor of 4 under-estimates dense prompts and silently
  # truncates the context window. Instead the router keeps a per-task-type EMA
  # of the observed len(prompt)/tokens_in ratio and uses the clamped EMA times
  # safety_factor as the divisor. No tokenizer dependency.
  # With the values below, and if the clamp is applied before the safety factor
  # as worded above, the effective divisor ranges from 2.5 * 0.9 = 2.25 to
  # 4.5 * 0.9 = 4.05.
  token_estimation:
    safety_factor: 0.9          # <1.0 inflates the estimate (headroom vs truncation)
    ema_alpha: 0.2              # EMA smoothing in [0,1]; higher reacts faster
    divisor_bounds: [2.5, 4.5]  # clamp on the EMA divisor

# Production shadow-mode kill switch (design B, spec_v3.md §4.4).
# Shadow doubles real billed spend, so it stays OFF until an operator opts in.
# When disabled, the router never fires a shadow call even if a routing entry
# declares shadow:. When enabled, shadow spend is accounted on invocation_log
# (is_shadow=1) because shadow calls route through complete().
# In this file, routing.skill_step is the only entry that declares shadow:.
shadow:
  enabled: false

# Task-type routing table. Each entry names a primary `model` alias and may
# add a `fallback` alias, a `shadow` alias, and a `confidence_threshold`.
# All alias values refer to keys under `models:` in this file.
# NOTE(review): what triggers a fallback (error, timeout, low confidence) is
# decided by the router and is not visible here.
# NOTE(review): `parser` and `reasoner` currently pin the same Anthropic model
# ID in `models:`, so every parser -> reasoner fallback below retries on an
# identical model — confirm that is intended.
routing:
  # Keys match task_types.yaml entries exactly
  parse_task:
    model: local_parser
    fallback: reasoner
    # Only entry in this table with a threshold; presumably results scoring
    # below it are sent to the fallback — TODO confirm.
    confidence_threshold: 0.7
  parse_task_cloud:
    model: reasoner
  classify_priority:
    model: parser
    fallback: reasoner
  generate_digest:
    model: local_parser
    fallback: parser
  prep_research:
    model: reasoner
  dedup_check:
    model: parser
    fallback: reasoner
  task_decompose:
    model: reasoner
  extract_preferences:
    model: reasoner
  # Local model eval routing — use with: donna eval --task-type parse_task_local
  # No fallback declared, so this entry names the local alias only.
  parse_task_local:
    model: local_parser

  # Local LLM agents — zero marginal cost
  # NOTE(review): that holds for the local_parser primary only; each entry
  # falls back to `parser`, an Anthropic alias with per-token prices, so a
  # fallback call is billed.
  generate_nudge:
    model: local_parser
    fallback: parser
  generate_reminder:
    model: local_parser
    fallback: parser
  challenge_task:
    model: local_parser
    fallback: parser
  generate_weekly_digest:
    model: local_parser
    fallback: parser

  # Chat interface
  # The three local entries declare no fallback; chat_escalation goes straight
  # to the cloud `parser` alias.
  classify_chat_intent:
    model: local_parser
  chat_respond:
    model: local_parser
  chat_summarize:
    model: local_parser
  chat_escalation:
    model: parser

  # Slice 20 — local-Ollama summarizer for over-budget escalation
  # notifications. Runs offline so the chat-mode notification carries
  # no API spend (§5.2 of docs/superpowers/specs/manual-escalation.md).
  # NOTE(review): `fallback: parser` is an Anthropic alias, so a fallback call
  # would incur API spend — confirm this matches the "no API spend" intent
  # (the skill_step__local_* entries below use `fallback: null` instead).
  escalation_summary:
    model: local_parser
    fallback: parser
  # NOTE(review): the entries from here to skill_evolution carry no section
  # comment of their own; unclear whether they belong to Slice 20.
  triage_failure:
    model: local_parser
    fallback: parser
  claude_novelty:
    model: parser
    fallback: reasoner
  reply_intent:
    model: local_parser
    fallback: parser
  skill_equivalence_judge:
    model: reasoner
  skill_auto_draft:
    model: reasoner
  skill_evolution:
    model: reasoner

  # Slice 15 — template-driven vault writes
  draft_meeting_note:
    model: reasoner

  # Slice 16 — cadence-driven template writes
  draft_daily_reflection:
    model: reasoner
  draft_weekly_review:
    model: reasoner
  draft_person_profile:
    model: reasoner
  extract_commitments:
    model: reasoner

  # Wave 2: prefix-routing for dynamic skill + validation task_types.
  # presumably task types that start with one of these keys resolve to the
  # matching entry — TODO confirm the matching rule in the router.
  skill_step:
    model: parser
    # Only shadow declaration in this table; per the `shadow:` section of this
    # file it never fires while shadow.enabled is false.
    shadow: local_parser
    fallback: reasoner
  # GPU-aware skill step overrides — executor builds task_type with
  # model alias prefix when a step declares model: or gpu_model:
  # Key shape here is `skill_step__<alias>`. `fallback: null` is an explicit
  # "no fallback alias"; presumably so a step pinned to a local model never
  # spills over to a cloud alias — TODO confirm.
  skill_step__local_parser:
    model: local_parser
    fallback: null
  skill_step__local_vision:
    model: local_vision
    fallback: null
  skill_validation:
    model: parser
    fallback: reasoner

# Cost tracking
# Budget limits; keys ending in `_usd` are US-dollar amounts.
cost:
  monthly_budget_usd: 100.00
  # presumably the daily spend at which work is paused — TODO confirm what
  # exactly gets paused.
  daily_pause_threshold_usd: 20.00
  # presumably the per-task cost above which approval is required — TODO
  # confirm.
  task_approval_threshold_usd: 5.00
  # NOTE(review): despite the `_pct` suffix the value is written as a fraction
  # (0.90), presumably meaning 90% of monthly_budget_usd — confirm the consumer
  # does not expect 90.
  monthly_warning_pct: 0.90
  # Assumed output tokens for the escalation gate's deterministic cost floor
  # when a caller supplies no estimate_usd (manual-escalation.md §4).
  estimate_output_tokens: 1000

# Quality monitoring (disabled Phase 1–2, enable when local model handles traffic)
# Currently off via `enabled: false` at the bottom of this section.
quality_monitoring:
  # presumably the fraction of calls sampled for judging (0.05 = 5%) — TODO
  # confirm.
  spot_check_rate: 0.05
  # Refers to the `reasoner` alias defined under `models:`.
  judge_model: reasoner
  judge_batch_schedule: weekly
  # presumably judge scores below this value are flagged — TODO confirm the
  # comparison direction.
  flag_threshold: 0.7
  enabled: false