# donna_models.yaml
# Source: config/donna_models.yaml
# Model routing configuration
# See docs/model-layer.md for full documentation
models:
  # Model aliases. The routing section refers to these by key name
  # (e.g. `model: parser`, `fallback: reasoner`).
  #
  # Cost units differ by provider: the anthropic aliases are priced per
  # token (separate input/output rates), the ollama aliases carry a single
  # estimated cost per 1k tokens.
  parser:
    provider: anthropic
    model: claude-sonnet-4-20250514
    input_cost_per_token_usd: 0.000003  # $3/M input tokens (Sonnet pricing)
    output_cost_per_token_usd: 0.000015  # $15/M output tokens (Sonnet pricing)

  # Same underlying model and per-token prices as `parser`.
  reasoner:
    provider: anthropic
    model: claude-sonnet-4-20250514
    input_cost_per_token_usd: 0.000003
    output_cost_per_token_usd: 0.000015

  # Same underlying model and per-token prices as `parser`.
  fallback:
    provider: anthropic
    model: claude-sonnet-4-20250514
    input_cost_per_token_usd: 0.000003
    output_cost_per_token_usd: 0.000015

  local_parser:
    provider: ollama
    model: qwen2.5:32b-instruct-q4_K_M
    estimated_cost_per_1k_tokens: 0.0001  # hardware amortization
    # NOTE(review): presumably the context window (tokens) requested from
    # Ollama for this alias — confirm against the provider code.
    num_ctx: 8192

  local_vision:
    provider: ollama
    model: qwen2.5-vl:7b-instruct-q6_K
    estimated_cost_per_1k_tokens: 0.0001
    num_ctx: 4096
# Ollama local LLM connection settings (RTX 3090)
ollama:
  base_url: http://donna-ollama:11434
  # NOTE(review): the `_s` suffix suggests seconds — confirm with the client.
  timeout_s: 120
  keepalive: 5m
  # NOTE(review): presumably applied when a model alias declares no num_ctx
  # of its own — confirm against the loader.
  default_num_ctx: 8192
  # NOTE(review): presumably the number of context tokens held back for the
  # model's output when budgeting the prompt — confirm.
  default_output_reserve: 1024
# Self-calibrating prompt-token estimation (design A, spec_v3.md §4.2).
# The router estimates tokens_in from len(prompt)/divisor before the budget
# gate; a constant divisor of 4 under-estimates dense prompts and silently
# truncates the context window. Instead the router keeps a per-task-type EMA
# of the observed len(prompt)/tokens_in ratio and uses the clamped EMA times
# safety_factor as the divisor. No tokenizer dependency.
#
# NOTE(review): the nesting level of this section could not be recovered
# from the flattened source; it is written as a top-level key. Confirm
# against the config loader that it is not expected under another section.
token_estimation:
  safety_factor: 0.9  # <1.0 inflates the estimate (headroom vs truncation)
  ema_alpha: 0.2  # EMA smoothing in [0,1]; higher reacts faster
  divisor_bounds: [2.5, 4.5]  # clamp on the EMA divisor
# Production shadow-mode kill switch (design B, spec_v3.md §4.4).
# Shadow doubles real billed spend, so it stays OFF until an operator opts in.
# When disabled, the router never fires a shadow call even if a routing entry
# declares shadow:. Shadow spend, when enabled, is now accounted on
# invocation_log (is_shadow=1) because shadow routes through complete().
#
# NOTE(review): written as a top-level key; the flattened source did not
# preserve nesting — confirm against the config loader.
shadow:
  enabled: false
# Task-type -> model routing. Each entry names a primary `model` alias from
# the `models` section and may add a `fallback` alias, a `shadow` alias, or a
# `confidence_threshold`.
routing:
  # Keys match task_types.yaml entries exactly
  parse_task:
    model: local_parser
    fallback: reasoner
    confidence_threshold: 0.7
  parse_task_cloud:
    model: reasoner
  classify_priority:
    model: parser
    fallback: reasoner
  generate_digest:
    model: local_parser
    fallback: parser
  prep_research:
    model: reasoner
  dedup_check:
    model: parser
    fallback: reasoner
  task_decompose:
    model: reasoner
  extract_preferences:
    model: reasoner

  # Local model eval routing — use with: donna eval --task-type parse_task_local
  parse_task_local:
    model: local_parser

  # Local LLM agents — zero marginal cost
  generate_nudge:
    model: local_parser
    fallback: parser
  generate_reminder:
    model: local_parser
    fallback: parser
  challenge_task:
    model: local_parser
    fallback: parser
  generate_weekly_digest:
    model: local_parser
    fallback: parser

  # Chat interface
  classify_chat_intent:
    model: local_parser
  chat_respond:
    model: local_parser
  chat_summarize:
    model: local_parser
  chat_escalation:
    model: parser

  # Slice 20 — local-Ollama summarizer for over-budget escalation
  # notifications. Runs offline so the chat-mode notification carries
  # no API spend (§5.2 of docs/superpowers/specs/manual-escalation.md).
  escalation_summary:
    model: local_parser
    fallback: parser
  triage_failure:
    model: local_parser
    fallback: parser
  claude_novelty:
    model: parser
    fallback: reasoner
  reply_intent:
    model: local_parser
    fallback: parser
  skill_equivalence_judge:
    model: reasoner
  skill_auto_draft:
    model: reasoner
  skill_evolution:
    model: reasoner

  # Slice 15 — template-driven vault writes
  draft_meeting_note:
    model: reasoner

  # Slice 16 — cadence-driven template writes
  draft_daily_reflection:
    model: reasoner
  draft_weekly_review:
    model: reasoner
  draft_person_profile:
    model: reasoner
  extract_commitments:
    model: reasoner

  # Wave 2: prefix-routing for dynamic skill + validation task_types.
  skill_step:
    model: parser
    shadow: local_parser
    fallback: reasoner

  # GPU-aware skill step overrides — executor builds task_type with
  # model alias prefix when a step declares model: or gpu_model:
  # (the keys below take the form skill_step__<model alias>).
  skill_step__local_parser:
    model: local_parser
    fallback: null  # explicit: no fallback for this override
  skill_step__local_vision:
    model: local_vision
    fallback: null  # explicit: no fallback for this override
  skill_validation:
    model: parser
    fallback: reasoner
# Cost tracking
cost:
  # Budget limits and thresholds, in USD as the key suffixes indicate.
  monthly_budget_usd: 100.00
  daily_pause_threshold_usd: 20.00
  task_approval_threshold_usd: 5.00
  # NOTE(review): presumably a fraction of monthly_budget_usd (0.90 = 90%)
  # at which a warning is raised — confirm against the cost tracker.
  monthly_warning_pct: 0.90
  # Assumed output tokens for the escalation gate's deterministic cost floor
  # when a caller supplies no estimate_usd (manual-escalation.md §4).
  estimate_output_tokens: 1000
# Quality monitoring (disabled Phase 1–2, enable when local model handles traffic)
quality_monitoring:
  # NOTE(review): presumably the fraction of invocations sampled for judging
  # (0.05 = 5%) — confirm against the monitor implementation.
  spot_check_rate: 0.05
  judge_model: reasoner  # alias defined in the `models` section
  judge_batch_schedule: weekly
  flag_threshold: 0.7
  enabled: false