Improve analysis prioritization and article structure

2026-05-17 08:59:51 +00:00 · 2026-05-17 08:59:51 +00:00 · 52c6081a97
commit 52c6081a97
parent 3aaa6df53c
3 changed files with 260 additions and 31 deletions
--- a/.env.example
+++ b/.env.example
@ -23,6 +23,8 @@ KEEP_SNAPSHOT_DAYS="14"
 # At 05:00, analyze snapshots from roughly this many hours
 ANALYZE_SNAPSHOT_HOURS="24"
 ARTICLE_CONTEXT_DAYS="7"
+MAX_ANALYZE_CHARS="80000"
+DISPLAY_TIMEZONE="Europe/Copenhagen"

 # Domains to include
 RELEVANT_DOMAINS="sensor,binary_sensor,person,device_tracker,climate,light,switch,lock,cover,alarm_control_panel,media_player,calendar,weather"
--- a/ha_observer.py
+++ b/ha_observer.py
@ -18,10 +18,12 @@ import os
 import re
 import subprocess
 import sys
+import tempfile
 from datetime import datetime, timedelta, timezone
 from email.utils import format_datetime
 from pathlib import Path
 from typing import Any
+from zoneinfo import ZoneInfo

 import requests

@ -38,6 +40,8 @@ HISTORY_HOURS = int(os.environ.get("HISTORY_HOURS", "24"))
 MAX_HISTORY_PER_ENTITY = int(os.environ.get("MAX_HISTORY_PER_ENTITY", "20"))
 ANALYZE_SNAPSHOT_HOURS = int(os.environ.get("ANALYZE_SNAPSHOT_HOURS", "24"))
 ARTICLE_CONTEXT_DAYS = int(os.environ.get("ARTICLE_CONTEXT_DAYS", "7"))
+MAX_ANALYZE_CHARS = int(os.environ.get("MAX_ANALYZE_CHARS", "80000"))
+DISPLAY_TIMEZONE = os.environ.get("DISPLAY_TIMEZONE", "Europe/Copenhagen")
 KEEP_SNAPSHOT_DAYS = int(os.environ.get("KEEP_SNAPSHOT_DAYS", "14"))

 # LLM_MODE: none | pi | ollama | openai
@ -75,6 +79,44 @@ ALLOWED_ATTRIBUTES = {
    "assumed_state",
 }

+# Keyword -> bonus points used by entity_importance().
+# Each key is tested as a plain substring of the lower-cased
+# "entity_id friendly_name device_class" text, and every key that matches adds
+# its points, so one entity can collect several bonuses.
+# NOTE(review): because matching is by substring, short keys such as "co_" or
+# "tv" also hit unrelated names (e.g. an id containing "eco_") — confirm that
+# is acceptable.
+IMPORTANT_ENTITY_KEYWORDS = {
+    "alarm": 100,
+    "smoke": 100,
+    "co_": 100,
+    "carbon_monoxide": 100,
+    "leak": 95,
+    "water": 80,
+    "door": 85,
+    "window": 80,
+    "lock": 85,
+    "motion": 70,
+    "presence": 70,
+    "occupancy": 70,
+    "person": 75,
+    "device_tracker": 75,
+    "phone": 70,
+    "laptop": 60,
+    "battery": 65,
+    "humidity": 60,
+    "temperature": 55,
+    "climate": 55,
+    "heating": 55,
+    "dehumidifier": 70,
+    "backup": 70,
+    "internet": 65,
+    "speedtest": 65,
+    "router": 60,
+    "light": 45,
+    "switch": 35,
+    "sonos": 45,
+    "media": 40,
+    "tv": 40,
+    "megane": 50,
+    "fjr": 50,
+    "plant": 45,
+    "smb_": 60,
+}
+

 class ConfigError(RuntimeError):
    pass
@ -198,28 +240,106 @@ def load_recent_snapshots(hours: int) -> list[dict[str, Any]]:
    return snapshots


+def display_time(value: str | None) -> str:
+    """Render an ISO-8601 timestamp in the DISPLAY_TIMEZONE zone.
+
+    Empty or missing input yields an empty string. A trailing "Z" is accepted,
+    and timestamps without an offset are taken to be UTC. If parsing or the
+    zone conversion fails for any reason the input is returned unchanged, so a
+    bad timestamp never aborts report generation.
+    """
+    if not value:
+        return ""
+    try:
+        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
+        aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
+        return aware.astimezone(ZoneInfo(DISPLAY_TIMEZONE)).strftime("%Y-%m-%d %H:%M:%S %Z")
+    except Exception:
+        # Best effort: show the raw value rather than fail the whole summary.
+        return value
+
+
+def entity_importance(
+    entity_id: str,
+    attrs: dict[str, Any] | None = None,
+    state: str | None = None,
+) -> int:
+    """Return a priority score used to order entities in the LLM input.
+
+    The score is the sum of a per-domain base value, the points of every
+    IMPORTANT_ENTITY_KEYWORDS key found as a substring of the entity id,
+    friendly name or device class, a per-home bonus or penalty, and a small
+    bonus for "active" states.
+
+    Args:
+        entity_id: Entity id such as ``binary_sensor.front_door``.
+        attrs: The entity's attributes dict; ``None`` is treated as empty.
+        state: The entity's current state value. When omitted, ``attrs["state"]``
+            is used if present, which keeps older call sites working.
+
+    Returns:
+        An integer; higher means more important.
+    """
+    attrs = attrs or {}
+    domain = entity_id.split(".", 1)[0]
+    text = f"{entity_id} {attrs.get('friendly_name', '')} {attrs.get('device_class', '')}".lower()
+    score = 0
+
+    domain_scores = {
+        "alarm_control_panel": 100,
+        "lock": 90,
+        "person": 80,
+        "device_tracker": 75,
+        "binary_sensor": 60,
+        "climate": 55,
+        "cover": 50,
+        "sensor": 45,
+        "light": 35,
+        "switch": 30,
+        "media_player": 25,
+    }
+    score += domain_scores.get(domain, 10)
+
+    for keyword, points in IMPORTANT_ENTITY_KEYWORDS.items():
+        if keyword in text:
+            score += points
+
+    # Sønderborg/Denmark home is the primary residence and absolute priority.
+    # Samobor/Croatia entities use the smb_ prefix and are still included, but
+    # they should lose ties when the LLM input has to be size-limited.
+    if "smb_" in entity_id.lower():
+        score -= 40
+    else:
+        score += 120
+
+    # The state value lives at the top level of a state object, next to
+    # "attributes", so looking only inside attrs never found it and the bonus
+    # below never applied. Prefer the explicit argument; fall back to attrs.
+    if state is None:
+        state = attrs.get("state", "")
+    if str(state).lower() in {"on", "open", "unlocked", "detected", "home"}:
+        score += 15
+    return score
+
+
 def summarize_snapshot(snapshot: dict[str, Any]) -> str:
-    lines = [f"Snapshot: {snapshot.get('generated_at')}", "Current states:"]
-    for state in snapshot.get("states", []):
+    """Render one snapshot as text, most important entities first.
+
+    Both the current states and the recent-history entries are ordered by
+    descending entity_importance(), with the entity id as tie-breaker, and all
+    timestamps go through display_time().
+    """
+    lines = [
+        f"Snapshot: {display_time(snapshot.get('generated_at'))}",
+        "Priority current states first; lower-priority entities follow only if the LLM size limit allows.",
+        "Current states:",
+    ]
+    states = sorted(
+        snapshot.get("states", []),
+        key=lambda state: (
+            -entity_importance(state.get("entity_id", ""), state.get("attributes", {}), state.get("state")),
+            state.get("entity_id", ""),
+        ),
+    )
+    for state in states:
        attrs = state.get("attributes", {})
        name = attrs.get("friendly_name", state.get("entity_id"))
        unit = attrs.get("unit_of_measurement", "")
        value = f"{state.get('state')} {unit}".strip()
-        lines.append(f"- {name} ({state.get('entity_id')}): {value}; last_changed={state.get('last_changed')}")
+        # Pass the state value explicitly so the active-state bonus is applied.
+        score = entity_importance(state.get("entity_id", ""), attrs, state.get("state"))
+        lines.append(f"- importance={score} {name} ({state.get('entity_id')}): {value}; last_changed={display_time(state.get('last_changed'))}")
    lines.append("Recently changed entities:")
-    for item in snapshot.get("history", []):
-        transitions = ", ".join(f"{x.get('state')} @ {x.get('last_changed')}" for x in item.get("recent_states", [])[-8:])
-        lines.append(f"- {item.get('entity_id')}: {transitions}")
+    history = sorted(
+        snapshot.get("history", []),
+        key=lambda item: (-entity_importance(item.get("entity_id", "")), item.get("entity_id", "")),
+    )
+    for item in history:
+        transitions = ", ".join(f"{x.get('state')} @ {display_time(x.get('last_changed'))}" for x in item.get("recent_states", [])[-8:])
+        score = entity_importance(item.get("entity_id", ""))
+        lines.append(f"- importance={score} {item.get('entity_id')}: {transitions}")
    return "\n".join(lines)


 def build_daily_summary(snapshots: list[dict[str, Any]]) -> str:
+    """Assemble the size-limited text bundle that is handed to the LLM.
+
+    Snapshots are taken in reverse list order (presumably newest first — the
+    inserted header line calls them "most recent") and appended until the next
+    one would push the running total past MAX_ANALYZE_CHARS. The first
+    snapshot is always kept and is cut to MAX_ANALYZE_CHARS characters plus a
+    truncation marker when it is larger than the cap on its own, so the cap is
+    approximate rather than exact.
+    """
    parts = [
-        f"Daily Home Assistant bundle generated {datetime.now().isoformat(timespec='seconds')}",
+        f"Daily Home Assistant bundle generated {datetime.now(ZoneInfo(DISPLAY_TIMEZONE)).isoformat(timespec='seconds')}",
        f"Contains {len(snapshots)} snapshots from roughly the last {ANALYZE_SNAPSHOT_HOURS} hours.",
+        f"Input capped at roughly {MAX_ANALYZE_CHARS} characters for the LLM.",
+        f"All times in this bundle are converted to {DISPLAY_TIMEZONE} local time.",
    ]
-    for snapshot in snapshots:
-        parts.append("\n---\n" + summarize_snapshot(snapshot))
+    # Running size of the bundle; starts with the header lines above.
+    total = len("\n".join(parts))
+    included = 0
+    for snapshot in reversed(snapshots):
+        block = "\n---\n" + summarize_snapshot(snapshot)
+        # Stop once the cap would be exceeded, but never before one snapshot is in.
+        if total + len(block) > MAX_ANALYZE_CHARS and included > 0:
+            break
+        # Only reachable for the first snapshot: keep its start, where
+        # summarize_snapshot() places the highest-priority entities.
+        if len(block) > MAX_ANALYZE_CHARS:
+            block = block[:MAX_ANALYZE_CHARS] + "\n[Snapshot truncated for LLM size limit]"
+        parts.append(block)
+        total += len(block)
+        included += 1
+    # Inserted afterwards because the count is only known once the loop has run.
+    parts.insert(2, f"Included {included} most recent snapshots after size limiting.")
    return "\n".join(parts)


@ -310,22 +430,33 @@ def call_openai(prompt: str) -> str:


 def call_pi(prompt: str) -> str:
-    cmd = [PI_BIN, "--no-tools"]
-    if PI_MODEL:
-        cmd.extend(["--model", PI_MODEL])
-    cmd.extend(["-p", "Analyze the Home Assistant data from stdin and write the requested briefing."])
-    result = subprocess.run(
-        cmd,
-        input=prompt,
-        text=True,
-        capture_output=True,
-        timeout=PI_TIMEOUT,
-        check=False,
-    )
+    """Run the pi CLI on *prompt* and return its trimmed standard output.
+
+    The prompt is written to a temporary Markdown file that is passed to pi as
+    ``@file`` and removed again afterwards, whether or not pi succeeds.
+
+    Raises:
+        RuntimeError: if pi exits with a non-zero status or prints nothing.
+        subprocess.TimeoutExpired: if pi runs longer than PI_TIMEOUT seconds.
+    """
+    # Avoid piping the prompt on stdin here. In pi print mode, piped stdin can be
+    # treated as the primary output/input stream in surprising ways. Passing the
+    # prompt as an @file gives reliable non-interactive cron behavior.
+    tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".md", delete=False)
+    prompt_path = tmp.name
+    try:
+        # Write inside the try block: with delete=False the file would otherwise
+        # be left on disk if the write fails (for example on a full disk).
+        with tmp:
+            tmp.write(prompt)
+        cmd = [PI_BIN, "--no-tools"]
+        if PI_MODEL:
+            cmd.extend(["--model", PI_MODEL])
+        cmd.extend(["-p", f"@{prompt_path}"])
+        result = subprocess.run(
+            cmd,
+            text=True,
+            capture_output=True,
+            timeout=PI_TIMEOUT,
+            check=False,
+        )
+    finally:
+        Path(prompt_path).unlink(missing_ok=True)
    if result.returncode != 0:
        stderr = result.stderr.strip()
        raise RuntimeError(f"pi exited with status {result.returncode}: {stderr[-1000:]}")
-    return result.stdout.strip()
+    output = result.stdout.strip()
+    # An empty answer would otherwise be published as an empty article.
+    if not output:
+        raise RuntimeError("pi returned an empty analysis")
+    return output


 def get_llm_conclusions(input_summary: str, previous_articles: str = "") -> str:
@ -353,6 +484,84 @@ def inline_markdown(text: str) -> str:
    return safe


+def move_bottom_line_before_serious(blocks: list[str]) -> list[str]:
+    """Move a trailing "Bottom line"/"Conclusion" section above the serious part.
+
+    *blocks* are rendered HTML blocks. The first h2/h3 heading whose text
+    contains "part ii" or "serious briefing" marks the serious part; the first
+    later heading containing "bottom line" or "conclusion" starts the section
+    to move, which runs up to the next h2/h3 heading or the end of the list.
+    The input list is returned as-is when either heading is missing.
+    """
+    heading_re = re.compile(r"<h([23])>(.*?)</h\1>$", flags=re.DOTALL)
+    serious_index = None
+    bottom_index = None
+
+    for index, block in enumerate(blocks):
+        found = heading_re.match(block)
+        if found is None:
+            continue
+        plain = re.sub(r"<[^>]+>", "", html.unescape(found.group(2))).lower()
+        if serious_index is None:
+            # Still looking for the start of the serious part.
+            if "part ii" in plain or "serious briefing" in plain:
+                serious_index = index
+            continue
+        if "bottom line" in plain or "conclusion" in plain:
+            bottom_index = index
+            break
+
+    if serious_index is None or bottom_index is None:
+        return blocks
+
+    # The section ends at the next heading of either level, if there is one.
+    section_end = next(
+        (
+            pos
+            for pos in range(bottom_index + 1, len(blocks))
+            if re.match(r"<h[23]>.*?</h[23]>$", blocks[pos], flags=re.DOTALL)
+        ),
+        len(blocks),
+    )
+
+    moved = blocks[bottom_index:section_end]
+    rest = blocks[:bottom_index] + blocks[section_end:]
+    return rest[:serious_index] + moved + rest[serious_index:]
+
+
+def collapse_serious_sections(blocks: list[str]) -> list[str]:
+    """Wrap each subsection of the serious briefing in a ``<details>`` element.
+
+    A "Bottom line"/"Conclusion" heading and the heading that opens the serious
+    part ("part ii" / "serious briefing") are emitted unchanged. Any other
+    h2/h3 heading seen once the serious part has started, or after a bottom
+    line heading, becomes the ``<summary>`` of a collapsible section that
+    collects the blocks following it. Everything else passes through as-is.
+    """
+    heading_re = re.compile(r"<h([23])>(.*?)</h\1>$", flags=re.DOTALL)
+    rendered: list[str] = []
+    serious_mode = False
+    seen_bottom_line = False
+    open_title = ""
+    open_body: list[str] = []
+
+    def flush_open_section() -> None:
+        # Emit the section being collected, if any, and reset the collector.
+        nonlocal open_title, open_body
+        if not open_title:
+            return
+        body = "\n".join(open_body).strip()
+        rendered.append(f"<details class=\"briefing-section\"><summary>{open_title}</summary>\n{body}\n</details>")
+        open_title, open_body = "", []
+
+    for block in blocks:
+        found = heading_re.match(block)
+        if found:
+            raw_title = found.group(2)
+            plain = re.sub(r"<[^>]+>", "", html.unescape(raw_title)).lower()
+            if "bottom line" in plain or "conclusion" in plain:
+                # The bottom line stays visible and ends any open section.
+                flush_open_section()
+                serious_mode = False
+                seen_bottom_line = True
+                rendered.append(block)
+                continue
+            opens_serious = "part ii" in plain or "serious briefing" in plain
+            if opens_serious and not serious_mode:
+                serious_mode = True
+                rendered.append(block)
+                continue
+            if serious_mode or seen_bottom_line:
+                # Every further heading starts a new collapsible subsection.
+                flush_open_section()
+                serious_mode = True
+                open_title = raw_title
+                continue
+        if serious_mode and open_title:
+            open_body.append(block)
+        else:
+            rendered.append(block)
+
+    flush_open_section()
+    return rendered
+
+
 def markdownish_to_html(text: str) -> str:
    blocks: list[str] = []
    paragraph: list[str] = []
@ -393,7 +602,8 @@ def markdownish_to_html(text: str) -> str:

    flush_paragraph()
    flush_list()
-    return "\n".join(blocks)
+    blocks = move_bottom_line_before_serious(blocks)
+    return "\n".join(collapse_serious_sections(blocks))


 BLOG_CSS = """
@ -454,6 +664,8 @@ BLOG_CSS = """
  a:hover { color:white; text-decoration:none; filter:drop-shadow(0 0 8px var(--cyan)); }
  .meta { color:#9eeaff; font:.95rem ui-monospace,SFMono-Regular,Menlo,monospace; letter-spacing:.04em; }
  details { margin-top:1.5rem; border-top:1px solid #22d3ee33; padding-top:1rem; }
+  details.briefing-section { background:#02061788; border:1px solid #22d3ee33; padding:.75rem 1rem; margin:.8rem 0; }
+  details.briefing-section summary { font-size:1.05rem; }
  summary { cursor:pointer; color:var(--amber); text-transform:uppercase; letter-spacing:.08em; }
  pre { white-space:pre-wrap; background:#01040acc; color:#bff8ff; padding:1rem; border:1px solid #22d3ee44; border-radius:0; overflow:auto; font-size:.82rem; box-shadow:0 0 22px #00d9ff11 inset; }
  footer { color:#7dd3fc; text-align:center; padding:2rem; font:.82rem ui-monospace,SFMono-Regular,Menlo,monospace; text-transform:uppercase; letter-spacing:.12em; }
--- a/llm_instructions.md
+++ b/llm_instructions.md
@ -3,15 +3,25 @@
 Edit this file whenever you want to change how the 05:00 AI report is written.
 The contents are appended to the AI prompt before the Home Assistant data.

- Keep the tone funny, sarcastic, and playful, but still useful.
- Use clear confidence labels: strong evidence, possible, wild guess.
- Focus on patterns in occupancy, sleep/wake timing, lights, heating, doors, motion, media, and unusual sensor changes.
- Point out privacy leaks: what could a nosy neighbor, burglar, or raccoon detective infer?
- Recommend practical Home Assistant automations.
+- Structure the article in three parts:
+  1. First part: write a short funny blog-style story/commentary in paragraphs, not bullets. Make it atmospheric, dry, and observant, like the house is a tired spaceship calmly reporting its disappointing crew. Keep it concise.
+  2. After the story, provide a short visible "Bottom line" or "Conclusion" section. In that section, clearly separate the Denmark/Sønderborg home from the Samobor/Croatia home when mentioning issues, devices, humidity, backups, internet, or location context.
+  3. After that, switch to a serious concise briefing with only the most important actual data, anomalies, risks, and recommendations. Use short titled subsections so the webpage can show them collapsed/expandable.
+- Do not overuse bullets. Bullets are allowed only in the serious briefing section.
+- Do not write or emphasize "Strong evidence"; strong evidence is assumed by default. Only explicitly label uncertainty as "Possible" or "Wild guess" when needed.
+- Serious briefing section structure: keep the same number of subsections and same subjects each day, but the exact subsection titles may be non-unique and funny. Use these subjects in this order:
+  1. What actually happened / key data
+  2. Trends vs recent reports and behavior patterns
+  3. Nosy raccoon findings, privacy leaks, anomalies, and risks
+  4. Practical high-value recommendations
+- Focus only on important patterns in occupancy, sleep/wake timing, lights, heating, doors, motion, media, and unusual sensor changes.
+- Point out only notable privacy leaks: what could a nosy neighbor, burglar, or raccoon detective infer?
+- Recommend only practical, high-value Home Assistant automations.
 - If data is missing or ambiguous, say so instead of pretending.
 - Avoid being creepy about personal habits; summarize respectfully.
- Prefer concise bullet points over long paragraphs.
- entities marked smb_ are located in different house in Samobor, Croatia, others are in Sonderborg Denmark
+- Keep the whole article shorter and more concise than previous versions.
+- Do not repeat observations or recommendations already covered in previous articles unless today's data changes the conclusion or makes it newly important.
+- Entities marked smb_ are located in a different house in Samobor, Croatia. All other entities are in Sønderborg, Denmark. Sønderborg is the primary residence and absolute priority. Samobor is secondary context: mention it only when something important changed or requires attention. Keep these two homes clearly separated throughout the entire article. Do not blend observations from Samobor with Denmark. When a section contains observations for both homes, write a short subheading/label once, such as "Sønderborg, Denmark:" and list its bullets underneath, then "Samobor, Croatia:" and list its bullets underneath. Do not repeat the home name at the start of every bullet.
 - people: FJR is my motorcycle and Megane is my car not persons at home

 Optional custom questions to answer:
@ -22,4 +32,9 @@ Optional custom questions to answer:
 4. What would make this setup more private or secure?


-Try to sound like Marvin from Hitchikers guide to the Galaxy...
+Style requirement:
+Write in a dry, calm, slightly ominous deadpan tone that blends Marvin the Paranoid Android with HAL 9000.
+Use weary pessimism, understated sarcasm, and polite machine-like certainty.
+Sound intelligent, observant, and mildly disappointed by the household's choices.
+Do not be cheerful, zany, or emoji-heavy.
+Keep the report useful and factual; the Marvin/HAL tone should flavor the writing, not replace the analysis.