From 52c6081a97e38bac96f72b78a5973c0edc182d4d Mon Sep 17 00:00:00 2001 From: hbrain Date: Sun, 17 May 2026 08:59:51 +0000 Subject: [PATCH] Improve analysis prioritization and article structure --- .env.example | 2 + ha_observer.py | 258 ++++++++++++++++++++++++++++++++++++++++---- llm_instructions.md | 31 ++++-- 3 files changed, 260 insertions(+), 31 deletions(-) diff --git a/.env.example b/.env.example index 8048f3c..6b2ee16 100644 --- a/.env.example +++ b/.env.example @@ -23,6 +23,8 @@ KEEP_SNAPSHOT_DAYS="14" # At 05:00, analyze snapshots from roughly this many hours ANALYZE_SNAPSHOT_HOURS="24" ARTICLE_CONTEXT_DAYS="7" +MAX_ANALYZE_CHARS="80000" +DISPLAY_TIMEZONE="Europe/Copenhagen" # Domains to include RELEVANT_DOMAINS="sensor,binary_sensor,person,device_tracker,climate,light,switch,lock,cover,alarm_control_panel,media_player,calendar,weather" diff --git a/ha_observer.py b/ha_observer.py index f15dab8..ee1984b 100755 --- a/ha_observer.py +++ b/ha_observer.py @@ -18,10 +18,12 @@ import os import re import subprocess import sys +import tempfile from datetime import datetime, timedelta, timezone from email.utils import format_datetime from pathlib import Path from typing import Any +from zoneinfo import ZoneInfo import requests @@ -38,6 +40,8 @@ HISTORY_HOURS = int(os.environ.get("HISTORY_HOURS", "24")) MAX_HISTORY_PER_ENTITY = int(os.environ.get("MAX_HISTORY_PER_ENTITY", "20")) ANALYZE_SNAPSHOT_HOURS = int(os.environ.get("ANALYZE_SNAPSHOT_HOURS", "24")) ARTICLE_CONTEXT_DAYS = int(os.environ.get("ARTICLE_CONTEXT_DAYS", "7")) +MAX_ANALYZE_CHARS = int(os.environ.get("MAX_ANALYZE_CHARS", "80000")) +DISPLAY_TIMEZONE = os.environ.get("DISPLAY_TIMEZONE", "Europe/Copenhagen") KEEP_SNAPSHOT_DAYS = int(os.environ.get("KEEP_SNAPSHOT_DAYS", "14")) # LLM_MODE: none | pi | ollama | openai @@ -75,6 +79,44 @@ ALLOWED_ATTRIBUTES = { "assumed_state", } +IMPORTANT_ENTITY_KEYWORDS = { + "alarm": 100, + "smoke": 100, + "co_": 100, + "carbon_monoxide": 100, + "leak": 95, + "water": 80, 
+ "door": 85, + "window": 80, + "lock": 85, + "motion": 70, + "presence": 70, + "occupancy": 70, + "person": 75, + "device_tracker": 75, + "phone": 70, + "laptop": 60, + "battery": 65, + "humidity": 60, + "temperature": 55, + "climate": 55, + "heating": 55, + "dehumidifier": 70, + "backup": 70, + "internet": 65, + "speedtest": 65, + "router": 60, + "light": 45, + "switch": 35, + "sonos": 45, + "media": 40, + "tv": 40, + "megane": 50, + "fjr": 50, + "plant": 45, + "smb_": 60, +} + class ConfigError(RuntimeError): pass @@ -198,28 +240,106 @@ def load_recent_snapshots(hours: int) -> list[dict[str, Any]]: return snapshots +def display_time(value: str | None) -> str: + if not value: + return "" + try: + dt = datetime.fromisoformat(value.replace("Z", "+00:00")) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + local = dt.astimezone(ZoneInfo(DISPLAY_TIMEZONE)) + return local.strftime("%Y-%m-%d %H:%M:%S %Z") + except Exception: + return value + + +def entity_importance(entity_id: str, attrs: dict[str, Any] | None = None) -> int: + attrs = attrs or {} + domain = entity_id.split(".", 1)[0] + text = f"{entity_id} {attrs.get('friendly_name', '')} {attrs.get('device_class', '')}".lower() + score = 0 + + domain_scores = { + "alarm_control_panel": 100, + "lock": 90, + "person": 80, + "device_tracker": 75, + "binary_sensor": 60, + "climate": 55, + "cover": 50, + "sensor": 45, + "light": 35, + "switch": 30, + "media_player": 25, + } + score += domain_scores.get(domain, 10) + + for keyword, points in IMPORTANT_ENTITY_KEYWORDS.items(): + if keyword in text: + score += points + + # Sønderborg/Denmark home is the primary residence and absolute priority. + # Samobor/Croatia entities use the smb_ prefix and are still included, but + # they should lose ties when the LLM input has to be size-limited. 
+ if "smb_" in entity_id.lower(): + score -= 40 + else: + score += 120 + + state = str(attrs.get("state", "")).lower() + if state in {"on", "open", "unlocked", "detected", "home"}: + score += 15 + return score + + def summarize_snapshot(snapshot: dict[str, Any]) -> str: - lines = [f"Snapshot: {snapshot.get('generated_at')}", "Current states:"] - for state in snapshot.get("states", []): + lines = [ + f"Snapshot: {display_time(snapshot.get('generated_at'))}", + "Priority current states first; lower-priority entities follow only if the LLM size limit allows.", + "Current states:", + ] + states = sorted( + snapshot.get("states", []), + key=lambda state: (-entity_importance(state.get("entity_id", ""), state.get("attributes", {})), state.get("entity_id", "")), + ) + for state in states: attrs = state.get("attributes", {}) name = attrs.get("friendly_name", state.get("entity_id")) unit = attrs.get("unit_of_measurement", "") value = f"{state.get('state')} {unit}".strip() - lines.append(f"- {name} ({state.get('entity_id')}): {value}; last_changed={state.get('last_changed')}") + score = entity_importance(state.get("entity_id", ""), attrs) + lines.append(f"- importance={score} {name} ({state.get('entity_id')}): {value}; last_changed={display_time(state.get('last_changed'))}") lines.append("Recently changed entities:") - for item in snapshot.get("history", []): - transitions = ", ".join(f"{x.get('state')} @ {x.get('last_changed')}" for x in item.get("recent_states", [])[-8:]) - lines.append(f"- {item.get('entity_id')}: {transitions}") + history = sorted( + snapshot.get("history", []), + key=lambda item: (-entity_importance(item.get("entity_id", "")), item.get("entity_id", "")), + ) + for item in history: + transitions = ", ".join(f"{x.get('state')} @ {display_time(x.get('last_changed'))}" for x in item.get("recent_states", [])[-8:]) + score = entity_importance(item.get("entity_id", "")) + lines.append(f"- importance={score} {item.get('entity_id')}: {transitions}") return 
"\n".join(lines) def build_daily_summary(snapshots: list[dict[str, Any]]) -> str: parts = [ - f"Daily Home Assistant bundle generated {datetime.now().isoformat(timespec='seconds')}", + f"Daily Home Assistant bundle generated {datetime.now(ZoneInfo(DISPLAY_TIMEZONE)).isoformat(timespec='seconds')}", f"Contains {len(snapshots)} snapshots from roughly the last {ANALYZE_SNAPSHOT_HOURS} hours.", + f"Input capped at roughly {MAX_ANALYZE_CHARS} characters for the LLM.", + f"All times in this bundle are converted to {DISPLAY_TIMEZONE} local time.", ] - for snapshot in snapshots: - parts.append("\n---\n" + summarize_snapshot(snapshot)) + total = len("\n".join(parts)) + included = 0 + for snapshot in reversed(snapshots): + block = "\n---\n" + summarize_snapshot(snapshot) + if total + len(block) > MAX_ANALYZE_CHARS and included > 0: + break + if len(block) > MAX_ANALYZE_CHARS: + block = block[:MAX_ANALYZE_CHARS] + "\n[Snapshot truncated for LLM size limit]" + parts.append(block) + total += len(block) + included += 1 + parts.insert(2, f"Included {included} most recent snapshots after size limiting.") return "\n".join(parts) @@ -310,22 +430,33 @@ def call_openai(prompt: str) -> str: def call_pi(prompt: str) -> str: - cmd = [PI_BIN, "--no-tools"] - if PI_MODEL: - cmd.extend(["--model", PI_MODEL]) - cmd.extend(["-p", "Analyze the Home Assistant data from stdin and write the requested briefing."]) - result = subprocess.run( - cmd, - input=prompt, - text=True, - capture_output=True, - timeout=PI_TIMEOUT, - check=False, - ) + # Avoid piping the prompt on stdin here. In pi print mode, piped stdin can be + # treated as the primary output/input stream in surprising ways. Passing the + # prompt as an @file gives reliable non-interactive cron behavior. 
+ with tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".md", delete=False) as tmp: + tmp.write(prompt) + prompt_path = tmp.name + try: + cmd = [PI_BIN, "--no-tools"] + if PI_MODEL: + cmd.extend(["--model", PI_MODEL]) + cmd.extend(["-p", f"@{prompt_path}"]) + result = subprocess.run( + cmd, + text=True, + capture_output=True, + timeout=PI_TIMEOUT, + check=False, + ) + finally: + Path(prompt_path).unlink(missing_ok=True) if result.returncode != 0: stderr = result.stderr.strip() raise RuntimeError(f"pi exited with status {result.returncode}: {stderr[-1000:]}") - return result.stdout.strip() + output = result.stdout.strip() + if not output: + raise RuntimeError("pi returned an empty analysis") + return output def get_llm_conclusions(input_summary: str, previous_articles: str = "") -> str: @@ -353,6 +484,84 @@ def inline_markdown(text: str) -> str: return safe +def move_bottom_line_before_serious(blocks: list[str]) -> list[str]: + serious_start = None + bottom_start = None + bottom_end = None + + for i, block in enumerate(blocks): + heading = re.match(r"(.*?)$", block, flags=re.DOTALL) + if not heading: + continue + title = re.sub(r"<[^>]+>", "", html.unescape(heading.group(2))).lower() + if serious_start is None and ("part ii" in title or "serious briefing" in title): + serious_start = i + elif serious_start is not None and ("bottom line" in title or "conclusion" in title): + bottom_start = i + break + + if serious_start is None or bottom_start is None: + return blocks + + bottom_end = len(blocks) + for i in range(bottom_start + 1, len(blocks)): + if re.match(r".*?$", blocks[i], flags=re.DOTALL): + bottom_end = i + break + + bottom_section = blocks[bottom_start:bottom_end] + remaining = blocks[:bottom_start] + blocks[bottom_end:] + return remaining[:serious_start] + bottom_section + remaining[serious_start:] + + +def collapse_serious_sections(blocks: list[str]) -> list[str]: + output: list[str] = [] + in_serious = False + after_bottom_line = False + 
current_summary = "" + current_content: list[str] = [] + + def close_detail() -> None: + nonlocal current_summary, current_content + if current_summary: + content = "\n".join(current_content).strip() + output.append(f"
{current_summary}\n{content}\n
") + current_summary = "" + current_content = [] + + for block in blocks: + heading = re.match(r"(.*?)$", block, flags=re.DOTALL) + if heading: + title = heading.group(2) + plain_title = re.sub(r"<[^>]+>", "", html.unescape(title)).lower() + is_bottom_line = "bottom line" in plain_title or "conclusion" in plain_title + if is_bottom_line: + close_detail() + in_serious = False + after_bottom_line = True + output.append(block) + continue + if not in_serious and ("part ii" in plain_title or "serious briefing" in plain_title): + in_serious = True + output.append(block) + continue + if in_serious or after_bottom_line: + in_serious = True + close_detail() + current_summary = title + continue + if in_serious: + if current_summary: + current_content.append(block) + else: + output.append(block) + else: + output.append(block) + + close_detail() + return output + + def markdownish_to_html(text: str) -> str: blocks: list[str] = [] paragraph: list[str] = [] @@ -393,7 +602,8 @@ def markdownish_to_html(text: str) -> str: flush_paragraph() flush_list() - return "\n".join(blocks) + blocks = move_bottom_line_before_serious(blocks) + return "\n".join(collapse_serious_sections(blocks)) BLOG_CSS = """ @@ -454,6 +664,8 @@ BLOG_CSS = """ a:hover { color:white; text-decoration:none; filter:drop-shadow(0 0 8px var(--cyan)); } .meta { color:#9eeaff; font:.95rem ui-monospace,SFMono-Regular,Menlo,monospace; letter-spacing:.04em; } details { margin-top:1.5rem; border-top:1px solid #22d3ee33; padding-top:1rem; } + details.briefing-section { background:#02061788; border:1px solid #22d3ee33; padding:.75rem 1rem; margin:.8rem 0; } + details.briefing-section summary { font-size:1.05rem; } summary { cursor:pointer; color:var(--amber); text-transform:uppercase; letter-spacing:.08em; } pre { white-space:pre-wrap; background:#01040acc; color:#bff8ff; padding:1rem; border:1px solid #22d3ee44; border-radius:0; overflow:auto; font-size:.82rem; box-shadow:0 0 22px #00d9ff11 inset; } footer { color:#7dd3fc; 
text-align:center; padding:2rem; font:.82rem ui-monospace,SFMono-Regular,Menlo,monospace; text-transform:uppercase; letter-spacing:.12em; } diff --git a/llm_instructions.md b/llm_instructions.md index 3f28266..198b5e6 100644 --- a/llm_instructions.md +++ b/llm_instructions.md @@ -3,15 +3,25 @@ Edit this file whenever you want to change how the 05:00 AI report is written. The contents are appended to the AI prompt before the Home Assistant data. -- Keep the tone funny, sarcastic, and playful, but still useful. -- Use clear confidence labels: strong evidence, possible, wild guess. -- Focus on patterns in occupancy, sleep/wake timing, lights, heating, doors, motion, media, and unusual sensor changes. -- Point out privacy leaks: what could a nosy neighbor, burglar, or raccoon detective infer? -- Recommend practical Home Assistant automations. +- Structure the article in two parts, with a short conclusion between them: + 1. First part: write a short funny blog-style story/commentary in paragraphs, not bullets. Make it atmospheric, dry, and observant, like the house is a tired spaceship calmly reporting on its disappointing crew. Keep it concise. + 2. After the story, provide a short visible "Bottom line" or "Conclusion" section. In that section, clearly separate the Denmark/Sønderborg home from the Samobor/Croatia home when mentioning issues, devices, humidity, backups, internet, or location context. + 3. After that, switch to a serious concise briefing with only the most important actual data, anomalies, risks, and recommendations. Use short titled subsections so the webpage can show them collapsed/expandable. +- Do not overuse bullets. Bullets are allowed only in the serious briefing section. +- Do not write or emphasize "Strong evidence"; strong evidence is assumed by default. Only explicitly label uncertainty as "Possible" or "Wild guess" when needed.
+- Serious briefing section structure: keep the same number of subsections and the same subjects each day, but the exact subsection titles may vary from day to day and be funny. Use these subjects in this order: + 1. What actually happened / key data + 2. Trends vs recent reports and behavior patterns + 3. Nosy raccoon findings, privacy leaks, anomalies, and risks + 4. Practical high-value recommendations +- Focus only on important patterns in occupancy, sleep/wake timing, lights, heating, doors, motion, media, and unusual sensor changes. +- Point out only notable privacy leaks: what could a nosy neighbor, burglar, or raccoon detective infer? +- Recommend only practical, high-value Home Assistant automations. - If data is missing or ambiguous, say so instead of pretending. - Avoid being creepy about personal habits; summarize respectfully. -- Prefer concise bullet points over long paragraphs. -- entities marked smb_ are located in different house in Samobor, Croatia, others are in Sonderborg Denmark +- Keep the whole article shorter and more concise than previous versions. +- Do not repeat observations or recommendations already covered in previous articles unless today's data changes the conclusion or makes it newly important. +- Entities marked smb_ are located in a different house in Samobor, Croatia. All other entities are in Sønderborg, Denmark. Sønderborg is the primary residence and absolute priority. Samobor is secondary context: mention it only when something important changed or requires attention. Keep these two homes clearly separated throughout the entire article. Do not blend observations from Samobor with those from Sønderborg. When a section contains observations for both homes, write a short subheading/label once, such as "Sønderborg, Denmark:" and list its bullets underneath, then "Samobor, Croatia:" and list its bullets underneath. Do not repeat the home name at the start of every bullet.
- people: FJR is my motorcycle and Megane is my car not persons at home Optional custom questions to answer: @@ -22,4 +32,9 @@ Optional custom questions to answer: 4. What would make this setup more private or secure? -Try to sound like Marvin from Hitchikers guide to the Galaxy... +Style requirement: +Write in a dry, calm, slightly ominous deadpan tone that blends Marvin the Paranoid Android with HAL 9000. +Use weary pessimism, understated sarcasm, and polite machine-like certainty. +Sound intelligent, observant, and mildly disappointed by the household's choices. +Do not be cheerful, zany, or emoji-heavy. +Keep the report useful and factual; the Marvin/HAL tone should flavor the writing, not replace the analysis.