commit 01af871145b0efbb44694d335b72deab96936131
Author: hbrain
Date:   Sat May 16 08:04:53 2026 +0000

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a2963ab
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,15 @@
+# Local secrets/configuration
+*.conf
+.git.env
+
+# Generated/state files
+wallabag_downloaded.json
+out/
+
+# Python cache
+__pycache__/
+*.py[cod]
+
+# OS/editor noise
+.DS_Store
+*.swp
diff --git a/mail.conf.sample b/mail.conf.sample
new file mode 100644
index 0000000..67127cc
--- /dev/null
+++ b/mail.conf.sample
@@ -0,0 +1,6 @@
+SMTP_HOST=smtp.example.com
+SMTP_PORT=587
+SMTP_USER=user@example.com
+SMTP_PASS=change-me
+SMTP_SENDER=user@example.com
+KINDLE_EMAIL=your-kindle@example.com
diff --git a/send_to_kindle.py b/send_to_kindle.py
new file mode 100755
index 0000000..7c3a447
--- /dev/null
+++ b/send_to_kindle.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+import argparse
+import mimetypes
+import os
+import smtplib
+import re
+import zipfile
+import xml.etree.ElementTree as ET
+from email.message import EmailMessage
+from pathlib import Path
+
+DEFAULT_CONFIG = Path("./mail.conf")
+DEFAULT_OUT = Path("./out")
+
+
+def load_config(path: Path) -> dict:
+    """Load simple KEY=VALUE config file. Lines starting with # are ignored."""
+    cfg = {}
+    if not path.is_file():
+        return cfg
+    for line in path.read_text(encoding="utf-8").splitlines():
+        line = line.strip()
+        if not line or line.startswith("#") or "=" not in line:
+            continue
+        key, value = line.split("=", 1)
+        cfg[key.strip()] = value.strip().strip('"').strip("'")
+    return cfg
+
+
+def get_value(args, cfg, attr, key, env=None, default=None):
+    val = getattr(args, attr, None)
+    if val not in (None, ""):
+        return val
+    if key in cfg and cfg[key] != "":
+        return cfg[key]
+    if env and os.getenv(env):
+        return os.getenv(env)
+    return default
+
+
+def chunks(items, size):
+    for i in range(0, len(items), size):
+        yield items[i:i + size]
+
+
+def epub_title(path: Path) -> str | None:
+    """Read dc:title from EPUB metadata, if available."""
+    try:
+        with zipfile.ZipFile(path) as z:
+            container = ET.fromstring(z.read("META-INF/container.xml"))
+            ns_container = {"c": "urn:oasis:names:tc:opendocument:xmlns:container"}
+            rootfile = container.find(".//c:rootfile", ns_container)
+            if rootfile is None:
+                return None
+            opf_path = rootfile.attrib["full-path"]
+            opf = ET.fromstring(z.read(opf_path))
+            ns = {"dc": "http://purl.org/dc/elements/1.1/"}
+            title = opf.find(".//dc:title", ns)
+            if title is not None and title.text:
+                return " ".join(title.text.replace("_", " ").split())
+    except Exception:
+        return None
+    return None
+
+
+def attachment_name(path: Path) -> str:
+    """Use EPUB metadata title as emailed attachment filename.
+
+    Local filenames stay unchanged. Amazon Send-to-Kindle often derives the
+    displayed document title from the email attachment filename, so use a nice
+    title-based attachment filename while keeping the .epub extension.
+    """
+    title = epub_title(path)
+    if not title:
+        return path.name
+    name = re.sub(r'[\\/:*?"<>|]+', ' ', title)
+    name = re.sub(r"\s+", " ", name).strip().rstrip('.')
+    return (name[:120] or path.stem) + path.suffix.lower()
+
+
+def send_to_kindle(smtp_host, smtp_port, smtp_user, smtp_pass, sender, kindle_email, file_paths):
+    """Email the given ebook files as attachments to the Kindle address.
+
+    Port 465 uses implicit TLS (SMTP_SSL); any other port, such as the 587 in
+    mail.conf.sample, connects in plain text and upgrades with STARTTLS.
+    """
+    file_paths = [Path(p) for p in file_paths]
+    for file_path in file_paths:
+        if not file_path.is_file():
+            raise FileNotFoundError(file_path)
+
+    # Resolve each attachment name once; doing so opens the EPUB archive.
+    display_names = [attachment_name(p) for p in file_paths]
+
+    msg = EmailMessage()
+    msg["From"] = sender
+    msg["To"] = kindle_email
+    msg["Subject"] = "Send to Kindle"
+    msg.set_content("Attached ebook(s):\n\n" + "\n".join(display_names))
+
+    for file_path, display_name in zip(file_paths, display_names):
+        ctype, _ = mimetypes.guess_type(file_path)
+        if ctype is None:
+            ctype = "application/octet-stream"
+        maintype, subtype = ctype.split("/", 1)
+        msg.add_attachment(
+            file_path.read_bytes(),
+            maintype=maintype,
+            subtype=subtype,
+            filename=display_name,
+        )
+
+    port = int(smtp_port)
+    implicit_tls = port == 465
+    smtp_class = smtplib.SMTP_SSL if implicit_tls else smtplib.SMTP
+    with smtp_class(smtp_host, port) as smtp:
+        if not implicit_tls:
+            smtp.starttls()
+        smtp.login(smtp_user, smtp_pass)
+        smtp.send_message(msg)
+
+
+def find_epubs(out_dir: Path):
+    return sorted(out_dir.glob("*.epub"))
+
+
+def main():
+    p = argparse.ArgumentParser(description="Send ebook(s) to Kindle via email")
+    p.add_argument("file", nargs="?", help="ebook file, e.g. .epub/.pdf/.mobi. If omitted, sends all .epub files in ./out")
+    p.add_argument("--config", default=str(DEFAULT_CONFIG), help=f"config file, default: {DEFAULT_CONFIG}")
+    p.add_argument("--kindle", help="your Kindle email, e.g. name@kindle.com")
+    p.add_argument("--smtp-host")
+    p.add_argument("--smtp-port", type=int)
+    p.add_argument("--smtp-user")
+    p.add_argument("--smtp-pass")
+    p.add_argument("--sender")
+    p.add_argument("--max-attachments", type=int, default=16, help="maximum attachments per email, default: 16")
+    args = p.parse_args()
+
+    cfg = load_config(Path(args.config).expanduser())
+
+    settings = {
+        "smtp_host": get_value(args, cfg, "smtp_host", "SMTP_HOST", "SMTP_HOST"),
+        "smtp_port": get_value(args, cfg, "smtp_port", "SMTP_PORT", "SMTP_PORT", "465"),
+        "smtp_user": get_value(args, cfg, "smtp_user", "SMTP_USER", "SMTP_USER"),
+        "smtp_pass": get_value(args, cfg, "smtp_pass", "SMTP_PASS", "SMTP_PASS"),
+        "sender": get_value(args, cfg, "sender", "SMTP_SENDER", "SMTP_SENDER"),
+        "kindle": get_value(args, cfg, "kindle", "KINDLE_EMAIL", "KINDLE_EMAIL"),
+    }
+
+    missing = [k for k, v in settings.items() if not v]
+    if missing:
+        raise SystemExit(
+            "Missing: " + ", ".join(missing)
+            + f"\nAdd them to {args.config} or pass them as command-line options."
+        )
+
+    files = [Path(args.file).expanduser()] if args.file else find_epubs(DEFAULT_OUT)
+    if not files:
+        raise SystemExit(f"No EPUB files found in {DEFAULT_OUT}")
+
+    max_attachments = max(1, args.max_attachments)
+    batches = list(chunks(files, max_attachments))
+    for idx, batch in enumerate(batches, 1):
+        send_to_kindle(
+            settings["smtp_host"],
+            settings["smtp_port"],
+            settings["smtp_user"],
+            settings["smtp_pass"],
+            settings["sender"],
+            settings["kindle"],
+            batch,
+        )
+        suffix = f" ({idx}/{len(batches)})" if len(batches) > 1 else ""
+        print(f"Sent email{suffix}: {len(batch)} attachment(s)")
+        for file_path in batch:
+            file_path.unlink()
+            print(f" - sent and deleted: {file_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/wallabag.conf.sample b/wallabag.conf.sample
new file mode 100644
index 0000000..796ce01
--- /dev/null
+++ b/wallabag.conf.sample
@@ -0,0 +1,5 @@
+WALLABAG_URL=https://wallabag.example.com
+CLIENT_ID=change-me
+CLIENT_SECRET=change-me
+USERNAME=change-me
+PASSWORD=change-me
diff --git a/wallabag_to_epub.py b/wallabag_to_epub.py
new file mode 100755
index 0000000..35da2ba
--- /dev/null
+++ b/wallabag_to_epub.py
@@ -0,0 +1,455 @@
+#!/usr/bin/env python3
+"""Fetch Wallabag articles and create one EPUB per article."""
+import argparse
+import html
+import json
+import re
+import time
+import mimetypes
+import urllib.parse
+import urllib.request
+import uuid
+import zipfile
+from datetime import datetime, timezone
+from pathlib import Path
+from xml.sax.saxutils import escape
+
+BASE_DIR = Path(".")
+DEFAULT_CONFIG = Path("./wallabag.conf")
+DEFAULT_OUT = Path("./out")
+DEFAULT_DB = Path("./wallabag_downloaded.json")
+
+
+def load_config(path: Path) -> dict:
+    """Load simple KEY=VALUE config file. Lines starting with # are ignored."""
+    cfg = {}
+    if path.is_file():
+        for line in path.read_text(encoding="utf-8").splitlines():
+            line = line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            k, v = line.split("=", 1)
+            cfg[k.strip()] = v.strip().strip('"').strip("'")
+    return cfg
+
+
+def http_json(url, method="GET", data=None, token=None):
+    """Send a request (form-encoded when data is given) and return the decoded JSON."""
+    body = None
+    headers = {"Accept": "application/json"}
+    if data is not None:
+        body = urllib.parse.urlencode(data).encode()
+        headers["Content-Type"] = "application/x-www-form-urlencoded"
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+    req = urllib.request.Request(url, data=body, headers=headers, method=method)
+    with urllib.request.urlopen(req, timeout=60) as r:
+        return json.loads(r.read().decode("utf-8"))
+
+
+def wallabag_token(cfg):
+    """Request an access token from /oauth/v2/token using the password grant."""
+    url = cfg["WALLABAG_URL"].rstrip("/") + "/oauth/v2/token"
+    return http_json(url, "POST", {
+        "grant_type": "password",
+        "client_id": cfg["CLIENT_ID"],
+        "client_secret": cfg["CLIENT_SECRET"],
+        "username": cfg["USERNAME"],
+        "password": cfg["PASSWORD"],
+    })["access_token"]
+
+
+def fetch_entries(cfg, token, limit=10, unread=True, starred=False, archive=False):
+    """Query /api/entries.json for up to `limit` entries, sorted by creation date descending."""
+    base = cfg["WALLABAG_URL"].rstrip("/") + "/api/entries.json"
+    qs = {
+        "perPage": str(limit),
+        "page": "1",
+        "sort": "created",
+        "order": "desc",
+        "detail": "full",
+    }
+    if unread:
+        qs["archive"] = "0"
+    if archive:
+        qs["archive"] = "1"
+    if starred:
+        qs["starred"] = "1"
+    data = http_json(base + "?" + urllib.parse.urlencode(qs), token=token)
+    return data.get("_embedded", {}).get("items", [])
+
+
+def mark_archived(cfg, token, entry_id):
+    """PATCH one entry with archive=1."""
+    url = cfg["WALLABAG_URL"].rstrip("/") + f"/api/entries/{entry_id}.json"
+    return http_json(url, "PATCH", {"archive": "1"}, token=token)
+
+
+def _xml_safe_entity(match):
+    """Decode one named HTML entity and re-escape the result for XML."""
+    return escape(html.unescape(match.group(0)), {'"': "&quot;"})
+
+
+def clean_fragment(content):
+    """Prepare Wallabag HTML for EPUB while preserving formatting where possible.
+
+    Wallabag already extracts readable article HTML, so keep headings, lists,
+    blockquotes, tables, links, inline styles/classes, and images. Remove only
+    active/interactive content and make common void tags XML-compatible.
+    """
+    if not content:
+        return "<p></p>"
+    # NOTE(review): the three element names were unreadable in the damaged copy
+    # of this patch; script/iframe/form is a reconstruction - confirm.
+    for name in ("script", "iframe", "form"):
+        content = re.sub(rf"<{name}\b[^>]*>.*?</{name}>", "", content, flags=re.I | re.S)
+    content = re.sub(r"\s(on\w+)=([\"']).*?\2", "", content, flags=re.I | re.S)
+    # Many sites lazy-load images and Wallabag can keep the real URL in data-*
+    # while src is empty/a placeholder. Promote those before dropping extras.
+    content = promote_lazy_image_srcs(content)
+    content = re.sub(r"\s(srcset|sizes)=([\"']).*?\2", "", content, flags=re.I | re.S)
+    # XHTML only knows the five predefined entities, so decode the other named
+    # ones (&nbsp;, &mdash;, ...). A blanket html.unescape() would also turn
+    # &lt; and &amp; into raw markup characters and break the document.
+    content = re.sub(r"&(?!(?:amp|lt|gt|quot|apos);)[A-Za-z][A-Za-z0-9]*;", _xml_safe_entity, content)
+    content = re.sub(r"&(?!amp;|lt;|gt;|quot;|apos;|#\d+;|#x[0-9A-Fa-f]+;)", "&amp;", content)
+    content = re.sub(r"<(br|hr|img|meta|link|input)(\b[^>]*?)(?<!/)>", r"<\1\2 />", content, flags=re.I)
+    return content
+
+
+def promote_lazy_image_srcs(content):
+    """Use lazy-loader attributes as img src when src is missing/a placeholder."""
+    def repl(match):
+        tag = match.group(0)
+        attrs = {m.group(1).lower(): m.group(3) for m in re.finditer(r"\s([\w:-]+)=([\"'])(.*?)\2", tag, flags=re.S)}
+        src = (attrs.get("src") or "").strip()
+        lazy = None
+        for name in ("data-src", "data-original", "data-lazy-src", "data-url", "data-full-url"):
+            if attrs.get(name):
+                lazy = attrs[name].strip()
+                break
+        if not lazy:
+            return tag
+        if not src or src.startswith("data:") or "placeholder" in src.lower() or src in ("#", "/"):
+            new_attr = f' src="{lazy}"'
+            if " src=" in tag.lower():
+                # A callable keeps backslashes in the URL from being read as escapes.
+                return re.sub(r"\ssrc=([\"']).*?\1", lambda _m: new_attr, tag, count=1, flags=re.I | re.S)
+            closing = "/>" if tag.endswith("/>") else ">"
+            return tag[:-len(closing)].rstrip() + new_attr + closing
+        return tag
+    return re.sub(r"<img\b[^>]*>", repl, content, flags=re.I | re.S)
+
+
+def fetch_image(url, referer=None):
+    """Download one image (at most 20 MiB) and return (data, (media_type, ext))."""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36",
+        # Prefer Kindle-friendly formats. Some CDNs serve AVIF/WebP when asked,
+        # which Amazon's EPUB conversion may drop.
+        "Accept": "image/jpeg,image/png,image/gif,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
+    }
+    if referer:
+        headers["Referer"] = referer
+    req = urllib.request.Request(url, headers=headers)
+    with urllib.request.urlopen(req, timeout=30) as r:
+        data = r.read(20 * 1024 * 1024 + 1)
+        if len(data) > 20 * 1024 * 1024:
+            raise ValueError("image is larger than 20 MiB")
+        ctype = (r.headers.get("Content-Type") or "").split(";", 1)[0].strip().lower()
+    return data, guess_image_type(url, data, ctype)
+
+
+def guess_image_type(url, data, ctype):
+    """Return (media_type, extension) from the header, magic bytes or URL.
+
+    NOTE(review): everything after the SVG check was unreadable in the damaged
+    copy of this patch and is a reconstruction - confirm against the original.
+    """
+    if ctype.startswith("image/"):
+        media_type = ctype
+    elif data.startswith(b"\xff\xd8\xff"):
+        media_type = "image/jpeg"
+    elif data.startswith(b"\x89PNG\r\n\x1a\n"):
+        media_type = "image/png"
+    elif data.startswith(b"GIF87a") or data.startswith(b"GIF89a"):
+        media_type = "image/gif"
+    elif data.startswith(b"RIFF") and data[8:12] == b"WEBP":
+        media_type = "image/webp"
+    elif b"<svg" in data[:1024].lower():
+        media_type = "image/svg+xml"
+    else:
+        guessed, _ = mimetypes.guess_type(urllib.parse.urlparse(url).path)
+        media_type = guessed or "application/octet-stream"
+    ext = mimetypes.guess_extension(media_type) or ".img"
+    return media_type, ext
+
+
+def sanitize_img_tag(tag):
+    """Drop lazy-loader leftovers from an <img> tag and keep it self-closed.
+
+    NOTE(review): only the call site of this helper survived in the damaged
+    copy of this patch; the body is a conservative reconstruction - confirm.
+    """
+    tag = re.sub(r"\s(?:data-[\w:-]+|loading|decoding)=([\"']).*?\1", "", tag, flags=re.I | re.S)
+    if not tag.endswith("/>"):
+        tag = tag[:-1].rstrip() + " />"
+    return tag
+
+
+def embed_images(content, base_url):
+    """Download <img> sources, rewrite them to local EPUB paths, return manifest items."""
+    images = []
+    by_src = {}
+
+    def repl(match):
+        tag, quote, src = match.group(0), match.group(1), html.unescape(match.group(2)).strip()
+        if not src or src.startswith(("data:", "cid:")):
+            return tag
+        abs_url = urllib.parse.urljoin(base_url, src)
+        parsed = urllib.parse.urlparse(abs_url)
+        if parsed.scheme not in ("http", "https"):
+            return tag
+        if abs_url not in by_src:
+            try:
+                data, (media_type, ext) = fetch_image(abs_url, referer=base_url)
+            except Exception as e:
+                print(f"Warning: could not download image {abs_url}: {e}")
+                return tag
+            item_id = f"img{len(images) + 1}"
+            href = f"images/{item_id}{ext}"
+            by_src[abs_url] = href
+            images.append({"id": item_id, "href": href, "media_type": media_type, "data": data})
+        new_src = by_src[abs_url]
+        tag = re.sub(r"\ssrc=([\"']).*?\1", f' src="{new_src}"', tag, count=1, flags=re.I | re.S)
+        return sanitize_img_tag(tag)
+
+    content = re.sub(r"<img\b[^>]*\ssrc=([\"'])(.*?)\1[^>]*>", repl, content, flags=re.I | re.S)
+    return content, images
+
+
+def safe_name(s):
+    s = re.sub(r"[^A-Za-z0-9_.-]+", "_", s).strip("_")
+    return s[:80] or "article"
+
+
+def display_title(s):
+    """Human-friendly Wallabag article title for EPUB metadata/display.
+
+    Filenames are sanitized separately with safe_name(); this function must not
+    use the filename. Some older/generated titles may contain underscores, so
+    turn those back into spaces for Kindle display.
+    """
+    s = html.unescape(s or "Wallabag article")
+    s = s.replace("_", " ")
+    s = re.sub(r"\s+", " ", s).strip()
+    return s or "Wallabag article"
+
+
+def metadata_title(s):
+    """Title string intended for EPUB metadata, not filename."""
+    return display_title(s).replace(".epub", "").strip()
+
+
+def build_epub(entry, out_path: Path, title: str | None = None):
+    """Build a single-article EPUB.
+
+    NOTE(review): the XML/XHTML markup inside the templates below was stripped
+    from the damaged copy of this patch. Element structure is reconstructed
+    around the surviving placeholders - confirm against the original commit.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    book_id = f"urn:uuid:{uuid.uuid4()}"
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    etitle = metadata_title(title or entry.get("title") or "Wallabag article")
+    url = entry.get("url") or ""
+    href = html.escape(url, quote=True)
+    domain = entry.get("domain_name") or urllib.parse.urlparse(url).netloc or "Wallabag"
+    published = entry.get("published_at") or entry.get("created_at") or ""
+    raw_content = entry.get("content") or "<p></p>"
+    preview = entry.get("preview_picture") or ""
+    if preview and preview not in raw_content:
+        raw_content = f'<figure><img src="{html.escape(preview, quote=True)}" alt="" /><figcaption>{html.escape(etitle, quote=True)}</figcaption></figure>\n' + raw_content
+    content = clean_fragment(raw_content)
+    content, images = embed_images(content, url)
+
+    chapter = f'''<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+  <title>{escape(etitle)}</title>
+  <meta charset="utf-8" />
+  <link rel="stylesheet" type="text/css" href="style.css" />
+</head>
+<body>
+<article>
+<header>
+<h1>{escape(etitle)}</h1>
+<p class="source">{escape(domain)}{(' · ' + escape(published[:10])) if published else ''}</p>
+<p class="source"><a href="{href}">{escape(url)}</a></p>
+</header>
+{content}
+</article>
+</body>
+</html>
+'''
+
+    container = '''<?xml version="1.0" encoding="UTF-8"?>
+<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+  <rootfiles>
+    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
+  </rootfiles>
+</container>
+'''
+    image_manifest = "".join(
+        f'    <item id="{img["id"]}" href="{img["href"]}" media-type="{img["media_type"]}"/>\n'
+        for img in images
+    )
+    opf = f'''<?xml version="1.0" encoding="utf-8"?>
+<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid" xml:lang="en">
+  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+    <dc:identifier id="bookid">{book_id}</dc:identifier>
+    <dc:title id="title">{escape(etitle)}</dc:title>
+    <meta refines="#title" property="title-type">main</meta>
+    <dc:language>en</dc:language>
+    <dc:creator id="creator">{escape(domain)}</dc:creator>
+    <meta refines="#creator" property="role" scheme="marc:relators">aut</meta>
+    <dc:publisher>Wallabag</dc:publisher>
+    <dc:source>{escape(url)}</dc:source>
+    <meta property="dcterms:modified">{now}</meta>
+  </metadata>
+  <manifest>
+    <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
+    <item id="css" href="style.css" media-type="text/css"/>
+    <item id="article" href="article.xhtml" media-type="application/xhtml+xml"/>
+{image_manifest}  </manifest>
+  <spine>
+    <itemref idref="article"/>
+  </spine>
+</package>
+'''
+    nav = f'''<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en" lang="en">
+<head><title>{escape(etitle)}</title></head>
+<body>
+<nav epub:type="toc" id="toc"><ol><li><a href="article.xhtml">{escape(etitle)}</a></li></ol></nav>
+</body>
+</html>
+'''
+    css = """body{font-family:serif;line-height:1.45;margin:0;padding:1em;} article{max-width:42em;} h1{line-height:1.15;} img,video{max-width:100%;height:auto;} figure{margin:1em 0;} figcaption,.source{font-size:.85em;color:#666;} blockquote{border-left:3px solid #aaa;margin-left:.5em;padding-left:1em;color:#333;} pre,code{font-family:monospace;white-space:pre-wrap;} table{border-collapse:collapse;max-width:100%;} td,th{border:1px solid #ccc;padding:.25em;}"""
+
+    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
+        # The mimetype entry must come first and stay uncompressed.
+        z.writestr("mimetype", "application/epub+zip", compress_type=zipfile.ZIP_STORED)
+        z.writestr("META-INF/container.xml", container)
+        z.writestr("OEBPS/content.opf", opf)
+        z.writestr("OEBPS/nav.xhtml", nav)
+        z.writestr("OEBPS/style.css", css)
+        z.writestr("OEBPS/article.xhtml", chapter)
+        for img in images:
+            z.writestr("OEBPS/" + img["href"], img["data"])
+    print(f"Embedded {len(images)} image(s) in {out_path}")
+    return out_path
+
+
+def article_key(entry) -> str:
+    if entry.get("id") is not None:
+        return f"id:{entry['id']}"
+    if entry.get("url"):
+        return "url:" + entry["url"]
+    return "title:" + display_title(entry.get("title") or "")
+
+
+def load_downloaded(path: Path) -> dict:
+    if not path.is_file():
+        return {}
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+        return data if isinstance(data, dict) else {}
+    except Exception:
+        return {}
+
+
+def save_downloaded(path: Path, data: dict):
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = path.with_suffix(path.suffix + ".tmp")
+    tmp.write_text(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True) + "\n", encoding="utf-8")
+    tmp.replace(path)
+
+
+def remember_downloaded(db: dict, entry, out: Path):
+    key = article_key(entry)
+    db[key] = {
+        "id": entry.get("id"),
+        "title": display_title(entry.get("title") or ""),
+        "url": entry.get("url"),
+        "epub": str(out),
+        "downloaded_at": datetime.now().isoformat(timespec="seconds"),
+    }
+
+
+def article_output_path(entry, out_dir: Path) -> Path:
+    title = entry.get("title") or f"wallabag-{entry.get('id', int(time.time()))}"
+    suffix = entry.get("id") or int(time.time())
+    return out_dir / f"{safe_name(title)}-{suffix}.epub"
+
+
+def main():
+    ap = argparse.ArgumentParser(description="Fetch Wallabag articles and build one EPUB per article")
+    ap.add_argument("--config", default=str(DEFAULT_CONFIG))
+    ap.add_argument("--limit", type=int, default=10)
+    ap.add_argument("--all", action="store_true", help="include archived/read articles too")
+    ap.add_argument("--starred", action="store_true", help="only starred articles")
+    ap.add_argument("--title", default=None, help="title override only when exporting one article")
+    ap.add_argument("--output", default=None, help="output directory, or .epub file if --limit 1")
+    ap.add_argument("--archive", action="store_true", help="mark fetched articles archived after successful build")
+    ap.add_argument("--db", default=str(DEFAULT_DB), help=f"download evidence DB, default: {DEFAULT_DB}")
+    ap.add_argument("--redownload", action="store_true", help="ignore evidence DB and download articles again")
+    args = ap.parse_args()
+
+    cfg = load_config(Path(args.config).expanduser())
+    missing = [k for k in ["WALLABAG_URL", "CLIENT_ID", "CLIENT_SECRET", "USERNAME", "PASSWORD"] if not cfg.get(k)]
+    if missing:
+        raise SystemExit(f"Missing in {args.config}: " + ", ".join(missing))
+
+    token = wallabag_token(cfg)
+    entries = fetch_entries(cfg, token, limit=args.limit, unread=not args.all, starred=args.starred)
+    if not entries:
+        raise SystemExit("No articles found.")
+
+    db_path = Path(args.db).expanduser()
+    downloaded = load_downloaded(db_path)
+    original_count = len(entries)
+    if not args.redownload:
+        entries = [e for e in entries if article_key(e) not in downloaded]
+    skipped = original_count - len(entries)
+    if skipped:
+        print(f"Skipped {skipped} already downloaded article(s). Use --redownload to fetch again.")
+    if not entries:
+        raise SystemExit("No new articles to download.")
+
+    output_arg = Path(args.output).expanduser() if args.output else DEFAULT_OUT
+    out_dir = output_arg if output_arg.suffix.lower() != ".epub" or len(entries) > 1 else output_arg.parent
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    created = []
+    for i, entry in enumerate(entries, 1):
+        out = output_arg if len(entries) == 1 and output_arg.suffix.lower() == ".epub" else article_output_path(entry, out_dir)
+        title = args.title if len(entries) == 1 and args.title else (entry.get("title") or f"Wallabag article {i}")
+        build_epub(entry, out, title)
+        remember_downloaded(downloaded, entry, out)
+        save_downloaded(db_path, downloaded)
+        created.append((entry, out))
+        print(f"Created: {out}")
+
+    if args.archive:
+        for entry, _ in created:
+            if entry.get("id") is not None:
+                mark_archived(cfg, token, entry["id"])
+        print(f"Archived {len(created)} articles in Wallabag.")
+
+
+if __name__ == "__main__":
+    main()