Initial commit

This commit is contained in:
hbrain 2026-05-16 08:04:53 +00:00
commit 01af871145
5 changed files with 615 additions and 0 deletions

15
.gitignore vendored Normal file
View file

@ -0,0 +1,15 @@
# Local secrets/configuration
*.conf
.git.env
# Generated/state files
wallabag_downloaded.json
out/
# Python cache
__pycache__/
*.py[cod]
# OS/editor noise
.DS_Store
*.swp

6
mail.conf.sample Normal file
View file

@ -0,0 +1,6 @@
SMTP_HOST=smtp.example.com
SMTP_PORT=587
SMTP_USER=user@example.com
SMTP_PASS=change-me
SMTP_SENDER=user@example.com
KINDLE_EMAIL=your-kindle@example.com

172
send_to_kindle.py Executable file
View file

@ -0,0 +1,172 @@
#!/usr/bin/env python3
import argparse
import mimetypes
import os
import smtplib
import re
import zipfile
import xml.etree.ElementTree as ET
from email.message import EmailMessage
from pathlib import Path
# Default KEY=VALUE settings file; used as the default of the --config option.
DEFAULT_CONFIG = Path("./mail.conf")
# Directory searched for *.epub files when no file argument is given.
DEFAULT_OUT = Path("./out")
def load_config(path: Path) -> dict:
    """Load a simple KEY=VALUE config file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Keys and values are stripped of surrounding whitespace, and a
    value wrapped in one matching pair of single or double quotes has that
    pair removed.

    Args:
        path: Location of the config file.

    Returns:
        Mapping of keys to string values; empty when *path* is not a file.
    """
    cfg = {}
    if not path.is_file():
        return cfg
    # Decode explicitly so non-ASCII passwords do not depend on the locale;
    # "utf-8-sig" also tolerates a byte-order mark left by some editors.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        value = value.strip()
        # Remove only a matching pair of surrounding quotes, so quote
        # characters that are part of the value itself are preserved.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        cfg[key.strip()] = value
    return cfg
def get_value(args, cfg, attr, key, env=None, default=None):
    """Resolve one setting from several sources.

    Sources are tried in this order and the first non-empty one wins:
    attribute *attr* of *args*, entry *key* of *cfg*, environment variable
    *env* (when a name is given), then *default*.
    """
    cli_value = getattr(args, attr, None)
    if cli_value is not None and cli_value != "":
        return cli_value

    file_value = cfg.get(key, "")
    if file_value != "":
        return file_value

    env_value = os.getenv(env) if env else None
    return env_value if env_value else default
def chunks(items, size):
    """Yield consecutive slices of *items*, each holding at most *size* elements."""
    starts = range(0, len(items), size)
    yield from (items[start:start + size] for start in starts)
def epub_title(path: Path) -> str | None:
"""Read dc:title from EPUB metadata, if available."""
try:
with zipfile.ZipFile(path) as z:
container = ET.fromstring(z.read("META-INF/container.xml"))
ns_container = {"c": "urn:oasis:names:tc:opendocument:xmlns:container"}
rootfile = container.find(".//c:rootfile", ns_container)
if rootfile is None:
return None
opf_path = rootfile.attrib["full-path"]
opf = ET.fromstring(z.read(opf_path))
ns = {"dc": "http://purl.org/dc/elements/1.1/"}
title = opf.find(".//dc:title", ns)
if title is not None and title.text:
return " ".join(title.text.replace("_", " ").split())
except Exception:
return None
return None
def attachment_name(path: Path) -> str:
    """Build the filename used for the emailed attachment.

    The file on disk is never renamed. When the EPUB metadata provides a
    title, the attachment is named after it (Send-to-Kindle often shows the
    attachment filename as the document title); otherwise the local filename
    is used as is. The original extension is kept, lower-cased.
    """
    title = epub_title(path)
    if not title:
        return path.name

    # Replace characters that are not allowed in filenames, then tidy up.
    cleaned = re.sub(r'[\\/:*?"<>|]+', ' ', title)
    cleaned = re.sub(r"\s+", " ", cleaned)
    cleaned = cleaned.strip().rstrip('.')

    stem = cleaned[:120] or path.stem
    return stem + path.suffix.lower()
def send_to_kindle(smtp_host, smtp_port, smtp_user, smtp_pass, sender, kindle_email, file_paths):
    """Email the given files as attachments in a single message.

    Args:
        smtp_host: SMTP server host name.
        smtp_port: SMTP server port (int or numeric string). Ports 25 and 587
            are contacted in plain text and upgraded with STARTTLS; every
            other port uses implicit TLS (SMTP over SSL).
        smtp_user: Login name for the SMTP server.
        smtp_pass: Password for the SMTP server.
        sender: Address placed in the From header.
        kindle_email: Address placed in the To header.
        file_paths: Iterable of paths to attach.

    Raises:
        FileNotFoundError: If any path is not an existing file. This is
            checked for all paths before anything is sent.
    """
    file_paths = [Path(p) for p in file_paths]
    for file_path in file_paths:
        if not file_path.is_file():
            raise FileNotFoundError(file_path)

    # Resolve each display name once; it requires opening the EPUB archive.
    display_names = [attachment_name(p) for p in file_paths]

    msg = EmailMessage()
    msg["From"] = sender
    msg["To"] = kindle_email
    msg["Subject"] = "Send to Kindle"
    msg.set_content("Attached ebook(s):\n\n" + "\n".join(display_names))

    for file_path, display_name in zip(file_paths, display_names):
        ctype, _ = mimetypes.guess_type(file_path)
        maintype, subtype = (ctype or "application/octet-stream").split("/", 1)
        msg.add_attachment(
            file_path.read_bytes(),
            maintype=maintype,
            subtype=subtype,
            filename=display_name,
        )

    port = int(smtp_port)
    # 25 and 587 expect a plain-text connection followed by STARTTLS; an
    # implicit-TLS handshake against them fails before login.
    use_starttls = port in (25, 587)
    if use_starttls:
        connection = smtplib.SMTP(smtp_host, port)
    else:
        connection = smtplib.SMTP_SSL(smtp_host, port)
    with connection as smtp:
        if use_starttls:
            smtp.starttls()
        smtp.login(smtp_user, smtp_pass)
        smtp.send_message(msg)
def find_epubs(out_dir: Path):
    """Return the ``*.epub`` files directly inside *out_dir*, in sorted order."""
    matches = list(out_dir.glob("*.epub"))
    matches.sort()
    return matches
def main():
    """Command-line entry point: email ebook(s) and delete them afterwards.

    Settings come from command-line options, the config file, or environment
    variables (in that order). Without a file argument, every .epub file in
    the default output directory is sent, split over several emails when
    there are more than --max-attachments files. Each file is deleted from
    disk once the email containing it has been sent.
    """
    parser = argparse.ArgumentParser(description="Send ebook(s) to Kindle via email")
    parser.add_argument(
        "file",
        nargs="?",
        help="ebook file, e.g. .epub/.pdf/.mobi. If omitted, sends all .epub files in ./out",
    )
    parser.add_argument(
        "--config",
        default=str(DEFAULT_CONFIG),
        help=f"config file, default: {DEFAULT_CONFIG}",
    )
    parser.add_argument("--kindle", help="your Kindle email, e.g. name@kindle.com")
    parser.add_argument("--smtp-host")
    parser.add_argument("--smtp-port", type=int)
    parser.add_argument("--smtp-user")
    parser.add_argument("--smtp-pass")
    parser.add_argument("--sender")
    parser.add_argument(
        "--max-attachments",
        type=int,
        default=16,
        help="maximum attachments per email, default: 16",
    )
    args = parser.parse_args()
    cfg = load_config(Path(args.config).expanduser())

    # (setting / argparse attribute, config key and environment variable, default)
    spec = (
        ("smtp_host", "SMTP_HOST", None),
        ("smtp_port", "SMTP_PORT", "465"),
        ("smtp_user", "SMTP_USER", None),
        ("smtp_pass", "SMTP_PASS", None),
        ("sender", "SMTP_SENDER", None),
        ("kindle", "KINDLE_EMAIL", None),
    )
    settings = {
        name: get_value(args, cfg, name, key, key, fallback)
        for name, key, fallback in spec
    }

    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise SystemExit(
            "Missing: " + ", ".join(missing) +
            f"\nAdd them to {args.config} or pass them as command-line options."
        )

    if args.file:
        files = [Path(args.file).expanduser()]
    else:
        files = find_epubs(DEFAULT_OUT)
    if not files:
        raise SystemExit(f"No EPUB files found in {DEFAULT_OUT}")

    batches = list(chunks(files, max(1, args.max_attachments)))
    total = len(batches)
    for idx, batch in enumerate(batches, 1):
        send_to_kindle(
            smtp_host=settings["smtp_host"],
            smtp_port=settings["smtp_port"],
            smtp_user=settings["smtp_user"],
            smtp_pass=settings["smtp_pass"],
            sender=settings["sender"],
            kindle_email=settings["kindle"],
            file_paths=batch,
        )
        suffix = f" ({idx}/{total})" if total > 1 else ""
        print(f"Sent email{suffix}: {len(batch)} attachment(s)")
        # Files are removed only after their email went out successfully.
        for file_path in batch:
            file_path.unlink()
            print(f" - sent and deleted: {file_path}")


if __name__ == "__main__":
    main()

5
wallabag.conf.sample Normal file
View file

@ -0,0 +1,5 @@
WALLABAG_URL=https://wallabag.example.com
CLIENT_ID=change-me
CLIENT_SECRET=change-me
USERNAME=change-me
PASSWORD=change-me

417
wallabag_to_epub.py Executable file
View file

@ -0,0 +1,417 @@
#!/usr/bin/env python3
"""Fetch Wallabag articles and create one EPUB per article."""
import argparse
import html
import json
import re
import time
import mimetypes
import urllib.parse
import urllib.request
import uuid
import zipfile
from datetime import datetime
from pathlib import Path
from xml.sax.saxutils import escape
# Current working directory; not referenced by the code visible in this file.
BASE_DIR = Path(".")
# Default KEY=VALUE file holding the Wallabag URL and credentials (--config).
DEFAULT_CONFIG = Path("./wallabag.conf")
# Default directory that receives the generated EPUB files (--output).
DEFAULT_OUT = Path("./out")
# Default JSON file recording which articles were already downloaded (--db).
DEFAULT_DB = Path("./wallabag_downloaded.json")
def load_config(path: Path) -> dict:
    """Load a simple KEY=VALUE config file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Keys and values are stripped of surrounding whitespace, and a
    value wrapped in one matching pair of single or double quotes has that
    pair removed.

    Args:
        path: Location of the config file.

    Returns:
        Mapping of keys to string values; empty when *path* is not a file.
    """
    cfg = {}
    if not path.is_file():
        return cfg
    # Decode explicitly so non-ASCII credentials do not depend on the locale;
    # "utf-8-sig" also tolerates a byte-order mark left by some editors.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        k, v = line.split("=", 1)
        v = v.strip()
        # Remove only a matching pair of surrounding quotes, so quote
        # characters that are part of the value itself are preserved.
        if len(v) >= 2 and v[0] == v[-1] and v[0] in "\"'":
            v = v[1:-1]
        cfg[k.strip()] = v
    return cfg
def http_json(url, method="GET", data=None, token=None):
    """Perform an HTTP request and return the decoded JSON response.

    *data*, when given, is sent form-encoded as the request body. *token*,
    when given, is sent as a Bearer authorization header. The request times
    out after 60 seconds.
    """
    headers = {"Accept": "application/json"}
    payload = None
    if data is not None:
        payload = urllib.parse.urlencode(data).encode()
        headers["Content-Type"] = "application/x-www-form-urlencoded"
    if token:
        headers["Authorization"] = f"Bearer {token}"

    request = urllib.request.Request(url, data=payload, headers=headers, method=method)
    with urllib.request.urlopen(request, timeout=60) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))
def wallabag_token(cfg):
    """Request an OAuth access token using the password grant.

    Reads WALLABAG_URL, CLIENT_ID, CLIENT_SECRET, USERNAME and PASSWORD from
    *cfg* and returns the "access_token" field of the server's response.
    """
    token_url = cfg["WALLABAG_URL"].rstrip("/") + "/oauth/v2/token"
    form = {"grant_type": "password"}
    credential_fields = (
        ("client_id", "CLIENT_ID"),
        ("client_secret", "CLIENT_SECRET"),
        ("username", "USERNAME"),
        ("password", "PASSWORD"),
    )
    for field, cfg_key in credential_fields:
        form[field] = cfg[cfg_key]
    response = http_json(token_url, "POST", form)
    return response["access_token"]
def fetch_entries(cfg, token, limit=10, unread=True, starred=False, archive=False):
    """Fetch the first page of entries, newest first, with full detail.

    *limit* is sent as the page size. *archive* restricts the query to
    archived entries and takes precedence over *unread*, which restricts it
    to non-archived ones. *starred* restricts it to starred entries.
    Returns the list found under "_embedded" -> "items", or an empty list.
    """
    endpoint = cfg["WALLABAG_URL"].rstrip("/") + "/api/entries.json"
    params = {
        "perPage": str(limit),
        "page": "1",
        "sort": "created",
        "order": "desc",
        "detail": "full",
    }
    if archive:
        params["archive"] = "1"
    elif unread:
        params["archive"] = "0"
    if starred:
        params["starred"] = "1"

    payload = http_json(endpoint + "?" + urllib.parse.urlencode(params), token=token)
    embedded = payload.get("_embedded", {})
    return embedded.get("items", [])
def mark_archived(cfg, token, entry_id):
    """Send a PATCH that sets archive=1 on the entry and return the JSON reply."""
    root = cfg["WALLABAG_URL"].rstrip("/")
    entry_url = root + f"/api/entries/{entry_id}.json"
    changes = {"archive": "1"}
    return http_json(entry_url, "PATCH", changes, token=token)
def clean_fragment(content):
    """Prepare Wallabag HTML for EPUB while preserving formatting where possible.

    Wallabag already extracts readable article HTML, so keep headings, lists,
    blockquotes, tables, links, inline styles/classes, and images. Remove only
    active/interactive content and make the result XML-compatible.

    Args:
        content: HTML fragment; may be empty or None.

    Returns:
        The cleaned fragment, or ``"<p></p>"`` when *content* is empty.
    """
    if not content:
        return "<p></p>"

    # Local import: the HTML5 entity table is only needed here.
    from html.entities import html5

    def expand_named_entity(match):
        """Turn an HTML named entity into text that is safe inside XML."""
        replacement = html5.get(match.group(1) + ";")
        if replacement is None:
            # Unknown name: leave it; its ampersand is escaped further down.
            return match.group(0)
        # Re-escape so &lt;, &amp;, &quot; etc. stay escaped instead of
        # turning article text into live markup or breaking attributes.
        return html.escape(replacement, quote=True)

    # Drop active/interactive elements together with their content.
    for element in ("script", "iframe", "form"):
        content = re.sub(rf"<{element}\b[^>]*>.*?</{element}>", "", content, flags=re.I | re.S)
    # Drop inline event handlers such as onclick="...".
    content = re.sub(r"\s(on\w+)=([\"']).*?\2", "", content, flags=re.I | re.S)
    # Many sites lazy-load images and Wallabag can keep the real URL in data-*
    # while src is empty/a placeholder. Promote those before dropping extras.
    content = promote_lazy_image_srcs(content)
    content = re.sub(r"\s(srcset|sizes)=([\"']).*?\2", "", content, flags=re.I | re.S)
    # XML only predefines amp/lt/gt/quot/apos: expand every other named
    # entity (e.g. &nbsp;) to its character. Numeric references are valid
    # XML already and are left untouched.
    content = re.sub(r"&([A-Za-z][A-Za-z0-9]*);", expand_named_entity, content)
    # Escape any ampersand that does not start a valid XML reference.
    content = re.sub(r"&(?!amp;|lt;|gt;|quot;|apos;|#\d+;|#x[0-9A-Fa-f]+;)", "&amp;", content)
    # Self-close common void tags.
    content = re.sub(r"<(br|hr|img|meta|link|input)(\b[^>]*?)(?<!/)>", r"<\1\2 />", content, flags=re.I)
    return content


def promote_lazy_image_srcs(content):
    """Use lazy-loader attributes as img src when src is missing/a placeholder.

    For every ``<img>`` tag, the first non-empty attribute among data-src,
    data-original, data-lazy-src, data-url and data-full-url becomes the
    ``src`` when the existing ``src`` is absent, empty, a ``data:`` URI,
    contains "placeholder", or is just ``#`` or ``/``. Other tags and other
    text are returned unchanged.
    """
    lazy_attribute_names = ("data-src", "data-original", "data-lazy-src", "data-url", "data-full-url")

    def repl(match):
        tag = match.group(0)
        attrs = {
            m.group(1).lower(): m.group(3)
            for m in re.finditer(r"\s([\w:-]+)=([\"'])(.*?)\2", tag, flags=re.S)
        }
        src = (attrs.get("src") or "").strip()
        lazy = next(
            (attrs[name].strip() for name in lazy_attribute_names if attrs.get(name)),
            None,
        )
        if not lazy:
            return tag
        is_placeholder = (
            not src
            or src.startswith("data:")
            or "placeholder" in src.lower()
            or src in ("#", "/")
        )
        if not is_placeholder:
            return tag

        # The value may come from a single-quoted attribute; keep it valid
        # inside the double quotes used below.
        new_attr = ' src="' + lazy.replace('"', "&quot;") + '"'
        if re.search(r"\ssrc\s*=", tag, flags=re.I):
            # A callable replacement keeps backslashes in the URL literal
            # instead of treating them as regex template escapes.
            return re.sub(r"\ssrc=([\"']).*?\1", lambda _m: new_attr, tag, count=1, flags=re.I | re.S)
        if tag.endswith("/>"):
            # Insert before the self-closing slash rather than after it.
            return tag[:-2].rstrip() + new_attr + " />"
        return tag[:-1] + new_attr + ">"

    return re.sub(r"<img\b[^>]*>", repl, content, flags=re.I | re.S)
def fetch_image(url, referer=None):
    """Download an image and return ``(data, (media_type, extension))``.

    The request uses a browser-like User-Agent, an optional Referer header
    and a 30 second timeout. Raises ValueError when the body exceeds 20 MiB.
    """
    max_bytes = 20 * 1024 * 1024
    request_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36",
        # Prefer Kindle-friendly formats. Some CDNs serve AVIF/WebP when asked,
        # which Amazon's EPUB conversion may drop.
        "Accept": "image/jpeg,image/png,image/gif,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
    }
    if referer:
        request_headers["Referer"] = referer

    request = urllib.request.Request(url, headers=request_headers)
    with urllib.request.urlopen(request, timeout=30) as response:
        # Read one byte past the limit so an oversized body can be detected.
        payload = response.read(max_bytes + 1)
        if len(payload) > max_bytes:
            raise ValueError("image is larger than 20 MiB")
        header_value = response.headers.get("Content-Type") or ""

    # Keep only the media type, without parameters such as charset.
    ctype = header_value.split(";", 1)[0].strip().lower()
    return payload, guess_image_type(url, payload, ctype)
def guess_image_type(url, data, ctype):
    """Return ``(media_type, file_extension)`` for downloaded image data.

    An ``image/*`` *ctype* is used as is. Otherwise the type is taken from
    the data's leading bytes (JPEG, PNG, GIF, WebP, SVG), then from the URL
    path, and finally defaults to application/octet-stream. The extension
    falls back to ".img" when none is known for the media type.
    """
    prefix_signatures = (
        ((b"\xff\xd8\xff",), "image/jpeg"),
        ((b"\x89PNG\r\n\x1a\n",), "image/png"),
        ((b"GIF87a", b"GIF89a"), "image/gif"),
    )

    media_type = None
    if ctype.startswith("image/"):
        media_type = ctype
    else:
        for prefixes, candidate in prefix_signatures:
            if data.startswith(prefixes):
                media_type = candidate
                break

    if media_type is None:
        if data.startswith(b"RIFF") and data[8:12] == b"WEBP":
            media_type = "image/webp"
        elif b"<svg" in data[:500].lower():
            media_type = "image/svg+xml"
        else:
            url_path = urllib.parse.urlparse(url).path
            media_type = mimetypes.guess_type(url_path)[0] or "application/octet-stream"

    ext = mimetypes.guess_extension(media_type) or ""
    # Normalize less common spellings to the usual extension.
    ext = {".jpe": ".jpg", ".svgz": ".svg"}.get(ext, ext)
    return media_type, ext or ".img"
def sanitize_img_tag(tag):
    """Rebuild an <img> tag keeping only Kindle/EPUB-friendly attributes.

    Only quoted src, alt, title, class, width and height attributes survive,
    in their original order. Values are unescaped and re-escaped, and
    alt="image" is appended when the tag has no alt attribute.
    """
    allowed = {"src", "alt", "title", "class", "width", "height"}
    kept = [
        (found.group(1).lower(), html.unescape(found.group(3)))
        for found in re.finditer(r"\s([\w:-]+)=([\"'])(.*?)\2", tag, flags=re.S)
        if found.group(1).lower() in allowed
    ]
    if all(name != "alt" for name, _ in kept):
        kept.append(("alt", "image"))

    rendered = [f' {name}="{html.escape(value, quote=True)}"' for name, value in kept]
    return "<img" + "".join(rendered) + " />"
def embed_images(content, base_url):
    """Download <img> sources and point the tags at local EPUB paths.

    Returns ``(content, images)`` where *images* is a list of dicts with the
    keys "id", "href", "media_type" and "data", one per distinct image URL.
    Tags with an empty, ``data:`` or ``cid:`` src, a non-HTTP(S) URL, or a
    failed download are left untouched; a failed download prints a warning.
    """
    images = []
    href_by_url = {}

    def rewrite(match):
        original_tag = match.group(0)
        src = html.unescape(match.group(2)).strip()
        if not src or src.startswith(("data:", "cid:")):
            return original_tag

        abs_url = urllib.parse.urljoin(base_url, src)
        if urllib.parse.urlparse(abs_url).scheme not in ("http", "https"):
            return original_tag

        # Each distinct URL is downloaded once and reused afterwards.
        if abs_url not in href_by_url:
            try:
                data, (media_type, ext) = fetch_image(abs_url, referer=base_url)
            except Exception as e:
                print(f"Warning: could not download image {abs_url}: {e}")
                return original_tag
            item_id = f"img{len(images) + 1}"
            href = f"images/{item_id}{ext}"
            href_by_url[abs_url] = href
            images.append({"id": item_id, "href": href, "media_type": media_type, "data": data})

        new_src = href_by_url[abs_url]
        local_tag = re.sub(
            r"\ssrc=([\"']).*?\1",
            f' src="{new_src}"',
            original_tag,
            count=1,
            flags=re.I | re.S,
        )
        return sanitize_img_tag(local_tag)

    rewritten = re.sub(r"<img\b[^>]*\ssrc=([\"'])(.*?)\1[^>]*>", rewrite, content, flags=re.I | re.S)
    return rewritten, images
def safe_name(s):
    """Turn *s* into a filename-safe slug of at most 80 characters.

    Every run of characters other than ASCII letters, digits, ``_``, ``.``
    and ``-`` becomes a single underscore, and leading/trailing underscores
    are removed. Returns "article" when nothing is left.
    """
    slug = re.sub(r"[^A-Za-z0-9_.-]+", "_", s).strip("_")
    truncated = slug[:80]
    return truncated if truncated else "article"
def display_title(s):
    """Return a human-friendly article title for EPUB metadata/display.

    HTML entities are decoded, underscores become spaces (some older or
    generated titles contain them) and whitespace is collapsed. Filenames
    are sanitized separately with safe_name(). Falls back to
    "Wallabag article" when *s* is empty or nothing readable remains.
    """
    fallback = "Wallabag article"
    text = html.unescape(s if s else fallback)
    text = re.sub(r"\s+", " ", text.replace("_", " ")).strip()
    return text if text else fallback
def metadata_title(s):
    """Return the title for EPUB metadata: the display title without ".epub"."""
    shown = display_title(s)
    without_extension = shown.replace(".epub", "")
    return without_extension.strip()
def build_epub(entry, out_path: Path, title: str | None = None):
    """Build a single-article EPUB.

    Args:
        entry: Wallabag entry dict. The keys read are "title", "url",
            "domain_name", "published_at", "created_at", "content" and
            "preview_picture"; all are optional.
        out_path: Destination file; missing parent directories are created.
        title: Optional title that overrides the entry's own title.

    Returns:
        *out_path*.
    """
    # Local import: only this function needs the UTC timezone object.
    from datetime import timezone

    def attr(value):
        """Escape *value* for use inside a double-quoted XML attribute."""
        # saxutils.escape() leaves double quotes alone by default, which
        # would end the attribute early for titles or URLs containing one.
        return escape(value, {'"': "&quot;"})

    out_path.parent.mkdir(parents=True, exist_ok=True)
    book_id = f"urn:uuid:{uuid.uuid4()}"
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    etitle = metadata_title(title or entry.get("title") or "Wallabag article")
    url = entry.get("url") or ""
    domain = entry.get("domain_name") or urllib.parse.urlparse(url).netloc or "Wallabag"
    published = entry.get("published_at") or entry.get("created_at") or ""
    raw_content = entry.get("content") or "<p></p>"
    preview = entry.get("preview_picture") or ""
    # Put the preview picture first unless the article already references it.
    if preview and preview not in raw_content:
        raw_content = f'<figure><img src="{html.escape(preview, quote=True)}" alt="{html.escape(etitle, quote=True)}" /></figure>\n' + raw_content
    content = clean_fragment(raw_content)
    content, images = embed_images(content, url)
    chapter = f'''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head>
<title>{escape(etitle)}</title>
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>
<article>
<header>
<h1>{escape(etitle)}</h1>
<p class="source">{escape(domain)}{(' · ' + escape(published[:10])) if published else ''}</p>
<p class="source"><a href="{attr(url)}">{escape(url)}</a></p>
</header>
<section class="content">
{content}
</section>
</article>
</body></html>'''
    container = '''<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles><rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/></rootfiles>
</container>'''
    image_manifest = "".join(
        f' <item id="{img["id"]}" href="{attr(img["href"])}" media-type="{attr(img["media_type"])}"/>\n'
        for img in images
    )
    opf = f'''<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://www.idpf.org/2007/opf" xmlns:dc="http://purl.org/dc/elements/1.1/" unique-identifier="BookId" version="3.0">
<metadata>
<dc:identifier id="BookId">{book_id}</dc:identifier>
<dc:title id="title">{escape(etitle)}</dc:title>
<meta refines="#title" property="title-type">main</meta>
<meta name="calibre:title_sort" content="{attr(etitle)}"/>
<dc:language>en</dc:language>
<dc:creator id="creator">{escape(domain)}</dc:creator>
<meta refines="#creator" property="role" scheme="marc:relators">aut</meta>
<dc:publisher>Wallabag</dc:publisher>
<dc:source>{escape(url)}</dc:source>
<meta property="dcterms:modified">{now}</meta>
</metadata>
<manifest>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
<item id="css" href="style.css" media-type="text/css"/>
<item id="article" href="article.xhtml" media-type="application/xhtml+xml"/>
{image_manifest} </manifest>
<spine><itemref idref="article"/></spine>
</package>'''
    nav = f'''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head><title>{escape(etitle)}</title></head>
<body><nav epub:type="toc"><h1>{escape(etitle)}</h1><ol><li><a href="article.xhtml">Article</a></li></ol></nav></body></html>'''
    css = """body{font-family:serif;line-height:1.45;margin:0;padding:1em;} article{max-width:42em;} h1{line-height:1.15;} img,video{max-width:100%;height:auto;} figure{margin:1em 0;} figcaption,.source{font-size:.85em;color:#666;} blockquote{border-left:3px solid #aaa;margin-left:.5em;padding-left:1em;color:#333;} pre,code{font-family:monospace;white-space:pre-wrap;} table{border-collapse:collapse;max-width:100%;} td,th{border:1px solid #ccc;padding:.25em;}"""
    # Deflate everything except "mimetype", which is written first and stored
    # uncompressed.
    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr("mimetype", "application/epub+zip", compress_type=zipfile.ZIP_STORED)
        z.writestr("META-INF/container.xml", container)
        z.writestr("OEBPS/content.opf", opf)
        z.writestr("OEBPS/nav.xhtml", nav)
        z.writestr("OEBPS/style.css", css)
        z.writestr("OEBPS/article.xhtml", chapter)
        for img in images:
            z.writestr("OEBPS/" + img["href"], img["data"])
    print(f"Embedded {len(images)} image(s) in {out_path}")
    return out_path
def article_key(entry) -> str:
    """Return the key identifying *entry* in the download database.

    Uses the entry id when present, otherwise its URL, otherwise its
    display title; the key is prefixed with "id:", "url:" or "title:".
    """
    entry_id = entry.get("id")
    if entry_id is not None:
        return f"id:{entry_id}"

    url = entry.get("url")
    if url:
        return "url:" + url

    return "title:" + display_title(entry.get("title") or "")
def load_downloaded(path: Path) -> dict:
    """Read the download database from *path*.

    Returns an empty dict when the file is missing, cannot be read or
    parsed, or does not contain a JSON object.
    """
    if not path.is_file():
        return {}
    try:
        parsed = json.loads(path.read_text())
    except Exception:
        # Best effort: an unreadable database is treated as empty.
        return {}
    return parsed if isinstance(parsed, dict) else {}
def save_downloaded(path: Path, data: dict):
    """Write *data* to *path* as indented, key-sorted JSON.

    The parent directory is created when missing. The text goes to a
    sibling file with an extra ".tmp" suffix first, which then replaces
    *path*.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)
    staging = path.with_suffix(path.suffix + ".tmp")
    staging.write_text(serialized + "\n")
    staging.replace(path)
def remember_downloaded(db: dict, entry, out: Path):
    """Record in *db* that *entry* was exported to *out*.

    The record is stored under the entry's article key and holds the entry
    id, display title, URL, EPUB path and the local time of the download.
    *db* is modified in place.
    """
    key = article_key(entry)
    record = {
        "id": entry.get("id"),
        "title": display_title(entry.get("title") or ""),
        "url": entry.get("url"),
        "epub": str(out),
        "downloaded_at": datetime.now().isoformat(timespec="seconds"),
    }
    db[key] = record
def article_output_path(entry, out_dir: Path) -> Path:
    """Return the EPUB path for *entry* inside *out_dir*.

    The filename is the sanitized title followed by the entry id. The
    current Unix time stands in for a missing id, and "wallabag-<id>" for a
    missing title.
    """
    title = entry.get("title")
    if not title:
        title = f"wallabag-{entry.get('id', int(time.time()))}"

    entry_id = entry.get("id")
    suffix = entry_id if entry_id else int(time.time())

    filename = f"{safe_name(title)}-{suffix}.epub"
    return out_dir / filename
def main():
    """Command-line entry point: export Wallabag articles as EPUB files.

    Loads the credentials, fetches up to --limit entries, skips those already
    recorded in the download database (unless --redownload is given), writes
    one EPUB per remaining entry and records each of them in the database.
    With --archive, the exported entries are marked archived afterwards.
    """
    parser = argparse.ArgumentParser(description="Fetch Wallabag articles and build one EPUB per article")
    parser.add_argument("--config", default=str(DEFAULT_CONFIG))
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--all", action="store_true", help="include archived/read articles too")
    parser.add_argument("--starred", action="store_true", help="only starred articles")
    parser.add_argument("--title", default=None, help="title override only when exporting one article")
    parser.add_argument("--output", default=None, help="output directory, or .epub file if --limit 1")
    parser.add_argument("--archive", action="store_true", help="mark fetched articles archived after successful build")
    parser.add_argument("--db", default=str(DEFAULT_DB), help=f"download evidence DB, default: {DEFAULT_DB}")
    parser.add_argument("--redownload", action="store_true", help="ignore evidence DB and download articles again")
    args = parser.parse_args()

    cfg = load_config(Path(args.config).expanduser())
    required = ["WALLABAG_URL", "CLIENT_ID", "CLIENT_SECRET", "USERNAME", "PASSWORD"]
    missing = [name for name in required if not cfg.get(name)]
    if missing:
        raise SystemExit("Missing in wallabag.conf: " + ", ".join(missing))

    token = wallabag_token(cfg)
    entries = fetch_entries(cfg, token, limit=args.limit, unread=not args.all, starred=args.starred)
    if not entries:
        raise SystemExit("No articles found.")

    db_path = Path(args.db).expanduser()
    downloaded = load_downloaded(db_path)
    if not args.redownload:
        fresh = [entry for entry in entries if article_key(entry) not in downloaded]
        skipped = len(entries) - len(fresh)
        entries = fresh
        if skipped:
            print(f"Skipped {skipped} already downloaded article(s). Use --redownload to fetch again.")
    if not entries:
        raise SystemExit("No new articles to download.")

    if args.output:
        output_arg = Path(args.output).expanduser()
    else:
        output_arg = DEFAULT_OUT
    # An --output ending in .epub names the file itself, but only when
    # exactly one article is being exported.
    single_file = len(entries) == 1 and output_arg.suffix.lower() == ".epub"
    out_dir = output_arg.parent if single_file else output_arg
    out_dir.mkdir(parents=True, exist_ok=True)

    created = []
    for i, entry in enumerate(entries, 1):
        if single_file:
            out = output_arg
        else:
            out = article_output_path(entry, out_dir)
        if len(entries) == 1 and args.title:
            title = args.title
        else:
            title = entry.get("title") or f"Wallabag article {i}"
        build_epub(entry, out, title)
        # Persist after every article so an interrupted run keeps its progress.
        remember_downloaded(downloaded, entry, out)
        save_downloaded(db_path, downloaded)
        created.append((entry, out))
        print(f"Created: {out}")

    if args.archive:
        for entry, _ in created:
            if entry.get("id") is not None:
                mark_archived(cfg, token, entry["id"])
        print(f"Archived {len(created)} articles in Wallabag.")


if __name__ == "__main__":
    main()