lehnert.dev/scripts/convert-blog

#!/usr/bin/env python3

import re
import os
import gzip
import json
import shutil
import argparse
import tempfile
import subprocess
from datetime import date
from lxml import etree as ET

def open_blog(path: str) -> ET._Element:
    """Load a gzip-compressed SVG document and return its parsed root element.

    Args:
        path: Path to the gzip-compressed SVG file (e.g. ``.svgz``).

    Returns:
        The root element of the parsed document.
    """
    # Hand the parser raw bytes instead of decoded text: lxml refuses ``str``
    # input that carries an XML encoding declaration, and with bytes it
    # decodes according to the declaration (or the XML default) itself.
    with gzip.open(path, 'rb') as f:
        raw_svg = f.read()

    return ET.fromstring(raw_svg)

def remove_by_xpath(el: ET._Element, xpath: str):
    """Detach every element that *xpath* selects (evaluated on *el*) from its parent."""
    targets = el.xpath(xpath)
    for target in targets:
        # Each match is removed through its own parent, so matches at any
        # depth below ``el`` are handled.
        target.getparent().remove(target)

def tag_xpath(tag: str):
    """Return a relative XPath selecting descendants whose local name is *tag*.

    Matching on ``local-name()`` ignores the namespace of the element, so the
    expression works without a namespace prefix map.
    """
    template = ".//*[local-name()='{}']"
    return template.format(tag)

def class_xpath(class_: str):
    """Return a relative XPath selecting descendants that carry the class *class_*.

    The ``class`` attribute is whitespace-normalised and padded with a space
    on both sides; the needle is padded the same way so that only a complete
    class token matches (``'bookmark'`` matches ``class="a bookmark b"`` but
    not ``class="bookmarks"``).

    Args:
        class_: A single class name; it is interpolated verbatim, so it must
            not contain a single quote.
    """
    return (
        ".//*[contains(concat(' ', normalize-space(@class), ' '), "
        f"' {class_} ')]"
    )

def save_blog(dir: str, root: ET._Element):
    """Render a parsed blog SVG into WebP images plus a JSON definition in *dir*.

    The tree under *root* is modified in place: the first nested svg element
    is unwrapped, document chrome and rule lines are dropped, and bookmark and
    hyperref elements are extracted. The remaining drawing is rasterised with
    the ``magick`` command-line tool and split into ``main_NNN.webp`` slices;
    every hyperref is rendered into its own WebP file. All WebP files and a
    ``_definition.json`` describing them are written to *dir*.

    Args:
        dir: Output directory; created if it does not exist.
        root: Root element of the parsed SVG document.

    Raises:
        ValueError: If a bookmark's ``transform`` attribute does not match the
            expected ``translate(x, y)`` form.
        subprocess.CalledProcessError: If a ``magick`` invocation exits with a
            non-zero status.
    """
    os.makedirs(dir, exist_ok=True)

    # The first nested svg element is used as the page frame; its size becomes
    # the viewBox of the root element.
    frame: ET._Element = root.xpath(tag_xpath('svg'))[0]
    # Drop the trailing two characters of the size attributes before parsing
    # (presumably a 'px' unit suffix -- confirm against the input files).
    width = int(frame.attrib['width'][:-2])
    height = int(frame.attrib['height'][:-2])
    root.attrib['viewBox'] = f'0 0 {width} {height}'

    # Replace the frame by its first child, which is re-attached to the root.
    content: ET._Element = frame[0]
    root.remove(frame)
    root.append(content)

    # Strip elements that should not show up in the rendered output.
    remove_by_xpath(root, ".//*[@id='write-document']")
    remove_by_xpath(root, ".//*[@id='write-defs']")
    remove_by_xpath(root, ".//*[@id='write-doc-background']")
    remove_by_xpath(root, class_xpath('ruleline'))

    # Bookmarks are recorded as (id, vertical offset) and removed from the tree.
    bookmarks: list[tuple[str, float]] = []
    for node in root.xpath(class_xpath('bookmark')):
        transform = node.attrib['transform']
        match = re.search(r"translate\([^\d\-]*([-+]?\d*\.?\d+),\s*([-+]?\d*\.?\d+)\)", transform)
        if match is None:
            # Fail with a message that names the offending bookmark instead of
            # an AttributeError on ``None``.
            raise ValueError(
                f'bookmark {node.attrib.get("id")!r} has an unsupported transform: {transform!r}'
            )
        bookmarks.append((node.attrib['id'], float(match.group(2))))
        node.getparent().remove(node)

    # Hyperrefs are detached from the main drawing; each is rendered separately
    # below. The anchor (with its rect) only supplies the target URL and the
    # bounding box, so it is removed from the node before rendering.
    hyperrefs: list[tuple[ET._Element, str, tuple[float, float], tuple[float, float]]] = []
    for node in root.xpath(class_xpath('hyperref')):
        anchor = node.xpath(tag_xpath('a'))[0]
        rect = anchor.xpath(tag_xpath('rect'))[0]
        x, y = float(rect.attrib['x']), float(rect.attrib['y'])
        w, h = float(rect.attrib['width']), float(rect.attrib['height'])
        url = anchor.attrib['{http://www.w3.org/1999/xlink}href']
        hyperrefs.append((node, url, (x, y), (w, h)))
        anchor.getparent().remove(anchor)
        node.getparent().remove(node)

    tempdir = tempfile.mkdtemp()

    try:
        main_svg = os.path.join(tempdir, 'main.svg')
        main_png = os.path.join(tempdir, 'main.png')
        with open(main_svg, 'wb') as f:
            f.write(ET.tostring(root, xml_declaration=True, encoding="utf-8"))

        # check=True: a failed conversion must abort instead of silently
        # producing a definition that references no images.
        subprocess.run(
            ['magick', '-density', '300', '-background', 'none', main_svg, main_png],
            check=True,
        )
        # Slice the rendering into strips of at most 16383 px height
        # (presumably because of the WebP per-side size limit).
        subprocess.run(
            ['magick', main_png, '-crop', 'x16383', '+repage',
             os.path.join(tempdir, 'main_%03d.webp')],
            check=True,
        )

        # Only the slices of the main drawing exist as WebP at this point.
        main_files: list[str] = [
            file for file in os.listdir(tempdir) if file.endswith('webp')
        ]

        url_defs: list[tuple[str, str, tuple[float, float], tuple[float, float]]] = []
        for node, url, (x, y), (w, h) in hyperrefs:
            # Random 7-character hex name for the hyperref image.
            name = os.urandom(15).hex()[:7]

            # Wrap the detached node in a fresh svg element whose viewBox is
            # the bounding box taken from the hyperref's rect.
            svg = ET.Element('svg', attrib={'viewBox': f'{x} {y} {w} {h}'}, nsmap={None: 'http://www.w3.org/2000/svg'})
            svg.append(node)

            ref_svg = os.path.join(tempdir, f'{name}.svg')
            ref_png = os.path.join(tempdir, f'{name}.png')
            ref_webp = os.path.join(tempdir, f'{name}.webp')
            with open(ref_svg, 'wb') as f:
                f.write(ET.tostring(svg, xml_declaration=True, encoding="utf-8"))

            subprocess.run(
                ['magick', '-density', '300', '-background', 'none', ref_svg, ref_png],
                check=True,
            )
            subprocess.run(['magick', ref_png, ref_webp], check=True)

            url_defs.append((f'{name}.webp', url, (x, y), (w, h)))

        # Publish every WebP (main slices and hyperref images) to the output
        # directory; intermediate SVG/PNG files stay in the temp directory.
        for file in os.listdir(tempdir):
            if file.endswith('webp'):
                shutil.move(os.path.join(tempdir, file), os.path.join(dir, file))

        with open(os.path.join(dir, '_definition.json'), 'w', encoding='utf-8') as f:
            json.dump({
                'date': date.today().strftime("%Y-%m-%d"),
                # Fixed title/keywords; presumably edited by hand afterwards.
                'title': 'My Cool Blog Entry',
                'keywords': ['Cool', 'Blog'],
                'bookmarks': [{'id': b[0], 'offset': b[1]} for b in bookmarks],
                'urls': [
                    {'src': d[0], 'href': d[1], 'offset': d[2], 'dimensions': d[3]}
                    for d in url_defs
                ],
                'files': sorted(main_files),
                'dimensions': [width, height],
            }, f, indent=4)

    finally:
        # Best-effort cleanup of the intermediate files.
        shutil.rmtree(tempdir, ignore_errors=True)

def parse_args():
    """Parse the command line and return the resulting namespace.

    Both ``--svgz`` and ``--dir`` are mandatory string options.
    """
    cli = argparse.ArgumentParser()

    for flag in ('--svgz', '--dir'):
        cli.add_argument(flag, required=True, type=str)

    return cli.parse_args()

def main():
    """Script entry point: load the file given by --svgz and convert it into --dir."""
    options = parse_args()
    output_dir = options.dir
    document = open_blog(options.svgz)
    save_blog(output_dir, document)

# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()