recent updates

2026-06-09 18:31:59 +02:00
parent 398b94965c
commit 94f7a1f72a
42 changed files with 8686 additions and 0 deletions
@@ -0,0 +1,271 @@
+#!/usr/bin/env python3
+"""Extract content from Divi 5 block markup in pages.json.
+
+Reads .planning/data/pages.json (produced by analyze_db.py) and for each page
+parses the `content_raw` Divi 5 block structure into a clean per-page JSON
+under .planning/data/content/{slug}.json.
+
+Usage:
+    python3 extract_divi5.py <pages_json> <output_dir>
+
+    pages_json  : path to .planning/data/pages.json
+    output_dir  : directory to write {slug}.json files (created if missing)
+"""
+from __future__ import annotations
+
+import json
+import re
+import sys
+from pathlib import Path
+from html.parser import HTMLParser
+
+
+# ---------------------------------------------------------------------------
+# HTML inner-text extractor
+# ---------------------------------------------------------------------------
+
+class _TextExtractor(HTMLParser):
+    """Collect the character data of an HTML fragment, discarding all tags.
+
+    Feed markup with ``feed()`` and read the result with ``get_text()``.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Text chunks in document order, one per handle_data() callback.
+        self.parts: list[str] = []
+
+    def handle_data(self, data: str):
+        """Record one run of text found between tags."""
+        self.parts.append(data)
+
+    def get_text(self) -> str:
+        """Return every collected chunk joined by single spaces, stripped.
+
+        The parser is closed first.  HTMLParser may hold back trailing text
+        (for example input ending in "R&D", where the "&" could be the start
+        of a character reference split across feeds) until it is told that
+        no more input is coming; without this flush that text is silently
+        dropped from the result.
+        """
+        self.close()
+        return " ".join(self.parts).strip()
+
+
+def _text(html: str) -> str:
+    """Return the plain text of *html* as gathered by ``_TextExtractor``."""
+    extractor = _TextExtractor()
+    extractor.feed(html)
+    return extractor.get_text()
+
+
+# ---------------------------------------------------------------------------
+# Divi block parsing
+# ---------------------------------------------------------------------------
+
+# Matches opening block comment: <!-- wp:divi/MODULE {JSON} -->
+#   group(1) = block name ("divi/<module>")
+#   group(2) = everything between the name and the terminator, matched lazily.
+# The terminator `--?>` accepts both "-->" and "->".  Because group(2) stops at
+# the FIRST terminator, a self-closing comment ("... /-->") leaves its trailing
+# "/" inside group(2), and a "->" inside the attrs would end the match early.
+_BLOCK_OPEN  = re.compile(r"<!--\s*wp:(divi/[a-z0-9_-]+)\s*(.*?)--?>", re.DOTALL)
+# Matches closing block comment: <!-- /wp:divi/MODULE -->
+#   group(1) = block name.
+_BLOCK_CLOSE = re.compile(r"<!--\s*/wp:(divi/[a-z0-9_-]+)\s*-->")
+
+# Strip et_pb_* class tokens and data-et-* attributes
+# _ET_CLASS matches the tokens `et_pb_*`, `divi-*-*` and `d5_*` at word
+# boundaries anywhere in the string it is applied to (it is not anchored to a
+# class="..." attribute), case-insensitively.
+_ET_CLASS = re.compile(r"\b(et_pb_[a-z0-9_-]+|divi-[a-z0-9_-]+-[a-z0-9_-]+|d5_[a-z0-9_-]+)\b", re.IGNORECASE)
+# _ET_ATTR matches double-quoted attributes named data-<prefix>-<suffix>, where
+# <prefix> is one of et / builder / module-id / module-class / d5.  A suffix is
+# required, so e.g. a bare `data-module-id="..."` is not matched.
+_ET_ATTR  = re.compile(r'\s+data-(?:et|builder|module-id|module-class|d5)-[a-z0-9_-]+\s*=\s*"[^"]*"', re.IGNORECASE)
+# _EMPTY_CL matches a double-quoted class attribute containing only whitespace.
+_EMPTY_CL = re.compile(r'\s+class="\s*"')
+
+
+def _clean(html: str) -> str:
+    """Return *html* with Divi block comments and builder markup removed.
+
+    The substitutions run in a fixed order: opening block comments, closing
+    block comments, builder data attributes, builder class tokens, then any
+    ``class`` attribute left empty by the previous step.  Finally, runs of
+    three or more newlines are collapsed to two and the result is stripped.
+    """
+    stripped = html
+    for noise in (_BLOCK_OPEN, _BLOCK_CLOSE, _ET_ATTR, _ET_CLASS, _EMPTY_CL):
+        stripped = noise.sub("", stripped)
+    return re.sub(r"\n{3,}", "\n\n", stripped).strip()
+
+
+def _parse_attrs(raw_json: str) -> dict:
+    """Parse the JSON attrs blob from a block comment (may be empty).
+
+    Args:
+        raw_json: Text captured between the block name and the end of the
+            comment.  May be empty, and may carry the trailing "/" of a
+            self-closing block comment (``<!-- wp:divi/x {...} /-->``).
+
+    Returns:
+        The decoded JSON object, or ``{}`` when the blob is empty, is not
+        valid JSON, or decodes to something other than an object.
+    """
+    raw_json = raw_json.strip()
+    # The block-comment patterns in this module capture lazily up to "-->",
+    # so for a self-closing comment the "/" ends up at the end of the blob.
+    # Left in place it makes json.loads() fail and every attribute is lost.
+    if raw_json.endswith("/"):
+        raw_json = raw_json[:-1].rstrip()
+    if not raw_json:
+        return {}
+    try:
+        parsed = json.loads(raw_json)
+    except (ValueError, RecursionError):
+        # Best effort: a malformed blob yields "no attributes", not a crash.
+        return {}
+    # Callers use .get() on the result, so only a JSON object is acceptable.
+    return parsed if isinstance(parsed, dict) else {}
+
+
+def _extract_inner(content: str, block_type: str) -> str:
+    """Return the raw inner HTML of the first matching block.
+
+    Args:
+        content: Markup to search.
+        block_type: Full block name, e.g. ``"divi/text"``.
+
+    Returns:
+        The text between the first opening comment for *block_type* and the
+        next closing comment of the same type.  If there is no closing
+        comment the rest of *content* is returned.  Returns ``""`` when the
+        block is absent or when the opening comment is self-closing
+        (``/-->``), since such a block has no inner HTML.
+    """
+    name = re.escape(block_type)
+    # The negative lookahead stops "divi/row" from matching the opening
+    # comment of a longer name such as "divi/row-inner".
+    open_pat  = re.compile(rf"<!--\s*wp:{name}(?![a-z0-9_-])[^>]*-->", re.DOTALL)
+    close_pat = re.compile(rf"<!--\s*/wp:{name}\s*-->")
+    m = open_pat.search(content)
+    if m is None:
+        return ""
+    if m.group(0).endswith("/-->"):
+        # Self-closing: whatever follows belongs to other blocks.
+        return ""
+    start = m.end()
+    m2 = close_pat.search(content, start)
+    end = m2.start() if m2 else len(content)
+    return content[start:end]
+
+
+def _bg_color(attrs: dict) -> str:
+    """Extract background colour from Divi 5 attrs dict.
+
+    Reads ``attrs["backgroundColor"]``.  When that entry is a dict, its
+    ``"value"`` is used, falling back to ``"color"`` if ``"value"`` is
+    missing or empty.  Any other truthy entry is converted with ``str()``.
+
+    Returns:
+        Always a string; ``""`` when no colour is present.  Guaranteeing a
+        string matters because the result is lower-cased by the caller's
+        classification step, which would fail on ``None``.
+    """
+    bg = attrs.get("backgroundColor", {})
+    if isinstance(bg, dict):
+        # `or` rather than nested .get() defaults, so that a null/empty
+        # "value" does not hide a usable "color".
+        bg = bg.get("value") or bg.get("color") or ""
+    if isinstance(bg, str):
+        return bg
+    return str(bg) if bg else ""
+
+
+def _section_type(bg: str) -> str:
+    """Map a background colour to a section category.
+
+    The colour is compared case-insensitively with surrounding whitespace
+    ignored.  Known hex values map to "dark", "brand", "light" or "white"
+    (an empty colour counts as "white"); anything else is "custom".
+    """
+    palette = {
+        "dark":  ("#0f5f53", "#1a3a34", "#0d4d42"),
+        "brand": ("#1a8a7a", "#20a090"),
+        "light": ("#f5f5f5", "#fafafa", "#f0f0f0", "#efefef"),
+        "white": ("#ffffff", "#fff", ""),
+    }
+    wanted = bg.lower().strip()
+    for label, colours in palette.items():
+        if wanted in colours:
+            return label
+    return "custom"
+
+
+# ---------------------------------------------------------------------------
+# Section/module extraction
+# ---------------------------------------------------------------------------
+
+def _extract_modules(section_html: str) -> list[dict]:
+    """Walk block comments inside a section and extract module data.
+
+    Every opening block comment in *section_html* yields one entry, in
+    document order.  All opening comments are visited, so a container block
+    and the blocks nested inside it each produce their own entry.
+
+    For a block with a closing comment, the inner HTML runs up to the first
+    closing comment of the same type (or to the end of *section_html* when
+    none is found).  A self-closing block (``/-->``) has no inner HTML, so
+    nothing that follows it is attributed to it.
+
+    Returns:
+        A list of dicts.  Each has a ``"module"`` key (the block name
+        without the ``divi/`` prefix) plus fields that depend on the module
+        type; unrecognised types carry their cleaned ``"html"`` and parsed
+        ``"attrs"``.
+    """
+    modules: list[dict] = []
+    content = section_html
+
+    for m in _BLOCK_OPEN.finditer(content):
+        block_type   = m.group(1)  # e.g. "divi/text"
+        self_closing = m.group(0).endswith("/-->")
+
+        # In a self-closing comment the "/" is captured with the attrs text;
+        # drop it so the JSON stays parseable.
+        raw_attrs = m.group(2).strip()
+        if self_closing and raw_attrs.endswith("/"):
+            raw_attrs = raw_attrs[:-1]
+        attrs = _parse_attrs(raw_attrs)
+
+        if self_closing:
+            inner_html = ""
+        else:
+            # Find matching close tag
+            inner_start = m.end()
+            close_pat = re.compile(rf"<!--\s*/wp:{re.escape(block_type)}\s*-->")
+            close_m = close_pat.search(content, inner_start)
+            inner_end = close_m.start() if close_m else len(content)
+            inner_html = content[inner_start:inner_end]
+        clean_inner = _clean(inner_html)
+
+        module_type = block_type.split("/")[-1]  # "text", "button", "image", etc.
+
+        mod: dict = {"module": module_type}
+
+        if module_type == "text":
+            mod["html"]    = clean_inner
+            mod["text"]    = _text(clean_inner)
+
+        elif module_type in ("button", "cta"):
+            mod["text"]    = attrs.get("buttonText", _text(clean_inner))
+            mod["url"]     = attrs.get("buttonUrl",  attrs.get("url", "#"))
+
+        elif module_type == "image":
+            mod["src"]     = attrs.get("src", attrs.get("url", ""))
+            mod["alt"]     = attrs.get("altText", attrs.get("alt", ""))
+            mod["caption"] = attrs.get("caption", "")
+
+        elif module_type == "blurb":
+            mod["title"]   = attrs.get("title", "")
+            mod["icon"]    = attrs.get("iconName", "")
+            mod["html"]    = clean_inner
+            mod["text"]    = _text(clean_inner)
+
+        elif module_type == "testimonial":
+            mod["quote"]   = attrs.get("content", _text(clean_inner))
+            mod["author"]  = attrs.get("authorName", "")
+            mod["company"] = attrs.get("authorJobTitle", "")
+
+        elif module_type == "video":
+            mod["src"]     = attrs.get("src", "")
+            mod["poster"]  = attrs.get("poster", attrs.get("image", ""))
+
+        elif module_type in ("accordion", "toggle"):
+            # Question/answer pairs are read from <dt>/<dd> pairs in the markup.
+            items = re.findall(r"<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>", clean_inner, re.DOTALL)
+            mod["items"]   = [{"q": q.strip(), "a": a.strip()} for q, a in items]
+
+        elif module_type == "contact_form":
+            mod["form_id"] = attrs.get("formId", "")
+            mod["note"]    = "REPLACE with AM vanilla form — see 08-forms.md"
+
+        else:
+            # Unknown module: keep the raw material so nothing is lost.
+            mod["html"]    = clean_inner
+            mod["attrs"]   = attrs
+
+        modules.append(mod)
+
+    return modules
+
+
+def parse_page_content(content_raw: str) -> list[dict]:
+    """Split Divi 5 block markup into one dict per ``divi/section`` block.
+
+    A section's body runs from the end of its opening comment to the next
+    section-closing comment, or to the end of the input when none follows.
+    Each resulting dict holds the section's parsed attrs, its background
+    colour and derived section type, the extracted modules, and a ``role``:
+    "hero" when the first module is a ``fullwidth_header`` or ``text``
+    module whose html contains an ``<h1``, otherwise "content".
+    """
+    opener = re.compile(r"<!--\s*wp:divi/section(.*?)-->", re.DOTALL)
+    closer = re.compile(r"<!--\s*/wp:divi/section\s*-->")
+    total  = len(content_raw)
+
+    result: list[dict] = []
+    for opening in opener.finditer(content_raw):
+        section_attrs = _parse_attrs(opening.group(1).strip())
+
+        body_from = opening.end()
+        closing   = closer.search(content_raw, body_from)
+        body_to   = total if closing is None else closing.start()
+
+        colour = _bg_color(section_attrs)
+        kind   = _section_type(colour)
+        found  = _extract_modules(content_raw[body_from:body_to])
+
+        # Only the leading module decides whether this is the hero section.
+        lead = found[0] if found else None
+        is_hero = (
+            lead is not None
+            and lead["module"] in ("fullwidth_header", "text")
+            and "<h1" in lead.get("html", "")
+        )
+
+        result.append({
+            "role":             "hero" if is_hero else "content",
+            "section_type":     kind,
+            "background_color": colour,
+            "attrs":            section_attrs,
+            "modules":          found,
+        })
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    """Command-line entry point: write one content JSON file per page.
+
+    Expects two arguments in ``sys.argv``: the path to the pages JSON file
+    and the output directory (created if missing).  With fewer arguments a
+    usage line is printed to stderr and the process exits with status 1.
+
+    Each page must provide ``id``, ``title`` and ``post_type``; the
+    remaining fields are optional.  A page without a slug is written as
+    ``page-<id>.json``.  A missing, null or blank ``content_raw`` produces
+    an empty section list.
+    """
+    if len(sys.argv) < 3:
+        print(f"Usage: {sys.argv[0]} <pages_json> <output_dir>", file=sys.stderr)
+        sys.exit(1)
+
+    pages_path = Path(sys.argv[1])
+    out_dir    = Path(sys.argv[2])
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    pages = json.loads(pages_path.read_text(encoding="utf-8"))
+    print(f"Processing {len(pages)} pages...")
+
+    for page in pages:
+        slug    = page.get("slug") or f"page-{page['id']}"
+        # `or ""` also covers an explicit null in the JSON, which .get()'s
+        # default would let through as None.
+        content = page.get("content_raw") or ""
+
+        sections = parse_page_content(content) if content.strip() else []
+
+        output = {
+            "id":              page["id"],
+            "slug":            slug,
+            "title":           page["title"],
+            "post_type":       page["post_type"],
+            "seo_title":       page.get("seo_title", ""),
+            "seo_description": page.get("seo_description", ""),
+            "seo_keywords":    page.get("seo_keywords", ""),
+            "acf":             page.get("acf", {}),
+            "date":            page.get("date", ""),
+            "modified":        page.get("modified", ""),
+            "sections":        sections,
+            "section_count":   len(sections),
+        }
+
+        out_file = out_dir / f"{slug}.json"
+        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
+        # must be pinned; the platform default is not always UTF-8.
+        out_file.write_text(
+            json.dumps(output, indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
+        print(f"  {slug}.json ({len(sections)} sections)")
+
+    print(f"\nDone. {len(pages)} content files in {out_dir}")
+
+
+# Run the extraction only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()