arisingmedia-web-sops/wp-divi-pipeline-to-am-stack/scripts/extract_divi5.py

#!/usr/bin/env python3
"""Extract content from Divi 5 block markup in pages.json.

Reads .planning/data/pages.json (produced by analyze_db.py) and for each page
parses the `content_raw` Divi 5 block structure into a clean per-page JSON
under .planning/data/content/{slug}.json.

Usage:
    python3 extract_divi5.py <pages_json> <output_dir>

    pages_json  : path to .planning/data/pages.json
    output_dir  : directory to write {slug}.json files (created if missing)
"""
from __future__ import annotations

import json
import re
import sys
from pathlib import Path
from html.parser import HTMLParser


# ---------------------------------------------------------------------------
# HTML inner-text extractor
# ---------------------------------------------------------------------------

class _TextExtractor(HTMLParser):
    """Collect the character data of an HTML fragment, ignoring all markup."""

    def __init__(self):
        super().__init__()
        # Text chunks in the order the parser reports them.
        self.parts: list[str] = []

    def handle_data(self, data: str):
        """Record one run of text reported by the parser."""
        self.parts.append(data)

    def get_text(self) -> str:
        """Return the collected chunks joined by single spaces, outer whitespace trimmed."""
        return " ".join(self.parts).strip()


def _text(html: str) -> str:
    """Return the visible text of *html* with every tag removed.

    Chunks of text separated by tags are joined with a single space, so
    adjacent inline elements may produce doubled spaces; callers get the
    text trimmed at both ends only.
    """
    p = _TextExtractor()
    p.feed(html)
    # feed() buffers trailing text that might still be an incomplete character
    # reference (e.g. "Q&A" at the very end); close() signals end-of-input so
    # that text is flushed instead of being silently dropped.
    p.close()
    return p.get_text()


# ---------------------------------------------------------------------------
# Divi block parsing
# ---------------------------------------------------------------------------

# Matches opening block comment: <!-- wp:divi/MODULE {JSON} -->
#   group 1: block name including the namespace, e.g. "divi/text"
#   group 2: everything between the name and the terminator. Normally this is
#            the JSON attrs blob; for a self-closing "/-->" block the captured
#            text also ends with that "/".
# The terminator `--?>` accepts "-->" as well as "->".
_BLOCK_OPEN  = re.compile(r"<!--\s*wp:(divi/[a-z0-9_-]+)\s*(.*?)--?>", re.DOTALL)
# Matches closing block comment: <!-- /wp:divi/MODULE -->
#   group 1: block name including the namespace
_BLOCK_CLOSE = re.compile(r"<!--\s*/wp:(divi/[a-z0-9_-]+)\s*-->")

# Strip et_pb_* class tokens and data-et-* attributes
# _ET_CLASS: whole-word tokens of the form et_pb_*, divi-*-* or d5_*
#            (case-insensitive); it is applied to the full fragment, not only
#            to class attribute values.
# _ET_ATTR : double-quoted attributes whose name is data-<prefix>-<suffix>,
#            with prefix in et / builder / module-id / module-class / d5.
#            NOTE(review): a suffix is mandatory, so a bare data-module-id="..."
#            is not matched -- confirm whether that is intended.
# _EMPTY_CL: class attributes whose value is empty or whitespace only.
_ET_CLASS = re.compile(r"\b(et_pb_[a-z0-9_-]+|divi-[a-z0-9_-]+-[a-z0-9_-]+|d5_[a-z0-9_-]+)\b", re.IGNORECASE)
_ET_ATTR  = re.compile(r'\s+data-(?:et|builder|module-id|module-class|d5)-[a-z0-9_-]+\s*=\s*"[^"]*"', re.IGNORECASE)
_EMPTY_CL = re.compile(r'\s+class="\s*"')


def _clean(html: str) -> str:
    """Remove Divi builder noise from an HTML fragment.

    Block comments (opening and closing), builder data attributes, builder
    class tokens and the empty ``class`` attributes left behind are deleted in
    that order; runs of three or more newlines are then collapsed to two and
    the result is trimmed.
    """
    cleaned = html
    # Order matters: class tokens must be stripped before empty class
    # attributes can be detected.
    for noise in (_BLOCK_OPEN, _BLOCK_CLOSE, _ET_ATTR, _ET_CLASS, _EMPTY_CL):
        cleaned = noise.sub("", cleaned)
    return re.sub(r"\n{3,}", "\n\n", cleaned).strip()


def _parse_attrs(raw_json: str) -> dict:
    """Parse the JSON attrs blob taken from a block comment.

    Args:
        raw_json: Text found between the block name and the end of the
            comment. May be empty, padded with whitespace, or end with the
            "/" of a self-closing ``/-->`` block.

    Returns:
        The decoded attributes, or an empty dict when the blob is empty, is
        not valid JSON, or decodes to something other than a JSON object.
    """
    blob = raw_json.strip()
    # Self-closing blocks leave their "/" attached to the captured blob;
    # drop it so the JSON in front of it can still be decoded.
    if blob.endswith("/"):
        blob = blob[:-1].rstrip()
    if not blob:
        return {}
    try:
        parsed = json.loads(blob)
    except (ValueError, RecursionError):
        # Best effort: malformed attrs must not abort the whole extraction.
        return {}
    # Callers use dict methods on the result, so reject lists/scalars.
    return parsed if isinstance(parsed, dict) else {}


def _extract_inner(content: str, block_type: str) -> str:
    """Return the raw inner HTML of the first block of type *block_type*.

    Args:
        content: Serialized block markup to search.
        block_type: Block name without the ``wp:`` prefix, e.g. "divi/text".

    Returns:
        The text between the first matching opening comment and the next
        closing comment of the same type. If no closing comment follows, the
        rest of *content* is returned. An empty string is returned when the
        block is absent or is self-closing (``/-->``) and so has no body.
    """
    name = re.escape(block_type)
    # The look-ahead pins the end of the block name so that "divi/text" does
    # not match the opener of a longer name such as "divi/textarea".
    open_pat = re.compile(r"<!--\s*wp:" + name + r"(?=[\s{/]|-->)[^>]*-->")
    close_pat = re.compile(r"<!--\s*/wp:" + name + r"\s*-->")

    opener = open_pat.search(content)
    if opener is None:
        return ""
    # A self-closing block has no body; whatever follows belongs to siblings.
    if opener.group(0)[:-3].rstrip().endswith("/"):
        return ""

    start = opener.end()
    closer = close_pat.search(content, start)
    end = closer.start() if closer else len(content)
    return content[start:end]


def _bg_color(attrs: dict) -> str:
    """Extract the background colour from a block's attrs dict.

    Reads ``attrs["backgroundColor"]``. When it is a dict, the first
    non-empty string found under ``"value"`` and then ``"color"`` is
    returned; any other value is stringified.

    Returns:
        The colour string, or "" when none is set. Always a ``str``, so the
        result can be lower-cased and compared without further checks.
    """
    bg = attrs.get("backgroundColor", {})
    if isinstance(bg, dict):
        for key in ("value", "color"):
            candidate = bg.get(key)
            # Skip null, empty or nested values instead of returning them.
            if isinstance(candidate, str) and candidate:
                return candidate
        return ""
    return str(bg) if bg else ""


def _section_type(bg: str) -> str:
    """Map a background colour to a section category.

    The colour is compared case-insensitively with surrounding whitespace
    ignored. Known colours yield "dark", "brand", "light" or "white" (an
    empty colour counts as white); anything else yields "custom".
    """
    categories: dict[str, str] = {}
    categories.update(dict.fromkeys(("#0f5f53", "#1a3a34", "#0d4d42"), "dark"))
    categories.update(dict.fromkeys(("#1a8a7a", "#20a090"), "brand"))
    categories.update(
        dict.fromkeys(("#f5f5f5", "#fafafa", "#f0f0f0", "#efefef"), "light")
    )
    categories.update(dict.fromkeys(("#ffffff", "#fff", ""), "white"))

    normalized = bg.lower().strip()
    return categories.get(normalized, "custom")


# ---------------------------------------------------------------------------
# Section/module extraction
# ---------------------------------------------------------------------------

def _extract_modules(section_html: str) -> list[dict]:
    """Walk the block comments inside a section and extract module data.

    Every ``wp:divi/*`` opening comment found in *section_html* yields one
    entry, in document order; wrapper blocks are reported like any other
    block. For a regular block the inner HTML runs from its opening comment
    to the next closing comment of the same type (or to the end of the
    section when none follows). A self-closing block (``/-->``) has no inner
    HTML and is described by its attrs alone.

    Args:
        section_html: Markup found between a section's opening and closing
            comments.

    Returns:
        A list of dicts. Each has a ``"module"`` key holding the block name
        without the ``divi/`` prefix, plus fields that depend on the module
        type. Unrecognised types carry their cleaned ``"html"`` and raw
        ``"attrs"``.
    """
    modules: list[dict] = []
    content = section_html

    for opener in _BLOCK_OPEN.finditer(content):
        block_type = opener.group(1)  # e.g. "divi/text"
        raw_attrs = opener.group(2).strip()

        # The "/" of a self-closing block is captured together with the attrs
        # blob; detect it and remove it before decoding the JSON.
        self_closing = raw_attrs.endswith("/")
        if self_closing:
            raw_attrs = raw_attrs[:-1]
        attrs = _parse_attrs(raw_attrs)
        if not isinstance(attrs, dict):
            # Everything below relies on dict lookups.
            attrs = {}

        if self_closing:
            # No closing comment exists; searching for one would swallow the
            # markup of the blocks that follow.
            inner_html = ""
        else:
            inner_start = opener.end()
            close_pat = re.compile(rf"<!--\s*/wp:{re.escape(block_type)}\s*-->")
            closer = close_pat.search(content, inner_start)
            inner_end = closer.start() if closer else len(content)
            inner_html = content[inner_start:inner_end]
        clean_inner = _clean(inner_html)

        module_type = block_type.split("/")[-1]  # "text", "button", "image", etc.

        mod: dict = {"module": module_type}

        if module_type == "text":
            mod["html"] = clean_inner
            mod["text"] = _text(clean_inner)

        elif module_type in ("button", "cta"):
            # Prefer the attrs; fall back to the rendered label / a stub link.
            mod["text"] = attrs.get("buttonText", _text(clean_inner))
            mod["url"] = attrs.get("buttonUrl", attrs.get("url", "#"))

        elif module_type == "image":
            mod["src"] = attrs.get("src", attrs.get("url", ""))
            mod["alt"] = attrs.get("altText", attrs.get("alt", ""))
            mod["caption"] = attrs.get("caption", "")

        elif module_type == "blurb":
            mod["title"] = attrs.get("title", "")
            mod["icon"] = attrs.get("iconName", "")
            mod["html"] = clean_inner
            mod["text"] = _text(clean_inner)

        elif module_type == "testimonial":
            mod["quote"] = attrs.get("content", _text(clean_inner))
            mod["author"] = attrs.get("authorName", "")
            mod["company"] = attrs.get("authorJobTitle", "")

        elif module_type == "video":
            mod["src"] = attrs.get("src", "")
            mod["poster"] = attrs.get("poster", attrs.get("image", ""))

        elif module_type in ("accordion", "toggle"):
            # Question/answer pairs are read from <dt>/<dd> markup.
            items = re.findall(
                r"<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>", clean_inner, re.DOTALL
            )
            mod["items"] = [{"q": q.strip(), "a": a.strip()} for q, a in items]

        elif module_type == "contact_form":
            mod["form_id"] = attrs.get("formId", "")
            mod["note"] = "REPLACE with AM vanilla form — see 08-forms.md"

        else:
            mod["html"] = clean_inner
            mod["attrs"] = attrs

        modules.append(mod)

    return modules


def parse_page_content(content_raw: str) -> list[dict]:
    """Parse Divi 5 block content into a list of section dicts.

    Each ``wp:divi/section`` opening comment starts a section that runs to
    the next section closing comment (or to the end of the content when none
    follows).

    Args:
        content_raw: Serialized block markup of one page.

    Returns:
        One dict per section, in document order, with the keys ``role``
        ("hero" or "content"), ``section_type``, ``background_color``,
        ``attrs`` and ``modules``.
    """
    sections: list[dict] = []

    section_pat   = re.compile(r"<!--\s*wp:divi/section(.*?)-->", re.DOTALL)
    section_close = re.compile(r"<!--\s*/wp:divi/section\s*-->")

    # Layout wrappers are listed before the modules they contain, so they are
    # skipped when looking for the section's leading module. Names are
    # compared with "-" normalised to "_".
    # NOTE(review): wrapper names assumed from Divi's row/column layout model.
    structural = {"row", "column", "row_inner", "column_inner"}
    hero_candidates = {"fullwidth_header", "text"}

    for sm in section_pat.finditer(content_raw):
        attrs = _parse_attrs(sm.group(1).strip())
        start = sm.end()
        close_m = section_close.search(content_raw, start)
        sec_html = content_raw[start : close_m.start() if close_m else len(content_raw)]

        bg        = _bg_color(attrs)
        sec_type  = _section_type(bg)
        modules   = _extract_modules(sec_html)

        # Determine semantic role from the first module that is not a layout
        # wrapper: a heading-level-1 in a text/header module marks a hero.
        role = "content"
        lead = next(
            (m for m in modules if m["module"].replace("-", "_") not in structural),
            None,
        )
        if lead is not None and lead["module"].replace("-", "_") in hero_candidates:
            if "<h1" in lead.get("html", ""):
                role = "hero"

        sections.append({
            "role":             role,
            "section_type":     sec_type,
            "background_color": bg,
            "attrs":            attrs,
            "modules":          modules,
        })

    return sections


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Command-line entry point: write one content JSON file per page.

    Expects two arguments, the path of pages.json and the output directory
    (created when missing). Prints a usage line and exits with status 1 when
    arguments are missing. Each page must provide ``id``, ``title`` and
    ``post_type``; the remaining fields are optional.
    """
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} <pages_json> <output_dir>")
        sys.exit(1)

    pages_path = Path(sys.argv[1])
    out_dir    = Path(sys.argv[2])
    out_dir.mkdir(parents=True, exist_ok=True)

    pages = json.loads(pages_path.read_text(encoding="utf-8"))
    print(f"Processing {len(pages)} pages...")

    for page in pages:
        # Pages without a slug get a stable name derived from their id.
        slug    = page.get("slug") or f"page-{page['id']}"
        # A null content_raw is treated like an empty page.
        content = page.get("content_raw") or ""

        sections = parse_page_content(content) if content.strip() else []

        output = {
            "id":              page["id"],
            "slug":            slug,
            "title":           page["title"],
            "post_type":       page["post_type"],
            "seo_title":       page.get("seo_title", ""),
            "seo_description": page.get("seo_description", ""),
            "seo_keywords":    page.get("seo_keywords", ""),
            "acf":             page.get("acf", {}),
            "date":            page.get("date", ""),
            "modified":        page.get("modified", ""),
            "sections":        sections,
            "section_count":   len(sections),
        }

        out_file = out_dir / f"{slug}.json"
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be written as UTF-8 explicitly rather than in the locale encoding.
        out_file.write_text(
            json.dumps(output, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(f"  {slug}.json ({len(sections)} sections)")

    print(f"\nDone. {len(pages)} content files in {out_dir}")

if __name__ == "__main__":
    main()