#!/usr/bin/env python3
"""Extract content from Divi 5 block markup in pages.json.

Reads .planning/data/pages.json (produced by analyze_db.py) and for each page
parses the `content_raw` Divi 5 block structure into a clean per-page JSON
under .planning/data/content/{slug}.json.

Usage:
    python3 extract_divi5.py <pages_json> <output_dir>

    pages_json : path to .planning/data/pages.json
    output_dir : directory to write {slug}.json files (created if missing)
"""
from __future__ import annotations

import json
import re
import sys
from pathlib import Path
from html.parser import HTMLParser


# ---------------------------------------------------------------------------
# HTML inner-text extractor
# ---------------------------------------------------------------------------

class _TextExtractor(HTMLParser):
    """Collect the character data of an HTML fragment, ignoring all tags."""

    def __init__(self):
        super().__init__()
        self.parts: list[str] = []

    def handle_data(self, data: str):
        self.parts.append(data)

    def get_text(self) -> str:
        """Return the collected text chunks joined by single spaces."""
        return " ".join(self.parts).strip()


def _text(html: str) -> str:
    """Return the visible text of *html* with every tag removed."""
    p = _TextExtractor()
    p.feed(html)
    return p.get_text()


# ---------------------------------------------------------------------------
# Divi block parsing
# ---------------------------------------------------------------------------

# NOTE(review): the literal text of every block-comment regex in this file was
# lost when angle-bracket spans were stripped from the source. The patterns
# below are reconstructed from how their groups are used and from the
# WordPress block-delimiter syntax -- confirm against the pristine file.

# Matches opening block comment: <!-- wp:divi/text {"key": "value"} -->
#   group(1) = block type, e.g. "divi/text"
#   group(2) = raw attrs JSON ("" when the block has no attrs)
#   group(3) = "/" when the block is self-closing ("/-->"), else None
_BLOCK_OPEN = re.compile(
    r"<!--\s*wp:([a-z0-9_/-]+)\s*(.*?)\s*(/)?-->",
    re.DOTALL | re.IGNORECASE,
)
# Matches closing block comment: <!-- /wp:divi/text -->
_BLOCK_CLOSE = re.compile(r"<!--\s*/wp:[a-z0-9_/-]+\s*-->", re.IGNORECASE)

# Strip et_pb_* class tokens and data-et-* attributes
_ET_CLASS = re.compile(r"\b(et_pb_[a-z0-9_-]+|divi-[a-z0-9_-]+-[a-z0-9_-]+|d5_[a-z0-9_-]+)\b", re.IGNORECASE)
_ET_ATTR = re.compile(r'\s+data-(?:et|builder|module-id|module-class|d5)-[a-z0-9_-]+\s*=\s*"[^"]*"', re.IGNORECASE)
_EMPTY_CL = re.compile(r'\s+class="\s*"')

# Background colours recognised by _section_type (compared lower-cased).
_DARK_COLORS = frozenset({"#0f5f53", "#1a3a34", "#0d4d42"})
_BRAND_COLORS = frozenset({"#1a8a7a", "#20a090"})
_LIGHT_COLORS = frozenset({"#f5f5f5", "#fafafa", "#f0f0f0", "#efefef"})
_WHITE_COLORS = frozenset({"#ffffff", "#fff", ""})


def _clean(html: str) -> str:
    """Strip Divi noise from an HTML fragment.

    Removes block comments, Divi data attributes, Divi class tokens and the
    empty ``class=""`` attributes left behind, then collapses runs of three
    or more newlines to two.
    """
    out = _BLOCK_OPEN.sub("", html)
    out = _BLOCK_CLOSE.sub("", out)
    out = _ET_ATTR.sub("", out)
    out = _ET_CLASS.sub("", out)
    out = _EMPTY_CL.sub("", out)
    out = re.sub(r"\n{3,}", "\n\n", out)
    return out.strip()


def _parse_attrs(raw_json: str) -> dict:
    """Parse the JSON attrs blob from a block comment (may be empty).

    Returns ``{}`` for empty input, malformed JSON, or JSON that is not an
    object, so callers can always use ``.get`` on the result.
    """
    raw_json = (raw_json or "").strip()
    if not raw_json:
        return {}
    try:
        parsed = json.loads(raw_json)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return {}
    return parsed if isinstance(parsed, dict) else {}


def _close_pattern(block_type: str) -> re.Pattern[str]:
    """Return a regex matching the closing comment of *block_type*."""
    # re.escape: block names are data, not regex syntax.
    return re.compile(rf"<!--\s*/wp:{re.escape(block_type)}\s*-->")


def _extract_inner(content: str, block_type: str) -> str:
    """Return the raw inner HTML of the first matching block.

    Returns "" when no opening comment for *block_type* exists; when the
    closing comment is missing, everything up to the end of *content* is
    returned.
    """
    # The lookahead stops "divi/text" from matching "divi/text-extra".
    open_pat = re.compile(
        rf"<!--\s*wp:{re.escape(block_type)}(?=\s|/?-->).*?-->",
        re.DOTALL,
    )
    m = open_pat.search(content)
    if not m:
        return ""
    start = m.end()
    m2 = _close_pattern(block_type).search(content, start)
    end = m2.start() if m2 else len(content)
    return content[start:end]


def _bg_color(attrs: dict) -> str:
    """Extract background colour from Divi 5 attrs dict.

    Always returns a string ("" when absent or not a string) because the
    result is lower-cased by ``_section_type``.
    """
    bg = attrs.get("backgroundColor", {})
    if isinstance(bg, dict):
        value = bg.get("value", bg.get("color", ""))
        return value if isinstance(value, str) else ""
    return str(bg) if bg else ""


def _section_type(bg: str) -> str:
    """Classify section by background colour.

    Returns one of "dark", "brand", "light", "white" or "custom".
    """
    bg_lower = bg.lower().strip()
    if bg_lower in _DARK_COLORS:
        return "dark"
    if bg_lower in _BRAND_COLORS:
        return "brand"
    if bg_lower in _LIGHT_COLORS:
        return "light"
    if bg_lower in _WHITE_COLORS:
        return "white"
    return "custom"


# ---------------------------------------------------------------------------
# Section/module extraction
# ---------------------------------------------------------------------------

def _build_module(module_type: str, attrs: dict, clean_inner: str) -> dict:
    """Map one block (type, attrs, cleaned inner HTML) to its output dict."""
    mod: dict = {"module": module_type}

    if module_type == "text":
        mod["html"] = clean_inner
        mod["text"] = _text(clean_inner)

    elif module_type in ("button", "cta"):
        mod["text"] = attrs.get("buttonText", _text(clean_inner))
        mod["url"] = attrs.get("buttonUrl", attrs.get("url", "#"))

    elif module_type == "image":
        src = attrs.get("src", attrs.get("url", ""))
        mod["src"] = src
        mod["alt"] = attrs.get("altText", attrs.get("alt", ""))
        mod["caption"] = attrs.get("caption", "")

    elif module_type == "blurb":
        mod["title"] = attrs.get("title", "")
        mod["icon"] = attrs.get("iconName", "")
        mod["html"] = clean_inner
        mod["text"] = _text(clean_inner)

    elif module_type == "testimonial":
        mod["quote"] = attrs.get("content", _text(clean_inner))
        mod["author"] = attrs.get("authorName", "")
        mod["company"] = attrs.get("authorJobTitle", "")

    elif module_type == "video":
        mod["src"] = attrs.get("src", "")
        mod["poster"] = attrs.get("poster", attrs.get("image", ""))

    elif module_type in ("accordion", "toggle"):
        # NOTE(review): the tag names of this pattern were lost in extraction;
        # a heading followed by a <div> is assumed -- confirm.
        items = re.findall(
            r"<h[1-6][^>]*>(.*?)</h[1-6]>\s*<div[^>]*>(.*?)</div>",
            clean_inner,
            re.DOTALL,
        )
        mod["items"] = [{"q": q.strip(), "a": a.strip()} for q, a in items]

    elif module_type == "contact_form":
        mod["form_id"] = attrs.get("formId", "")
        mod["note"] = "REPLACE with AM vanilla form — see 08-forms.md"

    else:
        mod["html"] = clean_inner
        mod["attrs"] = attrs

    return mod


def _extract_modules(section_html: str) -> list[dict]:
    """Walk block comments inside a section and extract module data.

    Every opening block comment produces one entry, container blocks
    included. A block's inner HTML runs to the first closing comment of the
    same type, so same-type nesting is truncated at the inner close.
    """
    modules: list[dict] = []

    for m in _BLOCK_OPEN.finditer(section_html):
        block_type = m.group(1)  # e.g. "divi/text"
        attrs = _parse_attrs(m.group(2))
        inner_start = m.end()

        if m.group(3):
            # Self-closing block: it has no inner HTML and no close comment.
            inner_html = ""
        else:
            # Find matching close tag
            close_m = _close_pattern(block_type).search(section_html, inner_start)
            inner_end = close_m.start() if close_m else len(section_html)
            inner_html = section_html[inner_start:inner_end]

        module_type = block_type.split("/")[-1]  # "text", "button", "image", etc.
        modules.append(_build_module(module_type, attrs, _clean(inner_html)))

    return modules


def parse_page_content(content_raw: str) -> list[dict]:
    """Parse Divi 5 block content into a list of section dicts."""
    sections: list[dict] = []
    # NOTE(review): "divi/section" as the section block name is an assumption.
    section_pat = re.compile(
        r"<!--\s*wp:divi/section(?=\s|/?-->)\s*(.*?)\s*/?-->",
        re.DOTALL,
    )
    section_close = re.compile(r"<!--\s*/wp:divi/section\s*-->")

    for sm in section_pat.finditer(content_raw):
        attrs = _parse_attrs(sm.group(1).strip())
        start = sm.end()
        close_m = section_close.search(content_raw, start)
        sec_html = content_raw[start : close_m.start() if close_m else len(content_raw)]

        bg = _bg_color(attrs)
        sec_type = _section_type(bg)
        modules = _extract_modules(sec_html)

        # Determine semantic role from first module
        role = "content"
        if modules and modules[0]["module"] in ("fullwidth_header", "text"):
            first_html = modules[0].get("html", "")
            # NOTE(review): everything from this test to the end of the
            # function was lost in extraction; the "hero" role and the keys
            # of the section dict below are reconstructions -- confirm.
            if "<h1" in first_html.lower():
                role = "hero"

        sections.append(
            {
                "type": sec_type,
                "role": role,
                "background": bg,
                "modules": modules,
            }
        )

    return sections


def main() -> None:
    """Convert every page in pages.json into ``{slug}.json`` in the output dir.

    Expects two command-line arguments: the pages JSON path and the output
    directory. Exits with status 1 when they are missing.
    """
    # NOTE(review): the argument check and usage text were lost in extraction
    # and are reconstructed from the module docstring -- confirm.
    if len(sys.argv) < 3:
        print("Usage: python3 extract_divi5.py <pages_json> <output_dir>")
        sys.exit(1)

    pages_path = Path(sys.argv[1])
    out_dir = Path(sys.argv[2])
    out_dir.mkdir(parents=True, exist_ok=True)

    pages = json.loads(pages_path.read_text(encoding="utf-8"))
    print(f"Processing {len(pages)} pages...")

    for page in pages:
        slug = page.get("slug") or f"page-{page['id']}"
        # `or ""` also covers an explicit null content_raw.
        content = page.get("content_raw") or ""
        sections = parse_page_content(content) if content.strip() else []

        output = {
            "id": page["id"],
            "slug": slug,
            "title": page["title"],
            "post_type": page["post_type"],
            "seo_title": page.get("seo_title", ""),
            "seo_description": page.get("seo_description", ""),
            "seo_keywords": page.get("seo_keywords", ""),
            "acf": page.get("acf", {}),
            "date": page.get("date", ""),
            "modified": page.get("modified", ""),
            "sections": sections,
            "section_count": len(sections),
        }

        out_file = out_dir / f"{slug}.json"
        # ensure_ascii=False emits raw non-ASCII text, so the encoding must be
        # explicit rather than the platform default.
        out_file.write_text(
            json.dumps(output, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(f" {slug}.json ({len(sections)} sections)")

    print(f"\nDone. {len(pages)} content files in {out_dir}")


if __name__ == "__main__":
    main()