#!/usr/bin/env python3
"""Extract content from Divi 5 block markup in pages.json.

Reads .planning/data/pages.json (produced by analyze_db.py) and for each page
parses the `content_raw` Divi 5 block structure into a clean per-page JSON
under .planning/data/content/{slug}.json.

Usage:
    python3 extract_divi5.py <pages_json> <output_dir>

    pages_json  : path to .planning/data/pages.json
    output_dir  : directory to write {slug}.json files (created if missing)
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HTML inner-text extractor
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class _TextExtractor(HTMLParser):
|
|
def __init__(self):
|
|
super().__init__()
|
|
self.parts: list[str] = []
|
|
|
|
def handle_data(self, data: str):
|
|
self.parts.append(data)
|
|
|
|
def get_text(self) -> str:
|
|
return " ".join(self.parts).strip()
|
|
|
|
|
|
def _text(html: str) -> str:
    """Return the text content of an HTML fragment with the markup removed.

    Args:
        html: HTML fragment to read.

    Returns:
        The text nodes joined by single spaces, stripped of surrounding
        whitespace.
    """
    p = _TextExtractor()
    p.feed(html)
    # Signal end of input: the parser otherwise keeps trailing text that ends
    # in an unterminated "&..." (e.g. "Q&A") buffered and never reports it.
    p.close()
    return p.get_text()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Divi block parsing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Matches opening block comment: <!-- wp:divi/MODULE {JSON} -->
|
|
_BLOCK_OPEN = re.compile(r"<!--\s*wp:(divi/[a-z0-9_-]+)\s*(.*?)--?>", re.DOTALL)
|
|
# Matches closing block comment: <!-- /wp:divi/MODULE -->
|
|
_BLOCK_CLOSE = re.compile(r"<!--\s*/wp:(divi/[a-z0-9_-]+)\s*-->")
|
|
|
|
# Strip et_pb_* class tokens and data-et-* attributes
|
|
_ET_CLASS = re.compile(r"\b(et_pb_[a-z0-9_-]+|divi-[a-z0-9_-]+-[a-z0-9_-]+|d5_[a-z0-9_-]+)\b", re.IGNORECASE)
|
|
_ET_ATTR = re.compile(r'\s+data-(?:et|builder|module-id|module-class|d5)-[a-z0-9_-]+\s*=\s*"[^"]*"', re.IGNORECASE)
|
|
_EMPTY_CL = re.compile(r'\s+class="\s*"')
|
|
|
|
|
|
def _clean(html: str) -> str:
    """Strip Divi noise from an HTML fragment.

    Removes Divi block comments and Divi ``data-*`` attributes, drops Divi
    class tokens, removes class attributes that end up empty, collapses runs
    of three or more newlines to one blank line and strips the result.

    Class tokens are removed only inside ``class`` attribute values. Applying
    the token pattern to the whole fragment would also rewrite URLs, ids and
    visible text that merely contain something like ``et_pb_image``.

    Args:
        html: Raw inner HTML of a block.

    Returns:
        The cleaned HTML fragment.
    """

    def _drop_divi_classes(match: re.Match) -> str:
        # Keep every class token that is not a Divi token; drop the whole
        # attribute when nothing is left.
        lead, quote, value = match.group(1), match.group(2), match.group(3)
        kept = [tok for tok in value.split() if not _ET_CLASS.fullmatch(tok)]
        if not kept:
            return ""
        return f"{lead}class={quote}{' '.join(kept)}{quote}"

    out = _BLOCK_OPEN.sub("", html)
    out = _BLOCK_CLOSE.sub("", out)
    out = _ET_ATTR.sub("", out)
    out = re.sub(
        r"""(\s+)class\s*=\s*(["'])(.*?)\2""",
        _drop_divi_classes,
        out,
        flags=re.IGNORECASE | re.DOTALL,
    )
    # Class attributes that were already empty in the input.
    out = _EMPTY_CL.sub("", out)
    out = re.sub(r"\n{3,}", "\n\n", out)
    return out.strip()
|
|
|
|
|
|
def _parse_attrs(raw_json: str) -> dict:
|
|
"""Parse the JSON attrs blob from a block comment (may be empty)."""
|
|
raw_json = raw_json.strip()
|
|
if not raw_json:
|
|
return {}
|
|
try:
|
|
return json.loads(raw_json)
|
|
except Exception:
|
|
return {}
|
|
|
|
|
|
def _extract_inner(content: str, block_type: str) -> str:
|
|
"""Return the raw inner HTML of the first matching block."""
|
|
open_pat = re.compile(rf"<!--\s*wp:{re.escape(block_type)}[^>]*-->", re.DOTALL)
|
|
close_pat = re.compile(rf"<!--\s*/wp:{re.escape(block_type)}\s*-->")
|
|
m = open_pat.search(content)
|
|
if not m:
|
|
return ""
|
|
start = m.end()
|
|
m2 = close_pat.search(content, start)
|
|
end = m2.start() if m2 else len(content)
|
|
return content[start:end]
|
|
|
|
|
|
def _bg_color(attrs: dict) -> str:
|
|
"""Extract background colour from Divi 5 attrs dict."""
|
|
bg = attrs.get("backgroundColor", {})
|
|
if isinstance(bg, dict):
|
|
return bg.get("value", bg.get("color", ""))
|
|
return str(bg) if bg else ""
|
|
|
|
|
|
def _section_type(bg: str) -> str:
|
|
"""Classify section by background colour."""
|
|
dark_colors = {"#0f5f53", "#1a3a34", "#0d4d42"}
|
|
brand_colors = {"#1a8a7a", "#20a090"}
|
|
light_colors = {"#f5f5f5", "#fafafa", "#f0f0f0", "#efefef"}
|
|
bg_lower = bg.lower().strip()
|
|
if bg_lower in dark_colors:
|
|
return "dark"
|
|
if bg_lower in brand_colors:
|
|
return "brand"
|
|
if bg_lower in light_colors:
|
|
return "light"
|
|
if bg_lower in ("#ffffff", "#fff", ""):
|
|
return "white"
|
|
return "custom"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Section/module extraction
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _extract_modules(section_html: str) -> list[dict]:
    """Walk block comments inside a section and extract module data.

    Every opening ``wp:divi/*`` comment found in ``section_html`` yields one
    dict, in document order. Container blocks are reported too, and the
    blocks nested inside them are reported again as their own entries.

    Args:
        section_html: Markup between a section's opening and closing comment.

    Returns:
        One dict per block. ``"module"`` holds the block name without the
        ``divi/`` namespace; the remaining keys depend on the module kind.
        Unrecognised modules keep their cleaned HTML and raw attrs.

    NOTE(review): the attr keys read below (``buttonText``, ``src``, ...) are
    taken as-is from the existing code — confirm against real Divi 5 attrs.
    """
    modules: list[dict] = []
    content = section_html

    for m in _BLOCK_OPEN.finditer(content):
        block_type = m.group(1)  # e.g. "divi/text"

        # A void block ends in "/-->": it has no inner HTML and no closing
        # comment, so it must not be paired with a later closing comment.
        is_void = m.group(0).rstrip("->").rstrip().endswith("/")

        # The void marker may be captured together with the attrs text.
        raw_attrs = m.group(2).strip()
        if raw_attrs.endswith("/"):
            raw_attrs = raw_attrs[:-1]
        attrs = _parse_attrs(raw_attrs)
        if not isinstance(attrs, dict):
            attrs = {}

        if is_void:
            inner_html = ""
        else:
            # Inner HTML runs to the next closing comment of the same type,
            # or to the end of the section when there is none.
            inner_start = m.end()
            close_pat = re.compile(rf"<!--\s*/wp:{re.escape(block_type)}\s*-->")
            close_m = close_pat.search(content, inner_start)
            inner_end = close_m.start() if close_m else len(content)
            inner_html = content[inner_start:inner_end]
        clean_inner = _clean(inner_html)

        module_type = block_type.split("/")[-1]  # "text", "button", "image", etc.
        # WordPress block names use dashes, not underscores, so compare on a
        # normalised form; the reported name stays exactly as found.
        kind = module_type.replace("-", "_")

        mod: dict = {"module": module_type}

        if kind == "text":
            mod["html"] = clean_inner
            mod["text"] = _text(clean_inner)

        elif kind in ("button", "cta"):
            mod["text"] = attrs.get("buttonText", _text(clean_inner))
            mod["url"] = attrs.get("buttonUrl", attrs.get("url", "#"))

        elif kind == "image":
            mod["src"] = attrs.get("src", attrs.get("url", ""))
            mod["alt"] = attrs.get("altText", attrs.get("alt", ""))
            mod["caption"] = attrs.get("caption", "")

        elif kind == "blurb":
            mod["title"] = attrs.get("title", "")
            mod["icon"] = attrs.get("iconName", "")
            mod["html"] = clean_inner
            mod["text"] = _text(clean_inner)

        elif kind == "testimonial":
            mod["quote"] = attrs.get("content", _text(clean_inner))
            mod["author"] = attrs.get("authorName", "")
            mod["company"] = attrs.get("authorJobTitle", "")

        elif kind == "video":
            mod["src"] = attrs.get("src", "")
            mod["poster"] = attrs.get("poster", attrs.get("image", ""))

        elif kind in ("accordion", "toggle"):
            # Question/answer pairs are read from <dt>/<dd> markup.
            items = re.findall(
                r"<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>", clean_inner, re.DOTALL
            )
            mod["items"] = [{"q": q.strip(), "a": a.strip()} for q, a in items]

        elif kind == "contact_form":
            mod["form_id"] = attrs.get("formId", "")
            mod["note"] = "REPLACE with AM vanilla form — see 08-forms.md"

        else:
            mod["html"] = clean_inner
            mod["attrs"] = attrs

        modules.append(mod)

    return modules
|
|
|
|
|
|
def parse_page_content(content_raw: str) -> list[dict]:
    """Parse Divi 5 block content into a list of section dicts.

    Args:
        content_raw: Raw post content containing ``wp:divi/*`` block comments.

    Returns:
        One dict per ``divi/section`` block, in document order, with the keys
        ``role`` (``"hero"`` or ``"content"``), ``section_type``,
        ``background_color``, ``attrs`` and ``modules``.
    """
    sections: list[dict] = []

    # The lookahead pins the match to the exact "divi/section" block name.
    section_pat = re.compile(r"<!--\s*wp:divi/section(?![a-z0-9_-])(.*?)-->", re.DOTALL)
    section_close = re.compile(r"<!--\s*/wp:divi/section\s*-->")

    for sm in section_pat.finditer(content_raw):
        raw_attrs = sm.group(1).strip()
        # A self-closing section ("/-->") has no body; its "/" marker must
        # not reach the JSON parser.
        is_void = raw_attrs.endswith("/")
        if is_void:
            raw_attrs = raw_attrs[:-1]
        attrs = _parse_attrs(raw_attrs)

        if is_void:
            sec_html = ""
        else:
            # The body runs to the next section close, or to the end of the
            # content when the closing comment is missing.
            start = sm.end()
            close_m = section_close.search(content_raw, start)
            sec_html = content_raw[start : close_m.start() if close_m else len(content_raw)]

        bg = _bg_color(attrs)
        sec_type = _section_type(bg)
        modules = _extract_modules(sec_html)

        # Determine semantic role from first module: a leading header/text
        # module that contains an <h1> marks the section as the hero.
        role = "content"
        if modules:
            # Block names use dashes ("fullwidth-header"); accept both forms.
            first_kind = modules[0]["module"].replace("-", "_")
            if first_kind in ("fullwidth_header", "text"):
                first_html = modules[0].get("html", "")
                if "<h1" in first_html:
                    role = "hero"

        sections.append({
            "role": role,
            "section_type": sec_type,
            "background_color": bg,
            "attrs": attrs,
            "modules": modules,
        })

    return sections
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main():
    """Command-line entry point: write one content JSON file per page.

    Reads the pages list from ``sys.argv[1]`` and writes ``{slug}.json`` into
    the directory ``sys.argv[2]``, which is created if missing. Exits with
    status 1 and a usage message when fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        # Usage errors belong on stderr so they never mix with normal output.
        print(f"Usage: {sys.argv[0]} <pages_json> <output_dir>", file=sys.stderr)
        sys.exit(1)

    pages_path = Path(sys.argv[1])
    out_dir = Path(sys.argv[2])
    out_dir.mkdir(parents=True, exist_ok=True)

    pages = json.loads(pages_path.read_text(encoding="utf-8"))
    print(f"Processing {len(pages)} pages...")

    for page in pages:
        slug = page.get("slug") or f"page-{page['id']}"
        # content_raw may be present but null; treat that like an empty page.
        content = page.get("content_raw") or ""

        sections = parse_page_content(content) if content.strip() else []

        output = {
            "id": page["id"],
            "slug": slug,
            "title": page["title"],
            "post_type": page["post_type"],
            "seo_title": page.get("seo_title", ""),
            "seo_description": page.get("seo_description", ""),
            "seo_keywords": page.get("seo_keywords", ""),
            "acf": page.get("acf", {}),
            "date": page.get("date", ""),
            "modified": page.get("modified", ""),
            "sections": sections,
            "section_count": len(sections),
        }

        out_file = out_dir / f"{slug}.json"
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be written as UTF-8 explicitly rather than in the locale encoding.
        out_file.write_text(
            json.dumps(output, indent=2, ensure_ascii=False), encoding="utf-8"
        )
        print(f"  {slug}.json  ({len(sections)} sections)")

    print(f"\nDone. {len(pages)} content files in {out_dir}")
|
|
|
|
|
|
# Run the extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|