recent updates
This commit is contained in:
@@ -0,0 +1,271 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract content from Divi 5 block markup in pages.json.
|
||||
|
||||
Reads .planning/data/pages.json (produced by analyze_db.py) and for each page
|
||||
parses the `content_raw` Divi 5 block structure into a clean per-page JSON
|
||||
under .planning/data/content/{slug}.json.
|
||||
|
||||
Usage:
|
||||
python3 extract_divi5.py <pages_json> <output_dir>
|
||||
|
||||
pages_json : path to .planning/data/pages.json
|
||||
output_dir : directory to write {slug}.json files (created if missing)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from html.parser import HTMLParser
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTML inner-text extractor
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _TextExtractor(HTMLParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.parts: list[str] = []
|
||||
|
||||
def handle_data(self, data: str):
|
||||
self.parts.append(data)
|
||||
|
||||
def get_text(self) -> str:
|
||||
return " ".join(self.parts).strip()
|
||||
|
||||
|
||||
def _text(html: str) -> str:
    """Return the visible text of an HTML fragment.

    Args:
        html: HTML fragment; may be empty or contain no tags at all.

    Returns:
        The fragment's text chunks joined by single spaces and stripped.
    """
    p = _TextExtractor()
    p.feed(html)
    # feed() may hold back trailing input it considers incomplete (for
    # example text after the last tag containing an unterminated "&", as in
    # "AT&T"). close() forces that buffered remainder through handle_data();
    # without it such text is silently lost.
    p.close()
    return p.get_text()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Divi block parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Matches opening block comment: <!-- wp:divi/MODULE {JSON} -->
|
||||
_BLOCK_OPEN = re.compile(r"<!--\s*wp:(divi/[a-z0-9_-]+)\s*(.*?)--?>", re.DOTALL)
|
||||
# Matches closing block comment: <!-- /wp:divi/MODULE -->
|
||||
_BLOCK_CLOSE = re.compile(r"<!--\s*/wp:(divi/[a-z0-9_-]+)\s*-->")
|
||||
|
||||
# Strip et_pb_* class tokens and data-et-* attributes
|
||||
_ET_CLASS = re.compile(r"\b(et_pb_[a-z0-9_-]+|divi-[a-z0-9_-]+-[a-z0-9_-]+|d5_[a-z0-9_-]+)\b", re.IGNORECASE)
|
||||
_ET_ATTR = re.compile(r'\s+data-(?:et|builder|module-id|module-class|d5)-[a-z0-9_-]+\s*=\s*"[^"]*"', re.IGNORECASE)
|
||||
_EMPTY_CL = re.compile(r'\s+class="\s*"')
|
||||
|
||||
|
||||
# A whole class attribute in either quote style:
#   group(1) leading whitespace plus "class=", group(2) the quote character,
#   group(3) the attribute value.
_CLASS_ATTR = re.compile(
    r"""(\s+class\s*=\s*)(["'])(.*?)\2""",
    re.IGNORECASE | re.DOTALL,
)


def _strip_divi_classes(match: "re.Match[str]") -> str:
    """Rebuild one class attribute without builder-generated tokens.

    Returns an empty string (dropping the attribute and its leading
    whitespace) when no tokens remain.
    """
    kept = [tok for tok in match.group(3).split() if not _ET_CLASS.fullmatch(tok)]
    if not kept:
        return ""
    quote = match.group(2)
    return f"{match.group(1)}{quote}{' '.join(kept)}{quote}"


def _clean(html: str) -> str:
    """Strip Divi noise from an HTML fragment.

    Removes block open/close comments, builder ``data-*`` attributes and
    builder class tokens, collapses runs of three or more newlines to two,
    and trims the result.

    Class tokens are removed only inside ``class`` attribute values.
    Applying ``_ET_CLASS`` to the whole fragment would also delete matching
    substrings from visible text and from URLs in ``href``/``src``.

    Args:
        html: Raw HTML fragment, possibly containing block comments.

    Returns:
        The cleaned fragment; class attributes left without tokens are dropped.
    """
    out = _BLOCK_OPEN.sub("", html)
    out = _BLOCK_CLOSE.sub("", out)
    out = _ET_ATTR.sub("", out)
    # Also removes class attributes that are, or become, empty.
    out = _CLASS_ATTR.sub(_strip_divi_classes, out)
    out = re.sub(r"\n{3,}", "\n\n", out)
    return out.strip()
|
||||
|
||||
|
||||
def _parse_attrs(raw_json: str) -> dict:
|
||||
"""Parse the JSON attrs blob from a block comment (may be empty)."""
|
||||
raw_json = raw_json.strip()
|
||||
if not raw_json:
|
||||
return {}
|
||||
try:
|
||||
return json.loads(raw_json)
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
def _extract_inner(content: str, block_type: str) -> str:
|
||||
"""Return the raw inner HTML of the first matching block."""
|
||||
open_pat = re.compile(rf"<!--\s*wp:{re.escape(block_type)}[^>]*-->", re.DOTALL)
|
||||
close_pat = re.compile(rf"<!--\s*/wp:{re.escape(block_type)}\s*-->")
|
||||
m = open_pat.search(content)
|
||||
if not m:
|
||||
return ""
|
||||
start = m.end()
|
||||
m2 = close_pat.search(content, start)
|
||||
end = m2.start() if m2 else len(content)
|
||||
return content[start:end]
|
||||
|
||||
|
||||
def _bg_color(attrs: dict) -> str:
|
||||
"""Extract background colour from Divi 5 attrs dict."""
|
||||
bg = attrs.get("backgroundColor", {})
|
||||
if isinstance(bg, dict):
|
||||
return bg.get("value", bg.get("color", ""))
|
||||
return str(bg) if bg else ""
|
||||
|
||||
|
||||
def _section_type(bg: str) -> str:
|
||||
"""Classify section by background colour."""
|
||||
dark_colors = {"#0f5f53", "#1a3a34", "#0d4d42"}
|
||||
brand_colors = {"#1a8a7a", "#20a090"}
|
||||
light_colors = {"#f5f5f5", "#fafafa", "#f0f0f0", "#efefef"}
|
||||
bg_lower = bg.lower().strip()
|
||||
if bg_lower in dark_colors:
|
||||
return "dark"
|
||||
if bg_lower in brand_colors:
|
||||
return "brand"
|
||||
if bg_lower in light_colors:
|
||||
return "light"
|
||||
if bg_lower in ("#ffffff", "#fff", ""):
|
||||
return "white"
|
||||
return "custom"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Section/module extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _module_fields(module_type: str, attrs: dict, clean_inner: str) -> dict:
    """Return the type-specific fields for one module (all keys except "module")."""
    if module_type == "text":
        return {"html": clean_inner, "text": _text(clean_inner)}

    if module_type in ("button", "cta"):
        return {
            "text": attrs.get("buttonText", _text(clean_inner)),
            "url": attrs.get("buttonUrl", attrs.get("url", "#")),
        }

    if module_type == "image":
        return {
            "src": attrs.get("src", attrs.get("url", "")),
            "alt": attrs.get("altText", attrs.get("alt", "")),
            "caption": attrs.get("caption", ""),
        }

    if module_type == "blurb":
        return {
            "title": attrs.get("title", ""),
            "icon": attrs.get("iconName", ""),
            "html": clean_inner,
            "text": _text(clean_inner),
        }

    if module_type == "testimonial":
        return {
            "quote": attrs.get("content", _text(clean_inner)),
            "author": attrs.get("authorName", ""),
            "company": attrs.get("authorJobTitle", ""),
        }

    if module_type == "video":
        return {
            "src": attrs.get("src", ""),
            "poster": attrs.get("poster", attrs.get("image", "")),
        }

    if module_type in ("accordion", "toggle"):
        # Question/answer pairs are read from <dt>/<dd> pairs in the inner HTML.
        items = re.findall(
            r"<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>", clean_inner, re.DOTALL
        )
        return {"items": [{"q": q.strip(), "a": a.strip()} for q, a in items]}

    if module_type == "contact_form":
        return {
            "form_id": attrs.get("formId", ""),
            "note": "REPLACE with AM vanilla form — see 08-forms.md",
        }

    # Unknown module: keep the cleaned markup and the raw attrs for later review.
    return {"html": clean_inner, "attrs": attrs}


def _extract_modules(section_html: str) -> list[dict]:
    """Walk block comments inside a section and extract module data.

    Every ``divi/*`` opening comment in ``section_html`` yields one dict, in
    document order. A block's inner HTML runs from its opening comment to the
    next closing comment of the same type (or to the end of the section when
    none follows). Void blocks (``... /-->``) have no inner HTML.

    Args:
        section_html: Markup between a section's opening and closing comments.

    Returns:
        A list of dicts, each with a ``"module"`` key (block name without the
        ``divi/`` namespace) followed by type-specific fields.
    """
    modules: list[dict] = []
    # One compiled closing pattern per block type seen in this section.
    close_patterns: dict[str, re.Pattern[str]] = {}

    for m in _BLOCK_OPEN.finditer(section_html):
        block_type = m.group(1)  # e.g. "divi/text"

        # A void block ends in "/-->". Its "/" is captured together with the
        # attrs blob and must be removed before JSON parsing, and the markup
        # that follows it belongs to other blocks.
        is_void = m.group(0).endswith("/-->")
        raw_attrs = m.group(2).strip()
        if is_void and raw_attrs.endswith("/"):
            raw_attrs = raw_attrs[:-1]
        attrs = _parse_attrs(raw_attrs)
        if not isinstance(attrs, dict):
            attrs = {}

        if is_void:
            inner_html = ""
        else:
            close_pat = close_patterns.get(block_type)
            if close_pat is None:
                close_pat = re.compile(rf"<!--\s*/wp:{re.escape(block_type)}\s*-->")
                close_patterns[block_type] = close_pat
            inner_start = m.end()
            close_m = close_pat.search(section_html, inner_start)
            inner_end = close_m.start() if close_m else len(section_html)
            inner_html = section_html[inner_start:inner_end]

        clean_inner = _clean(inner_html)
        module_type = block_type.split("/")[-1]  # "text", "button", "image", etc.

        mod: dict = {"module": module_type}
        mod.update(_module_fields(module_type, attrs, clean_inner))
        modules.append(mod)

    return modules
|
||||
|
||||
|
||||
def parse_page_content(content_raw: str) -> list[dict]:
    """Parse Divi 5 block content into a list of section dicts.

    Each ``divi/section`` opening comment starts one section. Its body runs
    to the next section closing comment, or to the end of ``content_raw``
    when none follows. A void section (``... /-->``) has an empty body.

    Args:
        content_raw: Raw page content containing block comments.

    Returns:
        One dict per section, in document order, with the keys ``role``,
        ``section_type``, ``background_color``, ``attrs`` and ``modules``.
    """
    sections: list[dict] = []

    # The lookahead pins the end of the block name so that a block whose name
    # merely starts with "divi/section" is not mistaken for a section.
    section_pat = re.compile(
        r"<!--\s*wp:divi/section(?=[\s/]|-->)(.*?)-->", re.DOTALL
    )
    section_close = re.compile(r"<!--\s*/wp:divi/section\s*-->")

    for sm in section_pat.finditer(content_raw):
        is_void = sm.group(0).endswith("/-->")
        raw_attrs = sm.group(1).strip()
        if is_void and raw_attrs.endswith("/"):
            # Drop the "/" of the void terminator so the blob is valid JSON.
            raw_attrs = raw_attrs[:-1]
        attrs = _parse_attrs(raw_attrs)
        if not isinstance(attrs, dict):
            attrs = {}

        if is_void:
            sec_html = ""
        else:
            start = sm.end()
            close_m = section_close.search(content_raw, start)
            end = close_m.start() if close_m else len(content_raw)
            sec_html = content_raw[start:end]

        bg = _bg_color(attrs)
        sec_type = _section_type(bg)
        modules = _extract_modules(sec_html)

        # A section is a hero when its first extracted block is a text or
        # fullwidth header module whose HTML contains an <h1>.
        # NOTE(review): if modules are wrapped in row/column blocks, the first
        # entry is the wrapper and this never fires — confirm against real data.
        role = "content"
        if modules and modules[0]["module"] in ("fullwidth_header", "text"):
            first_html = modules[0].get("html", "")
            if "<h1" in first_html:
                role = "hero"

        sections.append({
            "role": role,
            "section_type": sec_type,
            "background_color": bg,
            "attrs": attrs,
            "modules": modules,
        })

    return sections
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Command-line entry point: write one ``{slug}.json`` file per page.

    Expects two arguments, ``<pages_json>`` and ``<output_dir>``. The output
    directory is created when missing. Each page entry must provide ``id``,
    ``title`` and ``post_type``; the remaining fields are optional and
    default to empty values. A page without a slug is written as
    ``page-{id}.json``.

    Exits with status 1 when fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        # Usage errors belong on stderr so they do not mix with progress output.
        print(f"Usage: {sys.argv[0]} <pages_json> <output_dir>", file=sys.stderr)
        sys.exit(1)

    pages_path = Path(sys.argv[1])
    out_dir = Path(sys.argv[2])
    out_dir.mkdir(parents=True, exist_ok=True)

    pages = json.loads(pages_path.read_text(encoding="utf-8"))
    print(f"Processing {len(pages)} pages...")

    for page in pages:
        slug = page.get("slug") or f"page-{page['id']}"
        content = page.get("content_raw", "")

        sections = parse_page_content(content) if content.strip() else []

        output = {
            "id": page["id"],
            "slug": slug,
            "title": page["title"],
            "post_type": page["post_type"],
            "seo_title": page.get("seo_title", ""),
            "seo_description": page.get("seo_description", ""),
            "seo_keywords": page.get("seo_keywords", ""),
            "acf": page.get("acf", {}),
            "date": page.get("date", ""),
            "modified": page.get("modified", ""),
            "sections": sections,
            "section_count": len(sections),
        }

        out_file = out_dir / f"{slug}.json"
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be written as UTF-8 explicitly. The platform default encoding may
        # fail on, or mis-encode, such characters.
        out_file.write_text(
            json.dumps(output, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(f" {slug}.json ({len(sections)} sections)")

    print(f"\nDone. {len(pages)} content files in {out_dir}")
|
||||
|
||||
|
||||
# Run the extractor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user