recent updates

This commit is contained in:
2026-06-09 18:31:59 +02:00
parent 398b94965c
commit 94f7a1f72a
42 changed files with 8686 additions and 0 deletions
@@ -0,0 +1,271 @@
#!/usr/bin/env python3
"""Extract content from Divi 5 block markup in pages.json.
Reads .planning/data/pages.json (produced by analyze_db.py) and for each page
parses the `content_raw` Divi 5 block structure into a clean per-page JSON
under .planning/data/content/{slug}.json.
Usage:
python3 extract_divi5.py <pages_json> <output_dir>
pages_json : path to .planning/data/pages.json
output_dir : directory to write {slug}.json files (created if missing)
"""
from __future__ import annotations
import json
import re
import sys
from pathlib import Path
from html.parser import HTMLParser
# ---------------------------------------------------------------------------
# HTML inner-text extractor
# ---------------------------------------------------------------------------
class _TextExtractor(HTMLParser):
def __init__(self):
super().__init__()
self.parts: list[str] = []
def handle_data(self, data: str):
self.parts.append(data)
def get_text(self) -> str:
return " ".join(self.parts).strip()
def _text(html: str) -> str:
    """Return the plain text of ``html`` as produced by ``_TextExtractor``."""
    extractor = _TextExtractor()
    extractor.feed(html)
    return extractor.get_text()
# ---------------------------------------------------------------------------
# Divi block parsing
# ---------------------------------------------------------------------------
# Opening block comment: <!-- wp:divi/MODULE {JSON} -->
#   group(1) -> block name including the "divi/" prefix
#   group(2) -> everything between the name and the terminator (normally the
#               JSON attrs blob; for a self-closing "/-->" block it also
#               carries the trailing "/").
# The terminator `--?>` accepts both "-->" and a single-dash "->".
_BLOCK_OPEN = re.compile(r"<!--\s*wp:(divi/[a-z0-9_-]+)\s*(.*?)--?>", re.DOTALL)
# Closing block comment: <!-- /wp:divi/MODULE -->  (group(1) -> block name)
_BLOCK_CLOSE = re.compile(r"<!--\s*/wp:(divi/[a-z0-9_-]+)\s*-->")
# Builder-generated class tokens: et_pb_*, divi-*-* and d5_* (case-insensitive).
_ET_CLASS = re.compile(r"\b(et_pb_[a-z0-9_-]+|divi-[a-z0-9_-]+-[a-z0-9_-]+|d5_[a-z0-9_-]+)\b", re.IGNORECASE)
# Builder-generated double-quoted attributes: data-et-*, data-builder-*,
# data-module-id-*, data-module-class-*, data-d5-*.
# NOTE(review): every alternative must be followed by "-<suffix>", so a bare
# `data-module-id="..."` / `data-module-class="..."` is NOT matched — confirm
# whether that is intended.
_ET_ATTR = re.compile(r'\s+data-(?:et|builder|module-id|module-class|d5)-[a-z0-9_-]+\s*=\s*"[^"]*"', re.IGNORECASE)
# A class attribute whose value is empty or whitespace only.
_EMPTY_CL = re.compile(r'\s+class="\s*"')
def _clean(html: str) -> str:
    """Return ``html`` without Divi block comments, builder classes or attributes.

    Runs of three or more newlines are collapsed to a single blank line and
    the result is stripped of surrounding whitespace.
    """
    # Order matters: class tokens are removed before the empty-class sweep so
    # that ``class`` attributes emptied by the former are caught by the latter.
    for noise in (_BLOCK_OPEN, _BLOCK_CLOSE, _ET_ATTR, _ET_CLASS, _EMPTY_CL):
        html = noise.sub("", html)
    return re.sub(r"\n{3,}", "\n\n", html).strip()
def _parse_attrs(raw_json: str) -> dict:
"""Parse the JSON attrs blob from a block comment (may be empty)."""
raw_json = raw_json.strip()
if not raw_json:
return {}
try:
return json.loads(raw_json)
except Exception:
return {}
def _extract_inner(content: str, block_type: str) -> str:
"""Return the raw inner HTML of the first matching block."""
open_pat = re.compile(rf"<!--\s*wp:{re.escape(block_type)}[^>]*-->", re.DOTALL)
close_pat = re.compile(rf"<!--\s*/wp:{re.escape(block_type)}\s*-->")
m = open_pat.search(content)
if not m:
return ""
start = m.end()
m2 = close_pat.search(content, start)
end = m2.start() if m2 else len(content)
return content[start:end]
def _bg_color(attrs: dict) -> str:
"""Extract background colour from Divi 5 attrs dict."""
bg = attrs.get("backgroundColor", {})
if isinstance(bg, dict):
return bg.get("value", bg.get("color", ""))
return str(bg) if bg else ""
def _section_type(bg: str) -> str:
"""Classify section by background colour."""
dark_colors = {"#0f5f53", "#1a3a34", "#0d4d42"}
brand_colors = {"#1a8a7a", "#20a090"}
light_colors = {"#f5f5f5", "#fafafa", "#f0f0f0", "#efefef"}
bg_lower = bg.lower().strip()
if bg_lower in dark_colors:
return "dark"
if bg_lower in brand_colors:
return "brand"
if bg_lower in light_colors:
return "light"
if bg_lower in ("#ffffff", "#fff", ""):
return "white"
return "custom"
# ---------------------------------------------------------------------------
# Section/module extraction
# ---------------------------------------------------------------------------
def _extract_modules(section_html: str) -> list[dict]:
    """Walk the block comments inside a section and extract module data.

    Every opening ``wp:divi/*`` comment found in ``section_html`` yields one
    dict with a ``"module"`` key (the block name without the ``divi/``
    prefix) plus type-specific fields.  Unrecognised module types keep their
    cleaned inner HTML and raw attrs.

    Self-closing blocks (``<!-- wp:divi/x {...} /-->``) have no body: their
    inner HTML is treated as empty and the trailing ``/`` is removed before
    the attrs are decoded.
    """
    modules: list[dict] = []
    content = section_html
    for m in _BLOCK_OPEN.finditer(content):
        block_type = m.group(1)  # e.g. "divi/text"

        raw_attrs = m.group(2).strip()
        # The opening pattern captures the "/" of a "/-->" terminator as part
        # of the attrs text; it marks a block with no body and no close tag.
        self_closing = raw_attrs.endswith("/")
        if self_closing:
            raw_attrs = raw_attrs[:-1]
        attrs = _parse_attrs(raw_attrs)
        if not isinstance(attrs, dict):
            # Valid JSON that is not an object cannot be queried with .get().
            attrs = {}

        if self_closing:
            inner_html = ""
        else:
            inner_start = m.end()
            # Find the next close tag of the same block type; without one the
            # body runs to the end of the section.
            close_pat = re.compile(rf"<!--\s*/wp:{re.escape(block_type)}\s*-->")
            close_m = close_pat.search(content, inner_start)
            inner_html = content[inner_start : close_m.start() if close_m else len(content)]
        clean_inner = _clean(inner_html)

        module_type = block_type.split("/")[-1]  # "text", "button", "image", etc.
        mod: dict = {"module": module_type}
        if module_type == "text":
            mod["html"] = clean_inner
            mod["text"] = _text(clean_inner)
        elif module_type in ("button", "cta"):
            mod["text"] = attrs.get("buttonText", _text(clean_inner))
            mod["url"] = attrs.get("buttonUrl", attrs.get("url", "#"))
        elif module_type == "image":
            src = attrs.get("src", attrs.get("url", ""))
            mod["src"] = src
            mod["alt"] = attrs.get("altText", attrs.get("alt", ""))
            mod["caption"] = attrs.get("caption", "")
        elif module_type == "blurb":
            mod["title"] = attrs.get("title", "")
            mod["icon"] = attrs.get("iconName", "")
            mod["html"] = clean_inner
            mod["text"] = _text(clean_inner)
        elif module_type == "testimonial":
            mod["quote"] = attrs.get("content", _text(clean_inner))
            mod["author"] = attrs.get("authorName", "")
            mod["company"] = attrs.get("authorJobTitle", "")
        elif module_type == "video":
            mod["src"] = attrs.get("src", "")
            mod["poster"] = attrs.get("poster", attrs.get("image", ""))
        elif module_type in ("accordion", "toggle"):
            # Question/answer pairs are read from <dt>/<dd> pairs in the body.
            items = re.findall(r"<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>", clean_inner, re.DOTALL)
            mod["items"] = [{"q": q.strip(), "a": a.strip()} for q, a in items]
        elif module_type == "contact_form":
            mod["form_id"] = attrs.get("formId", "")
            mod["note"] = "REPLACE with AM vanilla form — see 08-forms.md"
        else:
            mod["html"] = clean_inner
            mod["attrs"] = attrs
        modules.append(mod)
    return modules
def parse_page_content(content_raw: str) -> list[dict]:
    """Split Divi 5 block markup into one dict per ``divi/section`` block.

    Each dict carries the section's ``role`` (``"hero"`` when its first
    module is a ``text`` or ``fullwidth_header`` module whose HTML contains
    an ``<h1``, otherwise ``"content"``), its ``section_type``,
    ``background_color``, decoded ``attrs`` and extracted ``modules``.
    """
    opener = re.compile(r"<!--\s*wp:divi/section(.*?)-->", re.DOTALL)
    closer = re.compile(r"<!--\s*/wp:divi/section\s*-->")
    parsed: list[dict] = []
    for opening in opener.finditer(content_raw):
        section_attrs = _parse_attrs(opening.group(1).strip())

        # The section body runs up to the next section close tag, or to the
        # end of the input when there is none.
        body_start = opening.end()
        closing = closer.search(content_raw, body_start)
        body_end = len(content_raw) if closing is None else closing.start()
        body = content_raw[body_start:body_end]

        colour = _bg_color(section_attrs)
        kind = _section_type(colour)
        mods = _extract_modules(body)

        # The semantic role is decided by the first module only.
        lead = mods[0] if mods else None
        is_hero = (
            lead is not None
            and lead["module"] in ("fullwidth_header", "text")
            and "<h1" in lead.get("html", "")
        )

        parsed.append(
            {
                "role": "hero" if is_hero else "content",
                "section_type": kind,
                "background_color": colour,
                "attrs": section_attrs,
                "modules": mods,
            }
        )
    return parsed
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point: write one ``{slug}.json`` per page.

    Expects two arguments: the path to ``pages.json`` and the output
    directory (created if missing).  Exits with status 1 and a usage message
    on stderr when arguments are missing.
    """
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} <pages_json> <output_dir>", file=sys.stderr)
        sys.exit(1)
    pages_path = Path(sys.argv[1])
    out_dir = Path(sys.argv[2])
    out_dir.mkdir(parents=True, exist_ok=True)
    pages = json.loads(pages_path.read_text(encoding="utf-8"))
    print(f"Processing {len(pages)} pages...")
    for page in pages:
        # Pages without a slug fall back to an id-based file name.
        slug = page.get("slug") or f"page-{page['id']}"
        content = page.get("content_raw", "")
        sections = parse_page_content(content) if content.strip() else []
        output = {
            "id": page["id"],
            "slug": slug,
            "title": page["title"],
            "post_type": page["post_type"],
            "seo_title": page.get("seo_title", ""),
            "seo_description": page.get("seo_description", ""),
            "seo_keywords": page.get("seo_keywords", ""),
            "acf": page.get("acf", {}),
            "date": page.get("date", ""),
            "modified": page.get("modified", ""),
            "sections": sections,
            "section_count": len(sections),
        }
        out_file = out_dir / f"{slug}.json"
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be written as UTF-8 explicitly; the platform default encoding may
        # not be able to represent them.
        out_file.write_text(
            json.dumps(output, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(f" {slug}.json ({len(sections)} sections)")
    print(f"\nDone. {len(pages)} content files in {out_dir}")


if __name__ == "__main__":
    main()