Files
arisingmedia-web-sops/wp-divi-pipeline-to-am-stack/scripts/extract_nav.py
T
2026-06-09 18:31:59 +02:00

100 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""
extract_nav.py — Extract WordPress navigation menus from database.sql dump.
Outputs nav.json: [{label, href, display_order, is_cta}]
Usage: python3 extract_nav.py <wpress-extract-dir> <output-data-dir>
"""
import sys, re, json, os
# Labels containing any of these (case-insensitive substring match) are flagged is_cta=1.
CTA_KEYWORDS = {'book', 'get started', 'contact', 'sign up', 'register', 'join', 'buy', 'shop'}

# Column order of the stock WordPress tables; used when an INSERT statement
# carries no explicit column list (the mysqldump default).
_POSTS_COLUMNS = (
    'ID', 'post_author', 'post_date', 'post_date_gmt', 'post_content',
    'post_title', 'post_excerpt', 'post_status', 'comment_status',
    'ping_status', 'post_password', 'post_name', 'to_ping', 'pinged',
    'post_modified', 'post_modified_gmt', 'post_content_filtered',
    'post_parent', 'guid', 'menu_order', 'post_type', 'post_mime_type',
    'comment_count',
)
_POSTMETA_COLUMNS = ('meta_id', 'post_id', 'meta_key', 'meta_value')
_OPTIONS_COLUMNS = ('option_id', 'option_name', 'option_value', 'autoload')

# One SQL value followed by its terminator: group 1 = body of a quoted string
# (still escaped), group 2 = bare token (number / NULL), group 3 = ',' or ')'.
_SQL_VALUE = re.compile(
    r"\s*(?:'([^'\\]*(?:(?:\\.|'')[^'\\]*)*)'|([^,()']+))\s*([,)])",
    re.DOTALL,
)
_ROW_GAP = re.compile(r"\s*,\s*")
_SQL_ESCAPE = re.compile(r"\\(.)|''", re.DOTALL)
_SQL_ESCAPE_CHARS = {'n': '\n', 'r': '\r', 't': '\t', '0': '\0', 'Z': '\x1a', 'b': '\b'}
_ABSOLUTE_URL = re.compile(r'^https?://([^/?#]*)', re.IGNORECASE)
_NON_HTTP_SCHEME = re.compile(r'^(?:mailto|tel|sms):', re.IGNORECASE)


def _sql_unescape(raw):
    """Undo SQL string-literal escaping (backslash escapes, doubled quotes); None -> ''."""
    if raw is None:
        return ''
    if '\\' not in raw and "''" not in raw:
        return raw
    return _SQL_ESCAPE.sub(
        lambda m: "'" if m.group(1) is None else _SQL_ESCAPE_CHARS.get(m.group(1), m.group(1)),
        raw,
    )


def _to_int(value, default=0):
    """Convert a raw SQL token to int, returning *default* when it is not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _iter_rows(sql, table, default_columns):
    """Yield a {column: raw_value} dict for every row inserted into *table*.

    Values are parsed quote-aware, so ';', ',' and parentheses inside string
    values do not break row boundaries. Quoted values are yielded still
    escaped (callers unescape only what they need); NULL becomes None.
    """
    head = re.compile(
        r"INSERT\s+(?:IGNORE\s+)?INTO\s+`%s`\s*(?:\(([^)]*)\)\s*)?VALUES\s*" % re.escape(table),
        re.IGNORECASE,
    )
    end = len(sql)
    pos = 0
    while True:
        stmt = head.search(sql, pos)
        if stmt is None:
            return
        pos = stmt.end()
        if stmt.group(1) is None:
            columns = default_columns
        else:
            columns = tuple(c.strip().strip('`') for c in stmt.group(1).split(','))
        # One statement may carry many "(...)" tuples separated by commas.
        while pos < end and sql[pos] == '(':
            pos += 1
            fields = []
            while True:
                value = _SQL_VALUE.match(sql, pos)
                if value is None:
                    fields = None  # malformed tuple: abandon this statement
                    break
                quoted, bare, closer = value.groups()
                if quoted is not None:
                    fields.append(quoted)
                else:
                    bare = bare.strip()
                    fields.append(None if bare.upper() == 'NULL' else bare)
                pos = value.end()
                if closer == ')':
                    break
            if fields is None:
                break
            if len(fields) >= len(columns):
                yield dict(zip(columns, fields))
            gap = _ROW_GAP.match(sql, pos)
            if gap is None:
                break  # end of statement
            pos = gap.end()


def _bare_host(host):
    """Lower-case a URL host and drop a leading 'www.' so host variants compare equal."""
    host = host.lower()
    return host[4:] if host.startswith('www.') else host


def _site_host(sql, prefix):
    """Return the host of the first `home`/`siteurl` option row, or None if not found."""
    for row in _iter_rows(sql, prefix + 'options', _OPTIONS_COLUMNS):
        if row.get('option_name') in ('home', 'siteurl'):
            found = _ABSOLUTE_URL.match(_sql_unescape(row.get('option_value')).strip())
            if found and found.group(1):
                return _bare_host(found.group(1))
    return None


def _normalise_href(href, site_host):
    """Make same-site URLs root-relative; leave external and mailto:/tel:/sms: links intact."""
    href = href.strip()
    if _NON_HTTP_SCHEME.match(href):
        return href
    absolute = _ABSOLUTE_URL.match(href)
    if absolute:
        if site_host is not None and _bare_host(absolute.group(1)) != site_host:
            return href  # link to another site: keep it absolute
        # Same site, or site host unknown (then every http(s) URL is made relative).
        href = href[absolute.end():]
    href = href or '/'
    if not href.startswith('/'):
        href = '/' + href
    return href


def extract_nav(extract_dir: str, data_dir: str) -> list:
    """Extract published navigation menu items from a WordPress SQL dump.

    Reads ``<extract_dir>/database.sql``, collects every ``nav_menu_item`` post
    with status ``publish``, attaches its ``_menu_item_url`` meta value, and
    writes ``<data_dir>/nav.json`` as a list of
    ``{label, href, display_order, is_cta}`` objects ordered by menu order.

    Args:
        extract_dir: Directory containing ``database.sql``.
        data_dir: Output directory; created if missing.

    Returns:
        The list of items that was written to ``nav.json``.

    Exits the process with status 1 if ``database.sql`` does not exist.
    """
    sql_path = os.path.join(extract_dir, 'database.sql')
    if not os.path.exists(sql_path):
        print(f"ERROR: {sql_path} not found", file=sys.stderr)
        sys.exit(1)
    with open(sql_path, encoding='utf-8', errors='replace') as f:
        sql = f.read()

    # Detect table prefix from the options table; fall back to the WordPress default.
    prefix_match = re.search(r"INSERT INTO `(\w+)options`", sql)
    prefix = prefix_match.group(1) if prefix_match else 'wp_'

    # Published menu items, keyed by post ID (kept as the raw string token).
    # NOTE(review): items of every menu in the dump end up in one list; confirm
    # that merging menus is intended.
    nav_posts = {}
    for row in _iter_rows(sql, prefix + 'posts', _POSTS_COLUMNS):
        if row.get('post_type') != 'nav_menu_item' or row.get('post_status') != 'publish':
            continue
        post_id = row.get('ID')
        if post_id is None:
            continue
        nav_posts[post_id] = {
            'label': _sql_unescape(row.get('post_title')),
            'href': '/',
            'menu_order': _to_int(row.get('menu_order')),
        }

    if nav_posts:
        # postmeta rows are (meta_id, post_id, meta_key, meta_value): match on
        # post_id, not on the leading meta_id.
        # NOTE(review): WordPress appears to store _menu_item_url only for
        # custom links; items without it keep the '/' default. Confirm.
        for row in _iter_rows(sql, prefix + 'postmeta', _POSTMETA_COLUMNS):
            item = nav_posts.get(row.get('post_id'))
            if item is None:
                continue
            key = row.get('meta_key')
            value = _sql_unescape(row.get('meta_value'))
            if key == '_menu_item_url' and value:
                item['href'] = value
            elif key == '_menu_item_menu_order':
                item['menu_order'] = _to_int(value, item['menu_order'])
    else:
        print(f"WARNING: no published nav_menu_item rows found in {sql_path}", file=sys.stderr)

    site_host = _site_host(sql, prefix)

    items = []
    # sorted() is stable, so items sharing a menu_order keep their dump order.
    for _post_id, item in sorted(nav_posts.items(), key=lambda x: x[1].get('menu_order', 0)):
        label = item['label'].strip()
        if not label:
            continue
        is_cta = 1 if any(kw in label.lower() for kw in CTA_KEYWORDS) else 0
        items.append({
            'label': label,
            'href': _normalise_href(item['href'], site_host),
            # Number only the items actually emitted so the sequence has no gaps.
            'display_order': len(items) + 1,
            'is_cta': is_cta
        })

    os.makedirs(data_dir, exist_ok=True)
    out_path = os.path.join(data_dir, 'nav.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(items, f, indent=2, ensure_ascii=False)
    print(f"nav.json: {len(items)} items → {out_path}")
    for item in items:
        print(f"  {'[CTA]' if item['is_cta'] else '     '} {item['label']} → {item['href']}")
    return items
if __name__ == '__main__':
    # Exactly two positional arguments are required: the extract directory
    # and the output data directory.
    if len(sys.argv) != 3:
        # sys.exit() with a string prints it to stderr and exits with status 1,
        # keeping the usage error off stdout like the script's other error path.
        sys.exit("Usage: python3 extract_nav.py <wpress-extract-dir> <output-data-dir>")
    extract_nav(sys.argv[1], sys.argv[2])