#!/usr/bin/env python3
"""
extract_nav.py — Extract WordPress navigation menus from a database.sql dump.
Outputs nav.json: [{label, href, display_order, is_cta}]

Usage: python3 extract_nav.py <wpress-extract-dir> <output-data-dir>
"""
|
|
import sys, re, json, os
|
|
|
|
CTA_KEYWORDS = {'book', 'get started', 'contact', 'sign up', 'register', 'join', 'buy', 'shop'}

# Column order of the core WordPress tables. It is used whenever an INSERT
# statement carries no explicit column list (the usual dump format).
_POSTS_COLUMNS = (
    'ID', 'post_author', 'post_date', 'post_date_gmt', 'post_content',
    'post_title', 'post_excerpt', 'post_status', 'comment_status',
    'ping_status', 'post_password', 'post_name', 'to_ping', 'pinged',
    'post_modified', 'post_modified_gmt', 'post_content_filtered',
    'post_parent', 'guid', 'menu_order', 'post_type', 'post_mime_type',
    'comment_count',
)
_POSTMETA_COLUMNS = ('meta_id', 'post_id', 'meta_key', 'meta_value')
_OPTIONS_COLUMNS = ('option_id', 'option_name', 'option_value', 'autoload')

# One value of a VALUES tuple plus the separator that follows it. Quoted
# strings may contain backslash escapes, doubled quotes, commas, parentheses
# and semicolons, so they are matched as a unit rather than split on ',' / ';'.
_SQL_VALUE_RE = re.compile(
    r"\s*(?:'(?P<str>[^'\\]*(?:(?:\\.|'')[^'\\]*)*)'|(?P<raw>[^,()']*))\s*(?P<sep>[,)])",
    re.DOTALL,
)
_ROW_GAP_RE = re.compile(r"\s*,\s*")
_SQL_ESCAPE_RE = re.compile(r"\\(.)|''", re.DOTALL)
_SQL_ESCAPES = {'0': '\0', 'b': '\b', 'n': '\n', 'r': '\r', 't': '\t', 'Z': '\x1a'}
_ABS_URL_RE = re.compile(r"https?://([^/?#]+)", re.IGNORECASE)
_NON_WEB_SCHEME_RE = re.compile(r"(?:mailto|tel|sms):", re.IGNORECASE)


def _sql_unescape(text):
    """Decode the escape sequences of a SQL string literal (None becomes '')."""
    if not text:
        return ''

    def _decode(match):
        escaped = match.group(1)
        if escaped is None:  # the pattern matched a doubled quote ('')
            return "'"
        return _SQL_ESCAPES.get(escaped, escaped)

    return _SQL_ESCAPE_RE.sub(_decode, text)


def _read_tuple(sql, pos):
    """Parse one VALUES tuple starting just after its '('.

    Returns ``(values, end_pos)``; ``values`` is None when the text at *pos*
    is not a well-formed tuple. Quoted strings are returned still escaped,
    bare tokens are stripped, and a bare NULL becomes None.
    """
    values = []
    while True:
        token = _SQL_VALUE_RE.match(sql, pos)
        if token is None:
            return None, pos
        text = token.group('str')
        if text is None:
            bare = token.group('raw').strip()
            text = None if bare.upper() == 'NULL' else bare
        values.append(text)
        pos = token.end()
        if token.group('sep') == ')':
            return values, pos


def _iter_rows(sql, table, default_columns):
    """Yield every row inserted into *table* as a ``{column: value}`` dict."""
    head_re = re.compile(
        r"INSERT\s+INTO\s+`%s`\s*(?:\(([^)]*)\)\s*)?VALUES\s*" % re.escape(table),
        re.IGNORECASE,
    )
    pos = 0
    while True:
        # Resume the search after the rows already consumed, so SQL-looking
        # text inside a string value is never mistaken for a new statement.
        head = head_re.search(sql, pos)
        if head is None:
            return
        if head.group(1):
            columns = [name.strip(' `\t\r\n') for name in head.group(1).split(',')]
        else:
            columns = default_columns
        pos = head.end()
        while sql.startswith('(', pos):
            values, pos = _read_tuple(sql, pos + 1)
            if values is None:
                break
            yield dict(zip(columns, values))
            gap = _ROW_GAP_RE.match(sql, pos)
            if gap is None:  # reached the ';' that ends the statement
                break
            pos = gap.end()


def _to_int(text, default=0):
    """Convert *text* to int, returning *default* when it is not a number."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return default


def _normalize_host(host):
    """Lower-case a host name and drop a leading 'www.' for comparison."""
    host = host.lower()
    return host[4:] if host.startswith('www.') else host


def _site_host(sql, prefix):
    """Return the site's own host from the 'home'/'siteurl' option, or None."""
    for row in _iter_rows(sql, prefix + 'options', _OPTIONS_COLUMNS):
        if row.get('option_name') in ('home', 'siteurl'):
            found = _ABS_URL_RE.match(_sql_unescape(row.get('option_value')).strip())
            if found:
                return _normalize_host(found.group(1))
    return None


def _normalize_href(href, site_host):
    """Make a same-site URL root-relative; leave external links untouched."""
    href = href.strip()
    absolute = _ABS_URL_RE.match(href)
    if absolute:
        # Only the leading scheme+host is stripped, so URLs embedded in a
        # query string survive. When the site host is unknown every host is
        # stripped, which is what the previous implementation always did.
        if site_host is not None and _normalize_host(absolute.group(1)) != site_host:
            return href
        href = href[absolute.end():]
    elif _NON_WEB_SCHEME_RE.match(href):
        return href  # mailto:/tel:/sms: links are not paths
    if not href.startswith('/'):
        href = '/' + href
    return href


def extract_nav(extract_dir: str, data_dir: str) -> None:
    """Extract published nav menu items from a WordPress SQL dump into nav.json.

    Reads ``<extract_dir>/database.sql``, collects every row of the posts
    table whose ``post_type`` is ``nav_menu_item`` and whose ``post_status``
    is ``publish``, attaches the ``_menu_item_url`` post meta as the link
    target, and writes ``<data_dir>/nav.json`` as a list of
    ``{label, href, display_order, is_cta}`` objects sorted by menu order.

    Args:
        extract_dir: Directory containing ``database.sql``.
        data_dir: Output directory; created if it does not exist.

    Exits the process with status 1 (message on stderr) when the dump is
    missing.

    Limitations: items from all menus are merged into one list, and items
    whose own title is empty are skipped because the title of the linked
    page/term is not resolved here.
    """
    sql_path = os.path.join(extract_dir, 'database.sql')
    if not os.path.exists(sql_path):
        print(f"ERROR: {sql_path} not found", file=sys.stderr)
        sys.exit(1)

    with open(sql_path, encoding='utf-8', errors='replace') as f:
        sql = f.read()

    # Detect table prefix
    prefix_match = re.search(r"INSERT INTO `(\w+)options`", sql)
    prefix = prefix_match.group(1) if prefix_match else 'wp_'

    # Published menu items, keyed by post ID (kept as the dump's text form).
    nav_posts = {}
    for row in _iter_rows(sql, prefix + 'posts', _POSTS_COLUMNS):
        if row.get('post_type') != 'nav_menu_item' or row.get('post_status') != 'publish':
            continue
        nav_posts[row.get('ID')] = {
            'label': _sql_unescape(row.get('post_title')),
            'href': '/',
            'menu_order': _to_int(row.get('menu_order')),
        }

    # Link targets live in postmeta. The row layout is
    # (meta_id, post_id, meta_key, meta_value): the item is identified by the
    # SECOND column, not the first.
    for row in _iter_rows(sql, prefix + 'postmeta', _POSTMETA_COLUMNS):
        item = nav_posts.get(row.get('post_id'))
        if item is None:
            continue
        meta_key = row.get('meta_key')
        meta_value = _sql_unescape(row.get('meta_value'))
        if meta_key == '_menu_item_url' and meta_value:
            item['href'] = meta_value
        elif meta_key == '_menu_item_menu_order':
            # Kept for backward compatibility: overrides the posts-table
            # order only when the dump actually carries this key.
            item['menu_order'] = _to_int(meta_value, item['menu_order'])

    site_host = _site_host(sql, prefix)

    # Keywords must start at a word boundary so that e.g. "Facebook" or
    # "Workshops" are not flagged because they merely contain "book"/"shop".
    cta_re = None
    if CTA_KEYWORDS:
        cta_re = re.compile(
            r"\b(?:%s)" % "|".join(re.escape(kw) for kw in sorted(CTA_KEYWORDS)),
            re.IGNORECASE,
        )

    items = []
    # sorted() is stable, so items with equal menu_order keep dump order.
    for item in sorted(nav_posts.values(), key=lambda entry: entry['menu_order']):
        label = item['label'].strip()
        if not label:
            continue
        items.append({
            'label': label,
            'href': _normalize_href(item['href'], site_host),
            # Numbered after filtering, so the sequence has no gaps.
            'display_order': len(items) + 1,
            'is_cta': 1 if cta_re is not None and cta_re.search(label) else 0,
        })

    os.makedirs(data_dir, exist_ok=True)
    out_path = os.path.join(data_dir, 'nav.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(items, f, indent=2, ensure_ascii=False)

    print(f"nav.json: {len(items)} items → {out_path}")
    for item in items:
        print(f" {'[CTA]' if item['is_cta'] else ' '} {item['label']} → {item['href']}")
|
|
|
|
if __name__ == '__main__':
    # Script entry point: exactly two positional arguments are required.
    if len(sys.argv) != 3:
        # Usage errors go to stderr, like the missing-dump error printed by
        # extract_nav, so stdout only ever carries the extraction report.
        print("Usage: python3 extract_nav.py <wpress-extract-dir> <output-data-dir>", file=sys.stderr)
        sys.exit(1)
    extract_nav(sys.argv[1], sys.argv[2])
|