recent updates
This commit is contained in:
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
extract_nav.py — Extract WordPress navigation menus from database.sql dump.
|
||||
Outputs nav.json: [{label, href, display_order, is_cta}]
|
||||
|
||||
Usage: python3 extract_nav.py <wpress-extract-dir> <output-data-dir>
|
||||
"""
|
||||
import sys, re, json, os
|
||||
|
||||
CTA_KEYWORDS = {'book', 'get started', 'contact', 'sign up', 'register', 'join', 'buy', 'shop'}

# A keyword only counts when it starts on a word boundary, so labels such as
# "Facebook" or "Workshop" are not flagged merely for containing "book"/"shop".
_CTA_RE = re.compile(r"\b(?:" + "|".join(re.escape(kw) for kw in sorted(CTA_KEYWORDS)) + r")")

# Column order used when an INSERT statement carries no explicit column list
# (mysqldump-style "INSERT INTO `t` VALUES (...)").
_POSTS_COLUMNS = (
    'ID', 'post_author', 'post_date', 'post_date_gmt', 'post_content',
    'post_title', 'post_excerpt', 'post_status', 'comment_status',
    'ping_status', 'post_password', 'post_name', 'to_ping', 'pinged',
    'post_modified', 'post_modified_gmt', 'post_content_filtered',
    'post_parent', 'guid', 'menu_order', 'post_type', 'post_mime_type',
    'comment_count',
)
_POSTMETA_COLUMNS = ('meta_id', 'post_id', 'meta_key', 'meta_value')

# One SQL value followed by its delimiter: either a quoted string (backslash
# escapes and doubled quotes allowed) or a bare token such as a number or NULL.
_SQL_VALUE = re.compile(
    r"\s*(?:'([^'\\]*(?:(?:\\.|'')[^'\\]*)*)'|([^,()'\s][^,()']*))\s*([,)])",
    re.DOTALL,
)
_ROW_SEP = re.compile(r"\s*,\s*")
_SQL_ESCAPES = {'0': '\0', 'n': '\n', 'r': '\r', 't': '\t', 'b': '\b', 'Z': '\x1a'}


def _unescape_sql(text):
    """Decode backslash escapes and doubled quotes inside a SQL string body."""
    if '\\' not in text and "''" not in text:
        return text

    def _decode(match):
        char = match.group(1)
        if char is None:  # matched a doubled quote ('')
            return "'"
        return _SQL_ESCAPES.get(char, char)

    return re.sub(r"\\(.)|''", _decode, text, flags=re.DOTALL)


def _iter_rows(sql, table, default_columns):
    """Yield one {column: value} dict per row inserted into *table*.

    Values are strings (bare tokens are returned as written, e.g. ``'5'``);
    a bare ``NULL`` becomes ``None``. Parsing of a statement stops at the first
    value the tokenizer cannot read, so malformed input is skipped, not raised.
    """
    header = re.compile(
        r"INSERT INTO `" + re.escape(table) + r"`\s*(?:\(([^)]*)\)\s*)?VALUES\s*",
        re.IGNORECASE,
    )
    end = len(sql)
    for stmt in header.finditer(sql):
        listed = stmt.group(1)
        columns = re.findall(r"`([^`]+)`", listed) if listed else default_columns
        pos = stmt.end()
        while pos < end and sql[pos] == '(':
            pos += 1
            values = []
            complete = False
            while True:
                token = _SQL_VALUE.match(sql, pos)
                if token is None:
                    break
                quoted, bare, delimiter = token.groups()
                if quoted is not None:
                    values.append(_unescape_sql(quoted))
                else:
                    bare = bare.strip()
                    values.append(None if bare.upper() == 'NULL' else bare)
                pos = token.end()
                if delimiter == ')':
                    complete = True
                    break
            if not complete:
                break
            yield dict(zip(columns, values))
            separator = _ROW_SEP.match(sql, pos)
            if separator is None:  # ';' or anything else ends the statement
                break
            pos = separator.end()


def _bare_host(host):
    """Lower-case *host* and drop a leading ``www.`` for comparison."""
    host = host.lower()
    return host[4:] if host.startswith('www.') else host


def _site_host(sql):
    """Return the site's host from the ``home``/``siteurl`` option, or None."""
    match = re.search(r"'(?:home|siteurl)'\s*,\s*'https?://([^/'?#\\]+)", sql)
    return _bare_host(match.group(1)) if match else None


def _normalize_href(url, site_host):
    """Turn a menu URL into the href written to nav.json.

    Same-site absolute URLs become root-relative paths. When the site host is
    unknown every http(s) origin is stripped. External http(s) URLs and
    non-http schemes (``mailto:``, ``tel:`` ...) are returned unchanged.
    """
    url = url.strip()
    if not url:
        return '/'
    origin = re.match(r"https?://([^/?#]+)", url, re.IGNORECASE)
    if origin:
        if site_host is not None and _bare_host(origin.group(1)) != site_host:
            return url
        url = url[origin.end():] or '/'
    elif re.match(r"[A-Za-z][A-Za-z0-9+.\-]*:", url):
        return url
    if not url.startswith('/'):
        url = '/' + url
    return url


def _to_int(value, default=0):
    """Convert *value* to int, returning *default* when it is not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def extract_nav(extract_dir: str, data_dir: str) -> None:
    """Extract published navigation menu items from a WordPress SQL dump.

    Reads ``<extract_dir>/database.sql``, collects every ``nav_menu_item`` post
    whose status is ``publish``, attaches the ``_menu_item_url`` meta value as
    its link, and writes ``<data_dir>/nav.json`` as a list of
    ``{label, href, display_order, is_cta}`` objects sorted by menu order.
    A one-line summary plus one line per item is printed to stdout.

    Items with an empty title are skipped. ``display_order`` is numbered
    contiguously from 1 over the items actually written. When the dump holds
    no menu items an empty list is written.

    Args:
        extract_dir: Directory containing ``database.sql``.
        data_dir: Output directory; created if it does not exist.

    Raises:
        SystemExit: with status 1 when ``database.sql`` is missing.
    """
    sql_path = os.path.join(extract_dir, 'database.sql')
    if not os.path.exists(sql_path):
        print(f"ERROR: {sql_path} not found", file=sys.stderr)
        sys.exit(1)

    with open(sql_path, encoding='utf-8', errors='replace') as f:
        sql = f.read()

    # Detect table prefix
    prefix_match = re.search(r"INSERT INTO `(\w+)options`", sql)
    prefix = prefix_match.group(1) if prefix_match else 'wp_'

    # Collect published menu items from the posts table. Keys are the post IDs
    # exactly as written in the dump so they compare equal to postmeta.post_id.
    nav_posts = {}
    for row in _iter_rows(sql, prefix + 'posts', _POSTS_COLUMNS):
        if row.get('post_type') != 'nav_menu_item':
            continue
        if row.get('post_status') != 'publish':
            continue
        post_id = row.get('ID')
        if post_id is None:
            continue
        nav_posts[post_id] = {
            'label': row.get('post_title') or '',
            'href': '/',
            'menu_order': _to_int(row.get('menu_order')),
        }

    # Attach link targets (and an explicit order override, if one is present)
    # from postmeta.
    for row in _iter_rows(sql, prefix + 'postmeta', _POSTMETA_COLUMNS):
        item = nav_posts.get(row.get('post_id'))
        if item is None:
            continue
        key = row.get('meta_key')
        value = row.get('meta_value')
        if not value:
            continue
        if key == '_menu_item_url':
            item['href'] = value
        elif key == '_menu_item_menu_order' and re.fullmatch(r"[0-9]+", value):
            item['menu_order'] = int(value)

    site_host = _site_host(sql)

    # sorted() is stable, so items sharing a menu_order keep their dump order.
    items = []
    for item in sorted(nav_posts.values(), key=lambda entry: entry['menu_order']):
        label = item['label'].strip()
        if not label:
            continue
        items.append({
            'label': label,
            'href': _normalize_href(item['href'], site_host),
            # Numbered over emitted items only, so skipped entries leave no gaps.
            'display_order': len(items) + 1,
            'is_cta': 1 if _CTA_RE.search(label.lower()) else 0,
        })

    os.makedirs(data_dir, exist_ok=True)
    out_path = os.path.join(data_dir, 'nav.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(items, f, indent=2, ensure_ascii=False)

    print(f"nav.json: {len(items)} items → {out_path}")
    for item in items:
        print(f" {'[CTA]' if item['is_cta'] else ' '} {item['label']} → {item['href']}")
|
||||
|
||||
if __name__ == '__main__':
    # Exactly two positional arguments are required: the directory holding
    # database.sql and the directory that will receive nav.json.
    if len(sys.argv) != 3:
        # Usage errors go to stderr, matching the error reporting in extract_nav.
        print("Usage: python3 extract_nav.py <wpress-extract-dir> <output-data-dir>", file=sys.stderr)
        sys.exit(1)
    extract_nav(sys.argv[1], sys.argv[2])
|
||||
Reference in New Issue
Block a user