#!/usr/bin/env python3
"""
extract_nav.py — Extract WordPress navigation menus from database.sql dump.
Outputs nav.json: [{label, href, display_order, is_cta}]

Usage: python3 extract_nav.py <extract_dir> <data_dir>
"""
import json
import os
import re
import sys
from urllib.parse import urlsplit

CTA_KEYWORDS = {'book', 'get started', 'contact', 'sign up', 'register', 'join', 'buy', 'shop'}

# Standard WordPress column order. Used only when an INSERT statement carries
# no explicit column list and the dump has no usable CREATE TABLE for the table.
_POSTS_COLUMNS = (
    'ID', 'post_author', 'post_date', 'post_date_gmt', 'post_content',
    'post_title', 'post_excerpt', 'post_status', 'comment_status',
    'ping_status', 'post_password', 'post_name', 'to_ping', 'pinged',
    'post_modified', 'post_modified_gmt', 'post_content_filtered',
    'post_parent', 'guid', 'menu_order', 'post_type', 'post_mime_type',
    'comment_count',
)
_POSTMETA_COLUMNS = ('meta_id', 'post_id', 'meta_key', 'meta_value')
_OPTIONS_COLUMNS = ('option_id', 'option_name', 'option_value', 'autoload')

_MENU_META_KEYS = frozenset({
    '_menu_item_url', '_menu_item_type', '_menu_item_object_id',
    '_menu_item_menu_order',
})
_SITE_OPTION_KEYS = frozenset({'siteurl', 'home', 'show_on_front', 'page_on_front'})

# One SQL token: a quoted string (backslash escapes and doubled quotes allowed),
# a punctuation character, or a bare word such as a number or NULL.
_TOKEN_RE = re.compile(
    r"'(?P<str>[^'\\]*(?:(?:\\.|'')[^'\\]*)*)'"
    r"|(?P<punct>[(),;])"
    r"|(?P<bare>[^\s(),;']+)",
    re.DOTALL,
)
_ESCAPE_RE = re.compile(r"\\(.)|''", re.DOTALL)
# Backslash escapes whose meaning is not simply "the following character".
_SQL_ESCAPES = {
    '0': '\0', 'b': '\b', 'n': '\n', 'r': '\r', 't': '\t', 'Z': '\x1a',
    '%': '\\%', '_': '\\_',
}


def _unescape(raw):
    """Decode the body of a MySQL string literal (without its outer quotes)."""
    if '\\' not in raw and "''" not in raw:
        return raw

    def _decode(match):
        char = match.group(1)
        if char is None:  # matched the doubled-quote alternative
            return "'"
        return _SQL_ESCAPES.get(char, char)

    return _ESCAPE_RE.sub(_decode, raw)


def _parse_values(sql, pos):
    """Parse the row tuples of one INSERT statement, starting at *pos*.

    Returns ``(rows, end)``: *rows* is a list of value lists (strings are
    unescaped, NULL becomes None, other bare tokens stay as text) and *end* is
    the offset just past the terminating ``;`` (or ``len(sql)`` if none).
    Semicolons, commas and parentheses inside string literals are part of the
    string token and therefore never end a row or the statement.
    """
    rows = []
    row = None
    depth = 0
    expecting = False  # True right after "(" or ",": next value starts a column
    for tok in _TOKEN_RE.finditer(sql, pos):
        kind = tok.lastgroup
        if kind == 'punct':
            char = tok.group()
            if char == '(':
                depth += 1
                if depth == 1:
                    row, expecting = [], True
            elif char == ')':
                if depth == 1:
                    rows.append(row)
                    row = None
                depth = max(depth - 1, 0)
            elif char == ',':
                if depth == 1:
                    expecting = True
            elif depth == 0:  # ';' outside any row ends the statement
                return rows, tok.end()
        elif depth == 1:
            if kind == 'str':
                value = _unescape(tok.group('str'))
            else:
                text = tok.group()
                value = None if text.upper() == 'NULL' else text
            if expecting:
                row.append(value)
                expecting = False
            elif row:
                # A second token in the same column (e.g. a charset introducer
                # followed by the literal): keep the last one.
                row[-1] = value
    return rows, len(sql)


def _declared_columns(sql, table):
    """Return column names from the table's CREATE TABLE statement, or None.

    Only lines that start with a backtick-quoted identifier are treated as
    column definitions, so KEY / PRIMARY KEY lines are ignored.
    """
    header = re.search(
        r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?`%s`\s*\(" % re.escape(table),
        sql, re.IGNORECASE,
    )
    if header is None:
        return None
    end = sql.find(';', header.end())
    body = sql[header.end():end if end != -1 else len(sql)]
    return re.findall(r"^\s*`(\w+)`", body, re.MULTILINE) or None


def _iter_records(sql, table, default_columns):
    """Yield every row inserted into *table* as a ``{column: value}`` dict.

    Column names come from, in order of preference: the INSERT's own column
    list, the CREATE TABLE statement, then *default_columns*. A source is only
    used when its length matches the row; rows matching none are skipped.
    """
    declared = _declared_columns(sql, table)
    insert_re = re.compile(
        r"INSERT\s+(?:IGNORE\s+)?INTO\s+`?%s`?\s*(?:\(([^)]*)\)\s*)?VALUES\b"
        % re.escape(table),
        re.IGNORECASE,
    )
    pos = 0
    while True:
        stmt = insert_re.search(sql, pos)
        if stmt is None:
            return
        listed = re.findall(r"\w+", stmt.group(1)) if stmt.group(1) else None
        # Resume searching after the statement so text inside its string
        # values can never be mistaken for another INSERT.
        rows, pos = _parse_values(sql, stmt.end())
        for row in rows:
            for columns in (listed, declared, default_columns):
                if columns and len(columns) == len(row):
                    yield dict(zip(columns, row))
                    break


def _to_int(value, default=0):
    """Convert *value* to int, returning *default* when that is impossible."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _load_options(sql, prefix):
    """Return the site options this script needs as ``{name: value}``."""
    options = {}
    for rec in _iter_records(sql, prefix + 'options', _OPTIONS_COLUMNS):
        name = rec.get('option_name')
        if name in _SITE_OPTION_KEYS:
            options[name] = rec.get('option_value') or ''
    return options


def _load_posts(sql, prefix):
    """Return ``{post_id: summary}`` for every row of the posts table.

    Only the fields needed to build the menu are kept (post content is
    dropped). All posts are loaded, not just menu items, because a menu item
    may take its label and slug from the page or post it points at.
    """
    posts = {}
    for rec in _iter_records(sql, prefix + 'posts', _POSTS_COLUMNS):
        post_id = rec.get('ID')
        if post_id is None:
            continue
        posts[str(post_id)] = {
            'title': rec.get('post_title') or '',
            'slug': rec.get('post_name') or '',
            'parent': str(rec.get('post_parent') or '0'),
            'status': rec.get('post_status') or '',
            'type': rec.get('post_type') or '',
            'menu_order': _to_int(rec.get('menu_order')),
            'guid': rec.get('guid') or '',
        }
    return posts


def _load_menu_meta(sql, prefix, nav_ids):
    """Return ``{post_id: {meta_key: value}}`` for the given menu item ids."""
    meta = {}
    for rec in _iter_records(sql, prefix + 'postmeta', _POSTMETA_COLUMNS):
        # The owning post is the post_id column, not the row's own meta_id.
        post_id = str(rec.get('post_id'))
        key = rec.get('meta_key')
        if post_id in nav_ids and key in _MENU_META_KEYS:
            meta.setdefault(post_id, {})[key] = rec.get('meta_value') or ''
    return meta


def _host_of(url):
    """Return the lower-cased host of *url* without a leading ``www.``, or ''."""
    try:
        host = urlsplit(url or '').hostname
    except ValueError:
        return ''
    host = (host or '').lower()
    return host[4:] if host.startswith('www.') else host


def _clean_href(href, site_hosts):
    """Make *href* site-relative when it points at one of *site_hosts*.

    Links to other hosts and links with a non-HTTP scheme (``mailto:``,
    ``tel:`` ...) are returned unchanged; anything else is forced to start
    with ``/``.
    """
    href = href.strip()
    try:
        parts = urlsplit(href)
    except ValueError:
        return href
    if parts.netloc and parts.scheme in ('', 'http', 'https'):
        if _host_of(href) not in site_hosts:
            return href
        relative = '/' + parts.path.lstrip('/')
        if parts.query:
            relative += '?' + parts.query
        if parts.fragment:
            relative += '#' + parts.fragment
        return relative
    if parts.scheme:
        return href
    return href if href.startswith('/') else '/' + href


def _post_path(posts, post_id):
    """Build ``/parent-slug/child-slug/`` for *post_id* by following parents.

    Best effort: this assumes slug-based permalinks; the site's actual
    permalink structure is not read from the dump.
    """
    slugs = []
    seen = set()
    current = post_id
    while current in posts and current not in seen:  # `seen` guards cycles
        seen.add(current)
        if posts[current]['slug']:
            slugs.append(posts[current]['slug'])
        current = posts[current]['parent']
    return '/' + '/'.join(reversed(slugs)) + '/' if slugs else '/'


def _is_cta(label):
    """Return True when a word in *label* starts with one of CTA_KEYWORDS.

    Matching at a word start keeps "Book now" / "Booking" while no longer
    flagging labels that merely contain a keyword ("Facebook", "Workshop").
    """
    lowered = label.lower()
    return any(re.search(r"\b" + re.escape(kw), lowered) for kw in CTA_KEYWORDS)


def extract_nav(extract_dir: str, data_dir: str):
    """Extract published nav menu items from a WordPress dump into nav.json.

    Reads ``<extract_dir>/database.sql``, collects every published
    ``nav_menu_item`` post (items of all menus are merged into one flat list),
    and writes ``<data_dir>/nav.json`` as a list of
    ``{label, href, display_order, is_cta}`` objects ordered by menu order.

    An item's label is its own title, or the title of the post it references
    when its own title is empty; items with no label are skipped. Its href is
    the custom URL if set, otherwise a slug path for the referenced post,
    otherwise ``/``. URLs on the site's own host are made relative.

    Args:
        extract_dir: Directory containing ``database.sql``.
        data_dir: Output directory; created if missing.

    Returns:
        The list of item dicts that was written to nav.json.

    Raises:
        SystemExit: With code 1 when ``database.sql`` does not exist.
    """
    sql_path = os.path.join(extract_dir, 'database.sql')
    if not os.path.exists(sql_path):
        print(f"ERROR: {sql_path} not found", file=sys.stderr)
        sys.exit(1)

    with open(sql_path, encoding='utf-8', errors='replace') as f:
        sql = f.read()

    # Detect table prefix
    prefix_match = re.search(r"INSERT INTO `(\w+)options`", sql)
    prefix = prefix_match.group(1) if prefix_match else 'wp_'

    options = _load_options(sql, prefix)
    posts = _load_posts(sql, prefix)
    nav_ids = {
        pid for pid, post in posts.items()
        if post['type'] == 'nav_menu_item' and post['status'] == 'publish'
    }
    meta = _load_menu_meta(sql, prefix, nav_ids)

    # Hosts considered "this site": the configured URLs plus the hosts in the
    # menu items' guid values. If none can be found, absolute URLs are left
    # untouched rather than guessing.
    site_hosts = {_host_of(options.get('siteurl')), _host_of(options.get('home'))}
    site_hosts.update(_host_of(posts[pid]['guid']) for pid in nav_ids)
    site_hosts.discard('')

    front_page_id = None
    if options.get('show_on_front') == 'page':
        front_page_id = options.get('page_on_front')

    def _sort_key(pid):
        # Order comes from the posts table's menu_order column. A
        # '_menu_item_menu_order' meta value, if a dump has one, still
        # overrides it as it did before this rewrite.
        override = meta.get(pid, {}).get('_menu_item_menu_order')
        order = posts[pid]['menu_order']
        if override:
            order = _to_int(override, order)
        return (order, _to_int(pid))

    items = []
    for pid in sorted(nav_ids, key=_sort_key):
        fields = meta.get(pid, {})
        target_id = None
        if fields.get('_menu_item_type') == 'post_type':
            candidate = str(fields.get('_menu_item_object_id', ''))
            if candidate in posts:
                target_id = candidate

        label = posts[pid]['title'].strip()
        if not label and target_id is not None:
            label = posts[target_id]['title'].strip()
        if not label:
            continue

        url = fields.get('_menu_item_url', '').strip()
        if url:
            href = _clean_href(url, site_hosts)
        elif target_id is not None and target_id != front_page_id:
            href = _post_path(posts, target_id)
        else:
            href = '/'

        items.append({
            'label': label,
            'href': href,
            # Numbered over emitted items only, so skipped rows leave no gaps.
            'display_order': len(items) + 1,
            'is_cta': 1 if _is_cta(label) else 0,
        })

    os.makedirs(data_dir, exist_ok=True)
    out_path = os.path.join(data_dir, 'nav.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(items, f, indent=2, ensure_ascii=False)

    print(f"nav.json: {len(items)} items → {out_path}")
    for item in items:
        print(f" {'[CTA]' if item['is_cta'] else ' '} {item['label']} → {item['href']}")
    return items


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: python3 extract_nav.py <extract_dir> <data_dir>", file=sys.stderr)
        sys.exit(1)
    extract_nav(sys.argv[1], sys.argv[2])