#!/usr/bin/env python3
"""Build a SQLite database (with an FTS5 index) from Markdown SOP files.

Every top-level ``*.md`` file in ``SOP_DIR`` (except a small skip list) is
stored as one row in ``sops``, split into ``##``/``###`` sections stored in
``sop_sections``, and scanned for bullet-point rules stored in ``rules``.
Finally the sections are copied into the ``sop_fts`` full-text table.

The database schema is dropped and recreated on every run.
"""
import glob
import os
import re
import sqlite3
import sys
import traceback
from datetime import datetime

DB_PATH = "/home/sirdrez/arisingmedia-websites/.am-webdesign-sops/sops.db"
SOP_DIR = "/home/sirdrez/arisingmedia-websites/.am-webdesign-sops"

# Files in SOP_DIR that are never loaded as SOPs.
_SKIPPED_FILES = frozenset({'README.md', 'STACK.md', 'CONTENT.md', 'OPTIMIZATION.md'})

# Leading digits of a filename, e.g. "00" in "00-stack-philosophy.md".
_NUMBER_PREFIX_RE = re.compile(r'^(\d+)')

# A Markdown bullet: "-" or "*" followed by whitespace and some text.
# Requiring the whitespace keeps "**bold**" and "---" from looking like bullets.
_BULLET_RE = re.compile(r'^[-*]\s+(.+)$')

# A Markdown horizontal rule such as "---", "***" or "* * *".
_HORIZONTAL_RULE_RE = re.compile(r'^(?:[-*_][ \t]*){3,}$')

# Executed in order by init_db(); drops come first, in reverse dependency order.
_SCHEMA_STATEMENTS = (
    "DROP TABLE IF EXISTS sop_fts",
    "DROP TABLE IF EXISTS rules",
    "DROP TABLE IF EXISTS sop_sections",
    "DROP TABLE IF EXISTS sops",
    """
    CREATE TABLE sops (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        number TEXT,
        filename TEXT,
        title TEXT,
        full_content TEXT,
        updated_at TEXT
    )
    """,
    """
    CREATE TABLE sop_sections (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        sop_id INTEGER REFERENCES sops(id),
        heading_level INTEGER,
        title TEXT,
        content TEXT
    )
    """,
    """
    CREATE TABLE rules (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        category TEXT,
        rule TEXT,
        source_sop TEXT,
        source_section TEXT
    )
    """,
    """
    CREATE VIRTUAL TABLE sop_fts USING fts5(
        sop_number,
        sop_title,
        section_title,
        content
    )
    """,
)


def init_db(db_path=DB_PATH):
    """Initialize database with fresh schema.

    Args:
        db_path: Path of the SQLite file to open. Defaults to ``DB_PATH``.

    Returns:
        An open ``sqlite3.Connection`` whose tables have just been dropped
        and recreated. The caller is responsible for closing it.

    Raises:
        sqlite3.Error: If connecting or creating the schema fails. The
            connection is closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for statement in _SCHEMA_STATEMENTS:
            cursor.execute(statement)
        conn.commit()
    except Exception:
        # Do not leak the handle when schema creation fails part-way.
        conn.close()
        raise
    return conn


def extract_number_from_filename(filename: str) -> str:
    """Extract number prefix from filename (e.g., '00' from '00-stack-philosophy.md').

    Returns an empty string when the filename does not start with a digit.
    """
    match = _NUMBER_PREFIX_RE.match(filename)
    return match.group(1) if match else ""


def extract_first_heading(content: str) -> str:
    """Return the text of the first line starting with ``#``, without the hashes.

    Returns an empty string when no line starts with ``#``.
    """
    for line in content.split('\n'):
        if line.startswith('#'):
            return line.lstrip('#').strip()
    return ""


def split_into_sections(content: str) -> list:
    """Split content into sections by ## or ### headings.

    Text before the first ``##`` heading (including a ``#`` title line) is
    not part of any section and is discarded.

    Returns:
        A list of dicts with keys ``heading_level`` (2 or 3), ``title`` and
        ``content`` (the stripped text between this heading and the next).
    """
    sections = []
    current = None
    body_lines = []

    def _finish():
        # Attach the accumulated body to the open section and store it.
        current['content'] = '\n'.join(body_lines).strip()
        sections.append(current)

    for line in content.split('\n'):
        if line.startswith('##'):
            if current is not None:
                _finish()
            current = {
                # NOTE: headings deeper than ### (e.g. ####) are also
                # recorded as level 3.
                'heading_level': 3 if line.startswith('###') else 2,
                'title': line.lstrip('#').strip(),
            }
            body_lines = []
        elif current is not None:
            body_lines.append(line)

    if current is not None:
        _finish()

    return sections


def extract_rules_from_section(section_title, section_content, category_map):
    """Extract rules from section if title matches keyword patterns.

    A section is considered only if its lower-cased title contains one of
    'never use', 'mandatory', 'rules' or 'what we never'. It is then
    categorised as 'never_use' when the title contains 'never', otherwise as
    'mandatory' when it contains 'mandatory' or 'pattern'. Titles that match
    only 'rules' get no category and therefore yield no rules.

    Args:
        section_title: Heading text of the section.
        section_content: Markdown body of the section.
        category_map: Unused; kept so existing callers keep working.

    Returns:
        A list of ``{'category': ..., 'rule': ...}`` dicts, one per Markdown
        bullet (``- text`` or ``* text``) in the section body.
    """
    title_lower = section_title.lower()

    gate_keywords = ('never use', 'mandatory', 'rules', 'what we never')
    if not any(keyword in title_lower for keyword in gate_keywords):
        return []

    if 'never' in title_lower:
        category = 'never_use'
    elif 'mandatory' in title_lower or 'pattern' in title_lower:
        category = 'mandatory'
    else:
        return []

    rules = []
    for line in section_content.split('\n'):
        stripped = line.strip()
        # "* * *" would otherwise parse as a bullet whose text is "* *".
        if _HORIZONTAL_RULE_RE.match(stripped):
            continue
        bullet = _BULLET_RE.match(stripped)
        if bullet is None:
            continue
        rule_text = bullet.group(1).strip()
        if rule_text:
            rules.append({'category': category, 'rule': rule_text})

    return rules


def process_sop_files(conn, sop_dir=SOP_DIR):
    """Process all .md files and populate database.

    Only files directly inside ``sop_dir`` are read (no recursion), in sorted
    order, skipping the names in ``_SKIPPED_FILES``.

    Args:
        conn: Open connection with the schema created by ``init_db``.
        sop_dir: Directory holding the Markdown SOPs. Defaults to ``SOP_DIR``.

    Returns:
        A ``(sop_count, section_count, rule_count)`` tuple of inserted rows.
    """
    cursor = conn.cursor()

    sop_count = 0
    section_count = 0
    rule_count = 0

    for filepath in sorted(glob.glob(os.path.join(sop_dir, "*.md"))):
        filename = os.path.basename(filepath)
        if filename in _SKIPPED_FILES:
            continue

        with open(filepath, 'r', encoding='utf-8') as f:
            full_content = f.read()

        number = extract_number_from_filename(filename)
        title = extract_first_heading(full_content)
        updated_at = datetime.now().isoformat()

        cursor.execute(
            """
            INSERT INTO sops (number, filename, title, full_content, updated_at)
            VALUES (?, ?, ?, ?, ?)
            """,
            (number, filename, title, full_content, updated_at),
        )
        sop_id = cursor.lastrowid
        sop_count += 1

        for section in split_into_sections(full_content):
            cursor.execute(
                """
                INSERT INTO sop_sections (sop_id, heading_level, title, content)
                VALUES (?, ?, ?, ?)
                """,
                (sop_id, section['heading_level'], section['title'], section['content']),
            )
            section_count += 1

            rules = extract_rules_from_section(section['title'], section['content'], {})
            for rule in rules:
                cursor.execute(
                    """
                    INSERT INTO rules (category, rule, source_sop, source_section)
                    VALUES (?, ?, ?, ?)
                    """,
                    (rule['category'], rule['rule'], filename, section['title']),
                )
                rule_count += 1

    conn.commit()
    return sop_count, section_count, rule_count


def rebuild_fts(conn):
    """Rebuild FTS index.

    Empties ``sop_fts`` and refills it with one row per section, joined with
    its parent SOP's number and title. Clearing first keeps a repeated call
    from duplicating every row.
    """
    cursor = conn.cursor()
    cursor.execute("DELETE FROM sop_fts")
    cursor.execute(
        """
        INSERT INTO sop_fts(sop_number, sop_title, section_title, content)
        SELECT s.number, s.title, ss.title, ss.content
        FROM sop_sections ss
        JOIN sops s ON ss.sop_id = s.id
        """
    )
    conn.commit()


def main():
    """Main entry point.

    Recreates the database, loads the SOP files, rebuilds the FTS index and
    prints a summary. On any error the message and traceback are printed and
    the process exits with status 1.
    """
    conn = None
    try:
        conn = init_db()
        sop_count, section_count, rule_count = process_sop_files(conn)
        rebuild_fts(conn)
    except Exception as e:
        print(f"Error: {e}")
        traceback.print_exc()
        sys.exit(1)
    finally:
        # Runs on success and on the SystemExit raised above.
        if conn is not None:
            conn.close()

    print("SOP Database built successfully:")
    print(f" SOPs loaded: {sop_count}")
    print(f" Sections indexed: {section_count}")
    print(f" Rules extracted: {rule_count}")
    print(f" Database: {DB_PATH}")


if __name__ == "__main__":
    main()