# arisingmedia-web-sops/build/seed_sops.py

#!/usr/bin/env python3

import sqlite3
import glob
import os
import re
from datetime import datetime

# Directory scanned for SOP markdown files; the SQLite database file is
# stored inside the same directory.
SOP_DIR = "/home/sirdrez/arisingmedia-websites/.am-webdesign-sops"
DB_PATH = f"{SOP_DIR}/sops.db"

def init_db():
    """Open the database at DB_PATH and recreate every table from scratch.

    Any existing sop_fts, rules, sop_sections and sops tables are dropped,
    so previously stored rows are discarded. The freshly created schema is
    committed before returning.

    Returns:
        The open sqlite3 connection; the caller is responsible for closing it.
    """
    db = sqlite3.connect(DB_PATH)
    cur = db.cursor()

    # Drop dependants before the tables they refer to.
    for table in ("sop_fts", "rules", "sop_sections", "sops"):
        cur.execute(f"DROP TABLE IF EXISTS {table}")

    # One row per SOP file.
    cur.execute("""
        CREATE TABLE sops (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            number TEXT,
            filename TEXT,
            title TEXT,
            full_content TEXT,
            updated_at TEXT
        )
    """)

    # One row per heading-delimited section, linked to its SOP via sop_id.
    cur.execute("""
        CREATE TABLE sop_sections (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            sop_id INTEGER REFERENCES sops(id),
            heading_level INTEGER,
            title TEXT,
            content TEXT
        )
    """)

    # One row per extracted rule, with the file and section it came from.
    cur.execute("""
        CREATE TABLE rules (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            category TEXT,
            rule TEXT,
            source_sop TEXT,
            source_section TEXT
        )
    """)

    # Full-text search table over section content.
    cur.execute("""
        CREATE VIRTUAL TABLE sop_fts USING fts5(
            sop_number,
            sop_title,
            section_title,
            content
        )
    """)

    db.commit()
    return db

def extract_number_from_filename(filename):
    """Return the run of digits that opens *filename*, or "" if there is none.

    For example, '00-stack-philosophy.md' yields '00'.
    """
    leading_digits = re.match(r'^(\d+)', filename)
    return leading_digits.group(1) if leading_digits else ""

def extract_first_heading(content):
    """Return the text of the first line that begins with '#'.

    Leading '#' characters and surrounding whitespace are removed from the
    result. An empty string is returned when no line begins with '#'.
    """
    heading_lines = (row for row in content.split('\n') if row.startswith('#'))
    first = next(heading_lines, None)
    if first is None:
        return ""
    return first.lstrip('#').strip()

def split_into_sections(content):
    """Split markdown *content* into sections at '##' (or deeper) headings.

    A section starts at every line beginning with at least two '#'
    characters and runs until the next such line. Text that precedes the
    first heading is not part of any section and is dropped. Lines inside
    fenced code blocks (``` or ~~~) are never treated as headings, so a
    '## comment' in a code sample stays in the section body.

    Returns:
        A list of dicts, in document order, each with:
        'heading_level' - number of leading '#' characters of the heading,
        'title'         - heading text without the '#' prefix, stripped,
        'content'       - the section body, stripped of outer whitespace.
    """
    sections = []
    current_section = None
    current_content = []
    open_fence = None  # the fence marker while inside a fenced code block

    for line in content.split('\n'):
        marker = line.lstrip()[:3]

        if open_fence is not None:
            # Inside a code block: only a matching fence line ends it.
            if marker == open_fence:
                open_fence = None
            is_heading = False
        elif marker in ('```', '~~~'):
            open_fence = marker
            is_heading = False
        else:
            is_heading = line.startswith('##')

        if is_heading:
            # Finish the section that was being collected, if any.
            if current_section is not None:
                current_section['content'] = '\n'.join(current_content).strip()
                sections.append(current_section)

            heading_text = line.lstrip('#')
            current_section = {
                # Count the '#' prefix so '####' is recorded as level 4.
                'heading_level': len(line) - len(heading_text),
                'title': heading_text.strip()
            }
            current_content = []
        elif current_section is not None:
            current_content.append(line)

    # The last section has no following heading to close it.
    if current_section is not None:
        current_section['content'] = '\n'.join(current_content).strip()
        sections.append(current_section)

    return sections

def extract_rules_from_section(section_title, section_content, category_map):
    """Extract bullet-point rules from a section whose title marks it as rules.

    The category is derived from the lower-cased title:
      * the title must contain one of 'never use', 'mandatory', 'rules' or
        'what we never';
      * if it also contains 'never' the category is 'never_use';
      * otherwise, if it contains 'mandatory' or 'pattern', the category is
        'mandatory'.
    A title that matches only 'rules' therefore gets no category and yields
    no rules.

    Only real list items are collected: a line whose first non-blank
    character is '-' or '*' followed by whitespace. Lines such as
    '**Bold text**', '*emphasis*' or '---' are not bullets and are skipped.

    Args:
        section_title: Heading text of the section.
        section_content: Body text of the section.
        category_map: Currently unused; kept so existing callers keep working.

    Returns:
        A list of {'category': ..., 'rule': ...} dicts in document order;
        empty when the title does not select a category.
    """
    title_lower = section_title.lower()

    # Determine category
    category = None
    if any(keyword in title_lower
           for keyword in ('never use', 'mandatory', 'rules', 'what we never')):
        if 'never' in title_lower:
            category = 'never_use'
        elif 'mandatory' in title_lower or 'pattern' in title_lower:
            category = 'mandatory'

    if category is None:
        return []

    rules = []
    for line in section_content.split('\n'):
        stripped = line.strip()
        # A bullet is a marker plus whitespace; this keeps bold/emphasis
        # paragraphs and horizontal rules from being read as list items.
        if stripped[:1] not in ('-', '*') or not stripped[1:2].isspace():
            continue

        rule_text = stripped[1:].strip()
        if rule_text:
            rules.append({
                'category': category,
                'rule': rule_text
            })

    return rules

def process_sop_files(conn):
    """Load every SOP markdown file into the sops, sop_sections and rules tables.

    Only '*.md' files directly inside SOP_DIR are considered, in sorted path
    order; README.md, STACK.md, CONTENT.md and OPTIMIZATION.md are skipped.
    Each file is read as UTF-8, stored as one sops row (updated_at is the
    time of this run), split into sections, and each section is scanned for
    rules. All inserts are committed once at the end.

    Args:
        conn: Open sqlite3 connection whose schema already exists.

    Returns:
        Tuple (sop_count, section_count, rule_count) of inserted rows.
    """
    cur = conn.cursor()
    skipped_names = ('README.md', 'STACK.md', 'CONTENT.md', 'OPTIMIZATION.md')

    n_sops = n_sections = n_rules = 0

    # Non-recursive glob: nested directories are not scanned.
    for path in sorted(glob.glob(os.path.join(SOP_DIR, "*.md"))):
        name = os.path.basename(path)
        if name in skipped_names:
            continue

        with open(path, 'r', encoding='utf-8') as handle:
            text = handle.read()

        # Store the document itself.
        cur.execute("""
            INSERT INTO sops (number, filename, title, full_content, updated_at)
            VALUES (?, ?, ?, ?, ?)
        """, (
            extract_number_from_filename(name),
            name,
            extract_first_heading(text),
            text,
            datetime.now().isoformat(),
        ))
        parent_id = cur.lastrowid
        n_sops += 1

        # Store each section, then any rules found inside it.
        for part in split_into_sections(text):
            heading = part['title']
            body = part['content']

            cur.execute("""
                INSERT INTO sop_sections (sop_id, heading_level, title, content)
                VALUES (?, ?, ?, ?)
            """, (parent_id, part['heading_level'], heading, body))
            n_sections += 1

            for found in extract_rules_from_section(heading, body, {}):
                cur.execute("""
                    INSERT INTO rules (category, rule, source_sop, source_section)
                    VALUES (?, ?, ?, ?)
                """, (found['category'], found['rule'], name, heading))
                n_rules += 1

    conn.commit()
    return n_sops, n_sections, n_rules

def rebuild_fts(conn):
    """Repopulate the sop_fts search table from the stored sections.

    Existing sop_fts rows are removed first, so calling this more than once
    does not leave duplicate entries. One row is then inserted per
    sop_sections row, carrying the parent SOP's number and title alongside
    the section title and content. The change is committed before returning.

    Args:
        conn: Open sqlite3 connection containing sops, sop_sections and
            sop_fts.
    """
    cursor = conn.cursor()

    # Start from an empty table so a repeated rebuild stays idempotent.
    cursor.execute("DELETE FROM sop_fts")

    cursor.execute("""
        INSERT INTO sop_fts(sop_number, sop_title, section_title, content)
        SELECT s.number, s.title, ss.title, ss.content
        FROM sop_sections ss JOIN sops s ON ss.sop_id = s.id
    """)

    conn.commit()

def main():
    """Build the SOP database and print a summary of what was loaded.

    Recreates the schema, loads the SOP files, fills the search table and
    then prints the row counts and the database path. If any step raises,
    the error message and traceback are written to stderr and the process
    exits with status 1. The database connection is closed in both cases.
    """
    # Imported locally so this function does not rely on the file's
    # top-level import list.
    import sys
    import traceback

    conn = None
    try:
        conn = init_db()
        sop_count, section_count, rule_count = process_sop_files(conn)
        rebuild_fts(conn)
    except Exception as e:
        # Keep the message on the same stream as the traceback.
        print(f"Error: {e}", file=sys.stderr)
        traceback.print_exc()
        # sys.exit instead of the bare exit(): the latter is added by the
        # site module and is not available in every interpreter setup.
        sys.exit(1)
    finally:
        # Release the connection even when a step above failed.
        if conn is not None:
            conn.close()

    print("SOP Database built successfully:")
    print(f"  SOPs loaded: {sop_count}")
    print(f"  Sections indexed: {section_count}")
    print(f"  Rules extracted: {rule_count}")
    print(f"  Database: {DB_PATH}")

# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()