hermes-brain/scripts/org-to-gbrain.py

#!/usr/bin/env python3
"""Convert brain Org-mode files to markdown + YAML frontmatter and sync into gbrain."""
import subprocess, re, os, sys, glob

# Root of the Org-mode brain; find_org_files() scans its ideas/ subdirectory.
BRAIN = "/root/brain"
# Directory the converted markdown is written to; also the git repository
# that main() commits and the path handed to `gbrain import`.
GBRAIN_SRC = "/mnt/hermes/brain"
# Absolute path of the pandoc executable used for the org -> markdown step.
PANDOC = "/usr/bin/pandoc"
# Path of the gbrain executable (located in bun's bin directory, despite the
# constant's name).
BUN = os.path.expanduser("~/.bun/bin/gbrain")

def find_org_files():
    """Collect every ``.org`` file below ``BRAIN/ideas``, recursing into subdirectories.

    Returns:
        A list of ``(name, rel_path, abs_path)`` tuples where ``name`` is the
        file name without its ``.org`` suffix, ``rel_path`` is the path
        relative to the ideas directory (e.g. ``"compliance/hipaa.org"`` or
        ``"triad-overview.org"``) and ``abs_path`` is the path produced by
        joining the walked directory with the file name.
    """
    ideas_root = f"{BRAIN}/ideas"
    suffix = '.org'
    found = []
    for dirpath, _subdirs, filenames in os.walk(ideas_root):
        org_names = (candidate for candidate in filenames if candidate.endswith(suffix))
        for org_name in org_names:
            full_path = os.path.join(dirpath, org_name)
            found.append((
                org_name[:-len(suffix)],
                os.path.relpath(full_path, ideas_root),
                full_path,
            ))
    return found

def gbrain_target(rel_path):
    """Derive the gbrain markdown target path from an org path relative to ideas/.

    Examples (paths relative to GBRAIN_SRC):
        triad-overview.org              -> concepts/triad-overview.md (via ROUTING)
        competitive-analysis-2026-05.org -> ideas/competitive-analysis-2026-05.md
        compliance/hipaa.org            -> concepts/compliance/hipaa.md
        compliance/us/hipaa.org         -> concepts/compliance/us/hipaa.md

    Args:
        rel_path: '/'-separated path of the org file relative to the ideas
            directory, as produced by find_org_files().

    Returns:
        The absolute destination path of the ``.md`` file under GBRAIN_SRC.
    """
    parts = rel_path.split('/')

    # The file name is always the LAST component; everything before it is the
    # directory chain.  (Using parts[1] as the file name broke for files
    # nested more than one directory deep.)
    filename = parts[-1]
    slug = filename[:-4] if filename.endswith('.org') else filename

    if len(parts) == 1:
        # Flat file in the ideas/ root: the category comes from ROUTING,
        # defaulting to "concepts" for slugs that are not listed.
        category = ROUTING.get(slug, "concepts")
        return f"{GBRAIN_SRC}/{category}/{slug}.md"

    # File in a subdirectory: mirror the whole directory chain below concepts/.
    subdir = '/'.join(parts[:-1])
    return f"{GBRAIN_SRC}/concepts/{subdir}/{slug}.md"

def extract_org_properties(src_path):
    """Extract title, tags, ID and creation date from an org file.

    Args:
        src_path: Path of the org file; it is read as UTF-8.

    Returns:
        A dict containing only the keys that were found:
            ``title``   - text of the first ``#+title:`` line
            ``tags``    - list of tags from the first ``#+filetags:`` line
            ``org_id``  - value of the first ``:ID:`` property line
            ``created`` - text inside the brackets of the first ``:CREATED:``
                          property line
    """
    props = {}
    with open(src_path, encoding="utf-8") as f:
        content = f.read()

    # Org keywords are case-insensitive (#+TITLE: is as common as #+title:).
    # [ \t]+ instead of \s+ keeps an empty keyword line from swallowing the
    # following line as its value.
    keyword_flags = re.MULTILINE | re.IGNORECASE

    # Extract title
    m = re.search(r'^#\+title:[ \t]+(.+)$', content, keyword_flags)
    if m:
        props['title'] = m.group(1).strip()

    # Extract tags.  Org writes several file tags as ":a:b:c:", so split on
    # colons as well as whitespace and drop the empty pieces.
    m = re.search(r'^#\+filetags:[ \t]+(.+)$', content, keyword_flags)
    if m:
        props['tags'] = [t for t in re.split(r'[:\s]+', m.group(1)) if t]

    # Extract ID from a property line (anchored to the start of a line so that
    # ":ID:" appearing inside running text is not picked up).
    m = re.search(r'^[ \t]*:ID:[ \t]+(\S+)', content, re.MULTILINE)
    if m:
        props['org_id'] = m.group(1)

    # Extract CREATED, keeping only the text inside the [...] timestamp.
    m = re.search(r'^[ \t]*:CREATED:[ \t]+\[([^\]]+)\]', content, re.MULTILINE)
    if m:
        props['created'] = m.group(1)

    return props

def strip_org_header(src_path):
    """Return the org file's text with its leading header block removed.

    The header is the leading run of: a ``:PROPERTIES:`` ... ``:END:`` drawer,
    ``#+keyword`` lines and blank lines.  It is removed before the text goes
    to pandoc so that it does not produce raw {=org} blocks.

    ``#+begin_...`` lines are treated as body content, not as header
    directives; otherwise a document starting with a block would lose its
    opener and keep an orphaned ``#+end_...`` line.

    Args:
        src_path: Path of the org file; it is read as UTF-8.

    Returns:
        The text from the first body line to the end of the file, or ``''``
        when the file contains nothing but header lines.
    """
    with open(src_path, encoding="utf-8") as f:
        lines = f.readlines()

    in_properties = False
    start = 0  # index of the first line to keep
    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped == ':PROPERTIES:':
            in_properties = True
        if in_properties:
            # Only a closing :END: advances `start`; an unterminated drawer is
            # therefore kept in the output rather than eating the whole file.
            if stripped == ':END:':
                in_properties = False
                start = i + 1
            continue
        # Skip #+ directive lines, but not block openers such as #+begin_src.
        if line.startswith('#+') and line[2:7].lower() != 'begin':
            start = i + 1
            continue
        # First real content
        if stripped:
            start = i
            break
        # Blank line inside the header region
        start = i + 1

    return ''.join(lines[start:])

def pandoc_convert(clean_body):
    """Convert an org body to markdown by piping it through pandoc.

    Args:
        clean_body: Org text (header already stripped) sent to pandoc's stdin.

    Returns:
        The markdown written by pandoc with surrounding whitespace stripped,
        or ``None`` when pandoc exits with a non-zero status (the first 200
        characters of its stderr are printed in that case).
    """
    result = subprocess.run(
        [PANDOC, "-f", "org", "-t", "markdown-smart"],
        input=clean_body, capture_output=True, text=True,
        # pandoc reads and writes UTF-8 regardless of the locale, so the pipe
        # encoding must not depend on the environment this script runs in.
        encoding="utf-8",
    )
    if result.returncode != 0:
        print(f"  ERROR pandoc: {result.stderr[:200]}")
        return None
    return result.stdout.strip()

def _yaml_double_quoted(text):
    """Return *text* as a YAML double-quoted scalar, escaping ``\\`` and ``"``."""
    return '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'


def build_frontmatter(props):
    """Build the YAML frontmatter block from extracted org properties.

    Args:
        props: Dict as returned by extract_org_properties(); the optional keys
            ``title`` (str), ``tags`` (list of str) and ``created`` (str) are
            emitted in that order, other keys are ignored.

    Returns:
        The frontmatter as a string that starts and ends with a ``---`` line
        (no trailing newline).
    """
    lines = ['---']
    if 'title' in props:
        # Escape quotes/backslashes so titles such as: He said "hi"
        # still yield valid YAML.
        lines.append(f'title: {_yaml_double_quoted(props["title"])}')
    if 'tags' in props:
        # Simple word-like tags stay unquoted; anything else (Org permits
        # '@', '#' and '%' in tags, which are YAML indicators) is quoted so
        # the flow sequence stays parseable.
        rendered = [
            tag if re.fullmatch(r'\w[\w-]*', tag) else _yaml_double_quoted(tag)
            for tag in props['tags']
        ]
        lines.append(f'tags: [{", ".join(rendered)}]')
    if 'created' in props:
        lines.append(f'created: {props["created"]}')
    lines.append('---')
    return '\n'.join(lines)

def postprocess_links(md_text):
    """Strip the ``.org`` extension from relative link targets in markdown.

    Pandoc converts ``[[file:foo.org][desc]]`` to ``[desc](foo.org)``; this
    rewrites such targets to ``(foo)``.  Targets may include directory
    components (``compliance/hipaa.org``) and leading ``./`` or ``../``
    segments.  Targets containing other characters (a URL scheme, dots inside
    the name, spaces, anchors) are left untouched, so ``(https://example.org)``
    and ``(www.example.org)`` are not modified.

    Args:
        md_text: Markdown text produced by pandoc.

    Returns:
        The markdown text with matching ``.org`` suffixes removed.
    """
    relative_org_target = (
        r'\('
        r'((?:\.{1,2}/)*'            # optional leading ./ or ../ segments
        r'(?:[a-zA-Z0-9_-]+/)*'      # optional directory components
        r'[a-zA-Z0-9_-]+)'           # file name without extension
        r'\.org\)'
    )
    return re.sub(relative_org_target, r'(\1)', md_text)

# Maps the slug (file name without .org) of a file that sits directly in the
# ideas/ root to the gbrain top-level category directory it is written to.
# gbrain_target() falls back to "concepts" for slugs that are not listed, so
# the "concepts" entries below act as an explicit inventory rather than
# changing the outcome; only the "ideas" entries override the default.
ROUTING = {
    # Concepts — triad architecture, security, economics theory
    "triad-overview": "concepts",
    "agora": "concepts",
    "stoa": "concepts",
    "triad-index": "concepts",
    "domain-gate-packages": "concepts",
    "verification-appliance": "concepts",
    "verification-monopoly": "concepts",
    "infrastructure-lock-in": "concepts",
    "evaluation-harness": "concepts",
    "collective-regression-suite": "concepts",
    "lisp-machine-security": "concepts",
    "common-logic-iso-24707": "concepts",
    "self-driving-lisp-machine": "concepts",
    "lisp-economics": "concepts",
    "sufficiency-flip": "concepts",
    "time-estimates": "concepts",
    "cost-structure": "concepts",
    "gate-rule-encoding": "concepts",
    "biology-parallels": "concepts",
    "comparison-with-symbolics": "concepts",
    "upgrade-lifecycle": "concepts",
    "ai-industry-impact": "concepts",
    "moats": "concepts",
    "patent-strategy": "concepts",
    "licensing": "concepts",
    "verified-skill-marketplace": "concepts",
    "compute-marketplace": "concepts",
    "agora-usernames": "concepts",
    "pds-as-a-service": "concepts",
    "investment-thesis": "concepts",
    "compliance-framework-mapping": "concepts",
    # Ideas — strategy, competitive analysis
    "competitive-analysis-2026-05": "ideas",
    "passepartout-economics": "ideas",
}

def main():
    """Run the full sync: convert org files, commit, import into gbrain, embed.

    Steps:
      1. For every file returned by find_org_files(), build YAML frontmatter
         from its org properties, convert its body with pandoc and write the
         result (UTF-8) to the path given by gbrain_target().  Files for
         which pandoc fails are skipped.
      2. Stage and commit everything in the GBRAIN_SRC git repository
         (``--allow-empty``, so a commit is recorded even without changes).
      3. Run ``gbrain import GBRAIN_SRC``; on a non-zero exit status report
         it and stop.
      4. Run ``gbrain embed --all``.

    Returns:
        None.
    """
    imported = []

    for _slug, rel_path, src_path in find_org_files():
        dst_path = gbrain_target(rel_path)

        # Extract frontmatter from org properties
        props = extract_org_properties(src_path)

        # Strip org header and convert body to markdown
        clean = strip_org_header(src_path)
        md = pandoc_convert(clean)
        if md is None:
            continue

        md = postprocess_links(md)

        # Assemble: YAML frontmatter + markdown body
        full = build_frontmatter(props) + '\n\n' + md + '\n'

        # Create parent directories only once there is something to write,
        # so a failed conversion does not leave empty directories behind.
        os.makedirs(os.path.dirname(dst_path), exist_ok=True)
        # Explicit UTF-8: the converted text routinely contains non-ASCII
        # characters and must not depend on the locale of the calling shell.
        with open(dst_path, 'w', encoding='utf-8') as f:
            f.write(full)

        # Show relative path for clarity
        rel_dst = os.path.relpath(dst_path, GBRAIN_SRC)
        imported.append(rel_dst)
        print(f"  OK  {rel_dst}")

    print(f"\nConverted {len(imported)} files.")

    # Commit to git.  Both commands are always attempted; failures are
    # reported instead of being silently discarded.
    git_steps = (
        ["add", "-A"],
        ["commit", "--allow-empty",
         "-m", "gbrain: sync converted org-mode brain files"],
    )
    for git_args in git_steps:
        git_result = subprocess.run(
            ["git", "-C", GBRAIN_SRC, *git_args],
            capture_output=True, text=True
        )
        if git_result.returncode != 0:
            print(f"  WARNING git {git_args[0]} failed: {git_result.stderr.strip()[:200]}")

    # Import into gbrain
    print("\nImporting into gbrain...")
    # Prepend bun's bin directory to PATH; tolerate PATH being unset.
    bun_bin = f"{os.path.expanduser('~')}/.bun/bin"
    search_path = os.pathsep.join(p for p in (bun_bin, os.environ.get("PATH")) if p)
    env = {**os.environ, "PATH": search_path}
    result = subprocess.run(
        [BUN, "import", GBRAIN_SRC],
        capture_output=True, text=True, env=env
    )
    # Show the last 25 lines of stdout (skip noise)
    out_lines = result.stdout.strip().split('\n')
    for line in out_lines[-25:]:
        if line.strip() and 'batch caps' not in line and 'max_batch_tokens' not in line:
            print(f"  {line}")

    if result.returncode != 0:
        print(f"  gbrain import exit code: {result.returncode}")
        # Surface the tail of stderr so the failure can be diagnosed.
        for line in result.stderr.strip().split('\n')[-10:]:
            if line.strip():
                print(f"  {line}")
        return

    # Embed
    print("\nGenerating embeddings...")
    result2 = subprocess.run(
        [BUN, "embed", "--all"],
        capture_output=True, text=True, env=env
    )
    for line in result2.stdout.strip().split('\n')[-10:]:
        if line.strip():
            print(f"  {line}")

    if result2.returncode != 0:
        print(f"  gbrain embed exit code: {result2.returncode}")

# Run the sync only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()