# hermes-brain/scripts/org-to-gbrain.py

#!/usr/bin/env python3
"""Convert brain Org-mode files to markdown + YAML frontmatter and sync into gbrain."""
import subprocess, re, os, sys, glob

# Root of the Org-mode brain; find_org_files() scans its "ideas" subdirectory.
BRAIN = "/root/brain"
# Directory that receives the generated markdown files. main() also uses it as
# the git work tree to commit and as the path passed to `gbrain import`.
GBRAIN_SRC = "/mnt/hermes/brain"
# Absolute path of the pandoc executable used for the org -> markdown conversion.
PANDOC = "/usr/bin/pandoc"
# Path of the gbrain executable under ~/.bun/bin (the variable is named BUN but
# it points at the gbrain command that main() invokes).
BUN = os.path.expanduser("~/.bun/bin/gbrain")

# Maps the slug of a top-level org file (file name without ".org") to the gbrain
# category directory it is written to. Slugs that are not listed fall back to
# "concepts" in gbrain_target()/gbrain_slug(); files inside subdirectories do not
# consult this table at all.
ROUTING = {
    # Concepts — triad architecture, security, economics theory
    "triad-overview": "concepts",
    "agora": "concepts",
    "stoa": "concepts",
    "triad-index": "concepts",
    "domain-gate-packages": "concepts",
    "verification-appliance": "concepts",
    "verification-monopoly": "concepts",
    "infrastructure-lock-in": "concepts",
    "evaluation-harness": "concepts",
    "collective-regression-suite": "concepts",
    "lisp-machine-security": "concepts",
    "common-logic-iso-24707": "concepts",
    "self-driving-lisp-machine": "concepts",
    "lisp-economics": "concepts",
    "sufficiency-flip": "concepts",
    "time-estimates": "concepts",
    "cost-structure": "concepts",
    "gate-rule-encoding": "concepts",
    "biology-parallels": "concepts",
    "comparison-with-symbolics": "concepts",
    "upgrade-lifecycle": "concepts",
    "ai-industry-impact": "concepts",
    "moats": "concepts",
    "patent-strategy": "concepts",
    "licensing": "concepts",
    "verified-skill-marketplace": "concepts",
    "compute-marketplace": "concepts",
    "agora-usernames": "concepts",
    "pds-as-a-service": "concepts",
    "investment-thesis": "concepts",
    "compliance-framework-mapping": "concepts",
    # Strategy and competitive analysis.
    # NOTE(review): this group used to be headed "Ideas", yet the next five
    # entries are routed to "concepts"; only the last two actually land in
    # "ideas". Confirm that this split is intended.
    "orders-of-magnitude-time": "concepts",
    "revenue-hub": "concepts",
    "agora-contracts": "concepts",
    "triad-systemic-effects": "concepts",
    "growth-strategy": "concepts",
    "competitive-analysis-2026-05": "ideas",
    "passepartout-economics": "ideas",
}

def find_org_files():
    """Collect every ``.org`` file below ``BRAIN/ideas``, in a stable order.

    The tree is walked recursively. Directory and file names are visited in
    sorted order so that repeated runs return the same list; this also makes
    it deterministic which file wins when two files in different directories
    share a slug and are folded into one mapping by a caller.

    Returns:
        A list of ``(slug, rel_path, abs_path)`` tuples, where ``slug`` is the
        file name without the ``.org`` suffix, ``rel_path`` is the path
        relative to the ``ideas`` directory and ``abs_path`` is the path
        rooted at ``BRAIN``. The list is empty when the directory is missing.
    """
    files = []
    base = f"{BRAIN}/ideas"
    for root, dirs, filenames in os.walk(base):
        # os.walk reports entries in arbitrary order. Sorting ``dirs`` in place
        # fixes the order in which subdirectories are descended into.
        dirs.sort()
        for fn in sorted(filenames):
            if not fn.endswith('.org'):
                continue
            abs_path = os.path.join(root, fn)
            files.append((fn[:-4], os.path.relpath(abs_path, base), abs_path))
    return files

def _split_org_rel_path(rel_path):
    """Split an org path relative to the ideas root into ``(dir_parts, slug)``."""
    parts = rel_path.split('/')
    filename = parts[-1]
    # Only drop the suffix when it is really there, instead of blindly cutting
    # the last four characters.
    slug = filename[:-4] if filename.endswith('.org') else filename
    return parts[:-1], slug


def gbrain_slug(rel_path):
    """Return the gbrain slug (e.g. ``'concepts/time-estimates'``) for an org rel_path.

    Top-level files are routed through ``ROUTING`` and default to the
    ``concepts`` category. Files inside subdirectories always go below
    ``concepts/`` and keep their full directory chain, so
    ``a/b/note.org`` becomes ``concepts/a/b/note``.

    Args:
        rel_path: ``/``-separated path of the org file relative to the ideas
            directory.

    Returns:
        The slug, without a file extension.
    """
    dir_parts, slug = _split_org_rel_path(rel_path)
    if not dir_parts:
        category = ROUTING.get(slug, "concepts")
        return f"{category}/{slug}"
    # Use every directory component and take the file name from the LAST
    # component; indexing parts[1] breaks for paths nested more than one level.
    return f"concepts/{'/'.join(dir_parts)}/{slug}"


def gbrain_target(rel_path):
    """Derive the markdown output path below ``GBRAIN_SRC`` for an org rel_path.

    The path is the gbrain slug of the file with ``.md`` appended, so target
    paths and slugs cannot drift apart.
    """
    return f"{GBRAIN_SRC}/{gbrain_slug(rel_path)}.md"

def build_slug_map():
    """Map each org slug (file name without ``.org``) to its gbrain slug.

    The mapping is keyed by the bare file name, so when two org files in
    different directories share a name the one returned later by
    ``find_org_files()`` overwrites the earlier one.
    """
    return {
        org_slug: gbrain_slug(rel)
        for org_slug, rel, _abs_path in find_org_files()
    }

def extract_org_links_and_body(src_path):
    """Read an org file and split it into header properties and body text.

    Despite its name this function does not collect links; it only parses the
    file header.

    Recognised header data (all optional):
      * ``#+title:``    -> ``props['title']``
      * ``#+filetags:`` -> ``props['tags']`` (list of tag names). Both the
        canonical ``:a:b:`` form and whitespace-separated tags are accepted.
      * ``:ID:``        -> ``props['org_id']``
      * ``:CREATED: [YYYY-MM-DD ...]`` -> ``props['created']`` (date part only)

    ``#+`` keywords are matched case-insensitively, so ``#+TITLE:`` works too.

    Args:
        src_path: Path of the org file; it is read as UTF-8.

    Returns:
        ``(props, body)`` where ``body`` is the file content with the leading
        PROPERTIES drawer, ``#+`` keyword lines and blank lines removed.
    """
    # Org files are UTF-8; do not depend on the process locale.
    with open(src_path, encoding="utf-8") as f:
        content = f.read()

    props = {}
    keyword_flags = re.MULTILINE | re.IGNORECASE

    # "[ \t]+" rather than "\s+": an empty "#+title:" must not swallow the
    # newline and capture the following line as its value.
    m = re.search(r'^#\+title:[ \t]+(.+)$', content, keyword_flags)
    if m:
        props['title'] = m.group(1).strip()

    m = re.search(r'^#\+filetags:[ \t]+(.+)$', content, keyword_flags)
    if m:
        # ":a:b:" must yield ["a", "b"], so split on colons as well as on
        # whitespace and drop the empty pieces.
        props['tags'] = [t for t in re.split(r'[:\s]+', m.group(1)) if t]

    # ID from the PROPERTIES drawer (first occurrence anywhere in the file).
    m = re.search(r':ID:\s+([^\s]+)', content)
    if m:
        props['org_id'] = m.group(1)

    # CREATED holds an inactive timestamp such as "[2026-05-23 Sat]"; keep
    # only the ISO date.
    m = re.search(r':CREATED:\s+\[([^\]]+)\]', content)
    if m:
        date_m = re.match(r'(\d{4}-\d{2}-\d{2})', m.group(1))
        if date_m:
            props['created'] = date_m.group(1)

    # Find the first body line: skip the drawer, "#+keyword" lines and blanks.
    lines = content.split('\n')
    in_properties = False
    start = 0
    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped == ':PROPERTIES:':
            in_properties = True
        if in_properties:
            if stripped == ':END:':
                in_properties = False
                start = i + 1
            continue
        # "#+begin_..." opens a body block (quote, src, ...); it is content,
        # not a header keyword, and must stay paired with its "#+end_...".
        if line.startswith('#+') and line[2:8].lower() != 'begin_':
            start = i + 1
            continue
        if stripped:
            start = i
            break
        start = i + 1

    body = '\n'.join(lines[start:])
    return props, body

def resolve_org_link(match, slug_map):
    """Rewrite one ``[[file:target.org][desc]]`` regex match to its gbrain slug.

    ``match`` must expose the link target as group 1 and, optionally, the
    description as group 2; when there is no second group the target doubles
    as the description. If ``target`` is a key of ``slug_map`` the link is
    re-emitted as ``[[file:<gbrain slug>.org][desc]]``; otherwise the matched
    text is returned unchanged.
    """
    target = match.group(1)
    if match.lastindex >= 2:
        desc = match.group(2)
    else:
        desc = target

    if target not in slug_map:
        # Unknown target: leave the link exactly as it was written.
        return match.group(0)

    gbrain_path = slug_map[target]
    return f"[[file:{gbrain_path}.org][{desc}]]"

def convert_body(body_text, slug_map):
    """Convert an org body to markdown and collect its cross-references.

    Every ``[[file:X.org][desc]]`` link whose ``X`` is a key of ``slug_map``
    is rewritten to point at the gbrain slug before the text is piped through
    pandoc. Links without a description, and links to unknown targets, are
    passed to pandoc untouched.

    Args:
        body_text: Org-mode text with the file header already removed.
        slug_map: Mapping of org slug -> gbrain slug.

    Returns:
        ``(md_body, link_refs)``. ``link_refs`` holds one
        ``{"slug", "type", "name"}`` dict per resolved link, in document
        order and including duplicates. When pandoc exits with a non-zero
        status an error line is printed and ``(None, [])`` is returned.
    """
    link_refs = []
    org_link_re = re.compile(r'\[\[file:([^\]]+?)\.org\]\[([^\]]*?)\]\]')

    def rewrite_link(m):
        # Single pass: record the reference and inject the directory prefix so
        # pandoc emits a proper relative path.
        target = m.group(1)
        desc = m.group(2)
        if target not in slug_map:
            return m.group(0)
        resolved = slug_map[target]
        link_refs.append({
            "slug": resolved,
            "type": "references",
            "name": resolved,
        })
        return f"[[file:{resolved}.org][{desc}]]"

    processed_body = org_link_re.sub(rewrite_link, body_text)

    # Pandoc reads and writes UTF-8 regardless of the locale, so the pipes must
    # be encoded explicitly instead of using the locale default of text=True.
    result = subprocess.run(
        [PANDOC, "-f", "org", "-t", "markdown-smart"],
        input=processed_body, capture_output=True, encoding="utf-8"
    )
    if result.returncode != 0:
        print(f"  ERROR pandoc: {result.stderr[:200]}")
        return None, []

    md = result.stdout.strip()

    # Pandoc turns [[file:concepts/foo.org][desc]] into [desc](concepts/foo.org).
    # Drop the ".org" extension from such parenthesised targets (only targets
    # made of letters, digits, "_", "/" and "-" are touched).
    md = re.sub(r'\(([a-zA-Z0-9_/-]+)\.org\)', r'(\1)', md)

    return md, link_refs

def _yaml_double_quoted(value):
    """Render *value* as a YAML double-quoted scalar (escapes backslashes and quotes)."""
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


def _yaml_flow_item(value):
    """Return *value* bare when it is a plain word, otherwise double-quoted."""
    text = str(value)
    if re.fullmatch(r'[A-Za-z0-9_][A-Za-z0-9_-]*', text):
        return text
    return _yaml_double_quoted(text)


def build_frontmatter(props, link_refs=None):
    """Build the YAML frontmatter block for one page.

    Args:
        props: Dict that may contain ``title`` (str), ``tags`` (list of str)
            and ``created`` (date string); missing keys are simply omitted.
        link_refs: Optional list of dicts with at least ``slug`` and ``type``.
            Entries are de-duplicated by ``slug``, keeping the first
            occurrence and the original order.

    Returns:
        The frontmatter as a string delimited by ``---`` lines, without a
        trailing newline.
    """
    lines = ['---']
    if 'title' in props:
        # Titles may contain quotes or backslashes; escape them so the YAML
        # stays valid.
        lines.append(f'title: {_yaml_double_quoted(props["title"])}')
    if 'tags' in props:
        # Ordinary tags stay bare; tags with YAML-significant characters
        # (e.g. Org's "@home") are quoted. Empty tags are dropped.
        tags_str = ', '.join(_yaml_flow_item(t) for t in props['tags'] if t)
        lines.append(f'tags: [{tags_str}]')
    if 'created' in props:
        lines.append(f'created: {props["created"]}')

    # De-duplicate by slug; dicts preserve insertion order, so the first
    # reference to each slug wins.
    unique_links = {}
    for lr in link_refs or ():
        unique_links.setdefault(lr['slug'], lr)
    if unique_links:
        lines.append('links:')
        for lr in unique_links.values():
            lines.append(f'  - slug: {_yaml_double_quoted(lr["slug"])}')
            lines.append(f'    type: {_yaml_double_quoted(lr["type"])}')

    lines.append('---')
    return '\n'.join(lines)

def add_timeline_entry(md_body, props):
    """Prepend a "Created" timeline bullet when both date and title are known.

    The bullet is only added if ``props`` has a ``created`` and a ``title``
    entry; otherwise ``md_body`` is returned unchanged.
    """
    if not ('created' in props and 'title' in props):
        return md_body

    when = props['created']
    heading = props['title']
    return f"- **{when}** | Created — {heading}\n\n" + md_body

def _print_tail(output, limit, skip_markers=()):
    """Print the last *limit* lines of *output*, indented, skipping blank or filtered lines."""
    for line in output.strip().split('\n')[-limit:]:
        if line.strip() and not any(marker in line for marker in skip_markers):
            print(f"  {line}")


def main():
    """Convert all org files to markdown, commit them and sync them into gbrain.

    Steps:
      1. Convert every file returned by ``find_org_files()`` and write the
         result below ``GBRAIN_SRC`` (files pandoc cannot convert are skipped).
      2. Stage and commit the ``GBRAIN_SRC`` work tree. This is best effort:
         the git exit status is not checked.
      3. Run ``gbrain import``; stop if it fails.
      4. Run ``gbrain embed``, link extraction, timeline extraction and
         ``gbrain stats``, echoing the tail of each command's output.
    """
    # Pre-build slug map for all org files so links can be resolved in any order.
    slug_map = build_slug_map()
    imported = []

    for _slug, rel_path, src_path in find_org_files():
        dst_path = gbrain_target(rel_path)
        os.makedirs(os.path.dirname(dst_path), exist_ok=True)

        props, org_body = extract_org_links_and_body(src_path)

        md, link_refs = convert_body(org_body, slug_map)
        if md is None:
            # Conversion failed; convert_body already reported the error.
            continue

        frontmatter = build_frontmatter(props, link_refs)
        md = add_timeline_entry(md, props)

        # The timeline bullet always contains a non-ASCII dash, so write UTF-8
        # explicitly instead of relying on the locale encoding.
        with open(dst_path, 'w', encoding='utf-8') as f:
            f.write(frontmatter + '\n\n' + md + '\n')

        rel_dst = os.path.relpath(dst_path, GBRAIN_SRC)
        imported.append(rel_dst)
        print(f"  OK  {rel_dst}")

    print(f"\nConverted {len(imported)} files.")

    # Commit to git. --allow-empty means a commit is recorded even when the
    # conversion changed nothing.
    subprocess.run(["git", "-C", GBRAIN_SRC, "add", "-A"], capture_output=True)
    subprocess.run(
        ["git", "-C", GBRAIN_SRC, "commit", "--allow-empty",
         "-m", "gbrain: sync converted org-mode brain files"],
        capture_output=True, text=True
    )

    # Put ~/.bun/bin first on PATH for the gbrain child processes. PATH may be
    # unset in minimal environments, so do not index os.environ directly.
    bun_bin = f"{os.path.expanduser('~')}/.bun/bin"
    search_path = os.pathsep.join(
        p for p in (bun_bin, os.environ.get("PATH", "")) if p
    )
    env = {**os.environ, "PATH": search_path}

    def run_gbrain(*args):
        return subprocess.run(
            [BUN, *args], capture_output=True, text=True, env=env
        )

    # Import into gbrain
    print("\nImporting into gbrain...")
    result = run_gbrain("import", GBRAIN_SRC)
    _print_tail(result.stdout, 25, ("batch caps", "max_batch_tokens"))

    if result.returncode != 0:
        print(f"  gbrain import exit code: {result.returncode}")
        # Surface the reason for the failure instead of discarding stderr.
        _print_tail(result.stderr, 10)
        return

    # Embed
    print("\nGenerating embeddings...")
    _print_tail(run_gbrain("embed", "--all").stdout, 10)

    # Extract links from frontmatter (now that pages are imported with links:)
    print("\nExtracting links from frontmatter...")
    links_result = run_gbrain(
        "extract", "links", "--source", "db", "--include-frontmatter",
        "--dir", GBRAIN_SRC,
    )
    _print_tail(links_result.stdout, 10)

    # Extract timeline from body
    print("\nExtracting timeline...")
    timeline_result = run_gbrain(
        "extract", "timeline", "--source", "db", "--dir", GBRAIN_SRC
    )
    _print_tail(timeline_result.stdout, 10)

    # Stats
    print("\nBrain stats:")
    _print_tail(run_gbrain("stats").stdout, 15)

# Run the convert / commit / import pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()