Files
hermes-brain/scripts/org-to-gbrain.py
2026-05-23 11:44:48 +00:00

350 lines
11 KiB
Python

#!/usr/bin/env python3
"""Convert brain Org-mode files to markdown + YAML frontmatter and sync into gbrain."""
import subprocess, re, os, sys, glob
# Root of the Org-mode brain; its ideas/ subdirectory is what gets scanned.
BRAIN = "/root/brain"
# Directory that receives the generated markdown; also used as a git work tree.
GBRAIN_SRC = "/mnt/hermes/brain"
# Absolute path of the pandoc binary used for the org -> markdown conversion.
PANDOC = "/usr/bin/pandoc"
# gbrain executable inside the current user's ~/.bun/bin directory.
BUN = os.path.expanduser("~/.bun/bin/gbrain")

# Maps the slug of a top-level org file (file name without ".org") to the
# gbrain category directory it is written to. Lookups elsewhere in this file
# fall back to "concepts" for slugs that are not listed here.
ROUTING = {
    # Triad architecture, security, economics theory
    "triad-overview": "concepts",
    "agora": "concepts",
    "stoa": "concepts",
    "triad-index": "concepts",
    "domain-gate-packages": "concepts",
    "verification-appliance": "concepts",
    "verification-monopoly": "concepts",
    "infrastructure-lock-in": "concepts",
    "evaluation-harness": "concepts",
    "collective-regression-suite": "concepts",
    "lisp-machine-security": "concepts",
    "common-logic-iso-24707": "concepts",
    "self-driving-lisp-machine": "concepts",
    "lisp-economics": "concepts",
    "sufficiency-flip": "concepts",
    "time-estimates": "concepts",
    "cost-structure": "concepts",
    "gate-rule-encoding": "concepts",
    "biology-parallels": "concepts",
    "comparison-with-symbolics": "concepts",
    "upgrade-lifecycle": "concepts",
    "ai-industry-impact": "concepts",
    "moats": "concepts",
    "patent-strategy": "concepts",
    "licensing": "concepts",
    "verified-skill-marketplace": "concepts",
    "compute-marketplace": "concepts",
    "agora-usernames": "concepts",
    "pds-as-a-service": "concepts",
    "investment-thesis": "concepts",
    "compliance-framework-mapping": "concepts",
    # Strategy and competitive analysis. Most of these are still routed to
    # "concepts"; only the last two entries land in "ideas".
    "orders-of-magnitude-time": "concepts",
    "revenue-hub": "concepts",
    "agora-contracts": "concepts",
    "triad-systemic-effects": "concepts",
    "growth-strategy": "concepts",
    "competitive-analysis-2026-05": "ideas",
    "passepartout-economics": "ideas",
}
def find_org_files(base=None):
    """Recursively collect every ``.org`` file below the ideas directory.

    Args:
        base: Directory to scan. When omitted, ``{BRAIN}/ideas`` is used.

    Returns:
        A list of ``(slug, rel_path, abs_path)`` tuples, where ``slug`` is the
        file name without the ``.org`` suffix, ``rel_path`` is the path
        relative to ``base`` and ``abs_path`` is ``root`` joined with the file
        name. A missing ``base`` yields an empty list because ``os.walk``
        ignores the error by default.
    """
    if base is None:
        base = f"{BRAIN}/ideas"
    found = []
    for root, dirs, filenames in os.walk(base):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # directory list in place fixes the descent order, and sorting the
        # file names fixes the order within a directory, so repeated runs
        # process files (and resolve duplicate slugs) identically.
        dirs.sort()
        for fn in sorted(filenames):
            if not fn.endswith('.org'):
                continue
            abs_path = os.path.join(root, fn)
            found.append((fn[:-4], os.path.relpath(abs_path, base), abs_path))
    return found
def gbrain_target(rel_path):
    """Derive the markdown output path for an org file.

    Args:
        rel_path: Path of the org file relative to the scanned directory,
            using ``/`` separators and ending in ``.org``.

    Returns:
        ``{GBRAIN_SRC}/{category}/{slug}.md`` for a top-level file, where the
        category comes from ``ROUTING`` (default ``"concepts"``). A file in a
        subdirectory goes to ``{GBRAIN_SRC}/concepts/{subdirs}/{slug}.md``.
    """
    # The file name is always the LAST component. Taking a fixed index instead
    # would slice a directory name for files nested more than one level deep.
    *subdirs, filename = rel_path.split('/')
    slug = filename[:-4]  # drop the ".org" suffix
    if not subdirs:
        category = ROUTING.get(slug, "concepts")
        return f"{GBRAIN_SRC}/{category}/{slug}.md"
    nested = '/'.join(subdirs)
    return f"{GBRAIN_SRC}/concepts/{nested}/{slug}.md"
def gbrain_slug(rel_path):
    """Return the gbrain slug (e.g. ``concepts/time-estimates``) for an org file.

    Args:
        rel_path: Path of the org file relative to the scanned directory,
            using ``/`` separators and ending in ``.org``.

    Returns:
        ``{category}/{slug}`` for a top-level file, where the category comes
        from ``ROUTING`` (default ``"concepts"``). A file in a subdirectory
        yields ``concepts/{subdirs}/{slug}``.
    """
    # The file name is always the LAST component. Taking a fixed index instead
    # would slice a directory name for files nested more than one level deep.
    *subdirs, filename = rel_path.split('/')
    slug = filename[:-4]  # drop the ".org" suffix
    if not subdirs:
        category = ROUTING.get(slug, "concepts")
        return f"{category}/{slug}"
    nested = '/'.join(subdirs)
    return f"concepts/{nested}/{slug}"
def build_slug_map():
    """Map each org slug (file name without ``.org``) to its gbrain slug.

    The map is built from ``find_org_files()``. When two files share a file
    name, the one returned later replaces the earlier entry.
    """
    return {
        org_slug: gbrain_slug(rel)
        for org_slug, rel, _abs_path in find_org_files()
    }
def extract_org_links_and_body(src_path):
    """Read an org file and split it into header properties and body text.

    Args:
        src_path: Path of the org file; it is read as UTF-8.

    Returns:
        A ``(props, body)`` tuple. ``props`` may contain:

        * ``title``   - value of the ``#+title:`` keyword,
        * ``tags``    - list of tags from ``#+filetags:``,
        * ``org_id``  - first ``:ID:`` value found in the file,
        * ``created`` - ``YYYY-MM-DD`` date taken from a ``:CREATED: [...]``
          timestamp.

        ``body`` is the file content starting at the first line that is not
        blank, not inside a ``:PROPERTIES:`` ... ``:END:`` drawer and does not
        start with ``#+``. It is empty when no such line exists, which
        includes the case of a drawer that is never closed.
    """
    # Explicit encoding so the result does not depend on the process locale.
    with open(src_path, encoding="utf-8") as f:
        content = f.read()

    # Keywords are matched case-insensitively so "#+TITLE:" and "#+title:"
    # are treated alike.
    keyword_flags = re.MULTILINE | re.IGNORECASE
    props = {}

    # "[ \t]+" rather than "\s+": the separator must stay on the keyword's own
    # line, otherwise an empty keyword would capture the following line.
    m = re.search(r'^#\+title:[ \t]+(.+)$', content, keyword_flags)
    if m:
        props['title'] = m.group(1).strip()

    m = re.search(r'^#\+filetags:[ \t]+(.+)$', content, keyword_flags)
    if m:
        # Accept both ":a:b:" (colon-delimited) and "a b" (space-delimited).
        props['tags'] = [t for t in re.split(r'[:\s]+', m.group(1)) if t]

    m = re.search(r':ID:\s+([^\s]+)', content)
    if m:
        props['org_id'] = m.group(1)

    m = re.search(r':CREATED:\s+\[([^\]]+)\]', content)
    if m:
        # The raw value looks like "2026-05-23 Sat"; keep only the date part.
        date_m = re.match(r'(\d{4}-\d{2}-\d{2})', m.group(1))
        if date_m:
            props['created'] = date_m.group(1)

    # Locate the first body line by skipping the leading drawer, "#+" lines
    # and blank lines.
    lines = content.split('\n')
    start = len(lines)
    in_properties = False
    for i, line in enumerate(lines):
        stripped = line.strip()
        if in_properties:
            if stripped == ':END:':
                in_properties = False
            continue
        if stripped == ':PROPERTIES:':
            in_properties = True
            continue
        if not stripped or line.startswith('#+'):
            continue
        start = i
        break

    return props, '\n'.join(lines[start:])
def resolve_org_link(match, slug_map):
    """Rewrite one matched org file link so it points at its gbrain slug.

    ``match`` must expose the link target as group 1 and, optionally, the
    description as group 2; the target itself is used as the description when
    the match has fewer than two groups. If the target is a key of
    ``slug_map`` the result is ``[[file:<gbrain slug>.org][<description>]]``,
    otherwise the matched text is returned unchanged.
    """
    target = match.group(1)
    if match.lastindex >= 2:
        label = match.group(2)
    else:
        label = target
    if target not in slug_map:
        return match.group(0)
    return f"[[file:{slug_map[target]}.org][{label}]]"
def convert_body(body_text, slug_map):
    """Convert an org body to markdown, rewriting known cross-references.

    Every ``[[file:X.org][desc]]`` link whose ``X`` is a key of ``slug_map``
    is rewritten to point at the mapped gbrain slug before the text is piped
    through pandoc (``-f org -t markdown-smart``). Links with unknown targets
    are passed through untouched.

    Args:
        body_text: Org-mode text without the file header.
        slug_map: Mapping of org slug to gbrain slug.

    Returns:
        ``(md_body, link_refs)``. ``link_refs`` holds one
        ``{"slug", "type", "name"}`` dict per rewritten link, in document
        order and including duplicates. When pandoc exits non-zero, an error
        line is printed and ``(None, [])`` is returned.
    """
    org_link_re = re.compile(r'\[\[file:([^\]]+?)\.org\]\[([^\]]*?)\]\]')
    link_refs = []

    def replace_link(m):
        # Record the reference and inject the gbrain path in a single pass.
        target = m.group(1)
        if target not in slug_map:
            return m.group(0)
        gslug = slug_map[target]
        link_refs.append({
            "slug": gslug,
            "type": "references",
            "name": gslug,
        })
        return f"[[file:{gslug}.org][{m.group(2)}]]"

    processed_body = org_link_re.sub(replace_link, body_text)

    # Explicit UTF-8 on the pipe so non-ASCII text survives regardless of the
    # process locale.
    result = subprocess.run(
        [PANDOC, "-f", "org", "-t", "markdown-smart"],
        input=processed_body, capture_output=True, text=True,
        encoding="utf-8",
    )
    if result.returncode != 0:
        print(f" ERROR pandoc: {result.stderr[:200]}")
        return None, []

    md = result.stdout.strip()
    # The rewritten links are expected to come out of pandoc as
    # "[desc](concepts/foo.org)". Drop the ".org" suffix, but only inside a
    # markdown link destination (directly after "]"), so that parenthesised
    # prose such as "(archive.org)" is left alone.
    md = re.sub(r'\]\(([a-zA-Z0-9_/-]+)\.org\)', r'](\1)', md)
    return md, link_refs
def build_frontmatter(props, link_refs=None):
    """Build the YAML frontmatter block for a converted page.

    Args:
        props: Dict that may contain ``title`` (str), ``tags`` (list of str)
            and ``created`` (str); missing keys are simply omitted.
        link_refs: Optional list of dicts with at least ``slug`` and ``type``
            keys. Entries are de-duplicated by ``slug``, keeping the first
            occurrence.

    Returns:
        The frontmatter as a string that starts and ends with a ``---`` line
        and has no trailing newline.
    """
    def quoted(value):
        # Escape backslashes and double quotes so the value remains a valid
        # YAML double-quoted scalar.
        escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"'

    lines = ['---']
    if 'title' in props:
        lines.append(f'title: {quoted(props["title"])}')
    if 'tags' in props:
        tags_str = ', '.join(props['tags'])
        lines.append(f'tags: [{tags_str}]')
    if 'created' in props:
        lines.append(f'created: {props["created"]}')

    # A dict keeps insertion order, so this retains the first entry per slug.
    unique_links = {}
    for lr in link_refs or ():
        unique_links.setdefault(lr['slug'], lr)
    if unique_links:
        lines.append('links:')
        for lr in unique_links.values():
            # "type" must line up with "slug" to belong to the same list item.
            lines.append(f'  - slug: {quoted(lr["slug"])}')
            lines.append(f'    type: {quoted(lr["type"])}')

    lines.append('---')
    return '\n'.join(lines)
def add_timeline_entry(md_body, props):
    """Prepend a "Created" timeline bullet to the markdown body.

    The bullet is added only when ``props`` holds both ``created`` and
    ``title``; otherwise ``md_body`` is returned unchanged.
    """
    if 'created' not in props or 'title' not in props:
        return md_body
    bullet = f"- **{props['created']}** | Created — {props['title']}\n\n"
    return bullet + md_body
def _print_tail(text, limit, skip=()):
    """Print the last ``limit`` lines of ``text``, each indented by one space.

    Blank lines and lines containing any substring listed in ``skip`` are
    omitted.
    """
    for line in text.strip().split('\n')[-limit:]:
        if line.strip() and not any(marker in line for marker in skip):
            print(f" {line}")


def main():
    """Convert every org file to markdown, commit the result and sync gbrain.

    Steps, in order:

    1. Convert each file returned by ``find_org_files()`` and write it to its
       ``gbrain_target()`` path. Files for which the conversion returns
       ``None`` are skipped.
    2. Stage and commit everything in ``GBRAIN_SRC`` with git.
    3. Run ``gbrain import``; if it exits non-zero, report it and stop.
    4. Run the embed, link-extraction, timeline-extraction and stats commands,
       printing the tail of each command's output.
    """
    # Pre-build the slug map so cross-references can be resolved per file.
    slug_map = build_slug_map()
    imported = []
    for _slug, rel_path, src_path in find_org_files():
        dst_path = gbrain_target(rel_path)
        os.makedirs(os.path.dirname(dst_path), exist_ok=True)
        props, org_body = extract_org_links_and_body(src_path)
        md, link_refs = convert_body(org_body, slug_map)
        if md is None:
            continue
        frontmatter = build_frontmatter(props, link_refs)
        md = add_timeline_entry(md, props)
        # Explicit encoding: the timeline bullet contains a non-ASCII dash, so
        # the write must not depend on the process locale.
        with open(dst_path, 'w', encoding='utf-8') as f:
            f.write(frontmatter + '\n\n' + md + '\n')
        rel_dst = os.path.relpath(dst_path, GBRAIN_SRC)
        imported.append(rel_dst)
        print(f" OK {rel_dst}")
    print(f"\nConverted {len(imported)} files.")

    # Commit to git. --allow-empty keeps the commit from failing when nothing
    # changed; any other failure is reported instead of silently ignored.
    subprocess.run(["git", "-C", GBRAIN_SRC, "add", "-A"], capture_output=True)
    commit = subprocess.run(
        ["git", "-C", GBRAIN_SRC, "commit", "--allow-empty",
         "-m", "gbrain: sync converted org-mode brain files"],
        capture_output=True, text=True
    )
    if commit.returncode != 0:
        print(f" git commit exit code: {commit.returncode}")
        _print_tail(commit.stderr, 5)

    # Put ~/.bun/bin first on PATH for the gbrain subprocesses. PATH may be
    # unset, so read it with a default instead of indexing os.environ.
    bun_bin = f"{os.path.expanduser('~')}/.bun/bin"
    inherited_path = os.environ.get("PATH", "")
    env = {
        **os.environ,
        "PATH": f"{bun_bin}:{inherited_path}" if inherited_path else bun_bin,
    }

    def gbrain(*args):
        return subprocess.run(
            [BUN, *args], capture_output=True, text=True, env=env
        )

    # Import into gbrain
    print("\nImporting into gbrain...")
    result = gbrain("import", GBRAIN_SRC)
    _print_tail(result.stdout, 25, skip=('batch caps', 'max_batch_tokens'))
    if result.returncode != 0:
        print(f" gbrain import exit code: {result.returncode}")
        # Surface the reason for the failure as well as the exit code.
        _print_tail(result.stderr, 10)
        return

    # Remaining steps: (heading, gbrain arguments, output lines to show).
    steps = (
        ("\nGenerating embeddings...",
         ("embed", "--all"), 10),
        ("\nExtracting links from frontmatter...",
         ("extract", "links", "--source", "db", "--include-frontmatter",
          "--dir", GBRAIN_SRC), 10),
        ("\nExtracting timeline...",
         ("extract", "timeline", "--source", "db", "--dir", GBRAIN_SRC), 10),
        ("\nBrain stats:",
         ("stats",), 15),
    )
    for heading, args, limit in steps:
        print(heading)
        _print_tail(gbrain(*args).stdout, limit)
# Run the sync only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()