From 9a95212d1aef199df679f536772e7f61bdac1971 Mon Sep 17 00:00:00 2001
From: Hermes <hermes@hermes.local>
Date: Sat, 23 May 2026 07:00:09 +0000
Subject: [PATCH] gbrain: sync converted org-mode brain files

---
 scripts/org-to-gbrain.py | 398 ++++++++++++++++++++++++---------------
 1 file changed, 250 insertions(+), 148 deletions(-)

diff --git a/scripts/org-to-gbrain.py b/scripts/org-to-gbrain.py
index e013b23..c303b83 100644
--- a/scripts/org-to-gbrain.py
+++ b/scripts/org-to-gbrain.py
@@ -7,130 +7,6 @@ GBRAIN_SRC = "/mnt/hermes/brain"
 PANDOC = "/usr/bin/pandoc"
 BUN = os.path.expanduser("~/.bun/bin/gbrain")
 
-def find_org_files():
-    """Scan ideas/ recursively for all .org files, return (slug, rel_path, abs_path)."""
-    files = []
-    base = f"{BRAIN}/ideas"
-    for root, dirs, filenames in os.walk(base):
-        for fn in filenames:
-            if not fn.endswith('.org'):
-                continue
-            abs_path = os.path.join(root, fn)
-            rel = os.path.relpath(abs_path, base)
-            # rel is like "compliance/hipaa.org" or "triad-overview.org"
-            name = fn[:-4]  # remove .org
-            files.append((name, rel, abs_path))
-    return files
-
-def gbrain_target(rel_path):
-    """Derive gbrain target path from org relative path.
-    
-    ideas/compliance/hipaa.org  → concepts/compliance/hipaa.md
-    ideas/triad-overview.org    → concepts/triad-overview.md (via routing dict)
-    ideas/competitive-analysis...→ ideas/competitive-analysis.md
-    """
-    parts = rel_path.split('/')
-    
-    if len(parts) == 1:
-        # Flat file in ideas/ root — use ROUTING dict
-        slug = parts[0][:-4] if parts[0].endswith('.org') else parts[0][:-4]
-        category = ROUTING.get(slug, "concepts")
-        return f"{GBRAIN_SRC}/{category}/{slug}.md"
-    else:
-        # In a subdirectory: ideas/compliance/foo.org → concepts/compliance/foo.md
-        subdir = parts[0]
-        slug = parts[1][:-4] if parts[1].endswith('.org') else parts[1][:-4]
-        return f"{GBRAIN_SRC}/concepts/{subdir}/{slug}.md"
-
-def extract_org_properties(src_path):
-    """Extract :PROPERTIES: drawer and #+title/#+filetags from an org file."""
-    props = {}
-    with open(src_path) as f:
-        content = f.read()
-    
-    # Extract title
-    m = re.search(r'^#\+title:\s+(.+)$', content, re.MULTILINE)
-    if m:
-        props['title'] = m.group(1).strip()
-    
-    # Extract tags
-    m = re.search(r'^#\+filetags:\s+(.+)$', content, re.MULTILINE)
-    if m:
-        tags = [t.strip(':') for t in m.group(1).split()]
-        props['tags'] = tags
-    
-    # Extract ID from PROPERTIES drawer
-    m = re.search(r':ID:\s+([^\s]+)', content)
-    if m:
-        props['org_id'] = m.group(1)
-    
-    # Extract CREATED
-    m = re.search(r':CREATED:\s+\[([^\]]+)\]', content)
-    if m:
-        props['created'] = m.group(1)
-    
-    return props
-
-def strip_org_header(src_path):
-    """Strip the Org-mode header block (PROPERTIES drawer + #+ directives)
-    before feeding to pandoc, so it doesn't produce raw {=org} blocks."""
-    with open(src_path) as f:
-        lines = f.readlines()
-    
-    # Find first non-header line
-    in_properties = False
-    start = 0
-    for i, line in enumerate(lines):
-        if line.strip() == ':PROPERTIES:':
-            in_properties = True
-        if in_properties and line.strip() == ':END:':
-            in_properties = False
-            start = i + 1
-            continue
-        if not in_properties:
-            # Skip #+ lines
-            if line.startswith('#+'):
-                start = i + 1
-                continue
-            # First real content
-            if line.strip():
-                start = i
-                break
-            start = i + 1
-    
-    return ''.join(lines[start:])
-
-def pandoc_convert(clean_body):
-    """Convert org body to markdown via pandoc (stdin mode)."""
-    result = subprocess.run(
-        [PANDOC, "-f", "org", "-t", "markdown-smart"],
-        input=clean_body, capture_output=True, text=True
-    )
-    if result.returncode != 0:
-        print(f"  ERROR pandoc: {result.stderr[:200]}")
-        return None
-    return result.stdout.strip()
-
-def build_frontmatter(props):
-    """Build YAML frontmatter string from extracted properties."""
-    lines = ['---']
-    if 'title' in props:
-        lines.append(f'title: "{props["title"]}"')
-    if 'tags' in props:
-        tags_str = ', '.join(props['tags'])
-        lines.append(f'tags: [{tags_str}]')
-    if 'created' in props:
-        lines.append(f'created: {props["created"]}')
-    lines.append('---')
-    return '\n'.join(lines)
-
-def postprocess_links(md_text):
-    """Convert pandoc's markdown links to gbrain-friendly format."""
-    # Pandoc converts [[file:foo.org][desc]] to [desc](foo.org)
-    # Strip .org extensions from relative links
-    md_text = re.sub(r'\(([a-zA-Z0-9_-]+)\.org\)', r'(\1)', md_text)
-    return md_text
-
 ROUTING = {
     # Concepts — triad architecture, security, economics theory
     "triad-overview": "concepts",
@@ -170,40 +46,236 @@ ROUTING = {
     "passepartout-economics": "ideas",
 }
 
+def find_org_files():
+    """Collect every .org file under BRAIN/ideas (recursive).
+
+    Returns a list of (name, rel_path, abs_path); rel_path is relative to ideas/.
+    """
+    ideas_root = f"{BRAIN}/ideas"
+    found = []
+    for dirpath, _subdirs, names in os.walk(ideas_root):
+        for fn in names:
+            if fn.endswith('.org'):
+                abs_path = os.path.join(dirpath, fn)
+                found.append((fn[:-4], os.path.relpath(abs_path, ideas_root), abs_path))
+    return found
+
+def gbrain_target(rel_path):
+    """Return the absolute markdown path under GBRAIN_SRC for an org rel_path.
+
+    Delegates to gbrain_slug() so the file path and the slug cannot diverge.
+    """
+    return f"{GBRAIN_SRC}/{gbrain_slug(rel_path)}.md"
+
+def gbrain_slug(rel_path):
+    """Return the gbrain slug (e.g. 'concepts/time-estimates') for an org rel_path.
+
+    A file directly in ideas/ is routed through ROUTING (default "concepts");
+    a file in a subdirectory keeps its whole directory path under concepts/.
+    """
+    *dirs, filename = rel_path.split('/')
+    slug = filename[:-4]  # drop the ".org" suffix
+    if not dirs:
+        category = ROUTING.get(slug, "concepts")
+        return f"{category}/{slug}"
+    # find_org_files() walks recursively, so rel_path can be nested more than
+    # one level deep; keep every directory rather than treating parts[1] as
+    # the filename.
+    subdir = '/'.join(dirs)
+    return f"concepts/{subdir}/{slug}"
+
+def build_slug_map():
+    """Map each org filename (without .org) to its gbrain slug.
+
+    When two org files share a filename, the one found last wins.
+    """
+    return {name: gbrain_slug(rel) for name, rel, _abs in find_org_files()}
+
+def extract_org_links_and_body(src_path):
+    """Read an org file; return (props, body). props may hold title, tags, org_id,
+    created (YYYY-MM-DD); body is the text after the leading drawer/#+ header."""
+    with open(src_path) as f:
+        content = f.read()
+
+    props = {}
+
+    # Title: first "#+title:" line anywhere in the file (match is case-sensitive)
+    m = re.search(r'^#\+title:\s+(.+)$', content, re.MULTILINE)
+    if m:
+        props['title'] = m.group(1).strip()
+
+    # Tags: split on whitespace, outer colons stripped (":a:b:" stays one tag "a:b")
+    m = re.search(r'^#\+filetags:\s+(.+)$', content, re.MULTILINE)
+    if m:
+        tags = [t.strip(':') for t in m.group(1).split()]
+        props['tags'] = tags
+
+    # ID: first ":ID:" occurrence in the file (not restricted to the top drawer)
+    m = re.search(r':ID:\s+([^\s]+)', content)
+    if m:
+        props['org_id'] = m.group(1)
+
+    # CREATED: first bracketed ":CREATED: [...]" timestamp in the file
+    m = re.search(r':CREATED:\s+\[([^\]]+)\]', content)
+    if m:
+        created_raw = m.group(1)  # e.g. "2026-05-23 Sat"
+        # Keep only a leading YYYY-MM-DD; 'created' stays unset if there is none
+        date_m = re.match(r'(\d{4}-\d{2}-\d{2})', created_raw)
+        if date_m:
+            props['created'] = date_m.group(1)
+
+    # Skip the leading drawer, "#+" lines and blank lines; start = first body line
+    lines = content.split('\n')
+    in_properties = False
+    start = 0
+    for i, line in enumerate(lines):
+        if line.strip() == ':PROPERTIES:':
+            in_properties = True
+        if in_properties and line.strip() == ':END:':
+            in_properties = False
+            start = i + 1
+            continue
+        if not in_properties:
+            if line.startswith('#+'):
+                start = i + 1
+                continue
+            if line.strip():
+                start = i
+                break
+            start = i + 1
+
+    body = '\n'.join(lines[start:])
+    return props, body
+
+def resolve_org_link(match, slug_map):
+    """Rewrite one [[file:target.org][desc]] regex match to its gbrain path.
+
+    Known targets become [[file:<gbrain slug>.org][desc]]; unknown targets
+    are returned verbatim. Expects group 1 = target, group 2 = description.
+    """
+    target = match.group(1)
+    desc = match.group(2) if match.lastindex >= 2 else target
+    if target not in slug_map:
+        return match.group(0)
+    return f"[[file:{slug_map[target]}.org][{desc}]]"
+
+def convert_body(body_text, slug_map):
+    """Rewrite org cross-references to gbrain paths, then run pandoc.
+
+    Every [[file:X.org][desc]] link whose X is a key of slug_map is rewritten
+    to point at slug_map[X] and recorded as a "references" link.
+
+    Returns (md_body, link_refs): the markdown text and a list of
+    {slug, type, name} dicts in order of appearance (duplicates included).
+    On pandoc failure an error is printed and (None, []) is returned.
+    """
+    org_link_re = re.compile(r'\[\[file:([^\]]+?)\.org\]\[([^\]]*?)\]\]')
+    link_refs = []
+
+    def rewrite(m):
+        # Record and re-target known links in one pass; unknown targets are
+        # returned exactly as written.
+        org_name, desc = m.group(1), m.group(2)
+        if org_name not in slug_map:
+            return m.group(0)
+        gbrain_path = slug_map[org_name]
+        link_refs.append({
+            "slug": gbrain_path,
+            "type": "references",
+            "name": gbrain_path,
+        })
+        return f"[[file:{gbrain_path}.org][{desc}]]"
+
+    prefixed_body = org_link_re.sub(rewrite, body_text)
+
+    pandoc = subprocess.run(
+        [PANDOC, "-f", "org", "-t", "markdown-smart"],
+        input=prefixed_body, capture_output=True, text=True
+    )
+    if pandoc.returncode != 0:
+        print(f"  ERROR pandoc: {pandoc.stderr[:200]}")
+        return None, []
+
+    # Pandoc renders [[file:concepts/foo.org][desc]] as [desc](concepts/foo.org);
+    # drop the .org extension from such link targets.
+    markdown = pandoc.stdout.strip()
+    markdown = re.sub(r'\(([a-zA-Z0-9_/-]+)\.org\)', r'(\1)', markdown)
+
+    return markdown, link_refs
+
+def build_frontmatter(props, link_refs=None):
+    """Build the YAML frontmatter block from properties and link references.
+
+    Emits title/tags/created when present, then one links: entry per distinct slug.
+    """
+    lines = ['---']
+    if 'title' in props:
+        # Escape backslashes and quotes so the double-quoted YAML scalar stays valid.
+        title = props['title'].replace('\\', '\\\\').replace('"', '\\"')
+        lines.append(f'title: "{title}"')
+    if 'tags' in props:
+        tags_str = ', '.join(props['tags'])
+        lines.append(f'tags: [{tags_str}]')
+    if 'created' in props:
+        lines.append(f'created: {props["created"]}')
+    # Deduplicate links by slug, preserving first-seen order.
+    seen = set()
+    unique_links = []
+    for lr in link_refs or []:
+        if lr['slug'] not in seen:
+            seen.add(lr['slug'])
+            unique_links.append(lr)
+    if unique_links:
+        lines.append('links:')
+        for lr in unique_links:
+            lines.append(f'  - slug: "{lr["slug"]}"')
+            lines.append(f'    type: "{lr["type"]}"')
+    lines.append('---')
+    return '\n'.join(lines)
+
+def add_timeline_entry(md_body, props):
+    """Prepend a "Created" timeline bullet when props has both created and title.
+
+    Returns md_body unchanged if either key is missing.
+    """
+    if 'created' not in props or 'title' not in props:
+        return md_body
+    return f"- **{props['created']}** | Created — {props['title']}\n\n" + md_body
+
 def main():
+    # Pre-build slug map for all org files
+    slug_map = build_slug_map()
     imported = []
-    
+
     for slug, rel_path, src_path in find_org_files():
         dst_path = gbrain_target(rel_path)
-        
-        # Create parent directories
         os.makedirs(os.path.dirname(dst_path), exist_ok=True)
-        
-        # Extract frontmatter from org properties
-        props = extract_org_properties(src_path)
-        
-        # Strip org header and convert body to markdown
-        clean = strip_org_header(src_path)
-        md = pandoc_convert(clean)
+
+        # Extract properties and body from org file
+        props, org_body = extract_org_links_and_body(src_path)
+
+        # Convert body to markdown, collecting links along the way
+        md, link_refs = convert_body(org_body, slug_map)
         if md is None:
             continue
-        
-        md = postprocess_links(md)
-        
-        # Assemble: YAML frontmatter + markdown body
-        frontmatter = build_frontmatter(props)
+
+        # Build frontmatter with links
+        frontmatter = build_frontmatter(props, link_refs)
+
+        # Add timeline entry if date exists
+        md = add_timeline_entry(md, props)
+
         full = frontmatter + '\n\n' + md + '\n'
-        
+
         with open(dst_path, 'w') as f:
             f.write(full)
-        
-        # Show relative path for clarity
+
         rel_dst = os.path.relpath(dst_path, GBRAIN_SRC)
         imported.append(rel_dst)
         print(f"  OK  {rel_dst}")
-    
+
     print(f"\nConverted {len(imported)} files.")
-    
+
     # Commit to git
     subprocess.run(["git", "-C", GBRAIN_SRC, "add", "-A"], capture_output=True)
     subprocess.run(
@@ -211,7 +283,7 @@ def main():
          "-m", "gbrain: sync converted org-mode brain files"],
         capture_output=True, text=True
     )
-    
+
     # Import into gbrain
     print("\nImporting into gbrain...")
     env = {**os.environ, "PATH": f"{os.path.expanduser('~')}/.bun/bin:{os.environ['PATH']}"}
@@ -219,16 +291,15 @@ def main():
         [BUN, "import", GBRAIN_SRC],
         capture_output=True, text=True, env=env
     )
-    # Show last 20 lines of stdout (skip noise)
     out_lines = result.stdout.strip().split('\n')
     for line in out_lines[-25:]:
         if line.strip() and 'batch caps' not in line and 'max_batch_tokens' not in line:
             print(f"  {line}")
-    
+
     if result.returncode != 0:
         print(f"  gbrain import exit code: {result.returncode}")
         return
-    
+
     # Embed
     print("\nGenerating embeddings...")
     result2 = subprocess.run(
@@ -239,5 +310,36 @@ def main():
         if line.strip():
             print(f"  {line}")
 
+    # Extract links from frontmatter (now that pages are imported with links:)
+    print("\nExtracting links from frontmatter...")
+    result3 = subprocess.run(
+        [BUN, "extract", "links", "--source", "db", "--include-frontmatter",
+         "--dir", GBRAIN_SRC],
+        capture_output=True, text=True, env=env
+    )
+    for line in result3.stdout.strip().split('\n')[-10:]:
+        if line.strip():
+            print(f"  {line}")
+
+    # Extract timeline from body
+    print("\nExtracting timeline...")
+    result4 = subprocess.run(
+        [BUN, "extract", "timeline", "--source", "db", "--dir", GBRAIN_SRC],
+        capture_output=True, text=True, env=env
+    )
+    for line in result4.stdout.strip().split('\n')[-10:]:
+        if line.strip():
+            print(f"  {line}")
+
+    # Stats
+    print("\nBrain stats:")
+    result5 = subprocess.run(
+        [BUN, "stats"],
+        capture_output=True, text=True, env=env
+    )
+    for line in result5.stdout.strip().split('\n')[-15:]:
+        if line.strip():
+            print(f"  {line}")
+
 if __name__ == "__main__":
     main()