gbrain: sync converted org-mode brain files

2026-05-23 07:00:09 +00:00
parent cf0c9b1f9d
commit 9a95212d1a
1 changed files with 250 additions and 148 deletions
--- a/scripts/org-to-gbrain.py
+++ b/scripts/org-to-gbrain.py
@@ -7,130 +7,6 @@ GBRAIN_SRC = "/mnt/hermes/brain"
 PANDOC = "/usr/bin/pandoc"
 BUN = os.path.expanduser("~/.bun/bin/gbrain")

-def find_org_files():
-    """Scan ideas/ recursively for all .org files, return (slug, rel_path, abs_path)."""
-    files = []
-    base = f"{BRAIN}/ideas"
-    for root, dirs, filenames in os.walk(base):
-        for fn in filenames:
-            if not fn.endswith('.org'):
-                continue
-            abs_path = os.path.join(root, fn)
-            rel = os.path.relpath(abs_path, base)
-            # rel is like "compliance/hipaa.org" or "triad-overview.org"
-            name = fn[:-4]  # remove .org
-            files.append((name, rel, abs_path))
-    return files
-
-def gbrain_target(rel_path):
-    """Derive gbrain target path from org relative path.
-    
-    ideas/compliance/hipaa.org  → concepts/compliance/hipaa.md
-    ideas/triad-overview.org    → concepts/triad-overview.md (via routing dict)
-    ideas/competitive-analysis...→ ideas/competitive-analysis.md
-    """
-    parts = rel_path.split('/')
-    
-    if len(parts) == 1:
-        # Flat file in ideas/ root — use ROUTING dict
-        slug = parts[0][:-4] if parts[0].endswith('.org') else parts[0][:-4]
-        category = ROUTING.get(slug, "concepts")
-        return f"{GBRAIN_SRC}/{category}/{slug}.md"
-    else:
-        # In a subdirectory: ideas/compliance/foo.org → concepts/compliance/foo.md
-        subdir = parts[0]
-        slug = parts[1][:-4] if parts[1].endswith('.org') else parts[1][:-4]
-        return f"{GBRAIN_SRC}/concepts/{subdir}/{slug}.md"
-
-def extract_org_properties(src_path):
-    """Extract :PROPERTIES: drawer and #+title/#+filetags from an org file."""
-    props = {}
-    with open(src_path) as f:
-        content = f.read()
-    
-    # Extract title
-    m = re.search(r'^#\+title:\s+(.+)$', content, re.MULTILINE)
-    if m:
-        props['title'] = m.group(1).strip()
-    
-    # Extract tags
-    m = re.search(r'^#\+filetags:\s+(.+)$', content, re.MULTILINE)
-    if m:
-        tags = [t.strip(':') for t in m.group(1).split()]
-        props['tags'] = tags
-    
-    # Extract ID from PROPERTIES drawer
-    m = re.search(r':ID:\s+([^\s]+)', content)
-    if m:
-        props['org_id'] = m.group(1)
-    
-    # Extract CREATED
-    m = re.search(r':CREATED:\s+\[([^\]]+)\]', content)
-    if m:
-        props['created'] = m.group(1)
-    
-    return props
-
-def strip_org_header(src_path):
-    """Strip the Org-mode header block (PROPERTIES drawer + #+ directives)
-    before feeding to pandoc, so it doesn't produce raw {=org} blocks."""
-    with open(src_path) as f:
-        lines = f.readlines()
-    
-    # Find first non-header line
-    in_properties = False
-    start = 0
-    for i, line in enumerate(lines):
-        if line.strip() == ':PROPERTIES:':
-            in_properties = True
-        if in_properties and line.strip() == ':END:':
-            in_properties = False
-            start = i + 1
-            continue
-        if not in_properties:
-            # Skip #+ lines
-            if line.startswith('#+'):
-                start = i + 1
-                continue
-            # First real content
-            if line.strip():
-                start = i
-                break
-            start = i + 1
-    
-    return ''.join(lines[start:])
-
-def pandoc_convert(clean_body):
-    """Convert org body to markdown via pandoc (stdin mode)."""
-    result = subprocess.run(
-        [PANDOC, "-f", "org", "-t", "markdown-smart"],
-        input=clean_body, capture_output=True, text=True
-    )
-    if result.returncode != 0:
-        print(f"  ERROR pandoc: {result.stderr[:200]}")
-        return None
-    return result.stdout.strip()
-
-def build_frontmatter(props):
-    """Build YAML frontmatter string from extracted properties."""
-    lines = ['---']
-    if 'title' in props:
-        lines.append(f'title: "{props["title"]}"')
-    if 'tags' in props:
-        tags_str = ', '.join(props['tags'])
-        lines.append(f'tags: [{tags_str}]')
-    if 'created' in props:
-        lines.append(f'created: {props["created"]}')
-    lines.append('---')
-    return '\n'.join(lines)
-
-def postprocess_links(md_text):
-    """Convert pandoc's markdown links to gbrain-friendly format."""
-    # Pandoc converts [[file:foo.org][desc]] to [desc](foo.org)
-    # Strip .org extensions from relative links
-    md_text = re.sub(r'\(([a-zA-Z0-9_-]+)\.org\)', r'(\1)', md_text)
-    return md_text
-
 ROUTING = {
    # Concepts — triad architecture, security, economics theory
    "triad-overview": "concepts",
@@ -170,34 +46,230 @@ ROUTING = {
    "passepartout-economics": "ideas",
 }

+def find_org_files():
+    """Collect every .org file beneath the ideas/ tree.
+
+    Returns a list of (slug, rel_path, abs_path) tuples; slug is the file
+    name without ".org" and rel_path is relative to ideas/.
+    """
+    ideas_root = f"{BRAIN}/ideas"
+    found = []
+    for folder, _subdirs, names in os.walk(ideas_root):
+        for org_name in (n for n in names if n.endswith('.org')):
+            full = os.path.join(folder, org_name)
+            found.append((org_name[:-4], os.path.relpath(full, ideas_root), full))
+    return found
+
+def gbrain_target(rel_path):
+    """Return the absolute markdown path a converted org file is written to.
+
+    rel_path is the org file's path relative to ideas/ (for example
+    "compliance/hipaa.org"). The destination is GBRAIN_SRC joined with the
+    page's gbrain slug plus ".md". Delegating to gbrain_slug() keeps the
+    on-disk location and the slug used for cross-reference links derived
+    from one routing rule instead of two copies that can drift apart.
+    """
+    # e.g. "compliance/hipaa.org" -> f"{GBRAIN_SRC}/concepts/compliance/hipaa.md"
+    return f"{GBRAIN_SRC}/{gbrain_slug(rel_path)}.md"
+
+def gbrain_slug(rel_path):
+    """Return the gbrain slug (e.g. 'concepts/time-estimates') for an org rel_path.
+
+    Top-level files are routed via ROUTING (default "concepts"); files in
+    subdirectories keep every directory level under "concepts/".
+    """
+    *subdirs, filename = rel_path.split('/')
+    slug = filename[:-4]  # drop the 4-character ".org" suffix
+    if subdirs:
+        return '/'.join(["concepts", *subdirs, slug])
+    return f"{ROUTING.get(slug, 'concepts')}/{slug}"
+
+def build_slug_map():
+    """Map each org file's bare name (no .org) to its gbrain slug.
+
+    When two files share a bare name, the one found later wins.
+    """
+    return {name: gbrain_slug(rel) for name, rel, _abs in find_org_files()}
+
+def extract_org_links_and_body(src_path):
+    """Read an org file and split it into metadata and body text.
+
+    Returns (props, body). props may hold 'title' (#+title), 'tags'
+    (#+filetags), 'org_id' (:ID:) and 'created' (the YYYY-MM-DD part of
+    :CREATED: [...]); body is the text left after the leading PROPERTIES
+    drawer and #+ lines. Despite the name, no links are extracted here.
+    """
+    with open(src_path, encoding="utf-8") as f:  # do not depend on the locale
+        content = f.read()
+
+    props = {}
+    # Org keywords are case-insensitive, so "#+TITLE:" must match as well.
+    keyword_flags = re.MULTILINE | re.IGNORECASE
+
+    m = re.search(r'^#\+title:\s+(.+)$', content, keyword_flags)
+    if m:
+        props['title'] = m.group(1).strip()
+
+    m = re.search(r'^#\+filetags:\s+(.+)$', content, keyword_flags)
+    if m:
+        # Tags may be colon-joined (":a:b:") or space-separated; split on both.
+        props['tags'] = [t for t in re.split(r'[:\s]+', m.group(1)) if t]
+
+    m = re.search(r':ID:\s+([^\s]+)', content)
+    if m:
+        props['org_id'] = m.group(1)
+
+    # Keep only the date from a CREATED stamp such as "[2026-05-23 Sat]".
+    m = re.search(r':CREATED:\s+\[([^\]]+)\]', content)
+    if m:
+        date_m = re.match(r'(\d{4}-\d{2}-\d{2})', m.group(1))
+        if date_m:
+            props['created'] = date_m.group(1)
+
+    # Move `start` past the drawer, #+ lines and blank lines before the body.
+    lines = content.split('\n')
+    in_properties = False
+    start = 0
+    for i, line in enumerate(lines):
+        if line.strip() == ':PROPERTIES:':
+            in_properties = True
+        if in_properties and line.strip() == ':END:':
+            in_properties = False
+            start = i + 1
+            continue
+        if not in_properties:
+            if line.startswith('#+'):
+                start = i + 1
+                continue
+            if line.strip():
+                start = i
+                break
+            start = i + 1
+    return props, '\n'.join(lines[start:])
+
+def resolve_org_link(match, slug_map):
+    """Rewrite one matched [[file:target.org][desc]] link to its gbrain path.
+
+    Group 2 (description) is optional; unknown targets come back unchanged.
+    """
+    target = match.group(1)
+    if target not in slug_map:
+        return match.group(0)
+    # Safe here: group 1 matched, so lastindex is an int rather than None.
+    desc = match.group(2) if match.lastindex >= 2 else target
+    return f"[[file:{slug_map[target]}.org][{desc}]]"
+
+def convert_body(body_text, slug_map):
+    """Convert an org body to markdown, redirecting known cross-references.
+
+    Every [[file:X.org][desc]] link whose X is a key of slug_map is pointed
+    at the mapped gbrain slug before the text is piped through pandoc.
+
+    Returns (md_body, link_refs). link_refs holds one dict with "slug",
+    "type" and "name" keys per known link occurrence, in document order.
+    If pandoc exits non-zero the error is printed and (None, []) returned.
+    """
+    link_pattern = re.compile(r'\[\[file:([^\]]+?)\.org\]\[([^\]]*?)\]\]')
+    link_refs = []
+
+    def rewrite(found):
+        # Unknown targets are left exactly as written.
+        name = found.group(1)
+        if name not in slug_map:
+            return found.group(0)
+        mapped = slug_map[name]
+        link_refs.append({
+            "slug": mapped,
+            "type": "references",
+            "name": mapped,
+        })
+        return f"[[file:{mapped}.org][{found.group(2)}]]"
+
+    # One substitution pass both records the references and rewrites them.
+    rewritten = link_pattern.sub(rewrite, body_text)
+
+    proc = subprocess.run(
+        [PANDOC, "-f", "org", "-t", "markdown-smart"],
+        input=rewritten, capture_output=True, text=True
+    )
+    if proc.returncode != 0:
+        print(f"  ERROR pandoc: {proc.stderr[:200]}")
+        return None, []
+
+    markdown = proc.stdout.strip()
+
+    # Drop the .org extension from parenthesised link targets in the output.
+    markdown = re.sub(r'\(([a-zA-Z0-9_/-]+)\.org\)', r'(\1)', markdown)
+
+    return markdown, link_refs
+
+def build_frontmatter(props, link_refs=None):
+    """Build the YAML frontmatter block for a converted page.
+
+    props may contain 'title', 'tags' (list of str) and 'created'; absent
+    keys are omitted. link_refs is an optional list of dicts with 'slug'
+    and 'type' keys, written under 'links:' once per distinct slug (first
+    occurrence wins). Returns the block as a string delimited by '---'
+    lines, without a trailing newline.
+    """
+    import json  # JSON string syntax is valid for YAML double-quoted scalars
+    lines = ['---']
+    if 'title' in props:
+        # Escape quotes/backslashes so such titles cannot break the YAML.
+        lines.append(f'title: {json.dumps(props["title"], ensure_ascii=False)}')
+    if 'tags' in props:
+        lines.append(f'tags: [{", ".join(props["tags"])}]')
+    if 'created' in props:
+        lines.append(f'created: {props["created"]}')
+    # Keep the first reference per slug; dict preserves insertion order.
+    unique_links = {}
+    for lr in link_refs or []:
+        unique_links.setdefault(lr['slug'], lr)
+    if unique_links:
+        lines.append('links:')
+        for lr in unique_links.values():
+            lines.append(f'  - slug: {json.dumps(lr["slug"], ensure_ascii=False)}')
+            lines.append(f'    type: {json.dumps(lr["type"], ensure_ascii=False)}')
+    lines.append('---')
+    return '\n'.join(lines)
+
+def add_timeline_entry(md_body, props):
+    """Prefix md_body with a "created" timeline bullet when props has both
+    'created' and 'title'; otherwise return md_body unchanged."""
+    if not ('created' in props and 'title' in props):
+        return md_body
+    created, heading = props['created'], props['title']
+    bullet = f"- **{created}** | Created — {heading}\n\n"
+    return bullet + md_body
+
 def main():
+    # Pre-build slug map for all org files
+    slug_map = build_slug_map()
    imported = []

    for slug, rel_path, src_path in find_org_files():
        dst_path = gbrain_target(rel_path)
-        
-        # Create parent directories
        os.makedirs(os.path.dirname(dst_path), exist_ok=True)

-        # Extract frontmatter from org properties
-        props = extract_org_properties(src_path)
+        # Extract properties and body from org file
+        props, org_body = extract_org_links_and_body(src_path)

-        # Strip org header and convert body to markdown
-        clean = strip_org_header(src_path)
-        md = pandoc_convert(clean)
+        # Convert body to markdown, collecting links along the way
+        md, link_refs = convert_body(org_body, slug_map)
        if md is None:
            continue

-        md = postprocess_links(md)
+        # Build frontmatter with links
+        frontmatter = build_frontmatter(props, link_refs)
+
+        # Add timeline entry if date exists
+        md = add_timeline_entry(md, props)

-        # Assemble: YAML frontmatter + markdown body
-        frontmatter = build_frontmatter(props)
        full = frontmatter + '\n\n' + md + '\n'

        with open(dst_path, 'w') as f:
            f.write(full)

-        # Show relative path for clarity
        rel_dst = os.path.relpath(dst_path, GBRAIN_SRC)
        imported.append(rel_dst)
        print(f"  OK  {rel_dst}")
@@ -219,7 +291,6 @@ def main():
        [BUN, "import", GBRAIN_SRC],
        capture_output=True, text=True, env=env
    )
-    # Show last 20 lines of stdout (skip noise)
    out_lines = result.stdout.strip().split('\n')
    for line in out_lines[-25:]:
        if line.strip() and 'batch caps' not in line and 'max_batch_tokens' not in line:
@@ -239,5 +310,36 @@ def main():
        if line.strip():
            print(f"  {line}")

+    # Extract links from frontmatter (now that pages are imported with links:)
+    print("\nExtracting links from frontmatter...")
+    result3 = subprocess.run(
+        [BUN, "extract", "links", "--source", "db", "--include-frontmatter",
+         "--dir", GBRAIN_SRC],
+        capture_output=True, text=True, env=env
+    )
+    for line in result3.stdout.strip().split('\n')[-10:]:
+        if line.strip():
+            print(f"  {line}")
+
+    # Extract timeline from body
+    print("\nExtracting timeline...")
+    result4 = subprocess.run(
+        [BUN, "extract", "timeline", "--source", "db", "--dir", GBRAIN_SRC],
+        capture_output=True, text=True, env=env
+    )
+    for line in result4.stdout.strip().split('\n')[-10:]:
+        if line.strip():
+            print(f"  {line}")
+
+    # Stats
+    print("\nBrain stats:")
+    result5 = subprocess.run(
+        [BUN, "stats"],
+        capture_output=True, text=True, env=env
+    )
+    for line in result5.stdout.strip().split('\n')[-15:]:
+        if line.strip():
+            print(f"  {line}")
+
 if __name__ == "__main__":
    main()