gbrain: sync converted org-mode brain files

2026-05-23 07:00:09 +00:00
parent cf0c9b1f9d
commit 9a95212d1a
1 changed files with 250 additions and 148 deletions
--- a/scripts/org-to-gbrain.py
+++ b/scripts/org-to-gbrain.py
@@ -7,130 +7,6 @@ GBRAIN_SRC = "/mnt/hermes/brain"
 PANDOC = "/usr/bin/pandoc"
 BUN = os.path.expanduser("~/.bun/bin/gbrain")
 def find_org_files():
    """Collect every ``.org`` file found under the ``ideas/`` tree of BRAIN.

    Returns:
        A list of ``(name, rel_path, abs_path)`` tuples. ``name`` is the file
        name without its ``.org`` suffix, ``rel_path`` is the path relative
        to the ``ideas/`` directory (e.g. "compliance/hipaa.org" or
        "triad-overview.org") and ``abs_path`` is the joined path on disk.
    """
    ideas_root = f"{BRAIN}/ideas"
    found = []
    for current_dir, _subdirs, entries in os.walk(ideas_root):
        org_names = [entry for entry in entries if entry.endswith('.org')]
        for org_name in org_names:
            full_path = os.path.join(current_dir, org_name)
            found.append((
                org_name[:-4],  # file name with the ".org" suffix removed
                os.path.relpath(full_path, ideas_root),
                full_path,
            ))
    return found
 def gbrain_target(rel_path):
    """Derive the gbrain markdown target path from an org path relative to ideas/.

    Examples:
        "triad-overview.org"      -> {GBRAIN_SRC}/<ROUTING category>/triad-overview.md
        "compliance/hipaa.org"    -> {GBRAIN_SRC}/concepts/compliance/hipaa.md
        "compliance/us/hipaa.org" -> {GBRAIN_SRC}/concepts/compliance/us/hipaa.md

    Args:
        rel_path: '/'-separated path relative to the ideas/ directory.

    Returns:
        The destination ``.md`` path as a string.
    """
    parts = rel_path.split('/')
    filename = parts[-1]
    # Strip the suffix only when it really is ".org", so a name without the
    # extension is not silently truncated by four characters.
    slug = filename[:-4] if filename.endswith('.org') else filename
    if len(parts) == 1:
        # Flat file in ideas/ root: ROUTING picks the category, falling back
        # to "concepts" for slugs it does not list.
        category = ROUTING.get(slug, "concepts")
        return f"{GBRAIN_SRC}/{category}/{slug}.md"
    # Files in subdirectories always land under concepts/. The slug comes
    # from the last path component and every intermediate directory is kept,
    # so paths nested more than one level deep map correctly too.
    subdir = '/'.join(parts[:-1])
    return f"{GBRAIN_SRC}/concepts/{subdir}/{slug}.md"
 def extract_org_properties(src_path):
    """Extract title, file tags, ID and CREATED stamp from an org file.

    Args:
        src_path: Path of the org file to read (decoded as UTF-8).

    Returns:
        A dict that may contain:
          - 'title':   text of the ``#+title:`` keyword line
          - 'tags':    list of tags from the ``#+filetags:`` keyword line
          - 'org_id':  value following the first ``:ID:`` occurrence
          - 'created': text inside the brackets of the first ``:CREATED: [...]``
        Keys are omitted when the corresponding entry is not found.
    """
    props = {}
    with open(src_path, encoding='utf-8') as f:
        content = f.read()
    # Org keywords are case-insensitive (#+TITLE: and #+title: are the same),
    # and [ \t]+ keeps the match on the keyword's own line.
    keyword_flags = re.MULTILINE | re.IGNORECASE
    # Extract title
    m = re.search(r'^#\+title:[ \t]+(.+)$', content, keyword_flags)
    if m:
        props['title'] = m.group(1).strip()
    # Extract tags. Org writes file tags colon-delimited (":a:b:"); splitting
    # on colons as well as whitespace yields one entry per tag.
    m = re.search(r'^#\+filetags:[ \t]+(.+)$', content, keyword_flags)
    if m:
        props['tags'] = [t for t in re.split(r'[:\s]+', m.group(1)) if t]
    # Extract ID from PROPERTIES drawer
    m = re.search(r':ID:\s+([^\s]+)', content)
    if m:
        props['org_id'] = m.group(1)
    # Extract CREATED
    m = re.search(r':CREATED:\s+\[([^\]]+)\]', content)
    if m:
        props['created'] = m.group(1)
    return props
 def strip_org_header(src_path):
    """Return the text of an org file with its leading header removed.

    The header is everything before the first line of real content: a
    ``:PROPERTIES:`` ... ``:END:`` drawer, lines starting with ``#+`` and
    blank lines. Removing it keeps pandoc from emitting raw {=org} blocks.
    """
    with open(src_path) as handle:
        raw_lines = handle.readlines()

    inside_drawer = False
    body_start = 0
    for idx, raw in enumerate(raw_lines):
        text = raw.strip()
        if text == ':PROPERTIES:':
            inside_drawer = True
        if inside_drawer:
            # Drawer lines are skipped; the body can start no earlier than
            # the line after the closing :END:.
            if text == ':END:':
                inside_drawer = False
                body_start = idx + 1
            continue
        if text and not raw.startswith('#+'):
            # First line that is neither a directive nor blank.
            body_start = idx
            break
        body_start = idx + 1
    return ''.join(raw_lines[body_start:])
 def pandoc_convert(clean_body):
    """Feed an org-mode body to pandoc on stdin and return the markdown.

    Returns:
        pandoc's stdout with surrounding whitespace stripped, or None when
        pandoc exits non-zero (the first 200 characters of its stderr are
        printed in that case).
    """
    command = [PANDOC, "-f", "org", "-t", "markdown-smart"]
    proc = subprocess.run(command, input=clean_body, capture_output=True, text=True)
    if proc.returncode == 0:
        return proc.stdout.strip()
    print(f"  ERROR pandoc: {proc.stderr[:200]}")
    return None
 def build_frontmatter(props):
    """Build a YAML frontmatter block from extracted org properties.

    Args:
        props: Dict that may contain 'title' (str), 'tags' (list of str) and
            'created' (str). Missing keys are simply omitted from the output.

    Returns:
        The frontmatter as a string delimited by ``---`` lines, without a
        trailing newline.
    """
    lines = ['---']
    if 'title' in props:
        # The title is emitted as a YAML double-quoted scalar, so backslashes
        # and double quotes inside it must be escaped to keep the YAML valid.
        safe_title = props['title'].replace('\\', '\\\\').replace('"', '\\"')
        lines.append(f'title: "{safe_title}"')
    if 'tags' in props:
        tags_str = ', '.join(props['tags'])
        lines.append(f'tags: [{tags_str}]')
    if 'created' in props:
        lines.append(f'created: {props["created"]}')
    lines.append('---')
    return '\n'.join(lines)
 def postprocess_links(md_text):
    """Strip the ``.org`` extension from relative markdown link targets.

    Pandoc turns ``[[file:foo.org][desc]]`` into ``[desc](foo.org)``; gbrain
    wants ``[desc](foo)``. Only targets made of letters, digits, ``_`` and
    ``-`` are rewritten.

    Args:
        md_text: Markdown text produced by pandoc.

    Returns:
        The text with matching link targets rewritten.
    """
    # The lookbehind requires the "(" to follow a "]", i.e. to be a link
    # target; parenthesised prose such as "(example.org)" is left untouched.
    return re.sub(r'(?<=\])\(([a-zA-Z0-9_-]+)\.org\)', r'(\1)', md_text)
 ROUTING = {
    # Concepts — triad architecture, security, economics theory
    "triad-overview": "concepts",
@@ -170,34 +46,230 @@ ROUTING = {
    "passepartout-economics": "ideas",
 }
 def find_org_files():
    """Scan ideas/ recursively for all .org files, in a stable sorted order.

    Returns:
        A list of ``(name, rel_path, abs_path)`` tuples: the file name
        without ``.org``, the path relative to the ideas/ directory, and the
        joined path on disk.
    """
    files = []
    base = f"{BRAIN}/ideas"
    for root, dirs, filenames in os.walk(base):
        # os.walk yields entries in arbitrary order. Sorting `dirs` in place
        # fixes the descent order and sorting the file names fixes the order
        # within a directory, so repeated runs return the same list and any
        # consumer that keys on the bare name resolves duplicates the same way.
        dirs.sort()
        for fn in sorted(filenames):
            if not fn.endswith('.org'):
                continue
            abs_path = os.path.join(root, fn)
            rel = os.path.relpath(abs_path, base)
            name = fn[:-4]  # remove ".org"
            files.append((name, rel, abs_path))
    return files
 def gbrain_target(rel_path):
    """Derive the gbrain markdown target path from an org path relative to ideas/.

    Examples:
        "triad-overview.org"      -> {GBRAIN_SRC}/<ROUTING category>/triad-overview.md
        "compliance/hipaa.org"    -> {GBRAIN_SRC}/concepts/compliance/hipaa.md
        "compliance/us/hipaa.org" -> {GBRAIN_SRC}/concepts/compliance/us/hipaa.md

    Args:
        rel_path: '/'-separated path relative to the ideas/ directory.

    Returns:
        The destination ``.md`` path as a string.
    """
    parts = rel_path.split('/')
    filename = parts[-1]
    # Remove the suffix only when it really is ".org".
    slug = filename[:-4] if filename.endswith('.org') else filename
    if len(parts) == 1:
        # Flat file: ROUTING chooses the category, defaulting to "concepts".
        category = ROUTING.get(slug, "concepts")
        return f"{GBRAIN_SRC}/{category}/{slug}.md"
    # The slug comes from the last component and all intermediate directories
    # are kept, so files nested more than one level deep map correctly.
    subdir = '/'.join(parts[:-1])
    return f"{GBRAIN_SRC}/concepts/{subdir}/{slug}.md"
 def gbrain_slug(rel_path):
    """Return the gbrain slug for an org path relative to ideas/.

    Examples:
        "time-estimates.org"      -> "<ROUTING category>/time-estimates"
        "compliance/hipaa.org"    -> "concepts/compliance/hipaa"
        "compliance/us/hipaa.org" -> "concepts/compliance/us/hipaa"

    Args:
        rel_path: '/'-separated path relative to the ideas/ directory.

    Returns:
        The slug string, without a file extension.
    """
    parts = rel_path.split('/')
    filename = parts[-1]
    # Remove the suffix only when it really is ".org".
    slug = filename[:-4] if filename.endswith('.org') else filename
    if len(parts) == 1:
        # Flat file: ROUTING chooses the category, defaulting to "concepts".
        category = ROUTING.get(slug, "concepts")
        return f"{category}/{slug}"
    # The slug comes from the last component and all intermediate directories
    # are kept, so files nested more than one level deep map correctly.
    subdir = '/'.join(parts[:-1])
    return f"concepts/{subdir}/{slug}"
 def build_slug_map():
    """Map each org file's bare name (no ``.org``) to its gbrain slug.

    Names come from find_org_files() and slugs from gbrain_slug(). When two
    files share a bare name, the one returned later by find_org_files() wins.
    """
    return {
        org_name: gbrain_slug(rel)
        for org_name, rel, _abs in find_org_files()
    }
 def extract_org_links_and_body(src_path):
    """Read an org file and split it into metadata and header-free body.

    Args:
        src_path: Path of the org file to read (decoded as UTF-8).

    Returns:
        A ``(props, body)`` tuple. ``props`` may contain 'title', 'tags'
        (list of str), 'org_id' and 'created' (``YYYY-MM-DD``); keys are
        omitted when not found. ``body`` is the file text starting at the
        first line that is not part of the leading PROPERTIES drawer, a
        ``#+`` line, or blank.
    """
    with open(src_path, encoding='utf-8') as f:
        content = f.read()
    props = {}
    # Org keywords are case-insensitive (#+TITLE: and #+title: are the same),
    # and [ \t]+ keeps the match on the keyword's own line.
    keyword_flags = re.MULTILINE | re.IGNORECASE
    # Extract title
    m = re.search(r'^#\+title:[ \t]+(.+)$', content, keyword_flags)
    if m:
        props['title'] = m.group(1).strip()
    # Extract tags. Org writes file tags colon-delimited (":a:b:"); splitting
    # on colons as well as whitespace yields one entry per tag.
    m = re.search(r'^#\+filetags:[ \t]+(.+)$', content, keyword_flags)
    if m:
        props['tags'] = [t for t in re.split(r'[:\s]+', m.group(1)) if t]
    # Extract ID from PROPERTIES drawer
    m = re.search(r':ID:\s+([^\s]+)', content)
    if m:
        props['org_id'] = m.group(1)
    # Extract CREATED
    m = re.search(r':CREATED:\s+\[([^\]]+)\]', content)
    if m:
        created_raw = m.group(1)  # e.g. "2026-05-23 Sat"
        # Keep only the leading ISO date; drop weekday/time if present.
        date_m = re.match(r'(\d{4}-\d{2}-\d{2})', created_raw)
        if date_m:
            props['created'] = date_m.group(1)
    # Strip header for body: walk forward until the first real content line.
    lines = content.split('\n')
    in_properties = False
    start = 0
    for i, line in enumerate(lines):
        if line.strip() == ':PROPERTIES:':
            in_properties = True
        if in_properties and line.strip() == ':END:':
            in_properties = False
            start = i + 1
            continue
        if not in_properties:
            if line.startswith('#+'):
                # Directive line: body starts after it at the earliest.
                start = i + 1
                continue
            if line.strip():
                # First non-blank, non-directive line is where the body begins.
                start = i
                break
            start = i + 1
    body = '\n'.join(lines[start:])
    return props, body
 def resolve_org_link(match, slug_map):
    """Rewrite one ``[[file:target.org][desc]]`` regex match to its gbrain slug.

    Args:
        match: Regex match whose group 1 is the link target without ``.org``
            and whose optional group 2 is the description.
        slug_map: Mapping from org target name to gbrain slug.

    Returns:
        ``[[file:<gbrain slug>.org][desc]]`` when the target is in
        ``slug_map`` (the target itself serves as the description when the
        match has no second group); otherwise the matched text unchanged.
    """
    org_slug = match.group(1)
    label = org_slug if match.lastindex < 2 else match.group(2)
    if org_slug not in slug_map:
        return match.group(0)
    return f"[[file:{slug_map[org_slug]}.org][{label}]]"
 def convert_body(body_text, slug_map):
    """Convert an org body to markdown, resolving cross-references on the way.

    ``[[file:X.org][desc]]`` links whose X is a key of ``slug_map`` are
    rewritten to point at the gbrain slug before the text is piped to pandoc.

    Args:
        body_text: Org-mode text with the file header already removed.
        slug_map: Mapping from org target name to gbrain slug.

    Returns:
        ``(markdown, link_refs)``. ``link_refs`` holds one
        ``{"slug", "type", "name"}`` dict per resolvable link occurrence, in
        document order (duplicates included). When pandoc exits non-zero the
        error is printed and ``(None, [])`` is returned.
    """
    xref_pattern = re.compile(r'\[\[file:([^\]]+?)\.org\]\[([^\]]*?)\]\]')

    # Record every cross-reference that points at a known org file.
    link_refs = [
        {
            "slug": slug_map[hit.group(1)],
            "type": "references",
            "name": slug_map[hit.group(1)],
        }
        for hit in xref_pattern.finditer(body_text)
        if hit.group(1) in slug_map
    ]

    def _prefix_known_target(hit):
        # Swap the bare target for its gbrain slug; unknown targets stay as-is.
        org_slug, label = hit.group(1), hit.group(2)
        if org_slug not in slug_map:
            return hit.group(0)
        return f"[[file:{slug_map[org_slug]}.org][{label}]]"

    pandoc_input = xref_pattern.sub(_prefix_known_target, body_text)

    proc = subprocess.run(
        [PANDOC, "-f", "org", "-t", "markdown-smart"],
        input=pandoc_input, capture_output=True, text=True,
    )
    if proc.returncode != 0:
        print(f"  ERROR pandoc: {proc.stderr[:200]}")
        return None, []

    # Pandoc renders [[file:concepts/foo.org][desc]] as [desc](concepts/foo.org);
    # drop the .org extension from such targets.
    markdown = re.sub(r'\(([a-zA-Z0-9_/-]+)\.org\)', r'(\1)', proc.stdout.strip())
    return markdown, link_refs
 def build_frontmatter(props, link_refs=None):
    """Build a YAML frontmatter block from properties and link references.

    Args:
        props: Dict that may contain 'title' (str), 'tags' (list of str) and
            'created' (str). Missing keys are omitted from the output.
        link_refs: Optional list of dicts with 'slug' and 'type' keys. Entries
            are de-duplicated by slug, keeping the first occurrence.

    Returns:
        The frontmatter as a string delimited by ``---`` lines, without a
        trailing newline.
    """
    lines = ['---']
    if 'title' in props:
        # The title is emitted as a YAML double-quoted scalar, so backslashes
        # and double quotes inside it must be escaped to keep the YAML valid.
        safe_title = props['title'].replace('\\', '\\\\').replace('"', '\\"')
        lines.append(f'title: "{safe_title}"')
    if 'tags' in props:
        tags_str = ', '.join(props['tags'])
        lines.append(f'tags: [{tags_str}]')
    if 'created' in props:
        lines.append(f'created: {props["created"]}')
    if link_refs:
        # Keep only the first reference per slug, preserving document order.
        seen = set()
        unique_links = []
        for lr in link_refs:
            if lr['slug'] not in seen:
                seen.add(lr['slug'])
                unique_links.append(lr)
        lines.append('links:')
        for lr in unique_links:
            lines.append(f'  - slug: "{lr["slug"]}"')
            lines.append(f'    type: "{lr["type"]}"')
    lines.append('---')
    return '\n'.join(lines)
 def add_timeline_entry(md_body, props):
    """Prefix the markdown body with a "Created" timeline bullet.

    The bullet is added only when ``props`` has both 'created' and 'title';
    otherwise ``md_body`` is returned unchanged.
    """
    if not ('created' in props and 'title' in props):
        return md_body
    bullet = f"- **{props['created']}** | Created — {props['title']}\n\n"
    return bullet + md_body
 def main():
    # Pre-build slug map for all org files
    slug_map = build_slug_map()
    imported = []
    for slug, rel_path, src_path in find_org_files():
        dst_path = gbrain_target(rel_path)
        # Create parent directories
        os.makedirs(os.path.dirname(dst_path), exist_ok=True)
-        # Extract frontmatter from org properties
+        # Extract properties and body from org file
-        props = extract_org_properties(src_path)
+        props, org_body = extract_org_links_and_body(src_path)
-        # Strip org header and convert body to markdown
+        # Convert body to markdown, collecting links along the way
-        clean = strip_org_header(src_path)
+        md, link_refs = convert_body(org_body, slug_map)
        md = pandoc_convert(clean)
        if md is None:
            continue
-        md = postprocess_links(md)
+        # Build frontmatter with links
        frontmatter = build_frontmatter(props, link_refs)
        # Add timeline entry if date exists
        md = add_timeline_entry(md, props)
        # Assemble: YAML frontmatter + markdown body
        frontmatter = build_frontmatter(props)
        full = frontmatter + '\n\n' + md + '\n'
        with open(dst_path, 'w') as f:
            f.write(full)
        # Show relative path for clarity
        rel_dst = os.path.relpath(dst_path, GBRAIN_SRC)
        imported.append(rel_dst)
        print(f"  OK  {rel_dst}")
@@ -219,7 +291,6 @@ def main():
        [BUN, "import", GBRAIN_SRC],
        capture_output=True, text=True, env=env
    )
    # Show last 20 lines of stdout (skip noise)
    out_lines = result.stdout.strip().split('\n')
    for line in out_lines[-25:]:
        if line.strip() and 'batch caps' not in line and 'max_batch_tokens' not in line:
@@ -239,5 +310,36 @@ def main():
        if line.strip():
            print(f"  {line}")
    # Extract links from frontmatter (now that pages are imported with links:)
    print("\nExtracting links from frontmatter...")
    result3 = subprocess.run(
        [BUN, "extract", "links", "--source", "db", "--include-frontmatter",
         "--dir", GBRAIN_SRC],
        capture_output=True, text=True, env=env
    )
    for line in result3.stdout.strip().split('\n')[-10:]:
        if line.strip():
            print(f"  {line}")
    # Extract timeline from body
    print("\nExtracting timeline...")
    result4 = subprocess.run(
        [BUN, "extract", "timeline", "--source", "db", "--dir", GBRAIN_SRC],
        capture_output=True, text=True, env=env
    )
    for line in result4.stdout.strip().split('\n')[-10:]:
        if line.strip():
            print(f"  {line}")
    # Stats
    print("\nBrain stats:")
    result5 = subprocess.run(
        [BUN, "stats"],
        capture_output=True, text=True, env=env
    )
    for line in result5.stdout.strip().split('\n')[-15:]:
        if line.strip():
            print(f"  {line}")
 # Run the sync only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()