From 9a95212d1aef199df679f536772e7f61bdac1971 Mon Sep 17 00:00:00 2001 From: Hermes Date: Sat, 23 May 2026 07:00:09 +0000 Subject: [PATCH] gbrain: sync converted org-mode brain files --- scripts/org-to-gbrain.py | 398 ++++++++++++++++++++++++--------------- 1 file changed, 250 insertions(+), 148 deletions(-) diff --git a/scripts/org-to-gbrain.py b/scripts/org-to-gbrain.py index e013b23..c303b83 100644 --- a/scripts/org-to-gbrain.py +++ b/scripts/org-to-gbrain.py @@ -7,130 +7,6 @@ GBRAIN_SRC = "/mnt/hermes/brain" PANDOC = "/usr/bin/pandoc" BUN = os.path.expanduser("~/.bun/bin/gbrain") -def find_org_files(): - """Scan ideas/ recursively for all .org files, return (slug, rel_path, abs_path).""" - files = [] - base = f"{BRAIN}/ideas" - for root, dirs, filenames in os.walk(base): - for fn in filenames: - if not fn.endswith('.org'): - continue - abs_path = os.path.join(root, fn) - rel = os.path.relpath(abs_path, base) - # rel is like "compliance/hipaa.org" or "triad-overview.org" - name = fn[:-4] # remove .org - files.append((name, rel, abs_path)) - return files - -def gbrain_target(rel_path): - """Derive gbrain target path from org relative path. - - ideas/compliance/hipaa.org → concepts/compliance/hipaa.md - ideas/triad-overview.org → concepts/triad-overview.md (via routing dict) - ideas/competitive-analysis...→ ideas/competitive-analysis.md - """ - parts = rel_path.split('/') - - if len(parts) == 1: - # Flat file in ideas/ root — use ROUTING dict - slug = parts[0][:-4] if parts[0].endswith('.org') else parts[0][:-4] - category = ROUTING.get(slug, "concepts") - return f"{GBRAIN_SRC}/{category}/{slug}.md" - else: - # In a subdirectory: ideas/compliance/foo.org → concepts/compliance/foo.md - subdir = parts[0] - slug = parts[1][:-4] if parts[1].endswith('.org') else parts[1][:-4] - return f"{GBRAIN_SRC}/concepts/{subdir}/{slug}.md" - -def extract_org_properties(src_path): - """Extract :PROPERTIES: drawer and #+title/#+filetags from an org file.""" - props = {} - with open(src_path) as f: - content = f.read() - - # Extract title - m = re.search(r'^#\+title:\s+(.+)$', content, re.MULTILINE) - if m: - props['title'] = m.group(1).strip() - - # Extract tags - m = re.search(r'^#\+filetags:\s+(.+)$', content, re.MULTILINE) - if m: - tags = [t.strip(':') for t in m.group(1).split()] - props['tags'] = tags - - # Extract ID from PROPERTIES drawer - m = re.search(r':ID:\s+([^\s]+)', content) - if m: - props['org_id'] = m.group(1) - - # Extract CREATED - m = re.search(r':CREATED:\s+\[([^\]]+)\]', content) - if m: - props['created'] = m.group(1) - - return props - -def strip_org_header(src_path): - """Strip the Org-mode header block (PROPERTIES drawer + #+ directives) - before feeding to pandoc, so it doesn't produce raw {=org} blocks.""" - with open(src_path) as f: - lines = f.readlines() - - # Find first non-header line - in_properties = False - start = 0 - for i, line in enumerate(lines): - if line.strip() == ':PROPERTIES:': - in_properties = True - if in_properties and line.strip() == ':END:': - in_properties = False - start = i + 1 - continue - if not in_properties: - # Skip #+ lines - if line.startswith('#+'): - start = i + 1 - continue - # First real content - if line.strip(): - start = i - break - start = i + 1 - - return ''.join(lines[start:]) - -def pandoc_convert(clean_body): - """Convert org body to markdown via pandoc (stdin mode).""" - result = subprocess.run( - [PANDOC, "-f", "org", "-t", "markdown-smart"], - input=clean_body, capture_output=True, text=True - ) - if result.returncode != 0: - print(f" ERROR pandoc: {result.stderr[:200]}") - return None - return result.stdout.strip() - -def build_frontmatter(props): - """Build YAML frontmatter string from extracted properties.""" - lines = ['---'] - if 'title' in props: - lines.append(f'title: "{props["title"]}"') - if 'tags' in props: - tags_str = ', '.join(props['tags']) - lines.append(f'tags: [{tags_str}]') - if 'created' in props: - lines.append(f'created: {props["created"]}') - lines.append('---') - return '\n'.join(lines) - -def postprocess_links(md_text): - """Convert pandoc's markdown links to gbrain-friendly format.""" - # Pandoc converts [[file:foo.org][desc]] to [desc](foo.org) - # Strip .org extensions from relative links - md_text = re.sub(r'\(([a-zA-Z0-9_-]+)\.org\)', r'(\1)', md_text) - return md_text - ROUTING = { # Concepts — triad architecture, security, economics theory "triad-overview": "concepts", @@ -170,40 +46,236 @@ ROUTING = { "passepartout-economics": "ideas", } +def find_org_files(): + """Scan ideas/ recursively for all .org files, return (slug, rel_path, abs_path).""" + files = [] + base = f"{BRAIN}/ideas" + for root, dirs, filenames in os.walk(base): + for fn in filenames: + if not fn.endswith('.org'): + continue + abs_path = os.path.join(root, fn) + rel = os.path.relpath(abs_path, base) + name = fn[:-4] + files.append((name, rel, abs_path)) + return files + +def gbrain_target(rel_path): + """Derive gbrain target path from org relative path.""" + parts = rel_path.split('/') + if len(parts) == 1: + slug = parts[0][:-4] + category = ROUTING.get(slug, "concepts") + return f"{GBRAIN_SRC}/{category}/{slug}.md" + else: + subdir = parts[0] + slug = parts[1][:-4] + return f"{GBRAIN_SRC}/concepts/{subdir}/{slug}.md" + +def gbrain_slug(rel_path): + """Return the gbrain slug (e.g. 'concepts/time-estimates') for an org rel_path.""" + parts = rel_path.split('/') + if len(parts) == 1: + slug = parts[0][:-4] + category = ROUTING.get(slug, "concepts") + return f"{category}/{slug}" + else: + subdir = parts[0] + slug = parts[1][:-4] + return f"concepts/{subdir}/{slug}" + +def build_slug_map(): + """Build mapping: org slug (filename without .org) → gbrain slug.""" + mapping = {} + for slug, rel_path, abs_path in find_org_files(): + mapping[slug] = gbrain_slug(rel_path) + return mapping + +def extract_org_links_and_body(src_path): + """Read the full org file, extract PROPERTIES + #+ directives, and + return (props, clean_body) where clean_body has header stripped.""" + with open(src_path) as f: + content = f.read() + + props = {} + + # Extract title + m = re.search(r'^#\+title:\s+(.+)$', content, re.MULTILINE) + if m: + props['title'] = m.group(1).strip() + + # Extract tags + m = re.search(r'^#\+filetags:\s+(.+)$', content, re.MULTILINE) + if m: + tags = [t.strip(':') for t in m.group(1).split()] + props['tags'] = tags + + # Extract ID from PROPERTIES drawer + m = re.search(r':ID:\s+([^\s]+)', content) + if m: + props['org_id'] = m.group(1) + + # Extract CREATED + m = re.search(r':CREATED:\s+\[([^\]]+)\]', content) + if m: + created_raw = m.group(1) # e.g. "2026-05-23 Sat" + # Extract just the date portion + date_m = re.match(r'(\d{4}-\d{2}-\d{2})', created_raw) + if date_m: + props['created'] = date_m.group(1) + + # Strip header for body + lines = content.split('\n') + in_properties = False + start = 0 + for i, line in enumerate(lines): + if line.strip() == ':PROPERTIES:': + in_properties = True + if in_properties and line.strip() == ':END:': + in_properties = False + start = i + 1 + continue + if not in_properties: + if line.startswith('#+'): + start = i + 1 + continue + if line.strip(): + start = i + break + start = i + 1 + + body = '\n'.join(lines[start:]) + return props, body + +def resolve_org_link(match, slug_map): + """Replace [[file:target.org][desc]] with [[file:gbrain_path/target.org][desc]] + when target is a known org slug. Preserves original target if unknown.""" + full = match.group(0) + target = match.group(1) + desc = match.group(2) if match.lastindex >= 2 else target + + if target in slug_map: + gbrain_path = slug_map[target] + return f"[[file:{gbrain_path}.org][{desc}]]" + return full + +def convert_body(body_text, slug_map): + """Pre-process org body to inject gbrain path prefixes into cross-references, + then convert to markdown via pandoc. Returns (md_body, link_refs) where + link_refs is a list of {slug, type} dicts.""" + link_refs = [] + + # Find all [[file:X.org][desc]] cross-references and collect them + org_link_re = re.compile(r'\[\[file:([^\]]+?)\.org\]\[([^\]]*?)\]\]') + for m in org_link_re.finditer(body_text): + target = m.group(1) + if target in slug_map: + link_refs.append({ + "slug": slug_map[target], + "type": "references", + "name": slug_map[target], + }) + + # Inject directory prefixes into org links so pandoc produces proper paths + def replace_link(m): + target = m.group(1) + desc = m.group(2) + if target in slug_map: + return f"[[file:{slug_map[target]}.org][{desc}]]" + return m.group(0) + + processed_body = org_link_re.sub(replace_link, body_text) + + # Convert to markdown + result = subprocess.run( + [PANDOC, "-f", "org", "-t", "markdown-smart"], + input=processed_body, capture_output=True, text=True + ) + if result.returncode != 0: + print(f" ERROR pandoc: {result.stderr[:200]}") + return None, [] + + md = result.stdout.strip() + + # Pandoc converts [[file:concepts/foo.org][desc]] to [desc](concepts/foo.org) + # Strip .org extensions + md = re.sub(r'\(([a-zA-Z0-9_/-]+)\.org\)', r'(\1)', md) + + return md, link_refs + +def build_frontmatter(props, link_refs=None): + """Build YAML frontmatter string from properties and link references.""" + lines = ['---'] + if 'title' in props: + lines.append(f'title: "{props["title"]}"') + if 'tags' in props: + tags_str = ', '.join(props['tags']) + lines.append(f'tags: [{tags_str}]') + if 'created' in props: + lines.append(f'created: {props["created"]}') + if link_refs: + for lr in link_refs: + # Deduplicate by slug + pass + # Deduplicate + seen = set() + unique_links = [] + for lr in link_refs: + k = lr['slug'] + if k not in seen: + seen.add(k) + unique_links.append(lr) + if unique_links: + lines.append('links:') + for lr in unique_links: + lines.append(f' - slug: "{lr["slug"]}"') + lines.append(f' type: "{lr["type"]}"') + lines.append('---') + return '\n'.join(lines) + +def add_timeline_entry(md_body, props): + """If the page has a CREATED date, prepend a timeline bullet.""" + if 'created' in props and 'title' in props: + date = props['created'] + title = props['title'] + line = f"- **{date}** | Created — {title}\n\n" + return line + md_body + return md_body + def main(): + # Pre-build slug map for all org files + slug_map = build_slug_map() imported = [] - + for slug, rel_path, src_path in find_org_files(): dst_path = gbrain_target(rel_path) - - # Create parent directories os.makedirs(os.path.dirname(dst_path), exist_ok=True) - - # Extract frontmatter from org properties - props = extract_org_properties(src_path) - - # Strip org header and convert body to markdown - clean = strip_org_header(src_path) - md = pandoc_convert(clean) + + # Extract properties and body from org file + props, org_body = extract_org_links_and_body(src_path) + + # Convert body to markdown, collecting links along the way + md, link_refs = convert_body(org_body, slug_map) if md is None: continue - - md = postprocess_links(md) - - # Assemble: YAML frontmatter + markdown body - frontmatter = build_frontmatter(props) + + # Build frontmatter with links + frontmatter = build_frontmatter(props, link_refs) + + # Add timeline entry if date exists + md = add_timeline_entry(md, props) + full = frontmatter + '\n\n' + md + '\n' - + with open(dst_path, 'w') as f: f.write(full) - - # Show relative path for clarity + rel_dst = os.path.relpath(dst_path, GBRAIN_SRC) imported.append(rel_dst) print(f" OK {rel_dst}") - + print(f"\nConverted {len(imported)} files.") - + # Commit to git subprocess.run(["git", "-C", GBRAIN_SRC, "add", "-A"], capture_output=True) subprocess.run( @@ -211,7 +283,7 @@ def main(): "-m", "gbrain: sync converted org-mode brain files"], capture_output=True, text=True ) - + # Import into gbrain print("\nImporting into gbrain...") env = {**os.environ, "PATH": f"{os.path.expanduser('~')}/.bun/bin:{os.environ['PATH']}"} @@ -219,16 +291,15 @@ def main(): [BUN, "import", GBRAIN_SRC], capture_output=True, text=True, env=env ) - # Show last 20 lines of stdout (skip noise) out_lines = result.stdout.strip().split('\n') for line in out_lines[-25:]: if line.strip() and 'batch caps' not in line and 'max_batch_tokens' not in line: print(f" {line}") - + if result.returncode != 0: print(f" gbrain import exit code: {result.returncode}") return - + # Embed print("\nGenerating embeddings...") result2 = subprocess.run( @@ -239,5 +310,36 @@ def main(): if line.strip(): print(f" {line}") + # Extract links from frontmatter (now that pages are imported with links:) + print("\nExtracting links from frontmatter...") + result3 = subprocess.run( + [BUN, "extract", "links", "--source", "db", "--include-frontmatter", + "--dir", GBRAIN_SRC], + capture_output=True, text=True, env=env + ) + for line in result3.stdout.strip().split('\n')[-10:]: + if line.strip(): + print(f" {line}") + + # Extract timeline from body + print("\nExtracting timeline...") + result4 = subprocess.run( + [BUN, "extract", "timeline", "--source", "db", "--dir", GBRAIN_SRC], + capture_output=True, text=True, env=env + ) + for line in result4.stdout.strip().split('\n')[-10:]: + if line.strip(): + print(f" {line}") + + # Stats + print("\nBrain stats:") + result5 = subprocess.run( + [BUN, "stats"], + capture_output=True, text=True, env=env + ) + for line in result5.stdout.strip().split('\n')[-15:]: + if line.strip(): + print(f" {line}") + if __name__ == "__main__": main()