lots of fixes

2026-04-14 04:56:45 -05:00
parent ddd240d865
commit dffb3f3c85
4 changed files with 247 additions and 161 deletions
@@ -0,0 +1,26 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Overview
+
+Single-script tool that walks a hardcoded Obsidian notes folder and fills empty frontmatter fields (`tags`, `slug`, `seo-title`, `seo-description`, `seo-keywords`) by calling a local LM Studio LLM. Only empty fields are touched; everything else in frontmatter is preserved.
+
+## Run
+
+```bash
+pip install pyyaml ruamel.yaml requests
+./tag-notes.py
+```
+
+Requires LM Studio running at `LM_STUDIO_URL` with `MODEL_NAME` loaded. There are no tests, linter, or build step.
+
+Sanity-check the LLM endpoint with: `curl http://localhost:1234/v1/models`
+
+## Architecture
+
+- `tag-notes.py` — all logic. Top-of-file constants (`LM_STUDIO_URL`, `MODEL_NAME`, `NOTES_FOLDER`, `TAXONOMY_FILE`) are the configuration surface — edit them in place.
+- `tag-taxonomy.yaml` — `tags:` list injected into the LLM system prompt. Add tags here when the LLM's "new_tag_suggestions" are worth keeping.
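+- Flow per note: `extract_frontmatter` (regex + round-trip `ruamel.yaml` load) → decide which fields are empty → slug is derived locally from the filename via `slugify` → if any LLM-backed field is empty, send `body[:CONTENT_CHAR_LIMIT]` to `request_metadata`, which calls `call_llm_json` and expects a strict JSON response (`parse_json_response` falls back to parsing the span from the first `{` to the last `}` when the raw reply is not valid JSON) → `reconstruct_markdown` re-serializes the frontmatter with the round-trip dumper, preserving field order and formatting, and the file is written back.
+- The 20000-char cap is the `CONTENT_CHAR_LIMIT` constant at the top of `tag-notes.py`; it is applied where `process_note` slices the note body before calling the LLM.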
+- Notes without `---` frontmatter are skipped, not created.
@@ -2,24 +2,21 @@

 Automatically tag your Obsidian notes, add slugs, and add SEO metadata using a local LLM.

-The script requires 5 frontmatter fields named as such:
+The script requires these frontmatter fields:
+- title (used as the basis for the SEO title; if it is missing or empty, the filename without its extension is used instead)
 - tags
 - seo-title
 - seo-description
 - seo-keywords
 - slug

-**Note:** The script sends only the first 20,000 characters of the note to the LLM. This should be more than sufficient for most notes, however if you have extremely long notes then you can increase this amount by editing the following line in `tag-notes.py`:
-
-```py
-llm_response = call_llm(None, body[:20000], taxonomy)  # Limit content to first 20000 chars
-```  
+**Note:** The script sends only the first 20,000 characters of the note to the LLM. This should be more than sufficient for most notes, however if you have extremely long notes then you can increase this by editing the `CONTENT_CHAR_LIMIT` constant in `tag-notes.py`.

 ## Setup

 1. **Install dependencies:**
   ```bash
-   pip install pyyaml requests
+   pip install pyyaml ruamel.yaml requests
   ```
 2. **Make sure LM Studio is running** at the address configured in `LM_STUDIO_URL` (in `tag-notes.py`) with the preferred model loaded
 3. **Place these files in a directory:**
@@ -39,24 +36,25 @@ Simply run the script:
 ./tag-notes.py
 ```

-It will automatically process all markdown files in the 'NOTES_FOLDER'.
+It will recursively process all markdown files under `NOTES_FOLDER`.

 ## What it does

 The script will:
 - ✓ Add tags (up to 5 in total), preferring tags from your taxonomy and including at most 2 new suggestions
- ✓ Add SEO title (clean, non-clickbaity)
- ✓ Add SEO description (150-160 chars, factual)
+- ✓ Add SEO title in the format `{title}: {short descriptor}`, built from the note's `title:` frontmatter field
+- ✓ Add SEO description (strictly 150-160 chars, factual — re-asks the LLM once if the first response is out of range)
 - ✓ Add SEO keywords (generous, 10-15 keywords)
- ✓ Add slug based off of filename
+- ✓ Add slug derived from the filename (lowercased, punctuation stripped, spaces/hyphens collapsed)
 - ✓ **Only updates empty fields** - preserves existing values
 - ✓ Updates files directly (no confirmation needed)
- ✓ **Only touches**: `tags`, `seo-title`, `seo-description`, `seo-keywords`
- ✓ **Preserves everything else** in your frontmatter
+- ✓ **Preserves existing frontmatter formatting** via round-trip YAML
+
+At the end of a run, if the LLM proposed any genuinely new tags, the script lists them and prompts you to append them to `tag-taxonomy.yaml` automatically.

 ## Managing the taxonomy

-Edit `tag-taxonomy.yaml` to add new tags that the LLM suggests and you like.
+At the end of each run, the script will prompt you to add any newly suggested tags to `tag-taxonomy.yaml` automatically. You can also edit the file by hand at any time.

 The LLM will:
 - Prefer existing taxonomy tags
@@ -6,251 +6,313 @@ Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata

 import os
 import sys
-import yaml
-import requests
+import io
 import json
-from pathlib import Path
 import re
+from pathlib import Path
+
+import requests
+import yaml
+from ruamel.yaml import YAML

 # Configuration
-LM_STUDIO_URL = "http://192.168.68.84:1234/v1/chat/completions"
+LM_STUDIO_URL = "http://localhost:1234/v1/chat/completions"
 MODEL_NAME = "openai/gpt-oss-20b"
 TAXONOMY_FILE = "tag-taxonomy.yaml"
 NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")
+CONTENT_CHAR_LIMIT = 20000
+SEO_DESC_MIN = 150
+SEO_DESC_MAX = 160
+
+# Round-trip YAML preserves existing frontmatter formatting
+yaml_rt = YAML()
+yaml_rt.preserve_quotes = True
+yaml_rt.width = 4096
+

 def load_taxonomy(taxonomy_path):
-    """Load the tag taxonomy from YAML file"""
    with open(taxonomy_path, 'r') as f:
-        data = yaml.safe_load(f)
-    return data.get('tags', [])
+        data = yaml.safe_load(f) or {}
+    return data.get('tags', []) or []
+
+
+def append_tags_to_taxonomy(taxonomy_path, new_tags):
+    with open(taxonomy_path, 'r') as f:
+        data = yaml.safe_load(f) or {}
+    existing = data.get('tags', []) or []
+    combined = list(dict.fromkeys(existing + list(new_tags)))
+    data['tags'] = combined
+    with open(taxonomy_path, 'w') as f:
+        yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
+

 def extract_frontmatter(content):
-    """Extract frontmatter and content from markdown"""
-    # Match YAML frontmatter between --- delimiters
    pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$'
    match = re.match(pattern, content, re.DOTALL)
-    
-    if match:
-        frontmatter_str = match.group(1)
-        body = match.group(2)
-        frontmatter = yaml.safe_load(frontmatter_str)
-        return frontmatter, body, frontmatter_str
-    
-    return None, content, None
+    if not match:
+        return None, content
+    frontmatter = yaml_rt.load(match.group(1))
+    return frontmatter, match.group(2)
+

 def reconstruct_markdown(frontmatter, body):
-    """Reconstruct markdown with updated frontmatter"""
-    # Convert frontmatter to YAML string
-    frontmatter_str = yaml.dump(frontmatter, default_flow_style=False, allow_unicode=True, sort_keys=False)
-    return f"---\n{frontmatter_str}---\n{body}"
+    stream = io.StringIO()
+    yaml_rt.dump(frontmatter, stream)
+    fm_str = stream.getvalue()
+    if not fm_str.endswith('\n'):
+        fm_str += '\n'
+    return f"---\n{fm_str}---\n{body}"

-def call_llm(prompt, note_content, taxonomy):
-    """Call LM Studio API to get tags and SEO metadata"""
-    taxonomy_str = ", ".join(taxonomy)
-    
-    system_prompt = f"""You are a helpful assistant that analyzes markdown notes and provides:
-1. Tags from existing taxonomy (1-5 tags, prefer existing)
-2. 1-2 NEW tag suggestions if content warrants it
-3. Clean, concise SEO title (not clickbaity)
-4. Clean, concise SEO description (150-160 chars, factual summary)
-5. SEO keywords (be generous, 10-15 relevant keywords)

-Existing tag taxonomy: {taxonomy_str}
+def slugify(text):
+    text = text.lower()
+    text = re.sub(r"[’'`]", '', text)
+    text = re.sub(r'[^\w\s-]', ' ', text)
+    text = re.sub(r'[-\s]+', '-', text).strip('-')
+    return text

-Return ONLY valid JSON in this exact format:
-{{
-  "tags_from_taxonomy": ["tag1", "tag2"],
-  "new_tag_suggestions": ["newtag1"],
-  "seo_title": "Clear Title Here",
-  "seo_description": "Concise factual summary of the note content.",
-  "seo_keywords": ["keyword1", "keyword2", "keyword3"]
-}}"""

-    user_prompt = f"""Analyze this note and provide tags and SEO metadata:
+def parse_json_response(content):
+    if content is None:
+        return None
+    try:
+        return json.loads(content)
+    except json.JSONDecodeError:
+        pass
+    start = content.find('{')
+    end = content.rfind('}')
+    if start != -1 and end > start:
+        try:
+            return json.loads(content[start:end + 1])
+        except json.JSONDecodeError:
+            pass
+    return None

-{note_content}
-
-Remember:
- Use 1-5 tags from taxonomy that fit best
- Suggest 1-2 NEW tags only if content really warrants it (be conservative)
- SEO title should be clear and informative, NOT clickbaity
- SEO description should be a clean factual summary (150-160 characters)
- SEO keywords can be generous (10-15 keywords)"""

+def call_llm_json(system_prompt, user_prompt, max_tokens=900):
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt}
+            {"role": "user", "content": user_prompt},
        ],
-        "temperature": 0.7,
-        "max_tokens": 500
+        "temperature": 0.2,
+        "max_tokens": max_tokens,
+        "response_format": {"type": "json_object"},
    }
-    
    try:
-        response = requests.post(LM_STUDIO_URL, json=payload, timeout=60)
+        response = requests.post(LM_STUDIO_URL, json=payload, timeout=120)
        response.raise_for_status()
        result = response.json()
-        
-        # Extract the response content
        content = result['choices'][0]['message']['content']
-        
-        # Try to parse JSON from the response
-        # Sometimes LLMs wrap JSON in markdown code blocks
-        json_match = re.search(r'```json\s*(\{.*?\})\s*```', content, re.DOTALL)
-        if json_match:
-            content = json_match.group(1)
-        
-        return json.loads(content)
-        
+        return parse_json_response(content)
    except Exception as e:
-        print(f"Error calling LLM: {e}")
+        print(f"  ! LLM error: {e}")
        return None

-def process_note(file_path, taxonomy):
-    """Process a single note file"""
+
+def request_metadata(title, note_content, taxonomy):
+    taxonomy_str = ", ".join(taxonomy)
+    system_prompt = f"""You analyze markdown notes and return structured metadata.
+
+Return ONLY valid JSON in this exact shape:
+{{
+  "tags_from_taxonomy": ["tag1", "tag2"],
+  "new_tag_suggestions": ["newtag1"],
+  "seo_title_suffix": "Short descriptor that will follow the note title",
+  "seo_description": "Factual summary between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters.",
+  "seo_keywords": ["keyword1", "keyword2"]
+}}
+
+Rules:
+- tags_from_taxonomy: 1-5 tags drawn from the existing taxonomy that best fit the content.
+- new_tag_suggestions: 0-2 NEW tags, only when content truly warrants it (be conservative).
+- seo_title_suffix: a short, clean, non-clickbaity descriptor of the note. Do NOT include the note title or a leading colon — only the text that would follow "<title>: ". Aim for 4-10 words.
+- seo_description: a clean factual summary, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding.
+- seo_keywords: 10-15 relevant keywords, no duplicates.
+
+Existing tag taxonomy: {taxonomy_str}"""
+
+    user_prompt = f"""Note title: {title}
+
+Note content:
+{note_content}
+
+Produce the JSON described in the system prompt."""
+    return call_llm_json(system_prompt, user_prompt)
+
+
+def request_description_retry(title, note_content, previous_desc):
+    system_prompt = f"""You rewrite SEO descriptions to a strict length.
+
+Return ONLY valid JSON of the form:
+{{"seo_description": "..."}}
+
+The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding."""
+    user_prompt = f"""Note title: {title}
+
+Note content:
+{note_content}
+
+Your previous description was {len(previous_desc)} characters, outside the allowed {SEO_DESC_MIN}-{SEO_DESC_MAX} range:
+"{previous_desc}"
+
+Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters."""
+    result = call_llm_json(system_prompt, user_prompt, max_tokens=400)
+    if result:
+        return (result.get('seo_description') or '').strip()
+    return ''
+
+
+def process_note(file_path, taxonomy, new_tag_accumulator):
    print(f"Processing: {file_path}")
-    
-    # Read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
-    
-    # Extract frontmatter and body
-    frontmatter, body, original_fm_str = extract_frontmatter(content)
-    
+
+    frontmatter, body = extract_frontmatter(content)
    if frontmatter is None:
-        print(f"  ⚠️  No frontmatter found, skipping")
+        print("  ⚠️  No frontmatter found, skipping")
        return
-    
-    # Check what needs to be filled in
-    needs_update = False
-    existing_tags = frontmatter.get('tags', [])
-    if not existing_tags or existing_tags == [None]:
+
+    existing_tags = frontmatter.get('tags', []) or []
+    if existing_tags == [None]:
        existing_tags = []
-    
+
    needs_tags = not existing_tags
    needs_slug = not frontmatter.get('slug')
    needs_seo_title = not frontmatter.get('seo-title')
    needs_seo_desc = not frontmatter.get('seo-description')
    needs_seo_keywords = not frontmatter.get('seo-keywords')
-    
+
    if not (needs_tags or needs_slug or needs_seo_title or needs_seo_desc or needs_seo_keywords):
-        print(f"  ✓ All fields already populated, skipping")
+        print("  ✓ All fields already populated, skipping")
        return
-    
+
+    title = frontmatter.get('title') or Path(file_path).stem
    updated = False
-    
-    # Generate slug from filename if needed
+
    if needs_slug:
-        # Get filename without extension
-        filename = Path(file_path).stem
-        # Convert to lowercase and replace spaces with hyphens
-        slug = filename.lower().replace(' ', '-')
+        slug = slugify(Path(file_path).stem)
        frontmatter['slug'] = slug
        print(f"  + Added slug: {slug}")
        updated = True
-    
-    # Only call LLM if we need tags or SEO fields
+
    if not (needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords):
-        # Only needed slug, we're done
-        new_content = reconstruct_markdown(frontmatter, body)
        with open(file_path, 'w', encoding='utf-8') as f:
-            f.write(new_content)
-        print(f"  ✓ Updated successfully")
+            f.write(reconstruct_markdown(frontmatter, body))
+        print("  ✓ Updated successfully")
        return
-    
-    # Call LLM
-    llm_response = call_llm(None, body[:20000], taxonomy)  # Limit content to first 20000 chars
-    
+
+    llm_response = request_metadata(title, body[:CONTENT_CHAR_LIMIT], taxonomy)
    if not llm_response:
-        print(f"  ✗ Failed to get LLM response")
+        print("  ✗ Failed to get LLM response")
        return
-    
-    # Update frontmatter with new values (only if empty)
-    
+
    if needs_tags:
-        # Combine taxonomy tags and new suggestions
-        all_tags = llm_response.get('tags_from_taxonomy', [])
-        new_suggestions = llm_response.get('new_tag_suggestions', [])
-        all_tags.extend(new_suggestions)
-        
-        # Limit to 5 tags total
-        all_tags = all_tags[:5]
-        
-        if all_tags:
-            frontmatter['tags'] = all_tags
+        taxonomy_tags = llm_response.get('tags_from_taxonomy') or []
+        new_suggestions = llm_response.get('new_tag_suggestions') or []
+        combined = list(dict.fromkeys(list(taxonomy_tags) + list(new_suggestions)))[:5]
+        if combined:
+            frontmatter['tags'] = combined
            updated = True
-            print(f"  + Added tags: {', '.join(all_tags)}")
-            if new_suggestions:
-                print(f"    (New suggestions: {', '.join(new_suggestions)})")
-    
+            print(f"  + Added tags: {', '.join(combined)}")
+            genuinely_new = [t for t in combined if t not in taxonomy and t in new_suggestions]
+            if genuinely_new:
+                print(f"    (New suggestions: {', '.join(genuinely_new)})")
+                new_tag_accumulator.update(genuinely_new)
+
    if needs_seo_title:
-        seo_title = llm_response.get('seo_title', '')
-        if seo_title:
+        suffix = (llm_response.get('seo_title_suffix') or '').strip()
+        suffix = suffix.lstrip(':').strip()
+        # Strip a leading repeat of the title if the LLM included it anyway
+        if suffix.lower().startswith(title.lower()):
+            suffix = suffix[len(title):].lstrip(':').strip()
+        if suffix:
+            seo_title = f"{title}: {suffix}"
            frontmatter['seo-title'] = seo_title
            updated = True
            print(f"  + Added SEO title: {seo_title}")
-    
+
    if needs_seo_desc:
-        seo_desc = llm_response.get('seo_description', '')
+        seo_desc = (llm_response.get('seo_description') or '').strip()
+        if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX):
+            print(f"  ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking")
+            retry = request_description_retry(title, body[:CONTENT_CHAR_LIMIT], seo_desc)
+            if retry and SEO_DESC_MIN <= len(retry) <= SEO_DESC_MAX:
+                seo_desc = retry
+            elif retry:
+                print(f"  ! Retry still {len(retry)} chars; using original")
+            else:
+                print("  ! Retry failed; using original")
        if seo_desc:
            frontmatter['seo-description'] = seo_desc
            updated = True
-            print(f"  + Added SEO description")
-    
+            print(f"  + Added SEO description ({len(seo_desc)} chars)")
+
    if needs_seo_keywords:
-        seo_keywords = llm_response.get('seo_keywords', [])
+        seo_keywords = list(dict.fromkeys(llm_response.get('seo_keywords') or []))
        if seo_keywords:
            frontmatter['seo-keywords'] = seo_keywords
            updated = True
            print(f"  + Added {len(seo_keywords)} SEO keywords")
-    
+
    if updated:
-        # Write back to file
-        new_content = reconstruct_markdown(frontmatter, body)
        with open(file_path, 'w', encoding='utf-8') as f:
-            f.write(new_content)
-        print(f"  ✓ Updated successfully")
+            f.write(reconstruct_markdown(frontmatter, body))
+        print("  ✓ Updated successfully")
    else:
-        print(f"  - No updates needed")
+        print("  - No updates needed")
+

 def main():
-    # Load taxonomy
    taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
    if not taxonomy_path.exists():
        print(f"Error: Taxonomy file not found at {taxonomy_path}")
        print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
        sys.exit(1)
-    
+
    taxonomy = load_taxonomy(taxonomy_path)
    print(f"Loaded {len(taxonomy)} tags from taxonomy\n")
-    
-    # Use the hardcoded notes folder
+
    target_path = Path(NOTES_FOLDER)
-    
    if not target_path.exists():
        print(f"Error: Notes folder not found: {target_path}")
        sys.exit(1)
-    
    if not target_path.is_dir():
        print(f"Error: {target_path} is not a directory")
        sys.exit(1)
-    
-    # Process all markdown files in the directory
-    md_files = list(target_path.glob('*.md'))
-    
+
+    md_files = sorted(target_path.rglob('*.md'))
    if not md_files:
        print(f"No markdown files found in {target_path}")
        sys.exit(0)
-    
-    print(f"Processing all markdown files in: {target_path}")
+
+    print(f"Processing all markdown files under: {target_path}")
    print(f"Found {len(md_files)} markdown files\n")
-    
+
+    new_tag_accumulator = set()
    for md_file in md_files:
-        process_note(md_file, taxonomy)
-        print()  # Blank line between files
-    
+        try:
+            process_note(md_file, taxonomy, new_tag_accumulator)
+        except Exception as e:
+            print(f"  ✗ Error processing {md_file}: {e}")
+        print()
+
    print("\n✓ Processing complete!")

+    fresh = sorted(t for t in new_tag_accumulator if t not in taxonomy)
+    if fresh:
+        print(f"\nNew tags suggested during this run: {', '.join(fresh)}")
+        try:
+            answer = input("Add these to the taxonomy? [y/N]: ").strip().lower()
+        except EOFError:
+            answer = ''
+        if answer == 'y':
+            append_tags_to_taxonomy(taxonomy_path, fresh)
+            print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")
+        else:
+            print("Skipped taxonomy update.")
+
+
 if __name__ == "__main__":
    main()