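lots of fixes: round-trip YAML frontmatter, sturdier LLM JSON parsing, SEO description length retry, recursive note scan, optional taxonomy update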

2026-04-14 04:56:45 -05:00
parent ddd240d865
commit dffb3f3c85
4 changed files with 247 additions and 161 deletions
@@ -6,251 +6,313 @@ Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata

 import os
 import sys
-import yaml
-import requests
+import io
 import json
-from pathlib import Path
 import re
+from pathlib import Path
+
+import requests
+import yaml
+from ruamel.yaml import YAML

 # Configuration
+# Chat-completions endpoint that call_llm_json() posts to.
-LM_STUDIO_URL = "http://192.168.68.84:1234/v1/chat/completions"
+LM_STUDIO_URL = "http://localhost:1234/v1/chat/completions"
 MODEL_NAME = "openai/gpt-oss-20b"
+# File name only; main() looks for it in the directory that holds this script.
 TAXONOMY_FILE = "tag-taxonomy.yaml"
+# Folder that main() scans for markdown notes.
 NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")
+# Only the first CONTENT_CHAR_LIMIT characters of a note body are sent to the LLM.
+CONTENT_CHAR_LIMIT = 20000
+# Inclusive character bounds for generated SEO descriptions; used both in the
+# prompts and in the length check that triggers a retry.
+SEO_DESC_MIN = 150
+SEO_DESC_MAX = 160
+
+# Round-trip YAML preserves existing frontmatter formatting
+yaml_rt = YAML()
+yaml_rt.preserve_quotes = True
+# NOTE(review): very large width, presumably so long values are not re-wrapped
+# when the frontmatter is dumped -- confirm against the ruamel.yaml docs.
+yaml_rt.width = 4096
+

 def load_taxonomy(taxonomy_path):
-    """Load the tag taxonomy from YAML file"""
-    with open(taxonomy_path, 'r') as f:
-        data = yaml.safe_load(f)
-    return data.get('tags', [])
+    """Load the list of known tags from the taxonomy YAML file.
+
+    Args:
+        taxonomy_path: Path to a YAML document with a top-level ``tags`` list.
+
+    Returns:
+        The ``tags`` list, or an empty list when the document is empty or
+        the ``tags`` key is missing or null.
+    """
+    # Read as UTF-8 explicitly, matching how the notes themselves are opened,
+    # so non-ASCII tags do not depend on the platform's default encoding.
+    with open(taxonomy_path, 'r', encoding='utf-8') as f:
+        data = yaml.safe_load(f) or {}
+    return data.get('tags', []) or []
+
+
+def append_tags_to_taxonomy(taxonomy_path, new_tags):
+    """Merge ``new_tags`` into the taxonomy file's ``tags`` list and rewrite the file.
+
+    Existing tags keep their order; tags already present are not duplicated.
+
+    Args:
+        taxonomy_path: Path to the taxonomy YAML file (read, then overwritten).
+        new_tags: Iterable of tag names to append.
+    """
+    # NOTE(review): the file is re-serialised by yaml.dump, so any comments or
+    # custom formatting in the taxonomy file are presumably lost -- confirm.
+    with open(taxonomy_path, 'r', encoding='utf-8') as f:
+        data = yaml.safe_load(f) or {}
+    existing = data.get('tags', []) or []
+    # dict.fromkeys de-duplicates while keeping first-seen order.
+    combined = list(dict.fromkeys(existing + list(new_tags)))
+    data['tags'] = combined
+    # allow_unicode=True writes non-ASCII characters verbatim, so the file must
+    # be opened as UTF-8 instead of relying on the locale's default encoding.
+    with open(taxonomy_path, 'w', encoding='utf-8') as f:
+        yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
+

 def extract_frontmatter(content):
-    """Extract frontmatter and content from markdown"""
-    # Match YAML frontmatter between --- delimiters
-    pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$'
+    """Split a markdown document into parsed frontmatter and body text.
+
+    Args:
+        content: Full text of the markdown file.
+
+    Returns:
+        ``(frontmatter, body)``. ``frontmatter`` is whatever the round-trip
+        YAML loader produces for the text between the ``---`` delimiters;
+        when the document does not start with a frontmatter section the
+        result is ``(None, content)``.
+    """
+    # After the closing delimiter only spaces/tabs are allowed before the
+    # newline. ``\s*`` would also match newlines and silently swallow the blank
+    # lines that separate the frontmatter from the body, so they would be
+    # missing once the note is written back.
+    pattern = r'^---\s*\n(.*?)\n---[ \t]*\n(.*)$'
     match = re.match(pattern, content, re.DOTALL)
-    
-    if match:
-        frontmatter_str = match.group(1)
-        body = match.group(2)
-        frontmatter = yaml.safe_load(frontmatter_str)
-        return frontmatter, body, frontmatter_str
-    
-    return None, content, None
+    if not match:
+        return None, content
+    frontmatter = yaml_rt.load(match.group(1))
+    return frontmatter, match.group(2)
+

 def reconstruct_markdown(frontmatter, body):
-    """Reconstruct markdown with updated frontmatter"""
-    # Convert frontmatter to YAML string
-    frontmatter_str = yaml.dump(frontmatter, default_flow_style=False, allow_unicode=True, sort_keys=False)
-    return f"---\n{frontmatter_str}---\n{body}"
+    """Return the markdown text for ``frontmatter`` followed by ``body``.
+
+    The frontmatter is dumped with the round-trip YAML instance and wrapped
+    in ``---`` delimiter lines; ``body`` is appended unchanged.
+    """
+    with io.StringIO() as buffer:
+        yaml_rt.dump(frontmatter, buffer)
+        fm_str = buffer.getvalue()
+    # The closing delimiter has to start on a line of its own.
+    fm_str = fm_str if fm_str.endswith('\n') else fm_str + '\n'
+    return f"---\n{fm_str}---\n{body}"

-def call_llm(prompt, note_content, taxonomy):
-    """Call LM Studio API to get tags and SEO metadata"""
-    taxonomy_str = ", ".join(taxonomy)
-    
-    system_prompt = f"""You are a helpful assistant that analyzes markdown notes and provides:
-1. Tags from existing taxonomy (1-5 tags, prefer existing)
-2. 1-2 NEW tag suggestions if content warrants it
-3. Clean, concise SEO title (not clickbaity)
-4. Clean, concise SEO description (150-160 chars, factual summary)
-5. SEO keywords (be generous, 10-15 relevant keywords)

-Existing tag taxonomy: {taxonomy_str}
+def slugify(text):
+    """Turn ``text`` into a lowercase, hyphen-separated slug.
+
+    Apostrophes and backticks are removed, every other character that is not
+    a word character, whitespace or hyphen becomes a separator, and runs of
+    separators collapse to a single ``-`` with none left at either end.
+    """
+    slug = text.lower()
+    # Applied in order: drop quote marks, blank out punctuation, join with '-'.
+    for pattern, replacement in (
+        (r"[’'`]", ''),
+        (r'[^\w\s-]', ' '),
+        (r'[-\s]+', '-'),
+    ):
+        slug = re.sub(pattern, replacement, slug)
+    return slug.strip('-')

-Return ONLY valid JSON in this exact format:
-{{
-  "tags_from_taxonomy": ["tag1", "tag2"],
-  "new_tag_suggestions": ["newtag1"],
-  "seo_title": "Clear Title Here",
-  "seo_description": "Concise factual summary of the note content.",
-  "seo_keywords": ["keyword1", "keyword2", "keyword3"]
-}}"""

-    user_prompt = f"""Analyze this note and provide tags and SEO metadata:
+def parse_json_response(content):
+    """Decode JSON from an LLM reply, tolerating text around the JSON object.
+
+    The whole reply is tried first; if that fails, the span from the first
+    ``{`` to the last ``}`` is tried.
+
+    Returns:
+        The decoded value, or ``None`` when ``content`` is ``None`` or no
+        attempt yields valid JSON.
+    """
+    if content is None:
+        return None
+
+    def candidates():
+        # Lazy on purpose: the brace search only runs if the full text fails.
+        yield content
+        opening = content.find('{')
+        closing = content.rfind('}')
+        if opening != -1 and closing > opening:
+            yield content[opening:closing + 1]
+
+    for candidate in candidates():
+        try:
+            return json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+    return None

-{note_content}
-
-Remember:
- Use 1-5 tags from taxonomy that fit best
- Suggest 1-2 NEW tags only if content really warrants it (be conservative)
- SEO title should be clear and informative, NOT clickbaity
- SEO description should be a clean factual summary (150-160 characters)
- SEO keywords can be generous (10-15 keywords)"""

+def call_llm_json(system_prompt, user_prompt, max_tokens=900):
+    """Send one chat-completion request and return the reply as a JSON object.
+
+    Args:
+        system_prompt: Text of the system message.
+        user_prompt: Text of the user message.
+        max_tokens: Value for the request's ``max_tokens`` field.
+
+    Returns:
+        The decoded reply as a dict, or ``None`` when the request fails, the
+        reply cannot be decoded, or the decoded JSON is not an object.
+    """
     payload = {
         "model": MODEL_NAME,
         "messages": [
             {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt}
+            {"role": "user", "content": user_prompt},
         ],
-        "temperature": 0.7,
-        "max_tokens": 500
+        "temperature": 0.2,
+        "max_tokens": max_tokens,
+        "response_format": {"type": "json_object"},
     }
-    
     try:
-        response = requests.post(LM_STUDIO_URL, json=payload, timeout=60)
+        response = requests.post(LM_STUDIO_URL, json=payload, timeout=120)
         response.raise_for_status()
         result = response.json()
-        
-        # Extract the response content
         content = result['choices'][0]['message']['content']
-        
-        # Try to parse JSON from the response
-        # Sometimes LLMs wrap JSON in markdown code blocks
-        json_match = re.search(r'```json\s*(\{.*?\})\s*```', content, re.DOTALL)
-        if json_match:
-            content = json_match.group(1)
-        
-        return json.loads(content)
-        
+        parsed = parse_json_response(content)
+        # Callers call .get() on the result, so valid JSON that is not an
+        # object (a list, string or number) is reported as a failure.
+        return parsed if isinstance(parsed, dict) else None
     except Exception as e:
-        print(f"Error calling LLM: {e}")
+        print(f"  ! LLM error: {e}")
         return None

-def process_note(file_path, taxonomy):
-    """Process a single note file"""
+
+def request_metadata(title, note_content, taxonomy):
+    """Ask the LLM for tags and SEO metadata for one note.
+
+    Args:
+        title: Note title, shown to the model in the user prompt.
+        note_content: Note body text to analyse.
+        taxonomy: Iterable of existing tag names, listed in the system prompt.
+
+    Returns:
+        Whatever ``call_llm_json`` returns for the request. The prompt asks
+        for the keys ``tags_from_taxonomy``, ``new_tag_suggestions``,
+        ``seo_title_suffix``, ``seo_description`` and ``seo_keywords``.
+    """
+    taxonomy_str = ", ".join(taxonomy)
+    system_prompt = f"""You analyze markdown notes and return structured metadata.
+
+Return ONLY valid JSON in this exact shape:
+{{
+  "tags_from_taxonomy": ["tag1", "tag2"],
+  "new_tag_suggestions": ["newtag1"],
+  "seo_title_suffix": "Short descriptor that will follow the note title",
+  "seo_description": "Factual summary between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters.",
+  "seo_keywords": ["keyword1", "keyword2"]
+}}
+
+Rules:
+- tags_from_taxonomy: 1-5 tags drawn from the existing taxonomy that best fit the content.
+- new_tag_suggestions: 0-2 NEW tags, only when content truly warrants it (be conservative).
+- seo_title_suffix: a short, clean, non-clickbaity descriptor of the note. Do NOT include the note title or a leading colon — only the text that would follow "<title>: ". Aim for 4-10 words.
+- seo_description: a clean factual summary, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding.
+- seo_keywords: 10-15 relevant keywords, no duplicates.
+
+Existing tag taxonomy: {taxonomy_str}"""
+
+    user_prompt = f"""Note title: {title}
+
+Note content:
+{note_content}
+
+Produce the JSON described in the system prompt."""
+    return call_llm_json(system_prompt, user_prompt)
+
+
+def request_description_retry(title, note_content, previous_desc):
+    """Ask the LLM to rewrite an SEO description whose length was out of range.
+
+    Args:
+        title: Note title, shown to the model in the user prompt.
+        note_content: Note body text the description must summarise.
+        previous_desc: The rejected description; its text and length are
+            quoted back to the model.
+
+    Returns:
+        The stripped ``seo_description`` from the reply, or ``''`` when the
+        call yields nothing usable. The length is not re-checked here.
+    """
+    system_prompt = f"""You rewrite SEO descriptions to a strict length.
+
+Return ONLY valid JSON of the form:
+{{"seo_description": "..."}}
+
+The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding."""
+    user_prompt = f"""Note title: {title}
+
+Note content:
+{note_content}
+
+Your previous description was {len(previous_desc)} characters, outside the allowed {SEO_DESC_MIN}-{SEO_DESC_MAX} range:
+"{previous_desc}"
+
+Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters."""
+    # A shorter reply is expected here than for the full metadata request.
+    result = call_llm_json(system_prompt, user_prompt, max_tokens=400)
+    if result:
+        return (result.get('seo_description') or '').strip()
+    return ''
+
+
+def process_note(file_path, taxonomy, new_tag_accumulator):
+    """Fill in missing frontmatter fields (slug, tags, SEO metadata) for one note.
+
+    Fields that already hold a truthy value are left untouched. The slug is
+    derived from the file name; tags and SEO fields come from the LLM. The
+    file is rewritten only when at least one field was added.
+
+    Args:
+        file_path: Path of the markdown note to process.
+        taxonomy: Known tags; sent to the LLM and used to decide which of
+            the applied tags are new.
+        new_tag_accumulator: Set updated in place with tags that were added
+            to the note but are not in ``taxonomy``.
+    """
     print(f"Processing: {file_path}")
-    
-    # Read the file
     with open(file_path, 'r', encoding='utf-8') as f:
         content = f.read()
-    
-    # Extract frontmatter and body
-    frontmatter, body, original_fm_str = extract_frontmatter(content)
-    
+
+    frontmatter, body = extract_frontmatter(content)
     if frontmatter is None:
-        print(f"  ⚠️  No frontmatter found, skipping")
+        print("  ⚠️  No frontmatter found, skipping")
         return
-    
-    # Check what needs to be filled in
-    needs_update = False
-    existing_tags = frontmatter.get('tags', [])
-    if not existing_tags or existing_tags == [None]:
+
+    existing_tags = frontmatter.get('tags', []) or []
+    if existing_tags == [None]:
         existing_tags = []
-    
+
     needs_tags = not existing_tags
     needs_slug = not frontmatter.get('slug')
     needs_seo_title = not frontmatter.get('seo-title')
     needs_seo_desc = not frontmatter.get('seo-description')
     needs_seo_keywords = not frontmatter.get('seo-keywords')
-    
+
     if not (needs_tags or needs_slug or needs_seo_title or needs_seo_desc or needs_seo_keywords):
-        print(f"  ✓ All fields already populated, skipping")
+        print("  ✓ All fields already populated, skipping")
         return
-    
+
+    # YAML can parse a title such as ``1984`` or ``2024-01-01`` as a number or
+    # date; coerce to str so the .lower() / len() calls below cannot fail.
+    title = str(frontmatter.get('title') or Path(file_path).stem)
     updated = False
-    
-    # Generate slug from filename if needed
+
     if needs_slug:
-        # Get filename without extension
-        filename = Path(file_path).stem
-        # Convert to lowercase and replace spaces with hyphens
-        slug = filename.lower().replace(' ', '-')
+        slug = slugify(Path(file_path).stem)
         frontmatter['slug'] = slug
         print(f"  + Added slug: {slug}")
         updated = True
-    
-    # Only call LLM if we need tags or SEO fields
+
     if not (needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords):
-        # Only needed slug, we're done
-        new_content = reconstruct_markdown(frontmatter, body)
         with open(file_path, 'w', encoding='utf-8') as f:
-            f.write(new_content)
-        print(f"  ✓ Updated successfully")
+            f.write(reconstruct_markdown(frontmatter, body))
+        print("  ✓ Updated successfully")
         return
-    
-    # Call LLM
-    llm_response = call_llm(None, body[:20000], taxonomy)  # Limit content to first 20000 chars
-    
+
+    llm_response = request_metadata(title, body[:CONTENT_CHAR_LIMIT], taxonomy)
     if not llm_response:
-        print(f"  ✗ Failed to get LLM response")
+        print("  ✗ Failed to get LLM response")
         return
-    
-    # Update frontmatter with new values (only if empty)
-    
+
     if needs_tags:
-        # Combine taxonomy tags and new suggestions
-        all_tags = llm_response.get('tags_from_taxonomy', [])
-        new_suggestions = llm_response.get('new_tag_suggestions', [])
-        all_tags.extend(new_suggestions)
-        
-        # Limit to 5 tags total
-        all_tags = all_tags[:5]
-        
-        if all_tags:
-            frontmatter['tags'] = all_tags
+        taxonomy_tags = llm_response.get('tags_from_taxonomy') or []
+        new_suggestions = llm_response.get('new_tag_suggestions') or []
+        combined = list(dict.fromkeys(list(taxonomy_tags) + list(new_suggestions)))[:5]
+        if combined:
+            frontmatter['tags'] = combined
             updated = True
-            print(f"  + Added tags: {', '.join(all_tags)}")
-            if new_suggestions:
-                print(f"    (New suggestions: {', '.join(new_suggestions)})")
-    
+            print(f"  + Added tags: {', '.join(combined)}")
+            genuinely_new = [t for t in combined if t not in taxonomy and t in new_suggestions]
+            if genuinely_new:
+                print(f"    (New suggestions: {', '.join(genuinely_new)})")
+                new_tag_accumulator.update(genuinely_new)
+
     if needs_seo_title:
-        seo_title = llm_response.get('seo_title', '')
-        if seo_title:
+        suffix = (llm_response.get('seo_title_suffix') or '').strip()
+        suffix = suffix.lstrip(':').strip()
+        # Strip a leading repeat of the title if the LLM included it anyway
+        if suffix.lower().startswith(title.lower()):
+            suffix = suffix[len(title):].lstrip(':').strip()
+        if suffix:
+            seo_title = f"{title}: {suffix}"
             frontmatter['seo-title'] = seo_title
             updated = True
             print(f"  + Added SEO title: {seo_title}")
-    
+
     if needs_seo_desc:
-        seo_desc = llm_response.get('seo_description', '')
+        seo_desc = (llm_response.get('seo_description') or '').strip()
+        if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX):
+            print(f"  ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking")
+            retry = request_description_retry(title, body[:CONTENT_CHAR_LIMIT], seo_desc)
+            if retry and SEO_DESC_MIN <= len(retry) <= SEO_DESC_MAX:
+                seo_desc = retry
+            elif retry:
+                print(f"  ! Retry still {len(retry)} chars; using original")
+            else:
+                print("  ! Retry failed; using original")
         if seo_desc:
             frontmatter['seo-description'] = seo_desc
             updated = True
-            print(f"  + Added SEO description")
-    
+            print(f"  + Added SEO description ({len(seo_desc)} chars)")
+
     if needs_seo_keywords:
-        seo_keywords = llm_response.get('seo_keywords', [])
+        seo_keywords = list(dict.fromkeys(llm_response.get('seo_keywords') or []))
         if seo_keywords:
             frontmatter['seo-keywords'] = seo_keywords
             updated = True
             print(f"  + Added {len(seo_keywords)} SEO keywords")
-    
+
     if updated:
-        # Write back to file
-        new_content = reconstruct_markdown(frontmatter, body)
         with open(file_path, 'w', encoding='utf-8') as f:
-            f.write(new_content)
-        print(f"  ✓ Updated successfully")
+            f.write(reconstruct_markdown(frontmatter, body))
+        print("  ✓ Updated successfully")
     else:
-        print(f"  - No updates needed")
+        print("  - No updates needed")
+

 def main():
+    """Annotate every markdown note under NOTES_FOLDER, then offer to extend the taxonomy.
+
+    Exits with status 1 when the taxonomy file or the notes folder is missing
+    or the notes path is not a directory, and with status 0 when no markdown
+    files are found.
+    """
-    # Load taxonomy
     taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
     if not taxonomy_path.exists():
         print(f"Error: Taxonomy file not found at {taxonomy_path}")
         print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
         sys.exit(1)
-    
+
     taxonomy = load_taxonomy(taxonomy_path)
     print(f"Loaded {len(taxonomy)} tags from taxonomy\n")
-    
-    # Use the hardcoded notes folder
+
     target_path = Path(NOTES_FOLDER)
-    
     if not target_path.exists():
         print(f"Error: Notes folder not found: {target_path}")
         sys.exit(1)
-    
     if not target_path.is_dir():
         print(f"Error: {target_path} is not a directory")
         sys.exit(1)
-    
-    # Process all markdown files in the directory
-    md_files = list(target_path.glob('*.md'))
-    
+
+    # rglob also picks up notes in sub-folders; sorting fixes the processing order.
+    md_files = sorted(target_path.rglob('*.md'))
     if not md_files:
         print(f"No markdown files found in {target_path}")
         sys.exit(0)
-    
-    print(f"Processing all markdown files in: {target_path}")
+
+    print(f"Processing all markdown files under: {target_path}")
     print(f"Found {len(md_files)} markdown files\n")
-    
+
+    # Collects, across all notes, applied tags that are not in the taxonomy.
+    new_tag_accumulator = set()
     for md_file in md_files:
-        process_note(md_file, taxonomy)
-        print()  # Blank line between files
-    
+        # A failure in one note is reported and does not stop the batch.
+        try:
+            process_note(md_file, taxonomy, new_tag_accumulator)
+        except Exception as e:
+            print(f"  ✗ Error processing {md_file}: {e}")
+        print()
+
     print("\n✓ Processing complete!")

+    fresh = sorted(t for t in new_tag_accumulator if t not in taxonomy)
+    if fresh:
+        print(f"\nNew tags suggested during this run: {', '.join(fresh)}")
+        # With no stdin available (EOF), treat the prompt as answered "no".
+        try:
+            answer = input("Add these to the taxonomy? [y/N]: ").strip().lower()
+        except EOFError:
+            answer = ''
+        if answer == 'y':
+            append_tags_to_taxonomy(taxonomy_path, fresh)
+            print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")
+        else:
+            print("Skipped taxonomy update.")
+
+
+# Run the batch only when this file is executed as a script, not on import.
 if __name__ == "__main__":
     main()