note-tagger/tag-notes.py

#!/usr/bin/env python3
"""
Note Tagging and SEO Metadata Script
Processes markdown notes using a local LLM to add tags and SEO metadata
"""

import os
import sys
import yaml
import requests
import json
from pathlib import Path
import re

# Configuration
# Chat-completions endpoint that call_llm() POSTs to (hard-coded address).
LM_STUDIO_URL = "http://192.168.68.84:1234/v1/chat/completions"
# Model identifier sent as the "model" field of every request.
MODEL_NAME = "openai/gpt-oss-20b"
# Taxonomy file name; main() resolves it relative to this script's directory.
TAXONOMY_FILE = "tag-taxonomy.yaml"
# Folder whose top-level *.md files main() processes ("~" is expanded here).
NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")

def load_taxonomy(taxonomy_path):
    """Load the tag taxonomy from a YAML file.

    Args:
        taxonomy_path: Path of the YAML file. Its top-level mapping is
            expected to hold the tag list under the ``tags`` key.

    Returns:
        The value stored under ``tags``, or an empty list when the file is
        empty, is not a mapping, or has a missing/null ``tags`` entry.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Explicit UTF-8 so non-ASCII tags load the same on every platform,
    # matching how the notes themselves are read.
    with open(taxonomy_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load yields None for an empty file and may yield a scalar or a
    # list; only a mapping can carry a "tags" entry.
    if not isinstance(data, dict):
        return []

    # "tags:" with no value parses as None; callers need something sized.
    return data.get('tags') or []

def extract_frontmatter(content):
    """Split a markdown document into its YAML frontmatter and body.

    The frontmatter must start on the first line and be enclosed by two
    ``---`` delimiter lines.

    Args:
        content: Full text of the markdown file.

    Returns:
        A ``(frontmatter, body, frontmatter_str)`` tuple where
        ``frontmatter`` is the parsed mapping, ``body`` is everything after
        the closing delimiter line, and ``frontmatter_str`` is the raw YAML
        text. When there is no delimited frontmatter, or it does not parse
        to a mapping (empty block, list, scalar), the result is
        ``(None, content, None)``.

    Raises:
        yaml.YAMLError: If the delimited block is not valid YAML.
    """
    # Only spaces/tabs may trail a delimiter. Using \s here would also
    # swallow the blank lines that follow the closing "---", silently
    # dropping them from the body when the note is written back.
    # The closing delimiter may also be the very last line of the file.
    pattern = r'^---[ \t]*\r?\n(.*?)\n---[ \t]*(?:\r?\n|\Z)(.*)$'
    match = re.match(pattern, content, re.DOTALL)
    if not match:
        return None, content, None

    frontmatter_str, body = match.group(1), match.group(2)
    frontmatter = yaml.safe_load(frontmatter_str)

    # Callers treat the frontmatter as a dict; anything else is unusable.
    if not isinstance(frontmatter, dict):
        return None, content, None

    return frontmatter, body, frontmatter_str

def reconstruct_markdown(frontmatter, body):
    """Serialize *frontmatter* to YAML and re-attach it in front of *body*.

    Args:
        frontmatter: Frontmatter data to dump.
        body: Markdown text that follows the frontmatter.

    Returns:
        The document text: an opening ``---`` line, the dumped YAML, a
        closing ``---`` line, then ``body``.
    """
    # Block style, unescaped unicode, and the mapping's own key order.
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
    }
    serialized = yaml.dump(frontmatter, **dump_options)
    # The closing delimiter is appended directly after the dump, which
    # relies on the dumped text ending with a newline.
    return "---\n{}---\n{}".format(serialized, body)

def _extract_json_object(text):
    """Return the JSON object embedded in an LLM reply, or None if absent."""
    if not isinstance(text, str):
        return None

    candidates = []
    # LLMs often wrap JSON in a markdown fence, with or without a "json" tag.
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))
    # The reply may already be bare JSON.
    candidates.append(text)
    # Last resort: the outermost braces, for replies with surrounding prose.
    start, end = text.find('{'), text.rfind('}')
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:
            continue
        # Callers use .get(), so only a JSON object is acceptable.
        if isinstance(parsed, dict):
            return parsed
    return None


def call_llm(prompt, note_content, taxonomy):
    """Call the LM Studio API to get tags and SEO metadata for a note.

    Args:
        prompt: Unused; retained so existing callers keep working.
        note_content: Note text embedded in the user message.
        taxonomy: Iterable of existing tag strings offered to the model.

    Returns:
        The parsed JSON object (a dict) from the model's reply, or None when
        the request fails, the response has an unexpected shape, or no JSON
        object can be extracted. Failures are reported on stdout.
    """
    taxonomy_str = ", ".join(taxonomy)

    system_prompt = f"""You are a helpful assistant that analyzes markdown notes and provides:
1. Tags from existing taxonomy (1-5 tags, prefer existing)
2. 1-2 NEW tag suggestions if content warrants it
3. Clean, concise SEO title (not clickbaity)
4. Clean, concise SEO description (150-160 chars, factual summary)
5. SEO keywords (be generous, 10-15 relevant keywords)

Existing tag taxonomy: {taxonomy_str}

Return ONLY valid JSON in this exact format:
{{
  "tags_from_taxonomy": ["tag1", "tag2"],
  "new_tag_suggestions": ["newtag1"],
  "seo_title": "Clear Title Here",
  "seo_description": "Concise factual summary of the note content.",
  "seo_keywords": ["keyword1", "keyword2", "keyword3"]
}}"""

    user_prompt = f"""Analyze this note and provide tags and SEO metadata:

{note_content}

Remember:
- Use 1-5 tags from taxonomy that fit best
- Suggest 1-2 NEW tags only if content really warrants it (be conservative)
- SEO title should be clear and informative, NOT clickbaity
- SEO description should be a clean factual summary (150-160 characters)
- SEO keywords can be generous (10-15 keywords)"""

    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0.7,
        "max_tokens": 500
    }

    try:
        response = requests.post(LM_STUDIO_URL, json=payload, timeout=60)
        response.raise_for_status()

        # Extract the assistant message text from the first choice.
        reply = response.json()['choices'][0]['message']['content']

        parsed = _extract_json_object(reply)
        if parsed is None:
            raise ValueError("response did not contain a JSON object")
        return parsed

    # Network/HTTP errors, an unexpected response shape, or unparsable
    # JSON all mean "no usable answer for this note"; report and move on.
    except (requests.RequestException, KeyError, IndexError,
            TypeError, ValueError) as e:
        print(f"Error calling LLM: {e}")
        return None

def _clean_tags(values):
    """Return the unique, non-blank string entries of *values*, in order."""
    if not isinstance(values, list):
        return []
    cleaned = []
    for value in values:
        if not isinstance(value, str):
            continue
        tag = value.strip()
        if tag and tag not in cleaned:
            cleaned.append(tag)
    return cleaned


def process_note(file_path, taxonomy):
    """Fill in missing tags and SEO metadata for a single note file.

    Only frontmatter fields that are absent or empty (``tags``,
    ``seo-title``, ``seo-description``, ``seo-keywords``) are populated;
    existing values are never overwritten. The file is rewritten only when
    at least one field was added. Rewriting re-serializes the whole
    frontmatter through ``reconstruct_markdown``.

    Args:
        file_path: Path of the markdown note.
        taxonomy: Existing tag names passed on to the LLM.

    Returns:
        None. Progress is reported on stdout.
    """
    print(f"Processing: {file_path}")

    # Read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract frontmatter and body; a note with broken YAML is skipped
    # rather than aborting the whole run.
    try:
        frontmatter, body, _ = extract_frontmatter(content)
    except yaml.YAMLError as e:
        print(f"  ⚠️  Invalid frontmatter YAML, skipping: {e}")
        return

    if frontmatter is None:
        print("  ⚠️  No frontmatter found, skipping")
        return

    # A frontmatter that parsed to a list or scalar cannot hold fields.
    if not isinstance(frontmatter, dict):
        print("  ⚠️  Frontmatter is not a mapping, skipping")
        return

    # Check what needs to be filled in. An empty "tags:" list entry
    # parses as [None], which counts as "no tags".
    existing_tags = frontmatter.get('tags')
    needs_tags = not existing_tags or existing_tags == [None]
    needs_seo_title = not frontmatter.get('seo-title')
    needs_seo_desc = not frontmatter.get('seo-description')
    needs_seo_keywords = not frontmatter.get('seo-keywords')

    if not (needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords):
        print("  ✓ All fields already populated, skipping")
        return

    # Call LLM
    llm_response = call_llm(None, body[:20000], taxonomy)  # Limit content to first 20000 chars

    if not llm_response or not isinstance(llm_response, dict):
        print("  ✗ Failed to get LLM response")
        return

    # Update frontmatter with new values (only if empty)
    updated = False

    if needs_tags:
        # Sanitize both lists: the model may return null, non-strings or
        # duplicates. Building new lists also leaves the response untouched.
        taxonomy_tags = _clean_tags(llm_response.get('tags_from_taxonomy'))
        suggested = _clean_tags(llm_response.get('new_tag_suggestions'))

        # Taxonomy tags first, then suggestions; limit to 5 tags total
        all_tags = _clean_tags(taxonomy_tags + suggested)[:5]

        if all_tags:
            frontmatter['tags'] = all_tags
            updated = True
            print(f"  + Added tags: {', '.join(all_tags)}")
            # Report only the suggestions that survived the 5-tag limit.
            kept_new = [
                tag for tag in suggested
                if tag in all_tags and tag not in taxonomy_tags
            ]
            if kept_new:
                print(f"    (New suggestions: {', '.join(kept_new)})")

    if needs_seo_title:
        seo_title = llm_response.get('seo_title', '')
        if seo_title:
            frontmatter['seo-title'] = seo_title
            updated = True
            print(f"  + Added SEO title: {seo_title}")

    if needs_seo_desc:
        seo_desc = llm_response.get('seo_description', '')
        if seo_desc:
            frontmatter['seo-description'] = seo_desc
            updated = True
            print("  + Added SEO description")

    if needs_seo_keywords:
        seo_keywords = llm_response.get('seo_keywords', [])
        if seo_keywords:
            frontmatter['seo-keywords'] = seo_keywords
            updated = True
            print(f"  + Added {len(seo_keywords)} SEO keywords")

    if updated:
        # Write back to file
        new_content = reconstruct_markdown(frontmatter, body)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(new_content)
        print("  ✓ Updated successfully")
    else:
        print("  - No updates needed")

def main():
    """Tag every markdown note in NOTES_FOLDER.

    Loads the taxonomy that sits next to this script, then runs
    ``process_note`` on each ``*.md`` file directly inside ``NOTES_FOLDER``
    (subfolders are not searched). Exits with status 1 when the taxonomy
    file or the notes folder is missing, and with status 0 when the folder
    contains no markdown files.
    """
    # Load taxonomy
    taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
    if not taxonomy_path.exists():
        print(f"Error: Taxonomy file not found at {taxonomy_path}")
        print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
        sys.exit(1)

    taxonomy = load_taxonomy(taxonomy_path)
    print(f"Loaded {len(taxonomy)} tags from taxonomy\n")

    # Use the hardcoded notes folder
    target_path = Path(NOTES_FOLDER)

    if not target_path.exists():
        print(f"Error: Notes folder not found: {target_path}")
        sys.exit(1)

    if not target_path.is_dir():
        print(f"Error: {target_path} is not a directory")
        sys.exit(1)

    # Process all markdown files in the directory. glob() yields entries in
    # arbitrary order; sorting makes runs reproducible and logs comparable.
    md_files = sorted(target_path.glob('*.md'))

    if not md_files:
        print(f"No markdown files found in {target_path}")
        sys.exit(0)

    print(f"Processing all markdown files in: {target_path}")
    print(f"Found {len(md_files)} markdown files\n")

    for md_file in md_files:
        # One unreadable or non-UTF-8 note must not abort the whole batch.
        try:
            process_note(md_file, taxonomy)
        except (OSError, UnicodeDecodeError) as e:
            print(f"  ✗ Could not process file: {e}")
        print()  # Blank line between files

    print("\n✓ Processing complete!")

# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()