# tag-notes.py

#!/usr/bin/env python3
"""
Note Tagging and SEO Metadata Script
Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata
"""

import os
import sys
import io
import json
import re
from pathlib import Path

import requests
import yaml
from ruamel.yaml import YAML

# Configuration
# Chat-completions endpoint that call_llm_json() POSTs to.
LM_STUDIO_URL = "http://localhost:1234/v1/chat/completions"
# Model identifier sent in the "model" field of every request.
MODEL_NAME = "openai/gpt-oss-20b"
# Taxonomy file name; main() resolves it relative to this script's directory.
TAXONOMY_FILE = "tag-taxonomy.yaml"
# Root folder that main() searches recursively for *.md notes.
NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")
# Maximum number of characters of a note body that is sent to the LLM.
CONTENT_CHAR_LIMIT = 20000
# Inclusive character bounds an SEO description must satisfy to be accepted
# without a retry.
SEO_DESC_MIN = 150
SEO_DESC_MAX = 160

# Round-trip YAML preserves existing frontmatter formatting.
# This single instance is shared: extract_frontmatter() loads with it and
# reconstruct_markdown() dumps with it.
yaml_rt = YAML()
# Keep the quoting style already used in the note's frontmatter.
yaml_rt.preserve_quotes = True
# Very large line width - presumably so long values (e.g. SEO descriptions)
# are not wrapped onto several lines when dumped; confirm against ruamel docs.
yaml_rt.width = 4096


def load_taxonomy(taxonomy_path):
    """Return the tags listed under the top-level ``tags`` key of the taxonomy file.

    Args:
        taxonomy_path: Path of the YAML taxonomy file.

    Returns:
        The value stored under ``tags``, or an empty list when the file is
        empty, its top level is not a mapping, or ``tags`` is missing/empty.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default;
    # the taxonomy is written with allow_unicode=True, so it may contain
    # raw non-ASCII characters.
    with open(taxonomy_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or {}
    if not isinstance(data, dict):
        # e.g. a bare list or scalar at the top level: there is no 'tags' key.
        return []
    return data.get('tags', []) or []


def append_tags_to_taxonomy(taxonomy_path, new_tags):
    """Append *new_tags* to the ``tags`` list of the taxonomy file, in place.

    Existing tags keep their order; new tags are appended after them and any
    duplicates are dropped (first occurrence wins).

    Args:
        taxonomy_path: Path of the YAML taxonomy file to rewrite.
        new_tags: Iterable of tags to add.

    NOTE(review): the file is re-serialized from the parsed data, so YAML
    comments and custom formatting in the taxonomy are presumably lost.
    """
    # Both the read and the write use UTF-8 explicitly: allow_unicode=True
    # below emits non-ASCII characters verbatim, which fails or corrupts the
    # file when the platform default encoding is not UTF-8.
    with open(taxonomy_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or {}
    existing = data.get('tags', []) or []
    # dict.fromkeys de-duplicates while preserving insertion order.
    combined = list(dict.fromkeys(list(existing) + list(new_tags)))
    data['tags'] = combined
    with open(taxonomy_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)


def extract_frontmatter(content):
    """Split a markdown document into parsed YAML frontmatter and body text.

    The frontmatter must start on the first line with ``---`` and end at the
    next line consisting of ``---`` (optionally followed by spaces/tabs).

    Args:
        content: Full text of the markdown file.

    Returns:
        A ``(frontmatter, body)`` tuple. ``frontmatter`` is whatever
        ``yaml_rt.load`` produces for the block (it may be ``None`` if the
        block holds no YAML data). When no frontmatter block is found the
        result is ``(None, content)``.
    """
    # [^\S\n]* matches trailing whitespace on the delimiter line but never a
    # newline, so blank lines at the top of the body stay in the body instead
    # of being swallowed (and then lost when the note is rewritten).
    # (?:\n|\Z) also accepts a closing '---' that is the very end of the file.
    pattern = r'^---[^\S\n]*\n(.*?)\n---[^\S\n]*(?:\n|\Z)(.*)$'
    match = re.match(pattern, content, re.DOTALL)
    if not match:
        return None, content
    frontmatter = yaml_rt.load(match.group(1))
    return frontmatter, match.group(2)


def reconstruct_markdown(frontmatter, body):
    """Serialize *frontmatter* and glue it back in front of *body*.

    The YAML is dumped through the shared round-trip instance, wrapped in
    ``---`` delimiter lines, and followed directly by the body text.
    """
    with io.StringIO() as stream:
        yaml_rt.dump(frontmatter, stream)
        fm_str = stream.getvalue()
    # The closing delimiter must start on its own line.
    if fm_str[-1:] != '\n':
        fm_str = f"{fm_str}\n"
    return f"---\n{fm_str}---\n{body}"


def slugify(text):
    """Turn *text* into a lowercase, hyphen-separated slug.

    Apostrophes and backticks are removed outright, every other character
    that is not a word character, whitespace or hyphen becomes a separator,
    and runs of separators collapse to a single hyphen with none left at
    either end.
    """
    substitutions = (
        (r"[’'`]", ''),        # drop quote marks so "don't" -> "dont"
        (r'[^\w\s-]', ' '),    # other punctuation acts as a word break
        (r'[-\s]+', '-'),      # collapse breaks into one hyphen
    )
    slug = text.lower()
    for regex, replacement in substitutions:
        slug = re.sub(regex, replacement, slug)
    return slug.strip('-')


def parse_json_response(content):
    """Extract a JSON object from raw LLM output.

    The whole text is tried first; if that fails, the span from the first
    ``{`` to the last ``}`` is tried, which copes with code fences or prose
    around the object.

    Args:
        content: Raw text returned by the model, or ``None``.

    Returns:
        The decoded ``dict``, or ``None`` when no JSON object can be
        recovered. Valid JSON that is not an object (list, number, string)
        counts as a failure, because callers treat the result as a mapping.
    """
    if content is None:
        return None

    def _load_object(text):
        # Decode *text* and accept the result only if it is a JSON object.
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            return None
        return parsed if isinstance(parsed, dict) else None

    parsed = _load_object(content)
    if parsed is not None:
        return parsed

    start = content.find('{')
    end = content.rfind('}')
    if start != -1 and end > start:
        return _load_object(content[start:end + 1])
    return None


def call_llm_json(system_prompt, user_prompt, max_tokens=4000):
    """Send a system/user prompt pair to the LLM endpoint and parse the reply.

    Args:
        system_prompt: Text for the ``system`` message.
        user_prompt: Text for the ``user`` message.
        max_tokens: Token budget forwarded in the request.

    Returns:
        The value produced by ``parse_json_response`` for the first choice's
        message content, or ``None`` on an HTTP error, an unparseable reply,
        or any exception (each case is reported on stdout).
    """
    request_body = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.2,
        "max_tokens": max_tokens,
        "response_format": {"type": "text"},
    }
    try:
        response = requests.post(LM_STUDIO_URL, json=request_body, timeout=600)
        if not response.ok:
            print(f"  ! LLM error: {response.status_code} {response.reason}")
            print(f"    body: {response.text[:500]}")
            return None

        first_choice = response.json()['choices'][0]
        content = first_choice['message'].get('content') or ''
        parsed = parse_json_response(content)
        if parsed is not None:
            return parsed

        # Nothing usable came back: explain why as far as the reply allows.
        finish = first_choice.get('finish_reason')
        print(f"  ! LLM returned no parseable JSON (finish_reason={finish})")
        if finish == 'length':
            print("    Hit max_tokens — reasoning model burned the budget before emitting output.")
        print(f"    content: {content[:500]!r}")
        return None
    except Exception as e:
        # Best-effort boundary: any failure is reported and surfaces as None.
        print(f"  ! LLM error: {e}")
        return None


def request_metadata(title, note_content, taxonomy):
    """Ask the LLM for tags and SEO metadata describing one note.

    Args:
        title: Note title shown to the model.
        note_content: Note body text shown to the model.
        taxonomy: Iterable of existing tag strings, listed in the prompt.

    Returns:
        Whatever ``call_llm_json`` returns for the request (``None`` on
        failure).
    """
    # The user turn carries only the note; all rules live in the system turn.
    user_prompt = f"""Note title: {title}

Note content:
{note_content}

Produce the JSON described in the system prompt."""

    taxonomy_str = ", ".join(taxonomy)
    system_prompt = f"""You analyze markdown notes and return structured metadata.

Return ONLY valid JSON in this exact shape:
{{
  "tags_from_taxonomy": ["tag1", "tag2"],
  "new_tag_suggestions": ["newtag1"],
  "seo_title_suffix": "Short descriptor that will follow the note title",
  "seo_description": "Factual summary between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters.",
  "seo_keywords": ["keyword1", "keyword2"]
}}

Rules:
- Tags should describe what the note is substantively about, not topics it merely mentions in passing.
- tags_from_taxonomy: 0-5 tags drawn from the existing taxonomy, ONLY when they genuinely fit. Do NOT force a taxonomy tag — return an empty list if nothing truly applies.
- new_tag_suggestions: 0-5 NEW tags when the taxonomy doesn't adequately cover the content. Each must be a reusable category (not hyper-specific to one note). Use lowercase-hyphenated style (e.g., personal-essay).
- seo_title_suffix: a short, clean, non-clickbaity descriptor of the note. Do NOT include the note title or a leading colon — only the text that would follow "<title>: ". Aim for 4-10 words.
- seo_description: a clean factual summary, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding.
- seo_keywords: 10-15 relevant keywords, no duplicates.

Existing tag taxonomy: {taxonomy_str}"""

    metadata = call_llm_json(system_prompt, user_prompt)
    return metadata


def request_description_retry(title, note_content, previous_desc):
    """Ask the LLM to rewrite an SEO description that missed the length bounds.

    Args:
        title: Note title shown to the model.
        note_content: Note body text shown to the model.
        previous_desc: The rejected description; its length is quoted back
            to the model.

    Returns:
        The stripped replacement description, or ``''`` when the call failed
        or the reply carried no description.
    """
    user_prompt = f"""Note title: {title}

Note content:
{note_content}

Your previous description was {len(previous_desc)} characters, outside the allowed {SEO_DESC_MIN}-{SEO_DESC_MAX} range:
"{previous_desc}"

Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters."""
    system_prompt = f"""You rewrite SEO descriptions to a strict length.

Return ONLY valid JSON of the form:
{{"seo_description": "..."}}

The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding."""

    result = call_llm_json(system_prompt, user_prompt)
    if not result:
        return ''
    rewritten = result.get('seo_description') or ''
    return rewritten.strip()


def _as_text(value):
    """Return *value* stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ''


def _as_str_list(value):
    """Coerce an LLM-supplied field into a de-duplicated list of non-empty strings."""
    if isinstance(value, str):
        # A bare string would otherwise be iterated character by character.
        value = value.split(',')
    elif not isinstance(value, (list, tuple)):
        return []
    stripped = (item.strip() for item in value if isinstance(item, str))
    return list(dict.fromkeys(item for item in stripped if item))


def _write_note(file_path, frontmatter, body):
    """Write the rebuilt note back to *file_path* and report success."""
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(reconstruct_markdown(frontmatter, body))
    print("  ✓ Updated successfully")


def process_note(file_path, taxonomy, new_tag_accumulator):
    """Fill in missing tags, slug and SEO fields in one note's frontmatter.

    Only fields that are absent or empty are touched. The slug is derived
    locally from the file name; tags, SEO title, description and keywords
    come from a single LLM request (plus one retry for a description of the
    wrong length). The file is rewritten only when something changed.

    Args:
        file_path: Path of the markdown note.
        taxonomy: Collection of known tags; passed to the LLM prompt and used
            to decide which applied tags count as new.
        new_tag_accumulator: Set-like object (needs ``update``) that collects
            newly suggested tags across notes.

    Returns:
        None. Progress is reported on stdout.
    """
    print(f"Processing: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    frontmatter, body = extract_frontmatter(content)
    if frontmatter is None:
        print("  ⚠️  No frontmatter found, skipping")
        return

    existing_tags = frontmatter.get('tags', []) or []
    # An empty "- " list item under "tags:" parses to [None]; treat as no tags.
    if existing_tags == [None]:
        existing_tags = []

    needs_tags = not existing_tags
    needs_slug = not frontmatter.get('slug')
    needs_seo_title = not frontmatter.get('seo-title')
    needs_seo_desc = not frontmatter.get('seo-description')
    needs_seo_keywords = not frontmatter.get('seo-keywords')
    needs_llm = needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords

    if not (needs_slug or needs_llm):
        print("  ✓ All fields already populated, skipping")
        return

    # YAML may parse a title such as "1984" as a non-string; the string
    # operations below need text.
    title = str(frontmatter.get('title') or Path(file_path).stem)
    updated = False

    if needs_slug:
        slug = slugify(Path(file_path).stem)
        frontmatter['slug'] = slug
        print(f"  + Added slug: {slug}")
        updated = True

    if not needs_llm:
        # Only the slug was missing, so no LLM round-trip is required.
        _write_note(file_path, frontmatter, body)
        return

    llm_response = request_metadata(title, body[:CONTENT_CHAR_LIMIT], taxonomy)
    if not llm_response or not isinstance(llm_response, dict):
        print("  ✗ Failed to get LLM response")
        if updated:
            # The slug was computed locally and already reported as added;
            # persist it rather than discarding it with the failed request.
            _write_note(file_path, frontmatter, body)
        return

    if needs_tags:
        taxonomy_tags = _as_str_list(llm_response.get('tags_from_taxonomy'))
        new_suggestions = _as_str_list(llm_response.get('new_tag_suggestions'))
        # Taxonomy tags take priority; the note gets at most five tags.
        combined = list(dict.fromkeys(taxonomy_tags + new_suggestions))[:5]
        if combined:
            frontmatter['tags'] = combined
            updated = True
            print(f"  + Added tags: {', '.join(combined)}")
            genuinely_new = [t for t in combined if t not in taxonomy and t in new_suggestions]
            if genuinely_new:
                print(f"    (New suggestions: {', '.join(genuinely_new)})")
                new_tag_accumulator.update(genuinely_new)

    if needs_seo_title:
        suffix = _as_text(llm_response.get('seo_title_suffix'))
        suffix = suffix.lstrip(':').strip()
        # Strip a leading repeat of the title if the LLM included it anyway
        if suffix.lower().startswith(title.lower()):
            suffix = suffix[len(title):].lstrip(':').strip()
        if suffix:
            seo_title = f"{title}: {suffix}"
            frontmatter['seo-title'] = seo_title
            updated = True
            print(f"  + Added SEO title: {seo_title}")

    if needs_seo_desc:
        seo_desc = _as_text(llm_response.get('seo_description'))
        if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX):
            print(f"  ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking")
            retry = request_description_retry(title, body[:CONTENT_CHAR_LIMIT], seo_desc)
            if retry and SEO_DESC_MIN <= len(retry) <= SEO_DESC_MAX:
                seo_desc = retry
            elif retry:
                print(f"  ! Retry still {len(retry)} chars; using original")
            else:
                print("  ! Retry failed; using original")
        if seo_desc:
            frontmatter['seo-description'] = seo_desc
            updated = True
            print(f"  + Added SEO description ({len(seo_desc)} chars)")

    if needs_seo_keywords:
        seo_keywords = _as_str_list(llm_response.get('seo_keywords'))
        if seo_keywords:
            frontmatter['seo-keywords'] = seo_keywords
            updated = True
            print(f"  + Added {len(seo_keywords)} SEO keywords")

    if updated:
        _write_note(file_path, frontmatter, body)
    else:
        print("  - No updates needed")


def main():
    """Run the tagging pass over every markdown note under NOTES_FOLDER.

    Loads the taxonomy that sits next to this script, processes each ``*.md``
    file found recursively (a failure in one note is reported and does not
    stop the run), then offers to append newly suggested tags to the
    taxonomy. Exits with status 1 when the taxonomy file or notes folder is
    unusable, and with status 0 when there are no notes to process.
    """
    taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
    if not taxonomy_path.exists():
        print(f"Error: Taxonomy file not found at {taxonomy_path}")
        print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
        sys.exit(1)

    taxonomy = load_taxonomy(taxonomy_path)
    print(f"Loaded {len(taxonomy)} tags from taxonomy\n")

    target_path = Path(NOTES_FOLDER)
    if not target_path.is_dir():
        # Distinguish "missing" from "exists but is something else".
        if target_path.exists():
            print(f"Error: {target_path} is not a directory")
        else:
            print(f"Error: Notes folder not found: {target_path}")
        sys.exit(1)

    md_files = sorted(target_path.rglob('*.md'))
    if not md_files:
        print(f"No markdown files found in {target_path}")
        sys.exit(0)

    print(f"Processing all markdown files under: {target_path}")
    print(f"Found {len(md_files)} markdown files\n")

    new_tag_accumulator = set()
    for md_file in md_files:
        try:
            process_note(md_file, taxonomy, new_tag_accumulator)
        except Exception as e:
            # One broken note must not abort the whole batch.
            print(f"  ✗ Error processing {md_file}: {e}")
        print()

    print("\n✓ Processing complete!")

    fresh = sorted(t for t in new_tag_accumulator if t not in taxonomy)
    if not fresh:
        return

    print(f"\nNew tags suggested during this run: {', '.join(fresh)}")
    try:
        answer = input("Add these to the taxonomy? [y/N]: ").strip().lower()
    except EOFError:
        # No interactive stdin: fall back to the default answer (no).
        answer = ''

    if answer != 'y':
        print("Skipped taxonomy update.")
        return

    append_tags_to_taxonomy(taxonomy_path, fresh)
    print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")


if __name__ == "__main__":
    main()