switch to using Anthropic API

Raise combined tag cap from 5 to 8
Allow zero-taxonomy-tag output and more new tag suggestions
2026-04-25 18:47:29 -05:00 · 2026-04-19 22:27:40 -05:00 · 2026-04-19 22:24:53 -05:00 · 2026-04-19 22:22:27 -05:00
3 changed files with 167 additions and 148 deletions
@@ -1,53 +1,65 @@
 #!/usr/bin/env python3
 """
 Note Tagging and SEO Metadata Script
-Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata
+Processes markdown notes using the Anthropic API to add tags, slugs, and SEO metadata.
 """

-import os
-import sys
 import io
 import json
+import os
 import re
+import sys
 from pathlib import Path

-import requests
+import anthropic
 import yaml
 from ruamel.yaml import YAML

+# ---------------------------------------------------------------------------
 # Configuration
-LM_STUDIO_URL = "http://localhost:1234/v1/chat/completions"
-MODEL_NAME = "openai/gpt-oss-20b"
+# ---------------------------------------------------------------------------
 TAXONOMY_FILE = "tag-taxonomy.yaml"
 NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")
 CONTENT_CHAR_LIMIT = 20000
 SEO_DESC_MIN = 150
 SEO_DESC_MAX = 160
+MODEL = "claude-sonnet-4-6"

 # Round-trip YAML preserves existing frontmatter formatting
 yaml_rt = YAML()
 yaml_rt.preserve_quotes = True
 yaml_rt.width = 4096

+# Anthropic client — reads ANTHROPIC_API_KEY from env automatically
+client = anthropic.Anthropic()

-def load_taxonomy(taxonomy_path):
-    with open(taxonomy_path, 'r') as f:
+
+# ---------------------------------------------------------------------------
+# Taxonomy helpers
+# ---------------------------------------------------------------------------
+
+def load_taxonomy(taxonomy_path: Path) -> list[str]:
+    with open(taxonomy_path) as f:
        data = yaml.safe_load(f) or {}
-    return data.get('tags', []) or []
+    return data.get("tags", []) or []


-def append_tags_to_taxonomy(taxonomy_path, new_tags):
-    with open(taxonomy_path, 'r') as f:
+def append_tags_to_taxonomy(taxonomy_path: Path, new_tags: list[str] | set[str]) -> None:  # main() passes a sorted list
+    with open(taxonomy_path) as f:
        data = yaml.safe_load(f) or {}
-    existing = data.get('tags', []) or []
+    existing = data.get("tags", []) or []
    combined = list(dict.fromkeys(existing + list(new_tags)))
-    data['tags'] = combined
-    with open(taxonomy_path, 'w') as f:
+    data["tags"] = combined
+    with open(taxonomy_path, "w") as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)


-def extract_frontmatter(content):
-    pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$'
+# ---------------------------------------------------------------------------
+# Frontmatter helpers
+# ---------------------------------------------------------------------------
+
+def extract_frontmatter(content: str):
+    pattern = r"^---\s*\n(.*?)\n---\s*\n(.*)$"
    match = re.match(pattern, content, re.DOTALL)
    if not match:
        return None, content
@@ -55,74 +67,65 @@ def extract_frontmatter(content):
    return frontmatter, match.group(2)


-def reconstruct_markdown(frontmatter, body):
+def reconstruct_markdown(frontmatter, body: str) -> str:
    stream = io.StringIO()
    yaml_rt.dump(frontmatter, stream)
    fm_str = stream.getvalue()
-    if not fm_str.endswith('\n'):
-        fm_str += '\n'
+    if not fm_str.endswith("\n"):
+        fm_str += "\n"
    return f"---\n{fm_str}---\n{body}"


-def slugify(text):
+def slugify(text: str) -> str:
    text = text.lower()
-    text = re.sub(r"[’'`]", '', text)
-    text = re.sub(r'[^\w\s-]', ' ', text)
-    text = re.sub(r'[-\s]+', '-', text).strip('-')
+    text = re.sub(r"[’'`]", "", text)  # drop apostrophes, including typographic ’, so "don’t" -> "dont"
+    text = re.sub(r"[^\w\s-]", " ", text)
+    text = re.sub(r"[-\s]+", "-", text).strip("-")
    return text


-def parse_json_response(content):
+# ---------------------------------------------------------------------------
+# LLM helpers
+# ---------------------------------------------------------------------------
+
+def parse_json_response(content: str | None) -> dict | None:
    if content is None:
        return None
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        pass
-    start = content.find('{')
-    end = content.rfind('}')
+    start = content.find("{")
+    end = content.rfind("}")
    if start != -1 and end > start:
        try:
-            return json.loads(content[start:end + 1])
+            return json.loads(content[start : end + 1])
        except json.JSONDecodeError:
            pass
    return None


-def call_llm_json(system_prompt, user_prompt, max_tokens=4000):
-    payload = {
-        "model": MODEL_NAME,
-        "messages": [
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt},
-        ],
-        "temperature": 0.2,
-        "max_tokens": max_tokens,
-        "response_format": {"type": "text"},
-    }
+def call_llm_json(system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> dict | None:
    try:
-        response = requests.post(LM_STUDIO_URL, json=payload, timeout=600)
-        if not response.ok:
-            print(f"  ! LLM error: {response.status_code} {response.reason}")
-            print(f"    body: {response.text[:500]}")
-            return None
-        result = response.json()
-        choice = result['choices'][0]
-        content = choice['message'].get('content') or ''
+        message = client.messages.create(
+            model=MODEL,
+            max_tokens=max_tokens,
+            system=system_prompt,
+            messages=[{"role": "user", "content": user_prompt}],
+            temperature=0.2,
+        )
+        content = message.content[0].text if message.content else ""
        parsed = parse_json_response(content)
        if parsed is None:
-            finish = choice.get('finish_reason')
-            print(f"  ! LLM returned no parseable JSON (finish_reason={finish})")
-            if finish == 'length':
-                print("    Hit max_tokens — reasoning model burned the budget before emitting output.")
+            print(f"  ! LLM returned no parseable JSON (stop_reason={message.stop_reason})")
            print(f"    content: {content[:500]!r}")
        return parsed
-    except Exception as e:
-        print(f"  ! LLM error: {e}")
+    except anthropic.APIError as e:
+        print(f"  ! Anthropic API error: {e}")
        return None


-def request_metadata(title, note_content, taxonomy):
+def request_metadata(title: str, note_content: str, taxonomy: list[str]) -> dict | None:
    taxonomy_str = ", ".join(taxonomy)
    system_prompt = f"""You analyze markdown notes and return structured metadata.

@@ -153,13 +156,14 @@ Produce the JSON described in the system prompt."""
    return call_llm_json(system_prompt, user_prompt)


-def request_description_retry(title, note_content, previous_desc):
+def request_description_retry(title: str, note_content: str, previous_desc: str) -> str:
    system_prompt = f"""You rewrite SEO descriptions to a strict length.

 Return ONLY valid JSON of the form:
 {{"seo_description": "..."}}

 The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding."""
+
    user_prompt = f"""Note title: {title}

 Note content:
@@ -171,46 +175,49 @@ Your previous description was {len(previous_desc)} characters, outside the allow
 Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters."""
    result = call_llm_json(system_prompt, user_prompt)
    if result:
-        return (result.get('seo_description') or '').strip()
-    return ''
+        return (result.get("seo_description") or "").strip()
+    return ""


-def process_note(file_path, taxonomy, new_tag_accumulator):
+# ---------------------------------------------------------------------------
+# Note processing
+# ---------------------------------------------------------------------------
+
+def process_note(file_path: Path, taxonomy: list[str], new_tag_accumulator: set) -> None:
    print(f"Processing: {file_path}")
-    with open(file_path, 'r', encoding='utf-8') as f:
-        content = f.read()
+    content = file_path.read_text(encoding="utf-8")

    frontmatter, body = extract_frontmatter(content)
    if frontmatter is None:
        print("  ⚠️  No frontmatter found, skipping")
        return

-    existing_tags = frontmatter.get('tags', []) or []
+    existing_tags = frontmatter.get("tags", []) or []
    if existing_tags == [None]:
        existing_tags = []

    needs_tags = not existing_tags
-    needs_slug = not frontmatter.get('slug')
-    needs_seo_title = not frontmatter.get('seo-title')
-    needs_seo_desc = not frontmatter.get('seo-description')
-    needs_seo_keywords = not frontmatter.get('seo-keywords')
+    needs_slug = not frontmatter.get("slug")
+    needs_seo_title = not frontmatter.get("seo-title")
+    needs_seo_desc = not frontmatter.get("seo-description")
+    needs_seo_keywords = not frontmatter.get("seo-keywords")

-    if not (needs_tags or needs_slug or needs_seo_title or needs_seo_desc or needs_seo_keywords):
+    if not any([needs_tags, needs_slug, needs_seo_title, needs_seo_desc, needs_seo_keywords]):
        print("  ✓ All fields already populated, skipping")
        return

-    title = frontmatter.get('title') or Path(file_path).stem
+    title = frontmatter.get("title") or file_path.stem
    updated = False

    if needs_slug:
-        slug = slugify(Path(file_path).stem)
-        frontmatter['slug'] = slug
+        slug = slugify(file_path.stem)
+        frontmatter["slug"] = slug
        print(f"  + Added slug: {slug}")
        updated = True

-    if not (needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords):
-        with open(file_path, 'w', encoding='utf-8') as f:
-            f.write(reconstruct_markdown(frontmatter, body))
+    # If only slug was needed, skip the LLM call
+    if not any([needs_tags, needs_seo_title, needs_seo_desc, needs_seo_keywords]):
+        file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8")
        print("  ✓ Updated successfully")
        return

@@ -220,11 +227,11 @@ def process_note(file_path, taxonomy, new_tag_accumulator):
        return

    if needs_tags:
-        taxonomy_tags = llm_response.get('tags_from_taxonomy') or []
-        new_suggestions = llm_response.get('new_tag_suggestions') or []
+        taxonomy_tags = llm_response.get("tags_from_taxonomy") or []
+        new_suggestions = llm_response.get("new_tag_suggestions") or []
        combined = list(dict.fromkeys(list(taxonomy_tags) + list(new_suggestions)))[:5]
        if combined:
-            frontmatter['tags'] = combined
+            frontmatter["tags"] = combined
            updated = True
            print(f"  + Added tags: {', '.join(combined)}")
            genuinely_new = [t for t in combined if t not in taxonomy and t in new_suggestions]
@@ -233,19 +240,17 @@ def process_note(file_path, taxonomy, new_tag_accumulator):
                new_tag_accumulator.update(genuinely_new)

    if needs_seo_title:
-        suffix = (llm_response.get('seo_title_suffix') or '').strip()
-        suffix = suffix.lstrip(':').strip()
-        # Strip a leading repeat of the title if the LLM included it anyway
+        suffix = (llm_response.get("seo_title_suffix") or "").strip().lstrip(":").strip()
        if suffix.lower().startswith(title.lower()):
-            suffix = suffix[len(title):].lstrip(':').strip()
+            suffix = suffix[len(title):].lstrip(":").strip()
        if suffix:
            seo_title = f"{title}: {suffix}"
-            frontmatter['seo-title'] = seo_title
+            frontmatter["seo-title"] = seo_title
            updated = True
            print(f"  + Added SEO title: {seo_title}")

    if needs_seo_desc:
-        seo_desc = (llm_response.get('seo_description') or '').strip()
+        seo_desc = (llm_response.get("seo_description") or "").strip()
        if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX):
            print(f"  ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking")
            retry = request_description_retry(title, body[:CONTENT_CHAR_LIMIT], seo_desc)
@@ -256,30 +261,32 @@ def process_note(file_path, taxonomy, new_tag_accumulator):
            else:
                print("  ! Retry failed; using original")
        if seo_desc:
-            frontmatter['seo-description'] = seo_desc
+            frontmatter["seo-description"] = seo_desc
            updated = True
            print(f"  + Added SEO description ({len(seo_desc)} chars)")

    if needs_seo_keywords:
-        seo_keywords = list(dict.fromkeys(llm_response.get('seo_keywords') or []))
+        seo_keywords = list(dict.fromkeys(llm_response.get("seo_keywords") or []))
        if seo_keywords:
-            frontmatter['seo-keywords'] = seo_keywords
+            frontmatter["seo-keywords"] = seo_keywords
            updated = True
            print(f"  + Added {len(seo_keywords)} SEO keywords")

    if updated:
-        with open(file_path, 'w', encoding='utf-8') as f:
-            f.write(reconstruct_markdown(frontmatter, body))
+        file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8")
        print("  ✓ Updated successfully")
    else:
        print("  - No updates needed")


-def main():
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+def main() -> None:
    taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
    if not taxonomy_path.exists():
        print(f"Error: Taxonomy file not found at {taxonomy_path}")
-        print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
        sys.exit(1)

    taxonomy = load_taxonomy(taxonomy_path)
@@ -289,11 +296,8 @@ def main():
    if not target_path.exists():
        print(f"Error: Notes folder not found: {target_path}")
        sys.exit(1)
-    if not target_path.is_dir():
-        print(f"Error: {target_path} is not a directory")
-        sys.exit(1)

-    md_files = sorted(target_path.rglob('*.md'))
+    md_files = sorted(target_path.rglob("*.md"))
    if not md_files:
        print(f"No markdown files found in {target_path}")
        sys.exit(0)
@@ -301,7 +305,7 @@ def main():
    print(f"Processing all markdown files under: {target_path}")
    print(f"Found {len(md_files)} markdown files\n")

-    new_tag_accumulator = set()
+    new_tag_accumulator: set[str] = set()
    for md_file in md_files:
        try:
            process_note(md_file, taxonomy, new_tag_accumulator)
@@ -312,17 +316,27 @@ def main():
    print("\n✓ Processing complete!")

    fresh = sorted(t for t in new_tag_accumulator if t not in taxonomy)
-    if fresh:
-        print(f"\nNew tags suggested during this run: {', '.join(fresh)}")
-        try:
-            answer = input("Add these to the taxonomy? [y/N]: ").strip().lower()
-        except EOFError:
-            answer = ''
-        if answer == 'y':
-            append_tags_to_taxonomy(taxonomy_path, fresh)
-            print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")
-        else:
-            print("Skipped taxonomy update.")
+    if not fresh:
+        return
+
+    print(f"\nNew tags suggested during this run: {', '.join(fresh)}")
+
+    # Non-interactive (CI): log and skip
+    if not sys.stdin.isatty():
+        print("Non-interactive environment detected — skipping taxonomy update.")
+        print(f"To add these manually, run the script locally and answer 'y' when prompted.")
+        return
+
+    try:
+        answer = input("Add these to the taxonomy? [y/N]: ").strip().lower()
+    except EOFError:
+        answer = ""
+
+    if answer == "y":
+        append_tags_to_taxonomy(taxonomy_path, fresh)
+        print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")
+    else:
+        print("Skipped taxonomy update.")


 if __name__ == "__main__":
@@ -1,46 +1,51 @@
-# Tag Taxonomy for Note Tagging
-# Add new tags here as the LLM suggests good ones
-
 tags:
-  # Technology & Development
-  - self-hosting
-  - linux
-  - automation
-  - ai-tools
-  - web-development
-  - infrastructure
-  - docker
-  - security
-  - privacy
-  
-  # Work & Management
-  - project-management
-  - business-analysis
-  - leadership
-  - agile
-  - team-dynamics
-  - process-improvement
-  - governance
-  
-  # Knowledge & Learning
-  - knowledge-management
-  - zettelkasten
-  - note-taking
-  - learning
-  - productivity
-  
-  # Philosophy & Spirituality
-  - buddhism
-  - eastern-philosophy
-  - meditation
-  - mindfulness
-  
-  # Literature & Writing
-  - literature
-  - postmodernism
-  - writing
-  
-  # Personal Interests
-  - plants
-  - aroids
-  - gardening
+- self-hosting
+- linux
+- automation
+- ai-tools
+- web-development
+- infrastructure
+- docker
+- security
+- privacy
+- project-management
+- business-analysis
+- leadership
+- agile
+- team-dynamics
+- process-improvement
+- governance
+- knowledge-management
+- zettelkasten
+- note-taking
+- learning
+- productivity
+- buddhism
+- eastern-philosophy
+- meditation
+- mindfulness
+- literature
+- postmodernism
+- writing
+- plants
+- aroids
+- gardening
+- memoir
+- personal-essay
+- reflection
+- family
+- parenting
+- recovery
+- mental-health
+- aging
+- relationships
+- childhood
+- identity
+- morning-routine
+- sleep-habits
+- baking
+- cooking-techniques
+- fermentation
+- food-baking
+- kitchen-hacks
+- sourdough
Author	SHA1	Message	Date
ejlewis	e792732a13	switch to using Anthropic API	2026-04-25 18:47:29 -05:00
ejlewis	fd1407e06f	Raise combined tag cap from 5 to 8	2026-04-19 22:27:40 -05:00
ejlewis	68d78fe6bd	Allow zero-taxonomy-tag output and more new tag suggestions	2026-04-19 22:24:53 -05:00
ejlewis	b4fb1283b9	Add personal-narrative cluster to tag taxonomy	2026-04-19 22:22:27 -05:00