diff --git a/.tag-notes.py.swp b/.tag-notes.py.swp deleted file mode 100644 index 840f118..0000000 Binary files a/.tag-notes.py.swp and /dev/null differ diff --git a/tag-notes.py b/tag-notes.py index f12a232..d54d009 100755 --- a/tag-notes.py +++ b/tag-notes.py @@ -1,53 +1,65 @@ #!/usr/bin/env python3 """ Note Tagging and SEO Metadata Script -Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata +Processes markdown notes using the Anthropic API to add tags, slugs, and SEO metadata. """ -import os -import sys import io import json +import os import re +import sys from pathlib import Path -import requests +import anthropic import yaml from ruamel.yaml import YAML +# --------------------------------------------------------------------------- # Configuration -LM_STUDIO_URL = "http://localhost:1234/v1/chat/completions" -MODEL_NAME = "openai/gpt-oss-20b" +# --------------------------------------------------------------------------- TAXONOMY_FILE = "tag-taxonomy.yaml" NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/") CONTENT_CHAR_LIMIT = 20000 SEO_DESC_MIN = 150 SEO_DESC_MAX = 160 +MODEL = "claude-sonnet-4-6" # Round-trip YAML preserves existing frontmatter formatting yaml_rt = YAML() yaml_rt.preserve_quotes = True yaml_rt.width = 4096 +# Anthropic client — reads ANTHROPIC_API_KEY from env automatically +client = anthropic.Anthropic() -def load_taxonomy(taxonomy_path): - with open(taxonomy_path, 'r') as f: + +# --------------------------------------------------------------------------- +# Taxonomy helpers +# --------------------------------------------------------------------------- + +def load_taxonomy(taxonomy_path: Path) -> list[str]: + with open(taxonomy_path) as f: data = yaml.safe_load(f) or {} - return data.get('tags', []) or [] + return data.get("tags", []) or [] -def append_tags_to_taxonomy(taxonomy_path, new_tags): - with open(taxonomy_path, 'r') as f: +def append_tags_to_taxonomy(taxonomy_path: Path, new_tags: set[str]) -> None: + with open(taxonomy_path) as f: data = yaml.safe_load(f) or {} - existing = data.get('tags', []) or [] + existing = data.get("tags", []) or [] combined = list(dict.fromkeys(existing + list(new_tags))) - data['tags'] = combined - with open(taxonomy_path, 'w') as f: + data["tags"] = combined + with open(taxonomy_path, "w") as f: yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True) -def extract_frontmatter(content): - pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$' +# --------------------------------------------------------------------------- +# Frontmatter helpers +# --------------------------------------------------------------------------- + +def extract_frontmatter(content: str): + pattern = r"^---\s*\n(.*?)\n---\s*\n(.*)$" match = re.match(pattern, content, re.DOTALL) if not match: return None, content @@ -55,74 +67,65 @@ def extract_frontmatter(content): return frontmatter, match.group(2) -def reconstruct_markdown(frontmatter, body): +def reconstruct_markdown(frontmatter, body: str) -> str: stream = io.StringIO() yaml_rt.dump(frontmatter, stream) fm_str = stream.getvalue() - if not fm_str.endswith('\n'): - fm_str += '\n' + if not fm_str.endswith("\n"): + fm_str += "\n" return f"---\n{fm_str}---\n{body}" -def slugify(text): +def slugify(text: str) -> str: text = text.lower() - text = re.sub(r"[’'`]", '', text) - text = re.sub(r'[^\w\s-]', ' ', text) - text = re.sub(r'[-\s]+', '-', text).strip('-') + text = re.sub(r"[''`]", "", text) + text = re.sub(r"[^\w\s-]", " ", text) + text = re.sub(r"[-\s]+", "-", text).strip("-") return text -def parse_json_response(content): +# --------------------------------------------------------------------------- +# LLM helpers +# --------------------------------------------------------------------------- + +def parse_json_response(content: str | None) -> dict | None: if content is None: return None try: return json.loads(content) except json.JSONDecodeError: pass - start = content.find('{') - end = content.rfind('}') + start = content.find("{") + end = content.rfind("}") if start != -1 and end > start: try: - return json.loads(content[start:end + 1]) + return json.loads(content[start : end + 1]) except json.JSONDecodeError: pass return None -def call_llm_json(system_prompt, user_prompt, max_tokens=4000): - payload = { - "model": MODEL_NAME, - "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, - ], - "temperature": 0.2, - "max_tokens": max_tokens, - "response_format": {"type": "text"}, - } +def call_llm_json(system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> dict | None: try: - response = requests.post(LM_STUDIO_URL, json=payload, timeout=600) - if not response.ok: - print(f" ! LLM error: {response.status_code} {response.reason}") - print(f" body: {response.text[:500]}") - return None - result = response.json() - choice = result['choices'][0] - content = choice['message'].get('content') or '' + message = client.messages.create( + model=MODEL, + max_tokens=max_tokens, + system=system_prompt, + messages=[{"role": "user", "content": user_prompt}], + temperature=0.2, + ) + content = message.content[0].text if message.content else "" parsed = parse_json_response(content) if parsed is None: - finish = choice.get('finish_reason') - print(f" ! LLM returned no parseable JSON (finish_reason={finish})") - if finish == 'length': - print(" Hit max_tokens — reasoning model burned the budget before emitting output.") + print(f" ! LLM returned no parseable JSON (stop_reason={message.stop_reason})") print(f" content: {content[:500]!r}") return parsed - except Exception as e: - print(f" ! LLM error: {e}") + except anthropic.APIError as e: + print(f" ! Anthropic API error: {e}") return None -def request_metadata(title, note_content, taxonomy): +def request_metadata(title: str, note_content: str, taxonomy: list[str]) -> dict | None: taxonomy_str = ", ".join(taxonomy) system_prompt = f"""You analyze markdown notes and return structured metadata. @@ -136,9 +139,8 @@ Return ONLY valid JSON in this exact shape: }} Rules: -- Tags should describe what the note is substantively about, not topics it merely mentions in passing. -- tags_from_taxonomy: 0-5 tags drawn from the existing taxonomy, ONLY when they genuinely fit. Do NOT force a taxonomy tag — return an empty list if nothing truly applies. -- new_tag_suggestions: 0-5 NEW tags when the taxonomy doesn't adequately cover the content. Each must be a reusable category (not hyper-specific to one note). Use lowercase-hyphenated style (e.g., personal-essay). +- tags_from_taxonomy: 1-5 tags drawn from the existing taxonomy that best fit the content. +- new_tag_suggestions: 0-2 NEW tags, only when content truly warrants it (be conservative). - seo_title_suffix: a short, clean, non-clickbaity descriptor of the note. Do NOT include the note title or a leading colon — only the text that would follow ": ". Aim for 4-10 words. - seo_description: a clean factual summary, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding. - seo_keywords: 10-15 relevant keywords, no duplicates. @@ -154,13 +156,14 @@ Produce the JSON described in the system prompt.""" return call_llm_json(system_prompt, user_prompt) -def request_description_retry(title, note_content, previous_desc): +def request_description_retry(title: str, note_content: str, previous_desc: str) -> str: system_prompt = f"""You rewrite SEO descriptions to a strict length. Return ONLY valid JSON of the form: {{"seo_description": "..."}} The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding.""" + user_prompt = f"""Note title: {title} Note content: @@ -172,46 +175,49 @@ Your previous description was {len(previous_desc)} characters, outside the allow Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters.""" result = call_llm_json(system_prompt, user_prompt) if result: - return (result.get('seo_description') or '').strip() - return '' + return (result.get("seo_description") or "").strip() + return "" -def process_note(file_path, taxonomy, new_tag_accumulator): +# --------------------------------------------------------------------------- +# Note processing +# --------------------------------------------------------------------------- + +def process_note(file_path: Path, taxonomy: list[str], new_tag_accumulator: set) -> None: print(f"Processing: {file_path}") - with open(file_path, 'r', encoding='utf-8') as f: - content = f.read() + content = file_path.read_text(encoding="utf-8") frontmatter, body = extract_frontmatter(content) if frontmatter is None: print(" ⚠️ No frontmatter found, skipping") return - existing_tags = frontmatter.get('tags', []) or [] + existing_tags = frontmatter.get("tags", []) or [] if existing_tags == [None]: existing_tags = [] needs_tags = not existing_tags - needs_slug = not frontmatter.get('slug') - needs_seo_title = not frontmatter.get('seo-title') - needs_seo_desc = not frontmatter.get('seo-description') - needs_seo_keywords = not frontmatter.get('seo-keywords') + needs_slug = not frontmatter.get("slug") + needs_seo_title = not frontmatter.get("seo-title") + needs_seo_desc = not frontmatter.get("seo-description") + needs_seo_keywords = not frontmatter.get("seo-keywords") - if not (needs_tags or needs_slug or needs_seo_title or needs_seo_desc or needs_seo_keywords): + if not any([needs_tags, needs_slug, needs_seo_title, needs_seo_desc, needs_seo_keywords]): print(" ✓ All fields already populated, skipping") return - title = frontmatter.get('title') or Path(file_path).stem + title = frontmatter.get("title") or file_path.stem updated = False if needs_slug: - slug = slugify(Path(file_path).stem) - frontmatter['slug'] = slug + slug = slugify(file_path.stem) + frontmatter["slug"] = slug print(f" + Added slug: {slug}") updated = True - if not (needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords): - with open(file_path, 'w', encoding='utf-8') as f: - f.write(reconstruct_markdown(frontmatter, body)) + # If only slug was needed, skip the LLM call + if not any([needs_tags, needs_seo_title, needs_seo_desc, needs_seo_keywords]): + file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8") print(" ✓ Updated successfully") return @@ -221,11 +227,11 @@ def process_note(file_path, taxonomy, new_tag_accumulator): return if needs_tags: - taxonomy_tags = llm_response.get('tags_from_taxonomy') or [] - new_suggestions = llm_response.get('new_tag_suggestions') or [] - combined = list(dict.fromkeys(list(taxonomy_tags) + list(new_suggestions)))[:8] + taxonomy_tags = llm_response.get("tags_from_taxonomy") or [] + new_suggestions = llm_response.get("new_tag_suggestions") or [] + combined = list(dict.fromkeys(list(taxonomy_tags) + list(new_suggestions)))[:5] if combined: - frontmatter['tags'] = combined + frontmatter["tags"] = combined updated = True print(f" + Added tags: {', '.join(combined)}") genuinely_new = [t for t in combined if t not in taxonomy and t in new_suggestions] @@ -234,19 +240,17 @@ def process_note(file_path, taxonomy, new_tag_accumulator): new_tag_accumulator.update(genuinely_new) if needs_seo_title: - suffix = (llm_response.get('seo_title_suffix') or '').strip() - suffix = suffix.lstrip(':').strip() - # Strip a leading repeat of the title if the LLM included it anyway + suffix = (llm_response.get("seo_title_suffix") or "").strip().lstrip(":").strip() if suffix.lower().startswith(title.lower()): - suffix = suffix[len(title):].lstrip(':').strip() + suffix = suffix[len(title):].lstrip(":").strip() if suffix: seo_title = f"{title}: {suffix}" - frontmatter['seo-title'] = seo_title + frontmatter["seo-title"] = seo_title updated = True print(f" + Added SEO title: {seo_title}") if needs_seo_desc: - seo_desc = (llm_response.get('seo_description') or '').strip() + seo_desc = (llm_response.get("seo_description") or "").strip() if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX): print(f" ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking") retry = request_description_retry(title, body[:CONTENT_CHAR_LIMIT], seo_desc) @@ -257,30 +261,32 @@ def process_note(file_path, taxonomy, new_tag_accumulator): else: print(" ! Retry failed; using original") if seo_desc: - frontmatter['seo-description'] = seo_desc + frontmatter["seo-description"] = seo_desc updated = True print(f" + Added SEO description ({len(seo_desc)} chars)") if needs_seo_keywords: - seo_keywords = list(dict.fromkeys(llm_response.get('seo_keywords') or [])) + seo_keywords = list(dict.fromkeys(llm_response.get("seo_keywords") or [])) if seo_keywords: - frontmatter['seo-keywords'] = seo_keywords + frontmatter["seo-keywords"] = seo_keywords updated = True print(f" + Added {len(seo_keywords)} SEO keywords") if updated: - with open(file_path, 'w', encoding='utf-8') as f: - f.write(reconstruct_markdown(frontmatter, body)) + file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8") print(" ✓ Updated successfully") else: print(" - No updates needed") -def main(): +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main() -> None: taxonomy_path = Path(__file__).parent / TAXONOMY_FILE if not taxonomy_path.exists(): print(f"Error: Taxonomy file not found at {taxonomy_path}") - print(f"Please create {TAXONOMY_FILE} in the same directory as this script") sys.exit(1) taxonomy = load_taxonomy(taxonomy_path) @@ -290,11 +296,8 @@ def main(): if not target_path.exists(): print(f"Error: Notes folder not found: {target_path}") sys.exit(1) - if not target_path.is_dir(): - print(f"Error: {target_path} is not a directory") - sys.exit(1) - md_files = sorted(target_path.rglob('*.md')) + md_files = sorted(target_path.rglob("*.md")) if not md_files: print(f"No markdown files found in {target_path}") sys.exit(0) @@ -302,7 +305,7 @@ def main(): print(f"Processing all markdown files under: {target_path}") print(f"Found {len(md_files)} markdown files\n") - new_tag_accumulator = set() + new_tag_accumulator: set[str] = set() for md_file in md_files: try: process_note(md_file, taxonomy, new_tag_accumulator) @@ -313,17 +316,27 @@ def main(): print("\n✓ Processing complete!") fresh = sorted(t for t in new_tag_accumulator if t not in taxonomy) - if fresh: - print(f"\nNew tags suggested during this run: {', '.join(fresh)}") - try: - answer = input("Add these to the taxonomy? [y/N]: ").strip().lower() - except EOFError: - answer = '' - if answer == 'y': - append_tags_to_taxonomy(taxonomy_path, fresh) - print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}") - else: - print("Skipped taxonomy update.") + if not fresh: + return + + print(f"\nNew tags suggested during this run: {', '.join(fresh)}") + + # Non-interactive (CI): log and skip + if not sys.stdin.isatty(): + print("Non-interactive environment detected — skipping taxonomy update.") + print(f"To add these manually, run the script locally and answer 'y' when prompted.") + return + + try: + answer = input("Add these to the taxonomy? [y/N]: ").strip().lower() + except EOFError: + answer = "" + + if answer == "y": + append_tags_to_taxonomy(taxonomy_path, fresh) + print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}") + else: + print("Skipped taxonomy update.") if __name__ == "__main__": diff --git a/tag-taxonomy.yaml b/tag-taxonomy.yaml index 6c8ce35..5542f64 100644 --- a/tag-taxonomy.yaml +++ b/tag-taxonomy.yaml @@ -1,59 +1,51 @@ -# Tag Taxonomy for Note Tagging -# Add new tags here as the LLM suggests good ones - tags: - # Technology & Development - - self-hosting - - linux - - automation - - ai-tools - - web-development - - infrastructure - - docker - - security - - privacy - - # Work & Management - - project-management - - business-analysis - - leadership - - agile - - team-dynamics - - process-improvement - - governance - - # Knowledge & Learning - - knowledge-management - - zettelkasten - - note-taking - - learning - - productivity - - # Philosophy & Spirituality - - buddhism - - eastern-philosophy - - meditation - - mindfulness - - # Literature & Writing - - literature - - postmodernism - - writing - - # Personal Interests - - plants - - aroids - - gardening - - # Personal Narrative & Life - - memoir - - personal-essay - - reflection - - family - - parenting - - recovery - - mental-health - - aging - - relationships - - childhood - - identity \ No newline at end of file +- self-hosting +- linux +- automation +- ai-tools +- web-development +- infrastructure +- docker +- security +- privacy +- project-management +- business-analysis +- leadership +- agile +- team-dynamics +- process-improvement +- governance +- knowledge-management +- zettelkasten +- note-taking +- learning +- productivity +- buddhism +- eastern-philosophy +- meditation +- mindfulness +- literature +- postmodernism +- writing +- plants +- aroids +- gardening +- memoir +- personal-essay +- reflection +- family +- parenting +- recovery +- mental-health +- aging +- relationships +- childhood +- identity +- morning-routine +- sleep-habits +- baking +- cooking-techniques +- fermentation +- food-baking +- kitchen-hacks +- sourdough