#!/usr/bin/env python3
"""
Note Tagging and SEO Metadata Script

Processes markdown notes using the Anthropic API to add tags, slugs, and SEO
metadata.
"""

import io
import json
import os
import re
import sys
from collections.abc import Iterable, MutableMapping
from pathlib import Path

import anthropic
import yaml
from ruamel.yaml import YAML

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

TAXONOMY_FILE = "tag-taxonomy.yaml"
NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")
CONTENT_CHAR_LIMIT = 20000
SEO_DESC_MIN = 150
SEO_DESC_MAX = 160
MODEL = "claude-sonnet-4-6"

# Round-trip YAML preserves existing frontmatter formatting
yaml_rt = YAML()
yaml_rt.preserve_quotes = True
yaml_rt.width = 4096

# Anthropic client — reads ANTHROPIC_API_KEY from env automatically
client = anthropic.Anthropic()

# Delimiter lines may carry trailing spaces/tabs only.  Using ``\s*`` here
# would also swallow the blank lines that follow the closing ``---`` and
# silently delete them from the body when the note is written back.
_FRONTMATTER_RE = re.compile(
    r"\A---[ \t]*\n(.*?)\n---[ \t]*(?:\n|\Z)(.*)\Z",
    re.DOTALL,
)


# ---------------------------------------------------------------------------
# Taxonomy helpers
# ---------------------------------------------------------------------------

def load_taxonomy(taxonomy_path: Path) -> list[str]:
    """Return the list stored under the ``tags`` key of the taxonomy file.

    An empty file, a missing ``tags`` key and an empty ``tags`` value all
    yield ``[]``.
    """
    with open(taxonomy_path, encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    return data.get("tags", []) or []


def append_tags_to_taxonomy(taxonomy_path: Path, new_tags: Iterable[str]) -> None:
    """Append *new_tags* to the ``tags`` list of the taxonomy file.

    Existing tags keep their position; tags already present are not
    duplicated.  The file is re-read first and then rewritten in full.
    """
    with open(taxonomy_path, encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    existing = data.get("tags", []) or []
    # dict.fromkeys de-duplicates while keeping first-seen order.
    data["tags"] = list(dict.fromkeys(existing + list(new_tags)))
    # The dump allows non-ASCII characters, so the encoding must be explicit
    # rather than whatever the platform locale happens to be.
    with open(taxonomy_path, "w", encoding="utf-8") as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)


# ---------------------------------------------------------------------------
# Frontmatter helpers
# ---------------------------------------------------------------------------

def extract_frontmatter(content: str) -> tuple[MutableMapping | None, str]:
    """Split *content* into ``(frontmatter, body)``.

    Returns ``(None, content)`` when the text does not start with a
    ``---``-delimited block, or when that block does not parse to a YAML
    mapping (for example a note that merely opens with a horizontal rule).
    YAML syntax errors raised by the parser propagate to the caller.
    """
    match = _FRONTMATTER_RE.match(content)
    if not match:
        return None, content
    frontmatter = yaml_rt.load(match.group(1))
    if not isinstance(frontmatter, MutableMapping):
        return None, content
    return frontmatter, match.group(2)


def reconstruct_markdown(frontmatter, body: str) -> str:
    """Serialize *frontmatter* with the round-trip dumper and re-attach *body*."""
    stream = io.StringIO()
    yaml_rt.dump(frontmatter, stream)
    fm_str = stream.getvalue()
    if not fm_str.endswith("\n"):
        fm_str += "\n"
    return f"---\n{fm_str}---\n{body}"


def slugify(text: str) -> str:
    """Return a lowercase, hyphen-separated slug for *text*.

    Apostrophes and backticks are dropped (so "don't" becomes "dont"), every
    other character that is not a word character, whitespace or hyphen acts
    as a separator, and runs of separators collapse to a single hyphen.
    """
    text = text.lower()
    # Typographic apostrophes are written as escapes so they cannot be
    # normalised into plain quotes by an editor or a copy/paste.
    text = re.sub(r"['\u2018\u2019`]", "", text)
    text = re.sub(r"[^\w\s-]", " ", text)
    return re.sub(r"[-\s]+", "-", text).strip("-")


# ---------------------------------------------------------------------------
# LLM helpers
# ---------------------------------------------------------------------------

def _clean_str(value: object) -> str:
    """Return *value* stripped when it is a string, otherwise ``""``."""
    return value.strip() if isinstance(value, str) else ""


def _clean_str_list(value: object) -> list[str]:
    """Return the non-empty strings in *value*, stripped and de-duplicated.

    Anything that is not a list yields ``[]``; in particular a bare string is
    not exploded into single characters.  Non-string items are dropped.
    """
    if not isinstance(value, list):
        return []
    cleaned = (_clean_str(item) for item in value)
    return list(dict.fromkeys(item for item in cleaned if item))


def parse_json_response(content: str | None) -> dict | None:
    """Parse *content* as a JSON object.

    The whole string is tried first; if that does not produce an object, the
    span from the first ``{`` to the last ``}`` is tried.  Returns ``None``
    when neither attempt yields a JSON object.
    """
    if not content:
        return None

    candidates = [content]
    start = content.find("{")
    end = content.rfind("}")
    if start != -1 and end > start:
        candidates.append(content[start : end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers use ``.get``; a top-level list or scalar is not usable.
        if isinstance(parsed, dict):
            return parsed
    return None


def call_llm_json(system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> dict | None:
    """Send one user message to the model and return the parsed JSON object.

    Returns ``None`` (after printing a diagnostic) when the API call raises
    ``anthropic.APIError`` or when the reply contains no parseable JSON
    object.
    """
    try:
        message = client.messages.create(
            model=MODEL,
            max_tokens=max_tokens,
            system=system_prompt,
            messages=[{"role": "user", "content": user_prompt}],
            temperature=0.2,
        )
    except anthropic.APIError as e:
        print(f" ! Anthropic API error: {e}")
        return None

    # Only blocks that expose ``text`` contribute; this avoids an
    # AttributeError when the first block is not a text block.
    content = "".join(
        getattr(block, "text", None) or "" for block in (message.content or [])
    )
    parsed = parse_json_response(content)
    if parsed is None:
        print(f" ! LLM returned no parseable JSON (stop_reason={message.stop_reason})")
        print(f" content: {content[:500]!r}")
    return parsed


def request_metadata(title: str, note_content: str, taxonomy: list[str]) -> dict | None:
    """Ask the model for tags and SEO metadata for one note.

    Returns the parsed JSON object, or ``None`` when ``call_llm_json`` fails.
    """
    # Entries are stringified so a taxonomy value YAML parsed as a non-string
    # (e.g. a bare number) cannot break the join.
    taxonomy_str = ", ".join(map(str, taxonomy))
    system_prompt = f"""You analyze markdown notes and return structured metadata.

Return ONLY valid JSON in this exact shape:
{{
  "tags_from_taxonomy": ["tag1", "tag2"],
  "new_tag_suggestions": ["newtag1"],
  "seo_title_suffix": "Short descriptor that will follow the note title",
  "seo_description": "Factual summary between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters.",
  "seo_keywords": ["keyword1", "keyword2"]
}}

Rules:
- tags_from_taxonomy: 1-5 tags drawn from the existing taxonomy that best fit the content.
- new_tag_suggestions: 0-2 NEW tags, only when content truly warrants it (be conservative).
- seo_title_suffix: a short, clean, non-clickbaity descriptor of the note. Do NOT include the note title or a leading colon — only the text that would follow ": ". Aim for 4-10 words.
- seo_description: a clean factual summary, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding.
- seo_keywords: 10-15 relevant keywords, no duplicates.

Existing tag taxonomy: {taxonomy_str}"""
    user_prompt = f"""Note title: {title}

Note content:
{note_content}

Produce the JSON described in the system prompt."""
    return call_llm_json(system_prompt, user_prompt)


def request_description_retry(title: str, note_content: str, previous_desc: str) -> str:
    """Ask the model to rewrite *previous_desc* to the allowed length.

    Returns the stripped rewrite, or ``""`` when the call fails or the reply
    has no usable ``seo_description``.  The length is not checked here.
    """
    system_prompt = f"""You rewrite SEO descriptions to a strict length.

Return ONLY valid JSON of the form:
{{"seo_description": "..."}}

The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive.
Count characters carefully before responding."""
    user_prompt = f"""Note title: {title}

Note content:
{note_content}

Your previous description was {len(previous_desc)} characters, outside the allowed {SEO_DESC_MIN}-{SEO_DESC_MAX} range:
"{previous_desc}"

Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters."""
    result = call_llm_json(system_prompt, user_prompt)
    if result:
        return _clean_str(result.get("seo_description"))
    return ""


# ---------------------------------------------------------------------------
# Note processing
# ---------------------------------------------------------------------------

def process_note(file_path: Path, taxonomy: list[str], new_tag_accumulator: set) -> None:
    """Fill in missing ``tags``, ``slug`` and ``seo-*`` frontmatter for one note.

    Fields that already hold a truthy value are left untouched.  The slug is
    derived locally from the file name; the remaining fields come from a
    single model call (plus at most one retry for the description length).
    Tags that were suggested by the model and are not in *taxonomy* are added
    to *new_tag_accumulator*.  The file is rewritten only if something changed.
    """
    print(f"Processing: {file_path}")
    content = file_path.read_text(encoding="utf-8")
    frontmatter, body = extract_frontmatter(content)
    if frontmatter is None:
        print(" ⚠️ No frontmatter found, skipping")
        return

    existing_tags = frontmatter.get("tags", []) or []
    # A ``tags:`` key with a single empty list item loads as [None].
    if existing_tags == [None]:
        existing_tags = []

    needs_tags = not existing_tags
    needs_slug = not frontmatter.get("slug")
    needs_seo_title = not frontmatter.get("seo-title")
    needs_seo_desc = not frontmatter.get("seo-description")
    needs_seo_keywords = not frontmatter.get("seo-keywords")
    needs_llm = any([needs_tags, needs_seo_title, needs_seo_desc, needs_seo_keywords])

    if not (needs_slug or needs_llm):
        print(" ✓ All fields already populated, skipping")
        return

    # YAML may load the title as a non-string scalar; string methods are used below.
    title = str(frontmatter.get("title") or file_path.stem)
    updated = False

    if needs_slug:
        slug = slugify(file_path.stem)
        frontmatter["slug"] = slug
        print(f" + Added slug: {slug}")
        updated = True

    # If only slug was needed, skip the LLM call
    if not needs_llm:
        file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8")
        print(" ✓ Updated successfully")
        return

    llm_response = request_metadata(title, body[:CONTENT_CHAR_LIMIT], taxonomy)
    if not llm_response:
        print(" ✗ Failed to get LLM response")
        # Keep the slug that was already reported as added instead of
        # discarding it together with the failed LLM fields.
        if updated:
            file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8")
            print(" ✓ Saved slug; remaining fields will be retried on the next run")
        return

    if needs_tags:
        taxonomy_tags = _clean_str_list(llm_response.get("tags_from_taxonomy"))
        new_suggestions = _clean_str_list(llm_response.get("new_tag_suggestions"))
        # Taxonomy tags come first, so the cap of five trims suggestions first.
        combined = list(dict.fromkeys(taxonomy_tags + new_suggestions))[:5]
        if combined:
            frontmatter["tags"] = combined
            updated = True
            print(f" + Added tags: {', '.join(combined)}")
            genuinely_new = [
                t for t in combined if t not in taxonomy and t in new_suggestions
            ]
            if genuinely_new:
                print(f" (New suggestions: {', '.join(genuinely_new)})")
                new_tag_accumulator.update(genuinely_new)

    if needs_seo_title:
        suffix = _clean_str(llm_response.get("seo_title_suffix")).lstrip(":").strip()
        # Drop a repeated title prefix.  The comparison is made on a slice of
        # the same length that is then removed, so case folding cannot shift
        # the cut point.
        if suffix[: len(title)].lower() == title.lower():
            suffix = suffix[len(title):].lstrip(":").strip()
        if suffix:
            seo_title = f"{title}: {suffix}"
            frontmatter["seo-title"] = seo_title
            updated = True
            print(f" + Added SEO title: {seo_title}")

    if needs_seo_desc:
        seo_desc = _clean_str(llm_response.get("seo_description"))
        if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX):
            print(f" ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking")
            retry = request_description_retry(title, body[:CONTENT_CHAR_LIMIT], seo_desc)
            if retry and SEO_DESC_MIN <= len(retry) <= SEO_DESC_MAX:
                seo_desc = retry
            elif retry:
                print(f" ! Retry still {len(retry)} chars; using original")
            else:
                print(" ! Retry failed; using original")
        if seo_desc:
            frontmatter["seo-description"] = seo_desc
            updated = True
            print(f" + Added SEO description ({len(seo_desc)} chars)")

    if needs_seo_keywords:
        seo_keywords = _clean_str_list(llm_response.get("seo_keywords"))
        if seo_keywords:
            frontmatter["seo-keywords"] = seo_keywords
            updated = True
            print(f" + Added {len(seo_keywords)} SEO keywords")

    if updated:
        file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8")
        print(" ✓ Updated successfully")
    else:
        print(" - No updates needed")


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Process every ``*.md`` file under ``NOTES_FOLDER``.

    Exits with status 1 when the taxonomy file or the notes folder is
    missing and with status 0 when no markdown files are found.  After the
    run, newly suggested tags are offered for addition to the taxonomy; the
    prompt is skipped when stdin is not a TTY.
    """
    taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
    if not taxonomy_path.exists():
        print(f"Error: Taxonomy file not found at {taxonomy_path}")
        sys.exit(1)

    taxonomy = load_taxonomy(taxonomy_path)
    print(f"Loaded {len(taxonomy)} tags from taxonomy\n")

    target_path = Path(NOTES_FOLDER)
    if not target_path.exists():
        print(f"Error: Notes folder not found: {target_path}")
        sys.exit(1)

    md_files = sorted(target_path.rglob("*.md"))
    if not md_files:
        print(f"No markdown files found in {target_path}")
        sys.exit(0)

    print(f"Processing all markdown files under: {target_path}")
    print(f"Found {len(md_files)} markdown files\n")

    new_tag_accumulator: set[str] = set()
    for md_file in md_files:
        # Top-level boundary: one unreadable or malformed note must not
        # abort the whole batch, so the error is reported and the run goes on.
        try:
            process_note(md_file, taxonomy, new_tag_accumulator)
        except Exception as e:
            print(f" ✗ Error processing {md_file}: {e}")
        print()

    print("\n✓ Processing complete!")

    fresh = sorted(t for t in new_tag_accumulator if t not in taxonomy)
    if not fresh:
        return

    print(f"\nNew tags suggested during this run: {', '.join(fresh)}")

    # Non-interactive (CI): log and skip
    if not sys.stdin.isatty():
        print("Non-interactive environment detected — skipping taxonomy update.")
        print("To add these manually, run the script locally and answer 'y' when prompted.")
        return

    try:
        answer = input("Add these to the taxonomy? [y/N]: ").strip().lower()
    except EOFError:
        answer = ""

    if answer == "y":
        append_tags_to_taxonomy(taxonomy_path, fresh)
        print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")
    else:
        print("Skipped taxonomy update.")


if __name__ == "__main__":
    main()