From e792732a1367cd09628317607c308e32873849be Mon Sep 17 00:00:00 2001 From: Ethan J Lewis Date: Sat, 25 Apr 2026 18:47:29 -0500 Subject: [PATCH] switch to using Anthropic API --- .tag-notes.py.swp | Bin 16384 -> 0 bytes tag-notes.py | 227 ++++++++++++++++++++++++---------------------- tag-taxonomy.yaml | 108 ++++++++++------------ 3 files changed, 170 insertions(+), 165 deletions(-) delete mode 100644 .tag-notes.py.swp diff --git a/.tag-notes.py.swp b/.tag-notes.py.swp deleted file mode 100644 index 840f118ce38605b36c9b5593a885019e09e4c2a4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHNO^h5z6>gIdlh}zZtZ)EDD))kB#>~&o+8e}5OIYl+Vr%vX?>dgs$`&Kpk$zApk$zApk$zApk$zA;D3^VD15;9JZAnrJ$p}oAK3T( z5B(YHa)00Z2lVf{{{Gdz_xfk~RWeXAP%=<5P%=<5P%=<5P%=<5P%=<5P%=<5@LyoS z?&B^HPe%$}3@&6x>7{(po$3O_Uz&c<9&j9Cv8K4O~4jcj=0si)wVeA5L09!yE z_&D$|@asnn<2%5szzrY-7J&}|w+|b}cYzmx=Kvp=2P(i{4jINR;D^BLz}El|m*f4$rJO|7HM}XTOFpO^iBVY*30iOnr0Y`y%4r0B)*MVn&OQhgZCT87l9Xn2)G2y0SAF!JZKm{1-=cu3VaE83D^Nn z0!M&%kwCcv>;f+UTflkX2(XLf%(s9YAOb!MJPkB}2Z29Bb|k-d)1=LP^oZ$?;884m zwix)_NXtUu_)&FW&I=I;*0WvL@rSI_S(stle8;S*ihQfb!-&nRFDLMsEg7H5`p6#i z0M}*UH{HPQ%W5z1Bko5@H#BBemwxECY?e_I%d?~2s5ult99G+T6K7C(osXI~L3_cQ zlG~y;s*1|BYuD)Anmkcm+isqCYHg>6XOvdz>4h zJZqM(A9Z3kx>D(l0;k7iN5ZzW#KcO<_BdTZ2GYGP0vs-lLA{=0?nAg#YgZGOBhKtd-S^XU;8I7cX?` zN?^00Jti%vGvYLequD1XlhGnEPUNU2x06lL^z1Dw3f8$V%^5a5IhntSxEFGP*$9qT zlg&@mvv*hXM~2U~DFT-&nX%-;tTR({haxJmLI@^|@=^CTl4FUn_BKn((uVnPrWO#* z_DVJ%(}RY%C|X!LGvBcm=N9IZCYld$w+%yPEAvZiosTyI(U-71pLIDKa-T#0C9Ae4 z8|~@rea$pY-K5WDPdH&jb|^P83wVh~8ZDag_OGd2Sm|~0bml*!Z zC0XFRV+Ko2Ra$UtC1bW}3t{__WKP7Ym?mCB1RJVrn#V<0OS>_h^pRMMy%qR@2WuMH z5gVXg7lRoWxW~ON7c+*jO@AA+^gug?;=LeP&^rKIbH444Z*p|wNJcHx${_m<2llJx zOiLvp!07G19ogu6XNQPp%#=K^QsIzvP5q-ynxUVk3eP7^vP{*iARx3fPX~Sv_H>tI zf`KwzH8s7Mrb&vTI#I$=&6QMEm8_oT(0*0L=bIMRV97)l&;_M~CdyIVHkD17v`ULu zMcsr|wG>qY%fR;()!g0&z3OOKTI^hA8@B88vwctT$G2#oNt6OC&2Kp}qH_X007=(4 zFw#ft8v=rvZtm4_Qh-#!>nu@l!Ck%q4aqg#IMrmQNte}bt*9d*6&7vy9a+G=#rA37UuednJvhUN8Akuv5PZSf@PvcS%RJDv&;`7(i~aTYFXG+i=w~~q^Zs8f#X-x z7s*th?WbX=th2yYqPXt_Y;O4+9b`j}acPbFjfCu@&td3jX1T6cO}5jzva!@EvE^&6 zs6kyAKH4=Hn2knb&EE!deuNzc`;k(Xq_i9K$2(+xjHoSEz!Ju`=QjJX7gh`J>d=dU zjib{VxWOj68oPYN3{_W?!rCuz*eZ%**hA(LY&y}1#Ub_(-Y@o%awYOcaB>nqV(%(w<@=~2H?82X zWL}$^2~m_%n$Nk;s2|uvUPSC>Eeho+q0%03i%#j{e5g@E4UI#6qI%Igo66r7`lM$2 zDrz?l2ZL%}P*C+7wc&`V$7-mwA+dV$8SnDZbOmZt`5Y;XJhA3i;o#NQ{KC)LM=NT< z6KsL}$wY{SS>im2HaYhh+CY0;$8+dZRBa%z)Tt`AAREwIG;!-?l1~N96Hkox+Ei;Z z*LSZ0&^gnDaaAj1UGPY+T+O+pmspkmn|Bfss+2RYavm9{C_)7jx=B zp6YNZ?$620l}ZmBI7r%#N%q~z`3sm@0Cx|AHu+WvQs~sJ zn$Ne+1ijdU$Ycxu_HNuDxo4LamliHtXU}!!(HR{; z_&a#AYc^@srjh$hRDqUjJFQ_DH3BIcQ^@`%-N0t$#`Lv+ z2R4XMI)H|%wRU0_v=hTtly|5Go;co$rBHvp=l%u@$I&S8PZ|{e{}r+EEyTy{|NX#@D<<^a2og&@C5J& z#Pn|iZvkHgt^*uc1|A0v0DnhZzYDw!ya)_|r+{O?W57F@?;YSRfabmnl%JA;l7W(e zl7W(el7W(el7atw1}G-=Y@{-32I5M~@@7Mn#VU?IGAe9JO*(3RY7m}~y+H@?iY z8%|EM4hr~jeZirEKk7GWP-AR@ea5x@b%yH@Wczi*NO4o1^he@@+D7xV&eV|>YE=;u6G{Kcm@525?$qrg5l9><9{0`VUk|KOhq!_%3|yEO zNTndPS`5=uA}O~{bRqHIrxaLUN1e%3@^z#fQPu#JlnSO%IaH8h(Ij_}7G`Rg7LaJ~ zlSasg0(C4?x_gLe8F^pKx`*2Xgt*(YbDK(rnSd(TBinGcntPHsgsku*@cE-m|2 zooK+F0p-<;Zm=y0Nzf?RXb;N>au> M+NqFLcE@P;Pv$M2mH+?% diff --git a/tag-notes.py b/tag-notes.py index f12a232..d54d009 100755 --- a/tag-notes.py +++ b/tag-notes.py @@ -1,53 +1,65 @@ #!/usr/bin/env python3 """ Note Tagging and SEO Metadata Script -Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata +Processes markdown notes using the Anthropic API to add tags, slugs, and SEO metadata. """ -import os -import sys import io import json +import os import re +import sys from pathlib import Path -import requests +import anthropic import yaml from ruamel.yaml import YAML +# --------------------------------------------------------------------------- # Configuration -LM_STUDIO_URL = "http://localhost:1234/v1/chat/completions" -MODEL_NAME = "openai/gpt-oss-20b" +# --------------------------------------------------------------------------- TAXONOMY_FILE = "tag-taxonomy.yaml" NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/") CONTENT_CHAR_LIMIT = 20000 SEO_DESC_MIN = 150 SEO_DESC_MAX = 160 +MODEL = "claude-sonnet-4-6" # Round-trip YAML preserves existing frontmatter formatting yaml_rt = YAML() yaml_rt.preserve_quotes = True yaml_rt.width = 4096 +# Anthropic client — reads ANTHROPIC_API_KEY from env automatically +client = anthropic.Anthropic() -def load_taxonomy(taxonomy_path): - with open(taxonomy_path, 'r') as f: + +# --------------------------------------------------------------------------- +# Taxonomy helpers +# --------------------------------------------------------------------------- + +def load_taxonomy(taxonomy_path: Path) -> list[str]: + with open(taxonomy_path) as f: data = yaml.safe_load(f) or {} - return data.get('tags', []) or [] + return data.get("tags", []) or [] -def append_tags_to_taxonomy(taxonomy_path, new_tags): - with open(taxonomy_path, 'r') as f: +def append_tags_to_taxonomy(taxonomy_path: Path, new_tags: set[str]) -> None: + with open(taxonomy_path) as f: data = yaml.safe_load(f) or {} - existing = data.get('tags', []) or [] + existing = data.get("tags", []) or [] combined = list(dict.fromkeys(existing + list(new_tags))) - data['tags'] = combined - with open(taxonomy_path, 'w') as f: + data["tags"] = combined + with open(taxonomy_path, "w") as f: yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True) -def extract_frontmatter(content): - pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$' +# --------------------------------------------------------------------------- +# Frontmatter helpers +# --------------------------------------------------------------------------- + +def extract_frontmatter(content: str): + pattern = r"^---\s*\n(.*?)\n---\s*\n(.*)$" match = re.match(pattern, content, re.DOTALL) if not match: return None, content @@ -55,74 +67,65 @@ def extract_frontmatter(content): return frontmatter, match.group(2) -def reconstruct_markdown(frontmatter, body): +def reconstruct_markdown(frontmatter, body: str) -> str: stream = io.StringIO() yaml_rt.dump(frontmatter, stream) fm_str = stream.getvalue() - if not fm_str.endswith('\n'): - fm_str += '\n' + if not fm_str.endswith("\n"): + fm_str += "\n" return f"---\n{fm_str}---\n{body}" -def slugify(text): +def slugify(text: str) -> str: text = text.lower() - text = re.sub(r"[’'`]", '', text) - text = re.sub(r'[^\w\s-]', ' ', text) - text = re.sub(r'[-\s]+', '-', text).strip('-') + text = re.sub(r"[''`]", "", text) + text = re.sub(r"[^\w\s-]", " ", text) + text = re.sub(r"[-\s]+", "-", text).strip("-") return text -def parse_json_response(content): +# --------------------------------------------------------------------------- +# LLM helpers +# --------------------------------------------------------------------------- + +def parse_json_response(content: str | None) -> dict | None: if content is None: return None try: return json.loads(content) except json.JSONDecodeError: pass - start = content.find('{') - end = content.rfind('}') + start = content.find("{") + end = content.rfind("}") if start != -1 and end > start: try: - return json.loads(content[start:end + 1]) + return json.loads(content[start : end + 1]) except json.JSONDecodeError: pass return None -def call_llm_json(system_prompt, user_prompt, max_tokens=4000): - payload = { - "model": MODEL_NAME, - "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, - ], - "temperature": 0.2, - "max_tokens": max_tokens, - "response_format": {"type": "text"}, - } +def call_llm_json(system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> dict | None: try: - response = requests.post(LM_STUDIO_URL, json=payload, timeout=600) - if not response.ok: - print(f" ! LLM error: {response.status_code} {response.reason}") - print(f" body: {response.text[:500]}") - return None - result = response.json() - choice = result['choices'][0] - content = choice['message'].get('content') or '' + message = client.messages.create( + model=MODEL, + max_tokens=max_tokens, + system=system_prompt, + messages=[{"role": "user", "content": user_prompt}], + temperature=0.2, + ) + content = message.content[0].text if message.content else "" parsed = parse_json_response(content) if parsed is None: - finish = choice.get('finish_reason') - print(f" ! LLM returned no parseable JSON (finish_reason={finish})") - if finish == 'length': - print(" Hit max_tokens — reasoning model burned the budget before emitting output.") + print(f" ! LLM returned no parseable JSON (stop_reason={message.stop_reason})") print(f" content: {content[:500]!r}") return parsed - except Exception as e: - print(f" ! LLM error: {e}") + except anthropic.APIError as e: + print(f" ! Anthropic API error: {e}") return None -def request_metadata(title, note_content, taxonomy): +def request_metadata(title: str, note_content: str, taxonomy: list[str]) -> dict | None: taxonomy_str = ", ".join(taxonomy) system_prompt = f"""You analyze markdown notes and return structured metadata. @@ -136,9 +139,8 @@ Return ONLY valid JSON in this exact shape: }} Rules: -- Tags should describe what the note is substantively about, not topics it merely mentions in passing. -- tags_from_taxonomy: 0-5 tags drawn from the existing taxonomy, ONLY when they genuinely fit. Do NOT force a taxonomy tag — return an empty list if nothing truly applies. -- new_tag_suggestions: 0-5 NEW tags when the taxonomy doesn't adequately cover the content. Each must be a reusable category (not hyper-specific to one note). Use lowercase-hyphenated style (e.g., personal-essay). +- tags_from_taxonomy: 1-5 tags drawn from the existing taxonomy that best fit the content. +- new_tag_suggestions: 0-2 NEW tags, only when content truly warrants it (be conservative). - seo_title_suffix: a short, clean, non-clickbaity descriptor of the note. Do NOT include the note title or a leading colon — only the text that would follow ": ". Aim for 4-10 words. - seo_description: a clean factual summary, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding. - seo_keywords: 10-15 relevant keywords, no duplicates. @@ -154,13 +156,14 @@ Produce the JSON described in the system prompt.""" return call_llm_json(system_prompt, user_prompt) -def request_description_retry(title, note_content, previous_desc): +def request_description_retry(title: str, note_content: str, previous_desc: str) -> str: system_prompt = f"""You rewrite SEO descriptions to a strict length. Return ONLY valid JSON of the form: {{"seo_description": "..."}} The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding.""" + user_prompt = f"""Note title: {title} Note content: @@ -172,46 +175,49 @@ Your previous description was {len(previous_desc)} characters, outside the allow Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters.""" result = call_llm_json(system_prompt, user_prompt) if result: - return (result.get('seo_description') or '').strip() - return '' + return (result.get("seo_description") or "").strip() + return "" -def process_note(file_path, taxonomy, new_tag_accumulator): +# --------------------------------------------------------------------------- +# Note processing +# --------------------------------------------------------------------------- + +def process_note(file_path: Path, taxonomy: list[str], new_tag_accumulator: set) -> None: print(f"Processing: {file_path}") - with open(file_path, 'r', encoding='utf-8') as f: - content = f.read() + content = file_path.read_text(encoding="utf-8") frontmatter, body = extract_frontmatter(content) if frontmatter is None: print(" ⚠️ No frontmatter found, skipping") return - existing_tags = frontmatter.get('tags', []) or [] + existing_tags = frontmatter.get("tags", []) or [] if existing_tags == [None]: existing_tags = [] needs_tags = not existing_tags - needs_slug = not frontmatter.get('slug') - needs_seo_title = not frontmatter.get('seo-title') - needs_seo_desc = not frontmatter.get('seo-description') - needs_seo_keywords = not frontmatter.get('seo-keywords') + needs_slug = not frontmatter.get("slug") + needs_seo_title = not frontmatter.get("seo-title") + needs_seo_desc = not frontmatter.get("seo-description") + needs_seo_keywords = not frontmatter.get("seo-keywords") - if not (needs_tags or needs_slug or needs_seo_title or needs_seo_desc or needs_seo_keywords): + if not any([needs_tags, needs_slug, needs_seo_title, needs_seo_desc, needs_seo_keywords]): print(" ✓ All fields already populated, skipping") return - title = frontmatter.get('title') or Path(file_path).stem + title = frontmatter.get("title") or file_path.stem updated = False if needs_slug: - slug = slugify(Path(file_path).stem) - frontmatter['slug'] = slug + slug = slugify(file_path.stem) + frontmatter["slug"] = slug print(f" + Added slug: {slug}") updated = True - if not (needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords): - with open(file_path, 'w', encoding='utf-8') as f: - f.write(reconstruct_markdown(frontmatter, body)) + # If only slug was needed, skip the LLM call + if not any([needs_tags, needs_seo_title, needs_seo_desc, needs_seo_keywords]): + file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8") print(" ✓ Updated successfully") return @@ -221,11 +227,11 @@ def process_note(file_path, taxonomy, new_tag_accumulator): return if needs_tags: - taxonomy_tags = llm_response.get('tags_from_taxonomy') or [] - new_suggestions = llm_response.get('new_tag_suggestions') or [] - combined = list(dict.fromkeys(list(taxonomy_tags) + list(new_suggestions)))[:8] + taxonomy_tags = llm_response.get("tags_from_taxonomy") or [] + new_suggestions = llm_response.get("new_tag_suggestions") or [] + combined = list(dict.fromkeys(list(taxonomy_tags) + list(new_suggestions)))[:5] if combined: - frontmatter['tags'] = combined + frontmatter["tags"] = combined updated = True print(f" + Added tags: {', '.join(combined)}") genuinely_new = [t for t in combined if t not in taxonomy and t in new_suggestions] @@ -234,19 +240,17 @@ def process_note(file_path, taxonomy, new_tag_accumulator): new_tag_accumulator.update(genuinely_new) if needs_seo_title: - suffix = (llm_response.get('seo_title_suffix') or '').strip() - suffix = suffix.lstrip(':').strip() - # Strip a leading repeat of the title if the LLM included it anyway + suffix = (llm_response.get("seo_title_suffix") or "").strip().lstrip(":").strip() if suffix.lower().startswith(title.lower()): - suffix = suffix[len(title):].lstrip(':').strip() + suffix = suffix[len(title):].lstrip(":").strip() if suffix: seo_title = f"{title}: {suffix}" - frontmatter['seo-title'] = seo_title + frontmatter["seo-title"] = seo_title updated = True print(f" + Added SEO title: {seo_title}") if needs_seo_desc: - seo_desc = (llm_response.get('seo_description') or '').strip() + seo_desc = (llm_response.get("seo_description") or "").strip() if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX): print(f" ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking") retry = request_description_retry(title, body[:CONTENT_CHAR_LIMIT], seo_desc) @@ -257,30 +261,32 @@ def process_note(file_path, taxonomy, new_tag_accumulator): else: print(" ! Retry failed; using original") if seo_desc: - frontmatter['seo-description'] = seo_desc + frontmatter["seo-description"] = seo_desc updated = True print(f" + Added SEO description ({len(seo_desc)} chars)") if needs_seo_keywords: - seo_keywords = list(dict.fromkeys(llm_response.get('seo_keywords') or [])) + seo_keywords = list(dict.fromkeys(llm_response.get("seo_keywords") or [])) if seo_keywords: - frontmatter['seo-keywords'] = seo_keywords + frontmatter["seo-keywords"] = seo_keywords updated = True print(f" + Added {len(seo_keywords)} SEO keywords") if updated: - with open(file_path, 'w', encoding='utf-8') as f: - f.write(reconstruct_markdown(frontmatter, body)) + file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8") print(" ✓ Updated successfully") else: print(" - No updates needed") -def main(): +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main() -> None: taxonomy_path = Path(__file__).parent / TAXONOMY_FILE if not taxonomy_path.exists(): print(f"Error: Taxonomy file not found at {taxonomy_path}") - print(f"Please create {TAXONOMY_FILE} in the same directory as this script") sys.exit(1) taxonomy = load_taxonomy(taxonomy_path) @@ -290,11 +296,8 @@ def main(): if not target_path.exists(): print(f"Error: Notes folder not found: {target_path}") sys.exit(1) - if not target_path.is_dir(): - print(f"Error: {target_path} is not a directory") - sys.exit(1) - md_files = sorted(target_path.rglob('*.md')) + md_files = sorted(target_path.rglob("*.md")) if not md_files: print(f"No markdown files found in {target_path}") sys.exit(0) @@ -302,7 +305,7 @@ def main(): print(f"Processing all markdown files under: {target_path}") print(f"Found {len(md_files)} markdown files\n") - new_tag_accumulator = set() + new_tag_accumulator: set[str] = set() for md_file in md_files: try: process_note(md_file, taxonomy, new_tag_accumulator) @@ -313,17 +316,27 @@ def main(): print("\n✓ Processing complete!") fresh = sorted(t for t in new_tag_accumulator if t not in taxonomy) - if fresh: - print(f"\nNew tags suggested during this run: {', '.join(fresh)}") - try: - answer = input("Add these to the taxonomy? [y/N]: ").strip().lower() - except EOFError: - answer = '' - if answer == 'y': - append_tags_to_taxonomy(taxonomy_path, fresh) - print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}") - else: - print("Skipped taxonomy update.") + if not fresh: + return + + print(f"\nNew tags suggested during this run: {', '.join(fresh)}") + + # Non-interactive (CI): log and skip + if not sys.stdin.isatty(): + print("Non-interactive environment detected — skipping taxonomy update.") + print(f"To add these manually, run the script locally and answer 'y' when prompted.") + return + + try: + answer = input("Add these to the taxonomy? [y/N]: ").strip().lower() + except EOFError: + answer = "" + + if answer == "y": + append_tags_to_taxonomy(taxonomy_path, fresh) + print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}") + else: + print("Skipped taxonomy update.") if __name__ == "__main__": diff --git a/tag-taxonomy.yaml b/tag-taxonomy.yaml index 6c8ce35..5542f64 100644 --- a/tag-taxonomy.yaml +++ b/tag-taxonomy.yaml @@ -1,59 +1,51 @@ -# Tag Taxonomy for Note Tagging -# Add new tags here as the LLM suggests good ones - tags: - # Technology & Development - - self-hosting - - linux - - automation - - ai-tools - - web-development - - infrastructure - - docker - - security - - privacy - - # Work & Management - - project-management - - business-analysis - - leadership - - agile - - team-dynamics - - process-improvement - - governance - - # Knowledge & Learning - - knowledge-management - - zettelkasten - - note-taking - - learning - - productivity - - # Philosophy & Spirituality - - buddhism - - eastern-philosophy - - meditation - - mindfulness - - # Literature & Writing - - literature - - postmodernism - - writing - - # Personal Interests - - plants - - aroids - - gardening - - # Personal Narrative & Life - - memoir - - personal-essay - - reflection - - family - - parenting - - recovery - - mental-health - - aging - - relationships - - childhood - - identity \ No newline at end of file +- self-hosting +- linux +- automation +- ai-tools +- web-development +- infrastructure +- docker +- security +- privacy +- project-management +- business-analysis +- leadership +- agile +- team-dynamics +- process-improvement +- governance +- knowledge-management +- zettelkasten +- note-taking +- learning +- productivity +- buddhism +- eastern-philosophy +- meditation +- mindfulness +- literature +- postmodernism +- writing +- plants +- aroids +- gardening +- memoir +- personal-essay +- reflection +- family +- parenting +- recovery +- mental-health +- aging +- relationships +- childhood +- identity +- morning-routine +- sleep-habits +- baking +- cooking-techniques +- fermentation +- food-baking +- kitchen-hacks +- sourdough