2026-02-06 06:17:43 -06:00
#!/usr/bin/env python3
"""
Note Tagging and SEO Metadata Script
2026-04-25 18:47:29 -05:00
Processes markdown notes using the Anthropic API to add tags, slugs, and SEO metadata.
2026-02-06 06:17:43 -06:00
"""
2026-04-14 04:56:45 -05:00
import io
2026-02-06 06:17:43 -06:00
import json
2026-04-25 18:47:29 -05:00
import os
2026-02-06 06:17:43 -06:00
import re
2026-04-25 18:47:29 -05:00
import sys
2026-04-14 04:56:45 -05:00
from pathlib import Path
2026-04-25 18:47:29 -05:00
import anthropic
2026-04-14 04:56:45 -05:00
import yaml
from ruamel . yaml import YAML
2026-02-06 06:17:43 -06:00
2026-04-25 18:47:29 -05:00
# ---------------------------------------------------------------------------
2026-02-06 06:17:43 -06:00
# Configuration
2026-04-25 18:47:29 -05:00
# ---------------------------------------------------------------------------
2026-02-06 06:17:43 -06:00
TAXONOMY_FILE = " tag-taxonomy.yaml "
NOTES_FOLDER = os . path . expanduser ( " ~/Documents/ejl-zk/40 Public/41 Notes/ " )
2026-04-14 04:56:45 -05:00
CONTENT_CHAR_LIMIT = 20000
SEO_DESC_MIN = 150
SEO_DESC_MAX = 160
2026-04-25 18:47:29 -05:00
MODEL = " claude-sonnet-4-6 "
2026-04-14 04:56:45 -05:00
# Round-trip YAML preserves existing frontmatter formatting
yaml_rt = YAML ( )
yaml_rt . preserve_quotes = True
yaml_rt . width = 4096
2026-04-25 18:47:29 -05:00
# Anthropic client — reads ANTHROPIC_API_KEY from env automatically
client = anthropic . Anthropic ( )
# ---------------------------------------------------------------------------
# Taxonomy helpers
# ---------------------------------------------------------------------------
2026-02-06 06:17:43 -06:00
2026-04-25 18:47:29 -05:00
def load_taxonomy ( taxonomy_path : Path ) - > list [ str ] :
with open ( taxonomy_path ) as f :
2026-04-14 04:56:45 -05:00
data = yaml . safe_load ( f ) or { }
2026-04-25 18:47:29 -05:00
return data . get ( " tags " , [ ] ) or [ ]
2026-04-14 04:56:45 -05:00
2026-04-25 18:47:29 -05:00
def append_tags_to_taxonomy ( taxonomy_path : Path , new_tags : set [ str ] ) - > None :
with open ( taxonomy_path ) as f :
2026-04-14 04:56:45 -05:00
data = yaml . safe_load ( f ) or { }
2026-04-25 18:47:29 -05:00
existing = data . get ( " tags " , [ ] ) or [ ]
2026-04-14 04:56:45 -05:00
combined = list ( dict . fromkeys ( existing + list ( new_tags ) ) )
2026-04-25 18:47:29 -05:00
data [ " tags " ] = combined
with open ( taxonomy_path , " w " ) as f :
2026-04-14 04:56:45 -05:00
yaml . dump ( data , f , default_flow_style = False , sort_keys = False , allow_unicode = True )
2026-02-06 06:17:43 -06:00
2026-04-25 18:47:29 -05:00
# ---------------------------------------------------------------------------
# Frontmatter helpers
# ---------------------------------------------------------------------------
def extract_frontmatter ( content : str ) :
pattern = r " ^--- \ s* \ n(.*?) \ n--- \ s* \ n(.*)$ "
2026-02-06 06:17:43 -06:00
match = re . match ( pattern , content , re . DOTALL )
2026-04-14 04:56:45 -05:00
if not match :
return None , content
frontmatter = yaml_rt . load ( match . group ( 1 ) )
return frontmatter , match . group ( 2 )
2026-02-06 06:17:43 -06:00
2026-04-25 18:47:29 -05:00
def reconstruct_markdown ( frontmatter , body : str ) - > str :
2026-04-14 04:56:45 -05:00
stream = io . StringIO ( )
yaml_rt . dump ( frontmatter , stream )
fm_str = stream . getvalue ( )
2026-04-25 18:47:29 -05:00
if not fm_str . endswith ( " \n " ) :
fm_str + = " \n "
2026-04-14 04:56:45 -05:00
return f " --- \n { fm_str } --- \n { body } "
2026-02-06 06:17:43 -06:00
2026-04-25 18:47:29 -05:00
def slugify ( text : str ) - > str :
2026-04-14 04:56:45 -05:00
text = text . lower ( )
2026-04-25 18:47:29 -05:00
text = re . sub ( r " [ ' ' `] " , " " , text )
text = re . sub ( r " [^ \ w \ s-] " , " " , text )
text = re . sub ( r " [- \ s]+ " , " - " , text ) . strip ( " - " )
2026-04-14 04:56:45 -05:00
return text
2026-02-06 06:17:43 -06:00
2026-04-25 18:47:29 -05:00
# ---------------------------------------------------------------------------
# LLM helpers
# ---------------------------------------------------------------------------
def parse_json_response ( content : str | None ) - > dict | None :
2026-04-14 04:56:45 -05:00
if content is None :
return None
try :
return json . loads ( content )
except json . JSONDecodeError :
pass
2026-04-25 18:47:29 -05:00
start = content . find ( " { " )
end = content . rfind ( " } " )
2026-04-14 04:56:45 -05:00
if start != - 1 and end > start :
try :
2026-04-25 18:47:29 -05:00
return json . loads ( content [ start : end + 1 ] )
2026-04-14 04:56:45 -05:00
except json . JSONDecodeError :
pass
return None
2026-02-06 06:17:43 -06:00
2026-04-25 18:47:29 -05:00
def call_llm_json ( system_prompt : str , user_prompt : str , max_tokens : int = 1024 ) - > dict | None :
2026-02-06 06:17:43 -06:00
try :
2026-04-25 18:47:29 -05:00
message = client . messages . create (
model = MODEL ,
max_tokens = max_tokens ,
system = system_prompt ,
messages = [ { " role " : " user " , " content " : user_prompt } ] ,
temperature = 0.2 ,
)
content = message . content [ 0 ] . text if message . content else " "
2026-04-19 22:18:10 -05:00
parsed = parse_json_response ( content )
if parsed is None :
2026-04-25 18:47:29 -05:00
print ( f " ! LLM returned no parseable JSON (stop_reason= { message . stop_reason } ) " )
2026-04-19 22:18:10 -05:00
print ( f " content: { content [ : 500 ] !r} " )
return parsed
2026-04-25 18:47:29 -05:00
except anthropic . APIError as e :
print ( f " ! Anthropic API error: { e } " )
2026-02-06 06:17:43 -06:00
return None
2026-04-14 04:56:45 -05:00
2026-04-25 18:47:29 -05:00
def request_metadata ( title : str , note_content : str , taxonomy : list [ str ] ) - > dict | None :
2026-04-14 04:56:45 -05:00
taxonomy_str = " , " . join ( taxonomy )
system_prompt = f """ You analyze markdown notes and return structured metadata.
Return ONLY valid JSON in this exact shape:
{{
" tags_from_taxonomy " : [ " tag1 " , " tag2 " ],
" new_tag_suggestions " : [ " newtag1 " ],
" seo_title_suffix " : " Short descriptor that will follow the note title " ,
" seo_description " : " Factual summary between { SEO_DESC_MIN } and { SEO_DESC_MAX } characters. " ,
" seo_keywords " : [ " keyword1 " , " keyword2 " ]
}}
Rules:
2026-04-25 18:47:29 -05:00
- tags_from_taxonomy: 1-5 tags drawn from the existing taxonomy that best fit the content.
- new_tag_suggestions: 0-2 NEW tags, only when content truly warrants it (be conservative).
2026-04-14 04:56:45 -05:00
- seo_title_suffix: a short, clean, non-clickbaity descriptor of the note. Do NOT include the note title or a leading colon — only the text that would follow " <title>: " . Aim for 4-10 words.
- seo_description: a clean factual summary, STRICTLY between { SEO_DESC_MIN } and { SEO_DESC_MAX } characters inclusive. Count characters carefully before responding.
- seo_keywords: 10-15 relevant keywords, no duplicates.
Existing tag taxonomy: { taxonomy_str } """
user_prompt = f """ Note title: { title }
Note content:
{ note_content }
Produce the JSON described in the system prompt. """
return call_llm_json ( system_prompt , user_prompt )
2026-04-25 18:47:29 -05:00
def request_description_retry ( title : str , note_content : str , previous_desc : str ) - > str :
2026-04-14 04:56:45 -05:00
system_prompt = f """ You rewrite SEO descriptions to a strict length.
Return ONLY valid JSON of the form:
{{ " seo_description " : " ... " }}
The description must be a clean, factual summary of the note, STRICTLY between { SEO_DESC_MIN } and { SEO_DESC_MAX } characters inclusive. Count characters carefully before responding. """
2026-04-25 18:47:29 -05:00
2026-04-14 04:56:45 -05:00
user_prompt = f """ Note title: { title }
Note content:
{ note_content }
Your previous description was { len ( previous_desc ) } characters, outside the allowed { SEO_DESC_MIN } - { SEO_DESC_MAX } range:
" { previous_desc } "
Rewrite it to fit strictly within { SEO_DESC_MIN } - { SEO_DESC_MAX } characters. """
2026-04-19 22:18:10 -05:00
result = call_llm_json ( system_prompt , user_prompt )
2026-04-14 04:56:45 -05:00
if result :
2026-04-25 18:47:29 -05:00
return ( result . get ( " seo_description " ) or " " ) . strip ( )
return " "
2026-04-14 04:56:45 -05:00
2026-04-25 18:47:29 -05:00
# ---------------------------------------------------------------------------
# Note processing
# ---------------------------------------------------------------------------
def process_note ( file_path : Path , taxonomy : list [ str ] , new_tag_accumulator : set ) - > None :
2026-02-06 06:17:43 -06:00
print ( f " Processing: { file_path } " )
2026-04-25 18:47:29 -05:00
content = file_path . read_text ( encoding = " utf-8 " )
2026-04-14 04:56:45 -05:00
frontmatter , body = extract_frontmatter ( content )
2026-02-06 06:17:43 -06:00
if frontmatter is None :
2026-04-14 04:56:45 -05:00
print ( " ⚠️ No frontmatter found, skipping " )
2026-02-06 06:17:43 -06:00
return
2026-04-14 04:56:45 -05:00
2026-04-25 18:47:29 -05:00
existing_tags = frontmatter . get ( " tags " , [ ] ) or [ ]
2026-04-14 04:56:45 -05:00
if existing_tags == [ None ] :
2026-02-06 06:17:43 -06:00
existing_tags = [ ]
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
needs_tags = not existing_tags
2026-04-25 18:47:29 -05:00
needs_slug = not frontmatter . get ( " slug " )
needs_seo_title = not frontmatter . get ( " seo-title " )
needs_seo_desc = not frontmatter . get ( " seo-description " )
needs_seo_keywords = not frontmatter . get ( " seo-keywords " )
2026-04-14 04:56:45 -05:00
2026-04-25 18:47:29 -05:00
if not any ( [ needs_tags , needs_slug , needs_seo_title , needs_seo_desc , needs_seo_keywords ] ) :
2026-04-14 04:56:45 -05:00
print ( " ✓ All fields already populated, skipping " )
2026-02-06 06:17:43 -06:00
return
2026-04-14 04:56:45 -05:00
2026-04-25 18:47:29 -05:00
title = frontmatter . get ( " title " ) or file_path . stem
2026-02-07 12:30:28 -06:00
updated = False
2026-04-14 04:56:45 -05:00
2026-02-07 12:30:28 -06:00
if needs_slug :
2026-04-25 18:47:29 -05:00
slug = slugify ( file_path . stem )
frontmatter [ " slug " ] = slug
2026-02-07 12:30:28 -06:00
print ( f " + Added slug: { slug } " )
updated = True
2026-04-14 04:56:45 -05:00
2026-04-25 18:47:29 -05:00
# If only slug was needed, skip the LLM call
if not any ( [ needs_tags , needs_seo_title , needs_seo_desc , needs_seo_keywords ] ) :
file_path . write_text ( reconstruct_markdown ( frontmatter , body ) , encoding = " utf-8 " )
2026-04-14 04:56:45 -05:00
print ( " ✓ Updated successfully " )
2026-02-07 12:30:28 -06:00
return
2026-04-14 04:56:45 -05:00
llm_response = request_metadata ( title , body [ : CONTENT_CHAR_LIMIT ] , taxonomy )
2026-02-06 06:17:43 -06:00
if not llm_response :
2026-04-14 04:56:45 -05:00
print ( " ✗ Failed to get LLM response " )
2026-02-06 06:17:43 -06:00
return
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
if needs_tags :
2026-04-25 18:47:29 -05:00
taxonomy_tags = llm_response . get ( " tags_from_taxonomy " ) or [ ]
new_suggestions = llm_response . get ( " new_tag_suggestions " ) or [ ]
combined = list ( dict . fromkeys ( list ( taxonomy_tags ) + list ( new_suggestions ) ) ) [ : 5 ]
2026-04-14 04:56:45 -05:00
if combined :
2026-04-25 18:47:29 -05:00
frontmatter [ " tags " ] = combined
2026-02-06 06:17:43 -06:00
updated = True
2026-04-14 04:56:45 -05:00
print ( f " + Added tags: { ' , ' . join ( combined ) } " )
genuinely_new = [ t for t in combined if t not in taxonomy and t in new_suggestions ]
if genuinely_new :
print ( f " (New suggestions: { ' , ' . join ( genuinely_new ) } ) " )
new_tag_accumulator . update ( genuinely_new )
2026-02-06 06:17:43 -06:00
if needs_seo_title :
2026-04-25 18:47:29 -05:00
suffix = ( llm_response . get ( " seo_title_suffix " ) or " " ) . strip ( ) . lstrip ( " : " ) . strip ( )
2026-04-14 04:56:45 -05:00
if suffix . lower ( ) . startswith ( title . lower ( ) ) :
2026-04-25 18:47:29 -05:00
suffix = suffix [ len ( title ) : ] . lstrip ( " : " ) . strip ( )
2026-04-14 04:56:45 -05:00
if suffix :
seo_title = f " { title } : { suffix } "
2026-04-25 18:47:29 -05:00
frontmatter [ " seo-title " ] = seo_title
2026-02-06 06:17:43 -06:00
updated = True
print ( f " + Added SEO title: { seo_title } " )
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
if needs_seo_desc :
2026-04-25 18:47:29 -05:00
seo_desc = ( llm_response . get ( " seo_description " ) or " " ) . strip ( )
2026-04-14 04:56:45 -05:00
if seo_desc and not ( SEO_DESC_MIN < = len ( seo_desc ) < = SEO_DESC_MAX ) :
print ( f " ~ SEO description length { len ( seo_desc ) } outside { SEO_DESC_MIN } - { SEO_DESC_MAX } , re-asking " )
retry = request_description_retry ( title , body [ : CONTENT_CHAR_LIMIT ] , seo_desc )
if retry and SEO_DESC_MIN < = len ( retry ) < = SEO_DESC_MAX :
seo_desc = retry
elif retry :
print ( f " ! Retry still { len ( retry ) } chars; using original " )
else :
print ( " ! Retry failed; using original " )
2026-02-06 06:17:43 -06:00
if seo_desc :
2026-04-25 18:47:29 -05:00
frontmatter [ " seo-description " ] = seo_desc
2026-02-06 06:17:43 -06:00
updated = True
2026-04-14 04:56:45 -05:00
print ( f " + Added SEO description ( { len ( seo_desc ) } chars) " )
2026-02-06 06:17:43 -06:00
if needs_seo_keywords :
2026-04-25 18:47:29 -05:00
seo_keywords = list ( dict . fromkeys ( llm_response . get ( " seo_keywords " ) or [ ] ) )
2026-02-06 06:17:43 -06:00
if seo_keywords :
2026-04-25 18:47:29 -05:00
frontmatter [ " seo-keywords " ] = seo_keywords
2026-02-06 06:17:43 -06:00
updated = True
print ( f " + Added { len ( seo_keywords ) } SEO keywords " )
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
if updated :
2026-04-25 18:47:29 -05:00
file_path . write_text ( reconstruct_markdown ( frontmatter , body ) , encoding = " utf-8 " )
2026-04-14 04:56:45 -05:00
print ( " ✓ Updated successfully " )
2026-02-06 06:17:43 -06:00
else :
2026-04-14 04:56:45 -05:00
print ( " - No updates needed " )
2026-02-06 06:17:43 -06:00
2026-04-25 18:47:29 -05:00
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main ( ) - > None :
2026-02-06 06:17:43 -06:00
taxonomy_path = Path ( __file__ ) . parent / TAXONOMY_FILE
if not taxonomy_path . exists ( ) :
print ( f " Error: Taxonomy file not found at { taxonomy_path } " )
sys . exit ( 1 )
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
taxonomy = load_taxonomy ( taxonomy_path )
print ( f " Loaded { len ( taxonomy ) } tags from taxonomy \n " )
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
target_path = Path ( NOTES_FOLDER )
if not target_path . exists ( ) :
print ( f " Error: Notes folder not found: { target_path } " )
sys . exit ( 1 )
2026-04-14 04:56:45 -05:00
2026-04-25 18:47:29 -05:00
md_files = sorted ( target_path . rglob ( " *.md " ) )
2026-02-06 06:17:43 -06:00
if not md_files :
print ( f " No markdown files found in { target_path } " )
sys . exit ( 0 )
2026-04-14 04:56:45 -05:00
print ( f " Processing all markdown files under: { target_path } " )
2026-02-06 06:17:43 -06:00
print ( f " Found { len ( md_files ) } markdown files \n " )
2026-04-14 04:56:45 -05:00
2026-04-25 18:47:29 -05:00
new_tag_accumulator : set [ str ] = set ( )
2026-02-06 06:17:43 -06:00
for md_file in md_files :
2026-04-14 04:56:45 -05:00
try :
process_note ( md_file , taxonomy , new_tag_accumulator )
except Exception as e :
print ( f " ✗ Error processing { md_file } : { e } " )
print ( )
2026-02-06 06:17:43 -06:00
print ( " \n ✓ Processing complete! " )
2026-04-14 04:56:45 -05:00
fresh = sorted ( t for t in new_tag_accumulator if t not in taxonomy )
2026-04-25 18:47:29 -05:00
if not fresh :
return
print ( f " \n New tags suggested during this run: { ' , ' . join ( fresh ) } " )
# Non-interactive (CI): log and skip
if not sys . stdin . isatty ( ) :
print ( " Non-interactive environment detected — skipping taxonomy update. " )
print ( f " To add these manually, run the script locally and answer ' y ' when prompted. " )
return
try :
answer = input ( " Add these to the taxonomy? [y/N]: " ) . strip ( ) . lower ( )
except EOFError :
answer = " "
if answer == " y " :
append_tags_to_taxonomy ( taxonomy_path , fresh )
print ( f " ✓ Added { len ( fresh ) } tag(s) to { taxonomy_path . name } " )
else :
print ( " Skipped taxonomy update. " )
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
if __name__ == " __main__ " :
2026-02-07 12:30:28 -06:00
main ( )