2026-02-06 06:17:43 -06:00
#!/usr/bin/env python3
"""
Note Tagging and SEO Metadata Script
2026-02-07 12:30:28 -06:00
Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata
2026-02-06 06:17:43 -06:00
"""
import os
import sys
2026-04-14 04:56:45 -05:00
import io
2026-02-06 06:17:43 -06:00
import json
import re
2026-04-14 04:56:45 -05:00
from pathlib import Path
import requests
import yaml
from ruamel . yaml import YAML
2026-02-06 06:17:43 -06:00
# Configuration
2026-04-14 04:56:45 -05:00
LM_STUDIO_URL = " http://localhost:1234/v1/chat/completions "
2026-02-06 06:17:43 -06:00
MODEL_NAME = " openai/gpt-oss-20b "
TAXONOMY_FILE = " tag-taxonomy.yaml "
NOTES_FOLDER = os . path . expanduser ( " ~/Documents/ejl-zk/40 Public/41 Notes/ " )
2026-04-14 04:56:45 -05:00
CONTENT_CHAR_LIMIT = 20000
SEO_DESC_MIN = 150
SEO_DESC_MAX = 160
# Round-trip YAML preserves existing frontmatter formatting
yaml_rt = YAML ( )
yaml_rt . preserve_quotes = True
yaml_rt . width = 4096
2026-02-06 06:17:43 -06:00
def load_taxonomy ( taxonomy_path ) :
with open ( taxonomy_path , ' r ' ) as f :
2026-04-14 04:56:45 -05:00
data = yaml . safe_load ( f ) or { }
return data . get ( ' tags ' , [ ] ) or [ ]
def append_tags_to_taxonomy ( taxonomy_path , new_tags ) :
with open ( taxonomy_path , ' r ' ) as f :
data = yaml . safe_load ( f ) or { }
existing = data . get ( ' tags ' , [ ] ) or [ ]
combined = list ( dict . fromkeys ( existing + list ( new_tags ) ) )
data [ ' tags ' ] = combined
with open ( taxonomy_path , ' w ' ) as f :
yaml . dump ( data , f , default_flow_style = False , sort_keys = False , allow_unicode = True )
2026-02-06 06:17:43 -06:00
def extract_frontmatter ( content ) :
pattern = r ' ^--- \ s* \ n(.*?) \ n--- \ s* \ n(.*)$ '
match = re . match ( pattern , content , re . DOTALL )
2026-04-14 04:56:45 -05:00
if not match :
return None , content
frontmatter = yaml_rt . load ( match . group ( 1 ) )
return frontmatter , match . group ( 2 )
2026-02-06 06:17:43 -06:00
2026-04-14 04:56:45 -05:00
def reconstruct_markdown ( frontmatter , body ) :
stream = io . StringIO ( )
yaml_rt . dump ( frontmatter , stream )
fm_str = stream . getvalue ( )
if not fm_str . endswith ( ' \n ' ) :
fm_str + = ' \n '
return f " --- \n { fm_str } --- \n { body } "
2026-02-06 06:17:43 -06:00
2026-04-14 04:56:45 -05:00
def slugify ( text ) :
text = text . lower ( )
text = re . sub ( r " [’ ' `] " , ' ' , text )
text = re . sub ( r ' [^ \ w \ s-] ' , ' ' , text )
text = re . sub ( r ' [- \ s]+ ' , ' - ' , text ) . strip ( ' - ' )
return text
2026-02-06 06:17:43 -06:00
2026-04-14 04:56:45 -05:00
def parse_json_response ( content ) :
if content is None :
return None
try :
return json . loads ( content )
except json . JSONDecodeError :
pass
start = content . find ( ' { ' )
end = content . rfind ( ' } ' )
if start != - 1 and end > start :
try :
return json . loads ( content [ start : end + 1 ] )
except json . JSONDecodeError :
pass
return None
2026-02-06 06:17:43 -06:00
2026-04-19 22:18:10 -05:00
def call_llm_json ( system_prompt , user_prompt , max_tokens = 4000 ) :
2026-02-06 06:17:43 -06:00
payload = {
" model " : MODEL_NAME ,
" messages " : [
{ " role " : " system " , " content " : system_prompt } ,
2026-04-14 04:56:45 -05:00
{ " role " : " user " , " content " : user_prompt } ,
2026-02-06 06:17:43 -06:00
] ,
2026-04-14 04:56:45 -05:00
" temperature " : 0.2 ,
" max_tokens " : max_tokens ,
2026-04-14 21:35:12 -05:00
" response_format " : { " type " : " text " } ,
2026-02-06 06:17:43 -06:00
}
try :
2026-04-19 22:18:10 -05:00
response = requests . post ( LM_STUDIO_URL , json = payload , timeout = 600 )
2026-04-14 21:35:12 -05:00
if not response . ok :
print ( f " ! LLM error: { response . status_code } { response . reason } " )
print ( f " body: { response . text [ : 500 ] } " )
return None
2026-02-06 06:17:43 -06:00
result = response . json ( )
2026-04-19 22:18:10 -05:00
choice = result [ ' choices ' ] [ 0 ]
content = choice [ ' message ' ] . get ( ' content ' ) or ' '
parsed = parse_json_response ( content )
if parsed is None :
finish = choice . get ( ' finish_reason ' )
print ( f " ! LLM returned no parseable JSON (finish_reason= { finish } ) " )
if finish == ' length ' :
print ( " Hit max_tokens — reasoning model burned the budget before emitting output. " )
print ( f " content: { content [ : 500 ] !r} " )
return parsed
2026-02-06 06:17:43 -06:00
except Exception as e :
2026-04-14 04:56:45 -05:00
print ( f " ! LLM error: { e } " )
2026-02-06 06:17:43 -06:00
return None
2026-04-14 04:56:45 -05:00
def request_metadata ( title , note_content , taxonomy ) :
taxonomy_str = " , " . join ( taxonomy )
system_prompt = f """ You analyze markdown notes and return structured metadata.
Return ONLY valid JSON in this exact shape:
{{
" tags_from_taxonomy " : [ " tag1 " , " tag2 " ],
" new_tag_suggestions " : [ " newtag1 " ],
" seo_title_suffix " : " Short descriptor that will follow the note title " ,
" seo_description " : " Factual summary between { SEO_DESC_MIN } and { SEO_DESC_MAX } characters. " ,
" seo_keywords " : [ " keyword1 " , " keyword2 " ]
}}
Rules:
2026-04-19 22:24:53 -05:00
- Tags should describe what the note is substantively about, not topics it merely mentions in passing.
- tags_from_taxonomy: 0-5 tags drawn from the existing taxonomy, ONLY when they genuinely fit. Do NOT force a taxonomy tag — return an empty list if nothing truly applies.
- new_tag_suggestions: 0-5 NEW tags when the taxonomy doesn ' t adequately cover the content. Each must be a reusable category (not hyper-specific to one note). Use lowercase-hyphenated style (e.g., personal-essay).
2026-04-14 04:56:45 -05:00
- seo_title_suffix: a short, clean, non-clickbaity descriptor of the note. Do NOT include the note title or a leading colon — only the text that would follow " <title>: " . Aim for 4-10 words.
- seo_description: a clean factual summary, STRICTLY between { SEO_DESC_MIN } and { SEO_DESC_MAX } characters inclusive. Count characters carefully before responding.
- seo_keywords: 10-15 relevant keywords, no duplicates.
Existing tag taxonomy: { taxonomy_str } """
user_prompt = f """ Note title: { title }
Note content:
{ note_content }
Produce the JSON described in the system prompt. """
return call_llm_json ( system_prompt , user_prompt )
def request_description_retry ( title , note_content , previous_desc ) :
system_prompt = f """ You rewrite SEO descriptions to a strict length.
Return ONLY valid JSON of the form:
{{ " seo_description " : " ... " }}
The description must be a clean, factual summary of the note, STRICTLY between { SEO_DESC_MIN } and { SEO_DESC_MAX } characters inclusive. Count characters carefully before responding. """
user_prompt = f """ Note title: { title }
Note content:
{ note_content }
Your previous description was { len ( previous_desc ) } characters, outside the allowed { SEO_DESC_MIN } - { SEO_DESC_MAX } range:
" { previous_desc } "
Rewrite it to fit strictly within { SEO_DESC_MIN } - { SEO_DESC_MAX } characters. """
2026-04-19 22:18:10 -05:00
result = call_llm_json ( system_prompt , user_prompt )
2026-04-14 04:56:45 -05:00
if result :
return ( result . get ( ' seo_description ' ) or ' ' ) . strip ( )
return ' '
def process_note ( file_path , taxonomy , new_tag_accumulator ) :
2026-02-06 06:17:43 -06:00
print ( f " Processing: { file_path } " )
with open ( file_path , ' r ' , encoding = ' utf-8 ' ) as f :
content = f . read ( )
2026-04-14 04:56:45 -05:00
frontmatter , body = extract_frontmatter ( content )
2026-02-06 06:17:43 -06:00
if frontmatter is None :
2026-04-14 04:56:45 -05:00
print ( " ⚠️ No frontmatter found, skipping " )
2026-02-06 06:17:43 -06:00
return
2026-04-14 04:56:45 -05:00
existing_tags = frontmatter . get ( ' tags ' , [ ] ) or [ ]
if existing_tags == [ None ] :
2026-02-06 06:17:43 -06:00
existing_tags = [ ]
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
needs_tags = not existing_tags
2026-02-07 12:30:28 -06:00
needs_slug = not frontmatter . get ( ' slug ' )
2026-02-06 06:17:43 -06:00
needs_seo_title = not frontmatter . get ( ' seo-title ' )
needs_seo_desc = not frontmatter . get ( ' seo-description ' )
needs_seo_keywords = not frontmatter . get ( ' seo-keywords ' )
2026-04-14 04:56:45 -05:00
2026-02-07 12:30:28 -06:00
if not ( needs_tags or needs_slug or needs_seo_title or needs_seo_desc or needs_seo_keywords ) :
2026-04-14 04:56:45 -05:00
print ( " ✓ All fields already populated, skipping " )
2026-02-06 06:17:43 -06:00
return
2026-04-14 04:56:45 -05:00
title = frontmatter . get ( ' title ' ) or Path ( file_path ) . stem
2026-02-07 12:30:28 -06:00
updated = False
2026-04-14 04:56:45 -05:00
2026-02-07 12:30:28 -06:00
if needs_slug :
2026-04-14 04:56:45 -05:00
slug = slugify ( Path ( file_path ) . stem )
2026-02-07 12:30:28 -06:00
frontmatter [ ' slug ' ] = slug
print ( f " + Added slug: { slug } " )
updated = True
2026-04-14 04:56:45 -05:00
2026-02-07 12:30:28 -06:00
if not ( needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords ) :
with open ( file_path , ' w ' , encoding = ' utf-8 ' ) as f :
2026-04-14 04:56:45 -05:00
f . write ( reconstruct_markdown ( frontmatter , body ) )
print ( " ✓ Updated successfully " )
2026-02-07 12:30:28 -06:00
return
2026-04-14 04:56:45 -05:00
llm_response = request_metadata ( title , body [ : CONTENT_CHAR_LIMIT ] , taxonomy )
2026-02-06 06:17:43 -06:00
if not llm_response :
2026-04-14 04:56:45 -05:00
print ( " ✗ Failed to get LLM response " )
2026-02-06 06:17:43 -06:00
return
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
if needs_tags :
2026-04-14 04:56:45 -05:00
taxonomy_tags = llm_response . get ( ' tags_from_taxonomy ' ) or [ ]
new_suggestions = llm_response . get ( ' new_tag_suggestions ' ) or [ ]
combined = list ( dict . fromkeys ( list ( taxonomy_tags ) + list ( new_suggestions ) ) ) [ : 5 ]
if combined :
frontmatter [ ' tags ' ] = combined
2026-02-06 06:17:43 -06:00
updated = True
2026-04-14 04:56:45 -05:00
print ( f " + Added tags: { ' , ' . join ( combined ) } " )
genuinely_new = [ t for t in combined if t not in taxonomy and t in new_suggestions ]
if genuinely_new :
print ( f " (New suggestions: { ' , ' . join ( genuinely_new ) } ) " )
new_tag_accumulator . update ( genuinely_new )
2026-02-06 06:17:43 -06:00
if needs_seo_title :
2026-04-14 04:56:45 -05:00
suffix = ( llm_response . get ( ' seo_title_suffix ' ) or ' ' ) . strip ( )
suffix = suffix . lstrip ( ' : ' ) . strip ( )
# Strip a leading repeat of the title if the LLM included it anyway
if suffix . lower ( ) . startswith ( title . lower ( ) ) :
suffix = suffix [ len ( title ) : ] . lstrip ( ' : ' ) . strip ( )
if suffix :
seo_title = f " { title } : { suffix } "
2026-02-06 06:17:43 -06:00
frontmatter [ ' seo-title ' ] = seo_title
updated = True
print ( f " + Added SEO title: { seo_title } " )
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
if needs_seo_desc :
2026-04-14 04:56:45 -05:00
seo_desc = ( llm_response . get ( ' seo_description ' ) or ' ' ) . strip ( )
if seo_desc and not ( SEO_DESC_MIN < = len ( seo_desc ) < = SEO_DESC_MAX ) :
print ( f " ~ SEO description length { len ( seo_desc ) } outside { SEO_DESC_MIN } - { SEO_DESC_MAX } , re-asking " )
retry = request_description_retry ( title , body [ : CONTENT_CHAR_LIMIT ] , seo_desc )
if retry and SEO_DESC_MIN < = len ( retry ) < = SEO_DESC_MAX :
seo_desc = retry
elif retry :
print ( f " ! Retry still { len ( retry ) } chars; using original " )
else :
print ( " ! Retry failed; using original " )
2026-02-06 06:17:43 -06:00
if seo_desc :
frontmatter [ ' seo-description ' ] = seo_desc
updated = True
2026-04-14 04:56:45 -05:00
print ( f " + Added SEO description ( { len ( seo_desc ) } chars) " )
2026-02-06 06:17:43 -06:00
if needs_seo_keywords :
2026-04-14 04:56:45 -05:00
seo_keywords = list ( dict . fromkeys ( llm_response . get ( ' seo_keywords ' ) or [ ] ) )
2026-02-06 06:17:43 -06:00
if seo_keywords :
frontmatter [ ' seo-keywords ' ] = seo_keywords
updated = True
print ( f " + Added { len ( seo_keywords ) } SEO keywords " )
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
if updated :
with open ( file_path , ' w ' , encoding = ' utf-8 ' ) as f :
2026-04-14 04:56:45 -05:00
f . write ( reconstruct_markdown ( frontmatter , body ) )
print ( " ✓ Updated successfully " )
2026-02-06 06:17:43 -06:00
else :
2026-04-14 04:56:45 -05:00
print ( " - No updates needed " )
2026-02-06 06:17:43 -06:00
def main ( ) :
taxonomy_path = Path ( __file__ ) . parent / TAXONOMY_FILE
if not taxonomy_path . exists ( ) :
print ( f " Error: Taxonomy file not found at { taxonomy_path } " )
print ( f " Please create { TAXONOMY_FILE } in the same directory as this script " )
sys . exit ( 1 )
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
taxonomy = load_taxonomy ( taxonomy_path )
print ( f " Loaded { len ( taxonomy ) } tags from taxonomy \n " )
2026-04-14 04:56:45 -05:00
2026-02-06 06:17:43 -06:00
target_path = Path ( NOTES_FOLDER )
if not target_path . exists ( ) :
print ( f " Error: Notes folder not found: { target_path } " )
sys . exit ( 1 )
if not target_path . is_dir ( ) :
print ( f " Error: { target_path } is not a directory " )
sys . exit ( 1 )
2026-04-14 04:56:45 -05:00
md_files = sorted ( target_path . rglob ( ' *.md ' ) )
2026-02-06 06:17:43 -06:00
if not md_files :
print ( f " No markdown files found in { target_path } " )
sys . exit ( 0 )
2026-04-14 04:56:45 -05:00
print ( f " Processing all markdown files under: { target_path } " )
2026-02-06 06:17:43 -06:00
print ( f " Found { len ( md_files ) } markdown files \n " )
2026-04-14 04:56:45 -05:00
new_tag_accumulator = set ( )
2026-02-06 06:17:43 -06:00
for md_file in md_files :
2026-04-14 04:56:45 -05:00
try :
process_note ( md_file , taxonomy , new_tag_accumulator )
except Exception as e :
print ( f " ✗ Error processing { md_file } : { e } " )
print ( )
2026-02-06 06:17:43 -06:00
print ( " \n ✓ Processing complete! " )
2026-04-14 04:56:45 -05:00
fresh = sorted ( t for t in new_tag_accumulator if t not in taxonomy )
if fresh :
print ( f " \n New tags suggested during this run: { ' , ' . join ( fresh ) } " )
try :
answer = input ( " Add these to the taxonomy? [y/N]: " ) . strip ( ) . lower ( )
except EOFError :
answer = ' '
if answer == ' y ' :
append_tags_to_taxonomy ( taxonomy_path , fresh )
print ( f " ✓ Added { len ( fresh ) } tag(s) to { taxonomy_path . name } " )
else :
print ( " Skipped taxonomy update. " )
2026-02-06 06:17:43 -06:00
if __name__ == " __main__ " :
2026-02-07 12:30:28 -06:00
main ( )