Files
note-tagger/tag-notes.py
T

319 lines
11 KiB
Python
Raw Normal View History

2026-02-06 06:17:43 -06:00
#!/usr/bin/env python3
"""
Note Tagging and SEO Metadata Script

Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata.
"""
import os
import sys
2026-04-14 04:56:45 -05:00
import io
2026-02-06 06:17:43 -06:00
import json
import re
2026-04-14 04:56:45 -05:00
from pathlib import Path
import requests
import yaml
from ruamel.yaml import YAML
2026-02-06 06:17:43 -06:00
# Configuration
# Chat-completions endpoint that call_llm_json() POSTs every request to.
LM_STUDIO_URL = "http://localhost:1234/v1/chat/completions"
# Model identifier sent in the "model" field of each request payload.
MODEL_NAME = "openai/gpt-oss-20b"
# Taxonomy file name; main() resolves it relative to this script's directory.
TAXONOMY_FILE = "tag-taxonomy.yaml"
# Root folder that main() scans recursively for *.md notes.
NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")
# Maximum number of characters of a note body passed to the LLM.
CONTENT_CHAR_LIMIT = 20000
# Inclusive character-length bounds for an acceptable seo-description.
SEO_DESC_MIN = 150
SEO_DESC_MAX = 160
# Round-trip YAML preserves existing frontmatter formatting
yaml_rt = YAML()
yaml_rt.preserve_quotes = True
# NOTE(review): presumably set very wide so long values are never re-wrapped
# when frontmatter is dumped — confirm against ruamel.yaml's `width` docs.
yaml_rt.width = 4096
2026-02-06 06:17:43 -06:00
def load_taxonomy(taxonomy_path):
    """Load the list of known tags from the taxonomy YAML file.

    Args:
        taxonomy_path: Path to a YAML file holding a top-level ``tags`` list.

    Returns:
        The value stored under ``tags``, or an empty list when the file is
        empty or the key is missing or empty.
    """
    # Read explicitly as UTF-8 rather than the platform default encoding, so
    # non-ASCII tags load the same way on every system.
    with open(taxonomy_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or {}
    return data.get('tags', []) or []
def append_tags_to_taxonomy(taxonomy_path, new_tags):
    """Add *new_tags* to the ``tags`` list of the taxonomy file, in place.

    Existing tags keep their order; new tags are appended after them and
    duplicates are dropped. The file is re-serialized with PyYAML.

    Args:
        taxonomy_path: Path to the taxonomy YAML file.
        new_tags: Iterable of tag names to add.
    """
    # Explicit UTF-8 on both read and write: allow_unicode=True below emits
    # non-ASCII characters verbatim, which the platform default encoding may
    # not be able to represent.
    with open(taxonomy_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or {}
    existing = data.get('tags', []) or []
    # dict.fromkeys de-duplicates while keeping first-seen order.
    data['tags'] = list(dict.fromkeys(existing + list(new_tags)))
    with open(taxonomy_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
2026-02-06 06:17:43 -06:00
def extract_frontmatter(content):
    """Split a markdown document into parsed YAML frontmatter and its body.

    The document must start with a ``---`` line; the frontmatter ends at the
    next line consisting only of ``---`` (optionally followed by spaces/tabs).

    Args:
        content: Full text of the markdown file.

    Returns:
        ``(frontmatter, body)`` where *frontmatter* is whatever the round-trip
        YAML loader produced for the frontmatter text and *body* is everything
        after the closing delimiter line, unchanged. When no frontmatter
        block is found, returns ``(None, content)``.
    """
    # Only horizontal whitespace is allowed after each delimiter. Using `\s*`
    # here would also swallow the blank lines that follow the closing `---`,
    # silently deleting them from the body every time a note is rewritten.
    # The closing delimiter may also be the very last line of the file.
    pattern = r'^---[ \t]*\r?\n(.*?)\r?\n---[ \t]*(?:\r?\n|\Z)(.*)$'
    match = re.match(pattern, content, re.DOTALL)
    if not match:
        return None, content
    frontmatter = yaml_rt.load(match.group(1))
    return frontmatter, match.group(2)
2026-02-06 06:17:43 -06:00
2026-04-14 04:56:45 -05:00
def reconstruct_markdown(frontmatter, body):
    """Render *frontmatter* and *body* back into a single markdown document.

    The frontmatter is dumped with the round-trip YAML instance, wrapped in
    ``---`` delimiter lines, and followed directly by *body*.
    """
    buffer = io.StringIO()
    yaml_rt.dump(frontmatter, buffer)
    serialized = buffer.getvalue()
    # The closing delimiter must start on its own line.
    terminator = '' if serialized.endswith('\n') else '\n'
    return f"---\n{serialized}{terminator}---\n{body}"
2026-02-06 06:17:43 -06:00
2026-04-14 04:56:45 -05:00
def slugify(text):
    """Convert *text* into a lowercase, hyphen-separated slug.

    Apostrophes and backticks are removed outright (so ``Don't`` becomes
    ``dont``); every other character that is not a word character,
    whitespace or a hyphen becomes a separator. Runs of separators collapse
    to a single hyphen and leading/trailing hyphens are trimmed.

    Args:
        text: The string to slugify (typically a file stem).

    Returns:
        The slug, which may be empty if *text* contains no word characters.
    """
    text = text.lower()
    # Treat typographic apostrophes (U+2018, U+2019, U+02BC) like the ASCII
    # ones; otherwise "Don’t" would slug to "don-t" while "Don't" gives "dont".
    text = re.sub(r"['`\u2018\u2019\u02bc]", '', text)
    text = re.sub(r'[^\w\s-]', ' ', text)
    return re.sub(r'[-\s]+', '-', text).strip('-')
2026-02-06 06:17:43 -06:00
2026-04-14 04:56:45 -05:00
def parse_json_response(content):
    """Parse an LLM reply into a JSON object (dict).

    The whole reply is tried first; if that is not a JSON object, the text
    between the first ``{`` and the last ``}`` is tried, which recovers
    objects wrapped in prose or code fences.

    Args:
        content: Raw reply text, or ``None``.

    Returns:
        The decoded ``dict``, or ``None`` when *content* is not a string or
        no JSON object can be extracted from it.
    """
    if not isinstance(content, str):
        return None

    candidates = [content]
    start = content.find('{')
    end = content.rfind('}')
    if start != -1 and end > start:
        candidates.append(content[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers immediately use .get() on the result, so a valid but
        # non-object JSON value (list, string, number) counts as a failure.
        if isinstance(parsed, dict):
            return parsed
    return None
2026-02-06 06:17:43 -06:00
2026-04-14 04:56:45 -05:00
def call_llm_json(system_prompt, user_prompt, max_tokens=900):
    """Send one chat-completion request and return the reply parsed as JSON.

    Args:
        system_prompt: Text for the ``system`` message.
        user_prompt: Text for the ``user`` message.
        max_tokens: Value for the request's ``max_tokens`` field.

    Returns:
        Whatever ``parse_json_response`` returns for the first choice's
        message content, or ``None`` if the request or decoding raises
        (the error is printed, not re-raised).
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    request_body = {
        "model": MODEL_NAME,
        "messages": messages,
        "temperature": 0.2,
        "max_tokens": max_tokens,
        "response_format": {"type": "json_object"},
    }
    try:
        reply = requests.post(LM_STUDIO_URL, json=request_body, timeout=120)
        reply.raise_for_status()
        raw_text = reply.json()['choices'][0]['message']['content']
        return parse_json_response(raw_text)
    except Exception as exc:
        # Best-effort boundary: any failure means "no metadata" for the caller.
        print(f" ! LLM error: {exc}")
        return None
2026-04-14 04:56:45 -05:00
def request_metadata(title, note_content, taxonomy):
    """Ask the LLM for tags and SEO metadata for a single note.

    Args:
        title: Note title shown to the model.
        note_content: Note body text shown to the model.
        taxonomy: Iterable of existing tags the model should choose from.

    Returns:
        The result of ``call_llm_json`` for the built prompts (``None`` on
        failure). The prompt asks for the keys ``tags_from_taxonomy``,
        ``new_tag_suggestions``, ``seo_title_suffix``, ``seo_description``
        and ``seo_keywords``.
    """
    # Tags loaded from YAML are not guaranteed to be strings (e.g. `2024` or
    # `yes` load as int/bool); str.join would raise TypeError on those.
    taxonomy_str = ", ".join(str(tag) for tag in taxonomy)
    system_prompt = f"""You analyze markdown notes and return structured metadata.
Return ONLY valid JSON in this exact shape:
{{
"tags_from_taxonomy": ["tag1", "tag2"],
"new_tag_suggestions": ["newtag1"],
"seo_title_suffix": "Short descriptor that will follow the note title",
"seo_description": "Factual summary between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters.",
"seo_keywords": ["keyword1", "keyword2"]
}}
Rules:
- tags_from_taxonomy: 1-5 tags drawn from the existing taxonomy that best fit the content.
- new_tag_suggestions: 0-2 NEW tags, only when content truly warrants it (be conservative).
- seo_title_suffix: a short, clean, non-clickbaity descriptor of the note. Do NOT include the note title or a leading colon — only the text that would follow "<title>: ". Aim for 4-10 words.
- seo_description: a clean factual summary, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding.
- seo_keywords: 10-15 relevant keywords, no duplicates.
Existing tag taxonomy: {taxonomy_str}"""
    user_prompt = f"""Note title: {title}
Note content:
{note_content}
Produce the JSON described in the system prompt."""
    return call_llm_json(system_prompt, user_prompt)
def request_description_retry(title, note_content, previous_desc):
    """Ask the LLM to rewrite an SEO description that missed the length range.

    Args:
        title: Note title shown to the model.
        note_content: Note body text shown to the model.
        previous_desc: The out-of-range description to be rewritten.

    Returns:
        The stripped ``seo_description`` from the reply, or ``''`` when the
        call fails or the key is missing/empty.
    """
    system_prompt = f"""You rewrite SEO descriptions to a strict length.
Return ONLY valid JSON of the form:
{{"seo_description": "..."}}
The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding."""
    user_prompt = f"""Note title: {title}
Note content:
{note_content}
Your previous description was {len(previous_desc)} characters, outside the allowed {SEO_DESC_MIN}-{SEO_DESC_MAX} range:
"{previous_desc}"
Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters."""
    reply = call_llm_json(system_prompt, user_prompt, max_tokens=400)
    if not reply:
        return ''
    rewritten = reply.get('seo_description') or ''
    return rewritten.strip()
def _clean_str(value):
    """Return *value* stripped if it is a string, otherwise ``''``."""
    return value.strip() if isinstance(value, str) else ''


def _clean_str_list(value):
    """Coerce an LLM-supplied list field into unique, non-empty strings.

    A bare string is split on commas (instead of being iterated character by
    character); any other non-list value yields ``[]``; non-string items are
    dropped. First-seen order is kept.
    """
    if isinstance(value, str):
        value = value.split(',')
    elif not isinstance(value, (list, tuple)):
        return []
    stripped = (item.strip() for item in value if isinstance(item, str))
    return list(dict.fromkeys(item for item in stripped if item))


def _write_note(file_path, frontmatter, body):
    """Write the note back to *file_path* as UTF-8 with the given frontmatter."""
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(reconstruct_markdown(frontmatter, body))


def process_note(file_path, taxonomy, new_tag_accumulator):
    """Fill in missing tags, slug and SEO fields in one note's frontmatter.

    Only fields that are absent or empty are populated; existing values are
    never overwritten. The slug is derived locally from the file name; tags,
    ``seo-title``, ``seo-description`` and ``seo-keywords`` come from the LLM.
    The file is rewritten only when something changed.

    Args:
        file_path: Path of the markdown note.
        taxonomy: Collection of existing tags.
        new_tag_accumulator: Set that receives tags the LLM proposed which
            are not in *taxonomy* and were applied to this note.

    Returns:
        None. Progress is reported via ``print``.
    """
    print(f"Processing: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    frontmatter, body = extract_frontmatter(content)
    if frontmatter is None:
        print(" ⚠️ No frontmatter found, skipping")
        return
    if not isinstance(frontmatter, dict):
        # e.g. frontmatter that parses to a bare scalar or a list.
        print(" ⚠️ Frontmatter is not a mapping, skipping")
        return

    existing_tags = frontmatter.get('tags', []) or []
    # A `tags:` key with a single empty list item loads as [None].
    if existing_tags == [None]:
        existing_tags = []

    needs_tags = not existing_tags
    needs_slug = not frontmatter.get('slug')
    needs_seo_title = not frontmatter.get('seo-title')
    needs_seo_desc = not frontmatter.get('seo-description')
    needs_seo_keywords = not frontmatter.get('seo-keywords')
    needs_llm = needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords

    if not (needs_slug or needs_llm):
        print(" ✓ All fields already populated, skipping")
        return

    # YAML may load an unquoted title such as `2024` as a non-string.
    title = str(frontmatter.get('title') or Path(file_path).stem)
    updated = False

    if needs_slug:
        slug = slugify(Path(file_path).stem)
        frontmatter['slug'] = slug
        print(f" + Added slug: {slug}")
        updated = True

    if not needs_llm:
        _write_note(file_path, frontmatter, body)
        print(" ✓ Updated successfully")
        return

    excerpt = body[:CONTENT_CHAR_LIMIT]
    llm_response = request_metadata(title, excerpt, taxonomy)
    if not llm_response or not isinstance(llm_response, dict):
        print(" ✗ Failed to get LLM response")
        if updated:
            # The slug does not depend on the LLM; persist it rather than
            # discarding work that was already reported as done.
            _write_note(file_path, frontmatter, body)
            print(" ✓ Saved slug; remaining fields will be retried on the next run")
        return

    if needs_tags:
        taxonomy_tags = _clean_str_list(llm_response.get('tags_from_taxonomy'))
        new_suggestions = _clean_str_list(llm_response.get('new_tag_suggestions'))
        combined = list(dict.fromkeys(taxonomy_tags + new_suggestions))[:5]
        if combined:
            frontmatter['tags'] = combined
            updated = True
            print(f" + Added tags: {', '.join(combined)}")
            genuinely_new = [t for t in combined if t not in taxonomy and t in new_suggestions]
            if genuinely_new:
                print(f" (New suggestions: {', '.join(genuinely_new)})")
                new_tag_accumulator.update(genuinely_new)

    if needs_seo_title:
        suffix = _clean_str(llm_response.get('seo_title_suffix'))
        suffix = suffix.lstrip(':').strip()
        # Strip a leading repeat of the title if the LLM included it anyway
        if suffix.lower().startswith(title.lower()):
            suffix = suffix[len(title):].lstrip(':').strip()
        if suffix:
            seo_title = f"{title}: {suffix}"
            frontmatter['seo-title'] = seo_title
            updated = True
            print(f" + Added SEO title: {seo_title}")

    if needs_seo_desc:
        seo_desc = _clean_str(llm_response.get('seo_description'))
        if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX):
            print(f" ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking")
            retry = request_description_retry(title, excerpt, seo_desc)
            if retry and SEO_DESC_MIN <= len(retry) <= SEO_DESC_MAX:
                seo_desc = retry
            elif retry:
                print(f" ! Retry still {len(retry)} chars; using original")
            else:
                print(" ! Retry failed; using original")
        if seo_desc:
            frontmatter['seo-description'] = seo_desc
            updated = True
            print(f" + Added SEO description ({len(seo_desc)} chars)")

    if needs_seo_keywords:
        seo_keywords = _clean_str_list(llm_response.get('seo_keywords'))
        if seo_keywords:
            frontmatter['seo-keywords'] = seo_keywords
            updated = True
            print(f" + Added {len(seo_keywords)} SEO keywords")

    if updated:
        _write_note(file_path, frontmatter, body)
        print(" ✓ Updated successfully")
    else:
        print(" - No updates needed")
2026-02-06 06:17:43 -06:00
def main():
    """Tag every markdown note under NOTES_FOLDER, then offer to save new tags.

    Exits with status 1 when the taxonomy file or notes folder is missing
    (or the notes path is not a directory) and with status 0 when there are
    no markdown files. Errors in individual notes are printed and skipped.
    """
    taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
    if not taxonomy_path.exists():
        print(f"Error: Taxonomy file not found at {taxonomy_path}")
        print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
        sys.exit(1)

    taxonomy = load_taxonomy(taxonomy_path)
    print(f"Loaded {len(taxonomy)} tags from taxonomy\n")

    notes_root = Path(NOTES_FOLDER)
    if not notes_root.exists():
        print(f"Error: Notes folder not found: {notes_root}")
        sys.exit(1)
    if not notes_root.is_dir():
        print(f"Error: {notes_root} is not a directory")
        sys.exit(1)

    note_paths = sorted(notes_root.rglob('*.md'))
    if not note_paths:
        print(f"No markdown files found in {notes_root}")
        sys.exit(0)

    print(f"Processing all markdown files under: {notes_root}")
    print(f"Found {len(note_paths)} markdown files\n")

    new_tag_accumulator = set()
    for note_path in note_paths:
        try:
            process_note(note_path, taxonomy, new_tag_accumulator)
        except Exception as exc:
            # One broken note must not abort the whole run.
            print(f" ✗ Error processing {note_path}: {exc}")
        # Blank line between notes in the log.
        print()

    print("\n✓ Processing complete!")

    unseen_tags = [tag for tag in new_tag_accumulator if tag not in taxonomy]
    unseen_tags.sort()
    if not unseen_tags:
        return

    print(f"\nNew tags suggested during this run: {', '.join(unseen_tags)}")
    try:
        reply = input("Add these to the taxonomy? [y/N]: ").strip().lower()
    except EOFError:
        # No interactive stdin: treat as "no".
        reply = ''

    if reply != 'y':
        print("Skipped taxonomy update.")
        return

    append_tags_to_taxonomy(taxonomy_path, unseen_tags)
    print(f"✓ Added {len(unseen_tags)} tag(s) to {taxonomy_path.name}")
2026-02-06 06:17:43 -06:00
# Run the tagging pass only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()