322 lines
11 KiB
Python
Executable File
322 lines
11 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
Note Tagging and SEO Metadata Script
|
||
Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import io
|
||
import json
|
||
import re
|
||
from pathlib import Path
|
||
|
||
import requests
|
||
import yaml
|
||
from ruamel.yaml import YAML
|
||
|
||
# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------

# Chat-completions endpoint of the local LLM server and the model to request.
LM_STUDIO_URL = "http://localhost:1234/v1/chat/completions"
MODEL_NAME = "openai/gpt-oss-20b"

# File name of the tag taxonomy; main() looks for it next to this script.
TAXONOMY_FILE = "tag-taxonomy.yaml"

# Root folder that is searched recursively for markdown notes.
NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")

# Upper bound on the number of note-body characters sent to the model.
CONTENT_CHAR_LIMIT = 20000

# Inclusive length window (in characters) accepted for SEO descriptions.
SEO_DESC_MIN = 150
SEO_DESC_MAX = 160

# Shared round-trip YAML handler, used to read and re-emit note frontmatter
# while keeping the author's existing quoting; the large width is there so
# long values are not re-wrapped when dumped.
yaml_rt = YAML()
yaml_rt.preserve_quotes = True
yaml_rt.width = 4096
|
||
|
||
|
||
def load_taxonomy(taxonomy_path):
    """Return the list stored under the ``tags`` key of the taxonomy file.

    Args:
        taxonomy_path: Path of the YAML taxonomy file.

    Returns:
        The value of the top-level ``tags`` key, or an empty list when the
        file is empty, the key is missing, or its value is empty/null.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly as UTF-8: the taxonomy is dumped with
    # allow_unicode=True, so non-ASCII tags are stored verbatim and would be
    # misread (or fail to decode) under a non-UTF-8 locale default.
    with open(taxonomy_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or {}
    return data.get('tags', []) or []
|
||
|
||
|
||
def append_tags_to_taxonomy(taxonomy_path, new_tags):
    """Merge ``new_tags`` into the taxonomy file's ``tags`` list and rewrite it.

    Existing tags keep their position; new tags are appended in the order
    given, and duplicates are dropped (first occurrence wins). Other top-level
    keys in the file are written back unchanged.

    Args:
        taxonomy_path: Path of the YAML taxonomy file (read, then overwritten).
        new_tags: Iterable of tags to add.

    Raises:
        OSError: If the file cannot be read or written.
        yaml.YAMLError: If the existing file is not valid YAML.
    """
    # Read and write explicitly as UTF-8. The dump below uses
    # allow_unicode=True, which emits non-ASCII characters verbatim; with the
    # platform default encoding that can raise UnicodeEncodeError or produce
    # a file that cannot be read back consistently.
    with open(taxonomy_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or {}

    existing = data.get('tags', []) or []
    # dict.fromkeys de-duplicates while preserving first-seen order.
    data['tags'] = list(dict.fromkeys(list(existing) + list(new_tags)))

    with open(taxonomy_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
|
||
|
||
|
||
def extract_frontmatter(content):
    """Split a markdown document into parsed YAML frontmatter and body text.

    The frontmatter must start on the very first line with ``---`` and end
    with a line consisting of ``---`` (trailing spaces/tabs allowed on both
    delimiter lines).

    Args:
        content: Full text of the markdown file.

    Returns:
        ``(frontmatter, body)``. ``frontmatter`` is whatever the round-trip
        YAML loader produces for the text between the delimiters (``None``
        for an empty block). When no frontmatter block is found the result is
        ``(None, content)`` with the input returned untouched.
    """
    # Delimiter lines only tolerate horizontal whitespace ([ \t]*). Using \s*
    # there would also consume the blank lines that follow the closing
    # delimiter, and they would then be missing from the rewritten note.
    # The closing delimiter may also be the last thing in the file (\Z), so a
    # frontmatter-only note without a trailing newline is still recognised.
    pattern = r'^---[ \t]*\r?\n(.*?)\r?\n---[ \t]*(?:\r?\n|\Z)(.*)$'
    match = re.match(pattern, content, re.DOTALL)
    if not match:
        return None, content
    frontmatter = yaml_rt.load(match.group(1))
    return frontmatter, match.group(2)
|
||
|
||
|
||
def reconstruct_markdown(frontmatter, body):
    """Serialise frontmatter and body back into a single markdown document.

    Args:
        frontmatter: Frontmatter object to dump with the round-trip YAML handler.
        body: Markdown text that follows the frontmatter block.

    Returns:
        ``---``, the dumped YAML (always newline-terminated), ``---`` and then
        ``body``, joined into one string.
    """
    buffer = io.StringIO()
    yaml_rt.dump(frontmatter, buffer)
    yaml_text = buffer.getvalue()
    # The closing delimiter must sit on its own line, so make sure the YAML
    # text ends with exactly the newline it needs.
    terminator = '' if yaml_text.endswith('\n') else '\n'
    return f"---\n{yaml_text}{terminator}---\n{body}"
|
||
|
||
|
||
def slugify(text):
    """Turn arbitrary text into a lowercase, hyphen-separated slug.

    Apostrophes and backticks are removed outright (so "don't" becomes
    "dont"), every other character that is not a word character, whitespace
    or hyphen becomes a separator, and runs of separators collapse into a
    single hyphen. Leading and trailing hyphens are trimmed.

    Args:
        text: The string to convert.

    Returns:
        The slug; may be empty when ``text`` has no word characters.
    """
    # Ordered (pattern, replacement) passes; the order matters because the
    # last pass collapses the spaces introduced by the one before it.
    passes = (
        (r"[’'`]", ''),
        (r'[^\w\s-]', ' '),
        (r'[-\s]+', '-'),
    )
    slug = text.lower()
    for pattern, replacement in passes:
        slug = re.sub(pattern, replacement, slug)
    return slug.strip('-')
|
||
|
||
|
||
def parse_json_response(content):
    """Extract a JSON object from an LLM reply.

    The reply is first parsed as a whole. If that fails, or yields something
    other than a JSON object, the first ``{`` in the text is located and one
    JSON object is decoded starting there. This tolerates markdown code
    fences, leading chatter, and trailing text (even trailing text that
    itself contains braces).

    Args:
        content: The reply text, or ``None``. ``bytes``/``bytearray`` are
            decoded as UTF-8; any other non-string value is rejected.

    Returns:
        The decoded ``dict``, or ``None`` when no JSON object can be
        recovered. Non-object JSON (lists, strings, numbers) is never
        returned, because callers access the result with ``.get``.
    """
    if content is None:
        return None
    if isinstance(content, (bytes, bytearray)):
        content = content.decode('utf-8', errors='replace')
    if not isinstance(content, str):
        # e.g. a list of content parts; json.loads would raise TypeError.
        return None

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    start = content.find('{')
    if start == -1:
        return None
    try:
        # raw_decode stops at the end of the first complete object, so any
        # text after it (including stray '}' characters) is ignored.
        candidate, _ = json.JSONDecoder().raw_decode(content, start)
    except json.JSONDecodeError:
        return None
    return candidate if isinstance(candidate, dict) else None
|
||
|
||
|
||
def call_llm_json(system_prompt, user_prompt, max_tokens=900):
    """Send one system+user exchange to the LLM endpoint and parse the reply.

    Args:
        system_prompt: Text of the system message.
        user_prompt: Text of the user message.
        max_tokens: Value sent as the request's ``max_tokens`` field.

    Returns:
        The result of ``parse_json_response`` applied to the first choice's
        message content, or ``None`` when the HTTP status is not OK or any
        exception occurs (a diagnostic line is printed in both cases).
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    request_body = {
        "model": MODEL_NAME,
        "messages": messages,
        "temperature": 0.2,
        "max_tokens": max_tokens,
        "response_format": {"type": "text"},
    }

    try:
        reply = requests.post(LM_STUDIO_URL, json=request_body, timeout=120)
        if reply.ok:
            first_choice = reply.json()['choices'][0]
            return parse_json_response(first_choice['message']['content'])
        print(f" ! LLM error: {reply.status_code} {reply.reason}")
        # Only the start of the body is shown to keep the log readable.
        print(f" body: {reply.text[:500]}")
    except Exception as exc:
        # Best-effort boundary: network failures, bad JSON and unexpected
        # response shapes are all reported and turned into a None result.
        print(f" ! LLM error: {exc}")
    return None
|
||
|
||
|
||
def request_metadata(title, note_content, taxonomy):
    """Ask the LLM for tags and SEO metadata for one note.

    Args:
        title: Note title, quoted in the user prompt.
        note_content: Note body text to analyse (the caller decides how much
            of the note to pass).
        taxonomy: Iterable of existing tags, listed in the system prompt.

    Returns:
        Whatever ``call_llm_json`` returns: the parsed reply on success,
        ``None`` on failure. The prompt asks for the keys
        ``tags_from_taxonomy``, ``new_tag_suggestions``, ``seo_title_suffix``,
        ``seo_description`` and ``seo_keywords``.
    """
    # YAML can parse unquoted tags such as 2024 or true as non-string
    # scalars; str() keeps the join from raising TypeError on them.
    taxonomy_str = ", ".join(str(tag) for tag in taxonomy)
    system_prompt = f"""You analyze markdown notes and return structured metadata.

Return ONLY valid JSON in this exact shape:
{{
"tags_from_taxonomy": ["tag1", "tag2"],
"new_tag_suggestions": ["newtag1"],
"seo_title_suffix": "Short descriptor that will follow the note title",
"seo_description": "Factual summary between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters.",
"seo_keywords": ["keyword1", "keyword2"]
}}

Rules:
- tags_from_taxonomy: 1-5 tags drawn from the existing taxonomy that best fit the content.
- new_tag_suggestions: 0-2 NEW tags, only when content truly warrants it (be conservative).
- seo_title_suffix: a short, clean, non-clickbaity descriptor of the note. Do NOT include the note title or a leading colon — only the text that would follow "<title>: ". Aim for 4-10 words.
- seo_description: a clean factual summary, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding.
- seo_keywords: 10-15 relevant keywords, no duplicates.

Existing tag taxonomy: {taxonomy_str}"""

    user_prompt = f"""Note title: {title}

Note content:
{note_content}

Produce the JSON described in the system prompt."""
    return call_llm_json(system_prompt, user_prompt)
|
||
|
||
|
||
def request_description_retry(title, note_content, previous_desc):
    """Ask the LLM to rewrite an SEO description that missed the length window.

    Args:
        title: Note title, quoted in the user prompt.
        note_content: Note body text the description should summarise.
        previous_desc: The rejected description; its text and length are
            shown to the model.

    Returns:
        The stripped replacement description, or ``''`` when the call fails
        or the reply does not contain a string ``seo_description``. The
        returned text is NOT length-checked here; the caller must do that.
    """
    system_prompt = f"""You rewrite SEO descriptions to a strict length.

Return ONLY valid JSON of the form:
{{"seo_description": "..."}}

The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding."""
    user_prompt = f"""Note title: {title}

Note content:
{note_content}

Your previous description was {len(previous_desc)} characters, outside the allowed {SEO_DESC_MIN}-{SEO_DESC_MAX} range:
"{previous_desc}"

Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters."""
    result = call_llm_json(system_prompt, user_prompt, max_tokens=400)
    # A malformed reply (non-object JSON, or a non-string description) must
    # count as a failed retry rather than raise and abort the whole note.
    if not isinstance(result, dict):
        return ''
    description = result.get('seo_description')
    return description.strip() if isinstance(description, str) else ''
|
||
|
||
|
||
def _as_text(value):
    """Return ``value`` stripped when it is a string, otherwise ``''``."""
    return value.strip() if isinstance(value, str) else ''


def _as_text_list(value):
    """Return the non-empty stripped strings in ``value``, de-duplicated in order.

    A lone string is treated as a one-item list; anything that is not a
    string, list or tuple yields an empty list.
    """
    if isinstance(value, str):
        value = [value]
    elif not isinstance(value, (list, tuple)):
        return []
    cleaned = (_as_text(item) for item in value)
    return list(dict.fromkeys(item for item in cleaned if item))


def _write_note(file_path, frontmatter, body):
    """Overwrite ``file_path`` with the rebuilt frontmatter + body document."""
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(reconstruct_markdown(frontmatter, body))


def process_note(file_path, taxonomy, new_tag_accumulator):
    """Fill in missing tags, slug and SEO fields in one note's frontmatter.

    Only fields that are absent or empty are populated; existing values are
    never overwritten. The slug is derived locally from the file name; tags,
    ``seo-title``, ``seo-description`` and ``seo-keywords`` come from a single
    LLM request (plus one retry if the description misses the length window).
    The file is rewritten only when at least one field was added.

    Args:
        file_path: Path of the markdown note (read and possibly overwritten).
        taxonomy: Existing tags; passed to the LLM and used to decide which
            suggested tags are new.
        new_tag_accumulator: Set that is updated in place with tags the LLM
            proposed that are not in ``taxonomy`` and were applied to the note.

    Returns:
        None. Progress is reported with ``print``.
    """
    print(f"Processing: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    frontmatter, body = extract_frontmatter(content)
    if frontmatter is None:
        print(" ⚠️ No frontmatter found, skipping")
        return

    existing_tags = frontmatter.get('tags', []) or []
    # A bare "tags:\n  -" entry parses as [None]; treat it as no tags.
    if existing_tags == [None]:
        existing_tags = []

    needs_tags = not existing_tags
    needs_slug = not frontmatter.get('slug')
    needs_seo_title = not frontmatter.get('seo-title')
    needs_seo_desc = not frontmatter.get('seo-description')
    needs_seo_keywords = not frontmatter.get('seo-keywords')
    needs_llm = needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords

    if not (needs_slug or needs_llm):
        print(" ✓ All fields already populated, skipping")
        return

    # YAML may parse an unquoted title (e.g. 2024) as a non-string; the
    # string operations below need real text.
    title = str(frontmatter.get('title') or Path(file_path).stem)
    updated = False

    if needs_slug:
        slug = slugify(Path(file_path).stem)
        frontmatter['slug'] = slug
        print(f" + Added slug: {slug}")
        updated = True

    if not needs_llm:
        # Slug was the only missing field; no LLM call required.
        _write_note(file_path, frontmatter, body)
        print(" ✓ Updated successfully")
        return

    llm_response = request_metadata(title, body[:CONTENT_CHAR_LIMIT], taxonomy)
    if not llm_response or not isinstance(llm_response, dict):
        print(" ✗ Failed to get LLM response")
        if updated:
            # Do not throw away the slug that was already reported as added.
            _write_note(file_path, frontmatter, body)
            print(" ✓ Saved slug; remaining fields will be retried on the next run")
        return

    if needs_tags:
        taxonomy_tags = _as_text_list(llm_response.get('tags_from_taxonomy'))
        new_suggestions = _as_text_list(llm_response.get('new_tag_suggestions'))
        # Taxonomy tags take priority; the note gets at most five tags.
        combined = list(dict.fromkeys(taxonomy_tags + new_suggestions))[:5]
        if combined:
            frontmatter['tags'] = combined
            updated = True
            print(f" + Added tags: {', '.join(combined)}")
            genuinely_new = [t for t in combined if t not in taxonomy and t in new_suggestions]
            if genuinely_new:
                print(f" (New suggestions: {', '.join(genuinely_new)})")
                new_tag_accumulator.update(genuinely_new)

    if needs_seo_title:
        suffix = _as_text(llm_response.get('seo_title_suffix'))
        suffix = suffix.lstrip(':').strip()
        # Strip a leading repeat of the title if the LLM included it anyway
        if suffix.lower().startswith(title.lower()):
            suffix = suffix[len(title):].lstrip(':').strip()
        if suffix:
            seo_title = f"{title}: {suffix}"
            frontmatter['seo-title'] = seo_title
            updated = True
            print(f" + Added SEO title: {seo_title}")

    if needs_seo_desc:
        seo_desc = _as_text(llm_response.get('seo_description'))
        if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX):
            print(f" ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking")
            retry = request_description_retry(title, body[:CONTENT_CHAR_LIMIT], seo_desc)
            if retry and SEO_DESC_MIN <= len(retry) <= SEO_DESC_MAX:
                seo_desc = retry
            elif retry:
                print(f" ! Retry still {len(retry)} chars; using original")
            else:
                print(" ! Retry failed; using original")
        if seo_desc:
            # An out-of-range description is still better than none.
            frontmatter['seo-description'] = seo_desc
            updated = True
            print(f" + Added SEO description ({len(seo_desc)} chars)")

    if needs_seo_keywords:
        seo_keywords = _as_text_list(llm_response.get('seo_keywords'))
        if seo_keywords:
            frontmatter['seo-keywords'] = seo_keywords
            updated = True
            print(f" + Added {len(seo_keywords)} SEO keywords")

    if updated:
        _write_note(file_path, frontmatter, body)
        print(" ✓ Updated successfully")
    else:
        print(" - No updates needed")
|
||
|
||
|
||
def main():
    """Run the tagger over every markdown note under ``NOTES_FOLDER``.

    Loads the taxonomy that sits next to this script, processes each ``*.md``
    file found recursively (a failure in one note is printed and does not
    stop the run), and finally offers to append newly suggested tags to the
    taxonomy file.

    Exits with status 1 when the taxonomy file or notes folder is missing or
    the notes path is not a directory, and with status 0 when no markdown
    files are found.
    """
    script_dir = Path(__file__).parent
    taxonomy_path = script_dir / TAXONOMY_FILE
    if not taxonomy_path.exists():
        print(f"Error: Taxonomy file not found at {taxonomy_path}")
        print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
        sys.exit(1)

    taxonomy = load_taxonomy(taxonomy_path)
    print(f"Loaded {len(taxonomy)} tags from taxonomy\n")

    target_path = Path(NOTES_FOLDER)
    problem = None
    if not target_path.exists():
        problem = f"Error: Notes folder not found: {target_path}"
    elif not target_path.is_dir():
        problem = f"Error: {target_path} is not a directory"
    if problem is not None:
        print(problem)
        sys.exit(1)

    # Sorted so notes are processed in a stable, predictable order.
    md_files = sorted(target_path.rglob('*.md'))
    if not md_files:
        print(f"No markdown files found in {target_path}")
        sys.exit(0)

    print(f"Processing all markdown files under: {target_path}")
    print(f"Found {len(md_files)} markdown files\n")

    new_tag_accumulator = set()
    for md_file in md_files:
        try:
            process_note(md_file, taxonomy, new_tag_accumulator)
        except Exception as e:
            # One bad note must not abort the whole batch.
            print(f" ✗ Error processing {md_file}: {e}")
        print()

    print("\n✓ Processing complete!")

    fresh = [tag for tag in new_tag_accumulator if tag not in taxonomy]
    fresh.sort()
    if not fresh:
        return

    print(f"\nNew tags suggested during this run: {', '.join(fresh)}")
    try:
        answer = input("Add these to the taxonomy? [y/N]: ")
    except EOFError:
        # No interactive stdin (e.g. piped run): fall back to the default "no".
        answer = ''

    if answer.strip().lower() != 'y':
        print("Skipped taxonomy update.")
        return

    append_tags_to_taxonomy(taxonomy_path, fresh)
    print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")
|
||
|
||
|
||
# Script entry point: start the batch run only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()
|