Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e792732a13 | |||
| fd1407e06f | |||
| 68d78fe6bd | |||
| b4fb1283b9 |
Binary file not shown.
+117
-103
@@ -1,53 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Note Tagging and SEO Metadata Script
|
||||
Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata
|
||||
Processes markdown notes using the Anthropic API to add tags, slugs, and SEO metadata.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
import anthropic
|
||||
import yaml
|
||||
from ruamel.yaml import YAML
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration
|
||||
LM_STUDIO_URL = "http://localhost:1234/v1/chat/completions"
|
||||
MODEL_NAME = "openai/gpt-oss-20b"
|
||||
# ---------------------------------------------------------------------------
|
||||
TAXONOMY_FILE = "tag-taxonomy.yaml"
|
||||
NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")
|
||||
CONTENT_CHAR_LIMIT = 20000
|
||||
SEO_DESC_MIN = 150
|
||||
SEO_DESC_MAX = 160
|
||||
MODEL = "claude-sonnet-4-6"
|
||||
|
||||
# Round-trip YAML preserves existing frontmatter formatting
|
||||
yaml_rt = YAML()
|
||||
yaml_rt.preserve_quotes = True
|
||||
yaml_rt.width = 4096
|
||||
|
||||
# Anthropic client — reads ANTHROPIC_API_KEY from env automatically
|
||||
client = anthropic.Anthropic()
|
||||
|
||||
def load_taxonomy(taxonomy_path):
|
||||
with open(taxonomy_path, 'r') as f:
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Taxonomy helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_taxonomy(taxonomy_path: Path) -> list[str]:
|
||||
with open(taxonomy_path) as f:
|
||||
data = yaml.safe_load(f) or {}
|
||||
return data.get('tags', []) or []
|
||||
return data.get("tags", []) or []
|
||||
|
||||
|
||||
def append_tags_to_taxonomy(taxonomy_path, new_tags):
|
||||
with open(taxonomy_path, 'r') as f:
|
||||
def append_tags_to_taxonomy(taxonomy_path: Path, new_tags: set[str]) -> None:
|
||||
with open(taxonomy_path) as f:
|
||||
data = yaml.safe_load(f) or {}
|
||||
existing = data.get('tags', []) or []
|
||||
existing = data.get("tags", []) or []
|
||||
combined = list(dict.fromkeys(existing + list(new_tags)))
|
||||
data['tags'] = combined
|
||||
with open(taxonomy_path, 'w') as f:
|
||||
data["tags"] = combined
|
||||
with open(taxonomy_path, "w") as f:
|
||||
yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
|
||||
|
||||
|
||||
def extract_frontmatter(content):
|
||||
pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$'
|
||||
# ---------------------------------------------------------------------------
|
||||
# Frontmatter helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def extract_frontmatter(content: str):
|
||||
pattern = r"^---\s*\n(.*?)\n---\s*\n(.*)$"
|
||||
match = re.match(pattern, content, re.DOTALL)
|
||||
if not match:
|
||||
return None, content
|
||||
@@ -55,74 +67,65 @@ def extract_frontmatter(content):
|
||||
return frontmatter, match.group(2)
|
||||
|
||||
|
||||
def reconstruct_markdown(frontmatter, body):
|
||||
def reconstruct_markdown(frontmatter, body: str) -> str:
|
||||
stream = io.StringIO()
|
||||
yaml_rt.dump(frontmatter, stream)
|
||||
fm_str = stream.getvalue()
|
||||
if not fm_str.endswith('\n'):
|
||||
fm_str += '\n'
|
||||
if not fm_str.endswith("\n"):
|
||||
fm_str += "\n"
|
||||
return f"---\n{fm_str}---\n{body}"
|
||||
|
||||
|
||||
def slugify(text):
|
||||
def slugify(text: str) -> str:
|
||||
text = text.lower()
|
||||
text = re.sub(r"[’'`]", '', text)
|
||||
text = re.sub(r'[^\w\s-]', ' ', text)
|
||||
text = re.sub(r'[-\s]+', '-', text).strip('-')
|
||||
text = re.sub(r"[''`]", "", text)
|
||||
text = re.sub(r"[^\w\s-]", " ", text)
|
||||
text = re.sub(r"[-\s]+", "-", text).strip("-")
|
||||
return text
|
||||
|
||||
|
||||
def parse_json_response(content):
|
||||
# ---------------------------------------------------------------------------
|
||||
# LLM helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_json_response(content: str | None) -> dict | None:
|
||||
if content is None:
|
||||
return None
|
||||
try:
|
||||
return json.loads(content)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
start = content.find('{')
|
||||
end = content.rfind('}')
|
||||
start = content.find("{")
|
||||
end = content.rfind("}")
|
||||
if start != -1 and end > start:
|
||||
try:
|
||||
return json.loads(content[start:end + 1])
|
||||
return json.loads(content[start : end + 1])
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def call_llm_json(system_prompt, user_prompt, max_tokens=4000):
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_prompt},
|
||||
],
|
||||
"temperature": 0.2,
|
||||
"max_tokens": max_tokens,
|
||||
"response_format": {"type": "text"},
|
||||
}
|
||||
def call_llm_json(system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> dict | None:
|
||||
try:
|
||||
response = requests.post(LM_STUDIO_URL, json=payload, timeout=600)
|
||||
if not response.ok:
|
||||
print(f" ! LLM error: {response.status_code} {response.reason}")
|
||||
print(f" body: {response.text[:500]}")
|
||||
return None
|
||||
result = response.json()
|
||||
choice = result['choices'][0]
|
||||
content = choice['message'].get('content') or ''
|
||||
message = client.messages.create(
|
||||
model=MODEL,
|
||||
max_tokens=max_tokens,
|
||||
system=system_prompt,
|
||||
messages=[{"role": "user", "content": user_prompt}],
|
||||
temperature=0.2,
|
||||
)
|
||||
content = message.content[0].text if message.content else ""
|
||||
parsed = parse_json_response(content)
|
||||
if parsed is None:
|
||||
finish = choice.get('finish_reason')
|
||||
print(f" ! LLM returned no parseable JSON (finish_reason={finish})")
|
||||
if finish == 'length':
|
||||
print(" Hit max_tokens — reasoning model burned the budget before emitting output.")
|
||||
print(f" ! LLM returned no parseable JSON (stop_reason={message.stop_reason})")
|
||||
print(f" content: {content[:500]!r}")
|
||||
return parsed
|
||||
except Exception as e:
|
||||
print(f" ! LLM error: {e}")
|
||||
except anthropic.APIError as e:
|
||||
print(f" ! Anthropic API error: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def request_metadata(title, note_content, taxonomy):
|
||||
def request_metadata(title: str, note_content: str, taxonomy: list[str]) -> dict | None:
|
||||
taxonomy_str = ", ".join(taxonomy)
|
||||
system_prompt = f"""You analyze markdown notes and return structured metadata.
|
||||
|
||||
@@ -153,13 +156,14 @@ Produce the JSON described in the system prompt."""
|
||||
return call_llm_json(system_prompt, user_prompt)
|
||||
|
||||
|
||||
def request_description_retry(title, note_content, previous_desc):
|
||||
def request_description_retry(title: str, note_content: str, previous_desc: str) -> str:
|
||||
system_prompt = f"""You rewrite SEO descriptions to a strict length.
|
||||
|
||||
Return ONLY valid JSON of the form:
|
||||
{{"seo_description": "..."}}
|
||||
|
||||
The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding."""
|
||||
|
||||
user_prompt = f"""Note title: {title}
|
||||
|
||||
Note content:
|
||||
@@ -171,46 +175,49 @@ Your previous description was {len(previous_desc)} characters, outside the allow
|
||||
Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters."""
|
||||
result = call_llm_json(system_prompt, user_prompt)
|
||||
if result:
|
||||
return (result.get('seo_description') or '').strip()
|
||||
return ''
|
||||
return (result.get("seo_description") or "").strip()
|
||||
return ""
|
||||
|
||||
|
||||
def process_note(file_path, taxonomy, new_tag_accumulator):
|
||||
# ---------------------------------------------------------------------------
|
||||
# Note processing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def process_note(file_path: Path, taxonomy: list[str], new_tag_accumulator: set) -> None:
|
||||
print(f"Processing: {file_path}")
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
content = file_path.read_text(encoding="utf-8")
|
||||
|
||||
frontmatter, body = extract_frontmatter(content)
|
||||
if frontmatter is None:
|
||||
print(" ⚠️ No frontmatter found, skipping")
|
||||
return
|
||||
|
||||
existing_tags = frontmatter.get('tags', []) or []
|
||||
existing_tags = frontmatter.get("tags", []) or []
|
||||
if existing_tags == [None]:
|
||||
existing_tags = []
|
||||
|
||||
needs_tags = not existing_tags
|
||||
needs_slug = not frontmatter.get('slug')
|
||||
needs_seo_title = not frontmatter.get('seo-title')
|
||||
needs_seo_desc = not frontmatter.get('seo-description')
|
||||
needs_seo_keywords = not frontmatter.get('seo-keywords')
|
||||
needs_slug = not frontmatter.get("slug")
|
||||
needs_seo_title = not frontmatter.get("seo-title")
|
||||
needs_seo_desc = not frontmatter.get("seo-description")
|
||||
needs_seo_keywords = not frontmatter.get("seo-keywords")
|
||||
|
||||
if not (needs_tags or needs_slug or needs_seo_title or needs_seo_desc or needs_seo_keywords):
|
||||
if not any([needs_tags, needs_slug, needs_seo_title, needs_seo_desc, needs_seo_keywords]):
|
||||
print(" ✓ All fields already populated, skipping")
|
||||
return
|
||||
|
||||
title = frontmatter.get('title') or Path(file_path).stem
|
||||
title = frontmatter.get("title") or file_path.stem
|
||||
updated = False
|
||||
|
||||
if needs_slug:
|
||||
slug = slugify(Path(file_path).stem)
|
||||
frontmatter['slug'] = slug
|
||||
slug = slugify(file_path.stem)
|
||||
frontmatter["slug"] = slug
|
||||
print(f" + Added slug: {slug}")
|
||||
updated = True
|
||||
|
||||
if not (needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords):
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(reconstruct_markdown(frontmatter, body))
|
||||
# If only slug was needed, skip the LLM call
|
||||
if not any([needs_tags, needs_seo_title, needs_seo_desc, needs_seo_keywords]):
|
||||
file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8")
|
||||
print(" ✓ Updated successfully")
|
||||
return
|
||||
|
||||
@@ -220,11 +227,11 @@ def process_note(file_path, taxonomy, new_tag_accumulator):
|
||||
return
|
||||
|
||||
if needs_tags:
|
||||
taxonomy_tags = llm_response.get('tags_from_taxonomy') or []
|
||||
new_suggestions = llm_response.get('new_tag_suggestions') or []
|
||||
taxonomy_tags = llm_response.get("tags_from_taxonomy") or []
|
||||
new_suggestions = llm_response.get("new_tag_suggestions") or []
|
||||
combined = list(dict.fromkeys(list(taxonomy_tags) + list(new_suggestions)))[:5]
|
||||
if combined:
|
||||
frontmatter['tags'] = combined
|
||||
frontmatter["tags"] = combined
|
||||
updated = True
|
||||
print(f" + Added tags: {', '.join(combined)}")
|
||||
genuinely_new = [t for t in combined if t not in taxonomy and t in new_suggestions]
|
||||
@@ -233,19 +240,17 @@ def process_note(file_path, taxonomy, new_tag_accumulator):
|
||||
new_tag_accumulator.update(genuinely_new)
|
||||
|
||||
if needs_seo_title:
|
||||
suffix = (llm_response.get('seo_title_suffix') or '').strip()
|
||||
suffix = suffix.lstrip(':').strip()
|
||||
# Strip a leading repeat of the title if the LLM included it anyway
|
||||
suffix = (llm_response.get("seo_title_suffix") or "").strip().lstrip(":").strip()
|
||||
if suffix.lower().startswith(title.lower()):
|
||||
suffix = suffix[len(title):].lstrip(':').strip()
|
||||
suffix = suffix[len(title):].lstrip(":").strip()
|
||||
if suffix:
|
||||
seo_title = f"{title}: {suffix}"
|
||||
frontmatter['seo-title'] = seo_title
|
||||
frontmatter["seo-title"] = seo_title
|
||||
updated = True
|
||||
print(f" + Added SEO title: {seo_title}")
|
||||
|
||||
if needs_seo_desc:
|
||||
seo_desc = (llm_response.get('seo_description') or '').strip()
|
||||
seo_desc = (llm_response.get("seo_description") or "").strip()
|
||||
if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX):
|
||||
print(f" ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking")
|
||||
retry = request_description_retry(title, body[:CONTENT_CHAR_LIMIT], seo_desc)
|
||||
@@ -256,30 +261,32 @@ def process_note(file_path, taxonomy, new_tag_accumulator):
|
||||
else:
|
||||
print(" ! Retry failed; using original")
|
||||
if seo_desc:
|
||||
frontmatter['seo-description'] = seo_desc
|
||||
frontmatter["seo-description"] = seo_desc
|
||||
updated = True
|
||||
print(f" + Added SEO description ({len(seo_desc)} chars)")
|
||||
|
||||
if needs_seo_keywords:
|
||||
seo_keywords = list(dict.fromkeys(llm_response.get('seo_keywords') or []))
|
||||
seo_keywords = list(dict.fromkeys(llm_response.get("seo_keywords") or []))
|
||||
if seo_keywords:
|
||||
frontmatter['seo-keywords'] = seo_keywords
|
||||
frontmatter["seo-keywords"] = seo_keywords
|
||||
updated = True
|
||||
print(f" + Added {len(seo_keywords)} SEO keywords")
|
||||
|
||||
if updated:
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(reconstruct_markdown(frontmatter, body))
|
||||
file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8")
|
||||
print(" ✓ Updated successfully")
|
||||
else:
|
||||
print(" - No updates needed")
|
||||
|
||||
|
||||
def main():
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
|
||||
taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
|
||||
if not taxonomy_path.exists():
|
||||
print(f"Error: Taxonomy file not found at {taxonomy_path}")
|
||||
print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
|
||||
sys.exit(1)
|
||||
|
||||
taxonomy = load_taxonomy(taxonomy_path)
|
||||
@@ -289,11 +296,8 @@ def main():
|
||||
if not target_path.exists():
|
||||
print(f"Error: Notes folder not found: {target_path}")
|
||||
sys.exit(1)
|
||||
if not target_path.is_dir():
|
||||
print(f"Error: {target_path} is not a directory")
|
||||
sys.exit(1)
|
||||
|
||||
md_files = sorted(target_path.rglob('*.md'))
|
||||
md_files = sorted(target_path.rglob("*.md"))
|
||||
if not md_files:
|
||||
print(f"No markdown files found in {target_path}")
|
||||
sys.exit(0)
|
||||
@@ -301,7 +305,7 @@ def main():
|
||||
print(f"Processing all markdown files under: {target_path}")
|
||||
print(f"Found {len(md_files)} markdown files\n")
|
||||
|
||||
new_tag_accumulator = set()
|
||||
new_tag_accumulator: set[str] = set()
|
||||
for md_file in md_files:
|
||||
try:
|
||||
process_note(md_file, taxonomy, new_tag_accumulator)
|
||||
@@ -312,17 +316,27 @@ def main():
|
||||
print("\n✓ Processing complete!")
|
||||
|
||||
fresh = sorted(t for t in new_tag_accumulator if t not in taxonomy)
|
||||
if fresh:
|
||||
print(f"\nNew tags suggested during this run: {', '.join(fresh)}")
|
||||
try:
|
||||
answer = input("Add these to the taxonomy? [y/N]: ").strip().lower()
|
||||
except EOFError:
|
||||
answer = ''
|
||||
if answer == 'y':
|
||||
append_tags_to_taxonomy(taxonomy_path, fresh)
|
||||
print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")
|
||||
else:
|
||||
print("Skipped taxonomy update.")
|
||||
if not fresh:
|
||||
return
|
||||
|
||||
print(f"\nNew tags suggested during this run: {', '.join(fresh)}")
|
||||
|
||||
# Non-interactive (CI): log and skip
|
||||
if not sys.stdin.isatty():
|
||||
print("Non-interactive environment detected — skipping taxonomy update.")
|
||||
print(f"To add these manually, run the script locally and answer 'y' when prompted.")
|
||||
return
|
||||
|
||||
try:
|
||||
answer = input("Add these to the taxonomy? [y/N]: ").strip().lower()
|
||||
except EOFError:
|
||||
answer = ""
|
||||
|
||||
if answer == "y":
|
||||
append_tags_to_taxonomy(taxonomy_path, fresh)
|
||||
print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")
|
||||
else:
|
||||
print("Skipped taxonomy update.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
+50
-45
@@ -1,46 +1,51 @@
|
||||
# Tag Taxonomy for Note Tagging
|
||||
# Add new tags here as the LLM suggests good ones
|
||||
|
||||
tags:
|
||||
# Technology & Development
|
||||
- self-hosting
|
||||
- linux
|
||||
- automation
|
||||
- ai-tools
|
||||
- web-development
|
||||
- infrastructure
|
||||
- docker
|
||||
- security
|
||||
- privacy
|
||||
|
||||
# Work & Management
|
||||
- project-management
|
||||
- business-analysis
|
||||
- leadership
|
||||
- agile
|
||||
- team-dynamics
|
||||
- process-improvement
|
||||
- governance
|
||||
|
||||
# Knowledge & Learning
|
||||
- knowledge-management
|
||||
- zettelkasten
|
||||
- note-taking
|
||||
- learning
|
||||
- productivity
|
||||
|
||||
# Philosophy & Spirituality
|
||||
- buddhism
|
||||
- eastern-philosophy
|
||||
- meditation
|
||||
- mindfulness
|
||||
|
||||
# Literature & Writing
|
||||
- literature
|
||||
- postmodernism
|
||||
- writing
|
||||
|
||||
# Personal Interests
|
||||
- plants
|
||||
- aroids
|
||||
- gardening
|
||||
- self-hosting
|
||||
- linux
|
||||
- automation
|
||||
- ai-tools
|
||||
- web-development
|
||||
- infrastructure
|
||||
- docker
|
||||
- security
|
||||
- privacy
|
||||
- project-management
|
||||
- business-analysis
|
||||
- leadership
|
||||
- agile
|
||||
- team-dynamics
|
||||
- process-improvement
|
||||
- governance
|
||||
- knowledge-management
|
||||
- zettelkasten
|
||||
- note-taking
|
||||
- learning
|
||||
- productivity
|
||||
- buddhism
|
||||
- eastern-philosophy
|
||||
- meditation
|
||||
- mindfulness
|
||||
- literature
|
||||
- postmodernism
|
||||
- writing
|
||||
- plants
|
||||
- aroids
|
||||
- gardening
|
||||
- memoir
|
||||
- personal-essay
|
||||
- reflection
|
||||
- family
|
||||
- parenting
|
||||
- recovery
|
||||
- mental-health
|
||||
- aging
|
||||
- relationships
|
||||
- childhood
|
||||
- identity
|
||||
- morning-routine
|
||||
- sleep-habits
|
||||
- baking
|
||||
- cooking-techniques
|
||||
- fermentation
|
||||
- food-baking
|
||||
- kitchen-hacks
|
||||
- sourdough
|
||||
|
||||
Reference in New Issue
Block a user