Compare commits

...

4 Commits

3 changed files with 167 additions and 148 deletions
BIN
View File
Binary file not shown.
+117 -103
View File
@@ -1,53 +1,65 @@
#!/usr/bin/env python3
"""
Note Tagging and SEO Metadata Script
Processes markdown notes using a local LLM to add tags, slugs, and SEO metadata
Processes markdown notes using the Anthropic API to add tags, slugs, and SEO metadata.
"""
import os
import sys
import io
import json
import os
import re
import sys
from pathlib import Path
import requests
import anthropic
import yaml
from ruamel.yaml import YAML
# ---------------------------------------------------------------------------
# Configuration
# Local OpenAI-compatible chat-completions endpoint and model name for the
# local-LLM setup.
# NOTE(review): the Anthropic-based call path uses MODEL/client below instead;
# confirm whether anything still reads these two values.
LM_STUDIO_URL = "http://localhost:1234/v1/chat/completions"
MODEL_NAME = "openai/gpt-oss-20b"
# ---------------------------------------------------------------------------
# File name of the tag taxonomy; main() resolves it next to this script.
TAXONOMY_FILE = "tag-taxonomy.yaml"
# Root folder that is searched recursively for *.md notes.
NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")
# Upper bound on how many characters of a note body are sent to the LLM.
CONTENT_CHAR_LIMIT = 20000
# Inclusive length bounds (in characters) enforced on generated SEO descriptions.
SEO_DESC_MIN = 150
SEO_DESC_MAX = 160
# Model identifier passed to client.messages.create().
MODEL = "claude-sonnet-4-6"
# Round-trip YAML preserves existing frontmatter formatting
yaml_rt = YAML()
yaml_rt.preserve_quotes = True
# Presumably set very wide so long scalar values are not re-wrapped on dump.
yaml_rt.width = 4096
# Anthropic client — reads ANTHROPIC_API_KEY from env automatically
client = anthropic.Anthropic()
# ---------------------------------------------------------------------------
# Taxonomy helpers
# ---------------------------------------------------------------------------
def load_taxonomy(taxonomy_path: Path) -> list[str]:
    """Return the tags listed under the ``tags`` key of the taxonomy file.

    Args:
        taxonomy_path: Location of the taxonomy YAML file.

    Returns:
        The list stored under ``tags``; an empty list when the file is
        empty, the key is missing, or the key holds a null value.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(taxonomy_path, encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    return data.get("tags", []) or []
def append_tags_to_taxonomy(taxonomy_path: Path, new_tags: set[str] | list[str]) -> None:
    """Merge ``new_tags`` into the taxonomy file's ``tags`` list and rewrite it.

    Existing tags keep their position; tags not yet present are appended in
    the iteration order of ``new_tags``. Duplicates are dropped.

    Args:
        taxonomy_path: Location of the taxonomy YAML file (rewritten in place).
        new_tags: Tags to add; any iterable collection of strings works.
    """
    with open(taxonomy_path, encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    existing = data.get("tags", []) or []
    # dict.fromkeys de-duplicates while preserving first-seen order.
    data["tags"] = list(dict.fromkeys([*existing, *new_tags]))
    # NOTE: the safe_load/dump round trip does not carry YAML comments over,
    # so any "#" section headers in the taxonomy file are lost on rewrite.
    # UTF-8 is required here because allow_unicode=True emits raw non-ASCII.
    with open(taxonomy_path, "w", encoding="utf-8") as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
def extract_frontmatter(content):
pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$'
# ---------------------------------------------------------------------------
# Frontmatter helpers
# ---------------------------------------------------------------------------
def extract_frontmatter(content: str):
pattern = r"^---\s*\n(.*?)\n---\s*\n(.*)$"
match = re.match(pattern, content, re.DOTALL)
if not match:
return None, content
@@ -55,74 +67,65 @@ def extract_frontmatter(content):
return frontmatter, match.group(2)
def reconstruct_markdown(frontmatter, body: str) -> str:
    """Rebuild a markdown document from its frontmatter mapping and body.

    Args:
        frontmatter: Frontmatter object to serialize with the module's
            round-trip YAML dumper (``yaml_rt``).
        body: Everything that followed the closing ``---`` in the source note.

    Returns:
        ``---``-delimited YAML frontmatter followed by ``body``.
    """
    stream = io.StringIO()
    yaml_rt.dump(frontmatter, stream)
    fm_str = stream.getvalue()
    # The closing delimiter must start on its own line.
    if not fm_str.endswith("\n"):
        fm_str += "\n"
    return f"---\n{fm_str}---\n{body}"
def slugify(text: str) -> str:
    """Convert ``text`` into a lowercase, hyphen-separated slug.

    Apostrophes (straight and curly) and backticks are removed outright so
    contractions stay one word ("don't" -> "dont"); every other character
    that is not a word character, whitespace, or hyphen becomes a separator.

    Args:
        text: Arbitrary title or file stem.

    Returns:
        The slug, with runs of separators collapsed to a single ``-`` and no
        leading or trailing hyphen. May be empty.
    """
    text = text.lower()
    # Drop apostrophe-like characters instead of turning them into hyphens.
    text = re.sub("['\u2018\u2019`]", "", text)
    text = re.sub(r"[^\w\s-]", " ", text)
    text = re.sub(r"[-\s]+", "-", text).strip("-")
    return text
def parse_json_response(content):
# ---------------------------------------------------------------------------
# LLM helpers
# ---------------------------------------------------------------------------
def parse_json_response(content: str | None) -> dict | None:
if content is None:
return None
try:
return json.loads(content)
except json.JSONDecodeError:
pass
start = content.find('{')
end = content.rfind('}')
start = content.find("{")
end = content.rfind("}")
if start != -1 and end > start:
try:
return json.loads(content[start:end + 1])
return json.loads(content[start : end + 1])
except json.JSONDecodeError:
pass
return None
def call_llm_json(system_prompt: str, user_prompt: str, max_tokens: int = 1024) -> dict | None:
    """Send one system/user prompt pair to the Anthropic API and parse the JSON reply.

    Args:
        system_prompt: Instructions passed as the ``system`` parameter.
        user_prompt: Content of the single user message.
        max_tokens: Output token limit for the reply.

    Returns:
        The decoded JSON object, or ``None`` if the API call failed or the
        reply contained no parseable JSON object. Failures are reported on
        stdout rather than raised.
    """
    try:
        message = client.messages.create(
            model=MODEL,
            max_tokens=max_tokens,
            system=system_prompt,
            messages=[{"role": "user", "content": user_prompt}],
            temperature=0.2,
        )
    except anthropic.APIError as e:
        print(f" ! Anthropic API error: {e}")
        return None

    # A reply is a list of content blocks; join every text block rather than
    # assuming the first block is the (only) text one.
    content = "".join(
        block.text for block in message.content if getattr(block, "type", None) == "text"
    )
    parsed = parse_json_response(content)
    if parsed is None:
        print(f" ! LLM returned no parseable JSON (stop_reason={message.stop_reason})")
        if message.stop_reason == "max_tokens":
            # Truncated output is the most likely cause of unparseable JSON.
            print(" Hit max_tokens — the reply was cut off before the JSON was complete.")
        print(f" content: {content[:500]!r}")
    return parsed
def request_metadata(title, note_content, taxonomy):
def request_metadata(title: str, note_content: str, taxonomy: list[str]) -> dict | None:
taxonomy_str = ", ".join(taxonomy)
system_prompt = f"""You analyze markdown notes and return structured metadata.
@@ -153,13 +156,14 @@ Produce the JSON described in the system prompt."""
return call_llm_json(system_prompt, user_prompt)
def request_description_retry(title, note_content, previous_desc):
def request_description_retry(title: str, note_content: str, previous_desc: str) -> str:
system_prompt = f"""You rewrite SEO descriptions to a strict length.
Return ONLY valid JSON of the form:
{{"seo_description": "..."}}
The description must be a clean, factual summary of the note, STRICTLY between {SEO_DESC_MIN} and {SEO_DESC_MAX} characters inclusive. Count characters carefully before responding."""
user_prompt = f"""Note title: {title}
Note content:
@@ -171,46 +175,49 @@ Your previous description was {len(previous_desc)} characters, outside the allow
Rewrite it to fit strictly within {SEO_DESC_MIN}-{SEO_DESC_MAX} characters."""
result = call_llm_json(system_prompt, user_prompt)
if result:
return (result.get('seo_description') or '').strip()
return ''
return (result.get("seo_description") or "").strip()
return ""
def process_note(file_path, taxonomy, new_tag_accumulator):
# ---------------------------------------------------------------------------
# Note processing
# ---------------------------------------------------------------------------
def process_note(file_path: Path, taxonomy: list[str], new_tag_accumulator: set) -> None:
print(f"Processing: {file_path}")
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
content = file_path.read_text(encoding="utf-8")
frontmatter, body = extract_frontmatter(content)
if frontmatter is None:
print(" ⚠️ No frontmatter found, skipping")
return
existing_tags = frontmatter.get('tags', []) or []
existing_tags = frontmatter.get("tags", []) or []
if existing_tags == [None]:
existing_tags = []
needs_tags = not existing_tags
needs_slug = not frontmatter.get('slug')
needs_seo_title = not frontmatter.get('seo-title')
needs_seo_desc = not frontmatter.get('seo-description')
needs_seo_keywords = not frontmatter.get('seo-keywords')
needs_slug = not frontmatter.get("slug")
needs_seo_title = not frontmatter.get("seo-title")
needs_seo_desc = not frontmatter.get("seo-description")
needs_seo_keywords = not frontmatter.get("seo-keywords")
if not (needs_tags or needs_slug or needs_seo_title or needs_seo_desc or needs_seo_keywords):
if not any([needs_tags, needs_slug, needs_seo_title, needs_seo_desc, needs_seo_keywords]):
print(" ✓ All fields already populated, skipping")
return
title = frontmatter.get('title') or Path(file_path).stem
title = frontmatter.get("title") or file_path.stem
updated = False
if needs_slug:
slug = slugify(Path(file_path).stem)
frontmatter['slug'] = slug
slug = slugify(file_path.stem)
frontmatter["slug"] = slug
print(f" + Added slug: {slug}")
updated = True
if not (needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords):
with open(file_path, 'w', encoding='utf-8') as f:
f.write(reconstruct_markdown(frontmatter, body))
# If only slug was needed, skip the LLM call
if not any([needs_tags, needs_seo_title, needs_seo_desc, needs_seo_keywords]):
file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8")
print(" ✓ Updated successfully")
return
@@ -220,11 +227,11 @@ def process_note(file_path, taxonomy, new_tag_accumulator):
return
if needs_tags:
taxonomy_tags = llm_response.get('tags_from_taxonomy') or []
new_suggestions = llm_response.get('new_tag_suggestions') or []
taxonomy_tags = llm_response.get("tags_from_taxonomy") or []
new_suggestions = llm_response.get("new_tag_suggestions") or []
combined = list(dict.fromkeys(list(taxonomy_tags) + list(new_suggestions)))[:5]
if combined:
frontmatter['tags'] = combined
frontmatter["tags"] = combined
updated = True
print(f" + Added tags: {', '.join(combined)}")
genuinely_new = [t for t in combined if t not in taxonomy and t in new_suggestions]
@@ -233,19 +240,17 @@ def process_note(file_path, taxonomy, new_tag_accumulator):
new_tag_accumulator.update(genuinely_new)
if needs_seo_title:
suffix = (llm_response.get('seo_title_suffix') or '').strip()
suffix = suffix.lstrip(':').strip()
# Strip a leading repeat of the title if the LLM included it anyway
suffix = (llm_response.get("seo_title_suffix") or "").strip().lstrip(":").strip()
if suffix.lower().startswith(title.lower()):
suffix = suffix[len(title):].lstrip(':').strip()
suffix = suffix[len(title):].lstrip(":").strip()
if suffix:
seo_title = f"{title}: {suffix}"
frontmatter['seo-title'] = seo_title
frontmatter["seo-title"] = seo_title
updated = True
print(f" + Added SEO title: {seo_title}")
if needs_seo_desc:
seo_desc = (llm_response.get('seo_description') or '').strip()
seo_desc = (llm_response.get("seo_description") or "").strip()
if seo_desc and not (SEO_DESC_MIN <= len(seo_desc) <= SEO_DESC_MAX):
print(f" ~ SEO description length {len(seo_desc)} outside {SEO_DESC_MIN}-{SEO_DESC_MAX}, re-asking")
retry = request_description_retry(title, body[:CONTENT_CHAR_LIMIT], seo_desc)
@@ -256,30 +261,32 @@ def process_note(file_path, taxonomy, new_tag_accumulator):
else:
print(" ! Retry failed; using original")
if seo_desc:
frontmatter['seo-description'] = seo_desc
frontmatter["seo-description"] = seo_desc
updated = True
print(f" + Added SEO description ({len(seo_desc)} chars)")
if needs_seo_keywords:
seo_keywords = list(dict.fromkeys(llm_response.get('seo_keywords') or []))
seo_keywords = list(dict.fromkeys(llm_response.get("seo_keywords") or []))
if seo_keywords:
frontmatter['seo-keywords'] = seo_keywords
frontmatter["seo-keywords"] = seo_keywords
updated = True
print(f" + Added {len(seo_keywords)} SEO keywords")
if updated:
with open(file_path, 'w', encoding='utf-8') as f:
f.write(reconstruct_markdown(frontmatter, body))
file_path.write_text(reconstruct_markdown(frontmatter, body), encoding="utf-8")
print(" ✓ Updated successfully")
else:
print(" - No updates needed")
def main():
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
if not taxonomy_path.exists():
print(f"Error: Taxonomy file not found at {taxonomy_path}")
print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
sys.exit(1)
taxonomy = load_taxonomy(taxonomy_path)
@@ -289,11 +296,8 @@ def main():
if not target_path.exists():
print(f"Error: Notes folder not found: {target_path}")
sys.exit(1)
if not target_path.is_dir():
print(f"Error: {target_path} is not a directory")
sys.exit(1)
md_files = sorted(target_path.rglob('*.md'))
md_files = sorted(target_path.rglob("*.md"))
if not md_files:
print(f"No markdown files found in {target_path}")
sys.exit(0)
@@ -301,7 +305,7 @@ def main():
print(f"Processing all markdown files under: {target_path}")
print(f"Found {len(md_files)} markdown files\n")
new_tag_accumulator = set()
new_tag_accumulator: set[str] = set()
for md_file in md_files:
try:
process_note(md_file, taxonomy, new_tag_accumulator)
@@ -312,17 +316,27 @@ def main():
print("\n✓ Processing complete!")
fresh = sorted(t for t in new_tag_accumulator if t not in taxonomy)
if fresh:
print(f"\nNew tags suggested during this run: {', '.join(fresh)}")
try:
answer = input("Add these to the taxonomy? [y/N]: ").strip().lower()
except EOFError:
answer = ''
if answer == 'y':
append_tags_to_taxonomy(taxonomy_path, fresh)
print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")
else:
print("Skipped taxonomy update.")
if not fresh:
return
print(f"\nNew tags suggested during this run: {', '.join(fresh)}")
# Non-interactive (CI): log and skip
if not sys.stdin.isatty():
print("Non-interactive environment detected — skipping taxonomy update.")
print(f"To add these manually, run the script locally and answer 'y' when prompted.")
return
try:
answer = input("Add these to the taxonomy? [y/N]: ").strip().lower()
except EOFError:
answer = ""
if answer == "y":
append_tags_to_taxonomy(taxonomy_path, fresh)
print(f"✓ Added {len(fresh)} tag(s) to {taxonomy_path.name}")
else:
print("Skipped taxonomy update.")
if __name__ == "__main__":
+50 -45
View File
@@ -1,46 +1,51 @@
# Tag Taxonomy for Note Tagging
# Add new tags here as the LLM suggests good ones
tags:
# Technology & Development
- self-hosting
- linux
- automation
- ai-tools
- web-development
- infrastructure
- docker
- security
- privacy
# Work & Management
- project-management
- business-analysis
- leadership
- agile
- team-dynamics
- process-improvement
- governance
# Knowledge & Learning
- knowledge-management
- zettelkasten
- note-taking
- learning
- productivity
# Philosophy & Spirituality
- buddhism
- eastern-philosophy
- meditation
- mindfulness
# Literature & Writing
- literature
- postmodernism
- writing
# Personal Interests
- plants
- aroids
- gardening
- self-hosting
- linux
- automation
- ai-tools
- web-development
- infrastructure
- docker
- security
- privacy
- project-management
- business-analysis
- leadership
- agile
- team-dynamics
- process-improvement
- governance
- knowledge-management
- zettelkasten
- note-taking
- learning
- productivity
- buddhism
- eastern-philosophy
- meditation
- mindfulness
- literature
- postmodernism
- writing
- plants
- aroids
- gardening
- memoir
- personal-essay
- reflection
- family
- parenting
- recovery
- mental-health
- aging
- relationships
- childhood
- identity
- morning-routine
- sleep-habits
- baking
- cooking-techniques
- fermentation
- food-baking
- kitchen-hacks
- sourdough