235 lines
7.6 KiB
Python
Executable File
235 lines
7.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Note Tagging and SEO Metadata Script
|
|
Processes markdown notes using a local LLM to add tags and SEO metadata
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
import requests
|
|
import json
|
|
from pathlib import Path
|
|
import re
|
|
|
|
# Configuration
#
# The deployment-specific settings below can be overridden through
# environment variables. When a variable is unset the original hard-coded
# value is used, so existing setups keep working unchanged.

# Chat-completions endpoint that call_llm() posts to.
LM_STUDIO_URL = os.environ.get(
    "NOTE_TAGGER_LM_STUDIO_URL",
    "http://192.168.68.84:1234/v1/chat/completions",
)
# Model identifier sent in the request payload.
MODEL_NAME = os.environ.get("NOTE_TAGGER_MODEL_NAME", "openai/gpt-oss-20b")
# Taxonomy file name; main() looks for it next to this script.
TAXONOMY_FILE = "tag-taxonomy.yaml"
# Folder whose top-level *.md files are processed ("~" is expanded).
NOTES_FOLDER = os.path.expanduser(
    os.environ.get(
        "NOTE_TAGGER_NOTES_FOLDER",
        "~/Documents/ejl-zk/40 Public/41 Notes/",
    )
)
|
|
|
|
def load_taxonomy(taxonomy_path):
    """Load the tag taxonomy from a YAML file.

    Args:
        taxonomy_path: Path of the YAML file whose top-level ``tags`` key
            holds the known tags.

    Returns:
        The value stored under ``tags``, or an empty list when the file is
        empty, is not a mapping, or has a missing or null ``tags`` entry.
    """
    # Explicit UTF-8 so non-ASCII tags load the same way on every platform
    # instead of depending on the locale's default encoding.
    with open(taxonomy_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load yields None for an empty document and may yield a scalar or
    # list for other shapes; only a mapping can carry a ``tags`` key.
    if not isinstance(data, dict):
        return []
    return data.get('tags') or []
|
|
|
|
def extract_frontmatter(content):
    """Split a markdown document into its YAML frontmatter and body.

    Args:
        content: Full text of the markdown file.

    Returns:
        A ``(frontmatter, body, frontmatter_str)`` tuple: the parsed
        frontmatter mapping, the text after the closing ``---`` line, and
        the raw YAML text between the delimiters. ``(None, content, None)``
        is returned when the document does not start with a frontmatter
        block, or when that block is invalid YAML or does not parse to a
        mapping.
    """
    # Each delimiter line may carry only trailing spaces/tabs. Matching
    # arbitrary whitespace there would also swallow blank lines, silently
    # dropping the empty lines at the start of the body on rewrite.
    # The body group is optional so a closing ``---`` at end-of-file counts.
    pattern = r'\A---[ \t]*\r?\n(.*?)\r?\n---[ \t]*(?:\r?\n(.*))?\Z'
    match = re.match(pattern, content, re.DOTALL)
    if not match:
        return None, content, None

    frontmatter_str = match.group(1)
    body = match.group(2) or ''

    try:
        frontmatter = yaml.safe_load(frontmatter_str)
    except yaml.YAMLError as e:
        # One malformed note should be skipped by the caller, not abort
        # the whole batch with a traceback.
        print(f" ⚠️ Invalid YAML frontmatter: {e}")
        return None, content, None

    # process_note() calls ``frontmatter.get(...)``, so anything that is not
    # a mapping (None, scalar, list) is treated as missing frontmatter.
    if not isinstance(frontmatter, dict):
        return None, content, None

    return frontmatter, body, frontmatter_str
|
|
|
|
def reconstruct_markdown(frontmatter, body):
    """Serialize the frontmatter and join it with the note body.

    The frontmatter is dumped as block-style YAML, keeping key order and
    leaving non-ASCII characters unescaped, then wrapped in ``---`` lines
    and followed by ``body``.
    """
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
    }
    yaml_text = yaml.dump(frontmatter, **dump_options)
    # The closing delimiter is appended directly after the dump, which
    # relies on the dumped text already ending with a newline.
    return '---\n{}---\n{}'.format(yaml_text, body)
|
|
|
|
def call_llm(prompt, note_content, taxonomy):
    """Ask the LM Studio chat endpoint for tags and SEO metadata.

    Args:
        prompt: Unused; kept so existing callers keep working.
        note_content: Text of the note to analyze.
        taxonomy: Iterable of existing tags offered to the model.

    Returns:
        The JSON object from the model's reply as a dict, or ``None`` when
        the request fails, the response has an unexpected shape, or no JSON
        object can be parsed from the reply.
    """
    # str() keeps the join from failing on non-string taxonomy entries.
    taxonomy_str = ", ".join(str(tag) for tag in taxonomy)

    system_prompt = f"""You are a helpful assistant that analyzes markdown notes and provides:
1. Tags from existing taxonomy (1-5 tags, prefer existing)
2. 1-2 NEW tag suggestions if content warrants it
3. Clean, concise SEO title (not clickbaity)
4. Clean, concise SEO description (150-160 chars, factual summary)
5. SEO keywords (be generous, 10-15 relevant keywords)

Existing tag taxonomy: {taxonomy_str}

Return ONLY valid JSON in this exact format:
{{
"tags_from_taxonomy": ["tag1", "tag2"],
"new_tag_suggestions": ["newtag1"],
"seo_title": "Clear Title Here",
"seo_description": "Concise factual summary of the note content.",
"seo_keywords": ["keyword1", "keyword2", "keyword3"]
}}"""

    user_prompt = f"""Analyze this note and provide tags and SEO metadata:

{note_content}

Remember:
- Use 1-5 tags from taxonomy that fit best
- Suggest 1-2 NEW tags only if content really warrants it (be conservative)
- SEO title should be clear and informative, NOT clickbaity
- SEO description should be a clean factual summary (150-160 characters)
- SEO keywords can be generous (10-15 keywords)"""

    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.7,
        # NOTE(review): 500 tokens may be tight for 10-15 keywords plus the
        # other fields; a truncated reply will fail JSON parsing. Confirm.
        "max_tokens": 500,
    }

    try:
        response = requests.post(LM_STUDIO_URL, json=payload, timeout=60)
        response.raise_for_status()
        content = response.json()['choices'][0]['message']['content']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as e:
        # Transport/HTTP failures, a non-JSON HTTP body, or a response that
        # lacks the choices/message/content structure.
        print(f"Error calling LLM: {e}")
        return None

    if not isinstance(content, str):
        print("Error calling LLM: response contained no text content")
        return None

    parsed = _parse_llm_json(content)
    if parsed is None:
        print("Error calling LLM: response did not contain a JSON object")
    return parsed


def _parse_llm_json(text):
    """Return the JSON object embedded in an LLM reply, or None if absent."""
    candidates = []

    # Models often wrap the JSON in a markdown code fence, with or without
    # the ``json`` language tag.
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))

    # The reply may be the bare JSON document itself ...
    candidates.append(text)

    # ... or JSON surrounded by prose: try the outermost brace pair.
    start, end = text.find('{'), text.rfind('}')
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:
            continue
        # Callers use ``.get``; a JSON list or scalar is not a usable reply.
        if isinstance(parsed, dict):
            return parsed
    return None
|
|
|
|
def process_note(file_path, taxonomy):
    """Fill in missing tags and SEO metadata for a single note file.

    Reads the note, and when any of ``tags``, ``seo-title``,
    ``seo-description`` or ``seo-keywords`` is missing or empty in its
    frontmatter, asks the LLM for values. Only the empty fields are filled;
    the file is rewritten only if at least one field changed.

    Args:
        file_path: Path of the markdown file to process.
        taxonomy: Existing tags passed on to ``call_llm``.

    Returns:
        None. Progress is reported on stdout.
    """
    print(f"Processing: {file_path}")

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    frontmatter, body, _ = extract_frontmatter(content)

    # Anything that is not a mapping cannot be updated field by field.
    if not isinstance(frontmatter, dict):
        print(" ⚠️ No frontmatter found, skipping")
        return

    # ``tags:`` with a single empty list item parses to [None]; treat that
    # the same as having no tags at all.
    existing_tags = frontmatter.get('tags')
    needs_tags = not existing_tags or existing_tags == [None]
    needs_seo_title = not frontmatter.get('seo-title')
    needs_seo_desc = not frontmatter.get('seo-description')
    needs_seo_keywords = not frontmatter.get('seo-keywords')

    if not (needs_tags or needs_seo_title or needs_seo_desc or needs_seo_keywords):
        print(" ✓ All fields already populated, skipping")
        return

    # Only the first 20000 characters of the body are sent to the model.
    llm_response = call_llm(None, body[:20000], taxonomy)

    if not isinstance(llm_response, dict) or not llm_response:
        print(" ✗ Failed to get LLM response")
        return

    updated = False

    if needs_tags:
        taxonomy_tags = _clean_str_list(llm_response.get('tags_from_taxonomy'))
        suggestions = [
            tag
            for tag in _clean_str_list(llm_response.get('new_tag_suggestions'))
            if tag not in taxonomy_tags
        ]

        # Taxonomy tags take priority; at most 5 tags are kept in total.
        all_tags = (taxonomy_tags + suggestions)[:5]
        # Report only the suggestions that survived the 5-tag limit.
        kept_suggestions = all_tags[len(taxonomy_tags):]

        if all_tags:
            frontmatter['tags'] = all_tags
            updated = True
            print(f" + Added tags: {', '.join(all_tags)}")
            if kept_suggestions:
                print(f" (New suggestions: {', '.join(kept_suggestions)})")

    if needs_seo_title:
        seo_title = llm_response.get('seo_title')
        if isinstance(seo_title, str) and seo_title.strip():
            frontmatter['seo-title'] = seo_title.strip()
            updated = True
            print(f" + Added SEO title: {seo_title.strip()}")

    if needs_seo_desc:
        seo_desc = llm_response.get('seo_description')
        if isinstance(seo_desc, str) and seo_desc.strip():
            frontmatter['seo-description'] = seo_desc.strip()
            updated = True
            print(" + Added SEO description")

    if needs_seo_keywords:
        seo_keywords = _clean_str_list(llm_response.get('seo_keywords'))
        if seo_keywords:
            frontmatter['seo-keywords'] = seo_keywords
            updated = True
            print(f" + Added {len(seo_keywords)} SEO keywords")

    if updated:
        new_content = reconstruct_markdown(frontmatter, body)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(new_content)
        print(" ✓ Updated successfully")
    else:
        print(" - No updates needed")


def _clean_str_list(value):
    """Return the non-empty strings in *value*, stripped and de-duplicated in order."""
    # The model may return a single string, null, or a list with stray
    # non-string items; none of those should crash the caller.
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []

    seen = set()
    cleaned = []
    for item in value:
        if not isinstance(item, str):
            continue
        item = item.strip()
        if item and item not in seen:
            seen.add(item)
            cleaned.append(item)
    return cleaned
|
|
|
|
def main():
    """Tag every markdown note in ``NOTES_FOLDER``.

    Loads the taxonomy file located next to this script, then runs
    ``process_note`` on each ``*.md`` file directly inside the notes folder.
    Exits with status 1 when the taxonomy file or notes folder is missing
    (or the folder is not a directory), and with status 0 when the folder
    contains no markdown files.
    """
    # Load taxonomy
    taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
    if not taxonomy_path.exists():
        print(f"Error: Taxonomy file not found at {taxonomy_path}")
        print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
        sys.exit(1)

    taxonomy = load_taxonomy(taxonomy_path)
    print(f"Loaded {len(taxonomy)} tags from taxonomy\n")

    # Use the configured notes folder
    target_path = Path(NOTES_FOLDER)

    if not target_path.exists():
        print(f"Error: Notes folder not found: {target_path}")
        sys.exit(1)

    if not target_path.is_dir():
        print(f"Error: {target_path} is not a directory")
        sys.exit(1)

    # Sorted so runs are deterministic; glob order is otherwise arbitrary.
    md_files = sorted(target_path.glob('*.md'))

    if not md_files:
        print(f"No markdown files found in {target_path}")
        sys.exit(0)

    print(f"Processing all markdown files in: {target_path}")
    print(f"Found {len(md_files)} markdown files\n")

    for md_file in md_files:
        try:
            process_note(md_file, taxonomy)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            # One unreadable or malformed note must not abort the batch.
            print(f" ✗ Could not process {md_file}: {e}")
        print()  # Blank line between files

    print("\n✓ Processing complete!")
|
|
|
|
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()