#!/usr/bin/env python3
"""
Note Tagging and SEO Metadata Script

Processes markdown notes using a local LLM to add tags and SEO metadata.
"""

import json
import os
import re
import sys
from pathlib import Path

import requests
import yaml

# Configuration
LM_STUDIO_URL = "http://192.168.68.84:1234/v1/chat/completions"
MODEL_NAME = "openai/gpt-oss-20b"
TAXONOMY_FILE = "tag-taxonomy.yaml"
NOTES_FOLDER = os.path.expanduser("~/Documents/ejl-zk/40 Public/41 Notes/")

# YAML frontmatter between two '---' delimiter lines at the very top of the
# file. Only spaces/tabs are allowed after the closing delimiter so blank
# lines at the start of the body are kept, and the trailing newline + body is
# optional so a note that ends right after the closing '---' still matches.
_FRONTMATTER_RE = re.compile(
    r'^---\s*\n(.*?)\n---[ \t]*(?:\n(.*))?$',
    re.DOTALL,
)

# A JSON object wrapped in a markdown code fence, with or without a "json" tag.
_JSON_FENCE_RE = re.compile(r'```(?:json)?\s*(\{.*?\})\s*```', re.DOTALL)


def _as_str_list(value):
    """Coerce *value* into a list of non-empty, stripped strings.

    ``None`` and unsupported types give ``[]``; a single string is treated as
    a one-element list; ``None`` items and blank items are dropped.
    """
    if value is None:
        return []
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, (list, tuple)):
        return []
    stripped = (str(item).strip() for item in value if item is not None)
    return [item for item in stripped if item]


def _as_text(value):
    """Return *value* stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ''


def _parse_llm_json(text):
    """Extract and decode the JSON object contained in an LLM reply.

    Looks for a fenced code block first; otherwise takes the span from the
    first ``{`` to the last ``}`` so surrounding prose does not break parsing.

    Raises:
        ValueError: if no valid JSON object can be decoded
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        TypeError: if *text* is not a string.
    """
    fenced = _JSON_FENCE_RE.search(text)
    if fenced:
        candidate = fenced.group(1)
    else:
        start, end = text.find('{'), text.rfind('}')
        candidate = text[start:end + 1] if 0 <= start < end else text
    parsed = json.loads(candidate)
    if not isinstance(parsed, dict):
        raise ValueError(
            f"expected a JSON object, got {type(parsed).__name__}"
        )
    return parsed


def load_taxonomy(taxonomy_path):
    """Load the tag taxonomy from a YAML file.

    Args:
        taxonomy_path: Path of a YAML file with a top-level ``tags`` list.

    Returns:
        The tags as a list of strings. The list is empty when the file is
        empty, is not a mapping, or has a missing or null ``tags`` entry.
    """
    with open(taxonomy_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty document, so guard before .get().
    if not isinstance(data, dict):
        return []
    return _as_str_list(data.get('tags'))


def extract_frontmatter(content):
    """Split a markdown document into YAML frontmatter and body.

    Args:
        content: Full text of the markdown file.

    Returns:
        A ``(frontmatter, body, frontmatter_str)`` tuple. ``frontmatter`` is
        the parsed mapping and ``frontmatter_str`` its raw YAML text. When no
        frontmatter block is found, or it does not parse to a mapping, the
        result is ``(None, content, None)``.

    Raises:
        yaml.YAMLError: if the frontmatter block is not valid YAML.
    """
    match = _FRONTMATTER_RE.match(content)
    if not match:
        return None, content, None

    frontmatter_str = match.group(1)
    body = match.group(2) or ''
    frontmatter = yaml.safe_load(frontmatter_str)
    # An empty block parses to None and a bare scalar/list is not usable as
    # metadata; report both as "no frontmatter" so callers can rely on a dict.
    if not isinstance(frontmatter, dict):
        return None, content, None
    return frontmatter, body, frontmatter_str


def reconstruct_markdown(frontmatter, body):
    """Rebuild a markdown document from a frontmatter mapping and a body.

    The frontmatter is re-serialized with ``yaml.dump`` (block style, unicode
    kept as-is, key order preserved), so the original formatting of the
    frontmatter text is not carried over.
    """
    frontmatter_str = yaml.dump(
        frontmatter,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )
    return f"---\n{frontmatter_str}---\n{body}"


def call_llm(prompt, note_content, taxonomy):
    """Call the LM Studio API to get tags and SEO metadata for a note.

    Args:
        prompt: Unused; kept so existing callers keep working.
        note_content: Markdown body of the note to analyze.
        taxonomy: Iterable of existing tag names offered to the model.

    Returns:
        The decoded JSON object (a dict) from the model's reply, or ``None``
        if the request fails, the response has an unexpected shape, or the
        reply does not contain a valid JSON object. Errors are printed.
    """
    taxonomy_str = ", ".join(taxonomy)

    system_prompt = f"""You are a helpful assistant that analyzes markdown notes and provides:
1. Tags from existing taxonomy (1-5 tags, prefer existing)
2. 1-2 NEW tag suggestions if content warrants it
3. Clean, concise SEO title (not clickbaity)
4. Clean, concise SEO description (150-160 chars, factual summary)
5. SEO keywords (be generous, 10-15 relevant keywords)

Existing tag taxonomy: {taxonomy_str}

Return ONLY valid JSON in this exact format:
{{
  "tags_from_taxonomy": ["tag1", "tag2"],
  "new_tag_suggestions": ["newtag1"],
  "seo_title": "Clear Title Here",
  "seo_description": "Concise factual summary of the note content.",
  "seo_keywords": ["keyword1", "keyword2", "keyword3"]
}}"""

    user_prompt = f"""Analyze this note and provide tags and SEO metadata:

{note_content}

Remember:
- Use 1-5 tags from taxonomy that fit best
- Suggest 1-2 NEW tags only if content really warrants it (be conservative)
- SEO title should be clear and informative, NOT clickbaity
- SEO description should be a clean factual summary (150-160 characters)
- SEO keywords can be generous (10-15 keywords)"""

    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.7,
        "max_tokens": 500,
    }

    try:
        response = requests.post(LM_STUDIO_URL, json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content']
        # LLMs sometimes wrap the JSON in a code fence or add prose around it.
        return _parse_llm_json(content)
    except (
        requests.RequestException,  # network / HTTP errors
        ValueError,                 # invalid JSON in response or reply
        KeyError, IndexError,       # response missing expected fields
        TypeError,                  # e.g. reply content is null
    ) as e:
        print(f"Error calling LLM: {e}")
        return None


def process_note(file_path, taxonomy):
    """Fill in missing tags and SEO metadata for a single note file.

    Reads the note, asks the LLM for metadata when any of ``tags``,
    ``seo-title``, ``seo-description`` or ``seo-keywords`` is empty, and
    writes the file back with only the empty fields populated. Notes without
    usable frontmatter, unreadable notes, and LLM failures are reported and
    skipped rather than raised.

    Args:
        file_path: Path of the markdown note.
        taxonomy: Existing tag names passed on to the LLM.
    """
    print(f"Processing: {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        frontmatter, body, _ = extract_frontmatter(content)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
        # One bad note must not abort the whole batch.
        print(f" ✗ Could not read note: {e}")
        return

    if frontmatter is None:
        print(" ⚠️ No frontmatter found, skipping")
        return

    # Work out which fields are still empty. A YAML "tags:\n  -" entry parses
    # to [None], which counts as empty too.
    existing_tags = frontmatter.get('tags', [])
    needs_tags = not existing_tags or existing_tags == [None]
    needs_seo_title = not frontmatter.get('seo-title')
    needs_seo_desc = not frontmatter.get('seo-description')
    needs_seo_keywords = not frontmatter.get('seo-keywords')

    if not (needs_tags or needs_seo_title or needs_seo_desc
            or needs_seo_keywords):
        print(" ✓ All fields already populated, skipping")
        return

    # Limit content to first 20000 chars
    llm_response = call_llm(None, body[:20000], taxonomy)
    if not llm_response:
        print(" ✗ Failed to get LLM response")
        return

    # Update frontmatter with new values (only if empty)
    updated = False

    if needs_tags:
        taxonomy_tags = _as_str_list(llm_response.get('tags_from_taxonomy'))
        new_suggestions = _as_str_list(
            llm_response.get('new_tag_suggestions')
        )
        # Taxonomy tags first, de-duplicated in order, capped at 5 in total.
        all_tags = list(dict.fromkeys(taxonomy_tags + new_suggestions))[:5]
        if all_tags:
            frontmatter['tags'] = all_tags
            updated = True
            print(f" + Added tags: {', '.join(all_tags)}")
            # Report only the suggestions that survived the 5-tag cap.
            kept_suggestions = [
                tag for tag in all_tags if tag not in taxonomy_tags
            ]
            if kept_suggestions:
                print(f" (New suggestions: {', '.join(kept_suggestions)})")

    if needs_seo_title:
        seo_title = _as_text(llm_response.get('seo_title'))
        if seo_title:
            frontmatter['seo-title'] = seo_title
            updated = True
            print(f" + Added SEO title: {seo_title}")

    if needs_seo_desc:
        seo_desc = _as_text(llm_response.get('seo_description'))
        if seo_desc:
            frontmatter['seo-description'] = seo_desc
            updated = True
            print(" + Added SEO description")

    if needs_seo_keywords:
        seo_keywords = _as_str_list(llm_response.get('seo_keywords'))
        if seo_keywords:
            frontmatter['seo-keywords'] = seo_keywords
            updated = True
            print(f" + Added {len(seo_keywords)} SEO keywords")

    if not updated:
        print(" - No updates needed")
        return

    # Serialize before opening the file so a dump failure cannot truncate it.
    new_content = reconstruct_markdown(frontmatter, body)
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(new_content)
    except OSError as e:
        print(f" ✗ Could not write note: {e}")
        return
    print(" ✓ Updated successfully")


def main():
    """Tag every markdown note in NOTES_FOLDER using the taxonomy file.

    The taxonomy is read from TAXONOMY_FILE next to this script. Exits with
    status 1 if the taxonomy file or notes folder is missing, and with
    status 0 if the folder contains no ``*.md`` files.
    """
    # Load taxonomy
    taxonomy_path = Path(__file__).parent / TAXONOMY_FILE
    if not taxonomy_path.exists():
        print(f"Error: Taxonomy file not found at {taxonomy_path}")
        print(f"Please create {TAXONOMY_FILE} in the same directory as this script")
        sys.exit(1)

    taxonomy = load_taxonomy(taxonomy_path)
    print(f"Loaded {len(taxonomy)} tags from taxonomy\n")

    # Use the hardcoded notes folder
    target_path = Path(NOTES_FOLDER)
    if not target_path.exists():
        print(f"Error: Notes folder not found: {target_path}")
        sys.exit(1)
    if not target_path.is_dir():
        print(f"Error: {target_path} is not a directory")
        sys.exit(1)

    # Process all markdown files in the directory (non-recursive); sorted so
    # the processing order is the same on every run.
    md_files = sorted(target_path.glob('*.md'))
    if not md_files:
        print(f"No markdown files found in {target_path}")
        sys.exit(0)

    print(f"Processing all markdown files in: {target_path}")
    print(f"Found {len(md_files)} markdown files\n")

    for md_file in md_files:
        process_note(md_file, taxonomy)
        print()  # Blank line between files

    print("\n✓ Processing complete!")


if __name__ == "__main__":
    main()