commit 22f9304be694f806682644b4d1151e75c02d5125
Author: Zhongwei Li
Date:   Sun Nov 30 09:04:59 2025 +0800

    Initial commit

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
new file mode 100644
index 0000000..17de860
--- /dev/null
+++ b/.claude-plugin/plugin.json
@@ -0,0 +1,12 @@
+{
+  "name": "youtube-to-markdown",
+  "description": "Transform a YouTube video into storable knowledge. Get a tight summary, a cleaned transcript broken into chapters and paragraphs, timestamp links back to the original video, and notable content highlighted. You might be able to skip watching the video entirely.",
+  "version": "1.0.0",
+  "author": {
+    "name": "Ville Reijonen",
+    "email": "marketplace@vre.iki.fi"
+  },
+  "skills": [
+    "./"
+  ]
+}
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..3fc516d
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2025 Ville Reijonen
+Copyright (c) 2025 Michał Parkoła / Tapestry Skills Contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6fd60fd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# youtube-to-markdown
+
+Transform a YouTube video into storable knowledge. Get a tight summary, a cleaned transcript broken into chapters and paragraphs, timestamp links back to the original video, and notable content highlighted. You might be able to skip watching the video entirely.
diff --git a/SKILL.md b/SKILL.md
new file mode 100644
index 0000000..49da794
--- /dev/null
+++ b/SKILL.md
@@ -0,0 +1,169 @@
+---
+name: youtube-to-markdown
+description: Use when the user asks to extract, get, or fetch YouTube video transcripts, subtitles, or captions. Writes video details and the transcription into a structured markdown file.
+allowed-tools:
+  - Bash
+  - Read
+  - Write
+  - Task
+  - AskUserQuestion
+  - Skill
+---
+
+# YouTube to Markdown
+
+Execute all steps sequentially without asking for user approval. Use TodoWrite to track progress.
+
+## Step 0: Ask about comment analysis
+
+If not clear from the user's request, ask:
+
+```
+AskUserQuestion:
+- question: "Would you like to analyze comments after extracting the video transcript?"
+- header: "Comments"
+- options:
+  1. label: "Yes, analyze comments"
+     description: "After video extraction, run youtube-comment-analysis for cross-analysis with video summary"
+  2. label: "No, video only"
+     description: "Extract only video transcript and metadata"
+```
+
+Note the user's choice for Step 9.
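+
+For orientation, Steps 1-3 below might unfold like this sketch, assuming a hypothetical video ID `VIDEO_ID`, output directory `out`, and an English transcript (a real run substitutes the values discovered along the way):
+
+```bash
+# Hypothetical invocation sequence; see Steps 1-3 for the authoritative forms
+python3 extract_data.py "https://www.youtube.com/watch?v=VIDEO_ID" "out"
+python3 extract_transcript.py "https://www.youtube.com/watch?v=VIDEO_ID" "out" "en"
+python3 deduplicate_vtt.py "out/youtube_VIDEO_ID_transcript.vtt" "out/youtube_VIDEO_ID_transcript_dedup.md"
+```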
+
+## Step 1: Extract data (metadata, description, chapters)
+
+```bash
+python3 extract_data.py "<YOUTUBE_URL>" "<OUTPUT_DIR>"
+```
+
+Script extracts video ID from URL and creates: youtube_{VIDEO_ID}_metadata.md, youtube_{VIDEO_ID}_description.md, youtube_{VIDEO_ID}_chapters.json
+
+**IMPORTANT**: If you ask which language of transcript to extract, do not translate that language to English, and instruct subagents not to translate either. Translate only if the user requests a language other than the original.
+
+## Step 2: Extract transcript
+
+### Primary method (if transcript available)
+
+If video language is `en`, proceed directly. If non-English, ask user which language to download.
+
+```bash
+python3 extract_transcript.py "<YOUTUBE_URL>" "<OUTPUT_DIR>" "<SUBTITLE_LANG>"
+```
+
+Script creates: youtube_{VIDEO_ID}_transcript.vtt
+
+**IMPORTANT**: All file output must be in the same language as discovered in Step 2. If the language is not English, explicitly instruct all subagents to preserve the original language.
+
+The download may fail if a video is private, age-restricted, or geo-blocked.
+
+### Fallback (only if transcript unavailable)
+
+Ask user: "No transcript available. Proceed with Whisper transcription?
+- Mac/Apple Silicon: Uses MLX Whisper if installed (faster, see SETUP_MLX_WHISPER.md)
+- All platforms: Falls back to OpenAI Whisper (requires: brew install openai-whisper OR pip3 install openai-whisper)"
+
+```bash
+python3 extract_transcript_whisper.py "<YOUTUBE_URL>" "<OUTPUT_DIR>"
+```
+
+Script auto-detects MLX Whisper on Mac and uses it if available, otherwise uses OpenAI Whisper.
+
+## Step 3: Deduplicate transcript
+
+Set BASE_NAME from Step 1 output (youtube_{VIDEO_ID}).
+
+```bash
+python3 deduplicate_vtt.py "<OUTPUT_DIR>/${BASE_NAME}_transcript.vtt" "<OUTPUT_DIR>/${BASE_NAME}_transcript_dedup.md"
+cut -c 16- <OUTPUT_DIR>/${BASE_NAME}_transcript_dedup.md > <OUTPUT_DIR>/${BASE_NAME}_transcript_no_timestamps.txt
+```
+
+## Step 4: Add natural paragraph breaks
+
+Runs in parallel with Step 5.
+
+task_tool:
+- subagent_type: "general-purpose"
+- prompt:
+```
+Analyze <OUTPUT_DIR>/${BASE_NAME}_transcript_no_timestamps.txt and identify natural paragraph break line numbers.
+
+Read <OUTPUT_DIR>/${BASE_NAME}_chapters.json. If it contains chapters, use chapter timestamps as primary break points.
+
+Target ~500 chars per paragraph. Find natural break points at topic shifts or sentence endings.
+
+Return format:
+BREAKS: 15,42,78,103,...
+```
+
+```bash
+python3 ./apply_paragraph_breaks.py "<OUTPUT_DIR>/${BASE_NAME}_transcript_dedup.md" "<OUTPUT_DIR>/${BASE_NAME}_transcript_paragraphs.md" "<BREAKS>"
+```
+
+## Step 5: Summarize transcript
+
+Runs in parallel with Step 4.
+
+task_tool:
+- subagent_type: "general-purpose"
+- prompt:
+```
+Summarize <OUTPUT_DIR>/${BASE_NAME}_transcript_no_timestamps.txt. No fluff; it is NOT a document. Aim for ~10% of the original length, at most 1500 characters. Write to <OUTPUT_DIR>/${BASE_NAME}_summary.md:
+**TL;DR**: [1 sentence core insight, do not repeat later]
+
+[skip a question if it would repeat or cover non-essential content]
+**What**:
+**Where**:
+**When**:
+**Why**:
+**How**:
+**What Then**:
+
+**Hidden Gems**:
+- [any insights hiding under the main story]
+```
+
+## Step 6: Clean speech artifacts
+
+task_tool:
+- subagent_type: "general-purpose"
+- model: "haiku"
+- prompt:
+```
+Read <OUTPUT_DIR>/${BASE_NAME}_transcript_paragraphs.md and clean speech artifacts. Write to <OUTPUT_DIR>/${BASE_NAME}_transcript_cleaned.md.
+
+Tasks:
+- Remove fillers (um, uh, like, you know)
+- Fix transcription errors
+- Add proper punctuation
+- Trim redundant words or add implied words to improve flow
+- Preserve natural voice and tone
+- Keep timestamps at end of paragraphs
+```
+
+## Step 7: Add topic headings
+
+task_tool:
+- subagent_type: "general-purpose"
+- prompt:
+```
+Read <OUTPUT_DIR>/${BASE_NAME}_transcript_cleaned.md and add markdown headings. Write to <OUTPUT_DIR>/${BASE_NAME}_transcript.md.
+
+Read <OUTPUT_DIR>/${BASE_NAME}_chapters.json:
+- If it contains chapters: Use chapter names as ### headings at chapter timestamps, add #### headings for subtopics
+- If empty: Add ### headings where major topics change
+```
+
+## Step 8: Finalize and cleanup
+
+```bash
+python3 finalize.py "${BASE_NAME}" "<OUTPUT_DIR>"
+```
+
+Script uses template.md to create the final file by merging all component files (metadata, summary, description, transcript) and removes intermediate work files. Final output: `youtube - {title} ({video_id}).md`
+
+Use the `--debug` flag to keep intermediate work files for inspection.
+
+## Step 9: Chain to comment analysis (optional)
+
+If the user chose "Yes, analyze comments" in Step 0, run the youtube-comment-analysis Skill with the same YouTube URL.
diff --git a/apply_paragraph_breaks.py b/apply_paragraph_breaks.py
new file mode 100755
index 0000000..52863d6
--- /dev/null
+++ b/apply_paragraph_breaks.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# Apply paragraph breaks to deduplicated transcript
+# Usage: python3 apply_paragraph_breaks.py <input_file> <output_file> <break_points>
+# BREAKS format: "15,42,78,103" (comma-separated line numbers)
+
+import os
+import sys
+
+if len(sys.argv) != 4:
+    print("Usage: python3 apply_paragraph_breaks.py <input_file> <output_file> <break_points>", file=sys.stderr)
+    sys.exit(1)
+
+INPUT_FILE = sys.argv[1]
+OUTPUT_FILE = sys.argv[2]
+BREAK_POINTS_STR = sys.argv[3]
+
+try:
+    # Parse break points
+    break_points = [int(x.strip()) for x in BREAK_POINTS_STR.split(',')]
+    break_points_set = set(break_points)
+
+    # Read input file with timestamps
+    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
+        lines = [line.rstrip('\n') for line in f.readlines()]
+
+    # Parse timestamps
+    timestamps = []
+    texts = []
+    for line in lines:
+        if line.startswith('[') and len(line) > 15:
+            timestamp = line[:14]  # [00:00:00.080] is 14 chars
+            text = line[15:]  # Text starts at position 15
+            timestamps.append(timestamp)
+            texts.append(text)
+        else:
+            timestamps.append(None)
+            texts.append(line)
+
+    # Build paragraphs based on break points
+    paragraphs = []
+    current_paragraph = []
+    paragraph_start_timestamp = None
+
+    for i, text in enumerate(texts, start=1):
+        # Track first timestamp in paragraph
+        if timestamps[i-1] and not paragraph_start_timestamp:
+            paragraph_start_timestamp = timestamps[i-1]
+
+        # Add text
+        if text:
+            current_paragraph.append(text)
+
+        # Check if this is a break point
+        if i in break_points_set or i == len(texts):
+            # Finish current paragraph
+            if current_paragraph and paragraph_start_timestamp:
+                paragraph_text = ' '.join(current_paragraph)
+                paragraphs.append(f"{paragraph_text} {paragraph_start_timestamp}")
+                current_paragraph = []
+                paragraph_start_timestamp = None
+
+    # Validate output
+    if not paragraphs:
+        print(f"ERROR: No paragraphs created from {INPUT_FILE}", file=sys.stderr)
+        sys.exit(1)
+
+    # Write to output file
+    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
+        for para in paragraphs:
+            f.write(para + '\n\n')
+
+    print(f"SUCCESS: Created {len(paragraphs)} paragraphs -> {OUTPUT_FILE}")
+
+except FileNotFoundError:
+    print(f"ERROR: {INPUT_FILE} not found", file=sys.stderr)
+    sys.exit(1)
+except ValueError as e:
+    print(f"ERROR: Invalid break points format: {e}", file=sys.stderr)
+    sys.exit(1)
+except Exception as e:
+    print(f"ERROR: {str(e)}", file=sys.stderr)
+    sys.exit(1)
diff --git a/deduplicate_vtt.py b/deduplicate_vtt.py
new file mode 100755
index 0000000..41a7c5e
--- /dev/null
+++ b/deduplicate_vtt.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+# Deduplicate VTT (removes duplicate lines from auto-generated captions)
+# Usage: python3 deduplicate_vtt.py <vtt_file> <output_file>
+# Output format: [00:00:01.000] Text here
+
+import os
+import re
+import sys
+
+if len(sys.argv) != 3:
+    print("Usage: python3 deduplicate_vtt.py <vtt_file> <output_file>", file=sys.stderr)
+    sys.exit(1)
+
+VTT_FILE = sys.argv[1]
+OUTPUT_FILE = sys.argv[2]
+
+seen = set()
+current_timestamp = None
+output_lines = []
+
+try:
+    with open(VTT_FILE, 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+
+            # Skip headers
+            if line.startswith('WEBVTT') or line.startswith('Kind:') or line.startswith('Language:'):
+                continue
+
+            # Capture timestamp (start time only)
+            if '-->' in line:
+                current_timestamp = line.split('-->')[0].strip()
+                continue
+
+            # Process text with deduplication
+            if line and current_timestamp:
+                # Strip inline cue tags, then decode common HTML entities
+                clean = re.sub('<[^>]*>', '', line)
+                clean = clean.replace('&amp;', '&').replace('&gt;', '>').replace('&lt;', '<')
+
+                if clean and clean not in seen:
+                    output_lines.append(f'[{current_timestamp}] {clean}')
+                    seen.add(clean)
+
+    # Validate output
+    if not output_lines:
+        print(f"ERROR: No text extracted from {VTT_FILE}", file=sys.stderr)
+        sys.exit(1)
+
+    # Write to output file
+    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
+        f.write('\n'.join(output_lines))
+
+    # Verify file was created
+    if not os.path.exists(OUTPUT_FILE):
+        print(f"ERROR: Failed to create {OUTPUT_FILE}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"SUCCESS: {OUTPUT_FILE} ({len(output_lines)} lines)")
+
+except FileNotFoundError:
+    print(f"ERROR: {VTT_FILE} not found", file=sys.stderr)
+    sys.exit(1)
+except Exception as e:
+    print(f"ERROR: {str(e)}", file=sys.stderr)
+    sys.exit(1)
diff --git a/extract_data.py b/extract_data.py
new file mode 100755
index 0000000..64e7c66
--- /dev/null
+++ b/extract_data.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+"""
+Extracts YouTube video data: metadata, description, and chapters
+Usage: extract_data.py <YOUTUBE_URL> <OUTPUT_DIR>
+Output: Creates youtube_{VIDEO_ID}_metadata.md, youtube_{VIDEO_ID}_description.md, youtube_{VIDEO_ID}_chapters.json
+"""
+
+import json
+import sys
+import os
+import subprocess
+import re
+from datetime import datetime
+
+def extract_video_id(url):
+    """Extract video ID from YouTube URL"""
+    # Handle youtu.be format
+    if 'youtu.be/' in url:
+        video_id = url.split('youtu.be/')[-1].split('?')[0]
+        return video_id
+
+    # Handle youtube.com format with v= parameter
+    match = re.search(r'[?&]v=([^&]+)', url)
+    if match:
+        return match.group(1)
+
+    return None
+
+def check_yt_dlp():
+    """Check if yt-dlp is installed"""
+    try:
+        subprocess.run(['yt-dlp', '--version'], capture_output=True, check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        print("ERROR: yt-dlp is not installed", file=sys.stderr)
+        print("Install options:", file=sys.stderr)
+        print("  - macOS: brew install yt-dlp", file=sys.stderr)
+        print("  - Ubuntu/Debian: sudo apt update && sudo apt install -y yt-dlp", file=sys.stderr)
+        print("  - All systems: pip3 install yt-dlp", file=sys.stderr)
+        sys.exit(1)
+
+def fetch_video_data(video_url, output_dir):
+    """Fetch video metadata from YouTube"""
+    temp_json = os.path.join(output_dir, "video_data.json")
+    
try: + with open(temp_json, 'w') as f: + result = subprocess.run( + ['yt-dlp', '--dump-single-json', '--skip-download', video_url], + stdout=f, + stderr=subprocess.PIPE, + text=True + ) + if result.returncode != 0: + print("ERROR: Failed to extract video metadata", file=sys.stderr) + if os.path.exists(temp_json): + os.remove(temp_json) + sys.exit(1) + except Exception as e: + print(f"ERROR: Failed to extract video metadata: {e}", file=sys.stderr) + if os.path.exists(temp_json): + os.remove(temp_json) + sys.exit(1) + + try: + with open(temp_json, 'r') as f: + data = json.load(f) + except Exception as e: + print(f"ERROR: Failed to read JSON: {e}", file=sys.stderr) + if os.path.exists(temp_json): + os.remove(temp_json) + sys.exit(1) + finally: + if os.path.exists(temp_json): + os.remove(temp_json) + + return data + +def format_upload_date(upload_date): + """Format upload date from YYYYMMDD to YYYY-MM-DD""" + if upload_date != 'Unknown' and len(str(upload_date)) == 8: + return f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:]}" + return upload_date + +def format_subscribers(subscribers): + """Format subscriber count""" + if isinstance(subscribers, int): + return f"{subscribers:,} subscribers" + return f"{subscribers} subscribers" + +def format_duration(duration): + """Format duration from seconds to HH:MM:SS or MM:SS""" + if duration: + hours = duration // 3600 + minutes = (duration % 3600) // 60 + seconds = duration % 60 + if hours > 0: + return f"{hours:02d}:{minutes:02d}:{seconds:02d}" + else: + return f"{minutes:02d}:{seconds:02d}" + return "Unknown" + +def create_metadata_file(data, base_name, output_dir): + """Create metadata file with video origin info""" + filename = os.path.join(output_dir, f"{base_name}_metadata.md") + title = data.get('title', 'Untitled') + + # Save title to separate file for finalize.py to use in filename + title_file = os.path.join(output_dir, f"{base_name}_title.txt") + with open(title_file, 'w', encoding='utf-8') as tf: + tf.write(title) + link = data.get('webpage_url', 'N/A') + channel = data.get('uploader', 'Unknown') + channel_url = data.get('channel_url', data.get('uploader_url', '')) + subscribers = data.get('channel_follower_count', 'N/A') + upload_date = data.get('upload_date', 'Unknown') + view_count = data.get('view_count', 0) + like_count = data.get('like_count', 0) + duration = data.get('duration', 0) + + upload_date = format_upload_date(upload_date) + extraction_date = datetime.now().strftime('%Y-%m-%d') + sub_text = format_subscribers(subscribers) + duration_text = format_duration(duration) + views_text = f"{view_count:,}" if view_count else "0" + likes_text = f"{like_count:,}" if like_count else "0" + + with open(filename, 'w', encoding='utf-8') as md: + md.write(f"- **Title:** [{title}]({link})\n") + if channel_url: + md.write(f"- **Channel:** [{channel}]({channel_url}) ({sub_text})\n") + else: + md.write(f"- **Channel:** {channel} ({sub_text})\n") + md.write(f"- **Views:** {views_text} | Likes: {likes_text} | Duration: {duration_text}\n") + md.write(f"- **Published:** {upload_date} | Extracted: {extraction_date}\n") + + print(f"SUCCESS: {filename}") + return filename + +def create_description_file(data, base_name, output_dir): + """Create description file""" + filename = os.path.join(output_dir, f"{base_name}_description.md") + description = data.get('description', 'No description') + + with open(filename, 'w', encoding='utf-8') as f: + f.write(description) + + print(f"SUCCESS: {filename}") + return filename + +def 
create_chapters_file(data, base_name, output_dir):
+    """Create chapters JSON file"""
+    chapters = data.get('chapters', [])
+    chapters_file = os.path.join(output_dir, f"{base_name}_chapters.json")
+
+    with open(chapters_file, 'w', encoding='utf-8') as cf:
+        json.dump(chapters if chapters else [], cf, indent=2)
+
+    if chapters:
+        print(f"CHAPTERS: {chapters_file}")
+    else:
+        print(f"CHAPTERS: {chapters_file} (no chapters in video)")
+
+    return chapters_file
+
+def main():
+    # Parse arguments
+    if len(sys.argv) != 3:
+        print("Usage: extract_data.py <YOUTUBE_URL> <OUTPUT_DIR>", file=sys.stderr)
+        sys.exit(1)
+
+    video_url = sys.argv[1]
+    output_dir = sys.argv[2]
+
+    # Validate arguments
+    if not video_url:
+        print("ERROR: No YouTube URL provided", file=sys.stderr)
+        sys.exit(1)
+
+    # Check required commands
+    check_yt_dlp()
+
+    # Extract video ID from URL
+    video_id = extract_video_id(video_url)
+    if not video_id:
+        print("ERROR: Could not extract video ID from URL", file=sys.stderr)
+        sys.exit(1)
+
+    base_name = f"youtube_{video_id}"
+
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Fetch video data from YouTube
+    data = fetch_video_data(video_url, output_dir)
+
+    # Create output files
+    create_metadata_file(data, base_name, output_dir)
+    create_description_file(data, base_name, output_dir)
+    create_chapters_file(data, base_name, output_dir)
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(f"ERROR: {str(e)}", file=sys.stderr)
+        sys.exit(1)
diff --git a/extract_transcript.py b/extract_transcript.py
new file mode 100755
index 0000000..e441652
--- /dev/null
+++ b/extract_transcript.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+"""
+Detects video language, lists available subtitles, tries manual subtitles first, falls back to auto-generated
+Usage: extract_transcript.py <YOUTUBE_URL> <OUTPUT_DIR> [SUBTITLE_LANG]
+Output: SUCCESS: youtube_{VIDEO_ID}_transcript.vtt or ERROR: No subtitles available
+"""
+
+import sys
+import os
+import subprocess
+import re
+import glob
+
+def extract_video_id(url):
+    """Extract video ID from YouTube URL"""
+    # Handle youtu.be format
+    if 'youtu.be/' in url:
+        video_id = url.split('youtu.be/')[-1].split('?')[0]
+        return video_id
+
+    # Handle youtube.com format with v= parameter
+    match = re.search(r'[?&]v=([^&]+)', url)
+    if match:
+        return match.group(1)
+
+    return None
+
+def check_yt_dlp():
+    """Check if yt-dlp is installed"""
+    try:
+        subprocess.run(['yt-dlp', '--version'], capture_output=True, check=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        print("ERROR: yt-dlp is not installed", file=sys.stderr)
+        sys.exit(1)
+
+def get_video_language(youtube_url):
+    """Get video language from YouTube"""
+    result = subprocess.run(
+        ['yt-dlp', '--print', '%(language)s', youtube_url],
+        capture_output=True,
+        text=True
+    )
+    video_lang = result.stdout.strip() if result.returncode == 0 else "unknown"
+    print(f"Video language: {video_lang}")
+    return video_lang
+
+def download_manual_subtitles(youtube_url, subtitle_lang, output_name):
+    """Try to download manual subtitles"""
+    subprocess.run(
+        ['yt-dlp', '--write-sub', '--sub-langs', subtitle_lang, '--skip-download', '--output', output_name, youtube_url],
+        capture_output=True
+    )
+    temp_files = glob.glob(f"{output_name}.*.vtt")
+    if temp_files:
+        print(f"Manual subtitles downloaded ({subtitle_lang})")
+        return temp_files[0]
+    return None
+
+def download_auto_subtitles(youtube_url, subtitle_lang, output_name):
+    """Try to download auto-generated subtitles"""
+    subprocess.run(
+        
['yt-dlp', '--write-auto-sub', '--sub-langs', subtitle_lang, '--skip-download', '--output', output_name, youtube_url],
+        capture_output=True
+    )
+    temp_files = glob.glob(f"{output_name}.*.vtt")
+    if temp_files:
+        print(f"Auto-generated subtitles downloaded ({subtitle_lang})")
+        return temp_files[0]
+    return None
+
+def download_subtitles(youtube_url, subtitle_lang, output_name):
+    """Download subtitles, trying manual first then auto-generated"""
+    # Try manual subtitles first
+    subtitle_file = download_manual_subtitles(youtube_url, subtitle_lang, output_name)
+    if subtitle_file:
+        return subtitle_file
+
+    # Fall back to auto-generated
+    subtitle_file = download_auto_subtitles(youtube_url, subtitle_lang, output_name)
+    if subtitle_file:
+        return subtitle_file
+
+    print(f"ERROR: No subtitles available for language: {subtitle_lang}", file=sys.stderr)
+    sys.exit(1)
+
+def rename_subtitle_file(temp_file, final_output):
+    """Rename temporary subtitle file to final output name"""
+    try:
+        os.rename(temp_file, final_output)
+    except Exception as e:
+        print(f"ERROR: Failed to rename transcript file: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    if not os.path.exists(final_output):
+        print(f"ERROR: {final_output} not created", file=sys.stderr)
+        sys.exit(1)
+
+    return final_output
+
+def main():
+    # Parse arguments
+    youtube_url = sys.argv[1] if len(sys.argv) > 1 else None
+    output_dir = sys.argv[2] if len(sys.argv) > 2 else "."
+    subtitle_lang = sys.argv[3] if len(sys.argv) > 3 else "en"
+
+    # Validate arguments
+    if not youtube_url:
+        print("ERROR: No YouTube URL provided", file=sys.stderr)
+        sys.exit(1)
+
+    # Check required commands
+    check_yt_dlp()
+
+    # Extract video ID from URL
+    video_id = extract_video_id(youtube_url)
+    if not video_id:
+        print("ERROR: Could not extract video ID from URL", file=sys.stderr)
+        sys.exit(1)
+
+    base_name = f"youtube_{video_id}"
+
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Get video language
+    get_video_language(youtube_url)
+
+    # Download subtitles
+    output_name = os.path.join(output_dir, f"{base_name}_transcript_temp")
+    final_output = os.path.join(output_dir, f"{base_name}_transcript.vtt")
+    temp_file = download_subtitles(youtube_url, subtitle_lang, output_name)
+
+    # Rename to final filename
+    final_file = rename_subtitle_file(temp_file, final_output)
+
+    print(f"SUCCESS: {final_file}")
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(f"ERROR: {str(e)}", file=sys.stderr)
+        sys.exit(1)
diff --git a/extract_transcript_whisper.py b/extract_transcript_whisper.py
new file mode 100755
index 0000000..1f740e7
--- /dev/null
+++ b/extract_transcript_whisper.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+"""
+Fallback transcription using Whisper when no subtitles available
+Usage: extract_transcript_whisper.py [--mq|--hq] <YOUTUBE_URL> <OUTPUT_DIR>
+Options:
+  --mq   Use medium model (~5GB download)
+  --hq   Use large model for highest quality (slower, ~10GB download)
+  default: small model (~2GB download)
+
+On macOS: Uses MLX Whisper if available (faster, Apple Silicon optimized); it always uses the large-v3 model.
+Otherwise: Uses OpenAI Whisper.
+ +Output: SUCCESS: youtube_{VIDEO_ID}_transcript.vtt, Audio file: youtube_{VIDEO_ID}_audio.mp3 (ask user about deletion) +""" + +import sys +import os +import subprocess +import re + +def extract_video_id(url): + """Extract video ID from YouTube URL""" + # Handle youtu.be format + if 'youtu.be/' in url: + video_id = url.split('youtu.be/')[-1].split('?')[0] + return video_id + + # Handle youtube.com format with v= parameter + match = re.search(r'[?&]v=([^&]+)', url) + if match: + return match.group(1) + + return None + +def check_yt_dlp(): + """Check if yt-dlp is installed""" + try: + subprocess.run(['yt-dlp', '--version'], capture_output=True, check=True) + except (subprocess.CalledProcessError, FileNotFoundError): + print("ERROR: yt-dlp is not installed", file=sys.stderr) + sys.exit(1) + +def check_uv(): + """Check if uv is available""" + try: + subprocess.run(['uv', '-V'], capture_output=True, check=True) + return True + except (subprocess.CalledProcessError, FileNotFoundError): + return False + +def check_mlx_whisper(): + """Check if MLX Whisper is available, returns (variant, command_array)""" + # Try uv run mlx_whisper first + if check_uv(): + try: + subprocess.run(['uv', 'run', 'mlx_whisper', '--help'], capture_output=True, check=True, timeout=10) + return ('uv-mlx', ['uv', 'run', 'mlx_whisper']) + except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired): + pass + + # Try mlx-whisper command + try: + subprocess.run(['mlx-whisper', '--help'], capture_output=True, check=True) + return ('mlx-whisper', ['mlx-whisper']) + except (subprocess.CalledProcessError, FileNotFoundError): + pass + + return (None, None) + +def check_whisper(): + """Check if OpenAI Whisper is installed""" + try: + subprocess.run(['whisper', '--help'], capture_output=True, check=True) + except (subprocess.CalledProcessError, FileNotFoundError): + print("ERROR: Whisper is not installed", file=sys.stderr) + print("Install options:", file=sys.stderr) + print(" - macOS (slow): brew install openai-whisper", file=sys.stderr) + print(" - macOS (faster): brew install ffmpeg uv; uv venv .venv; source .venv/bin/activate; uv pip install mlx-whisper", file=sys.stderr) + print(" (Ask your AI what the above does. 
At the moment pipx, python 3.14, and mlx-whisper conflict, but uv works - 11/2025)", file=sys.stderr) + print(" - All systems: pip3 install openai-whisper", file=sys.stderr) + sys.exit(1) + +def check_audio_size(youtube_url): + """Get audio file size estimate""" + print("Checking audio file size...") + result = subprocess.run( + ['yt-dlp', '--print', '%(filesize,filesize_approx)s %(duration)s', youtube_url], + capture_output=True, + text=True + ) + if result.returncode == 0: + print(result.stdout.strip()) + +def download_audio(youtube_url, audio_file): + """Download audio from YouTube video""" + print("Downloading audio...") + result = subprocess.run( + ['yt-dlp', '-x', '--audio-format', 'mp3', '--output', audio_file, youtube_url], + capture_output=True, + text=True + ) + + if result.returncode != 0: + print("ERROR: Failed to download audio", file=sys.stderr) + sys.exit(1) + + if not os.path.exists(audio_file): + print("ERROR: Audio file not found", file=sys.stderr) + sys.exit(1) + +def get_video_language(youtube_url): + """Get video language from YouTube""" + result = subprocess.run( + ['yt-dlp', '--print', '%(language)s', youtube_url], + capture_output=True, + text=True + ) + video_lang = result.stdout.strip() if result.returncode == 0 else "unknown" + return video_lang + +def transcribe_with_whisper(audio_file, output_dir, mlx_command=None, quality='default'): + """Transcribe audio file with Whisper (MLX or OpenAI)""" + if mlx_command: + # MLX Whisper - default large-v3 as faster processing and better quality, supports other models + print(f"Transcribing with MLX Whisper (Apple Silicon optimized, large-v3 model) - this may take a while...") + command = mlx_command + model = 'mlx-community/whisper-large-v3-mlx' + # MLX uses hyphens in arguments + output_format_arg = '--output-format' + output_dir_arg = '--output-dir' + else: + # OpenAI Whisper - configurable quality + # All models are multilingual and support ~99 languages + # Model sizes: tiny (~1GB, fast), base (~1GB, balanced), small (~2GB), medium (~5GB), large (~10GB, best) + command = ['whisper'] + if quality == 'hq': + model = 'large' + size_info = '~10GB' + elif quality == 'mq': + model = 'medium' + size_info = '~5GB' + else: + model = 'small' + size_info = '~2GB' + print(f"Transcribing with OpenAI Whisper model '{model}' ({size_info}) - this may take a while...") + # OpenAI uses underscores in arguments + output_format_arg = '--output_format' + output_dir_arg = '--output_dir' + + result = subprocess.run( + command + [audio_file, '--model', model, output_format_arg, 'vtt', output_dir_arg, output_dir], + capture_output=True, + text=True + ) + + if result.returncode != 0: + print(f"ERROR: {' '.join(command)} transcription failed", file=sys.stderr) + if os.path.exists(audio_file): + os.remove(audio_file) + sys.exit(1) + + if not mlx_command and quality == 'default': + print("NOTE: If transcription quality is poor, try --mq (medium, ~5GB) or --hq (large, ~10GB)") + +def rename_vtt_file(audio_file, base_name, output_dir): + """Rename VTT file to standard name""" + # Whisper creates VTT file with same name as audio file + audio_basename = os.path.splitext(os.path.basename(audio_file))[0] + vtt_file = os.path.join(output_dir, f"{audio_basename}.vtt") + final_vtt = os.path.join(output_dir, f"{base_name}_transcript.vtt") + + if os.path.exists(vtt_file): + try: + os.rename(vtt_file, final_vtt) + except Exception as e: + print(f"ERROR: Failed to rename VTT file: {e}", file=sys.stderr) + if os.path.exists(audio_file): + os.remove(audio_file) 
+            sys.exit(1)
+
+        print(f"SUCCESS: {final_vtt}")
+        print(f"Audio file: {audio_file} (delete with: rm {audio_file})")
+        return final_vtt
+    else:
+        print("ERROR: VTT file not created", file=sys.stderr)
+        if os.path.exists(audio_file):
+            os.remove(audio_file)
+        sys.exit(1)
+
+def main():
+    # Parse options
+    quality = 'default'
+    args = []
+    for arg in sys.argv[1:]:
+        if arg == '--hq':
+            quality = 'hq'
+        elif arg == '--mq':
+            quality = 'mq'
+        else:
+            args.append(arg)
+
+    # Parse arguments
+    if len(args) != 2:
+        print("Usage: extract_transcript_whisper.py [--mq|--hq] <YOUTUBE_URL> <OUTPUT_DIR>", file=sys.stderr)
+        sys.exit(1)
+
+    youtube_url = args[0]
+    output_dir = args[1]
+
+    # Validate arguments
+    if not youtube_url:
+        print("ERROR: No YouTube URL provided", file=sys.stderr)
+        sys.exit(1)
+
+    # Check required commands
+    check_yt_dlp()
+
+    # Check which Whisper variant is available
+    mlx_variant, mlx_command = check_mlx_whisper()
+    if mlx_variant:
+        print(f"Using MLX Whisper via {mlx_variant} (Apple Silicon optimized)")
+        if quality != 'default':
+            print("NOTE: Quality flags (--mq, --hq) are ignored with MLX Whisper (always uses large-v3)")
+    else:
+        check_whisper()
+        print("Using OpenAI Whisper")
+        mlx_command = None
+
+    # Extract video ID from URL
+    video_id = extract_video_id(youtube_url)
+    if not video_id:
+        print("ERROR: Could not extract video ID from URL", file=sys.stderr)
+        sys.exit(1)
+
+    base_name = f"youtube_{video_id}"
+
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Check audio size
+    check_audio_size(youtube_url)
+
+    # Get video language
+    video_lang = get_video_language(youtube_url)
+    print(f"Video language: {video_lang}")
+
+    # Download audio
+    audio_file = os.path.join(output_dir, f"{base_name}_audio.mp3")
+    download_audio(youtube_url, audio_file)
+
+    # Transcribe with Whisper (MLX or OpenAI variant)
+    transcribe_with_whisper(audio_file, output_dir, mlx_command=mlx_command, quality=quality)
+
+    # Rename VTT file
+    rename_vtt_file(audio_file, base_name, output_dir)
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(f"ERROR: {str(e)}", file=sys.stderr)
+        sys.exit(1)
diff --git a/finalize.py b/finalize.py
new file mode 100755
index 0000000..2065464
--- /dev/null
+++ b/finalize.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""
+Creates final markdown file from template and component files, cleans up intermediate work files
+Usage: finalize.py [--debug] <BASE_NAME> [<OUTPUT_DIR>]
+Keeps: youtube - {title} ({video_id}).md (or {BASE_NAME}.md if no title was saved)
+Removes: all intermediate files including _metadata.md, _summary.md, _description.md, _transcript.md (unless --debug)
+"""
+
+import sys
+import os
+import re
+
+def clean_title_for_filename(title, max_length=60):
+    """Clean title for use in filename"""
+    # Remove or replace problematic characters
+    cleaned = re.sub(r'[<>:"/\\|?*]', '', title)  # Remove invalid filename chars
+    cleaned = re.sub(r'\s+', ' ', cleaned)  # Normalize whitespace
+    cleaned = cleaned.strip()
+
+    # Truncate if too long
+    if len(cleaned) > max_length:
+        cleaned = cleaned[:max_length].rsplit(' ', 1)[0]  # Cut at word boundary
+
+    return cleaned
+
+def read_file_or_empty(file_path):
+    """Read file content or return empty string if file doesn't exist"""
+    if os.path.exists(file_path):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return f.read()
+    return ""
+
+def main():
+    # Parse options
+    debug = False
+    args = []
+    for arg in sys.argv[1:]:
+        if arg == '--debug':
+            debug = True
+        else:
+            args.append(arg)
+
+    # Parse arguments
+    if len(args) < 1:
+        print("ERROR: No BASE_NAME provided", file=sys.stderr)
+        print("Usage: finalize.py [--debug] <BASE_NAME> [<OUTPUT_DIR>]", file=sys.stderr)
+        sys.exit(1)
+
+    base_name = args[0]
+    output_dir = args[1] if len(args) > 1 else "."
+
+    # Get script directory for template
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    template_file = os.path.join(script_dir, "template.md")
+
+    # Validate template exists
+    if not os.path.exists(template_file):
+        print(f"ERROR: {template_file} not found", file=sys.stderr)
+        sys.exit(1)
+
+    # Read template
+    with open(template_file, 'r', encoding='utf-8') as f:
+        template = f.read()
+
+    # Read component files
+    metadata = read_file_or_empty(os.path.join(output_dir, f"{base_name}_metadata.md"))
+    summary = read_file_or_empty(os.path.join(output_dir, f"{base_name}_summary.md"))
+    description = read_file_or_empty(os.path.join(output_dir, f"{base_name}_description.md"))
+    transcription = read_file_or_empty(os.path.join(output_dir, f"{base_name}_transcript.md"))
+
+    # Replace placeholders
+    final_content = template.replace("{metadata}", metadata.strip())
+    final_content = final_content.replace("{summary}", summary.strip())
+    final_content = final_content.replace("{description}", description.strip())
+    final_content = final_content.replace("{transcription}", transcription.strip())
+
+    # Read title and create human-readable filename
+    title = read_file_or_empty(os.path.join(output_dir, f"{base_name}_title.txt")).strip()
+    if title:
+        cleaned_title = clean_title_for_filename(title)
+        video_id = base_name.replace('youtube_', '')
+        final_filename = f"youtube - {cleaned_title} ({video_id}).md"
+    else:
+        # Fallback to old format if title not found
+        final_filename = f"{base_name}.md"
+
+    # Write final file
+    final_file = os.path.join(output_dir, final_filename)
+    with open(final_file, 'w', encoding='utf-8') as f:
+        f.write(final_content)
+
+    print(f"Created final file: {final_filename}")
+
+    # Clean up intermediate work files unless --debug is set
+    if debug:
+        print("Debug mode: keeping intermediate work files")
+    else:
+        work_files = [
+            f"{base_name}_title.txt",
+            f"{base_name}_metadata.md",
+            f"{base_name}_summary.md",
+            f"{base_name}_description.md",
+            f"{base_name}_chapters.json",
+            f"{base_name}_transcript.vtt",
+            f"{base_name}_transcript_dedup.md",
+            f"{base_name}_transcript_no_timestamps.txt",
+            f"{base_name}_transcript_paragraphs.md",
+            f"{base_name}_transcript_cleaned.md",
+            f"{base_name}_transcript.md"
+        ]
+
+        for work_file in work_files:
+            file_path = os.path.join(output_dir, work_file)
+            if os.path.exists(file_path):
+                os.remove(file_path)
+
+        print("Cleaned up intermediate work files")
+
+    print(f"Final file: {final_filename}")
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(f"ERROR: {str(e)}", file=sys.stderr)
+        sys.exit(1)
diff --git a/plugin.lock.json b/plugin.lock.json
new file mode 100644
index 0000000..6e35856
--- /dev/null
+++ b/plugin.lock.json
@@ -0,0 +1,77 @@
+{
+  "$schema": "internal://schemas/plugin.lock.v1.json",
+  "pluginId": "gh:vre/flow-state:youtube-to-markdown",
+  "normalized": {
+    "repo": null,
+    "ref": "refs/tags/v20251128.0",
+    "commit": "d453a2d4f1543874c246df807a17cd89b1466fbf",
+    "treeHash": "e11673d3a2a7c0c028bfec35eaaf7cfb95b28ba1f9829743f4ef1fcfcdcbb8d1",
+    "generatedAt": "2025-11-28T10:28:56.461463Z",
+    "toolVersion": "publish_plugins.py@0.2.0"
+  },
+  "origin": {
+    "remote": "git@github.com:zhongweili/42plugin-data.git",
+    "branch": "master",
+    "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
+    "repoRoot":
"/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "youtube-to-markdown", + "description": "Transform YouTube video to storagable knowledge. Get tight summary, cleaned transcript broken into chapters and paragraphs, timestamp links back to original video, and notable content highlighted. Might be that you can skip watching the video entirely.", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "template.md", + "sha256": "19b448513789c48be9b30eca0c714b07b57d62d87cc74a0b177ca6da3009bd93" + }, + { + "path": "LICENSE", + "sha256": "97219df5f45e2937df07dcd379629292e3dc5bf0a18456edab803a1ef093d383" + }, + { + "path": "deduplicate_vtt.py", + "sha256": "9d17a5e137164526a9e9913411071fae7c6fac04bac0c4999d126d19d5c5cbb6" + }, + { + "path": "finalize.py", + "sha256": "34a6adeb828c609a4ef0d8be2882fd0cafbb8a6617e526806b97446680fcbacd" + }, + { + "path": "apply_paragraph_breaks.py", + "sha256": "d7dd4164a02331ba5d3fd918941ae5da6ed6ac840338da02ede0e3239a80d64c" + }, + { + "path": "README.md", + "sha256": "1c05e81deb1c1a0e57e9596c47c86ebb16b7ac671e917912cba2c891be72b191" + }, + { + "path": "extract_transcript_whisper.py", + "sha256": "c750eda8419fea37f6ac99129570a99211bdbe8e57085c195efbe756475e52e0" + }, + { + "path": "SKILL.md", + "sha256": "f2545b169a0bdef7b29d17613b7736df52d0f86c29e94ac8a910c4e80686a611" + }, + { + "path": "extract_transcript.py", + "sha256": "a0ff59c12f8deabb3aad6af15940f41b8518c421774738fc6c6fa88a066df856" + }, + { + "path": "extract_data.py", + "sha256": "92a1ae183b06dd00f3b470c67fa2b2c49cd1a7c41befdf2da2561d00803a29c2" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "8b0f36bc304b4735db132a81858ab4e9d1bd68bd47e9338bd81e9dbfe2c7229e" + } + ], + "dirSha256": "e11673d3a2a7c0c028bfec35eaaf7cfb95b28ba1f9829743f4ef1fcfcdcbb8d1" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/template.md b/template.md new file mode 100644 index 0000000..33f337a --- /dev/null +++ b/template.md @@ -0,0 +1,15 @@ +## Video + +{metadata} + +## Summary + +{summary} + +## Description + +{description} + +## Transcription + +{transcription}