Code for YouTube Video Transcription Summarization with Python Tutorial

View on Github
import os
import re
import nltk
import pytube
import youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from heapq import nlargest
from urllib.parse import urlparse, parse_qs
import textwrap
from colorama import Fore, Back, Style, init
from openai import OpenAI

# Initialize colorama for cross-platform colored terminal output
init(autoreset=True)

# Download necessary NLTK data
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Initialize OpenAI client
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="<api_key>", # Add your OpenRouter API key here
)

def extract_video_id(youtube_url):
    """Extract the video ID from a YouTube URL."""
    parsed_url = urlparse(youtube_url)
    
    if parsed_url.netloc == 'youtu.be':
        return parsed_url.path[1:]
    
    if parsed_url.netloc in ('www.youtube.com', 'youtube.com'):
        if parsed_url.path == '/watch':
            return parse_qs(parsed_url.query)['v'][0]
        elif parsed_url.path.startswith('/embed/'):
            return parsed_url.path.split('/')[2]
        elif parsed_url.path.startswith('/v/'):
            return parsed_url.path.split('/')[2]
    
    # If no match found
    raise ValueError(f"Could not extract video ID from URL: {youtube_url}")

def get_transcript(video_id):
    """Get the transcript of a YouTube video."""
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return ' '.join([entry['text'] for entry in transcript])
    except Exception as e:
        return f"Error retrieving transcript: {str(e)}."

def summarize_text_nltk(text, num_sentences=5):
    """Summarize text using frequency-based extractive summarization with NLTK."""
    if not text or text.startswith("Error") or text.startswith("Transcript not available"):
        return text
    
    # Tokenize the text into sentences and words
    sentences = sent_tokenize(text)
    
    # If there are fewer sentences than requested, return all sentences
    if len(sentences) <= num_sentences:
        return text
    
    # Tokenize words and remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]
    
    # Calculate word frequencies
    freq = FreqDist(words)
    
    # Score sentences based on word frequencies
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        for word in word_tokenize(sentence.lower()):
            if word in freq:
                if i in sentence_scores:
                    sentence_scores[i] += freq[word]
                else:
                    sentence_scores[i] = freq[word]
    
    # Get the top N sentences with highest scores
    summary_sentences_indices = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary_sentences_indices.sort()  # Sort to maintain original order
    
    # Construct the summary
    summary = ' '.join([sentences[i] for i in summary_sentences_indices])
    return summary

def summarize_text_ai(text, video_title, num_sentences=5):
    """Summarize text using the Mistral AI model via OpenRouter."""
    if not text or text.startswith("Error") or text.startswith("Transcript not available"):
        return text
    
    # Truncate text if it's too long (models often have token limits)
    max_chars = 15000  # Adjust based on model's context window
    truncated_text = text[:max_chars] if len(text) > max_chars else text
    
    prompt = f"""Please provide a concise summary of the following YouTube video transcript.
Title: {video_title}

Transcript:
{truncated_text}

Create a clear, informative summary that captures the main points and key insights from the video.
Your summary should be approximately {num_sentences} sentences long.
"""
    
    try:
        completion = client.chat.completions.create(
            model="mistralai/mistral-small-3.1-24b-instruct:free",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ]
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error generating AI summary: {str(e)}"

def summarize_youtube_video(youtube_url, num_sentences=5):
    """Main function to summarize a YouTube video's transcription."""
    try:
        video_id = extract_video_id(youtube_url)
        transcript = get_transcript(video_id)
        
        # Get video title for context
        try:
            yt = pytube.YouTube(youtube_url)
            video_title = yt.title
            
        except Exception as e:
            video_title = "Unknown Title"

        
        # Generate both summaries
        print(Fore.YELLOW + f"Generating AI summary with {num_sentences} sentences...")
        ai_summary = summarize_text_ai(transcript, video_title, num_sentences)
        
        print(Fore.YELLOW + f"Generating NLTK summary with {num_sentences} sentences...")
        nltk_summary = summarize_text_nltk(transcript, num_sentences)
        
        return {
            "video_title": video_title,
            "video_id": video_id,
            "ai_summary": ai_summary,
            "nltk_summary": nltk_summary,
            "full_transcript_length": len(transcript.split()),
            "nltk_summary_length": len(nltk_summary.split()),
            "ai_summary_length": len(ai_summary.split()) if not ai_summary.startswith("Error") else 0
        }
    except Exception as e:
        return {"error": str(e)}

def format_time(seconds):
    """Convert seconds to a readable time format."""
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    
    if hours > 0:
        return f"{hours}h {minutes}m {seconds}s"
    elif minutes > 0:
        return f"{minutes}m {seconds}s"
    else:
        return f"{seconds}s"

def format_number(number):
    """Format large numbers with commas for readability."""
    return "{:,}".format(number)

def print_boxed_text(text, width=80, title=None, color=Fore.WHITE):
    """Print text in a nice box with optional title."""
    wrapper = textwrap.TextWrapper(width=width-4)  # -4 for the box margins
    wrapped_text = wrapper.fill(text)
    lines = wrapped_text.split('\n')
    
    # Print top border with optional title
    if title:
        title_space = width - 4 - len(title)
        left_padding = title_space // 2
        right_padding = title_space - left_padding
        print(color + '┌' + '─' * left_padding + title + '─' * right_padding + '┐')
    else:
        print(color + '┌' + '─' * (width-2) + '┐')
    
    # Print content
    for line in lines:
        padding = width - 2 - len(line)
        print(color + '│ ' + line + ' ' * padding + '│')
    
    # Print bottom border
    print(color + '└' + '─' * (width-2) + '┘')

def print_summary_result(result, width=80):
    """Print the summary result in a nicely formatted way."""
    if "error" in result:
        print_boxed_text(f"Error: {result['error']}", width=width, title="ERROR", color=Fore.RED)
        return
    
    # Terminal width
    terminal_width = width
    
    # Print header with video information
    print("\n" + Fore.CYAN + "=" * terminal_width)
    print(Fore.CYAN + Style.BRIGHT + result['video_title'].center(terminal_width))
    print(Fore.CYAN + "=" * terminal_width + "\n")
    
    # Video metadata section
    print(Fore.YELLOW + Style.BRIGHT + "VIDEO INFORMATION".center(terminal_width))
    print(Fore.YELLOW + "─" * terminal_width)
    
    # Two-column layout for metadata
    col_width = terminal_width // 2 - 2
    
    # Row 3
    print(f"{Fore.GREEN}Video ID: {Fore.WHITE}{result['video_id']:<{col_width}}"
          f"{Fore.GREEN}URL: {Fore.WHITE}https://youtu.be/{result['video_id']}")
    
    print(Fore.YELLOW + "─" * terminal_width + "\n")
    
    # AI Summary section
    ai_compression = "N/A"
    if result['ai_summary_length'] > 0:
        ai_compression = round((1 - result['ai_summary_length'] / result['full_transcript_length']) * 100)
    
    ai_summary_title = f" AI SUMMARY ({result['ai_summary_length']} words, condensed {ai_compression}% from {result['full_transcript_length']} words) "
    
    print(Fore.GREEN + Style.BRIGHT + ai_summary_title.center(terminal_width))
    print(Fore.GREEN + "─" * terminal_width)
    
    # Print the AI summary with proper wrapping
    wrapper = textwrap.TextWrapper(width=terminal_width-4, 
                                  initial_indent='  ', 
                                  subsequent_indent='  ')
    
    # Split AI summary into paragraphs and print each
    ai_paragraphs = result['ai_summary'].split('\n')
    for paragraph in ai_paragraphs:
        if paragraph.strip():  # Skip empty paragraphs
            print(wrapper.fill(paragraph))
            print()  # Empty line between paragraphs
    
    print(Fore.GREEN + "─" * terminal_width + "\n")
    
    # NLTK Summary section
    nltk_compression = round((1 - result['nltk_summary_length'] / result['full_transcript_length']) * 100)
    nltk_summary_title = f" NLTK SUMMARY ({result['nltk_summary_length']} words, condensed {nltk_compression}% from {result['full_transcript_length']} words) "
    
    print(Fore.MAGENTA + Style.BRIGHT + nltk_summary_title.center(terminal_width))
    print(Fore.MAGENTA + "─" * terminal_width)
    
    # Split NLTK summary into paragraphs and wrap each
    paragraphs = result['nltk_summary'].split('. ')
    formatted_paragraphs = []
    
    current_paragraph = ""
    for sentence in paragraphs:
        if not sentence.endswith('.'):
            sentence += '.'
        
        if len(current_paragraph) + len(sentence) + 1 <= 150:  # Arbitrary length for paragraph
            current_paragraph += " " + sentence if current_paragraph else sentence
        else:
            if current_paragraph:
                formatted_paragraphs.append(current_paragraph)
            current_paragraph = sentence
    
    if current_paragraph:
        formatted_paragraphs.append(current_paragraph)
    
    # Print each paragraph
    for paragraph in formatted_paragraphs:
        print(wrapper.fill(paragraph))
        print()  # Empty line between paragraphs
    
    print(Fore.MAGENTA + "─" * terminal_width + "\n")


if __name__ == "__main__":
    # Get terminal width
    try:
        terminal_width = os.get_terminal_size().columns
        # Limit width to reasonable range
        terminal_width = max(80, min(terminal_width, 120))
    except:
        terminal_width = 80  # Default if can't determine
    
    # Print welcome banner
    print(Fore.CYAN + Style.BRIGHT + "\n" + "=" * terminal_width)
    print(Fore.CYAN + Style.BRIGHT + "YOUTUBE VIDEO SUMMARIZER".center(terminal_width))
    print(Fore.CYAN + Style.BRIGHT + "=" * terminal_width + "\n")
    
    youtube_url = input(Fore.GREEN + "Enter YouTube video URL: " + Fore.WHITE)
    
    num_sentences_input = input(Fore.GREEN + "Enter number of sentences for summaries (default 5): " + Fore.WHITE)
    num_sentences = int(num_sentences_input) if num_sentences_input.strip() else 5
    
    print(Fore.YELLOW + "\nFetching and analyzing video transcript... Please wait...\n")
    
    result = summarize_youtube_video(youtube_url, num_sentences)
    print_summary_result(result, width=terminal_width)
youtube_transcript_summarizer.py
Ethical Hacking with Python EBook - Topic - Top
New Tutorials

Building a Full-Stack RAG Chatbot with FastAPI, OpenAI, and Streamlit
How to Recover Deleted Files with Python
How to Use Python to Track Google Search Results and Reviews Over Time
YouTube Video Transcription Summarization with Python
Getting Started with Python for SaaS Applications
Code for YouTube Video Transcription Summarization with Python Tutorial

Tags

New Tutorials

Popular Tutorials