# youtube_tutorial/smolagent_processor.py
"""
SmoLAgent processor for YouTube transcripts.
Handles transcript processing and step extraction.
"""
import re
import logging
from typing import Any, Dict, List
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class SmoLAgentProcessor:
"""
    Processor for YouTube tutorial transcripts.
    This class uses regex-based heuristics to extract meaningful steps and
    code snippets from tutorial videos: step-indicator phrases mark steps,
    and code-like patterns flag snippets along with their likely language.
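    Example (illustrative usage; transcript segments are dicts with the
    "text", "start", and "duration" keys that process_transcript reads):
        processor = SmoLAgentProcessor()
        steps = processor.process_transcript(transcript, chapters)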
"""
def __init__(self):
"""Initialize the SmoLAgentProcessor."""
logger.info("Initializing SmoLAgentProcessor")
# Regular expressions for code detection
self.code_patterns = [
# Python patterns
r'import\s+[\w\s,\.]+',
r'from\s+[\w\.]+\s+import\s+[\w\s,\.]+',
r'def\s+\w+\s*\([^)]*\)\s*:',
r'class\s+\w+(\s*\([^)]*\))?\s*:',
r'if\s+.*:\s*$',
r'for\s+.*:\s*$',
r'while\s+.*:\s*$',
r'try\s*:\s*$',
r'except\s+.*:\s*$',
r'return\s+.*',
r'print\s*\(',
r'with\s+.*:\s*$',
r'lambda\s+.*:',
r'@\w+',
# JavaScript patterns
r'function\s+\w+\s*\([^)]*\)\s*{',
r'const\s+\w+\s*=',
r'let\s+\w+\s*=',
r'var\s+\w+\s*=',
r'import\s+{[^}]*}\s+from',
r'export\s+',
r'=>\s*{',
r'document\.querySelector',
r'async\s+function',
r'await\s+',
# HTML patterns
r'<\w+[^>]*>',
r'</\w+>',
# CSS patterns
r'\.\w+\s*{',
r'#\w+\s*{',
r'@media',
r'@keyframes',
            # Shell/Command line patterns (\b-anchored so fragments of ordinary
            # words such as "tools" or "storm" are not flagged as commands)
            r'\bnpm\s+install',
            r'\bpip\s+install',
            r'\bgit\s+',
            r'\bdocker\s+',
            r'\bcd\s+',
            r'\bmkdir\s+',
            r'\btouch\s+',
            r'\bls\s+',
            r'\brm\s+',
# General code indicators
r'```\w*',
r'`[^`]+`',
r'\$\s+\w+',
]
# Compile patterns for efficiency
self.compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in self.code_patterns]
# Step indicator patterns
self.step_indicators = [
r'step\s+\d+',
r'first\s+step',
r'next\s+step',
r'final\s+step',
r'let\'s\s+start',
r'now\s+we',
r'next\s+we',
r'first\s+we',
r'finally\s+we',
r'let\'s\s+do',
r'we\s+need\s+to',
r'you\s+need\s+to',
r'we\'re\s+going\s+to',
r'i\'m\s+going\s+to',
r'let\'s\s+create',
r'let\'s\s+add',
r'let\'s\s+implement',
r'let\'s\s+build',
r'let\'s\s+make',
r'let\'s\s+set\s+up',
r'let\'s\s+configure',
r'let\'s\s+install',
r'let\'s\s+initialize',
r'let\'s\s+define',
r'let\'s\s+write',
r'let\'s\s+move\s+on\s+to',
r'moving\s+on\s+to',
r'now\s+let\'s',
r'the\s+next\s+thing',
r'after\s+that',
r'once\s+you\'ve',
r'once\s+we\'ve',
r'now\s+that\s+we',
r'now\s+that\s+you',
r'to\s+begin',
r'to\s+start',
r'to\s+get\s+started',
r'first\s+thing',
r'second\s+thing',
r'third\s+thing',
r'lastly',
r'finally',
r'in\s+conclusion',
r'to\s+summarize',
r'to\s+wrap\s+up',
]
# Compile step indicators for efficiency
self.compiled_step_indicators = [re.compile(pattern, re.IGNORECASE) for pattern in self.step_indicators]
# Programming language detection patterns
self.language_patterns = {
'python': [
r'import\s+[\w\s,\.]+',
r'from\s+[\w\.]+\s+import\s+[\w\s,\.]+',
r'def\s+\w+\s*\([^)]*\)\s*:',
r'class\s+\w+(\s*\([^)]*\))?\s*:',
r'print\s*\(',
r'if\s+.*:\s*$',
r'for\s+.*:\s*$',
r'while\s+.*:\s*$',
],
'javascript': [
r'function\s+\w+\s*\([^)]*\)\s*{',
r'const\s+\w+\s*=',
r'let\s+\w+\s*=',
r'var\s+\w+\s*=',
r'import\s+{[^}]*}\s+from',
r'export\s+',
r'=>\s*{',
r'document\.',
r'window\.',
],
'html': [
r'<html',
r'<head',
r'<body',
r'<div',
r'<span',
r'<p>',
r'<a\s+href',
r'<img\s+src',
r'<script',
r'<style',
],
'css': [
r'\.\w+\s*{',
r'#\w+\s*{',
r'@media',
r'@keyframes',
r'margin:',
r'padding:',
r'color:',
r'background:',
],
'shell': [
                r'\bnpm\s+install',
                r'\bpip\s+install',
                r'\bgit\s+',
                r'\bdocker\s+',
                r'\bcd\s+',
                r'\bmkdir\s+',
                r'\btouch\s+',
                r'\bls\s+',
                r'\brm\s+',
],
}
# Compile language patterns for efficiency
self.compiled_language_patterns = {
lang: [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
for lang, patterns in self.language_patterns.items()
}
def process_transcript(self, transcript: List[Dict[str, Any]], chapters: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Process the transcript to extract steps.
Args:
transcript: List of transcript segments with text and timestamps
chapters: List of chapters with title, start_time, end_time
Returns:
List of steps with timestamp, text, and code information
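        Example (illustrative): a single segment {"text": "pip install requests",
        "start": 0.0, "duration": 3.0} with no chapters yields one step whose
        "is_code" flag is True and whose "code_language" is "shell".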
"""
if not transcript:
logger.warning("Empty transcript provided")
return []
logger.info(f"Processing transcript with {len(transcript)} segments and {len(chapters)} chapters")
# Merge adjacent transcript segments
merged_segments = self._merge_adjacent_segments(transcript)
logger.info(f"Merged into {len(merged_segments)} segments")
# Extract steps from merged segments
steps = self._extract_steps(merged_segments, chapters)
logger.info(f"Extracted {len(steps)} steps")
# Detect code in steps
steps_with_code = self._detect_code_in_steps(steps)
logger.info(f"Detected code in steps, final count: {len(steps_with_code)}")
return steps_with_code
def _merge_adjacent_segments(self, transcript: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Merge adjacent transcript segments that are part of the same sentence.
Args:
transcript: List of transcript segments
Returns:
List of merged transcript segments
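        Example (illustrative): segments {"text": "now we", "start": 0.0,
        "duration": 2.0} and {"text": "install flask.", "start": 2.0,
        "duration": 2.0} merge into a single segment with text
        "now we install flask." and duration 4.0, because the gap is under
        two seconds and the first segment does not end a sentence.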
"""
if not transcript:
return []
merged = []
current_segment = transcript[0].copy()
for i in range(1, len(transcript)):
segment = transcript[i]
# Check if segments are close in time (within 2 seconds)
time_gap = segment["start"] - (current_segment["start"] + current_segment.get("duration", 0))
# Check if the current segment ends with a sentence-ending punctuation
current_text_ends_sentence = re.search(r'[.!?]\s*$', current_segment["text"])
if time_gap < 2 and not current_text_ends_sentence:
# Merge segments
current_segment["text"] += " " + segment["text"]
current_segment["duration"] = segment["start"] + segment.get("duration", 0) - current_segment["start"]
else:
# Start a new segment
merged.append(current_segment)
current_segment = segment.copy()
# Add the last segment
merged.append(current_segment)
return merged
def _extract_steps(self, segments: List[Dict[str, Any]], chapters: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Extract steps from transcript segments.
Args:
segments: List of transcript segments
chapters: List of chapters
Returns:
List of steps with timestamp and text
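        Example (illustrative): with no chapters and no segments matching a
        step indicator or code pattern, a 10-minute transcript falls back to
        time-interval steps, targeting a step roughly every two minutes (with
        a minimum of five targets).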
"""
steps = []
# If we have chapters, use them as the primary structure
if chapters:
logger.info("Using chapters as primary structure for steps")
for chapter in chapters:
chapter_start = chapter["start_time"]
chapter_end = chapter.get("end_time", float("inf"))
# Find segments that belong to this chapter
chapter_segments = [
s for s in segments
if s["start"] >= chapter_start and s["start"] < chapter_end
]
if not chapter_segments:
continue
# Add chapter as a step
steps.append({
"timestamp": self._format_timestamp(chapter_start),
"text": f"## {chapter['title']}",
"start_seconds": chapter_start,
"is_chapter": True
})
# Extract steps within this chapter
chapter_steps = self._extract_steps_from_segments(chapter_segments)
# If no steps found within chapter, add the first segment as a step
if not chapter_steps and chapter_segments:
chapter_steps = [{
"timestamp": self._format_timestamp(chapter_segments[0]["start"]),
"text": chapter_segments[0]["text"],
"start_seconds": chapter_segments[0]["start"],
"is_chapter": False
}]
steps.extend(chapter_steps)
else:
# No chapters, extract steps directly from segments
logger.info("No chapters available, extracting steps directly from segments")
steps = self._extract_steps_from_segments(segments)
# If no steps found, create steps based on time intervals
if not steps and segments:
logger.info("No clear steps found, creating steps based on time intervals")
# Get total duration
if len(segments) > 1:
total_duration = segments[-1]["start"] + segments[-1].get("duration", 0) - segments[0]["start"]
else:
total_duration = segments[0].get("duration", 300) # Default to 5 minutes if only one segment
# Create steps every 2 minutes or at least 5 steps
step_count = max(5, int(total_duration / 120))
interval = total_duration / step_count
            seen_segment_starts = set()
            for i in range(step_count):
                target_time = segments[0]["start"] + i * interval
                # Find the closest segment, skipping segments already used as steps
                closest_segment = min(segments, key=lambda s: abs(s["start"] - target_time))
                if closest_segment["start"] in seen_segment_starts:
                    continue
                seen_segment_starts.add(closest_segment["start"])
                steps.append({
                    "timestamp": self._format_timestamp(closest_segment["start"]),
                    "text": closest_segment["text"],
                    "start_seconds": closest_segment["start"],
                    "is_chapter": False
                })
# Sort steps by timestamp
steps.sort(key=lambda x: x["start_seconds"])
return steps
def _extract_steps_from_segments(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Extract steps from transcript segments based on step indicators.
Args:
segments: List of transcript segments
Returns:
List of steps with timestamp and text
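        Example (illustrative): a segment whose text is "Now let's install the
        dependencies" matches the "now let's" step indicator and is returned
        as a step; segments matching neither a step indicator nor a code
        pattern are dropped.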
"""
steps = []
for segment in segments:
text = segment["text"]
# Check if the segment contains a step indicator
is_step = any(pattern.search(text) for pattern in self.compiled_step_indicators)
# Check if the segment contains code
is_code = any(pattern.search(text) for pattern in self.compiled_patterns)
# Add as a step if it's a step indicator or contains code
if is_step or is_code:
steps.append({
"timestamp": self._format_timestamp(segment["start"]),
"text": text,
"start_seconds": segment["start"],
"is_chapter": False
})
return steps
def _detect_code_in_steps(self, steps: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Detect code snippets in steps.
Args:
steps: List of steps
Returns:
List of steps with code information
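        Example (illustrative): a step whose text is "def fetch_data(url):" is
        returned with "is_code" set to True, "code_language" set to "python",
        and "code_content" holding the original text.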
"""
steps_with_code = []
for step in steps:
text = step["text"]
# Skip chapter headings for code detection
if step.get("is_chapter", False):
steps_with_code.append(step)
continue
# Check if the text contains code
is_code = any(pattern.search(text) for pattern in self.compiled_patterns)
if is_code:
# Detect programming language
language = self._detect_language(text)
steps_with_code.append({
**step,
"is_code": True,
"code_language": language,
"code_content": text
})
else:
steps_with_code.append({
**step,
"is_code": False
})
return steps_with_code
def _detect_language(self, text: str) -> str:
"""
Detect the programming language of a code snippet.
Args:
text: Code snippet text
Returns:
Detected programming language
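        Example (illustrative): "const data = await fetch(url)" scores highest
        on the JavaScript patterns and returns "javascript"; text that matches
        no language pattern returns "text".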
"""
language_scores = {}
for lang, patterns in self.compiled_language_patterns.items():
score = sum(1 for pattern in patterns if pattern.search(text))
language_scores[lang] = score
if not language_scores or max(language_scores.values()) == 0:
return "text"
return max(language_scores.items(), key=lambda x: x[1])[0]
def _format_timestamp(self, seconds: float) -> str:
"""
        Format seconds as an M:SS timestamp, or H:MM:SS for times of an hour or more.
Args:
seconds: Time in seconds
Returns:
Formatted timestamp string
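        Example (illustrative): 125 seconds formats as "2:05".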
"""
        total_seconds = int(seconds)
        hours, remainder = divmod(total_seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        if hours:
            return f"{hours}:{minutes:02d}:{secs:02d}"
        return f"{minutes}:{secs:02d}"