""" SmoLAgent processor for YouTube transcripts. Handles transcript processing and step extraction. """ import re import logging from typing import Dict, List, Optional, Any, Tuple # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) class SmoLAgentProcessor: """ Processor for YouTube transcripts using SmoLAgent. This class handles the processing of YouTube transcripts to extract meaningful steps and code snippets from tutorial videos. """ def __init__(self): """Initialize the SmoLAgentProcessor.""" logger.info("Initializing SmoLAgentProcessor") # Regular expressions for code detection self.code_patterns = [ # Python patterns r'import\s+[\w\s,\.]+', r'from\s+[\w\.]+\s+import\s+[\w\s,\.]+', r'def\s+\w+\s*\([^)]*\)\s*:', r'class\s+\w+(\s*\([^)]*\))?\s*:', r'if\s+.*:\s*$', r'for\s+.*:\s*$', r'while\s+.*:\s*$', r'try\s*:\s*$', r'except\s+.*:\s*$', r'return\s+.*', r'print\s*\(', r'with\s+.*:\s*$', r'lambda\s+.*:', r'@\w+', # JavaScript patterns r'function\s+\w+\s*\([^)]*\)\s*{', r'const\s+\w+\s*=', r'let\s+\w+\s*=', r'var\s+\w+\s*=', r'import\s+{[^}]*}\s+from', r'export\s+', r'=>\s*{', r'document\.querySelector', r'async\s+function', r'await\s+', # HTML patterns r'<\w+[^>]*>', r'', # CSS patterns r'\.\w+\s*{', r'#\w+\s*{', r'@media', r'@keyframes', # Shell/Command line patterns r'npm\s+install', r'pip\s+install', r'git\s+', r'docker\s+', r'cd\s+', r'mkdir\s+', r'touch\s+', r'ls\s+', r'rm\s+', # General code indicators r'```\w*', r'`[^`]+`', r'\$\s+\w+', ] # Compile patterns for efficiency self.compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in self.code_patterns] # Step indicator patterns self.step_indicators = [ r'step\s+\d+', r'first\s+step', r'next\s+step', r'final\s+step', r'let\'s\s+start', r'now\s+we', r'next\s+we', r'first\s+we', r'finally\s+we', r'let\'s\s+do', r'we\s+need\s+to', r'you\s+need\s+to', r'we\'re\s+going\s+to', r'i\'m\s+going\s+to', r'let\'s\s+create', r'let\'s\s+add', r'let\'s\s+implement', r'let\'s\s+build', r'let\'s\s+make', r'let\'s\s+set\s+up', r'let\'s\s+configure', r'let\'s\s+install', r'let\'s\s+initialize', r'let\'s\s+define', r'let\'s\s+write', r'let\'s\s+move\s+on\s+to', r'moving\s+on\s+to', r'now\s+let\'s', r'the\s+next\s+thing', r'after\s+that', r'once\s+you\'ve', r'once\s+we\'ve', r'now\s+that\s+we', r'now\s+that\s+you', r'to\s+begin', r'to\s+start', r'to\s+get\s+started', r'first\s+thing', r'second\s+thing', r'third\s+thing', r'lastly', r'finally', r'in\s+conclusion', r'to\s+summarize', r'to\s+wrap\s+up', ] # Compile step indicators for efficiency self.compiled_step_indicators = [re.compile(pattern, re.IGNORECASE) for pattern in self.step_indicators] # Programming language detection patterns self.language_patterns = { 'python': [ r'import\s+[\w\s,\.]+', r'from\s+[\w\.]+\s+import\s+[\w\s,\.]+', r'def\s+\w+\s*\([^)]*\)\s*:', r'class\s+\w+(\s*\([^)]*\))?\s*:', r'print\s*\(', r'if\s+.*:\s*$', r'for\s+.*:\s*$', r'while\s+.*:\s*$', ], 'javascript': [ r'function\s+\w+\s*\([^)]*\)\s*{', r'const\s+\w+\s*=', r'let\s+\w+\s*=', r'var\s+\w+\s*=', r'import\s+{[^}]*}\s+from', r'export\s+', r'=>\s*{', r'document\.', r'window\.', ], 'html': [ r'', r' List[Dict[str, Any]]: """ Process the transcript to extract steps. Args: transcript: List of transcript segments with text and timestamps chapters: List of chapters with title, start_time, end_time Returns: List of steps with timestamp, text, and code information """ if not transcript: logger.warning("Empty transcript provided") return [] logger.info(f"Processing transcript with {len(transcript)} segments and {len(chapters)} chapters") # Merge adjacent transcript segments merged_segments = self._merge_adjacent_segments(transcript) logger.info(f"Merged into {len(merged_segments)} segments") # Extract steps from merged segments steps = self._extract_steps(merged_segments, chapters) logger.info(f"Extracted {len(steps)} steps") # Detect code in steps steps_with_code = self._detect_code_in_steps(steps) logger.info(f"Detected code in steps, final count: {len(steps_with_code)}") return steps_with_code def _merge_adjacent_segments(self, transcript: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Merge adjacent transcript segments that are part of the same sentence. Args: transcript: List of transcript segments Returns: List of merged transcript segments """ if not transcript: return [] merged = [] current_segment = transcript[0].copy() for i in range(1, len(transcript)): segment = transcript[i] # Check if segments are close in time (within 2 seconds) time_gap = segment["start"] - (current_segment["start"] + current_segment.get("duration", 0)) # Check if the current segment ends with a sentence-ending punctuation current_text_ends_sentence = re.search(r'[.!?]\s*$', current_segment["text"]) if time_gap < 2 and not current_text_ends_sentence: # Merge segments current_segment["text"] += " " + segment["text"] current_segment["duration"] = segment["start"] + segment.get("duration", 0) - current_segment["start"] else: # Start a new segment merged.append(current_segment) current_segment = segment.copy() # Add the last segment merged.append(current_segment) return merged def _extract_steps(self, segments: List[Dict[str, Any]], chapters: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Extract steps from transcript segments. Args: segments: List of transcript segments chapters: List of chapters Returns: List of steps with timestamp and text """ steps = [] # If we have chapters, use them as the primary structure if chapters: logger.info("Using chapters as primary structure for steps") for chapter in chapters: chapter_start = chapter["start_time"] chapter_end = chapter.get("end_time", float("inf")) # Find segments that belong to this chapter chapter_segments = [ s for s in segments if s["start"] >= chapter_start and s["start"] < chapter_end ] if not chapter_segments: continue # Add chapter as a step steps.append({ "timestamp": self._format_timestamp(chapter_start), "text": f"## {chapter['title']}", "start_seconds": chapter_start, "is_chapter": True }) # Extract steps within this chapter chapter_steps = self._extract_steps_from_segments(chapter_segments) # If no steps found within chapter, add the first segment as a step if not chapter_steps and chapter_segments: chapter_steps = [{ "timestamp": self._format_timestamp(chapter_segments[0]["start"]), "text": chapter_segments[0]["text"], "start_seconds": chapter_segments[0]["start"], "is_chapter": False }] steps.extend(chapter_steps) else: # No chapters, extract steps directly from segments logger.info("No chapters available, extracting steps directly from segments") steps = self._extract_steps_from_segments(segments) # If no steps found, create steps based on time intervals if not steps and segments: logger.info("No clear steps found, creating steps based on time intervals") # Get total duration if len(segments) > 1: total_duration = segments[-1]["start"] + segments[-1].get("duration", 0) - segments[0]["start"] else: total_duration = segments[0].get("duration", 300) # Default to 5 minutes if only one segment # Create steps every 2 minutes or at least 5 steps step_count = max(5, int(total_duration / 120)) interval = total_duration / step_count for i in range(step_count): target_time = segments[0]["start"] + i * interval # Find the closest segment closest_segment = min(segments, key=lambda s: abs(s["start"] - target_time)) steps.append({ "timestamp": self._format_timestamp(closest_segment["start"]), "text": closest_segment["text"], "start_seconds": closest_segment["start"], "is_chapter": False }) # Sort steps by timestamp steps.sort(key=lambda x: x["start_seconds"]) return steps def _extract_steps_from_segments(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Extract steps from transcript segments based on step indicators. Args: segments: List of transcript segments Returns: List of steps with timestamp and text """ steps = [] for segment in segments: text = segment["text"] # Check if the segment contains a step indicator is_step = any(pattern.search(text) for pattern in self.compiled_step_indicators) # Check if the segment contains code is_code = any(pattern.search(text) for pattern in self.compiled_patterns) # Add as a step if it's a step indicator or contains code if is_step or is_code: steps.append({ "timestamp": self._format_timestamp(segment["start"]), "text": text, "start_seconds": segment["start"], "is_chapter": False }) return steps def _detect_code_in_steps(self, steps: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Detect code snippets in steps. Args: steps: List of steps Returns: List of steps with code information """ steps_with_code = [] for step in steps: text = step["text"] # Skip chapter headings for code detection if step.get("is_chapter", False): steps_with_code.append(step) continue # Check if the text contains code is_code = any(pattern.search(text) for pattern in self.compiled_patterns) if is_code: # Detect programming language language = self._detect_language(text) steps_with_code.append({ **step, "is_code": True, "code_language": language, "code_content": text }) else: steps_with_code.append({ **step, "is_code": False }) return steps_with_code def _detect_language(self, text: str) -> str: """ Detect the programming language of a code snippet. Args: text: Code snippet text Returns: Detected programming language """ language_scores = {} for lang, patterns in self.compiled_language_patterns.items(): score = sum(1 for pattern in patterns if pattern.search(text)) language_scores[lang] = score if not language_scores or max(language_scores.values()) == 0: return "text" return max(language_scores.items(), key=lambda x: x[1])[0] def _format_timestamp(self, seconds: float) -> str: """ Format seconds as MM:SS timestamp. Args: seconds: Time in seconds Returns: Formatted timestamp string """ minutes = int(seconds // 60) seconds = int(seconds % 60) return f"{minutes}:{seconds:02d}"