|
|
""" |
|
|
SmoLAgent processor for YouTube transcripts. |
|
|
Handles transcript processing and step extraction. |
|
|
""" |
|
|
import re |
|
|
import logging |
|
|
from typing import Dict, List, Optional, Any, Tuple |
|
|
|
|
|
|
|
|
# Root-logger configuration happens as an import-time side effect of this
# module: INFO level, "<time> - <logger name> - <level> - <message>" layout.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers, so an application that configured logging earlier keeps its
# own settings.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-scoped logger used by everything defined in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
class SmoLAgentProcessor:
    """
    Processor for YouTube transcripts using SmoLAgent.

    This class handles the processing of YouTube transcripts to extract
    meaningful steps and code snippets from tutorial videos.

    Detection is purely regex based: a transcript segment becomes a step when
    it contains one of the spoken "step indicator" phrases or something that
    looks like code. The raw pattern strings are kept on the instance
    (``code_patterns``, ``step_indicators``, ``language_patterns``) next to
    their compiled counterparts (``compiled_patterns``,
    ``compiled_step_indicators``, ``compiled_language_patterns``).
    """

    # Two segments are only merged when the silence between them is shorter
    # than this many seconds.
    _MERGE_GAP_SECONDS = 2
    # Time-interval fallback: aim for one step per this many seconds ...
    _FALLBACK_SECONDS_PER_STEP = 120
    # ... but never sample fewer than this many points.
    _FALLBACK_MIN_STEPS = 5
    # Duration assumed for a lone segment that carries no "duration" key.
    _FALLBACK_SINGLE_SEGMENT_DURATION = 300

    def __init__(self):
        """Initialize the SmoLAgentProcessor and compile all regex tables."""
        logger.info("Initializing SmoLAgentProcessor")

        # Raw patterns for "this text looks like code", grouped by origin.
        self.code_patterns = [
            # Python
            r'import\s+[\w\s,\.]+',
            r'from\s+[\w\.]+\s+import\s+[\w\s,\.]+',
            r'def\s+\w+\s*\([^)]*\)\s*:',
            r'class\s+\w+(\s*\([^)]*\))?\s*:',
            r'if\s+.*:\s*$',
            r'for\s+.*:\s*$',
            r'while\s+.*:\s*$',
            r'try\s*:\s*$',
            r'except\s+.*:\s*$',
            r'return\s+.*',
            r'print\s*\(',
            r'with\s+.*:\s*$',
            r'lambda\s+.*:',
            r'@\w+',

            # JavaScript
            r'function\s+\w+\s*\([^)]*\)\s*{',
            r'const\s+\w+\s*=',
            r'let\s+\w+\s*=',
            r'var\s+\w+\s*=',
            r'import\s+{[^}]*}\s+from',
            r'export\s+',
            r'=>\s*{',
            r'document\.querySelector',
            r'async\s+function',
            r'await\s+',

            # HTML
            r'<\w+[^>]*>',
            r'</\w+>',

            # CSS
            r'\.\w+\s*{',
            r'#\w+\s*{',
            r'@media',
            r'@keyframes',

            # Shell commands
            r'npm\s+install',
            r'pip\s+install',
            r'git\s+',
            r'docker\s+',
            r'cd\s+',
            r'mkdir\s+',
            r'touch\s+',
            r'ls\s+',
            r'rm\s+',

            # Markdown code markers and shell prompts
            r'```\w*',
            r'`[^`]+`',
            r'\$\s+\w+',
        ]

        self.compiled_patterns = [
            self._compile_code_pattern(pattern) for pattern in self.code_patterns
        ]

        # Spoken phrases that usually introduce a new tutorial step.
        self.step_indicators = [
            r'step\s+\d+',
            r'first\s+step',
            r'next\s+step',
            r'final\s+step',
            r'let\'s\s+start',
            r'now\s+we',
            r'next\s+we',
            r'first\s+we',
            r'finally\s+we',
            r'let\'s\s+do',
            r'we\s+need\s+to',
            r'you\s+need\s+to',
            r'we\'re\s+going\s+to',
            r'i\'m\s+going\s+to',
            r'let\'s\s+create',
            r'let\'s\s+add',
            r'let\'s\s+implement',
            r'let\'s\s+build',
            r'let\'s\s+make',
            r'let\'s\s+set\s+up',
            r'let\'s\s+configure',
            r'let\'s\s+install',
            r'let\'s\s+initialize',
            r'let\'s\s+define',
            r'let\'s\s+write',
            r'let\'s\s+move\s+on\s+to',
            r'moving\s+on\s+to',
            r'now\s+let\'s',
            r'the\s+next\s+thing',
            r'after\s+that',
            r'once\s+you\'ve',
            r'once\s+we\'ve',
            r'now\s+that\s+we',
            r'now\s+that\s+you',
            r'to\s+begin',
            r'to\s+start',
            r'to\s+get\s+started',
            r'first\s+thing',
            r'second\s+thing',
            r'third\s+thing',
            r'lastly',
            r'finally',
            r'in\s+conclusion',
            r'to\s+summarize',
            r'to\s+wrap\s+up',
        ]

        self.compiled_step_indicators = [
            self._compile_phrase(pattern) for pattern in self.step_indicators
        ]

        # Per-language patterns; _detect_language() counts hits per language.
        self.language_patterns = {
            'python': [
                r'import\s+[\w\s,\.]+',
                r'from\s+[\w\.]+\s+import\s+[\w\s,\.]+',
                r'def\s+\w+\s*\([^)]*\)\s*:',
                r'class\s+\w+(\s*\([^)]*\))?\s*:',
                r'print\s*\(',
                r'if\s+.*:\s*$',
                r'for\s+.*:\s*$',
                r'while\s+.*:\s*$',
            ],
            'javascript': [
                r'function\s+\w+\s*\([^)]*\)\s*{',
                r'const\s+\w+\s*=',
                r'let\s+\w+\s*=',
                r'var\s+\w+\s*=',
                r'import\s+{[^}]*}\s+from',
                r'export\s+',
                r'=>\s*{',
                r'document\.',
                r'window\.',
            ],
            'html': [
                r'<html',
                r'<head',
                r'<body',
                r'<div',
                r'<span',
                r'<p>',
                r'<a\s+href',
                r'<img\s+src',
                r'<script',
                r'<style',
            ],
            'css': [
                r'\.\w+\s*{',
                r'#\w+\s*{',
                r'@media',
                r'@keyframes',
                r'margin:',
                r'padding:',
                r'color:',
                r'background:',
            ],
            'shell': [
                r'npm\s+install',
                r'pip\s+install',
                r'git\s+',
                r'docker\s+',
                r'cd\s+',
                r'mkdir\s+',
                r'touch\s+',
                r'ls\s+',
                r'rm\s+',
            ],
        }

        self.compiled_language_patterns = {
            lang: [self._compile_code_pattern(pattern) for pattern in patterns]
            for lang, patterns in self.language_patterns.items()
        }

    @staticmethod
    def _compile_code_pattern(pattern: str) -> "re.Pattern[str]":
        """
        Compile a code/language pattern case-insensitively.

        Patterns that begin with a letter are keywords or command names, so
        they get a leading word boundary. Without it the ``ls`` pattern matches
        inside "tools are" and the ``rm`` pattern inside "form of", which
        flags ordinary sentences as code. Patterns that begin with punctuation
        (``<``, ``@``, backtick, ...) are compiled unchanged.
        """
        if pattern[:1].isalpha():
            pattern = r'\b' + pattern
        return re.compile(pattern, re.IGNORECASE)

    @staticmethod
    def _compile_phrase(pattern: str) -> "re.Pattern[str]":
        """
        Compile a spoken step-indicator phrase as a whole-word match.

        Anchoring both ends keeps "know we" from matching the "now we"
        indicator and "next week" from matching "next we".
        """
        return re.compile(r'\b(?:' + pattern + r')\b', re.IGNORECASE)

    def process_transcript(self, transcript: List[Dict[str, Any]], chapters: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Process the transcript to extract steps.

        Pipeline: merge adjacent segments, extract steps (chapter-driven when
        chapters are given), then annotate every step with code information.

        Args:
            transcript: List of transcript segments with text and timestamps
                (each segment is read via its "text", "start" and optional
                "duration" keys)
            chapters: List of chapters with title, start_time, end_time;
                ``None`` or an empty list means "no chapters"

        Returns:
            List of steps with timestamp, text, and code information; an
            empty list when the transcript is empty
        """
        if not transcript:
            logger.warning("Empty transcript provided")
            return []

        # Callers without chapter data may pass None; len(None) would raise.
        chapters = chapters or []

        logger.info(
            "Processing transcript with %d segments and %d chapters",
            len(transcript), len(chapters),
        )

        merged_segments = self._merge_adjacent_segments(transcript)
        logger.info("Merged into %d segments", len(merged_segments))

        steps = self._extract_steps(merged_segments, chapters)
        logger.info("Extracted %d steps", len(steps))

        steps_with_code = self._detect_code_in_steps(steps)
        logger.info("Detected code in steps, final count: %d", len(steps_with_code))

        return steps_with_code

    def _merge_adjacent_segments(self, transcript: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Merge adjacent transcript segments that are part of the same sentence.

        A segment is appended to the one being built when it starts less than
        two seconds after that one ends and the text collected so far does not
        end in ``.``, ``!`` or ``?``. The input dicts are not modified; merged
        segments are shallow copies.

        NOTE(review): there is no upper bound on a merged segment, so a
        transcript without sentence punctuation and without pauses of two
        seconds or more collapses into a single segment.

        Args:
            transcript: List of transcript segments

        Returns:
            List of merged transcript segments
        """
        if not transcript:
            return []

        merged = []
        current_segment = transcript[0].copy()

        for segment in transcript[1:]:
            current_end = current_segment["start"] + current_segment.get("duration", 0)
            time_gap = segment["start"] - current_end
            ends_sentence = re.search(r'[.!?]\s*$', current_segment["text"])

            if time_gap < self._MERGE_GAP_SECONDS and not ends_sentence:
                current_segment["text"] += " " + segment["text"]
                # Stretch the duration so the merged segment ends where the
                # appended segment ends.
                current_segment["duration"] = (
                    segment["start"] + segment.get("duration", 0) - current_segment["start"]
                )
            else:
                merged.append(current_segment)
                current_segment = segment.copy()

        # The segment still being built when the loop ends.
        merged.append(current_segment)

        return merged

    def _make_step(self, start_seconds: float, text: str, is_chapter: bool = False) -> Dict[str, Any]:
        """Build a step dict in the shape shared by every extraction path."""
        return {
            "timestamp": self._format_timestamp(start_seconds),
            "text": text,
            "start_seconds": start_seconds,
            "is_chapter": is_chapter
        }

    def _extract_steps(self, segments: List[Dict[str, Any]], chapters: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Extract steps from transcript segments.

        With chapters, each chapter that contains at least one segment yields
        a "## <title>" header step followed by the steps found inside it.
        Without chapters, steps are detected directly on the segments. If
        neither approach finds anything, segments are sampled at regular time
        intervals instead.

        Args:
            segments: List of transcript segments
            chapters: List of chapters (may be empty or None)

        Returns:
            List of steps with timestamp and text, sorted by start time
        """
        if chapters:
            logger.info("Using chapters as primary structure for steps")
            steps = self._steps_from_chapters(segments, chapters)
        else:
            logger.info("No chapters available, extracting steps directly from segments")
            steps = self._extract_steps_from_segments(segments)

        if not steps and segments:
            logger.info("No clear steps found, creating steps based on time intervals")
            steps = self._steps_from_intervals(segments)

        # list.sort is stable, so a chapter header stays ahead of a step that
        # starts at exactly the same second.
        steps.sort(key=lambda step: step["start_seconds"])

        return steps

    def _steps_from_chapters(self, segments: List[Dict[str, Any]], chapters: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Emit a header step plus the detected steps for every non-empty chapter."""
        steps = []

        for chapter in chapters:
            chapter_start = chapter["start_time"]
            # A missing *or* explicit-None end_time means "runs to the end";
            # comparing a number with None would raise TypeError.
            chapter_end = chapter.get("end_time")
            if chapter_end is None:
                chapter_end = float("inf")

            chapter_segments = [
                s for s in segments
                if chapter_start <= s["start"] < chapter_end
            ]
            if not chapter_segments:
                continue

            steps.append(self._make_step(chapter_start, f"## {chapter['title']}", is_chapter=True))

            chapter_steps = self._extract_steps_from_segments(chapter_segments)
            if not chapter_steps:
                # Keep the chapter from being an empty header: fall back to
                # its first segment.
                first = chapter_segments[0]
                chapter_steps = [self._make_step(first["start"], first["text"])]

            steps.extend(chapter_steps)

        return steps

    def _steps_from_intervals(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Sample segments at evenly spaced times across the transcript.

        Each sample picks the segment whose start is closest to the target
        time. A segment is emitted at most once, so short transcripts no
        longer produce several identical steps.
        """
        first_start = segments[0]["start"]

        if len(segments) > 1:
            last = segments[-1]
            total_duration = last["start"] + last.get("duration", 0) - first_start
        else:
            total_duration = segments[0].get("duration", self._FALLBACK_SINGLE_SEGMENT_DURATION)

        step_count = max(
            self._FALLBACK_MIN_STEPS,
            int(total_duration / self._FALLBACK_SECONDS_PER_STEP),
        )
        interval = total_duration / step_count

        steps = []
        used_indexes = set()

        for i in range(step_count):
            target_time = first_start + i * interval
            closest_index = min(
                range(len(segments)),
                key=lambda j: abs(segments[j]["start"] - target_time),
            )
            if closest_index in used_indexes:
                continue
            used_indexes.add(closest_index)

            closest_segment = segments[closest_index]
            steps.append(self._make_step(closest_segment["start"], closest_segment["text"]))

        return steps

    def _extract_steps_from_segments(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Extract steps from transcript segments based on step indicators.

        A segment becomes a step when its text matches any step-indicator
        phrase or any code pattern.

        Args:
            segments: List of transcript segments

        Returns:
            List of steps with timestamp and text
        """
        steps = []

        for segment in segments:
            text = segment["text"]

            is_step = any(pattern.search(text) for pattern in self.compiled_step_indicators)
            is_code = any(pattern.search(text) for pattern in self.compiled_patterns)

            if is_step or is_code:
                steps.append(self._make_step(segment["start"], text))

        return steps

    def _detect_code_in_steps(self, steps: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Detect code snippets in steps.

        Every returned step carries an "is_code" flag. Steps flagged as code
        additionally get "code_language" and "code_content" (the full step
        text). Chapter header steps are never treated as code. The input
        dicts are not modified.

        Args:
            steps: List of steps

        Returns:
            List of steps with code information
        """
        steps_with_code = []

        for step in steps:
            if step.get("is_chapter", False):
                # Headers get the flag too so all steps share one schema.
                steps_with_code.append({**step, "is_code": False})
                continue

            text = step["text"]
            is_code = any(pattern.search(text) for pattern in self.compiled_patterns)

            if is_code:
                steps_with_code.append({
                    **step,
                    "is_code": True,
                    "code_language": self._detect_language(text),
                    "code_content": text
                })
            else:
                steps_with_code.append({**step, "is_code": False})

        return steps_with_code

    def _detect_language(self, text: str) -> str:
        """
        Detect the programming language of a code snippet.

        Each language scores one point per pattern of its own that matches
        the text. The highest score wins; on a tie the language listed first
        in ``language_patterns`` is returned.

        Args:
            text: Code snippet text

        Returns:
            Detected programming language, or "text" when nothing matches
        """
        language_scores = {
            lang: sum(1 for pattern in patterns if pattern.search(text))
            for lang, patterns in self.compiled_language_patterns.items()
        }

        if not language_scores or max(language_scores.values()) == 0:
            return "text"

        return max(language_scores.items(), key=lambda item: item[1])[0]

    def _format_timestamp(self, seconds: float) -> str:
        """
        Format seconds as an M:SS timestamp.

        Minutes are neither zero-padded nor rolled over into hours (3725
        seconds gives "62:05"). Fractions of a second are dropped and
        negative input is clamped to "0:00".

        Args:
            seconds: Time in seconds

        Returns:
            Formatted timestamp string
        """
        whole_seconds = max(0, int(seconds))
        minutes, remainder = divmod(whole_seconds, 60)
        return f"{minutes}:{remainder:02d}"
|
|
|