Spaces:
Running
Running
File size: 4,227 Bytes
d557d77 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
"""Repository loading and cloning."""
import shutil
from pathlib import Path
from typing import Callable, Optional

from git import GitCommandError, Repo

from coderag.config import get_settings
from coderag.ingestion.validator import GitHubRepoInfo
from coderag.logging import get_logger
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
# Signature of the optional progress hook: it is called with a status message
# and an integer. This module passes 0, 30 and 100 — presumably a completion
# percentage (confirm with callers).
ProgressCallback = Callable[[str, int], None]
class LoaderError(Exception):
    """Raised when a repository cannot be cloned or updated."""
class RepositoryLoader:
    """Clone GitHub repositories into a local cache and keep them up to date.

    Every repository lives at ``<cache_dir>/<owner>/<name>``. Clones are
    shallow (``depth=1``) and single-branch.
    """

    # Branch names tried last, after the caller's and the repository's own.
    _FALLBACK_BRANCHES = ("main", "master")

    def __init__(self, cache_dir: Optional[Path] = None) -> None:
        """Set up the loader and create the cache directory if needed.

        Args:
            cache_dir: Directory that holds the cloned repositories. When
                omitted, ``settings.ingestion.repos_cache_dir`` is used.
        """
        # Settings are only loaded when no explicit directory was given.
        base_dir = cache_dir or get_settings().ingestion.repos_cache_dir
        # Path() leaves Path inputs unchanged and also accepts a plain string.
        self.cache_dir = Path(base_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_repo_path(self, repo_info: GitHubRepoInfo) -> Path:
        """Return the cache location for ``repo_info`` (``owner/name``)."""
        return self.cache_dir / repo_info.owner / repo_info.name

    @staticmethod
    def _candidate_branches(
        repo_info: GitHubRepoInfo,
        branch: Optional[str],
    ) -> list:
        """Return the branches to try, in priority order, without duplicates.

        Order: the explicitly requested branch, the branch recorded on
        ``repo_info``, then ``main`` and ``master``. Empty values are skipped.
        """
        ordered = (branch, repo_info.branch, *RepositoryLoader._FALLBACK_BRANCHES)
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(name for name in ordered if name))

    def clone_repository(
        self,
        repo_info: GitHubRepoInfo,
        branch: Optional[str] = None,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Make the repository available locally and return its path.

        If a cached copy exists it is updated in place. If that update fails,
        the cached copy is discarded and the repository is cloned again, so a
        stale or mismatched cache (for example one holding a fallback branch)
        repairs itself instead of failing the call.

        Args:
            repo_info: Repository to load; ``owner``, ``name``, ``branch`` and
                ``clone_url`` are read from it.
            branch: Preferred branch; tried before any other candidate.
            progress_callback: Optional hook called with a status message and
                a progress value.

        Returns:
            Path of the local working tree.

        Raises:
            LoaderError: If none of the candidate branches could be cloned.
        """
        repo_path = self.get_repo_path(repo_info)
        branches_to_try = self._candidate_branches(repo_info, branch)

        if repo_path.exists():
            logger.info("Repository exists, updating", path=str(repo_path))
            try:
                return self._update_repository(
                    repo_path, branches_to_try[0], progress_callback
                )
            except LoaderError as update_error:
                # _update_repository has already removed the cached copy;
                # fall through to a fresh clone with branch fallback.
                logger.info(
                    "Cached copy discarded, cloning again",
                    path=str(repo_path),
                    error=str(update_error),
                )

        if progress_callback:
            progress_callback("Cloning repository", 0)
        repo_path.parent.mkdir(parents=True, exist_ok=True)

        last_error = None
        for try_branch in branches_to_try:
            logger.info("Trying to clone", url=repo_info.clone_url, branch=try_branch)
            try:
                Repo.clone_from(
                    repo_info.clone_url,
                    repo_path,
                    branch=try_branch,
                    depth=1,
                    single_branch=True,
                )
            except GitCommandError as e:
                last_error = e
                logger.debug(
                    "Branch not found, trying next",
                    branch=try_branch,
                    error=str(e),
                )
                # Remove whatever a failed attempt left behind so the next
                # attempt starts from an empty target directory.
                shutil.rmtree(repo_path, ignore_errors=True)
                continue

            if progress_callback:
                progress_callback("Clone complete", 100)
            logger.info("Repository cloned", path=str(repo_path), branch=try_branch)
            return repo_path

        raise LoaderError(
            f"Failed to clone repository (tried branches: {branches_to_try}): {last_error}"
        ) from last_error

    def _update_repository(
        self,
        repo_path: Path,
        branch: str,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Fetch from ``origin``, check out ``branch`` and pull.

        Args:
            repo_path: Existing local clone to update.
            branch: Branch to check out before pulling.
            progress_callback: Optional hook called with a status message and
                a progress value.

        Returns:
            ``repo_path`` once the update has succeeded.

        Raises:
            LoaderError: If any git command fails. The cached copy is removed
                before raising so that a later clone starts clean.
        """
        try:
            repo = Repo(repo_path)
            if progress_callback:
                progress_callback("Fetching updates", 30)
            origin = repo.remotes.origin
            origin.fetch()
            repo.git.checkout(branch)
            origin.pull()
        except GitCommandError as e:
            logger.warning("Update failed, removing cached copy", error=str(e))
            shutil.rmtree(repo_path, ignore_errors=True)
            raise LoaderError(f"Failed to update, please re-clone: {e}") from e

        if progress_callback:
            progress_callback("Update complete", 100)
        logger.info("Repository updated", path=str(repo_path))
        return repo_path

    def is_cached(self, repo_info: GitHubRepoInfo) -> bool:
        """Return True when a directory for ``repo_info`` exists in the cache."""
        return self.get_repo_path(repo_info).exists()

    def delete_cache(self, repo_info: GitHubRepoInfo) -> None:
        """Remove the cached copy of ``repo_info``; do nothing if it is absent."""
        repo_path = self.get_repo_path(repo_info)
        if repo_path.exists():
            shutil.rmtree(repo_path)
            logger.info("Cache deleted", path=str(repo_path))
|