File size: 4,227 Bytes
d557d77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""Repository loading and cloning."""

import shutil
from pathlib import Path
from typing import Callable, Optional

from git import Repo, GitCommandError
from git.exc import InvalidGitRepositoryError, NoSuchPathError

from coderag.config import get_settings
from coderag.logging import get_logger
from coderag.ingestion.validator import GitHubRepoInfo

# Module-level logger; call sites in this file pass context as keyword arguments.
logger = get_logger(__name__)

# Signature for progress hooks: ``callback(message, value)``. Call sites in this
# file pass a short status string and 0, 30 or 100 -- presumably a completion
# percentage (TODO confirm with the consumers of the callback).
ProgressCallback = Callable[[str, int], None]


class LoaderError(Exception):
    """Raised when a repository cannot be cloned or updated."""


class RepositoryLoader:
    """Clone GitHub repositories into a local cache and keep them updated.

    Each repository is stored at ``<cache_dir>/<owner>/<name>``. Fresh clones
    are shallow (``depth=1``) and restricted to a single branch.
    """

    def __init__(self, cache_dir: Optional[Path] = None) -> None:
        """Create the loader and make sure the cache directory exists.

        Args:
            cache_dir: Root directory for cached clones. When omitted (or
                falsy), ``settings.ingestion.repos_cache_dir`` is used.
        """
        if not cache_dir:
            # Only consult the settings when no directory was supplied.
            cache_dir = get_settings().ingestion.repos_cache_dir
        # Coerce so that a plain string (from a caller or the settings) works.
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_repo_path(self, repo_info: GitHubRepoInfo) -> Path:
        """Return the cache location for *repo_info* (``cache_dir/owner/name``)."""
        return self.cache_dir / repo_info.owner / repo_info.name

    @staticmethod
    def _candidate_branches(
        branch: Optional[str], default_branch: Optional[str]
    ) -> list:
        """Return the branches to try, in priority order and without duplicates.

        Order: the explicitly requested branch, the branch recorded on the
        repository info, then the conventional ``main`` and ``master``.
        """
        ordered = []
        for name in (branch, default_branch, "main", "master"):
            if name and name not in ordered:
                ordered.append(name)
        return ordered

    @staticmethod
    def _current_branch(repo_path: Path) -> Optional[str]:
        """Return the branch checked out at *repo_path*, or ``None`` if unknown."""
        try:
            with Repo(repo_path) as repo:
                return repo.active_branch.name
        except (
            GitCommandError,
            InvalidGitRepositoryError,
            NoSuchPathError,
            TypeError,  # GitPython raises TypeError for a detached HEAD
            ValueError,
        ):
            # Best effort only: the caller falls back to its preferred branch.
            return None

    def clone_repository(
        self,
        repo_info: GitHubRepoInfo,
        branch: Optional[str] = None,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Make the repository available locally and return its path.

        A cached clone is updated in place. If there is no cached clone, or
        the update fails, a fresh shallow clone is attempted for each
        candidate branch in turn.

        Args:
            repo_info: Repository to load; ``owner``, ``name``, ``branch`` and
                ``clone_url`` are read from it.
            branch: Branch to prefer over ``repo_info.branch``.
            progress_callback: Optional hook called with a status message and
                a progress value.

        Returns:
            Path of the local working tree.

        Raises:
            LoaderError: If none of the candidate branches could be cloned.
        """
        repo_path = self.get_repo_path(repo_info)
        branches_to_try = self._candidate_branches(branch, repo_info.branch)

        if repo_path.exists():
            logger.info("Repository exists, updating", path=str(repo_path))
            target = branches_to_try[0]
            if not (branch or repo_info.branch):
                # Nothing specific was requested, so the cached clone may have
                # landed on "master" because "main" did not exist. Checking
                # out "main" would fail on a single-branch clone; keep the
                # branch that is already there when it is an acceptable one.
                current = self._current_branch(repo_path)
                if current in branches_to_try:
                    target = current
            try:
                return self._update_repository(repo_path, target, progress_callback)
            except LoaderError as update_error:
                # The unusable cache has been removed by _update_repository;
                # fall through to a fresh clone instead of giving up.
                logger.info(
                    "Cached clone discarded, cloning afresh",
                    path=str(repo_path),
                    error=str(update_error),
                )

        if progress_callback:
            progress_callback("Cloning repository", 0)

        repo_path.parent.mkdir(parents=True, exist_ok=True)

        last_error = None
        for try_branch in branches_to_try:
            logger.info("Trying to clone", url=repo_info.clone_url, branch=try_branch)
            try:
                # Close the returned Repo right away so no git helper
                # processes or file handles outlive the clone.
                Repo.clone_from(
                    repo_info.clone_url,
                    repo_path,
                    branch=try_branch,
                    depth=1,
                    single_branch=True,
                ).close()
            except GitCommandError as e:
                last_error = e
                logger.debug("Branch not found, trying next", branch=try_branch)
                # Clean up partial clone if any
                shutil.rmtree(repo_path, ignore_errors=True)
                continue
            if progress_callback:
                progress_callback("Clone complete", 100)
            logger.info("Repository cloned", path=str(repo_path), branch=try_branch)
            return repo_path

        raise LoaderError(
            f"Failed to clone repository (tried branches: {branches_to_try}): {last_error}"
        ) from last_error

    def _update_repository(
        self,
        repo_path: Path,
        branch: str,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Fetch, check out *branch* and pull in the cached clone.

        Args:
            repo_path: Location of the cached working tree.
            branch: Branch to check out before pulling.
            progress_callback: Optional hook called with a status message and
                a progress value.

        Returns:
            *repo_path*, once the update succeeded.

        Raises:
            LoaderError: If the directory is not a usable git repository or a
                git command fails. The cached directory is removed first.
        """
        try:
            # The context manager closes the Repo (and its git helper
            # processes) before the cleanup below touches the directory.
            with Repo(repo_path) as repo:
                if progress_callback:
                    progress_callback("Fetching updates", 30)
                repo.remotes.origin.fetch()
                repo.git.checkout(branch)
                repo.remotes.origin.pull()
        except (GitCommandError, InvalidGitRepositoryError, NoSuchPathError) as e:
            logger.warning("Update failed, re-cloning", error=str(e))
            shutil.rmtree(repo_path, ignore_errors=True)
            raise LoaderError(f"Failed to update, please re-clone: {e}") from e

        if progress_callback:
            progress_callback("Update complete", 100)
        logger.info("Repository updated", path=str(repo_path))
        return repo_path

    def is_cached(self, repo_info: GitHubRepoInfo) -> bool:
        """Return ``True`` when a cache directory exists for *repo_info*."""
        return self.get_repo_path(repo_info).exists()

    def delete_cache(self, repo_info: GitHubRepoInfo) -> None:
        """Remove the cached clone of *repo_info*, if there is one."""
        repo_path = self.get_repo_path(repo_info)
        if repo_path.exists():
            shutil.rmtree(repo_path)
            logger.info("Cache deleted", path=str(repo_path))