Spaces:
Running
Running
Update plagiarism_check.py
Browse files- plagiarism_check.py +300 -27
plagiarism_check.py
CHANGED
|
@@ -16,6 +16,18 @@ try:
|
|
| 16 |
except ImportError:
|
| 17 |
HAS_HTTPX = False
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
try:
|
| 20 |
from google import genai
|
| 21 |
from google.genai import types as gtypes
|
|
@@ -308,38 +320,51 @@ def _fetch_page_text(url, timeout=8):
|
|
| 308 |
|
| 309 |
|
| 310 |
def _verify_sources_parallel(original_text, sources, min_similarity=0.02, max_workers=8):
|
| 311 |
-
"""출처 URL을 크롤링하여 원문과 대조 검증 — 유사도 미달 출처 제거"""
|
| 312 |
if not sources:
|
| 313 |
-
return [], 0.0
|
| 314 |
|
| 315 |
verified = []
|
| 316 |
total_sim = 0.0
|
|
|
|
| 317 |
|
| 318 |
def _check_one(src):
|
| 319 |
url = src.get("url", "")
|
| 320 |
snippet = src.get("snippet", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
if snippet and len(snippet) > 20:
|
| 323 |
sim = _text_similarity(original_text[:500], snippet, n=2)
|
| 324 |
if sim >= min_similarity:
|
| 325 |
-
return {**src, "similarity": round(sim * 100, 1), "method": "snippet"}, sim
|
| 326 |
|
| 327 |
-
page_text = _fetch_page_text(url, timeout=6)
|
| 328 |
if page_text and len(page_text) > 50:
|
| 329 |
jaccard = _text_similarity(original_text, page_text, n=3)
|
| 330 |
containment = _containment_similarity(original_text, page_text, n=3)
|
| 331 |
sim = max(jaccard, containment)
|
| 332 |
if sim >= min_similarity:
|
| 333 |
matched = _find_matching_sentences(original_text, page_text)
|
| 334 |
-
return {**src, "similarity": round(sim * 100, 1), "method": "crawl", "matched": matched}, sim
|
| 335 |
|
| 336 |
-
return None, 0.0
|
| 337 |
|
| 338 |
with ThreadPoolExecutor(max_workers=min(max_workers, len(sources))) as executor:
|
| 339 |
futures = {executor.submit(_check_one, src): src for src in sources[:20]}
|
| 340 |
for future in as_completed(futures, timeout=30):
|
| 341 |
try:
|
| 342 |
-
result, sim = future.result()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
if result is not None:
|
| 344 |
verified.append(result)
|
| 345 |
total_sim += sim
|
|
@@ -348,7 +373,7 @@ def _verify_sources_parallel(original_text, sources, min_similarity=0.02, max_wo
|
|
| 348 |
|
| 349 |
verified.sort(key=lambda x: x.get("similarity", 0), reverse=True)
|
| 350 |
avg_sim = total_sim / len(verified) if verified else 0.0
|
| 351 |
-
return verified, avg_sim
|
| 352 |
|
| 353 |
|
| 354 |
def _find_matching_sentences(original, source_text, threshold=0.3):
|
|
@@ -366,6 +391,164 @@ def _find_matching_sentences(original, source_text, threshold=0.3):
|
|
| 366 |
return matched
|
| 367 |
|
| 368 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
# ============================================
|
| 370 |
# 핵심 구문 추출
|
| 371 |
# ============================================
|
|
@@ -608,7 +791,7 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 608 |
log_lines.append(f"[수집] 총 {len(raw_sources)}건 (검증 전)")
|
| 609 |
|
| 610 |
# 실제 크롤링하여 원문과 대조 → 유사도 미달 출처 제거
|
| 611 |
-
verified_sources, avg_similarity = _verify_sources_parallel(
|
| 612 |
text, raw_sources, min_similarity=0.02, max_workers=8
|
| 613 |
)
|
| 614 |
|
|
@@ -616,37 +799,86 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 616 |
verified_urls = {s["url"] for s in verified_sources}
|
| 617 |
unverified_sources = [s for s in raw_sources if s["url"] not in verified_urls]
|
| 618 |
|
| 619 |
-
log_lines.append(f"[검증] 통과={len(verified_sources)}건, 미검증={len(unverified_sources)}건, 평균유사도={avg_similarity:.3f}")
|
| 620 |
|
| 621 |
# ═══════════════════════════════════════
|
| 622 |
-
#
|
| 623 |
# ═══════════════════════════════════════
|
| 624 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
|
| 626 |
all_sources = verified_sources # 검증된 출처만 표시
|
| 627 |
|
| 628 |
-
# 표절율 산출:
|
| 629 |
-
# ①
|
| 630 |
max_sim = max((s.get("similarity", 0) for s in verified_sources), default=0)
|
| 631 |
-
# ② 검증된 출처 수 기반 보정
|
| 632 |
count_factor = min(len(verified_sources) * 3, 30)
|
| 633 |
-
# ③ 평균 유사도 반영
|
| 634 |
avg_factor = avg_similarity * 100
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
|
| 636 |
-
#
|
| 637 |
-
|
|
|
|
|
|
|
|
|
|
| 638 |
|
| 639 |
-
# Gemini 표절율은 참고용
|
| 640 |
-
# 단, Gemini가 0%인데 유사 출처가 있으면 유사도 기반만 사용
|
| 641 |
if gemini_pct > 0 and len(verified_sources) > 0:
|
| 642 |
-
plag_pct = min(round(gemini_pct * 0.
|
| 643 |
elif len(verified_sources) > 0:
|
| 644 |
-
plag_pct =
|
| 645 |
else:
|
| 646 |
-
# 검증된 출처 없음 → Gemini 값도 크게 할인
|
| 647 |
plag_pct = min(round(gemini_pct * 0.1), 20)
|
| 648 |
|
| 649 |
-
log_lines.append(
|
|
|
|
|
|
|
|
|
|
| 650 |
|
| 651 |
if plag_pct >= 50:
|
| 652 |
grade, gc = "🚨 표절 의심", "#FF4444"
|
|
@@ -714,13 +946,54 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 714 |
</details>
|
| 715 |
</div>"""
|
| 716 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 717 |
HDR = '#3B7DD8'
|
| 718 |
html = f"""<div style="font-family:'Noto Sans KR',sans-serif;max-width:900px;margin:20px auto;background:#fff;border:1px solid #E0E0E0;border-radius:8px;box-shadow:0 2px 8px rgba(0,0,0,0.06);">
|
| 719 |
<div style="background:linear-gradient(135deg,{HDR},#4A8DE0);padding:24px;color:#fff;border-radius:8px 8px 0 0;">
|
| 720 |
<div style="display:flex;justify-content:space-between;align-items:center;">
|
| 721 |
<div>
|
| 722 |
<div style="font-size:24px;font-weight:900;">표절 검사 결과</div>
|
| 723 |
-
<div style="font-size:12px;opacity:0.9;margin-top:4px;">Gemini
|
| 724 |
</div>
|
| 725 |
<div style="text-align:right;font-size:11px;opacity:0.9;">
|
| 726 |
<div>문서: {doc_id}</div>
|
|
@@ -757,8 +1030,8 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 757 |
<span><span style="color:#888;">단어수</span> <b>{word_count:,}단어</b></span>
|
| 758 |
<span><span style="color:#888;">검색엔진</span> <b>Google + Brave + KCI·RISS·arXiv</b></span>
|
| 759 |
</div>
|
| 760 |
-
</div>{gemini_summary}
|
| 761 |
-
<div style="padding:24px;">
|
| 762 |
<div style="font-size:13px;font-weight:700;color:#1A3C6E;margin-bottom:12px;">🔍 검증된 출처 ({len(all_sources)}건)</div>
|
| 763 |
<table style="width:100%;border-collapse:collapse;font-size:11px;">
|
| 764 |
<thead>
|
|
|
|
| 16 |
except ImportError:
|
| 17 |
HAS_HTTPX = False
|
| 18 |
|
| 19 |
+
try:
|
| 20 |
+
import chromadb
|
| 21 |
+
HAS_CHROMADB = True
|
| 22 |
+
except ImportError:
|
| 23 |
+
HAS_CHROMADB = False
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
import numpy as np
|
| 27 |
+
HAS_NUMPY = True
|
| 28 |
+
except ImportError:
|
| 29 |
+
HAS_NUMPY = False
|
| 30 |
+
|
| 31 |
try:
|
| 32 |
from google import genai
|
| 33 |
from google.genai import types as gtypes
|
|
|
|
| 320 |
|
| 321 |
|
| 322 |
def _verify_sources_parallel(original_text, sources, min_similarity=0.02, max_workers=8):
|
| 323 |
+
"""출처 URL을 크롤링하여 원문과 대조 검증 — 유사도 미달 출처 제거, 크롤링 본문도 반환"""
|
| 324 |
if not sources:
|
| 325 |
+
return [], 0.0, {}
|
| 326 |
|
| 327 |
verified = []
|
| 328 |
total_sim = 0.0
|
| 329 |
+
crawled_texts = {} # url → page_text (벡터DB 인덱싱용)
|
| 330 |
|
| 331 |
def _check_one(src):
|
| 332 |
url = src.get("url", "")
|
| 333 |
snippet = src.get("snippet", "")
|
| 334 |
+
page_text = ""
|
| 335 |
+
|
| 336 |
+
# snippet만으로 검증되는 경우에도 본문 크롤링 시도
|
| 337 |
+
page_text = _fetch_page_text(url, timeout=6)
|
| 338 |
|
| 339 |
if snippet and len(snippet) > 20:
|
| 340 |
sim = _text_similarity(original_text[:500], snippet, n=2)
|
| 341 |
if sim >= min_similarity:
|
| 342 |
+
return {**src, "similarity": round(sim * 100, 1), "method": "snippet"}, sim, page_text
|
| 343 |
|
|
|
|
| 344 |
if page_text and len(page_text) > 50:
|
| 345 |
jaccard = _text_similarity(original_text, page_text, n=3)
|
| 346 |
containment = _containment_similarity(original_text, page_text, n=3)
|
| 347 |
sim = max(jaccard, containment)
|
| 348 |
if sim >= min_similarity:
|
| 349 |
matched = _find_matching_sentences(original_text, page_text)
|
| 350 |
+
return {**src, "similarity": round(sim * 100, 1), "method": "crawl", "matched": matched}, sim, page_text
|
| 351 |
|
| 352 |
+
return None, 0.0, page_text
|
| 353 |
|
| 354 |
with ThreadPoolExecutor(max_workers=min(max_workers, len(sources))) as executor:
|
| 355 |
futures = {executor.submit(_check_one, src): src for src in sources[:20]}
|
| 356 |
for future in as_completed(futures, timeout=30):
|
| 357 |
try:
|
| 358 |
+
result, sim, page_text = future.result()
|
| 359 |
+
src_info = futures[future]
|
| 360 |
+
url = src_info.get("url", "")
|
| 361 |
+
# 크롤링 본문 저장 (검증 여부 무관하게)
|
| 362 |
+
if page_text and len(page_text) > 50 and url:
|
| 363 |
+
crawled_texts[url] = {
|
| 364 |
+
"text": page_text,
|
| 365 |
+
"title": src_info.get("title", ""),
|
| 366 |
+
"source": src_info.get("source", ""),
|
| 367 |
+
}
|
| 368 |
if result is not None:
|
| 369 |
verified.append(result)
|
| 370 |
total_sim += sim
|
|
|
|
| 373 |
|
| 374 |
verified.sort(key=lambda x: x.get("similarity", 0), reverse=True)
|
| 375 |
avg_sim = total_sim / len(verified) if verified else 0.0
|
| 376 |
+
return verified, avg_sim, crawled_texts
|
| 377 |
|
| 378 |
|
| 379 |
def _find_matching_sentences(original, source_text, threshold=0.3):
|
|
|
|
| 391 |
return matched
|
| 392 |
|
| 393 |
|
| 394 |
+
# ============================================
|
| 395 |
+
# 벡터 DB 기반 정밀 표절 검사
|
| 396 |
+
# ============================================
|
| 397 |
+
|
| 398 |
+
def _chunk_text(text, chunk_size=200, overlap=50):
    """Split text into chunks while preserving sentence boundaries.

    Sentences are accumulated into a buffer until the running character
    count reaches chunk_size; the buffer is then flushed as one chunk.
    When overlap > 0, the trailing sentence of a flushed chunk is carried
    over into the next buffer to provide context overlap.
    """
    chunks = []
    buf, buf_len = [], 0
    for sentence in _split_sentences(text):
        buf.append(sentence)
        buf_len += len(sentence)
        if buf_len < chunk_size:
            continue
        chunks.append(' '.join(buf))
        # Keep the last sentence as the seed of the next chunk (overlap).
        if overlap > 0 and len(buf) > 1:
            buf = buf[-1:]
            buf_len = len(buf[0])
        else:
            buf, buf_len = [], 0
    # Flush whatever remains below the size threshold.
    if buf:
        chunks.append(' '.join(buf))
    return chunks
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
class VectorPlagiarismDB:
    """In-memory vector store of crawled source pages for sentence-level
    plagiarism lookup.

    Crawled page texts are chunked (via `_chunk_text`) and indexed into an
    ephemeral ChromaDB collection; input sentences are then queried against
    it to find the closest source chunks by cosine similarity. Degrades
    gracefully (`available` is False, all methods return empty results)
    when chromadb is not installed or fails to initialise.
    """

    def __init__(self):
        # Define state unconditionally so every method can guard safely,
        # regardless of which failure path was taken below.
        self.available = False
        self._doc_count = 0  # chunks indexed during this session
        if not HAS_CHROMADB:
            return
        try:
            self.client = chromadb.Client()
            # Drop any collection left over from a previous run in this
            # process: get_or_create_collection would otherwise return stale
            # chunks from an earlier document while _doc_count restarts at
            # 0, contaminating matches and under-reporting counts.
            try:
                self.client.delete_collection("plag_sources")
            except Exception:
                pass  # no previous collection — nothing to clean up
            self.collection = self.client.create_collection(
                name="plag_sources",
                metadata={"hnsw:space": "cosine"},
            )
            self.available = True
        except Exception as e:
            print(f"ChromaDB 초기화 실패: {e}")
            self.available = False

    def index_source(self, url, title, text, source_type="Web"):
        """Chunk a source page's text and add it to the collection.

        Returns the number of chunks indexed (0 when the DB is unavailable,
        the text is too short, or indexing fails).
        """
        if not self.available or not text or len(text.strip()) < 30:
            return 0
        try:
            chunks = _chunk_text(text, chunk_size=200, overlap=50)
            if not chunks:
                return 0

            # Stable per-URL id prefix so chunk ids are unique across sources.
            doc_hash = hashlib.md5(url.encode()).hexdigest()[:10]
            ids = []
            documents = []
            metadatas = []

            for i, chunk in enumerate(chunks[:50]):  # cap: 50 chunks per source
                ids.append(f"{doc_hash}_{i}")
                documents.append(chunk)
                metadatas.append({
                    "url": url[:200],
                    "title": title[:100],
                    "source": source_type,
                    "chunk_idx": i,
                })

            self.collection.add(ids=ids, documents=documents, metadatas=metadatas)
            self._doc_count += len(ids)
            return len(ids)
        except Exception as e:
            print(f"벡터DB 인덱싱 오류: {e}")
            return 0

    def query_sentence(self, sentence, n_results=3):
        """Return the most similar indexed chunks for a single sentence.

        Each match is a dict with percentage `similarity`, `url`, `title`,
        `source`, and a `matched_text` excerpt. Matches below 30%
        similarity are dropped; empty list when the DB is unavailable/empty.
        """
        if not self.available or self._doc_count == 0:
            return []
        try:
            results = self.collection.query(
                query_texts=[sentence],
                n_results=min(n_results, self._doc_count),
            )
            matches = []
            if results and results['distances'] and results['distances'][0]:
                for j, dist in enumerate(results['distances'][0]):
                    # ChromaDB cosine distance: 0 = identical, 2 = opposite
                    # similarity = 1 - (distance / 2)
                    similarity = max(0, 1 - dist / 2)
                    if similarity < 0.3:  # ignore matches under 30%
                        continue
                    meta = results['metadatas'][0][j] if results['metadatas'] else {}
                    doc_text = results['documents'][0][j] if results['documents'] else ""
                    matches.append({
                        "similarity": round(similarity * 100, 1),
                        "url": meta.get("url", ""),
                        "title": meta.get("title", ""),
                        "source": meta.get("source", ""),
                        "matched_text": doc_text[:150],
                    })
            return matches
        except Exception as e:
            print(f"벡터DB 쿼리 오류: {e}")
            return []

    def check_document(self, text, min_similarity=30):
        """Check a whole document sentence by sentence.

        Returns (flagged, plag_ratio, source_hits):
          flagged      — list of suspect sentences with their best match
          plag_ratio   — percentage of checked sentences that were flagged
          source_hits  — URL → number of flagged sentences matching it
        Sentences shorter than 15 characters are skipped entirely.
        """
        if not self.available or self._doc_count == 0:
            return [], 0.0, {}

        flagged = []       # suspected plagiarised sentences
        checked_count = 0  # sentences actually queried
        source_hits = {}   # URL → hit count

        for sent in _split_sentences(text):
            if len(sent) < 15:
                continue
            checked_count += 1
            matches = self.query_sentence(sent, n_results=3)
            if not matches:
                continue

            best = matches[0]
            if best["similarity"] >= min_similarity:
                flagged.append({
                    "sentence": sent[:80],
                    "similarity": best["similarity"],
                    "url": best["url"],
                    "title": best["title"],
                    "matched_text": best["matched_text"][:100],
                })
                url = best["url"]
                source_hits[url] = source_hits.get(url, 0) + 1

        # Plagiarism ratio: share of checked sentences that were flagged.
        plag_ratio = (len(flagged) / checked_count * 100) if checked_count > 0 else 0

        return flagged, plag_ratio, source_hits

    def get_stats(self):
        """Return the DB status: availability flag and indexed chunk count."""
        if not self.available:
            return {"available": False, "chunks": 0}
        return {"available": True, "chunks": self._doc_count}
|
| 550 |
+
|
| 551 |
+
|
| 552 |
# ============================================
|
| 553 |
# 핵심 구문 추출
|
| 554 |
# ============================================
|
|
|
|
| 791 |
log_lines.append(f"[수집] 총 {len(raw_sources)}건 (검증 전)")
|
| 792 |
|
| 793 |
# 실제 크롤링하여 원문과 대조 → 유사도 미달 출처 제거
|
| 794 |
+
verified_sources, avg_similarity, crawled_texts = _verify_sources_parallel(
|
| 795 |
text, raw_sources, min_similarity=0.02, max_workers=8
|
| 796 |
)
|
| 797 |
|
|
|
|
| 799 |
verified_urls = {s["url"] for s in verified_sources}
|
| 800 |
unverified_sources = [s for s in raw_sources if s["url"] not in verified_urls]
|
| 801 |
|
| 802 |
+
log_lines.append(f"[검증] 통과={len(verified_sources)}건, 미검증={len(unverified_sources)}건, 크롤링={len(crawled_texts)}건, 평균유사도={avg_similarity:.3f}")
|
| 803 |
|
| 804 |
# ═══════════════════════════════════════
|
| 805 |
+
# PHASE 5: 벡터DB 문장별 정밀 대조
|
| 806 |
# ═══════════════════════════════════════
|
| 807 |
+
vector_flagged = []
|
| 808 |
+
vector_plag_ratio = 0.0
|
| 809 |
+
vector_source_hits = {}
|
| 810 |
+
vdb = None
|
| 811 |
+
|
| 812 |
+
if HAS_CHROMADB and crawled_texts:
|
| 813 |
+
_prog(0.80, "⑤ 벡터DB 문장별 정밀 대조...")
|
| 814 |
+
try:
|
| 815 |
+
vdb = VectorPlagiarismDB()
|
| 816 |
+
if vdb.available:
|
| 817 |
+
# 크롤링된 모든 출처 본문을 벡터DB에 인덱싱
|
| 818 |
+
indexed_count = 0
|
| 819 |
+
for url, info in crawled_texts.items():
|
| 820 |
+
n = vdb.index_source(
|
| 821 |
+
url=url,
|
| 822 |
+
title=info.get("title", ""),
|
| 823 |
+
text=info["text"],
|
| 824 |
+
source_type=info.get("source", "Web"),
|
| 825 |
+
)
|
| 826 |
+
indexed_count += n
|
| 827 |
+
|
| 828 |
+
log_lines.append(f"[벡터DB] {len(crawled_texts)}개 출처 → {indexed_count}개 청크 인덱싱")
|
| 829 |
+
|
| 830 |
+
if indexed_count > 0:
|
| 831 |
+
# 입력 텍스트를 문장별로 벡터DB 검색
|
| 832 |
+
vector_flagged, vector_plag_ratio, vector_source_hits = vdb.check_document(
|
| 833 |
+
text, min_similarity=35
|
| 834 |
+
)
|
| 835 |
+
log_lines.append(
|
| 836 |
+
f"[벡터DB] 표절 문장={len(vector_flagged)}건, "
|
| 837 |
+
f"문장표절율={vector_plag_ratio:.1f}%, "
|
| 838 |
+
f"히트출처={len(vector_source_hits)}건"
|
| 839 |
+
)
|
| 840 |
+
except Exception as e:
|
| 841 |
+
log_lines.append(f"[벡터DB] 오류: {str(e)[:80]}")
|
| 842 |
+
elif not HAS_CHROMADB:
|
| 843 |
+
log_lines.append("[벡터DB] chromadb 미설치 — 건너뜀")
|
| 844 |
+
else:
|
| 845 |
+
log_lines.append("[벡터DB] 크롤링 데이터 없음 — 건너뜀")
|
| 846 |
+
|
| 847 |
+
# ═══════════════════════════════════════
|
| 848 |
+
# 종합 판정 (n-gram + 벡터DB 복합)
|
| 849 |
+
# ═══════════════════════════════════════
|
| 850 |
+
_prog(0.90, "보고서 생성...")
|
| 851 |
|
| 852 |
all_sources = verified_sources # 검증된 출처만 표시
|
| 853 |
|
| 854 |
+
# 표절율 산출: 다층 검증
|
| 855 |
+
# ① n-gram 기반 (기존)
|
| 856 |
max_sim = max((s.get("similarity", 0) for s in verified_sources), default=0)
|
|
|
|
| 857 |
count_factor = min(len(verified_sources) * 3, 30)
|
|
|
|
| 858 |
avg_factor = avg_similarity * 100
|
| 859 |
+
ngram_score = min(round(max_sim * 0.4 + avg_factor * 0.3 + count_factor * 0.3), 100)
|
| 860 |
+
|
| 861 |
+
# ② 벡터DB 기반 (문장별 매칭)
|
| 862 |
+
vector_score = round(vector_plag_ratio) if vector_flagged else 0
|
| 863 |
|
| 864 |
+
# 복합 점수: 벡터DB가 있으면 50:50, 없으면 n-gram만
|
| 865 |
+
if vector_flagged:
|
| 866 |
+
combined_score = round(ngram_score * 0.4 + vector_score * 0.6)
|
| 867 |
+
else:
|
| 868 |
+
combined_score = ngram_score
|
| 869 |
|
| 870 |
+
# Gemini 표절율은 참고용 (20% 가중, 검증 출처가 있을 때만)
|
|
|
|
| 871 |
if gemini_pct > 0 and len(verified_sources) > 0:
|
| 872 |
+
plag_pct = min(round(gemini_pct * 0.2 + combined_score * 0.8), 100)
|
| 873 |
elif len(verified_sources) > 0:
|
| 874 |
+
plag_pct = combined_score
|
| 875 |
else:
|
|
|
|
| 876 |
plag_pct = min(round(gemini_pct * 0.1), 20)
|
| 877 |
|
| 878 |
+
log_lines.append(
|
| 879 |
+
f"[판정] Gemini={gemini_pct}%, n-gram={ngram_score}%, "
|
| 880 |
+
f"벡터={vector_score}%, → 종합={plag_pct}%"
|
| 881 |
+
)
|
| 882 |
|
| 883 |
if plag_pct >= 50:
|
| 884 |
grade, gc = "🚨 표절 의심", "#FF4444"
|
|
|
|
| 946 |
</details>
|
| 947 |
</div>"""
|
| 948 |
|
| 949 |
+
# 벡터DB 문장별 매칭 결과
|
| 950 |
+
vector_section = ""
|
| 951 |
+
if vector_flagged:
|
| 952 |
+
vf_rows = ""
|
| 953 |
+
for k, vf in enumerate(vector_flagged[:15]):
|
| 954 |
+
sim_val = vf["similarity"]
|
| 955 |
+
sim_color = "#FF4444" if sim_val >= 70 else "#FF8800" if sim_val >= 50 else "#DDAA00"
|
| 956 |
+
sent_safe = vf["sentence"][:70].replace('<', '<')
|
| 957 |
+
matched_safe = vf["matched_text"][:90].replace('<', '<')
|
| 958 |
+
title_safe = vf["title"][:40].replace('<', '<')
|
| 959 |
+
vf_rows += f"""<tr style="border-bottom:1px solid #F0F0F0;">
|
| 960 |
+
<td style="padding:6px;text-align:center;font-size:10px;color:#666;">{k+1}</td>
|
| 961 |
+
<td style="padding:6px;font-size:10px;color:#333;">{sent_safe}</td>
|
| 962 |
+
<td style="padding:6px;text-align:center;"><span style="font-weight:700;color:{sim_color};">{sim_val:.0f}%</span></td>
|
| 963 |
+
<td style="padding:6px;font-size:9px;color:#666;">{matched_safe}</td>
|
| 964 |
+
<td style="padding:6px;font-size:9px;"><a href="{vf['url']}" target="_blank" rel="noopener noreferrer" style="color:#2E86C1;text-decoration:none;">{title_safe}</a></td>
|
| 965 |
+
</tr>"""
|
| 966 |
+
|
| 967 |
+
vdb_stats = vdb.get_stats() if vdb else {"chunks": 0}
|
| 968 |
+
vector_section = f"""
|
| 969 |
+
<div style="padding:16px 24px;border-bottom:1px solid #E0E0E0;">
|
| 970 |
+
<details open>
|
| 971 |
+
<summary style="cursor:pointer;font-size:13px;font-weight:700;color:#1A3C6E;">
|
| 972 |
+
🧬 벡터DB 문장별 정밀 대조 — 표절 문장 {len(vector_flagged)}건 / 문장표절율 {vector_plag_ratio:.1f}%
|
| 973 |
+
<span style="font-size:10px;font-weight:400;color:#888;margin-left:8px;">(DB: {vdb_stats['chunks']}청크)</span>
|
| 974 |
+
</summary>
|
| 975 |
+
<table style="width:100%;border-collapse:collapse;font-size:11px;margin-top:10px;">
|
| 976 |
+
<thead>
|
| 977 |
+
<tr style="background:#E8EAF6;color:#333;">
|
| 978 |
+
<th style="padding:8px;width:30px;">#</th>
|
| 979 |
+
<th style="padding:8px;text-align:left;">입력 문장</th>
|
| 980 |
+
<th style="padding:8px;width:55px;">유사도</th>
|
| 981 |
+
<th style="padding:8px;text-align:left;">매칭 출처 내용</th>
|
| 982 |
+
<th style="padding:8px;width:120px;">출처</th>
|
| 983 |
+
</tr>
|
| 984 |
+
</thead>
|
| 985 |
+
<tbody>{vf_rows}</tbody>
|
| 986 |
+
</table>
|
| 987 |
+
</details>
|
| 988 |
+
</div>"""
|
| 989 |
+
|
| 990 |
HDR = '#3B7DD8'
|
| 991 |
html = f"""<div style="font-family:'Noto Sans KR',sans-serif;max-width:900px;margin:20px auto;background:#fff;border:1px solid #E0E0E0;border-radius:8px;box-shadow:0 2px 8px rgba(0,0,0,0.06);">
|
| 992 |
<div style="background:linear-gradient(135deg,{HDR},#4A8DE0);padding:24px;color:#fff;border-radius:8px 8px 0 0;">
|
| 993 |
<div style="display:flex;justify-content:space-between;align-items:center;">
|
| 994 |
<div>
|
| 995 |
<div style="font-size:24px;font-weight:900;">표절 검사 결과</div>
|
| 996 |
+
<div style="font-size:12px;opacity:0.9;margin-top:4px;">Gemini + Brave + KCI·RISS·arXiv + 벡터DB 정밀대조</div>
|
| 997 |
</div>
|
| 998 |
<div style="text-align:right;font-size:11px;opacity:0.9;">
|
| 999 |
<div>문서: {doc_id}</div>
|
|
|
|
| 1030 |
<span><span style="color:#888;">단어수</span> <b>{word_count:,}단어</b></span>
|
| 1031 |
<span><span style="color:#888;">검색엔진</span> <b>Google + Brave + KCI·RISS·arXiv</b></span>
|
| 1032 |
</div>
|
| 1033 |
+
</div>{gemini_summary}{vector_section}
|
| 1034 |
+
<div style="padding:24px;border-bottom:1px solid #E0E0E0;">
|
| 1035 |
<div style="font-size:13px;font-weight:700;color:#1A3C6E;margin-bottom:12px;">🔍 검증된 출처 ({len(all_sources)}건)</div>
|
| 1036 |
<table style="width:100%;border-collapse:collapse;font-size:11px;">
|
| 1037 |
<thead>
|