Spaces:
Running
Running
Update plagiarism_check.py
Browse files- plagiarism_check.py +300 -27
plagiarism_check.py
CHANGED
|
@@ -16,6 +16,18 @@ try:
|
|
| 16 |
except ImportError:
|
| 17 |
HAS_HTTPX = False
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
try:
|
| 20 |
from google import genai
|
| 21 |
from google.genai import types as gtypes
|
|
@@ -308,38 +320,51 @@ def _fetch_page_text(url, timeout=8):
|
|
| 308 |
|
| 309 |
|
| 310 |
def _verify_sources_parallel(original_text, sources, min_similarity=0.02, max_workers=8):
|
| 311 |
-
"""출처 URL을 크롤링하여 원문과 대조 검증 — 유사도 미달 출처 제거"""
|
| 312 |
if not sources:
|
| 313 |
-
return [], 0.0
|
| 314 |
|
| 315 |
verified = []
|
| 316 |
total_sim = 0.0
|
|
|
|
| 317 |
|
| 318 |
def _check_one(src):
|
| 319 |
url = src.get("url", "")
|
| 320 |
snippet = src.get("snippet", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
if snippet and len(snippet) > 20:
|
| 323 |
sim = _text_similarity(original_text[:500], snippet, n=2)
|
| 324 |
if sim >= min_similarity:
|
| 325 |
-
return {**src, "similarity": round(sim * 100, 1), "method": "snippet"}, sim
|
| 326 |
|
| 327 |
-
page_text = _fetch_page_text(url, timeout=6)
|
| 328 |
if page_text and len(page_text) > 50:
|
| 329 |
jaccard = _text_similarity(original_text, page_text, n=3)
|
| 330 |
containment = _containment_similarity(original_text, page_text, n=3)
|
| 331 |
sim = max(jaccard, containment)
|
| 332 |
if sim >= min_similarity:
|
| 333 |
matched = _find_matching_sentences(original_text, page_text)
|
| 334 |
-
return {**src, "similarity": round(sim * 100, 1), "method": "crawl", "matched": matched}, sim
|
| 335 |
|
| 336 |
-
return None, 0.0
|
| 337 |
|
| 338 |
with ThreadPoolExecutor(max_workers=min(max_workers, len(sources))) as executor:
|
| 339 |
futures = {executor.submit(_check_one, src): src for src in sources[:20]}
|
| 340 |
for future in as_completed(futures, timeout=30):
|
| 341 |
try:
|
| 342 |
-
result, sim = future.result()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
if result is not None:
|
| 344 |
verified.append(result)
|
| 345 |
total_sim += sim
|
|
@@ -348,7 +373,7 @@ def _verify_sources_parallel(original_text, sources, min_similarity=0.02, max_wo
|
|
| 348 |
|
| 349 |
verified.sort(key=lambda x: x.get("similarity", 0), reverse=True)
|
| 350 |
avg_sim = total_sim / len(verified) if verified else 0.0
|
| 351 |
-
return verified, avg_sim
|
| 352 |
|
| 353 |
|
| 354 |
def _find_matching_sentences(original, source_text, threshold=0.3):
|
|
@@ -366,6 +391,164 @@ def _find_matching_sentences(original, source_text, threshold=0.3):
|
|
| 366 |
return matched
|
| 367 |
|
| 368 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
# ============================================
|
| 370 |
# 핵심 구문 추출
|
| 371 |
# ============================================
|
|
@@ -608,7 +791,7 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 608 |
log_lines.append(f"[수집] 총 {len(raw_sources)}건 (검증 전)")
|
| 609 |
|
| 610 |
# 실제 크롤링하여 원문과 대조 → 유사도 미달 출처 제거
|
| 611 |
-
verified_sources, avg_similarity = _verify_sources_parallel(
|
| 612 |
text, raw_sources, min_similarity=0.02, max_workers=8
|
| 613 |
)
|
| 614 |
|
|
@@ -616,37 +799,86 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 616 |
verified_urls = {s["url"] for s in verified_sources}
|
| 617 |
unverified_sources = [s for s in raw_sources if s["url"] not in verified_urls]
|
| 618 |
|
| 619 |
-
log_lines.append(f"[검증] 통과={len(verified_sources)}건, 미검증={len(unverified_sources)}건, 평균유사도={avg_similarity:.3f}")
|
| 620 |
|
| 621 |
# ═══════════════════════════════════════
|
| 622 |
-
#
|
| 623 |
# ═══════════════════════════════════════
|
| 624 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
|
| 626 |
all_sources = verified_sources # 검증된 출처만 표시
|
| 627 |
|
| 628 |
-
# 표절율 산출:
|
| 629 |
-
# ①
|
| 630 |
max_sim = max((s.get("similarity", 0) for s in verified_sources), default=0)
|
| 631 |
-
# ② 검증된 출처 수 기반 보정
|
| 632 |
count_factor = min(len(verified_sources) * 3, 30)
|
| 633 |
-
# ③ 평균 유사도 반영
|
| 634 |
avg_factor = avg_similarity * 100
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
|
| 636 |
-
#
|
| 637 |
-
|
|
|
|
|
|
|
|
|
|
| 638 |
|
| 639 |
-
# Gemini 표절율은 참고용
|
| 640 |
-
# 단, Gemini가 0%인데 유사 출처가 있으면 유사도 기반만 사용
|
| 641 |
if gemini_pct > 0 and len(verified_sources) > 0:
|
| 642 |
-
plag_pct = min(round(gemini_pct * 0.
|
| 643 |
elif len(verified_sources) > 0:
|
| 644 |
-
plag_pct =
|
| 645 |
else:
|
| 646 |
-
# 검증된 출처 없음 → Gemini 값도 크게 할인
|
| 647 |
plag_pct = min(round(gemini_pct * 0.1), 20)
|
| 648 |
|
| 649 |
-
log_lines.append(
|
|
|
|
|
|
|
|
|
|
| 650 |
|
| 651 |
if plag_pct >= 50:
|
| 652 |
grade, gc = "🚨 표절 의심", "#FF4444"
|
|
@@ -714,13 +946,54 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 714 |
</details>
|
| 715 |
</div>"""
|
| 716 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 717 |
HDR = '#3B7DD8'
|
| 718 |
html = f"""<div style="font-family:'Noto Sans KR',sans-serif;max-width:900px;margin:20px auto;background:#fff;border:1px solid #E0E0E0;border-radius:8px;box-shadow:0 2px 8px rgba(0,0,0,0.06);">
|
| 719 |
<div style="background:linear-gradient(135deg,{HDR},#4A8DE0);padding:24px;color:#fff;border-radius:8px 8px 0 0;">
|
| 720 |
<div style="display:flex;justify-content:space-between;align-items:center;">
|
| 721 |
<div>
|
| 722 |
<div style="font-size:24px;font-weight:900;">표절 검사 결과</div>
|
| 723 |
-
<div style="font-size:12px;opacity:0.9;margin-top:4px;">Gemini
|
| 724 |
</div>
|
| 725 |
<div style="text-align:right;font-size:11px;opacity:0.9;">
|
| 726 |
<div>문서: {doc_id}</div>
|
|
@@ -757,8 +1030,8 @@ def run_plagiarism(text, progress=gr.Progress()):
|
|
| 757 |
<span><span style="color:#888;">단어수</span> <b>{word_count:,}단어</b></span>
|
| 758 |
<span><span style="color:#888;">검색엔진</span> <b>Google + Brave + KCI·RISS·arXiv</b></span>
|
| 759 |
</div>
|
| 760 |
-
</div>{gemini_summary}
|
| 761 |
-
<div style="padding:24px;">
|
| 762 |
<div style="font-size:13px;font-weight:700;color:#1A3C6E;margin-bottom:12px;">🔍 검증된 출처 ({len(all_sources)}건)</div>
|
| 763 |
<table style="width:100%;border-collapse:collapse;font-size:11px;">
|
| 764 |
<thead>
|
|
|
|
| 16 |
except ImportError:
|
| 17 |
HAS_HTTPX = False
|
| 18 |
|
| 19 |
+
try:
|
| 20 |
+
import chromadb
|
| 21 |
+
HAS_CHROMADB = True
|
| 22 |
+
except ImportError:
|
| 23 |
+
HAS_CHROMADB = False
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
import numpy as np
|
| 27 |
+
HAS_NUMPY = True
|
| 28 |
+
except ImportError:
|
| 29 |
+
HAS_NUMPY = False
|
| 30 |
+
|
| 31 |
try:
|
| 32 |
from google import genai
|
| 33 |
from google.genai import types as gtypes
|
|
|
|
| 320 |
|
| 321 |
|
| 322 |
def _verify_sources_parallel(original_text, sources, min_similarity=0.02, max_workers=8):
|
| 323 |
+
"""출처 URL을 크롤링하여 원문과 대조 검증 — 유사도 미달 출처 제거, 크롤링 본문도 반환"""
|
| 324 |
if not sources:
|
| 325 |
+
return [], 0.0, {}
|
| 326 |
|
| 327 |
verified = []
|
| 328 |
total_sim = 0.0
|
| 329 |
+
crawled_texts = {} # url → page_text (벡터DB 인덱싱용)
|
| 330 |
|
| 331 |
def _check_one(src):
|
| 332 |
url = src.get("url", "")
|
| 333 |
snippet = src.get("snippet", "")
|
| 334 |
+
page_text = ""
|
| 335 |
+
|
| 336 |
+
# snippet만으로 검증되는 경우에도 본문 크롤링 시도
|
| 337 |
+
page_text = _fetch_page_text(url, timeout=6)
|
| 338 |
|
| 339 |
if snippet and len(snippet) > 20:
|
| 340 |
sim = _text_similarity(original_text[:500], snippet, n=2)
|
| 341 |
if sim >= min_similarity:
|
| 342 |
+
return {**src, "similarity": round(sim * 100, 1), "method": "snippet"}, sim, page_text
|
| 343 |
|
|
|
|
| 344 |
if page_text and len(page_text) > 50:
|
| 345 |
jaccard = _text_similarity(original_text, page_text, n=3)
|
| 346 |
containment = _containment_similarity(original_text, page_text, n=3)
|
| 347 |
sim = max(jaccard, containment)
|
| 348 |
if sim >= min_similarity:
|
| 349 |
matched = _find_matching_sentences(original_text, page_text)
|
| 350 |
+
return {**src, "similarity": round(sim * 100, 1), "method": "crawl", "matched": matched}, sim, page_text
|
| 351 |
|
| 352 |
+
return None, 0.0, page_text
|
| 353 |
|
| 354 |
with ThreadPoolExecutor(max_workers=min(max_workers, len(sources))) as executor:
|
| 355 |
futures = {executor.submit(_check_one, src): src for src in sources[:20]}
|
| 356 |
for future in as_completed(futures, timeout=30):
|
| 357 |
try:
|
| 358 |
+
result, sim, page_text = future.result()
|
| 359 |
+
src_info = futures[future]
|
| 360 |
+
url = src_info.get("url", "")
|
| 361 |
+
# 크롤링 본문 저장 (검증 여부 무관하게)
|
| 362 |
+
if page_text and len(page_text) > 50 and url:
|
| 363 |
+
crawled_texts[url] = {
|
| 364 |
+
"text": page_text,
|
| 365 |
+
"title": src_info.get("title", ""),
|
| 366 |
+
"source": src_info.get("source", ""),
|
| 367 |
+
}
|
| 368 |
if result is not None:
|
| 369 |
verified.append(result)
|
| 370 |
total_sim += sim
|
|
|
|
| 373 |
|
| 374 |
verified.sort(key=lambda x: x.get("similarity", 0), reverse=True)
|
| 375 |
avg_sim = total_sim / len(verified) if verified else 0.0
|
| 376 |
+
return verified, avg_sim, crawled_texts
|
| 377 |
|
| 378 |
|
| 379 |
def _find_matching_sentences(original, source_text, threshold=0.3):
|
|
|
|
| 391 |
return matched
|
| 392 |
|
| 393 |
|
| 394 |
+
# ============================================
|
| 395 |
+
# 벡터 DB 기반 정밀 표절 검사
|
| 396 |
+
# ============================================
|
| 397 |
+
|
| 398 |
+
def _chunk_text(text, chunk_size=200, overlap=50):
    """Split text into chunks while preserving sentence boundaries.

    Sentences are accumulated into a buffer until the running character
    count reaches chunk_size; the buffer is then flushed as one chunk.
    When overlap > 0, the trailing sentence of a flushed chunk is carried
    over into the next buffer to provide context overlap.
    """
    chunks = []
    buf, buf_len = [], 0
    for sentence in _split_sentences(text):
        buf.append(sentence)
        buf_len += len(sentence)
        if buf_len < chunk_size:
            continue
        chunks.append(' '.join(buf))
        # Keep the last sentence as the seed of the next chunk (overlap).
        if overlap > 0 and len(buf) > 1:
            buf = buf[-1:]
            buf_len = len(buf[0])
        else:
            buf, buf_len = [], 0
    # Flush whatever remains below the size threshold.
    if buf:
        chunks.append(' '.join(buf))
    return chunks
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
class VectorPlagiarismDB:
    """In-memory vector store of crawled source pages for sentence-level
    plagiarism lookup.

    Crawled page texts are chunked (via `_chunk_text`) and indexed into an
    ephemeral ChromaDB collection; input sentences are then queried against
    it to find the closest source chunks by cosine similarity. Degrades
    gracefully (`available` is False, all methods return empty results)
    when chromadb is not installed or fails to initialise.
    """

    def __init__(self):
        # Define state unconditionally so every method can guard safely,
        # regardless of which failure path was taken below.
        self.available = False
        self._doc_count = 0  # chunks indexed during this session
        if not HAS_CHROMADB:
            return
        try:
            self.client = chromadb.Client()
            # Drop any collection left over from a previous run in this
            # process: get_or_create_collection would otherwise return stale
            # chunks from an earlier document while _doc_count restarts at
            # 0, contaminating matches and under-reporting counts.
            try:
                self.client.delete_collection("plag_sources")
            except Exception:
                pass  # no previous collection — nothing to clean up
            self.collection = self.client.create_collection(
                name="plag_sources",
                metadata={"hnsw:space": "cosine"},
            )
            self.available = True
        except Exception as e:
            print(f"ChromaDB 초기화 실패: {e}")
            self.available = False

    def index_source(self, url, title, text, source_type="Web"):
        """Chunk a source page's text and add it to the collection.

        Returns the number of chunks indexed (0 when the DB is unavailable,
        the text is too short, or indexing fails).
        """
        if not self.available or not text or len(text.strip()) < 30:
            return 0
        try:
            chunks = _chunk_text(text, chunk_size=200, overlap=50)
            if not chunks:
                return 0

            # Stable per-URL id prefix so chunk ids are unique across sources.
            doc_hash = hashlib.md5(url.encode()).hexdigest()[:10]
            ids = []
            documents = []
            metadatas = []

            for i, chunk in enumerate(chunks[:50]):  # cap: 50 chunks per source
                ids.append(f"{doc_hash}_{i}")
                documents.append(chunk)
                metadatas.append({
                    "url": url[:200],
                    "title": title[:100],
                    "source": source_type,
                    "chunk_idx": i,
                })

            self.collection.add(ids=ids, documents=documents, metadatas=metadatas)
            self._doc_count += len(ids)
            return len(ids)
        except Exception as e:
            print(f"벡터DB 인덱싱 오류: {e}")
            return 0

    def query_sentence(self, sentence, n_results=3):
        """Return the most similar indexed chunks for a single sentence.

        Each match is a dict with percentage `similarity`, `url`, `title`,
        `source`, and a `matched_text` excerpt. Matches below 30%
        similarity are dropped; empty list when the DB is unavailable/empty.
        """
        if not self.available or self._doc_count == 0:
            return []
        try:
            results = self.collection.query(
                query_texts=[sentence],
                n_results=min(n_results, self._doc_count),
            )
            matches = []
            if results and results['distances'] and results['distances'][0]:
                for j, dist in enumerate(results['distances'][0]):
                    # ChromaDB cosine distance: 0 = identical, 2 = opposite
                    # similarity = 1 - (distance / 2)
                    similarity = max(0, 1 - dist / 2)
                    if similarity < 0.3:  # ignore matches under 30%
                        continue
                    meta = results['metadatas'][0][j] if results['metadatas'] else {}
                    doc_text = results['documents'][0][j] if results['documents'] else ""
                    matches.append({
                        "similarity": round(similarity * 100, 1),
                        "url": meta.get("url", ""),
                        "title": meta.get("title", ""),
                        "source": meta.get("source", ""),
                        "matched_text": doc_text[:150],
                    })
            return matches
        except Exception as e:
            print(f"벡터DB 쿼리 오류: {e}")
            return []

    def check_document(self, text, min_similarity=30):
        """Check a whole document sentence by sentence.

        Returns (flagged, plag_ratio, source_hits):
          flagged      — list of suspect sentences with their best match
          plag_ratio   — percentage of checked sentences that were flagged
          source_hits  — URL → number of flagged sentences matching it
        Sentences shorter than 15 characters are skipped entirely.
        """
        if not self.available or self._doc_count == 0:
            return [], 0.0, {}

        flagged = []       # suspected plagiarised sentences
        checked_count = 0  # sentences actually queried
        source_hits = {}   # URL → hit count

        for sent in _split_sentences(text):
            if len(sent) < 15:
                continue
            checked_count += 1
            matches = self.query_sentence(sent, n_results=3)
            if not matches:
                continue

            best = matches[0]
            if best["similarity"] >= min_similarity:
                flagged.append({
                    "sentence": sent[:80],
                    "similarity": best["similarity"],
                    "url": best["url"],
                    "title": best["title"],
                    "matched_text": best["matched_text"][:100],
                })
                url = best["url"]
                source_hits[url] = source_hits.get(url, 0) + 1

        # Plagiarism ratio: share of checked sentences that were flagged.
        plag_ratio = (len(flagged) / checked_count * 100) if checked_count > 0 else 0

        return flagged, plag_ratio, source_hits

    def get_stats(self):
        """Return the DB status: availability flag and indexed chunk count."""
        if not self.available:
            return {"available": False, "chunks": 0}
        return {"available": True, "chunks": self._doc_count}
|
| 550 |
+
|
| 551 |
+
|
| 552 |
# ============================================
|
| 553 |
# 핵심 구문 추출
|
| 554 |
# ============================================
|
|
|
|
| 791 |
log_lines.append(f"[수집] 총 {len(raw_sources)}건 (검증 전)")
|
| 792 |
|
| 793 |
# 실제 크롤링하여 원문과 대조 → 유사도 미달 출처 제거
|
| 794 |
+
verified_sources, avg_similarity, crawled_texts = _verify_sources_parallel(
|
| 795 |
text, raw_sources, min_similarity=0.02, max_workers=8
|
| 796 |
)
|
| 797 |
|
|
|
|
| 799 |
verified_urls = {s["url"] for s in verified_sources}
|
| 800 |
unverified_sources = [s for s in raw_sources if s["url"] not in verified_urls]
|
| 801 |
|
| 802 |
+
log_lines.append(f"[검증] 통과={len(verified_sources)}건, 미검증={len(unverified_sources)}건, 크롤링={len(crawled_texts)}건, 평균유사도={avg_similarity:.3f}")
|
| 803 |
|
| 804 |
# ═══════════════════════════════════════
|
| 805 |
+
# PHASE 5: 벡터DB 문장별 정밀 대조
|
| 806 |
# ═══════════════════════════════════════
|
| 807 |
+
vector_flagged = []
|
| 808 |
+
vector_plag_ratio = 0.0
|
| 809 |
+
vector_source_hits = {}
|
| 810 |
+
vdb = None
|
| 811 |
+
|
| 812 |
+
if HAS_CHROMADB and crawled_texts:
|
| 813 |
+
_prog(0.80, "⑤ 벡터DB 문장별 정밀 대조...")
|
| 814 |
+
try:
|
| 815 |
+
vdb = VectorPlagiarismDB()
|
| 816 |
+
if vdb.available:
|
| 817 |
+
# 크롤링된 모든 출처 본문을 벡터DB에 인덱싱
|
| 818 |
+
indexed_count = 0
|
| 819 |
+
for url, info in crawled_texts.items():
|
| 820 |
+
n = vdb.index_source(
|
| 821 |
+
url=url,
|
| 822 |
+
title=info.get("title", ""),
|
| 823 |
+
text=info["text"],
|
| 824 |
+
source_type=info.get("source", "Web"),
|
| 825 |
+
)
|
| 826 |
+
indexed_count += n
|
| 827 |
+
|
| 828 |
+
log_lines.append(f"[벡터DB] {len(crawled_texts)}개 출처 → {indexed_count}개 청크 인덱싱")
|
| 829 |
+
|
| 830 |
+
if indexed_count > 0:
|
| 831 |
+
# 입력 텍스트를 문장별로 벡터DB 검색
|
| 832 |
+
vector_flagged, vector_plag_ratio, vector_source_hits = vdb.check_document(
|
| 833 |
+
text, min_similarity=35
|
| 834 |
+
)
|
| 835 |
+
log_lines.append(
|
| 836 |
+
f"[벡터DB] 표절 문장={len(vector_flagged)}건, "
|
| 837 |
+
f"문장표절율={vector_plag_ratio:.1f}%, "
|
| 838 |
+
f"히트출처={len(vector_source_hits)}건"
|
| 839 |
+
)
|
| 840 |
+
except Exception as e:
|
| 841 |
+
log_lines.append(f"[벡터DB] 오류: {str(e)[:80]}")
|
| 842 |
+
elif not HAS_CHROMADB:
|
| 843 |
+
log_lines.append("[벡터DB] chromadb 미설치 — 건너뜀")
|
| 844 |
+
else:
|
| 845 |
+
log_lines.append("[벡터DB] 크롤링 데이터 없음 — 건너뜀")
|
| 846 |
+
|
| 847 |
+
# ═══════════════════════════════════════
|
| 848 |
+
# 종합 판정 (n-gram + 벡터DB 복합)
|
| 849 |
+
# ═══════════════════════════════════════
|
| 850 |
+
_prog(0.90, "보고서 생성...")
|
| 851 |
|
| 852 |
all_sources = verified_sources # 검증된 출처만 표시
|
| 853 |
|
| 854 |
+
# 표절율 산출: 다층 검증
|
| 855 |
+
# ① n-gram 기반 (기존)
|
| 856 |
max_sim = max((s.get("similarity", 0) for s in verified_sources), default=0)
|
|
|
|
| 857 |
count_factor = min(len(verified_sources) * 3, 30)
|
|
|
|
| 858 |
avg_factor = avg_similarity * 100
|
| 859 |
+
ngram_score = min(round(max_sim * 0.4 + avg_factor * 0.3 + count_factor * 0.3), 100)
|
| 860 |
+
|
| 861 |
+
# ② 벡터DB 기반 (문장별 매칭)
|
| 862 |
+
vector_score = round(vector_plag_ratio) if vector_flagged else 0
|
| 863 |
|
| 864 |
+
# 복합 점수: 벡터DB가 있으면 50:50, 없으면 n-gram만
|
| 865 |
+
if vector_flagged:
|
| 866 |
+
combined_score = round(ngram_score * 0.4 + vector_score * 0.6)
|
| 867 |
+
else:
|
| 868 |
+
combined_score = ngram_score
|
| 869 |
|
| 870 |
+
# Gemini 표절율은 참고용 (20% 가중, 검증 출처가 있을 때만)
|
|
|
|
| 871 |
if gemini_pct > 0 and len(verified_sources) > 0:
|
| 872 |
+
plag_pct = min(round(gemini_pct * 0.2 + combined_score * 0.8), 100)
|
| 873 |
elif len(verified_sources) > 0:
|
| 874 |
+
plag_pct = combined_score
|
| 875 |
else:
|
|
|
|
| 876 |
plag_pct = min(round(gemini_pct * 0.1), 20)
|
| 877 |
|
| 878 |
+
log_lines.append(
|
| 879 |
+
f"[판정] Gemini={gemini_pct}%, n-gram={ngram_score}%, "
|
| 880 |
+
f"벡터={vector_score}%, → 종합={plag_pct}%"
|
| 881 |
+
)
|
| 882 |
|
| 883 |
if plag_pct >= 50:
|
| 884 |
grade, gc = "🚨 표절 의심", "#FF4444"
|
|
|
|
| 946 |
</details>
|
| 947 |
</div>"""
|
| 948 |
|
| 949 |
+
# 벡터DB 문장별 매칭 결과
|
| 950 |
+
vector_section = ""
|
| 951 |
+
if vector_flagged:
|
| 952 |
+
vf_rows = ""
|
| 953 |
+
for k, vf in enumerate(vector_flagged[:15]):
|
| 954 |
+
sim_val = vf["similarity"]
|
| 955 |
+
sim_color = "#FF4444" if sim_val >= 70 else "#FF8800" if sim_val >= 50 else "#DDAA00"
|
| 956 |
+
sent_safe = vf["sentence"][:70].replace('<', '<')
|
| 957 |
+
matched_safe = vf["matched_text"][:90].replace('<', '<')
|
| 958 |
+
title_safe = vf["title"][:40].replace('<', '<')
|
| 959 |
+
vf_rows += f"""<tr style="border-bottom:1px solid #F0F0F0;">
|
| 960 |
+
<td style="padding:6px;text-align:center;font-size:10px;color:#666;">{k+1}</td>
|
| 961 |
+
<td style="padding:6px;font-size:10px;color:#333;">{sent_safe}</td>
|
| 962 |
+
<td style="padding:6px;text-align:center;"><span style="font-weight:700;color:{sim_color};">{sim_val:.0f}%</span></td>
|
| 963 |
+
<td style="padding:6px;font-size:9px;color:#666;">{matched_safe}</td>
|
| 964 |
+
<td style="padding:6px;font-size:9px;"><a href="{vf['url']}" target="_blank" rel="noopener noreferrer" style="color:#2E86C1;text-decoration:none;">{title_safe}</a></td>
|
| 965 |
+
</tr>"""
|
| 966 |
+
|
| 967 |
+
vdb_stats = vdb.get_stats() if vdb else {"chunks": 0}
|
| 968 |
+
vector_section = f"""
|
| 969 |
+
<div style="padding:16px 24px;border-bottom:1px solid #E0E0E0;">
|
| 970 |
+
<details open>
|
| 971 |
+
<summary style="cursor:pointer;font-size:13px;font-weight:700;color:#1A3C6E;">
|
| 972 |
+
🧬 벡터DB 문장별 정밀 대조 — 표절 문장 {len(vector_flagged)}건 / 문장표절율 {vector_plag_ratio:.1f}%
|
| 973 |
+
<span style="font-size:10px;font-weight:400;color:#888;margin-left:8px;">(DB: {vdb_stats['chunks']}청크)</span>
|
| 974 |
+
</summary>
|
| 975 |
+
<table style="width:100%;border-collapse:collapse;font-size:11px;margin-top:10px;">
|
| 976 |
+
<thead>
|
| 977 |
+
<tr style="background:#E8EAF6;color:#333;">
|
| 978 |
+
<th style="padding:8px;width:30px;">#</th>
|
| 979 |
+
<th style="padding:8px;text-align:left;">입력 문장</th>
|
| 980 |
+
<th style="padding:8px;width:55px;">유사도</th>
|
| 981 |
+
<th style="padding:8px;text-align:left;">매칭 출처 내용</th>
|
| 982 |
+
<th style="padding:8px;width:120px;">출처</th>
|
| 983 |
+
</tr>
|
| 984 |
+
</thead>
|
| 985 |
+
<tbody>{vf_rows}</tbody>
|
| 986 |
+
</table>
|
| 987 |
+
</details>
|
| 988 |
+
</div>"""
|
| 989 |
+
|
| 990 |
HDR = '#3B7DD8'
|
| 991 |
html = f"""<div style="font-family:'Noto Sans KR',sans-serif;max-width:900px;margin:20px auto;background:#fff;border:1px solid #E0E0E0;border-radius:8px;box-shadow:0 2px 8px rgba(0,0,0,0.06);">
|
| 992 |
<div style="background:linear-gradient(135deg,{HDR},#4A8DE0);padding:24px;color:#fff;border-radius:8px 8px 0 0;">
|
| 993 |
<div style="display:flex;justify-content:space-between;align-items:center;">
|
| 994 |
<div>
|
| 995 |
<div style="font-size:24px;font-weight:900;">표절 검사 결과</div>
|
| 996 |
+
<div style="font-size:12px;opacity:0.9;margin-top:4px;">Gemini + Brave + KCI·RISS·arXiv + 벡터DB 정밀대조</div>
|
| 997 |
</div>
|
| 998 |
<div style="text-align:right;font-size:11px;opacity:0.9;">
|
| 999 |
<div>문서: {doc_id}</div>
|
|
|
|
| 1030 |
<span><span style="color:#888;">단어수</span> <b>{word_count:,}단어</b></span>
|
| 1031 |
<span><span style="color:#888;">검색엔진</span> <b>Google + Brave + KCI·RISS·arXiv</b></span>
|
| 1032 |
</div>
|
| 1033 |
+
</div>{gemini_summary}{vector_section}
|
| 1034 |
+
<div style="padding:24px;border-bottom:1px solid #E0E0E0;">
|
| 1035 |
<div style="font-size:13px;font-weight:700;color:#1A3C6E;margin-bottom:12px;">🔍 검증된 출처 ({len(all_sources)}건)</div>
|
| 1036 |
<table style="width:100%;border-collapse:collapse;font-size:11px;">
|
| 1037 |
<thead>
|