openfree committed on
Commit
fc4c5f1
·
verified ·
1 Parent(s): 79cfe45

Update plagiarism_check.py

Browse files
Files changed (1) hide show
  1. plagiarism_check.py +300 -27
plagiarism_check.py CHANGED
@@ -16,6 +16,18 @@ try:
16
  except ImportError:
17
  HAS_HTTPX = False
18
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  try:
20
  from google import genai
21
  from google.genai import types as gtypes
@@ -308,38 +320,51 @@ def _fetch_page_text(url, timeout=8):
308
 
309
 
310
  def _verify_sources_parallel(original_text, sources, min_similarity=0.02, max_workers=8):
311
- """출처 URL을 크롤링하여 원문과 대조 검증 — 유사도 미달 출처 제거"""
312
  if not sources:
313
- return [], 0.0
314
 
315
  verified = []
316
  total_sim = 0.0
 
317
 
318
  def _check_one(src):
319
  url = src.get("url", "")
320
  snippet = src.get("snippet", "")
 
 
 
 
321
 
322
  if snippet and len(snippet) > 20:
323
  sim = _text_similarity(original_text[:500], snippet, n=2)
324
  if sim >= min_similarity:
325
- return {**src, "similarity": round(sim * 100, 1), "method": "snippet"}, sim
326
 
327
- page_text = _fetch_page_text(url, timeout=6)
328
  if page_text and len(page_text) > 50:
329
  jaccard = _text_similarity(original_text, page_text, n=3)
330
  containment = _containment_similarity(original_text, page_text, n=3)
331
  sim = max(jaccard, containment)
332
  if sim >= min_similarity:
333
  matched = _find_matching_sentences(original_text, page_text)
334
- return {**src, "similarity": round(sim * 100, 1), "method": "crawl", "matched": matched}, sim
335
 
336
- return None, 0.0
337
 
338
  with ThreadPoolExecutor(max_workers=min(max_workers, len(sources))) as executor:
339
  futures = {executor.submit(_check_one, src): src for src in sources[:20]}
340
  for future in as_completed(futures, timeout=30):
341
  try:
342
- result, sim = future.result()
 
 
 
 
 
 
 
 
 
343
  if result is not None:
344
  verified.append(result)
345
  total_sim += sim
@@ -348,7 +373,7 @@ def _verify_sources_parallel(original_text, sources, min_similarity=0.02, max_wo
348
 
349
  verified.sort(key=lambda x: x.get("similarity", 0), reverse=True)
350
  avg_sim = total_sim / len(verified) if verified else 0.0
351
- return verified, avg_sim
352
 
353
 
354
  def _find_matching_sentences(original, source_text, threshold=0.3):
@@ -366,6 +391,164 @@ def _find_matching_sentences(original, source_text, threshold=0.3):
366
  return matched
367
 
368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  # ============================================
370
  # 핵심 구문 추출
371
  # ============================================
@@ -608,7 +791,7 @@ def run_plagiarism(text, progress=gr.Progress()):
608
  log_lines.append(f"[수집] 총 {len(raw_sources)}건 (검증 전)")
609
 
610
  # 실제 크롤링하여 원문과 대조 → 유사도 미달 출처 제거
611
- verified_sources, avg_similarity = _verify_sources_parallel(
612
  text, raw_sources, min_similarity=0.02, max_workers=8
613
  )
614
 
@@ -616,37 +799,86 @@ def run_plagiarism(text, progress=gr.Progress()):
616
  verified_urls = {s["url"] for s in verified_sources}
617
  unverified_sources = [s for s in raw_sources if s["url"] not in verified_urls]
618
 
619
- log_lines.append(f"[검증] 통과={len(verified_sources)}건, 미검증={len(unverified_sources)}건, 평균유사도={avg_similarity:.3f}")
620
 
621
  # ═══════════════════════════════════════
622
- # 종합 판정 (실제 유사도 기반)
623
  # ═══════════════════════════════════════
624
- _prog(0.85, "보고서 생성...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625
 
626
  all_sources = verified_sources # 검증된 출처만 표시
627
 
628
- # 표절율 산출: 실제 유사도 기반 (Gemini 환각 방지)
629
- # ① 검증된 출처들의 최고 유사도
630
  max_sim = max((s.get("similarity", 0) for s in verified_sources), default=0)
631
- # ② 검증된 출처 수 기반 보정
632
  count_factor = min(len(verified_sources) * 3, 30)
633
- # ③ 평균 유사도 반영
634
  avg_factor = avg_similarity * 100
 
 
 
 
635
 
636
- # 실제 유사도 기반 점수 (0~100)
637
- similarity_score = min(round(max_sim * 0.4 + avg_factor * 0.3 + count_factor * 0.3), 100)
 
 
 
638
 
639
- # Gemini 표절율은 참고용으로만 (30% 가중)
640
- # 단, Gemini가 0%인데 유사 출처가 있으면 유사도 기반만 사용
641
  if gemini_pct > 0 and len(verified_sources) > 0:
642
- plag_pct = min(round(gemini_pct * 0.3 + similarity_score * 0.7), 100)
643
  elif len(verified_sources) > 0:
644
- plag_pct = similarity_score
645
  else:
646
- # 검증된 출처 없음 → Gemini 값도 크게 할인
647
  plag_pct = min(round(gemini_pct * 0.1), 20)
648
 
649
- log_lines.append(f"[판정] Gemini={gemini_pct}%, 최고유사도={max_sim:.1f}%, 검증출처={len(verified_sources)}건 → 종합={plag_pct}%")
 
 
 
650
 
651
  if plag_pct >= 50:
652
  grade, gc = "🚨 표절 의심", "#FF4444"
@@ -714,13 +946,54 @@ def run_plagiarism(text, progress=gr.Progress()):
714
  </details>
715
  </div>"""
716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
717
  HDR = '#3B7DD8'
718
  html = f"""<div style="font-family:'Noto Sans KR',sans-serif;max-width:900px;margin:20px auto;background:#fff;border:1px solid #E0E0E0;border-radius:8px;box-shadow:0 2px 8px rgba(0,0,0,0.06);">
719
  <div style="background:linear-gradient(135deg,{HDR},#4A8DE0);padding:24px;color:#fff;border-radius:8px 8px 0 0;">
720
  <div style="display:flex;justify-content:space-between;align-items:center;">
721
  <div>
722
  <div style="font-size:24px;font-weight:900;">표절 검사 결과</div>
723
- <div style="font-size:12px;opacity:0.9;margin-top:4px;">Gemini Google Search + Brave + KCI·RISS·arXiv</div>
724
  </div>
725
  <div style="text-align:right;font-size:11px;opacity:0.9;">
726
  <div>문서: {doc_id}</div>
@@ -757,8 +1030,8 @@ def run_plagiarism(text, progress=gr.Progress()):
757
  <span><span style="color:#888;">단어수</span> <b>{word_count:,}단어</b></span>
758
  <span><span style="color:#888;">검색엔진</span> <b>Google + Brave + KCI·RISS·arXiv</b></span>
759
  </div>
760
- </div>{gemini_summary}
761
- <div style="padding:24px;">
762
  <div style="font-size:13px;font-weight:700;color:#1A3C6E;margin-bottom:12px;">🔍 검증된 출처 ({len(all_sources)}건)</div>
763
  <table style="width:100%;border-collapse:collapse;font-size:11px;">
764
  <thead>
 
16
  except ImportError:
17
  HAS_HTTPX = False
18
 
19
# Optional dependencies: probe availability at import time so later code can
# branch on HAS_CHROMADB / HAS_NUMPY instead of crashing when a package is
# not installed.
try:
    import chromadb
except ImportError:
    HAS_CHROMADB = False
else:
    HAS_CHROMADB = True

try:
    import numpy as np
except ImportError:
    HAS_NUMPY = False
else:
    HAS_NUMPY = True
31
  try:
32
  from google import genai
33
  from google.genai import types as gtypes
 
320
 
321
 
322
  def _verify_sources_parallel(original_text, sources, min_similarity=0.02, max_workers=8):
323
+ """출처 URL을 크롤링하여 원문과 대조 검증 — 유사도 미달 출처 제거, 크롤링 본문도 반환"""
324
  if not sources:
325
+ return [], 0.0, {}
326
 
327
  verified = []
328
  total_sim = 0.0
329
+ crawled_texts = {} # url → page_text (벡터DB 인덱싱용)
330
 
331
  def _check_one(src):
332
  url = src.get("url", "")
333
  snippet = src.get("snippet", "")
334
+ page_text = ""
335
+
336
+ # snippet만으로 검증되는 경우에도 본문 크롤링 시도
337
+ page_text = _fetch_page_text(url, timeout=6)
338
 
339
  if snippet and len(snippet) > 20:
340
  sim = _text_similarity(original_text[:500], snippet, n=2)
341
  if sim >= min_similarity:
342
+ return {**src, "similarity": round(sim * 100, 1), "method": "snippet"}, sim, page_text
343
 
 
344
  if page_text and len(page_text) > 50:
345
  jaccard = _text_similarity(original_text, page_text, n=3)
346
  containment = _containment_similarity(original_text, page_text, n=3)
347
  sim = max(jaccard, containment)
348
  if sim >= min_similarity:
349
  matched = _find_matching_sentences(original_text, page_text)
350
+ return {**src, "similarity": round(sim * 100, 1), "method": "crawl", "matched": matched}, sim, page_text
351
 
352
+ return None, 0.0, page_text
353
 
354
  with ThreadPoolExecutor(max_workers=min(max_workers, len(sources))) as executor:
355
  futures = {executor.submit(_check_one, src): src for src in sources[:20]}
356
  for future in as_completed(futures, timeout=30):
357
  try:
358
+ result, sim, page_text = future.result()
359
+ src_info = futures[future]
360
+ url = src_info.get("url", "")
361
+ # 크롤링 본문 저장 (검증 여부 무관하게)
362
+ if page_text and len(page_text) > 50 and url:
363
+ crawled_texts[url] = {
364
+ "text": page_text,
365
+ "title": src_info.get("title", ""),
366
+ "source": src_info.get("source", ""),
367
+ }
368
  if result is not None:
369
  verified.append(result)
370
  total_sim += sim
 
373
 
374
  verified.sort(key=lambda x: x.get("similarity", 0), reverse=True)
375
  avg_sim = total_sim / len(verified) if verified else 0.0
376
+ return verified, avg_sim, crawled_texts
377
 
378
 
379
  def _find_matching_sentences(original, source_text, threshold=0.3):
 
391
  return matched
392
 
393
 
394
+ # ============================================
395
+ # 벡터 DB 기반 정밀 표절 검사
396
+ # ============================================
397
+
398
def _chunk_text(text, chunk_size=200, overlap=50):
    """Split *text* into chunks of roughly ``chunk_size`` characters,
    preserving sentence boundaries.

    Sentences (as produced by ``_split_sentences``) are accumulated until
    their combined length reaches ``chunk_size``; the accumulated sentences
    are then emitted as one space-joined chunk.  When ``overlap`` > 0 the
    last sentence of each emitted chunk is carried over as the seed of the
    next chunk so adjacent chunks share context.

    Fix over the original: a trailing remainder consisting solely of the
    carried-over overlap sentence is no longer flushed as its own chunk —
    it already appears verbatim at the end of the previous chunk, so
    emitting it duplicated content in the index.

    Returns a (possibly empty) list of chunk strings.
    """
    sentences = _split_sentences(text)
    chunks = []
    current = []        # sentences accumulated for the chunk being built
    current_len = 0     # total character count of `current`
    carry_only = False  # True while `current` holds nothing but the carry-over

    for sent in sentences:
        current.append(sent)
        current_len += len(sent)
        carry_only = False
        if current_len >= chunk_size:
            chunks.append(' '.join(current))
            # overlap: keep the last sentence for the next chunk
            if overlap > 0 and len(current) > 1:
                current = current[-1:]
                current_len = len(current[0])
                carry_only = True
            else:
                current = []
                current_len = 0

    # Flush the remainder — unless it is purely the duplicated overlap sentence.
    if current and not carry_only:
        chunks.append(' '.join(current))
    return chunks
419
+
420
+
421
class VectorPlagiarismDB:
    """Index crawled source pages in an in-memory ChromaDB collection and
    run per-sentence similarity search against them.

    Every method degrades gracefully: when chromadb is unavailable or a
    call fails, ``self.available`` is False and the helpers return empty
    results instead of raising, so the caller's pipeline keeps running.
    """

    def __init__(self):
        # HAS_CHROMADB is the module-level feature flag set at import time;
        # without chromadb this instance is an inert no-op shell.
        if not HAS_CHROMADB:
            self.available = False
            return
        try:
            self.client = chromadb.Client()  # in-memory (ephemeral) client
            self.collection = self.client.get_or_create_collection(
                name="plag_sources",
                metadata={"hnsw:space": "cosine"},  # cosine distance for text
            )
            self.available = True
            self._doc_count = 0  # number of chunks indexed so far
        except Exception as e:
            print(f"ChromaDB 초기화 실패: {e}")
            self.available = False

    def index_source(self, url, title, text, source_type="Web"):
        """Chunk a source's page text and add the chunks to the collection.

        Returns the number of chunks actually indexed; 0 when the DB is
        unavailable, the text is too short to be useful, or indexing fails.
        """
        if not self.available or not text or len(text.strip()) < 30:
            return 0
        try:
            chunks = _chunk_text(text, chunk_size=200, overlap=50)
            if not chunks:
                return 0

            # Stable per-URL prefix keeps chunk ids unique across sources.
            doc_hash = hashlib.md5(url.encode()).hexdigest()[:10]
            ids = []
            documents = []
            metadatas = []

            for i, chunk in enumerate(chunks[:50]):  # cap: 50 chunks per source
                ids.append(f"{doc_hash}_{i}")
                documents.append(chunk)
                metadatas.append({
                    # Guard against None url/title before slicing.
                    "url": (url or "")[:200],
                    "title": (title or "")[:100],
                    "source": source_type,
                    "chunk_idx": i,
                })

            self.collection.add(ids=ids, documents=documents, metadatas=metadatas)
            self._doc_count += len(ids)
            return len(ids)
        except Exception as e:
            print(f"벡터DB 인덱싱 오류: {e}")
            return 0

    def query_sentence(self, sentence, n_results=3):
        """Return up to *n_results* indexed chunks most similar to *sentence*.

        Each match is a dict with ``similarity`` (percent), ``url``,
        ``title``, ``source`` and a 150-char ``matched_text`` excerpt.
        Matches below 30% similarity are dropped; returns [] on any error.
        """
        if not self.available or self._doc_count == 0:
            return []
        try:
            results = self.collection.query(
                query_texts=[sentence],
                n_results=min(n_results, self._doc_count),
            )
            matches = []
            if results and results['distances'] and results['distances'][0]:
                for j, dist in enumerate(results['distances'][0]):
                    # ChromaDB cosine distance: 0 = identical, 2 = opposite;
                    # map to similarity = 1 - distance/2, clamped to >= 0.
                    similarity = max(0, 1 - dist / 2)
                    if similarity < 0.3:  # ignore weak matches (< 30%)
                        continue
                    meta = results['metadatas'][0][j] if results['metadatas'] else {}
                    doc_text = results['documents'][0][j] if results['documents'] else ""
                    matches.append({
                        "similarity": round(similarity * 100, 1),
                        "url": meta.get("url", ""),
                        "title": meta.get("title", ""),
                        "source": meta.get("source", ""),
                        "matched_text": doc_text[:150],
                    })
            return matches
        except Exception as e:
            print(f"벡터DB 쿼리 오류: {e}")
            return []

    def check_document(self, text, min_similarity=30):
        """Check every sentence of *text* against the indexed sources.

        Returns ``(flagged, plag_ratio, source_hits)``:
        *flagged* — suspect sentences with their best match details;
        *plag_ratio* — percentage of checked sentences that were flagged;
        *source_hits* — URL → number of flagged sentences matching it.

        (The original also computed an average similarity over the flagged
        sentences but never used or returned it; that dead code is removed.)
        """
        if not self.available or self._doc_count == 0:
            return [], 0.0, {}

        sents = _split_sentences(text)
        flagged = []       # sentences suspected of plagiarism
        all_sims = []      # best similarity per checked sentence (0 if no match)
        source_hits = {}   # URL → hit count

        for sent in sents:
            if len(sent) < 15:  # too short to be meaningful evidence
                continue
            matches = self.query_sentence(sent, n_results=3)
            if not matches:
                all_sims.append(0)
                continue

            best = matches[0]
            all_sims.append(best["similarity"])

            if best["similarity"] >= min_similarity:
                flagged.append({
                    "sentence": sent[:80],
                    "similarity": best["similarity"],
                    "url": best["url"],
                    "title": best["title"],
                    "matched_text": best["matched_text"][:100],
                })
                url = best["url"]
                source_hits[url] = source_hits.get(url, 0) + 1

        # Plagiarism ratio: share of checked sentences that were flagged.
        total_checked = len(all_sims)
        plag_ratio = (len(flagged) / total_checked * 100) if total_checked > 0 else 0

        return flagged, plag_ratio, source_hits

    def get_stats(self):
        """Report DB availability and the number of chunks indexed."""
        if not self.available:
            return {"available": False, "chunks": 0}
        return {"available": True, "chunks": self._doc_count}
550
+
551
+
552
  # ============================================
553
  # 핵심 구문 추출
554
  # ============================================
 
791
  log_lines.append(f"[수집] 총 {len(raw_sources)}건 (검증 전)")
792
 
793
  # 실제 크롤링하여 원문과 대조 → 유사도 미달 출처 제거
794
+ verified_sources, avg_similarity, crawled_texts = _verify_sources_parallel(
795
  text, raw_sources, min_similarity=0.02, max_workers=8
796
  )
797
 
 
799
  verified_urls = {s["url"] for s in verified_sources}
800
  unverified_sources = [s for s in raw_sources if s["url"] not in verified_urls]
801
 
802
+ log_lines.append(f"[검증] 통과={len(verified_sources)}건, 미검증={len(unverified_sources)}건, 크롤링={len(crawled_texts)}건, 평균유사도={avg_similarity:.3f}")
803
 
804
  # ═══════════════════════════════════════
805
+ # PHASE 5: 벡터DB 문장별 정밀 대조
806
  # ═══════════════════════════════════════
807
+ vector_flagged = []
808
+ vector_plag_ratio = 0.0
809
+ vector_source_hits = {}
810
+ vdb = None
811
+
812
+ if HAS_CHROMADB and crawled_texts:
813
+ _prog(0.80, "⑤ 벡터DB 문장별 정밀 대조...")
814
+ try:
815
+ vdb = VectorPlagiarismDB()
816
+ if vdb.available:
817
+ # 크롤링된 모든 출처 본문을 벡터DB에 인덱싱
818
+ indexed_count = 0
819
+ for url, info in crawled_texts.items():
820
+ n = vdb.index_source(
821
+ url=url,
822
+ title=info.get("title", ""),
823
+ text=info["text"],
824
+ source_type=info.get("source", "Web"),
825
+ )
826
+ indexed_count += n
827
+
828
+ log_lines.append(f"[벡터DB] {len(crawled_texts)}개 출처 → {indexed_count}개 청크 인덱싱")
829
+
830
+ if indexed_count > 0:
831
+ # 입력 텍스트를 문장별로 벡터DB 검색
832
+ vector_flagged, vector_plag_ratio, vector_source_hits = vdb.check_document(
833
+ text, min_similarity=35
834
+ )
835
+ log_lines.append(
836
+ f"[벡터DB] 표절 문장={len(vector_flagged)}건, "
837
+ f"문장표절율={vector_plag_ratio:.1f}%, "
838
+ f"히트출처={len(vector_source_hits)}건"
839
+ )
840
+ except Exception as e:
841
+ log_lines.append(f"[벡터DB] 오류: {str(e)[:80]}")
842
+ elif not HAS_CHROMADB:
843
+ log_lines.append("[벡터DB] chromadb 미설치 — 건너뜀")
844
+ else:
845
+ log_lines.append("[벡터DB] 크롤링 데이터 없음 — 건너뜀")
846
+
847
+ # ═══════════════════════════════════════
848
+ # 종합 판정 (n-gram + 벡터DB 복합)
849
+ # ═══════════════════════════════════════
850
+ _prog(0.90, "보고서 생성...")
851
 
852
  all_sources = verified_sources # 검증된 출처만 표시
853
 
854
+ # 표절율 산출: 다층 검증
855
+ # ① n-gram 기반 (기존)
856
  max_sim = max((s.get("similarity", 0) for s in verified_sources), default=0)
 
857
  count_factor = min(len(verified_sources) * 3, 30)
 
858
  avg_factor = avg_similarity * 100
859
+ ngram_score = min(round(max_sim * 0.4 + avg_factor * 0.3 + count_factor * 0.3), 100)
860
+
861
+ # ② 벡터DB 기반 (문장별 매칭)
862
+ vector_score = round(vector_plag_ratio) if vector_flagged else 0
863
 
864
+ # 복합 점수: 벡터DB가 있으면 50:50, 없으면 n-gram만
865
+ if vector_flagged:
866
+ combined_score = round(ngram_score * 0.4 + vector_score * 0.6)
867
+ else:
868
+ combined_score = ngram_score
869
 
870
+ # Gemini 표절율은 참고용 (20% 가중, 검증 출처가 있을 때만)
 
871
  if gemini_pct > 0 and len(verified_sources) > 0:
872
+ plag_pct = min(round(gemini_pct * 0.2 + combined_score * 0.8), 100)
873
  elif len(verified_sources) > 0:
874
+ plag_pct = combined_score
875
  else:
 
876
  plag_pct = min(round(gemini_pct * 0.1), 20)
877
 
878
+ log_lines.append(
879
+ f"[판정] Gemini={gemini_pct}%, n-gram={ngram_score}%, "
880
+ f"벡터={vector_score}%, → 종합={plag_pct}%"
881
+ )
882
 
883
  if plag_pct >= 50:
884
  grade, gc = "🚨 표절 의심", "#FF4444"
 
946
  </details>
947
  </div>"""
948
 
949
+ # 벡터DB 문장별 매칭 결과
950
+ vector_section = ""
951
+ if vector_flagged:
952
+ vf_rows = ""
953
+ for k, vf in enumerate(vector_flagged[:15]):
954
+ sim_val = vf["similarity"]
955
+ sim_color = "#FF4444" if sim_val >= 70 else "#FF8800" if sim_val >= 50 else "#DDAA00"
956
+ sent_safe = vf["sentence"][:70].replace('<', '&lt;')
957
+ matched_safe = vf["matched_text"][:90].replace('<', '&lt;')
958
+ title_safe = vf["title"][:40].replace('<', '&lt;')
959
+ vf_rows += f"""<tr style="border-bottom:1px solid #F0F0F0;">
960
+ <td style="padding:6px;text-align:center;font-size:10px;color:#666;">{k+1}</td>
961
+ <td style="padding:6px;font-size:10px;color:#333;">{sent_safe}</td>
962
+ <td style="padding:6px;text-align:center;"><span style="font-weight:700;color:{sim_color};">{sim_val:.0f}%</span></td>
963
+ <td style="padding:6px;font-size:9px;color:#666;">{matched_safe}</td>
964
+ <td style="padding:6px;font-size:9px;"><a href="{vf['url']}" target="_blank" rel="noopener noreferrer" style="color:#2E86C1;text-decoration:none;">{title_safe}</a></td>
965
+ </tr>"""
966
+
967
+ vdb_stats = vdb.get_stats() if vdb else {"chunks": 0}
968
+ vector_section = f"""
969
+ <div style="padding:16px 24px;border-bottom:1px solid #E0E0E0;">
970
+ <details open>
971
+ <summary style="cursor:pointer;font-size:13px;font-weight:700;color:#1A3C6E;">
972
+ 🧬 벡터DB 문장별 정밀 대조 — 표절 문장 {len(vector_flagged)}건 / 문장표절율 {vector_plag_ratio:.1f}%
973
+ <span style="font-size:10px;font-weight:400;color:#888;margin-left:8px;">(DB: {vdb_stats['chunks']}청크)</span>
974
+ </summary>
975
+ <table style="width:100%;border-collapse:collapse;font-size:11px;margin-top:10px;">
976
+ <thead>
977
+ <tr style="background:#E8EAF6;color:#333;">
978
+ <th style="padding:8px;width:30px;">#</th>
979
+ <th style="padding:8px;text-align:left;">입력 문장</th>
980
+ <th style="padding:8px;width:55px;">유사도</th>
981
+ <th style="padding:8px;text-align:left;">매칭 출처 내용</th>
982
+ <th style="padding:8px;width:120px;">출처</th>
983
+ </tr>
984
+ </thead>
985
+ <tbody>{vf_rows}</tbody>
986
+ </table>
987
+ </details>
988
+ </div>"""
989
+
990
  HDR = '#3B7DD8'
991
  html = f"""<div style="font-family:'Noto Sans KR',sans-serif;max-width:900px;margin:20px auto;background:#fff;border:1px solid #E0E0E0;border-radius:8px;box-shadow:0 2px 8px rgba(0,0,0,0.06);">
992
  <div style="background:linear-gradient(135deg,{HDR},#4A8DE0);padding:24px;color:#fff;border-radius:8px 8px 0 0;">
993
  <div style="display:flex;justify-content:space-between;align-items:center;">
994
  <div>
995
  <div style="font-size:24px;font-weight:900;">표절 검사 결과</div>
996
+ <div style="font-size:12px;opacity:0.9;margin-top:4px;">Gemini + Brave + KCI·RISS·arXiv + 벡터DB 정밀대조</div>
997
  </div>
998
  <div style="text-align:right;font-size:11px;opacity:0.9;">
999
  <div>문서: {doc_id}</div>
 
1030
  <span><span style="color:#888;">단어수</span> <b>{word_count:,}단어</b></span>
1031
  <span><span style="color:#888;">검색엔진</span> <b>Google + Brave + KCI·RISS·arXiv</b></span>
1032
  </div>
1033
+ </div>{gemini_summary}{vector_section}
1034
+ <div style="padding:24px;border-bottom:1px solid #E0E0E0;">
1035
  <div style="font-size:13px;font-weight:700;color:#1A3C6E;margin-bottom:12px;">🔍 검증된 출처 ({len(all_sources)}건)</div>
1036
  <table style="width:100%;border-collapse:collapse;font-size:11px;">
1037
  <thead>