# (Hugging Face Spaces page header — "Spaces: Running" — removed; scrape residue, not application code)
# =========================================================
# Advanced Academic Rule-Based + Hybrid Paraphraser
# =========================================================
import gradio as gr
import os, re, random, warnings

# Disable analytics for privacy
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
# Avoid tokenizer fork warnings/deadlocks when transformers is used later.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")
# ---------------------------------------------------------
# Transformer lazy load
# ---------------------------------------------------------
_transformer = None          # cached pipeline instance (None until first use)
TRANSFORMER_AVAILABLE = True


def load_transformer():
    """Load the T5 paraphrasing pipeline once and cache it.

    The transformers import is deferred so the app starts fast on
    Hugging Face Spaces and works even when the model is never requested.

    Returns:
        The cached ``transformers`` text2text-generation pipeline.
    """
    global _transformer
    if _transformer is None:
        from transformers import pipeline
        _transformer = pipeline(
            "text2text-generation",  # correct task for a T5 seq2seq model
            model="humarin/chatgpt_paraphraser_on_T5_base",
            device=-1,  # CPU; free Spaces have no GPU
        )
    return _transformer
# ---------------------------------------------------------
# File Reader (with lazy import)
# ---------------------------------------------------------
def read_input_file(file):
    """Return the text content of an uploaded .txt or .docx file.

    Accepts either a Gradio upload object (anything with a ``.name`` path
    attribute) or a plain path string — newer Gradio versions pass a path
    rather than an open file handle, so we always (re)open by path instead
    of calling ``.read()`` on the object. Unknown extensions yield "".

    Args:
        file: Upload object / path string, or a falsy value.

    Returns:
        Extracted text, or "" when there is nothing to read.
    """
    if not file:
        return ""
    path = getattr(file, "name", None) or str(file)
    lower = path.lower()  # case-insensitive extension match (".TXT" etc.)
    if lower.endswith(".txt"):
        # errors="replace": a bad byte should not crash the whole app
        with open(path, "r", encoding="utf-8", errors="replace") as fh:
            return fh.read()
    if lower.endswith(".docx"):
        # Lazy import for docx
        from docx import Document
        doc = Document(path)
        return "\n".join(p.text for p in doc.paragraphs)
    return ""
# =========================================================
# Core Paraphraser
# =========================================================
class AcademicParaphraser:
    """Rule-based academic paraphrasing engine.

    Holds regex phrase-pair tables (general academic, per-discipline,
    per-journal style) and a set of sentence-level transformation methods.
    All NLTK imports are deferred to call time so the class can be
    constructed without NLTK installed.
    """

    def __init__(self):
        # NLTK is NOT imported here - it will be imported lazily in methods
        self.contractions = {
            "do not": "don't", "cannot": "can't", "will not": "won't",
            "is not": "isn't", "are not": "aren't", "did not": "didn't",
            "would not": "wouldn't", "could not": "couldn't", "should not": "shouldn't",
            "has not": "hasn't", "have not": "haven't", "had not": "hadn't"
        }
        self.expanded_contractions = {v: k for k, v in self.contractions.items()}
        # NOTE: many pairs exist in both directions (e.g. conducted<->performed);
        # substitutions are applied sequentially with a probability, so an early
        # replacement may itself be replaced by a later pair — this is intended
        # variation, not a bug.
        self.academic_phrase_pairs = [
            # -- Original pairs ------------------------------------------------
            (r'\bcarried out\b', r'conducted'),
            (r'\bperformed\b', r'conducted'),
            (r'\butilized\b', r'used'),
            (r'\bemployed\b', r'used'),
            (r'\bconducted\b', r'performed'),
            (r'\busing\b', r'with'),
            (r'\bwith\b', r'using'),
            (r'\bto explore\b', r'to investigate'),
            (r'\bto investigate\b', r'to explore'),
            (r'\bto examine\b', r'to analyze'),
            (r'\bto analyze\b', r'to examine'),
            (r'\bto obtain\b', r'to gain'),
            (r'\bto gain\b', r'to obtain'),
            (r'\bmore detailed\b', r'deeper'),
            (r'\bdeeper\b', r'more detailed'),
            (r'\bunderstanding\b', r'insight'),
            (r'\binsight\b', r'understanding'),
            (r'\bframework\b', r'structure'),
            (r'\bstructure\b', r'framework'),
            (r'\boptimized\b', r'minimized'),
            (r'\bminimized\b', r'optimized'),
            (r'\bgeometry\b', r'shape'),
            (r'\bshape\b', r'geometry'),
            (r'\binteractions\b', r'connections'),
            (r'\bconnections\b', r'interactions'),
            # -- Purpose / aim / objective ------------------------------------
            (r'\bthis study aims to\b', r'the present work seeks to'),
            (r'\bthe aim of this study\b', r'the purpose of this research'),
            (r'\bthe objective of\b', r'the goal of'),
            (r'\bwe aimed to\b', r'this study was designed to'),
            (r'\bintended to investigate\b', r'undertaken to examine'),
            # -- Methods / procedures -----------------------------------------
            (r'\bwas used\b', r'was employed'),
            (r'\bwere used\b', r'were employed'),
            (r'\bby using\b', r'by means of'),
            (r'\bthe samples were collected\b', r'specimens were obtained'),
            (r'\bthe data were collected\b', r'data were acquired'),
            (r'\bmeasurements were taken\b', r'measurements were performed'),
            # -- Results / findings -------------------------------------------
            (r'\bthe results show\b', r'the findings indicate'),
            (r'\bthe results showed\b', r'the findings revealed'),
            (r'\bshowed that\b', r'indicated that'),
            (r'\bfound that\b', r'revealed that'),
            (r'\bit was observed that\b', r'observations indicated that'),
            (r'\bthe data indicate\b', r'the results suggest'),
            (r'\bsignificant difference\b', r'marked difference'),
            (r'\bno significant difference\b', r'no appreciable difference'),
            # -- Discussion / interpretation ----------------------------------
            (r'\bthis suggests that\b', r'these findings imply that'),
            (r'\bthis indicates that\b', r'this points to'),
            (r'\bit can be seen that\b', r'it is evident that'),
            (r'\bconsistent with\b', r'in agreement with'),
            (r'\bin line with\b', r'aligned with'),
            (r'\bcontrary to\b', r'in contrast to'),
            # -- Transitions / connectors -------------------------------------
            (r'\bhowever\b', r'nevertheless'),
            (r'\btherefore\b', r'consequently'),
            (r'\bin addition\b', r'furthermore'),
            (r'\bmoreover\b', r'what is more'),
            (r'\bon the other hand\b', r'conversely'),
            (r'\bfor example\b', r'for instance'),
            (r'\bfirstly\b', r'first'),
            (r'\bsecondly\b', r'second'),
            # -- Hedging / cautious language ----------------------------------
            (r'\bshows that\b', r'suggests that'),
            (r'\bindicates that\b', r'appears to indicate that'),
            (r'\bproves that\b', r'provides evidence that'),
            (r'\bit is clear that\b', r'it appears that'),
            (r'\bclearly\b', r'evidently'),
            (r'\bdefinitely\b', r'presumably'),
            # -- Quantity / degree / intensity --------------------------------
            (r'\bvery\b', r'highly'),
            (r'\bvery important\b', r'crucially important'),
            (r'\blarge\b', r'substantial'),
            (r'\bsmall\b', r'modest'),
            (r'\bincrease\b', r'rise'),
            (r'\bdecrease\b', r'decline'),
            (r'\bmore detailed\b', r'more in-depth'),
            # -- General academic nouns & expressions -------------------------
            (r'\bthis paper\b', r'the present study'),
            (r'\bthis work\b', r'the current research'),
            (r'\bapproach\b', r'methodology'),
            (r'\banalysis\b', r'examination'),
            (r'\bresults\b', r'findings'),
            (r'\bdata\b', r'observations'),
            # -- Additional high-frequency pairs ------------------------------
            (r'\bthe present study\b', r'this investigation'),
            (r'\bthe current study\b', r'the present work'),
            (r'\bcan be used\b', r'may be employed'),
            (r'\bto confirm\b', r'to verify'),
            (r'\bto compare\b', r'to contrast'),
            (r'\bimportant\b', r'noteworthy'),
            (r'\bnotably\b', r'particularly'),
            (r'\bcrucially\b', r'importantly'),
            (r'\bthus\b', r'hence'),
            (r'\baccordingly\b', r'in accordance with this'),
        ]
        self.discipline_terms = {
            "Chemistry": [
                (r'\bsynthesis\b', 'preparation'),
                (r'\bsynthesised\b', 'prepared'),
                (r'\bsynthesize\b', 'prepare'),
                (r'\breaction\b', 'chemical transformation'),
                (r'\byield\b', 'isolated yield'),
                (r'\bcatalyst\b', 'catalytic system'),
                (r'\bsolvent\b', 'reaction medium'),
                (r'\bspectroscopy\b', 'spectroscopic analysis'),
                (r'\bNMR\b', 'nuclear magnetic resonance spectroscopy'),
                (r'\bIR\b', 'infrared spectroscopy'),
                (r'\bcompound\b', 'chemical entity'),
                (r'\bmolecule\b', 'molecular species'),
                (r'\bpurity\b', 'chemical purity'),
                (r'\bpurified\b', 'isolated and purified'),
                (r'\bcharacterised\b', 'fully characterised'),
            ],
            "Physics": [
                (r'\bforce\b', 'interaction'),
                (r'\bforces\b', 'interactions'),
                (r'\bparticle\b', 'microscopic entity'),
                (r'\belectron\b', 'charged particle'),
                (r'\bvelocity\b', 'motion vector'),
                (r'\benergy\b', 'energetic state'),
                (r'\bpotential\b', 'potential energy function'),
                (r'\bwave\b', 'propagating disturbance'),
                (r'\bfield\b', 'physical field'),
                (r'\bquantum\b', 'quantised'),
                (r'\bmeasured\b', 'experimentally determined'),
                (r'\bsimulated\b', 'numerically computed'),
                (r'\bspectrum\b', 'spectral distribution'),
                (r'\btemperature\b', 'thermal energy scale'),
                (r'\bpressure\b', 'applied stress'),
            ],
            "Biology": [
                (r'\bcell\b', 'biological cell'),
                (r'\bcells\b', 'cellular entities'),
                (r'\bgene\b', 'genetic locus'),
                (r'\bprotein\b', 'polypeptide chain'),
                (r'\benzyme\b', 'biocatalyst'),
                (r'\bexpression\b', 'gene expression'),
                (r'\bpathway\b', 'metabolic pathway'),
                (r'\borganism\b', 'living system'),
                (r'\btissue\b', 'biological tissue'),
                (r'\bobserved\b', 'microscopically observed'),
                (r'\btreatment\b', 'experimental treatment'),
                (r'\bcontrol\b', 'untreated control group'),
                (r'\bconcentration\b', 'molar concentration'),
                (r'\bincubated\b', 'cultured'),
                (r'\bviability\b', 'cell viability'),
            ],
            "Computer Science": [
                (r'\balgorithm\b', 'computational procedure'),
                (r'\balgorithms\b', 'computational methods'),
                (r'\bmodel\b', 'computational model'),
                (r'\bperformance\b', 'computational efficiency'),
                (r'\baccuracy\b', 'predictive accuracy'),
                (r'\bdataset\b', 'data corpus'),
                (r'\btraining\b', 'model training phase'),
                (r'\bnetwork\b', 'neural architecture'),
                (r'\bcomplexity\b', 'computational complexity'),
                (r'\bruntime\b', 'execution time'),
                (r'\bmemory\b', 'space complexity'),
                (r'\bimplementation\b', 'software implementation'),
                (r'\bevaluation\b', 'empirical evaluation'),
                (r'\bframework\b', 'software framework'),
                (r'\bsimulation\b', 'computational simulation'),
            ],
            "Accounts": [
                (r'\bprofit\b', 'net income'),
                (r'\bprofits\b', 'net earnings'),
                (r'\bloss\b', 'net loss'),
                (r'\bexpense\b', 'expenditure'),
                (r'\bexpenses\b', 'operating costs'),
                (r'\brevenue\b', 'total revenue'),
                (r'\bassets\b', 'economic resources'),
                (r'\bliabilities\b', 'financial obligations'),
                (r'\bequity\b', 'owners\' equity'),
                (r'\bcash flow\b', 'net cash inflow'),
                (r'\bbalance sheet\b', 'statement of financial position'),
                (r'\bincome statement\b', 'profit and loss account'),
                (r'\bdepreciation\b', 'amortisation charge'),
                (r'\btax\b', 'income tax expense'),
                (r'\baudit\b', 'independent audit'),
            ],
            "Economics": [
                (r'\bmarket\b', 'economic market'),
                (r'\bsupply\b', 'quantity supplied'),
                (r'\bdemand\b', 'quantity demanded'),
                (r'\bequilibrium\b', 'market equilibrium'),
                (r'\bgrowth\b', 'economic expansion'),
                (r'\binflation\b', 'price level increase'),
                (r'\bunemployment\b', 'labour underutilisation'),
                (r'\bpolicy\b', 'macroeconomic policy'),
                (r'\bconsumption\b', 'household consumption'),
                (r'\binvestment\b', 'capital formation'),
                (r'\btrade\b', 'international trade'),
                (r'\bexchange rate\b', 'currency exchange rate'),
                (r'\bGDP\b', 'gross domestic product'),
                (r'\bprice\b', 'market price'),
                (r'\bsubsidy\b', 'government transfer payment'),  # fixed typo: was \bsubidy\b
            ],
            "History": [
                (r'\brevolution\b', 'political revolution'),
                (r'\bwar\b', 'armed conflict'),
                (r'\bempire\b', 'imperial system'),
                (r'\bking\b', 'monarch'),
                (r'\bqueen\b', 'female sovereign'),
                (r'\bcolony\b', 'colonial territory'),
                (r'\bindependence\b', 'national sovereignty'),
                (r'\btreaty\b', 'international agreement'),
                (r'\bmovement\b', 'social-political movement'),
                (r'\bperiod\b', 'historical era'),
                (r'\bevent\b', 'historical occurrence'),
                (r'\bleader\b', 'political figure'),
                (r'\bideology\b', 'political doctrine'),
                (r'\bsociety\b', 'social structure'),
                (r'\bculture\b', 'cultural system'),
            ],
            "Geography": [
                (r'\bclimate\b', 'climatic conditions'),
                (r'\btemperature\b', 'mean annual temperature'),
                (r'\brainfall\b', 'precipitation'),
                (r'\bregion\b', 'geographical area'),
                (r'\blandscape\b', 'physical landscape'),
                (r'\btopography\b', 'terrain characteristics'),
                (r'\belevation\b', 'altitude above sea level'),
                (r'\bvegetation\b', 'natural vegetation cover'),
                (r'\bsoil\b', 'pedological characteristics'),
                (r'\briver\b', 'watercourse'),
                (r'\bmountain\b', 'mountainous relief'),
                (r'\bcoast\b', 'coastal zone'),
                (r'\bpopulation\b', 'human population distribution'),
                (r'\bsettlement\b', 'human settlement pattern'),
                (r'\bresource\b', 'natural resource endowment'),
            ],
            "Civics": [
                (r'\bdemocracy\b', 'democratic governance'),
                (r'\bgovernment\b', 'governing authority'),
                (r'\bconstitution\b', 'fundamental law'),
                (r'\bcitizen\b', 'member of the polity'),
                (r'\bright\b', 'fundamental right'),
                (r'\bduty\b', 'civic obligation'),
                (r'\belection\b', 'democratic election'),
                (r'\bvoting\b', 'electoral participation'),
                (r'\bparliament\b', 'legislative body'),
                (r'\bjudiciary\b', 'judicial branch'),
                (r'\bexecutive\b', 'executive authority'),
                (r'\bfreedom\b', 'civil liberty'),
                (r'\bequality\b', 'principle of equality'),
                (r'\bjustice\b', 'social justice'),
                (r'\bpolicy\b', 'public policy'),
            ]
        }
        self.journal_styles = {
            "ACS": [  # American Chemical Society — strict passive, concise, avoids "we"
                (r'\bwe\b', ''),
                (r'\bour\b', ''),
                (r'\bused\b', 'was employed'),
                (r'\bwere used\b', 'were employed'),
                (r'\bshowed\b', 'demonstrated'),
                (r'\bfound\b', 'observed'),
                (r'\bthe results show\b', 'the results demonstrate'),
                (r'\bthis work\b', 'the present study'),
                (r'\bhere\b', 'in this study'),
                (r'\bcan be seen\b', 'can be observed'),
            ],
            "RSC": [  # Royal Society of Chemistry — slightly more narrative, still formal
                (r'\bshows\b', 'reveals'),
                (r'\bshowed\b', 'revealed'),
                (r'\bdemonstrates\b', 'illustrates'),
                (r'\bwe report\b', 'reported herein'),
                (r'\bwe present\b', 'presented in this work'),
                (r'\bthe results indicate\b', 'these findings suggest'),
                (r'\bimportant\b', 'noteworthy'),
                (r'\bvery\b', 'highly'),
                (r'\busing\b', 'employing'),
                (r'\bprepared\b', 'synthesised'),
            ],
            "Elsevier": [  # Many Elsevier journals — prefers "findings", limits first person
                (r'\bresults\b', 'findings'),
                (r'\bthe results\b', 'these findings'),
                (r'\bwe found\b', 'it was found'),
                (r'\bwe observed\b', 'it was observed'),
                (r'\bthis study\b', 'the present investigation'),
                (r'\bimportant\b', 'significant'),
                (r'\bshows\b', 'indicates'),
                (r'\bsuggests\b', 'points to'),
                (r'\bhowever\b', 'nevertheless'),
                (r'\btherefore\b', 'consequently'),
            ],
            "Exam-safe": [  # Very conservative — minimal change for school/college submissions
                (r'\bvery important\b', 'crucial'),
                (r'\bvery good\b', 'excellent'),
                (r'\bvery bad\b', 'poor'),
                (r'\bgot\b', 'obtained'),
                (r'\bshows\b', 'indicates'),
                (r'\bwe think\b', 'it is considered'),
                # Keep changes extremely light
            ],
            "Springer": [
                (r'\bwe\b', 'the authors'),
                (r'\bour results\b', 'the obtained results'),
                (r'\bshow\b', 'indicate'),
                (r'\bfound\b', 'revealed'),
                (r'\bthis paper\b', 'the present contribution'),
            ],
            "IEEE": [
                (r'\bwe\b', ''),
                (r'\bthis paper\b', 'this work'),
                (r'\bpresented\b', 'proposed'),
                (r'\bproposed\b', 'introduced'),
                (r'\bperformance\b', 'efficacy'),
                (r'\bresults\b', 'experimental outcomes'),
            ],
            "Nature": [
                (r'\bwe show\b', 'here we demonstrate'),
                (r'\bwe report\b', 'we describe'),
                (r'\bimportant\b', 'striking'),
                (r'\bnovel\b', 'previously unreported'),
                (r'\bsuggests\b', 'indicates'),
            ]
        }

    # -----------------------------------------------------
    def similarity(self, text1: str, text2: str) -> float:
        """Jaccard similarity over lower-cased word sets (0.0..1.0)."""
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())
        union = words1.union(words2)
        return len(words1.intersection(words2)) / len(union) if union else 0.0

    # -----------------------------------------------------
    def synonym_replace(self, text: str, aggressiveness: float) -> str:
        """Method 1: Replace words with synonyms (WordNet based)."""
        try:
            # LAZY IMPORT - CRITICAL FOR HF
            from nltk.tokenize import word_tokenize
            from nltk import pos_tag
            tokens = word_tokenize(text)
            pos_tags = pos_tag(tokens)
            new_tokens = []
            change_prob = 0.05 + 0.18 * aggressiveness
            # Auxiliaries must never be swapped — doing so destroys grammar.
            protected = {'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did'}
            for token, pos in pos_tags:
                lower = token.lower()
                if lower in protected or len(token) <= 3:
                    new_tokens.append(token)
                    continue
                synonyms = self._get_synonyms(lower, pos)
                if synonyms and random.random() < change_prob:
                    new_word = random.choice(synonyms)
                    if token[0].isupper():
                        new_word = new_word.capitalize()
                    new_tokens.append(new_word)
                else:
                    new_tokens.append(token)
            result = ' '.join(new_tokens)
            # Re-attach punctuation that word_tokenize split off.
            return re.sub(r'\s+([.,;:!?])', r'\1', result).strip()
        except Exception:
            # NLTK or its data may be missing on a fresh Space; degrade
            # gracefully instead of crashing the request (was a bare except).
            return text

    # -----------------------------------------------------
    def _get_synonyms(self, word: str, pos_tag: str) -> list:
        """Return up to 3 single-word WordNet synonyms matching the POS tag."""
        # LAZY IMPORT - CRITICAL FOR HF
        from nltk.corpus import wordnet
        pos_map = {'NN': wordnet.NOUN, 'VB': wordnet.VERB, 'JJ': wordnet.ADJ, 'RB': wordnet.ADV}
        wn_pos = pos_map.get(pos_tag[:2])
        if not wn_pos:
            return []
        synonyms = set()
        for syn in wordnet.synsets(word, pos=wn_pos):
            for lemma in syn.lemmas():
                name = lemma.name().replace('_', ' ')
                if name.lower() != word.lower() and ' ' not in name:
                    synonyms.add(name)
        return list(synonyms)[:3]

    def active_passive_conversion(self, text: str) -> str:
        """Method 2: Simple active/passive conversion (placeholder heuristics)."""
        # LAZY IMPORT for sent_tokenize
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        new_sentences = []
        for sent in sentences:
            # Very simple: if starts with subject + verb, try to make passive
            if random.random() < 0.4:  # apply randomly
                # Placeholder: real passive conversion needs more logic
                sent = sent.replace("The researchers conducted", "The experiment was conducted by the researchers")
            new_sentences.append(sent)
        return ' '.join(new_sentences)

    def direct_indirect_style(self, text: str) -> str:
        """Method 3: Convert direct statements to more indirect/academic phrasing."""
        replacements = [
            (r'\bshows that\b', r'indicates that'),
            (r'\bsuggests that\b', r'suggests the possibility that'),
            (r'\bproves that\b', r'provides evidence that'),
            (r'\bwe can see that\b', r'it can be observed that'),
            (r'\bit is clear that\b', r'it appears that'),
        ]
        result = text
        for pattern, repl in replacements:
            result = re.sub(pattern, repl, result, flags=re.I)
        return result

    def clause_reordering(self, text: str) -> str:
        """Method 4: Reorder clauses / adverbial phrases."""
        # LAZY IMPORT for sent_tokenize
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        new_sentences = []
        for sent in sentences:
            if random.random() < 0.6:
                # Simple reorder: move time/place adverbial to front sometimes
                match = re.search(r'\b(in|at|on|during|after|before)\s+[\w\s]+?\b', sent, re.I)
                if match:
                    phrase = match.group(0)
                    rest = sent.replace(phrase, '', 1).strip()
                    # Guard against IndexError: if the adverbial was the whole
                    # sentence, `rest` is empty and rest[0] would crash.
                    if rest and random.random() < 0.5:
                        new_sent = f"{phrase.capitalize()}, {rest[0].lower()}{rest[1:]}"
                    else:
                        new_sent = sent
                    new_sentences.append(new_sent)
                else:
                    new_sentences.append(sent)
            else:
                new_sentences.append(sent)
        return ' '.join(new_sentences)

    def sentence_splitting(self, text: str) -> str:
        """Method 5: Split long sentences at the first ' and '."""
        # LAZY IMPORT for sent_tokenize
        from nltk.tokenize import sent_tokenize
        if len(text.split()) > 35:
            sentences = sent_tokenize(text)
            result = []
            for sent in sentences:
                if len(sent.split()) > 25 and ' and ' in sent:
                    parts = sent.split(' and ', 1)
                    result.append(parts[0].strip() + '.')
                    result.append('And ' + parts[1].strip())
                else:
                    result.append(sent)
            return ' '.join(result)
        return text

    def contraction_toggle(self, text: str) -> str:
        """Method 6: Expand or contract contractions randomly (40% each hit)."""
        words = text.split()
        i = 0
        while i < len(words) - 1:
            raw_pair = f"{words[i]} {words[i + 1]}"
            pair = raw_pair.lower().rstrip('.,;:!?')
            # Preserve trailing punctuation of the second word: "do not." must
            # become "don't.", not "don't" (the original dropped it).
            punct = raw_pair[len(raw_pair.rstrip('.,;:!?')):]
            if pair in self.contractions:
                if random.random() < 0.4:
                    words[i] = self.contractions[pair] + punct
                    words.pop(i + 1)
                else:
                    i += 1
            elif pair in self.expanded_contractions:
                if random.random() < 0.4:
                    words[i] = self.expanded_contractions[pair] + punct
                    words.pop(i + 1)
                else:
                    i += 1
            else:
                i += 1
        return ' '.join(words)

    def academic_phrase_swap(self, text: str, aggressiveness: float) -> str:
        """Method 7: Swap common academic phrases."""
        prob = 0.2 + 0.3 * aggressiveness
        result = text
        for pattern, repl in self.academic_phrase_pairs:
            if random.random() < prob:
                result = re.sub(pattern, repl, result, flags=re.I)
        return result

    def post_process(self, text: str) -> str:
        """Collapse whitespace and guarantee terminal punctuation."""
        text = re.sub(r'\s+', ' ', text).strip()
        if text and text[-1] not in '.!?':
            text += '.'
        return text

    def generate_variant(self, text: str, aggressiveness: float) -> tuple[str, list[str]]:
        """Apply a random combination of transformations.

        Returns:
            (paraphrased text, list of transformation names applied)
        """
        current = text
        applied = []
        # Randomly select 2-4 methods (3-6 when more aggressive).
        num_methods = random.randint(2, 4) if aggressiveness < 0.6 else random.randint(3, 6)
        all_methods = [
            ("Synonym replacement", self.synonym_replace),
            ("Academic phrase swap", self.academic_phrase_swap),
            ("Clause reordering", self.clause_reordering),
            ("Contraction toggle", self.contraction_toggle),
            ("Sentence splitting", self.sentence_splitting),
            ("Direct-indirect style", self.direct_indirect_style),
            ("Active-passive (basic)", self.active_passive_conversion),
        ]
        selected = random.sample(all_methods, min(num_methods, len(all_methods)))
        for name, func in selected:
            # Only these two transformations take an aggressiveness argument.
            if func == self.synonym_replace or func == self.academic_phrase_swap:
                current = func(current, aggressiveness)
            else:
                current = func(current)
            applied.append(name)
        current = self.post_process(current)
        return current, applied

    def transformer_sentence_refine(self, text, max_new_tokens=80):
        """Refine each sentence (>= 6 words) through the T5 paraphraser."""
        model = load_transformer()
        # LAZY IMPORT for sent_tokenize
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        refined = []
        for s in sentences:
            if len(s.split()) < 6:
                refined.append(s)  # too short to paraphrase usefully
                continue
            out = model(
                s,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.95
            )[0]["generated_text"]
            refined.append(out)
        return " ".join(refined)

    def apply_pairs(self, text: str, pairs: list, aggressiveness: float) -> str:
        """Apply (pattern, replacement) pairs, each with probability scaled by aggressiveness."""
        prob = 0.15 + 0.35 * aggressiveness  # adjust range as you like
        result = text
        for pattern, repl in pairs:
            if random.random() < prob:
                result = re.sub(pattern, repl, result, flags=re.I)
        return result

    def rewrite(self, text, aggr, discipline, journal,
                use_synonym=None, use_academic=None, use_discipline=None,
                use_active_passive=None, use_direct_indirect=None,
                use_clause=None, use_split=None):
        """Rewrite ``text`` sentence by sentence using the enabled transforms.

        The seven ``use_*`` keyword flags are optional so existing callers
        (which pass only the four positional arguments) keep working. When a
        flag is None, a module-level boolean of the matching upper-case name
        (e.g. USE_SYNONYM) is consulted; non-boolean globals (such as Gradio
        Checkbox components, which are always truthy) are ignored in favour
        of the documented default.
        """
        def _flag(value, global_name, default):
            if value is not None:
                return bool(value)
            g = globals().get(global_name, default)
            return g if isinstance(g, bool) else default

        use_synonym = _flag(use_synonym, "USE_SYNONYM", True)
        use_academic = _flag(use_academic, "USE_ACADEMIC", True)
        use_discipline = _flag(use_discipline, "USE_DISCIPLINE", True)
        use_active_passive = _flag(use_active_passive, "USE_ACTIVE_PASSIVE", True)
        use_direct_indirect = _flag(use_direct_indirect, "USE_DIRECT_INDIRECT", True)
        use_clause = _flag(use_clause, "USE_CLAUSE", False)  # UI default is off
        use_split = _flag(use_split, "USE_SPLIT", True)
        # LAZY IMPORT for sent_tokenize
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        new_sents = []
        for s in sentences:
            if use_synonym:
                s = self.synonym_replace(s, aggr)
            if use_academic:
                s = self.apply_pairs(s, self.academic_phrase_pairs, aggr)
            if use_discipline and discipline != "General":
                s = self.apply_pairs(
                    s, self.discipline_terms.get(discipline, []), aggr)
            if use_active_passive:
                s = self.active_passive_conversion(s)
            if use_direct_indirect:
                s = self.direct_indirect_style(s)
            if use_clause:
                s = self.clause_reordering(s)
            if use_split:
                s = self.sentence_splitting(s)
            # Journal style always applied (probability=1.0 = always);
            # unknown styles (e.g. "Standard") map to an empty pair list.
            s = self.apply_pairs(
                s, self.journal_styles.get(journal, []), 1.0)
            new_sents.append(s)
        return " ".join(new_sents)
# =========================================================
# DOCX Export (with lazy import)
# =========================================================
def export_docx(original, variants):
    """Build a .docx report of the original text plus each variant.

    Args:
        original: The source text.
        variants: List of dicts with keys "text" and "sim" (float).

    Returns:
        Path (str) of the saved .docx file. A file path — not an in-memory
        BytesIO — is required because the gr.File output component serves
        downloads from disk.
    """
    from docx import Document
    import tempfile
    doc = Document()
    doc.add_heading("Academic Paraphrasing Output", 1)
    doc.add_paragraph("Original Text:")
    doc.add_paragraph(original)
    for i, v in enumerate(variants, 1):
        doc.add_heading(f"Variant {i}", 2)
        doc.add_paragraph(v["text"])
        doc.add_paragraph(f"Similarity: {v['sim']:.3f}")
    # delete=False: the file must outlive this call so Gradio can serve it.
    tmp = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
    tmp.close()
    doc.save(tmp.name)
    return tmp.name
# =========================================================
# Gradio callback (TOP-LEVEL FUNCTION)
# =========================================================
def generate_paraphrases(
    input_text,
    uploaded_file,
    num_variants,
    aggressiveness,
    discipline,
    journal_mode,
    plagiarism_safe,
    use_transformer,
    USE_SYNONYM,
    USE_ACADEMIC,
    USE_DISCIPLINE,
    USE_ACTIVE_PASSIVE,
    USE_DIRECT_INDIRECT,
    USE_CLAUSE,
    USE_SPLIT,
):
    """Gradio click handler: produce paraphrase variants plus a DOCX file.

    Returns:
        (markdown_display, docx_file) for the two output components.
    """
    # Inject file text if uploaded
    if uploaded_file:
        input_text = read_input_file(uploaded_file)
    if not input_text or not input_text.strip():
        return "Please paste some text or upload a file.", None
    # AcademicParaphraser.rewrite resolves the USE_* switches at module
    # scope, where these names are otherwise bound to (always-truthy)
    # Checkbox components; publish the actual boolean values there so the
    # user's checkbox choices take effect.
    globals().update({
        "USE_SYNONYM": bool(USE_SYNONYM),
        "USE_ACADEMIC": bool(USE_ACADEMIC),
        "USE_DISCIPLINE": bool(USE_DISCIPLINE),
        "USE_ACTIVE_PASSIVE": bool(USE_ACTIVE_PASSIVE),
        "USE_DIRECT_INDIRECT": bool(USE_DIRECT_INDIRECT),
        "USE_CLAUSE": bool(USE_CLAUSE),
        "USE_SPLIT": bool(USE_SPLIT),
    })
    engine = AcademicParaphraser()
    num_variants = int(num_variants)  # Slider values may arrive as floats
    candidates = []
    # Oversample 3x so the plagiarism filter still leaves enough variants.
    for _ in range(num_variants * 3):
        t = engine.rewrite(
            input_text,
            aggressiveness,
            discipline,
            journal_mode
        )
        if use_transformer and TRANSFORMER_AVAILABLE:
            t = engine.transformer_sentence_refine(t)
        candidates.append({"text": t, "sim": engine.similarity(input_text, t)})
    outputs = [c for c in candidates
               if not (plagiarism_safe and c["sim"] > 0.72)]
    if not outputs:
        # All candidates exceeded the similarity threshold — return the
        # least-similar ones rather than an empty result.
        outputs = candidates
    outputs = sorted(outputs, key=lambda x: x["sim"])[:num_variants]
    display = ""
    for i, o in enumerate(outputs, 1):
        display += f"\n\n### Variant {i} (Similarity {o['sim']:.3f})\n{o['text']}"
    doc = export_docx(input_text, outputs)
    return display.strip(), doc
# =========================================================
# === GRADIO UI ===========================================
# =========================================================
with gr.Blocks(title="Advanced Academic Paraphraser") as demo:
    gr.Markdown(
        "# Advanced Academic Paraphraser\n"
        "Rule-based | AI | Hybrid academic paraphrasing engine"
    )
    input_text = gr.Textbox(label="Paste Text", lines=10)
    uploaded_file = gr.File(label="Upload (.txt / .docx)")
    variants = gr.Slider(1, 3, value=2, step=1, label="Variants")
    strength = gr.Slider(0.1, 1.0, value=0.6, step=0.05, label="Rewrite Strength")
    discipline = gr.Dropdown(
        ["General", "Chemistry", "Physics", "Biology", "Computer Science",
         "Accounts", "Economics", "History", "Geography", "Civics"],
        value="General",
        label="Discipline"
    )
    # Offer every style defined in AcademicParaphraser.journal_styles
    # (Springer/IEEE/Nature were defined but missing from the dropdown);
    # "Standard" intentionally maps to no style pairs.
    journal = gr.Dropdown(
        ["Standard", "ACS", "RSC", "Elsevier", "Exam-safe",
         "Springer", "IEEE", "Nature"],
        value="Standard",
        label="Tone"
    )
    plagiarism_safe = gr.Checkbox(True, label="Plagiarism-risk minimization")
    hybrid = gr.Checkbox(False, label="Hybrid rule + transformer")
    USE_SYNONYM = gr.Checkbox(True, label="Synonym replacement")
    USE_ACADEMIC = gr.Checkbox(True, label="Academic phrase expansion")
    USE_DISCIPLINE = gr.Checkbox(True, label="Discipline terms")
    USE_ACTIVE_PASSIVE = gr.Checkbox(True, label="Active / Passive")
    USE_DIRECT_INDIRECT = gr.Checkbox(True, label="Direct / Indirect")
    USE_CLAUSE = gr.Checkbox(False, label="Clause rewriting")
    USE_SPLIT = gr.Checkbox(True, label="Sentence splitting")
    run_btn = gr.Button("Generate Paraphrase")
    output_md = gr.Markdown()
    output_file = gr.File(label="Download DOCX")
    # =====================================================
    # EVENT HANDLERS MUST BE INSIDE THE BLOCKS CONTEXT
    # =====================================================
    run_btn.click(
        fn=generate_paraphrases,
        inputs=[
            input_text,
            uploaded_file,
            variants,
            strength,
            discipline,
            journal,
            plagiarism_safe,
            hybrid,
            USE_SYNONYM,
            USE_ACADEMIC,
            USE_DISCIPLINE,
            USE_ACTIVE_PASSIVE,
            USE_DIRECT_INDIRECT,
            USE_CLAUSE,
            USE_SPLIT,
        ],
        outputs=[output_md, output_file]
    )
# =========================================================
# Entry point (local execution and Hugging Face Spaces)
# =========================================================
# Both branches previously duplicated the same launch arguments and the
# header comment wrongly claimed there was "NO __main__ block"; build the
# kwargs once — only `debug` differs when the module is imported (as the
# HF Spaces runtime does).
_launch_kwargs = {
    "server_name": "0.0.0.0",
    "server_port": 7860,
    "share": False,  # public share links are unnecessary on HF Spaces
}
if __name__ != "__main__":
    _launch_kwargs["debug"] = False
demo.launch(**_launch_kwargs)