# (Hugging Face Spaces page header — "Spaces: Running" — removed; scrape residue, not application code)
# =========================================================
# Advanced Academic Rule-Based + Hybrid Paraphraser
# =========================================================
import gradio as gr
import os, re, random, warnings

# Disable analytics for privacy
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
# Avoid tokenizer fork warnings/deadlocks when transformers is used later.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")
# ---------------------------------------------------------
# Transformer lazy load
# ---------------------------------------------------------
_transformer = None          # cached pipeline instance (None until first use)
TRANSFORMER_AVAILABLE = True


def load_transformer():
    """Load the T5 paraphrasing pipeline once and cache it.

    The transformers import is deferred so the app starts fast on
    Hugging Face Spaces and works even when the model is never requested.

    Returns:
        The cached ``transformers`` text2text-generation pipeline.
    """
    global _transformer
    if _transformer is None:
        from transformers import pipeline
        _transformer = pipeline(
            "text2text-generation",  # correct task for a T5 seq2seq model
            model="humarin/chatgpt_paraphraser_on_T5_base",
            device=-1,  # CPU; free Spaces have no GPU
        )
    return _transformer
# ---------------------------------------------------------
# File Reader (with lazy import)
# ---------------------------------------------------------
def read_input_file(file):
    """Return the text content of an uploaded .txt or .docx file.

    Accepts either a Gradio upload object (anything with a ``.name`` path
    attribute) or a plain path string — newer Gradio versions pass a path
    rather than an open file handle, so we always (re)open by path instead
    of calling ``.read()`` on the object. Unknown extensions yield "".

    Args:
        file: Upload object / path string, or a falsy value.

    Returns:
        Extracted text, or "" when there is nothing to read.
    """
    if not file:
        return ""
    path = getattr(file, "name", None) or str(file)
    lower = path.lower()  # case-insensitive extension match (".TXT" etc.)
    if lower.endswith(".txt"):
        # errors="replace": a bad byte should not crash the whole app
        with open(path, "r", encoding="utf-8", errors="replace") as fh:
            return fh.read()
    if lower.endswith(".docx"):
        # Lazy import for docx
        from docx import Document
        doc = Document(path)
        return "\n".join(p.text for p in doc.paragraphs)
    return ""
# =========================================================
# Core Paraphraser
# =========================================================
class AcademicParaphraser:
    """Rule-based academic paraphrasing engine.

    Holds regex phrase-pair tables (general academic, per-discipline,
    per-journal style) and a set of sentence-level transformation methods.
    All NLTK imports are deferred to call time so the class can be
    constructed without NLTK installed.
    """

    def __init__(self):
        # NLTK is NOT imported here - it will be imported lazily in methods
        self.contractions = {
            "do not": "don't", "cannot": "can't", "will not": "won't",
            "is not": "isn't", "are not": "aren't", "did not": "didn't",
            "would not": "wouldn't", "could not": "couldn't", "should not": "shouldn't",
            "has not": "hasn't", "have not": "haven't", "had not": "hadn't"
        }
        self.expanded_contractions = {v: k for k, v in self.contractions.items()}
        # NOTE: many pairs exist in both directions (e.g. conducted<->performed);
        # substitutions are applied sequentially with a probability, so an early
        # replacement may itself be replaced by a later pair — this is intended
        # variation, not a bug.
        self.academic_phrase_pairs = [
            # -- Original pairs ------------------------------------------------
            (r'\bcarried out\b', r'conducted'),
            (r'\bperformed\b', r'conducted'),
            (r'\butilized\b', r'used'),
            (r'\bemployed\b', r'used'),
            (r'\bconducted\b', r'performed'),
            (r'\busing\b', r'with'),
            (r'\bwith\b', r'using'),
            (r'\bto explore\b', r'to investigate'),
            (r'\bto investigate\b', r'to explore'),
            (r'\bto examine\b', r'to analyze'),
            (r'\bto analyze\b', r'to examine'),
            (r'\bto obtain\b', r'to gain'),
            (r'\bto gain\b', r'to obtain'),
            (r'\bmore detailed\b', r'deeper'),
            (r'\bdeeper\b', r'more detailed'),
            (r'\bunderstanding\b', r'insight'),
            (r'\binsight\b', r'understanding'),
            (r'\bframework\b', r'structure'),
            (r'\bstructure\b', r'framework'),
            (r'\boptimized\b', r'minimized'),
            (r'\bminimized\b', r'optimized'),
            (r'\bgeometry\b', r'shape'),
            (r'\bshape\b', r'geometry'),
            (r'\binteractions\b', r'connections'),
            (r'\bconnections\b', r'interactions'),
            # -- Purpose / aim / objective ------------------------------------
            (r'\bthis study aims to\b', r'the present work seeks to'),
            (r'\bthe aim of this study\b', r'the purpose of this research'),
            (r'\bthe objective of\b', r'the goal of'),
            (r'\bwe aimed to\b', r'this study was designed to'),
            (r'\bintended to investigate\b', r'undertaken to examine'),
            # -- Methods / procedures -----------------------------------------
            (r'\bwas used\b', r'was employed'),
            (r'\bwere used\b', r'were employed'),
            (r'\bby using\b', r'by means of'),
            (r'\bthe samples were collected\b', r'specimens were obtained'),
            (r'\bthe data were collected\b', r'data were acquired'),
            (r'\bmeasurements were taken\b', r'measurements were performed'),
            # -- Results / findings -------------------------------------------
            (r'\bthe results show\b', r'the findings indicate'),
            (r'\bthe results showed\b', r'the findings revealed'),
            (r'\bshowed that\b', r'indicated that'),
            (r'\bfound that\b', r'revealed that'),
            (r'\bit was observed that\b', r'observations indicated that'),
            (r'\bthe data indicate\b', r'the results suggest'),
            (r'\bsignificant difference\b', r'marked difference'),
            (r'\bno significant difference\b', r'no appreciable difference'),
            # -- Discussion / interpretation ----------------------------------
            (r'\bthis suggests that\b', r'these findings imply that'),
            (r'\bthis indicates that\b', r'this points to'),
            (r'\bit can be seen that\b', r'it is evident that'),
            (r'\bconsistent with\b', r'in agreement with'),
            (r'\bin line with\b', r'aligned with'),
            (r'\bcontrary to\b', r'in contrast to'),
            # -- Transitions / connectors -------------------------------------
            (r'\bhowever\b', r'nevertheless'),
            (r'\btherefore\b', r'consequently'),
            (r'\bin addition\b', r'furthermore'),
            (r'\bmoreover\b', r'what is more'),
            (r'\bon the other hand\b', r'conversely'),
            (r'\bfor example\b', r'for instance'),
            (r'\bfirstly\b', r'first'),
            (r'\bsecondly\b', r'second'),
            # -- Hedging / cautious language ----------------------------------
            (r'\bshows that\b', r'suggests that'),
            (r'\bindicates that\b', r'appears to indicate that'),
            (r'\bproves that\b', r'provides evidence that'),
            (r'\bit is clear that\b', r'it appears that'),
            (r'\bclearly\b', r'evidently'),
            (r'\bdefinitely\b', r'presumably'),
            # -- Quantity / degree / intensity --------------------------------
            (r'\bvery\b', r'highly'),
            (r'\bvery important\b', r'crucially important'),
            (r'\blarge\b', r'substantial'),
            (r'\bsmall\b', r'modest'),
            (r'\bincrease\b', r'rise'),
            (r'\bdecrease\b', r'decline'),
            (r'\bmore detailed\b', r'more in-depth'),
            # -- General academic nouns & expressions -------------------------
            (r'\bthis paper\b', r'the present study'),
            (r'\bthis work\b', r'the current research'),
            (r'\bapproach\b', r'methodology'),
            (r'\banalysis\b', r'examination'),
            (r'\bresults\b', r'findings'),
            (r'\bdata\b', r'observations'),
            # -- Additional high-frequency pairs ------------------------------
            (r'\bthe present study\b', r'this investigation'),
            (r'\bthe current study\b', r'the present work'),
            (r'\bcan be used\b', r'may be employed'),
            (r'\bto confirm\b', r'to verify'),
            (r'\bto compare\b', r'to contrast'),
            (r'\bimportant\b', r'noteworthy'),
            (r'\bnotably\b', r'particularly'),
            (r'\bcrucially\b', r'importantly'),
            (r'\bthus\b', r'hence'),
            (r'\baccordingly\b', r'in accordance with this'),
        ]
        self.discipline_terms = {
            "Chemistry": [
                (r'\bsynthesis\b', 'preparation'),
                (r'\bsynthesised\b', 'prepared'),
                (r'\bsynthesize\b', 'prepare'),
                (r'\breaction\b', 'chemical transformation'),
                (r'\byield\b', 'isolated yield'),
                (r'\bcatalyst\b', 'catalytic system'),
                (r'\bsolvent\b', 'reaction medium'),
                (r'\bspectroscopy\b', 'spectroscopic analysis'),
                (r'\bNMR\b', 'nuclear magnetic resonance spectroscopy'),
                (r'\bIR\b', 'infrared spectroscopy'),
                (r'\bcompound\b', 'chemical entity'),
                (r'\bmolecule\b', 'molecular species'),
                (r'\bpurity\b', 'chemical purity'),
                (r'\bpurified\b', 'isolated and purified'),
                (r'\bcharacterised\b', 'fully characterised'),
            ],
            "Physics": [
                (r'\bforce\b', 'interaction'),
                (r'\bforces\b', 'interactions'),
                (r'\bparticle\b', 'microscopic entity'),
                (r'\belectron\b', 'charged particle'),
                (r'\bvelocity\b', 'motion vector'),
                (r'\benergy\b', 'energetic state'),
                (r'\bpotential\b', 'potential energy function'),
                (r'\bwave\b', 'propagating disturbance'),
                (r'\bfield\b', 'physical field'),
                (r'\bquantum\b', 'quantised'),
                (r'\bmeasured\b', 'experimentally determined'),
                (r'\bsimulated\b', 'numerically computed'),
                (r'\bspectrum\b', 'spectral distribution'),
                (r'\btemperature\b', 'thermal energy scale'),
                (r'\bpressure\b', 'applied stress'),
            ],
            "Biology": [
                (r'\bcell\b', 'biological cell'),
                (r'\bcells\b', 'cellular entities'),
                (r'\bgene\b', 'genetic locus'),
                (r'\bprotein\b', 'polypeptide chain'),
                (r'\benzyme\b', 'biocatalyst'),
                (r'\bexpression\b', 'gene expression'),
                (r'\bpathway\b', 'metabolic pathway'),
                (r'\borganism\b', 'living system'),
                (r'\btissue\b', 'biological tissue'),
                (r'\bobserved\b', 'microscopically observed'),
                (r'\btreatment\b', 'experimental treatment'),
                (r'\bcontrol\b', 'untreated control group'),
                (r'\bconcentration\b', 'molar concentration'),
                (r'\bincubated\b', 'cultured'),
                (r'\bviability\b', 'cell viability'),
            ],
            "Computer Science": [
                (r'\balgorithm\b', 'computational procedure'),
                (r'\balgorithms\b', 'computational methods'),
                (r'\bmodel\b', 'computational model'),
                (r'\bperformance\b', 'computational efficiency'),
                (r'\baccuracy\b', 'predictive accuracy'),
                (r'\bdataset\b', 'data corpus'),
                (r'\btraining\b', 'model training phase'),
                (r'\bnetwork\b', 'neural architecture'),
                (r'\bcomplexity\b', 'computational complexity'),
                (r'\bruntime\b', 'execution time'),
                (r'\bmemory\b', 'space complexity'),
                (r'\bimplementation\b', 'software implementation'),
                (r'\bevaluation\b', 'empirical evaluation'),
                (r'\bframework\b', 'software framework'),
                (r'\bsimulation\b', 'computational simulation'),
            ],
            "Accounts": [
                (r'\bprofit\b', 'net income'),
                (r'\bprofits\b', 'net earnings'),
                (r'\bloss\b', 'net loss'),
                (r'\bexpense\b', 'expenditure'),
                (r'\bexpenses\b', 'operating costs'),
                (r'\brevenue\b', 'total revenue'),
                (r'\bassets\b', 'economic resources'),
                (r'\bliabilities\b', 'financial obligations'),
                (r'\bequity\b', 'owners\' equity'),
                (r'\bcash flow\b', 'net cash inflow'),
                (r'\bbalance sheet\b', 'statement of financial position'),
                (r'\bincome statement\b', 'profit and loss account'),
                (r'\bdepreciation\b', 'amortisation charge'),
                (r'\btax\b', 'income tax expense'),
                (r'\baudit\b', 'independent audit'),
            ],
            "Economics": [
                (r'\bmarket\b', 'economic market'),
                (r'\bsupply\b', 'quantity supplied'),
                (r'\bdemand\b', 'quantity demanded'),
                (r'\bequilibrium\b', 'market equilibrium'),
                (r'\bgrowth\b', 'economic expansion'),
                (r'\binflation\b', 'price level increase'),
                (r'\bunemployment\b', 'labour underutilisation'),
                (r'\bpolicy\b', 'macroeconomic policy'),
                (r'\bconsumption\b', 'household consumption'),
                (r'\binvestment\b', 'capital formation'),
                (r'\btrade\b', 'international trade'),
                (r'\bexchange rate\b', 'currency exchange rate'),
                (r'\bGDP\b', 'gross domestic product'),
                (r'\bprice\b', 'market price'),
                (r'\bsubsidy\b', 'government transfer payment'),  # fixed typo: was \bsubidy\b
            ],
            "History": [
                (r'\brevolution\b', 'political revolution'),
                (r'\bwar\b', 'armed conflict'),
                (r'\bempire\b', 'imperial system'),
                (r'\bking\b', 'monarch'),
                (r'\bqueen\b', 'female sovereign'),
                (r'\bcolony\b', 'colonial territory'),
                (r'\bindependence\b', 'national sovereignty'),
                (r'\btreaty\b', 'international agreement'),
                (r'\bmovement\b', 'social-political movement'),
                (r'\bperiod\b', 'historical era'),
                (r'\bevent\b', 'historical occurrence'),
                (r'\bleader\b', 'political figure'),
                (r'\bideology\b', 'political doctrine'),
                (r'\bsociety\b', 'social structure'),
                (r'\bculture\b', 'cultural system'),
            ],
            "Geography": [
                (r'\bclimate\b', 'climatic conditions'),
                (r'\btemperature\b', 'mean annual temperature'),
                (r'\brainfall\b', 'precipitation'),
                (r'\bregion\b', 'geographical area'),
                (r'\blandscape\b', 'physical landscape'),
                (r'\btopography\b', 'terrain characteristics'),
                (r'\belevation\b', 'altitude above sea level'),
                (r'\bvegetation\b', 'natural vegetation cover'),
                (r'\bsoil\b', 'pedological characteristics'),
                (r'\briver\b', 'watercourse'),
                (r'\bmountain\b', 'mountainous relief'),
                (r'\bcoast\b', 'coastal zone'),
                (r'\bpopulation\b', 'human population distribution'),
                (r'\bsettlement\b', 'human settlement pattern'),
                (r'\bresource\b', 'natural resource endowment'),
            ],
            "Civics": [
                (r'\bdemocracy\b', 'democratic governance'),
                (r'\bgovernment\b', 'governing authority'),
                (r'\bconstitution\b', 'fundamental law'),
                (r'\bcitizen\b', 'member of the polity'),
                (r'\bright\b', 'fundamental right'),
                (r'\bduty\b', 'civic obligation'),
                (r'\belection\b', 'democratic election'),
                (r'\bvoting\b', 'electoral participation'),
                (r'\bparliament\b', 'legislative body'),
                (r'\bjudiciary\b', 'judicial branch'),
                (r'\bexecutive\b', 'executive authority'),
                (r'\bfreedom\b', 'civil liberty'),
                (r'\bequality\b', 'principle of equality'),
                (r'\bjustice\b', 'social justice'),
                (r'\bpolicy\b', 'public policy'),
            ]
        }
        self.journal_styles = {
            "ACS": [  # American Chemical Society — strict passive, concise, avoids "we"
                (r'\bwe\b', ''),
                (r'\bour\b', ''),
                (r'\bused\b', 'was employed'),
                (r'\bwere used\b', 'were employed'),
                (r'\bshowed\b', 'demonstrated'),
                (r'\bfound\b', 'observed'),
                (r'\bthe results show\b', 'the results demonstrate'),
                (r'\bthis work\b', 'the present study'),
                (r'\bhere\b', 'in this study'),
                (r'\bcan be seen\b', 'can be observed'),
            ],
            "RSC": [  # Royal Society of Chemistry — slightly more narrative, still formal
                (r'\bshows\b', 'reveals'),
                (r'\bshowed\b', 'revealed'),
                (r'\bdemonstrates\b', 'illustrates'),
                (r'\bwe report\b', 'reported herein'),
                (r'\bwe present\b', 'presented in this work'),
                (r'\bthe results indicate\b', 'these findings suggest'),
                (r'\bimportant\b', 'noteworthy'),
                (r'\bvery\b', 'highly'),
                (r'\busing\b', 'employing'),
                (r'\bprepared\b', 'synthesised'),
            ],
            "Elsevier": [  # Many Elsevier journals — prefers "findings", limits first person
                (r'\bresults\b', 'findings'),
                (r'\bthe results\b', 'these findings'),
                (r'\bwe found\b', 'it was found'),
                (r'\bwe observed\b', 'it was observed'),
                (r'\bthis study\b', 'the present investigation'),
                (r'\bimportant\b', 'significant'),
                (r'\bshows\b', 'indicates'),
                (r'\bsuggests\b', 'points to'),
                (r'\bhowever\b', 'nevertheless'),
                (r'\btherefore\b', 'consequently'),
            ],
            "Exam-safe": [  # Very conservative — minimal change for school/college submissions
                (r'\bvery important\b', 'crucial'),
                (r'\bvery good\b', 'excellent'),
                (r'\bvery bad\b', 'poor'),
                (r'\bgot\b', 'obtained'),
                (r'\bshows\b', 'indicates'),
                (r'\bwe think\b', 'it is considered'),
                # Keep changes extremely light
            ],
            "Springer": [
                (r'\bwe\b', 'the authors'),
                (r'\bour results\b', 'the obtained results'),
                (r'\bshow\b', 'indicate'),
                (r'\bfound\b', 'revealed'),
                (r'\bthis paper\b', 'the present contribution'),
            ],
            "IEEE": [
                (r'\bwe\b', ''),
                (r'\bthis paper\b', 'this work'),
                (r'\bpresented\b', 'proposed'),
                (r'\bproposed\b', 'introduced'),
                (r'\bperformance\b', 'efficacy'),
                (r'\bresults\b', 'experimental outcomes'),
            ],
            "Nature": [
                (r'\bwe show\b', 'here we demonstrate'),
                (r'\bwe report\b', 'we describe'),
                (r'\bimportant\b', 'striking'),
                (r'\bnovel\b', 'previously unreported'),
                (r'\bsuggests\b', 'indicates'),
            ]
        }

    # -----------------------------------------------------
    def similarity(self, text1: str, text2: str) -> float:
        """Jaccard similarity over lower-cased word sets (0.0..1.0)."""
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())
        union = words1.union(words2)
        return len(words1.intersection(words2)) / len(union) if union else 0.0

    # -----------------------------------------------------
    def synonym_replace(self, text: str, aggressiveness: float) -> str:
        """Method 1: Replace words with synonyms (WordNet based)."""
        try:
            # LAZY IMPORT - CRITICAL FOR HF
            from nltk.tokenize import word_tokenize
            from nltk import pos_tag
            tokens = word_tokenize(text)
            pos_tags = pos_tag(tokens)
            new_tokens = []
            change_prob = 0.05 + 0.18 * aggressiveness
            # Auxiliaries must never be swapped — doing so destroys grammar.
            protected = {'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did'}
            for token, pos in pos_tags:
                lower = token.lower()
                if lower in protected or len(token) <= 3:
                    new_tokens.append(token)
                    continue
                synonyms = self._get_synonyms(lower, pos)
                if synonyms and random.random() < change_prob:
                    new_word = random.choice(synonyms)
                    if token[0].isupper():
                        new_word = new_word.capitalize()
                    new_tokens.append(new_word)
                else:
                    new_tokens.append(token)
            result = ' '.join(new_tokens)
            # Re-attach punctuation that word_tokenize split off.
            return re.sub(r'\s+([.,;:!?])', r'\1', result).strip()
        except Exception:
            # NLTK or its data may be missing on a fresh Space; degrade
            # gracefully instead of crashing the request (was a bare except).
            return text

    # -----------------------------------------------------
    def _get_synonyms(self, word: str, pos_tag: str) -> list:
        """Return up to 3 single-word WordNet synonyms matching the POS tag."""
        # LAZY IMPORT - CRITICAL FOR HF
        from nltk.corpus import wordnet
        pos_map = {'NN': wordnet.NOUN, 'VB': wordnet.VERB, 'JJ': wordnet.ADJ, 'RB': wordnet.ADV}
        wn_pos = pos_map.get(pos_tag[:2])
        if not wn_pos:
            return []
        synonyms = set()
        for syn in wordnet.synsets(word, pos=wn_pos):
            for lemma in syn.lemmas():
                name = lemma.name().replace('_', ' ')
                if name.lower() != word.lower() and ' ' not in name:
                    synonyms.add(name)
        return list(synonyms)[:3]

    def active_passive_conversion(self, text: str) -> str:
        """Method 2: Simple active/passive conversion (placeholder heuristics)."""
        # LAZY IMPORT for sent_tokenize
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        new_sentences = []
        for sent in sentences:
            # Very simple: if starts with subject + verb, try to make passive
            if random.random() < 0.4:  # apply randomly
                # Placeholder: real passive conversion needs more logic
                sent = sent.replace("The researchers conducted", "The experiment was conducted by the researchers")
            new_sentences.append(sent)
        return ' '.join(new_sentences)

    def direct_indirect_style(self, text: str) -> str:
        """Method 3: Convert direct statements to more indirect/academic phrasing."""
        replacements = [
            (r'\bshows that\b', r'indicates that'),
            (r'\bsuggests that\b', r'suggests the possibility that'),
            (r'\bproves that\b', r'provides evidence that'),
            (r'\bwe can see that\b', r'it can be observed that'),
            (r'\bit is clear that\b', r'it appears that'),
        ]
        result = text
        for pattern, repl in replacements:
            result = re.sub(pattern, repl, result, flags=re.I)
        return result

    def clause_reordering(self, text: str) -> str:
        """Method 4: Reorder clauses / adverbial phrases."""
        # LAZY IMPORT for sent_tokenize
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        new_sentences = []
        for sent in sentences:
            if random.random() < 0.6:
                # Simple reorder: move time/place adverbial to front sometimes
                match = re.search(r'\b(in|at|on|during|after|before)\s+[\w\s]+?\b', sent, re.I)
                if match:
                    phrase = match.group(0)
                    rest = sent.replace(phrase, '', 1).strip()
                    # Guard against IndexError: if the adverbial was the whole
                    # sentence, `rest` is empty and rest[0] would crash.
                    if rest and random.random() < 0.5:
                        new_sent = f"{phrase.capitalize()}, {rest[0].lower()}{rest[1:]}"
                    else:
                        new_sent = sent
                    new_sentences.append(new_sent)
                else:
                    new_sentences.append(sent)
            else:
                new_sentences.append(sent)
        return ' '.join(new_sentences)

    def sentence_splitting(self, text: str) -> str:
        """Method 5: Split long sentences at the first ' and '."""
        # LAZY IMPORT for sent_tokenize
        from nltk.tokenize import sent_tokenize
        if len(text.split()) > 35:
            sentences = sent_tokenize(text)
            result = []
            for sent in sentences:
                if len(sent.split()) > 25 and ' and ' in sent:
                    parts = sent.split(' and ', 1)
                    result.append(parts[0].strip() + '.')
                    result.append('And ' + parts[1].strip())
                else:
                    result.append(sent)
            return ' '.join(result)
        return text

    def contraction_toggle(self, text: str) -> str:
        """Method 6: Expand or contract contractions randomly (40% each hit)."""
        words = text.split()
        i = 0
        while i < len(words) - 1:
            raw_pair = f"{words[i]} {words[i + 1]}"
            pair = raw_pair.lower().rstrip('.,;:!?')
            # Preserve trailing punctuation of the second word: "do not." must
            # become "don't.", not "don't" (the original dropped it).
            punct = raw_pair[len(raw_pair.rstrip('.,;:!?')):]
            if pair in self.contractions:
                if random.random() < 0.4:
                    words[i] = self.contractions[pair] + punct
                    words.pop(i + 1)
                else:
                    i += 1
            elif pair in self.expanded_contractions:
                if random.random() < 0.4:
                    words[i] = self.expanded_contractions[pair] + punct
                    words.pop(i + 1)
                else:
                    i += 1
            else:
                i += 1
        return ' '.join(words)

    def academic_phrase_swap(self, text: str, aggressiveness: float) -> str:
        """Method 7: Swap common academic phrases."""
        prob = 0.2 + 0.3 * aggressiveness
        result = text
        for pattern, repl in self.academic_phrase_pairs:
            if random.random() < prob:
                result = re.sub(pattern, repl, result, flags=re.I)
        return result

    def post_process(self, text: str) -> str:
        """Collapse whitespace and guarantee terminal punctuation."""
        text = re.sub(r'\s+', ' ', text).strip()
        if text and text[-1] not in '.!?':
            text += '.'
        return text

    def generate_variant(self, text: str, aggressiveness: float) -> tuple[str, list[str]]:
        """Apply a random combination of transformations.

        Returns:
            (paraphrased text, list of transformation names applied)
        """
        current = text
        applied = []
        # Randomly select 2-4 methods (3-6 when more aggressive).
        num_methods = random.randint(2, 4) if aggressiveness < 0.6 else random.randint(3, 6)
        all_methods = [
            ("Synonym replacement", self.synonym_replace),
            ("Academic phrase swap", self.academic_phrase_swap),
            ("Clause reordering", self.clause_reordering),
            ("Contraction toggle", self.contraction_toggle),
            ("Sentence splitting", self.sentence_splitting),
            ("Direct-indirect style", self.direct_indirect_style),
            ("Active-passive (basic)", self.active_passive_conversion),
        ]
        selected = random.sample(all_methods, min(num_methods, len(all_methods)))
        for name, func in selected:
            # Only these two transformations take an aggressiveness argument.
            if func == self.synonym_replace or func == self.academic_phrase_swap:
                current = func(current, aggressiveness)
            else:
                current = func(current)
            applied.append(name)
        current = self.post_process(current)
        return current, applied

    def transformer_sentence_refine(self, text, max_new_tokens=80):
        """Refine each sentence (>= 6 words) through the T5 paraphraser."""
        model = load_transformer()
        # LAZY IMPORT for sent_tokenize
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        refined = []
        for s in sentences:
            if len(s.split()) < 6:
                refined.append(s)  # too short to paraphrase usefully
                continue
            out = model(
                s,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.95
            )[0]["generated_text"]
            refined.append(out)
        return " ".join(refined)

    def apply_pairs(self, text: str, pairs: list, aggressiveness: float) -> str:
        """Apply (pattern, replacement) pairs, each with probability scaled by aggressiveness."""
        prob = 0.15 + 0.35 * aggressiveness  # adjust range as you like
        result = text
        for pattern, repl in pairs:
            if random.random() < prob:
                result = re.sub(pattern, repl, result, flags=re.I)
        return result

    def rewrite(self, text, aggr, discipline, journal,
                use_synonym=None, use_academic=None, use_discipline=None,
                use_active_passive=None, use_direct_indirect=None,
                use_clause=None, use_split=None):
        """Rewrite ``text`` sentence by sentence using the enabled transforms.

        The seven ``use_*`` keyword flags are optional so existing callers
        (which pass only the four positional arguments) keep working. When a
        flag is None, a module-level boolean of the matching upper-case name
        (e.g. USE_SYNONYM) is consulted; non-boolean globals (such as Gradio
        Checkbox components, which are always truthy) are ignored in favour
        of the documented default.
        """
        def _flag(value, global_name, default):
            if value is not None:
                return bool(value)
            g = globals().get(global_name, default)
            return g if isinstance(g, bool) else default

        use_synonym = _flag(use_synonym, "USE_SYNONYM", True)
        use_academic = _flag(use_academic, "USE_ACADEMIC", True)
        use_discipline = _flag(use_discipline, "USE_DISCIPLINE", True)
        use_active_passive = _flag(use_active_passive, "USE_ACTIVE_PASSIVE", True)
        use_direct_indirect = _flag(use_direct_indirect, "USE_DIRECT_INDIRECT", True)
        use_clause = _flag(use_clause, "USE_CLAUSE", False)  # UI default is off
        use_split = _flag(use_split, "USE_SPLIT", True)
        # LAZY IMPORT for sent_tokenize
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        new_sents = []
        for s in sentences:
            if use_synonym:
                s = self.synonym_replace(s, aggr)
            if use_academic:
                s = self.apply_pairs(s, self.academic_phrase_pairs, aggr)
            if use_discipline and discipline != "General":
                s = self.apply_pairs(
                    s, self.discipline_terms.get(discipline, []), aggr)
            if use_active_passive:
                s = self.active_passive_conversion(s)
            if use_direct_indirect:
                s = self.direct_indirect_style(s)
            if use_clause:
                s = self.clause_reordering(s)
            if use_split:
                s = self.sentence_splitting(s)
            # Journal style always applied (probability=1.0 = always);
            # unknown styles (e.g. "Standard") map to an empty pair list.
            s = self.apply_pairs(
                s, self.journal_styles.get(journal, []), 1.0)
            new_sents.append(s)
        return " ".join(new_sents)
# =========================================================
# DOCX Export (with lazy import)
# =========================================================
def export_docx(original, variants):
    """Build a .docx report of the original text plus each variant.

    Args:
        original: The source text.
        variants: List of dicts with keys "text" and "sim" (float).

    Returns:
        Path (str) of the saved .docx file. A file path — not an in-memory
        BytesIO — is required because the gr.File output component serves
        downloads from disk.
    """
    from docx import Document
    import tempfile
    doc = Document()
    doc.add_heading("Academic Paraphrasing Output", 1)
    doc.add_paragraph("Original Text:")
    doc.add_paragraph(original)
    for i, v in enumerate(variants, 1):
        doc.add_heading(f"Variant {i}", 2)
        doc.add_paragraph(v["text"])
        doc.add_paragraph(f"Similarity: {v['sim']:.3f}")
    # delete=False: the file must outlive this call so Gradio can serve it.
    tmp = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
    tmp.close()
    doc.save(tmp.name)
    return tmp.name
# =========================================================
# Gradio callback (TOP-LEVEL FUNCTION)
# =========================================================
def generate_paraphrases(
    input_text,
    uploaded_file,
    num_variants,
    aggressiveness,
    discipline,
    journal_mode,
    plagiarism_safe,
    use_transformer,
    USE_SYNONYM,
    USE_ACADEMIC,
    USE_DISCIPLINE,
    USE_ACTIVE_PASSIVE,
    USE_DIRECT_INDIRECT,
    USE_CLAUSE,
    USE_SPLIT,
):
    """Gradio click handler: produce paraphrase variants plus a DOCX file.

    Returns:
        (markdown_display, docx_file) for the two output components.
    """
    # Inject file text if uploaded
    if uploaded_file:
        input_text = read_input_file(uploaded_file)
    if not input_text or not input_text.strip():
        return "Please paste some text or upload a file.", None
    # AcademicParaphraser.rewrite resolves the USE_* switches at module
    # scope, where these names are otherwise bound to (always-truthy)
    # Checkbox components; publish the actual boolean values there so the
    # user's checkbox choices take effect.
    globals().update({
        "USE_SYNONYM": bool(USE_SYNONYM),
        "USE_ACADEMIC": bool(USE_ACADEMIC),
        "USE_DISCIPLINE": bool(USE_DISCIPLINE),
        "USE_ACTIVE_PASSIVE": bool(USE_ACTIVE_PASSIVE),
        "USE_DIRECT_INDIRECT": bool(USE_DIRECT_INDIRECT),
        "USE_CLAUSE": bool(USE_CLAUSE),
        "USE_SPLIT": bool(USE_SPLIT),
    })
    engine = AcademicParaphraser()
    num_variants = int(num_variants)  # Slider values may arrive as floats
    candidates = []
    # Oversample 3x so the plagiarism filter still leaves enough variants.
    for _ in range(num_variants * 3):
        t = engine.rewrite(
            input_text,
            aggressiveness,
            discipline,
            journal_mode
        )
        if use_transformer and TRANSFORMER_AVAILABLE:
            t = engine.transformer_sentence_refine(t)
        candidates.append({"text": t, "sim": engine.similarity(input_text, t)})
    outputs = [c for c in candidates
               if not (plagiarism_safe and c["sim"] > 0.72)]
    if not outputs:
        # All candidates exceeded the similarity threshold — return the
        # least-similar ones rather than an empty result.
        outputs = candidates
    outputs = sorted(outputs, key=lambda x: x["sim"])[:num_variants]
    display = ""
    for i, o in enumerate(outputs, 1):
        display += f"\n\n### Variant {i} (Similarity {o['sim']:.3f})\n{o['text']}"
    doc = export_docx(input_text, outputs)
    return display.strip(), doc
# =========================================================
# === GRADIO UI ===========================================
# =========================================================
with gr.Blocks(title="Advanced Academic Paraphraser") as demo:
    gr.Markdown(
        "# Advanced Academic Paraphraser\n"
        "Rule-based | AI | Hybrid academic paraphrasing engine"
    )
    input_text = gr.Textbox(label="Paste Text", lines=10)
    uploaded_file = gr.File(label="Upload (.txt / .docx)")
    variants = gr.Slider(1, 3, value=2, step=1, label="Variants")
    strength = gr.Slider(0.1, 1.0, value=0.6, step=0.05, label="Rewrite Strength")
    discipline = gr.Dropdown(
        ["General", "Chemistry", "Physics", "Biology", "Computer Science",
         "Accounts", "Economics", "History", "Geography", "Civics"],
        value="General",
        label="Discipline"
    )
    # Offer every style defined in AcademicParaphraser.journal_styles
    # (Springer/IEEE/Nature were defined but missing from the dropdown);
    # "Standard" intentionally maps to no style pairs.
    journal = gr.Dropdown(
        ["Standard", "ACS", "RSC", "Elsevier", "Exam-safe",
         "Springer", "IEEE", "Nature"],
        value="Standard",
        label="Tone"
    )
    plagiarism_safe = gr.Checkbox(True, label="Plagiarism-risk minimization")
    hybrid = gr.Checkbox(False, label="Hybrid rule + transformer")
    USE_SYNONYM = gr.Checkbox(True, label="Synonym replacement")
    USE_ACADEMIC = gr.Checkbox(True, label="Academic phrase expansion")
    USE_DISCIPLINE = gr.Checkbox(True, label="Discipline terms")
    USE_ACTIVE_PASSIVE = gr.Checkbox(True, label="Active / Passive")
    USE_DIRECT_INDIRECT = gr.Checkbox(True, label="Direct / Indirect")
    USE_CLAUSE = gr.Checkbox(False, label="Clause rewriting")
    USE_SPLIT = gr.Checkbox(True, label="Sentence splitting")
    run_btn = gr.Button("Generate Paraphrase")
    output_md = gr.Markdown()
    output_file = gr.File(label="Download DOCX")
    # =====================================================
    # EVENT HANDLERS MUST BE INSIDE THE BLOCKS CONTEXT
    # =====================================================
    run_btn.click(
        fn=generate_paraphrases,
        inputs=[
            input_text,
            uploaded_file,
            variants,
            strength,
            discipline,
            journal,
            plagiarism_safe,
            hybrid,
            USE_SYNONYM,
            USE_ACADEMIC,
            USE_DISCIPLINE,
            USE_ACTIVE_PASSIVE,
            USE_DIRECT_INDIRECT,
            USE_CLAUSE,
            USE_SPLIT,
        ],
        outputs=[output_md, output_file]
    )
# =========================================================
# Entry point (local execution and Hugging Face Spaces)
# =========================================================
# Both branches previously duplicated the same launch arguments and the
# header comment wrongly claimed there was "NO __main__ block"; build the
# kwargs once — only `debug` differs when the module is imported (as the
# HF Spaces runtime does).
_launch_kwargs = {
    "server_name": "0.0.0.0",
    "server_port": 7860,
    "share": False,  # public share links are unnecessary on HF Spaces
}
if __name__ != "__main__":
    _launch_kwargs["debug"] = False
demo.launch(**_launch_kwargs)