"""
NeuroSymbolic V8.6+ - Complete Implementation with Chain-of-Thought Reasoning
===============================================================================

ARCHITECTURE OVERVIEW:
- Extract N words from the first sentence (e.g., "consider", "nature", "understanding")
- Generate 100 unique syntactic forms for these words
- Generate 100 sentences: sentence[i] uses form[i] as its primary feature
- Each sentence yields exactly one form, spreading its activation through that sentence
- Forms accumulate value across different sentence contexts

Key: Each sentence is a distinct syntactic-semantic environment where one form dominates.

CHAIN-OF-THOUGHT REASONING:
Every major function includes reasoning about WHY that function exists and HOW it
integrates with the broader system. This document is self-explaining via docstrings.
===============================================================================
"""

from __future__ import annotations

import re
import math
import hashlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set

import numpy as np
import pandas as pd
import gradio as gr
import torch
import torch.nn.functional as F
from datasets import load_dataset

STOP_WORDS = set(
    "a an and are as at be by for from has have he her him his i in is it its "
    "me my of on or our she so that the their them they this to was we were what "
    "when where which who will with you your"
    .split()
)

COGNITIVE_TOKENS = {"[PROBLEM]", "[SOLUTION]"}

TOPO_KEYWORDS = {
    "homology", "cohomology", "persistent", "filtration", "barcode", "betti",
    "euler", "simplicial", "homotopy", "manifold", "morse", "sheaf",
}

_VOWELS = set("aeiouy")
_COMMON_BIGRAMS: Set[str] = {
    "th", "he", "in", "er", "an", "re", "on", "en", "at", "ou", "ed", "nd",
    "to", "or", "ea", "ti", "es", "st", "ar", "nt", "is", "al", "it", "as",
    "ha", "et", "se", "ng", "le", "of",
}

_LATINATE_PREFIXES = {
    "pre", "post", "anti", "auto", "bio", "geo", "hyper", "hypo", "inter",
    "intra", "micro", "macro", "meta", "mono", "multi", "neo", "non", "over",
    "poly", "pseudo", "semi", "sub", "super", "trans", "ultra", "uni", "dis",
    "mis", "un", "re", "de",
}

_LATINATE_SUFFIXES = {
    "tion", "sion", "ment", "ness", "ity", "ism", "ist", "ize", "ise", "ful",
    "less", "ous", "ious", "eous", "ance", "ence", "able", "ible", "ive",
    "ative", "ology", "ography", "ician", "ation", "ization", "isation",
}

_EARLY_WORDS: Dict[str, float] = {
    "cat": 2.5, "dog": 2.5, "mom": 2.2, "dad": 2.2, "baby": 2.8, "ball": 2.6,
    "cup": 2.7, "eye": 2.4, "ear": 2.5, "nose": 2.6, "hat": 2.8, "shoe": 2.9,
    "bed": 2.7, "hot": 3.0, "cold": 3.1, "big": 3.0, "small": 3.2, "run": 3.1,
    "eat": 2.9, "go": 2.5, "yes": 2.4, "no": 2.3, "hi": 2.2, "bye": 2.3,
    "more": 2.8, "up": 2.6, "down": 2.8, "in": 2.5, "out": 2.7, "on": 2.6,
    "off": 2.8, "want": 2.7, "help": 3.0, "play": 2.9, "walk": 3.0,
    "look": 2.8, "see": 2.5, "hear": 2.8, "think": 3.5, "know": 3.4,
    "hand": 2.9, "foot": 2.9, "head": 2.7, "face": 2.8, "name": 3.2,
    "home": 3.0, "door": 3.1, "car": 2.8, "tree": 3.0, "book": 3.2,
}

DIM_MIN = 2
DIM_MAX = 12
LENGTH_CEIL = 14
SHIFT_MAG_MIN = 0.05
SHIFT_MAG_MAX = 0.35
AGREEMENT_BONUS_MIN = 0.10
AGREEMENT_BONUS_MAX = 0.60

class SyntacticForm:
    """
    REASONING (Sentences 1-5):
    A single syntactic/morphological form variant. The system generates exactly 100
    sentences, each assigned one unique syntactic form. Forms are generated by
    cycling through the extracted words and a predefined list of 100 syntactic
    forms; when the form index reaches the end of the list, it wraps back to
    index 0. Each form tracks its activation strength across sentences via the
    activation_per_sentence dictionary. A form's value accumulates through two
    mechanisms: direct activation in sentences and spreading influence to
    related words.
    """

    FORMS = [
        # Base / citation forms
        "base", "root", "stem", "lemma", "canonical",
        # Tense
        "present_tense", "past_tense", "future_tense", "present_progressive",
        "past_progressive", "future_progressive", "present_perfect",
        "past_perfect", "future_perfect", "simple_present",
        # Person / number
        "singular", "plural", "first_person_singular", "first_person_plural",
        "second_person_singular", "second_person_plural", "third_person_singular",
        "third_person_plural", "person_neutral", "number_neutral",
        # Case
        "nominative", "accusative", "genitive", "dative", "locative", "ablative",
        "allative", "inessive", "elative", "illative",
        # Part of speech
        "noun_form", "verb_form", "adjective_form", "adverb_form",
        "preposition_form", "conjunction_form", "article_form", "pronoun_form",
        "determiner_form", "numeral_form",
        # Voice
        "active_voice", "passive_voice", "middle_voice", "reflexive_voice",
        "reciprocal_voice", "causative_voice", "inchoative_voice",
        "iterative_voice", "habitual_voice", "frequentative_voice",
        # Mood
        "indicative_mood", "subjunctive_mood", "conditional_mood",
        "imperative_mood", "optative_mood", "necessitative_mood",
        "potential_mood", "desiderative_mood", "dubitative_mood",
        "permissive_mood",
        # Aspect
        "perfective_aspect", "imperfective_aspect", "habitual_aspect",
        "iterative_aspect", "inceptive_aspect", "terminative_aspect",
        "continuative_aspect", "stative_aspect", "dynamic_aspect",
        "aorist_aspect",
        # Degree / evaluation
        "positive_degree", "comparative_degree", "superlative_degree",
        "diminutive_form", "augmentative_form", "pejorative_form",
        "ameliorative_form", "intensive_form", "attenuative_form",
        "disparagingly_form",
        # Derivational
        "agentive_noun", "instrumental_noun", "locative_noun",
        "abstract_noun", "action_noun", "quality_noun", "state_noun",
        "relational_adjective", "qualitative_adjective",
        "derivational_adjective",
        # Transitivity
        "transitive_form", "intransitive_form", "ditransitive_form",
        "bitransitive_form", "ambitransitive_form",
    ]

    def __init__(self, word: str, form_name: str, sentence_index: int):
        """
        Initialize a syntactic form instance.

        Args:
            word: The base word this form modifies
            form_name: The name of the syntactic form (must be in FORMS list)
            sentence_index: Which sentence (0-99) this form is assigned to
        """
        self.word = word.lower()
        self.form_name = form_name if form_name in self.FORMS else "base"
        self.sentence_index = sentence_index

        # Activation tracking across sentences.
        self.activation_per_sentence: Dict[int, float] = {}
        self.total_activation: float = 0.0

        # Spreading influence to related words.
        self.spreading_context: List[str] = []
        self.value_accumulated: float = 0.0

    def __repr__(self) -> str:
        """String representation for debugging."""
        return f"{self.word}[{self.form_name}@sent{self.sentence_index}]"

    def to_string(self) -> str:
        """Convert form to string for hashing/embedding."""
        return f"{self.word}_{self.form_name}_{self.sentence_index}"

    def activate_in_sentence(self, sentence_index: int, strength: float = 1.0):
        """
        REASONING (Sentence 41):
        Record activation in a specific sentence. When a sentence is generated,
        the form that dominated it records activation strength and influenced
        words for later analysis.
        """
        self.activation_per_sentence[sentence_index] = (
            self.activation_per_sentence.get(sentence_index, 0.0) + strength
        )
        self.total_activation += strength

    def spread_to_word(self, word: str, strength: float = 0.5):
        """
        REASONING (Sentence 43):
        Record spreading influence to another word. Influence is bidirectional:
        a form spreads to words, and those words contribute back to the form's
        total value via accumulation.
        """
        if word not in self.spreading_context:
            self.spreading_context.append(word)
            self.value_accumulated += strength

    def get_total_value(self) -> float:
        """
        REASONING (Sentence 5):
        Total value = base activations + accumulated spread. This represents the
        cumulative influence of the form across all sentences and spreading contexts.
        """
        return self.total_activation + self.value_accumulated

@dataclass
class SentenceFormPlan:
    """
    REASONING (Sentences 40-43):
    The SentenceFormPlan orchestrates form creation, assignment, and value reporting
    for the 100-sentence system. When a sentence is generated, the form that dominated
    it records activation strength and influenced words for later analysis. The form
    report ranks forms by cumulative value (activation + spreading), showing the top 30
    and detailed influence maps. Influence is bidirectional: a form spreads to words,
    and those words contribute back to the form's total value via accumulation.
    """

    extracted_words: List[str] = field(default_factory=list)
    forms_list: List[SyntacticForm] = field(default_factory=list)
    form_by_sentence: Dict[int, SyntacticForm] = field(default_factory=dict)
    sentence_outputs: Dict[int, str] = field(default_factory=dict)
    form_report: str = ""

    def build_forms(self, words: List[str]):
        """
        REASONING (Sentences 2-3):
        Build 100 forms from the extracted words, cycling through them as needed.
        Forms are generated by pairing each sentence index with a word and a form
        name; when either index runs past the end of its list, it wraps back to 0.
        """
        self.extracted_words = words
        form_index = 0
        word_index = 0

        for sent_idx in range(100):
            # Cycle through the extracted words.
            if word_index >= len(words):
                word_index = 0
            word = words[word_index]

            # Cycle through the 100 form names.
            if form_index >= len(SyntacticForm.FORMS):
                form_index = 0
            form_name = SyntacticForm.FORMS[form_index]

            # Create the form and assign it to this sentence.
            form = SyntacticForm(word, form_name, sent_idx)
            self.forms_list.append(form)
            self.form_by_sentence[sent_idx] = form

            word_index += 1
            form_index += 1

    def record_sentence_generation(
        self,
        sentence_index: int,
        text: str,
        form_activation: float = 1.0,
        influenced_words: Optional[List[str]] = None,
    ):
        """
        REASONING (Sentence 41):
        Record a generated sentence and its form activation. When a sentence is
        generated, the form that dominated it records activation strength and
        influenced words for later analysis.
        """
        self.sentence_outputs[sentence_index] = text
        form = self.form_by_sentence.get(sentence_index)
        if form:
            form.activate_in_sentence(sentence_index, form_activation)
            if influenced_words:
                for w in influenced_words:
                    form.spread_to_word(w, 0.5)

    def generate_report(self) -> str:
        """
        REASONING (Sentence 42):
        Generate a detailed form-by-form report. The report ranks forms by
        cumulative value (activation + spreading), showing the top 30 and
        detailed influence maps.
        """
        lines = [
            f"{'=' * 70}",
            " 100-Sentence Syntactic Form Plan - One Form Per Sentence",
            f"{'=' * 70}",
            f"Extracted words: {', '.join(self.extracted_words)}",
            f"Total forms generated: {len(self.forms_list)}",
            f"Sentences generated: {len(self.sentence_outputs)}",
            "",
        ]

        # Rank forms by cumulative value.
        sorted_forms = sorted(self.forms_list, key=lambda f: f.get_total_value(), reverse=True)
        total_value = sum(f.get_total_value() for f in sorted_forms)

        lines.append(f"Total cumulative activation: {total_value:.4f}")
        lines.append("")
        lines.append("Form Rankings (Top 30):")
        lines.append(
            f"{'Rank':<5} {'Sentence':<8} {'Word':<15} {'Form':<25} "
            f"{'Total Value':<12} {'% of Total':<10} {'Influenced':<10}"
        )
        lines.append(f"{'-' * 90}")

        for rank, form in enumerate(sorted_forms[:30], 1):
            pct = 100 * form.get_total_value() / max(total_value, 1e-8)
            num_influenced = len(form.spreading_context)
            lines.append(
                f"{rank:<5} {form.sentence_index:<8} {form.word:<15} "
                f"{form.form_name:<25} {form.get_total_value():<12.4f} "
                f"{pct:<10.2f} {num_influenced:<10}"
            )

        lines.append("")
        lines.append("Form-to-Word Influence Map (Top 10 Forms):")
        lines.append("")

        for rank, form in enumerate(sorted_forms[:10], 1):
            if form.spreading_context:
                influenced_str = ", ".join(form.spreading_context[:8])
                if len(form.spreading_context) > 8:
                    influenced_str += f", ... (+{len(form.spreading_context) - 8} more)"
                lines.append(
                    f"{rank:2d}. {form.word}[{form.form_name}@sent{form.sentence_index}]\n"
                    f"    -> Influenced: {influenced_str}"
                )

        lines.append("")
        lines.append("Sentence-by-Sentence Form Assignments:")
        lines.append("")

        for sent_idx in range(min(30, len(self.sentence_outputs))):
            form = self.form_by_sentence.get(sent_idx)
            output = self.sentence_outputs.get(sent_idx, "(not generated)")
            if form:
                preview = output[:60] + "..." if len(output) > 60 else output
                lines.append(
                    f"Sent[{sent_idx:2d}] Form: {form.word}[{form.form_name:<25s}] "
                    f"Value: {form.get_total_value():.3f} Preview: {preview}"
                )

        if len(self.sentence_outputs) > 30:
            lines.append(f"... ({len(self.sentence_outputs) - 30} more sentences)")

        return "\n".join(lines)


def semantic_similarity(word1: str, word2: str) -> float:
    """
    REASONING (Sentences 11-13):
    Semantic similarity between two words is a weighted average of three metrics:
    Levenshtein edit distance (40%), character length difference (30%), and bigram
    overlap (30%). Levenshtein distance measures how many single-character edits
    separate two words, normalized by their maximum length. Bigrams (2-letter
    sequences) provide phonetic/orthographic fingerprints; words sharing more
    bigrams are more similar phonetically.
    """
    w1, w2 = word1.lower(), word2.lower()
    if w1 == w2:
        return 1.0

    # Levenshtein similarity, normalized by the longer word.
    lev_dist = edit_distance(w1, w2)
    max_len = max(len(w1), len(w2))
    lev_sim = 1.0 - (lev_dist / max(max_len, 1))

    # Length similarity.
    len_dist = abs(len(w1) - len(w2))
    len_sim = 1.0 - (len_dist / max(max_len, 1))

    # Bigram (Jaccard) overlap.
    bigrams1 = set(w1[i:i + 2] for i in range(len(w1) - 1))
    bigrams2 = set(w2[i:i + 2] for i in range(len(w2) - 1))
    if bigrams1 and bigrams2:
        bigram_sim = len(bigrams1 & bigrams2) / len(bigrams1 | bigrams2)
    else:
        bigram_sim = 0.0

    combined = 0.4 * lev_sim + 0.3 * len_sim + 0.3 * bigram_sim
    return float(combined)

def edit_distance(s1: str, s2: str) -> int:
    """
    REASONING (Sentence 12):
    Levenshtein distance provides edit-distance similarity that captures orthographic
    changes (insertions, deletions, substitutions) as discrete steps.
    """
    if len(s1) < len(s2):
        return edit_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)

    prev_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        curr_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = prev_row[j + 1] + 1
            deletions = curr_row[j] + 1
            substitutions = prev_row[j] + (c1 != c2)
            curr_row.append(min(insertions, deletions, substitutions))
        prev_row = curr_row
    return prev_row[-1]
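
# Worked example (illustrative sketch, not used by the pipeline): for "cat" vs.
# "cart", edit_distance is 1, so lev_sim = 1 - 1/4 = 0.75; len_sim = 1 - 1/4 =
# 0.75; the bigram sets {ca, at} and {ca, ar, rt} overlap 1/4 = 0.25; the
# weighted average is 0.4 * 0.75 + 0.3 * 0.75 + 0.3 * 0.25 = 0.60.
def _example_semantic_similarity() -> None:
    assert edit_distance("cat", "cart") == 1
    score = semantic_similarity("cat", "cart")
    assert abs(score - 0.60) < 1e-6
    print(f"semantic_similarity('cat', 'cart') = {score:.2f}")
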
def length_alpha(word: str, ceil: int = LENGTH_CEIL) -> float:
    """
    REASONING (Sentences 6, 71):
    Word length determines a normalized factor α (alpha) in (0, 1) via a sigmoid
    centered on the midpoint of the length ceiling (14 / 2 = 7 characters):
    α = 1 / (1 + exp(-0.55 * (n - 7))). The sigmoid produces a smooth S-curve,
    so AoA and topology scaling vary gradually with word length.
    """
    n = len(word.strip())
    mid = ceil / 2.0
    return float(1.0 / (1.0 + math.exp(-0.55 * (n - mid))))

def length_dim(word: str) -> int:
    """
    REASONING (Sentence 7):
    Embedding dimension scales linearly from DIM_MIN=2 to DIM_MAX=12 based on α,
    rounded to the nearest even integer. Longer, more complex words receive
    larger embeddings, allowing finer-grained semantic representation.
    """
    α = length_alpha(word)
    raw = DIM_MIN + α * (DIM_MAX - DIM_MIN)
    return max(DIM_MIN, int(round(raw / 2) * 2))

def length_shift_mag(word: str) -> float:
    """
    REASONING (Sentence 9):
    The shift magnitude for vector perturbation grows with word length, controlling
    how much the embedding "drifts" in semantic space.
    """
    α = length_alpha(word)
    return SHIFT_MAG_MIN + α * (SHIFT_MAG_MAX - SHIFT_MAG_MIN)

def length_agreement_bonus(word: str) -> float:
    """
    REASONING (Sentence 10):
    Agreement bonus (for centroid boosting) is higher for longer words, reflecting
    their greater semantic stability and consistency.
    """
    α = length_alpha(word)
    return AGREEMENT_BONUS_MIN + α * (AGREEMENT_BONUS_MAX - AGREEMENT_BONUS_MIN)

def length_topo_kernel(word: str) -> float:
    """
    REASONING (Sentence 20):
    The topological kernel combines topology weight with length-dependent α,
    emphasizing longer words in mathematical contexts.
    """
    α = length_alpha(word)
    return float(0.05 + 0.95 * (α ** 1.5))
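
# Worked example (illustrative sketch): a 3-letter word sits low on the sigmoid
# (α("cat") ≈ 0.10, dim 2), while a 13-letter word saturates it
# (α("understanding") ≈ 0.96, dim 12), so every length-derived knob
# (shift magnitude, agreement bonus, topo kernel) scales between those poles.
def _example_length_scaling() -> None:
    for w in ("cat", "understanding"):
        print(
            f"{w}: α={length_alpha(w):.2f} dim={length_dim(w)} "
            f"shift={length_shift_mag(w):.3f} bonus={length_agreement_bonus(w):.3f} "
            f"kernel={length_topo_kernel(w):.3f}"
        )
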
AOA_DATASET_URL = (
    "https://norare.clld.org/contributions/Kuperman-2012-AoA/English-AoA-30K.csv"
)
AOA_COL_WORD = "Word"
AOA_COL_AOA = "AoA"

def load_aoa_dataset(max_rows: int = 35_000) -> Dict[str, float]:
    """
    REASONING (Sentence 76):
    The AoA dataset (Kuperman 2012, ~30K words) is loaded lazily on first run,
    providing empirical age-of-acquisition ratings for English words. Returns an
    empty dict on any failure so the regression fallback still works offline.
    """
    try:
        df = pd.read_csv(AOA_DATASET_URL, nrows=max_rows)
        if AOA_COL_WORD not in df.columns or AOA_COL_AOA not in df.columns:
            return {}
        df = df[[AOA_COL_WORD, AOA_COL_AOA]].dropna()
        return {
            str(w).strip().lower(): float(a)
            for w, a in zip(df[AOA_COL_WORD], df[AOA_COL_AOA])
        }
    except Exception:
        return {}

def _count_syllables(word: str) -> int:
    """Count syllables in a word by counting vowel clusters."""
    w = word.lower().rstrip("e")
    count = sum(
        1 for i, c in enumerate(w)
        if c in _VOWELS and (i == 0 or w[i - 1] not in _VOWELS)
    )
    return max(1, count)

def _morpheme_complexity(word: str) -> float:
    """
    REASONING (Sentence 16):
    Morphological complexity (Latinate prefixes/suffixes) increases predicted AoA,
    reflecting the cognitive load of learning derived words.
    """
    w = word.lower()
    score = 0.0
    for p in _LATINATE_PREFIXES:
        if w.startswith(p) and len(w) > len(p) + 2:
            score += 0.25
            break
    for s in _LATINATE_SUFFIXES:
        if w.endswith(s) and len(w) > len(s) + 2:
            score += 0.25 * (1 + len(s) / 6)
            break
    return min(1.0, score)

def _bigram_familiarity(word: str) -> float:
    """
    REASONING (Sentence 15):
    Words built from common bigrams (e.g., "th", "he", "in") are predicted to be
    learned earlier because those letter pairs appear in many familiar words.
    """
    w = word.lower()
    if len(w) < 2:
        return 0.5
    bigrams = [w[i:i + 2] for i in range(len(w) - 1)]
    return sum(1 for b in bigrams if b in _COMMON_BIGRAMS) / len(bigrams)

def _ortho_neighborhood_size(word: str, aoa_dict: Dict[str, float]) -> int:
    """Compute orthographic neighborhood size (words differing by one character)."""
    w = word.lower()
    n = len(w)
    count = 0
    for cand in aoa_dict:
        if len(cand) == n and cand != w:
            diffs = sum(a != b for a, b in zip(w, cand))
            if diffs == 1:
                count += 1
        if count >= 20:
            break
    return count

def calculate_word_age(
    word: str,
    aoa: Dict[str, float],
    corpus_freq: Optional[Dict[str, int]] = None,
    corpus_total: int = 1,
) -> float:
    """
    REASONING (Sentences 14, 74):
    Word Age of Acquisition (AoA) is estimated with a fixed linear model over
    surface features: syllable count, morpheme complexity, bigram familiarity,
    corpus frequency, and orthographic neighborhood size. Empirical AoA ratings
    and the early-words table take precedence when available.

    Formula:
        AoA(word) = 8.5 + 0.30*(char_length - 5) + 0.55*(syllable_count - 2)
                    + 2.80*morpheme_complexity - 1.60*bigram_familiarity
                    - 0.18*log_frequency - 0.40*log(1 + neighborhood_size)
    """
    w = word.lower().strip()
    if not w or not w[0].isalpha():
        return 10.0

    # Empirical rating wins.
    if w in aoa:
        return aoa[w]

    # Early-acquired words table.
    if w in _EARLY_WORDS:
        return _EARLY_WORDS[w]

    # Surface features.
    n_chars = len(w)
    n_syl = _count_syllables(w)
    morph = _morpheme_complexity(w)
    bigram_f = _bigram_familiarity(w)
    neigh = _ortho_neighborhood_size(w, aoa)

    if corpus_freq and w in corpus_freq:
        rel_freq = corpus_freq[w] / max(corpus_total, 1)
        log_freq = math.log(1 + rel_freq * 1_000_000)
    else:
        log_freq = 0.0

    # Fixed regression coefficients.
    intercept = 8.5
    β_len = 0.30
    β_syl = 0.55
    β_morph = 2.80
    β_big = 1.60
    β_freq = 0.18
    β_neigh = 0.40

    estimated = (
        intercept
        + β_len * (n_chars - 5)
        + β_syl * (n_syl - 2)
        + β_morph * morph
        - β_big * bigram_f
        - β_freq * log_freq
        - β_neigh * math.log(1 + neigh)
    )

    return float(max(2.0, min(20.0, estimated)))
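
# Worked example (illustrative sketch): with an empty AoA dict and no corpus
# frequencies, "understanding" has 13 chars, 4 vowel clusters, morpheme
# complexity 0.25 (the "un" prefix), and bigram familiarity 7/12, so
# AoA ≈ 8.5 + 0.30*8 + 0.55*2 + 2.80*0.25 - 1.60*(7/12) ≈ 11.77 years.
def _example_word_age() -> None:
    age = calculate_word_age("understanding", aoa={})
    assert abs(age - 11.7667) < 1e-3
    print(f"estimated AoA('understanding') = {age:.2f}")
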
def word_age(
    aoa: Dict[str, float],
    token: str,
    corpus_freq: Optional[Dict[str, int]] = None,
    corpus_total: int = 1,
) -> float:
    """Convenience wrapper for calculate_word_age."""
    return calculate_word_age(token, aoa, corpus_freq, corpus_total)

def age_continuity_boost(age1: float, age2: float, strength: float = 0.12) -> float:
    """
    REASONING (Sentence 18):
    Age continuity boost rewards word pairs with similar AoA values, reflecting
    coherence in conceptual maturity, and up-weights early-acquired pairs:
    strength * exp(-|age1 - age2| / 3) * min(age1, age2, 8) / 8
    """
    d = abs(age1 - age2)
    early = min(age1, age2, 8.0) / 8.0
    return float(strength * math.exp(-d / 3.0) * early)
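
# Quick check (illustrative sketch): ages 3 and 4 give
# 0.12 * exp(-1/3) * 3/8 ≈ 0.032, while ages 3 and 12 give
# 0.12 * exp(-3) * 3/8 ≈ 0.002, so distant ages are strongly damped.
def _example_age_continuity() -> None:
    print(f"{age_continuity_boost(3, 4):.3f}, {age_continuity_boost(3, 12):.3f}")
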
def topo_weight(token: str) -> float:
    """
    REASONING (Sentence 19):
    Topological keywords ("homology", "filtration", etc.) signal mathematical
    content; each match contributes a base weight of 0.4, capped at 1.0, and the
    total is then scaled by the length-dependent topological kernel.
    """
    tl = token.lower()
    base = min(1.0, sum(0.4 for kw in TOPO_KEYWORDS if kw in tl))
    length_presence = 0.05 * length_alpha(token)
    raw = base + length_presence
    return float(min(1.0, raw * length_topo_kernel(token)))

def semantic_scalar(t1: str, t2: str) -> float:
    """Semantic scalar based on length difference."""
    n = max(len(t1), len(t2), 1)
    dist = abs(len(t1) - len(t2))
    return float(1.0 - dist / n)

def centroid_boost(
    aoa: Dict[str, float],
    current: str,
    candidates: List[str],
    strength: float = 0.10,
    corpus_freq: Optional[Dict[str, int]] = None,
    corpus_total: int = 1,
) -> np.ndarray:
    """
    REASONING (Sentences 21-22):
    Centroid boost applies symmetric topological weighting to word pairs (the
    topology weights of both words are averaged), ensuring bidirectional
    coherence. Each candidate's boost averages three terms (a unit base, the
    symmetric topology weight, and the age-continuity boost), scaled by the
    length-based semantic scalar and overall strength:
    boost = strength * sim * (1 + tw + ab) / 3.
    """
    cs_topo = topo_weight(current)
    cs_age = word_age(aoa, current, corpus_freq, corpus_total)
    boosts = np.zeros(len(candidates), dtype=np.float32)

    for i, c in enumerate(candidates):
        sim = semantic_scalar(current, c)
        tw = (topo_weight(c) + cs_topo) * 0.5
        ab = age_continuity_boost(
            cs_age, word_age(aoa, c, corpus_freq, corpus_total)
        )
        boosts[i] = strength * sim * (1.0 + tw + ab) / 3.0

    return boosts
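
# Worked example (illustrative sketch): "homology" matches one keyword, so
# base = 0.4; with α ≈ 0.63 the kernel is ≈ 0.53 and the final weight lands
# near 0.23, whereas a non-mathematical word like "table" stays near 0.002.
def _example_topo_weight() -> None:
    print(f"topo_weight('homology') = {topo_weight('homology'):.3f}")
    print(f"topo_weight('table')    = {topo_weight('table'):.3f}")
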
class LengthDependentEmbedder:
    """
    REASONING (Sentences 23-30):
    The embedder maps each word/form to a vector whose dimension is determined by
    word length, so longer words occupy richer representational spaces. Embeddings
    are produced by hashing the token, repeating the digest bytes to reach the
    desired dimension, and normalizing by the sum (an L1-like norm). Forms are
    embedded via their full string representation (word_form_name_sentenceindex),
    so each form-sentence pairing has a unique embedding. A shift vector perturbs
    embeddings in controlled directions, representing semantic drift induced by
    context. The shift is applied to w2's embedding before computing dot products
    with candidate embeddings, modeling how context (w1) influences next-word
    predictions. "Double entendre" refers to the two passes: pass 1 uses the
    unshifted w2 embedding, pass 2 the shifted one, capturing both baseline and
    contextual similarity. The agreement score is the elementwise minimum of the
    two passes, so only candidates that score highly on both measures receive
    the maximum boost.
    """

    def embed(self, token: str, dim: Optional[int] = None) -> np.ndarray:
        """Embed a word/token into a vector."""
        d = dim if dim is not None else length_dim(token)
        raw_bytes = hashlib.sha256(token.encode("utf-8")).digest()
        repeated = (raw_bytes * ((d // 32) + 2))[:d]
        vec = np.array(list(repeated), dtype=np.float32)
        s = float(vec.sum())
        return vec / (s + 1e-8)

    def embed_form(self, form: SyntacticForm, dim: Optional[int] = None) -> np.ndarray:
        """Embed a syntactic form into a vector."""
        form_str = form.to_string()
        d = dim if dim is not None else length_dim(form.word)
        raw_bytes = hashlib.sha256(form_str.encode("utf-8")).digest()
        repeated = (raw_bytes * ((d // 32) + 2))[:d]
        vec = np.array(list(repeated), dtype=np.float32)
        s = float(vec.sum())
        return vec / (s + 1e-8)

    def shift_vector(self, token: str, dim: int, magnitude: float) -> np.ndarray:
        """Compute a unit-norm shift vector for semantic drift, scaled by magnitude."""
        raw_bytes = hashlib.md5(token.encode("utf-8")).digest()
        repeated = (raw_bytes * ((dim // 16) + 2))[:dim]
        vec = np.array(list(repeated), dtype=np.float32)
        norm = np.linalg.norm(vec)
        return (vec / (norm + 1e-8)) * magnitude

    @staticmethod
    def _norm01(arr: np.ndarray) -> np.ndarray:
        """Normalize an array to [0, 1]."""
        mn = float(arr.min())
        mx = float(arr.max())
        return (arr - mn) / (mx - mn + 1e-12)

    def length_dependent_weights(
        self,
        w1: str,
        w2: str,
        candidates: List[str],
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        REASONING (Sentences 27-29):
        Compute length-dependent weights for candidates. Pass 1: baseline
        similarity (unshifted w2). Pass 2: shifted similarity (w2 plus context
        drift from w1). Combined: the mean of both passes plus an agreement term
        min(pass1, pass2) scaled by the agreement bonus and topology kernel.
        """
        N = len(candidates)
        pass1_raw = np.zeros(N, dtype=np.float32)
        pass2_raw = np.zeros(N, dtype=np.float32)
        topo_kernels = np.zeros(N, dtype=np.float32)

        anchor_shift_mag = length_shift_mag(w2)
        anchor_agree_bonus = length_agreement_bonus(w2)

        for i, c in enumerate(candidates):
            dim = length_dim(c)
            e_w2 = self.embed(w2, dim=dim)
            e_c = self.embed(c, dim=dim)
            shift = self.shift_vector(w1, dim=dim, magnitude=anchor_shift_mag)

            # Shift w2's embedding and re-normalize by its sum.
            e_w2_shifted = e_w2 + shift
            norm_s = float(e_w2_shifted.sum())
            e_w2_shifted = e_w2_shifted / (abs(norm_s) + 1e-8)

            pass1_raw[i] = float(np.dot(e_w2, e_c))
            pass2_raw[i] = float(np.dot(e_w2_shifted, e_c))
            topo_kernels[i] = length_topo_kernel(c)

        # Normalize both passes to [0, 1]; agreement is their elementwise minimum.
        p1 = self._norm01(pass1_raw)
        p2 = self._norm01(pass2_raw)
        de_score = np.minimum(p1, p2)

        # Blend the passes, then add the kernel-scaled agreement bonus.
        base_combined = 0.5 * (p1 + p2)
        agreement_part = float(anchor_agree_bonus) * de_score
        combined = base_combined + topo_kernels * agreement_part
        combined = self._norm01(combined)

        return p1, p2, combined

# Alias kept for the "double entendre" terminology used in the docstrings.
DoubleEntendreEmbedder = LengthDependentEmbedder
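
# Minimal usage sketch (illustrative): scoring three candidates for the context
# ("nature", "of"). p1/p2 are the unshifted and shifted passes; "combined"
# rewards candidates that rank highly on both.
def _example_double_entendre() -> None:
    emb = LengthDependentEmbedder()
    cands = ["understanding", "cat", "structure"]
    p1, p2, combined = emb.length_dependent_weights("nature", "of", cands)
    for c, a, b, comb in zip(cands, p1, p2, combined):
        print(f"{c:>14s}: pass1={a:.2f} pass2={b:.2f} combined={comb:.2f}")
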
class NGramLM:
    """
    REASONING (Sentences 31-35):
    The NGramLM ingests token sequences and builds unigram, bigram, and trigram
    frequency tables for probabilistic next-word prediction. Trigram probabilities
    use Laplace (add-k) smoothing to handle unseen trigrams gracefully. When no
    trigram starting with (w1, w2) is observed, the model backs off to candidates
    from bigrams starting with w2. If no such bigram exists either, the model
    falls back to the top 150 most frequent unigrams, ensuring diversity.
    Duplicate and cognitive-token candidates are filtered, and the final list is
    capped at 400 to balance diversity against computational cost.
    """

    def __init__(self, add_k: float = 1.5):
        self.add_k = float(add_k)
        self.uni: Dict[str, int] = {}
        self.bi: Dict[Tuple[str, str], int] = {}
        self.tri: Dict[Tuple[str, str, str], int] = {}
        self.vocab: List[str] = []
        self.total = 0

    def ingest(self, tokens: List[str]) -> None:
        """Ingest a token sequence to build the n-gram tables."""
        for t in tokens:
            self.uni[t] = self.uni.get(t, 0) + 1
            self.total += 1

        for i in range(len(tokens) - 1):
            k = (tokens[i], tokens[i + 1])
            self.bi[k] = self.bi.get(k, 0) + 1

        for i in range(len(tokens) - 2):
            k = (tokens[i], tokens[i + 1], tokens[i + 2])
            self.tri[k] = self.tri.get(k, 0) + 1

        self.vocab = list(self.uni.keys())

    def next_dist(self, w1: str, w2: str) -> Tuple[List[str], torch.Tensor]:
        """
        Compute the probability distribution over the next word given (w1, w2).
        Returns a (candidates, probabilities) tuple.
        """
        cands: List[str] = []

        # Trigram continuations of (w1, w2).
        for (a, b, c) in self.tri:
            if a == w1 and b == w2:
                cands.append(c)

        # Back off to bigram continuations of w2.
        if not cands:
            for (a, b) in self.bi:
                if a == w2:
                    cands.append(b)

        # Final fallback: most frequent unigrams.
        if not cands:
            cands = [w for w, _ in sorted(self.uni.items(), key=lambda x: -x[1])[:150]]

        # Deduplicate and drop cognitive tokens.
        seen, out = set(), []
        for w in cands:
            if w not in seen and w not in COGNITIVE_TOKENS:
                seen.add(w)
                out.append(w)

        cands = out[:400]

        # Laplace-smoothed probabilities.
        V = len(self.vocab) + 1
        k = self.add_k

        def prob(w3: str) -> float:
            c12 = self.bi.get((w1, w2), 0)
            c123 = self.tri.get((w1, w2, w3), 0)
            if c12 > 0:
                return (c123 + k) / (c12 + k * V)
            return (self.uni.get(w3, 0) + k) / (self.total + k * V)

        probs = torch.tensor([prob(w) for w in cands], dtype=torch.float32)
        probs = probs / (probs.sum() + 1e-12)

        return cands, probs
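
# Worked example (illustrative sketch): after ingesting "the cat sat. the cat
# ran.", the context ("the", "cat") has two trigram continuations. With add-k
# smoothing (k=1.5, V=6) each gets (1 + 1.5) / (2 + 1.5*6) before
# normalization, i.e. probability 0.5 apiece.
def _example_ngram_lm() -> None:
    lm = NGramLM(add_k=1.5)
    lm.ingest(tokenize("the cat sat. the cat ran."))
    cands, probs = lm.next_dist("the", "cat")
    print(dict(zip(cands, [round(float(p), 3) for p in probs])))
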
_TOKEN_RE = re.compile(r"\[[A-Z\-]+\]|[A-Za-z][A-Za-z0-9_'-]*|[.,;:!?()]")

def tokenize(text: str) -> List[str]:
    """
    REASONING (Sentences 36-37):
    Tokenization splits text into alphabetic words, punctuation, and cognitive
    tokens (like [PROBLEM], [SOLUTION]) using regex matching. Alphabetic tokens
    are lowercased for consistency, while punctuation and cognitive tokens are
    preserved as-is for structural integrity.
    """
    text = text.replace("\n", " ")
    tokens = _TOKEN_RE.findall(text)
    out: List[str] = []

    for t in tokens:
        if t in COGNITIVE_TOKENS:
            out.append(t)
        elif re.match(r"[A-Za-z]", t):
            out.append(t.lower())
        elif t in ".,;:!?()":
            out.append(t)

    return out

def detokenize(tokens: List[str]) -> str:
    """
    REASONING (Sentences 38-39):
    Detokenization reconstructs readable text by reattaching punctuation to
    preceding words and capitalizing sentence starts. Newlines are normalized
    to spaces during tokenization to ensure linear token flow across paragraphs.
    """
    out: List[str] = []

    for t in tokens:
        if t in COGNITIVE_TOKENS:
            continue
        elif t in ".,;:!?)":
            if out:
                out[-1] += t
        elif t == "(":
            out.append(t)
        else:
            if out and out[-1].endswith("("):
                out[-1] += t
            else:
                out.append(t)

    s = " ".join(out)
    s = re.sub(r"\(\s+", "(", s)
    s = re.sub(r"\s+\)", ")", s)
    s = re.sub(r"(^|[.!?]\s+)([a-z])", lambda m: m.group(1) + m.group(2).upper(), s)

    return s
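
# Round-trip sketch (illustrative): tokenize lowercases words and splits off
# punctuation; detokenize reattaches punctuation and recapitalizes sentence
# starts, so a simple two-sentence string survives the round trip.
def _example_tokenize_roundtrip() -> None:
    toks = tokenize("Consider the nature of understanding. It accumulates!")
    assert detokenize(toks) == "Consider the nature of understanding. It accumulates!"
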
@dataclass
class CorpusState:
    """
    REASONING (Sentences 44-46):
    CorpusState aggregates the language model, embedder, AoA dictionary, and form
    plan into a unified inference engine. Token boost scores reflect log-relative
    frequency: words appearing more often in the corpus receive higher boosts to
    encourage their reuse. The form plan is initialized by extracting alphabetic,
    non-stop words from the prompt and building 100 forms from them.
    """
    lm: NGramLM
    embedder: LengthDependentEmbedder
    aoa: Dict[str, float]
    sentence_form_plan: SentenceFormPlan = field(default_factory=SentenceFormPlan)
    token_boost: Dict[str, float] = field(default_factory=dict)
    corpus_freq: Dict[str, int] = field(default_factory=dict)
    corpus_total: int = 1

def build_state(text: str, aoa: Dict[str, float], prompt: str = "") -> CorpusState:
    """Build a CorpusState from corpus text and a prompt."""
    tokens = tokenize(text)
    lm = NGramLM(add_k=1.5)
    lm.ingest(tokens)

    embedder = LengthDependentEmbedder()
    total = max(1, sum(lm.uni.values()))

    # Log-relative-frequency token boosts for content words.
    token_boost: Dict[str, float] = {}
    for tok, freq in lm.uni.items():
        if len(tok) > 3 and tok not in STOP_WORDS and re.match(r"^[a-z]", tok):
            token_boost[tok] = min(0.5, math.log(1 + (freq / total) * 1000.0) * 0.1)

    # Extract alphabetic, non-stop prompt words for the form plan.
    prompt_tokens = tokenize(prompt)
    alpha_tokens = [
        t for t in prompt_tokens
        if re.match(r"^[a-z]", t) and t not in STOP_WORDS
    ]

    form_plan = SentenceFormPlan()
    form_plan.build_forms(alpha_tokens if alpha_tokens else ["word"])

    return CorpusState(
        lm=lm,
        embedder=embedder,
        aoa=aoa,
        sentence_form_plan=form_plan,
        token_boost=token_boost,
        corpus_freq=lm.uni,
        corpus_total=total,
    )
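
# Quick check (illustrative sketch): a word making up 1% of the corpus gets
# min(0.5, log(1 + 0.01 * 1000) * 0.1) = min(0.5, log(11) * 0.1) ≈ 0.24,
# so even very frequent words are capped at a 0.5 boost.
def _example_token_boost() -> None:
    print(f"{min(0.5, math.log(1 + 0.01 * 1000.0) * 0.1):.2f}")
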
def next_probs(
    state: CorpusState,
    w1: str,
    w2: str,
    sentence_index: int,
    temp: float = 1.2,
    de_strength: float = 0.18,
) -> Tuple[List[str], torch.Tensor]:
    """
    REASONING (Sentences 47-50):
    next_probs() combines multiple scoring signals to compute next-word
    probabilities: base n-gram, double-entendre, centroid, form, token, age, and
    topological boosts. The form boost is the semantic similarity between the
    current sentence's assigned form word and each candidate, weighted at 25%.
    The topological centroid boost is amplified by the topological kernel
    (0.05-0.95 range), making topology effects stronger for longer words. The
    double-entendre weight (0.18) is typically lower than the combined
    base + centroid + form boosts, so linguistic coherence takes precedence over
    semantic drift.
    """
    cands, base_probs = state.lm.next_dist(w1, w2)
    _, _, de_combined = state.embedder.length_dependent_weights(
        w1=w1,
        w2=w2,
        candidates=cands,
    )

    de_t = torch.tensor(de_combined, dtype=torch.float32)

    # Form boost: similarity to this sentence's assigned form word.
    form_boost = torch.zeros_like(de_t)
    current_form = state.sentence_form_plan.form_by_sentence.get(sentence_index)
    if current_form:
        for idx, c in enumerate(cands):
            sim = semantic_similarity(current_form.word, c)
            form_boost[idx] = 0.25 * sim

    # Centroid boost (topology + age continuity).
    cb = centroid_boost(
        state.aoa,
        w2,
        cands,
        strength=0.10,
        corpus_freq=state.corpus_freq,
        corpus_total=state.corpus_total,
    )
    cb_t = torch.tensor(cb, dtype=torch.float32)

    # Corpus-frequency token boost.
    tb = torch.tensor(
        [state.token_boost.get(c, 0.0) for c in cands],
        dtype=torch.float32,
    )

    # Age-continuity boost relative to w2.
    w2_age = word_age(state.aoa, w2, state.corpus_freq, state.corpus_total)
    age_arr = np.array(
        [
            age_continuity_boost(
                w2_age,
                word_age(state.aoa, c, state.corpus_freq, state.corpus_total),
            )
            for c in cands
        ],
        dtype=np.float32,
    )
    age_t = torch.tensor(age_arr, dtype=torch.float32)

    # Length-dependent topological kernels amplify the centroid boost.
    topo_kernels = torch.tensor(
        [length_topo_kernel(c) for c in cands],
        dtype=torch.float32,
    )

    topo_cb = cb_t * (0.5 + 0.5 * topo_kernels)

    # Combine all boosts and apply a temperature-scaled softmax over log-probs.
    boosts = (
        float(de_strength) * de_t
        + topo_cb
        + 0.10 * tb
        + 0.15 * age_t
        + form_boost
    )

    logits = torch.log(base_probs.clamp_min(1e-12)) + boosts
    logits = logits / max(float(temp), 1e-6)
    probs = F.softmax(logits, dim=-1)

    return cands, probs

def generate_100_sentences(
    state: CorpusState,
    prompt: str,
    seed: int = 42,
    num_sentences: int = 100,
    tokens_per_sentence: int = 15,
    temp: float = 1.2,
) -> str:
    """
    REASONING (Sentences 51-55):
    generate_100_sentences() loops once per sentence (100 in the canonical
    configuration), tracking form influence and accumulating results. For each
    sentence, tokens are generated up to tokens_per_sentence or until
    sentence-ending punctuation is produced. Influenced words (those semantically
    similar to the form's word) are tracked and recorded to the form plan for
    activation spreading. The initial context (w1, w2) is seeded from the
    prompt's last two alphabetic words, providing topical grounding. Each fully
    detokenized sentence is stored in sentence_outputs and recorded in the form
    plan with form activation strength 1.0.
    """
    rng = np.random.default_rng(int(seed))
    seed_toks = tokenize(prompt)
    sw = [t for t in seed_toks if re.match(r"^[a-z]", t)]
    w1 = sw[-2] if len(sw) >= 2 else (sw[0] if sw else "the")
    w2 = sw[-1] if sw else "concept"

    result_sentences: List[str] = []

    for sent_idx in range(int(num_sentences)):
        sentence_tokens: List[str] = []
        influenced_words: Set[str] = set()
        current_form = state.sentence_form_plan.form_by_sentence.get(sent_idx)

        for _ in range(int(tokens_per_sentence)):
            cands, probs = next_probs(
                state,
                w1,
                w2,
                sentence_index=sent_idx,
                temp=float(temp),
            )

            p = probs.detach().cpu().numpy()
            p = p / (p.sum() + 1e-12)
            tok = cands[int(rng.choice(len(cands), p=p))]
            sentence_tokens.append(tok)

            # Track words influenced by this sentence's form.
            if current_form and semantic_similarity(current_form.word, tok) > 0.4:
                influenced_words.add(tok)

            w1, w2 = w2, tok

            # Stop early at sentence-ending punctuation.
            if tok in ".!?":
                break

        # Ensure the sentence ends with punctuation.
        if sentence_tokens and sentence_tokens[-1] not in ".!?":
            sentence_tokens.append(".")

        sentence_text = detokenize(sentence_tokens).strip()
        result_sentences.append(sentence_text)

        # Record the sentence and its form activation.
        if current_form:
            state.sentence_form_plan.record_sentence_generation(
                sent_idx,
                sentence_text,
                form_activation=1.0,
                influenced_words=list(influenced_words),
            )

    return "\n".join(result_sentences)

def load_corpus(
    use_hf: bool,
    hf_dataset: str,
    hf_split: str,
    hf_max_rows: int,
    text_file,
) -> str:
    """
    REASONING (Sentences 56-59):
    load_corpus() supports both Hugging Face datasets (via the datasets library)
    and local text files (.txt, .md). For Hugging Face datasets, rows are limited
    by hf_max_rows to avoid memory overflow on large corpora. Dataset columns are
    detected flexibly: if a "text" column exists, use it; otherwise use the first
    column. Text files are read as UTF-8 with errors replaced, handling non-ASCII
    characters gracefully.
    """
    if use_hf:
        ds = load_dataset(hf_dataset, split=hf_split)
        rows = min(int(hf_max_rows) if int(hf_max_rows) > 0 else len(ds), len(ds))
        col = "text" if "text" in ds.column_names else ds.column_names[0]
        return "\n".join(str(x) for x in ds.select(range(rows))[col])

    if text_file is None:
        raise ValueError("No file provided.")

    path = text_file if isinstance(text_file, str) else (
        text_file.name if hasattr(text_file, "name") else str(text_file.get("path", ""))
    )

    return Path(path).read_text(encoding="utf-8", errors="replace")

def run_session(
    use_hf,
    hf_dataset,
    hf_split,
    hf_max_rows,
    text_file,
    prompt,
    seed,
    num_sentences,
    tokens_per_sentence,
    temp,
    progress=gr.Progress(),
):
    """
    REASONING (Sentences 60-62):
    run_session() orchestrates the entire pipeline: load AoA, load corpus, build
    state, generate sentences, analyze forms. Progress callbacks provide real-time
    feedback to the Gradio UI, indicating which stage of generation is active.
    """
    try:
        progress(0.05, desc="Loading AoA dataset (Kuperman 2012)…")
        aoa = load_aoa_dataset()

        progress(0.15, desc="Loading corpus…")
        text = load_corpus(bool(use_hf), str(hf_dataset), str(hf_split), int(hf_max_rows), text_file)

        progress(0.30, desc="Building language model and form plan…")
        state = build_state(text, aoa, prompt=str(prompt))

        progress(0.50, desc="Generating sentences (one form per sentence)…")
        sentences = generate_100_sentences(
            state,
            str(prompt),
            seed=int(seed),
            num_sentences=int(num_sentences),
            tokens_per_sentence=int(tokens_per_sentence),
            temp=float(temp),
        )

        progress(0.80, desc="Analyzing form activation…")
        form_report = state.sentence_form_plan.generate_report()

        return sentences, form_report

    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error: {e}", ""

def toggle_hf(val):
    """Toggle between Hugging Face dataset and local file modes."""
    return (
        gr.update(visible=val),
        gr.update(visible=val),
        gr.update(visible=val),
        gr.update(visible=not val),
    )

def build_app():
    """
    REASONING (Sentences 63-65):
    The Gradio interface uses a two-column layout: left for controls, right for
    outputs, with responsive scaling (scale=1 vs. scale=2). Input parameters
    (seed, temperature, tokens_per_sentence, num_sentences) are exposed as Gradio
    sliders and numeric inputs for user control. The prompt defaults to
    "Consider the nature of understanding"; its extracted words become the base
    for the 100 generated forms.
    """
    with gr.Blocks(
        title="NeuroSymbolic V8.6+ - 100 Sentences, One Form Each",
        theme=gr.themes.Soft(),
    ) as demo:
        gr.Markdown("# NeuroSymbolic V8.6+ - 100 Sentences, One Form Per Sentence")

        with gr.Row():
            with gr.Column(scale=1):
                use_hf = gr.Checkbox(label="Use Hugging Face Dataset", value=True)
                hf_dataset = gr.Textbox(
                    label="HF Dataset",
                    value="AiresPucrs/stanford-encyclopedia-philosophy",
                )
                hf_split = gr.Textbox(label="Split", value="train")
                hf_max_rows = gr.Slider(0, 2000, value=300, step=100, label="Max rows")
                text_file = gr.File(
                    label="Upload .txt/.md",
                    file_types=[".txt", ".md"],
                    visible=False,
                )
                use_hf.change(toggle_hf, [use_hf], [hf_dataset, hf_split, hf_max_rows, text_file])

                seed = gr.Number(value=42, label="Seed")
                num_sentences = gr.Slider(
                    1, 500, value=100, step=10, label="Number of Sentences"
                )
                tokens_per_sentence = gr.Slider(
                    8, 180, value=15, step=2, label="Tokens per Sentence"
                )
                temp = gr.Slider(0.8, 2.5, value=1.2, step=0.1, label="Temperature")

            with gr.Column(scale=2):
                prompt = gr.Textbox(
                    label="Prompt (extracts words for 100 forms)",
                    value="Consider the nature of understanding",
                    lines=2,
                )
                btn = gr.Button("Generate Sentences", variant="primary", size="lg")

                gr.Markdown("## Generated Sentences (One Form Per Sentence)")
                output_sentences = gr.Textbox(label="Sentences", lines=40)

                gr.Markdown("## Form Activation Analysis")
                output_report = gr.Textbox(label="Form Report", lines=40)

        btn.click(
            run_session,
            inputs=[
                use_hf, hf_dataset, hf_split, hf_max_rows, text_file,
                prompt, seed, num_sentences, tokens_per_sentence, temp,
            ],
            outputs=[output_sentences, output_report],
        )

        gr.Markdown(
            "### Key Features\n"
            "- **100 Sentences, One Form Each:** Ensures full syntactic coverage\n"
            "- **Form Count:** Exactly 100 forms, one per sentence\n"
            "- **Form Boost:** Semantic similarity to the form's word (25% weight)\n"
            "- **Activation Tracking:** Cumulative value + influence map\n"
            "- **Length-Dependent Topology:** Words get 2-12 dimensional embeddings\n"
            "- **Double-Entendre Embedder:** Two-pass similarity for robustness\n"
            "- **Age-of-Acquisition (AoA):** Kuperman 2012 dataset + regression model\n"
            "- **Multi-Signal Boosting:** 7+ scoring dimensions for coherence"
        )

    return demo

if __name__ == "__main__":
    app = build_app()
    app.queue().launch(share=False)