omarkamali committed on Jan 7

Commit

1a2460a

verified ·

1 Parent(s): fa2d6b9

Upload all models and assets for bg (latest)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
README.md +343 -146
models/embeddings/aligned/bg_128d.bin +3 -0
models/embeddings/aligned/bg_128d.meta.json +1 -0
models/embeddings/aligned/bg_128d.projection.npy +3 -0
models/embeddings/aligned/bg_128d_metadata.json +8 -0
models/embeddings/aligned/bg_32d.bin +3 -0
models/embeddings/aligned/bg_32d.meta.json +1 -0
models/embeddings/aligned/bg_32d.projection.npy +3 -0
models/embeddings/aligned/bg_32d_metadata.json +8 -0
models/embeddings/aligned/bg_64d.bin +3 -0
models/embeddings/aligned/bg_64d.meta.json +1 -0
models/embeddings/aligned/bg_64d.projection.npy +3 -0
models/embeddings/aligned/bg_64d_metadata.json +8 -0
models/embeddings/monolingual/bg_128d.bin +2 -2
models/embeddings/monolingual/bg_128d_metadata.json +5 -3
models/embeddings/monolingual/bg_32d.bin +2 -2
models/embeddings/monolingual/bg_32d_metadata.json +5 -3
models/embeddings/monolingual/bg_64d.bin +2 -2
models/embeddings/monolingual/bg_64d_metadata.json +5 -3
models/subword_markov/bg_markov_ctx1_subword.parquet +2 -2
models/subword_markov/bg_markov_ctx1_subword_metadata.json +2 -2
models/subword_markov/bg_markov_ctx2_subword.parquet +2 -2
models/subword_markov/bg_markov_ctx2_subword_metadata.json +2 -2
models/subword_markov/bg_markov_ctx3_subword.parquet +2 -2
models/subword_markov/bg_markov_ctx3_subword_metadata.json +2 -2
models/subword_markov/bg_markov_ctx4_subword.parquet +2 -2
models/subword_markov/bg_markov_ctx4_subword_metadata.json +2 -2
models/subword_ngram/bg_2gram_subword.parquet +2 -2
models/subword_ngram/bg_2gram_subword_metadata.json +2 -2
models/subword_ngram/bg_3gram_subword.parquet +2 -2
models/subword_ngram/bg_3gram_subword_metadata.json +2 -2
models/subword_ngram/bg_4gram_subword.parquet +2 -2
models/subword_ngram/bg_4gram_subword_metadata.json +2 -2
models/subword_ngram/bg_5gram_subword.parquet +3 -0
models/subword_ngram/bg_5gram_subword_metadata.json +7 -0
models/tokenizer/bg_tokenizer_16k.model +2 -2
models/tokenizer/bg_tokenizer_16k.vocab +0 -0
models/tokenizer/bg_tokenizer_32k.model +2 -2
models/tokenizer/bg_tokenizer_32k.vocab +0 -0
models/tokenizer/bg_tokenizer_64k.model +2 -2
models/tokenizer/bg_tokenizer_64k.vocab +0 -0
models/tokenizer/bg_tokenizer_8k.model +2 -2
models/tokenizer/bg_tokenizer_8k.vocab +0 -0
models/vocabulary/bg_vocabulary.parquet +2 -2
models/vocabulary/bg_vocabulary_metadata.json +10 -9
models/word_markov/bg_markov_ctx1_word.parquet +2 -2
models/word_markov/bg_markov_ctx1_word_metadata.json +2 -2
models/word_markov/bg_markov_ctx2_word.parquet +2 -2
models/word_markov/bg_markov_ctx2_word_metadata.json +2 -2

.gitattributes CHANGED Viewed

@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text

 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -10,11 +10,21 @@ tags:
   - n-gram
   - markov
   - wikipedia
   - monolingual
   - family-slavic_south
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,14 +33,14 @@ dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
-    value: 3.805
   - name: best_isotropy
     type: isotropy
-    value: 0.7912
   - name: vocabulary_size
     type: vocab
-    value: 960471
-generated: 2025-12-28
 ---
 # Bulgarian - Wikilangs Models
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ### Models & Assets
 - Tokenizers (8k, 16k, 32k, 64k)
-- N-gram models (2, 3, 4-gram)
-- Markov chains (context of 1, 2, 3 and 4)
 - Subword N-gram and Markov chains
-- Embeddings in various sizes and dimensions
 - Language Vocabulary
 - Language Statistics
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Summary & Recommendations](#6-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -68,66 +80,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
 ### Results
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.140x | 3.10 | 0.0447% | 3,031,858 |
-| **16k** | 3.405x | 3.36 | 0.0485% | 2,795,587 |
-| **32k** | 3.631x | 3.59 | 0.0517% | 2,621,828 |
-| **64k** | 3.805x 🏆 | 3.76 | 0.0542% | 2,501,712 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
-**Sample 1:** `Събития
- Родени
- Починали
- 28 юни – Андрей I, велик княз на Владимир-Суздал`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁събития ▁родени ▁починали ▁ 2 8 ▁юни ▁– ▁андрей ▁i ... (+9 more)` | 19 |
-| 16k | `▁събития ▁родени ▁починали ▁ 2 8 ▁юни ▁– ▁андрей ▁i ... (+9 more)` | 19 |
-| 32k | `▁събития ▁родени ▁починали ▁ 2 8 ▁юни ▁– ▁андрей ▁i ... (+9 more)` | 19 |
-| 64k | `▁събития ▁родени ▁починали ▁ 2 8 ▁юни ▁– ▁андрей ▁i ... (+8 more)` | 18 |
-**Sample 2:** `Събития
- Родени
- Починали`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁събития ▁родени ▁починали` | 3 |
-| 16k | `▁събития ▁родени ▁починали` | 3 |
-| 32k | `▁събития ▁родени ▁починали` | 3 |
-| 64k | `▁събития ▁родени ▁починали` | 3 |
-**Sample 3:** `Хайд може да се отнася за:
- Градове
- Хайд, град в Англия
- Окръзи в САЩ
- Хайд (...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁ха йд ▁може ▁да ▁се ▁отнася ▁за : ▁градове ▁ха ... (+28 more)` | 38 |
-| 16k | `▁ха йд ▁може ▁да ▁се ▁отнася ▁за : ▁градове ▁ха ... (+25 more)` | 35 |
-| 32k | `▁хайд ▁може ▁да ▁се ▁отнася ▁за : ▁градове ▁хайд , ... (+20 more)` | 30 |
-| 64k | `▁хайд ▁може ▁да ▁се ▁отнася ▁за : ▁градове ▁хайд , ... (+20 more)` | 30 |
 ### Key Findings
-- **Best Compression:** 64k achieves 3.805x compression
-- **Lowest UNK Rate:** 8k with 0.0447% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -136,57 +139,111 @@ Below are sample sentences tokenized with each vocabulary size:
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 ### Results
-| N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
-|--------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | 171,622 🏆 | 17.39 | 2,295,348 | 9.8% | 21.1% |
-| **2-gram** | 445 🏆 | 8.80 | 25,460 | 58.1% | 96.2% |
-| **3-gram** | 975,598 | 19.90 | 5,989,128 | 3.6% | 10.5% |
-| **3-gram** | 4,162 | 12.02 | 263,503 | 21.9% | 59.8% |
-| **4-gram** | 3,001,891 | 21.52 | 11,403,312 | 1.9% | 5.9% |
-| **4-gram** | 25,670 | 14.65 | 1,642,365 | 10.2% | 31.2% |
 ### Top 5 N-grams by Size
-**2-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `г .` | 1,208,709 |
-| 2 | `категория :` | 853,964 |
-| 3 | `) ,` | 479,143 |
-| 4 | `) .` | 331,723 |
-| 5 | `. в` | 330,654 |
-**3-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `г . ,` | 116,060 |
-| 2 | `г . )` | 88,901 |
-| 3 | `( ) е` | 84,347 |
-| 4 | `източници категория :` | 82,359 |
-| 5 | `г . в` | 77,398 |
-**4-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `. източници категория :` | 51,218 |
-| 2 | `категория : родени в` | 43,839 |
-| 3 | `. н . е` | 40,096 |
-| 4 | `н . е .` | 40,004 |
-| 5 | `пр . н .` | 39,838 |
 ### Key Findings
-- **Best Perplexity:** 2-gram with 445
 - **Entropy Trend:** Increases with larger n-grams (see the Entropy column in the results table)
-- **Coverage:** Top-1000 patterns cover ~31% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
@@ -194,55 +251,86 @@ Below are sample sentences tokenized with each vocabulary size:
 ![Markov Entropy](visualizations/markov_entropy.png)
 ![Markov Branching](visualizations/markov_branching.png)
 ### Results
-| Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
-|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | 0.7433 | 1.674 | 8.73 | 2,274,140 | 25.7% |
-| **1** | 1.3957 | 2.631 | 10.79 | 7,585 | 0.0% |
-| **2** | 0.4514 | 1.367 | 2.92 | 19,851,824 | 54.9% |
-| **2** | 0.8817 | 1.843 | 6.63 | 81,858 | 11.8% |
-| **3** | 0.2135 | 1.159 | 1.55 | 58,038,366 | 78.7% |
-| **3** | 0.9116 | 1.881 | 5.23 | 542,599 | 8.8% |
-| **4** | 0.1027 🏆 | 1.074 | 1.21 | 90,112,776 | 89.7% |
-| **4** | 0.7326 🏆 | 1.662 | 3.61 | 2,836,397 | 26.7% |
-### Generated Text Samples
-Below are text samples generated from each Markov chain model:
 **Context Size 1:**
-1. `. през 1954 , и образование . “ , и на концертните може да отпият от`
-2. `, pomacentrus taeniometopon и определя характера , oktober . за купата на виенския университет на ол...`
-3. `на ветроходен спорт . структурата на непорочното зачатие на място по икономика “ . и цигулка`
 **Context Size 2:**
-1. `г . беше върната на сикст iv обявява конрад за маркиз акиле патерно , който по това`
-2. `категория : починали в тирана . след нашествието на унгарските интереси . правителството обявява наг...`
-3. `) , гръцки андартски деец , полковник станчов е български футболист 30 септември 1944 ) ташев ,`
 **Context Size 3:**
-1. `г . , ∞ 1309 за португалския крал афонсу v . той е и рекордьор за мъже в`
-2. `г . ) 1923 г . се завръща в пазарджик и председател на управителния съвет на rheinmetall са`
-3. `( ) е английски професионален футболист , който играе като вратар и се състезава в долните дивизии н...`
 **Context Size 4:**
-1. `. източници категория : литературни термини категория : научна фантастика категория : английски писа...`
-2. `категория : родени в софия категория : починали от рак категория : родени през 1710 година категория...`
-3. `. н . е . релефът изобразява човек на колесница с четири колела с четири коня и запаси за`
 ### Key Findings
-- **Best Predictability:** Context-4 with 89.7% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (2,836,397 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
@@ -258,38 +346,38 @@ Below are text samples generated from each Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 960,471 |
-| Total Tokens | 113,282,257 |
-| Mean Frequency | 117.94 |
 | Median Frequency | 4 |
-| Frequency Std Dev | 9016.46 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | на | 6,000,585 |
-| 2 | в | 3,189,920 |
-| 3 | и | 3,170,304 |
-| 4 | е | 2,177,841 |
-| 5 | от | 2,156,538 |
-| 6 | за | 1,349,594 |
-| 7 | се | 1,262,248 |
-| 8 | г | 1,219,255 |
-| 9 | с | 1,091,067 |
-| 10 | категория | 861,853 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | мъндън | 2 |
-| 2 | талиевия | 2 |
-| 3 | carbonato | 2 |
-| 4 | tallio | 2 |
-| 5 | tlhco3 | 2 |
-| 6 | разр | 2 |
-| 7 | mичман | 2 |
 | 8 | барутхана | 2 |
 | 9 | азадлу | 2 |
 | 10 | шталаг | 2 |
@@ -298,24 +386,24 @@ Below are text samples generated from each Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 0.9535 |
-| R² (Goodness of Fit) | 0.996716 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 33.9% |
-| Top 1,000 | 53.3% |
-| Top 5,000 | 70.0% |
-| Top 10,000 | 77.0% |
 ### Key Findings
-- **Zipf Compliance:** R²=0.9967 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 33.9% of corpus
-- **Long Tail:** 950,471 words needed for remaining 23.0% coverage
 ---
 ## 5. Word Embeddings Evaluation
@@ -328,24 +416,130 @@ Below are text samples generated from each Markov chain model:
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
-### Model Comparison
-| Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
-|-------|------------|-----------|----------|----------|----------|
-| **mono_32d** | 784,943 | 32 | 3.285 | 0.948 | 0.7912 🏆 |
-| **mono_64d** | 784,943 | 64 | 3.715 | 0.928 | 0.7726 |
-| **mono_128d** | 784,943 | 128 | 4.153 | 0.959 | 0.7213 |
-| **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 ### Key Findings
-- **Best Isotropy:** mono_32d with 0.7912 (more uniform distribution)
-- **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
-- **Vocabulary Coverage:** All models cover 784,943 words
-- **Recommendation:** 100d for balanced semantic capture and efficiency
 ---
-## 6. Summary & Recommendations
 ![Performance Dashboard](visualizations/performance_dashboard.png)
@@ -353,11 +547,12 @@ Below are text samples generated from each Markov chain model:
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **32k BPE** | Best compression (3.81x) with low UNK rate |
-| N-gram | **5-gram** | Lowest perplexity (445) |
-| Markov | **Context-4** | Highest predictability (89.7%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
@@ -547,7 +742,8 @@ If you use these models in your research, please cite:
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
-  publisher = {HuggingFace},
   url = {https://huggingface.co/wikilangs},
   institution = {Omneity Labs}
 }
@@ -563,7 +759,8 @@ MIT License - Free for academic and commercial use.
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 ---
 *Generated by Wikilangs Models Pipeline*
-*Report Date: 2025-12-28 05:10:25*

   - n-gram
   - markov
   - wikipedia
+  - feature-extraction
+  - sentence-similarity
+  - tokenization
+  - n-grams
+  - markov-chain
+  - text-mining
+  - fasttext
+  - babelvec
+  - vocabulous
+  - vocabulary
   - monolingual
   - family-slavic_south
 license: mit
 library_name: wikilangs
+pipeline_tag: text-generation
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
+    value: 4.373
   - name: best_isotropy
     type: isotropy
+    value: 0.7975
   - name: vocabulary_size
     type: vocab
+    value: 888624
+generated: 2026-01-07
 ---
 # Bulgarian - Wikilangs Models
 ### Models & Assets
 - Tokenizers (8k, 16k, 32k, 64k)
+- N-gram models (2, 3, 4, 5-gram)
+- Markov chains (context of 1, 2, 3, 4 and 5)
 - Subword N-gram and Markov chains
+- Embeddings in various sizes and dimensions (aligned and unaligned)
 - Language Vocabulary
 - Language Statistics
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 ### Analysis and Evaluation
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
+- [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
+![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
+![Tokenizer OOV](visualizations/tokenizer_oov.png)
+![Total Tokens](visualizations/tokenizer_total_tokens.png)
 ### Results
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
+| **8k** | 3.452x | 3.45 | 0.0493% | 2,552,470 |
+| **16k** | 3.809x | 3.81 | 0.0544% | 2,313,214 |
+| **32k** | 4.120x | 4.12 | 0.0589% | 2,138,945 |
+| **64k** | 4.373x 🏆 | 4.37 | 0.0625% | 2,015,292 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
+**Sample 1:** `Часово отместване UTC-11 се използва в: : Американска Самоа, Атол Мидуей : Ниуе ...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁ча сово ▁от мест ване ▁utc - 1 1 ▁се ... (+17 more)` | 27 |
+| 16k | `▁ча сово ▁от мест ване ▁utc - 1 1 ▁се ... (+15 more)` | 25 |
+| 32k | `▁ча сово ▁от местване ▁utc - 1 1 ▁се ▁използва ... (+13 more)` | 23 |
+| 64k | `▁часово ▁отместване ▁utc - 1 1 ▁се ▁използва ▁в : ... (+9 more)` | 19 |
+**Sample 2:** `Synodontis ouemeensis е вид лъчеперка от семейство Mochokidae. Разпространение В...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁s yn od ont is ▁o u em e ensis ... (+22 more)` | 32 |
+| 16k | `▁syn odont is ▁o u em e ensis ▁е ▁вид ... (+20 more)` | 30 |
+| 32k | `▁syn odont is ▁ou em e ensis ▁е ▁вид ▁лъчеперка ... (+19 more)` | 29 |
+| 64k | `▁synodontis ▁ou eme ensis ▁е ▁вид ▁лъчеперка ▁от ▁семейство ▁mochokidae ... (+13 more)` | 23 |
+**Sample 3:** `Orthotomus derbianus е вид птица от семейство Cisticolidae. Разпространение Видъ...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁or th ot om us ▁der b ian us ▁е ... (+22 more)` | 32 |
+| 16k | `▁or th ot omus ▁der b ianus ▁е ▁вид ▁птица ... (+17 more)` | 27 |
+| 32k | `▁orth ot omus ▁der b ianus ▁е ▁вид ▁птица ▁от ... (+14 more)` | 24 |
+| 64k | `▁orth ot omus ▁der b ianus ▁е ▁вид ▁птица ▁от ... (+13 more)` | 23 |
 ### Key Findings
+- **Best Compression:** 64k achieves 4.373x compression
+- **Lowest UNK Rate:** 8k with 0.0493% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
+![N-gram Unique](visualizations/ngram_unique.png)
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 ### Results
+| N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
+|--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 246,747 | 17.91 | 2,004,902 | 5.8% | 16.2% |
+| **2-gram** | Subword | 385 🏆 | 8.59 | 20,810 | 61.1% | 97.4% |
+| **3-gram** | Word | 1,033,483 | 19.98 | 4,251,847 | 2.5% | 8.2% |
+| **3-gram** | Subword | 3,528 | 11.78 | 189,319 | 23.2% | 62.6% |
+| **4-gram** | Word | 2,692,464 | 21.36 | 7,308,829 | 1.5% | 5.1% |
+| **4-gram** | Subword | 21,676 | 14.40 | 1,191,303 | 10.4% | 32.6% |
+| **5-gram** | Word | 2,278,792 | 21.12 | 5,264,454 | 1.8% | 5.4% |
+| **5-gram** | Subword | 93,842 | 16.52 | 4,256,227 | 5.4% | 19.0% |
 ### Top 5 N-grams by Size
+**2-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `през г` | 371,674 |
+| 2 | `да се` | 178,835 |
+| 3 | `през година` | 109,499 |
+| 4 | `външни препратки` | 108,119 |
+| 5 | `е на` | 90,144 |
+**3-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `по време на` | 72,585 |
+| 2 | `източници външни препратки` | 52,888 |
+| 3 | `пр н е` | 38,682 |
+| 4 | `може да се` | 32,598 |
+| 5 | `през г е` | 28,945 |
+**4-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `разпространение видът е разпространен` | 11,928 |
+| 2 | `видът е разпространен в` | 11,811 |
+| 3 | `може да се отнася` | 9,394 |
+| 4 | `външни препратки официален сайт` | 9,248 |
+| 5 | `застрашен от изчезване разпространение` | 9,061 |
+**5-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `разпространение видът е разпространен в` | 11,030 |
+| 2 | `може да се отнася за` | 8,323 |
+| 3 | `е вид птица от семейство` | 8,165 |
+| 4 | `източници външни препратки уебсайт на` | 7,757 |
+| 5 | `външни препратки уебсайт на общината` | 7,230 |
+**2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `а _` | 22,221,689 |
+| 2 | `н а` | 13,044,169 |
+| 3 | `и _` | 12,174,707 |
+| 4 | `_ с` | 10,248,868 |
+| 5 | `_ н` | 9,602,446 |
+**3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `н а _` | 8,421,175 |
+| 2 | `_ н а` | 7,714,836 |
+| 3 | `_ п р` | 3,824,613 |
+| 4 | `т а _` | 3,691,871 |
+| 5 | `т о _` | 3,556,816 |
+**4-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ н а _` | 5,969,377 |
+| 2 | `а т а _` | 2,454,178 |
+| 3 | `_ о т _` | 2,129,103 |
+| 4 | `а _ н а` | 1,914,071 |
+| 5 | `_ п р е` | 1,889,917 |
+**5-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `а _ н а _` | 1,515,525 |
+| 2 | `е _ н а _` | 949,109 |
+| 3 | `_ п р е з` | 882,206 |
+| 4 | `п р е з _` | 849,611 |
+| 5 | `о _ н а _` | 755,344 |
 ### Key Findings
+- **Best Perplexity:** 2-gram (subword) with 385
 - **Entropy Trend:** Generally increases with larger n-grams (see the Entropy column in the results table)
+- **Coverage:** Top-1000 patterns cover ~19% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
 ![Markov Entropy](visualizations/markov_entropy.png)
+![Markov Contexts](visualizations/markov_contexts.png)
 ![Markov Branching](visualizations/markov_branching.png)
 ### Results
+| Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
+|---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.9743 | 1.965 | 12.25 | 1,896,771 | 2.6% |
+| **1** | Subword | 1.0920 | 2.132 | 7.98 | 9,126 | 0.0% |
+| **2** | Word | 0.3814 | 1.303 | 2.47 | 23,216,480 | 61.9% |
+| **2** | Subword | 0.7778 | 1.714 | 5.53 | 72,830 | 22.2% |
+| **3** | Word | 0.1657 | 1.122 | 1.39 | 57,272,367 | 83.4% |
+| **3** | Subword | 0.8207 | 1.766 | 4.91 | 403,072 | 17.9% |
+| **4** | Word | 0.0723 🏆 | 1.051 | 1.13 | 79,394,777 | 92.8% |
+| **4** | Subword | 0.7498 | 1.682 | 3.81 | 1,979,446 | 25.0% |
+### Generated Text Samples (Word-based)
+Below are text samples generated from each word-based Markov chain model:
 **Context Size 1:**
+1. `на излезли преди тази система от общинския център е най доброто от контекстовото запитване за написв...`
+2. `в миналото корабите от своето поведение и актриси актьори рок група в колекциониране на военноморска...`
+3. `и денчевци и е посрещала годеницата на черноморец бургас община палеор φούφας антиполохагос атина за...`
 **Context Size 2:**
+1. `през г тъй като години българия медал за на барила през г в битката е част от`
+2. `да се шуми около връзката ѝ с република българия собствеността на международна научна конференция га...`
+3. `външни препратки официален сайт схема на телескопа е било напълно елиминирано съмнението на ръководс...`
 **Context Size 3:**
+1. `по време на празничния сезон и стачката в метрото в токио vx не се използва от национално музикално`
+2. `източници външни препратки официален сайт на метеор първите ѝ постановки са дипломният ѝ спектакъл с...`
+3. `пр н е и са изключително популярни на балканите и втората най обща сред мъжете по онова време`
 **Context Size 4:**
+1. `разпространение видът е разпространен в малави мозамбик и j placidochromis johnstoni in iucn iucn re...`
+2. `видът е разпространен в демократична република t lamprologus lethops in iucn iucn red list of threat...`
+3. `може да се отнася до фердинандо i де медичи за да приюти извънбрачните дъщери на алесандро за разлик...`
+### Generated Text Samples (Subword-based)
+Below are text samples generated from each subword-based Markov chain model:
+**Context Size 1:**
+1. `_трхтвътва_бъно_`
+2. `а_ma_верг._п_ц_м`
+3. `ита_менизандиясн`
+**Context Size 2:**
+1. `а_преват_и_с_ко_к`
+2. `на_сед_хеърши_ак:`
+3. `и_от_стори_те_съе`
+**Context Size 3:**
+1. `на_кампийский_став`
+2. `_на_от_вите_ръчепе`
+3. `_прически_баваща_с`
+**Context Size 4:**
+1. `_на_шаламброзиеолог`
+2. `ата_е_важна_космиче`
+3. `_от_попов_конвойна_`
 ### Key Findings
+- **Best Predictability:** Context-4 (word) with 92.8% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
+- **Memory Trade-off:** Larger contexts require more storage (1,979,446 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
 | Metric | Value |
 |--------|-------|
+| Vocabulary Size | 888,624 |
+| Total Tokens | 105,654,230 |
+| Mean Frequency | 118.90 |
 | Median Frequency | 4 |
+| Frequency Std Dev | 9303.24 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | на | 5,995,585 |
+| 2 | в | 3,186,690 |
+| 3 | и | 3,167,004 |
+| 4 | е | 2,175,525 |
+| 5 | от | 2,154,986 |
+| 6 | за | 1,348,073 |
+| 7 | се | 1,261,391 |
+| 8 | г | 1,205,312 |
+| 9 | с | 1,088,412 |
+| 10 | през | 849,597 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | кепевци | 2 |
+| 2 | сарджовци | 2 |
+| 3 | мъндън | 2 |
+| 4 | талиевия | 2 |
+| 5 | carbonato | 2 |
+| 6 | tallio | 2 |
+| 7 | разр | 2 |
 | 8 | барутхана | 2 |
 | 9 | азадлу | 2 |
 | 10 | шталаг | 2 |
 | Metric | Value |
 |--------|-------|
+| Zipf Coefficient | 0.9425 |
+| R² (Goodness of Fit) | 0.997405 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
+| Top 100 | 35.2% |
+| Top 1,000 | 53.9% |
+| Top 5,000 | 70.2% |
+| Top 10,000 | 77.2% |
 ### Key Findings
+- **Zipf Compliance:** R²=0.9974 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 35.2% of corpus
+- **Long Tail:** 878,624 words needed for remaining 22.8% coverage
 ---
 ## 5. Word Embeddings Evaluation
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
+### 5.1 Cross-Lingual Alignment
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
+### 5.2 Model Comparison
+| Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
+|-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.7975 🏆 | 0.3595 | N/A | N/A |
+| **mono_64d** | 64 | 0.7851 | 0.2896 | N/A | N/A |
+| **mono_128d** | 128 | 0.7344 | 0.2334 | N/A | N/A |
+| **aligned_32d** | 32 | 0.7975 | 0.3609 | 0.1560 | 0.5140 |
+| **aligned_64d** | 64 | 0.7851 | 0.2794 | 0.3420 | 0.7340 |
+| **aligned_128d** | 128 | 0.7344 | 0.2326 | 0.4740 | 0.8180 |
 ### Key Findings
+- **Best Isotropy:** mono_32d with 0.7975 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.2926. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 47.4% R@1 in cross-lingual retrieval.
+- **Recommendation:** 128d aligned for best cross-lingual performance
 ---
+## 6.  Morphological Analysis (Experimental)
+This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
+### 6.1 Productivity & Complexity
+| Metric | Value | Interpretation | Recommendation |
+|--------|-------|----------------|----------------|
+| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **-0.715** | Low formulaic content | - |
+### 6.2 Affix Inventory (Productive Units)
+These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
+#### Productive Prefixes
+| Prefix | Examples |
+|--------|----------|
+| `пр-` | предхождащ, прихлупена, правнообвързващи |
+#### Productive Suffixes
+| Suffix | Examples |
+|--------|----------|
+| `-а` | исаака, жижавица, гамета |
+| `-та` | гамета, лопатовидната, малинката |
+| `-те` | врапчиште, древноиндийските, регресионните |
+| `-ите` | древноиндийските, регресионните, циментовите |
+| `-ата` | лопатовидната, малинката, покойницата |
+| `-ни` | пълнозначни, шекони, капсулни |
+| `-ки` | весегонски, гаговски, бачовски |
+| `-ия` | шумния, напрежения, валутния |
+### 6.3 Bound Stems (Lexical Roots)
+Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
+| Stem | Cohesion | Substitutability | Examples |
+|------|----------|------------------|----------|
+| `лгар` | 2.07x | 163 contexts | елгар, илгар, юлгар |
+| `нска` | 1.82x | 254 contexts | анска, энска, юнска |
+| `анск` | 1.39x | 921 contexts | данск, анска, банск |
+| `ийск` | 1.57x | 389 contexts | бийск, ийски, лийски |
+| `нски` | 1.49x | 508 contexts | янски, ански, онски |
+| `ълга` | 2.34x | 39 contexts | дълга, бълга, ългаз |
+| `емвр` | 2.64x | 21 contexts | ноемвр, декемвр, нпември |
+| `рски` | 1.42x | 269 contexts | юрски, врски, ерски |
+| `точн` | 1.58x | 134 contexts | точни, точно, точна |
+| `ичес` | 1.43x | 204 contexts | бичес, уичес, ическ |
+| `остр` | 1.37x | 215 contexts | остри, остро, остра |
+| `ение` | 1.49x | 123 contexts | пение, шение, мение |
+### 6.4 Affix Compatibility (Co-occurrence)
+This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
+| Prefix | Suffix | Frequency | Examples |
+|--------|--------|-----------|----------|
+| `пр-` | `-а` | 59 words | пріложіха, приложната |
+| `пр-` | `-те` | 21 words | притеснявайте, профилиращите |
+| `пр-` | `-та` | 20 words | приложната, притежаващата |
+| `пр-` | `-ите` | 18 words | профилиращите, пребогатите |
+| `пр-` | `-ата` | 16 words | приложната, притежаващата |
+| `пр-` | `-ия` | 15 words | противоракетния, притежания |
+| `пр-` | `-то` | 13 words | прозводството, препострояването |
+| `пр-` | `-ни` | 9 words | производни, предхождани |
+| `пр-` | `-ки` | 7 words | прокарвайки, правейки |
+| `пр-` | `-на` | 6 words | приблизителна, престъпна |
+### 6.5 Recursive Morpheme Segmentation
+Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
+| Word | Suggested Split | Confidence | Stem |
+|------|-----------------|------------|------|
+| пробитите | **`пр-обит-ите`** | 6.0 | `обит` |
+| натрупванията | **`натрупван-ия-та`** | 6.0 | `натрупван` |
+| смразяващата | **`смразяващ-ата`** | 4.5 | `смразяващ` |
+| лишаването | **`лишаване-то`** | 4.5 | `лишаване` |
+| телепатия | **`телепат-ия`** | 4.5 | `телепат` |
+| плодородното | **`плодородно-то`** | 4.5 | `плодородно` |
+| маловажното | **`маловажно-то`** | 4.5 | `маловажно` |
+| стигналите | **`стигнал-ите`** | 4.5 | `стигнал` |
+| латинизирани | **`латинизира-ни`** | 4.5 | `латинизира` |
+| уругвайското | **`уругвайско-то`** | 4.5 | `уругвайско` |
+| паразитология | **`паразитолог-ия`** | 4.5 | `паразитолог` |
+| реализираната | **`реализиран-ата`** | 4.5 | `реализиран` |
+| изчислимостта | **`изчислимост-та`** | 4.5 | `изчислимост` |
+| истинностни | **`истинност-ни`** | 4.5 | `истинност` |
+| паратаксалното | **`паратаксално-то`** | 4.5 | `паратаксално` |
+### 6.6 Linguistic Interpretation
+> **Automated Insight:**
+Bulgarian shows high morphological productivity. The subword models are significantly more efficient than the word models, suggesting a rich system of affixation or compounding.
+---
+## 7. Summary & Recommendations
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
+| Tokenizer | **64k BPE** | Best compression (4.37x) |
+| N-gram | **2-gram** | Lowest perplexity (385) |
+| Markov | **Context-4** | Highest predictability (92.8%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
+  doi = {10.5281/zenodo.18073153},
+  publisher = {Zenodo},
   url = {https://huggingface.co/wikilangs},
   institution = {Omneity Labs}
 }
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
+- 🤝 Sponsor: [Featherless AI](https://featherless.ai)
 ---
 *Generated by Wikilangs Models Pipeline*
+*Report Date: 2026-01-07 00:49:27*

models/embeddings/aligned/bg_128d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70f17c8d8d5727146912ba923f9ec2c4b65807bfa8b5e60c72de7b7230d38b0f
+size 1794266374

models/embeddings/aligned/bg_128d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "bg", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/bg_128d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3b14d7edbf93a544e14749ba75d859efad44e84f623da9eea95b5142ba8a34bd
+size 65664

models/embeddings/aligned/bg_128d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "bg",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 97579,
+  "vocab_size": 734481
+}

models/embeddings/aligned/bg_32d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b4f79d875896a889e95697cdfa2ce1628c8b5e9726df1bebe36cc6e8e221bd4
+size 462184966

models/embeddings/aligned/bg_32d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "bg", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/bg_32d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad1dfd93a6ed107e021e43a530e499fd9e511f7638cc31baf225f81e891b0f89
+size 4224

models/embeddings/aligned/bg_32d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "bg",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 97579,
+  "vocab_size": 734481
+}

models/embeddings/aligned/bg_64d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:393ce592707f310a2f120623ca00cca0aa494ab24d1dca2c4c99d4df43ab9a9f
+size 906212102

models/embeddings/aligned/bg_64d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "bg", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/bg_64d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7500339b34f061294b831adfc3f7ee7c50e0b66455c22735b0de33175a6264cd
+size 16512

models/embeddings/aligned/bg_64d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "bg",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 97579,
+  "vocab_size": 734481
+}

models/embeddings/monolingual/bg_128d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:62cdb2428baa77bd04299975524021723a8ae2e1a65dd3825b63a90f909791a6
-size 1847208986

 version https://git-lfs.github.com/spec/v1
+oid sha256:70f17c8d8d5727146912ba923f9ec2c4b65807bfa8b5e60c72de7b7230d38b0f
+size 1794266374

models/embeddings/monolingual/bg_128d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 128,
   "version": "monolingual",
   "training_params": {
-    "dim": 128,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 784943
 }

   "dimension": 128,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 128
   },
+  "vocab_size": 734481
 }

models/embeddings/monolingual/bg_32d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c6694d4b5c6d5d590970d067a729784b32deae9005f9da64f4af63b62d72fd36
-size 476372762

 version https://git-lfs.github.com/spec/v1
+oid sha256:8b4f79d875896a889e95697cdfa2ce1628c8b5e9726df1bebe36cc6e8e221bd4
+size 462184966

models/embeddings/monolingual/bg_32d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 32,
   "version": "monolingual",
   "training_params": {
-    "dim": 32,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 784943
 }

   "dimension": 32,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 32
   },
+  "vocab_size": 734481
 }

models/embeddings/monolingual/bg_64d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c165692c56ff4e358f7d5166c0669be3db289fba438592d3be05f8b7c4c00096
-size 933318170

 version https://git-lfs.github.com/spec/v1
+oid sha256:393ce592707f310a2f120623ca00cca0aa494ab24d1dca2c4c99d4df43ab9a9f
+size 906212102

models/embeddings/monolingual/bg_64d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 64,
   "version": "monolingual",
   "training_params": {
-    "dim": 64,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 784943
 }

   "dimension": 64,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 64
   },
+  "vocab_size": 734481
 }

models/subword_markov/bg_markov_ctx1_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dd9e037e79ad13bab48d8eeb19edaefaa7ce9c28ff82a5a07912b24ad1dbaae5
-size 553134

 version https://git-lfs.github.com/spec/v1
+oid sha256:229a2f690af4f616ac5a1e79a5fe524a2c5a46949f6c65b047078efee9a56f8e
+size 524522

models/subword_markov/bg_markov_ctx1_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "bg",
-  "unique_contexts": 7585,
-  "total_transitions": 748247572
 }

   "context_size": 1,
   "variant": "subword",
   "language": "bg",
+  "unique_contexts": 9126,
+  "total_transitions": 686554144
 }

models/subword_markov/bg_markov_ctx2_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:da9040e708654bc57ac14b886e7470e3d89de042483e7f8f4538568b3ebe116c
-size 4217824

 version https://git-lfs.github.com/spec/v1
+oid sha256:f1a847282abd262310cd8bd7f8a33933ab4c3497c4cb7f489e336e92219d6abb
+size 3278578

models/subword_markov/bg_markov_ctx2_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "bg",
-  "unique_contexts": 81858,
-  "total_transitions": 747940816
 }

   "context_size": 2,
   "variant": "subword",
   "language": "bg",
+  "unique_contexts": 72830,
+  "total_transitions": 686247877
 }

models/subword_markov/bg_markov_ctx3_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:747f1607cf523956d879650cfe449c357884a3cdb39c4513d39a8f6b5d5b567f
-size 20926937

 version https://git-lfs.github.com/spec/v1
+oid sha256:126f62029a8608c56060c0ad2f72ab931fea64126b89f1dce80ebb74006532e2
+size 15865939

models/subword_markov/bg_markov_ctx3_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "bg",
-  "unique_contexts": 542599,
-  "total_transitions": 747634060
 }

   "context_size": 3,
   "variant": "subword",
   "language": "bg",
+  "unique_contexts": 403072,
+  "total_transitions": 685941610
 }

models/subword_markov/bg_markov_ctx4_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:580594f4c1b5219691b6c56a2d73434635ee09c032b5149e7db74064a6acb65d
-size 84747354

 version https://git-lfs.github.com/spec/v1
+oid sha256:f1d0b88705e12fbb905aa024fea935f15f49b2eaffd6ffa000088e7c44f0875a
+size 63194641

models/subword_markov/bg_markov_ctx4_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "bg",
-  "unique_contexts": 2836397,
-  "total_transitions": 747327304
 }

   "context_size": 4,
   "variant": "subword",
   "language": "bg",
+  "unique_contexts": 1979446,
+  "total_transitions": 685635343
 }

models/subword_ngram/bg_2gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:67c8881953c951402dd6fdba26bfc9a9c946443e0e1ed1881dc14bec6d626101
-size 355833

 version https://git-lfs.github.com/spec/v1
+oid sha256:ed59345b26ad583332bbdd4c4919c5d8efc2e6b9ee5d5fcfe16fd83befccb78d
+size 293294

models/subword_ngram/bg_2gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "bg",
-  "unique_ngrams": 25460,
-  "total_ngrams": 748247572
 }

   "n": 2,
   "variant": "subword",
   "language": "bg",
+  "unique_ngrams": 20810,
+  "total_ngrams": 686554144
 }

models/subword_ngram/bg_3gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:32fc5c6fa7c26961c9a441b21558109617f745d911b211285e49e74ef54da5f0
-size 3306412

 version https://git-lfs.github.com/spec/v1
+oid sha256:185adce64cd1a43a9248d2b7db702539a12c908f4a5ef4bbc4e9ca66cc565dc5
+size 2444055

models/subword_ngram/bg_3gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "bg",
-  "unique_ngrams": 263503,
-  "total_ngrams": 747940816
 }

   "n": 3,
   "variant": "subword",
   "language": "bg",
+  "unique_ngrams": 189319,
+  "total_ngrams": 686247877
 }

models/subword_ngram/bg_4gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4c7e026096ab3694db6e24aef689cbcdc5f2d4039e87e3716741d3e37f776c44
-size 21218611

 version https://git-lfs.github.com/spec/v1
+oid sha256:37e2856ae1cf37226f255274ab9f57209a886772f6a16df6d02d8524d23d526d
+size 15707213

models/subword_ngram/bg_4gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "bg",
-  "unique_ngrams": 1642365,
-  "total_ngrams": 747634060
 }

   "n": 4,
   "variant": "subword",
   "language": "bg",
+  "unique_ngrams": 1191303,
+  "total_ngrams": 685941610
 }

models/subword_ngram/bg_5gram_subword.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:386a00eeb7981736146d31a8c81ff615a9203d5400db41c70134047f00e727aa
+size 58720220

models/subword_ngram/bg_5gram_subword_metadata.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "n": 5,
+  "variant": "subword",
+  "language": "bg",
+  "unique_ngrams": 4256227,
+  "total_ngrams": 685635343
+}

models/tokenizer/bg_tokenizer_16k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33c6944293a101991ea008701e30e9031604db2e52cd7e6ecdbe45d45ab069c8
-size 586044

 version https://git-lfs.github.com/spec/v1
+oid sha256:3347807e91ae4acfead8089d4f180f189aa8398b96e2609682b507f6b46f0612
+size 580323

models/tokenizer/bg_tokenizer_16k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/bg_tokenizer_32k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2ccfb2867a7a268411643ba09befd8b1a8af862bd8d825a39e257ee8375a64e1
-size 955868

 version https://git-lfs.github.com/spec/v1
+oid sha256:6df4d22bfb0bea403154fb25e8330887725de6ea5355a848829d1486fc10ee11
+size 941619

models/tokenizer/bg_tokenizer_32k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/bg_tokenizer_64k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cb7a345ba1ca1260c37faed404185ea71b0a9e5a3a0f07e7c61a1c0ad0629672
-size 1713039

 version https://git-lfs.github.com/spec/v1
+oid sha256:0c63cd6adafc2b0b97fd02361685515c93e7aa8d6d07f8d0e5f79003afef23af
+size 1679650

models/tokenizer/bg_tokenizer_64k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/bg_tokenizer_8k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9b413db4fd21958b13201f36fb037f0b71a092a43deac8595b08548d310f5741
-size 406965

 version https://git-lfs.github.com/spec/v1
+oid sha256:2442780aad7de7f357f17d60e70911ce945995d09117fd9e4391cd970916d6ee
+size 405611

models/tokenizer/bg_tokenizer_8k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/vocabulary/bg_vocabulary.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c185c72f951b2a0b535c14e0db0c089b83b539e8e448f1c25ef74024e17cc9b5
-size 15858834

 version https://git-lfs.github.com/spec/v1
+oid sha256:3f53b6eaacded0cb5579d1b7bbc9e820a2fca3ac607d26147a27bc71bf882b23
+size 14751972

models/vocabulary/bg_vocabulary_metadata.json CHANGED Viewed

@@ -1,16 +1,17 @@
 {
   "language": "bg",
-  "vocabulary_size": 960471,
   "statistics": {
-    "type_token_ratio": 0.019838880375008985,
     "coverage": {
-      "top_100": 0.3353316626354778,
-      "top_1000": 0.5264381386495268,
-      "top_5000": 0.6920902735329456,
-      "top_10000": 0.7614777009866214
     },
-    "hapax_count": 1312970,
-    "hapax_ratio": 0.577525433912734,
-    "total_documents": 306756
   }
 }

 {
   "language": "bg",
+  "vocabulary_size": 888624,
+  "variant": "full",
   "statistics": {
+    "type_token_ratio": 0.017786120252959604,
     "coverage": {
+      "top_100": 0.3486511529304493,
+      "top_1000": 0.5343087812816177,
+      "top_5000": 0.6955211681171984,
+      "top_10000": 0.7649694239005076
     },
+    "hapax_count": 1008492,
+    "hapax_ratio": 0.5315921641059377,
+    "total_documents": 306267
   }
 }

models/word_markov/bg_markov_ctx1_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:86a3382cdb32fda378af8795d72ce4a5795e3868e31ac776f3e98413d191d916
-size 229240553

 version https://git-lfs.github.com/spec/v1
+oid sha256:0720e5c54e069b4b09737ddf165eec8b22a456c09ad739be9b9643f73573efcb
+size 262875034

models/word_markov/bg_markov_ctx1_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "bg",
-  "unique_contexts": 2274140,
-  "total_transitions": 144455456
 }

   "context_size": 1,
   "variant": "word",
   "language": "bg",
+  "unique_contexts": 1896771,
+  "total_transitions": 106356455
 }

models/word_markov/bg_markov_ctx2_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bdb579252dcf1dfd5fefb2c16e59e61f8751fd9617e88957a50be897d5e7879a
-size 956955138

 version https://git-lfs.github.com/spec/v1
+oid sha256:43ce28ac5245a38af621384dbf04db9efad0d244705973f5d2834f68d0615188
+size 1071226703

models/word_markov/bg_markov_ctx2_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "bg",
-  "unique_contexts": 19851824,
-  "total_transitions": 144148700
 }

   "context_size": 2,
   "variant": "word",
   "language": "bg",
+  "unique_contexts": 23216480,
+  "total_transitions": 106050188
 }