Taja Kuzman
committed on
Commit
·
07f72cb
1
Parent(s):
13bdbf0
Update README.md
Browse files
README.md
CHANGED
|
@@ -238,94 +238,18 @@ At cross-dataset and cross-lingual experiments, it was shown that the X-GENRE cl
|
|
| 238 |
|
| 239 |
## Citation
|
| 240 |
|
| 241 |
-
If you use the model, please cite the
|
| 242 |
|
| 243 |
```
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
| 251 |
}
|
| 252 |
```
|
| 253 |
|
| 254 |
-
and the following paper on which the original model is based:
|
| 255 |
-
```
|
| 256 |
-
@article{DBLP:journals/corr/abs-1911-02116,
|
| 257 |
-
author = {Alexis Conneau and
|
| 258 |
-
Kartikay Khandelwal and
|
| 259 |
-
Naman Goyal and
|
| 260 |
-
Vishrav Chaudhary and
|
| 261 |
-
Guillaume Wenzek and
|
| 262 |
-
Francisco Guzm{\'{a}}n and
|
| 263 |
-
Edouard Grave and
|
| 264 |
-
Myle Ott and
|
| 265 |
-
Luke Zettlemoyer and
|
| 266 |
-
Veselin Stoyanov},
|
| 267 |
-
title = {Unsupervised Cross-lingual Representation Learning at Scale},
|
| 268 |
-
journal = {CoRR},
|
| 269 |
-
volume = {abs/1911.02116},
|
| 270 |
-
year = {2019},
|
| 271 |
-
url = {http://arxiv.org/abs/1911.02116},
|
| 272 |
-
eprinttype = {arXiv},
|
| 273 |
-
eprint = {1911.02116},
|
| 274 |
-
timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
|
| 275 |
-
biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
|
| 276 |
-
bibsource = {dblp computer science bibliography, https://dblp.org}
|
| 277 |
-
}
|
| 278 |
-
```
|
| 279 |
-
|
| 280 |
-
To cite the datasets that were used for fine-tuning:
|
| 281 |
-
|
| 282 |
-
CORE dataset:
|
| 283 |
-
|
| 284 |
-
```
|
| 285 |
-
@article{egbert2015developing,
|
| 286 |
-
title={Developing a bottom-up, user-based method of web register classification},
|
| 287 |
-
author={Egbert, Jesse and Biber, Douglas and Davies, Mark},
|
| 288 |
-
journal={Journal of the Association for Information Science and Technology},
|
| 289 |
-
volume={66},
|
| 290 |
-
number={9},
|
| 291 |
-
pages={1817--1831},
|
| 292 |
-
year={2015},
|
| 293 |
-
publisher={Wiley Online Library}
|
| 294 |
-
}
|
| 295 |
-
```
|
| 296 |
-
|
| 297 |
-
GINCO dataset:
|
| 298 |
-
|
| 299 |
-
```
|
| 300 |
-
@InProceedings{kuzman-rupnik-ljubei:2022:LREC,
|
| 301 |
-
author = {Kuzman, Taja and Rupnik, Peter and Ljube{\v{s}}i{\'c}, Nikola},
|
| 302 |
-
title = {{The GINCO Training Dataset for Web Genre Identification of Documents Out in the Wild}},
|
| 303 |
-
booktitle = {Proceedings of the Language Resources and Evaluation Conference},
|
| 304 |
-
month = jun,
|
| 305 |
-
year = {2022},
|
| 306 |
-
address = {Marseille, France},
|
| 307 |
-
publisher = {European Language Resources Association},
|
| 308 |
-
pages = {1584--1594},
|
| 309 |
-
url = {https://aclanthology.org/2022.lrec-1.170}
|
| 310 |
-
}
|
| 311 |
-
```
|
| 312 |
-
|
| 313 |
-
FTD dataset:
|
| 314 |
-
|
| 315 |
-
```
|
| 316 |
-
@article{sharoff2018functional,
|
| 317 |
-
title={Functional text dimensions for the annotation of web corpora},
|
| 318 |
-
author={Sharoff, Serge},
|
| 319 |
-
journal={Corpora},
|
| 320 |
-
volume={13},
|
| 321 |
-
number={1},
|
| 322 |
-
pages={65--95},
|
| 323 |
-
year={2018},
|
| 324 |
-
publisher={Edinburgh University Press}
|
| 325 |
-
}
|
| 326 |
-
```
|
| 327 |
-
|
| 328 |
-
The datasets are available at:
|
| 329 |
-
1. http://hdl.handle.net/11356/1467 (GINCO)
|
| 330 |
-
2. https://github.com/TurkuNLP/CORE-corpus (CORE)
|
| 331 |
-
3. https://github.com/ssharoff/genre-keras (FTD)
|
|
|
|
| 238 |
|
| 239 |
## Citation
|
| 240 |
|
| 241 |
+
If you use the model, please cite the paper that describes the creation of the X-GENRE dataset and the genre classifier:
|
| 242 |
|
| 243 |
```
|
| 244 |
+
@article{kuzman2023automatic,
|
| 245 |
+
title={Automatic Genre Identification for Robust Enrichment of Massive Text Collections: Investigation of Classification Methods in the Era of Large Language Models},
|
| 246 |
+
author={Kuzman, Taja and Mozeti{\v{c}}, Igor and Ljube{\v{s}}i{\'c}, Nikola},
|
| 247 |
+
journal={Machine Learning and Knowledge Extraction},
|
| 248 |
+
volume={5},
|
| 249 |
+
number={3},
|
| 250 |
+
pages={1149--1175},
|
| 251 |
+
year={2023},
|
| 252 |
+
publisher={MDPI}
|
| 253 |
}
|
| 254 |
```
|
| 255 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|