bowphs committed
Commit af1acfc · verified · 1 Parent(s): 187a06b

Add files using upload-large-folder tool

Files changed (41)
  1. stanza/.github/ISSUE_TEMPLATE/bug_report.md +29 -0
  2. stanza/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  3. stanza/.github/ISSUE_TEMPLATE/question.md +18 -0
  4. stanza/.github/pull_request_template.md +16 -0
  5. stanza/.github/stale.yml +21 -0
  6. stanza/.github/workflows/stanza-tests.yaml +43 -0
  7. stanza/demo/CONLL_Dependency_Visualizer_Example.ipynb +70 -0
  8. stanza/demo/Dependency_Visualization_Testing.ipynb +78 -0
  9. stanza/demo/NER_Visualization.ipynb +88 -0
  10. stanza/demo/Stanza_Beginners_Guide.ipynb +358 -0
  11. stanza/demo/arabic_test.conllu.txt +127 -0
  12. stanza/demo/corenlp.py +95 -0
  13. stanza/demo/japanese_test.conllu.txt +82 -0
  14. stanza/demo/pipeline_demo.py +52 -0
  15. stanza/demo/scenegraph.py +20 -0
  16. stanza/demo/semgrex.py +18 -0
  17. stanza/demo/semgrex_sample.conllu +24 -0
  18. stanza/demo/ssurgeon_script.txt +18 -0
  19. stanza/doc/CoreNLP.proto +873 -0
  20. stanza/scripts/config.sh +51 -0
  21. stanza/scripts/download_vectors.sh +93 -0
  22. stanza/stanza/_version.py +4 -0
  23. stanza/stanza/models/__init__.py +0 -0
  24. stanza/stanza/models/_training_logging.py +4 -0
  25. stanza/stanza/models/classifier.py +660 -0
  26. stanza/stanza/models/parser.py +406 -0
  27. stanza/stanza/resources/__init__.py +0 -0
  28. stanza/stanza/server/java_protobuf_requests.py +357 -0
  29. stanza/stanza/server/main.py +72 -0
  30. stanza/stanza/server/morphology.py +81 -0
  31. stanza/stanza/server/parser_eval.py +89 -0
  32. stanza/stanza/server/tokensregex.py +44 -0
  33. stanza/stanza/server/tsurgeon.py +95 -0
  34. stanza/stanza/server/ud_enhancer.py +81 -0
  35. stanza/stanza/tests/pytest.ini +5 -0
  36. stanza/stanza/tests/setup.py +58 -0
  37. stanza/stanza/utils/__init__.py +0 -0
  38. stanza/stanza/utils/confusion.py +216 -0
  39. stanza/stanza/utils/conll.py +205 -0
  40. stanza/stanza/utils/conll18_ud_eval.py +832 -0
  41. stanza/stanza/utils/helper_func.py +38 -0
stanza/.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,29 @@
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve
4
+ title: ''
5
+ labels: bug
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+ A clear and concise description of what the bug is.
12
+
13
+ **To Reproduce**
14
+ Steps to reproduce the behavior:
15
+ 1. Go to '...'
16
+ 2. Click on '....'
17
+ 3. Scroll down to '....'
18
+ 4. See error
19
+
20
+ **Expected behavior**
21
+ A clear and concise description of what you expected to happen.
22
+
23
+ **Environment (please complete the following information):**
24
+ - OS: [e.g. Windows, Ubuntu, CentOS, MacOS]
25
+ - Python version: [e.g. Python 3.6.8 from Anaconda]
26
+ - Stanza version: [e.g., 1.0.0]
27
+
28
+ **Additional context**
29
+ Add any other context about the problem here.
stanza/.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,20 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for this project
4
+ title: ''
5
+ labels: enhancement
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Is your feature request related to a problem? Please describe.**
11
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12
+
13
+ **Describe the solution you'd like**
14
+ A clear and concise description of what you want to happen.
15
+
16
+ **Describe alternatives you've considered**
17
+ A clear and concise description of any alternative solutions or features you've considered.
18
+
19
+ **Additional context**
20
+ Add any other context or screenshots about the feature request here.
stanza/.github/ISSUE_TEMPLATE/question.md ADDED
@@ -0,0 +1,18 @@
1
+ ---
2
+ name: Question
3
+ about: 'Question about general usage. '
4
+ title: "[QUESTION]"
5
+ labels: question
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ Before you start, make sure to check out:
11
+ * Our documentation: https://stanfordnlp.github.io/stanza/
12
+ * Our FAQ: https://stanfordnlp.github.io/stanza/faq.html
13
+ * Github issues (especially closed ones)
14
+ Your question might have an answer in these places!
15
+
16
+ If you still couldn't find the answer to your question, feel free to delete this text and write down your question. The more information you provide with your question, the faster we will be able to help you!
17
+
18
+ If you have a question about an issue you're facing when using Stanza, please provide a detailed step-by-step guide to reproduce it. At a minimum, include a small code sample that reproduces the problem instead of just describing it. That will greatly help us locate the issue faster and help you resolve it!
stanza/.github/pull_request_template.md ADDED
@@ -0,0 +1,16 @@
1
+ **BEFORE YOU START**: please make sure your pull request is against the `dev` branch.
2
+ We cannot accept pull requests against the `main` branch.
3
+ See our [contributing guide](https://github.com/stanfordnlp/stanza/blob/main/CONTRIBUTING.md) for details.
4
+
5
+ ## Description
6
+ A brief and concise description of what your pull request is trying to accomplish.
7
+
8
+ ## Fixes Issues
9
+ A list of issues/bugs with # references. (e.g., #123)
10
+
11
+ ## Unit test coverage
12
+ Are there unit tests in place to make sure your code is functioning correctly?
13
+ (see [here](https://github.com/stanfordnlp/stanza/blob/master/tests/test_tagger.py) for a simple example)
14
+
15
+ ## Known breaking changes/behaviors
16
+ Does this break anything in Stanza's existing user interface? If so, what is it and how is it addressed?
stanza/.github/stale.yml ADDED
@@ -0,0 +1,21 @@
1
+ # Number of days of inactivity before an issue becomes stale
2
+ daysUntilStale: 60
3
+ # Number of days of inactivity before a stale issue is closed
4
+ daysUntilClose: 7
5
+ # Issues with these labels will never be considered stale
6
+ exemptLabels:
7
+ - pinned
8
+ - security
9
+ - fixed on dev
10
+ - bug
11
+ - enhancement
12
+ # Label to use when marking an issue as stale
13
+ staleLabel: stale
14
+ # Comment to post when marking an issue as stale. Set to `false` to disable
15
+ markComment: >
16
+ This issue has been automatically marked as stale because it has not had
17
+ recent activity. It will be closed if no further activity occurs. Thank you
18
+ for your contributions.
19
+ # Comment to post when closing a stale issue. Set to `false` to disable
20
+ closeComment: >
21
+ This issue has been automatically closed due to inactivity.
stanza/.github/workflows/stanza-tests.yaml ADDED
@@ -0,0 +1,43 @@
1
+ name: Run Stanza Tests
2
+ on: [push]
3
+ jobs:
4
+ Run-Stanza-Tests:
5
+ runs-on: self-hosted
6
+ steps:
7
+ - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
8
+ - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!"
9
+ - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
10
+ - name: Check out repository code
11
+ uses: actions/checkout@v2
12
+ - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
13
+ - run: echo "🖥️ The workflow is now ready to test your code on the runner."
14
+ - name: Run Stanza Tests
15
+ run: |
16
+ # set up environment
17
+ echo "Setting up environment..."
18
+ bash
19
+ #. $CONDA_PREFIX/etc/profile.d/conda.sh
20
+ . /home/stanzabuild/miniconda3/etc/profile.d/conda.sh
21
+ conda activate stanza
22
+ export STANZA_TEST_HOME=/scr/stanza_test
23
+ export CORENLP_HOME=$STANZA_TEST_HOME/corenlp_dir
24
+ export CLASSPATH=$CORENLP_HOME/*:
25
+ echo CORENLP_HOME=$CORENLP_HOME
26
+ echo CLASSPATH=$CLASSPATH
27
+ # install from stanza repo being evaluated
28
+ echo PWD: $pwd
29
+ echo PATH: $PATH
30
+ pip3 install -e .
31
+ pip3 install -e .[test]
32
+ pip3 install -e .[transformers]
33
+ pip3 install -e .[tokenizers]
34
+ # set up for tests
35
+ echo "Running stanza test set up..."
36
+ rm -rf $STANZA_TEST_HOME
37
+ python3 stanza/tests/setup.py
38
+ # run tests
39
+ echo "Running tests..."
40
+ export CUDA_VISIBLE_DEVICES=2
41
+ pytest stanza/tests
42
+
43
+ - run: echo "🍏 This job's status is ${{ job.status }}."
stanza/demo/CONLL_Dependency_Visualizer_Example.ipynb ADDED
@@ -0,0 +1,70 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "c0fd86c8",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from stanza.utils.visualization.conll_deprel_visualization import conll_to_visual\n",
11
+ "\n",
12
+ "# load necessary conllu files - expected to be in the demo directory along with the notebook\n",
13
+ "en_file = \"en_test.conllu.txt\"\n",
14
+ "\n",
15
+ "# testing left to right languages\n",
16
+ "conll_to_visual(en_file, \"en\", sent_count=2)\n",
17
+ "conll_to_visual(en_file, \"en\", sent_count=10)\n",
18
+ "#conll_to_visual(en_file, \"en\", display_all=True)\n"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "fc4b3f9b",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "from stanza.utils.visualization.conll_deprel_visualization import conll_to_visual\n",
29
+ "\n",
30
+ "jp_file = \"japanese_test.conllu.txt\"\n",
31
+ "conll_to_visual(jp_file, \"ja\")\n"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "id": "6852b8e8",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "from stanza.utils.visualization.conll_deprel_visualization import conll_to_visual\n",
42
+ "\n",
43
+ "# testing right to left languages\n",
44
+ "ar_file = \"arabic_test.conllu.txt\"\n",
45
+ "conll_to_visual(ar_file, \"ar\")"
46
+ ]
47
+ }
48
+ ],
49
+ "metadata": {
50
+ "kernelspec": {
51
+ "display_name": "Python 3 (ipykernel)",
52
+ "language": "python",
53
+ "name": "python3"
54
+ },
55
+ "language_info": {
56
+ "codemirror_mode": {
57
+ "name": "ipython",
58
+ "version": 3
59
+ },
60
+ "file_extension": ".py",
61
+ "mimetype": "text/x-python",
62
+ "name": "python",
63
+ "nbconvert_exporter": "python",
64
+ "pygments_lexer": "ipython3",
65
+ "version": "3.8.3"
66
+ }
67
+ },
68
+ "nbformat": 4,
69
+ "nbformat_minor": 5
70
+ }
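The same calls work outside a notebook; a minimal sketch, assuming the demo CoNLL-U files sit in the working directory as the first cell expects (en_test.conllu.txt is referenced by the notebook but is not part of this upload).

    from stanza.utils.visualization.conll_deprel_visualization import conll_to_visual

    # Render the first two sentences of a left-to-right file, then a right-to-left file.
    conll_to_visual("en_test.conllu.txt", "en", sent_count=2)
    conll_to_visual("arabic_test.conllu.txt", "ar")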
stanza/demo/Dependency_Visualization_Testing.ipynb ADDED
@@ -0,0 +1,78 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "64b2a9e0",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from stanza.utils.visualization.dependency_visualization import visualize_strings\n",
11
+ "\n",
12
+ "ar_strings = ['برلين ترفض حصول شركة اميركية على رخصة تصنيع دبابة \"ليوبارد\" الالمانية', \"هل بإمكاني مساعدتك؟\", \n",
13
+ " \"أراك في مابعد\", \"لحظة من فضلك\"]\n",
14
+ "# Testing with right to left language\n",
15
+ "visualize_strings(ar_strings, \"ar\")"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": null,
21
+ "id": "35ef521b",
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "from stanza.utils.visualization.dependency_visualization import visualize_strings\n",
26
+ "\n",
27
+ "en_strings = [\"This is a sentence.\", \n",
28
+ " \"He is wearing a red shirt\",\n",
29
+ " \"Barack Obama was born in Hawaii. He was elected President of the United States in 2008.\"]\n",
30
+ "# Testing with left to right languages\n",
31
+ "visualize_strings(en_strings, \"en\")"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "id": "f3cf10ba",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "from stanza.utils.visualization.dependency_visualization import visualize_strings\n",
42
+ "\n",
43
+ "zh_strings = [\"中国是一个很有意思的国家。\"]\n",
44
+ "# Testing with right to left language\n",
45
+ "visualize_strings(zh_strings, \"zh\")"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "d2b9b574",
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": []
55
+ }
56
+ ],
57
+ "metadata": {
58
+ "kernelspec": {
59
+ "display_name": "Python 3 (ipykernel)",
60
+ "language": "python",
61
+ "name": "python3"
62
+ },
63
+ "language_info": {
64
+ "codemirror_mode": {
65
+ "name": "ipython",
66
+ "version": 3
67
+ },
68
+ "file_extension": ".py",
69
+ "mimetype": "text/x-python",
70
+ "name": "python",
71
+ "nbconvert_exporter": "python",
72
+ "pygments_lexer": "ipython3",
73
+ "version": "3.8.3"
74
+ }
75
+ },
76
+ "nbformat": 4,
77
+ "nbformat_minor": 5
78
+ }
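Condensed from the cells above, a minimal sketch of the raw-string entry point, which takes a list of sentences and a language code.

    from stanza.utils.visualization.dependency_visualization import visualize_strings

    # One left-to-right and one right-to-left example, as in the notebook.
    visualize_strings(["He is wearing a red shirt"], "en")
    visualize_strings(["لحظة من فضلك"], "ar")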
stanza/demo/NER_Visualization.ipynb ADDED
@@ -0,0 +1,88 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "abf300bb",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from stanza.utils.visualization.ner_visualization import visualize_strings\n",
11
+ "\n",
12
+ "en_strings = ['''Samuel Jackson, a Christian man from Utah, went to the JFK Airport for a flight to New York.\n",
13
+ " He was thinking of attending the US Open, his favorite tennis tournament besides Wimbledon.\n",
14
+ " That would be a dream trip, certainly not possible since it is $5000 attendance and 5000 miles away.\n",
15
+ " On the way there, he watched the Super Bowl for 2 hours and read War and Piece by Tolstoy for 1 hour.\n",
16
+ " In New York, he crossed the Brooklyn Bridge and listened to the 5th symphony of Beethoven as well as\n",
17
+ " \"All I want for Christmas is You\" by Mariah Carey.''', \n",
18
+ " \"Barack Obama was born in Hawaii. He was elected President of the United States in 2008\"]\n",
19
+ " \n",
20
+ "visualize_strings(en_strings, \"en\")\n"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "id": "5670921a",
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "from stanza.utils.visualization.ner_visualization import visualize_strings\n",
31
+ "\n",
32
+ "zh_strings = ['''来自犹他州的基督徒塞缪尔杰克逊前往肯尼迪机场搭乘航班飞往纽约。\n",
33
+ " 他正在考虑参加美国公开赛,这是除了温布尔登之外他最喜欢的网球赛事。\n",
34
+ " 那将是一次梦想之旅,当然不可能,因为它的出勤费为 5000 美元,距离 5000 英里。\n",
35
+ " 在去的路上,他看了 2 个小时的超级碗比赛,看了 1 个小时的托尔斯泰的《战争与碎片》。\n",
36
+ " 在纽约,他穿过布鲁克林大桥,聆听了贝多芬的第五交响曲以及 玛丽亚凯莉的“圣诞节我想要的就是你”。''',\n",
37
+ " \"我觉得罗家费德勒住在加州, 在美国里面。\"]\n",
38
+ "visualize_strings(zh_strings, \"zh\", colors={\"PERSON\": \"yellow\", \"DATE\": \"red\", \"GPE\": \"blue\"})\n",
39
+ "visualize_strings(zh_strings, \"zh\", select=['PERSON', 'DATE'])"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "id": "b8d96072",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "from stanza.utils.visualization.ner_visualization import visualize_strings\n",
50
+ "\n",
51
+ "ar_strings = [\".أعيش في سان فرانسيسكو ، كاليفورنيا. اسمي أليكس وأنا ألتحق بجامعة ستانفورد. أنا أدرس علوم الكمبيوتر وأستاذي هو كريس مانينغ\"\n",
52
+ " , \"اسمي أليكس ، أنا من الولايات المتحدة.\", \n",
53
+ " '''صامويل جاكسون ، رجل مسيحي من ولاية يوتا ، ذهب إلى مطار جون كنيدي في رحلة إلى نيويورك. كان يفكر في حضور بطولة الولايات المتحدة المفتوحة للتنس ، بطولة التنس المفضلة لديه إلى جانب بطولة ويمبلدون. ستكون هذه رحلة الأحلام ، وبالتأكيد ليست ممكنة لأنها تبلغ 5000 دولار للحضور و 5000 ميل. في الطريق إلى هناك ، شاهد Super Bowl لمدة ساعتين وقرأ War and Piece by Tolstoy لمدة ساعة واحدة. في نيويورك ، عبر جسر بروكلين واستمع إلى السيمفونية الخامسة لبيتهوفن وكذلك \"كل ما أريده في عيد الميلاد هو أنت\" لماريا كاري.''']\n",
54
+ "\n",
55
+ "visualize_strings(ar_strings, \"ar\", colors={\"PER\": \"pink\", \"LOC\": \"linear-gradient(90deg, #aa9cfc, #fc9ce7)\", \"ORG\": \"yellow\"})"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "id": "22489b27",
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": []
65
+ }
66
+ ],
67
+ "metadata": {
68
+ "kernelspec": {
69
+ "display_name": "Python 3 (ipykernel)",
70
+ "language": "python",
71
+ "name": "python3"
72
+ },
73
+ "language_info": {
74
+ "codemirror_mode": {
75
+ "name": "ipython",
76
+ "version": 3
77
+ },
78
+ "file_extension": ".py",
79
+ "mimetype": "text/x-python",
80
+ "name": "python",
81
+ "nbconvert_exporter": "python",
82
+ "pygments_lexer": "ipython3",
83
+ "version": "3.8.3"
84
+ }
85
+ },
86
+ "nbformat": 4,
87
+ "nbformat_minor": 5
88
+ }
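A condensed sketch of the options the cells above exercise: default rendering, per-label colors, and a label filter. PERSON and DATE follow the English model's tag set (the Arabic cell uses PER/LOC/ORG instead), and passing colors and select for English is assumed to behave the same way as in the Chinese cell.

    from stanza.utils.visualization.ner_visualization import visualize_strings

    docs = ["Barack Obama was born in Hawaii. He was elected President of the United States in 2008"]
    visualize_strings(docs, "en")                               # default colors
    visualize_strings(docs, "en", colors={"PERSON": "yellow"})  # override the color for a label
    visualize_strings(docs, "en", select=["PERSON", "DATE"])    # render only the selected labels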
stanza/demo/Stanza_Beginners_Guide.ipynb ADDED
@@ -0,0 +1,358 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "name": "Stanza-Beginners-Guide.ipynb",
7
+ "provenance": [],
8
+ "collapsed_sections": [],
9
+ "toc_visible": true
10
+ },
11
+ "kernelspec": {
12
+ "name": "python3",
13
+ "display_name": "Python 3"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "markdown",
19
+ "metadata": {
20
+ "id": "56LiYCkPM7V_",
21
+ "colab_type": "text"
22
+ },
23
+ "source": [
24
+ "# Welcome to Stanza!\n",
25
+ "\n",
26
+ "![Latest Version](https://img.shields.io/pypi/v/stanza.svg?colorB=bc4545)\n",
27
+ "![Python Versions](https://img.shields.io/pypi/pyversions/stanza.svg?colorB=bc4545)\n",
28
+ "\n",
29
+ "Stanza is a Python NLP toolkit that supports 60+ human languages. It is built with highly accurate neural network components that enable efficient training and evaluation with your own annotated data, and offers pretrained models on 100 treebanks. Additionally, Stanza provides a stable, officially maintained Python interface to Java Stanford CoreNLP Toolkit.\n",
30
+ "\n",
31
+ "In this tutorial, we will demonstrate how to set up Stanza and annotate text with its native neural network NLP models. For the use of the Python CoreNLP interface, please see other tutorials."
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "metadata": {
37
+ "id": "yQff4Di5Nnq0",
38
+ "colab_type": "text"
39
+ },
40
+ "source": [
41
+ "## 1. Installing Stanza\n",
42
+ "\n",
43
+ "Note that Stanza only supports Python 3.6 and above. Installing and importing Stanza are as simple as running the following commands:"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "metadata": {
49
+ "id": "owSj1UtdEvSU",
50
+ "colab_type": "code",
51
+ "colab": {}
52
+ },
53
+ "source": [
54
+ "# Install; note that the prefix \"!\" is not needed if you are running in a terminal\n",
55
+ "!pip install stanza\n",
56
+ "\n",
57
+ "# Import the package\n",
58
+ "import stanza"
59
+ ],
60
+ "execution_count": 0,
61
+ "outputs": []
62
+ },
63
+ {
64
+ "cell_type": "markdown",
65
+ "metadata": {
66
+ "id": "4ixllwEKeCJg",
67
+ "colab_type": "text"
68
+ },
69
+ "source": [
70
+ "### More Information\n",
71
+ "\n",
72
+ "For common troubleshooting, please visit our [troubleshooting page](https://stanfordnlp.github.io/stanfordnlp/installation_usage.html#troubleshooting)."
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "markdown",
77
+ "metadata": {
78
+ "id": "aeyPs5ARO79d",
79
+ "colab_type": "text"
80
+ },
81
+ "source": [
82
+ "## 2. Downloading Models\n",
83
+ "\n",
84
+ "You can download models with the `stanza.download` command. The language can be specified with either a full language name (e.g., \"english\"), or a short code (e.g., \"en\"). \n",
85
+ "\n",
86
+ "By default, models will be saved to your `~/stanza_resources` directory. If you want to specify your own path to save the model files, you can pass a `dir=your_path` argument.\n"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "metadata": {
92
+ "id": "HDwRm-KXGcYo",
93
+ "colab_type": "code",
94
+ "colab": {}
95
+ },
96
+ "source": [
97
+ "# Download an English model into the default directory\n",
98
+ "print(\"Downloading English model...\")\n",
99
+ "stanza.download('en')\n",
100
+ "\n",
101
+ "# Similarly, download a (simplified) Chinese model\n",
102
+ "# Note that you can use verbose=False to turn off all printed messages\n",
103
+ "print(\"Downloading Chinese model...\")\n",
104
+ "stanza.download('zh', verbose=False)"
105
+ ],
106
+ "execution_count": 0,
107
+ "outputs": []
108
+ },
109
+ {
110
+ "cell_type": "markdown",
111
+ "metadata": {
112
+ "id": "7HCfQ0SfdmsU",
113
+ "colab_type": "text"
114
+ },
115
+ "source": [
116
+ "### More Information\n",
117
+ "\n",
118
+ "Pretrained models are provided for 60+ different languages. For all languages, available models and the corresponding short language codes, please check out the [models page](https://stanfordnlp.github.io/stanza/models.html).\n"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "markdown",
123
+ "metadata": {
124
+ "id": "b3-WZJrzWD2o",
125
+ "colab_type": "text"
126
+ },
127
+ "source": [
128
+ "## 3. Processing Text\n"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "markdown",
133
+ "metadata": {
134
+ "id": "XrnKl2m3fq2f",
135
+ "colab_type": "text"
136
+ },
137
+ "source": [
138
+ "### Constructing Pipeline\n",
139
+ "\n",
140
+ "To process a piece of text, you'll need to first construct a `Pipeline` with different `Processor` units. The pipeline is language-specific, so again you'll need to first specify the language (see examples).\n",
141
+ "\n",
142
+ "- By default, the pipeline will include all processors, including tokenization, multi-word token expansion, part-of-speech tagging, lemmatization, dependency parsing and named entity recognition (for supported languages). However, you can always specify what processors you want to include with the `processors` argument.\n",
143
+ "\n",
144
+ "- Stanza's pipeline is CUDA-aware, meaning that a CUDA-device will be used whenever it is available, otherwise CPUs will be used when a GPU is not found. You can force the pipeline to use CPU regardless by setting `use_gpu=False`.\n",
145
+ "\n",
146
+ "- Again, you can suppress all printed messages by setting `verbose=False`."
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "metadata": {
152
+ "id": "HbiTSBDPG53o",
153
+ "colab_type": "code",
154
+ "colab": {}
155
+ },
156
+ "source": [
157
+ "# Build an English pipeline, with all processors by default\n",
158
+ "print(\"Building an English pipeline...\")\n",
159
+ "en_nlp = stanza.Pipeline('en')\n",
160
+ "\n",
161
+ "# Build a Chinese pipeline, with customized processor list and no logging, and force it to use CPU\n",
162
+ "print(\"Building a Chinese pipeline...\")\n",
163
+ "zh_nlp = stanza.Pipeline('zh', processors='tokenize,lemma,pos,depparse', verbose=False, use_gpu=False)"
164
+ ],
165
+ "execution_count": 0,
166
+ "outputs": []
167
+ },
168
+ {
169
+ "cell_type": "markdown",
170
+ "metadata": {
171
+ "id": "Go123Bx8e1wt",
172
+ "colab_type": "text"
173
+ },
174
+ "source": [
175
+ "### Annotating Text\n",
176
+ "\n",
177
+ "After a pipeline is successfully constructed, you can get annotations of a piece of text simply by passing the string into the pipeline object. The pipeline will return a `Document` object, which can be used to access detailed annotations from. For example:\n"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "metadata": {
183
+ "id": "k_p0h1UTHDMm",
184
+ "colab_type": "code",
185
+ "colab": {}
186
+ },
187
+ "source": [
188
+ "# Processing English text\n",
189
+ "en_doc = en_nlp(\"Barack Obama was born in Hawaii. He was elected president in 2008.\")\n",
190
+ "print(type(en_doc))\n",
191
+ "\n",
192
+ "# Processing Chinese text\n",
193
+ "zh_doc = zh_nlp(\"达沃斯世界经济论坛是每年全球政商界领袖聚在一起的年度盛事。\")\n",
194
+ "print(type(zh_doc))"
195
+ ],
196
+ "execution_count": 0,
197
+ "outputs": []
198
+ },
199
+ {
200
+ "cell_type": "markdown",
201
+ "metadata": {
202
+ "id": "DavwCP9egzNZ",
203
+ "colab_type": "text"
204
+ },
205
+ "source": [
206
+ "### More Information\n",
207
+ "\n",
208
+ "For more information on how to construct a pipeline and information on different processors, please visit our [pipeline page](https://stanfordnlp.github.io/stanfordnlp/pipeline.html)."
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "markdown",
213
+ "metadata": {
214
+ "id": "O_PYLEGziQWR",
215
+ "colab_type": "text"
216
+ },
217
+ "source": [
218
+ "## 4. Accessing Annotations\n",
219
+ "\n",
220
+ "Annotations can be accessed from the returned `Document` object. \n",
221
+ "\n",
222
+ "A `Document` contains a list of `Sentence`s, and a `Sentence` contains a list of `Token`s and `Word`s. For the most part `Token`s and `Word`s overlap, but some tokens can be divided into mutiple words, for instance the French token `aux` is divided into the words `à` and `les`, while in English a word and a token are equivalent. Note that dependency parses are derived over `Word`s.\n",
223
+ "\n",
224
+ "Additionally, a `Span` object is used to represent annotations that are part of a document, such as named entity mentions.\n",
225
+ "\n",
226
+ "\n",
227
+ "The following example iterate over all English sentences and words, and print the word information one by one:"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "metadata": {
233
+ "id": "B5691SpFHFZ6",
234
+ "colab_type": "code",
235
+ "colab": {}
236
+ },
237
+ "source": [
238
+ "for i, sent in enumerate(en_doc.sentences):\n",
239
+ " print(\"[Sentence {}]\".format(i+1))\n",
240
+ " for word in sent.words:\n",
241
+ " print(\"{:12s}\\t{:12s}\\t{:6s}\\t{:d}\\t{:12s}\".format(\\\n",
242
+ " word.text, word.lemma, word.pos, word.head, word.deprel))\n",
243
+ " print(\"\")"
244
+ ],
245
+ "execution_count": 0,
246
+ "outputs": []
247
+ },
248
+ {
249
+ "cell_type": "markdown",
250
+ "metadata": {
251
+ "id": "-AUkCkNIrusq",
252
+ "colab_type": "text"
253
+ },
254
+ "source": [
255
+ "The following example iterate over all extracted named entity mentions and print out their character spans and types."
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "metadata": {
261
+ "id": "5Uu0-WmvsnlK",
262
+ "colab_type": "code",
263
+ "colab": {}
264
+ },
265
+ "source": [
266
+ "print(\"Mention text\\tType\\tStart-End\")\n",
267
+ "for ent in en_doc.ents:\n",
268
+ " print(\"{}\\t{}\\t{}-{}\".format(ent.text, ent.type, ent.start_char, ent.end_char))"
269
+ ],
270
+ "execution_count": 0,
271
+ "outputs": []
272
+ },
273
+ {
274
+ "cell_type": "markdown",
275
+ "metadata": {
276
+ "id": "Ql1SZlZOnMLo",
277
+ "colab_type": "text"
278
+ },
279
+ "source": [
280
+ "And similarly for the Chinese text:"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "code",
285
+ "metadata": {
286
+ "id": "XsVcEO9tHKPG",
287
+ "colab_type": "code",
288
+ "colab": {}
289
+ },
290
+ "source": [
291
+ "for i, sent in enumerate(zh_doc.sentences):\n",
292
+ " print(\"[Sentence {}]\".format(i+1))\n",
293
+ " for word in sent.words:\n",
294
+ " print(\"{:12s}\\t{:12s}\\t{:6s}\\t{:d}\\t{:12s}\".format(\\\n",
295
+ " word.text, word.lemma, word.pos, word.head, word.deprel))\n",
296
+ " print(\"\")"
297
+ ],
298
+ "execution_count": 0,
299
+ "outputs": []
300
+ },
301
+ {
302
+ "cell_type": "markdown",
303
+ "metadata": {
304
+ "id": "dUhWAs8pnnHT",
305
+ "colab_type": "text"
306
+ },
307
+ "source": [
308
+ "Alternatively, you can directly print a `Word` object to view all its annotations as a Python dict:"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "metadata": {
314
+ "id": "6_UafNb7HHIg",
315
+ "colab_type": "code",
316
+ "colab": {}
317
+ },
318
+ "source": [
319
+ "word = en_doc.sentences[0].words[0]\n",
320
+ "print(word)"
321
+ ],
322
+ "execution_count": 0,
323
+ "outputs": []
324
+ },
325
+ {
326
+ "cell_type": "markdown",
327
+ "metadata": {
328
+ "id": "TAQlOsuRoq2V",
329
+ "colab_type": "text"
330
+ },
331
+ "source": [
332
+ "### More Information\n",
333
+ "\n",
334
+ "For all information on different data objects, please visit our [data objects page](https://stanfordnlp.github.io/stanza/data_objects.html)."
335
+ ]
336
+ },
337
+ {
338
+ "cell_type": "markdown",
339
+ "metadata": {
340
+ "id": "hiiWHxYPpmhd",
341
+ "colab_type": "text"
342
+ },
343
+ "source": [
344
+ "## 5. Resources\n",
345
+ "\n",
346
+ "Apart from this interactive tutorial, we also provide tutorials on our website that cover a variety of use cases such as how to use different model \"packages\" for a language, how to use spaCy as a tokenizer, how to process pretokenized text without running the tokenizer, etc. For these tutorials please visit [our Tutorials page](https://stanfordnlp.github.io/stanza/tutorials.html).\n",
347
+ "\n",
348
+ "Other resources that you may find helpful include:\n",
349
+ "\n",
350
+ "- [Stanza Homepage](https://stanfordnlp.github.io/stanza/index.html)\n",
351
+ "- [FAQs](https://stanfordnlp.github.io/stanza/faq.html)\n",
352
+ "- [GitHub Repo](https://github.com/stanfordnlp/stanza)\n",
353
+ "- [Reporting Issues](https://github.com/stanfordnlp/stanza/issues)\n",
354
+ "- [Stanza System Description Paper](http://arxiv.org/abs/2003.07082)\n"
355
+ ]
356
+ }
357
+ ]
358
+ }
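For quick reference, the notebook's English walkthrough condensed into one script; a sketch of the same calls (models download to ~/stanza_resources on first run).

    import stanza

    stanza.download("en", verbose=False)   # fetch the default English models
    nlp = stanza.Pipeline("en")            # all processors by default
    doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")

    # Word-level annotations, as in section 4 of the notebook.
    for i, sent in enumerate(doc.sentences):
        print("[Sentence {}]".format(i + 1))
        for word in sent.words:
            print(word.text, word.lemma, word.pos, word.head, word.deprel, sep="\t")

    # Named entity mentions with character offsets.
    for ent in doc.ents:
        print(ent.text, ent.type, ent.start_char, ent.end_char, sep="\t")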
stanza/demo/arabic_test.conllu.txt ADDED
@@ -0,0 +1,127 @@
1
+ # newdoc id = assabah.20041005.0017
2
+ # newpar id = assabah.20041005.0017:p1
3
+ # sent_id = assabah.20041005.0017:p1u1
4
+ # text = سوريا: تعديل وزاري واسع يشمل 8 حقائب
5
+ # orig_file_sentence ASB_ARB_20041005.0017#1
6
+ 1 سوريا سُورِيَا X X--------- Foreign=Yes 0 root 0:root SpaceAfter=No|Vform=سُورِيَا|Gloss=Syria|Root=sUr|Translit=sūriyā|LTranslit=sūriyā
7
+ 2 : : PUNCT G--------- _ 1 punct 1:punct Vform=:|Translit=:
8
+ 3 تعديل تَعدِيل NOUN N------S1I Case=Nom|Definite=Ind|Number=Sing 6 nsubj 6:nsubj Vform=تَعدِيلٌ|Gloss=adjustment,change,modification,amendment|Root=`_d_l|Translit=taʿdīlun|LTranslit=taʿdīl
9
+ 4 وزاري وِزَارِيّ ADJ A-----MS1I Case=Nom|Definite=Ind|Gender=Masc|Number=Sing 3 amod 3:amod Vform=وِزَارِيٌّ|Gloss=ministry,ministerial|Root=w_z_r|Translit=wizārīyun|LTranslit=wizārīy
10
+ 5 واسع وَاسِع ADJ A-----MS1I Case=Nom|Definite=Ind|Gender=Masc|Number=Sing 3 amod 3:amod Vform=وَاسِعٌ|Gloss=wide,extensive,broad|Root=w_s_`|Translit=wāsiʿun|LTranslit=wāsiʿ
11
+ 6 يشمل شَمِل VERB VIIA-3MS-- Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Person=3|VerbForm=Fin|Voice=Act 1 parataxis 1:parataxis Vform=يَشمَلُ|Gloss=comprise,include,contain|Root=^s_m_l|Translit=yašmalu|LTranslit=šamil
12
+ 7 8 8 NUM Q--------- NumForm=Digit 6 obj 6:obj Vform=٨|Translit=8
13
+ 8 حقائب حَقِيبَة NOUN N------P2I Case=Gen|Definite=Ind|Number=Plur 7 nmod 7:nmod:gen Vform=حَقَائِبَ|Gloss=briefcase,suitcase,portfolio,luggage|Root=.h_q_b|Translit=ḥaqāʾiba|LTranslit=ḥaqībat
14
+
15
+ # newpar id = assabah.20041005.0017:p2
16
+ # sent_id = assabah.20041005.0017:p2u1
17
+ # text = دمشق (وكالات الانباء) - اجرى الرئيس السوري بشار الاسد تعديلا حكومياً واسعا تم بموجبه إقالة وزيري الداخلية والاعلام عن منصبيها في حين ظل محمد ناجي العطري رئيساً للحكومة.
18
+ # orig_file_sentence ASB_ARB_20041005.0017#2
19
+ 1 دمشق دمشق X U--------- _ 0 root 0:root Vform=دمشق|Root=OOV|Translit=dmšq
20
+ 2 ( ( PUNCT G--------- _ 3 punct 3:punct SpaceAfter=No|Vform=(|Translit=(
21
+ 3 وكالات وِكَالَة NOUN N------P1R Case=Nom|Definite=Cons|Number=Plur 1 dep 1:dep Vform=وِكَالَاتُ|Gloss=agency|Root=w_k_l|Translit=wikālātu|LTranslit=wikālat
22
+ 4 الانباء نَبَأ NOUN N------P2D Case=Gen|Definite=Def|Number=Plur 3 nmod 3:nmod:gen SpaceAfter=No|Vform=اَلأَنبَاءِ|Gloss=news_item,report|Root=n_b_'|Translit=al-ʾanbāʾi|LTranslit=nabaʾ
23
+ 5 ) ) PUNCT G--------- _ 3 punct 3:punct Vform=)|Translit=)
24
+ 6 - - PUNCT G--------- _ 1 punct 1:punct Vform=-|Translit=-
25
+ 7 اجرى أَجرَى VERB VP-A-3MS-- Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Voice=Act 1 advcl 1:advcl:فِي_حِينَ Vform=أَجرَى|Gloss=conduct,carry_out,perform|Root=^g_r_y|Translit=ʾaǧrā|LTranslit=ʾaǧrā
26
+ 8 الرئيس رَئِيس NOUN N------S1D Case=Nom|Definite=Def|Number=Sing 7 nsubj 7:nsubj Vform=اَلرَّئِيسُ|Gloss=president,head,chairman|Root=r_'_s|Translit=ar-raʾīsu|LTranslit=raʾīs
27
+ 9 السوري سُورِيّ ADJ A-----MS1D Case=Nom|Definite=Def|Gender=Masc|Number=Sing 8 amod 8:amod Vform=اَلسُّورِيُّ|Gloss=Syrian|Root=sUr|Translit=as-sūrīyu|LTranslit=sūrīy
28
+ 10 بشار بشار X U--------- _ 11 nmod 11:nmod Vform=بشار|Root=OOV|Translit=bšār
29
+ 11 الاسد الاسد X U--------- _ 8 nmod 8:nmod Vform=الاسد|Root=OOV|Translit=ālāsd
30
+ 12 تعديلا تَعدِيل NOUN N------S4I Case=Acc|Definite=Ind|Number=Sing 7 obj 7:obj Vform=تَعدِيلًا|Gloss=adjustment,change,modification,amendment|Root=`_d_l|Translit=taʿdīlan|LTranslit=taʿdīl
31
+ 13 حكومياً حُكُومِيّ ADJ A-----MS4I Case=Acc|Definite=Ind|Gender=Masc|Number=Sing 12 amod 12:amod Vform=حُكُومِيًّا|Gloss=governmental,state,official|Root=.h_k_m|Translit=ḥukūmīyan|LTranslit=ḥukūmīy
32
+ 14 واسعا وَاسِع ADJ A-----MS4I Case=Acc|Definite=Ind|Gender=Masc|Number=Sing 12 amod 12:amod Vform=وَاسِعًا|Gloss=wide,extensive,broad|Root=w_s_`|Translit=wāsiʿan|LTranslit=wāsiʿ
33
+ 15 تم تَمّ VERB VP-A-3MS-- Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Voice=Act 12 acl 12:acl Vform=تَمَّ|Gloss=conclude,take_place|Root=t_m_m|Translit=tamma|LTranslit=tamm
34
+ 16-18 بموجبه _ _ _ _ _ _ _ _
35
+ 16 ب بِ ADP P--------- AdpType=Prep 18 case 18:case Vform=بِ|Gloss=by,with|Root=bi|Translit=bi|LTranslit=bi
36
+ 17 موجب مُوجِب NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 16 fixed 16:fixed Vform=مُوجِبِ|Gloss=reason,motive|Root=w_^g_b|Translit=mūǧibi|LTranslit=mūǧib
37
+ 18 ه هُوَ PRON SP---3MS2- Case=Gen|Gender=Masc|Number=Sing|Person=3|PronType=Prs 15 nmod 15:nmod:بِ_مُوجِب:gen Vform=هِ|Gloss=he,she,it|Translit=hi|LTranslit=huwa
38
+ 19 إقالة إِقَالَة NOUN N------S1R Case=Nom|Definite=Cons|Number=Sing 15 nsubj 15:nsubj Vform=إِقَالَةُ|Gloss=dismissal,discharge|Root=q_y_l|Translit=ʾiqālatu|LTranslit=ʾiqālat
39
+ 20 وزيري وَزِير NOUN N------D2R Case=Gen|Definite=Cons|Number=Dual 19 nmod 19:nmod:gen Vform=وَزِيرَي|Gloss=minister|Root=w_z_r|Translit=wazīray|LTranslit=wazīr
40
+ 21 الداخلية دَاخِلِيّ ADJ A-----FS2D Case=Gen|Definite=Def|Gender=Fem|Number=Sing 20 amod 20:amod Vform=اَلدَّاخِلِيَّةِ|Gloss=internal,domestic,interior,of_state|Root=d__h_l|Translit=ad-dāḫilīyati|LTranslit=dāḫilīy
41
+ 22-23 والاعلام _ _ _ _ _ _ _ _
42
+ 22 و وَ CCONJ C--------- _ 23 cc 23:cc Vform=وَ|Gloss=and|Root=wa|Translit=wa|LTranslit=wa
43
+ 23 الإعلام إِعلَام NOUN N------S2D Case=Gen|Definite=Def|Number=Sing 21 conj 20:amod|21:conj Vform=اَلإِعلَامِ|Gloss=information,media|Root=`_l_m|Translit=al-ʾiʿlāmi|LTranslit=ʾiʿlām
44
+ 24 عن عَن ADP P--------- AdpType=Prep 25 case 25:case Vform=عَن|Gloss=about,from|Root=`an|Translit=ʿan|LTranslit=ʿan
45
+ 25-26 منصبيها _ _ _ _ _ _ _ _
46
+ 25 منصبي مَنصِب NOUN N------D2R Case=Gen|Definite=Cons|Number=Dual 19 nmod 19:nmod:عَن:gen Vform=مَنصِبَي|Gloss=post,position,office|Root=n_.s_b|Translit=manṣibay|LTranslit=manṣib
47
+ 26 ها هُوَ PRON SP---3FS2- Case=Gen|Gender=Fem|Number=Sing|Person=3|PronType=Prs 25 nmod 25:nmod:gen Vform=هَا|Gloss=he,she,it|Translit=hā|LTranslit=huwa
48
+ 27 في فِي ADP P--------- AdpType=Prep 7 mark 7:mark Vform=فِي|Gloss=in|Root=fI|Translit=fī|LTranslit=fī
49
+ 28 حين حِينَ ADP PI------2- AdpType=Prep|Case=Gen 7 mark 7:mark Vform=حِينِ|Gloss=when|Root=.h_y_n|Translit=ḥīni|LTranslit=ḥīna
50
+ 29 ظل ظَلّ VERB VP-A-3MS-- Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Voice=Act 7 parataxis 7:parataxis Vform=ظَلَّ|Gloss=remain,continue|Root=.z_l_l|Translit=ẓalla|LTranslit=ẓall
51
+ 30 محمد محمد X U--------- _ 32 nmod 32:nmod Vform=محمد|Root=OOV|Translit=mḥmd
52
+ 31 ناجي ناجي X U--------- _ 32 nmod 32:nmod Vform=ناجي|Root=OOV|Translit=nāǧy
53
+ 32 العطري العطري X U--------- _ 29 nsubj 29:nsubj Vform=العطري|Root=OOV|Translit=ālʿṭry
54
+ 33 رئيساً رَئِيس NOUN N------S4I Case=Acc|Definite=Ind|Number=Sing 29 xcomp 29:xcomp Vform=رَئِيسًا|Gloss=president,head,chairman|Root=r_'_s|Translit=raʾīsan|LTranslit=raʾīs
55
+ 34-35 للحكومة _ _ _ _ _ _ _ SpaceAfter=No
56
+ 34 ل لِ ADP P--------- AdpType=Prep 35 case 35:case Vform=لِ|Gloss=for,to|Root=l|Translit=li|LTranslit=li
57
+ 35 الحكومة حُكُومَة NOUN N------S2D Case=Gen|Definite=Def|Number=Sing 33 nmod 33:nmod:لِ:gen Vform=اَلحُكُومَةِ|Gloss=government,administration|Root=.h_k_m|Translit=al-ḥukūmati|LTranslit=ḥukūmat
58
+ 36 . . PUNCT G--------- _ 1 punct 1:punct Vform=.|Translit=.
59
+
60
+ # newpar id = assabah.20041005.0017:p3
61
+ # sent_id = assabah.20041005.0017:p3u1
62
+ # text = واضافت المصادر ان مهدي دخل الله رئيس تحرير صحيفة الحزب الحاكم والليبرالي التوجهات تسلم منصب وزير الاعلام خلفا لاحمد الحسن فيما تسلم اللواء غازي كنعان رئيس شعبة الامن السياسي منصب وزير الداخلية.
63
+ # orig_file_sentence ASB_ARB_20041005.0017#3
64
+ 1-2 واضافت _ _ _ _ _ _ _ _
65
+ 1 و وَ CCONJ C--------- _ 0 root 0:root Vform=وَ|Gloss=and|Root=wa|Translit=wa|LTranslit=wa
66
+ 2 أضافت أَضَاف VERB VP-A-3FS-- Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Voice=Act 1 parataxis 1:parataxis Vform=أَضَافَت|Gloss=add,attach,receive_as_guest|Root=.d_y_f|Translit=ʾaḍāfat|LTranslit=ʾaḍāf
67
+ 3 المصادر مَصدَر NOUN N------P1D Case=Nom|Definite=Def|Number=Plur 2 nsubj 2:nsubj Vform=اَلمَصَادِرُ|Gloss=source|Root=.s_d_r|Translit=al-maṣādiru|LTranslit=maṣdar
68
+ 4 ان أَنَّ SCONJ C--------- _ 16 mark 16:mark Vform=أَنَّ|Gloss=that|Root='_n|Translit=ʾanna|LTranslit=ʾanna
69
+ 5 مهدي مهدي X U--------- _ 6 nmod 6:nmod Vform=مهدي|Root=OOV|Translit=mhdy
70
+ 6 دخل دخل X U--------- _ 16 nsubj 16:nsubj Vform=دخل|Root=OOV|Translit=dḫl
71
+ 7 الله الله X U--------- _ 6 nmod 6:nmod Vform=الله|Root=OOV|Translit=āllh
72
+ 8 رئيس رَئِيس NOUN N------S4R Case=Acc|Definite=Cons|Number=Sing 6 nmod 6:nmod:acc Vform=رَئِيسَ|Gloss=president,head,chairman|Root=r_'_s|Translit=raʾīsa|LTranslit=raʾīs
73
+ 9 تحرير تَحرِير NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 8 nmod 8:nmod:gen Vform=تَحرِيرِ|Gloss=liberation,liberating,editorship,editing|Root=.h_r_r|Translit=taḥrīri|LTranslit=taḥrīr
74
+ 10 صحيفة صَحِيفَة NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 9 nmod 9:nmod:gen Vform=صَحِيفَةِ|Gloss=newspaper,sheet,leaf|Root=.s_.h_f|Translit=ṣaḥīfati|LTranslit=ṣaḥīfat
75
+ 11 الحزب حِزب NOUN N------S2D Case=Gen|Definite=Def|Number=Sing 10 nmod 10:nmod:gen Vform=اَلحِزبِ|Gloss=party,band|Root=.h_z_b|Translit=al-ḥizbi|LTranslit=ḥizb
76
+ 12 الحاكم حَاكِم NOUN N------S2D Case=Gen|Definite=Def|Number=Sing 11 nmod 11:nmod:gen Vform=اَلحَاكِمِ|Gloss=ruler,governor|Root=.h_k_m|Translit=al-ḥākimi|LTranslit=ḥākim
77
+ 13-14 والليبرالي _ _ _ _ _ _ _ _
78
+ 13 و وَ CCONJ C--------- _ 6 cc 6:cc Vform=وَ|Gloss=and|Root=wa|Translit=wa|LTranslit=wa
79
+ 14 الليبرالي لِيبِرَالِيّ ADJ A-----MS4D Case=Acc|Definite=Def|Gender=Masc|Number=Sing 6 amod 6:amod Vform=اَللِّيبِرَالِيَّ|Gloss=liberal|Root=lIbirAl|Translit=al-lībirālīya|LTranslit=lībirālīy
80
+ 15 التوجهات تَوَجُّه NOUN N------P2D Case=Gen|Definite=Def|Number=Plur 14 nmod 14:nmod:gen Vform=اَلتَّوَجُّهَاتِ|Gloss=attitude,approach|Root=w_^g_h|Translit=at-tawaǧǧuhāti|LTranslit=tawaǧǧuh
81
+ 16 تسلم تَسَلَّم VERB VP-A-3MS-- Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Voice=Act 2 ccomp 2:ccomp Vform=تَسَلَّمَ|Gloss=receive,assume|Root=s_l_m|Translit=tasallama|LTranslit=tasallam
82
+ 17 منصب مَنصِب NOUN N------S4R Case=Acc|Definite=Cons|Number=Sing 16 obj 16:obj Vform=مَنصِبَ|Gloss=post,position,office|Root=n_.s_b|Translit=manṣiba|LTranslit=manṣib
83
+ 18 وزير وَزِير NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 17 nmod 17:nmod:gen Vform=وَزِيرِ|Gloss=minister|Root=w_z_r|Translit=wazīri|LTranslit=wazīr
84
+ 19 الاعلام عَلَم NOUN N------P2D Case=Gen|Definite=Def|Number=Plur 18 nmod 18:nmod:gen Vform=اَلأَعلَامِ|Gloss=flag,banner,badge|Root=`_l_m|Translit=al-ʾaʿlāmi|LTranslit=ʿalam
85
+ 20 خلفا خَلَف NOUN N------S4I Case=Acc|Definite=Ind|Number=Sing 16 obl 16:obl:acc Vform=خَلَفًا|Gloss=substitute,scion|Root=_h_l_f|Translit=ḫalafan|LTranslit=ḫalaf
86
+ 21-22 لاحمد _ _ _ _ _ _ _ _
87
+ 21 ل لِ ADP P--------- AdpType=Prep 23 case 23:case Vform=لِ|Gloss=for,to|Root=l|Translit=li|LTranslit=li
88
+ 22 أحمد أَحمَد NOUN N------S2I Case=Gen|Definite=Ind|Number=Sing 23 nmod 23:nmod:gen Vform=أَحمَدَ|Gloss=Ahmad|Root=.h_m_d|Translit=ʾaḥmada|LTranslit=ʾaḥmad
89
+ 23 الحسن الحسن X U--------- _ 20 nmod 20:nmod:لِ Vform=الحسن|Root=OOV|Translit=ālḥsn
90
+ 24 فيما فِيمَا CCONJ C--------- _ 25 cc 25:cc Vform=فِيمَا|Gloss=while,during_which|Root=fI|Translit=fīmā|LTranslit=fīmā
91
+ 25 تسلم تَسَلَّم VERB VP-A-3MS-- Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Voice=Act 16 conj 2:ccomp|16:conj Vform=تَسَلَّمَ|Gloss=receive,assume|Root=s_l_m|Translit=tasallama|LTranslit=tasallam
92
+ 26 اللواء لِوَاء NOUN N------S1D Case=Nom|Definite=Def|Number=Sing 25 nsubj 25:nsubj Vform=اَللِّوَاءُ|Gloss=banner,flag|Root=l_w_y|Translit=al-liwāʾu|LTranslit=liwāʾ
93
+ 27 غازي غازي X U--------- _ 28 nmod 28:nmod Vform=غازي|Root=OOV|Translit=ġāzy
94
+ 28 كنعان كنعان X U--------- _ 26 nmod 26:nmod Vform=كنعان|Root=OOV|Translit=knʿān
95
+ 29 رئيس رَئِيس NOUN N------S1R Case=Nom|Definite=Cons|Number=Sing 26 nmod 26:nmod:nom Vform=رَئِيسُ|Gloss=president,head,chairman|Root=r_'_s|Translit=raʾīsu|LTranslit=raʾīs
96
+ 30 شعبة شُعبَة NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 29 nmod 29:nmod:gen Vform=شُعبَةِ|Gloss=branch,subdivision|Root=^s_`_b|Translit=šuʿbati|LTranslit=šuʿbat
97
+ 31 الامن أَمن NOUN N------S2D Case=Gen|Definite=Def|Number=Sing 30 nmod 30:nmod:gen Vform=اَلأَمنِ|Gloss=security,safety|Root='_m_n|Translit=al-ʾamni|LTranslit=ʾamn
98
+ 32 السياسي سِيَاسِيّ ADJ A-----MS2D Case=Gen|Definite=Def|Gender=Masc|Number=Sing 31 amod 31:amod Vform=اَلسِّيَاسِيِّ|Gloss=political|Root=s_w_s|Translit=as-siyāsīyi|LTranslit=siyāsīy
99
+ 33 منصب مَنصِب NOUN N------S4R Case=Acc|Definite=Cons|Number=Sing 25 obj 25:obj Vform=مَنصِبَ|Gloss=post,position,office|Root=n_.s_b|Translit=manṣiba|LTranslit=manṣib
100
+ 34 وزير وَزِير NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 33 nmod 33:nmod:gen Vform=وَزِيرِ|Gloss=minister|Root=w_z_r|Translit=wazīri|LTranslit=wazīr
101
+ 35 الداخلية دَاخِلِيّ ADJ A-----FS2D Case=Gen|Definite=Def|Gender=Fem|Number=Sing 34 amod 34:amod SpaceAfter=No|Vform=اَلدَّاخِلِيَّةِ|Gloss=internal,domestic,interior,of_state|Root=d__h_l|Translit=ad-dāḫilīyati|LTranslit=dāḫilīy
102
+ 36 . . PUNCT G--------- _ 1 punct 1:punct Vform=.|Translit=.
103
+
104
+ # newpar id = assabah.20041005.0017:p4
105
+ # sent_id = assabah.20041005.0017:p4u1
106
+ # text = وذكرت وكالة الانباء السورية ان التعديل شمل ثماني حقائب بينها وزارتا الداخلية والاقتصاد.
107
+ # orig_file_sentence ASB_ARB_20041005.0017#4
108
+ 1-2 وذكرت _ _ _ _ _ _ _ _
109
+ 1 و وَ CCONJ C--------- _ 0 root 0:root Vform=وَ|Gloss=and|Root=wa|Translit=wa|LTranslit=wa
110
+ 2 ذكرت ذَكَر VERB VP-A-3FS-- Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Voice=Act 1 parataxis 1:parataxis Vform=ذَكَرَت|Gloss=mention,cite,remember|Root=_d_k_r|Translit=ḏakarat|LTranslit=ḏakar
111
+ 3 وكالة وِكَالَة NOUN N------S1R Case=Nom|Definite=Cons|Number=Sing 2 nsubj 2:nsubj Vform=وِكَالَةُ|Gloss=agency|Root=w_k_l|Translit=wikālatu|LTranslit=wikālat
112
+ 4 الانباء نَبَأ NOUN N------P2D Case=Gen|Definite=Def|Number=Plur 3 nmod 3:nmod:gen Vform=اَلأَنبَاءِ|Gloss=news_item,report|Root=n_b_'|Translit=al-ʾanbāʾi|LTranslit=nabaʾ
113
+ 5 السورية سُورِيّ ADJ A-----FS1D Case=Nom|Definite=Def|Gender=Fem|Number=Sing 3 amod 3:amod Vform=اَلسُّورِيَّةُ|Gloss=Syrian|Root=sUr|Translit=as-sūrīyatu|LTranslit=sūrīy
114
+ 6 ان أَنَّ SCONJ C--------- _ 8 mark 8:mark Vform=أَنَّ|Gloss=that|Root='_n|Translit=ʾanna|LTranslit=ʾanna
115
+ 7 التعديل تَعدِيل NOUN N------S4D Case=Acc|Definite=Def|Number=Sing 8 obl 8:obl:acc Vform=اَلتَّعدِيلَ|Gloss=adjustment,change,modification,amendment|Root=`_d_l|Translit=at-taʿdīla|LTranslit=taʿdīl
116
+ 8 شمل شَمِل VERB VP-A-3MS-- Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Voice=Act 2 ccomp 2:ccomp Vform=شَمِلَ|Gloss=comprise,include,contain|Root=^s_m_l|Translit=šamila|LTranslit=šamil
117
+ 9 ثماني ثَمَانُون NUM QL------4R Case=Acc|Definite=Cons|NumForm=Word 8 obj 8:obj Vform=ثَمَانِي|Gloss=eighty|Root=_t_m_n|Translit=ṯamānī|LTranslit=ṯamānūn
118
+ 10 حقائب حَقِيبَة NOUN N------P2I Case=Gen|Definite=Ind|Number=Plur 9 nmod 9:nmod:gen Vform=حَقَائِبَ|Gloss=briefcase,suitcase,portfolio,luggage|Root=.h_q_b|Translit=ḥaqāʾiba|LTranslit=ḥaqībat
119
+ 11-12 بينها _ _ _ _ _ _ _ _
120
+ 11 بين بَينَ ADP PI------4- AdpType=Prep|Case=Acc 12 case 12:case Vform=بَينَ|Gloss=between,among|Root=b_y_n|Translit=bayna|LTranslit=bayna
121
+ 12 ها هُوَ PRON SP---3FS2- Case=Gen|Gender=Fem|Number=Sing|Person=3|PronType=Prs 10 obl 10:obl:بَينَ:gen Vform=هَا|Gloss=he,she,it|Translit=hā|LTranslit=huwa
122
+ 13 وزارتا وِزَارَة NOUN N------D1R Case=Nom|Definite=Cons|Number=Dual 12 nsubj 12:nsubj Vform=وِزَارَتَا|Gloss=ministry|Root=w_z_r|Translit=wizāratā|LTranslit=wizārat
123
+ 14 الداخلية دَاخِلِيّ ADJ A-----FS2D Case=Gen|Definite=Def|Gender=Fem|Number=Sing 13 amod 13:amod Vform=اَلدَّاخِلِيَّةِ|Gloss=internal,domestic,interior,of_state|Root=d__h_l|Translit=ad-dāḫilīyati|LTranslit=dāḫilīy
124
+ 15-16 والاقتصاد _ _ _ _ _ _ _ SpaceAfter=No
125
+ 15 و وَ CCONJ C--------- _ 16 cc 16:cc Vform=وَ|Gloss=and|Root=wa|Translit=wa|LTranslit=wa
126
+ 16 الاقتصاد اِقتِصَاد NOUN N------S2D Case=Gen|Definite=Def|Number=Sing 14 conj 13:amod|14:conj Vform=اَلِاقتِصَادِ|Gloss=economy,saving|Root=q_.s_d|Translit=al-i-ʼqtiṣādi|LTranslit=iqtiṣād
127
+ 17 . . PUNCT G--------- _ 1 punct 1:punct Vform=.|Translit=.
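These demo treebank files can be read back into stanza Document objects; a minimal sketch, assuming the CoNLL reader added in this commit (stanza/stanza/utils/conll.py) exposes its usual conll2doc helper.

    from stanza.utils.conll import CoNLL

    # Load the Arabic demo file and inspect the first parsed sentence.
    doc = CoNLL.conll2doc("stanza/demo/arabic_test.conllu.txt")
    print(len(doc.sentences), "sentences")
    for word in doc.sentences[0].words:
        print(word.id, word.text, word.upos, word.head, word.deprel, sep="\t")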
stanza/demo/corenlp.py ADDED
@@ -0,0 +1,95 @@
1
+ from stanza.server import CoreNLPClient
2
+
3
+ # example text
4
+ print('---')
5
+ print('input text')
6
+ print('')
7
+
8
+ text = "Chris Manning is a nice person. Chris wrote a simple sentence. He also gives oranges to people."
9
+
10
+ print(text)
11
+
12
+ # set up the client
13
+ print('---')
14
+ print('starting up Java Stanford CoreNLP Server...')
15
+
16
+ # set up the client
17
+ with CoreNLPClient(annotators=['tokenize','ssplit','pos','lemma','ner','parse','depparse','coref'], timeout=60000, memory='16G') as client:
18
+ # submit the request to the server
19
+ ann = client.annotate(text)
20
+
21
+ # get the first sentence
22
+ sentence = ann.sentence[0]
23
+
24
+ # get the dependency parse of the first sentence
25
+ print('---')
26
+ print('dependency parse of first sentence')
27
+ dependency_parse = sentence.basicDependencies
28
+ print(dependency_parse)
29
+
30
+ # get the constituency parse of the first sentence
31
+ print('---')
32
+ print('constituency parse of first sentence')
33
+ constituency_parse = sentence.parseTree
34
+ print(constituency_parse)
35
+
36
+ # get the first subtree of the constituency parse
37
+ print('---')
38
+ print('first subtree of constituency parse')
39
+ print(constituency_parse.child[0])
40
+
41
+ # get the value of the first subtree
42
+ print('---')
43
+ print('value of first subtree of constituency parse')
44
+ print(constituency_parse.child[0].value)
45
+
46
+ # get the first token of the first sentence
47
+ print('---')
48
+ print('first token of first sentence')
49
+ token = sentence.token[0]
50
+ print(token)
51
+
52
+ # get the part-of-speech tag
53
+ print('---')
54
+ print('part of speech tag of token')
55
+ token.pos
56
+ print(token.pos)
57
+
58
+ # get the named entity tag
59
+ print('---')
60
+ print('named entity tag of token')
61
+ print(token.ner)
62
+
63
+ # get an entity mention from the first sentence
64
+ print('---')
65
+ print('first entity mention in sentence')
66
+ print(sentence.mentions[0])
67
+
68
+ # access the coref chain
69
+ print('---')
70
+ print('coref chains for the example')
71
+ print(ann.corefChain)
72
+
73
+ # Use tokensregex patterns to find who wrote a sentence.
74
+ pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/'
75
+ matches = client.tokensregex(text, pattern)
76
+ # sentences contains a list with matches for each sentence.
77
+ assert len(matches["sentences"]) == 3
78
+ # length tells you whether or not there are any matches in this
79
+ assert matches["sentences"][1]["length"] == 1
80
+ # You can access matches like most regex groups.
81
+ matches["sentences"][1]["0"]["text"] == "Chris wrote a simple sentence"
82
+ matches["sentences"][1]["0"]["1"]["text"] == "Chris"
83
+
84
+ # Use semgrex patterns to directly find who wrote what.
85
+ pattern = '{word:wrote} >nsubj {}=subject >obj {}=object'
86
+ matches = client.semgrex(text, pattern)
87
+ # sentences contains a list with matches for each sentence.
88
+ assert len(matches["sentences"]) == 3
89
+ # length tells you whether or not there are any matches in this
90
+ assert matches["sentences"][1]["length"] == 1
91
+ # You can access matches like most regex groups.
92
+ matches["sentences"][1]["0"]["text"] == "wrote"
93
+ matches["sentences"][1]["0"]["$subject"]["text"] == "Chris"
94
+ matches["sentences"][1]["0"]["$object"]["text"] == "sentence"
95
+
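This demo needs a local CoreNLP distribution, which the client locates through the CORENLP_HOME environment variable (the test workflow above exports it the same way). A minimal pre-flight sketch; the path is a placeholder, not part of this commit.

    import os

    # Point the client at an unzipped CoreNLP distribution before running the demo above.
    os.environ["CORENLP_HOME"] = os.path.expanduser("~/stanford-corenlp")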
stanza/demo/japanese_test.conllu.txt ADDED
@@ -0,0 +1,82 @@
1
+ # newdoc id = test-s1
2
+ # sent_id = test-s1
3
+ # text = これに不快感を示す住民はいましたが,現在,表立って反対や抗議の声を挙げている住民はいないようです。
4
+ 1 これ 此れ PRON 代名詞 _ 6 obl _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=代名詞|SpaceAfter=No|UnidicInfo=,此れ,これ,これ,コレ,,,コレ,コレ,此れ
5
+ 2 に に ADP 助詞-格助詞 _ 1 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-格助詞|SpaceAfter=No|UnidicInfo=,に,に,に,ニ,,,ニ,ニ,に
6
+ 3 不快 不快 NOUN 名詞-普通名詞-形状詞可能 _ 4 compound _ BunsetuBILabel=B|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,不快,不快,不快,フカイ,,,フカイ,フカイカン,不快感
7
+ 4 感 感 NOUN 名詞-普通名詞-一般 _ 6 obj _ BunsetuBILabel=I|BunsetuPositionType=SEM_HEAD|LUWBILabel=I|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,感,感,感,カン,,,カン,フカイカン,不快感
8
+ 5 を を ADP 助詞-格助詞 _ 4 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-格助詞|SpaceAfter=No|UnidicInfo=,を,を,を,オ,,,ヲ,ヲ,を
9
+ 6 示す 示す VERB 動詞-一般-五段-サ行 _ 7 acl _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=動詞-一般-五段-サ行|SpaceAfter=No|UnidicInfo=,示す,示す,示す,シメス,,,シメス,シメス,示す
10
+ 7 住民 住民 NOUN 名詞-普通名詞-一般 _ 9 nsubj _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,住民,住民,住民,ジューミン,,,ジュウミン,ジュウミン,住民
11
+ 8 は は ADP 助詞-係助詞 _ 7 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-係助詞|SpaceAfter=No|UnidicInfo=,は,は,は,ワ,,,ハ,ハ,は
12
+ 9 い 居る VERB 動詞-非自立可能-上一段-ア行 _ 29 advcl _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=動詞-一般-上一段-ア行|PrevUDLemma=いる|SpaceAfter=No|UnidicInfo=,居る,い,いる,イ,,,イル,イル,居る
13
+ 10 まし ます AUX 助動詞-助動詞-マス _ 9 aux _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助動詞-助動詞-マス|SpaceAfter=No|UnidicInfo=,ます,まし,ます,マシ,,,マス,マス,ます
14
+ 11 た た AUX 助動詞-助動詞-タ _ 9 aux _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=B|LUWPOS=助動詞-助動詞-タ|SpaceAfter=No|UnidicInfo=,た,た,た,タ,,,タ,タ,た
15
+ 12 が が SCONJ 助詞-接続助詞 _ 9 mark _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=B|LUWPOS=助詞-接続助詞|SpaceAfter=No|UnidicInfo=,が,が,が,ガ,,,ガ,ガ,が
16
+ 13 , , PUNCT 補助記号-読点 _ 9 punct _ BunsetuBILabel=I|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=補助記号-読点|SpaceAfter=No|UnidicInfo=,,,,,,,,,,,
17
+ 14 現在 現在 ADV 名詞-普通名詞-副詞可能 _ 16 advmod _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=副詞|SpaceAfter=No|UnidicInfo=,現在,現在,現在,ゲンザイ,,,ゲンザイ,ゲンザイ,現在
18
+ 15 , , PUNCT 補助記号-読点 _ 14 punct _ BunsetuBILabel=I|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=補助記号-読点|SpaceAfter=No|UnidicInfo=,,,,,,,,,,,
19
+ 16 表立っ 表立つ VERB 動詞-一般-五段-タ行 _ 24 advcl _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=動詞-一般-五段-タ行|SpaceAfter=No|UnidicInfo=,表立つ,表立っ,表立つ,オモテダッ,,,オモテダツ,オモテダツ,表立つ
20
+ 17 て て SCONJ 助詞-接続助詞 _ 16 mark _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-接続助詞|SpaceAfter=No|UnidicInfo=,て,て,て,テ,,,テ,テ,て
21
+ 18 反対 反対 NOUN 名詞-普通名詞-サ変形状詞可能 _ 20 nmod _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,反対,反対,反対,ハンタイ,,,ハンタイ,ハンタイ,反対
22
+ 19 や や ADP 助詞-副助詞 _ 18 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-副助詞|SpaceAfter=No|UnidicInfo=,や,や,や,ヤ,,,ヤ,ヤ,や
23
+ 20 抗議 抗議 NOUN 名詞-普通名詞-サ変可能 _ 22 nmod _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,抗議,抗議,抗議,コーギ,,,コウギ,コウギ,抗議
24
+ 21 の の ADP 助詞-格助詞 _ 20 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-格助詞|SpaceAfter=No|UnidicInfo=,の,の,の,ノ,,,ノ,ノ,の
25
+ 22 声 声 NOUN 名詞-普通名詞-一般 _ 24 obj _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,声,声,声,コエ,,,コエ,コエ,声
26
+ 23 を を ADP 助詞-格助詞 _ 22 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-格助詞|SpaceAfter=No|UnidicInfo=,を,を,を,オ,,,ヲ,ヲ,を
27
+ 24 挙げ 上げる VERB 動詞-非自立可能-下一段-ガ行 _ 27 acl _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=動詞-一般-下一段-ガ行|SpaceAfter=No|UnidicInfo=,上げる,挙げ,挙げる,アゲ,,,アゲル,アゲル,上げる
28
+ 25 て て SCONJ 助詞-接続助詞 _ 24 mark _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=B|LUWPOS=助動詞-上一段-ア行|SpaceAfter=No|UnidicInfo=,て,て,て,テ,,,テ,テイル,ている
29
+ 26 いる 居る VERB 動詞-非自立可能-上一段-ア行 _ 25 fixed _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=I|LUWPOS=助動詞-上一段-ア行|PrevUDLemma=いる|SpaceAfter=No|UnidicInfo=,居る,いる,いる,イル,,,イル,テイル,ている
30
+ 27 住民 住民 NOUN 名詞-普通名詞-一般 _ 29 nsubj _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,住民,住民,住民,ジューミン,,,ジュウミン,ジュウミン,住民
31
+ 28 は は ADP 助詞-係助詞 _ 27 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-係助詞|SpaceAfter=No|UnidicInfo=,は,は,は,ワ,,,ハ,ハ,は
32
+ 29 い 居る VERB 動詞-非自立可能-上一段-ア行 _ 0 root _ BunsetuBILabel=B|BunsetuPositionType=ROOT|LUWBILabel=B|LUWPOS=動詞-一般-上一段-ア行|PrevUDLemma=いる|SpaceAfter=No|UnidicInfo=,居る,い,いる,イ,,,イル,イル,居る
33
+ 30 ない ない AUX 助動詞-助動詞-ナイ Polarity=Neg 29 aux _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助動詞-助動詞-ナイ|SpaceAfter=No|UnidicInfo=,ない,ない,ない,ナイ,,,ナイ,ナイ,ない
34
+ 31 よう 様 AUX 形状詞-助動詞語幹 _ 29 aux _ BunsetuBILabel=I|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=形状詞-助動詞語幹|PrevUDLemma=よう|SpaceAfter=No|UnidicInfo=,様,よう,よう,ヨー,,,ヨウ,ヨウ,様
35
+ 32 です です AUX 助動詞-助動詞-デス _ 29 aux _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=B|LUWPOS=助動詞-助動詞-デス|PrevUDLemma=だ|SpaceAfter=No|UnidicInfo=,です,です,です,デス,,,デス,デス,です
36
+ 33 。 。 PUNCT 補助記号-句点 _ 29 punct _ BunsetuBILabel=I|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=補助記号-句点|SpaceAfter=Yes|UnidicInfo=,。,。,。,,,,,,。
37
+
38
+ # newdoc id = test-s2
39
+ # sent_id = test-s2
40
+ # text = 幸福の科学側からは,特にどうしてほしいという要望はいただいていません。
41
+ 1 幸福 幸福 NOUN 名詞-普通名詞-形状詞可能 _ 4 nmod _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,幸福,幸福,幸福,コーフク,,,コウフク,コウフクノカガクガワ,幸福の科学側
42
+ 2 の の ADP 助詞-格助詞 _ 1 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=I|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,の,の,の,ノ,,,ノ,コウフクノカガクガワ,幸福の科学側
43
+ 3 科学 科学 NOUN 名詞-普通名詞-サ変可能 _ 4 compound _ BunsetuBILabel=B|BunsetuPositionType=CONT|LUWBILabel=I|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,科学,科学,科学,カガク,,,カガク,コウフクノカガクガワ,幸福の科学側
44
+ 4 側 側 NOUN 名詞-普通名詞-一般 _ 17 obl _ BunsetuBILabel=I|BunsetuPositionType=SEM_HEAD|LUWBILabel=I|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,側,側,側,ガワ,,,ガワ,コウフクノカガクガワ,幸福の科学側
45
+ 5 から から ADP 助詞-格助詞 _ 4 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-格助詞|SpaceAfter=No|UnidicInfo=,から,から,から,カラ,,,カラ,カラ,から
46
+ 6 は は ADP 助詞-係助詞 _ 4 case _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=B|LUWPOS=助詞-係助詞|SpaceAfter=No|UnidicInfo=,は,は,は,ワ,,,ハ,ハ,は
47
+ 7 , , PUNCT 補助記号-読点 _ 4 punct _ BunsetuBILabel=I|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=補助記号-読点|SpaceAfter=No|UnidicInfo=,,,,,,,,,,,
48
+ 8 特に 特に ADV 副詞 _ 17 advmod _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=副詞|SpaceAfter=No|UnidicInfo=,特に,特に,特に,トクニ,,,トクニ,トクニ,特に
49
+ 9 どう どう ADV 副詞 _ 15 advcl _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=動詞-一般-サ行変格|SpaceAfter=No|UnidicInfo=,どう,どう,どう,ドー,,,ドウ,ドウスル,どうする
50
+ 10 し 為る AUX 動詞-非自立可能-サ行変格 _ 9 aux _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=I|LUWPOS=動詞-一般-サ行変格|PrevUDLemma=する|SpaceAfter=No|UnidicInfo=,為る,し,する,シ,,,スル,ドウスル,どうする
51
+ 11 て て SCONJ 助詞-接続助詞 _ 9 mark _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=B|LUWPOS=助動詞-形容詞|SpaceAfter=No|UnidicInfo=,て,て,て,テ,,,テ,テホシイ,てほしい
52
+ 12 ほしい 欲しい AUX 形容詞-非自立可能-形容詞 _ 11 fixed _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=I|LUWPOS=助動詞-形容詞|PrevUDLemma=ほしい|SpaceAfter=No|UnidicInfo=,欲しい,ほしい,ほしい,ホシー,,,ホシイ,テホシイ,てほしい
53
+ 13 と と ADP 助詞-格助詞 _ 9 case _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=B|LUWPOS=助詞-格助詞|SpaceAfter=No|UnidicInfo=,と,と,と,ト,,,ト,トイウ,という
54
+ 14 いう 言う VERB 動詞-一般-五段-ワア行 _ 13 fixed _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=I|LUWPOS=助詞-格助詞|SpaceAfter=No|UnidicInfo=,言う,いう,いう,イウ,,,イウ,トイウ,という
55
+ 15 要望 要望 NOUN 名詞-普通名詞-サ変可能 _ 17 nsubj _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,要望,要望,要望,ヨーボー,,,ヨウボウ,ヨウボウ,要望
56
+ 16 は は ADP 助詞-係助詞 _ 15 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-係助詞|SpaceAfter=No|UnidicInfo=,は,は,は,ワ,,,ハ,ハ,は
57
+ 17 いただい 頂く VERB 動詞-非自立可能-五段-カ行 _ 0 root _ BunsetuBILabel=B|BunsetuPositionType=ROOT|LUWBILabel=B|LUWPOS=動詞-一般-五段-カ行|PrevUDLemma=いただく|SpaceAfter=No|UnidicInfo=,頂く,いただい,いただく,イタダイ,,,イタダク,イタダク,頂く
58
+ 18 て て SCONJ 助詞-接続助詞 _ 17 mark _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=B|LUWPOS=助動詞-上一段-ア行|SpaceAfter=No|UnidicInfo=,て,て,て,テ,,,テ,テイル,ている
59
+ 19 い 居る VERB 動詞-非自立可能-上一段-ア行 _ 18 fixed _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=I|LUWPOS=助動詞-上一段-ア行|PrevUDLemma=いる|SpaceAfter=No|UnidicInfo=,居る,い,いる,イ,,,イル,テイル,ている
60
+ 20 ませ ます AUX 助動詞-助動詞-マス _ 17 aux _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=B|LUWPOS=助動詞-助動詞-マス|SpaceAfter=No|UnidicInfo=,ます,ませ,ます,マセ,,,マス,マス,ます
61
+ 21 ん ず AUX 助動詞-助動詞-ヌ Polarity=Neg 17 aux _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助動詞-助動詞-ヌ|PrevUDLemma=ぬ|SpaceAfter=No|UnidicInfo=,ず,ん,ぬ,ン,,,ヌ,ズ,ず
62
+ 22 。 。 PUNCT 補助記号-句点 _ 17 punct _ BunsetuBILabel=I|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=補助記号-句点|SpaceAfter=Yes|UnidicInfo=,。,。,。,,,,,,。
63
+
64
+ # newdoc id = test-s3
65
+ # sent_id = test-s3
66
+ # text = 星取り参加は当然とされ,不参加は白眼視される。
67
+ 1 星取り 星取り NOUN 名詞-普通名詞-一般 _ 2 compound _ BunsetuBILabel=B|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,星取り,星取り,星取り,ホシトリ,,,ホシトリ,ホシトリサンカ,星取り参加
68
+ 2 参加 参加 NOUN 名詞-普通名詞-サ変可能 _ 4 nsubj _ BunsetuBILabel=I|BunsetuPositionType=SEM_HEAD|LUWBILabel=I|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,参加,参加,参加,サンカ,,,サンカ,ホシトリサンカ,星取り参加
69
+ 3 は は ADP 助詞-係助詞 _ 2 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-係助詞|SpaceAfter=No|UnidicInfo=,は,は,は,ワ,,,ハ,ハ,は
70
+ 4 当然 当然 ADJ 形状詞-一般 _ 6 advcl _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=形状詞-一般|SpaceAfter=No|UnidicInfo=,当然,当然,当然,トーゼン,,,トウゼン,トウゼン,当然
71
+ 5 と と ADP 助詞-格助詞 _ 4 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-格助詞|SpaceAfter=No|UnidicInfo=,と,と,と,ト,,,ト,ト,と
72
+ 6 さ 為る VERB 動詞-非自立可能-サ行変格 _ 13 acl _ BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|LUWBILabel=B|LUWPOS=動詞-一般-サ行変格|PrevUDLemma=する|SpaceAfter=No|UnidicInfo=,為る,さ,する,サ,,,スル,スル,する
73
+ 7 れ れる AUX 助動詞-助動詞-レル _ 6 aux _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助動詞-助動詞-レル|SpaceAfter=No|UnidicInfo=,れる,れ,れる,レ,,,レル,レル,れる
74
+ 8 , , PUNCT 補助記号-読点 _ 6 punct _ BunsetuBILabel=I|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=補助記号-読点|SpaceAfter=No|UnidicInfo=,,,,,,,,,,,
75
+ 9 不 不 NOUN 接頭辞 Polarity=Neg 10 compound _ BunsetuBILabel=B|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,不,不,不,フ,,,フ,フサンカ,不参加
76
+ 10 参加 参加 NOUN 名詞-普通名詞-サ変可能 _ 13 nsubj _ BunsetuBILabel=I|BunsetuPositionType=SEM_HEAD|LUWBILabel=I|LUWPOS=名詞-普通名詞-一般|SpaceAfter=No|UnidicInfo=,参加,参加,参加,サンカ,,,サンカ,フサンカ,不参加
77
+ 11 は は ADP 助詞-係助詞 _ 10 case _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助詞-係助詞|SpaceAfter=No|UnidicInfo=,は,は,は,ワ,,,ハ,ハ,は
78
+ 12 白眼 白眼 NOUN 名詞-普通名詞-一般 _ 13 compound _ BunsetuBILabel=B|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=動詞-一般-サ行変格|SpaceAfter=No|UnidicInfo=,白眼,白眼,白眼,ハクガン,,,ハクガン,ハクガンシスル,白眼視する
79
+ 13 視 視 NOUN 接尾辞-名詞的-サ変可能 _ 0 root _ BunsetuBILabel=I|BunsetuPositionType=ROOT|LUWBILabel=I|LUWPOS=動詞-一般-サ行変格|SpaceAfter=No|UnidicInfo=,視,視,視,シ,,,シ,ハクガンシスル,白眼視する
80
+ 14 さ 為る AUX 動詞-非自立可能-サ行変格 _ 13 aux _ BunsetuBILabel=I|BunsetuPositionType=FUNC|LUWBILabel=I|LUWPOS=動詞-一般-サ行変格|PrevUDLemma=する|SpaceAfter=No|UnidicInfo=,為る,さ,する,サ,,,スル,ハクガンシスル,白眼視する
81
+ 15 れる れる AUX 助動詞-助動詞-レル _ 13 aux _ BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|LUWBILabel=B|LUWPOS=助動詞-助動詞-レル|SpaceAfter=No|UnidicInfo=,れる,れる,れる,レル,,,レル,レル,れる
82
+ 16 。 。 PUNCT 補助記号-句点 _ 13 punct _ BunsetuBILabel=I|BunsetuPositionType=CONT|LUWBILabel=B|LUWPOS=補助記号-句点|SpaceAfter=Yes|UnidicInfo=,。,。,。,,,,,,。
stanza/demo/pipeline_demo.py ADDED
@@ -0,0 +1,52 @@
1
+ """
2
+ A basic demo of the Stanza neural pipeline.
3
+ """
4
+
5
+ import sys
6
+ import argparse
7
+ import os
8
+
9
+ import stanza
10
+ from stanza.resources.common import DEFAULT_MODEL_DIR
11
+
12
+
13
+ if __name__ == '__main__':
14
+ # get arguments
15
+ parser = argparse.ArgumentParser()
16
+ parser.add_argument('-d', '--models_dir', help='location of models files | default: ~/stanza_resources',
17
+ default=DEFAULT_MODEL_DIR)
18
+ parser.add_argument('-l', '--lang', help='Demo language',
19
+ default="en")
20
+ parser.add_argument('-c', '--cpu', action='store_true', help='Use cpu as the device.')
21
+ args = parser.parse_args()
22
+
23
+ example_sentences = {"en": "Barack Obama was born in Hawaii. He was elected president in 2008.",
24
+ "zh": "中国文化经历上千年的历史演变,是各区域、各民族古代文化长期相互交流、借鉴、融合的结果。",
25
+ "fr": "Van Gogh grandit au sein d'une famille de l'ancienne bourgeoisie. Il tente d'abord de faire carrière comme marchand d'art chez Goupil & C.",
26
+ "vi": "Trận Trân Châu Cảng (hay Chiến dịch Hawaii theo cách gọi của Bộ Tổng tư lệnh Đế quốc Nhật Bản) là một đòn tấn công quân sự bất ngờ được Hải quân Nhật Bản thực hiện nhằm vào căn cứ hải quân của Hoa Kỳ tại Trân Châu Cảng thuộc tiểu bang Hawaii vào sáng Chủ Nhật, ngày 7 tháng 12 năm 1941, dẫn đến việc Hoa Kỳ sau đó quyết định tham gia vào hoạt động quân sự trong Chiến tranh thế giới thứ hai."}
27
+
28
+ if args.lang not in example_sentences:
29
+ print(f'Sorry, but we don\'t have a demo sentence for "{args.lang}" for the moment. Try one of these languages: {list(example_sentences.keys())}')
30
+ sys.exit(1)
31
+
32
+ # download the models
33
+ stanza.download(args.lang, dir=args.models_dir)
34
+ # set up a pipeline
35
+ print('---')
36
+ print('Building pipeline...')
37
+ pipeline = stanza.Pipeline(lang=args.lang, dir=args.models_dir, use_gpu=(not args.cpu))
38
+ # process the document
39
+ doc = pipeline(example_sentences[args.lang])
40
+ # access nlp annotations
41
+ print('')
42
+ print('Input: {}'.format(example_sentences[args.lang]))
43
+ print("The tokenizer split the input into {} sentences.".format(len(doc.sentences)))
44
+ print('---')
45
+ print('tokens of first sentence: ')
46
+ doc.sentences[0].print_tokens()
47
+ print('')
48
+ print('---')
49
+ print('dependency parse of first sentence: ')
50
+ doc.sentences[0].print_dependencies()
51
+ print('')
52
+
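The demo above prints its annotations with the built-in print_tokens and print_dependencies helpers. As a rough sketch of reading the same annotations programmatically (standard Document/Sentence/Word accessors; which fields are populated depends on the processors that were loaded):

    import stanza

    nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,depparse")
    doc = nlp("Barack Obama was born in Hawaii.")
    for sentence in doc.sentences:
        for word in sentence.words:
            # head == 0 marks the root of the dependency tree
            print(word.id, word.text, word.lemma, word.upos, word.head, word.deprel)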
stanza/demo/scenegraph.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ Very short demo for the SceneGraph interface in the CoreNLP server
3
+
4
+ Requires CoreNLP >= 4.5.5, Stanza >= 1.5.1
5
+ """
6
+
7
+ import json
8
+
9
+ from stanza.server import CoreNLPClient
10
+
11
+ # start_server=None if you have the server running in another process on the same host
12
+ # you can start it with whatever normal options CoreNLPClient has
13
+ #
14
+ # preload=False avoids having the server unnecessarily load annotators
15
+ # if you don't plan on using them
16
+ with CoreNLPClient(preload=False) as client:
17
+ result = client.scenegraph("Jennifer's antennae are on her head.")
18
+ print(json.dumps(result, indent=2))
19
+
20
+
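Following the start_server comment in the demo above, here is a sketch of making the same call against a CoreNLP server that is already running in another process on the same host (the input sentence is only a placeholder):

    import json
    from stanza.server import CoreNLPClient

    # attach to the already-running server instead of launching a new one
    with CoreNLPClient(start_server=None, preload=False) as client:
        result = client.scenegraph("The dog chased the ball across the yard.")
        print(json.dumps(result, indent=2))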
stanza/demo/semgrex.py ADDED
@@ -0,0 +1,18 @@
1
+ import stanza
2
+ from stanza.server.semgrex import Semgrex
3
+
4
+ nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
5
+
6
+ doc = nlp("Banning opal removed all artifact decks from the meta. I miss playing lantern.")
7
+ with Semgrex(classpath="$CLASSPATH") as sem:
8
+ semgrex_results = sem.process(doc,
9
+ "{pos:NN}=object <obl {}=action",
10
+ "{cpos:NOUN}=thing <obj {cpos:VERB}=action")
11
+ print("COMPLETE RESULTS")
12
+ print(semgrex_results)
13
+
14
+ print("Number of matches in graph 0 ('Banning opal...') for semgrex query 1 (thing <obj action): %d" % len(semgrex_results.result[0].result[1].match))
15
+ for match_idx, match in enumerate(semgrex_results.result[0].result[1].match):
16
+ print("Match {}:\n-----------\n{}".format(match_idx, match))
17
+
18
+ print("graph 1 for semgrex query 0 is an empty match: len %d" % len(semgrex_results.result[1].result[0].match))
stanza/demo/semgrex_sample.conllu ADDED
@@ -0,0 +1,24 @@
1
+
2
+ # sent_id = reviews-181748-0003
3
+ # text = My experience was awful though.
4
+ 1 My my PRON PRP$ Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs 2 nmod:poss 2:nmod:poss _
5
+ 2 experience experience NOUN NN Number=Sing 4 nsubj 4:nsubj _
6
+ 3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 cop 4:cop _
7
+ 4 awful awful ADJ JJ Degree=Pos 0 root 0:root _
8
+ 5 though though ADV RB _ 4 advmod 4:advmod SpaceAfter=No
9
+ 6 . . PUNCT . _ 4 punct 4:punct _
10
+
11
+
12
+
13
+ # sent_id = reviews-117115-0005
14
+ # text = The intruders slit the screen of the window.
15
+ 1 The the DET DT Definite=Def|PronType=Art 2 det 2:det _
16
+ 2 intruders intruder NOUN NNS Number=Plur 3 nsubj 3:nsubj _
17
+ 3 slit slit VERB VBD Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin 0 root 0:root _
18
+ 4 the the DET DT Definite=Def|PronType=Art 5 det 5:det _
19
+ 5 screen screen NOUN NN Number=Sing 3 obj 3:obj _
20
+ 6 of of ADP IN _ 8 case 8:case _
21
+ 7 the the DET DT Definite=Def|PronType=Art 8 det 8:det _
22
+ 8 window window NOUN NN Number=Sing 5 nmod 5:nmod:of SpaceAfter=No
23
+ 9 . . PUNCT . _ 3 punct 3:punct _
24
+
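This sample file pairs with the semgrex demo above: sentences that already carry dependency parses can be loaded straight from CoNLL-U and queried, skipping the pipeline. A sketch, assuming a pre-annotated Document is accepted by Semgrex the same way as pipeline output:

    from stanza.utils.conll import CoNLL
    from stanza.server.semgrex import Semgrex

    # load the pre-parsed sentences from this file (path relative to the repo root)
    doc = CoNLL.conll2doc(input_file="demo/semgrex_sample.conllu")
    with Semgrex(classpath="$CLASSPATH") as sem:
        results = sem.process(doc, "{pos:NN}=object <obl {}=action")
        print(results)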
stanza/demo/ssurgeon_script.txt ADDED
@@ -0,0 +1,18 @@
1
+ # To run this, use the stanza/server/ssurgeon.py main file.
2
+ # For example:
3
+ # python3 stanza/server/ssurgeon.py --edit_file demo/ssurgeon_script.txt --no_print_input --input_file ../data/ud2_11/UD_English-Pronouns/en_pronouns-ud-test.conllu > en_pronouns.updated.conllu
4
+ # This script updates the UD 2.11 version of UD_English-Pronouns to
5
+ # better match punctuation attachments, MWT, and no double subjects.
6
+
7
+ # This turns unwanted csubj into advcl
8
+ {}=source >nsubj {} >csubj=bad {}
9
+ relabelNamedEdge -edge bad -reln advcl
10
+
11
+ # This detects punctuations which are not attached to the root and reattaches them
12
+ {word:/[.]/}=punct <punct=bad {}=parent << {$}=root : {}=parent << {}=root
13
+ removeNamedEdge -edge bad
14
+ addEdge -gov root -dep punct -reln punct
15
+
16
+ # This detects the specific MWT found in the 2.11 dataset
17
+ {}=first . {word:/'s|n't|'ll/}=second
18
+ combineMWT -node first -node second
stanza/doc/CoreNLP.proto ADDED
@@ -0,0 +1,873 @@
1
+ syntax = "proto2";
2
+
3
+ package edu.stanford.nlp.pipeline;
4
+
5
+ option java_package = "edu.stanford.nlp.pipeline";
6
+ option java_outer_classname = "CoreNLPProtos";
7
+
8
+ //
9
+ // From JAVANLP_HOME, you can build me with the command:
10
+ //
11
+ // protoc -I=src/edu/stanford/nlp/pipeline/ --java_out=src src/edu/stanford/nlp/pipeline/CoreNLP.proto
12
+ //
13
+
14
+ //
15
+ // To do the python version:
16
+ //
17
+ // protoc -I=./doc --python_out=./stanza/protobuf ./doc/CoreNLP.proto
18
+ //
19
+
20
+ //
21
+ // An enumeration for the valid languages allowed in CoreNLP
22
+ //
23
+ enum Language {
24
+ Unknown = 0;
25
+ Any = 1;
26
+ Arabic = 2;
27
+ Chinese = 3;
28
+ English = 4;
29
+ German = 5;
30
+ French = 6;
31
+ Hebrew = 7;
32
+ Spanish = 8;
33
+ UniversalEnglish = 9;
34
+ UniversalChinese = 10;
35
+ }
36
+
37
+ //
38
+ // A document; that is, the equivalent of an Annotation.
39
+ //
40
+ message Document {
41
+ required string text = 1;
42
+ repeated Sentence sentence = 2;
43
+ repeated CorefChain corefChain = 3;
44
+ optional string docID = 4;
45
+ optional string docDate = 7;
46
+ optional uint64 calendar = 8;
47
+
48
+ /**
49
+ * A peculiar field, for the corner case when a Document is
50
+ * serialized without any sentences. Otherwise
51
+ */
52
+ repeated Token sentencelessToken = 5;
53
+ repeated Token character = 10;
54
+
55
+ repeated Quote quote = 6;
56
+
57
+ /**
58
+ * This field is for entity mentions across the document.
59
+ */
60
+ repeated NERMention mentions = 9;
61
+ optional bool hasEntityMentionsAnnotation = 13; // used to differentiate between null and empty list
62
+
63
+ /**
64
+ * xml information
65
+ */
66
+ optional bool xmlDoc = 11;
67
+ repeated Section sections = 12;
68
+
69
+ /** coref mentions for entire document **/
70
+ repeated Mention mentionsForCoref = 14;
71
+ optional bool hasCorefMentionAnnotation = 15;
72
+ optional bool hasCorefAnnotation = 16;
73
+ repeated int32 corefMentionToEntityMentionMappings = 17;
74
+ repeated int32 entityMentionToCorefMentionMappings = 18;
75
+
76
+ extensions 100 to 255;
77
+ }
78
+
79
+ //
80
+ // The serialized version of a CoreMap representing a sentence.
81
+ //
82
+ message Sentence {
83
+ repeated Token token = 1;
84
+ required uint32 tokenOffsetBegin = 2;
85
+ required uint32 tokenOffsetEnd = 3;
86
+ optional uint32 sentenceIndex = 4;
87
+ optional uint32 characterOffsetBegin = 5;
88
+ optional uint32 characterOffsetEnd = 6;
89
+ optional ParseTree parseTree = 7;
90
+ optional ParseTree binarizedParseTree = 31;
91
+ optional ParseTree annotatedParseTree = 32;
92
+ optional string sentiment = 33;
93
+ repeated ParseTree kBestParseTrees = 34;
94
+ optional DependencyGraph basicDependencies = 8;
95
+ optional DependencyGraph collapsedDependencies = 9;
96
+ optional DependencyGraph collapsedCCProcessedDependencies = 10;
97
+ optional DependencyGraph alternativeDependencies = 13;
98
+ repeated RelationTriple openieTriple = 14; // The OpenIE triples in the sentence
99
+ repeated RelationTriple kbpTriple = 16; // The KBP triples in this sentence
100
+ repeated SentenceFragment entailedSentence = 15; // The entailed sentences, by natural logic
101
+ repeated SentenceFragment entailedClause = 35; // The entailed clauses, by natural logic
102
+ optional DependencyGraph enhancedDependencies = 17;
103
+ optional DependencyGraph enhancedPlusPlusDependencies = 18;
104
+ repeated Token character = 19;
105
+
106
+ optional uint32 paragraph = 11;
107
+
108
+ optional string text = 12; // Only needed if we're only saving the sentence.
109
+
110
+ optional uint32 lineNumber = 20;
111
+
112
+ // Fields set by other annotators in CoreNLP
113
+ optional bool hasRelationAnnotations = 51;
114
+ repeated Entity entity = 52;
115
+ repeated Relation relation = 53;
116
+ optional bool hasNumerizedTokensAnnotation = 54;
117
+ repeated NERMention mentions = 55;
118
+ repeated Mention mentionsForCoref = 56;
119
+ optional bool hasCorefMentionsAnnotation = 57;
120
+
121
+ optional string sentenceID = 58; // Useful when storing sentences (e.g. ForEach)
122
+ optional string sectionDate = 59; // date of section
123
+ optional uint32 sectionIndex = 60; // section index for this sentence's section
124
+ optional string sectionName = 61; // name of section
125
+ optional string sectionAuthor = 62; // author of section
126
+ optional string docID = 63; // doc id
127
+ optional bool sectionQuoted = 64; // is this sentence in an xml quote in a post
128
+
129
+ optional bool hasEntityMentionsAnnotation = 65; // check if there are entity mentions
130
+ optional bool hasKBPTriplesAnnotation = 68; // check if there are KBP triples
131
+ optional bool hasOpenieTriplesAnnotation = 69; // check if there are OpenIE triples
132
+
133
+ // quote stuff
134
+ optional uint32 chapterIndex = 66;
135
+ optional uint32 paragraphIndex = 67;
136
+ // the quote annotator can sometimes add merged sentences
137
+ optional Sentence enhancedSentence = 70;
138
+
139
+ // speaker stuff
140
+ optional string speaker = 71; // The speaker speaking this sentence
141
+ optional string speakerType = 72; // The type of speaker speaking this sentence
142
+
143
+ extensions 100 to 255;
144
+ }
145
+
146
+ //
147
+ // The serialized version of a Token (a CoreLabel).
148
+ //
149
+ message Token {
150
+ // Fields set by the default annotators [new CoreNLP(new Properties())]
151
+ optional string word = 1; // the word's gloss (post-tokenization)
152
+ optional string pos = 2; // The word's part of speech tag
153
+ optional string value = 3; // The word's 'value', (e.g., parse tree node)
154
+ optional string category = 4; // The word's 'category' (e.g., parse tree node)
155
+ optional string before = 5; // The whitespace/xml before the token
156
+ optional string after = 6; // The whitespace/xml after the token
157
+ optional string originalText = 7; // The original text for this token
158
+ optional string ner = 8; // The word's NER tag
159
+ optional string coarseNER = 62; // The word's coarse NER tag
160
+ optional string fineGrainedNER = 63; // The word's fine-grained NER tag
161
+ repeated string nerLabelProbs = 66; // listing of probs
162
+ optional string normalizedNER = 9; // The word's normalized NER tag
163
+ optional string lemma = 10; // The word's lemma
164
+ optional uint32 beginChar = 11; // The character offset begin, in the document
165
+ optional uint32 endChar = 12; // The character offset end, in the document
166
+ optional uint32 utterance = 13; // The utterance tag used in dcoref
167
+ optional string speaker = 14; // The speaker speaking this word
168
+ optional string speakerType = 77; // The type of speaker speaking this word
169
+ optional uint32 beginIndex = 15; // The begin index of, e.g., a span
170
+ optional uint32 endIndex = 16; // The begin index of, e.g., a span
171
+ optional uint32 tokenBeginIndex = 17; // The begin index of the token
172
+ optional uint32 tokenEndIndex = 18; // The end index of the token
173
+ optional Timex timexValue = 19; // The time this word refers to
174
+ optional bool hasXmlContext = 21; // Used by clean xml annotator
175
+ repeated string xmlContext = 22; // Used by clean xml annotator
176
+ optional uint32 corefClusterID = 23; // The [primary] cluster id for this token
177
+ optional string answer = 24; // A temporary annotation which is occasionally left in
178
+ // optional string projectedCategory = 25; // The syntactic category of the maximal constituent headed by the word. Not used anywhere, so deleted.
179
+ optional uint32 headWordIndex = 26; // The index of the head word of this word.
180
+ optional Operator operator = 27; // If this is an operator, which one is it and what is its scope (as per Natural Logic)?
181
+ optional Polarity polarity = 28; // The polarity of this word, according to Natural Logic
182
+ optional string polarity_dir = 39; // The polarity of this word, either "up", "down", or "flat"
183
+ optional Span span = 29; // The span of a leaf node of a tree
184
+ optional string sentiment = 30; // The final sentiment of the sentence
185
+ optional int32 quotationIndex = 31; // The index of the quotation this token refers to
186
+ optional MapStringString conllUFeatures = 32;
187
+ optional string coarseTag = 33; // The coarse POS tag (used to store the UPOS tag)
188
+ optional Span conllUTokenSpan = 34;
189
+ optional string conllUMisc = 35;
190
+ optional MapStringString conllUSecondaryDeps = 36;
191
+ optional string wikipediaEntity = 37;
192
+ optional bool isNewline = 38;
193
+
194
+
195
+ // Fields set by other annotators in CoreNLP
196
+ optional string gender = 51; // gender annotation (machine reading)
197
+ optional string trueCase = 52; // true case type of token
198
+ optional string trueCaseText = 53; // true case gloss of token
199
+
200
+ // Chinese character info
201
+ optional string chineseChar = 54;
202
+ optional string chineseSeg = 55;
203
+ optional string chineseXMLChar = 60;
204
+
205
+ // Arabic character info
206
+ optional string arabicSeg = 76;
207
+
208
+ // Section info
209
+ optional string sectionName = 56;
210
+ optional string sectionAuthor = 57;
211
+ optional string sectionDate = 58;
212
+ optional string sectionEndLabel = 59;
213
+
214
+ // French tokens have parents
215
+ optional string parent = 61;
216
+
217
+ // mention index info
218
+ repeated uint32 corefMentionIndex = 64;
219
+ optional uint32 entityMentionIndex = 65;
220
+
221
+ // mwt stuff
222
+ optional bool isMWT = 67;
223
+ optional bool isFirstMWT = 68;
224
+ optional string mwtText = 69;
225
+ // setting this to a map might be nice, but there are a couple issues
226
+ // for one, there can be values with no key
227
+ // for another, it's a pain to correctly parse, since different treebanks
228
+ // can have different standards for how to write out the misc field
229
+ optional string mwtMisc = 78;
230
+
231
+ // number info
232
+ optional uint64 numericValue = 70;
233
+ optional string numericType = 71;
234
+ optional uint64 numericCompositeValue = 72;
235
+ optional string numericCompositeType = 73;
236
+
237
+ optional uint32 codepointOffsetBegin = 74;
238
+ optional uint32 codepointOffsetEnd = 75;
239
+
240
+ // Fields in the CoreLabel java class that are moved elsewhere
241
+ // string text @see Document#text + character offsets
242
+ // uint32 sentenceIndex @see Sentence#sentenceIndex
243
+ // string docID @see Document#docID
244
+ // uint32 paragraph @see Sentence#paragraph
245
+
246
+ // Most serialized annotations will not have this
247
+ // Some code paths may not correctly process this if serialized,
248
+ // since many places will read the index off the position in a sentence
249
+ // In particular, deserializing a Document using ProtobufAnnotationSerializer
250
+ // will clobber any index value
251
+ // But Semgrex and Ssurgeon in particular need a way
252
+ // to pass around nodes where the node's index is not strictly 1, 2, 3, ...
253
+ // thanks to the empty nodes in UD treebanks such as
254
+ // English EWT or Estonian EWT (not related to each other)
255
+ optional uint32 index = 79;
256
+ optional uint32 emptyIndex = 80;
257
+
258
+ extensions 100 to 255;
259
+ }
260
+
261
+ //
262
+ // An enumeration of valid sentiment values for the sentiment classifier.
263
+ //
264
+ enum Sentiment {
265
+ STRONG_NEGATIVE = 0;
266
+ WEAK_NEGATIVE = 1;
267
+ NEUTRAL = 2;
268
+ WEAK_POSITIVE = 3;
269
+ STRONG_POSITIVE = 4;
270
+ }
271
+
272
+ //
273
+ // A quotation marker in text
274
+ //
275
+ message Quote {
276
+ optional string text = 1;
277
+ optional uint32 begin = 2;
278
+ optional uint32 end = 3;
279
+ optional uint32 sentenceBegin = 5;
280
+ optional uint32 sentenceEnd = 6;
281
+ optional uint32 tokenBegin = 7;
282
+ optional uint32 tokenEnd = 8;
283
+ optional string docid = 9;
284
+ optional uint32 index = 10;
285
+ optional string author = 11;
286
+ optional string mention = 12;
287
+ optional uint32 mentionBegin = 13;
288
+ optional uint32 mentionEnd = 14;
289
+ optional string mentionType = 15;
290
+ optional string mentionSieve = 16;
291
+ optional string speaker = 17;
292
+ optional string speakerSieve = 18;
293
+ optional string canonicalMention = 19;
294
+ optional uint32 canonicalMentionBegin = 20;
295
+ optional uint32 canonicalMentionEnd = 21;
296
+ optional DependencyGraph attributionDependencyGraph = 22;
297
+ }
298
+
299
+ //
300
+ // A syntactic parse tree, with scores.
301
+ //
302
+ message ParseTree {
303
+ repeated ParseTree child = 1;
304
+ optional string value = 2;
305
+ optional uint32 yieldBeginIndex = 3;
306
+ optional uint32 yieldEndIndex = 4;
307
+ optional double score = 5;
308
+ optional Sentiment sentiment = 6;
309
+ }
310
+
311
+ //
312
+ // A dependency graph representation.
313
+ //
314
+ message DependencyGraph {
315
+ message Node {
316
+ required uint32 sentenceIndex = 1;
317
+ required uint32 index = 2;
318
+ optional uint32 copyAnnotation = 3;
319
+ optional uint32 emptyIndex = 4;
320
+ }
321
+
322
+ message Edge {
323
+ required uint32 source = 1;
324
+ required uint32 target = 2;
325
+ optional string dep = 3;
326
+ optional bool isExtra = 4;
327
+ optional uint32 sourceCopy = 5;
328
+ optional uint32 targetCopy = 6;
329
+ optional uint32 sourceEmpty = 8;
330
+ optional uint32 targetEmpty = 9;
331
+ optional Language language = 7 [default=Unknown];
332
+ }
333
+
334
+ repeated Node node = 1;
335
+ repeated Edge edge = 2;
336
+ repeated uint32 root = 3 [packed=true];
337
+ // optional: if this graph message is not part of a larger context,
338
+ // the tokens will help reconstruct the actual sentence
339
+ repeated Token token = 4;
340
+ // The values in this field will index directly into the node list
341
+ // This is useful so that additional information such as emptyIndex
342
+ // can be considered without having to pass it around a second time
343
+ repeated uint32 rootNode = 5 [packed=true];
344
+ }
345
+
346
+ //
347
+ // A coreference chain.
348
+ // These fields are not *really* optional. CoreNLP will crash without them.
349
+ //
350
+ message CorefChain {
351
+ message CorefMention {
352
+ optional int32 mentionID = 1;
353
+ optional string mentionType = 2;
354
+ optional string number = 3;
355
+ optional string gender = 4;
356
+ optional string animacy = 5;
357
+ optional uint32 beginIndex = 6;
358
+ optional uint32 endIndex = 7;
359
+ optional uint32 headIndex = 9;
360
+ optional uint32 sentenceIndex = 10;
361
+ optional uint32 position = 11; // the second element of position
362
+ }
363
+
364
+ required int32 chainID = 1;
365
+ repeated CorefMention mention = 2;
366
+ required uint32 representative = 3;
367
+ }
368
+
369
+ //
370
+ // a mention
371
+ //
372
+
373
+ message Mention {
374
+ optional int32 mentionID = 1;
375
+ optional string mentionType = 2;
376
+ optional string number = 3;
377
+ optional string gender = 4;
378
+ optional string animacy = 5;
379
+ optional string person = 6;
380
+ optional uint32 startIndex = 7;
381
+ optional uint32 endIndex = 9;
382
+ optional int32 headIndex = 10;
383
+ optional string headString = 11;
384
+ optional string nerString = 12;
385
+ optional int32 originalRef = 13;
386
+ optional int32 goldCorefClusterID = 14;
387
+ optional int32 corefClusterID = 15;
388
+ optional int32 mentionNum = 16;
389
+ optional int32 sentNum = 17;
390
+ optional int32 utter = 18;
391
+ optional int32 paragraph = 19;
392
+ optional bool isSubject = 20;
393
+ optional bool isDirectObject = 21;
394
+ optional bool isIndirectObject = 22;
395
+ optional bool isPrepositionObject = 23;
396
+ optional bool hasTwin = 24;
397
+ optional bool generic = 25;
398
+ optional bool isSingleton = 26;
399
+ optional bool hasBasicDependency = 27;
400
+ optional bool hasEnhancedDependency = 28;
401
+ optional bool hasContextParseTree = 29;
402
+ optional IndexedWord headIndexedWord = 30;
403
+ optional IndexedWord dependingVerb = 31;
404
+ optional IndexedWord headWord = 32;
405
+ optional SpeakerInfo speakerInfo = 33;
406
+
407
+ repeated IndexedWord sentenceWords = 50;
408
+ repeated IndexedWord originalSpan = 51;
409
+ repeated string dependents = 52;
410
+ repeated string preprocessedTerms = 53;
411
+ repeated int32 appositions = 54;
412
+ repeated int32 predicateNominatives = 55;
413
+ repeated int32 relativePronouns = 56;
414
+ repeated int32 listMembers = 57;
415
+ repeated int32 belongToLists = 58;
416
+
417
+ }
418
+
419
+ //
420
+ // store the position (sentence, token index) of a CoreLabel
421
+ //
422
+
423
+ message IndexedWord {
424
+ optional int32 sentenceNum = 1;
425
+ optional int32 tokenIndex = 2;
426
+ optional int32 docID = 3;
427
+ optional uint32 copyCount = 4;
428
+ }
429
+
430
+ //
431
+ // speaker info, this is used for Mentions
432
+ //
433
+
434
+ message SpeakerInfo {
435
+ optional string speakerName = 1;
436
+ repeated int32 mentions = 2;
437
+ }
438
+
439
+ //
440
+ // A Span of text
441
+ //
442
+ message Span {
443
+ required uint32 begin = 1;
444
+ required uint32 end = 2;
445
+ }
446
+
447
+ //
448
+ // A Timex object, representing a temporal expression (TIMe EXpression)
449
+ // These fields are not *really* optional. CoreNLP will crash without them.
450
+ //
451
+ message Timex {
452
+ optional string value = 1;
453
+ optional string altValue = 2;
454
+ optional string text = 3;
455
+ optional string type = 4;
456
+ optional string tid = 5;
457
+ optional uint32 beginPoint = 6;
458
+ optional uint32 endPoint = 7;
459
+ }
460
+
461
+ //
462
+ // A representation of an entity in a relation.
463
+ // This corresponds to the EntityMention, and more broadly the
464
+ // ExtractionObject classes.
465
+ //
466
+ message Entity {
467
+ optional uint32 headStart = 6;
468
+ optional uint32 headEnd = 7;
469
+ optional string mentionType = 8;
470
+ optional string normalizedName = 9;
471
+ optional uint32 headTokenIndex = 10;
472
+ optional string corefID = 11;
473
+ // inherited from ExtractionObject
474
+ optional string objectID = 1;
475
+ optional uint32 extentStart = 2;
476
+ optional uint32 extentEnd = 3;
477
+ optional string type = 4;
478
+ optional string subtype = 5;
479
+ // Implicit
480
+ // uint32 sentence @see implicit in sentence
481
+ }
482
+
483
+ //
484
+ // A representation of a relation, mirroring RelationMention
485
+ //
486
+ message Relation {
487
+ repeated string argName = 6;
488
+ repeated Entity arg = 7;
489
+ optional string signature = 8;
490
+ // inherited from ExtractionObject
491
+ optional string objectID = 1;
492
+ optional uint32 extentStart = 2;
493
+ optional uint32 extentEnd = 3;
494
+ optional string type = 4;
495
+ optional string subtype = 5;
496
+ // Implicit
497
+ // uint32 sentence @see implicit in sentence
498
+ }
499
+
500
+ //
501
+ // A Natural Logic operator
502
+ //
503
+ message Operator {
504
+ required string name = 1;
505
+ required int32 quantifierSpanBegin = 2;
506
+ required int32 quantifierSpanEnd = 3;
507
+ required int32 subjectSpanBegin = 4;
508
+ required int32 subjectSpanEnd = 5;
509
+ required int32 objectSpanBegin = 6;
510
+ required int32 objectSpanEnd = 7;
511
+ }
512
+
513
+ //
514
+ // The seven informative Natural Logic relations
515
+ //
516
+ enum NaturalLogicRelation {
517
+ EQUIVALENCE = 0;
518
+ FORWARD_ENTAILMENT = 1;
519
+ REVERSE_ENTAILMENT = 2;
520
+ NEGATION = 3;
521
+ ALTERNATION = 4;
522
+ COVER = 5;
523
+ INDEPENDENCE = 6;
524
+ }
525
+
526
+ //
527
+ // The polarity of a word, according to Natural Logic
528
+ //
529
+ message Polarity {
530
+ required NaturalLogicRelation projectEquivalence = 1;
531
+ required NaturalLogicRelation projectForwardEntailment = 2;
532
+ required NaturalLogicRelation projectReverseEntailment = 3;
533
+ required NaturalLogicRelation projectNegation = 4;
534
+ required NaturalLogicRelation projectAlternation = 5;
535
+ required NaturalLogicRelation projectCover = 6;
536
+ required NaturalLogicRelation projectIndependence = 7;
537
+ }
538
+
539
+ //
540
+ // An NER mention in the text
541
+ //
542
+ message NERMention {
543
+ optional uint32 sentenceIndex = 1;
544
+ required uint32 tokenStartInSentenceInclusive = 2;
545
+ required uint32 tokenEndInSentenceExclusive = 3;
546
+ required string ner = 4;
547
+ optional string normalizedNER = 5;
548
+ optional string entityType = 6;
549
+ optional Timex timex = 7;
550
+ optional string wikipediaEntity = 8;
551
+ optional string gender = 9;
552
+ optional uint32 entityMentionIndex = 10;
553
+ optional uint32 canonicalEntityMentionIndex = 11;
554
+ optional string entityMentionText = 12;
555
+ }
556
+
557
+ //
558
+ // An entailed sentence fragment.
559
+ // Created by the openie annotator.
560
+ //
561
+ message SentenceFragment {
562
+ repeated uint32 tokenIndex = 1;
563
+ optional uint32 root = 2;
564
+ optional bool assumedTruth = 3;
565
+ optional double score = 4;
566
+ }
567
+
568
+
569
+ //
570
+ // The index of a token in a document, including the sentence
571
+ // index and the offset.
572
+ //
573
+ message TokenLocation {
574
+ optional uint32 sentenceIndex = 1;
575
+ optional uint32 tokenIndex = 2;
576
+
577
+ }
578
+
579
+
580
+ //
581
+ // An OpenIE relation triple.
582
+ // Created by the openie annotator.
583
+ //
584
+ message RelationTriple {
585
+ optional string subject = 1; // The surface form of the subject
586
+ optional string relation = 2; // The surface form of the relation (required)
587
+ optional string object = 3; // The surface form of the object
588
+ optional double confidence = 4; // The [optional] confidence of the extraction
589
+ repeated TokenLocation subjectTokens = 13; // The tokens comprising the subject of the triple
590
+ repeated TokenLocation relationTokens = 14; // The tokens comprising the relation of the triple
591
+ repeated TokenLocation objectTokens = 15; // The tokens comprising the object of the triple
592
+ optional DependencyGraph tree = 8; // The dependency graph fragment for this triple
593
+ optional bool istmod = 9; // If true, this expresses an implicit tmod relation
594
+ optional bool prefixBe = 10; // If true, this relation string is missing a 'be' prefix
595
+ optional bool suffixBe = 11; // If true, this relation string is missing a 'be' suffix
596
+ optional bool suffixOf = 12; // If true, this relation string is missing an 'of' suffix
597
+ }
598
+
599
+
600
+ //
601
+ // A map from strings to strings.
602
+ // Used, minimally, in the CoNLLU featurizer
603
+ //
604
+ message MapStringString {
605
+ repeated string key = 1;
606
+ repeated string value = 2;
607
+ }
608
+
609
+ //
610
+ // A map from integers to strings.
611
+ // Used, minimally, in the CoNLLU featurizer
612
+ //
613
+ message MapIntString {
614
+ repeated uint32 key = 1;
615
+ repeated string value = 2;
616
+ }
617
+
618
+ //
619
+ // Store section info
620
+ //
621
+
622
+ message Section {
623
+ required uint32 charBegin = 1;
624
+ required uint32 charEnd = 2;
625
+ optional string author = 3;
626
+ repeated uint32 sentenceIndexes = 4;
627
+ optional string datetime = 5;
628
+ repeated Quote quotes = 6;
629
+ optional uint32 authorCharBegin = 7;
630
+ optional uint32 authorCharEnd = 8;
631
+ required Token xmlTag = 9;
632
+ }
633
+
634
+
635
+
636
+ // A message for requesting a semgrex
637
+ // Each sentence stores information about the tokens making up the
638
+ // corresponding graph
639
+ // An alternative would have been to use the existing Document or
640
+ // Sentence classes, but the problem with that is it would be
641
+ // ambiguous which dependency object to use.
642
+ message SemgrexRequest {
643
+ message Dependencies {
644
+ repeated Token token = 1;
645
+ required DependencyGraph graph = 2;
646
+ }
647
+
648
+ repeated string semgrex = 1;
649
+ repeated Dependencies query = 2;
650
+ }
651
+
652
+ // The response from running a semgrex
653
+ // If you pass in M semgrex expressions and N dependency graphs,
654
+ // this returns MxN nested results. Each SemgrexResult can match
655
+ // multiple times in one graph
656
+ //
657
+ // You may want to send multiple semgrexes per query because
658
+ // translating large numbers of dependency graphs to protobufs
659
+ // will be expensive, so doing several queries at once will save time
660
+ message SemgrexResponse {
661
+ message NamedNode {
662
+ required string name = 1;
663
+ required int32 matchIndex = 2;
664
+ }
665
+
666
+ message NamedRelation {
667
+ required string name = 1;
668
+ required string reln = 2;
669
+ }
670
+
671
+ message NamedEdge {
672
+ required string name = 1;
673
+ required int32 source = 2;
674
+ required int32 target = 3;
675
+ optional string reln = 4;
676
+ optional bool isExtra = 5;
677
+ optional uint32 sourceCopy = 6;
678
+ optional uint32 targetCopy = 7;
679
+ }
680
+
681
+ message Match {
682
+ required int32 matchIndex = 1;
683
+ repeated NamedNode node = 2;
684
+ repeated NamedRelation reln = 3;
685
+ repeated NamedEdge edge = 6;
686
+
687
+ // when processing multiple dependency graphs at once,
688
+ // which dependency graph this applies to
689
+ // indexed from 0
690
+ optional int32 graphIndex = 4;
691
+ // index of the semgrex expression this match applies to
692
+ // indexed from 0
693
+ optional int32 semgrexIndex = 5;
694
+ }
695
+
696
+ message SemgrexResult {
697
+ repeated Match match = 1;
698
+ }
699
+
700
+ message GraphResult {
701
+ repeated SemgrexResult result = 1;
702
+ }
703
+
704
+ repeated GraphResult result = 1;
705
+ }
706
+
707
+
708
+ // A message for processing an Ssurgeon
709
+ // Each sentence stores information about the tokens making up the
710
+ // corresponding graph
711
+ // An alternative would have been to use the existing Document or
712
+ // Sentence classes, but the problem with that is it would be
713
+ // ambiguous which dependency object to use. Another problem
714
+ // is that if the intent is to use multiple graphs from a
715
+ // Sentence, then edits to the nodes of one graph would show up
716
+ // in the nodes of the other graph (same backing CoreLabels)
717
+ // and the operations themselves may not have the intended effect.
718
+ // The Ssurgeon is composed of two pieces, the semgrex and the
719
+ // ssurgeon operations, along with some optional documentation.
720
+ message SsurgeonRequest {
721
+ message Ssurgeon {
722
+ optional string semgrex = 1;
723
+ repeated string operation = 2;
724
+ optional string id = 3;
725
+ optional string notes = 4;
726
+ optional string language = 5;
727
+ }
728
+
729
+ repeated Ssurgeon ssurgeon = 1;
730
+ repeated DependencyGraph graph = 2;
731
+ }
732
+
733
+ message SsurgeonResponse {
734
+ message SsurgeonResult {
735
+ optional DependencyGraph graph = 1;
736
+ optional bool changed = 2;
737
+ }
738
+
739
+ repeated SsurgeonResult result = 1;
740
+ }
741
+
742
+ // It's possible to send in a whole document, but we
743
+ // only care about the Sentences and Tokens
744
+ message TokensRegexRequest {
745
+ required Document doc = 1;
746
+ repeated string pattern = 2;
747
+ }
748
+
749
+ // The result will be a nested structure:
750
+ // repeated PatternMatch, one for each pattern
751
+ // each PatternMatch has a repeated Match,
752
+ // which tells you which sentence matched and where
753
+ message TokensRegexResponse {
754
+ message MatchLocation {
755
+ optional string text = 1;
756
+ optional int32 begin = 2;
757
+ optional int32 end = 3;
758
+ }
759
+
760
+ message Match {
761
+ required int32 sentence = 1;
762
+ required MatchLocation match = 2;
763
+ repeated MatchLocation group = 3;
764
+ }
765
+
766
+ message PatternMatch {
767
+ repeated Match match = 1;
768
+ }
769
+
770
+ repeated PatternMatch match = 1;
771
+ }
772
+
773
+ // A protobuf which allows passing in a document with basic
774
+ // dependencies to be converted to enhanced
775
+ message DependencyEnhancerRequest {
776
+ required Document document = 1;
777
+
778
+ oneof ref {
779
+ Language language = 2;
780
+ // The expected value of this is a regex which matches relative pronouns
781
+ string relativePronouns = 3;
782
+ }
783
+ }
784
+
785
+ // A version of ParseTree with a flattened structure so that deep trees
786
+ // don't exceed the protobuf stack depth
787
+ message FlattenedParseTree {
788
+ message Node {
789
+ oneof contents {
790
+ bool openNode = 1;
791
+ bool closeNode = 2;
792
+ string value = 3;
793
+ }
794
+
795
+ optional double score = 4;
796
+ }
797
+
798
+ repeated Node nodes = 1;
799
+ }
800
+
801
+ // A protobuf for calling the java constituency parser evaluator from elsewhere
802
+ message EvaluateParserRequest {
803
+ message ParseResult {
804
+ required FlattenedParseTree gold = 1;
805
+ // repeated so you can send in kbest parses, if your parser handles that
806
+ // note that this already includes a score field
807
+ repeated FlattenedParseTree predicted = 2;
808
+ }
809
+
810
+ repeated ParseResult treebank = 1;
811
+ }
812
+
813
+ message EvaluateParserResponse {
814
+ required double f1 = 1;
815
+ optional double kbestF1 = 2;
816
+ // keep track of the individual tree F1 scores
817
+ repeated double treeF1 = 3;
818
+ }
819
+
820
+
821
+ // A protobuf for running Tsurgeon operations on constituency trees
822
+ message TsurgeonRequest {
823
+ message Operation {
824
+ required string tregex = 1;
825
+ repeated string tsurgeon = 2;
826
+ }
827
+ repeated Operation operations = 1;
828
+ repeated FlattenedParseTree trees = 2;
829
+ }
830
+
831
+ // The results of the Tsurgeon operation
832
+ message TsurgeonResponse {
833
+ repeated FlattenedParseTree trees = 1;
834
+ }
835
+
836
+ // Sent in Morphology requests - a stream of sentences with tagged words
837
+ message MorphologyRequest {
838
+ message TaggedWord {
839
+ required string word = 1;
840
+ optional string xpos = 2;
841
+ }
842
+
843
+ repeated TaggedWord words = 1;
844
+ }
845
+
846
+ // Sent back from the Morphology request - the words and their tags
847
+ message MorphologyResponse {
848
+ message WordTagLemma {
849
+ required string word = 1;
850
+ optional string xpos = 2;
851
+ required string lemma = 3;
852
+ }
853
+
854
+ repeated WordTagLemma words = 1;
855
+ }
856
+
857
+
858
+ // A request for converting constituency trees to dependency graphs
859
+ message DependencyConverterRequest {
860
+ repeated FlattenedParseTree trees = 1;
861
+ }
862
+
863
+ // The result of using the CoreNLP dependency converter.
864
+ // One graph per tree
865
+ message DependencyConverterResponse {
866
+ message DependencyConversion {
867
+ required DependencyGraph graph = 1;
868
+ optional FlattenedParseTree tree = 2;
869
+ }
870
+
871
+ repeated DependencyConversion conversions = 1;
872
+ }
873
+
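The build comments near the top of this file give the protoc commands for the Java and Python bindings. A minimal sketch of using the generated Python classes (assuming they were generated into stanza/protobuf as shown, so the message types are importable from that package), here round-tripping a MorphologyRequest through the protobuf wire format:

    from stanza.protobuf import MorphologyRequest

    request = MorphologyRequest()
    tagged = request.words.add()           # one TaggedWord entry
    tagged.word = "running"
    tagged.xpos = "VBG"
    payload = request.SerializeToString()  # bytes sent to the Java side

    # the same bytes parse back into an equivalent message
    decoded = MorphologyRequest()
    decoded.ParseFromString(payload)
    assert decoded.words[0].word == "running"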
stanza/scripts/config.sh ADDED
@@ -0,0 +1,51 @@
1
+ #!/bin/bash
2
+ #
3
+ # Set environment variables for the training and testing of stanza modules.
4
+
5
+ # Set UDBASE to the location of the UD data folder
6
+ # The data should be in CoNLL-U format
7
+ # For details, see
8
+ # http://universaldependencies.org/conll18/data.html (CoNLL-18 UD data)
9
+ # https://universaldependencies.org/
10
+ # When rebuilding models based on Universal Dependencies, download the
11
+ # UD data to some directory, set UDBASE to that directory, and
12
+ # uncomment this line. Alternatively, put UDBASE in your shell
13
+ # config, Windows env variables, etc as relevant.
14
+ # export UDBASE=/path/to/UD
15
+
16
+ # Set NERBASE to the location of the NER data folder
17
+ # The data should be in BIO format or convertible to that format
18
+ # For details, see https://www.aclweb.org/anthology/W03-0419.pdf (CoNLL-03 NER paper)
19
+ # There are other NER datasets, supported in
20
+ # stanza/utils/datasets/ner/prepare_ner_dataset.py
21
+ # If rebuilding NER data, choose a location for the NER directory
22
+ # and set NERBASE to that variable.
23
+ # export NERBASE=/path/to/NER
24
+
25
+ # Set CONSTITUENCY_BASE to the location of the constituency data folder
26
+ # The data will be in some dataset-specific format
27
+ # There is a conversion script which will turn this
28
+ # into a PTB style format
29
+ # stanza/utils/datasets/constituency/prepare_con_dataset.py
30
+ # If processing constituency data, choose a location for the CON data
31
+ # and set CONSTITUENCY_BASE to that variable.
32
+ # export CONSTITUENCY_BASE=/path/to/CON
33
+
34
+ # Set directories to store processed training/evaluation files
35
+ # $DATA_ROOT is a default home for where all the outputs from the
36
+ # preparation scripts will go. The training scripts will then look
37
+ # for the stanza formatted data in that directory.
38
+ export DATA_ROOT=./data
39
+ export TOKENIZE_DATA_DIR=$DATA_ROOT/tokenize
40
+ export MWT_DATA_DIR=$DATA_ROOT/mwt
41
+ export LEMMA_DATA_DIR=$DATA_ROOT/lemma
42
+ export POS_DATA_DIR=$DATA_ROOT/pos
43
+ export DEPPARSE_DATA_DIR=$DATA_ROOT/depparse
44
+ export ETE_DATA_DIR=$DATA_ROOT/ete
45
+ export NER_DATA_DIR=$DATA_ROOT/ner
46
+ export CHARLM_DATA_DIR=$DATA_ROOT/charlm
47
+ export CONSTITUENCY_DATA_DIR=$DATA_ROOT/constituency
48
+ export SENTIMENT_DATA_DIR=$DATA_ROOT/sentiment
49
+
50
+ # Set directories to store external word vector data
51
+ export WORDVEC_DIR=./extern_data/wordvec
stanza/scripts/download_vectors.sh ADDED
@@ -0,0 +1,93 @@
1
+ #!/bin/bash
2
+ #
3
+ # Download word vector files for all supported languages. Run as:
4
+ # ./download_vectors.sh WORDVEC_DIR
5
+ # where WORDVEC_DIR is the target directory to store the word vector data.
6
+
7
+ # check arguments
8
+ : ${1?"Usage: $0 WORDVEC_DIR"}
9
+ WORDVEC_DIR=$1
10
+
11
+ # constants and functions
12
+ CONLL17_URL="https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1989/word-embeddings-conll17.tar"
13
+ CONLL17_TAR="word-embeddings-conll17.tar"
14
+
15
+ FASTTEXT_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-wiki"
16
+
17
+ # TODO: some fasttext vectors are now at
18
+ # https://fasttext.cc/docs/en/pretrained-vectors.html
19
+ # there are also vectors for
20
+ # Welsh, Icelandic, Thai, Sanskrit
21
+ # https://fasttext.cc/docs/en/crawl-vectors.html
22
+
23
+ # We get the Armenian word vectors from here:
24
+ # https://github.com/ispras-texterra/word-embeddings-eval-hy
25
+ # https://arxiv.org/ftp/arxiv/papers/1906/1906.03134.pdf
26
+ # In particular, the glove model (dogfooding):
27
+ # https://at.ispras.ru/owncloud/index.php/s/pUUiS1l1jGKNax3/download
28
+ # These vectors improved F1 by about 1 on various tasks for Armenian
29
+ # and had much better coverage of Western Armenian
30
+
31
+ # For Erzya, we use word vectors available here:
32
+ # https://github.com/mokha/semantics
33
+ # @incollection{Alnajjar_2021,
34
+ # doi = {10.31885/9789515150257.24},
35
+ # url = {https://doi.org/10.31885%2F9789515150257.24},
36
+ # year = 2021,
37
+ # month = {mar},
38
+ # publisher = {University of Helsinki},
39
+ # pages = {275--288},
40
+ # author = {Khalid Alnajjar},
41
+ # title = {When Word Embeddings Become Endangered},
42
+ # booktitle = {Multilingual Facilitation}
43
+ # }
44
+
45
+ declare -a FASTTEXT_LANG=("Afrikaans" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
46
+ declare -a FASTTEXT_CODE=("af" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
47
+ declare -a LOCAL_CODE=("af" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")
48
+
49
+ color_green='\033[32;1m'
50
+ color_clear='\033[0m' # No Color
51
+ function msg() {
52
+ echo -e "${color_green}$@${color_clear}"
53
+ }
54
+
55
+ function prepare_fasttext_vec() {
56
+ lang=$1
57
+ ftcode=$2
58
+ code=$3
59
+
60
+ cwd=$(pwd)
61
+ mkdir -p $lang
62
+ cd $lang
63
+ msg "=== Downloading fasttext vector file for ${lang}..."
64
+ url="${FASTTEXT_BASE_URL}/wiki.${ftcode}.vec"
65
+ fname="${code}.vectors"
66
+ wget $url -O $fname
67
+
68
+ msg "=== Compressing file ${fname}..."
69
+ xz $fname
70
+ cd $cwd
71
+ }
72
+
73
+ # do the actual work
74
+ mkdir -p $WORDVEC_DIR
75
+ cd $WORDVEC_DIR
76
+
77
+ msg "Downloading CONLL17 word vectors. This may take a while..."
78
+ wget $CONLL17_URL -O $CONLL17_TAR
79
+
80
+ msg "Extracting CONLL17 word vector files..."
81
+ tar -xvf $CONLL17_TAR
82
+ rm $CONLL17_TAR
83
+
84
+ msg "Preparing fasttext vectors for the rest of the languages."
85
+ for (( i=0; i<${#FASTTEXT_LANG[*]}; ++i)); do
86
+ prepare_fasttext_vec ${FASTTEXT_LANG[$i]} ${FASTTEXT_CODE[$i]} ${LOCAL_CODE[$i]}
87
+ done
88
+
89
+ # handle old french
90
+ mkdir Old_French
91
+ ln -s French/fr.vectors.xz Old_French/fro.vectors.xz
92
+
93
+ msg "All done."
stanza/stanza/_version.py ADDED
@@ -0,0 +1,4 @@
1
+ """ Single source of truth for version number """
2
+
3
+ __version__ = "1.10.1"
4
+ __resources_version__ = '1.10.0'
stanza/stanza/models/__init__.py ADDED
File without changes
stanza/stanza/models/_training_logging.py ADDED
@@ -0,0 +1,4 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger('stanza')
4
+ logger.setLevel(logging.DEBUG)
stanza/stanza/models/classifier.py ADDED
@@ -0,0 +1,660 @@
1
+ import argparse
2
+ import ast
3
+ import logging
4
+ import os
5
+ import random
6
+ import re
7
+ from enum import Enum
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ from stanza.models.common import loss
13
+ from stanza.models.common import utils
14
+ from stanza.models.pos.vocab import CharVocab
15
+
16
+ import stanza.models.classifiers.data as data
17
+ from stanza.models.classifiers.trainer import Trainer
18
+ from stanza.models.classifiers.utils import WVType, ExtraVectors, ModelType
19
+ from stanza.models.common.peft_config import add_peft_args, resolve_peft_args
20
+
21
+ from stanza.utils.confusion import format_confusion, confusion_to_accuracy, confusion_to_macro_f1
22
+
23
+
24
+ class Loss(Enum):
25
+ CROSS = 1
26
+ WEIGHTED_CROSS = 2
27
+ LOG_CROSS = 3
28
+ FOCAL = 4
29
+
30
+ class DevScoring(Enum):
31
+ ACCURACY = 'ACC'
32
+ WEIGHTED_F1 = 'WF'
33
+
34
+ logger = logging.getLogger('stanza')
35
+ tlogger = logging.getLogger('stanza.classifiers.trainer')
36
+
37
+ logging.getLogger('elmoformanylangs').setLevel(logging.WARNING)
38
+
39
+ DEFAULT_TRAIN='data/sentiment/en_sstplus.train.txt'
40
+ DEFAULT_DEV='data/sentiment/en_sst3roots.dev.txt'
41
+ DEFAULT_TEST='data/sentiment/en_sst3roots.test.txt'
42
+
43
+ """A script for training and testing classifier models, especially on the SST.
44
+
45
+ If you run the script with no arguments, it will start trying to train
46
+ a sentiment model.
47
+
48
+ python3 -m stanza.models.classifier
49
+
50
+ This requires the sentiment dataset to be in an `extern_data`
51
+ directory, such as by symlinking it from somewhere else.
52
+
53
+ The default model is a CNN where the word vectors are first mapped to
54
+ channels with filters of a few different widths, those channels are
55
+ maxpooled over the entire sentence, and then the resulting pools have
56
+ fully connected layers until they reach the number of classes in the
57
+ training data. You can see the defaults in the options below.
58
+
59
+ https://arxiv.org/abs/1408.5882
60
+
61
+ (Currently the CNN is the only sentence classifier implemented.)
62
+
63
+ To train with a more complicated CNN arch:
64
+
65
+ nohup python3 -u -m stanza.models.classifier --max_epochs 400 --filter_channels 1000 --fc_shapes 400,100 > FC41.out 2>&1 &
66
+
67
+ You can train models with word vectors other than the default word2vec. For example:
68
+
69
+ nohup python3 -u -m stanza.models.classifier --wordvec_type google --wordvec_dir extern_data/google --max_epochs 200 --filter_channels 1000 --fc_shapes 200,100 --base_name FC21_google > FC21_google.out 2>&1 &
70
+
71
+ A model trained on the 5 class dataset can be tested on the 2 class dataset with a command line like this:
72
+
73
+ python3 -u -m stanza.models.classifier --no_train --load_name saved_models/classifier/sst_en_ewt_FS_3_4_5_C_1000_FC_400_100_classifier.E0165-ACC41.87.pt --test_file data/sentiment/en_sst2roots.test.txt --test_remap_labels "{0:0, 1:0, 3:1, 4:1}"
74
+
75
+ python3 -u -m stanza.models.classifier --wordvec_type google --wordvec_dir extern_data/google --no_train --load_name saved_models/classifier/FC21_google_en_ewt_FS_3_4_5_C_1000_FC_200_100_classifier.E0189-ACC45.87.pt --test_file data/sentiment/en_sst2roots.test.txt --test_remap_labels "{0:0, 1:0, 3:1, 4:1}"
76
+
77
+ A model trained on the 3 class dataset can be tested on the 2 class dataset with a command line like this:
78
+
79
+ python3 -u -m stanza.models.classifier --wordvec_type google --wordvec_dir extern_data/google --no_train --load_name saved_models/classifier/FC21_3C_google_en_ewt_FS_3_4_5_C_1000_FC_200_100_classifier.E0101-ACC68.94.pt --test_file data/sentiment/en_sst2roots.test.txt --test_remap_labels "{0:0, 2:1}"
80
+
81
+ To train models on combined 3 class datasets:
82
+
83
+ nohup python3 -u -m stanza.models.classifier --max_epochs 400 --filter_channels 1000 --fc_shapes 400,100 --base_name FC41_3class --extra_wordvec_method CONCAT --extra_wordvec_dim 200 --train_file data/sentiment/en_sstplus.train.txt --dev_file data/sentiment/en_sst3roots.dev.txt --test_file data/sentiment/en_sst3roots.test.txt > FC41_3class.out 2>&1 &
84
+
85
+ This tests that model:
86
+
87
+ python3 -u -m stanza.models.classifier --no_train --load_name en_sstplus.pt --test_file data/sentiment/en_sst3roots.test.txt
88
+
89
+ Here is an example for training a model in a different language:
90
+
91
+ nohup python3 -u -m stanza.models.classifier --max_epochs 400 --filter_channels 1000 --fc_shapes 400,100 --base_name FC41_german --train_file data/sentiment/de_sb10k.train.txt --dev_file data/sentiment/de_sb10k.dev.txt --test_file data/sentiment/de_sb10k.test.txt --shorthand de_sb10k --min_train_len 3 --extra_wordvec_method CONCAT --extra_wordvec_dim 100 > de_sb10k.out 2>&1 &
92
+
93
+ This uses more data, although that wound up being worse for the German model:
94
+
95
+ nohup python3 -u -m stanza.models.classifier --max_epochs 400 --filter_channels 1000 --fc_shapes 400,100 --base_name FC41_german --train_file data/sentiment/de_sb10k.train.txt,data/sentiment/de_scare.train.txt,data/sentiment/de_usage.train.txt --dev_file data/sentiment/de_sb10k.dev.txt --test_file data/sentiment/de_sb10k.test.txt --shorthand de_sb10k --min_train_len 3 --extra_wordvec_method CONCAT --extra_wordvec_dim 100 > de_sb10k.out 2>&1 &
96
+
97
+ nohup python3 -u -m stanza.models.classifier --max_epochs 400 --filter_channels 1000 --fc_shapes 400,100 --base_name FC41_chinese --train_file data/sentiment/zh_ren.train.txt --dev_file data/sentiment/zh_ren.dev.txt --test_file data/sentiment/zh_ren.test.txt --shorthand zh_ren --wordvec_type fasttext --extra_wordvec_method SUM --wordvec_pretrain_file ../stanza_resources/zh-hans/pretrain/gsdsimp.pt > zh_ren.out 2>&1 &
98
+
99
+ nohup python3 -u -m stanza.models.classifier --max_epochs 400 --filter_channels 1000 --fc_shapes 400,100 --save_name vi_vsfc.pt --train_file data/sentiment/vi_vsfc.train.json --dev_file data/sentiment/vi_vsfc.dev.json --test_file data/sentiment/vi_vsfc.test.json --shorthand vi_vsfc --wordvec_pretrain_file ../stanza_resources/vi/pretrain/vtb.pt --wordvec_type word2vec --extra_wordvec_method SUM --dev_eval_scoring WEIGHTED_F1 > vi_vsfc.out 2>&1 &
100
+
101
+ python3 -u -m stanza.models.classifier --no_train --test_file extern_data/sentiment/vietnamese/_UIT-VSFC/test.txt --shorthand vi_vsfc --wordvec_pretrain_file ../stanza_resources/vi/pretrain/vtb.pt --wordvec_type word2vec --load_name vi_vsfc.pt
102
+ """
103
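+ # Illustrative sketch, not part of the original commit: the same flows can be
+ # driven from Python by handing CLI-style arguments to main(), which forwards
+ # them to parse_args().  The arguments below are copied from the docstring
+ # examples above and are placeholders, not guaranteed paths.
+ #
+ #   from stanza.models.classifier import main
+ #   main(["--no_train",
+ #         "--load_name", "en_sstplus.pt",
+ #         "--test_file", "data/sentiment/en_sst3roots.test.txt"])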
+
104
+ def convert_fc_shapes(arg):
105
+ """
106
+ Returns a tuple of sizes to use in FC layers.
107
+
108
+ For example, converts "100" -> (100,)
109
+ "100,200" -> (100,200)
110
+ """
111
+ arg = arg.strip()
112
+ if not arg:
113
+ return ()
114
+ arg = ast.literal_eval(arg)
115
+ if isinstance(arg, int):
116
+ return (arg,)
117
+ if isinstance(arg, tuple):
118
+ return arg
119
+ return tuple(arg)
120
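+ # For instance (mirroring the docstring above; shown only as an illustration):
+ #   convert_fc_shapes("100")     == (100,)
+ #   convert_fc_shapes("100,200") == (100, 200)
+ #   convert_fc_shapes("")        == ()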
+
121
+ # For the most part, these values are for the constituency parser.
122
+ # Only the WD for adadelta is originally for sentiment
123
+ # Also LR for adadelta and madgrad
124
+
125
+ # madgrad learning rate experiment on sstplus
126
+ # note that the hyperparameters are not cross-validated in tandem, so
127
+ # later changes may make some earlier experiments slightly out of date
128
+ # LR
129
+ # 0.01 failed to converge
130
+ # 0.004 failed to converge
131
+ # 0.003 0.5572
132
+ # 0.002 failed to converge
133
+ # 0.001 0.6857
134
+ # 0.0008 0.6799
135
+ # 0.0005 0.6849
136
+ # 0.00025 0.6749
137
+ # 0.0001 0.6746
138
+ # 0.00001 0.6536
139
+ # 0.000001 0.6267
140
+ # LR 0.001 produced the best model, but it does occasionally fail to
141
+ # converge to a working model, so we set the default to 0.0005 instead
142
+ DEFAULT_LEARNING_RATES = { "adamw": 0.0002, "adadelta": 1.0, "sgd": 0.001, "adabelief": 0.00005, "madgrad": 0.0005 }
143
+ DEFAULT_LEARNING_EPS = { "adabelief": 1e-12, "adadelta": 1e-6, "adamw": 1e-8 }
144
+ DEFAULT_LEARNING_RHO = 0.9
145
+ DEFAULT_MOMENTUM = { "madgrad": 0.9, "sgd": 0.9 }
146
+ DEFAULT_WEIGHT_DECAY = { "adamw": 0.05, "adadelta": 0.0001, "sgd": 0.01, "adabelief": 1.2e-6, "madgrad": 2e-6 }
147
+
148
+ def build_argparse():
149
+ """
150
+ Build the argparse for the classifier.
151
+
152
+ Refactored so that other utility scripts can use the same parser if needed.
153
+ """
154
+ parser = argparse.ArgumentParser()
155
+
156
+ parser.add_argument('--train', dest='train', default=True, action='store_true', help='Train the model (default)')
157
+ parser.add_argument('--no_train', dest='train', action='store_false', help="Don't train the model")
158
+
159
+ parser.add_argument('--shorthand', type=str, default='en_ewt', help="Treebank shorthand, e.g. 'en_ewt' for the English EWT treebank")
160
+
161
+ parser.add_argument('--load_name', type=str, default=None, help='Name for loading an existing model')
162
+ parser.add_argument('--save_dir', type=str, default='saved_models/classifier', help='Root dir for saving models.')
163
+ parser.add_argument('--save_name', type=str, default="{shorthand}_{embedding}_{bert_finetuning}_{classifier_type}_classifier.pt", help='Name for saving the model')
164
+
165
+ parser.add_argument('--checkpoint_save_name', type=str, default=None, help="File name to save the most recent checkpoint")
166
+ parser.add_argument('--no_checkpoint', dest='checkpoint', action='store_false', help="Don't save checkpoints")
167
+
168
+ parser.add_argument('--save_intermediate_models', default=False, action='store_true',
169
+ help='Save all intermediate models - this can be a lot!')
170
+
171
+ parser.add_argument('--train_file', type=str, default=DEFAULT_TRAIN, help='Input file(s) to train a model from. Each line is an example. Should go <label> <tokenized sentence>. Comma separated list.')
172
+ parser.add_argument('--dev_file', type=str, default=DEFAULT_DEV, help='Input file(s) to use as the dev set.')
173
+ parser.add_argument('--test_file', type=str, default=DEFAULT_TEST, help='Input file(s) to use as the test set.')
174
+ parser.add_argument('--output_predictions', default=False, action='store_true', help='Output predictions when running the test set')
175
+ parser.add_argument('--max_epochs', type=int, default=100)
176
+ parser.add_argument('--tick', type=int, default=50)
177
+
178
+ parser.add_argument('--model_type', type=lambda x: ModelType[x.upper()], default=ModelType.CNN,
179
+ help='Model type to use. Options: %s' % " ".join(x.name for x in ModelType))
180
+
181
+ parser.add_argument('--filter_sizes', default=(3,4,5), type=ast.literal_eval, help='Filter sizes for the layer after the word vectors')
182
+ parser.add_argument('--filter_channels', default=1000, type=ast.literal_eval, help='Number of channels for layers after the word vectors. Int for same number of channels (scaled by width) for each filter, or tuple/list for exact lengths for each filter')
183
+ parser.add_argument('--fc_shapes', default="400,100", type=convert_fc_shapes, help='Extra fully connected layers to put after the initial filters. If set to blank, will FC directly from the max pooling to the output layer.')
184
+ parser.add_argument('--dropout', default=0.5, type=float, help='Dropout value to use')
185
+
186
+ parser.add_argument('--batch_size', default=50, type=int, help='Batch size when training')
187
+ parser.add_argument('--batch_single_item', default=200, type=int, help='Items of this size go in their own batch')
188
+ parser.add_argument('--dev_eval_batches', default=2000, type=int, help='Run the dev set after this many train batches. Set to 0 to only do it once per epoch')
189
+ parser.add_argument('--dev_eval_scoring', type=lambda x: DevScoring[x.upper()], default=DevScoring.WEIGHTED_F1,
190
+ help=('Scoring method to use for choosing the best model. Options: %s' %
191
+ " ".join(x.name for x in DevScoring)))
192
+
193
+ parser.add_argument('--weight_decay', default=None, type=float, help='Weight decay (eg, l2 reg) to use in the optimizer')
194
+ parser.add_argument('--learning_rate', default=None, type=float, help='Learning rate to use in the optimizer')
195
+ parser.add_argument('--momentum', default=None, type=float, help='Momentum to use in the optimizer')
196
+
197
+ parser.add_argument('--optim', default='adadelta', choices=['adadelta', 'madgrad', 'sgd'], help='Optimizer type: SGD, Adadelta, or madgrad. We highly recommend installing madgrad and using it')
198
+
199
+ parser.add_argument('--test_remap_labels', default=None, type=ast.literal_eval,
200
+ help='Map of which label each classifier label should map to. For example, "{0:0, 1:0, 3:1, 4:1}" to map a 5 class sentiment test to a 2 class. Any labels not mapped will be considered wrong')
201
+ parser.add_argument('--forgive_unmapped_labels', dest='forgive_unmapped_labels', default=True, action='store_true',
202
+ help='When remapping labels, such as from 5 class to 2 class, pick a different label if the first guess is not remapped.')
203
+ parser.add_argument('--no_forgive_unmapped_labels', dest='forgive_unmapped_labels', action='store_false',
204
+ help="When remapping labels, such as from 5 class to 2 class, DON'T pick a different label if the first guess is not remapped.")
205
+
206
+ parser.add_argument('--loss', type=lambda x: Loss[x.upper()], default=Loss.CROSS,
207
+ help="Whether to use regular cross entropy or scale it by 1/log(quantity)")
208
+ parser.add_argument('--loss_focal_gamma', default=2, type=float, help='gamma value for a focal loss')
209
+ parser.add_argument('--min_train_len', type=int, default=0,
210
+ help="Filter sentences less than this length")
211
+
212
+ parser.add_argument('--pretrain_max_vocab', type=int, default=-1)
213
+ parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')
214
+ parser.add_argument('--wordvec_raw_file', type=str, default=None, help='Exact name of the raw wordvec file to read')
215
+ parser.add_argument('--wordvec_dir', type=str, default='extern_data/wordvec', help='Directory of word vectors')
216
+ parser.add_argument('--wordvec_type', type=lambda x: WVType[x.upper()], default='word2vec', help='Different vector types have different options, such as google 300d replacing numbers with #')
217
+ parser.add_argument('--extra_wordvec_dim', type=int, default=0, help="Extra dim of word vectors - will be trained")
218
+ parser.add_argument('--extra_wordvec_method', type=lambda x: ExtraVectors[x.upper()], default='sum', help='How to train extra dimensions of word vectors, if at all')
219
+ parser.add_argument('--extra_wordvec_max_norm', type=float, default=None, help="Max norm for initializing the extra vectors")
220
+
221
+ parser.add_argument('--charlm_forward_file', type=str, default=None, help="Exact path to use for forward charlm")
222
+ parser.add_argument('--charlm_backward_file', type=str, default=None, help="Exact path to use for backward charlm")
223
+ parser.add_argument('--charlm_projection', type=int, default=None, help="Project the charlm values to this dimension")
224
+ parser.add_argument('--char_lowercase', dest='char_lowercase', action='store_true', help="Use lowercased characters in character model.")
225
+
226
+ parser.add_argument('--elmo_model', default='extern_data/manyelmo/english', help='Directory with elmo model')
227
+ parser.add_argument('--use_elmo', dest='use_elmo', default=False, action='store_true', help='Use an elmo model as a source of parameters')
228
+ parser.add_argument('--elmo_projection', type=int, default=None, help='Project elmo to this many dimensions')
229
+
230
+ parser.add_argument('--bert_model', type=str, default=None, help="Use an external bert model (requires the transformers package)")
231
+ parser.add_argument('--no_bert_model', dest='bert_model', action="store_const", const=None, help="Don't use bert")
232
+ parser.add_argument('--bert_finetune', default=False, action='store_true', help="Finetune the Bert model")
233
+ parser.add_argument('--bert_learning_rate', default=0.01, type=float, help='Scale the learning rate for transformer finetuning by this much')
234
+ parser.add_argument('--bert_weight_decay', default=0.0001, type=float, help='Scale the weight decay for transformer finetuning by this much')
235
+ parser.add_argument('--bert_hidden_layers', type=int, default=4, help="How many layers of hidden state to use from the transformer")
236
+ parser.add_argument('--bert_hidden_layers_original', action='store_const', const=None, dest='bert_hidden_layers', help='Use layers 2,3,4 of the Bert embedding')
237
+
238
+ parser.add_argument('--bilstm', dest='bilstm', action='store_true', default=True, help="Use a bilstm after the inputs, before the convs. Using bilstm is about as accurate and significantly faster (because of dim reduction) than going straight to the filters")
239
+ parser.add_argument('--no_bilstm', dest='bilstm', action='store_false', help="Don't use a bilstm after the inputs, before the convs.")
240
+ # somewhere between 200-300 seems to be the sweet spot for a couple datasets:
241
+ # dev set macro f1 scores on 3 class problems
242
+ # note that these were only run once each
243
+ # more trials might narrow down which ones works best
244
+ # es_tass2020:
245
+ # 150 0.5580
246
+ # 200 0.5629
247
+ # 250 0.5586
248
+ # 300 0.5642 <---
249
+ # 400 0.5525
250
+ # 500 0.5579
251
+ # 750 0.5585
252
+ # en_sstplus:
253
+ # 150 0.6816
254
+ # 200 0.6721
255
+ # 250 0.6915 <---
256
+ # 300 0.6824
257
+ # 400 0.6757
258
+ # 500 0.6770
259
+ # 750 0.6781
260
+ # de_sb10k
261
+ # 150 0.6745
262
+ # 200 0.6798 <---
263
+ # 250 0.6459
264
+ # 300 0.6665
265
+ # 400 0.6521
266
+ # 500 0.6584
267
+ # 750 0.6447
268
+ parser.add_argument('--bilstm_hidden_dim', type=int, default=300, help="Dimension of the bilstm to use")
269
+
270
+ parser.add_argument('--maxpool_width', type=int, default=1, help="Width of the maxpool kernel to use")
271
+
272
+ parser.add_argument('--no_constituency_backprop', dest='constituency_backprop', default=True, action='store_false', help="When using a constituency parser, backprop into the parser's weights if True")
273
+ parser.add_argument('--constituency_model', type=str, default="/home/john/stanza_resources/it/constituency/vit_bert.pt", help="Which constituency model to use. TODO: make this more user friendly")
274
+ parser.add_argument('--constituency_batch_norm', default=False, action='store_true', help='Add a LayerNorm between the output of the parser and the classifier layers')
275
+ parser.add_argument('--constituency_node_attn', default=False, action='store_true', help='True means to make an attn layer out of the tree, with the words as key and nodes as query')
276
+ parser.add_argument('--no_constituency_node_attn', dest='constituency_node_attn', action='store_false', help='True means to make an attn layer out of the tree, with the words as key and nodes as query')
277
+ parser.add_argument('--constituency_top_layer', dest='constituency_top_layer', default=False, action='store_true', help='True means use the top (ROOT) layer of the constituents. Otherwise, the next layer down (S, usually) will be used')
278
+ parser.add_argument('--no_constituency_top_layer', dest='constituency_top_layer', action='store_false', help='True means use the top (ROOT) layer of the constituents. Otherwise, the next layer down (S, usually) will be used')
279
+ parser.add_argument('--constituency_all_words', default=False, action='store_true', help='Use all word positions in the constituency classifier')
280
+ parser.add_argument('--no_constituency_all_words', dest='constituency_all_words', default=False, action='store_false', help='Use the start and end word embeddings as inputs to the constituency classifier')
281
+
282
+ parser.add_argument('--log_norms', default=False, action='store_true', help='Log the parameters norms while training. A very noisy option')
283
+
284
+ parser.add_argument('--wandb', action='store_true', help='Start a wandb session and write the results of training. Only applies to training. Use --wandb_name instead to specify a name')
285
+ parser.add_argument('--wandb_name', default=None, help='Name of a wandb session to start when training. Will default to the dataset short name')
286
+
287
+ parser.add_argument('--seed', default=None, type=int, help='Random seed for model')
288
+
289
+ add_peft_args(parser)
290
+ utils.add_device_args(parser)
291
+
292
+ return parser
293
+
294
+ def build_model_filename(args):
295
+ shape = "FS_%s" % "_".join([str(x) for x in args.filter_sizes])
296
+ shape = shape + "_C_%d_" % args.filter_channels
297
+ if args.fc_shapes:
298
+ shape = shape + "_FC_%s_" % "_".join([str(x) for x in args.fc_shapes])
299
+
300
+ model_save_file = utils.standard_model_file_name(vars(args), "classifier", shape=shape, classifier_type=args.model_type.name)
301
+ logger.info("Expanded save_name: %s", model_save_file)
302
+ return model_save_file
303
+
304
+ def parse_args(args=None):
305
+ """
306
+ Add arguments for building the classifier.
307
+ Parses command line args and returns the result.
308
+ """
309
+ parser = build_argparse()
310
+ args = parser.parse_args(args)
311
+ resolve_peft_args(args, tlogger)
312
+
313
+ if args.wandb_name:
314
+ args.wandb = True
315
+
316
+ args.optim = args.optim.lower()
317
+ if args.weight_decay is None:
318
+ args.weight_decay = DEFAULT_WEIGHT_DECAY.get(args.optim, None)
319
+ if args.momentum is None:
320
+ args.momentum = DEFAULT_MOMENTUM.get(args.optim, None)
321
+ if args.learning_rate is None:
322
+ args.learning_rate = DEFAULT_LEARNING_RATES.get(args.optim, None)
323
+
324
+ return args
325
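+ # Example of the default resolution above (illustration, not original source):
+ # running with "--optim madgrad" and no explicit --learning_rate / --momentum /
+ # --weight_decay leaves args.learning_rate == 0.0005, args.momentum == 0.9 and
+ # args.weight_decay == 2e-6, taken from the DEFAULT_* tables defined earlier.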
+
326
+
327
+ def dataset_predictions(model, dataset):
328
+ model.eval()
329
+ index_label_map = {x: y for (x, y) in enumerate(model.labels)}
330
+
331
+ dataset_lengths = data.sort_dataset_by_len(dataset, keep_index=True)
332
+
333
+ predictions = []
334
+ o_idx = []
335
+ for length in dataset_lengths.keys():
336
+ batch = dataset_lengths[length]
337
+ output = model([x[0] for x in batch])
338
+ for i in range(len(batch)):
339
+ predicted = torch.argmax(output[i])
340
+ predicted_label = index_label_map[predicted.item()]
341
+ predictions.append(predicted_label)
342
+ o_idx.append(batch[i][1])
343
+
344
+ predictions = utils.unsort(predictions, o_idx)
345
+ return predictions
346
+
347
+ def confusion_dataset(predictions, dataset, labels):
348
+ """
349
+ Returns a confusion matrix
350
+
351
+ First key: gold
352
+ Second key: predicted
353
+ so: confusion_matrix[gold][predicted]
354
+ """
355
+ confusion_matrix = {}
356
+ for label in labels:
357
+ confusion_matrix[label] = {}
358
+
359
+ for predicted_label, datum in zip(predictions, dataset):
360
+ expected_label = datum.sentiment
361
+ confusion_matrix[expected_label][predicted_label] = confusion_matrix[expected_label].get(predicted_label, 0) + 1
362
+
363
+ return confusion_matrix
364
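+ # Shape of the result (illustrative values only): with labels ["0", "1"],
+ # {"0": {"0": 40, "1": 3}, "1": {"0": 5, "1": 52}} means 5 items whose gold
+ # label is "1" were predicted as "0".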
+
365
+
366
+ def score_dataset(model, dataset, label_map=None,
367
+ remap_labels=None, forgive_unmapped_labels=False):
368
+ """
369
+ remap_labels: a dict from old label to new label to use when
370
+ testing a classifier on a dataset with a simpler label set.
371
+ For example, a model trained on 5 class sentiment can be tested
372
+ on a binary distribution with {"0": "0", "1": "0", "3": "1", "4": "1"}
373
+
374
+ forgive_unmapped_labels: if the model predicts a label that is not in
375
+ remap_labels (for example, "2" in the example above), fall back to the
376
+ highest-scoring label that is remapped, rather than immediately
377
+ counting the prediction as wrong
378
+ """
379
+ model.eval()
380
+ if label_map is None:
381
+ label_map = {x: y for (y, x) in enumerate(model.labels)}
382
+ correct = 0
383
+ dataset_lengths = data.sort_dataset_by_len(dataset)
384
+
385
+ for length in dataset_lengths.keys():
386
+ # TODO: possibly break this up into smaller batches
387
+ batch = dataset_lengths[length]
388
+ expected_labels = [label_map[x.sentiment] for x in batch]
389
+
390
+ output = model(batch)
391
+
392
+ for i in range(len(expected_labels)):
393
+ predicted = torch.argmax(output[i])
394
+ predicted_label = predicted.item()
395
+ if remap_labels:
396
+ if predicted_label in remap_labels:
397
+ predicted_label = remap_labels[predicted_label]
398
+ else:
399
+ found = False
400
+ if forgive_unmapped_labels:
401
+ items = []
402
+ for j in range(len(output[i])):
403
+ items.append((output[i][j].item(), j))
404
+ items.sort(key=lambda x: -x[0])
405
+ for _, item in items:
406
+ if item in remap_labels:
407
+ predicted_label = remap_labels[item]
408
+ found = True
409
+ break
410
+ # if slack guesses allowed, none of the existing
411
+ # labels matched, so we count it wrong. if slack
412
+ # guesses not allowed, just count it wrong
413
+ if not found:
414
+ continue
415
+
416
+ if predicted_label == expected_labels[i]:
417
+ correct = correct + 1
418
+ return correct
419
+
420
+ def score_dev_set(model, dev_set, dev_eval_scoring):
421
+ predictions = dataset_predictions(model, dev_set)
422
+ confusion_matrix = confusion_dataset(predictions, dev_set, model.labels)
423
+ logger.info("Dev set confusion matrix:\n{}".format(format_confusion(confusion_matrix, model.labels)))
424
+ correct, total = confusion_to_accuracy(confusion_matrix)
425
+ macro_f1 = confusion_to_macro_f1(confusion_matrix)
426
+ logger.info("Dev set: %d correct of %d examples. Accuracy: %f" %
427
+ (correct, len(dev_set), correct / len(dev_set)))
428
+ logger.info("Macro f1: {}".format(macro_f1))
429
+
430
+ accuracy = correct / total
431
+ if dev_eval_scoring is DevScoring.ACCURACY:
432
+ return accuracy, accuracy, macro_f1
433
+ elif dev_eval_scoring is DevScoring.WEIGHTED_F1:
434
+ return macro_f1, accuracy, macro_f1
435
+ else:
436
+ raise ValueError("Unknown scoring method {}".format(dev_eval_scoring))
437
+
438
+ def intermediate_name(filename, epoch, dev_scoring, score):
439
+ """
440
+ Build an informative intermediate checkpoint name from a base name, epoch #, and accuracy
441
+ """
442
+ root, ext = os.path.splitext(filename)
443
+ return root + ".E{epoch:04d}-{score_type}{acc:05.2f}".format(**{"epoch": epoch, "score_type": dev_scoring.value, "acc": score * 100}) + ext
444
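+ # For example (assuming DevScoring.ACCURACY.value == "ACC", as the checkpoint
+ # names quoted in the module docstring suggest):
+ #   intermediate_name("sst_classifier.pt", 165, DevScoring.ACCURACY, 0.4187)
+ #   -> "sst_classifier.E0165-ACC41.87.pt"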
+
445
+ def log_param_sizes(model):
446
+ logger.debug("--- Model parameter sizes ---")
447
+ total_size = 0
448
+ for name, param in model.named_parameters():
449
+ param_size = param.element_size() * param.nelement()
450
+ total_size += param_size
451
+ logger.debug(" %s %d %d %d", name, param.element_size(), param.nelement(), param_size)
452
+ logger.debug(" Total size: %d", total_size)
453
+
454
+ def train_model(trainer, model_file, checkpoint_file, args, train_set, dev_set, labels):
455
+ tlogger.setLevel(logging.DEBUG)
456
+
457
+ # TODO: use a (torch) dataloader to possibly speed up the GPU usage
458
+ model = trainer.model
459
+ optimizer = trainer.optimizer
460
+
461
+ device = next(model.parameters()).device
462
+ logger.info("Current device: %s" % device)
463
+
464
+ label_map = {x: y for (y, x) in enumerate(labels)}
465
+ label_tensors = {x: torch.tensor(y, requires_grad=False, device=device)
466
+ for (y, x) in enumerate(labels)}
467
+
468
+ process_outputs = lambda x: x
469
+ if args.loss == Loss.CROSS:
470
+ logger.info("Creating CrossEntropyLoss")
471
+ loss_function = nn.CrossEntropyLoss()
472
+ elif args.loss == Loss.WEIGHTED_CROSS:
473
+ logger.info("Creating weighted cross entropy loss w/o log")
474
+ loss_function = loss.weighted_cross_entropy_loss([label_map[x[0]] for x in train_set], log_dampened=False)
475
+ elif args.loss == Loss.LOG_CROSS:
476
+ logger.info("Creating weighted cross entropy loss w/ log")
477
+ loss_function = loss.weighted_cross_entropy_loss([label_map[x[0]] for x in train_set], log_dampened=True)
478
+ elif args.loss == Loss.FOCAL:
479
+ try:
480
+ from focal_loss.focal_loss import FocalLoss
481
+ except ImportError:
482
+ raise ImportError("focal_loss not installed. Must `pip install focal_loss_torch` to use the --loss=focal feature")
483
+ logger.info("Creating FocalLoss with gamma %f", args.loss_focal_gamma)
484
+ process_outputs = lambda x: torch.softmax(x, dim=1)
485
+ loss_function = FocalLoss(gamma=args.loss_focal_gamma)
486
+ else:
487
+ raise ValueError("Unknown loss function {}".format(args.loss))
488
+ loss_function.to(device)
489
+
490
+ train_set_by_len = data.sort_dataset_by_len(train_set)
491
+
492
+ if trainer.global_step > 0:
493
+ # We reloaded the model, so let's report its current dev set score
494
+ _ = score_dev_set(model, dev_set, args.dev_eval_scoring)
495
+ logger.info("Reloaded model for continued training.")
496
+ if trainer.best_score is not None:
497
+ logger.info("Previous best score: %.5f", trainer.best_score)
498
+
499
+ log_param_sizes(model)
500
+
501
+ # https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
502
+ if args.wandb:
503
+ import wandb
504
+ wandb_name = args.wandb_name if args.wandb_name else "%s_classifier" % args.shorthand
505
+ wandb.init(name=wandb_name, config=args)
506
+ wandb.run.define_metric('accuracy', summary='max')
507
+ wandb.run.define_metric('macro_f1', summary='max')
508
+ wandb.run.define_metric('epoch_loss', summary='min')
509
+
510
+ for opt_name, opt in optimizer.items():
511
+ current_lr = opt.param_groups[0]['lr']
512
+ logger.info("optimizer %s learning rate: %s", opt_name, current_lr)
513
+
514
+ # if this is a brand new training run, and we're saving all intermediate models, save the start model as well
515
+ if args.save_intermediate_models and trainer.epochs_trained == 0:
516
+ intermediate_file = intermediate_name(model_file, trainer.epochs_trained, args.dev_eval_scoring, 0.0)
517
+ trainer.save(intermediate_file, save_optimizer=False)
518
+ for trainer.epochs_trained in range(trainer.epochs_trained, args.max_epochs):
519
+ running_loss = 0.0
520
+ epoch_loss = 0.0
521
+ shuffled_batches = data.shuffle_dataset(train_set_by_len, args.batch_size, args.batch_single_item)
522
+
523
+ model.train()
524
+ logger.info("Starting epoch %d", trainer.epochs_trained)
525
+ if args.log_norms:
526
+ model.log_norms()
527
+
528
+ for batch_num, batch in enumerate(shuffled_batches):
529
+ # logger.debug("Batch size %d max len %d" % (len(batch), max(len(x.text) for x in batch)))
530
+ trainer.global_step += 1
531
+ logger.debug("Starting batch: %d step %d", batch_num, trainer.global_step)
532
+
533
+ batch_labels = torch.stack([label_tensors[x.sentiment] for x in batch])
534
+
535
+ # zero the parameter gradients
536
+ for opt in optimizer.values():
537
+ opt.zero_grad()
538
+
539
+ outputs = model(batch)
540
+ outputs = process_outputs(outputs)
541
+ batch_loss = loss_function(outputs, batch_labels)
542
+ batch_loss.backward()
543
+ for opt in optimizer.values():
544
+ opt.step()
545
+
546
+ # print statistics
547
+ running_loss += batch_loss.item()
548
+ if (batch_num + 1) % args.tick == 0: # print every so many batches
549
+ train_loss = running_loss / args.tick
550
+ logger.info('[%d, %5d] Average loss: %.3f', trainer.epochs_trained + 1, batch_num + 1, train_loss)
551
+ if args.wandb:
552
+ wandb.log({'train_loss': train_loss}, step=trainer.global_step)
553
+ if args.dev_eval_batches > 0 and (batch_num + 1) % args.dev_eval_batches == 0:
554
+ logger.info('---- Interim analysis ----')
555
+ dev_score, accuracy, macro_f1 = score_dev_set(model, dev_set, args.dev_eval_scoring)
556
+ if args.wandb:
557
+ wandb.log({'accuracy': accuracy, 'macro_f1': macro_f1}, step=trainer.global_step)
558
+ if trainer.best_score is None or dev_score > trainer.best_score:
559
+ trainer.best_score = dev_score
560
+ trainer.save(model_file, save_optimizer=False)
561
+ logger.info("Saved new best score model! Accuracy %.5f Macro F1 %.5f Epoch %5d Batch %d" % (accuracy, macro_f1, trainer.epochs_trained+1, batch_num+1))
562
+ model.train()
563
+ if args.log_norms:
564
+ trainer.model.log_norms()
565
+ epoch_loss += running_loss
566
+ running_loss = 0.0
567
+ # Add any leftover loss to the epoch_loss
568
+ epoch_loss += running_loss
569
+
570
+ logger.info("Finished epoch %d Total loss %.3f" % (trainer.epochs_trained + 1, epoch_loss))
571
+ dev_score, accuracy, macro_f1 = score_dev_set(model, dev_set, args.dev_eval_scoring)
572
+ if args.wandb:
573
+ wandb.log({'accuracy': accuracy, 'macro_f1': macro_f1, 'epoch_loss': epoch_loss}, step=trainer.global_step)
574
+ if checkpoint_file:
575
+ trainer.save(checkpoint_file, epochs_trained = trainer.epochs_trained + 1)
576
+ if args.save_intermediate_models:
577
+ intermediate_file = intermediate_name(model_file, trainer.epochs_trained + 1, args.dev_eval_scoring, dev_score)
578
+ trainer.save(intermediate_file, save_optimizer=False)
579
+ if trainer.best_score is None or dev_score > trainer.best_score:
580
+ trainer.best_score = dev_score
581
+ trainer.save(model_file, save_optimizer=False)
582
+ logger.info("Saved new best score model! Accuracy %.5f Macro F1 %.5f Epoch %5d" % (accuracy, macro_f1, trainer.epochs_trained+1))
583
+
584
+ if args.wandb:
585
+ wandb.finish()
586
+
587
+ def main(args=None):
588
+ args = parse_args(args)
589
+ seed = utils.set_random_seed(args.seed)
590
+ logger.info("Using random seed: %d" % seed)
591
+
592
+ utils.ensure_dir(args.save_dir)
593
+
594
+ save_name = build_model_filename(args)
595
+
596
+ # TODO: maybe the dataset needs to be in a torch data loader in order to
597
+ # make cuda operations faster
598
+ checkpoint_file = None
599
+ if args.train:
600
+ train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len)
601
+ logger.info("Using training set: %s" % args.train_file)
602
+ logger.info("Training set has %d labels" % len(data.dataset_labels(train_set)))
603
+ tlogger.setLevel(logging.DEBUG)
604
+
605
+ tlogger.info("Saving checkpoints: %s", args.checkpoint)
606
+ if args.checkpoint:
607
+ checkpoint_file = utils.checkpoint_name(args.save_dir, save_name, args.checkpoint_save_name)
608
+ tlogger.info("Checkpoint filename: %s", checkpoint_file)
609
+ elif not args.load_name:
610
+ if save_name:
611
+ args.load_name = save_name
612
+ else:
613
+ raise ValueError("No model provided and not asked to train a model. This makes no sense")
614
+ else:
615
+ train_set = None
616
+
617
+ if args.train and checkpoint_file is not None and os.path.exists(checkpoint_file):
618
+ trainer = Trainer.load(checkpoint_file, args, load_optimizer=args.train)
619
+ elif args.load_name:
620
+ trainer = Trainer.load(args.load_name, args, load_optimizer=args.train)
621
+ else:
622
+ trainer = Trainer.build_new_model(args, train_set)
623
+
624
+ trainer.model.log_configuration()
625
+
626
+ if args.train:
627
+ utils.log_training_args(args, logger)
628
+
629
+ dev_set = data.read_dataset(args.dev_file, args.wordvec_type, min_len=None)
630
+ logger.info("Using dev set: %s", args.dev_file)
631
+ logger.info("Training set has %d items", len(train_set))
632
+ logger.info("Dev set has %d items", len(dev_set))
633
+ data.check_labels(trainer.model.labels, dev_set)
634
+
635
+ train_model(trainer, save_name, checkpoint_file, args, train_set, dev_set, trainer.model.labels)
636
+
637
+ if args.log_norms:
638
+ trainer.model.log_norms()
639
+ test_set = data.read_dataset(args.test_file, args.wordvec_type, min_len=None)
640
+ logger.info("Using test set: %s" % args.test_file)
641
+ data.check_labels(trainer.model.labels, test_set)
642
+
643
+ if args.test_remap_labels is None:
644
+ predictions = dataset_predictions(trainer.model, test_set)
645
+ confusion_matrix = confusion_dataset(predictions, test_set, trainer.model.labels)
646
+ if args.output_predictions:
647
+ logger.info("List of predictions: %s", predictions)
648
+ logger.info("Confusion matrix:\n{}".format(format_confusion(confusion_matrix, trainer.model.labels)))
649
+ correct, total = confusion_to_accuracy(confusion_matrix)
650
+ logger.info("Macro f1: {}".format(confusion_to_macro_f1(confusion_matrix)))
651
+ else:
652
+ correct = score_dataset(trainer.model, test_set,
653
+ remap_labels=args.test_remap_labels,
654
+ forgive_unmapped_labels=args.forgive_unmapped_labels)
655
+ total = len(test_set)
656
+ logger.info("Test set: %d correct of %d examples. Accuracy: %f" %
657
+ (correct, total, correct / total))
658
+
659
+ if __name__ == '__main__':
660
+ main()
stanza/stanza/models/parser.py ADDED
@@ -0,0 +1,406 @@
1
+ """
2
+ Entry point for training and evaluating a dependency parser.
3
+
4
+ This implementation combines a deep biaffine graph-based parser with linearization and distance features.
5
+ For details please refer to the paper: https://nlp.stanford.edu/pubs/qi2018universal.pdf.
6
+ """
7
+
8
+ """
9
+ Training and evaluation for the parser.
10
+ """
11
+
12
+ import sys
13
+ import os
14
+ import copy
15
+ import shutil
16
+ import time
17
+ import argparse
18
+ import logging
19
+ import numpy as np
20
+ import random
21
+ import torch
22
+ from torch import nn, optim
23
+
24
+ import stanza.models.depparse.data as data
25
+ from stanza.models.depparse.data import DataLoader
26
+ from stanza.models.depparse.trainer import Trainer
27
+ from stanza.models.depparse import scorer
28
+ from stanza.models.common import utils
29
+ from stanza.models.common import pretrain
30
+ from stanza.models.common.data import augment_punct
31
+ from stanza.models.common.doc import *
32
+ from stanza.models.common.peft_config import add_peft_args, resolve_peft_args
33
+ from stanza.utils.conll import CoNLL
34
+ from stanza.models import _training_logging
35
+
36
+ logger = logging.getLogger('stanza')
37
+
38
+ def build_argparse():
39
+ parser = argparse.ArgumentParser()
40
+ parser.add_argument('--data_dir', type=str, default='data/depparse', help='Root dir for saving models.')
41
+ parser.add_argument('--wordvec_dir', type=str, default='extern_data/word2vec', help='Directory of word vectors.')
42
+ parser.add_argument('--wordvec_file', type=str, default=None, help='Word vectors filename.')
43
+ parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')
44
+ parser.add_argument('--train_file', type=str, default=None, help='Input file for data loader.')
45
+ parser.add_argument('--eval_file', type=str, default=None, help='Input file for data loader.')
46
+ parser.add_argument('--output_file', type=str, default=None, help='Output CoNLL-U file.')
47
+ parser.add_argument('--no_gold_labels', dest='gold_labels', action='store_false', help="Don't score the eval file - perhaps it has no gold labels, for example. Cannot be used at training time")
48
+ parser.add_argument('--mode', default='train', choices=['train', 'predict'])
49
+ parser.add_argument('--lang', type=str, help='Language')
50
+ parser.add_argument('--shorthand', type=str, help="Treebank shorthand")
51
+
52
+ parser.add_argument('--hidden_dim', type=int, default=400)
53
+ parser.add_argument('--char_hidden_dim', type=int, default=400)
54
+ parser.add_argument('--deep_biaff_hidden_dim', type=int, default=400)
55
+ parser.add_argument('--composite_deep_biaff_hidden_dim', type=int, default=100)
56
+ parser.add_argument('--word_emb_dim', type=int, default=75)
57
+ parser.add_argument('--char_emb_dim', type=int, default=100)
58
+ parser.add_argument('--tag_emb_dim', type=int, default=50)
59
+ parser.add_argument('--no_upos', dest='use_upos', action='store_false', default=True, help="Don't use upos tags as part of the tag embedding")
60
+ parser.add_argument('--no_xpos', dest='use_xpos', action='store_false', default=True, help="Don't use xpos tags as part of the tag embedding")
61
+ parser.add_argument('--no_ufeats', dest='use_ufeats', action='store_false', default=True, help="Don't use ufeats as part of the tag embedding")
62
+ parser.add_argument('--transformed_dim', type=int, default=125)
63
+ parser.add_argument('--num_layers', type=int, default=3)
64
+ parser.add_argument('--char_num_layers', type=int, default=1)
65
+ parser.add_argument('--checkpoint_save_name', type=str, default=None, help="File name to save the most recent checkpoint")
66
+ parser.add_argument('--no_checkpoint', dest='checkpoint', action='store_false', help="Don't save checkpoints")
67
+ parser.add_argument('--pretrain_max_vocab', type=int, default=250000)
68
+ parser.add_argument('--word_dropout', type=float, default=0.33)
69
+ parser.add_argument('--dropout', type=float, default=0.5)
70
+ parser.add_argument('--rec_dropout', type=float, default=0, help="Recurrent dropout")
71
+ parser.add_argument('--char_rec_dropout', type=float, default=0, help="Recurrent dropout")
72
+
73
+ parser.add_argument('--no_char', dest='char', action='store_false', help="Turn off character model.")
74
+ parser.add_argument('--charlm', action='store_true', help="Turn on contextualized char embedding using pretrained character-level language model.")
75
+ parser.add_argument('--charlm_save_dir', type=str, default='saved_models/charlm', help="Root dir for pretrained character-level language model.")
76
+ parser.add_argument('--charlm_shorthand', type=str, default=None, help="Shorthand for character-level language model training corpus.")
77
+ parser.add_argument('--charlm_forward_file', type=str, default=None, help="Exact path to use for forward charlm")
78
+ parser.add_argument('--charlm_backward_file', type=str, default=None, help="Exact path to use for backward charlm")
79
+
80
+ parser.add_argument('--bert_model', type=str, default=None, help="Use an external bert model (requires the transformers package)")
81
+ parser.add_argument('--no_bert_model', dest='bert_model', action="store_const", const=None, help="Don't use bert")
82
+ parser.add_argument('--bert_hidden_layers', type=int, default=4, help="How many layers of hidden state to use from the transformer")
83
+ parser.add_argument('--bert_hidden_layers_original', action='store_const', const=None, dest='bert_hidden_layers', help='Use layers 2,3,4 of the Bert embedding')
84
+ parser.add_argument('--bert_finetune', default=False, action='store_true', help='Finetune the bert (or other transformer)')
85
+ parser.add_argument('--no_bert_finetune', dest='bert_finetune', action='store_false', help="Don't finetune the bert (or other transformer)")
86
+ parser.add_argument('--bert_finetune_layers', default=None, type=int, help='Only finetune this many layers from the transformer')
87
+ parser.add_argument('--bert_learning_rate', default=1.0, type=float, help='Scale the learning rate for transformer finetuning by this much')
88
+ parser.add_argument('--second_bert_learning_rate', default=1e-3, type=float, help='Secondary stage transformer finetuning learning rate scale')
89
+ parser.add_argument('--bert_start_finetuning', default=200, type=int, help='When to start finetuning the transformer')
90
+ parser.add_argument('--bert_warmup_steps', default=200, type=int, help='How many steps for a linear warmup when finetuning the transformer')
91
+ parser.add_argument('--bert_weight_decay', default=0.0, type=float, help='Weight decay bert parameters by this much')
92
+
93
+ parser.add_argument('--no_pretrain', dest='pretrain', action='store_false', help="Turn off pretrained embeddings.")
94
+ parser.add_argument('--no_linearization', dest='linearization', action='store_false', help="Turn off linearization term.")
95
+ parser.add_argument('--no_distance', dest='distance', action='store_false', help="Turn off distance term.")
96
+
97
+ parser.add_argument('--sample_train', type=float, default=1.0, help='Subsample training data.')
98
+ parser.add_argument('--optim', type=str, default='adam', help='sgd, adagrad, adam or adamax.')
99
+ parser.add_argument('--second_optim', type=str, default=None, help='sgd, adagrad, adam or adamax.')
100
+ parser.add_argument('--lr', type=float, default=3e-3, help='Learning rate')
101
+ parser.add_argument('--second_lr', type=float, default=3e-4, help='Secondary stage learning rate')
102
+ parser.add_argument('--weight_decay', type=float, default=None, help='Weight decay for the first optimizer')
103
+ parser.add_argument('--beta2', type=float, default=0.95)
104
+ parser.add_argument('--second_optim_start_step', type=int, default=None, help='If set, switch to the second optimizer when stalled or at this step regardless of performance. Normally, the optimizer only switches when the dev scores have stalled for --max_steps_before_stop steps')
105
+ parser.add_argument('--second_warmup_steps', type=int, default=200, help="If set, give the 2nd optimizer a linear warmup, since its gradient and squared-gradient statistics are unreliable when it first starts")
106
+
107
+ parser.add_argument('--max_steps', type=int, default=50000)
108
+ parser.add_argument('--eval_interval', type=int, default=100)
109
+ parser.add_argument('--checkpoint_interval', type=int, default=500)
110
+ parser.add_argument('--max_steps_before_stop', type=int, default=1000)
111
+ parser.add_argument('--batch_size', type=int, default=5000)
112
+ parser.add_argument('--second_batch_size', type=int, default=None, help='Use a different batch size for the second optimizer. Can be relevant for models with different transformer finetuning settings between optimizers, for example, where the larger batch size is not feasible when finetuning the transformer')
113
+ parser.add_argument('--max_grad_norm', type=float, default=1.0, help='Gradient clipping.')
114
+ parser.add_argument('--log_step', type=int, default=20, help='Print log every k steps.')
115
+ parser.add_argument('--log_norms', action='store_true', default=False, help='Log the norms of all the parameters (noisy!)')
116
+ parser.add_argument('--save_dir', type=str, default='saved_models/depparse', help='Root dir for saving models.')
117
+ parser.add_argument('--save_name', type=str, default="{shorthand}_{embedding}_parser.pt", help="File name to save the model")
118
+ parser.add_argument('--continue_from', type=str, default=None, help="File name to preload the model to continue training from")
119
+
120
+ parser.add_argument('--seed', type=int, default=1234)
121
+ add_peft_args(parser)
122
+ utils.add_device_args(parser)
123
+
124
+ parser.add_argument('--augment_nopunct', type=float, default=None, help='Augment the training data by copying this fraction of punct-ending sentences as non-punct. Default of None will aim for roughly 10%%')
125
+
126
+ parser.add_argument('--wandb', action='store_true', help='Start a wandb session and write the results of training. Only applies to training. Use --wandb_name instead to specify a name')
127
+ parser.add_argument('--wandb_name', default=None, help='Name of a wandb session to start when training. Will default to the dataset short name')
128
+ return parser
129
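+ # Illustrative invocation (file paths are placeholders, not from this commit):
+ #   python3 -m stanza.models.parser --mode train --shorthand en_ewt \
+ #       --train_file data/depparse/en_ewt.train.in.conllu \
+ #       --eval_file data/depparse/en_ewt.dev.in.conllu \
+ #       --output_file /tmp/en_ewt.dev.pred.conllu \
+ #       --wordvec_pretrain_file en_ewt.pretrain.pt
+ # All of these flags are defined above; during training the dev set is both
+ # predicted into --output_file and scored against --eval_file.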
+
130
+ def parse_args(args=None):
131
+ parser = build_argparse()
132
+ args = parser.parse_args(args=args)
133
+ resolve_peft_args(args, logger)
134
+
135
+ if args.wandb_name:
136
+ args.wandb = True
137
+
138
+ args = vars(args)
139
+ return args
140
+
141
+ def main(args=None):
142
+ args = parse_args(args=args)
143
+
144
+ utils.set_random_seed(args['seed'])
145
+
146
+ logger.info("Running parser in {} mode".format(args['mode']))
147
+
148
+ if args['mode'] == 'train':
149
+ return train(args)
150
+ else:
151
+ evaluate(args)
152
+
153
+ def model_file_name(args):
154
+ return utils.standard_model_file_name(args, "parser")
155
+
156
+ # TODO: refactor with everywhere
157
+ def load_pretrain(args):
158
+ pt = None
159
+ if args['pretrain']:
160
+ pretrain_file = pretrain.find_pretrain_file(args['wordvec_pretrain_file'], args['save_dir'], args['shorthand'], args['lang'])
161
+ if os.path.exists(pretrain_file):
162
+ vec_file = None
163
+ else:
164
+ vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
165
+ pt = pretrain.Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])
166
+ return pt
167
+
168
+ def predict_dataset(trainer, dev_batch):
169
+ dev_preds = []
170
+ if len(dev_batch) > 0:
171
+ for batch in dev_batch:
172
+ preds = trainer.predict(batch)
173
+ dev_preds += preds
174
+ dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)
175
+ return dev_preds
176
+
177
+ def train(args):
178
+ model_file = model_file_name(args)
179
+ utils.ensure_dir(os.path.split(model_file)[0])
180
+
181
+ # load pretrained vectors if needed
182
+ pretrain = load_pretrain(args)
183
+
184
+ # TODO: refactor. the exact same thing is done in the tagger
185
+ if args['charlm']:
186
+ if args['charlm_shorthand'] is None:
187
+ raise ValueError("CharLM Shorthand is required for loading pretrained CharLM model...")
188
+ logger.info('Using pretrained contextualized char embedding')
189
+ if not args['charlm_forward_file']:
190
+ args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand'])
191
+ if not args['charlm_backward_file']:
192
+ args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand'])
193
+
194
+ # load data
195
+ logger.info("Loading data with batch size {}...".format(args['batch_size']))
196
+ train_data, _, _ = CoNLL.conll2dict(input_file=args['train_file'])
197
+ # possibly augment the training data with some amount of fake data
198
+ # based on the options chosen
199
+ logger.info("Original data size: {}".format(len(train_data)))
200
+ train_data.extend(augment_punct(train_data, args['augment_nopunct'],
201
+ keep_original_sentences=False))
202
+ logger.info("Augmented data size: {}".format(len(train_data)))
203
+ train_doc = Document(train_data)
204
+ train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
205
+ vocab = train_batch.vocab
206
+ dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
207
+ dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)
208
+
209
+ # pred path
210
+ system_pred_file = args['output_file']
211
+
212
+ # skip training if the language does not have training or dev data
213
+ if len(train_batch) == 0 or len(dev_batch) == 0:
214
+ logger.info("Skip training because no data available...")
215
+ sys.exit(0)
216
+
217
+ if args['wandb']:
218
+ import wandb
219
+ wandb_name = args['wandb_name'] if args['wandb_name'] else "%s_depparse" % args['shorthand']
220
+ wandb.init(name=wandb_name, config=args)
221
+ wandb.run.define_metric('train_loss', summary='min')
222
+ wandb.run.define_metric('dev_score', summary='max')
223
+
224
+ logger.info("Training parser...")
225
+ checkpoint_file = None
226
+ if args.get("checkpoint"):
227
+ # calculate checkpoint file name from the save filename
228
+ checkpoint_file = utils.checkpoint_name(args.get("save_dir"), model_file, args.get("checkpoint_save_name"))
229
+ args["checkpoint_save_name"] = checkpoint_file
230
+
231
+ if args.get("checkpoint") and os.path.exists(args["checkpoint_save_name"]):
232
+ trainer = Trainer(args=args, pretrain=pretrain, vocab=vocab, model_file=args["checkpoint_save_name"], device=args['device'], ignore_model_config=True)
233
+ if len(trainer.dev_score_history) > 0:
234
+ logger.info("Continuing from checkpoint %s Model was previously trained for %d steps, with a best dev score of %.4f", args["checkpoint_save_name"], trainer.global_step, max(trainer.dev_score_history))
235
+ elif args["continue_from"]:
236
+ if not os.path.exists(args["continue_from"]):
237
+ raise FileNotFoundError("--continue_from specified, but the file %s does not exist" % args["continue_from"])
238
+ trainer = Trainer(args=args, pretrain=pretrain, vocab=vocab, model_file=args["continue_from"], device=args['device'], ignore_model_config=True, reset_history=True)
239
+ else:
240
+ trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, device=args['device'])
241
+
242
+ max_steps = args['max_steps']
243
+ current_lr = args['lr']
244
+ global_start_time = time.time()
245
+ format_str = 'Finished STEP {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
246
+
247
+ is_second_stage = False
248
+ # start training
249
+ train_loss = 0
250
+ if args['log_norms']:
251
+ trainer.model.log_norms()
252
+ while True:
253
+ do_break = False
254
+ for i, batch in enumerate(train_batch):
255
+ start_time = time.time()
256
+ trainer.global_step += 1
257
+ loss = trainer.update(batch, eval=False) # update step
258
+ train_loss += loss
259
+
260
+ # will checkpoint if we switch optimizers or score a new best score
261
+ force_checkpoint = False
262
+ if trainer.global_step % args['log_step'] == 0:
263
+ duration = time.time() - start_time
264
+ logger.info(format_str.format(trainer.global_step, max_steps, loss, duration, current_lr))
265
+
266
+ if trainer.global_step % args['eval_interval'] == 0:
267
+ # eval on dev
268
+ logger.info("Evaluating on dev set...")
269
+ dev_preds = predict_dataset(trainer, dev_batch)
270
+
271
+ dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
272
+ CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
273
+ _, _, dev_score = scorer.score(system_pred_file, args['eval_file'])
274
+
275
+ train_loss = train_loss / args['eval_interval'] # avg loss per batch
276
+ logger.info("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(trainer.global_step, train_loss, dev_score))
277
+
278
+ if args['wandb']:
279
+ wandb.log({'train_loss': train_loss, 'dev_score': dev_score})
280
+
281
+ train_loss = 0
282
+
283
+ # save best model
284
+ trainer.dev_score_history += [dev_score]
285
+ if dev_score >= max(trainer.dev_score_history):
286
+ trainer.last_best_step = trainer.global_step
287
+ trainer.save(model_file)
288
+ logger.info("new best model saved.")
289
+ force_checkpoint = True
290
+
291
+ for scheduler_name, scheduler in trainer.scheduler.items():
292
+ logger.info('scheduler %s learning rate: %s', scheduler_name, scheduler.get_last_lr())
293
+ if args['log_norms']:
294
+ trainer.model.log_norms()
295
+
296
+ if not is_second_stage and args.get('second_optim', None) is not None:
297
+ if trainer.global_step - trainer.last_best_step >= args['max_steps_before_stop'] or (args['second_optim_start_step'] is not None and trainer.global_step >= args['second_optim_start_step']):
298
+ logger.info("Switching to second optimizer: {}".format(args.get('second_optim', None)))
299
+ global_step = trainer.global_step
300
+ args["second_stage"] = True
301
+ # if the loader gets a model file, it uses secondary optimizer
302
+ # (because of the second_stage = True argument)
303
+ trainer = Trainer(args=args, vocab=trainer.vocab, pretrain=pretrain,
304
+ model_file=model_file, device=args['device'])
305
+ logger.info('Reloading best model to continue from current local optimum')
306
+
307
+ dev_preds = predict_dataset(trainer, dev_batch)
308
+ dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
309
+ CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
310
+ _, _, dev_score = scorer.score(system_pred_file, args['eval_file'])
311
+ logger.info("Reloaded model with dev score %.4f", dev_score)
312
+
313
+ is_second_stage = True
314
+ trainer.global_step = global_step
315
+ trainer.last_best_step = global_step
316
+ if args['second_batch_size'] is not None:
317
+ train_batch.set_batch_size(args['second_batch_size'])
318
+ force_checkpoint = True
319
+ else:
320
+ if trainer.global_step - trainer.last_best_step >= args['max_steps_before_stop']:
321
+ do_break = True
322
+ break
323
+
324
+ if trainer.global_step % args['eval_interval'] == 0 or force_checkpoint:
325
+ # if we need to save checkpoint, do so
326
+ # (save after switching the optimizer, if applicable, so that
327
+ # the new optimizer is the optimizer used if a restart happens)
328
+ if checkpoint_file is not None:
329
+ trainer.save(checkpoint_file, save_optimizer=True)
330
+ logger.info("new model checkpoint saved.")
331
+
332
+ if trainer.global_step >= args['max_steps']:
333
+ do_break = True
334
+ break
335
+
336
+ if do_break: break
337
+
338
+ train_batch.reshuffle()
339
+
340
+ logger.info("Training ended with {} steps.".format(trainer.global_step))
341
+
342
+ if args['wandb']:
343
+ wandb.finish()
344
+
345
+ if len(trainer.dev_score_history) > 0:
346
+ # TODO: technically the iteration position will be wrong if
347
+ # the eval_interval changed when running from a checkpoint
348
+ # could fix this by saving step & score instead of just score
349
+ best_f, best_eval = max(trainer.dev_score_history)*100, np.argmax(trainer.dev_score_history)+1
350
+ logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
351
+ else:
352
+ logger.info("Dev set never evaluated. Saving final model.")
353
+ trainer.save(model_file)
354
+
355
+ return trainer
356
+
357
+ def evaluate(args):
358
+ model_file = model_file_name(args)
359
+ # load pretrained vectors if needed
360
+ pretrain = load_pretrain(args)
361
+
362
+ load_args = {'charlm_forward_file': args.get('charlm_forward_file', None),
363
+ 'charlm_backward_file': args.get('charlm_backward_file', None)}
364
+
365
+ # load model
366
+ logger.info("Loading model from: {}".format(model_file))
367
+ trainer = Trainer(pretrain=pretrain, model_file=model_file, device=args['device'], args=load_args)
368
+ return evaluate_trainer(args, trainer, pretrain)
369
+
370
+ def evaluate_trainer(args, trainer, pretrain):
371
+ system_pred_file = args['output_file']
372
+ loaded_args, vocab = trainer.args, trainer.vocab
373
+
374
+ # load config
375
+ for k in args:
376
+ if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
377
+ loaded_args[k] = args[k]
378
+
379
+ # load data
380
+ logger.info("Loading data with batch size {}...".format(args['batch_size']))
381
+ doc = CoNLL.conll2doc(input_file=args['eval_file'])
382
+ batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)
383
+
384
+ preds = predict_dataset(trainer, batch)
385
+
386
+ # write to file and score
387
+ batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
388
+ CoNLL.write_doc2conll(batch.doc, system_pred_file)
389
+
390
+ if args['gold_labels']:
391
+ gold_doc = CoNLL.conll2doc(input_file=args['eval_file'])
392
+
393
+ # Check for None ... otherwise an inscrutable error occurs later in the scorer
394
+ for sent_idx, sentence in enumerate(gold_doc.sentences):
395
+ for word_idx, word in enumerate(sentence.words):
396
+ if word.deprel is None:
397
+ raise ValueError("Gold document {} has a None at sentence {} word {}\n{:C}".format(args['eval_file'], sent_idx, word_idx, sentence))
398
+
399
+ scorer.score_named_dependencies(batch.doc, gold_doc)
400
+ _, _, score = scorer.score(system_pred_file, args['eval_file'])
401
+
402
+ logger.info("Parser score:")
403
+ logger.info("{} {:.2f}".format(args['shorthand'], score*100))
404
+
405
+ if __name__ == '__main__':
406
+ main()
stanza/stanza/resources/__init__.py ADDED
File without changes
stanza/stanza/server/java_protobuf_requests.py ADDED
@@ -0,0 +1,357 @@
1
+ from collections import deque
2
+ import subprocess
3
+
4
+ from stanza.models.common.utils import misc_to_space_after
5
+ from stanza.models.constituency.parse_tree import Tree
6
+ from stanza.protobuf import DependencyGraph, FlattenedParseTree
7
+ from stanza.server.client import resolve_classpath
8
+
9
+ def send_request(request, response_type, java_main, classpath=None):
10
+ """
11
+ Use subprocess to run a Java protobuf processor on the given request
12
+
13
+ Returns the protobuf response
14
+ """
15
+ classpath = resolve_classpath(classpath)
16
+ if classpath is None:
17
+ raise ValueError("Classpath is None. Perhaps you need to set the $CLASSPATH or $CORENLP_HOME environment variable to point to a CoreNLP install.")
18
+ pipe = subprocess.run(["java", "-cp", classpath, java_main],
19
+ input=request.SerializeToString(),
20
+ stdout=subprocess.PIPE,
21
+ check=True)
22
+ response = response_type()
23
+ response.ParseFromString(pipe.stdout)
24
+ return response
25
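+ # Usage sketch (illustrative only; the Java class name below is hypothetical):
+ #   response = send_request(my_request, MyResponseProto, "edu.stanford.nlp.SomeProcessor")
+ # Callers provide a serialized protobuf request, the expected response type,
+ # and the fully qualified Java main class, which reads the request from stdin
+ # and writes the response to stdout.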
+
26
+ def add_tree_nodes(proto_tree, tree, score):
27
+ # add an open node
28
+ node = proto_tree.nodes.add()
29
+ node.openNode = True
30
+ if score is not None:
31
+ node.score = score
32
+
33
+ # add the content of this node
34
+ node = proto_tree.nodes.add()
35
+ node.value = tree.label
36
+
37
+ # add all children...
38
+ # leaves get just one node
39
+ # branches are called recursively
40
+ for child in tree.children:
41
+ if child.is_leaf():
42
+ node = proto_tree.nodes.add()
43
+ node.value = child.label
44
+ else:
45
+ add_tree_nodes(proto_tree, child, None)
46
+
47
+ node = proto_tree.nodes.add()
48
+ node.closeNode = True
49
+
50
+ def build_tree(tree, score):
51
+ """
52
+ Builds a FlattenedParseTree from CoreNLP.proto
53
+
54
+ Populates the value field from tree.label and iterates through the
55
+ children via tree.children. Should work on any tree structure
56
+ which follows that layout
57
+
58
+ The score will be added to the top node (if it is not None)
59
+
60
+ Operates by recursively calling add_tree_nodes
61
+ """
62
+ proto_tree = FlattenedParseTree()
63
+ add_tree_nodes(proto_tree, tree, score)
64
+ return proto_tree
65
+
66
+ def from_tree(proto_tree):
67
+ """
68
+ Convert a FlattenedParseTree back into a Tree
69
+
70
+ returns Tree, score
71
+ (score might be None if it is missing)
72
+ """
73
+ score = None
74
+ stack = deque()
75
+ for node in proto_tree.nodes:
76
+ if node.HasField("score") and score is None:
77
+ score = node.score
78
+
79
+ if node.openNode:
80
+ if len(stack) > 0 and isinstance(stack[-1], FlattenedParseTree.Node) and stack[-1].openNode:
81
+ raise ValueError("Got a proto with no label on a node: {}".format(proto_tree))
82
+ stack.append(node)
83
+ continue
84
+ if not node.closeNode:
85
+ child = Tree(label=node.value)
86
+ # TODO: do something with the score
87
+ stack.append(child)
88
+ continue
89
+
90
+ # must be a close operation...
91
+ if len(stack) <= 1:
92
+ raise ValueError("Got a proto with too many close operations: {}".format(proto_tree))
93
+ # on a close operation, pop until we hit the open
94
+ # then turn everything in that span into a new node
95
+ children = []
96
+ nextNode = stack.pop()
97
+ while not isinstance(nextNode, FlattenedParseTree.Node):
98
+ children.append(nextNode)
99
+ nextNode = stack.pop()
100
+ if len(children) == 0:
101
+ raise ValueError("Got a proto with an open immediately followed by a close: {}".format(proto_tree))
102
+ children.reverse()
103
+ label = children[0]
104
+ children = children[1:]
105
+ subtree = Tree(label=label.label, children=children)
106
+ stack.append(subtree)
107
+
108
+ if len(stack) > 1:
109
+ raise ValueError("Got a proto which does not close all of the nodes: {}".format(proto_tree))
110
+ tree = stack.pop()
111
+ if not isinstance(tree, Tree):
112
+ raise ValueError("Got a proto which was just one Open operation: {}".format(proto_tree))
113
+ return tree, score
114
+
115
+ def add_token(token_list, word, token):
116
+ """
117
+ Add a token to a proto request.
118
+
119
+ CoreNLP tokens have components of both word and token from stanza.
120
+
121
+ We pass along "after" but not "before"
122
+ """
123
+ if token is None and isinstance(word.id, int):
124
+ raise AssertionError("Only expected word w/o token for 'extra' words")
125
+
126
+ query_token = token_list.add()
127
+ query_token.word = word.text
128
+ query_token.value = word.text
129
+ if word.lemma is not None:
130
+ query_token.lemma = word.lemma
131
+ if word.xpos is not None:
132
+ query_token.pos = word.xpos
133
+ if word.upos is not None:
134
+ query_token.coarseTag = word.upos
135
+ if word.feats and word.feats != "_":
136
+ for feature in word.feats.split("|"):
137
+ key, value = feature.split("=", maxsplit=1)
138
+ query_token.conllUFeatures.key.append(key)
139
+ query_token.conllUFeatures.value.append(value)
140
+ if token is not None:
141
+ if token.ner is not None:
142
+ query_token.ner = token.ner
143
+ if token is not None and len(token.id) > 1:
144
+ query_token.mwtText = token.text
145
+ query_token.isMWT = True
146
+ query_token.isFirstMWT = token.id[0] == word.id
147
+ if token.id[-1] != word.id:
148
+ # if we are not the last word of an MWT token
149
+ # we are absolutely not followed by space
150
+ pass
151
+ else:
152
+ query_token.after = token.spaces_after
153
+
154
+ query_token.index = word.id
155
+ else:
156
+ # presumably empty words won't really be written this way,
157
+ # but we can still keep track of it
158
+ query_token.after = misc_to_space_after(word.misc)
159
+
160
+ query_token.index = word.id[0]
161
+ query_token.emptyIndex = word.id[1]
162
+
163
+ if word.misc and word.misc != "_":
164
+ query_token.conllUMisc = word.misc
165
+ if token is not None and token.misc and token.misc != "_":
166
+ query_token.mwtMisc = token.misc
167
+
168
+ def add_sentence(request_sentences, sentence, num_tokens):
169
+ """
170
+ Add the tokens for this stanza sentence to a list of protobuf sentences
171
+ """
172
+ request_sentence = request_sentences.add()
173
+ request_sentence.tokenOffsetBegin = num_tokens
174
+ request_sentence.tokenOffsetEnd = num_tokens + sum(len(token.words) for token in sentence.tokens)
175
+ for token in sentence.tokens:
176
+ for word in token.words:
177
+ add_token(request_sentence.token, word, token)
178
+ return request_sentence
179
+
180
+ def add_word_to_graph(graph, word, sent_idx, word_idx):
181
+ """
182
+ Add a node and possibly an edge for a word in a basic dependency graph.
183
+ """
184
+ node = graph.node.add()
185
+ node.sentenceIndex = sent_idx+1
186
+ node.index = word_idx+1
187
+
188
+ if word.head != 0 and word.head is not None:
189
+ edge = graph.edge.add()
190
+ edge.source = word.head
191
+ edge.target = word_idx+1
192
+ if word.deprel is not None:
193
+ edge.dep = word.deprel
194
+ else:
195
+ # the receiving side doesn't like null as a dependency
196
+ edge.dep = "_"
197
+
198
+ def convert_networkx_graph(graph_proto, sentence, sent_idx):
199
+ """
200
+ Turns a networkx graph into a DependencyGraph from the proto file
201
+ """
202
+ for token in sentence.tokens:
203
+ for word in token.words:
204
+ add_token(graph_proto.token, word, token)
205
+ for word in sentence.empty_words:
206
+ add_token(graph_proto.token, word, None)
207
+
208
+ dependencies = sentence._enhanced_dependencies
209
+ for target in dependencies:
210
+ if target == 0:
211
+ # don't need to send the explicit root
212
+ continue
213
+ for source in dependencies.predecessors(target):
214
+ if source == 0:
215
+ # unlike with basic, we need to send over the roots,
216
+ # as the enhanced can have loops
217
+ graph_proto.rootNode.append(len(graph_proto.node))
218
+ continue
219
+ for deprel in dependencies.get_edge_data(source, target):
220
+ edge = graph_proto.edge.add()
221
+ if isinstance(source, int):
222
+ edge.source = source
223
+ else:
224
+ edge.source = source[0]
225
+ if source[1] != 0:
226
+ edge.sourceEmpty = source[1]
227
+ if isinstance(target, int):
228
+ edge.target = target
229
+ else:
230
+ edge.target = target[0]
231
+ if target[1] != 0:
232
+ edge.targetEmpty = target[1]
233
+ edge.dep = deprel
234
+ node = graph_proto.node.add()
235
+ node.sentenceIndex = sent_idx + 1
236
+ # the nodes in the networkx graph are indexed from 1, not counting the root
237
+ if isinstance(target, int):
238
+ node.index = target
239
+ else:
240
+ node.index = target[0]
241
+ if target[1] != 0:
242
+ node.emptyIndex = target[1]
243
+ return graph_proto
244
+
245
+ def features_to_string(features):
246
+ if not features:
247
+ return None
248
+ if len(features.key) == 0:
249
+ return None
250
+ return "|".join("%s=%s" % (key, value) for key, value in zip(features.key, features.value))
251
+
252
+ def misc_space_pieces(misc):
253
+ """
254
+ Return only the space-related misc pieces
255
+ """
256
+ if misc is None or misc == "" or misc == "_":
257
+ return misc
258
+ pieces = misc.split("|")
259
+ pieces = [x for x in pieces if x.split("=", maxsplit=1)[0] in ("SpaceAfter", "SpacesAfter", "SpacesBefore")]
260
+ if len(pieces) > 0:
261
+ return "|".join(pieces)
262
+ return None
263
+
264
+ def remove_space_misc(misc):
265
+ """
266
+ Remove any pieces from misc which are space-related
267
+ """
268
+ if misc is None or misc == "" or misc == "_":
269
+ return misc
270
+ pieces = misc.split("|")
271
+ pieces = [x for x in pieces if x.split("=", maxsplit=1)[0] not in ("SpaceAfter", "SpacesAfter", "SpacesBefore")]
272
+ if len(pieces) > 0:
273
+ return "|".join(pieces)
274
+ return None
275
+
276
+ def substitute_space_misc(misc, space_misc):
277
+ space_misc_pieces = space_misc.split("|") if space_misc else []
278
+ space_misc_after = None
279
+ space_misc_before = None
280
+ for piece in space_misc_pieces:
281
+ if piece.startswith("SpaceBefore"):
282
+ space_misc_before = piece
283
+ elif piece.startswith("SpaceAfter") or piece.startswith("SpacesAfter"):
284
+ space_misc_after = piece
285
+ else:
286
+ raise AssertionError("An unknown piece wound up in the misc space fields: %s" % piece)
287
+
288
+ pieces = misc.split("|")
289
+ new_pieces = []
290
+ for piece in pieces:
291
+ if piece.startswith("SpaceBefore"):
292
+ if space_misc_before:
293
+ new_pieces.append(space_misc_before)
294
+ space_misc_before = None
295
+ elif piece.startswith("SpaceAfter") or piece.startswith("SpacesAfter"):
296
+ if space_misc_after:
297
+ new_pieces.append(space_misc_after)
298
+ space_misc_after = None
299
+ else:
300
+ new_pieces.append(piece)
301
+ if space_misc_after:
302
+ new_pieces.append(space_misc_after)
303
+ if space_misc_before:
304
+ new_pieces.append(space_misc_before)
305
+ if len(new_pieces) == 0:
306
+ return None
307
+ return "|".join(new_pieces)
308
+
309
+ class JavaProtobufContext(object):
310
+ """
311
+ A generic context for sending requests to a java program using protobufs in a subprocess
312
+ """
313
+ def __init__(self, classpath, build_response, java_main, extra_args=None):
314
+ self.classpath = resolve_classpath(classpath)
315
+ self.build_response = build_response
316
+ self.java_main = java_main
317
+
318
+ if extra_args is None:
319
+ extra_args = []
320
+ self.extra_args = extra_args
321
+ self.pipe = None
322
+
323
+ def open_pipe(self):
324
+ self.pipe = subprocess.Popen(["java", "-cp", self.classpath, self.java_main, "-multiple"] + self.extra_args,
325
+ stdin=subprocess.PIPE,
326
+ stdout=subprocess.PIPE)
327
+
328
+ def close_pipe(self):
329
+ if self.pipe.poll() is None:
330
+ self.pipe.stdin.write((0).to_bytes(4, 'big'))
331
+ self.pipe.stdin.flush()
332
+ self.pipe = None
333
+
334
+ def __enter__(self):
335
+ self.open_pipe()
336
+ return self
337
+
338
+ def __exit__(self, type, value, traceback):
339
+ self.close_pipe()
340
+
341
+ def process_request(self, request):
342
+ if self.pipe is None:
343
+ raise RuntimeError("Pipe to java process is not open or was closed")
344
+
345
+ text = request.SerializeToString()
346
+ self.pipe.stdin.write(len(text).to_bytes(4, 'big'))
347
+ self.pipe.stdin.write(text)
348
+ self.pipe.stdin.flush()
349
+ response_length = self.pipe.stdout.read(4)
350
+ if len(response_length) < 4:
351
+ raise BrokenPipeError("Could not communicate with java process!")
352
+ response_length = int.from_bytes(response_length, "big")
353
+ response_text = self.pipe.stdout.read(response_length)
354
+ response = self.build_response()
355
+ response.ParseFromString(response_text)
356
+ return response
357
+
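The length-prefix framing used by JavaProtobufContext.process_request and close_pipe above is easy to miss among the proto plumbing. As a rough, self-contained illustration (not part of the repository), the protocol on the pipe boils down to:

```python
# Sketch only: mirrors the framing in JavaProtobufContext above.
# Every message is preceded by its length as a 4-byte big-endian integer;
# a length of 0 tells the Java side (launched with -multiple) to shut down.
import io

def frame(payload: bytes) -> bytes:
    return len(payload).to_bytes(4, 'big') + payload

def unframe(stream) -> bytes:
    length = int.from_bytes(stream.read(4), 'big')
    return stream.read(length)

message = b"serialized protobuf request"
stream = io.BytesIO(frame(message))
assert unframe(stream) == message
```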
stanza/stanza/server/main.py ADDED
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Simple shell program to pipe in documents, annotate them with a CoreNLP client, and print the annotations
5
+ """
6
+
7
+ import corenlp
8
+
9
+ import json
10
+ import re
11
+ import csv
12
+ import sys
13
+ from collections import namedtuple, OrderedDict
14
+
15
+ FLOAT_RE = re.compile(r"\d*\.\d+")
16
+ INT_RE = re.compile(r"\d+")
17
+
18
+ def dictstr(arg):
19
+ """
20
+ Parse a key=value string as a tuple (key, value) that can be provided as an argument to dict()
21
+ """
22
+ key, value = arg.split("=")
23
+
24
+ if value.lower() == "true" or value.lower() == "false":
25
+ value = value.lower() == "true"  # bool("false") would otherwise be truthy
26
+ elif FLOAT_RE.match(value):
27
+ value = float(value)
28
+ elif INT_RE.match(value):
29
+ value = int(value)
30
+ return (key, value)
31
+
32
+
33
+ def do_annotate(args):
34
+ args.props = dict(args.props) if args.props else {}
35
+ if args.sentence_mode:
36
+ args.props["ssplit.isOneSentence"] = True
37
+
38
+ with corenlp.CoreNLPClient(annotators=args.annotators, properties=args.props, be_quiet=not args.verbose_server) as client:
39
+ for line in args.input:
40
+ if line.startswith("#"): continue
41
+
42
+ ann = client.annotate(line.strip(), output_format=args.format)
43
+
44
+ if args.format == "json":
45
+ if args.sentence_mode:
46
+ ann = ann["sentences"][0]
47
+
48
+ args.output.write(json.dumps(ann))
49
+ args.output.write("\n")
50
+
51
+ def main():
52
+ import argparse
53
+ parser = argparse.ArgumentParser(description='Annotate data')
54
+ parser.add_argument('-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="Input file to process; each line contains one document (default: stdin)")
55
+ parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout, help="File to write annotations to (default: stdout)")
56
+ parser.add_argument('-f', '--format', choices=["json",], default="json", help="Output format")
57
+ parser.add_argument('-a', '--annotators', nargs="+", type=str, default=["tokenize ssplit lemma pos"], help="A list of annotators")
58
+ parser.add_argument('-s', '--sentence-mode', action="store_true",help="Assume each line of input is a sentence.")
59
+ parser.add_argument('-v', '--verbose-server', action="store_true",help="Server is made verbose")
60
+ parser.add_argument('-m', '--memory', type=str, default="4G", help="Memory to use for the server")
61
+ parser.add_argument('-p', '--props', nargs="+", type=dictstr, help="Properties as a list of key=value pairs")
62
+ parser.set_defaults(func=do_annotate)
63
+
64
+ ARGS = parser.parse_args()
65
+ if ARGS.func is None:
66
+ parser.print_help()
67
+ sys.exit(1)
68
+ else:
69
+ ARGS.func(ARGS)
70
+
71
+ if __name__ == "__main__":
72
+ main()
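For reference, a hypothetical (not shipped) use of dictstr shows how the -p key=value properties end up as a dict passed to the CoreNLP client; the stanza.server.main module path is assumed from the file location above:

```python
# Illustrative only: each -p argument is parsed into a (key, value) tuple,
# and the tuples are fed to dict().
from stanza.server.main import dictstr

props = dict([dictstr("ssplit.isOneSentence=true"),
              dictstr("parse.maxlen=70")])
# props == {"ssplit.isOneSentence": True, "parse.maxlen": 70}
```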
stanza/stanza/server/morphology.py ADDED
@@ -0,0 +1,81 @@
1
+ """
2
+ Direct pipe connection to the Java CoreNLP Morphology class
3
+
4
+ Only effective for English. Must be supplied with PTB scheme xpos, not upos
5
+ """
6
+
7
+
8
+ from stanza.protobuf import MorphologyRequest, MorphologyResponse
9
+ from stanza.server.java_protobuf_requests import send_request, JavaProtobufContext
10
+
11
+
12
+ MORPHOLOGY_JAVA = "edu.stanford.nlp.process.ProcessMorphologyRequest"
13
+
14
+ def send_morphology_request(request):
15
+ return send_request(request, MorphologyResponse, MORPHOLOGY_JAVA)
16
+
17
+ def build_request(words, xpos_tags):
18
+ """
19
+ Turn a list of words and a list of tags into a request
20
+
21
+ tags must be xpos, not upos
22
+ """
23
+ request = MorphologyRequest()
24
+ for word, tag in zip(words, xpos_tags):
25
+ tagged_word = request.words.add()
26
+ tagged_word.word = word
27
+ tagged_word.xpos = tag
28
+ return request
29
+
30
+
31
+ def process_text(words, xpos_tags):
32
+ """
33
+ Get the lemmata for each word/tag pair
34
+
35
+ Currently the return is a MorphologyResponse from CoreNLP.proto
36
+
37
+ tags must be xpos, not upos
38
+ """
39
+ request = build_request(words, xpos_tags)
40
+
41
+ return send_morphology_request(request)
42
+
43
+
44
+
45
+ class Morphology(JavaProtobufContext):
46
+ """
47
+ Morphology context window
48
+
49
+ This is a context window which keeps a process open. Should allow
50
+ for multiple requests without launching new java processes each time.
51
+
52
+ (much faster than calling process_text over and over)
53
+ """
54
+ def __init__(self, classpath=None):
55
+ super(Morphology, self).__init__(classpath, MorphologyResponse, MORPHOLOGY_JAVA)
56
+
57
+ def process(self, words, xpos_tags):
58
+ """
59
+ Get the lemmata for each word/tag pair
60
+ """
61
+ request = build_request(words, xpos_tags)
62
+ return self.process_request(request)
63
+
64
+
65
+ def main():
66
+ # TODO: turn this into a unit test, once a new CoreNLP is released
67
+ words = ["Jennifer", "has", "the", "prettiest", "antennae"]
68
+ tags = ["NNP", "VBZ", "DT", "JJS", "NNS"]
69
+ expected = ["Jennifer", "have", "the", "pretty", "antenna"]
70
+ result = process_text(words, tags)
71
+ lemma = [x.lemma for x in result.words]
72
+ print(lemma)
73
+ assert lemma == expected
74
+
75
+ with Morphology() as morph:
76
+ result = morph.process(words, tags)
77
+ lemma = [x.lemma for x in result.words]
78
+ assert lemma == expected
79
+
80
+ if __name__ == '__main__':
81
+ main()
stanza/stanza/server/parser_eval.py ADDED
@@ -0,0 +1,89 @@
1
+ """
2
+ This class runs a Java process to evaluate a treebank prediction using CoreNLP
3
+ """
4
+
5
+ from collections import namedtuple
6
+ import sys
7
+
8
+ import stanza
9
+ from stanza.protobuf import EvaluateParserRequest, EvaluateParserResponse
10
+ from stanza.server.java_protobuf_requests import send_request, build_tree, JavaProtobufContext
11
+ from stanza.models.constituency.tree_reader import read_treebank
12
+
13
+ EVALUATE_JAVA = "edu.stanford.nlp.parser.metrics.EvaluateExternalParser"
14
+
15
+ ParseResult = namedtuple("ParseResult", ['gold', 'predictions', 'state', 'constituents'])
16
+ ScoredTree = namedtuple("ScoredTree", ['tree', 'score'])
17
+
18
+ def build_request(treebank):
19
+ """
20
+ treebank should be a list of pairs: [gold, predictions]
21
+ each predictions is a list of tuples (prediction, score, state)
22
+ state is ignored and can be None
23
+ Note that for now, only one tree is measured, but this may be extensible in the future
24
+ Trees should be in the form of a Tree from parse_tree.py
25
+ """
26
+ request = EvaluateParserRequest()
27
+ for raw_result in treebank:
28
+ gold = raw_result.gold
29
+ predictions = raw_result.predictions
30
+ parse_result = request.treebank.add()
31
+ parse_result.gold.CopyFrom(build_tree(gold, None))
32
+ for pred in predictions:
33
+ if isinstance(pred, tuple):
34
+ prediction, score = pred
35
+ else:
36
+ prediction = pred
37
+ score = None
38
+ try:
39
+ parse_result.predicted.append(build_tree(prediction, score))
40
+ except Exception as e:
41
+ raise RuntimeError("Unable to build parser request from tree {}".format(pred)) from e
42
+
43
+ return request
44
+
45
+ def collate(gold_treebank, predictions_treebank):
46
+ """
47
+ Turns a list of gold and prediction into a evaluation object
48
+ """
49
+ treebank = []
50
+ for gold, prediction in zip(gold_treebank, predictions_treebank):
51
+ result = ParseResult(gold, [prediction], None, None)
52
+ treebank.append(result)
53
+ return treebank
54
+
55
+
56
+ class EvaluateParser(JavaProtobufContext):
57
+ """
58
+ Parser evaluation context window
59
+
60
+ This is a context window which keeps a process open. Should allow
61
+ for multiple requests without launching new java processes each time.
62
+ """
63
+ def __init__(self, classpath=None, kbest=None, silent=False):
64
+ if kbest is not None:
65
+ extra_args = ["-evalPCFGkBest", "{}".format(kbest), "-evals", "pcfgTopK"]
66
+ else:
67
+ extra_args = []
68
+
69
+ if silent:
70
+ extra_args.extend(["-evals", "summary=False"])
71
+
72
+ super(EvaluateParser, self).__init__(classpath, EvaluateParserResponse, EVALUATE_JAVA, extra_args=extra_args)
73
+
74
+ def process(self, treebank):
75
+ request = build_request(treebank)
76
+ return self.process_request(request)
77
+
78
+
79
+ def main():
80
+ gold = read_treebank(sys.argv[1])
81
+ predictions = read_treebank(sys.argv[2])
82
+ treebank = collate(gold, predictions)
83
+
84
+ with EvaluateParser() as ep:
85
+ ep.process(treebank)
86
+
87
+
88
+ if __name__ == '__main__':
89
+ main()
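Beyond the file-based main() above, a hedged sketch (not in the repository) of scoring an in-memory prediction against gold, assuming CoreNLP is available on the classpath; the f1 field on the response is an assumption about EvaluateParserResponse:

```python
# Sketch only: builds trees in memory with tree_reader.read_trees,
# then scores them with the EvaluateParser context defined above.
from stanza.models.constituency import tree_reader
from stanza.server.parser_eval import EvaluateParser, collate

gold = tree_reader.read_trees("( (S (NP (DT The) (NN cat)) (VP (VBD slept)) (. .)))")
pred = tree_reader.read_trees("( (S (NP (DT The) (NN cat)) (VP (VBD slept)) (. .)))")

with EvaluateParser() as evaluator:
    response = evaluator.process(collate(gold, pred))
    # assumption: the EvaluateParserResponse carries an overall f1 field
    print(response.f1)
```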
stanza/stanza/server/tokensregex.py ADDED
@@ -0,0 +1,44 @@
1
+ """Invokes the Java tokensregex on a document
2
+
3
+ This operates tokensregex on docs processed with stanza models.
4
+
5
+ https://nlp.stanford.edu/software/tokensregex.html
6
+
7
+ A minimal example is the main method of this module.
8
+ """
9
+
10
+ import stanza
11
+
12
+ from stanza.protobuf import TokensRegexRequest, TokensRegexResponse
13
+ from stanza.server.java_protobuf_requests import send_request, add_sentence
14
+
15
+ def send_tokensregex_request(request):
16
+ return send_request(request, TokensRegexResponse,
17
+ "edu.stanford.nlp.ling.tokensregex.ProcessTokensRegexRequest")
18
+
19
+ def process_doc(doc, *patterns):
20
+ request = TokensRegexRequest()
21
+ for pattern in patterns:
22
+ request.pattern.append(pattern)
23
+
24
+ request_doc = request.doc
25
+ request_doc.text = doc.text
26
+ num_tokens = 0
27
+ for sentence in doc.sentences:
28
+ add_sentence(request_doc.sentence, sentence, num_tokens)
29
+ num_tokens = num_tokens + sum(len(token.words) for token in sentence.tokens)
30
+
31
+ return send_tokensregex_request(request)
32
+
33
+ def main():
34
+ #nlp = stanza.Pipeline('en',
35
+ # processors='tokenize,pos,lemma,ner')
36
+ nlp = stanza.Pipeline('en',
37
+ processors='tokenize')
38
+
39
+ doc = nlp('Uro ruined modern. Fortunately, Wotc banned him')
40
+ print(process_doc(doc, "him", "ruined"))
41
+
42
+
43
+ if __name__ == '__main__':
44
+ main()
stanza/stanza/server/tsurgeon.py ADDED
@@ -0,0 +1,95 @@
1
+ """Invokes the Java tsurgeon on a list of trees
2
+
3
+ Included with CoreNLP is a mechanism for modifying trees based on
4
+ existing patterns within a tree. The patterns are found using tregex:
5
+
6
+ https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/trees/tregex/TregexPattern.html
7
+
8
+ The modifications are then performed using tsurgeon:
9
+
10
+ https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/trees/tregex/tsurgeon/Tsurgeon.html
11
+
12
+ This module accepts Tree objects as produced by the conparser and
13
+ returns the modified trees that result from one or more tsurgeon
14
+ operations.
15
+ """
16
+
17
+ from stanza.models.constituency import tree_reader
18
+ from stanza.models.constituency.parse_tree import Tree
19
+ from stanza.protobuf import TsurgeonRequest, TsurgeonResponse
20
+ from stanza.server.java_protobuf_requests import send_request, build_tree, from_tree, JavaProtobufContext
21
+
22
+ TSURGEON_JAVA = "edu.stanford.nlp.trees.tregex.tsurgeon.ProcessTsurgeonRequest"
23
+
24
+ def send_tsurgeon_request(request):
25
+ return send_request(request, TsurgeonResponse, TSURGEON_JAVA)
26
+
27
+
28
+ def build_request(trees, operations):
29
+ """
30
+ Build the TsurgeonRequest object
31
+
32
+ trees: a list of trees
33
+ operations: a list of (tregex, tsurgeon, tsurgeon, ...)
34
+ """
35
+ if isinstance(trees, Tree):
36
+ trees = (trees,)
37
+
38
+ request = TsurgeonRequest()
39
+ for tree in trees:
40
+ request.trees.append(build_tree(tree, 0.0))
41
+ if all(isinstance(x, str) for x in operations):
42
+ operations = (operations,)
43
+ for operation in operations:
44
+ if len(operation) == 1:
45
+ raise ValueError("Expected [tregex, tsurgeon, ...] but just got a tregex")
46
+ operation_request = request.operations.add()
47
+ operation_request.tregex = operation[0]
48
+ for tsurgeon in operation[1:]:
49
+ operation_request.tsurgeon.append(tsurgeon)
50
+ return request
51
+
52
+
53
+ def process_trees(trees, *operations):
54
+ """
55
+ Returns the result of processing the given tsurgeon operations on the given trees
56
+
57
+ Returns a list of modified trees, i.e., the result is already processed
58
+ """
59
+ request = build_request(trees, operations)
60
+ result = send_tsurgeon_request(request)
61
+
62
+ return [from_tree(t)[0] for t in result.trees]
63
+
64
+
65
+ class Tsurgeon(JavaProtobufContext):
66
+ """
67
+ Tsurgeon context window
68
+
69
+ This is a context window which keeps a process open. Should allow
70
+ for multiple requests without launching new java processes each time.
71
+ """
72
+ def __init__(self, classpath=None):
73
+ super(Tsurgeon, self).__init__(classpath, TsurgeonResponse, TSURGEON_JAVA)
74
+
75
+ def process(self, trees, *operations):
76
+ request = build_request(trees, operations)
77
+ result = self.process_request(request)
78
+ return [from_tree(t)[0] for t in result.trees]
79
+
80
+
81
+ def main():
82
+ """
83
+ A small demonstration of a tsurgeon operation
84
+ """
85
+ text="( (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
86
+ trees = tree_reader.read_trees(text)
87
+
88
+ tregex = "WP=wp"
89
+ tsurgeon = "relabel wp WWWPPP"
90
+
91
+ result = process_trees(trees, (tregex, tsurgeon))
92
+ print(result)
93
+
94
+ if __name__ == '__main__':
95
+ main()
stanza/stanza/server/ud_enhancer.py ADDED
@@ -0,0 +1,81 @@
1
+
2
+
3
+ import stanza
4
+ from stanza.protobuf import DependencyEnhancerRequest, Document, Language
5
+ from stanza.server.java_protobuf_requests import send_request, add_sentence, JavaProtobufContext
6
+
7
+ ENHANCER_JAVA = "edu.stanford.nlp.trees.ud.ProcessUniversalEnhancerRequest"
8
+
9
+ def build_enhancer_request(doc, language, pronouns_pattern):
10
+ if bool(language) == bool(pronouns_pattern):
11
+ raise ValueError("Should set exactly one of language and pronouns_pattern")
12
+
13
+ request = DependencyEnhancerRequest()
14
+ if pronouns_pattern:
15
+ request.setRelativePronouns(pronouns_pattern)
16
+ elif language.lower() in ("en", "english"):
17
+ request.language = Language.UniversalEnglish
18
+ elif language.lower() in ("zh", "zh-hans", "chinese"):
19
+ request.language = Language.UniversalChinese
20
+ else:
21
+ raise ValueError("Sorry, but language " + language + " is not supported yet. Either set a pronouns pattern or file an issue at https://stanfordnlp.github.io/stanza suggesting a mechanism for converting this language")
22
+
23
+ request_doc = request.document
24
+ request_doc.text = doc.text
25
+ num_tokens = 0
26
+ for sent_idx, sentence in enumerate(doc.sentences):
27
+ request_sentence = add_sentence(request_doc.sentence, sentence, num_tokens)
28
+ num_tokens = num_tokens + sum(len(token.words) for token in sentence.tokens)
29
+
30
+ graph = request_sentence.basicDependencies
31
+ nodes = []
32
+ word_index = 0
33
+ for token in sentence.tokens:
34
+ for word in token.words:
35
+ # TODO: refactor with the bit in java_protobuf_requests
36
+ word_index = word_index + 1
37
+ node = graph.node.add()
38
+ node.sentenceIndex = sent_idx
39
+ node.index = word_index
40
+
41
+ if word.head != 0:
42
+ edge = graph.edge.add()
43
+ edge.source = word.head
44
+ edge.target = word_index
45
+ edge.dep = word.deprel
46
+
47
+ return request
48
+
49
+ def process_doc(doc, language=None, pronouns_pattern=None):
50
+ request = build_enhancer_request(doc, language, pronouns_pattern)
51
+ return send_request(request, Document, ENHANCER_JAVA)
52
+
53
+ class UniversalEnhancer(JavaProtobufContext):
54
+ """
55
+ UniversalEnhancer context window
56
+
57
+ This is a context window which keeps a process open. Should allow
58
+ for multiple requests without launching new java processes each time.
59
+ """
60
+ def __init__(self, language=None, pronouns_pattern=None, classpath=None):
61
+ super(UniversalEnhancer, self).__init__(classpath, Document, ENHANCER_JAVA)
62
+ if bool(language) == bool(pronouns_pattern):
63
+ raise ValueError("Should set exactly one of language and pronouns_pattern")
64
+ self.language = language
65
+ self.pronouns_pattern = pronouns_pattern
66
+
67
+ def process(self, doc):
68
+ request = build_enhancer_request(doc, self.language, self.pronouns_pattern)
69
+ return self.process_request(request)
70
+
71
+ def main():
72
+ nlp = stanza.Pipeline('en',
73
+ processors='tokenize,pos,lemma,depparse')
74
+
75
+ with UniversalEnhancer(language="en") as enhancer:
76
+ doc = nlp("This is the car that I bought")
77
+ result = enhancer.process(doc)
78
+ print(result.sentence[0].enhancedDependencies)
79
+
80
+ if __name__ == '__main__':
81
+ main()
stanza/stanza/tests/pytest.ini ADDED
@@ -0,0 +1,5 @@
1
+ [pytest]
2
+ markers =
3
+ travis: all tests that will be run in travis CI
4
+ client: all tests that are related to the CoreNLP client interface
5
+ pipeline: all tests that are related to the Stanza neural pipeline
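The markers declared above are applied with the standard pytest decorator; for example (illustrative, not a test in the repository):

```python
import pytest

@pytest.mark.pipeline
def test_pipeline_smoke():
    # selected with `pytest -m pipeline`, excluded with `pytest -m "not pipeline"`
    assert True
```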
stanza/stanza/tests/setup.py ADDED
@@ -0,0 +1,58 @@
1
+ import glob
2
+ import logging
3
+ import os
4
+ import shutil
5
+ import stanza
6
+ from stanza.resources import installation
7
+ from stanza.tests import TEST_HOME_VAR, TEST_DIR_BASE_NAME
8
+
9
+ logger = logging.getLogger('stanza')
10
+
11
+ test_dir = os.getenv(TEST_HOME_VAR, None)
12
+ if not test_dir:
13
+ test_dir = os.path.join(os.getcwd(), TEST_DIR_BASE_NAME)
14
+ logger.info("STANZA_TEST_HOME not set. Will assume $PWD/stanza_test = %s", test_dir)
15
+ logger.info("To use a different directory, export or set STANZA_TEST_HOME=...")
16
+
17
+ in_dir = os.path.join(test_dir, "in")
18
+ out_dir = os.path.join(test_dir, "out")
19
+ scripts_dir = os.path.join(test_dir, "scripts")
20
+ models_dir=os.path.join(test_dir, "models")
21
+ corenlp_dir=os.path.join(test_dir, "corenlp_dir")
22
+
23
+ os.makedirs(test_dir, exist_ok=True)
24
+ os.makedirs(in_dir, exist_ok=True)
25
+ os.makedirs(out_dir, exist_ok=True)
26
+ os.makedirs(scripts_dir, exist_ok=True)
27
+ os.makedirs(models_dir, exist_ok=True)
28
+ os.makedirs(corenlp_dir, exist_ok=True)
29
+
30
+ logger.info("COPYING FILES")
31
+
32
+ shutil.copy("stanza/tests/data/external_server.properties", scripts_dir)
33
+ shutil.copy("stanza/tests/data/example_french.json", out_dir)
34
+ shutil.copy("stanza/tests/data/aws_annotations.zip", in_dir)
35
+ for emb_file in glob.glob("stanza/tests/data/tiny_emb.*"):
36
+ shutil.copy(emb_file, in_dir)
37
+
38
+ logger.info("DOWNLOADING MODELS")
39
+
40
+ stanza.download(lang='en', model_dir=models_dir, logging_level='info')
41
+ stanza.download(lang="en", model_dir=models_dir, package=None, processors={"ner":"ncbi_disease"})
42
+ stanza.download(lang='fr', model_dir=models_dir, logging_level='info')
43
+ # Latin ITTB has no case information for the lemmatizer
44
+ stanza.download(lang='la', model_dir=models_dir, package='ittb', logging_level='info')
45
+ stanza.download(lang='zh', model_dir=models_dir, logging_level='info')
46
+ # useful not just for verifying RtL, but because the default Arabic has a unique style of xpos tags
47
+ stanza.download(lang='ar', model_dir=models_dir, logging_level='info')
48
+ stanza.download(lang='multilingual', model_dir=models_dir, logging_level='info')
49
+
50
+ logger.info("DOWNLOADING CORENLP")
51
+
52
+ installation.install_corenlp(dir=corenlp_dir)
53
+ installation.download_corenlp_models(model="french", version="main", dir=corenlp_dir)
54
+ installation.download_corenlp_models(model="german", version="main", dir=corenlp_dir)
55
+ installation.download_corenlp_models(model="italian", version="main", dir=corenlp_dir)
56
+ installation.download_corenlp_models(model="spanish", version="main", dir=corenlp_dir)
57
+
58
+ logger.info("Test setup completed.")
stanza/stanza/utils/__init__.py ADDED
File without changes
stanza/stanza/utils/confusion.py ADDED
@@ -0,0 +1,216 @@
1
+
2
+ from collections import defaultdict, namedtuple
3
+
4
+ F1Result = namedtuple("F1Result", ['precision', 'recall', 'f1'])
5
+
6
+ def condense_ner_labels(confusion, gold_labels, pred_labels):
7
+ new_confusion = defaultdict(lambda: defaultdict(int))
8
+ new_gold_labels = []
9
+ new_pred_labels = []
10
+ for l1 in gold_labels:
11
+ if l1.find("-") >= 0:
12
+ new_l1 = l1.split("-", 1)[1]
13
+ else:
14
+ new_l1 = l1
15
+ if new_l1 not in new_gold_labels:
16
+ new_gold_labels.append(new_l1)
17
+ for l2 in pred_labels:
18
+ if l2.find("-") >= 0:
19
+ new_l2 = l2.split("-", 1)[1]
20
+ else:
21
+ new_l2 = l2
22
+ if new_l2 not in new_pred_labels:
23
+ new_pred_labels.append(new_l2)
24
+
25
+ old_value = confusion.get(l1, {}).get(l2, 0)
26
+ new_confusion[new_l1][new_l2] = new_confusion[new_l1][new_l2] + old_value
27
+ return new_confusion, new_gold_labels, new_pred_labels
28
+
29
+
30
+ def format_confusion(confusion, labels=None, hide_zeroes=False, hide_blank=False, transpose=False):
31
+ """
32
+ pretty print for confusion matrices
33
+ adapted from https://gist.github.com/zachguo/10296432
34
+
35
+ The matrix should look like this:
36
+ confusion[gold][pred]
37
+ """
38
+ def sort_labels(labels):
39
+ """
40
+ Sorts the labels in the list, respecting BIES if all labels are BIES, putting O at the front
41
+ """
42
+ labels = set(labels)
43
+ if 'O' in labels:
44
+ had_O = True
45
+ labels.remove('O')
46
+ else:
47
+ had_O = False
48
+
49
+ if not all(isinstance(x, str) and len(x) > 2 and x[0] in ('B', 'I', 'E', 'S') and x[1] in ('-', '_') for x in labels):
50
+ labels = sorted(labels)
51
+ else:
52
+ # sort first by the body of the label, then by BIES
53
+ labels = sorted(labels, key=lambda x: (x[2:], x[0]))
54
+ if had_O:
55
+ labels = ['O'] + labels
56
+ return labels
57
+
58
+ if transpose:
59
+ new_confusion = defaultdict(lambda: defaultdict(int))
60
+ for label1 in confusion.keys():
61
+ for label2 in confusion[label1].keys():
62
+ new_confusion[label2][label1] = confusion[label1][label2]
63
+ confusion = new_confusion
64
+
65
+ if labels is None:
66
+ gold_labels = set(confusion.keys())
67
+ if hide_blank:
68
+ gold_labels = set(x for x in gold_labels if any(confusion[x][key] != 0 for key in confusion[x].keys()))
69
+
70
+ pred_labels = set()
71
+ for key in confusion.keys():
72
+ if hide_blank:
73
+ new_pred_labels = set(x for x in confusion[key].keys() if confusion[key][x] != 0)
74
+ else:
75
+ new_pred_labels = confusion[key].keys()
76
+ pred_labels = pred_labels.union(new_pred_labels)
77
+
78
+ if not hide_blank:
79
+ gold_labels = gold_labels.union(pred_labels)
80
+ pred_labels = gold_labels
81
+
82
+ gold_labels = sort_labels(gold_labels)
83
+ pred_labels = sort_labels(pred_labels)
84
+ else:
85
+ gold_labels = labels
86
+ pred_labels = labels
87
+
88
+ columnwidth = max([len(str(x)) for x in pred_labels] + [5]) # 5 is value length
89
+ empty_cell = " " * columnwidth
90
+
91
+ # If the numbers are all ints, no need to include the .0 at the end of each entry
92
+ all_ints = True
93
+ for i, label1 in enumerate(gold_labels):
94
+ for j, label2 in enumerate(pred_labels):
95
+ if not isinstance(confusion.get(label1, {}).get(label2, 0), int):
96
+ all_ints = False
97
+ break
98
+ if not all_ints:
99
+ break
100
+
101
+ if all_ints:
102
+ format_cell = lambda confusion_cell: "%{0}d".format(columnwidth) % confusion_cell
103
+ else:
104
+ format_cell = lambda confusion_cell: "%{0}.1f".format(columnwidth) % confusion_cell
105
+
106
+ # make sure the columnwidth can handle long numbers
107
+ for i, label1 in enumerate(gold_labels):
108
+ for j, label2 in enumerate(pred_labels):
109
+ cell = confusion.get(label1, {}).get(label2, 0)
110
+ columnwidth = max(columnwidth, len(format_cell(cell)))
111
+
112
+ # if this is an NER confusion matrix (well, if it has - in the labels)
113
+ # try to drop a bunch of labels to make the matrix easier to display
114
+ if columnwidth * len(pred_labels) > 150:
115
+ confusion, gold_labels, pred_labels = condense_ner_labels(confusion, gold_labels, pred_labels)
116
+
117
+ # Print header
118
+ if transpose:
119
+ corner_label = "p\\t"
120
+ else:
121
+ corner_label = "t\\p"
122
+ fst_empty_cell = (columnwidth-3)//2 * " " + corner_label + (columnwidth-3)//2 * " "
123
+ if len(fst_empty_cell) < len(empty_cell):
124
+ fst_empty_cell = " " * (len(empty_cell) - len(fst_empty_cell)) + fst_empty_cell
125
+ header = " " + fst_empty_cell + " "
126
+ for label in pred_labels:
127
+ header = header + "%{0}s ".format(columnwidth) % str(label)
128
+ text = [header.rstrip()]
129
+
130
+ # Print rows
131
+ for i, label1 in enumerate(gold_labels):
132
+ row = " %{0}s ".format(columnwidth) % str(label1)
133
+ for j, label2 in enumerate(pred_labels):
134
+ confusion_cell = confusion.get(label1, {}).get(label2, 0)
135
+ cell = format_cell(confusion_cell)
136
+ if hide_zeroes:
137
+ cell = cell if confusion_cell else empty_cell
138
+ row = row + cell + " "
139
+ text.append(row.rstrip())
140
+ return "\n".join(text)
141
+
142
+
143
+ def confusion_to_accuracy(confusion_matrix):
144
+ """
145
+ Given a confusion dictionary, return correct, total
146
+ """
147
+ correct = 0
148
+ total = 0
149
+ for l1 in confusion_matrix.keys():
150
+ for l2 in confusion_matrix[l1].keys():
151
+ if l1 == l2:
152
+ correct = correct + confusion_matrix[l1][l2]
153
+ else:
154
+ total = total + confusion_matrix[l1][l2]
155
+ return correct, (correct + total)
156
+
157
+ def confusion_to_f1(confusion_matrix):
158
+ results = {}
159
+
160
+ keys = set()
161
+ for k in confusion_matrix.keys():
162
+ keys.add(k)
163
+ for k2 in confusion_matrix.get(k).keys():
164
+ keys.add(k2)
165
+
166
+ sum_f1 = 0
167
+ for k in keys:
168
+ tp = 0
169
+ fn = 0
170
+ fp = 0
171
+ for k2 in keys:
172
+ if k == k2:
173
+ tp = confusion_matrix.get(k, {}).get(k, 0)
174
+ else:
175
+ fn = fn + confusion_matrix.get(k, {}).get(k2, 0)
176
+ fp = fp + confusion_matrix.get(k2, {}).get(k, 0)
177
+ if tp + fp == 0:
178
+ precision = 0.0
179
+ else:
180
+ precision = tp / (tp + fp)
181
+ if tp + fn == 0:
182
+ recall = 0.0
183
+ else:
184
+ recall = tp / (tp + fn)
185
+ if precision + recall == 0.0:
186
+ f1 = 0.0
187
+ else:
188
+ f1 = 2 * (precision * recall) / (precision + recall)
189
+
190
+ results[k] = F1Result(precision, recall, f1)
191
+
192
+ return results
193
+
194
+ def confusion_to_macro_f1(confusion_matrix):
195
+ """
196
+ Return the macro f1 for a confusion matrix.
197
+ """
198
+ sum_f1 = 0.0
199
+ results = confusion_to_f1(confusion_matrix)
200
+ for k in results.keys():
201
+ sum_f1 = sum_f1 + results[k].f1
202
+
203
+ return sum_f1 / len(results)
204
+
205
+ def confusion_to_weighted_f1(confusion_matrix, exclude=None):
206
+ results = confusion_to_f1(confusion_matrix)
207
+
208
+ sum_f1 = 0.0
209
+ total_items = 0
210
+ for k in results.keys():
211
+ if exclude is not None and k in exclude:
212
+ continue
213
+ k_items = sum(confusion_matrix.get(k, {}).values())
214
+ total_items += k_items
215
+ sum_f1 += results[k].f1 * k_items
216
+ return sum_f1 / total_items
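As the format_confusion docstring notes, the expected structure is confusion[gold][pred]; an illustrative (non-repository) round trip through these helpers, with the stanza.utils.confusion module path assumed from the file location above:

```python
# Sketch only: build a small NER-style confusion matrix and summarize it.
from collections import defaultdict
from stanza.utils.confusion import (format_confusion, confusion_to_accuracy,
                                    confusion_to_macro_f1)

confusion = defaultdict(lambda: defaultdict(int))
confusion["PER"]["PER"] += 8   # 8 PER tokens predicted correctly
confusion["PER"]["ORG"] += 2   # 2 PER tokens mislabeled as ORG
confusion["ORG"]["ORG"] += 5

print(format_confusion(confusion))
correct, total = confusion_to_accuracy(confusion)
print(correct / total)                   # 13 / 15
print(confusion_to_macro_f1(confusion))  # average of per-label F1 scores
```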
stanza/stanza/utils/conll.py ADDED
@@ -0,0 +1,205 @@
1
+ """
2
+ Utility functions for the loading and conversion of CoNLL-format files.
3
+ """
4
+ import os
5
+ import io
6
+ from zipfile import ZipFile
7
+
8
+ from stanza.models.common.doc import Document
9
+ from stanza.models.common.doc import ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC, NER, START_CHAR, END_CHAR
10
+ from stanza.models.common.doc import FIELD_TO_IDX, FIELD_NUM
11
+
12
+ class CoNLLError(ValueError):
13
+ pass
14
+
15
+ class CoNLL:
16
+
17
+ @staticmethod
18
+ def load_conll(f, ignore_gapping=True):
19
+ """ Load the file or string into the CoNLL-U format data.
20
+ Input: file or string reader, where the data is in CoNLL-U format.
21
+ Output: a tuple whose first element is a list of list of list for each token in each sentence in the data,
22
+ where the innermost list represents all fields of a token; and whose second element is a list of lists for each
23
+ comment in each sentence in the data.
24
+ """
25
+ # f is open() or io.StringIO()
26
+ doc, sent = [], []
27
+ doc_comments, sent_comments = [], []
28
+ for line_idx, line in enumerate(f):
29
+ # leave whitespace such as NBSP, in case it is meaningful in the conll-u doc
30
+ line = line.lstrip().rstrip(' \n\r\t')
31
+ if len(line) == 0:
32
+ if len(sent) > 0:
33
+ doc.append(sent)
34
+ sent = []
35
+ doc_comments.append(sent_comments)
36
+ sent_comments = []
37
+ else:
38
+ if line.startswith('#'): # read comment line
39
+ sent_comments.append(line)
40
+ continue
41
+ array = line.split('\t')
42
+ if ignore_gapping and '.' in array[0]:
43
+ continue
44
+ if len(array) != FIELD_NUM:
45
+ raise CoNLLError(f"Cannot parse CoNLL line {line_idx+1}: expecting {FIELD_NUM} fields, {len(array)} found at line {line_idx}\n {array}")
46
+ sent += [array]
47
+ if len(sent) > 0:
48
+ doc.append(sent)
49
+ doc_comments.append(sent_comments)
50
+ return doc, doc_comments
51
+
52
+ @staticmethod
53
+ def convert_conll(doc_conll):
54
+ """ Convert the CoNLL-U format input data to a dictionary format output data.
55
+ Input: list of token fields loaded from the CoNLL-U format data, where the outermost list represents a list of sentences, and the inner list represents all fields of a token.
56
+ Output: a list of list of dictionaries for each token in each sentence in the document.
57
+ """
58
+ doc_dict = []
59
+ doc_empty = []
60
+ for sent_idx, sent_conll in enumerate(doc_conll):
61
+ sent_dict = []
62
+ sent_empty = []
63
+ for token_idx, token_conll in enumerate(sent_conll):
64
+ try:
65
+ token_dict = CoNLL.convert_conll_token(token_conll)
66
+ except ValueError as e:
67
+ raise CoNLLError("Could not process sentence %d token %d: %s" % (sent_idx, token_idx, str(e))) from e
68
+ if '.' in token_dict[ID]:
69
+ token_dict[ID] = tuple(int(x) for x in token_dict[ID].split(".", maxsplit=1))
70
+ sent_empty.append(token_dict)
71
+ else:
72
+ token_dict[ID] = tuple(int(x) for x in token_dict[ID].split("-", maxsplit=1))
73
+ sent_dict.append(token_dict)
74
+ doc_dict.append(sent_dict)
75
+ doc_empty.append(sent_empty)
76
+ return doc_dict, doc_empty
77
+
78
+ @staticmethod
79
+ def convert_dict(doc_dict):
80
+ """ Convert the dictionary format input data to the CoNLL-U format output data.
81
+
82
+ This is the reverse function of `convert_conll`, but does not include sentence level annotations or comments.
83
+
84
+ Can call this on a Document using `CoNLL.convert_dict(doc.to_dict())`
85
+
86
+ Input: dictionary format data, which is a list of list of dictionaries for each token in each sentence in the data.
87
+ Output: CoNLL-U format data as a list of list of list for each token in each sentence in the data.
88
+ """
89
+ doc = Document(doc_dict)
90
+ text = "{:c}".format(doc)
91
+ sentences = text.split("\n\n")
92
+ doc_conll = [[x.split("\t") for x in sentence.split("\n")] for sentence in sentences]
93
+ return doc_conll
94
+
95
+ @staticmethod
96
+ def convert_conll_token(token_conll):
97
+ """ Convert the CoNLL-U format input token to the dictionary format output token.
98
+ Input: a list of all CoNLL-U fields for the token.
99
+ Output: a dictionary that maps from field name to value.
100
+ """
101
+ token_dict = {}
102
+ for field in FIELD_TO_IDX:
103
+ value = token_conll[FIELD_TO_IDX[field]]
104
+ if value != '_':
105
+ if field == HEAD:
106
+ token_dict[field] = int(value)
107
+ else:
108
+ token_dict[field] = value
109
+ # special case if text is '_'
110
+ if token_conll[FIELD_TO_IDX[TEXT]] == '_':
111
+ token_dict[TEXT] = token_conll[FIELD_TO_IDX[TEXT]]
112
+ token_dict[LEMMA] = token_conll[FIELD_TO_IDX[LEMMA]]
113
+ return token_dict
114
+
115
+ @staticmethod
116
+ def conll2dict(input_file=None, input_str=None, ignore_gapping=True, zip_file=None):
117
+ """ Load the CoNLL-U format data from file or string into lists of dictionaries.
118
+ """
119
+ assert any([input_file, input_str]) and not all([input_file, input_str]), 'either use input file or input string'
120
+ if zip_file: assert input_file, 'must provide input_file if zip_file is set'
121
+
122
+ if input_str:
123
+ infile = io.StringIO(input_str)
124
+ doc_conll, doc_comments = CoNLL.load_conll(infile, ignore_gapping)
125
+ elif zip_file:
126
+ with ZipFile(zip_file) as zin:
127
+ with zin.open(input_file) as fin:
128
+ doc_conll, doc_comments = CoNLL.load_conll(io.TextIOWrapper(fin, encoding="utf-8"), ignore_gapping)
129
+ else:
130
+ with open(input_file, encoding='utf-8') as fin:
131
+ doc_conll, doc_comments = CoNLL.load_conll(fin, ignore_gapping)
132
+
133
+ doc_dict, doc_empty = CoNLL.convert_conll(doc_conll)
134
+ return doc_dict, doc_comments, doc_empty
135
+
136
+ @staticmethod
137
+ def conll2doc(input_file=None, input_str=None, ignore_gapping=True, zip_file=None):
138
+ doc_dict, doc_comments, doc_empty = CoNLL.conll2dict(input_file, input_str, ignore_gapping, zip_file=zip_file)
139
+ return Document(doc_dict, text=None, comments=doc_comments, empty_sentences=doc_empty)
140
+
141
+ @staticmethod
142
+ def conll2multi_docs(input_file=None, input_str=None, ignore_gapping=True, zip_file=None):
143
+ doc_dict, doc_comments, doc_empty = CoNLL.conll2dict(input_file, input_str, ignore_gapping, zip_file=zip_file)
144
+
145
+ docs = []
146
+ current_doc = []
147
+ current_comments = []
148
+ current_empty = []
149
+ current_doc_id = None
150
+ for doc, comments, empty in zip(doc_dict, doc_comments, doc_empty):
151
+ for comment in comments:
152
+ if comment.startswith("# doc_id =") or comment.startswith("# newdoc id ="):
153
+ doc_id = comment.split("=", maxsplit=1)[1]
154
+ if len(current_doc) == 0:
155
+ current_doc_id = doc_id
156
+ elif doc_id != current_doc_id:
157
+ new_doc = Document(current_doc, text=None, comments=current_comments, empty_sentences=current_empty)
158
+ if current_doc_id != None:
159
+ for i in new_doc.sentences:
160
+ i.doc_id = current_doc_id.strip()
161
+ docs.append(new_doc)
162
+ current_doc_id = doc_id
163
+ else:
164
+ continue
165
+ current_doc = [doc]
166
+ current_comments = [comments]
167
+ current_empty = [empty]
168
+ break
169
+ else: # no comments defined a new doc_id, so just add it to the current document
170
+ current_doc.append(doc)
171
+ current_comments.append(comments)
172
+ current_empty.append(empty)
173
+ if len(current_doc) > 0:
174
+ new_doc = Document(current_doc, text=None, comments=current_comments, empty_sentences=current_empty)
175
+ if current_doc_id != None:
176
+ for i in new_doc.sentences:
177
+ i.doc_id = current_doc_id.strip()
178
+ docs.append(new_doc)
179
+ current_doc_id = doc_id
180
+
181
+ return docs
182
+
183
+ @staticmethod
184
+ def dict2conll(doc_dict, filename):
185
+ """
186
+ Convert the dictionary format input data to the CoNLL-U format output data and write to a file.
187
+ """
188
+ doc = Document(doc_dict)
189
+ CoNLL.write_doc2conll(doc, filename)
190
+
191
+
192
+ @staticmethod
193
+ def write_doc2conll(doc, filename, mode='w', encoding='utf-8'):
194
+ """
195
+ Writes the doc as a conll file to the given file.
196
+
197
+ If passed a string, that filename will be opened. Otherwise, filename.write() will be called.
198
+
199
+ Note that the output needs an extra \n\n at the end to be a legal output file
200
+ """
201
+ if hasattr(filename, "write"):
202
+ filename.write("{:C}\n\n".format(doc))
203
+ else:
204
+ with open(filename, mode, encoding=encoding) as outfile:
205
+ outfile.write("{:C}\n\n".format(doc))
stanza/stanza/utils/conll18_ud_eval.py ADDED
@@ -0,0 +1,832 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # Code from CoNLL 2018 UD shared task updated for evaluation of enhanced
4
+ # dependencies in IWPT 2020 shared task.
5
+ # -- read DEPS, split on '|', compute overlap
6
+ # New metrics ELAS and EULAS.
7
+ # Gosse Bouma
8
+ # New option --enhancements can switch off evaluation of certain types of
9
+ # enhancements: default --enhancements 0 ... evaluate all enhancement types
10
+ # 1 ... no gapping; 2 ... no coord shared parents; 3 ... no coord shared dependents
11
+ # 4 ... no xsubj (control verbs); 5 ... no relative clauses; 6 ... no case info in deprels;
12
+ # combinations: 12 ... both 1 and 2 apply
13
+
14
+ # Compatible with Python 2.7 and 3.2+, can be used either as a module
15
+ # or a standalone executable.
16
+ #
17
+ # Copyright 2017, 2018 Institute of Formal and Applied Linguistics (UFAL),
18
+ # Faculty of Mathematics and Physics, Charles University, Czech Republic.
19
+ #
20
+ # This Source Code Form is subject to the terms of the Mozilla Public
21
+ # License, v. 2.0. If a copy of the MPL was not distributed with this
22
+ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
23
+ #
24
+ # Authors: Milan Straka, Martin Popel <surname@ufal.mff.cuni.cz>
25
+ #
26
+ # Changelog:
27
+ # - [12 Apr 2018] Version 0.9: Initial release.
28
+ # - [19 Apr 2018] Version 1.0: Fix bug in MLAS (duplicate entries in functional_children).
29
+ # Add --counts option.
30
+ # - [02 May 2018] Version 1.1: When removing spaces to match gold and system characters,
31
+ # consider all Unicode characters of category Zs instead of
32
+ # just ASCII space.
33
+ # - [25 Jun 2018] Version 1.2: Use python3 in the she-bang (instead of python).
34
+ # In Python2, make the whole computation use `unicode` strings.
35
+
36
+ # Command line usage
37
+ # ------------------
38
+ # eval.py [-v] [-c] gold_conllu_file system_conllu_file
39
+ #
40
+ # - if no -v is given, only the official IWPT 2020 Shared Task evaluation metrics
41
+ # are printed
42
+ # - if -v is given, more metrics are printed (as precision, recall, F1 score,
43
+ # and in case the metric is computed on aligned words also accuracy on these):
44
+ # - Tokens: how well do the gold tokens match system tokens
45
+ # - Sentences: how well do the gold sentences match system sentences
46
+ # - Words: how well can the gold words be aligned to system words
47
+ # - UPOS: using aligned words, how well does UPOS match
48
+ # - XPOS: using aligned words, how well does XPOS match
49
+ # - UFeats: using aligned words, how well does universal FEATS match
50
+ # - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
51
+ # - Lemmas: using aligned words, how well does LEMMA match
52
+ # - UAS: using aligned words, how well does HEAD match
53
+ # - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
54
+ # - CLAS: using aligned words with content DEPREL, how well does
55
+ # HEAD+DEPREL(ignoring subtypes) match
56
+ # - MLAS: using aligned words with content DEPREL, how well does
57
+ # HEAD+DEPREL(ignoring subtypes)+UPOS+UFEATS+FunctionalChildren(DEPREL+UPOS+UFEATS) match
58
+ # - BLEX: using aligned words with content DEPREL, how well does
59
+ # HEAD+DEPREL(ignoring subtypes)+LEMMAS match
60
+ # - if -c is given, raw counts of correct/gold_total/system_total/aligned words are printed
61
+ # instead of precision/recall/F1/AlignedAccuracy for all metrics.
62
+
63
+ # API usage
64
+ # ---------
65
+ # - load_conllu(file)
66
+ # - loads CoNLL-U file from given file object to an internal representation
67
+ # - the file object should return str in both Python 2 and Python 3
68
+ # - raises UDError exception if the given file cannot be loaded
69
+ # - evaluate(gold_ud, system_ud)
70
+ # - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
71
+ # - raises UDError if the concatenated tokens of gold and system file do not match
72
+ # - returns a dictionary with the metrics described above, each metric having
73
+ # three fields: precision, recall and f1
74
+
75
+ # Description of token matching
76
+ # -----------------------------
77
+ # In order to match tokens of gold file and system file, we consider the text
78
+ # resulting from concatenation of gold tokens and text resulting from
79
+ # concatenation of system tokens. These texts should match -- if they do not,
80
+ # the evaluation fails.
81
+ #
82
+ # If the texts do match, every token is represented as a range in this original
83
+ # text, and tokens are equal only if their range is the same.
84
+
85
+ # Description of word matching
86
+ # ----------------------------
87
+ # When matching words of gold file and system file, we first match the tokens.
88
+ # The words which are also tokens are matched as tokens, but words in multi-word
89
+ # tokens have to be handled differently.
90
+ #
91
+ # To handle multi-word tokens, we start by finding "multi-word spans".
92
+ # Multi-word span is a span in the original text such that
93
+ # - it contains at least one multi-word token
94
+ # - all multi-word tokens in the span (considering both gold and system ones)
95
+ # are completely inside the span (i.e., they do not "stick out")
96
+ # - the multi-word span is as small as possible
97
+ #
98
+ # For every multi-word span, we align the gold and system words completely
99
+ # inside this span using LCS on their FORMs. The words not intersecting
100
+ # (even partially) any multi-word span are then aligned as tokens.
101
+
102
+
103
+ from __future__ import division
104
+ from __future__ import print_function
105
+
106
+ import argparse
107
+ import io
108
+ import sys
109
+ import unicodedata
110
+ import unittest
111
+
112
+ # CoNLL-U column names
113
+ ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
114
+
115
+ # Content and functional relations
116
+ CONTENT_DEPRELS = {
117
+ "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative",
118
+ "expl", "dislocated", "advcl", "advmod", "discourse", "nmod", "appos",
119
+ "nummod", "acl", "amod", "conj", "fixed", "flat", "compound", "list",
120
+ "parataxis", "orphan", "goeswith", "reparandum", "root", "dep"
121
+ }
122
+
123
+ FUNCTIONAL_DEPRELS = {
124
+ "aux", "cop", "mark", "det", "clf", "case", "cc"
125
+ }
126
+
127
+ UNIVERSAL_FEATURES = {
128
+ "PronType", "NumType", "Poss", "Reflex", "Foreign", "Abbr", "Gender",
129
+ "Animacy", "Number", "Case", "Definite", "Degree", "VerbForm", "Mood",
130
+ "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite"
131
+ }
132
+
133
+ # UD Error is used when raising exceptions in this module
134
+ class UDError(Exception):
135
+ pass
136
+
137
+ # Conversion methods handling `str` <-> `unicode` conversions in Python2
138
+ def _decode(text):
139
+ return text if sys.version_info[0] >= 3 or not isinstance(text, str) else text.decode("utf-8")
140
+
141
+ def _encode(text):
142
+ return text if sys.version_info[0] >= 3 or not isinstance(text, unicode) else text.encode("utf-8")
143
+
144
+ CASE_DEPRELS = {'obl','nmod','conj','advcl'}
145
+ UNIVERSAL_DEPREL_EXTENSIONS = {'pass','relcl','xsubj'}
146
+
147
+ # Modify the set of deps produced by system to be in accordance with gold treebank type.
148
+ # Return a (filtered) list of (hd, dependency_path) tuples.
149
+ def process_enhanced_deps(deps) :
150
+ edeps = []
151
+ if deps != '' and deps != '_':
152
+ for edep in deps.split('|') :
153
+ (hd, path) = edep.split(':', 1)
154
+ steps = path.split('>') # collapsing empty nodes gives rise to paths like this : 3:conj:en>obl:voor
155
+ edeps.append((hd,steps)) # (3,['conj:en','obl:voor'])
156
+ return edeps
157
+
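For example, with the Dutch-flavoured labels from the comment above:

    process_enhanced_deps("3:conj:en>obl:voor|5:nsubj")
    # -> [('3', ['conj:en', 'obl:voor']), ('5', ['nsubj'])]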
158
+ # Load given CoNLL-U file into internal representation.
159
+ # The file parameter is the open file object.
160
+ # The path parameter is needed only for diagnostic messages.
161
+ def load_conllu(file, path, treebank_type):
162
+ # Internal representation classes
163
+ class UDRepresentation:
164
+ def __init__(self):
165
+ # Characters of all the tokens in the whole file.
166
+ # Whitespace between tokens is not included.
167
+ self.characters = []
168
+ # List of UDSpan instances with start&end indices into `characters`.
169
+ self.tokens = []
170
+ # List of UDWord instances.
171
+ self.words = []
172
+ # List of UDSpan instances with start&end indices into `characters`.
173
+ self.sentences = []
174
+ # File path may be needed in error messages.
175
+ self.path = ''
176
+ class UDSpan:
177
+ def __init__(self, start, end, line):
178
+ self.start = start
179
+ # Note that self.end marks the first position **after the end** of span,
180
+ # so we can use characters[start:end] or range(start, end).
181
+ self.end = end
182
+ # Line number (1-based) will be useful if we need to report an error later.
183
+ self.line = line
184
+ class UDWord:
185
+ def __init__(self, span, columns, is_multiword):
186
+ # Span of this word (or MWT, see below) within ud_representation.characters.
187
+ self.span = span
188
+ # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
189
+ self.columns = columns
190
+ # is_multiword==True means that this word is part of a multi-word token.
191
+ # In that case, self.span marks the span of the whole multi-word token.
192
+ self.is_multiword = is_multiword
193
+ # Reference to the UDWord instance representing the HEAD (or None if root).
194
+ self.parent = None
195
+ # List of references to UDWord instances representing functional-deprel children.
196
+ self.functional_children = []
197
+ # Only consider universal FEATS.
198
+ self.columns[FEATS] = "|".join(sorted(feat for feat in columns[FEATS].split("|")
199
+ if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
200
+ # Let's ignore language-specific deprel subtypes.
201
+ self.columns[DEPREL] = columns[DEPREL].split(":")[0]
202
+ # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS
203
+ self.is_content_deprel = self.columns[DEPREL] in CONTENT_DEPRELS
204
+ self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS
205
+ # store enhanced deps --GB
206
+ # split string positions and enhanced labels as well?
207
+ self.columns[DEPS] = process_enhanced_deps(columns[DEPS])
208
+
209
+ ud = UDRepresentation()
210
+
211
+ # Load the CoNLL-U file
212
+ ud.path = path
213
+ index, sentence_start = 0, None
214
+ line_idx = 0
215
+ while True:
216
+ line = file.readline()
217
+ line_idx += 1 # errors will be displayed indexed from 1
218
+ if not line:
219
+ break
220
+ line = _decode(line.rstrip("\r\n"))
221
+
222
+ # Handle sentence start boundaries
223
+ if sentence_start is None:
224
+ # Skip comments
225
+ if line.startswith("#"):
226
+ continue
227
+ # Start a new sentence
228
+ ud.sentences.append(UDSpan(index, 0, line_idx))
229
+ sentence_start = len(ud.words)
230
+ if not line:
231
+ # Add parent and children UDWord links and check there are no cycles
232
+ def process_word(word):
233
+ if word.parent == "remapping":
234
+ raise UDError("There is a cycle in the sentence that ends at line %d" % line_idx)
235
+ if word.parent is None:
236
+ head = int(word.columns[HEAD])
237
+ if head < 0 or head > len(ud.words) - sentence_start:
238
+ raise UDError("HEAD '{}' points outside of the sentence that ends at line {}".format(_encode(word.columns[HEAD]), line_idx))
239
+ if head:
240
+ parent = ud.words[sentence_start + head - 1]
241
+ word.parent = "remapping"
242
+ process_word(parent)
243
+ word.parent = parent
244
+
245
+ position = sentence_start # need to incrementally keep track of current position for loop detection in relcl
246
+ for word in ud.words[sentence_start:]:
247
+ process_word(word)
248
+ enhanced_deps = word.columns[DEPS]
249
+ # replace head positions of enhanced dependencies with parent word object -- GB
250
+ processed_deps = []
251
+ for (head,steps) in word.columns[DEPS] : # (3,['conj:en','obl:voor'])
252
+ # Empty nodes should have been collapsed during preprocessing.
253
+ # If not, we cannot evaluate gapping correctly. However, people
254
+ # may care just about basic trees and may not want to bother
255
+ # with preprocessing.
256
+ if '.' in head:
257
+ if treebank_type.get('no_empty_nodes', False):
258
+ raise UDError("The collapsed CoNLL-U file still contains references to empty nodes at line {}: {}".format(line_idx, _encode(line)))
259
+ else:
260
+ continue
261
+ hd = int(head)
262
+ parent = ud.words[sentence_start + hd -1] if hd else hd # just assign '0' to parent for root cases
263
+ processed_deps.append((parent,steps))
264
+ enhanced_deps = processed_deps
265
+
266
+ # ignore rel>rel dependencies, and instead append the original hd/rel edge
267
+ # note that this also ignores other extensions (like adding lemmas)
268
+ # note that this sometimes introduces duplicates (if orig hd/rel was already included in DEPS)
269
+ if treebank_type.get('no_gapping', False) : # enhancement 1
270
+ processed_deps = []
271
+ for (parent,steps) in enhanced_deps :
272
+ if len(steps) > 1 :
273
+ processed_deps.append((word.parent,[word.columns[DEPREL]]))
274
+ else :
275
+ if (parent,steps) in processed_deps :
276
+ pass
277
+ else :
278
+ processed_deps.append((parent,steps))
279
+ enhanced_deps = processed_deps
280
+
281
+ # for a given conj node, any rel other than conj in DEPS can be ignored
282
+ if treebank_type.get('no_shared_parents_in_coordination', False) : # enhancement 2
283
+ for (hd,steps) in enhanced_deps :
284
+ if len(steps) == 1 and steps[0].startswith('conj') :
285
+ enhanced_deps = [(hd,steps)]
286
+
287
+ # deprels not matching ud_hd/ud_dep are spurious.
288
+ # czech/pud estonian/ewt syntagrus finnish/pud
289
+ # TO DO: treebanks that do not mark xcomp and relcl subjects
290
+ if treebank_type.get('no_shared_dependents_in_coordination', False) : # enhancement 3
291
+ processed_deps = []
292
+ for (hd,steps) in enhanced_deps :
293
+ duplicate = 0
294
+ for (hd2,steps2) in enhanced_deps :
295
+ if steps == steps2 and hd2 == word.columns[HEAD] and hd != hd2 : # checking only for ud_hd here, check for ud_dep as well?
296
+ duplicate = 1
297
+ if not(duplicate) :
298
+ processed_deps.append((hd,steps))
299
+ enhanced_deps = processed_deps
300
+
301
+ # if treebank does not have control relations: subjects of xcomp parents in system are to be skipped
302
+ # note that rel is actually a path (sometimes rel1>rel2); in theory rel2 could be a subj
303
+ # from lassy-small: 7:conj:en>nsubj:pass|7:conj:en>nsubj:xsubj (7,['conj:en','nsubj:xsubj'])
304
+ if treebank_type.get('no_control', False) : # enhancement 4
305
+ processed_deps = []
306
+ for (parent,steps) in enhanced_deps :
307
+ include = 1
308
+ if ( parent and parent.columns[DEPREL] == 'xcomp') :
309
+ for rel in steps:
310
+ if rel.startswith('nsubj') :
311
+ include = 0
312
+ if include :
313
+ processed_deps.append((parent,steps))
314
+ enhanced_deps = processed_deps
315
+
316
+ if treebank_type.get('no_external_arguments_of_relative_clauses', False) : # enhancement 5
317
+ processed_deps = []
318
+ for (parent,steps) in enhanced_deps :
319
+ if (steps[0] == 'ref') :
320
+ processed_deps.append((word.parent,[word.columns[DEPREL]])) # append the original relation
321
+ # ignore external argument link
322
+ # external args are deps of an acl:relcl where that acl also is a dependent of external arg (i.e. ext arg introduces a cycle)
323
+ elif ( parent and parent.columns[DEPREL].startswith('acl') and int(parent.columns[HEAD]) == position - sentence_start ) :
324
+ #print('removed external argument')
325
+ pass
326
+ else :
327
+ processed_deps.append((parent,steps))
328
+ enhanced_deps = processed_deps
329
+
330
+ # treebanks where no lemma info has been added
331
+ if treebank_type.get('no_case_info', False) : # enhancement number 6
332
+ processed_deps = []
333
+ for (hd,steps) in enhanced_deps :
334
+ processed_steps = []
335
+ for dep in steps :
336
+ depparts = dep.split(':')
337
+ if depparts[0] in CASE_DEPRELS :
338
+ if (len(depparts) == 2 and not(depparts[1] in UNIVERSAL_DEPREL_EXTENSIONS )) :
339
+ dep = depparts[0]
340
+ processed_steps.append(dep)
341
+ processed_deps.append((hd,processed_steps))
342
+ enhanced_deps = processed_deps
343
+
344
+ position += 1
345
+ word.columns[DEPS] = enhanced_deps
346
+
347
+ # func_children cannot be assigned within process_word
348
+ # because it is called recursively and may result in adding one child twice.
349
+ for word in ud.words[sentence_start:]:
350
+ if word.parent and word.is_functional_deprel:
351
+ word.parent.functional_children.append(word)
352
+
353
+ if len(ud.words) == sentence_start :
354
+ raise UDError("There is a sentence with 0 tokens (possibly a double blank line) at line %d" % line_idx)
355
+
356
+ # Check there is a single root node
357
+ if len([word for word in ud.words[sentence_start:] if word.parent is None]) == 0:
358
+ raise UDError("There are no roots in the sentence that ends at line %d" % line_idx)
359
+ if not treebank_type.get('multiple_roots_okay', False):
360
+ if len([word for word in ud.words[sentence_start:] if word.parent is None]) > 1:
361
+ raise UDError("There are multiple roots in the sentence that ends at line %d" % line_idx)
362
+
363
+ # End the sentence
364
+ ud.sentences[-1].end = index
365
+ sentence_start = None
366
+ continue
367
+
368
+ # Read next token/word
369
+ columns = line.split("\t")
370
+ if len(columns) != 10:
371
+ raise UDError("The CoNLL-U line does not contain 10 tab-separated columns at line {}: '{}'".format(line_idx, _encode(line)))
372
+
373
+ # Skip empty nodes
374
+ # If we are evaluating enhanced graphs, empty nodes should have been collapsed
375
+ # during preprocessing and should not occur here. However, we cannot raise
376
+ # an exception if they do because the user may be interested just in the
377
+ # basic tree and may not want to bother with preprocessing.
378
+ if "." in columns[ID]:
379
+ # When launching this script, we can specify that empty nodes should be considered errors.
380
+ if treebank_type.get('no_empty_nodes', False):
381
+ raise UDError("The collapsed CoNLL-U line still contains empty nodes at line {}: {}".format(line_idx, _encode(line)))
382
+ else:
383
+ continue
384
+
385
+ # Delete spaces from FORM, so gold.characters == system.characters
386
+ # even if one of them tokenizes the space. Use any Unicode character
387
+ # with category Zs.
388
+ columns[FORM] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[FORM]))
389
+ if not columns[FORM]:
390
+ raise UDError("There is an empty FORM in the CoNLL-U file at line %d" % line_idx)
391
+
392
+ # Save token
393
+ ud.characters.extend(columns[FORM])
394
+ ud.tokens.append(UDSpan(index, index + len(columns[FORM]), line_idx))
395
+ index += len(columns[FORM])
396
+
397
+ # Handle multi-word tokens to save word(s)
398
+ if "-" in columns[ID]:
399
+ try:
400
+ start, end = map(int, columns[ID].split("-"))
401
+ except:
402
+ raise UDError("Cannot parse multi-word token ID '{}' at line {}".format(_encode(columns[ID]), line_idx))
403
+
404
+ words_expected = end - start + 1
405
+ words_found = 0
406
+ while words_found < words_expected:
407
+ word_line = _decode(file.readline().rstrip("\r\n"))
408
+ line_idx += 1
409
+ word_columns = word_line.split("\t")
410
+ if len(word_columns) != 10:
411
+ raise UDError("The CoNLL-U line does not contain 10 tab-separated columns at line {}: '{}'".format(line_idx, _encode(word_line)))
412
+ if "." in word_columns[ID]:
413
+ if treebank_type.get('no_empty_nodes', False):
414
+ raise UDError("The collapsed CoNLL-U line still contains empty nodes at line {}: {}".format(line_idx, _encode(line)))
415
+ else:
416
+ continue
417
+ ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
418
+ words_found += 1
419
+
420
+ # Basic tokens/words
421
+ else:
422
+ try:
423
+ word_id = int(columns[ID])
424
+ except:
425
+ raise UDError("Cannot parse word ID '{}' at line {}".format(_encode(columns[ID]), line_idx))
426
+ if word_id != len(ud.words) - sentence_start + 1:
427
+ raise UDError("Incorrect word ID '{}' for word '{}', expected '{}' at line {}".format(
428
+ _encode(columns[ID]), _encode(columns[FORM]), len(ud.words) - sentence_start + 1, line_idx))
429
+
430
+ try:
431
+ head_id = int(columns[HEAD])
432
+ except ValueError as e:
433
+ raise UDError("Cannot parse HEAD '{}' at line {}".format(_encode(columns[HEAD]), line_idx)) from e
434
+ if head_id < 0:
435
+ raise UDError("HEAD cannot be negative at line %d" % line_idx)
436
+
437
+ ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))
438
+
439
+ if sentence_start is not None:
440
+ raise UDError("The CoNLL-U file does not end with empty line")
441
+
442
+ return ud
443
+
444
+ # Evaluate the gold and system treebanks (loaded using load_conllu).
445
+ def evaluate(gold_ud, system_ud):
446
+ class Score:
447
+ def __init__(self, gold_total, system_total, correct, aligned_total=None):
448
+ self.correct = correct
449
+ self.gold_total = gold_total
450
+ self.system_total = system_total
451
+ self.aligned_total = aligned_total
452
+ self.precision = correct / system_total if system_total else 0.0
453
+ self.recall = correct / gold_total if gold_total else 0.0
454
+ self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
455
+ self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
456
+ class AlignmentWord:
457
+ def __init__(self, gold_word, system_word):
458
+ self.gold_word = gold_word
459
+ self.system_word = system_word
460
+ class Alignment:
461
+ def __init__(self, gold_words, system_words):
462
+ self.gold_words = gold_words
463
+ self.system_words = system_words
464
+ self.matched_words = []
465
+ self.matched_words_map = {}
466
+ def append_aligned_words(self, gold_word, system_word):
467
+ self.matched_words.append(AlignmentWord(gold_word, system_word))
468
+ self.matched_words_map[system_word] = gold_word
469
+
470
+ def spans_score(gold_spans, system_spans):
471
+ correct, gi, si = 0, 0, 0
472
+ while gi < len(gold_spans) and si < len(system_spans):
473
+ if system_spans[si].start < gold_spans[gi].start:
474
+ si += 1
475
+ elif gold_spans[gi].start < system_spans[si].start:
476
+ gi += 1
477
+ else:
478
+ correct += gold_spans[gi].end == system_spans[si].end
479
+ si += 1
480
+ gi += 1
481
+
482
+ return Score(len(gold_spans), len(system_spans), correct)
483
+
484
+ def alignment_score(alignment, key_fn=None, filter_fn=None):
485
+ if filter_fn is not None:
486
+ gold = sum(1 for gold in alignment.gold_words if filter_fn(gold))
487
+ system = sum(1 for system in alignment.system_words if filter_fn(system))
488
+ aligned = sum(1 for word in alignment.matched_words if filter_fn(word.gold_word))
489
+ else:
490
+ gold = len(alignment.gold_words)
491
+ system = len(alignment.system_words)
492
+ aligned = len(alignment.matched_words)
493
+
494
+ if key_fn is None:
495
+ # Return score for whole aligned words
496
+ return Score(gold, system, aligned)
497
+
498
+ def gold_aligned_gold(word):
499
+ return word
500
+ def gold_aligned_system(word):
501
+ return alignment.matched_words_map.get(word, 'NotAligned') if word is not None else None
502
+ correct = 0
503
+ for words in alignment.matched_words:
504
+ if filter_fn is None or filter_fn(words.gold_word):
505
+ if key_fn(words.gold_word, gold_aligned_gold) == key_fn(words.system_word, gold_aligned_system):
506
+ correct += 1
507
+
508
+ return Score(gold, system, correct, aligned)
509
+
510
+ def enhanced_alignment_score(alignment, EULAS):
511
+ # count all matching enhanced deprels in gold and system -- GB
512
+ # gold and system = sum of gold and predicted deps
513
+ # parents are pointers to word objects; make sure to compare the system parent with the aligned gold word in cases where
514
+ # tokenization introduces mismatches in number of words per sentence.
515
+ gold = 0
516
+ for gold_word in alignment.gold_words :
517
+ gold += len(gold_word.columns[DEPS])
518
+ system = 0
519
+ for system_word in alignment.system_words :
520
+ system += len(system_word.columns[DEPS])
521
+ correct = 0
522
+ for words in alignment.matched_words:
523
+ gold_deps = words.gold_word.columns[DEPS]
524
+ system_deps = words.system_word.columns[DEPS]
525
+ for (parent, dep) in gold_deps :
526
+ eulas_dep = [d.split(':')[0] for d in dep]
527
+ for (sparent, sdep) in system_deps:
528
+ eulas_sdep = [d.split(':')[0] for d in sdep]
529
+ if dep == sdep or ( eulas_dep == eulas_sdep and EULAS ) :
530
+ if parent == alignment.matched_words_map.get(sparent, 'NotAligned') :
531
+ correct += 1
532
+ elif (parent == 0 and sparent == 0) : # cases where parent is root
533
+ correct += 1
534
+ return Score(gold, system, correct)
535
+
536
+ def beyond_end(words, i, multiword_span_end):
537
+ if i >= len(words):
538
+ return True
539
+ if words[i].is_multiword:
540
+ return words[i].span.start >= multiword_span_end
541
+ return words[i].span.end > multiword_span_end
542
+
543
+ def extend_end(word, multiword_span_end):
544
+ if word.is_multiword and word.span.end > multiword_span_end:
545
+ return word.span.end
546
+ return multiword_span_end
547
+
548
+ def find_multiword_span(gold_words, system_words, gi, si):
549
+ # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
550
+ # Find the start of the multiword span (gs, ss), so the multiword span is minimal.
551
+ # Initialize multiword_span_end characters index.
552
+ if gold_words[gi].is_multiword:
553
+ multiword_span_end = gold_words[gi].span.end
554
+ if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
555
+ si += 1
556
+ else: # if system_words[si].is_multiword
557
+ multiword_span_end = system_words[si].span.end
558
+ if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
559
+ gi += 1
560
+ gs, ss = gi, si
561
+
562
+ # Find the end of the multiword span
563
+ # (so both gi and si are pointing to the word following the multiword span end).
564
+ while not beyond_end(gold_words, gi, multiword_span_end) or \
565
+ not beyond_end(system_words, si, multiword_span_end):
566
+ if gi < len(gold_words) and (si >= len(system_words) or
567
+ gold_words[gi].span.start <= system_words[si].span.start):
568
+ multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
569
+ gi += 1
570
+ else:
571
+ multiword_span_end = extend_end(system_words[si], multiword_span_end)
572
+ si += 1
573
+ return gs, ss, gi, si
574
+
575
+ def compute_lcs(gold_words, system_words, gi, si, gs, ss):
576
+ lcs = [[0] * (si - ss) for i in range(gi - gs)]
577
+ for g in reversed(range(gi - gs)):
578
+ for s in reversed(range(si - ss)):
579
+ if gold_words[gs + g].columns[FORM].lower() == system_words[ss + s].columns[FORM].lower():
580
+ lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
581
+ lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
582
+ lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
583
+ return lcs
584
+
585
+ def align_words(gold_words, system_words):
586
+ alignment = Alignment(gold_words, system_words)
587
+
588
+ gi, si = 0, 0
589
+ while gi < len(gold_words) and si < len(system_words):
590
+ if gold_words[gi].is_multiword or system_words[si].is_multiword:
591
+ # A: Multi-word tokens => align via LCS within the whole "multiword span".
592
+ gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)
593
+
594
+ if si > ss and gi > gs:
595
+ lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)
596
+
597
+ # Store aligned words
598
+ s, g = 0, 0
599
+ while g < gi - gs and s < si - ss:
600
+ if gold_words[gs + g].columns[FORM].lower() == system_words[ss + s].columns[FORM].lower():
601
+ alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
602
+ g += 1
603
+ s += 1
604
+ elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
605
+ g += 1
606
+ else:
607
+ s += 1
608
+ else:
609
+ # B: No multi-word token => align according to spans.
610
+ if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
611
+ alignment.append_aligned_words(gold_words[gi], system_words[si])
612
+ gi += 1
613
+ si += 1
614
+ elif gold_words[gi].span.start <= system_words[si].span.start:
615
+ gi += 1
616
+ else:
617
+ si += 1
618
+
619
+ return alignment
620
+
621
+ # Check that the underlying character sequences match.
622
+ if gold_ud.characters != system_ud.characters:
623
+ # Identify the surrounding tokens and line numbers so the error is easier to debug.
624
+ index = 0
625
+ while index < len(gold_ud.characters) and index < len(system_ud.characters) and \
626
+ gold_ud.characters[index] == system_ud.characters[index]:
627
+ index += 1
628
+ gtindex = 0
629
+ while gtindex < len(gold_ud.tokens) and gold_ud.tokens[gtindex].end-1 < index:
630
+ gtindex += 1
631
+ stindex = 0
632
+ while stindex < len(system_ud.tokens) and system_ud.tokens[stindex].end-1 < index:
633
+ stindex += 1
634
+ gtokenreport = "The error occurs right at the beginning of the two files.\n"
635
+ stokenreport = ""
636
+ if gtindex > 0:
637
+ nprev = 10 if gtindex >= 10 else gtindex
638
+ nnext = 10 if gtindex + 10 <= len(gold_ud.tokens) else len(gold_ud.tokens) - gtindex
639
+ nfirst = gtindex - nprev
640
+ prevtokens = ' '.join([''.join(gold_ud.characters[t.start:t.end]) for t in gold_ud.tokens[nfirst:gtindex]])
641
+ nexttokens = ' '.join([''.join(gold_ud.characters[t.start:t.end]) for t in gold_ud.tokens[gtindex:gtindex + nnext]])
642
+ gtokenreport = "File '{}':\n".format(gold_ud.path)
643
+ gtokenreport += " Token no. {} on line no. {} is the last one with all characters reproduced in the other file.\n".format(gtindex, gold_ud.tokens[gtindex-1].line)
644
+ gtokenreport += " The previous {} tokens are '{}'.\n".format(nprev, prevtokens)
645
+ gtokenreport += " The next {} tokens are '{}'.\n".format(nnext, nexttokens)
646
+ if stindex > 0:
647
+ nprev = 10 if stindex >= 10 else stindex
648
+ nnext = 10 if stindex + 10 <= len(system_ud.tokens) else len(system_ud.tokens) - stindex
649
+ nfirst = stindex - nprev
650
+ prevtokens = ' '.join([''.join(system_ud.characters[t.start:t.end]) for t in system_ud.tokens[nfirst:stindex]])
651
+ nexttokens = ' '.join([''.join(system_ud.characters[t.start:t.end]) for t in system_ud.tokens[stindex:stindex + nnext]])
652
+ stokenreport = "File '{}':\n".format(system_ud.path)
653
+ stokenreport += " Token no. {} on line no. {} is the last one with all characters reproduced in the other file.\n".format(stindex, system_ud.tokens[stindex-1].line)
654
+ stokenreport += " The previous {} tokens are '{}'.\n".format(nprev, prevtokens)
655
+ stokenreport += " The next {} tokens are '{}'.\n".format(nnext, nexttokens)
656
+ raise UDError(
657
+ "The concatenation of tokens in gold file and in system file differ!\n" + gtokenreport + stokenreport +
658
+ "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
659
+ "".join(map(_encode, gold_ud.characters[index:index + 20])),
660
+ "".join(map(_encode, system_ud.characters[index:index + 20]))
661
+ )
662
+ )
663
+
664
+ # Align words
665
+ alignment = align_words(gold_ud.words, system_ud.words)
666
+
667
+ # Compute the F1-scores
668
+ return {
669
+ "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
670
+ "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
671
+ "Words": alignment_score(alignment),
672
+ "UPOS": alignment_score(alignment, lambda w, _: w.columns[UPOS]),
673
+ "XPOS": alignment_score(alignment, lambda w, _: w.columns[XPOS]),
674
+ "UFeats": alignment_score(alignment, lambda w, _: w.columns[FEATS]),
675
+ "AllTags": alignment_score(alignment, lambda w, _: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
676
+ "Lemmas": alignment_score(alignment, lambda w, ga: w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"),
677
+ "UAS": alignment_score(alignment, lambda w, ga: ga(w.parent)),
678
+ "LAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL])),
679
+ "ELAS": enhanced_alignment_score(alignment, 0),
680
+ "EULAS": enhanced_alignment_score(alignment, 1),
681
+ "CLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL]),
682
+ filter_fn=lambda w: w.is_content_deprel),
683
+ "MLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL], w.columns[UPOS], w.columns[FEATS],
684
+ [(ga(c), c.columns[DEPREL], c.columns[UPOS], c.columns[FEATS])
685
+ for c in w.functional_children]),
686
+ filter_fn=lambda w: w.is_content_deprel),
687
+ "BLEX": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL],
688
+ w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"),
689
+ filter_fn=lambda w: w.is_content_deprel),
690
+ }
691
+
692
+ def load_conllu_file(path, treebank_type=None):
693
+ if treebank_type is None:
694
+ treebank_type = {}
695
+ _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}))
696
+ return load_conllu(_file, path, treebank_type)
697
+
698
+ def evaluate_wrapper(args):
699
+ treebank_type = {}
700
+ enhancements = list(args.enhancements)
701
+ treebank_type['no_gapping'] = 1 if '1' in enhancements else 0
702
+ treebank_type['no_shared_parents_in_coordination'] = 1 if '2' in enhancements else 0
703
+ treebank_type['no_shared_dependents_in_coordination'] = 1 if '3' in enhancements else 0
704
+ treebank_type['no_control'] = 1 if '4' in enhancements else 0
705
+ treebank_type['no_external_arguments_of_relative_clauses'] = 1 if '5' in enhancements else 0
706
+ treebank_type['no_case_info'] = 1 if '6' in enhancements else 0
707
+ treebank_type['no_empty_nodes'] = args.no_empty_nodes
708
+ treebank_type['multiple_roots_okay'] = args.multiple_roots_okay
709
+
710
+ # Load CoNLL-U files
711
+ gold_ud = load_conllu_file(args.gold_file, treebank_type)
712
+ system_ud = load_conllu_file(args.system_file, treebank_type)
713
+ return evaluate(gold_ud, system_ud)
714
+
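A short sketch (paths hypothetical) of driving the same evaluation from Python instead of argparse, reusing the treebank_type keys set up in evaluate_wrapper above; here the gold enhanced graphs are assumed to lack gapping (enhancement 1) and case info (enhancement 6):

    treebank_type = {'no_gapping': 1, 'no_case_info': 1}
    gold_ud = load_conllu_file("gold.conllu", treebank_type)
    system_ud = load_conllu_file("system.conllu", treebank_type)
    print("ELAS F1: {:.2f}".format(100 * evaluate(gold_ud, system_ud)["ELAS"].f1))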
715
+ def build_evaluation_table(evaluation, verbose, counts, enhanced):
716
+ text = []
717
+
718
+ # Print the evaluation
719
+ if not verbose and not counts:
720
+ text.append("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
721
+ text.append("MLAS Score: {:.2f}".format(100 * evaluation["MLAS"].f1))
722
+ text.append("BLEX Score: {:.2f}".format(100 * evaluation["BLEX"].f1))
723
+ if enhanced:
724
+ text.append("ELAS F1 Score: {:.2f}".format(100 * evaluation["ELAS"].f1))
725
+ text.append("EULAS F1 Score: {:.2f}".format(100 * evaluation["EULAS"].f1))
726
+ else:
727
+ if counts:
728
+ text.append("Metric | Correct | Gold | Predicted | Aligned")
729
+ else:
730
+ text.append("Metric | Precision | Recall | F1 Score | AligndAcc")
731
+ text.append("-----------+-----------+-----------+-----------+-----------")
732
+ metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"]
733
+ if enhanced:
734
+ metrics += ["ELAS", "EULAS"]
735
+ for metric in metrics:
736
+ if counts:
737
+ text.append("{:11}|{:10} |{:10} |{:10} |{:10}".format(
738
+ metric,
739
+ evaluation[metric].correct,
740
+ evaluation[metric].gold_total,
741
+ evaluation[metric].system_total,
742
+ evaluation[metric].aligned_total or (evaluation[metric].correct if metric == "Words" else "")
743
+ ))
744
+ else:
745
+ text.append("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
746
+ metric,
747
+ 100 * evaluation[metric].precision,
748
+ 100 * evaluation[metric].recall,
749
+ 100 * evaluation[metric].f1,
750
+ "{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
751
+ ))
752
+
753
+ return "\n".join(text)
754
+
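As a rough illustration, the default (non-verbose, non-counts) report built above has the following shape; the scores are placeholders, not real results:

    evaluation = evaluate(gold_ud, system_ud)   # gold_ud/system_ud loaded as shown earlier
    print(build_evaluation_table(evaluation, verbose=False, counts=False, enhanced=True))
    # LAS F1 Score: 81.21
    # MLAS Score: 71.03
    # BLEX Score: 74.59
    # ELAS F1 Score: 63.40
    # EULAS F1 Score: 66.21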
755
+ def main():
756
+ # Parse arguments
757
+ parser = argparse.ArgumentParser()
758
+ parser.add_argument('gold_file', type=str,
759
+ help='Name of the CoNLL-U file with the gold data.')
760
+ parser.add_argument('system_file', type=str,
761
+ help='Name of the CoNLL-U file with the predicted data.')
762
+ parser.add_argument('--verbose', '-v', default=False, action='store_true',
763
+ help='Print all metrics.')
764
+ parser.add_argument('--counts', '-c', default=False, action='store_true',
765
+ help='Print raw counts of correct/gold/system/aligned words instead of precision/recall/F1 for all metrics.')
766
+ parser.add_argument('--no-enhanced', dest='enhanced', action='store_false', default=True,
767
+ help='Turn off evaluation of enhanced dependencies.')
768
+ parser.add_argument('--enhancements', type=str, default='0',
769
+ help='Level of enhancements in the gold data (see guidelines): 0=all (default), 1=no gapping, 2=no shared parents, 3=no shared dependents, 4=no control, 5=no external arguments, 6=no lemma info; combinations: 12=both 1 and 2 apply, etc.')
770
+ parser.add_argument('--no-empty-nodes', default=False, action='store_true',
771
+ help='Empty nodes have been collapsed (needed to correctly evaluate enhanced/gapping). Raise exception if an empty node is encountered.')
772
+ parser.add_argument('--multiple-roots-okay', default=False, action='store_true',
773
+ help='A single sentence can have multiple nodes with HEAD=0.')
774
+ args = parser.parse_args()
775
+
776
+ # Evaluate
777
+ evaluation = evaluate_wrapper(args)
778
+ results = build_evaluation_table(evaluation, args.verbose, args.counts, args.enhanced)
779
+ print(results)
780
+
781
+ if __name__ == "__main__":
782
+ main()
783
+
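For reference, a typical command-line invocation (file names hypothetical) would be

    python conll18_ud_eval.py gold.conllu system.conllu -v

which prints the full metrics table; adding -c switches the table to raw counts instead of precision/recall/F1.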
784
+ # Tests, which can be executed with `python -m unittest conll18_ud_eval`.
785
+ class TestAlignment(unittest.TestCase):
786
+ @staticmethod
787
+ def _load_words(words):
788
+ """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
789
+ lines, num_words = [], 0
790
+ for w in words:
791
+ parts = w.split(" ")
792
+ if len(parts) == 1:
793
+ num_words += 1
794
+ lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1)))
795
+ else:
796
+ lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0]))
797
+ for part in parts[1:]:
798
+ num_words += 1
799
+ lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1)))
800
+ return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"])), "(in-memory test data)", {})
801
+
802
+ def _test_exception(self, gold, system):
803
+ self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))
804
+
805
+ def _test_ok(self, gold, system, correct):
806
+ metrics = evaluate(self._load_words(gold), self._load_words(system))
807
+ gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold))
808
+ system_words = sum((max(1, len(word.split(" ")) - 1) for word in system))
809
+ self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1),
810
+ (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))
811
+
812
+ def test_exception(self):
813
+ self._test_exception(["a"], ["b"])
814
+
815
+ def test_equal(self):
816
+ self._test_ok(["a"], ["a"], 1)
817
+ self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)
818
+
819
+ def test_equal_with_multiword(self):
820
+ self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
821
+ self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
822
+ self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
823
+ self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)
824
+
825
+ def test_alignment(self):
826
+ self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
827
+ self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
828
+ self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
829
+ self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
830
+ self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
831
+ self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
832
+ self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
stanza/stanza/utils/helper_func.py ADDED
@@ -0,0 +1,38 @@
1
+ def make_table(header, content, column_width=None):
2
+ '''
3
+ Input:
4
+ header -> List[str]: table header
5
+ content -> List[List[str]]: table content
6
+ column_width -> int: table column width; set to None for dynamically calculated widths
7
+
8
+ Output:
9
+ table_str -> str: well-formatted string for the table
10
+ '''
11
+ table_str = ''
12
+ len_column, len_row = len(header), len(content) + 1
13
+ if column_width is None:
14
+ # dynamically decide column widths
15
+ lens = [[len(str(h)) for h in header]]
16
+ lens += [[len(str(x)) for x in row] for row in content]
17
+ column_widths = [max(c)+3 for c in zip(*lens)]
18
+ else:
19
+ column_widths = [column_width] * len_column
20
+
21
+ table_str += '=' * (sum(column_widths) + 1) + '\n'
22
+
23
+ table_str += '|'
24
+ for i, item in enumerate(header):
25
+ table_str += ' ' + str(item).ljust(column_widths[i] - 2) + '|'
26
+ table_str += '\n'
27
+
28
+ table_str += '-' * (sum(column_widths) + 1) + '\n'
29
+
30
+ for line in content:
31
+ table_str += '|'
32
+ for i, item in enumerate(line):
33
+ table_str += ' ' + str(item).ljust(column_widths[i] - 2) + '|'
34
+ table_str += '\n'
35
+
36
+ table_str += '=' * (sum(column_widths) + 1) + '\n'
37
+
38
+ return table_str
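A quick usage sketch of make_table (header and values are arbitrary):

    header = ["metric", "precision", "recall"]
    content = [["UPOS", "97.10", "96.80"],
               ["LAS", "85.20", "84.90"]]
    print(make_table(header, content))
    # Prints a table framed by '=' rulers, with '|'-separated, left-justified
    # cells whose widths are the longest entry in each column plus 3.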