Hiroaki Ogasawara committed
Commit · 31fce62
Parent(s): 2638c2c
chore: Gemma2 as option, refactor
app.py
CHANGED
@@ -22,7 +22,11 @@ def process_jsonl_file(jsonl_file_path: str, api_key: str):
     file_name, _ = os.path.splitext(file_name_with_ext)
 
     with tempfile.NamedTemporaryFile(
-        delete=False, prefix=f"{file_name}-report-", suffix=".html", mode="w", encoding="utf-8"
+        delete=False,
+        prefix=f"{file_name}-report-",
+        suffix=".html",
+        mode="w",
+        encoding="utf-8",
     ) as temp_file:
         temp_file.write(html_content)
         output_file = temp_file.name
@@ -35,7 +39,9 @@ def process_jsonl_file(jsonl_file_path: str, api_key: str):
 # Gradioデモ
 with gr.Blocks() as reporting:
     jsonl_input = gr.File(label="JSONLファイルをアップロード")
-    api_key_input = gr.Textbox(label="GeminiのAPIキー(スコアのセルフ評価を行う場合)", type="password")
+    api_key_input = gr.Textbox(
+        label="GeminiのAPIキー(スコアのセルフ評価を行う場合)", type="password"
+    )
     gr.Markdown("APIキーの発行は[こちら](https://aistudio.google.com/app/apikey)")
     process_button = gr.Button("レポートを作成")
 
@@ -43,44 +49,66 @@ with gr.Blocks() as reporting:
     output_text = gr.Textbox(label="システムメッセージ")
 
     process_button.click(
-        process_jsonl_file, inputs=[jsonl_input, api_key_input], outputs=[output_file, output_text]
+        process_jsonl_file,
+        inputs=[jsonl_input, api_key_input],
+        outputs=[output_file, output_text],
     )
 
 llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
 gemma_2 = "google/gemma-2-2b"
 
 llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
-gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)
-
 tokenizers = {
     "LLM-JP-3": llm_jp_3_tokenizer,
-    "Gemma-2": gemma_2_tokenizer
 }
 
+try:
+    gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)
+    tokenizers["Gemma-2"] = gemma_2_tokenizer
+except OSError as e:
+    print(e)
+
+tokenizer_names = list(tokenizers.keys())
+
+
 def tokenize_text(text: str, tokenizer_name: str):
     tokenizer = tokenizers[tokenizer_name]
     tokens = tokenizer.tokenize(text)
-    colors = ["#FFCCCC", "#CCFFCC", "#CCCCFF", "#FFFFCC", "#CCFFFF", "#FFCCFF"]
-    tokenized_text = "".join([f'<span style="background-color:{colors[i % len(colors)]}">{token}</span> ' for i, token in enumerate(tokens)])
+    colors = ["#FFCCCC", "#CCFFCC", "#CCCCFF", "#FFFFCC", "#CCFFFF", "#FFCCFF"]
+    tokenized_text = "".join(
+        [
+            f'<span style="background-color:{colors[i % len(colors)]}">{token}</span> '
+            for i, token in enumerate(tokens)
+        ]
+    )
     token_count = len(tokens)
     return f"<p>{tokenized_text}</p><p>Token Count: {token_count}</p>"
 
+
 with gr.Blocks() as tokenization:
     with gr.Row():
-        tokenizer_dropdown = gr.Dropdown(
+        tokenizer_dropdown = gr.Dropdown(
+            label="Tokenizerを選択", choices=tokenizer_names, value=tokenizer_names[0]
+        )
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(label="Input Text")
         with gr.Column():
-            tokenized_output = gr.HTML(
+            tokenized_output = gr.HTML(
+                tokenize_text("", tokenizer_names[0]), label="Tokenized Output"
+            )
 
-    tokenizer_dropdown.change(tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output)
-    text_input.change(tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output)
+    tokenizer_dropdown.change(
+        tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
+    )
+    text_input.change(
+        tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
+    )
 
 tabbed = gr.TabbedInterface(
-    [reporting, tokenization],
+    [reporting, tokenization],
     tab_names=["ELYZA-tasks-100(-TV) セルフ評価", "トークン化の可視化"],
-    title="LLM開発支援ツール"
+    title="LLM開発支援ツール",
)
 
 if __name__ == "__main__":
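What makes Gemma 2 "an option" here is the try/except around the second from_pretrained call: google/gemma-2-2b is a gated repository, and transformers surfaces a failed (unauthorized) download as an OSError, so the Space keeps running with LLM-JP-3 alone and the dropdown simply offers fewer choices. A minimal standalone sketch of that pattern, reusing the model names from the diff (that the Gemma download fails without an authorized HF token is an assumption about the runtime environment, not something the commit states):

# Sketch: load a base tokenizer unconditionally, treat a gated one as optional.
# Assumption: without an authorized HF token, loading google/gemma-2-2b fails,
# and transformers raises OSError (EnvironmentError is its alias) on such failures.
from transformers import AutoTokenizer

tokenizers = {
    "LLM-JP-3": AutoTokenizer.from_pretrained(
        "llm-jp/llm-jp-3-1.8b", trust_remote_code=True
    ),
}

try:
    tokenizers["Gemma-2"] = AutoTokenizer.from_pretrained(
        "google/gemma-2-2b", trust_remote_code=True
    )
except OSError as e:
    # Gated repo or no network: log and continue with the tokenizers we have.
    print(e)

# The UI derives its choices from whatever actually loaded:
tokenizer_names = list(tokenizers.keys())
print(tokenizer_names)  # ['LLM-JP-3'] or ['LLM-JP-3', 'Gemma-2']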