import gradio as gr

from model import ToyModel

# Model specification
vision_model_path = 'openai/clip-vit-base-patch32'
language_model_path = 'openai-community/gpt2'

model = ToyModel(vision_model_path, language_model_path)


def chat(image_input, text_input):
    """Run one chat turn; echo the input image back alongside the model's reply."""
    text_output = model.chat(image_input, text_input)
    return image_input, text_output


# Gradio UI
def gradio_taskselect(idx):
    """Map a task-shortcut index to a prompt prefix and a matching usage hint."""
    prompt_list = [
        '',
        '[grounding] describe this image in detail',
        '[refer] ',
        '[detection] ',
        '[identify] what is this ',
        '[vqa] ',
    ]
    instruct_list = [
        '**Hint:** Type in whatever you want',
        '**Hint:** Send the command to generate a grounded image description',
        '**Hint:** Type in a phrase about an object in the image and send the command',
        '**Hint:** Type in a caption or phrase, and see object locations in the image',
        '**Hint:** Draw a bounding box on the uploaded image, then send the command. Click the "clear" button on the '
        'top right of the image before redrawing',
        '**Hint:** Send a question to get a short answer',
    ]
    return prompt_list[idx], instruct_list[idx]
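# The demo relies on only a small surface of ToyModel. A minimal sketch of the
# interface assumed throughout this file (the actual implementation lives in
# model.py and is not shown here):
#
#     class ToyModel:
#         def __init__(self, vision_model_path: str, language_model_path: str): ...
#         def chat(self, image, text: str) -> str:
#             """Take a PIL image and a prompt string, return the reply text."""
#
# Any object satisfying this contract could be swapped in for ToyModel.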

title = """
# RS-Visual Perception Demo
"""

""" description = 'Welcome to Our RS-Visual Perception Demo!' introduction = ''' For Abilities Involving Visual Grounding: 1. Grounding: CLICK **Send** to generate a grounded image description. 2. Refer: Input a referring object and CLICK **Send**. 3. Detection: Write a caption or phrase, and CLICK **Send**. 4. Identify: Draw the bounding box on the uploaded image window and CLICK **Send** to generate the bounding box. (CLICK "clear" button before re-drawing next time). 5. VQA: Input a visual question and CLICK **Send**. 6. No Tag: Input whatever you want and CLICK **Send** without any tagging You can also simply chat in free form! ''' with gr.Blocks() as demo: gr.Markdown(title) gr.Markdown(description) with gr.Row(): with gr.Column(scale=0.5): image_input = gr.Image(type="pil", label="Input Image") temperature = gr.Slider( minimum=0.1, maximum=1.5, value=0.6, step=0.1, interactive=True, label="Temperature", ) dataset = gr.Dataset( components=[gr.Textbox(visible=False)], samples=[['No Tag'], ['Grounding'], ['Refer'], ['Detection'], ['Identify'], ['VQA']], type="index", label='Task Shortcuts', ) task_inst = gr.Markdown('**Hint:** Upload your image and chat') text_input = gr.Textbox(label='Input text', placeholder='Upload your image and chat', interactive=True, ) submit_button = gr.Button("Submit", variant='primary', size='sm', scale=1) gr.Markdown(introduction) with gr.Column(): image_output = gr.Image(type="pil", label='Output image') text_output = gr.Textbox(label='Output text', interactive=True) with gr.Row(): with gr.Column(): gr.Examples(examples=[ ["examples_v2/office.jpg", "[grounding] describe this image in detail"], ["examples_v2/sofa.jpg", "[detection] sofas"], ["examples_v2/2000x1372_wmkn_0012149409555.jpg", "[refer] the world cup"], ["examples_v2/KFC-20-for-20-Nuggets.jpg", "[identify] what is this {<4><50><30><65>}"], ], inputs=[image_input, text_input], fn=chat, outputs=[image_output, text_output]) with gr.Column(): gr.Examples(examples=[ ["examples_v2/glip_test.jpg", "[vqa] where should I hide in this room when playing hide and seek"], ["examples_v2/float.png", "Please write a poem about the image"], ["examples_v2/thief.png", "Is the weapon fateful"], ["examples_v2/cockdial.png", "What might happen in this image in the next second"], ], inputs=[image_input, text_input], fn=chat, outputs=[image_output, text_output]) dataset.click( gradio_taskselect, inputs=[dataset], outputs=[text_input, task_inst], show_progress="hidden", postprocess=False, queue=False, ) text_input.submit( chat, inputs=[image_input, text_input], outputs=[image_output, text_output], ) submit_button.click( chat, inputs=[image_input, text_input], outputs=[image_output, text_output], ) demo.launch()