RS-Visual Perception Demo

import re

import gradio as gr

from model import ToyModel

"""
Model specification
"""
vision_model_path = 'openai/clip-vit-base-patch32'
language_model_path = 'openai-community/gpt2'
model = ToyModel(vision_model_path, language_model_path)


def chat(image_input, text_input):
    text_output = model.chat(image_input, text_input)
    return image_input, text_output


"""
Gradio 
"""


def gradio_taskselect(idx):
    prompt_list = [
        '',
        '[grounding] describe this image in detail',
        '[refer] ',
        '[detection] ',
        '[identify] what is this ',
        '[vqa] '
    ]
    instruct_list = [
        '**Hint:** Type in whatever you want',
        '**Hint:** Send the command to generate a grounded image description',
        '**Hint:** Type in a phrase about an object in the image and send the command',
        '**Hint:** Type in a caption or phrase, and see object locations in the image',
        '**Hint:** Draw a bounding box on the uploaded image then send the command. Click the "clear" botton on the '
        'top right of the image before redraw',
        '**Hint:** Send a question to get a short answer',
    ]
    return prompt_list[idx], instruct_list[idx]


title = """<h1 align="center">RS-Visual Perception Demo</h1>"""
description = 'Welcome to Our RS-Visual Perception Demo!'

introduction = '''
For Abilities Involving Visual Grounding:
1. Grounding: CLICK **Send** to generate a grounded image description.
2. Refer: Input a referring object and CLICK **Send**.
3. Detection: Write a caption or phrase, and CLICK **Send**.
4. Identify: Draw the bounding box on the uploaded image window and CLICK **Send** to generate the bounding box. (CLICK "clear" button before re-drawing next time).
5. VQA: Input a visual question and CLICK **Send**.
6. No Tag: Input whatever you want and CLICK **Send** without any tagging
You can also simply chat in free form!
'''

with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=0.5):
            image_input = gr.Image(type="pil", label="Input Image")

            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=0.6,
                step=0.1,
                interactive=True,
                label="Temperature",
            )

            dataset = gr.Dataset(
                components=[gr.Textbox(visible=False)],
                samples=[['No Tag'], ['Grounding'], ['Refer'], ['Detection'], ['Identify'], ['VQA']],
                type="index",
                label='Task Shortcuts',
            )
            task_inst = gr.Markdown('**Hint:** Upload your image and chat')
            text_input = gr.Textbox(label='Input text', placeholder='Upload your image and chat', interactive=True, )
            submit_button = gr.Button("Submit", variant='primary', size='sm', scale=1)

            gr.Markdown(introduction)

        with gr.Column():
            image_output = gr.Image(type="pil", label='Output image')
            text_output = gr.Textbox(label='Output text', interactive=True)

    with gr.Row():
        with gr.Column():
            gr.Examples(examples=[
                ["examples_v2/office.jpg", "[grounding] describe this image in detail"],
                ["examples_v2/sofa.jpg", "[detection] sofas"],
                ["examples_v2/2000x1372_wmkn_0012149409555.jpg", "[refer] the world cup"],
                ["examples_v2/KFC-20-for-20-Nuggets.jpg", "[identify] what is this {<4><50><30><65>}"],
            ], inputs=[image_input, text_input], fn=chat,
                outputs=[image_output, text_output])
        with gr.Column():
            gr.Examples(examples=[
                ["examples_v2/glip_test.jpg", "[vqa] where should I hide in this room when playing hide and seek"],
                ["examples_v2/float.png", "Please write a poem about the image"],
                ["examples_v2/thief.png", "Is the weapon fateful"],
                ["examples_v2/cockdial.png", "What might happen in this image in the next second"],
            ], inputs=[image_input, text_input], fn=chat,
                outputs=[image_output, text_output])

    dataset.click(
        gradio_taskselect,
        inputs=[dataset],
        outputs=[text_input, task_inst],
        show_progress="hidden",
        postprocess=False,
        queue=False,
    )

    text_input.submit(
        chat,
        inputs=[image_input, text_input],
        outputs=[image_output, text_output],
    )

    submit_button.click(
        chat,
        inputs=[image_input, text_input],
        outputs=[image_output, text_output],
    )

demo.launch()