import re
import gradio as gr
from model import ToyModel
"""
Model specification
"""
vision_model_path = 'openai/clip-vit-base-patch32'
language_model_path = 'openai-community/gpt2'
model = ToyModel(vision_model_path, language_model_path)
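# `model.py` is not shown in this Space; the sketch below reflects only the
# interface this app assumes (hypothetical signatures, kept in comments so
# nothing shadows the real import):
#
#   class ToyModel:
#       def __init__(self, vision_model_path: str, language_model_path: str): ...
#       def chat(self, image, text: str) -> str:
#           """Return the model's text reply for an image and a prompt."""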
def chat(image_input, text_input):
    # Run one round of inference and echo the input image back for display.
    text_output = model.chat(image_input, text_input)
    return image_input, text_output
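# Example round-trip (hypothetical paths), assuming a PIL image as input:
#   from PIL import Image
#   img, reply = chat(Image.open("examples_v2/office.jpg"), "[vqa] what season is it")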
"""
Gradio
"""
def gradio_taskselect(idx):
    # Map a task-shortcut index to its prompt prefix and usage hint.
    prompt_list = [
        '',
        '[grounding] describe this image in detail',
        '[refer] ',
        '[detection] ',
        '[identify] what is this ',
        '[vqa] '
    ]
    instruct_list = [
        '**Hint:** Type in whatever you want',
        '**Hint:** Send the command to generate a grounded image description',
        '**Hint:** Type in a phrase about an object in the image and send the command',
        '**Hint:** Type in a caption or phrase, and see object locations in the image',
        '**Hint:** Draw a bounding box on the uploaded image, then send the command. Click the "clear" button on the '
        'top right of the image before redrawing',
        '**Hint:** Send a question to get a short answer',
    ]
    return prompt_list[idx], instruct_list[idx]
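# `re` is imported at the top but otherwise unused here. The "[identify]" task
# embeds box coordinates in the prompt as "{<x1><y1><x2><y2>}" (see the
# examples further down); below is a minimal, hypothetical sketch of parsing
# such a tag, assuming integer coordinates:
def parse_bbox_tag(text):
    """Return (x1, y1, x2, y2) from a "{<x1><y1><x2><y2>}" tag, or None."""
    match = re.search(r'\{<(\d+)><(\d+)><(\d+)><(\d+)>\}', text)
    return tuple(int(v) for v in match.groups()) if match else None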
title = """<h1 align="center">RS-Visual Perception Demo</h1>"""
description = 'Welcome to Our RS-Visual Perception Demo!'
introduction = '''
For Abilities Involving Visual Grounding:
1. Grounding: CLICK **Send** to generate a grounded image description.
2. Refer: Input a referring object and CLICK **Send**.
3. Detection: Write a caption or phrase, and CLICK **Send**.
4. Identify: Draw a bounding box on the uploaded image window and CLICK **Send** to identify the object in the box. (CLICK the "clear" button before re-drawing.)
5. VQA: Input a visual question and CLICK **Send**.
6. No Tag: Input whatever you want and CLICK **Send** without any tagging.
You can also simply chat in free form!
'''
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=0.5):
            image_input = gr.Image(type="pil", label="Input Image")
            # NOTE: the slider is shown in the UI but is not currently passed
            # to chat(); wire it up once model.chat accepts a temperature.
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=0.6,
                step=0.1,
                interactive=True,
                label="Temperature",
            )
            # type="index" makes a click return the row index, which
            # gradio_taskselect maps to a prompt prefix and hint.
            dataset = gr.Dataset(
                components=[gr.Textbox(visible=False)],
                samples=[['No Tag'], ['Grounding'], ['Refer'], ['Detection'], ['Identify'], ['VQA']],
                type="index",
                label='Task Shortcuts',
            )
            task_inst = gr.Markdown('**Hint:** Upload your image and chat')
            text_input = gr.Textbox(label='Input text', placeholder='Upload your image and chat', interactive=True)
            submit_button = gr.Button("Submit", variant='primary', size='sm', scale=1)
            gr.Markdown(introduction)
        with gr.Column():
            image_output = gr.Image(type="pil", label='Output image')
            text_output = gr.Textbox(label='Output text', interactive=True)
    with gr.Row():
        with gr.Column():
            gr.Examples(
                examples=[
                    ["examples_v2/office.jpg", "[grounding] describe this image in detail"],
                    ["examples_v2/sofa.jpg", "[detection] sofas"],
                    ["examples_v2/2000x1372_wmkn_0012149409555.jpg", "[refer] the world cup"],
                    ["examples_v2/KFC-20-for-20-Nuggets.jpg", "[identify] what is this {<4><50><30><65>}"],
                ],
                inputs=[image_input, text_input],
                fn=chat,
                outputs=[image_output, text_output],
            )
        with gr.Column():
            gr.Examples(
                examples=[
                    ["examples_v2/glip_test.jpg", "[vqa] where should I hide in this room when playing hide and seek"],
                    ["examples_v2/float.png", "Please write a poem about the image"],
                    ["examples_v2/thief.png", "Is the weapon fateful"],
                    ["examples_v2/cockdial.png", "What might happen in this image in the next second"],
                ],
                inputs=[image_input, text_input],
                fn=chat,
                outputs=[image_output, text_output],
            )
    dataset.click(
        gradio_taskselect,
        inputs=[dataset],
        outputs=[text_input, task_inst],
        show_progress="hidden",
        postprocess=False,
        queue=False,
    )

    text_input.submit(
        chat,
        inputs=[image_input, text_input],
        outputs=[image_output, text_output],
    )

    submit_button.click(
        chat,
        inputs=[image_input, text_input],
        outputs=[image_output, text_output],
    )
demo.launch()
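# On Spaces, a bare launch() is enough. For local testing you could enable
# request queuing and a temporary public URL instead (optional):
#   demo.queue().launch(share=True)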