File size: 4,568 Bytes
400ac25 b2b8df9 400ac25 b2b8df9 400ac25 b2b8df9 400ac25 b2b8df9 400ac25 b2b8df9 b6ed801 400ac25 b2b8df9 400ac25 9c906a9 400ac25 b2b8df9 400ac25 b2b8df9 9c906a9 400ac25 b2b8df9 400ac25 b6ed801 b2b8df9 400ac25 b2b8df9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | import streamlit as st
from datasets import load_dataset
from haystack import Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
from utils import get_unique_docs
# Load the dataset
@st.cache_data(show_spinner=False)
def load_documents():
    """
    Fetch the "basketball" configuration of the QASports dataset and gather
    its unique documents.

    The splits are visited in the order validation, train, test, and one
    shared set is handed to `get_unique_docs` for all three of them.

    Returns:
    - documents: the three `get_unique_docs` results concatenated in that
      same order (validation + train + test).
    """
    dataset_name = "PedroCJardim/QASports"
    dataset_split = "basketball"
    st.caption(f'Fetching "{dataset_name}" dataset')

    # build the dataset
    dataset = load_dataset(dataset_name, name=dataset_split)

    # A single set is reused for every split (presumably how get_unique_docs
    # remembers what it has already returned -- confirm in utils).
    seen = set()
    from_validation, from_train, from_test = (
        get_unique_docs(dataset[split], seen)
        for split in ("validation", "train", "test")
    )
    return from_validation + from_train + from_test
@st.cache_resource(show_spinner=False)
def get_document_store(documents):
    """
    Index the documents in an in-memory document store.

    Args:
    - documents: the documents to write into the store (the caller in this
      file passes the result of `load_documents`).

    Returns:
    - document_store: the `InMemoryDocumentStore` the documents were
      written to.
    """
    # NOTE(review): unlike `_doc_store` in `get_question_pipeline`, this
    # argument has no leading underscore, so Streamlit presumably hashes the
    # whole document list to build the cache key -- confirm that is intended.

    # Create in memory database
    st.caption("Building the Document Store")
    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents=documents)
    return document_store
@st.cache_resource(show_spinner=False)
def get_question_pipeline(_doc_store):
    """
    Create the pipeline with the retriever and reader components.

    Args:
    - _doc_store: instance of the document store the retriever searches.
      NOTE(review): the leading underscore presumably tells Streamlit not
      to hash this argument for the cache key -- confirm.

    Returns:
    - pipe: instance of the pipeline, with the "retriever" documents output
      connected to the "reader" documents input.
    """
    st.caption("Building the Question Answering pipeline")

    # Create the retriever and reader
    retriever = InMemoryBM25Retriever(document_store=_doc_store)
    reader = ExtractiveReader(model="deepset/roberta-base-squad2")
    reader.warm_up()

    # Create the pipeline
    pipe = Pipeline()
    pipe.add_component(instance=retriever, name="retriever")
    pipe.add_component(instance=reader, name="reader")
    pipe.connect("retriever.documents", "reader.documents")
    return pipe
def search(pipeline, question: str, *, top_k: int = 3, retriever_top_k: int = 10):
    """
    Search for the answer to a question in the documents.

    Args:
    - pipeline: object with a `run(data=...)` method that has "retriever"
      and "reader" components; its result must expose the answers under
      ["reader"]["answers"].
    - question: string with the question.
    - top_k: number of answers requested from the reader, and the maximum
      number of answers returned.
    - retriever_top_k: number of documents requested from the retriever.

    Returns:
    - answers: list with at most `top_k` answers, in the order the pipeline
      returned them.
    """
    # Get the answers
    result = pipeline.run(
        data={
            "retriever": {"query": question, "top_k": retriever_top_k},
            "reader": {"query": question, "top_k": top_k},
        }
    )
    # Slicing already clamps to the list length, so no explicit min() is
    # needed. The slice itself is kept in case the reader hands back more
    # than `top_k` entries.
    return result["reader"]["answers"][:top_k]
# Loading status
# Build the documents, the document store and the pipeline while reporting
# progress in a status box. The box is expanded only until the first
# successful load: the "expanded" session flag is set to False at the end of
# this block and read back on the next rerun.
with st.status(
    "Downloading dataset...", expanded=st.session_state.get("expanded", True)
) as status:
    documents = load_documents()
    status.update(label="Indexing documents...")
    doc_store = get_document_store(documents)
    status.update(label="Creating pipeline...")
    pipe = get_question_pipeline(doc_store)
    status.update(
        label="Download and indexing complete!", state="complete", expanded=False
    )
    st.session_state["expanded"] = False

# NOTE(review): the emoji and punctuation in the user-facing strings below
# were mis-encoded (they showed up as stray Greek letters). The dash, the
# apostrophe and the brain emoji decode unambiguously; the other emoji are
# best guesses -- confirm or replace them.
NOT_FOUND_MESSAGE = "❌ HoopMind couldn’t find an answer for that one..."

st.subheader("🏀 HoopMind Basketball Wiki", divider="rainbow")
st.caption(
    """Welcome to **HoopMind**!
This AI answers basketball questions using the QASports dataset — the first large sports question answering dataset.
It includes real info on players, teams, and matches from basketball, soccer, and football, with over **1.5M Q&A pairs** across **54k+ documents**."""
)

if user_query := st.text_input(
    label="Ask HoopMind anything about Basketball! 🧠",
    placeholder="Who is Kobe Bryant?",
):
    # Get the answers
    with st.spinner("Thinking... 🏀"):
        try:
            # Drop entries that carry no text or no source document
            # (presumably the reader's "no answer" candidate -- confirm in
            # the ExtractiveReader docs). Rendering them would fail on
            # `ans.document.meta` after some answers were already shown.
            answers = [
                ans
                for ans in search(pipe, user_query)
                if ans.data is not None and ans.document is not None
            ]

            # Without this, an empty result rendered nothing at all.
            if not answers:
                st.error(NOT_FOUND_MESSAGE)

            for idx, ans in enumerate(answers, start=1):
                meta = ans.document.meta
                # Join with Markdown hard line breaks ("  \n") so every
                # field gets its own line no matter how this code is
                # indented or whether trailing spaces survive formatting.
                st.info(
                    "  \n".join(
                        (
                            f'**Answer {idx}:** "{ans.data}"',
                            f"🥇 Score: {ans.score:0.4f}",
                            f'📄 Document: "{meta["title"]}"',
                            f"🔗 URL: {meta['url']}",
                        )
                    )
                )
                with st.expander("See details", expanded=False):
                    st.write(ans)
                st.divider()
        except Exception:
            # Best-effort UI boundary kept from the original: any failure
            # while searching or rendering is reported as "not found"
            # instead of crashing the page.
            st.error(NOT_FOUND_MESSAGE)
|