import pandas as pd
import numpy as np
import xgboost as xgb
import streamlit as st
import requests
from bs4 import BeautifulSoup
from gensim.models import FastText
import joblib
# Load the trained FastText model
try:
    fasttext_model = FastText.load('fasttext_model.bin')
except FileNotFoundError:
    st.error("The FastText model file was not found. Please ensure 'fasttext_model.bin' and its associated files are in the correct directory.")
    st.stop()
# Load the trained XGBoost model for the combined features
try:
    model = joblib.load('model.pkl')
except FileNotFoundError:
    st.error("The XGBoost model file was not found. Please ensure 'model.pkl' is in the correct directory.")
    st.stop()
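
# Split text on whitespace; non-string inputs (e.g. NaN from a CSV) yield an empty token list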
def tokenize(text):
    if isinstance(text, str):
        return text.split()
    else:
        return []
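
# Average the FastText vectors of a text's in-vocabulary tokens;
# fall back to a zero vector when the text is empty or fully out-of-vocabulary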
def embed_text(text_series, fasttext_model):
    embeddings = []
    for text in text_series:
        tokens = tokenize(text)
        vectors = [fasttext_model.wv[token] for token in tokens if token in fasttext_model.wv]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            embeddings.append(np.zeros(fasttext_model.vector_size))
    return np.array(embeddings)
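
# Embed query, title, description, and URL separately, concatenate the four
# embeddings into one feature row, and wrap it in a DMatrix for XGBoost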
def preprocess_input(query, title, description, url, fasttext_model):
    query = str(query) if pd.notna(query) else ''
    title = str(title) if pd.notna(title) else ''
    description = str(description) if pd.notna(description) else ''
    url = str(url) if pd.notna(url) else ''
    query_ft = embed_text(pd.Series([query]), fasttext_model)
    title_ft = embed_text(pd.Series([title]), fasttext_model)
    description_ft = embed_text(pd.Series([description]), fasttext_model)
    url_ft = embed_text(pd.Series([url]), fasttext_model)
    combined_features = np.hstack([query_ft, title_ft, description_ft, url_ft])
    dmatrix = xgb.DMatrix(combined_features)
    return dmatrix
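
# Fetch a page with a browser-like User-Agent and pull its <title> and meta description;
# any request or parsing failure returns placeholder strings instead of raising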
def extract_title_description(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.title.string if soup.title else 'No title found'
        description_tag = soup.find('meta', attrs={'name': 'description'})
        description = description_tag['content'] if description_tag else 'No description found'
        return title, description
    except Exception:
        return 'Error extracting title', 'Error extracting description'
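
# Run the combined-feature model on a single example and threshold the predicted
# click probability at 0.5; assumes the joblib-loaded model exposes a Booster-style
# predict(DMatrix) as used here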
def predict(query, title, description, url, fasttext_model):
    dmatrix = preprocess_input(query, title, description, url, fasttext_model)
    probability = model.predict(dmatrix, validate_features=False)[0]
    binary_prediction = int(probability >= 0.5)
    return binary_prediction, probability
# Streamlit interface
st.title('CTR Prediction Inference')
tab1, tab2, tab3 = st.tabs(["Single Entry", "Batch Entry", "A/B Test"])
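
# Tab 1: predict for a single query/URL pair, scraping the title and description on the fly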
with tab1:
    st.header('Single Entry Inference')
    query = st.text_input('Query')
    url = st.text_input('URL')
    if st.button('Predict'):
        title, description = extract_title_description(url)
        st.write(f'Extracted Title: {title}')
        st.write(f'Extracted Description: {description}')
        if query and url:
            binary_result, confidence = predict(query, title, description, url, fasttext_model)
            st.write(f'Predicted +/-: {binary_result}')
            st.write(f'Conf.: {confidence:.2%}')
            confidence_percentage = int(confidence * 100)
            st.progress(confidence_percentage)
        else:
            st.write('Please enter both a query and a URL.')
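
# Tab 2: batch inference over an uploaded CSV with Query, Title, Description, and URL columns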
with tab2:
    st.header('Batch Entry Inference')
    uploaded_file = st.file_uploader("Upload CSV", type="csv")
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        required_columns = ['Query', 'Title', 'Description', 'URL']
        if set(required_columns).issubset(df.columns):
            predictions = []
            confidences = []
            for index, row in df.iterrows():
                binary_result, confidence = predict(row['Query'], row['Title'], row['Description'], row['URL'], fasttext_model)
                predictions.append(binary_result)
                confidences.append(confidence)
            df['+/-'] = predictions
            df['Conf.'] = [f"{conf:.2%}" for conf in confidences]
            cols = ['+/-', 'Conf.'] + [col for col in df.columns if col not in ['+/-', 'Conf.']]
            df = df[cols]
            st.write(df)
            st.download_button("Download Predictions", df.to_csv(index=False), "predictions.csv")
        else:
            st.write('CSV must contain Query, Title, Description, and URL columns.')
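
# Tab 3: A/B test — variant A is scraped from the URL, variant B is edited by the user,
# and both are scored against the same query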
with tab3:
    st.header('A/B Test Inference')
    query = st.text_input('Query for A/B Test')
    url = st.text_input('URL for A/B Test')
    if 'step' not in st.session_state:
        st.session_state.step = 0
    if st.button('Scrape A/B'):
        title_A, description_A = extract_title_description(url)
        st.session_state['title_A'] = title_A
        st.session_state['description_A'] = description_A
        st.session_state.step = 1
    if st.session_state.step == 1:
        title_B = st.text_input('Title B', value=st.session_state.get('title_A', ''))
        description_B = st.text_area('Description B', value=st.session_state.get('description_A', ''))
        if st.button('Predict A/B'):
            if query and url:
                binary_result_A, confidence_A = predict(query, st.session_state['title_A'], st.session_state['description_A'], url, fasttext_model)
                binary_result_B, confidence_B = predict(query, title_B, description_B, url, fasttext_model)
                st.write(f'Results for A: Predicted +/-: {binary_result_A}, Conf.: {confidence_A:.2%}')
                st.write(f'Results for B: Predicted +/-: {binary_result_B}, Conf.: {confidence_B:.2%}')
                if binary_result_A == 1 and binary_result_B == 0:
                    st.write("B is worse than A")
                elif binary_result_A == 0 and binary_result_B == 1:
                    st.write("B is better than A")
                else:
                    st.write("B is the same as A")