Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import datasets | |
| from sentence_transformers import SentenceTransformer, util | |
| import torch | |
| import re | |
| import nltk | |
| from nltk.corpus import words, stopwords | |
| import urllib.parse as _url | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline | |
| from nltk.stem import PorterStemmer | |
| import gradio as gr | |
| import os | |
| from tqdm import tqdm | |
# Register tqdm with pandas so DataFrame.progress_apply is available.
tqdm.pandas()

# --- NLTK Data Download ---
# Each NLTK package lives under a different resource category; look it up
# locally first and download only when it is missing.
for package, category in (
    ('words', 'corpora'),
    ('stopwords', 'corpora'),
    ('averaged_perceptron_tagger', 'taggers'),
    ('punkt', 'tokenizers'),
):
    try:
        nltk.data.find(f'{category}/{package}')
    except LookupError:
        nltk.download(package)

# English stop words and a Porter stemmer, created once at import time.
STOPWORDS = set(stopwords.words('english'))
stemmer = PorterStemmer()
# --- Expanded Skill Whitelist ---
# Lower-case skill phrases that the NLTK chunker and the direct text scan are
# allowed to report. Grouped by theme for readability; the groups are merged
# into one flat set below.
_TECHNICAL_AND_DATA_SKILLS = (
    'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql',
    'html', 'css', 'react', 'angular', 'vue',
    'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails',
    'php', 'swift', 'kotlin', 'dart', 'flutter',
    'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras',
    'scikit-learn', 'pandas', 'numpy', 'matplotlib',
    'natural language processing', 'nlp', 'computer vision', 'data analysis',
    'data science', 'data engineering',
    'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau',
    'power bi', 'd3.js', 'statistics', 'analytics',
    'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes',
    'terraform', 'ansible', 'ci/cd', 'jenkins',
    'git', 'github', 'devops', 'linux', 'unix', 'shell scripting',
    'powershell', 'cybersecurity', 'penetration testing',
    'network security', 'cryptography', 'blockchain', 'c#', '.net',
    'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
    'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices',
    'serverless', 'system design', 'saas',
)

_BUSINESS_AND_CONSULTING_SKILLS = (
    'agile', 'scrum', 'project management', 'product management',
    'consulting', 'client management', 'business development',
    'strategy', 'stakeholder management', 'risk management', 'compliance',
    'aml', 'kyc', 'reinsurance', 'finance',
    'financial modeling', 'financial analysis', 'due diligence', 'sourcing',
    'procurement', 'negotiation', 'supply chain',
    'business analysis', 'business intelligence', 'presentations',
    'public speaking', 'time management', 'critical thinking',
    'design thinking', 'innovation', 'adaptability', 'supervisory', 'pmp',
    'cpsm', 'cips', 'microsoft office', 'communication',
    'organizational skills',
)

_SOFT_AND_OTHER_SKILLS = (
    'leadership', 'stakeholder communication', 'client communication',
    'teamwork', 'collaboration', 'problem solving',
    'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design',
    'autocad', 'solidworks', 'sales', 'marketing',
    'seo', 'sem', 'content writing', 'customer support', 'technical writing',
    'sap', 'oracle', 'budgeting', 'mentoring', 'supervising',
)

SKILL_WHITELIST = {
    *_TECHNICAL_AND_DATA_SKILLS,
    *_BUSINESS_AND_CONSULTING_SKILLS,
    *_SOFT_AND_OTHER_SKILLS,
}
# --- GLOBAL STATE & DATA ---
# Identifiers of the models the app loads.
LLM_MODEL_NAME = "microsoft/phi-2"
FINETUNED_MODEL_ID = "its-zion-18/projfinetuned"

# Everything below is a placeholder that the initialisation functions fill in
# through `global` assignments.
original_df = combined_df = None            # job tables
model = None                                # sentence-embedding model
combined_job_embeddings = original_job_title_embeddings = None
LLM_PIPELINE = None                         # text-generation pipeline, or None when unavailable
KNOWN_WORDS = set()                         # vocabulary used by the spelling check
| # --- CORE NLP & HELPER FUNCTIONS --- | |
| def _norm_skill_token(s: str) -> str: | |
| s = s.lower().strip() | |
| s = re.sub(r'[\(\)\[\]\{\}\*]', '', s) | |
| s = re.sub(r'^\W+|\W+$', '', s) | |
| s = re.sub(r'\s+', ' ', s) | |
| return s | |
def build_known_vocabulary(df: pd.DataFrame):
    """Rebuild the global ``KNOWN_WORDS`` set used by the spelling check.

    The vocabulary is the lower-cased NLTK English word list plus every purely
    alphabetic token longer than two characters found in ``df['full_text']``.

    Returns a fixed status message.
    """
    global KNOWN_WORDS
    vocabulary = {entry.lower() for entry in words.words()}

    corpus_text = " ".join(df['full_text'].astype(str).tolist()).lower()
    for token in set(re.findall(r'\b\w+\b', corpus_text)):
        # Skip numbers, mixed tokens and very short fragments.
        if len(token) > 2 and token.isalpha():
            vocabulary.add(token)

    KNOWN_WORDS = vocabulary
    return "Known vocabulary built."
def check_spelling_in_query(query: str) -> list[str]:
    """Return the words in *query* that are not in ``KNOWN_WORDS``.

    The query is lower-cased and split on whitespace. Punctuation attached to
    a token ("engineer," / "role!") is trimmed before the lookup so that such
    words are actually checked instead of being silently skipped. Only purely
    alphabetic words of at least two characters are considered.

    Returns each unrecognised word once, in order of first appearance, or an
    empty list when the vocabulary has not been built or the query is empty.
    """
    if not KNOWN_WORDS or not query:
        return []

    unrecognized_words = {}  # dict used as an insertion-ordered set
    for token in query.lower().split():
        word = re.sub(r'^\W+|\W+$', '', token)
        if word.isalpha() and len(word) > 1 and word not in KNOWN_WORDS:
            unrecognized_words[word] = None
    return list(unrecognized_words)
def initialize_llm_client():
    """Load the local text-generation model into the global ``LLM_PIPELINE``.

    Loads the tokenizer and causal LM named by ``LLM_MODEL_NAME`` and wraps
    them in a ``text-generation`` pipeline.

    Returns True on success. Any failure is reported on stdout and False is
    returned, leaving ``LLM_PIPELINE`` untouched so callers can fall back to
    the non-LLM code paths.
    """
    global LLM_PIPELINE
    try:
        tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
        model_llm = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
        )
        LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
        return True
    except Exception as e:
        # Best-effort: the app keeps working without the LLM, so report and carry on.
        print(f"🚨 ERROR initializing local LLM: {e}")
        return False
def llm_expand_query(user_input: str) -> str:
    """Expand a short career interest into a richer search query via the LLM.

    Returns ``user_input`` unchanged when no LLM pipeline is loaded or when
    generation fails. Otherwise returns the original input followed by the
    generated sentence, with newlines flattened and colons removed.
    """
    global LLM_PIPELINE
    if not LLM_PIPELINE:
        return user_input

    answer_marker = "Expanded Intent:"
    prompt_template = "".join([
        f"User's career interest: '{user_input}'\n",
        "Instruction: Based on the user's interest, write a concise, single-sentence summary (40-60 words) that elaborates on the core intent, typical skills, and responsibilities. ",
        "Do not include a preamble, the user input, or any list formatting in the output. Just the expanded sentence.\n",
        answer_marker,
    ])

    try:
        response = LLM_PIPELINE(prompt_template, max_new_tokens=100, do_sample=True, temperature=0.6)
        generated = response[0]['generated_text'].strip()
        # The pipeline echoes the prompt; keep only what follows the last marker.
        expanded_query = generated.split(answer_marker)[-1].strip()
        cleaned = expanded_query.replace('\n', ' ').replace(':', '').strip()
        final_query = f"{user_input}. {cleaned}"
        # Avoid a doubled full stop when the input already ended with one.
        return final_query.replace('..', '.').strip()
    except Exception:
        return user_input
def find_job_matches(original_user_query: str, expanded_user_query: str, top_k: int = 50) -> pd.DataFrame:
    """Rank jobs by a blend of full-text and title similarity.

    Stage 1 scores every row of ``combined_df`` against the expanded query and
    keeps the best-scoring row per ``job_id``. Stage 2 scores the original
    query against the job titles of ``original_df``. The final
    ``Similarity Score`` is 70% stage 1 plus 30% stage 2.

    Returns the ``top_k`` rows of ``original_df`` sorted by that score,
    indexed by job id, with the id also available as the ``Job ID`` column.
    """
    # --- Stage 1: full-text similarity against every (original + augmented) row.
    query_vec = model.encode(expanded_user_query, convert_to_tensor=True)
    text_sims = util.cos_sim(query_vec, combined_job_embeddings)[0]
    ranking = torch.topk(text_sims, k=len(combined_df))  # k == n, i.e. a full descending sort
    ranked_rows = combined_df.iloc[ranking.indices.cpu()].copy()
    ranked_rows['general_score'] = ranking.values.cpu().numpy()
    # Rows are in descending score order, so "first" is the best row per job.
    best_per_job = ranked_rows.drop_duplicates(subset=['job_id'], keep='first').set_index('job_id')

    # --- Stage 2: title similarity against the un-expanded query.
    title_vec = model.encode(original_user_query, convert_to_tensor=True)
    title_sims = util.cos_sim(title_vec, original_job_title_embeddings)[0].cpu().numpy()
    boost_by_job = pd.Series(title_sims, index=original_df['job_id'])
    best_per_job['title_boost_score'] = best_per_job.index.map(boost_by_job).fillna(0)

    # --- Blend the two signals.
    best_per_job['Similarity Score'] = (
        0.70 * best_per_job['general_score'] + 0.30 * best_per_job['title_boost_score']
    )

    # --- Stage 3: pull the winning rows out of original_df and attach the scores.
    ordered = best_per_job.sort_values(by='Similarity Score', ascending=False)
    winning_ids = ordered.head(top_k).index.tolist()
    score_lookup = best_per_job.reset_index()[['job_id', 'Similarity Score']].copy()

    results = original_df[original_df['job_id'].isin(winning_ids)].copy()
    results = pd.merge(results, score_lookup, on='job_id', how='left')
    results = results.sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)
    results = results.set_index('job_id', drop=False)
    return results.rename(columns={'job_id': 'Job ID'})
def _as_skill_list(value) -> list:
    """Coerce one ``Skills`` cell (list, NumPy array, None, NaN) to a plain list."""
    if value is None:
        return []
    try:
        return list(value)
    except TypeError:  # e.g. a float NaN left by a missing value
        return []


def _sorted_by_similarity(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* sorted by ``Similarity Score`` when that column exists, else a copy."""
    if 'Similarity Score' in df.columns:
        return df.sort_values(by='Similarity Score', ascending=False)
    return df.copy()


def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
    """Re-rank job matches by how well the user's skills cover each job's skills.

    For every job skill the best cosine similarity to any user skill is taken;
    a job's ``Skill Match Score`` is the mean of those values, scaled down for
    jobs that list fewer than five skills. ``Final Score`` is
    80% ``Similarity Score`` + 20% ``Skill Match Score``.

    Args:
        user_skills: normalised skill phrases entered by the user.
        df_to_rank: matches carrying ``Similarity Score``, ``Skills`` and
            ``Job ID`` columns.

    Returns:
        An empty DataFrame when ``df_to_rank`` is None. When there is nothing
        to score (empty frame, no user skills, or no ``Skills`` column) the
        input sorted by ``Similarity Score``. Otherwise a copy with the two
        score columns added, sorted by ``Final Score`` and indexed by
        ``Job ID``.
    """
    if df_to_rank is None:
        return pd.DataFrame()
    if df_to_rank.empty or not user_skills or 'Skills' not in df_to_rank.columns:
        return _sorted_by_similarity(df_to_rank)

    ranked_df = df_to_rank.copy()

    # Skills cells may be Python lists or NumPy arrays (the latter after a
    # parquet round-trip); arrays cannot be truth-tested, so normalise first.
    skills_per_job = [_as_skill_list(cell) for cell in ranked_df['Skills']]
    all_job_skills = sorted({skill for skills in skills_per_job for skill in skills})
    if not all_job_skills:
        ranked_df['Skill Match Score'] = 0.0
        ranked_df['Final Score'] = ranked_df['Similarity Score']
        return ranked_df

    user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
    job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
    similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)

    # Best user-skill similarity for each distinct job skill, computed once
    # instead of once per (row, skill) pair.
    best_similarity = similarity_matrix.max(dim=0).values.cpu().tolist()
    best_by_skill = dict(zip(all_job_skills, best_similarity))

    def confidence_adjusted_score(job_skills: list) -> float:
        if not job_skills:
            return 0.0
        avg_score = sum(best_by_skill[skill] for skill in job_skills) / len(job_skills)
        # Jobs listing fewer than five skills give weaker evidence; damp them.
        skill_count_factor = min(1.0, len(job_skills) / 5.0)
        return avg_score * skill_count_factor

    ranked_df['Skill Match Score'] = [confidence_adjusted_score(skills) for skills in skills_per_job]
    ranked_df['Final Score'] = (0.8 * ranked_df['Similarity Score']) + (0.2 * ranked_df['Skill Match Score'])
    ranked_df = ranked_df.sort_values(by='Final Score', ascending=False).reset_index(drop=True)
    return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
def initialize_data_and_model():
    """Load job data, extract skills when needed, and build every search artefact.

    Populates the module globals ``original_df``, ``combined_df``, ``model``,
    ``combined_job_embeddings`` and ``original_job_title_embeddings``, then
    builds the spelling vocabulary.

    If ``processed_jobs_with_skills.parquet`` exists it is loaded as is.
    Otherwise the raw dataset is downloaded, a ``Skills`` list is extracted for
    every job (LLM + NLTK noun phrases + direct whitelist scan, topped up by
    the LLM when fewer than six skills are found) and the result is cached to
    that parquet file.

    Returns a fixed completion message.
    """
    global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
    PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
    DATASET_ID = "its-zion-18/Jobs-tabular-dataset"

    def as_list(value) -> list:
        # Parquet round-trips list cells as NumPy arrays, which cannot be
        # truth-tested; downstream code expects plain lists.
        if value is None:
            return []
        try:
            return list(value)
        except TypeError:
            return []

    print("--- Initializing LLM Client ---")
    if not initialize_llm_client():
        print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")

    ds = None  # the raw dataset is loaded lazily and at most once
    if os.path.exists(PROCESSED_DATA_PATH):
        print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
        original_df = pd.read_parquet(PROCESSED_DATA_PATH)
    else:
        print("--- No pre-processed data found. Starting one-time processing... ---")
        ds = datasets.load_dataset(DATASET_ID)
        original_df = ds["original"].to_pandas()

        # Compile one pattern per whitelist entry up front. Lookarounds are
        # used instead of \b because \b can never match next to the symbols
        # in entries such as "c++", "c#" or ".net".
        whitelist_patterns = [
            (skill, re.compile(r'(?<!\w)' + re.escape(skill) + r'(?!\w)', re.IGNORECASE))
            for skill in SKILL_WHITELIST
        ]

        def extract_skills_llm(text: str) -> list[str]:
            """Ask the LLM for a comma-separated skill list (few-shot prompt)."""
            if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE:
                return []
            prompt = (
                '\n'
                'Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.\n'
                '[Example 1]\n'
                'Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."\n'
                'Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code\n'
                '[Example 2]\n'
                'Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."\n'
                'Extracted Skills: project management, leadership, stakeholder communication, agile, scrum\n'
                '[Actual Task]\n'
                f'Text: "{text}"\n'
                'Extracted Skills:\n'
            )
            try:
                response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
                generated_text = response[0]['generated_text']
                # The prompt is echoed back; the answer follows the last marker.
                skills_part = generated_text.split("Extracted Skills:")[-1].strip()
                skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
                return list(dict.fromkeys(s.lower() for s in skills))
            except Exception:
                return []

        def extract_skills_nltk(text: str) -> list[str]:
            """Return whitelisted noun phrases found by an NLTK chunker."""
            if not isinstance(text, str):
                return []
            chunk_parser = nltk.RegexpParser("NP: {<JJ.*>*<NN.*>+}")
            tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text.lower()))
            potential_skills = set()
            for subtree in chunk_parser.parse(tagged_tokens).subtrees():
                if subtree.label() != 'NP':
                    continue
                phrase = _norm_skill_token(" ".join(word for word, tag in subtree.leaves()))
                if phrase in SKILL_WHITELIST:
                    potential_skills.add(phrase)
            return sorted(potential_skills)

        def extract_skills_direct_scan(text: str) -> list[str]:
            """Return every whitelist entry that occurs verbatim in *text*."""
            if not isinstance(text, str):
                return []
            return [skill for skill, pattern in whitelist_patterns if pattern.search(text)]

        def expand_skills_with_llm(job_title: str, existing_skills: list) -> list:
            """Ask the LLM for related skills to top a short list up towards six."""
            if not LLM_PIPELINE or not job_title:
                return []
            skills_to_add = 6 - len(existing_skills)
            skills_csv = ', '.join(existing_skills)
            prompt = (
                '\n'
                f'Instruct: A job has the title "{job_title}" and requires the skills: {skills_csv}.\n'
                f'Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?\n'
                'List only the new skills, separated by commas. Do not repeat skills from the original list.\n'
                'Additional Skills:\n'
            )
            try:
                response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
                generated_text = response[0]['generated_text']
                skills_part = generated_text.split("Additional Skills:")[-1].strip()
                return [skill.strip().lower() for skill in skills_part.split(',') if skill.strip()]
            except Exception:
                return []

        def extract_skills_hybrid(row) -> list[str]:
            """Union of the three extractors, expanded by the LLM when too short."""
            text = row['text_for_skills']
            job_title = row.get('Job title', '')  # original title gives the LLM context
            combined_skills = (
                set(extract_skills_llm(text))
                | set(extract_skills_nltk(text))
                | set(extract_skills_direct_scan(text))
            )
            if len(combined_skills) < 6:
                combined_skills.update(expand_skills_with_llm(job_title, list(combined_skills)))
            return sorted(combined_skills)

        def create_text_for_skills(row):
            parts = [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")]
            return " ".join(str(part) for part in parts if pd.notna(part))

        original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
        print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
        # Row-wise so the extractor can also see the job title.
        original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
        original_df = original_df.drop(columns=['text_for_skills'])
        print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
        original_df.to_parquet(PROCESSED_DATA_PATH)

    if 'Skills' in original_df.columns:
        original_df['Skills'] = original_df['Skills'].apply(as_list)

    # The row position doubles as the job id used throughout the app.
    original_df['job_id'] = original_df.index

    def create_full_text(row):
        parts = [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]
        return " ".join(str(part) for part in parts)

    original_df["full_text"] = original_df.apply(create_full_text, axis=1)

    if ds is None:
        ds = datasets.load_dataset(DATASET_ID)
    augmented_df = ds["augmented"].to_pandas()
    # NOTE(review): assumes the augmented split holds 20 consecutive rows per
    # original job, in the same order as the original split; confirm against
    # the dataset. The min() clamp guards against any overflow rows.
    max_id = len(original_df) - 1
    augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
    augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)

    combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
    # Renamed only after combined_df is built, so create_full_text above still
    # sees the raw column names on both frames.
    original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})

    print("--- Loading Fine-Tuned Sentence Transformer Model ---")
    model = SentenceTransformer(FINETUNED_MODEL_ID)
    print("--- Encoding Embeddings ---")
    combined_job_embeddings = model.encode(combined_df["full_text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
    original_job_title_embeddings = model.encode(original_df["job_title"].tolist(), convert_to_tensor=True, show_progress_bar=True)
    print("--- Building Vocabulary ---")
    build_known_vocabulary(combined_df)
    return "--- Initialization Complete ---"
| def _course_links_for(skill: str) -> str: | |
| q = _url.quote(skill) | |
| links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")] | |
| return " β’ ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links]) | |
def get_job_matches(dream_job: str, top_n: int, skills_text: str):
    """Run a full search and build every UI output for the results view.

    The query is expanded by the LLM, matched semantically (top 50), and, when
    the user supplied skills, re-ranked by skill coverage.

    Args:
        dream_job: free-text description of the desired job.
        top_n: number of matches to display (coerced to int; the slider may
            deliver a float).
        skills_text: comma-separated user skills, possibly empty.

    Returns:
        A 7-tuple: status text, the raw semantic matches (kept in state for
        later re-ranking), the display table, the job-selector dropdown, the
        details accordion, the skill-based recommendations table and its
        accordion.
    """
    def _with_percentages(frame, columns):
        formatted = frame.copy()
        for column in columns:
            formatted[column] = formatted[column].map('{:.2%}'.format)
        return formatted

    top_n = int(top_n)
    expanded_desc = llm_expand_query(dream_job)
    emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
    user_skills = [tok for tok in map(_norm_skill_token, (skills_text or "").split(',')) if tok]

    recommendations_table = pd.DataFrame()
    recommendations_visible = False

    if user_skills:
        scored_df = score_jobs_by_skills(user_skills, emb_matches)
        # Recommendations: the five jobs with the best pure skill match. The
        # column is absent when the scorer had no skills data to work with.
        if 'Skill Match Score' in scored_df.columns:
            skill_sorted_df = scored_df.sort_values(by='Skill Match Score', ascending=False).head(5)
            if not skill_sorted_df.empty:
                recs = skill_sorted_df[['job_title', 'company', 'Skill Match Score', 'Final Score']]
                recs = recs.rename(columns={'Final Score': 'Overall Score'})
                recommendations_table = _with_percentages(recs, ['Skill Match Score', 'Overall Score'])
                recommendations_visible = True
        display_df = scored_df.head(top_n)
        status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
    else:
        display_df = emb_matches.head(top_n)
        status = f"Found {len(display_df)} top matches using semantic search."

    if 'Final Score' in display_df.columns:
        table = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
        table = table.rename(columns={'Final Score': 'Overall Score'})
        table_to_show = _with_percentages(table, ['Skill Match Score', 'Overall Score'])
    else:
        table = display_df[['job_title', 'company', 'Similarity Score']]
        table = table.rename(columns={'Similarity Score': 'Overall Score'})
        table_to_show = _with_percentages(table, ['Overall Score'])

    # The frame is indexed by job id, so the label number must come from the
    # row's position, not from the index value.
    dropdown_options = [
        (f"{rank}. {row['job_title']} - {row['company']}", job_id)
        for rank, (job_id, row) in enumerate(display_df.iterrows(), start=1)
    ]
    dropdown_value = dropdown_options[0][1] if dropdown_options else None

    return (
        status,
        emb_matches,
        table_to_show,
        gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True),
        gr.Accordion(visible=True),
        recommendations_table,
        gr.Accordion(visible=recommendations_visible),
    )
def rerank_current_results(initial_matches_df, skills_text, top_n):
    """Re-rank the matches held in state using the current skills text.

    Args:
        initial_matches_df: semantic matches stored by the last search, or
            None/empty when no search has been run.
        skills_text: comma-separated user skills; empty restores the original
            semantic ordering.
        top_n: number of rows to display (coerced to int).

    Returns:
        A 5-tuple: status text, display table, job-selector dropdown,
        skill-based recommendations table and its accordion.
    """
    def _with_percentages(frame, columns):
        formatted = frame.copy()
        for column in columns:
            formatted[column] = formatted[column].map('{:.2%}'.format)
        return formatted

    if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
        return (
            "Please find matches first before re-ranking.",
            pd.DataFrame(),
            gr.Dropdown(visible=False),
            pd.DataFrame(),
            gr.Accordion(visible=False),
        )

    initial_matches_df = pd.DataFrame(initial_matches_df)
    top_n = int(top_n)
    user_skills = [tok for tok in map(_norm_skill_token, (skills_text or "").split(',')) if tok]

    recommendations_table = pd.DataFrame()
    recommendations_visible = False

    if not user_skills:
        status = "Skills cleared. Showing original semantic search results."
        display_df = initial_matches_df.head(top_n)
    else:
        ranked_df = score_jobs_by_skills(user_skills, initial_matches_df)
        status = f"Results **re-ranked** based on your {len(user_skills)} skills."
        display_df = ranked_df.head(top_n)
        # Recommendations: the five jobs with the best pure skill match. The
        # column is absent when the scorer had no skills data to work with.
        if 'Skill Match Score' in ranked_df.columns:
            skill_sorted_df = ranked_df.sort_values(by='Skill Match Score', ascending=False).head(5)
            if not skill_sorted_df.empty:
                recs = skill_sorted_df[['job_title', 'company', 'Skill Match Score', 'Final Score']]
                recs = recs.rename(columns={'Final Score': 'Overall Score'})
                recommendations_table = _with_percentages(recs, ['Skill Match Score', 'Overall Score'])
                recommendations_visible = True

    if 'Final Score' in display_df.columns:
        table = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
        table = table.rename(columns={'Final Score': 'Overall Score'})
        table_to_show = _with_percentages(table, ['Skill Match Score', 'Overall Score'])
    else:
        table = display_df[['job_title', 'company', 'Similarity Score']]
        table = table.rename(columns={'Similarity Score': 'Overall Score'})
        table_to_show = _with_percentages(table, ['Overall Score'])

    # The frame is indexed by job id, so number the labels by position.
    dropdown_options = [
        (f"{rank}. {row['job_title']} - {row['company']}", job_id)
        for rank, (job_id, row) in enumerate(display_df.iterrows(), start=1)
    ]
    dropdown_value = dropdown_options[0][1] if dropdown_options else None

    return (
        status,
        table_to_show,
        gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True),
        recommendations_table,
        gr.Accordion(visible=recommendations_visible),
    )
def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
    """Search handler that validates the query before running the search.

    An empty or whitespace-only query is rejected. If the spelling check
    reports unrecognised words, the search is held back and the spelling alert
    row is shown so the user can confirm or retype. Otherwise the search runs.

    Returns:
        A 9-tuple matching the search button's outputs: status, matches state,
        results table, job dropdown, details accordion, spelling alert,
        spelling button row, recommendations table, recommendations accordion.
    """
    if not dream_job or not dream_job.strip():
        return (
            "Please describe your dream job first.",
            None,
            pd.DataFrame(),
            gr.Dropdown(visible=False),
            gr.Accordion(visible=False),
            gr.Markdown(""),
            gr.Row(visible=False),
            pd.DataFrame(),
            gr.Accordion(visible=False),
        )

    unrecognized_words = check_spelling_in_query(dream_job)
    if unrecognized_words:
        word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
        alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
        return (
            "Status: Awaiting confirmation.",
            None,
            pd.DataFrame(),
            gr.Dropdown(visible=False),
            gr.Accordion(visible=False),
            gr.Markdown(alert_message, visible=True),
            gr.Row(visible=True),
            pd.DataFrame(),
            gr.Accordion(visible=False),
        )

    (status, emb_matches, table_to_show, dropdown, details_accordion,
     recommendations_table, recommendations_accordion) = get_job_matches(dream_job, top_n, skills_text)
    return (
        status,
        emb_matches,
        table_to_show,
        dropdown,
        details_accordion,
        gr.Markdown(visible=False),
        gr.Row(visible=False),
        recommendations_table,
        recommendations_accordion,
    )
def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
    """Run the search without the spelling check ("Search Anyway" button).

    Returns the seven search outputs with a hidden spelling alert and a hidden
    spelling button row spliced in after the details accordion, giving the
    same 9-value layout as the checked search handler.
    """
    results = get_job_matches(dream_job, top_n, skills_text)
    (status, emb_matches, table_to_show, dropdown,
     details_accordion, recommendations_table, recommendations_accordion) = results
    hidden_alert = gr.Markdown(visible=False)
    hidden_row = gr.Row(visible=False)
    return (
        status,
        emb_matches,
        table_to_show,
        dropdown,
        details_accordion,
        hidden_alert,
        hidden_row,
        recommendations_table,
        recommendations_accordion,
    )
def on_select_job(job_id, skills_text):
    """Build the details panel and learning plan for the selected job.

    Args:
        job_id: index label of the job in ``original_df``, or None when the
            selection was cleared.
        skills_text: comma-separated user skills, possibly empty.

    Returns:
        A 9-tuple: job heading markdown, duties, qualifications, description,
        learning-plan HTML, details accordion, the full skill list kept in
        state for "load more", the number of skills already shown, and the
        "load more" button. Paging state is only populated when the user has
        entered no skills; otherwise at most five missing skills are listed.
    """
    import html  # function-scope import: only needed to escape skill names

    def _skill_items(skills):
        # Skill names originate from job text / LLM output, so escape them
        # before embedding in HTML.
        return "".join(
            f"<li><b>{html.escape(str(ms))}</b><br>• Learn: {_course_links_for(ms)}</li>"
            for ms in skills
        )

    if job_id is None:
        return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)

    row = original_df.loc[job_id]
    title, company = str(row.get("job_title", "")), str(row.get("company", ""))
    job_details_markdown = f"### {title} — {company}"
    duties = str(row.get('Duties', ''))
    qualifications = str(row.get('qualifications', ''))
    description = str(row.get('Description', ''))
    user_skills = [tok for tok in map(_norm_skill_token, (skills_text or "").split(',')) if tok]

    # Skills may be a list or, after a parquet round-trip, a NumPy array that
    # cannot be truth-tested; normalise to a plain list first.
    raw_skills = row.get("Skills", [])
    try:
        job_skills = [] if raw_skills is None else list(raw_skills)
    except TypeError:
        job_skills = []

    if not job_skills:
        learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    if not user_skills:
        # No user skills: list the job's skills alphabetically, five at a
        # time, and hand the full list to state for "load more".
        headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
        full_skill_list_for_state = sorted(job_skills)
        skills_to_display = full_skill_list_for_state[:5]
        learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{_skill_items(skills_to_display)}</ul>"
        new_offset = len(skills_to_display)
        should_button_be_visible = len(full_skill_list_for_state) > 5
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)

    # Same scoring rule as the re-ranker: mean best-match similarity per job
    # skill, damped for jobs listing fewer than five skills.
    user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
    job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
    similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
    sum_of_max_similarities = torch.sum(torch.max(similarity_matrix, dim=0).values)
    avg_score = (sum_of_max_similarities / len(job_skills)).item()
    skill_count_factor = min(1.0, len(job_skills) / 5.0)
    score_val = avg_score * skill_count_factor

    # A job skill counts as covered when any user skill is similar enough.
    matched_job_skills_mask = torch.any(similarity_matrix > 0.58, dim=0)
    all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]

    job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"

    if score_val >= 0.98:
        learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
        return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)

    if score_val >= 0.8:
        headline = "<b>Great fit!</b>"
    elif score_val >= 0.5:
        headline = "<b>Good progress!</b>"
    else:
        headline = "<b>Solid starting point.</b>"
    learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
    skills_to_display = sorted(all_missing_skills)[:5]
    learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{_skill_items(skills_to_display)}</ul>"
    return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
def load_more_skills(full_skills_list, current_offset):
    """Reveal the next five skills in the learning plan.

    Args:
        full_skills_list: every skill for the selected job, in display order.
        current_offset: how many skills are currently shown.

    Returns:
        A 3-tuple: the re-rendered learning-plan HTML showing the first
        ``current_offset + 5`` skills, the new offset, and the "load more"
        button, visible only while skills remain hidden.
    """
    import html  # function-scope import: only needed to escape skill names

    SKILLS_INCREMENT = 5
    new_offset = current_offset + SKILLS_INCREMENT
    skills_to_display = full_skills_list[:new_offset]
    # Skill names originate from job text / LLM output, so escape them before
    # embedding in HTML.
    items_html = [
        f"<li><b>{html.escape(str(ms))}</b><br>• Learn: {_course_links_for(ms)}</li>"
        for ms in skills_to_display
    ]
    learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
    should_button_be_visible = new_offset < len(full_skills_list)
    return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)
def on_reset():
    """Return the default value for every component wired to "Reset All".

    The 19 values are positional and follow the reset button's output list.
    """
    inputs_defaults = ("", 3, "")  # dream job text, match count, skills text
    results_defaults = (
        pd.DataFrame(),                # results table
        None,                          # stored semantic matches
        gr.Dropdown(visible=False),    # job selector
        gr.Accordion(visible=False),   # details accordion
        "Status: Ready.",
    )
    details_defaults = ("", "", "", "")  # heading, duties, qualifications, description
    spelling_defaults = (gr.Markdown(visible=False), gr.Row(visible=False))
    paging_defaults = ([], 0, gr.Button(visible=False))  # skills state, offset, load-more
    recommendation_defaults = (pd.DataFrame(), gr.Accordion(visible=False))
    return (
        inputs_defaults
        + results_defaults
        + details_defaults
        + spelling_defaults
        + paging_defaults
        + recommendation_defaults
    )
# --- Application start-up ---
# Data, models and embeddings are loaded eagerly so the UI is only served once
# search is actually possible.
print("Starting application initialization...")
initialization_status = initialize_data_and_model()
print(initialization_status)

# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft()) as ui:
    gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
    gr.Markdown(
        "\n"
        "Welcome to your personal career co-pilot! This tool uses AI to match your dream job description\n"
        "with real-world roles. Add your skills to see a detailed analysis of where you stand and what\n"
        "you need to learn to land the job.\n"
    )

    # Per-session state: the raw semantic matches (re-used by "Re-rank") and
    # the paging state of the learning plan.
    initial_matches_state = gr.State()
    missing_skills_state = gr.State([])
    skills_offset_state = gr.State(0)

    with gr.Row():
        with gr.Column(scale=3):
            dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
            with gr.Accordion("Optional: Add Your Skills to Re-rank Results", open=False):
                with gr.Row():
                    skills_text = gr.Textbox(label='Your Skills (comma-separated)', placeholder="e.g., Python, data analysis", scale=3)
                    rerank_btn = gr.Button("Re-rank", variant="secondary", scale=1)
        with gr.Column(scale=1):
            topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
            search_btn = gr.Button("Find Matches", variant="primary")
            reset_btn = gr.Button("Reset All")

    status_text = gr.Markdown("Status: Ready.")

    # Spelling confirmation: hidden until the checked search flags a word.
    spelling_alert = gr.Markdown(visible=False)
    with gr.Row(visible=False) as spelling_row:
        search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
        retype_btn = gr.Button("Let Me Fix It", variant="stop")

    df_output = gr.DataFrame(label="Job Matches (Sorted by Overall Relevance)", interactive=False)

    # Skill-based recommendations, shown only when the user supplied skills.
    with gr.Accordion("✨ Based on your current skills and career interest consider these jobs...", open=True, visible=False) as recommendations_accordion:
        recommendations_df_output = gr.DataFrame(label="Top Skill Matches", interactive=False)

    job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
    with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
        job_details_markdown = gr.Markdown()
        with gr.Tabs():
            with gr.TabItem("Duties"):
                duties_markdown = gr.Markdown()
            with gr.TabItem("Qualifications"):
                qualifications_markdown = gr.Markdown()
            with gr.TabItem("Full Description"):
                description_markdown = gr.Markdown()
        learning_plan_output = gr.HTML(label="Learning Plan")
        load_more_btn = gr.Button("Load More Skills", visible=False)

    # The three search-related buttons update the same nine components, in
    # the order the handlers return them.
    search_outputs = [
        status_text,
        initial_matches_state,
        df_output,
        job_selector,
        details_accordion,
        spelling_alert,
        spelling_row,
        recommendations_df_output,
        recommendations_accordion,
    ]

    search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=search_outputs)
    search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=search_outputs)
    retype_btn.click(
        lambda: (
            "Status: Ready for you to retype.",
            None,
            pd.DataFrame(),
            gr.Dropdown(visible=False),
            gr.Accordion(visible=False),
            gr.Markdown(visible=False),
            gr.Row(visible=False),
            pd.DataFrame(),
            gr.Accordion(visible=False),
        ),
        outputs=search_outputs,
    )
    reset_btn.click(
        fn=on_reset,
        outputs=[
            dream_text,
            topk_slider,
            skills_text,
            df_output,
            initial_matches_state,
            job_selector,
            details_accordion,
            status_text,
            job_details_markdown,
            duties_markdown,
            qualifications_markdown,
            description_markdown,
            spelling_alert,
            spelling_row,
            missing_skills_state,
            skills_offset_state,
            load_more_btn,
            recommendations_df_output,
            recommendations_accordion,
        ],
        queue=False,
    )
    rerank_btn.click(
        fn=rerank_current_results,
        inputs=[initial_matches_state, skills_text, topk_slider],
        outputs=[status_text, df_output, job_selector, recommendations_df_output, recommendations_accordion],
    )
    job_selector.change(
        fn=on_select_job,
        inputs=[job_selector, skills_text],
        outputs=[
            job_details_markdown,
            duties_markdown,
            qualifications_markdown,
            description_markdown,
            learning_plan_output,
            details_accordion,
            missing_skills_state,
            skills_offset_state,
            load_more_btn,
        ],
    )
    load_more_btn.click(
        fn=load_more_skills,
        inputs=[missing_skills_state, skills_offset_state],
        outputs=[learning_plan_output, skills_offset_state, load_more_btn],
    )

ui.launch()