Update app.py
app.py CHANGED
@@ -6,24 +6,49 @@ import re
 import nltk
 from nltk.corpus import words, stopwords
 import urllib.parse as _url
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from nltk.stem import PorterStemmer
 import gradio as gr
+import os
+from tqdm import tqdm
 
-
-
+tqdm.pandas()
+
+# --- NLTK Data Download ---
 for package in ['words', 'stopwords', 'averaged_perceptron_tagger', 'punkt']:
     try:
         nltk.data.find(f'corpora/{package}' if package in ['words', 'stopwords'] else f'taggers/{package}' if package == 'averaged_perceptron_tagger' else f'tokenizers/{package}')
     except LookupError:
         nltk.download(package)
-# ------------------------------------------------
 
 STOPWORDS = set(stopwords.words('english'))
 stemmer = PorterStemmer()
 
+# --- Expanded Skill Whitelist ---
+SKILL_WHITELIST = {
+    # Technical & Data
+    'python', 'java', 'c++', 'javascript', 'typescript', 'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue',
+    'nodejs', 'django', 'flask', 'fastapi', 'spring boot', 'ruby on rails', 'php', 'swift', 'kotlin', 'dart', 'flutter',
+    'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras', 'scikit-learn', 'pandas', 'numpy', 'matplotlib',
+    'natural language processing', 'nlp', 'computer vision', 'data analysis', 'data science', 'data engineering',
+    'big data', 'spark', 'hadoop', 'kafka', 'data visualization', 'tableau', 'power bi', 'd3.js', 'statistics', 'analytics',
+    'aws', 'azure', 'google cloud', 'gcp', 'docker', 'kubernetes', 'terraform', 'ansible', 'ci/cd', 'jenkins',
+    'git', 'github', 'devops', 'linux', 'unix', 'shell scripting', 'powershell', 'cybersecurity', 'penetration testing',
+    'network security', 'cryptography', 'blockchain', 'c#', '.net', 'sql server', 'mysql', 'postgresql', 'mongodb', 'redis',
+    'elasticsearch', 'api design', 'rest apis', 'graphql', 'microservices', 'serverless', 'system design', 'saas',
+    # Business & Consulting
+    'agile', 'scrum', 'project management', 'product management', 'consulting', 'client management', 'business development',
+    'strategy', 'stakeholder management', 'risk management', 'compliance', 'aml', 'kyc', 'reinsurance', 'finance',
+    'financial modeling', 'financial analysis', 'due diligence', 'sourcing', 'procurement', 'negotiation', 'supply chain',
+    'business analysis', 'business intelligence', 'presentations', 'public speaking', 'time management', 'critical thinking',
+    'design thinking', 'innovation', 'adaptability', 'supervisory', 'pmp', 'cpsm', 'cips', 'microsoft office', 'communication',
+    'organizational skills',
+    # Soft & Other
+    'leadership', 'stakeholder communication', 'client communication', 'teamwork', 'collaboration', 'problem solving',
+    'ui/ux design', 'figma', 'sketch', 'adobe xd', 'graphic design', 'autocad', 'solidworks', 'sales', 'marketing',
+    'seo', 'sem', 'content writing', 'customer support', 'technical writing', 'sap', 'oracle', 'budgeting', 'mentoring', 'supervising'
+}
+
 # --- GLOBAL STATE & DATA ---
 original_df = None
 combined_df = None
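Reviewer note: the `nltk.data.find(...)` argument above packs the package-to-path mapping into nested conditional expressions. A behaviour-equivalent sketch with the mapping written out as a dict (the dict name is ours, purely illustrative, not part of the commit):

import nltk

# Hypothetical readability refactor; mirrors the one-liner in the diff.
NLTK_RESOURCE_PATHS = {
    'words': 'corpora/words',
    'stopwords': 'corpora/stopwords',
    'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
    'punkt': 'tokenizers/punkt',
}

for package, path in NLTK_RESOURCE_PATHS.items():
    try:
        nltk.data.find(path)        # raises LookupError when the resource is missing
    except LookupError:
        nltk.download(package)      # one-time download; find() succeeds on later runs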
@@ -43,22 +68,6 @@ def _norm_skill_token(s: str) -> str:
     s = re.sub(r'\s+', ' ', s)
     return s
 
-def _skill_match(token1: str, token2: str, threshold: float = 0.9) -> bool:
-    t1 = _norm_skill_token(token1)
-    t2 = _norm_skill_token(token2)
-    if t1 == t2 or t1 in t2 or t2 in t1:
-        return True
-    try:
-        if len(t1) > 2 and len(t2) > 2:
-            vectorizer = TfidfVectorizer().fit([t1, t2])
-            vectors = vectorizer.transform([t1, t2])
-            similarity = cosine_similarity(vectors)[0, 1]
-            if similarity >= threshold:
-                return True
-    except:
-        pass
-    return False
-
 def build_known_vocabulary(df: pd.DataFrame):
     global KNOWN_WORDS
     english_words = set(w.lower() for w in words.words())
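Reviewer note: the deleted `_skill_match` fitted a `TfidfVectorizer` on only the two tokens being compared, so cosine similarity is non-zero only when the two phrases already share a whole word, and with the 0.9 threshold the fuzzy branch rarely fired beyond the substring checks above it. A small sketch of that behaviour (assumes scikit-learn is installed; the inputs are made up):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_pair_similarity(t1: str, t2: str) -> float:
    # Fit on just the two tokens, exactly as the removed helper did.
    vectors = TfidfVectorizer().fit([t1, t2]).transform([t1, t2])
    return cosine_similarity(vectors)[0, 1]

print(tfidf_pair_similarity('data analysis', 'data analytics'))  # ~0.34: shared word, still far below 0.9
print(tfidf_pair_similarity('python', 'pythons'))                # 0.0: no shared token at all

The commit replaces this with embedding-based matching (`util.cos_sim` in the new `score_jobs_by_skills`), which can score such near-synonyms as similar.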
@@ -83,9 +92,7 @@ def initialize_llm_client():
         model_llm = AutoModelForCausalLM.from_pretrained(
             LLM_MODEL_NAME, torch_dtype="auto", device_map="auto", trust_remote_code=True
         )
-        LLM_PIPELINE = pipeline(
-            "text-generation", model=model_llm, tokenizer=tokenizer, max_new_tokens=100, do_sample=True, temperature=0.7
-        )
+        LLM_PIPELINE = pipeline("text-generation", model=model_llm, tokenizer=tokenizer)
         return True
     except Exception as e:
         print(f"🚨 ERROR initializing local LLM: {e}")
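Reviewer note: the generation arguments (`max_new_tokens`, `do_sample`, `temperature`) move from pipeline construction to the call sites, so one pipeline can serve both deterministic extraction and sampled expansion. A minimal sketch of that per-call pattern (uses `gpt2` as a stand-in model, not the app's `LLM_MODEL_NAME`):

from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")  # small stand-in model for illustration

# Deterministic call, as the new extract_skills_llm does:
out_a = generator("Extracted Skills:", max_new_tokens=20, do_sample=False)

# Sampled call, as the new expand_skills_with_llm does:
out_b = generator("Additional Skills:", max_new_tokens=20, do_sample=True, temperature=0.5)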
@@ -129,64 +136,173 @@ def find_job_matches(original_user_query: str, expanded_user_query: str, top_k:
     final_results_df = final_results_df.set_index('job_id', drop=False).rename(columns={'job_id': 'Job ID'})
     return final_results_df
 
-def score_jobs_by_skills(
-if df_to_rank is None or df_to_rank.empty
+def score_jobs_by_skills(user_skills: list[str], df_to_rank: pd.DataFrame) -> pd.DataFrame:
+    if df_to_rank is None or df_to_rank.empty or not user_skills:
+        return df_to_rank.sort_values(by='Similarity Score', ascending=False) if df_to_rank is not None else pd.DataFrame()
+
     ranked_df = df_to_rank.copy()
-if 'Skills' not in ranked_df.columns:
-
-
-
-
-
-
-
-
-
-
+    if 'Skills' not in ranked_df.columns:
+        return ranked_df.sort_values(by='Similarity Score', ascending=False)
+
+    user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
+    all_job_skills = sorted(list(set(skill for skills_list in ranked_df['Skills'] if skills_list for skill in skills_list)))
+
+    if not all_job_skills:
+        ranked_df['Skill Match Score'] = 0.0
+        ranked_df['Final Score'] = ranked_df['Similarity Score']
+        return ranked_df
+
+    job_skill_embeddings = model.encode(all_job_skills, convert_to_tensor=True)
+    similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
+
+    def calculate_confidence_adjusted_score(row):
+        job_skills_list = row.get('Skills', [])
+        if not job_skills_list:
+            return 0.0
+
+        total_required = len(job_skills_list)
+        sum_of_max_similarities = 0.0
+        for job_skill in job_skills_list:
+            try:
+                job_skill_idx = all_job_skills.index(job_skill)
+                max_sim = torch.max(similarity_matrix[:, job_skill_idx])
+                sum_of_max_similarities += max_sim.item()
+            except (ValueError, IndexError):
+                continue
+
+        avg_score = sum_of_max_similarities / total_required if total_required > 0 else 0.0
+        skill_count_factor = min(1.0, total_required / 5.0)
+        return avg_score * skill_count_factor
+
+    ranked_df['Skill Match Score'] = ranked_df.apply(calculate_confidence_adjusted_score, axis=1)
+
+    ranked_df['Final Score'] = (0.8 * ranked_df['Similarity Score']) + (0.2 * ranked_df['Skill Match Score'])
+
+    ranked_df = ranked_df.sort_values(by='Final Score', ascending=False).reset_index(drop=True)
     return ranked_df.set_index('Job ID', drop=False).rename_axis(None)
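Reviewer note: a worked toy example of the confidence-adjusted score above, with a stubbed similarity matrix standing in for `model.encode` + `util.cos_sim` (all numbers invented):

import torch

# Rows = user skills, columns = the job's extracted skills.
similarity_matrix = torch.tensor([
    [0.90, 0.20, 0.10],   # user skill 1 vs 3 job skills
    [0.15, 0.70, 0.05],   # user skill 2
])

total_required = similarity_matrix.shape[1]                       # 3
best_per_job_skill = torch.max(similarity_matrix, dim=0).values   # tensor([0.90, 0.70, 0.10])
avg_score = best_per_job_skill.sum().item() / total_required      # about 0.567

skill_count_factor = min(1.0, total_required / 5.0)               # 0.6: short skill lists are down-weighted
skill_match_score = avg_score * skill_count_factor                # about 0.34

# The final ranking then blends this with semantic similarity:
# Final Score = 0.8 * Similarity Score + 0.2 * Skill Match Score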
 
 def initialize_data_and_model():
     global original_df, combined_df, model, combined_job_embeddings, original_job_title_embeddings
+    PROCESSED_DATA_PATH = "processed_jobs_with_skills.parquet"
+
     print("--- Initializing LLM Client ---")
-    if not initialize_llm_client(): print("Warning: LLM Client failed to initialize.")
-
+    if not initialize_llm_client(): print("Warning: LLM Client failed to initialize. Will use NLTK only for skills.")
+
+    if os.path.exists(PROCESSED_DATA_PATH):
+        print(f"--- Loading pre-processed data from {PROCESSED_DATA_PATH} ---")
+        original_df = pd.read_parquet(PROCESSED_DATA_PATH)
+    else:
+        print("--- No pre-processed data found. Starting one-time processing... ---")
+        ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
+        original_df = ds["original"].to_pandas()
+
+        def extract_skills_llm(text: str) -> list[str]:
+            if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
+            prompt = f"""
+Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
+[Example 1]
+Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
+Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
+[Example 2]
+Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
+Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
+[Actual Task]
+Text: "{text}"
+Extracted Skills:
+"""
+            try:
+                response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
+                generated_text = response[0]['generated_text']
+                skills_part = generated_text.split("Extracted Skills:")[-1].strip()
+                skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
+                return list(dict.fromkeys(s.lower() for s in skills))
+            except Exception: return []
+
+        def extract_skills_nltk(text: str) -> list[str]:
+            if not isinstance(text, str): return []
+            text_lower = text.lower()
+            grammar = "NP: {<JJ.*>*<NN.*>+}"
+            chunk_parser = nltk.RegexpParser(grammar)
+            tokens = nltk.word_tokenize(text_lower)
+            tagged_tokens = nltk.pos_tag(tokens)
+            chunked_text = chunk_parser.parse(tagged_tokens)
+            potential_skills = set()
+            for subtree in chunked_text.subtrees():
+                if subtree.label() == 'NP':
+                    phrase = " ".join(word for word, tag in subtree.leaves())
+                    if _norm_skill_token(phrase) in SKILL_WHITELIST:
+                        potential_skills.add(_norm_skill_token(phrase))
+            return sorted(list(potential_skills))
+
+        def extract_skills_direct_scan(text: str) -> list[str]:
+            if not isinstance(text, str): return []
+            found_skills = set()
+            for skill in SKILL_WHITELIST:
+                if re.search(r'\b' + re.escape(skill) + r'\b', text, re.IGNORECASE):
+                    found_skills.add(skill)
+            return list(found_skills)
+
+        def expand_skills_with_llm(job_title: str, existing_skills: list) -> list:
+            if not LLM_PIPELINE or not job_title: return []
+
+            skills_to_add = 6 - len(existing_skills)
+            prompt = f"""
+Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
+Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
+List only the new skills, separated by commas. Do not repeat skills from the original list.
+
+Additional Skills:
+"""
+            try:
+                response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
+                generated_text = response[0]['generated_text']
+                skills_part = generated_text.split("Additional Skills:")[-1].strip()
+                new_skills = [skill.strip().lower() for skill in skills_part.split(',') if skill.strip()]
+                return new_skills
+            except Exception:
+                return []
+
+        def extract_skills_hybrid(row) -> list[str]:
+            text = row['text_for_skills']
+            job_title = row.get('Job title', '')  # Use original Job title for context
+
+            llm_skills = extract_skills_llm(text)
+            nltk_skills = extract_skills_nltk(text)
+            direct_skills = extract_skills_direct_scan(text)
+            combined_skills = set(llm_skills) | set(nltk_skills) | set(direct_skills)
+
+            # If the combined list is still too short, expand it
+            if len(combined_skills) < 6:
+                expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
+                combined_skills.update(expanded_skills)
+
+            return sorted(list(combined_skills))
+
+        def create_text_for_skills(row):
+            return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
+
+        original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
+        print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
+        # Apply the hybrid function row-wise to include job title context
+        original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
+        original_df = original_df.drop(columns=['text_for_skills'])
+
+        print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
+        original_df.to_parquet(PROCESSED_DATA_PATH)
+
+    original_df['job_id'] = original_df.index
+    def create_full_text(row): return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
+    original_df["full_text"] = original_df.apply(create_full_text, axis=1)
+
     ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
-    original_df = ds["original"].to_pandas()
     augmented_df = ds["augmented"].to_pandas()
-    original_df['job_id'] = original_df.index
     max_id = len(original_df) - 1
     augmented_df['job_id'] = augmented_df.index.map(lambda i: min(i // 20, max_id))
-    def create_full_text(row):
-        return " ".join([str(s) for s in [row.get("Job title"), row.get("Company"), row.get("Duties"), row.get("qualifications"), row.get("Description")]])
-    original_df["full_text"] = original_df.apply(create_full_text, axis=1)
     augmented_df["full_text"] = augmented_df.apply(create_full_text, axis=1)
+
     combined_df = pd.concat([original_df.copy(), augmented_df.copy()], ignore_index=True)
     original_df = original_df.rename(columns={'Job title': 'job_title', 'Company': 'company'})
-
-    def extract_skills_from_text(text):
-        if not isinstance(text, str): return []
-        grammar = "NP: {<JJ.?>*<NN.?>+}"
-        chunk_parser = nltk.RegexpParser(grammar)
-        tokens = nltk.word_tokenize(text.lower())
-        tagged_tokens = nltk.pos_tag(tokens)
-        chunked_text = chunk_parser.parse(tagged_tokens)
-        skills = []
-        for subtree in chunked_text.subtrees():
-            if subtree.label() == 'NP':
-                phrase = " ".join(word for word, tag in subtree.leaves())
-                junk_phrases = {'demonstrated experience', 'experience', 'related field', 'college/university level', 'equivalent foreign degree', 'cacrep standards', 'students', 'learning experience', 'ability', 'process', 'accreditation', 'human development', 'social welfare', 'sociology', 'pre-service teachers', 'abilities', 'books', 'certifications', 'college', 'level', 'licenses', 'years', 'form', 'knowledge', 'skills'}
-                if phrase not in junk_phrases and _norm_skill_token(phrase) and phrase not in STOPWORDS:
-                    skills.append(_norm_skill_token(phrase))
-        keywords = {'teaching', 'training', 'leadership', 'management', 'data management', 'budget development', 'report'}
-        for keyword in keywords:
-            if re.search(r'\b' + re.escape(keyword) + r'\b', text.lower()) and _norm_skill_token(keyword) not in skills:
-                skills.append(_norm_skill_token(keyword))
-        stemmed_skills = {}
-        for skill in skills:
-            stemmed_phrase = ' '.join([stemmer.stem(word) for word in skill.split()])
-            if stemmed_phrase not in stemmed_skills:
-                stemmed_skills[stemmed_phrase] = skill
-        return list(stemmed_skills.values())
-    original_df['Skills'] = original_df['qualifications'].apply(extract_skills_from_text)
+
     print("--- Loading Fine-Tuned Sentence Transformer Model ---")
     model = SentenceTransformer(FINETUNED_MODEL_ID)
     print("--- Encoding Embeddings ---")
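Reviewer note: both new prompts are parsed by splitting the pipeline's `generated_text` on the prompt's final label, because text-generation pipelines return the prompt plus the completion by default. A standalone sketch of just that parsing step, including the `dict.fromkeys` de-duplication (the sample string stands in for real model output):

sample_output = '''[Actual Task]
Text: "..."
Extracted Skills: Python, SQL, python, communication'''

skills_part = sample_output.split("Extracted Skills:")[-1].strip()
skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
# dict.fromkeys de-duplicates while keeping first-seen order, unlike set().
print(list(dict.fromkeys(s.lower() for s in skills)))  # ['python', 'sql', 'communication']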
@@ -201,145 +317,186 @@ def _course_links_for(skill: str) -> str:
     links = [("Coursera", f"https://www.coursera.org/search?query={q}"), ("edX", f"https://www.edx.org/search?q={q}"), ("Udemy", f"https://www.udemy.com/courses/search/?q={q}"), ("YouTube", f"https://www.youtube.com/results?search_query={q}+tutorial")]
     return " • ".join([f'<a href="{u}" target="_blank" style="color: #007bff;">{name}</a>' for name, u in links])
 
-# --- GRADIO INTERFACE FUNCTIONS ---
-
 def get_job_matches(dream_job: str, top_n: int, skills_text: str):
     status = "Searching using hybrid model..."
     expanded_desc = llm_expand_query(dream_job)
     emb_matches = find_job_matches(dream_job, expanded_desc, top_k=50)
     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
-
-
-
-
-
-
-    display_df = display_df.head(top_n)
-
+
+    # --- NEW: Initialize variables for the recommendations section ---
+    recommendations_table = pd.DataFrame()
+    recommendations_visible = False
+
     if user_skills:
+        scored_df = score_jobs_by_skills(user_skills, emb_matches)
+
+        # --- NEW: Logic to get top 5 jobs based purely on skill match score ---
+        skill_sorted_df = scored_df.sort_values(by='Skill Match Score', ascending=False).head(5)
+        if not skill_sorted_df.empty:
+            recs = skill_sorted_df[['job_title', 'company', 'Skill Match Score', 'Final Score']].copy()
+            recs = recs.rename(columns={'Final Score': 'Overall Score'})
+            recs['Skill Match Score'] = recs['Skill Match Score'].map('{:.2%}'.format)
+            recs['Overall Score'] = recs['Overall Score'].map('{:.2%}'.format)
+            recommendations_table = recs
+            recommendations_visible = True
+        # --- END NEW ---
+
+        display_df = scored_df.head(top_n)
         status = f"Found and **re-ranked** results by your {len(user_skills)} skills. Displaying top {len(display_df)}."
     else:
+        display_df = emb_matches.head(top_n)
         status = f"Found {len(display_df)} top matches using semantic search."
-
-
-
-    table_to_show
-
+
+    if 'Final Score' in display_df.columns:
+        table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
+        table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
+        table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
+        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
+    else:
+        table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
+        table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
+        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
+
     dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
     dropdown_value = dropdown_options[0][1] if dropdown_options else None
-
-
+
+    # --- MODIFIED: Added new outputs for recommendations ---
+    return status, emb_matches, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), gr.Accordion(visible=True), recommendations_table, gr.Accordion(visible=recommendations_visible)
 
 def rerank_current_results(initial_matches_df, skills_text, top_n):
     if initial_matches_df is None or pd.DataFrame(initial_matches_df).empty:
-        return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False)
-
+        return "Please find matches first before re-ranking.", pd.DataFrame(), gr.Dropdown(visible=False), pd.DataFrame(), gr.Accordion(visible=False)
     initial_matches_df = pd.DataFrame(initial_matches_df)
-
     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
+
+    # --- NEW: Initialize variables for the recommendations section ---
+    recommendations_table = pd.DataFrame()
+    recommendations_visible = False
+
     if not user_skills:
         status = "Skills cleared. Showing original semantic search results."
         display_df = initial_matches_df.head(top_n)
         table_to_show = display_df[['job_title', 'company', 'Similarity Score']]
+        table_to_show = table_to_show.rename(columns={'Similarity Score': 'Overall Score'})
+        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
     else:
         ranked_df = score_jobs_by_skills(user_skills, initial_matches_df)
         status = f"Results **re-ranked** based on your {len(user_skills)} skills."
         display_df = ranked_df.head(top_n)
-
+
+        # --- NEW: Logic to get top 5 jobs based purely on skill match score ---
+        skill_sorted_df = ranked_df.sort_values(by='Skill Match Score', ascending=False).head(5)
+        if not skill_sorted_df.empty:
+            recs = skill_sorted_df[['job_title', 'company', 'Skill Match Score', 'Final Score']].copy()
+            recs = recs.rename(columns={'Final Score': 'Overall Score'})
+            recs['Skill Match Score'] = recs['Skill Match Score'].map('{:.2%}'.format)
+            recs['Overall Score'] = recs['Overall Score'].map('{:.2%}'.format)
+            recommendations_table = recs
+            recommendations_visible = True
+        # --- END NEW ---
+
+        table_to_show = display_df[['job_title', 'company', 'Final Score', 'Skill Match Score']]
+        table_to_show = table_to_show.rename(columns={'Final Score': 'Overall Score'})
+        table_to_show['Skill Match Score'] = table_to_show['Skill Match Score'].map('{:.2%}'.format)
+        table_to_show['Overall Score'] = table_to_show['Overall Score'].map('{:.2%}'.format)
 
     dropdown_options = [(f"{i+1}. {row['job_title']} - {row['company']}", row.name) for i, row in display_df.iterrows()]
     dropdown_value = dropdown_options[0][1] if dropdown_options else None
-
+
+    # --- MODIFIED: Added new outputs for recommendations ---
+    return status, table_to_show, gr.Dropdown(choices=dropdown_options, value=dropdown_value, visible=True), recommendations_table, gr.Accordion(visible=recommendations_visible)
 
 def find_matches_and_rank_with_check(dream_job: str, top_n: int, skills_text: str):
     if not dream_job:
-
+        # --- MODIFIED: Added new default outputs ---
+        return "Please describe your dream job first.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(""), gr.Row(visible=False), pd.DataFrame(), gr.Accordion(visible=False)
     unrecognized_words = check_spelling_in_query(dream_job)
     if unrecognized_words:
         word_list_html = ", ".join([f"<b><span style='color: #F87171;'>{w}</span></b>" for w in unrecognized_words])
         alert_message = f"<b><span style='color: #F87171;'>⚠️ Possible Spelling Error:</span></b> Unrecognized: {word_list_html}."
-
-
-
-
+        # --- MODIFIED: Added new default outputs ---
+        return "Status: Awaiting confirmation.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(alert_message, visible=True), gr.Row(visible=True), pd.DataFrame(), gr.Accordion(visible=False)
+
+    status, emb_matches, table_to_show, dropdown, details_accordion, recommendations_table, recommendations_accordion = get_job_matches(dream_job, top_n, skills_text)
+    return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False), recommendations_table, recommendations_accordion
 
 def find_matches_and_rank_anyway(dream_job: str, top_n: int, skills_text: str):
-    status, emb_matches, table_to_show, dropdown, details_accordion = get_job_matches(dream_job, top_n, skills_text)
-    return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False)
+    status, emb_matches, table_to_show, dropdown, details_accordion, recommendations_table, recommendations_accordion = get_job_matches(dream_job, top_n, skills_text)
+    return status, emb_matches, table_to_show, dropdown, details_accordion, gr.Markdown(visible=False), gr.Row(visible=False), recommendations_table, recommendations_accordion
 
 def on_select_job(job_id, skills_text):
-    if job_id is None:
-        return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
-
+    if job_id is None: return "", "", "", "", "", gr.Accordion(visible=False), [], 0, gr.Button(visible=False)
     row = original_df.loc[job_id]
     title, company = str(row.get("job_title", "")), str(row.get("company", ""))
     job_details_markdown = f"### {title} — {company}"
     duties, qualifications, description = str(row.get('Duties', '')), str(row.get('qualifications', '')), str(row.get('Description', ''))
-
     user_skills = [_norm_skill_token(s) for s in skills_text.split(',') if _norm_skill_token(s)]
     job_skills = row.get("Skills", [])
-
     if not job_skills:
-        learning_plan_html = "<p><i>No specific skills
+        learning_plan_html = "<p><i>No specific skills could be extracted for this job.</i></p>"
         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
 
-
-
-    if
+    score_val = 0
+    all_missing_skills = job_skills
+    if user_skills:
+        user_skill_embeddings = model.encode(user_skills, convert_to_tensor=True)
+        job_skill_embeddings = model.encode(job_skills, convert_to_tensor=True)
+        similarity_matrix = util.cos_sim(user_skill_embeddings, job_skill_embeddings)
+
+        sum_of_max_similarities = torch.sum(torch.max(similarity_matrix, dim=0).values)
+        avg_score = (sum_of_max_similarities / len(job_skills)).item() if len(job_skills) > 0 else 0
+
+        skill_count_factor = min(1.0, len(job_skills) / 5.0)
+        score_val = avg_score * skill_count_factor
+
+        matched_job_skills_mask = torch.any(similarity_matrix > 0.58, dim=0)
+        all_missing_skills = [skill for i, skill in enumerate(job_skills) if not matched_job_skills_mask[i]]
+
+    if user_skills and score_val >= 0.98:
         learning_plan_html = "<h4 style='color:green;'>🎉 You have all the required skills!</h4>"
+        job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
-
+
     if user_skills:
-        score_val = (len(job_skills) - len(all_missing_skills)) / len(job_skills)
         job_details_markdown += f"\n**Your skill match:** {score_val:.1%}"
         headline = "<b>Great fit!</b>" if score_val >= 0.8 else "<b>Good progress!</b>" if score_val >= 0.5 else "<b>Solid starting point.</b>"
         learning_plan_html = f"<h4>{headline} Focus on these skills to improve your match:</h4>"
-        skills_to_display = all_missing_skills[:5]
+        skills_to_display = sorted(all_missing_skills)[:5]
         items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
         learning_plan_html += f"<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
-
         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), [], 0, gr.Button(visible=False)
-
     else:
         headline = "<h4>To be a good fit for this role, you'll need to learn these skills:</h4>"
-        skills_to_display =
+        skills_to_display = sorted(job_skills)[:5]
         items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
         learning_plan_html = f"{headline}<ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
-
-        full_skill_list_for_state = all_missing_skills
+        full_skill_list_for_state = sorted(job_skills)
         new_offset = len(skills_to_display)
-        should_button_be_visible = len(
-
+        should_button_be_visible = len(full_skill_list_for_state) > 5
         return job_details_markdown, duties, qualifications, description, learning_plan_html, gr.Accordion(visible=True), full_skill_list_for_state, new_offset, gr.Button(visible=should_button_be_visible)
 
 def load_more_skills(full_skills_list, current_offset):
     SKILLS_INCREMENT = 5
     new_offset = current_offset + SKILLS_INCREMENT
     skills_to_display = full_skills_list[:new_offset]
-
     items_html = [f"<li><b>{ms}</b><br>• Learn: {_course_links_for(ms)}</li>" for ms in skills_to_display]
     learning_plan_html = f"<h4>To be a good fit for this role, you'll need to learn these skills:</h4><ul style='list-style-type: none; padding-left: 0;'>{''.join(items_html)}</ul>"
-
     should_button_be_visible = new_offset < len(full_skills_list)
-
     return learning_plan_html, new_offset, gr.Button(visible=should_button_be_visible)
 
 def on_reset():
-
+    # --- MODIFIED: Added new default outputs for reset ---
+    return ("", 3, "", pd.DataFrame(), None, gr.Dropdown(visible=False), gr.Accordion(visible=False), "Status: Ready.", "", "", "", "", gr.Markdown(visible=False), gr.Row(visible=False), [], 0, gr.Button(visible=False), pd.DataFrame(), gr.Accordion(visible=False))
 
-# --- Run Initialization ---
 print("Starting application initialization...")
 initialization_status = initialize_data_and_model()
 print(initialization_status)
 
-# --- Gradio Interface Definition ---
 with gr.Blocks(theme=gr.themes.Soft()) as ui:
     gr.Markdown("# Hybrid Career Planner & Skill Gap Analyzer")
-
     initial_matches_state = gr.State()
     missing_skills_state = gr.State([])
     skills_offset_state = gr.State(0)
-
     with gr.Row():
         with gr.Column(scale=3):
             dream_text = gr.Textbox(label='Your Dream Job Description', lines=3, placeholder="e.g., 'A role in a tech startup focused on machine learning...'")
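Reviewer note: in the new `on_select_job`, a job skill counts as covered when any user skill clears a 0.58 cosine threshold; everything below it feeds the learning plan. A sketch of just that masking step with stub values (the 0.58 comes from the commit, the numbers are invented):

import torch

job_skills = ['python', 'sql', 'leadership']
similarity_matrix = torch.tensor([   # rows = user skills, columns = job_skills
    [0.81, 0.30, 0.12],
    [0.25, 0.64, 0.40],
])

matched_mask = torch.any(similarity_matrix > 0.58, dim=0)   # tensor([True, True, False])
missing = [s for i, s in enumerate(job_skills) if not matched_mask[i]]
print(missing)  # ['leadership'] -> becomes the learning-plan entries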
@@ -351,64 +508,37 @@ with gr.Blocks(theme=gr.themes.Soft()) as ui:
             topk_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Number of Matches")
             search_btn = gr.Button("Find Matches", variant="primary")
             reset_btn = gr.Button("Reset All")
-
     status_text = gr.Markdown("Status: Ready.")
     spelling_alert = gr.Markdown(visible=False)
     with gr.Row(visible=False) as spelling_row:
         search_anyway_btn = gr.Button("Search Anyway", variant="secondary")
         retype_btn = gr.Button("Let Me Fix It", variant="stop")
 
-    df_output = gr.DataFrame(label="Job Matches", interactive=False)
-    job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
+    df_output = gr.DataFrame(label="Job Matches (Sorted by Overall Relevance)", interactive=False)
+
+    # --- NEW: Added the recommendations section ---
+    with gr.Accordion("✨ Based on your current skills and career interest consider these jobs...", open=True, visible=False) as recommendations_accordion:
+        recommendations_df_output = gr.DataFrame(label="Top Skill Matches", interactive=False)
+
+    job_selector = gr.Dropdown(label="Select a job to see more details & learning plan:", visible=False)
     with gr.Accordion("Job Details & Learning Plan", open=False, visible=False) as details_accordion:
         job_details_markdown = gr.Markdown()
-
         with gr.Tabs():
-            with gr.TabItem("Duties"):
-
-            with gr.TabItem("
-                qualifications_markdown = gr.Markdown()
-            with gr.TabItem("Full Description"):
-                description_markdown = gr.Markdown()
-
+            with gr.TabItem("Duties"): duties_markdown = gr.Markdown()
+            with gr.TabItem("Qualifications"): qualifications_markdown = gr.Markdown()
+            with gr.TabItem("Full Description"): description_markdown = gr.Markdown()
         learning_plan_output = gr.HTML(label="Learning Plan")
         load_more_btn = gr.Button("Load More Skills", visible=False)
 
-    # ---
-    search_btn.click(
-
-
-
-    )
-
-
-
-
-
-    retype_btn.click(
-        lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False)),
-        outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row]
-    )
-    reset_btn.click(
-        fn=on_reset,
-        outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn],
-        queue=False
-    )
-    rerank_btn.click(
-        fn=rerank_current_results,
-        inputs=[initial_matches_state, skills_text, topk_slider],
-        outputs=[status_text, df_output, job_selector]
-    )
-    job_selector.change(
-        fn=on_select_job,
-        inputs=[job_selector, skills_text],
-        outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn]
-    )
-    load_more_btn.click(
-        fn=load_more_skills,
-        inputs=[missing_skills_state, skills_offset_state],
-        outputs=[learning_plan_output, skills_offset_state, load_more_btn]
-    )
+    # --- MODIFIED: Added new outputs to the click events ---
+    search_btn.click(fn=find_matches_and_rank_with_check, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row, recommendations_df_output, recommendations_accordion])
+    search_anyway_btn.click(fn=find_matches_and_rank_anyway, inputs=[dream_text, topk_slider, skills_text], outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row, recommendations_df_output, recommendations_accordion])
+    retype_btn.click(lambda: ("Status: Ready for you to retype.", None, pd.DataFrame(), gr.Dropdown(visible=False), gr.Accordion(visible=False), gr.Markdown(visible=False), gr.Row(visible=False), pd.DataFrame(), gr.Accordion(visible=False)), outputs=[status_text, initial_matches_state, df_output, job_selector, details_accordion, spelling_alert, spelling_row, recommendations_df_output, recommendations_accordion])
+    reset_btn.click(fn=on_reset, outputs=[dream_text, topk_slider, skills_text, df_output, initial_matches_state, job_selector, details_accordion, status_text, job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, spelling_alert, spelling_row, missing_skills_state, skills_offset_state, load_more_btn, recommendations_df_output, recommendations_accordion], queue=False)
+    rerank_btn.click(fn=rerank_current_results, inputs=[initial_matches_state, skills_text, topk_slider], outputs=[status_text, df_output, job_selector, recommendations_df_output, recommendations_accordion])
+
+    job_selector.change(fn=on_select_job, inputs=[job_selector, skills_text], outputs=[job_details_markdown, duties_markdown, qualifications_markdown, description_markdown, learning_plan_output, details_accordion, missing_skills_state, skills_offset_state, load_more_btn])
+    load_more_btn.click(fn=load_more_skills, inputs=[missing_skills_state, skills_offset_state], outputs=[learning_plan_output, skills_offset_state, load_more_btn])
 
-ui.launch()
+
+ui.launch()
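Reviewer note: in Gradio, each handler's return tuple must line up positionally with its `outputs=` list, which is why every wired handler above gained exactly two trailing values for `recommendations_df_output` and `recommendations_accordion`. A minimal, self-contained sketch of that contract (toy components, not the app's):

import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Go")
    msg = gr.Markdown()
    panel = gr.Accordion("Extras", visible=False)

    def on_go():
        # One value per entry in outputs=, in the same order.
        return "Done.", gr.Accordion(visible=True)

    btn.click(fn=on_go, outputs=[msg, panel])

demo.launch()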