# Source code for utils.feature_engineering

import re
import time
import warnings
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from geopy.distance import geodesic
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Installs a process-wide filter that ignores every warning category; this
# takes effect for any program that imports this module, not just this file.
warnings.filterwarnings('ignore')



[docs]
def calculate_study_title_score(df: pd.DataFrame) -> pd.Series:
    """
    Calculate the normalized difference between candidate and required study levels.

    Each education level is mapped to its position on an ordered scale and the
    score is ``(candidate rank - required rank) / (number of levels - 1)``. The
    result therefore lies in ``[-1, 1]``: positive when the candidate's level is
    above the requirement, negative when below, and 0 on an exact match.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing 'Study Title' (candidate level) and 'Study Level'
        (required level) columns. A column that is absent is treated as
        entirely missing.

    Returns
    -------
    pandas.Series
        Float scores aligned with ``df.index``. NaN where either level is
        missing or is not one of the known levels.
    """
    # Lowest to highest. Matching is exact and case-sensitive, so the labels
    # must be spelled exactly as they appear in the data (note the lowercase
    # "master's degree").
    ordered_levels = [
        "Middle school diploma",
        "High school graduation",
        "Professional qualification",
        "Three-year degree",
        "Five-year degree",
        "master's degree",
        "Doctorate"
    ]

    level_to_rank = {level: idx for idx, level in enumerate(ordered_levels)}
    max_distance = len(ordered_levels) - 1

    def _ranks(column: str) -> pd.Series:
        # Vectorised lookup: unknown labels and missing values both become NaN.
        if column not in df.columns:
            return pd.Series(np.nan, index=df.index, dtype=float)
        return df[column].map(level_to_rank).astype(float)

    # NaN propagates through the subtraction, so a row scores NaN as soon as
    # either side could not be ranked.
    return (_ranks('Study Title') - _ranks('Study Level')) / max_distance




[docs]
def calculate_experience_match_score(df: pd.DataFrame) -> pd.Series:
    """
    Calculate the normalized difference between candidate and required experience.

    The score is ``(candidate years - required years) / range``, where ``range``
    is the spread (max - min) of all experience values found in both columns.
    When every value is identical the range falls back to 1 to avoid dividing
    by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing 'Years Experience_int' (candidate) and
        'Years Experience.1_int' (job requirement).

    Returns
    -------
    pandas.Series
        Scores aligned with ``df.index``. Rows whose job requirement is missing
        score 0; rows whose candidate experience is missing (with a
        requirement present) score NaN.

    Raises
    ------
    KeyError
        If either experience column is absent from ``df``.
    """
    candidate_exps = df['Years Experience_int']
    job_exps = df['Years Experience.1_int']

    # Pool both columns once so min and max come from the same pass.
    pooled = pd.concat([candidate_exps, job_exps])
    global_min = pooled.min()
    global_max = pooled.max()
    max_range = global_max - global_min if global_max != global_min else 1

    scores = (candidate_exps - job_exps) / max_range
    # A job without a stated requirement is scored 0 rather than NaN.
    return scores.where(job_exps.notna(), 0)




[docs]
def calculate_salary_fit_score(df: pd.DataFrame, is_expected: bool = True) -> pd.Series:
    """
    Score how well a candidate's salary fits the job's salary band.

    A salary inside ``[Minimum Ral, Ral Maximum]`` scores exactly 1.0. Outside
    the band the score is the signed gap to the nearest bound divided by a
    scale: negative when the salary is below the minimum, positive when it is
    above the maximum. The scale is the band width; if the width is not
    positive the minimum is used instead, and if that is not positive either,
    a fixed 1000.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame holding the candidate salary column plus 'Minimum Ral' and
        'Ral Maximum'.
    is_expected : bool, optional
        If True, reads 'Expected Ral'; if False, reads 'Current Ral'.
        Default is True.

    Returns
    -------
    pandas.Series
        One score per row; NaN when any of the three salary values is missing.
    """
    salary_column = 'Expected Ral' if is_expected else 'Current Ral'

    def _fit(row: pd.Series):
        salary = row.get(salary_column)
        lower = row.get('Minimum Ral')
        upper = row.get('Ral Maximum')

        if pd.isna(salary) or pd.isna(lower) or pd.isna(upper):
            return np.nan
        if lower <= salary <= upper:
            return 1.0

        # Signed gap to whichever bound was crossed.
        if salary < lower:
            gap = salary - lower
        else:
            gap = salary - upper

        # NOTE(review): an out-of-range salary can also produce 1.0 (e.g. one
        # full band width above the maximum), the same value as a perfect fit.
        width = upper - lower
        divisor = width if width > 0 else lower
        if divisor <= 0:
            divisor = 1000
        return gap / divisor

    return df.apply(_fit, axis=1)



# Sentence-embedding model instantiated once when this module is imported and
# shared by calculate_study_area_score and calculate_professional_similarity_score.
model = SentenceTransformer('all-MiniLM-L6-v2')



[docs]
def calculate_study_area_score(df: pd.DataFrame) -> pd.Series:
    """
    Score how closely the candidate's study area matches the required one.

    Every distinct non-missing study area from both columns is embedded once
    with the module-level sentence model; each row is then scored with the
    cosine similarity of its two embeddings.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with 'Study area' (candidate) and 'Study Area.1' (job) columns.

    Returns
    -------
    pandas.Series
        Cosine similarity per row; NaN when either study area is missing.
    """
    candidate_col = 'Study area'
    job_col = 'Study Area.1'

    # Embed each distinct label only once, however many rows use it.
    distinct_areas = pd.concat([df[candidate_col], df[job_col]]).dropna().unique()
    vectors = {}
    for area in distinct_areas:
        vectors[area] = model.encode(area, convert_to_tensor=True)

    def _pair_similarity(row: pd.Series):
        left = row.get(candidate_col)
        right = row.get(job_col)
        if pd.isna(left) or pd.isna(right):
            return np.nan
        return float(util.cos_sim(vectors.get(left), vectors.get(right)))

    return df.apply(_pair_similarity, axis=1)




[docs]
def calculate_professional_similarity_score(df: pd.DataFrame) -> pd.Series:
    """
    Calculate semantic similarity between candidate's background and job description.

    For each row the candidate's 'Sector' and 'Last Role' are joined into one
    text and the job's 'Job Family Hiring' and 'Job Title Hiring' into another;
    both texts are embedded with the module-level sentence model and compared
    with cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with 'Sector', 'Last Role', 'Job Family Hiring', and 'Job Title Hiring'.

    Returns
    -------
    pandas.Series
        Cosine similarity per row; NaN when either side has no usable text
        (all of its fields missing or blank).
    """
    def build_text(*fields: str) -> Optional[str]:
        # Keep only fields that are present and non-blank after stripping.
        non_empty = [str(f).strip() for f in fields if pd.notna(f) and str(f).strip()]
        if not non_empty:
            return None
        return ' | '.join(non_empty)

    # Identical texts recur across rows; encode each distinct text only once.
    embedding_cache: Dict[str, Any] = {}

    def get_embedding(text: str):
        if text not in embedding_cache:
            embedding_cache[text] = model.encode(text, convert_to_tensor=True)
        return embedding_cache[text]

    def _similarity(row: pd.Series):
        candidate_text = build_text(row.get('Sector'), row.get('Last Role'))
        job_text = build_text(row.get('Job Family Hiring'), row.get('Job Title Hiring'))

        if candidate_text is None or job_text is None:
            return np.nan

        emb_a = get_embedding(candidate_text)
        emb_b = get_embedding(job_text)
        return float(util.cos_sim(emb_a, emb_b))

    return df.apply(_similarity, axis=1)




[docs]
def create_candidate_text(row: pd.Series) -> str:
    """
    Create a text description summarizing a candidate's profile.

    Builds one sentence fragment per available field (education, sector, last
    role, years of experience, skills), joins them with ". " and terminates
    the result with a period.

    Parameters
    ----------
    row : pandas.Series
        A row from the candidate DataFrame. Fields read: 'Study Title',
        'Study area', 'Sector', 'Last Role', 'Years Experience', 'TAG'.
        Absent or missing fields are skipped.

    Returns
    -------
    str
        A text summary of the candidate, or an empty string when none of the
        fields is available.
    """
    def _present(field: str):
        # Return the field's value, or None when it is absent or NaN/None.
        value = row.get(field)
        return value if pd.notna(value) else None

    parts = []

    title = _present('Study Title')
    area = _present('Study area')
    if title is not None and area is not None:
        parts.append(f"{title} in {area}")
    elif title is not None:
        parts.append(f"Studied {title}")
    elif area is not None:
        parts.append(f"Studied in {area}")

    sector = _present('Sector')
    if sector is not None:
        parts.append(f"Worked in the {sector} sector")

    last_role = _present('Last Role')
    if last_role is not None:
        parts.append(f"Last held the role of {last_role}")

    years = _present('Years Experience')
    if years is not None:
        parts.append(f"with {years} years of experience")

    skills = _present('TAG')
    if skills is not None:
        parts.append(f"Key skills include: {skills}")

    # Without this guard an empty profile would be rendered as a lone ".".
    if not parts:
        return ""
    return ". ".join(parts) + "."




[docs]
def create_job_text(row: pd.Series) -> str:
    """
    Create a text description summarizing a job posting.

    Builds one labelled fragment per available field (title, department,
    recruitment context, description, ideal profile, education requirement,
    required experience), joins them with ". " and terminates the result with
    a period.

    Parameters
    ----------
    row : pandas.Series
        A row from the job DataFrame. Fields read: 'Job Title Hiring',
        'Job Family Hiring', 'Recruitment Request', 'Job Description',
        'Candidate Profile', 'Study Level', 'Study Area.1',
        'Years Experience.1'. Absent or missing fields are skipped.

    Returns
    -------
    str
        A text summary of the job posting, or an empty string when none of
        the fields is available.
    """
    def _present(field: str):
        # Return the field's value, or None when it is absent or NaN/None.
        value = row.get(field)
        return value if pd.notna(value) else None

    parts = []

    # Simple "label: value" fragments, emitted in this fixed order.
    labelled_fields = (
        ('Job Title Hiring', "Job title"),
        ('Job Family Hiring', "Department"),
        ('Recruitment Request', "Recruitment context"),
        ('Job Description', "Job description"),
        ('Candidate Profile', "Ideal candidate profile"),
    )
    for field, label in labelled_fields:
        value = _present(field)
        if value is not None:
            parts.append(f"{label}: {value}")

    level = _present('Study Level')
    area = _present('Study Area.1')
    if level is not None and area is not None:
        parts.append(f"Educational requirement: {level} in {area}")
    elif level is not None:
        parts.append(f"Educational requirement: {level}")
    elif area is not None:
        parts.append(f"Field of study required: {area}")

    years = _present('Years Experience.1')
    if years is not None:
        parts.append(f"Requires {years} years of experience")

    # Without this guard an empty posting would be rendered as a lone ".".
    if not parts:
        return ""
    return ". ".join(parts) + "."




[docs]
def prepare_nlp_text_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add 'candidate_text' and 'job_text' summary columns for NLP similarity work.

    Works on a copy of the input, so the caller's DataFrame is left unchanged.
    Each new column is produced by applying the matching row-level text
    builder to every row, with any missing result replaced by an empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame with candidate and job information.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with 'candidate_text' and 'job_text' columns added.
    """
    enriched = df.copy()
    text_builders = (
        ('candidate_text', create_candidate_text),
        ('job_text', create_job_text),
    )
    for column, builder in text_builders:
        enriched[column] = enriched.apply(builder, axis=1).fillna("")
    return enriched




[docs]
def calculate_distance(coord1: Tuple[float, float], coord2: Tuple[float, float]) -> Optional[float]:
    """
    Compute geodesic distance in kilometers between two coordinate pairs.

    Parameters
    ----------
    coord1 : tuple of float
        First coordinate as (latitude, longitude).
    coord2 : tuple of float
        Second coordinate as (latitude, longitude).

    Returns
    -------
    float or None
        Distance in kilometers, or None if the calculation raises an
        ordinary exception (for example on malformed coordinates).
    """
    try:
        return geodesic(coord1, coord2).kilometers
    except Exception:
        # Best-effort: any ordinary failure maps to None. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return None