import pandas as pd
from typing import List
[docs]
def clean_dataframe_columns(df: pd.DataFrame) -> pd.DataFrame:
"""
Clean and sanitize the column names of a pandas DataFrame.
This function performs the following operations:
1. Strips leading/trailing whitespace from each column name.
2. Removes any non-ASCII characters from column names.
Parameters
----------
df : pandas.DataFrame
The input DataFrame whose columns need to be cleaned.
Returns
-------
pandas.DataFrame
The same DataFrame with cleaned column names.
"""
df.columns = [c.strip() for c in df.columns]
df.columns = [c.encode('ascii', 'ignore').decode().strip() for c in df.columns]
return df
[docs]
def split_duplicate_ids_by_invariant_columns(df: pd.DataFrame, invariant_columns: List[str]) -> pd.DataFrame:
"""
Split duplicate IDs in a DataFrame when invariant columns vary within the same ID group.
This function ensures that each group of rows sharing an ID also shares identical values
in the specified invariant columns. If not, the ID is suffixed to distinguish subgroups.
Parameters
----------
df : pandas.DataFrame
The input DataFrame containing duplicate IDs.
invariant_columns : list of str
Columns that should have the same values across all rows for a given ID. If differences
are found, new IDs will be generated to separate subgroups.
Returns
-------
pandas.DataFrame
A DataFrame with updated IDs and no unintended merges across differing invariant data.
"""
if invariant_columns is None:
raise ValueError("You must provide a list of invariant_columns.")
unique_ids_before: int = df['ID'].nunique()
df['Original_ID'] = df['ID']
def split_id(group: pd.DataFrame) -> pd.DataFrame:
unique_combos = group[invariant_columns].drop_duplicates()
if len(unique_combos) == 1:
group['New_ID'] = group['Original_ID'].iloc[0]
else:
for idx, (_, subgroup) in enumerate(group.groupby(invariant_columns, dropna=False)):
group.loc[subgroup.index, 'New_ID'] = f"{group['Original_ID'].iloc[0]}_{idx + 1}"
return group
df = df.groupby('Original_ID', group_keys=False).apply(split_id)
df['ID'] = df['New_ID'].fillna(df['Original_ID'])
df = df.drop(columns=['New_ID', 'Original_ID']).reset_index(drop=True)
unique_ids_after: int = df['ID'].nunique()
print(f"🔵 Unique IDs before cleaning: {unique_ids_before}")
print(f"🟢 Unique IDs after cleaning: {unique_ids_after}")
print(f"🧮 Difference: {unique_ids_after - unique_ids_before} new IDs created")
return df
[docs]
def remove_initial_stage_candidates(df: pd.DataFrame) -> pd.DataFrame:
"""
Removes candidates who are only present in the earliest stages of the selection process
and have no sector information. These are typically low-signal entries that did not
progress in the recruitment pipeline.
A candidate row will be removed if:
- It is the only row for that candidate ID.
- The 'Candidate State' is one of ['imported', 'first contact', 'in selection'].
- The 'Sector' is missing (NaN).
Parameters
----------
df : pandas.DataFrame
The input DataFrame containing candidate records.
Returns
-------
pandas.DataFrame
Cleaned DataFrame with early-stage, low-information candidates removed.
"""
df['Candidate State'] = df['Candidate State'].str.strip().str.lower()
ids_to_drop: List[str] = []
for id_value, group in df.groupby('ID'):
is_single_row = len(group) == 1
is_initial_stage_only = group['Candidate State'].isin(['imported', 'first contact', 'in selection']).all()
has_no_sector_info = group['Sector'].isna().all()
if is_single_row and is_initial_stage_only and has_no_sector_info:
ids_to_drop.append(id_value)
df_cleaned = df[~df['ID'].isin(ids_to_drop)].reset_index(drop=True)
print(f"🗂️ Removed {len(ids_to_drop)} initial-stage only candidates.")
return df_cleaned
[docs]
def sort_group(group: pd.DataFrame, state_order: List[str], event_order: List[str]) -> pd.DataFrame:
"""
Sorts a group of candidate records by the 'Candidate State' and 'event_type__val' columns,
based on provided orderings for both states and event types.
The sorting is done first by 'Candidate State' according to the `state_order`, and then by
'event_type__val' according to the `event_order`.
Parameters:
----------
group : pandas.DataFrame
A group of rows with the same 'ID', representing the different stages in the recruitment process
for a single candidate.
state_order : list of str
The predefined order for sorting the 'Candidate State' column.
event_order : list of str
The predefined order for sorting the 'event_type__val' column.
Returns:
-------
pandas.DataFrame
The same group sorted by 'Candidate State' and 'event_type__val'.
"""
sorted_group = group.sort_values(by=['Candidate State', 'event_type__val'],
key=lambda col: col.map(
lambda x: (
state_order.index(x) if x in state_order else -1,
event_order.index(x) if x in event_order else -1
)
)
)
return sorted_group
[docs]
def remove_not_hired_valid_candidates(df: pd.DataFrame, state_order: List[str], event_order: List[str], feedbacks_to_remove: List[str]) -> pd.DataFrame:
"""
Removes candidates from the DataFrame who have invalid or irrelevant status based on
their most recent event feedback and event type. Specifically, it removes candidates
who are not hired and have certain feedback or event types indicating they are not
progressing in the recruitment process.
The function performs the following steps:
1. Strips whitespace and converts the relevant columns to lowercase.
2. Applies sorting logic to ensure candidate events are processed in the correct order.
3. Identifies candidates who have invalid feedback or event types in their last event
and are not hired.
4. Removes these candidates from the DataFrame.
Parameters
----------
df : pandas.DataFrame
The input DataFrame containing candidate records with columns such as 'Candidate State',
'event_type__val', and 'event_feedback'.
state_order : list of str
The order in which the 'Candidate State' values should be sorted.
event_order : list of str
The order in which the 'event_type__val' values should be sorted.
feedbacks_to_remove : list of str
The feedback values that indicate candidates should be removed if they are not hired.
Returns
-------
pandas.DataFrame
The cleaned DataFrame with invalid candidates removed.
"""
df['Candidate State'] = df['Candidate State'].str.strip().str.lower()
df['event_type__val'] = df['event_type__val'].str.strip().str.lower()
df['event_feedback'] = df['event_feedback'].str.strip()
df['Hired'] = df['Candidate State'].apply(
lambda x: True if x == 'hired' else False
)
df = df.groupby('ID', group_keys=False).apply(sort_group, state_order=state_order, event_order=event_order)
df = df.reset_index(drop=True)
last_event = df.groupby('ID').tail(1)
ids_to_remove_feedback = last_event[
(last_event['event_feedback'].isin(feedbacks_to_remove)) & (last_event['Hired'] != True)
]['ID'].tolist()
ids_to_remove_event = last_event[
(last_event['event_type__val'].isin(['economic proposal', 'candidate notification'])) &
(last_event['Hired'] != True)
]['ID'].tolist()
all_ids_to_remove = set(ids_to_remove_feedback + ids_to_remove_event)
print(f"Number of unique IDs to remove: {len(all_ids_to_remove)}")
total_ids_before = df['ID'].nunique()
df = df[~df['ID'].isin(all_ids_to_remove)].reset_index(drop=True)
total_ids_after = df['ID'].nunique()
print(f"Total IDs before cleaning: {total_ids_before}")
print(f"Total IDs after cleaning: {total_ids_after}")
print(f"Total IDs removed: {total_ids_before - total_ids_after}")
df = df.drop(columns=['state_order', 'event_order'], errors='ignore')
return df