#Connecting to drive

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

Mounted at /content/drive

#Importing dataset and turning it into a dataframe

df = pd.read_csv("/content/drive/MyDrive/DATA/global_ai_jobs.csv")

df.head()

#find data types

df.dtypes

#mapping to strings

df = df.astype({'country': str, 'job_role': str, 'ai_specialization': str, 'experience_level': str, 'education_required': str, 'industry': str, 'company_size': str, 'work_mode': str, })

#standardizing each str column so that extra white space is removed and all text is lowercase

df['country'] = (df['country'].str.strip().str.lower().str.replace(r'\s+', ' ', regex=True)) # last part one or more spaces with a single space
df['job_role'] = (df['job_role'].str.strip().str.lower().str.replace(r'\s+', ' ', regex=True))
df['ai_specialization'] = (df['ai_specialization'].str.strip().str.lower().str.replace(r'\s+', ' ', regex=True))
df['experience_level'] = (df['experience_level'].str.strip().str.lower().str.replace(r'\s+', ' ', regex=True))
df['education_required'] = (df['education_required'].str.strip().str.lower().str.replace(r'\s+', ' ', regex=True))
df['industry'] = (df['industry'].str.strip().str.lower().str.replace(r'\s+', ' ', regex=True))
df['company_size'] = (df['company_size'].str.strip().str.lower().str.replace(r'\s+', ' ', regex=True))
df['work_mode'] = (df['work_mode'].str.strip().str.lower().str.replace(r'\s+', ' ', regex=True))

#checking for NA values

df.isna().sum()

#Remove rows with NA values from dataframe

df = df.dropna()

#find duplicates

df.duplicated().sum()

np.int64(0)

#Remove duplicate rows, keep first occurrences

df = df.drop_duplicates()

df.head()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       90000 non-null  int64  
 1   country                  90000 non-null  object 
 2   job_role                 90000 non-null  object 
 3   ai_specialization        90000 non-null  object 
 4   experience_level         90000 non-null  object 
 5   experience_years         90000 non-null  int64  
 6   salary_usd               90000 non-null  int64  
 7   bonus_usd                90000 non-null  int64  
 8   education_required       90000 non-null  object 
 9   industry                 90000 non-null  object 
 10  company_size             90000 non-null  object 
 11  interview_rounds         90000 non-null  int64  
 12  year                     90000 non-null  int64  
 13  work_mode                90000 non-null  object 
 14  weekly_hours             90000 non-null  float64
 15  company_rating           90000 non-null  float64
 16  job_openings             90000 non-null  int64  
 17  hiring_difficulty_score  90000 non-null  float64
 18  layoff_risk              90000 non-null  float64
 19  ai_adoption_score        90000 non-null  int64  
 20  company_funding_billion  90000 non-null  float64
 21  economic_index           90000 non-null  float64
 22  ai_maturity_years        90000 non-null  int64  
 23  offer_acceptance_rate    90000 non-null  float64
 24  tax_rate_percent         90000 non-null  float64
 25  vacation_days            90000 non-null  int64  
 26  skill_demand_score       90000 non-null  int64  
 27  automation_risk          90000 non-null  int64  
 28  job_security_score       90000 non-null  int64  
 29  career_growth_score      90000 non-null  int64  
 30  work_life_balance_score  90000 non-null  int64  
 31  promotion_speed          90000 non-null  int64  
 32  salary_percentile        90000 non-null  int64  
 33  cost_of_living_index     90000 non-null  float64
 34  employee_satisfaction    90000 non-null  int64  
dtypes: float64(9), int64(18), object(8)
memory usage: 24.0+ MB

df.describe()

for col in df.select_dtypes(include='object').columns:
    print(f"Column '{col}':\n{df[col].value_counts().head(10)}\n")

Column 'country':
country
canada         7602
australia      7589
singapore      7583
brazil         7545
uk             7532
uae            7529
netherlands    7514
germany        7461
india          7450
france         7440
Name: count, dtype: int64

Column 'job_role':
job_role
nlp engineer                 11412
software engineer ai         11333
research scientist           11309
machine learning engineer    11263
ai engineer                  11247
computer vision engineer     11227
data scientist               11146
data analyst                 11063
Name: count, dtype: int64

Column 'ai_specialization':
ai_specialization
computer vision           11487
llm                       11466
reinforcement learning    11261
analytics                 11258
generative ai             11208
mlops                     11133
forecasting               11115
nlp                       11072
Name: count, dtype: int64

Column 'experience_level':
experience_level
senior    22680
lead      22556
mid       22459
entry     22305
Name: count, dtype: int64

Column 'education_required':
education_required
bootcamp    18148
phd         18070
bachelor    18005
diploma     17914
master      17863
Name: count, dtype: int64

Column 'industry':
industry
gaming        9190
education     9067
finance       9057
energy        9017
retail        8987
tech          8984
consulting    8954
telecom       8939
automotive    8930
healthcare    8875
Name: count, dtype: int64

Column 'company_size':
company_size
medium        18328
startup       18008
enterprise    17988
small         17919
large         17757
Name: count, dtype: int64

Column 'work_mode':
work_mode
onsite    30233
remote    30005
hybrid    29762
Name: count, dtype: int64

# Select only numerical columns
numerical_df = df.select_dtypes(include=['int64', 'float64'])
correlation_matrix = numerical_df.corr()

display(correlation_matrix)

import matplotlib.pyplot as plt

df.hist(figsize=(15, 12), bins=30)
plt.tight_layout()
plt.show()

from scipy import stats

stats.probplot(df['salary_usd'], dist = 'norm', plot = plt)
plt.title('Q-Q Plot of Salary')
plt.show()

#Creating a loged version of salary_usd

import numpy as np

df['salary_log'] = np.log1p(df['salary_usd'])

#Q-Q Plot Using new logged version

from scipy import stats
import matplotlib.pyplot as plt

stats.probplot(df['salary_log'], dist = 'norm', plot = plt)
plt.title('Q-Q Plot of Log Salary')
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D

selected_numerical_cols = ['experience_years', 'salary_usd', 'bonus_usd',
                           'weekly_hours', 'company_rating', 'vacation_days',
                           'hiring_difficulty_score']

titles = ['Experience (Years)', 'Salary (USD)', 'Bonus (USD)',
          'Weekly Hours', 'Company Rating', 'Vacation Days',
          'Hiring Difficulty Score']

fig, axes = plt.subplots(3, 3, figsize=(15, 10))
fig.suptitle('Checking for Outliers', fontsize=16, fontweight='bold')
#boxes
for i, (col, title) in enumerate(zip(selected_numerical_cols, titles)):
    ax = axes[i // 3][i % 3]
    sns.boxplot(y=df[col], ax=ax,
                flierprops=dict(marker='o', markerfacecolor='red', markersize=4))
    ax.set_title(title, fontsize=12)
    ax.set_ylabel('Value')
    ax.set_xlabel('Distribution')
    ax.grid(True, alpha=0.3)

#legend
legend_elements = [
    Line2D([0], [0], color='steelblue', linewidth=4, label='IQR (Middle 50%)'),
    Line2D([0], [0], color='black', linewidth=1.5, label='Median'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=6, label='Outliers')
]
fig.legend(handles=legend_elements, loc='lower right', fontsize=10, title='Legend')

axes[2][1].set_visible(False)
axes[2][2].set_visible(False)
plt.tight_layout()
plt.show()

from scipy import stats

# Define the null and alternative hypotheses
print("Hypothesis Testing: Is there a significant difference in average salary across different job roles?\n")
print("Null Hypothesis (H0): There is no significant difference in the mean salary_usd across different job_role categories.")
print("Alternative Hypothesis (Ha): There is a significant difference in the mean salary_usd for at least one job_role group.\n")

# Prepare data for ANOVA
job_roles = df['job_role'].unique()
salaries_by_job_role = [df['salary_usd'][df['job_role'] == role] for role in job_roles]

# Perform one-way ANOVA test
f_statistic, p_value = stats.f_oneway(*salaries_by_job_role)

print(f"ANOVA F-statistic: {f_statistic:.2f}")
print(f"ANOVA P-value: {p_value:.3e}") # Display p-value in scientific notation for very small values

# Draw a conclusion based on the p-value
alpha = 0.05
print(f"\nConclusion (at alpha = {alpha}):")
if p_value < alpha:
    print("Since the p-value is less than alpha, we reject the Null Hypothesis.")
    print("This suggests that there is a statistically significant difference in the average salary across at least two job roles.")
else:
    print("Since the p-value is greater than or equal to alpha, we fail to reject the Null Hypothesis.")
    print("This suggests that there is no statistically significant difference in the average salary across different job roles.")

print("\nNote: The ANOVA test assumes normality of the samples and homogeneity of variances. Further tests (e.g., Levene's test for homogeneity of variance, Shapiro-Wilk for normality) could be performed to validate these assumptions. If assumptions are violated, non-parametric tests like Kruskal-Wallis H-test might be more appropriate, or transformations of the data may be considered.")

plt.figure(figsize=(14, 6))
sns.violinplot(data=df, x='job_role', y='salary_usd', palette='muted', inner='quartile')
plt.xticks(rotation=45, ha='right')
plt.title('Salary Distribution by Job Role')
plt.xlabel('Job Role')
plt.ylabel('Salary (USD)')
plt.tight_layout()
plt.show()

Hypothesis Testing: Is there a significant difference in average salary across different job roles?

Null Hypothesis (H0): There is no significant difference in the mean salary_usd across different job_role categories.
Alternative Hypothesis (Ha): There is a significant difference in the mean salary_usd for at least one job_role group.

ANOVA F-statistic: 864.24
ANOVA P-value: 0.000e+00

Conclusion (at alpha = 0.05):
Since the p-value is less than alpha, we reject the Null Hypothesis.
This suggests that there is a statistically significant difference in the average salary across at least two job roles.

Note: The ANOVA test assumes normality of the samples and homogeneity of variances. Further tests (e.g., Levene's test for homogeneity of variance, Shapiro-Wilk for normality) could be performed to validate these assumptions. If assumptions are violated, non-parametric tests like Kruskal-Wallis H-test might be more appropriate, or transformations of the data may be considered.

/tmp/ipykernel_12916/4281342597.py:31: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(data=df, x='job_role', y='salary_usd', palette='muted', inner='quartile')

import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

print("Hypothesis Testing: Is there a linear relationship between experience_years and salary_usd?\n")
print("Null Hypothesis (H0): There is no linear relationship between experience_years and salary_usd (beta_1 = 0).")
print("Alternative Hypothesis (Ha): There is a linear relationship between experience_years and salary_usd (beta_1 != 0).\n")

# Define dependent and independent variables
X = df['experience_years']
y = df['salary_usd']

# Add a constant to the independent variable for intercept calculation
X = sm.add_constant(X)

# Create and fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X)
results = model.fit()

# Print the summary of the regression results
print(results.summary())

# Draw a conclusion based on the p-value of experience_years
alpha = 0.05
p_value_experience_years = results.pvalues['experience_years']

print(f"\nConclusion (at alpha = {alpha}):")
if p_value_experience_years < alpha:
    print("Since the p-value for 'experience_years' is less than alpha, we reject the Null Hypothesis.")
    print("This suggests that there is a statistically significant linear relationship between experience_years and salary_usd.")
else:
    print("Since the p-value for 'experience_years' is greater than or equal to alpha, we fail to reject the Null Hypothesis.")
    print("This suggests that there is no statistically significant linear relationship between experience_years and salary_usd.")

plt.figure(figsize=(10, 6))
sns.regplot(x='experience_years', y='salary_usd', data=df, scatter_kws={'alpha':0.3}, line_kws={'color':'red'})
plt.title('Relationship between Experience Years and Salary (USD)')
plt.xlabel('Experience Years')
plt.ylabel('Salary (USD)')
plt.grid(True)
plt.show()

Hypothesis Testing: Is there a linear relationship between experience_years and salary_usd?

Null Hypothesis (H0): There is no linear relationship between experience_years and salary_usd (beta_1 = 0).
Alternative Hypothesis (Ha): There is a linear relationship between experience_years and salary_usd (beta_1 != 0).

                            OLS Regression Results                            
==============================================================================
Dep. Variable:             salary_usd   R-squared:                       0.524
Model:                            OLS   Adj. R-squared:                  0.524
Method:                 Least Squares   F-statistic:                 9.907e+04
Date:                Fri, 08 May 2026   Prob (F-statistic):               0.00
Time:                        23:39:14   Log-Likelihood:            -1.0564e+06
No. Observations:               90000   AIC:                         2.113e+06
Df Residuals:                   89998   BIC:                         2.113e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             5.859e+04    157.318    372.445      0.000    5.83e+04    5.89e+04
experience_years  5400.2729     17.157    314.760      0.000    5366.646    5433.900
==============================================================================
Omnibus:                     3638.458   Durbin-Watson:                   2.007
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             5144.614
Skew:                          -0.403   Prob(JB):                         0.00
Kurtosis:                       3.851   Cond. No.                         14.4
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Conclusion (at alpha = 0.05):
Since the p-value for 'experience_years' is less than alpha, we reject the Null Hypothesis.
This suggests that there is a statistically significant linear relationship between experience_years and salary_usd.

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# dropping columns that leak the target, are redundant, or have no predictive value
drop_cols = ['id', 'salary_percentile', 'salary_log', 'year']

# we're predicting salary_usd and employee_satisfaction separately
features_salary = df.drop(columns=drop_cols + ['salary_usd', 'employee_satisfaction'])
features_satisfaction = df.drop(columns=drop_cols + ['salary_usd', 'employee_satisfaction'])

target_salary = df['salary_usd']
target_satisfaction = df['employee_satisfaction']

print("Features shape:", features_salary.shape)
print("Columns kept:", list(features_salary.columns))

Features shape: (90000, 30)
Columns kept: ['country', 'job_role', 'ai_specialization', 'experience_level', 'experience_years', 'bonus_usd', 'education_required', 'industry', 'company_size', 'interview_rounds', 'work_mode', 'weekly_hours', 'company_rating', 'job_openings', 'hiring_difficulty_score', 'layoff_risk', 'ai_adoption_score', 'company_funding_billion', 'economic_index', 'ai_maturity_years', 'offer_acceptance_rate', 'tax_rate_percent', 'vacation_days', 'skill_demand_score', 'automation_risk', 'job_security_score', 'career_growth_score', 'work_life_balance_score', 'promotion_speed', 'cost_of_living_index']

# label encode all object/string columns

categorical_cols = features_salary.select_dtypes(include='object').columns.tolist()
print("Categorical columns to encode:", categorical_cols)

le = LabelEncoder()

for col in categorical_cols:
    features_salary[col] = le.fit_transform(features_salary[col])
    features_satisfaction[col] = features_salary[col].copy()  # same encoding for both

print("\nEncoding complete. Sample of encoded data:")
features_salary[categorical_cols].head()

Categorical columns to encode: ['country', 'job_role', 'ai_specialization', 'experience_level', 'education_required', 'industry', 'company_size', 'work_mode']

Encoding complete. Sample of encoded data:

X_train_sal, X_test_sal, y_train_sal, y_test_sal = train_test_split(
    features_salary, target_salary, test_size=0.2, random_state=42
)

X_train_sat, X_test_sat, y_train_sat, y_test_sat = train_test_split(
    features_satisfaction, target_satisfaction, test_size=0.2, random_state=42
)

print(f"Salary model — Train: {X_train_sal.shape}, Test: {X_test_sal.shape}")
print(f"Satisfaction model — Train: {X_train_sat.shape}, Test: {X_test_sat.shape}")

Salary model — Train: (72000, 30), Test: (18000, 30)
Satisfaction model — Train: (72000, 30), Test: (18000, 30)

lr_salary = LinearRegression()
lr_satisfaction = LinearRegression()

# random forest
# n_estimators=100 means 100 decision trees — good balance of speed and accuracy
# random_state=42 ensures reproducibility
# n_jobs=-1 uses all CPU cores to speed up training on 90k rows
rf_salary = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_satisfaction = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

print("Models defined:")
print(f"  Baseline: {lr_salary}")
print(f"  Primary:  {rf_salary}")

Models defined:
  Baseline: LinearRegression()
  Primary:  RandomForestRegressor(n_jobs=-1, random_state=42)

# training baseline linear regression model
print("Training Linear Regression baseline models...")

lr_salary.fit(X_train_sal, y_train_sal)
lr_satisfaction.fit(X_train_sat, y_train_sat)

print("Linear Regression training complete")

Training Linear Regression baseline models...
Linear Regression training complete

# training primary random forest model

print("Training Random Forest models...")

rf_salary.fit(X_train_sal, y_train_sal)
print("Salary Random Forest complete")

rf_satisfaction.fit(X_train_sat, y_train_sat)
print("Satisfaction Random Forest complete")

Training Random Forest models...
Salary Random Forest complete
Satisfaction Random Forest complete

# evaluating the model

def evaluate_model(name, model, X_test, y_test):
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    print(f"{name}")
    print(f"  RMSE : {rmse:,.2f}")
    print(f"  R²   : {r2:.4f}")
    return preds, rmse, r2

print("Salary Prediction")
lr_sal_preds, lr_sal_rmse, lr_sal_r2 = evaluate_model("Linear Regression", lr_salary, X_test_sal, y_test_sal)
rf_sal_preds, rf_sal_rmse, rf_sal_r2 = evaluate_model("Random Forest", rf_salary, X_test_sal, y_test_sal)

print("\nEmployee Satisfaction Prediction")
lr_sat_preds, lr_sat_rmse, lr_sat_r2 = evaluate_model("Linear Regression", lr_satisfaction, X_test_sat, y_test_sat)
rf_sat_preds, rf_sat_rmse, rf_sat_r2 = evaluate_model("Random Forest", rf_satisfaction, X_test_sat, y_test_sat)

Salary Prediction
Linear Regression
  RMSE : 22,968.63
  R²   : 0.7254
Random Forest
  RMSE : 12,082.25
  R²   : 0.9240

Employee Satisfaction Prediction
Linear Regression
  RMSE : 5.50
  R²   : 0.5388
Random Forest
  RMSE : 5.38
  R²   : 0.5580

# summary table

results = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'Linear Regression', 'Random Forest'],
    'Target': ['Salary', 'Salary', 'Satisfaction', 'Satisfaction'],
    'RMSE': [lr_sal_rmse, rf_sal_rmse, lr_sat_rmse, rf_sat_rmse],
    'R²': [lr_sal_r2, rf_sal_r2, lr_sat_r2, rf_sat_r2]
})

print(results.to_string(index=False))

            Model       Target         RMSE       R²
Linear Regression       Salary 22968.627789 0.725382
    Random Forest       Salary 12082.251325 0.924010
Linear Regression Satisfaction     5.496207 0.538835
    Random Forest Satisfaction     5.380824 0.557995

importances_sal = pd.Series(
    rf_salary.feature_importances_,
    index=features_salary.columns
).sort_values(ascending=False).head(15)

plt.figure(figsize=(10, 6))
sns.barplot(x=importances_sal.values, y=importances_sal.index, palette='viridis')
plt.title('Top 15 Most Important Features for Predicting Salary', fontsize=14)
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

print("\nTop 5 features for salary prediction:")
print(importances_sal.head())

/tmp/ipykernel_12916/402026441.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=importances_sal.values, y=importances_sal.index, palette='viridis')

Top 5 features for salary prediction:
experience_years         0.483148
bonus_usd                0.225793
country                  0.172043
job_role                 0.047528
offer_acceptance_rate    0.003661
dtype: float64

# predicted salary vs actual salary

plt.figure(figsize=(8, 6))
plt.scatter(y_test_sal, rf_sal_preds, alpha=0.1, color='steelblue')
plt.plot([y_test_sal.min(), y_test_sal.max()],
         [y_test_sal.min(), y_test_sal.max()],
         'r--', linewidth=2, label='Perfect prediction')
plt.title('Random Forest: Predicted vs Actual Salary')
plt.xlabel('Actual Salary (USD)')
plt.ylabel('Predicted Salary (USD)')
plt.legend()
plt.tight_layout()
plt.show()

# predicted vs actual employee satisfcation
plt.figure(figsize=(8, 6))
plt.scatter(y_test_sat, rf_sat_preds, alpha=0.1, color='darkorchid')
plt.plot([y_test_sat.min(), y_test_sat.max()],
         [y_test_sat.min(), y_test_sat.max()],
         'r--', linewidth=2, label='Perfect prediction')
plt.title('Random Forest: Predicted vs Actual Employee Satisfaction')
plt.xlabel('Actual Satisfaction Score')
plt.ylabel('Predicted Satisfaction Score')
plt.legend()
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 6))
sns.barplot(data=results, x='Target', y='R²', hue='Model')
plt.title('Model Performance Comparison by R² Score')
plt.xlabel('Prediction Target')
plt.ylabel('R² Score')
plt.ylim(0, 1)
plt.legend(title='Model')
plt.tight_layout()
plt.show()

salary_residuals = y_test_sal - rf_sal_preds

plt.figure(figsize=(8, 6))
plt.scatter(rf_sal_preds, salary_residuals, alpha=0.2)
plt.axhline(0, linestyle='--', color='red')
plt.title('Residual Plot for Random Forest Salary Predictions')
plt.xlabel('Predicted Salary (USD)')
plt.ylabel('Residuals: Actual - Predicted Salary')
plt.tight_layout()
plt.show()

sat_residuals = y_test_sat - rf_sat_preds

plt.figure(figsize=(8, 6))
plt.scatter(rf_sat_preds, sat_residuals, alpha=0.2)
plt.axhline(0, linestyle='--', color='red')
plt.title('Residual Plot for Random Forest Satisfaction Score Predictions')
plt.xlabel('Predicted Employee Satisfaction Score')
plt.ylabel('Residuals: Actual - Predicted Satisfaction Score')
plt.tight_layout()
plt.show()

top_roles_salary = (
    df.groupby('job_role')['salary_usd']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_roles_salary.values, y=top_roles_salary.index)
plt.title('Average Salary by Job Role')
plt.xlabel('Average Salary (USD)')
plt.ylabel('Job Role')
plt.tight_layout()
plt.show()

# Average Employee Satisfaction by Job Role

top_roles_satisfaction = (
    df.groupby('job_role')['employee_satisfaction']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_roles_satisfaction.values,y=top_roles_satisfaction.index)
plt.title('Average Employee Satisfaction by Job Role')
plt.xlabel('Average Employee Satisfaction Score')
plt.ylabel('Job Role')
plt.tight_layout()
plt.show()

Member	Sections	1-2 Sentence Summary
Brandon Wagstaff	A, B, C	Came up with initial idea and sourced the dataset. Handled raw data ingestion/cleaning and EDA/Summary Statistics.
Sean Lee	D, E	Built/Designed ML algorithm. Trained ML algorithm and did test data analysis.
Rebecca Holcomb	F, G, H	Completed final visualizations, conclusion section, and textual descriptions. Contributed to visualizations for the EDA portion.

	id	experience_years	salary_usd	bonus_usd	interview_rounds	year	weekly_hours	company_rating	job_openings	hiring_difficulty_score	...	vacation_days	skill_demand_score	automation_risk	job_security_score	career_growth_score	work_life_balance_score	promotion_speed	salary_percentile	cost_of_living_index	employee_satisfaction
count	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	...	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000	90000.000000
mean	45000.500000	7.028133	96546.249222	13028.418722	4.495689	2023.003200	45.476268	3.998004	17.521867	55.028604	...	19.986367	50.461200	50.357544	75.563533	57.198544	69.146478	38.439633	50.542411	1.503042	72.733100
std	25980.906451	5.889327	43935.479553	7886.738085	1.704553	2.002624	5.475497	0.461914	7.848576	17.901451	...	6.069607	28.853798	28.845671	11.316485	12.900225	13.213996	18.429221	28.891570	0.576449	8.124018
min	1.000000	0.000000	28000.000000	1404.000000	2.000000	2020.000000	36.000000	3.200000	1.000000	0.000000	...	10.000000	1.000000	1.000000	29.000000	25.000000	25.000000	12.000000	1.000000	0.500000	42.000000
25%	22500.750000	2.000000	64676.750000	7104.750000	3.000000	2021.000000	40.700000	3.600000	12.000000	42.881134	...	15.000000	25.000000	25.000000	68.000000	48.000000	59.000000	24.000000	25.000000	1.010000	67.000000
50%	45000.500000	6.000000	87544.000000	11279.000000	4.000000	2023.000000	45.500000	4.000000	17.000000	55.066089	...	20.000000	51.000000	50.000000	77.000000	57.000000	69.000000	37.000000	51.000000	1.510000	73.000000
75%	67500.250000	12.000000	123906.000000	16997.250000	6.000000	2025.000000	50.200000	4.400000	23.000000	67.118119	...	25.000000	75.000000	75.000000	84.000000	66.000000	79.000000	51.000000	76.000000	2.000000	78.000000
max	90000.000000	19.000000	300622.000000	57681.000000	7.000000	2026.000000	55.000000	4.800000	50.000000	100.000000	...	30.000000	100.000000	100.000000	99.000000	99.000000	98.000000	98.000000	100.000000	2.500000	99.000000

	id	experience_years	salary_usd	bonus_usd	interview_rounds	year	weekly_hours	company_rating	job_openings	hiring_difficulty_score	...	vacation_days	skill_demand_score	automation_risk	job_security_score	career_growth_score	work_life_balance_score	promotion_speed	salary_percentile	cost_of_living_index	employee_satisfaction
id	1.000000	0.001841	0.007894	0.003261	-0.003397	0.003830	-0.002532	-0.004251	0.001834	0.000831	...	0.000019	-0.003792	0.004279	0.001153	-0.003507	0.006872	-0.000345	-0.000167	-0.003333	0.004867
experience_years	0.001841	1.000000	0.723879	0.545966	-0.005699	0.000253	-0.004406	0.004693	0.000691	0.001893	...	-0.004695	0.000309	0.003161	0.599466	0.007329	0.004165	0.363093	-0.003814	-0.000504	0.517066
salary_usd	0.007894	0.723879	1.000000	0.752046	-0.006230	-0.001309	-0.004970	-0.000347	0.003853	-0.000759	...	-0.006457	0.001087	0.002201	0.434177	0.003107	0.003890	0.261381	-0.003327	0.001462	0.629030
bonus_usd	0.003261	0.545966	0.752046	1.000000	-0.000935	0.002574	-0.002344	0.001736	0.004792	-0.000395	...	-0.006002	0.001459	0.004375	0.323791	0.003217	-0.000007	0.200770	-0.002921	0.002414	0.471220
interview_rounds	-0.003397	-0.005699	-0.006230	-0.000935	1.000000	-0.001724	0.002794	-0.000020	0.000547	0.002658	...	-0.003531	0.004476	-0.001391	-0.008056	0.002056	-0.002010	-0.000859	-0.002871	-0.000128	-0.005249
year	0.003830	0.000253	-0.001309	0.002574	-0.001724	1.000000	-0.000237	0.002733	0.003498	-0.001752	...	0.000196	0.004989	0.004411	0.001453	-0.003908	0.000372	0.000601	-0.000705	-0.002321	-0.002956
weekly_hours	-0.002532	-0.004406	-0.004970	-0.002344	0.002794	-0.000237	1.000000	0.002996	0.000721	0.004533	...	0.010298	-0.003575	0.003114	-0.001098	0.007250	-0.828129	-0.004508	0.001682	0.005979	-0.340942
company_rating	-0.004251	0.004693	-0.000347	0.001736	-0.000020	0.002733	0.002996	1.000000	0.001081	0.004933	...	-0.001946	-0.003076	0.002467	0.000561	-0.004091	-0.000381	-0.006642	0.002993	0.002303	0.001647
job_openings	0.001834	0.000691	0.003853	0.004792	0.000547	0.003498	0.000721	0.001081	1.000000	-0.003196	...	-0.000350	-0.001368	-0.002554	0.002749	-0.003083	-0.000346	-0.001725	-0.006480	0.000364	0.000449
hiring_difficulty_score	0.000831	0.001893	-0.000759	-0.000395	0.002658	-0.001752	0.004533	0.004933	-0.003196	1.000000	...	-0.007393	0.000155	0.002565	0.000677	-0.002118	-0.004073	0.005938	-0.000692	0.004137	-0.004272
layoff_risk	-0.001771	-0.736261	-0.533439	-0.400488	0.008220	-0.001814	0.002535	-0.000504	-0.002779	-0.001128	...	0.008087	0.000863	-0.003503	-0.814724	-0.006072	-0.003127	-0.262194	0.002372	0.001555	-0.458603
ai_adoption_score	-0.002036	0.011152	0.006550	0.005741	0.003865	0.001092	0.006803	-0.003508	0.001911	-0.001640	...	-0.000192	0.002603	0.003411	-0.000275	0.632821	-0.012552	0.007184	-0.002058	-0.003565	0.001904
company_funding_billion	-0.001114	0.000836	-0.000362	0.000291	-0.004522	-0.006256	0.004978	-0.003454	-0.003849	0.001660	...	-0.002685	0.001386	0.003501	0.094822	0.618247	-0.007616	0.003884	0.000898	-0.000942	0.021581
economic_index	-0.000963	0.001017	-0.000094	0.001971	-0.009564	0.002703	0.005988	-0.000341	0.001014	-0.000781	...	0.004724	0.001852	-0.004481	0.001294	-0.000122	-0.003367	0.003939	-0.005454	0.000603	-0.000099
ai_maturity_years	0.003937	-0.001581	0.002797	0.003038	0.000855	0.000456	-0.000180	0.002273	-0.004978	-0.002163	...	0.000839	0.004087	-0.003905	-0.006132	-0.003650	0.001879	-0.002652	-0.000369	0.002854	0.004734
offer_acceptance_rate	-0.004582	0.001150	-0.000143	-0.000178	-0.000389	0.002936	-0.000754	-0.002149	0.006873	-0.006999	...	-0.001657	-0.001162	0.001691	0.006524	0.004740	-0.002458	0.000192	-0.003299	0.002349	0.002283
tax_rate_percent	-0.004382	-0.000862	-0.003189	-0.002669	0.002493	0.006862	0.001866	0.002092	0.002477	-0.003384	...	-0.001802	-0.004449	-0.005310	0.000042	0.001000	0.001760	-0.004266	0.001252	0.006345	0.002423
vacation_days	0.000019	-0.004695	-0.006457	-0.006002	-0.003531	0.000196	0.010298	-0.001946	-0.000350	-0.007393	...	1.000000	0.002513	-0.003492	-0.006496	-0.002325	-0.008081	-0.005202	-0.001229	0.003497	-0.010687
skill_demand_score	-0.003792	0.000309	0.001087	0.001459	0.004476	0.004989	-0.003575	-0.003076	-0.001368	0.000155	...	0.002513	1.000000	0.001631	-0.000336	0.001018	0.003236	-0.005853	-0.003002	0.001141	0.006355
automation_risk	0.004279	0.003161	0.002201	0.004375	-0.001391	0.004411	0.003114	0.002467	-0.002554	0.002565	...	-0.003492	0.001631	1.000000	0.006845	0.002760	-0.001923	0.001795	-0.004244	0.000978	0.000281
job_security_score	0.001153	0.599466	0.434177	0.323791	-0.008056	0.001453	-0.001098	0.000561	0.002749	0.000677	...	-0.006496	-0.000336	0.006845	1.000000	0.060844	0.103007	0.057297	-0.003836	-0.002466	0.484868
career_growth_score	-0.003507	0.007329	0.003107	0.003217	0.002056	-0.003908	0.007250	-0.004091	-0.003083	-0.002118	...	-0.002325	0.001018	0.002760	0.060844	1.000000	-0.012795	0.006078	-0.002568	-0.003714	0.014033
work_life_balance_score	0.006872	0.004165	0.003890	-0.000007	-0.002010	0.000372	-0.828129	-0.000381	-0.000346	-0.004073	...	-0.008081	0.003236	-0.001923	0.103007	-0.012795	1.000000	-0.151577	-0.000841	-0.006415	0.433110
promotion_speed	-0.000345	0.363093	0.261381	0.200770	-0.000859	0.000601	-0.004508	-0.006642	-0.001725	0.005938	...	-0.005202	-0.005853	0.001795	0.057297	0.006078	-0.151577	1.000000	-0.000017	-0.002123	0.087818
salary_percentile	-0.000167	-0.003814	-0.003327	-0.002921	-0.002871	-0.000705	0.001682	0.002993	-0.006480	-0.000692	...	-0.001229	-0.003002	-0.004244	-0.003836	-0.002568	-0.000841	-0.000017	1.000000	0.001764	-0.003411
cost_of_living_index	-0.003333	-0.000504	0.001462	0.002414	-0.000128	-0.002321	0.005979	0.002303	0.000364	0.004137	...	0.003497	0.001141	0.000978	-0.002466	-0.003714	-0.006415	-0.002123	0.001764	1.000000	-0.001090
employee_satisfaction	0.004867	0.517066	0.629030	0.471220	-0.005249	-0.002956	-0.340942	0.001647	0.000449	-0.004272	...	-0.010687	0.006355	0.000281	0.484868	0.014033	0.433110	0.087818	-0.003411	-0.001090	1.000000

Predicting Salary and Satisfaction in the AI/Data Workforce¶

Introduction¶

Data Curation and Preprocessing¶

Summary Statistics and Data Exploration¶

Summary Statistics¶

Exploratory Data Analysis¶

Conclusion 1: Outliers?¶

Conclusion 2: Hypothesis Testing¶

Conlusion 3: Linear Regression¶

Primary Analysis¶

ML Algorithm Design/Development¶

ML Algorithm Training and Test Data Analysis¶

Visualizations¶

Insights and Conclusion¶

References¶

	id	country	job_role	ai_specialization	experience_level	experience_years	salary_usd	bonus_usd	education_required	industry	...	vacation_days	skill_demand_score	automation_risk	job_security_score	career_growth_score	work_life_balance_score	promotion_speed	salary_percentile	cost_of_living_index	employee_satisfaction
0	1	UAE	Machine Learning Engineer	Reinforcement Learning	Entry	0	66465	5395	Master	Automotive	...	27	12	76	57	65	73	15	55	1.23	76
1	2	USA	AI Engineer	LLM	Entry	1	75507	11713	Bootcamp	Retail	...	27	54	29	69	60	51	15	58	0.87	67
2	3	Brazil	Research Scientist	Analytics	Entry	0	41660	5268	PhD	Healthcare	...	13	12	49	70	59	68	37	13	2.13	61
3	4	India	Software Engineer AI	Computer Vision	Senior	6	43268	7975	Diploma	Tech	...	30	80	47	79	65	55	46	74	1.49	56
4	5	Germany	Machine Learning Engineer	Computer Vision	Entry	0	69119	4758	Master	Retail	...	24	82	47	64	52	69	17	21	0.87	72

	0
id	int64
country	object
job_role	object
ai_specialization	object
experience_level	object
experience_years	int64
salary_usd	int64
bonus_usd	int64
education_required	object
industry	object
company_size	object
interview_rounds	int64
year	int64
work_mode	object
weekly_hours	float64
company_rating	float64
job_openings	int64
hiring_difficulty_score	float64
layoff_risk	float64
ai_adoption_score	int64
company_funding_billion	float64
economic_index	float64
ai_maturity_years	int64
offer_acceptance_rate	float64
tax_rate_percent	float64
vacation_days	int64
skill_demand_score	int64
automation_risk	int64
job_security_score	int64
career_growth_score	int64
work_life_balance_score	int64
promotion_speed	int64
salary_percentile	int64
cost_of_living_index	float64
employee_satisfaction	int64