Skip to content

Commit

Permalink
more updates to notebook, include tiers
Browse files Browse the repository at this point in the history
  • Loading branch information
lucasheriques committed Sep 12, 2024
1 parent 5e98e59 commit c2f6216
Show file tree
Hide file tree
Showing 2 changed files with 472 additions and 52 deletions.
70 changes: 62 additions & 8 deletions comp_scraping/analyze_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@ def load_data(file_path):

def clean_data(df):
df['Total Compensation'] = df['Total Compensation'].str.replace('R$', '').str.replace(',', '').astype(float)
# Divide the total compensation by 12 to get monthly salary
df['Total Compensation'] = df['Total Compensation'] / 12
df['Years of Experience'] = df['Years of Experience'].str.extract('(\d+)').astype(float)
df['Years at Company'] = df['Years at Company'].str.extract('(\d+)').astype(float)
return df

def format_currency(value):
return locale.currency(value, grouping=True, symbol='R$')
# Round to 2 decimal places before formatting
return locale.currency(round(value, 2), grouping=True, symbol='R$')

def categorize_experience(years):
if years <= 3:
Expand All @@ -28,13 +31,50 @@ def categorize_experience(years):
else:
return '13+ years'

def categorize_companies(df):
# Calculate median compensation for each company
company_median = df.groupby('Company')['Total Compensation'].median().sort_values(ascending=False)

# Calculate percentile thresholds
threshold_90 = company_median.quantile(0.90)
threshold_70 = company_median.quantile(0.70)

# Categorize companies into tiers
tier_3 = company_median[company_median > threshold_90].index.tolist()
tier_2 = company_median[(company_median > threshold_70) & (company_median <= threshold_90)].index.tolist()
tier_1 = company_median[company_median <= threshold_70].index.tolist()

return {
'Tier 3 (Global)': tier_3,
'Tier 2 (All Local)': tier_2,
'Tier 1 (Local)': tier_1
}

def analyze_data(df):
results = {}

# Experience group analysis
df['Experience Group'] = df['Years of Experience'].apply(categorize_experience)
exp_group_avg = df.groupby('Experience Group')['Total Compensation'].mean().sort_values(ascending=False)
results['Average Compensation by Experience Group'] = exp_group_avg.apply(format_currency).to_frame('Average Compensation')
exp_group_stats = df.groupby('Experience Group')['Total Compensation'].agg(['count', 'mean', 'median',
lambda x: x.quantile(0.75),
lambda x: x.quantile(0.90)])
exp_group_stats.columns = ['count', 'mean', 'p50', 'p75', 'p90']
exp_group_stats = exp_group_stats.sort_values('mean', ascending=False)

results['Compensation Stats by Experience Group'] = exp_group_stats.apply(lambda x: x.apply(format_currency) if x.name != 'count' else x).to_dict()

# Overall compensation stats
overall_stats = df['Total Compensation'].agg(['count', 'mean', 'median',
lambda x: x.quantile(0.75),
lambda x: x.quantile(0.90)])
overall_stats.index = ['count', 'mean', 'p50', 'p75', 'p90']

formatted_stats = overall_stats.copy()
for stat in ['mean', 'p50', 'p75', 'p90']:
formatted_stats[stat] = format_currency(formatted_stats[stat])
formatted_stats['count'] = int(formatted_stats['count'])

results['Overall Compensation Stats'] = formatted_stats.to_dict()

# Top paying companies by experience group
top_companies_by_group = {}
Expand All @@ -51,9 +91,6 @@ def analyze_data(df):

results['Top Paying Companies by Experience Group'] = top_companies_by_group

# Highest average salary overall
results['Highest Average Salary Overall'] = format_currency(df['Total Compensation'].mean())

# Experience vs Compensation correlation
exp_corr = df['Years of Experience'].corr(df['Total Compensation'])
results['Correlation (Years of Experience vs Total Compensation)'] = f"{exp_corr:.2f}"
Expand All @@ -62,13 +99,30 @@ def analyze_data(df):
tenure_corr = df['Years at Company'].corr(df['Total Compensation'])
results['Correlation (Years at Company vs Total Compensation)'] = f"{tenure_corr:.2f}"

# Add company tiers analysis
company_tiers = categorize_companies(df)
results['Company Tiers'] = company_tiers

# Add tier information to the dataframe
df['Company Tier'] = df['Company'].map({company: tier
for tier, companies in company_tiers.items()
for company in companies})

# Calculate average compensation by tier
tier_stats = df.groupby('Company Tier')['Total Compensation'].agg(['mean', 'median'])
tier_stats = tier_stats.sort_values('mean', ascending=False)
tier_stats = tier_stats.applymap(format_currency)
results['Compensation Stats by Company Tier'] = tier_stats.to_dict()

# Convert results to a more notebook-friendly format
formatted_results = {
'Average Compensation by Experience Group': results['Average Compensation by Experience Group'].to_dict()['Average Compensation'],
'Compensation Stats by Experience Group': results['Compensation Stats by Experience Group'],
'Overall Compensation Stats': results['Overall Compensation Stats'],
'Top Paying Companies by Experience Group': {group: companies.to_dict() for group, companies in results['Top Paying Companies by Experience Group'].items()},
'Highest Average Salary Overall': results['Highest Average Salary Overall'],
'Correlation (Years of Experience vs Total Compensation)': results['Correlation (Years of Experience vs Total Compensation)'],
'Correlation (Years at Company vs Total Compensation)': results['Correlation (Years at Company vs Total Compensation)'],
'Company Tiers': results['Company Tiers'], # Add this line
'Compensation Stats by Company Tier': results['Compensation Stats by Company Tier'],
}

return formatted_results
Expand Down
Loading

0 comments on commit c2f6216

Please sign in to comment.