more updates to notebook, include tiers

lucasheriques · Sep 12, 2024 · c2f6216 · c2f6216
1 parent 5e98e59
commit c2f6216
Show file tree

Hide file tree

Showing 2 changed files with 472 additions and 52 deletions.
diff --git a/comp_scraping/analyze_data.py b/comp_scraping/analyze_data.py
@@ -11,12 +11,15 @@ def load_data(file_path):
 
 def clean_data(df):
     df['Total Compensation'] = df['Total Compensation'].str.replace('R$', '').str.replace(',', '').astype(float)
+    # Divide the total compensation by 12 to get monthly salary
+    df['Total Compensation'] = df['Total Compensation'] / 12
     df['Years of Experience'] = df['Years of Experience'].str.extract('(\d+)').astype(float)
     df['Years at Company'] = df['Years at Company'].str.extract('(\d+)').astype(float)
     return df
 
 def format_currency(value):
-    return locale.currency(value, grouping=True, symbol='R$')
+    # Round to 2 decimal places before formatting
+    return locale.currency(round(value, 2), grouping=True, symbol='R$')
 
 def categorize_experience(years):
     if years <= 3:
@@ -28,13 +31,50 @@ def categorize_experience(years):
     else:
         return '13+ years'
 
+def categorize_companies(df):
+    # Calculate median compensation for each company
+    company_median = df.groupby('Company')['Total Compensation'].median().sort_values(ascending=False)
+
+    # Calculate percentile thresholds
+    threshold_90 = company_median.quantile(0.90)
+    threshold_70 = company_median.quantile(0.70)
+
+    # Categorize companies into tiers
+    tier_3 = company_median[company_median > threshold_90].index.tolist()
+    tier_2 = company_median[(company_median > threshold_70) & (company_median <= threshold_90)].index.tolist()
+    tier_1 = company_median[company_median <= threshold_70].index.tolist()
+
+    return {
+        'Tier 3 (Global)': tier_3,
+        'Tier 2 (All Local)': tier_2,
+        'Tier 1 (Local)': tier_1
+    }
+
 def analyze_data(df):
     results = {}
 
     # Experience group analysis
     df['Experience Group'] = df['Years of Experience'].apply(categorize_experience)
-    exp_group_avg = df.groupby('Experience Group')['Total Compensation'].mean().sort_values(ascending=False)
-    results['Average Compensation by Experience Group'] = exp_group_avg.apply(format_currency).to_frame('Average Compensation')
+    exp_group_stats = df.groupby('Experience Group')['Total Compensation'].agg(['count', 'mean', 'median',
+                                                                                lambda x: x.quantile(0.75),
+                                                                                lambda x: x.quantile(0.90)])
+    exp_group_stats.columns = ['count', 'mean', 'p50', 'p75', 'p90']
+    exp_group_stats = exp_group_stats.sort_values('mean', ascending=False)
+
+    results['Compensation Stats by Experience Group'] = exp_group_stats.apply(lambda x: x.apply(format_currency) if x.name != 'count' else x).to_dict()
+
+    # Overall compensation stats
+    overall_stats = df['Total Compensation'].agg(['count', 'mean', 'median',
+                                                  lambda x: x.quantile(0.75),
+                                                  lambda x: x.quantile(0.90)])
+    overall_stats.index = ['count', 'mean', 'p50', 'p75', 'p90']
+
+    formatted_stats = overall_stats.copy()
+    for stat in ['mean', 'p50', 'p75', 'p90']:
+        formatted_stats[stat] = format_currency(formatted_stats[stat])
+    formatted_stats['count'] = int(formatted_stats['count'])
+
+    results['Overall Compensation Stats'] = formatted_stats.to_dict()
 
     # Top paying companies by experience group
     top_companies_by_group = {}
@@ -51,9 +91,6 @@ def analyze_data(df):
 
     results['Top Paying Companies by Experience Group'] = top_companies_by_group
 
-    # Highest average salary overall
-    results['Highest Average Salary Overall'] = format_currency(df['Total Compensation'].mean())
-
     # Experience vs Compensation correlation
     exp_corr = df['Years of Experience'].corr(df['Total Compensation'])
     results['Correlation (Years of Experience vs Total Compensation)'] = f"{exp_corr:.2f}"
@@ -62,13 +99,30 @@ def analyze_data(df):
     tenure_corr = df['Years at Company'].corr(df['Total Compensation'])
     results['Correlation (Years at Company vs Total Compensation)'] = f"{tenure_corr:.2f}"
 
+    # Add company tiers analysis
+    company_tiers = categorize_companies(df)
+    results['Company Tiers'] = company_tiers
+
+    # Add tier information to the dataframe
+    df['Company Tier'] = df['Company'].map({company: tier
+                                            for tier, companies in company_tiers.items()
+                                            for company in companies})
+
+    # Calculate average compensation by tier
+    tier_stats = df.groupby('Company Tier')['Total Compensation'].agg(['mean', 'median'])
+    tier_stats = tier_stats.sort_values('mean', ascending=False)
+    tier_stats = tier_stats.applymap(format_currency)
+    results['Compensation Stats by Company Tier'] = tier_stats.to_dict()
+
     # Convert results to a more notebook-friendly format
     formatted_results = {
-        'Average Compensation by Experience Group': results['Average Compensation by Experience Group'].to_dict()['Average Compensation'],
+        'Compensation Stats by Experience Group': results['Compensation Stats by Experience Group'],
+        'Overall Compensation Stats': results['Overall Compensation Stats'],
         'Top Paying Companies by Experience Group': {group: companies.to_dict() for group, companies in results['Top Paying Companies by Experience Group'].items()},
-        'Highest Average Salary Overall': results['Highest Average Salary Overall'],
         'Correlation (Years of Experience vs Total Compensation)': results['Correlation (Years of Experience vs Total Compensation)'],
         'Correlation (Years at Company vs Total Compensation)': results['Correlation (Years at Company vs Total Compensation)'],
+        'Company Tiers': results['Company Tiers'],  # Add this line
+        'Compensation Stats by Company Tier': results['Compensation Stats by Company Tier'],
     }
 
     return formatted_results