import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the full salaries dataset from the local CSV export.
df = pd.read_csv("C:/Users/lucy8/Downloads/Latest_Data_Science_Salaries.csv")

# Keep only the rows whose 'Year' is 2023; most later cells work on this subset.
is_2023 = df['Year'].eq(2023)
df_23 = df.loc[is_2023]


# Quick structural overview of the 2023 subset: dimensions, dtypes, and
# missing values per column.
print(df_23.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df_23.info()
print(df_23.isnull().sum())

(1996, 11)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 2036
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Job Title           1996 non-null   object
 1   Employment Type     1996 non-null   object
 2   Experience Level    1996 non-null   object
 3   Expertise Level     1996 non-null   object
 4   Salary              1996 non-null   int64 
 5   Salary Currency     1996 non-null   object
 6   Company Location    1996 non-null   object
 7   Salary in USD       1996 non-null   int64 
 8   Employee Residence  1996 non-null   object
 9   Company Size        1996 non-null   object
 10  Year                1996 non-null   int64 
dtypes: int64(3), object(8)
memory usage: 187.1+ KB
None
Job Title             0
Employment Type       0
Experience Level      0
Expertise Level       0
Salary                0
Salary Currency       0
Company Location      0
Salary in USD         0
Employee Residence    0
Company Size          0
Year                  0
dtype: int64


# Bare expression: in a notebook cell this renders the 2023 subset as a table;
# it has no effect when the file is run as a plain script.
df_23


# Histogram of 2023 salaries (already converted to USD), split into 10 bins.
fig, ax = plt.subplots()
ax.hist(
    df_23['Salary in USD'],
    bins=10,
    color='skyblue',
    edgecolor='black',
)
ax.set_title('Salary Distribution in USD')
ax.set_xlabel('USD')
ax.set_ylabel('Frequency')

plt.show()


# Distinct job titles in the 2023 subset, in order of first appearance.
unique_job_titles = df_23['Job Title'].unique()
num_unique_job_titles = len(unique_job_titles)

print(num_unique_job_titles)

# value_counts() orders titles by descending frequency, whereas unique()
# keeps appearance order. Bar heights and tick labels must therefore both be
# taken from the same value_counts() result, otherwise bars are mislabelled.
title_counts = df_23['Job Title'].value_counts()
bar_positions = range(len(title_counts))

plt.figure(figsize=(18, 6))
plt.bar(bar_positions, title_counts.values)
plt.xticks(bar_positions, title_counts.index, rotation=90)
plt.xlabel('Job Title')
plt.ylabel('Count')
plt.title('Number of Unique Job Titles')
plt.tight_layout()
plt.show()

90


# Number of 2023 rows per job title; reused by the later "top N" box plots.
job_counts = df_23['Job Title'].value_counts()
print(job_counts)

Data Engineer                    413
Data Scientist                   353
Data Analyst                     282
Machine Learning Engineer        202
Analytics Engineer                85
                                ... 
Autonomous Vehicle Technician      1
Data Engineer 2                    1
Analytics Engineering Manager      1
AWS Data Architect                 1
Data Analytics Lead                1
Name: Job Title, Length: 90, dtype: int64


# The 20 most frequent job titles. job_counts comes from value_counts(), so
# it is already sorted by descending frequency; a slice of [:20] is needed to
# get 20 entries (the previous [:19] yielded only 19).
job_20 = job_counts.iloc[:20]

plt.figure(figsize=(10, 6))
sns.set_style('whitegrid')
# Plot the USD-converted column: the raw 'Salary' column is expressed in each
# row's own 'Salary Currency', so its values are not comparable across rows.
sns.boxplot(x=df_23['Job Title'], y=df_23['Salary in USD'], order=job_20.index)
plt.xlabel('Job Title')
plt.xticks(rotation=90)
plt.ylabel('Salary (USD)')
plt.title('Salary Distribution for Top 20 Job Titles in 2023')
plt.tight_layout()
plt.show()


# The 15 most frequent job titles. job_counts comes from value_counts(), so
# it is already sorted by descending frequency; a slice of [:15] is needed to
# get 15 entries (the previous [:14] yielded only 14).
job_15 = job_counts.iloc[:15]


plt.figure(figsize=(10, 6))
sns.set_style('whitegrid')

# Plot the USD-converted column: the raw 'Salary' column is expressed in each
# row's own 'Salary Currency', so its values are not comparable across rows.
sns.boxplot(x=df_23['Job Title'], y=df_23['Salary in USD'], order=job_15.index)
plt.xlabel('Job Title')
plt.xticks(rotation=90)
plt.ylabel('Salary (USD)')
plt.title('Salary Distribution for Top 15 Job Titles in 2023')
plt.tight_layout()
plt.show()


# Mean / median / max / min USD salary for every job title.
# NOTE(review): this groups the full `df` (all years), not the 2023-only
# `df_23` used by the surrounding cells -- confirm that is intended.
usd_by_title = df.groupby('Job Title')['Salary in USD']
salary_stats_by_job = usd_by_title.agg(['mean', 'median', 'max', 'min']).reset_index()
salary_stats_by_job


# Mean 2023 salary per experience level and per expertise level.
# Both axes are labelled "Salary (USD)", so the USD-converted column is
# averaged; the raw 'Salary' column is in each row's own currency and would
# not match the label.
experience_salary_grouped = df_23.groupby('Experience Level')['Salary in USD'].mean().reset_index()
expertise_salary_grouped = df_23.groupby('Expertise Level')['Salary in USD'].mean().reset_index()

plt.figure(figsize=(12, 6))

# Left panel: average salary by experience level.
plt.subplot(1, 2, 1)
sns.barplot(x='Experience Level', y='Salary in USD', data=experience_salary_grouped)
plt.xlabel('Experience Level')
plt.ylabel('Salary (USD)')
plt.title('Average Salary by Experience Level')

# Right panel: average salary by expertise level.
plt.subplot(1, 2, 2)
sns.barplot(x='Expertise Level', y='Salary in USD', data=expertise_salary_grouped)
plt.xlabel('Expertise Level')
plt.ylabel('Salary (USD)')
plt.title('Average Salary by Expertise Level')

plt.tight_layout()
plt.show()


import matplotlib.pyplot as plt
# Discard rows without a recorded employee residence before counting.
filtered_df = df_23.dropna(subset=['Employee Residence'])

# Rows per residence; the length of the result is the number of distinct
# residences, which is reported in the chart title.
residence_counts = filtered_df['Employee Residence'].value_counts()
total_unique_residences = len(residence_counts)

fig, ax = plt.subplots(figsize=(12, 6))
residence_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Employee Residence')
ax.set_ylabel('Count')
ax.set_title(f'Number of Unique Employee Residences ({total_unique_residences} unique)')
fig.tight_layout()
plt.show()


# Same residence bar chart, leaving out rows whose residence is
# 'United States'.
outside_us = df_23['Employee Residence'].ne('United States')
filtered_df = df_23[outside_us]

residence_counts = filtered_df['Employee Residence'].value_counts()
total_unique_residences = len(residence_counts)

fig, ax = plt.subplots(figsize=(12, 6))
residence_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Employee Residence')
ax.set_ylabel('Count')
ax.set_title(f'Number of Unique Employee Residences excluding "United States" ({total_unique_residences} unique)')
fig.tight_layout()
plt.show()


# Same residence bar chart, leaving out rows whose residence is either
# 'United Kingdom' or 'United States'.
excluded_residences = ['United Kingdom', 'United States']
filtered_df = df_23[~df_23['Employee Residence'].isin(excluded_residences)]

residence_counts = filtered_df['Employee Residence'].value_counts()
total_unique_residences = len(residence_counts)

fig, ax = plt.subplots(figsize=(12, 6))
residence_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Employee Residence')
ax.set_ylabel('Count')
ax.set_title(f'Number of Unique Employee Residences excluding "United Kingdom" and "United States" ({total_unique_residences} unique)')
fig.tight_layout()
plt.show()


# Bar chart of 2023 salary by company size (one bar per 'Company Size' value).
plt.figure(figsize=(10, 6))

# The y-axis is labelled "Salary (USD)", so plot the USD-converted column;
# the raw 'Salary' column is in each row's own currency and would not match.
sns.barplot(x='Company Size', y='Salary in USD', data=df_23)
plt.xlabel('Company Size')
plt.ylabel('Salary (USD)')
plt.title('Average Salary by Company Size')
plt.tight_layout()
plt.show()


# Narrow df_23 to the columns used from here on.
# Note that this rebinds df_23, so the other columns are no longer available
# on it after this point.
columns = ["Job Title", "Experience Level", "Expertise Level", "Salary in USD", "Company Size"]
df_23 = pd.DataFrame(df_23, columns=columns)

# One row per job title, one column per (experience, expertise) pair; each
# cell aggregates 'Salary in USD' with pivot_table's default aggregation.
heatmap_data = df_23.pivot_table(
    values="Salary in USD",
    index="Job Title",
    columns=["Experience Level", "Expertise Level"],
)

fig, ax = plt.subplots(figsize=(12, 20))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".0f",
    cmap="YlGnBu",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Salary in USD by Job Title, Experience Level, and Expertise Level")
ax.set_xlabel("Experience Level, Expertise Level")
ax.set_ylabel("Job Title")
plt.show()


from sklearn.preprocessing import LabelEncoder

# Replace each categorical column of df_23 with integer codes (in place) so
# that DataFrame.corr() can include it. The single encoder is re-fitted per
# column, so afterwards it only holds the classes of the last one.
# NOTE(review): the codes are arbitrary integers per category, not a
# meaningful ordering -- confirm these correlations are interpretable.
label_encoder = LabelEncoder()
for categorical_column in ('Job Title', 'Experience Level', 'Expertise Level', 'Company Size'):
    df_23[categorical_column] = label_encoder.fit_transform(df_23[categorical_column])

# Pairwise correlation between the encoded columns and the USD salary.
correlation_columns = ['Job Title', 'Experience Level', 'Expertise Level', 'Salary in USD', 'Company Size']
corr_matrix = df_23[correlation_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

	Job Title	mean	median	max	min
0	AI Architect	237484.000000	209968.0	330000	200000
1	AI Developer	133177.777778	108000.0	275000	60000
2	AI Programmer	56021.750000	55000.0	74087	40000
3	AI Scientist	114117.333333	90919.0	417937	18053
4	AWS Data Architect	258000.000000	258000.0	258000	258000
...	...	...	...	...	...
106	Sales Data Analyst	60000.000000	60000.0	60000	60000
107	Software Data Engineer	111627.666667	74883.0	210000	50000
108	Staff Data Analyst	97499.000000	97499.0	179998	15000
109	Staff Data Scientist	134500.000000	134500.0	164000	105000
110	Staff Machine Learning Engineer	185000.000000	185000.0	185000	185000

	Job Title	Employment Type	Experience Level	Expertise Level	Salary	Salary Currency	Company Location	Salary in USD	Employee Residence	Company Size	Year
0	Data Engineer	Full-Time	Senior	Expert	210000	United States Dollar	United States	210000	United States	Medium	2023
1	Data Engineer	Full-Time	Senior	Expert	165000	United States Dollar	United States	165000	United States	Medium	2023
2	Data Engineer	Full-Time	Senior	Expert	185900	United States Dollar	United States	185900	United States	Medium	2023
3	Data Engineer	Full-Time	Senior	Expert	129300	United States Dollar	United States	129300	United States	Medium	2023
4	Data Scientist	Full-Time	Senior	Expert	140000	United States Dollar	United States	140000	United States	Medium	2023
...	...	...	...	...	...	...	...	...	...	...	...
2032	Machine Learning Engineer	Full-Time	Mid	Intermediate	52000	British Pound Sterling	United Kingdom	63980	United Kingdom	Medium	2023
2033	Machine Learning Engineer	Full-Time	Mid	Intermediate	48000	British Pound Sterling	United Kingdom	59059	United Kingdom	Medium	2023
2034	Machine Learning Engineer	Full-Time	Mid	Intermediate	38000	British Pound Sterling	United Kingdom	46755	United Kingdom	Medium	2023
2035	Data Architect	Full-Time	Senior	Expert	110000	United States Dollar	United States	110000	United States	Medium	2023
2036	Data Engineer	Full-Time	Senior	Expert	236000	United States Dollar	United States	236000	United States	Medium	2023