-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsql_db.session.sql
307 lines (260 loc) · 7.55 KB
/
sql_db.session.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
SELECT
job_title_short AS title,
job_location AS location,
job_posted_date AT TIME ZONE 'UTC' AT TIME ZONE 'EST' AS date_time,
EXTRACT(MONTH FROM job_posted_date) AS date_month
FROM job_postings_fact
LIMIT 5;
SELECT
COUNT(job_id) AS job_posted_count,
EXTRACT(MONTH FROM job_posted_date) AS date_month
FROM
job_postings_fact
WHERE
job_title_short = 'Data Analyst'
GROUP BY
date_month
ORDER BY
job_posted_count DESC;
-- 1-) Write a query to find the average salary both yearly(salary_year_avg) and hourly (salary_hour_avg) for job postings that were posted after June 1, 2023. Group the results by job schedule type.
SELECT
job_title,
job_schedule_type,
AVG(salary_year_avg) AS year_avg_salary,
AVG(salary_hour_avg) AS hour_avg_salary
FROM
job_postings_fact
WHERE
job_posted_date > '2023-06-01'
GROUP BY
job_schedule_type,
job_title;
-- 2-) Write a query to count the number of job postings for each month in 2023, adjusting the job_posted_date to be in 'America/New_York' time zone before extracting the month. Assume the job_posted_date is storen in UTC. Group by and order by the month.
SELECT
COUNT(job_id),
EXTRACT(MONTH FROM job_posted_date AT TIME ZONE 'UTC' AT TIME ZONE 'America/New_York') as date_month
FROM
job_postings_fact
WHERE
EXTRACT(YEAR FROM job_posted_date AT TIME ZONE 'UTC' AT TIME ZONE 'America/New_York') = 2023
GROUP BY
date_month
ORDER BY
date_month;
-- 3-) Write a query to find companies(include company name) that have posted jobs offering health insurance, where these postings were made in the second quarter of 2023. Use date extraction to filter by quarter.
SELECT
company.name,
jobs.job_title
FROM company_dim AS company
INNER JOIN job_postings_fact AS jobs ON company.company_id = jobs.company_id
WHERE
jobs.job_health_insurance = true AND
EXTRACT(YEAR FROM jobs.job_posted_date) = 2023 AND
EXTRACT(QUARTER FROM jobs.job_posted_date) = 2;
/*
- Create table From other tables
- Jan 2023 Jobs
- Feb 2023 Jobs
- Mar 2023 Jobs
*/
CREATE TABLE january_jobs AS
SELECT *
FROM
job_postings_fact
WHERE
EXTRACT(MONTH FROM job_posted_date) = 1 AND
EXTRACT(YEAR FROM job_posted_date) = 2023;
CREATE TABLE february_jobs AS
SELECT *
FROM
job_postings_fact
WHERE
EXTRACT(MONTH FROM job_posted_date) = 2 AND
EXTRACT(YEAR FROM job_posted_date) = 2023;
CREATE TABLE march_jobs AS
SELECT *
FROM
job_postings_fact
WHERE
EXTRACT(MONTH FROM job_posted_date) = 3 AND
EXTRACT(YEAR FROM job_posted_date) = 2023;
SELECT * FROM march_jobs;
/*
Label new column as follows:
- 'Anywhere' jobs as 'Remote',
- 'New York, NY' as 'Local',
- Otherwise 'Onsite'
*/
SELECT
COUNT(job_id) AS number_of_jobs,
CASE
WHEN job_location = 'Anywhere' THEN 'Remote'
WHEN job_location = 'New York, NY' THEN 'Local'
ELSE 'Onsite'
END AS location_category
FROM
job_postings_fact
WHERE
job_title_short = 'Data Analyst'
GROUP BY
location_category
ORDER BY
number_of_jobs DESC;
/* Want to categorize the salaries from each job posting. To see if it fits in my desired salary range.
- Put salary into different buckets
- Define what's a high, standart or low salary with our own conditions.
- Why? It is east to determine which job postings are worth looking at based on salary.
Bucketing is a common thing in data analysis when viewing categories.
- Look only for Data Analyst roles.
- Order from highest to lowest.
*/
SELECT
job_title,
CASE
WHEN salary_year_avg BETWEEN 0 AND 65000 THEN 'Low'
WHEN salary_year_avg BETWEEN 65000 AND 110000 THEN 'Standart'
ELSE 'High'
END
FROM
job_postings_fact
WHERE
job_title LIKE '%Analyst%'
ORDER BY
salary_year_avg DESC;
-- Subqueries and CTEs --
SELECT *
FROM(
SELECT *
FROM job_postings_fact
WHERE EXTRACT(MONTH FROM job_posted_date) = 1
) AS january_jobs;
WITH january_jobs AS (
SELECT *
FROM job_postings_fact
WHERE EXTRACT(MONTH FROM job_posted_date) = 1
)
SELECT *
FROM january_jobs;
-- Get the company names and ids that dont requires a degree for a job.
SELECT
company_id,
name AS company_name
FROM
company_dim
WHERE company_id IN(
SELECT company_id
FROM job_postings_fact
WHERE job_no_degree_mention = true
)
ORDER BY company_id;
/* Find the companies that have the most job openings.
- Get the total number of job postings per company id.
- Return the total number of jobs with the company name.
*/
WITH company_job_count AS(
SELECT
company_id,
COUNT(*) AS total_jobs
FROM
job_postings_fact
GROUP BY
company_id
)
SELECT
company.name AS company_name,
company_job_count.total_jobs
FROM company_dim AS company
LEFT JOIN company_job_count ON company.company_id = company_job_count.company_id
ORDER BY total_jobs DESC;
/*
Idendify the top 5 skills that are most frequently mentioned in job postings.
- Use a subquery to find the skill IDs with the highest counts in the skills_job_dim table and then join this results with the skills_dim table to get the skill names.
*/
SELECT
skill_id,
COUNT(*) as skill_count
FROM
skills_job_dim AS sjd
INNER JOIN job_postings_fact AS j ON sjd.job_id = j.job_id
WHERE sjd.skill_id IN(
SELECT skill_id
FROM skills_dim
)
GROUP BY
sjd.skill_id
ORDER BY skill_count DESC
LIMIT 5;
/* Find the count of the number of remote job postings per skill
- Display the top 5 skills by their demand in remote jobs
- Include skill id, name, and count of postings requiring the skill
*/
WITH remote_job_skills AS (
SELECT
skill_id,
COUNT(*) AS skill_count
FROM
skills_job_dim AS sjd
INNER JOIN job_postings_fact AS j ON sjd.job_id = j.job_id
WHERE
j.job_work_from_home = True
GROUP BY
skill_id
)
SELECT
s.skill_id,
s.skills,
skill_count
FROM remote_job_skills
INNER JOIN skills_dim AS s ON remote_job_skills.skill_id = s.skill_id
ORDER BY
skill_count DESC
LIMIT 5;
/*
- Find job postings from the first quarter that have a salary greater than $70K
- Combine job posting tables from the first quarter of 2023 (Jan-Mar)
- Get job postings with an average yearly salary > $70,000
*/
SELECT
quarter1_job_postings.job_title_short,
quarter1_job_postings.job_location,
quarter1_job_postings.job_via,
quarter1_job_postings.job_posted_date::DATE,
quarter1_job_postings.salary_year_avg
FROM(
SELECT *
FROM january_jobs
UNION ALL -- Don't want to lose any duplicates
SELECT *
FROM february_jobs
UNION ALL
SELECT *
FROM march_jobs
) AS quarter1_job_postings
WHERE
quarter1_job_postings.salary_year_avg > 70000 AND
quarter1_job_postings.job_title_short = 'Data Analyst'
ORDER BY
quarter1_job_postings.salary_year_avg DESC;
/*
- Get the corresponding skill and skill type for each job posting for q1.
- Include those without any skills too.
- Why? Look at the skills and the type for each job in the first quarter that has salary > $70,000
*/
SELECT
q1_jobs.job_title_short,
s.skills,
s.type
FROM (
SELECT *
FROM january_jobs
UNION ALL -- Don't want to lose any duplicates
SELECT *
FROM february_jobs
UNION ALL
SELECT *
FROM march_jobs
) AS q1_jobs
INNER JOIN skills_job_dim AS sjd ON sjd.job_id = q1_jobs.job_id
RIGHT JOIN skills_dim AS s ON s.skill_id = sjd.skill_id -- Including data without skills.
WHERE
q1_jobs.salary_year_avg > 110000;