-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.ts
186 lines (175 loc) · 6.74 KB
/
config.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import { startCase } from 'lodash';
import {
createActorConfig,
createActorInputSchema,
createBooleanField,
createIntegerField,
createStringField,
Field,
ActorInputSchema,
createActorOutputSchema,
} from 'apify-actor-config';
import { AllActorInputs, allActorInputs } from 'crawlee-one';
import { DATASET_TYPE, DatasetType, EmploymentType, SalaryPeriod, WorkFromHomeType } from './types';
import actorSpec from './actorspec';
import { SALARY_PERIOD } from './types';
import { EMPLOYMENT_TYPE } from './types';
import { WORK_FROM_HOME_TYPE } from './types';
/**
 * Builds a wrapper function for a given HTML tag name.
 *
 * @param tag Tag name without angle brackets, e.g. `strong`.
 * @returns A function that surrounds its text argument with `<tag>` and `</tag>`.
 *   The text is inserted as-is (no escaping).
 */
function createTagFn(tag: string): (t: string) => string {
  const opening = '<' + tag + '>';
  const closing = '</' + tag + '>';
  return function wrapInTag(t: string): string {
    return opening + t + closing;
  };
}

/** Wraps the given text in `<strong>…</strong>`. */
const strong = createTagFn('strong');

/**
 * Produces `<br/>` repeated `repeats` times (once by default).
 *
 * @param repeats Number of line breaks to emit; passed straight to `String.prototype.repeat`.
 */
function newLine(repeats = 1): string {
  const lineBreak = '<br/>';
  return lineBreak.repeat(repeats);
}
/**
 * Input options specific to this actor.
 *
 * Every member is optional. Each key has a matching field definition in
 * `customActorInput`, which is typed as `Record<keyof CustomActorInput, Field>`.
 */
export interface CustomActorInput {
/**
 * Choose what kind of data you want to extract - job offers, list of companies, list of industries, ...
 *
 * Use this option to scrape a whole dataset rather than specific URLs.
 * It is ignored if Start URLs are given.
 */
datasetType?: DatasetType;
/**
 * If checked, the scraper will obtain more detailed info for job offers by visiting
 * the details page of each job offer. If un-checked, only the data from the listing
 * page is extracted.
 *
 * For details, please refer to http://apify.com/store/jurooravec/profesia-sk-scraper#output
 */
jobOfferDetailed?: boolean;
/** Comma-separated list of keywords. If given, only entries matching the keywords will be retrieved (full-text search). */
jobOfferFilterQuery?: string;
/** If set, only entries offering this much or more will be extracted. */
jobOfferFilterMinSalaryValue?: number;
/** Choose whether the minimum salary is given per hour or per month. */
jobOfferFilterMinSalaryPeriod?: SalaryPeriod;
/** If set, only entries with this type of employment will be extracted. */
jobOfferFilterEmploymentType?: EmploymentType;
/** If set, only entries with this type of remote work will be extracted. */
jobOfferFilterRemoteWorkType?: WorkFromHomeType;
/** If set, only entries up to this many days old will be extracted. E.g. 7 = max 1 week old, 31 = max 1 month old, ... */
jobOfferFilterLastNDays?: number;
/** If checked, no data is extracted. Instead, the count of matched job offers is printed in the log. */
jobOfferCountOnly?: boolean;
}
/**
 * Shape of the data passed to the actor from Apify.
 *
 * Combines the common input fields (`AllActorInputs`) with the fields
 * specific to this actor (`CustomActorInput`); it declares no members of its own.
 */
export interface ActorInput extends AllActorInputs, CustomActorInput {}
/**
 * Input-schema field definitions for the actor-specific options.
 *
 * Typed as `Record<keyof CustomActorInput, Field>`, so every key of
 * `CustomActorInput` must have a field definition here. The object is spread
 * into the `properties` of the input schema, ahead of the common `allActorInputs`.
 *
 * NOTE: The multi-line template literals below embed their line breaks verbatim
 * in the resulting description strings, so re-wrapping or re-indenting those
 * lines changes the emitted text.
 */
const customActorInput: Record<keyof CustomActorInput, Field> = {
// Which dataset to scrape. Allowed values come from DATASET_TYPE; the titles
// are the same values passed through lodash `startCase`.
datasetType: createStringField<DatasetType>({
type: 'string',
title: 'Dataset type',
description: `Use this option if you want to scrape a whole dataset,
not just specific URLs.${newLine(2)}
This option is ignored if ${strong('Start URLs:')} are given`,
editor: 'select',
example: 'jobOffers',
default: 'jobOffers',
prefill: 'jobOffers',
enum: DATASET_TYPE,
enumTitles: DATASET_TYPE.map(startCase),
nullable: true,
}),
// Whether to visit each job offer's details page (defaults to true).
jobOfferDetailed: createBooleanField({
title: 'Detailed',
type: 'boolean',
description: `If checked, the scraper will obtain more detailed info
for job offers by visit the details page of each job offer.${newLine(2)}
If un-checked, only the data from the listing page is extracted.${newLine(2)}
For details, please refer to ${actorSpec.actor.publicUrl}#output`,
example: true,
default: true,
// NOTE(review): presumably opens the "Job Offer Filters" section in the input UI,
// starting at this field - confirm against the Apify input schema docs.
sectionCaption: 'Job Offer Filters',
sectionDescription: `These filters are applied ${strong('ONLY')} when scraping job offers`,
nullable: true,
}),
// Free-text, comma-separated keyword filter.
jobOfferFilterQuery: createStringField({
type: 'string',
title: 'Search keywords (full-text search)',
description: `Comma-separated list of keywords. If given, only entries
matching the keywords will be retrieved (full-text search)`,
example: 'specialist, Bratislava',
editor: 'textfield',
nullable: true,
}),
// Lower bound on the offered salary; the schema requires a value of at least 1.
jobOfferFilterMinSalaryValue: createIntegerField({
title: 'Min salary',
type: 'integer',
description: 'If set, only entries offering this much or more will be extracted',
example: 1000,
minimum: 1,
nullable: true,
}),
// Period the minimum salary refers to. Allowed values come from SALARY_PERIOD;
// each title is the value prefixed with "Per ".
jobOfferFilterMinSalaryPeriod: createStringField<SalaryPeriod>({
title: 'Min salary per hour/month',
type: 'string',
description: 'Choose if the minimum salary is in per hour or per month format',
editor: 'select',
example: 'month',
default: 'month',
prefill: 'month',
enum: SALARY_PERIOD,
enumTitles: SALARY_PERIOD.map((s) => `Per ${s}`),
nullable: true,
}),
// Employment-type filter. Allowed values come from EMPLOYMENT_TYPE; no default is set.
jobOfferFilterEmploymentType: createStringField<EmploymentType>({
title: 'Type of employment',
type: 'string',
description: 'If set, only entries with this employment filter will be extracted',
editor: 'select',
example: 'fte',
enum: EMPLOYMENT_TYPE,
enumTitles: EMPLOYMENT_TYPE.map(startCase),
nullable: true,
}),
// Remote-work filter. Allowed values come from WORK_FROM_HOME_TYPE; no default is set.
jobOfferFilterRemoteWorkType: createStringField<WorkFromHomeType>({
title: 'Remote vs On-site',
type: 'string',
description: 'If set, only entries with this type of remote work filter will be extracted',
editor: 'select',
example: 'fullRemote',
enum: WORK_FROM_HOME_TYPE,
enumTitles: WORK_FROM_HOME_TYPE.map(startCase),
nullable: true,
}),
// Maximum age of an entry in days; the schema allows 0 as the smallest value.
jobOfferFilterLastNDays: createIntegerField({
title: 'Last N days',
type: 'integer',
description: `If set, only entries up to this much days old will be extracted.
E.g. 7 = max 1 week old, 31 = max 1 month old, ...`,
example: 10,
minimum: 0,
nullable: true,
}),
// Troubleshooting switch: report the number of matched job offers instead of extracting data.
jobOfferCountOnly: createBooleanField({
title: 'Count the matched job offers',
type: 'boolean',
description: `If checked, no data is extracted. Instead, the count of matched
job offers is printed in the log.`,
default: false,
// NOTE(review): this field uses `groupCaption`/`groupDescription`, whereas
// `jobOfferDetailed` uses `sectionCaption`/`sectionDescription` - confirm that
// both spellings are supported by the input schema.
groupCaption: 'Troubleshooting options',
groupDescription: 'Use these to verify that your custom startUrls are correct',
nullable: true,
}),
};
// Customize the default options.
// The prefill for the request handler timeout is overridden with 180
// (presumably seconds, i.e. 3 minutes, going by the field name).
// NOTE: this mutates the `allActorInputs` object imported from 'crawlee-one'.
const REQUEST_HANDLER_TIMEOUT_PREFILL = 3 * 60;
allActorInputs.requestHandlerTimeoutSecs.prefill = REQUEST_HANDLER_TIMEOUT_PREFILL;
/**
 * Top-level description of the input schema (HTML fragment).
 * The line breaks inside the template literal are part of the string.
 */
const inputSchemaDescription = `Configure the ${actorSpec.actor.title}. ${newLine(2)}
${strong('NOTE:')} Either ${strong('Dataset type')} or
${strong('Start URLs')} must be given.`;

/**
 * Input schema of the actor.
 *
 * The properties consist of the actor-specific fields followed by the common
 * fields from `allActorInputs`. Because the common fields are spread last,
 * they would take precedence if a key were defined in both.
 */
const inputSchema = createActorInputSchema<ActorInputSchema<Record<keyof ActorInput, Field>>>({
  schemaVersion: 1,
  title: actorSpec.actor.title,
  description: inputSchemaDescription,
  type: 'object',
  properties: {
    // Actor-specific fields first ...
    ...customActorInput,
    // ... then the common fields shared across actors.
    ...allActorInputs,
  },
});
/**
 * Output schema used for the actor's dataset storage.
 * Declares actor specification version 1 and leaves both `fields` and `views` empty.
 */
const outputSchema = createActorOutputSchema({
  actorSpecification: 1,
  fields: {},
  views: {},
});
/**
 * Complete actor configuration.
 *
 * Name, title and description are taken from `actorSpec`; the input schema and
 * the dataset (output) schema are the ones defined earlier in this module.
 */
const config = createActorConfig({
  actorSpecification: 1,
  name: actorSpec.platform.actorId,
  title: actorSpec.actor.title,
  description: actorSpec.actor.shortDesc,
  version: '1.0',
  dockerfile: './Dockerfile',
  input: inputSchema,
  storages: { dataset: outputSchema },
});

export default config;