pipeline_fileRead-fileWrite.xml
<?xml version="1.0" encoding="UTF-8" ?>
<pipeline-def name="event-consolidation" description="This is the process for transforming event data" version="1.0.0">
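    <!-- Runtime settings. Assumed framework semantics: singleSparkSession controls whether all
         jobs share one SparkSession, and globalViewAsLocal lets global temp views be referenced
         by their plain (local) names in downstream SQL. -->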
    <settings>
        <singleSparkSession setting="false" />
        <globalViewAsLocal setting="true" />
    </settings>
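    <!-- Pipeline variables. staging_uri is fixed here; export_dir is presumably resolved from
         the events.output_dir entry supplied through the application configuration at submit time. -->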
    <variables>
        <variable name="staging_uri" value="/tmp/staging/events" />
        <variable name="export_dir" value="${events.output_dir}" />
    </variables>
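    <!-- Short actor aliases mapping to the reader, transformer and writer classes used by the actions below. -->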
    <aliases>
        <alias name="file-reader" type="com.qwshen.etl.source.FileReader" />
        <alias name="flat-reader" type="com.qwshen.etl.source.FlatFileReader" />
        <alias name="sql" type="com.qwshen.etl.transform.SqlTransformer" />
        <alias name="file-writer" type="com.qwshen.etl.sink.FileWriter" />
    </aliases>
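    <!-- The job reads the users CSV and the fixed-width train file, joins/derives features
         via SQL, and writes the result out as partitioned CSV. -->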
    <job name="prepare events-features">
        <action name="load users">
            <actor type="file-reader">
                <properties>
                    <format>csv</format>
                    <options>
                        <header>true</header>
                        <delimiter>,</delimiter>
                    </options>
                    <fileUri>${events.users_input}</fileUri>
                </properties>
            </actor>
            <output-view name="users" global="false" />
        </action>
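        <!-- Fixed-width read. Each ddlFieldsString entry is assumed to follow field:start-length type,
             e.g. user occupying 9 characters starting at position 1. -->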
        <action name="load train">
            <actor type="flat-reader">
                <properties>
                    <ddlFieldsString>user:1-9 string, event:10-10 long, timestamp:20-32 string, interested:52-1 int</ddlFieldsString>
                    <fileUri>${events.train_input}</fileUri>
                </properties>
            </actor>
            <output-view name="train" />
        </action>
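        <!-- Join the users and train views; the SQL script is resolved relative to ${application.scripts_uri}. -->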
        <action name="transform users-train">
            <actor type="sql">
                <properties>
                    <sqlFile>${application.scripts_uri}/transform-user-train.sql</sqlFile>
                </properties>
            </actor>
            <input-views>
                <view name="users" />
                <view name="train" />
            </input-views>
            <output-view name="features" />
        </action>
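        <!-- Write the features view as CSV, partitioned by gender and interested,
             overwriting any existing output under ${export_dir}. -->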
        <action name="write features">
            <actor type="file-writer">
                <properties>
                    <format>csv</format>
                    <options>
                        <header>true</header>
                        <maxRecordsPerFile>30000</maxRecordsPerFile>
                    </options>
                    <partitionBy>gender,interested</partitionBy>
                    <mode>overwrite</mode>
                    <fileUri>${export_dir}</fileUri>
                    <view>features</view>
                </properties>
            </actor>
            <input-views>
                <view name="features" />
            </input-views>
        </action>
    </job>
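    <!-- Staging: presumably writes the intermediate output of the listed action(s) under
         ${staging_uri} so the transform result can be inspected. -->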
    <staging>
        <uri>${staging_uri}</uri>
        <actions>
            <action name="transform users-train" />
        </actions>
    </staging>
</pipeline-def>