-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathduck.sql
86 lines (75 loc) · 2.64 KB
/
duck.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
-- Create table with constraints
create or replace table acronyms (
id_acronym VARCHAR NOT NULL CHECK (id_acronym = UPPER(id_acronym)),
id_acronym_unique VARCHAR PRIMARY KEY CHECK (id_acronym_unique like id_acronym || '%'),
description VARCHAR UNIQUE
);
-- Load acronyms csv data into table
insert into acronyms (id_acronym, id_acronym_unique, description)
from
(
FROM read_csv('data/acronyms_optnc.csv',
header = true,
columns = {
'id_acronym': 'VARCHAR',
'id_acronym_unique': 'VARCHAR',
'description': 'VARCHAR'
})
);
-- Get a preveiw
from acronyms limit 5;
------------------------------------------------
-- Preserve order of rows
-- Prepare test environment
CREATE SEQUENCE seq_original START 1;
CREATE SEQUENCE seq_sorted START 1;
create or replace temp table orig_table as
select nextval('seq_original') as index,
id_acronym,
description
from acronyms;
create or replace temp table sorted_table as
select nextval('seq_sorted') as index,
id_acronym,
description
from (select id_acronym,
description
from acronyms
-- order by acronym and description
order by id_acronym, id_acronym_unique, description);
-- Check the resulting tables
from orig_table limit 5;
from sorted_table limit 5;
-- Create the table that compares the sorted and original tables columns
create or replace temp table test_sorted(
orig_id_acronym varchar,
orig_description varchar,
orig_index integer,
sorted_index integer
-- the magic part XD
check(orig_index = sorted_index)
);
-- Populate the comparison table
insert into test_sorted
select
orig_table.id_acronym as orig_id_acronym,
orig_table.description as orig_description,
orig_table.index as orig_index,
sorted_table.index as sorted_index,
from
orig_table,
sorted_table
where
orig_table.id_acronym = sorted_table.id_acronym
and orig_table.description = sorted_table.description
order by orig_table.index;
-- Check the resulting table
-- from test_sorted
-- where orig_index != sorted_index;
-- reporting des duplicats
from acronyms
select id_acronym,
count(*) as nb_occurrences
group by id_acronym
having nb_occurrences > 1
order by nb_occurrences desc;