This repository has been archived by the owner on Jul 7, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
69 lines (63 loc) · 2.68 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Functions relating to specific data processing."""
from collections import Counter, defaultdict
from collections.abc import Iterable
from csv import writer
from pickle import dump, load

from config import AMAZON_ALL_GENRE_FILE, AMAZON_FILE, AMAZON_RAW_OUTPUTS
def make_genre_csv() -> None:
    """Create a CSV listing each DVD ASIN and the genres it belongs to.

    The Amazon metadata file is scanned for the ASINs of DVD products (that
    set is cached as a pickle at ``AMAZON_RAW_OUTPUTS / "dvd_asin.bin"``),
    then scanned again to collect, per DVD, the entry that follows
    ``Genres[404276]`` on each category line.  The result is written to
    ``AMAZON_ALL_GENRE_FILE``: a header row of ``ASIN`` followed by every
    genre in sorted order, then one row per DVD holding ``1`` in the columns
    of its genres and an empty cell everywhere else.
    """
    dvd_asins = _load_or_build_dvd_asins()
    with open(AMAZON_FILE, mode="r", encoding="utf-8") as amazon_io:
        genre_dict, all_genres = _collect_genres(amazon_io, dvd_asins)

    all_genres_list = sorted(all_genres)
    with open(
        AMAZON_ALL_GENRE_FILE,
        mode="w",
        newline="",
        encoding="utf-8",
    ) as genre_io:
        csv_writer = writer(genre_io)
        csv_writer.writerow(["ASIN"] + all_genres_list)
        for asin, genre_set in genre_dict.items():
            # Set membership per column replaces a linear ``list.index``
            # search per genre.
            csv_writer.writerow(
                [asin, *(1 if genre in genre_set else "" for genre in all_genres_list)]
            )


def _load_or_build_dvd_asins() -> set[str]:
    """Return the ASINs of all DVD products, reusing the pickle cache if usable."""
    dvd_asin_file = AMAZON_RAW_OUTPUTS / "dvd_asin.bin"
    if dvd_asin_file.exists():
        try:
            # The cache is a pickle written by this function; unpickling is
            # only safe as long as nothing untrusted can replace that file.
            with open(dvd_asin_file, mode="rb") as dvd_file_io:
                return load(dvd_file_io)
        except EOFError:
            # An empty or truncated cache (e.g. left by an interrupted run)
            # is rebuilt below instead of failing on every later call.
            pass

    with open(AMAZON_FILE, mode="r", encoding="utf-8") as amazon_io:
        dvd_asins = _scan_dvd_asins(amazon_io)
    # Create the cache only after the scan succeeded, so a failed scan can
    # never leave an empty cache file behind.
    with open(dvd_asin_file, mode="wb") as dvd_file_io:
        dump(dvd_asins, dvd_file_io)
    return dvd_asins


def _scan_dvd_asins(lines: Iterable[str]) -> set[str]:
    """Return the ASIN of every record in *lines* that has a DVD group line."""
    # NOTE(review): the prefixes below are matched literally, leading
    # whitespace included -- confirm they match the data file's indentation.
    dvd_asins: set[str] = set()
    cur_asin = ""
    for line in lines:
        if line.startswith("ASIN: "):
            cur_asin = line.split()[1]
        elif line.startswith(" group: DVD"):
            dvd_asins.add(cur_asin)
    return dvd_asins


def _collect_genres(
    lines: Iterable[str], dvd_asins: set[str]
) -> tuple[dict[str, set[str]], set[str]]:
    """Return ``(genres per DVD ASIN, set of all genres seen)`` parsed from *lines*."""
    genre_dict: defaultdict[str, set[str]] = defaultdict(set)
    all_genres: set[str] = set()
    cur_asin = ""
    genre_flag = False  # True while inside a DVD's categories section
    for line in lines:
        if line.startswith("ASIN: "):
            cur_asin = line.split()[1]
            # A new record starts: never carry an open categories section
            # over from the previous one.
            genre_flag = False
            continue
        if cur_asin not in dvd_asins:
            continue
        if line.startswith(" categories: "):
            genre_flag = True
            # Guarantees a CSV row even for DVDs without any genre entry.
            genre_dict[cur_asin] = set()
        elif line.startswith(" reviews: "):
            genre_flag = False
        if not genre_flag:
            continue
        bar_split = line.split("|")
        # How genres are denoted: the field right after "Genres[404276]".
        # Require that field to exist (> 4) before indexing it.
        if len(bar_split) > 4 and bar_split[3] == "Genres[404276]":
            genre = bar_split[4].strip()
            all_genres.add(genre)
            genre_dict[cur_asin].add(genre)
    return genre_dict, all_genres