-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlink_making.py
47 lines (39 loc) · 1.6 KB
/
link_making.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import h5py
import os
import json
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def create_graph_json(embedding_file, output_file, similarity_threshold=0.7):
    """Build a node/link graph from stored embeddings and write it as JSON.

    The HDF5 file is walked two levels deep: every entry found inside each
    top-level group becomes one node. Each entry must provide a ``"topic"``
    value (UTF-8 encoded bytes) and an ``"embeddings"`` array. Every pair of
    nodes whose cosine similarity is strictly greater than
    ``similarity_threshold`` is connected by a link.

    Args:
        embedding_file: Path of the HDF5 file to read.
        output_file: Path of the JSON file to write. Missing parent
            directories are created.
        similarity_threshold: Minimum cosine similarity (exclusive) for two
            nodes to be linked. The link ``value`` is the similarity minus
            this threshold, so it is always positive.

    The written JSON has the shape
    ``{"nodes": [{"id", "group"}, ...], "links": [{"source", "target", "value"}, ...]}``.
    """
    data = {"nodes": [], "links": []}
    filenames = []
    embeddings = []

    with h5py.File(embedding_file, "r") as f:
        for group in f.values():
            for entry_name, entry in group.items():
                # The topic is read back as bytes; decode it to text.
                topic = entry["topic"][()].decode("utf-8")
                # Only the first 10 elements along the first axis are used.
                # NOTE(review): presumably the leading embedding dimensions of
                # a 1-D vector - confirm against the writer of this file.
                embedding = entry["embeddings"][()][:10]

                # NOTE(review): entry names double as node ids; if the same
                # name can appear under two groups the ids collide - confirm
                # that names are unique across groups.
                filenames.append(entry_name)
                embeddings.append(embedding)
                data["nodes"].append({"id": entry_name, "group": topic})

    # cosine_similarity rejects an empty input, and a single node cannot have
    # links anyway, so only compute similarities when there is a pair.
    if len(filenames) > 1:
        similarities = cosine_similarity(embeddings)

        # Visit each unordered pair once (upper triangle, diagonal excluded).
        for i, source in enumerate(filenames):
            for j in range(i + 1, len(filenames)):
                # Convert the NumPy scalar to a plain float: json cannot
                # serialize np.float32, which is what float32 embeddings yield.
                score = float(similarities[i][j])
                if score > similarity_threshold:
                    data["links"].append(
                        {
                            "source": source,
                            "target": filenames[j],
                            "value": score - similarity_threshold,
                        }
                    )

    # Make sure the destination directory exists before writing.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
# Script entry: build the graph JSON for the page from the default embeddings
# file, linking nodes whose cosine similarity exceeds 0.75.
create_graph_json(
    embedding_file="embeddings.hdf5",
    output_file="page/graph.json",
    similarity_threshold=0.75,
)