forked from evanc577/sourcecatcher
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen_phashes.py
75 lines (61 loc) · 2.14 KB
/
gen_phashes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from multiprocessing import Pool, TimeoutError, cpu_count
from pathlib import Path
from PIL import Image
from annoy import AnnoyIndex
import imagehash
import os
import yaml
import sqlite3
import numpy as np
import sys
import sqlite3
def insert_phash(files):
    """Compute the perceptual hash (phash) of a single image.

    Takes a single tuple argument so it can be used directly as the worker
    function of ``Pool.imap``.

    Args:
        files: an ``(index, filename)`` pair. ``index`` is the integer id
            assigned to the image and ``filename`` is the path of the image
            file to hash.

    Returns:
        A tuple ``(phash_arr, basename, dirname, index)`` where ``phash_arr``
        is the flattened hash array produced by ``imagehash.phash`` and
        ``basename`` / ``dirname`` are the file-name and directory parts of
        ``filename``.
    """
    i, filename = files[0], files[1]

    # Open the image in a context manager so its file handle is released as
    # soon as the hash is computed; pool workers process many images each and
    # would otherwise keep handles open until garbage collection.
    with Image.open(filename) as img:
        phash = imagehash.phash(img)
    phash_arr = phash.hash.flatten()

    # str(phash) is the hex representation of the hash; convert it to an int
    # so it can be printed with a fixed minimum width.
    print('file #{:08d}, phash: {:08x}, filename: {}'.format(i, int(str(phash), 16), filename))

    basename = os.path.basename(filename)
    dirname = os.path.dirname(filename)
    return phash_arr, basename, dirname, i
def gen_phash():
    """Calculate the phashes of all images and store them in a searchable index.

    Steps performed:

    1. Load ``config.yaml`` from the directory containing this script and
       check that all required keys are present.
    2. Drop and recreate the ``annoy`` table in
       ``working/twitter_scraper.db``.
    3. Hash every file matching ``*/*/*.jpg`` under the configured
       ``media_dir`` using a process pool, adding each hash to an Annoy
       index and recording ``(filename, path, idx)`` in the ``annoy`` table.
    4. Build the Annoy index and save it to ``working/phash_index.ann``.

    The database and index paths are relative to the current working
    directory, whereas ``config.yaml`` is located relative to this script.

    Exits the process with status 1 if the config file cannot be loaded or
    does not contain the required keys.
    """
    # parse config.yaml
    try:
        dirpath = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(dirpath, 'config.yaml')
        with open(path) as f:
            config = yaml.safe_load(f)
    except (IOError, yaml.YAMLError):
        # Unreadable file or malformed YAML: both mean the config is unusable.
        print("error loading config file")
        sys.exit(1)

    # All of these keys must be present even though only media_dir is used
    # here. An empty or non-mapping config (safe_load returns None, a list,
    # ...) is rejected the same way instead of raising TypeError.
    required_keys = (
        'access_token',
        'access_secret',
        'consumer_key',
        'consumer_secret',
        'users',
        'media_dir',
    )
    if not isinstance(config, dict) or any(key not in config for key in required_keys):
        print("could not parse users file")
        sys.exit(1)
    media_dir = config['media_dir']

    # 64-dimensional index compared with Hamming distance.
    index = AnnoyIndex(64, metric='hamming')

    conn = sqlite3.connect('working/twitter_scraper.db')
    try:
        c = conn.cursor()
        # Start from a clean table on every run so idx values always match
        # the freshly built Annoy index.
        c.execute('DROP TABLE IF EXISTS annoy')
        c.execute('CREATE TABLE IF NOT EXISTS annoy (filename text, path text, idx int32, UNIQUE (idx))')

        # calc phash of all images
        files = enumerate(Path(media_dir).glob('*/*/*.jpg'))

        # Use half of the available CPUs, but always at least one worker.
        num_cpus = max(1, cpu_count() // 2)

        with Pool(processes=num_cpus) as pool:
            for phash_arr, basename, dirname, idx in pool.imap(insert_phash, files, chunksize=64):
                index.add_item(idx, phash_arr)
                c.execute('INSERT INTO annoy VALUES (?,?,?)', (basename, dirname, idx))

        conn.commit()
    finally:
        # Always release the database connection, even if hashing fails.
        conn.close()

    index.build(20)
    index.save('working/phash_index.ann')
# Script entry point: regenerate the phash index when run directly.
if __name__ == "__main__":
    gen_phash()