Skip to content

Commit

Permalink
more examples
Browse files Browse the repository at this point in the history
  • Loading branch information
jszheng committed Feb 6, 2016
1 parent b567266 commit 1361a8d
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 0 deletions.
111 changes: 111 additions & 0 deletions download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#author:lenglingx@gmail.com
#date:2014-12-08

#coding:utf-8

import os
import sys
import re
import urllib.request
import urllib.parse
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """HTML parser that records the attribute values of every <img> tag.

    After feeding markup to an instance, ``links`` holds one list per
    ``<img>`` start tag, containing that tag's attribute values in the
    order they appear in the markup.
    """

    def __init__(self):
        super().__init__()
        # One entry per <img> tag: the list of its attribute values.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Store the attribute values of *tag* when it is an <img>."""
        if tag != "img":
            return
        self.links.append([value for _name, value in attrs])

    def handle_endtag(self, tag):
        """Ignore end tags."""

    def handle_data(self, data):
        """Ignore text content."""



def geturl(url, encoding="gbk"):
    """Fetch *url* and return its body decoded to a string.

    urlopen() hands back raw bytes in Python 3, so the body is decoded
    here. The default codec is GBK, a superset of GB2312.

    Args:
        url: Address to open with ``urllib.request.urlopen``.
        encoding: Codec used to decode the response body.

    Returns:
        The decoded response body.

    Raises:
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    return raw.decode(encoding)


def continsrc(src):
    """Return the part of the page between the picture and boxinfo blocks.

    The slice starts at the opening ``<div id="picture">`` tag (inclusive)
    and stops just before the following ``<div class="boxinfo">`` tag.

    Args:
        src: Full page markup as a string.

    Returns:
        The extracted fragment; an empty string when the picture marker
        is absent; everything from the picture marker to the end of
        *src* when no boxinfo marker follows it.
    """
    start = src.find("<div id=\"picture\">")
    if start == -1:
        # find() signals "not found" with -1, which must not be used as
        # a slice bound (it would index from the end of the string).
        return ""
    # Search from the start marker so an earlier boxinfo block is ignored.
    end = src.find("<div class=\"boxinfo\">", start)
    if end == -1:
        return src[start:]
    return src[start:end]


def pageinurl(url):
    """Download every image found in the picture section of the page at *url*.

    Convenience wrapper that chains the steps: fetch the page, cut out
    the picture section, parse its <img> tags and save each image into
    the current working directory as ``<name>.jpg``.

    Args:
        url: Address of the page to process.
    """
    src = geturl(url)
    content = continsrc(src)
    parser = MyHTMLParser()
    parser.feed(content)
    parser.close()
    for values in parser.links:
        # The first attribute value is used as the file name and the
        # second as the image URL (presumably alt and src -- this relies
        # on the attribute order in the page markup).
        if len(values) < 2 or not values[0] or not values[1]:
            # Skip tags that cannot provide both parts instead of
            # crashing with IndexError/TypeError halfway through.
            print("skipped:", values)
            continue
        name, link = values[0], values[1]
        print("filename:", name, "fileurl:", link)
        urllib.request.urlretrieve(link, name + ".jpg")
    print("ok!!")


if __name__ == "__main__":
    # Demo run: walk through each step by hand for one page, printing
    # the intermediate results, then do the same for a second page via
    # pageinurl().
    print("------------------------")
    # Alternate sample page: http://www.meizitu.com/a/4647.html
    url = "http://www.meizitu.com/a/4674.html"
    src = geturl(url)

    content = continsrc(src)
    print(content)

    parser = MyHTMLParser()
    parser.feed(content)
    parser.close()

    print("------------------------------------------")
    print(parser.links)

    a = parser.links
    print(len(a))

    for values in a:
        # First value is used as the file name, second as the image URL;
        # tags that cannot provide both are skipped rather than raising.
        if len(values) < 2 or not values[0] or not values[1]:
            print("skipped:", values)
            continue
        print("filename:", values[0], "fileurl:", values[1])
        urllib.request.urlretrieve(values[1], values[0] + ".jpg")

    print("==================================")
    pageinurl("http://www.meizitu.com/a/4647.html")
24 changes: 24 additions & 0 deletions normalize_filename.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os
import re
import shutil


def ord2chr(m):
    """Return the character whose code point is the hex number in group 1 of *m*."""
    hex_digits = m.group(1)
    code_point = int(hex_digits, 16)
    return chr(code_point)


def process_filename(dirname):
    """Rename entries in *dirname* whose names contain URL percent-escapes.

    Every directory entry whose name changes when percent-decoded
    (``%20`` -> space, ``%2B`` -> ``+``, multi-byte UTF-8 sequences such
    as ``%E4%B8%AD`` -> one character) is renamed to the decoded name.
    Each rename is printed, followed by a summary line with the total.

    An entry is left untouched (and reported as skipped) when the
    decoded name would contain a path separator or NUL, or when an entry
    with the decoded name already exists, so nothing is overwritten or
    moved out of *dirname*.

    Args:
        dirname: Directory whose entries are examined (not recursive).
    """
    # Imported locally so this function does not rely on the module's
    # import list.
    from urllib.parse import unquote

    separators = [os.sep]
    if os.altsep:
        separators.append(os.altsep)

    count = 0
    for f in os.listdir(dirname):
        # unquote() handles hex digits A-F and decodes the bytes as
        # UTF-8, which a per-escape chr() conversion cannot do.
        fnew = unquote(f)
        if f == fnew:
            continue
        src = os.path.join(dirname, f)
        dst = os.path.join(dirname, fnew)
        unsafe = "\x00" in fnew or any(sep in fnew for sep in separators)
        if unsafe or os.path.exists(dst):
            print(src, " => ", dst, "(skipped)")
            continue
        print(src, " => ", dst)
        shutil.move(src, dst)
        count += 1
    print('Total', count, 'files renamed.')

if __name__ == '__main__':
    # Folder to clean up when the module is run as a script.
    target_dir = 'D:\\Download\\4K常规演示高清图'
    process_filename(target_dir)
Binary file added persondb.dat
Binary file not shown.
3 changes: 3 additions & 0 deletions persondb.dir
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
'Tom Jones', (1024, 91)
'Sue Jones', (512, 90)
'Bob Smith', (0, 80)
21 changes: 21 additions & 0 deletions process_reference_paper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# coding=utf-8
import re

file_path = 'D:/work/sources/MS_COCO/Microsoft_COCO_reference.txt'

# Put each reference entry on a single output line: an entry starts at a
# line beginning with "[<number>]" and absorbs the lines that follow it
# until the next entry starts.
entry_start = re.compile(r'^\[\d+\]')
with open(file_path, 'r') as source:
    pending = None  # entry being assembled; None until the first "[n]" line
    for raw in source:
        if entry_start.match(raw):
            # A new entry begins: emit the finished one first.
            if pending is not None:
                print(pending)
            pending = raw.strip()
        elif pending is None:
            # Text before the first entry is passed through unchanged.
            print(raw)
        else:
            pending += ' ' + raw.strip()
# Emit the last entry, which no following "[n]" line flushed.
if pending is not None:
    print(pending)

0 comments on commit 1361a8d

Please sign in to comment.