-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
159 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#author:lenglingx@gmail.com | ||
#date:2014-12-08 | ||
|
||
#coding:utf-8 | ||
|
||
import os | ||
import sys | ||
import re | ||
import urllib.request | ||
import urllib.parse | ||
from html.parser import HTMLParser | ||
|
||
class MyHTMLParser(HTMLParser):
    """Collect the ``alt`` text and ``src`` URL of every ``<img>`` tag fed in.

    After ``feed()``, ``links`` holds one ``[alt, src]`` list per image, in
    document order. Item 0 is used by the download code as the file name
    stem and item 1 as the image URL.
    """

    def __init__(self):
        super().__init__()
        # One [alt, src] pair per <img> tag seen so far.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record ``[alt, src]`` for an ``<img>`` start tag; ignore other tags.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs for the tag.
        """
        if tag != "img":
            return
        # Look attributes up by name rather than by position, so extra
        # attributes (class, width, ...) or a different attribute order
        # cannot shift the name/URL slots that callers index into.
        by_name = {}
        for name, value in attrs:
            by_name.setdefault(name, value)
        src = by_name.get("src")
        if not src:
            # Nothing to download without a source URL.
            return
        self.links.append([by_name.get("alt") or "", src])

    def handle_endtag(self, tag):
        """End tags carry nothing this parser needs."""

    def handle_data(self, data):
        """Text content is ignored."""
|
||
|
||
|
||
def geturl(url, encoding="gbk"):
    """Fetch *url* and return the response body decoded to ``str``.

    In Python 3 the response is read as bytes, so it is decoded before
    being returned. The default codec is GBK because the target pages are
    GB2312-encoded and GB2312 is a subset of GBK.

    Args:
        url: Address of the page to download.
        encoding: Codec used to decode the body (defaults to ``"gbk"``).

    Returns:
        The page content as a string.

    Raises:
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        body = response.read()
    return body.decode(encoding)
|
||
|
||
def continsrc(src):
    """Return the part of *src* that holds the picture section.

    The section starts at the ``<div id="picture">`` tag (inclusive) and
    ends just before the following ``<div class="boxinfo">`` tag.

    Args:
        src: Full page content as a string.

    Returns:
        The picture section; an empty string when the picture marker is
        absent; everything from the picture marker to the end of *src*
        when no info-box marker follows it.
    """
    start = src.find('<div id="picture">')
    if start == -1:
        # find() signals "not found" with -1, which must not be used as a
        # slice index (it would silently mean "last character").
        return ""
    # Only an info box located after the picture marker can end the section.
    end = src.find('<div class="boxinfo">', start)
    if end == -1:
        return src[start:]
    return src[start:end]
|
||
|
||
def pageinurl(url):
    """Download every image found in the picture section of the page at *url*.

    Combines the other helpers: fetches the page, cuts out the picture
    section, parses its ``<img>`` tags and saves each image in the current
    working directory as ``<name>.jpg``.

    Args:
        url: Address of the gallery page to process.
    """
    parser = MyHTMLParser()
    parser.feed(continsrc(geturl(url)))
    parser.close()

    for entry in parser.links:
        # Each entry is expected to be [name, image URL, ...]; skip entries
        # that are too short or have no URL instead of raising IndexError.
        if len(entry) < 2 or not entry[1]:
            continue
        name, link = entry[0], entry[1]
        # The name comes from the remote page, so strip path separators and
        # other characters that are unsafe in file names before using it.
        safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name or "image")
        # Resolve relative image paths against the page URL; absolute URLs
        # pass through unchanged.
        target = urllib.parse.urljoin(url, link)
        print("filename:", name, "fileurl:", target)
        urllib.request.urlretrieve(target, safe_name + ".jpg")
        print("ok!!")
|
||
|
||
if __name__ == "__main__":
    print("------------------------")
    # Alternative sample page: "http://www.meizitu.com/a/4647.html"
    url = "http://www.meizitu.com/a/4674.html"

    # Step-by-step run of the pipeline, printing each intermediate result.
    src = geturl(url)
    content = continsrc(src)
    print(content)

    parser = MyHTMLParser()
    parser.feed(content)
    parser.close()

    print("------------------------------------------")
    print(parser.links)

    a = parser.links
    b = len(a)
    print(b)

    # Item 0 of each entry is used as the file name, item 1 as the URL.
    for entry in a:
        print("filename:", entry[0], "fileurl:", entry[1])
        urllib.request.urlretrieve(entry[1], entry[0] + ".jpg")

    # Same work again for a second page, through the all-in-one helper.
    print("==================================")
    pageinurl("http://www.meizitu.com/a/4647.html")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import os | ||
import re | ||
import shutil | ||
|
||
|
||
def ord2chr(m):
    """Return the character whose hexadecimal code is group 1 of match *m*."""
    code_point = int(m.group(1), 16)
    return chr(code_point)
|
||
|
||
def process_filename(dirname):
    """Rename entries of *dirname* whose names contain percent-escapes.

    Every ``%XX`` escape in a name is decoded (``%20`` becomes a space,
    ``%2B`` becomes ``+``, multi-byte UTF-8 sequences become one character)
    and the entry is renamed in place. Each rename and the final total are
    printed.

    Args:
        dirname: Directory whose direct entries are examined.
    """
    # Local import: only this function needs it.
    from urllib.parse import unquote

    separators = [sep for sep in (os.sep, os.altsep) if sep]
    count = 0
    for old_name in os.listdir(dirname):
        # unquote handles all hex digits (not just 0-9) and decodes
        # consecutive escapes together as UTF-8.
        new_name = unquote(old_name)
        if new_name == old_name:
            continue
        # A decoded name must remain a plain entry name inside dirname.
        unsafe = (
            new_name in ("", ".", "..")
            or "\x00" in new_name
            or any(sep in new_name for sep in separators)
        )
        if unsafe:
            print('Skipped (unsafe target name):', old_name)
            continue
        src = os.path.join(dirname, old_name)
        dst = os.path.join(dirname, new_name)
        if os.path.exists(dst):
            # Never clobber an entry that already carries the decoded name.
            print('Skipped (target exists):', old_name)
            continue
        print(src, " => ", dst)
        shutil.move(src, dst)
        count += 1
    print('Total', count, 'files renamed.')
|
||
if __name__ == '__main__':
    # Directory whose entries get their percent-escaped names decoded.
    target_dir = 'D:\\Download\\4K常规演示高清图'
    process_filename(target_dir)
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
'Tom Jones', (1024, 91) | ||
'Sue Jones', (512, 90) | ||
'Bob Smith', (0, 80) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# coding=utf-8 | ||
import re | ||
|
||
file_path = 'D:/work/sources/MS_COCO/Microsoft_COCO_reference.txt'

# Join each wrapped reference entry into a single output line. An entry
# starts on a line beginning with "[<digits>]"; the lines that follow are
# appended to it until the next entry starts.
entry_start = re.compile(r'^\[\d+\]')

with open(file_path, 'r') as source:
    hold_line = None
    for line in source:
        if entry_start.match(line):
            # A new entry begins: emit the finished one, then start over.
            if hold_line is not None:
                print(hold_line)
            hold_line = line.strip()
        elif hold_line is None:
            # Text before the first entry is printed as it was read.
            print(line)
        else:
            hold_line = hold_line + ' ' + line.strip()
    # Emit the last entry before leaving.
    if hold_line is not None:
        print(hold_line)