简易 Python 爬虫
#!/usr/bin/python3
# by Qige <[email protected]> at 2017.11.05
import re
import sys
import urllib
import urllib.request
from time import ctime as ts
def loadUrl(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to fetch, in any scheme that
            ``urllib.request.urlopen`` accepts.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for a failed request
        (for example ``urllib.error.URLError``).
    """
    # On Python 3 urlopen lives in urllib.request; the bare
    # ``urllib.urlopen`` spelling only exists on Python 2.
    # The ``with`` block closes the response even if read() raises.
    with urllib.request.urlopen(url) as page:
        return page.read()
def saveImage(path, data):
    """Write binary image *data* to the file at *path*.

    Any existing file at *path* is overwritten.

    Args:
        path: Destination file path.
        data: Raw image content as a bytes-like object.
    """
    # Binary mode is required: image payloads are bytes, and text mode
    # would raise TypeError on Python 3 (or mangle newline bytes).
    # The ``with`` block flushes and closes the file on exit, so no
    # explicit flush()/close() is needed.
    with open(path, 'wb') as fd:
        fd.write(data)
def getAllImage(data):
    """Download every sufficiently large JPEG linked from a page.

    Scans *data* for ``http://...jpg`` URLs, fetches each one with
    ``loadUrl`` and stores responses of at least 50 KiB as
    ``/tmp/<n>.jpg`` via ``saveImage``, where ``n`` counts saved images
    starting at 1. Smaller responses are skipped and do not consume a
    number. A progress line is printed per saved image and a completion
    line at the end.

    Args:
        data: Page content, either ``str`` or the ``bytes`` returned by
            ``loadUrl``.
    """
    if isinstance(data, (bytes, bytearray)):
        # A str pattern cannot be applied to bytes on Python 3. URLs are
        # ASCII, and latin-1 maps every byte to exactly one character,
        # so this decode cannot fail whatever the page's real charset is.
        data = data.decode('latin-1')

    # Escape the dot so only a literal ".jpg" matches, and stop at quotes
    # and angle brackets so two URLs in adjacent attributes are not
    # merged into a single greedy match.
    imgUrls = re.findall(r'http://[^\s"\'<>]*\.jpg', data)

    num = 1
    for imgUrl in imgUrls:
        rawImg = loadUrl(imgUrl)
        if len(rawImg) < 50 * 1024:
            # Anything below 50 KiB is skipped as too small to keep.
            continue
        fname = "/tmp/%s.jpg" % (num)
        saveImage(fname, rawImg)
        print('- image %d - %s downloaded' % (num, fname))
        num += 1

    print('Download completed at %r' % (ts()))
# The page to crawl may be given as the first command-line argument;
# without one, fall back to the original default gallery page.
url = sys.argv[1] if len(sys.argv) > 1 else 'http://desk.zol.com.cn/pubuliu/'
print('Trying %s at %r' % (url, ts()))
# Fetch the page, then download the JPEG images it links to.
html = loadUrl(url)
getAllImage(html)
执行结果
Trying http://desk.zol.com.cn/pubuliu/ at 'Sun Nov 5 18:20:50 2017'
- image 1 - /tmp/1.jpg downloaded
- image 2 - /tmp/2.jpg downloaded
- image 3 - /tmp/3.jpg downloaded
- image 4 - /tmp/4.jpg downloaded
- image 5 - /tmp/5.jpg downloaded
- image 6 - /tmp/6.jpg downloaded
- image 7 - /tmp/7.jpg downloaded
- image 8 - /tmp/8.jpg downloaded
Download completed at 'Sun Nov 5 18:20:55 2017'