简易 Python 爬虫

#!/usr/bin/python3
# by Qige <[email protected]> at 2017.11.05

import urllib

import re
from time import ctime as ts


def loadUrl(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to open; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
    """
    # Python 3 moved urlopen() into the urllib.request submodule, and a bare
    # ``import urllib`` does not load it, so import it here explicitly.
    from urllib.request import urlopen

    # The context manager closes the connection even if read() fails.
    with urlopen(url) as page:
        return page.read()

def saveImage(path, data):
    """Write the raw image bytes *data* to *path*.

    An existing file at *path* is overwritten.

    Args:
        path: Destination file name.
        data: Image content as ``bytes``.
    """
    # Binary mode is required: the payload is bytes, which a text-mode file
    # rejects on Python 3 (and newline translation would corrupt the image).
    # The ``with`` block flushes and closes the file on exit.
    with open(path, 'wb') as fd:
        fd.write(data)

def getAllImage(data):
    """Download every large ``.jpg`` image referenced in *data*.

    Each ``http://...jpg`` URL found in *data* is fetched with ``loadUrl``.
    Responses of at least 50 KiB are stored as ``/tmp/<n>.jpg`` via
    ``saveImage``, where ``<n>`` counts the saved images starting at 1.
    Smaller responses are skipped. Progress and a completion timestamp are
    printed.

    Args:
        data: Page content to scan, as ``str`` or as the raw ``bytes``
            returned by ``loadUrl``.
    """
    # A str pattern cannot be applied to bytes. latin-1 maps every byte to one
    # character and never fails, which is enough to recover the ASCII URLs
    # whatever the page's real charset is.
    if isinstance(data, bytes):
        data = data.decode('latin-1')

    # The dot before "jpg" is escaped so it only matches a literal ".", and
    # quotes/angle brackets are excluded so that adjacent quoted URLs without
    # whitespace between them are not merged into a single match.
    imgUrls = re.findall(r'http://[^\s"\'<>]*?\.jpg', data)
    num = 1
    for imgUrl in imgUrls:
        rawImg = loadUrl(imgUrl)
        # Anything under 50 KiB is skipped and does not consume a number.
        if len(rawImg) >= 50 * 1024:
            fname = "/tmp/%s.jpg" % (num)
            saveImage(fname, rawImg)
            print('- image %d - %s downloaded' % (num, fname))
            num += 1

    print('Download completed at %r' % (ts()))

# TODO: reading the start URL from user input would be better
url = 'http://desk.zol.com.cn/pubuliu/'

# Announce the target page and the start time, then fetch the page and hand
# its content straight to the image downloader.
print('Trying %s at %r' % (url, ts()))
getAllImage(loadUrl(url))

执行结果

Trying http://desk.zol.com.cn/pubuliu/ at 'Sun Nov  5 18:20:50 2017'
- image 1 - /tmp/1.jpg downloaded
- image 2 - /tmp/2.jpg downloaded
- image 3 - /tmp/3.jpg downloaded
- image 4 - /tmp/4.jpg downloaded
- image 5 - /tmp/5.jpg downloaded
- image 6 - /tmp/6.jpg downloaded
- image 7 - /tmp/7.jpg downloaded
- image 8 - /tmp/8.jpg downloaded
Download completed at 'Sun Nov  5 18:20:55 2017'

results matching ""

    No results matching ""