Douban Book Crawler (豆瓣图书爬虫)
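A Python 2 script that scrapes the tag list from book.douban.com and caches it to tag.txt, harvests free HTTP proxies from xici.net.co, then crawls each tag's book listings (rotating proxies on failure) and writes titles, ratings, and publication details to book_list.txt.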
# -*- coding: utf-8 -*-
import urllib2
import re
import os
import urllib
import codecs
import sys
from time import sleep  # the proxy back-off below needs sleep(); this import was missing
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf-8')  # force utf-8 as the default encoding for all output
def get_tag():
    url = 'http://book.douban.com/tag/'
    request = urllib2.Request(url)
    request.add_header('User-Agent', "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0")
    text = urllib2.urlopen(request)
    soup = BeautifulSoup(text.read())
    table = soup.findAll('table', {'class':'tagCol'})
    reg_tag = re.compile('>(.*)<')                    # tag name between '>' and '<'
    reg_url = re.compile('[""]http[s]?://(.*)+[""]')  # quoted http(s) URL
    tag = {}
    tag_txt = codecs.open('tag.txt', 'w', 'utf-8')
    for item in table:
        for subitem in item.findAll('a'):
            _tag = reg_tag.search(str(subitem)).group(1)
            _url = reg_url.search(str(subitem)).group(0)[1:-1]  # strip the surrounding quotes
            if _tag == u'几米':
                continue  # skip this tag ('pass' in the original did nothing)
            if _tag is not None and _url != "":
                print _tag, _url
                tag_txt.write(_tag + " " + _url + '\n')
                tag[_tag] = _url
    tag_txt.close()
    return tag
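A quick illustration of the two regexes above, run against an anchor of the usual form (the markup here is a hypothetical example, not scraped output):

    s = '<a href="http://book.douban.com/tag/小说">小说</a>'
    print reg_tag.search(s).group(1)        # 小说
    print reg_url.search(s).group(0)[1:-1]  # http://book.douban.com/tag/小说 (quotes stripped)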
'''
Alternative implementation of the loop above, using CSS selectors:

    llist = soup.select("table.tagCol")
    tag = []
    reg_tag = re.compile('>(.*)<')
    reg_url = re.compile('[""]http[s]?://(.*)+[""]')
    for item in llist:
        print item
        alist = BeautifulSoup(str(item)).find_all('a')
        for subitem in alist:
            _tag = reg_tag.search(str(subitem)).group(1)
            _url = reg_url.search(str(subitem)).group(0)[1:-1]
            if _tag != None and _url != "":
                print _tag, _url
                # tag[_tag] = _url
                tag.append(_url)
    return tag
'''
def get_from_file():
    if os.path.exists('tag.txt'):
        tag_txt = codecs.open('tag.txt', 'r', 'utf-8')
        tag = {}
        while True:
            line = tag_txt.readline()
            if line == "":
                break
            token = line.split()
            tag[token[0]] = token[1]  # fixed: the original dropped the closing bracket
        tag_txt.close()
        return tag
    else:
        return get_tag()
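tag.txt therefore holds one "name URL" pair per line, which is what get_from_file() splits on whitespace; for example (hypothetical values):

    小说 http://book.douban.com/tag/小说
    历史 http://book.douban.com/tag/历史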
def get_proxy():
    text = urllib2.urlopen("http://www.xici.net.co/nn").read()
    #print text
    proxy = []
    soup = BeautifulSoup(text)
    llist = soup.find_all("tr")
    for item in range(1, len(llist)):  # skip the header row
        td = llist[item].find_all("td")  # the rows are already parsed; no need to re-parse
        # td[2] is the address, td[3] the port, td[6] the protocol (per the site's table layout)
        proxy.append({td[6].string.lower(): td[6].string.lower() + "://" + td[2].string + ":" + td[3].string})
    return proxy
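Each entry returned by get_proxy() is a one-key dict mapping a protocol to a proxy URL, which is exactly the shape urllib2.ProxyHandler accepts. A minimal sketch, with a made-up address:

    # route all http:// traffic through the given proxy
    proxy_support = urllib2.ProxyHandler({'http': 'http://1.2.3.4:8080'})
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)  # subsequent urlopen() calls go through the proxy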
def get_single_book(item0):
    page = 0
    try_times = 0
    book_list = []
    proxy_list = get_proxy()
    while True:
        url = "http://www.douban.com/tag/" + urllib.quote(str(item0)) + "/book?start=" + str(page*15)
        proxy_times = 0
        while True:
            proxy_ip = proxy_list.pop(0)
            proxy_times += 1
            if len(proxy_list) == 0:
                proxy_list = get_proxy()
            # urllib2 uses the http_proxy environment variable by default to set the HTTP proxy
            if proxy_times >= 10:
                # after 10 failed proxies in a row, fall back to a direct
                # connection and back off for two minutes
                proxy_support = urllib2.ProxyHandler({})
                opener = urllib2.build_opener(proxy_support)
                proxy_times = 0
                print "local proxy"
                sleep(60*2)
            else:
                proxy_support = urllib2.ProxyHandler(proxy_ip)
                opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
                print proxy_ip
            urllib2.install_opener(opener)
            request = urllib2.Request(url)
            try:
                request.add_header('User-Agent', "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0")
                text = urllib2.urlopen(request, timeout=4)
                break
            except urllib2.HTTPError:
                # HTTPError is a subclass of URLError, so it must be caught first
                print 'HTTPError, bad proxy'
            except urllib2.URLError:
                print 'URLError, bad proxy'
            except:
                print 'Unknown Error'
                print proxy_ip
        soup = BeautifulSoup(text.read())
        list_soup = soup.find('div', {'class': 'mod book-list'})
        try_times += 1
        if page > 20:
            break
        print 'searching page :', page
        if list_soup is None and try_times < 10:
            continue  # retry the same page with a fresh proxy
        elif list_soup is None or len(list_soup) <= 1:
            break     # still nothing after 10 tries, or an empty listing: stop
        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', {'class':'title'}).string.strip()
            desc = book_info.find('div', {'class':'desc'}).string.strip()
            desc_list = desc.split('/')
            try:
                author_info = '作者/译者: ' + '/'.join(desc_list[0:-3])
            except:
                author_info = '作者/译者: 暂无'
            try:
                pub_info = '出版信息: ' + '/'.join(desc_list[-3:])
            except:
                pub_info = '出版信息: 暂无'
            try:
                rating = book_info.find('span', {'class':'rating_nums'}).string.strip()
            except:
                rating = '0.0'
            book_list.append([title, rating, author_info, pub_info])
        try_times = 0
        page += 1
    # sort by rating, comparing numerically rather than as strings
    book_list = sorted(book_list, key=lambda book: float(book[1]), reverse=True)
    return book_list
def get_book():
    tag = get_from_file()
    book = codecs.open('book_list.txt', 'w', 'utf-8')
    cnt = 0
    for item0, item1 in tag.items():
        print item0, item1
        book_list = get_single_book(item0)
        for item in book_list:
            book.write('%s %f %s %s\n' % (item[0], float(item[1]), item[2], item[3]))
        cnt += 1
        if cnt >= 10:
            break  # only crawl the first 10 tags
    book.close()

get_book()
'''
Alternative entry point: crawl a single tag instead of the whole tag list.

    book_list = get_single_book(u'小说')
    book = codecs.open('book_list.txt', 'w', 'utf-8')
    for item in book_list:
        book.write('%s %f %s %s\n' % (item[0], float(item[1]), item[2], item[3]))
    book.close()
'''