The code only crawls the list pages; the topic detail pages and author pages are never crawled

Source: 8-10 Refactoring the crawler with a ThreadPoolExecutor thread pool

慕斯8319460

2020-07-03

'''Crawler for the CSDN forum pages.
Steps:
1. Fetch all urls from the left-hand navigation menu.
2. Parse out the first-level urls and drop them from the full list, which leaves the urls of every forum list page.
3. Build the recommended / solved / closed variants of those urls.
4. Parse the list pages, the topic detail pages and the author pages.
'''

import requests
import re
import ast
import warnings

from concurrent.futures import ThreadPoolExecutor,ALL_COMPLETED,wait
from datetime import datetime
from urllib import parse
from scrapy import Selector

from csdn爬虫.csdn_database import *

warnings.filterwarnings('ignore')

domain = 'https://bbs.csdn.net/'
headers= {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
executor=ThreadPoolExecutor(max_workers=10)

def get_html_list():
    '''Fetch the source of the left-hand navigation menu; returns a list of nodes.'''
    html=requests.get('https://bbs.csdn.net/dynamic_js/left_menu.js?csdn').text
    match=re.search('forumNodes: (.*])',html)  # the menu data is embedded in a js variable
    if match:
        node_str=match.group(1).replace('null','None')  # turn the js literal into valid Python
        html_list=ast.literal_eval(node_str)
        return html_list
    return []

all_url_list=[]  # keep this list at module level: process_html_list is recursive, and re-initialising it inside the function would reset it on every recursive call
def process_html_list(html_list):
    '''Collect the urls of every page, recursing into each node's children.'''
    for item in html_list:
        if 'url' in item and item['url']:
            all_url_list.append(item['url'])
        if 'children' in item:
            process_html_list(item['children'])
    return all_url_list

def get_level1_urls(html_list):
    '''Collect only the urls of the first-level menu items.'''
    level1_url_list=[]
    for item in html_list:
        if 'url' in item and item['url']:
            level1_url_list.append(item['url'])
    return level1_url_list

def get_last_url():
    '''Assemble the final list-page urls, plus their recommend/closed variants.'''
    url_list=[]
    html_list=get_html_list()
    if html_list:
        all_url_list=process_html_list(html_list)
        level1_url_list=get_level1_urls(html_list)
        for url in all_url_list:
            if url not in level1_url_list:
                url_list.append(parse.urljoin(domain,url))
                url_list.append(parse.urljoin(domain,url+'recommend'))
                url_list.append(parse.urljoin(domain,url+'closed'))
    return url_list

def parse_topic_list(url):
    print('Start crawling list page: %s' % url)
    response=requests.get(url,headers=headers)
    if response.status_code==200:
        html=response.text
        selector=Selector(text=html)
        trs=selector.xpath('//table[@class="forums_tab_table"]/tbody/tr')
        for tr in trs:
            status=tr.xpath('.//td[@class="forums_topic_flag"]/span/text()').get()
            score=tr.xpath('.//td[@class="forums_score"]/em/text()').get()
            title=tr.xpath('.//td[@class="forums_topic"]/a[contains(@class,"forums_title ")]/text()').get()
            title_url=tr.xpath('.//td[@class="forums_topic"]//a[contains(@class,"forums_title ")]/@href').get()
            id=title_url.split('/')[-1]
            author=tr.xpath('.//td[@class="forums_author"]/a/text()').get()
            author_url=tr.xpath('.//td[@class="forums_author"]/a/@href').get()
            create_time=tr.xpath('.//td[@class="forums_author"]/em/text()').get()
            click_nums=tr.xpath('.//td[@class="forums_reply"]/span/text()').get().split('/')[-1]
            ansewr_nums=tr.xpath('.//td[@class="forums_reply"]/span/text()').get().split('/')[0]
            last_answer_time=tr.xpath('.//td[@class="forums_last_pub"]/em/text()').get()
            topic=Topic()
            topic.id=int(id)
            topic.title=title
            topic.author=author
            topic.create_time=datetime.strptime(create_time,'%Y-%m-%d %H:%M')
            topic.status=status
            topic.score=int(score)
            topic.click_nums=int(click_nums)
            topic.ansewr_nums=int(ansewr_nums)
            topic.last_answer_time=datetime.strptime(last_answer_time,'%Y-%m-%d %H:%M')
            existed_topic=Topic.select().where(Topic.id==id)
            if existed_topic:
                topic.save()
            else:
                topic.save(force_insert=True)

            executor.submit(parse_topic,parse.urljoin(domain,title_url))
            executor.submit(parse_author,parse.urljoin(domain,author_url))
        next_page=selector.xpath('.//div[@class="page_nav"]//a[contains(@class,"next_page")]/@href').get()
        if next_page and next_page.startswith('/forums'):
            next_page=parse.urljoin(domain,next_page)
            executor.submit(parse_topic_list,next_page)
    else:
        print('List page {} does not exist, skipping it'.format(url))

def parse_topic(url):
    '''
    Fields updated from the detail page:
    content=TextField(default='')
    parised_nums=IntegerField(default=0)
    jtl=FloatField(default=0.0)  # topic-closing rate of the author
    '''
    print('Start crawling topic content: %s'%url)
    html=requests.get(url,headers=headers).text
    selector=Selector(text=html)
    post_info=selector.xpath('.//div[@class="mod_topic_wrap post"]')  # reply posts
    title_info=selector.xpath('.//div[@class="mod_topic_wrap post topic"]')  # the opening post (topic body)
    id = url.split('/')[-1]
    id = re.search(r'(\d+)', id).group(1)
    if title_info:
        content=title_info.xpath('.//div[@class="post_body post_body_min_h"]/text()').getall()
        content=''.join(content).strip()
        parised_nums=title_info.xpath('.//div[@class="control_l fl"]//em/text()').get()
        jtl=title_info.xpath('.//div[@class="close_topic"]/text()').get()
        jtl=re.search(r'(\d+)',jtl).group(1)

        topic=Topic()
        existed_topic = Topic.select().where(Topic.id == id)
        if existed_topic:
            topic.id=id
            topic.content=content
            topic.parised_nums=int(parised_nums)
            topic.jtl=float(jtl)
            topic.save()

    if post_info:
        for post in post_info:
            content=post.xpath('.//div[@class="post_body post_body_min_h"]/text()').get().strip()
            author=post.xpath('.//div[@class="nick_name"]/a/text()').get()
            create_time=post.xpath('.//label[@class="date_time"]/text()').get()
            parised_nums=post.xpath('.//div[@class="control_l fl"]//em/text()').get()

            answer=Answer()
            answer.topic_id=id
            answer.content=content
            answer.author=author
            answer.create_time=datetime.strptime(create_time,'%Y-%m-%d %H:%M:%S')
            answer.parised_nums=parised_nums
            answer.save()

    next_page=selector.xpath('.//a[contains(@class,"next_page")]/@href').get()
    if next_page and next_page.startswith('/topics'):
        next_page=parse.urljoin(domain,next_page)
        executor.submit(parse_topic,next_page)


def parse_author(url):
    '''Crawl an author's profile page and save the stats.'''
    response=requests.get(url,headers=headers)
    if response.status_code == 200:
        print('Start crawling author page: %s'%url)
        html=response.text
        selector=Selector(text=html)
        name=selector.xpath('.//div[@class="lt_title"]/text()').getall()
        id=url.split('/')[-1]
        name=''.join(name).strip()
        info=selector.xpath('.//ul[@class="me_chanel_list clearfix"]/li')
        blog_nums=strip_str(info[0].xpath('.//span[@class="count"]/text()').get())
        resource_nums=strip_str(info[1].xpath('.//span[@class="count"]/text()').get())
        luntan_nums=strip_str(info[2].xpath('.//span[@class="count"]/text()').get())
        blink_nums=strip_str(info[3].xpath('.//span[@class="count"]/text()').get())
        wenda_nums=strip_str(info[4].xpath('.//span[@class="count"]/text()').get())
        collect_nums=strip_str(info[5].xpath('.//span[@class="count"]/text()').get())
        zhuanlan_nums=strip_str(info[6].xpath('.//span[@class="count"]/text()').get())
        rate=selector.xpath('.//div[@class="me_chanel_det"]//div[@class="me_chanel_det_item access"]//span/text()').getall()[1].strip()
        follower_nums=strip_str(selector.xpath('.//div[@class="fans"]//span/text()').get())
        following_nums=strip_str(selector.xpath('.//div[@class="att"]//span/text()').get())


        author=Author()
        author.id = id
        author.name = name
        author.blog_nums = int(blog_nums)
        author.resource_nums=int(resource_nums)
        author.luntan_nums=int(luntan_nums)
        author.blink_nums=int(blink_nums)
        author.wenda_nums=int(wenda_nums)
        author.collect_nums=int(collect_nums)
        author.zhuanlan_nums=int(zhuanlan_nums)
        author.rate=rate
        author.follower_nums=follower_nums
        author.following_nums=following_nums

        existed_author=Author.select().where(Author.id==id)
        if existed_author:
            author.save()
        else:
            author.save(force_insert=True)
    else:
        print('Author page %s does not exist, skipping it' % url)


def strip_str(x):
    '''Strip a scraped string; return 0 when the node was not found.'''
    if x:
        return x.strip()
    return 0




if __name__=='__main__':

    url_list=get_last_url()
    for url in url_list:
        executor.submit(parse_topic_list,url)





1 Answer

bobby

2020-07-05

Have you checked whether the problem is that the code never enters the parsing logic for those two page types, or that those two kinds of pages simply don't return any data?
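One thing worth ruling out first: exceptions raised inside tasks handed to executor.submit are swallowed silently unless you ask the returned Future for its result, so a crash inside parse_topic or parse_author would produce no traceback at all. Here is a minimal sketch of surfacing such errors with a done-callback (the log_failure name and the boom example are my own, not part of the course code):

import logging
from concurrent.futures import ThreadPoolExecutor

logging.basicConfig(level=logging.INFO)

def log_failure(future):
    # future.exception() returns whatever exception the task raised, or None
    exc = future.exception()
    if exc is not None:
        logging.error('task failed: %r', exc)

executor = ThreadPoolExecutor(max_workers=10)

def boom(url):
    raise ValueError('could not parse %s' % url)

# without the callback this error would never be printed anywhere
future = executor.submit(boom, 'https://bbs.csdn.net/topics/000000')
future.add_done_callback(log_failure)

If the callback starts logging errors from parse_topic / parse_author, the bug is inside the parsing logic; if nothing is logged, the tasks either never run or the pages return unexpected html.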

bobby replied to 慕斯8319460:
Then have you actually stepped through it in a debugger? Check whether the returned html matches what you expect.
2020-07-09
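If attaching a debugger is inconvenient, a quick standalone probe also works: fetch one detail page and one author page directly and see whether the xpaths used in parse_topic / parse_author match anything. This is only a sketch; the two urls are placeholders you should replace with real title_url / author_url values printed by your own run:

import requests
from scrapy import Selector

headers = {'user-agent': 'Mozilla/5.0'}  # any desktop user-agent will do for a quick probe

def probe(url, xpath):
    # fetch a single page and report the status code plus how many nodes the xpath matched
    resp = requests.get(url, headers=headers)
    nodes = Selector(text=resp.text).xpath(xpath)
    print(url, resp.status_code, 'matched nodes:', len(nodes))

# placeholder urls -- substitute ones your crawler actually builds
probe('https://bbs.csdn.net/topics/000000000', '//div[@class="mod_topic_wrap post"]')
probe('https://my.csdn.net/example_user', '//ul[@class="me_chanel_list clearfix"]/li')

A 200 status with zero matched nodes usually means the server returned something different from what the xpaths expect, for example a login or anti-bot page.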
