The code only crawls the list pages; the detail pages and the author pages are never crawled
Source: 8-10 Refactoring the crawler with a ThreadPoolExecutor thread pool

慕斯8319460
2020-07-03
'''Crawler for the CSDN forum pages
Steps: 1. Fetch all the urls from the left-hand navigation menu
2. Parse out the first-level urls and remove them from the full list, which leaves the urls of every page
3. Build the recommended/digest, solved and unsolved urls
4. Parse the list-page content + the topic detail pages + the author pages
'''
import requests
import re
import ast
import warnings
from concurrent.futures import ThreadPoolExecutor,ALL_COMPLETED,wait
from datetime import datetime
from urllib import parse
from scrapy import Selector
from csdn爬虫.csdn_database import *
warnings.filterwarnings('ignore')
domain = 'https://bbs.csdn.net/'
headers= {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
executor=ThreadPoolExecutor(max_workers=10)
def get_html_list():
    '''Fetch the page source of the left-hand navigation menu; it is a list'''
    html = requests.get('https://bbs.csdn.net/dynamic_js/left_menu.js?csdn').text
    html = re.search('forumNodes: (.*])', html).group(1)
    if html:
        html = html.replace('null', 'None')
        html_list = ast.literal_eval(html)
        return html_list
    return []
all_url_list = []  # all_url_list must live at module level, otherwise it is reset to [] on every recursive call
def process_html_list(html_list):
    '''Collect the urls of all pages'''
    for item in html_list:
        if 'url' in item and item['url']:
            all_url_list.append(item['url'])
        if 'children' in item:
            process_html_list(item['children'])
    return all_url_list
def get_level1_urls(html_list):
    level1_url_list = []
    for item in html_list:
        if 'url' in item and item['url']:
            level1_url_list.append(item['url'])
    return level1_url_list
def get_last_url():
    html_list = get_html_list()
    if html_list:
        all_url_list = process_html_list(html_list)
        level1_url_list = get_level1_urls(html_list)
        url_list = []
        for url in all_url_list:
            if url not in level1_url_list:
                url_list.append(parse.urljoin(domain, url))
                url_list.append(parse.urljoin(domain, url + 'recommend'))
                url_list.append(parse.urljoin(domain, url + 'closed'))
        return url_list
def parse_topic_list(url):
    print('Start crawling list page: %s' % url)
    if requests.get(url, headers=headers).status_code == 200:
        html = requests.get(url, headers=headers).text
        selector = Selector(text=html)
        trs = selector.xpath('//table[@class="forums_tab_table"]/tbody/tr')
        for tr in trs:
            status = tr.xpath('.//td[@class="forums_topic_flag"]/span/text()').get()
            score = tr.xpath('.//td[@class="forums_score"]/em/text()').get()
            title = tr.xpath('.//td[@class="forums_topic"]/a[contains(@class,"forums_title ")]/text()').get()
            title_url = tr.xpath('.//td[@class="forums_topic"]//a[contains(@class,"forums_title ")]/@href').get()
            id = tr.xpath('.//td[@class="forums_topic"]/a[contains(@class,"forums_title ")]/@href').get().split('/')[-1]
            author = tr.xpath('.//td[@class="forums_author"]/a/text()').get()
            author_url = tr.xpath('.//td[@class="forums_author"]/a/@href').get()
            create_time = tr.xpath('.//td[@class="forums_author"]/em/text()').get()
            click_nums = tr.xpath('.//td[@class="forums_reply"]/span/text()').get().split('/')[-1]
            ansewr_nums = tr.xpath('.//td[@class="forums_reply"]/span/text()').get().split('/')[0]
            last_answer_time = tr.xpath('.//td[@class="forums_last_pub"]/em/text()').get()
            topic = Topic()
            topic.id = int(id)
            topic.title = title
            topic.author = author
            topic.create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M')
            topic.status = status
            topic.score = int(score)
            topic.click_nums = int(click_nums)
            topic.ansewr_nums = int(ansewr_nums)
            topic.last_answer_time = datetime.strptime(last_answer_time, '%Y-%m-%d %H:%M')
            existed_topic = Topic.select().where(Topic.id == id)
            if existed_topic:
                topic.save()
            else:
                topic.save(force_insert=True)
            executor.submit(parse_topic, parse.urljoin(domain, title_url))
            executor.submit(parse_author, parse.urljoin(domain, author_url))
        next_page = selector.xpath('.//div[@class="page_nav"]//a[contains(@class,"next_page")]/@href').get()
        if next_page:
            if next_page.startswith('/forums'):
                next_page = parse.urljoin(domain, next_page)
            executor.submit(parse_topic_list, next_page)
    else:
        print('List page {} does not exist, skipping'.format(url))
def parse_topic(url):
    '''
    content = TextField(default='')
    parised_nums = IntegerField(default=0)
    jtl = FloatField(default=0.0)  # topic close rate
    '''
    print('Start crawling topic content: %s' % url)
    html = requests.get(url, headers=headers).text
    selector = Selector(text=html)
    post_info = selector.xpath('.//div[@class="mod_topic_wrap post"]')  # reply posts
    title_info = selector.xpath('.//div[@class="mod_topic_wrap post topic"]')  # topic post
    id = url.split('/')[-1]
    id = re.search(r'(\d+)', id).group(1)
    if title_info:
        content = title_info.xpath('.//div[@class="post_body post_body_min_h"]/text()').getall()
        content = ''.join(content).strip()
        parised_nums = title_info.xpath('.//div[@class="control_l fl"]//em/text()').get()
        jtl = title_info.xpath('.//div[@class="close_topic"]/text()').get()
        jtl = re.search(r'(\d+)', jtl).group(1)
        topic = Topic()
        existed_topic = Topic.select().where(Topic.id == id)
        if existed_topic:
            topic.id = id
            topic.content = content
            topic.parised_nums = int(parised_nums)
            topic.jtl = float(jtl)
            topic.save()
    if post_info:
        for post in post_info:
            content = post.xpath('.//div[@class="post_body post_body_min_h"]/text()').get().strip()
            author = post.xpath('.//div[@class="nick_name"]/a/text()').get()
            create_time = post.xpath('.//label[@class="date_time"]/text()').get()
            parised_nums = post.xpath('.//div[@class="control_l fl"]//em/text()').get()
            answer = Answer()
            answer.topic_id = id
            answer.content = content
            answer.author = author
            answer.create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M:%S')
            answer.parised_nums = parised_nums
            answer.save()
    next_page = selector.xpath('.//a[contains(@class,"next_page")]/@href').get()
    if next_page:
        if next_page.startswith('/topics'):
            next_page = parse.urljoin(domain, next_page)
        executor.submit(parse_topic, next_page)
def parse_author(url):
    response_code = requests.get(url, headers=headers).status_code
    if response_code == 200:
        print('Start crawling author: %s' % url)
        html = requests.get(url, headers=headers).text
        selector = Selector(text=html)
        name = selector.xpath('.//div[@class="lt_title"]/text()').getall()
        id = url.split('/')[-1]
        name = ''.join(name).strip()
        info = selector.xpath('.//ul[@class="me_chanel_list clearfix"]/li')
        blog_nums = strip_str(info[0].xpath('.//span[@class="count"]/text()').get())
        resource_nums = strip_str(info[1].xpath('.//span[@class="count"]/text()').get())
        luntan_nums = strip_str(info[2].xpath('.//span[@class="count"]/text()').get())
        blink_nums = strip_str(info[3].xpath('.//span[@class="count"]/text()').get())
        wenda_nums = strip_str(info[4].xpath('.//span[@class="count"]/text()').get())
        collect_nums = strip_str(info[5].xpath('.//span[@class="count"]/text()').get())
        zhuanlan_nums = strip_str(info[6].xpath('.//span[@class="count"]/text()').get())
        rate = selector.xpath('.//div[@class="me_chanel_det"]//div[@class="me_chanel_det_item access"]//span/text()').getall()[1].strip()
        follower_nums = strip_str(selector.xpath('.//div[@class="fans"]//span/text()').get())
        following_nums = strip_str(selector.xpath('.//div[@class="att"]//span/text()').get())
        author = Author()
        author.id = id
        author.name = name
        author.blog_nums = int(blog_nums)
        author.resource_nums = int(resource_nums)
        author.luntan_nums = int(luntan_nums)
        author.blink_nums = int(blink_nums)
        author.wenda_nums = int(wenda_nums)
        author.collect_nums = int(collect_nums)
        author.zhuanlan_nums = int(zhuanlan_nums)
        author.rate = rate
        author.follower_nums = follower_nums
        author.following_nums = following_nums
        existed_author = Author.select().where(Author.id == id)
        if existed_author:
            author.save()
        else:
            author.save(force_insert=True)
    else:
        print('Author page %s does not exist, skipping' % url)
def strip_str(x):
    if x:
        return x.strip()
    else:
        return 0
if __name__ == '__main__':
    url_list = get_last_url()
    for url in url_list:
        executor.submit(parse_topic_list, url)
1 Answer
bobby
2020-07-05
Have you checked whether the crawl never enters the detail logic at all, or whether these two kinds of pages simply return no data?
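One way to narrow that down is to surface the exceptions from the submitted tasks: a ThreadPoolExecutor stores any exception raised inside a worker on the returned Future and only re-raises it when result() or exception() is called, so a crash inside parse_topic or parse_author never prints anything. Below is a minimal debugging sketch, not part of the original code (log_future_error is a name made up for illustration), that attaches a done-callback to each submitted task and logs its traceback:

import traceback

def log_future_error(future):
    # called once the task finishes; prints the traceback if the task raised
    exc = future.exception()
    if exc:
        traceback.print_exception(type(exc), exc, exc.__traceback__)

# wherever a task is submitted, keep the Future and attach the callback, e.g.:
task = executor.submit(parse_topic, parse.urljoin(domain, title_url))
task.add_done_callback(log_future_error)
task = executor.submit(parse_author, parse.urljoin(domain, author_url))
task.add_done_callback(log_future_error)

If the callback fires with a traceback (e.g. an AttributeError from a failed xpath or a ValueError from strptime), the functions are being entered but failing part-way; if it never fires at all, the tasks are never submitted or the pages are not returning data.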