How can I fix this problem that keeps coming up?
Source: 14-12 Fetching and Parsing the Detail Page - 1

慕仰4153531
2019-06-27
CSDN has added a prefix in front of the post titles, so I changed my code accordingly. Now running the program directly raises an error, and in debug mode it crawls the first three rows fine but throws an error on the fourth. Could you tell me why?
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Crawl
Parse
Store
'''
# standard-library packages
import re
import ast
import time
from datetime import datetime
from models1 import *
# third-party packages
import requests
from urllib import parse
from selenium import webdriver
from scrapy import Selector
# own packages
# global variable: the site domain
domian = "https://bbs.csdn.net"

# a url with a question mark is usually a dynamic page
def get_nodes_json():
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text  # fetch the js content via the url seen in the page's requests
    nodes_str_match = re.search("forumNodes: (.*])", left_menu_text)
    if nodes_str_match:
        nodes_str = nodes_str_match.group(1).replace("null", "None")  # take capture group 1; literal_eval errors on a bare null
        nodes_list = ast.literal_eval(nodes_str)
        return nodes_list
    return []

url_list = []
def process_nodes_list(nodes_list):
    # pull every url out of the js structure into the list
    for item in nodes_list:
        if "url" in item:
            url_list.append(item["url"])
            if "children" in item:
                process_nodes_list(item["children"])

# this part of the logic excludes every url that is not under a children node
def get_level1_list(nodes_list):
    level1_url = []  # a folder's data is really the aggregate of its children, so the folder's own url can be dropped
    for item2 in nodes_list:
        if "url" in item2 and item2["url"]:
            level1_url.append(item2["url"])
    return level1_url

def get_last_urls():
    last_urls = []
    nodes_list = get_nodes_json()
    process_nodes_list(nodes_list)
    level1_urls = get_level1_list(nodes_list)
    for url in url_list:
        if url not in level1_urls:
            last_urls.append(url)
    # inside each forum the targets to fetch are: recommended/essence, solved, and unsolved
    all_urls = []
    for url in last_urls:
        all_urls.append(parse.urljoin(domian, url))
        all_urls.append(parse.urljoin(domian, str(url) + "/recommand"))  # recommended/essence; the pattern is visible in the site's urls
        all_urls.append(parse.urljoin(domian, str(url) + "/closed"))  # solved
    return all_urls

# fetch and parse a forum's topic list (index) page
def parse_list(url):
    res_text = requests.get(url, cookies=cookie_dict).text
    sel = Selector(text=res_text)  # Selector object, used to extract the detailed basic fields
    # based on the page html, a topic's basic info lives in the <tr> rows, so grab them all
    all_trs = sel.xpath("//table[@class='forums_tab_table']//tr")[2:]  # '//' selects all matches and returns a SelectorList; slicing from 2 keeps only the data rows inside tbody
    for tr in all_trs:
        topic = Topic()
        if tr.xpath(".//td[1]/span/text()").extract():
            status = tr.xpath(".//td[1]/span/text()").extract()[0]  # extract() returns a list
            topic.status = status
        if tr.xpath(".//td[2]/em/text()").extract():
            score = tr.xpath(".//td[2]/em/text()").extract()[0]
            topic.score = int(score)
        if tr.xpath(".//td[3]/a/@href").extract():
            topic_url = parse.urljoin(domian, tr.xpath(".//td[3]/a[2]/@href").extract()[0])
        if tr.xpath(".//td[3]/a[2]/text()").extract():
            topic_title = tr.xpath(".//td[3]/a[2]/text()").extract()[0]
        if tr.xpath(".//td[4]/a/@href").extract()[0]:
            author_url = parse.urljoin(domian, tr.xpath(".//td[4]/a/@href").extract()[0])
            author_id = author_url.split("/")[-1]  # can be used to rebuild author_url
        if tr.xpath(".//td[4]/em/text()").extract()[0]:
            create_time_str = tr.xpath(".//td[4]/em/text()").extract()[0]
            create_time = datetime.strptime(create_time_str, '%Y-%m-%d %H:%M')
        if tr.xpath(".//td[5]/span/text()").extract()[0]:
            answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]
            answer_num = answer_info.split("/")[0]
            chlick_num = answer_info.split("/")[1]
        if tr.xpath(".//td[6]/em/text()").extract()[0]:
            last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]
            last_time = datetime.strptime(last_time_str, '%Y-%m-%d %H:%M')
        try:
            topic.id = int(topic_url.split("/")[-1])
        except:
            continue
        topic.title = topic_title
        topic.author = author_id
        topic.click_nums = chlick_num
        topic.answer_nums = answer_num
        topic.create_time = create_time
        topic.last_answer_time = last_time
        topic.status = status
        topic.score = int(score)
        existed_topics = Topic.select().where(Topic.id == topic.id)
        if existed_topics:
            topic.save()
        else:
            topic.save(force_insert=True)
        parse_topic(topic_url)
        parse_author(author_url)
    next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
    if next_page:
        page_url = parse.urljoin(domian, next_page[0])
        parse_list(page_url)

# fetch a topic's detail page and its replies
def parse_topic(url):
    pass

# fetch a user's profile details
def parse_author(url):
    pass

if __name__ == "__main__":
    # counter-measure against anti-crawling: open the site in a real browser and reuse its cookies
    browser = webdriver.Chrome(executable_path="C:\\Users\\yangzili\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe")
    browser.get("https://bbs.csdn.net")
    time.sleep(5)
    cookies = browser.get_cookies()
    cookie_dict = {}
    for item1 in cookies:
        cookie_dict[item1["name"]] = item1["value"]
    last_urls = get_last_urls()
    for url in last_urls:
        parse_list(url)
    # print("all_urls:{}".format(len(all_urls)))
    # print(all_urls)
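
For context on why the fourth row might fail where the first three succeed: conditions written as if tr.xpath(...).extract()[0]: raise an IndexError themselves whenever the matched list is empty (indexing an empty list, not a falsy check), and if some rows do not carry the extra prefix link, td[3]/a[2] does not exist there, so .extract()[0] on it also raises. Below is a minimal sketch of a more defensive way to read the title and author cells; it is not the course's official fix, the sample HTML and the last-anchor heuristic are assumptions made purely for illustration, while extract_first() is a real Scrapy Selector method that returns None instead of raising when nothing matches.

# A minimal sketch of a more defensive row read -- NOT the course's official fix.
# Assumptions: the real title link is the LAST <a> inside td[3] (so a prepended
# prefix link does not shift the index), and sample_row below is invented HTML
# used only to make the snippet runnable on its own.
from urllib import parse
from scrapy import Selector

domian = "https://bbs.csdn.net"  # same (misspelled) global name as in the code above

sample_row = """
<table><tr>
  <td><span>solved</span></td><td><em>40</em></td>
  <td><a href="/prefix">[prefix]</a><a href="/topics/392590000">real title</a></td>
  <td><a href="/u/someone">someone</a><em>2019-06-27 10:00</em></td>
</tr></table>
"""

tr = Selector(text=sample_row).xpath("//tr")[0]

# take the last <a> in the title cell instead of hard-coding a[2]
title_links = tr.xpath(".//td[3]/a")
if title_links:
    topic_url = parse.urljoin(domian, title_links[-1].xpath("./@href").extract_first())
    topic_title = title_links[-1].xpath("./text()").extract_first()
    print(topic_url, topic_title)

# extract_first() returns None when nothing matches, so an empty cell can never
# raise IndexError the way ".extract()[0]" does
author_href = tr.xpath(".//td[4]/a/@href").extract_first()
if author_href:
    author_url = parse.urljoin(domian, author_href)
    print(author_url.split("/")[-1])  # author_id

The same idea applies point-wise to the original structure: replace each .extract()[0] that sits inside an if condition with .extract_first(), and only assign the field when the result is not None.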
1 Answer
bobby
2019-07-01