When I run this in the debugger, the data is written to the database without problems, but when I run it directly, it raises an error.
Source: 14-12 Fetching and parsing the detail page - 1

翻版郭富城
2019-04-14
import re
import ast
from urllib import parse
from datetime import datetime

import requests
from scrapy import Selector

from csdn_spider.models import Topic

domain = "https://bbs.csdn.net/forums/"


def get_nodes_json():
    # pull the forum menu structure out of left_menu.js
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    nodes_str_match = re.search("forumNodes: (.*])", left_menu_text)
    if nodes_str_match:
        nodes_str = nodes_str_match.group(1).replace("null", "None")
        nodes_list = ast.literal_eval(nodes_str)
        return nodes_list
    return []


url_list = []


def process_nodes_list(nodes_list):
    # collect every url in the menu, recursing into children
    for item in nodes_list:
        if "url" in item and item["url"]:
            url_list.append(item["url"])
        if "children" in item:
            process_nodes_list(item["children"])


def get_level1_list(nodes_list):
    # collect only the top-level urls
    level1_list = []
    for item in nodes_list:
        if "url" in item and item["url"]:
            level1_list.append(item["url"])
    return level1_list


def get_last_urls():
    # build the final list of urls to crawl
    nodes_list = get_nodes_json()
    process_nodes_list(nodes_list)
    level1_url = get_level1_list(nodes_list)
    last_urls = []
    for url in url_list:
        if url not in level1_url:
            last_urls.append(url)
    all_urls = []
    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))
        all_urls.append(parse.urljoin(domain, url + "/recommend"))
        all_urls.append(parse.urljoin(domain, url + "/closed"))
    return all_urls


def parse_topic(url):
    # fetch the topic's details and replies
    pass


def parse_author(url):
    # fetch the author's details and replies
    pass


def parse_list(url):
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    all_trs = sel.css(".forums_table_c tbody tr")
    for tr in all_trs:
        topic = Topic()
        if tr.css(".forums_topic_flag span::text").extract()[0]:
            status = tr.css(".forums_topic_flag span::text").extract()[0]
            topic.status = status
        if tr.css(".forums_score em::text").extract()[0]:
            score = tr.css(".forums_score em::text").extract()[0]
            topic.score = int(score)
        topic_url = parse.urljoin(domain, tr.css(".forums_topic a::attr(href)").extract()[0])
        topic_id = topic_url.split("/")[-1]
        topic_title = tr.css(".forums_topic a::text").extract()[0]
        author_url = parse.urljoin(domain, tr.css(".forums_author a::attr(href)").extract()[0])
        author_id = author_url.split("/")[-1]
        create_time = tr.css(".forums_author em::text").extract()[0]
        create_time = datetime.strptime(create_time, "%Y-%m-%d %H:%M")
        answer_info = tr.css(".forums_reply span::text").extract()[0]
        answer_nums = answer_info.split("/")[0]
        click_nums = answer_info.split("/")[1]
        last_time = tr.css(".forums_last_pub em::text").extract()[0]
        last_time = datetime.strptime(last_time, "%Y-%m-%d %H:%M")

        topic.id = int(topic_id)
        topic.title = topic_title
        topic.author = author_id
        topic.create_time = create_time
        topic.click_nums = int(click_nums)
        topic.answer_nums = int(answer_nums)
        topic.last_answer_time = last_time

        # update if the topic already exists, otherwise insert
        existed_topic = Topic.select().where(Topic.id == topic.id)
        if existed_topic:
            topic.save()
        else:
            topic.save(force_insert=True)

        # parse_topic(topic_url)
        # parse_author(author_url)

    # next_page = sel.css("a.pageliststy.next_page ::attr(href)").extract()
    # if next_page:
    #     next_url = parse.urljoin(domain, next_page[0])
    #     parse_list(next_url)


if __name__ == "__main__":
    last_urls = get_last_urls()
    for url in last_urls:
        parse_list(url)
    # print(last_urls)
1 Answer
This happens because CSDN pages now contain a kind of link that puts two a tags in the same cell, so the selector extracts the first of the two. That markup didn't exist when the course was recorded, but I ran into the problem while recording later sections, and I explain how to solve it in a later chapter summary, so you can read ahead. Alternatively, you can wrap this line in a try/except and simply continue to the next row when an exception is raised; a better solution is also covered later in the course.
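For reference, a minimal sketch of that try/except approach, reusing the imports and the domain variable from the code above (this restructuring of parse_list is my own illustration, not necessarily the exact fix shown later in the course):

def parse_list(url):
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    for tr in sel.css(".forums_table_c tbody tr"):
        try:
            # a row with unexpected markup (e.g. the extra a tag) raises
            # somewhere in here instead of crashing the whole run
            topic_url = parse.urljoin(domain, tr.css(".forums_topic a::attr(href)").extract()[0])
            topic_id = int(topic_url.split("/")[-1])
            # ... parse the remaining fields and save the Topic exactly as before ...
        except Exception:
            continue  # skip the malformed row and move on to the next one

Catching a narrower set of exceptions (IndexError, ValueError) and logging the skipped topic_url would make it easier to see how many rows are actually affected.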
2019-04-16