请问反复出现这个问题如何解决

来源:14-12 获取和解析详情页 - 1

慕仰4153531

2019-06-27

csdn在文章标题前加了前缀,然后我改了一下代码,发现直接运行程序出错而debug爬取了前三个第四个就报错了,请问一下为啥?图片描述

#!/usr/bin/env python 
# -*- coding:utf-8 -*-
'''
抓取
解析
储存
'''

#系统自带的包
import re
import ast
import time
from datetime import datetime
from models1 import *

#第三方包
import requests
from urllib import parse
from selenium import webdriver
from scrapy import Selector
#自己创建的包


#全局变量——域名
domian = "https://bbs.csdn.net"


#带问号一般都是动态页面
def get_nodes_json():
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text #通过header的url获取内容
    nodes_str_match = re.search("forumNodes: (.*])",left_menu_text)
    if nodes_str_match:
        nodes_str = nodes_str_match.group(1).replace("null","None")    #分组获取,python对null会报错
        nodes_list = ast.literal_eval(nodes_str)
        return nodes_list
    return []

url_list = []
def process_nodes_list(nodes_list):
    #将js格式提取出url到list中
    for item in nodes_list:
        if "url" in item:
            url_list.append(item["url"])
            if "children" in item:
                process_nodes_list(item["children"])

#这部分逻辑就是排除掉所有非children下的url
def get_level1_list(nodes_list):
    level1_url = []  #因为每个文件夹的信息实际上都是其children集成的,故其url可以去掉
    for item2 in nodes_list:
        if "url" in item2 and item2["url"]:
            level1_url.append(item2["url"])
    return level1_url

def get_last_urls():
    last_urls = []
    nodes_list = get_nodes_json()
    process_nodes_list(nodes_list)
    level1_urls = get_level1_list(nodes_list)
    last_urls = []
    for url in url_list:
        if url not in level1_urls:
            last_urls.append(url)
#进入每个页面后要获取的目标有推荐精华、已解决、待解决
    all_urls = []
    for url in last_urls:
        all_urls.append(parse.urljoin(domian,url))
        all_urls.append(parse.urljoin(domian,str(url)+"/recommand"))#查看网页即可发现该规律,推荐精华
        all_urls.append(parse.urljoin(domian,str(url)+"/closed"))#已解决

    return all_urls

#获取帖子的目录页
def parse_list(url):
    res_text = requests.get(url,cookies = cookie_dict).text
    sel = Selector(text=res_text) #selctor对象为了抽取更详细的基本信息
    #根据页面的html代码,要获取帖子的基本信息就是获取所有的tr
    all_trs = sel.xpath("//table[@class='forums_tab_table']//tr")[2:]  #双斜杠表示选取所有的,返回selectorlist类型,由2开始是为了选取tbody里面的数据
    for tr in all_trs:
        topic = Topic()

        if tr.xpath(".//td[1]/span/text()").extract():
            status = tr.xpath(".//td[1]/span/text()").extract()[0] #抽取出来是一个list
            topic.status = status
        if tr.xpath(".//td[2]/em/text()").extract():
            score = tr.xpath(".//td[2]/em/text()").extract()[0]
            topic.score = int(score)
        if tr.xpath(".//td[3]/a/@href").extract():
            topic_url = parse.urljoin(domian,tr.xpath(".//td[3]/a[2]/@href").extract()[0])
        if tr.xpath(".//td[3]/a[2]/text()").extract():
            topic_title = tr.xpath(".//td[3]/a[2]/text()").extract()[0]
        if tr.xpath(".//td[4]/a/@href").extract()[0]:
            author_url = parse.urljoin(domian, tr.xpath(".//td[4]/a/@href").extract()[0])
            author_id = author_url.split("/")[-1] #可以用于拼接author_url
        if tr.xpath(".//td[4]/em/text()").extract()[0]:
            create_time_str = tr.xpath(".//td[4]/em/text()").extract()[0]
            create_time = datetime.strptime(create_time_str ,'%Y-%m-%d %H:%M')
        if tr.xpath(".//td[5]/span/text()").extract()[0]:
            answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]
            answer_num = answer_info.split("/")[0]
            chlick_num = answer_info.split("/")[1]
        if tr.xpath(".//td[6]/em/text()").extract()[0]:
            last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]
            last_time = datetime.strptime(last_time_str,'%Y-%m-%d %H:%M')

        try:
            topic.id = int(topic_url.split("/")[-1])
        except:
            continue
        topic.title = topic_title
        topic.author = author_id
        topic.click_nums = chlick_num
        topic.answer_nums = answer_num
        topic.create_time = create_time
        topic.last_answer_time = last_time
        topic.status = status
        topic.score = int(score)
        existed_topics = Topic.select().where(Topic.id == topic.id)
        if existed_topics:
            topic.save()
        else:
            topic.save(force_insert=True)

        parse_topic(topic_url)
        parse_author(author_url)

    next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
    if next_page:
        page_url = parse.urljoin(domian,next_page[0])
        parse_list(page_url)





#获取帖子的详情以及回复
def parse_topic(url):
    pass

#获取用户的详情
def parse_author(url):
    pass

if __name__ == "__main__":
    # 反爬应对措施
    browser = webdriver.Chrome(executable_path="C:\\Users\\yangzili\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe")
    browser.get("https://bbs.csdn.net")
    time.sleep(5)
    cookies = browser.get_cookies()
    cookie_dict = {}
    for item1 in cookies:
        cookie_dict[item1["name"]] = item1["value"]

    last_urls = get_last_urls()
    for url in last_urls:
        parse_list(url)
    # print("all_urls:{}".format(len(all_urls)))
    # print(all_urls)
写回答

1回答

bobby

2019-07-01

0
0

Python爬虫工程师实战 大数据时代必备

慕课网严选精品教程,高质量内容+服务!

2378 学习 · 1158 问题

查看课程