When I run it in debug mode the data is written to the database without any problem, but when I run it directly it raises an error.

Source: 14-12 Fetching and Parsing the Detail Page - 1

翻版郭富城

2019-04-14

import re
import ast
from urllib import parse

import requests
from datetime import datetime
from scrapy import Selector

from csdn_spider.models import Topic

domain = "https://bbs.csdn.net/forums/"
def get_nodes_json():
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    nodes_str_math = re.search("forumNodes: (.*])", left_menu_text)
    if nodes_str_math:
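        # the JS literal uses null, so swap it for None before ast.literal_eval parses it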
        nodes_str = nodes_str_math.group(1).replace("null", "None")
        nodes_list = ast.literal_eval(nodes_str)
        return nodes_list
    return []


url_list = []
def process_nodes_list(nodes_list):
    # collect every url in the menu, including children, recursively
    for item in nodes_list:
        if "url" in item and item["url"]:
            url_list.append(item["url"])
            if "children" in item:
                process_nodes_list(item["children"])


def get_level1_list(nodes_list):
    # collect only the top-level (level-1) urls
    level1_list = []
    for item in nodes_list:
        if "url" in item and item["url"]:
            level1_list.append(item["url"])

    return level1_list


def get_last_urls():
    # build the final list of urls to crawl
    nodes_list = get_nodes_json()
    process_nodes_list(nodes_list)
    level1_url = get_level1_list(nodes_list)
    last_urls = []
    for url in url_list:
        if url not in level1_url:
            last_urls.append(url)
    all_urls = []
    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))
        all_urls.append(parse.urljoin(domain, url+"/recommend"))
        all_urls.append(parse.urljoin(domain, url+"/closed"))
    return all_urls


def parse_topic(url):
    # fetch the topic detail page and its replies
    pass


def parse_author(url):
    # fetch the author's details and replies
    pass



def parse_list(url):
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    all_trs = sel.css(".forums_table_c tbody tr")
    for tr in all_trs:
        topic = Topic()
        if tr.css(".forums_topic_flag span::text").extract()[0]:
            status = tr.css(".forums_topic_flag span::text").extract()[0]
            topic.status = status
        if tr.css(".forums_score em::text").extract()[0]:
            score = tr.css(".forums_score em::text").extract()[0]
            topic.score = int(score)

        topic_url = parse.urljoin(domain, tr.css(".forums_topic a::attr(href)").extract()[0])
        topic_id = topic_url.split("/")[-1]
        topic_title = tr.css(".forums_topic a::text").extract()[0]
        author_url = parse.urljoin(domain, tr.css(".forums_author a::attr(href)").extract()[0])
        author_id = author_url.split("/")[-1]
        create_time = tr.css(".forums_author em::text").extract()[0]
        create_time = datetime.strptime(create_time, "%Y-%m-%d %H:%M")
        answer_info = tr.css(".forums_reply span::text").extract()[0]
        answer_nums = answer_info.split("/")[0]
        click_nums = answer_info.split("/")[1]
        last_time = tr.css(".forums_last_pub em::text").extract()[0]
        last_time = datetime.strptime(last_time, "%Y-%m-%d %H:%M")

        topic.id = int(topic_id)
        topic.title = topic_title
        topic.author = author_id
        topic.create_time = create_time
        topic.click_nums = int(click_nums)
        topic.answer_nums = int(answer_nums)
        topic.last_answer_time = last_time

        existed_topic = Topic.select().where(Topic.id == topic.id)
        if existed_topic:
            topic.save()
        else:
            topic.save(force_insert=True)

        # parse_topic(topic_url)
        # parse_author(author_url)

        # next_page = sel.css("a.pageliststy.next_page ::attr(href)").extract()
        # if next_page:
        #     next_url = parse.urljoin(domain, next_page[0])
        #     parse_list(next_url)


if __name__ == "__main__":
    last_urls = get_last_urls()
    for url in last_urls:
        parse_list(url)
    # print(last_urls)






1 Answer

bobby

2019-04-15

This happens because CSDN pages now contain links like the one in this screenshot: //img.mukewang.com/szimg/5cb4004800011ce012620453.jpg, so the selector picks up the first of the two <a> tags in that cell. Links like this didn't exist when the course was recorded; I only noticed the problem while recording the later parts, and a later chapter summary explains how to solve it, so you can keep watching ahead. Alternatively, you can wrap this line (screenshot: //img.mukewang.com/szimg/5cb4009a000187aa08850159.jpg) in a try/except and simply continue to the next iteration when an exception is raised. A better solution is also covered later in the course.
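
For reference, here is a minimal sketch of the try/except workaround described above (not the final fix from the later chapter). It assumes the same Topic model from csdn_spider.models as in the question, and that the crash is an IndexError or ValueError raised while parsing a single table row; any row that does not match the expected layout is simply skipped. The status/score fields are left out to keep the sketch short.

from datetime import datetime
from urllib import parse

import requests
from scrapy import Selector

from csdn_spider.models import Topic

domain = "https://bbs.csdn.net/forums/"


def parse_list(url):
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    for tr in sel.css(".forums_table_c tbody tr"):
        try:
            topic = Topic()
            topic_url = parse.urljoin(domain, tr.css(".forums_topic a::attr(href)").extract()[0])
            topic.id = int(topic_url.split("/")[-1])  # ValueError if the first <a> is not the topic link
            topic.title = tr.css(".forums_topic a::text").extract()[0]
            author_url = parse.urljoin(domain, tr.css(".forums_author a::attr(href)").extract()[0])
            topic.author = author_url.split("/")[-1]
            topic.create_time = datetime.strptime(
                tr.css(".forums_author em::text").extract()[0], "%Y-%m-%d %H:%M")
            answer_nums, click_nums = tr.css(".forums_reply span::text").extract()[0].split("/")
            topic.answer_nums = int(answer_nums)
            topic.click_nums = int(click_nums)
            topic.last_answer_time = datetime.strptime(
                tr.css(".forums_last_pub em::text").extract()[0], "%Y-%m-%d %H:%M")
        except (IndexError, ValueError):
            # this row does not match the expected layout (e.g. an extra <a> tag), skip it
            continue

        # update if the topic already exists, otherwise insert a new row
        if Topic.select().where(Topic.id == topic.id):
            topic.save()
        else:
            topic.save(force_insert=True)

Catching only IndexError and ValueError keeps genuine network or database errors visible instead of silently swallowing every failure.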

bobby replied to 翻版郭富城:
OK,
2019-04-16
2 replies in total
