The code runs without errors, but there is no data in the database?

Source: 14-15 Fetching user profile details - 2

燚燚生辉

2019-07-20

"""
抓取
解析
存储
"""

import re
import ast
import requests
import time
from urllib import parse  # paths loaded by JS are relative; parse.urljoin() joins the domain with a relative path to build a directly accessible URL
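# e.g. (illustrative values only): parse.urljoin("https://www.csdn.net/", "/python/wz") -> "https://www.csdn.net/python/wz"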
from scrapy import Selector
from datetime import datetime
from selenium import webdriver
from csdn_spider.Models import *

domain = "https://www.csdn.net/"  # site domain used to resolve relative URLs


def get_nodes_json():  # fetch the forum node list (JSON) embedded in left_menu.js
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    nodes_str_search = re.search("forumNodes: (.*])", left_menu_text)  # greedy match: capture everything up to the last ']'

    if nodes_str_search:
        nodes_str = nodes_str_search.group(1).replace("null", "None")  # "null" would break Python literal parsing, so replace every null with None
        nodes_list = ast.literal_eval(nodes_str)  # safely evaluate the string into a Python list
        return nodes_list
    return []


url_list = []


def process_nodes_list(nodes_list):  # recursively collect every URL in the node tree
    # walk the JSON structure and append each url to url_list
    for item in nodes_list:
        if "url" in item:
            if item["url"]:
                url_list.append(item["url"])
            if "children" in item:
                process_nodes_list(item["children"])


def get_level1_url(nodes_list):  # collect the first-level (top-level) URLs
    level1_url = []
    for item in nodes_list:
        if "url" in item and item["url"]:
            level1_url.append(item["url"])

    return level1_url


def get_last_urls():
    # build the final list of list-page URLs to crawl
    nodes_list = get_nodes_json()
    process_nodes_list(nodes_list)
    level1_url = get_level1_url(nodes_list)
    last_urls = []
    for url in url_list:
        if url not in level1_url:
            last_urls.append(url)

    all_urls = []

    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))  # unsolved (default list)
        all_urls.append(parse.urljoin(domain, url + "/recommend"))  # recommended (essence)
        all_urls.append(parse.urljoin(domain, url + "/closed"))  # solved
    return all_urls


def parse_topic(url):
    # fetch a topic's detail page and its replies
    topic_id = int(url.split("/")[-1])
    res_text = requests.get(url, cookies=cookie_dict).text
    sel = Selector(text=res_text)
    all_divs = sel.xpath("//div[starts-with(@id,'post-')]")
    topic_item = all_divs[0]
    content = topic_item.xpath(".//div[@class = 'post_body post_body_min_h']/text()").extract()[0]
    praised_nums = int(topic_item.xpath(".//label[@class = 'red_praise digg']//em/text()").extract()[0])
    jtl_str = topic_item.xpath(".//div[@class = 'close_topic']/text()").extract()[0]
    jtl = 0
    jtl_match = re.search("(\d+)%", jtl_str)
    if jtl_match:
        jtl = jtl_match.group(1)

    existed_topic = Topic.select().where(Topic.id == topic_id)
    if existed_topic:
        topic = existed_topic[0]
        topic.jtl = jtl
        topic.content = content
        topic.praised_nums = praised_nums
        topic.save()

    for answer_item in all_divs[1:]:
        answer = Answer()
        answer.topic_id = topic_id
        author_info = answer_item.xpath(".//div[@class = 'nick_name']/a[1]/@href").extract()[0]
        author_id = author_info.split("/")[-1]
        create_time_str = answer_item.xpath(".//label[@class = 'date_time']/text()").extract()[0]
        create_time = datetime.strptime(create_time_str, "%Y-%m-%d %H:%M:%S")
        answer.author = author_id
        answer.create_time = create_time
        content = answer_item.xpath(".//div[@class = 'post_body post_body_min_h']/text()").extract()[0]
        answer.content = content
        praised_nums = int(answer_item.xpath(".//label[@class = 'red_praise digg']//em/text()").extract()[0])
        answer.parised_nums = praised_nums

        answer.save()

    # follow pagination only after every reply on this page has been processed
    next_page = sel.xpath("//a[@class = 'pageliststy next_page']/@href").extract()
    if next_page:
        next_page_url = parse.urljoin(domain, next_page[0])
        parse_topic(next_page_url)


def parse_author(url):
    # fetch a user's profile details
    author_id = url.split("/")[-1]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }
    res_text = requests.get(url, headers=headers, cookies=cookie_dict).text  # headers belong on the request, not on the Selector
    sel = Selector(text=res_text)
    author = Author()
    author.id = author_id
    all_li_strs = sel.xpath("//ul[@class = 'me_chanel_list clearfix']//li/text()").extract()
    click_nums = int(all_li_strs[0])
    original_nums = int(all_li_strs[1])
    forward_nums = int(all_li_strs[2])
    rate = int(all_li_strs[3])
    answer_nums = int(all_li_strs[4])
    parised_nums = int(all_li_strs[5])

    author.click_nums = click_nums
    author.original_nums = original_nums
    author.forward_nums = forward_nums
    author.rate = rate
    author.answer_nums = answer_nums
    author.parised_nums = parised_nums


    desc = sel.xpath("//dd[@class='user_desc']/text()").extract()
    if desc:
        author.desc = desc[0].strip()
    person_b = sel.xpath("//dd[@class='person_b']/ul/li")
    for item in person_b:
        item_text = "".join(item.extract())
        if "csdnc-m-add" in item_text:
            location = item.xpath(".//span/text()").extract()[0].strip()
            author.location = location
        else:
            industry = item.xpath(".//span/text()").extract()[0].strip()
            author.industry = industry
    name = sel.xpath("//h4[@class='username']/text()").extract()[0]
    author.name = name.strip()
    existed_author = Author.select().where(Author.id == author_id)
    if existed_author:
        author.save()
    else:
        author.save(force_insert=True)


def parse_list(url):
    # parse a forum list page
    res_text = requests.get(url, cookies=cookie_dict).text
    sel = Selector(text=res_text)
    all_trs = sel.xpath("//table[@class = 'forums_tab_table']//tr")[2:]
    for tr in all_trs:
        status = tr.xpath(".//td[1]/span/text()").extract()[0]
        score = tr.xpath(".//td[2]/em/text()").extract()[0]
        topic_url = parse.urljoin(domain, tr.xpath(".//td[3]/a/@href").extract()[0])
        topic_id = topic_url.split("/")[-1]
        topic_title = tr.xpath(".//td[3]/a/text()").extract()[0]
        author_url = parse.urljoin(domain, tr.xpath(".//td[4]/a/@href").extract()[0])
        author_id = author_url.split("/")[-1]
        create_time_str = tr.xpath(".//td[4]/em/text()").extract()[0]
        create_time = datetime.strptime(create_time_str, "%Y-%m-%d %H:%M")
        answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]
        answer_nums = answer_info.split("/")[0]
        click_nums = answer_info.split("/")[1]
        last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]
        last_time = datetime.strptime(last_time_str, "%Y-%m-%d %H:%M")

        topic = Topic()
        try:
            topic.id = int(topic_id)
        except:
            continue
        topic.title = topic_title
        topic.author = author_id
        topic.create_time = create_time
        topic.answer_nums = int(answer_nums)
        topic.click_nums = int(click_nums)
        topic.score = int(score)
        topic.status = status
        topic.last_answer_time = last_time
        existed_topic = Topic.select().where(Topic.id == topic.id)
        if existed_topic:
            topic.save()
        else:
            topic.save(force_insert=True)

        parse_topic(topic_url)
        parse_author(author_url)

    # follow pagination only after every row on this page has been processed
    next_page = sel.xpath("//a[@class = 'pageliststy next_page']/@href").extract()
    if next_page:
        next_page_url = parse.urljoin(domain, next_page[0])
        # Note: writing ...extract()[0] directly would raise an exception on the last page
        # (when there is no next-page link), so extract the list first and check it.

        parse_list(next_page_url)


if __name__ == "__main__":
    browser = webdriver.Chrome(executable_path=r"E:\python\Chrome_Drive_win32\chromedriver.exe")   # use selenium to obtain cookies (anti-crawling measure)
    browser.get("https://www.csdn.net/")
    time.sleep(5)
    cookies = browser.get_cookies()
    cookie_dict = {}
    for item in cookies:
        cookie_dict[item["name"]] = item["value"]
    last_urls = get_last_urls()
    for url in last_urls:
        parse_list(url)

# Run result
E:\Python\python.exe E:/python/项目/csdn_spider/spider.py

Process finished with exit code 0



2 Answers

微酸袅袅_

2019-08-29

Same question here, I have exactly the same situation.

bobby
Have you tried debugging to see whether execution reaches the save method?
2019-08-31
1 reply in total

bobby

2019-07-21

You can try setting a breakpoint and see whether execution ever reaches it.
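For example, a quick standalone check (just a sketch, not part of the course code; the forum URL below is only a placeholder, substitute one printed by get_last_urls()) is to count how many rows the list-page XPath in parse_list actually matches:

import requests
from scrapy import Selector

url = "https://bbs.csdn.net/forums/Java"  # placeholder URL; use one returned by get_last_urls()
res_text = requests.get(url).text
sel = Selector(text=res_text)
all_trs = sel.xpath("//table[@class = 'forums_tab_table']//tr")[2:]
print("rows matched:", len(all_trs))  # 0 means the for loop in parse_list never runs, so save() is never reached

If this prints 0, the problem is with the request or the XPath (for example missing cookies or a changed page layout), not with the database code.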

燚燚生辉 replied to bobby (bobby1320525135)
2019-09-09
3 replies in total
