为什么我试了一下,程序可以运行,可是数据库没有显示任何东西,看了下没有反爬

来源:14-10 获取和解析列表页-1

慕仰0532536

2020-09-13

import requests
import re
import ast
from scrapy import Selector
from urllib import parse
from datetime import datetime

from models import *

# Module-level accumulators shared by the helpers below:
#   url_list  - urls appended by trans_list() while it walks the menu tree
#   headList  - urls appended by head_list() for the top-level nodes only
#   last_list - absolute page urls appended by get_all_list()
# Three separate list objects are created; they never alias each other.
url_list, headList, last_list = [], [], []

# 获得json的list

def get_json():
    """Download the forum menu script and return its ``forumNodes`` list.

    The script at ``left_menu.js`` is fetched as text, the array that follows
    ``forumNodes: `` is cut out with a regular expression, every ``null`` is
    rewritten to ``None`` and the result is evaluated as a Python literal.

    Returns:
        The evaluated ``forumNodes`` value, or an empty list when the marker
        is not found in the downloaded text.

    Raises:
        requests.exceptions.RequestException: if the download fails or takes
            longer than the timeout.
        ValueError, SyntaxError: if the extracted text is not a valid Python
            literal after the ``null`` replacement.
    """
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    menu_js = requests.get(
        'https://bbs.csdn.net/dynamic_js/left_menu.js?csdn=',
        timeout=10,
    ).text

    # Greedy match: capture everything up to the LAST "]" on the line.
    match = re.search(r'forumNodes: (.*])', menu_js)
    if not match:
        return []

    # The payload is JavaScript; ``null`` has no Python literal equivalent.
    # NOTE(review): the replacement is textual, so a "null" inside a string
    # value would be rewritten too; ``true``/``false`` are not handled.
    literal_text = match.group(1).replace('null', 'None')
    return ast.literal_eval(literal_text)

# 获取所有的url的list

def trans_list(nodes_list, collected=None):
    """Collect the non-empty ``url`` of every node in a menu tree, depth first.

    Each node is a dict. Its ``url`` (when present and non-empty) is appended
    once, then its ``children`` (when present and non-empty) are walked
    recursively, so a parent's url always precedes its descendants' urls.

    Args:
        nodes_list: Iterable of node dicts, each optionally holding ``url``
            and ``children`` keys.
        collected: List to append to. Defaults to the module-level
            ``url_list``, which keeps the original call ``trans_list(nodes)``
            working unchanged.

    Returns:
        The list that was appended to (``url_list`` by default).
    """
    if collected is None:
        collected = url_list

    for item in nodes_list:
        node_url = item.get('url')
        if node_url:
            # Appended exactly once per node; the earlier version appended it
            # again whenever the node had children, duplicating the entry.
            collected.append(node_url)

        # ``children`` may be missing, empty, or None (a JavaScript null
        # becomes None in get_json); only recurse into a real list.
        children = item.get('children')
        if children:
            trans_list(children, collected)

    return collected

# 获取开头url的list

def head_list(nodes_list, collected=None):
    """Collect the non-empty ``url`` of each node in ``nodes_list`` itself.

    Unlike ``trans_list`` this does not descend into ``children``: only the
    nodes passed in are inspected.

    Args:
        nodes_list: Iterable of node dicts, each optionally holding ``url``.
        collected: List to append to. Defaults to the module-level
            ``headList``, which keeps the original call ``head_list(nodes)``
            working unchanged.

    Returns:
        The list that was appended to (``headList`` by default).
    """
    if collected is None:
        collected = headList

    # ``item.get('url')`` is falsy both when the key is missing and when the
    # value is empty/None, matching the original "'url' in item and item['url']".
    collected.extend(item['url'] for item in nodes_list if item.get('url'))
    return collected

# 获取最后的list

def get_all_list(base_url='https://bbs.csdn.net'):
    """Build the absolute urls of the forum list pages to crawl.

    The menu tree is fetched with ``get_json``. Every url found anywhere in
    the tree (``trans_list``) that is not also a top-level url (``head_list``)
    contributes three entries: the url itself, the url + ``/recommend`` and
    the url + ``/closed``, each resolved against ``base_url``.

    Args:
        base_url: Site root that the relative menu urls are joined onto.
            The earlier version joined against a name ``url`` that is not
            defined in this file; the default is the host already used for
            the menu script and the sample forum page.

    Returns:
        The module-level ``last_list`` after the new entries were appended.
    """
    menu_nodes = get_json()
    every_url = trans_list(menu_nodes)
    # A set makes the membership test below O(1) instead of scanning a list.
    top_level_urls = set(head_list(menu_nodes))

    for forum_url in every_url:
        if forum_url in top_level_urls:
            continue
        last_list.append(parse.urljoin(base_url, forum_url))
        last_list.append(parse.urljoin(base_url, forum_url + '/recommend'))
        last_list.append(parse.urljoin(base_url, forum_url + '/closed'))

    return last_list

# 获得每个url的topic

def parse_url(url_parse):
    """Fetch one forum list page and save a ``Topic`` for each topic row.

    The page is downloaded, the rows of the table with class
    ``forums_tab_table`` are selected, and for every row that carries all the
    expected cells a ``Topic`` is filled in and ``save()`` is called on it.
    Rows that lack any of the expected cells are skipped.

    Args:
        url_parse: Absolute url of the list page. It is also the base against
            which the topic links found in the page are resolved.

    Raises:
        requests.exceptions.RequestException: if the download fails or takes
            longer than the timeout.
        ValueError: if a numeric or date cell does not have the expected
            format.
    """
    page_html = requests.get(url_parse, timeout=10).text
    page = Selector(text=page_html)

    # Rows are matched at any depth under the table rather than under
    # "/tbody": downloaded markup may omit <tbody> even when a browser's
    # inspector shows one, and this form matches in both cases.
    rows = page.xpath("//table[@class='forums_tab_table']//tr")
    time_format = '%Y-%m-%d %H:%M'

    for tr in rows:
        # Every path starts with "./" so it is evaluated relative to THIS row.
        # A leading "//" searches the whole document and would return the
        # first row's values for every row.
        status = tr.xpath('./td[1]/span/text()').extract_first()
        score = tr.xpath('./td[2]/em/text()').extract_first()
        topic_title = tr.xpath('./td[3]//a[@title]/text()').extract_first()
        topic_href = tr.xpath('./td[3]//a[@title]/@href').extract_first()
        author_url = tr.xpath('./td[4]/a/@href').extract_first()
        create_time_str = tr.xpath('./td[4]/em/text()').extract_first()
        counts = tr.xpath('./td[5]/span/text()').extract_first()
        last_time_str = tr.xpath('./td[6]/em/text()').extract_first()

        required = (
            status, score, topic_title, topic_href,
            author_url, create_time_str, counts, last_time_str,
        )
        if any(value is None for value in required):
            # Rows without the full set of cells (e.g. a header row) carry no
            # topic; skip them instead of failing on a missing cell.
            continue

        # Resolve against the page url itself; the earlier version joined
        # against a name ``url`` that is not defined in this file.
        topic_url = parse.urljoin(url_parse, topic_href)
        topic_id = topic_url.split('/')[-1]
        author_id = author_url.split('/')[-1]

        # The cell text has the form "<answers>/<views>".
        count_parts = counts.split('/')
        answer_num = count_parts[0]
        check_num = count_parts[1]

        create_time = datetime.strptime(create_time_str, time_format)
        last_time = datetime.strptime(last_time_str, time_format)

        topic = Topic()
        topic.id = int(topic_id)
        topic.topic_name = topic_title
        topic.author = author_id
        topic.answer_nums = int(answer_num)
        topic.click_nums = int(check_num)
        topic.create_time = create_time
        topic.score = int(score)
        topic.status = status
        topic.last_answer_time = last_time
        # NOTE(review): Topic comes from the project's ``models`` module,
        # which is not visible here. If it is a peewee model whose primary key
        # is assigned by hand (as ``topic.id`` is above), ``save()`` issues an
        # UPDATE and silently writes nothing for a row that does not exist
        # yet; ``save(force_insert=True)`` is then needed for first-time
        # inserts. Confirm against the model definition.
        topic.save()

if __name__ == '__main__':
    # Script entry point: crawl a single forum list page.
    # To crawl every forum instead, call parse_url() on each url returned by
    # get_all_list().
    parse_url('https://bbs.csdn.net/forums/ios')

写回答

1回答

bobby

2020-09-16

代码能否格式化一下 我这里拷贝下来有格式问题

0
2
bobby
回复
慕仰0532536
好的。
2020-09-16
共2条回复

Python爬虫工程师实战 大数据时代必备

慕课网严选精品教程,高质量内容+服务!

2378 学习 · 1158 问题

查看课程