为什么我试了一下,程序可以运行,可是数据库没有显示任何东西,看了下没有反爬
来源:14-10 获取和解析列表页-1

慕仰0532536
2020-09-13
import requests
import re
import ast
from scrapy import Selector
from urllib import parse
from datetime import datetime
from models import *
url_list = []
headList = []
last_list = []
# 获得json的list
def get_json():
re_text = requests.get(‘https://bbs.csdn.net/dynamic_js/left_menu.js?csdn=’).text
nodes = re.search(‘forumNodes: (.*])’, re_text)
if nodes:
nodes_text2 = nodes.group(1).replace(‘null’, ‘None’)
nodes_list = ast.literal_eval(nodes_text2)
return nodes_list
return []
# 获取所有的url的list
def trans_list(nodes_list):
for item in nodes_list:
if ‘url’ in item and item[‘url’]:
url_list.append(item[‘url’])
if ‘children’ in item:
url_list.append(item[‘url’])
trans_list(item[‘children’])
return url_list
# 获取开头url的list
def head_list(nodes_list):
for item in nodes_list:
if ‘url’ in item and item[‘url’]:
headList.append(item[‘url’])
return headList
url = ‘https://bbs.csdn.net’
# 获取最后的list
def get_all_list():
jsonGet = get_json()
urlList = trans_list(jsonGet)
headerList = head_list(jsonGet)
for i in urlList:
if i not in headerList:
last_list.append(parse.urljoin(url, i))
last_list.append(parse.urljoin(url, i + ‘/recommend’))
last_list.append(parse.urljoin(url, i + ‘/closed’))
return last_list
获得每个url的topic
def parse_url(url_parse):
parse_re = requests.get(url_parse).text
select = Selector(text=parse_re)
table_sel = select.xpath("//table[@class=‘forums_tab_table’]/tbody//tr")
for tr in table_sel:
status = tr.xpath(’//td[1]/span/text()’).extract()[0]
score = tr.xpath(’//td[2]/em/text()’).extract()[0]
topic_title = tr.xpath(’//td[3]//a[@title]/text()’).extract()[0]
topic_url = parse.urljoin(url, tr.xpath(’//td[3]//a[@title]/@href’).extract()[0])
topic_id = topic_url.split(’/’)[-1]
author_url = tr.xpath(’//td[4]/a/@href’).extract()[0]
author_id = author_url.split(’/’)[-1]
answer_num = tr.xpath(’//td[5]/span/text()’).extract()[0].split(’/’)[0]
check_num = tr.xpath(’//td[5]/span/text()’).extract()[0].split(’/’)[1]
create_time_str = tr.xpath(’//td[4]/em/text()’).extract()[0]
create_time = datetime.strptime(create_time_str, ‘%Y-%m-%d %H:%M’)
last_time_str = tr.xpath(’//td[6]/em/text()’).extract()[0]
last_time = datetime.strptime(last_time_str, ‘%Y-%m-%d %H:%M’)
topic = Topic()
topic.id = int(topic_id)
topic.topic_name = topic_title
topic.author = author_id
topic.answer_nums = int(answer_num)
topic.click_nums = int(check_num)
topic.create_time = create_time
topic.score = int(score)
topic.status = status
topic.last_answer_time = last_time
topic.save()
if name == ‘main’:
# allList = get_all_list()
parse_url(‘https://bbs.csdn.net/forums/ios’)
1回答
-
代码能否格式化一下 我这里拷贝下来有格式问题
022020-09-16
相似问题
回答 1
回答 2
回答 2
回答 1
回答 2