Teacher, when crawling topics I keep getting this error (it was already there before I added the reply and author extraction), yet the data is still written to the database normally. What could be the cause?
Source: 7-17 Parsing topic replies and user information

2022-01-22
See the attached screenshot. Thank you, teacher.
spider.py code
import json
import re
import time
from urllib.parse import urlparse, parse_qs
from datetime import datetime

import requests
from scrapy import Selector

from csdn_spider.signer import Signer
from csdn_spider.models import Topic, Answer, Author
def get_last_urls():
    # Collect the urls of all the second-level categories that will actually be crawled
    urls = []
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
        "cache-control": "max-age=0",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    }
    url = "https://bbs.csdn.net/"
    rsp = requests.get(url, headers=headers)
    if rsp.status_code != 200:
        raise Exception("Blocked by anti-crawling")
    sel = Selector(text=rsp.text)
    c_nodes = sel.css("div.el-tree-node .custom-tree-node")
    for index, c_node in enumerate(c_nodes):
        url = "https://bizapi.csdn.net/community-cloud/v1/homepage/community/by/tag?deviceType=PC&tagId={}".format(index + 1)
        signer = Signer()
        code, re_json = signer.get_html(url)
        if code != 200:
            raise Exception("Blocked by anti-crawling (signed api)")
        if "data" in re_json:
            for item in re_json["data"]:
                url = "{}?category={}".format(item["url"], item["id"])  # or should this be the index?
                urls.append(url)
        break  # only crawl the first tag for now, remove later
    return urls
def parse_topic(url):
    # Extract the replies of a topic from the embedded json and save them to the database
    topic_rsp = requests.get(url)
    if topic_rsp.status_code != 200:
        raise Exception("Blocked when fetching the topic detail page")
    data = re.search("window.__INITIAL_STATE__= (.*});</script>", topic_rsp.text, re.IGNORECASE)
    if data:
        data = json.loads(data.group(1))
        reply_list = data["pageData"]["data"]["baseInfo"]["contentReply"]["list"]
        for value in reply_list:
            answer = Answer()
            answer.topic_id = value["contentResourceId"]  # id of the topic; the reply's own id is value["id"]
            answer.id = value["id"]
            answer.author = value["username"]
            answer.create_time = datetime.strptime(value["createTime"], "%Y-%m-%d %H:%M:%S")
            answer.content = value["description"]
            answer.praised_nums = value["diggCount"]
            existed_ans = Answer.select().where(Answer.id == answer.id)
            if existed_ans:
                answer.save()  # when the primary key is set, peewee treats save() as an update
            else:
                answer.save(force_insert=True)
                print("Answer inserted")
def parse_author(url):
    # Extract the author's profile information
    author_rsp = requests.get(url)  # passing headers='' here raises an error, so use the default headers
    if author_rsp.status_code != 200:
        raise Exception("Blocked when fetching the author detail page")
    data = re.search("window.__INITIAL_STATE__= (.*});</script>", author_rsp.text, re.IGNORECASE)
    if data:
        data = json.loads(data.group(1))
        author = Author()
        author_id = url.split("/")[-1]
        author.name = author_id
        base_info = data["pageData"]["data"]["baseInfo"]
        author.desc = base_info["seoModule"]["description"]
        if base_info["blogModule"]:
            author.id = base_info["blogModule"]["blogId"]
        interested = []
        if len(base_info["interestModule"]):
            tags = base_info["interestModule"][0]["tags"]
            for tag in tags:
                interested.append(tag["name"])
        author.industry = ",".join(interested)
        if base_info["achievementModule"]["viewCount"]:
            author.click_nums = int(base_info["achievementModule"]["viewCount"].replace(",", ""))  # page views
        if base_info["achievementModule"]["rank"]:
            author.rate = int(base_info["achievementModule"]["rank"].replace(",", ""))  # rank
        if base_info["achievementModule"]["achievementList"]:
            author.parised_nums = int(base_info["achievementModule"]["achievementList"][0]["variable"].replace(",", ""))  # likes received
            author.answer_nums = int(base_info["achievementModule"]["achievementList"][1]["variable"].replace(",", ""))  # replies
        if base_info["achievementModule"]["originalCount"]:
            author.original_nums = int(base_info["achievementModule"]["originalCount"].replace(",", ""))  # original posts
        if base_info["achievementModule"]["fansCount"]:
            author.follower_nums = int(base_info["achievementModule"]["fansCount"].replace(",", ""))  # followers
        existed_author = Author.select().where(Author.id == author.id)
        if existed_author:
            author.save()
        else:
            author.save(force_insert=True)
        print("Author saved!")
        # print(data)  # debug output
def extract_topic(data_list):
    # Extract topics from the list json and save them to the database
    for value in data_list:
        content = value["content"]
        topic = Topic()
        topic.id = content["contentId"]
        topic.title = content["topicTitle"]
        topic.content = content["description"]
        topic.author = content["username"]
        topic.create_time = datetime.strptime(content["createTime"], "%Y-%m-%d %H:%M:%S")
        topic.answer_nums = content["commentCount"]
        topic.click_nums = content["viewCount"]
        topic.praised_nums = content["diggNum"]
        # content["url"] is the url of the topic detail page
        existed_topics = Topic.select().where(Topic.id == topic.id)
        # calling save() directly would run an UPDATE, so check for existence first
        if existed_topics:
            topic.save()  # when the primary key is set, peewee treats save() as an update
        else:
            topic.save(force_insert=True)
        # time.sleep(2)
        # parse_topic(content["url"])  # crawl the topic detail page
        # parse_author("https://blog.csdn.net/{}".format(content["username"]))  # crawl the author profile
def parse_list(url):
    # Crawl the topic list of one category
    next_page = 1
    tabid = 0
    total_pages = 1
    page_size = 20
    o = urlparse(url)  # split the url into its components
    query_dict = parse_qs(o.query)  # turn the query string into a dict
    cate_id = query_dict["category"][0]
    category_rsp = requests.get(url)
    if category_rsp.status_code != 200:
        raise Exception("Blocked by anti-crawling")
    # cut the json out of the page's <script> tag with a regular expression
    data = re.search("window.__INITIAL_STATE__= (.*});</script>", category_rsp.text, re.IGNORECASE)
    if data:
        data = json.loads(data.group(1))
        total = data["pageData"]["data"]["baseInfo"]["page"]["total"]
        tabid = data["pageData"]["data"]["baseInfo"]["defaultActiveTab"]  # e.g. 17816
        total_pages = total // page_size  # ceil(total / page_size)
        if total % page_size > 0:
            total_pages += 1
        extract_topic(data["pageData"]["data"]["baseInfo"]["dataList"])
        next_page += 1
    # following pages
    while next_page <= total_pages:
        # requesting with the parameters in their original order returned 403;
        # they must be sorted in ascending ASCII order before signing
        url = "https://bizapi.csdn.net/community-cloud/v1/community/listV2?communityId={}&noMore=false&page={}&pageSize=20&tabId={}&type=1&viewType=0".format(cate_id, next_page, tabid)
        signer = Signer()
        code, re_json = signer.get_html(url)
        if code != 200:
            raise Exception("Blocked by anti-crawling")
        # extract_topic(re_json["data"]["dataList"])
        print("Moved to the next page!")
        next_page += 1
        break  # only fetch one extra page for now, remove later


if __name__ == "__main__":
    urls = get_last_urls()
    for url in urls:
        parse_list(url)
    # parse_topic("https://bbs.csdn.net/topics/604385318")
models.py code
from peewee import *

db = MySQLDatabase("spider", host="127.0.0.1", port=3306, user="root", password="4619")


class BaseModel(Model):
    class Meta:
        database = db  # every model uses the "spider" MySQL database


# Notes on table design:
# - give char columns an explicit maximum length
# - use a text column when the maximum length cannot be determined
# - normalize the crawled data as much as possible before saving
# - set default values where possible and decide whether null should be allowed


class Topic(BaseModel):
    title = CharField()
    content = TextField()
    id = IntegerField(primary_key=True)
    author = CharField()
    create_time = DateTimeField()
    answer_nums = IntegerField(default=0)
    click_nums = IntegerField(default=0)
    praised_nums = IntegerField(default=0)


class Answer(BaseModel):
    id = IntegerField(primary_key=True)
    topic_id = IntegerField()
    author = CharField()
    create_time = DateTimeField()
    content = TextField()
    praised_nums = IntegerField(default=0)


class Author(BaseModel):
    name = CharField()
    id = IntegerField(primary_key=True)
    click_nums = IntegerField(default=0)  # page views
    original_nums = IntegerField(default=0)  # original posts
    rate = IntegerField(default=-1)  # rank
    answer_nums = IntegerField(default=0)  # replies
    parised_nums = IntegerField(default=0)  # likes received
    desc = TextField(null=True)
    industry = CharField(null=True)  # followed tags
    follower_nums = IntegerField(default=0)  # followers


if __name__ == "__main__":
    db.create_tables([Topic, Answer, Author])
signer.py code
import requests
import execjs
from base64 import b64encode
import hmac
import hashlib
from urllib.parse import urlparse


class Signer:
    def __init__(self):
        # JS snippet copied from the site; it generates the one-time x-ca-nonce (a uuid-like string)
        self.nonce_func = execjs.compile("""
        p = function(e) {
            var t = e || null;
            return null == t && (t = "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx".replace(/[xy]/g, (function(e) {
                var t = 16 * Math.random() | 0;
                return ("x" === e ? t : 3 & t | 8).toString(16)
            }))),
            t
        }
        """)

    def get_path(self, url):
        parse_result = urlparse(url)
        path = "{}?{}".format(parse_result.path, parse_result.query)
        return path

    def gen_singnature(self, url, accept, nonce_str, ca_key, secrect_key):
        # Build the string to sign, then HMAC-SHA256 it with the app secret key
        url_path = self.get_path(url)
        data = "GET\n"
        data += "{}\n".format(accept)  # application/json, text/plain, */*
        data += "\n\n\n"
        data += "x-ca-key:{}\n".format(ca_key)  # 203899271
        data += "x-ca-nonce:{}\n".format(nonce_str)  # one-time value
        data += url_path
        data = data.encode("utf-8")
        sign = b64encode(hmac.new(secrect_key.encode("utf-8"), data, digestmod=hashlib.sha256).digest()).decode()
        return sign

    def get_html(self, url):
        nonce_str = self.nonce_func.call("p")
        accept = "application/json, text/plain, */*"
        ca_key = "203899271"
        app_secrect_key = "bK9jk5dBEtjauy6gXL7vZCPJ1fOy076H"  # copied from the browser
        headers = {
            "Accept": accept,
            "Origin": "https://bbs.csdn.net",
            "Referer": "https://bbs.csdn.net/?category=4",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
            "X-Ca-Key": ca_key,
            "X-Ca-Nonce": nonce_str,  # used only once per request
            "X-Ca-Signature": self.gen_singnature(url, accept, nonce_str, ca_key, app_secrect_key),  # request signature
            "X-Ca-Signature-Headers": "x-ca-key,x-ca-nonce",
        }
        rsp = requests.get(url, headers=headers)
        return rsp.status_code, rsp.json()


if __name__ == "__main__":
    signer = Signer()
    code, data = signer.get_html("https://bizapi.csdn.net/community-cloud/v1/homepage/community/by/tag?deviceType=PC&tagId=26")
    print(code)
    print(data)
1 Answer
- The error message is already telling you what is wrong: the content value is empty for some of the records you are inserting. It is best to check the extracted values before writing them to the database.
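A minimal sketch of that kind of guard, reusing the field names from extract_topic() above; the helper name safe_get_text and the choice to fall back to a default string are assumptions for illustration, not part of the course code:

def safe_get_text(record, key, default=""):
    # Return the extracted value, or a default when the api returns null/empty,
    # so NOT NULL text columns such as Topic.content never receive None.
    value = record.get(key)
    if value is None or (isinstance(value, str) and not value.strip()):
        print("Record {} has no '{}', using the default".format(record.get("contentId"), key))
        return default
    return value

Inside extract_topic() you would then write, for example, topic.content = safe_get_text(content, "description") instead of assigning content["description"] directly.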
2022-01-25