Unable to fetch start_answer_url
Source: 6-18 Implementing the Zhihu spider crawl logic and extracting answers - 2
慕勒5311868
2022-02-12
I can't get start_answer_url to work. The start_answer_url I first threw together returned a 403; after adding one line to settings, it now raises a KeyError on paging and data instead. The full Zhihu.py is below:
import re
import json
import datetime
from urllib import parse

import scrapy
from scrapy.loader import ItemLoader

from ZhihuSpider.items import ZhihuQuestionItem, ZhihuAnswerItem
from ZhihuSpider.settings import ZHIHU_USERNAME, ZHIHU_PASSWORD
from ZhihuSpider.utils import zhihu_login_sel


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com']
    # answers API template: {0} = question id, {1} = offset, {2} = limit
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cvip_info%2Cbadge%5B*%5D.topics%3Bdata%5B*%5D.settings.table_of_content.enabled&offset={1}&limit={2}"
    custom_settings = {
        "COOKIES_ENABLED": True
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.81 Safari/537.36'
    }

    def start_requests(self):
        # simulate a login here; all we need are the cookies
        l = zhihu_login_sel.Login(ZHIHU_USERNAME, ZHIHU_PASSWORD, 6)
        cookie_dict = l.login()
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=cookie_dict, headers=self.headers, dont_filter=True)

    def parse(self, response, **kwargs):
        """
        Extract all urls from the html page and follow them for further crawling.
        If an extracted url has the form /question/xxx, download it and hand it straight to the parsing function.
        """
        all_urls = response.xpath("//a/@href").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
        for url in all_urls:
            print(url)
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", url)
            if match_obj:
                # a question page: download it and hand it to the question parser
                request_url = match_obj.group(1)
                yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
            else:
                pass
                # not a question page: keep following it
                # yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    def parse_question(self, response):
        if "Card ViewAll" in response.text:
            # old page layout
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))
                item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
                item_loader.add_xpath("title", "//div[@id='root']//*[@class='QuestionHeader-title']/text()")
                item_loader.add_xpath("content", "//div[@class='QuestionAnswer-content']")
                item_loader.add_value("url", response.url)
                item_loader.add_value("zhihu_id", question_id)
                item_loader.add_xpath("answer_num", "//a[@id='QuestionMainAction ViewAll-QuestionMainAction']/text()")
                item_loader.add_xpath("comments_num", "//div[@class='ContentItem-actions RichContent-actions']/button/text()")
                item_loader.add_xpath("watch_user_num", "//*[@class='NumberBoard-itemValue']/text()")
                item_loader.add_xpath("topics", "//div[@class='Popover']/div/text()")
                question_item = item_loader.load_item()
        else:
            # new page layout
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))
                item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
                item_loader.add_xpath("title", "//div[@id='root']//*[@class='QuestionHeader-title']/text()")
                item_loader.add_xpath("content", "//div[@class='List-item']")
                item_loader.add_value("url", response.url)
                item_loader.add_value("zhihu_id", question_id)
                item_loader.add_xpath("answer_num", "//h4[@class='List-headerText']/span/text()")
                item_loader.add_xpath("comments_num", "//div[@class='ContentItem-actions RichContent-actions']/button[1]/text()")
                item_loader.add_xpath("watch_user_num", "//*[@class='NumberBoard-itemValue']/text()")
                item_loader.add_xpath("topics", "//div[@class='Popover']/div/text()")
                question_item = item_loader.load_item()
        yield scrapy.Request(self.start_answer_url.format(question_id, 0, 5), headers=self.headers, callback=self.parse_answer)

    def parse_answer(self, response):
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        totals_answer = ans_json["paging"]["totals"]
        next_url = ans_json["paging"]["next"]
        # extract the concrete fields of each answer
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
            answer_item["content"] = answer["content"] if "content" in answer else None
            answer_item["parise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()
            yield answer_item
        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)
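A KeyError on paging/data means the answers API returned something other than the expected JSON (an error page, a login wall, or an anti-crawl challenge), so the real cause never gets printed. Below is a minimal debugging sketch of parse_answer, assuming the spider above is otherwise unchanged; handle_httpstatus_list is the standard Scrapy Request.meta key that lets non-200 responses such as the 403 reach the callback so their body can be inspected. This only surfaces the error, it does not bypass the anti-crawl check.

    # Debugging variant of parse_answer (a sketch, not the course code): dump the raw
    # API reply instead of assuming it always contains "paging" and "data".
    # The Request created in parse_question should carry the same meta key, e.g.
    # meta={"handle_httpstatus_list": [403]}, so a 403 body also lands here.
    def parse_answer(self, response):
        if response.status != 200:
            # non-200 replies (e.g. the 403) reach this callback only when the Request
            # sets handle_httpstatus_list; log the body to see why the API refused us
            self.logger.error("answers API returned %s: %s", response.status, response.text[:500])
            return
        ans_json = json.loads(response.text)
        if "paging" not in ans_json or "data" not in ans_json:
            # 200, but not the answer list (login wall, rate limit, anti-crawl page, ...)
            self.logger.error("unexpected answers payload: %s", response.text[:500])
            return
        for answer in ans_json["data"]:
            ...  # same per-answer field extraction as above
        if not ans_json["paging"]["is_end"]:
            yield scrapy.Request(ans_json["paging"]["next"],
                                 headers=self.headers,
                                 meta={"handle_httpstatus_list": [403]},
                                 callback=self.parse_answer)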
1 answer
bobby
2022-02-13
Zhihu has been changing its anti-crawling strategy constantly lately. This student ran into the same problem: https://coding.imooc.com/learn/questiondetail/258075.html . I am analysing the site right now; there is a lot of encryption involved, so it is taking some time.