Unable to fetch start_answer_url

Source: 6-18 Implementing the Zhihu spider's crawl logic and answer extraction - 2

慕勒5311868

2022-02-12

[Screenshots attached]
I can't get start_answer_url to work. The start_answer_url I wrote off the top of my head earlier returned 403; after adding one line to settings, the request now raises KeyError on paging and data instead. The full Zhihu.py code is below:

import re
import scrapy
import json
import datetime
from urllib import parse
from scrapy.loader import ItemLoader
from ZhihuSpider.items import ZhihuQuestionItem, ZhihuAnswerItem
from ZhihuSpider.settings import ZHIHU_USERNAME, ZHIHU_PASSWORD
from ZhihuSpider.utils import zhihu_login_sel


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com']
    start_answer_url="https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cvip_info%2Cbadge%5B*%5D.topics%3Bdata%5B*%5D.settings.table_of_content.enabled&offset={1}&limit={2}"
    custom_settings = {
        "COOKIES_ENABLED": True
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.81 Safari/537.36'
    }

    def start_requests(self):
        # A simulated login is enough here; we only need the cookies it produces
        l = zhihu_login_sel.Login(ZHIHU_USERNAME, ZHIHU_PASSWORD, 6)
        cookie_dict = l.login()
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=cookie_dict, headers=self.headers, dont_filter=True)

    def parse(self, response, **kwargs):
        """
        Extract all urls from the html page and follow them for further crawling.
        If an extracted url has the form /question/xxx, download it and hand it to the question parser.
        """
        all_urls = response.xpath("//a/@href").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
        for url in all_urls:
            print(url)
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", url)
            if match_obj:
                # question page found: download it and pass it to the extraction function
                request_url = match_obj.group(1)
                yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
            else:
                pass
                # not a question page: follow it further
                # yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    def parse_question(self, response):
        if "Card ViewAll" in response.text:
            # old page layout
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_xpath("title", "//div[@id='root']//*[@class='QuestionHeader-title']/text()")
            item_loader.add_xpath("content", "//div[@class='QuestionAnswer-content']")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_xpath("answer_num", "//a[@id='QuestionMainAction ViewAll-QuestionMainAction']/text()")
            item_loader.add_xpath("comments_num", "//div[@class='ContentItem-actions RichContent-actions']/button/text()")
            item_loader.add_xpath("watch_user_num", "//*[@class='NumberBoard-itemValue']/text()")
            item_loader.add_xpath("topics", "//div[@class='Popover']/div/text()")

            question_item = item_loader.load_item()
        else:
            # new page layout
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_xpath("title", "//div[@id='root']//*[@class='QuestionHeader-title']/text()")
            item_loader.add_xpath("content", "//div[@class='List-item']")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_xpath("answer_num", "//h4[@class='List-headerText']/span/text()")
            item_loader.add_xpath("comments_num", "//div[@class='ContentItem-actions RichContent-actions']/button[1]/text()")
            item_loader.add_xpath("watch_user_num", "//*[@class='NumberBoard-itemValue']/text()")
            item_loader.add_xpath("topics", "//div[@class='Popover']/div/text()")

            question_item = item_loader.load_item()
        yield scrapy.Request(self.start_answer_url.format(question_id, 0, 5), headers=self.headers, callback=self.parse_answer)
        # emit the question item as well, otherwise it never reaches the pipelines
        yield question_item

    def parse_answer(self, response):
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        totals_answer = ans_json["paging"]["totals"]
        next_url = ans_json["paging"]["next"]
        # extract the concrete fields of each answer
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
            answer_item["content"] = answer["content"] if "content" in answer else None
            answer_item["parise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()

            yield answer_item
        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)
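
For reference, the KeyError on paging/data means the endpoint did not return the normal answer payload (e.g. an anti-crawl block, expired cookies, or a captcha/login page). Below is a guarded version of parse_answer, a sketch only and not the course's official fix, that keeps the same field layout as above but logs whatever the API actually returned instead of crashing:

    # Sketch only: if "paging"/"data" are missing, log the HTTP status and the
    # start of the raw body so the real cause (anti-crawl block, expired
    # cookies, captcha page, ...) is visible instead of a bare KeyError.
    def parse_answer(self, response):
        ans_json = json.loads(response.text)
        if "paging" not in ans_json or "data" not in ans_json:
            self.logger.error("Unexpected answer API response (HTTP %s): %s",
                              response.status, response.text[:500])
            return
        is_end = ans_json["paging"]["is_end"]
        next_url = ans_json["paging"]["next"]
        for answer in ans_json["data"]:
            # ... build and yield ZhihuAnswerItem exactly as above ...
            pass
        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)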

1 Answer

bobby

2022-02-13

Zhihu has been changing its anti-crawling strategy constantly lately. Another student ran into the same problem: https://coding.imooc.com/learn/questiondetail/258075.html . I am analyzing the site right now; there is so much encryption that it is taking a while.

bobby replied to 慕仰9161356 (2024-12-21):
Leave me your QQ and I'll add you so I can take a look.
