Hello teacher, my spider never reaches the parse_answer function, and the request returns a 400

Source: 6-18 Implementing the Zhihu spider's crawling logic and extracting answers - 2

慕斯3428064

2019-01-05

import re
import json
try:
    import urlparse as parse
except ImportError:
    from urllib import parse

import scrapy
from scrapy.loader import ItemLoader
from ArticleSpider.items import ZhihuQuestionItem, ZhihuAnswerItem

class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    # request URL template for the first page of answers to a question
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        # 'x-app-za': "OS=Android&Release=5.1.1&Model=SM-G925F&VersionName=5.21.2&VersionCode=764&Product=com.zhihu.android&Width=1080&Height=1920&Installer=%E5%BA%94%E7%94%A8%E5%AE%9D-%E5%B9%BF%E5%91%8A&DeviceType=AndroidPhone&Brand=samsung&OperatorType=46000"
    }

    def parse(self, response):
        """
        Extract every URL from the HTML page and follow it for further crawling.
        If an extracted URL matches /question/xxx, download it and hand it
        straight to the question-parsing callback.
        """
        all_urls = response.css("a::attr(href)").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
        for url in all_urls:
            print(url)
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", url)
            if match_obj:
                # question page found: download it and pass it to the extraction callback
                request_url = match_obj.group(1)
                yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
            else:
                # not a question page: keep following links from here
                pass
                # yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    def parse_question(self, response):
        # process the question page and extract a question item from it
        if "QuestionHeader-title" in response.text:
            # new page layout
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

            question_item = item_loader.load_item()

        else:  # old zhihu page layout
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()

        url1 = self.start_answer_url.format(question_id, 20, 0)
        print(url1)
        yield scrapy.Request(url1, headers=self.headers, callback=self.parse_answer)
        yield question_item


    def parse_answer(self, response):
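(The paste breaks off here. For reference, a minimal sketch of what parse_answer typically looks like in this course, assuming the standard paging/data JSON shape of the v4 answers API; the ZhihuAnswerItem field names below are assumptions from the course items and may differ from your local code:)

    def parse_answer(self, response):
        # the answers API returns JSON, not HTML
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        next_url = ans_json["paging"]["next"]

        # one ZhihuAnswerItem per answer entry (field names assumed)
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
            answer_item["content"] = answer["content"] if "content" in answer else None
            answer_item["praise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            yield answer_item

        # keep paging until the API reports the last page
        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)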

1 Answer

bobby

2019-01-06

Does the 400 show up only for some of these requests, or for every request of this kind?
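(One quick way to check, a debugging sketch not from the original thread: by default Scrapy's HttpError middleware filters out non-2xx responses before they ever reach the callback, which is exactly why parse_answer never runs on a 400. Whitelisting the status code lets you print the API's error body, which usually names the offending parameter:)

    import scrapy

    class ZhihuSpider(scrapy.Spider):
        # let 400 responses through to the callback instead of silently dropping them
        handle_httpstatus_list = [400]

        def parse_answer(self, response):
            if response.status == 400:
                # zhihu's API returns a JSON error body explaining the rejection
                print(response.status, response.text)
                return
            # ... normal JSON parsing continues here ...

If every request fails with the same error body, the request itself (headers or the include query string) is the usual suspect; if only some fail, rate limiting is more likely.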

bobby
replied to 慕斯3428064:
OK,
2019-01-08
