求助我进不去的parse_answer

来源:15-1 es完成搜索建议-搜索建议字段保存 - 1

Grant_Lian

2017-05-27

# -*- coding: utf-8 -*-
import scrapy
import re
import json
import datetime
from urllib import parse
from scrapy.loader import ItemLoader
from items import ZhihuQuestionItem, ZhihuAnswerItem
#from settings import user_agent_list

class ZhihuSpider(scrapy.Spider):
   """Spider that walks zhihu.com pages, scrapes question pages and then their answers.

   Links found by ``parse`` that look like ``/question/<id>`` are handed to
   ``parse_question``, which in turn requests the answers API
   (``start_answer_url``) and lets ``parse_answer`` page through the results.
   """

   name = "zhihu"
   allowed_domains = ["www.zhihu.com"]
   start_urls = ['http://www.zhihu.com/']

   # URL template for the first page of a question's answers.
   # Placeholders: {0} = question id, {1} = page size (``limit``),
   # {2} = starting ``offset``; filled in by ``parse_question``.
   start_answer_url = "http://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

老师,我在插入elasticsearch的时候发现parse_answer进不去了,也不报错,求解

   # Browser User-Agent string; used as the 'User-Agent' entry of ``headers`` below.
   agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
   # Headers passed explicitly (``headers=self.headers``) to every scrapy.Request
   # built in parse / parse_question / parse_answer.
   headers = {
   "HOST":"www.zhihu.com",
   "Referer": "https://www.zhihu.com",
   'User-Agent': agent
   }

   # Spider-level settings; turns COOKIES_ENABLED on for this spider.
   custom_settings = {
       "COOKIES_ENABLED": True
   }

   """
   parse() 是spider的一个方法。 被调用时,每个初始URL完成下载后生成的 Response 对象将会作为唯一的参数传递给该函数。
   该方法负责解析返回的数据(response data),提取数据(生成item)以及生成需要进一步处理的URL的 Request 对象。
   """
   def parse(self, response):
       #提取出html页面中的所有url  并且跟踪这些url进行进一步爬取
       #如果提取的URL中格式为/question/**********:就可以下载之后进入解析函数
       all_urls = response.css("a::attr(href)").extract()
       all_urls = [parse.urljoin(response.url, url) for url in all_urls]
       all_urls = filter(lambda x: True if x.startswith("https") else False, all_urls)
       for url in all_urls:
           match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", url)#/或者$结尾符
           if match_obj:
               request_url = match_obj.group(1)


               #如果提取到question相关的页面则下载后交由提取函数进行提取处理
               yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
           #else:
               #如果不是question页面则直接进一步跟踪
               if "/logout" not in url:
                   yield scrapy.Request(url, headers=self.headers, callback=self.parse)

   def parse_question(self, response):
       #处理question页面,从页面中提取出具体的question item
       if "QuestionHeader-title" in response.text:
           #处理知乎新版本
           match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)#/或者$结尾符
           if match_obj:
               question_id = int(match_obj.group(2))
           item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
           item_loader.add_css("title", "h1.QuestionHeader-title::text")
           item_loader.add_css("content", ".QuestionHeader-detail span::text")
           item_loader.add_value("url", response.url)
           item_loader.add_value("zhihu_id", question_id)
           item_loader.add_css("answer_num", ".List-headerText span::text")
           item_loader.add_css("comments_num", ".QuestionHeaderActions button::text")
           item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
           item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
           question_item = item_loader.load_item()

           yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer)
           #解析完成之后首先将下一个即将爬取的页面
           yield question_item


   def parse_answer(self, response):
       #处理question的answer
       ans_json =json.loads(response.text)
       is_end = ans_json["paging"]["is_end"]
       next_url = ans_json["paging"]["next"]

       #提取answer的具体字段
       for answer in ans_json["data"]:
           answer_item = ZhihuAnswerItem()
           answer_item["zhihu_id"] = answer["id"]
           answer_item["url"] = answer["url"]
           answer_item["question_id"] = answer["question"]["id"]
           answer_item["author_id"] =answer["author"]["id"] if "id" in answer["author"] else None
           answer_item["content"] = answer["content"] if "content" in answer["content"] else None
           answer_item["praise_num"] = answer["voteup_count"]
           answer_item["comments_num"] = answer["comment_count"]
           answer_item["create_time"] = answer["created_time"]
           answer_item["update_time"] = answer["updated_time"]
           answer_item["crawl_time"] = datetime.datetime.now()

           yield answer_item


       if not is_end:
           yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)

写回答

1回答

bobby

2017-05-27

self.start_answer_url.format(question_id, 20, 0) 亲 你先debug一下看看这个变量出来的url有没有问题

0
4
bobby
回复
Grant_Lian
好的, 解决了问题就行
2017-05-29
共4条回复

Scrapy打造搜索引擎 畅销4年的Python分布式爬虫课

带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎

5795 学习 · 6290 问题

查看课程