-- coding: utf-8 --

import re
import scrapy
import json

try:
import urlparse as parse #python 2
except:
from urllib import parse #python 3 兼容的方式 2 不行就3

class ZhihuSpider(scrapy.Spider):
name = 'zhihu’
allowed_domains = [‘www.zhihu.com’]
start_urls = [‘http://www.zhihu.com/’]

headers = {
    "HOST": "www.zhihu.com",
    "Referer": "https://www.zhizhu.com",
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:51.0) Gecko/20100101 Firefox/51.0"
}

def parse(self, response):
    """
    提取html页面的所有url，并根据这些url进一步爬取
    如果提取的url格式为/question/xxx 就下载之后直接进入解析函数
    """
    all_urls = response.css("a::attr(href)").extract()
    all_urls = [parse.urljoin(response.url, url) for url in all_urls]
    for url in all_urls:
        pass


def parse_detail(self, response):
    pass

def start_requests(self):
    return [scrapy.Request('https://www.zhihu.com/signup?next=%2F', headers=self.headers, callback=self.login)]  #指定回调函数，不然就会默认回到上面的pass
    #这个逻辑也就是说首先从登录页面获取数据


def login(self, response):
    response_text = response.text
    match_obj = re.match('.*name="_xsrf" value="(.*?)"', response_text, re.DOTALL)
    xsrf = ''
    if match_obj:
        xsrf = (match_obj.group(1))

    if xsrf:
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data = {
            "_xsrf": xsrf,
            "phone_num": "15166791288",
            "password": "yang97221949",
            #"captcha": get_captcha()
        }
        return [scrapy.FormRequest(      #formrequet可以完成表单提交
            url= post_url,
            formdata= post_data,
            headers=self.headers,
            callback=self.check_login   #不加括号，只调用函数名称 获取值
        )]
def check_login(self, respone):
    #验证服务器的返回数据判断是否成功
    text_json = json.loads(respone.text)
    if "msg" in text_json and text_json["msg"] == "登录成功":
        for url in self.start_urls:
            yield scrapy.Request(url, dont_filter=True, headers=self.headers)

控制台出现
图片描述