尝试使用scrapy登录豆瓣, 但是碰到一个问题, 困扰很久都无法解决
来源:6-15 item loader方式提取question - 2
ming2281_0001
2017-09-24
# 实际现象
1. 豆瓣登录时需要验证码, 我尝试使用bobby老师教的方法去登录, 但是遇到一个问题(见后文)
# 预期现象
1. 对于这种需要验证码的网站, 我一般倾向于先下载验证码图片, 然后手动输入
# 重现步骤
1. 拷贝代码
2. 执行
3. 看结果
# 上下文环境
1. Python3.5
2. Windows10
3. Pycharm
写回答
2回答
-
bobby
2017-09-25
#!/usr/bin/env python
# encoding: utf-8
"""Scrapy spider that logs in to Douban using a manually typed captcha."""
import json
import os
import re
from urllib import parse

import scrapy
from scrapy.http.response.html import HtmlResponse
from scrapy.http import Request, FormRequest
from scrapy.loader import ItemLoader
from PIL import Image


class DoubanGroupSpider(scrapy.Spider):
    """Log in to Douban, asking the user to solve the captcha by hand.

    Request chain: ``start_requests`` -> ``_get_captcha_id`` ->
    ``_get_captcha`` -> ``_login`` -> ``_check_login``.

    The account e-mail and password are read from the environment variables
    named by ``EMAIL_ENV`` and ``PASSWORD_ENV`` so that no credentials are
    stored in the source file.
    """

    name = 'douban'
    allowed_domains = ['www.douban.com']
    start_urls = ['https://www.douban.com/group/program/discussion?start=0']

    # Browser-like headers sent with every request issued by this spider.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    }

    # Names of the environment variables holding the login credentials.
    EMAIL_ENV = 'DOUBAN_EMAIL'
    PASSWORD_ENV = 'DOUBAN_PASSWORD'

    # Extracts everything after "id=" from the captcha URL in the JSON reply.
    _CAPTCHA_ID_RE = re.compile(r'id=(?P<id>.+$)')

    # Where the downloaded captcha image is written before being displayed.
    _CAPTCHA_FILE = './captcha.jpeg'

    def _credentials(self):
        """Return ``(email, password)`` read from the environment.

        :raises ValueError: if either environment variable is unset or empty.
        """
        email = os.environ.get(self.EMAIL_ENV)
        password = os.environ.get(self.PASSWORD_ENV)
        if not email or not password:
            raise ValueError(
                'set the {} and {} environment variables before running '
                'the spider'.format(self.EMAIL_ENV, self.PASSWORD_ENV)
            )
        return email, password

    def start_requests(self):
        """1. Request a regular page first.

        :return: a list containing the single initial request
        :raises ValueError: if the login credentials are not configured
        """
        # Fail fast, before the user is asked to solve a captcha.
        self._credentials()
        return [Request(
            'https://www.douban.com/note/638232175/',
            callback=self._get_captcha_id,
            headers=self.HEADERS
        )]

    def _get_captcha_id(self, response):
        """2. Ask the captcha endpoint for a new captcha.

        :type response: HtmlResponse
        :param response: response of the initial page (its content is unused)
        :return: generator yielding the captcha-info request
        """
        yield Request(
            'https://www.douban.com/j/misc/captcha',
            headers=self.HEADERS,
            callback=self._get_captcha,
        )

    def _get_captcha(self, response):
        """3. Build the captcha image URL from the JSON reply and request it.

        The reply is read as a JSON object with the keys ``url`` and
        ``token``.

        :type response: HtmlResponse
        :param response: JSON response of the captcha endpoint
        :return: generator yielding the captcha image request
        :raises ValueError: if the ``url`` value contains no ``id=`` part
        """
        response_json = json.loads(response.text)
        captcha_url = 'https://www.douban.com/misc/captcha?id={url}'.format(
            url=self.__get_url(response_json['url'])
        )
        yield Request(
            captcha_url,
            headers=self.HEADERS,
            # The token is what the login form sends as "captcha-id".
            meta={'captcha_id': response_json['token']},
            callback=self._login
        )

    def __get_url(self, url):
        """Return the part of *url* that follows ``id=``.

        :param url: captcha URL taken from the JSON reply
        :return: everything after ``id=`` up to the end of the string
        :raises ValueError: if *url* contains no ``id=`` part
        """
        match = self._CAPTCHA_ID_RE.search(url)
        if match is None:
            raise ValueError('no captcha id found in url: {!r}'.format(url))
        return match.group('id')

    def _login(self, response):
        """4. Log in; the captcha has to be typed in by the user.

        The captcha image is written to ``_CAPTCHA_FILE``, displayed, and its
        solution is read from standard input.

        :type response: HtmlResponse
        :param response: response whose body is the captcha image
        :return: generator yielding the login form request (nothing when the
            captcha id is missing)
        """
        captcha_id = response.meta.get('captcha_id')
        if not captcha_id:
            # Without the id the form cannot be submitted, so do not bother
            # the user with a captcha prompt.
            self.logger.error('captcha id missing from request meta; login skipped')
            return

        with open(self._CAPTCHA_FILE, mode='wb') as f:
            f.write(response.body)
        # The context manager releases the image file handle after display.
        with Image.open(self._CAPTCHA_FILE) as image:
            image.show()
        captcha_solution = input('captcha: ')

        email, password = self._credentials()
        post_data = {
            'source': 'index_nav',
            'form_email': email,
            'form_password': password,
            'captcha-solution': captcha_solution,
            'captcha-id': captcha_id,
            'remember': 'on',
            'redir': 'https://www.douban.com/',
            'login': "登录"
        }
        yield FormRequest(
            'https://www.douban.com/accounts/login',
            headers=self.HEADERS,
            formdata=post_data,
            callback=self._check_login
        )

    def _check_login(self, response):
        """5. Report the HTTP status of the login response.

        The original author noted that this step was never reached in their
        own run.

        :type response: HtmlResponse
        :param response: response to the login form submission
        :return: None
        """
        # NOTE(review): nothing is scheduled from here, so start_urls and
        # parse() are never used after login -- confirm whether the crawl
        # should be started at this point.
        self.logger.info('login response status: %s', response.status)

    def parse(self, response):
        """Parse one discussion-list page.

        Yields one request per topic link found in the ``olt`` table and, if
        present, a request for the next list page.

        :type response: HtmlResponse
        :param response: a discussion-list page
        :return: generator of follow-up requests
        """
        tr_nodes = response.xpath('//table[@class="olt"]/tr[@class=""]')
        for node in tr_nodes:
            title_url = node.xpath('./td[@class="title"]/a/@href').extract_first('')
            response_amount = node.xpath('./td[@class=""]/text()').extract_first('')
            if title_url:
                yield Request(
                    # urljoin leaves absolute links untouched and resolves
                    # relative ones, which Request would otherwise reject.
                    response.urljoin(title_url),
                    headers=self.HEADERS,
                    callback=self.parse_single,
                    meta={'response_amount': response_amount}
                )

        # Follow the link to the next list page, if any.
        next_url = response.xpath('//span[@class="next"]/a/@href').extract_first(default='')
        if next_url:
            yield Request(
                response.urljoin(next_url),
                headers=self.HEADERS,
                callback=self.parse
            )

    def parse_single(self, response):
        """Placeholder callback for a single topic page (not implemented).

        :type response: HtmlResponse
        :param response: a topic page
        :return: None
        """
        pass


# Reply note accompanying this code: copy this code over and run it. It
# already runs successfully on my side; compare it with your own code to see
# what is different.
222017-09-25 -
ming2281_0001
提问者
2017-09-24
daima
00
相似问题