Teacher, could you take a look at my code? It still gets 302-redirected.
Source: 7-4 Using Rule and LinkExtractor
慕用5281994
2018-05-24
Redirect log: 2018-05-24 21:55:30 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.lagou.com?msg=validation&uStatus=3&clientIp=113.99.224.83> from <GET https://www.lagou.com/zhaopin/yonghuyunying/>
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from selenium import webdriver
from LagouSpider.settings import BASE_DIR
import time
import pickle
import os


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ["https://www.lagou.com/"]
    header = {
        "HOST": "www.lagou.com",
        "Referer": "https://www.lagou.com/",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"
    }

    # rules = (
    #     Rule(LinkExtractor(allow=r'zhaopin/.*'), follow=True),
    #     Rule(LinkExtractor(allow=r'gongsi/j\d+.html'), follow=True),
    #     Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True),
    # )
    rules = (
        Rule(LinkExtractor(allow=r'gongsi/j/\d+.html'), follow=True),
        Rule(LinkExtractor(allow=r'zhaopin/.*'), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True),
    )

    # # Override this method by hand; it plays the role of the old parse()
    # def parse_start_url(self, response):
    #     return []
    #
    # # Override this method by hand; it post-processes the result of parse_start_url()
    # # and by default returns it unchanged. See CrawlSpider._parse_response() for how it is called.
    # def process_results(self, response, results):
    #     return results

    # The method name below is the one referenced in rules
    def parse_job(self, response):
        # Parse a Lagou job posting
        # parse() itself must not be overridden in a CrawlSpider
        i = {}
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        return i

    def start_requests(self):
        # Remember to pass headers
        # return [scrapy.Request("https://www.zhihu.com/#signin", callback=self.login, headers=self.header)]
        username = input("Lagou username: ")
        password = input("Lagou password: ")
        browser = webdriver.Firefox(executable_path=os.path.join(BASE_DIR, "geckodriver.exe"))
        browser.get("https://passport.lagou.com/login/login.html?service=https%3a%2f%2fwww.lagou.com%2f")
        browser.find_element_by_css_selector("input[placeholder='请输入常用手机号/邮箱']").send_keys(username)
        browser.find_element_by_css_selector("input[placeholder='请输入密码']").send_keys(password)
        browser.find_element_by_css_selector(".active .btn_green").click()
        # Wait 5 seconds so the page can finish loading
        time.sleep(5)
        cookies = browser.get_cookies()
        # print(cookies)
        cookie_dict = {}
        for cookie in cookies:
            f = open('D:/PycharmProjects/LagouSpider/cookies/Lagou/' + cookie['name'] + '.lagou', 'wb')
            pickle.dump(cookie, f)
            f.close()
            # Keep only the name/value fields of each cookie in a dict, and hand that dict to
            # Scrapy's cookies argument to stay logged in; the original cookie has many more fields.
            cookie_dict[cookie['name']] = cookie['value']
        browser.close()
        # Note the dont_filter argument, and set ROBOTSTXT_OBEY = False in settings.py
        # Without a callback the request is handled by parse() by default
        # Remember to pass headers
        # return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict, headers=self.header)]
        for url in self.start_urls:
            # No callback given, so the request goes to parse()
            yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict, headers=self.header)
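A side note on the pickled cookies: the spider above only writes them to disk and never reads them back, so every run repeats the Selenium login. A minimal sketch for loading them back into a name/value dict on a later run, assuming the same cookie directory and .lagou suffix used above (the load_lagou_cookies name is just for illustration), could look like this:

    import os
    import pickle

    def load_lagou_cookies(cookie_dir='D:/PycharmProjects/LagouSpider/cookies/Lagou/'):
        # Rebuild the name -> value dict from the cookies pickled during the Selenium login
        cookie_dict = {}
        for filename in os.listdir(cookie_dir):
            if not filename.endswith('.lagou'):
                continue
            with open(os.path.join(cookie_dir, filename), 'rb') as f:
                cookie = pickle.load(f)
                cookie_dict[cookie['name']] = cookie['value']
        return cookie_dict

start_requests() could then try load_lagou_cookies() first and only fall back to the Selenium login when the directory is empty or the cookies have expired.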
1 Answer
This is the site banning the crawler by IP; it makes no difference whether you are logged in or not. You need to throttle your crawl speed (see the settings sketch below).
2018-12-16
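Since the answer points at request rate rather than login state, one hedged sketch of throttling-related options in settings.py (the values are illustrative, not tuned for lagou.com) would be:

    # settings.py -- illustrative throttling values, not tuned for lagou.com
    ROBOTSTXT_OBEY = False

    # Slow the spider down so a single IP does not hammer the site
    DOWNLOAD_DELAY = 3                 # base delay in seconds between requests
    RANDOMIZE_DOWNLOAD_DELAY = True    # jitter the delay between 0.5x and 1.5x
    CONCURRENT_REQUESTS_PER_DOMAIN = 1

    # Let Scrapy adapt the delay to the server's response times
    AUTOTHROTTLE_ENABLED = True
    AUTOTHROTTLE_START_DELAY = 5
    AUTOTHROTTLE_MAX_DELAY = 60

If the redirects persist even at a low rate, rotating the exit IP or simply waiting for the ban to lift are the usual remaining options.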