Page requests get redirected when crawling lagou.com
Source: 7-4 Using Rule and LinkExtractor
慕粉1657409443
2017-11-16
When crawling lagou.com, the page requests get redirected. Setting DOWNLOAD_DELAY has no effect.
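(For context, DOWNLOAD_DELAY is a settings.py option; a minimal sketch of the throttling the question describes, with illustrative values:)

# settings.py (sketch; values illustrative)
DOWNLOAD_DELAY = 3               # seconds between requests
RANDOMIZE_DOWNLOAD_DELAY = True  # jitter the delay between 0.5x and 1.5x
# A browser-like User-Agent; Scrapy's default UA is easy to detect
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/60.0.3112.113 Safari/537.36')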
3 Answers
bobby
2017-12-18
Log in once with Selenium, cache the cookies to disk with pickle, then hand them to Scrapy in start_requests. Cleaned-up version:

import logging
import os
import pickle
import time

import scrapy
from scrapy import signals
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.xlib.pydispatch import dispatcher  # Scrapy 1.x
from selenium import webdriver


class LagouSpider(CrawlSpider):
    name = 'lagou_test'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/60.0.3112.113 Safari/537.36',
    }
    rules = (
        Rule(LinkExtractor(allow=("www.lagou.com/zhaopin/",)),
             follow=True, callback='parse_job'),
    )

    def __init__(self):
        self.browser = webdriver.Chrome(executable_path="E:/tmp/chromedriver.exe")
        super(LagouSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # Quit Chrome when the spider exits
        print("spider closed")
        self.browser.quit()

    def get_cookie_from_cache(self):
        # Load cookies pickled to disk earlier, one file per cookie
        cookie_dict = {}
        for parent, dirnames, filenames in os.walk('H:/scrapy/ArticleSpider/cookies/lagou'):
            for filename in filenames:
                if filename.endswith('.lagou'):
                    with open('H:/scrapy/ArticleSpider/cookies/lagou/' + filename, 'rb') as f:
                        d = pickle.load(f)
                        cookie_dict[d['name']] = d['value']
        return cookie_dict

    def start_requests(self):
        cookie_dict = self.get_cookie_from_cache()
        if not cookie_dict:
            # No cached cookies yet: log in through Selenium and cache them
            self.browser.get("https://passport.lagou.com/login/login.html")
            self.browser.find_element_by_css_selector(
                "div:nth-child(2) > form > div:nth-child(1) > input").send_keys("")  # username here
            self.browser.find_element_by_css_selector(
                "div:nth-child(2) > form > div:nth-child(2) > input").send_keys("")  # password here
            self.browser.find_element_by_css_selector(
                "div:nth-child(2) > form > div.input_item.btn_group.clearfix > input").click()
            time.sleep(10)  # wait for the login redirect to settle
            for cookie in self.browser.get_cookies():
                # Persist each cookie to its own file for the next run
                with open('H:/scrapy/ArticleSpider/cookies/lagou/' + cookie['name'] + '.lagou', 'wb') as f:
                    pickle.dump(cookie, f)
                cookie_dict[cookie['name']] = cookie['value']
        # Hand the logged-in cookies to Scrapy for the first request
        return [scrapy.Request(url=self.start_urls[0], cookies=cookie_dict,
                               callback=self.parse)]

    def parse_job(self, response):
        # Parse a Lagou job posting
        logging.info(u'------------- message divider -------------')
        response_text = response.text
        return
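One usage note: the cookies= argument only takes effect when Scrapy's cookie middleware is on, which it is by default:

# settings.py
COOKIES_ENABLED = True  # default; lets the middleware replay the cookies on follow-up requests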
慕雪0267781
2017-11-19
I also hit the redirect problem while crawling. The cause is that Lagou recognizes Scrapy's default user agent and refuses the crawl. Two fixes: 1. simulate a login; 2. attach a browser User-Agent to the requests. For the second, it's enough to override start_requests in the Spider and _build_request in CrawlSpider so the new header is attached whenever a request is sent; see the sketch below.
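A minimal sketch of that approach, assuming Scrapy 1.x (where CrawlSpider builds every rule-extracted request in _build_request); the spider name and header values here are illustrative:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

BROWSER_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/60.0.3112.113 Safari/537.36',
}


class LagouHeaderSpider(CrawlSpider):
    name = 'lagou_header_test'  # illustrative name
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com']
    rules = (
        Rule(LinkExtractor(allow=("www.lagou.com/zhaopin/",)),
             follow=True, callback='parse_job'),
    )

    def start_requests(self):
        # Attach the browser headers to the very first request
        for url in self.start_urls:
            yield scrapy.Request(url, headers=BROWSER_HEADERS, dont_filter=True)

    def _build_request(self, rule, link):
        # Scrapy 1.x hook that builds each rule-extracted request;
        # override it so follow-up requests carry the headers too
        r = scrapy.Request(url=link.url, headers=BROWSER_HEADERS,
                           callback=self._response_downloaded)
        r.meta.update(rule=rule, link_text=link.text)
        return r

    def parse_job(self, response):
        pass  # job-page parsing goes here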
bobby
2017-11-17
Lagou identifies crawlers by IP address, so once it has flagged you as a crawler, DOWNLOAD_DELAY won't help. Try restarting your router to get a new IP, or wait a day or two; the ban is usually a 24-hour block on the IP. You can also try logging in first and carrying the cookies along with your requests.
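The code answer above shows the full Selenium login flow; as a minimal sketch, carrying cookies with a request looks like this (names and cookie values illustrative, taken from a real logged-in session):

import scrapy


class LagouCookieSpider(scrapy.Spider):
    name = 'lagou_cookie_test'  # illustrative name
    start_urls = ['https://www.lagou.com']

    def start_requests(self):
        # cookie_dict would come from a real browser login session
        cookie_dict = {'JSESSIONID': '...', 'login': 'true'}
        yield scrapy.Request(self.start_urls[0], cookies=cookie_dict,
                             callback=self.parse)

    def parse(self, response):
        pass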