selenium的相关问题
来源:9-4 selenium集成到scrapy中
Meet相识
2017-12-16
# Open the Lagou login page and submit the phone/password form.
# `browser` is a Selenium WebDriver created elsewhere.
login_url = "https://passport.lagou.com/login/login.html"
browser.get(login_url)

phone_input = browser.find_element_by_css_selector(
    ".active div[data-controltype='Phone']>input")
phone_input.send_keys("xxxx")

password_input = browser.find_element_by_css_selector(
    ".active div[data-controltype='Password']>input")
password_input.send_keys("xxxx")

submit_button = browser.find_element_by_css_selector(
    ".active div[data-propertyname='submit']>input")
submit_button.click()
老师,这是我用 selenium 模拟拉勾登录的代码。我的问题是:如何在登录成功后获取跳转后的 url,并把登录成功的 cookie 设置进 spider?老师可以提供一下关键代码吗?
写回答
1回答
-
import json
import logging
import os
import pickle
import time
from datetime import datetime

import scrapy
from scrapy import signals
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# NOTE(review): scrapy.xlib.pydispatch is deprecated in newer Scrapy
# releases -- confirm the Scrapy version this spider targets.
from scrapy.xlib.pydispatch import dispatcher
from selenium import webdriver


class LagouSpider(CrawlSpider):
    """Crawl lagou.com job pages using cookies obtained via a Selenium login.

    On start the spider looks for pickled cookies in ``cookie_dir``. When none
    are found it drives Chrome through the Lagou login form, pickles every
    cookie the browser holds (one file per cookie) and uses those instead.
    The cookies are attached to the first request to ``start_urls[0]``.
    """

    name = 'lagou_test'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com']

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/60.0.3112.113 Safari/537.36',
    }

    rules = (
        Rule(LinkExtractor(allow=("www.lagou.com/zhaopin/",)),
             follow=True, callback='parse_job'),
    )

    # Machine-specific settings, kept as class attributes so they can be
    # overridden in a subclass instead of being repeated inside the methods.
    chromedriver_path = "E:/tmp/chromedriver.exe"
    cookie_dir = 'H:/scrapy/ArticleSpider/cookies/lagou'
    cookie_suffix = '.lagou'
    login_url = "https://passport.lagou.com/login/login.html"
    login_username = ""
    login_password = ""
    # Seconds to pause after submitting the login form before reading cookies.
    login_wait_seconds = 10

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Chrome instance used for login.

        Args:
            *args, **kwargs: forwarded unchanged to ``CrawlSpider.__init__``.
        """
        # Initialise the base class first: if it raises, no browser process
        # has been started yet, so nothing is left running.
        super(LagouSpider, self).__init__(*args, **kwargs)
        self.browser = webdriver.Chrome(executable_path=self.chromedriver_path)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Quit Chrome when the spider shuts down."""
        logging.info("spider closed")
        self.browser.quit()

    def get_cookie_from_cache(self):
        """Load cookies previously pickled under ``cookie_dir``.

        Returns:
            dict: cookie name -> cookie value. Empty when the directory does
            not exist or holds no ``cookie_suffix`` files.
        """
        cookie_dict = {}
        for parent, _dirnames, filenames in os.walk(self.cookie_dir):
            for filename in filenames:
                if not filename.endswith(self.cookie_suffix):
                    continue
                logging.debug("loading cached cookie file %s", filename)
                # Join with the directory os.walk reported so that files
                # found in sub-directories are opened from the right place.
                cookie_path = os.path.join(parent, filename)
                # NOTE: pickle.load can execute arbitrary code; this is only
                # acceptable because the files are written by this spider.
                with open(cookie_path, 'rb') as f:
                    cookie = pickle.load(f)
                cookie_dict[cookie['name']] = cookie['value']
        return cookie_dict

    def start_requests(self):
        """Build the initial request with the login cookies attached.

        Cached cookies are reused when present; otherwise a Selenium login is
        performed and its cookies are cached for the next run.

        Returns:
            list: a single ``scrapy.Request`` for ``start_urls[0]``.
        """
        cookie_dict = self.get_cookie_from_cache()
        if not cookie_dict:
            cookie_dict = self._login_and_cache_cookies()
        return [scrapy.Request(url=self.start_urls[0], cookies=cookie_dict,
                               callback=self.parse)]

    def _login_and_cache_cookies(self):
        """Log in through the Selenium browser and pickle each cookie to disk.

        Returns:
            dict: cookie name -> cookie value as held by the browser after
            the login form was submitted.
        """
        browser = self.browser
        browser.get(self.login_url)
        browser.find_element_by_css_selector(
            "div:nth-child(2) > form > div:nth-child(1) > input"
        ).send_keys(self.login_username)
        browser.find_element_by_css_selector(
            "div:nth-child(2) > form > div:nth-child(2) > input"
        ).send_keys(self.login_password)
        browser.find_element_by_css_selector(
            "div:nth-child(2) > form > div.input_item.btn_group.clearfix > input"
        ).click()

        # Give the post-login redirect time to finish before reading cookies.
        time.sleep(self.login_wait_seconds)

        # Create the cache directory on first use so the writes cannot fail
        # merely because it is missing.
        if not os.path.isdir(self.cookie_dir):
            os.makedirs(self.cookie_dir)

        cookie_dict = {}
        cookies = browser.get_cookies()
        # Log only the count: the cookie values are session credentials.
        logging.debug("caching %d cookies from the browser session", len(cookies))
        for cookie in cookies:
            cookie_path = os.path.join(
                self.cookie_dir, cookie['name'] + self.cookie_suffix)
            with open(cookie_path, 'wb') as f:
                pickle.dump(cookie, f)
            cookie_dict[cookie['name']] = cookie['value']
        return cookie_dict

    def parse_job(self, response):
        """Callback for pages matched by ``rules``; parses a Lagou job page.

        Currently a stub: it logs a separator line and produces no items.
        """
        logging.info(u'-------------消息分割线-------------')
        return None
112017-12-18
相似问题