After getting cookies with Selenium, how do I pass them to requests?

Source: 9-4 Integrating Selenium into Scrapy

一一倾

2017-12-01

After getting cookies with Selenium, how do I pass them to requests? After logging in to Zhihu I grab the cookies with cookie = browser.get_cookies() and convert them to a dict, but the requests session still can't log in to Zhihu and I don't know why. Here is the code after obtaining the cookies:


import requests

cookie = browser.get_cookies()
browser.quit()
# Flatten Selenium's list of cookie dicts into {name: value}
cookie_dict = {}
for i in cookie:
    cookie_dict[i["name"]] = i["value"]
print(cookie_dict)  # {'l_cap_id': '"ZWMzYzZkYWY0ZTQ4NDc4ODlkMzU3MzJkNzlhNGVlY2Y=|1512139745|3e31f928d222a1d2de9a5dee9c01526f94422263"', 'aliyungf_tc': 'AQAAALBEag3DnQAApq9K36fsygrOC3Ib', 'q_c1': '51125293f12e4fdd997cfa6ec71b94fd|1512139745000|1512139745000', 'cap_id': '"NWQ1OTJjMWJkNzc1NDg5YWExMGNkNzg0M2QyZTY1YmM=|1512139745|43df788f5cd11d0e1af296edc0a1bb000728fdce"', 'n_c': '1', '_xsrf': 'a89abcb83e4281411f87f1cab76b0fe6', 'd_c0': '"AFCC3QW3xAyPTitXZQrt0YPJfw9lPsrtUNI=|1512139745"', '_zap': '1cd257c8-c7ca-4210-8fb4-4ed7e62a6ce9', 'r_cap_id': '"NWE3M2I3ZWYzOTc2NDU5ZTkwOTgxZmFjMjQyZGJmOTU=|1512139745|b12eb3dbe2d657f0d46bfe108ebcc83ab85f096e"', 'l_n_c': '1'}
s = requests.Session()
s.post(url, headers=headers, cookies=cookie_dict)  # url and headers are defined earlier; neither POST nor GET works here
url2 = "https://www.zhihu.com/settings/profile"
login_code = s.get(url2, headers=headers, allow_redirects=False).status_code
print(login_code)  # prints 302, i.e. redirected away from the settings page: still not logged in
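For what it's worth, many sites reject a session whose headers differ from the browser that obtained the cookies, so a transfer that reuses the browser's exact User-Agent and keeps each cookie's domain and path is worth trying. A minimal sketch (the User-Agent below is just the one from the answer code; reuse whatever your browser actually sent):

import requests

s = requests.Session()
# Present the same User-Agent the browser used when the cookies were issued
s.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
})
# Load each Selenium cookie into the session with its domain and path intact
for c in browser.get_cookies():
    s.cookies.set(c["name"], c["value"], domain=c["domain"], path=c["path"])
browser.quit()

# A 200 here (no redirect) would mean the session is recognized as logged in
r = s.get("https://www.zhihu.com/settings/profile", allow_redirects=False)
print(r.status_code)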



1 Answer

bobby

2017-12-04

import logging
import os
import pickle
import time

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from selenium import webdriver


class LagouSpider(CrawlSpider):
    name = 'lagou_test'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    }
    rules = (
        Rule(LinkExtractor(allow=("www.lagou.com/zhaopin/",)), follow=True, callback='parse_job'),
    )
    def __init__(self):
        # Disable image loading so Selenium pages load faster
        chrome_options = webdriver.ChromeOptions()
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        self.browser = webdriver.Chrome(executable_path="E:/tmp/chromedriver.exe",
                                        chrome_options=chrome_options)
        super(LagouSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)
    def spider_closed(self, spider):
        # Quit Chrome when the spider exits
        print("spider closed")
        self.browser.quit()

    def get_cookie_from_cache(self):
        # Rebuild a {name: value} dict from cookies pickled by an earlier run
        cookie_dict = {}
        for parent, dirnames, filenames in os.walk('H:/scrapy/ArticleSpider/cookies/lagou'):
            for filename in filenames:
                if filename.endswith('.lagou'):
                    print(filename)
                    with open('H:/scrapy/ArticleSpider/cookies/lagou/' + filename, 'rb') as f:
                        d = pickle.load(f)
                        cookie_dict[d['name']] = d['value']
        return cookie_dict
    def start_requests(self):
        # Reuse cached cookies when we have them; otherwise log in with Selenium first
        cookie_dict = self.get_cookie_from_cache()
        if not cookie_dict:
            self.browser.get("https://passport.lagou.com/login/login.html")
            self.browser.find_element_by_css_selector(
                "div:nth-child(2) > form > div:nth-child(1) > input").send_keys("")  # your account here
            self.browser.find_element_by_css_selector(
                "div:nth-child(2) > form > div:nth-child(2) > input").send_keys("")  # your password here
            self.browser.find_element_by_css_selector(
                "div:nth-child(2) > form > div.input_item.btn_group.clearfix > input").click()
            time.sleep(10)  # crude wait for the login redirect to finish
            cookies = self.browser.get_cookies()
            print(cookies)
            cookie_dict = {}
            for cookie in cookies:
                # Pickle each cookie to disk so later runs can skip the login
                with open('H:/scrapy/ArticleSpider/cookies/lagou/' + cookie['name'] + '.lagou', 'wb') as f:
                    pickle.dump(cookie, f)
                cookie_dict[cookie['name']] = cookie['value']
        return [scrapy.Request(url=self.start_urls[0], cookies=cookie_dict, callback=self.parse)]
    def parse_job(self, response):
        # Parse a Lagou job listing page
        logging.info(u'------------- message divider -------------')
        response_text = response.text
        return

This is how the cookies from the Lagou login get attached to the Request; take a look. And remember to enable cookies in the Scrapy settings.
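Concretely, that means making sure the cookie middleware is on in settings.py; COOKIES_DEBUG is optional but handy for watching the cookie headers while testing:

# settings.py
COOKIES_ENABLED = True  # let Scrapy's cookie middleware store and resend cookies
COOKIES_DEBUG = True    # optional: log Cookie / Set-Cookie headers per request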

bobby replied to 一一倾 · 2017-12-05
OK.
