scrapy + selenium simulated Taobao login: login succeeds, but every request afterwards is 302-redirected back to the login page

Source: 9-5 Other dynamic-page scraping techniques - headless Chrome, scrapy-splash, selenium-grid, splinter

qq_祢奇_0

2019-04-17

import scrapy
import requests
import re, time, json, os, pickle
from TencentSpider.settings import BASE_DIR
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By



class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    allowed_domains = ['taobao.com']
    start_urls = ['https://login.taobao.com/']

    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    headers = {
        "user-agent": user_agent,
        "Upgrade-Insecure-Requests": "1",
        "Referer": "https://www.taobao.com/"
    }

    def parse(self, response):
        page_source = response.text
        #print(page_source)
        with open('keywords.txt', 'r', encoding='utf-8') as file:
            for keywords in file:
                # strip the 4-10 digit id, the newline and any spaces, leaving the bare keyword
                keyword = re.sub(r'(\d{4,10})', '', keywords)
                keyword = re.sub(r'\n', '', keyword)
                keyword = re.sub(r' ', '', keyword)
                key_id = re.findall(r'(\d{4,10})', keywords)[0]
                for i in range(1, 25):
                    # Taobao paginates 44 results per page via the `s` offset;
                    # without it this loop yields the same URL 24 times and the
                    # dupefilter drops the duplicates
                    offset = i * 44
                    url = 'https://s.taobao.com/search?q={0}&s={1}'.format(keyword, offset)

                    yield scrapy.Request(url, headers=self.headers, callback=self.parse_detail,
                                         meta={'key_id': key_id, 'keyword': keyword})

    def parse_detail(self, response):
        keyword = response.meta.get("keyword")
        print(response.body)
        image_all_ids = response.css('div[data-category="auctions"]::text').extract()
        for i in image_all_ids:
            image_url = 'https://item.taobao.com/item.htm?id={0}'.format(i)
            yield scrapy.Request(image_url, callback=self.get_image_urls, headers=self.headers)
        # next_page = response.css(".prev-disabled::text").extract()

        total_pages = response.css(".m-page .total::text").extract_first()
        print(total_pages)
        if total_pages:
            for i in range(int(total_pages)):
                offset = i * 44
                new_url = 'https://s.taobao.com/search?q={0}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20190417&ie=utf8&bcoffset=9&ntoffset=9&p4ppushleft=1%2C48&s={1}'.format(keyword, offset)
                # yield scrapy.Request(new_url, callback=self.parse_detail, headers=self.headers)
                pass



    def get_image_urls(self, response):
        pass


    def start_requests(self):
        url = 'https://login.taobao.com/member/login.jhtml'
        options = webdriver.ChromeOptions()
        # hide the automation switch so the page is less likely to detect webdriver
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        browser = webdriver.Chrome(executable_path="C:/Users/v_gccai/AppData/Local/Programs/Python/chromedriver.exe", options=options)
        wait = WebDriverWait(browser, 20)

        # prox = requests.get('http://api.ip.data5u.com/dynamic/get.html?order=78cefa327faa6e95ff2a1005f1bd3d4e&sep=3')
        # proxy = re.sub('\n', '', prox.text)
        # proxies = {
        #     'https': 'https://' + proxy,
        #     'http': 'http://' + proxy
        # }
        browser.get(url)
        # switch from the QR-code panel to password login, then to Weibo login
        password_login = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.qrcode-login > .login-links > .forget-pwd')))
        password_login.click()
        weibo_login = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.weibo-login')))
        weibo_login.click()
        weibo_user = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.username > .W_input')))
        weibo_user.send_keys("15975141646")
        weibo_pwd = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.password > .W_input')))
        weibo_pwd.send_keys("1230456mq")
        time.sleep(1)
        submit = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.btn_tip > a > span')))
        submit.click()
        time.sleep(10)

        # taobao_name = wait.until(EC.presence_of_element_located(
        #     (By.CSS_SELECTOR, '.site-nav-bd > ul.site-nav-bd-l > li#J_SiteNavLogin > div.site-nav-menu-hd > div.site-nav-user > a.site-nav-login-info-nick ')))

        Cookies = browser.get_cookies()
        print(Cookies)
        cookie_dict = {}
        # write the full cookie list to file once; the original opened and
        # overwrote the file on every loop iteration, so only one cookie survived
        with open(BASE_DIR + "/cookies/taobao.json", 'wb') as f:
            pickle.dump(Cookies, f)
        for cookie in Cookies:
            cookie_dict[cookie['name']] = cookie['value']
        print(cookie_dict)
        browser.quit()
        return [scrapy.Request(url=self.start_urls[0], headers=self.headers, cookies=cookie_dict,
                               callback=self.parse)]
The Selenium simulated login does get the cookies, but immediately after logging in the crawler is 302-redirected to the login page, and every request after that keeps returning 302.
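
Two things are worth checking when the cookies were obtained successfully but Scrapy still bounces back to the login page: Scrapy's cookie middleware must be enabled for the cookies= argument on Request to take effect, and the pickled cookies must be reloaded in the same shape they were saved in. A minimal sketch of the reload side, assuming the cookies were dumped as a full list as in start_requests above (the helper name is my own, not part of the original code):

import pickle
from TencentSpider.settings import BASE_DIR

def load_taobao_cookies():
    # reload the list saved by start_requests() and flatten it into the
    # {name: value} dict that scrapy.Request(cookies=...) expects
    with open(BASE_DIR + "/cookies/taobao.json", 'rb') as f:
        cookie_list = pickle.load(f)
    return {c['name']: c['value'] for c in cookie_list}

Note that if COOKIES_ENABLED is set to False in settings.py, CookiesMiddleware is disabled, the cookies= argument is silently ignored, and every follow-up request arrives logged out, which produces exactly this kind of 302 loop.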

1 Answer

bobby

2019-04-18

//img.mukewang.com/szimg/5cb84df70001000d06780175.jpg (screenshot of a settings snippet) Have you added this setting in settings.py, and have you added a user-agent? I just tried it and it works.
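
The screenshot above is not legible here, so the following settings.py entries are a guess at what bobby is pointing to; these are the settings most often involved when a logged-in session keeps redirecting (assumed values, not a transcription of the image):

# settings.py (assumed values, see note above)
COOKIES_ENABLED = True    # required for Request(cookies=...) to be honored
ROBOTSTXT_OBEY = False    # otherwise Scrapy may refuse requests forbidden by Taobao's robots.txt
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'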

qq_祢奇_0
Teacher, I've sent it to you on QQ. Could you take a look?
2019-04-18
