scrapy+selenium simulating Taobao login: after a successful login, every request is 302-redirected back to the login page
Source: 9-5 Introduction to other dynamic-page scraping techniques - headless Chrome, scrapy-splash, selenium-grid, splinter
qq_祢奇_0
2019-04-17
import scrapy
import requests
import re,time,json,os,pickle
from TencentSpider.settings import BASE_DIR
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    allowed_domains = ['taobao.com']
    start_urls = ['https://login.taobao.com/']
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    headers = {
        "user-agent": user_agent,
        "Upgrade-Insecure-Requests": "1",  # header values must be strings, not ints
        "Referer": "https://www.taobao.com/"
    }
    def parse(self, response):
        page_source = response.text
        # print(page_source)
        with open(r'keywords.txt', 'r', encoding='utf-8') as file:
            for keywords in file:
                # each line carries a keyword plus a 4-10 digit id; split them apart
                keyword = re.sub(r'(\d{4,10})', '', keywords)
                keyword = re.sub(r'\n', '', keyword)
                keyword = re.sub(r' ', '', keyword)
                key_id = re.findall(r'(\d{4,10})', keywords)[0]
                for i in range(1, 25):
                    i *= 44  # Taobao search paginates in steps of 44 items
                    # url = 'https://s.taobao.com/search?q={0}&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=-6&ntoffset=-6&p4ppushleft=1%2C48&s={1}'.format(keyword, i)
                    url = 'https://s.taobao.com/search?q={0}'.format(keyword)
                    yield scrapy.Request(url, headers=self.headers, callback=self.parse_detail,
                                         meta={'key_id': key_id, 'keyword': keyword})
    def parse_detail(self, response):
        keyword = response.meta.get("keyword")
        print(response.body)
        image_all_ids = response.css('div[data-category="auctions"]::text').extract()
        for i in image_all_ids:
            image_url = 'https://item.taobao.com/item.htm?id={0}'.format(i)
            yield scrapy.Request(image_url, callback=self.get_image_urls, headers=self.headers)
        # next_page = response.css(".prev-disabled::text").extract()
        total_pages = response.css(".m-page .total::text").extract_first()
        print(total_pages)
        if total_pages:
            for i in range(int(total_pages)):
                i = i * 44
                new_url = 'https://s.taobao.com/search?q={0}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20190417&ie=utf8&bcoffset=9&ntoffset=9&p4ppushleft=1%2C48&s={1}'.format(keyword, i)
                # yield scrapy.Request(new_url, callback=self.parse_detail, headers=self.headers)
                pass

    def get_image_urls(self, response):
        pass
    def start_requests(self):
        url = 'https://login.taobao.com/member/login.jhtml'
        options = webdriver.ChromeOptions()
        # hide the "controlled by automated software" infobar that sites can detect
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        browser = webdriver.Chrome(
            executable_path="C:/Users/v_gccai/AppData/Local/Programs/Python/chromedriver.exe",
            options=options)
        wait = WebDriverWait(browser, 20)
        # prox = requests.get('http://api.ip.data5u.com/dynamic/get.html?order=78cefa327faa6e95ff2a1005f1bd3d4e&sep=3')
        # proxy = re.sub('\n', '', prox.text)
        # proxies = {
        #     'https': 'https://' + proxy,
        #     'http': 'http://' + proxy
        # }
        browser.get(url)
        # switch from the QR-code login panel to password login, then to Weibo login
        password_login = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.qrcode-login > .login-links > .forget-pwd')))
        password_login.click()
        weibo_login = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.weibo-login')))
        weibo_login.click()
        weibo_user = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.username > .W_input')))
        weibo_user.send_keys("15975141646")
        weibo_pwd = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.password > .W_input')))
        weibo_pwd.send_keys("1230456mq")
        time.sleep(1)
        submit = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.btn_tip > a > span')))
        submit.click()
        time.sleep(10)
        # taobao_name = wait.until(EC.presence_of_element_located(
        #     (By.CSS_SELECTOR, '.site-nav-bd > ul.site-nav-bd-l > li#J_SiteNavLogin > div.site-nav-menu-hd > div.site-nav-user > a.site-nav-login-info-nick')))
        cookies = browser.get_cookies()
        print(cookies)
        # dump the whole cookie list in one write; opening the file inside the
        # loop would overwrite it each time and keep only the last cookie
        with open(BASE_DIR + "/cookies/taobao.json", 'wb') as f:
            pickle.dump(cookies, f)
        cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}
        print(cookie_dict)
        return [scrapy.Request(url=self.start_urls[0], headers=self.headers, cookies=cookie_dict,
                               callback=self.parse)]
The Selenium login does get the cookies, but right after logging in every request gets a 302 back to the login page, and it stays 302 from then on.
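To narrow this down, it helps to confirm whether the cookies are actually attached to each Scrapy request and which 302 response drops or replaces them. Scrapy's CookiesMiddleware has a built-in debug switch for exactly this; a minimal sketch of the relevant settings.py entries (these are standard Scrapy settings):

# settings.py
COOKIES_ENABLED = True   # the default, but it must not have been switched off
COOKIES_DEBUG = True     # log every Cookie sent and every Set-Cookie received

With COOKIES_DEBUG on, the log shows the Cookie header of each outgoing request and the Set-Cookie headers of each redirect response, which reveals whether the login cookies are missing, dropped, or being overwritten along the 302 chain.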
1 Answer
bobby
2019-04-18
Have you added this setting in settings, and have you added a user-agent? I just tried it and it works.
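The exact setting bobby refers to is not shown in the thread, but for this spider the usual candidates are the robots.txt switch and a global user-agent. A plausible settings.py sketch; these values are assumptions, not confirmed by the answer:

# settings.py -- assumed values, not confirmed in the thread
ROBOTSTXT_OBEY = False  # Taobao's robots.txt disallows crawling; the default True makes Scrapy drop the requests
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
COOKIES_ENABLED = True  # keep the logged-in session across the redirect chain

One common cause of this kind of 302 loop is a user-agent mismatch: the cookies are issued to Selenium's Chrome, so if Scrapy then sends a different user-agent string, the server may invalidate the session and redirect back to login. Setting USER_AGENT globally keeps both sides consistent.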