301
Source: 7-4 Using Rule and LinkExtractor
ghy3030
2018-01-24
Hi teacher, I'm already using selenium to simulate the login, and I've added headers, but I still get a 301 error. How do I fix this?
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import pickle


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']  # URLs outside this domain are ignored
    start_urls = ['https://www.lagou.com/']
    # NOTE: these headers were copied over from the zhihu spider
    # ("zhizhu" is a typo for "zhihu"); for lagou.com the Host/Referer
    # should point at www.lagou.com instead.
    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhizhu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }

    rules = (
        Rule(LinkExtractor(allow=(r"zhaopin/.*",)), follow=True),
        Rule(LinkExtractor(allow=(r"gongsi/j\d+.html",)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True),
    )

    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results

    def parse_job(self, response):
        # Parse a lagou.com job posting
        i = {}
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        return i

    def start_requests(self):
        # First run: log in through selenium to capture the cookies
        from selenium import webdriver
        browser = webdriver.Chrome(executable_path=r"F:\chromedriver_win32\chromedriver.exe")
        browser.get("https://passport.lagou.com/login/login.html")
        browser.find_element_by_xpath(
            "//form[@class='active']/div[@class='input_item clearfix'][1]/input").send_keys("13736821938")
        browser.find_element_by_xpath(
            "//form[@class='active']/div[@class='input_item clearfix'][2]/input").send_keys("gehongYI88")
        print(10)
        browser.find_element_by_xpath(
            "//form[@class='active']/div[@class='input_item btn_group clearfix']/input").click()
        import time
        time.sleep(10)
        Cookies = browser.get_cookies()
        print(Cookies)
        cookie_dict = {}
        for cookie in Cookies:
            # Persist each cookie to its own pickle file
            f = open('F:/Users/hongyi/PycharmProjects/ArticleSpider/cookies/lagou' + cookie['name'] + '.zhihu', 'wb')
            pickle.dump(cookie, f)
            f.close()
            cookie_dict[cookie['name']] = cookie['value']
        browser.close()

        # # Later runs: load the saved cookies from disk instead (needs `import os`)
        # cookie_dict = {}
        # root_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'cookies')  # path to the cookies folder
        # for root, dirs, files in os.walk(root_dir):  # walk every file in the cookies folder
        #     for file in files:
        #         with open(os.path.join(root_dir, file), 'rb') as f:
        #             filecontent = pickle.load(f)  # load the pickled cookie
        #             cookie_dict[file.split('.')[0].split('zhihu')[1]] = filecontent['value']  # cookie value keyed by name

        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict, headers=self.headers)]
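Scrapy follows 3xx responses silently by default, so the spider never sees where the 301 points. A minimal debugging sketch using Scrapy's standard `dont_redirect` and `handle_httpstatus_list` request meta keys (`inspect_redirect` is a hypothetical callback name, not part of the course code):

    def start_requests(self):
        # ... selenium login as above, producing cookie_dict ...
        yield scrapy.Request(
            url=self.start_urls[0],
            dont_filter=True,
            cookies=cookie_dict,
            headers=self.headers,
            # Stop RedirectMiddleware from following the 301, and let the
            # 3xx response pass HttpErrorMiddleware through to the callback.
            meta={'dont_redirect': True, 'handle_httpstatus_list': [301, 302]},
            callback=self.inspect_redirect,
        )

    def inspect_redirect(self, response):
        # The Location header shows where lagou.com is redirecting us.
        print(response.status, response.headers.get('Location'))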
2 Answers
慕娘6095299
2018-10-17
Have you solved this yet? Please share.
bobby
2018-01-25
A 301 is a redirect: you've been identified as a crawler. Try restarting your router to switch to a new IP and see if that helps. lagou.com does its crawler detection by IP address.
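If restarting the router is impractical, the same idea can be automated with a downloader middleware that rotates the proxy IP per request. A minimal sketch, assuming you have your own pool of working proxies (the addresses and class name below are placeholders, not part of the course code):

    import random

    class RandomProxyMiddleware(object):
        # Placeholder proxy pool -- substitute real proxy URLs.
        PROXIES = [
            'http://127.0.0.1:8001',
            'http://127.0.0.1:8002',
        ]

        def process_request(self, request, spider):
            # Scrapy's built-in HttpProxyMiddleware honours the 'proxy' meta key.
            request.meta['proxy'] = random.choice(self.PROXIES)

It would then be enabled in settings.py, e.g. DOWNLOADER_MIDDLEWARES = {'ArticleSpider.middlewares.RandomProxyMiddleware': 543} (the module path is an assumption based on the project name in the question).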