Unable to crawl Lagou job listings
Source: 7-4 Using Rule and LinkExtractor
刘明轩
2023-01-08
Hi teacher, I'm currently getting an error when crawling lagou.com; the code and error log are below. I also can't inspect Lagou's pages with scrapy shell: the status code is 200, but the request URL and the response URL turn out to be two different URLs. The approach from your lectures a few years ago no longer seems to work at all. Are there newer ways to solve these problems?
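A minimal way to reproduce the URL mismatch in scrapy shell (the job URL below is just a placeholder):

    scrapy shell
    >>> fetch("https://www.lagou.com/jobs/123456.html")   # placeholder job URL
    >>> request.url     # the URL that was actually requested
    >>> response.url    # comes back different: the 200 belongs to a redirect target
    >>> fetch("https://www.lagou.com/jobs/123456.html", redirect=False)
    >>> response.status  # with redirects disabled, the 30x status is visible directly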
from datetime import datetime

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# project-local imports; adjust the paths to your own project layout
from ArticleSpider.items import LagouJobItem, LagouJobItemLoader
from ArticleSpider.utils.common import get_md5


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    rules = (
        # follow search-result pages (raw strings so the regex escapes survive)
        Rule(LinkExtractor(allow=(r"wn/jobs\?kd=.*&city=.*",)), follow=True),
        # Rule(LinkExtractor(allow=(r"gongsi/v1/.*\.html",)), follow=True),
        # job-detail pages are handed to the parse_job callback
        Rule(LinkExtractor(allow=r"jobs/\d+\.html"), callback='parse_job', follow=True),
    )

    def parse_start_url(self, response, **kwargs):
        return []

    def process_results(self, response, results):
        return results

    def parse_job(self, response):
        # parse a Lagou job-detail page into a LagouJobItem
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".position-head-wrap-name::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city", "//*[@class='job_request']//span[2]/text()")
        item_loader.add_xpath("work_years", "//*[@class='job_request']//span[3]/text()")
        item_loader.add_xpath("degree_need", "//*[@class='job_request']//span[4]/text()")
        item_loader.add_xpath("job_type", "//*[@class='job_request']//span[5]/text()")
        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())
        job_item = item_loader.load_item()
        return job_item

Error log:
2023-01-08 17:09:50 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: ArticleSpider)
2023-01-08 17:09:50 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.12, cssselect 1.2.0, parsel 1.7.0, w3lib 2.1.1, Twisted 22.10.0, Python 3.7.15 (default, Nov 24 2022, 18:44:54) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.4, Platform Windows-10-10.0.19041-SP0
2023-01-08 17:09:50 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2023-01-08 17:09:50 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.windows_events._WindowsSelectorEventLoop
2023-01-08 17:09:50 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'ArticleSpider',
'NEWSPIDER_MODULE': 'ArticleSpider.spiders',
'SPIDER_MODULES': ['ArticleSpider.spiders'],
'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2023-01-08 17:09:51 [scrapy.extensions.telnet] INFO: Telnet Password: 8f862061dd73a1a3
2023-01-08 17:09:51 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2023-01-08 17:09:54 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2023-01-08 17:09:54 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2023-01-08 17:09:55 [scrapy.middleware] WARNING: Disabled ArticleImagePipeline: ImagesPipeline requires installing Pillow 4.0.0 or later
Unhandled error in Deferred:
2023-01-08 17:09:55 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\crawler.py", line 192, in crawl
return self._crawl(crawler, *args, **kwargs)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\crawler.py", line 196, in _crawl
d = crawler.crawl(*args, **kwargs)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\twisted\internet\defer.py", line 1947, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\twisted\internet\defer.py", line 1857, in _cancellableInlineCallbacks
_inlineCallbacks(None, gen, status, _copy_context())
--- <exception caught here> ---
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\twisted\internet\defer.py", line 1697, in _inlineCallbacks
result = context.run(gen.send, result)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\crawler.py", line 87, in crawl
self.engine = self._create_engine()
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\crawler.py", line 101, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\core\engine.py", line 70, in __init__
self.scraper = Scraper(crawler)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\core\scraper.py", line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\middleware.py", line 53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\middleware.py", line 35, in from_settings
mw = create_instance(mwcls, settings, crawler)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\utils\misc.py", line 172, in create_instance
instance = objcls(*args, **kwargs)
File "C:\Users\Administrator\Desktop\ArticleSpider\ArticleSpider\pipelines.py", line 116, in __init__
self.exporter.start_exporting()
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\exporters.py", line 120, in start_exporting
self.file.write(b"[")
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\codecs.py", line 721, in write
return self.writer.write(data)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\codecs.py", line 377, in write
data, consumed = self.encode(object, self.errors)
builtins.TypeError: utf_8_encode() argument 1 must be str, not bytes
2023-01-08 17:09:55 [twisted] CRITICAL:
Traceback (most recent call last):
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\twisted\internet\defer.py", line 1697, in _inlineCallbacks
result = context.run(gen.send, result)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\crawler.py", line 87, in crawl
self.engine = self._create_engine()
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\crawler.py", line 101, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\core\engine.py", line 70, in __init__
self.scraper = Scraper(crawler)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\core\scraper.py", line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\middleware.py", line 53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\middleware.py", line 35, in from_settings
mw = create_instance(mwcls, settings, crawler)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\utils\misc.py", line 172, in create_instance
instance = objcls(*args, **kwargs)
File "C:\Users\Administrator\Desktop\ArticleSpider\ArticleSpider\pipelines.py", line 116, in __init__
self.exporter.start_exporting()
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\exporters.py", line 120, in start_exporting
self.file.write(b"[")
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\codecs.py", line 721, in write
return self.writer.write(data)
File "D:\Program Files (x86)\anaconda\envs\Spider\lib\codecs.py", line 377, in write
data, consumed = self.encode(object, self.errors)
TypeError: utf_8_encode() argument 1 must be str, not bytes
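The bottom frames of the traceback show that the immediate crash has nothing to do with Lagou itself: in pipelines.py line 116 the JSON exporter writes bytes (b"[") into a file that was opened in text mode through codecs, which only accepts str. A minimal sketch of the likely fix, assuming the pipeline wraps Scrapy's JsonItemExporter (the filename is an example), is to open the export file in binary mode:

    from scrapy.exporters import JsonItemExporter

    class JsonExporterPipeline:
        def __init__(self):
            # JsonItemExporter writes bytes, so open the file in binary mode
            # ('wb') instead of codecs.open(..., encoding='utf-8'), which expects str
            self.file = open('articleexport.json', 'wb')
            self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
            self.exporter.start_exporting()

        def close_spider(self, spider):
            self.exporter.finish_exporting()
            self.file.close()

        def process_item(self, item, spider):
            self.exporter.export_item(item)
            return item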
1 Answer
bobby
2023-01-09
For Lagou I'd recommend scraping with Selenium: Lagou changes its anti-crawling strategy frequently, and its cookie requirements also change often.
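A minimal sketch of that approach (assuming ChromeDriver is available; the job URL is a placeholder, and the CSS selector is taken from the spider above, so it will break whenever Lagou changes its markup):

    from scrapy.selector import Selector
    from selenium import webdriver

    driver = webdriver.Chrome()  # assumes chromedriver is installed / on PATH
    driver.get("https://www.lagou.com/jobs/123456.html")  # placeholder job URL
    # complete any login or verification step in the opened browser window first,
    # then parse the fully rendered page with Scrapy's own selectors
    sel = Selector(text=driver.page_source)
    print(sel.css(".position-head-wrap-name::text").get())
    driver.quit()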