Unable to crawl Lagou job listings
Source: 7-4 Using Rule and LinkExtractor
刘明轩
2023-01-08
Hi teacher, I'm currently getting an error when crawling Lagou; the code and error log are below. Also, I can't analyze Lagou's pages with scrapy shell either: the status code is 200, but the request and the response show two different URLs. The method from your lectures a few years ago doesn't seem to work at all anymore. Are there any newer approaches to these problems?
```python
# Imports added for completeness; the ArticleSpider module paths are assumed
# from the course project layout.
from datetime import datetime

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ArticleSpider.items import LagouJobItem, LagouJobItemLoader
from ArticleSpider.utils.common import get_md5


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']
    rules = (
        Rule(LinkExtractor(allow=(r"wn/jobs\?kd=.*&city=.*",)), follow=True),
        # Rule(LinkExtractor(allow=(r"gongsi/v1/.*\.html",)), follow=True),
        # callback parses matched job detail pages; note the escaped dot in r"\.html"
        Rule(LinkExtractor(allow=r"jobs/\d+\.html"), callback='parse_job', follow=True),
    )
    manes = []  # unused; presumably a typo for "names"

    def parse_start_url(self, response, **kwargs):
        return []

    def process_results(self, response, results):
        return results

    def parse_job(self, response):
        # Parse a Lagou job detail page
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".position-head-wrap-name::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city", "//*[@class='job_request']//span[2]/text()")
        item_loader.add_xpath("work_years", "//*[@class='job_request']//span[3]/text()")
        item_loader.add_xpath("degree_need", "//*[@class='job_request']//span[4]/text()")
        item_loader.add_xpath("job_type", "//*[@class='job_request']//span[5]/text()")
        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())
        job_item = item_loader.load_item()
        return job_item
```
Error log (the identical traceback was logged twice; the duplicate is omitted):

```
2023-01-08 17:09:50 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: ArticleSpider)
2023-01-08 17:09:50 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.12, cssselect 1.2.0, parsel 1.7.0, w3lib 2.1.1, Twisted 22.10.0, Python 3.7.15 (default, Nov 24 2022, 18:44:54) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.4, Platform Windows-10-10.0.19041-SP0
2023-01-08 17:09:50 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2023-01-08 17:09:50 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.windows_events._WindowsSelectorEventLoop
2023-01-08 17:09:50 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'ArticleSpider', 'NEWSPIDER_MODULE': 'ArticleSpider.spiders', 'SPIDER_MODULES': ['ArticleSpider.spiders'], 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2023-01-08 17:09:51 [scrapy.extensions.telnet] INFO: Telnet Password: 8f862061dd73a1a3
2023-01-08 17:09:51 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2023-01-08 17:09:54 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2023-01-08 17:09:54 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2023-01-08 17:09:55 [scrapy.middleware] WARNING: Disabled ArticleImagePipeline: ImagesPipeline requires installing Pillow 4.0.0 or later
Unhandled error in Deferred:
2023-01-08 17:09:55 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\crawler.py", line 192, in crawl
    return self._crawl(crawler, *args, **kwargs)
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\crawler.py", line 196, in _crawl
    d = crawler.crawl(*args, **kwargs)
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\twisted\internet\defer.py", line 1947, in unwindGenerator
    return _cancellableInlineCallbacks(gen)
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\twisted\internet\defer.py", line 1857, in _cancellableInlineCallbacks
    _inlineCallbacks(None, gen, status, _copy_context())
--- <exception caught here> ---
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\twisted\internet\defer.py", line 1697, in _inlineCallbacks
    result = context.run(gen.send, result)
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\crawler.py", line 87, in crawl
    self.engine = self._create_engine()
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\crawler.py", line 101, in _create_engine
    return ExecutionEngine(self, lambda _: self.stop())
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\core\engine.py", line 70, in __init__
    self.scraper = Scraper(crawler)
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\core\scraper.py", line 71, in __init__
    self.itemproc = itemproc_cls.from_crawler(crawler)
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\middleware.py", line 53, in from_crawler
    return cls.from_settings(crawler.settings, crawler)
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\middleware.py", line 35, in from_settings
    mw = create_instance(mwcls, settings, crawler)
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\utils\misc.py", line 172, in create_instance
    instance = objcls(*args, **kwargs)
  File "C:\Users\Administrator\Desktop\ArticleSpider\ArticleSpider\pipelines.py", line 116, in __init__
    self.exporter.start_exporting()
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\site-packages\scrapy\exporters.py", line 120, in start_exporting
    self.file.write(b"[")
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\codecs.py", line 721, in write
    return self.writer.write(data)
  File "D:\Program Files (x86)\anaconda\envs\Spider\lib\codecs.py", line 377, in write
    data, consumed = self.encode(object, self.errors)
builtins.TypeError: utf_8_encode() argument 1 must be str, not bytes
```
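Note that this traceback has nothing to do with Lagou's anti-crawling measures: the crash happens while Scrapy is still building the engine, at `pipelines.py` line 116. `JsonItemExporter.start_exporting()` writes raw bytes (`b"["`), and the `codecs.py` frames show the export file was opened as a text stream via `codecs.open(..., encoding="utf-8")`, which only accepts `str`. A minimal sketch of the likely fix, assuming a course-style JSON export pipeline (the class and file names here are illustrative, not the actual `pipelines.py`):

```python
# Sketch of the likely fix: JsonItemExporter writes bytes, so the export
# file must be opened in binary mode rather than through codecs.open().
from scrapy.exporters import JsonItemExporter


class JsonExporterPipeline:
    def __init__(self):
        # "wb" accepts the raw bytes the exporter writes; filename is illustrative.
        self.file = open("articleexport.json", "wb")
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
```

With the export file opened in binary mode the engine should at least start. The scrapy shell symptom is a separate issue: when `request.url` and `response.url` differ even though the status is 200, it usually means the site redirected the request (e.g. to a login or verification page) and Scrapy's RedirectMiddleware followed it.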
1 Answer
bobby
2023-01-09
For Lagou I recommend crawling with Selenium, because Lagou changes its anti-crawling strategy frequently, and its cookie requirements also change often.
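A minimal sketch of that approach, assuming Chrome plus a matching chromedriver on PATH (the URL and User-Agent below are illustrative and will need adjusting whenever Lagou changes its checks):

```python
# Fetch a Lagou page with a real browser so it handles cookies and JS itself.
from selenium import webdriver


def fetch_lagou_page(url):
    options = webdriver.ChromeOptions()
    # Present a realistic browser User-Agent; Lagou's checks change often,
    # so this string may need updating.
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
    )
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        # In scrapy shell a redirect shows up as request.url != response.url;
        # with a real browser you can check where you actually landed:
        print(browser.current_url)
        return browser.page_source
    finally:
        browser.quit()


if __name__ == "__main__":
    html = fetch_lagou_page("https://www.lagou.com/")  # swap in a job detail URL
```

The returned HTML can be fed to the existing selectors via `scrapy.Selector(text=html)`, or the Selenium call can be moved into a downloader middleware that returns an `HtmlResponse`, so the spider code itself stays unchanged.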