scrapy-redis: how to pass an Item between multiple parse callbacks? Using meta the program errors out; with meta removed it runs normally.
Source: 10-8 scrapy-redis source code analysis - scheduler.py, spider.py
Hi_Mike
2020-05-24
The source code is as follows:
# -*- coding: utf-8 -*-
import re
import logging
import scrapy
import ipdb
from urllib import parse
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider
from utils.common import get_md5
from items import MutoItem, MutoItemLoader

logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
                    level=logging.INFO)


class MutoSpider(RedisSpider):
    """Crawl the site via redis"""
    name = 'muto'
    allowed_domains = ['www.mutongzixun.com']
    # start_urls = 'https://www.mutongzixun.com'
    redis_key = 'muto:start_urls'
    headers = {
        "HOST": "www.mutongzixun.com",
        "Referer": "https://www.mutongzixun.com/",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def __init__(self, *args, **kwargs):
        super(MutoSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        # Collect all article URLs on the listing page
        all_urls = response.css(".row.mod-main-list a::attr(href)").extract()
        pattern = re.compile(r"^/index.*")
        # Drop duplicate links
        urls = list(set(all_urls))
        for url in urls:
            match_obj = pattern.match(url)
            if match_obj:
                url = match_obj.group(0)
                request_url = parse.urljoin(response.url, url)
                logging.info("Requesting URL: %s" % request_url)
                # ipdb.set_trace()
                yield Request(request_url,
                              headers=self.headers,
                              callback=self.parse_detail)
            else:
                continue
        # Fetch the next-page URL
        # next_url = response.xpath("//div[@class='row mod-main-list']//a[text()='下一页']/@href").extract()[0]
        # logging.info("Next page URL: %s" % next_url)
        # # ipdb.set_trace()
        # yield Request(next_url,
        #               headers=self.headers,
        #               callback=self.parse)

    def parse_detail(self, response):
        """Parse the news detail page"""
        match_re = re.match(r".*?(\d+)$", response.url)
        if match_re:
            detail_id = match_re.group(1)
            item_loader = MutoItemLoader(item=MutoItem(), response=response)
            item_loader.add_value("url_object_id", get_md5(response.url))
            item_loader.add_value("url", response.url)
            item_loader.add_css("title", "h3.text-center::text")
            item_loader.add_xpath("tag", "//ol[@class='breadcrumb']/li[2]/a/text()")
            date_str = response.xpath("(//p[@class='text-center']/text())[1]").extract_first()
            date_pattern = re.search(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})", date_str)
            date = date_pattern.group(0)
            item_loader.add_value("date", date)
            item_loader.add_css("content", "#show-content")
            item_loader.add_css("img_url", "#show-content img::attr(src)")
            item_loader.add_value('click_num', 0)
            click_num_url = parse.urljoin(response.url, '/api.php?op=count&id={}&modelid=1'.format(detail_id))
            # ipdb.set_trace()
            news_item = item_loader.load_item()
            yield news_item
            # yield Request(click_num_url,
            #               headers=self.headers,
            #               meta={"item_loader": item_loader},
            #               callback=self.parse_click_nums)

    def parse_click_nums(self, response):
        """Parse the click count returned by the API request"""
        item_loader = response.meta.get("item_loader")
        res = response.text
        match = re.search(r".*\('#hits'\).*'(\d+)'.*", res)
        click_num = match.group(1)
        item_loader.add_value("click_num", click_num)
        news_item = item_loader.load_item()
        # ipdb.set_trace()
        print("^"*100)
        print(news_item)
        print("^"*100)
        yield news_item
The parse_detail callback cannot extract everything that is needed on its own; an extra API request is required, i.e. the parse_click_nums callback still has to parse the click count. I tried the approach used in the cnblogs spider (passing the item between the two callbacks via meta), but the program raised an error, and I traced the cause to the meta parameter. If I make the request without meta the program runs fine. In scrapy-redis, what is the better way to pass an Item between multiple parse callbacks? Thanks!
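For reference, the root cause is that the scrapy-redis scheduler serializes every Request, including its meta dict, with pickle before pushing it into Redis, and the parsel Selector that an ItemLoader built from a response holds on to refuses to be pickled. A minimal sketch (my reproduction, not part of the original post; it assumes only the pickle call shown in the traceback below) demonstrates the failure outside the spider:

import pickle

from parsel import Selector

# A plain dict (or a loaded Item) in meta pickles without trouble.
meta_ok = {"item": {"title": "demo", "click_num": 0}}
print(len(pickle.dumps(meta_ok, protocol=-1)))

# Anything that holds a parsel Selector -- as an ItemLoader built from a
# response does -- fails with the TypeError seen in the traceback.
meta_bad = {"selector": Selector(text="<html></html>")}
try:
    pickle.dumps(meta_bad, protocol=-1)  # same call as scrapy_redis.picklecompat.dumps
except TypeError as exc:
    print(exc)  # can't pickle Selector objects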
2 Answers
-
Hi_Mike
(question asker)
2020-05-25
This issue has been solved: pass the item instead of the itemloader and it goes through fine.
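A minimal sketch of that approach (my illustration, not the asker's final code; it drops into the MutoSpider class above and reuses its imports and names): call load_item() in parse_detail, put the resulting Item in meta, and finish it in parse_click_nums. An Item behaves like a dict and pickles cleanly, so it survives the round trip through Redis.

    def parse_detail(self, response):
        """Build a partial item, then hand it to the click-count API callback."""
        match_re = re.match(r".*?(\d+)$", response.url)
        if not match_re:
            return
        detail_id = match_re.group(1)
        item_loader = MutoItemLoader(item=MutoItem(), response=response)
        # ... the same add_value / add_css / add_xpath calls as in the spider above ...
        news_item = item_loader.load_item()  # plain Item: picklable
        click_num_url = parse.urljoin(response.url,
                                      '/api.php?op=count&id={}&modelid=1'.format(detail_id))
        yield Request(click_num_url,
                      headers=self.headers,
                      meta={"item": news_item},  # pass the Item, not the loader
                      callback=self.parse_click_nums)

    def parse_click_nums(self, response):
        """Fill in the click count and emit the finished item."""
        news_item = response.meta["item"]
        match = re.search(r".*\('#hits'\).*'(\d+)'.*", response.text)
        if match:
            news_item["click_num"] = int(match.group(1))
        yield news_item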
-
Hi_Mike
(question asker)
2020-05-25
Passing meta produces the following error:
Traceback (most recent call last):
File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/twisted/internet/task.py", line 517, in _oneWorkUnit
result = next(self._iterator)
File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/scrapy/utils/defer.py", line 74, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/scrapy/core/scraper.py", line 193, in _process_spidermw_output
self.crawler.engine.crawl(request=output, spider=spider)
File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/scrapy/core/engine.py", line 216, in crawl
self.schedule(request, spider)
File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/scrapy/core/engine.py", line 222, in schedule
if not self.slot.scheduler.enqueue_request(request):
File "/Users/yangjiayuan/Workspace/MuTo/scrapy_redis/scheduler.py", line 159, in enqueue_request
self.queue.push(request)
File "/Users/yangjiayuan/Workspace/MuTo/scrapy_redis/queue.py", line 99, in push
data = self._encode_request(request)
File "/Users/yangjiayuan/Workspace/MuTo/scrapy_redis/queue.py", line 43, in _encode_request
return self.serializer.dumps(obj)
File "/Users/yangjiayuan/Workspace/MuTo/scrapy_redis/picklecompat.py", line 14, in dumps
return pickle.dumps(obj, protocol=-1)
File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/parsel/selector.py", line 222, in __getstate__
raise TypeError("can't pickle Selector objects")
TypeError: can't pickle Selector objects