scrapy-redis 中多个解析函数之间如何传递 Item?通过 meta 传递时程序出错,去掉 meta 后正常。

来源:10-8 scrapy-redis源码分析- scheduler.py、spider.py-

Hi_Mike

2020-05-24

源码如下:

# -*- coding: utf-8 -*-
import re
import logging

import scrapy

import ipdb

from urllib import parse
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider

from utils.common import get_md5
from items import MutoItem, MutoItemLoader

# Configure the root logger at import time: INFO level, with each record
# showing timestamp, source path and line number, level name and message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
)

class MutoSpider(RedisSpider):
    """Spider for www.mutongzixun.com whose start URLs are supplied via Redis.

    ``parse`` handles listing pages, ``parse_detail`` builds a ``MutoItem``
    from an article page, and ``parse_click_nums`` fills in the click count
    from a separate API response.

    The partially filled item travels between callbacks as a *loaded item*
    in ``Request.meta``, never as the ``MutoItemLoader``: the loader holds a
    parsel ``Selector``, which refuses to be pickled, and scrapy-redis
    pickles every request before pushing it onto its queue
    (``TypeError: can't pickle Selector objects``).
    """
    name = 'muto'
    allowed_domains = ['www.mutongzixun.com']
    # Key used by RedisSpider (scrapy-redis) to fetch the start URLs.
    redis_key = 'muto:start_urls'

    headers = {
        "HOST": "www.mutongzixun.com",
        "Referer": "https://www.mutongzixun.com/",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def __init__(self, *args, **kwargs):
        super(MutoSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        """Yield one detail-page request per unique ``/index...`` link.

        Links are taken from the ``href`` of every anchor inside the
        ``.row.mod-main-list`` container of ``response``. Each matching link
        is resolved against ``response.url`` and scheduled with
        ``parse_detail`` as its callback.

        NOTE: pagination ("next page") is not followed by this callback.
        """
        hrefs = response.css(".row.mod-main-list a::attr(href)").extract()
        pattern = re.compile(r"^/index.*")
        # A set drops duplicate links so each detail page is requested once.
        for href in set(hrefs):
            match_obj = pattern.match(href)
            if not match_obj:
                continue
            request_url = parse.urljoin(response.url, match_obj.group(0))
            logging.info("请求的URL地址:%s", request_url)
            yield Request(request_url,
                          headers=self.headers,
                          callback=self.parse_detail)

    def parse_detail(self, response):
        """Parse a news detail page and hand the item on for its click count.

        All fields available on the page are loaded into a ``MutoItem``
        (``click_num`` defaults to 0). If the page URL ends in a numeric id,
        a request for the click-count API is yielded with the loaded item in
        ``meta["item"]``; otherwise the item is yielded as-is.

        A missing publish date is logged and the ``date`` field is left
        unset instead of aborting the whole callback.
        """
        item_loader = MutoItemLoader(item=MutoItem(), response=response)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("url", response.url)
        item_loader.add_css("title", "h3.text-center::text")
        item_loader.add_xpath("tag", "//ol[@class='breadcrumb']/li[2]/a/text()")

        date_str = response.xpath("(//p[@class='text-center']/text())[1]").extract_first()
        # extract_first() returns None when the node is absent; search "" then.
        date_match = re.search(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})", date_str or "")
        if date_match:
            item_loader.add_value("date", date_match.group(0))
        else:
            logging.warning("No publish date found on %s", response.url)

        item_loader.add_css("content", "#show-content")
        item_loader.add_css("img_url", "#show-content img::attr(src)")
        item_loader.add_value('click_num', 0)

        # Load the item now: only the item (not the loader, which keeps a
        # Selector) can be pickled by the scrapy-redis request queue.
        news_item = item_loader.load_item()

        id_match = re.match(r".*?(\d+)$", response.url)
        if not id_match:
            # Without an id the count API cannot be queried; keep the default.
            logging.warning("No numeric id in %s; click count left at default", response.url)
            yield news_item
            return

        click_num_url = parse.urljoin(
            response.url,
            '/api.php?op=count&id={}&modelid=1'.format(id_match.group(1)))
        yield Request(click_num_url,
                      headers=self.headers,
                      meta={"item": news_item},
                      callback=self.parse_click_nums,
                      errback=self.click_nums_failed)

    def parse_click_nums(self, response):
        """Read the click count from the API response and yield the item.

        The item is expected in ``response.meta["item"]``. For backward
        compatibility a loader passed as ``response.meta["item_loader"]`` is
        still honoured (this only works with schedulers that do not pickle
        requests). If the count cannot be found in the response body, the
        item is yielded with the ``click_num`` it already has.
        """
        match = re.search(r".*\('#hits'\).*'(\d+)'.*", response.text)
        click_num = match.group(1) if match else None
        if click_num is None:
            logging.warning("No click count found in response from %s", response.url)

        item_loader = response.meta.get("item_loader")
        if item_loader is not None:
            # Legacy path: finish loading through the loader that was passed.
            if click_num is not None:
                item_loader.add_value("click_num", click_num)
            yield item_loader.load_item()
            return

        news_item = response.meta.get("item")
        if news_item is None:
            logging.error("No item in meta for %s; nothing to yield", response.url)
            return
        if click_num is not None:
            # NOTE(review): assumes click_num is stored as a plain int, like
            # the default 0 set in parse_detail; confirm against the output
            # processor configured on MutoItemLoader / MutoItem.
            news_item["click_num"] = int(click_num)
        yield news_item

    def click_nums_failed(self, failure):
        """Errback for the click-count request: keep the item despite the failure.

        Yields the item carried in the failed request's ``meta["item"]`` so a
        broken count API does not cost the whole article.
        """
        request = failure.request
        logging.warning("Click count request failed for %s: %s", request.url, failure.value)
        news_item = request.meta.get("item")
        if news_item is not None:
            yield news_item

parse_detail 解析函数无法解析出所有需要的内容,点击次数需要另外请求接口获取,也就是还需要回调函数 parse_click_nums 来解析。我尝试按照 cnblog 示例的方式,通过 meta 在两个函数之间传递 item_loader,但是程序报错,查明是 meta 参数导致的;请求时不传 meta,程序正常。请问在 scrapy-redis 中,多个解析函数之间如何更好地传递 Item?感谢!

写回答

2回答

Hi_Mike

提问者

2020-05-25

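此问题已解决:通过 meta 传递 Item 对象,而不是 ItemLoader,即可正常传递。(ItemLoader 内部持有 Selector 对象,而 scrapy-redis 在请求入队前会用 pickle 序列化整个 Request,Selector 无法被 pickle。)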

1
1
bobby
好的,
2020-05-26
共1条回复

Hi_Mike

提问者

2020-05-25

通过 meta 传递 item_loader 时出现如下错误:

Traceback (most recent call last):

  File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/twisted/internet/task.py", line 517, in _oneWorkUnit

    result = next(self._iterator)

  File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/scrapy/utils/defer.py", line 74, in <genexpr>

    work = (callable(elem, *args, **named) for elem in iterable)

  File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/scrapy/core/scraper.py", line 193, in _process_spidermw_output

    self.crawler.engine.crawl(request=output, spider=spider)

  File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/scrapy/core/engine.py", line 216, in crawl

    self.schedule(request, spider)

  File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/scrapy/core/engine.py", line 222, in schedule

    if not self.slot.scheduler.enqueue_request(request):

  File "/Users/yangjiayuan/Workspace/MuTo/scrapy_redis/scheduler.py", line 159, in enqueue_request

    self.queue.push(request)

  File "/Users/yangjiayuan/Workspace/MuTo/scrapy_redis/queue.py", line 99, in push

    data = self._encode_request(request)

  File "/Users/yangjiayuan/Workspace/MuTo/scrapy_redis/queue.py", line 43, in _encode_request

    return self.serializer.dumps(obj)

  File "/Users/yangjiayuan/Workspace/MuTo/scrapy_redis/picklecompat.py", line 14, in dumps

    return pickle.dumps(obj, protocol=-1)

  File "/Users/yangjiayuan/Workspace/virtualenv/Spider/lib/python3.7/site-packages/parsel/selector.py", line 222, in __getstate__

    raise TypeError("can't pickle Selector objects")

TypeError: can't pickle Selector objects


0
0

Scrapy打造搜索引擎 畅销4年的Python分布式爬虫课

带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎

5796 学习 · 6290 问题

查看课程