IndexError: list index out of range
来源:4-21 数据插入主键冲突的解决方法
ak918xp
2020-02-21
如题,很奇怪:第一天还可以正常爬取信息并入库,第二天再次运行就报错 IndexError: list index out of range。老师请帮我看看这个错误该怎么解决。
我在网上搜到,这可能是索引(index)超出了范围,或者 list 为空导致的。应该怎么修改呢?
class CnblogsSpider(scrapy.Spider):
    """Spider that collects news articles from news.cnblogs.com.

    ``parse`` walks the news list page and schedules one request per
    selected news block; ``parse_detail`` extracts the article fields
    from the detail page and hands the partially filled item on to
    ``self.parse_nums`` (not shown here; expected to be defined on this
    class).
    """

    name = 'cnblogs'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']

    def parse(self, response):
        """Yield a detail-page request for each selected news block.

        The cover image URL found on the list page is forwarded to
        ``parse_detail`` through ``meta["front_image_url"]``.
        """
        # NOTE(review): the [1:2] slice restricts the crawl to a single
        # news block -- presumably a debugging aid; drop it to crawl
        # every block on the page.
        post_nodes = response.css('#news_list .news_block')[1:2]
        for post_node in post_nodes:
            image_url = post_node.css(
                '.entry_summary a img::attr(src)'
            ).extract_first("")
            post_url = post_node.css('h2 a::attr(href)').extract_first("")
            yield Request(
                url=parse.urljoin(response.url, post_url),
                meta={"front_image_url": image_url},
                callback=self.parse_detail,
            )

        # Pagination is currently disabled. To re-enable it, locate the
        # "Next >" link and schedule it with this same callback:
        # next_url = response.xpath("//a[contains(text(),'Next >')]/@href").extract_first("")
        # if next_url:
        #     yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        """Extract the article fields from a news detail page.

        Yields a request to ``/NewsAjax/GetAjaxNewsInfo`` for the post id
        found in ``response.url``, carrying the partially filled
        ``CnblogsArticleItem`` in ``meta["article_item"]``. Nothing is
        yielded when the URL contains no digits, because the follow-up
        URL cannot be built without a post id.
        """
        id_match = re.match(r".*?(\d+)", response.url)
        if not id_match:
            return
        post_id = id_match.group(1)

        title = response.css("#news_title a::text").extract_first("")

        create_date = response.css("#news_info .time::text").extract_first("")
        # Strip any leading label text so the value starts at the first digit.
        date_match = re.match(r".*?(\d+.*)", create_date)
        if date_match:
            create_date = date_match.group(1)

        # extract_first("") rather than extract()[0]: indexing an empty
        # result list raises "IndexError: list index out of range" on
        # pages where the node is missing.
        content = response.css("#news_content").extract_first("")

        # Keep the whole list of tag texts. Indexing it with [0] fails for
        # articles without tags and, when it succeeds, leaves a single
        # string whose characters would be comma-joined below.
        tag_list = response.css(".news_tags a::text").extract()
        tags = ",".join(tag_list)

        front_image_url = response.meta.get("front_image_url", "")

        article_item = CnblogsArticleItem()
        article_item["title"] = title
        article_item["create_date"] = create_date
        article_item["content"] = content
        article_item["tags"] = tags
        article_item["url"] = response.url
        # Stored as a list (empty when the list page had no cover image).
        article_item["front_image_url"] = (
            [front_image_url] if front_image_url else []
        )

        yield Request(
            url=parse.urljoin(
                response.url,
                "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id),
            ),
            meta={"article_item": article_item},
            callback=self.parse_nums,
        )
写回答
1回答
-
ak918xp
提问者
2020-02-21
好奇怪,一会儿行一会儿又不行,结果取决于抓取到的新闻不同,我觉得应该是 index 超出范围了。
012020-02-22
相似问题