如果知乎 question 的 content 为空,就会报错
来源:6-20 保存数据到mysql中 -2
PearSheep
2018-01-11
Pipelines 底下的MysqlTwistedPipeline:
class MysqlTwistedPipeline(object):
def __init__(self, dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls, settings):
dbparms = dict(
host = settings["MYSQL_HOST"],
db = settings["MYSQL_DBNAME"],
user = settings["MYSQL_USER"],
passwd = settings["MYSQL_PASSWORD"],
charset = 'utf8',
cursorclass = MySQLdb.cursors.DictCursor,
use_unicode = True,
)
dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
return cls(dbpool)
def process_item(self, item, spider):
# 使用 twisted 将 mysql 插入变成异步执行
query = self.dbpool.runInteraction(self.do_insert, item)
query.addErrback(self.handle_error, item, spider) #处理异常
def handle_error(self, failure, item, spider):
# 处理异步插入的异常
print(failure)
def do_insert(self, cursor, item):
# 执行具体的插入
# 根据不同的 item 构建不同的 sql 语句并插入到 MySQL 中
insert_sql, params = item.get_insert_sql()
cursor.execute(insert_sql, params)class ZhihuQuestionItem(scrapy.Item):
#知乎的问题 item
zhihu_id = scrapy.Field()
topics = scrapy.Field()
url = scrapy.Field()
title = scrapy.Field()
content = scrapy.Field()
answer_num = scrapy.Field()
comment_num = scrapy.Field()
watch_user_num = scrapy.Field()
click_num = scrapy.Field()
crawl_time = scrapy.Field()
def get_insert_sql(self):
# 插入知乎 question 表的 SQL 语句
insert_sql = """
insert into zhihu_question (zhihu_id, topics, url, title, content, answer_num,
comments_num, watch_user_num, click_num, crawl_time)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
zhihu_id = self["zhihu_id"][0]
topics = ",".join(self["topics"])
url = self["url"][0]
title = "".join(self["title"])
content = "".join(self["content"])
answer_num = extract_num("".join(self["answer_num"]))
comments_num = extract_num("".join(self["comment_num"]))
if len(self["watch_user_num"]) == 2:
watch_user_num = self.str2int(self["watch_user_num"][0])
click_num = self.str2int(self["watch_user_num"][1])
else:
watch_user_num = self.str2int(self["watch_user_num"])
click_num = 0
crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
watch_user_num, click_num, crawl_time)
return insert_sql, params
def str2int(self, num):
num = num.replace(",", "")
return int(num)报错内容:
[Failure instance: Traceback: <class 'KeyError'>: 'content'
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/threading.py:914:_bootstrap_inner
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/threading.py:862:run
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/_threads/_threadworker.py:46:work
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/_threads/_team.py:190:doWork
--- <exception caught here> ---
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/python/threadpool.py:250:inContext
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/python/threadpool.py:266:<lambda>
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/python/context.py:122:callWithContext
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/python/context.py:85:callWithContext
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/enterprise/adbapi.py:477:_runInteraction
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/python/compat.py:463:reraise
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/enterprise/adbapi.py:467:_runInteraction
/Users/liyang/PythonProjects/ArticleSpider/ArticleSpider/pipelines.py:84:do_insert
/Users/liyang/PythonProjects/ArticleSpider/ArticleSpider/items.py:123:get_insert_sql
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/scrapy/item.py:59:__getitem__
]
1回答
-
PearSheep
提问者
2018-01-11
只需要将以下的代码:
content = "".join(self["content"])
替换为以下代码即可:
if "content" in self:
    content = "".join(self["content"])
else:
    content = self["content"] = "None"
谢谢老师!
112018-01-15
相似问题