如果知乎 question 的 content 为空,就会报错
来源:6-20 保存数据到mysql中 -2
PearSheep
2018-01-11
Pipelines 底下的MysqlTwistedPipeline: class MysqlTwistedPipeline(object): def __init__(self, dbpool): self.dbpool = dbpool @classmethod def from_settings(cls, settings): dbparms = dict( host = settings["MYSQL_HOST"], db = settings["MYSQL_DBNAME"], user = settings["MYSQL_USER"], passwd = settings["MYSQL_PASSWORD"], charset = 'utf8', cursorclass = MySQLdb.cursors.DictCursor, use_unicode = True, ) dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms) return cls(dbpool) def process_item(self, item, spider): # 使用 twisted 将 mysql 插入变成异步执行 query = self.dbpool.runInteraction(self.do_insert, item) query.addErrback(self.handle_error, item, spider) #处理异常 def handle_error(self, failure, item, spider): # 处理异步插入的异常 print(failure) def do_insert(self, cursor, item): # 执行具体的插入 # 根据不同的 item 构建不同的 sql 语句并插入到 MySQL 中 insert_sql, params = item.get_insert_sql() cursor.execute(insert_sql, params)
class ZhihuQuestionItem(scrapy.Item): #知乎的问题 item zhihu_id = scrapy.Field() topics = scrapy.Field() url = scrapy.Field() title = scrapy.Field() content = scrapy.Field() answer_num = scrapy.Field() comment_num = scrapy.Field() watch_user_num = scrapy.Field() click_num = scrapy.Field() crawl_time = scrapy.Field() def get_insert_sql(self): # 插入知乎 question 表的 SQL 语句 insert_sql = """ insert into zhihu_question (zhihu_id, topics, url, title, content, answer_num, comments_num, watch_user_num, click_num, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """ zhihu_id = self["zhihu_id"][0] topics = ",".join(self["topics"]) url = self["url"][0] title = "".join(self["title"]) content = "".join(self["content"]) answer_num = extract_num("".join(self["answer_num"])) comments_num = extract_num("".join(self["comment_num"])) if len(self["watch_user_num"]) == 2: watch_user_num = self.str2int(self["watch_user_num"][0]) click_num = self.str2int(self["watch_user_num"][1]) else: watch_user_num = self.str2int(self["watch_user_num"]) click_num = 0 crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT) params = (zhihu_id, topics, url, title, content, answer_num, comments_num, watch_user_num, click_num, crawl_time) return insert_sql, params def str2int(self, num): num = num.replace(",", "") return int(num)
报错内容:
[Failure instance: Traceback: <class 'KeyError'>: 'content'
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/threading.py:914:_bootstrap_inner
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/threading.py:862:run
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/_threads/_threadworker.py:46:work
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/_threads/_team.py:190:doWork
--- <exception caught here> ---
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/python/threadpool.py:250:inContext
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/python/threadpool.py:266:<lambda>
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/python/context.py:122:callWithContext
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/python/context.py:85:callWithContext
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/enterprise/adbapi.py:477:_runInteraction
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/python/compat.py:463:reraise
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/twisted/enterprise/adbapi.py:467:_runInteraction
/Users/liyang/PythonProjects/ArticleSpider/ArticleSpider/pipelines.py:84:do_insert
/Users/liyang/PythonProjects/ArticleSpider/ArticleSpider/items.py:123:get_insert_sql
/Users/liyang/workspaces/articlespider/lib/python3.5/site-packages/scrapy/item.py:59:__getitem__
]
1回答
-
PearSheep
提问者
2018-01-11
只需要将以下的代码:
content = "".join(self["content"])
替换为以下代码即可:
if "content" in self: content = "".join(self["content"]) else: content = self["content"] = "None"
谢谢老师!
112018-01-15
相似问题