无法进入parse函数
来源:6-14 item loder方式提取question - 1

浮生长恨欢娱少
2020-11-27
老师,你好!向你请教几个问题
1.在使用selenium已成功登录知乎后,想提取all_urls,进行debug时无法跳转到parse函数,代码会重新执行start_requestion函数进行登录,这时会报无法找到登录界面切换密码登录的元素而终止:raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {“method”:“xpath”,“selector”:"//div[@class=“SignFlow-tabs”]/div[2]"}
(Session info: chrome=86.0.4240.111)
2.我尝试使用try expect 语句将初始登录的代码写到expect中去,在已登录成功知乎的情况下在debug,也无法成功,报如下错误:
2020-11-27 01:34:28 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
Unhandled error in Deferred:
2020-11-27 01:34:30 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File “C:\Python\lib\site-packages\scrapy\crawler.py”, line 177, in crawl
return self._crawl(crawler, *args, **kwargs)
File “C:\Python\lib\site-packages\scrapy\crawler.py”, line 181, in _crawl
d = crawler.crawl(*args, **kwargs)
File “C:\Python\lib\site-packages\twisted\internet\defer.py”, line 1613, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File “C:\Python\lib\site-packages\twisted\internet\defer.py”, line 1529, in _cancellableInlineCallbacks
_inlineCallbacks(None, g, status)
— —
File “C:\Python\lib\site-packages\twisted\internet\defer.py”, line 1418, in _inlineCallbacks
result = g.send(result)
File “C:\Python\lib\site-packages\scrapy\crawler.py”, line 90, in crawl
start_requests = iter(self.spider.start_requests())
builtins.TypeError: ‘NoneType’ object is not iterable
2020-11-27 01:34:30 [twisted] CRITICAL:
Traceback (most recent call last):
File “C:\Python\lib\site-packages\twisted\internet\defer.py”, line 1418, in _inlineCallbacks
result = g.send(result)
File “C:\Python\lib\site-packages\scrapy\crawler.py”, line 90, in crawl
start_requests = iter(self.spider.start_requests())
TypeError: ‘NoneType’ object is not iterable
3.我尝试先退出知乎登录,然后再debug发现可以进入parse函数并成功返回了url值
如果每次debug时都要退出手动知乎后再进行其它操作就显得如此“笨比”,反复思考后认为操作2中的逻辑没有问题,但debug始终无法进入parse函数,多次尝试无果后请教老师,后续代码是哪里出了问题?
代码如下:`# -- coding: utf-8 --
class ZhihuSpider(scrapy.Spider):
name = 'zhihu’
allowed_domains = [‘www.zhihu.com’]
start_urls = [‘https://www.zhihu.com/’]
headers = {
"HOST": "www.zhihu.com",
"Referer": "https://www.zhizhu.com",
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}
def start_question(self):
pass
def parse_answer(self):
pass
def parse(self, response):
# cookie_dict = response.meta.get('cookies')
all_urls = response.css('a::attr(href)').extract()
all_urls = [parse.urljoin(response.url, url) for url in all_urls]
for url in all_urls:
print(url)
def start_requests(self):
# 知乎存在反爬,使用selenium驱动Chromedriver模拟登录
chrome_option = Options()
chrome_option.add_argument('--disable-extensions')
chrome_option.add_experimental_option('debuggerAddress', '127.0.0.1:9222')
# 指定Chrome浏览器绝对路径,否则会报浏览器启动路径错误
option_location = webdriver.ChromeOptions()
option_location.binary_location = r'F:/Google/Application/chrome.exe'
browser = webdriver.Chrome(executable_path='E:/Scrapy code/zhihu/chromedriver.exe', options=option_location,
chrome_options=chrome_option)
try:
browser.maximize_window()
except:
pass
browser.get('https://www.zhihu.com/signin')
# 若find_element_by_xpath/css().click()方法失效时使用获取屏幕坐标切换登录方式
# move(725, 280)
# click()
login_sign = False
try:
sign_success = browser.find_element_by_class_name('AppHeader-profile')
login_sign = True
except:
browser.find_element_by_xpath('//div[@class="SignFlow-tabs"]/div[2]').click()
time.sleep(1)
# 若出现重复添加用户名和密码,则使用下语句
# browser.find_element_by_css_selector('.SignFlow-accountInput.Input-wrapper input').send_keys(Keys.CONTROL + 'a')
browser.find_element_by_css_selector('.SignFlow-accountInput.Input-wrapper input').send_keys('15898518817')
time.sleep(1)
# browser.find_element_by_css_selector('.SignFlow-password input').send_keys(Keys.CONTROL+'a')
browser.find_element_by_css_selector('.SignFlow-password input').send_keys('12345678ggg')
time.sleep(1)
# 启动登录按钮,.click()方法失效时使用
# move(730, 522)
# click()
browser.find_element_by_css_selector('.Button.SignFlow-submitButton').click()
time.sleep(2)
# login_sign = False
while not login_sign:
try:
sign_success = browser.find_element_by_class_name('AppHeader-profile')
login_sign = True
except:
pass
try:
english_captcha_element = browser.find_element_by_class_name('Captcha-englishImg')
except:
english_captcha_element = None
try:
chinese_captcha_element = browser.find_element_by_class_name('Captcha-chineseImg')
except:
chinese_captcha_element = None
if chinese_captcha_element:
base64_text = chinese_captcha_element.get_attribute('src')
element_relative = chinese_captcha_element.location
x_relative = element_relative['x']
y_relative = element_relative['y']
browser_opposite = browser.execute_script('return window.outerHeight - window.innerHeight;')
code = base64_text.replace('data:image/jpg;base64,', '').replace('%0A', '')
fh = open('yzm_cn.jpeg', 'wb')
fh.write(base64.b64decode(code))
fh.close()
z = zheye()
positions = z.Recognize('yzm_cn.jpeg')
last_position = []
if len(positions) == 1:
last_position.append([positions[0][1], positions[0][0]])
move(x_relative + int(last_position[0][0]/2), y_relative + int(last_position[0][1]/2) + browser_opposite)
time.sleep(1)
click()
else:
if positions[0][1] > positions[1][1]:
last_position.append([positions[1][1], positions[1][0]])
last_position.append([positions[0][1], positions[0][0]])
else:
last_position.append([positions[0][1], positions[0][0]])
last_position.append([positions[1][1], positions[1][0]])
first_position = [int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
second_position = [int(last_position[1][0] / 2), int(last_position[1][1] / 2)]
move(x_relative+first_position[0], y_relative+first_position[1]+browser_opposite)
time.sleep(1)
click()
move(x_relative+second_position[0], y_relative+second_position[1]+browser_opposite)
time.sleep(1)
click()
print(last_position)
browser.find_element_by_css_selector('.SignFlow-accountInput.Input-wrapper input').send_keys(Keys.CONTROL + 'a')
browser.find_element_by_css_selector('.SignFlow-accountInput.Input-wrapper input').send_keys('15898518817')
time.sleep(1)
browser.find_element_by_css_selector('.SignFlow-password input').send_keys(Keys.CONTROL + 'a')
browser.find_element_by_css_selector('.SignFlow-password input').send_keys('12345678gg')
time.sleep(1)
browser.find_element_by_css_selector('.Button.SignFlow-submitButton').click()
if english_captcha_element:
base64_text = english_captcha_element.get_attribute('src')
code = base64_text.replace('data:image/jpg;base64,', '').replace('%0A', '')
fh = open('yzm_en.jpeg', 'wb')
fh.write(base64.b64decode(code))
fh.close()
cjy = chaojiying.Chaojiying_Client('personal', '123456789', '910064')
im = open('yzm_en.jpeg', 'rb').read()
js_dict = cjy.PostPic(im, 1902)
en_yzm = js_dict['pic_str']
while True:
if en_yzm == '':
en_yzm = cjy.PostPic(im, 1906)['pic_str']
else:
break
time.sleep(2)
browser.find_element_by_xpath('//div[@class="Captcha SignFlow-captchaContainer"]/div/div/label/input').send_keys(Keys.CONTROL + 'a')
browser.find_element_by_xpath('//div[@class="Captcha SignFlow-captchaContainer"]/div/div/label/input').send_keys(en_yzm)
time.sleep(1)
browser.find_element_by_css_selector('.SignFlow-accountInput.Input-wrapper input').send_keys(
Keys.CONTROL + 'a')
browser.find_element_by_css_selector('.SignFlow-accountInput.Input-wrapper input').send_keys(
'15898518817')
time.sleep(1)
browser.find_element_by_css_selector('.SignFlow-password input').send_keys(Keys.CONTROL + 'a')
browser.find_element_by_css_selector('.SignFlow-password input').send_keys('12345678gg')
time.sleep(1)
browser.find_element_by_css_selector('.Button.SignFlow-submitButton').click()
time.sleep(3)
try:
sign_success = browser.find_element_by_class_name("AppHeader-profile")
login_sign = True
cookies = browser.get_cookies()
pickle.dump(cookies, open('E:/Scrapy code/zhihu/zhihu_article/cookies/zhihu.cookie', 'wb'))
cookie_dict = {}
for cookie in cookies:
cookie_dict[cookie['name']] = cookie['value']
browser.close()
return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict, headers=self.headers)]
except:
pass
4.由于个人水平属实菜的离谱,观看爬取知乎这一章很费劲,加上知乎几经更改已和原视频的一些章节内容存在差距,动手起来大部分时间都在排错找错,希望能向老师提个建议,补录一两个视频用于知乎这章的前后新旧内容衔接,也能方便后续同学看到这里时能快速上手,谢谢老师!
1回答
-
这门课程我也在规划中 关于重新录制 或者补录的内容,但是因为我最近新上线的课程还没有录制完成,所以需要等到新课录制完成以后才会有时间, 如果可以的话 考虑将知乎章节全部重新录制一次
122020-11-30
相似问题