Question for the teacher: rewriting the JD spider with a thread pool
Source: 9-13 Homework and Summary

JackyBreak
2020-03-22
I rewrote the spider following the earlier thread pool tutorial, but I ran into a problem. Crawling the good ids from all the result pages works, and from the number of browser windows open at the same time I can tell that multithreading is active during that stage. But the program never goes on to crawl each product's details; it just finishes with exit code 0. I have thought about it for a long time and cannot figure out the reason. Any help is appreciated, thanks!!
from selenium import webdriver
from scrapy import Selector
import json
import time
import re
from datetime import datetime
from jd_spider.models import *
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor
good_id_list = []
chrome_options = Options()
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("blink-settings=imagesEnabled=false")

def process_value(num_str):
    """
    extract number from a string
    :param num_str: a string containing a number and other characters
    :return: the number in the string on success; 0 on failure
    """
    re_match = re.search(r"(\d+)", num_str)
    if re_match:
        return int(re_match.group(1))
    return 0

def parse_good(goodid):
    browser = webdriver.Chrome(executable_path="C:/Users/JackyBreak/Downloads/chromedriver_win32/chromedriver.exe",
                               chrome_options=chrome_options)
    browser.get("https://item.jd.com/{}.html".format(goodid))
    sel = Selector(text=browser.page_source)
    good = Good(id=goodid)
    name = "".join(sel.xpath("//div[@class='sku-name']/text()").extract()).strip()
    price = "".join(sel.xpath("//span[@class='price J-p-{}']/text()".format(goodid)).extract()).strip()
    detail = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract()).strip()
    good_images = sel.xpath("//div[@id='spec-list']//img/@src").extract()
    supplier_info = "".join(sel.xpath("//div[@id='summary-service']").extract())
    re_match = re.search('<a href="//(.*).jd.com', supplier_info)
    if re_match:
        good.supplier = re_match.group(1)
    else:
        good.supplier = "京东"
    good.name = name
    good.price = price
    good.content = detail
    good.image_list = json.dumps(good_images)
    # open the "specs and packaging" (规格与包装) tab
    ggbz_btn = browser.find_element_by_xpath("//div[@id='detail']//li[contains(text(),'规格与包装')]")
    ggbz_btn.click()
    time.sleep(3)
    sel = Selector(text=browser.page_source)
    ggbz_detail = sel.xpath("//div[@id='detail']//div[@class='tab-con']/div[@style='display: block;']").extract()
    good.ggbz = ggbz_detail
    # open the comments tab
    comment_btn = browser.find_element_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']")
    comment_btn.click()
    time.sleep(5)
    sel = Selector(text=browser.page_source)
    tag_list = sel.xpath("//div[@class='tag-list tag-available']/span/text()").extract()
    good_rate_ratio = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])
    good.good_rate = good_rate_ratio
    # per-category comment counts (all / with images / with video / follow-up / good / neutral / bad)
    summary_as = sel.xpath("//ul[@class='filter-list']/li[@data-tab='trigger']")
    for summary in summary_as:
        name = summary.xpath("./a/text()").extract()[0]
        num_str = summary.xpath("./@data-num").extract()[0]
        num = process_value(num_str)
        if name == "全部评价":
            good.comments_nums = num
        elif name == "晒图":
            good.has_image_comment_nums = num
        elif name == "视频晒单":
            good.has_video_comment_nums = num
        elif name == "追评":
            good.has_add_comment_nums = num
        elif name == "好评":
            good.well_comment_nums = num
        elif name == "中评":
            good.middle_comment_nums = num
        elif name == "差评":
            good.bad_comment_nums = num
    # upsert: update if the good already exists, insert otherwise
    existed_good = Good.select().where(Good.id == good.id)
    if existed_good:
        good.save()
    else:
        good.save(force_insert=True)
    for tag in tag_list:
        re_match = re.search(r"(.*)\((\d+)\)", tag)
        if re_match:
            tag_name = re_match.group(1)
            nums = re_match.group(2)
            existed_summary = GoodEvaluateSummary.select().where(GoodEvaluateSummary.good == good,
                                                                 GoodEvaluateSummary.tag == tag_name)
            if existed_summary:
                summary = existed_summary[0]
            else:
                summary = GoodEvaluateSummary(good=good)
            summary.tag = tag_name
            summary.num = nums
            summary.save()
    # walk through every page of comments
    has_next_page = True
    while has_next_page:
        all_eva = sel.xpath("//div[@class='comment-item']")
        for item in all_eva:
            good_evaluate = GoodEvaluate(good=good)
            evaluate_id = item.xpath("./@data-guid").extract()[0]
            good_evaluate.id = evaluate_id
            user_head_url = "https:{}".format(item.xpath(".//div[@class='user-info']/img/@src").extract()[0])
            user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()
            good_evaluate.user_head_url = user_head_url
            good_evaluate.user_name = user_name
            star = item.xpath("./div[@class='comment-column J-comment-column']/div[1]/@class").extract()[0]
            star = int(star[-1])
            good_evaluate.star = star
            eva_comment = item.xpath("./div[@class='comment-column J-comment-column']/p["
                                     "@class='comment-con']/text()").extract()[0].strip()
            good_evaluate.content = eva_comment
            image_list = item.xpath("./div[@class='comment-column J-comment-column']/div[@class='pic-list "
                                    "J-pic-list']/a/img/@src").extract()
            video_list = item.xpath("./div[@class='comment-column J-comment-column']/div[@class='J-video-view-wrap "
                                    "clearfix']//video/@src").extract()
            good_evaluate.image_list = json.dumps(image_list)
            good_evaluate.video_list = json.dumps(video_list)
            comment_num = int(item.xpath(".//div[@class='comment-op']//a[3]/text()").extract()[0])
            praise_num = int(item.xpath(".//div[@class='comment-op']//a[2]/text()").extract()[0])
            good_evaluate.comment_nums = comment_num
            good_evaluate.praised_nums = praise_num
            good_info = item.xpath(".//div[@class='order-info']/span/text()").extract()
            eva_time = good_info[-1]
            good_info = good_info[:-1]
            good_info = json.dumps(good_info)
            eva_time = datetime.strptime(eva_time, "%Y-%m-%d %H:%M")
            good_evaluate.evaluate_time = eva_time
            good_evaluate.good_info = good_info
            existed_good_evaluate = GoodEvaluate.select().where(GoodEvaluate.id == good_evaluate.id)
            if existed_good_evaluate:
                good_evaluate.save()
            else:
                good_evaluate.save(force_insert=True)
        try:
            next_page_ele = browser.find_element_by_xpath("//a[@clstag='shangpin|keycount|product|pinglunfanye-nextpage']")
            next_page_ele.send_keys("\n")
            time.sleep(5)
            sel = Selector(text=browser.page_source)
        except NoSuchElementException:
            has_next_page = False

def get_all_pages(start_url):
    """
    Get the url of every result page.
    :param start_url: the search-result page; there are usually many pages
    :return: a list with the urls of all result pages
    """
    browser2 = webdriver.Chrome(executable_path="C:/Users/JackyBreak/Downloads/chromedriver_win32/chromedriver.exe",
                                chrome_options=chrome_options)
    browser2.get(start_url)
    sel = Selector(text=browser2.page_source)
    num = sel.xpath("//div[@class='page clearfix']//a/text()").extract()[-3]
    pages = []
    for i in range(1, 10):  # only test the first 9 pages for now
        new_page = "{}&page={}".format(start_url, i)
        pages.append(new_page)
    return pages

def get_good_ids(curr_page_url):
    """
    Get the good id of every product on the current page.
    :param curr_page_url: the url of the current page
    :return: submits each good_id to parse_good
    """
    # print("processing page: {}".format(page))
    browser3 = webdriver.Chrome(executable_path="C:/Users/JackyBreak/Downloads/chromedriver_win32/chromedriver.exe",
                                chrome_options=chrome_options)
    browser3.get(curr_page_url)
    sel = Selector(text=browser3.page_source)
    all_ids_part1 = sel.xpath("//div[@class='gl-i-wrap j-sku-item']/@data-sku").extract()
    all_ids_part2 = sel.xpath("//div[@class='gl-i-wrap j-sku-item ']/@data-sku").extract()
    all_ids = all_ids_part1 + all_ids_part2
    for curr_id in all_ids:
        print("processing good: https://item.jd.com/{}.html".format(curr_id))
        executor.submit(parse_good, curr_id)

if __name__ == "__main__":
    executor = ThreadPoolExecutor(max_workers=8)
    all_pages = get_all_pages("https://list.jd.com/list.html?cat=9987,653,655")  # a list with the urls of all result pages
    for page in all_pages:
        executor.submit(get_good_ids, page)  # run get_good_ids on each page url
I have tested each function on its own and they all work fine, but with the thread pool the program seems to jump straight over executor.submit(parse_good, curr_id) (line 201 in my file).
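For reference, a minimal sketch of the waiting pattern that would rule out a premature exit, assuming only the standard concurrent.futures API (page_futures is an illustrative name): keep the futures that submit() returns and block on them, so the main thread cannot fall off the end of the script while workers are still submitting parse_good tasks, and so any exception a task raised is re-raised instead of sitting silently on its future.

from concurrent.futures import ThreadPoolExecutor, wait

if __name__ == "__main__":
    executor = ThreadPoolExecutor(max_workers=8)
    all_pages = get_all_pages("https://list.jd.com/list.html?cat=9987,653,655")
    page_futures = [executor.submit(get_good_ids, page) for page in all_pages]
    wait(page_futures)            # block until every page task has run
    for f in page_futures:
        f.result()                # re-raises any exception the pool swallowed
    executor.shutdown(wait=True)  # then wait for the queued parse_good tasks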
3 Answers
JackyBreak (asker)
2020-03-26
from peewee import *
from datetime import date

db = MySQLDatabase("spider", host="127.0.0.1", port=3306, user="root", password="1234")

class BaseModel(Model):
    class Meta:
        database = db

class Good(BaseModel):
    id = IntegerField(primary_key=True, verbose_name="商品id")
    name = CharField(max_length=500, verbose_name="商品名称")
    content = TextField(default="", verbose_name="商品描述")
    supplier = CharField(max_length=500, default="")
    ggbz = TextField(default="", verbose_name="规格和包装")
    image_list = TextField(default="", verbose_name="商品的轮播图")
    price = FloatField(default=0.0, verbose_name="商品价格")
    good_rate = IntegerField(default=0, verbose_name="好评率")
    comments_nums = IntegerField(default=0, verbose_name="评论数")
    has_image_comment_nums = IntegerField(default=0, verbose_name="晒图数")
    has_video_comment_nums = IntegerField(default=0, verbose_name="视频晒单数")
    has_add_comment_nums = IntegerField(default=0, verbose_name="追评数")
    well_comment_nums = IntegerField(default=0, verbose_name="好评数")
    middle_comment_nums = IntegerField(default=0, verbose_name="中评数")
    bad_comment_nums = IntegerField(default=0, verbose_name="差评数")

class GoodEvaluate(BaseModel):
    id = CharField(primary_key=True)
    good = ForeignKeyField(Good, verbose_name="商品")
    user_head_url = CharField(verbose_name="用户头像")
    user_name = CharField(verbose_name="用户名")
    good_info = CharField(max_length=500, default="", verbose_name="购买的商品的信息")
    evaluate_time = DateTimeField(verbose_name="评价时间")
    content = TextField(default="", verbose_name="评论内容")
    star = IntegerField(default=0, verbose_name="评分")
    comment_nums = IntegerField(default=0, verbose_name="评论数")
    praised_nums = IntegerField(default=0, verbose_name="点赞数")
    image_list = TextField(default="", verbose_name="图片")
    video_list = TextField(default="", verbose_name="视频")

class GoodEvaluateSummary(BaseModel):
    good = ForeignKeyField(Good, verbose_name="商品")
    tag = CharField(max_length=20, verbose_name="标签")
    num = IntegerField(default=0, verbose_name="数量")

if __name__ == "__main__":
    db.create_tables([Good, GoodEvaluate, GoodEvaluateSummary])
bobby
2020-03-23
Since I don't have your model code here, I deleted everything model-related and ran it again. It was quite fast and opened a lot of Chrome windows. Could you check whether the model queries are what makes it slow? (A timing sketch follows the code below.)
from selenium import webdriver
from scrapy import Selector
import json
import time
import re
from datetime import datetime
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor

good_id_list = []
chrome_options = Options()
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("blink-settings=imagesEnabled=false")

def process_value(num_str):
    """
    extract number from a string
    :param num_str: a string containing a number and other characters
    :return: the number in the string on success; 0 on failure
    """
    re_match = re.search(r"(\d+)", num_str)
    if re_match:
        return int(re_match.group(1))
    return 0

def parse_good(goodid):
    browser = webdriver.Chrome(executable_path="D:/c盘下载/chromedriver/chromedriver.exe",
                               chrome_options=chrome_options)
    browser.get("https://item.jd.com/{}.html".format(goodid))
    sel = Selector(text=browser.page_source)
    name = "".join(sel.xpath("//div[@class='sku-name']/text()").extract()).strip()
    price = "".join(sel.xpath("//span[@class='price J-p-{}']/text()".format(goodid)).extract()).strip()
    detail = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract()).strip()
    good_images = sel.xpath("//div[@id='spec-list']//img/@src").extract()
    supplier_info = "".join(sel.xpath("//div[@id='summary-service']").extract())
    re_match = re.search('<a href="//(.*).jd.com', supplier_info)
    ggbz_btn = browser.find_element_by_xpath("//div[@id='detail']//li[contains(text(),'规格与包装')]")
    ggbz_btn.click()
    time.sleep(3)
    sel = Selector(text=browser.page_source)
    ggbz_detail = sel.xpath("//div[@id='detail']//div[@class='tab-con']/div[@style='display: block;']").extract()
    comment_btn = browser.find_element_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']")
    comment_btn.click()
    time.sleep(5)
    sel = Selector(text=browser.page_source)
    tag_list = sel.xpath("//div[@class='tag-list tag-available']/span/text()").extract()
    good_rate_ratio = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])
    summary_as = sel.xpath("//ul[@class='filter-list']/li[@data-tab='trigger']")
    for summary in summary_as:
        name = summary.xpath("./a/text()").extract()[0]
        num_str = summary.xpath("./@data-num").extract()[0]
        num = process_value(num_str)
    for tag in tag_list:
        re_match = re.search(r"(.*)\((\d+)\)", tag)
        if re_match:
            tag_name = re_match.group(1)
            nums = re_match.group(2)
            # (the model upsert that used these values was removed)
    has_next_page = True
    while has_next_page:
        all_eva = sel.xpath("//div[@class='comment-item']")
        for item in all_eva:
            evaluate_id = item.xpath("./@data-guid").extract()[0]
            user_head_url = "https:{}".format(item.xpath(".//div[@class='user-info']/img/@src").extract()[0])
            user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()
            star = item.xpath("./div[@class='comment-column J-comment-column']/div[1]/@class").extract()[0]
            star = int(star[-1])
            eva_comment = item.xpath("./div[@class='comment-column J-comment-column']/p["
                                     "@class='comment-con']/text()").extract()[0].strip()
            image_list = item.xpath("./div[@class='comment-column J-comment-column']/div[@class='pic-list "
                                    "J-pic-list']/a/img/@src").extract()
            video_list = item.xpath("./div[@class='comment-column J-comment-column']/div[@class='J-video-view-wrap "
                                    "clearfix']//video/@src").extract()
            comment_num = int(item.xpath(".//div[@class='comment-op']//a[3]/text()").extract()[0])
            praise_num = int(item.xpath(".//div[@class='comment-op']//a[2]/text()").extract()[0])
            good_info = item.xpath(".//div[@class='order-info']/span/text()").extract()
            eva_time = good_info[-1]
            good_info = good_info[:-1]
            good_info = json.dumps(good_info)
            eva_time = datetime.strptime(eva_time, "%Y-%m-%d %H:%M")
        try:
            next_page_ele = browser.find_element_by_xpath("//a[@clstag='shangpin|keycount|product|pinglunfanye-nextpage']")
            next_page_ele.send_keys("\n")
            time.sleep(5)
            sel = Selector(text=browser.page_source)
        except NoSuchElementException:
            has_next_page = False

def get_all_pages(start_url):
    """
    Get the url of every result page.
    :param start_url: the search-result page; there are usually many pages
    :return: a list with the urls of all result pages
    """
    browser2 = webdriver.Chrome(executable_path="D:/c盘下载/chromedriver/chromedriver.exe",
                                chrome_options=chrome_options)
    browser2.get(start_url)
    sel = Selector(text=browser2.page_source)
    num = sel.xpath("//div[@class='page clearfix']//a/text()").extract()[-3]
    pages = []
    for i in range(1, 10):  # only test the first 9 pages for now
        new_page = "{}&page={}".format(start_url, i)
        pages.append(new_page)
    return pages

def get_good_ids(curr_page_url):
    """
    Get the good id of every product on the current page.
    :param curr_page_url: the url of the current page
    :return: submits each good_id to parse_good
    """
    # print("processing page: {}".format(page))
    browser3 = webdriver.Chrome(executable_path="D:/c盘下载/chromedriver/chromedriver.exe",
                                chrome_options=chrome_options)
    browser3.get(curr_page_url)
    sel = Selector(text=browser3.page_source)
    all_ids_part1 = sel.xpath("//div[@class='gl-i-wrap j-sku-item']/@data-sku").extract()
    all_ids_part2 = sel.xpath("//div[@class='gl-i-wrap j-sku-item ']/@data-sku").extract()
    all_ids = all_ids_part1 + all_ids_part2
    for curr_id in all_ids:
        print("processing good: https://item.jd.com/{}.html".format(curr_id))
        executor.submit(parse_good, curr_id)

if __name__ == "__main__":
    executor = ThreadPoolExecutor(max_workers=8)
    all_pages = get_all_pages("https://list.jd.com/list.html?cat=9987,653,655")  # a list with the urls of all result pages
    for page in all_pages:
        executor.submit(get_good_ids, page)  # run get_good_ids on each page url
    while 1:
        time.sleep(1)  # keep the main thread alive so the pool can keep working
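To test that hypothesis, one quick probe (a sketch of my own; t0 and the print line are illustrative) is to time just the peewee upsert inside parse_good:

    import time

    # Hypothetical timing probe around the model upsert in parse_good,
    # to see whether the peewee queries are the slow part.
    t0 = time.time()
    existed_good = Good.select().where(Good.id == good.id)
    if existed_good:
        good.save()
    else:
        good.save(force_insert=True)
    print("db upsert for good {} took {:.2f}s".format(good.id, time.time() - t0))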
JackyBreak (asker)
2020-03-22
After adding the sleep loop to the main block, the program does go on to crawl the product details, but it is very slow and still seems to crawl one item at a time, and it uses more and more CPU the longer it runs. (See the sketch after the code below.)
if __name__ == "__main__":
    stop = False
    executor = ThreadPoolExecutor(max_workers=15)
    all_pages = get_all_pages("https://list.jd.com/list.html?cat=9987,653,655")  # a list with the urls of all result pages
    for page in all_pages:
        executor.submit(get_good_ids, page)  # run get_good_ids on each page url
    while not stop:
        time.sleep(1)
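One likely contributor to the slowness and the growing CPU usage (my guess, not something from the course material): every parse_good call launches a brand-new Chrome and never calls browser.quit(), so browsers accumulate as the run goes on. A sketch of reusing one browser per pool worker with threading.local; get_browser is a hypothetical helper that parse_good would call instead of webdriver.Chrome:

    import threading
    from selenium import webdriver

    _tls = threading.local()

    def get_browser():
        # One Chrome per worker thread instead of one per product page;
        # chrome_options is the module-level Options() defined above.
        if getattr(_tls, "browser", None) is None:
            _tls.browser = webdriver.Chrome(
                executable_path="C:/Users/JackyBreak/Downloads/chromedriver_win32/chromedriver.exe",
                chrome_options=chrome_options)
        return _tls.browser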