A question for the instructor about rewriting the JD crawler with a thread pool

Source: 9-13 Homework and Summary

JackyBreak

2020-03-22

I rewrote my crawler following the earlier thread pool tutorial, but ran into a problem. It successfully crawls the good ids on every result page, and from the number of browser windows open at the same time I can tell the multithreading is working during the id crawl. But it never goes on to crawl each good's detail page; instead it just finishes with exit code 0. I have thought about it for a long time and can't figure out why. Thanks for any help!!

from selenium import webdriver
from scrapy import Selector
import json
import time
import re
from datetime import datetime
from jd_spider.models import *
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor

good_id_list = []
chrome_options = Options()
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("blink-settings=imagesEnabled=false")


def process_value(num_str):
    """
    extract number from a string
    :param num_str: a string containing a number and other characters
    :return: the number found in the string, or 0 if none
    """
    re_match = re.search(r"(\d+)", num_str)
    if re_match:
        return int(re_match.group(1))
    else:
        return 0


def parse_good(goodid):
    browser = webdriver.Chrome(executable_path="C:/Users/JackyBreak/Downloads/chromedriver_win32/chromedriver.exe",
                               chrome_options=chrome_options)
    browser.get("https://item.jd.com/{}.html".format(goodid))

    sel = Selector(text=browser.page_source)

    good = Good(id=goodid)
    name = "".join(sel.xpath("//div[@class='sku-name']/text()").extract()).strip()
    price = "".join(sel.xpath("//span[@class='price J-p-{}']/text()".format(goodid)).extract()).strip()
    detail = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract()).strip()
    good_images = sel.xpath("//div[@id='spec-list']//img/@src").extract()
    supplier_info = "".join(sel.xpath("//div[@id='summary-service']").extract())
    re_match = re.search('<a href="//(.*).jd.com', supplier_info)
    if re_match:
        good.supplier = re_match.group(1)
    else:
        good.supplier = "京东"

    good.name = name
    good.price = price
    good.content = detail
    good.image_list = json.dumps(good_images)

    ggbz_btn = browser.find_element_by_xpath("//div[@id='detail']//li[contains(text(),'规格与包装')]")
    ggbz_btn.click()
    time.sleep(3)
    sel = Selector(text=browser.page_source)
    ggbz_detail = sel.xpath("//div[@id='detail']//div[@class='tab-con']/div[@style='display: block;']").extract()
    good.ggbz = ggbz_detail

    comment_btn = browser.find_element_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']")
    comment_btn.click()
    time.sleep(5)
    sel = Selector(text=browser.page_source)
    tag_list = sel.xpath("//div[@class='tag-list tag-available']/span/text()").extract()
    good_rate_ratio = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])
    good.good_rate = good_rate_ratio

    summary_as = sel.xpath("//ul[@class='filter-list']/li[@data-tab='trigger']")
    for summary in summary_as:
        name = summary.xpath("./a/text()").extract()[0]
        num_str = summary.xpath("./@data-num").extract()[0]
        num = process_value(num_str)

        if name == "全部评价":
            good.comments_nums = num
        elif name == "晒图":
            good.has_image_comment_nums = num
        elif name == "视频晒单":
            good.has_video_comment_nums = num
        elif name == "追评":
            good.has_add_comment_nums = num
        elif name == "好评":
            good.well_comment_nums = num
        elif name == "中评":
            good.middle_comment_nums = num
        elif name == "差评":
            good.bad_comment_nums = num

    existed_good = Good.select().where(Good.id == good.id)
    if existed_good:
        good.save()
    else:
        good.save(force_insert=True)

    for tag in tag_list:
        re_match = re.search(r"(.*)\((\d+)\)", tag)
        if re_match:
            tag_name = re_match.group(1)
            nums = int(re_match.group(2))

            existed_summary = GoodEvaluateSummary.select().where(GoodEvaluateSummary.good == good, GoodEvaluateSummary.tag == tag_name)
            if existed_summary:
                summary = existed_summary[0]
            else:
                summary = GoodEvaluateSummary(good=good)

            summary.tag = tag_name
            summary.num = nums
            summary.save()
    has_next_page = True
    while has_next_page:
        all_eva = sel.xpath("//div[@class='comment-item']")
        for item in all_eva:
            good_evaluate = GoodEvaluate(good=good)

            evaluate_id = item.xpath("./@data-guid").extract()[0]
            good_evaluate.id = evaluate_id
            user_head_url = "https:{}".format(item.xpath(".//div[@class='user-info']/img/@src").extract()[0])
            user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()

            good_evaluate.user_head_url = user_head_url
            good_evaluate.user_name = user_name

            star = item.xpath("./div[@class='comment-column J-comment-column']/div[1]/@class").extract()[0]
            star = int(star[-1])
            good_evaluate.star = star
            eva_comment = "".join(item.xpath("./div[@class='comment-column J-comment-column']/p["
                                             "@class='comment-con']/text()").extract()[0]).strip()
            good_evaluate.content = eva_comment
            image_list = item.xpath("./div[@class='comment-column J-comment-column']/div[@class='pic-list "
                                    "J-pic-list']/a/img/@src").extract()
            video_list = item.xpath("./div[@class='comment-column J-comment-column']/div[@class='J-video-view-wrap "
                                    "clearfix']//video/@src").extract()
            good_evaluate.image_list = json.dumps(image_list)
            good_evaluate.video_list = json.dumps(video_list)
            comment_num = int(item.xpath(".//div[@class='comment-op']//a[3]/text()").extract()[0])
            praise_num = int(item.xpath(".//div[@class='comment-op']//a[2]/text()").extract()[0])
            good_evaluate.comment_nums = comment_num
            good_evaluate.praised_nums = praise_num
            good_info = item.xpath(".//div[@class='order-info']/span/text()").extract()
            eva_time = good_info[-1]
            good_info = good_info[:-1]
            good_info = json.dumps(good_info)
            eva_time = datetime.strptime(eva_time, "%Y-%m-%d %H:%M")
            good_evaluate.evaluate_time = eva_time
            good_evaluate.good_info = good_info

            existed_good_evaluate = GoodEvaluate.select().where(GoodEvaluate.id == good_evaluate.id)
            if existed_good_evaluate:
                good_evaluate.save()
            else:
                good_evaluate.save(force_insert=True)

        try:
            next_page_ele = browser.find_element_by_xpath("//a[@clstag='shangpin|keycount|product|pinglunfanye-nextpage']")
            next_page_ele.send_keys("\n")
            time.sleep(5)
            sel = Selector(text=browser.page_source)
        except NoSuchElementException as e:
            has_next_page = False


def get_all_pages(start_url):
    """
    Get the url of every result page.
    :param start_url: url of the search result page; usually there are many pages
    :return: a list containing the urls of all result pages
    """
    browser2 = webdriver.Chrome(executable_path="C:/Users/JackyBreak/Downloads/chromedriver_win32/chromedriver.exe",
                                chrome_options=chrome_options)
    browser2.get(start_url)
    sel = Selector(text=browser2.page_source)
    num = sel.xpath("//div[@class='page clearfix']//a/text()").extract()[-3]  # total page count (currently unused)
    pages = []
    for i in range(1, 10):  # test with just the first 9 pages for now
        new_page = "{}&page={}".format(start_url, i)
        pages.append(new_page)
    return pages


def get_good_ids(curr_page_url):
    """
    Get the good id of every product on the current page.
    :param curr_page_url: url of the current page
    :return: none; each good id is submitted to parse_good
    """
    # print("processing page: {}".format(curr_page_url))
    browser3 = webdriver.Chrome(executable_path="C:/Users/JackyBreak/Downloads/chromedriver_win32/chromedriver.exe",
                                chrome_options=chrome_options)
    browser3.get(curr_page_url)
    sel = Selector(text=browser3.page_source)
    all_ids_part1 = sel.xpath("//div[@class='gl-i-wrap j-sku-item']/@data-sku").extract()
    all_ids_part2 = sel.xpath("//div[@class='gl-i-wrap j-sku-item ']/@data-sku").extract()
    all_ids = all_ids_part1 + all_ids_part2
    for curr_id in all_ids:
        print("processing good: https://item.jd.com/{}.html".format(curr_id))
        executor.submit(parse_good, curr_id)


if __name__ == "__main__":
    executor = ThreadPoolExecutor(max_workers=8)
    all_pages = get_all_pages("https://list.jd.com/list.html?cat=9987,653,655")  # returns a list with the urls of all result pages
    for page in all_pages:
        executor.submit(get_good_ids, page)  # submit get_good_ids for each page url




I tested each function individually and they all run fine, but once the thread pool is used the program seems to skip straight over line 201, "executor.submit(parse_good, curr_id)".
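
A plausible cause (my reading, not confirmed in the thread): once the main thread falls off the end of __main__, the interpreter begins shutting the pool down, and the executor.submit(parse_good, curr_id) calls made from inside the worker threads then raise RuntimeError. Since nothing ever checks those futures with result(), the error is swallowed and the program exits cleanly. A minimal sketch of one fix, assuming get_good_ids is changed to return its list of ids instead of submitting from inside the pool:

from concurrent.futures import ThreadPoolExecutor, wait

if __name__ == "__main__":
    executor = ThreadPoolExecutor(max_workers=8)
    all_pages = get_all_pages("https://list.jd.com/list.html?cat=9987,653,655")

    # phase 1: parse every listing page; get_good_ids now returns all_ids
    page_futures = [executor.submit(get_good_ids, page) for page in all_pages]
    wait(page_futures)

    # phase 2: the main thread submits the detail pages itself and blocks
    # until they are all done, so it cannot exit early
    good_futures = [executor.submit(parse_good, good_id)
                    for f in page_futures for good_id in f.result()]
    wait(good_futures)
    executor.shutdown()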

3 Answers

JackyBreak

Original poster

2020-03-26

from peewee import *
from datetime import date

db = MySQLDatabase("spider", host="127.0.0.1", port=3306, user="root", password="1234")


class BaseModel(Model):
    class Meta:
        database = db


class Good(BaseModel):
    id = IntegerField(primary_key=True, verbose_name="商品id")
    name = CharField(max_length=500, verbose_name="商品名称")
    content = TextField(default="", verbose_name="商品描述")
    supplier = CharField(max_length=500, default="")
    ggbz = TextField(default="", verbose_name="规格和包装")
    image_list = TextField(default="", verbose_name="商品的轮播图")
    price = FloatField(default=0.0, verbose_name="商品价格")

    good_rate = IntegerField(default=0, verbose_name="好评率")
    comments_nums = IntegerField(default=0, verbose_name="评论数")
    has_image_comment_nums = IntegerField(default=0, verbose_name="晒图数")
    has_video_comment_nums = IntegerField(default=0, verbose_name="视频晒单数")
    has_add_comment_nums = IntegerField(default=0, verbose_name="追评数")
    well_comment_nums = IntegerField(default=0, verbose_name="好评数")
    middle_comment_nums = IntegerField(default=0, verbose_name="中评数")
    bad_comment_nums = IntegerField(default=0, verbose_name="差评数")


class GoodEvaluate(BaseModel):
    id = CharField(primary_key=True)
    good = ForeignKeyField(Good,verbose_name="商品")
    user_head_url = CharField(verbose_name="用户头像")
    user_name = CharField(verbose_name="用户名")
    good_info = CharField(max_length=500, default="", verbose_name="购买的商品的信息")
    evaluate_time = DateTimeField(verbose_name="评价时间")
    content = TextField(default="", verbose_name="评论内容")
    star = IntegerField(default=0, verbose_name="评分")
    comment_nums = IntegerField(default=0, verbose_name="评论数")
    praised_nums = IntegerField(default=0, verbose_name="点赞数")
    image_list = TextField(default="", verbose_name="图片")
    video_list = TextField(default="", verbose_name="视频")


class GoodEvaluateSummary(BaseModel):
    good = ForeignKeyField(Good, verbose_name="商品")
    tag = CharField(max_length=20, verbose_name="标签")
    num = IntegerField(default=0, verbose_name="数量")


if __name__ == "__main__":
    db.create_tables([Good, GoodEvaluate, GoodEvaluateSummary])
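
If concurrent model queries turn out to be a bottleneck (see bobby's answer below), peewee also ships a connection pool in playhouse. A minimal sketch with the same credentials; max_connections and stale_timeout are illustrative values, not from the original code:

from playhouse.pool import PooledMySQLDatabase

# each thread checks a connection out of the pool instead of opening its own
db = PooledMySQLDatabase("spider", host="127.0.0.1", port=3306,
                         user="root", password="1234",
                         max_connections=8, stale_timeout=300)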



bobby

2020-03-23

Since I don't have your model code here, I deleted everything model-related and ran it again. It was quite fast and opened a lot of Chrome windows. Check whether the model queries are what's making it so slow.

from selenium import webdriver
from scrapy import Selector
import json
import time
import re
from datetime import datetime
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor

good_id_list = []
chrome_options = Options()
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("blink-settings=imagesEnabled=false")


def process_value(num_str):
    """
    extract number from a string
    :param num_str: a string containing a number and other characters
    :return: the number found in the string, or 0 if none
    """
    re_match = re.search(r"(\d+)", num_str)
    if re_match:
        return int(re_match.group(1))
    else:
        return 0


def parse_good(goodid):
    browser = webdriver.Chrome(executable_path="D:/c盘下载/chromedriver/chromedriver.exe",
                               chrome_options=chrome_options)
    browser.get("https://item.jd.com/{}.html".format(goodid))

    sel = Selector(text=browser.page_source)

    name = "".join(sel.xpath("//div[@class='sku-name']/text()").extract()).strip()
    price = "".join(sel.xpath("//span[@class='price J-p-{}']/text()".format(goodid)).extract()).strip()
    detail = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract()).strip()
    good_images = sel.xpath("//div[@id='spec-list']//img/@src").extract()
    supplier_info = "".join(sel.xpath("//div[@id='summary-service']").extract())
    re_match = re.search('<a href="//(.*).jd.com', supplier_info)


    ggbz_btn = browser.find_element_by_xpath("//div[@id='detail']//li[contains(text(),'规格与包装')]")
    ggbz_btn.click()
    time.sleep(3)
    sel = Selector(text=browser.page_source)
    ggbz_detail = sel.xpath("//div[@id='detail']//div[@class='tab-con']/div[@style='display: block;']").extract()

    comment_btn = browser.find_element_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']")
    comment_btn.click()
    time.sleep(5)
    sel = Selector(text=browser.page_source)
    tag_list = sel.xpath("//div[@class='tag-list tag-available']/span/text()").extract()
    good_rate_ratio = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])

    summary_as = sel.xpath("//ul[@class='filter-list']/li[@data-tab='trigger']")
    for summary in summary_as:
        name = summary.xpath("./a/text()").extract()[0]
        num_str = summary.xpath("./@data-num").extract()[0]
        num = process_value(num_str)



    for tag in tag_list:
        re_match = re.search(r"(.*)\((\d+)\)", tag)
        if re_match:
            tag_name = re_match.group(1)
            nums = int(re_match.group(2))  # leftover model saving removed along with the rest of the model code
    has_next_page = True
    while has_next_page:
        all_eva = sel.xpath("//div[@class='comment-item']")
        for item in all_eva:

            evaluate_id = item.xpath("./@data-guid").extract()[0]
            user_head_url = "https:{}".format(item.xpath(".//div[@class='user-info']/img/@src").extract()[0])
            user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()


            star = item.xpath("./div[@class='comment-column J-comment-column']/div[1]/@class").extract()[0]
            star = int(star[-1])
            eva_comment = "".join(item.xpath("./div[@class='comment-column J-comment-column']/p["
                                             "@class='comment-con']/text()").extract()[0]).strip()
            image_list = item.xpath("./div[@class='comment-column J-comment-column']/div[@class='pic-list "
                                    "J-pic-list']/a/img/@src").extract()
            video_list = item.xpath("./div[@class='comment-column J-comment-column']/div[@class='J-video-view-wrap "
                                    "clearfix']//video/@src").extract()
            comment_num = int(item.xpath(".//div[@class='comment-op']//a[3]/text()").extract()[0])
            praise_num = int(item.xpath(".//div[@class='comment-op']//a[2]/text()").extract()[0])
            good_info = item.xpath(".//div[@class='order-info']/span/text()").extract()
            eva_time = good_info[-1]
            good_info = good_info[:-1]
            good_info = json.dumps(good_info)
            eva_time = datetime.strptime(eva_time, "%Y-%m-%d %H:%M")

        try:
            next_page_ele = browser.find_element_by_xpath("//a[@clstag='shangpin|keycount|product|pinglunfanye-nextpage']")
            next_page_ele.send_keys("\n")
            time.sleep(5)
            sel = Selector(text=browser.page_source)
        except NoSuchElementException as e:
            has_next_page = False


def get_all_pages(start_url):
    """
    Get the url of every result page.
    :param start_url: url of the search result page; usually there are many pages
    :return: a list containing the urls of all result pages
    """
    browser2 = webdriver.Chrome(executable_path="D:/c盘下载/chromedriver/chromedriver.exe",
                                chrome_options=chrome_options)
    browser2.get(start_url)
    sel = Selector(text=browser2.page_source)
    num = sel.xpath("//div[@class='page clearfix']//a/text()").extract()[-3]  # total page count (currently unused)
    pages = []
    for i in range(1, 10):  # test with just the first 9 pages for now
        new_page = "{}&page={}".format(start_url, i)
        pages.append(new_page)
    return pages


def get_good_ids(curr_page_url):
    """
    Get the good id of every product on the current page.
    :param curr_page_url: url of the current page
    :return: none; each good id is submitted to parse_good
    """
    # print("processing page: {}".format(curr_page_url))
    browser3 = webdriver.Chrome(executable_path="D:/c盘下载/chromedriver/chromedriver.exe",
                                chrome_options=chrome_options)
    browser3.get(curr_page_url)
    sel = Selector(text=browser3.page_source)
    all_ids_part1 = sel.xpath("//div[@class='gl-i-wrap j-sku-item']/@data-sku").extract()
    all_ids_part2 = sel.xpath("//div[@class='gl-i-wrap j-sku-item ']/@data-sku").extract()
    all_ids = all_ids_part1 + all_ids_part2
    for curr_id in all_ids:
        print("processing good: https://item.jd.com/{}.html".format(curr_id))
        executor.submit(parse_good, curr_id)


if __name__ == "__main__":
    executor = ThreadPoolExecutor(max_workers=8)
    all_pages = get_all_pages("https://list.jd.com/list.html?cat=9987,653,655")  # returns a list with the urls of all result pages
    for page in all_pages:
        executor.submit(get_good_ids, page)  # submit get_good_ids for each page url

    while 1:
        time.sleep(1)  # keep the main thread alive so the pool's worker threads can finish


bobby replied to JackyBreak (2020-03-30):
I noticed that in many places your code opens Chrome but never closes it?
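
Following up on that reply, a minimal sketch (not from the thread) of making sure each driver is released even when parsing raises partway through:

def parse_good(goodid):
    browser = webdriver.Chrome(executable_path="chromedriver.exe",  # path shortened for illustration
                               chrome_options=chrome_options)
    try:
        browser.get("https://item.jd.com/{}.html".format(goodid))
        # ... all of the parsing logic, unchanged ...
    finally:
        browser.quit()  # always terminate the Chrome process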

JackyBreak

Original poster

2020-03-22

After adding the sleep in main, the program does go on to crawl the good details, but it is extremely slow and I don't know why; it still crawls one item at a time, and the program keeps taking up more and more CPU.

if __name__ == "__main__":
    stop = False
    executor = ThreadPoolExecutor(max_workers=15)
    all_pages = get_all_pages("https://list.jd.com/list.html?cat=9987,653,655")  # returns a list with the urls of all result pages
    for page in all_pages:
        executor.submit(get_good_ids, page)  # submit get_good_ids for each page url
    while not stop:  # stop is never set to True, so this loop never exits on its own
        time.sleep(1)
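
Part of the slowness is structural: every parse_good call sleeps 3 to 5 seconds per tab click and again for every comment page, so each good takes a long time no matter how many workers run. As for the open-ended sleep loop, one alternative sketch is to record the nested futures in a lock-protected list so the main thread waits for exactly the submitted work and then exits; good_futures and futures_lock are illustrative names, not from the original code:

import threading
from concurrent.futures import ThreadPoolExecutor, wait

good_futures = []
futures_lock = threading.Lock()

# inside get_good_ids, replace the bare submit with:
#     with futures_lock:
#         good_futures.append(executor.submit(parse_good, curr_id))

if __name__ == "__main__":
    executor = ThreadPoolExecutor(max_workers=15)
    all_pages = get_all_pages("https://list.jd.com/list.html?cat=9987,653,655")
    page_futures = [executor.submit(get_good_ids, page) for page in all_pages]
    wait(page_futures)   # by now every id has been submitted
    wait(good_futures)   # by now every detail page has been crawled
    executor.shutdown()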


