老师,为什么使用优化后的代码时,输出的一号店数据中,店铺名只显示一个字?单独运行一号店的爬虫爬取就是正常的。
来源:2-13 代码优化

慕慕8105830
2019-04-07
def yhd(self):
    """Scrape book offers from Yhd (一号店) search results.

    Fetches the search page for ``self.sn`` (ISBN / keyword), parses every
    result ``<div>`` with XPath, and appends one ``BootEntity`` per complete
    item to ``self.book_list``.

    Fix vs. the original: each ``xpath(...)`` call returns a *list* that may
    be empty (ad/promo rows, self-operated items without a store node), so
    blindly indexing ``[0]`` could raise ``IndexError`` or pick up the wrong
    value — items missing a core field are now skipped, and a missing store
    gets an explicit fallback.
    """
    url = 'https://search.yhd.com/c0-0/k{0}/'.format(self.sn)
    # Raw HTML of the search-result page.
    html_doc = requests.get(url).text
    # Parse into an lxml element tree for XPath queries.
    selector = html.fromstring(html_doc)
    # One <div> per listed item.
    ul_list = selector.xpath('//div[@id="itemSearchList"]/div')
    print(len(ul_list))
    for li in ul_list:
        # Each query returns a (possibly empty) list of attribute strings.
        title = li.xpath('div/p[@class="proName clearfix"]/a/@title')
        price = li.xpath('div//p[@class="proPrice"]/em/@yhdprice')
        link = li.xpath('div/p[@class="proName clearfix"]/a/@href')
        # NOTE(review): 'searh_shop_storeName' looks misspelled, but it must
        # match the site's real markup — confirm against the live page.
        store = li.xpath('div/p[@class="searh_shop_storeName storeName limit_width"]/a/@title')
        if not (title and price and link):
            # Incomplete row (usually an ad block) — skip instead of crashing.
            continue
        book = BootEntity(
            title=title[0],
            price=price[0],
            link=link[0],
            # Self-operated items have no store node; use a fallback label.
            store=store[0] if store else '一号店自营'
        )
        print(book)
        self.book_list.append(book)
    print('----------------------------------------------')
    print('一号店数据爬取完毕')
    print('----------------------------------------------')
def spider(self):
    """Run each site scraper, then print every collected offer.

    Offers are printed most-expensive first (descending numeric price).
    The JD scraper is currently disabled.
    """
    # Collect data from every enabled site.
    self.dangdang()
    self.yhd()
    self.taobao()
    print('--------------------------------')
    # Order descending on the numeric price before printing.
    ranked = sorted(self.book_list, reverse=True, key=lambda entry: float(entry.price))
    for offer in ranked:
        print(offer)
写回答
4回答
-
贴一下你的代码,老师帮你看看
022019-04-20 -
David璐_senior_DBA
2019-04-27
# -*- coding:utf-8 -*-
# @Time : 2019/4/26 22:47
# @Author : David
# @Email : iamkuboy@163.com
# @File : spider_oop.py
# @Software: PyCharm
import json
import re
from typing import NamedTuple

import requests
from lxml import html

import taobao_cookie


class BookEntity(NamedTuple):
    """One book offer scraped from a supported site."""
    title: str
    price: float
    link: str
    store: str
    website: str

    def __str__(self):
        return '价格:{self.price}\n\t名称:{self.title}\n\t链接:{self.link}\n\t店铺:{self.store}\n\t网站:{self.website}'.format(self=self)


class MySpider(object):
    """Price-comparison spider: queries several book sites by ISBN."""

    def __init__(self, isbn):
        self.isbn = isbn
        # Accumulates BookEntity records from every scraper.
        self.book_list = []

    def dd(self):
        """Scrape search results from dangdang.com."""
        url = 'http://search.dangdang.com/?key={isbn}'.format(isbn=self.isbn)
        html_data = requests.get(url).text
        selector = html.fromstring(html_data)
        ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
        for li in ul_list:
            _title = li.xpath('a/@title')[0]
            _price = li.xpath('p[@class="price"]/span[@class="search_now_price"]/text()')[0].replace('¥', '')
            _link = li.xpath('a/@href')[0]
            store1 = li.xpath('p[@class="search_shangjia"]/a/text()')
            if not store1:
                # No seller node means the item is sold by Dangdang itself.
                store2 = '当当自营'
                _store = store2
            else:
                _store = store1[0]
            book = BookEntity(
                title=_title,
                price=_price,
                link=_link,
                store=_store,
                website='当当网',
            )
            self.book_list.append(book)

    def jd(self):
        """Scrape search results from jd.com (needs a browser user-agent)."""
        url = 'https://search.jd.com/Search?keyword={0}'.format(self.isbn)
        resp = requests.get(url, headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWeb'
                          'Kit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 '
                          'Safari/537.36 Core/1.63.6776.400 QQBrowser/10.3.2601.400'
        })
        resp.encoding = 'utf-8'
        html_doc = resp.text
        selector = html.fromstring(html_doc)
        ul_list = selector.xpath('//div[@id="J_goodsList"]/ul/li')
        for li in ul_list:
            _title = li.xpath('div/div[@class="p-name"]/a/@title')[0]
            _price = li.xpath('div/div[@class="p-price"]/strong/i/text()')[0]
            _link = li.xpath('div/div[@class="p-name"]/a/@href')[0].replace('//', 'http://')
            # Third-party shop title vs. JD self-operated badge text.
            store1 = li.xpath('div/div[@class="p-shopnum"]/a[@class="curr-shop"]/@title')
            store2 = li.xpath('div/div[@class="p-icons"]/i[@class="goods-icons J-picon-tips J-picon-fix"]/text()')
            if store2:
                _store = store2[0]
            else:
                _store = store1[0]
            book = BookEntity(
                title=_title,
                price=_price,
                link=_link,
                store=_store,
                website='京东网',
            )
            self.book_list.append(book)

    def yhd(self):
        """Scrape search results from yhd.com."""
        url = 'https://search.yhd.com/c0-0/k{0}/'.format(self.isbn)
        html_doc = requests.get(url).text
        selector = html.fromstring(html_doc)
        ul_list = selector.xpath('//div[@id="itemSearchList"]/div')
        for li in ul_list:
            # Title lives in one of two layouts depending on the item type.
            title1 = li.xpath('div/p/a[@class="mainTitle"]/text()')
            if not title1:
                title2 = li.xpath('div//p/a[@pmid="0"]/text()')
                _title = title2[1].strip()
            else:
                _title = title1[1].strip()
            link1 = li.xpath('div/p/a[@class="mainTitle"]/@href')
            if not link1:
                link2 = li.xpath('div//p/a[@pmid="0"]/@href')
                _link = link2[0].replace('//', 'http://')
            else:
                _link = link1[0].replace('//', 'http://')
            _price = li.xpath('div/p/em[@class="num"]/text()')[1].strip()
            _store = li.xpath('div/p/a/span[@class="shop_text"]/text()')[0]
            book = BookEntity(
                title=_title,
                price=_price,
                link=_link,
                store=_store,
                website='1号店'
            )
            self.book_list.append(book)

    def tb(self):
        """Scrape search results from taobao.com (requires a login cookie)."""
        url = 'https://s.taobao.com/search?q={0}'.format(self.isbn)
        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103\
Safari/537.36',
                   'cookie': taobao_cookie.cookie
                   }
        html_doc = requests.get(url, headers=headers).text
        # Item data is embedded as a JSON object assigned to g_page_config.
        r = re.compile(r'g_page_config = (\{.+\});\s*', re.M)
        rest = r.search(html_doc).group(1)
        if rest:
            data = json.loads(rest)
            json_list = data['mods']['itemlist']['data']['auctions']
            for book in json_list:
                _title = book["raw_title"]
                _price = book["view_price"]
                _link = book["detail_url"].replace('//', 'http://')
                _store = book["nick"]
                book = BookEntity(
                    title=_title,
                    price=_price,
                    link=_link,
                    store=_store,
                    website='淘宝网',
                )
                self.book_list.append(book)

    def spider(self):
        """Run all four scrapers and print offers ordered by price (desc)."""
        self.dd()
        self.jd()
        self.yhd()
        self.tb()
        print('--------------------------------')
        bk_list = sorted(self.book_list, key=lambda item: float(item.price), reverse=True)
        for book in bk_list:
            print(book)
        print('========================')
        # Cheapest offer is the last element of the descending sort.
        print('最低价图书为:\n', bk_list[-1])
        print('图书价格对比完成,感谢使用:)')


if __name__ == '__main__':
    isbn = input("请输入您要查找的图书ISBN(如果不输入则为示例书籍):")
    if not isbn:
        isbn = 9787115478818
        print('正在对比图书:"笨办法学Python3",请稍等~~')
    client = MySpider(isbn)
    client.spider()
00 -
慕慕8105830
提问者
2019-04-20
后来自己解决了,去掉了"store[0]"的"[0]"就显示正常了(看症状应该是此时 store 已经是字符串而不是列表,对字符串再取 [0] 只会得到第一个字符)。
012019-04-20 -
NavCat
2019-04-11
你爬取一号店的数据是正常的噢,你再仔细确认下看是不是一号店的数据有问题。或者你在github上建个仓库,把你的代码传上去,告诉我仓库地址,我要拿到全部代码才好帮你看。
00
相似问题