报错:list index out of range
来源:13-10 sorted 排序
oldbang
2018-03-08
老师,我抓取的是“全部直播”这个页面。由于推荐位的关系,前面几个条目抓取到的人数是空列表“[]”。
下面是代码
import re
from urllib import request
# 断点调试
class Spider():
    """Scrape the Douyu "all live rooms" directory and print a popularity ranking.

    The pipeline, driven by :meth:`go`, is: download the page, cut it into
    per-room snippets, extract name / viewer count from each snippet, drop
    snippets that are missing either field, sort by viewer count (descending)
    and print the result.
    """

    # Page that is downloaded and parsed.
    url = 'https://www.douyu.com/directory/all'
    # One room card: everything between the "mes" div and the next </p>.
    root = r'<div class="mes">([\s\S]*?)</p>'
    # Streamer name inside a room card.
    root_name = r'<span class="dy-name ellipsis fl">([\s\S]*?)</span>'
    # Viewer count text inside a room card (e.g. "9000" or "1.5万").
    root_number = r'<span class="dy-num fr">([\w\W]*?)</span>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return its body decoded as UTF-8 text."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                          'Gecko/20100101 Firefox/23.0',
        }
        req = request.Request(url=Spider.url, headers=headers)
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(req) as response:
            raw = response.read()
        return raw.decode('utf-8')

    def __analysis(self, htmls):
        """Split the page into room cards and pull the raw fields out of each.

        Returns a list of ``{'name': [...], 'number': [...]}`` dicts, one per
        card. Either list may be empty when the card does not contain the
        corresponding ``<span>``.
        """
        anchors = []
        for html in re.findall(Spider.root, htmls):
            anchors.append({
                'name': re.findall(Spider.root_name, html),
                'number': re.findall(Spider.root_number, html),
            })
        return anchors

    def __refine(self, anchors):
        """Flatten the raw match lists into plain strings.

        Cards for which the name or the viewer count was not found are
        skipped: indexing ``[0]`` on their empty match list is what raised
        ``IndexError: list index out of range``.

        Returns a lazy iterator of ``{'name': str, 'number': str}`` dicts.
        """
        complete = (
            anchor for anchor in anchors
            if anchor['name'] and anchor['number']
        )
        # strip(): the lazy captures may include whitespace from the markup.
        return map(
            lambda anchor: {
                'name': anchor['name'][0].strip(),
                'number': anchor['number'][0].strip(),
            },
            complete,
        )

    def __sort(self, anchors):
        """Return the anchors ordered by viewer count, highest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert an anchor's viewer-count text into a float sort key.

        The first decimal number in the text is used; when the text contains
        '万' (ten thousand) the value is multiplied by 10000. Text without any
        digits yields 0.0 so that sorting never raises on malformed entries.
        """
        text = anchor['number']
        # search() rather than findall(r'\d*')[0]: the latter returns '' when
        # the text does not start with a digit, and it drops the fraction.
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        number = float(match.group())
        if '万' in text:
            number *= 10000
        return number

    def __sort_show(self, anchors):
        """Print one ``name..........number`` line per anchor."""
        for anchor in anchors:
            print(anchor['name'] + '..........' + anchor['number'])

    def go(self):
        """Run the whole pipeline: fetch, parse, refine, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__sort_show(anchors)
# Script entry: build the crawler and run the fetch -> parse -> sort -> print pipeline.
spider = Spider()
spider.go()
1回答
-
7七月
2018-03-08
为什么抓的是斗鱼呢?不同的网站对于HTML的解析技巧肯定是不同的,这个需要具体的问题具体分析。
032018-03-10
相似问题