The code runs without errors, but there is no data in the database?
Source: 14-15 Fetching personal profile details - 2

燚燚生辉
2019-07-20
"""
抓取
解析
存储
"""
import re
import ast
import requests
import time
from urllib import parse  # JS-loaded paths are relative; parse.urljoin() joins them with the domain to build a directly requestable URL
from scrapy import Selector
from datetime import datetime
from selenium import webdriver
from csdn_spider.Models import *

domain = "https://www.csdn.net/"  # site domain
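# For reference, a minimal illustration of how parse.urljoin() is used below
# (the example paths are hypothetical, not taken from left_menu.js):
#   parse.urljoin("https://www.csdn.net/", "/forums/ios")        -> "https://www.csdn.net/forums/ios"
#   parse.urljoin("https://www.csdn.net/", "forums/ios/closed")  -> "https://www.csdn.net/forums/ios/closed"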

def get_nodes_json():  # fetch the node JSON that contains all forum URLs
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    nodes_str_search = re.search("forumNodes: (.*])", left_menu_text)  # greedy match
    if nodes_str_search:
        nodes_str = nodes_str_search.group(1).replace("null", "None")  # "null" is not a Python literal and would make literal_eval throw, so replace it with "None"
        nodes_list = ast.literal_eval(nodes_str)  # convert nodes_str into a list
        return nodes_list
    return []
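
# A minimal sketch of the null -> None / literal_eval step above, on hypothetical data
# (the real left_menu.js payload is larger):
#   ast.literal_eval('[{"url": "/forums/ios", "children": null}]'.replace("null", "None"))
#   -> [{'url': '/forums/ios', 'children': None}]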

url_list = []

def process_nodes_list(nodes_list):  # collect every URL
    # pull the "url" fields out of the JSON structure into url_list
    for item in nodes_list:
        if "url" in item:
            if item["url"]:
                url_list.append(item["url"])
            if "children" in item:
                process_nodes_list(item["children"])

def get_level1_url(nodes_list):  # collect only the first-level URLs
    level1_url = []
    for item in nodes_list:
        if "url" in item and item["url"]:
            level1_url.append(item["url"])
    return level1_url

def get_last_urls():
    # build the final list of URLs to crawl
    nodes_list = get_nodes_json()
    process_nodes_list(nodes_list)
    level1_url = get_level1_url(nodes_list)
    last_urls = []
    for url in url_list:
        if url not in level1_url:
            last_urls.append(url)
    all_urls = []
    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))                 # unsolved
        all_urls.append(parse.urljoin(domain, url + "/recommend"))  # recommended / digest
        all_urls.append(parse.urljoin(domain, url + "/closed"))     # solved
    return all_urls

def parse_topic(url):
    # fetch a topic's detail page and its replies
    topic_id = int(url.split("/")[-1])
    res_text = requests.get(url, cookies=cookie_dict).text
    sel = Selector(text=res_text)
    all_divs = sel.xpath("//div[starts-with(@id, 'post-')]")
    topic_item = all_divs[0]
    content = topic_item.xpath(".//div[@class='post_body post_body_min_h']/text()").extract()[0]
    praised_nums = int(topic_item.xpath(".//label[@class='red_praise digg']//em/text()").extract()[0])
    jtl_str = topic_item.xpath("//div[@class='close_topic']/text()").extract()[0]
    jtl = 0  # 结帖率 (close rate)
    jtl_match = re.search(r"(\d)%", jtl_str)
    if jtl_match:
        jtl = jtl_match.group(1)
    existed_topic = Topic.select().where(Topic.id == topic_id)
    if existed_topic:
        topic = existed_topic[0]
        topic.jtl = jtl
        topic.content = content
        topic.praised_nums = praised_nums
        topic.save()
    for answer_item in all_divs[1:]:
        answer = Answer()
        answer.topic_id = topic_id
        author_info = answer_item.xpath(".//div[@class='nick_name']/a[1]/@href").extract()[0]
        author_id = author_info.split("/")[-1]
        create_time_str = answer_item.xpath(".//label[@class='date_time']/text()").extract()[0]
        create_time = datetime.strptime(create_time_str, "%Y-%m-%d %H:%M:%S")
        answer.author = author_id
        answer.create_time = create_time
        content = answer_item.xpath(".//div[@class='post_body post_body_min_h']/text()").extract()[0]
        answer.content = content
        praised_nums = int(answer_item.xpath(".//label[@class='red_praise digg']//em/text()").extract()[0])
        answer.parised_nums = praised_nums
        answer.save()
    next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
    if next_page:
        next_page_url = parse.urljoin(domain, next_page[0])
        parse_topic(next_page_url)

def parse_author(url):
    # fetch a user's profile details
    author_id = url.split("/")[-1]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }
    res_text = requests.get(url, cookies=cookie_dict, headers=headers).text  # headers belong to the request, not to Selector
    sel = Selector(text=res_text)
    author = Author()
    author.id = author_id
    all_li_strs = sel.xpath("//ul[@class='me_chanel_list clearfix']//li/text()").extract()
    click_nums = all_li_strs[0]
    original_nums = all_li_strs[1]
    forward_nums = int(all_li_strs[2])
    rate = int(all_li_strs[3])
    answer_nums = int(all_li_strs[4])
    parised_nums = int(all_li_strs[5])
    author.click_nums = click_nums
    author.original_nums = original_nums
    author.forward_nums = forward_nums
    author.rate = rate
    author.answer_nums = answer_nums
    author.parised_nums = parised_nums
    desc = sel.xpath("//dd[@class='user_desc']/text()").extract()
    if desc:
        author.desc = desc[0].strip()
    person_b = sel.xpath("//dd[@class='person_b']/ul/li")
    for item in person_b:
        item_text = "".join(item.extract())
        if "csdnc-m-add" in item_text:
            location = item.xpath(".//span/text()").extract()[0].strip()
            author.location = location
        else:
            industry = item.xpath(".//span/text()").extract()[0].strip()
            author.industry = industry
    name = sel.xpath("//h4[@class='username']/text()").extract()[0]
    author.name = name.strip()
    existed_author = Author.select().where(Author.id == author_id)
    if existed_author:
        author.save()
    else:
        author.save(force_insert=True)

def parse_list(url):
    # parse a forum list page
    res_text = requests.get(url, cookies=cookie_dict).text
    sel = Selector(text=res_text)
    all_trs = sel.xpath("//table[@class='forums_tab_table']//tr")[2:]
    for tr in all_trs:
        status = tr.xpath(".//td[1]/span/text()").extract()[0]
        score = tr.xpath(".//td[2]/em/text()").extract()[0]
        topic_url = parse.urljoin(domain, tr.xpath(".//td[3]/a/@href").extract()[0])
        topic_id = topic_url.split("/")[-1]
        topic_title = tr.xpath(".//td[3]/a/text()").extract()[0]
        author_url = parse.urljoin(domain, tr.xpath(".//td[4]/a/@href").extract()[0])
        author_id = author_url.split("/")[-1]
        create_time_str = tr.xpath(".//td[4]/em/text()").extract()[0]
        create_time = datetime.strptime(create_time_str, "%Y-%m-%d %H:%M")
        answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]
        answer_nums = answer_info.split("/")[0]
        click_nums = answer_info.split("/")[1]
        last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]
        last_time = datetime.strptime(last_time_str, "%Y-%m-%d %H:%M")
        topic = Topic()
        try:
            topic.id = int(topic_id)
        except:
            continue
        topic.title = topic_title
        topic.author = author_id
        topic.create_time = create_time
        topic.answer_nums = int(answer_nums)
        topic.click_nums = int(click_nums)
        topic.score = int(score)
        topic.status = status
        topic.last_answer_time = last_time
        existed_topic = Topic.select().where(Topic.id == topic.id)
        if existed_topic:
            topic.save()
        else:
            topic.save(force_insert=True)
        parse_topic(topic_url)    # crawl the topic's detail page, not the list URL
        parse_author(author_url)  # crawl the poster's profile page, not the list URL
    next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
    if next_page:
        # note: writing ...extract()[0] directly would raise IndexError when there is no next page,
        # so check the list first and only then take the first element
        next_page_url = parse.urljoin(domain, next_page[0])
        parse_list(next_page_url)
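
# A note on the save() / save(force_insert=True) pattern used in parse_list() and parse_author():
# with peewee, assigning the primary key by hand (topic.id, author.id) makes save() issue an
# UPDATE by default, so brand-new rows only reach the database via save(force_insert=True).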

if __name__ == "__main__":
    # use a real browser to pick up the cookies CSDN sets (anti-crawling measure)
    browser = webdriver.Chrome(executable_path=r"E:\python\Chrome_Drive_win32\chromedriver.exe")
    browser.get("https://www.csdn.net/")
    time.sleep(5)
    cookies = browser.get_cookies()
    cookie_dict = {}
    for item in cookies:
        cookie_dict[item["name"]] = item["value"]
    last_urls = get_last_urls()
    for url in last_urls:
        parse_list(url)
# Run output
E:\Python\python.exe E:/python/项目/csdn_spider/spider.py
Process finished with exit code 0
2 Answers
微酸袅袅_
2019-08-29
Same question here, I'm in exactly the same situation.
bobby
2019-07-21
You could try setting a breakpoint and see whether execution ever reaches it.
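For example, a minimal check along those lines (just a sketch against the script above, using print() instead of a debugger): print how many URLs get_last_urls() returns before the crawl starts; if it prints 0, the crawl loop never runs and nothing will be written to the database.

    last_urls = get_last_urls()
    print(len(last_urls))  # 0 here means get_nodes_json() returned nothing (request or regex failed)
    for url in last_urls:
        print(url)
        parse_list(url)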