The requested cities are incomplete:
Source: 2-3 Adding proxy logic to hide the crawler; using multiprocessing to speed up scraping

天才在左_我在右
2019-11-29
What could be causing this kind of problem? I'm using a proxy and a User_Agent, and I can fetch the full list of city names, but the scraping still fails!
1 Answer
天才在左_我在右
Original poster
2019-11-29
# coding=gbk
import json
import multiprocessing
import re
import time

import pandas as pd
import requests

from Lagou.setting import proxy_ip, User_agent


class Handle_lagou(object):
    def __init__(self):
        # Session that keeps the cookies between requests
        self.lagou_session = requests.session()
        # Initial request headers with a rotating User-Agent
        self.header = {
            'User-Agent': User_agent()
        }
        # Holds the city information
        self.lagou_city = ""

    # Fetch the names of all cities
    def handle_city(self):
        # Regex that extracts the city names from the page
        city_search = re.compile(r'www\.lagou\.com\/.*\/">(.*?)</a>')
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_requests(method="GET", url=city_url)
        self.city_list = city_search.findall(city_result)
        # Clear the cookies picked up while fetching the city list
        self.lagou_session.cookies.clear()

    def handle_city_job(self, city):
        # Send a GET request first to obtain the cookies the POST request needs
        first_url = 'https://www.lagou.com/jobs/list_python?&px=default&city={}'.format(city)
        first_response = self.handle_requests(method="GET", url=first_url)
        # Extract the number of result pages for this city
        total_page_search = re.compile(r'class="span\stotalNum">(\d+)</span>')
        try:
            page_num = total_page_search.search(first_response).group(1)
            print(city, page_num + " pages")
        except AttributeError:
            # No job listings for this city
            return
        for page in range(1, int(page_num) + 1):
            # Form data for the POST request
            data = {
                "pn": page,
                "kd": "python"
            }
            # URL of the Ajax endpoint
            page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false" % city
            # Referer of the Ajax request; the Referer URL needs to be encoded
            referer_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % city
            self.header['Referer'] = referer_url.encode()
            response = self.handle_requests(method="POST", url=page_url, data=data, info=city)
            lagou_data = json.loads(response)
            job_list = lagou_data['content']['positionResult']['result']
            pd.DataFrame(job_list).to_csv('job.csv', mode='a', index=False)

    # Generic request method. method: HTTP verb, url: request route,
    # data: form data, info: city name used when refreshing the cookies
    def handle_requests(self, method, url, data=None, info=None, cookies=None):
        while True:
            try:
                proxy = proxy_ip()
                if method == "GET":
                    # Route GET requests through the proxy as well; unproxied GETs
                    # get rate-limited, which is likely why some cities came back empty
                    response = self.lagou_session.get(url=url, headers=self.header,
                                                      proxies=proxy, timeout=6)
                elif method == "POST":
                    response = self.lagou_session.post(url=url, headers=self.header,
                                                       data=data, proxies=proxy, timeout=6)
                response.encoding = "utf-8"
            except requests.exceptions.RequestException:
                print("\n\nerror during request\n\n")
                # Retry with a fresh proxy instead of falling through
                # with an undefined response
                continue
            if '频繁' in response.text:
                # Lagou answered with its "requests too frequent" block page:
                # clear the flagged cookies and fetch new ones before retrying
                print("blocked: requests too frequent")
                self.lagou_session.cookies.clear()
                first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info
                self.handle_requests(method="GET", url=first_request_url)
                time.sleep(10)
                continue
            return response.text


if __name__ == '__main__':
    lagou = Handle_lagou()
    lagou.handle_city()
    # Scrape the cities in parallel with two worker processes
    pool = multiprocessing.Pool(2)
    for city in lagou.city_list:
        pool.apply_async(lagou.handle_city_job, args=(city,))
    pool.close()
    pool.join()
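The code imports proxy_ip and User_agent from Lagou.setting, which the post doesn't show. Judging from how they are used (proxies=proxy and 'User-Agent': User_agent()), proxy_ip() should return a proxies dict in the format requests expects and User_agent() a User-Agent string. A minimal sketch, with the proxy address and the UA list as placeholders you'd replace with your own pool:

# Lagou/setting.py -- hypothetical sketch, not the course's actual file
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    ' (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
]

def User_agent():
    # Pick a random User-Agent so consecutive requests don't share one fingerprint
    return random.choice(USER_AGENTS)

def proxy_ip():
    # Return a requests-style proxies dict; the address is a placeholder
    proxy = 'http://127.0.0.1:8888'
    return {'http': proxy, 'https': proxy}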