The requested cities come back incomplete:

Source: 2-3 Adding proxy logic to hide the crawler; using multiprocessing to speed up scraping

天才在左_我在右

2019-11-29

What could be causing this problem? I'm using a proxy, I'm using a User-Agent, and I can fetch the full list of city names, but the job postings for some cities just won't scrape!
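A minimal sketch for verifying that the proxy actually carries the traffic, assuming proxy_ip() from the course's Lagou.setting module returns a requests-style proxies dict (e.g. {"https": "https://host:port"}); httpbin.org/ip echoes back the IP it sees:

import requests
from Lagou.setting import proxy_ip

proxy = proxy_ip()
# The printed origin IP should be the proxy's address, not your own
resp = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=6)
print(resp.json())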


1 Answer

天才在左_我在右

Original poster

2019-11-29

# coding=gbk
import json
import multiprocessing
import time
import pandas as pd
import requests
import re
from Lagou.setting import proxy_ip, User_agent


class Handle_lagou(object):

    # Initializer
    def __init__(self):
        # A session keeps Lagou's anti-crawler cookies across requests
        self.lagou_session = requests.session()
        # Default request header; User_agent() comes from the course's Lagou.setting module
        self.header = {
            'User-Agent': User_agent()
        }
        # Will hold the list of city names (filled in by handle_city)
        self.city_list = []

    # Fetch the names of all cities that have listings
    def handle_city(self):
        # Regex that extracts city names from the allCity page
        city_search = re.compile(r'www\.lagou\.com/.*/">(.*?)</a>')
        # Page listing every city
        city_url = "https://www.lagou.com/jobs/allCity.html"
        # Request it through the shared helper
        city_result = self.handle_requests(method="GET", url=city_url)
        # Pull the city names out of the response
        self.city_list = city_search.findall(city_result)
        # Drop the cookies set by the city-list request
        self.lagou_session.cookies.clear()

    def handle_city_job(self, city):
        # Send a GET first to obtain the cookies the later POST requires
        first_url = 'https://www.lagou.com/jobs/list_python?&px=default&city={}'.format(city)
        first_response = self.handle_requests(method="GET", url=first_url)
        # Regex for the total number of result pages
        total_page_search = re.compile(r'class="span\stotalNum">(\d+)</span>')
        # If the city has no postings, the regex matches nothing and .group() raises
        try:
            page_num = total_page_search.search(first_response).group(1)
            print(city, page_num + " pages")
        except AttributeError:
            return
        else:
            # Walk every result page
            for page in range(1, int(page_num) + 1):
                # Form data for the POST
                data = {
                    "pn": page,
                    "kd": "python"
                }
                # Ajax endpoint that returns the postings as JSON
                page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false" % city
                # Referer the endpoint expects
                referer_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % city
                # Encode the Referer URL to bytes (kept from the course code)
                self.header['Referer'] = referer_url.encode()
                response = self.handle_requests(method="POST", url=page_url, data=data, info=city)
                lagou_data = json.loads(response)
                job_list = lagou_data['content']['positionResult']['result']
                # Append this page's postings; note mode='a' repeats the header row on every write
                pd.DataFrame(job_list).to_csv('job.csv', mode='a', index=False)
    # Shared request helper. method: HTTP verb, url: target,
    # data: POST form data, info: city name used when cookies must be rebuilt
    def handle_requests(self, method, url, data=None, info=None):
        while True:
            try:
                # Fetch a fresh proxy for every attempt
                proxy = proxy_ip()
                if method == "GET":
                    # Note: the GET branch does not pass proxies, so these requests
                    # leave from the real IP (see the note after the code)
                    response = self.lagou_session.get(url=url, headers=self.header, timeout=6)
                elif method == "POST":
                    response = self.lagou_session.post(url=url, headers=self.header, data=data, proxies=proxy, timeout=6)
                response.encoding = "utf-8"
            except requests.RequestException:
                # A dead proxy or timeout: retry instead of reading an unbound response
                print("request failed, retrying")
                continue
            # Lagou serves a page containing '频繁' ("too frequent") when it blocks a crawler
            if '频繁' in response.text:
                print("blocked: requests too frequent")
                # Clear the blocked cookies
                self.lagou_session.cookies.clear()
                # Re-acquire cookies with a fresh GET to the city's listing page
                first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info
                self.handle_requests(method="GET", url=first_request_url)
                time.sleep(10)
                continue
            return response.text


if __name__ == '__main__':
    lagou = Handle_lagou()
    lagou.handle_city()
    # Scrape the cities in parallel with two worker processes
    pool = multiprocessing.Pool(2)
    for city in lagou.city_list:
        pool.apply_async(lagou.handle_city_job, args=(city,))
    pool.close()
    pool.join()
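One plausible cause of the missing cities, judging from the code above: only the POST branch passes proxies=proxy, so every GET, including the ones that fetch Lagou's anti-crawler cookies, leaves from the real IP, which Lagou can throttle per city. Below is a drop-in sketch of handle_requests with the proxy applied to both verbs, under the assumption that proxy_ip() returns a requests-style proxies dict; it is untested against the live site:

    def handle_requests(self, method, url, data=None, info=None):
        while True:
            proxy = proxy_ip()  # rotate to a fresh proxy on every attempt
            try:
                if method == "GET":
                    # The proxy now covers the cookie-fetching GETs as well
                    response = self.lagou_session.get(url=url, headers=self.header, proxies=proxy, timeout=6)
                else:
                    response = self.lagou_session.post(url=url, headers=self.header, data=data, proxies=proxy, timeout=6)
                response.encoding = "utf-8"
            except requests.RequestException:
                continue  # dead proxy or timeout: retry with another one
            if '频繁' in response.text:  # Lagou's "too frequent" block page
                self.lagou_session.cookies.clear()
                if info:
                    # Rebuild the cookies before retrying the blocked request
                    self.handle_requests(method="GET", url="https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info)
                time.sleep(10)
                continue
            return response.text

Also note that exceptions raised inside pool.apply_async workers are swallowed unless you keep each AsyncResult and call .get() on it, so per-city failures can vanish silently; collecting and checking the results is a quick way to see which cities actually fail.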


