Connection refused when crawling Zhihu through an IP proxy
Source: 8-8 Implementing an IP proxy pool with Scrapy - 3
慕设计9544645
2018-05-28

import base64
import random
import re


class proxyMiddleware(object):
    def process_request(self, request, spider):
        # Pick a proxy whose scheme (http vs https) matches the request URL
        iptype = re.match("https|http", request.url).group()
        proxy_ip = self.get_random_ip(iptype)
        # Credentials for the paid proxy, sent as HTTP Basic auth
        proxy_user_pass = "18520864631:150145lqfli"
        encoded_user_pass = base64.b64encode(proxy_user_pass.encode(encoding='utf-8')).decode()
        request.meta["proxy"] = proxy_ip
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

    def process_response(self, request, response, spider):
        # Handle the response: on a non-200 status, switch to another
        # proxy and return the request so Scrapy schedules a retry
        if response.status != 200:
            iptype = re.match("https|http", request.url).group()
            proxy_ip = self.get_random_ip(iptype)
            request.meta["proxy"] = proxy_ip
            return request
        return response

    def get_random_ip(self, type):
        # Read the comma-separated proxy list and split it by scheme
        with open("proxy_ip.txt", "r") as rf:
            ip_str = rf.read()
        ip_list = ip_str.split(',')
        ip_list_http = []
        ip_list_https = []
        for ip in ip_list:
            if ip.startswith("http://"):
                ip_list_http.append(ip)
            elif ip.startswith("https://"):
                ip_list_https.append(ip)
        if type == "http":
            return random.choice(ip_list_http).strip()
        else:
            return random.choice(ip_list_https).strip()
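
For the middleware to run at all, it also has to be enabled in settings.py. A minimal sketch, assuming the class lives in a module named middlewares.py inside a project named ArticleSpider (both names and the priority 543 are assumptions; adjust them to your project):

DOWNLOADER_MIDDLEWARES = {
    # Module path and priority are assumptions, not from the question
    'ArticleSpider.middlewares.proxyMiddleware': 543,
}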
1 Answer
bobby
2018-06-01
Does this happen for all subsequent requests, or only for one particular URL?
2018-06-04
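
One way to tell those two cases apart is to log every non-200 response together with the proxy that served it. A minimal sketch of a drop-in replacement for the process_response method above (the logging calls are an addition, not part of the original middleware):

import logging
import re

logger = logging.getLogger(__name__)

def process_response(self, request, response, spider):
    if response.status != 200:
        # Record which URL failed and through which proxy, so you can
        # see whether every request fails or only certain URLs do
        logger.warning("status %s for %s via proxy %s",
                       response.status, request.url,
                       request.meta.get("proxy"))
        iptype = re.match("https|http", request.url).group()
        request.meta["proxy"] = self.get_random_ip(iptype)
        return request
    return response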