ip不可用

来源:8-7 scrapy实现ip代理池 - 2

天上掉下个小馅饼

2019-02-19

老师,为什么我爬取到的ip都是不可用的?而且爬取了一会儿之后就不再继续爬取了。

import requests
from scrapy.selector import Selector
import MySQLdb


# Shared MySQL connection and cursor; every function below reads and writes
# the proxy_ip table through these two module-level objects.
conn = MySQLdb.connect(
    host="localhost",
    user="root",
    passwd="root",
    db="article_spider",
    charset="utf8",
)
cursor = conn.cursor()

# Browser-like headers sent with the proxy validation request in GetIP.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
    ),
}

def crawl_ips():
    """Scrape the xicidaili free-proxy listing and store it in MySQL.

    Requests listing pages 1 through 3598 of
    ``https://www.xicidaili.com/nn/``, reads ip, port, proxy type and speed
    from every data row of the ``#ip_list`` table, and inserts them into the
    ``proxy_ip`` table through the module-level ``cursor`` / ``conn``.

    Rows without a parsable speed, or with fewer than six text cells, are
    skipped instead of raising or reusing the previous row's speed.

    Raises:
        requests.RequestException: if a page cannot be fetched within the
            timeout, so a stalled crawl fails loudly instead of hanging.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    }
    # Placeholders let the driver quote scraped text; formatting it into the
    # statement would break on quotes and allow SQL injection.
    insert_sql = (
        "insert proxy_ip(ip, port, speed, proxy_type) "
        "VALUES(%s, %s, %s, %s)"
    )

    for page in range(1, 3599):
        response = requests.get(
            "https://www.xicidaili.com/nn/{0}".format(page),
            headers=request_headers,
            timeout=10,  # without a timeout a dead connection blocks forever
        )
        selector = Selector(text=response.text)

        rows = []
        # The first <tr> is the table header, so start from the second one.
        for tr in selector.css("#ip_list tr")[1:]:
            speed_str = tr.css(".bar::attr(title)").extract_first()
            if not speed_str:
                continue
            try:
                speed = float(speed_str.split("秒")[0])
            except ValueError:
                continue

            all_texts = tr.css("td::text").extract()
            if len(all_texts) < 6:
                continue

            # Tuple order matches the column order of insert_sql.
            rows.append((all_texts[0], all_texts[1], speed, all_texts[5]))

        if rows:
            cursor.executemany(insert_sql, rows)
            conn.commit()


class GetIP(object):
    """Pick working proxies out of the ``proxy_ip`` table.

    All database access goes through the module-level ``cursor`` / ``conn``;
    validation requests are sent with the module-level ``headers``.
    """

    def delete_ip(self, ip):
        """Delete every ``proxy_ip`` row whose ip equals ``ip``; return True."""
        # Parameterized so the value is quoted by the driver, not by format().
        cursor.execute("delete from proxy_ip where ip=%s", (ip,))
        conn.commit()
        return True

    @staticmethod
    def _proxy_settings(ip, port, proxy_type):
        """Return ``(test_url, proxies)`` for a proxy, or None if unsupported.

        ``proxy_type`` is matched case-insensitively against "http" and
        "https". The test URL uses the same scheme as the proxy type, because
        requests picks the proxy by the scheme of the URL being fetched; an
        "http" mapping is never consulted for an https:// URL.
        """
        scheme = (proxy_type or "").strip().lower()
        if not ip or scheme not in ("http", "https"):
            return None
        # The proxy itself is addressed over plain HTTP (HTTPS targets are
        # tunnelled through it); an "https://" proxy URL would make urllib3
        # attempt a TLS handshake with the proxy server.
        proxy_address = "http://{0}:{1}".format(ip, port)
        test_url = "{0}://www.baidu.com".format(scheme)
        return test_url, {scheme: proxy_address}

    def judge_ip(self, ip, port, proxy_type):
        """Check whether a proxy can fetch the test page.

        Returns True when a request through the proxy yields a 2xx status.
        Otherwise the ip is removed from the database via ``delete_ip`` and
        False is returned. Proxy types other than HTTP/HTTPS are treated as
        invalid.
        """
        settings = self._proxy_settings(ip, port, proxy_type)
        if settings is None:
            print("invalid ip and port")
            self.delete_ip(ip)
            return False

        test_url, proxy_dict = settings
        try:
            response = requests.get(
                test_url,
                proxies=proxy_dict,
                headers=headers,
                timeout=10,  # dead proxies would otherwise block indefinitely
            )
        except requests.RequestException:
            print("invalid ip and port")
            self.delete_ip(ip)
            return False

        if 200 <= response.status_code < 300:
            print("effective ip")
            return True

        print("invalid ip and port")
        self.delete_ip(ip)
        return False

    def get_random_ip(self):
        """Return a validated proxy URL such as ``"http://1.2.3.4:80"``.

        Random rows are drawn from ``proxy_ip`` until one passes
        ``judge_ip``. Every failing row is deleted by ``judge_ip``, so the
        loop ends. Returns None when the table holds no rows.
        """
        random_sql = """
            SELECT ip, port, proxy_type FROM proxy_ip
            ORDER BY RAND()
            LIMIT 1
        """
        # A loop instead of recursion: a long run of dead proxies (or an
        # empty table) must not end in RecursionError.
        while True:
            cursor.execute(random_sql)
            row = cursor.fetchone()
            if row is None:
                return None

            ip, port, proxy_type = row
            if self.judge_ip(ip, port, proxy_type):
                return "{0}://{1}:{2}".format(
                    proxy_type.strip().lower(), ip, port
                )

# Module-level driver: create a GetIP helper and print the proxy URL returned
# by get_random_ip(). These statements run whenever the module is executed or
# imported, because they are not wrapped in a __main__ guard.
get_ip = GetIP()
print(get_ip.get_random_ip())

# Uncomment to scrape the listing site and fill the proxy_ip table first.
# crawl_ips()
写回答

1回答

bobby

2019-02-21

西刺的免费ip因为用的人太多,所以很多都不稳定,而且其中不少可能已经被封了。你可以尝试使用收费代理。

0
1
天上掉下个小馅饼
非常感谢!好的,谢谢老师
2019-02-21
共1条回复

Scrapy打造搜索引擎 畅销4年的Python分布式爬虫课

带你彻底掌握Scrapy,用Django+Elasticsearch搭建搜索引擎

5796 学习 · 6290 问题

查看课程