bs4练习

工具

PyCharm

目的

'''
原网址:http://ip.yqie.com/ipproxy.htm(该 url 已不能用,更换为:http://www.66ip.cn/index.html)
用 bs4 来做一个简单的爬虫,爬取某个 ip 网址里的免费 ip,获取每个 ip 的代理IP地址、端口、服务器地址、是否匿名、类型、存活时间
注:更换网址后,下面的代码实际保存的字段为:ip、端口(port)、地址(address)、类型(type)、验证时间(verify_time)
'''

代码

import requests
from bs4 import BeautifulSoup
import json

# Scrape the first 10 listing pages of www.66ip.cn and dump every proxy row
# found to ip_agents.json as a list of dicts with the keys
# ip / port / address / type / verify_time.

# Accumulates one dict per table row across all pages.
ip_agents = []

# Loop-invariant request settings, built once instead of once per page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
# Seconds to wait for the server; without a timeout requests.get can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10

for i in range(1, 11):  # fetch the first 10 pages
    # Page 1 lives at index.html; every later page at <page number>.html.
    url_str = 'index' if i == 1 else str(i)
    url = f'http://www.66ip.cn/{url_str}.html'

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Turn 4xx/5xx answers into exceptions so an error page is never
        # parsed as if it were proxy data.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: one unreachable page must not discard the rows that
        # were already collected from the other pages.
        print(f'Skipping {url}: {exc}')
        continue

    # Decode the body as gb2312 explicitly instead of relying on the
    # encoding requests guesses from the response headers.
    response.encoding = 'gb2312'
    soup = BeautifulSoup(response.text, 'lxml')
    # Drop the first <tr> of the table (presumably the header row).
    ip_content = soup.select('div[class="containerbox boxindex"] table tr')[1:]

    for ip_tr in ip_content:
        cells = ip_tr.find_all('td', recursive=False)
        # Rows with fewer than four cells cannot supply every field; skip
        # them rather than crash with an IndexError.
        if len(cells) < 4:
            continue
        ip_agents.append({
            'ip': cells[0].getText(),
            'port': cells[1].getText(),
            'address': cells[2].getText(),
            'type': cells[3].getText(),
            # The verification time is taken from the last cell of the row.
            'verify_time': cells[-1].getText(),
        })

# ensure_ascii=False keeps non-ASCII text (e.g. Chinese location names)
# readable in the output file instead of \uXXXX escapes.
with open('ip_agents.json', 'w', encoding='utf-8') as f:
    json.dump(ip_agents, f, ensure_ascii=False, indent=2)

运行结果

见资源