bs4练习
工具
pycharm
目的
'''
网址:http://ip.yqie.com/ipproxy.htm,原url不能用,更换url为:http://www.66ip.cn/index.html
用bs4来做一个简单的爬虫,爬取某个ip网址里的免费ip,获取每个ip的代理IP地址、端口、服务器地址、是否匿名、类型、存活时间(注:更换网址后,下面的代码实际提取的字段为:ip、端口、地址、类型、验证时间)
'''
代码
import requests
from bs4 import BeautifulSoup
import json
# Scrape the proxy table from the first 10 listing pages of www.66ip.cn and
# dump every row as a JSON object into ip_agents.json.

# Collected proxy records: one dict per table row, across all scraped pages.
ip_agents = []

# The request headers are identical for every page, so build them once.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}

# Output key -> CSS selector (relative to a table row) of the cell holding it.
field_selectors = (
    ('ip', 'td:first-child'),
    ('port', 'td:nth-child(2)'),
    ('address', 'td:nth-child(3)'),
    ('type', 'td:nth-child(4)'),
    ('verify_time', 'td:last-child'),
)

for i in range(1, 11):  # scrape the first 10 pages
    # Page 1 is served as index.html; the other pages are named by number.
    url_str = 'index' if i == 1 else str(i)
    url = f'http://www.66ip.cn/{url_str}.html'

    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    if not response.ok:
        # Do not parse an HTTP error page as if it were a proxy listing;
        # skip it and keep whatever the other pages provide.
        print(f'skip {url}: HTTP {response.status_code}')
        continue
    response.encoding = 'gb2312'

    soup = BeautifulSoup(response.text, 'lxml')
    # The first matched row is dropped (presumably the table header).
    ip_content = soup.select('div[class="containerbox boxindex"] table tr')[1:]

    for ip_tr in ip_content:
        ip_agent = {}
        for key, selector in field_selectors:
            cell = ip_tr.select_one(selector)
            if cell is None:
                # Row lacks an expected cell: drop the row instead of
                # aborting the whole run with an IndexError.
                break
            ip_agent[key] = cell.getText()
        else:
            # Only rows where every field was found are recorded.
            ip_agents.append(ip_agent)

# ensure_ascii=False keeps non-ASCII text readable in the output file.
with open('ip_agents.json', 'w', encoding='utf-8') as f:
    json.dump(ip_agents, f, ensure_ascii=False, indent=2)
运行结果
见资源