数据来源:从阿里云 SLS 日志导出的 CSV。处理后只保留域名和访问 IP,先筛选出访问量高的域名及其访问 IP,最后查询域名的 ICP 备案信息(根据备案地址来封禁访问 IP)。
第一版:功能的集合
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import time
import json
def serch(first, num):
    """Query the apihz.cn API for a host's ICP record or an IP's province.

    Args:
        first: ``'host'`` to look up ``pd_host.index[num]`` on the ICP
            endpoint, or ``'ip'`` to look up ``pd_ip.index[num]`` on the IP
            endpoint.
        num: Positional index into the module-level ``pd_host`` / ``pd_ip``
            Series.

    Returns:
        The ``'icp'`` field (host lookup) or the ``'sheng'`` field (IP lookup)
        of the decoded JSON response.

    Raises:
        ValueError: If ``first`` is neither ``'host'`` nor ``'ip'``.
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    # NOTE(review): the API id/key are embedded in source; consider moving
    # them to configuration.
    params = {'id': '10006660', 'key': 'd92a9a568fe83ffcd539d5764579d739'}
    if first == 'host':
        endpoint = 'wangzhan/icp.php'
        params['domain'] = pd_host.index[num]
        hostip = 'icp'
    elif first == 'ip':
        endpoint = 'ip/chaapi.php'
        params['ip'] = pd_ip.index[num]
        hostip = 'sheng'
    else:
        # Previously fell through and failed later on an unbound local.
        raise ValueError("first must be 'host' or 'ip', got %r" % (first,))
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'}
    url = 'https://cn.apihz.cn/api/%s' % endpoint
    # Pause before every request (presumably to respect the API rate limit).
    time.sleep(10)
    # `params` lets requests URL-encode the looked-up value; the timeout keeps
    # a stalled connection from hanging the whole run.
    res = requests.get(url=url, params=params, headers=header, timeout=30)
    res.raise_for_status()
    # The body is run through the lxml HTML parser and the text of the first
    # <p> element is decoded as JSON (kept exactly as the original did).
    soup = BeautifulSoup(res.text, 'lxml')
    mes_js = json.loads(soup.p.text)
    return mes_js[hostip]
# Version-1 driver: load the exported access log, pick the busiest hosts, and
# print each host's ICP record followed by the province of its top client IPs.
pd1 = pd.read_csv('test01.csv')
# Keep only the columns used below.
pd_c1 = pd1[['host', 'remote_addr', 'status']]
# Number of requests per host.
pd_c1_host = pd_c1['host'].value_counts()
# Selection threshold: integer average of requests per distinct host; only
# hosts above it are examined further.
std_host = pd_c1['host'].count() // pd_c1_host.count()
pd_host = pd_c1_host[pd_c1_host > std_host]
for num_host, host in enumerate(pd_host.index):
    pd_c2 = pd_c1[pd_c1['host'] == host]
    icp = serch('host', num_host)
    print('%s:%s' % (host, icp))
    # serch('ip', ...) reads the module-level pd_ip, so it is rebound per host.
    pd_ip = pd_c2['remote_addr'].value_counts()
    pd_ip = pd_ip[:10]
    # Iterate over the IPs actually present: a host with fewer than 10
    # distinct client IPs used to raise IndexError with a fixed range(10).
    for num_ip, addr in enumerate(pd_ip.index):
        ip = serch('ip', num_ip)
        print('%s:%s' % (addr, ip))
    print('\n')
    time.sleep(10)
第二版:功能函数化(方便调用、调试),并且减少 ICP 接口的查询调用
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import json
class js_Icp:
    """JSON-file store for lookup results, persisted in ``ICPjson.txt``."""

    def writer(self, d_icp):
        """Overwrite ``ICPjson.txt`` with *d_icp* serialized as JSON.

        The mapping is also kept on the instance as ``self.d_icp``.
        """
        self.d_icp = d_icp
        # Context manager guarantees the data is flushed and the handle closed.
        with open('ICPjson.txt', mode='w') as f:
            json.dump(self.d_icp, f)

    def reader(self):
        """Return the object stored in ``ICPjson.txt``.

        Returns an empty dict when the file does not exist yet, so a first
        run starts with an empty cache instead of crashing.
        """
        try:
            with open('ICPjson.txt', mode='r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {}
def serch(first, num):
    """Query the apihz.cn API for a host's ICP record or an IP's province.

    Args:
        first: ``'host'`` to look up ``std_pd_host.index[num]`` on the ICP
            endpoint, or ``'ip'`` to look up ``pd_ip.index[num]`` on the IP
            endpoint.
        num: Positional index into the module-level ``std_pd_host`` /
            ``pd_ip`` Series.

    Returns:
        The ``'icp'`` field (host lookup) or the ``'sheng'`` field (IP lookup)
        of the decoded JSON response.

    Raises:
        ValueError: If ``first`` is neither ``'host'`` nor ``'ip'``.
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    # NOTE(review): the API id/key are embedded in source; consider moving
    # them to configuration.
    params = {'id': '10006660', 'key': 'd92a9a568fe83ffcd539d5764579d739'}
    if first == 'host':
        endpoint = 'wangzhan/icp.php'
        params['domain'] = std_pd_host.index[num]
        hostip = 'icp'
    elif first == 'ip':
        endpoint = 'ip/chaapi.php'
        params['ip'] = pd_ip.index[num]
        hostip = 'sheng'
    else:
        # Previously fell through and failed later on an unbound local.
        raise ValueError("first must be 'host' or 'ip', got %r" % (first,))
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'}
    url = 'https://cn.apihz.cn/api/%s' % endpoint
    # `params` lets requests URL-encode the looked-up value; the timeout keeps
    # a stalled connection from hanging the whole run.
    res = requests.get(url=url, params=params, headers=header, timeout=30)
    # Pause after every request (presumably to respect the API rate limit).
    time.sleep(7)
    res.raise_for_status()
    # The body is run through the lxml HTML parser and the text of the first
    # <p> element is decoded as JSON (kept exactly as the original did).
    soup = BeautifulSoup(res.text, 'lxml')
    mes_js = json.loads(soup.p.text)
    return mes_js[hostip]
def host_csv(csv_name):
    """Load the access-log CSV and select the hosts to inspect.

    Reads *csv_name*, keeps the ``host``, ``remote_addr`` and ``status``
    columns (published as the global ``pd_c1``), counts requests per host,
    keeps the hosts whose count exceeds the integer average per distinct
    host, and drops hosts whose name starts with ``'urm'``.

    Returns:
        The filtered per-host request counts, also stored in the global
        ``std_pd_host``.
    """
    global std_pd_host, pd_c1
    raw_log = pd.read_csv(csv_name, low_memory=False)
    pd_c1 = raw_log[['host', 'remote_addr', 'status']]
    hits_per_host = pd_c1['host'].value_counts()
    # Threshold = total host entries // number of distinct hosts.
    threshold = pd_c1['host'].count() // hits_per_host.count()
    busy_hosts = hits_per_host[hits_per_host > threshold]
    has_urm_prefix = busy_hosts.index.str.startswith('urm')
    std_pd_host = busy_hosts[~has_urm_prefix]
    return std_pd_host
def ip_csv(num_host):
    """Return the request counts of the top client IPs for one host.

    Selects the rows of the global ``pd_c1`` whose ``host`` equals
    ``std_pd_host.index[num_host]``, counts requests per ``remote_addr`` and
    keeps at most the first 10 entries. The result is also stored in the
    global ``pd_ip``.
    """
    global pd_ip
    target_host = std_pd_host.index[num_host]
    host_rows = pd_c1[pd_c1['host'] == target_host]
    pd_ip = host_rows['remote_addr'].value_counts().head(10)
    return pd_ip
def analysis(csv_name):
    """Print the ICP record of each selected host and its top IPs' provinces.

    For every host returned by ``host_csv(csv_name)`` the ICP value is taken
    from the ``ICPjson.txt`` cache when present; otherwise it is fetched with
    ``serch('host', ...)`` and written back to the cache. Then the province
    of each of the host's top client IPs (from ``ip_csv``) is fetched and
    printed.

    Args:
        csv_name: Path of the access-log CSV to analyse.
    """
    icps = js_Icp()
    pd_host = host_csv(csv_name)
    for num, domain in enumerate(pd_host.index):
        jsicptext = icps.reader()
        if domain not in jsicptext:
            # Cache miss: query the API once and persist the answer.
            icp = serch('host', num)
            jsicptext[domain] = icp
            icps.writer(jsicptext)
        else:
            icp = jsicptext.get(domain)
        print('%s:%s' % (domain, icp))
        ips = ip_csv(num)
        # Iterate over the IPs actually present: a host with fewer than 10
        # distinct client IPs used to raise IndexError with a fixed range(10).
        for ipnum, addr in enumerate(ips.index):
            ip = serch('ip', ipnum)
            print('%s:%s' % (addr, ip))
        print('\n')
        time.sleep(7)
if __name__ == '__main__':
    # Script entry point: analyse the exported log at this fixed path.
    csv_path = 'C:/Users/Administrator/Desktop/77710c73-d476-4a29-95fe-58080140212f.csv'
    analysis(csv_path)
第三版:将境外 IP 全部输出,方便封禁
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import json
# Province values treated as domestic by analysis(); an IP whose looked-up
# province is not in this list is collected for output. None is included, so
# a lookup that yields None is treated like a listed province.
china_provinces = [
    # Municipalities
    "北京市", "天津市", "上海市", "重庆市",
    # Provinces
    "河北省", "山西省", "辽宁省", "吉林省", "黑龙江省", "江苏省",
    "浙江省", "安徽省", "福建省", "江西省", "山东省", "河南省",
    "湖北省", "湖南省", "广东省", "海南省", "四川省", "贵州省",
    "云南省", "陕西省", "甘肃省", "青海省",
    # Autonomous regions
    "内蒙古自治区", "广西壮族自治区", "西藏自治区",
    "宁夏回族自治区", "新疆维吾尔自治区",
    None,
]
class js_Icp:
    """JSON-file store for lookup results, persisted in ``ICPjson.txt``."""

    def writer(self, d_icp):
        """Overwrite ``ICPjson.txt`` with *d_icp* serialized as JSON.

        The mapping is also kept on the instance as ``self.d_icp``.
        """
        self.d_icp = d_icp
        # Context manager guarantees the data is flushed and the handle closed.
        with open('ICPjson.txt', mode='w') as f:
            json.dump(self.d_icp, f)

    def reader(self):
        """Return the object stored in ``ICPjson.txt``.

        Returns an empty dict when the file does not exist yet, so a first
        run starts with an empty cache instead of crashing.
        """
        try:
            with open('ICPjson.txt', mode='r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {}
def serch(first, num):
    """Query the apihz.cn API for a host's ICP record or an IP's province.

    Args:
        first: ``'host'`` to look up ``std_pd_host.index[num]`` on the ICP
            endpoint, or ``'ip'`` to look up ``pd_ip.index[num]`` on the IP
            endpoint.
        num: Positional index into the module-level ``std_pd_host`` /
            ``pd_ip`` Series.

    Returns:
        The ``'icp'`` field (host lookup) or the ``'sheng'`` field (IP lookup)
        of the decoded JSON response.

    Raises:
        ValueError: If ``first`` is neither ``'host'`` nor ``'ip'``.
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    # NOTE(review): the API id/key are embedded in source; consider moving
    # them to configuration.
    params = {'id': '10006660', 'key': 'd92a9a568fe83ffcd539d5764579d739'}
    if first == 'host':
        endpoint = 'wangzhan/icp.php'
        params['domain'] = std_pd_host.index[num]
        hostip = 'icp'
    elif first == 'ip':
        endpoint = 'ip/chaapi.php'
        params['ip'] = pd_ip.index[num]
        hostip = 'sheng'
    else:
        # Previously fell through and failed later on an unbound local.
        raise ValueError("first must be 'host' or 'ip', got %r" % (first,))
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'}
    url = 'https://cn.apihz.cn/api/%s' % endpoint
    # `params` lets requests URL-encode the looked-up value; the timeout keeps
    # a stalled connection from hanging the whole run.
    res = requests.get(url=url, params=params, headers=header, timeout=30)
    # Pause after every request (presumably to respect the API rate limit).
    time.sleep(7)
    res.raise_for_status()
    # The body is run through the lxml HTML parser and the text of the first
    # <p> element is decoded as JSON (kept exactly as the original did).
    soup = BeautifulSoup(res.text, 'lxml')
    mes_js = json.loads(soup.p.text)
    return mes_js[hostip]
def host_csv(csv_name):
    """Read the log CSV and return the request counts of the busiest hosts.

    Side effects: rebinds the globals ``pd_c1`` (the ``host`` /
    ``remote_addr`` / ``status`` columns of the CSV) and ``std_pd_host``
    (the returned Series).

    A host is kept when its request count is greater than the integer
    average of requests per distinct host and its name does not start with
    ``'urm'``.
    """
    global std_pd_host, pd_c1
    wanted_columns = ['host', 'remote_addr', 'status']
    pd_c1 = pd.read_csv(csv_name, low_memory=False)[wanted_columns]
    host_column = pd_c1['host']
    counts = host_column.value_counts()
    average = host_column.count() // counts.count()
    above_average = counts[counts > average]
    keep_mask = ~above_average.index.str.startswith('urm')
    std_pd_host = above_average[keep_mask]
    return std_pd_host
def ip_csv(num_host):
    """Count requests per client IP for the host at position *num_host*.

    Uses the globals ``pd_c1`` and ``std_pd_host``. The per-``remote_addr``
    counts are limited to the first 10 entries, stored in the global
    ``pd_ip`` and returned.
    """
    global pd_ip
    is_selected_host = pd_c1['host'] == std_pd_host.index[num_host]
    addr_counts = pd_c1[is_selected_host]['remote_addr'].value_counts()
    pd_ip = addr_counts.head(10)
    return pd_ip
def analysis(csv_name):
    """Print ICP / province info and collect IPs outside ``china_provinces``.

    For every host returned by ``host_csv(csv_name)`` the ICP value is taken
    from the ``ICPjson.txt`` cache when present; otherwise it is fetched with
    ``serch('host', ...)`` and written back to the cache. Then the province
    of each of the host's top client IPs (from ``ip_csv``) is fetched and
    printed, and every IP whose province is not in ``china_provinces`` is
    collected.

    Args:
        csv_name: Path of the access-log CSV to analyse.

    Returns:
        A string of the collected IPs, each followed by a single space
        (empty string when none were collected).
    """
    flagged = []
    icps = js_Icp()
    pd_host = host_csv(csv_name)
    for num, domain in enumerate(pd_host.index):
        jsicptext = icps.reader()
        if domain not in jsicptext:
            # Cache miss: query the API once and persist the answer.
            icp = serch('host', num)
            jsicptext[domain] = icp
            icps.writer(jsicptext)
        else:
            icp = jsicptext.get(domain)
        print('%s:%s' % (domain, icp))
        ips = ip_csv(num)
        # ip_csv already caps the Series at 10 entries, so the former
        # `len(pd_ip) > 10` branch could never run; one loop over the IPs
        # actually present covers both cases.
        for ipnum, addr in enumerate(ips.index):
            ip = serch('ip', ipnum)
            if ip not in china_provinces:
                flagged.append(addr)
            # NOTE(review): the source's indentation was lost; the per-IP
            # print is assumed to be unconditional - confirm.
            print('%s:%s' % (addr, ip))
        time.sleep(7)
    # Same format as before: every address is followed by one space.
    return ''.join(addr + ' ' for addr in flagged)
if __name__ == '__main__':
    # Script entry point: analyse the exported log at this fixed path and
    # print the collected IP string returned by analysis().
    csv_path = 'C:/Users/Administrator/Desktop/b19de193-69d2-4629-a0eb-77253c057064.csv'
    test1 = analysis(csv_path)
    print(test1)