目标:根据各个字段数据的分布(例如 srcIP 和 dstIP 的 top 10)以及其他特征,对样本进行交互式标注,最终将每个域名的样本标注为 black/white/white-like/ddos/mddos/cdn/unknown 中的一类。
效果示意:
-------------choose one--------------
sub domain: DNSQueryName(N)
ip: srcip(S) or dstip(D)
length: DNSRequestLength(R1) or DNSReplyLength(R2)
length too: DNSRequestErrLength(R3) or DNSReplyErrLength(R4)
port: sourcePort(P1) or destPort(P2) or DNSReplyTTL(T)
code: DNSReplyCode(C2) or DNSRequestRRType(C1)
other: DNSRRClass(RR) or DNSReplyIPv4(V)
-------------label or quit------------
black(B) or white(W) or cdn(CDN) or ddos(DDOS) or mddos(M) or unknown(U) or white-like(L)
next(Q) or exit(E)?
***************************************
domain: workgroup. flow count: 206
***************************************
------------srcip-----------------
count 206
unique 9
top 162.105.129.122
freq 150
Name: sourceIP, dtype: object
--------------destip---------------
count 206
unique 12
top 199.7.83.42
freq 82
Name: destIP, dtype: object
代码:
import sys
import json
import os
import pandas as pd
import tldextract
# import numpy as np
# Field table of the metadata export, one "<column number> = <field name>"
# entry per line. Column numbers are 1-based here; they are converted to
# 0-based positions below because parse_metadata() selects them with iloc.
medata_field = '''
3 = sourceIP
4 = destIP
5 = sourcePort
6 = destPort
7 = protocol
12 = flowStartSeconds
13 = flowEndSecond
54 = DNSReplyCode
55 = DNSQueryName
56 = DNSRequestRRType
57 = DNSRRClass
58 = DNSDelay
59 = DNSReplyTTL
60 = DNSReplyIPv4
61 = DNSReplyIPv6
62 = DNSReplyRRType
77 = DNSReplyName
81 = payload
88 = DNSRequestLength
89 = DNSRequestErrLength
90 = DNSReplyLength
91 = DNSReplyErrLength
'''

# Parallel lists: 0-based column positions and the matching column names.
medata_field_num = []
medata_field_info = []

# Blank lines (the literal starts and ends with one) are skipped.
_field_rows = (_row.split(" = ") for _row in medata_field.split("\n") if _row)
for _column, _field_name in _field_rows:
    medata_field_num.append(int(_column) - 1)
    medata_field_info.append(_field_name)

print(medata_field_num)
print(medata_field_info)
def extract_domain(domain):
    """Reduce a DNS query name to the key used for grouping and labeling.

    The name is split with ``tldextract.extract``; the result is
    ``"<domain>.<suffix>"``. When tldextract reports an empty domain part the
    bare suffix is returned instead. When the suffix is empty the joined
    string keeps its trailing dot (e.g. ``"workgroup."``); main() uses this
    string as history key and in output file names, so the format is left
    unchanged.

    Args:
        domain: the query name to reduce.

    Returns:
        The grouping key, or ``"unknown"`` if extraction fails for any reason
        (the error is printed, not raised).
    """
    try:
        ext = tldextract.extract(domain)
        if ext.domain == "":
            return ext.suffix
        # Read the named fields instead of slicing ext[1:]: slicing only works
        # while the result is a tuple of exactly (subdomain, domain, suffix),
        # and a failure here would be swallowed below and turn every name
        # into "unknown".
        return ".".join((ext.domain, ext.suffix))
    except Exception as e:
        # Best effort: one bad value must not abort the whole file.
        print("extract_domain error: %s" % (e,))
        return "unknown"
def parse_metadata(path):
    """Load one metadata file and group its records by extracted domain.

    The file is read as "^"-separated text without a header row. Only the
    columns listed in ``medata_field_num`` are kept and renamed to
    ``medata_field_info``; an extra ``mdomain`` column holds
    ``extract_domain()`` of each record's ``DNSQueryName``.

    Args:
        path: path of the metadata file to read.

    Returns:
        A pandas GroupBy over the selected columns, keyed by ``mdomain``.
    """
    raw_frame = pd.read_csv(path, sep="^", header=None)

    # Work on a copy so adding a column does not touch the frame it came from.
    selected = raw_frame.iloc[:, medata_field_num].copy()
    selected.columns = medata_field_info

    selected["mdomain"] = [extract_domain(name) for name in selected["DNSQueryName"]]
    return selected.groupby("mdomain")
def get_data_dist(df, col="sourceIP"):
    """Print how often each distinct value of *col* occurs in *df*.

    Output, in order: the type of the counts object, the full per-value
    counts, a separator line, and the ten most frequent values.

    Args:
        df: DataFrame to summarize.
        col: name of the column to count by.
    """
    counts = df.groupby(col).size()
    print(type(counts))
    print(counts)
    print("-----------top 10-------------")
    print(counts.nlargest(10))
def get_ipv4_dist(df, col="DNSReplyLength"):
    """Print the distribution of ``DNSReplyIPv4`` over rows where *col* > 0.

    Output, in order: the row counts before and after filtering, the
    per-address counts of the remaining rows, a separator line, and the ten
    most frequent addresses.

    Args:
        df: DataFrame holding at least *col* and ``DNSReplyIPv4``.
        col: column whose value must be greater than zero for a row to count.
    """
    kept = df[df[col] > 0]
    print("filter before length: %s filter after length: %s" % (len(df), len(kept)))

    counts = kept.groupby(by="DNSReplyIPv4").size()
    print(counts)
    print("-----------top 10-------------")
    print(counts.nlargest(10))
def move_to(srcpath, domain, dst_path):
    """Copy the records of *srcpath* that belong to *domain* into *dst_path*.

    Despite the name nothing is removed from the source file: matching lines
    are written to *dst_path*, which is created or overwritten.

    A line matches when ``extract_domain()`` of its 55th "^"-separated field
    (DNSQueryName in the field table) equals *domain*, ignoring case. The
    comparison is case-insensitive because main() lower-cases the domain
    before calling this function, while the name extracted from the raw line
    may still be mixed case; an exact comparison would drop those records.

    Args:
        srcpath: metadata file to read.
        domain: grouping key to select, as produced by extract_domain().
        dst_path: file that receives the matching lines unchanged.
    """
    query_name_index = 55 - 1
    wanted = domain.lower()
    with open(dst_path, "w") as w:
        with open(srcpath) as r:
            for line in r:
                fields = line.split("^")
                # Short or truncated lines have no query-name field; skip them
                # instead of aborting the whole labeling run with IndexError.
                if len(fields) <= query_name_index:
                    continue
                if extract_domain(fields[query_name_index]).lower() == wanted:
                    w.write(line)
# Root directory holding the sampled input files and the labeled output folders.
_SAMPLE_ROOT = "/home/bonelee/latest_metadata_sample"

# File that remembers, across runs, the label last given to each domain.
_HISTORY_FILE = "history_op.json"

# Label code typed by the user -> output folder under _SAMPLE_ROOT.
_LABEL_DIRS = {
    "B": "labeled_black",
    "W": "labeled_white",
    "L": "labeled_white_like",
    "CDN": "labeled_cdn",
    "DDOS": "labeled_ddos",
    "M": "labeled_mddos",
    "U": "labeled_unknown",
}

# Inspection code typed by the user -> column whose distribution is printed.
_DIST_COLUMNS = {
    "R1": "DNSRequestLength",
    "R2": "DNSReplyLength",
    "R3": "DNSRequestErrLength",
    "R4": "DNSReplyErrLength",
    "P1": "sourcePort",
    "P2": "destPort",
    "T": "DNSReplyTTL",
    "C2": "DNSReplyCode",
    "C1": "DNSRequestRRType",
    "RR": "DNSRRClass",
    "V": "DNSReplyIPv4",
    "S": "sourceIP",
    "D": "destIP",
    "N": "DNSQueryName",
}

# Prompt function: raw_input on Python 2, input where raw_input does not exist.
try:
    _read_input = raw_input
except NameError:
    _read_input = input


def _save_history(history_op):
    """Write the domain -> label history to _HISTORY_FILE and report it."""
    with open(_HISTORY_FILE, "w") as f:
        json.dump(history_op, f)
    print("saved history_op.json")


def _print_menu():
    """Print the list of inspection, label and navigation commands."""
    print("-------------choose one--------------")
    print("sub domain: DNSQueryName(N)")
    print("ip: srcip(S) or dstip(D)")
    print("length: DNSRequestLength(R1) or DNSReplyLength(R2)")
    print("length too: DNSRequestErrLength(R3) or DNSReplyErrLength(R4)")
    print("port: sourcePort(P1) or destPort(P2) or DNSReplyTTL(T)")
    print("code: DNSReplyCode(C2) or DNSRequestRRType(C1)")
    print("other: DNSRRClass(RR) or DNSReplyIPv4(V)")
    print("-------------label or quit------------")
    print("black(B) or white(W) or cdn(CDN) or ddos(DDOS) or mddos(M) or unknown(U) or white-like(L)")
    print("next(Q) or exit(E)?")


def _auto_label(domain):
    """Return the label applied without asking the user, or None.

    NOTE(review): the second and third rules are plain substring tests, so
    e.g. any name containing "lan" or "home" is labeled DDOS automatically.
    Kept as in the original heuristic; confirm this breadth is intended.
    """
    if domain.endswith(("win", "site", "vip")):
        return "U"
    if any(word in domain for word in ("lan", "local", "dhcp", "workgroup", "home")):
        return "DDOS"
    if "cdn" in domain:
        return "CDN"
    return None


def _print_group_summary(domain, group):
    """Print the flow count and the IP overview shown before the menu."""
    print("***************************************")
    print("domain: %s flow count: %s" % (domain, len(group)))
    print("***************************************")
    print("------------srcip-----------------")
    print(group["sourceIP"].describe())
    print("--------------destip---------------")
    print(group["destIP"].describe())
    print("----------------------------------------")
    print("ipv4 address return dist:")
    get_ipv4_dist(group)
    print("----------------------------------------")


def _label_domain(path, day, hour, domain, group, history_op):
    """Run the interactive loop for one domain until it is labeled or skipped.

    Returns when the user moves on ("Q"), or right after saving a label that
    came from an automatic rule or an accepted history entry. A label typed by
    hand is saved but the loop continues, so it can still be inspected or
    relabeled. "E" leaves the program through sys.exit(); main() saves the
    history on the way out.
    """
    domain = domain.lower()
    has_judged = False  # the history suggestion is offered only once
    need_break = False  # once set, the next saved label ends the loop
    while True:
        _print_menu()
        check = _auto_label(domain)
        if check is not None:
            need_break = True
        else:
            if domain in history_op and not has_judged:
                print("found history op: %s" % (history_op[domain],))
                # An empty answer (just Enter) accepts the remembered label.
                if not _read_input("OK(Enter for Y)?"):
                    check = history_op[domain]
                    need_break = True
                else:
                    check = _read_input("Input:")
            else:
                check = _read_input("Input:")
            has_judged = True

        if check == "Q":
            print("%s next OK!" % path)
            return
        if check == "E":
            print("%s Exit!" % path)
            sys.exit()
        if check in _LABEL_DIRS:
            # NOTE(review): inputs are named 2017-09-* but outputs 2017-8-*;
            # the output name is kept unchanged - confirm the intended month.
            dst_path = "%s/%s/2017-8-%d-%d-%s.txt" % (
                _SAMPLE_ROOT, _LABEL_DIRS[check], day, hour, domain)
            move_to(path, domain, dst_path)
            history_op[domain] = check
            print("Saved OK!")
            if need_break:
                return
        elif check in _DIST_COLUMNS:
            get_data_dist(group, _DIST_COLUMNS[check])
        else:
            print("unknown input!Choose the following one:")


def main():
    """Interactively label every domain found in the sampled metadata files.

    Walks the hourly files of 2017-09-24 through 2017-09-26, skipping files
    that are missing, empty or unreadable. For each domain group it prints a
    summary and lets the user inspect distributions and pick a label; the
    matching records are then copied into the label's folder.

    The domain -> label history is loaded from history_op.json at start and
    written back whenever this function is left: after the last file, on the
    "E" command, and also when an error interrupts the run. Previously it was
    written only on "E", so finishing normally discarded all decisions.
    """
    history_op = {}
    if os.path.exists(_HISTORY_FILE):
        with open(_HISTORY_FILE) as h:
            history_op = json.load(h)
    print(history_op)

    try:
        for day in range(24, 27):
            for hour in range(0, 24):
                path = "%s/sampled/unknown_sample/debugdogcom-medata_wanted-2017-09-%d-%d.txt" % (
                    _SAMPLE_ROOT, day, hour)
                if not os.path.exists(path) or os.path.getsize(path) == 0:
                    print("%s passed, file not exists or empty file." % path)
                    continue
                print("%s running..." % path)
                try:
                    domains_info = parse_metadata(path)
                except IOError as e:
                    print(e)
                    continue
                for domain, group in domains_info:
                    _print_group_summary(domain, group)
                    _label_domain(path, day, hour, domain, group, history_op)
                print("*******************************")
                print("%s check over..." % path)
                print("*******************************")
    finally:
        # Labeled files are already on disk at this point, so the history
        # must be kept in step with them on every way out.
        _save_history(history_op)


if __name__ == "__main__":
    main()
标签:-%,domain,示意,标记,样本,break,print,path,history From: https://blog.51cto.com/u_11908275/6385867