Download: wget https://mem:[email protected]/mem-prototype/mem-data-analysis-framework.tar.gz
docker load < mem-data-analysis-framework.tar.gz
docker run --network="host" -t -d mem-prototype:latest
Then open localhost:8080 in a browser to see the code (the Zeppelin notebooks).
Upload data into the Docker container:
sudo docker cp x.txt e66d9d295e54:/tmp/
[sudo] password for kali:
root@kali:/work# ls /tmp/
hsperfdata_root spark-events x.txt zeppelin-index
The data is now visible under /tmp.
Compute pairwise similarities, then "cluster" the fingerprints with k-NN based on those similarities:
%spark.pyspark
# Returns all edges between server behaviors where k versions match.
# For the remaining client hellos without matches, one of the responses has to be a timeout.
def get_core(versions, k):
    versions = versions.select(f.col('id').alias('id_a'), f.col('versions').alias('v_a'))\
        .join(versions.select(f.col('id').alias('id_b'), f.col('versions').alias('v_b')), f.col('id_a') <= f.col('id_b'))\
        .filter(version_match(f.col('v_a'), f.col('v_b'), f.lit(k)))
    print(f'threshold: {k}, version matches: {versions.count()}')
    return versions

versions = df_only_fp.select(f.col('versions'), f.col('timeouts')).distinct().sort(f.col('versions').asc()).withColumn("id", f.monotonically_increasing_id())
versions.write.mode('overwrite').parquet(f'/tmp/versions.parquet')
versions = spark.read.parquet(f'/tmp/versions.parquet')
versions = versions.filter(f.col('timeouts') <= 5)

for i in range(0, 9):
    tmp_versions = get_core(versions, 24 - i)
    edges = calculate_similarites(df_only_fp, tmp_versions)
    edges.write.mode('overwrite').parquet(f'/tmp/fp_distances_raw_{24-i}_new.parquet')
    edges = spark.read.parquet(f'/tmp/fp_distances_raw_{24-i}_new.parquet')
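This paragraph calls version_match and calculate_similarites, which are defined elsewhere in the notebook and not shown here. As a rough illustration only, a version_match UDF might look like the sketch below (my assumption, not the notebook's actual implementation; in particular it ignores the timeout condition mentioned in the comment above):
%spark.pyspark
# Hypothetical sketch of version_match (assumed, not the framework's code):
# treat two behaviors as matching when at least k of their per-client-hello
# version responses agree.
from pyspark.sql import functions as f
from pyspark.sql.types import BooleanType

@f.udf(BooleanType())
def version_match(v_a, v_b, k):
    if v_a is None or v_b is None:
        return False
    matches = sum(1 for a, b in zip(v_a, v_b) if a == b)
    return matches >= k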
%spark.pyspark
window = Window.partitionBy('fp_a').orderBy(col('similarity').desc_nulls_last())

# The functions temporarily store results for faster processing afterwards.
def generate_knndf(fp_distances, k, file_name):
    if k != 0:
        knn_df = fp_distances.withColumn('row', f.rank().over(window)).where(col('row') <= k).drop('row')
    else:
        knn_df = fp_distances
    knn_df.write.mode('overwrite').parquet(os.path.join('/tmp/', file_name))
    return knn_df

def generate_simple_knndf(knn_df, file_name):
    # Normalize edge direction (fp_a < fp_b) and deduplicate
    s_knn_df = knn_df.withColumn('fp_1', f.when(col('fp_a') < col('fp_b'), col('fp_a')).otherwise(col('fp_b')))\
        .withColumn('fp_2', f.when(col('fp_1') == col('fp_b'), col('fp_a')).otherwise(col('fp_b')))\
        .drop('fp_a').drop('fp_b')\
        .withColumnRenamed('fp_1', 'fp_a').withColumnRenamed('fp_2', 'fp_b').distinct()
    s_knn_df.write.mode('overwrite').parquet(os.path.join('/tmp/', 's_' + file_name))

def generate_g_knndf(knn_df, file_name):
    gf_knn = GraphFrame(df_only_fp.select(col('fingerprint_all').alias('id')), knn_df.select(col('fp_a').alias('src'), col('fp_b').alias('dst')))
    g_knn_df = gf_knn.connectedComponents().select(col('id').alias('fingerprint_all'), 'component')
    g_knn_df.write.mode('overwrite').parquet(os.path.join('/tmp/', 'g_' + file_name))

def generate(k):
    file_name = f'{k}_knn_df.parquet'
    knn_df = generate_knndf(fp_distances_raw, k, file_name)
    knn_df = spark.read.parquet(os.path.join('/tmp/', file_name))
    generate_simple_knndf(knn_df, file_name)
    generate_g_knndf(knn_df, file_name)

# Read k-NN graph files
def get_knn(k):
    file_name = f'{k}_knn_df.parquet'
    if not os.path.exists(os.path.join('/tmp/', file_name)):
        raise ValueError()
    knn_df = spark.read.parquet(os.path.join('/tmp/', file_name))
    if not os.path.exists(os.path.join('/tmp/', 's_' + file_name)):
        raise ValueError()
    s_knn_df = spark.read.parquet(os.path.join('/tmp/', 's_' + file_name))
    if not os.path.exists(os.path.join('/tmp/', 'g_' + file_name)):
        raise ValueError()
    g_knn_df = spark.read.parquet(os.path.join('/tmp/', 'g_' + file_name))
    component_count = g_knn_df.select('component').distinct().count()
    return knn_df, s_knn_df, g_knn_df, component_count
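A hypothetical usage of the functions above (assuming fp_distances_raw is loaded from one of the fp_distances_raw_*_new.parquet files written in the previous paragraph, and that the k values are just examples):
%spark.pyspark
# Hypothetical usage sketch: build k-NN graphs for a few example k values,
# then reload one and report its number of connected components.
fp_distances_raw = spark.read.parquet('/tmp/fp_distances_raw_24_new.parquet')
for k in [1, 3, 5, 10]:
    generate(k)
knn_df, s_knn_df, g_knn_df, component_count = get_knn(5)
print(f'k=5: {component_count} connected components')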
2-Analyze-FPs
Output processing:
def read_parsed_scan(scan, label):
    file_dir = os.path.join(scan, "certs.parquet")
    certs = spark.read.parquet(file_dir)
    file_dir = os.path.join(scan, "fingerprints.parquet")
    fps = spark.read.parquet(file_dir)
    file_dir = os.path.join(scan, "http.parquet")
    http = spark.read.parquet(file_dir)
    file_dir = os.path.join(scan, "tls_verbose.parquet")
    tls = spark.read.parquet(file_dir)
    file_dir = os.path.join(scan, "input.parquet")
    labeled_input = spark.read.parquet(file_dir)
    file_dir = os.path.join(scan, "hosts.parquet")
    hosts = spark.read.parquet(file_dir)
    hosts = hosts.withColumn('label', f.lit(label))
    hosts = hosts.fillna("empty", ['server_name'])
    return labeled_input, hosts, tls, fps, certs, http
Input sources:
bl_labeled_input, bl_hosts, bl_tls, bl_fps, bl_certs, bl_http = read_parsed_scan("/data/blocklist-parsed", "blocklist")
tl_labeled_input, tl_hosts, tl_tls, tl_fps, tl_certs, tl_http = read_parsed_scan("/data/toplist-parsed", "toplist")
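As a quick sanity check (hypothetical, not part of the notebook), one could count the distinct server behaviors per scan at this point:
# Hypothetical check, using the DataFrames loaded above
print('blocklist fingerprints:', bl_fps.select('fingerprint_all').distinct().count())
print('toplist fingerprints:', tl_fps.select('fingerprint_all').distinct().count())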
Input paths:
/data/blocklist-parsed
/data/toplist-parsed
Output on the default data:
Top 5 behaviors from block lists (based on number of targets):
+--------------------+-------+--------------+---------------------+--------------------+
| fingerprint_all|targets|Distinct Ports|Distinct IP addresses| collect_set(labels)|
+--------------------+-------+--------------+---------------------+--------------------+
|771_c030_65281.-1...| 57| 4| 57|[[Dridex], [Emotet]]|
|771_9c_65281.-35....| 51| 10| 49| [[QakBot]]|
|771_c02f_65281.-1...| 38| 14| 38| [[Dridex]]|
|771_c02f_65281.-1...| 36| 19| 36| [[Dridex]]|
|769_c014_65281.-1...| 22| 2| 22| [[TrickBot]]|
+--------------------+-------+--------------+---------------------+--------------------+
Top 5 behaviors from top lists (based on number of targets):
+--------------------+-------+----------------+---------------------+
| fingerprint_all|targets|Distinct Domains|Distinct IP addresses|
+--------------------+-------+----------------+---------------------+
|771_1301_51.29-43...| 272| 55| 260|
|771_1301_43.AwQ-5...| 128| 24| 118|
|771_1301_51.29-43...| 110| 31| 110|
|771_1301_51.29-43...| 96| 25| 74|
|771_1301_51.29-43...| 93| 38| 65|
+--------------------+-------+----------------+---------------------+
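The tables above summarize server behaviors by fingerprint_all. A sketch of the kind of aggregation that could produce such a table (my assumption; the notebook's actual query and join keys may differ):
# Hypothetical aggregation sketch (assumed, not the notebook's exact query)
top_behaviors = bl_fps.join(bl_labeled_input, on=['ip', 'port'], how='left')\
    .groupBy('fingerprint_all')\
    .agg(f.countDistinct('ip', 'port').alias('targets'),
         f.countDistinct('port').alias('Distinct Ports'),
         f.countDistinct('ip').alias('Distinct IP addresses'),
         f.collect_set('label'))
top_behaviors.orderBy(f.col('targets').desc()).show(5, truncate=20)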
The data above comes from the first step, 1-Parse-Scan. Let's look at its inputs and outputs:
def parse(input_dir, tmp_dir, out, toplist=False):
    # Load and save all input csv files as parquets
    with ThreadPool(processes=4) as pool:
        def process_file(t):
            file = os.path.join(input_dir, t[0])
            try:
                df_tmp = load_csv(file, schema=t[1])
                save_to_parquet(out, df_tmp, t[2], 4)
            except FileNotFoundError:
                logging.info(f'Skipping {t}')
            except Exception as err:
                logging.exception('Could not save csv', exc_info=err)
        jobs = []
        r = pool.map_async(process_file, [
            ('tls_verbose.csv', TLS_VERBOSE_DF_SCHEMA, 'tls_verbose'),
            ('http.csv', HTTP_DF_SCHEMA, 'http'),
            ('certs.csv', CERTS_DF_SCHEMA, 'certs'),
            ('labeled-input.csv', INPUT_DF_SCHEMA, 'input_tmp')
        ], chunksize=1)
        jobs.append(r)
        hosts_csv = load_csv(os.path.join(input_dir, 'hosts.csv'), schema=HOSTS_DF_SCHEMA)
        df_ip = hosts_csv.select('ip').distinct().mapInPandas(get_map_pandas_add_as(
            os.path.join(input_dir.replace('file://', ''), '..', 'pyasn.dat'),
            os.path.join(input_dir.replace('file://', ''), '..', 'pyasn.asnames.json')), ADD_AS_SCHEMA)
        hosts_csv = hosts_csv.join(df_ip, on='ip', how='left_outer')
        r = pool.apply_async(lambda: save_to_parquet(out, hosts_csv, 'hosts', 4))
        jobs.append(r)
        for p in jobs:
            p.get()
    # Reload for faster processing
    hosts_csv = spark.read.parquet(os.path.join(out, 'hosts.parquet'))
    input_csv = spark.read.parquet(os.path.join(out, 'input_tmp.parquet'))
    tls_csv = spark.read.parquet(os.path.join(out, 'tls_verbose.parquet'))
    # Join input for scan ids
    if not toplist:
        input_csv = input_csv.join(hosts_csv.select('id', 'server_name', 'ip', 'port'), (input_csv.ip.eqNullSafe(hosts_csv.ip)) & (input_csv.server_name.eqNullSafe(hosts_csv.server_name)) & (input_csv.port.eqNullSafe(hosts_csv.port)))\
            .select(input_csv.ip, input_csv.server_name, input_csv.rank, input_csv.label, input_csv.port, input_csv.list, hosts_csv.id).distinct()
    else:
        input_csv = input_csv.join(hosts_csv.select('id', 'server_name', 'ip', 'port'), (input_csv.server_name.eqNullSafe(hosts_csv.server_name)))\
            .select(hosts_csv.ip, input_csv.server_name, input_csv.rank, input_csv.label, hosts_csv.port, input_csv.list, hosts_csv.id).distinct()
    save_to_parquet(out, input_csv, 'input', 4)
    splittext = f.udf(lambda FullPath: FullPath.split('.')[0], StringType())
    # Compute TLS fingerprints
    hosts_csv = hosts_csv.fillna('default', subset='client_hello').withColumn('client_hello_simple', splittext("client_hello"))
    hosts_csv = hosts_csv.drop('client_hello').withColumnRenamed('client_hello_simple', 'client_hello')
    pivot_src_df = hosts_csv.join(tls_csv, on='id').select('id', 'ip', 'port', 'server_name', 'client_hello', 'fingerprint')
    # Compute combined fingerprints
    client_hellos_escaped = sorted(pivot_src_df.select('client_hello').distinct().rdd.map(lambda r: r[0]).collect())
    fingerprint_df = pivot_src_df.groupBy('ip', 'port', 'server_name').pivot('client_hello', client_hellos_escaped).agg(f.first('fingerprint'))
    fingerprint_df = fingerprint_df.withColumn('fingerprint_all', f.array_join(remove_status_request(f.array(*client_hellos_escaped)), delimiter='|', null_replacement='______<255'))
    # fingerprint_df = add_fingerprint_col(fingerprint_df, 'fingerprint_all', client_hellos)
    # Save joined fingerprints
    save_to_parquet(out, fingerprint_df, 'fingerprints', 4, partition_columns=['ip', 'port'])
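parse() relies on helpers such as load_csv and save_to_parquet that are defined elsewhere in the framework; a minimal sketch of what they might look like (my assumption, not the framework's actual code):
# Hypothetical sketches of the helpers used by parse() (assumed)
import os

def load_csv(path, schema):
    # read a CSV with an explicit schema; header handling is an assumption
    return spark.read.csv(path, schema=schema, header=True)

def save_to_parquet(out_dir, df, name, num_partitions, partition_columns=None):
    writer = df.repartition(num_partitions).write.mode('overwrite')
    if partition_columns:
        writer = writer.partitionBy(*partition_columns)
    writer.parquet(os.path.join(out_dir, f'{name}.parquet'))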
parse("/data/blocklist", "/tmp", "/data/blocklist-parsed")
parse("/data/toplist", "/tmp", "/data/toplist-parsed", toplist=True)
So the inputs live in /data, and the outputs are exactly /data/blocklist-parsed and /data/toplist-parsed.
Below is the result after I changed blocklist-parsed to blocklist-parsed2 and re-ran:
/data# ls
blocklist blocklist-parsed2 pyasn.asnames.json toplist toplist-parsed2
blocklist-parsed blocklist.zip pyasn.dat toplist-parsed toplist.zip
Go into the container's bash shell to look at the data (the container ID 322c74ee1119 comes from docker ps):
sudo docker exec -it 322c74ee1119 /bin/bash
Let's look at the contents of blocklist:
/data/blocklist# ls
blocklist.input certs.csv http.csv labeled-input.csv tls_verbose.csv
cert_chain.csv hosts.csv http_verbose.csv log
A look at the hosts file shows more than 50,000 records in total.
Now the contents of toplist:
/data/toplist# ls
alexa.input certs.csv http.csv labeled-input.csv tls_verbose.csv
cert_chain.csv hosts.csv http_verbose.csv log
The hosts file here holds more than 60,000 records.
Results of running the third part: