KNN模型家族Top10列表
测试集家族Top10列表
判定依据:
特别说明:
结论如下:
0.95错误样本抽样分析
0.90错误样本抽样分析
| 恶意家族标签 | 恶意样本数量 | 恶意样本占比 |
| --- | --- | --- |
| virlock | 169631 | 26.83% |
| botnet | 168238 | 26.61% |
| gandcrab | 76101 | 12.04% |
| wabot | 47340 | 7.49% |
| pluto | 28554 | 4.52% |
| coinminer | 19226 | 3.04% |
| autorun | 17681 | 2.80% |
| virut | 8012 | 1.27% |
| gandcrypt | 7210 | 1.14% |
| upatre | 6717 | 1.06% |
| 总计 | 548710 | 86.79% |
| 家族标签 | 样本数量 | 样本占比 |
| --- | --- | --- |
| clean | 200 | 50.00% |
| gandcrypt | 14 | 3.50% |
| upatre | 13 | 3.25% |
| ipamor | 11 | 2.75% |
| wabot | 11 | 2.75% |
| pluto | 9 | 2.25% |
| autoit | 9 | 2.25% |
| virut | 8 | 2.00% |
| allaple | 8 | 2.00% |
| sytro | 6 | 1.50% |
| 总计 | 289 | 72.25% |
import
sys
import
requests
from
python_mmdt.mmdt.common
import
mmdt_load
# Similarity threshold: a match only counts as a detection when the reported
# similarity is strictly greater than this value.
dlt = 0.95
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def mmdt_scan_online_check():
    """Submit labelled mmdt hashes to the online scan service and print metrics.

    The path of the feature file is read from ``sys.argv[1]`` and loaded with
    ``mmdt_load``. Each entry is split on ``":"``: the first two fields are
    re-joined to form the mmdt hash, the third is the true label and the
    fourth is the file hash (sent as ``md5``, ``sha1`` and ``file_name``).

    Every entry is POSTed as JSON to the scan endpoint. A response whose
    ``status`` is 20001 is only reported and is not counted in any of the
    TP/TN/FP/FN buckets. At most 500 entries are processed.

    One CSV-like line is printed per classified entry, followed by the totals
    and the accuracy / precision / recall figures. A ratio whose denominator
    is zero is reported as 0.0 instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: if ``sys.argv`` has no file argument or an entry has
            fewer than four ``":"``-separated fields.
    """
    file_name = sys.argv[1]
    features = mmdt_load(file_name)
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    count = 0
    print('检测结果,文件md5,真实标签,相似文件,预测标签,相似度')
    for feature in features:
        count += 1
        tmp = feature.strip().split(":")
        file_mmdt = ':'.join(tmp[:2])
        tag = tmp[2]
        file_sha1 = tmp[3]
        data = {
            "md5": file_sha1,
            "sha1": file_sha1,
            "file_name": file_sha1,
            "mmdt": file_mmdt,
            "data": {},
        }
        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # run indefinitely -- consider adding one.
        r = requests.post(url='http://146.56.242.184/mmdt/scan', json=data)
        r_data = r.json()
        if r_data.get('status', 0) == 20001:
            # NOTE(review): 20001 is treated as "no scan result"; the exact
            # meaning of the code is defined by the service -- confirm.
            status = r_data.get('status', 0)
            message = r_data.get('message', '')
            print('文件md5: %s, 状态码: %d, 提交信息: %s' % (file_sha1, status, message))
        else:
            result = r_data.get('data') or {}
            label = result.get('label', 'unknown')
            # An empty or missing "similars" list must not raise IndexError;
            # fall back to the same defaults used for missing keys.
            similars = result.get('similars') or []
            best = similars[0] if similars else {}
            sim_hash = best.get('hash', 'None')
            sim = best.get('sim', 0.0)
            check_result = ''
            if tag == label and sim > dlt:
                TP += 1
                check_result = '正确'
            elif tag == 'clean' and sim > dlt:
                FP += 1
                check_result = '错误'
            elif tag == 'clean' and sim <= dlt:
                TN += 1
                check_result = '正确'
            else:
                FN += 1
                check_result = '错误'
            print('%s,%s,%s,%s,%s,%.5f' % (check_result, file_sha1, tag, sim_hash, label, sim))
        # Cap the run at 500 submitted entries.
        if count >= 500:
            break
    print('测试mmdthash总数:%d' % count)
    print('检测正确总数:%d' % (TP + TN))
    print('检测错误总数:%d' % (FP + FN))
    print('检测TP总数:%d' % TP)
    print('检测TN总数:%d' % TN)
    print('检测FP总数:%d' % FP)
    print('检测FN总数:%d' % FN)
    print('检测准确率ACC:%.3f' % _safe_ratio(TP + TN, TP + TN + FP + FN))
    print('检测精确率PRE:%.3f' % _safe_ratio(TP, TP + FP))
    print('检测召回率REC:%.3f' % _safe_ratio(TP, TP + FN))
def main():
    """Script entry point: run the online scan check."""
    mmdt_scan_online_check()
# Run the check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
import
sys
import
requests
from
python_mmdt.mmdt.common
import
mmdt_load
# Similarity threshold: a match only counts as a detection when the reported
# similarity is strictly greater than this value.
dlt = 0.95
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def mmdt_scan_online_check():
    """Submit labelled mmdt hashes to the online scan service and print metrics.

    The path of the feature file is read from ``sys.argv[1]`` and loaded with
    ``mmdt_load``. Each entry is split on ``":"``: the first two fields are
    re-joined to form the mmdt hash, the third is the true label and the
    fourth is the file hash (sent as ``md5``, ``sha1`` and ``file_name``).

    Every entry is POSTed as JSON to the scan endpoint. A response whose
    ``status`` is 20001 is only reported and is not counted in any of the
    TP/TN/FP/FN buckets. At most 500 entries are processed.

    One CSV-like line is printed per classified entry, followed by the totals
    and the accuracy / precision / recall figures. A ratio whose denominator
    is zero is reported as 0.0 instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: if ``sys.argv`` has no file argument or an entry has
            fewer than four ``":"``-separated fields.
    """
    file_name = sys.argv[1]
    features = mmdt_load(file_name)
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    count = 0
    print('检测结果,文件md5,真实标签,相似文件,预测标签,相似度')
    for feature in features:
        count += 1
        tmp = feature.strip().split(":")
        file_mmdt = ':'.join(tmp[:2])
        tag = tmp[2]
        file_sha1 = tmp[3]
        data = {
            "md5": file_sha1,
            "sha1": file_sha1,
            "file_name": file_sha1,
            "mmdt": file_mmdt,
            "data": {},
        }
        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # run indefinitely -- consider adding one.
        r = requests.post(url='http://146.56.242.184/mmdt/scan', json=data)
        r_data = r.json()
        if r_data.get('status', 0) == 20001:
            # NOTE(review): 20001 is treated as "no scan result"; the exact
            # meaning of the code is defined by the service -- confirm.
            status = r_data.get('status', 0)
            message = r_data.get('message', '')
            print('文件md5: %s, 状态码: %d, 提交信息: %s' % (file_sha1, status, message))
        else:
            result = r_data.get('data') or {}
            label = result.get('label', 'unknown')
            # An empty or missing "similars" list must not raise IndexError;
            # fall back to the same defaults used for missing keys.
            similars = result.get('similars') or []
            best = similars[0] if similars else {}
            sim_hash = best.get('hash', 'None')
            sim = best.get('sim', 0.0)
            check_result = ''
            if tag == label and sim > dlt:
                TP += 1
                check_result = '正确'
            elif tag == 'clean' and sim > dlt:
                FP += 1
                check_result = '错误'
            elif tag == 'clean' and sim <= dlt:
                TN += 1
                check_result = '正确'
            else:
                FN += 1
                check_result = '错误'
            print('%s,%s,%s,%s,%s,%.5f' % (check_result, file_sha1, tag, sim_hash, label, sim))
        # Cap the run at 500 submitted entries.
        if count >= 500:
            break
    print('测试mmdthash总数:%d' % count)
    print('检测正确总数:%d' % (TP + TN))
    print('检测错误总数:%d' % (FP + FN))
    print('检测TP总数:%d' % TP)
    print('检测TN总数:%d' % TN)
    print('检测FP总数:%d' % FP)
    print('检测FN总数:%d' % FN)
    print('检测准确率ACC:%.3f' % _safe_ratio(TP + TN, TP + TN + FP + FN))
    print('检测精确率PRE:%.3f' % _safe_ratio(TP, TP + FP))
    print('检测召回率REC:%.3f' % _safe_ratio(TP, TP + FN))
[培训]内核驱动高级班,冲击BAT一流互联网大厂工作,每周日13:00-18:00直播授课
最后于 2022-1-24 14:46 被大大薇薇编辑,原因:增加测试数据