-
-
[分享]文件相似度的计算
-
发表于:
2013-6-21 23:45
8721
-
相似度 = (1 - (差异数) / max(sizeof(文件1), sizeof(文件2))) * (min(sizeof(文件1), sizeof(文件2)) / max(sizeof(文件1), sizeof(文件2)))
相似度取值范围:0(不相似)→ 1(相似)。一个应用是判断一批变形蠕虫样本是不是同一种蠕虫。
// filr_sim.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <vector>
#include <fstream>
#include <math.h>
#include <iostream>
using namespace std;
// Reads the whole file `name` (wide-character path) into `v` in binary mode.
//
// Returns false when the file cannot be opened, when its size cannot be
// determined, or when fewer bytes than expected could be read. `v` is left
// untouched if the open or the size query fails and is cleared if the read
// comes up short. An empty file is a success and yields an empty `v`.
//
// NOTE(review): constructing a stream from a `const wchar_t*` path is not
// offered by every standard library (MSVC provides it) - confirm the target
// toolchain before reusing this overload elsewhere.
bool loadfile(const wchar_t *name, std::vector<unsigned char> &v)
{
    std::fstream fs(name, std::ios::in | std::ios::binary);
    if (!fs.is_open())
        return false;

    // Measure the file by seeking to its end; tellg() reports -1 on failure.
    fs.seekg(0, std::ios::end);
    const std::streamoff size = fs.tellg();
    fs.seekg(0, std::ios::beg);
    if (size < 0 || !fs)
        return false;

    v.resize(static_cast<std::size_t>(size));
    if (v.empty())
        return true; // nothing to read, and &v[0] is not valid on an empty vector

    fs.read(reinterpret_cast<char *>(&v[0]), size);
    if (fs.gcount() != size)
    {
        // Short read: do not hand back a partially filled buffer.
        v.clear();
        return false;
    }
    return true;
}
// Reads the whole file `name` (narrow-character path) into `v` in binary mode.
//
// Returns false when the file cannot be opened, when its size cannot be
// determined, or when fewer bytes than expected could be read. `v` is left
// untouched if the open or the size query fails and is cleared if the read
// comes up short. An empty file is a success and yields an empty `v`.
bool loadfile(const char *name, std::vector<unsigned char> &v)
{
    std::fstream fs(name, std::ios::in | std::ios::binary);
    if (!fs.is_open())
        return false;

    // Measure the file by seeking to its end; tellg() reports -1 on failure.
    fs.seekg(0, std::ios::end);
    const std::streamoff size = fs.tellg();
    fs.seekg(0, std::ios::beg);
    if (size < 0 || !fs)
        return false;

    v.resize(static_cast<std::size_t>(size));
    if (v.empty())
        return true; // nothing to read, and &v[0] is not valid on an empty vector

    fs.read(reinterpret_cast<char *>(&v[0]), size);
    if (fs.gcount() != size)
    {
        // Short read: do not hand back a partially filled buffer.
        v.clear();
        return false;
    }
    return true;
}
// Writes the bytes of `v` to the file `name` in binary mode, replacing any
// previous contents.
//
// Returns false when the file cannot be opened or when writing/closing the
// stream fails. An empty `v` produces an empty file and counts as success.
bool savefile(const char *name, std::vector<unsigned char> &v)
{
    std::fstream fs(name, std::ios::out | std::ios::binary);
    if (!fs.is_open())
        return false;

    // &v[0] is not valid on an empty vector, so only write when there is data.
    if (!v.empty())
        fs.write(reinterpret_cast<const char *>(&v[0]),
                 static_cast<std::streamsize>(v.size()));

    // Closing flushes the buffer; a failed flush/close sets failbit.
    fs.close();
    return !fs.fail();
}
// Holder for a vector of integer counters.
// file2vec() in this file sizes _v to 256 * 256 entries and indexes it with
// (byte * 256 + following byte); sim() compares two such vectors entry by entry.
struct bitvec
{
    std::vector<int> _v; // the counters; empty until something resizes it
};
// Counts every pair of adjacent bytes in buf[0 .. len) into `vec`.
//
// vec._v is resized to 256 * 256 entries; the pair (buf[i], buf[i + 1]) bumps
// the entry at index buf[i] * 256 + buf[i + 1]. Counters already present in
// vec._v are kept (resize does not reset them), so repeated calls on the same
// `vec` accumulate.
//
// Buffers with fewer than two bytes contain no pair: nothing is counted and
// `buf` is not dereferenced (the original `len - 1` bound wrapped around for
// len == 0 because size_t is unsigned).
//
// Always returns true.
bool file2vec(unsigned char *buf, size_t len, bitvec &vec)
{
    vec._v.resize(256 * 256);

    // Start at the second byte so the bound never needs `len - 1`.
    for (size_t i = 1; i < len; ++i)
    {
        const unsigned char lead = buf[i - 1];
        const unsigned char follow = buf[i];
        vec._v[lead * 256 + follow] += 1;
    }
    return true;
}
// Similarity score of two counter vectors.
//
// With diff = sum of |vec1[i] - vec2[i]|, n1 = sum of vec1 and n2 = sum of vec2:
//
//   (1 - diff / (max(n1, n2) + 1)) * ((min(n1, n2) + 1) / (max(n1, n2) + 1))
//
// The "+ 1" terms keep the divisions defined when both vectors sum to zero.
// Identical vectors score exactly 1.
//
// Vectors of different length are compared as if the shorter one were padded
// with zeros, so an empty (never filled) bitvec no longer causes an
// out-of-range read.
//
// NOTE(review): the score is not bounded below by 0 - diff can be as large as
// n1 + n2, which makes the first factor negative for very dissimilar inputs.
inline double sim(bitvec &vec1, bitvec &vec2)
{
    const std::size_t size1 = vec1._v.size();
    const std::size_t size2 = vec2._v.size();
    const std::size_t count = size1 > size2 ? size1 : size2;

    double diff = 0;
    double total1 = 0;
    double total2 = 0;
    for (std::size_t i = 0; i < count; ++i)
    {
        // Entries past the end of the shorter vector count as zero.
        const double a = i < size1 ? static_cast<double>(vec1._v[i]) : 0.0;
        const double b = i < size2 ? static_cast<double>(vec2._v[i]) : 0.0;

        diff += a > b ? a - b : b - a;
        total1 += a;
        total2 += b;
    }

    // Spelled out by hand so the block needs neither <algorithm> nor <cstdlib>.
    const double larger = total1 > total2 ? total1 : total2;
    const double smaller = total1 > total2 ? total2 : total1;
    return (1 - diff / (larger + 1)) * ((smaller + 1) / (larger + 1));
}
// Entry point: prints the similarity score of two files to standard output.
//
// When at least two command-line arguments are given they are used as the two
// file paths; otherwise the original defaults "calc1.exe" and "calc2.exe" in
// the current directory are compared.
//
// Returns 0 on success and 1 when either file cannot be loaded or is empty.
// Previously load failures were ignored, which either took &buf[0] of an empty
// buffer or silently compared the first file with itself.
int _tmain(int argc, _TCHAR* argv[])
{
    const bool fromArgs = argc >= 3;

    std::vector<unsigned char> buf;

    const bool firstLoaded = fromArgs ? loadfile(argv[1], buf)
                                      : loadfile("calc1.exe", buf);
    if (!firstLoaded || buf.empty())
    {
        std::cerr << "error: first input file is missing, unreadable or empty" << std::endl;
        return 1;
    }
    bitvec vec;
    file2vec(&buf[0], buf.size(), vec);

    // Drop the first file's bytes so a failed second load cannot reuse them.
    buf.clear();
    const bool secondLoaded = fromArgs ? loadfile(argv[2], buf)
                                       : loadfile("calc2.exe", buf);
    if (!secondLoaded || buf.empty())
    {
        std::cerr << "error: second input file is missing, unreadable or empty" << std::endl;
        return 1;
    }
    bitvec vec1;
    file2vec(&buf[0], buf.size(), vec1);

    std::cout << sim(vec, vec1) << std::endl;
    return 0;
}
[培训]内核驱动高级班,冲击BAT一流互联网大厂工作,每周日13:00-18:00直播授课