Add machine learning feature extraction for PE files
- Implemented MachineLearning class with ExtractFeatures method - Updated project files to include new machine learning source and header files - Modified main executable to call feature extraction - Updated VSCode settings to include additional C++ headers - Commented out previous file dumping code in main function
This commit is contained in:
4
.vscode/settings.json
vendored
4
.vscode/settings.json
vendored
@@ -56,6 +56,8 @@
|
||||
"xtr1common": "cpp",
|
||||
"xtree": "cpp",
|
||||
"xutility": "cpp",
|
||||
"functional": "cpp"
|
||||
"functional": "cpp",
|
||||
"array": "cpp",
|
||||
"numeric": "cpp"
|
||||
}
|
||||
}
|
||||
@@ -46,7 +46,9 @@ int main() {
|
||||
if (peBuffer) {
|
||||
printf("peBuffer: %p\n", peBuffer.get());
|
||||
printf("peSize: %d\n", peSize);
|
||||
peconv::dump_to_file("z:\\dumped_main.exe", peBuffer.get(), peSize);
|
||||
// peconv::dump_to_file("z:\\dumped_main.exe", peBuffer.get(), peSize);
|
||||
MachineLearning ml;
|
||||
ml.ExtractFeatures(peBuffer.get(), peSize, "z:\\features.txt");
|
||||
}
|
||||
peBuffer.release();
|
||||
system("pause");
|
||||
|
||||
@@ -170,6 +170,7 @@
|
||||
<ClCompile Include="libpeconv\libpeconv\src\resource_parser.cpp" />
|
||||
<ClCompile Include="libpeconv\libpeconv\src\resource_util.cpp" />
|
||||
<ClCompile Include="libpeconv\libpeconv\src\util.cpp" />
|
||||
<ClCompile Include="ml.cpp" />
|
||||
<ClCompile Include="sandbox.cpp" />
|
||||
<ClCompile Include="sandbox_api_emu.cpp" />
|
||||
<ClCompile Include="sandbox_callbacks.cpp" />
|
||||
@@ -178,6 +179,7 @@
|
||||
<ClInclude Include="head.h" />
|
||||
<ClInclude Include="libpeconv\libpeconv\src\fix_dot_net_ep.h" />
|
||||
<ClInclude Include="libpeconv\libpeconv\src\ntddk.h" />
|
||||
<ClInclude Include="ml.h" />
|
||||
<ClInclude Include="native_struct.h" />
|
||||
<ClInclude Include="sandbox.h" />
|
||||
<ClInclude Include="sandbox_callbacks.h" />
|
||||
|
||||
@@ -22,6 +22,12 @@
|
||||
<Filter Include="头文件\libpe">
|
||||
<UniqueIdentifier>{38ea362d-55dc-410e-92f1-3a44ced4dc2d}</UniqueIdentifier>
|
||||
</Filter>
|
||||
<Filter Include="源文件\machine_learning">
|
||||
<UniqueIdentifier>{2b38b24a-cb8f-41db-bd53-4a25f8152c17}</UniqueIdentifier>
|
||||
</Filter>
|
||||
<Filter Include="头文件\machine_learning">
|
||||
<UniqueIdentifier>{65a79261-ea29-4842-b41c-7983eddbdc85}</UniqueIdentifier>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ai_anti_malware.cpp">
|
||||
@@ -117,6 +123,9 @@
|
||||
<ClCompile Include="sandbox_api_emu.cpp">
|
||||
<Filter>源文件\sandbox</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="ml.cpp">
|
||||
<Filter>源文件\machine_learning</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="head.h">
|
||||
@@ -137,5 +146,8 @@
|
||||
<ClInclude Include="sandbox_callbacks.h">
|
||||
<Filter>头文件\sandbox</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="ml.h">
|
||||
<Filter>头文件\machine_learning</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
@@ -29,3 +29,4 @@ struct BasicPeInfo {
|
||||
PIMAGE_NT_HEADERS32 ntHead32;
|
||||
};
|
||||
#include "sandbox.h"
|
||||
#include "ml.h"
|
||||
591
ai_anti_malware/ml.cpp
Normal file
591
ai_anti_malware/ml.cpp
Normal file
@@ -0,0 +1,591 @@
|
||||
#include "ml.h"
|
||||
#include <array>
|
||||
#include <limits>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <cfloat>
|
||||
|
||||
// 确保std命名空间中的函数可用
|
||||
using std::max;
|
||||
using std::min;
|
||||
|
||||
MachineLearning::MachineLearning() {
|
||||
// 初始化属性列表
|
||||
_properties = {"has_configuration", "has_debug", "has_exceptions",
|
||||
"has_exports", "has_imports", "has_nx",
|
||||
"has_relocations", "has_resources", "has_signatures",
|
||||
"has_tls", "has_entry_iat", "has_image_base",
|
||||
"has_delay_imports", "has_rich"};
|
||||
|
||||
// 初始化库列表
|
||||
_libraries = {"libssp-0",
|
||||
"kernel32",
|
||||
"user32",
|
||||
"advapi32",
|
||||
"oleaut32",
|
||||
"shell32",
|
||||
"ole32",
|
||||
"gdi32",
|
||||
"comctl32",
|
||||
"version",
|
||||
"msvcrt",
|
||||
"comdlg32",
|
||||
"shlwapi",
|
||||
"wininet",
|
||||
"ws2_32",
|
||||
"winmm",
|
||||
"winspool.drv",
|
||||
"wsock32",
|
||||
"msvbvm60",
|
||||
"rpcrt4",
|
||||
"mpr",
|
||||
"psapi",
|
||||
"iphlpapi",
|
||||
"ntdll",
|
||||
"msimg32",
|
||||
"mscoree",
|
||||
"crypt32",
|
||||
"gdiplus",
|
||||
"userenv",
|
||||
"crtdll",
|
||||
"oledlg",
|
||||
"mfc42",
|
||||
"urlmon",
|
||||
"imm32",
|
||||
"rtl100.bpl",
|
||||
"netapi32",
|
||||
"wintrust",
|
||||
"vcl100.bpl",
|
||||
"vcl50.bpl",
|
||||
"uxtheme",
|
||||
"setupapi",
|
||||
"ntoskrnl.pe",
|
||||
"msi",
|
||||
"msvcp60",
|
||||
"lz32",
|
||||
"winhttp",
|
||||
"hal",
|
||||
"core.bpl",
|
||||
"rbrcl1416.bpl",
|
||||
"dbghelp",
|
||||
"api-ms-win-crt-runtime-l1-1-0",
|
||||
"api-ms-win-crt-heap-l1-1-0",
|
||||
"api-ms-win-crt-math-l1-1-0",
|
||||
"api-ms-win-crt-stdio-l1-1-0",
|
||||
"api-ms-win-crt-locale-l1-1-0",
|
||||
"oleacc",
|
||||
"komponentyd17.bpl",
|
||||
"job.bpl",
|
||||
"cam.bpl",
|
||||
"vcruntime140",
|
||||
"secur32",
|
||||
"msvcr100",
|
||||
"cxeditorsrs17.bpl",
|
||||
"rasapi32",
|
||||
"api-ms-win-crt-string-l1-1-0",
|
||||
"wtsapi32",
|
||||
"imagehlp",
|
||||
"msvcp140",
|
||||
"cnc.bpl",
|
||||
"indyprotocols190.bpl",
|
||||
"api-ms-win-crt-convert-l1-1-0",
|
||||
"msvcr120",
|
||||
"vcl60.bpl",
|
||||
"rbrcl210.bpl",
|
||||
"rtl170.bpl",
|
||||
"rbide1416.bpl",
|
||||
"rtl60.bpl",
|
||||
"vcl170.bpl",
|
||||
"wldap32",
|
||||
"shfolder",
|
||||
"cxlibraryrs17.bpl",
|
||||
"msvcirt",
|
||||
"report.bpl",
|
||||
"rtl190.bpl",
|
||||
"msvcr90",
|
||||
"api-ms-win-crt-filesystem-l1-1-0",
|
||||
"cxeditorsrs16.bpl",
|
||||
"avifil32",
|
||||
"api-ms-win-crt-time-l1-1-0",
|
||||
"jli",
|
||||
"graphic.bpl",
|
||||
"olepro32",
|
||||
"rtl160.bpl",
|
||||
"spmmachine.bpl",
|
||||
"cabinet",
|
||||
"indycore190.bpl",
|
||||
"sacom210.bpl",
|
||||
"rbrtl1416.bpl",
|
||||
"api-ms-win-crt-utility-l1-1-0",
|
||||
"vcl160.bpl",
|
||||
"api-ms-win-crt-environment-l1-1-0",
|
||||
"zcomponent170.bpl",
|
||||
"msvfw32",
|
||||
"libadm_coreutils6",
|
||||
"rbsha",
|
||||
"dxpscorers16.bpl",
|
||||
"msacm32",
|
||||
"vcl70.bpl",
|
||||
"applicationmanagement.bpl",
|
||||
"jobgui.bpl",
|
||||
"indyprotocols170.bpl",
|
||||
"rtl70.bpl",
|
||||
"cxed210.bpl",
|
||||
"msvcr80",
|
||||
"libadm_coretinypy6",
|
||||
"ucrtbased",
|
||||
"vcruntime140d",
|
||||
"msvcp120",
|
||||
"msvcp140d",
|
||||
"dinput8",
|
||||
"gui.bpl",
|
||||
"maincontrols.bpl",
|
||||
"rtl120.bpl",
|
||||
"jcl170.bpl",
|
||||
"frx17.bpl",
|
||||
"fs17.bpl",
|
||||
"vcl190.bpl",
|
||||
"sdl2",
|
||||
"machine.bpl",
|
||||
"mfc42u",
|
||||
"normaliz",
|
||||
"sdl2_gfx",
|
||||
"sdl2_ttf",
|
||||
"sdl2_mixer",
|
||||
"msvcp80",
|
||||
"cxgridrs17.bpl",
|
||||
"cxeditorsvcld7.bpl",
|
||||
"libeay32",
|
||||
"cxlibraryd11.bpl",
|
||||
"vcl120.bpl",
|
||||
"gr32_d6.bpl",
|
||||
"cxlibraryrs16.bpl",
|
||||
"cxgridrs16.bpl",
|
||||
"vcl40.bpl",
|
||||
"opengl32",
|
||||
"qt5core",
|
||||
"qtcore4",
|
||||
"wdfldr.sys",
|
||||
"nesting.bpl",
|
||||
"fltmgr.sys"};
|
||||
}
|
||||
|
||||
// Nothing to release by hand: every member is a standard container that
// cleans up after itself.
MachineLearning::~MachineLearning() = default;
|
||||
|
||||
bool MachineLearning::ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
|
||||
const std::string& outputPath) {
|
||||
// 使用libpeconv解析PE文件
|
||||
size_t v_size = 0;
|
||||
BYTE* peBuffer = peconv::load_pe_module(const_cast<BYTE*>(buffer),
|
||||
bufferSize, v_size, false, false);
|
||||
if (!peBuffer) {
|
||||
std::cerr << "无法加载PE文件" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// 解析PE信息
|
||||
PeInfo peInfo;
|
||||
std::vector<SectionInfo> sections;
|
||||
std::vector<std::string> importedLibraries;
|
||||
std::vector<uint8_t> entrypointBytes;
|
||||
|
||||
// 检查是否为64位PE
|
||||
peInfo.isX64 = peconv::is64bit(peBuffer);
|
||||
|
||||
// 获取PE头信息
|
||||
PIMAGE_NT_HEADERS ntHeaders =
|
||||
(PIMAGE_NT_HEADERS)peconv::get_nt_hdrs(peBuffer);
|
||||
if (!ntHeaders) {
|
||||
peconv::free_pe_buffer(peBuffer);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 从NT头部获取信息
|
||||
if (peInfo.isX64) {
|
||||
// 64位PE文件
|
||||
PIMAGE_NT_HEADERS64 ntHeaders64 = (PIMAGE_NT_HEADERS64)ntHeaders;
|
||||
peInfo.addressOfEntryPoint =
|
||||
ntHeaders64->OptionalHeader.AddressOfEntryPoint;
|
||||
peInfo.baseOfCode = ntHeaders64->OptionalHeader.BaseOfCode;
|
||||
peInfo.sizeOfCode = ntHeaders64->OptionalHeader.SizeOfCode;
|
||||
peInfo.sizeOfImage = ntHeaders64->OptionalHeader.SizeOfImage;
|
||||
peInfo.sizeOfHeaders = ntHeaders64->OptionalHeader.SizeOfHeaders;
|
||||
peInfo.characteristics = ntHeaders64->FileHeader.Characteristics;
|
||||
peInfo.dllCharacteristics =
|
||||
ntHeaders64->OptionalHeader.DllCharacteristics;
|
||||
} else {
|
||||
// 32位PE文件
|
||||
PIMAGE_NT_HEADERS32 ntHeaders32 = (PIMAGE_NT_HEADERS32)ntHeaders;
|
||||
peInfo.addressOfEntryPoint =
|
||||
ntHeaders32->OptionalHeader.AddressOfEntryPoint;
|
||||
peInfo.baseOfCode = ntHeaders32->OptionalHeader.BaseOfCode;
|
||||
peInfo.sizeOfCode = ntHeaders32->OptionalHeader.SizeOfCode;
|
||||
peInfo.sizeOfImage = ntHeaders32->OptionalHeader.SizeOfImage;
|
||||
peInfo.sizeOfHeaders = ntHeaders32->OptionalHeader.SizeOfHeaders;
|
||||
peInfo.characteristics = ntHeaders32->FileHeader.Characteristics;
|
||||
peInfo.dllCharacteristics =
|
||||
ntHeaders32->OptionalHeader.DllCharacteristics;
|
||||
}
|
||||
|
||||
// 检查PE目录
|
||||
IMAGE_DATA_DIRECTORY* dataDir = peconv::get_directory_entry(
|
||||
peBuffer, IMAGE_DIRECTORY_ENTRY_COM_DESCRIPTOR);
|
||||
peInfo.hasConfiguration = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
dataDir =
|
||||
peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_DEBUG);
|
||||
peInfo.hasDebug = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
dataDir =
|
||||
peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_EXCEPTION);
|
||||
peInfo.hasExceptions = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
dataDir =
|
||||
peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_EXPORT);
|
||||
peInfo.hasExports = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
dataDir =
|
||||
peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_IMPORT);
|
||||
peInfo.hasImports = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
// NX标志检查
|
||||
peInfo.hasNx =
|
||||
(peInfo.dllCharacteristics & IMAGE_DLLCHARACTERISTICS_NX_COMPAT) != 0;
|
||||
|
||||
dataDir =
|
||||
peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_BASERELOC);
|
||||
peInfo.hasRelocations = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
dataDir =
|
||||
peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_RESOURCE);
|
||||
peInfo.hasResources = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
dataDir =
|
||||
peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_SECURITY);
|
||||
peInfo.hasSignatures = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
dataDir = peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_TLS);
|
||||
peInfo.hasTls = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
dataDir = peconv::get_directory_entry(peBuffer,
|
||||
IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT);
|
||||
peInfo.hasDelayImports = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
peInfo.hasImageBase = true; // PE文件都有ImageBase
|
||||
|
||||
dataDir = peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_IAT);
|
||||
peInfo.hasEntryIat = dataDir && dataDir->VirtualAddress != 0;
|
||||
|
||||
// Rich头部检测
|
||||
peInfo.hasRich = false;
|
||||
PIMAGE_DOS_HEADER dosHeader = reinterpret_cast<PIMAGE_DOS_HEADER>(peBuffer);
|
||||
if (dosHeader && dosHeader->e_magic == IMAGE_DOS_SIGNATURE) {
|
||||
const uint32_t* richPtr = reinterpret_cast<const uint32_t*>(
|
||||
peBuffer + sizeof(IMAGE_DOS_HEADER));
|
||||
size_t maxLen = dosHeader->e_lfanew - sizeof(IMAGE_DOS_HEADER);
|
||||
for (size_t i = 0; i < maxLen / 4 - 1; i++) {
|
||||
if (richPtr[i] == 0x68636952) { // "Rich"
|
||||
peInfo.hasRich = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 获取导入DLL列表
|
||||
if (peInfo.hasImports) {
|
||||
size_t impRva = 0;
|
||||
IMAGE_DATA_DIRECTORY* impDir =
|
||||
peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_IMPORT);
|
||||
if (impDir) {
|
||||
impRva = impDir->VirtualAddress;
|
||||
IMAGE_IMPORT_DESCRIPTOR* impDesc =
|
||||
reinterpret_cast<IMAGE_IMPORT_DESCRIPTOR*>(
|
||||
RvaToPtr(impRva, peBuffer));
|
||||
while (impDesc && impDesc->Name != 0) {
|
||||
char* libName =
|
||||
reinterpret_cast<char*>(RvaToPtr(impDesc->Name, peBuffer));
|
||||
if (libName) {
|
||||
std::string libNameStr = libName;
|
||||
std::transform(libNameStr.begin(), libNameStr.end(),
|
||||
libNameStr.begin(), [](unsigned char c) {
|
||||
return std::tolower(c);
|
||||
});
|
||||
importedLibraries.push_back(libNameStr);
|
||||
}
|
||||
impDesc++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 获取节区信息
|
||||
size_t sectionsCount = peconv::get_sections_count(peBuffer, bufferSize);
|
||||
for (size_t i = 0; i < sectionsCount; i++) {
|
||||
PIMAGE_SECTION_HEADER section =
|
||||
peconv::get_section_hdr(peBuffer, bufferSize, i);
|
||||
if (!section) continue;
|
||||
|
||||
SectionInfo secInfo;
|
||||
secInfo.characteristics = section->Characteristics;
|
||||
secInfo.sizeOfRawData = section->SizeOfRawData;
|
||||
secInfo.virtualSize = section->Misc.VirtualSize;
|
||||
|
||||
// 计算节区熵
|
||||
BYTE* sectionData = RvaToPtr(section->VirtualAddress, peBuffer);
|
||||
secInfo.entropy =
|
||||
(sectionData && section->SizeOfRawData > 0)
|
||||
? CalculateEntropy(sectionData, section->SizeOfRawData)
|
||||
: 0.0;
|
||||
|
||||
sections.push_back(secInfo);
|
||||
}
|
||||
|
||||
// 获取入口点前255字节
|
||||
if (peInfo.addressOfEntryPoint > 0) {
|
||||
BYTE* epPtr = RvaToPtr(peInfo.addressOfEntryPoint, peBuffer);
|
||||
if (epPtr) {
|
||||
// 确保不会越界
|
||||
size_t maxBytes =
|
||||
std::min<size_t>(255, bufferSize - (epPtr - peBuffer));
|
||||
entrypointBytes.assign(epPtr, epPtr + maxBytes);
|
||||
}
|
||||
}
|
||||
|
||||
// 提取所有特征
|
||||
std::vector<double> allFeatures;
|
||||
|
||||
// 1. PE段属性
|
||||
std::vector<double> propFeatures =
|
||||
EncodeProperties(peInfo, importedLibraries);
|
||||
allFeatures.insert(allFeatures.end(), propFeatures.begin(),
|
||||
propFeatures.end());
|
||||
|
||||
// 2. 导入DLL检测
|
||||
std::vector<double> libFeatures = EncodeLibraries(importedLibraries);
|
||||
allFeatures.insert(allFeatures.end(), libFeatures.begin(),
|
||||
libFeatures.end());
|
||||
|
||||
// 3. 文件熵
|
||||
double fileEntropy = CalculateEntropy(buffer, bufferSize);
|
||||
allFeatures.push_back(fileEntropy);
|
||||
|
||||
// 4. 入口点前255字节
|
||||
std::vector<double> epFeatures = EncodeEntrypoint(entrypointBytes);
|
||||
allFeatures.insert(allFeatures.end(), epFeatures.begin(), epFeatures.end());
|
||||
|
||||
// 5. 节区信息
|
||||
std::vector<double> secFeatures = EncodeSections(sections, peInfo.isX64);
|
||||
allFeatures.insert(allFeatures.end(), secFeatures.begin(),
|
||||
secFeatures.end());
|
||||
|
||||
// 6. 文件和代码段的比率
|
||||
double codeRatio =
|
||||
(peInfo.sizeOfCode > 0 && peInfo.sizeOfImage > 0)
|
||||
? static_cast<double>(peInfo.sizeOfCode) / peInfo.sizeOfImage
|
||||
: 0.0;
|
||||
allFeatures.push_back(codeRatio);
|
||||
|
||||
// 7. 节区数量
|
||||
allFeatures.push_back(static_cast<double>(sections.size()));
|
||||
|
||||
// 导出特征到CSV
|
||||
bool result = ExportToCSV(allFeatures, outputPath);
|
||||
|
||||
// 清理资源
|
||||
peconv::free_pe_buffer(peBuffer);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<double> MachineLearning::EncodeProperties(
|
||||
const PeInfo& peInfo, const std::vector<std::string>& dllTables) {
|
||||
std::vector<double> features;
|
||||
|
||||
// 添加各属性的布尔值(转为double: 1.0=true, 0.0=false)
|
||||
features.push_back(peInfo.hasConfiguration ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasDebug ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasExceptions ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasExports ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasImports ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasNx ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasRelocations ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasResources ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasSignatures ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasTls ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasEntryIat ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasImageBase ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasDelayImports ? 1.0 : 0.0);
|
||||
features.push_back(peInfo.hasRich ? 1.0 : 0.0);
|
||||
|
||||
return features;
|
||||
}
|
||||
|
||||
// Scales each entry-point byte into [0, 1] (byte / 255.0) and zero-pads the
// result so that it holds at least 64 values. Inputs longer than 64 bytes
// are kept in full, so the result length is max(epBytes.size(), 64).
// NOTE(review): the caller collects up to 255 bytes while the padding target
// here is 64, so the width of this feature group can vary between files —
// confirm which fixed width the model expects.
std::vector<double> MachineLearning::EncodeEntrypoint(
    const std::vector<uint8_t>& epBytes) {
    const size_t kMinWidth = 64;

    std::vector<double> features(epBytes.size());
    std::transform(epBytes.begin(), epBytes.end(), features.begin(),
                   [](uint8_t value) {
                       return static_cast<double>(value) / 255.0;
                   });

    if (features.size() < kMinWidth) {
        features.resize(kMinWidth, 0.0);
    }
    return features;
}
|
||||
|
||||
// Returns the normalized byte histogram of data[0..size): 256 values, where
// entry b is the fraction of bytes equal to b. A null pointer or zero size
// yields 256 zeros.
std::vector<double> MachineLearning::EncodeHistogram(const uint8_t* data,
                                                     size_t size) {
    std::vector<double> histogram(256, 0.0);
    if (!data || size == 0) {
        return histogram;
    }

    // Count occurrences of every byte value.
    const uint8_t* const end = data + size;
    for (const uint8_t* cursor = data; cursor != end; ++cursor) {
        histogram[*cursor]++;
    }

    // Convert the counts into relative frequencies.
    const double total = static_cast<double>(size);
    for (size_t bin = 0; bin < histogram.size(); ++bin) {
        histogram[bin] /= total;
    }
    return histogram;
}
|
||||
|
||||
std::vector<double> MachineLearning::EncodeLibraries(
|
||||
const std::vector<std::string>& importedLibraries) {
|
||||
std::vector<double> features(_libraries.size(), 0.0);
|
||||
|
||||
// 检查每个库是否被导入
|
||||
for (size_t i = 0; i < _libraries.size(); i++) {
|
||||
const std::string& lib = _libraries[i];
|
||||
for (const auto& imported : importedLibraries) {
|
||||
if (imported.find(lib) != std::string::npos) {
|
||||
features[i] = 1.0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return features;
|
||||
}
|
||||
|
||||
std::vector<double> MachineLearning::EncodeSections(
|
||||
const std::vector<SectionInfo>& sections, bool isX64) {
|
||||
std::vector<double> features;
|
||||
size_t numSections = sections.size();
|
||||
if (numSections == 0) {
|
||||
return std::vector<double>(5, 0.0); // 返回全零特征
|
||||
}
|
||||
|
||||
// 计算熵特征
|
||||
double totalEntropy = 0.0;
|
||||
double maxEntropy = 0.0;
|
||||
for (const auto& sec : sections) {
|
||||
totalEntropy += sec.entropy;
|
||||
if (sec.entropy > maxEntropy) {
|
||||
maxEntropy = sec.entropy;
|
||||
}
|
||||
}
|
||||
double avgEntropy = totalEntropy / numSections;
|
||||
double normAvgEntropy = (maxEntropy > 0) ? avgEntropy / maxEntropy : 0.0;
|
||||
|
||||
// 计算大小比率
|
||||
double maxSize = 0.0;
|
||||
double minVSize = DBL_MAX;
|
||||
for (const auto& sec : sections) {
|
||||
if (static_cast<double>(sec.sizeOfRawData) > maxSize) {
|
||||
maxSize = static_cast<double>(sec.sizeOfRawData);
|
||||
}
|
||||
if (sec.virtualSize > 0 &&
|
||||
static_cast<double>(sec.virtualSize) < minVSize) {
|
||||
minVSize = static_cast<double>(sec.virtualSize);
|
||||
}
|
||||
}
|
||||
|
||||
// 根据PE文件类型调整计算方式
|
||||
double normSize = 0.0;
|
||||
if (minVSize > 0 && minVSize != DBL_MAX) {
|
||||
if (isX64) {
|
||||
// 64位PE文件可能有更大的对齐要求
|
||||
normSize = maxSize / (minVSize * 2.0);
|
||||
} else {
|
||||
// 32位PE文件的处理方式
|
||||
normSize = maxSize / minVSize;
|
||||
}
|
||||
}
|
||||
|
||||
// 返回特征
|
||||
features.push_back(static_cast<double>(numSections));
|
||||
features.push_back(avgEntropy);
|
||||
features.push_back(maxEntropy);
|
||||
features.push_back(normAvgEntropy);
|
||||
features.push_back(normSize);
|
||||
|
||||
return features;
|
||||
}
|
||||
|
||||
// Computes the Shannon entropy, in bits per byte (0.0 .. 8.0), of
// data[0..size). Returns 0.0 for a null pointer or an empty range.
double MachineLearning::CalculateEntropy(const uint8_t* data, size_t size) {
    if (data == nullptr || size == 0) {
        return 0.0;
    }

    // Occurrence count of every possible byte value.
    std::array<double, 256> counts = {};
    const uint8_t* const end = data + size;
    for (const uint8_t* cursor = data; cursor != end; ++cursor) {
        counts[*cursor] += 1.0;
    }

    // H = -sum(p * log2(p)) over the byte values that actually occur.
    const double total = static_cast<double>(size);
    double bits = 0.0;
    for (size_t value = 0; value < counts.size(); ++value) {
        if (counts[value] > 0) {
            const double probability = counts[value] / total;
            bits -= probability * std::log2(probability);
        }
    }
    return bits;
}
|
||||
|
||||
// Writes the features as a single comma-separated line (fixed notation,
// six decimals) to outputPath, replacing any existing file.
// Returns false when the file cannot be opened or when writing fails
// (for example, disk full); true only if the whole line reached the stream.
bool MachineLearning::ExportToCSV(const std::vector<double>& features,
                                  const std::string& outputPath) {
    std::ofstream outFile(outputPath);
    if (!outFile.is_open()) {
        std::cerr << "无法打开输出文件: " << outputPath << std::endl;
        return false;
    }

    // The formatting flags are sticky, so they only need to be set once.
    outFile << std::fixed << std::setprecision(6);
    for (size_t i = 0; i < features.size(); i++) {
        if (i != 0) {
            outFile << ",";
        }
        outFile << features[i];
    }
    outFile << '\n';

    // Flush explicitly and check the stream state so that a failed write is
    // reported to the caller instead of being silently ignored.
    outFile.flush();
    if (!outFile) {
        std::cerr << "写入输出文件失败: " << outputPath << std::endl;
        return false;
    }

    outFile.close();
    return !outFile.fail();
}
|
||||
|
||||
// Placeholder kept so the interface stays complete: the arguments are
// ignored and the result is always 0.
int MachineLearning::GetOpcodeType(const void* code, bool isX64) {
    static_cast<void>(code);
    static_cast<void>(isX64);
    const int kUnclassified = 0;
    return kUnclassified;
}
|
||||
|
||||
std::tuple<std::vector<double>, std::vector<int>>
|
||||
MachineLearning::GetOpcodeStatistics(const uint8_t* data, size_t dataSize,
|
||||
bool isX64, const PeInfo& peInfo) {
|
||||
// 此函数未使用,但保留实现接口
|
||||
return std::make_tuple(std::vector<double>(), std::vector<int>());
|
||||
}
|
||||
128
ai_anti_malware/ml.h
Normal file
128
ai_anti_malware/ml.h
Normal file
@@ -0,0 +1,128 @@
|
||||
#pragma once
|
||||
#include "head.h"
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
#include <unordered_map>
|
||||
|
||||
// Forward declarations. PeInfo and SectionInfo are defined further down in
// this header. BasicPeInfo is forward-declared with the `struct` key so that
// it agrees with a `struct BasicPeInfo` definition (a class/struct mismatch
// triggers compiler warnings such as MSVC C4099).
struct PeInfo;
struct SectionInfo;
struct BasicPeInfo;
|
||||
|
||||
// RVA转换为内存中的指针的辅助函数
|
||||
inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
|
||||
if (!peBuffer || rva == 0) return nullptr;
|
||||
|
||||
PIMAGE_NT_HEADERS ntHeaders =
|
||||
(PIMAGE_NT_HEADERS)peconv::get_nt_hdrs(peBuffer);
|
||||
if (!ntHeaders) return nullptr;
|
||||
|
||||
PIMAGE_SECTION_HEADER section = IMAGE_FIRST_SECTION(ntHeaders);
|
||||
WORD numSections = ntHeaders->FileHeader.NumberOfSections;
|
||||
|
||||
for (WORD i = 0; i < numSections; i++, section++) {
|
||||
// 检查RVA是否在这个节区范围内
|
||||
if (rva >= section->VirtualAddress &&
|
||||
rva < section->VirtualAddress + section->Misc.VirtualSize) {
|
||||
// 计算文件偏移
|
||||
DWORD offset =
|
||||
rva - section->VirtualAddress + section->PointerToRawData;
|
||||
return peBuffer + offset;
|
||||
}
|
||||
}
|
||||
|
||||
// 如果RVA在PE头部内
|
||||
DWORD sizeOfHeaders = 0;
|
||||
bool isX64 = peconv::is64bit(peBuffer);
|
||||
|
||||
if (isX64) {
|
||||
PIMAGE_NT_HEADERS64 ntHeaders64 = (PIMAGE_NT_HEADERS64)ntHeaders;
|
||||
sizeOfHeaders = ntHeaders64->OptionalHeader.SizeOfHeaders;
|
||||
} else {
|
||||
PIMAGE_NT_HEADERS32 ntHeaders32 = (PIMAGE_NT_HEADERS32)ntHeaders;
|
||||
sizeOfHeaders = ntHeaders32->OptionalHeader.SizeOfHeaders;
|
||||
}
|
||||
|
||||
if (rva < sizeOfHeaders) {
|
||||
return peBuffer + rva;
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
class MachineLearning {
|
||||
public:
|
||||
MachineLearning();
|
||||
~MachineLearning();
|
||||
|
||||
// 主函数:提取特征并导出到CSV
|
||||
bool ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
|
||||
const std::string& outputPath);
|
||||
|
||||
private:
|
||||
// 特征提取辅助函数
|
||||
std::vector<double> EncodeProperties(
|
||||
const PeInfo& peInfo, const std::vector<std::string>& dllTables);
|
||||
std::vector<double> EncodeEntrypoint(const std::vector<uint8_t>& epBytes);
|
||||
std::vector<double> EncodeHistogram(const uint8_t* data, size_t size);
|
||||
std::vector<double> EncodeLibraries(
|
||||
const std::vector<std::string>& dllTable);
|
||||
std::vector<double> EncodeSections(const std::vector<SectionInfo>& sections,
|
||||
bool isX64);
|
||||
std::tuple<std::vector<double>, std::vector<int>> GetOpcodeStatistics(
|
||||
const uint8_t* data, size_t dataSize, bool isX64, const PeInfo& peInfo);
|
||||
int GetOpcodeType(const void* code, bool isX64);
|
||||
double CalculateEntropy(const uint8_t* data, size_t size);
|
||||
|
||||
// 将特征导出到CSV
|
||||
bool ExportToCSV(const std::vector<double>& features,
|
||||
const std::string& outputPath);
|
||||
|
||||
// 常量定义
|
||||
std::vector<std::string> _properties;
|
||||
std::vector<std::string> _libraries;
|
||||
std::unordered_map<std::string, int> _opcodeTypeDict;
|
||||
};
|
||||
|
||||
// Header fields and presence flags collected from a PE file.
// Every member has a default initializer, so a default-constructed PeInfo
// is fully defined (all zero / false) even before the parser fills it in.
struct PeInfo {
    // Values copied from the NT headers.
    uint32_t addressOfEntryPoint = 0;
    uint32_t baseOfCode = 0;
    uint32_t sizeOfCode = 0;
    uint32_t sizeOfImage = 0;
    uint32_t sizeOfHeaders = 0;
    uint32_t characteristics = 0;
    uint32_t dllCharacteristics = 0;
    bool isX64 = false;

    // Presence flags, mostly one per PE data directory.
    bool hasConfiguration = false;
    bool hasDebug = false;
    bool hasExceptions = false;
    bool hasExports = false;
    bool hasImports = false;
    bool hasNx = false;  // NX-compatible flag
    bool hasRelocations = false;
    bool hasResources = false;
    bool hasSignatures = false;
    bool hasTls = false;
    bool hasDelayImports = false;
    bool hasImageBase = false;
    bool hasEntryIat = false;
    bool hasRich = false;
};
|
||||
|
||||
// Per-section data used by the section feature encoder.
// Every member has a default initializer, so a default-constructed
// SectionInfo is fully defined (all zero).
struct SectionInfo {
    uint32_t characteristics = 0;  // section header Characteristics
    double entropy = 0.0;          // entropy computed for the section data
    uint32_t sizeOfRawData = 0;    // section header SizeOfRawData
    uint32_t virtualSize = 0;      // section header Misc.VirtualSize
};
|
||||
@@ -164,7 +164,44 @@ class cFixImprot : public peconv::t_function_resolver {
|
||||
};
|
||||
Sandbox::Sandbox() {}
|
||||
|
||||
Sandbox::~Sandbox() {}
|
||||
// Tears the sandbox down in a fixed order: bookkeeping containers first,
// then the emulator engine, the emulated heap, the stack buffer and finally
// the disassembler handle.
Sandbox::~Sandbox() {
    // 1. High-level bookkeeping containers.
    m_crossSectionExecution.clear();
    envStrings.clear();
    api_map.clear();
    m_moduleList.clear();
    m_impFuncDict.clear();
    m_exportFuncDict.clear();

    // 2. Emulator engine (and with it the memory mappings).
    if (m_ucEngine) {
        uc_close(m_ucEngine);
        m_ucEngine = nullptr;
    }

    // 3. Emulated heap: free every block of each segment's linked list,
    //    then the segment itself.
    for (auto& [address, segment] : m_heapSegments) {
        for (HeapBlock* block = segment->blocks; block;) {
            HeapBlock* const following = block->next;
            delete block;
            block = following;
        }
        delete segment;
    }
    m_heapSegments.clear();

    // 4. Stack buffer.
    if (m_stackBuffer) {
        free(m_stackBuffer);
        m_stackBuffer = nullptr;
    }

    // 5. Disassembler handle goes last.
    if (m_csHandle) {
        cs_close(&m_csHandle);
    }
}
|
||||
|
||||
auto Sandbox::PushModuleToVM(const char* dllName, uint64_t moduleBase) -> void {
|
||||
for (auto module : m_moduleList) {
|
||||
@@ -401,9 +438,9 @@ auto Sandbox::SetupVirtualMachine() -> void {
|
||||
/*
|
||||
映射 m_KSharedUserDataBase
|
||||
*/
|
||||
uint64_t m_KSharedUserDataBase = 0x7FFE0000;
|
||||
m_KSharedUserDataBase = 0x7FFE0000;
|
||||
uint64_t m_KSharedUserDataEnd = 0x7FFE0FFF; // 0x7FFE2000
|
||||
uint64_t m_KSharedUserDataSize = AlignToSectionAlignment(
|
||||
m_KSharedUserDataSize = AlignToSectionAlignment(
|
||||
m_KSharedUserDataEnd - m_KSharedUserDataBase, PAGE_SIZE);
|
||||
|
||||
uc_mem_map(m_ucEngine, m_KSharedUserDataBase, m_KSharedUserDataSize,
|
||||
@@ -663,29 +700,9 @@ auto Sandbox::Run() -> void {
|
||||
InitApiHooks();
|
||||
std::cout << "Starting execution at " << std::hex << entryPoint
|
||||
<< std::endl;
|
||||
err = uc_emu_start(m_ucEngine, entryPoint, m_peInfo->imageEnd, 0, 0);
|
||||
if (err != UC_ERR_OK) {
|
||||
std::cerr << "Emulation error: " << uc_strerror(err) << std::endl;
|
||||
|
||||
// 32位环境下的错误处理
|
||||
if (!m_peInfo->isX64) {
|
||||
uint32_t eip;
|
||||
uc_reg_read(m_ucEngine, UC_X86_REG_EIP, &eip);
|
||||
std::cerr << "Error occurred at EIP: 0x" << std::hex << eip
|
||||
<< std::endl;
|
||||
|
||||
// 尝试读取当前指令
|
||||
uint8_t instruction[16];
|
||||
if (uc_mem_read(m_ucEngine, eip, instruction,
|
||||
sizeof(instruction)) == UC_ERR_OK) {
|
||||
std::cerr << "Instruction bytes: ";
|
||||
for (int i = 0; i < 16; i++) {
|
||||
printf("%02X ", instruction[i]);
|
||||
}
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
uint64_t timeout = 60 * 1000;
|
||||
err = uc_emu_start(m_ucEngine, entryPoint, m_peInfo->imageEnd, timeout, 0);
|
||||
std::cerr << "Emulation error: " << uc_strerror(err) << std::endl;
|
||||
}
|
||||
|
||||
auto Sandbox::GetEnvString() -> std::vector<wchar_t> {
|
||||
@@ -909,11 +926,11 @@ auto Sandbox::DumpPE() -> std::pair<std::unique_ptr<BYTE[]>, size_t> {
|
||||
reinterpret_cast<HMODULE>(moduleBuffer.get()),
|
||||
module->base);
|
||||
}
|
||||
//这里有一个严重的问题,就懒得处理了:
|
||||
//壳里面吐出来的代码的导入表和壳的导入表不是同样一个.
|
||||
//这个修的是壳的 导入表,所以导入表 修 不 全
|
||||
//有个很简单的办法,需要搜索IAT结构,然后修改脱壳后的IAT的字段到壳的字段里面,然后再执行一次fix_imports
|
||||
//懒得写了,家庭作业.自己完成
|
||||
// 这里有一个严重的问题,就懒得处理了:
|
||||
// 壳里面吐出来的代码的导入表和壳的导入表不是同样一个.
|
||||
// 这个修的是壳的 导入表,所以导入表 修 不 全
|
||||
// 有个很简单的办法,需要搜索IAT结构,然后修改脱壳后的IAT的字段到壳的字段里面,然后再执行一次fix_imports
|
||||
// 懒得写了,家庭作业.自己完成
|
||||
bool importsFixed = peconv::fix_imports(
|
||||
resultBuffer.get(), virtualMemorySize, exportsMap, nullptr);
|
||||
if (importsFixed) {
|
||||
|
||||
@@ -217,4 +217,6 @@ class Sandbox {
|
||||
auto InitCommandLine(std::string commandLine) -> void;
|
||||
std::vector<uint64_t> m_crossSectionExecution; // 记录跨区段执行地址
|
||||
uint64_t m_lastExecuteSectionIndex = 0; // 上次执行的区段索引
|
||||
uint64_t m_KSharedUserDataBase{0};
|
||||
uint64_t m_KSharedUserDataSize{0};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user