This commit is contained in:
Huoji's
2025-03-09 03:19:40 +08:00
parent 1cea516cf7
commit defe59ffe8
7 changed files with 337 additions and 59 deletions

View File

@@ -1,4 +1,5 @@
#include "ml.h"
#include <Windows.h>
#include <array>
#include <limits>
#include <algorithm>
@@ -7,6 +8,7 @@
#include <iomanip>
#include <sstream>
#include <cfloat>
#include <filesystem>
// 确保std命名空间中的函数可用
using std::max;
@@ -177,15 +179,14 @@ MachineLearning::~MachineLearning() {
// 析构函数,清理资源(如有必要)
}
bool MachineLearning::ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
const std::string& outputPath) {
std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
size_t bufferSize) {
// 使用libpeconv解析PE文件
size_t v_size = 0;
BYTE* peBuffer = peconv::load_pe_module(const_cast<BYTE*>(buffer),
bufferSize, v_size, false, false);
if (!peBuffer) {
std::cerr << "无法加载PE文件" << std::endl;
return false;
return std::vector<double>();
}
// 解析PE信息
@@ -202,7 +203,7 @@ bool MachineLearning::ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
(PIMAGE_NT_HEADERS)peconv::get_nt_hdrs(peBuffer);
if (!ntHeaders) {
peconv::free_pe_buffer(peBuffer);
return false;
return std::vector<double>();
}
// 从NT头部获取信息
@@ -392,13 +393,10 @@ bool MachineLearning::ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
// 7. 节区数量
allFeatures.push_back(static_cast<double>(sections.size()));
// 导出特征到CSV
bool result = ExportToCSV(allFeatures, outputPath);
// 清理资源
peconv::free_pe_buffer(peBuffer);
return result;
return allFeatures;
}
std::vector<double> MachineLearning::EncodeProperties(
@@ -588,4 +586,124 @@ MachineLearning::GetOpcodeStatistics(const uint8_t* data, size_t dataSize,
bool isX64, const PeInfo& peInfo) {
// 此函数未使用,但保留实现接口
return std::make_tuple(std::vector<double>(), std::vector<int>());
}
/// Reads the entire file at `filePath` into memory.
///
/// @param filePath Path of the file to load; it is opened in binary mode.
/// @return The file's bytes. An empty vector is returned when the file cannot
///         be opened, when its size cannot be determined, or when the read
///         fails. A zero-length file also yields an empty vector.
std::vector<uint8_t> MachineLearning::ReadFileToBuffer(
    const std::string& filePath) {
    // Open positioned at the end so that tellg() reports the file size.
    std::ifstream fileStream(filePath, std::ios::binary | std::ios::ate);
    if (!fileStream.is_open()) {
        std::cerr << "无法打开文件: " << filePath << std::endl;
        return std::vector<uint8_t>();
    }

    // tellg() yields -1 on failure. Using that value as a vector size would
    // convert it to a huge unsigned count and make the allocation throw, so
    // treat it as a read failure instead.
    const std::streamsize fileSize = fileStream.tellg();
    if (fileSize < 0) {
        std::cerr << "读取文件失败: " << filePath << std::endl;
        return std::vector<uint8_t>();
    }
    fileStream.seekg(0, std::ios::beg);

    // Allocate the buffer and read the whole file in one call.
    std::vector<uint8_t> buffer(static_cast<size_t>(fileSize));
    if (!fileStream.read(reinterpret_cast<char*>(buffer.data()), fileSize)) {
        std::cerr << "读取文件失败: " << filePath << std::endl;
        return std::vector<uint8_t>();
    }

    return buffer;
}
/// Walks `directoryPath` and every subdirectory below it, extracts features
/// from each file and writes one CSV row per successfully processed file.
///
/// A row consists of the file path followed by the values returned by
/// ExtractFeatures, written in fixed notation with 6 decimal places. No header
/// row is written (an earlier header-writing snippet was disabled).
///
/// @param directoryPath Root directory to scan.
/// @param outputCsvPath CSV file to create; it is opened exactly once for the
///                      whole traversal.
/// @return false when the CSV file cannot be created or the root directory
///         cannot be enumerated; true otherwise, including when individual
///         files or subdirectories had to be skipped.
bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
                                       const std::string& outputCsvPath) {
    // Open the CSV file for writing.
    std::ofstream csvFile(outputCsvPath);
    if (!csvFile.is_open()) {
        std::cerr << "无法创建CSV文件: " << outputCsvPath << std::endl;
        return false;
    }

    // Number formatting is sticky on the stream, so configure it once.
    csvFile << std::fixed << std::setprecision(6);

    int processedCount = 0;
    int failedCount = 0;

    // Subdirectories are visited through an explicit stack instead of calling
    // ProcessDirectory again: a nested call would reopen outputCsvPath and
    // truncate the rows that were already written.
    std::vector<std::string> pendingDirs;
    pendingDirs.push_back(directoryPath);
    bool atRoot = true;

    while (!pendingDirs.empty()) {
        const std::string currentDir = pendingDirs.back();
        pendingDirs.pop_back();
        const bool isRootDir = atRoot;
        atRoot = false;

        WIN32_FIND_DATAA findData;
        const std::string searchPath = currentDir + "\\*";
        HANDLE hFind = FindFirstFileA(searchPath.c_str(), &findData);
        if (hFind == INVALID_HANDLE_VALUE) {
            std::cerr << "无法访问目录: " << currentDir << std::endl;
            if (isRootDir) {
                // Nothing can be processed if the root itself is unreadable.
                return false;
            }
            // An unreadable subdirectory is skipped; the walk continues.
            continue;
        }

        do {
            // Skip the "." and ".." pseudo-entries.
            if (strcmp(findData.cFileName, ".") == 0 ||
                strcmp(findData.cFileName, "..") == 0) {
                continue;
            }

            const std::string currentPath =
                currentDir + "\\" + findData.cFileName;

            if (findData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
                // Defer the subdirectory until this one has been enumerated.
                pendingDirs.push_back(currentPath);
                continue;
            }

            // Regular entry: load the file contents.
            const std::vector<uint8_t> fileBuffer =
                ReadFileToBuffer(currentPath);
            if (fileBuffer.empty()) {
                std::cerr << "跳过文件: " << currentPath << " (读取失败)"
                          << std::endl;
                failedCount++;
                continue;
            }

            // Extract features; an empty result signals failure.
            const std::vector<double> features =
                ExtractFeatures(fileBuffer.data(), fileBuffer.size());
            if (features.empty()) {
                std::cerr << "跳过文件: " << currentPath << " (特征提取失败)"
                          << std::endl;
                failedCount++;
                continue;
            }

            // Write the CSV row. std::endl flushes, so each completed row is
            // handed to the OS before the next file is processed.
            csvFile << currentPath;
            for (const double feature : features) {
                csvFile << "," << feature;
            }
            csvFile << std::endl;

            processedCount++;
            if (processedCount % 100 == 0) {
                std::cout << "已处理 " << processedCount << " 个文件..."
                          << std::endl;
            }
        } while (FindNextFileA(hFind, &findData));

        FindClose(hFind);
    }

    csvFile.close();
    // Totals cover the whole tree and are reported once.
    printf("ML Process Result, success count: %d fail count: %d \n",
           processedCount, failedCount);
    return true;
}