Add machine learning feature extraction for PE files
- Implemented MachineLearning class with ExtractFeatures method - Updated project files to include new machine learning source and header files - Modified main executable to call feature extraction - Updated VSCode settings to include additional C++ headers - Commented out previous file dumping code in main function
This commit is contained in:
128
ai_anti_malware/ml.h
Normal file
128
ai_anti_malware/ml.h
Normal file
@@ -0,0 +1,128 @@
|
||||
#pragma once
|
||||
#include "head.h"
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
#include <unordered_map>
|
||||
|
||||
// 前向声明
|
||||
struct PeInfo;
|
||||
struct SectionInfo;
|
||||
class BasicPeInfo;
|
||||
|
||||
// RVA转换为内存中的指针的辅助函数
|
||||
inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
|
||||
if (!peBuffer || rva == 0) return nullptr;
|
||||
|
||||
PIMAGE_NT_HEADERS ntHeaders =
|
||||
(PIMAGE_NT_HEADERS)peconv::get_nt_hdrs(peBuffer);
|
||||
if (!ntHeaders) return nullptr;
|
||||
|
||||
PIMAGE_SECTION_HEADER section = IMAGE_FIRST_SECTION(ntHeaders);
|
||||
WORD numSections = ntHeaders->FileHeader.NumberOfSections;
|
||||
|
||||
for (WORD i = 0; i < numSections; i++, section++) {
|
||||
// 检查RVA是否在这个节区范围内
|
||||
if (rva >= section->VirtualAddress &&
|
||||
rva < section->VirtualAddress + section->Misc.VirtualSize) {
|
||||
// 计算文件偏移
|
||||
DWORD offset =
|
||||
rva - section->VirtualAddress + section->PointerToRawData;
|
||||
return peBuffer + offset;
|
||||
}
|
||||
}
|
||||
|
||||
// 如果RVA在PE头部内
|
||||
DWORD sizeOfHeaders = 0;
|
||||
bool isX64 = peconv::is64bit(peBuffer);
|
||||
|
||||
if (isX64) {
|
||||
PIMAGE_NT_HEADERS64 ntHeaders64 = (PIMAGE_NT_HEADERS64)ntHeaders;
|
||||
sizeOfHeaders = ntHeaders64->OptionalHeader.SizeOfHeaders;
|
||||
} else {
|
||||
PIMAGE_NT_HEADERS32 ntHeaders32 = (PIMAGE_NT_HEADERS32)ntHeaders;
|
||||
sizeOfHeaders = ntHeaders32->OptionalHeader.SizeOfHeaders;
|
||||
}
|
||||
|
||||
if (rva < sizeOfHeaders) {
|
||||
return peBuffer + rva;
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
class MachineLearning {
|
||||
public:
|
||||
MachineLearning();
|
||||
~MachineLearning();
|
||||
|
||||
// 主函数:提取特征并导出到CSV
|
||||
bool ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
|
||||
const std::string& outputPath);
|
||||
|
||||
private:
|
||||
// 特征提取辅助函数
|
||||
std::vector<double> EncodeProperties(
|
||||
const PeInfo& peInfo, const std::vector<std::string>& dllTables);
|
||||
std::vector<double> EncodeEntrypoint(const std::vector<uint8_t>& epBytes);
|
||||
std::vector<double> EncodeHistogram(const uint8_t* data, size_t size);
|
||||
std::vector<double> EncodeLibraries(
|
||||
const std::vector<std::string>& dllTable);
|
||||
std::vector<double> EncodeSections(const std::vector<SectionInfo>& sections,
|
||||
bool isX64);
|
||||
std::tuple<std::vector<double>, std::vector<int>> GetOpcodeStatistics(
|
||||
const uint8_t* data, size_t dataSize, bool isX64, const PeInfo& peInfo);
|
||||
int GetOpcodeType(const void* code, bool isX64);
|
||||
double CalculateEntropy(const uint8_t* data, size_t size);
|
||||
|
||||
// 将特征导出到CSV
|
||||
bool ExportToCSV(const std::vector<double>& features,
|
||||
const std::string& outputPath);
|
||||
|
||||
// 常量定义
|
||||
std::vector<std::string> _properties;
|
||||
std::vector<std::string> _libraries;
|
||||
std::unordered_map<std::string, int> _opcodeTypeDict;
|
||||
};
|
||||
|
||||
// PE文件信息结构
|
||||
struct PeInfo {
|
||||
uint32_t addressOfEntryPoint;
|
||||
uint32_t baseOfCode;
|
||||
uint32_t sizeOfCode;
|
||||
uint32_t sizeOfImage;
|
||||
uint32_t sizeOfHeaders;
|
||||
uint32_t characteristics;
|
||||
uint32_t dllCharacteristics;
|
||||
bool isX64;
|
||||
|
||||
// PE目录标志
|
||||
bool hasConfiguration;
|
||||
bool hasDebug;
|
||||
bool hasExceptions;
|
||||
bool hasExports;
|
||||
bool hasImports;
|
||||
bool hasNx; // NX兼容标志
|
||||
bool hasRelocations;
|
||||
bool hasResources;
|
||||
bool hasSignatures;
|
||||
bool hasTls;
|
||||
bool hasDelayImports;
|
||||
bool hasImageBase;
|
||||
bool hasEntryIat;
|
||||
bool hasRich;
|
||||
};
|
||||
|
||||
// 节区信息结构
|
||||
struct SectionInfo {
|
||||
uint32_t characteristics;
|
||||
double entropy;
|
||||
uint32_t sizeOfRawData;
|
||||
uint32_t virtualSize;
|
||||
};
|
||||
Reference in New Issue
Block a user