Files
awesome_anti_virus_engine/ai_anti_malware/ml.h
Huoji's 1cea516cf7 Add machine learning feature extraction for PE files
- Implemented MachineLearning class with ExtractFeatures method
- Updated project files to include new machine learning source and header files
- Modified main executable to call feature extraction
- Updated VSCode settings to include additional C++ headers
- Commented out previous file dumping code in main function
2025-03-09 02:05:07 +08:00

128 lines
3.7 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#pragma once
#include "head.h"
#include <vector>
#include <string>
#include <map>
#include <memory>
#include <cmath>
#include <fstream>
#include <algorithm>
#include <numeric>
#include <functional>
#include <unordered_map>
// Forward declarations.
// PeInfo and SectionInfo are defined further down in this header; they are
// declared here so MachineLearning can name them in its signatures.
// NOTE(review): BasicPeInfo is not defined or used in the visible part of this
// file - presumably it is declared for another translation unit; confirm it is
// still needed.
struct PeInfo;
struct SectionInfo;
class BasicPeInfo;
// Helper that converts an RVA (relative virtual address) into a pointer inside
// a PE held in memory.
//
// The section branch translates the RVA through the section's PointerToRawData,
// i.e. peBuffer is treated as the raw on-disk file layout, not a mapped image.
//
// Parameters:
//   rva      - relative virtual address to resolve; 0 is treated as "no address".
//   peBuffer - start of the buffer holding the PE file; may be null.
//
// Returns a pointer into peBuffer, or nullptr when:
//   - peBuffer is null, rva is 0, or the NT headers cannot be located;
//   - the RVA lies in a section's virtual-only tail (beyond SizeOfRawData),
//     which has no bytes in the file;
//   - the RVA is in neither a section nor the header area.
//
// NOTE(review): the buffer length is not passed in, so the result cannot be
// checked against the end of the buffer here. Callers handling untrusted files
// should validate the returned pointer against the real buffer size.
inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
    if (!peBuffer || rva == 0) return nullptr;

    PIMAGE_NT_HEADERS ntHeaders =
        reinterpret_cast<PIMAGE_NT_HEADERS>(peconv::get_nt_hdrs(peBuffer));
    if (!ntHeaders) return nullptr;

    PIMAGE_SECTION_HEADER section = IMAGE_FIRST_SECTION(ntHeaders);
    const WORD numSections = ntHeaders->FileHeader.NumberOfSections;
    for (WORD i = 0; i < numSections; ++i, ++section) {
        // Check whether the RVA falls inside this section. The comparison is
        // done on the offset from the section start so that a crafted
        // VirtualAddress + VirtualSize cannot wrap around 32 bits.
        if (rva < section->VirtualAddress) continue;
        const DWORD delta = rva - section->VirtualAddress;

        // A VirtualSize of 0 would make the section unmatchable; fall back to
        // the raw size in that case.
        const DWORD virtualSize = section->Misc.VirtualSize != 0
                                      ? section->Misc.VirtualSize
                                      : section->SizeOfRawData;
        if (delta >= virtualSize) continue;

        // The RVA belongs to this section but sits past its raw data: there
        // is nothing in the file to point at, and PointerToRawData + delta
        // would land on unrelated bytes.
        if (delta >= section->SizeOfRawData) return nullptr;

        // Compute the file offset in size_t so the sum does not wrap in a DWORD.
        const size_t offset =
            static_cast<size_t>(section->PointerToRawData) + delta;
        return peBuffer + offset;
    }

    // The RVA may still be inside the PE headers, which are stored at the same
    // offsets in the file as in the image.
    DWORD sizeOfHeaders = 0;
    if (peconv::is64bit(peBuffer)) {
        const PIMAGE_NT_HEADERS64 ntHeaders64 =
            reinterpret_cast<PIMAGE_NT_HEADERS64>(ntHeaders);
        sizeOfHeaders = ntHeaders64->OptionalHeader.SizeOfHeaders;
    } else {
        const PIMAGE_NT_HEADERS32 ntHeaders32 =
            reinterpret_cast<PIMAGE_NT_HEADERS32>(ntHeaders);
        sizeOfHeaders = ntHeaders32->OptionalHeader.SizeOfHeaders;
    }
    if (rva < sizeOfHeaders) {
        return peBuffer + rva;
    }
    return nullptr;
}
// Declares the feature extractor for PE files. Only declarations are visible
// here; the member definitions live outside this header.
// NOTE(review): GetOpcodeStatistics returns std::tuple, but <tuple> is not
// among the includes at the top of this header - it presumably arrives through
// head.h or another standard header; consider including it directly.
class MachineLearning {
public:
MachineLearning();
~MachineLearning();
// Main entry point: extracts features and exports them to CSV.
// Takes the input as a byte buffer plus its size, and the output path as a
// string; returns bool (presumably true on success - confirm in the
// implementation).
bool ExtractFeatures(const uint8_t* buffer, size_t bufferSize,
const std::string& outputPath);
private:
// Feature-extraction helper functions.
// Each Encode* helper returns its result as a std::vector<double>.
std::vector<double> EncodeProperties(
const PeInfo& peInfo, const std::vector<std::string>& dllTables);
std::vector<double> EncodeEntrypoint(const std::vector<uint8_t>& epBytes);
std::vector<double> EncodeHistogram(const uint8_t* data, size_t size);
std::vector<double> EncodeLibraries(
const std::vector<std::string>& dllTable);
std::vector<double> EncodeSections(const std::vector<SectionInfo>& sections,
bool isX64);
// Returns a pair of vectors (double-valued and int-valued); what each one
// holds is defined by the implementation, which is not visible here.
std::tuple<std::vector<double>, std::vector<int>> GetOpcodeStatistics(
const uint8_t* data, size_t dataSize, bool isX64, const PeInfo& peInfo);
// NOTE(review): the int result presumably corresponds to the values stored
// in _opcodeTypeDict - confirm against the implementation.
int GetOpcodeType(const void* code, bool isX64);
double CalculateEntropy(const uint8_t* data, size_t size);
// Exports the features to CSV.
bool ExportToCSV(const std::vector<double>& features,
const std::string& outputPath);
// Constant definitions.
// NOTE(review): these are non-const members, so they are presumably filled
// once in the constructor and treated as read-only afterwards - confirm.
std::vector<std::string> _properties;
std::vector<std::string> _libraries;
std::unordered_map<std::string, int> _opcodeTypeDict;
};
// PE file information structure.
//
// Plain aggregate passed to the MachineLearning helpers. Every member has a
// default initializer so that a default-constructed PeInfo is fully zeroed
// instead of holding indeterminate values; aggregate initialization still
// works (C++14 and later).
// NOTE(review): the numeric fields are presumably copied from the PE file and
// optional headers of the same names - confirm against the code that fills
// this struct.
struct PeInfo {
    uint32_t addressOfEntryPoint = 0;
    uint32_t baseOfCode = 0;
    uint32_t sizeOfCode = 0;
    uint32_t sizeOfImage = 0;
    uint32_t sizeOfHeaders = 0;
    uint32_t characteristics = 0;
    uint32_t dllCharacteristics = 0;
    bool isX64 = false;
    // PE directory flags
    bool hasConfiguration = false;
    bool hasDebug = false;
    bool hasExceptions = false;
    bool hasExports = false;
    bool hasImports = false;
    bool hasNx = false;  // NX compatibility flag
    bool hasRelocations = false;
    bool hasResources = false;
    bool hasSignatures = false;
    bool hasTls = false;
    bool hasDelayImports = false;
    bool hasImageBase = false;
    bool hasEntryIat = false;
    bool hasRich = false;
};
// Section information structure.
//
// Plain aggregate describing one section, consumed by
// MachineLearning::EncodeSections. Members are default-initialized to zero so
// a default-constructed SectionInfo never carries indeterminate values;
// aggregate initialization still works (C++14 and later).
struct SectionInfo {
    uint32_t characteristics = 0;
    double entropy = 0.0;
    uint32_t sizeOfRawData = 0;
    uint32_t virtualSize = 0;
};