Implement Rich Header parsing for PE file analysis
- Added ParseRichHeader method to extract Rich header information from PE files - Defined RichEntry and RichHeaderInfo structures to store Rich header details - Implemented decoding of Rich header entries with checksum XOR technique - Updated ml.h and ml.cpp to support Rich header parsing - Improved error handling and logging in ProcessDirectory method - Translated some log messages to English for consistency
This commit is contained in:
@@ -179,6 +179,70 @@ MachineLearning::~MachineLearning() {
|
|||||||
// 析构函数,清理资源(如有必要)
|
// 析构函数,清理资源(如有必要)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool MachineLearning::ParseRichHeader(const uint8_t* peBuffer,
|
||||||
|
RichHeaderInfo& richInfo) {
|
||||||
|
PIMAGE_DOS_HEADER dosHeader = (PIMAGE_DOS_HEADER)(peBuffer);
|
||||||
|
|
||||||
|
// 检查DOS头部有效性
|
||||||
|
if (!dosHeader || dosHeader->e_magic != IMAGE_DOS_SIGNATURE) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 搜索范围是DOS头后到PE头前
|
||||||
|
const uint32_t* scanPtr =
|
||||||
|
reinterpret_cast<const uint32_t*>(peBuffer + sizeof(IMAGE_DOS_HEADER));
|
||||||
|
size_t maxItems =
|
||||||
|
(dosHeader->e_lfanew - sizeof(IMAGE_DOS_HEADER)) / sizeof(uint32_t);
|
||||||
|
|
||||||
|
// 查找DanS标记
|
||||||
|
size_t dansIndex = 0;
|
||||||
|
for (; dansIndex < maxItems - 1; dansIndex++) {
|
||||||
|
if (scanPtr[dansIndex] == 0x536E6144) { // "DanS"
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dansIndex >= maxItems - 1) {
|
||||||
|
return false; // 没找到DanS
|
||||||
|
}
|
||||||
|
|
||||||
|
// 获取校验和
|
||||||
|
uint32_t checksum = scanPtr[dansIndex + 1];
|
||||||
|
richInfo.checksum = checksum;
|
||||||
|
|
||||||
|
// 找Rich标记
|
||||||
|
size_t richIndex = 0;
|
||||||
|
for (richIndex = dansIndex + 2; richIndex < maxItems; richIndex++) {
|
||||||
|
if ((scanPtr[richIndex] ^ checksum) ==
|
||||||
|
0x68636952) { // "Rich" ^ checksum
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (richIndex >= maxItems) {
|
||||||
|
return false; // 没找到Rich
|
||||||
|
}
|
||||||
|
|
||||||
|
// 解析Rich条目
|
||||||
|
// DanS之前的数据是Rich条目,每个条目占用2个DWORD
|
||||||
|
size_t entryCount = (richIndex - dansIndex - 2) / 2;
|
||||||
|
richInfo.entries.reserve(entryCount);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < entryCount; i++) {
|
||||||
|
size_t entryPos = richIndex - 2 * (i + 1);
|
||||||
|
uint32_t dword1 = scanPtr[entryPos] ^ checksum;
|
||||||
|
uint32_t dword2 = scanPtr[entryPos + 1] ^ checksum;
|
||||||
|
|
||||||
|
RichEntry entry;
|
||||||
|
entry.productId = dword1 & 0xFFFF; // 低16位是ProductId
|
||||||
|
entry.buildId = (dword1 >> 16) & 0xFFFF; // 高16位是BuildId
|
||||||
|
entry.useCount = dword2; // 使用次数
|
||||||
|
|
||||||
|
richInfo.entries.push_back(entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
|
std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
|
||||||
size_t bufferSize) {
|
size_t bufferSize) {
|
||||||
// 使用libpeconv解析PE文件
|
// 使用libpeconv解析PE文件
|
||||||
@@ -672,7 +736,7 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
|
|||||||
// 处理文件
|
// 处理文件
|
||||||
std::vector<uint8_t> fileBuffer = ReadFileToBuffer(currentPath);
|
std::vector<uint8_t> fileBuffer = ReadFileToBuffer(currentPath);
|
||||||
if (fileBuffer.empty()) {
|
if (fileBuffer.empty()) {
|
||||||
std::cerr << "跳过文件: " << currentPath << " (读取失败)"
|
std::cerr << "skip file: " << currentPath << " (read failed)"
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
failedCount++;
|
failedCount++;
|
||||||
continue;
|
continue;
|
||||||
@@ -682,8 +746,8 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
|
|||||||
std::vector<double> features =
|
std::vector<double> features =
|
||||||
ExtractFeatures(fileBuffer.data(), fileBuffer.size());
|
ExtractFeatures(fileBuffer.data(), fileBuffer.size());
|
||||||
if (features.empty()) {
|
if (features.empty()) {
|
||||||
std::cerr << "跳过文件: " << currentPath << " (特征提取失败)"
|
std::cerr << "skip file: " << currentPath
|
||||||
<< std::endl;
|
<< " (can't get feature)" << std::endl;
|
||||||
failedCount++;
|
failedCount++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -697,8 +761,8 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
|
|||||||
|
|
||||||
processedCount++;
|
processedCount++;
|
||||||
if (processedCount % 100 == 0) {
|
if (processedCount % 100 == 0) {
|
||||||
std::cout << "已处理 " << processedCount << " 个文件..."
|
std::cout << "a ready processed " << processedCount
|
||||||
<< std::endl;
|
<< " files..." << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (FindNextFileA(hFind, &findData));
|
} while (FindNextFileA(hFind, &findData));
|
||||||
|
|||||||
@@ -15,7 +15,16 @@
|
|||||||
struct PeInfo;
|
struct PeInfo;
|
||||||
struct SectionInfo;
|
struct SectionInfo;
|
||||||
class BasicPeInfo;
|
class BasicPeInfo;
|
||||||
|
// One decoded record of the PE Rich header.
// Members are zero-initialized by default so a default-constructed entry
// never carries indeterminate values.
struct RichEntry {
    uint16_t productId = 0;  // component (product) ID
    uint16_t buildId = 0;    // version (build) number
    uint32_t useCount = 0;   // use count
};
|
||||||
|
|
||||||
|
struct RichHeaderInfo {
|
||||||
|
uint32_t checksum; // 校验和
|
||||||
|
std::vector<RichEntry> entries; // Rich头条目
|
||||||
|
};
|
||||||
// RVA转换为内存中的指针的辅助函数
|
// RVA转换为内存中的指针的辅助函数
|
||||||
inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
|
inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
|
||||||
if (!peBuffer || rva == 0) return nullptr;
|
if (!peBuffer || rva == 0) return nullptr;
|
||||||
@@ -61,7 +70,7 @@ class MachineLearning {
|
|||||||
public:
|
public:
|
||||||
MachineLearning();
|
MachineLearning();
|
||||||
~MachineLearning();
|
~MachineLearning();
|
||||||
|
// Parses the Rich header found between the DOS header and the PE header of
// the image that peBuffer points to. On success richInfo receives the
// checksum and the decoded entries and the function returns true; it
// returns false when the DOS signature is invalid or the Rich header
// markers cannot be found.
bool ParseRichHeader(const uint8_t* peBuffer, RichHeaderInfo& richInfo);
|
||||||
// 提取特征并返回特征向量
|
// 提取特征并返回特征向量
|
||||||
std::vector<double> ExtractFeatures(const uint8_t* buffer,
|
std::vector<double> ExtractFeatures(const uint8_t* buffer,
|
||||||
size_t bufferSize);
|
size_t bufferSize);
|
||||||
|
|||||||
Reference in New Issue
Block a user