Implement Rich Header parsing for PE file analysis

- Added ParseRichHeader method to extract Rich header information from PE files
- Defined RichEntry and RichHeaderInfo structures to store Rich header details
- Implemented decoding of Rich header entries with checksum XOR technique
- Updated ml.h and ml.cpp to support Rich header parsing
- Improved error handling and logging in ProcessDirectory method
- Translated some log messages to English for consistency
This commit is contained in:
Huoji's
2025-03-09 03:29:14 +08:00
parent 2fed2d5bae
commit 4d1ccb16aa
2 changed files with 79 additions and 6 deletions

View File

@@ -179,6 +179,70 @@ MachineLearning::~MachineLearning() {
// 析构函数,清理资源(如有必要)
}
bool MachineLearning::ParseRichHeader(const uint8_t* peBuffer,
RichHeaderInfo& richInfo) {
PIMAGE_DOS_HEADER dosHeader = (PIMAGE_DOS_HEADER)(peBuffer);
// 检查DOS头部有效性
if (!dosHeader || dosHeader->e_magic != IMAGE_DOS_SIGNATURE) {
return false;
}
// 搜索范围是DOS头后到PE头前
const uint32_t* scanPtr =
reinterpret_cast<const uint32_t*>(peBuffer + sizeof(IMAGE_DOS_HEADER));
size_t maxItems =
(dosHeader->e_lfanew - sizeof(IMAGE_DOS_HEADER)) / sizeof(uint32_t);
// 查找DanS标记
size_t dansIndex = 0;
for (; dansIndex < maxItems - 1; dansIndex++) {
if (scanPtr[dansIndex] == 0x536E6144) { // "DanS"
break;
}
}
if (dansIndex >= maxItems - 1) {
return false; // 没找到DanS
}
// 获取校验和
uint32_t checksum = scanPtr[dansIndex + 1];
richInfo.checksum = checksum;
// 找Rich标记
size_t richIndex = 0;
for (richIndex = dansIndex + 2; richIndex < maxItems; richIndex++) {
if ((scanPtr[richIndex] ^ checksum) ==
0x68636952) { // "Rich" ^ checksum
break;
}
}
if (richIndex >= maxItems) {
return false; // 没找到Rich
}
// 解析Rich条目
// DanS之前的数据是Rich条目每个条目占用2个DWORD
size_t entryCount = (richIndex - dansIndex - 2) / 2;
richInfo.entries.reserve(entryCount);
for (size_t i = 0; i < entryCount; i++) {
size_t entryPos = richIndex - 2 * (i + 1);
uint32_t dword1 = scanPtr[entryPos] ^ checksum;
uint32_t dword2 = scanPtr[entryPos + 1] ^ checksum;
RichEntry entry;
entry.productId = dword1 & 0xFFFF; // 低16位是ProductId
entry.buildId = (dword1 >> 16) & 0xFFFF; // 高16位是BuildId
entry.useCount = dword2; // 使用次数
richInfo.entries.push_back(entry);
}
return true;
}
std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
size_t bufferSize) {
// 使用libpeconv解析PE文件
@@ -672,7 +736,7 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
// 处理文件
std::vector<uint8_t> fileBuffer = ReadFileToBuffer(currentPath);
if (fileBuffer.empty()) {
std::cerr << "跳过文件: " << currentPath << " (读取失败)"
std::cerr << "skip file: " << currentPath << " (read failed)"
<< std::endl;
failedCount++;
continue;
@@ -682,8 +746,8 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
std::vector<double> features =
ExtractFeatures(fileBuffer.data(), fileBuffer.size());
if (features.empty()) {
std::cerr << "跳过文件: " << currentPath << " (特征提取失败)"
<< std::endl;
std::cerr << "skip file: " << currentPath
<< " (can't get feature)" << std::endl;
failedCount++;
continue;
}
@@ -697,8 +761,8 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
processedCount++;
if (processedCount % 100 == 0) {
std::cout << "已处理 " << processedCount << " 个文件..."
<< std::endl;
std::cout << "a ready processed " << processedCount
<< " files..." << std::endl;
}
}
} while (FindNextFileA(hFind, &findData));

View File

@@ -15,7 +15,16 @@
struct PeInfo;
struct SectionInfo;
class BasicPeInfo;
// One decoded record of a PE "Rich" header: which build tool was involved and
// how many times it was used. Members are zero-initialised so a
// default-constructed entry never carries indeterminate values.
struct RichEntry {
    uint16_t productId = 0;  // product (tool/component) identifier
    uint16_t buildId = 0;    // build number of that tool
    uint32_t useCount = 0;   // use count recorded for that tool
};
// Result of parsing a PE "Rich" header. The checksum is zero-initialised so
// it is well-defined even when parsing fails before it is assigned.
struct RichHeaderInfo {
    uint32_t checksum = 0;           // XOR key / checksum of the Rich header
    std::vector<RichEntry> entries;  // decoded Rich header entries
};
// RVA转换为内存中的指针的辅助函数
inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
if (!peBuffer || rva == 0) return nullptr;
@@ -61,7 +70,7 @@ class MachineLearning {
public:
MachineLearning();
~MachineLearning();
bool ParseRichHeader(const uint8_t* peBuffer, RichHeaderInfo& richInfo);
// 提取特征并返回特征向量
std::vector<double> ExtractFeatures(const uint8_t* buffer,
size_t bufferSize);