Implement Rich Header parsing for PE file analysis
- Added ParseRichHeader method to extract Rich header information from PE files
- Defined RichEntry and RichHeaderInfo structures to store Rich header details
- Implemented decoding of Rich header entries with checksum XOR technique
- Updated ml.h and ml.cpp to support Rich header parsing
- Improved error handling and logging in ProcessDirectory method
- Translated some log messages to English for consistency
This commit is contained in:
@@ -179,6 +179,70 @@ MachineLearning::~MachineLearning() {
|
||||
// 析构函数,清理资源(如有必要)
|
||||
}
|
||||
|
||||
bool MachineLearning::ParseRichHeader(const uint8_t* peBuffer,
|
||||
RichHeaderInfo& richInfo) {
|
||||
PIMAGE_DOS_HEADER dosHeader = (PIMAGE_DOS_HEADER)(peBuffer);
|
||||
|
||||
// 检查DOS头部有效性
|
||||
if (!dosHeader || dosHeader->e_magic != IMAGE_DOS_SIGNATURE) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 搜索范围是DOS头后到PE头前
|
||||
const uint32_t* scanPtr =
|
||||
reinterpret_cast<const uint32_t*>(peBuffer + sizeof(IMAGE_DOS_HEADER));
|
||||
size_t maxItems =
|
||||
(dosHeader->e_lfanew - sizeof(IMAGE_DOS_HEADER)) / sizeof(uint32_t);
|
||||
|
||||
// 查找DanS标记
|
||||
size_t dansIndex = 0;
|
||||
for (; dansIndex < maxItems - 1; dansIndex++) {
|
||||
if (scanPtr[dansIndex] == 0x536E6144) { // "DanS"
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (dansIndex >= maxItems - 1) {
|
||||
return false; // 没找到DanS
|
||||
}
|
||||
|
||||
// 获取校验和
|
||||
uint32_t checksum = scanPtr[dansIndex + 1];
|
||||
richInfo.checksum = checksum;
|
||||
|
||||
// 找Rich标记
|
||||
size_t richIndex = 0;
|
||||
for (richIndex = dansIndex + 2; richIndex < maxItems; richIndex++) {
|
||||
if ((scanPtr[richIndex] ^ checksum) ==
|
||||
0x68636952) { // "Rich" ^ checksum
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (richIndex >= maxItems) {
|
||||
return false; // 没找到Rich
|
||||
}
|
||||
|
||||
// 解析Rich条目
|
||||
// DanS之前的数据是Rich条目,每个条目占用2个DWORD
|
||||
size_t entryCount = (richIndex - dansIndex - 2) / 2;
|
||||
richInfo.entries.reserve(entryCount);
|
||||
|
||||
for (size_t i = 0; i < entryCount; i++) {
|
||||
size_t entryPos = richIndex - 2 * (i + 1);
|
||||
uint32_t dword1 = scanPtr[entryPos] ^ checksum;
|
||||
uint32_t dword2 = scanPtr[entryPos + 1] ^ checksum;
|
||||
|
||||
RichEntry entry;
|
||||
entry.productId = dword1 & 0xFFFF; // 低16位是ProductId
|
||||
entry.buildId = (dword1 >> 16) & 0xFFFF; // 高16位是BuildId
|
||||
entry.useCount = dword2; // 使用次数
|
||||
|
||||
richInfo.entries.push_back(entry);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
|
||||
size_t bufferSize) {
|
||||
// 使用libpeconv解析PE文件
|
||||
@@ -672,7 +736,7 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
|
||||
// 处理文件
|
||||
std::vector<uint8_t> fileBuffer = ReadFileToBuffer(currentPath);
|
||||
if (fileBuffer.empty()) {
|
||||
std::cerr << "跳过文件: " << currentPath << " (读取失败)"
|
||||
std::cerr << "skip file: " << currentPath << " (read failed)"
|
||||
<< std::endl;
|
||||
failedCount++;
|
||||
continue;
|
||||
@@ -682,8 +746,8 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
|
||||
std::vector<double> features =
|
||||
ExtractFeatures(fileBuffer.data(), fileBuffer.size());
|
||||
if (features.empty()) {
|
||||
std::cerr << "跳过文件: " << currentPath << " (特征提取失败)"
|
||||
<< std::endl;
|
||||
std::cerr << "skip file: " << currentPath
|
||||
<< " (can't get feature)" << std::endl;
|
||||
failedCount++;
|
||||
continue;
|
||||
}
|
||||
@@ -697,8 +761,8 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
|
||||
|
||||
processedCount++;
|
||||
if (processedCount % 100 == 0) {
|
||||
std::cout << "已处理 " << processedCount << " 个文件..."
|
||||
<< std::endl;
|
||||
std::cout << "a ready processed " << processedCount
|
||||
<< " files..." << std::endl;
|
||||
}
|
||||
}
|
||||
} while (FindNextFileA(hFind, &findData));
|
||||
|
||||
@@ -15,7 +15,16 @@
|
||||
struct PeInfo;
|
||||
struct SectionInfo;
|
||||
class BasicPeInfo;
|
||||
// One decoded record of a PE Rich header.
// Members are zero-initialized so a default-constructed entry never holds
// indeterminate values.
struct RichEntry {
    uint16_t productId = 0;  // component (tool) ID
    uint16_t buildId = 0;    // version / build number
    uint32_t useCount = 0;   // use count
};
|
||||
|
||||
struct RichHeaderInfo {
|
||||
uint32_t checksum; // 校验和
|
||||
std::vector<RichEntry> entries; // Rich头条目
|
||||
};
|
||||
// RVA转换为内存中的指针的辅助函数
|
||||
inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
|
||||
if (!peBuffer || rva == 0) return nullptr;
|
||||
@@ -61,7 +70,7 @@ class MachineLearning {
|
||||
public:
|
||||
MachineLearning();
|
||||
~MachineLearning();
|
||||
|
||||
bool ParseRichHeader(const uint8_t* peBuffer, RichHeaderInfo& richInfo);
|
||||
// 提取特征并返回特征向量
|
||||
std::vector<double> ExtractFeatures(const uint8_t* buffer,
|
||||
size_t bufferSize);
|
||||
|
||||
Reference in New Issue
Block a user