Implement Rich Header parsing for PE file analysis

- Added ParseRichHeader method to extract Rich header information from PE files
- Defined RichEntry and RichHeaderInfo structures to store Rich header details
- Implemented decoding of Rich header entries with checksum XOR technique
- Updated ml.h and ml.cpp to support Rich header parsing
- Improved error handling and logging in ProcessDirectory method
- Translated some log messages to English for consistency
2025-03-09 03:29:14 +08:00
parent 2fed2d5bae
commit 4d1ccb16aa
2 changed files with 79 additions and 6 deletions
--- a/ai_anti_malware/ml.cpp
+++ b/ai_anti_malware/ml.cpp
@@ -179,6 +179,70 @@ MachineLearning::~MachineLearning() {
    // 析构函数，清理资源（如有必要）
 }

+// Parses the Rich header located between the DOS header and the PE header.
+// Layout (DWORDs): "DanS"^key, three padding words equal to key, then
+// (compId^key, useCount^key) pairs, the plain-text "Rich" marker, and the key.
+// peBuffer must provide at least e_lfanew readable bytes (no size is passed, so
+// this cannot be checked here). On success fills richInfo (entries in file
+// order) and returns true; otherwise returns false with entries left empty.
+bool MachineLearning::ParseRichHeader(const uint8_t* peBuffer,
+                                      RichHeaderInfo& richInfo) {
+    constexpr uint32_t kDanS = 0x536E6144;  // "DanS", stored XORed with the key
+    constexpr uint32_t kRich = 0x68636952;  // "Rich", stored in plain text
+    constexpr size_t kPrefixWords = 4;      // DanS marker + 3 padding DWORDs
+
+    richInfo.entries.clear();  // do not accumulate across repeated calls
+
+    // Validate the DOS header.
+    const auto* dosHeader = reinterpret_cast<const IMAGE_DOS_HEADER*>(peBuffer);
+    if (!dosHeader || dosHeader->e_magic != IMAGE_DOS_SIGNATURE) {
+        return false;
+    }
+
+    // e_lfanew is signed; reject values that would make the unsigned
+    // subtraction below wrap around to a huge scan range.
+    if (dosHeader->e_lfanew <= static_cast<LONG>(sizeof(IMAGE_DOS_HEADER))) {
+        return false;
+    }
+
+    // Scan window: DWORDs after the DOS header and before the PE header.
+    const uint32_t* scanPtr =
+        reinterpret_cast<const uint32_t*>(peBuffer + sizeof(IMAGE_DOS_HEADER));
+    const size_t maxItems =
+        (dosHeader->e_lfanew - sizeof(IMAGE_DOS_HEADER)) / sizeof(uint32_t);
+
+    // Locate "Rich"; the XOR key follows it, so it cannot be the last DWORD.
+    size_t richIndex = 0;
+    while (richIndex + 1 < maxItems && scanPtr[richIndex] != kRich) {
+        ++richIndex;
+    }
+    if (richIndex + 1 >= maxItems) {
+        return false;  // no Rich marker
+    }
+    const uint32_t checksum = scanPtr[richIndex + 1];
+
+    // Locate the encrypted "DanS" marker that opens the header.
+    size_t dansIndex = 0;
+    while (dansIndex < richIndex && (scanPtr[dansIndex] ^ checksum) != kDanS) {
+        ++dansIndex;
+    }
+    if (richIndex - dansIndex < kPrefixWords) {
+        return false;  // no DanS marker, or header too short to be valid
+    }
+
+    // Decode the entries; each one is two DWORDs XORed with the key.
+    richInfo.checksum = checksum;
+    richInfo.entries.reserve((richIndex - dansIndex - kPrefixWords) / 2);
+    for (size_t pos = dansIndex + kPrefixWords; pos + 1 < richIndex; pos += 2) {
+        const uint32_t compId = scanPtr[pos] ^ checksum;
+        RichEntry entry;
+        entry.productId = static_cast<uint16_t>(compId >> 16);  // high word
+        entry.buildId = static_cast<uint16_t>(compId & 0xFFFF);  // low word
+        entry.useCount = scanPtr[pos + 1] ^ checksum;
+        richInfo.entries.push_back(entry);
+    }
+    return true;
+}
 std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
                                                     size_t bufferSize) {
    // 使用libpeconv解析PE文件
@@ -672,7 +736,7 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
            // 处理文件
            std::vector<uint8_t> fileBuffer = ReadFileToBuffer(currentPath);
            if (fileBuffer.empty()) {
-                std::cerr << "跳过文件: " << currentPath << " (读取失败)"
+                std::cerr << "skip file: " << currentPath << " (read failed)"
                          << std::endl;
                failedCount++;
                continue;
@@ -682,8 +746,8 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
            std::vector<double> features =
                ExtractFeatures(fileBuffer.data(), fileBuffer.size());
            if (features.empty()) {
-                std::cerr << "跳过文件: " << currentPath << " (特征提取失败)"
-                          << std::endl;
+                std::cerr << "skip file: " << currentPath
+                          << " (can't get feature)" << std::endl;
                failedCount++;
                continue;
            }
@@ -697,8 +761,8 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,

            processedCount++;
            if (processedCount % 100 == 0) {
-                std::cout << "已处理 " << processedCount << " 个文件..."
-                          << std::endl;
+                std::cout << "a ready processed " << processedCount
+                          << " files..." << std::endl;
            }
        }
    } while (FindNextFileA(hFind, &findData));
--- a/ai_anti_malware/ml.h
+++ b/ai_anti_malware/ml.h
@@ -15,7 +15,16 @@
 struct PeInfo;
 struct SectionInfo;
 class BasicPeInfo;
+struct RichEntry {  // One decoded record of a PE Rich header
+    uint16_t productId = 0;  // Component (product) ID
+    uint16_t buildId = 0;    // Build / version number
+    uint32_t useCount = 0;   // Number of times the component was used
+};

+struct RichHeaderInfo {  // Output of MachineLearning::ParseRichHeader
+    uint32_t checksum = 0;           // Checksum; also the XOR key of the entries
+    std::vector<RichEntry> entries;  // Decoded Rich header entries
+};
 // RVA转换为内存中的指针的辅助函数
 inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
    if (!peBuffer || rva == 0) return nullptr;
@@ -61,7 +70,7 @@ class MachineLearning {
   public:
    MachineLearning();
    ~MachineLearning();
-
+    bool ParseRichHeader(const uint8_t* peBuffer, RichHeaderInfo& richInfo);
    // 提取特征并返回特征向量
    std::vector<double> ExtractFeatures(const uint8_t* buffer,
                                        size_t bufferSize);