Update project documentation and enhance malware detection engine

- Completely rewrite README.md with comprehensive project overview and technical details
- Add detailed explanation of antivirus engine architecture and detection strategies
- Implement multi-stage malware detection with machine learning, sandbox, and PE structure analysis
- Update project configuration and add new source files for enhanced detection capabilities
- Integrate XGBoost machine learning model with C++ export functionality
- Improve sandbox environment with advanced module and LDR data table handling
- Remove legacy Python prediction and training scripts in favor of C++ implementation
2025-03-09 21:59:22 +08:00
parent 51f929abfa
commit 60c4ef5f58
23 changed files with 46102 additions and 1717 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -58,6 +58,19 @@
        "xutility": "cpp",
        "functional": "cpp",
        "array": "cpp",
-        "numeric": "cpp"
+        "numeric": "cpp",
+        "charconv": "cpp",
+        "chrono": "cpp",
+        "filesystem": "cpp",
+        "format": "cpp",
+        "forward_list": "cpp",
+        "locale": "cpp",
+        "mutex": "cpp",
+        "optional": "cpp",
+        "ratio": "cpp",
+        "stop_token": "cpp",
+        "thread": "cpp",
+        "xlocbuf": "cpp",
+        "xlocmes": "cpp"
    }
 }
--- a/ai_anti_malware/ai_anti_malware.cpp
+++ b/ai_anti_malware/ai_anti_malware.cpp
@@ -2,6 +2,13 @@
 //

 #include "head.h"
+enum class DetectEngineType {  // which detection stage flagged a sample; result of DetectEngine::DetectMalware
+    kNone,             // nothing flagged the sample, or the PE could not be loaded
+    kMachineLearning,  // the machine-learning malware score exceeded 0.5
+    kSandbox,          // the sandbox run ended as kSuspicious or kMalware
+    kPeStruct,         // the PE structure analysis found an anomaly
+    kYaraScan          // NOTE(review): not returned by any code visible in this chunk
+};

 auto getPeInfo(std::string inputFilePath) -> std::shared_ptr<BasicPeInfo> {
    auto sampleInfo = std::make_shared<BasicPeInfo>();
@@ -10,6 +17,9 @@ auto getPeInfo(std::string inputFilePath) -> std::shared_ptr<BasicPeInfo> {
    sampleInfo->peBuffer =
        peconv::load_pe_module((const char*)sampleInfo->inputFilePath.c_str(),
                               sampleInfo->peSize, false, false);
+    if (sampleInfo->peBuffer == nullptr) {
+        return nullptr;
+    }
    sampleInfo->ntHead64 = peconv::get_nt_hdrs64((BYTE*)sampleInfo->peBuffer);
    sampleInfo->ntHead32 = peconv::get_nt_hdrs32((BYTE*)sampleInfo->peBuffer);
    sampleInfo->isX64 = peconv::is64bit((BYTE*)sampleInfo->peBuffer);
@@ -39,6 +49,7 @@ auto getPeInfo(std::string inputFilePath) -> std::shared_ptr<BasicPeInfo> {
    sampleInfo->peSize = (sampleInfo->peSize + 0xFFF) & ~0xFFF;
    return sampleInfo;
 }
+// Collects malware features.
 int doMl(int argc, char* argv[]) {
    // 检查命令行参数
    if (argc < 3) {
@@ -98,31 +109,210 @@ int doMl(int argc, char* argv[]) {
    }
    return 0;
 };
-int main(int argc, char* argv[]) {
-    doMl(argc, argv);
-    /*
- auto sampleInfo = getPeInfo(
-     "E:\\对战平台\\CrowAntiCheat\\CrowAntiCheat\\client\\Console_"
-     "Test\\Release\\Console_Test.exe");
- // auto sampleInfo = getPeInfo("C:\\ConsoleApplication1.exe");
- printf("input new file %s \n", sampleInfo->inputFilePath);
- printf("is x64: %d\n", sampleInfo->isX64);
- printf("is relocated: %d\n", sampleInfo->isRelocated);
- printf("RecImageBase: %llx\n", sampleInfo->RecImageBase);
- auto sandbox = std::make_shared<Sandbox>();
- sandbox->InitEnv(sampleInfo);
- sandbox->Run();
- auto [peBuffer, peSize] = sandbox->DumpPE();
+// Scores one file with the machine-learning model and prints the verdict.
+//
+// argv[1] is the path of the file to analyse. Returns 0 once a verdict has
+// been printed, and 1 on a usage error or when the file cannot be analysed.
+int doPredict(int argc, char* argv[]) {
+    if (argc < 2) {
+        std::cout << "用法: " << argv[0] << " <文件路径>" << std::endl;
+        return 1;
+    }
+    const std::string filePath = argv[1];
+    MachineLearning ml;
+
+    // PredictMalwareFromFile yields the probability that the file is benign,
+    // or -1.0 when the file cannot be read or no features can be extracted.
+    // The sentinel must be tested before the inversion: 1 - (-1.0) is 2.0,
+    // which would otherwise be reported as a confident malware hit.
+    const double benignProbability = ml.PredictMalwareFromFile(filePath);
+    if (benignProbability < 0) {
+        std::cout << "无法分析文件。" << std::endl;
+        return 1;
+    }
+
+    const double score = 1 - benignProbability;
+    std::cout << "文件 " << filePath << " 的恶意软件得分: " << score
+              << std::endl;
+    if (score > 0.5) {
+        std::cout << "警告: 这个文件可能是恶意软件!" << std::endl;
+    } else {
+        std::cout << "这个文件可能是安全的。" << std::endl;
+    }
+    return 0;
+}
+// Static heuristics over a loaded PE image: reports a missing import table
+// and anomalies in the section table.
+class PeStructAnalyzer {
+   public:
+    PeStructAnalyzer() = default;
+    ~PeStructAnalyzer() = default;
+
+    // Runs every structural check and prints one warning line per finding.
+    // Returns true when at least one check fired; returns false for a clean
+    // image and when peInfo or its peBuffer is null.
+    bool AnalyzePe(const std::shared_ptr<BasicPeInfo>& peInfo) {
+        if (peInfo == nullptr || peInfo->peBuffer == nullptr) {
+            return false;
+        }
+
+        // Import table check.
+        const bool importsMissing = HasNoImports(peInfo);
+        if (importsMissing) {
+            std::cout << "警告: 未发现导入表，这是一个可疑特征" << std::endl;
+        }
+
+        // Section table check.
+        const std::pair<bool, std::string> sectionVerdict =
+            AnalyzeSections(peInfo);
+        if (sectionVerdict.first) {
+            std::cout << "警告: " << sectionVerdict.second << std::endl;
+        }
+
+        return importsMissing || sectionVerdict.first;
+    }
+
+   private:
+    static constexpr DWORD MAX_REASONABLE_SECTION_COUNT = 20;  // section count cap
+    static constexpr DWORD MAX_EXECUTABLE_SECTIONS = 3;    // executable section cap
+    static constexpr DWORD MAX_SECTION_SIZE = 0x10000000;  // 256MB
+    static constexpr DWORD SECTION_ALIGNMENT = 0x1000;     // 4KB alignment
+    static constexpr DWORD SUSPICIOUS_ENTROPY_THRESHOLD = 7;  // entropy threshold (unused here)
+
+    // True when the import data directory has a zero RVA or a zero size.
+    bool HasNoImports(const std::shared_ptr<BasicPeInfo>& peInfo) {
+        const IMAGE_DATA_DIRECTORY& imports =
+            peInfo->isX64 ? peInfo->ntHead64->OptionalHeader
+                                .DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT]
+                          : peInfo->ntHead32->OptionalHeader
+                                .DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT];
+        return imports.VirtualAddress == 0 || imports.Size == 0;
+    }
+
+    // Walks the section headers and returns {true, reason} for the first
+    // anomaly in priority order, or {false, ""} when nothing looks wrong.
+    std::pair<bool, std::string> AnalyzeSections(
+        const std::shared_ptr<BasicPeInfo>& peInfo) {
+        const bool is64 = peInfo->isX64;
+        PIMAGE_SECTION_HEADER sections =
+            is64 ? IMAGE_FIRST_SECTION(peInfo->ntHead64)
+                 : IMAGE_FIRST_SECTION(peInfo->ntHead32);
+        const WORD sectionCount =
+            is64 ? peInfo->ntHead64->FileHeader.NumberOfSections
+                 : peInfo->ntHead32->FileHeader.NumberOfSections;
+
+        // An implausible section count is reported before any header is read.
+        if (sectionCount > MAX_REASONABLE_SECTION_COUNT) {
+            return {true, "区段数量异常: " + std::to_string(sectionCount) +
+                              " > " +
+                              std::to_string(MAX_REASONABLE_SECTION_COUNT)};
+        }
+
+        int executableCount = 0;
+        bool writableAndExecutable = false;
+        bool rawSizeMissing = false;
+        bool oversized = false;
+        bool misaligned = false;
+
+        for (WORD index = 0; index != sectionCount; ++index) {
+            const IMAGE_SECTION_HEADER& header = sections[index];
+            const DWORD flags = header.Characteristics;
+
+            if ((flags & IMAGE_SCN_MEM_EXECUTE) != 0) {
+                ++executableCount;
+                // Writable and executable at the same time.
+                writableAndExecutable = writableAndExecutable ||
+                                        (flags & IMAGE_SCN_MEM_WRITE) != 0;
+            }
+
+            // No raw data on disk although the section occupies memory.
+            rawSizeMissing =
+                rawSizeMissing ||
+                (header.SizeOfRawData == 0 && header.Misc.VirtualSize > 0);
+            oversized = oversized || header.SizeOfRawData > MAX_SECTION_SIZE;
+            misaligned =
+                misaligned || header.VirtualAddress % SECTION_ALIGNMENT != 0;
+        }
+
+        if (executableCount > MAX_EXECUTABLE_SECTIONS) {
+            return {true,
+                    "可执行区段数量过多: " + std::to_string(executableCount)};
+        }
+
+        // Remaining findings, listed in the order they take precedence.
+        const std::pair<bool, const char*> findings[] = {
+            {writableAndExecutable, "发现同时具有可写和可执行属性的区段"},
+            {rawSizeMissing, "发现大小异常的区段"},
+            {oversized, "发现过大的区段"},
+            {misaligned, "发现未正确对齐的区段"},
+        };
+        for (const auto& [hit, reason] : findings) {
+            if (hit) {
+                return {true, reason};
+            }
+        }
+
+        return {false, ""};
+    }
+};
+
+// Front end that runs the detection stages on one file, one after another.
+class DetectEngine {
+   public:
+    DetectEngine();
+    ~DetectEngine();
+    // Returns the stage that flagged filePath, or kNone.
+    DetectEngineType DetectMalware(std::string filePath);
+};
+DetectEngine::DetectEngine() = default;
+DetectEngine::~DetectEngine() = default;
+// Runs the detection stages in order (PE structure analysis, machine
+// learning, sandbox) and returns the first stage that flags the file.
+// Returns kNone when the PE cannot be loaded or no stage flags it.
+DetectEngineType DetectEngine::DetectMalware(std::string filePath) {
+    auto peInfo = getPeInfo(filePath);
+    if (peInfo == nullptr) {
+        return DetectEngineType::kNone;
+    }
+
+    // Stage 1: PE structure analysis.
+    PeStructAnalyzer peAnalyzer;
+    if (peAnalyzer.AnalyzePe(peInfo)) {
+        return DetectEngineType::kPeStruct;
+    }
+
+    // Stage 2: machine-learning engine. PredictMalwareFromFile yields the
+    // probability that the file is benign, or -1.0 on failure. The sentinel
+    // is tested before the inversion; otherwise 1 - (-1.0) = 2.0 would make
+    // every unreadable file look like a malware hit.
+    MachineLearning ml;
+    const double benignProbability = ml.PredictMalwareFromFile(filePath);
+    if (benignProbability >= 0) {
+        const double score = 1 - benignProbability;
+        printf("machine learning score: %f\n", score);
+        if (score > 0.5) {
+            return DetectEngineType::kMachineLearning;
+        }
+    }
+
+    // Stage 3: sandbox engine.
+    Sandbox se;
+    se.InitEnv(peInfo);
+    se.Run();
+    const MalwareAnalysisType sandboxVerdict = se.GetMalwareAnalysisType();
+    if (sandboxVerdict == MalwareAnalysisType::kSuspicious ||
+        sandboxVerdict == MalwareAnalysisType::kMalware) {
+        return DetectEngineType::kSandbox;
+    }
+
+    return DetectEngineType::kNone;
+}
+// Scans the file named by argv[1] with DetectEngine and prints the numeric
+// value of the DetectEngineType that flagged it (0 is kNone).
+auto doMalwareScan(int argc, char* argv[]) -> void {
+    if (argc < 2) {
+        std::cout << "用法: " << argv[0] << " <文件路径>" << std::endl;
+        return;
+    }
+    const std::string filePath = argv[1];
+    DetectEngine scanner;
+    const DetectEngineType sampleType = scanner.DetectMalware(filePath);
+    // A scoped enum is not promoted to int when passed through "...", so it
+    // is converted explicitly to match the %d conversion.
+    printf("sample type: %d \n", static_cast<int>(sampleType));
+}
+int main(int argc, char* argv[]) {
+    // doMl(argc, argv);       (disabled alternative entry point)
+    // doPredict(argc, argv);  (disabled alternative entry point)
+    doMalwareScan(argc, argv);  // active mode: scans the file named by argv[1]
    return 0;
 }
--- a/ai_anti_malware/ai_anti_malware.vcxproj
+++ b/ai_anti_malware/ai_anti_malware.vcxproj
@@ -142,6 +142,7 @@
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
+    <ClCompile Include="..\ml\malware_detector.cpp" />
    <ClCompile Include="ai_anti_malware.cpp" />
    <ClCompile Include="libpeconv\libpeconv\src\buffer_util.cpp" />
    <ClCompile Include="libpeconv\libpeconv\src\caves.cpp" />
@@ -173,15 +174,21 @@
    <ClCompile Include="ml.cpp" />
    <ClCompile Include="sandbox.cpp" />
    <ClCompile Include="sandbox_api_emu.cpp" />
+    <ClCompile Include="sandbox_api_regs.cpp" />
+    <ClCompile Include="sandbox_api_stl.cpp" />
+    <ClCompile Include="sandbox_api_winhttp.cpp" />
    <ClCompile Include="sandbox_callbacks.cpp" />
+    <ClCompile Include="sandbox_malware_check.cpp" />
  </ItemGroup>
  <ItemGroup>
+    <ClInclude Include="..\ml\malware_detector.h" />
    <ClInclude Include="head.h" />
    <ClInclude Include="libpeconv\libpeconv\src\fix_dot_net_ep.h" />
    <ClInclude Include="libpeconv\libpeconv\src\ntddk.h" />
    <ClInclude Include="ml.h" />
    <ClInclude Include="native_struct.h" />
    <ClInclude Include="sandbox.h" />
+    <ClInclude Include="sandbox_api_winhttp.h" />
    <ClInclude Include="sandbox_callbacks.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/ai_anti_malware/ai_anti_malware.vcxproj.filters
+++ b/ai_anti_malware/ai_anti_malware.vcxproj.filters
@@ -126,6 +126,21 @@
    <ClCompile Include="ml.cpp">
      <Filter>源文件\machine_learning</Filter>
    </ClCompile>
+    <ClCompile Include="..\ml\malware_detector.cpp">
+      <Filter>源文件\machine_learning</Filter>
+    </ClCompile>
+    <ClCompile Include="sandbox_malware_check.cpp">
+      <Filter>源文件\sandbox</Filter>
+    </ClCompile>
+    <ClCompile Include="sandbox_api_regs.cpp">
+      <Filter>源文件\sandbox</Filter>
+    </ClCompile>
+    <ClCompile Include="sandbox_api_stl.cpp">
+      <Filter>源文件\sandbox</Filter>
+    </ClCompile>
+    <ClCompile Include="sandbox_api_winhttp.cpp">
+      <Filter>源文件\sandbox</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="head.h">
@@ -149,5 +164,11 @@
    <ClInclude Include="ml.h">
      <Filter>头文件\machine_learning</Filter>
    </ClInclude>
+    <ClInclude Include="..\ml\malware_detector.h">
+      <Filter>头文件\machine_learning</Filter>
+    </ClInclude>
+    <ClInclude Include="sandbox_api_winhttp.h">
+      <Filter>头文件\sandbox</Filter>
+    </ClInclude>
  </ItemGroup>
 </Project>
--- a/ai_anti_malware/ml.cpp
+++ b/ai_anti_malware/ml.cpp
@@ -9,7 +9,6 @@
 #include <sstream>
 #include <cfloat>
 #include <filesystem>
-
 // 确保std命名空间中的函数可用
 using std::max;
 using std::min;
@@ -855,4 +854,30 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
    printf("ML Process Result, success count: %d fail count: %d \n",
           processedCount, failedCount);
    return true;
+}
+
+// Scores an in-memory sample: extracts its feature vector and passes it to
+// the exported XGBoost model through score(). Yields -1.0 ("cannot predict")
+// when feature extraction produces nothing.
+double MachineLearning::PredictMalware(const uint8_t* buffer,
+                                       size_t bufferSize) {
+    std::vector<double> featureVector = ExtractFeatures(buffer, bufferSize);
+    return featureVector.empty() ? -1.0 : score(featureVector.data());
+}
+// Returns the probability that the file is benign (a "white" file), or -1.0
+// when the file cannot be read or PredictMalware cannot score it.
+double MachineLearning::PredictMalwareFromFile(const std::string& filePath) {
+    const std::vector<uint8_t> contents = ReadFileToBuffer(filePath);
+    if (!contents.empty()) {
+        // Score the bytes that were just loaded.
+        return PredictMalware(contents.data(), contents.size());
+    }
+
+    std::cerr << "无法读取文件: " << filePath << std::endl;
+    return -1.0;
 }
--- a/ai_anti_malware/ml.h
+++ b/ai_anti_malware/ml.h
@@ -66,6 +66,9 @@ inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
    return nullptr;
 }

+// Declares the score() function here in the header; it is defined externally.
+extern double score(double* input);
+
 class MachineLearning {
   public:
    MachineLearning();
@@ -86,6 +89,10 @@ class MachineLearning {
    // 读取文件到内存
    std::vector<uint8_t> ReadFileToBuffer(const std::string& filePath);

+    // New methods: use the XGBoost model to predict whether a file is malware
+    double PredictMalware(const uint8_t* buffer, size_t bufferSize);
+    double PredictMalwareFromFile(const std::string& filePath);
+
   private:
    // 特征提取辅助函数
    std::vector<double> EncodeProperties(
--- a/ai_anti_malware/sandbox.cpp
+++ b/ai_anti_malware/sandbox.cpp
@@ -155,7 +155,19 @@ class cFixImprot : public peconv::t_function_resolver {
                }
            }
        }
-        //__debugbreak();
+        for (const auto& module : m_sandbox->m_moduleList) {
+            for (const auto& exp : m_sandbox->m_exportFuncDict) {
+                // 检查函数名是否匹配
+                if (strcmp(exp->name, func_name) == 0) {
+                    auto newBase = reinterpret_cast<FARPROC>(
+                        module->base + exp->function_address);
+                    printf("fix import: %s => %llx \n", func_name, newBase);
+                    // 返回在模拟器中的虚拟地址
+                    return newBase;
+                }
+            }
+        }
+        __debugbreak();
        return nullptr;
    }

@@ -219,6 +231,11 @@ auto Sandbox::PushModuleToVM(const char* dllName, uint64_t moduleBase) -> void {
                                newModule->base) == false) {
        throw std::runtime_error("Failed to relocate module");
    }
+
+    // 将模块添加到LDR链表中
+    if (m_peInfo->isX64) {
+        AddModuleToLdr(newModule);
+    }
 }

 auto Sandbox::CreateModuleInfo(const char* dllName, uint64_t moduleBase,
@@ -344,7 +361,8 @@ auto Sandbox::ResolveImportExports() -> void {
        const auto exports = ResolveExport(module->real_base);
        for (const auto item : exports) {
            if (LOG_LEVEL > 0) {
-                printf("import export: [%s] %s => %llx\n", module->name, item->name, item->function_address);
+                printf("import export: [%s] %s => %llx\n", module->name,
+                       item->name, item->function_address);
            }
            module->export_function.push_back(item);
        }
@@ -586,6 +604,11 @@ auto Sandbox::InitEnv(std::shared_ptr<BasicPeInfo> peInfo) -> void {
    _ASSERTE(m_moduleList.size() == 0);
    m_moduleList.push_back(newModule);

+    // 将模块添加到LDR链表中
+    if (m_peInfo->isX64) {
+        AddModuleToLdr(newModule);
+    }
+
    ResoveImport();
    ResolveImportExports();

@@ -697,7 +720,7 @@ auto Sandbox::Run() -> void {
    InitApiHooks();
    std::cout << "Starting execution at " << std::hex << entryPoint
              << std::endl;
-    uint64_t timeout = 60 * 1000;
+    uint64_t timeout = 60 * 1000 * 1000;
    err = uc_emu_start(m_ucEngine, entryPoint, m_peInfo->imageEnd, timeout, 0);
    std::cerr << "Emulation error: " << uc_strerror(err) << std::endl;
 }
@@ -1083,3 +1106,165 @@ void Sandbox::UpdateBaseOfCode(PIMAGE_SECTION_HEADER sectionHeader,
        }
    }
 }
+
+// Creates the PEB loader data block for a 64-bit guest, directly after the
+// PEB, with three empty module lists, then stores the PEB again so its Ldr
+// field points at the block. Does nothing for a 32-bit guest or when Ldr is
+// already set.
+auto Sandbox::InitializeLdrData() -> void {
+    if (!m_peInfo->isX64 || m_peb64.Ldr != 0) {
+        return;
+    }
+
+    // The loader data sits immediately behind the PEB.
+    const uint64_t ldrDataAddress = m_pebBase + sizeof(X64PEB);
+    m_pebEnd = ldrDataAddress + sizeof(X64_PEB_LDR_DATA);
+    m_peb64.Ldr = ldrDataAddress;
+
+    // NOTE(review): the result of uc_mem_map is not checked here - confirm
+    // that this address and size are acceptable to the emulator.
+    uc_mem_map(m_ucEngine, ldrDataAddress, sizeof(X64_PEB_LDR_DATA),
+               UC_PROT_ALL);
+
+    // An empty list head points at itself in both directions.
+    const auto emptyListAt = [ldrDataAddress](size_t fieldOffset) {
+        LIST_ENTRY* const self =
+            reinterpret_cast<LIST_ENTRY*>(ldrDataAddress + fieldOffset);
+        const LIST_ENTRY head = {self, self};
+        return head;
+    };
+
+    X64_PEB_LDR_DATA loaderData{};
+    loaderData.Length = sizeof(X64_PEB_LDR_DATA);
+    loaderData.Initialized = 1;
+    loaderData.InLoadOrderModuleList =
+        emptyListAt(offsetof(X64_PEB_LDR_DATA, InLoadOrderModuleList));
+    loaderData.InMemoryOrderModuleList =
+        emptyListAt(offsetof(X64_PEB_LDR_DATA, InMemoryOrderModuleList));
+    loaderData.InInitializationOrderModuleList = emptyListAt(
+        offsetof(X64_PEB_LDR_DATA, InInitializationOrderModuleList));
+
+    uc_mem_write(m_ucEngine, ldrDataAddress, &loaderData,
+                 sizeof(X64_PEB_LDR_DATA));
+
+    // Publish the new Ldr pointer by rewriting the PEB.
+    uc_mem_write(m_ucEngine, m_pebBase, &m_peb64, sizeof(X64PEB));
+}
+
+// Builds the LDR_DATA_TABLE_ENTRY for a module and writes the module name as
+// a wide string to fullNameAddress and baseNameAddress in guest memory.
+//
+// The returned entry has DllBase, EntryPoint, SizeOfImages and both name
+// descriptors filled in; the caller sets the list links and stores the entry
+// itself. entryAddress is accepted for interface compatibility and not used.
+auto Sandbox::CreateLdrEntry(const std::shared_ptr<struct_moudle>& module,
+                             uint64_t entryAddress, uint64_t fullNameAddress,
+                             uint64_t baseNameAddress) -> LDR_DATA_TABLE_ENTRY {
+    LDR_DATA_TABLE_ENTRY entry = {0};
+    entry.DllBase = reinterpret_cast<PVOID>(module->base);
+    entry.EntryPoint = reinterpret_cast<PVOID>(module->base + module->entry);
+    entry.SizeOfImages = static_cast<ULONG>(module->size);
+
+    // Convert the module name to a wide string. The conversion is capped by
+    // the destination capacity (one slot is kept for the terminator), not by
+    // the source length, so an over-long name cannot overflow nameBuffer.
+    wchar_t nameBuffer[MAX_PATH] = {0};
+    size_t nameLength = std::mbstowcs(nameBuffer, module->name, MAX_PATH - 1);
+    if (nameLength == static_cast<size_t>(-1)) {
+        // Invalid multibyte sequence: fall back to an empty name.
+        nameLength = 0;
+    }
+    nameBuffer[nameLength] = L'\0';
+
+    const size_t nameBytes = nameLength * sizeof(wchar_t);
+
+    // Full path descriptor.
+    entry.FullDllName.Length = static_cast<USHORT>(nameBytes);
+    entry.FullDllName.MaximumLength = MAX_PATH * sizeof(wchar_t);
+    entry.FullDllName.Buffer = reinterpret_cast<PWSTR>(fullNameAddress);
+
+    // Base name descriptor (same text as the full name).
+    entry.BaseDllName.Length = static_cast<USHORT>(nameBytes);
+    entry.BaseDllName.MaximumLength = MAX_PATH * sizeof(wchar_t);
+    entry.BaseDllName.Buffer = reinterpret_cast<PWSTR>(baseNameAddress);
+
+    // Store both strings, including the terminator, in guest memory.
+    uc_mem_write(m_ucEngine, fullNameAddress, nameBuffer,
+                 nameBytes + sizeof(wchar_t));
+    uc_mem_write(m_ucEngine, baseNameAddress, nameBuffer,
+                 nameBytes + sizeof(wchar_t));
+
+    return entry;
+}
+
+// Points the Flink of all three loader list heads at the entry stored at
+// entryAddress, then writes the updated loader data back to guest memory.
+// The entry argument itself is not read.
+auto Sandbox::UpdateLdrLinks(const LDR_DATA_TABLE_ENTRY& entry,
+                             uint64_t entryAddress, X64_PEB_LDR_DATA& ldrData)
+    -> void {
+    // Guest address of a link field inside the new entry.
+    const auto linkFieldAt = [entryAddress](size_t fieldOffset) {
+        return reinterpret_cast<LIST_ENTRY*>(entryAddress + fieldOffset);
+    };
+
+    ldrData.InLoadOrderModuleList.Flink =
+        linkFieldAt(offsetof(LDR_DATA_TABLE_ENTRY, InLoadOrderLinks));
+    ldrData.InMemoryOrderModuleList.Flink =
+        linkFieldAt(offsetof(LDR_DATA_TABLE_ENTRY, InMemoryOrderLinks));
+    ldrData.InInitializationOrderModuleList.Flink = linkFieldAt(
+        offsetof(LDR_DATA_TABLE_ENTRY, InInitializationOrderLinks));
+
+    // Store the modified list heads.
+    uc_mem_write(m_ucEngine, m_peb64.Ldr, &ldrData, sizeof(X64_PEB_LDR_DATA));
+}
+
+// Registers a module in the guest's PEB loader lists (64-bit guests only).
+//
+// Reserves guest memory for one LDR_DATA_TABLE_ENTRY plus its two name
+// strings at m_pebEnd, fills the entry in and inserts it at the head of the
+// load-order, memory-order and initialization-order lists.
+auto Sandbox::AddModuleToLdr(const std::shared_ptr<struct_moudle>& module)
+    -> void {
+    if (!m_peInfo->isX64) {
+        return;  // only 64-bit guests are handled for now
+    }
+
+    if (m_peb64.Ldr == 0) {
+        InitializeLdrData();
+    }
+
+    // Each name string may hold up to MAX_PATH wide characters, which is the
+    // MaximumLength that CreateLdrEntry advertises. The space is therefore
+    // counted in bytes of wchar_t, so the two strings cannot overlap or run
+    // past the reserved block.
+    const uint64_t nameCapacityBytes = MAX_PATH * sizeof(wchar_t);
+    const uint64_t entrySize =
+        sizeof(LDR_DATA_TABLE_ENTRY) + 2 * nameCapacityBytes;
+    const uint64_t entryAddress = m_pebEnd;
+    m_pebEnd += entrySize;
+
+    // NOTE(review): the result of uc_mem_map is not checked - confirm that
+    // this address and size are acceptable to the emulator.
+    uc_mem_map(m_ucEngine, entryAddress, entrySize, UC_PROT_ALL);
+
+    // The name strings follow the entry structure.
+    const uint64_t fullNameAddress =
+        entryAddress + sizeof(LDR_DATA_TABLE_ENTRY);
+    const uint64_t baseNameAddress = fullNameAddress + nameCapacityBytes;
+
+    auto entry =
+        CreateLdrEntry(module, entryAddress, fullNameAddress, baseNameAddress);
+
+    // Current list heads; zero-initialised so a failed read cannot leave
+    // indeterminate pointers behind.
+    X64_PEB_LDR_DATA ldrData = {0};
+    uc_mem_read(m_ucEngine, m_peb64.Ldr, &ldrData, sizeof(X64_PEB_LDR_DATA));
+
+    // Inserts `links` (located at linksAddress in the guest) at the front of
+    // the list whose head is located at headAddress. The Flink of the head is
+    // set afterwards by UpdateLdrLinks; everything else, including the
+    // backward links, is maintained here.
+    const auto insertAtHead = [this](LIST_ENTRY& head, LIST_ENTRY& links,
+                                     uint64_t headAddress,
+                                     uint64_t linksAddress) {
+        const uint64_t previousFirst =
+            static_cast<uint64_t>(reinterpret_cast<uintptr_t>(head.Flink));
+        links.Flink = head.Flink;
+        links.Blink = reinterpret_cast<LIST_ENTRY*>(headAddress);
+        if (previousFirst == headAddress) {
+            // The list was empty: the new entry is also the last one.
+            head.Blink = reinterpret_cast<LIST_ENTRY*>(linksAddress);
+        } else if (previousFirst != 0) {
+            // The former first entry now has the new entry before it.
+            uc_mem_write(m_ucEngine,
+                         previousFirst + offsetof(LIST_ENTRY, Blink),
+                         &linksAddress, sizeof(linksAddress));
+        }
+    };
+
+    insertAtHead(
+        ldrData.InLoadOrderModuleList, entry.InLoadOrderLinks,
+        m_peb64.Ldr + offsetof(X64_PEB_LDR_DATA, InLoadOrderModuleList),
+        entryAddress + offsetof(LDR_DATA_TABLE_ENTRY, InLoadOrderLinks));
+    insertAtHead(
+        ldrData.InMemoryOrderModuleList, entry.InMemoryOrderLinks,
+        m_peb64.Ldr + offsetof(X64_PEB_LDR_DATA, InMemoryOrderModuleList),
+        entryAddress + offsetof(LDR_DATA_TABLE_ENTRY, InMemoryOrderLinks));
+    insertAtHead(
+        ldrData.InInitializationOrderModuleList,
+        entry.InInitializationOrderLinks,
+        m_peb64.Ldr +
+            offsetof(X64_PEB_LDR_DATA, InInitializationOrderModuleList),
+        entryAddress +
+            offsetof(LDR_DATA_TABLE_ENTRY, InInitializationOrderLinks));
+
+    // Store the finished entry.
+    uc_mem_write(m_ucEngine, entryAddress, &entry,
+                 sizeof(LDR_DATA_TABLE_ENTRY));
+
+    // Point the list heads at the new entry and store them.
+    UpdateLdrLinks(entry, entryAddress, ldrData);
+
+    printf("Added module '%s' to LDR data tables at 0x%llx\n", module->name,
+           entryAddress);
+}
--- a/ai_anti_malware/sandbox.h
+++ b/ai_anti_malware/sandbox.h
@@ -3,6 +3,7 @@
 #include <map>

 #include "head.h"
+#include <WinInet.h>
 #define PAGE_SIZE 0x1000
 #define CF_MASK (1 << 0)
 #define PF_MASK (1 << 2)
@@ -70,6 +71,18 @@ struct HeapSegment {
    size_t size;        // 堆段的总大小
    HeapBlock* blocks;  // 块链表
 };
+enum class MalwareAnalysisType {  // verdict a Sandbox keeps about the sample it runs
+    kNone,        // initial value of Sandbox::m_malwareAnalysisType
+    kSuspicious,  // can still be raised to kMalware by SetMalwareAnalysisType
+    kMalware,     // once stored, SetMalwareAnalysisType never replaces it
+};
+struct InternetHandleInfo {  // value type of Sandbox::m_internetHandles
+    HINTERNET handle;                // NOTE(review): presumably the handle value given to the guest - confirm
+    bool isConnection;               // NOTE(review): presumably separates URL/connection handles from session handles - confirm
+    std::string url;                 // NOTE(review): presumably the URL the handle was opened for - confirm
+    std::vector<char> responseData;  // NOTE(review): presumably the emulated response body - confirm
+    size_t currentPosition;          // NOTE(review): presumably the read offset into responseData - confirm
+};

 class Sandbox {
    friend class cFixImprot;  // 声明cFixImprot为友元类
@@ -114,6 +127,11 @@ class Sandbox {
    auto GetHeapBlocks() const -> std::map<uint64_t, HeapSegment*> {
        return m_heapSegments;
    }
+    // Prints every entry of ApiCallList on its own line.
+    auto PrintApiCallList() -> void {
+        for (const std::string& apiName : ApiCallList) {
+            printf("%s\n", apiName.c_str());
+        }
+    }

    // 从内存中提取PE文件并修复重定位和导入表，返回原始PE的缓冲区
    auto DumpPE() -> std::pair<std::unique_ptr<BYTE[]>, size_t>;
@@ -151,6 +169,49 @@ class Sandbox {
    auto SetCrossSectionExecution(uint64_t address) -> void {
        return m_crossSectionExecution.push_back(address);
    }
+    auto GetMalwareAnalysisType() -> MalwareAnalysisType {  // returns the verdict currently stored in m_malwareAnalysisType
+        return m_malwareAnalysisType;
+    }
+    // Records a verdict, but only ever escalates it: kNone can become
+    // kSuspicious or kMalware, kSuspicious can become kMalware, and a stored
+    // kMalware is never replaced. Relies on the enumerators being declared
+    // in increasing order of severity.
+    auto SetMalwareAnalysisType(MalwareAnalysisType type) -> void {
+        if (type > m_malwareAnalysisType) {
+            m_malwareAnalysisType = type;
+        }
+    }
+    auto CheckMalwareActive_Registry(std::wstring registryPath) -> void;
+
+    auto CheckMalwareActive_Sleep(uint32_t secToSleep) -> void;
+
+    auto CheckMalwareActive_GetProcAddress(std::string wantName) -> void;
+
+    auto CheckMalwareActive_FilePath(std::wstring filePath) -> void;
+
+    // WinHTTP API相关方法
+    auto GetNextInternetHandle() -> uint64_t { return m_nextInternetHandle++; }  // returns the current counter value, then advances it
+
+    // Stores a copy of info under handle, replacing any previous record.
+    auto AddInternetHandle(uint64_t handle, const InternetHandleInfo& info)
+        -> void {
+        m_internetHandles.insert_or_assign(handle, info);
+    }
+
+    // Looks up the record stored for handle; yields nullptr when the handle
+    // is unknown. The pointer refers to the element inside the map.
+    auto GetInternetHandle(uint64_t handle) -> InternetHandleInfo* {
+        const auto found = m_internetHandles.find(handle);
+        return found == m_internetHandles.end() ? nullptr : &found->second;
+    }
+
+    // Drops the record for handle; reports whether one existed.
+    auto RemoveInternetHandle(uint64_t handle) -> bool {
+        const auto removedCount = m_internetHandles.erase(handle);
+        return removedCount != 0;
+    }
+
+    auto GetAllInternetHandles() -> std::map<uint64_t, InternetHandleInfo>& {  // mutable reference to the live map, not a copy
+        return m_internetHandles;
+    }
+    std::vector<std::string> ApiCallList;

   private:
    std::shared_ptr<BasicPeInfo> m_peInfo;
@@ -219,4 +280,26 @@ class Sandbox {
    uint64_t m_lastExecuteSectionIndex = 0;         // 上次执行的区段索引
    uint64_t m_KSharedUserDataBase{0};
    uint64_t m_KSharedUserDataSize{0};
+
+    MalwareAnalysisType m_malwareAnalysisType = MalwareAnalysisType::kNone;
+
+    // WinHTTP API相关成员变量
+    std::map<uint64_t, InternetHandleInfo> m_internetHandles;
+    uint64_t m_nextInternetHandle = 0x1000;
+
+    // 初始化PEB的LDR数据结构
+    auto InitializeLdrData() -> void;
+
+    // 将模块添加到LDR链表中
+    auto AddModuleToLdr(const std::shared_ptr<struct_moudle>& module) -> void;
+
+    // 创建LDR_DATA_TABLE_ENTRY结构
+    auto CreateLdrEntry(const std::shared_ptr<struct_moudle>& module,
+                        uint64_t entryAddress, uint64_t fullNameAddress,
+                        uint64_t baseNameAddress) -> LDR_DATA_TABLE_ENTRY;
+
+    // 更新LDR链表
+    auto UpdateLdrLinks(const LDR_DATA_TABLE_ENTRY& entry,
+                        uint64_t entryAddress, X64_PEB_LDR_DATA& ldrData)
+        -> void;
 };
--- a/ai_anti_malware/sandbox_api_emu.cpp
+++ b/ai_anti_malware/sandbox_api_emu.cpp
--- a/ai_anti_malware/sandbox_api_regs.cpp
+++ b/ai_anti_malware/sandbox_api_regs.cpp
@@ -0,0 +1,181 @@
+#include "sandbox.h"
+#include "sandbox_callbacks.h"
+
+// Emulates RegOpenKeyExW for the sandboxed sample.
+//
+// Reads the five arguments according to the guest's calling convention,
+// passes the sub-key name to Sandbox::CheckMalwareActive_Registry, stores a
+// made-up key handle through phkResult and places the status in RAX/EAX.
+// The status is ERROR_INVALID_PARAMETER when phkResult is null and
+// ERROR_SUCCESS otherwise. The `address` argument is not used.
+auto Api_RegOpenKeyExW(void* sandbox, uc_engine* uc, uint64_t address) -> void {
+    auto context = static_cast<Sandbox*>(sandbox);
+    const bool isX64 = context->GetPeInfo()->isX64;
+    uint64_t hKey = 0;        // parent key handle
+    uint64_t lpSubKey = 0;    // guest address of the sub-key name
+    uint32_t ulOptions = 0;   // options
+    uint32_t samDesired = 0;  // requested access rights
+    uint64_t phkResult = 0;   // guest address receiving the new handle
+
+    // Default result: success.
+    LONG status = ERROR_SUCCESS;
+
+    if (isX64) {
+        // x64: rcx=hKey, rdx=lpSubKey, r8=ulOptions, r9=samDesired,
+        // [rsp+0x28]=phkResult
+        uc_reg_read(uc, UC_X86_REG_RCX, &hKey);
+        uc_reg_read(uc, UC_X86_REG_RDX, &lpSubKey);
+        uint64_t temp_options = 0;
+        uc_reg_read(uc, UC_X86_REG_R8, &temp_options);
+        ulOptions = static_cast<uint32_t>(temp_options);
+        uint64_t temp_sam = 0;
+        uc_reg_read(uc, UC_X86_REG_R9, &temp_sam);
+        samDesired = static_cast<uint32_t>(temp_sam);
+
+        // The fifth argument lives on the stack.
+        uint64_t rsp = 0;
+        uc_reg_read(uc, UC_X86_REG_RSP, &rsp);
+        uc_mem_read(uc, rsp + 0x28, &phkResult, sizeof(uint64_t));
+    } else {
+        // x86: every argument lives on the stack.
+        uint32_t esp_address = 0;
+        uc_reg_read(uc, UC_X86_REG_ESP, &esp_address);
+        esp_address += 4;  // skip the return address
+
+        uint32_t temp_hkey = 0;
+        uc_mem_read(uc, esp_address, &temp_hkey, sizeof(uint32_t));
+        hKey = temp_hkey;
+        esp_address += 4;
+
+        uint32_t temp_subkey = 0;
+        uc_mem_read(uc, esp_address, &temp_subkey, sizeof(uint32_t));
+        lpSubKey = temp_subkey;
+        esp_address += 4;
+
+        uc_mem_read(uc, esp_address, &ulOptions, sizeof(uint32_t));
+        esp_address += 4;
+
+        uc_mem_read(uc, esp_address, &samDesired, sizeof(uint32_t));
+        esp_address += 4;
+
+        uint32_t temp_result = 0;
+        uc_mem_read(uc, esp_address, &temp_result, sizeof(uint32_t));
+        phkResult = temp_result;
+    }
+
+    // Copy the sub-key name out of guest memory, one character at a time,
+    // stopping at the terminator or after MAX_PATH - 1 characters. A failed
+    // read leaves `ch` at zero and therefore also ends the copy.
+    std::wstring subKeyName;
+    if (lpSubKey != 0) {
+        for (size_t i = 0; i < MAX_PATH - 1; i++) {
+            wchar_t ch = 0;
+            uc_mem_read(uc, lpSubKey + (i * sizeof(wchar_t)), &ch,
+                        sizeof(wchar_t));
+            if (ch == 0) break;
+            subKeyName.push_back(ch);
+        }
+    }
+
+    // Make up a handle value (non-zero, a multiple of 4).
+    uint32_t newKeyHandle = 0x1000 + (std::rand() % 0xFFFFF) * 4;
+
+    // Opened keys are not tracked; a handle map could be added here later.
+
+    // Store the handle through the result pointer. The guest slot is pointer
+    // sized, so the handle is widened first: writing 8 bytes straight from
+    // the 4-byte variable would read past its end.
+    if (phkResult != 0) {
+        const uint64_t guestHandle = newKeyHandle;
+        uc_mem_write(uc, phkResult, &guestHandle,
+                     isX64 ? sizeof(uint64_t) : sizeof(uint32_t));
+    } else {
+        status = ERROR_INVALID_PARAMETER;
+    }
+
+    // Name of the predefined root key, for the log line only. A 32-bit guest
+    // passes 32-bit handle values, so only the low 32 bits are compared for
+    // it; otherwise they never equal the host's pointer-sized constants.
+    struct RootKey {
+        HKEY key;
+        const char* name;
+    };
+    const RootKey knownRoots[] = {
+        {HKEY_CLASSES_ROOT, "HKEY_CLASSES_ROOT"},
+        {HKEY_CURRENT_USER, "HKEY_CURRENT_USER"},
+        {HKEY_LOCAL_MACHINE, "HKEY_LOCAL_MACHINE"},
+        {HKEY_USERS, "HKEY_USERS"},
+        {HKEY_CURRENT_CONFIG, "HKEY_CURRENT_CONFIG"},
+    };
+    std::string rootKeyName = "Unknown key handle";
+    for (const auto& root : knownRoots) {
+        const uint64_t rootValue = reinterpret_cast<uintptr_t>(root.key);
+        const bool matches =
+            isX64 ? hKey == rootValue
+                  : static_cast<uint32_t>(hKey) ==
+                        static_cast<uint32_t>(rootValue);
+        if (matches) {
+            rootKeyName = root.name;
+            break;
+        }
+    }
+
+    // ASCII rendering of the sub-key name for the log line.
+    std::string wstr_to_str;
+    for (wchar_t c : subKeyName) {
+        if (c <= 127) {
+            wstr_to_str += static_cast<char>(c);
+        } else {
+            wstr_to_str += '?';
+        }
+    }
+    context->CheckMalwareActive_Registry(subKeyName);
+
+    printf(
+        "[*] RegOpenKeyExW: %s\\%s, Options=0x%x, SAM=0x%x -> Handle=0x%x, "
+        "Status=%ld\n",
+        rootKeyName.c_str(), wstr_to_str.c_str(), ulOptions, samDesired,
+        newKeyHandle, status);
+
+    // Return the status. RAX is 8 bytes wide, so the value is widened before
+    // the register write instead of being read from the 4-byte LONG.
+    const uint64_t returnValue = static_cast<uint32_t>(status);
+    uc_reg_write(uc, isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX, &returnValue);
+}
+
+// Emulates RegCloseKey for the sandboxed sample.
+//
+// Reads the handle argument, logs the call and places the status in RAX/EAX:
+// ERROR_INVALID_HANDLE for a zero handle, ERROR_SUCCESS otherwise. No handle
+// bookkeeping is performed. The `address` argument is not used.
+auto Api_RegCloseKey(void* sandbox, uc_engine* uc, uint64_t address) -> void {
+    auto context = static_cast<Sandbox*>(sandbox);
+    const bool isX64 = context->GetPeInfo()->isX64;
+    uint64_t hKey = 0;  // key handle
+
+    // Default result: success.
+    LONG status = ERROR_SUCCESS;
+
+    if (isX64) {
+        // x64: rcx=hKey
+        uc_reg_read(uc, UC_X86_REG_RCX, &hKey);
+    } else {
+        // x86: the argument lives on the stack.
+        uint32_t esp_address = 0;
+        uc_reg_read(uc, UC_X86_REG_ESP, &esp_address);
+        esp_address += 4;  // skip the return address
+
+        uint32_t temp_hkey = 0;
+        uc_mem_read(uc, esp_address, &temp_hkey, sizeof(uint32_t));
+        hKey = temp_hkey;
+    }
+
+    // Opened keys are not tracked, so there is nothing to remove here; only
+    // a zero handle is rejected.
+    if (hKey == 0) {
+        status = ERROR_INVALID_HANDLE;
+    }
+
+    printf("[*] RegCloseKey: Handle=0x%llx -> Status=%ld\n", hKey, status);
+
+    // Return the status. RAX is 8 bytes wide, so the value is widened before
+    // the register write instead of being read from the 4-byte LONG.
+    const uint64_t returnValue = static_cast<uint32_t>(status);
+    uc_reg_write(uc, isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX, &returnValue);
+}
--- a/ai_anti_malware/sandbox_api_stl.cpp
+++ b/ai_anti_malware/sandbox_api_stl.cpp
--- a/ai_anti_malware/sandbox_api_winhttp.cpp
+++ b/ai_anti_malware/sandbox_api_winhttp.cpp
@@ -0,0 +1,350 @@
+#include "sandbox.h"
+#include <windows.h>
+#include <wininet.h>
+#include <algorithm>
+
+// Forward declarations of the WinINet emulation handlers so that they are visible to other translation units
+extern auto Api_InternetOpenA(void* sandbox, uc_engine* uc, uint64_t address)
+    -> void;
+extern auto Api_InternetOpenUrlA(void* sandbox, uc_engine* uc, uint64_t address)
+    -> void;
+extern auto Api_InternetCloseHandle(void* sandbox, uc_engine* uc,
+                                    uint64_t address) -> void;
+extern auto Api_InternetReadFile(void* sandbox, uc_engine* uc, uint64_t address)
+    -> void;
+
+// Emulates wininet!InternetOpenA.
+//
+// Reads the five arguments (lpszAgent, dwAccessType, lpszProxy,
+// lpszProxyBypass, dwFlags) from the guest, marks the sample as suspicious
+// when the User-Agent contains one of the listed substrings or when a
+// non-empty proxy is configured together with INTERNET_OPEN_TYPE_PROXY,
+// registers a new non-connection Internet handle with the sandbox and returns
+// that handle in RAX/EAX.
+//
+// sandbox: the owning Sandbox instance, passed as void*.
+// uc:      Unicorn engine handle used for register/memory access.
+// address: not used by this handler.
+auto Api_InternetOpenA(void* sandbox, uc_engine* uc, uint64_t address) -> void {
+    auto context = static_cast<Sandbox*>(sandbox);
+    const bool isX64 = context->GetPeInfo()->isX64;
+
+    // Guest arguments
+    uint64_t lpszAgent = 0;
+    uint64_t dwAccessType = 0;
+    uint64_t lpszProxy = 0;
+    uint64_t lpszProxyBypass = 0;
+    uint32_t dwFlags = 0;
+
+    if (isX64) {
+        uc_reg_read(uc, UC_X86_REG_RCX, &lpszAgent);
+        uc_reg_read(uc, UC_X86_REG_RDX, &dwAccessType);
+        uc_reg_read(uc, UC_X86_REG_R8, &lpszProxy);
+        uc_reg_read(uc, UC_X86_REG_R9, &lpszProxyBypass);
+        // dwAccessType is a 32-bit value; the upper half of RDX is not part of
+        // the argument and must not take part in the comparison below.
+        dwAccessType &= 0xFFFFFFFFull;
+
+        // Fifth argument is read from the stack at rsp + 0x28.
+        uint64_t rsp = 0;
+        uc_reg_read(uc, UC_X86_REG_RSP, &rsp);
+        uc_mem_read(uc, rsp + 0x28, &dwFlags, sizeof(dwFlags));
+    } else {
+        uint32_t esp = 0;
+        uc_reg_read(uc, UC_X86_REG_ESP, &esp);
+
+        // Arguments are consecutive 4-byte slots starting right above the
+        // return address. Each slot is read as exactly 32 bits so that a
+        // neighbouring argument never leaks into the upper half of a value.
+        uint32_t param_addr = esp + 4;
+        const auto readArg32 = [&]() -> uint32_t {
+            uint32_t slot = 0;
+            uc_mem_read(uc, param_addr, &slot, sizeof(slot));
+            param_addr += 4;
+            return slot;
+        };
+        lpszAgent = readArg32();
+        dwAccessType = readArg32();
+        lpszProxy = readArg32();
+        lpszProxyBypass = readArg32();
+        dwFlags = readArg32();
+    }
+
+    // Fetch the User-Agent string (at most 255 bytes) from guest memory
+    std::string agentString;
+    if (lpszAgent != 0) {
+        char buffer[256] = {0};
+        uc_mem_read(uc, lpszAgent, buffer, sizeof(buffer) - 1);
+        agentString = buffer;
+
+        // Substrings that make a User-Agent count as suspicious
+        const std::vector<std::string> suspiciousAgents = {
+            "wget",    "curl",       "python",  "go-http",
+            "perl",    "powershell", "winhttp", "urlmon",
+            "mozilla", "edge",       "chrome",  "internet explorer"};
+
+        // Lower-case once, outside the loop, for a case-insensitive match
+        std::string lowerAgent = agentString;
+        std::transform(lowerAgent.begin(), lowerAgent.end(),
+                       lowerAgent.begin(), [](unsigned char c) {
+                           return static_cast<char>(std::tolower(c));
+                       });
+
+        for (const auto& agent : suspiciousAgents) {
+            if (lowerAgent.find(agent) != std::string::npos) {
+                context->SetMalwareAnalysisType(
+                    MalwareAnalysisType::kSuspicious);
+#if LOG_LEVEL >= 1
+                printf("[!!!] Suspicious User-Agent: %s\n",
+                       agentString.c_str());
+#endif
+                break;
+            }
+        }
+    }
+
+    // Reserve the handle value that will be returned to the guest
+    uint64_t handleValue = context->GetNextInternetHandle();
+
+    // An explicitly configured, non-empty proxy is treated as suspicious
+    if (dwAccessType == INTERNET_OPEN_TYPE_PROXY && lpszProxy != 0) {
+        char proxyBuffer[256] = {0};
+        uc_mem_read(uc, lpszProxy, proxyBuffer, sizeof(proxyBuffer) - 1);
+        std::string proxyString = proxyBuffer;
+
+        if (!proxyString.empty()) {
+            context->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
+#if LOG_LEVEL >= 1
+            printf("[!!!] Suspicious proxy configuration: %s\n",
+                   proxyString.c_str());
+#endif
+        }
+    }
+
+    // Register the session handle with the sandbox
+    InternetHandleInfo handleInfo;
+    handleInfo.handle = (HINTERNET)handleValue;
+    handleInfo.isConnection = false;
+    context->AddInternetHandle(handleValue, handleInfo);
+
+    // Return the handle to the guest
+    uint64_t returnValue = handleValue;
+    uc_reg_write(uc, isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX, &returnValue);
+}
+
+// Emulates wininet!InternetOpenUrlA.
+//
+// Reads the six arguments (hInternet, lpszUrl, lpszHeaders, dwHeadersLength,
+// dwFlags, dwContext) from the guest, unconditionally marks the sample as
+// malware, logs the URL, and - when hInternet is a handle known to the
+// sandbox - registers a new connection handle carrying a canned HTTP
+// response. The new handle (or 0 for an unknown hInternet) is returned in
+// RAX/EAX.
+//
+// sandbox: the owning Sandbox instance, passed as void*.
+// uc:      Unicorn engine handle used for register/memory access.
+// address: not used by this handler.
+auto Api_InternetOpenUrlA(void* sandbox, uc_engine* uc, uint64_t address)
+    -> void {
+    auto context = static_cast<Sandbox*>(sandbox);
+    const bool isX64 = context->GetPeInfo()->isX64;
+
+    // Guest arguments
+    uint64_t hInternet = 0;
+    uint64_t lpszUrl = 0;
+    uint64_t lpszHeaders = 0;
+    uint64_t dwHeadersLength = 0;
+    uint64_t dwFlags = 0;
+    uint64_t dwContext = 0;
+
+    if (isX64) {
+        uc_reg_read(uc, UC_X86_REG_RCX, &hInternet);
+        uc_reg_read(uc, UC_X86_REG_RDX, &lpszUrl);
+        uc_reg_read(uc, UC_X86_REG_R8, &lpszHeaders);
+        uc_reg_read(uc, UC_X86_REG_R9, &dwHeadersLength);
+        // dwHeadersLength is a 32-bit value; drop the upper half of R9.
+        dwHeadersLength &= 0xFFFFFFFFull;
+
+        uint64_t rsp = 0;
+        uc_reg_read(uc, UC_X86_REG_RSP, &rsp);
+        // dwFlags is 32 bits wide; dwContext occupies the whole next slot.
+        uc_mem_read(uc, rsp + 0x28, &dwFlags, sizeof(uint32_t));
+        uc_mem_read(uc, rsp + 0x30, &dwContext, sizeof(dwContext));
+    } else {
+        uint32_t esp = 0;
+        uc_reg_read(uc, UC_X86_REG_ESP, &esp);
+
+        // Arguments are consecutive 4-byte slots starting right above the
+        // return address. Each slot is read as exactly 32 bits so that a
+        // neighbouring argument never leaks into the upper half of a value.
+        uint32_t param_addr = esp + 4;
+        const auto readArg32 = [&]() -> uint32_t {
+            uint32_t slot = 0;
+            uc_mem_read(uc, param_addr, &slot, sizeof(slot));
+            param_addr += 4;
+            return slot;
+        };
+        hInternet = readArg32();
+        lpszUrl = readArg32();
+        lpszHeaders = readArg32();
+        dwHeadersLength = readArg32();
+        dwFlags = readArg32();
+        dwContext = readArg32();
+    }
+
+    // Any call to this API marks the sample as malware, whatever the arguments.
+    context->SetMalwareAnalysisType(MalwareAnalysisType::kMalware);
+
+    // Fetch the URL (at most 1023 bytes) from guest memory
+    std::string urlString;
+    if (lpszUrl != 0) {
+        char buffer[1024] = {0};
+        uc_mem_read(uc, lpszUrl, buffer, sizeof(buffer) - 1);
+        urlString = buffer;
+    }
+    printf("urlString: %s\n", urlString.c_str());
+
+    // Unknown parent handle: return NULL to the guest
+    if (context->GetInternetHandle(hInternet) == nullptr) {
+        uint64_t returnValue = 0;
+        uc_reg_write(uc, isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX,
+                     &returnValue);
+        return;
+    }
+
+    // Reserve a handle value for the URL connection
+    uint64_t handleValue = context->GetNextInternetHandle();
+
+    // Describe the connection handle
+    InternetHandleInfo handleInfo;
+    handleInfo.handle = (HINTERNET)handleValue;
+    handleInfo.isConnection = true;
+    handleInfo.url = urlString;
+    // Canned response body. A real request could be issued here instead, e.g.
+    // to check whether the payload is a PE file.
+    const char* sampleResponse =
+        "HTTP/1.1 200 OK\r\nContent-Type: "
+        "text/html\r\n\r\n<html><body>huoji own me and all</body></html>";
+    handleInfo.responseData.assign(sampleResponse,
+                                   sampleResponse + strlen(sampleResponse));
+    handleInfo.currentPosition = 0;
+
+    context->AddInternetHandle(handleValue, handleInfo);
+
+    // Return the new handle to the guest
+    uc_reg_write(uc, isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX, &handleValue);
+}
+
+// Emulates wininet!InternetCloseHandle.
+//
+// Reads hInternet from the guest (RCX on x64, first stack slot above the
+// return address on x86). If the sandbox knows the handle it is removed and
+// TRUE is returned in RAX/EAX; otherwise FALSE is returned.
+//
+// sandbox: the owning Sandbox instance, passed as void*.
+// uc:      Unicorn engine handle used for register/memory access.
+// address: not used by this handler.
+auto Api_InternetCloseHandle(void* sandbox, uc_engine* uc, uint64_t address)
+    -> void {
+    auto context = static_cast<Sandbox*>(sandbox);
+    const bool isX64 = context->GetPeInfo()->isX64;
+
+    uint64_t hInternet = 0;
+    if (isX64) {
+        uc_reg_read(uc, UC_X86_REG_RCX, &hInternet);
+    } else {
+        uint32_t esp = 0;
+        uc_reg_read(uc, UC_X86_REG_ESP, &esp);
+
+        // Only the low 4 bytes are filled; hInternet was zero-initialised.
+        uc_mem_read(uc, esp + 4, &hInternet, sizeof(uint32_t));
+    }
+
+    // Drop the handle if the sandbox is tracking it
+    const bool handleValid = (context->GetInternetHandle(hInternet) != nullptr);
+    if (handleValid) {
+        context->RemoveInternetHandle(hInternet);
+    }
+
+    // uc_reg_write copies a whole register's worth of bytes from the source
+    // pointer, so RAX needs an 8-byte source rather than a 4-byte one.
+    uint64_t returnValue = handleValid ? TRUE : FALSE;
+    uc_reg_write(uc, isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX, &returnValue);
+}
+
+// Emulates wininet!InternetReadFile.
+//
+// Reads the four arguments (hFile, lpBuffer, dwNumberOfBytesToRead,
+// lpdwNumberOfBytesRead) from the guest. For a known connection handle it
+// copies up to dwNumberOfBytesToRead bytes of the stored response into the
+// guest buffer, advances the handle's read position, stores the byte count
+// through lpdwNumberOfBytesRead and returns TRUE in RAX/EAX. The chunk being
+// delivered is also scanned for a list of suspicious substrings. For an
+// unknown or non-connection handle FALSE is returned and nothing is written.
+//
+// sandbox: the owning Sandbox instance, passed as void*.
+// uc:      Unicorn engine handle used for register/memory access.
+// address: not used by this handler.
+auto Api_InternetReadFile(void* sandbox, uc_engine* uc, uint64_t address)
+    -> void {
+    auto context = static_cast<Sandbox*>(sandbox);
+    const bool isX64 = context->GetPeInfo()->isX64;
+
+    // uc_reg_write copies a whole register's worth of bytes from the source
+    // pointer, so the result is always staged in an 8-byte variable.
+    const auto setReturn = [&](uint64_t result) {
+        uc_reg_write(uc, isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX, &result);
+    };
+
+    // Guest arguments
+    uint64_t hFile = 0;
+    uint64_t lpBuffer = 0;
+    uint32_t dwNumberOfBytesToRead = 0;
+    uint64_t lpdwNumberOfBytesRead = 0;
+
+    if (isX64) {
+        uc_reg_read(uc, UC_X86_REG_RCX, &hFile);
+        uc_reg_read(uc, UC_X86_REG_RDX, &lpBuffer);
+        // R8 is a 64-bit register: read it into a 64-bit temporary and
+        // truncate, instead of letting uc_reg_read store 8 bytes into a
+        // 4-byte variable.
+        uint64_t r8Value = 0;
+        uc_reg_read(uc, UC_X86_REG_R8, &r8Value);
+        dwNumberOfBytesToRead = static_cast<uint32_t>(r8Value);
+        uc_reg_read(uc, UC_X86_REG_R9, &lpdwNumberOfBytesRead);
+    } else {
+        uint32_t esp = 0;
+        uc_reg_read(uc, UC_X86_REG_ESP, &esp);
+
+        // Arguments are consecutive 4-byte slots starting right above the
+        // return address.
+        uint32_t param_addr = esp + 4;
+        const auto readArg32 = [&]() -> uint32_t {
+            uint32_t slot = 0;
+            uc_mem_read(uc, param_addr, &slot, sizeof(slot));
+            param_addr += 4;
+            return slot;
+        };
+        hFile = readArg32();
+        lpBuffer = readArg32();
+        dwNumberOfBytesToRead = readArg32();
+        lpdwNumberOfBytesRead = readArg32();
+    }
+
+    // Only connection handles known to the sandbox can be read from
+    auto it = context->GetInternetHandle(hFile);
+    if (it == nullptr || !it->isConnection) {
+        setReturn(FALSE);
+        return;
+    }
+
+    InternetHandleInfo& handleInfo = *it;
+
+    // Clamp the request to what is left of the stored response
+    uint32_t bytesToRead = dwNumberOfBytesToRead;
+    if (handleInfo.currentPosition + bytesToRead >
+        handleInfo.responseData.size()) {
+        bytesToRead = (uint32_t)(handleInfo.responseData.size() -
+                                 handleInfo.currentPosition);
+    }
+
+    if (bytesToRead > 0) {
+        // Scan the chunk that is about to be delivered for suspicious content
+        std::string dataChunk(
+            handleInfo.responseData.begin() + handleInfo.currentPosition,
+            handleInfo.responseData.begin() + handleInfo.currentPosition +
+                bytesToRead);
+
+        const std::vector<std::string> suspiciousResponsePatterns = {
+            "powershell",     "cmd.exe",      "eval(",      "exec(",
+            "system(",        "shell_exec",   "<script",    "function()",
+            "document.write", "base64",       "FromBase64", "CreateObject",
+            "WScript",        "ActiveXObject"};
+
+        for (const auto& pattern : suspiciousResponsePatterns) {
+            if (dataChunk.find(pattern) != std::string::npos) {
+                context->SetMalwareAnalysisType(
+                    MalwareAnalysisType::kSuspicious);
+#if LOG_LEVEL >= 1
+                printf("[!!!] Suspicious content in HTTP response: %s\n",
+                       pattern.c_str());
+#endif
+                break;
+            }
+        }
+
+        // Copy the chunk into the guest buffer and advance the read position
+        uc_mem_write(
+            uc, lpBuffer,
+            handleInfo.responseData.data() + handleInfo.currentPosition,
+            bytesToRead);
+        handleInfo.currentPosition += bytesToRead;
+    }
+
+    // Report how many bytes were delivered (0 once the response is exhausted)
+    uc_mem_write(uc, lpdwNumberOfBytesRead, &bytesToRead, sizeof(bytesToRead));
+
+    setReturn(TRUE);
+}
--- a/ai_anti_malware/sandbox_api_winhttp.h
+++ b/ai_anti_malware/sandbox_api_winhttp.h
@@ -0,0 +1,11 @@
+#pragma once
+#include "head.h"
+
+// Declarations of the Internet (WinINet) API emulation handlers
+auto Api_InternetOpenA(void* sandbox, uc_engine* uc, uint64_t address) -> void;
+auto Api_InternetOpenUrlA(void* sandbox, uc_engine* uc, uint64_t address)
+    -> void;
+auto Api_InternetCloseHandle(void* sandbox, uc_engine* uc, uint64_t address)
+    -> void;
+auto Api_InternetReadFile(void* sandbox, uc_engine* uc, uint64_t address)
+    -> void;
--- a/ai_anti_malware/sandbox_callbacks.cpp
+++ b/ai_anti_malware/sandbox_callbacks.cpp
@@ -46,6 +46,7 @@ void handleCodeRun(uc_engine* uc, uint64_t address, uint32_t size,
            "[!!!]detect cross section excute, from %d to %d,address: 0x%llx\n",
            sandbox->GetLastExecuteSectionIndex(), currentSectionIndex,
            address);
+        sandbox->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);

        // 记录跨区段执行地址
        sandbox->SetCrossSectionExecution(address);
@@ -99,6 +100,29 @@ void handleMemoryRead(uc_engine* uc, uc_mem_type type, uint64_t address,
                sandbox->GetPeInfo()->isX64 ? UC_X86_REG_RIP : UC_X86_REG_EIP,
                &regRip);

+    // 检测是否访问LDR结构
+    if (sandbox->GetPeInfo()->isX64) {
+        uint64_t ldrAddress = sandbox->GetPeb64()->Ldr;
+        if (ldrAddress != 0 && address >= ldrAddress &&
+            address < (ldrAddress + sizeof(X64_PEB_LDR_DATA))) {
+            printf(
+                "[WARNING] Suspicious direct LDR access detected at RIP: "
+                "0x%llx, accessing address: 0x%llx\n",
+                regRip, address);
+            sandbox->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
+        }
+    } else {
+        uint32_t ldrAddress = sandbox->GetPeb32()->Ldr;
+        if (ldrAddress != 0 && address >= ldrAddress &&
+            address < (ldrAddress + sizeof(_PEB_LDR_DATA))) {
+            printf(
+                "[WARNING] Suspicious direct LDR access detected at RIP: 0x%x, "
+                "accessing address: 0x%llx\n",
+                static_cast<uint32_t>(regRip), address);
+            sandbox->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
+        }
+    }
+
    uint64_t readAddress;
    auto readError =
        uc_mem_read(sandbox->GetUnicornHandle(), address, &readAddress, size);
@@ -235,16 +259,53 @@ void handleMemoryUnmapRead(uc_engine* uc, uc_mem_type type, uint64_t address,
    printf("[handleMemoryUnmapRead] Address: %p Size: %p Value: %p\n", address,
           size, value);
    dumpVmenv(uc, userData);
-    __debugbreak();
 }

 void handleMemoryWrite(uc_engine* uc, uc_mem_type type, uint64_t address,
                        int size, int64_t value, void* userData) {
-    // 待实现
+    // Memory-write callback: flags the sample as suspicious when the written
+    // address falls inside the loader data structure referenced by PEB->Ldr,
+    // and optionally logs every write.
+    auto* sandbox = static_cast<Sandbox*>(userData);
+    if (!sandbox) return;
+
+    const bool isX64 = sandbox->GetPeInfo()->isX64;
+
+    // Zero-initialised on purpose: reading EIP fills only the low 32 bits, and
+    // the full 64-bit value is printed by the trace at the end.
+    uint64_t regRip = 0;
+    uc_reg_read(uc, isX64 ? UC_X86_REG_RIP : UC_X86_REG_EIP, &regRip);
+
+    // Detect writes into the LDR structure
+    if (isX64) {
+        uint64_t ldrAddress = sandbox->GetPeb64()->Ldr;
+        if (ldrAddress != 0 && address >= ldrAddress &&
+            address < (ldrAddress + sizeof(X64_PEB_LDR_DATA))) {
+            printf(
+                "[WARNING] Suspicious direct LDR modification detected at RIP: "
+                "0x%llx, modifying address: 0x%llx\n",
+                regRip, address);
+            sandbox->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
+        }
+    } else {
+        uint32_t ldrAddress = sandbox->GetPeb32()->Ldr;
+        if (ldrAddress != 0 && address >= ldrAddress &&
+            address < (ldrAddress + sizeof(_PEB_LDR_DATA))) {
+            printf(
+                "[WARNING] Suspicious direct LDR modification detected at RIP: "
+                "0x%x, modifying address: 0x%llx\n",
+                static_cast<uint32_t>(regRip), address);
+            sandbox->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
+        }
+    }
+
+    if (LOG_LEVEL > 0) {
+        // Conversion specifiers match the argument types (%p is only valid for
+        // pointers, not for integers of differing widths).
+        printf(
+            "[handleMemoryWrite] Address: 0x%llx Size: %d Value: 0x%llx RIP: "
+            "0x%llx\n",
+            static_cast<unsigned long long>(address), size,
+            static_cast<unsigned long long>(value),
+            static_cast<unsigned long long>(regRip));
+    }
 }

 void handleSyscall(uc_engine* uc, void* userData) {
     // Syscall handling is not implemented yet: the callback only marks the
     // sample as suspicious and logs "[handleSyscall] Syscall detected".
+    auto* sandbox = static_cast<Sandbox*>(userData);
+    if (!sandbox) return;
+    sandbox->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
+    printf("[handleSyscall] Syscall detected\n");
 }

 }  // namespace sandboxCallbacks
--- a/ai_anti_malware/sandbox_malware_check.cpp
+++ b/ai_anti_malware/sandbox_malware_check.cpp
@@ -0,0 +1,96 @@
+#include "sandbox.h"
+#include <windows.h>
+
+// Marks the sample as suspicious when `registryPath` contains one of the
+// sensitive registry sub-paths listed below.
+//
+// The match is a case-sensitive substring search. Nothing is opened or
+// queried on the host; only the path text is inspected.
+auto Sandbox::CheckMalwareActive_Registry(std::wstring registryPath) -> void {
+    // Sensitive registry sub-paths
+    const std::vector<std::wstring> sensitiveRegistryPaths = {
+        L"SOFTWARE\\DingTalk", L"SOFTWARE\\Tencent",
+        L"SOFTWARE\\WOW6432Node\\DingTalk", L"SOFTWARE\\WOW6432Node\\Tencent"};
+
+    // Does the requested path mention any of them?
+    bool isSensitiveRegistry = false;
+    for (const auto& sensitivePath : sensitiveRegistryPaths) {
+        if (registryPath.find(sensitivePath) != std::wstring::npos) {
+            isSensitiveRegistry = true;
+            break;
+        }
+    }
+
+    if (isSensitiveRegistry) {
+        SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
+
+// Log the hit. registryPath is a wide string, so it needs %ls (as in
+// CheckMalwareActive_FilePath); %s would misread the wchar_t buffer.
+#if LOG_LEVEL >= 1
+        printf("[!!!] SensitiveRegistry Access %ls\n", registryPath.c_str());
+#endif
+    }
+}
+// Marks the sample as suspicious when the requested sleep exceeds 30 * 1000.
+//
+// NOTE(review): the parameter name and the log text say "seconds", but the
+// 1000 * 30 threshold looks like milliseconds - confirm against the caller.
+auto Sandbox::CheckMalwareActive_Sleep(uint32_t secToSleep) -> void {
+    if (secToSleep > 1000 * 30) {
+        SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
+        // %u: the value is unsigned; %d would print large values such as
+        // 0xFFFFFFFF as a negative number.
+        printf("[!!!] suspicious Sleep %u seconds\n", secToSleep);
+    }
+}
+// Marks the sample as malware when it resolves, by exact name, one of the
+// native memory/thread routines listed below.
+auto Sandbox::CheckMalwareActive_GetProcAddress(std::string wantName) -> void {
+    // Built once instead of on every call; duplicate entries from the original
+    // list are removed, which does not change which names match.
+    static const std::vector<std::string> sensitiveGetProcAddressNames = {
+        "ZwAllocateVirtualMemory",
+        "NtAllocateVirtualMemory",
+        "NtFreeVirtualMemory",
+        "NtProtectVirtualMemory",
+        "NtWriteVirtualMemory",
+        "NtReadVirtualMemory",
+        "NtCreateThreadEx",
+        "NtCreateThread",
+        "NtOpenThread",
+        "NtTerminateThread",
+        "NtResumeThread",
+        "NtSuspendThread"};
+    // more
+    if (std::find(sensitiveGetProcAddressNames.begin(),
+                  sensitiveGetProcAddressNames.end(),
+                  wantName) != sensitiveGetProcAddressNames.end()) {
+        SetMalwareAnalysisType(MalwareAnalysisType::kMalware);
+        printf("[!!!] suspicious GetProcAddress %s\n", wantName.c_str());
+    }
+}
+
+// Marks the sample as suspicious when `filePath` contains one of the
+// sensitive directory fragments listed below (case-sensitive substring match).
+auto Sandbox::CheckMalwareActive_FilePath(std::wstring filePath) -> void {
+    // Directory fragments that count as sensitive
+    const wchar_t* const sensitiveFragments[] = {
+        L"\\AppData\\",
+        L"\\Temp\\",
+        L"\\Windows\\System32\\",
+        L"\\Program Files\\",
+        L"\\Program Files (x86)\\",
+        L"\\Documents\\",
+        L"\\Downloads\\",
+        L"\\Desktop\\",
+        L"\\Users\\All Users\\",
+        L"\\ProgramData\\",
+        L"\\Microsoft\\Windows\\Start Menu\\",
+        L"\\Startup\\"};
+
+    for (const wchar_t* fragment : sensitiveFragments) {
+        if (filePath.find(fragment) == std::wstring::npos) {
+            continue;
+        }
+
+        // First hit is enough: flag the sample, log, and stop searching.
+        SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
+#if LOG_LEVEL >= 1
+        printf("[!!!] SensitiveFilePath Access: %ls\n", filePath.c_str());
+#endif
+        return;
+    }
+}
--- a/ml/.vscode/settings.json
+++ b/ml/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+  "python.analysis.typeCheckingMode": "basic"
+}
--- a/ml/data/malware_features.csv
+++ b/ml/data/malware_features.csv
--- a/ml/data/whitelist_features.csv
+++ b/ml/data/whitelist_features.csv
--- a/ml/malware_detector.cpp
+++ b/ml/malware_detector.cpp
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -1,99 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import joblib
-import pandas as pd
-import numpy as np
-import sys
-import os
-
-def load_model(model_path='xgboost_malware_detector.model'):
-    """
-    加载训练好的模型
-    """
-    print(f"正在加载模型: {model_path}")
-    try:
-        model = joblib.load(model_path)
-        print("模型加载成功！")
-        return model
-    except Exception as e:
-        print(f"模型加载失败: {e}")
-        return None
-
-def predict_file(model, csv_path):
-    """
-    对单个CSV文件进行预测
-    """
-    try:
-        # 加载CSV文件
-        df = pd.read_csv(csv_path)
-        
-        # 提取特征 (除去第一列文件路径)
-        features = df.iloc[:, 1:]
-        
-        # 使用模型预测
-        predictions = model.predict(features)
-        probabilities = model.predict_proba(features)
-        
-        # 添加预测结果到数据框
-        df['预测标签'] = predictions
-        df['恶意软件概率'] = probabilities[:, 1]
-        
-        # 创建结果数据框
-        results = pd.DataFrame({
-            '文件路径': df.iloc[:, 0],
-            '预测标签': predictions,
-            '恶意软件概率': probabilities[:, 1]
-        })
-        
-        # 保存结果到CSV
-        output_path = os.path.splitext(csv_path)[0] + '_predictions.csv'
-        results.to_csv(output_path, index=False)
-        print(f"预测结果已保存到: {output_path}")
-        
-        # 打印概要
-        malware_count = len(results[results['预测标签'] == 1])
-        total_count = len(results)
-        print(f"总样本数: {total_count}")
-        print(f"检测为恶意软件: {malware_count} ({malware_count/total_count*100:.2f}%)")
-        print(f"检测为白名单软件: {total_count - malware_count} ({(total_count-malware_count)/total_count*100:.2f}%)")
-        
-        return results
-    
-    except Exception as e:
-        print(f"预测失败: {e}")
-        return None
-
-def batch_predict(model, csv_paths):
-    """
-    批量预测多个CSV文件
-    """
-    results = {}
-    for csv_path in csv_paths:
-        print(f"\n分析文件: {csv_path}")
-        result = predict_file(model, csv_path)
-        if result is not None:
-            results[csv_path] = result
-    
-    return results
-
-def main():
-    """
-    主函数
-    """
-    # 检查命令行参数
-    if len(sys.argv) < 2:
-        print("使用方法: python predict.py <csv文件路径1> [csv文件路径2] ...")
-        return
-    
-    # 加载模型
-    model = load_model()
-    if model is None:
-        return
-    
-    # 批量预测
-    csv_paths = sys.argv[1:]
-    batch_predict(model, csv_paths)
-
-if __name__ == "__main__":
-    main() 
--- a/ml/train_model.py
+++ b/ml/train_model.py
@@ -1,264 +1,117 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
 import pandas as pd
 import numpy as np
 import xgboost as xgb
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
-import matplotlib.pyplot as plt
-import seaborn as sns
-import os
-import joblib
+from sklearn.metrics import accuracy_score
+import m2cgen as m2c
+from xgboost import XGBClassifier
+import csv

-def load_data(malware_csv, whitelist_csv):
-    """
-    加载恶意软件和白名单CSV文件
-    """
-    print(f"加载恶意软件数据: {malware_csv}")
-    
-    # 预处理：先获取CSV的列数
-    # 读取第一行以确定正确的列数
-    try:
-        header = pd.read_csv(malware_csv, nrows=1)
-        expected_columns = len(header.columns)
-        print(f"预期列数: {expected_columns}")
-        
-        # 使用自定义函数读取CSV，处理字段不足的行
-        malware_df = pd.read_csv(
-            malware_csv, 
-            header=0,
-            low_memory=False,
-            on_bad_lines='skip',  # 跳过无法解析的行
-            dtype=float,          # 将所有数据列转为浮点型
-            converters={0: str}   # 第一列为文件路径，保持为字符串类型
-        )
-        
-        # 检查列数是否不足，如果不足则填充0
-        actual_columns = len(malware_df.columns)
-        if actual_columns < expected_columns:
-            for i in range(actual_columns, expected_columns):
-                col_name = f"col_{i}"
-                malware_df[col_name] = 0.0
-                
-        print(f"成功读取恶意软件数据，形状: {malware_df.shape}")
-    except Exception as e:
-        print(f"读取恶意软件数据时出错: {e}")
-        return None, None
-    
-    malware_df['label'] = 1  # 恶意软件标签为1
-    
-    print(f"加载白名单数据: {whitelist_csv}")
-    try:
-        # 同样处理白名单数据
-        whitelist_df = pd.read_csv(
-            whitelist_csv, 
-            header=0,
-            low_memory=False,
-            on_bad_lines='skip',
-            dtype=float,
-            converters={0: str}
-        )
-        
-        # 确保列数与恶意软件数据一致
-        whitelist_cols = len(whitelist_df.columns)
-        malware_cols = len(malware_df.columns) - 1  # 减去标签列
-        
-        if whitelist_cols < malware_cols:
-            for i in range(whitelist_cols, malware_cols):
-                col_name = f"col_{i}"
-                whitelist_df[col_name] = 0.0
-                
-        print(f"成功读取白名单数据，形状: {whitelist_df.shape}")
-    except Exception as e:
-        print(f"读取白名单数据时出错: {e}")
-        return None, None
-        
-    whitelist_df['label'] = 0  # 白名单软件标签为0
-    
-    # 确保两个DataFrame的列完全一致（除了可能的文件路径差异）
-    malware_features = set(malware_df.columns)
-    whitelist_features = set(whitelist_df.columns)
-    
-    # 找出不同的列
-    malware_only = malware_features - whitelist_features
-    whitelist_only = whitelist_features - malware_features
-    
-    # 为缺少的列添加0值
-    for col in malware_only:
-        if col != 'label':
-            whitelist_df[col] = 0.0
-            
-    for col in whitelist_only:
-        if col != 'label':
-            malware_df[col] = 0.0
-    
-    # 合并数据
-    combined_df = pd.concat([malware_df, whitelist_df], ignore_index=True, sort=False)
-    
-    # 第一列通常是文件路径，需要将其移除
-    # 先保存文件路径以便后续参考
-    file_paths = combined_df.iloc[:, 0].tolist()
-    
-    features = combined_df.iloc[:, 1:-1]  # 除去第一列(文件路径)和最后一列(标签)
-    labels = combined_df['label']
-    
-    print(f"数据加载完成: {len(malware_df)} 个恶意样本, {len(whitelist_df)} 个白名单样本")
-    print(f"特征维度: {features.shape}")
-    
-    return features, labels
+malware_csv = 'data/malware_features.csv'
+whitelist_csv = 'data/whitelist_features.csv'

-def train_xgboost_model(X_train, y_train, X_test, y_test):
-    """
-    训练XGBoost模型
-    """
-    print("开始训练XGBoost模型...")
+# 手动读取CSV文件并自动填充缺失字段
+def read_csv_with_padding(file_path):
+    print(f"开始读取 {file_path}...")
+    max_cols = 0
+    rows = []
    
-    # 处理数据中可能存在的NaN值
-    print("检查并填充缺失值...")
-    X_train = X_train.fillna(0)
-    X_test = X_test.fillna(0)
+    # 首先确定最大列数
+    with open(file_path, 'r', encoding='latin1', errors='replace') as f:
+        csv_reader = csv.reader(f)
+        for row in csv_reader:
+            max_cols = max(max_cols, len(row))
+            rows.append(row)
    
-    # 检查是否还有无限值，并将其替换为0
-    X_train = X_train.replace([np.inf, -np.inf], 0)
-    X_test = X_test.replace([np.inf, -np.inf], 0)
+    print(f"文件 {file_path} 最大列数: {max_cols}")
    
-    print(f"处理后的训练数据形状: {X_train.shape}")
-    print(f"处理后的测试数据形状: {X_test.shape}")
+    # 为每一行填充缺失的字段
+    padded_rows = []
+    for row in rows:
+        # 如果行长度小于最大列数，用'0'填充
+        padded_row = row + ['0'] * (max_cols - len(row))
+        padded_rows.append(padded_row)
    
-    # 设置XGBoost参数
-    params = {
-        'max_depth': 6,               # 树的最大深度
-        'learning_rate': 0.1,         # 学习率
-        'n_estimators': 100,          # 树的数量
-        'objective': 'binary:logistic', # 二分类问题
-        'eval_metric': 'logloss',     # 评估指标
-        'subsample': 0.8,             # 样本采样率
-        'colsample_bytree': 0.8,      # 特征采样率
-        'random_state': 42            # 随机种子
-    }
-    
-    # 创建XGBoost分类器
-    model = xgb.XGBClassifier(**params)
-    
-    # 训练模型
-    model.fit(
-        X_train, y_train,
-        eval_set=[(X_train, y_train), (X_test, y_test)],
-        early_stopping_rounds=10,
-        verbose=True
-    )
-    
-    print("模型训练完成！")
-    return model
+    # 转换为DataFrame
+    df = pd.DataFrame(padded_rows)
+    print(f"读取 {file_path} 完成，形状: {df.shape}")
+    return df

-def evaluate_model(model, X_test, y_test):
-    """
-    评估模型性能
-    """
-    print("评估模型性能...")
-    
-    # 在测试集上进行预测
-    y_pred = model.predict(X_test)
-    
-    # 计算准确率
-    accuracy = accuracy_score(y_test, y_pred)
-    print(f"准确率: {accuracy:.4f}")
-    
-    # 打印分类报告
-    print("\n分类报告:")
-    print(classification_report(y_test, y_pred, target_names=['白名单', '恶意软件']))
-    
-    # 打印混淆矩阵
-    cm = confusion_matrix(y_test, y_pred)
-    plt.figure(figsize=(8, 6))
-    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
-                xticklabels=['白名单', '恶意软件'], 
-                yticklabels=['白名单', '恶意软件'])
-    plt.xlabel('预测')
-    plt.ylabel('实际')
-    plt.title('混淆矩阵')
-    plt.savefig('confusion_matrix.png')
-    plt.close()
-    
-    # 显示特征重要性
-    plt.figure(figsize=(12, 8))
-    xgb.plot_importance(model, max_num_features=20)
-    plt.title('特征重要性')
-    plt.savefig('feature_importance.png')
-    plt.close()
-    
-    return accuracy
+# 读取CSV文件
+malware_data = read_csv_with_padding(malware_csv)
+whitelist_data = read_csv_with_padding(whitelist_csv)

-def save_model(model, output_path='xgboost_malware_detector.model'):
-    """
-    保存模型到文件
-    """
-    print(f"保存模型到 {output_path}")
-    joblib.dump(model, output_path)
-    print("模型保存完成！")
+# 删除第一列（路径列）
+malware_data = malware_data.iloc[:, 1:]
+whitelist_data = whitelist_data.iloc[:, 1:]

-def main():
-    """
-    主函数：加载数据，训练模型，评估结果，保存模型
-    """
-    try:
-        print("开始恶意软件检测模型训练...")
-        
-        # 设置文件路径
-        malware_csv = 'data/malware_features.csv'
-        whitelist_csv = 'data/whitelist_features.csv'
-        
-        # 检查文件是否存在
-        if not os.path.exists(malware_csv):
-            print(f"错误: 找不到恶意软件特征文件 {malware_csv}")
-            return
-            
-        if not os.path.exists(whitelist_csv):
-            print(f"错误: 找不到白名单特征文件 {whitelist_csv}")
-            return
-        
-        # 加载数据
-        X, y = load_data(malware_csv, whitelist_csv)
-        
-        if X is None or y is None:
-            print("数据加载失败，终止训练")
-            return
-            
-        print(f"数据集加载完成，共 {len(X)} 个样本")
-        
-        # 数据划分
-        try:
-            X_train, X_test, y_train, y_test = train_test_split(
-                X, y, test_size=0.2, random_state=42, stratify=y)
-            
-            print(f"训练集: {len(X_train)} 样本，测试集: {len(X_test)} 样本")
-        except Exception as e:
-            print(f"数据划分出错: {e}")
-            return
-        
-        # 训练模型
-        try:
-            model = train_xgboost_model(X_train, y_train, X_test, y_test)
-        except Exception as e:
-            print(f"模型训练出错: {e}")
-            return
-        
-        # 评估模型
-        try:
-            evaluate_model(model, X_test, y_test)
-        except Exception as e:
-            print(f"模型评估出错: {e}")
-        
-        # 保存模型
-        try:
-            save_model(model)
-            print("模型训练和评估完成！")
-        except Exception as e:
-            print(f"模型保存出错: {e}")
-        
-    except Exception as e:
-        print(f"训练过程中发生未预期错误: {e}")
+# 将所有列转换为数值类型，非数值将转为NaN
+for col in malware_data.columns:
+    malware_data[col] = pd.to_numeric(malware_data[col], errors='coerce')
+for col in whitelist_data.columns:
+    whitelist_data[col] = pd.to_numeric(whitelist_data[col], errors='coerce')

-if __name__ == "__main__":
-    main() 
+# 用0填充NaN值
+malware_data.fillna(0, inplace=True)
+whitelist_data.fillna(0, inplace=True)
+
+# 找到最大列数（最长的特征向量）
+max_cols = max(malware_data.shape[1], whitelist_data.shape[1])
+
+# 用 0 填充（Padding）数据，使所有样本的列数相同
+malware_data = malware_data.reindex(columns=range(max_cols), fill_value=0)
+whitelist_data = whitelist_data.reindex(columns=range(max_cols), fill_value=0)
+
+# 添加标签
+malware_data['label'] = 1  # 恶意软件
+whitelist_data['label'] = 0  # 白名单（正常）
+print(malware_data.head())
+print(whitelist_data.head())
+
+# 合并数据
+combined_data = pd.concat([malware_data, whitelist_data], ignore_index=True)
+print(f"合并后数据形状: {combined_data.shape}")
+
+# 分离特征和标签
+X = combined_data.drop('label', axis=1)
+y = combined_data['label']
+
+# 分割数据集
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+print(f"训练集形状: {X_train.shape}, 测试集形状: {X_test.shape}")
+
+# 创建 XGBoost 数据集
+dtrain = xgb.DMatrix(X_train, label=y_train)
+dtest = xgb.DMatrix(X_test, label=y_test)
+
+# 训练 XGBoost 模型
+num_rounds = 30
+# 创建watchlist来监控训练和验证集的性能
+watchlist = [(dtrain, '训练集'), (dtest, '验证集')]
+pos_ratio = np.mean(y_train)  # 计算 1 的比例
+
+clf = XGBClassifier(
+    base_score=pos_ratio,  #
+
+    objective='binary:logistic',  # 适用于二分类
+    max_depth=6,  # 树的最大深度
+    learning_rate=0.1,  # 学习率
+    n_estimators=100,  # 迭代轮数
+    subsample=0.8,  # 采样比例，防止过拟合
+    colsample_bytree=0.8,
+    use_label_encoder=False,  # 关闭 XGBoost 的 label 编码 (适用于新版本)
+    eval_metric='logloss'  # 交叉熵损失
+)
+clf.fit(X_train, y_train)
+
+# 预测
+y_pred_prob = clf.predict(X_test)
+y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_prob]
+
+# 计算准确率
+accuracy = accuracy_score(y_test, y_pred)
+print(f'XGBoost 分类准确率: {accuracy:.4f}')
+code = m2c.export_to_c(clf)
+output_file = "malware_detector.cpp"
+with open(output_file, "w") as f:
+    f.write(code)
--- a/ml/xgboost_malware_detector.model
+++ b/ml/xgboost_malware_detector.model
--- a/readme.md
+++ b/readme.md
@@ -1,118 +1,140 @@
-# PE文件恶意软件检测系统
+## Preface

-这是一个基于机器学习的PE文件恶意软件检测系统，使用XGBoost算法对PE文件进行分类。
+**key08 Security** has surpassed **3,000 followers**, meaning that a significant portion of cybersecurity professionals in China are keeping an eye on it. So, it's time for a big project.

-## 功能特点
+### Why This Project?
+While working in the domestic cybersecurity field, I realized that **the overall technical level still has a lot of room to grow**. Many people working in cybersecurity might also be interested in how the **security software** on their computers actually works. Additionally, some might even dream of developing their **own antivirus software** or see it as a long-term goal.

- 利用PE文件结构特征进行恶意软件检测
- 基于XGBoost机器学习算法
- 提供训练和预测功能
- 输出详细的分类报告和可视化结果
+So, I felt there was a need to systematically **document the working principles of an antivirus engine**. While working on this, I noticed that the **information available online is close to zero**. The few available sources only describe outdated technologies like **signature-based scanning and cloud antivirus from before 2006**. Antivirus software seems to be treated like a **black box**.

-## 系统架构
+To **systematically educate**, rather than spread **misinformation or meme-based security practices** like some other public security accounts, I spent **two days** developing an antivirus engine that aligns with **modern security practices (as of 2025)**.

-该系统包含以下组件：
+Now, I will explain **how it works, what its weaknesses are**, and at the end of the chapter, I will even **open-source the code**, which can be **compiled directly using Visual Studio**, making **learning more convenient**.

-1. **特征提取模块**：C++编写的特征提取器，分析PE文件结构和行为特征
-2. **训练模块**：Python编写的模型训练代码，使用XGBoost算法
-3. **预测模块**：Python编写的模型推理代码，用于检测未知文件
+> ⚠️ **WARNING:** This code is provided **for learning purposes only**. The **datasets for machine learning, signature analysis, and dynamic behavior detection are extremely small**, so **detection effectiveness is very limited**.
+> 
+> **Do not use this code for your "bypass AV" tests** and then complain that it fails to detect certain samples. This is **not intended for antivirus evasion testing**.
+> **If you want to improve it, study the issues yourself instead of copying and pasting the code and then asking why it doesn't work!**

-## 特征集
+---

-系统从PE文件中提取以下特征：
+## Classification of Antivirus Engines
+Currently, all major security vendors promote their so-called **NGAV (Next-Gen Antivirus)**, but in reality, most detection engines fall into these four categories:

-1. PE段属性 (是否有配置、调试信息、例外处理、导出、导入等)
-2. 导入的DLL库
-3. 文件熵
-4. 入口点前64字节的归一化值
-5. 节区信息 (节区数量、平均熵、最大熵、归一化平均熵、大小比率)
-6. 代码段与整个文件的比率
-7. 节区数量
+1. **Cloud-Based Detection**
+   - This includes:
+     - **Fuzzy hashing engines** (such as `ssdeep`, `simhash`, etc.), which are used to **compare the similarity of files** (some vendors call this **"virus DNA"**).
+     - **Traditional hash-based engines**, which rely on **SHA1, SHA256**, etc.
+     - **Various cloud-based sandbox, manual or automated analysis systems**.

-## 环境要求
+2. **Signature-Based Detection**
+3. **AI & Machine Learning-Based Detection**
+4. **Heuristic-Based Sandbox Detection**

- Python 3.7+
- 依赖包：
-  - pandas
-  - numpy
-  - xgboost
-  - scikit-learn
-  - matplotlib
-  - seaborn
-  - joblib
+Cloud-based engines are **extremely complex** and are typically a **core capability of each security company**, so **we won't discuss their implementation here** (except for those who simply use **VirusTotal (VT) as their cloud engine**). 

-安装依赖：
+That leaves **categories 2, 3, and 4**, which are typically combined in AV solutions.

-```bash
-pip install pandas numpy xgboost scikit-learn matplotlib seaborn joblib
-```
+Each has its own strengths and weaknesses:
+- **Signature-Based Detection**: Does **not** have heuristic capabilities and **fully relies on manual rule creation**, but it is the **most effective**. Each security vendor's detection capabilities **heavily rely on their signature database**.
+- **Heuristic-Based Sandbox Detection**: Has **weak detection capabilities**, is **easily bypassed**, and **lags behind evolving threats**. It also tends to generate **false positives**.
+- **AI/Machine Learning-Based Detection**: Provides **high detection rates** but also produces **high false positive rates**, often **negatively impacting business operations** (e.g., compiling a simple **Hello World!** application in **Visual Studio** might trigger an alert). **Many AI-based engines are overly aggressive** and flag almost anything **without a digital signature**.

-## 使用说明
+---

-### 1. 准备数据
+## What Are We Going to Build?
+Today, we will create **a combined Machine Learning + Behavior-Based Sandbox Engine**.

-需要准备两个CSV文件：
- `malware.csv`：恶意软件样本的特征数据
- `whitelist.csv`：正常软件样本的特征数据
+We are **not** implementing a **signature-based engine** because that would be **too simple** (if you're interested in signature matching, check out **YARA**).

-这些CSV文件由C++特征提取模块生成。
+The overall engine structure is as follows:
+![](https://key08.com/usr/uploads/2025/03/926716651.png)

-### 2. 训练模型
+We need to implement **two core modules**:
+1. **Sandbox Behavior Analysis Module**
+2. **Machine Learning-Based Detection Module**

-运行以下命令进行模型训练：
+We will **introduce each module step by step**.

-```bash
-python train_model.py
-```
+---

-训练结果将保存为`xgboost_malware_detector.model`文件，并生成性能评估图表：
- `confusion_matrix.png`：混淆矩阵
- `feature_importance.png`：特征重要性排序
+## Sandbox Module
+A **sandbox module** is typically used for **unpacking and behavior analysis**. Essentially, it is a **PE file emulator**.

-### 3. 预测未知文件
+In our system, we use **Unicorn Engine** to **simulate CPU execution**. **Unicorn Engine** is a **lightweight**, **cross-platform** CPU emulation framework that **supports multiple architectures**, including **MIPS, ARM, PowerPC, x86, and x64**. It is based on **QEMU** and was first introduced at **Black Hat USA 2015** by **Nguyen Anh Quynh and Dang Hoang Vu**.

-使用训练好的模型预测未知文件：
+### Main Steps of the Sandbox:
+1. **Initialize the Emulation Environment**
+   - Relocate PE file sections
+   - Setup stack memory
+   - Initialize `Unicorn Engine` and allocate virtual memory
+   - Map the PE file into the virtual environment
+   - Load required DLLs into the virtual machine
+   - Hook critical DLL functions to monitor behavior
+   - Set up essential handles, stack, **PEB**, **TEB**, etc.
+   - Store important PE metadata for unpacking

-```bash
-python predict.py <csv文件路径1> [csv文件路径2] ...
-```
+2. **Relocation Processing**
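+   - If a **PE file contains a relocation table** and is loaded at an address other than its preferred `ImageBase`, the Windows loader **patches the absolute addresses** in the image before execution; the sandbox must apply the same fix-ups.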

-预测结果将保存为`*_predictions.csv`文件。
+3. **Memory and Stack Allocation**
+   - The **stack memory** must be fully emulated for the execution environment.

-## 示例
+4. **Mapping PE Sections into Memory**
+   - A **PE file's size on disk differs from its actual size when loaded in memory**.
+   - We must **expand** it and **map each section accordingly**.

-```bash
-# 训练模型
-python train_model.py
+5. **Load Required DLLs**
+   - **Parse the Import Table** and **map necessary DLLs** into our virtual machine.

-# 预测单个文件
-python predict.py unknown_samples.csv
+6. **Intercept API Calls**
+   - Hook **imported API functions**.

-# 批量预测多个文件
-python predict.py file1.csv file2.csv file3.csv
-```
+7. **Shellcode & Packed Malware Detection**
+   - Monitor for **self-modifying code execution**, which indicates **packed malware**.

-## 性能指标
+8. **Behavior-Based Detection**
+   - Detect suspicious behavior, such as:
+     - **Downloading executable files via `WinHttp`**
+     - **Excessive `sleep` delays**
+     - **Accessing sensitive directories**
+     - **Direct access to `LDR` structures** (used to detect stealth malware)

-在测试数据集上，该系统通常能达到以下性能：
+### Sandbox Performance:
+Here’s an example detection result:
+![](https://key08.com/usr/uploads/2025/03/408250478.png)

- 准确率：95%+
- 召回率：90%+
- 精确率：92%+
- F1值：91%+
+---

-_注意：实际性能可能因训练数据和参数设置而异。_
+## Machine Learning Module
+The **machine learning module** is used to classify files based on extracted PE features.

-## 扩展与优化
+### Feature Engineering:
+We extract the following feature sets:
+1. **PE Header Features** (Presence of Import Tables, TLS sections, relocations, etc.)
+2. **Imported DLLs** (Checks for specific suspicious DLLs)
+3. **File Entropy** (Measures randomness)
+4. **Entry Point Byte Sequence** (Examines the first 64 bytes at the entry point)
+5. **Section Analysis** (Checks PE section sizes and entropy)
+6. **Code-to-File Ratio** (Compares code section size vs. total PE file size)

-系统可以进行以下扩展和优化：
+### Training Data:
+We collected **1,000 benign samples** and **1,000 malicious samples**, saved their features into a **CSV file**, and used them for training.

-1. 添加更多特征，如字符串分析、API调用序列等
-2. 尝试其他机器学习算法或深度学习模型
-3. 集成多个模型进行综合决策
-4. 开发实时监控和检测功能
-5. 增加可解释性分析
+![](https://key08.com/usr/uploads/2025/03/1410311475.png)

-## License
+> ⚠️ **NOTE:** The dataset is **too small** for real-world performance. A proper dataset should have at least **100,000+ benign and 100,000+ malicious samples**.

-MIT
+### Model Training:
+We use **XGBoost** for training and then export the trained model to **pure C++ code** using **m2cgen**.
+
+![](https://key08.com/usr/uploads/2025/03/358391058.png)
+
+---
+
+## Conclusion
+This is a **basic but modern antivirus engine** using **sandbox-based behavior analysis** and **machine learning-based detection**.
+
+The **full source code** is available on **GitHub** (link below). 🚀
+
+🔗 **GitHub Repository:** [INSERT LINK HERE]