Update project documentation and enhance malware detection engine
- Completely rewrite README.md with comprehensive project overview and technical details - Add detailed explanation of antivirus engine architecture and detection strategies - Implement multi-stage malware detection with machine learning, sandbox, and PE structure analysis - Update project configuration and add new source files for enhanced detection capabilities - Integrate XGBoost machine learning model with C++ export functionality - Improve sandbox environment with advanced module and LDR data table handling - Remove legacy Python prediction and training scripts in favor of C++ implementation
This commit is contained in:
15
.vscode/settings.json
vendored
15
.vscode/settings.json
vendored
@@ -58,6 +58,19 @@
|
||||
"xutility": "cpp",
|
||||
"functional": "cpp",
|
||||
"array": "cpp",
|
||||
"numeric": "cpp"
|
||||
"numeric": "cpp",
|
||||
"charconv": "cpp",
|
||||
"chrono": "cpp",
|
||||
"filesystem": "cpp",
|
||||
"format": "cpp",
|
||||
"forward_list": "cpp",
|
||||
"locale": "cpp",
|
||||
"mutex": "cpp",
|
||||
"optional": "cpp",
|
||||
"ratio": "cpp",
|
||||
"stop_token": "cpp",
|
||||
"thread": "cpp",
|
||||
"xlocbuf": "cpp",
|
||||
"xlocmes": "cpp"
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,13 @@
|
||||
//
|
||||
|
||||
#include "head.h"
|
||||
// Identifies which detection stage, if any, flagged a sample.
// The numeric values match the implicit ordering of the enumerators.
enum class DetectEngineType {
    kNone = 0,             // no stage flagged the sample
    kMachineLearning = 1,  // flagged by the machine-learning score
    kSandbox = 2,          // flagged by sandbox emulation
    kPeStruct = 3,         // flagged by static PE-structure checks
    kYaraScan = 4          // NOTE(review): not returned by any code visible here
};
|
||||
|
||||
auto getPeInfo(std::string inputFilePath) -> std::shared_ptr<BasicPeInfo> {
|
||||
auto sampleInfo = std::make_shared<BasicPeInfo>();
|
||||
@@ -10,6 +17,9 @@ auto getPeInfo(std::string inputFilePath) -> std::shared_ptr<BasicPeInfo> {
|
||||
sampleInfo->peBuffer =
|
||||
peconv::load_pe_module((const char*)sampleInfo->inputFilePath.c_str(),
|
||||
sampleInfo->peSize, false, false);
|
||||
if (sampleInfo->peBuffer == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
sampleInfo->ntHead64 = peconv::get_nt_hdrs64((BYTE*)sampleInfo->peBuffer);
|
||||
sampleInfo->ntHead32 = peconv::get_nt_hdrs32((BYTE*)sampleInfo->peBuffer);
|
||||
sampleInfo->isX64 = peconv::is64bit((BYTE*)sampleInfo->peBuffer);
|
||||
@@ -39,6 +49,7 @@ auto getPeInfo(std::string inputFilePath) -> std::shared_ptr<BasicPeInfo> {
|
||||
sampleInfo->peSize = (sampleInfo->peSize + 0xFFF) & ~0xFFF;
|
||||
return sampleInfo;
|
||||
}
|
||||
// 搜集恶意软件特征的.
|
||||
int doMl(int argc, char* argv[]) {
|
||||
// 检查命令行参数
|
||||
if (argc < 3) {
|
||||
@@ -98,31 +109,210 @@ int doMl(int argc, char* argv[]) {
|
||||
}
|
||||
return 0;
|
||||
};
|
||||
int main(int argc, char* argv[]) {
|
||||
doMl(argc, argv);
|
||||
/*
|
||||
auto sampleInfo = getPeInfo(
|
||||
"E:\\对战平台\\CrowAntiCheat\\CrowAntiCheat\\client\\Console_"
|
||||
"Test\\Release\\Console_Test.exe");
|
||||
// auto sampleInfo = getPeInfo("C:\\ConsoleApplication1.exe");
|
||||
printf("input new file %s \n", sampleInfo->inputFilePath);
|
||||
printf("is x64: %d\n", sampleInfo->isX64);
|
||||
printf("is relocated: %d\n", sampleInfo->isRelocated);
|
||||
printf("RecImageBase: %llx\n", sampleInfo->RecImageBase);
|
||||
auto sandbox = std::make_shared<Sandbox>();
|
||||
sandbox->InitEnv(sampleInfo);
|
||||
sandbox->Run();
|
||||
auto [peBuffer, peSize] = sandbox->DumpPE();
|
||||
int doPredict(int argc, char* argv[]) {
|
||||
if (argc < 2) {
|
||||
std::cout << "用法: " << argv[0] << " <文件路径>" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
std::string filePath = argv[1];
|
||||
MachineLearning ml;
|
||||
double score = 1 - ml.PredictMalwareFromFile(filePath);
|
||||
if (score >= 0) {
|
||||
std::cout << "文件 " << filePath << " 的恶意软件得分: " << score
|
||||
<< std::endl;
|
||||
if (score > 0.5) {
|
||||
std::cout << "警告: 这个文件可能是恶意软件!" << std::endl;
|
||||
} else {
|
||||
std::cout << "这个文件可能是安全的。" << std::endl;
|
||||
}
|
||||
} else {
|
||||
std::cout << "无法分析文件。" << std::endl;
|
||||
}
|
||||
}
|
||||
class PeStructAnalyzer {
|
||||
public:
|
||||
PeStructAnalyzer() = default;
|
||||
~PeStructAnalyzer() = default;
|
||||
|
||||
if (peBuffer) {
|
||||
printf("peBuffer: %p\n", peBuffer.get());
|
||||
printf("peSize: %d\n", peSize);
|
||||
// peconv::dump_to_file("z:\\dumped_main.exe", peBuffer.get(), peSize);
|
||||
MachineLearning ml;
|
||||
ml.ExtractFeatures(peBuffer.get(), peSize);
|
||||
}
|
||||
peBuffer.release();
|
||||
*/
|
||||
system("pause");
|
||||
bool AnalyzePe(const std::shared_ptr<BasicPeInfo>& peInfo) {
|
||||
if (!peInfo || !peInfo->peBuffer) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool isSuspicious = false;
|
||||
|
||||
// 检查导入表
|
||||
if (HasNoImports(peInfo)) {
|
||||
std::cout << "警告: 未发现导入表,这是一个可疑特征" << std::endl;
|
||||
isSuspicious = true;
|
||||
}
|
||||
|
||||
// 检查节表异常
|
||||
auto [hasSuspiciousSections, suspiciousReason] =
|
||||
AnalyzeSections(peInfo);
|
||||
if (hasSuspiciousSections) {
|
||||
std::cout << "警告: " << suspiciousReason << std::endl;
|
||||
isSuspicious = true;
|
||||
}
|
||||
|
||||
return isSuspicious;
|
||||
}
|
||||
|
||||
private:
|
||||
// Thresholds used by the section heuristics of this class.
static constexpr DWORD MAX_REASONABLE_SECTION_COUNT = 20; // maximum plausible number of sections
|
||||
static constexpr DWORD MAX_EXECUTABLE_SECTIONS = 3; // maximum number of executable sections
|
||||
static constexpr DWORD MAX_SECTION_SIZE = 0x10000000; // 256MB
|
||||
static constexpr DWORD SECTION_ALIGNMENT = 0x1000; // 4KB alignment
|
||||
// NOTE(review): no code visible in this chunk reads this constant — confirm it is still needed.
static constexpr DWORD SUSPICIOUS_ENTROPY_THRESHOLD = 7; // entropy threshold
|
||||
|
||||
bool HasNoImports(const std::shared_ptr<BasicPeInfo>& peInfo) {
|
||||
PIMAGE_DATA_DIRECTORY importDir = nullptr;
|
||||
if (peInfo->isX64) {
|
||||
importDir = &peInfo->ntHead64->OptionalHeader
|
||||
.DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT];
|
||||
} else {
|
||||
importDir = &peInfo->ntHead32->OptionalHeader
|
||||
.DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT];
|
||||
}
|
||||
|
||||
return (importDir->VirtualAddress == 0 || importDir->Size == 0);
|
||||
}
|
||||
|
||||
std::pair<bool, std::string> AnalyzeSections(
|
||||
const std::shared_ptr<BasicPeInfo>& peInfo) {
|
||||
PIMAGE_SECTION_HEADER firstSection = nullptr;
|
||||
WORD numberOfSections = 0;
|
||||
|
||||
if (peInfo->isX64) {
|
||||
firstSection = IMAGE_FIRST_SECTION(peInfo->ntHead64);
|
||||
numberOfSections = peInfo->ntHead64->FileHeader.NumberOfSections;
|
||||
} else {
|
||||
firstSection = IMAGE_FIRST_SECTION(peInfo->ntHead32);
|
||||
numberOfSections = peInfo->ntHead32->FileHeader.NumberOfSections;
|
||||
}
|
||||
|
||||
// 检查区段数量是否异常
|
||||
if (numberOfSections > MAX_REASONABLE_SECTION_COUNT) {
|
||||
return {true, "区段数量异常: " + std::to_string(numberOfSections) +
|
||||
" > " +
|
||||
std::to_string(MAX_REASONABLE_SECTION_COUNT)};
|
||||
}
|
||||
|
||||
// 统计可执行区段数量
|
||||
int executableSections = 0;
|
||||
bool hasWritableExecutableSection = false;
|
||||
bool hasZeroSizedSection = false;
|
||||
bool hasOversizedSection = false;
|
||||
bool hasMisalignedSection = false;
|
||||
|
||||
for (WORD i = 0; i < numberOfSections; i++) {
|
||||
const auto& section = firstSection[i];
|
||||
|
||||
// 检查区段属性
|
||||
if (section.Characteristics & IMAGE_SCN_MEM_EXECUTE) {
|
||||
executableSections++;
|
||||
|
||||
// 检查是否同时具有可写和可执行属性
|
||||
if (section.Characteristics & IMAGE_SCN_MEM_WRITE) {
|
||||
hasWritableExecutableSection = true;
|
||||
}
|
||||
}
|
||||
|
||||
// 检查区段大小
|
||||
if (section.SizeOfRawData == 0 && section.Misc.VirtualSize > 0) {
|
||||
hasZeroSizedSection = true;
|
||||
}
|
||||
|
||||
if (section.SizeOfRawData > MAX_SECTION_SIZE) {
|
||||
hasOversizedSection = true;
|
||||
}
|
||||
|
||||
// 检查对齐
|
||||
if (section.VirtualAddress % SECTION_ALIGNMENT != 0) {
|
||||
hasMisalignedSection = true;
|
||||
}
|
||||
}
|
||||
|
||||
// 返回检测结果
|
||||
if (executableSections > MAX_EXECUTABLE_SECTIONS) {
|
||||
return {true, "可执行区段数量过多: " +
|
||||
std::to_string(executableSections)};
|
||||
}
|
||||
|
||||
if (hasWritableExecutableSection) {
|
||||
return {true, "发现同时具有可写和可执行属性的区段"};
|
||||
}
|
||||
|
||||
if (hasZeroSizedSection) {
|
||||
return {true, "发现大小异常的区段"};
|
||||
}
|
||||
|
||||
if (hasOversizedSection) {
|
||||
return {true, "发现过大的区段"};
|
||||
}
|
||||
|
||||
if (hasMisalignedSection) {
|
||||
return {true, "发现未正确对齐的区段"};
|
||||
}
|
||||
|
||||
return {false, ""};
|
||||
}
|
||||
};
|
||||
|
||||
class DetectEngine {
|
||||
public:
|
||||
DetectEngine();
|
||||
~DetectEngine();
|
||||
DetectEngineType DetectMalware(std::string filePath);
|
||||
};
|
||||
DetectEngine::DetectEngine() {}
|
||||
DetectEngine::~DetectEngine() {}
|
||||
// Scans one file with the detection stages in order: static PE-structure
// checks, the machine-learning model, then sandbox emulation.
//
// Returns the stage that flagged the file, or DetectEngineType::kNone when
// the file could not be loaded as a PE image or no stage flagged it.
DetectEngineType DetectEngine::DetectMalware(std::string filePath) {
    auto peInfo = getPeInfo(filePath);
    if (peInfo == nullptr) {
        return DetectEngineType::kNone;
    }

    // Stage 1: PE structure analysis.
    PeStructAnalyzer peAnalyzer;
    if (peAnalyzer.AnalyzePe(peInfo)) {
        return DetectEngineType::kPeStruct;
    }

    // Stage 2: machine learning. PredictMalwareFromFile yields the
    // probability that the file is benign, or a negative sentinel on
    // failure. The sentinel is checked before inverting so that a failed
    // prediction (1 - (-1.0) == 2.0) is not reported as a detection.
    MachineLearning ml;
    const double benignProbability = ml.PredictMalwareFromFile(filePath);
    if (benignProbability >= 0) {
        const double score = 1 - benignProbability;
        if (score >= 0) {
            printf("machine learning score: %f\n", score);
            if (score > 0.5) {
                return DetectEngineType::kMachineLearning;
            }
        }
    }

    // Stage 3: sandbox emulation.
    Sandbox se;
    se.InitEnv(peInfo);
    se.Run();
    const MalwareAnalysisType verdict = se.GetMalwareAnalysisType();
    if (verdict == MalwareAnalysisType::kSuspicious ||
        verdict == MalwareAnalysisType::kMalware) {
        return DetectEngineType::kSandbox;
    }

    return DetectEngineType::kNone;
}
|
||||
auto doMalwareScan(int argc, char* argv[]) -> void {
|
||||
DetectEngine scanner;
|
||||
if (argc < 2) {
|
||||
std::cout << "用法: " << argv[0] << " <文件路径>" << std::endl;
|
||||
return;
|
||||
}
|
||||
std::string filePath = argv[1];
|
||||
auto sampleType = scanner.DetectMalware(filePath);
|
||||
printf("sample type: %d \n", sampleType);
|
||||
}
|
||||
// Program entry point. Forwards the command line to doMalwareScan and always
// exits with status 0; the doMl and doPredict entry points are left
// commented out.
int main(int argc, char* argv[]) {
|
||||
// doMl(argc, argv);
|
||||
// doPredict(argc, argv);
|
||||
doMalwareScan(argc, argv);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -142,6 +142,7 @@
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\ml\malware_detector.cpp" />
|
||||
<ClCompile Include="ai_anti_malware.cpp" />
|
||||
<ClCompile Include="libpeconv\libpeconv\src\buffer_util.cpp" />
|
||||
<ClCompile Include="libpeconv\libpeconv\src\caves.cpp" />
|
||||
@@ -173,15 +174,21 @@
|
||||
<ClCompile Include="ml.cpp" />
|
||||
<ClCompile Include="sandbox.cpp" />
|
||||
<ClCompile Include="sandbox_api_emu.cpp" />
|
||||
<ClCompile Include="sandbox_api_regs.cpp" />
|
||||
<ClCompile Include="sandbox_api_stl.cpp" />
|
||||
<ClCompile Include="sandbox_api_winhttp.cpp" />
|
||||
<ClCompile Include="sandbox_callbacks.cpp" />
|
||||
<ClCompile Include="sandbox_malware_check.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\ml\malware_detector.h" />
|
||||
<ClInclude Include="head.h" />
|
||||
<ClInclude Include="libpeconv\libpeconv\src\fix_dot_net_ep.h" />
|
||||
<ClInclude Include="libpeconv\libpeconv\src\ntddk.h" />
|
||||
<ClInclude Include="ml.h" />
|
||||
<ClInclude Include="native_struct.h" />
|
||||
<ClInclude Include="sandbox.h" />
|
||||
<ClInclude Include="sandbox_api_winhttp.h" />
|
||||
<ClInclude Include="sandbox_callbacks.h" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
|
||||
@@ -126,6 +126,21 @@
|
||||
<ClCompile Include="ml.cpp">
|
||||
<Filter>源文件\machine_learning</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\ml\malware_detector.cpp">
|
||||
<Filter>源文件\machine_learning</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sandbox_malware_check.cpp">
|
||||
<Filter>源文件\sandbox</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sandbox_api_regs.cpp">
|
||||
<Filter>源文件\sandbox</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sandbox_api_stl.cpp">
|
||||
<Filter>源文件\sandbox</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sandbox_api_winhttp.cpp">
|
||||
<Filter>源文件\sandbox</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="head.h">
|
||||
@@ -149,5 +164,11 @@
|
||||
<ClInclude Include="ml.h">
|
||||
<Filter>头文件\machine_learning</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\ml\malware_detector.h">
|
||||
<Filter>头文件\machine_learning</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="sandbox_api_winhttp.h">
|
||||
<Filter>头文件\sandbox</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
@@ -9,7 +9,6 @@
|
||||
#include <sstream>
|
||||
#include <cfloat>
|
||||
#include <filesystem>
|
||||
|
||||
// 确保std命名空间中的函数可用
|
||||
using std::max;
|
||||
using std::min;
|
||||
@@ -855,4 +854,30 @@ bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
|
||||
printf("ML Process Result, success count: %d fail count: %d \n",
|
||||
processedCount, failedCount);
|
||||
return true;
|
||||
}
|
||||
|
||||
double MachineLearning::PredictMalware(const uint8_t* buffer,
|
||||
size_t bufferSize) {
|
||||
// 提取特征
|
||||
std::vector<double> features = ExtractFeatures(buffer, bufferSize);
|
||||
|
||||
// 如果特征提取失败,返回-1.0表示无法预测
|
||||
if (features.empty()) {
|
||||
return -1.0;
|
||||
}
|
||||
|
||||
// 将特征向量传递给XGBoost模型
|
||||
return score(features.data());
|
||||
}
|
||||
//返回的是白文件的概率
|
||||
double MachineLearning::PredictMalwareFromFile(const std::string& filePath) {
|
||||
// 读取文件
|
||||
std::vector<uint8_t> fileBuffer = ReadFileToBuffer(filePath);
|
||||
if (fileBuffer.empty()) {
|
||||
std::cerr << "无法读取文件: " << filePath << std::endl;
|
||||
return -1.0;
|
||||
}
|
||||
|
||||
// 使用缓冲区进行预测
|
||||
return PredictMalware(fileBuffer.data(), fileBuffer.size());
|
||||
}
|
||||
@@ -66,6 +66,9 @@ inline BYTE* RvaToPtr(DWORD rva, BYTE* peBuffer) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// 在头文件中声明score函数(从外部导入)
|
||||
extern double score(double* input);
|
||||
|
||||
class MachineLearning {
|
||||
public:
|
||||
MachineLearning();
|
||||
@@ -86,6 +89,10 @@ class MachineLearning {
|
||||
// 读取文件到内存
|
||||
std::vector<uint8_t> ReadFileToBuffer(const std::string& filePath);
|
||||
|
||||
// 新增方法:使用XGBoost模型预测文件是否为恶意软件
|
||||
double PredictMalware(const uint8_t* buffer, size_t bufferSize);
|
||||
double PredictMalwareFromFile(const std::string& filePath);
|
||||
|
||||
private:
|
||||
// 特征提取辅助函数
|
||||
std::vector<double> EncodeProperties(
|
||||
|
||||
@@ -155,7 +155,19 @@ class cFixImprot : public peconv::t_function_resolver {
|
||||
}
|
||||
}
|
||||
}
|
||||
//__debugbreak();
|
||||
for (const auto& module : m_sandbox->m_moduleList) {
|
||||
for (const auto& exp : m_sandbox->m_exportFuncDict) {
|
||||
// 检查函数名是否匹配
|
||||
if (strcmp(exp->name, func_name) == 0) {
|
||||
auto newBase = reinterpret_cast<FARPROC>(
|
||||
module->base + exp->function_address);
|
||||
printf("fix import: %s => %llx \n", func_name, newBase);
|
||||
// 返回在模拟器中的虚拟地址
|
||||
return newBase;
|
||||
}
|
||||
}
|
||||
}
|
||||
__debugbreak();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -219,6 +231,11 @@ auto Sandbox::PushModuleToVM(const char* dllName, uint64_t moduleBase) -> void {
|
||||
newModule->base) == false) {
|
||||
throw std::runtime_error("Failed to relocate module");
|
||||
}
|
||||
|
||||
// 将模块添加到LDR链表中
|
||||
if (m_peInfo->isX64) {
|
||||
AddModuleToLdr(newModule);
|
||||
}
|
||||
}
|
||||
|
||||
auto Sandbox::CreateModuleInfo(const char* dllName, uint64_t moduleBase,
|
||||
@@ -344,7 +361,8 @@ auto Sandbox::ResolveImportExports() -> void {
|
||||
const auto exports = ResolveExport(module->real_base);
|
||||
for (const auto item : exports) {
|
||||
if (LOG_LEVEL > 0) {
|
||||
printf("import export: [%s] %s => %llx\n", module->name, item->name, item->function_address);
|
||||
printf("import export: [%s] %s => %llx\n", module->name,
|
||||
item->name, item->function_address);
|
||||
}
|
||||
module->export_function.push_back(item);
|
||||
}
|
||||
@@ -586,6 +604,11 @@ auto Sandbox::InitEnv(std::shared_ptr<BasicPeInfo> peInfo) -> void {
|
||||
_ASSERTE(m_moduleList.size() == 0);
|
||||
m_moduleList.push_back(newModule);
|
||||
|
||||
// 将模块添加到LDR链表中
|
||||
if (m_peInfo->isX64) {
|
||||
AddModuleToLdr(newModule);
|
||||
}
|
||||
|
||||
ResoveImport();
|
||||
ResolveImportExports();
|
||||
|
||||
@@ -697,7 +720,7 @@ auto Sandbox::Run() -> void {
|
||||
InitApiHooks();
|
||||
std::cout << "Starting execution at " << std::hex << entryPoint
|
||||
<< std::endl;
|
||||
uint64_t timeout = 60 * 1000;
|
||||
uint64_t timeout = 60 * 1000 * 1000;
|
||||
err = uc_emu_start(m_ucEngine, entryPoint, m_peInfo->imageEnd, timeout, 0);
|
||||
std::cerr << "Emulation error: " << uc_strerror(err) << std::endl;
|
||||
}
|
||||
@@ -1083,3 +1106,165 @@ void Sandbox::UpdateBaseOfCode(PIMAGE_SECTION_HEADER sectionHeader,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto Sandbox::InitializeLdrData() -> void {
|
||||
if (m_peInfo->isX64 && m_peb64.Ldr == 0) {
|
||||
// 为LDR_DATA分配内存
|
||||
uint64_t ldrDataAddress = m_pebBase + sizeof(X64PEB);
|
||||
m_pebEnd = ldrDataAddress + sizeof(X64_PEB_LDR_DATA);
|
||||
m_peb64.Ldr = ldrDataAddress;
|
||||
|
||||
// 映射LDR数据内存
|
||||
uc_mem_map(m_ucEngine, ldrDataAddress, sizeof(X64_PEB_LDR_DATA),
|
||||
UC_PROT_ALL);
|
||||
|
||||
// 初始化LDR_DATA结构
|
||||
X64_PEB_LDR_DATA ldrData = {0};
|
||||
ldrData.Length = sizeof(X64_PEB_LDR_DATA);
|
||||
ldrData.Initialized = 1;
|
||||
|
||||
// 初始化链表头 - 使用适当的类型转换
|
||||
LIST_ENTRY inLoadOrderList = {
|
||||
reinterpret_cast<LIST_ENTRY*>(
|
||||
ldrDataAddress +
|
||||
offsetof(X64_PEB_LDR_DATA, InLoadOrderModuleList)),
|
||||
reinterpret_cast<LIST_ENTRY*>(
|
||||
ldrDataAddress +
|
||||
offsetof(X64_PEB_LDR_DATA, InLoadOrderModuleList))};
|
||||
ldrData.InLoadOrderModuleList = inLoadOrderList;
|
||||
|
||||
LIST_ENTRY inMemoryOrderList = {
|
||||
reinterpret_cast<LIST_ENTRY*>(
|
||||
ldrDataAddress +
|
||||
offsetof(X64_PEB_LDR_DATA, InMemoryOrderModuleList)),
|
||||
reinterpret_cast<LIST_ENTRY*>(
|
||||
ldrDataAddress +
|
||||
offsetof(X64_PEB_LDR_DATA, InMemoryOrderModuleList))};
|
||||
ldrData.InMemoryOrderModuleList = inMemoryOrderList;
|
||||
|
||||
LIST_ENTRY inInitOrderList = {
|
||||
reinterpret_cast<LIST_ENTRY*>(
|
||||
ldrDataAddress +
|
||||
offsetof(X64_PEB_LDR_DATA, InInitializationOrderModuleList)),
|
||||
reinterpret_cast<LIST_ENTRY*>(
|
||||
ldrDataAddress +
|
||||
offsetof(X64_PEB_LDR_DATA, InInitializationOrderModuleList))};
|
||||
ldrData.InInitializationOrderModuleList = inInitOrderList;
|
||||
|
||||
uc_mem_write(m_ucEngine, ldrDataAddress, &ldrData,
|
||||
sizeof(X64_PEB_LDR_DATA));
|
||||
|
||||
// 更新PEB中的Ldr指针
|
||||
uc_mem_write(m_ucEngine, m_pebBase, &m_peb64, sizeof(X64PEB));
|
||||
}
|
||||
}
|
||||
|
||||
// Builds the LDR_DATA_TABLE_ENTRY describing `module` and writes the
// module's name, as a wide string, to both guest name buffers.
//
// module          - module whose base, entry, size and name are recorded.
// entryAddress    - guest address of the entry (not used by this function).
// fullNameAddress - guest address that receives FullDllName's characters.
// baseNameAddress - guest address that receives BaseDllName's characters.
//
// Returns the populated entry with its list links left zeroed; the caller
// links it and writes it to guest memory.
// NOTE(review): MaximumLength advertises MAX_PATH wide characters per name,
// so the caller is presumably expected to reserve that much at each name
// address — confirm against the allocation in the caller.
auto Sandbox::CreateLdrEntry(const std::shared_ptr<struct_moudle>& module,
                             uint64_t entryAddress, uint64_t fullNameAddress,
                             uint64_t baseNameAddress) -> LDR_DATA_TABLE_ENTRY {
    LDR_DATA_TABLE_ENTRY entry = {0};
    entry.DllBase = reinterpret_cast<PVOID>(module->base);
    entry.EntryPoint = reinterpret_cast<PVOID>(module->base + module->entry);
    entry.SizeOfImages = static_cast<ULONG>(module->size);

    // Convert the module name to a wide string. The conversion is bounded by
    // the buffer capacity (not by the source length) so an over-long name is
    // truncated instead of overflowing the buffer; the zero-initialized last
    // slot keeps the result terminated.
    wchar_t nameBuffer[MAX_PATH] = {0};
    std::mbstowcs(nameBuffer, module->name, MAX_PATH - 1);

    const size_t nameChars = wcslen(nameBuffer);
    const USHORT nameBytes = static_cast<USHORT>(nameChars * sizeof(wchar_t));
    const size_t bytesWithTerminator = (nameChars + 1) * sizeof(wchar_t);

    // Full path.
    entry.FullDllName.Length = nameBytes;
    entry.FullDllName.MaximumLength = MAX_PATH * sizeof(wchar_t);
    entry.FullDllName.Buffer = reinterpret_cast<PWSTR>(fullNameAddress);

    // Base name (the same text is used for both).
    entry.BaseDllName.Length = nameBytes;
    entry.BaseDllName.MaximumLength = MAX_PATH * sizeof(wchar_t);
    entry.BaseDllName.Buffer = reinterpret_cast<PWSTR>(baseNameAddress);

    // Store the wide strings, terminator included, in guest memory.
    uc_mem_write(m_ucEngine, fullNameAddress, nameBuffer, bytesWithTerminator);
    uc_mem_write(m_ucEngine, baseNameAddress, nameBuffer, bytesWithTerminator);

    return entry;
}
|
||||
|
||||
auto Sandbox::UpdateLdrLinks(const LDR_DATA_TABLE_ENTRY& entry,
|
||||
uint64_t entryAddress, X64_PEB_LDR_DATA& ldrData)
|
||||
-> void {
|
||||
// 更新LDR_DATA中的链表头
|
||||
ldrData.InLoadOrderModuleList.Flink = reinterpret_cast<LIST_ENTRY*>(
|
||||
entryAddress + offsetof(LDR_DATA_TABLE_ENTRY, InLoadOrderLinks));
|
||||
ldrData.InMemoryOrderModuleList.Flink = reinterpret_cast<LIST_ENTRY*>(
|
||||
entryAddress + offsetof(LDR_DATA_TABLE_ENTRY, InMemoryOrderLinks));
|
||||
ldrData.InInitializationOrderModuleList.Flink =
|
||||
reinterpret_cast<LIST_ENTRY*>(
|
||||
entryAddress +
|
||||
offsetof(LDR_DATA_TABLE_ENTRY, InInitializationOrderLinks));
|
||||
|
||||
// 写回更新后的LDR_DATA
|
||||
uc_mem_write(m_ucEngine, m_peb64.Ldr, &ldrData, sizeof(X64_PEB_LDR_DATA));
|
||||
}
|
||||
|
||||
auto Sandbox::AddModuleToLdr(const std::shared_ptr<struct_moudle>& module)
|
||||
-> void {
|
||||
if (!m_peInfo->isX64) {
|
||||
return; // 暂时只处理64位
|
||||
}
|
||||
|
||||
if (m_peb64.Ldr == 0) {
|
||||
InitializeLdrData();
|
||||
}
|
||||
|
||||
// 为模块创建LDR_DATA_TABLE_ENTRY
|
||||
uint64_t entrySize = sizeof(LDR_DATA_TABLE_ENTRY) +
|
||||
MAX_PATH * 2; // 额外空间用于Unicode字符串
|
||||
uint64_t entryAddress = m_pebEnd;
|
||||
m_pebEnd += entrySize;
|
||||
|
||||
// 映射内存
|
||||
uc_mem_map(m_ucEngine, entryAddress, entrySize, UC_PROT_ALL);
|
||||
|
||||
// 设置Unicode字符串地址
|
||||
uint64_t fullNameAddress = entryAddress + sizeof(LDR_DATA_TABLE_ENTRY);
|
||||
uint64_t baseNameAddress = fullNameAddress + MAX_PATH;
|
||||
|
||||
// 创建并初始化LDR_DATA_TABLE_ENTRY
|
||||
auto entry =
|
||||
CreateLdrEntry(module, entryAddress, fullNameAddress, baseNameAddress);
|
||||
|
||||
// 从PEB读取当前LDR_DATA结构
|
||||
X64_PEB_LDR_DATA ldrData;
|
||||
uc_mem_read(m_ucEngine, m_peb64.Ldr, &ldrData, sizeof(X64_PEB_LDR_DATA));
|
||||
|
||||
// 设置链表指针
|
||||
entry.InLoadOrderLinks.Flink = reinterpret_cast<LIST_ENTRY*>(
|
||||
reinterpret_cast<uintptr_t>(ldrData.InLoadOrderModuleList.Flink));
|
||||
entry.InLoadOrderLinks.Blink = reinterpret_cast<LIST_ENTRY*>(
|
||||
m_peb64.Ldr + offsetof(X64_PEB_LDR_DATA, InLoadOrderModuleList));
|
||||
|
||||
entry.InMemoryOrderLinks.Flink = reinterpret_cast<LIST_ENTRY*>(
|
||||
reinterpret_cast<uintptr_t>(ldrData.InMemoryOrderModuleList.Flink));
|
||||
entry.InMemoryOrderLinks.Blink = reinterpret_cast<LIST_ENTRY*>(
|
||||
m_peb64.Ldr + offsetof(X64_PEB_LDR_DATA, InMemoryOrderModuleList));
|
||||
|
||||
entry.InInitializationOrderLinks.Flink =
|
||||
reinterpret_cast<LIST_ENTRY*>(reinterpret_cast<uintptr_t>(
|
||||
ldrData.InInitializationOrderModuleList.Flink));
|
||||
entry.InInitializationOrderLinks.Blink = reinterpret_cast<LIST_ENTRY*>(
|
||||
m_peb64.Ldr +
|
||||
offsetof(X64_PEB_LDR_DATA, InInitializationOrderModuleList));
|
||||
|
||||
// 写入LDR_DATA_TABLE_ENTRY结构
|
||||
uc_mem_write(m_ucEngine, entryAddress, &entry,
|
||||
sizeof(LDR_DATA_TABLE_ENTRY));
|
||||
|
||||
// 更新链表
|
||||
UpdateLdrLinks(entry, entryAddress, ldrData);
|
||||
|
||||
printf("Added module '%s' to LDR data tables at 0x%llx\n", module->name,
|
||||
entryAddress);
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include <map>
|
||||
|
||||
#include "head.h"
|
||||
#include <WinInet.h>
|
||||
#define PAGE_SIZE 0x1000
|
||||
#define CF_MASK (1 << 0)
|
||||
#define PF_MASK (1 << 2)
|
||||
@@ -70,6 +71,18 @@ struct HeapSegment {
|
||||
size_t size; // 堆段的总大小
|
||||
HeapBlock* blocks; // 块链表
|
||||
};
|
||||
// Verdict recorded by the sandbox for the emulated sample.
// The numeric values match the implicit ordering of the enumerators.
enum class MalwareAnalysisType {
    kNone = 0,        // nothing recorded (initial state)
    kSuspicious = 1,  // suspicious behaviour recorded
    kMalware = 2,     // malicious behaviour recorded
};
|
||||
struct InternetHandleInfo {
|
||||
HINTERNET handle;
|
||||
bool isConnection;
|
||||
std::string url;
|
||||
std::vector<char> responseData;
|
||||
size_t currentPosition;
|
||||
};
|
||||
|
||||
class Sandbox {
|
||||
friend class cFixImprot; // 声明cFixImprot为友元类
|
||||
@@ -114,6 +127,11 @@ class Sandbox {
|
||||
// Returns m_heapSegments by value, so the caller receives its own copy of
// the map.
auto GetHeapBlocks() const -> std::map<uint64_t, HeapSegment*> {
|
||||
return m_heapSegments;
|
||||
}
|
||||
auto PrintApiCallList() -> void {
|
||||
for (auto& api : ApiCallList) {
|
||||
printf("%s\n", api.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// 从内存中提取PE文件并修复重定位和导入表,返回原始PE的缓冲区
|
||||
auto DumpPE() -> std::pair<std::unique_ptr<BYTE[]>, size_t>;
|
||||
@@ -151,6 +169,49 @@ class Sandbox {
|
||||
auto SetCrossSectionExecution(uint64_t address) -> void {
|
||||
return m_crossSectionExecution.push_back(address);
|
||||
}
|
||||
// Returns the verdict currently stored in m_malwareAnalysisType.
auto GetMalwareAnalysisType() -> MalwareAnalysisType {
|
||||
return m_malwareAnalysisType;
|
||||
}
|
||||
// Records a verdict, only ever escalating it: the stored value is replaced
// when `type` ranks above it (kNone < kSuspicious < kMalware) and is left
// untouched otherwise, so a stored kMalware is never changed.
auto SetMalwareAnalysisType(MalwareAnalysisType type) -> void {
    if (type > m_malwareAnalysisType) {
        m_malwareAnalysisType = type;
    }
}
|
||||
auto CheckMalwareActive_Registry(std::wstring registryPath) -> void;
|
||||
|
||||
auto CheckMalwareActive_Sleep(uint32_t secToSleep) -> void;
|
||||
|
||||
auto CheckMalwareActive_GetProcAddress(std::string wantName) -> void;
|
||||
|
||||
auto CheckMalwareActive_FilePath(std::wstring filePath) -> void;
|
||||
|
||||
// WinHTTP API相关方法
|
||||
// Hands out the current value of m_nextInternetHandle and advances the
// counter by one.
auto GetNextInternetHandle() -> uint64_t {
    const uint64_t issued = m_nextInternetHandle;
    ++m_nextInternetHandle;
    return issued;
}
|
||||
|
||||
auto AddInternetHandle(uint64_t handle, const InternetHandleInfo& info)
|
||||
-> void {
|
||||
m_internetHandles[handle] = info;
|
||||
}
|
||||
|
||||
// Looks up the record registered for `handle`.
// Returns a pointer to the record stored in m_internetHandles, or nullptr
// when the handle is unknown.
auto GetInternetHandle(uint64_t handle) -> InternetHandleInfo* {
    const auto found = m_internetHandles.find(handle);
    return found == m_internetHandles.end() ? nullptr : &found->second;
}
|
||||
|
||||
auto RemoveInternetHandle(uint64_t handle) -> bool {
|
||||
return m_internetHandles.erase(handle) > 0;
|
||||
}
|
||||
|
||||
// Returns a mutable reference to the handle map itself (no copy is made),
// so changes made through it affect the sandbox's own records.
auto GetAllInternetHandles() -> std::map<uint64_t, InternetHandleInfo>& {
|
||||
return m_internetHandles;
|
||||
}
|
||||
std::vector<std::string> ApiCallList;
|
||||
|
||||
private:
|
||||
std::shared_ptr<BasicPeInfo> m_peInfo;
|
||||
@@ -219,4 +280,26 @@ class Sandbox {
|
||||
uint64_t m_lastExecuteSectionIndex = 0; // 上次执行的区段索引
|
||||
uint64_t m_KSharedUserDataBase{0};
|
||||
uint64_t m_KSharedUserDataSize{0};
|
||||
|
||||
MalwareAnalysisType m_malwareAnalysisType = MalwareAnalysisType::kNone;
|
||||
|
||||
// WinHTTP API相关成员变量
|
||||
std::map<uint64_t, InternetHandleInfo> m_internetHandles;
|
||||
uint64_t m_nextInternetHandle = 0x1000;
|
||||
|
||||
// 初始化PEB的LDR数据结构
|
||||
auto InitializeLdrData() -> void;
|
||||
|
||||
// 将模块添加到LDR链表中
|
||||
auto AddModuleToLdr(const std::shared_ptr<struct_moudle>& module) -> void;
|
||||
|
||||
// 创建LDR_DATA_TABLE_ENTRY结构
|
||||
auto CreateLdrEntry(const std::shared_ptr<struct_moudle>& module,
|
||||
uint64_t entryAddress, uint64_t fullNameAddress,
|
||||
uint64_t baseNameAddress) -> LDR_DATA_TABLE_ENTRY;
|
||||
|
||||
// 更新LDR链表
|
||||
auto UpdateLdrLinks(const LDR_DATA_TABLE_ENTRY& entry,
|
||||
uint64_t entryAddress, X64_PEB_LDR_DATA& ldrData)
|
||||
-> void;
|
||||
};
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
181
ai_anti_malware/sandbox_api_regs.cpp
Normal file
181
ai_anti_malware/sandbox_api_regs.cpp
Normal file
@@ -0,0 +1,181 @@
|
||||
#include "sandbox.h"
|
||||
#include "sandbox_callbacks.h"
|
||||
|
||||
auto Api_RegOpenKeyExW(void* sandbox, uc_engine* uc, uint64_t address) -> void {
|
||||
auto context = static_cast<Sandbox*>(sandbox);
|
||||
uint64_t hKey = 0; // 父键句柄
|
||||
uint64_t lpSubKey = 0; // 子键名称
|
||||
uint32_t ulOptions = 0; // 选项
|
||||
uint32_t samDesired = 0; // 访问权限
|
||||
uint64_t phkResult = 0; // 结果句柄的指针
|
||||
|
||||
// 默认返回值:成功
|
||||
LONG status = ERROR_SUCCESS;
|
||||
|
||||
// 获取参数
|
||||
if (context->GetPeInfo()->isX64) {
|
||||
// x64: rcx=hKey, rdx=lpSubKey, r8=ulOptions, r9=samDesired,
|
||||
// [rsp+0x28]=phkResult
|
||||
uc_reg_read(uc, UC_X86_REG_RCX, &hKey);
|
||||
uc_reg_read(uc, UC_X86_REG_RDX, &lpSubKey);
|
||||
uint64_t temp_options = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_R8, &temp_options);
|
||||
ulOptions = static_cast<uint32_t>(temp_options);
|
||||
uint64_t temp_sam = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_R9, &temp_sam);
|
||||
samDesired = static_cast<uint32_t>(temp_sam);
|
||||
|
||||
// 第5个参数从栈上读取
|
||||
uint64_t rsp = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_RSP, &rsp);
|
||||
uc_mem_read(uc, rsp + 0x28, &phkResult, sizeof(uint64_t));
|
||||
} else {
|
||||
// x86: 从栈上读取参数
|
||||
uint32_t esp_address = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_ESP, &esp_address);
|
||||
esp_address += 4; // 跳过返回地址
|
||||
|
||||
uint32_t temp_hkey = 0;
|
||||
uc_mem_read(uc, esp_address, &temp_hkey, sizeof(uint32_t));
|
||||
hKey = temp_hkey;
|
||||
esp_address += 4;
|
||||
|
||||
uint32_t temp_subkey = 0;
|
||||
uc_mem_read(uc, esp_address, &temp_subkey, sizeof(uint32_t));
|
||||
lpSubKey = temp_subkey;
|
||||
esp_address += 4;
|
||||
|
||||
uc_mem_read(uc, esp_address, &ulOptions, sizeof(uint32_t));
|
||||
esp_address += 4;
|
||||
|
||||
uc_mem_read(uc, esp_address, &samDesired, sizeof(uint32_t));
|
||||
esp_address += 4;
|
||||
|
||||
uint32_t temp_result = 0;
|
||||
uc_mem_read(uc, esp_address, &temp_result, sizeof(uint32_t));
|
||||
phkResult = temp_result;
|
||||
}
|
||||
|
||||
// 读取子键名称
|
||||
std::wstring subKeyName;
|
||||
if (lpSubKey != 0) {
|
||||
wchar_t buffer[MAX_PATH] = {0};
|
||||
size_t bytesRead = 0;
|
||||
bool truncated = false;
|
||||
|
||||
// 读取Unicode字符串,直到遇到null终止符或达到MAX_PATH
|
||||
for (size_t i = 0; i < MAX_PATH - 1; i++) {
|
||||
wchar_t ch = 0;
|
||||
uc_mem_read(uc, lpSubKey + (i * sizeof(wchar_t)), &ch,
|
||||
sizeof(wchar_t));
|
||||
if (ch == 0) break;
|
||||
buffer[i] = ch;
|
||||
bytesRead = i + 1;
|
||||
|
||||
if (i == MAX_PATH - 2) {
|
||||
truncated = true;
|
||||
}
|
||||
}
|
||||
|
||||
subKeyName = std::wstring(buffer, bytesRead);
|
||||
}
|
||||
|
||||
// 生成一个随机句柄值 (不是0,通常是4的倍数)
|
||||
uint32_t newKeyHandle = 0x1000 + (std::rand() % 0xFFFFF) * 4;
|
||||
|
||||
// 在沙箱中记录打开的注册表键 (这里可以根据需要扩展,保存键的路径等信息)
|
||||
// 例如:context->OpenedRegistryKeys[newKeyHandle] = {hKey, subKeyName};
|
||||
|
||||
// 写入句柄到结果指针
|
||||
if (phkResult != 0) {
|
||||
if (context->GetPeInfo()->isX64) {
|
||||
uc_mem_write(uc, phkResult, &newKeyHandle, sizeof(uint64_t));
|
||||
} else {
|
||||
uc_mem_write(uc, phkResult, &newKeyHandle, sizeof(uint32_t));
|
||||
}
|
||||
} else {
|
||||
status = ERROR_INVALID_PARAMETER;
|
||||
}
|
||||
// 获取根键名称
|
||||
std::string rootKeyName;
|
||||
switch (hKey) {
|
||||
case (uint64_t)HKEY_CLASSES_ROOT:
|
||||
rootKeyName = "HKEY_CLASSES_ROOT";
|
||||
break;
|
||||
case (uint64_t)HKEY_CURRENT_USER:
|
||||
rootKeyName = "HKEY_CURRENT_USER";
|
||||
break;
|
||||
case (uint64_t)HKEY_LOCAL_MACHINE:
|
||||
rootKeyName = "HKEY_LOCAL_MACHINE";
|
||||
break;
|
||||
case (uint64_t)HKEY_USERS:
|
||||
rootKeyName = "HKEY_USERS";
|
||||
break;
|
||||
case (uint64_t)HKEY_CURRENT_CONFIG:
|
||||
rootKeyName = "HKEY_CURRENT_CONFIG";
|
||||
break;
|
||||
default:
|
||||
rootKeyName = "Unknown key handle";
|
||||
break;
|
||||
}
|
||||
|
||||
std::string wstr_to_str;
|
||||
for (wchar_t c : subKeyName) {
|
||||
if (c <= 127) {
|
||||
wstr_to_str += static_cast<char>(c);
|
||||
} else {
|
||||
wstr_to_str += '?';
|
||||
}
|
||||
}
|
||||
context->CheckMalwareActive_Registry(subKeyName);
|
||||
|
||||
printf(
|
||||
"[*] RegOpenKeyExW: %s\\%s, Options=0x%x, SAM=0x%x -> Handle=0x%x, "
|
||||
"Status=%ld\n",
|
||||
rootKeyName.c_str(), wstr_to_str.c_str(), ulOptions, samDesired,
|
||||
newKeyHandle, status);
|
||||
|
||||
// 返回状态
|
||||
uc_reg_write(uc,
|
||||
context->GetPeInfo()->isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX,
|
||||
&status);
|
||||
}
|
||||
|
||||
auto Api_RegCloseKey(void* sandbox, uc_engine* uc, uint64_t address) -> void {
|
||||
auto context = static_cast<Sandbox*>(sandbox);
|
||||
uint64_t hKey = 0; // 键句柄
|
||||
|
||||
// 默认返回值:成功
|
||||
LONG status = ERROR_SUCCESS;
|
||||
|
||||
// 获取参数
|
||||
if (context->GetPeInfo()->isX64) {
|
||||
// x64: rcx=hKey
|
||||
uc_reg_read(uc, UC_X86_REG_RCX, &hKey);
|
||||
} else {
|
||||
// x86: 从栈上读取参数
|
||||
uint32_t esp_address = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_ESP, &esp_address);
|
||||
esp_address += 4; // 跳过返回地址
|
||||
|
||||
uint32_t temp_hkey = 0;
|
||||
uc_mem_read(uc, esp_address, &temp_hkey, sizeof(uint32_t));
|
||||
hKey = temp_hkey;
|
||||
}
|
||||
|
||||
// 在实际应用中,这里应该从沙盒的注册表句柄映射中移除此句柄
|
||||
// 但当前环境似乎没有明确保存句柄映射,所以只记录操作即可
|
||||
// 如果以后需要,可以添加: context->OpenedRegistryKeys.erase(hKey);
|
||||
|
||||
// 只有当句柄为0或无效时才返回错误
|
||||
if (hKey == 0) {
|
||||
status = ERROR_INVALID_HANDLE;
|
||||
}
|
||||
|
||||
printf("[*] RegCloseKey: Handle=0x%llx -> Status=%ld\n", hKey, status);
|
||||
|
||||
// 返回状态
|
||||
uc_reg_write(uc,
|
||||
context->GetPeInfo()->isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX,
|
||||
&status);
|
||||
}
|
||||
1515
ai_anti_malware/sandbox_api_stl.cpp
Normal file
1515
ai_anti_malware/sandbox_api_stl.cpp
Normal file
File diff suppressed because it is too large
Load Diff
350
ai_anti_malware/sandbox_api_winhttp.cpp
Normal file
350
ai_anti_malware/sandbox_api_winhttp.cpp
Normal file
@@ -0,0 +1,350 @@
|
||||
#include "sandbox.h"
|
||||
#include <windows.h>
|
||||
#include <wininet.h>
|
||||
#include <algorithm>
|
||||
|
||||
// 函数声明,确保外部可见
|
||||
extern auto Api_InternetOpenA(void* sandbox, uc_engine* uc, uint64_t address)
|
||||
-> void;
|
||||
extern auto Api_InternetOpenUrlA(void* sandbox, uc_engine* uc, uint64_t address)
|
||||
-> void;
|
||||
extern auto Api_InternetCloseHandle(void* sandbox, uc_engine* uc,
|
||||
uint64_t address) -> void;
|
||||
extern auto Api_InternetReadFile(void* sandbox, uc_engine* uc, uint64_t address)
|
||||
-> void;
|
||||
|
||||
// 模拟InternetOpenA API
|
||||
auto Api_InternetOpenA(void* sandbox, uc_engine* uc, uint64_t address) -> void {
|
||||
auto context = static_cast<Sandbox*>(sandbox);
|
||||
|
||||
// 获取参数
|
||||
uint64_t lpszAgent = 0;
|
||||
uint64_t dwAccessType = 0;
|
||||
uint64_t lpszProxy = 0;
|
||||
uint64_t lpszProxyBypass = 0;
|
||||
uint32_t dwFlags = 0;
|
||||
|
||||
// 根据x86或x64架构读取参数
|
||||
if (context->GetPeInfo()->isX64) {
|
||||
uc_reg_read(uc, UC_X86_REG_RCX, &lpszAgent);
|
||||
uc_reg_read(uc, UC_X86_REG_RDX, &dwAccessType);
|
||||
uc_reg_read(uc, UC_X86_REG_R8, &lpszProxy);
|
||||
uc_reg_read(uc, UC_X86_REG_R9, &lpszProxyBypass);
|
||||
|
||||
uint64_t rsp = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_RSP, &rsp);
|
||||
uc_mem_read(uc, rsp + 0x28, &dwFlags, sizeof(dwFlags));
|
||||
} else {
|
||||
uint32_t esp = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_ESP, &esp);
|
||||
|
||||
uint32_t param_addr = esp + 4;
|
||||
uc_mem_read(uc, param_addr, &lpszAgent, sizeof(uint32_t));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &dwAccessType, sizeof(dwAccessType));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &lpszProxy, sizeof(uint32_t));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &lpszProxyBypass, sizeof(uint32_t));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &dwFlags, sizeof(dwFlags));
|
||||
}
|
||||
|
||||
// 读取用户代理字符串
|
||||
std::string agentString;
|
||||
if (lpszAgent != 0) {
|
||||
char buffer[256] = {0};
|
||||
uc_mem_read(uc, lpszAgent, buffer, sizeof(buffer) - 1);
|
||||
agentString = buffer;
|
||||
|
||||
// 检查用户代理是否可疑
|
||||
const std::vector<std::string> suspiciousAgents = {
|
||||
"wget", "curl", "python", "go-http",
|
||||
"perl", "powershell", "winhttp", "urlmon",
|
||||
"mozilla", "edge", "chrome", "internet explorer"};
|
||||
|
||||
for (const auto& agent : suspiciousAgents) {
|
||||
std::string lowerAgent = agentString;
|
||||
// 转换为小写进行比较
|
||||
std::transform(lowerAgent.begin(), lowerAgent.end(),
|
||||
lowerAgent.begin(),
|
||||
[](unsigned char c) { return std::tolower(c); });
|
||||
|
||||
if (lowerAgent.find(agent) != std::string::npos) {
|
||||
context->SetMalwareAnalysisType(
|
||||
MalwareAnalysisType::kSuspicious);
|
||||
#if LOG_LEVEL >= 1
|
||||
printf("[!!!] Suspicious User-Agent: %s\n",
|
||||
agentString.c_str());
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 分配新的Internet句柄
|
||||
uint64_t handleValue = context->GetNextInternetHandle();
|
||||
|
||||
// 在实际创建句柄之前进行检查
|
||||
if (dwAccessType == INTERNET_OPEN_TYPE_PROXY && lpszProxy != 0) {
|
||||
char proxyBuffer[256] = {0};
|
||||
uc_mem_read(uc, lpszProxy, proxyBuffer, sizeof(proxyBuffer) - 1);
|
||||
std::string proxyString = proxyBuffer;
|
||||
|
||||
// 检查代理设置是否可疑
|
||||
if (!proxyString.empty()) {
|
||||
context->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
|
||||
#if LOG_LEVEL >= 1
|
||||
printf("[!!!] Suspicious proxy configuration: %s\n",
|
||||
proxyString.c_str());
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// 创建句柄信息
|
||||
InternetHandleInfo handleInfo;
|
||||
handleInfo.handle = (HINTERNET)handleValue;
|
||||
handleInfo.isConnection = false;
|
||||
context->AddInternetHandle(handleValue, handleInfo);
|
||||
|
||||
// 设置返回值
|
||||
uint64_t returnValue = handleValue;
|
||||
uc_reg_write(uc,
|
||||
context->GetPeInfo()->isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX,
|
||||
&returnValue);
|
||||
}
|
||||
|
||||
// 模拟InternetOpenUrlA API
|
||||
auto Api_InternetOpenUrlA(void* sandbox, uc_engine* uc, uint64_t address)
|
||||
-> void {
|
||||
auto context = static_cast<Sandbox*>(sandbox);
|
||||
|
||||
// 获取参数
|
||||
uint64_t hInternet = 0;
|
||||
uint64_t lpszUrl = 0;
|
||||
uint64_t lpszHeaders = 0;
|
||||
uint64_t dwHeadersLength = 0;
|
||||
uint64_t dwFlags = 0;
|
||||
uint64_t dwContext = 0;
|
||||
|
||||
// 根据x86或x64架构读取参数
|
||||
if (context->GetPeInfo()->isX64) {
|
||||
uc_reg_read(uc, UC_X86_REG_RCX, &hInternet);
|
||||
uc_reg_read(uc, UC_X86_REG_RDX, &lpszUrl);
|
||||
uc_reg_read(uc, UC_X86_REG_R8, &lpszHeaders);
|
||||
uc_reg_read(uc, UC_X86_REG_R9, &dwHeadersLength);
|
||||
|
||||
uint64_t rsp = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_RSP, &rsp);
|
||||
uc_mem_read(uc, rsp + 0x28, &dwFlags, sizeof(dwFlags));
|
||||
uc_mem_read(uc, rsp + 0x30, &dwContext, sizeof(dwContext));
|
||||
} else {
|
||||
uint32_t esp = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_ESP, &esp);
|
||||
|
||||
uint32_t param_addr = esp + 4;
|
||||
uc_mem_read(uc, param_addr, &hInternet, sizeof(uint32_t));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &lpszUrl, sizeof(uint32_t));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &lpszHeaders, sizeof(uint32_t));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &dwHeadersLength, sizeof(dwHeadersLength));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &dwFlags, sizeof(dwFlags));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &dwContext, sizeof(uint32_t));
|
||||
}
|
||||
context->SetMalwareAnalysisType(MalwareAnalysisType::kMalware);
|
||||
|
||||
// 读取URL字符串
|
||||
std::string urlString;
|
||||
if (lpszUrl != 0) {
|
||||
char buffer[1024] = {0};
|
||||
uc_mem_read(uc, lpszUrl, buffer, sizeof(buffer) - 1);
|
||||
urlString = buffer;
|
||||
}
|
||||
printf("urlString: %s\n", urlString.c_str());
|
||||
|
||||
// 检查Internet句柄是否有效
|
||||
if (context->GetInternetHandle(hInternet) == nullptr) {
|
||||
// 无效句柄,返回NULL
|
||||
uint64_t returnValue = 0;
|
||||
uc_reg_write(
|
||||
uc, context->GetPeInfo()->isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX,
|
||||
&returnValue);
|
||||
return;
|
||||
}
|
||||
|
||||
// 分配新的URL连接句柄
|
||||
uint64_t handleValue = context->GetNextInternetHandle();
|
||||
|
||||
// 创建句柄信息
|
||||
InternetHandleInfo handleInfo;
|
||||
handleInfo.handle = (HINTERNET)handleValue;
|
||||
handleInfo.isConnection = true;
|
||||
handleInfo.url = urlString;
|
||||
// 生成模拟响应数据
|
||||
// 这块可以真实请求,然后看是不是PE文件之类的.
|
||||
const char* sampleResponse =
|
||||
"HTTP/1.1 200 OK\r\nContent-Type: "
|
||||
"text/html\r\n\r\n<html><body>huoji own me and all</body></html>";
|
||||
handleInfo.responseData.assign(sampleResponse,
|
||||
sampleResponse + strlen(sampleResponse));
|
||||
handleInfo.currentPosition = 0;
|
||||
|
||||
context->AddInternetHandle(handleValue, handleInfo);
|
||||
|
||||
// 设置返回值
|
||||
uc_reg_write(uc,
|
||||
context->GetPeInfo()->isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX,
|
||||
&handleValue);
|
||||
}
|
||||
|
||||
// 模拟InternetCloseHandle API
|
||||
auto Api_InternetCloseHandle(void* sandbox, uc_engine* uc, uint64_t address)
|
||||
-> void {
|
||||
auto context = static_cast<Sandbox*>(sandbox);
|
||||
|
||||
// 获取参数
|
||||
uint64_t hInternet = 0;
|
||||
|
||||
// 根据x86或x64架构读取参数
|
||||
if (context->GetPeInfo()->isX64) {
|
||||
uc_reg_read(uc, UC_X86_REG_RCX, &hInternet);
|
||||
} else {
|
||||
uint32_t esp = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_ESP, &esp);
|
||||
|
||||
uint32_t param_addr = esp + 4;
|
||||
uc_mem_read(uc, param_addr, &hInternet, sizeof(uint32_t));
|
||||
}
|
||||
|
||||
// 检查句柄是否有效
|
||||
bool handleValid = (context->GetInternetHandle(hInternet) != nullptr);
|
||||
|
||||
// 如果句柄有效,移除它
|
||||
if (handleValid) {
|
||||
context->RemoveInternetHandle(hInternet);
|
||||
}
|
||||
|
||||
// 设置返回值(成功或失败)
|
||||
uint32_t returnValue = handleValid ? TRUE : FALSE;
|
||||
uc_reg_write(uc,
|
||||
context->GetPeInfo()->isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX,
|
||||
&returnValue);
|
||||
}
|
||||
|
||||
// 模拟InternetReadFile API
|
||||
auto Api_InternetReadFile(void* sandbox, uc_engine* uc, uint64_t address)
|
||||
-> void {
|
||||
auto context = static_cast<Sandbox*>(sandbox);
|
||||
|
||||
// 获取参数
|
||||
uint64_t hFile = 0;
|
||||
uint64_t lpBuffer = 0;
|
||||
uint32_t dwNumberOfBytesToRead = 0;
|
||||
uint64_t lpdwNumberOfBytesRead = 0;
|
||||
|
||||
// 根据x86或x64架构读取参数
|
||||
if (context->GetPeInfo()->isX64) {
|
||||
uc_reg_read(uc, UC_X86_REG_RCX, &hFile);
|
||||
uc_reg_read(uc, UC_X86_REG_RDX, &lpBuffer);
|
||||
uc_reg_read(uc, UC_X86_REG_R8, &dwNumberOfBytesToRead);
|
||||
uc_reg_read(uc, UC_X86_REG_R9, &lpdwNumberOfBytesRead);
|
||||
} else {
|
||||
uint32_t esp = 0;
|
||||
uc_reg_read(uc, UC_X86_REG_ESP, &esp);
|
||||
|
||||
uint32_t param_addr = esp + 4;
|
||||
uc_mem_read(uc, param_addr, &hFile, sizeof(uint32_t));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &lpBuffer, sizeof(uint32_t));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &dwNumberOfBytesToRead,
|
||||
sizeof(dwNumberOfBytesToRead));
|
||||
|
||||
param_addr += 4;
|
||||
uc_mem_read(uc, param_addr, &lpdwNumberOfBytesRead, sizeof(uint32_t));
|
||||
}
|
||||
|
||||
// 检查句柄是否有效
|
||||
auto it = context->GetInternetHandle(hFile);
|
||||
if (it == nullptr || !it->isConnection) {
|
||||
// 无效句柄,设置失败状态
|
||||
uint32_t returnValue = FALSE;
|
||||
uc_reg_write(
|
||||
uc, context->GetPeInfo()->isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX,
|
||||
&returnValue);
|
||||
return;
|
||||
}
|
||||
|
||||
// 获取句柄信息
|
||||
InternetHandleInfo& handleInfo = *it;
|
||||
|
||||
// 计算实际要读取的字节数
|
||||
uint32_t bytesToRead = dwNumberOfBytesToRead;
|
||||
if (handleInfo.currentPosition + bytesToRead >
|
||||
handleInfo.responseData.size()) {
|
||||
bytesToRead = (uint32_t)(handleInfo.responseData.size() -
|
||||
handleInfo.currentPosition);
|
||||
}
|
||||
|
||||
// 检查响应数据中是否包含恶意内容
|
||||
if (bytesToRead > 0) {
|
||||
std::string dataChunk(
|
||||
handleInfo.responseData.begin() + handleInfo.currentPosition,
|
||||
handleInfo.responseData.begin() + handleInfo.currentPosition +
|
||||
bytesToRead);
|
||||
|
||||
// 检查响应数据是否包含可疑内容
|
||||
const std::vector<std::string> suspiciousResponsePatterns = {
|
||||
"powershell", "cmd.exe", "eval(", "exec(",
|
||||
"system(", "shell_exec", "<script", "function()",
|
||||
"document.write", "base64", "FromBase64", "CreateObject",
|
||||
"WScript", "ActiveXObject"};
|
||||
|
||||
for (const auto& pattern : suspiciousResponsePatterns) {
|
||||
if (dataChunk.find(pattern) != std::string::npos) {
|
||||
context->SetMalwareAnalysisType(
|
||||
MalwareAnalysisType::kSuspicious);
|
||||
#if LOG_LEVEL >= 1
|
||||
printf("[!!!] Suspicious content in HTTP response: %s\n",
|
||||
pattern.c_str());
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 将数据写入缓冲区
|
||||
if (bytesToRead > 0) {
|
||||
uc_mem_write(
|
||||
uc, lpBuffer,
|
||||
handleInfo.responseData.data() + handleInfo.currentPosition,
|
||||
bytesToRead);
|
||||
|
||||
// 更新当前位置
|
||||
handleInfo.currentPosition += bytesToRead;
|
||||
}
|
||||
|
||||
// 写入读取的字节数
|
||||
uc_mem_write(uc, lpdwNumberOfBytesRead, &bytesToRead, sizeof(bytesToRead));
|
||||
|
||||
// 设置返回值(成功)
|
||||
uint32_t returnValue = TRUE;
|
||||
uc_reg_write(uc,
|
||||
context->GetPeInfo()->isX64 ? UC_X86_REG_RAX : UC_X86_REG_EAX,
|
||||
&returnValue);
|
||||
}
|
||||
11
ai_anti_malware/sandbox_api_winhttp.h
Normal file
11
ai_anti_malware/sandbox_api_winhttp.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#pragma once
|
||||
#include "head.h"
|
||||
|
||||
// Internet API函数声明
|
||||
auto Api_InternetOpenA(void* sandbox, uc_engine* uc, uint64_t address) -> void;
|
||||
auto Api_InternetOpenUrlA(void* sandbox, uc_engine* uc, uint64_t address)
|
||||
-> void;
|
||||
auto Api_InternetCloseHandle(void* sandbox, uc_engine* uc, uint64_t address)
|
||||
-> void;
|
||||
auto Api_InternetReadFile(void* sandbox, uc_engine* uc, uint64_t address)
|
||||
-> void;
|
||||
@@ -46,6 +46,7 @@ void handleCodeRun(uc_engine* uc, uint64_t address, uint32_t size,
|
||||
"[!!!]detect cross section excute, from %d to %d,address: 0x%llx\n",
|
||||
sandbox->GetLastExecuteSectionIndex(), currentSectionIndex,
|
||||
address);
|
||||
sandbox->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
|
||||
|
||||
// 记录跨区段执行地址
|
||||
sandbox->SetCrossSectionExecution(address);
|
||||
@@ -99,6 +100,29 @@ void handleMemoryRead(uc_engine* uc, uc_mem_type type, uint64_t address,
|
||||
sandbox->GetPeInfo()->isX64 ? UC_X86_REG_RIP : UC_X86_REG_EIP,
|
||||
®Rip);
|
||||
|
||||
// 检测是否访问LDR结构
|
||||
if (sandbox->GetPeInfo()->isX64) {
|
||||
uint64_t ldrAddress = sandbox->GetPeb64()->Ldr;
|
||||
if (ldrAddress != 0 && address >= ldrAddress &&
|
||||
address < (ldrAddress + sizeof(X64_PEB_LDR_DATA))) {
|
||||
printf(
|
||||
"[WARNING] Suspicious direct LDR access detected at RIP: "
|
||||
"0x%llx, accessing address: 0x%llx\n",
|
||||
regRip, address);
|
||||
sandbox->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
|
||||
}
|
||||
} else {
|
||||
uint32_t ldrAddress = sandbox->GetPeb32()->Ldr;
|
||||
if (ldrAddress != 0 && address >= ldrAddress &&
|
||||
address < (ldrAddress + sizeof(_PEB_LDR_DATA))) {
|
||||
printf(
|
||||
"[WARNING] Suspicious direct LDR access detected at RIP: 0x%x, "
|
||||
"accessing address: 0x%llx\n",
|
||||
static_cast<uint32_t>(regRip), address);
|
||||
sandbox->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t readAddress;
|
||||
auto readError =
|
||||
uc_mem_read(sandbox->GetUnicornHandle(), address, &readAddress, size);
|
||||
@@ -235,16 +259,53 @@ void handleMemoryUnmapRead(uc_engine* uc, uc_mem_type type, uint64_t address,
|
||||
printf("[handleMemoryUnmapRead] Address: %p Size: %p Value: %p\n", address,
|
||||
size, value);
|
||||
dumpVmenv(uc, userData);
|
||||
__debugbreak();
|
||||
}
|
||||
|
||||
// Memory-write hook.
//
// Flags the sample as suspicious when the write starts inside the PEB loader
// data block (PEB->Ldr), and, when LOG_LEVEL > 0, logs every write together
// with the current instruction pointer.
//
// uc        Unicorn engine handle.
// type      Kind of memory access reported by Unicorn (unused).
// address   Guest address being written.
// size      Width of the write in bytes.
// value     Value being written.
// userData  The owning Sandbox, passed through as an opaque pointer.
void handleMemoryWrite(uc_engine* uc, uc_mem_type type, uint64_t address,
                       int size, int64_t value, void* userData) {
    auto* sandbox = static_cast<Sandbox*>(userData);
    if (!sandbox) return;

    const bool isX64 = sandbox->GetPeInfo()->isX64;

    // Zero-initialised: reading EIP fills only the low 4 bytes.
    uint64_t regRip = 0;
    uc_reg_read(uc, isX64 ? UC_X86_REG_RIP : UC_X86_REG_EIP, &regRip);

    // Locate the loader data block for the emulated bitness.
    // NOTE(review): the 32-bit size comes from the host's _PEB_LDR_DATA
    // definition - confirm it matches the 32-bit guest layout.
    uint64_t ldrAddress = 0;
    uint64_t ldrSize = 0;
    if (isX64) {
        ldrAddress = sandbox->GetPeb64()->Ldr;
        ldrSize = sizeof(X64_PEB_LDR_DATA);
    } else {
        ldrAddress = sandbox->GetPeb32()->Ldr;
        ldrSize = sizeof(_PEB_LDR_DATA);
    }

    if (ldrAddress != 0 && address >= ldrAddress &&
        address < (ldrAddress + ldrSize)) {
        printf(
            "[WARNING] Suspicious direct LDR modification detected at RIP: "
            "0x%llx, modifying address: 0x%llx\n",
            regRip, address);
        sandbox->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
    }

    if (LOG_LEVEL > 0) {
        // Each conversion matches its argument's width; %p is reserved for
        // real pointers.
        printf(
            "[handleMemoryWrite] Address: 0x%llx Size: %d Value: 0x%llx RIP: "
            "0x%llx\n",
            static_cast<unsigned long long>(address), size,
            static_cast<unsigned long long>(value),
            static_cast<unsigned long long>(regRip));
    }
}
|
||||
|
||||
// Syscall hook: marks the sample as suspicious and logs the event.
// `userData` is the owning Sandbox; a null pointer is ignored.
void handleSyscall(uc_engine* uc, void* userData) {
    auto* owner = static_cast<Sandbox*>(userData);
    if (owner != nullptr) {
        owner->SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
        printf("[handleSyscall] Syscall detected\n");
    }
}
|
||||
|
||||
} // namespace sandboxCallbacks
|
||||
|
||||
96
ai_anti_malware/sandbox_malware_check.cpp
Normal file
96
ai_anti_malware/sandbox_malware_check.cpp
Normal file
@@ -0,0 +1,96 @@
|
||||
#include "sandbox.h"
|
||||
#include <windows.h>
|
||||
|
||||
auto Sandbox::CheckMalwareActive_Registry(std::wstring registryPath) -> void {
|
||||
// 定义敏感注册表路径列表
|
||||
const std::vector<std::wstring> sensitiveRegistryPaths = {
|
||||
L"SOFTWARE\\DingTalk", L"SOFTWARE\\Tencent",
|
||||
L"SOFTWARE\\WOW6432Node\\DingTalk", L"SOFTWARE\\WOW6432Node\\Tencent"};
|
||||
|
||||
// 检查提供的注册表路径是否在敏感列表中
|
||||
bool isSensitiveRegistry = false;
|
||||
for (const auto& sensitivePath : sensitiveRegistryPaths) {
|
||||
if (registryPath.find(sensitivePath) != std::wstring::npos) {
|
||||
isSensitiveRegistry = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 如果是敏感注册表路径,尝试打开它检查是否可访问
|
||||
if (isSensitiveRegistry) {
|
||||
SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
|
||||
|
||||
// 记录日志 (如果有日志系统的话)
|
||||
#if LOG_LEVEL >= 1
|
||||
printf("[!!!] SensitiveRegistry Access %s\n", registryPath.c_str());
|
||||
#endif
|
||||
}
|
||||
}
|
||||
auto Sandbox::CheckMalwareActive_Sleep(uint32_t secToSleep) -> void {
|
||||
if (secToSleep > 1000 * 30) {
|
||||
SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
|
||||
printf("[!!!] suspicious Sleep %d seconds\n", secToSleep);
|
||||
}
|
||||
}
|
||||
auto Sandbox::CheckMalwareActive_GetProcAddress(std::string wantName) -> void {
|
||||
const std::vector<std::string> sensitiveGetProcAddressNames = {
|
||||
"ZwAllocateVirtualMemory",
|
||||
"NtAllocateVirtualMemory",
|
||||
"NtFreeVirtualMemory",
|
||||
"NtProtectVirtualMemory",
|
||||
"NtWriteVirtualMemory",
|
||||
"NtReadVirtualMemory",
|
||||
"NtCreateThreadEx",
|
||||
"NtOpenThread",
|
||||
"NtTerminateThread",
|
||||
"NtResumeThread",
|
||||
"NtSuspendThread",
|
||||
"NtCreateThread",
|
||||
"NtOpenThread",
|
||||
"NtTerminateThread",
|
||||
"NtResumeThread",
|
||||
"NtSuspendThread"};
|
||||
// more
|
||||
if (std::find(sensitiveGetProcAddressNames.begin(),
|
||||
sensitiveGetProcAddressNames.end(),
|
||||
wantName) != sensitiveGetProcAddressNames.end()) {
|
||||
SetMalwareAnalysisType(MalwareAnalysisType::kMalware);
|
||||
printf("[!!!] suspicious GetProcAddress %s\n", wantName.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
auto Sandbox::CheckMalwareActive_FilePath(std::wstring filePath) -> void {
|
||||
// 定义敏感文件路径列表
|
||||
const std::vector<std::wstring> sensitiveFilePaths = {
|
||||
L"\\AppData\\",
|
||||
L"\\Temp\\",
|
||||
L"\\Windows\\System32\\",
|
||||
L"\\Program Files\\",
|
||||
L"\\Program Files (x86)\\",
|
||||
L"\\Documents\\",
|
||||
L"\\Downloads\\",
|
||||
L"\\Desktop\\",
|
||||
L"\\Users\\All Users\\",
|
||||
L"\\ProgramData\\",
|
||||
L"\\Microsoft\\Windows\\Start Menu\\",
|
||||
L"\\Startup\\"};
|
||||
|
||||
// 检查提供的文件路径是否在敏感列表中
|
||||
bool isSensitiveFilePath = false;
|
||||
for (const auto& sensitivePath : sensitiveFilePaths) {
|
||||
if (filePath.find(sensitivePath) != std::wstring::npos) {
|
||||
isSensitiveFilePath = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 如果是敏感文件路径,将恶意软件分析类型设置为可疑
|
||||
if (isSensitiveFilePath) {
|
||||
SetMalwareAnalysisType(MalwareAnalysisType::kSuspicious);
|
||||
|
||||
// 记录日志
|
||||
#if LOG_LEVEL >= 1
|
||||
printf("[!!!] SensitiveFilePath Access: %ls\n", filePath.c_str());
|
||||
#endif
|
||||
}
|
||||
}
|
||||
3
ml/.vscode/settings.json
vendored
Normal file
3
ml/.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"python.analysis.typeCheckingMode": "basic"
|
||||
}
|
||||
12257
ml/data/malware_features.csv
Normal file
12257
ml/data/malware_features.csv
Normal file
File diff suppressed because it is too large
Load Diff
23812
ml/data/whitelist_features.csv
Normal file
23812
ml/data/whitelist_features.csv
Normal file
File diff suppressed because it is too large
Load Diff
6623
ml/malware_detector.cpp
Normal file
6623
ml/malware_detector.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,99 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import joblib
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import sys
|
||||
import os
|
||||
|
||||
def load_model(model_path='xgboost_malware_detector.model'):
    """Load a trained model from ``model_path`` with joblib.

    Returns the loaded object, or ``None`` when loading raises; progress and
    failure messages are printed.
    """
    print(f"正在加载模型: {model_path}")
    try:
        loaded = joblib.load(model_path)
        print("模型加载成功!")
    except Exception as err:
        print(f"模型加载失败: {err}")
        loaded = None
    return loaded
|
||||
|
||||
def predict_file(model, csv_path):
    """Run ``model`` over one feature CSV and save the predictions.

    The first CSV column is treated as the file path and every remaining
    column as a feature. The result table (path, predicted label, probability
    of class 1) is written next to the input as ``<name>_predictions.csv`` and
    a summary is printed.

    Args:
        model: object exposing ``predict`` and ``predict_proba``.
        csv_path: path of the CSV file to score.

    Returns:
        The result ``DataFrame``, or ``None`` if any step raised.
    """
    try:
        df = pd.read_csv(csv_path)

        # Features are everything except the leading file-path column.
        features = df.iloc[:, 1:]

        predictions = model.predict(features)
        malware_probability = model.predict_proba(features)[:, 1]

        results = pd.DataFrame({
            '文件路径': df.iloc[:, 0],
            '预测标签': predictions,
            '恶意软件概率': malware_probability
        })

        output_path = os.path.splitext(csv_path)[0] + '_predictions.csv'
        results.to_csv(output_path, index=False)
        print(f"预测结果已保存到: {output_path}")

        total_count = len(results)
        malware_count = len(results[results['预测标签'] == 1])
        benign_count = total_count - malware_count

        # An input with no rows must not raise on the percentage computation
        # after the output file has already been written.
        if total_count > 0:
            malware_pct = malware_count / total_count * 100
            benign_pct = benign_count / total_count * 100
        else:
            malware_pct = 0.0
            benign_pct = 0.0

        print(f"总样本数: {total_count}")
        print(f"检测为恶意软件: {malware_count} ({malware_pct:.2f}%)")
        print(f"检测为白名单软件: {benign_count} ({benign_pct:.2f}%)")

        return results

    except Exception as e:
        print(f"预测失败: {e}")
        return None
|
||||
|
||||
def batch_predict(model, csv_paths):
    """Score several CSV files with ``predict_file``.

    Returns a dict mapping each path to its result; paths whose prediction
    returned ``None`` are left out.
    """
    collected = {}
    for path in csv_paths:
        print(f"\n分析文件: {path}")
        outcome = predict_file(model, path)
        if outcome is None:
            continue
        collected[path] = outcome
    return collected
|
||||
|
||||
def main():
    """Command-line entry point: load the model and score every CSV argument."""
    # Everything after the script name is a CSV path to analyse.
    cli_paths = sys.argv[1:]
    if not cli_paths:
        print("使用方法: python predict.py <csv文件路径1> [csv文件路径2] ...")
        return

    detector = load_model()
    if detector is not None:
        batch_predict(detector, cli_paths)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,264 +1,117 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import os
|
||||
import joblib
|
||||
from sklearn.metrics import accuracy_score
|
||||
import m2cgen as m2c
|
||||
from xgboost import XGBClassifier
|
||||
import csv
|
||||
|
||||
def load_data(malware_csv, whitelist_csv):
|
||||
"""
|
||||
加载恶意软件和白名单CSV文件
|
||||
"""
|
||||
print(f"加载恶意软件数据: {malware_csv}")
|
||||
|
||||
# 预处理:先获取CSV的列数
|
||||
# 读取第一行以确定正确的列数
|
||||
try:
|
||||
header = pd.read_csv(malware_csv, nrows=1)
|
||||
expected_columns = len(header.columns)
|
||||
print(f"预期列数: {expected_columns}")
|
||||
|
||||
# 使用自定义函数读取CSV,处理字段不足的行
|
||||
malware_df = pd.read_csv(
|
||||
malware_csv,
|
||||
header=0,
|
||||
low_memory=False,
|
||||
on_bad_lines='skip', # 跳过无法解析的行
|
||||
dtype=float, # 将所有数据列转为浮点型
|
||||
converters={0: str} # 第一列为文件路径,保持为字符串类型
|
||||
)
|
||||
|
||||
# 检查列数是否不足,如果不足则填充0
|
||||
actual_columns = len(malware_df.columns)
|
||||
if actual_columns < expected_columns:
|
||||
for i in range(actual_columns, expected_columns):
|
||||
col_name = f"col_{i}"
|
||||
malware_df[col_name] = 0.0
|
||||
|
||||
print(f"成功读取恶意软件数据,形状: {malware_df.shape}")
|
||||
except Exception as e:
|
||||
print(f"读取恶意软件数据时出错: {e}")
|
||||
return None, None
|
||||
|
||||
malware_df['label'] = 1 # 恶意软件标签为1
|
||||
|
||||
print(f"加载白名单数据: {whitelist_csv}")
|
||||
try:
|
||||
# 同样处理白名单数据
|
||||
whitelist_df = pd.read_csv(
|
||||
whitelist_csv,
|
||||
header=0,
|
||||
low_memory=False,
|
||||
on_bad_lines='skip',
|
||||
dtype=float,
|
||||
converters={0: str}
|
||||
)
|
||||
|
||||
# 确保列数与恶意软件数据一致
|
||||
whitelist_cols = len(whitelist_df.columns)
|
||||
malware_cols = len(malware_df.columns) - 1 # 减去标签列
|
||||
|
||||
if whitelist_cols < malware_cols:
|
||||
for i in range(whitelist_cols, malware_cols):
|
||||
col_name = f"col_{i}"
|
||||
whitelist_df[col_name] = 0.0
|
||||
|
||||
print(f"成功读取白名单数据,形状: {whitelist_df.shape}")
|
||||
except Exception as e:
|
||||
print(f"读取白名单数据时出错: {e}")
|
||||
return None, None
|
||||
|
||||
whitelist_df['label'] = 0 # 白名单软件标签为0
|
||||
|
||||
# 确保两个DataFrame的列完全一致(除了可能的文件路径差异)
|
||||
malware_features = set(malware_df.columns)
|
||||
whitelist_features = set(whitelist_df.columns)
|
||||
|
||||
# 找出不同的列
|
||||
malware_only = malware_features - whitelist_features
|
||||
whitelist_only = whitelist_features - malware_features
|
||||
|
||||
# 为缺少的列添加0值
|
||||
for col in malware_only:
|
||||
if col != 'label':
|
||||
whitelist_df[col] = 0.0
|
||||
|
||||
for col in whitelist_only:
|
||||
if col != 'label':
|
||||
malware_df[col] = 0.0
|
||||
|
||||
# 合并数据
|
||||
combined_df = pd.concat([malware_df, whitelist_df], ignore_index=True, sort=False)
|
||||
|
||||
# 第一列通常是文件路径,需要将其移除
|
||||
# 先保存文件路径以便后续参考
|
||||
file_paths = combined_df.iloc[:, 0].tolist()
|
||||
|
||||
features = combined_df.iloc[:, 1:-1] # 除去第一列(文件路径)和最后一列(标签)
|
||||
labels = combined_df['label']
|
||||
|
||||
print(f"数据加载完成: {len(malware_df)} 个恶意样本, {len(whitelist_df)} 个白名单样本")
|
||||
print(f"特征维度: {features.shape}")
|
||||
|
||||
return features, labels
|
||||
malware_csv = 'data/malware_features.csv'
|
||||
whitelist_csv = 'data/whitelist_features.csv'
|
||||
|
||||
def train_xgboost_model(X_train, y_train, X_test, y_test):
|
||||
"""
|
||||
训练XGBoost模型
|
||||
"""
|
||||
print("开始训练XGBoost模型...")
|
||||
# 手动读取CSV文件并自动填充缺失字段
|
||||
def read_csv_with_padding(file_path):
|
||||
print(f"开始读取 {file_path}...")
|
||||
max_cols = 0
|
||||
rows = []
|
||||
|
||||
# 处理数据中可能存在的NaN值
|
||||
print("检查并填充缺失值...")
|
||||
X_train = X_train.fillna(0)
|
||||
X_test = X_test.fillna(0)
|
||||
# 首先确定最大列数
|
||||
with open(file_path, 'r', encoding='latin1', errors='replace') as f:
|
||||
csv_reader = csv.reader(f)
|
||||
for row in csv_reader:
|
||||
max_cols = max(max_cols, len(row))
|
||||
rows.append(row)
|
||||
|
||||
# 检查是否还有无限值,并将其替换为0
|
||||
X_train = X_train.replace([np.inf, -np.inf], 0)
|
||||
X_test = X_test.replace([np.inf, -np.inf], 0)
|
||||
print(f"文件 {file_path} 最大列数: {max_cols}")
|
||||
|
||||
print(f"处理后的训练数据形状: {X_train.shape}")
|
||||
print(f"处理后的测试数据形状: {X_test.shape}")
|
||||
# 为每一行填充缺失的字段
|
||||
padded_rows = []
|
||||
for row in rows:
|
||||
# 如果行长度小于最大列数,用'0'填充
|
||||
padded_row = row + ['0'] * (max_cols - len(row))
|
||||
padded_rows.append(padded_row)
|
||||
|
||||
# 设置XGBoost参数
|
||||
params = {
|
||||
'max_depth': 6, # 树的最大深度
|
||||
'learning_rate': 0.1, # 学习率
|
||||
'n_estimators': 100, # 树的数量
|
||||
'objective': 'binary:logistic', # 二分类问题
|
||||
'eval_metric': 'logloss', # 评估指标
|
||||
'subsample': 0.8, # 样本采样率
|
||||
'colsample_bytree': 0.8, # 特征采样率
|
||||
'random_state': 42 # 随机种子
|
||||
}
|
||||
|
||||
# 创建XGBoost分类器
|
||||
model = xgb.XGBClassifier(**params)
|
||||
|
||||
# 训练模型
|
||||
model.fit(
|
||||
X_train, y_train,
|
||||
eval_set=[(X_train, y_train), (X_test, y_test)],
|
||||
early_stopping_rounds=10,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
print("模型训练完成!")
|
||||
return model
|
||||
# 转换为DataFrame
|
||||
df = pd.DataFrame(padded_rows)
|
||||
print(f"读取 {file_path} 完成,形状: {df.shape}")
|
||||
return df
|
||||
|
||||
def evaluate_model(model, X_test, y_test):
|
||||
"""
|
||||
评估模型性能
|
||||
"""
|
||||
print("评估模型性能...")
|
||||
|
||||
# 在测试集上进行预测
|
||||
y_pred = model.predict(X_test)
|
||||
|
||||
# 计算准确率
|
||||
accuracy = accuracy_score(y_test, y_pred)
|
||||
print(f"准确率: {accuracy:.4f}")
|
||||
|
||||
# 打印分类报告
|
||||
print("\n分类报告:")
|
||||
print(classification_report(y_test, y_pred, target_names=['白名单', '恶意软件']))
|
||||
|
||||
# 打印混淆矩阵
|
||||
cm = confusion_matrix(y_test, y_pred)
|
||||
plt.figure(figsize=(8, 6))
|
||||
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
|
||||
xticklabels=['白名单', '恶意软件'],
|
||||
yticklabels=['白名单', '恶意软件'])
|
||||
plt.xlabel('预测')
|
||||
plt.ylabel('实际')
|
||||
plt.title('混淆矩阵')
|
||||
plt.savefig('confusion_matrix.png')
|
||||
plt.close()
|
||||
|
||||
# 显示特征重要性
|
||||
plt.figure(figsize=(12, 8))
|
||||
xgb.plot_importance(model, max_num_features=20)
|
||||
plt.title('特征重要性')
|
||||
plt.savefig('feature_importance.png')
|
||||
plt.close()
|
||||
|
||||
return accuracy
|
||||
# 读取CSV文件
|
||||
malware_data = read_csv_with_padding(malware_csv)
|
||||
whitelist_data = read_csv_with_padding(whitelist_csv)
|
||||
|
||||
def save_model(model, output_path='xgboost_malware_detector.model'):
|
||||
"""
|
||||
保存模型到文件
|
||||
"""
|
||||
print(f"保存模型到 {output_path}")
|
||||
joblib.dump(model, output_path)
|
||||
print("模型保存完成!")
|
||||
# 删除第一列(路径列)
|
||||
malware_data = malware_data.iloc[:, 1:]
|
||||
whitelist_data = whitelist_data.iloc[:, 1:]
|
||||
|
||||
def main():
    """
    Entry point: load the data, train the model, evaluate it, and save it.

    Each stage reports its own failure message. A failure while loading,
    splitting or training aborts the run; a failure while evaluating is
    reported and the run still proceeds to saving the model.
    """
    try:
        print("开始恶意软件检测模型训练...")

        # Input feature files.
        malware_path = 'data/malware_features.csv'
        whitelist_path = 'data/whitelist_features.csv'

        # Bail out early if either input file is missing.
        if not os.path.exists(malware_path):
            print(f"错误: 找不到恶意软件特征文件 {malware_path}")
            return

        if not os.path.exists(whitelist_path):
            print(f"错误: 找不到白名单特征文件 {whitelist_path}")
            return

        # Load features and labels.
        X, y = load_data(malware_path, whitelist_path)

        if X is None or y is None:
            print("数据加载失败,终止训练")
            return

        print(f"数据集加载完成,共 {len(X)} 个样本")

        # Stratified 80/20 train/test split.
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)

            print(f"训练集: {len(X_train)} 样本,测试集: {len(X_test)} 样本")
        except Exception as err:
            print(f"数据划分出错: {err}")
            return

        # Train.
        try:
            model = train_xgboost_model(X_train, y_train, X_test, y_test)
        except Exception as err:
            print(f"模型训练出错: {err}")
            return

        # Evaluate; a failure here is reported but does not stop the run.
        try:
            evaluate_model(model, X_test, y_test)
        except Exception as err:
            print(f"模型评估出错: {err}")

        # Save.
        try:
            save_model(model)
            print("模型训练和评估完成!")
        except Exception as err:
            print(f"模型保存出错: {err}")

    except Exception as err:
        # Last-resort handler for anything the stage-level handlers missed.
        print(f"训练过程中发生未预期错误: {err}")
|
||||
# 将所有列转换为数值类型,非数值将转为NaN
|
||||
for col in malware_data.columns:
|
||||
malware_data[col] = pd.to_numeric(malware_data[col], errors='coerce')
|
||||
for col in whitelist_data.columns:
|
||||
whitelist_data[col] = pd.to_numeric(whitelist_data[col], errors='coerce')
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
# 用0填充NaN值
|
||||
malware_data.fillna(0, inplace=True)
|
||||
whitelist_data.fillna(0, inplace=True)
|
||||
|
||||
# 找到最大列数(最长的特征向量)
|
||||
max_cols = max(malware_data.shape[1], whitelist_data.shape[1])
|
||||
|
||||
# 用 0 填充(Padding)数据,使所有样本的列数相同
|
||||
malware_data = malware_data.reindex(columns=range(max_cols), fill_value=0)
|
||||
whitelist_data = whitelist_data.reindex(columns=range(max_cols), fill_value=0)
|
||||
|
||||
# 添加标签
|
||||
malware_data['label'] = 1 # 恶意软件
|
||||
whitelist_data['label'] = 0 # 白名单(正常)
|
||||
print(malware_data.head())
|
||||
print(whitelist_data.head())
|
||||
|
||||
# 合并数据
|
||||
combined_data = pd.concat([malware_data, whitelist_data], ignore_index=True)
|
||||
print(f"合并后数据形状: {combined_data.shape}")
|
||||
|
||||
# 分离特征和标签
|
||||
X = combined_data.drop('label', axis=1)
|
||||
y = combined_data['label']
|
||||
|
||||
# 分割数据集
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
||||
print(f"训练集形状: {X_train.shape}, 测试集形状: {X_test.shape}")
|
||||
|
||||
# 创建 XGBoost 数据集
|
||||
dtrain = xgb.DMatrix(X_train, label=y_train)
|
||||
dtest = xgb.DMatrix(X_test, label=y_test)
|
||||
|
||||
# 训练 XGBoost 模型
|
||||
num_rounds = 30
|
||||
# 创建watchlist来监控训练和验证集的性能
|
||||
watchlist = [(dtrain, '训练集'), (dtest, '验证集')]
|
||||
pos_ratio = np.mean(y_train) # 计算 1 的比例
|
||||
|
||||
clf = XGBClassifier(
|
||||
base_score=pos_ratio, #
|
||||
|
||||
objective='binary:logistic', # 适用于二分类
|
||||
max_depth=6, # 树的最大深度
|
||||
learning_rate=0.1, # 学习率
|
||||
n_estimators=100, # 迭代轮数
|
||||
subsample=0.8, # 采样比例,防止过拟合
|
||||
colsample_bytree=0.8,
|
||||
use_label_encoder=False, # 关闭 XGBoost 的 label 编码 (适用于新版本)
|
||||
eval_metric='logloss' # 交叉熵损失
|
||||
)
|
||||
clf.fit(X_train, y_train)
|
||||
|
||||
# 预测
|
||||
# clf.predict() already returns hard 0/1 labels, so thresholding its output
# was a no-op. Take the positive-class (label 1) probability from
# predict_proba() so the 0.5 threshold below is applied to real probabilities.
y_pred_prob = clf.predict_proba(X_test)[:, 1]
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_prob]
|
||||
|
||||
# 计算准确率
|
||||
accuracy = accuracy_score(y_test, y_pred)
|
||||
print(f'XGBoost 分类准确率: {accuracy:.4f}')
|
||||
code = m2c.export_to_c(clf)
|
||||
output_file = "malware_detector.cpp"
|
||||
with open(output_file, "w") as f:
|
||||
f.write(code)
|
||||
|
||||
BIN
ml/xgboost_malware_detector.model
Normal file
BIN
ml/xgboost_malware_detector.model
Normal file
Binary file not shown.
182
readme.md
182
readme.md
@@ -1,118 +1,140 @@
|
||||
# PE文件恶意软件检测系统
|
||||
## Preface
|
||||
|
||||
这是一个基于机器学习的PE文件恶意软件检测系统,使用XGBoost算法对PE文件进行分类。
|
||||
**key08 Security** has surpassed **3,000 followers**, meaning that a significant portion of cybersecurity professionals in China are keeping an eye on it. So, it's time for a big project.
|
||||
|
||||
## 功能特点
|
||||
### Why This Project?
|
||||
While working in the domestic cybersecurity field, I realized that **there is still a lot of untapped potential in the overall technical level**. Many people working in cybersecurity might also be interested in how **security software** on their computers actually works. Additionally, some might even dream of developing their **own antivirus software** or see it as their long-term goal.
|
||||
|
||||
- 利用PE文件结构特征进行恶意软件检测
|
||||
- 基于XGBoost机器学习算法
|
||||
- 提供训练和预测功能
|
||||
- 输出详细的分类报告和可视化结果
|
||||
So, I felt there was a need to systematically **document the working principles of an antivirus engine**. While working on this, I noticed that the **information available online is close to zero**. The few available sources only describe outdated technologies like **signature-based scanning and cloud antivirus from before 2006**. Antivirus software seems to be treated like a **black box**.
|
||||
|
||||
## 系统架构
|
||||
To **systematically educate**, rather than spread **misinformation or meme-based security practices** like some other public security accounts, I spent **two days** developing an antivirus engine that aligns with **modern security practices (as of 2025)**.
|
||||
|
||||
该系统包含以下组件:
|
||||
Now, I will explain **how it works, what its weaknesses are**, and at the end of the chapter, I will even **open-source the code**, which can be **compiled directly using Visual Studio**, making **learning more convenient**.
|
||||
|
||||
1. **特征提取模块**:C++编写的特征提取器,分析PE文件结构和行为特征
|
||||
2. **训练模块**:Python编写的模型训练代码,使用XGBoost算法
|
||||
3. **预测模块**:Python编写的模型推理代码,用于检测未知文件
|
||||
> ⚠️ **WARNING:** This code is provided **for learning purposes only**. The **datasets for machine learning, signature analysis, and dynamic behavior detection are extremely small**, so **detection effectiveness is very limited**.
|
||||
>
|
||||
> **Do not use this code for your "bypass AV" tests** and then complain that it fails to detect certain samples. This is **not intended for antivirus evasion testing**.
|
||||
> **If you want to improve it, study the issues yourself instead of copying and pasting the code and then asking why it doesn't work!**
|
||||
|
||||
## 特征集
|
||||
---
|
||||
|
||||
系统从PE文件中提取以下特征:
|
||||
## Classification of Antivirus Engines
|
||||
Currently, all major security vendors promote their so-called **NGAV (Next-Gen Antivirus)**, but in reality, most detection engines fall into these four categories:
|
||||
|
||||
1. PE段属性 (是否有配置、调试信息、例外处理、导出、导入等)
|
||||
2. 导入的DLL库
|
||||
3. 文件熵
|
||||
4. 入口点前64字节的归一化值
|
||||
5. 节区信息 (节区数量、平均熵、最大熵、归一化平均熵、大小比率)
|
||||
6. 代码段与整个文件的比率
|
||||
7. 节区数量
|
||||
1. **Cloud-Based Detection**
|
||||
- This includes:
|
||||
- **Fuzzy hashing engines** (such as `ssdeep`, `simhash`, etc.), which are used to **compare the similarity of files** (some vendors call this **"virus DNA"**).
|
||||
- **Traditional hash-based engines**, which rely on **SHA1, SHA256**, etc.
|
||||
- **Various cloud-based sandbox, manual or automated analysis systems**.
|
||||
|
||||
## 环境要求
|
||||
2. **Signature-Based Detection**
|
||||
3. **AI & Machine Learning-Based Detection**
|
||||
4. **Heuristic-Based Sandbox Detection**
|
||||
|
||||
- Python 3.7+
|
||||
- 依赖包:
|
||||
- pandas
|
||||
- numpy
|
||||
- xgboost
|
||||
- scikit-learn
|
||||
- matplotlib
|
||||
- seaborn
|
||||
- joblib
|
||||
Cloud-based engines are **extremely complex** and are typically a **core capability of each security company**, so **we won't discuss their implementation here** (except for those who simply use **VirusTotal (VT) as their cloud engine**).
|
||||
|
||||
安装依赖:
|
||||
That leaves **categories 2, 3, and 4**, which are typically combined in AV solutions.
|
||||
|
||||
```bash
|
||||
pip install pandas numpy xgboost scikit-learn matplotlib seaborn joblib
|
||||
```
|
||||
Each has its own strengths and weaknesses:
|
||||
- **Signature-Based Detection**: Does **not** have heuristic capabilities and **fully relies on manual rule creation**, but it is the **most effective**. Each security vendor's detection capabilities **heavily rely on their signature database**.
|
||||
- **Heuristic-Based Sandbox Detection**: Has **weak detection capabilities**, is **easily bypassed**, and **lags behind evolving threats**. It also tends to generate **false positives**.
|
||||
- **AI/Machine Learning-Based Detection**: Provides **high detection rates** but also produces **high false positive rates**, often **negatively impacting business operations** (e.g., compiling a simple **Hello World!** application in **Visual Studio** might trigger an alert). **Many AI-based engines are overly aggressive** and flag almost anything **without a digital signature**.
|
||||
|
||||
## 使用说明
|
||||
---
|
||||
|
||||
### 1. 准备数据
|
||||
## What Are We Going to Build?
|
||||
Today, we will create **a combined Machine Learning + Behavior-Based Sandbox Engine**.
|
||||
|
||||
需要准备两个CSV文件:
|
||||
- `malware.csv`:恶意软件样本的特征数据
|
||||
- `whitelist.csv`:正常软件样本的特征数据
|
||||
We are **not** implementing a **signature-based engine** because that would be **too simple** (if you're interested in signature matching, check out **YARA**).
|
||||
|
||||
这些CSV文件由C++特征提取模块生成。
|
||||
The overall engine structure is as follows:
|
||||

|
||||
|
||||
### 2. 训练模型
|
||||
We need to implement **two core modules**:
|
||||
1. **Sandbox Behavior Analysis Module**
|
||||
2. **Machine Learning-Based Detection Module**
|
||||
|
||||
运行以下命令进行模型训练:
|
||||
We will **introduce each module step by step**.
|
||||
|
||||
```bash
|
||||
python train_model.py
|
||||
```
|
||||
---
|
||||
|
||||
训练结果将保存为`xgboost_malware_detector.model`文件,并生成性能评估图表:
|
||||
- `confusion_matrix.png`:混淆矩阵
|
||||
- `feature_importance.png`:特征重要性排序
|
||||
## Sandbox Module
|
||||
A **sandbox module** is typically used for **unpacking and behavior analysis**. Essentially, it is a **PE file emulator**.
|
||||
|
||||
### 3. 预测未知文件
|
||||
In our system, we use **Unicorn Engine** to **simulate CPU execution**. **Unicorn Engine** is a **lightweight**, **cross-platform** CPU emulation framework that **supports multiple architectures**, including **MIPS, ARM, PowerPC, x86, and x64**. It is based on **QEMU** and was first introduced at **Black Hat USA 2015** by **Nguyen Anh Quynh and Dang Hoang Vu**.
|
||||
|
||||
使用训练好的模型预测未知文件:
|
||||
### Main Steps of the Sandbox:
|
||||
1. **Initialize the Emulation Environment**
|
||||
- Relocate PE file sections
|
||||
- Setup stack memory
|
||||
- Initialize `Unicorn Engine` and allocate virtual memory
|
||||
- Map the PE file into the virtual environment
|
||||
- Load required DLLs into the virtual machine
|
||||
- Hook critical DLL functions to monitor behavior
|
||||
- Set up essential handles, stack, **PEB**, **TEB**, etc.
|
||||
- Store important PE metadata for unpacking
|
||||
|
||||
```bash
|
||||
python predict.py <csv文件路径1> [csv文件路径2] ...
|
||||
```
|
||||
2. **Relocation Processing**
|
||||
- If a **PE file contains a relocation table** and is not loaded at its preferred image base, the Windows loader **patches the absolute addresses listed in that table** before execution.
|
||||
|
||||
预测结果将保存为`*_predictions.csv`文件。
|
||||
3. **Memory and Stack Allocation**
|
||||
- The **stack memory** must be fully emulated for the execution environment.
|
||||
|
||||
## 示例
|
||||
4. **Mapping PE Sections into Memory**
|
||||
- A **PE file's size on disk differs from its actual size when loaded in memory**.
|
||||
- We must **expand** it and **map each section accordingly**.
|
||||
|
||||
```bash
|
||||
# 训练模型
|
||||
python train_model.py
|
||||
5. **Load Required DLLs**
|
||||
- **Parse the Import Table** and **map necessary DLLs** into our virtual machine.
|
||||
|
||||
# 预测单个文件
|
||||
python predict.py unknown_samples.csv
|
||||
6. **Intercept API Calls**
|
||||
- Hook **imported API functions**.
|
||||
|
||||
# 批量预测多个文件
|
||||
python predict.py file1.csv file2.csv file3.csv
|
||||
```
|
||||
7. **Shellcode & Packed Malware Detection**
|
||||
- Monitor for **self-modifying code execution**, which indicates **packed malware**.
|
||||
|
||||
## 性能指标
|
||||
8. **Behavior-Based Detection**
|
||||
- Detect suspicious behavior, such as:
|
||||
- **Downloading executable files via `WinHttp`**
|
||||
- **Excessive `sleep` delays**
|
||||
- **Accessing sensitive directories**
|
||||
- **Direct access to `LDR` structures** (used to detect stealth malware)
|
||||
|
||||
在测试数据集上,该系统通常能达到以下性能:
|
||||
### Sandbox Performance:
|
||||
Here's an example detection result:
|
||||

|
||||
|
||||
- 准确率:95%+
|
||||
- 召回率:90%+
|
||||
- 精确率:92%+
|
||||
- F1值:91%+
|
||||
---
|
||||
|
||||
_注意:实际性能可能因训练数据和参数设置而异。_
|
||||
## Machine Learning Module
|
||||
The **machine learning module** is used to classify files based on extracted PE features.
|
||||
|
||||
## 扩展与优化
|
||||
### Feature Engineering:
|
||||
We extract the following feature sets:
|
||||
1. **PE Header Features** (Presence of Import Tables, TLS sections, relocations, etc.)
|
||||
2. **Imported DLLs** (Checks for specific suspicious DLLs)
|
||||
3. **File Entropy** (Measures randomness)
|
||||
4. **Entry Point Byte Sequence** (Examines the first 64 bytes of code)
|
||||
5. **Section Analysis** (Checks PE section sizes and entropy)
|
||||
6. **Code-to-File Size Ratio** (Compares code section size vs. total PE file size)
|
||||
|
||||
系统可以进行以下扩展和优化:
|
||||
### Training Data:
|
||||
We collected **1,000 benign samples** and **1,000 malicious samples**, saved their features into a **CSV file**, and used them for training.
|
||||
|
||||
1. 添加更多特征,如字符串分析、API调用序列等
|
||||
2. 尝试其他机器学习算法或深度学习模型
|
||||
3. 集成多个模型进行综合决策
|
||||
4. 开发实时监控和检测功能
|
||||
5. 增加可解释性分析
|
||||

|
||||
|
||||
## License
|
||||
> ⚠️ **NOTE:** The dataset is **too small** for real-world performance. A proper dataset should have at least **100,000+ benign and 100,000+ malicious samples**.
|
||||
|
||||
MIT
|
||||
### Model Training:
|
||||
We use **XGBoost** for training and then export the trained model to **pure C++ code** using **m2cgen**.
|
||||
|
||||

|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
This is a **basic but modern antivirus engine** using **sandbox-based behavior analysis** and **machine learning-based detection**.
|
||||
|
||||
The **full source code** is available on **GitHub** (link below). 🚀
|
||||
|
||||
🔗 **GitHub Repository:** [INSERT LINK HERE]
|
||||
Reference in New Issue
Block a user