1
This commit is contained in:
@@ -340,6 +340,7 @@ std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
|
|||||||
peInfo.characteristics = ntHeaders64->FileHeader.Characteristics;
|
peInfo.characteristics = ntHeaders64->FileHeader.Characteristics;
|
||||||
peInfo.dllCharacteristics =
|
peInfo.dllCharacteristics =
|
||||||
ntHeaders64->OptionalHeader.DllCharacteristics;
|
ntHeaders64->OptionalHeader.DllCharacteristics;
|
||||||
|
peInfo.hasImageBase = ntHeaders64->OptionalHeader.ImageBase != 0;
|
||||||
} else {
|
} else {
|
||||||
// 32位PE文件
|
// 32位PE文件
|
||||||
PIMAGE_NT_HEADERS32 ntHeaders32 = (PIMAGE_NT_HEADERS32)ntHeaders;
|
PIMAGE_NT_HEADERS32 ntHeaders32 = (PIMAGE_NT_HEADERS32)ntHeaders;
|
||||||
@@ -352,6 +353,7 @@ std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
|
|||||||
peInfo.characteristics = ntHeaders32->FileHeader.Characteristics;
|
peInfo.characteristics = ntHeaders32->FileHeader.Characteristics;
|
||||||
peInfo.dllCharacteristics =
|
peInfo.dllCharacteristics =
|
||||||
ntHeaders32->OptionalHeader.DllCharacteristics;
|
ntHeaders32->OptionalHeader.DllCharacteristics;
|
||||||
|
peInfo.hasImageBase = ntHeaders32->OptionalHeader.ImageBase != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 检查PE目录
|
// 检查PE目录
|
||||||
@@ -398,8 +400,6 @@ std::vector<double> MachineLearning::ExtractFeatures(const uint8_t* buffer,
|
|||||||
IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT);
|
IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT);
|
||||||
peInfo.hasDelayImports = dataDir && dataDir->VirtualAddress != 0;
|
peInfo.hasDelayImports = dataDir && dataDir->VirtualAddress != 0;
|
||||||
|
|
||||||
peInfo.hasImageBase = true; // PE文件都有ImageBase
|
|
||||||
|
|
||||||
dataDir = peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_IAT);
|
dataDir = peconv::get_directory_entry(peBuffer, IMAGE_DIRECTORY_ENTRY_IAT);
|
||||||
peInfo.hasEntryIat = dataDir && dataDir->VirtualAddress != 0;
|
peInfo.hasEntryIat = dataDir && dataDir->VirtualAddress != 0;
|
||||||
|
|
||||||
@@ -544,9 +544,12 @@ std::vector<double> MachineLearning::EncodeEntrypoint(
|
|||||||
const std::vector<uint8_t>& epBytes) {
|
const std::vector<uint8_t>& epBytes) {
|
||||||
std::vector<double> features;
|
std::vector<double> features;
|
||||||
|
|
||||||
|
// 只使用前64个字节,确保特征数量固定
|
||||||
|
size_t bytesToUse = std::min<size_t>(64, epBytes.size());
|
||||||
|
|
||||||
// 原始字节转为浮点值(按Python代码中的normalize处理)
|
// 原始字节转为浮点值(按Python代码中的normalize处理)
|
||||||
for (const auto& byte : epBytes) {
|
for (size_t i = 0; i < bytesToUse; i++) {
|
||||||
features.push_back(static_cast<double>(byte) / 255.0);
|
features.push_back(static_cast<double>(epBytes[i]) / 255.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 填充至64字节长度
|
// 填充至64字节长度
|
||||||
@@ -743,34 +746,49 @@ std::vector<uint8_t> MachineLearning::ReadFileToBuffer(
|
|||||||
|
|
||||||
bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
|
bool MachineLearning::ProcessDirectory(const std::string& directoryPath,
|
||||||
const std::string& outputCsvPath) {
|
const std::string& outputCsvPath) {
|
||||||
// 打开CSV文件用于写入
|
// 检查文件是否已存在
|
||||||
std::ofstream csvFile(outputCsvPath);
|
bool fileExists = std::filesystem::exists(outputCsvPath);
|
||||||
|
|
||||||
|
// 打开CSV文件用于写入,如果文件已存在则使用追加模式
|
||||||
|
std::ofstream csvFile;
|
||||||
|
if (fileExists) {
|
||||||
|
csvFile.open(outputCsvPath, std::ios::app);
|
||||||
|
} else {
|
||||||
|
csvFile.open(outputCsvPath);
|
||||||
|
}
|
||||||
|
|
||||||
if (!csvFile.is_open()) {
|
if (!csvFile.is_open()) {
|
||||||
std::cerr << "无法创建CSV文件: " << outputCsvPath << std::endl;
|
std::cerr << "无法创建或打开CSV文件: " << outputCsvPath << std::endl;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 仅在文件不存在时写入CSV标题行
|
||||||
/*
|
/*
|
||||||
// 写入CSV标题行
|
if (!fileExists) {
|
||||||
csvFile << "文件路径";
|
|
||||||
for (size_t i = 0; i < _properties.size(); i++) {
|
// 写入CSV标题行
|
||||||
csvFile << ",属性_" << i;
|
csvFile << "文件路径";
|
||||||
|
for (size_t i = 0; i < _properties.size(); i++) {
|
||||||
|
csvFile << ",属性_" << i;
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < _libraries.size(); i++) {
|
||||||
|
csvFile << ",库_" << i;
|
||||||
|
}
|
||||||
|
csvFile << ",文件熵";
|
||||||
|
for (size_t i = 0; i < 64; i++) { // 前64个字节特征
|
||||||
|
csvFile << ",EP_" << i;
|
||||||
|
}
|
||||||
|
csvFile << ",节区数";
|
||||||
|
csvFile << ",平均熵";
|
||||||
|
csvFile << ",最大熵";
|
||||||
|
csvFile << ",归一化平均熵";
|
||||||
|
csvFile << ",节区大小比率";
|
||||||
|
csvFile << ",代码比率";
|
||||||
|
csvFile << ",节区计数";
|
||||||
|
csvFile << std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < _libraries.size(); i++) {
|
*/
|
||||||
csvFile << ",库_" << i;
|
|
||||||
}
|
|
||||||
csvFile << ",文件熵";
|
|
||||||
for (size_t i = 0; i < 64; i++) { // 前64个字节特征
|
|
||||||
csvFile << ",EP_" << i;
|
|
||||||
}
|
|
||||||
csvFile << ",节区数";
|
|
||||||
csvFile << ",平均熵";
|
|
||||||
csvFile << ",最大熵";
|
|
||||||
csvFile << ",归一化平均熵";
|
|
||||||
csvFile << ",节区大小比率";
|
|
||||||
csvFile << ",代码比率";
|
|
||||||
csvFile << ",节区计数";
|
|
||||||
csvFile << std::endl;
|
|
||||||
*/
|
|
||||||
// 递归遍历目录
|
// 递归遍历目录
|
||||||
WIN32_FIND_DATAA findData;
|
WIN32_FIND_DATAA findData;
|
||||||
std::string searchPath = directoryPath + "\\*";
|
std::string searchPath = directoryPath + "\\*";
|
||||||
|
|||||||
99
ml/predict.py
Normal file
99
ml/predict.py
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import joblib
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
def load_model(model_path='xgboost_malware_detector.model'):
|
||||||
|
"""
|
||||||
|
加载训练好的模型
|
||||||
|
"""
|
||||||
|
print(f"正在加载模型: {model_path}")
|
||||||
|
try:
|
||||||
|
model = joblib.load(model_path)
|
||||||
|
print("模型加载成功!")
|
||||||
|
return model
|
||||||
|
except Exception as e:
|
||||||
|
print(f"模型加载失败: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def predict_file(model, csv_path):
|
||||||
|
"""
|
||||||
|
对单个CSV文件进行预测
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# 加载CSV文件
|
||||||
|
df = pd.read_csv(csv_path)
|
||||||
|
|
||||||
|
# 提取特征 (除去第一列文件路径)
|
||||||
|
features = df.iloc[:, 1:]
|
||||||
|
|
||||||
|
# 使用模型预测
|
||||||
|
predictions = model.predict(features)
|
||||||
|
probabilities = model.predict_proba(features)
|
||||||
|
|
||||||
|
# 添加预测结果到数据框
|
||||||
|
df['预测标签'] = predictions
|
||||||
|
df['恶意软件概率'] = probabilities[:, 1]
|
||||||
|
|
||||||
|
# 创建结果数据框
|
||||||
|
results = pd.DataFrame({
|
||||||
|
'文件路径': df.iloc[:, 0],
|
||||||
|
'预测标签': predictions,
|
||||||
|
'恶意软件概率': probabilities[:, 1]
|
||||||
|
})
|
||||||
|
|
||||||
|
# 保存结果到CSV
|
||||||
|
output_path = os.path.splitext(csv_path)[0] + '_predictions.csv'
|
||||||
|
results.to_csv(output_path, index=False)
|
||||||
|
print(f"预测结果已保存到: {output_path}")
|
||||||
|
|
||||||
|
# 打印概要
|
||||||
|
malware_count = len(results[results['预测标签'] == 1])
|
||||||
|
total_count = len(results)
|
||||||
|
print(f"总样本数: {total_count}")
|
||||||
|
print(f"检测为恶意软件: {malware_count} ({malware_count/total_count*100:.2f}%)")
|
||||||
|
print(f"检测为白名单软件: {total_count - malware_count} ({(total_count-malware_count)/total_count*100:.2f}%)")
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"预测失败: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def batch_predict(model, csv_paths):
|
||||||
|
"""
|
||||||
|
批量预测多个CSV文件
|
||||||
|
"""
|
||||||
|
results = {}
|
||||||
|
for csv_path in csv_paths:
|
||||||
|
print(f"\n分析文件: {csv_path}")
|
||||||
|
result = predict_file(model, csv_path)
|
||||||
|
if result is not None:
|
||||||
|
results[csv_path] = result
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""
|
||||||
|
主函数
|
||||||
|
"""
|
||||||
|
# 检查命令行参数
|
||||||
|
if len(sys.argv) < 2:
|
||||||
|
print("使用方法: python predict.py <csv文件路径1> [csv文件路径2] ...")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 加载模型
|
||||||
|
model = load_model()
|
||||||
|
if model is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
# 批量预测
|
||||||
|
csv_paths = sys.argv[1:]
|
||||||
|
batch_predict(model, csv_paths)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
264
ml/train_model.py
Normal file
264
ml/train_model.py
Normal file
@@ -0,0 +1,264 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import xgboost as xgb
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
import os
|
||||||
|
import joblib
|
||||||
|
|
||||||
|
def load_data(malware_csv, whitelist_csv):
|
||||||
|
"""
|
||||||
|
加载恶意软件和白名单CSV文件
|
||||||
|
"""
|
||||||
|
print(f"加载恶意软件数据: {malware_csv}")
|
||||||
|
|
||||||
|
# 预处理:先获取CSV的列数
|
||||||
|
# 读取第一行以确定正确的列数
|
||||||
|
try:
|
||||||
|
header = pd.read_csv(malware_csv, nrows=1)
|
||||||
|
expected_columns = len(header.columns)
|
||||||
|
print(f"预期列数: {expected_columns}")
|
||||||
|
|
||||||
|
# 使用自定义函数读取CSV,处理字段不足的行
|
||||||
|
malware_df = pd.read_csv(
|
||||||
|
malware_csv,
|
||||||
|
header=0,
|
||||||
|
low_memory=False,
|
||||||
|
on_bad_lines='skip', # 跳过无法解析的行
|
||||||
|
dtype=float, # 将所有数据列转为浮点型
|
||||||
|
converters={0: str} # 第一列为文件路径,保持为字符串类型
|
||||||
|
)
|
||||||
|
|
||||||
|
# 检查列数是否不足,如果不足则填充0
|
||||||
|
actual_columns = len(malware_df.columns)
|
||||||
|
if actual_columns < expected_columns:
|
||||||
|
for i in range(actual_columns, expected_columns):
|
||||||
|
col_name = f"col_{i}"
|
||||||
|
malware_df[col_name] = 0.0
|
||||||
|
|
||||||
|
print(f"成功读取恶意软件数据,形状: {malware_df.shape}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"读取恶意软件数据时出错: {e}")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
malware_df['label'] = 1 # 恶意软件标签为1
|
||||||
|
|
||||||
|
print(f"加载白名单数据: {whitelist_csv}")
|
||||||
|
try:
|
||||||
|
# 同样处理白名单数据
|
||||||
|
whitelist_df = pd.read_csv(
|
||||||
|
whitelist_csv,
|
||||||
|
header=0,
|
||||||
|
low_memory=False,
|
||||||
|
on_bad_lines='skip',
|
||||||
|
dtype=float,
|
||||||
|
converters={0: str}
|
||||||
|
)
|
||||||
|
|
||||||
|
# 确保列数与恶意软件数据一致
|
||||||
|
whitelist_cols = len(whitelist_df.columns)
|
||||||
|
malware_cols = len(malware_df.columns) - 1 # 减去标签列
|
||||||
|
|
||||||
|
if whitelist_cols < malware_cols:
|
||||||
|
for i in range(whitelist_cols, malware_cols):
|
||||||
|
col_name = f"col_{i}"
|
||||||
|
whitelist_df[col_name] = 0.0
|
||||||
|
|
||||||
|
print(f"成功读取白名单数据,形状: {whitelist_df.shape}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"读取白名单数据时出错: {e}")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
whitelist_df['label'] = 0 # 白名单软件标签为0
|
||||||
|
|
||||||
|
# 确保两个DataFrame的列完全一致(除了可能的文件路径差异)
|
||||||
|
malware_features = set(malware_df.columns)
|
||||||
|
whitelist_features = set(whitelist_df.columns)
|
||||||
|
|
||||||
|
# 找出不同的列
|
||||||
|
malware_only = malware_features - whitelist_features
|
||||||
|
whitelist_only = whitelist_features - malware_features
|
||||||
|
|
||||||
|
# 为缺少的列添加0值
|
||||||
|
for col in malware_only:
|
||||||
|
if col != 'label':
|
||||||
|
whitelist_df[col] = 0.0
|
||||||
|
|
||||||
|
for col in whitelist_only:
|
||||||
|
if col != 'label':
|
||||||
|
malware_df[col] = 0.0
|
||||||
|
|
||||||
|
# 合并数据
|
||||||
|
combined_df = pd.concat([malware_df, whitelist_df], ignore_index=True, sort=False)
|
||||||
|
|
||||||
|
# 第一列通常是文件路径,需要将其移除
|
||||||
|
# 先保存文件路径以便后续参考
|
||||||
|
file_paths = combined_df.iloc[:, 0].tolist()
|
||||||
|
|
||||||
|
features = combined_df.iloc[:, 1:-1] # 除去第一列(文件路径)和最后一列(标签)
|
||||||
|
labels = combined_df['label']
|
||||||
|
|
||||||
|
print(f"数据加载完成: {len(malware_df)} 个恶意样本, {len(whitelist_df)} 个白名单样本")
|
||||||
|
print(f"特征维度: {features.shape}")
|
||||||
|
|
||||||
|
return features, labels
|
||||||
|
|
||||||
|
def train_xgboost_model(X_train, y_train, X_test, y_test):
|
||||||
|
"""
|
||||||
|
训练XGBoost模型
|
||||||
|
"""
|
||||||
|
print("开始训练XGBoost模型...")
|
||||||
|
|
||||||
|
# 处理数据中可能存在的NaN值
|
||||||
|
print("检查并填充缺失值...")
|
||||||
|
X_train = X_train.fillna(0)
|
||||||
|
X_test = X_test.fillna(0)
|
||||||
|
|
||||||
|
# 检查是否还有无限值,并将其替换为0
|
||||||
|
X_train = X_train.replace([np.inf, -np.inf], 0)
|
||||||
|
X_test = X_test.replace([np.inf, -np.inf], 0)
|
||||||
|
|
||||||
|
print(f"处理后的训练数据形状: {X_train.shape}")
|
||||||
|
print(f"处理后的测试数据形状: {X_test.shape}")
|
||||||
|
|
||||||
|
# 设置XGBoost参数
|
||||||
|
params = {
|
||||||
|
'max_depth': 6, # 树的最大深度
|
||||||
|
'learning_rate': 0.1, # 学习率
|
||||||
|
'n_estimators': 100, # 树的数量
|
||||||
|
'objective': 'binary:logistic', # 二分类问题
|
||||||
|
'eval_metric': 'logloss', # 评估指标
|
||||||
|
'subsample': 0.8, # 样本采样率
|
||||||
|
'colsample_bytree': 0.8, # 特征采样率
|
||||||
|
'random_state': 42 # 随机种子
|
||||||
|
}
|
||||||
|
|
||||||
|
# 创建XGBoost分类器
|
||||||
|
model = xgb.XGBClassifier(**params)
|
||||||
|
|
||||||
|
# 训练模型
|
||||||
|
model.fit(
|
||||||
|
X_train, y_train,
|
||||||
|
eval_set=[(X_train, y_train), (X_test, y_test)],
|
||||||
|
early_stopping_rounds=10,
|
||||||
|
verbose=True
|
||||||
|
)
|
||||||
|
|
||||||
|
print("模型训练完成!")
|
||||||
|
return model
|
||||||
|
|
||||||
|
def evaluate_model(model, X_test, y_test):
|
||||||
|
"""
|
||||||
|
评估模型性能
|
||||||
|
"""
|
||||||
|
print("评估模型性能...")
|
||||||
|
|
||||||
|
# 在测试集上进行预测
|
||||||
|
y_pred = model.predict(X_test)
|
||||||
|
|
||||||
|
# 计算准确率
|
||||||
|
accuracy = accuracy_score(y_test, y_pred)
|
||||||
|
print(f"准确率: {accuracy:.4f}")
|
||||||
|
|
||||||
|
# 打印分类报告
|
||||||
|
print("\n分类报告:")
|
||||||
|
print(classification_report(y_test, y_pred, target_names=['白名单', '恶意软件']))
|
||||||
|
|
||||||
|
# 打印混淆矩阵
|
||||||
|
cm = confusion_matrix(y_test, y_pred)
|
||||||
|
plt.figure(figsize=(8, 6))
|
||||||
|
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
|
||||||
|
xticklabels=['白名单', '恶意软件'],
|
||||||
|
yticklabels=['白名单', '恶意软件'])
|
||||||
|
plt.xlabel('预测')
|
||||||
|
plt.ylabel('实际')
|
||||||
|
plt.title('混淆矩阵')
|
||||||
|
plt.savefig('confusion_matrix.png')
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
# 显示特征重要性
|
||||||
|
plt.figure(figsize=(12, 8))
|
||||||
|
xgb.plot_importance(model, max_num_features=20)
|
||||||
|
plt.title('特征重要性')
|
||||||
|
plt.savefig('feature_importance.png')
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
return accuracy
|
||||||
|
|
||||||
|
def save_model(model, output_path='xgboost_malware_detector.model'):
|
||||||
|
"""
|
||||||
|
保存模型到文件
|
||||||
|
"""
|
||||||
|
print(f"保存模型到 {output_path}")
|
||||||
|
joblib.dump(model, output_path)
|
||||||
|
print("模型保存完成!")
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""
|
||||||
|
主函数:加载数据,训练模型,评估结果,保存模型
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
print("开始恶意软件检测模型训练...")
|
||||||
|
|
||||||
|
# 设置文件路径
|
||||||
|
malware_csv = 'data/malware_features.csv'
|
||||||
|
whitelist_csv = 'data/whitelist_features.csv'
|
||||||
|
|
||||||
|
# 检查文件是否存在
|
||||||
|
if not os.path.exists(malware_csv):
|
||||||
|
print(f"错误: 找不到恶意软件特征文件 {malware_csv}")
|
||||||
|
return
|
||||||
|
|
||||||
|
if not os.path.exists(whitelist_csv):
|
||||||
|
print(f"错误: 找不到白名单特征文件 {whitelist_csv}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 加载数据
|
||||||
|
X, y = load_data(malware_csv, whitelist_csv)
|
||||||
|
|
||||||
|
if X is None or y is None:
|
||||||
|
print("数据加载失败,终止训练")
|
||||||
|
return
|
||||||
|
|
||||||
|
print(f"数据集加载完成,共 {len(X)} 个样本")
|
||||||
|
|
||||||
|
# 数据划分
|
||||||
|
try:
|
||||||
|
X_train, X_test, y_train, y_test = train_test_split(
|
||||||
|
X, y, test_size=0.2, random_state=42, stratify=y)
|
||||||
|
|
||||||
|
print(f"训练集: {len(X_train)} 样本,测试集: {len(X_test)} 样本")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"数据划分出错: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 训练模型
|
||||||
|
try:
|
||||||
|
model = train_xgboost_model(X_train, y_train, X_test, y_test)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"模型训练出错: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 评估模型
|
||||||
|
try:
|
||||||
|
evaluate_model(model, X_test, y_test)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"模型评估出错: {e}")
|
||||||
|
|
||||||
|
# 保存模型
|
||||||
|
try:
|
||||||
|
save_model(model)
|
||||||
|
print("模型训练和评估完成!")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"模型保存出错: {e}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"训练过程中发生未预期错误: {e}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
135
readme.md
135
readme.md
@@ -1,29 +1,118 @@
|
|||||||
[2025]从0制作现代启发式AI杀毒引擎,附源码
|
# PE文件恶意软件检测系统
|
||||||
## 前言
|
|
||||||
|
|
||||||
冲鸭安全突破3000粉丝了,应该国内大半个搞安全的人都在看了.所以整个大的活.
|
这是一个基于机器学习的PE文件恶意软件检测系统,使用XGBoost算法对PE文件进行分类。
|
||||||
|
|
||||||
为什么突然想搞这个,因为在做国内安全业务的时候,我意识到,国内的平均技术水平还有很大的挖掘价值.很多人从事安全,可能也对自己的电脑上的安全软件的工作原理感兴趣.也有很多人把做安全软件视为自己的梦想.或者一个努力方向.所以我觉得,有必要花一些时间,系统的整理一下杀毒引擎的工作原理,在整理工作原理的时候我发现网上基本0资料,有也停留在2006年之前什么特征码扫描,云查杀毒.仿佛杀毒软件这玩意就是个黑盒.
|
## 功能特点
|
||||||
|
|
||||||
简而言之,为了系统性的科普知识,而不是其他公众号那种胡言乱语,meme,免杀巫术,我花了大概两天时间,写了一个符合现代(2025年)情况的杀毒引擎.现在我将介绍他是如何工作的.以及他的缺陷是什么.并且在文末我还会开源源码,能直接VS编译.方便大家学习
|
- 利用PE文件结构特征进行恶意软件检测
|
||||||
|
- 基于XGBoost机器学习算法
|
||||||
|
- 提供训练和预测功能
|
||||||
|
- 输出详细的分类报告和可视化结果
|
||||||
|
|
||||||
## 杀毒引擎分类
|
## 系统架构
|
||||||
目前查杀引擎各家瞎吹的什么NGAV无非就这几种:
|
|
||||||
1. 云查引擎
|
|
||||||
这包括:
|
|
||||||
模糊hash引擎(ssdeep,simhash等都算),模糊hash是一种算法,能比较文件相似度(某些PPT叫病毒基因),具体可以看我之前的文章:
|
|
||||||
[2021]余弦定理检测文件相似度 & 病毒样本基因检测
|
|
||||||
https://key08.com/index.php/2021/08/19/1306.html
|
|
||||||
hash base引擎,没什么好说的,基于sha1或者sha256等固定唯一hash
|
|
||||||
背后的各种沙箱/人工/自动机鉴定
|
|
||||||
2. 特征引擎
|
|
||||||
3. ai机器学习引擎
|
|
||||||
4. 启发式沙箱引擎
|
|
||||||
|
|
||||||
云引擎说起来非常复杂,属于是各家的**核心能力**,我们不讨论其实现(某些直接买了VT当云引擎的除外).所以除了1外,2,3,4往往是同时打包在一起的,
|
该系统包含以下组件:
|
||||||
这几个引擎各有特点,比如特征引擎不具备启发能力,纯靠人力堆.启发的沙箱引擎检出弱,很容易被针对,技术落后一代.AI机器学习引擎高检出,但是也高误报,对业务造成很大影响,to B/G基本不开这玩意.
|
|
||||||
## 我们要做什么
|
|
||||||
我们今天所做的,是一个机器学习+沙箱行为检测的引擎,为什么不做特征引擎,因为特征引擎太普通了,如果对其感兴趣的,可以去看yara.
|
|
||||||
整个引擎的构造如下图所示:
|
|
||||||
|
|
||||||
我们需要
|
1. **特征提取模块**:C++编写的特征提取器,分析PE文件结构和行为特征
|
||||||
|
2. **训练模块**:Python编写的模型训练代码,使用XGBoost算法
|
||||||
|
3. **预测模块**:Python编写的模型推理代码,用于检测未知文件
|
||||||
|
|
||||||
|
## 特征集
|
||||||
|
|
||||||
|
系统从PE文件中提取以下特征:
|
||||||
|
|
||||||
|
1. PE段属性 (是否有配置、调试信息、例外处理、导出、导入等)
|
||||||
|
2. 导入的DLL库
|
||||||
|
3. 文件熵
|
||||||
|
4. 入口点前64字节的归一化值
|
||||||
|
5. 节区信息 (节区数量、平均熵、最大熵、归一化平均熵、大小比率)
|
||||||
|
6. 代码段与整个文件的比率
|
||||||
|
7. 节区数量
|
||||||
|
|
||||||
|
## 环境要求
|
||||||
|
|
||||||
|
- Python 3.7+
|
||||||
|
- 依赖包:
|
||||||
|
- pandas
|
||||||
|
- numpy
|
||||||
|
- xgboost
|
||||||
|
- scikit-learn
|
||||||
|
- matplotlib
|
||||||
|
- seaborn
|
||||||
|
- joblib
|
||||||
|
|
||||||
|
安装依赖:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install pandas numpy xgboost scikit-learn matplotlib seaborn joblib
|
||||||
|
```
|
||||||
|
|
||||||
|
## 使用说明
|
||||||
|
|
||||||
|
### 1. 准备数据
|
||||||
|
|
||||||
|
需要准备两个CSV文件:
|
||||||
|
- `malware.csv`:恶意软件样本的特征数据
|
||||||
|
- `whitelist.csv`:正常软件样本的特征数据
|
||||||
|
|
||||||
|
这些CSV文件由C++特征提取模块生成。
|
||||||
|
|
||||||
|
### 2. 训练模型
|
||||||
|
|
||||||
|
运行以下命令进行模型训练:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python train_model.py
|
||||||
|
```
|
||||||
|
|
||||||
|
训练结果将保存为`xgboost_malware_detector.model`文件,并生成性能评估图表:
|
||||||
|
- `confusion_matrix.png`:混淆矩阵
|
||||||
|
- `feature_importance.png`:特征重要性排序
|
||||||
|
|
||||||
|
### 3. 预测未知文件
|
||||||
|
|
||||||
|
使用训练好的模型预测未知文件:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python predict.py <csv文件路径1> [csv文件路径2] ...
|
||||||
|
```
|
||||||
|
|
||||||
|
预测结果将保存为`*_predictions.csv`文件。
|
||||||
|
|
||||||
|
## 示例
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 训练模型
|
||||||
|
python train_model.py
|
||||||
|
|
||||||
|
# 预测单个文件
|
||||||
|
python predict.py unknown_samples.csv
|
||||||
|
|
||||||
|
# 批量预测多个文件
|
||||||
|
python predict.py file1.csv file2.csv file3.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
## 性能指标
|
||||||
|
|
||||||
|
在测试数据集上,该系统通常能达到以下性能:
|
||||||
|
|
||||||
|
- 准确率:95%+
|
||||||
|
- 召回率:90%+
|
||||||
|
- 精确率:92%+
|
||||||
|
- F1值:91%+
|
||||||
|
|
||||||
|
_注意:实际性能可能因训练数据和参数设置而异。_
|
||||||
|
|
||||||
|
## 扩展与优化
|
||||||
|
|
||||||
|
系统可以进行以下扩展和优化:
|
||||||
|
|
||||||
|
1. 添加更多特征,如字符串分析、API调用序列等
|
||||||
|
2. 尝试其他机器学习算法或深度学习模型
|
||||||
|
3. 集成多个模型进行综合决策
|
||||||
|
4. 开发实时监控和检测功能
|
||||||
|
5. 增加可解释性分析
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT
|
||||||
7
requirements.txt
Normal file
7
requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
pandas>=1.3.0
|
||||||
|
numpy>=1.20.0
|
||||||
|
xgboost>=1.5.0
|
||||||
|
scikit-learn>=1.0.0
|
||||||
|
matplotlib>=3.4.0
|
||||||
|
seaborn>=0.11.0
|
||||||
|
joblib>=1.0.0
|
||||||
Reference in New Issue
Block a user