# coding=utf-8
import fileinput
import warnings
import os
import sys
import re
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import chardet
import nltk
import sklearn
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.preprocessing as keras_preprocessing
from sklearn.preprocessing import StandardScaler
from joblib import load

# Vocabulary accumulated while tokenizing files (filled in get_file_word_bag).
g_word_dict = {}

# Run TensorFlow on the CPU only: hide every CUDA device (see issue #152).
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '-1'


def os_listdir_ex(file_dir, find_name):  # legacy code handed down
    """Recursively collect every file under file_dir whose extension equals find_name."""
    result = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            if os.path.splitext(file)[1] == find_name:
                result.append(os.path.join(root, file))
    # return result  # for testing
    return result


def get_file_length(pFile):  # file size in bytes, legacy code
    fsize = os.path.getsize(pFile)
    return int(fsize)


def flush_file(pFile):  # strip PHP comments
    """Read pFile and return its text with /* ... */, // and # comments removed."""
    file = open(pFile, 'r', encoding='gb18030', errors='ignore')
    read_string = file.read()
    file.close()
    # Block comments /* ... */ (re.S lets '.' span newlines).
    m = re.compile(r'/\*.*?\*/', re.S)
    result = re.sub(m, '', read_string)
    # Line comments starting with // or #.
    m = re.compile(r'//.*')
    result = re.sub(m, '', result)
    m = re.compile(r'#.*')
    result = re.sub(m, '', result)
    return result


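# Illustrative check of the comment-stripping regexes above (added for clarity, not part
# of the original pipeline): apply the same three patterns to an in-memory PHP snippet.
_demo_php = "<?php /* block\ncomment */ $a = 1; // inline\n# hash\n$b = 2; ?>"
_demo_clean = re.sub(r'#.*', '', re.sub(r'//.*', '',
                     re.sub(re.compile(r'/\*.*?\*/', re.S), '', _demo_php)))
assert 'block' not in _demo_clean and 'inline' not in _demo_clean and 'hash' not in _demo_clean

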
# Character-level Shannon entropy of a file
# (reference: https://blog.csdn.net/jliang3/article/details/88359063)
def get_file_entropy(pFile):
    clean_string = flush_file(pFile)
    text_list = {}
    _sum = 0
    result = 0
    # Count every character except newlines and spaces.
    for word_iter in clean_string:
        if word_iter != '\n' and word_iter != ' ':
            if word_iter not in text_list:
                text_list[word_iter] = 1
            else:
                text_list[word_iter] = text_list[word_iter] + 1
    for index in text_list.keys():
        _sum = _sum + text_list[index]
    # Entropy = -sum(p * log2(p)) over the character frequencies.
    for index in text_list.keys():
        result = result - float(text_list[index]) / _sum * \
            math.log(float(text_list[index]) / _sum, 2)
    return result


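# Sanity sketch for the entropy formula above (illustrative, not part of the original
# script): four equally frequent symbols should give exactly log2(4) = 2 bits.
_counts = [1, 1, 1, 1]
_total = sum(_counts)
_h = -sum(c / _total * math.log(c / _total, 2) for c in _counts)
assert abs(_h - 2.0) < 1e-9

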
def vectorize_sequences(sequences, dimention=1337):
    """Multi-hot encode sequences into a (len(sequences), dimention) matrix of 0/1."""
    # All-zero matrix of shape (len(sequences), dimention).
    results = np.zeros((len(sequences), dimention))
    for i, sequence in enumerate(sequences):
        if i > dimention:
            break
        try:
            # Set the columns of the token ids present in this sequence to 1.
            results[i, sequence] = 1.
        except IndexError:
            # Stop once a token id falls outside the vocabulary window.
            break

    return results


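# Minimal usage sketch of vectorize_sequences (illustrative only): token ids 3 and 5 in
# a single sequence become ones in columns 3 and 5 of a one-row multi-hot matrix.
_demo_vec = vectorize_sequences([[3, 5]], dimention=8)
assert _demo_vec.shape == (1, 8) and _demo_vec[0, 3] == 1. and _demo_vec[0, 5] == 1.

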
def get_file_word_bag(pFile):
    global g_word_dict
    english_punctuations = [',', '.', ':', ';', '?',
                            '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', 'php', '<', '>', '\'']
    clean_string = flush_file(pFile)
    word_list = nltk.word_tokenize(clean_string)
    # Drop punctuation tokens and the bare 'php' keyword.
    word_list = [
        word_iter for word_iter in word_list if word_iter not in english_punctuations]

    keras_token = keras.preprocessing.text.Tokenizer()  # initialize the tokenizer
    keras_token.fit_on_texts(word_list)  # learn the vocabulary of this text
    g_word_dict.update(keras_token.word_index)
    # texts_to_sequences maps each word to its integer id via that vocabulary.
    sequences_data = keras_token.texts_to_sequences(word_list)
    # Padding to a fixed length is left disabled; a "word" longer than 1337 characters is
    # almost certainly an attacker trying to turn a big webshell into an AV-evading one.
    # word_bag = keras_preprocessing.sequence.pad_sequences(sequences_data, maxlen=1337, dtype='int16')
    # Flatten the nested id lists into a single word-bag list.
    word_bag = []
    for index in range(0, len(sequences_data)):
        if len(sequences_data[index]) != 0:
            for zeus in range(0, len(sequences_data[index])):
                word_bag.append(sequences_data[index][zeus])
    return word_bag


file_path = '.\\1.php'

# Extract the three features for the file under test.
entropy = get_file_entropy(file_path)
length = get_file_length(file_path)
word_bag = get_file_word_bag(file_path)
array_input = np.array([[entropy, length]])

data_frame = pd.DataFrame(
    {'length': [length], 'entropy': [entropy], 'word_bag': [word_bag]},
    columns=['length', 'entropy', 'word_bag'])
# scaler = StandardScaler()
# Re-use the scalers fitted at training time so the features are scaled consistently.
scaler_entropy = load('scaler_entropy.joblib')
scaler_length = load('scaler_length.joblib')
data_frame['length_scaled'] = scaler_length.transform(
    data_frame['length'].values.reshape(-1, 1))
data_frame['entropy_scaled'] = scaler_entropy.transform(
    data_frame['entropy'].values.reshape(-1, 1))

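# For reference (assumption: the .joblib files hold fitted sklearn StandardScaler objects,
# as the commented-out StandardScaler() line above suggests), transform() simply computes
# (x - mean_) / scale_ with statistics learned on the training set:
_manual_length_scaled = (length - scaler_length.mean_[0]) / scaler_length.scale_[0]
assert np.isclose(_manual_length_scaled, data_frame['length_scaled'].iloc[0])
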
data_train_pre = data_frame.filter(items=['length_scaled', 'entropy_scaled'])
# data_train_pre = data_frame.filter(items=['length', 'entropy'])
data_train_x_1 = tf.constant(data_train_pre)
data_train_x_2 = tf.constant(
    vectorize_sequences(data_frame['word_bag'].values))
print(data_frame.head())

model_name = 'huoji1.h5'  # huoji.h5 huoji_scaled.h5 huoji_no_scale.h5
model = keras.models.load_model(model_name)
model.summary()
print(data_train_x_1, data_train_x_2)
prediction = model.predict([data_train_x_1, data_train_x_2])
print(prediction)
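# Illustrative post-processing sketch (assumption: the model outputs a single webshell
# probability per sample; the original script stops at printing the raw prediction):
verdict = 'webshell' if float(np.ravel(prediction)[0]) > 0.5 else 'benign'
print(file_path, verdict)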