remove anti-paste and add a new anti-paste
webshell.py (43 changed lines)
@@ -25,6 +25,7 @@ config.gpu_options.allow_growth = True
 sess0 = tf.InteractiveSession(config=config)
 '''
 g_word_dict = {}
+g_keras_token = None
 
 
 def os_listdir_ex(file_dir, find_name): # legacy code
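The single added line above introduces g_keras_token, a module-level handle on one shared keras.preprocessing.text.Tokenizer that the functions added below fit once and then reuse. As a standalone illustration of that Tokenizer API (not part of the commit; the sample token lists are made up):

# Hedged sketch of the keras Tokenizer behaviour the new global relies on.
# The sample token lists are hypothetical, not taken from this repository.
from keras.preprocessing.text import Tokenizer

token = Tokenizer()
token.fit_on_texts([['eval', 'php'], ['eval', 'base64_decode']])  # builds word_index
print(token.word_index)  # e.g. {'eval': 1, 'php': 2, 'base64_decode': 3}
# Tokens never seen during fit_on_texts are silently dropped when encoding:
print(token.texts_to_sequences([['eval', 'assert']]))  # [[1]]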
@@ -114,7 +115,41 @@ def vectorize_sequences(sequences, dimention=1337):
     return results
 
 
-def get_file_word_bag(pFile):
+def get_word_bag(pWordList):
+    global g_word_dict
+    global g_keras_token
+    sequences_data = g_keras_token.texts_to_sequences(pWordList)
+    word_bag = []
+    for index in range(0, len(sequences_data)):
+        if len(sequences_data[index]) != 0:
+            for zeus in range(0, len(sequences_data[index])):
+                word_bag.append(sequences_data[index][zeus])
+    return word_bag
+
+
+def set_word_bag(pWordList):
+    global g_word_dict
+    global g_keras_token
+    if g_keras_token == None:
+        g_keras_token = keras.preprocessing.text.Tokenizer() # initialize the tokenizer
+    g_keras_token.fit_on_texts(pWordList) # learn the vocabulary of the texts
+    g_word_dict.update(g_keras_token.word_index)
+
+
+def get_file_word(pFile):
+    global g_word_dict
+    english_punctuations = [',', '.', ':', ';', '?',
+                            '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', 'php', '<', '>', '\'']
+    clean_string = flush_file(pFile)
+    word_list = nltk.word_tokenize(clean_string)
+    # filter out the unclean tokens
+    word_list = [
+        word_iter for word_iter in word_list if word_iter not in english_punctuations]
+    # anti-paste
+    return word_list
+
+
+def get_file_word_bag_non_use(pFile):
     global g_word_dict
     english_punctuations = [',', '.', ':', ';', '?',
                             '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', 'php', '<', '>', '\'']
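get_word_bag above flattens the list-of-lists that texts_to_sequences returns into one flat list of vocabulary indices; the len(...) != 0 guard is effectively a no-op, since iterating an empty inner list appends nothing. A behaviour-equivalent sketch for clarity (the name and the explicit tokenizer parameter are mine, not the commit's):

# Assumed-equivalent rewrite of the nested loops in get_word_bag, shown only
# to make the flattening obvious; takes the tokenizer explicitly instead of
# reading the module globals.
def get_word_bag_flat(pWordList, tokenizer):
    sequences_data = tokenizer.texts_to_sequences(pWordList)
    return [index for sequence in sequences_data for index in sequence]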
@@ -264,7 +299,11 @@ if os.path.exists("save.csv") == False:
         data_frame['entropy'].values.reshape(-1, 1), scaler_entropy.fit(data_frame['entropy'].values.reshape(-1, 1)))
     # load the word bag
     data_frame['word_bag'] = data_frame['file'].map(
-        lambda file_name: get_file_word_bag(file_name))
+        lambda file_name: get_file_word(file_name))
+    set_word_bag(data_frame['word_bag'])
+    data_frame['word_bag'] = data_frame['word_bag'].map(
+        lambda text_params: get_word_bag(text_params))
+
     data_frame.to_csv("save.csv")
     dump(scaler_length, 'scaler_length.joblib')
     dump(scaler_entropy, 'scaler_entropy.joblib')
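The call order in this hunk is the substance of the change: get_file_word only tokenizes each file, set_word_bag then fits the shared tokenizer over the whole corpus, and only afterwards does get_word_bag encode anything, so no file's tokens are missing from word_index at encoding time. Note also that set_word_bag constructs the Tokenizer only while g_keras_token is still None, so repeated calls keep extending one vocabulary, since fit_on_texts accumulates word counts across calls. A small sketch of that accumulation (sample inputs are made up):

# Hedged sketch: repeated set_word_bag calls keep fitting the same global
# Tokenizer, so the merged vocabulary in g_word_dict grows across calls.
set_word_bag([['eval']])           # vocabulary now covers 'eval'
set_word_bag([['base64_decode']])  # same tokenizer; both words are now indexed
print(g_word_dict)                 # merged word_index from both calls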
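A later scoring run can restore the persisted scalers with joblib's load, the documented inverse of the dump calls above (a minimal sketch; the file names are the ones this commit writes):

from joblib import load

scaler_length = load('scaler_length.joblib')
scaler_entropy = load('scaler_entropy.joblib')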