In this article, we explore how to detect programming languages with a deep neural network. Keras with a TensorFlow backend is used for the task. The network used by CodAI is very similar to the LSTM spam-detection network described in an earlier article.
First, the test data needs to be prepared. The test data is an HTML text file containing code samples; BeautifulSoup is used to extract the contents of all PRE tags.
from bs4 import BeautifulSoup

# Parse the HTML sample file and collect every PRE block together with its language label.
soup = BeautifulSoup(open("LanguageSamples.txt", "r"), "html.parser")
count = 0
code_snippets = []
languages = []
for pre_tag in soup.find_all("pre", text=True):
    count += 1
    line = str(pre_tag.contents[0])
    code_snippets.append(line)
    languages.append(pre_tag["lang"].lower())
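The loop above assumes every sample sits in a PRE tag whose lang attribute names the language. The real LanguageSamples.txt is not shown in the source; as a rough, hypothetical illustration of the expected structure, such a file could be generated like this:

# Hypothetical helper: writes a tiny LanguageSamples.txt in the
# <pre lang="..."> format the extraction loop expects.
samples = {
    "python": "def add(a, b):\n    return a + b",
    "c#": "public int Add(int a, int b) { return a + b; }",
}
with open("LanguageSamples.txt", "w") as sample_file:
    for lang, code in samples.items():
        sample_file.write('<pre lang="%s">%s</pre>\n' % (lang, code))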
Next, the input has to be tokenized. Keras's Tokenizer is used for this, with the maximum number of features set to 10,000, and the resulting word index is saved to a JSON file.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import json

max_features = 10000

# Fit the tokenizer on the code snippets and persist its word index for later use.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(code_snippets)
dictionary = tokenizer.word_index
with open("wordindex.json", "w") as dictionary_file:
    json.dump(dictionary, dictionary_file)

# Convert each snippet to a fixed-length sequence of word indices and
# one-hot encode the language labels.
X = tokenizer.texts_to_sequences(code_snippets)
X = pad_sequences(X, 100)
Y = pd.get_dummies(languages)
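The prediction service shown further below depends on this saved word index. Its loading code is not shown in the source; a minimal sketch, assuming the file written above, is:

import json

# Assumption: reload the word index saved during training so that prediction
# uses the same word-to-index mapping as training.
with open("wordindex.json", "r") as dictionary_file:
    dictionary = json.load(dictionary_file)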
The CodAI neural network is composed of a convolutional network, an LSTM, and a feed-forward (dense) stack.
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dropout, Dense

embed_dim = 128
lstm_out = 64

model = Sequential()
# Embed the 10,000-word vocabulary into 128-dimensional vectors.
model.add(Embedding(max_features, embed_dim, input_length=100))
# Two convolution/pooling stages extract local token patterns.
model.add(Conv1D(filters=128, kernel_size=3, padding='same', dilation_rate=1, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', dilation_rate=1, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
# The LSTM summarizes the remaining sequence, followed by dropout and dense layers.
model.add(LSTM(lstm_out))
model.add(Dropout(0.5))
model.add(Dense(64))
# One output unit per language, with softmax probabilities.
model.add(Dense(len(Y.columns), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
The model is trained for 400 epochs and reaches 100% accuracy on the validation data.
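The source does not show the training call itself; a minimal sketch of training and saving the model, assuming a batch size of 32 and a 20% validation split (neither is given in the source), looks like this:

# Illustrative training call: the epoch count comes from the text above, but the
# batch size, validation split, and saved file name are assumptions.
model.fit(X, Y.values, epochs=400, batch_size=32, validation_split=0.2)
model.save("codai_model.h5")

The trained model is then exposed as a small Flask REST service: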
from flask import Flask, request, jsonify
from keras.preprocessing.sequence import pad_sequences
import keras.preprocessing.text as kpt
import numpy as np

app = Flask(__name__)

# The trained model and the word index loaded from wordindex.json are assumed
# to be available as the globals `model` and `dictionary`.

def convert_text_to_index_array(text):
    # Map each word of the snippet to its training-time index; unknown or
    # out-of-range words map to 0.
    wordvec = []
    global dictionary
    for word in kpt.text_to_word_sequence(text):
        if word in dictionary:
            if dictionary[word] <= 10000:
                wordvec.append(dictionary[word])
            else:
                wordvec.append(0)
        else:
            wordvec.append(0)
    return wordvec

@app.route("/predict", methods=["POST"])
def predict():
    global model
    data = {"success": False}
    X_test = []
    if request.method == "POST":
        code_snip = request.json
        word_vec = convert_text_to_index_array(code_snip)
        X_test.append(word_vec)
        X_test = pad_sequences(X_test, maxlen=100)
        y_prob = model.predict(X_test[0].reshape(1, X_test.shape[1]), batch_size=1, verbose=2)[0]
        languages = ['angular', 'asm', 'asp.net', 'c#', 'c++', 'css', 'delphi', 'html',
                     'java', 'javascript', 'objectivec', 'pascal', 'perl', 'php',
                     'powershell', 'python', 'razor', 'react', 'ruby', 'scala', 'sql',
                     'swift', 'typescript', 'vb.net', 'xml']
        data["predictions"] = []
        # Report the probability of every language as a percentage.
        for i in range(len(languages)):
            r = {"label": languages[i], "probability": format(y_prob[i] * 100, '.2f')}
            data["predictions"].append(r)
        data["success"] = True
    return jsonify(data)
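A quick way to exercise the endpoint (the URL and snippet below are illustrative; the source does not show a client):

import requests

# Hypothetical client call; adjust the URL to wherever the Flask app is running.
snippet = "def greet(name):\n    return 'Hello, ' + name"
resp = requests.post("http://localhost:5000/predict", json=snippet)
for p in resp.json()["predictions"]:
    print(p["label"], p["probability"])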