Commit 877e9a8c authored by Nandeesh Patel Gowdru Prabushanker

Module for neural entity linking

parent d7846e76
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input, ReLU, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.losses import CosineSimilarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from utils import prepare_training_data
def train_model(train_x, train_y):
    # Input: 30 candidate embeddings of 100 dimensions per mention.
    entity_in = Input(shape=(30, 100))
    lstm_layer = Bidirectional(LSTM(units=100))(entity_in)
    relu_layer = ReLU()(lstm_layer)
    # Predict a 100-dimensional embedding for the referent entity.
    out_layer = Dense(100, activation="sigmoid")(relu_layer)
    model = Model(entity_in, out_layer)
    model.compile(loss=CosineSimilarity(axis=1), optimizer="sgd", metrics=["accuracy"])
    num_epochs = 5
    model.fit(train_x, train_y, batch_size=10, epochs=num_epochs, validation_split=0.25, verbose=1)
    return model
def get_ref_entity(candidates, prediction):
    # Return the candidate embedding with the highest cosine similarity to the prediction.
    best_similarity = -2
    ref_entity = None
    for candidate in candidates:
        similarity = cosine_similarity([candidate], [prediction])[0][0]
        if similarity > best_similarity:
            ref_entity = candidate
            best_similarity = similarity
    return ref_entity
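A quick, illustrative call to get_ref_entity with made-up 3-dimensional vectors (the real candidates are 100-dimensional embeddings):

toy_candidates = [np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 0.0])]
toy_prediction = np.array([0.9, 0.1, 0.0])
print(get_ref_entity(toy_candidates, toy_prediction))  # picks [1. 0. 0.], the closest candidate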
def get_entity_from_embeddings(embeddings_array):
    # Map embeddings back to entity URIs by matching the serialized vector against the
    # embeddings column of the module-level 'data' frame loaded in the __main__ block.
    # The lookup is an exact string match; vectors that do not round-trip fall back
    # to a placeholder URI.
    entities_list = []
    for embedding in embeddings_array:
        entity = data[data.embeddings == ",".join(map(str, embedding))]
        if entity.empty:
            entities_list.append("http://dbpedia.org/unknown")
        else:
            entities_list.append(entity.get('entity').array[0])
    return entities_list
def evaluate(x_test, y_test):
    # entity_linker is the trained model created in the __main__ block below.
    y_prediction = entity_linker.predict(np.array(x_test))
    y_pred = []
    for i, candidates in enumerate(x_test):
        y_pred.append(get_ref_entity(candidates, y_prediction[i]))
    test_labels = get_entity_from_embeddings(y_test)
    predicted_labels = get_entity_from_embeddings(y_pred)
    print("Micro F1:\t" + str(f1_score(y_true=test_labels, y_pred=predicted_labels, average="micro")))
    print("Macro F1:\t" + str(f1_score(y_true=test_labels, y_pred=predicted_labels, average="macro")))
    print("Weighted F1:\t" + str(f1_score(y_true=test_labels, y_pred=predicted_labels, average="weighted")))
if __name__ == "__main__":
    # Load pre-computed entity embeddings (tab-separated, with 'entity' and 'embeddings' columns).
    data = pd.read_csv("./../data/embeddings.tsv", sep="\t")
    embeddings = {}
    for index, entry in data.iterrows():
        embeddings[entry.get('entity')] = entry.get('embeddings')
    X_train, Y_train = prepare_training_data(embeddings, "./../data/train_data.csv")
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)
    entity_linker = train_model(X_train, Y_train)
    # Release the training arrays, but keep the embeddings dictionary until
    # the test data has been prepared as well.
    X_train = None
    Y_train = None
    X_test, Y_test = prepare_training_data(embeddings, "./../data/test_data.csv")
    embeddings = None
    evaluate(X_test, Y_test)
    print("Done")
import numpy as np
def prepare_training_data(entity_embeddings, path):
    # Each line is tab-separated: field 2 holds the gold entity and the candidate
    # entities sit at the odd-numbered indices from 3 onwards.
    x_train = []
    y_train = []
    with open(path, "r") as train_file:
        line = train_file.readline().strip()
        while line != "":
            line = line.split("\t")
            gold = line[2]
            try:
                gold = np.array(entity_embeddings[gold].split(","), dtype="float")
            except KeyError:
                # Gold entity without an embedding: fall back to a zero vector.
                gold = np.zeros(100)
            y_train.append(gold)
            candidates = []
            for i in range(3, len(line)):
                if i % 2 != 0:
                    candidate = line[i]
                    try:
                        candidate = np.array(entity_embeddings[candidate].split(","), dtype="float")
                    except KeyError:
                        candidate = np.full(100, -1)
                    candidates.append(candidate)
            # Pad or truncate to exactly 30 candidates per mention.
            while len(candidates) < 30:
                candidates.append(np.full(100, -1))
            candidates = candidates[:30]
            x_train.append(candidates)
            line = train_file.readline().strip()
    return x_train, y_train
def test():
    print("Inside the util module")
numpy==1.19.5
pandas==1.3.1
scikit-learn==0.24.2
scipy==1.7.1
tensorflow==2.6.0
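Assuming these pins live in a requirements.txt file at the repository root (the file name is a guess from convention), the environment can be installed with:

pip install -r requirements.txt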