Commit 9bb40759 authored by Nandeesh Patel Gowdru Prabushanker

Updated files for python module

parent 61a7a3c1
@@ -3,3 +3,5 @@
*.iws
*.iml
*.ipr
Datasets
Datasets.zip
*.props
data/*
saved_model/*
\ No newline at end of file
saved_model/*
data_v1/*
\ No newline at end of file
@@ -2,7 +2,7 @@ import json
from flask import Flask, request
from flask_cors import CORS, cross_origin
from utils import process_line, get_embeddings, es, annotation2nif
from utils import process_line, get_embeddings, es, annotation2nif, get_sentence_embedding
from model import get_ref_entity, get_entity_from_embeddings
import requests
import tensorflow.keras.models
@@ -12,7 +12,7 @@ from pynif import NIFCollection
app = Flask(__name__)
cors = CORS(app)
entity_linker = tensorflow.keras.models.load_model("./saved_model/linker")
entity_linker = tensorflow.keras.models.load_model("./leave_1out_models/linker_aida_train")
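# Serves the leave-one-out checkpoint associated with aida_train; swap the path
# (e.g. back to ./saved_model/linker) to serve a different model.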
@app.route('/ping', methods=['GET'])
@@ -82,11 +82,12 @@ def get_relation_embedding():
@app.route('/get-entities', methods=['POST'])
@cross_origin()
def link_entities_in_text():
    text = request.values.get('text')
    text = request.get_json().get('text')
    if text is None or text.strip() == '':
        return "Invalid request parameters", 400
    req_body = {
        "text": text,
        "type": 'text'
    }
    response = requests.post("http://localhost:8080/candidates", headers={'content-type': 'application/json'},
                             json=req_body)
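    # The candidate service (assumed to listen on localhost:8080) returns one TSV line per
    # mention: "surface||start||offset<TAB>sentence<TAB>gold<TAB>...", with candidate URIs at
    # the odd-numbered columns from index 3 (inferred from the parsing below and print_stats).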
@@ -95,6 +96,7 @@ def link_entities_in_text():
    embedding_dictionary = {}
    resp = []
    i = 0
    doc_emb = []
    for line in text.split("\n"):
        named_entity = line.split("\t")[0].split("||")[0]
        start = line.split("\t")[0].split("||")[1]
@@ -104,14 +106,17 @@ def link_entities_in_text():
            "start": start,
            "offset": offset
        })
        candidates, gold, emb_dict = process_line(line.strip())
        doc_embedding = get_sentence_embedding(line.split("\t")[1])
        candidates, gold, emb_dict = process_line(line.strip(), doc_embedding)
        x_test.append(candidates)
        doc_emb.append(doc_embedding[line.split("\t")[1]])
        for entity in emb_dict.keys():
            if entity not in embedding_dictionary:
                embedding_dictionary[entity] = ",".join(map(str, emb_dict[entity]))
        i += 1
    x_test = np.array(x_test)
    y_test = entity_linker(x_test)
    doc_emb = np.reshape(doc_emb, (len(doc_emb), 1, 768))
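    # The updated linker is a two-input Keras model (see train_model): stacked candidate
    # matrices of shape (n, 30, 100) plus sentence embeddings reshaped to (n, 1, 768).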
    y_test = entity_linker([x_test, doc_emb])
    y_pred = []
    i = 0
    for entity in x_test:
@@ -128,14 +133,14 @@ def link_entities_in_text():
    i = 0
    for label in labels:
        if label == "UNKNOWN":
            resp[i]['disambiguatedURL'] = "http://nel.cs.upb.de/notInWiki/" + resp[i]['namedEntity'].replace(" ", "")
            resp[i]['disambiguatedURL'] = "http://aksw.org/notInWiki/" + resp[i]['namedEntity'].replace(" ", "")
        else:
            resp[i]['disambiguatedURL'] = label
        i += 1
    return json.dumps(list(resp))


@app.route("/test", methods=['POST', 'GET'])
@app.route("/gerbil", methods=['POST', 'GET'])
def gerbil_evaluation():
    data = request.data
    data = data.lstrip()
@@ -156,6 +161,8 @@ def gerbil_evaluation():
    except:
        print('no mentions')
    resp = []
    if len(mentions) <= 0:
        print("No mentions")
    if len(mentions) > 0:
        response = requests.post("http://localhost:8080/candidates", headers={'content-type': 'application/json'},
                                 json=req_body)
@@ -163,6 +170,7 @@ def gerbil_evaluation():
        x_test = []
        embedding_dictionary = {}
        i = 0
        doc_emb = []
        for line in text.split("\n"):
            named_entity = line.split("\t")[0].split("||")[0]
            start = line.split("\t")[0].split("||")[1]
@@ -172,14 +180,17 @@ def gerbil_evaluation():
                "start": start,
                "offset": offset
            })
            candidates, gold, emb_dict = process_line(line.strip())
            doc_embedding = get_sentence_embedding(line.split("\t")[1])
            candidates, gold, emb_dict = process_line(line.strip(), doc_embedding)
            x_test.append(candidates)
            doc_emb.append(doc_embedding[line.split("\t")[1]])
            for entity in emb_dict.keys():
                if entity not in embedding_dictionary:
                    embedding_dictionary[entity] = ",".join(map(str, emb_dict[entity]))
            i += 1
        x_test = np.array(x_test)
        y_test = entity_linker(x_test)
        doc_emb = np.reshape(doc_emb, (len(doc_emb), 1, 768))
        y_test = entity_linker([x_test, doc_emb])
        y_pred = []
        i = 0
        for entity in x_test:
@@ -196,8 +207,7 @@ def gerbil_evaluation():
        i = 0
        for label in labels:
            if label == "UNKNOWN":
                resp[i]['disambiguatedURL'] = "http://nel.cs.upb.de/notInWiki/" + resp[i]['namedEntity'].replace(" ",
                                                                                                                 "")
                resp[i]['disambiguatedURL'] = "http://aksw.org/notInWiki/" + resp[i]['namedEntity'].replace(" ", "")
            else:
                resp[i]['disambiguatedURL'] = label
            i += 1
......
def print_stats(path, name):
    with open(path) as fl:
        line = fl.readline().strip()
        total_count = 0
        count = 0
        gold_not_in_candidates = 0
        gold_in_candidates = 0
        while line != "":
            line = line.split("\t")
            candidates = []
            gold = line[2]
            if gold not in entities:
                entities.append(gold)
            # Candidate URIs occupy the odd-numbered columns from index 3 onwards.
            for i in range(3, len(line)):
                if i % 2 != 0:
                    candidates.append(line[i])
            # Deduplicate candidates into the global entity list *after* they are
            # parsed (the original looped over the still-empty candidates list).
            for candidate in candidates:
                if candidate not in entities:
                    entities.append(candidate)
            if len(candidates) == 0:
                count += 1
            if gold not in candidates and gold.startswith("http://dbpedia.org/"):
                gold_not_in_candidates += 1
            if gold in candidates or gold.startswith("http://aksw.org/notInWiki"):
                gold_in_candidates += 1
            total_count += 1
            line = fl.readline().strip()
        print(total_count, "\t\t", count, "\t", gold_not_in_candidates, "\t", gold_in_candidates, "\t",
              round((gold_in_candidates / total_count) * 100, 2), "\t", name[:10])
        return total_count
total_entries = 0
entities = []
dataset_names = ["ACE2004", "aida_complete", "aida_testa", "aida_testb", "aida_train", "AQUAINT", "spotlight",
                 "iitb", "KORE50", "MSNBC", "N3-Reuters-128", "N3-RSS-500", "oke-challenge-task1-eval",
                 "oke-challenge-task1-example", "oke-challenge-task1-gs"]
print("total\t\tno candidates\tgold not part\tgold part\tgold part %\tname")
for dsName in dataset_names:
    total_entries += print_stats("./data/test_data/" + dsName + ".tsv", dsName)
print("************************************************************")
from utils import process_line


def process_file(path):
    with open(path) as train_file:
        line = train_file.readline().strip()
        line_count = 0
        while line != "":
            candidates, gold, emb_dict = process_line(line, {})
            for key in emb_dict.keys():
                if key not in embedding_dictionary.keys():
                    embedding_dictionary[key] = emb_dict[key]
            line_count += 1
            if line_count % 1000 == 0:
                print(line_count)
            line = train_file.readline().strip()
        print("Done")


embedding_dictionary = {}
process_file("./training_data/train_all.tsv")
with open("./data/entity_embeddings_latest.tsv", "w") as embedding_file:
    embedding_file.write("entity\tembeddings\n")
    for entity in embedding_dictionary.keys():
        embedding_file.write(entity + "\t" + ",".join([str(i) for i in embedding_dictionary[entity]]))
        embedding_file.write("\n")
# Run this file to generate training files for leave-one-out evaluation.
dataset_names = ["AQUAINT", "aida_train", "aida_testa", "aida_testb", "ACE2004", "MSNBC", "iitb", "N3-Reuters-128",
                 "N3-RSS-500", "oke-challenge-task1-example", "oke-challenge-task1-gs", "oke-challenge-task1-eval",
                 "KORE50", "spotlight"]


def write_train_data(ds_name):
    # One output file per held-out dataset; the original hard-coded a single
    # "aida_complete.tsv" path, so every call overwrote the previous result.
    with open("./data/leave_1out_training_data/" + ds_name + ".tsv", "w") as fl:
        for d in dataset_names:
            if not d.startswith(ds_name):
                with open("./data/training_data/" + d + ".tsv", "r") as ip_file:
                    line = ip_file.readline()
                    while line.strip() != "":
                        fl.write(line)
                        line = ip_file.readline()


for dataset in dataset_names:
    write_train_data(dataset)
    print(dataset + " Done!")
import requests
from sklearn.metrics import f1_score

dataset_names = ["AQUAINT", "aida_train", "aida_testa", "aida_testb", "ACE2004", "MSNBC", "iitb", "N3-Reuters-128",
                 "N3-RSS-500", "oke-challenge-task1-example", "oke-challenge-task1-gs", "oke-challenge-task1-eval",
                 "KORE50", "spotlight"]
dataset_names = ["aida_complete"]
for dataset_name in dataset_names:
response = requests.get("http://nel.cs.upb.de:8080/eval?dsName=" + dataset_name,
headers={'content-type': 'application/json'}).json()
predicted_labels = []
test_labels = []
for doc in response.keys():
for mention in response[doc].keys():
predicted_labels.append(response[doc][mention]['link'])
test_labels.append(response[doc][mention]['gold'])
print("**********************")
print(dataset_name, ":\t", str(f1_score(y_true=test_labels, y_pred=predicted_labels, average="micro")))
print("**********************")
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras.models
from tensorflow.keras.layers import Input, ReLU, Dense, LSTM, Bidirectional
from tensorflow.keras.layers import Input, Dense, LSTM, Bidirectional, LeakyReLU, Dropout, concatenate, ReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.losses import CosineSimilarity
from tensorflow.keras.losses import CosineSimilarity, MeanSquaredError, mean_squared_error, Huber
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from utils import prepare_training_data
from os.path import exists
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
from matplotlib import rcParams
tensorflow.random.set_seed(2021)
def plot_data(initial_history, num_epochs):
    rcParams['figure.figsize'] = (18, 8)
    rcParams['axes.spines.top'] = False
    rcParams['axes.spines.right'] = False
    plt.plot(
        np.arange(1, num_epochs + 1),
        initial_history.history['loss'],
        label='Loss', lw=3
    )
    plt.plot(
        np.arange(1, num_epochs + 1),
        initial_history.history['accuracy'],
        label='Accuracy', lw=3
    )
    # plt.plot(
    #     np.arange(1, num_epochs + 1),
    #     initial_history.history['lr'],
    #     label='Learning rate', color='#000', lw=3, linestyle='--'
    # )
    # plt.title('Evaluation metrics', size=20)
    plt.xlabel('Epoch', size=14)
    plt.legend()
    # Save before show(): with interactive backends, show() can leave a blank canvas behind.
    plt.savefig('plot1.png')
    plt.show()
    plt.clf()


def plot_data2(initial_history, num_epochs):
    # Learning-rate-range-test axis: start at 1e-3 and grow tenfold every 30 epochs.
    learning_rates = 1e-3 * (10 ** (np.arange(num_epochs) / 30))
    plt.semilogx(
        learning_rates,
        initial_history.history['loss'],
        lw=3, color='#000'
    )
    # plt.title('Learning rate vs. loss', size=20)
    plt.xlabel('Learning rate', size=14)
    plt.ylabel('Loss', size=14)
    plt.savefig('plot2.png')
    plt.show()
    plt.clf()
def prepare_data():
    embeddings_df = pd.read_csv("./data/embeddings.tsv", sep="\t")
    embeddings_df = pd.read_csv("./data/entity_embeddings.tsv", sep="\t")
    embeddings_dict = {}
    for index, entry in embeddings_df.iterrows():
        embeddings_dict[entry.get('entity')] = entry.get('embeddings')
    return embeddings_df, embeddings_dict
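# Note: entry.get('embeddings') keeps the raw comma-separated string from the TSV; turning it
# into a numeric vector is presumably left to the lookup code in utils.py (an assumption based
# on how the embedding files are written above).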
def train_model(train_x, train_y):
def rmsle(y_true, y_pred):
    # NOTE: despite the name this is RMSE (the root of MeanSquaredError); a true RMSLE
    # would use MeanSquaredLogarithmicError.
    msle = MeanSquaredError()
    return K.sqrt(msle(y_true, y_pred))


def train_model(train_x, train_y, train_doc_embedding):
    entity_in = Input(shape=(30, 100))
    doc_embedding_in = Input(shape=(1, 768))
    lstm_layer = Bidirectional(LSTM(units=100))(entity_in)
    relu_layer = ReLU()(lstm_layer)
    out_layer = Dense(100, activation="sigmoid")(relu_layer)
    model = Model(entity_in, out_layer)
    model.compile(loss=CosineSimilarity(axis=1), optimizer='sgd', metrics=['accuracy'])
    num_epochs = 5
    model.fit(train_x, train_y, batch_size=100, epochs=num_epochs, validation_split=0.25, verbose=1)
    doc_embedding_lstm = Bidirectional(LSTM(units=100))(doc_embedding_in)
    combined_layer = concatenate([lstm_layer, doc_embedding_lstm])
    relu_layer = ReLU()(combined_layer)
    dropout_layer = Dropout(rate=0.2)(relu_layer)
    out_layer = Dense(100, activation="relu")(dropout_layer)
    model = Model([entity_in, doc_embedding_in], out_layer)
    optimizer = Adam(learning_rate=0.015)
    model.compile(loss=CosineSimilarity(axis=1), optimizer=optimizer, metrics=['accuracy'])
    num_epochs = 100
    train_doc_embedding = np.reshape(train_doc_embedding, (len(train_doc_embedding), 1, 768))
    model.fit([train_x, train_doc_embedding], train_y, batch_size=128, epochs=num_epochs,
              validation_split=0.2, verbose=1)
    return model
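# Architecture summary of the new model: a BiLSTM(100) over the 30 padded candidate vectors
# and a second BiLSTM(100) over the (1, 768) sentence embedding are concatenated, passed
# through ReLU and Dropout(0.2), then projected to the 100-d entity-embedding space; the
# CosineSimilarity loss pulls the prediction toward the gold entity embedding.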
def get_ref_entity(candidates, prediction):
    similarity = -2
    ref_entity = None
    ref_entity = np.full(100, -1)
    for candidate in candidates:
        if cosine_similarity([candidate], [prediction])[0][0] > similarity:
            ref_entity = candidate
            similarity = cosine_similarity([candidate], [prediction])[0][0]
        cosine_sim = cosine_similarity([candidate[:100]], [prediction])[0][0]
        if cosine_sim > similarity:
            ref_entity = candidate[:100]
            similarity = cosine_sim
    return ref_entity
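# get_ref_entity returns the candidate vector most cosine-similar to the model's 100-d
# prediction; the candidate[:100] slice presumably trims candidates carrying extra appended
# dimensions down to the entity-embedding part (an assumption based on the new code).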
@@ -51,36 +115,67 @@ def get_entity_from_embeddings(embeddings_array, embedding_df):
    return entities_list


def evaluate(x_test, y_test):
    y_prediction = entity_linker(x_test)
def evaluate(x_test, y_test, doc_emb):
    doc_emb = np.reshape(doc_emb, (len(doc_emb), 1, 768))
    y_prediction = entity_linker([x_test, doc_emb])
    y_pred = []
    i = 0
    for entity in x_test:
        y_pred.append(get_ref_entity(entity, y_prediction[i]))
        ref_entity = get_ref_entity(entity, y_prediction[i])
        y_pred.append(ref_entity)
        i += 1
    test_labels = get_entity_from_embeddings(y_test, data)
    predicted_labels = get_entity_from_embeddings(y_pred, data)
    in_kb_true = 0
    out_kb_true = 0
    in_kb_false = 0
    out_kb_false = 0
    with open("./data/testb_compare.tsv", "w") as fl:
        fl.write("Actual link\tPredicted link\n")
        for i in range(len(test_labels)):
            fl.write(test_labels[i] + "\t" + predicted_labels[i] + "\n")
            if str(test_labels[i]).find("dbpedia.org") >= 0 and test_labels[i] == predicted_labels[i]:
                in_kb_true += 1
            if str(test_labels[i]).find("dbpedia.org") >= 0 and test_labels[i] != predicted_labels[i]:
                in_kb_false += 1
            if str(test_labels[i]).find("UNKNOWN") >= 0 and test_labels[i] == predicted_labels[i]:
                out_kb_true += 1
            if str(test_labels[i]).find("UNKNOWN") >= 0 and test_labels[i] != predicted_labels[i]:
                out_kb_false += 1
    print("Micro F1:\t" + str(f1_score(y_true=test_labels, y_pred=predicted_labels, average="micro")))
    print("Macro F1:\t" + str(f1_score(y_true=test_labels, y_pred=predicted_labels, average="macro")))
    print("Weighted F1:\t" + str(f1_score(y_true=test_labels, y_pred=predicted_labels, average="weighted")))
    # print("Macro F1:\t" + str(f1_score(y_true=test_labels, y_pred=predicted_labels, average="macro")))
    # print("Weighted F1:\t" + str(f1_score(y_true=test_labels, y_pred=predicted_labels, average="weighted")))
if __name__ == "__main__":
    if exists("./saved_model/linker"):
        entity_linker = tensorflow.keras.models.load_model("./saved_model/linker")
        print("trained model already exists")
    else:
        data, embeddings = prepare_data()
        x_train, y_train = prepare_training_data(embeddings, "./data/train_data.csv")
        x_train = np.array(x_train)
        y_train = np.array(y_train)
        entity_linker = train_model(x_train, y_train)
        x_train = None
        y_train = None
        entity_linker.save("./saved_model/linker")
    X_test, Y_test = prepare_training_data(embeddings, "./data/test_data.csv")
    embeddings = None
    X_test = np.array(X_test)
    Y_test = np.array(Y_test)
    evaluate(X_test, Y_test)
    data, embeddings = prepare_data()
    dataset_names = ["AQUAINT", "aida_complete", "aida_train", "aida_testa", "aida_testb", "ACE2004", "MSNBC", "iitb",
                     "N3-Reuters-128", "N3-RSS-500", "oke-challenge-task1-example", "oke-challenge-task1-gs",
                     "oke-challenge-task1-eval", "KORE50", "spotlight"]
    for ds_name in dataset_names:
        if exists("./models/linker_" + ds_name):
            # Change the model name to linker_aida_train if you need to evaluate the model trained on the aida dataset
            entity_linker = tensorflow.keras.models.load_model("./models/linker_" + ds_name)
            file_path = "./data/training_data/" + ds_name + ".tsv"
            X_test, Y_test, doc_embedding_test = prepare_training_data(embeddings, file_path)
            X_test = np.array(X_test)
            Y_test = np.array(Y_test)
            doc_embedding_test = np.array(doc_embedding_test)
            print("**************************")
            print(ds_name, "\n")
            evaluate(X_test, Y_test, doc_embedding_test)
            print("**************************")
        else:
            x_train, y_train, doc_embedding = prepare_training_data(embeddings,
                                                                    "./data/training_data/" + ds_name + ".tsv")
            x_train = np.array(x_train)
            y_train = np.array(y_train)
            doc_embedding = np.array(doc_embedding)
            entity_linker = train_model(x_train, y_train, doc_embedding)
            x_train = None
            y_train = None
            entity_linker.save("./models/linker_" + ds_name)
            print("*******************************")
            print("Saved Model:\t" + ds_name)
            print("*******************************")
    print("Done")
@@ -10,4 +10,4 @@ pandas==1.3.1
scikit-learn==0.24.2
scipy==1.7.1
tensorflow==2.6.0
pynif==0.2.0
pynif==0.2.0
\ No newline at end of file
from app import app

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5001)
    app.run(host="0.0.0.0", port=5000)
import math
import numpy as np
from elasticsearch import Elasticsearch
from jproperties import Properties
from pynif import NIFCollection
from sentence_transformers import SentenceTransformer
sentence_embedding_model = SentenceTransformer("./doc_embedding_model/all-mpnet-base-v2")
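# all-mpnet-base-v2 produces 768-dimensional sentence vectors, matching the (1, 768)
# doc-embedding input of the linker model.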
configs = Properties()
with open('app.props', 'rb') as config_file:
@@ -27,7 +31,29 @@ def get_embeddings(query_string, index_name, field_name='entity', first_n=1):
    return None


def process_line(text, entity_embeddings=None):
def prepare_sentence_embeddings(path):
    with open(path) as fl:
        line = fl.readline().strip()
        sentences = []
        sentence_embeddings = {}
        while line != "":
            line = line.split("\t")
            if line[1] not in sentences:
                sentences.append(line[1])
            line = fl.readline().strip()
        embeddings = sentence_embedding_model.encode(sentences)
        for sentence, embedding in zip(sentences, embeddings):
            sentence_embeddings[sentence] = embedding
        return sentence_embeddings
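# Both helpers return {sentence: vector} so callers can index by the raw sentence text;
# prepare_sentence_embeddings batch-encodes each unique sentence in a file exactly once.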
def get_sentence_embedding(sentence):
    return {
        sentence: sentence_embedding_model.encode(sentence)
    }


def process_line(text, document_embeddings, entity_embeddings=None):
    line = text.split("\t")
    if len(line) > 2:
        gold = line[2]
@@ -48,7 +74,6 @@ def process_line(text, entity_embeddings=None):
    except KeyError:
        gold = np.full(100, -1)
    except TypeError:
        print(gold)
        gold = np.full(100, -1)
    candidates = []
    for i in range(3, len(line)):
@@ -65,7 +90,6 @@ def process_line(text, entity_embeddings=None):
            except KeyError:
                candidate = np.full(100, -1)
            except TypeError:
                print(candidate)
                candidate = np.full(100, -1)
            candidates.append(candidate)
    if len(candidates) < 30:
@@ -76,20 +100,23 @@ def process_line(text, entity_embeddings=None):
    if entity_embeddings is None:
        return candidates, gold, emb_dict
    else:
        return candidates, gold
        return candidates, gold, document_embeddings[line[1]]


def prepare_training_data(entity_embeddings, path):
    document_embeddings = prepare_sentence_embeddings(path)
    with open(path, "r") as train_file:
        line = train_file.readline().strip()
        x_train = []
        y_train = []
        doc_embeddings = []
        while line != "":
            candidates, gold = process_line(line, entity_embeddings)
            candidates, gold, doc_embedding = process_line(line, document_embeddings, entity_embeddings)
            y_train.append(gold)
            x_train.append(candidates)
            doc_embeddings.append(doc_embedding)
            line = train_file.readline().strip()
        return x_train, y_train
        return x_train, y_train, doc_embeddings


def annotation2nif(collection_name, entities, text):
......