Commit ffa1e0c3 authored by Nandeesh Patel Gowdru Prabushanker's avatar Nandeesh Patel Gowdru Prabushanker
Browse files

Merge branch 'embedding-index' into 'master'

New changes

See merge request !2
parents fcb7eb78 f3ee841e
## DBpedia entity embeddings
## DBpedia entity embeddings
This repo provides APIs that can be used to access embeddings for all the DBpedia entities. The embeddings are indexed on an Elasticsearch server.
<br><br>
### List of APIs
### List of APIs
####1. Get entity embeddings
#### 1. Get entity embeddings
This API takes a list of entities as input and returns the embeddings of the given entities in response. It returns the embeddings of the first 10 unique entities and ignores the rest.
```
URL: /get-entity-embedding
......@@ -15,14 +15,14 @@ Request Body: {
}
```
####2. Get elastic search entity embedding index properties
#### 2. Get elastic search entity embedding index properties
This API returns the list of properties of every document in the Elasticsearch index of entity embeddings.
```
URL: /get-entity-index-info
METHOD: GET
```
####3. Get relation embeddings
#### 3. Get relation embeddings
This API takes a list of relations as input and returns the embeddings of the given relations in response. It returns the embeddings of the first 10 unique relations and ignores the rest.
```
URL: /get-relation-embedding
......@@ -32,9 +32,11 @@ Request Body: {
}
```
####4. Get elastic search relation embedding index properties
#### 4. Get elastic search relation embedding index properties
This API returns the list of properties of every document in the Elasticsearch index of relation embeddings.
```
URL: /get-relation-index-info
METHOD: GET
```
\ No newline at end of file
```
These APIs can be accessed from http://nel.cs.upb.de:5000/ on UPB network for now.
\ No newline at end of file
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input, ReLU, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.losses import CosineSimilarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from utils import prepare_training_data
def train_model(train_x, train_y):
    """Build and fit the BiLSTM entity-linking model.

    Args:
        train_x: training inputs shaped (samples, 30, 100) — 30 candidate
            embeddings of dimension 100 per sample.
        train_y: target embeddings shaped (samples, 100).

    Returns:
        The fitted Keras ``Model`` mapping candidate sets to a predicted
        entity embedding.
    """
    # 30 candidates x 100-dim embeddings in, a single 100-dim embedding out.
    inputs = Input(shape=(30, 100))
    encoded = Bidirectional(LSTM(units=100))(inputs)
    activated = ReLU()(encoded)
    outputs = Dense(100, activation="sigmoid")(activated)
    linker = Model(inputs, outputs)
    # Cosine-similarity loss compares predicted vs. gold embedding direction.
    linker.compile(loss=CosineSimilarity(axis=1), optimizer='sgd', metrics=['accuracy'])
    linker.fit(train_x, train_y, batch_size=10, epochs=5,
               validation_split=0.25, verbose=1)
    return linker
def get_ref_entity(candidates, prediction):
    """Return the candidate embedding most cosine-similar to *prediction*.

    Args:
        candidates: iterable of embedding vectors (one per candidate entity).
        prediction: the embedding predicted by the model.

    Returns:
        The candidate object with the highest cosine similarity to
        *prediction*, or ``None`` when *candidates* is empty.
    """
    # Fix: the original computed cosine_similarity twice per candidate (once
    # for the comparison and once for the assignment). Compute it once, using
    # a direct numpy formula instead of sklearn's pairwise helper.
    best_score = -2.0  # cosine similarity lies in [-1, 1], so any match wins
    ref_entity = None
    pred = np.asarray(prediction, dtype=float)
    pred_norm = np.linalg.norm(pred)
    for candidate in candidates:
        cand = np.asarray(candidate, dtype=float)
        denom = np.linalg.norm(cand) * pred_norm
        # Zero-norm vectors get similarity 0, matching sklearn's behavior.
        score = float(np.dot(cand, pred) / denom) if denom else 0.0
        # Strict '>' keeps the first of equally-similar candidates, as before.
        if score > best_score:
            best_score = score
            ref_entity = candidate
    return ref_entity
def get_entity_from_embeddings(embeddings_array):
    """Map each embedding vector back to its entity URI.

    Looks each vector up in the module-level ``data`` DataFrame by matching
    its comma-joined string form against the ``embeddings`` column.

    NOTE(review): relies on the global ``data`` DataFrame being loaded
    (done in the ``__main__`` block) — confirm before reusing elsewhere.

    Args:
        embeddings_array: iterable of embedding vectors.

    Returns:
        A list of entity URI strings; unmatched vectors map to the
        placeholder URI ``http://dbpedia.org/unknown``.
    """
    resolved = []
    for vector in embeddings_array:
        key = ",".join(map(str, vector))
        match = data[data.embeddings == key]
        if match.empty:
            resolved.append("http://dbpedia.org/unknown")
        else:
            resolved.append(match.get('entity').array[0])
    return resolved
def evaluate(x_test, y_test):
    """Score the trained linker on held-out data and print F1 metrics.

    Uses the module-level ``entity_linker`` model to predict an embedding
    per sample, snaps each prediction to its nearest candidate, resolves
    embeddings back to entity URIs, and prints micro/macro/weighted F1.

    Args:
        x_test: candidate-embedding sets, shaped (samples, 30, 100).
        y_test: gold entity embeddings, shaped (samples, 100).
    """
    predictions = entity_linker(x_test)
    # Snap each predicted embedding to its most similar candidate.
    chosen = [get_ref_entity(cands, predictions[idx])
              for idx, cands in enumerate(x_test)]
    gold_labels = get_entity_from_embeddings(y_test)
    linked_labels = get_entity_from_embeddings(chosen)
    for label, scheme in (("Micro", "micro"),
                          ("Macro", "macro"),
                          ("Weighted", "weighted")):
        score = f1_score(y_true=gold_labels, y_pred=linked_labels, average=scheme)
        print(label + " F1:\t" + str(score))
if __name__ == "__main__":
    # Load the entity -> embedding-string lookup table from the TSV dump.
    data = pd.read_csv("./../data/embeddings.tsv", sep="\t")
    embeddings = {}
    for index, entry in data.iterrows():
        embeddings[entry.get('entity')] = entry.get('embeddings')
    X_train, Y_train = prepare_training_data(embeddings, "./../data/train_data.csv")
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)
    entity_linker = train_model(X_train, Y_train)
    # Release the training tensors before building the test set.
    X_train = None
    Y_train = None
    # BUG FIX: the embeddings dict was previously set to None *before* this
    # call, which would make prepare_training_data crash on `None[key]`.
    # It must stay populated until the test data has been prepared.
    X_test, Y_test = prepare_training_data(embeddings, "./../data/test_data.csv")
    embeddings = None
    evaluate(X_test, Y_test)
    print("Done")
import numpy as np
def prepare_training_data(entity_embeddings, path):
    """Parse a tab-separated training file into (x, y) embedding lists.

    Each line is tab-split; column 2 holds the gold entity and candidate
    entities sit at the odd-numbered columns from index 3 onward. Every
    embedding string in *entity_embeddings* is comma-separated floats.

    Args:
        entity_embeddings: dict mapping entity URI -> comma-separated
            embedding string.
        path: path to the TSV training/test file.

    Returns:
        Tuple ``(x_train, y_train)`` where each x entry is a list of exactly
        30 candidate embedding vectors (padded/truncated with -1 vectors)
        and each y entry is the gold embedding (zeros if unknown).
    """
    x_train = []
    y_train = []
    with open(path, "r") as handle:
        for raw in handle:
            record = raw.strip()
            # Preserve original behavior: stop at the first blank line.
            if not record:
                break
            fields = record.split("\t")
            # Gold entity embedding; unknown entities become a zero vector.
            try:
                gold_vec = np.array(entity_embeddings[fields[2]].split(","), dtype="float")
            except KeyError:
                gold_vec = np.zeros(100)
            y_train.append(gold_vec)
            # Candidates live at odd indices 3, 5, 7, ...; unknowns get -1s.
            cands = []
            for idx in range(3, len(fields), 2):
                try:
                    vec = np.array(entity_embeddings[fields[idx]].split(","), dtype="float")
                except KeyError:
                    vec = np.full(100, -1)
                cands.append(vec)
            # Normalize to exactly 30 candidates per sample.
            cands = cands[:30]
            while len(cands) < 30:
                cands.append(np.full(100, -1))
            x_train.append(cands)
    return x_train, y_train
def test():
    """Smoke-test hook: print a marker showing the util module is importable."""
    print("Inside the util module")
numpy==1.19.5
pandas==1.3.1
scikit-learn==0.24.2
scipy==1.7.1
tensorflow==2.6.0
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment