Commit fcb7eb78 authored by Nandeesh Patel Gowdru Prabushanker's avatar Nandeesh Patel Gowdru Prabushanker
Browse files

Merge branch 'embedding-index' into 'master'

Adding index manager file and relation index changes

See merge request !1
parents bb2c2fb7 0a49eff2
......@@ -8,7 +8,8 @@ with open('app.props', 'rb') as config_file:
configs.load(config_file)
# Flask application with CORS enabled for every route.
app = Flask(__name__)
cors = CORS(app)
# Single shared Elasticsearch client; the credentials come from the
# "elastic.user" / "elastic.password" entries loaded into ``configs``.
es = Elasticsearch(
    ["http://nel.cs.upb.de:9200"],
    http_auth=(
        configs.get("elastic.user").data,
        configs.get("elastic.password").data,
    ),
)
@app.route('/ping', methods=['GET'])
......@@ -17,23 +18,26 @@ def test():
return "Status:\tOK", 200
def get_embeddings(query_string, index_name, field_name='entity', first_n=1):
    """Fetch the stored documents matching ``query_string`` in an index.

    Runs an Elasticsearch ``match`` query on ``field_name`` and returns the
    ``_source`` of the leading hits.

    Args:
        query_string: Value to match against ``field_name``.
        index_name: Name of the Elasticsearch index to search.
        field_name: Document field the match query targets.
        first_n: Maximum number of hits to return.

    Returns:
        A list with at most ``first_n`` ``_source`` dicts, in the order the
        search returned them, or ``None`` when nothing matched.
    """
    res = es.search(index=index_name, body={
        "query": {
            "match": {
                field_name: query_string
            }
        }
    })
    hits = res['hits']['hits']
    if not hits:
        return None
    # Slice instead of indexing up to first_n so a result set shorter than
    # first_n cannot raise IndexError; a non-positive first_n yields [].
    return [hit['_source'] for hit in hits[:max(first_n, 0)]]
@app.route('/get-embedding', methods=['GET'])
@app.route('/get-entity-embedding', methods=['GET'])
@cross_origin()
def get_embedding():
def get_entity_embedding():
if "entities" not in request.json:
return "Invalid parameters", 400
entities = request.json["entities"]
......@@ -41,18 +45,53 @@ def get_embedding():
for entity in entities:
if entity in embeddings:
continue
embeddings[entity] = get_embeddings(entity)
embeddings[entity] = get_embeddings(entity, "embedding_index")[0]['embeddings']
if len(embeddings.keys()) >= 10:
break
return embeddings
@app.route('/get-entity-index-info', methods=['GET'])
@cross_origin()
def get_entity_index_info():
    """Return the ``mappings`` section of the entity embedding index.

    Looks up ``embedding_index`` through the shared Elasticsearch client and
    returns only its mappings.
    """
    settings = es.indices.get(index="embedding_index")
    return settings["embedding_index"]["mappings"]
@app.route('/get-relation-index-info', methods=['GET'])
@cross_origin()
def get_relation_index_info():
    """Return the ``mappings`` section of the relation embedding index."""
    index_name = "relation_embedding_index"
    index_description = es.indices.get(index=index_name)
    return index_description[index_name]["mappings"]
@app.route('/get-relation-embedding', methods=['GET'])
@cross_origin()
def get_relation_embedding():
    """Return the embeddings of up to 10 unique relations from the request.

    Expects a JSON body with a ``"relations"`` array. For each relation the
    matching documents of ``relation_embedding_index`` are grouped by their
    ``dtype`` ('real' / 'imag') and ``operator`` ('rhs' / 'lhs') fields.

    Returns:
        A dict mapping each relation to
        ``{'real': {'rhs', 'lhs'}, 'imag': {'rhs', 'lhs'}}``; slots with no
        matching document stay empty lists. Responds with
        ``("Invalid parameters", 400)`` when the body is missing, is not a
        JSON object, or has no ``"relations"`` key.
    """
    # silent=True yields None instead of raising when there is no usable JSON
    # body, so that case gets the same 400 as a missing key.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or "relations" not in payload:
        return "Invalid parameters", 400
    embeddings = {}
    for relation in payload["relations"]:
        if relation in embeddings:
            continue
        # get_embeddings returns None when nothing matched; treat that as
        # "no documents" so an unknown relation keeps its empty slots.
        hits = get_embeddings(relation, "relation_embedding_index", "relation", 4) or []
        embeddings[relation] = {
            'real': {
                'rhs': [],
                'lhs': []
            },
            'imag': {
                'rhs': [],
                'lhs': []
            }
        }
        for hit in hits:
            embeddings[relation][hit['dtype']][hit['operator']] = hit['embeddings']
        # Only the first 10 unique relations are served.
        if len(embeddings) >= 10:
            break
    return embeddings
# Script entry point: start Flask's built-in server with debug mode on.
if __name__ == '__main__':
    app.run(debug=True)
from dask.distributed import Client
import dask.dataframe as dd
from elasticsearch import Elasticsearch
from jproperties import Properties
# Load the application properties (Elasticsearch credentials) from app.props.
configs = Properties()
with open('app.props', 'rb') as config_file:
    configs.load(config_file)

# Shared Elasticsearch client used by every function in this module.
es = Elasticsearch(
    ["http://nel.cs.upb.de:9200"],
    http_auth=(
        configs.get("elastic.user").data,
        configs.get("elastic.password").data,
    ),
)
def create_entity_index():
    """Create ``embedding_index`` with its settings and field mappings.

    The index gets 5 shards and 1 replica; documents carry keyword ``id`` and
    ``entity`` fields plus a 100-dimensional ``dense_vector`` ``embeddings``.
    """
    field_mappings = {
        'id': {'type': 'keyword'},
        'entity': {'type': 'keyword'},
        'embeddings': {'type': 'dense_vector', 'dims': 100},
    }
    index_body = {
        "settings": {"number_of_shards": 5, "number_of_replicas": 1},
        'mappings': {'properties': field_mappings},
    }
    es.indices.create("embedding_index", body=index_body)
    print("Done")
def create_relation_index():
    """Create ``relation_embedding_index`` with its settings and mappings.

    The index gets 5 shards and 1 replica; documents carry keyword ``id`` and
    ``relation`` fields, text ``operator`` and ``dtype`` fields, and a
    50-dimensional ``dense_vector`` ``embeddings``.
    """
    field_mappings = {
        'id': {'type': 'keyword'},
        'relation': {'type': 'keyword'},
        'operator': {'type': 'text'},
        'dtype': {'type': 'text'},
        'embeddings': {'type': 'dense_vector', 'dims': 50},
    }
    index_body = {
        "settings": {"number_of_shards": 5, "number_of_replicas": 1},
        'mappings': {'properties': field_mappings},
    }
    es.indices.create("relation_embedding_index", body=index_body)
    print("Done")
def index_entity_docs():
    """Bulk-index the entity embeddings TSV into ``embedding_index``.

    Reads ``../nel/embeddings/entity_embeddings.tsv`` (tab separated), parses
    each row's comma-separated ``entity_embeddings`` column into floats and
    sends the documents to Elasticsearch in batches of 50000, using a running
    counter as both ``_id`` and the ``id`` field. Progress is printed after
    every batch.
    """
    data = dd.read_csv("../nel/embeddings/entity_embeddings.tsv", sep="\t")
    doc_id = 0
    documents = []
    print("*********************************************")
    print("Starting to index the entity embeddings now!!")
    print("*********************************************")
    for _, row in data.iterrows():
        embeddings = [float(i) for i in row['entity_embeddings'].split(",")]
        entity = row['entity']
        # Each document contributes two bulk entries: the action line and
        # the document source.
        documents.append({
            "index": {
                "_id": doc_id,
                "_index": "embedding_index"
            }
        })
        documents.append({
            "id": doc_id,
            "entity": entity,
            "embeddings": embeddings
        })
        doc_id += 1
        # 100000 entries == 50000 buffered documents.
        if len(documents) == 100000:
            es.bulk(index="embedding_index", body=documents)
            print(doc_id)
            documents = []
    # Flush the remainder; skip the call when the last batch ended exactly on
    # the boundary, because a bulk request with an empty body is rejected.
    if documents:
        es.bulk(index="embedding_index", body=documents)
    print(doc_id)
    print("*********************************************")
    print("DONE!!")
    print("*********************************************")
def index_relation_docs():
    """Bulk-index the relation embeddings TSV into ``relation_embedding_index``.

    Reads ``./relation_embeddings.tsv`` (tab separated), parses each row's
    comma-separated ``relation_embeddings`` column into floats and sends the
    documents, together with their ``relation``, ``operator`` and ``dtype``
    columns, to Elasticsearch in batches of 5000. A running counter is used as
    both ``_id`` and the ``id`` field. Progress is printed after every batch.
    """
    data = dd.read_csv("./relation_embeddings.tsv", sep="\t")
    doc_id = 0
    documents = []
    print("*********************************************")
    print("Starting to index the relation embeddings now!!")
    print("*********************************************")
    for _, row in data.iterrows():
        embeddings = [float(i) for i in row['relation_embeddings'].split(",")]
        # Each document contributes two bulk entries: the action line and
        # the document source.
        documents.append({
            "index": {
                "_id": doc_id,
                "_index": "relation_embedding_index"
            }
        })
        documents.append({
            "id": doc_id,
            "relation": row['relation'],
            "embeddings": embeddings,
            "operator": row['operator'],
            "dtype": row['dtype']
        })
        doc_id += 1
        # 10000 entries == 5000 buffered documents.
        if len(documents) == 10000:
            es.bulk(index="relation_embedding_index", body=documents)
            print(doc_id)
            documents = []
    # Flush the remainder; skip the call when the last batch ended exactly on
    # the boundary, because a bulk request with an empty body is rejected.
    if documents:
        es.bulk(index="relation_embedding_index", body=documents)
    print(doc_id)
    print("*********************************************")
    print("DONE!!")
    print("*********************************************")
if __name__ == "__main__":
    # Run the indexing with a dask distributed client active; the context
    # manager closes the client once indexing finishes or fails.
    with Client():
        # Entity indexing is switched off; uncomment to rebuild that index.
        # create_entity_index()
        # index_entity_docs()
        create_relation_index()
        index_relation_docs()
## DBpedia entity embeddings
This repo provides APIs that can be used to access embeddings for all the DBpedia entities. The embeddings are indexed on an Elasticsearch server.
<br><br>
### List of APIs
#### 1. Get entity embeddings
This API takes a list of entities as input and returns the embeddings of the given entities in the response. It returns the embeddings of the first 10 unique entities and ignores the rest.
```
URL: /get-entity-embedding
METHOD: GET
Request Body: {
"entities": Array of entities
}
```
#### 2. Get Elasticsearch entity embedding index properties
This API returns the list of properties of every document in the Elasticsearch index of entity embeddings.
```
URL: /get-entity-index-info
METHOD: GET
```
#### 3. Get relation embeddings
This API takes a list of relations as input and returns the embeddings of the given relations in the response. It returns the embeddings of the first 10 unique relations and ignores the rest.
```
URL: /get-relation-embedding
METHOD: GET
Request Body: {
"relations": Array of relations
}
```
#### 4. Get Elasticsearch relation embedding index properties
This API returns the list of properties of every document in the Elasticsearch index of relation embeddings.
```
URL: /get-relation-index-info
METHOD: GET
```
\ No newline at end of file
......@@ -4,3 +4,4 @@ Flask-Cors==3.0.10
jproperties==2.1.1
requests==2.26.0
uWSGI==2.0.19.1
dask==2021.8.0
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment