Commit 6ef7aeaa authored by markus's avatar markus
Browse files

Cleanup

parent 4622fad3
%% Cell type:code id: tags:
```
import numpy as np
from sklearn.cluster import DBSCAN, KMeans
from pykeen_embeddings import generate_and_save_embeddings,load_embeddings_from_file
from run import extract_resources, read
from lib import ml_metrics
from rdflib import Graph, URIRef
def read(path, format):
    """
    Parse an RDF graph from a serialized file.

    :param path: path to the file holding the graph
    :param format: rdflib serialization format name (e.g. 'turtle')
    :return: the populated rdf graph
    """
    graph = Graph()
    graph.parse(path, format=format)
    return graph
def extract_resources(g: Graph, lp: int):
    """
    Extract resources from the graph with respect to the given learning problem.

    :param g: rdf graph with data
    :param lp: number of the learning problem
    :return: list with included resources and list with excluded resources
    """
    exclude_res = URIRef('https://lpbenchgen.org/property/excludesResource')
    include_res = URIRef('https://lpbenchgen.org/property/includesResource')
    u = URIRef('https://lpbenchgen.org/resource/lp_%d' % lp)
    # Comprehensions replace the append loops; the enumerate index was unused.
    included_res = [str(obj) for obj in g.objects(u, include_res)]
    excluded_res = [str(obj) for obj in g.objects(u, exclude_res)]
    return included_res, excluded_res
def load_embeddings_from_file(embeddings_file: str) -> dict:
    """
    Load embeddings previously generated and saved to a TSV file.

    Each line is expected to hold "entity<TAB>embedding", where the embedding
    is a Python list literal such as "[0.1, 0.2]".

    :param embeddings_file: file containing embeddings from previous runs
    :return: dictionary mapping entity -> embedding list
    """
    import ast  # local import: used only for safe literal parsing below
    embeddings = dict()
    with open(embeddings_file, "r") as f:
        for line in f:
            entity = line.split("\t")[0].strip()
            # ast.literal_eval accepts only Python literals, unlike eval(),
            # which would execute arbitrary code found in the file.
            embedding = ast.literal_eval(line.split("\t")[1].strip())
            embeddings[entity] = embedding
    return embeddings
```
%% Cell type:code id: tags:
```
# Load the training learning problems and pre-computed TransR embeddings.
# The earlier duplicate loads from 'data/...' were dead stores — their results
# were immediately overwritten by the '../data/...' pair — so they are removed.
learning_problems = read(path='../data/kg-mini-project-train.ttl', format='turtle')
embeddings = load_embeddings_from_file("../data/embeddings/embeddings_carcinogenesis_transr_16dim.tsv")
pos, neg = extract_resources(learning_problems, 1)
all_res = pos + neg
# Stack the embeddings of all labelled resources into one feature matrix.
all_res_embeddings = np.array([embeddings[x] for x in all_res])
......@@ -23,42 +70,24 @@
```
%% Cell type:code id: tags:
```
# Determine which cluster label dominates among the positive examples:
# label 1 if more than half of the first len(pos) labels are 1, else label 0.
positive_label_sum = sum(clustering.labels_[: len(pos)])
positives_labels = 1 if positive_label_sum > len(pos) / 2 else 0
print(clustering.labels_[:41])
```
%% Cell type:code id: tags:
```
# Inspect the range of cluster labels produced by the clustering run,
# and which label was chosen to represent the "positive" cluster.
print(max(clustering.labels_), min(clustering.labels_))
print(positives_labels)
```
%% Cell type:code id: tags:
```
# Sanity check: count of positive examples (displayed by the notebook)
# next to the first 41 cluster labels.
len(pos)
print(clustering.labels_[:41])
```
%% Cell type:code id: tags:
```
sum(clustering.labels_[41:])
```
%%%% Output: execute_result
235
241
%% Cell type:code id: tags:
```
......
rdflib~=5.0.0
pykeen~=1.5.0
ontolearn
\ No newline at end of file
from rdflib import Graph, URIRef
from sklearn.ensemble import RandomForestClassifier
from pykeen_embeddings import generate_and_save_embeddings, load_embeddings_from_file
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from ml_utils import balance_data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from lib import knn
from lib import ml_metrics
def read(path, format):
    """
    Read graph from file.

    :param path: path to file
    :param format: serialization format passed through to Graph.parse
    :return: rdf graph
    """
    result = Graph()
    result.parse(path, format=format)
    return result
def run_knn(g_ttl: Graph, path_to_embedding: str, lp: int):
    """Run a leave-one-out kNN evaluation on learning problem lp.

    Each positive (negative) example is held out in turn, classified against
    the remaining examples, and the confusion counts are printed.

    :param g_ttl: Graph containing LP definitions
    :param path_to_embedding: path to tsv file containing embeddings
    :param lp: Number of lp to try
    """
    embeddings = load_embeddings_from_file(path_to_embedding)
    p, n = extract_resources(g_ttl, lp)
    p = [embeddings[str(x)] for x in p]
    n = [embeddings[str(x)] for x in n]
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    # Bug fix: range(len(p) - 1) skipped the last positive example, so one
    # example per class was never evaluated. Iterate over ALL elements.
    for i in range(len(p)):
        if knn.knn(p[:i] + p[i + 1:], n, p[i]):
            TP += 1
        else:
            FN += 1
    for i in range(len(n)):
        if not knn.knn(p, n[:i] + n[i + 1:], n[i]):
            TN += 1
        else:
            FP += 1
    ml_metrics.print_metrics(TP, TN, FP, FN)
def extract_resources(g: Graph, lp: int):
    """
    Extract resources from the graph with respect to the given learning problem.

    :param g: rdf graph with data
    :param lp: number of the learning problem
    :return: list with included resources and list with excluded resources
    """
    exclude_res = URIRef('https://lpbenchgen.org/property/excludesResource')
    include_res = URIRef('https://lpbenchgen.org/property/includesResource')
    u = URIRef('https://lpbenchgen.org/resource/lp_%d' % lp)
    # Comprehensions replace the append loops; the enumerate index was unused.
    included_res = [str(obj) for obj in g.objects(u, include_res)]
    excluded_res = [str(obj) for obj in g.objects(u, exclude_res)]
    return included_res, excluded_res
# g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml')
# generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings_carcinogenesis.tsv")
def generate_features(path="data/embeddings/embeddings_carcinogenesis_transr_32dim.tsv", lp=1,
                      lp_path='data/kg-mini-project-train.ttl'):
    """
    Build a labelled feature matrix for one learning problem.

    :param path: tsv file with entity embeddings (name column + list literal)
    :param lp: number of the learning problem to label against
    :param lp_path: turtle file with the learning-problem definitions
        (new defaulted parameter; the path was previously hard-coded)
    :return: tuple (X, y) where X holds the embedding columns and y the
        0/1 inclusion labels; unlabelled rows are dropped
    """
    g1 = read(path=lp_path, format='turtle')
    included_res, excluded_res = extract_resources(g1, lp=lp)
    df = pd.read_csv(path, delimiter='\t', names=['name', 'X'])
    # Each embedding is stored as a textual list "[a, b, ...]"; strip the
    # surrounding brackets and parse into a numeric vector.
    vectors = [np.fromstring(row[1:-1], sep=',') for row in df['X'].to_list()]
    df = pd.concat([df['name'], pd.DataFrame(vectors)], axis=1)
    df.loc[df['name'].isin(included_res), 'y'] = 1
    df.loc[df['name'].isin(excluded_res), 'y'] = 0
    # Entities that are neither included nor excluded keep a NaN label
    # and are removed here.
    df = df.dropna()
    return df.iloc[:, 1:-1], df['y']
# Train and evaluate a RandomForest baseline on a single learning problem.
# NOTE(review): the unconditional `break` at the bottom means only lp=8 is
# evaluated even though the loop is written over range(8, 25) — confirm
# whether skipping the remaining problems is intentional.
for lp in range(8, 25):
    X, y = generate_features(lp=lp)
    # Hold out 30% for testing; rebalance only the training split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train, y_train = balance_data(X_train, y_train, random_state=42)
    clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    prediction = clf.predict(X_test)
    # Print each misclassified (predicted, actual) pair for inspection.
    for a, b in zip(prediction, y_test):
        if a != b:
            print(a, b)
    print("F1:", f1_score(y_test, prediction))
    print("precision:", precision_score(y_test, prediction))
    print("recall:", recall_score(y_test, prediction))
    break
# g1 = read(path='data/kg-mini-project-train.ttl', format='turtle')
#g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml')
# run_knn(g1, "data/embeddings_carcinogenesis.tsv", 4)
#generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
# "data/embeddings/embeddings_carcinogenesis_transe_16dim.tsv", "TransE", 16)
#generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
# "data/embeddings/embeddings_carcinogenesis_transe_32dim.tsv", "TransE", 32)
#generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
# "data/embeddings/embeddings_carcinogenesis_transe_64dim.tsv", "TransE", 64)
#generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
# "data/embeddings/embeddings_carcinogenesis_transr_16dim.tsv", "TransR", 16)
#generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
# "data/embeddings/embeddings_carcinogenesis_transr_32dim.tsv", "TransR", 32)
#generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
# "data/embeddings/embeddings_carcinogenesis_transr_64dim.tsv", "TransR", 64)
# for lp in range(1, 26):
# print(extract_resources(g1, lp))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment