Commit c5f817bb authored by lgehring's avatar lgehring

m

parents 8a030755 9b804fa6
@@ -22,7 +22,7 @@ def knn(p : List, n : List, instance : List, d = eucl_dist) -> bool:
    :param d: Distance function. Default is Euclidean distance
    :return: Classification result of instance
    """
-    k = int((len(p) + len(n))**0.5)
+    k = int(min(len(p), len(n))**0.5)
    if k % 2 == 0:
        k += 1  # make k odd for the majority vote
    distances = [(True, d(instance, i)) for i in p] + [(False, d(instance, i)) for i in n]
......
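The tail of knn is collapsed in this diff. For context, a minimal sketch of the full majority-vote classifier consistent with the visible lines might look like the following; the sort and vote steps are assumptions, not the committed code:

from typing import List

def eucl_dist(a: List, b: List) -> float:
    # straight-line distance between two equal-length vectors
    return sum((x - y) ** 2 for x, y in zip(a, b)) ** 0.5

def knn(p: List, n: List, instance: List, d=eucl_dist) -> bool:
    k = int(min(len(p), len(n)) ** 0.5)
    if k % 2 == 0:
        k += 1  # odd k rules out ties in the vote
    distances = [(True, d(instance, i)) for i in p] + [(False, d(instance, i)) for i in n]
    distances.sort(key=lambda pair: pair[1])   # nearest neighbours first (assumed)
    votes = [label for label, _ in distances[:k]]
    return votes.count(True) > k // 2          # True iff positives hold the majority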
def accuracy(TP : int, TN : int, FP : int, FN : int) -> float:
    """Ratio of correct predictions to all predictions.
    :param TP: True positives
    :param TN: True negatives
    :param FP: False positives
    :param FN: False negatives
    """
    return (TP + TN) / (TP + TN + FP + FN)

def precision(TP : int, FP : int) -> float:
    """Fraction of predicted positives that are true positives.
    :param TP: True positives
    :param FP: False positives
    """
    return TP / (TP + FP)

def recall(TP : int, FN : int) -> float:
    """Fraction of actual positives that were classified correctly.
    :param TP: True positives
    :param FN: False negatives
    """
    return TP / (TP + FN)

def f1_score(TP : int, FP : int, FN : int) -> float:
    """Harmonic mean of precision and recall.
    :param TP: True positives
    :param FP: False positives
    :param FN: False negatives
    """
    return 2 * (recall(TP, FN) * precision(TP, FP)) / (recall(TP, FN) + precision(TP, FP))

def print_metrics(TP : int, TN : int, FP : int, FN : int):
    """Prints all metrics at once.
    :param TP: True positives
    :param TN: True negatives
    :param FP: False positives
    :param FN: False negatives
    """
    print(f"Accuracy: {accuracy(TP, TN, FP, FN)}")
    print(f"Precision: {precision(TP, FP)}")
    print(f"Recall: {recall(TP, FN)}")
    print(f"F1-Score: {f1_score(TP, FP, FN)}")

if __name__ == "__main__":
    print_metrics(25, 65, 5, 5)
\ No newline at end of file
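As a sanity check on the __main__ example above: accuracy = (25 + 65) / 100 = 0.9, precision = 25 / 30 ≈ 0.833, recall = 25 / 30 ≈ 0.833, and since precision equals recall here, the F1 score is also ≈ 0.833.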
import random
import pandas as pd
def balance_data(X, y, random_state=None):
    """Downsample every class to the size of the rarest class.
    :param X: Feature DataFrame
    :param y: Label Series aligned with X
    :param random_state: Seed for reproducible sampling (random if None)
    :return: Balanced X and y, restored to their original index order
    """
    if random_state is None:
        random_state = random.randint(0, 100000)
    min_label_count = y.value_counts().min()
    labels = y.unique()
    balanced_Xs = []
    balanced_ys = []
    for l in labels:
        mask = y == l
        balanced_Xs.append(X[mask].sample(n=min_label_count, random_state=random_state))
        balanced_ys.append(y[mask].sample(n=min_label_count, random_state=random_state))
    # sort_index() restores the original row order after the per-class concat
    balanced_X = pd.concat(balanced_Xs, axis=0).sort_index()
    balanced_y = pd.concat(balanced_ys, axis=0).sort_index()
    return balanced_X, balanced_y
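A quick toy illustration of the balancing (data and names hypothetical):

X_toy = pd.DataFrame({"f0": [0.1, 0.2, 0.3, 0.4, 0.5]})
y_toy = pd.Series([1, 1, 1, 0, 0])
X_bal, y_bal = balance_data(X_toy, y_toy, random_state=42)
print(y_bal.value_counts())  # both labels now appear exactly twice

Because X and y are sampled with the same random_state over the same mask, the drawn row positions match, so features and labels stay aligned.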
import ast

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA
from rdflib import Graph, URIRef
import matplotlib.pyplot as plt
def read(path, format):
    """
    Read graph from file
    :param path: path to file
    :param format: rdflib serialization format (e.g. 'turtle')
    :return: rdf graph
    """
    g = Graph()
    g.parse(path, format=format)
    return g
def extract_resources(g: Graph, lp: int):
    """
    Extract resources from the graph with respect to the given learning problem
    :param g: rdf graph with data
    :param lp: number of the learning problem
    :return: list with included resources and list with excluded resources
    """
    exclude_res = URIRef('https://lpbenchgen.org/property/excludesResource')
    include_res = URIRef('https://lpbenchgen.org/property/includesResource')
    u = URIRef('https://lpbenchgen.org/resource/lp_%d' % lp)
    included_res = [str(obj) for obj in g.objects(u, include_res)]
    excluded_res = [str(obj) for obj in g.objects(u, exclude_res)]
    return included_res, excluded_res
def load_embeddings_from_file(embeddings_file: str) -> dict:
    """
    Loads embeddings previously generated from file.
    :param embeddings_file: File containing embeddings from previous runs
    :return: Dictionary mapping entity -> embedding.
    """
    embeddings = dict()
    with open(embeddings_file, "r") as f:
        for line in f:
            entity, embedding = line.strip().split("\t")
            # literal_eval parses the stored list without eval's security risk
            embeddings[entity] = ast.literal_eval(embedding)
    return embeddings
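Each line of the embeddings file is expected to carry an entity IRI and a Python list literal, separated by a tab, which is exactly the layout generate_and_save_embeddings writes further down. For illustration (entity and values made up):

http://example.org/compound_d1	[0.12, -0.03, 0.77]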
def generate_features(path="data/embeddings/embeddings_carcinogenesis_transe_16dim.tsv", dim=16, lp=1):
    """Builds the feature matrix X and label vector y for one learning problem."""
    g1 = read(path='data/kg-mini-project-train.ttl', format='turtle')
    included_res, excluded_res = extract_resources(g1, lp=lp)
    data = load_embeddings_from_file(embeddings_file=path)
    keys = data.keys()
    values = data.values()
    df = pd.DataFrame(data=values, index=keys)
    df.loc[included_res, 'y'] = 1
    df.loc[excluded_res, 'y'] = 0
    df = df.dropna()  # keep only resources labeled by the learning problem
    return df[np.arange(dim)], df['y']
def plot_pca(X, y) -> float:
    """
    Plots the first 2 principal components of the data.
    The classes are used for coloring the scatterplot (unlabeled elements as gray).
    :return: explained variance of the PCA plot
    """
    pca = PCA(n_components=2)
    X = pd.DataFrame(data=pca.fit_transform(X), columns=['x', 'y'])
    sns.scatterplot(x='x', y='y', data=X, hue=y.to_numpy())
    # sum of the absolute variance captured by the two plotted components
    return sum(pca.explained_variance_)
for dim in [16, 32, 64]:
    mode = 'transr'
    X, y = generate_features(path=f"data/embeddings/embeddings_carcinogenesis_{mode}_{dim}dim.tsv", dim=dim)
    plot_pca(X, y)
    plt.title(f"dim {dim} {mode}")
    plt.show()
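Note that plot_pca returns the absolute variance captured by the two components, which depends on the scale of the embeddings; to compare plots across embedding dimensions, sum(pca.explained_variance_ratio_) would be the scale-free alternative.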
@@ -12,7 +12,7 @@ def _create_tsv(g: Graph, tsv_target_file : str):
    for subj, pred, obj in g:
        f.write(f"{subj}\t{pred}\t{obj}\n")
-def _create_embeddings(tsv_file : str) -> dict:
+def _create_embeddings(tsv_file : str, model_name : str = "TransE", dim : int = 16) -> dict:
"""
Learns embeddings based on a given tsv file and returns then, mapped
to their corresponding entity, as vector in dictionary.
@@ -20,28 +20,30 @@ def _create_embeddings(tsv_file : str) -> dict:
    :return: Dictionary containing all embeddings
    """
    # Split data provided into training and testing sets
-    tf = TriplesFactory(path=tsv_file)
+    #tf = TriplesFactory(path=tsv_file)
+    tf = TriplesFactory.from_path(tsv_file)
    training, testing = tf.split()
    # Learn embeddings based on pykeen
    embeddings = dict()
    result = pipeline(
-        model='TransE',
+        model=model_name,
        training=training,
        testing=testing,
-        model_kwargs=dict(embedding_dim=8),
+        model_kwargs=dict(embedding_dim=dim),
        training_kwargs=dict(num_epochs=64, use_tqdm_batch=False),
        negative_sampler="bernoulli",
    )
    # Extract embeddings and map id to entity
-    embeddings_numpy = result.model.entity_embeddings.weight.detach().numpy()
-    entity_id_to_label = result.model.triples_factory.entity_id_to_label
+    embeddings_numpy = result.model.entity_embeddings._embeddings.weight.detach().numpy()
+    entity_id_to_label = result.training.entity_id_to_label
    for index, data in enumerate(embeddings_numpy):
        embeddings[entity_id_to_label[index]] = data
    return embeddings
-def generate_and_save_embeddings(g : Graph, tsv_target_file: str, out_file : str):
+def generate_and_save_embeddings(g : Graph, tsv_target_file: str, out_file : str, model_name : str = "TransE", dim : int = 16):
    """
    Generates and saves embeddings based on graph g. Intermediate triples are saved in tsv_target_file.
    out_file contains the embeddings in the format:
@@ -51,7 +53,7 @@ def generate_and_save_embeddings(g : Graph, tsv_target_file: str, out_file : str
    :param out_file: File to write embeddings to
    """
    _create_tsv(g, tsv_target_file)
-    embeddings = _create_embeddings(tsv_target_file)
+    embeddings = _create_embeddings(tsv_target_file, model_name, dim)
    with open(out_file, "w") as f:
        f.write("\n".join([f"{key}\t{list(embeddings[key])}" for key in embeddings]))
......
rdflib~=5.0.0
-pykeen
+pykeen~=1.5.0
ontolearn
\ No newline at end of file
@@ -8,6 +8,9 @@ from sklearn.model_selection import train_test_split
from ml_utils import balance_data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
+from lib import knn
+from lib import ml_metrics
def read(path, format):
    """
@@ -21,6 +24,38 @@ def read(path, format):
    return g
def run_knn(g_ttl: Graph, path_to_embedding: str, lp: int):
    """Run knn on learning problem lp.
    :param g_ttl: Graph containing LP definitions
    :param path_to_embedding: path to tsv file containing embeddings
    :param lp: Number of the learning problem to evaluate
    """
    embeddings = load_embeddings_from_file(path_to_embedding)
    p, n = extract_resources(g_ttl, lp)
    p = [embeddings[str(x)] for x in p]
    n = [embeddings[str(x)] for x in n]
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    # Leave-one-out: every labeled example is classified against all the others
    for i in range(len(p)):
        if knn.knn(p[:i] + p[i + 1:], n, p[i]):
            TP += 1
        else:
            FN += 1
    for i in range(len(n)):
        if not knn.knn(p, n[:i] + n[i + 1:], n[i]):
            TN += 1
        else:
            FP += 1
    ml_metrics.print_metrics(TP, TN, FP, FN)
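A spot check could reuse the graph and embedding paths that appear (commented out) near the bottom of this script:

g1 = read(path='data/kg-mini-project-train.ttl', format='turtle')
run_knn(g1, "data/embeddings_carcinogenesis.tsv", 4)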
def extract_resources(g: Graph, lp: int):
    """
    Extract resources from the graph with respect to the given learning problem
@@ -42,10 +77,11 @@ def extract_resources(g: Graph, lp: int):
    return included_res, excluded_res

-<<<<<<< HEAD
-#g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml')
-#generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings_carcinogenesis.tsv")
+# g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml')
+# generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings_carcinogenesis.tsv")
def generate_features(path="data/embeddings_carcinogenesis.tsv", lp=1):
    g1 = read(path='data/kg-mini-project-train.ttl', format='turtle')
@@ -74,11 +110,30 @@ for lp in range(8, 25):
    prediction = clf.predict(X_test)
    for a, b in zip(prediction, y_test):
        if a != b:
-            print(a,b)
+            print(a, b)
    print("F1:", f1_score(y_test, prediction))
    print("precision:", precision_score(y_test, prediction))
    print("recall:", recall_score(y_test, prediction))
    break
# g1 = read(path='data/kg-mini-project-train.ttl', format='turtle')
g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml')
# run_knn(g1, "data/embeddings_carcinogenesis.tsv", 4)
generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
                             "data/embeddings/embeddings_carcinogenesis_transe_16dim.tsv", "TransE", 16)
generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
                             "data/embeddings/embeddings_carcinogenesis_transe_32dim.tsv", "TransE", 32)
generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
                             "data/embeddings/embeddings_carcinogenesis_transe_64dim.tsv", "TransE", 64)
generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
                             "data/embeddings/embeddings_carcinogenesis_transr_16dim.tsv", "TransR", 16)
generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
                             "data/embeddings/embeddings_carcinogenesis_transr_32dim.tsv", "TransR", 32)
generate_and_save_embeddings(g2, "data/carcinogenesis.tsv",
                             "data/embeddings/embeddings_carcinogenesis_transr_64dim.tsv", "TransR", 64)
# for lp in range(1, 26):
#     print(extract_resources(g1, lp))
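The six generate_and_save_embeddings calls above could equally be written as a loop over models and dimensions; a minimal sketch, assuming the same lowercase file-naming scheme:

for model_name in ("TransE", "TransR"):
    for dim in (16, 32, 64):
        generate_and_save_embeddings(
            g2, "data/carcinogenesis.tsv",
            f"data/embeddings/embeddings_carcinogenesis_{model_name.lower()}_{dim}dim.tsv",
            model_name, dim)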