Commit c5f817bb by lgehring

### m

parents 8a030755 9b804fa6
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
 ... ... @@ -22,7 +22,7 @@ def knn(p : List, n : List, instance : List, d = eucl_dist) -> bool: :param d: Distance function. Default is euclidian distance :return: Classification result of instance """ k = int((len(p) + len(n))**0.5) k = int(min(len(p), len(n))**0.5) if k % 2 == 0: k += 1 #make k uneven for majority vote distances = [(True, d(instance, i)) for i in p] + [(False,d(instance, i)) for i in n] ... ...
lib/ml_metrics.py 0 → 100644
 def accuracy(TP : int, TN : int, FP : int, FN : int) -> float: """Ratio of correct predicitons to all predictions. :param TP: True positives :param TN: True negatives :param FP: False positives :param FN: False negatives """ return (TP + TN) / (TP + TN + FP + FN) def precision(TP : int, FP : int) -> float: """Number of true positives in all positives. :param TP: True positives :param FP: False negatives """ return TP / (TP + FP) def recall(TP : int, FN : int) -> float: """How many true instances were calssified correctly? :param TP: True positives :param FN: False negatives """ return TP / (TP + FN) def f1_score(TP : int, FP : int, FN : int) -> float: """Weighted average between precision and recall. :param TP: True positives :param FP: False positives :param FN: False negatives """ return 2 * (recall(TP, FN)*precision(TP, FP)) / (recall(TP, FN) + precision(TP, FP)) def print_metrics(TP : int, TN : int, FP : int, FN : int): """Prints all metrics at once. :param TP: True positives :param TN: True negatives :param FP: False positives :param FN: False negatives """ print(f"Accuracy: {accuracy(TP, TN, FP, FN)}") print(f"Precision: {precision(TP, FP)}") print(f"Recall: {recall(TP, FN)}") print(f"F1-Score: {f1_score(TP, FP, FN)}") if __name__ == "__main__": print_metrics(25, 65, 5, 5) \ No newline at end of file
ml_utils.py 0 → 100644
 import random import pandas as pd def balance_data(X, y, random_state=None): if random_state is None: random_state = random.randint(0, 100000) min_label_count = y.value_counts().min() labels = y.unique() balanced_Xs = [] balanced_ys = [] for l in labels: mask = y == l balanced_Xs.append(X[mask].sample(n=min_label_count, random_state=random_state)) balanced_ys.append(y[mask].sample(n=min_label_count, random_state=random_state)) # TODO: need to sort_index()? balanced_X = pd.concat(balanced_Xs, axis=0).sort_index() balanced_y = pd.concat(balanced_ys, axis=0).sort_index() return balanced_X, balanced_y
plot_data.py 0 → 100644
 import pandas as pd import numpy as np import seaborn as sns from sklearn.decomposition import PCA from rdflib import Graph, URIRef import matplotlib.pyplot as plt def read(path, format): """ Read graph from file :param path: path to file :return: rdf graph """ g = Graph() g.parse(path, format=format) return g def extract_resources(g: Graph, lp: int): """ Extract resources from the graph with respect to the given learning problem :param g: rdf graph with data :param lp: number of the learning problem :return: list with included resources and list with excluded resources """ exclude_res = URIRef('https://lpbenchgen.org/property/excludesResource') include_res = URIRef('https://lpbenchgen.org/property/includesResource') u = URIRef('https://lpbenchgen.org/resource/lp_%d' % lp) included_res = [] excluded_res = [] for i, obj in enumerate(g.objects(u, include_res)): included_res.append(str(obj)) for i, obj in enumerate(g.objects(u, exclude_res)): excluded_res.append(str(obj)) return included_res, excluded_res def load_embeddings_from_file(embeddings_file: str) -> dict: """ Loads embeddings previously generated from file. :embeddings_file: File containing embeddings from previous runs :return: Dictionary of the form entity = embedding. 
""" embeddings = dict() _t = "" with open(embeddings_file, "r") as f: for l in f.readlines(): entity = l.split("\t")[0].strip() embedding = eval(l.split("\t")[1].strip()) embeddings[entity] = embedding return embeddings def generate_features(path="data/embeddings/embeddings_carcinogenesis_transe_16dim.tsv", dim=16, lp=1): g1 = read(path='data/kg-mini-project-train.ttl', format='turtle') included_res, excluded_res = extract_resources(g1, lp=lp) data = load_embeddings_from_file(embeddings_file=path) keys = data.keys() values = data.values() df = pd.DataFrame(data=values, index=keys) df.loc[included_res, 'y'] = 1 df.loc[excluded_res, 'y'] = 0 df = df.dropna() return df[np.arange(dim)], df['y'] def plot_pca(X, y) -> float: """ this function plots the first 2 principal components of the data. the classes are used for coloring the scatterplot (unlabeled elements as gray) :return: explained variance of the pca plot """ pca = PCA(n_components=2) X = pd.DataFrame(data=pca.fit_transform(X), columns=['x', 'y']) sns.scatterplot(x='x', y='y', data=X, hue=y.to_numpy()) return sum(pca.explained_variance_) for dim in [16,32,64]: mode = 'transr' X, y = generate_features(path=f"data/embeddings/embeddings_carcinogenesis_{mode}_{dim}dim.tsv", dim=dim) plot_pca(X, y) plt.title(f"dim {dim} {mode}") plt.show()
 ... ... @@ -12,7 +12,7 @@ def _create_tsv(g: Graph, tsv_target_file : str): for subj, pred, obj in g: f.write(f"{subj}\t{pred}\t{obj}\n") def _create_embeddings(tsv_file : str) -> dict: def _create_embeddings(tsv_file : str, model_name : str = "TransE", dim : int = 16) -> dict: """ Learns embeddings based on a given tsv file and returns then, mapped to their corresponding entity, as vector in dictionary. ... ... @@ -20,28 +20,30 @@ def _create_embeddings(tsv_file : str) -> dict: :return: Dictionary containing all embeddings """ # Split data provied into training and testing sets tf = TriplesFactory(path=tsv_file) #tf = TriplesFactory(path=tsv_file) tf = TriplesFactory.from_path(tsv_file) training, testing = tf.split() # Learn embeddings based on pykeen embeddings = dict() result = pipeline( model='TransE', model=model_name, training=training, testing=testing, model_kwargs=dict(embedding_dim=8), model_kwargs=dict(embedding_dim=dim), training_kwargs=dict(num_epochs=64, use_tqdm_batch=False), negative_sampler = "bernoulli", ) # Extract embeddings and map id to entitiy embeddings_numpy = result.model.entity_embeddings.weight.detach().numpy() entity_id_to_label = result.model.triples_factory.entity_id_to_label embeddings_numpy = result.model.entity_embeddings._embeddings.weight.detach().numpy() entity_id_to_label = result.training.entity_id_to_label for index, data in enumerate(embeddings_numpy): embeddings[entity_id_to_label[index]] = data return embeddings def generate_and_save_embeddings(g : Graph, tsv_target_file: str, out_file : str): def generate_and_save_embeddings(g : Graph, tsv_target_file: str, out_file : str, model_name : str = "TransE", dim : int = 16): """ Generates and saves embeddings based on graph g. Intermediate triples are saved in tsv_target_file. out_file contains the embeddings in the format: ... ... 
@@ -51,7 +53,7 @@ def generate_and_save_embeddings(g : Graph, tsv_target_file: str, out_file : str :out_file: File to write embeddings to """ _create_tsv(g, tsv_target_file) embeddings = _create_embeddings(tsv_target_file) embeddings = _create_embeddings(tsv_target_file, model_name, dim) with open(out_file, "w") as f: f.write("\n".join([f"{key}\t{list(embeddings[key])}" for key in embeddings])) ... ...
 rdflib~=5.0.0 pykeen pykeen~=1.5.0 ontolearn \ No newline at end of file
 ... ... @@ -8,6 +8,9 @@ from sklearn.model_selection import train_test_split from ml_utils import balance_data from sklearn.linear_model import LogisticRegression from sklearn.metrics import f1_score, precision_score, recall_score from lib import knn from lib import ml_metrics def read(path, format): """ ... ... @@ -21,6 +24,38 @@ def read(path, format): return g def run_knn(g_ttl: Graph, path_to_embedding: str, lp: int): """Run knn on learning problem lp. :param g_ttl: Graph containing LP definitions :param path_to_embedding: path to tsv file containing embeddings :param lp: Number of lp to try """ embeddings = load_embeddings_from_file(path_to_embedding) p, n = extract_resources(g_ttl, lp) p = [embeddings[str(x)] for x in p] n = [embeddings[str(x)] for x in n] TP = 0 FP = 0 TN = 0 FN = 0 for i in range(len(p) - 1): if knn.knn(p[:i] + p[i + 1:], n, p[i]): TP += 1 else: FN += 1 for i in range(len(n) - 1): if not knn.knn(p, n[:i] + n[i + 1:], n[i]): TN += 1 else: FP += 1 ml_metrics.print_metrics(TP, TN, FP, FN) def extract_resources(g: Graph, lp: int): """ Extract resources from the graph with respect to the given learning problem ... ... @@ -42,10 +77,11 @@ def extract_resources(g: Graph, lp: int): return included_res, excluded_res << << << < HEAD #g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml') #generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings_carcinogenesis.tsv") # g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml') # generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings_carcinogenesis.tsv") def generate_features(path="data/embeddings_carcinogenesis.tsv", lp=1): g1 = read(path='data/kg-mini-project-train.ttl', format='turtle') ... ... 
@@ -74,11 +110,30 @@ for lp in range(8, 25): prediction = clf.predict(X_test) for a, b in zip(prediction, y_test): if a != b: print(a,b) print(a, b) print("F1:", f1_score(y_test, prediction)) print("precision:", precision_score(y_test, prediction)) print("recall:", recall_score(y_test, prediction)) break # g1 = read(path='data/kg-mini-project-train.ttl', format='turtle') g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml') # run_knn(g1, "data/embeddings_carcinogenesis.tsv", 4) generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings/embeddings_carcinogenesis_transe_16dim.tsv", "TransE", 16) generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings/embeddings_carcinogenesis_transe_32dim.tsv", "TransE", 32) generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings/embeddings_carcinogenesis_transe_64dim.tsv", "TransE", 64) generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings/embeddings_carcinogenesis_transr_16dim.tsv", "TransR", 16) generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings/embeddings_carcinogenesis_transr_32dim.tsv", "TransR", 32) generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings/embeddings_carcinogenesis_transr_64dim.tsv", "TransR", 64) # for lp in range(1, 26): # print(extract_resources(g1, lp))
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment