Commit 36e14359 authored by lgehring
Browse files

add prediction of grading

parent d9cf2fbb
from rdflib import Graph, URIRef
import sys
from sklearn.svm import LinearSVC
from pykeen_embeddings import load_embeddings_from_file
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from rdflib import Graph, Literal, Namespace, XSD, URIRef
def read(path, format):
......@@ -42,10 +42,8 @@ def extract_resources(g: Graph, lp: int):
return included_res, excluded_res
def generate_features(path="data/embeddings/embeddings_carcinogenesis_transe_16dim.tsv", lp=1):
g1 = read(path='data/kg-mini-project-train_v2.ttl', format='turtle')
included_res, excluded_res = extract_resources(g1, lp=lp)
def generate_features(included_res: list, excluded_res: list,
path="data/embeddings/embeddings_carcinogenesis_transr_64dim.tsv"):
embeddings = load_embeddings_from_file(embeddings_file=path)
df = pd.DataFrame.from_dict(data=embeddings, orient='index')
......@@ -58,47 +56,82 @@ def generate_features(path="data/embeddings/embeddings_carcinogenesis_transe_16d
return df.iloc[:, :-1], df['y']
def get_to_classify(included_res: list, excluded_res: list,
                    path="data/embeddings/embeddings_carcinogenesis_transr_64dim.tsv"):
    """Collect the embeddings of every entity that is not a given resource.

    Args:
        included_res: Resources the learning problem lists as included.
        excluded_res: Resources the learning problem lists as excluded.
        path: Embeddings file handed to ``load_embeddings_from_file``.

    Returns:
        dict mapping the fragment of each remaining embedding key (the text
        after its last ``#``; the whole key when it contains none) to that
        key's embedding.
    """
    embeddings = load_embeddings_from_file(embeddings_file=path)
    # Build the lookup once so every membership test is a hash lookup
    # instead of a scan over both resource lists.
    all_given = set(included_res).union(excluded_res)
    return {
        key.split('#')[-1]: embeddings[key]
        for key in embeddings
        if key not in all_given
    }
def __main__(path='data/kg-mini-project-grading.ttl', lps=range(33, 50)):
    """Train one classifier per learning problem and export its predictions.

    For every learning problem number in ``lps`` the given resources are
    extracted from the graph read from ``path``, a ``LinearSVC`` is fitted on
    their SMOTE-resampled features, and every entity returned by
    ``get_to_classify`` is predicted as included (label 1) or excluded
    (label 0). The collected predictions are handed to ``create_rdf``.

    Args:
        path: Turtle file that is parsed for the learning problems.
        lps: Learning problem numbers to process; defaults to 33 through 49.
    """
    g = read(path=path, format='turtle')
    data = {}
    for lp in lps:
        print(f"start lp {lp}")
        included_res, excluded_res = extract_resources(g, lp=lp)
        X_train, y_train = generate_features(included_res, excluded_res)
        # Resample the training set with SMOTE before fitting.
        X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)
        clf = LinearSVC(random_state=1).fit(X_train, y_train)

        embeddings = get_to_classify(included_res, excluded_res)
        pred_included, pred_excluded = [], []
        # predict() rejects an empty sample list, so only call it when there
        # is at least one entity left to classify.
        if embeddings:
            predictions = clf.predict(list(embeddings.values()))
            for name, label in zip(embeddings, predictions):
                if label == 1:
                    pred_included.append(name)
                elif label == 0:
                    pred_excluded.append(name)
        data[lp] = (pred_included, pred_excluded)
        print(f"finished lp {lp}")
    create_rdf(data)
def create_rdf(data, destination='predictions.ttl'):
    """Serialize the predictions of all learning problems as a Turtle file.

    For each learning problem two result nodes are written,
    ``lpres:result_<lp>pos`` and ``lpres:result_<lp>neg``. Each node gets an
    ``lpprop:belongsToLP`` boolean literal ("true" for the pos node, "false"
    for the neg node), an ``lpprop:pertainsTo`` link to ``lpres:lp_<lp>``,
    and one ``lpprop:resource`` triple per predicted resource.

    Args:
        data: Mapping of learning problem number to an
            ``(included, excluded)`` pair of resource names; each name is
            resolved inside the carcinogenesis namespace.
        destination: Path the Turtle serialization is written to.

    Returns:
        The populated ``Graph``.
    """
    g = Graph()

    # Prefixes used in the serialized output.
    CARCINOGENESIS = Namespace("http://dl-learner.org/carcinogenesis#")
    LPRES = Namespace("https://lpbenchgen.org/resource/")
    LPPROP = Namespace("https://lpbenchgen.org/property/")
    g.bind("carcinogenesis", CARCINOGENESIS)
    g.bind("lpres", LPRES)
    g.bind("lpprop", LPPROP)

    for lp, (included, excluded) in data.items():
        # The pos and neg result nodes share one shape; only the node suffix,
        # the boolean flag and the resource list differ.
        for suffix, flag, resources in (("pos", "true", included),
                                        ("neg", "false", excluded)):
            result = LPRES[f"result_{lp}{suffix}"]
            g.add((result, LPPROP.belongsToLP, Literal(flag, datatype=XSD.boolean)))
            g.add((result, LPPROP.pertainsTo, LPRES[f"lp_{lp}"]))
            for res in resources:
                g.add((result, LPPROP.resource, CARCINOGENESIS[res]))

    g.serialize(destination=destination, format='turtle')
    return g
# Unguarded entry point: the pipeline runs with its default arguments
# whenever this module is executed or imported.
__main__()
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment