Commit 8a030755 authored by lgehring
Browse files

Update run.py

parent 6dd39cf0
from rdflib import Graph, URIRef
from sklearn.ensemble import RandomForestClassifier
from pykeen_embeddings import generate_and_save_embeddings, load_embeddings_from_file
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from ml_utils import balance_data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
def read(path, format):
"""
......@@ -27,17 +35,50 @@ def extract_resources(g: Graph, lp: int):
included_res = []
excluded_res = []
for i, obj in enumerate(g.objects(u, include_res)):
included_res.append(obj)
included_res.append(str(obj))
for i, obj in enumerate(g.objects(u, exclude_res)):
excluded_res.append(obj)
excluded_res.append(str(obj))
return included_res, excluded_res
# Load the training graph (Turtle) and the carcinogenesis ontology (RDF/XML)
# through the file's own `read` helper.
g1 = read(path='data/kg-mini-project-train.ttl', format='turtle')
g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml')
# Project helper from pykeen_embeddings; judging by its name it trains
# embeddings for g2 and writes them to the given TSV paths — confirm.
generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings_carcinogenesis.tsv")
# NOTE(review): the commented-out lines below repeat the g2 / embedding
# statements above. This looks like the old and new side of a diff shown
# together — confirm which variant is the current one before relying on it.
#for lp in range(1, 26):
# print(extract_resources(g1, lp))
#g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml')
#generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings_carcinogenesis.tsv")
def generate_features(path="data/embeddings_carcinogenesis.tsv", lp=1,
                      train_path='data/kg-mini-project-train.ttl'):
    """Build a labelled feature matrix for one learning problem.

    Reads the training graph, asks ``extract_resources`` for the included
    and excluded resource names of learning problem ``lp``, and joins those
    labels onto the embedding vectors stored in the TSV file at ``path``.

    Args:
        path: Tab-separated file with two columns and no header row: a
            resource name and its embedding vector serialised as text.
        lp: Learning-problem number passed through to ``extract_resources``.
        train_path: Turtle file holding the learning problems. Defaults to
            the location that was previously hard-coded, so existing calls
            behave as before.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a DataFrame with one column per
        embedding component, ``y`` is a Series holding 1 for included and
        0 for excluded resources. Rows whose name is in neither list (or
        that contain any missing value) are dropped.
    """
    graph = read(path=train_path, format='turtle')
    included_res, excluded_res = extract_resources(graph, lp=lp)

    df = pd.read_csv(path, delimiter='\t', names=['name', 'X'])
    # Each 'X' cell is a comma-separated vector; the first and last
    # character (presumably the surrounding brackets) are stripped first.
    vectors = [np.fromstring(row[1:-1], sep=',') for row in df['X']]
    df = pd.concat([df['name'], pd.DataFrame(vectors)], axis=1)

    # Create the label column up front so it exists (all NaN) even when no
    # name matches, instead of relying on a masked assignment to create it.
    df['y'] = np.nan
    df.loc[df['name'].isin(included_res), 'y'] = 1
    # Assigned second, so a name present in both lists ends up labelled 0.
    df.loc[df['name'].isin(excluded_res), 'y'] = 0

    # Unlabelled rows still carry NaN in 'y' and are removed here.
    df = df.dropna()
    # Column 0 is 'name' and the last column is 'y'; the rest are features.
    return df.iloc[:, 1:-1], df['y']
# Train and evaluate a random forest per learning problem, iterating lp over
# 8..24 (the range end is exclusive).
# NOTE(review): indentation is lost in this view, so the nesting of the
# statements below — in particular which loop the final `break` exits —
# cannot be confirmed from here.
for lp in range(8, 25):
X, y = generate_features(lp=lp)
# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Only the training split goes through balance_data (project helper from
# ml_utils; presumably evens out the class distribution — confirm).
X_train, y_train = balance_data(X_train, y_train, random_state=42)
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
prediction = clf.predict(X_test)
# Print every (predicted, actual) pair that disagrees.
for a, b in zip(prediction, y_test):
if a != b:
print(a,b)
# Metrics are computed on the held-out split.
print("F1:", f1_score(y_test, prediction))
print("precision:", precision_score(y_test, prediction))
print("recall:", recall_score(y_test, prediction))
# NOTE(review): if this `break` belongs to the outer loop, only lp=8 is
# ever evaluated — confirm whether that is intended.
break
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment