# run.py
import numpy as np
import pandas as pd
from rdflib import Graph, URIRef
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from ml_utils import balance_data
from pykeen_embeddings import generate_and_save_embeddings, load_embeddings_from_file
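# ml_utils and pykeen_embeddings above appear to be helper modules from this
# project rather than PyPI packages (assumption); pykeen_embeddings presumably
# wraps the PyKEEN library to train and serialize knowledge-graph embeddings.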

def read(path, format):
    """
    Read an RDF graph from a file
    :param path: path to the file
    :param format: serialization format of the file (e.g. 'turtle')
    :return: rdf graph
    """
    g = Graph()
    g.parse(path, format=format)

    return g


def extract_resources(g: Graph, lp: int):
    """
    Extract resources from the graph with respect to the given learning problem
    :param g: rdf graph with data
    :param lp: number of the learning problem
    :return: list with included resources and list with excluded resources
    """
    exclude_res = URIRef('https://lpbenchgen.org/property/excludesResource')
    include_res = URIRef('https://lpbenchgen.org/property/includesResource')
    u = URIRef('https://lpbenchgen.org/resource/lp_%d' % lp)

    included_res = [str(obj) for obj in g.objects(u, include_res)]
    excluded_res = [str(obj) for obj in g.objects(u, exclude_res)]

    return included_res, excluded_res



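# One-off preprocessing (kept commented out): train embeddings for the
# carcinogenesis ontology via the local pykeen_embeddings helper and write them
# to a TSV, which generate_features() below reads back in.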
#g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml')
#generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings_carcinogenesis.tsv")

def generate_features(path="data/embeddings_carcinogenesis.tsv", lp=1):
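    """
    Build a feature matrix and label vector for one learning problem.
    Loads precomputed entity embeddings from `path`, labels every resource listed
    as included (1) or excluded (0) in learning problem `lp`, and drops all
    unlabelled resources.
    :param path: TSV file with resource names and their embedding vectors
    :param lp: number of the learning problem
    :return: feature matrix X (embedding columns) and label vector y
    """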
    # Learning problem definitions: which resources are included/excluded in lp
    g1 = read(path='data/kg-mini-project-train.ttl', format='turtle')
    included_res, excluded_res = extract_resources(g1, lp=lp)

    # Each embedding is stored as a bracketed, comma-separated string; strip the
    # brackets and parse it into a numpy array.
    df = pd.read_csv(path, delimiter='\t', names=['name', 'X'])
    X = df['X'].to_list()
    new_X = []
    for row in X:
        new_X.append(np.fromstring(row[1:-1], sep=','))
    df = pd.concat([df['name'], pd.DataFrame(new_X)], axis=1)

    # Label resources (1 = included, 0 = excluded) and drop everything unlabelled
    df.loc[df['name'].isin(included_res), 'y'] = 1
    df.loc[df['name'].isin(excluded_res), 'y'] = 0
    df = df.dropna()
    return df.iloc[:, 1:-1], df['y']


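# For each learning problem: build features, hold out 30% for testing, balance the
# training set, fit a random forest and report F1, precision and recall.
# The trailing `break` stops after the first problem (lp=8).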
for lp in range(8, 25):
    X, y = generate_features(lp=lp)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    X_train, y_train = balance_data(X_train, y_train, random_state=42)

    clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    prediction = clf.predict(X_test)
    # Print every misclassified test example as (predicted, actual)
    for pred, actual in zip(prediction, y_test):
        if pred != actual:
            print(pred, actual)
    print("F1:", f1_score(y_test, prediction))
    print("precision:", precision_score(y_test, prediction))
    print("recall:", recall_score(y_test, prediction))

    break  # NOTE: stops after the first learning problem (lp=8)