Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Lukas Gehring
LSM
Commits
8a030755
Commit
8a030755
authored
Jun 24, 2021
by
lgehring
Browse files
Update run.py
parent
6dd39cf0
Changes
1
Show whitespace changes
Inline
Side-by-side
run.py
View file @
8a030755
from
rdflib
import
Graph
,
URIRef
from
sklearn.ensemble
import
RandomForestClassifier
from
pykeen_embeddings
import
generate_and_save_embeddings
,
load_embeddings_from_file
import
pandas
as
pd
import
numpy
as
np
from
sklearn.model_selection
import
train_test_split
from
ml_utils
import
balance_data
from
sklearn.linear_model
import
LogisticRegression
from
sklearn.metrics
import
f1_score
,
precision_score
,
recall_score
def
read
(
path
,
format
):
"""
...
...
@@ -27,17 +35,50 @@ def extract_resources(g: Graph, lp: int):
included_res
=
[]
excluded_res
=
[]
for
i
,
obj
in
enumerate
(
g
.
objects
(
u
,
include_res
)):
included_res
.
append
(
obj
)
included_res
.
append
(
str
(
obj
)
)
for
i
,
obj
in
enumerate
(
g
.
objects
(
u
,
exclude_res
)):
excluded_res
.
append
(
obj
)
excluded_res
.
append
(
str
(
obj
)
)
return
included_res
,
excluded_res
# Parse the two input graphs: the training file is read as Turtle, the
# carcinogenesis ontology as RDF/XML.
g1 = read(
    path='data/kg-mini-project-train.ttl',
    format='turtle',
)
g2 = read(
    path='data/carcinogenesis.owl',
    format='application/rdf+xml',
)

# Hand the ontology graph and the two TSV paths to the embedding generator.
# NOTE(review): presumably the first path receives the exported triples and
# the second the trained embeddings -- confirm in pykeen_embeddings.
generate_and_save_embeddings(
    g2,
    "data/carcinogenesis.tsv",
    "data/embeddings_carcinogenesis.tsv",
)
#for lp in range(1, 26):
# print(extract_resources(g1, lp))
#g2 = read(path='data/carcinogenesis.owl', format='application/rdf+xml')
#generate_and_save_embeddings(g2, "data/carcinogenesis.tsv", "data/embeddings_carcinogenesis.tsv")
def generate_features(path="data/embeddings_carcinogenesis.tsv", lp=1,
                      train_path='data/kg-mini-project-train.ttl'):
    """Build a labelled feature matrix for one learning problem.

    The embeddings file is read as a header-less, tab-separated table with
    two columns: a resource name and a textual vector. Each vector string
    has its first and last character removed and the remainder is parsed as
    comma-separated numbers (presumably a bracketed list such as
    "[0.1, 0.2]" -- confirm against the writer of the file).

    Rows whose name is among the included resources of learning problem
    ``lp`` are labelled 1, rows among the excluded resources are labelled 0
    (the exclusion is applied last, so it wins if a name is in both lists).
    Every row that still contains a missing value afterwards -- in
    particular every unlabelled row -- is dropped.

    Args:
        path: Location of the tab-separated embeddings file.
        lp: Learning-problem number forwarded to ``extract_resources``.
        train_path: Location of the Turtle training graph that holds the
            learning problems. Defaults to the path that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        A ``(features, labels)`` pair: ``features`` is the DataFrame of
        embedding columns (everything except the leading ``name`` column and
        the trailing ``y`` column) and ``labels`` is the ``y`` Series.
    """
    g1 = read(path=train_path, format='turtle')
    included_res, excluded_res = extract_resources(g1, lp=lp)

    df = pd.read_csv(path, delimiter='\t', names=['name', 'X'])

    # Turn every textual vector into a numeric array; row[1:-1] strips the
    # enclosing first/last character before parsing.
    vectors = [np.fromstring(row[1:-1], sep=',') for row in df['X'].to_list()]

    # Replace the single text column by one numeric column per dimension.
    df = pd.concat([df['name'], pd.DataFrame(vectors)], axis=1)

    # Assigning through .loc creates the 'y' column; rows matched by neither
    # list stay NaN and are removed by dropna() below.
    df.loc[df['name'].isin(included_res), 'y'] = 1
    df.loc[df['name'].isin(excluded_res), 'y'] = 0
    df = df.dropna()

    # Column 0 is 'name' and the last column is 'y'; everything in between
    # is the embedding.
    return df.iloc[:, 1:-1], df['y']
# Train and evaluate a random forest per learning problem. The trailing
# ``break`` ends the loop after the first iteration, so only lp == 8 runs.
for lp in range(8, 25):
    X, y = generate_features(lp=lp)

    # 70/30 split with a fixed seed so runs are repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
    )
    # Only the training portion is passed through balance_data.
    X_train, y_train = balance_data(X_train, y_train, random_state=42)

    clf = RandomForestClassifier(random_state=42)
    clf = clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)

    # Print every (predicted, actual) pair that disagrees.
    for predicted, actual in zip(prediction, y_test):
        if predicted == actual:
            continue
        print(predicted, actual)

    # Report the three scores in the same order as before.
    for label, metric in (
        ("F1:", f1_score),
        ("precision:", precision_score),
        ("recall:", recall_score),
    ):
        print(label, metric(y_test, prediction))

    break
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment