Commit 7a9129c3 authored by Sven Meyer

Added setup and some comments

parent f2ff8ac7
# Classification of carcinogenesis
# Installation
## Dependencies
The classifier requires:
* Python (>=3.8)
* NumPy (>=1.19.3)
* pandas (>=1.1.5)
* torch (>=1.8.1+cu111)
* torch_geometric (>=1.7.0)
* scikit-learn (>=0.24.1)
* rdflib (>=5.0.0)
* pykeen (>=1.4.0)
## User installation
Use the package installer [pip](https://pip.pypa.io/en/stable/) to install the package together with all required dependencies. Navigate to this directory and execute the following command:
```bash
pip install .
```
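For development, pip's editable mode (`pip install -e .`) installs the same dependencies while leaving the sources in place.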
# Usage
Nothing to see here...
@@ -7,8 +7,28 @@ from torch_geometric.nn import GCNConv
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, num_features, num_classes):
        """Graph Convolutional Network

        Neural network for learning on and predicting graph data. Most parts
        were copied from the PyTorch Geometric documentation. See
        https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html
        for more information.

        Parameters
        ----------
        hidden_channels : int
            Number of hidden channels.
        num_features : int
            Number of features. This number is equivalent to the dimension
            of the embeddings.
        num_classes : int
            Number of classes. For carcinogenesis, there are two classes,
            0 and 1.
        """
        super(GNN, self).__init__()
        torch.manual_seed(0)
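The hunk shows only the lines this commit touched; the model's layers and forward pass are elided. As a reference, a minimal two-layer GCN in the style of the linked PyTorch Geometric introduction might look as follows (the layer layout is an assumption, not necessarily this file's exact contents):

```python
# Sketch of a minimal two-layer GCN, following the PyTorch Geometric
# introduction; hypothetical, not the verified contents of this file.
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, num_features, num_classes):
        super(GNN, self).__init__()
        torch.manual_seed(0)
        # Map node features to a hidden representation, then to class scores.
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)   # first graph convolution
        x = F.relu(x)                   # non-linearity between layers
        x = self.conv2(x, edge_index)   # per-node class scores
        return x
```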
{
"cells": [
{
"cell_type": "code",
@@ -69,10 +45,21 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[-0.3225, -0.0639, 0.2360, ..., 0.3762, 0.1285, 0.5810],\n [ 0.1276, 0.3800, 0.6500, ..., -0.2002, -0.3369, 0.0453],\n [-0.2063, 0.9173, 0.0640, ..., -0.0835, 0.1432, 0.1977],\n ...,\n [-0.0714, -0.0246, 0.5191, ..., -0.2374, -0.2465, -0.1062],\n [ 0.2783, 0.2671, -0.0616, ..., 0.6209, -0.5399, 0.0011],\n [-0.6664, 0.2597, 0.2532, ..., 0.0654, 0.0716, -0.2881]]) 23644\ntensor([0, 0, 0, ..., 0, 0, 0]) 23644\ntensor([[20088, 12265, 5861, ..., 11451, 5305, 4570],\n [20106, 12610, 1118, ..., 11462, 15300, 1115]]) 2\n[False True False ... True True True] 23644\n[ True False True ... False False False] 23644\n"
"tensor([[-0.3225, -0.0639, 0.2360, ..., 0.3762, 0.1285, 0.5810],\n",
" [ 0.1276, 0.3800, 0.6500, ..., -0.2002, -0.3369, 0.0453],\n",
" [-0.2063, 0.9173, 0.0640, ..., -0.0835, 0.1432, 0.1977],\n",
" ...,\n",
" [-0.0714, -0.0246, 0.5191, ..., -0.2374, -0.2465, -0.1062],\n",
" [ 0.2783, 0.2671, -0.0616, ..., 0.6209, -0.5399, 0.0011],\n",
" [-0.6664, 0.2597, 0.2532, ..., 0.0654, 0.0716, -0.2881]]) 23644\n",
"tensor([0, 0, 0, ..., 0, 0, 0]) 23644\n",
"tensor([[20088, 12265, 5861, ..., 11451, 5305, 4570],\n",
" [20106, 12610, 1118, ..., 11462, 15300, 1115]]) 2\n",
"[False True False ... True True True] 23644\n",
"[ True False True ... False False False] 23644\n"
]
}
],
@@ -137,8 +124,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch: 0 Loss: tensor(0.6919, grad_fn=<NllLossBackward>)\n",
"Epoch: 100 Loss: tensor(0.3183, grad_fn=<NllLossBackward>)\n",
@@ -150,10 +137,10 @@
}
],
"source": [
"for epoch in range(0, 501):\n",
" loss = train()\n",
"\n",
" if epoch % 100 == 0:\n",
"for epoch in range(0, 201):\r\n",
" loss = train()\r\n",
"\r\n",
" if epoch % 100 == 0:\r\n",
" print('Epoch:', epoch, 'Loss:', loss)"
]
},
@@ -163,8 +150,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"train mask 11822\n",
"test mask 11822\n",
@@ -183,5 +170,29 @@
"print('Test F1-Score:', test_score)\n"
]
}
],
"metadata": {
"interpreter": {
"hash": "12409c8b02b783cc5653524b9646dd8bb5df877da98cdcbaea2438b4df38f843"
},
"kernelspec": {
"display_name": "Python 3.8.6 64-bit",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
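The notebook excerpts call a `train()` function and print F1-scores, but neither is shown in full. Under the assumption that the usual PyTorch Geometric node-classification recipe is followed (the optimizer, learning rate, and exact variable names below are guesses, not the author's verified code), the missing pieces could look roughly like this:

```python
# Hypothetical reconstruction of the notebook's train() and evaluation
# steps; assumes model, data and the numpy masks exist as in the cells above.
import torch
from sklearn.metrics import f1_score

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
train_mask_t = torch.from_numpy(train_mask)   # boolean numpy -> torch mask
test_mask_t = torch.from_numpy(test_mask)

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    # Optimize only on the training nodes; the grad_fn in the logged loss
    # (<NllLossBackward>) is consistent with a cross-entropy objective.
    loss = criterion(out[train_mask_t], data.y[train_mask_t])
    loss.backward()
    optimizer.step()
    return loss

# Evaluation: predicted class per node, F1 on the held-out test nodes.
model.eval()
with torch.no_grad():
    pred = model(data.x, data.edge_index).argmax(dim=1)
test_score = f1_score(data.y[test_mask_t].numpy(), pred[test_mask_t].numpy())
print('Test F1-Score:', test_score)
```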
[metadata]
name = fokg-gnn
description = Classifying carcinogenesis
long_description = file: README.md
long_description_content_type = text/markdown
classifiers = Programming Language :: Python :: 3.8
[options]
include_package_data = True
packages = find:
install_requires =
    numpy>=1.19.3
    pandas>=1.1.5
    torch>=1.8.1+cu111
    torch_geometric>=1.7.0
    scikit-learn>=0.24.1
    rdflib>=5.0.0
    pykeen>=1.4.0
from setuptools import setup
setup()
@@ -8,73 +8,163 @@ from sklearn.model_selection import train_test_split
class Transformer:
    def __init__(self, embeddings_file):
        """Transformer

        Class which provides functions to read a file containing embeddings,
        transform these embeddings into a dataset, and split the data into
        training data and test data. All resources are stored in a dict
        which maps each individual to a unique id.

        Parameters
        ----------
        embeddings_file : str
            Path to the file containing the embeddings
        """
        self.embeddings = load_embeddings_from_file(embeddings_file)
        self.individual_ids = dict()

        id = 0
        for key in self.embeddings.keys():
            self.individual_ids[key] = id
            id += 1

    def transform_embeddings_to_dataset(self):
        """Transform embeddings into a dataset.

        Transform the embeddings from the constructor into a 2-dimensional
        dataset. Each row corresponds to the id of an individual.

        Returns
        -------
        dataset : torch.tensor of shape (n_embeddings, n_dimension)
            Tensor describing the dataset of the embeddings
        """
        dataset = list()
        for value in self.embeddings.values():
            dataset.append(value)
        return tensor(dataset)

    def extract_edges_from_triples(self, triple_file):
        """Extract the edges of the graph.

        Extract the edges of the graph from a given file, using the ids of
        the individuals; corresponding positions of the ids form the edges.

        Parameters
        ----------
        triple_file : str
            File which contains all triples (s, p, o) of the graph. An edge
            is given by (s, o).

        Returns
        -------
        edges : torch.tensor of shape (2, n_edges)
            Tensor containing two lists of ids of individuals. An edge is
            given by the corresponding positions. For example,
            tensor([[1, 2, 3], [4, 5, 6]]) contains an edge between the
            individuals 1 and 4, an edge between 2 and 5, and an edge
            between 3 and 6.
        """
        edges = list()
        triples = pd.read_csv(triple_file, delimiter=' ', names=['subject', 'predicate', 'object'], header=None)
        for _, row in triples.iterrows():
            subject_id = self.individual_ids[row['subject']]
            object_id = self.individual_ids[row['object']]
            edges.append([subject_id, object_id])
        return tensor(edges).t()

    def extract_labels_from_lp(self, lp_file, lp_id):
        """Extract the labels from a specific learning problem.

        Extract the labels from a learning problem. Resources listed as
        included get the label 1; resources listed as excluded, as well as
        unlisted resources, get the label 0.

        Parameters
        ----------
        lp_file : str
            File containing the learning problems
        lp_id : int
            Id of the learning problem.

        Returns
        -------
        labels : torch.tensor of shape (n_embeddings, )
            1-dimensional tensor containing the labels of all individuals.
        """
        graph = read(lp_file, format='turtle')
        included, excluded = extract_resources(graph, lp_id)
        num_individuals = len(self.individual_ids)
        labels = np.zeros(num_individuals, dtype=int)
        for resource in included:
            individual_id = self.individual_ids[resource.n3()[1:-1]]
            labels[individual_id] = 1
        return tensor(labels.tolist())

    def get_train_test_mask(self, y, random_state=None):
        """Get a train-test mask.

        Split the dataset into train data and test data, stratified by the
        labels of the individuals.

        Parameters
        ----------
        y : torch.tensor
            1-dimensional tensor containing the labels of all individuals.
        random_state : int, default: None
            Random state for reproducibility of results.

        Returns
        -------
        train_mask : nd-array of shape (n_embeddings, )
            Boolean array, where True means that the corresponding
            individual is part of the train data.
        test_mask : nd-array of shape (n_embeddings, )
            Boolean array, where True means that the corresponding
            individual is part of the test data.
        """
        num_individuals = len(self.individual_ids)
        ids = np.arange(num_individuals)
        assert len(ids) == len(y)
        x_train, x_test, y_train, y_test = train_test_split(ids, y, train_size=0.5, random_state=random_state, stratify=y)
        train_mask = np.full(num_individuals, False)
        for id in x_train:
            train_mask[id] = True
        test_mask = np.full(num_individuals, False)
        for id in x_test:
            test_mask[id] = True
        assert (train_mask == test_mask).sum() == 0
        return train_mask, test_mask
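Putting the class to use: a minimal, hypothetical wiring of Transformer, GNN, and torch_geometric's Data container. All file names and hyperparameters here are illustrative assumptions, not paths or settings taken from this repository:

```python
# Hypothetical end-to-end usage; file paths and parameters are placeholders.
from torch_geometric.data import Data

transformer = Transformer('embeddings.tsv')            # assumed path
x = transformer.transform_embeddings_to_dataset()      # (n_individuals, dim)
edge_index = transformer.extract_edges_from_triples('triples.nt')
y = transformer.extract_labels_from_lp('lps.ttl', lp_id=1)
train_mask, test_mask = transformer.get_train_test_mask(y, random_state=0)

data = Data(x=x, edge_index=edge_index, y=y)
model = GNN(hidden_channels=16, num_features=x.shape[1], num_classes=2)
```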