GitLab project: Ajay Umakanth / ExplainGNNWithHighLevelConcepts

Commit 22ddc82d: "Fix issues in test"
Authored 6 months ago by AjUm-HEIDI
Parent: ce33a8b4

3 changed files with 47 additions and 49 deletions:
- config.json (+5 −5)
- main.py (+4 −9)
- text_based_datasets_experiment.py (+38 −35)
config.json (+5 −5)

 {
     "structured": [
         {
-            "datasetName": "BA2Motif"
+            "dataset_name": "BA2Motif"
         },
         {
-            "datasetName": "BAMultiShape"
+            "dataset_name": "BAMultiShape"
         },
         {
-            "datasetName": "MUTAG"
+            "dataset_name": "MUTAG"
         }
     ],
     "text": [
         {
-            "datasetName": "dblp",
+            "dataset_name": "dblp",
             "grouped_keyword_dir": "rawData/dblp/groups",
             "entity_name": "author"
         },
         {
-            "datasetName": "imdb",
+            "dataset_name": "imdb",
             "grouped_keyword_dir": "rawData/imdb/groups",
             "entity_name": "movie"
         }
...
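For orientation: the only change in config.json is the key rename datasetName → dataset_name, mirrored across every entry. A minimal sketch of how main.py consumes these entries after the rename (the file path and the requested name are illustrative, not from the commit):

import json

# Load the dataset registry and pick one entry by its snake_case key.
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

requested = "dblp"  # stand-in for args.dataset
for dataset_type, entries in config.items():
    for entry in entries:
        if entry["dataset_name"] == requested:
            print(f"{requested} is a {dataset_type} dataset: {entry}")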
main.py (+4 −9)

...
@@ -44,7 +44,7 @@ def main():
     # If a specific dataset is specified, filter to only run that one
     if args.dataset:
-        datasets = [d for d in datasets if d['datasetName'] == args.dataset]
+        datasets = [d for d in datasets if d['dataset_name'] == args.dataset]
     for dataset in datasets:
         # Extract additional parameters from configuration or use defaults
...
@@ -56,19 +56,14 @@ def main():
         if dataset_type == "structured":
             structured_datasets_experiment(
-                dataset["datasetName"],
-                iterations=args.iterations,
-                bag_of_words_size=bag_of_words_size,
-                num_groups_list=num_groups_list,
-                create_high_level_concepts_as_boolean=create_high_level_concepts_as_boolean
+                dataset["dataset_name"]
             )
         elif dataset_type == "text":
             text_based_datasets_experiment(
                 dataset["grouped_keyword_dir"],
-                dataset["datasetName"],
+                dataset["dataset_name"],
                 dataset["entity_name"],
-                args.iterations,
                 bag_of_words_size=bag_of_words_size,
+                iterations=args.iterations,
                 num_groups_list=num_groups_list,
                 create_high_level_concepts_as_boolean=create_high_level_concepts_as_boolean
             )
...
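Why this hunk matters: in the removed call, args.iterations rode in the fourth positional slot, while the pre-change signature of the text experiment (see the signature hunk further down) put bag_of_words_size fourth, so the explicit bag_of_words_size keyword collided with it. A self-contained sketch of that failure mode, using a hypothetical stand-in that only borrows the old parameter order from the diff:

def experiment(grouped_keyword_dir, dataset_name, entity_name,
               bag_of_words_size=1000, iterations=5):
    # Hypothetical stand-in; only the parameter order is taken from the diff.
    return iterations

# Old-style call: 5 was meant as iterations but lands on bag_of_words_size,
# and the explicit keyword then collides with it.
try:
    experiment("groups/", "dblp", "author", 5, bag_of_words_size=1000)
except TypeError as e:
    print(e)  # ... got multiple values for argument 'bag_of_words_size'

# Fixed call, as in the new hunk: bind by keyword, order no longer matters.
print(experiment("groups/", "dblp", "author",
                 bag_of_words_size=1000, iterations=5))  # 5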
text_based_datasets_experiment.py (+38 −35)

-from datetime import datetime
+import datetime
 import time
 import json
 import os
...
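The import switch above is not cosmetic: with `from datetime import datetime`, now() is reached as datetime.now(); with `import datetime` it must be datetime.datetime.now(), which is exactly how a later hunk in this file spells it. A two-line illustration:

import datetime
print(datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))  # module-style import

from datetime import datetime as dt
print(dt.now().strftime("%Y%m%d_%H%M%S"))                 # class-style import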
@@ -16,22 +16,23 @@ from GNN.HeterogenousGNN import GNN
 renderer = DLSyntaxObjectRenderer()
 generate_new_owl_file = True

-def run_gnn(structuredDataset: Base, entity_name, datasetName, timeStamp):
+def create_directory(base_path, suffix=""):
+    """Create directory with optional suffix and return the path"""
+    path = os.path.join(base_path, suffix)
+    os.makedirs(path, exist_ok=True)
+    return path
+
+def run_gnn(structuredDataset: Base, entity_name, datasetName, results_dir):
     evaluations = {
         "gnn": {},
         "explanation": {},
         "confusion_matrix": {}
     }

     # Initialize GNN model
     print("Initializing GNN model...")
     model = GNN(structuredDataset.dataset)

     # Train model
     print("Training model...")
-    metrics = model.train_model(epochs=150)
+    metrics = model.train_model(epochs=150, lr=0.01)

     # Save GNN training metrics
     evaluations["gnn"] = metrics[entity_name]
     print("\nBest Training Metrics:")
...
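create_directory is the backbone of the new results layout. A runnable usage sketch (the function body is copied from the hunk above; the timestamp value is illustrative):

import os

def create_directory(base_path, suffix=""):
    """Create directory with optional suffix and return the path"""
    path = os.path.join(base_path, suffix)
    os.makedirs(path, exist_ok=True)  # idempotent: safe on repeated runs
    return path

# Mirrors the nesting used in experiment() further down.
results_dir = create_directory("./evaluation_results", "20250101_120000")
run_dir = create_directory(results_dir, "run_1")
print(run_dir)  # ./evaluation_results/20250101_120000/run_1 (on POSIX)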
@@ -45,8 +46,8 @@ def run_gnn(structuredDataset: Base, entity_name, datasetName, timeStamp):
     print(cm)
     evaluations["confusion_matrix"] = cm.tolist()

-    output_file = f"./evaluation_results/{datasetName}_evaluation_{timeStamp}.json"
-    # Save evaluations to file
+    # Save evaluations to the timestamped results directory
+    output_file = os.path.join(results_dir, f"{datasetName}_evaluation.json")
     with open(output_file, "w") as f:
         json.dump(evaluations, f, indent=4, ensure_ascii=False)
...
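Design note on the hunk above: the run identity moves out of the file name and into the directory. Previously the JSON path encoded the timestamp in the name; now the directory carries it and the file name stays stable, which makes per-run outputs easy to glob and compare. A small sketch (paths illustrative):

import os

results_dir = os.path.join("evaluation_results", "20250101_120000", "run_1")
datasetName = "dblp"

# Stable file name inside a run-specific directory.
output_file = os.path.join(results_dir, f"{datasetName}_evaluation.json")
print(output_file)  # evaluation_results/20250101_120000/run_1/dblp_evaluation.json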
@@ -61,12 +62,11 @@ def load_datasets(dataset_name, bag_of_words_size) -> Base:
 def fetch_high_level_concepts(dataset: Base, num_groups, group_keyword_file):
     return dataset.fetch_themes(num_groups, group_keyword_file)

-def append_to_csv_file(results, filename, dataset_key, num_groups, create_high_level_concepts_as_boolean, write_header=False):
+def append_to_csv_file(results, results_dir, dataset_key, num_groups, create_high_level_concepts_as_boolean, write_header=False):
     """
-    Appends results to a CSV file. Creates the file if it doesn't exist.
-    If `write_header` is True, it will write the header.
+    Appends results to a CSV file in the results directory. Creates the file if it doesn't exist.
     """
-    os.makedirs(os.path.dirname(filename), exist_ok=True)
+    filename = os.path.join(results_dir, f"{dataset_key}_results.csv")
     with open(filename, mode='a', newline='', encoding='utf-8') as csvfile:
         fieldnames = ['Dataset', 'Number of Groups', 'Create High Level Concepts As Boolean', 'Label Name',
...
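Note the removed os.makedirs: after this change the directory's existence is guaranteed by create_directory, so the writer only builds the path. For reference, the append-with-optional-header pattern the function relies on looks like this in isolation (fieldnames abbreviated from the diff; the demo row and directory are made up):

import csv
import os

def append_rows(results_dir, dataset_key, rows, write_header=False):
    # Open in append mode so repeated calls accumulate rows in one file.
    filename = os.path.join(results_dir, f"{dataset_key}_results.csv")
    fieldnames = ["Dataset", "Number of Groups", "Label Name"]
    with open(filename, mode="a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()  # only on the first call per file
        writer.writerows(rows)

os.makedirs("demo_results", exist_ok=True)
append_rows("demo_results", "dblp",
            [{"Dataset": "dblp", "Number of Groups": 5, "Label Name": "author"}],
            write_header=True)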
@@ -141,9 +141,9 @@ def explain_and_evaluate(model, dataset, entity_name, owl_graph_path, high_level
     return all_results

-def summarize_aggregated_results(aggregated_results, summary_filename):
+def summarize_aggregated_results(aggregated_results, results_dir, dataset_name):
     """
-    Summarizes the aggregated results, including for each label and number of groups:
+    Summarizes the aggregated results in the results directory
     - Best Hypothesis
     - Best F1 Score
     - Average F1 Score
...
@@ -153,7 +153,7 @@ def summarize_aggregated_results(aggregated_results, summary_filename):
     - Average Length
     - Average Explain Time
     """
-    os.makedirs(os.path.dirname(summary_filename), exist_ok=True)
+    summary_filename = os.path.join(results_dir, f"{dataset_name}_summary.csv")
     with open(summary_filename, mode="w", newline="", encoding="utf-8") as csvfile:
         fieldnames = [
...
@@ -185,7 +185,7 @@ def summarize_aggregated_results(aggregated_results, summary_filename):
     print(f"Summary results saved to {summary_filename}")

-def experiment(grouped_keyword_dir, dataset_name, entity_name, bag_of_words_size=1000, iterations=5, num_groups_list=[0, 5, 10, 15, 20, 25], create_high_level_concepts_as_boolean=False):
+def experiment(grouped_keyword_dir, dataset_name, entity_name, iterations=5, num_groups_list=[0, 5, 10, 15, 20, 25], create_high_level_concepts_as_boolean=True, bag_of_words_size=1000):
     """
     Handles dataset loading and evaluation for experiments.
     Manages the experiment based on specified number of groups and boolean concept creation settings.
...
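This signature reorder is the other half of the fix seen in main.py: bag_of_words_size moves from the fourth slot to the end, so a caller's fourth positional argument now lands on iterations, and the default for create_high_level_concepts_as_boolean flips from False to True. A sketch showing where a fourth positional argument binds under each signature (the stand-in functions carry only the parameter lists taken from the diff):

import inspect

def experiment_old(grouped_keyword_dir, dataset_name, entity_name,
                   bag_of_words_size=1000, iterations=5,
                   num_groups_list=[0, 5, 10, 15, 20, 25],
                   create_high_level_concepts_as_boolean=False):
    pass

def experiment_new(grouped_keyword_dir, dataset_name, entity_name,
                   iterations=5, num_groups_list=[0, 5, 10, 15, 20, 25],
                   create_high_level_concepts_as_boolean=True,
                   bag_of_words_size=1000):
    pass

for fn in (experiment_old, experiment_new):
    bound = inspect.signature(fn).bind("groups/", "dblp", "author", 7)
    print(fn.__name__, dict(bound.arguments))
# experiment_old binds 7 to bag_of_words_size; experiment_new binds it to iterations.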
@@ -193,22 +193,25 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, bag_of_words_size
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"CUDA is {'available. Using GPU.' if device.type == 'cuda' else 'not available. Using CPU.'}")

-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    summary_filename = f"./evaluation_results/{dataset_name}_summary_{timestamp}.csv"
+    # Create timestamp directory for all results
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+    results_dir = create_directory("./evaluation_results", timestamp)

     aggregated_results = {}
     for run in range(1, iterations + 1):
         print(f"\nStarting Experiment Iteration {run}/{iterations}")
-        run_timestamp = f"{timestamp}_run_{run}"
-        run_csv_filename = f"./evaluation_results/{dataset_name}_evaluation_{run_timestamp}.csv"
+        # Create run-specific directory
+        run_dir = create_directory(results_dir, f"run_{run}")

         dataset = load_datasets(dataset_name=dataset_name, bag_of_words_size=bag_of_words_size)
-        model = run_gnn(dataset, entity_name, dataset_name, run_timestamp)
+        model = run_gnn(dataset, entity_name, dataset_name, run_dir)
         write_header = True
         for num_groups in num_groups_list:
             group_keyword_file = "" if num_groups == 0 else os.path.join(grouped_keyword_dir, f'groupedKeywords_{num_groups}.json')
-            owl_graph_path = f'./owlGraphs/{dataset_name}_{run_timestamp}_{num_groups}_groups_{"bool" if create_high_level_concepts_as_boolean else "data"}.owl'
+            owl_graph_path = os.path.join(run_dir, f"{num_groups}_groups_{'bool' if create_high_level_concepts_as_boolean else 'data'}.owl")
             print("\n" + "=" * 50)
             print(f"Running experiment {run} with create_high_level_concepts_as_boolean={create_high_level_concepts_as_boolean} and num_groups={num_groups}")
...
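Putting the path hunks together, one invocation now produces a single timestamped tree instead of timestamp-suffixed files scattered across ./evaluation_results and ./owlGraphs (layout inferred from the calls in this diff; the timestamp is illustrative):

evaluation_results/
└── 20250101_120000/              # results_dir, one per experiment() call
    ├── dblp_summary.csv          # summarize_aggregated_results(...)
    ├── run_1/                    # run_dir, one per iteration
    │   ├── dblp_evaluation.json  # run_gnn(...)
    │   ├── dblp_results.csv      # append_to_csv_file(...)
    │   └── 5_groups_bool.owl     # owl_graph_path for num_groups=5
    └── run_2/
        └── ...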
@@ -220,16 +223,16 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, bag_of_words_size
                 model, dataset.dataset, entity_name, owl_graph_path, high_level_concepts, create_high_level_concepts_as_boolean)
-            append_to_csv_file(results, run_csv_filename, dataset_name, num_groups, create_high_level_concepts_as_boolean, write_header=write_header)
+            append_to_csv_file(results, run_dir, dataset_name, num_groups, create_high_level_concepts_as_boolean, write_header=write_header)
             for label, data in results.items():
                 # Initialize aggregation for this label and number of groups if not yet present
                 key = (label, num_groups)
                 if key not in aggregated_results:
                     aggregated_results[key] = {
                         "label_name": data["label_name"],
                         "best_hypothesis": data["hypothesis"],
                         "best_F1": data["evaluation"]["F1"],
                         "best_accuracy": data["evaluation"]["Accuracy"],
                         "length_at_best_f1": data["length"],
                         "all_f1_scores": [],
                         "all_accuracies": [],
...
@@ -237,7 +240,6 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, bag_of_words_size
                         "all_times": []
                     }
                 aggregated_data = aggregated_results[key]

                 # If the current F1 is better than the stored best, update best values
                 if data["evaluation"]["F1"] > aggregated_data["best_F1"]:
                     aggregated_data["best_F1"] = data["evaluation"]["F1"]
                     aggregated_data["best_hypothesis"] = data["hypothesis"]
...
@@ -249,8 +251,9 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, bag_of_words_size
         write_header = False

-    summarize_aggregated_results(aggregated_results, summary_filename)
-    print(f"\nExperiments completed. Summary saved to {summary_filename}")
+    # Save summary to main results directory
+    summarize_aggregated_results(aggregated_results, results_dir, dataset_name)
+    print(f"\nExperiments completed. Results saved in {results_dir}")

 def main():
     datasets = [
...
@@ -272,7 +275,7 @@ def main():
             dataset["grouped_keyword_dir"],
             dataset["dataset_name"],
             dataset["entity_name"],
-            iterations=5  # Repeat experiment 5 times
+            iterations=5
         )

 if __name__ == "__main__":
...