Commit 22ddc82d authored by AjUm-HEIDI

Fix issues in test

parent ce33a8b4
 {
     "structured": [
         {
-            "datasetName": "BA2Motif"
+            "dataset_name": "BA2Motif"
         },
         {
-            "datasetName": "BAMultiShape"
+            "dataset_name": "BAMultiShape"
         },
         {
-            "datasetName": "MUTAG"
+            "dataset_name": "MUTAG"
         }
     ],
     "text": [
         {
-            "datasetName": "dblp",
+            "dataset_name": "dblp",
             "grouped_keyword_dir": "rawData/dblp/groups",
             "entity_name": "author"
         },
         {
-            "datasetName": "imdb",
+            "dataset_name": "imdb",
             "grouped_keyword_dir": "rawData/imdb/groups",
             "entity_name": "movie"
         }
......
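The renamed dataset_name key is what the runner script looks up when it loads this configuration. The loading code itself is outside this diff, so the following is only a minimal sketch, assuming the config is stored in a file such as datasets.json and read with json.load (both the filename and the flattening step are assumptions):

import json

# Hypothetical loader: the real config path and loading code are not shown in this commit.
with open("datasets.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Tag each entry with its section so the dispatch in main() below can branch on dataset_type.
datasets = [
    {**entry, "dataset_type": dataset_type}
    for dataset_type, entries in config.items()
    for entry in entries
]

for dataset in datasets:
    print(dataset["dataset_type"], dataset["dataset_name"])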
@@ -44,7 +44,7 @@ def main():
     # If a specific dataset is specified, filter to only run that one
     if args.dataset:
-        datasets = [d for d in datasets if d['datasetName'] == args.dataset]
+        datasets = [d for d in datasets if d['dataset_name'] == args.dataset]
     for dataset in datasets:
         # Extract additional parameters from configuration or use defaults
@@ -56,19 +56,14 @@ def main():
         if dataset_type == "structured":
             structured_datasets_experiment(
-                dataset["datasetName"],
-                iterations=args.iterations,
-                bag_of_words_size=bag_of_words_size,
-                num_groups_list=num_groups_list,
-                create_high_level_concepts_as_boolean=create_high_level_concepts_as_boolean
+                dataset["dataset_name"]
             )
         elif dataset_type == "text":
             text_based_datasets_experiment(
                 dataset["grouped_keyword_dir"],
-                dataset["datasetName"],
+                dataset["dataset_name"],
                 dataset["entity_name"],
-                args.iterations,
-                bag_of_words_size=bag_of_words_size,
+                iterations=args.iterations,
                 num_groups_list=num_groups_list,
                 create_high_level_concepts_as_boolean=create_high_level_concepts_as_boolean
             )
......
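main() refers to args.dataset and args.iterations, which suggests an argparse command-line interface; the parser itself is not part of this diff, so the sketch below is only an assumption of what it might look like, with flag names inferred from the attribute names:

import argparse

# Hypothetical parser: flag names and defaults are inferred, not taken from the repository.
parser = argparse.ArgumentParser(description="Run GNN explanation experiments")
parser.add_argument("--dataset", type=str, default=None,
                    help="Run only the dataset whose dataset_name matches this value")
parser.add_argument("--iterations", type=int, default=5,
                    help="Number of times each experiment is repeated")
args = parser.parse_args()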
-from datetime import datetime
+import datetime
 import time
 import json
 import os
@@ -16,22 +16,23 @@ from GNN.HeterogenousGNN import GNN
 renderer = DLSyntaxObjectRenderer()
 generate_new_owl_file = True
-def run_gnn(structuredDataset: Base, entity_name, datasetName, timeStamp):
+def create_directory(base_path, suffix=""):
+    """ Create directory with optional suffix and return the path """
+    path = os.path.join(base_path, suffix)
+    os.makedirs(path, exist_ok=True)
+    return path
+
+def run_gnn(structuredDataset: Base, entity_name, datasetName, results_dir):
     evaluations = {
         "gnn": {},
         "explanation": {},
         "confusion_matrix": {}
     }
     # Initialize GNN model
     print("Initializing GNN model...")
     model = GNN(structuredDataset.dataset)
     # Train model
     print("Training model...")
-    metrics = model.train_model(epochs=150)
-    # Save GNN training metrics
+    metrics = model.train_model(epochs=150, lr=0.01)
     evaluations["gnn"] = metrics[entity_name]
     print("\nBest Training Metrics:")
@@ -45,8 +46,8 @@ def run_gnn(structuredDataset: Base, entity_name, datasetName, timeStamp):
     print(cm)
     evaluations["confusion_matrix"] = cm.tolist()
-    output_file = f"./evaluation_results/{datasetName}_evaluation_{timeStamp}.json"
-    # Save evaluations to file
+    # Save evaluations to the timestamped results directory
+    output_file = os.path.join(results_dir, f"{datasetName}_evaluation.json")
     with open(output_file, "w") as f:
         json.dump(evaluations, f, indent=4, ensure_ascii=False)
@@ -61,12 +62,11 @@ def load_datasets(dataset_name, bag_of_words_size) -> Base:
 def fetch_high_level_concepts(dataset: Base, num_groups, group_keyword_file):
     return dataset.fetch_themes(num_groups, group_keyword_file)
-def append_to_csv_file(results, filename, dataset_key, num_groups, create_high_level_concepts_as_boolean, write_header=False):
+def append_to_csv_file(results, results_dir, dataset_key, num_groups, create_high_level_concepts_as_boolean, write_header=False):
     """
-    Appends results to a CSV file. Creates the file if it doesn't exist.
-    If `write_header` is True, it will write the header.
+    Appends results to a CSV file in the results directory. Creates the file if it doesn't exist.
     """
-    os.makedirs(os.path.dirname(filename), exist_ok=True)
+    filename = os.path.join(results_dir, f"{dataset_key}_results.csv")
     with open(filename, mode='a', newline='', encoding='utf-8') as csvfile:
         fieldnames = ['Dataset', 'Number of Groups', 'Create High Level Concepts As Boolean', 'Label Name',
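The rest of append_to_csv_file is collapsed at the end of this hunk, and the fieldnames list is cut off mid-line. A plausible completion using csv.DictWriter is sketched below; the column set and row contents here are assumptions (the real file clearly has more fields), only the path construction and append-with-optional-header behaviour come from the diff:

import csv
import os

def append_to_csv_file(results, results_dir, dataset_key, num_groups,
                       create_high_level_concepts_as_boolean, write_header=False):
    """Append one row per label to <dataset_key>_results.csv inside results_dir (sketch)."""
    filename = os.path.join(results_dir, f"{dataset_key}_results.csv")
    with open(filename, mode='a', newline='', encoding='utf-8') as csvfile:
        # Assumed, truncated column set; the committed code lists additional fields.
        fieldnames = ['Dataset', 'Number of Groups',
                      'Create High Level Concepts As Boolean', 'Label Name']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        for label, data in results.items():
            writer.writerow({
                'Dataset': dataset_key,
                'Number of Groups': num_groups,
                'Create High Level Concepts As Boolean': create_high_level_concepts_as_boolean,
                'Label Name': data.get("label_name", label),
            })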
@@ -141,9 +141,9 @@ def explain_and_evaluate(model, dataset, entity_name, owl_graph_path, high_level
     return all_results
-def summarize_aggregated_results(aggregated_results, summary_filename):
+def summarize_aggregated_results(aggregated_results, results_dir, dataset_name):
     """
-    Summarizes the aggregated results, including for each label and number of groups:
+    Summarizes the aggregated results in the results directory
     - Best Hypothesis
     - Best F1 Score
     - Average F1 Score
@@ -153,7 +153,7 @@ def summarize_aggregated_results(aggregated_results, summary_filename):
     - Average Length
     - Average Explain Time
     """
-    os.makedirs(os.path.dirname(summary_filename), exist_ok=True)
+    summary_filename = os.path.join(results_dir, f"{dataset_name}_summary.csv")
     with open(summary_filename, mode="w", newline="", encoding="utf-8") as csvfile:
         fieldnames = [
@@ -185,7 +185,7 @@ def summarize_aggregated_results(aggregated_results, summary_filename):
     print(f"Summary results saved to {summary_filename}")
-def experiment(grouped_keyword_dir, dataset_name, entity_name, bag_of_words_size=1000, iterations=5, num_groups_list=[0, 5, 10, 15, 20, 25], create_high_level_concepts_as_boolean=False):
+def experiment(grouped_keyword_dir, dataset_name, entity_name, iterations=5, num_groups_list=[0, 5, 10, 15, 20, 25], create_high_level_concepts_as_boolean=True, bag_of_words_size=1000):
     """
     Handles dataset loading and evaluation for experiments.
     Manages the experiment based on specified number of groups and boolean concept creation settings.
@@ -193,22 +193,25 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, bag_of_words_size
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"CUDA is {'available. Using GPU.' if device.type == 'cuda' else 'not available. Using CPU.'}")
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    summary_filename = f"./evaluation_results/{dataset_name}_summary_{timestamp}.csv"
+    # Create timestamp directory for all results
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+    results_dir = create_directory("./evaluation_results", timestamp)
     aggregated_results = {}
     for run in range(1, iterations + 1):
         print(f"\nStarting Experiment Iteration {run}/{iterations}")
-        run_timestamp = f"{timestamp}_run_{run}"
-        run_csv_filename = f"./evaluation_results/{dataset_name}_evaluation_{run_timestamp}.csv"
+        # Create run-specific directory
+        run_dir = create_directory(results_dir, f"run_{run}")
         dataset = load_datasets(dataset_name=dataset_name, bag_of_words_size=bag_of_words_size)
-        model = run_gnn(dataset, entity_name, dataset_name, run_timestamp)
+        model = run_gnn(dataset, entity_name, dataset_name, run_dir)
         write_header = True
         for num_groups in num_groups_list:
             group_keyword_file = "" if num_groups == 0 else os.path.join(grouped_keyword_dir, f'groupedKeywords_{num_groups}.json')
-            owl_graph_path = f'./owlGraphs/{dataset_name}_{run_timestamp}_{num_groups}_groups_{"bool" if create_high_level_concepts_as_boolean else "data"}.owl'
+            owl_graph_path = os.path.join(run_dir, f"{num_groups}_groups_{'bool' if create_high_level_concepts_as_boolean else 'data'}.owl")
             print("\n" + "=" * 50)
             print(f"Running experiment {run} with create_high_level_concepts_as_boolean={create_high_level_concepts_as_boolean} and num_groups={num_groups}")
@@ -220,16 +223,16 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, bag_of_words_size
                 model, dataset.dataset, entity_name, owl_graph_path, high_level_concepts, create_high_level_concepts_as_boolean
             )
-            append_to_csv_file(results, run_csv_filename, dataset_name, num_groups, create_high_level_concepts_as_boolean, write_header=write_header)
+            append_to_csv_file(results, run_dir, dataset_name, num_groups, create_high_level_concepts_as_boolean, write_header=write_header)
             for label, data in results.items():
                 # Initialize aggregation for this label and number of groups if not yet present
                 key = (label, num_groups)
                 if key not in aggregated_results:
                     aggregated_results[key] = {
                         "label_name": data["label_name"],
                         "best_hypothesis": data["hypothesis"],
                         "best_F1": data["evaluation"]["F1"],
                         "best_accuracy": data["evaluation"]["Accuracy"],
                         "length_at_best_f1": data["length"],
                         "all_f1_scores": [],
                         "all_accuracies": [],
@@ -237,7 +240,6 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, bag_of_words_size
                         "all_times": []
                     }
                 aggregated_data = aggregated_results[key]
-                # If the current F1 is better than the stored best, update best values
                 if data["evaluation"]["F1"] > aggregated_data["best_F1"]:
                     aggregated_data["best_F1"] = data["evaluation"]["F1"]
                     aggregated_data["best_hypothesis"] = data["hypothesis"]
@@ -249,8 +251,9 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, bag_of_words_size
             write_header = False
-    summarize_aggregated_results(aggregated_results, summary_filename)
-    print(f"\nExperiments completed. Summary saved to {summary_filename}")
+    # Save summary to main results directory
+    summarize_aggregated_results(aggregated_results, results_dir, dataset_name)
+    print(f"\nExperiments completed. Results saved in {results_dir}")
 def main():
     datasets = [
@@ -272,7 +275,7 @@ def main():
             dataset["grouped_keyword_dir"],
             dataset["dataset_name"],
             dataset["entity_name"],
-            iterations=5 # Repeat experiment 5 times
+            iterations=5
         )
 if __name__ == "__main__":
......
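Taken together, the changes move every output from flat, timestamp-suffixed filenames into a single timestamped directory tree. Based on the calls shown in this diff, a run on dblp with two iterations would produce roughly the following layout (timestamp and file list are illustrative):

evaluation_results/
    20250101_120000/                 <- results_dir created at the start of experiment()
        dblp_summary.csv             <- written by summarize_aggregated_results
        run_1/
            dblp_evaluation.json     <- written by run_gnn
            dblp_results.csv         <- appended by append_to_csv_file
            0_groups_bool.owl        <- owl_graph_path, one per num_groups setting
            5_groups_bool.owl
        run_2/
            ...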