Commit 3c772842 authored by AjUm-HEIDI

Add code to set a title

parent 67f2e923
@@ -7,7 +7,6 @@ class Base:
     def __init__(self, path, bag_of_words_size=100, remove_all_false_values=True):
         self.path = path
         self.bag_of_words_size = bag_of_words_size
         self.remove_all_false_values = remove_all_false_values
         try:
             self.stop_words = set(stopwords.words('english'))
...
@@ -12,11 +12,12 @@ def parse_arguments():
     parser = argparse.ArgumentParser(description="Run experiments for structured or text datasets.")
     parser.add_argument("-c", "--config", default="config.json", help="Path to the configuration file, defaults to 'config.json'")
     parser.add_argument("-i", "--iterations", type=int, default=5, help="The total number of times to run the experiment")
-    parser.add_argument("-t", "--type", choices=['text', 'structured'], help="Type of dataset, defaults to 'structured'")
+    parser.add_argument("-t", "--type", choices=['text', 'structured', 's', 't'], default=None, help="Type of dataset: 's' or 'structured' for structured and 't' or 'text' for text. Defaults to run both.")
     parser.add_argument("-d", "--dataset", help="Specific dataset name to run, optional")
     parser.add_argument("-n", "--num_groups", type=int, nargs='*', default=None, help="List of group sizes to run experiments with, space-separated")
     parser.add_argument("-l", "--labels", type=int, nargs='*', default=None, help="List of labels to run experiments with, space-separated")
     parser.add_argument("-b", "--boolean_concepts", type=str, choices=['true', 'false'], default=None, help="Whether to create high level concepts as boolean values, must explicitly state 'true' or 'false'")
+    parser.add_argument("--title", type=str, default="", help="Title to append to the results folder name")
     args = parser.parse_args()
     # Convert boolean_concepts argument from string to actual Boolean type, or None if not specified
@@ -27,6 +28,10 @@ def parse_arguments():
     if args.iterations < 1:
         parser.error("The number of iterations must be a positive integer.")
+    # Clean title: remove spaces and special characters, convert to lowercase
+    if args.title:
+        args.title = "_" + "".join(c.lower() for c in args.title if c.isalnum())
     return args

 def main():
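For reference, a minimal sketch of what the title cleanup above produces (the sample value is hypothetical):

    # Keep only alphanumeric characters, lowercase them, and prefix an underscore.
    title = "My Run #1"
    cleaned = "_" + "".join(c.lower() for c in title if c.isalnum())
    print(cleaned)  # -> _myrun1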
@@ -35,10 +40,12 @@ def main():
     # Decide which types to run based on the argument passed
     types_to_run = []
-    if not args.type:
-        types_to_run = ['text', 'structured']
+    if args.type == 't' or args.type == 'text':
+        types_to_run = ['text']
+    elif args.type == 's' or args.type == 'structured':
+        types_to_run = ['structured']
     else:
-        types_to_run.append(args.type)
+        types_to_run = ['text', 'structured']

     for dataset_type in types_to_run:
         datasets = config[dataset_type]  # Load datasets based on type (structured or text)
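A hedged sketch of the new -t/--type dispatch, pulled into a standalone helper for illustration (resolve_types is our name, not the repo's):

    def resolve_types(type_arg):
        # 't'/'text' -> text only, 's'/'structured' -> structured only,
        # anything else (including None, the default) -> run both.
        if type_arg == 't' or type_arg == 'text':
            return ['text']
        elif type_arg == 's' or type_arg == 'structured':
            return ['structured']
        else:
            return ['text', 'structured']

    assert resolve_types('s') == ['structured']
    assert resolve_types(None) == ['text', 'structured']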
@@ -51,16 +58,14 @@ def main():
         # Extract additional parameters from configuration or use defaults
         num_groups_list = args.num_groups if args.num_groups is not None else [i * 5 for i in range(6)]
-        # Pass the command-line argument for creating boolean concepts
-        create_high_level_concepts_as_boolean = args.boolean_concepts

         if dataset_type == "structured":
             structured_datasets_experiment(
                 dataset["dataset_name"],
                 add_node_type=dataset["add_node_type"],
                 iterations=args.iterations,
-                create_high_level_concepts_as_boolean=create_high_level_concepts_as_boolean,
-                selected_labels=args.labels
+                create_high_level_concepts_as_boolean=args.boolean_concepts,
+                selected_labels=args.labels,
+                title=args.title
             )
         elif dataset_type == "text":
             text_based_datasets_experiment(
@@ -69,8 +74,9 @@ def main():
                 dataset["entity_name"],
                 iterations=args.iterations,
                 num_groups_list=num_groups_list,
-                create_high_level_concepts_as_boolean=create_high_level_concepts_as_boolean,
-                selected_labels=args.labels
+                create_high_level_concepts_as_boolean=args.boolean_concepts,
+                selected_labels=args.labels,
+                title=args.title
             )

 if __name__ == "__main__":
...
@@ -51,6 +51,7 @@ def explain_gnn(model, dataset, datasetName, run_dir, add_node_type, high_level_
         evaluation = {
             "Label": label,
             "MotifsAdded": high_level_concepts is not None,
+            "DataType": None if high_level_concepts is None else "Bool" if create_high_level_concepts_as_boolean is True else "Integer",
             "BestHypothesis": rendered_hypothesis,
             "Quality": best_hypothesis.quality,
             "Length": best_hypothesis.len
@@ -61,22 +62,22 @@
         )
         evaluation[metric.name] = evaluated_concept.q

     # Write evaluation to CSV
     with open(run_dir / f"gnn_explanations.csv", "a", newline="") as f:
         writer = csv.writer(f)
-        if f.tell() == 0:  # Write header if the file is empty
+        if f.tell() == 0:
             writer.writerow([
-                "Label", "MotifsAdded", "BestHypothesis", "Quality",
-                "Length", "Accuracy", "Recall", "Precision", "F1"
+                "Label", "MotifsAdded", "DataType",
+                "Length", "F1", "Accuracy",
+                "Recall", "Precision", "BestHypothesis"
             ])
         writer.writerow([
-            evaluation["Label"], evaluation["MotifsAdded"], evaluation["BestHypothesis"],
-            evaluation["Quality"], evaluation["Length"], evaluation["Accuracy"],
-            evaluation["Recall"], evaluation["Precision"], evaluation["F1"]
+            evaluation["Label"], evaluation["MotifsAdded"], evaluation["DataType"],
+            evaluation["Length"], evaluation["F1"], evaluation["Accuracy"],
+            evaluation["Recall"], evaluation["Precision"], evaluation["BestHypothesis"]
         ])
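The CSV is opened in append mode, so the f.tell() == 0 check writes the header only when the file is still empty. A self-contained sketch of the pattern (demo.csv is a hypothetical path):

    import csv

    for row in ([1, 0.9], [2, 0.8]):
        with open("demo.csv", "a", newline="") as f:
            # In "a" mode the stream position starts at end-of-file,
            # so tell() is 0 only for a new or empty file.
            writer = csv.writer(f)
            if f.tell() == 0:
                writer.writerow(["Label", "F1"])
            writer.writerow(row)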
-def experiment(datasetName: str, add_node_type = True, iterations: int = 1, create_high_level_concepts_as_boolean=True, selected_labels=None):
+def experiment(datasetName: str, add_node_type = True, iterations: int = 1, create_high_level_concepts_as_boolean=None, selected_labels=None, title=""):
     """
     Run the experiment for the specified dataset multiple times.
@@ -94,10 +95,10 @@ def experiment(datasetName: str, add_node_type = True, iterations: int = 1, crea
     elif datasetName == "MUTAG":
         structuredDataset = MUTAG()
     else:
-        raise ValueError(f"Unknown dataset name '{datasetName}'. Valid options are 'BA2Motif', 'MultiShape', or 'MUTAG'.")
+        raise ValueError(f"Unknown dataset name '{datasetName}'.")

     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-    results = Path(f"evaluation_results/{timestamp}_{datasetName}")
+    results = Path(f"evaluation_results/{timestamp}_{datasetName}{title}")
     results.mkdir(parents=True, exist_ok=True)

     best_results = {}
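An illustration of the resulting folder name; the dataset and title values are hypothetical, and the title arrives already sanitized (e.g. "_myrun") from parse_arguments:

    import datetime
    from pathlib import Path

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    results = Path(f"evaluation_results/{timestamp}_MUTAG" + "_myrun")
    # e.g. evaluation_results/20250101_120000_MUTAG_myrun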
@@ -110,31 +111,20 @@ def experiment(datasetName: str, add_node_type = True, iterations: int = 1, crea
         device = 'cuda' if torch.cuda.is_available() else 'cpu'
         print(f"Using device: {device}")

-        evaluations = {
-            "gnn": {},
-            "confusion_matrix": {}
-        }

         model = GNN(structuredDataset.dataset)
         print("Training model...")
-        metrics = model.train_model(epochs=300, lr=0.001, show_progress=True)
-        evaluations["gnn"] = metrics
+        metrics = model.train_model(epochs=300, lr=0.001)

         # Convert tensors to NumPy arrays after moving them to CPU
         original_labels = np.array([data.y.item() for data in structuredDataset.dataset])
         predicted_labels = model.predict_all().clone().detach().cpu().numpy()

         # Compute confusion matrix
         cm = confusion_matrix(original_labels, predicted_labels)
-        evaluations["confusion_matrix"] = cm.tolist()

         # Write GNN metrics and confusion matrix to CSV
         with open(run_dir / f"gnn_results.csv", "w", newline="") as f:
             writer = csv.writer(f)
             writer.writerow(["Metric", "Value"])
             for metric, value in metrics.items():
                 writer.writerow([metric.capitalize(), value])
-            writer.writerow([])  # Blank row for separation
+            writer.writerow([])
             writer.writerow(["Confusion Matrix"] + [""] * (len(cm[0]) - 1))
             for row in cm:
                 writer.writerow(row)
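As a reminder of the layout written above, sklearn's confusion_matrix puts true labels on rows and predicted labels on columns, ordered by sorted label value:

    import numpy as np
    from sklearn.metrics import confusion_matrix

    y_true = np.array([0, 0, 1, 1])
    y_pred = np.array([0, 1, 1, 1])
    print(confusion_matrix(y_true, y_pred))
    # [[1 1]
    #  [0 2]]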
@@ -165,11 +155,17 @@ def experiment(datasetName: str, add_node_type = True, iterations: int = 1, crea
                 structuredDataset.visualize_pattern_in_graph(pattern_idx, graph_idx)

         print("\nAfter finding motifs:")
-        explain_gnn(model, structuredDataset.dataset, datasetName, run_dir, add_node_type, high_level_concepts, create_high_level_concepts_as_boolean, selected_labels)
-        print(f"Results for iteration {iteration + 1} saved in {run_dir}")
+        if create_high_level_concepts_as_boolean is not False:
+            print("\nRunning with boolean concepts:")
+            explain_gnn(model, structuredDataset.dataset, datasetName, run_dir, add_node_type,
+                        high_level_concepts, create_high_level_concepts_as_boolean=True, selected_labels=selected_labels)
+        if create_high_level_concepts_as_boolean is not True:
+            print("\nRunning with integer concepts:")
+            explain_gnn(model, structuredDataset.dataset, datasetName, run_dir, add_node_type,
+                        high_level_concepts, create_high_level_concepts_as_boolean=False, selected_labels=selected_labels)

         # Retrieve the best hypothesis for each label
         hypothesis_file = run_dir / "gnn_explanations.csv"
         if hypothesis_file.exists():
             with open(hypothesis_file, "r") as f:
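The two guards above turn the tri-state flag into one or two explanation passes; a small sketch of the dispatch (passes is our illustrative name):

    def passes(flag):
        runs = []
        if flag is not False:  # None or True
            runs.append("boolean")
        if flag is not True:   # None or False
            runs.append("integer")
        return runs

    assert passes(None) == ["boolean", "integer"]  # default: run both
    assert passes(True) == ["boolean"]
    assert passes(False) == ["integer"]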
@@ -177,10 +173,12 @@ def experiment(datasetName: str, add_node_type = True, iterations: int = 1, crea
                 for row in reader:
                     label = int(row["Label"])
                     motif_added = row["MotifsAdded"] == "True"
+                    datatype = row["DataType"]
                     current_result = {
                         "Iteration": iteration + 1,
                         "Label": label,
                         "MotifsAdded": motif_added,
+                        "DataType": datatype,
                         "BestHypothesis": row["BestHypothesis"],
                         "Length": int(row["Length"]),
                         "F1": float(row["F1"]),
@@ -189,16 +187,16 @@ def experiment(datasetName: str, add_node_type = True, iterations: int = 1, crea
                         "Precision": float(row["Precision"])
                     }
                     # Update if current result is better
-                    key = (label, motif_added)
+                    key = (label, motif_added, datatype)
                     if key not in best_results or current_result["F1"] > best_results[key]["F1"]:
                         best_results[key] = current_result

+        print(f"Results for iteration {iteration + 1} saved in {run_dir}")

     # Write the best results for each label and motif combination to the parent directory
-    with open(results / f"best_results_by_label_and_motif.csv", "w", newline="") as f:
+    with open(results / f"best_results_summary.csv", "w", newline="") as f:
         fieldnames = [
-            "Label", "MotifsAdded", "Iteration",
+            "Label", "MotifsAdded", "DataType", "Iteration",
             "Best F1 Score", "Length at Best F1", "Average F1 Score",
             "Best Accuracy", "Average Accuracy", "Average Length",
             "Best Hypothesis"
@@ -207,26 +205,24 @@ def experiment(datasetName: str, add_node_type = True, iterations: int = 1, crea
         writer.writeheader()

         # Compute aggregated metrics for each label and motif combination
-        for (label, motif_added), result in best_results.items():
+        for (label, motif_added, datatype), result in best_results.items():
             if result is not None:
                 # Identify all results for this label/motif combination
                 all_results = [
-                    r for (lbl, mot), r in best_results.items()
-                    if lbl == label and mot == motif_added
+                    r for (lbl, mot, dt), r in best_results.items()
+                    if lbl == label and mot == motif_added and dt == datatype
                 ]
                 # Compute averages
                 avg_f1 = sum(r["F1"] for r in all_results) / len(all_results)
                 avg_accuracy = sum(r["Accuracy"] for r in all_results) / len(all_results)
                 avg_length = sum(r["Length"] for r in all_results) / len(all_results)
                 # Determine the result with the highest F1 score
                 best_result = max(all_results, key=lambda r: r["F1"])
                 # Write the best result to the CSV
                 writer.writerow({
                     "Label": label,
                     "MotifsAdded": motif_added,
+                    "DataType": datatype,
                     "Iteration": best_result["Iteration"],
                     "Best F1 Score": best_result["F1"],
                     "Length at Best F1": best_result["Length"],
...
@@ -206,7 +206,7 @@ def summarize_aggregated_results(aggregated_results, results_dir, dataset_name):
     print(f"Summary results saved to {summary_filename}")

-def experiment(grouped_keyword_dir, dataset_name, entity_name, iterations=5, num_groups_list=[0, 5, 10, 15, 20, 25], create_high_level_concepts_as_boolean=True, selected_labels=None):
+def experiment(grouped_keyword_dir, dataset_name, entity_name, iterations=5, num_groups_list=[0, 5, 10, 15, 20, 25], create_high_level_concepts_as_boolean=True, selected_labels=None, title=""):
     """
     Handles dataset loading and evaluation for experiments.
     Manages the experiment based on specified number of groups and boolean concept creation settings.
@@ -217,7 +217,8 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, iterations=5, num
     # Create timestamp directory for all results
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-    results_dir = Path(f"evaluation_results/{timestamp}_{dataset_name}")
+    results_dir = Path(f"evaluation_results/{timestamp}_{dataset_name}{title}")
     results_dir.mkdir(parents=True, exist_ok=True)

     aggregated_results = {}
...