Skip to content
Snippets Groups Projects
Commit dbefed65 authored by AjUm-HEIDI's avatar AjUm-HEIDI
Browse files

Update the text based

parent 47ca581d
No related branches found
No related tags found
No related merge requests found
......@@ -9,7 +9,7 @@ class BA2Motif(Base):
A class to process the BA2Motif dataset, find frequent patterns, and visualize graphs and patterns.
"""
def __init__(self, path='../rawData/BA2Motif'):
def __init__(self, path='../rawData'):
"""
Initialize the BA2Motif processor.
......
......@@ -9,7 +9,7 @@ class MultiShape(Base):
A class to process the BAMultiShapes dataset, find frequent patterns, and visualize graphs and patterns.
"""
def __init__(self, path='../rawData/BAMultiShapes'):
def __init__(self, path='../rawData'):
"""
Initialize the MultiShape processor.
......
......@@ -123,7 +123,7 @@ class GNN(torch.nn.Module):
def train_model(self, epochs: int = 75, lr: float = 0.0005,
train_split: float = 0.8, show_progress: bool = False,
seed: Optional[int] = 8) -> Dict[str, Dict[str, float]]:
seed: Optional[int] = 15) -> Dict[str, Dict[str, float]]:
# Set random seeds for reproducibility
torch.manual_seed(seed)
......
......@@ -91,10 +91,10 @@ class GNN(torch.nn.Module):
return F.log_softmax(x, dim=1)
def train_model(self, epochs: int = 300, lr: float = 0.001,
def train_model(self, epochs: int = 75, lr: float = 0.0005,
train_split: float = 0.8, show_progress: bool = False) -> Dict[str, float]:
"""
Train with enhanced learning schedule and early stopping
Train with enhanced learning schedule but without early stopping
"""
train_loader, test_loader = self._prepare_data(train_split)
......@@ -103,8 +103,6 @@ class GNN(torch.nn.Module):
patience=20, min_lr=1e-5)
best_metrics = {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0, 'epoch': 0}
patience = 150
no_improve = 0
# Enhanced class weight calculation
if train_loader is not None:
......@@ -175,9 +173,6 @@ class GNN(torch.nn.Module):
if current_metrics['f1'] > best_metrics['f1']:
best_metrics = {**current_metrics, 'epoch': epoch}
self.best_model_state = self.state_dict()
no_improve = 0
else:
no_improve += 1
if show_progress and (epoch + 1) % 10 == 0:
print(f"\nEpoch {epoch+1:03d}, Loss: {total_loss/len(train_loader):.4f}")
......@@ -189,11 +184,6 @@ class GNN(torch.nn.Module):
for metric, value in current_metrics.items():
print(f"Test {metric.capitalize()}: {value:.4f}")
# Enhanced early stopping
if no_improve >= patience:
print(f"\nEarly stopping triggered after {epoch + 1} epochs")
break
if self.best_model_state:
self.load_state_dict(self.best_model_state)
return best_metrics
......
......@@ -96,7 +96,7 @@ def group_keywords_into_themes(keywords: List[str], num_themes: int):
openai.api_key = api_key
# Construct the prompt
prompt = f"Group the following keywords into {num_themes} distinct themes. Ensure each theme has a meaningful name, and do not combine themes unnecessarily. Here are the keywords:\n\n{', '.join(keywords)}\n\nPlease return the themes in a JSON format where the key is the theme name and the value is a list of grouped keywords."
prompt = f"Group the following keywords into {num_themes} distinct themes. Ensure each theme has a meaningful name, and do not combine themes unnecessarily. Uuse all the keywords while grouping into themes. Here are the keywords:\n\n{', '.join(keywords)}\n\nPlease return the themes in a JSON format where the key is the theme name and the value is a list of grouped keywords."
# Call the OpenAI API with the correct method
response = openai.ChatCompletion.create(
......
......@@ -16,19 +16,20 @@ def parse_arguments():
parser.add_argument("-d", "--dataset", help="Specific dataset name to run, optional")
parser.add_argument("-n", "--num_groups", type=int, nargs='*', default=None, help="List of group sizes to run experiments with, space-separated")
parser.add_argument("-l", "--labels", type=int, nargs='*', default=None, help="List of labels to run experiments with, space-separated")
parser.add_argument("-b", "--boolean_concepts", type=str, choices=['true', 'false'], default=None, help="Whether to create high level concepts as boolean values, must explicitly state 'true' or 'false'")
parser.add_argument("-b", "--boolean_concepts", type=str, choices=['true', 'false'], default=None, help="Whether to create high-level concepts as boolean values, must explicitly state 'true' or 'false'")
parser.add_argument("-e", "--use_experimented_groups", action='store_true', help="Use experimented groups if set, otherwise create new grouped keywords")
parser.add_argument("-p", "--penalty", type=int, default=1, help="Set the penalty of evolearner")
parser.add_argument("--title", type=str, default="", help="Title to append to the results folder name")
args = parser.parse_args()
# Convert boolean_concepts argument from string to an actual Boolean; default to True if not specified
if args.boolean_concepts is not None:
args.boolean_concepts = True if args.boolean_concepts.lower() == 'true' else False
else:
args.boolean_concepts = True
# Check if the iterations argument is a positive integer
if args.iterations < 1:
parser.error("The number of iterations must be a positive integer.")
# Clean title: remove spaces and special characters, convert to lowercase
if args.title:
args.title = "_" + "".join(c.lower() for c in args.title if c.isalnum())
......@@ -48,7 +49,7 @@ def main():
types_to_run = ['text', 'structured']
for dataset_type in types_to_run:
datasets = config[dataset_type] # Load datasets based on type (structured or text)
datasets = config[dataset_type]
# If a specific dataset is specified, filter to only run that one
if args.dataset:
......@@ -65,18 +66,20 @@ def main():
iterations=args.iterations,
create_high_level_concepts_as_boolean=args.boolean_concepts,
selected_labels=args.labels,
title=args.title
title=args.title,
penalty=args.penalty
)
elif dataset_type == "text":
text_based_datasets_experiment(
dataset["grouped_keyword_dir"],
dataset["dataset_name"],
dataset["entity_name"],
iterations=args.iterations,
num_groups_list=num_groups_list,
create_high_level_concepts_as_boolean=args.boolean_concepts,
selected_labels=args.labels,
title=args.title
title=args.title,
use_experimented_groups=args.use_experimented_groups,
penalty=args.penalty
)
if __name__ == "__main__":
......
......@@ -16,7 +16,7 @@ networkx==3.3
nltk==3.9.1
numpy==2.1.0
ontolearn==0.6.0
openai==1.59.7
openai==0.27.0
psutil==6.0.0
pyparsing==3.1.2
requests==2.32.3
......@@ -25,8 +25,9 @@ scipy==1.14.0
sympy==1.13.2
threadpoolctl==3.5.0
torch==2.4.0
torch_geometric==2.5.3
torch_geometric==2.6.1
tqdm==4.66.5
urllib3==2.2.2
yarl==1.9.4
python-dotenv==1.0.1
pytest==8.3.3
\ No newline at end of file
......@@ -88,7 +88,7 @@ def test_generate_owl_file(custom_heterodata):
def test_discriminative_result_dataproperties(custom_heterodata):
hgnn = GNN
owl_graph_path = os.path.join(base_dir, "testGraphs/test_discriminative_result_dataproperties.owl") # No leading slash
explainer = DiscriminativeExplainer(hgnn, custom_heterodata, "http://example.org/", owl_graph_path, generate_new_owl_file=True)
explainer = DiscriminativeExplainer(hgnn, custom_heterodata, "http://example.org/", owl_graph_path, generate_new_owl_file=False)
hypotheses, model = explainer.explain(1, 1, use_data_properties=True, debug=True)
hypothesis = hypotheses[0]
assert hypothesis.quality == 1.0
......@@ -100,7 +100,7 @@ def test_discriminative_result_dataproperties(custom_heterodata):
def test_discriminative_result_objectproperties(custom_heterodata1):
hgnn = GNN
owl_graph_path = os.path.join(base_dir, "testGraphs/test_discriminative_result_objectproperties.owl") # No leading slash
explainer = DiscriminativeExplainer(hgnn, custom_heterodata1, "http://example.org/", owl_graph_path, generate_new_owl_file=True)
explainer = DiscriminativeExplainer(hgnn, custom_heterodata1, "http://example.org/", owl_graph_path, generate_new_owl_file=False)
hypotheses, model = explainer.explain(1, 1, use_data_properties=False, debug=True)
hypothesis = hypotheses[0]
assert hypothesis.quality == 1.0
......
......@@ -33,7 +33,7 @@ def run_gnn(structuredDataset: Base, entity_name, datasetName, results_dir):
print("Initializing GNN model...")
model = GNN(structuredDataset.dataset)
print("Training model...")
metrics = model.train_model(epochs=150, lr=0.0005, show_progress=True)
metrics = model.train_model()
evaluations["gnn"] = metrics[entity_name]
print("\nGNN Metrics:")
......@@ -112,7 +112,7 @@ def append_to_csv_file(results, results_dir, dataset_key, num_groups, create_hig
print(f"Results appended to {filename}")
def explain_and_evaluate(model, dataset, entity_name, owl_graph_path, high_level_concepts, create_high_level_concepts_as_boolean, selected_labels=None):
def explain_and_evaluate(model, dataset, entity_name, owl_graph_path, high_level_concepts, create_high_level_concepts_as_boolean, selected_labels=None, penalty=1):
"""
Explains and evaluates each label in the dataset.
"""
......@@ -134,7 +134,7 @@ def explain_and_evaluate(model, dataset, entity_name, owl_graph_path, high_level
label_name = dataset[entity_name].yLabel[label]
start_time = time.time() # Start timing for each explanation
print(f"\nExplaining {entity_name} label {label}: {label_name}")
hypotheses, model = explainer.explain(label, 5, max_runtime=90, num_generations=1000)
hypotheses, model = explainer.explain(label, 5, max_runtime=90, num_generations=1000, penalty=penalty)
explain_time = time.time() - start_time # End timing for each explanation
best_hypothesis = hypotheses[0].concept
......@@ -215,7 +215,7 @@ def summarize_aggregated_results(aggregated_results, results_dir, dataset_name):
print(f"Summary results saved to {summary_filename}")
def experiment(grouped_keyword_dir, dataset_name, entity_name, iterations=5, num_groups_list=[0, 5, 10, 15, 20, 25], create_high_level_concepts_as_boolean=True, selected_labels=None, title=""):
def experiment(dataset_name, entity_name, iterations=5, num_groups_list=[0, 5, 10, 15, 20, 25], create_high_level_concepts_as_boolean=True, selected_labels=None, title="", use_experimented_groups=False, penalty=1):
"""
Handles dataset loading and evaluation for experiments.
Manages the experiment based on specified number of groups and boolean concept creation settings.
......@@ -225,11 +225,20 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, iterations=5, num
# Create timestamp directory for all results
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
results_dir = Path(f"evaluation_results/{timestamp}_{dataset_name}{title}")
results_dir.mkdir(parents=True, exist_ok=True)
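# Resolve the grouped-keywords directory: reuse the experimented groups if requested, otherwise create a new one under the results directory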
if use_experimented_groups:
grouped_keyword_dir = f"experimentedGroups/{dataset_name}"
else:
grouped_keyword_dir = results_dir / "groupedKeywords"
grouped_keyword_dir.mkdir(parents=True, exist_ok=True)
# Load dataset and train GNN model only once
dataset = load_datasets(dataset_name=dataset_name)
model = run_gnn(dataset, entity_name, dataset_name, results_dir)
aggregated_results = {}
for run in range(1, iterations + 1):
......@@ -238,9 +247,6 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, iterations=5, num
# Create run-specific directory
run_dir = create_directory(results_dir, f"run_{run}")
dataset = load_datasets(dataset_name=dataset_name)
model = run_gnn(dataset, entity_name, dataset_name, run_dir)
write_header = True
for num_groups in num_groups_list:
group_keyword_file = "" if num_groups == 0 else os.path.join(grouped_keyword_dir, f'groupedKeywords_{num_groups}.json')
......@@ -253,7 +259,7 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, iterations=5, num
high_level_concepts = None if num_groups == 0 else fetch_high_level_concepts(dataset, num_groups, group_keyword_file)
results = explain_and_evaluate(
model, dataset.dataset, entity_name, owl_graph_path, high_level_concepts, create_high_level_concepts_as_boolean, selected_labels
model, dataset.dataset, entity_name, owl_graph_path, high_level_concepts, create_high_level_concepts_as_boolean, selected_labels, penalty
)
append_to_csv_file(results, run_dir, dataset_name, num_groups, create_high_level_concepts_as_boolean, write_header=write_header)
......@@ -291,24 +297,22 @@ def experiment(grouped_keyword_dir, dataset_name, entity_name, iterations=5, num
def main():
datasets = [
{
'grouped_keyword_dir': 'rawData/dblp/groups',
'dataset_name': 'dblp',
'entity_name': 'author'
}
# ,
# {
# 'grouped_keyword_dir': 'rawData/imdb/groups',
# 'dataset_name': 'imdb',
# 'entity_name': 'movie'
# }
,
{
'dataset_name': 'imdb',
'entity_name': 'movie'
}
]
for dataset in datasets:
experiment(
dataset["grouped_keyword_dir"],
dataset["dataset_name"],
dataset["entity_name"],
iterations=5
iterations=5,
use_experimented_groups=True
)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment