Commit 3fe3a1fd authored by AjUm-HEIDI

Update text based datasets

parent 7733c9e8
......@@ -26,8 +26,9 @@ class BaseOWLConverter(ABC):
pass
class SingleGraphOWLConverter(BaseOWLConverter):
"""Converter for single HeteroData graphs"""
"""Converter for single HeteroData graphs with support for high-level concepts"""
def __init__(self, data: HeteroData, namespace: str, owlGraphPath: str,
high_level_concepts: dict = None,
create_data_properties: bool = True,
create_data_properties_as_object: bool = False,
add_edge_counts: bool = False,
......@@ -37,6 +38,7 @@ class SingleGraphOWLConverter(BaseOWLConverter):
**kwargs):
super().__init__(namespace, owlGraphPath)
self.dataset = data
self.high_level_concepts = high_level_concepts or {}
self.create_data_properties = create_data_properties
self.create_data_properties_as_object = create_data_properties_as_object
self.add_edge_counts = add_edge_counts
......@@ -107,12 +109,13 @@ class SingleGraphOWLConverter(BaseOWLConverter):
self.graph.add((propertyObjectProperty, RDFS.domain, classNamespace[node]))
self.graph.add((propertyObjectProperty, RDFS.range, xsdRange))
if "super_classes" in self.dataset[node]:
for super_class in self.dataset[node].super_classes:
propertyObjectProperty = classNamespace["has_theme_" + super_class]
self.graph.add((propertyObjectProperty, RDF.type, OWL.DatatypeProperty))
self.graph.add((propertyObjectProperty, RDFS.domain, classNamespace[node]))
self.graph.add((propertyObjectProperty, RDFS.range, XSD.boolean))
# Add high-level concepts if available for this node type
if node in self.high_level_concepts:
for theme in self.high_level_concepts[node].get('themes', []):
theme_namespace = classNamespace[f'has_theme_{theme}']
self.graph.add((theme_namespace, RDF.type, OWL.DatatypeProperty))
self.graph.add((theme_namespace, RDFS.domain, classNamespace[node]))
self.graph.add((theme_namespace, RDFS.range, XSD.boolean))
def _buildObjectProperties(self):
classNamespace = Namespace(self.namespace)
......@@ -128,6 +131,7 @@ class SingleGraphOWLConverter(BaseOWLConverter):
def _buildNodes(self):
classNamespace = Namespace(self.namespace)
for node_type in self.dataset.node_types:
if "x" in self.dataset[node_type]:
tensor_values = self.dataset[node_type].x
for row_idx, properties in enumerate(tensor_values):
......@@ -153,6 +157,16 @@ class SingleGraphOWLConverter(BaseOWLConverter):
val = True if val != 0 else False
if self.add_false_values or val or val != 0:
self.graph.add((newNode, propertyObjectProperty, Literal(val)))
if node_type in self.high_level_concepts:
high_level_concept = self.high_level_concepts[node_type]
presence_matrix = high_level_concept["presence_matrix"] # m x n tensor (rows: nodes, columns: themes)
themes = high_level_concept["themes"] # n theme names
for theme_idx, theme in enumerate(themes): # Iterate over theme columns
if presence_matrix[row_idx, theme_idx] != 0: # Theme is present for this node
self.graph.add((newNode, classNamespace[f'has_theme_{theme}'], Literal(True)))
if "num_nodes" in self.dataset[node_type]:
num_nodes = self.dataset[node_type].num_nodes
for idx in range(num_nodes):
......@@ -242,10 +256,13 @@ class SingleGraphOWLConverter(BaseOWLConverter):
self.graph.add((newNode, classNamespace[f'{nodeType}_outgoing'], Literal(nodeCounts[nodeType][node]["outgoing"])))
class MultiGraphOWLConverter(BaseOWLConverter):
"""Converter for datasets containing multiple graphs"""
def __init__(self, dataset: Data, namespace: str, owlGraphPath: str, n: int = None, ignore_nodes: bool = False, add_node_type: bool = True, **kwargs):
"""Converter for datasets containing multiple graphs with pattern support"""
def __init__(self, dataset: Data, namespace: str, owlGraphPath: str, n: int = None, ignore_nodes: bool = False, add_node_type: bool = True, high_level_concepts: dict = None, **kwargs):
super().__init__(namespace, owlGraphPath)
self.dataset = dataset
self.high_level_concepts = high_level_concepts or {}
self.patterns = self.high_level_concepts.get("patterns", [])
self.presence_matrix = self.high_level_concepts.get("presence_matrix", None)
self.ignore_nodes = ignore_nodes
self.add_node_type = add_node_type
self.n = n if n is not None and n <= len(self.dataset) else len(self.dataset)
......@@ -253,7 +270,7 @@ class MultiGraphOWLConverter(BaseOWLConverter):
print(f"Will process {self.n} instances out of {len(self.dataset)} total instances")
def _build(self):
"""Implements the building process for multiple graphs"""
"""Implements the building process for multiple graphs with pattern support"""
start_time = time.time()
print("Creating Dataset Classes...")
self._createDatasetClasses()
......@@ -270,15 +287,29 @@ class MultiGraphOWLConverter(BaseOWLConverter):
print(f"Data Properties built in {time.time() - start_time:.2f} seconds.")
def _createDatasetClasses(self):
"""Creates the basic classes for the dataset"""
ns = Namespace(self.namespace)
self.graph.add((ns.Graph, RDF.type, OWL.Class))
self.graph.add((ns.Node, RDF.type, OWL.Class))
if hasattr(self.dataset, 'super_classes'):
for structure in self.dataset.super_classes:
has_structure = ns['has_'+structure]
self.graph.add((has_structure, RDF.type, OWL.ObjectProperty))
self.graph.add((has_structure, RDFS.domain, ns.Graph))
self.graph.add((has_structure, RDFS.range, XSD.boolean))
has_motif = ns['has_motif_' + structure]
self.graph.add((has_motif, RDF.type, OWL.DatatypeProperty))
self.graph.add((has_motif, RDFS.domain, ns.Graph))
self.graph.add((has_motif, RDFS.range, XSD.boolean))
for i, pattern_name in enumerate(self.patterns):
has_pattern = ns[f'has_pattern_{pattern_name}']
self.graph.add((has_pattern, RDF.type, OWL.DatatypeProperty))
self.graph.add((has_pattern, RDFS.domain, ns.Graph))
self.graph.add((has_pattern, RDFS.range, XSD.boolean))
# Add a property to count the number of superstructures
ns.superstructure_count = ns["superstructure_count"]
self.graph.add((ns.superstructure_count, RDF.type, OWL.DatatypeProperty))
self.graph.add((ns.superstructure_count, RDFS.domain, ns.Graph))
self.graph.add((ns.superstructure_count, RDFS.range, XSD.integer))
# Add node type properties
if self.add_node_type:
......@@ -289,6 +320,7 @@ class MultiGraphOWLConverter(BaseOWLConverter):
self.graph.add((has_node_type, RDFS.range, XSD.boolean))
def _buildDatasetProperties(self):
"""Builds the basic properties for the dataset"""
ns = Namespace(self.namespace)
if not self.ignore_nodes:
self.graph.add((ns.contains, RDF.type, OWL.ObjectProperty))
......@@ -300,31 +332,32 @@ class MultiGraphOWLConverter(BaseOWLConverter):
self.graph.add((ns.connectedTo, RDFS.range, ns.Node))
def _buildDatasetIndividuals(self):
"""Builds dataset individuals with pattern information"""
ns = Namespace(self.namespace)
for idx in range(self.n):
if idx % 10 == 0:
print(f"Processing graph {idx}/{self.n}...")
data = self.dataset[idx]
superstructure_count = 0
# Create graph individual
graph_uri = ns[f'graph_{idx}']
self.graph.add((graph_uri, RDF.type, ns.Graph))
self.graph.add((graph_uri, RDF.type, OWL.NamedIndividual))
# Add super class relationships if present
if hasattr(data, 'super_classes'):
for structure in data.super_classes:
has_structure = ns['has_' + structure]
self.graph.add((graph_uri, has_structure, Literal(True)))
# Add pattern presence information
if self.patterns and self.presence_matrix is not None:
for pattern_idx, pattern in enumerate(self.patterns):
if self.presence_matrix[idx, pattern_idx]:
has_pattern = ns[f'has_pattern_{pattern}']
self.graph.add((graph_uri, has_pattern, Literal(True)))
superstructure_count += 1
self.graph.add((graph_uri, ns.superstructure_count, Literal(superstructure_count, datatype=XSD.integer)))
# Handle node type information
# Extract node features (if available)
node_type_tensor = getattr(data, 'x', None)
if node_type_tensor is None:
print(f"No node features (data.x) found for graph {idx}. Skipping node type processing.")
continue
if self.ignore_nodes:
# Skip adding individual nodes, only process superstructures
if self.add_node_type and node_type_tensor is not None:
# Add node type counts directly to graph
for type_idx in range(node_type_tensor.size(1)):
has_node_type = ns[f'has_node_type_{type_idx}']
......@@ -333,20 +366,19 @@ class MultiGraphOWLConverter(BaseOWLConverter):
self.graph.add((graph_uri, has_node_type, Literal(True)))
else:
# Add nodes with their types
if node_type_tensor is not None:
for node_idx in range(data.num_nodes):
node_uri = ns[f'node_{idx}_{node_idx}']
self.graph.add((node_uri, RDF.type, ns.Node))
self.graph.add((graph_uri, ns.contains, node_uri))
# Determine the node type
if self.add_node_type:
node_features = node_type_tensor[node_idx]
node_type = torch.argmax(node_features).item() if node_features.sum() > 0 else None
if node_type is not None:
has_node_type = ns[f'has_node_type_{node_type}']
self.graph.add((node_uri, has_node_type, Literal(True)))
# Add edges
edges = getattr(data, 'edge_index', None)
if edges is not None:
for edge in edges.t():
......@@ -355,27 +387,46 @@ class MultiGraphOWLConverter(BaseOWLConverter):
target_uri = ns[f'node_{idx}_{target.item()}']
self.graph.add((source_uri, ns.connectedTo, target_uri))
def convert_to_owl(data, namespace: str, owlGraphPath: str, **kwargs):
"""Factory function to create the appropriate converter based on input data type"""
def convert_to_owl(data, namespace: str, owlGraphPath: str, high_level_concepts: dict = None, **kwargs):
"""
Factory function to create the appropriate converter based on input data type
Args:
data: Input data (HeteroData or regular Data)
namespace (str): Namespace for the OWL ontology
owlGraphPath (str): Path where the OWL file will be saved
high_level_concepts (dict): For HeteroData, a dictionary mapping node types to their concepts
For regular Data, a dict with 'patterns' and 'presence_matrix' entries
**kwargs: Additional arguments passed to the converter
Returns:
BaseOWLConverter: An instance of the appropriate converter
"""
if isinstance(data, HeteroData):
return SingleGraphOWLConverter(data, namespace, owlGraphPath, **kwargs)
return SingleGraphOWLConverter(data, namespace, owlGraphPath, high_level_concepts, **kwargs)
else:
return MultiGraphOWLConverter(data, namespace, owlGraphPath, **kwargs)
return MultiGraphOWLConverter(data, namespace, owlGraphPath, high_level_concepts=high_level_concepts, **kwargs)
if __name__ == "__main__":
# Example 1: Multiple graphs usage
# Example usage with MUTAG dataset
from customDBs.MUTAG import MUTAG
dataset = MUTAG(path='../rawData/MUTAG', add_patterns=True)
from customDBs.MultiShape import MultiShape
mutag = MUTAG(path='../rawData/MUTAG')
ms = MultiShape(path='./rawData/BAMultiShapes')
# Detect motifs and get patterns
patterns, presence_matrix = mutag.detect_motifs(mutag.dataset)
# Convert to OWL
converter = convert_to_owl(
data=dataset.dataset,
data=mutag.dataset,
namespace="http://example.org/",
owlGraphPath="./owlGraphs/mutage_multi3.owl",
ignore_nodes=True,
n=5
owlGraphPath="./owlGraphs/mutag_with_patterns.owl",
high_level_concepts={
"patterns": patterns,
"presence_matrix": presence_matrix
},
ignore_nodes=True
)
converter.buildGraph()
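For the HeteroData path, SingleGraphOWLConverter reads high_level_concepts as a per-node-type dictionary with 'themes' and 'presence_matrix' entries (see _createDataProperties and _buildNodes above). The sketch below only illustrates that layout; the node type, theme names, and values are made up:

# Sketch of the high_level_concepts layout expected by SingleGraphOWLConverter.
# The node type and theme names are illustrative; only the structure matters:
#   {node_type: {'themes': [n theme names],
#                'presence_matrix': m x n tensor, one row per node of that type}}
import torch

example_high_level_concepts = {
    'paper': {
        'themes': ['machine_learning', 'databases'],
        'presence_matrix': torch.tensor([[1, 0],
                                         [0, 1]]),
    }
}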
......
from collections import defaultdict
from datetime import datetime
import argparse
import json
import os
import re
import torch
from torch_geometric.data import HeteroData
from ConceptLearner.Utils import group_keywords_into_themes, clean_false_entries_in_dataset, group_themes
from ConceptLearner.Utils import group_themes
from nltk.corpus import stopwords
import nltk
try:
stop_words = set(stopwords.words('english'))
except LookupError:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from customDBs.TextDataset import TextDataset
# Define label categories as constants
LABELS = ["Database", "Data Mining", "Artificial Intelligence", "Information Retrieval"]
def load_dblp(path="./rawData/dblp", bag_of_words_size=100, groupKeywords=True, groupedKeywordsPath='', removeAllFalseValues=True):
class DBLP(TextDataset):
def __init__(self, path='rawData/dblp', bag_of_words_size=100, remove_all_false_values=True):
"""
Loads the DBLP dataset and constructs a HeteroData object for PyTorch Geometric.
Args:
path (str): Path to the DBLP data directory.
bag_of_words_size (int): Number of top words to consider if grouping is disabled.
groupKeywords (bool): Whether to group keywords into themes.
groupedKeywordsPath (str): Path to save/load grouped keywords.
removeAllFalseValues (bool): Whether to clean false entries in the dataset.
Returns:
HeteroData: The constructed heterogeneous graph data.
"""
if not groupedKeywordsPath:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
groupedKeywordsPath = os.path.join(path, f"groupedKeywords_{timestamp}.json")
super().__init__(path=path, bag_of_words_size=bag_of_words_size, remove_all_false_values=remove_all_false_values)
self._initialize()
def _initialize(self):
"""
Loads and processes the DBLP data, constructing a HeteroData object with author, paper, and conference nodes.
Paper nodes get bag-of-words features over the top bag_of_words_size keywords.
"""
try:
self.stop_words = set(stopwords.words('english'))
except LookupError:
nltk.download('stopwords')
self.stop_words = set(stopwords.words('english'))
# Load authors
author_ids, author_labels, author_id_dict = _get_authors(path)
author_ids, author_labels, author_id_dict = self._get_authors()
# Load papers and grouped themes
paper_tensor, paper_id_dict, bag_of_words = _get_papers(path, bag_of_words_size, groupKeywords, groupedKeywordsPath)
paper_tensor, paper_id_dict, bag_of_words = self._get_papers()
author_paper_mappings = _get_author_paper_mappings(path, author_id_dict, paper_id_dict)
conf_ids, conf_id_dict = _get_conference(path)
paper_conference_mappings = _get_paper_conference_mappings(path, paper_id_dict, conf_id_dict)
# Construct HeteroData
dataset = HeteroData()
author_paper_mappings = self._get_author_paper_mappings(author_id_dict, paper_id_dict)
conf_ids, conf_id_dict = self._get_conference()
paper_conference_mappings = self._get_paper_conference_mappings(paper_id_dict, conf_id_dict)
# Author Nodes
dataset['author'].num_nodes = len(author_labels)
dataset['author'].y = torch.tensor(author_labels)
dataset['author'].yLabel = LABELS
dataset['author'].x = torch.zeros((dataset['author'].num_nodes, 1), dtype=torch.float32) # Dummy feature matrix
self.dataset['author'].num_nodes = len(author_labels)
self.dataset['author'].y = torch.tensor(author_labels)
self.dataset['author'].yLabel = ["Database", "Data Mining", "Artificial Intelligence", "Information Retrieval"]
self.dataset['author'].x = torch.zeros((self.dataset['author'].num_nodes, 1), dtype=torch.float32) # Dummy feature matrix
# Paper Nodes
dataset['paper'].x = paper_tensor.float()
dataset['paper'].xKeys = bag_of_words
self.dataset['paper'].x = paper_tensor.float()
self.dataset['paper'].xKeys = bag_of_words
# Author-Paper Edges
dataset['author', 'writes', 'paper'].edge_index = author_paper_mappings.t()
dataset['paper', 'written_by', 'author'].edge_index = author_paper_mappings.t()[[1, 0], :]
self.dataset['author', 'writes', 'paper'].edge_index = author_paper_mappings.t()
self.dataset['paper', 'written_by', 'author'].edge_index = author_paper_mappings.t()[[1, 0], :]
dataset['conference'].x = torch.tensor(conf_ids)
dataset['paper', 'published_in', 'conference'].edge_index = paper_conference_mappings.t()
self.dataset['conference'].num_nodes = len(conf_ids) # Conferences carry no features; only the node count is recorded
self.dataset['paper', 'published_in', 'conference'].edge_index = paper_conference_mappings.t()
return dataset
return self.dataset
def _get_authors(path):
def _get_authors(self):
"""
Reads the author_label.txt file and extracts author IDs and labels.
......@@ -79,7 +79,7 @@ def _get_authors(path):
Returns:
tuple: (ids, labels, id_dict)
"""
file_path = os.path.join(path, "author_label.txt")
file_path = os.path.join(self.path, "author_label.txt")
ids = []
labels = []
......@@ -104,20 +104,13 @@ def _get_authors(path):
id_dict = {id: idx for idx, id in enumerate(ids)}
# Validate labels
if labels and max(labels) >= len(LABELS):
print(f"Encountered a label {max(labels)} outside the defined yLabel categories.")
raise ValueError("Invalid label found in author_label.txt.")
return ids, labels, id_dict
def fetch_themes(num_groups, groupedKeywordsPath=''):
def fetch_themes(self, num_groups, groupedKeywordsPath=''):
"""
Groups tensor features into themes based on vocabulary words.
Args:
tensor (torch.Tensor): Original tensor with word frequencies
vocabulary (list): List of words corresponding to tensor columns
num_groups (int): Number of groups/themes to create
groupedKeywordsPath (str, optional): Path to save/load grouped keywords
......@@ -128,8 +121,8 @@ def fetch_themes(num_groups, groupedKeywordsPath=''):
"""
grouped_themes = {}
vocabulary = dataset['paper'].xKeys
tensor = dataset['paper'].x
vocabulary = self.dataset['paper'].xKeys
tensor = self.dataset['paper'].x
grouped_tensor, grouped_themes = group_themes(tensor, vocabulary, num_groups, groupedKeywordsPath)
......@@ -142,20 +135,14 @@ def fetch_themes(num_groups, groupedKeywordsPath=''):
return high_level_concepts
def _get_papers(path, bag_of_words_size):
def _get_papers(self):
"""
Reads the paper.txt file and processes the text into a tensor.
Args:
path (str): Path to the DBLP data directory.
bag_of_words_size (int): Number of top words to consider.
groupKeywords (bool): Whether to group keywords into themes.
groupedKeywordsPath (str): Path to save/load grouped keywords.
Returns:
tuple: (paper_tensor, paper_id_dict, vocabulary)
"""
file_path = os.path.join(path, "paper.txt")
file_path = os.path.join(self.path, "paper.txt")
bag_of_words = []
vocabulary = []
total_words = defaultdict(int)
......@@ -177,7 +164,7 @@ def _get_papers(path, bag_of_words_size):
words = re.findall(r'\w+', text.lower()) # Tokenize and convert to lowercase
word_count = defaultdict(int)
for word in words:
if word in stop_words:
if word in self.stop_words:
continue
if word not in total_words:
vocabulary.append(word)
......@@ -194,7 +181,7 @@ def _get_papers(path, bag_of_words_size):
raise
# Get top N words for vocabulary
top_n = bag_of_words_size
top_n = self.bag_of_words_size
total_words = dict(sorted(total_words.items(), key=lambda item: item[1], reverse=True)[:top_n])
vocabulary = list(total_words.keys())
......@@ -209,19 +196,18 @@ def _get_papers(path, bag_of_words_size):
return paper_tensor, paper_id_dict, vocabulary
def _get_author_paper_mappings(path, author_id_dict, paper_id_dict):
def _get_author_paper_mappings(self, author_id_dict, paper_id_dict):
"""
Reads the paper_author.txt file and creates author-paper edge mappings.
Args:
path (str): Path to the DBLP data directory.
author_id_dict (dict): Mapping from author IDs to indices.
paper_id_dict (dict): Mapping from paper IDs to indices.
Returns:
torch.Tensor: Edge indices tensor of shape [2, num_edges].
"""
file_path = os.path.join(path, "paper_author.txt")
file_path = os.path.join(self.path, "paper_author.txt")
mappings = []
......@@ -252,7 +238,7 @@ def _get_author_paper_mappings(path, author_id_dict, paper_id_dict):
return mappings
def _get_conference(path):
def _get_conference(self):
"""
Reads the conf.txt file and extracts conference IDs and names.
......@@ -262,7 +248,7 @@ def _get_conference(path):
Returns:
tuple: (ids, id_dict)
"""
file_path = os.path.join(path, "conf.txt")
file_path = os.path.join(self.path, "conf.txt")
ids = []
try:
......@@ -286,19 +272,18 @@ def _get_conference(path):
id_dict = {id: idx for idx, id in enumerate(ids)}
return ids, id_dict
def _get_paper_conference_mappings(path, paper_id_dict, conf_id_dict):
def _get_paper_conference_mappings(self, paper_id_dict, conf_id_dict):
"""
Reads the paper_conf.txt file and creates paper-conference edge mappings.
Args:
path (str): Path to the DBLP data directory.
paper_id_dict (dict): Mapping from paper IDs to indices.
conf_id_dict (dict): Mapping from conference IDs to indices.
Returns:
torch.Tensor: Edge indices tensor of shape [2, num_edges].
"""
file_path = os.path.join(path, "paper_conf.txt")
file_path = os.path.join(self.path, "paper_conf.txt")
mappings = []
......@@ -331,5 +316,5 @@ def _get_paper_conference_mappings(path, paper_id_dict, conf_id_dict):
if __name__ == "__main__":
dataset = load_dblp(path="/Users/ajay/Documents/gitlabThesis/Thesis/rawData/customDblp")
print(dataset)
dblp = DBLP()
print(dblp.dataset)
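A usage sketch tying this loader to the OWL converter above. This is illustrative only: the customDBs.DBLP module path is assumed from the other imports in this commit, the converter module name is not visible here so its import is a placeholder, and fetch_themes is assumed to key its result by the 'paper' node type, mirroring the IMDB loader.

# Sketch: group DBLP paper keywords into themes and export the graph to OWL.
# The converter import path and output file name are placeholders.
from customDBs.DBLP import DBLP
from GraphToOWL import convert_to_owl  # placeholder: use the converter module's actual name

dblp = DBLP(path='rawData/dblp', bag_of_words_size=100)
high_level_concepts = dblp.fetch_themes(num_groups=5)  # expected: {'paper': {'themes': ..., 'presence_matrix': ...}}

converter = convert_to_owl(
    data=dblp.dataset,
    namespace="http://example.org/",
    owlGraphPath="./owlGraphs/dblp_with_themes.owl",
    high_level_concepts=high_level_concepts,
)
converter.buildGraph()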
from collections import defaultdict
from datetime import datetime
import json
import torch
from torch_geometric.data import HeteroData
from ConceptLearner.Utils import group_keywords_into_themes, clean_false_entries_in_dataset
import csv
import os
import sys
from ConceptLearner.Utils import group_themes
def load_imdb(path="rawData/imdb", bag_of_words_size=25, groupKeywords=True, groupedKeywordsPath='', removeAllFalseValues=True, numberOfGroups=5):
class IMDB:
def __init__(self, path='rawData/imdb/movie_metadata.csv', bag_of_words_size=100, remove_all_false_values=True):
"""
Loads and processes IMDb movie data, returning it as a HeteroData object suitable for PyTorch Geometric.
Each label (genre) gets its own top bag_of_words_size keywords.
:param path: Path to the directory containing the 'movie_metadata.csv' file.
:param bag_of_words_size: Number of top keywords or themes to include per label.
:param groupKeywords: Whether to group keywords into themes.
:param groupedKeywordsPath: Path to save or load grouped themes JSON file.
:param removeAllFalseValues: Whether to remove entries with zero bag of words.
:param numberOfGroups: Number of themes to group keywords into if grouping is enabled.
:return: A HeteroData object containing movie features and labels.
"""
# Construct the file path using os.path.join for cross-platform compatibility
file_path = os.path.join(path, "movie_metadata.csv")
Initialize the IMDB dataset.
if not groupedKeywordsPath:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
groupedKeywordsPath = os.path.join(path, f"groupedKeywords_{timestamp}.json")
Args:
path (str): Path to the IMDB directory.
"""
self.path = path
self.bag_of_words_size = bag_of_words_size
self.remove_all_false_values = remove_all_false_values
self.dataset = HeteroData()
self._initialize()
# Check if the file exists
if not os.path.exists(file_path):
raise FileNotFoundError(f"Error: The file '{file_path}' does not exist.")
def _initialize(self):
"""
Loads and processes IMDb movie data, adding movies, directors, and actors as nodes,
and creating edges between them.
"""
# Parse CSV data into a list of dictionaries
data_list = _parse_csv_to_dict(file_path)
data_list = self._parse_csv_to_dict(self.path)
# Process movies and extract features
movie_labels, movie_tensor, bag_of_words, grouped_labels, grouped_tensor = _get_movies(
data_list, bag_of_words_size, groupKeywords, groupedKeywordsPath, numberOfGroups
movie_labels, movie_tensor, plot_keywords, movie_to_director, movie_to_actors = self._get_movies(
data_list, self.bag_of_words_size
)
# Remove rows with zero bag of words if requested
if removeAllFalseValues:
# Find indices where sum of features is not zero
if groupKeywords and grouped_tensor is not None:
non_zero_indices = torch.where(grouped_tensor.sum(dim=1) != 0)[0]
else:
non_zero_indices = torch.where(movie_tensor.sum(dim=1) != 0)[0]
# Filter tensors and labels
movie_labels = [movie_labels[i] for i in non_zero_indices]
movie_tensor = movie_tensor[non_zero_indices]
# Filter out movies without the desired genres or zero bag-of-words features
valid_movie_indices = torch.where(movie_tensor.sum(dim=1) != 0)[0]
filtered_index_map = {i.item(): idx for idx, i in enumerate(valid_movie_indices)}
# Apply the filtering logic
movie_labels = [movie_labels[i] for i in valid_movie_indices]
movie_tensor = movie_tensor[valid_movie_indices]
movie_to_director = {
filtered_index_map[movie_idx]: director
for movie_idx, director in movie_to_director.items()
if movie_idx in filtered_index_map
}
movie_to_actors = {
filtered_index_map[movie_idx]: actors
for movie_idx, actors in movie_to_actors.items()
if movie_idx in filtered_index_map
}
if groupKeywords and grouped_tensor is not None:
grouped_tensor = grouped_tensor[non_zero_indices]
# Prepare movie nodes
self.dataset['movie'].x = movie_tensor
self.dataset['movie'].y = torch.tensor(movie_labels)
self.dataset['movie'].xKeys = plot_keywords
self.dataset['movie'].yLabel = ["Action", "Comedy", "Drama"]
# Prepare HeteroData object
dataset = HeteroData()
dataset['movie'].y = torch.tensor(movie_labels)
dataset['movie'].x = movie_tensor
dataset['movie'].xKeys = bag_of_words
dataset['movie'].yLabel = ["Action", "Comedy", "Drama"]
# Add director and actor nodes and edges
movie_to_director_edges = self._add_directors_and_edges(movie_to_director)
self.dataset['movie', 'to', 'director'].edge_index = movie_to_director_edges
self.dataset['director', 'to', 'movie'].edge_index = movie_to_director_edges.flip(0)
if groupKeywords and grouped_tensor is not None:
dataset['movie'].super_classes = grouped_labels
dataset['movie'].super_class_values = grouped_tensor
return dataset
movie_to_actors_edges = self._add_actors_and_edges(movie_to_actors)
self.dataset['movie', 'to', 'actor'].edge_index = movie_to_actors_edges
self.dataset['actor', 'to', 'movie'].edge_index = movie_to_actors_edges.flip(0)
return self.dataset
def _get_movies(data_list, bag_of_words_size, groupKeywords, groupedKeywordsPath, numberOfGroups):
def _get_movies(self, data_list, bag_of_words_size):
"""
Processes the list of movie data dictionaries to extract features and labels.
:param data_list: List of dictionaries containing movie data.
:param bag_of_words_size: Number of top keywords or themes to include.
:param groupKeywords: Whether to group keywords into themes.
:param groupedKeywordsPath: Path to save or load grouped themes JSON file.
:param numberOfGroups: Number of themes to group keywords into if grouping is enabled.
:return: Tuple of labels list, feature tensor, vocabulary list, and optionally grouped labels and tensor if groupKeywords is True.
Processes movie data to extract features, labels, and mappings.
Args:
data_list (list): List of dictionaries containing movie data.
bag_of_words_size (int): Number of top keywords to include in the vocabulary.
Returns:
tuple: A tuple containing:
- labels: List of labels (one per movie).
- tensor: Tensor with word frequencies.
- vocabulary: List of words in the vocabulary.
- movie_to_director: Mapping of movies to directors.
- movie_to_actors: Mapping of movies to actors.
"""
total_words = defaultdict(int)
labels = []
bag_of_words = []
labelNames = ["Action", "Comedy", "Drama"]
rating_order = {
"G": [1, 0, 0, 0, 0],
"PG": [0, 1, 0, 0, 0],
"PG-13": [0, 0, 1, 0, 0],
"R": [0, 0, 0, 1, 0],
"NC-17": [0, 0, 0, 0, 1]
}
# Dictionary to track keyword counts for each label
label_plot_counts = {labelName: defaultdict(int) for labelName in labelNames}
movie_to_director = {}
movie_to_actors = {}
label_names = ["Action", "Comedy", "Drama"]
# Process each movie entry
for data in data_list:
for idx, data in enumerate(data_list):
genres = data.get("genres", "").split("|")
content_rating = data.get("content_rating", "")
plot_keywords = data.get("plot_keywords", "")
director = data.get("director_name", "")
actors = data.get("actor_1_name", "") + "|" + data.get("actor_2_name", "") + "|" + data.get("actor_3_name", "")
# Determine label based on genres
label = -1
for label_idx, labelName in enumerate(labelNames):
if labelName in genres:
label = label_idx
break # Stop at the first matching genre
if label == -1 or content_rating not in rating_order:
continue # Skip movies without desired genres or content ratings
# Assign a label based on genres
label = next((idx for idx, name in enumerate(label_names) if name in genres), -1)
if label == -1: # Skip movies without desired genres
continue
# Collect labels
labels.append(label)
movie_to_director[len(labels) - 1] = director
movie_to_actors[len(labels) - 1] = [actor.strip() for actor in actors.split("|") if actor.strip()]
# Process plot keywords
plots = plot_keywords.split("|")
word_count = defaultdict(int)
plots = plot_keywords.split("|")
for plot in plots:
plot = plot.strip().replace(" ", "_")
plot = plot.strip().replace(" ", "_").lower() # Normalize to lowercase
if not plot:
continue
total_words[plot] += 1
word_count[plot] += 1
# Track keyword count for the specific label
label_plot_counts[labelNames[label]][plot] += 1
bag_of_words.append(dict(word_count))
# Initialize vocabulary
sorted_words = sorted(total_words.items(), key=lambda x: x[1], reverse=True)
vocabulary = [word for word, _ in sorted_words[:bag_of_words_size]]
grouped_labels = None
grouped_tensor = None
if groupKeywords:
if os.path.exists(groupedKeywordsPath):
with open(groupedKeywordsPath, "r") as json_file:
grouped_themes = json.load(json_file)
else:
result = group_keywords_into_themes(vocabulary, numberOfGroups)
grouped_themes = result["success"]
grouped_themes = {key.replace(' ', '_'): value for key, value in grouped_themes.items()}
with open(groupedKeywordsPath, "w") as json_file:
json.dump(grouped_themes, json_file, indent=4)
# Create a new bag-of-words model based on grouped labels
grouped_matrix = []
grouped_labels = list(grouped_themes.keys())
for word_count in bag_of_words:
theme_vector = [0] * len(grouped_labels)
for word, count in word_count.items():
for idx, theme in enumerate(grouped_labels):
if word in grouped_themes[theme]:
theme_vector[idx] += count
grouped_matrix.append(theme_vector)
# Convert grouped matrix to tensor
grouped_tensor = torch.tensor(grouped_matrix, dtype=torch.float32)
# Create feature matrix based on vocabulary
matrix = []
for item in bag_of_words:
word_vector = [item.get(word, 0) for word in vocabulary]
matrix.append(word_vector)
# Convert the matrix to a PyTorch tensor
tensor = torch.tensor(matrix, dtype=torch.float32)
if groupKeywords:
return labels, tensor, vocabulary, grouped_labels, grouped_tensor
else:
return labels, tensor, vocabulary
def _parse_csv_to_dict(file_path):
# Build the vocabulary from the most common keywords
vocabulary = [
word for word, _ in sorted(total_words.items(), key=lambda x: x[1], reverse=True)[:bag_of_words_size]
]
# Create the feature matrix
feature_matrix = [
[word_count.get(word, 0) for word in vocabulary] for word_count in bag_of_words
]
tensor = torch.tensor(feature_matrix, dtype=torch.float32)
return labels, tensor, vocabulary, movie_to_director, movie_to_actors
def _add_directors_and_edges(self, movie_to_director):
"""
Adds directors as nodes and creates edges between movies and directors.
Args:
movie_to_director (dict): A dictionary mapping movies to their directors.
"""
directors = list(set(movie_to_director.values()))
director_to_idx = {director: idx for idx, director in enumerate(directors)}
self.dataset['director'].num_nodes = len(directors)
movie_indices = []
director_indices = []
for movie_idx, director in movie_to_director.items():
movie_indices.append(movie_idx)
director_indices.append(director_to_idx[director])
edge_index = torch.tensor([movie_indices, director_indices], dtype=torch.long)
return edge_index
def _add_actors_and_edges(self, movie_to_actors):
"""
Adds actors as nodes and creates edges between movies and actors.
Args:
movie_to_actors (dict): A dictionary mapping movies to their actors.
"""
actors = set(actor for actor_list in movie_to_actors.values() for actor in actor_list)
actor_to_idx = {actor: idx for idx, actor in enumerate(actors)}
self.dataset['actor'].num_nodes = len(actors)
movie_indices = []
actor_indices = []
for movie_idx, actor_list in movie_to_actors.items():
for actor in actor_list:
movie_indices.append(movie_idx)
actor_indices.append(actor_to_idx[actor])
edge_index = torch.tensor([movie_indices, actor_indices], dtype=torch.long)
return edge_index
def fetch_themes(self, num_groups, groupedKeywordsPath=''):
"""
Groups tensor features into themes based on vocabulary words.
Args:
num_groups (int): Number of groups/themes to create
groupedKeywordsPath (str, optional): Path to save/load grouped keywords
Returns:
dict: High-level concepts keyed by node type, e.g. {'movie': {'themes': ..., 'presence_matrix': ...}}
- themes: Dictionary mapping theme names to lists of words
- presence_matrix: Tensor with one column per theme (theme counts per movie)
"""
grouped_themes = {}
vocabulary = self.dataset['movie'].xKeys
tensor = self.dataset['movie'].x
grouped_tensor, grouped_themes = group_themes(tensor, vocabulary, num_groups, groupedKeywordsPath)
high_level_concepts = {
"movie": {
"themes" : grouped_themes,
"presence_matrix": grouped_tensor
}
}
return high_level_concepts
def _parse_csv_to_dict(self, file_path):
"""
Parses a CSV file and returns a list of dictionaries where each dictionary
represents a row from the CSV with keys as column headers.
:param file_path: Path to the CSV file
:return: List of dictionaries representing the CSV data
Args:
file_path (str): Path to the CSV file.
Returns:
list: List of dictionaries representing the CSV data.
"""
try:
with open(file_path, mode='r', newline='', encoding='utf-8') as file:
......@@ -196,5 +233,5 @@ def _parse_csv_to_dict(file_path):
# Example usage
if __name__ == "__main__":
result = load_imdb(bag_of_words_size=100, groupKeywords=False)
print(result)
\ No newline at end of file
dataset = IMDB(path='rawData/imdb/sample.csv')
print(dataset.dataset)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch_geometric.datasets import BAMultiShapesDataset
from StructuredDatasets import StructuredDatasets
from customDBs.StructuredDataset import StructuredDataset
import networkx as nx
class MultiShape(StructuredDatasets):
class MultiShape(StructuredDataset):
"""
A class to process the BAMultiShapes dataset, find frequent patterns, and visualize graphs and patterns.
"""
......
from torch_geometric.data import HeteroData
from collections import defaultdict
import os
import re
import torch
from nltk.corpus import stopwords
import nltk
from ConceptLearner.Utils import group_themes
import csv
from datetime import datetime
# Ensure NLTK stopwords are available
try:
STOP_WORDS = set(stopwords.words('english'))
except LookupError:
nltk.download('stopwords')
STOP_WORDS = set(stopwords.words('english'))
class TextDataset:
def __init__(self, path, bag_of_words_size=100, remove_all_false_values=True):
self.path = path
self.bag_of_words_size = bag_of_words_size
self.remove_all_false_values = remove_all_false_values
self.dataset = HeteroData()
def load_dataset(self, path):
raise NotImplementedError("This method should be implemented by subclasses.")
def fetch_themes(self, num_groups, grouped_keywords_path=''):
raise NotImplementedError("This method should be implemented by subclasses.")
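A minimal sketch of a subclass following this contract, relying on the torch, HeteroData, and group_themes imports already at the top of this file; the node type, feature sizes, and default path are illustrative and not part of the commit.

class ToyTextDataset(TextDataset):
    """Illustrative subclass showing the two hooks a concrete dataset overrides."""

    def __init__(self, path='rawData/toy', bag_of_words_size=50):
        super().__init__(path=path, bag_of_words_size=bag_of_words_size)
        self.load_dataset(self.path)

    def load_dataset(self, path):
        # Populate self.dataset (a HeteroData) with at least one node type.
        self.dataset['doc'].x = torch.zeros((3, self.bag_of_words_size), dtype=torch.float32)
        self.dataset['doc'].xKeys = [f'word_{i}' for i in range(self.bag_of_words_size)]
        return self.dataset

    def fetch_themes(self, num_groups, grouped_keywords_path=''):
        # Group bag-of-words columns into themes, mirroring the DBLP and IMDB loaders.
        grouped_tensor, grouped_themes = group_themes(
            self.dataset['doc'].x, self.dataset['doc'].xKeys, num_groups, grouped_keywords_path)
        return {'doc': {'themes': grouped_themes, 'presence_matrix': grouped_tensor}}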