Commit 3fe3a1fd authored by AjUm-HEIDI

Update text based datasets

parent 7733c9e8
......@@ -26,8 +26,9 @@ class BaseOWLConverter(ABC):
pass
class SingleGraphOWLConverter(BaseOWLConverter):
"""Converter for single HeteroData graphs"""
"""Converter for single HeteroData graphs with support for high-level concepts"""
def __init__(self, data: HeteroData, namespace: str, owlGraphPath: str,
high_level_concepts: dict = None,
create_data_properties: bool = True,
create_data_properties_as_object: bool = False,
add_edge_counts: bool = False,
......@@ -37,6 +38,7 @@ class SingleGraphOWLConverter(BaseOWLConverter):
**kwargs):
super().__init__(namespace, owlGraphPath)
self.dataset = data
self.high_level_concepts = high_level_concepts or {}
self.create_data_properties = create_data_properties
self.create_data_properties_as_object = create_data_properties_as_object
self.add_edge_counts = add_edge_counts
......@@ -107,12 +109,13 @@ class SingleGraphOWLConverter(BaseOWLConverter):
self.graph.add((propertyObjectProperty, RDFS.domain, classNamespace[node]))
self.graph.add((propertyObjectProperty, RDFS.range, xsdRange))
if "super_classes" in self.dataset[node]:
for super_class in self.dataset[node].super_classes:
propertyObjectProperty = classNamespace["has_theme_" + super_class]
self.graph.add((propertyObjectProperty, RDF.type, OWL.DatatypeProperty))
self.graph.add((propertyObjectProperty, RDFS.domain, classNamespace[node]))
self.graph.add((propertyObjectProperty, RDFS.range, XSD.boolean))
# Add high-level concepts if available for this node type
if node in self.high_level_concepts:
for theme in self.high_level_concepts[node].get('themes', []):
theme_namespace = classNamespace[f'has_theme_{theme}']
self.graph.add((theme_namespace, RDF.type, OWL.DatatypeProperty))
self.graph.add((theme_namespace, RDFS.domain, classNamespace[node]))
self.graph.add((theme_namespace, RDFS.range, XSD.boolean))
def _buildObjectProperties(self):
classNamespace = Namespace(self.namespace)
......@@ -128,6 +131,7 @@ class SingleGraphOWLConverter(BaseOWLConverter):
def _buildNodes(self):
classNamespace = Namespace(self.namespace)
for node_type in self.dataset.node_types:
if "x" in self.dataset[node_type]:
tensor_values = self.dataset[node_type].x
for row_idx, properties in enumerate(tensor_values):
......@@ -153,6 +157,16 @@ class SingleGraphOWLConverter(BaseOWLConverter):
val = True if val != 0 else False
if self.add_false_values or val or val != 0:
self.graph.add((newNode, propertyObjectProperty, Literal(val)))
if node_type in self.high_level_concepts:
high_level_concept = self.high_level_concepts[node_type]
presence_matrix = high_level_concept["presence_matrix"] # m x n tensor (rows: nodes, columns: themes)
themes = high_level_concept["themes"] # n theme names
for theme_idx, theme in enumerate(themes): # Iterate over theme columns
if presence_matrix[row_idx, theme_idx] != 0: # Theme is present for this node
self.graph.add((newNode, classNamespace[f'has_theme_{theme}'], Literal(True)))
if "num_nodes" in self.dataset[node_type]:
num_nodes = self.dataset[node_type].num_nodes
for idx in range(num_nodes):
......@@ -242,10 +256,13 @@ class SingleGraphOWLConverter(BaseOWLConverter):
self.graph.add((newNode, classNamespace[f'{nodeType}_outgoing'], Literal(nodeCounts[nodeType][node]["outgoing"])))
class MultiGraphOWLConverter(BaseOWLConverter):
"""Converter for datasets containing multiple graphs"""
def __init__(self, dataset: Data, namespace: str, owlGraphPath: str, n: int = None, ignore_nodes: bool = False, add_node_type: bool = True, **kwargs):
"""Converter for datasets containing multiple graphs with pattern support"""
def __init__(self, dataset: Data, namespace: str, owlGraphPath: str, n: int = None, ignore_nodes: bool = False, add_node_type: bool = True, high_level_concepts: dict = None, **kwargs):
super().__init__(namespace, owlGraphPath)
self.dataset = dataset
self.high_level_concepts = high_level_concepts or {}
self.patterns = self.high_level_concepts.get("patterns", [])
self.presence_matrix = self.high_level_concepts.get("presence_matrix", None)
self.ignore_nodes = ignore_nodes
self.add_node_type = add_node_type
self.n = n if n is not None and n <= len(self.dataset) else len(self.dataset)
......@@ -253,7 +270,7 @@ class MultiGraphOWLConverter(BaseOWLConverter):
print(f"Will process {self.n} instances out of {len(self.dataset)} total instances")
def _build(self):
"""Implements the building process for multiple graphs"""
"""Implements the building process for multiple graphs with pattern support"""
start_time = time.time()
print("Creating Dataset Classes...")
self._createDatasetClasses()
......@@ -270,15 +287,29 @@ class MultiGraphOWLConverter(BaseOWLConverter):
print(f"Data Properties built in {time.time() - start_time:.2f} seconds.")
def _createDatasetClasses(self):
"""Creates the basic classes for the dataset"""
ns = Namespace(self.namespace)
self.graph.add((ns.Graph, RDF.type, OWL.Class))
self.graph.add((ns.Node, RDF.type, OWL.Class))
if hasattr(self.dataset, 'super_classes'):
for structure in self.dataset.super_classes:
has_structure = ns['has_'+structure]
self.graph.add((has_structure, RDF.type, OWL.ObjectProperty))
self.graph.add((has_structure, RDFS.domain, ns.Graph))
self.graph.add((has_structure, RDFS.range, XSD.boolean))
has_motif = ns['has_motif_' + structure]
self.graph.add((has_motif, RDF.type, OWL.DatatypeProperty))
self.graph.add((has_motif, RDFS.domain, ns.Graph))
self.graph.add((has_motif, RDFS.range, XSD.boolean))
for i, pattern_name in enumerate(self.patterns):
has_pattern = ns[f'has_pattern_{pattern_name}']
self.graph.add((has_pattern, RDF.type, OWL.DatatypeProperty))
self.graph.add((has_pattern, RDFS.domain, ns.Graph))
self.graph.add((has_pattern, RDFS.range, XSD.boolean))
# Add a property to count the number of superstructures
ns.superstructure_count = ns["superstructure_count"]
self.graph.add((ns.superstructure_count, RDF.type, OWL.DatatypeProperty))
self.graph.add((ns.superstructure_count, RDFS.domain, ns.Graph))
self.graph.add((ns.superstructure_count, RDFS.range, XSD.integer))
# Add node type properties
if self.add_node_type:
......@@ -289,6 +320,7 @@ class MultiGraphOWLConverter(BaseOWLConverter):
self.graph.add((has_node_type, RDFS.range, XSD.boolean))
def _buildDatasetProperties(self):
"""Builds the basic properties for the dataset"""
ns = Namespace(self.namespace)
if not self.ignore_nodes:
self.graph.add((ns.contains, RDF.type, OWL.ObjectProperty))
......@@ -300,31 +332,32 @@ class MultiGraphOWLConverter(BaseOWLConverter):
self.graph.add((ns.connectedTo, RDFS.range, ns.Node))
def _buildDatasetIndividuals(self):
"""Builds dataset individuals with pattern information"""
ns = Namespace(self.namespace)
for idx in range(self.n):
if idx % 10 == 0:
print(f"Processing graph {idx}/{self.n}...")
data = self.dataset[idx]
superstructure_count = 0
# Create graph individual
graph_uri = ns[f'graph_{idx}']
self.graph.add((graph_uri, RDF.type, ns.Graph))
self.graph.add((graph_uri, RDF.type, OWL.NamedIndividual))
# Add super class relationships if present
if hasattr(data, 'super_classes'):
for structure in data.super_classes:
has_structure = ns['has_' + structure]
self.graph.add((graph_uri, has_structure, Literal(True)))
# Add pattern presence information
if self.patterns and self.presence_matrix is not None:
for pattern_idx, pattern in enumerate(self.patterns):
if self.presence_matrix[idx, pattern_idx]:
has_pattern = ns[f'has_pattern_{pattern}']
self.graph.add((graph_uri, has_pattern, Literal(True)))
superstructure_count += 1
self.graph.add((graph_uri, ns.superstructure_count, Literal(superstructure_count, datatype=XSD.integer)))
# Handle node type information
# Extract node features (if available)
node_type_tensor = getattr(data, 'x', None)
if node_type_tensor is None:
print(f"No node features (data.x) found for graph {idx}. Skipping node type processing.")
continue
if self.ignore_nodes:
# Skip adding individual nodes, only process superstructures
if self.add_node_type and node_type_tensor is not None:
# Add node type counts directly to graph
for type_idx in range(node_type_tensor.size(1)):
has_node_type = ns[f'has_node_type_{type_idx}']
......@@ -333,20 +366,19 @@ class MultiGraphOWLConverter(BaseOWLConverter):
self.graph.add((graph_uri, has_node_type, Literal(True)))
else:
# Add nodes with their types
if node_type_tensor is not None:
for node_idx in range(data.num_nodes):
node_uri = ns[f'node_{idx}_{node_idx}']
self.graph.add((node_uri, RDF.type, ns.Node))
self.graph.add((graph_uri, ns.contains, node_uri))
# Determine the node type
if self.add_node_type:
node_features = node_type_tensor[node_idx]
node_type = torch.argmax(node_features).item() if node_features.sum() > 0 else None
if node_type is not None:
has_node_type = ns[f'has_node_type_{node_type}']
self.graph.add((node_uri, has_node_type, Literal(True)))
# Add edges
edges = getattr(data, 'edge_index', None)
if edges is not None:
for edge in edges.t():
......@@ -355,27 +387,46 @@ class MultiGraphOWLConverter(BaseOWLConverter):
target_uri = ns[f'node_{idx}_{target.item()}']
self.graph.add((source_uri, ns.connectedTo, target_uri))
def convert_to_owl(data, namespace: str, owlGraphPath: str, **kwargs):
"""Factory function to create the appropriate converter based on input data type"""
def convert_to_owl(data, namespace: str, owlGraphPath: str, high_level_concepts: dict = None, **kwargs):
"""
Factory function to create the appropriate converter based on input data type
Args:
data: Input data (HeteroData or regular Data)
namespace (str): Namespace for the OWL ontology
owlGraphPath (str): Path where the OWL file will be saved
high_level_concepts (dict): For HeteroData, a dictionary mapping node types to their concepts
For regular Data, a dict with 'patterns' and 'presence_matrix' entries
**kwargs: Additional arguments passed to the converter
Returns:
BaseOWLConverter: An instance of the appropriate converter
"""
if isinstance(data, HeteroData):
return SingleGraphOWLConverter(data, namespace, owlGraphPath, **kwargs)
return SingleGraphOWLConverter(data, namespace, owlGraphPath, high_level_concepts, **kwargs)
else:
return MultiGraphOWLConverter(data, namespace, owlGraphPath, **kwargs)
return MultiGraphOWLConverter(data, namespace, owlGraphPath, high_level_concepts=high_level_concepts, **kwargs)
if __name__ == "__main__":
# Example 1: Multiple graphs usage
# Example usage with MUTAG dataset
from customDBs.MUTAG import MUTAG
dataset = MUTAG(path='../rawData/MUTAG', add_patterns=True)
from customDBs.MultiShape import MultiShape
mutag = MUTAG(path='../rawData/MUTAG')
ms = MultiShape(path='./rawData/BAMultiShapes')
# Detect motifs and get patterns
patterns, presence_matrix = mutag.detect_motifs(mutag.dataset)
# Convert to OWL
converter = convert_to_owl(
data=dataset.dataset,
data=mutag.dataset,
namespace="http://example.org/",
owlGraphPath="./owlGraphs/mutage_multi3.owl",
ignore_nodes=True,
n=5
owlGraphPath="./owlGraphs/mutag_with_patterns.owl",
high_level_concepts={
"patterns": patterns,
"presence_matrix": presence_matrix
},
ignore_nodes=True
)
converter.buildGraph()
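For the HeteroData path, SingleGraphOWLConverter reads high_level_concepts as a per-node-type dictionary with 'themes' and 'presence_matrix' entries (see _createDataProperties and _buildNodes above). The sketch below only illustrates that layout; the node type, theme names, and values are made up:

# Sketch of the high_level_concepts layout expected by SingleGraphOWLConverter.
# The node type and theme names are illustrative; only the structure matters:
#   {node_type: {'themes': [n theme names],
#                'presence_matrix': m x n tensor, one row per node of that type}}
import torch

example_high_level_concepts = {
    'paper': {
        'themes': ['machine_learning', 'databases'],
        'presence_matrix': torch.tensor([[1, 0],
                                         [0, 1]]),
    }
}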
......
from collections import defaultdict
from datetime import datetime
import argparse
import json
import os
import re
import torch
from torch_geometric.data import HeteroData
from ConceptLearner.Utils import group_keywords_into_themes, clean_false_entries_in_dataset, group_themes
from ConceptLearner.Utils import group_themes
from nltk.corpus import stopwords
import nltk
try:
stop_words = set(stopwords.words('english'))
except LookupError:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from customDBs.TextDataset import TextDataset
# Define label categories as constants
LABELS = ["Database", "Data Mining", "Artificial Intelligence", "Information Retrieval"]
def load_dblp(path="./rawData/dblp", bag_of_words_size=100, groupKeywords=True, groupedKeywordsPath='', removeAllFalseValues=True):
class DBLP(TextDataset):
def __init__(self, path='rawData/dblp', bag_of_words_size=100, remove_all_false_values=True):
"""
Loads the DBLP dataset and constructs a HeteroData object for PyTorch Geometric.
Args:
path (str): Path to the DBLP data directory.
bag_of_words_size (int): Number of top words to consider if grouping is disabled.
groupKeywords (bool): Whether to group keywords into themes.
groupedKeywordsPath (str): Path to save/load grouped keywords.
removeAllFalseValues (bool): Whether to clean false entries in the dataset.
Returns:
HeteroData: The constructed heterogeneous graph data.
"""
if not groupedKeywordsPath:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
groupedKeywordsPath = os.path.join(path, f"groupedKeywords_{timestamp}.json")
super().__init__(path=path, bag_of_words_size=bag_of_words_size, remove_all_false_values=remove_all_false_values)
self._initialize()
def _initialize(self):
"""
Loads and processes the DBLP data, constructing a HeteroData object with author, paper, and conference nodes.
Paper nodes get bag-of-words features over the top bag_of_words_size keywords.
"""
try:
self.stop_words = set(stopwords.words('english'))
except LookupError:
nltk.download('stopwords')
self.stop_words = set(stopwords.words('english'))
# Load authors
author_ids, author_labels, author_id_dict = _get_authors(path)
author_ids, author_labels, author_id_dict = self._get_authors()
# Load papers and grouped themes
paper_tensor, paper_id_dict, bag_of_words = _get_papers(path, bag_of_words_size, groupKeywords, groupedKeywordsPath)
paper_tensor, paper_id_dict, bag_of_words = self._get_papers()
author_paper_mappings = _get_author_paper_mappings(path, author_id_dict, paper_id_dict)
conf_ids, conf_id_dict = _get_conference(path)
paper_conference_mappings = _get_paper_conference_mappings(path, paper_id_dict, conf_id_dict)
# Construct HeteroData
dataset = HeteroData()
author_paper_mappings = self._get_author_paper_mappings(author_id_dict, paper_id_dict)
conf_ids, conf_id_dict = self._get_conference()
paper_conference_mappings = self._get_paper_conference_mappings(paper_id_dict, conf_id_dict)
# Author Nodes
dataset['author'].num_nodes = len(author_labels)
dataset['author'].y = torch.tensor(author_labels)
dataset['author'].yLabel = LABELS
dataset['author'].x = torch.zeros((dataset['author'].num_nodes, 1), dtype=torch.float32) # Dummy feature matrix
self.dataset['author'].num_nodes = len(author_labels)
self.dataset['author'].y = torch.tensor(author_labels)
self.dataset['author'].yLabel = ["Database", "Data Mining", "Artificial Intelligence", "Information Retrieval"]
self.dataset['author'].x = torch.zeros((self.dataset['author'].num_nodes, 1), dtype=torch.float32) # Dummy feature matrix
# Paper Nodes
dataset['paper'].x = paper_tensor.float()
dataset['paper'].xKeys = bag_of_words
self.dataset['paper'].x = paper_tensor.float()
self.dataset['paper'].xKeys = bag_of_words
# Author-Paper Edges
dataset['author', 'writes', 'paper'].edge_index = author_paper_mappings.t()
dataset['paper', 'written_by', 'author'].edge_index = author_paper_mappings.t()[[1, 0], :]
self.dataset['author', 'writes', 'paper'].edge_index = author_paper_mappings.t()
self.dataset['paper', 'written_by', 'author'].edge_index = author_paper_mappings.t()[[1, 0], :]
dataset['conference'].x = torch.tensor(conf_ids)
dataset['paper', 'published_in', 'conference'].edge_index = paper_conference_mappings.t()
self.dataset['conference'].num_nodes = len(conf_ids) # Conferences carry no features; only the node count is recorded
self.dataset['paper', 'published_in', 'conference'].edge_index = paper_conference_mappings.t()
return dataset
return self.dataset
def _get_authors(path):
def _get_authors(self):
"""
Reads the author_label.txt file and extracts author IDs and labels.
......@@ -79,7 +79,7 @@ def _get_authors(path):
Returns:
tuple: (ids, labels, id_dict)
"""
file_path = os.path.join(path, "author_label.txt")
file_path = os.path.join(self.path, "author_label.txt")
ids = []
labels = []
......@@ -104,20 +104,13 @@ def _get_authors(path):
id_dict = {id: idx for idx, id in enumerate(ids)}
# Validate labels
if labels and max(labels) >= len(LABELS):
print(f"Encountered a label {max(labels)} outside the defined yLabel categories.")
raise ValueError("Invalid label found in author_label.txt.")
return ids, labels, id_dict
def fetch_themes(num_groups, groupedKeywordsPath=''):
def fetch_themes(self, num_groups, groupedKeywordsPath=''):
"""
Groups tensor features into themes based on vocabulary words.
Args:
tensor (torch.Tensor): Original tensor with word frequencies
vocabulary (list): List of words corresponding to tensor columns
num_groups (int): Number of groups/themes to create
groupedKeywordsPath (str, optional): Path to save/load grouped keywords
......@@ -128,8 +121,8 @@ def fetch_themes(num_groups, groupedKeywordsPath=''):
"""
grouped_themes = {}
vocabulary = dataset['paper'].xKeys
tensor = dataset['paper'].x
vocabulary = self.dataset['paper'].xKeys
tensor = self.dataset['paper'].x
grouped_tensor, grouped_themes = group_themes(tensor, vocabulary, num_groups, groupedKeywordsPath)
......@@ -142,20 +135,14 @@ def fetch_themes(num_groups, groupedKeywordsPath=''):
return high_level_concepts
def _get_papers(path, bag_of_words_size):
def _get_papers(self):
"""
Reads the paper.txt file and processes the text into a tensor.
Args:
path (str): Path to the DBLP data directory.
bag_of_words_size (int): Number of top words to consider.
groupKeywords (bool): Whether to group keywords into themes.
groupedKeywordsPath (str): Path to save/load grouped keywords.
Returns:
tuple: (paper_tensor, paper_id_dict, vocabulary)
"""
file_path = os.path.join(path, "paper.txt")
file_path = os.path.join(self.path, "paper.txt")
bag_of_words = []
vocabulary = []
total_words = defaultdict(int)
......@@ -177,7 +164,7 @@ def _get_papers(path, bag_of_words_size):
words = re.findall(r'\w+', text.lower()) # Tokenize and convert to lowercase
word_count = defaultdict(int)
for word in words:
if word in stop_words:
if word in self.stop_words:
continue
if word not in total_words:
vocabulary.append(word)
......@@ -194,7 +181,7 @@ def _get_papers(path, bag_of_words_size):
raise
# Get top N words for vocabulary
top_n = bag_of_words_size
top_n = self.bag_of_words_size
total_words = dict(sorted(total_words.items(), key=lambda item: item[1], reverse=True)[:top_n])
vocabulary = list(total_words.keys())
......@@ -209,19 +196,18 @@ def _get_papers(path, bag_of_words_size):
return paper_tensor, paper_id_dict, vocabulary
def _get_author_paper_mappings(path, author_id_dict, paper_id_dict):
def _get_author_paper_mappings(self, author_id_dict, paper_id_dict):
"""
Reads the paper_author.txt file and creates author-paper edge mappings.
Args:
path (str): Path to the DBLP data directory.
author_id_dict (dict): Mapping from author IDs to indices.
paper_id_dict (dict): Mapping from paper IDs to indices.
Returns:
torch.Tensor: Edge indices tensor of shape [2, num_edges].
"""
file_path = os.path.join(path, "paper_author.txt")
file_path = os.path.join(self.path, "paper_author.txt")
mappings = []
......@@ -252,7 +238,7 @@ def _get_author_paper_mappings(path, author_id_dict, paper_id_dict):
return mappings
def _get_conference(path):
def _get_conference(self):
"""
Reads the conf.txt file and extracts conference IDs and names.
......@@ -262,7 +248,7 @@ def _get_conference(path):
Returns:
tuple: (ids, id_dict)
"""
file_path = os.path.join(path, "conf.txt")
file_path = os.path.join(self.path, "conf.txt")
ids = []
try:
......@@ -286,19 +272,18 @@ def _get_conference(path):
id_dict = {id: idx for idx, id in enumerate(ids)}
return ids, id_dict
def _get_paper_conference_mappings(path, paper_id_dict, conf_id_dict):
def _get_paper_conference_mappings(self, paper_id_dict, conf_id_dict):
"""
Reads the paper_conf.txt file and creates paper-conference edge mappings.
Args:
path (str): Path to the DBLP data directory.
paper_id_dict (dict): Mapping from paper IDs to indices.
conf_id_dict (dict): Mapping from conference IDs to indices.
Returns:
torch.Tensor: Edge indices tensor of shape [2, num_edges].
"""
file_path = os.path.join(path, "paper_conf.txt")
file_path = os.path.join(self.path, "paper_conf.txt")
mappings = []
......@@ -331,5 +316,5 @@ def _get_paper_conference_mappings(path, paper_id_dict, conf_id_dict):
if __name__ == "__main__":
dataset = load_dblp(path="/Users/ajay/Documents/gitlabThesis/Thesis/rawData/customDblp")
print(dataset)
dblp = DBLP()
print(dblp.dataset)
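A usage sketch tying this loader to the OWL converter above. This is illustrative only: the customDBs.DBLP module path is assumed from the other imports in this commit, the converter module name is not visible here so its import is a placeholder, and fetch_themes is assumed to key its result by the 'paper' node type, mirroring the IMDB loader.

# Sketch: group DBLP paper keywords into themes and export the graph to OWL.
# The converter import path and output file name are placeholders.
from customDBs.DBLP import DBLP
from GraphToOWL import convert_to_owl  # placeholder: use the converter module's actual name

dblp = DBLP(path='rawData/dblp', bag_of_words_size=100)
high_level_concepts = dblp.fetch_themes(num_groups=5)  # expected: {'paper': {'themes': ..., 'presence_matrix': ...}}

converter = convert_to_owl(
    data=dblp.dataset,
    namespace="http://example.org/",
    owlGraphPath="./owlGraphs/dblp_with_themes.owl",
    high_level_concepts=high_level_concepts,
)
converter.buildGraph()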
from collections import defaultdict
from datetime import datetime
import json
import torch
from torch_geometric.data import HeteroData
from ConceptLearner.Utils import group_keywords_into_themes, clean_false_entries_in_dataset
import csv
import os
import sys
from ConceptLearner.Utils import group_themes
def load_imdb(path="rawData/imdb", bag_of_words_size=25, groupKeywords=True, groupedKeywordsPath='', removeAllFalseValues=True, numberOfGroups=5):
class IMDB:
def __init__(self, path='rawData/imdb/movie_metadata.csv', bag_of_words_size=100, remove_all_false_values=True):
"""
Loads and processes IMDb movie data, returning it as a HeteroData object suitable for PyTorch Geometric.
Each label (genre) gets its own top bag_of_words_size keywords.
:param path: Path to the directory containing the 'movie_metadata.csv' file.
:param bag_of_words_size: Number of top keywords or themes to include per label.
:param groupKeywords: Whether to group keywords into themes.
:param groupedKeywordsPath: Path to save or load grouped themes JSON file.
:param removeAllFalseValues: Whether to remove entries with zero bag of words.
:param numberOfGroups: Number of themes to group keywords into if grouping is enabled.
:return: A HeteroData object containing movie features and labels.
"""
# Construct the file path using os.path.join for cross-platform compatibility
file_path = os.path.join(path, "movie_metadata.csv")
Initialize the IMDB dataset.
if not groupedKeywordsPath:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
groupedKeywordsPath = os.path.join(path, f"groupedKeywords_{timestamp}.json")
Args:
path (str): Path to the IMDB directory.
"""
self.path = path
self.bag_of_words_size = bag_of_words_size
self.remove_all_false_values = remove_all_false_values
self.dataset = HeteroData()
self._initialize()
# Check if the file exists
if not os.path.exists(file_path):
raise FileNotFoundError(f"Error: The file '{file_path}' does not exist.")
def _initialize(self):
"""
Loads and processes IMDb movie data, adding movies, directors, and actors as nodes,
and creating edges between them.
"""
# Parse CSV data into a list of dictionaries
data_list = _parse_csv_to_dict(file_path)
data_list = self._parse_csv_to_dict(self.path)
# Process movies and extract features
movie_labels, movie_tensor, bag_of_words, grouped_labels, grouped_tensor = _get_movies(
data_list, bag_of_words_size, groupKeywords, groupedKeywordsPath, numberOfGroups
movie_labels, movie_tensor, plot_keywords, movie_to_director, movie_to_actors = self._get_movies(
data_list, self.bag_of_words_size
)
# Remove rows with zero bag of words if requested
if removeAllFalseValues:
# Find indices where sum of features is not zero
if groupKeywords and grouped_tensor is not None:
non_zero_indices = torch.where(grouped_tensor.sum(dim=1) != 0)[0]
else:
non_zero_indices = torch.where(movie_tensor.sum(dim=1) != 0)[0]
# Filter tensors and labels
movie_labels = [movie_labels[i] for i in non_zero_indices]
movie_tensor = movie_tensor[non_zero_indices]
# Filter out movies without the desired genres or zero bag-of-words features
valid_movie_indices = torch.where(movie_tensor.sum(dim=1) != 0)[0]
filtered_index_map = {i.item(): idx for idx, i in enumerate(valid_movie_indices)}
# Apply the filtering logic
movie_labels = [movie_labels[i] for i in valid_movie_indices]
movie_tensor = movie_tensor[valid_movie_indices]
movie_to_director = {
filtered_index_map[movie_idx]: director
for movie_idx, director in movie_to_director.items()
if movie_idx in filtered_index_map
}
movie_to_actors = {
filtered_index_map[movie_idx]: actors
for movie_idx, actors in movie_to_actors.items()
if movie_idx in filtered_index_map
}
if groupKeywords and grouped_tensor is not None:
grouped_tensor = grouped_tensor[non_zero_indices]
# Prepare movie nodes
self.dataset['movie'].x = movie_tensor
self.dataset['movie'].y = torch.tensor(movie_labels)
self.dataset['movie'].xKeys = plot_keywords
self.dataset['movie'].yLabel = ["Action", "Comedy", "Drama"]
# Prepare HeteroData object
dataset = HeteroData()
dataset['movie'].y = torch.tensor(movie_labels)
dataset['movie'].x = movie_tensor
dataset['movie'].xKeys = bag_of_words
dataset['movie'].yLabel = ["Action", "Comedy", "Drama"]
# Add director and actor nodes and edges
movie_to_director_edges = self._add_directors_and_edges(movie_to_director)
self.dataset['movie', 'to', 'director'].edge_index = movie_to_director_edges
self.dataset['director', 'to', 'movie'].edge_index = movie_to_director_edges.flip(0)
if groupKeywords and grouped_tensor is not None:
dataset['movie'].super_classes = grouped_labels
dataset['movie'].super_class_values = grouped_tensor
return dataset
movie_to_actors_edges = self._add_actors_and_edges(movie_to_actors)
self.dataset['movie', 'to', 'actor'].edge_index = movie_to_actors_edges
self.dataset['actor', 'to', 'movie'].edge_index = movie_to_actors_edges.flip(0)
return self.dataset
def _get_movies(data_list, bag_of_words_size, groupKeywords, groupedKeywordsPath, numberOfGroups):
def _get_movies(self, data_list, bag_of_words_size):
"""
Processes the list of movie data dictionaries to extract features and labels.
:param data_list: List of dictionaries containing movie data.
:param bag_of_words_size: Number of top keywords or themes to include.
:param groupKeywords: Whether to group keywords into themes.
:param groupedKeywordsPath: Path to save or load grouped themes JSON file.
:param numberOfGroups: Number of themes to group keywords into if grouping is enabled.
:return: Tuple of labels list, feature tensor, vocabulary list, and optionally grouped labels and tensor if groupKeywords is True.
Processes movie data to extract features, labels, and mappings.
Args:
data_list (list): List of dictionaries containing movie data.
bag_of_words_size (int): Number of top keywords to include in the vocabulary.
Returns:
tuple: A tuple containing:
- labels: List of labels (one per movie).
- tensor: Tensor with word frequencies.
- vocabulary: List of words in the vocabulary.
- movie_to_director: Mapping of movies to directors.
- movie_to_actors: Mapping of movies to actors.
"""
total_words = defaultdict(int)
labels = []
bag_of_words = []
labelNames = ["Action", "Comedy", "Drama"]
rating_order = {
"G": [1, 0, 0, 0, 0],
"PG": [0, 1, 0, 0, 0],
"PG-13": [0, 0, 1, 0, 0],
"R": [0, 0, 0, 1, 0],
"NC-17": [0, 0, 0, 0, 1]
}
# Dictionary to track keyword counts for each label
label_plot_counts = {labelName: defaultdict(int) for labelName in labelNames}
movie_to_director = {}
movie_to_actors = {}
label_names = ["Action", "Comedy", "Drama"]
# Process each movie entry
for data in data_list:
for idx, data in enumerate(data_list):
genres = data.get("genres", "").split("|")
content_rating = data.get("content_rating", "")
plot_keywords = data.get("plot_keywords", "")
director = data.get("director_name", "")
actors = data.get("actor_1_name", "") + "|" + data.get("actor_2_name", "") + "|" + data.get("actor_3_name", "")
# Determine label based on genres
label = -1
for label_idx, labelName in enumerate(labelNames):
if labelName in genres:
label = label_idx
break # Stop at the first matching genre
if label == -1 or content_rating not in rating_order:
continue # Skip movies without desired genres or content ratings
# Assign a label based on genres
label = next((idx for idx, name in enumerate(label_names) if name in genres), -1)
if label == -1: # Skip movies without desired genres
continue
# Collect labels
labels.append(label)
movie_to_director[len(labels) - 1] = director
movie_to_actors[len(labels) - 1] = [actor.strip() for actor in actors.split("|") if actor.strip()]
# Process plot keywords
plots = plot_keywords.split("|")
word_count = defaultdict(int)
plots = plot_keywords.split("|")
for plot in plots:
plot = plot.strip().replace(" ", "_")
plot = plot.strip().replace(" ", "_").lower() # Normalize to lowercase
if not plot:
continue
total_words[plot] += 1
word_count[plot] += 1
# Track keyword count for the specific label
label_plot_counts[labelNames[label]][plot] += 1
bag_of_words.append(dict(word_count))
# Initialize vocabulary
sorted_words = sorted(total_words.items(), key=lambda x: x[1], reverse=True)
vocabulary = [word for word, _ in sorted_words[:bag_of_words_size]]
grouped_labels = None
grouped_tensor = None
if groupKeywords:
if os.path.exists(groupedKeywordsPath):
with open(groupedKeywordsPath, "r") as json_file:
grouped_themes = json.load(json_file)
else:
result = group_keywords_into_themes(vocabulary, numberOfGroups)
grouped_themes = result["success"]
grouped_themes = {key.replace(' ', '_'): value for key, value in grouped_themes.items()}
with open(groupedKeywordsPath, "w") as json_file:
json.dump(grouped_themes, json_file, indent=4)
# Create a new bag-of-words model based on grouped labels
grouped_matrix = []
grouped_labels = list(grouped_themes.keys())
for word_count in bag_of_words:
theme_vector = [0] * len(grouped_labels)
for word, count in word_count.items():
for idx, theme in enumerate(grouped_labels):
if word in grouped_themes[theme]:
theme_vector[idx] += count
grouped_matrix.append(theme_vector)
# Convert grouped matrix to tensor
grouped_tensor = torch.tensor(grouped_matrix, dtype=torch.float32)
# Create feature matrix based on vocabulary
matrix = []
for item in bag_of_words:
word_vector = [item.get(word, 0) for word in vocabulary]
matrix.append(word_vector)
# Convert the matrix to a PyTorch tensor
tensor = torch.tensor(matrix, dtype=torch.float32)
if groupKeywords:
return labels, tensor, vocabulary, grouped_labels, grouped_tensor
else:
return labels, tensor, vocabulary
def _parse_csv_to_dict(file_path):
# Build the vocabulary from the most common keywords
vocabulary = [
word for word, _ in sorted(total_words.items(), key=lambda x: x[1], reverse=True)[:bag_of_words_size]
]
# Create the feature matrix
feature_matrix = [
[word_count.get(word, 0) for word in vocabulary] for word_count in bag_of_words
]
tensor = torch.tensor(feature_matrix, dtype=torch.float32)
return labels, tensor, vocabulary, movie_to_director, movie_to_actors
def _add_directors_and_edges(self, movie_to_director):
"""
Adds directors as nodes and creates edges between movies and directors.
Args:
movie_to_director (dict): A dictionary mapping movies to their directors.
"""
directors = list(set(movie_to_director.values()))
director_to_idx = {director: idx for idx, director in enumerate(directors)}
self.dataset['director'].num_nodes = len(directors)
movie_indices = []
director_indices = []
for movie_idx, director in movie_to_director.items():
movie_indices.append(movie_idx)
director_indices.append(director_to_idx[director])
edge_index = torch.tensor([movie_indices, director_indices], dtype=torch.long)
return edge_index
def _add_actors_and_edges(self, movie_to_actors):
"""
Adds actors as nodes and creates edges between movies and actors.
Args:
movie_to_actors (dict): A dictionary mapping movies to their actors.
"""
actors = set(actor for actor_list in movie_to_actors.values() for actor in actor_list)
actor_to_idx = {actor: idx for idx, actor in enumerate(actors)}
self.dataset['actor'].num_nodes = len(actors)
movie_indices = []
actor_indices = []
for movie_idx, actor_list in movie_to_actors.items():
for actor in actor_list:
movie_indices.append(movie_idx)
actor_indices.append(actor_to_idx[actor])
edge_index = torch.tensor([movie_indices, actor_indices], dtype=torch.long)
return edge_index
def fetch_themes(self, num_groups, groupedKeywordsPath=''):
"""
Groups tensor features into themes based on vocabulary words.
Args:
num_groups (int): Number of groups/themes to create
groupedKeywordsPath (str, optional): Path to save/load grouped keywords
Returns:
dict: High-level concepts keyed by node type, e.g. {'movie': {'themes': ..., 'presence_matrix': ...}}
- themes: Dictionary mapping theme names to lists of words
- presence_matrix: Tensor with one column per theme (theme counts per movie)
"""
grouped_themes = {}
vocabulary = self.dataset['movie'].xKeys
tensor = self.dataset['movie'].x
grouped_tensor, grouped_themes = group_themes(tensor, vocabulary, num_groups, groupedKeywordsPath)
high_level_concepts = {
"movie": {
"themes" : grouped_themes,
"presence_matrix": grouped_tensor
}
}
return high_level_concepts
def _parse_csv_to_dict(self, file_path):
"""
Parses a CSV file and returns a list of dictionaries where each dictionary
represents a row from the CSV with keys as column headers.
:param file_path: Path to the CSV file
:return: List of dictionaries representing the CSV data
Args:
file_path (str): Path to the CSV file.
Returns:
list: List of dictionaries representing the CSV data.
"""
try:
with open(file_path, mode='r', newline='', encoding='utf-8') as file:
......@@ -196,5 +233,5 @@ def _parse_csv_to_dict(file_path):
# Example usage
if __name__ == "__main__":
result = load_imdb(bag_of_words_size=100, groupKeywords=False)
print(result)
\ No newline at end of file
dataset = IMDB(path='rawData/imdb/sample.csv')
print(dataset.dataset)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch_geometric.datasets import BAMultiShapesDataset
from StructuredDatasets import StructuredDatasets
from customDBs.StructuredDataset import StructuredDataset
import networkx as nx
class MultiShape(StructuredDatasets):
class MultiShape(StructuredDataset):
"""
A class to process the BAMultiShapes dataset, find frequent patterns, and visualize graphs and patterns.
"""
......
from torch_geometric.data import HeteroData
from collections import defaultdict
import os
import re
import torch
from nltk.corpus import stopwords
import nltk
from ConceptLearner.Utils import group_themes
import csv
from datetime import datetime
# Ensure NLTK stopwords are available
try:
STOP_WORDS = set(stopwords.words('english'))
except LookupError:
nltk.download('stopwords')
STOP_WORDS = set(stopwords.words('english'))
class TextDataset:
def __init__(self, path, bag_of_words_size=100, remove_all_false_values=True):
self.path = path
self.bag_of_words_size = bag_of_words_size
self.remove_all_false_values = remove_all_false_values
self.dataset = HeteroData()
def load_dataset(self, path):
raise NotImplementedError("This method should be implemented by subclasses.")
def fetch_themes(self, num_groups, grouped_keywords_path=''):
raise NotImplementedError("This method should be implemented by subclasses.")
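A minimal sketch of a subclass following this contract, relying on the torch, HeteroData, and group_themes imports already at the top of this file; the node type, feature sizes, and default path are illustrative and not part of the commit.

class ToyTextDataset(TextDataset):
    """Illustrative subclass showing the two hooks a concrete dataset overrides."""

    def __init__(self, path='rawData/toy', bag_of_words_size=50):
        super().__init__(path=path, bag_of_words_size=bag_of_words_size)
        self.load_dataset(self.path)

    def load_dataset(self, path):
        # Populate self.dataset (a HeteroData) with at least one node type.
        self.dataset['doc'].x = torch.zeros((3, self.bag_of_words_size), dtype=torch.float32)
        self.dataset['doc'].xKeys = [f'word_{i}' for i in range(self.bag_of_words_size)]
        return self.dataset

    def fetch_themes(self, num_groups, grouped_keywords_path=''):
        # Group bag-of-words columns into themes, mirroring the DBLP and IMDB loaders.
        grouped_tensor, grouped_themes = group_themes(
            self.dataset['doc'].x, self.dataset['doc'].xKeys, num_groups, grouped_keywords_path)
        return {'doc': {'themes': grouped_themes, 'presence_matrix': grouped_tensor}}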