Added JAR builds for parent commits missing from the issue data & made the clustering algorithm a parameter

4b66cd6c · Michael Youkeim · 8ae8d01d · 4b66cd6c · 4b66cd6c · 4b66cd6c
Commit 4b66cd6c authored May 20, 2024 by Michael Youkeim
--- a/src/hadoop_analysis/build_manager.py
+++ b/src/hadoop_analysis/build_manager.py
@@ -60,9 +60,39 @@ def process_commit_hashes(commit_hashes, repo_path, output_dir):
        json.dump(java_versions, file, indent=2)


+def find_orphan_parents(json_file):
+    """Return parent hashes that are referenced but never listed as commits.
+
+    The file is read as a JSON object whose values are lists of commit
+    records; each record is consulted for its ``'Hash'`` and ``'Parents'``
+    entries. Records lacking either entry are tolerated instead of
+    discarding the whole result.
+
+    Args:
+        json_file: Path of the JSON file to read.
+
+    Returns:
+        The set of parent hashes that do not occur as any record's
+        ``'Hash'``. An empty set is returned (after printing a message)
+        when the file is missing, unreadable, not valid JSON, or not
+        shaped as described above.
+    """
+    # Keep the try body down to the I/O so the handlers below only ever
+    # report problems with reading or decoding the file.
+    try:
+        with open(json_file, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+    except FileNotFoundError:
+        print(f"Error: The file '{json_file}' does not exist.")
+        return set()
+    except (json.JSONDecodeError, UnicodeDecodeError):
+        print(f"Error: The file '{json_file}' contains invalid JSON.")
+        return set()
+    except OSError as e:
+        print(f"Error: The file '{json_file}' could not be read: {e}")
+        return set()
+
+    hashes = set()
+    parents = set()
+    try:
+        for commits in data.values():
+            for commit in commits:
+                commit_hash = commit.get('Hash')
+                if commit_hash:
+                    hashes.add(commit_hash)
+                commit_parents = commit.get('Parents') or []
+                # set.update() on a bare string would add its characters,
+                # so treat a single string as a one-element list.
+                if isinstance(commit_parents, str):
+                    commit_parents = [commit_parents]
+                parents.update(commit_parents)
+    except (AttributeError, TypeError) as e:
+        # Raised when the top level is not an object, a value is not a
+        # list of objects, or a parent entry is not hashable.
+        print(f"Error: The file '{json_file}' has an unexpected structure: {e}")
+        return set()
+
+    return parents - hashes
+
 def create_packages(repo_path, commit_info_file_path, output_dir):
    # Collects the "commit_hash" field of every entry in the JSON file at
    # commit_info_file_path and hands the resulting list, together with
    # repo_path and output_dir, to process_commit_hashes.
    # NOTE(review): assumes the JSON top level is a list of objects that
    # each carry a "commit_hash" key - confirm against the producer.
    # Open the file and load the data
    with open(commit_info_file_path, "r") as file:
        commit_hashes = [commit["commit_hash"] for commit in json.load(file)]

    process_commit_hashes(commit_hashes, repo_path, output_dir)
+
+def create_packages_for_parents(repo_path, issue_commit_relation_file, output_dir):
+    """Run the package build for parent commits missing from the issue data.
+
+    Args:
+        repo_path: Passed through to ``process_commit_hashes``.
+        issue_commit_relation_file: JSON file handed to
+            ``find_orphan_parents`` to collect the parent hashes.
+        output_dir: Passed through to ``process_commit_hashes``.
+    """
+    orphan_parents = find_orphan_parents(issue_commit_relation_file)
+    if not orphan_parents:
+        # Nothing to build, or the relation file could not be loaded.
+        return
+
+    # NOTE(review): process_commit_hashes appears to end by dumping a
+    # java_versions JSON file - confirm that this second call does not
+    # overwrite the results written by create_packages.
+    # Hand over a sorted list: create_packages also passes a list, and a
+    # set's iteration order would make the build order unpredictable.
+    process_commit_hashes(sorted(orphan_parents), repo_path, output_dir)
--- a/src/hadoop_analysis/clustering.py
+++ b/src/hadoop_analysis/clustering.py
@@ -61,7 +61,7 @@ class RepoClusterer:
                    print(
                        f"ACDC run completed for {rsf_file_path}. Output generated: {log_path}")

-    def run_clusterer(self):
+    def run_clusterer(self, algorithm):
        jar_path = os.path.join(self.lib_dir, "arcade_core_clusterer.jar")
        for subdir, dirs, files in os.walk(self.output_dir):
            for dir_name in dirs:
@@ -76,9 +76,9 @@ class RepoClusterer:
                        f"projname={self.project_name}",
                        f"projversion={dir_name}",
                        f"language={self.language}",
-                        f"algo=Limbo"
+                        f"algo={algorithm}"
                    ]
                    log_path = self.run_jar_with_logging(
-                        jar_path, args, dir_path, "clusterer_output_limbo.log")
+                        jar_path, args, dir_path, f"clusterer_output_{algorithm.lower()}.log")
                    print(
-                        f"Limbo clustering run completed for {dir_name}. Results saved to {log_path}")
+                        f"{algorithm} clustering run completed for {dir_name}. Results saved to {log_path}")
--- a/src/hadoop_analysis/main.py
+++ b/src/hadoop_analysis/main.py
@@ -4,12 +4,12 @@ import argparse
 import os

 from utils import setup_logging
-from build_manager import create_packages
+from build_manager import create_packages, create_packages_for_parents
 from issue_commit_linkage import link_issues_to_commits
 from visualization import create_charts

 from java_version_manager import switch_java_version
-from dependency_analyzer import analyze_dependencies, get_rsf_file_paths
+from dependency_analyzer import analyze_dependencies
 from clustering import RepoClusterer
 from clustering_result_analyzer import analyze_clustering_results

@@ -48,6 +48,7 @@ def main():
    switch_java_version("1.8")
    link_issues_to_commits(repo_path, issues_file_path, output_dir)
    create_packages(repo_path, commit_info_file_path, output_dir)
+    create_packages_for_parents(repo_path, os.path.join(output_dir, "issue_commit_relationships.json"), output_dir)

    # Visualization
    create_charts(commit_info_file_path, output_dir)
@@ -60,7 +61,8 @@ def main():
    clusterer =  RepoClusterer(output_dir)
    clusterer.run_pkg()
    clusterer.run_acdc()
-    clusterer.run_clusterer()
+    clusterer.run_clusterer("Limbo")
+    clusterer.run_clusterer("WCA")

    # Analyzing the clustering results
    analyze_clustering_results(output_dir)