Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
DSSE_Group1
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Neha Pokharel
DSSE_Group1
Commits
9da6dce9
There was an error fetching the commit references. Please try again later.
Commit
9da6dce9
authored
May 13, 2024
by
michaelyoukeim
Browse files
Options
Downloads
Patches
Plain Diff
Eliminated repetitions in the clustering code
parent
33cd98c7
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/hadoop_analysis/clustering.py
+81
-163
81 additions, 163 deletions
src/hadoop_analysis/clustering.py
with
81 additions
and
163 deletions
src/hadoop_analysis/clustering.py
+
81
−
163
View file @
9da6dce9
from
jar_runner
import
run_jar
import
os
import
json
import
pandas
as
pd
from
jar_runner
import
run_jar
def
run_pkg
(
output_dir
):
base_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
lib_dir
=
os
.
path
.
join
(
base_dir
,
"
../../lib
"
)
pkg_jar_path
=
os
.
path
.
join
(
lib_dir
,
"
arcade_core-Pkg.jar
"
)
project_name
=
"
Hadoop common
"
language
=
"
java
"
class
RepoClusterer
:
def
__init__
(
self
,
output_dir
):
self
.
output_dir
=
output_dir
self
.
base_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
self
.
lib_dir
=
os
.
path
.
join
(
self
.
base_dir
,
"
../../lib
"
)
self
.
project_name
=
"
Hadoop common
"
self
.
language
=
"
java
"
def find_rsf_file(self, dir_path):
    """Return the full path of ``dependencies.rsf`` inside *dir_path*.

    Returns None when the file does not exist.
    """
    candidate = os.path.join(dir_path, "dependencies.rsf")
    return candidate if os.path.exists(candidate) else None
def run_jar_with_logging(self, jar_path, args, cwd, log_filename):
    """Execute a jar via ``run_jar`` and persist its output to a log file.

    Writes stdout (or stderr when stdout is empty) to *log_filename*
    inside *cwd* and returns the log file's full path.
    """
    out, err = run_jar(jar_path, args=args, cwd=cwd)
    log_path = os.path.join(cwd, log_filename)
    with open(log_path, "w") as fh:
        fh.write(out if out else err)
    return log_path
def run_pkg(self):
    """Run the PKG clustering jar on every commit directory.

    Walks ``self.output_dir``; each subdirectory name is treated as the
    commit ID (project version). Directories containing a
    ``dependencies.rsf`` are clustered and the jar's output is written to
    ``pkg_output.log`` via ``run_jar_with_logging``.
    """
    jar_path = os.path.join(self.lib_dir, "arcade_core-Pkg.jar")
    file_level = "true"  # PKG expects a string flag, not a Python bool
    for subdir, dirs, _files in os.walk(self.output_dir):
        for dir_name in dirs:
            # The name of the directory is used as the project_version (commit ID)
            dir_path = os.path.join(subdir, dir_name)
            rsf_file_path = self.find_rsf_file(dir_path)
            if not rsf_file_path:
                continue
            # One entry per PKG option; duplicates from the old code removed.
            args = [
                f"depspath={rsf_file_path}",
                f"projectpath={dir_path}",
                f"projectname={self.project_name}",
                f"projectversion={dir_name}",
                f"language={self.language}",
                f"filelevel={file_level}",
            ]
            log_path = self.run_jar_with_logging(
                jar_path, args, dir_path, "pkg_output.log"
            )
            print(
                f"PKG Clustering completed for commit {dir_name}. "
                f"Results saved to {log_path}"
            )
def run_acdc(self):
    """Run the ACDC clustering jar on every commit directory.

    For each subdirectory of ``self.output_dir`` that contains a
    ``dependencies.rsf``, runs ACDC with that file as input and writes
    ``<stem>_acdc_output.rsf`` (and a matching ``.log``) alongside it.
    """
    jar_path = os.path.join(self.lib_dir, "arcade_core-ACDC.jar")
    for subdir, dirs, _files in os.walk(self.output_dir):
        for dir_name in dirs:
            dir_path = os.path.join(subdir, dir_name)
            rsf_path = self.find_rsf_file(dir_path)
            if not rsf_path:
                continue
            # find_rsf_file already returns the full path; joining it onto
            # dir_path again (as the previous code did) duplicated the
            # directory prefix. Derive the output stem from the basename.
            stem = os.path.splitext(os.path.basename(rsf_path))[0]
            args = [
                rsf_path,
                os.path.join(dir_path, f"{stem}_acdc_output.rsf"),
            ]
            log_path = self.run_jar_with_logging(
                jar_path, args, dir_path, f"{stem}_acdc_output.log"
            )
            print(
                f"ACDC run completed for {rsf_path}. "
                f"Output generated: {log_path}"
            )
def run_clusterer(self):
    """Run the Limbo algorithm of the clusterer jar on every commit directory.

    Walks ``self.output_dir``; each subdirectory name is the commit ID
    (project version). Output is logged to ``clusterer_output_limbo.log``
    in the commit directory. (The earlier WCA/UEM and WCA/UEMNM runs were
    dead code — commented out — and are not reproduced here.)
    """
    jar_path = os.path.join(self.lib_dir, "arcade_core_clusterer.jar")
    for subdir, dirs, _files in os.walk(self.output_dir):
        for dir_name in dirs:
            dir_path = os.path.join(subdir, dir_name)
            rsf_path = self.find_rsf_file(dir_path)
            if not rsf_path:
                continue
            # find_rsf_file returns the full path already; do not join it
            # onto dir_path again (the old code duplicated the prefix).
            args = [
                "-Xmx14024m",  # JVM heap size forwarded to the jar runner
                f"deps={rsf_path}",
                f"projpath={dir_path}",
                "measure=UEM",
                f"projname={self.project_name}",
                f"projversion={dir_name}",
                f"language={self.language}",
                "algo=Limbo",
            ]
            log_path = self.run_jar_with_logging(
                jar_path, args, dir_path, "clusterer_output_limbo.log"
            )
            print(
                f"Limbo clustering run completed for {dir_name}. "
                f"Results saved to {log_path}"
            )
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment