Commit 9da6dce9 authored by michaelyoukeim

Eliminated repetitions in the clustering code

parent 33cd98c7
import os

from jar_runner import run_jar


class RepoClusterer:
    def __init__(self, output_dir):
        self.output_dir = output_dir
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.lib_dir = os.path.join(self.base_dir, "../../lib")
        self.project_name = "Hadoop common"
        self.language = "java"
    def find_rsf_file(self, dir_path):
        """Return the path to dependencies.rsf in dir_path, or None if it is missing."""
        rsf_path = os.path.join(dir_path, "dependencies.rsf")
        if os.path.exists(rsf_path):
            return rsf_path
        return None
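
    # Assumed layout, based on how the methods below walk output_dir: one
    # subdirectory per analyzed commit, named by its commit ID, e.g.
    #   <output_dir>/<commit_id>/dependencies.rsf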
    def run_jar_with_logging(self, jar_path, args, cwd, log_filename):
        stdout, stderr = run_jar(jar_path, args=args, cwd=cwd)
        log_path = os.path.join(cwd, log_filename)
        with open(log_path, "w") as log_file:
            log_file.write(stdout if stdout else stderr)
        return log_path
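
    # NOTE: `run_jar` comes from the project-local `jar_runner` module, which is not
    # part of this file. The code here assumes it runs `java -jar <jar> <args...>` in
    # the given working directory and returns decoded (stdout, stderr) strings.
    # A hypothetical sketch of such a wrapper, for illustration only:
    #
    #     import subprocess
    #
    #     def run_jar(jar_path, args=None, cwd=None):
    #         cmd = ["java", "-jar", jar_path, *(args or [])]
    #         proc = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
    #         return proc.stdout, proc.stderr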
    def run_pkg(self):
        jar_path = os.path.join(self.lib_dir, "arcade_core-Pkg.jar")
        file_level = "true"
        for subdir, dirs, files in os.walk(self.output_dir):
            for dir_name in dirs:
                # The directory name is the commit ID and is used as the project version.
                dir_path = os.path.join(subdir, dir_name)
                rsf_file_path = self.find_rsf_file(dir_path)
                if rsf_file_path:
                    # Construct the arguments required by PKG.
                    args = [
                        f"depspath={rsf_file_path}",
                        f"projectpath={dir_path}",
                        f"projectname={self.project_name}",
                        f"projectversion={dir_name}",
                        f"language={self.language}",
                        f"filelevel={file_level}",
                    ]
                    log_path = self.run_jar_with_logging(
                        jar_path, args, dir_path, "pkg_output.log")
                    print(
                        f"PKG Clustering completed for commit {dir_name}. Results saved to {log_path}")
    def run_acdc(self):
        jar_path = os.path.join(self.lib_dir, "arcade_core-ACDC.jar")
        for subdir, dirs, files in os.walk(self.output_dir):
            for dir_name in dirs:
                dir_path = os.path.join(subdir, dir_name)
                rsf_file_path = self.find_rsf_file(dir_path)
                if rsf_file_path:
                    # ACDC takes the input RSF and the output RSF path as positional arguments.
                    base_name = os.path.splitext(os.path.basename(rsf_file_path))[0]
                    args = [
                        rsf_file_path,
                        os.path.join(dir_path, f"{base_name}_acdc_output.rsf"),
                    ]
                    log_path = self.run_jar_with_logging(
                        jar_path, args, dir_path, f"{base_name}_acdc_output.log")
                    print(
                        f"ACDC run completed for {rsf_file_path}. Output generated: {log_path}")
    def run_clusterer(self):
        jar_path = os.path.join(self.lib_dir, "arcade_core_clusterer.jar")
        for subdir, dirs, files in os.walk(self.output_dir):
            for dir_name in dirs:
                dir_path = os.path.join(subdir, dir_name)
                rsf_file_path = self.find_rsf_file(dir_path)
                if rsf_file_path:
                    # Construct the arguments for the Limbo clustering run.
                    args = [
                        "-Xmx14024m",  # JVM max-heap setting, passed through to the JAR runner
                        f"deps={rsf_file_path}",
                        f"projpath={dir_path}",
                        "measure=UEM",
                        f"projname={self.project_name}",
                        f"projversion={dir_name}",
                        f"language={self.language}",
                        "algo=Limbo",
                    ]
                    log_path = self.run_jar_with_logging(
                        jar_path, args, dir_path, "clusterer_output_limbo.log")
                    print(
                        f"Limbo clustering run completed for {dir_name}. Results saved to {log_path}")