Skip to content
Snippets Groups Projects
Commit 43d985ee authored by michaelyoukeim's avatar michaelyoukeim
Browse files

Added initial statistical analysis impl

parent 9da6dce9
No related branches found
No related tags found
No related merge requests found
......@@ -194,6 +194,6 @@ def process_commit_hashes(commit_hashes, repo_path, output_dir):
def create_packages(repo_path, commit_info_file_path, output_dir):
    """Create per-commit packages for every commit listed in the info file.

    Args:
        repo_path: Path to the git repository to operate on.
        commit_info_file_path: Path to a JSON file holding a list of commit
            records, each record a dict with a 'commit_hash' key.
        output_dir: Directory where per-commit output is written.
    """
    # Load the commit records and pull out their hashes.  The file is a JSON
    # array of objects (not a hash-keyed mapping), so read the 'commit_hash'
    # field of each entry.  (A stale dict-keys based assignment that was
    # immediately overwritten has been removed.)
    with open(commit_info_file_path, "r") as file:
        commit_hashes = [commit['commit_hash'] for commit in json.load(file)]
    process_commit_hashes(commit_hashes, repo_path, output_dir)
......@@ -4,6 +4,7 @@ import json
def run_java_metric(command):
print("Running command:", command)
output = subprocess.run(command, shell=True, capture_output=True, text=True)
if output.stderr:
return {"error": output.stderr}
......@@ -74,61 +75,54 @@ def analyze_clustering_results(output_dir):
commit_hash = "bd1a08b2cfba7dcab89791ddba97e15bb2d2c0de"
commit_dir = os.path.join(output_dir, commit_hash)
print("Analyzing Clustering Results for:", commit_dir)
parent_commit_hash = "6ccb223c9c98a448bdb46acfddeb0b351d5b196a"
parent_commit_dir = os.path.join(output_dir, parent_commit_hash)
a2a_jar_path = os.path.join(lib_dir, "arcade_core_A2a.jar")
cvg_jar_path = os.path.join(lib_dir, "arcade_core_Cvg.jar")
cluster_file = os.path.join(commit_dir, 'dependencies.rsf')
results = {}
# Currently, we are just evaluating the clustering results for one commit
algorithm_configs = {
"acdc": {
"cluster_file": os.path.join(commit_dir, "out_acdc_output.rsf"),
"a2a_jar_path": os.path.join(lib_dir, "arcade_core_A2a.jar"),
"cvg_jar_path": os.path.join(lib_dir, "arcade_core_Cvg.jar"),
"current_file": os.path.join(commit_dir, "dependencies_acdc_output.rsf"),
"current_file": os.path.join(commit_dir, "acdc_output.rsf"),
"parent_file": os.path.join(
parent_commit_dir, "dependencies_acdc_output.rsf"
parent_commit_dir, "acdc_output.rsf"
),
},
"pkg": {
"cluster_file": os.path.join(
commit_dir, f"hadoopcommon-{commit_hash}_PKG_1_clusters.rsf"
),
"a2a_jar_path": os.path.join(lib_dir, "arcade_core_A2a.jar"),
"cvg_jar_path": os.path.join(lib_dir, "arcade_core_Cvg.jar"),
"current_file": os.path.join(
commit_dir, f"hadoopcommon-{commit_hash}_PKG_1_clusters.rsf"
commit_dir, f"hadoop common-{commit_hash}_PKG_104_clusters.rsf"
),
"parent_file": os.path.join(
parent_commit_dir,
f"hadoopcommon-{parent_commit_hash}_PKG_1_clusters.rsf",
),
},
"wca_uem": {
"cluster_file": os.path.join(
commit_dir, "/hadoopcommon-{commit_hash}_UEM_50_clusters.rsf"
),
"a2a_jar_path": os.path.join(lib_dir, "arcade_core_A2a.jar"),
"cvg_jar_path": os.path.join(lib_dir, "arcade_core_Cvg.jar"),
"current_file": os.path.join(
commit_dir, "/hadoopcommon-{commit_hash}_UEM_50_clusters.rsf"
),
"parent_file": os.path.join(
parent_commit_dir,
"/hadoopcommon-{parent_commit_hash}_UEM_50_clusters.rsf",
f"hadoop common-{parent_commit_hash}_PKG_104_clusters.rsf",
),
},
#"wca_uem": {
# "current_file": os.path.join(
# commit_dir, "/hadoop common-{commit_hash}_UEM_50_clusters.rsf"
# ),
# "parent_file": os.path.join(
# parent_commit_dir,
# f"/hadoop common-{parent_commit_hash}_UEM_50_clusters.rsf",
# ),
#},
}
for algorithm, config in algorithm_configs.items():
num_clusters, entity_count_per_cluster = count_clusters_and_entities(
config["cluster_file"]
cluster_file
)
a2a_metric, cvg_metric = calculate_a2a_cvg_metrics(
config["current_file"],
config["parent_file"],
config["a2a_jar_path"],
config["cvg_jar_path"],
a2a_jar_path,
cvg_jar_path
)
results[algorithm] = {
"num_clusters": num_clusters,
......@@ -137,15 +131,17 @@ def analyze_clustering_results(output_dir):
"cvg_metric": cvg_metric,
}
clustering_results_file_path = os.path.join(output_dir, "clustering_results.json")
clustering_results_file_path = os.path.join(commit_dir, "clustering_results.json")
# Save results to JSON file
print(f"Writing clustering results to {clustering_results_file_path}")
with open(clustering_results_file_path, "w") as f:
json.dump(results, f, indent=4)
# Calculate and store CVG metrics between algorithms
clusters_dict = {
alg: parse_rsf_file(algorithm_configs[alg]["cluster_file"])
alg: parse_rsf_file(cluster_file)
for alg in algorithm_configs
}
for i, alg1 in enumerate(algorithm_configs):
......
......@@ -22,11 +22,11 @@ def switch_java_version(java_version):
"""
if java_version == "21":
target_version = "java-1.21.0-openjdk-amd64"
target_version = "temurin-21-jdk-amd64"
elif java_version == "1.8":
target_version = "java-1.8.0-openjdk-amd64"
target_version = "temurin-8-jdk-amd64"
elif java_version == "11":
target_version = "java-1.11.0-openjdk-amd64"
target_version = "temurin-11-jdk-amd64"
else:
return
......
import os
import json
def update_commit_info(output_dir):
    """Fold per-commit ACDC clustering metrics back into the commit info file.

    Scans every subdirectory of *output_dir* (each named after a commit hash)
    for a ``clustering_results.json``, and for commits that have valid ACDC
    ``a2a_metric``/``cvg_metric`` values, attaches ``{'a2a': ..., 'cvg': ...}``
    under the ``'acdc'`` key of the matching record in
    ``detailed_commits_info.json``, rewriting that file in place.

    Args:
        output_dir: Directory containing ``detailed_commits_info.json`` and
            one subdirectory per analyzed commit hash.
    """
    # Path to the detailed commits info file
    commits_info_path = os.path.join(output_dir, 'detailed_commits_info.json')

    # Load the detailed commits info from JSON file
    with open(commits_info_path, 'r') as file:
        commits_info = json.load(file)

    # Mapping from commit hash to commit info for easy access.  The dicts in
    # this map alias the entries of commits_info, so mutating them updates the
    # list that is written back below.
    commit_info_map = {commit['commit_hash']: commit for commit in commits_info}

    # Traverse each subdirectory in the output directory
    for commit_hash in os.listdir(output_dir):
        subdir_path = os.path.join(output_dir, commit_hash)
        if not os.path.isdir(subdir_path):
            continue

        results_path = os.path.join(subdir_path, 'clustering_results.json')
        # Check if the clustering results file exists
        if not os.path.exists(results_path):
            continue

        with open(results_path, 'r') as file:
            results = json.load(file)

        # Check if 'acdc' key exists and contains the required keys.
        # NOTE: the results file stores the metrics under 'a2a_metric' and
        # 'cvg_metric' (the previous check for a bare 'cvg' key never matched,
        # and the cvg value was mistakenly copied from a2a_metric).  Accept
        # ints as well as floats since JSON round-trips whole numbers as int.
        if 'acdc' in results and isinstance(results['acdc'], dict):
            acdc = results['acdc']
            if (isinstance(acdc.get('a2a_metric'), (int, float))
                    and isinstance(acdc.get('cvg_metric'), (int, float))):
                # Update the commit info with 'a2a' and 'cvg' if the commit
                # hash is present in the map
                if commit_hash in commit_info_map:
                    commit_info_map[commit_hash]['acdc'] = {
                        'a2a': acdc['a2a_metric'],
                        'cvg': acdc['cvg_metric']
                    }

    # Save the updated commits info back to the JSON file
    with open(commits_info_path, 'w') as file:
        json.dump(commits_info, file, indent=4)
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from matplotlib.backends.backend_pdf import PdfPages
def plot_scatterplot_and_correlation(df: pd.DataFrame, x: str, y: str, title: str, xlabel: str, ylabel: str, output_file: str, showGrid: bool = True):
    """Write a two-page PDF: a scatter plot of *y* against *x*, followed by
    a text page with the Pearson correlation coefficient and p-value.

    Args:
        df: Data frame holding the columns named by *x* and *y*.
        x: Column plotted on the horizontal axis.
        y: Column plotted on the vertical axis.
        title: Plot title.
        xlabel: Horizontal axis label.
        ylabel: Vertical axis label.
        output_file: Destination path of the generated PDF.
        showGrid: Whether the scatter plot shows a background grid.
    """
    with PdfPages(output_file) as pdf:
        # Page 1: the scatter plot itself.
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=x, y=y, data=df)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.grid(showGrid)
        pdf.savefig()
        plt.close()

        # Page 2: Pearson correlation results rendered as centered text.
        r_value, p_value = pearsonr(df[x], df[y])
        summary = f"Correlation Coefficient: {r_value}\nP-value: {p_value}"
        plt.figure(figsize=(8, 6))
        plt.text(0.5, 0.5, summary,
                 horizontalalignment='center', verticalalignment='center', fontsize=12)
        plt.axis('off')
        pdf.savefig()
        plt.close()
def perform_statistical_analysis(commit_info_file_path, output_dir):
    """Plot each DMM attribute against the a2a metric per decision type.

    For every combination of architectural decision type and DMM attribute,
    filters the commits to those flagged with that decision type and writes a
    scatter-plot-plus-correlation PDF into *output_dir*.

    Args:
        commit_info_file_path: JSON file readable by ``pd.read_json`` with
            boolean ``is<Type>Decision`` columns, the ``dmm_*`` attribute
            columns and an ``a2a`` column.
        output_dir: Directory for the generated PDFs; created if missing.
    """
    df = pd.read_json(commit_info_file_path)
    # exist_ok avoids the check-then-create race of the previous version.
    os.makedirs(output_dir, exist_ok=True)

    decision_types = ['Existence', 'Executive', 'Property']
    attributes = ['dmm_size', 'dmm_interfacing', 'dmm_complexity']
    for decision_type in decision_types:
        # The filtered frame depends only on the decision type, so build it
        # once per type instead of once per (type, attribute) pair.
        decision_df = df[df[f"is{decision_type}Decision"]]
        if len(decision_df) < 2:
            # pearsonr needs at least two observations; skip degenerate sets
            # instead of crashing inside the plotting helper.
            continue
        for attribute in attributes:
            output_file = os.path.join(output_dir, f"{decision_type.lower()}_decision_{attribute}_vs_a2a.pdf")
            plot_scatterplot_and_correlation(
                decision_df,
                attribute,
                'a2a',
                f'Scatter Plot of {attribute.capitalize()} vs. A2A (For {decision_type} Decision)',
                attribute.capitalize(),
                'A2A',
                output_file
            )
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment