Skip to content
Snippets Groups Projects
Commit 43d985ee authored by michaelyoukeim's avatar michaelyoukeim
Browse files

Added initial statistical analysis impl

parent 9da6dce9
No related branches found
No related tags found
No related merge requests found
......@@ -194,6 +194,6 @@ def process_commit_hashes(commit_hashes, repo_path, output_dir):
def create_packages(repo_path, commit_info_file_path, output_dir):
    """Create per-commit packages for every commit listed in the info file.

    Args:
        repo_path: Path to the git repository to operate on.
        commit_info_file_path: Path to a JSON file holding a list of commit
            records, each record a dict with a 'commit_hash' key.
        output_dir: Directory where per-commit output is written.
    """
    # Load the commit records and pull out their hashes.  The file is a JSON
    # array of objects (not a hash-keyed mapping), so read the 'commit_hash'
    # field of each entry.  (A stale dict-keys based assignment that was
    # immediately overwritten has been removed.)
    with open(commit_info_file_path, "r") as file:
        commit_hashes = [commit['commit_hash'] for commit in json.load(file)]
    process_commit_hashes(commit_hashes, repo_path, output_dir)
......@@ -4,6 +4,7 @@ import json
def run_java_metric(command):
print("Running command:", command)
output = subprocess.run(command, shell=True, capture_output=True, text=True)
if output.stderr:
return {"error": output.stderr}
......@@ -74,61 +75,54 @@ def analyze_clustering_results(output_dir):
commit_hash = "bd1a08b2cfba7dcab89791ddba97e15bb2d2c0de"
commit_dir = os.path.join(output_dir, commit_hash)
print("Analyzing Clustering Results for:", commit_dir)
parent_commit_hash = "6ccb223c9c98a448bdb46acfddeb0b351d5b196a"
parent_commit_dir = os.path.join(output_dir, parent_commit_hash)
a2a_jar_path = os.path.join(lib_dir, "arcade_core_A2a.jar")
cvg_jar_path = os.path.join(lib_dir, "arcade_core_Cvg.jar")
cluster_file = os.path.join(commit_dir, 'dependencies.rsf')
results = {}
# Currently, we are just evaluating the clustering results for one commit
algorithm_configs = {
"acdc": {
"cluster_file": os.path.join(commit_dir, "out_acdc_output.rsf"),
"a2a_jar_path": os.path.join(lib_dir, "arcade_core_A2a.jar"),
"cvg_jar_path": os.path.join(lib_dir, "arcade_core_Cvg.jar"),
"current_file": os.path.join(commit_dir, "dependencies_acdc_output.rsf"),
"current_file": os.path.join(commit_dir, "acdc_output.rsf"),
"parent_file": os.path.join(
parent_commit_dir, "dependencies_acdc_output.rsf"
parent_commit_dir, "acdc_output.rsf"
),
},
"pkg": {
"cluster_file": os.path.join(
commit_dir, f"hadoopcommon-{commit_hash}_PKG_1_clusters.rsf"
),
"a2a_jar_path": os.path.join(lib_dir, "arcade_core_A2a.jar"),
"cvg_jar_path": os.path.join(lib_dir, "arcade_core_Cvg.jar"),
"current_file": os.path.join(
commit_dir, f"hadoopcommon-{commit_hash}_PKG_1_clusters.rsf"
commit_dir, f"hadoop common-{commit_hash}_PKG_104_clusters.rsf"
),
"parent_file": os.path.join(
parent_commit_dir,
f"hadoopcommon-{parent_commit_hash}_PKG_1_clusters.rsf",
),
},
"wca_uem": {
"cluster_file": os.path.join(
commit_dir, "/hadoopcommon-{commit_hash}_UEM_50_clusters.rsf"
),
"a2a_jar_path": os.path.join(lib_dir, "arcade_core_A2a.jar"),
"cvg_jar_path": os.path.join(lib_dir, "arcade_core_Cvg.jar"),
"current_file": os.path.join(
commit_dir, "/hadoopcommon-{commit_hash}_UEM_50_clusters.rsf"
),
"parent_file": os.path.join(
parent_commit_dir,
"/hadoopcommon-{parent_commit_hash}_UEM_50_clusters.rsf",
f"hadoop common-{parent_commit_hash}_PKG_104_clusters.rsf",
),
},
#"wca_uem": {
# "current_file": os.path.join(
# commit_dir, "/hadoop common-{commit_hash}_UEM_50_clusters.rsf"
# ),
# "parent_file": os.path.join(
# parent_commit_dir,
# f"/hadoop common-{parent_commit_hash}_UEM_50_clusters.rsf",
# ),
#},
}
for algorithm, config in algorithm_configs.items():
num_clusters, entity_count_per_cluster = count_clusters_and_entities(
config["cluster_file"]
cluster_file
)
a2a_metric, cvg_metric = calculate_a2a_cvg_metrics(
config["current_file"],
config["parent_file"],
config["a2a_jar_path"],
config["cvg_jar_path"],
a2a_jar_path,
cvg_jar_path
)
results[algorithm] = {
"num_clusters": num_clusters,
......@@ -137,15 +131,17 @@ def analyze_clustering_results(output_dir):
"cvg_metric": cvg_metric,
}
clustering_results_file_path = os.path.join(output_dir, "clustering_results.json")
clustering_results_file_path = os.path.join(commit_dir, "clustering_results.json")
# Save results to JSON file
print(f"Writing clustering results to {clustering_results_file_path}")
with open(clustering_results_file_path, "w") as f:
json.dump(results, f, indent=4)
# Calculate and store CVG metrics between algorithms
clusters_dict = {
alg: parse_rsf_file(algorithm_configs[alg]["cluster_file"])
alg: parse_rsf_file(cluster_file)
for alg in algorithm_configs
}
for i, alg1 in enumerate(algorithm_configs):
......
......@@ -22,11 +22,11 @@ def switch_java_version(java_version):
"""
if java_version == "21":
target_version = "java-1.21.0-openjdk-amd64"
target_version = "temurin-21-jdk-amd64"
elif java_version == "1.8":
target_version = "java-1.8.0-openjdk-amd64"
target_version = "temurin-8-jdk-amd64"
elif java_version == "11":
target_version = "java-1.11.0-openjdk-amd64"
target_version = "temurin-11-jdk-amd64"
else:
return
......
import os
import json
def update_commit_info(output_dir):
    """Fold per-commit ACDC clustering metrics back into the commit info file.

    Scans every subdirectory of *output_dir* (each named after a commit hash)
    for a ``clustering_results.json``, and for commits that have valid ACDC
    ``a2a_metric``/``cvg_metric`` values, attaches ``{'a2a': ..., 'cvg': ...}``
    under the ``'acdc'`` key of the matching record in
    ``detailed_commits_info.json``, rewriting that file in place.

    Args:
        output_dir: Directory containing ``detailed_commits_info.json`` and
            one subdirectory per analyzed commit hash.
    """
    # Path to the detailed commits info file
    commits_info_path = os.path.join(output_dir, 'detailed_commits_info.json')

    # Load the detailed commits info from JSON file
    with open(commits_info_path, 'r') as file:
        commits_info = json.load(file)

    # Mapping from commit hash to commit info for easy access.  The dicts in
    # this map alias the entries of commits_info, so mutating them updates the
    # list that is written back below.
    commit_info_map = {commit['commit_hash']: commit for commit in commits_info}

    # Traverse each subdirectory in the output directory
    for commit_hash in os.listdir(output_dir):
        subdir_path = os.path.join(output_dir, commit_hash)
        if not os.path.isdir(subdir_path):
            continue

        results_path = os.path.join(subdir_path, 'clustering_results.json')
        # Check if the clustering results file exists
        if not os.path.exists(results_path):
            continue

        with open(results_path, 'r') as file:
            results = json.load(file)

        # Check if 'acdc' key exists and contains the required keys.
        # NOTE: the results file stores the metrics under 'a2a_metric' and
        # 'cvg_metric' (the previous check for a bare 'cvg' key never matched,
        # and the cvg value was mistakenly copied from a2a_metric).  Accept
        # ints as well as floats since JSON round-trips whole numbers as int.
        if 'acdc' in results and isinstance(results['acdc'], dict):
            acdc = results['acdc']
            if (isinstance(acdc.get('a2a_metric'), (int, float))
                    and isinstance(acdc.get('cvg_metric'), (int, float))):
                # Update the commit info with 'a2a' and 'cvg' if the commit
                # hash is present in the map
                if commit_hash in commit_info_map:
                    commit_info_map[commit_hash]['acdc'] = {
                        'a2a': acdc['a2a_metric'],
                        'cvg': acdc['cvg_metric']
                    }

    # Save the updated commits info back to the JSON file
    with open(commits_info_path, 'w') as file:
        json.dump(commits_info, file, indent=4)
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from matplotlib.backends.backend_pdf import PdfPages
def plot_scatterplot_and_correlation(df: pd.DataFrame, x: str, y: str, title: str, xlabel: str, ylabel: str, output_file: str, showGrid: bool = True):
    """Write a two-page PDF: a scatter plot of *y* against *x*, followed by
    a text page with the Pearson correlation coefficient and p-value.

    Args:
        df: Data frame holding the columns named by *x* and *y*.
        x: Column plotted on the horizontal axis.
        y: Column plotted on the vertical axis.
        title: Plot title.
        xlabel: Horizontal axis label.
        ylabel: Vertical axis label.
        output_file: Destination path of the generated PDF.
        showGrid: Whether the scatter plot shows a background grid.
    """
    with PdfPages(output_file) as pdf:
        # Page 1: the scatter plot itself.
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=x, y=y, data=df)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.grid(showGrid)
        pdf.savefig()
        plt.close()

        # Page 2: Pearson correlation results rendered as centered text.
        r_value, p_value = pearsonr(df[x], df[y])
        summary = f"Correlation Coefficient: {r_value}\nP-value: {p_value}"
        plt.figure(figsize=(8, 6))
        plt.text(0.5, 0.5, summary,
                 horizontalalignment='center', verticalalignment='center', fontsize=12)
        plt.axis('off')
        pdf.savefig()
        plt.close()
def perform_statistical_analysis(commit_info_file_path, output_dir):
    """Plot each DMM attribute against the a2a metric per decision type.

    For every combination of architectural decision type and DMM attribute,
    filters the commits to those flagged with that decision type and writes a
    scatter-plot-plus-correlation PDF into *output_dir*.

    Args:
        commit_info_file_path: JSON file readable by ``pd.read_json`` with
            boolean ``is<Type>Decision`` columns, the ``dmm_*`` attribute
            columns and an ``a2a`` column.
        output_dir: Directory for the generated PDFs; created if missing.
    """
    df = pd.read_json(commit_info_file_path)
    # exist_ok avoids the check-then-create race of the previous version.
    os.makedirs(output_dir, exist_ok=True)

    decision_types = ['Existence', 'Executive', 'Property']
    attributes = ['dmm_size', 'dmm_interfacing', 'dmm_complexity']
    for decision_type in decision_types:
        # The filtered frame depends only on the decision type, so build it
        # once per type instead of once per (type, attribute) pair.
        decision_df = df[df[f"is{decision_type}Decision"]]
        if len(decision_df) < 2:
            # pearsonr needs at least two observations; skip degenerate sets
            # instead of crashing inside the plotting helper.
            continue
        for attribute in attributes:
            output_file = os.path.join(output_dir, f"{decision_type.lower()}_decision_{attribute}_vs_a2a.pdf")
            plot_scatterplot_and_correlation(
                decision_df,
                attribute,
                'a2a',
                f'Scatter Plot of {attribute.capitalize()} vs. A2A (For {decision_type} Decision)',
                attribute.capitalize(),
                'A2A',
                output_file
            )
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment