Commit e6f676bb authored by Michael Youkeim

Added dependency analysis

parent cc807afe
Showing 49452 additions and 80 deletions
#!/usr/bin/python3
# ---- build_manager.py ----
import xml.etree.ElementTree as ET
import subprocess
import re
@@ -8,51 +6,72 @@
import os
import shutil
import sys
from packaging import version  # assumed import, not visible in this hunk; version.parse / version.InvalidVersion are used below

def parse_pom_xml(file_path):
    """Extract the Java and Maven versions required by the maven-enforcer-plugin."""
    tree = ET.parse(file_path)
    root = tree.getroot()
    ns = {"maven": "http://maven.apache.org/POM/4.0.0"}
    enforcer_plugin = root.find(
        ".//maven:plugin[maven:artifactId='maven-enforcer-plugin']", ns
    )
    java_version = None
    maven_version = None
    # Explicit None check: a matched Element with no children is falsy.
    if enforcer_plugin is not None:
        java_version = enforcer_plugin.find(
            ".//maven:requireJavaVersion/maven:version", ns
        )
        maven_version = enforcer_plugin.find(
            ".//maven:requireMavenVersion/maven:version", ns
        )
    java_version = java_version.text if java_version is not None else None
    maven_version = maven_version.text if maven_version is not None else None
    return java_version, maven_version

def get_installed_version(command):
    """Run a shell command and return its stripped stdout, or None on failure."""
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            shell=True,
        )
        return result.stdout.strip()
    except Exception as e:
        print("Error obtaining installed version:", e)
        return None

def normalize_version(ver):
    """Reduce a version string such as '1.8.0_292' to its 'major.minor.patch' prefix."""
    ver = ver.replace("_", ".")
    match = re.match(r"(\d+\.\d+\.\d+)", ver)
    return match.group(1) if match else ver

def version_is_compatible(required_version_range, installed_version):
    """Check whether the installed version satisfies the required Maven version spec."""
    if required_version_range and installed_version:
        try:
            normalized_installed_version = normalize_version(installed_version)
            if "," in required_version_range:
                # Range spec such as "[3.3.0,)": only the lower bound is enforced here.
                lower_bound = required_version_range.strip("[]()").split(",")[0].strip()
                if version.parse(normalized_installed_version) >= version.parse(
                    lower_bound
                ):
                    return True
            else:
                # Plain minimum version such as "3.0.5".
                required_version = required_version_range.strip("[]()")
                if version.parse(normalized_installed_version) >= version.parse(
                    required_version
                ):
                    return True
        except version.InvalidVersion as e:
            print(f"Error parsing version: {e}")
    return False

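# Illustrative behaviour of the check above; the version strings are made-up
# examples, not values taken from the Hadoop pom.xml:
#
#   version_is_compatible("[3.3.0,)", "3.6.3")   -> True   (range spec, lower bound met)
#   version_is_compatible("1.8", "11.0.2")       -> True   (plain minimum version)
#   version_is_compatible("[99.0.0,)", "3.6.3")  -> False  (below the lower bound)
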
def copy_jar_files(source_dir, output_dir):
    # if os.path.exists(output_dir):
    #     shutil.rmtree(output_dir)
@@ -70,24 +89,56 @@
            shutil.copy(source_file_path, destination_file_path)
            print(f"Copied: {source_file_path} to {destination_file_path}")

def process_commit_hashes(
    commit_file,
    repo_path,
    output_dir,
    java_required,
    maven_required,
    java_installed,
    maven_installed,
):
    """Check out each listed commit, build hadoop-common, and collect the JARs."""
    with open(commit_file, "r") as file:
        hashes = file.readlines()
    for hash in hashes:
        hash = hash.strip()
        try:
            result = subprocess.run(
                f"cd {repo_path} && git checkout {hash}", shell=True, check=True
            )
            print(f"Checked out {hash}")
            if version_is_compatible(
                java_required, java_installed
            ) and version_is_compatible(maven_required, maven_installed):
                subprocess.run(
                    f"cd {repo_path} && mvn clean package --projects :hadoop-common --also-make -DskipTests",
                    shell=True,
                )
                copy_jar_files(
                    f"{repo_path}/hadoop-common-project/hadoop-common/target",
                    f"{output_dir}/{hash}",
                )
        except subprocess.CalledProcessError:
            print(f"Commit {hash} not found or checkout failed.")

def create_packages(repo_path, commit_file, output_dir):
    """Determine required and installed Java/Maven versions, then build each commit."""
    java_required, maven_required = parse_pom_xml(f"{repo_path}/pom.xml")
    java_installed = get_installed_version(
        "java -version 2>&1 | head -n 1 | awk '{print $3}' | tr -d '\"'"
    )
    maven_installed = get_installed_version(
        "mvn -v | grep 'Apache Maven' | awk '{print $3}'"
    )
    process_commit_hashes(
        commit_file,
        repo_path,
        output_dir,
        java_required,
        maven_required,
        java_installed,
        maven_installed,
    )

# ---- commit_analyzer.py ----
import json
from pydriller import Repository


def analyze_commit(commit):
    """Analyzes a commit to return detailed metrics including DMM metrics."""
    # PyDriller exposes the Delta Maintainability Model (DMM) metrics directly.
    metrics = {
        "size": commit.dmm_unit_size,
        "complexity": commit.dmm_unit_complexity,
        "interfacing": commit.dmm_unit_interfacing,
    }
    return metrics

def get_file_changes(commit):
    """Analyzes file changes in a commit to collect detailed metrics."""
    file_changes = {}
    for mod in commit.modified_files:
        file_changes[mod.filename] = {
            "added_lines": mod.added_lines,
            "deleted_lines": mod.deleted_lines,
            "changed_methods": len(mod.changed_methods),
            "complexity": mod.complexity,
        }
    return file_changes

def commit_details(commit):
    """Collects all required details from a commit."""
    metrics = analyze_commit(commit)
    file_changes = get_file_changes(commit)
    details = {
        "Commit_Hash": commit.hash,
        "Author": commit.author.name,
        "Date": commit.committer_date.strftime("%Y-%m-%d %H:%M:%S"),
        "Parent_Commits": [parent for parent in commit.parents],
        "DMM_metrics": metrics,
        "File_changes": file_changes,
    }
    return details

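# For orientation, a commit_details() result has roughly the following shape.
# All values here are invented placeholders, not output from the Hadoop repository:
#
#   {
#       "Commit_Hash": "e6f676bb...",
#       "Author": "Jane Doe",
#       "Date": "2024-01-15 10:32:00",
#       "Parent_Commits": ["cc807afe..."],
#       "DMM_metrics": {"size": 0.8, "complexity": 0.9, "interfacing": 1.0},
#       "File_changes": {
#           "FooBar.java": {
#               "added_lines": 12,
#               "deleted_lines": 3,
#               "changed_methods": 2,
#               "complexity": 7,
#           }
#       },
#   }
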
# ---- dependency_analyzer.py ----
from jar_runner import run_jar
import os


def analyze_dependencies(output_dir):
    """Run the ARCADE JavaParser JAR over every built snapshot JAR under output_dir."""
    base_dir = os.path.dirname(os.path.abspath(__file__))
    lib_dir = os.path.join(base_dir, "../../lib")
    java_parser_path = os.path.join(lib_dir, "arcade_core_JavaParser.jar")
    for subdir, dirs, files in os.walk(output_dir):
        for file_name in files:
            if not file_name.endswith("SNAPSHOT.jar"):
                continue  # only analyze the built Hadoop snapshot JARs
            full_jar_path = os.path.join(subdir, file_name)
            output_rsf = os.path.join(subdir, "dependencies.rsf")
            output_fv = os.path.join(subdir, "dependencies.fv")
            # Construct arguments for the dependency analyzer JAR
            args = [full_jar_path, output_rsf, output_fv, "org.apache.hadoop"]
            print(f"Analyzing dependencies in {subdir} for JAR {file_name}...")
            stdout, stderr = run_jar(java_parser_path, args=args)
            if stdout:
                print(stdout)
            if stderr:
                print(f"Error processing {file_name}: {stderr}")

# ---- issue_commit_linkage.py ----
@@ -5,28 +5,31 @@
import requests
from pydriller import Repository
from commit_analyzer import commit_details


def read_issue_ids(file_path):
    """Reads a JSON file and returns a list of issue IDs."""
    try:
        with open(file_path, "r") as file:
            data = json.load(file)
        issue_ids = data.get("issue_ids", [])
        logging.info("Issue IDs successfully read from file.")
        return issue_ids
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        return []

def get_issue_type(issue_key):
    """Fetches the issue type from the JIRA API based on issue_key."""
    url = f"https://issues.apache.org/jira/rest/api/latest/issue/{issue_key}"
    response = requests.get(url)
    if response.status_code == 200:
        issue_data = response.json()
        return issue_data["fields"]["issuetype"]["name"]
    else:
        return None

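# The relevant slice of the JIRA REST response looks roughly like this (the
# values are illustrative, not a real HADOOP issue):
#
#   {
#       "key": "HADOOP-12345",
#       "fields": {
#           "issuetype": {"name": "Bug"}
#       }
#   }
#
# so get_issue_type("HADOOP-12345") would return "Bug".
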
def link_issues_to_commits(repo_path, issues_file, output_dir):
    """Links issues to commits and writes relevant information to a JSON file."""
    issue_ids = read_issue_ids(issues_file)
@@ -38,12 +41,14 @@ def link_issues_to_commits(repo_path, issues_file, output_dir):
            if issue_id in commit.msg:
                issue_type = get_issue_type(issue_id)
                commit_info = commit_details(commit)
                commit_info["Decision Type"] = issue_type
                output[issue_id].append(
                    {"Hash": commit.hash, "Parents": commit.parents}
                )
                all_commits[commit.hash] = commit_info

    # Write to JSON files
    with open(os.path.join(output_dir, "issue_commit_relationships.json"), "w") as file:
        json.dump(output, file, indent=4)
    with open(os.path.join(output_dir, "detailed_commits_info.json"), "w") as file:
        json.dump(all_commits, file, indent=4)

# ---- jar_runner.py ----
import subprocess


def run_jar(jar_path, args=None, cwd=None):
    """
    Runs a JAR file with the specified arguments and working directory.

    Args:
        jar_path (str): The path to the JAR file.
        args (list of str, optional): The arguments to pass to the JAR file. Defaults to None.
        cwd (str, optional): The working directory from which to run the JAR. Defaults to None.

    Returns:
        tuple: A tuple containing the standard output and standard error of the JAR execution.
    """
    if args is None:
        args = []
    command = ["java", "-jar", jar_path] + args
    result = subprocess.run(command, capture_output=True, text=True, cwd=cwd)
    if result.stderr:
        print(f"Error running {jar_path}: {result.stderr}")
    return result.stdout, result.stderr

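# Minimal manual usage sketch; the JAR path and arguments below are placeholders
# chosen to mirror analyze_dependencies(), not part of the original tool.
if __name__ == "__main__":
    out, err = run_jar(
        "../../lib/arcade_core_JavaParser.jar",
        args=["example.jar", "deps.rsf", "deps.fv", "org.apache.hadoop"],
    )
    print(out or err)
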
#!/usr/bin/python3
import argparse
import os
@@ -6,16 +8,21 @@
from build_manager import create_packages
from issue_commit_linkage import link_issues_to_commits
from visualization import create_charts
from dependency_analyzer import analyze_dependencies

# Define default paths for input and output directories
DEFAULT_INPUT_DIR = "data/input"
DEFAULT_OUTPUT_DIR = "data/output"


def parse_arguments():
    parser = argparse.ArgumentParser(description="Analyze Hadoop Repository Data")
    parser.add_argument(
        "--repo-path", type=str, required=True, help="Path to the Hadoop repository"
    )
    return parser.parse_args()

def main():
    # Initial configurations
    setup_logging(log_to_file=False)
@@ -23,11 +30,11 @@
    # Define the base directory relative to the script location
    base_dir = os.path.dirname(os.path.abspath(__file__))
    input_dir = os.path.join(base_dir, "../../data/input")
    output_dir = os.path.join(base_dir, "../../data/output")

    commit_file_path = os.path.join(input_dir, "commit_file")
    issues_file_path = os.path.join(input_dir, "issues.json")

    # Access the arguments
    repo_path = args.repo_path
@@ -39,8 +46,12 @@
    create_packages(repo_path, commit_file_path, output_dir)

    # Visualization
    commit_info_file_path = os.path.join(output_dir, "detailed_commits_info.json")
    create_charts(commit_info_file_path, output_dir)

    # Dependency Analysis
    analyze_dependencies(output_dir)


if __name__ == "__main__":
    main()

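# Example invocation (the script filename is assumed here; --repo-path is the
# only required argument defined in parse_arguments() above):
#
#   python3 main.py --repo-path /path/to/hadoop
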
# ---- logging setup module ----
import logging
import sys


def setup_logging(log_to_file=False):
    # Define the logging format
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"

    if log_to_file:
        # Configure logging to file
        logging.basicConfig(
            filename="logs/hadoop_analysis.log",
            filemode="w",
            level=logging.DEBUG,
            format=log_format,
            datefmt=date_format,
        )
    else:
        # Configure logging to console
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(logging.DEBUG)
        console_handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
        logging.basicConfig(level=logging.DEBUG, handlers=[console_handler])

# ---- visualization.py ----
@@ -4,9 +4,10 @@
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages


def load_data(json_path):
    """Load and transform data from a JSON file into a pandas DataFrame."""
    with open(json_path, "r") as file:
        data = json.load(file)

    # Transform the dictionary to a list of values (commit details)
@@ -15,26 +16,46 @@ def load_data(json_path):
    # Normalize the list of dictionaries to create a DataFrame
    return pd.json_normalize(transformed_data)

"""Create a boxplot from the provided data and save to the PDF."""
plt.figure(figsize=(10, 6))
sns.boxplot(x=x, y=y, data=data, palette='Set3')
sns.boxplot(x=x, y=y, data=data, palette="Set3")
plt.title(title)
plt.xlabel(x)
plt.ylabel(y)
plt.tight_layout()
plt.savefig(pdf, format='pdf')
plt.savefig(pdf, format="pdf")
def create_charts(json_file, output_dir):
    data = load_data(json_file)
    print(data.columns)

    # Ensure DMM metrics are accessible by flattening the nested structure
    data["DMM_unit_size"] = data["DMM_metrics.size"]
    data["DMM_unit_complexity"] = data["DMM_metrics.complexity"]
    data["DMM_unit_interfacing"] = data["DMM_metrics.interfacing"]

    with PdfPages(f"{output_dir}/visualization_report.pdf") as pdf:
        create_boxplot(
            data,
            "Decision Type",
            "DMM_unit_size",
            "Size Information per Decision Type",
            pdf,
        )
        create_boxplot(
            data,
            "Decision Type",
            "DMM_unit_complexity",
            "Complexity Information per Decision Type",
            pdf,
        )
        create_boxplot(
            data,
            "Decision Type",
            "DMM_unit_interfacing",
            "Interfacing Information per Decision Type",
            pdf,
        )
