Commit e6f676bb authored by Michael Youkeim

Added dependency analysis

parent cc807afe
Showing 49452 additions and 80 deletions
#!/usr/bin/python3
# ---- build_manager.py ----
import xml.etree.ElementTree as ET
import subprocess
import re
@@ -8,51 +6,72 @@
import os
import shutil
import sys
from packaging import version  # assumed import, not visible in this hunk; version.parse / version.InvalidVersion are used below

def parse_pom_xml(file_path):
    """Extract the Java and Maven versions required by the maven-enforcer-plugin."""
    tree = ET.parse(file_path)
    root = tree.getroot()
    ns = {"maven": "http://maven.apache.org/POM/4.0.0"}
    enforcer_plugin = root.find(
        ".//maven:plugin[maven:artifactId='maven-enforcer-plugin']", ns
    )
    java_version = None
    maven_version = None
    # Explicit None check: a matched Element with no children is falsy.
    if enforcer_plugin is not None:
        java_version = enforcer_plugin.find(
            ".//maven:requireJavaVersion/maven:version", ns
        )
        maven_version = enforcer_plugin.find(
            ".//maven:requireMavenVersion/maven:version", ns
        )
    java_version = java_version.text if java_version is not None else None
    maven_version = maven_version.text if maven_version is not None else None
    return java_version, maven_version

def get_installed_version(command):
    """Run a shell command and return its stripped stdout, or None on failure."""
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            shell=True,
        )
        return result.stdout.strip()
    except Exception as e:
        print("Error obtaining installed version:", e)
        return None

def normalize_version(ver):
    """Reduce a version string such as '1.8.0_292' to its 'major.minor.patch' prefix."""
    ver = ver.replace("_", ".")
    match = re.match(r"(\d+\.\d+\.\d+)", ver)
    return match.group(1) if match else ver

def version_is_compatible(required_version_range, installed_version):
    """Check whether the installed version satisfies the required Maven version spec."""
    if required_version_range and installed_version:
        try:
            normalized_installed_version = normalize_version(installed_version)
            if "," in required_version_range:
                # Range spec such as "[3.3.0,)": only the lower bound is enforced here.
                lower_bound = required_version_range.strip("[]()").split(",")[0].strip()
                if version.parse(normalized_installed_version) >= version.parse(
                    lower_bound
                ):
                    return True
            else:
                # Plain minimum version such as "3.0.5".
                required_version = required_version_range.strip("[]()")
                if version.parse(normalized_installed_version) >= version.parse(
                    required_version
                ):
                    return True
        except version.InvalidVersion as e:
            print(f"Error parsing version: {e}")
    return False

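# Illustrative behaviour of the check above; the version strings are made-up
# examples, not values taken from the Hadoop pom.xml:
#
#   version_is_compatible("[3.3.0,)", "3.6.3")   -> True   (range spec, lower bound met)
#   version_is_compatible("1.8", "11.0.2")       -> True   (plain minimum version)
#   version_is_compatible("[99.0.0,)", "3.6.3")  -> False  (below the lower bound)
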
def copy_jar_files(source_dir, output_dir):
    # if os.path.exists(output_dir):
    #     shutil.rmtree(output_dir)
@@ -70,24 +89,56 @@
            shutil.copy(source_file_path, destination_file_path)
            print(f"Copied: {source_file_path} to {destination_file_path}")

def process_commit_hashes(
    commit_file,
    repo_path,
    output_dir,
    java_required,
    maven_required,
    java_installed,
    maven_installed,
):
    """Check out each listed commit, build hadoop-common, and collect the JARs."""
    with open(commit_file, "r") as file:
        hashes = file.readlines()
    for hash in hashes:
        hash = hash.strip()
        try:
            result = subprocess.run(
                f"cd {repo_path} && git checkout {hash}", shell=True, check=True
            )
            print(f"Checked out {hash}")
            if version_is_compatible(
                java_required, java_installed
            ) and version_is_compatible(maven_required, maven_installed):
                subprocess.run(
                    f"cd {repo_path} && mvn clean package --projects :hadoop-common --also-make -DskipTests",
                    shell=True,
                )
                copy_jar_files(
                    f"{repo_path}/hadoop-common-project/hadoop-common/target",
                    f"{output_dir}/{hash}",
                )
        except subprocess.CalledProcessError:
            print(f"Commit {hash} not found or checkout failed.")

def create_packages(repo_path, commit_file, output_dir):
    """Determine required and installed Java/Maven versions, then build each commit."""
    java_required, maven_required = parse_pom_xml(f"{repo_path}/pom.xml")
    java_installed = get_installed_version(
        "java -version 2>&1 | head -n 1 | awk '{print $3}' | tr -d '\"'"
    )
    maven_installed = get_installed_version(
        "mvn -v | grep 'Apache Maven' | awk '{print $3}'"
    )
    process_commit_hashes(
        commit_file,
        repo_path,
        output_dir,
        java_required,
        maven_required,
        java_installed,
        maven_installed,
    )

# ---- commit_analyzer.py ----
import json
from pydriller import Repository


def analyze_commit(commit):
    """Analyzes a commit to return detailed metrics including DMM metrics."""
    # PyDriller exposes the Delta Maintainability Model (DMM) metrics directly.
    metrics = {
        "size": commit.dmm_unit_size,
        "complexity": commit.dmm_unit_complexity,
        "interfacing": commit.dmm_unit_interfacing,
    }
    return metrics

def get_file_changes(commit):
    """Analyzes file changes in a commit to collect detailed metrics."""
    file_changes = {}
    for mod in commit.modified_files:
        file_changes[mod.filename] = {
            "added_lines": mod.added_lines,
            "deleted_lines": mod.deleted_lines,
            "changed_methods": len(mod.changed_methods),
            "complexity": mod.complexity,
        }
    return file_changes

def commit_details(commit):
    """Collects all required details from a commit."""
    metrics = analyze_commit(commit)
    file_changes = get_file_changes(commit)
    details = {
        "Commit_Hash": commit.hash,
        "Author": commit.author.name,
        "Date": commit.committer_date.strftime("%Y-%m-%d %H:%M:%S"),
        "Parent_Commits": [parent for parent in commit.parents],
        "DMM_metrics": metrics,
        "File_changes": file_changes,
    }
    return details

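# For orientation, a commit_details() result has roughly the following shape.
# All values here are invented placeholders, not output from the Hadoop repository:
#
#   {
#       "Commit_Hash": "e6f676bb...",
#       "Author": "Jane Doe",
#       "Date": "2024-01-15 10:32:00",
#       "Parent_Commits": ["cc807afe..."],
#       "DMM_metrics": {"size": 0.8, "complexity": 0.9, "interfacing": 1.0},
#       "File_changes": {
#           "FooBar.java": {
#               "added_lines": 12,
#               "deleted_lines": 3,
#               "changed_methods": 2,
#               "complexity": 7,
#           }
#       },
#   }
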
# ---- dependency_analyzer.py ----
from jar_runner import run_jar
import os


def analyze_dependencies(output_dir):
    """Run the ARCADE JavaParser JAR over every built snapshot JAR under output_dir."""
    base_dir = os.path.dirname(os.path.abspath(__file__))
    lib_dir = os.path.join(base_dir, "../../lib")
    java_parser_path = os.path.join(lib_dir, "arcade_core_JavaParser.jar")
    for subdir, dirs, files in os.walk(output_dir):
        for file_name in files:
            if not file_name.endswith("SNAPSHOT.jar"):
                continue  # only analyze the built Hadoop snapshot JARs
            full_jar_path = os.path.join(subdir, file_name)
            output_rsf = os.path.join(subdir, "dependencies.rsf")
            output_fv = os.path.join(subdir, "dependencies.fv")
            # Construct arguments for the dependency analyzer JAR
            args = [full_jar_path, output_rsf, output_fv, "org.apache.hadoop"]
            print(f"Analyzing dependencies in {subdir} for JAR {file_name}...")
            stdout, stderr = run_jar(java_parser_path, args=args)
            if stdout:
                print(stdout)
            if stderr:
                print(f"Error processing {file_name}: {stderr}")

# ---- issue_commit_linkage.py ----
@@ -5,28 +5,31 @@
import requests
from pydriller import Repository
from commit_analyzer import commit_details


def read_issue_ids(file_path):
    """Reads a JSON file and returns a list of issue IDs."""
    try:
        with open(file_path, "r") as file:
            data = json.load(file)
        issue_ids = data.get("issue_ids", [])
        logging.info("Issue IDs successfully read from file.")
        return issue_ids
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        return []

def get_issue_type(issue_key):
    """Fetches the issue type from the JIRA API based on issue_key."""
    url = f"https://issues.apache.org/jira/rest/api/latest/issue/{issue_key}"
    response = requests.get(url)
    if response.status_code == 200:
        issue_data = response.json()
        return issue_data["fields"]["issuetype"]["name"]
    else:
        return None

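# The relevant slice of the JIRA REST response looks roughly like this (the
# values are illustrative, not a real HADOOP issue):
#
#   {
#       "key": "HADOOP-12345",
#       "fields": {
#           "issuetype": {"name": "Bug"}
#       }
#   }
#
# so get_issue_type("HADOOP-12345") would return "Bug".
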
def link_issues_to_commits(repo_path, issues_file, output_dir):
    """Links issues to commits and writes relevant information to a JSON file."""
    issue_ids = read_issue_ids(issues_file)
@@ -38,12 +41,14 @@ def link_issues_to_commits(repo_path, issues_file, output_dir):
            if issue_id in commit.msg:
                issue_type = get_issue_type(issue_id)
                commit_info = commit_details(commit)
                commit_info["Decision Type"] = issue_type
                output[issue_id].append(
                    {"Hash": commit.hash, "Parents": commit.parents}
                )
                all_commits[commit.hash] = commit_info

    # Write to JSON files
    with open(os.path.join(output_dir, "issue_commit_relationships.json"), "w") as file:
        json.dump(output, file, indent=4)
    with open(os.path.join(output_dir, "detailed_commits_info.json"), "w") as file:
        json.dump(all_commits, file, indent=4)

# ---- jar_runner.py ----
import subprocess


def run_jar(jar_path, args=None, cwd=None):
    """
    Runs a JAR file with the specified arguments and working directory.

    Args:
        jar_path (str): The path to the JAR file.
        args (list of str, optional): The arguments to pass to the JAR file. Defaults to None.
        cwd (str, optional): The working directory from which to run the JAR. Defaults to None.

    Returns:
        tuple: A tuple containing the standard output and standard error of the JAR execution.
    """
    if args is None:
        args = []
    command = ["java", "-jar", jar_path] + args
    result = subprocess.run(command, capture_output=True, text=True, cwd=cwd)
    if result.stderr:
        print(f"Error running {jar_path}: {result.stderr}")
    return result.stdout, result.stderr

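# Minimal manual usage sketch; the JAR path and arguments below are placeholders
# chosen to mirror analyze_dependencies(), not part of the original tool.
if __name__ == "__main__":
    out, err = run_jar(
        "../../lib/arcade_core_JavaParser.jar",
        args=["example.jar", "deps.rsf", "deps.fv", "org.apache.hadoop"],
    )
    print(out or err)
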
#!/usr/bin/python3
import argparse
import os
@@ -6,16 +8,21 @@
from build_manager import create_packages
from issue_commit_linkage import link_issues_to_commits
from visualization import create_charts
from dependency_analyzer import analyze_dependencies

# Define default paths for input and output directories
DEFAULT_INPUT_DIR = "data/input"
DEFAULT_OUTPUT_DIR = "data/output"


def parse_arguments():
    parser = argparse.ArgumentParser(description="Analyze Hadoop Repository Data")
    parser.add_argument(
        "--repo-path", type=str, required=True, help="Path to the Hadoop repository"
    )
    return parser.parse_args()

def main():
    # Initial configurations
    setup_logging(log_to_file=False)
@@ -23,11 +30,11 @@
    # Define the base directory relative to the script location
    base_dir = os.path.dirname(os.path.abspath(__file__))
    input_dir = os.path.join(base_dir, "../../data/input")
    output_dir = os.path.join(base_dir, "../../data/output")

    commit_file_path = os.path.join(input_dir, "commit_file")
    issues_file_path = os.path.join(input_dir, "issues.json")

    # Access the arguments
    repo_path = args.repo_path
@@ -39,8 +46,12 @@
    create_packages(repo_path, commit_file_path, output_dir)

    # Visualization
    commit_info_file_path = os.path.join(output_dir, "detailed_commits_info.json")
    create_charts(commit_info_file_path, output_dir)

    # Dependency Analysis
    analyze_dependencies(output_dir)


if __name__ == "__main__":
    main()

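# Example invocation (the script filename is assumed here; --repo-path is the
# only required argument defined in parse_arguments() above):
#
#   python3 main.py --repo-path /path/to/hadoop
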
# ---- logging setup module ----
import logging
import sys


def setup_logging(log_to_file=False):
    # Define the logging format
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"

    if log_to_file:
        # Configure logging to file
        logging.basicConfig(
            filename="logs/hadoop_analysis.log",
            filemode="w",
            level=logging.DEBUG,
            format=log_format,
            datefmt=date_format,
        )
    else:
        # Configure logging to console
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(logging.DEBUG)
        console_handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
        logging.basicConfig(level=logging.DEBUG, handlers=[console_handler])

# ---- visualization.py ----
@@ -4,9 +4,10 @@
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages


def load_data(json_path):
    """Load and transform data from a JSON file into a pandas DataFrame."""
    with open(json_path, "r") as file:
        data = json.load(file)

    # Transform the dictionary to a list of values (commit details)
@@ -15,26 +16,46 @@ def load_data(json_path):
    # Normalize the list of dictionaries to create a DataFrame
    return pd.json_normalize(transformed_data)

"""Create a boxplot from the provided data and save to the PDF."""
plt.figure(figsize=(10, 6))
sns.boxplot(x=x, y=y, data=data, palette='Set3')
sns.boxplot(x=x, y=y, data=data, palette="Set3")
plt.title(title)
plt.xlabel(x)
plt.ylabel(y)
plt.tight_layout()
plt.savefig(pdf, format='pdf')
plt.savefig(pdf, format="pdf")
def create_charts(json_file, output_dir):
    data = load_data(json_file)
    print(data.columns)

    # Ensure DMM metrics are accessible by flattening the nested structure
    data["DMM_unit_size"] = data["DMM_metrics.size"]
    data["DMM_unit_complexity"] = data["DMM_metrics.complexity"]
    data["DMM_unit_interfacing"] = data["DMM_metrics.interfacing"]

    with PdfPages(f"{output_dir}/visualization_report.pdf") as pdf:
        create_boxplot(
            data,
            "Decision Type",
            "DMM_unit_size",
            "Size Information per Decision Type",
            pdf,
        )
        create_boxplot(
            data,
            "Decision Type",
            "DMM_unit_complexity",
            "Complexity Information per Decision Type",
            pdf,
        )
        create_boxplot(
            data,
            "Decision Type",
            "DMM_unit_interfacing",
            "Interfacing Information per Decision Type",
            pdf,
        )
