Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
DSSE_Group1
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Neha Pokharel
DSSE_Group1
Commits
9da6dce9
There was an error fetching the commit references. Please try again later.
Commit
9da6dce9
authored
May 13, 2024
by
michaelyoukeim
Browse files
Options
Downloads
Patches
Plain Diff
Eliminated repetitions in the clustering code
parent
33cd98c7
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/hadoop_analysis/clustering.py
+81
-163
81 additions, 163 deletions
src/hadoop_analysis/clustering.py
with
81 additions
and
163 deletions
src/hadoop_analysis/clustering.py
+
81
−
163
View file @
9da6dce9
from
jar_runner
import
run_jar
import
os
import
json
import
pandas
as
pd
from
jar_runner
import
run_jar
def
run_pkg
(
output_dir
):
base_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
lib_dir
=
os
.
path
.
join
(
base_dir
,
"
../../lib
"
)
pkg_jar_path
=
os
.
path
.
join
(
lib_dir
,
"
arcade_core-Pkg.jar
"
)
project_name
=
"
Hadoop common
"
language
=
"
java
"
class
RepoClusterer
:
def
__init__
(
self
,
output_dir
):
self
.
output_dir
=
output_dir
self
.
base_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
self
.
lib_dir
=
os
.
path
.
join
(
self
.
base_dir
,
"
../../lib
"
)
self
.
project_name
=
"
Hadoop common
"
self
.
language
=
"
java
"
def find_rsf_file(self, dir_path):
    """Return the full path of ``dependencies.rsf`` inside *dir_path*.

    Returns None when the file does not exist.
    """
    candidate = os.path.join(dir_path, "dependencies.rsf")
    return candidate if os.path.exists(candidate) else None
def run_jar_with_logging(self, jar_path, args, cwd, log_filename):
    """Execute a jar via ``run_jar`` and persist its output to a log file.

    Writes stdout (or stderr when stdout is empty) to *log_filename*
    inside *cwd* and returns the log file's full path.
    """
    out, err = run_jar(jar_path, args=args, cwd=cwd)
    log_path = os.path.join(cwd, log_filename)
    with open(log_path, "w") as fh:
        fh.write(out if out else err)
    return log_path
def run_pkg(self):
    """Run the PKG clustering jar on every commit directory.

    Walks ``self.output_dir``; each subdirectory name is treated as the
    commit ID (project version). Directories containing a
    ``dependencies.rsf`` are clustered and the jar's output is written to
    ``pkg_output.log`` via ``run_jar_with_logging``.
    """
    jar_path = os.path.join(self.lib_dir, "arcade_core-Pkg.jar")
    file_level = "true"  # PKG expects a string flag, not a Python bool
    for subdir, dirs, _files in os.walk(self.output_dir):
        for dir_name in dirs:
            # The name of the directory is used as the project_version (commit ID)
            dir_path = os.path.join(subdir, dir_name)
            rsf_file_path = self.find_rsf_file(dir_path)
            if not rsf_file_path:
                continue
            # One entry per PKG option; duplicates from the old code removed.
            args = [
                f"depspath={rsf_file_path}",
                f"projectpath={dir_path}",
                f"projectname={self.project_name}",
                f"projectversion={dir_name}",
                f"language={self.language}",
                f"filelevel={file_level}",
            ]
            log_path = self.run_jar_with_logging(
                jar_path, args, dir_path, "pkg_output.log"
            )
            print(
                f"PKG Clustering completed for commit {dir_name}. "
                f"Results saved to {log_path}"
            )
def run_acdc(self):
    """Run the ACDC clustering jar on every commit directory.

    For each subdirectory of ``self.output_dir`` that contains a
    ``dependencies.rsf``, runs ACDC with that file as input and writes
    ``<stem>_acdc_output.rsf`` (and a matching ``.log``) alongside it.
    """
    jar_path = os.path.join(self.lib_dir, "arcade_core-ACDC.jar")
    for subdir, dirs, _files in os.walk(self.output_dir):
        for dir_name in dirs:
            dir_path = os.path.join(subdir, dir_name)
            rsf_path = self.find_rsf_file(dir_path)
            if not rsf_path:
                continue
            # find_rsf_file already returns the full path; joining it onto
            # dir_path again (as the previous code did) duplicated the
            # directory prefix. Derive the output stem from the basename.
            stem = os.path.splitext(os.path.basename(rsf_path))[0]
            args = [
                rsf_path,
                os.path.join(dir_path, f"{stem}_acdc_output.rsf"),
            ]
            log_path = self.run_jar_with_logging(
                jar_path, args, dir_path, f"{stem}_acdc_output.log"
            )
            print(
                f"ACDC run completed for {rsf_path}. "
                f"Output generated: {log_path}"
            )
def run_clusterer(self):
    """Run the Limbo algorithm of the clusterer jar on every commit directory.

    Walks ``self.output_dir``; each subdirectory name is the commit ID
    (project version). Output is logged to ``clusterer_output_limbo.log``
    in the commit directory. (The earlier WCA/UEM and WCA/UEMNM runs were
    dead code — commented out — and are not reproduced here.)
    """
    jar_path = os.path.join(self.lib_dir, "arcade_core_clusterer.jar")
    for subdir, dirs, _files in os.walk(self.output_dir):
        for dir_name in dirs:
            dir_path = os.path.join(subdir, dir_name)
            rsf_path = self.find_rsf_file(dir_path)
            if not rsf_path:
                continue
            # find_rsf_file returns the full path already; do not join it
            # onto dir_path again (the old code duplicated the prefix).
            args = [
                "-Xmx14024m",  # JVM heap size forwarded to the jar runner
                f"deps={rsf_path}",
                f"projpath={dir_path}",
                "measure=UEM",
                f"projname={self.project_name}",
                f"projversion={dir_name}",
                f"language={self.language}",
                "algo=Limbo",
            ]
            log_path = self.run_jar_with_logging(
                jar_path, args, dir_path, "clusterer_output_limbo.log"
            )
            print(
                f"Limbo clustering run completed for {dir_name}. "
                f"Results saved to {log_path}"
            )
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment