Skip to content
Snippets Groups Projects
Commit df396bac authored by Jayesh's avatar Jayesh
Browse files

consider as stash

parent 2e780b89
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
from jira import JIRA, JIRAError
```
%% Cell type:code id: tags:
``` python
# Raw string (r"...") so the backslashes in the Windows path are kept literal:
# "\D", "\I" are invalid escape sequences in a normal string literal
# (DeprecationWarning today, SyntaxError in future Python versions).
issue_list = pd.read_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Issues_2nd.xlsx", sheet_name="Yarn")
```
%% Cell type:code id: tags:
``` python
# Pull the issue keys out as a plain Python list; the bare trailing
# expression echoes the key count in the notebook output (1535).
issue_keys = issue_list["Issue key"].tolist()
len(issue_keys)
```
%% Output
1535
%% Cell type:markdown id: tags:
# Point 1 & 2
%% Cell type:code id: tags:
``` python
# Base URL of the public Apache JIRA instance.
APACHE_JIRA_SERVER = 'https://issues.apache.org/jira/'
# Unauthenticated client (no credentials passed) — used for read-only queries.
jira = JIRA(APACHE_JIRA_SERVER)
```
%% Cell type:code id: tags:
``` python
# JQL template: the placeholder receives a comma-separated batch of issue keys.
jql_query = 'issuekey in ({}) AND project = YARN'
# Issue fields to request: a list for per-field iteration, joined into the
# comma-separated string form the JIRA API expects.
fields_arr = ["parent", "summary", "description", "issuetype", "status", "comment"]
fields = ",".join(fields_arr)
```
%% Cell type:code id: tags:
``` python
# For structured JIRA fields, which sub-key of the raw JSON object holds the
# human-readable value we want to keep.
jira_raw_data_key_map = dict(
    issuetype="name",
    parent="key",
    status="name",
)
```
%% Cell type:code id: tags:
``` python
# Fetch the requested fields for every issue key, batching the JQL "in (...)"
# clause so each query stays within JIRA's result-window limits.
issue_data = []
key_window = 300
for start in range(0, len(issue_keys), key_window):
    batch = issue_keys[start:start + key_window]
    response = jira.search_issues(
        jql_query.format(','.join(batch)), fields=fields, maxResults=key_window
    )
    for issue in response:
        raw_fields = issue.raw['fields']
        row = [issue.key]
        for field in fields_arr:
            if field not in raw_fields:
                row.append(None)
            elif field == 'comment':
                # Keep only the comment bodies, as a list per issue.
                row.append([c['body'] for c in raw_fields[field]['comments']])
            elif field in jira_raw_data_key_map:
                # Structured field: extract the mapped sub-key of the object.
                row.append(raw_fields[field][jira_raw_data_key_map[field]])
            else:
                # Plain scalar field (summary, description).
                row.append(raw_fields[field])
        issue_data.append(row)
```
%% Cell type:code id: tags:
``` python
# Assemble the scraped rows into a DataFrame; column order matches the order
# fields were appended per issue (key first, then the requested fields).
column_names = ["Issue key", "Parent", "Summary", "Description", "Issue Type", "Status", "Comments"]
issue_data_df = pd.DataFrame(issue_data, columns=column_names)
issue_data_df.head()
```
%% Output
Issue key Parent Summary \
0 YARN-10930 YARN-10888 Introduce universal configured capacity vector
1 YARN-10562 None Follow up changes for YARN-9833
2 YARN-10514 None Introduce a dominant resource based schedule p...
3 YARN-10494 YARN-9014 CLI tool for docker-to-squashfs conversion (pu...
4 YARN-10493 YARN-9014 RunC container repository v2
Description Issue Type \
0 The proposal is to introduce a capacity resour... Sub-task
1 In YARN-9833, a race condition in DirectoryCol... Improvement
2 When we schedule in multi node lookup policy f... Improvement
3 *YARN-9564* defines a docker-to-squashfs image... Sub-task
4 The current runc container repository design h... Sub-task
Status Comments
0 Resolved []
1 Resolved [Attaching a patch to show the new approach.\r...
2 Patch Available [[~leftnoteasy] [~tangzhankun] [~prabhujoseph]...
3 Open [Hey [~ccondit], thanks for the document. I'm ...
4 Open [I have an initial PR to address the improveme...
%% Cell type:code id: tags:
``` python
# Local import: in the original notebook `re` was only imported in a LATER
# cell, so running cells top-to-bottom raised NameError here.
import re

# ASCII control characters that openpyxl refuses to write into .xlsx cells.
# Compiled once at module level instead of on every call.
_ILLEGAL_CHARACTERS_RE = re.compile(r'[\x00-\x08\x0b-\x0c\x0e-\x1f]')


def remove_illegal_characters(cell):
    """Strip Excel-illegal control characters from a cell value.

    Non-string values are returned unchanged; strings are returned with
    any control character in the illegal range removed.
    """
    if isinstance(cell, str):
        return _ILLEGAL_CHARACTERS_RE.sub('', cell)
    return cell
```
%% Cell type:code id: tags:
``` python
# DataFrame.applymap is deprecated (the original run emitted a FutureWarning
# saying to use DataFrame.map); .map is the elementwise replacement.
issue_data_df = issue_data_df.map(remove_illegal_characters)
```
%% Output
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_23156\4278595474.py:1: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
issue_data_df = issue_data_df.applymap(remove_illegal_characters)
%% Cell type:code id: tags:
``` python
# Raw string so the Windows path's backslashes are not escape sequences
# ("\D", "\W" are invalid escapes in a normal string literal).
issue_data_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_issue_data.xlsx", index=False)
```
%% Cell type:markdown id: tags:
# Point 3
%% Cell type:code id: tags:
``` python
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
```
%% Cell type:code id: tags:
``` python
# Fetch the NLTK resources used below: the Punkt tokenizer models, the
# English stop-word list, and WordNet data for lemmatization. Downloads
# are cached locally, so re-running this cell is cheap.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
```
%% Output
[nltk_data] Downloading package punkt to
[nltk_data] C:\Users\Jayesh\AppData\Roaming\nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data] C:\Users\Jayesh\AppData\Roaming\nltk_data...
[nltk_data] Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data] C:\Users\Jayesh\AppData\Roaming\nltk_data...
%% Cell type:code id: tags:
True
``` python
# Model required by nltk.pos_tag (used for POS-aware lemmatization below).
nltk.download('averaged_perceptron_tagger')
```
%% Cell type:code id: tags:
``` python
# merge summary and description
# Merge summary and description into one text field. fillna('') keeps rows
# whose Description is missing (None) from producing NaN here — downstream
# clean_text() calls str() on the value, which would otherwise inject the
# literal token "nan" into the corpus.
issue_data_df['Summary_Description'] = (
    issue_data_df['Summary'].fillna('') + ' ' + issue_data_df['Description'].fillna('')
)
```
%% Cell type:code id: tags:
``` python
# Text cleaning for the concatenated issue summary + description.
def clean_text(text):
    """Normalise raw issue text for tokenization.

    Coerces the value to str, lowercases it, strips everything except
    ASCII letters and whitespace, then collapses whitespace runs into
    single spaces and trims the ends.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()
def tokenize_text(text):
    """Split cleaned text into word tokens via NLTK's word tokenizer."""
    tokens = word_tokenize(text)
    return tokens
```
%% Cell type:code id: tags:
``` python
# Clean the merged text in place, then tokenize it into a token-list column.
issue_data_df['Summary_Description'] = issue_data_df['Summary_Description'].apply(clean_text)
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description'].apply(tokenize_text)
```
%% Cell type:code id: tags:
``` python
def remove_stopwords(tokens):
    """Drop English stop words from a token list, preserving token order."""
    stop_words = frozenset(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]
```
%% Cell type:code id: tags:
``` python
# Filter English stop words out of each issue's token list.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(remove_stopwords)
```
%% Cell type:code id: tags:
``` python
# pos_tags = pos_tag(issue_data_df['Summary_Description_Tokens'].tolist()[0])
# issue_data_df['Summary_Description_Tokens'].tolist()[0]
```
%% Cell type:code id: tags:
``` python
# Quick sanity check of NLTK's POS tagger on a single word.
pos_tag(['running'])
```
%% Cell type:code id: tags:
``` python
def pos_tag_mapping(tag):
    """Map a Penn Treebank POS tag to a WordNet POS letter.

    'J...' (adjective) -> 'a', 'N...' -> 'n', 'R...' (adverb) -> 'r',
    'V...' -> 'v'; anything else defaults to noun ('n').

    Rewritten from the original lambda (PEP 8 discourages assigning a
    lambda to a name), which also had an unreachable 'a' branch: it only
    tested for 'j' after requiring the first letter to be in
    {'n', 'r', 'v'}, so adjective tags incorrectly fell through to 'n'.
    """
    first = tag[0].lower()
    if first == 'j':
        return 'a'
    if first in ('n', 'r', 'v'):
        return first
    return 'n'
```
%% Cell type:code id: tags:
``` python
def lemmatize_text(tokens):
    """Lemmatize each token with WordNet's lemmatizer (default noun POS)."""
    # NOTE(review): the original had an unreachable second return after this
    # one that attempted POS-aware lemmatization via pos_tag_mapping; it has
    # been removed as dead code. If POS-aware lemmatization was the intent,
    # make that variant the live return instead.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]
```
%% Cell type:code id: tags:
``` python
# Sanity check: lemmatize a single token.
lemmatize_text(['running'])
```
%% Cell type:code id: tags:
``` python
# NOTE(review): this looks like a second application of stop-word removal to
# the same column; harmless if the filter is idempotent, but likely redundant
# — confirm whether this cell was meant to be re-run or can be dropped.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(remove_stopwords)
```
%% Cell type:code id: tags:
``` python
# Lemmatize each issue's token list.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(lemmatize_text)
```
%% Cell type:code id: tags:
``` python
# The merged raw-text column is no longer needed once tokens exist; drop it.
issue_data_df.drop(columns=['Summary_Description'], inplace=True)
```
%% Cell type:code id: tags:
``` python
# Raw string so the Windows path's backslashes are not escape sequences.
issue_data_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_issue_data_cleaned.xlsx", index=False)
```
%% Cell type:markdown id: tags:
# Point 4
%% Cell type:code id: tags:
``` python
# Build a term-frequency table of every token across all issues.
# collections.Counter replaces the hand-rolled if/else counting loop;
# Counter is a dict subclass, so `unique_words` keeps its original shape.
from collections import Counter

unique_words = Counter()
for tokens in issue_data_df['Summary_Description_Tokens']:
    unique_words.update(tokens)
unique_words_df = pd.DataFrame(list(unique_words.items()), columns=['Word', 'Count'])
unique_words_df = unique_words_df.sort_values('Count', ascending=False).reset_index(drop=True)
unique_words_df.head(10)
```
%% Output
%% Cell type:code id: tags:
Word Count
0 yarn 1531
1 container 791
2 rm 716
3 application 666
4 containers 661
5 support 647
6 node 618
7 queue 595
8 resource 582
9 cluster 442
``` python
# Raw string so the Windows path's backslashes are not escape sequences.
unique_words_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_unique_words.xlsx", index=False)
```
%% Cell type:code id: tags:
``` python
# Raw string for the Windows path; re-save and preview the top-20 words.
# NOTE(review): this duplicates the save in the previous cell — confirm
# whether both writes are intended.
unique_words_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_unique_words.xlsx", index=False)
unique_words_df.head(20)
```
......
This diff is collapsed.
%% Cell type:code id: tags:
``` python
import pandas as pd
import os
import sys
from plotly import express as px
```
%% Cell type:code id: tags:
``` python
# Raw string so the Windows path's backslashes are kept literal ("\D", "\F",
# "\W", "\S" are invalid escape sequences in a normal string literal).
metric_excel = r"E:\DSSE\DSSE-Group-7\Final Submission\Week1 Submission\Step4.xlsx"
metric_df = pd.read_excel(metric_excel)
```
%% Cell type:code id: tags:
``` python
# Process-metric columns for the correlation analysis: DMM maintainability
# proxies (unit size / complexity / interfacing) plus line- and method-level
# churn counts.
relevant_columns = ['dmm_unit_size', 'dmm_unit_complexity', 'dmm_unit_interfacing',
'added_lines', 'deleted_lines', 'added_methods',
'deleted_method', 'modified_methods']
```
%% Cell type:code id: tags:
``` python
# Spearman rank correlation — non-parametric, so it suits the ordinal/skewed
# churn metrics; coefficients rounded to 4 decimal places for display.
correlation_matrix = metric_df[relevant_columns].corr(method='spearman').round(4)
correlation_matrix
```
%% Output
dmm_unit_size dmm_unit_complexity \
dmm_unit_size 1.0000 0.7144
dmm_unit_complexity 0.7144 1.0000
dmm_unit_interfacing 0.2643 0.4608
added_lines -0.0992 -0.0163
deleted_lines 0.1031 0.0224
added_methods -0.0538 -0.0208
deleted_method 0.0217 -0.0844
modified_methods -0.0660 -0.0770
dmm_unit_interfacing added_lines deleted_lines \
dmm_unit_size 0.2643 -0.0992 0.1031
dmm_unit_complexity 0.4608 -0.0163 0.0224
dmm_unit_interfacing 1.0000 0.0163 -0.0219
added_lines 0.0163 1.0000 -0.3796
deleted_lines -0.0219 -0.3796 1.0000
added_methods -0.0217 0.7105 -0.3601
deleted_method -0.1094 -0.2010 0.3525
modified_methods -0.0855 0.5465 -0.0173
added_methods deleted_method modified_methods
dmm_unit_size -0.0538 0.0217 -0.0660
dmm_unit_complexity -0.0208 -0.0844 -0.0770
dmm_unit_interfacing -0.0217 -0.1094 -0.0855
added_lines 0.7105 -0.2010 0.5465
deleted_lines -0.3601 0.3525 -0.0173
added_methods 1.0000 -0.1462 0.7923
deleted_method -0.1462 1.0000 0.2344
modified_methods 0.7923 0.2344 1.0000
%% Cell type:code id: tags:
``` python
# Heat-map of the correlation matrix: coefficients annotated in each cell,
# diverging red-blue scale pinned to the full [-1, 1] range.
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu',
    title="Correlation Matrix of Metrics from Week1",
)
fig.show()
```
%% Output
%% Cell type:code id: tags:
``` python
# Load the Week-3 Task-3 architectural-similarity output (a2a score and
# cluster coverage per commit pair) produced by the ACDC recovery technique.
acdc_csv_path = r"E:\DSSE\DSSE-Group-7\Final Submission\Week3 Submission\Task 3\a2a_cvg_outputs\ACDC.csv"
a2a_acdc = pd.read_csv(acdc_csv_path)
a2a_acdc.head()
```
%% Output
child \
0 babd19de331c875a1dffee908617c07c3e1eb31b
1 dbd07f9e8c2824cdb04d44d07d27c2b56f68c1d5
2 f0331cfd016219e416ef34f21b01973ec4ccf4c9
3 940389afce6a1b9b9e1519aed528cbc444786756
4 ea50f154077e724cc1b2fe15565ede2f2dc2e6f4
parent a2a cvg_child_to_parent \
0 01b83a1e32b69399440843665ae2c8134d9d0e24 99.953030 1.000000
1 ebb236ef9c00592c592f8d5bb885e7dfd4d05c3a 99.622532 1.000000
2 7578282edce0eba3f24c96355c944c02156ece79 100.000000 1.000000
3 fe0ddc03e115df0e3c17b0c0f9b9376abb817688 98.825155 0.973684
4 366b1b1dd6f1ade1996c7c0eec1aca185c68d6cb 99.410464 1.000000
cvg_parent_to_child
0 1.000000
1 1.000000
2 1.000000
3 0.973684
4 1.000000
%% Cell type:code id: tags:
``` python
# One row of DMM metrics per commit, attached to the ACDC rows by commit hash.
# drop_duplicates is applied non-inplace directly to the column selection:
# the original called it with inplace=True on a slice of metric_df, which
# triggered pandas' SettingWithCopyWarning (visible in the cell output).
dmm_df = metric_df[['commit_hash', 'dmm_unit_size', 'dmm_unit_complexity',
                    'dmm_unit_interfacing', 'complexity']].drop_duplicates(subset='commit_hash')
combined_df = a2a_acdc.merge(dmm_df, left_on='child', right_on='commit_hash', how='left')
```
%% Output
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_7212\1058912344.py:2: SettingWithCopyWarning:
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_17376\1058912344.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
%% Cell type:code id: tags:
``` python
combined_df.head()
```
%% Output
child \
0 babd19de331c875a1dffee908617c07c3e1eb31b
1 dbd07f9e8c2824cdb04d44d07d27c2b56f68c1d5
2 f0331cfd016219e416ef34f21b01973ec4ccf4c9
3 940389afce6a1b9b9e1519aed528cbc444786756
4 ea50f154077e724cc1b2fe15565ede2f2dc2e6f4
parent a2a cvg_child_to_parent \
0 01b83a1e32b69399440843665ae2c8134d9d0e24 99.953030 1.000000
1 ebb236ef9c00592c592f8d5bb885e7dfd4d05c3a 99.622532 1.000000
2 7578282edce0eba3f24c96355c944c02156ece79 100.000000 1.000000
3 fe0ddc03e115df0e3c17b0c0f9b9376abb817688 98.825155 0.973684
4 366b1b1dd6f1ade1996c7c0eec1aca185c68d6cb 99.410464 1.000000
cvg_parent_to_child commit_hash \
0 1.000000 babd19de331c875a1dffee908617c07c3e1eb31b
1 1.000000 dbd07f9e8c2824cdb04d44d07d27c2b56f68c1d5
2 1.000000 f0331cfd016219e416ef34f21b01973ec4ccf4c9
3 0.973684 940389afce6a1b9b9e1519aed528cbc444786756
4 1.000000 ea50f154077e724cc1b2fe15565ede2f2dc2e6f4
dmm_unit_size dmm_unit_complexity dmm_unit_interfacing complexity
0 0.186813 0.186813 0.807692 0.0
1 0.517544 0.820175 0.728070 0.0
2 NaN NaN NaN 0.0
3 0.337093 0.428571 0.873434 0.0
4 0.703448 1.000000 0.648276 0.0
%% Cell type:code id: tags:
``` python
# Correlate architectural similarity / coverage scores with the DMM quality
# proxies (Spearman, as above), rounded to 4 decimals for display.
relevant_columns = ['a2a', 'cvg_child_to_parent', 'cvg_parent_to_child',
                    'dmm_unit_size', 'dmm_unit_complexity',
                    'dmm_unit_interfacing', 'complexity']
correlation_matrix = combined_df[relevant_columns].corr(method='spearman').round(4)
correlation_matrix
```
%% Output
a2a cvg_child_to_parent cvg_parent_to_child \
a2a 1.0000 0.6280 0.5070
cvg_child_to_parent 0.6280 1.0000 0.8859
cvg_parent_to_child 0.5070 0.8859 1.0000
dmm_unit_size -0.0258 0.2069 0.3412
dmm_unit_complexity -0.3104 0.0662 0.2000
dmm_unit_interfacing 0.0358 0.0665 0.2032
complexity 0.1069 -0.0070 -0.0450
dmm_unit_size dmm_unit_complexity \
a2a -0.0258 -0.3104
cvg_child_to_parent 0.2069 0.0662
cvg_parent_to_child 0.3412 0.2000
dmm_unit_size 1.0000 0.5546
dmm_unit_complexity 0.5546 1.0000
dmm_unit_interfacing 0.1255 0.4446
complexity -0.1836 -0.4413
dmm_unit_interfacing complexity
a2a 0.0358 0.1069
cvg_child_to_parent 0.0665 -0.0070
cvg_parent_to_child 0.2032 -0.0450
dmm_unit_size 0.1255 -0.1836
dmm_unit_complexity 0.4446 -0.4413
dmm_unit_interfacing 1.0000 -0.0349
complexity -0.0349 1.0000
%% Cell type:code id: tags:
``` python
# Same heat-map styling as the Week-1 matrix: annotated coefficients,
# diverging red-blue scale pinned to [-1, 1].
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu',
    title="Correlation Matrix from Week3 stats and DMM",
)
fig.show()
```
%% Output
......
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment