Skip to content
Snippets Groups Projects
Commit f68704ec authored by Jayesh's avatar Jayesh
Browse files

update bag of words

parent e9bf9e00
No related branches found
No related tags found
No related merge requests found
**/.venv/
*.pyc
RSF_FV/
.idea/
\ No newline at end of file
%% Cell type:code id: tags:
``` python
import pandas as pd
from jira import JIRA, JIRAError
```
%% Cell type:code id: tags:
``` python
# Load the curated issue list for the YARN project.
# Raw string avoids invalid escape sequences (\D, \A, \I) in the Windows
# path, which are a SyntaxWarning in recent Python and a future error.
issue_list = pd.read_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Issues_2nd.xlsx", sheet_name="Yarn")
```
%% Cell type:code id: tags:
``` python
# Extract all issue keys to fetch; the cell's value shows how many we will query.
issue_keys = list(issue_list["Issue key"])
len(issue_keys)
```
%% Cell type:markdown id: tags:
# Point 1 & 2
%% Cell type:code id: tags:
``` python
# Anonymous (unauthenticated) connection to the public Apache JIRA instance.
APACHE_JIRA_SERVER = 'https://issues.apache.org/jira/'
jira = JIRA(APACHE_JIRA_SERVER)
```
%% Cell type:code id: tags:
``` python
# JQL template (issue keys substituted in batches) and the issue fields we
# request from the JIRA API, both as a comma-joined string (for the request)
# and as a list (for iterating the response).
jql_query = 'issuekey in ({}) AND project = YARN'
fields_arr = ["parent", "summary", "description", "issuetype", "status", "comment"]
fields = ",".join(fields_arr)
```
%% Cell type:code id: tags:
``` python
# For structured JIRA fields, the sub-key that holds the scalar value we
# actually want (e.g. issuetype is a dict whose "name" carries the label).
jira_raw_data_key_map = dict(
    issuetype="name",
    parent="key",
    status="name",
)
```
%% Cell type:code id: tags:
``` python
# Fetch the requested fields for every issue key, querying JIRA in batches
# (a single JQL "issuekey in (...)" clause cannot hold all keys at once).
# Each issue becomes one row: [key, <one value per field in fields_arr>].
issue_data = []
key_window = 300  # batch size; also passed as maxResults so one call covers the batch
for idx in range(0, len(issue_keys), key_window):
    issue_keys_subset = issue_keys[idx:idx+key_window]
    query = jql_query.format(','.join(issue_keys_subset))
    response = jira.search_issues(query, fields=fields, maxResults=key_window)
    for issue in response:
        data = [issue.key]
        for field in fields_arr:
            if field in issue.raw['fields']:
                if field == 'comment':
                    # The comment field nests the comments; keep only their text bodies.
                    comments = issue.raw['fields'][field]['comments']
                    comment_text = []
                    for comment in comments:
                        comment_text.append(comment['body'])
                    data.append(comment_text)
                    continue
                if field in jira_raw_data_key_map:
                    # Structured field (issuetype/parent/status): take the scalar sub-key.
                    data.append(issue.raw['fields'][field][jira_raw_data_key_map[field]])
                else:
                    data.append(issue.raw['fields'][field])
            else:
                # Field absent on this issue (e.g. no parent): keep column alignment.
                data.append(None)
        issue_data.append(data)
```
%% Cell type:code id: tags:
``` python
# Turn the fetched rows into a DataFrame; column order matches the row layout
# built above ([key] + fields_arr order).
column_names = ["Issue key", "Parent", "Summary", "Description", "Issue Type", "Status", "Comments"]
issue_data_df = pd.DataFrame(issue_data, columns=column_names)
issue_data_df.head()
```
%% Cell type:code id: tags:
``` python
import re
```
%% Cell type:code id: tags:
``` python
# Compiled once at module level: recompiling the pattern on every call is
# wasted work when this function is applied to every cell of the DataFrame.
# Matches C0 control characters except tab/newline/carriage-return, which
# Excel (openpyxl) refuses to store.
_ILLEGAL_CHARACTERS_RE = re.compile(r'[\x00-\x08\x0b-\x0c\x0e-\x1f]')

def remove_illegal_characters(cell):
    """
    Remove characters Excel cannot store from a cell value.

    Parameters
    ----------
    cell : object
        Any cell value; only ``str`` values are modified.

    Returns
    -------
    object
        The string with illegal control characters stripped, or the
        original value unchanged for non-string cells.
    """
    if isinstance(cell, str):
        return _ILLEGAL_CHARACTERS_RE.sub('', cell)
    return cell
```
%% Cell type:code id: tags:
``` python
# Sanitize every cell so the to_excel() call below does not fail on control
# characters. NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1
# in favour of DataFrame.map — switch when the pandas version is confirmed.
issue_data_df = issue_data_df.applymap(remove_illegal_characters)
```
%% Cell type:code id: tags:
``` python
# Persist the raw fetched data. Raw string avoids invalid escape sequences
# (\D, \A, \W, \y) in the Windows path.
issue_data_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_issue_data.xlsx", index=False)
```
%% Cell type:markdown id: tags:
# Point 3
%% Cell type:code id: tags:
``` python
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
```
%% Cell type:code id: tags:
``` python
# Fetch the NLTK resources needed below: 'punkt' for tokenization,
# 'stopwords' for stop-word removal, 'wordnet' for lemmatization.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
```
%% Cell type:code id: tags:
``` python
# Tagger model required by nltk.pos_tag, used during lemmatization below.
nltk.download('averaged_perceptron_tagger')
```
%% Cell type:code id: tags:
``` python
# Merge summary and description into one text field.
# fillna('') prevents a missing (NaN) summary or description from turning
# the whole concatenation into NaN, which clean_text below would otherwise
# stringify into the spurious token "nan".
issue_data_df['Summary_Description'] = (
    issue_data_df['Summary'].fillna('') + ' ' + issue_data_df['Description'].fillna('')
)
```
%% Cell type:code id: tags:
``` python
import re
# Perform text cleaning and tokenization for the concatenation of issue summary and description
def clean_text(text):
    """
    Normalize raw text for tokenization.

    Coerces the input to ``str``, lowercases it, strips everything except
    ASCII letters and whitespace, and collapses runs of whitespace to a
    single space.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()
def replace_abbr(text):
    """
    Expand YARN-specific abbreviations when they occur as standalone words.

    Word boundaries (\\b) ensure substrings inside longer words (e.g. the
    "rm" in "farm") are left untouched.
    """
    expansions = (
        (r"\brm\b", "ResourceManager"),
        (r"\bnm\b", "NodeManager"),
        (r"\bam\b", "ApplicationMaster"),
        (r"\bacl\b", "AccessControlList"),
        (r"\bha\b", "HighAvailability"),
    )
    for abbr_pattern, full_name in expansions:
        text = re.sub(abbr_pattern, full_name, text)
    return text
def tokenize_text(text):
    """
    Tokenize cleaned text into a list of word tokens.

    Thin wrapper over NLTK's word_tokenize (requires the 'punkt' resource
    downloaded in an earlier cell).
    """
    return word_tokenize(text)
```
%% Cell type:code id: tags:
``` python
# Cleaning pipeline: normalize text, expand abbreviations (after lowercasing,
# so patterns like \brm\b match), then tokenize into a separate column.
issue_data_df['Summary_Description'] = issue_data_df['Summary_Description'].apply(clean_text)
issue_data_df['Summary_Description'] = issue_data_df['Summary_Description'].apply(replace_abbr)
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description'].apply(tokenize_text)
```
%% Cell type:code id: tags:
``` python
# Built once at definition time: rebuilding the stop-word set on every call
# is wasted work when this function is applied to every row of the DataFrame.
_STOP_WORDS = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """
    Remove English stop words from a token list.

    Parameters
    ----------
    tokens : list[str]
        Tokenized, lowercased words.

    Returns
    -------
    list[str]
        Tokens with stop words removed, original order preserved.
    """
    return [word for word in tokens if word not in _STOP_WORDS]
```
%% Cell type:code id: tags:
``` python
# Drop stop words from each issue's token list.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(remove_stopwords)
```
%% Cell type:code id: tags:
``` python
# pos_tags = pos_tag(issue_data_df['Summary_Description_Tokens'].tolist()[0])
# issue_data_df['Summary_Description_Tokens'].tolist()[0]
```
%% Cell type:code id: tags:
``` python
# Sanity check: inspect the POS tag NLTK assigns to a single word.
pos_tag(['running'])
```
%% Cell type:code id: tags:
``` python
def pos_tag_mapping(tag):
    """
    Map a Penn Treebank POS tag to a WordNet POS constant for lemmatization.

    Fixes a bug in the original lambda: the adjective branch ('j' -> 'a')
    was unreachable because 'j' was tested against ['n', 'r', 'v'] first,
    so adjectives (JJ/JJR/JJS) were lemmatized as nouns. Unknown tags
    still default to noun ('n'), matching WordNet's default.
    """
    first = tag[0].lower()
    if first == 'j':
        return 'a'  # WordNet uses 'a' for adjectives
    if first in ('n', 'r', 'v'):
        return first
    return 'n'
```
%% Cell type:code id: tags:
``` python
def lemmatize_text(tokens):
    """
    Lemmatize a token list using WordNet with POS-aware lemmas.

    Improvement over the original: the whole token list is tagged with a
    single pos_tag call instead of tagging each word in isolation — this is
    both far cheaper (one tagger invocation per row instead of per word)
    and more accurate, since the tagger can use sentence context.

    Parameters
    ----------
    tokens : list[str]
        Word tokens to lemmatize.

    Returns
    -------
    list[str]
        Lemmatized tokens, order preserved.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos=pos_tag_mapping(tag))
            for word, tag in pos_tag(tokens)]
```
%% Cell type:code id: tags:
``` python
# Sanity check the lemmatizer on a single token.
lemmatize_text(['running'])
```
%% Cell type:code id: tags:
``` python
# NOTE(review): duplicate of the earlier stop-word removal cell — harmless
# because the operation is idempotent, but redundant; consider removing.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(remove_stopwords)
```
%% Cell type:code id: tags:
``` python
# Lemmatize each issue's token list.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(lemmatize_text)
```
%% Cell type:code id: tags:
``` python
# Drop the intermediate merged-text column; only the token column is kept.
issue_data_df.drop(columns=['Summary_Description'], inplace=True)
```
%% Cell type:code id: tags:
``` python
# Persist the cleaned/tokenized data. Raw string avoids invalid escape
# sequences (\D, \A, \W, \y) in the Windows path.
issue_data_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_issue_data_cleaned.xlsx", index=False)
```
%% Cell type:markdown id: tags:
# Point 4
%% Cell type:code id: tags:
``` python
from collections import Counter

# Count how often each unique token appears across all issues.
# Counter replaces the hand-rolled if/else increment loop with the stdlib
# equivalent (same result: a word -> count mapping).
unique_words = Counter()
for tokens in issue_data_df['Summary_Description_Tokens']:
    unique_words.update(tokens)

unique_words_df = pd.DataFrame(list(unique_words.items()), columns=['Word', 'Count'])
unique_words_df = unique_words_df.sort_values('Count', ascending=False).reset_index(drop=True)
unique_words_df.head(10)
```
%% Cell type:code id: tags:
``` python
# Persist the word-frequency table. Raw string avoids invalid escape
# sequences (\D, \A, \W, \y) in the Windows path.
unique_words_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_unique_words.xlsx", index=False)
```
%% Cell type:code id: tags:
``` python
# Inspect the 20 most frequent words.
unique_words_df.head(20)
```
......
No preview for this file type
No preview for this file type
No preview for this file type
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment