Skip to content
Snippets Groups Projects
Commit f68704ec authored by Jayesh's avatar Jayesh
Browse files

update bag of words

parent e9bf9e00
No related branches found
No related tags found
No related merge requests found
**/.venv/
*.pyc
RSF_FV/
.idea/
\ No newline at end of file
%% Cell type:code id: tags:
``` python
import pandas as pd
from jira import JIRA, JIRAError
```
%% Cell type:code id: tags:
``` python
# Load the curated issue list for the YARN project.
# Raw string avoids invalid escape sequences (\D, \A, \I) in the Windows
# path, which are a SyntaxWarning in recent Python and a future error.
issue_list = pd.read_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Issues_2nd.xlsx", sheet_name="Yarn")
```
%% Cell type:code id: tags:
``` python
# Extract all issue keys to fetch; the cell's value shows how many we will query.
issue_keys = list(issue_list["Issue key"])
len(issue_keys)
```
%% Cell type:markdown id: tags:
# Point 1 & 2
%% Cell type:code id: tags:
``` python
# Anonymous (unauthenticated) connection to the public Apache JIRA instance.
APACHE_JIRA_SERVER = 'https://issues.apache.org/jira/'
jira = JIRA(APACHE_JIRA_SERVER)
```
%% Cell type:code id: tags:
``` python
# JQL template (issue keys substituted in batches) and the issue fields we
# request from the JIRA API, both as a comma-joined string (for the request)
# and as a list (for iterating the response).
jql_query = 'issuekey in ({}) AND project = YARN'
fields_arr = ["parent", "summary", "description", "issuetype", "status", "comment"]
fields = ",".join(fields_arr)
```
%% Cell type:code id: tags:
``` python
# For structured JIRA fields, the sub-key that holds the scalar value we
# actually want (e.g. issuetype is a dict whose "name" carries the label).
jira_raw_data_key_map = dict(
    issuetype="name",
    parent="key",
    status="name",
)
```
%% Cell type:code id: tags:
``` python
# Fetch the requested fields for every issue key, querying JIRA in batches
# (a single JQL "issuekey in (...)" clause cannot hold all keys at once).
# Each issue becomes one row: [key, <one value per field in fields_arr>].
issue_data = []
key_window = 300  # batch size; also passed as maxResults so one call covers the batch
for idx in range(0, len(issue_keys), key_window):
    issue_keys_subset = issue_keys[idx:idx+key_window]
    query = jql_query.format(','.join(issue_keys_subset))
    response = jira.search_issues(query, fields=fields, maxResults=key_window)
    for issue in response:
        data = [issue.key]
        for field in fields_arr:
            if field in issue.raw['fields']:
                if field == 'comment':
                    # The comment field nests the comments; keep only their text bodies.
                    comments = issue.raw['fields'][field]['comments']
                    comment_text = []
                    for comment in comments:
                        comment_text.append(comment['body'])
                    data.append(comment_text)
                    continue
                if field in jira_raw_data_key_map:
                    # Structured field (issuetype/parent/status): take the scalar sub-key.
                    data.append(issue.raw['fields'][field][jira_raw_data_key_map[field]])
                else:
                    data.append(issue.raw['fields'][field])
            else:
                # Field absent on this issue (e.g. no parent): keep column alignment.
                data.append(None)
        issue_data.append(data)
```
%% Cell type:code id: tags:
``` python
# Turn the fetched rows into a DataFrame; column order matches the row layout
# built above ([key] + fields_arr order).
column_names = ["Issue key", "Parent", "Summary", "Description", "Issue Type", "Status", "Comments"]
issue_data_df = pd.DataFrame(issue_data, columns=column_names)
issue_data_df.head()
```
%% Cell type:code id: tags:
``` python
import re
```
%% Cell type:code id: tags:
``` python
# Compiled once at module level: recompiling the pattern on every call is
# wasted work when this function is applied to every cell of the DataFrame.
# Matches C0 control characters except tab/newline/carriage-return, which
# Excel (openpyxl) refuses to store.
_ILLEGAL_CHARACTERS_RE = re.compile(r'[\x00-\x08\x0b-\x0c\x0e-\x1f]')

def remove_illegal_characters(cell):
    """
    Remove characters Excel cannot store from a cell value.

    Parameters
    ----------
    cell : object
        Any cell value; only ``str`` values are modified.

    Returns
    -------
    object
        The string with illegal control characters stripped, or the
        original value unchanged for non-string cells.
    """
    if isinstance(cell, str):
        return _ILLEGAL_CHARACTERS_RE.sub('', cell)
    return cell
```
%% Cell type:code id: tags:
``` python
# Sanitize every cell so the to_excel() call below does not fail on control
# characters. NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1
# in favour of DataFrame.map — switch when the pandas version is confirmed.
issue_data_df = issue_data_df.applymap(remove_illegal_characters)
```
%% Cell type:code id: tags:
``` python
# Persist the raw fetched data. Raw string avoids invalid escape sequences
# (\D, \A, \W, \y) in the Windows path.
issue_data_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_issue_data.xlsx", index=False)
```
%% Cell type:markdown id: tags:
# Point 3
%% Cell type:code id: tags:
``` python
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
```
%% Cell type:code id: tags:
``` python
# Fetch the NLTK resources needed below: 'punkt' for tokenization,
# 'stopwords' for stop-word removal, 'wordnet' for lemmatization.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
```
%% Cell type:code id: tags:
``` python
# Tagger model required by nltk.pos_tag, used during lemmatization below.
nltk.download('averaged_perceptron_tagger')
```
%% Cell type:code id: tags:
``` python
# Merge summary and description into one text field.
# fillna('') prevents a missing (NaN) summary or description from turning
# the whole concatenation into NaN, which clean_text below would otherwise
# stringify into the spurious token "nan".
issue_data_df['Summary_Description'] = (
    issue_data_df['Summary'].fillna('') + ' ' + issue_data_df['Description'].fillna('')
)
```
%% Cell type:code id: tags:
``` python
import re
# Perform text cleaning and tokenization for the concatenation of issue summary and description
def clean_text(text):
    """
    Normalize raw text for tokenization.

    Coerces the input to ``str``, lowercases it, strips everything except
    ASCII letters and whitespace, and collapses runs of whitespace to a
    single space.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()
def replace_abbr(text):
    """
    Expand YARN-specific abbreviations when they occur as standalone words.

    Word boundaries (\\b) ensure substrings inside longer words (e.g. the
    "rm" in "farm") are left untouched.
    """
    expansions = (
        (r"\brm\b", "ResourceManager"),
        (r"\bnm\b", "NodeManager"),
        (r"\bam\b", "ApplicationMaster"),
        (r"\bacl\b", "AccessControlList"),
        (r"\bha\b", "HighAvailability"),
    )
    for abbr_pattern, full_name in expansions:
        text = re.sub(abbr_pattern, full_name, text)
    return text
def tokenize_text(text):
    """
    Tokenize cleaned text into a list of word tokens.

    Thin wrapper over NLTK's word_tokenize (requires the 'punkt' resource
    downloaded in an earlier cell).
    """
    return word_tokenize(text)
```
%% Cell type:code id: tags:
``` python
# Cleaning pipeline: normalize text, expand abbreviations (after lowercasing,
# so patterns like \brm\b match), then tokenize into a separate column.
issue_data_df['Summary_Description'] = issue_data_df['Summary_Description'].apply(clean_text)
issue_data_df['Summary_Description'] = issue_data_df['Summary_Description'].apply(replace_abbr)
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description'].apply(tokenize_text)
```
%% Cell type:code id: tags:
``` python
# Built once at definition time: rebuilding the stop-word set on every call
# is wasted work when this function is applied to every row of the DataFrame.
_STOP_WORDS = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """
    Remove English stop words from a token list.

    Parameters
    ----------
    tokens : list[str]
        Tokenized, lowercased words.

    Returns
    -------
    list[str]
        Tokens with stop words removed, original order preserved.
    """
    return [word for word in tokens if word not in _STOP_WORDS]
```
%% Cell type:code id: tags:
``` python
# Drop stop words from each issue's token list.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(remove_stopwords)
```
%% Cell type:code id: tags:
``` python
# pos_tags = pos_tag(issue_data_df['Summary_Description_Tokens'].tolist()[0])
# issue_data_df['Summary_Description_Tokens'].tolist()[0]
```
%% Cell type:code id: tags:
``` python
# Sanity check: inspect the POS tag NLTK assigns to a single word.
pos_tag(['running'])
```
%% Cell type:code id: tags:
``` python
def pos_tag_mapping(tag):
    """
    Map a Penn Treebank POS tag to a WordNet POS constant for lemmatization.

    Fixes a bug in the original lambda: the adjective branch ('j' -> 'a')
    was unreachable because 'j' was tested against ['n', 'r', 'v'] first,
    so adjectives (JJ/JJR/JJS) were lemmatized as nouns. Unknown tags
    still default to noun ('n'), matching WordNet's default.
    """
    first = tag[0].lower()
    if first == 'j':
        return 'a'  # WordNet uses 'a' for adjectives
    if first in ('n', 'r', 'v'):
        return first
    return 'n'
```
%% Cell type:code id: tags:
``` python
def lemmatize_text(tokens):
    """
    Lemmatize a token list using WordNet with POS-aware lemmas.

    Improvement over the original: the whole token list is tagged with a
    single pos_tag call instead of tagging each word in isolation — this is
    both far cheaper (one tagger invocation per row instead of per word)
    and more accurate, since the tagger can use sentence context.

    Parameters
    ----------
    tokens : list[str]
        Word tokens to lemmatize.

    Returns
    -------
    list[str]
        Lemmatized tokens, order preserved.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos=pos_tag_mapping(tag))
            for word, tag in pos_tag(tokens)]
```
%% Cell type:code id: tags:
``` python
# Sanity check the lemmatizer on a single token.
lemmatize_text(['running'])
```
%% Cell type:code id: tags:
``` python
# NOTE(review): duplicate of the earlier stop-word removal cell — harmless
# because the operation is idempotent, but redundant; consider removing.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(remove_stopwords)
```
%% Cell type:code id: tags:
``` python
# Lemmatize each issue's token list.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(lemmatize_text)
```
%% Cell type:code id: tags:
``` python
# Drop the intermediate merged-text column; only the token column is kept.
issue_data_df.drop(columns=['Summary_Description'], inplace=True)
```
%% Cell type:code id: tags:
``` python
# Persist the cleaned/tokenized data. Raw string avoids invalid escape
# sequences (\D, \A, \W, \y) in the Windows path.
issue_data_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_issue_data_cleaned.xlsx", index=False)
```
%% Cell type:markdown id: tags:
# Point 4
%% Cell type:code id: tags:
``` python
from collections import Counter

# Count how often each unique token appears across all issues.
# Counter replaces the hand-rolled if/else increment loop with the stdlib
# equivalent (same result: a word -> count mapping).
unique_words = Counter()
for tokens in issue_data_df['Summary_Description_Tokens']:
    unique_words.update(tokens)

unique_words_df = pd.DataFrame(list(unique_words.items()), columns=['Word', 'Count'])
unique_words_df = unique_words_df.sort_values('Count', ascending=False).reset_index(drop=True)
unique_words_df.head(10)
```
%% Cell type:code id: tags:
``` python
# Persist the word-frequency table. Raw string avoids invalid escape
# sequences (\D, \A, \W, \y) in the Windows path.
unique_words_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_unique_words.xlsx", index=False)
```
%% Cell type:code id: tags:
``` python
# Inspect the 20 most frequent words.
unique_words_df.head(20)
```
......
No preview for this file type
No preview for this file type
No preview for this file type
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment