Skip to content
Snippets Groups Projects
Commit df396bac authored by Jayesh's avatar Jayesh
Browse files

consider as stash

parent 2e780b89
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
from jira import JIRA, JIRAError
```
%% Cell type:code id: tags:
``` python
# Raw string (r"...") so the backslashes in the Windows path are kept literal:
# "\D", "\I" are invalid escape sequences in a normal string literal
# (DeprecationWarning today, SyntaxError in future Python versions).
issue_list = pd.read_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Issues_2nd.xlsx", sheet_name="Yarn")
```
%% Cell type:code id: tags:
``` python
# Pull the issue keys out as a plain Python list; the bare trailing
# expression echoes the key count in the notebook output (1535).
issue_keys = issue_list["Issue key"].tolist()
len(issue_keys)
```
%% Output
1535
%% Cell type:markdown id: tags:
# Point 1 & 2
%% Cell type:code id: tags:
``` python
# Base URL of the public Apache JIRA instance.
APACHE_JIRA_SERVER = 'https://issues.apache.org/jira/'
# Unauthenticated client (no credentials passed) — used for read-only queries.
jira = JIRA(APACHE_JIRA_SERVER)
```
%% Cell type:code id: tags:
``` python
# JQL template: the placeholder receives a comma-separated batch of issue keys.
jql_query = 'issuekey in ({}) AND project = YARN'
# Issue fields to request: a list for per-field iteration, joined into the
# comma-separated string form the JIRA API expects.
fields_arr = ["parent", "summary", "description", "issuetype", "status", "comment"]
fields = ",".join(fields_arr)
```
%% Cell type:code id: tags:
``` python
# For structured JIRA fields, which sub-key of the raw JSON object holds the
# human-readable value we want to keep.
jira_raw_data_key_map = dict(
    issuetype="name",
    parent="key",
    status="name",
)
```
%% Cell type:code id: tags:
``` python
# Fetch the requested fields for every issue key, batching the JQL "in (...)"
# clause so each query stays within JIRA's result-window limits.
issue_data = []
key_window = 300
for start in range(0, len(issue_keys), key_window):
    batch = issue_keys[start:start + key_window]
    response = jira.search_issues(
        jql_query.format(','.join(batch)), fields=fields, maxResults=key_window
    )
    for issue in response:
        raw_fields = issue.raw['fields']
        row = [issue.key]
        for field in fields_arr:
            if field not in raw_fields:
                row.append(None)
            elif field == 'comment':
                # Keep only the comment bodies, as a list per issue.
                row.append([c['body'] for c in raw_fields[field]['comments']])
            elif field in jira_raw_data_key_map:
                # Structured field: extract the mapped sub-key of the object.
                row.append(raw_fields[field][jira_raw_data_key_map[field]])
            else:
                # Plain scalar field (summary, description).
                row.append(raw_fields[field])
        issue_data.append(row)
```
%% Cell type:code id: tags:
``` python
# Assemble the scraped rows into a DataFrame; column order matches the order
# fields were appended per issue (key first, then the requested fields).
column_names = ["Issue key", "Parent", "Summary", "Description", "Issue Type", "Status", "Comments"]
issue_data_df = pd.DataFrame(issue_data, columns=column_names)
issue_data_df.head()
```
%% Output
Issue key Parent Summary \
0 YARN-10930 YARN-10888 Introduce universal configured capacity vector
1 YARN-10562 None Follow up changes for YARN-9833
2 YARN-10514 None Introduce a dominant resource based schedule p...
3 YARN-10494 YARN-9014 CLI tool for docker-to-squashfs conversion (pu...
4 YARN-10493 YARN-9014 RunC container repository v2
Description Issue Type \
0 The proposal is to introduce a capacity resour... Sub-task
1 In YARN-9833, a race condition in DirectoryCol... Improvement
2 When we schedule in multi node lookup policy f... Improvement
3 *YARN-9564* defines a docker-to-squashfs image... Sub-task
4 The current runc container repository design h... Sub-task
Status Comments
0 Resolved []
1 Resolved [Attaching a patch to show the new approach.\r...
2 Patch Available [[~leftnoteasy] [~tangzhankun] [~prabhujoseph]...
3 Open [Hey [~ccondit], thanks for the document. I'm ...
4 Open [I have an initial PR to address the improveme...
%% Cell type:code id: tags:
``` python
# Local import: in the original notebook `re` was only imported in a LATER
# cell, so running cells top-to-bottom raised NameError here.
import re

# ASCII control characters that openpyxl refuses to write into .xlsx cells.
# Compiled once at module level instead of on every call.
_ILLEGAL_CHARACTERS_RE = re.compile(r'[\x00-\x08\x0b-\x0c\x0e-\x1f]')


def remove_illegal_characters(cell):
    """Strip Excel-illegal control characters from a cell value.

    Non-string values are returned unchanged; strings are returned with
    any control character in the illegal range removed.
    """
    if isinstance(cell, str):
        return _ILLEGAL_CHARACTERS_RE.sub('', cell)
    return cell
```
%% Cell type:code id: tags:
``` python
# DataFrame.applymap is deprecated (the original run emitted a FutureWarning
# saying to use DataFrame.map); .map is the elementwise replacement.
issue_data_df = issue_data_df.map(remove_illegal_characters)
```
%% Output
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_23156\4278595474.py:1: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
issue_data_df = issue_data_df.applymap(remove_illegal_characters)
%% Cell type:code id: tags:
``` python
# Raw string so the Windows path's backslashes are not escape sequences
# ("\D", "\W" are invalid escapes in a normal string literal).
issue_data_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_issue_data.xlsx", index=False)
```
%% Cell type:markdown id: tags:
# Point 3
%% Cell type:code id: tags:
``` python
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
```
%% Cell type:code id: tags:
``` python
# Fetch the NLTK resources used below: the Punkt tokenizer models, the
# English stop-word list, and WordNet data for lemmatization. Downloads
# are cached locally, so re-running this cell is cheap.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
```
%% Output
[nltk_data] Downloading package punkt to
[nltk_data] C:\Users\Jayesh\AppData\Roaming\nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data] C:\Users\Jayesh\AppData\Roaming\nltk_data...
[nltk_data] Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data] C:\Users\Jayesh\AppData\Roaming\nltk_data...
%% Cell type:code id: tags:
True
``` python
# Model required by nltk.pos_tag (used for POS-aware lemmatization below).
nltk.download('averaged_perceptron_tagger')
```
%% Cell type:code id: tags:
``` python
# merge summary and description
# Merge summary and description into one text field. fillna('') keeps rows
# whose Description is missing (None) from producing NaN here — downstream
# clean_text() calls str() on the value, which would otherwise inject the
# literal token "nan" into the corpus.
issue_data_df['Summary_Description'] = (
    issue_data_df['Summary'].fillna('') + ' ' + issue_data_df['Description'].fillna('')
)
```
%% Cell type:code id: tags:
``` python
# Text cleaning for the concatenated issue summary + description.
def clean_text(text):
    """Normalise raw issue text for tokenization.

    Coerces the value to str, lowercases it, strips everything except
    ASCII letters and whitespace, then collapses whitespace runs into
    single spaces and trims the ends.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()
def tokenize_text(text):
    """Split cleaned text into word tokens via NLTK's word tokenizer."""
    tokens = word_tokenize(text)
    return tokens
```
%% Cell type:code id: tags:
``` python
# Clean the merged text in place, then tokenize it into a token-list column.
issue_data_df['Summary_Description'] = issue_data_df['Summary_Description'].apply(clean_text)
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description'].apply(tokenize_text)
```
%% Cell type:code id: tags:
``` python
def remove_stopwords(tokens):
    """Drop English stop words from a token list, preserving token order."""
    stop_words = frozenset(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]
```
%% Cell type:code id: tags:
``` python
# Filter English stop words out of each issue's token list.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(remove_stopwords)
```
%% Cell type:code id: tags:
``` python
# pos_tags = pos_tag(issue_data_df['Summary_Description_Tokens'].tolist()[0])
# issue_data_df['Summary_Description_Tokens'].tolist()[0]
```
%% Cell type:code id: tags:
``` python
# Quick sanity check of NLTK's POS tagger on a single word.
pos_tag(['running'])
```
%% Cell type:code id: tags:
``` python
def pos_tag_mapping(tag):
    """Map a Penn Treebank POS tag to a WordNet POS letter.

    'J...' (adjective) -> 'a', 'N...' -> 'n', 'R...' (adverb) -> 'r',
    'V...' -> 'v'; anything else defaults to noun ('n').

    Rewritten from the original lambda (PEP 8 discourages assigning a
    lambda to a name), which also had an unreachable 'a' branch: it only
    tested for 'j' after requiring the first letter to be in
    {'n', 'r', 'v'}, so adjective tags incorrectly fell through to 'n'.
    """
    first = tag[0].lower()
    if first == 'j':
        return 'a'
    if first in ('n', 'r', 'v'):
        return first
    return 'n'
```
%% Cell type:code id: tags:
``` python
def lemmatize_text(tokens):
    """Lemmatize each token with WordNet's lemmatizer (default noun POS)."""
    # NOTE(review): the original had an unreachable second return after this
    # one that attempted POS-aware lemmatization via pos_tag_mapping; it has
    # been removed as dead code. If POS-aware lemmatization was the intent,
    # make that variant the live return instead.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]
```
%% Cell type:code id: tags:
``` python
# Sanity check: lemmatize a single token.
lemmatize_text(['running'])
```
%% Cell type:code id: tags:
``` python
# NOTE(review): this looks like a second application of stop-word removal to
# the same column; harmless if the filter is idempotent, but likely redundant
# — confirm whether this cell was meant to be re-run or can be dropped.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(remove_stopwords)
```
%% Cell type:code id: tags:
``` python
# Lemmatize each issue's token list.
issue_data_df['Summary_Description_Tokens'] = issue_data_df['Summary_Description_Tokens'].apply(lemmatize_text)
```
%% Cell type:code id: tags:
``` python
# The merged raw-text column is no longer needed once tokens exist; drop it.
issue_data_df.drop(columns=['Summary_Description'], inplace=True)
```
%% Cell type:code id: tags:
``` python
# Raw string so the Windows path's backslashes are not escape sequences.
issue_data_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_issue_data_cleaned.xlsx", index=False)
```
%% Cell type:markdown id: tags:
# Point 4
%% Cell type:code id: tags:
``` python
# Build a term-frequency table of every token across all issues.
# collections.Counter replaces the hand-rolled if/else counting loop;
# Counter is a dict subclass, so `unique_words` keeps its original shape.
from collections import Counter

unique_words = Counter()
for tokens in issue_data_df['Summary_Description_Tokens']:
    unique_words.update(tokens)
unique_words_df = pd.DataFrame(list(unique_words.items()), columns=['Word', 'Count'])
unique_words_df = unique_words_df.sort_values('Count', ascending=False).reset_index(drop=True)
unique_words_df.head(10)
```
%% Output
%% Cell type:code id: tags:
Word Count
0 yarn 1531
1 container 791
2 rm 716
3 application 666
4 containers 661
5 support 647
6 node 618
7 queue 595
8 resource 582
9 cluster 442
``` python
# Raw string so the Windows path's backslashes are not escape sequences.
unique_words_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_unique_words.xlsx", index=False)
```
%% Cell type:code id: tags:
``` python
# Raw string for the Windows path; re-save and preview the top-20 words.
# NOTE(review): this duplicates the save in the previous cell — confirm
# whether both writes are intended.
unique_words_df.to_excel(r"E:\DSSE\DSSE-Group-7\Assignment_2\Week1\yarn_unique_words.xlsx", index=False)
unique_words_df.head(20)
```
......
This diff is collapsed.
%% Cell type:code id: tags:
``` python
import pandas as pd
import os
import sys
from plotly import express as px
```
%% Cell type:code id: tags:
``` python
# Raw string so the Windows path's backslashes are kept literal ("\D", "\F",
# "\W", "\S" are invalid escape sequences in a normal string literal).
metric_excel = r"E:\DSSE\DSSE-Group-7\Final Submission\Week1 Submission\Step4.xlsx"
metric_df = pd.read_excel(metric_excel)
```
%% Cell type:code id: tags:
``` python
# Process-metric columns for the correlation analysis: DMM maintainability
# proxies (unit size / complexity / interfacing) plus line- and method-level
# churn counts.
relevant_columns = ['dmm_unit_size', 'dmm_unit_complexity', 'dmm_unit_interfacing',
'added_lines', 'deleted_lines', 'added_methods',
'deleted_method', 'modified_methods']
```
%% Cell type:code id: tags:
``` python
# Spearman rank correlation — non-parametric, so it suits the ordinal/skewed
# churn metrics; coefficients rounded to 4 decimal places for display.
correlation_matrix = metric_df[relevant_columns].corr(method='spearman').round(4)
correlation_matrix
```
%% Output
dmm_unit_size dmm_unit_complexity \
dmm_unit_size 1.0000 0.7144
dmm_unit_complexity 0.7144 1.0000
dmm_unit_interfacing 0.2643 0.4608
added_lines -0.0992 -0.0163
deleted_lines 0.1031 0.0224
added_methods -0.0538 -0.0208
deleted_method 0.0217 -0.0844
modified_methods -0.0660 -0.0770
dmm_unit_interfacing added_lines deleted_lines \
dmm_unit_size 0.2643 -0.0992 0.1031
dmm_unit_complexity 0.4608 -0.0163 0.0224
dmm_unit_interfacing 1.0000 0.0163 -0.0219
added_lines 0.0163 1.0000 -0.3796
deleted_lines -0.0219 -0.3796 1.0000
added_methods -0.0217 0.7105 -0.3601
deleted_method -0.1094 -0.2010 0.3525
modified_methods -0.0855 0.5465 -0.0173
added_methods deleted_method modified_methods
dmm_unit_size -0.0538 0.0217 -0.0660
dmm_unit_complexity -0.0208 -0.0844 -0.0770
dmm_unit_interfacing -0.0217 -0.1094 -0.0855
added_lines 0.7105 -0.2010 0.5465
deleted_lines -0.3601 0.3525 -0.0173
added_methods 1.0000 -0.1462 0.7923
deleted_method -0.1462 1.0000 0.2344
modified_methods 0.7923 0.2344 1.0000
%% Cell type:code id: tags:
``` python
# Heat-map of the correlation matrix: coefficients annotated in each cell,
# diverging red-blue scale pinned to the full [-1, 1] range.
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu',
    title="Correlation Matrix of Metrics from Week1",
)
fig.show()
```
%% Output
%% Cell type:code id: tags:
``` python
# Load the Week-3 Task-3 architectural-similarity output (a2a score and
# cluster coverage per commit pair) produced by the ACDC recovery technique.
acdc_csv_path = r"E:\DSSE\DSSE-Group-7\Final Submission\Week3 Submission\Task 3\a2a_cvg_outputs\ACDC.csv"
a2a_acdc = pd.read_csv(acdc_csv_path)
a2a_acdc.head()
```
%% Output
child \
0 babd19de331c875a1dffee908617c07c3e1eb31b
1 dbd07f9e8c2824cdb04d44d07d27c2b56f68c1d5
2 f0331cfd016219e416ef34f21b01973ec4ccf4c9
3 940389afce6a1b9b9e1519aed528cbc444786756
4 ea50f154077e724cc1b2fe15565ede2f2dc2e6f4
parent a2a cvg_child_to_parent \
0 01b83a1e32b69399440843665ae2c8134d9d0e24 99.953030 1.000000
1 ebb236ef9c00592c592f8d5bb885e7dfd4d05c3a 99.622532 1.000000
2 7578282edce0eba3f24c96355c944c02156ece79 100.000000 1.000000
3 fe0ddc03e115df0e3c17b0c0f9b9376abb817688 98.825155 0.973684
4 366b1b1dd6f1ade1996c7c0eec1aca185c68d6cb 99.410464 1.000000
cvg_parent_to_child
0 1.000000
1 1.000000
2 1.000000
3 0.973684
4 1.000000
%% Cell type:code id: tags:
``` python
# One row of DMM metrics per commit, attached to the ACDC rows by commit hash.
# drop_duplicates is applied non-inplace directly to the column selection:
# the original called it with inplace=True on a slice of metric_df, which
# triggered pandas' SettingWithCopyWarning (visible in the cell output).
dmm_df = metric_df[['commit_hash', 'dmm_unit_size', 'dmm_unit_complexity',
                    'dmm_unit_interfacing', 'complexity']].drop_duplicates(subset='commit_hash')
combined_df = a2a_acdc.merge(dmm_df, left_on='child', right_on='commit_hash', how='left')
```
%% Output
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_7212\1058912344.py:2: SettingWithCopyWarning:
C:\Users\Jayesh\AppData\Local\Temp\ipykernel_17376\1058912344.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
%% Cell type:code id: tags:
``` python
combined_df.head()
```
%% Output
child \
0 babd19de331c875a1dffee908617c07c3e1eb31b
1 dbd07f9e8c2824cdb04d44d07d27c2b56f68c1d5
2 f0331cfd016219e416ef34f21b01973ec4ccf4c9
3 940389afce6a1b9b9e1519aed528cbc444786756
4 ea50f154077e724cc1b2fe15565ede2f2dc2e6f4
parent a2a cvg_child_to_parent \
0 01b83a1e32b69399440843665ae2c8134d9d0e24 99.953030 1.000000
1 ebb236ef9c00592c592f8d5bb885e7dfd4d05c3a 99.622532 1.000000
2 7578282edce0eba3f24c96355c944c02156ece79 100.000000 1.000000
3 fe0ddc03e115df0e3c17b0c0f9b9376abb817688 98.825155 0.973684
4 366b1b1dd6f1ade1996c7c0eec1aca185c68d6cb 99.410464 1.000000
cvg_parent_to_child commit_hash \
0 1.000000 babd19de331c875a1dffee908617c07c3e1eb31b
1 1.000000 dbd07f9e8c2824cdb04d44d07d27c2b56f68c1d5
2 1.000000 f0331cfd016219e416ef34f21b01973ec4ccf4c9
3 0.973684 940389afce6a1b9b9e1519aed528cbc444786756
4 1.000000 ea50f154077e724cc1b2fe15565ede2f2dc2e6f4
dmm_unit_size dmm_unit_complexity dmm_unit_interfacing complexity
0 0.186813 0.186813 0.807692 0.0
1 0.517544 0.820175 0.728070 0.0
2 NaN NaN NaN 0.0
3 0.337093 0.428571 0.873434 0.0
4 0.703448 1.000000 0.648276 0.0
%% Cell type:code id: tags:
``` python
# Correlate architectural similarity / coverage scores with the DMM quality
# proxies (Spearman, as above), rounded to 4 decimals for display.
relevant_columns = ['a2a', 'cvg_child_to_parent', 'cvg_parent_to_child',
                    'dmm_unit_size', 'dmm_unit_complexity',
                    'dmm_unit_interfacing', 'complexity']
correlation_matrix = combined_df[relevant_columns].corr(method='spearman').round(4)
correlation_matrix
```
%% Output
a2a cvg_child_to_parent cvg_parent_to_child \
a2a 1.0000 0.6280 0.5070
cvg_child_to_parent 0.6280 1.0000 0.8859
cvg_parent_to_child 0.5070 0.8859 1.0000
dmm_unit_size -0.0258 0.2069 0.3412
dmm_unit_complexity -0.3104 0.0662 0.2000
dmm_unit_interfacing 0.0358 0.0665 0.2032
complexity 0.1069 -0.0070 -0.0450
dmm_unit_size dmm_unit_complexity \
a2a -0.0258 -0.3104
cvg_child_to_parent 0.2069 0.0662
cvg_parent_to_child 0.3412 0.2000
dmm_unit_size 1.0000 0.5546
dmm_unit_complexity 0.5546 1.0000
dmm_unit_interfacing 0.1255 0.4446
complexity -0.1836 -0.4413
dmm_unit_interfacing complexity
a2a 0.0358 0.1069
cvg_child_to_parent 0.0665 -0.0070
cvg_parent_to_child 0.2032 -0.0450
dmm_unit_size 0.1255 -0.1836
dmm_unit_complexity 0.4446 -0.4413
dmm_unit_interfacing 1.0000 -0.0349
complexity -0.0349 1.0000
%% Cell type:code id: tags:
``` python
# Same heat-map styling as the Week-1 matrix: annotated coefficients,
# diverging red-blue scale pinned to [-1, 1].
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu',
    title="Correlation Matrix from Week3 stats and DMM",
)
fig.show()
```
%% Output
......
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment