Skip to content
Snippets Groups Projects
Commit 701d61a0 authored by Abhay Kishorbhai Vaghasiya
Browse files

week 2 task2 done

parent 1baa2780
No related branches found
No related tags found
No related merge requests found
File added
This diff is collapsed.
This diff is collapsed.
%% Cell type:code id:ac5301a1-95dd-4e3d-aad0-cb9a926ee55b tags:
``` python
import ast

import numpy as np
import pandas as pd
from collections import defaultdict

# Path to the pre-trained word2vec-format embeddings file.
EMBEDDINGS_PATH = '/Users/abhayvaghasiya/Downloads/embeddings.bin'

# Load the CSV of preprocessed terms.
file_path = '/Users/abhayvaghasiya/Desktop/DS4SE_2/DSSE-Group-7/Assignment-3/Week1/Task3 Results/terms.csv'
terms_df = pd.read_csv(file_path)

# Each 'processed_text' cell holds the string repr of a token list
# (e.g. "['foo', 'bar']").  ast.literal_eval parses such literals safely,
# unlike eval(), which would execute arbitrary code embedded in the CSV.
texts = terms_df['processed_text'].apply(ast.literal_eval).tolist()

# Build the vocabulary: word -> integer index, starting at 1 so that
# index 0 stays reserved (e.g. for padding).  A plain dict is enough —
# membership is checked explicitly before every insert.
vocab = {}
index = 1
for text in texts:
    for word in text:
        if word not in vocab:
            vocab[word] = index
            index += 1

vocab_length = len(vocab) + 1  # +1 because index 0 is reserved
embedding_dim = 200  # Must match the dimension of the pre-trained embeddings

# Rows default to all-zeros; row 0 (the reserved index) is never overwritten.
embedding_matrix = np.zeros((vocab_length, embedding_dim))
# Function to load a word2vec-format binary embeddings file.
def load_word2vec_bin(path):
    """Load embeddings from a word2vec binary file.

    The file starts with an ASCII header ``"<vocab_size> <vector_size>\\n"``,
    followed by ``vocab_size`` records of the form
    ``<word><space><vector_size float32 values><newline>``.

    Parameters
    ----------
    path : str
        Filesystem path to the binary embeddings file.

    Returns
    -------
    dict
        Mapping of word -> np.ndarray of shape (vector_size,), dtype float32.
        Words whose bytes are not valid UTF-8 are skipped.
    """
    embeddings_index = {}
    with open(path, 'rb') as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * vector_size
        for _ in range(vocab_size):
            # Read the word one byte at a time up to the separating space.
            chars = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    break
                if ch != b'\n':  # tolerate a stray newline before the word
                    chars.append(ch)
            # ALWAYS consume the vector bytes and the trailing newline, even
            # if the word turns out to be undecodable.  (The original code
            # `continue`d before this read, which desynchronized the stream
            # and corrupted every subsequent record.)
            vector = np.frombuffer(f.read(binary_len), dtype='float32')
            f.read(1)  # consume the record's trailing newline
            try:
                word = b''.join(chars).decode('utf-8')
            except UnicodeDecodeError:
                continue  # skip undecodable word; stream is still aligned
            embeddings_index[word] = vector
    return embeddings_index
# Load the pre-trained word embeddings from disk; this can take a while
# for large embedding files, hence the progress message.
print("Loading pre-trained word embeddings...")
embeddings_index = load_word2vec_bin(EMBEDDINGS_PATH)
# Function to check if a vector contains only valid (finite) values.
def is_valid_vector(vector):
    """Return True if every element of *vector* is finite.

    ``np.isfinite`` is already False for NaN as well as +/-inf, so a
    separate NaN check (as in the original) is redundant.
    """
    return bool(np.all(np.isfinite(vector)))
# Populate the embedding matrix row by row: words found in the pre-trained
# index (with finite values) keep their vector; everything else — missing
# words and corrupt vectors alike — falls back to a random draw.
for word, idx in vocab.items():
    pretrained = embeddings_index.get(word)
    if pretrained is not None and is_valid_vector(pretrained):
        embedding_matrix[idx] = pretrained
    else:
        embedding_matrix[idx] = np.random.normal(size=(embedding_dim,))

# Persist both artifacts for downstream use: the matrix as a .npy file and
# the vocabulary as simple "word,index" lines.
np.save('embedding_matrix.npy', embedding_matrix)
with open('vocab.txt', 'w') as f:
    f.writelines(f"{word},{idx}\n" for word, idx in vocab.items())
print("Embedding matrix and vocabulary have been created and saved.")
```
%% Output
Loading pre-trained word embeddings...
Embedding matrix and vocabulary have been created and saved.
%% Cell type:code id:5b2a1570-dee2-4ec1-af80-7825dcf08aa9 tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment