Skip to content
Snippets Groups Projects
Commit 701d61a0 authored by Abhay Kishorbhai Vaghasiya
Browse files

week 2 task2 done

parent 1baa2780
No related branches found
No related tags found
No related merge requests found
File added
This diff is collapsed.
This diff is collapsed.
%% Cell type:code id:ac5301a1-95dd-4e3d-aad0-cb9a926ee55b tags:
``` python
import ast

import numpy as np
import pandas as pd
from collections import defaultdict

# Path to the pre-trained word2vec-format embeddings file.
EMBEDDINGS_PATH = '/Users/abhayvaghasiya/Downloads/embeddings.bin'

# Load the CSV of preprocessed terms.
file_path = '/Users/abhayvaghasiya/Desktop/DS4SE_2/DSSE-Group-7/Assignment-3/Week1/Task3 Results/terms.csv'
terms_df = pd.read_csv(file_path)

# Each 'processed_text' cell holds the string repr of a token list
# (e.g. "['foo', 'bar']").  ast.literal_eval parses such literals safely,
# unlike eval(), which would execute arbitrary code embedded in the CSV.
texts = terms_df['processed_text'].apply(ast.literal_eval).tolist()

# Build the vocabulary: word -> integer index, starting at 1 so that
# index 0 stays reserved (e.g. for padding).  A plain dict is enough —
# membership is checked explicitly before every insert.
vocab = {}
index = 1
for text in texts:
    for word in text:
        if word not in vocab:
            vocab[word] = index
            index += 1

vocab_length = len(vocab) + 1  # +1 because index 0 is reserved
embedding_dim = 200  # Must match the dimension of the pre-trained embeddings

# Rows default to all-zeros; row 0 (the reserved index) is never overwritten.
embedding_matrix = np.zeros((vocab_length, embedding_dim))
# Function to load a word2vec-format binary embeddings file.
def load_word2vec_bin(path):
    """Load embeddings from a word2vec binary file.

    The file starts with an ASCII header ``"<vocab_size> <vector_size>\\n"``,
    followed by ``vocab_size`` records of the form
    ``<word><space><vector_size float32 values><newline>``.

    Parameters
    ----------
    path : str
        Filesystem path to the binary embeddings file.

    Returns
    -------
    dict
        Mapping of word -> np.ndarray of shape (vector_size,), dtype float32.
        Words whose bytes are not valid UTF-8 are skipped.
    """
    embeddings_index = {}
    with open(path, 'rb') as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * vector_size
        for _ in range(vocab_size):
            # Read the word one byte at a time up to the separating space.
            chars = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    break
                if ch != b'\n':  # tolerate a stray newline before the word
                    chars.append(ch)
            # ALWAYS consume the vector bytes and the trailing newline, even
            # if the word turns out to be undecodable.  (The original code
            # `continue`d before this read, which desynchronized the stream
            # and corrupted every subsequent record.)
            vector = np.frombuffer(f.read(binary_len), dtype='float32')
            f.read(1)  # consume the record's trailing newline
            try:
                word = b''.join(chars).decode('utf-8')
            except UnicodeDecodeError:
                continue  # skip undecodable word; stream is still aligned
            embeddings_index[word] = vector
    return embeddings_index
# Load the pre-trained word embeddings from disk; this can take a while
# for large embedding files, hence the progress message.
print("Loading pre-trained word embeddings...")
embeddings_index = load_word2vec_bin(EMBEDDINGS_PATH)
# Function to check if a vector contains only valid (finite) values.
def is_valid_vector(vector):
    """Return True if every element of *vector* is finite.

    ``np.isfinite`` is already False for NaN as well as +/-inf, so a
    separate NaN check (as in the original) is redundant.
    """
    return bool(np.all(np.isfinite(vector)))
# Populate the embedding matrix row by row: words found in the pre-trained
# index (with finite values) keep their vector; everything else — missing
# words and corrupt vectors alike — falls back to a random draw.
for word, idx in vocab.items():
    pretrained = embeddings_index.get(word)
    if pretrained is not None and is_valid_vector(pretrained):
        embedding_matrix[idx] = pretrained
    else:
        embedding_matrix[idx] = np.random.normal(size=(embedding_dim,))

# Persist both artifacts for downstream use: the matrix as a .npy file and
# the vocabulary as simple "word,index" lines.
np.save('embedding_matrix.npy', embedding_matrix)
with open('vocab.txt', 'w') as f:
    f.writelines(f"{word},{idx}\n" for word, idx in vocab.items())
print("Embedding matrix and vocabulary have been created and saved.")
```
%% Output
Loading pre-trained word embeddings...
Embedding matrix and vocabulary have been created and saved.
%% Cell type:code id:5b2a1570-dee2-4ec1-af80-7825dcf08aa9 tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment