Commit 2153748d authored by Niharika Aggarwal

code2

parent cea9b821
import torch
import json
import pandas as pd
from tqdm import tqdm
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import os
import nltk
import pickle
from keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import sys
from tensorflow.keras import preprocessing
# Alias the 'keras.src.preprocessing' module path so the pickled tokenizer
# (saved under a different Keras version) can be unpickled.
sys.modules['keras.src.preprocessing'] = preprocessing
# Ensure nltk resources are available
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Map a Penn Treebank POS tag to the WordNet POS expected by the lemmatizer (default: noun).
pos_tag_mapping = lambda tag: 'a' if tag[0].lower() == 'j' else (tag[0].lower() if tag[0].lower() in ['n', 'r', 'v'] else 'n')
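# Example mappings (illustrative): 'NN' -> 'n', 'VBD' -> 'v', 'JJ' -> 'a', 'RB' -> 'r', 'IN' -> 'n'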
# Paths and model configuration
path = r'/pc2/users/n/niharika'
csv_filepath = '/scratch/hpc-prf-dssecs/group7/processed_data_1.csv'
model_path = r'/scratch/hpc-prf-dssecs/group7/best_so_far_92.h5'
tokenizer_path = r'/scratch/hpc-prf-dssecs/group7/tokenizer92.pkl'
output_json_file = 'model_results1.json'
output_csv_file = 'model_results3.csv'
# Labels and categories
labels = ['feature', 'concept', 'technology', 'programming']
# Load CNN model and tokenizer
model = load_model(model_path)
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)
# Check for GPU availability (informational; TensorFlow/Keras manages its own device placement)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Define preprocessing functions
def clean_html(text):
    # Strip HTML markup and keep only the visible text.
    return BeautifulSoup(text, "html.parser").get_text()

def remove_source_code(text):
    # Drop code snippets enclosed in <code>...</code> tags.
    return re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Remove code snippets before stripping HTML; otherwise the <code> tags
    # are already gone and the regex has nothing to match.
    text = remove_source_code(text)
    text = clean_html(text)
    text = re.sub(r'[^\w\s]', '', text)  # drop punctuation
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() and word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word, pos=pos_tag_mapping(pos_tag([word])[0][1]))
        for word in tokens
    )
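# Illustrative example (exact output depends on the installed NLTK models):
#   preprocess_text("<p>The answers <code>x = 1</code> were helpful</p>")
#   -> "answer helpful"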
# Initialize DataFrame to store results
df_results = pd.DataFrame(columns=['post_id', 'Preprocessed Text', 'Predicted Category'])
def load_results():
    # Resume from a previous run if a checkpoint file exists.
    if os.path.exists(output_json_file):
        with open(output_json_file, 'r') as file:
            data = json.load(file)
        category_counts = {label: int(data["category_counts"].get(label, 0)) for label in labels}
        return category_counts, data.get("last_post_id")
    else:
        return {label: 0 for label in labels}, None
category_counts, last_post_id = load_results()
def save_results(last_post_id):
    # Write the running counts and checkpoint position, and dump all results so far.
    data_to_save = {
        "category_counts": category_counts,
        "last_post_id": last_post_id
    }
    with open(output_json_file, 'w') as json_file:
        json.dump(data_to_save, json_file, indent=4)
    df_results.to_csv(output_csv_file, index=False)
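# Checkpoint file layout (illustrative values):
# {
#     "category_counts": {"feature": 12, "concept": 3, "technology": 25, "programming": 160},
#     "last_post_id": 123456
# }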
def process_csv(filepath, last_post_id=None, batch_size=600):
    # Stream the CSV in batches, resuming after last_post_id if one is given.
    data = pd.read_csv(filepath)
    start_index = data.index[data['post_id'] == last_post_id].tolist()
    start_index = start_index[0] + 1 if start_index else 0
    batch = []
    post_ids = []
    preprocessed_texts = []
    for index, row in data.iloc[start_index:].iterrows():
        post_id = row['post_id']
        best_answer = row['best_answer']
        if pd.isnull(best_answer):
            continue
        processed_text = preprocess_text(best_answer)
        sequence = tokenizer.texts_to_sequences([processed_text])
        padded_sequence = pad_sequences(sequence, maxlen=600)
        batch.append(padded_sequence[0])
        post_ids.append(post_id)
        preprocessed_texts.append(processed_text)
        if len(batch) >= batch_size:
            yield post_ids, batch, preprocessed_texts
            batch = []
            post_ids = []
            preprocessed_texts = []
    if batch:
        yield post_ids, batch, preprocessed_texts
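# Each yield is a triple of parallel lists (illustrative):
#   post_ids           -> [101, 102, ...]
#   batch              -> padded token-index sequences, each of length 600
#   preprocessed_texts -> ["answer helpful", ...]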
# Initialize progress bar
already_processed = sum(category_counts.values())
pbar = tqdm(total=15000000, initial=already_processed, desc="Processing CSV")  # total is a rough, hard-coded upper bound
last_processed_post_id = last_post_id # from load_results
for post_ids, batch, preprocessed_texts in process_csv(csv_filepath, last_post_id):
    batch = np.array(batch)  # shape: (batch_size, 600)
    predictions = model.predict(batch)
    predicted_classes = predictions.argmax(axis=1).tolist()
    for i, (post_id, pred, processed_text) in enumerate(zip(post_ids, predicted_classes, preprocessed_texts)):
        # Default to 'programming' when the top class probability is low.
        label = labels[pred] if predictions[i][pred] > 0.5 else 'programming'
        category_counts[label] += 1
        new_row = pd.DataFrame({'post_id': [post_id], 'Preprocessed Text': [processed_text], 'Predicted Category': [label]})
        df_results = pd.concat([df_results, new_row], ignore_index=True)
    pbar.update(len(batch))
    last_processed_post_id = post_ids[-1]  # last ID in the current batch
    if pbar.n // 1000 > already_processed // 1000:  # save every 1000 additional items processed
        save_results(last_post_id=last_processed_post_id)
        already_processed = pbar.n
pbar.close()
save_results(last_post_id=last_processed_post_id)
print(f"Predicted data saved to {output_json_file} and {output_csv_file}")
print(f"Using device: {device}")