# RQ2
# Commit 64163542 (parent c3eadf99), authored by Niharika Aggarwal
import json
import os
import pickle
import re
import xml.etree.ElementTree as ET

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm
# Ensure nltk resources are available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Paths and model configuration
path = '/pc2/users/n/niharika'
inputfilepath = '/scratch/hpc-prf-dssecs/Posts.xml'
model_path = f'{path}/model/best_so_far_92.h5'
tokenizer_path = f'{path}/model/tokenizer92.pkl'
output_json_file = 'model_results.json'
output_excel_file = 'model_results.xlsx'
# Labels and categories
labels = ['feature', 'concept', 'technology', 'programming']
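# Note: the order of this list is assumed to match the class order the CNN was
# trained with; predicted class indices are mapped back through it below.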
# Load CNN model and tokenizer
model = load_model(model_path)
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)
# Check for GPU availability (the Keras model picks up a visible GPU automatically)
gpus = tf.config.list_physical_devices('GPU')
print(f"Using device: {'GPU' if gpus else 'CPU'}")
# Define preprocessing functions
def clean_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

def remove_source_code(text):
    return re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)

def preprocess_text(text):
    # Strip <code> blocks before removing the remaining HTML: once BeautifulSoup
    # has extracted the plain text, the <code> tags are gone and the regex
    # would no longer match anything.
    text = remove_source_code(text)
    text = clean_html(text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
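# Illustrative example (not part of the pipeline): for an input like
#   '<p>How to sort a <code>my_list.sort()</code> in Python?</p>'
# preprocess_text drops the <code> block, strips the remaining HTML, removes
# stopwords, and lemmatizes, yielding roughly 'How sort Python ?'.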
# Initialize DataFrame to store results
df_results = pd.DataFrame(columns=['Post ID', 'Predicted Category'])
def load_results():
    if os.path.exists(output_json_file):
        with open(output_json_file, 'r') as file:
            data = json.load(file)
        category_counts = {label: int(data["category_counts"].get(label, 0)) for label in labels}
        return category_counts, data.get("last_post_id")
    else:
        return {label: 0 for label in labels}, None
category_counts, last_post_id = load_results()
def save_results(last_post_id):
    data_to_save = {
        "category_counts": category_counts,
        "last_post_id": last_post_id
    }
    with open(output_json_file, 'w') as json_file:
        json.dump(data_to_save, json_file, indent=4)
    df_results.to_excel(output_excel_file, index=False)
def process_xml(filepath, last_post_id=None, batch_size=100):
    batch = []
    post_ids = []
    start_processing = last_post_id is None  # Start from beginning if no last_post_id
    for event, elem in ET.iterparse(filepath, events=('end',)):
        if elem.tag == 'row':
            current_post_id = elem.get('Id')
            if not start_processing and current_post_id == last_post_id:
                start_processing = True  # Start processing after this ID
                continue
            if start_processing:
                body = elem.get('Body', '')
                title = elem.get('Title', '')
                combined_text = f"{title} {body}"
                processed_text = preprocess_text(combined_text)
                sequence = tokenizer.texts_to_sequences([processed_text])
                padded_sequence = pad_sequences(sequence, maxlen=512)
                batch.append(padded_sequence[0])
                post_ids.append(current_post_id)
                if len(batch) >= batch_size:
                    yield post_ids, batch
                    batch = []
                    post_ids = []
            elem.clear()  # Free memory as we stream through the XML
    if batch:
        yield post_ids, batch
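# Usage sketch (illustrative, with a made-up Id): resuming skips every row up to
# and including last_post_id, then yields (post_ids, padded_sequences) batches:
#   for ids, seqs in process_xml(inputfilepath, last_post_id='42', batch_size=2):
#       ...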
# Initialize progress bar (total is a hard-coded estimate of the row count in Posts.xml)
already_processed = sum(category_counts.values())
pbar = tqdm(total=30000000, initial=already_processed, desc="Processing XML")
last_processed_post_id = last_post_id  # from load_results
for post_ids, batch in process_xml(inputfilepath, last_post_id):
    predictions = model.predict(np.array(batch))
    predicted_classes = predictions.argmax(axis=1).tolist()
    for i, (post_id, pred) in enumerate(zip(post_ids, predicted_classes)):
        # Index predictions by row (i), not by predicted class; default to
        # 'programming' if confidence is low
        label = labels[pred] if predictions[i][pred] > 0.5 else 'programming'
        category_counts[label] += 1
        new_row = pd.DataFrame({'Post ID': [post_id], 'Predicted Category': [label]})
        df_results = pd.concat([df_results, new_row], ignore_index=True)
    pbar.update(len(batch))
    last_processed_post_id = post_ids[-1]  # Update to the last ID in the current batch
    if pbar.n // 1000 > already_processed // 1000:  # Save every 1000 additional items processed
        save_results(last_post_id=last_processed_post_id)
        already_processed = pbar.n
pbar.close()
save_results(last_post_id=last_processed_post_id)
print(f"Predicted data saved to {output_json_file} and {output_excel_file}")