# RQ2
# Commit 64163542 (parent c3eadf99), authored by Niharika Aggarwal
import json
import os
import pickle
import re
import xml.etree.ElementTree as ET

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm
# Ensure nltk resources are available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Paths and model configuration
path = '/pc2/users/n/niharika'
inputfilepath = '/scratch/hpc-prf-dssecs/Posts.xml'
model_path = f'{path}/model/best_so_far_92.h5'
tokenizer_path = f'{path}/model/tokenizer92.pkl'
output_json_file = 'model_results.json'
output_excel_file = 'model_results.xlsx'
# Labels and categories
labels = ['feature', 'concept', 'technology', 'programming']
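# Note: the order of this list is assumed to match the class order the CNN was
# trained with; predicted class indices are mapped back through it below.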
# Load CNN model and tokenizer
model = load_model(model_path)
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)
# Check for GPU availability (the Keras model picks up a visible GPU automatically)
gpus = tf.config.list_physical_devices('GPU')
print(f"Using device: {'GPU' if gpus else 'CPU'}")
# Define preprocessing functions
def clean_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

def remove_source_code(text):
    return re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)

def preprocess_text(text):
    # Strip <code> blocks before removing the remaining HTML: once BeautifulSoup
    # has extracted the plain text, the <code> tags are gone and the regex
    # would no longer match anything.
    text = remove_source_code(text)
    text = clean_html(text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
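# Illustrative example (not part of the pipeline): for an input like
#   '<p>How to sort a <code>my_list.sort()</code> in Python?</p>'
# preprocess_text drops the <code> block, strips the remaining HTML, removes
# stopwords, and lemmatizes, yielding roughly 'How sort Python ?'.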
# Initialize DataFrame to store results
df_results = pd.DataFrame(columns=['Post ID', 'Predicted Category'])
def load_results():
    if os.path.exists(output_json_file):
        with open(output_json_file, 'r') as file:
            data = json.load(file)
        category_counts = {label: int(data["category_counts"].get(label, 0)) for label in labels}
        return category_counts, data.get("last_post_id")
    else:
        return {label: 0 for label in labels}, None
category_counts, last_post_id = load_results()
def save_results(last_post_id):
    data_to_save = {
        "category_counts": category_counts,
        "last_post_id": last_post_id
    }
    with open(output_json_file, 'w') as json_file:
        json.dump(data_to_save, json_file, indent=4)
    df_results.to_excel(output_excel_file, index=False)
def process_xml(filepath, last_post_id=None, batch_size=100):
    batch = []
    post_ids = []
    start_processing = last_post_id is None  # Start from beginning if no last_post_id
    for event, elem in ET.iterparse(filepath, events=('end',)):
        if elem.tag == 'row':
            current_post_id = elem.get('Id')
            if not start_processing and current_post_id == last_post_id:
                start_processing = True  # Start processing after this ID
                continue
            if start_processing:
                body = elem.get('Body', '')
                title = elem.get('Title', '')
                combined_text = f"{title} {body}"
                processed_text = preprocess_text(combined_text)
                sequence = tokenizer.texts_to_sequences([processed_text])
                padded_sequence = pad_sequences(sequence, maxlen=512)
                batch.append(padded_sequence[0])
                post_ids.append(current_post_id)
                if len(batch) >= batch_size:
                    yield post_ids, batch
                    batch = []
                    post_ids = []
            elem.clear()  # Free memory as we stream through the XML
    if batch:
        yield post_ids, batch
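# Usage sketch (illustrative, with a made-up Id): resuming skips every row up to
# and including last_post_id, then yields (post_ids, padded_sequences) batches:
#   for ids, seqs in process_xml(inputfilepath, last_post_id='42', batch_size=2):
#       ...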
# Initialize progress bar (total is a hard-coded estimate of the row count in Posts.xml)
already_processed = sum(category_counts.values())
pbar = tqdm(total=30000000, initial=already_processed, desc="Processing XML")
last_processed_post_id = last_post_id  # from load_results
for post_ids, batch in process_xml(inputfilepath, last_post_id):
    predictions = model.predict(np.array(batch))
    predicted_classes = predictions.argmax(axis=1).tolist()
    for i, (post_id, pred) in enumerate(zip(post_ids, predicted_classes)):
        # Index predictions by row (i), not by predicted class; default to
        # 'programming' if confidence is low
        label = labels[pred] if predictions[i][pred] > 0.5 else 'programming'
        category_counts[label] += 1
        new_row = pd.DataFrame({'Post ID': [post_id], 'Predicted Category': [label]})
        df_results = pd.concat([df_results, new_row], ignore_index=True)
    pbar.update(len(batch))
    last_processed_post_id = post_ids[-1]  # Update to the last ID in the current batch
    if pbar.n // 1000 > already_processed // 1000:  # Save every 1000 additional items processed
        save_results(last_post_id=last_processed_post_id)
        already_processed = pbar.n
pbar.close()
save_results(last_post_id=last_processed_post_id)
print(f"Predicted data saved to {output_json_file} and {output_excel_file}")