Commit 2153748d authored by Niharika Aggarwal

code2

parent cea9b821
import torch
import json
import pandas as pd
from tqdm import tqdm
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import os
import nltk
import pickle
from keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import sys
from tensorflow.keras import preprocessing
# Alias the 'keras.src.preprocessing' module path so the pickled tokenizer
# (saved under a different Keras version) can be unpickled.
sys.modules['keras.src.preprocessing'] = preprocessing
# Ensure nltk resources are available
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Map a Penn Treebank POS tag to the WordNet POS expected by the lemmatizer (default: noun).
pos_tag_mapping = lambda tag: 'a' if tag[0].lower() == 'j' else (tag[0].lower() if tag[0].lower() in ['n', 'r', 'v'] else 'n')
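# Example mappings (illustrative): 'NN' -> 'n', 'VBD' -> 'v', 'JJ' -> 'a', 'RB' -> 'r', 'IN' -> 'n'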
# Paths and model configuration
path = r'/pc2/users/n/niharika'
csv_filepath = '/scratch/hpc-prf-dssecs/group7/processed_data_1.csv'
model_path = r'/scratch/hpc-prf-dssecs/group7/best_so_far_92.h5'
tokenizer_path = r'/scratch/hpc-prf-dssecs/group7/tokenizer92.pkl'
output_json_file = 'model_results1.json'
output_csv_file = 'model_results3.csv'
# Labels and categories
labels = ['feature', 'concept', 'technology', 'programming']
# Load CNN model and tokenizer
model = load_model(model_path)
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)
# Check for GPU availability (informational; TensorFlow/Keras manages its own device placement)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Define preprocessing functions
def clean_html(text):
    # Strip HTML markup and keep only the visible text.
    return BeautifulSoup(text, "html.parser").get_text()

def remove_source_code(text):
    # Drop code snippets enclosed in <code>...</code> tags.
    return re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Remove code snippets before stripping HTML; otherwise the <code> tags
    # are already gone and the regex has nothing to match.
    text = remove_source_code(text)
    text = clean_html(text)
    text = re.sub(r'[^\w\s]', '', text)  # drop punctuation
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() and word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word, pos=pos_tag_mapping(pos_tag([word])[0][1]))
        for word in tokens
    )
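# Illustrative example (exact output depends on the installed NLTK models):
#   preprocess_text("<p>The answers <code>x = 1</code> were helpful</p>")
#   -> "answer helpful"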
# Initialize DataFrame to store results
df_results = pd.DataFrame(columns=['post_id', 'Preprocessed Text', 'Predicted Category'])
def load_results():
    # Resume from a previous run if a checkpoint file exists.
    if os.path.exists(output_json_file):
        with open(output_json_file, 'r') as file:
            data = json.load(file)
        category_counts = {label: int(data["category_counts"].get(label, 0)) for label in labels}
        return category_counts, data.get("last_post_id")
    else:
        return {label: 0 for label in labels}, None
category_counts, last_post_id = load_results()
def save_results(last_post_id):
    # Write the running counts and checkpoint position, and dump all results so far.
    data_to_save = {
        "category_counts": category_counts,
        "last_post_id": last_post_id
    }
    with open(output_json_file, 'w') as json_file:
        json.dump(data_to_save, json_file, indent=4)
    df_results.to_csv(output_csv_file, index=False)
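# Checkpoint file layout (illustrative values):
# {
#     "category_counts": {"feature": 12, "concept": 3, "technology": 25, "programming": 160},
#     "last_post_id": 123456
# }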
def process_csv(filepath, last_post_id=None, batch_size=600):
    # Stream the CSV in batches, resuming after last_post_id if one is given.
    data = pd.read_csv(filepath)
    start_index = data.index[data['post_id'] == last_post_id].tolist()
    start_index = start_index[0] + 1 if start_index else 0
    batch = []
    post_ids = []
    preprocessed_texts = []
    for index, row in data.iloc[start_index:].iterrows():
        post_id = row['post_id']
        best_answer = row['best_answer']
        if pd.isnull(best_answer):
            continue
        processed_text = preprocess_text(best_answer)
        sequence = tokenizer.texts_to_sequences([processed_text])
        padded_sequence = pad_sequences(sequence, maxlen=600)
        batch.append(padded_sequence[0])
        post_ids.append(post_id)
        preprocessed_texts.append(processed_text)
        if len(batch) >= batch_size:
            yield post_ids, batch, preprocessed_texts
            batch = []
            post_ids = []
            preprocessed_texts = []
    if batch:
        yield post_ids, batch, preprocessed_texts
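# Each yield is a triple of parallel lists (illustrative):
#   post_ids           -> [101, 102, ...]
#   batch              -> padded token-index sequences, each of length 600
#   preprocessed_texts -> ["answer helpful", ...]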
# Initialize progress bar
already_processed = sum(category_counts.values())
pbar = tqdm(total=15000000, initial=already_processed, desc="Processing CSV")  # total is a rough, hard-coded upper bound
last_processed_post_id = last_post_id # from load_results
for post_ids, batch, preprocessed_texts in process_csv(csv_filepath, last_post_id):
    batch = np.array(batch)  # shape: (batch_size, 600)
    predictions = model.predict(batch)
    predicted_classes = predictions.argmax(axis=1).tolist()
    for i, (post_id, pred, processed_text) in enumerate(zip(post_ids, predicted_classes, preprocessed_texts)):
        # Default to 'programming' when the top class probability is low.
        label = labels[pred] if predictions[i][pred] > 0.5 else 'programming'
        category_counts[label] += 1
        new_row = pd.DataFrame({'post_id': [post_id], 'Preprocessed Text': [processed_text], 'Predicted Category': [label]})
        df_results = pd.concat([df_results, new_row], ignore_index=True)
    pbar.update(len(batch))
    last_processed_post_id = post_ids[-1]  # last ID in the current batch
    if pbar.n // 1000 > already_processed // 1000:  # save every 1000 additional items processed
        save_results(last_post_id=last_processed_post_id)
        already_processed = pbar.n
pbar.close()
save_results(last_post_id=last_processed_post_id)
print(f"Predicted data saved to {output_json_file} and {output_csv_file}")
print(f"Using device: {device}")