DSSE Group 7 · Commit 64163542
Authored by Niharika Aggarwal on Jul 10, 2024
RQ2
Parent: c3eadf99
Showing 2 changed files, with 136 additions and 0 deletions:

Assignment-3/Bonus/bonus.py                        +136  −0
Assignment-3/Week2/CNN Reattempt/tokenizer92.pkl     +0  −0
Assignment-3/Bonus/bonus.py (new file, mode 100644, +136 −0)
import torch
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import xml.etree.ElementTree as ET
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import nltk
import pickle
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Ensure nltk resources are available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Paths and model configuration
path = '/pc2/users/n/niharika'
inputfilepath = '/scratch/hpc-prf-dssecs/Posts.xml'
model_path = f'{path}/model/best_so_far_92.h5'
tokenizer_path = f'{path}/model/tokenizer92.pkl'
output_json_file = 'model_results.json'
output_excel_file = 'model_results.xlsx'

# Labels and categories
labels = ['feature', 'concept', 'technology', 'programming']
# Load CNN model and tokenizer
model = load_model(model_path)
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Check for GPU availability (Note: TensorFlow/Keras model might utilize GPUs differently)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Define preprocessing functions
def clean_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

def remove_source_code(text):
    return re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)

def preprocess_text(text):
    text = clean_html(text)
    text = remove_source_code(text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
# Initialize DataFrame to store results
df_results = pd.DataFrame(columns=['Post ID', 'Predicted Category'])

def load_results():
    if os.path.exists(output_json_file):
        with open(output_json_file, 'r') as file:
            data = json.load(file)
        category_counts = {label: 0 for label in labels}
        for label in labels:
            category_counts[label] = int(data["category_counts"].get(label, 0))
        return category_counts, data.get("last_post_id")
    else:
        return {label: 0 for label in labels}, None

category_counts, last_post_id = load_results()
def save_results(last_post_id):
    data_to_save = {
        "category_counts": category_counts,
        "last_post_id": last_post_id
    }
    with open(output_json_file, 'w') as json_file:
        json.dump(data_to_save, json_file, indent=4)
    df_results.to_excel(output_excel_file, index=False)
def process_xml(filepath, last_post_id=None, batch_size=100):
    batch = []
    post_ids = []
    start_processing = last_post_id is None  # Start from beginning if no last_post_id
    for event, elem in ET.iterparse(filepath, events=('end',)):
        if elem.tag == 'row':
            current_post_id = elem.get('Id')
            if not start_processing and current_post_id == last_post_id:
                start_processing = True  # Start processing after this ID
                continue
            if start_processing:
                body = elem.get('Body', '')
                title = elem.get('Title', '')
                combined_text = f"{title} {body}"
                processed_text = preprocess_text(combined_text)
                sequence = tokenizer.texts_to_sequences([processed_text])
                padded_sequence = pad_sequences(sequence, maxlen=512)
                batch.append(padded_sequence[0])
                post_ids.append(current_post_id)
                if len(batch) >= batch_size:
                    yield post_ids, batch
                    batch = []
                    post_ids = []
            elem.clear()
    if batch:
        yield post_ids, batch
# Initialize progress bar
already_processed = sum(category_counts.values())
pbar = tqdm(total=30000000, initial=already_processed, desc="Processing XML")

last_processed_post_id = last_post_id  # from load_results
for post_ids, batch in process_xml(inputfilepath, last_post_id):
    predictions = model.predict(np.array(batch))  # stack the padded sequences into one 2-D array for Keras
    predicted_classes = predictions.argmax(axis=1).tolist()
    for i, (post_id, pred) in enumerate(zip(post_ids, predicted_classes)):
        label = labels[pred] if predictions[i][pred] > 0.5 else 'programming'  # Default to programming if confidence is low
        category_counts[label] += 1
        new_row = pd.DataFrame({'Post ID': [post_id], 'Predicted Category': [label]})
        df_results = pd.concat([df_results, new_row], ignore_index=True)
    pbar.update(len(batch))
    last_processed_post_id = post_ids[-1]  # Update to the last ID in the current batch
    if pbar.n // 1000 > already_processed // 1000:
        # Save every 1000 additional items processed
        save_results(last_post_id=last_processed_post_id)
        already_processed = pbar.n

pbar.close()
save_results(last_post_id=last_processed_post_id)
print(f"Predicted data saved to {output_json_file} and {output_excel_file}")
\ No newline at end of file
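
For readers skimming the diff, the least obvious step in the prediction loop above is the low-confidence fallback: a post keeps its predicted label only when the model's probability for that class exceeds 0.5, otherwise it is counted under 'programming'. The following standalone sketch, using made-up probabilities and not part of this commit, illustrates that rule:

import numpy as np

labels = ['feature', 'concept', 'technology', 'programming']

# Two hypothetical rows of model output, one confident and one not
predictions = np.array([
    [0.05, 0.10, 0.80, 0.05],   # clear winner: 'technology' at 0.80
    [0.30, 0.28, 0.22, 0.20],   # no class above 0.5, so fall back to 'programming'
])

for row in predictions:
    pred = int(row.argmax())
    label = labels[pred] if row[pred] > 0.5 else 'programming'
    print(label)
# prints: technology, then programming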
Assignment-3/Week2/CNN Reattempt/tokenizer92.pkl (new file, mode 100644, +0 −0, diff collapsed)