Commit 2153748d (DSSE Group 7)
Authored Jul 14, 2024 by Niharika Aggarwal
Commit message: code2
Parent: cea9b821
1 changed file: Assignment-3/Bonus/WEEK3/bonus2.py (new file, mode 100644, +149 −0)
import torch
import json
import pandas as pd
from tqdm import tqdm
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import os
import nltk
import pickle
from keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import sys
from tensorflow.keras import preprocessing

# Work around the Keras module rename so the pickled tokenizer
# (saved under keras.src.preprocessing) can be unpickled here.
sys.modules['keras.src.preprocessing'] = preprocessing

# Ensure nltk resources are available
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Map a Penn Treebank POS tag to the WordNet POS expected by the lemmatizer:
# adjectives ('J...') -> 'a', nouns/adverbs/verbs keep their letter, everything else -> 'n'.
def pos_tag_mapping(tag):
    first = tag[0].lower()
    if first == 'j':
        return 'a'
    return first if first in ('n', 'r', 'v') else 'n'

# Paths and model configuration
path = r'/pc2/users/n/niharika'
csv_filepath = '/scratch/hpc-prf-dssecs/group7/processed_data_1.csv'
model_path = r'/scratch/hpc-prf-dssecs/group7/best_so_far_92.h5'
tokenizer_path = r'/scratch/hpc-prf-dssecs/group7/tokenizer92.pkl'
output_json_file = 'model_results1.json'
output_csv_file = 'model_results3.csv'

# Labels and categories
labels = ['feature', 'concept', 'technology', 'programming']

# Load CNN model and tokenizer
model = load_model(model_path)
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Check for GPU availability (torch is only used to report the device;
# the Keras model itself runs through TensorFlow)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define preprocessing functions
def clean_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

def remove_source_code(text):
    return re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = clean_html(text)
    text = remove_source_code(text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() and word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word, pos=pos_tag_mapping(pos_tag([word])[0][1]))
                     for word in tokens])

# Initialize DataFrame to store results
df_results = pd.DataFrame(columns=['post_id', 'Preprocessed Text', 'Predicted Category'])

def load_results():
    # Resume from a previous run if a results file already exists
    if os.path.exists(output_json_file):
        with open(output_json_file, 'r') as file:
            data = json.load(file)
        category_counts = {label: 0 for label in labels}
        for label in labels:
            category_counts[label] = int(data["category_counts"].get(label, 0))
        return category_counts, data.get("last_post_id")
    else:
        return {label: 0 for label in labels}, None

category_counts, last_post_id = load_results()

def save_results(last_post_id):
    data_to_save = {"category_counts": category_counts, "last_post_id": last_post_id}
    with open(output_json_file, 'w') as json_file:
        json.dump(data_to_save, json_file, indent=4)
    df_results.to_csv(output_csv_file, index=False)

def process_csv(filepath, last_post_id=None, batch_size=600):
    data = pd.read_csv(filepath)
    # Skip rows up to and including the last post processed in a previous run
    start_index = data.index[data['post_id'] == last_post_id].tolist()
    start_index = start_index[0] + 1 if start_index else 0
    batch = []
    post_ids = []
    preprocessed_texts = []
    for index, row in data.iloc[start_index:].iterrows():
        post_id = row['post_id']
        best_answer = row['best_answer']
        if pd.isnull(best_answer):
            continue
        processed_text = preprocess_text(best_answer)
        sequence = tokenizer.texts_to_sequences([processed_text])
        padded_sequence = pad_sequences(sequence, maxlen=600)
        batch.append(padded_sequence[0])
        post_ids.append(post_id)
        preprocessed_texts.append(processed_text)
        if len(batch) >= batch_size:
            yield post_ids, batch, preprocessed_texts
            batch = []
            post_ids = []
            preprocessed_texts = []
    if batch:
        yield post_ids, batch, preprocessed_texts

# Initialize progress bar
already_processed = sum(category_counts.values())
pbar = tqdm(total=15000000, initial=already_processed, desc="Processing CSV")

last_processed_post_id = last_post_id  # from load_results

for post_ids, batch, preprocessed_texts in process_csv(csv_filepath, last_post_id):
    batch = np.array(batch)  # Convert batch to numpy array
    predictions = model.predict(batch)
    predicted_classes = predictions.argmax(axis=1).tolist()
    for i, (post_id, pred, processed_text) in enumerate(zip(post_ids, predicted_classes, preprocessed_texts)):
        # Default to 'programming' if the confidence of the predicted class is low
        # (index by row i so the confidence belongs to this post's prediction)
        label = labels[pred] if predictions[i][pred] > 0.5 else 'programming'
        category_counts[label] += 1
        new_row = pd.DataFrame({'post_id': [post_id],
                                'Preprocessed Text': [processed_text],
                                'Predicted Category': [label]})
        df_results = pd.concat([df_results, new_row], ignore_index=True)
    pbar.update(len(batch))
    last_processed_post_id = post_ids[-1]  # Update to the last ID in the current batch
    if pbar.n // 1000 > already_processed // 1000:  # Save every 1000 additional items processed
        save_results(last_post_id=last_processed_post_id)
        already_processed = pbar.n

pbar.close()
save_results(last_post_id=last_processed_post_id)
print(f"Predicted data saved to {output_json_file} and {output_csv_file}")
print(f"Using device: {device}")