Skip to content
Snippets Groups Projects
Commit c326f12f authored by Abhay Kishorbhai Vaghasiya's avatar Abhay Kishorbhai Vaghasiya
Browse files

RQ-3 code

parent a20c8855
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway
def load_data(filepath):
    """Load the dataset from ``filepath`` and attach a placeholder 'Type' column.

    Args:
        filepath: Path (or any source accepted by ``pandas.read_csv``) of the
            CSV file to load.

    Returns:
        A DataFrame with the CSV contents plus a 'Type' column whose values
        cycle through four sample categories in row order.
    """
    # Honor the caller-supplied path instead of a machine-specific location,
    # so the function works on any machine and with any dataset.
    df = pd.read_csv(filepath)
    # Adding a dummy 'Type' column with sample categories
    types = ['Concept', 'Technology', 'Methodology', 'Pattern']
    # Assign a plain list (positional) rather than a Series, so the labels
    # never depend on how the DataFrame's index happens to be aligned.
    df['Type'] = [types[row % len(types)] for row in range(len(df))]
    return df
def analyze_tags(df):
    """Analyze and visualize the tag frequencies.

    Splits every comma-separated entry of ``df['tags']`` into individual
    tags, prints the ten most frequent ones and shows a bar chart of the
    thirty most frequent ones.

    Args:
        df: DataFrame containing a 'tags' column of comma-separated tag
            strings. Rows whose tag cell is missing are skipped.
    """
    # Drop missing cells first: calling ``.split`` on a float NaN raises
    # AttributeError. ``astype(str)`` keeps the ``.str`` accessor usable even
    # when the column was not read as strings.
    tag_lists = df['tags'].dropna().astype(str).str.split(',')
    tag_counts = tag_lists.explode().value_counts()
    print("Top 10 Tags:")
    print(tag_counts.head(10))
    if tag_counts.empty:
        # No tags at all: skip the chart rather than plotting an empty Series.
        return
    tag_counts.head(30).plot(kind='bar', figsize=(10, 5))  # Visualize top 30 tags for better clarity
    plt.title('Frequency of Top 30 Tags')
    plt.xlabel('Tags')
    plt.ylabel('Frequency')
    plt.xticks(rotation=90)
    plt.show()
def analyze_answers_votes(df):
    """Show histograms of the answer counts and of the post scores (votes)."""
    # One entry per chart, in display order:
    # (column to plot, bar colour, figure title, x-axis label).
    chart_specs = (
        ('ans_count', 'blue', 'Distribution of Answers per Post', 'Number of Answers'),
        ('post_score', 'green', 'Distribution of Post Scores (Votes)', 'Score'),
    )
    for column, colour, title, x_label in chart_specs:
        sns.histplot(df[column], kde=True, color=colour)
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel('Frequency')
        plt.show()
def analyze_by_type(df):
    """Compare answer counts across post types with a plot and a one-way ANOVA."""
    by_type = df.groupby('Type')

    # Single pass over the groups: overlay each type's histogram on the
    # shared axes and keep its raw answer counts for the ANOVA below.
    answer_samples = []
    for type_name, subset in by_type:
        answers = subset['ans_count']
        sns.histplot(answers, kde=True, label=type_name)
        answer_samples.append(answers.values)

    plt.title('Distribution of Answers by Post Type')
    plt.xlabel('Number of Answers')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

    # One-way ANOVA: are the differences in answer counts between types
    # statistically significant?
    f_stat, p_value = f_oneway(*answer_samples)
    print(f"ANOVA results for number of answers by Type: F-statistic = {f_stat}, P-value = {p_value}")
def main():
    """Load the dataset and run the tag, answers/votes and per-type analyses."""
    filepath = 'path_to_your_data.csv'  # Change to your actual file path
    df = load_data(filepath)

    # The tag analysis is optional; the other analyses run regardless.
    if 'tags' not in df.columns:
        print("Error: 'tags' column not found in the DataFrame.")
    else:
        analyze_tags(df)

    analyze_answers_votes(df)
    analyze_by_type(df)


if __name__ == "__main__":
    main()
```
%% Output
Top 10 Tags:
tags
asp 2
net 2
design 2
c 1
web 1
services 1
database 1
oop 1
class 1
iis 1
Name: count, dtype: int64
ANOVA results for number of answers by Type: F-statistic = 2.0333333333333337, P-value = 0.46637164044572
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment