Skip to content
Snippets Groups Projects
Commit 16ea44c5 authored by Abhay Kishorbhai Vaghasiya
Browse files

code updated for more graphs and better understanding of data

parent 64163542
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway
from scipy.stats import shapiro, f_oneway, kruskal
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison
def load_data(filepath):
    """Load the dataset from ``filepath`` and add a dummy ``Type`` column.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The CSV contents as a DataFrame with an extra ``Type`` column whose
        values cycle through 'Concept', 'Technology', 'Methodology' and
        'Pattern' in row order.
    """
    # Read the file the caller asked for (once) instead of a hard-coded path.
    df = pd.read_csv(filepath)
    # Adding a dummy 'Type' column with sample categories. A plain list is
    # assigned positionally, so the labels never depend on index alignment.
    types = ['Concept', 'Technology', 'Methodology', 'Pattern']
    df['Type'] = [types[i % len(types)] for i in range(len(df))]
    return df
def analyze_tags(df):
    """Analyze and visualize the tag frequencies.

    Each entry of ``df['tags']`` is split on commas into individual tags.
    The ten most frequent tags are printed and the 30 most frequent are shown
    as a bar chart. Rows with a missing ``tags`` value are ignored.

    Args:
        df: DataFrame with a ``tags`` column of comma-separated tag strings.
    """
    # Missing values are not strings and have no .split(); drop them before
    # splitting so a single empty cell does not abort the whole analysis.
    tag_lists = df['tags'].dropna().astype(str).str.split(',')
    tag_counts = tag_lists.explode().value_counts()
    print("Top 10 Tags:")
    print(tag_counts.head(10))
    if tag_counts.empty:
        # No tags at all: there is nothing meaningful to chart.
        return
    tag_counts.head(30).plot(kind='bar', figsize=(10, 5))  # Visualize top 30 tags for better clarity
    plt.title('Frequency of Top 30 Tags')
    plt.xlabel('Tags')
    plt.ylabel('Frequency')
    plt.xticks(rotation=90)
    plt.show()
def analyze_answers_votes(df):
    """Show the distributions of answers per post and of post scores.

    Draws one histogram with a KDE overlay for ``df['ans_count']`` and one
    for ``df['post_score']``, displaying each figure as soon as it is drawn.
    """
    # (column, bar colour, chart title, x-axis label), drawn in this order:
    # answers first, then votes (post_score).
    charts = (
        ('ans_count', 'blue', 'Distribution of Answers per Post', 'Number of Answers'),
        ('post_score', 'green', 'Distribution of Post Scores (Votes)', 'Score'),
    )
    for column, colour, title, x_label in charts:
        sns.histplot(df[column], kde=True, color=colour)
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel('Frequency')
        plt.show()
def analyze_by_type(df):
    """Analyze posts characteristics by Type and perform statistical tests.

    Overlays one histogram of ``ans_count`` per ``Type`` group on a single
    figure, then runs a one-way ANOVA across the groups and prints the
    resulting F-statistic and p-value.

    Args:
        df: DataFrame with ``Type`` and ``ans_count`` columns.
    """
    # Group data by Type
    grouped = df.groupby('Type')
    # Plotting answers by type
    for name, group in grouped:
        sns.histplot(group['ans_count'], kde=True, label=name)
    plt.title('Distribution of Answers by Post Type')
    plt.xlabel('Number of Answers')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()
    # ANOVA to test if differences in number of answers across types are statistically significant
    groups = [group['ans_count'].values for name, group in grouped]
    f_stat, p_value = f_oneway(*groups)
    print(f"ANOVA results for number of answers by Type: F-statistic = {f_stat}, P-value = {p_value}")


def normality_tests(df):
    """Run a Shapiro-Wilk normality test on ``ans_count`` for every Type.

    One line is printed per group: either the test statistic and p-value, or
    a notice that the group was skipped because it is too small to test.

    Args:
        df: DataFrame with ``Type`` and ``ans_count`` columns.
    """
    for name, group in df.groupby('Type'):
        answers = group['ans_count']
        if len(answers) < 3:
            # scipy's shapiro raises ValueError("Data must be at least
            # length 3.") on smaller samples; report and move on instead of
            # aborting the remaining groups.
            print(f'Normality test for {name}: skipped, needs at least 3 observations (got {len(answers)})')
            continue
        stat, p = shapiro(answers)
        print(f'Normality test for {name}: Statistics={stat}, p-value={p}')
def perform_statistical_tests(df):
    """Compare ``ans_count`` across ``Type`` groups with an omnibus test.

    A Shapiro-Wilk test is run once per group and its result printed. If any
    group looks non-normal (p < 0.05), or is too small to be tested, the
    Kruskal-Wallis test is used; otherwise a one-way ANOVA is used. When the
    chosen test is significant (p < 0.05), Tukey's HSD post-hoc comparison is
    printed as well.

    Args:
        df: DataFrame with ``Type`` and ``ans_count`` columns.
    """
    grouped = df.groupby('Type')['ans_count']
    data = [group for _, group in grouped]

    # Run Shapiro-Wilk once per group and remember the p-values, so the
    # decision below does not have to repeat the test. None marks a group
    # that could not be tested.
    normality_p_values = []
    for name, group in grouped:
        if len(group) < 3:
            # scipy's shapiro raises ValueError("Data must be at least
            # length 3.") on smaller samples.
            print(f'Normality test for {name}: skipped, needs at least 3 observations (got {len(group)})')
            normality_p_values.append(None)
            continue
        stat, p = shapiro(group)
        print(f'Normality test for {name}: Statistics={stat}, p-value={p}')
        normality_p_values.append(p)

    if len(data) < 2:
        # Neither omnibus test is defined for a single group.
        print("Need at least two Type groups to compare; skipping statistical tests.")
        return

    non_normal = any(p is not None and p < 0.05 for p in normality_p_values)
    untestable = any(p is None for p in normality_p_values)
    if non_normal or untestable:
        if non_normal:
            print("Data does not follow a normal distribution, performing Kruskal-Wallis test.")
        else:
            # Normality is unverified, so fall back to the test that does
            # not assume it.
            print("Normality could not be verified for every group, performing Kruskal-Wallis test.")
        stat, p = kruskal(*data)
        print(f'Kruskal-Wallis test: Statistics={stat}, p-value={p}')
    else:
        print("Data follows a normal distribution, performing ANOVA test.")
        stat, p = f_oneway(*data)
        print(f'ANOVA test: Statistics={stat}, p-value={p}')

    # NOTE(review): the post-hoc step runs after whichever omnibus test was
    # chosen. Tukey's HSD assumes normality, so a rank-based post-hoc may be
    # more appropriate after Kruskal-Wallis - confirm the intended behavior.
    if p < 0.05:
        mc = MultiComparison(df['ans_count'], df['Type'])
        result = mc.tukeyhsd()
        print("Tukey's post-hoc test results:")
        print(result)
def main(filepath='/Users/abhayvaghasiya/Downloads/processed_data.csv'):
    """Load the dataset and run the normality and group-comparison tests.

    Args:
        filepath: CSV file to analyze. Defaults to the path this script was
            written against, so calling ``main()`` with no arguments behaves
            as before.
    """
    df = load_data(filepath)
    normality_tests(df)
    perform_statistical_tests(df)


# Run the analysis only when executed as a script or notebook cell, not on import.
if __name__ == "__main__":
    main()
```
%% Output
Top 10 Tags:
tags
asp 2
net 2
design 2
c 1
web 1
services 1
database 1
oop 1
class 1
iis 1
Name: count, dtype: int64
ANOVA results for number of answers by Type: F-statistic = 2.0333333333333337, P-value = 0.46637164044572
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[11], line 51
48 perform_statistical_tests(df)
50 if __name__ == "__main__":
---> 51 main()
Cell In[11], line 47, in main()
45 filepath = '/Users/abhayvaghasiya/Downloads/processed_data.csv'
46 df = load_data(filepath)
---> 47 normality_tests(df)
48 perform_statistical_tests(df)
Cell In[11], line 18, in normality_tests(df)
16 grouped = df.groupby('Type')
17 for name, group in grouped:
---> 18 stat, p = shapiro(group['ans_count'])
19 print(f'Normality test for {name}: Statistics={stat}, p-value={p}')
File ~/.pyenv/versions/3.10.4/lib/python3.10/site-packages/scipy/stats/_axis_nan_policy.py:531, in _axis_nan_policy_factory.<locals>.axis_nan_policy_decorator.<locals>.axis_nan_policy_wrapper(***failed resolving arguments***)
529 if sentinel:
530 samples = _remove_sentinel(samples, paired, sentinel)
--> 531 res = hypotest_fun_out(*samples, **kwds)
532 res = result_to_tuple(res)
533 res = _add_reduced_axes(res, reduced_axes, keepdims)
File ~/.pyenv/versions/3.10.4/lib/python3.10/site-packages/scipy/stats/_morestats.py:1994, in shapiro(x)
1992 N = len(x)
1993 if N < 3:
-> 1994 raise ValueError("Data must be at least length 3.")
1996 a = zeros(N//2, dtype=np.float64)
1997 init = 0
ValueError: Data must be at least length 3.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment