Commit 5191a23f authored by Jana Germies's avatar Jana Germies
Browse files

start statistical analysis

parent c9507845
"""Statistical Analysis Script ttas data"""
import pandas as pd
import numpy as np
# Load the complete chat log; the csv is read without a header row,
# so the column names are supplied here.
columns = ["chat_id", "user_id", "message", "timestamp"]
df = pd.read_csv(
    "../ttas/ttas-complete-chats.csv",
    header=None,
    names=columns,
)
print(df.head())
# Distinct user ids and chat ids that occur in the log.
unique_users = df["user_id"].unique()
unique_chats = df["chat_id"].unique()
\ No newline at end of file
"""Statistical Analysis Script ttas data"""
import pandas as pd
import numpy as np
from pandas.core.algorithms import unique
from personality_scores import map_extraversion_poles, get_summary
from visualizations import visualize_traits
# read data
def read_chat_data(path):
    """
    Load the chat log from a csv file that has no header row.

    The four columns are named chat_id, user_id, message and timestamp.
    The first rows are printed as a quick check before the frame is returned.
    """
    chat_data = pd.read_csv(
        path,
        header=None,
        names=["chat_id", "user_id", "message", "timestamp"],
    )
    print(chat_data.head())
    return chat_data
def get_n_count(df):
    """
    Collect the distinct user ids and chat ids of the chat log and count them.

    Returns, in this order: the unique user ids, the unique chat ids,
    the number of unique users and the number of unique chats.
    The two counts are also printed.

    Note: there was one test user left in the chats
    """
    user_column = df["user_id"]
    chat_column = df["chat_id"]
    # distinct ids
    users_seen = user_column.unique()
    chats_seen = chat_column.unique()
    # number of distinct ids
    user_total = user_column.nunique()
    chat_total = chat_column.nunique()
    print('number of unique users: %i and number of unique chats: %i' %(user_total, chat_total))
    return users_seen, chats_seen, user_total, chat_total
def remove_superfluous_users(unique_users, trait_df):
    """
    Removes users who filled out the personality questionnaire
    but did not participate in any chats.

    Parameters:
        unique_users: list-like (list, array, Series, ...) holding the
            user ids that occur in the chat data.
        trait_df: data frame with a 'user_id' column.

    Returns:
        A new data frame containing only the rows of trait_df whose
        'user_id' is in unique_users, re-indexed from 0.

    The removed user ids and the cleaned frame are printed.
    """
    # One vectorised membership test drives both the report and the filter,
    # so the two can never disagree. A Python-level `in` test per user would
    # rescan unique_users every time and, for a Series, would look at the
    # index instead of the values.
    mask = trait_df['user_id'].isin(unique_users)
    irregular_names = trait_df.loc[~mask, 'user_id'].tolist()
    print('users who did not participate in any of the chats:', irregular_names)
    # keep the matching rows and discard the old row labels
    clean_df = trait_df[mask].reset_index(drop=True)
    print(clean_df)
    return clean_df
if __name__ == '__main__':
    # load the chat log and the per-user trait scores
    path = "../ttas/ttas-complete-chats.csv"
    chat_data = read_chat_data(path)
    trait_scores = pd.read_csv('trait_scores.csv')
    print(trait_scores.head())
    # distinct users and chats, plus how many there are of each
    unique_users, unique_chats, n_users, n_chats = get_n_count(chat_data)
    # keep only questionnaire participants who also appear in the chats
    clean_trait_df = remove_superfluous_users(unique_users, trait_scores)
    print(len(clean_trait_df))
    # summarize the trait columns only; the id column is dropped first
    trait_columns = clean_trait_df.drop('user_id', axis=1)
    mean_scores = get_summary(trait_columns)
    # steps that are currently switched off:
    #visualize_traits(clean_trait_df)
    # map extraversion scores to pole expression markers
    #extraversion_poles = map_extraversion_poles(clean_trait_df)
    # save results
    #trait_df.to_csv(name+save_path)
import pandas as pd
import numpy as np
# read data
def read_chat_data(path):
    """
    Load the chat log from a csv file that has no header row.

    The four columns are named chat_id, user_id, message and timestamp.
    The first rows are printed as a quick check before the frame is returned.
    """
    chat_data = pd.read_csv(
        path,
        header=None,
        names=["chat_id", "user_id", "message", "timestamp"],
    )
    print(chat_data.head())
    return chat_data
def get_n_count(df):
    """
    Collect the distinct user ids and chat ids of the chat log and count them.

    Returns, in this order: the unique user ids, the unique chat ids,
    the number of unique users and the number of unique chats.
    The two counts are also printed.

    Note: there was one test user left in the chats
    """
    user_column = df["user_id"]
    chat_column = df["chat_id"]
    # distinct ids
    users_seen = user_column.unique()
    chats_seen = chat_column.unique()
    # number of distinct ids
    user_total = user_column.nunique()
    chat_total = chat_column.nunique()
    print('number of unique users: %i and number of unique chats: %i' %(user_total, chat_total))
    return users_seen, chats_seen, user_total, chat_total
\ No newline at end of file
"""
The methods in this script are used to read, format and evaluate answers to the personality questionnaire.
Average scores per user, as well as across the whole sample group are calculated.
"""
from numpy.core.fromnumeric import mean
import pandas as pd
from visualizations import visualize_traits
# read data
def read_personality_data(path):
    """
    Load the questionnaire answers from a csv file that has no header row.

    The three columns are named user_id, question and answer.
    The long table is then pivoted: one row per user, one column per
    question, with the answer values as cells.
    """
    answers = pd.read_csv(
        path,
        header=None,
        names=["user_id", "question", "answer"],
    )
    # wide format: index = user, columns = question, cells = answer
    return answers.pivot_table(index="user_id", columns="question", values="answer")
def remove_false_profiles(df):
    """
    Drop profiles in which every answer has the same value.

    For testing, fake accounts were created and all questions answered with 1.
    Those rows, and any other row consisting of one repeated answer,
    are removed here. Both masks and the cleaned frame are printed.
    """
    first_answer = df.iloc[:, 0]
    # True for rows in which every answer equals the first answer
    equals_mask = df.eq(first_answer, axis="index").all(axis="columns")
    print(equals_mask)
    # True for rows that are kept
    keep_mask = ~equals_mask
    print(keep_mask)
    clean_df = df[keep_mask]
    print(clean_df)
    return clean_df
def recode_q_answers(df):
    """
    The BFI-S questionnaire contains positively and negatively poled questions.
    For evaluation, answers to negatively poled questions are recoded here.

    In the columns 3, 6, 8 and 15 the answers 1..7 are mirrored
    (1 <-> 7, 2 <-> 6, 3 <-> 5, 4 stays 4); any other value is left as is.

    Parameters:
        df: data frame with one column per question number.

    Returns:
        The same data frame object, with the poled columns recoded in place.

    Raises:
        KeyError: if one of the poled question columns is missing.
    """
    poled_questions = [3, 6, 8, 15]
    scale = [1, 2, 3, 4, 5, 6, 7]
    mirrored = scale[::-1]
    for column in poled_questions:
        # Assign the recoded column back explicitly. Calling update() on
        # df[column] is chained assignment: with pandas Copy-on-Write it
        # changes a temporary object and leaves df unrecoded.
        df[column] = df[column].replace(scale, mirrored)
    return df
def calculate_scores_per_user(df):
    """
    Average each user's answers per personality trait.

    Every trait is tied to three question columns of df; the trait score
    is the row-wise mean of those columns. The returned frame has one
    column per trait.
    """
    # trait name -> question columns, listed in output column order
    questions_by_trait = {
        'openness': [4, 10, 14],
        'conscientiousness': [1, 8, 12],
        'extraversion': [2, 6, 9],
        'agreeableness': [3, 7, 13],
        'neuroticism': [5, 11, 15],
    }
    # start empty and add one mean-score column per trait
    personality_df = pd.DataFrame()
    for trait, questions in questions_by_trait.items():
        personality_df[trait] = df[questions].mean(axis=1)
    return personality_df
def get_summary(df):
    """
    Report the minimum, mean and maximum of every column across the sample.

    The summary table (rows: min, mean, max) is printed and returned.
    """
    statistics = ['min', 'mean', 'max']
    # aggregate down the rows, i.e. one value per column and statistic
    summary = df.agg(statistics, axis=0)
    print('statistic summary of data:', summary)
    return summary
def map_extraversion_poles(df, threshold=3.5):
    """
    Map extraversion scores to the trait's polar expressions.

    Scores equal to or above the threshold are mapped to 'extrovert'.
    Scores below the threshold are mapped to 'introvert'; a missing (NaN)
    score fails the comparison and therefore also ends up as 'introvert'.

    Parameters:
        df: data frame with an 'extraversion' column of numeric scores.
        threshold: cut-off between the two poles, 3.5 by default.

    Returns:
        A Series aligned with df that holds 'extrovert' or 'introvert'
        for every row.
    """
    # NOTE(review): 3.5 is kept as the default so existing results do not
    # change. If the scores lie on the 1-7 answer scale handled in
    # recode_q_answers, the midpoint of that scale would be 4 - confirm.
    # select trait
    extraversion_scores = df['extraversion']
    # boolean mask: True where the score reaches the threshold
    mask = extraversion_scores >= threshold
    # replace the booleans with the pole labels
    expressions = mask.replace([True, False], ['extrovert', 'introvert'])
    return expressions
if __name__ == '__main__':
    # read
    data_path = "../ttas/ttas-user-answers.csv"
    # Output location: optional directory prefix plus file name.
    # With both left empty, to_csv("") has no file to open. The chat
    # analysis script reads 'trait_scores.csv', so that name is used here.
    save_path = ""
    name = "trait_scores.csv"
    df = read_personality_data(data_path)
    # process: drop fake profiles, recode negatively poled answers,
    # then average the answers per trait
    df = remove_false_profiles(df)
    mod_df = recode_q_answers(df)
    trait_df = calculate_scores_per_user(mod_df)
    # summarize
    mean_scores = get_summary(trait_df)
    print(mean_scores)
    extraversion_scores = map_extraversion_poles(trait_df)
    print(extraversion_scores)
    # visualize and save results (directory prefix first, then file name)
    trait_df.to_csv(save_path + name)
    #visualize_traits(trait_df)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment