Commit 10b3a3f0 authored by Jana Germies's avatar Jana Germies
Browse files

map personality score to chats

parent 5191a23f
"""Statistical analysis script for the ttas data."""
import pandas as pd
import numpy as np
# read data
# The CSV is read with header=None, so the column labels are supplied here
# rather than taken from the first row of the file.
columns = ["chat_id", "user_id", "message", "timestamp"]
df = pd.read_csv("../ttas/ttas-complete-chats.csv", header=None, names=columns)
# quick sanity check: show the first rows of the loaded data
print(df.head())
# distinct ids occurring in the chat data (arrays of unique values)
unique_users = df.user_id.unique()
unique_chats = df.chat_id.unique()
\ No newline at end of file
......@@ -31,7 +31,7 @@ def get_n_count(df):
def remove_superfluous_users(unique_users, trait_df):
"""
Removes users who filled out the personality questionnaire
but did not participate in any chats.
but did not participate in any chats from the personality data.
"""
# compare user ids
irregular_names = [n for n in trait_df['user_id'] if n not in unique_users]
......@@ -47,9 +47,9 @@ def remove_superfluous_users(unique_users, trait_df):
if __name__ == '__main__':
# read data
path = "../ttas/ttas-complete-chats.csv"
path = "/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv"
chat_data = read_chat_data(path)
trait_scores = pd.read_csv('trait_scores.csv')
trait_scores = pd.read_csv('/Users/Jana1/Desktop/MA/ttas-py/trait_scores.csv')
print(trait_scores.head())
# count unique users and chats
unique_users, unique_chats, n_users, n_chats = get_n_count(chat_data)
......
import pandas as pd
from inspect_chats import read_chat_data, filter_chats
from compare_dfs import get_n_count, remove_superfluous_users
from personality_scores import get_summary, map_extraversion_poles
from visualizations import visualize_traits
if __name__ == '__main__':
    # locations of the input files (the output location is not used yet)
    chats_input_path = '/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv'
    traits_input_path = '/Users/Jana1/Desktop/MA/ttas-py/trait_scores.csv'
    chat_output_path = ''

    # load the chat log and drop the chats rejected by filter_chats
    chat_data = read_chat_data(chats_input_path)
    filtered_chats = filter_chats(chat_data)

    # unique users / chats that remain after filtering
    unique_users, unique_chats, n_users, n_chats = get_n_count(filtered_chats)

    # load the personality scores and keep only users that appear in the chats
    trait_scores = pd.read_csv(traits_input_path)
    filtered_scores = remove_superfluous_users(unique_users, trait_scores)

    # summary statistics over the trait columns (user ids excluded)
    trait_columns = filtered_scores.drop('user_id', axis=1)
    mean_scores = get_summary(trait_columns)

    # translate extraversion scores into pole markers, keyed by user id
    extraversion_df, extraversion_dict = map_extraversion_poles(filtered_scores)
    print(extraversion_dict)

    # attach each author's extraversion pole to their messages
    pole_per_message = filtered_chats['user_id'].map(extraversion_dict)
    filtered_chats['extraversion_pole'] = pole_per_message
    print(filtered_chats)

    # saving the results is currently disabled:
    # trait_df.to_csv(name+save_path)
    # plotting is currently disabled:
    # visualize_traits(filtered_scores)
\ No newline at end of file
import pandas as pd
import numpy as np
from personality_scores import get_summary
from visualizations import histplot_messages
# read data
def read_chat_data(path):
......@@ -8,18 +10,62 @@ def read_chat_data(path):
print(chat_data.head())
return chat_data
def filter_chats(df, threshold=3):
    """
    Filter data to only contain chats with an appropriate number of turns.

    Args:
        df: chat data with at least the columns 'chat_id' and 'message'.
        threshold: exclusive lower bound on the number of messages a chat
            needs in order to be kept. Defaults to 3, i.e. only chats with
            four or more messages are retained.

    Returns:
        A data frame holding the rows of every chat whose number of
        non-missing messages is greater than `threshold`.
    """
    # group messages by the chat they belong to
    grouped = df.groupby('chat_id')
    # count() ignores missing messages, so only real messages count as turns
    filtered = grouped.filter(lambda chat: chat['message'].count() > threshold)
    return filtered
def get_n_count(df):
    """
    Collect the unique user and chat ids and count them.

    Note: there was one test user left in the chats

    Args:
        df: chat data with the columns 'user_id' and 'chat_id'.

    Returns:
        Tuple (unique_users, unique_chats, n_users, n_chats): the arrays of
        unique ids followed by the number of unique ids.
    """
    # get unique ids
    unique_users = df.user_id.unique()
    unique_chats = df.chat_id.unique()
    # get n unique ids
    n_users = df.user_id.nunique()
    n_chats = df.chat_id.nunique()
    print('number of unique users: %i and number of unique chats: %i' %(n_users, n_chats))
    return unique_users, unique_chats, n_users, n_chats


def summarize_chats(df):
    """
    Get statistical summaries of messages per chat and message lengths.

    Args:
        df: chat data with the columns 'chat_id' and 'message'.

    Returns:
        Tuple (summary_chats, summary_messages): the get_summary() result
        for the number of messages per chat and for the number of
        whitespace-separated words per message.
    """
    # n messages per chat
    n_messages_per_chat = df.groupby('chat_id')['message'].count()
    # get min, max and average
    summary_chats = get_summary(n_messages_per_chat)
    # split individual messages on whitespace and count the words
    message_lens = df['message'].str.split().str.len()
    # get min, max and average
    summary_messages = get_summary(message_lens)
    # return the computed summary, not the function object itself
    return summary_chats, summary_messages
if __name__ == '__main__':
    # load the raw chat log
    raw_chats = read_chat_data("/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv")
    # keep only the chats accepted by filter_chats and report the new shape
    kept_chats = filter_chats(raw_chats)
    print(kept_chats.shape)
    # summaries of messages per chat and of message lengths
    summary_chats, summary_messages = summarize_chats(kept_chats)
    # Earlier exploratory steps (get_n_count on the unfiltered data and
    # histplot_messages on the message lengths) are disabled.
    # TODO: combine stat summaries with personality trait results
    # TODO: --> calculate average scores per person
\ No newline at end of file
......@@ -53,7 +53,7 @@ def recode_q_answers(df):
def calculate_scores_per_user(df):
"""
Calculates personality scores for each trait and user.
Calculate personality scores for each trait and user.
"""
# dimensions and their respective questions (columns)
extra = [2, 6, 9]
......@@ -73,7 +73,7 @@ def calculate_scores_per_user(df):
def get_summary(df):
"""
Calculates the overall average, min and max values for each trait across the sample.
Calculate the overall average, min and max values for each trait across the sample.
"""
#mean_scores= df.mean(axis=0)
summary = df.agg(['min', 'mean', 'max'], axis=0)
......@@ -82,9 +82,9 @@ def get_summary(df):
def map_extraversion_poles(df):
"""
Map scores to the traits polar expressions.
In the case of extraversion, scores euqal to or above 3.5 are mapped to 'extroverted'.
Scores below 3.5 are mapped to 'introverted'.
Map scores to the traits' polar expressions.
In the case of extraversion, scores equal to or above 3.5 are mapped to 'extrovert'.
Scores below 3.5 are mapped to 'introvert'.
"""
# select trait
extraversion_scores = df['extraversion']
......@@ -92,17 +92,24 @@ def map_extraversion_poles(df):
mask = extraversion_scores >= 3.5
# replace values
expressions = mask.replace([True, False], ['extrovert', 'introvert'])
return expressions
# add to data frame
df['poles'] = expressions
expression_df = df.drop(['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism'], axis=1)
# create dict from data frame
expression_dict = pd.Series(expression_df.poles.values, index=expression_df.user_id).to_dict()
return expression_df, expression_dict
if __name__ == '__main__':
# read
data_path = "../ttas/ttas-user-answers.csv"
data_path = "/Users/Jana1/Desktop/MA/ttas/ttas-user-answers.csv"
save_path = ""
name = ""
df = read_personality_data(data_path)
# process
df = remove_false_profiles(df)
print(df.shape)
mod_df = recode_q_answers(df)
trait_df = calculate_scores_per_user(mod_df)
# summarize
......@@ -111,5 +118,5 @@ if __name__ == '__main__':
extraversion_scores = map_extraversion_poles(trait_df)
print(extraversion_scores)
# visualize and save results
trait_df.to_csv(name+save_path)
#trait_df.to_csv(name+save_path)
#visualize_traits(trait_df)
"""Methods to visualize results and create informative graphs."""
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#visualize traits in boxplot
def visualize_traits(df):
    """
    Draw a box plot of the data in `df` and display it.

    Note: the boxes show the median, not the mean. To add the mean, pass
    showmeans=True to sns.boxplot.
    """
    # global seaborn look: white grid background, reversed 'rocket' palette
    sns.set_theme(style='whitegrid')
    sns.set_palette('rocket_r')
    axes = sns.boxplot(data=df)
    # label the axes and title the figure
    axes.set_xlabel('Personality Traits')
    axes.set_ylabel('Expression Values')
    axes.set_title('Big Five Trait Expressions in Participant Sample')
    plt.show()
def histplot_messages(df):
    """
    Draw a histogram of `df` with a kernel density estimate and display it.

    No axis labels or title are set on the plot.
    """
    # global seaborn look: white grid background, reversed 'rocket' palette
    sns.set_theme(style='whitegrid')
    sns.set_palette('rocket_r')
    sns.histplot(data=df, kde=True)
    plt.show()
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment