Commit 9fd1b825 authored by Jana Germies's avatar Jana Germies

clean up

parent 0fa8ae35
@@ -4,8 +4,23 @@ ma-venv
#data
ttas-data
#spell check files
spellcheck
#hidden files
.idea
#outputs
outputs/chats
outputs/liwc
outputs/test
/outputs/ttas_clean_chats3.csv
/outputs/ttas_clean_chats2.csv
/outputs/ttas_clean_chats.csv
/outputs/lsm_scores_rocket.png
/outputs/lsm_scores_magma.png
/outputs/lsm_scores_by_trait.csv
/outputs/lsm_score_rocket.png
/outputs/lsm_score_magma.png
/outputs/liwc_scores_whitegrid.png
/outputs/liwc_scores.csv
/outputs/liwc_overall_stats.csv
/outputs/big_5_with_means.png
"""
Here, methods to read, format and evaluate answers to the personality questionnaire are provided.
Scores per user and effects across the whole sample are calculated.
"""
import pandas as pd
from visualization.visualizations import boxplot_trait_scores, scatterplot_interaction
from inspect_chats import read_chat_data, filter_chats, get_summary, get_n_count, summarize_chats
## basic processing of questionnaire ##
def read_personality_data(path):
"""
Initial minor formatting is done here:
a header with column names is added and the table is pivoted.
"""
# set column names
columns = ["user_id", "question", "answer"]
# load csv
data_df = pd.read_csv(path, header=None, names=columns)
# pivot table
pivot_df = data_df.pivot_table(index="user_id", columns="question", values="answer")
return pivot_df
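# Illustrative sketch of the reshape above (comments only, not executed), assuming
# the raw CSV holds one (user_id, question, answer) triple per row, e.g.
#   101, 1, 5
#   101, 2, 3
# pivot_table() turns this long format into one row per user and one column per
# question number:
#   question   1  2  ...
#   user_id
#   101        5  3  ...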
def remove_fake_profiles(df_traits):
"""
For testing the chat app, fake accounts were created and all questions answered with 1.
These test accounts and other possible fake accounts are removed here.
"""
# locate rows where all values are equal and create mask
equals_mask = df_traits.eq(df_traits.iloc[:, 0], axis=0).all(axis=1)
# invert mask
inverted_mask = ~equals_mask
# apply mask
clean_df = df_traits[inverted_mask]
return clean_df
def recode_answers(df_traits):
"""
The BFI-S questionnaire contains positively and negatively poled questions.
For evaluation, answers to negatively poled questions are re-coded.
"""
poled_questions = [3, 6, 8, 15]
for column in poled_questions:
df_traits[column] = df_traits[column].replace([1, 2, 3, 4, 5, 6, 7], [7, 6, 5, 4, 3, 2, 1])
return df_traits
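# Worked example of the re-coding, assuming the 7-point scale implied by the
# mapping above: on a negatively poled item (column 3, 6, 8 or 15) an answer of
# 2 becomes 6, a 7 becomes 1, and the scale midpoint 4 stays unchanged.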
def calculate_scores_per_user(df_traits):
"""Calculate personality scores for each trait and user."""
# dimensions and their respective questions (column indices)
extra = [2, 6, 9]
agree = [3, 7, 13]
conscient = [1, 8, 12]
open = [4, 10, 14]
neurotic = [5, 11, 15]
#create empty data frame
personality_df = pd.DataFrame()
# add columns with mean score values
personality_df['openness'] = df_traits[open].mean(axis=1)
personality_df['conscientiousness'] = df_traits[conscient].mean(axis=1)
personality_df['extraversion'] = df_traits[extra].mean(axis=1)
personality_df['agreeableness'] = df_traits[agree].mean(axis=1)
personality_df['neuroticism'] = df_traits[neurotic].mean(axis=1)
return personality_df
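# Sketch of the aggregation, assuming the column indices above match the BFI-S
# item numbering: each trait score is the plain mean of its three (re-coded)
# items, e.g. answers of 5, 6 and 4 on items 4, 10 and 14 give an openness
# score of (5 + 6 + 4) / 3 = 5.0.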
## collate questionnaire and chat data ##
def remove_superfluous_users(unique_users, df_traits):
"""
Remove users who filled out the personality questionnaire
but did not participate in any chats from the personality data.
"""
# compare user ids
irregular_names = [n for n in df_traits['user_id'] if n not in unique_users]
print('users who did not participate in any of the chats:', irregular_names)
# create boolean mask
mask = df_traits['user_id'].isin(unique_users)
# apply mask and remove users
clean_df = df_traits[mask]
clean_df = clean_df.reset_index(drop=True)
return clean_df
def map_extraversion_poles(df_traits):
"""
Map scores to the traits' polar expressions.
In the case of extraversion, scores equal to or above 3.5 are mapped to 'extrovert'.
Scores below 3.5 are mapped to 'introvert'.
"""
# select trait
extraversion_scores = df_traits['extraversion']
# create boolean mask based on half-way point of scale (3.5)
mask = extraversion_scores >= 3.5
# replace values
expressions = mask.map({True: 'extrovert', False: 'introvert'})
# add to data frame
df_traits['poles'] = expressions
expression_df = df_traits.drop(['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism'], axis=1)
# create dict from data frame
expression_dict = pd.Series(expression_df.poles.values, index=expression_df.user_id).to_dict()
print("Users and their respective extraversion poles:", expression_dict)
return expression_dict
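# Illustrative result for two hypothetical users with extraversion scores of
# 4.2 and 2.8:
#   {'user_a': 'extrovert', 'user_b': 'introvert'}
# This dict is later mapped onto the chat data via Series.map().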
def get_interaction_message_lengths_scores(df_chats, df_traits, message_lens):
"""
Map extraversion scores to message lengths and assess possible factor interaction.
"""
# map extraversion scores to users and create dict
scores = df_traits.drop(['openness', 'conscientiousness', 'agreeableness', 'neuroticism'], axis=1)
score_dict = pd.Series(scores.extraversion.values, index=scores.user_id).to_dict()
# map scores and message lengths to users in chat data frame
df_chats['extraversion_scores'] = df_chats['user_id'].map(score_dict)
interaction_df = pd.concat([df_chats['user_id'], message_lens, df_chats['extraversion_scores']], axis=1)
# for AVERAGE msg lengths grouped per user:
#small_df = pd.concat([df_chats['user_id'], lens, df_chats['extraversion_scores']], axis=1)
#grouped = small_df.groupby('user_id')
#interaction_df = grouped.mean()
scatterplot_interaction(interaction_df)
return interaction_df
if __name__ == '__main__':
# paths
personality_path_in = '/Users/Jana1/Desktop/MA/ttas/ttas-user-answers.csv'
personality_path_out = '/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv'
chat_path_in = '/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv'
chat_path_out = '/Users/Jana1/Desktop/MA/ttas/ttas-filtered-chats.csv'
# read
trait_data = read_personality_data(personality_path_in)
chat_data = read_chat_data(chat_path_in)
# filter chats
filtered_chats = filter_chats(chat_data)
# process trait scores
clean_trait_df = remove_fake_profiles(trait_data)
recoded_trait_df = recode_answers(clean_trait_df)
score_df = calculate_scores_per_user(recoded_trait_df)
# compare with chat data
unique_users, unique_chats, n_users, n_chats = get_n_count(filtered_chats)
filtered_scores = remove_superfluous_users(unique_users, score_df)
# calculate stats
mean_scores = get_summary(filtered_scores.drop('user_id', axis=1))
# save results
#filtered_scores.to_csv(personality_path_out)
# visualize results
boxplot_trait_scores(filtered_scores)
# interaction between message lengths and extraversion trait scores
message_lens, summary_chats, summary_messages = summarize_chats(filtered_chats)
interaction_df = get_interaction_message_lengths_scores(filtered_chats, filtered_scores, message_lens)
# add extraversion pole expressions to chat data
extraversion_poles_dict = map_extraversion_poles(filtered_scores)
# TODO: move into map function?
filtered_chats['extraversion_pole'] = filtered_chats['user_id'].map(extraversion_poles_dict)
filtered_chats = filtered_chats.drop('index', axis=1)
# save results
#filtered_chats.to_csv(chat_path_out)
"""Apply methods to clean data"""
import pandas as pd
from inspect_chats import read_chat_data, filter_chats, get_n_count, get_summary
from calculate_personality_scores import map_extraversion_poles, remove_superfluous_users
from visualization.visualizations import boxplot_trait_scores
if __name__ == '__main__':
chats_input_path = '/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv'
traits_input_path = '/Users/Jana1/Desktop/MA/ttas-py/trait_scores.csv'
chat_output_path = '/Users/Jana1/Desktop/MA/ttas/ttas-filtered-chats.csv'
traits_out_path = '/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv'
# read chat data (chats with fewer than 4 messages are filtered out below)
chat_data = read_chat_data(chats_input_path)
# filter data
filtered_chats = filter_chats(chat_data)
# count unique users and chats
unique_users, unique_chats, n_users, n_chats = get_n_count(filtered_chats)
# read personality score data
trait_scores = pd.read_csv(traits_input_path)
# compare data frames and remove superfluous users from personality data
filtered_scores = remove_superfluous_users(unique_users, trait_scores)
mean_scores = get_summary(filtered_scores.drop('user_id', axis=1))
print(filtered_scores)
# map extraversion scores to pole expression markers
extraversion_dict = map_extraversion_poles(filtered_scores)
#print(extraversion_dict)
# add extraversion pole to respective users
filtered_chats['extraversion_pole'] = filtered_chats['user_id'].map(extraversion_dict)
filtered_chats = filtered_chats.drop('index', axis=1)
print(filtered_chats)
# save results
filtered_scores.to_csv(traits_out_path)
filtered_chats.to_csv(chat_output_path)
# visualize
#boxplot_trait_scores(filtered_scores)
"""Here, methods to filter out futile data points and create a statistical overview of the chat data set are provided."""
import pandas as pd
from visualization.visualizations import histplot_messages, scatterplot_interaction
# read data
def read_chat_data(path):
"""Read in the data and add respective column names."""
columns = ["chat_id", "user_id", "message", "timestamp"]
chat_data = pd.read_csv(path, header=None, names=columns)
print(chat_data.head())
return chat_data
def filter_chats(df):
"""Filter data to only contain chats with appropriate number of turns."""
# set threshold
threshold = 3
# group messages
grouped = df.groupby('chat_id')
# filter for message count
filtered = grouped.filter(lambda x: x['message'].count() > threshold)
filtered = filtered.reset_index()
return filtered
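# Minimal usage sketch with a toy frame (hypothetical data, commented out):
# a chat with 3 or fewer messages is dropped entirely, one with 4 or more is kept.
# toy = pd.DataFrame({'chat_id': [1, 1, 1, 2, 2, 2, 2],
#                     'user_id': ['a', 'b', 'a', 'c', 'd', 'c', 'd'],
#                     'message': ['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7']})
# filter_chats(toy)  # -> only the four rows of chat_id 2 remain (plus an 'index' column)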
def get_summary(df):
"""Calculate the overall average, min and max values across sample."""
#mean_scores= df.mean(axis=0)
summary = df.agg(['min', 'mean', 'max'], axis=0)
print('statistical summary of the data:', summary)
return summary
def summarize_chats(df):
"""Get statistical summaries of messages per chat and message lengths."""
# group
grouped = df.groupby('chat_id')
# n messages per chat
n_messages_per_chat = grouped['message'].count()
# get min, max and average
summary_chats = get_summary(n_messages_per_chat)
# length of individual messages
messages = df['message']
# split individual messages and count words
message_lens = messages.str.split().str.len()
# get min, max and average
summary_messages = get_summary(message_lens)
return message_lens, summary_chats, summary_messages
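# Note on the word count above: str.split() tokenises on whitespace, so a
# message like "Hi there!" counts as 2 words and punctuation is not stripped.
# get_summary() is reused for both messages per chat and words per message.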
def get_n_count(df):
"""
Get count of unique users and chats.
Note: There was one test case left in the chat data.
"""
# get unique ids
unique_users = df.user_id.unique()
unique_chats = df.chat_id.unique()
# get n unique ids
n_users = df.user_id.nunique()
n_chats = df.chat_id.nunique()
#n_chats = len(unique_chats)
print('number of unique users: %i and number of unique chats: %i' %(n_users, n_chats))
return unique_users, unique_chats, n_users, n_chats
if __name__ == '__main__':
# deferred import to avoid a circular import with calculate_personality_scores
from calculate_personality_scores import get_interaction_message_lengths_scores
# read data
chat_data = read_chat_data("/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv")
# filter data
filtered = filter_chats(chat_data)
print(filtered.shape)
# get n unique users and chats
unique_users, unique_chats, n_users, n_chats = get_n_count(filtered)
# get n messages per chats, message lengths
message_lens, summary_chats, summary_messages = summarize_chats(filtered)
# visualize
#histplot_messages(message_lens)
# personality data
trait_scores = pd.read_csv('/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv')
# interaction
interaction_df = get_interaction_message_lengths_scores(filtered, trait_scores, message_lens)
"""Methods to visualize results and create knowledgable graphs"""
"""Methods to visualize results and create knowledgeable graphs"""
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# visualize traits in boxplot
def boxplot_trait_scores(df):
"""
Plot a boxplot of each trait in one comprehensive graph.
"""
sns.set_theme(style='whitegrid')
# palette = sns.color_palette('flare', as_cmap=True) #rocket
sns.set_palette('rocket_r')
ax = sns.boxplot(data=df)
# Note: by default shows median not mean
# To add mean set argument showmeans = True
ax.set(xlabel='Personality Traits', ylabel='Expression Values')
ax.set_title('Big Five Trait Expressions in Participant Sample')
@@ -23,13 +23,11 @@ def boxplot_trait_scores(df):
def histplot_messages(df):
"""
Plot a histogram of the message/chat distribution per length and count.
"""
sns.set_theme(style='darkgrid')
sns.set_palette('rocket_r')
ax = sns.histplot(data=df, kde=True)
#ax.set(xlabel='Personality Traits', ylabel='Expression Values')
#ax.set_title('Big Five Trait Expressions in Participant Sample')
plt.show()
@@ -43,3 +41,13 @@ def scatterplot_interaction(df):
ax.set(xlabel='Scores for Trait Extraversion', ylabel='Length of Chat Messages')
ax.set_title('Interaction between Extraversion Score and Message Length')
plt.show()
def boxplot_lsm_scores(df):
"""Plot a boxplot of LSM scores per function word category across chats."""
sns.set_theme(style='whitegrid')
sns.set_palette('magma_r', n_colors=9)  # rocket
ax = sns.boxplot(data=df.drop(columns=['chat_id', 'posemo', 'negemo']))
# showmeans=True, meanprops={'markerfacecolor':'black', 'markeredgecolor':'black'})
ax.set(xlabel='Categories of Function Words', ylabel='LSM Score')
ax.set_title('Degree of Linguistic Style Matching across Chats per Category')
plt.show()
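# Hedged usage sketch: df is assumed to be a frame of per-chat LSM scores with a
# 'chat_id' column plus one column per LIWC category; the emotion categories
# ('posemo', 'negemo') are dropped above so that only the (assumed nine)
# function word categories are plotted.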