Commit 7506470c authored by Jana Germies's avatar Jana Germies
Browse files

minor reformatting

parent f18f44c7
......@@ -9,6 +9,7 @@ ttas-data
......@@ -27,4 +28,6 @@ outputs/test
\ No newline at end of file
\ No newline at end of file
......@@ -8,7 +8,7 @@ from scipy import stats
from visualization.visualizations import boxplot_lsm_scores
def calculate_lsm(directory):
def calculate_lsm(path):
"""Calculate scores for Linguistic Style Matching for each chat.
Calculations are based on Gonzales et al. 2010
e.g. ppLSM = 1(|pp1 - pp2|/(pp1 + pp2))"""
......@@ -16,7 +16,7 @@ def calculate_lsm(directory):
cols = ['auxverb', 'article', 'adverb', 'ppron', 'ipron', 'prep', 'negate', 'conj', 'posemo', 'negemo']
scores_df = pd.DataFrame()
# iterate over files
for file_path in glob.iglob(directory+'*.csv'):
for file_path in glob.iglob(path + '*.csv'):
chat_id = os.path.basename(file_path)[:-4]
df = pd.read_csv(file_path, sep=";")
df = df[cols]
......@@ -42,11 +42,11 @@ def calculate_lsm(directory):
return scores_df, score_summary
def trait_by_chat(directory):
def trait_by_chat(path):
"""Match chat ids to a personality trait.
Introvert means at least one introverted speaker participated in the chat.
Note: chats were either mixed or had between only extroverts."""
df = pd.read_csv(directory, sep=";")
df = pd.read_csv(path, sep=";")
chats = pd.unique(df['chat_id'])
introvert_chats = [group for group, df in df[df['extraversion_pole'].str.contains('introvert')].groupby('chat_id')]
personality_dict = {}
......@@ -63,7 +63,7 @@ def match_trait_to_scores(df_lsm, dict_traits):
df_lsm['trait'] = df_lsm['chat_id'].map(dict_traits)
score_by_trait = df_lsm.groupby('trait').mean()
# Note: no (sig) difference between groups, BUT may differ with actual trait scores
# maybe average the scores of both speakers?
# to average the scores of both speakers:
# mixed_chats = score_by_trait.loc['introvert', :].tolist()
# extro_chats = score_by_trait.loc['extrovert', :].tolist()
return score_by_trait
......@@ -71,25 +71,40 @@ def match_trait_to_scores(df_lsm, dict_traits):
def compare_scores(df):
"""Compare lsm scores for chats with and without introverts present"""
# NOTE(review): the two consecutive drops below look like an old/new pair from a
# diff ('average' renamed to 'overall'); in a live file only one of the two
# columns would exist and the second drop would raise a KeyError — confirm
# which column name the current scores DataFrame actually uses.
df = df.drop('average', axis=1)
df = df.drop('overall', axis=1)
# rows are indexed by trait label; 'introvert' marks chats with at least one
# introverted participant (mixed), 'extrovert' marks extrovert-only chats
mixed_chats = df.loc['introvert', :].values.tolist()
extro_chats = df.loc['extrovert', :].values.tolist()
# independent two-sample t-test across the per-category LSM scores
t_sig = stats.ttest_ind(extro_chats, mixed_chats)
print('\nINFO: Results t-test for chats between only extroverts compared to mixed personality group chats:\n', t_sig)
# returns the scipy Ttest result (statistic, pvalue)
return t_sig
if __name__ == '__main__':
scores, overview = calculate_lsm('../outputs/LIWC_results/')
scores.to_csv('../outputs/liwc_scores.csv', sep=';', index=False)
overview.to_csv('../outputs/liwc_overall_stats.csv', sep=';')
scores, overview = calculate_lsm('../outputs/liwc/')
#scores = pd.read_csv('outputs/liwc_scores.csv', sep=";")
data = pd.read_csv('../outputs/ttas_clean_chats.csv', sep=";")
trait_dict = trait_by_chat('../outputs/ttas_clean_chats.csv')
trait_dict = trait_by_chat('../outputs2/ttas-annotated-chats.csv')
matched = match_trait_to_scores(scores, trait_dict)
sig = compare_scores(matched)
print('\nINFO: Results for LSM scores overall and for each factor gouped by personality trait:\n', matched)
ttest = compare_scores(matched)
scores.to_csv('../outputs2/liwc-scores.csv', sep=';', index=False)
overview.to_csv('../outputs2/liwc-overall-stats.csv', sep=';')
*** Congratulations ***
Basic cleaning and analysis of the data are done.
Please continue with preparing the cleaned chats for
the modeling pipeline. To do so, please switch to the
modeling directory and execute file:
\ No newline at end of file
......@@ -4,7 +4,7 @@ Scores per user and effects across the whole sample are calculated.
import pandas as pd
from visualization.visualizations import boxplot_trait_scores, scatterplot_interaction
from process_chats import read_chat_data, filter_chats, get_summary, get_n_count, summarize_chats
from process_chats import get_summary, get_n_count, summarize_chats, sort_chat_messages, concat_and_save_message_strings
# basic processing of questionnaire
......@@ -128,32 +128,70 @@ def get_interaction_message_lengths_scores(df_chats, df_traits, msg_lens):
if __name__ == '__main__':
# paths
personality_path_in = '../ttas-data/ttas-user-answers.csv'
personality_path_out = '../outputs/filtered_personality_scores.csv'
chat_path_in = '../ttas-data/ttas-complete-chats.csv'
chat_path_out = '../outputs/ttas-filtered-chats.csv'
personality_path_out = '../outputs2/filtered_personality_scores.csv'
chat_path_in = '../outputs2/ttas-clean-chats.csv'
chat_path_out = '../outputs2/ttas-annotated-chats.csv'
# read
trait_data = read_personality_data(personality_path_in)
chat_data = read_chat_data(chat_path_in)
# filter chats
filtered_chats = filter_chats(chat_data)
# process trait scores
clean_trait_df = remove_fake_profiles(trait_data)
recoded_trait_df = recode_answers(clean_trait_df)
score_df = calculate_scores_per_user(recoded_trait_df)
# compare with chat data
unique_users, unique_chats, n_users, n_chats = get_n_count(filtered_chats)
filtered_scores = remove_superfluous_users(unique_users, score_df)
# calculate stats
chat_data = pd.read_csv(chat_path_in, sep=';') # header=None, names=columns)
# remove test profiles
clean_answers = remove_fake_profiles(trait_data)
# recode answers for calculation
recoded_answers = recode_answers(clean_answers)
# calculate scores
trait_scores = calculate_scores_per_user(recoded_answers)
# compare with cleaned chat data and remove superfluous profiles
unique_users, unique_chats, n_users, n_chats = get_n_count(chat_data)
filtered_scores = remove_superfluous_users(unique_users, trait_scores)
# get personality score summary
mean_scores = get_summary(filtered_scores.drop('user_id', axis=1))
# map extraversion scores to pole expression labels
extraversion_dict = map_extraversion_poles(filtered_scores)
# annotate the cleaned chat data with the personality poles
chat_data['extraversion_pole'] = chat_data['user_id'].map(extraversion_dict)
# sort chats according to timestamp
sorted_chats = sort_chat_messages(chat_data)
# prepare for LIWC
# prepare for LIWC
# save results
sorted_chats.to_csv(chat_path_out, index=False, sep=';')
filtered_scores.to_csv(personality_path_out, index=False, sep=';')
# interaction between message lengths and extraversion trait scores
message_lens, summary_chats, summary_messages = summarize_chats(sorted_chats)
interaction = get_interaction_message_lengths_scores(sorted_chats, filtered_scores, message_lens)
# visualize results
# interaction between message lengths and extraversion trait scores
message_lens, summary_chats, summary_messages = summarize_chats(filtered_chats)
interaction = get_interaction_message_lengths_scores(filtered_chats, filtered_scores, message_lens)
# add extraversion pole expressions to chat data
extraversion_poles_dict = map_extraversion_poles(filtered_scores)
filtered_chats['extraversion_pole'] = filtered_chats['user_id'].map(extraversion_poles_dict)
filtered_chats = filtered_chats.drop('index', axis=1)
# save results
*** Manual step required ***
To calculate Linguistic Style Matching scores, please
refer to the Linguistic Inquiry and Word Count (LIWC) tool.
Files for all individual chats have been created at
outputs/chats/ and are ready to be analyzed using the
tool. The respective software can be found at:
Fees may apply.
After analyzing the individual chats with the tool, save
results and continue with calculating the overall scores.
To do so, execute file:
......@@ -4,8 +4,9 @@
import re
import pandas as pd
from visualization.visualizations import histplot_messages
# read data
def read_chat_data(path):
"""Read in the data and add respective column names."""
columns = ["chat_id", "user_id", "message", "timestamp"]
......@@ -29,7 +30,6 @@ def get_summary(df):
"""Calculate the overall average, min and max values across sample."""
# mean_scores= df.mean(axis=0)
summary = df.agg(['min', 'mean', 'max'], axis=0)
print('statistic summary of data:', summary)
return summary
......@@ -41,12 +41,14 @@ def summarize_chats(df):
n_messages_per_chat = grouped['message'].count()
# get min, max and average
summary_chats = get_summary(n_messages_per_chat)
print('\nINFO: statistic summary of messages per chat:\n', summary_chats)
# length of individual messages
messages = df['message']
# split individual messages and count words
msg_lens = messages.str.split().str.len()
# get min, max and average
summary_msgs = get_summary(msg_lens)
print('\nINFO: statistic summary of message lengths:\n', summary_msgs)
return msg_lens, summary_chats, summary_msgs
......@@ -62,7 +64,8 @@ def get_n_count(df):
n_users = df.user_id.nunique()
n_chats = df.chat_id.nunique()
# n_chats = len(unique_chats)
print('number of unique users: %i and number of unique chats: %i' % (n_users, n_chats))
print("""INFO: number of unique users: %i
number of unique chats: %i \n""" % (n_users, n_chats))
return uniq_users, uniq_chats, n_users, n_chats
......@@ -89,32 +92,56 @@ def sort_chat_messages(df):
return df
def concat_and_save_message_strings(df):
def concat_and_save_message_strings(df, path='../outputs2/chats/'):
# group, sort and concat message strings
for group, frame in df.groupby('chat_id'):
frame = frame.sort_values(['user_id'])
# concat messages per user
strings = frame.groupby(['user_id'])['message'].apply(' '.join).reset_index()
strings.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/chats/{}.csv'.format(group), sep=';', index=False)
strings.to_csv(path+'{}.csv'.format(group), sep=';', index=False)
if __name__ == '__main__':
chats_input_path = '../ttas-data/ttas-complete-chats.csv'
chats_output_path = '../outputs2/ttas-clean-chats.csv'
# read data
chat_data = read_chat_data('../outputs/ttas-filtered-chats.csv')
chat_data = read_chat_data(chats_input_path)
print('INFO: Results pre filtering:')
unique_users, unique_chats, number_users, number_chats = get_n_count(chat_data)
# filter data
filtered_chats = filter_chats(chat_data)
clean_chats = clean_messages(filtered_chats)
# get n unique users and chats
unique_users, unique_chats, number_users, number_chats = get_n_count(filtered_chats)
print('INFO: Results post filtering:')
unique_users_clean, unique_chats_clean, number_users_clean, number_chats_clean = get_n_count(clean_chats)
# get n messages per chats, message lengths
message_lens, chat_summary, summary_messages = summarize_chats(filtered_chats)
# personality data
trait_scores = pd.read_csv('../outputs/filtered_personality_scores.csv')
message_lens, chat_summary, summary_messages = summarize_chats(clean_chats)
# save
clean_chats.to_csv(chats_output_path, index=False, sep=';')
# visualize
*** Manual cleaning advised ***
Please manually correct spelling, casing, abbreviations,
foreign language use and superfluous white spaces
to enhance quality of the data.
The corresponding file can be found under:
After cleaning continue with calculating the respective
personality scores. To do so, execute file:
# chat_data = chat_data.drop(columns=chat_data.columns[0], axis=1)
# print(chat_data.head())
# clean = clean_messages(filtered_chats)
# sort = sort_chat_messages(clean)
# sort.to_csv('../outputs/ttas_clean_chats.csv', sep=';', index=False)
# concat_and_save_message_strings(chat_data)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment