"""Methods to assess Linguistic Style Matching (LSM) in the chat dialogues."""

import glob
import os
import pandas as pd
import numpy as np
from scipy import stats
from visualization.visualizations import boxplot_lsm_scores


def calculate_lsm(directory):
    """Calculate Linguistic Style Matching (LSM) scores for each chat.

    Calculations are based on Gonzales et al. 2010. For every category the
    score is computed as, e.g. for personal pronouns,
    ppLSM = 1 - (|pp1 - pp2| / (pp1 + pp2)).

    Args:
        directory: folder containing one semicolon-separated LIWC result file
            (``*.csv``) per chat; a trailing path separator is optional.
            Rows 0 and 1 of each file supply the two values that are compared
            (presumably one row per speaker). The file name without its
            extension is used as the chat id.

    Returns:
        A tuple ``(scores_df, score_summary)``: ``scores_df`` has one row per
        chat with the columns 'chat_id', 'overall' and one score per category;
        ``score_summary`` is ``scores_df.describe()``.

    Raises:
        FileNotFoundError: if ``directory`` contains no ``*.csv`` file.
    """
    # columns of interest for calculation
    cols = ['auxverb', 'article', 'adverb', 'ppron', 'ipron', 'prep', 'negate', 'conj', 'posemo', 'negemo']
    # the overall score averages every category except the last two (posemo, negemo)
    averaged_cols = cols[:-2]
    rows = []
    # sorted: glob returns files in arbitrary order; sorting keeps the output reproducible
    for file_path in sorted(glob.glob(os.path.join(directory, '*.csv'))):
        chat_id = os.path.splitext(os.path.basename(file_path))[0]
        # decimal=',' converts decimal-comma numbers while reading, so cells
        # without a decimal part (e.g. "45") stay valid numbers instead of
        # becoming NaN in a later string replacement
        df = pd.read_csv(file_path, sep=";", decimal=",")
        df = df[cols].astype(float)
        first, second = df.loc[0, :], df.loc[1, :]
        # the denominator uses the same two rows as the numerator
        lsm_score = 1.0 - (first - second).abs() / (first + second)
        # fill NaNs with score 1: NaNs result from 0 / 0 division
        # since the scores are basically a match, they are assigned a score of 1
        # Note: this decision did not make much of difference in the end results
        lsm_score = lsm_score.fillna(1)
        row = {'chat_id': chat_id, 'overall': lsm_score[averaged_cols].mean()}
        row.update(lsm_score.to_dict())
        rows.append(row)
    if not rows:
        raise FileNotFoundError(f"no .csv files found in {directory!r}")
    # build the frame once; the column list fixes the order chat_id, overall, categories
    scores_df = pd.DataFrame(rows, columns=['chat_id', 'overall'] + cols)
    score_summary = scores_df.describe()
    return scores_df, score_summary


def trait_by_chat(directory):
    """Match chat ids to a personality trait.

    Introvert means at least one introverted speaker participated in the chat.
    Note: chats were either mixed or held between extroverts only.

    Args:
        directory: path of a semicolon-separated CSV file (despite the name,
            a file and not a folder) with the columns 'chat_id' and
            'extraversion_pole'.

    Returns:
        Dict mapping every chat id to 'introvert' or 'extrovert'.
    """
    df = pd.read_csv(directory, sep=";")
    # na=False: rows without a pole label count as "not introvert" instead of
    # putting NaN into the boolean mask, which pandas refuses to index with
    is_introvert = df['extraversion_pole'].str.contains('introvert', na=False)
    # set for constant-time membership tests below
    introvert_chats = set(df.loc[is_introvert, 'chat_id'].dropna())
    return {
        chat: 'introvert' if chat in introvert_chats else 'extrovert'
        for chat in pd.unique(df['chat_id'])
    }


def match_trait_to_scores(df_lsm, dict_traits):
    """Average the LSM scores per personality trait.

    Args:
        df_lsm: DataFrame with a 'chat_id' column and numeric score columns.
            Note: a 'trait' column is added to this frame in place.
        dict_traits: dict mapping chat ids to a trait label.

    Returns:
        DataFrame indexed by trait holding the mean of every numeric column.
    """
    df_lsm['trait'] = df_lsm['chat_id'].map(dict_traits)
    # numeric_only=True: the string column 'chat_id' cannot be averaged and
    # makes mean() raise on recent pandas versions
    score_by_trait = df_lsm.groupby('trait').mean(numeric_only=True)
    # Note: no (sig) difference between groups, BUT may differ with actual trait scores
    # maybe average the scores of both speakers?
    return score_by_trait


def compare_scores(df):
    """Compare lsm scores for chats with and without introverts present.

    Args:
        df: DataFrame indexed by trait with the rows 'introvert' and
            'extrovert' and one column per score.

    Returns:
        Result of scipy.stats.ttest_ind(extrovert scores, introvert scores),
        computed over the per-category scores of the two rows.
    """
    # calculate_lsm returns the averaged score under the name 'overall';
    # 'average' is accepted as well. Whichever is present is excluded so that
    # only the per-category scores enter the test.
    per_category = df.drop(columns=['average', 'overall'], errors='ignore')
    mixed_chats = per_category.loc['introvert', :].values.tolist()
    extro_chats = per_category.loc['extrovert', :].values.tolist()
    return stats.ttest_ind(extro_chats, mixed_chats)


if __name__ == '__main__':
    # compute the LSM scores per chat and store them with their summary statistics
    scores, overview = calculate_lsm('../outputs/LIWC_results/')
    scores.to_csv('../outputs/liwc_scores.csv', sep=';', index=False)
    overview.to_csv('../outputs/liwc_overall_stats.csv', sep=';')
    print(scores)
    boxplot_lsm_scores(scores)

    # alternatively, load previously stored scores:
    # scores = pd.read_csv('outputs/liwc_scores.csv', sep=";")
    # relate the scores to the personality trait of the participants
    trait_dict = trait_by_chat('../outputs/ttas_clean_chats.csv')
    matched = match_trait_to_scores(scores, trait_dict)
    print(matched)
    sig = compare_scores(matched)
    print(sig)