Commit 0fa8ae35 authored by Jana Germies's avatar Jana Germies
Browse files

create training sequences and tensor dataset

parent 448978df
"""Here, methods to assess Linguistic Style Matching in the chat dialogues are provided."""
import glob
import os
import pandas as pd
import numpy as np
from scipy import stats
from visualization.visualizations import boxplot_lsm_scores
def calculate_lsm(directory):
    """Calculate Linguistic Style Matching (LSM) scores for each chat.

    Calculations follow Gonzales et al. 2010, e.g. for personal pronouns:
    ppLSM = 1 - (|pp1 - pp2| / (pp1 + pp2))

    :param directory: folder of per-chat LIWC result CSVs (';'-separated,
                      decimal commas); must end with a path separator
    :return: (scores_df, score_summary) — one row per chat, plus the
             ``DataFrame.describe()`` summary of those scores
    """
    # LIWC categories entering the LSM calculation
    cols = ['auxverb', 'article', 'adverb', 'ppron', 'ipron', 'prep', 'negate', 'conj', 'posemo', 'negemo']
    rows = []
    for file_path in glob.iglob(directory + '*.csv'):
        chat_id = os.path.basename(file_path)[:-4]  # strip '.csv'
        df = pd.read_csv(file_path, sep=";")
        df = df[cols]
        # convert decimal-comma strings to floats
        # Note: issue with converting non-float numbers (e.g. WC 45 -> NaN)
        df = df.apply(lambda x: x.str.replace(',', '.').astype(float), axis=1)
        # exactly two speakers per chat are assumed: rows 0 and 1
        diff_scores = (df.loc[0, :] - df.loc[1, :]).abs()
        sum_scores = df.sum(axis=0)
        lsm_score = 1.0 - (diff_scores / sum_scores)
        # NaNs result from 0/0 division: both speakers used the category
        # equally (not at all), so this counts as a perfect match -> 1
        # Note: this decision did not make much of difference in the end results
        lsm_score = lsm_score.fillna(1)
        # average over the function-word categories only (excludes the
        # last two columns, posemo/negemo)
        lsm_score['average'] = np.mean(lsm_score[:-2])
        lsm_score['chat_id'] = chat_id
        rows.append(lsm_score)
    # DataFrame.append was removed in pandas 2.0 — assemble via concat
    scores_df = pd.concat(rows, axis=1).T.reset_index(drop=True)
    # re-order: chat id first, overall average second
    names = scores_df.pop('chat_id')
    avgs = scores_df.pop('average')
    scores_df.insert(0, 'chat_id', names)
    scores_df.insert(1, 'overall', avgs)
    score_summary = scores_df.describe()
    return scores_df, score_summary
def trait_by_chat(directory):
    """Map every chat id to a personality label.

    A chat is labelled 'introvert' when at least one introverted speaker
    took part in it, otherwise 'extrovert'.
    (Chats were either mixed or contained only extroverts.)
    """
    df = pd.read_csv(directory, sep=";")
    chats = pd.unique(df['chat_id'])
    # chat ids in which an introverted speaker appears
    with_introverts = set(
        df.loc[df['extraversion_pole'].str.contains('introvert'), 'chat_id'].unique()
    )
    return {chat: ('introvert' if chat in with_introverts else 'extrovert')
            for chat in chats}
def match_trait_to_scores(df_lsm, dict_traits):
    """Annotate LSM scores with trait labels and average them per trait.

    :param df_lsm: per-chat LSM scores with a 'chat_id' column
                   (mutated in place: a 'trait' column is added)
    :param dict_traits: mapping chat_id -> trait label (see trait_by_chat)
    :return: DataFrame of mean numeric scores per trait label
    """
    df_lsm['trait'] = df_lsm['chat_id'].map(dict_traits)
    # numeric_only=True: non-numeric columns such as 'chat_id' cannot be
    # averaged; pandas >= 2.0 raises instead of silently dropping them
    score_by_trait = df_lsm.groupby('trait').mean(numeric_only=True)
    # Note: no (sig) difference between groups, BUT may differ with actual trait scores
    # maybe average the scores of both speakers?
    return score_by_trait
def compare_scores(df):
    """Compare LSM scores for chats with and without introverts present.

    :param df: mean scores per trait, indexed by 'introvert'/'extrovert'
               (e.g. the output of match_trait_to_scores)
    :return: scipy independent-samples t-test result (statistic, pvalue)
    """
    # exclude the aggregate column so only per-category scores are compared;
    # calculate_lsm names it 'overall' (dropping 'average' raised KeyError),
    # older dumps may still carry 'average'
    df = df.drop(columns=['average', 'overall'], errors='ignore')
    mixed_chats = df.loc['introvert', :].values.tolist()
    extro_chats = df.loc['extrovert', :].values.tolist()
    t_sig = stats.ttest_ind(extro_chats, mixed_chats)
    return t_sig
if __name__ == '__main__':
    # compute LSM scores from the LIWC result files and persist them
    scores, overview = calculate_lsm('/Users/Jana1/Desktop/MA/ttaspy/outputs/LIWC_results/')
    scores.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/liwc_scores.csv', sep=';', index=False)
    overview.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/liwc_overall_stats.csv', sep=';')
    print(scores)
    boxplot_lsm_scores(scores)
    # reload the persisted scores and annotate them with trait labels
    scores = pd.read_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/liwc_scores.csv', sep=";")
    data = pd.read_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/ttas_clean_chats.csv', sep=";")
    trait_dict = trait_by_chat('/Users/Jana1/Desktop/MA/ttaspy/outputs/ttas_clean_chats.csv')
    matched = match_trait_to_scores(scores, trait_dict)
    print(matched)
    # test whether the two trait groups differ significantly
    sig = compare_scores(matched)
    print(sig)
"""
Here, methods to read, format and evaluate answers to the personality questionnaire are provided.
Scores per user and effects across the whole sample are calculated.
"""
import pandas as pd
from visualization.visualizations import boxplot_trait_scores, scatterplot_interaction
from process_chats import read_chat_data, filter_chats, get_summary, get_n_count, summarize_chats
# basic processing of questionnaire
def read_personality_data(path):
    """Load the raw questionnaire answers and pivot to one row per user.

    The CSV has no header; its columns are user id, question number and
    the given answer.
    """
    header = ["user_id", "question", "answer"]
    raw = pd.read_csv(path, header=None, names=header)
    # one row per user, one column per question
    return raw.pivot_table(index="user_id", columns="question", values="answer")
def remove_fake_profiles(df_traits):
    """Drop rows whose answers are all identical (test/fake accounts).

    App-testing accounts answered every question with the same value, so
    any row with zero variation across its columns is removed.
    """
    # a row is fake when every cell equals the row's first cell
    is_fake = df_traits.eq(df_traits.iloc[:, 0], axis=0).all(axis=1)
    return df_traits[~is_fake]
def recode_answers(df_traits):
    """Re-code negatively poled BFI-S questions onto the positive scale.

    Answers 1..7 of the reversed items are mirrored to 7..1.
    Note: the given frame is mutated and returned.
    """
    reversed_items = [3, 6, 8, 15]
    # 1 -> 7, 2 -> 6, ... 7 -> 1
    mirror = {old: 8 - old for old in range(1, 8)}
    for item in reversed_items:
        df_traits[item].update(df_traits[item].replace(mirror))
    return df_traits
def calculate_scores_per_user(df_traits):
    """Average the three BFI-S items of each Big Five trait per user."""
    # trait -> question columns (column indices) belonging to it
    trait_items = {
        'openness': [4, 10, 14],
        'conscientiousness': [1, 8, 12],
        'extraversion': [2, 6, 9],
        'agreeableness': [3, 7, 13],
        'neuroticism': [5, 11, 15],
    }
    personality_df = pd.DataFrame()
    for trait, items in trait_items.items():
        personality_df[trait] = df_traits[items].mean(axis=1)
    return personality_df
# collate questionnaire and chat data
def remove_superfluous_users(uniq_users, df_traits):
    """Drop questionnaire rows of users who never participated in a chat."""
    known = set(uniq_users)
    # report who filled in the questionnaire but never chatted
    missing = [name for name in df_traits['user_id'] if name not in known]
    print('users who did not participate in any of the chats:', missing)
    kept = df_traits[df_traits['user_id'].isin(uniq_users)]
    return kept.reset_index(drop=True)
def map_extraversion_poles(df_traits):
    """Label users 'extrovert'/'introvert' from their extraversion score.

    Scores at or above 3.5 (the scale midpoint) map to 'extrovert', lower
    scores to 'introvert'. The frame gains a 'poles' column; a
    user_id -> pole dict is returned.
    """
    # boolean midpoint split, translated into the two labels
    above_midpoint = df_traits['extraversion'] >= 3.5
    df_traits['poles'] = above_midpoint.replace([True, False], ['extrovert', 'introvert'])
    trait_cols = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
    expression_df = df_traits.drop(trait_cols, axis=1)
    # user_id -> pole lookup
    expression_dict = pd.Series(expression_df.poles.values, index=expression_df.user_id).to_dict()
    print("Users and their respective extraversion poles:", expression_dict)
    return expression_dict
def get_interaction_message_lengths_scores(df_chats, df_traits, msg_lens):
    """Join message lengths with the authors' extraversion scores.

    Adds an 'extraversion_scores' column to df_chats (in place) and
    returns a frame of user id, message length and score for the
    interaction analysis.
    """
    # user_id -> extraversion score lookup built from the trait frame
    scores = df_traits.drop(['openness', 'conscientiousness', 'agreeableness', 'neuroticism'], axis=1)
    score_dict = pd.Series(scores.extraversion.values, index=scores.user_id).to_dict()
    df_chats['extraversion_scores'] = df_chats['user_id'].map(score_dict)
    # one row per message: author, length, author's extraversion score
    return pd.concat([df_chats['user_id'], msg_lens, df_chats['extraversion_scores']], axis=1)
if __name__ == '__main__':
    # input/output locations
    personality_path_in = '/Users/Jana1/Desktop/MA/ttas/ttas-user-answers.csv'
    personality_path_out = '/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv'
    chat_path_in = '/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv'
    chat_path_out = '/Users/Jana1/Desktop/MA/ttas/ttas-filtered-chats.csv'
    # load raw questionnaire answers and chat logs
    trait_data = read_personality_data(personality_path_in)
    chat_data = read_chat_data(chat_path_in)
    filtered_chats = filter_chats(chat_data)
    # questionnaire pipeline: clean, recode, score
    clean_trait_df = remove_fake_profiles(trait_data)
    recoded_trait_df = recode_answers(clean_trait_df)
    score_df = calculate_scores_per_user(recoded_trait_df)
    # bug fix: the pivot keeps user_id in the index, but
    # remove_superfluous_users reads a 'user_id' COLUMN — expose it first
    score_df = score_df.reset_index()
    # compare with chat data and keep only users that actually chatted
    unique_users, unique_chats, n_users, n_chats = get_n_count(filtered_chats)
    filtered_scores = remove_superfluous_users(unique_users, score_df)
    # sample statistics
    mean_scores = get_summary(filtered_scores.drop('user_id', axis=1))
    # save results
    # filtered_scores.to_csv(personality_path_out)
    # visualize results
    boxplot_trait_scores(filtered_scores)
    # interaction between message lengths and extraversion trait scores
    message_lens, summary_chats, summary_messages = summarize_chats(filtered_chats)
    interaction = get_interaction_message_lengths_scores(filtered_chats, filtered_scores, message_lens)
    scatterplot_interaction(interaction)
    # annotate the chat data with extraversion pole expressions
    extraversion_poles_dict = map_extraversion_poles(filtered_scores)
    # TODO: move into map function?
    filtered_chats['extraversion_pole'] = filtered_chats['user_id'].map(extraversion_poles_dict)
    filtered_chats = filtered_chats.drop('index', axis=1)
    # save results
    # filtered_chats.to_csv(chat_path_out)
"""Apply methods to clean data"""
from process_chats import read_chat_data, filter_chats, get_n_count, get_summary, summarize_chats, clean_messages, \
sort_chat_messages, concat_and_save_message_strings
from calculate_personality_scores import read_personality_data, remove_fake_profiles, recode_answers, \
calculate_scores_per_user, map_extraversion_poles, remove_superfluous_users, get_interaction_message_lengths_scores
from visualization.visualizations import boxplot_trait_scores, histplot_messages, scatterplot_interaction
if __name__ == '__main__':
    # input/output locations
    chats_input_path = '/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv'
    traits_input_path = '/Users/Jana1/Desktop/MA/ttas-py/trait_scores.csv'
    chat_output_path = '/Users/Jana1/Desktop/MA/ttas/ttas-filtered-chats.csv'
    traits_out_path = '/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv'
    # read in raw chat data and drop chats with too few turns
    chat_data = read_chat_data(chats_input_path)
    filtered_chats = filter_chats(chat_data)
    # clean from special symbols
    clean_chats = clean_messages(filtered_chats)
    # count unique users and chats
    unique_users, unique_chats, n_users, n_chats = get_n_count(clean_chats)
    # summarize conversations
    message_lens, chat_summary, summary_messages = summarize_chats(clean_chats)
    # extra step: manual cleaning
    # read in raw questionnaire answers, clean, recode and score them
    personality_answers = read_personality_data(traits_input_path)
    clean_answers = remove_fake_profiles(personality_answers)
    recoded_answers = recode_answers(clean_answers)
    trait_scores = calculate_scores_per_user(recoded_answers)
    # compare with cleaned chat data and remove superfluous profiles
    trait_scores.reset_index(inplace=True)  # TODO: check reset index in original method
    # bug fix: remove_superfluous_users takes the known user ids first —
    # the original call passed only the trait frame
    filtered_scores = remove_superfluous_users(unique_users, trait_scores)
    # evaluate
    mean_scores = get_summary(filtered_scores.drop('user_id', axis=1))  # TODO: check drop in original method
    # map extraversion scores to pole expression labels
    extraversion_dict = map_extraversion_poles(filtered_scores)
    # annotate to cleaned chat data
    clean_chats['extraversion_pole'] = clean_chats['user_id'].map(extraversion_dict)
    # sort chats according to timestamp
    sorted_chats = sort_chat_messages(clean_chats)
    # evaluate interaction between messages and personality scores
    interaction = get_interaction_message_lengths_scores(sorted_chats, filtered_scores, message_lens)
    # save
    sorted_chats.to_csv(chat_output_path, index=False)
    filtered_scores.to_csv(traits_out_path, index=False)
    # visualize
    boxplot_trait_scores(filtered_scores)
    histplot_messages(message_lens)
    scatterplot_interaction(interaction)
    # prepare for LIWC
    #concat_and_save_message_strings(sorted_chats)
"""Here, methods to filter out futile data points and create a statistical overview
of the chat data set are provided."""
import re
import pandas as pd
# from calculate_personality_scores import get_interaction_message_lengths_scores
# from visualization.visualizations import histplot_messages, scatterplot_interaction
# read data
def read_chat_data(path):
    """Read the raw chat export and attach the expected column names."""
    chat_df = pd.read_csv(path, header=None,
                          names=["chat_id", "user_id", "message", "timestamp"])
    print(chat_df.head())
    return chat_df
def filter_chats(df):
    """Keep only chats with an appropriate number of turns (> 3 messages)."""
    min_messages = 3
    # drop whole chats whose message count does not exceed the threshold
    return df.groupby('chat_id').filter(lambda chat: chat['message'].count() > min_messages)
def get_summary(df):
    """Return min, mean and max aggregated over the sample."""
    summary = df.agg(['min', 'mean', 'max'], axis=0)
    print('statistic summary of data:', summary)
    return summary
def summarize_chats(df):
    """Summarize message counts per chat and individual message lengths.

    Returns the per-message word counts plus two min/mean/max summaries
    (messages per chat, words per message).
    """
    # number of messages per chat
    per_chat_counts = df.groupby('chat_id')['message'].count()
    summary_chats = get_summary(per_chat_counts)
    # words per message (whitespace tokenization)
    msg_lens = df['message'].str.split().str.len()
    summary_msgs = get_summary(msg_lens)
    return msg_lens, summary_chats, summary_msgs
def get_n_count(df):
    """Count unique users and chats.

    Note: There was one test case left in the chat data.
    """
    uniq_users = df.user_id.unique()
    uniq_chats = df.chat_id.unique()
    # counts (nunique ignores NaN, matching pandas' default semantics)
    n_users = df.user_id.nunique()
    n_chats = df.chat_id.nunique()
    print('number of unique users: %i and number of unique chats: %i' % (n_users, n_chats))
    return uniq_users, uniq_chats, n_users, n_chats
########
def clean_messages(df):
    """Strip emoticons and special characters from the message column.

    Emoticons are replaced with a full stop; remaining special characters
    are deleted. The frame is modified in place and returned.
    """
    emoticons = re.compile('(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)')
    special_chars = re.compile('[$&+:;=|"@#<>^*()%/_-]')
    # emoticons first (they contain special chars), then leftover symbols
    df['message'] = df['message'].apply(lambda msg: special_chars.sub(r'', emoticons.sub(r'.', msg)))
    return df
def sort_chat_messages(df):
    """Order messages chronologically within each chat."""
    # parse the timestamp strings so they sort chronologically
    df['timestamp'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S.%f")
    # first order the chats themselves ...
    df = df.sort_values(['chat_id'], ascending=True)
    grouped = df.groupby(['chat_id'], sort=False)
    # ... then each chat's messages by time
    ordered = grouped.apply(lambda chat: chat.sort_values(['timestamp'], ascending=True))
    return ordered.reset_index(drop=True)
def concat_and_save_message_strings(df):
    """Write one CSV per chat with each user's messages joined into one string."""
    out_template = '/Users/Jana1/Desktop/MA/ttaspy/outputs/chats/{}.csv'
    for chat_id, chat in df.groupby('chat_id'):
        chat = chat.sort_values(['user_id'])
        # one concatenated message string per user
        joined = chat.groupby(['user_id'])['message'].apply(' '.join).reset_index()
        joined.to_csv(out_template.format(chat_id), sep=';', index=False)
if __name__ == '__main__':
    # read and filter the chat data
    chat_data = read_chat_data('outputs/ttas-filtered-chats.csv')
    filtered_chats = filter_chats(chat_data)
    print(filtered_chats.shape)
    # count unique users and chats
    unique_users, unique_chats, number_users, number_chats = get_n_count(filtered_chats)
    # messages per chat and message lengths
    message_lens, chat_summary, summary_messages = summarize_chats(filtered_chats)
    # personality data
    trait_scores = pd.read_csv('/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv')
from pprint import pformat
import pandas as pd
import numpy as np
from itertools import chain
from collections import defaultdict
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, Dataset, RandomSampler, SequentialSampler, random_split
from torch.utils.data.distributed import DistributedSampler
from sklearn.model_selection import train_test_split
from transformers import (
MODEL_WITH_LM_HEAD_MAPPING,
WEIGHTS_NAME, AdamW, AutoConfig, AutoModelWithLMHead, AutoTokenizer,
PreTrainedModel, PreTrainedTokenizer,
get_linear_schedule_with_warmup,
Trainer, TrainingArguments
)
# define special tokens
# sequence delimiters, speaker tags, trait tags and the padding token
SPECIAL_TOKENS = ['<bos>', '<eos>', '<speaker1>', '<speaker2>', '<introvert>', '<extrovert>', '<pad>']
# keyword mapping passed to tokenizer.add_special_tokens (see add_special_token)
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<speaker1>', '<speaker2>', '<introvert>', '<extrovert>']}
# keys of one model input instance (see build_inputs)
MODEL_INPUTS = ['input_ids', 'mc_token_ids', 'lm_labels', 'mc_labels', 'token_type_ids']
# the inputs that get padded to the longest sequence (see pad_dataset)
PADDED_INPUTS = ['input_ids', 'lm_labels', 'token_type_ids']
# TODO: check Trainer for modeling
def tokenize_dataset(df, tokenizer):
    """Tokenize and encode the message/distractor/context columns in place.

    Note: the dbmdz pre-trained tokenizer cannot be applied to batches of
    sentences, so every cell is processed individually:
    tokenize() splits a string into words and punctuation marks;
    convert_tokens_to_ids() maps those tokens to vocabulary indices.
    """
    print('Tokenizing messages ...')
    text_columns = ['message', 'distractor_1', 'distractor_2', 'context_0', 'context_1', 'context_2']
    for column in text_columns:
        df[column] = (df[column]
                      .apply(tokenizer.tokenize)
                      .apply(tokenizer.convert_tokens_to_ids))
    return df
def split_dataframe(df):
    """Assemble candidate/context lists and split into train and test sets.

    Candidates are the two distractors followed by the ground-truth
    response (last element); contexts are the three previous turns,
    oldest first, with empty messages filtered out.
    NOTE(review): the original notes say token id 255 is the empty
    string, but the code filters [225] — confirm which id is correct.
    Note: test_size is set to 0.15 since the dataset is quite small.
    """
    # TODO: test candidates is only ground truth response
    print('Splitting dataset ...')
    prepared = pd.DataFrame()
    prepared['trait'] = df['extraversion_pole']
    prepared['candidates'] = df.apply(
        lambda row: [row['distractor_1']] + [row['distractor_2']] + [row['message']], axis=1)
    prepared['context'] = df.apply(
        lambda row: [row['context_2']] + [row['context_1']] + [row['context_0']], axis=1)
    # drop empty messages from the context lists
    prepared['context'] = [[msg for msg in history if msg != [225]] for history in prepared['context']]
    # stratify on the trait label so both splits keep the pole distribution
    train, test = train_test_split(prepared, test_size=0.15, random_state=0, stratify=prepared[['trait']])
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    print("Train and test samples:", train.shape, test.shape)
    print(train)
    return train, test
def pad_dataset(dataset, padding=0):
    """Pad every PADDED_INPUTS sequence to the longest input_ids length.

    lm_labels are padded with -100 (ignored by the LM loss); the other
    inputs use the given padding id.
    """
    print('Padding inputs ...')
    target_len = max(len(seq) for seq in dataset['input_ids'])
    # max_l = 443 -> history + response sequence
    for name in PADDED_INPUTS:
        fill = -100 if name == 'lm_labels' else padding
        dataset[name] = [seq + [fill] * (target_len - len(seq)) for seq in dataset[name]]
    return dataset
def add_special_token(model, tokenizer):
    """Register the special tokens and grow the model embeddings to match.

    The embedding matrix is only resized when the tokenizer actually
    added new tokens (i.e. they were not already in the pretrained vocab).
    """
    base_vocab_size = tokenizer.vocab_size
    num_added = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    if num_added > 0:
        model.resize_token_embeddings(new_num_tokens=base_vocab_size + num_added)
def build_inputs(tokenizer, trait, history, response, lm_labels=False, with_eos=True):
    """Build one model input instance from trait, history and response.

    :param tokenizer: tokenizer used to translate the special-token symbols
    :param trait: 'introvert' or 'extrovert' (selects the trait token)
    :param history: previous utterances, a list of lists of token ids
    :param response: reply, a list of token ids
    :param lm_labels: when True, the response tokens become LM targets
                      (everything else stays masked with -100)
    :param with_eos: append the <eos> token to the response
    :return: dict with input_ids, token_type_ids, mc_token_ids, lm_labels
    :raises ValueError: if trait is not a known pole label
    """
    # convert special token symbols to token ids (all but <pad>)
    bos, eos, speaker1, speaker2, introvert, extrovert = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    # pick the trait token; previously an unknown trait crashed later with
    # an unbound local ('pole') -- fail fast with a clear message instead
    if trait == 'introvert':
        pole = introvert
    elif trait == 'extrovert':
        pole = extrovert
    else:
        raise ValueError("trait must be 'introvert' or 'extrovert', got %r" % (trait,))
    # sequence: [<bos> <pole>] + history turns + response (+ <eos>)
    sequence = [[bos] + [pole]] + history + [response + ([eos] if with_eos else [])]
    # prefix every turn with the alternating speaker token; the response
    # (last element) always gets speaker2
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1]
                                + s for i, s in enumerate(sequence[1:])]
    instance = dict()
    instance['input_ids'] = list(chain(*sequence))  # flattened token ids
    # segment ids: which speaker each token belongs to
    instance['token_type_ids'] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    # index of the last token, used for the multiple-choice head
    instance['mc_token_ids'] = len(instance['input_ids']) - 1
    # LM targets: all masked by default ...
    instance['lm_labels'] = [-100] * len(instance['input_ids'])
    if lm_labels:
        # ... only the response tokens (minus its speaker prefix) are predicted
        instance['lm_labels'] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
    return instance
def build_dataset(df, train_set=True, distributed=False):
"""
Input: dataframe / dict
Returns Tensor Dataset from dict
Note: Distributed Training is only supported on Linux and Windows
For support on Mac library needs to be compiled from source
"""
print('Building dataset')
#dataset = {'train': defaultdict(list), 'test': defaultdict(list)}
dataset = defaultdict(list)