Commit f18f44c7 authored by Jana Germies

minor clean up

parent 9fd1b825
@@ -24,3 +24,7 @@ outputs/test
 /outputs/liwc_scores.csv
 /outputs/liwc_overall_stats.csv
 /outputs/big_5_with_means.png
+modeling/runs
+modeling/__pycache__/
\ No newline at end of file
@@ -80,15 +80,15 @@ def compare_scores(df):
 if __name__ == '__main__':
-    scores, overview = calculate_lsm('/Users/Jana1/Desktop/MA/ttaspy/outputs/LIWC_results/')
-    scores.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/liwc_scores.csv', sep=';', index=False)
-    overview.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/liwc_overall_stats.csv', sep=';')
+    scores, overview = calculate_lsm('../outputs/LIWC_results/')
+    scores.to_csv('../outputs/liwc_scores.csv', sep=';', index=False)
+    overview.to_csv('../outputs/liwc_overall_stats.csv', sep=';')
     print(scores)
     boxplot_lsm_scores(scores)
-    scores = pd.read_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/liwc_scores.csv', sep=";")
-    data = pd.read_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/ttas_clean_chats.csv', sep=";")
-    trait_dict = trait_by_chat('/Users/Jana1/Desktop/MA/ttaspy/outputs/ttas_clean_chats.csv')
+    #scores = pd.read_csv('outputs/liwc_scores.csv', sep=";")
+    data = pd.read_csv('../outputs/ttas_clean_chats.csv', sep=";")
+    trait_dict = trait_by_chat('../outputs/ttas_clean_chats.csv')
     matched = match_trait_to_scores(scores, trait_dict)
     print(matched)
     sig = compare_scores(matched)
...
@@ -12,7 +12,7 @@ from process_chats import read_chat_data, filter_chats, get_summary, get_n_count
 def read_personality_data(path):
     """
     Initial minor formatting is done here:
-    A header with column names is added and the table pivotted.
+    A header with column names is added and the table pivoted.
     """
     # set column names
     columns = ["user_id", "question", "answer"]
@@ -127,10 +127,10 @@ def get_interaction_message_lengths_scores(df_chats, df_traits, msg_lens):
 if __name__ == '__main__':
     # paths
-    personality_path_in = '/Users/Jana1/Desktop/MA/ttas/ttas-user-answers.csv'
-    personality_path_out = '/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv'
-    chat_path_in = '/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv'
-    chat_path_out = '/Users/Jana1/Desktop/MA/ttas/ttas-filtered-chats.csv'
+    personality_path_in = '../ttas-data/ttas-user-answers.csv'
+    personality_path_out = '../outputs/filtered_personality_scores.csv'
+    chat_path_in = '../ttas-data/ttas-complete-chats.csv'
+    chat_path_out = '../outputs/ttas-filtered-chats.csv'
     # read
     trait_data = read_personality_data(personality_path_in)
     chat_data = read_chat_data(chat_path_in)
@@ -145,8 +145,6 @@ if __name__ == '__main__':
     filtered_scores = remove_superfluous_users(unique_users, score_df)
     # calculate stats
     mean_scores = get_summary(filtered_scores.drop('user_id', axis=1))
-    # save results
-    # filtered_scores.to_csv(personality_path_out)
     # visualize results
     boxplot_trait_scores(filtered_scores)
     # interaction between message lengths and extraversion trait scores
@@ -155,8 +153,7 @@ if __name__ == '__main__':
     scatterplot_interaction(interaction)
     # add extraversion pole expressions to chat data
     extraversion_poles_dict = map_extraversion_poles(filtered_scores)
-    # TODO: move into map function?
     filtered_chats['extraversion_pole'] = filtered_chats['user_id'].map(extraversion_poles_dict)
     filtered_chats = filtered_chats.drop('index', axis=1)
     # save results
-    # filtered_chats.to_csv(chat_path_out)
+    filtered_chats.to_csv(chat_path_out)
"""Apply methods to clean data""" """Apply methods to clean data"""
from process_chats import (read_chat_data, filter_chats, get_n_count, get_summary, summarize_chats, clean_messages,
from process_chats import read_chat_data, filter_chats, get_n_count, get_summary, summarize_chats, clean_messages, \ sort_chat_messages, concat_and_save_message_strings)
sort_chat_messages, concat_and_save_message_strings from calculate_personality_scores import (read_personality_data, remove_fake_profiles, recode_answers,
from calculate_personality_scores import read_personality_data, remove_fake_profiles, recode_answers, \ calculate_scores_per_user, map_extraversion_poles, remove_superfluous_users, get_interaction_message_lengths_scores)
calculate_scores_per_user, map_extraversion_poles, remove_superfluous_users, get_interaction_message_lengths_scores
from visualization.visualizations import boxplot_trait_scores, histplot_messages, scatterplot_interaction from visualization.visualizations import boxplot_trait_scores, histplot_messages, scatterplot_interaction
if __name__ == '__main__': if __name__ == '__main__':
chats_input_path = '/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv' chats_input_path = '../ttas-data/ttas-complete-chats.csv'
traits_input_path = '/Users/Jana1/Desktop/MA/ttas-py/trait_scores.csv' traits_input_path = '../ttas-data/trait_scores.csv'
chat_output_path = '/Users/Jana1/Desktop/MA/ttas/ttas-filtered-chats.csv' chat_output_path = '../outputs/ttas-filtered-chats.csv'
traits_out_path = '/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv' traits_out_path = '../outputs/filtered_personality_scores.csv'
# read in raw chat data # read in raw chat data
chat_data = read_chat_data(chats_input_path) chat_data = read_chat_data(chats_input_path)
# filter out chats > 4 # filter out chats > 4
@@ -22,7 +21,9 @@ if __name__ == '__main__':
     # summarize conversations
     message_lens, chat_summary, summary_messages = summarize_chats(clean_chats)
+    ### ----- ###
     # extra step: manual cleaning
+    ### ----- ###
     # read in raw questionnaire answers
     personality_answers = read_personality_data(traits_input_path)
@@ -36,7 +37,7 @@ if __name__ == '__main__':
     trait_scores.reset_index(inplace=True) # TODO: check reset index in original method
     filtered_scores = remove_superfluous_users(trait_scores)
     # evaluate
-    mean_scores = get_summary(filtered_scores.drop('user_id', axis=1)) # TODO: check drop in original method
+    mean_scores = get_summary(filtered_scores.drop('user_id', axis=1))
     # map extraversion scores to pole expression labels
     extraversion_dict = map_extraversion_poles(filtered_scores)
@@ -57,6 +58,5 @@ if __name__ == '__main__':
     histplot_messages(message_lens)
     scatterplot_interaction(interaction)
     # prepare for LIWC
-    #concat_and_save_message_strings(sorted_chats)
+    concat_and_save_message_strings(sorted_chats)
@@ -3,8 +3,6 @@
 import re
 import pandas as pd
-# from calculate_personality_scores import get_interaction_message_lengths_scores
-# from visualization.visualizations import histplot_messages, scatterplot_interaction
 # read data
@@ -68,9 +66,6 @@ def get_n_count(df):
     return uniq_users, uniq_chats, n_users, n_chats
-########
 def clean_messages(df):
     """ Clean dataframe from emoticons and other special tokens"""
     emoticons = re.compile('(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)')
@@ -78,7 +73,6 @@ def clean_messages(df):
     # apply regex to df
     df['message'] = df['message'].apply(lambda x: emoticons.sub(r'.', x))
     df['message'] = df['message'].apply(lambda x: special_chars.sub(r'', x))
-    # name = name.replace(r'/[@#$%^&*]/g', "")
     return df
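For a sense of what the substitutions above do, here is a minimal sketch (illustrative only, not part of the commit; it re-compiles the emoticons pattern from clean_messages, while special_chars is defined outside the lines shown):

import re

emoticons = re.compile('(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)')
# a trailing smiley is collapsed into a period, mirroring emoticons.sub(r'.', x) above
print(emoticons.sub(r'.', 'Das war super :D'))  # expected: 'Das war super .'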
@@ -96,21 +90,17 @@ def sort_chat_messages(df):
 def concat_and_save_message_strings(df):
-    # columns to drop
-    # cols = ['timestamp', 'extraversion_pole']
     # group, sort and concat message strings
     for group, frame in df.groupby('chat_id'):
         frame = frame.sort_values(['user_id'])
-        # data = data.drop(columns=cols)
         # concat messages per user
         strings = frame.groupby(['user_id'])['message'].apply(' '.join).reset_index()
         strings.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/chats/{}.csv'.format(group), sep=';', index=False)
 if __name__ == '__main__':
     # read data
-    chat_data = read_chat_data('outputs/ttas-filtered-chats.csv')
+    chat_data = read_chat_data('../outputs/ttas-filtered-chats.csv')
     # filter data
     filtered_chats = filter_chats(chat_data)
     print(filtered_chats.shape)
@@ -118,21 +108,13 @@ if __name__ == '__main__':
     unique_users, unique_chats, number_users, number_chats = get_n_count(filtered_chats)
     # get n messages per chats, message lengths
     message_lens, chat_summary, summary_messages = summarize_chats(filtered_chats)
-    # visualize
-    # histplot_messages(message_lens)
     # personality data
-    trait_scores = pd.read_csv('/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv')
+    trait_scores = pd.read_csv('../outputs/filtered_personality_scores.csv')
-    # interaction
-    # interaction_df = get_interaction_message_lengths_scores(filtered, trait_scores, message_lens)
     # chat_data = chat_data.drop(columns=chat_data.columns[0], axis=1)
     # print(chat_data.head())
     # clean = clean_messages(filtered_chats)
     # sort = sort_chat_messages(clean)
-    # sort.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/ttas_clean_chats.csv', sep=';', index=False)
+    # sort.to_csv('../outputs/ttas_clean_chats.csv', sep=';', index=False)
     # concat_and_save_message_strings(chat_data)
-    # strings = data.groupby(['user_id'])['message'].apply(' '.join).reset_index()
-    # print(strings)
\ No newline at end of file
-from pprint import pformat
+"""Here, methods to prepare the dataset for training are provided"""
 import pandas as pd
-import numpy as np
 from itertools import chain
 from collections import defaultdict
+from sklearn.model_selection import train_test_split
 import torch
-from torch.nn.utils.rnn import pad_sequence
-from torch.utils.data import TensorDataset, DataLoader, Dataset, RandomSampler, SequentialSampler, random_split
+from torch.utils.data import TensorDataset, DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from sklearn.model_selection import train_test_split
-from transformers import (
-    MODEL_WITH_LM_HEAD_MAPPING,
-    WEIGHTS_NAME, AdamW, AutoConfig, AutoModelWithLMHead, AutoTokenizer,
-    PreTrainedModel, PreTrainedTokenizer,
-    get_linear_schedule_with_warmup,
-    Trainer, TrainingArguments
-)
+from transformers import (AutoModelWithLMHead, AutoTokenizer)
 # define special tokens
@@ -31,7 +23,7 @@ PADDED_INPUTS = ['input_ids', 'lm_labels', 'token_type_ids']
 def tokenize_dataset(df, tokenizer):
     """Tokenize string values specified columns
-    Note: dbmbz pre-trained tokenizer cannot be applied to batches of senetences
+    Note: dbmbz pre-trained tokenizer cannot be applied to batches of sentences
     tokenize: separates string into list of words and punctuation marks
     convert_tokens_to_ids: convert words into indices of vocabulary entries"""
     print('Tokenizing messages ...')
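As a rough illustration of the per-message tokenization the docstring describes (a sketch, not part of the commit; the column name 'message' is hypothetical here):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('dbmdz/german-gpt2')
# the tokenizer is applied cell by cell rather than to batches of sentences
tokens = tokenizer.tokenize('Was ist dein Hobby?')    # string -> list of word / punctuation tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)   # tokens -> vocabulary indices
# e.g. df['message'] = df['message'].apply(lambda s: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(s)))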
@@ -48,8 +40,7 @@ def split_dataframe(df):
     -> last response is ground truth
     Note: token id 255 is an empty string and should be removed
     Split into train and test set
-    Note: test_size is set to 0.15 since the dataset is quite small"""
-    # TODO: test candidates is only ground truth response
+    test_size is set to 0.15 since the dataset is quite small"""
     print('Splitting dataset ...')
     new_df = pd.DataFrame()
     new_df['trait'] = df['extraversion_pole']
@@ -67,10 +58,13 @@ def split_dataframe(df):
 def pad_dataset(dataset, padding=0):
     """Pad Dataset.
-    Note: Labels may be padded differently"""
+    Note: LM Labels are padded differently
+    max length of history + response = 443 tokens
+    model size = 512 for dbmdz
+    model size = 1024 for GerPT"""
     print('Padding inputs ...')
-    max_l = max(len(x) for x in dataset['input_ids'])
-    # max_l = 443 -> history + response sequence
+    #max_l = max(len(x) for x in dataset['input_ids'])
+    max_l = 512
     for name in PADDED_INPUTS:
         dataset[name] = [x + [padding if name != 'lm_labels' else -100] * (max_l - len(x)) for x in dataset[name]]
     return dataset
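To make the padding rule concrete, a toy sketch (values invented, not part of the commit): input_ids and token_type_ids are filled with the pad token id, while lm_labels are filled with -100 so padded positions are ignored by the loss.

# hypothetical pad token id 0 and max_l = 6
dataset = {'input_ids': [[5, 6, 7]], 'token_type_ids': [[1, 1, 1]], 'lm_labels': [[-100, 6, 7]]}
max_l = 6
for name in ['input_ids', 'lm_labels', 'token_type_ids']:
    dataset[name] = [x + [0 if name != 'lm_labels' else -100] * (max_l - len(x)) for x in dataset[name]]
# dataset['input_ids'] -> [[5, 6, 7, 0, 0, 0]]
# dataset['lm_labels'] -> [[-100, 6, 7, -100, -100, -100]]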
@@ -79,17 +73,16 @@ def pad_dataset(dataset, padding=0):
 def add_special_token(model, tokenizer):
     """Add special tokens to model and tokenizer.
     Check with pretrained tokens."""
-    n_og_tokens = tokenizer.vocab_size
     n_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
     if n_added_tokens > 0:
-        model.resize_token_embeddings(new_num_tokens=n_og_tokens + n_added_tokens)
+        model.resize_token_embeddings(new_num_tokens=len(tokenizer))
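The switch to len(tokenizer) matters because vocab_size only counts the base vocabulary, whereas len(tokenizer) also counts added special tokens. A minimal sketch of the same pattern (the token strings below are assumptions; the real ones live in ATTR_TO_SPECIAL_TOKEN outside this hunk):

from transformers import AutoModelWithLMHead, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('dbmdz/german-gpt2')
model = AutoModelWithLMHead.from_pretrained('dbmdz/german-gpt2')
# hypothetical special tokens for the two extraversion poles
n_added = tokenizer.add_special_tokens({'additional_special_tokens': ['<introvert>', '<extrovert>']})
if n_added > 0:
    # the embedding matrix grows to cover every id the tokenizer can now emit
    model.resize_token_embeddings(new_num_tokens=len(tokenizer))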
 def build_inputs(tokenizer, trait, history, response, lm_labels=False, with_eos=True):
     """Build modeling sequences from pole, history and response segments
-    - history = list of previous utterances, list of list of token ids / words
-    - response = list of token ids / words
-    - trait = trait special token (persona chat: persona description, list of words / tokenids)
+    - history = list of previous utterances as list of list of token ids / words
+    - response = list of token ids / words for gold or distractor response
+    - trait = trait special token
     Returns dict"""
     # convert special token symbols to token ids
     bos, eos, speaker1, speaker2, introvert, extrovert = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
@@ -103,16 +96,16 @@ def build_inputs(tokenizer, trait, history, response, lm_labels=False, with_eos=
     sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1]
                                 + s for i, s in enumerate(sequence[1:])]
     instance = dict()
-    instance['input_ids'] = list(chain(*sequence)) # individual words
+    instance['input_ids'] = list(chain(*sequence))
     instance['token_type_ids'] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
-    instance['mc_token_ids'] = len(instance['input_ids']) - 1 # next sentence prediction ids?
-    instance['lm_labels'] = [-100] * len(instance['input_ids']) # language modeling labels (targets)
+    instance['mc_token_ids'] = len(instance['input_ids']) - 1
+    instance['lm_labels'] = [-100] * len(instance['input_ids'])
     if lm_labels:
         instance['lm_labels'] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
     return instance
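For orientation, a toy walk-through of the instance built above (all ids invented; how `sequence` is assembled before these lines is not shown in this hunk, so a three-segment sequence of trait segment, one history utterance and the response is assumed):

# assumed ids: speaker1 = 2, speaker2 = 3; segments already tokenized
sequence = [[1, 4], [11, 12], [21, 22]]   # [bos + trait, history utterance, response]
# after prepending speaker tokens: [[1, 4], [3, 11, 12], [2, 21, 22]]
# input_ids      -> [1, 4, 3, 11, 12, 2, 21, 22]
# token_type_ids -> [2, 2, 3, 3, 3, 2, 2, 2]
# mc_token_ids   -> 7 (index of the last token)
# lm_labels (with lm_labels=True) -> [-100, -100, -100, -100, -100, -100, 21, 22]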
-def build_dataset(df, train_set=True, distributed=False):
+def build_dataset(df, tokenizer, train_set=True, distributed=False):
     """
     Input: dataframe / dict
     Returns Tensor Dataset from dict
@@ -122,9 +115,6 @@ def build_dataset(df, train_set=True, distributed=False):
     print('Building dataset')
     #dataset = {'train': defaultdict(list), 'test': defaultdict(list)}
     dataset = defaultdict(list)
-    # persona chat: flexible n candidates, history and persona description segments
-    # me: set numbers
-    # TODO: n cadidates for test = 0 ??
     n_candidates = 3 # TODO: make adaptable
     max_history = 2 # TODO: make adaptable
     if train_set:
@@ -132,7 +122,6 @@ def build_dataset(df, train_set=True, distributed=False):
     else:
         n_candidates = 1
     # create instance for each candidate response
-    #history = df['history']#[-(2 * max_history + 1):]
     print('Building sequences ...')
     for i, row in df.iterrows():
         trait = row['trait']
@@ -145,32 +134,19 @@ def build_dataset(df, train_set=True, distributed=False):
             dataset[input_name].append(input_array)
         dataset['mc_labels'].append(n_candidates - 1) # label == 2?
     dataset['n_candidates'] = n_candidates
-    #print(len(dataset['input_ids']), len(dataset['lm_labels']), len(dataset['token_type_ids']), len(dataset['mc_token_ids']), print(len(dataset['mc_labels'])))
-    #print(len(dataset['mc_labels']))
-    print(dataset['mc_labels'])
     # pad
     padded_dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
     # convert to tensors
-    # TODO: create extra method ?
     print('Converting input sequences into tensors ...')
     tensor_set = []
-    for input_name in MODEL_INPUTS: #mc_labels and mc_token_ids have different size than rest of inputs???
-        print(input_name)
+    for input_name in MODEL_INPUTS:
         tensor = torch.tensor(padded_dataset[input_name])
-        print(tensor.size(0))
-        # size 2292 (thrice the size of the tensors below)
-        #if input_name != 'mc_labels':
         tensor = tensor.view((-1, dataset['n_candidates']) + tensor.shape[1:])
         print(tensor.size(0))
-        # size 764
         tensor_set.append(tensor)
     #build tensor data set
-    #TODO: set batchsize, shuffle and distributer elsewhere
-    #TODO: set for train and test differently
     batchsize = 4
     tensor_dataset = TensorDataset(*tensor_set) # TODO: resolve size mismatch error
     sampler = DistributedSampler(tensor_dataset) if distributed else None
@@ -179,28 +155,21 @@ def build_dataset(df, train_set=True, distributed=False):
     return loader, sampler
-# -------------- #
+def get_data_loaders(data_path, tokenizer, model):
+    """ Load, tokenize and split data and build tensor datasets for training """
+    data = pd.read_csv(data_path, sep=";")
+    data = data.drop(['chat_id', 'user_id'], axis=1)
+    add_special_token(model, tokenizer)
+    tokenized_chats = tokenize_dataset(data, tokenizer)
+    train, test = split_dataframe(tokenized_chats)
+    train_loader, train_sampler = build_dataset(train, tokenizer)
+    test_loader, test_sampler = build_dataset(test, tokenizer, train_set=False)
+    return train_loader, train_sampler, test_loader, test_sampler
 if __name__ == '__main__':
-    # initialize model
-    # TODO: extract tensor conversion from build_dataset and create new function
-    # TODO: check out arguments for DataLoader and DistributedSampler
-    # TODO: rename and reorganize scripts
+    data = '../outputs/context_chats.csv'
     tokenizer = AutoTokenizer.from_pretrained('dbmdz/german-gpt2')
-    #model = AutoModelWithLMHead.from_pretrained('dbmdz/german-gpt2')
-    # set special tokens
-    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
-    #model.set_num_special_tokens(len(SPECIAL_TOKENS))
-    # read data
-    chats = pd.read_csv('/home/jgermies/MA/majana-project/majana-code/outputs/context_chats.csv', sep=";")
-    chats = chats.drop(['chat_id', 'user_id'], axis=1)
-    print(chats.head())
-    tokenized_chats = tokenize_dataset(chats, tokenizer)
-    print(tokenized_chats['context_1'].head())
-    train, test = split_dataframe(tokenized_chats)
-    print(train['context'].head())
-    #print(tokenizer.decode([255]))
-    train_dataset = build_dataset(train)
-    #print(train_dataset['train'])
+    model = AutoModelWithLMHead.from_pretrained('dbmdz/german-gpt2')
+    train_loader, train_sampler, test_loader, test_sampler = get_data_loaders(data, tokenizer, model)
\ No newline at end of file
-model_checkpoint: /Users/Jana1/Desktop/MA/ttaspy/modeling/models/
+model_checkpoint: models/
 n_candidates: 3
 max_history: 3
 train_batch_size: 4
...
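A minimal sketch of reading a key: value config like the one above (assuming the file is YAML and PyYAML is installed; the file name is hypothetical, it is not stated in the diff):

import yaml

with open('train_config.yaml') as f:   # hypothetical file name
    cfg = yaml.safe_load(f)
print(cfg['model_checkpoint'], cfg['n_candidates'], cfg['train_batch_size'])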
"""Here, methods to process the data for modeling / fine-tuning are provided.""" """Here, methods to process the data and create context and distractor data are provided."""
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
import json
def change_userid_to_speakerid(df): def change_userid_to_speakerid(df):
"""Change made up user ids to speaker 1 and speaker 2 respectively""" """Change made up user ids to speaker 1 and speaker 2 respectively"""
@@ -32,21 +32,21 @@ def clean_up_turns(df):
     for tpl in ind_tuple:
         # TODO: iterates over df twice
         df.iloc[tpl[0]]['message'] = df.iloc[tpl[0]]['message'] + ' ' + df.iloc[tpl[1]]['message']
-        #print(df.iloc[tpl[0]]['message'])
     # drop redundant messages
     df_clean = df[~mask]
     return df_clean
 def create_context_cols(df):
+    """Create context columns in the data frame.
+    Note: Distractors are picked randomly from predefined distractor_sents.
+    For a larger dataset they should be picked randomly from the dataset itself"""
     distractor_sents = pd.Series(['Das tur mir leid.', 'Das hab ich nicht verstanden.', 'Super cool!', 'Wie meinst du das?',
                                   'Ich liebe Eis.', 'Ich bin vegan.', 'Was ist dein Lieblingsessen?', 'Was ist dein Hobby?',
                                   'Ich mag Suppe.', 'Was hast du morgen so vor?'])
-    #print(len(df))
     df['context_0'] = df['message'].shift(1, fill_value='Hi!')
     df['context_1'] = df['message'].shift(2, fill_value=' ')
     df['context_2'] = df['message'].shift(3, fill_value=' ')
-    # TODO: right now computes random distractor for whole chat, loop over whole dataset?
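The shift calls above turn each message's previous turns into context columns; a toy illustration (values invented, not part of the commit):

import pandas as pd

df = pd.DataFrame({'message': ['hi', 'how are you?', 'good, and you?']})
df['context_0'] = df['message'].shift(1, fill_value='Hi!')
df['context_1'] = df['message'].shift(2, fill_value=' ')
# row 2 ('good, and you?') gets context_0 = 'how are you?' and context_1 = 'hi'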