Commit f18f44c7 authored by Jana Germies

minor clean up

parent 9fd1b825
......@@ -24,3 +24,7 @@ outputs/test
/outputs/liwc_scores.csv
/outputs/liwc_overall_stats.csv
/outputs/big_5_with_means.png
modeling/runs
modeling/__pycache__/
\ No newline at end of file
......@@ -80,15 +80,15 @@ def compare_scores(df):
if __name__ == '__main__':
scores, overview = calculate_lsm('/Users/Jana1/Desktop/MA/ttaspy/outputs/LIWC_results/')
scores.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/liwc_scores.csv', sep=';', index=False)
overview.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/liwc_overall_stats.csv', sep=';')
scores, overview = calculate_lsm('../outputs/LIWC_results/')
scores.to_csv('../outputs/liwc_scores.csv', sep=';', index=False)
overview.to_csv('../outputs/liwc_overall_stats.csv', sep=';')
print(scores)
boxplot_lsm_scores(scores)
scores = pd.read_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/liwc_scores.csv', sep=";")
data = pd.read_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/ttas_clean_chats.csv', sep=";")
trait_dict = trait_by_chat('/Users/Jana1/Desktop/MA/ttaspy/outputs/ttas_clean_chats.csv')
#scores = pd.read_csv('outputs/liwc_scores.csv', sep=";")
data = pd.read_csv('../outputs/ttas_clean_chats.csv', sep=";")
trait_dict = trait_by_chat('../outputs/ttas_clean_chats.csv')
matched = match_trait_to_scores(scores, trait_dict)
print(matched)
sig = compare_scores(matched)
......
......@@ -12,7 +12,7 @@ from process_chats import read_chat_data, filter_chats, get_summary, get_n_count
def read_personality_data(path):
"""
Initial minor formatting is done here:
A header with column names is added and the table pivotted.
A header with column names is added and the table pivoted.
"""
# set column names
columns = ["user_id", "question", "answer"]
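For illustration, a minimal pandas sketch of the header-plus-pivot step described in the docstring (toy data; the hidden remainder of read_personality_data may differ in detail):
import pandas as pd
raw = pd.DataFrame([[1, "q1", 3], [1, "q2", 5], [2, "q1", 2], [2, "q2", 4]],
                   columns=["user_id", "question", "answer"])
wide = raw.pivot(index="user_id", columns="question", values="answer")
print(wide)  # one row per user, one column per questionnaire item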
......@@ -127,10 +127,10 @@ def get_interaction_message_lengths_scores(df_chats, df_traits, msg_lens):
if __name__ == '__main__':
# paths
personality_path_in = '/Users/Jana1/Desktop/MA/ttas/ttas-user-answers.csv'
personality_path_out = '/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv'
chat_path_in = '/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv'
chat_path_out = '/Users/Jana1/Desktop/MA/ttas/ttas-filtered-chats.csv'
personality_path_in = '../ttas-data/ttas-user-answers.csv'
personality_path_out = '../outputs/filtered_personality_scores.csv'
chat_path_in = '../ttas-data/ttas-complete-chats.csv'
chat_path_out = '../outputs/ttas-filtered-chats.csv'
# read
trait_data = read_personality_data(personality_path_in)
chat_data = read_chat_data(chat_path_in)
......@@ -145,8 +145,6 @@ if __name__ == '__main__':
filtered_scores = remove_superfluous_users(unique_users, score_df)
# calculate stats
mean_scores = get_summary(filtered_scores.drop('user_id', axis=1))
# save results
# filtered_scores.to_csv(personality_path_out)
# visualize results
boxplot_trait_scores(filtered_scores)
# interaction between message lengths and extraversion trait scores
......@@ -155,8 +153,7 @@ if __name__ == '__main__':
scatterplot_interaction(interaction)
# add extraversion pole expressions to chat data
extraversion_poles_dict = map_extraversion_poles(filtered_scores)
# TODO: move into map function?
filtered_chats['extraversion_pole'] = filtered_chats['user_id'].map(extraversion_poles_dict)
filtered_chats = filtered_chats.drop('index', axis=1)
# save results
# filtered_chats.to_csv(chat_path_out)
filtered_chats.to_csv(chat_path_out)
"""Apply methods to clean data"""
from process_chats import read_chat_data, filter_chats, get_n_count, get_summary, summarize_chats, clean_messages, \
sort_chat_messages, concat_and_save_message_strings
from calculate_personality_scores import read_personality_data, remove_fake_profiles, recode_answers, \
calculate_scores_per_user, map_extraversion_poles, remove_superfluous_users, get_interaction_message_lengths_scores
from process_chats import (read_chat_data, filter_chats, get_n_count, get_summary, summarize_chats, clean_messages,
sort_chat_messages, concat_and_save_message_strings)
from calculate_personality_scores import (read_personality_data, remove_fake_profiles, recode_answers,
calculate_scores_per_user, map_extraversion_poles, remove_superfluous_users, get_interaction_message_lengths_scores)
from visualization.visualizations import boxplot_trait_scores, histplot_messages, scatterplot_interaction
if __name__ == '__main__':
chats_input_path = '/Users/Jana1/Desktop/MA/ttas/ttas-complete-chats.csv'
traits_input_path = '/Users/Jana1/Desktop/MA/ttas-py/trait_scores.csv'
chat_output_path = '/Users/Jana1/Desktop/MA/ttas/ttas-filtered-chats.csv'
traits_out_path = '/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv'
chats_input_path = '../ttas-data/ttas-complete-chats.csv'
traits_input_path = '../ttas-data/trait_scores.csv'
chat_output_path = '../outputs/ttas-filtered-chats.csv'
traits_out_path = '../outputs/filtered_personality_scores.csv'
# read in raw chat data
chat_data = read_chat_data(chats_input_path)
# filter out chats > 4
......@@ -22,7 +21,9 @@ if __name__ == '__main__':
# summarize conversations
message_lens, chat_summary, summary_messages = summarize_chats(clean_chats)
### ----- ###
# extra step: manual cleaning
### ----- ###
# read in raw questionnaire answers
personality_answers = read_personality_data(traits_input_path)
......@@ -36,7 +37,7 @@ if __name__ == '__main__':
trait_scores.reset_index(inplace=True) # TODO: check reset index in original method
filtered_scores = remove_superfluous_users(trait_scores)
# evaluate
mean_scores = get_summary(filtered_scores.drop('user_id', axis=1)) # TODO: check drop in original method
mean_scores = get_summary(filtered_scores.drop('user_id', axis=1))
# map extraversion scores to pole expression labels
extraversion_dict = map_extraversion_poles(filtered_scores)
......@@ -57,6 +58,5 @@ if __name__ == '__main__':
histplot_messages(message_lens)
scatterplot_interaction(interaction)
# prepare for LIWC
#concat_and_save_message_strings(sorted_chats)
concat_and_save_message_strings(sorted_chats)
......@@ -3,8 +3,6 @@
import re
import pandas as pd
# from calculate_personality_scores import get_interaction_message_lengths_scores
# from visualization.visualizations import histplot_messages, scatterplot_interaction
# read data
......@@ -68,9 +66,6 @@ def get_n_count(df):
return uniq_users, uniq_chats, n_users, n_chats
########
def clean_messages(df):
""" Clean dataframe from emoticons and other special tokens"""
emoticons = re.compile('(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)')
......@@ -78,7 +73,6 @@ def clean_messages(df):
# apply regex to df
df['message'] = df['message'].apply(lambda x: emoticons.sub(r'.', x))
df['message'] = df['message'].apply(lambda x: special_chars.sub(r'', x))
# name = name.replace(r'/[@#$%^&*]/g', "")
return df
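A hedged usage sketch with toy data (assumes the regex definitions above are in scope; the exact substitutions depend on them):
sample = pd.DataFrame({'message': ['Hallo :) wie geht es dir?', 'Gut, danke <3 !']})
cleaned = clean_messages(sample)
print(cleaned['message'].tolist())  # emoticons replaced by '.', special characters stripped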
......@@ -96,21 +90,17 @@ def sort_chat_messages(df):
def concat_and_save_message_strings(df):
# columns to drop
# cols = ['timestamp', 'extraversion_pole']
# group, sort and concat message strings
for group, frame in df.groupby('chat_id'):
frame = frame.sort_values(['user_id'])
# data = data.drop(columns=cols)
# concat messages per user
strings = frame.groupby(['user_id'])['message'].apply(' '.join).reset_index()
strings.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/chats/{}.csv'.format(group), sep=';', index=False)
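A toy illustration of the groupby-and-join idiom used above (a real run writes one CSV per chat_id):
toy = pd.DataFrame({'user_id': ['a', 'b', 'a'], 'message': ['hi', 'hallo', 'wie gehts?']})
joined = toy.groupby(['user_id'])['message'].apply(' '.join).reset_index()
print(joined)  # one concatenated message string per user: 'hi wie gehts?' for a, 'hallo' for b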
if __name__ == '__main__':
# read data
chat_data = read_chat_data('outputs/ttas-filtered-chats.csv')
chat_data = read_chat_data('../outputs/ttas-filtered-chats.csv')
# filter data
filtered_chats = filter_chats(chat_data)
print(filtered_chats.shape)
......@@ -118,21 +108,13 @@ if __name__ == '__main__':
unique_users, unique_chats, number_users, number_chats = get_n_count(filtered_chats)
# get n messages per chats, message lengths
message_lens, chat_summary, summary_messages = summarize_chats(filtered_chats)
# visualize
# histplot_messages(message_lens)
# personality data
trait_scores = pd.read_csv('/Users/Jana1/Desktop/MA/ttas-py/filtered_personality_scores.csv')
# interaction
# interaction_df = get_interaction_message_lengths_scores(filtered, trait_scores, message_lens)
trait_scores = pd.read_csv('../outputs/filtered_personality_scores.csv')
# chat_data = chat_data.drop(columns=chat_data.columns[0], axis=1)
# print(chat_data.head())
# clean = clean_messages(filtered_chats)
# sort = sort_chat_messages(clean)
# sort.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/ttas_clean_chats.csv', sep=';', index=False)
# sort.to_csv('../outputs/ttas_clean_chats.csv', sep=';', index=False)
# concat_and_save_message_strings(chat_data)
# strings = data.groupby(['user_id'])['message'].apply(' '.join).reset_index()
# print(strings)
\ No newline at end of file
from pprint import pformat
"""Here, methods to prepare the dataset for training are provided"""
import pandas as pd
import numpy as np
from itertools import chain
from collections import defaultdict
from sklearn.model_selection import train_test_split
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, Dataset, RandomSampler, SequentialSampler, random_split
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from sklearn.model_selection import train_test_split
from transformers import (
MODEL_WITH_LM_HEAD_MAPPING,
WEIGHTS_NAME, AdamW, AutoConfig, AutoModelWithLMHead, AutoTokenizer,
PreTrainedModel, PreTrainedTokenizer,
get_linear_schedule_with_warmup,
Trainer, TrainingArguments
)
from transformers import (AutoModelWithLMHead, AutoTokenizer)
# define special tokens
......@@ -31,7 +23,7 @@ PADDED_INPUTS = ['input_ids', 'lm_labels', 'token_type_ids']
def tokenize_dataset(df, tokenizer):
"""Tokenize string values specified columns
Note: dbmbz pre-trained tokenizer cannot be applied to batches of senetences
Note: dbmbz pre-trained tokenizer cannot be applied to batches of sentences
tokenize: separates string into list of words and punctuation marks
convert_tokens_to_ids: convert words into indices of vocabulary entries"""
print('Tokenizing messages ...')
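A minimal sketch of the per-message two-step tokenization described above (assumes the dbmdz/german-gpt2 tokenizer; the resulting ids depend on its vocabulary):
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained('dbmdz/german-gpt2')
tokens = tok.tokenize('Wie geht es dir?')   # list of (sub)word tokens
ids = tok.convert_tokens_to_ids(tokens)     # corresponding vocabulary indices
print(tokens, ids)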
......@@ -48,8 +40,7 @@ def split_dataframe(df):
-> last response is ground truth
Note: token id 255 is an empty string and should be removed
Split into train and test set
Note: test_size is set to 0.15 since the dataset is quite small"""
# TODO: test candidates is only ground truth response
test_size is set to 0.15 since the dataset is quite small"""
print('Splitting dataset ...')
new_df = pd.DataFrame()
new_df['trait'] = df['extraversion_pole']
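A self-contained sketch of the 15% hold-out split described in the docstring (toy frame; random_state is illustrative, and the hidden remainder of split_dataframe fills new_df's other columns before splitting):
demo = pd.DataFrame({'trait': ['<introvert>', '<extrovert>'] * 10})
train_demo, test_demo = train_test_split(demo, test_size=0.15, random_state=0)
print(len(train_demo), len(test_demo))  # 17 train rows, 3 test rows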
......@@ -67,10 +58,13 @@ def split_dataframe(df):
def pad_dataset(dataset, padding=0):
"""Pad Dataset.
Note: Labels may be padded differently"""
Note: LM Labels are padded differently
max length of history + response = 443 tokens
model size = 512 for dbmdz
model size = 1024 for GerPT"""
print('Padding inputs ...')
max_l = max(len(x) for x in dataset['input_ids'])
# max_l = 443 -> history + response sequence
#max_l = max(len(x) for x in dataset['input_ids'])
max_l = 512
for name in PADDED_INPUTS:
dataset[name] = [x + [padding if name != 'lm_labels' else -100] * (max_l - len(x)) for x in dataset[name]]
return dataset
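A toy illustration of the padding rule above (0 stands in for the real <pad> token id; assumes pad_dataset and PADDED_INPUTS are in scope):
toy = {'input_ids': [[5, 6, 7]], 'lm_labels': [[-100, -100, 7]], 'token_type_ids': [[1, 1, 1]]}
padded = pad_dataset(toy, padding=0)
# every list is extended to max_l = 512: input_ids and token_type_ids with 0,
# lm_labels with -100 so the padded positions are ignored by the LM loss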
......@@ -79,17 +73,16 @@ def pad_dataset(dataset, padding=0):
def add_special_token(model, tokenizer):
"""Add special tokens to model and tokenizer.
Check with pretrained tokens."""
n_og_tokens = tokenizer.vocab_size
n_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
if n_added_tokens > 0:
model.resize_token_embeddings(new_num_tokens=n_og_tokens + n_added_tokens)
model.resize_token_embeddings(new_num_tokens=len(tokenizer))
def build_inputs(tokenizer, trait, history, response, lm_labels=False, with_eos=True):
"""Build modeling sequences from pole, history and response segments
- history = list of previous utterances, list of list of token ids / words
- response = list of token ids / words
- trait = trait special token (persona chat: persona description, list of words / tokenids)
- history = list of previous utterances as list of list of token ids / words
- response = list of token ids / words for gold or distractor response
- trait = trait special token
Returns dict"""
# convert special token symbols to token ids
bos, eos, speaker1, speaker2, introvert, extrovert = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
......@@ -103,16 +96,16 @@ def build_inputs(tokenizer, trait, history, response, lm_labels=False, with_eos=
sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1]
+ s for i, s in enumerate(sequence[1:])]
instance = dict()
instance['input_ids'] = list(chain(*sequence)) # individual words
instance['input_ids'] = list(chain(*sequence))
instance['token_type_ids'] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
instance['mc_token_ids'] = len(instance['input_ids']) - 1 # next sentence prediction ids?
instance['lm_labels'] = [-100] * len(instance['input_ids']) # language modeling labels (targets)
instance['mc_token_ids'] = len(instance['input_ids']) - 1
instance['lm_labels'] = [-100] * len(instance['input_ids'])
if lm_labels:
instance['lm_labels'] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
return instance
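Schematically, one instance built above (hedged: the hidden first lines of build_inputs assemble sequence[0] from <bos> and the trait token, by analogy with the TransferTransfo persona segment):
#   input_ids      ~ <bos> <introvert|extrovert> <speaker1> utt_1 <speaker2> utt_2 <speaker1> response <eos>
#   token_type_ids ~ one speaker token id per position, marking who produced it
#   mc_token_ids   ~ index of the last token, read by the multiple-choice (next-sentence) head
#   lm_labels      ~ -100 everywhere except the response tokens when lm_labels=True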
def build_dataset(df, train_set=True, distributed=False):
def build_dataset(df, tokenizer, train_set=True, distributed=False):
"""
Input: dataframe / dict
Returns Tensor Dataset from dict
......@@ -122,9 +115,6 @@ def build_dataset(df, train_set=True, distributed=False):
print('Building dataset')
#dataset = {'train': defaultdict(list), 'test': defaultdict(list)}
dataset = defaultdict(list)
# persona chat: flexible n candidates, history and persona description segments
# me: set numbers
# TODO: n cadidates for test = 0 ??
n_candidates = 3 # TODO: make adaptable
max_history = 2 # TODO: make adaptable
if train_set:
......@@ -132,7 +122,6 @@ def build_dataset(df, train_set=True, distributed=False):
else:
n_candidates = 1
# create instance for each candidate response
#history = df['history']#[-(2 * max_history + 1):]
print('Building sequences ...')
for i, row in df.iterrows():
trait = row['trait']
......@@ -145,32 +134,19 @@ def build_dataset(df, train_set=True, distributed=False):
dataset[input_name].append(input_array)
dataset['mc_labels'].append(n_candidates - 1) # label == 2?
dataset['n_candidates'] = n_candidates
#print(len(dataset['input_ids']), len(dataset['lm_labels']), len(dataset['token_type_ids']), len(dataset['mc_token_ids']), print(len(dataset['mc_labels'])))
#print(len(dataset['mc_labels']))
print(dataset['mc_labels'])
# pad
padded_dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
# convert to tensors
# TODO: create extra method ?
print('Converting input sequences into tensors ...')
tensor_set = []
for input_name in MODEL_INPUTS: #mc_labels and mc_token_ids have different size than rest of inputs???
print(input_name)
for input_name in MODEL_INPUTS:
tensor = torch.tensor(padded_dataset[input_name])
print(tensor.size(0))
# size 2292 (thrice the size of the tensors below)
#if input_name != 'mc_labels':
tensor = tensor.view((-1, dataset['n_candidates']) + tensor.shape[1:])
print(tensor.size(0))
# size 764
tensor_set.append(tensor)
#build tensor data set
#TODO: set batchsize, shuffle and distributer elsewhere
#TODO: set for train and test differently
batchsize = 4
tensor_dataset = TensorDataset(*tensor_set) # TODO: resolve size mismatch error
sampler = DistributedSampler(tensor_dataset) if distributed else None
......@@ -179,28 +155,21 @@ def build_dataset(df, train_set=True, distributed=False):
return loader, sampler
# -------------- #
def get_data_loaders(data_path, tokenizer, model):
""" Load, tokenize and split data and build tensor datasets for training """
data = pd.read_csv(data_path, sep=";")
data = data.drop(['chat_id', 'user_id'], axis=1)
add_special_token(model, tokenizer)
tokenized_chats = tokenize_dataset(data, tokenizer)
train, test = split_dataframe(tokenized_chats)
train_loader, train_sampler = build_dataset(train, tokenizer)
test_loader, test_sampler = build_dataset(test, tokenizer, train_set=False)
return train_loader, train_sampler, test_loader, test_sampler
if __name__ == '__main__':
# initialize model
# TODO: extract tensor conversion from build_dataset and create new function
# TODO: check out arguments for DataLoader and DistributedSampler
# TODO: rename and reorganize scripts
data = '../outputs/context_chats.csv'
tokenizer = AutoTokenizer.from_pretrained('dbmdz/german-gpt2')
#model = AutoModelWithLMHead.from_pretrained('dbmdz/german-gpt2')
# set special tokens
tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
#model.set_num_special_tokens(len(SPECIAL_TOKENS))
# read data
chats = pd.read_csv('/home/jgermies/MA/majana-project/majana-code/outputs/context_chats.csv', sep=";")
chats = chats.drop(['chat_id', 'user_id'], axis=1)
print(chats.head())
tokenized_chats = tokenize_dataset(chats, tokenizer)
print(tokenized_chats['context_1'].head())
train, test = split_dataframe(tokenized_chats)
print(train['context'].head())
#print(tokenizer.decode([255]))
train_dataset = build_dataset(train)
#print(train_dataset['train'])
model = AutoModelWithLMHead.from_pretrained('dbmdz/german-gpt2')
train_loader, train_sampler, test_loader, test_sampler = get_data_loaders(data, tokenizer, model)
\ No newline at end of file
model_checkpoint: /Users/Jana1/Desktop/MA/ttaspy/modeling/models/
model_checkpoint: models/
n_candidates: 3
max_history: 3
train_batch_size: 4
......
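A hedged sketch of loading this YAML config into an attribute-style namespace, so values are reachable as args.train_batch_size (the real get_params in the training script is not shown in this diff and may differ; the file name is illustrative):
import yaml
from argparse import Namespace
def load_params(yaml_path):
    with open(yaml_path) as f:
        return Namespace(**yaml.safe_load(f))
# args = load_params('train_args.yaml'); args.train_batch_size -> 4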
"""Here, methods to process the data for modeling / fine-tuning are provided."""
"""Here, methods to process the data and create context and distractor data are provided."""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
def change_userid_to_speakerid(df):
"""Change made up user ids to speaker 1 and speaker 2 respectively"""
......@@ -32,21 +32,21 @@ def clean_up_turns(df):
for tpl in ind_tuple:
# TODO: iterates over df twice
df.iloc[tpl[0]]['message'] = df.iloc[tpl[0]]['message'] + ' ' + df.iloc[tpl[1]]['message']
#print(df.iloc[tpl[0]]['message'])
# drop redundant messages
df_clean = df[~mask]
return df_clean
def create_context_cols(df):
"""Create context columns in the data frame.
Note: Distractors are picked randomly from predefined distractor_sents.
For a larger dataset they should be picked randomly from the dataset itself"""
distractor_sents = pd.Series(['Das tut mir leid.', 'Das hab ich nicht verstanden.', 'Super cool!', 'Wie meinst du das?',
'Ich liebe Eis.', 'Ich bin vegan.', 'Was ist dein Lieblingsessen?', 'Was ist dein Hobby?',
'Ich mag Suppe.', 'Was hast du morgen so vor?'])
#print(len(df))
df['context_0'] = df['message'].shift(1, fill_value='Hi!')
df['context_1'] = df['message'].shift(2, fill_value=' ')
df['context_2'] = df['message'].shift(3, fill_value=' ')
# TODO: right now computes random distractor for whole chat, loop over whole dataset?
df['distractor_1'] = distractor_sents[np.random.randint(0, len(distractor_sents), len(df)).tolist()].tolist()
df['distractor_2'] = distractor_sents[np.random.randint(0, len(distractor_sents), len(df)).tolist()].tolist()
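A toy illustration of the shift-based context columns above:
toy = pd.DataFrame({'message': ['m1', 'm2', 'm3', 'm4']})
toy['context_0'] = toy['message'].shift(1, fill_value='Hi!')  # previous message
toy['context_1'] = toy['message'].shift(2, fill_value=' ')    # message before that
toy['context_2'] = toy['message'].shift(3, fill_value=' ')
# the row for 'm4' now carries context_0='m3', context_1='m2', context_2='m1'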
......@@ -54,7 +54,6 @@ def create_context_cols(df):
def format_context_response_table(df):
# TODO: maybe check timestamp one more time with sort?
# concat messages
df_turns = clean_up_turns(df)
# change usernames
......@@ -65,11 +64,14 @@ def format_context_response_table(df):
def table_to_nested_dict(df):
# TODO: rework
"""Create a nested dict of the data that can be saved to json file.
The two main keys are the two extraversion trait poles, each key holds the individual messages,
their respective chat history and distractor replies.
Note: Not used in the final pipeline."""
df['candidates'] = df.apply(lambda x: [x['distractor_1']] + [x['distractor_2']] + [x['message']], axis=1)
df['context'] = df.apply(lambda x: [x['context_2']] + [x['context_1']] + [x['context_0']], axis=1)
df['context'] = [[msg for msg in li if msg != ' '] for li in df['context']]
# nested_dict = df.groupby('extraversion_pole')[['candidates', 'context']].apply(lambda x: x.to_dict()).to_dict()
keys = ['personality', 'utterances']
data = {'train': [], 'test': []}
grouped = df.groupby('extraversion_pole')
......@@ -97,7 +99,7 @@ def table_to_nested_dict(df):
if __name__ == '__main__':
chats = pd.read_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/ttas_clean_chats.csv', sep=";")
chats = pd.read_csv('../outputs/ttas_clean_chats.csv', sep=";")
print(chats.head())
#clean_turns = clean_up_turns(chats)
#print(clean_turns.iloc[2]['message'])
......@@ -106,13 +108,13 @@ if __name__ == '__main__':
contextual_df.reset_index(drop=True, inplace=True)
print(contextual_df.shape)
print(contextual_df.head())
contextual_df.to_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/context_chats.csv', sep=';', index=False)
contextual_df.to_csv('../outputs/context_chats.csv', sep=';', index=False)
# --- #
#chats = pd.read_csv('/Users/Jana1/Desktop/MA/ttaspy/outputs/context_chats.csv', sep=";")
#chats = pd.read_csv('../outputs/context_chats.csv', sep=";")
#dicti = table_to_nested_dict(chats)
#print(dicti.keys)
#print(dicti['test'][1]['utterances'])
#with open('/Users/Jana1/Desktop/MA/ttaspy/outputs/dataset.json', 'w') as fp:
#with open('../outputs/dataset.json', 'w') as fp:
# json.dump(dicti, fp, indent=2, ensure_ascii=False)
#print(dicti['test'][-1])
......
"""Here, methods for fine-tuning and the trainer engine are provided.
The code is inspired by Wolf et al.'s (2019) approach of training TransferTransfo on the PersonaChat dataset for ConvAI2.
Much of the code is adapted from Thomas Wolf's blogpost, found here:
https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313
"""
import os
import math
......@@ -8,27 +13,22 @@ import socket
from datetime import datetime
import torch
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint, global_step_from_engine
from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler
from transformers import (MODEL_WITH_LM_HEAD_MAPPING,WEIGHTS_NAME, CONFIG_NAME, AdamW, GPT2DoubleHeadsModel,
AutoTokenizer, GPT2Tokenizer, PreTrainedModel, PreTrainedTokenizer)
from transformers import (WEIGHTS_NAME, CONFIG_NAME, AdamW, GPT2DoubleHeadsModel, AutoTokenizer)
from build_dataset import add_special_token, split_dataframe, tokenize_dataset, build_dataset
from build_dataset import add_special_token, get_data_loaders
def get_data_loaders(data_path, tokenizer):
""" Load, tokenize and split data and build tensor datasets for training """
chats = pd.read_csv(data_path, sep=";")
chats = chats.drop(['chat_id', 'user_id'], axis=1)
tokenized_chats = tokenize_dataset(chats, tokenizer)
train, test = split_dataframe(tokenized_chats)
train_loader, train_sampler = build_dataset(train)
test_loader, test_sampler = build_dataset(test, train_set=False)
return train_loader, train_sampler, test_loader, test_sampler
# define special tokens
SPECIAL_TOKENS = ['<bos>', '<eos>', '<speaker1>', '<speaker2>', '<introvert>', '<extrovert>', '<pad>']
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
'additional_special_tokens': ['<speaker1>', '<speaker2>', '<introvert>', '<extrovert>']}
MODEL_INPUTS = ['input_ids', 'mc_token_ids', 'lm_labels', 'mc_labels', 'token_type_ids']
PADDED_INPUTS = ['input_ids', 'lm_labels', 'token_type_ids']
def get_params(yaml_path):
......@@ -47,83 +47,72 @@ def make_logdir(model_name):
return log_dir
def average_distributed_scalar(scalar, args, device):
def average_distributed_scalar(scalar):
""" Calculate scalar for loss
Note: distributed setting may not be supported on macOS """
if args.local_rank == -1:
Note: the distributed setting is not supported on macOS """
return scalar
else:
scalar_t = torch.tensor(scalar, dtype=torch.float, device=device) / torch.distributed.get_world_size()
torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
return scalar_t.item()
# TODO: use transformer Trainer?
def update(engine, batch, args, model, device, optimizer, fp16=False):
def train(config_path, data_path):
""" Train model on prepared dataset """
# get parameters from config file
args = get_params(config_path)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# initiate pretrained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('dbmdz/german-gpt2')
model_class = GPT2DoubleHeadsModel
model = model_class.from_pretrained('dbmdz/german-gpt2')
model.to(device)
add_special_token(model, tokenizer)
optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)
# load data
train_loader, train_sampler, test_loader, test_sampler = get_data_loaders(data_path, tokenizer, model)
# initiate trainer and evaluator
def update(engine, batch):
""" Calculate loss for language modeling (lm_loss) and next sentence prediction task (mc_loss) """
model.train()
batch = tuple(input_tensor.to(device) for input_tensor in batch)
input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
(lm_loss), (mc_loss), *_ = model(
input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
mc_labels=mc_labels, lm_labels=lm_labels
)
mc_labels=mc_labels)
loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
# distributed training
if fp16:
# remove ?
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm(amp.master_params(optimizer), args.max_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
if engine.state.iteration % args.gradient_accumulation_steps == 0:
optimizer.step()
optimizer.zero_grad()
return loss.item()
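# Worked example of the loss above (coefficient values are assumptions for
# illustration): with lm_coef = mc_coef = 1.0 and gradient_accumulation_steps = 8,
# each iteration contributes (lm_loss + mc_loss) / 8 to the accumulated gradients,
# and optimizer.step() only runs every 8th iteration, i.e. the effective batch
# size is train_batch_size * 8.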
trainer = Engine(update)
def inference(engine, batch, model, device, tokenizer):
def inference(engine, batch):
model.eval()
with torch.no_grad():
batch = tuple(input_tensor.to(device) for input_tensor in batch)
input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
print(tokenizer.decode(input_ids[0, -1, :].tolist()))
lm_logits, mc_logits, *_ = model(
outputs = model(
input_ids, token_type_ids=token_type_ids,