Commit 80cf448d authored by Jana Germies

minor cleaning, add comments

parent 7506470c
......@@ -100,6 +100,7 @@ if __name__ == '__main__':
*** Congratulations ***
Basic cleaning and analysis of the data are done.
You should have a good understanding of the data now.
Please continue with preparing the cleaned chats for
the modeling pipeline. To do so, please switch to the
modeling directory and execute the file:
......
......@@ -26,7 +26,7 @@ def tokenize_dataset(df, tokenizer):
Note: the dbmdz pre-trained tokenizer cannot be applied to batches of sentences
tokenize: separates a string into a list of words and punctuation marks
convert_tokens_to_ids: converts tokens into indices of vocabulary entries"""
print('Tokenizing messages ...')
print('INFO: Tokenizing messages ...')
# tokenize and encode
cols = ['message', 'distractor_1', 'distractor_2', 'context_0', 'context_1', 'context_2']
for name in cols:
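
The note above says the dbmdz pre-trained tokenizer cannot encode whole batches, so each cell is tokenized and converted to vocabulary ids individually. A minimal sketch of that per-cell encoding with pandas apply (the helper encode_cell and the toy frame are illustrative, not part of the repository):

import pandas as pd
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('dbmdz/german-gpt2')

def encode_cell(text):
    # split the string into tokens, then map each token to its vocabulary index
    tokens = tokenizer.tokenize(str(text))
    return tokenizer.convert_tokens_to_ids(tokens)

toy = pd.DataFrame({'message': ['Hallo, wie geht es dir?']})
toy['message'] = toy['message'].apply(encode_cell)
print(toy['message'].iloc[0])  # a list of integer token ids
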
......@@ -41,7 +41,7 @@ def split_dataframe(df):
Note: token id 255 is an empty string and should be removed
Split into train and test set
test_size is set to 0.15 since the dataset is quite small"""
print('Splitting dataset ...')
print('INFO: Splitting dataset ...')
new_df = pd.DataFrame()
new_df['trait'] = df['extraversion_pole']
new_df['candidates'] = df.apply(lambda x: [x['distractor_1']] + [x['distractor_2']] + [x['message']], axis=1)
......@@ -51,8 +51,7 @@ def split_dataframe(df):
train, test = train_test_split(new_df, test_size=0.15, random_state=0, stratify=new_df[['trait']])
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
print("Train and test samples:", train.shape, test.shape)
print(train)
print('INFO: Train and test samples:', train.shape, test.shape)
return train, test
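
The docstring above notes that token id 255 encodes an empty string and should be removed. A minimal sketch of such a filter over a column of already-encoded messages (the constant, helper, and toy data are illustrative):

import pandas as pd

EMPTY_STRING_ID = 255  # per the note above: id 255 is an empty string

def drop_empty_token(ids):
    # remove the empty-string id from one encoded sequence
    return [i for i in ids if i != EMPTY_STRING_ID]

toy = pd.DataFrame({'message': [[255, 1024, 37, 255, 88]]})
toy['message'] = toy['message'].apply(drop_empty_token)
print(toy['message'].iloc[0])  # [1024, 37, 88]
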
......@@ -62,7 +61,7 @@ def pad_dataset(dataset, padding=0):
max length of history + response = 443 tokens
model size = 512 for dbmdz
model size = 1024 for GerPT"""
print('Padding inputs ...')
print('INFO: Padding inputs ...')
#max_l = max(len(x) for x in dataset['input_ids'])
max_l = 512
for name in PADDED_INPUTS:
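
pad_dataset right-pads every input sequence to a fixed model size (512 for dbmdz, per the docstring) with the padding token id. A minimal stand-alone sketch of that step (the pad id of 0 is illustrative; the script itself passes tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])):

PAD_ID = 0     # illustrative padding id
MAX_LEN = 512  # model size for dbmdz, per the docstring

def pad_sequences(sequences, pad_id=PAD_ID, max_len=MAX_LEN):
    # append pad_id until every token-id list reaches max_len
    return [seq + [pad_id] * (max_len - len(seq)) for seq in sequences]

padded = pad_sequences([[5, 6, 7], [8, 9]])
print(len(padded[0]), len(padded[1]))  # 512 512
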
......@@ -107,22 +106,20 @@ def build_inputs(tokenizer, trait, history, response, lm_labels=False, with_eos=
def build_dataset(df, tokenizer, train_set=True, distributed=False):
"""
Input: dataframe / dict
Returns Tensor Dataset from dict
Transforms the input dataframe or dict into a Tensor Dataset.
Note: distributed training is only supported on Linux and Windows.
For support on macOS, the library needs to be compiled from source
"""
print('Building dataset')
#dataset = {'train': defaultdict(list), 'test': defaultdict(list)}
print('INFO: Building dataset')
dataset = defaultdict(list)
n_candidates = 3 # TODO: make adaptable
max_history = 2 # TODO: make adaptable
n_candidates = 3
max_history = 2
if train_set:
n_candidates = n_candidates
else:
n_candidates = 1
# create instance for each candidate response
print('Building sequences ...')
print('INFO: Building sequences ...')
for i, row in df.iterrows():
trait = row['trait']
history = row['context'][-(2*3+1):]
......@@ -138,12 +135,11 @@ def build_dataset(df, tokenizer, train_set=True, distributed=False):
padded_dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
# convert to tensors
print('Converting input sequences into tensors ...')
print('INFO: Converting input sequences into tensors ...')
tensor_set = []
for input_name in MODEL_INPUTS:
tensor = torch.tensor(padded_dataset[input_name])
tensor = tensor.view((-1, dataset['n_candidates']) + tensor.shape[1:])
print(tensor.size(0))
tensor_set.append(tensor)
#build tensor data set
......@@ -151,7 +147,7 @@ def build_dataset(df, tokenizer, train_set=True, distributed=False):
tensor_dataset = TensorDataset(*tensor_set) # TODO: resolve size mismatch error
sampler = DistributedSampler(tensor_dataset) if distributed else None
loader = DataLoader(tensor_dataset, sampler=sampler, batch_size=batchsize, shuffle=False)
print("Dataset (Batch, Candidates, Seq Length):{}".format(tensor_dataset.tensors[0].shape))
print('INFO: Dataset (Batch, Candidates, Seq Length):{}'.format(tensor_dataset.tensors[0].shape))
return loader, sampler
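
The loader returned here yields one tensor per entry in MODEL_INPUTS, each reshaped to (batch, candidates, sequence length). A minimal sketch of how such a loader can be built and consumed, using random stand-in tensors (all shapes and names are illustrative):

import torch
from torch.utils.data import TensorDataset, DataLoader

# stand-in tensors: 8 dialogues, 3 candidate responses, 512 tokens each
input_ids = torch.randint(0, 50000, (8, 3, 512))
mc_labels = torch.randint(0, 3, (8,))

tensor_dataset = TensorDataset(input_ids, mc_labels)
loader = DataLoader(tensor_dataset, batch_size=4, shuffle=False)

for batch_input_ids, batch_mc_labels in loader:
    print(batch_input_ids.shape)  # torch.Size([4, 3, 512])
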
......@@ -168,8 +164,8 @@ def get_data_loaders(data_path, tokenizer, model):
if __name__ == '__main__':
data = '../outputs/context_chats.csv'
tokenizer = AutoTokenizer.from_pretrained('dbmdz/german-gpt2')
model = AutoModelWithLMHead.from_pretrained('dbmdz/german-gpt2')
train_loader, train_sampler, test_loader, test_sampler = get_data_loaders(data, tokenizer, model)
\ No newline at end of file
#if __name__ == '__main__':
#data = '../outputs2/context-chats.csv'
#tokenizer = AutoTokenizer.from_pretrained('dbmdz/german-gpt2')
#model = AutoModelWithLMHead.from_pretrained('dbmdz/german-gpt2')
#train_loader, train_sampler, test_loader, test_sampler = get_data_loaders(data, tokenizer, model)
\ No newline at end of file
......@@ -8,9 +8,13 @@ from sklearn.model_selection import train_test_split
def change_userid_to_speakerid(df):
"""Change made up user ids to speaker 1 and speaker 2 respectively"""
uniq_user_id = df.user_id.unique()
chat = df['chat_id']
if len(uniq_user_id) < 2:
print('WARNING: Conversation with only 1 speaker detected. Please remove conversation:', chat)
else:
# map to dict
speaker_dict = {uniq_user_id[0]: 'speaker1', uniq_user_id[1]: 'speaker2'}
df['user_id'] = df['user_id'].map(speaker_dict)
speaker_dict = {uniq_user_id[0]: 'speaker1', uniq_user_id[1]: 'speaker2'}
df['user_id'] = df['user_id'].map(speaker_dict)
return df
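
change_userid_to_speakerid relabels the two user ids of a chat as speaker1 and speaker2 via a dict lookup. A minimal sketch of that mapping on a toy frame (column names follow the function above; the data is made up):

import pandas as pd

toy = pd.DataFrame({
    'chat_id': [1, 1, 1],
    'user_id': ['user_a', 'user_b', 'user_a'],
    'message': ['Hi', 'Hallo', 'Wie geht es dir?'],
})

uniq_user_id = toy.user_id.unique()
if len(uniq_user_id) < 2:
    print('WARNING: Conversation with only 1 speaker detected.')
else:
    toy['user_id'] = toy['user_id'].map({uniq_user_id[0]: 'speaker1', uniq_user_id[1]: 'speaker2'})
print(toy)
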
......@@ -30,7 +34,6 @@ def clean_up_turns(df):
ind_tuple = zip(concat_ind, mask_ind)
# concatenate messages / turns
for tpl in ind_tuple:
# TODO: iterates over df twice
df.iloc[tpl[0]]['message'] = df.iloc[tpl[0]]['message'] + ' ' + df.iloc[tpl[1]]['message']
# drop redundant messages
df_clean = df[~mask]
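
clean_up_turns merges consecutive messages by the same speaker into one turn and drops the now redundant rows. The index-based concatenation is only partly visible above; one common way to express the same idea with pandas shift/cumsum is sketched below (an illustration, not the file's exact approach):

import pandas as pd

toy = pd.DataFrame({
    'chat_id': [1, 1, 1, 1],
    'user_id': ['speaker1', 'speaker1', 'speaker2', 'speaker1'],
    'message': ['Hey', 'bist du da?', 'Ja', 'super'],
})

# a new turn starts whenever the speaker changes
turn = (toy['user_id'] != toy['user_id'].shift()).cumsum().rename('turn')
merged = (toy.groupby(['chat_id', turn], sort=False)
             .agg(user_id=('user_id', 'first'), message=('message', ' '.join))
             .reset_index())
print(merged)
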
......@@ -56,6 +59,7 @@ def create_context_cols(df):
def format_context_response_table(df):
# concat messages
df_turns = clean_up_turns(df)
print(df_turns)
# change usernames
df_speaker = df_turns.groupby('chat_id').apply(change_userid_to_speakerid)
# create context columns
......@@ -98,25 +102,25 @@ def table_to_nested_dict(df):
if __name__ == '__main__':
chats = pd.read_csv('../outputs/ttas_clean_chats.csv', sep=";")
print(chats.head())
#clean_turns = clean_up_turns(chats)
#print(clean_turns.iloc[2]['message'])
# create context and distractor columns
chats = pd.read_csv('../outputs2/ttas-annotated-chats.csv', sep=";")
contextual_df = format_context_response_table(chats)
contextual_df = contextual_df.drop(['timestamp'], axis=1)
contextual_df.reset_index(drop=True, inplace=True)
print(contextual_df.shape)
print(contextual_df.head())
contextual_df.to_csv('../outputs/context_chats.csv', sep=';', index=False)
# --- #
#chats = pd.read_csv('../outputs/context_chats.csv', sep=";")
#dicti = table_to_nested_dict(chats)
#print(dicti.keys)
#print(dicti['test'][1]['utterances'])
#with open('../outputs/dataset.json', 'w') as fp:
# json.dump(dicti, fp, indent=2, ensure_ascii=False)
#print(dicti['test'][-1])
contextual_df.to_csv('../outputs2/context-chats.csv', sep=';', index=False)
print("""
*******************************************************
*** Dataframe complete ***
Context and Distractor columns have been added to the
data frame. Please continue with preparing the data for
training. To do so, execute the file:
build_dataset.py
*******************************************************
""")
......@@ -49,7 +49,7 @@ def make_logdir(model_name):
def average_distributed_scalar(scalar):
""" Calculate scalar for loss
Note: distributed setting is not be supported on macOS """
Note: distributed setting is not supported on macOS """
return scalar
......@@ -69,7 +69,6 @@ def train(config_path, data_path):
# load data
train_loader, train_sampler, test_loader, test_sampler = get_data_loaders(data_path, tokenizer, model)
# initiate trainer and evaluator
def update(engine, batch):
""" Calculate loss for language modeling (lm_loss) and next sentence prediction task (mc_loss) """
......@@ -168,5 +167,18 @@ def train(config_path, data_path):
if __name__ == '__main__':
config_path = 'config.yaml'
data_path = '../outputs/context_chats.csv'
train(config_path, data_path)
\ No newline at end of file
data_path = '../outputs2/context-chats.csv'
train(config_path, data_path)
print("""
*******************************************************
*** Congratulations ***
You successfully trained your model. Continue with the
interaction pipeline to chat with it or skip straight
ahead to the evaluation.
*******************************************************
""")
\ No newline at end of file