Skip to content
Snippets Groups Projects
Commit fcac2096 authored by Akhil Thomas's avatar Akhil Thomas
Browse files

adjusting default batch size of example notebook to run in binder

parent 881e319e
No related branches found
Tags 1.0.0
No related merge requests found
%% Cell type:markdown id: tags:
# Example notebook for fine-tuning BERT-based language models on the CG-NER and FG-NER datasets
%% Cell type:markdown id: tags:
## Import necessary packages
The required packages can be installed using the `requirements.txt` file in the repository root. The code is tested with Python 3.8.
%% Cell type:code id: tags:
``` python
import os
import sys
from functools import partial
from pathlib import Path
import numpy as np
import torch
from seqeval.metrics import (
classification_report
)
from argparse import Namespace
import utils
import get_dataset
from transformers import (
AdamW,
AutoConfig,
AutoModelForTokenClassification,
AutoTokenizer,
Trainer,
TrainingArguments,
)
# Absolute path of the current working directory.
PARENT_FOLDER = Path.cwd()
# Extend the module search path with the parent of the working directory.
sys.path += ["../"]
```
%% Cell type:markdown id: tags:
## Check if GPU or CPU is being used
%% Cell type:code id: tags:
``` python
# Pick the compute device: CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    print(f"Number of available GPUs: {gpu_count}")
    # List every visible GPU by index and name.
    for i in range(gpu_count):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("using device:", device)
```
%% Cell type:markdown id: tags:
## Define arguments and other variables
The 'model_name' should be a huggingface model name for the model to be downloaded. Currently, only Matscibert model is implemented.
Also, the 'num_epochs' specifying training epochs is set to 1 by default.
- 'model_name' should be a huggingface model name for the model to be downloaded. Currently, only Matscibert model is implemented
- 'num_epochs' specifying training epochs is set to 1 by default
- a 'batch_size' greater than 1 will probably kill the kernel with an out-of-memory error when running in Binder
%% Cell type:code id: tags:
``` python
# Select the matscibert model and the corresponding training parameters.
# Each keyword must appear exactly once: repeating one (here batch_size)
# in a call is a SyntaxError.
args = dict(
    model_name="matscibert",
    model_save_dir=str(PARENT_FOLDER),
    cache_dir=str(PARENT_FOLDER),
    num_epochs=1,
    # Kept at 1 so that the example notebook can run in Binder.
    batch_size=1,
    seeds=11,
    lm_lrs=4e-4,
    lr_scheduler_type="cosine_with_restarts",
    weight_decay=0.01,
    dataset_name="fatigue",
    model_revision="main",
    max_seq_length=512,
)
# Expose the settings as attributes (args.batch_size, args.model_name, ...).
args = Namespace(**args)
# Pull the settings that are used by bare name further down.
num_epochs = args.num_epochs
max_seq_length = args.max_seq_length
model_revision = args.model_revision
dataset_name = args.dataset_name

# Names of the metrics: the first one is used for best-model selection.
metric_for_best_model = "f1"
other_metric = "accuracy"

# An unset (falsy) path stays None; otherwise it goes through
# utils.ensure_dir (presumably creates the directory and returns its
# path -- confirm against utils).
model_save_dir = None
if args.model_save_dir:
    model_save_dir = utils.ensure_dir(args.model_save_dir)

cache_dir = None
if args.cache_dir:
    cache_dir = utils.ensure_dir(args.cache_dir)

# Sub-directory of the save directory named after the model.
# NOTE(review): os.path.join raises TypeError when model_save_dir is None.
output_dir = utils.ensure_dir(
    os.path.join(model_save_dir, f"{args.model_name}")
)
```
%% Cell type:markdown id: tags:
## Get data from CONLL files in the data folder
Set the subset name to either 'CGNER' for the coarse-granular dataset or 'FGNER' for the fine-granular dataset. In both cases, the NER datasets from the four CoNLL files are combined and randomly split into train, val and test sets. Moreover, in the case of FG-NER, only the 27 most frequent classes are included; all other classes are annotated as O tags. The training data is located inside the `examples/data` folder.
%% Cell type:code id: tags:
``` python
# Load the three splits of the selected dataset/subset.
train, dev, test = get_dataset.get_data(dataset_name, subset='CGNER')

# Token sequences of every split; tags are only kept for the training split.
train_X = train.tokens
train_y = train.tags
val_X, _ = dev.tokens, dev.tags
test_X, _ = test.tokens, test.tags
print(len(train_X), len(val_X), len(test_X))

# Label inventory: every distinct tag seen in the training sentences,
# in sorted order so that the tag <-> index mapping is reproducible.
unique_labels = {label for sent in train_y for label in sent}
label_list = sorted(unique_labels)
print("All labels in Training dataset", label_list)

# Forward and inverse lookup tables between tag strings and indices.
tag2id = {tag: idx for idx, tag in enumerate(label_list)}
id2tag = dict(enumerate(label_list))

num_labels = len(label_list)
print("Number of labels", num_labels)
```
%% Cell type:markdown id: tags:
## Define tokenizer and model
%% Cell type:code id: tags:
``` python
# Keyword arguments forwarded to AutoTokenizer.from_pretrained.
tokenizer_kwargs = dict(
    cache_dir=cache_dir,
    use_fast=True,
    revision=model_revision,
    use_auth_token=None,
    model_max_length=512,
)
# Keyword arguments forwarded to AutoConfig.from_pretrained; num_labels is
# the number of distinct tags found in the training data.
config_kwargs = dict(
    num_labels=num_labels,
    cache_dir=cache_dir,
    revision=model_revision,
    use_auth_token=None,
)
# Define the model: map the short model name to its Hugging Face identifier
# and load the matching tokenizer and configuration.
if args.model_name == "matscibert":
    model_name = "m3rg-iitd/matscibert"
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              **tokenizer_kwargs)
    config = AutoConfig.from_pretrained(model_name, **config_kwargs)
else:
    # Only matscibert is wired up; fail loudly for anything else.
    raise NotImplementedError(
        f"Unsupported model_name {args.model_name!r}; "
        "only 'matscibert' is implemented."
    )

# Load the token-classification model with the configuration built above.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    from_tf=False,
    config=config,
    cache_dir=cache_dir,
    revision=model_revision,
    use_auth_token=None,
)
# Move the model to the selected device (GPU or CPU).
model = model.to(device)
```
%% Cell type:markdown id: tags:
## Create datasets
%% Cell type:code id: tags:
``` python
# Build the train/validation/test datasets from the three splits using the
# tokenizer, the maximum sequence length and the tag -> index mapping.
dataset_splits = utils.create_dataset(
    tokenizer,
    train,
    dev,
    test,
    max_seq_length,
    tag2id,
    dataset_name,
)
train_dataset, val_dataset, test_dataset = dataset_splits
```
%% Cell type:markdown id: tags:
## Define optimizer
%% Cell type:code id: tags:
``` python
# Extra AdamW settings passed as keyword arguments.
optimizer_kwargs = dict(betas=(0.9, 0.999), eps=1e-8)

# A single parameter group: all model parameters share the same learning
# rate and weight decay.
optimizer_grouped_parameters = [
    dict(
        params=list(model.parameters()),
        lr=args.lm_lrs,
        weight_decay=args.weight_decay,
    )
]

# AdamW optimizer
optimizer = AdamW(optimizer_grouped_parameters, **optimizer_kwargs)
```
%% Cell type:markdown id: tags:
## Define training arguments for the trainer
%% Cell type:code id: tags:
``` python
training_args = TrainingArguments(
    # --- Output location ---
    output_dir=output_dir,
    # --- Run length, batching and reproducibility ---
    num_train_epochs=num_epochs,
    # The same per-device batch size is used for training and evaluation.
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    # Seed set at the beginning of training.
    seed=args.seeds,
    # --- Optimisation schedule ---
    # Initial learning rate and weight-decay strength.
    learning_rate=args.lm_lrs,
    weight_decay=args.weight_decay,
    lr_scheduler_type=args.lr_scheduler_type,
    # Fraction of the total training steps spent warming up from 0
    # to learning_rate.
    warmup_ratio=0.1,
    # --- Evaluate, checkpoint and log once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Additionally log the first global step.
    logging_first_step=True,
    # --- Model selection ---
    # Reload the best checkpoint when training finishes; "best" is judged
    # by metric_for_best_model, where larger is better (True for f1,
    # would be False for a loss).
    load_best_model_at_end=True,
    metric_for_best_model=metric_for_best_model,
    greater_is_better=True,
    # Cap on the number of checkpoints kept on disk.
    save_total_limit=1,
)
```
%% Cell type:markdown id: tags:
## Define Trainer
%% Cell type:code id: tags:
``` python
# Assemble the Trainer from the pieces built in the previous cells.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    # Training split for fitting, validation split for evaluation.
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Bind the index -> tag mapping so the metric function can use it.
    compute_metrics=partial(utils.compute_metrics, id2tag=id2tag),
    # Custom optimizer; the scheduler slot is left as None.
    optimizers=(optimizer, None),
)
```
%% Cell type:markdown id: tags:
## Train the model on the training dataset
%% Cell type:code id: tags:
``` python
# Start training from scratch (no checkpoint to resume from).
# NOTE(review): ignore_keys_for_eval is given the tag name "O" -- confirm
# that this is the intended use of the argument.
train_result = trainer.train(
    resume_from_checkpoint=None,
    ignore_keys_for_eval=["O"],
)
print("Training set Result", train_result)
```
%% Cell type:markdown id: tags:
## Evaluate the model on the Validation and Test sets
%% Cell type:code id: tags:
``` python
# Evaluate on the Trainer's own eval dataset; reported metric names are
# prefixed with "eval".
val_result = trainer.evaluate(
    metric_key_prefix="eval",
    ignore_keys=["O"],
)
print("Validation set Result", val_result)
```
%% Cell type:code id: tags:
``` python
# Evaluate on the held-out test dataset; reported metric names are
# prefixed with "test".
test_result = trainer.evaluate(
    eval_dataset=test_dataset,
    metric_key_prefix="test",
    ignore_keys=["O"],
)
print("Test set Result", test_result)
```
%% Cell type:markdown id: tags:
## Classification report of class-wise metrics
%% Cell type:code id: tags:
``` python
# Run inference on the validation dataset.
dev_predictions, dev_labels, metrics = trainer.predict(
    val_dataset,
    ignore_keys=["O"],
    metric_key_prefix="val",
)
# Reduce the raw predictions to one label index per position by taking the
# argmax over axis 2 (assumes the last axis holds per-label scores -- TODO
# confirm).
dev_predictions = np.argmax(dev_predictions, axis=2)

# Convert indices back to tag strings. Positions whose gold label is -100
# are dropped from both the predicted and the gold sequences.
true_predictions_dev = [
    [
        label_list[pred_id]
        for pred_id, gold_id in zip(prediction, label)
        if gold_id != -100
    ]
    for prediction, label in zip(dev_predictions, dev_labels)
]
true_labels_dev = [
    [label_list[gold_id] for gold_id in label if gold_id != -100]
    for label in dev_labels
]

# Sanity check: predicted and gold sequences must line up one-to-one.
assert len(true_predictions_dev) == len(true_labels_dev)
for p, a in zip(true_predictions_dev, true_labels_dev):
    assert len(p) == len(a)

print(
    "Validation classification report",
    classification_report(true_labels_dev, true_predictions_dev),
)
```
%% Cell type:code id: tags:
``` python
# Run inference on the test dataset.
test_predictions, labels, metrics = trainer.predict(
    test_dataset,
    ignore_keys=["O"],
    metric_key_prefix="test",
)
# Reduce the raw predictions to one label index per position by taking the
# argmax over axis 2 (assumes the last axis holds per-label scores -- TODO
# confirm).
test_predictions = np.argmax(test_predictions, axis=2)

# Convert indices back to tag strings. Positions whose gold label is -100
# are dropped from both the predicted and the gold sequences.
true_predictions = [
    [
        label_list[pred_id]
        for pred_id, gold_id in zip(prediction, label)
        if gold_id != -100
    ]
    for prediction, label in zip(test_predictions, labels)
]
true_labels = [
    [label_list[gold_id] for gold_id in label if gold_id != -100]
    for label in labels
]

# Sanity check: predicted and gold sequences must line up one-to-one.
assert len(true_predictions) == len(true_labels)
for p, a in zip(true_predictions, true_labels):
    assert len(p) == len(a)

print(
    "Test classification report",
    classification_report(true_labels, true_predictions),
)
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment