adjusting default batch size of example notebook to run in binder

fcac2096 · Akhil Thomas · 881e319e · fcac2096
Commit fcac2096 authored 1 year ago by Akhil Thomas
--- a/examples/ner_finetuning.ipynb
+++ b/examples/ner_finetuning.ipynb
@@ -86,8 +86,9 @@
   "metadata": {},
   "source": [
    "## Define arguments and other variables\n",
-    "The 'model_name' should be a huggingface model name for the model to be downloaded. Currently, only Matscibert model is implemented.\n",
-    "Also, the 'num_epochs' specifying training epochs is set to 1 by default."
+    "- 'model_name' should be a huggingface model name for the model to be downloaded. Currently, only Matscibert model is implemented\n",
+    "- 'num_epochs' specifying training epochs is set to 1 by default\n",
+    "- 'batch_size' greater than 1 would probably kill the kernel due to memory error in binder"
   ]
  },
  {
@@ -102,7 +103,7 @@
    "    model_save_dir=str(PARENT_FOLDER),\n",
    "    cache_dir=str(PARENT_FOLDER),\n",
    "    num_epochs=1,\n",
-    "    batch_size=32,\n",
+    "    batch_size=1,\n",
    "    seeds=11,\n",
    "    lm_lrs=4e-4,\n",
    "    lr_scheduler_type=\"cosine_with_restarts\",\n",
@@ -485,7 +486,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.18 | packaged by conda-forge | (default, Dec 23 2023, 17:17:17) [MSC v.1929 64 bit (AMD64)]"
+   "version": "3.8.18"
  },
  "vscode": {
   "interpreter": {

 %% Cell type:markdown id: tags:

 # Example notebook for fine-tuning BERT-based language models on the CG-NER and FG-NER datasets

 %% Cell type:markdown id: tags:

 ## Import necessary packages
 The required packages can be installed using the `requirements.txt` file in the repository root. The code is tested with Python 3.8.

 %% Cell type:code id: tags:

 ``` python
 import os
 import sys
 from functools import partial
 from pathlib import Path

 import numpy as np
 import torch

 from seqeval.metrics import (
    classification_report
 )
 from argparse import Namespace
 import utils
 import get_dataset

 from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
 )

 # Add the parent directory to the module search path.
 # NOTE(review): this runs after `import utils` / `import get_dataset`
 # above, so it cannot affect how those imports resolve - confirm intent.
 sys.path.append("../")

 # Absolute path of the current working directory; reused below as the
 # default location for saved models and the download cache.
 PARENT_FOLDER = Path().absolute()
 ```

 %% Cell type:markdown id: tags:

 ## Check if GPU or CPU is being used

 %% Cell type:code id: tags:

 ``` python
 # Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
 cuda_ready = torch.cuda.is_available()
 device = torch.device("cuda" if cuda_ready else "cpu")

 if cuda_ready:
    gpu_count = torch.cuda.device_count()
    print(f"Number of available GPUs: {gpu_count}")
    # List every visible GPU by index and name.
    for i, gpu_name in enumerate(
        map(torch.cuda.get_device_name, range(gpu_count))
    ):
        print(f"GPU {i}: {gpu_name}")

 print("using device:", device)
 ```

 %% Cell type:markdown id: tags:

 ## Define arguments and other variables
-The 'model_name' should be a huggingface model name for the model to be downloaded. Currently, only Matscibert model is implemented.
-Also, the 'num_epochs' specifying training epochs is set to 1 by default.
+- 'model_name' should be a huggingface model name for the model to be downloaded. Currently, only Matscibert model is implemented
+- 'num_epochs' specifying training epochs is set to 1 by default
+- 'batch_size' greater than 1 would probably kill the kernel due to memory error in binder

 %% Cell type:code id: tags:

 ``` python
 # Training configuration for this example: selects the "matscibert" model
 # and the "fatigue" dataset together with the fine-tuning hyperparameters.
 # Both the saved model and the download cache go to PARENT_FOLDER.
 # batch_size is kept at 1 because, per the note above, larger batches are
 # likely to exhaust memory on Binder.
 args = dict(
    model_name="matscibert",
    model_save_dir=str(PARENT_FOLDER),
    cache_dir=str(PARENT_FOLDER),
    num_epochs=1,
-    batch_size=32,
+    batch_size=1,
    seeds=11,
    lm_lrs=4e-4,
    lr_scheduler_type="cosine_with_restarts",
    weight_decay=0.01,
    dataset_name="fatigue",
    model_revision="main",
    max_seq_length=512
 )

 # Wrap the plain dict so the settings can be read as attributes.
 args = Namespace(**args)

 # Short aliases for the settings that later cells use directly.
 dataset_name, model_revision = args.dataset_name, args.model_revision
 max_seq_length, num_epochs = args.max_seq_length, args.num_epochs

 # Metric used to select the best checkpoint, plus a secondary metric name.
 metric_for_best_model, other_metric = "f1", "accuracy"

 # Resolve the optional directories; an empty/None setting stays None.
 # NOTE(review): utils.ensure_dir presumably creates the directory if it is
 # missing and returns its path - confirm in utils.
 model_save_dir = None
 if args.model_save_dir:
    model_save_dir = utils.ensure_dir(args.model_save_dir)

 cache_dir = None
 if args.cache_dir:
    cache_dir = utils.ensure_dir(args.cache_dir)

 # Checkpoints go to a sub-folder named after the model.
 model_subdir = os.path.join(model_save_dir, f"{args.model_name}")
 output_dir = utils.ensure_dir(model_subdir)
 ```

 %% Cell type:markdown id: tags:

 ## Get data from CONLL files in the data folder
 Set the subset name to either 'CGNER' for the coarse-granular dataset or 'FGNER' for the fine-granular dataset. In both cases, the NER datasets from the four CONLL files are combined and randomly sampled into train, val and test splits. Moreover, in the case of FG-NER, only the 27 most frequent classes are included; all other classes are annotated as O tags. The training data is located inside the `examples/data` folder.

 %% Cell type:code id: tags:

 ``` python
 # Load the three splits of the selected dataset for the 'CGNER' subset.
 train, dev, test = get_dataset.get_data(dataset_name, subset='CGNER')
 train_X = train.tokens
 train_y = train.tags
 # Only the tokens of the validation and test splits are kept here.
 val_X, _ = dev.tokens, dev.tags
 test_X, _ = test.tokens, test.tags
 print(len(train_X), len(val_X), len(test_X))

 # Every distinct tag that occurs in the training sentences, in sorted order.
 unique_labels = {tag for sentence in train_y for tag in sentence}
 label_list = sorted(unique_labels)
 print("All labels in Training dataset", label_list)

 # Two-way lookup between tags and their integer ids (position in label_list).
 id2tag = dict(enumerate(label_list))
 tag2id = {tag: idx for idx, tag in id2tag.items()}
 num_labels = len(label_list)
 print("Number of labels", num_labels)
 ```

 %% Cell type:markdown id: tags:

 ## Define tokenizer and model

 %% Cell type:code id: tags:

 ``` python
 # Tokenizer options. The maximum length follows `max_seq_length` so the
 # tokenizer limit stays in sync with the value used to build the datasets
 # instead of duplicating the number here.
 tokenizer_kwargs = {
    "cache_dir": cache_dir,
    "use_fast": True,
    "revision": model_revision,
    "use_auth_token": None,
    "model_max_length": max_seq_length,
 }

 # Model configuration; `num_labels` is the number of distinct tags found in
 # the training split.
 config_kwargs = {
    "num_labels": num_labels,
    "cache_dir": cache_dir,
    "revision": model_revision,
    "use_auth_token": None,
 }

 # Map the short model name from `args` to its Hugging Face hub identifier
 # and load the matching tokenizer and configuration.
 if args.model_name == "matscibert":
    model_name = "m3rg-iitd/matscibert"
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              **tokenizer_kwargs)
    config = AutoConfig.from_pretrained(model_name, **config_kwargs)

 else:
    # Fail with an explicit message rather than a bare exception.
    raise NotImplementedError(
        f"Unsupported model_name {args.model_name!r}; "
        "only 'matscibert' is implemented."
    )

 # Load the pretrained weights with a token-classification head.
 model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    from_tf=False,
    config=config,
    cache_dir=cache_dir,
    revision=model_revision,
    use_auth_token=None,
 )

 # Move the model to the device selected earlier (GPU if available).
 model = model.to(device)
 ```

 %% Cell type:markdown id: tags:

 ## Create datasets

 %% Cell type:code id: tags:

 ``` python
 # Build the train/validation/test datasets from the raw splits.
 # NOTE(review): presumably tokenizes each split up to `max_seq_length` and
 # maps tags to ids via `tag2id` - confirm in `utils.create_dataset`.
 train_dataset, val_dataset, test_dataset = utils.create_dataset(
    tokenizer, train, dev, test, max_seq_length, tag2id, dataset_name
 )
 ```

 %% Cell type:markdown id: tags:

 ## Define optimizer

 %% Cell type:code id: tags:

 ``` python
 # A single parameter group: every model parameter shares the learning rate
 # and weight decay taken from `args`.
 optimizer_grouped_parameters = [
    dict(
        params=list(model.parameters()),
        lr=args.lm_lrs,
        weight_decay=args.weight_decay,
    )
 ]
 # Extra keyword arguments forwarded to the optimizer constructor.
 optimizer_kwargs = dict(betas=(0.9, 0.999), eps=1e-8)
 # AdamW optimizer
 optimizer = AdamW(optimizer_grouped_parameters, **optimizer_kwargs)
 ```

 %% Cell type:markdown id: tags:

 ## Define training arguments for the trainer

 %% Cell type:code id: tags:

 ``` python
 training_args = TrainingArguments(
    # --- Output ---
    # Output directory for the run
    output_dir=output_dir,
    # Limit the number of checkpoints kept on disk to one
    save_total_limit=1,
    # --- Schedule: evaluate, save and log once per epoch ---
    # Total number of training epochs
    num_train_epochs=num_epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Also log the first global step
    logging_first_step=True,
    # --- Batching (per GPU/TPU core/CPU) ---
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    # --- Model selection ---
    # Reload the best checkpoint once training has finished
    load_best_model_at_end=True,
    metric_for_best_model=metric_for_best_model,
    # Higher is better for f1 (this would be False for a loss)
    greater_is_better=True,
    # --- Optimisation ---
    # Initial learning rate for AdamW
    learning_rate=args.lm_lrs,
    # Strength of weight decay
    weight_decay=args.weight_decay,
    # Fraction of all training steps spent on a linear warmup
    # from 0 to learning_rate
    warmup_ratio=0.1,
    # Learning-rate scheduler type
    lr_scheduler_type=args.lr_scheduler_type,
    # --- Reproducibility ---
    # Random seed set at the beginning of training
    seed=args.seeds,
 )
 ```

 %% Cell type:markdown id: tags:

 ## Define Trainer

 %% Cell type:code id: tags:

 ``` python
 # Pre-bind the id -> tag mapping so the metric function only needs the
 # argument the Trainer passes to it.
 metrics_fn = partial(utils.compute_metrics, id2tag=id2tag)

 # The optimizer built above is handed over explicitly; the scheduler slot
 # of the `optimizers` pair is left as None.
 trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=metrics_fn,
    optimizers=(optimizer, None),
 )
 ```

 %% Cell type:markdown id: tags:

 ## Train the model on the training dataset

 %% Cell type:code id: tags:

 ``` python
 # Start fine-tuning; no earlier checkpoint is resumed.
 # NOTE(review): `ignore_keys_for_eval` is documented as a list of keys in the
 # model's output dict to skip during evaluation, not label tags - confirm
 # that passing "O" here has the intended effect.
 train_result = trainer.train(
    ignore_keys_for_eval=["O"], resume_from_checkpoint=None
 )

 # Show the result object returned by the training run.
 print("Training set Result", train_result)
 ```

 %% Cell type:markdown id: tags:

 ## Evaluate the model on the Validation and Test sets

 %% Cell type:code id: tags:

 ``` python
 # Score the fine-tuned model on the validation dataset that was given to
 # the Trainer; metric names are prefixed with "eval".
 val_result = trainer.evaluate(metric_key_prefix="eval", ignore_keys=["O"])
 print("Validation set Result", val_result)
 ```

 %% Cell type:code id: tags:

 ``` python
 # Same evaluation on the test dataset; metric names are prefixed with "test".
 test_result = trainer.evaluate(
    eval_dataset=test_dataset,
    metric_key_prefix="test",
    ignore_keys=["O"],
 )
 print("Test set Result", test_result)
 ```

 %% Cell type:markdown id: tags:

 ## Classification report of class-wise metrics

 %% Cell type:code id: tags:

 ``` python
 # Model outputs, gold label ids and metrics for the validation dataset.
 dev_predictions, dev_labels, metrics = trainer.predict(
    val_dataset, metric_key_prefix="val", ignore_keys=["O"]
 )
 # Pick the highest-scoring label id along the last axis.
 dev_predictions = np.argmax(dev_predictions, axis=2)

 # Translate ids back to tag strings, skipping positions labelled -100
 # (presumably the ignore marker for tokens without a label - confirm in utils).
 true_predictions_dev = []
 true_labels_dev = []
 for pred_row, gold_row in zip(dev_predictions, dev_labels):
    kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
    true_predictions_dev.append([label_list[p] for p, _ in kept])
    true_labels_dev.append([label_list[g] for _, g in kept])

 # Sanity checks: one tag sequence per sentence, aligned token by token.
 assert len(true_predictions_dev) == len(true_labels_dev)
 assert all(
    len(p) == len(a)
    for p, a in zip(true_predictions_dev, true_labels_dev)
 )

 dev_report = classification_report(true_labels_dev, true_predictions_dev)
 print("Validation classification report", dev_report)
 ```

 %% Cell type:code id: tags:

 ``` python
 # Model outputs, gold label ids and metrics for the test dataset.
 test_predictions, labels, metrics = trainer.predict(
    test_dataset, metric_key_prefix="test", ignore_keys=["O"]
 )
 # Pick the highest-scoring label id along the last axis.
 test_predictions = np.argmax(test_predictions, axis=2)

 # Translate ids back to tag strings, skipping positions labelled -100
 # (presumably the ignore marker for tokens without a label - confirm in utils).
 true_predictions = []
 true_labels = []
 for pred_row, gold_row in zip(test_predictions, labels):
    kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
    true_predictions.append([label_list[p] for p, _ in kept])
    true_labels.append([label_list[g] for _, g in kept])

 # Sanity checks: one tag sequence per sentence, aligned token by token.
 assert len(true_predictions) == len(true_labels)
 assert all(
    len(p) == len(a)
    for p, a in zip(true_predictions, true_labels)
 )

 test_report = classification_report(true_labels, true_predictions)
 print("Test classification report", test_report)
 ```