Fine-tuning a transformer model allows for adapting a pretrained model to a specific task using domain-specific data. This process involves preparing the dataset, configuring training parameters, and executing the fine-tuning process using popular machine learning frameworks such as Hugging Face Transformers and PyTorch.
import os
import torch
import numpy as np
from datasets import load_dataset, concatenate_datasets
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from collections import Counter
# Environment setup
# Disable the tokenizers library's internal parallelism to avoid the
# fork-related deadlock warnings it emits when used together with DataLoader workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Silence the Hugging Face Hub warning about symlink support (noisy on
# platforms such as Windows where symlinks may be unavailable).
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
def compute_metrics(pred):
    """Compute accuracy, F1, precision, and recall for a Trainer prediction.

    Args:
        pred: EvalPrediction-like object exposing ``label_ids`` (ground-truth
            labels) and ``predictions`` (per-class logits).

    Returns:
        Dict mapping metric names ('accuracy', 'f1', 'precision', 'recall')
        to float values.
    """
    y_true = pred.label_ids
    # Logits -> predicted class index
    y_pred = pred.predictions.argmax(-1)
    # zero_division=0 avoids warnings/NaNs when one class is never predicted
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary', zero_division=0
    )
    # Diagnostic: fixing labels=[0, 1] keeps the matrix 2x2 even if the
    # batch happens to contain a single label
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(f"Confusion matrix:\n{cm}")
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
def prepare_dataset(dataset, tokenizer):
    """Tokenize a text dataset and format it for PyTorch training.

    Args:
        dataset: Dataset with "text" and "label" columns.
        tokenizer: Hugging Face tokenizer used to encode the text.

    Returns:
        Tokenized dataset containing only "input_ids", "attention_mask",
        and "labels" columns, formatted as PyTorch tensors.
    """
    def _encode(batch):
        # Pad/truncate every example to a fixed 128-token window
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
        )

    # Tokenize in batches for speed
    tokenized = dataset.map(_encode, batched=True)

    # Drop every column the model does not consume (e.g. the raw text),
    # making sure the label column survives
    wanted = {"input_ids", "attention_mask", "label"}
    surplus = [name for name in tokenized.column_names if name not in wanted]
    tokenized = tokenized.remove_columns(surplus)

    # The Trainer expects the target column to be called "labels"
    tokenized = tokenized.rename_column("label", "labels")
    # Emit PyTorch tensors on access
    tokenized.set_format("torch")
    return tokenized
def main():
    """Fine-tune DistilBERT for binary sentiment classification on IMDB.

    Pipeline: load a balanced 6,000-example subset of IMDB, tokenize it,
    fine-tune distilbert-base-uncased for 3 epochs, evaluate on a held-out
    20% split, run a few sample predictions, and save the model and
    tokenizer to ./fine_tuned_model.
    """
    try:
        # Model and tokenizer initialization
        model_name = "distilbert-base-uncased"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Explicitly set up the model for binary classification; the label
        # maps make predictions human-readable (0 -> NEGATIVE, 1 -> POSITIVE)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            id2label={0: "NEGATIVE", 1: "POSITIVE"},
            label2id={"NEGATIVE": 0, "POSITIVE": 1}
        )
        # Load the dataset once and filter per class, so the subset is
        # guaranteed to contain both positive and negative examples
        print("Loading dataset...")
        full_dataset = load_dataset("imdb", split='train+test')
        negative_examples = full_dataset.filter(lambda example: example["label"] == 0).select(range(3000))
        positive_examples = full_dataset.filter(lambda example: example["label"] == 1).select(range(3000))
        # Verify the datasets
        print(f"Negative examples loaded: {len(negative_examples)}")
        print(f"Positive examples loaded: {len(positive_examples)}")
        # Combine and shuffle so the later train/validation split stays balanced
        dataset = concatenate_datasets([negative_examples, positive_examples])
        dataset = dataset.shuffle(seed=42)
        # Print initial distribution to verify
        print("Combined dataset size:", len(dataset))
        label_counts = Counter(dataset["label"])
        print(f"Combined label distribution: Negative (0): {label_counts[0]}, Positive (1): {label_counts[1]}")
        # Prepare dataset
        print("Preparing dataset...")
        tokenized_dataset = prepare_dataset(dataset, tokenizer)
        # Split into train and validation (80/20, fixed seed for reproducibility)
        dataset_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
        train_dataset = dataset_split['train']
        val_dataset = dataset_split['test']
        # Check label distribution for debugging
        train_labels = train_dataset["labels"].tolist()
        val_labels = val_dataset["labels"].tolist()
        print(f"Training set size: {len(train_dataset)}")
        print(f"Validation set size: {len(val_dataset)}")
        print(f"Training label distribution: 0 (Negative): {train_labels.count(0)}, 1 (Positive): {train_labels.count(1)}")
        print(f"Validation label distribution: 0 (Negative): {val_labels.count(0)}, 1 (Positive): {val_labels.count(1)}")
        # Training arguments
        training_args = TrainingArguments(
            output_dir="./results",
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            logging_dir="./logs",
            save_total_limit=2,
            load_best_model_at_end=True,  # restore the best-F1 checkpoint after training
            metric_for_best_model="f1",
            fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
            logging_steps=100,
        )
        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
        )
        # Fine-tune the model
        print("Starting model training...")
        trainer.train()
        # Evaluate the model
        print("Evaluating model...")
        results = trainer.evaluate()
        print("Evaluation Results:", results)
        # Test on a few examples
        print("\nTesting on a few examples:")
        test_texts = [
            "This movie was absolutely terrible. I hated every minute of it.",
            "What a masterpiece! One of the best films I've ever seen.",
            "The acting was mediocre, but the story was compelling.",
            "I was disappointed with the ending, but overall it was good."
        ]
        # Tokenize and prepare test examples
        test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
        # BUGFIX: the Trainer may have moved the model to a GPU; the input
        # tensors must live on the same device or the forward pass fails.
        test_encodings = {key: tensor.to(model.device) for key, tensor in test_encodings.items()}
        # BUGFIX: switch to eval mode so dropout is disabled during inference.
        model.eval()
        # Get predictions
        with torch.no_grad():
            outputs = model(**test_encodings)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_classes = torch.argmax(predictions, dim=-1)
        # Print results
        for i, text in enumerate(test_texts):
            pred_class = "POSITIVE" if predicted_classes[i] == 1 else "NEGATIVE"
            confidence = predictions[i][predicted_classes[i]].item()
            print(f"Text: {text}")
            print(f"Prediction: {pred_class}, Confidence: {confidence:.4f}\n")
        # Save the fine-tuned model and tokenizer
        print("Saving model...")
        trainer.save_model("./fine_tuned_model")
        tokenizer.save_pretrained("./fine_tuned_model")
        print("Model and tokenizer saved successfully.")
    except Exception as e:
        # Top-level boundary: report the error with a full traceback instead
        # of letting the script die silently.
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
    main()

Loading dataset...
Filter: 100%
 50000/50000 [00:00<00:00, 232050.10 examples/s]
Filter: 100%
 50000/50000 [00:00<00:00, 284615.42 examples/s]
Negative examples loaded: 3000
Positive examples loaded: 3000
Combined dataset size: 6000
Combined label distribution: Negative (0): 3000, Positive (1): 3000
Preparing dataset...
Map: 100%
 6000/6000 [00:03<00:00, 1784.51 examples/s]
Training set size: 4800
Validation set size: 1200
Training label distribution: 0 (Negative): 2407, 1 (Positive): 2393
Validation label distribution: 0 (Negative): 593, 1 (Positive): 607
Starting model training...
[900/900 45:23, Epoch 3/3]
| Epoch | Training Loss | Validation Loss | Accuracy | F1 | Precision | Recall |
|---|---|---|---|---|---|---|
| 1 | 0.367100 | 0.309218 | 0.880833 | 0.883834 | 0.871795 | 0.896211 |
| 2 | 0.257600 | 0.306283 | 0.878333 | 0.882637 | 0.861852 | 0.904448 |
| 3 | 0.156500 | 0.369149 | 0.885000 | 0.886513 | 0.885057 | 0.887974 |
Confusion matrix:
[[513  80]
 [ 63 544]]
Confusion matrix:
[[505  88]
 [ 58 549]]
Confusion matrix:
[[523  70]
 [ 68 539]]
Evaluating model...
[75/75 00:56]
Confusion matrix:
[[523 70]
[ 68 539]]
Evaluation Results: {'eval_loss': 0.3691490888595581, 'eval_accuracy': 0.885, 'eval_f1': 0.8865131578947368, 'eval_precision': 0.8850574712643678, 'eval_recall': 0.8879736408566722, 'eval_runtime': 56.974, 'eval_samples_per_second': 21.062, 'eval_steps_per_second': 1.316, 'epoch': 3.0}
Testing on a few examples:
Text: This movie was absolutely terrible. I hated every minute of it.
Prediction: NEGATIVE, Confidence: 0.9897
Text: What a masterpiece! One of the best films I've ever seen.
Prediction: POSITIVE, Confidence: 0.9916
Text: The acting was mediocre, but the story was compelling.
Prediction: NEGATIVE, Confidence: 0.9233
Text: I was disappointed with the ending, but overall it was good.
Prediction: POSITIVE, Confidence: 0.9581
Saving model...
Model and tokenizer saved successfully.
This script is a clean, well-structured pipeline for binary sentiment classification using HuggingFace Transformers and the IMDB dataset, including:
- **Imports** — `os`, `torch`, `numpy`: standard libraries for OS control, PyTorch tensors, and numerical operations. `datasets`: for loading and manipulating the IMDB dataset. `transformers`: Hugging Face's Transformers library for tokenization, modeling, and training tools. `sklearn.metrics`: for evaluation metrics such as accuracy, precision, recall, F1, and the confusion matrix. `Counter`: to count label distributions.
- **`compute_metrics` function** — called by the `Trainer` to evaluate model performance. It extracts `label_ids` and `predictions` from the model outputs and computes metrics via `precision_recall_fscore_support`.
- **`prepare_dataset` function** — tokenizes the `"text"` field with padding and truncation (`max_length=128`), renames the `"label"` column to `"labels"` (required by the Hugging Face `Trainer`), and sets the output format to PyTorch tensors with `set_format("torch")`.
- **`main()` function** — the `id2label` and `label2id` dictionaries are used for interpretability. The IMDB dataset is loaded with `load_dataset("imdb")`; negative examples (`label == 0`) and positive examples (`label == 1`) are selected separately, and the label distribution is verified with `Counter` before proceeding to make sure it is balanced. `prepare_dataset()` is then called to tokenize and format the dataset, which is split into an 80% training set and a 20% validation set. The `TrainingArguments` object configures how the model is trained:
- `output_dir`: where to save results. `eval_strategy`, `save_strategy`: evaluate and save every epoch. `learning_rate`: 2e-5 (standard for BERT-style models). `batch_size`: 16 for both train and eval. `num_train_epochs`: 3 full passes over the dataset. `fp16`: uses half precision if a GPU is available.
- The `Trainer` object handles training and evaluation, using `compute_metrics()` for custom evaluation. `trainer.train()` runs the full training loop, and `trainer.evaluate()` evaluates the final model on the validation set, printing metrics and the confusion matrix. `argmax` is used to determine the predicted class on the sample texts.
- `trainer.save_model()` and `tokenizer.save_pretrained()` save the fine-tuned model and tokenizer locally for reuse or deployment. A `try`-`except` block catches and prints any errors along with their tracebacks.

The output reflects a complete and successful execution of a binary sentiment classification pipeline using DistilBERT on a balanced subset of the IMDB dataset. The initial warning about uninitialized weights pertains to the classification head layers (classifier and pre_classifier) of the model, which are expected to be randomly initialized when fine-tuning DistilBERT for a downstream task. This is standard behavior and does not indicate any issue.
The dataset loading phase confirms that 3,000 positive and 3,000 negative examples were correctly extracted and combined. The combined dataset was then shuffled and tokenized using the specified tokenizer. After preprocessing, the data was split into a training set (4,800 samples) and a validation set (1,200 samples), with both maintaining a near-equal class distribution, indicating that the class balance was preserved.
During training, the model was fine-tuned over three epochs. The training and validation metrics show consistent performance across epochs. The training loss decreased with each epoch, while the validation loss remained within a reasonable range. The final evaluation yielded strong performance metrics: an accuracy of 88.5%, an F1 score of 0.8865, a precision of 0.8851, and a recall of 0.8880. These metrics suggest that the model generalizes well and maintains balanced performance across both classes.
Confusion matrices from multiple stages indicate that the model consistently predicted both classes with minimal bias. The majority of predictions were correct, with a relatively balanced number of false positives and false negatives, further validating the model’s reliability.
The script also tested the model on four unseen example texts. The predictions were correct and showed high confidence scores, demonstrating the model’s effectiveness in real-world inference scenarios.
Finally, the trained model and tokenizer were saved successfully to the specified directory. This allows for future use without retraining and supports deployment or further fine-tuning.
Overall, the output confirms that the entire training, evaluation, and saving pipeline functioned as intended and produced a robust sentiment classification model.