AI Model Efficiency Toolkit (AIMET) Forum

Is setting model.eval() required while using QuantizationSimModel in PyTorch?

Dear AIMET Researchers,

In my experiments with QuantizationSimModel, I found that the quantized accuracy differs significantly depending on whether model.eval() is called. For example, with model.eval(), ResNet18 on the CIFAR-10 dataset reaches an accuracy of 0.9474 under the 31a31w quantization setting (31-bit activations, 31-bit weights). Without model.eval(), the same model and dataset reach only 0.8588 under the same quantization setting. The original FP32 model accuracy is 0.9463.

My question is: which setting is correct when using QuantizationSimModel? I hope you can share your experience with me. Thank you for your attention!

P.S. The code I wrote is attached below:

from __future__ import division
import numpy as np
import timm
import PIL
import numpy as np
from tqdm import tqdm
import copy
import torchvision
import torch
from torchvision import transforms

# Quantization related import
from aimet_torch.quantsim import QuantizationSimModel

from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

# Path of the checkpoint that the __main__ block loads into the model.
MODEL_PATH = '../model_pth/cifar_resnet.pth' # resnet18's pth

# Size of the classifier head requested from timm (CIFAR-10 has 10 classes).
NUM_FINETUNE_CLASSES = 10

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Preprocessing applied to every test image: resize to 224x224 with bicubic
# interpolation, convert to a tensor, then normalize with the ImageNet
# mean/std constants imported from timm.
transform = transforms.Compose(
[transforms.Resize((224, 224), interpolation=PIL.Image.BICUBIC),
  transforms.ToTensor(),
  transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)])
# Batch size shared by the loader below and by evaluate_model's sample-count math.
batch_size = 512
# CIFAR-10 test split (train=False); downloaded to ../data if not already present.
testset = torchvision.datasets.CIFAR10(root='../data', train=False,
                                    download=True, transform=transform)
# Deterministic (unshuffled) loader over the whole test split.
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                      shuffle=False, num_workers=32)

def evaluate_model(model: torch.nn.Module, eval_iterations: int, use_cuda: bool = True) -> float:
  """
  Compute the top-1 accuracy of ``model`` on the module-level ``testset``.

  This is intended to be the user-defined model evaluation function.
  AIMET requires the above signature. So if the user's eval function does not
  match this signature, please create a simple wrapper.

  Note: Honoring the number of iterations is not absolutely necessary.
  However if all evaluations run over an entire epoch of validation data,
  the runtime for AIMET compression will obviously be higher.

  Note: this function does NOT change the model's train/eval mode.
  ``torch.no_grad()`` only disables gradient tracking; it does not put
  mode-dependent layers such as BatchNorm or Dropout into inference behavior.
  The caller must call ``model.eval()`` beforehand if inference behavior is
  wanted.

  :param model: Model to evaluate
  :param eval_iterations: Number of iterations (batches of ``batch_size``
          samples) to use for evaluation, capped at the size of the test set.
          None for entire epoch.
  :param use_cuda: If true, move the model and each batch to the module-level
          ``device`` (which is itself the CPU when CUDA is unavailable)
  :return: single float number (accuracy) representing model's performance;
          0.0 when no samples were evaluated
  """
  if eval_iterations is not None:
    num_sample_data = min(eval_iterations * batch_size, len(testset))
  else:
    num_sample_data = len(testset)

  # Evaluate on the first ``num_sample_data`` test samples only.
  subdataset = torch.utils.data.Subset(testset, range(num_sample_data))
  subtestloader = torch.utils.data.DataLoader(subdataset, batch_size=batch_size,
                                              shuffle=False, num_workers=32)
  if use_cuda:
    model.to(device)

  correct = 0
  total = 0
  # since we're not training, we don't need to calculate the gradients for our outputs
  with torch.no_grad():
    for data in tqdm(subtestloader, total=len(subtestloader)):
      if use_cuda:
        images, labels = data[0].to(device), data[1].to(device)
      else:
        images, labels = data
      # calculate outputs by running images through the network
      outputs = model(images)
      # the class with the highest energy is what we choose as prediction
      predicted = outputs.argmax(dim=1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

  if total == 0:
    # Nothing was evaluated (e.g. eval_iterations == 0); avoid dividing by zero.
    return 0.0
  return correct / total

def quantize_model(model, output_bw: int, param_bw: int):
  """
  Compare simulated-quantization accuracy with and without ``model.eval()``.

  First evaluates and prints the accuracy of ``model`` itself. Then, for each
  of two runs, deep-copies ``model``, wraps the copy in a
  ``QuantizationSimModel``, computes encodings using ``evaluate_model`` as the
  forward-pass callback (5 iterations), evaluates the simulated model
  (20 iterations) and prints the result. The first run leaves the copy in
  whatever train/eval mode ``model`` is in; the second run calls ``.eval()``
  on the copy before wrapping it.

  :param model: Model to copy and quantize
  :param output_bw: Bit-width passed as ``default_output_bw``
  :param param_bw: Bit-width passed as ``default_param_bw``
  :return: None; the accuracies are only printed
  """
  fp32_accuracy = evaluate_model(model, 20, True)
  print ("FP32 model accuracy: %f" % (fp32_accuracy))

  # (call .eval() on the copy?, report format) for each experiment, in run order.
  experiments = (
    (False, "Without model.eval(), Output bit: %d, params bit: %d, model accuracy: %f"),
    (True, "With model.eval(), Output bit: %d, params bit: %d, model accuracy: %f"),
  )

  for switch_to_eval, report_format in experiments:
    candidate = copy.deepcopy(model)
    if switch_to_eval:
      candidate.eval()

    example_input = torch.rand(1, 3, 224, 224).to(device)
    sim = QuantizationSimModel(
      candidate,
      default_output_bw=output_bw,
      default_param_bw=param_bw,
      dummy_input=example_input,
    )
    sim.compute_encodings(forward_pass_callback=evaluate_model, forward_pass_callback_args=5)

    quantized_accuracy = evaluate_model(sim.model, 20, True)
    print (report_format % (output_bw, param_bw, quantized_accuracy))

if __name__ == '__main__':
  # Build a ResNet-18 with a 10-class head and place it on the selected device.
  model = timm.create_model('resnet18', pretrained=True, num_classes=NUM_FINETUNE_CLASSES).to(device)
  # map_location remaps the checkpoint's tensors onto ``device`` so that a
  # checkpoint saved on a GPU can still be loaded on a CPU-only machine.
  model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
  # Run the comparison with 31-bit outputs (activations) and 31-bit parameters.
  quantize_model(model, 31, 31)

Best regards,
Edan

@edan840216 We do call model.eval() before performing quantization simulation. This puts the model in evaluation (inference) mode, which changes the behavior of certain types of modules, such as BatchNorm.