import os
import sys
import time
import random
from collections import UserDict

import torch
from datasets import load_dataset
from packaging.version import Version
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer

from neural_compressor.common import logger
from neural_compressor.torch.utils import get_torch_version, is_hpex_available


class DataloaderPreprocessor: |
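    """Collect a fixed-size calibration set by truncating or filtering batches from an existing dataloader."""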
|
|
    def __init__(self, dataloader_original, use_max_length=False, max_seq_length=2048, nsamples=128) -> None:
        self.dataloader_original = dataloader_original
        self.use_max_length = use_max_length
        self.max_seq_length = max_seq_length
        self.nsamples = nsamples
        self.dataloader = []
        self.is_ready = False

    def get_prepared_dataloader(self):
        if not self.is_ready:
            self.prepare_dataloader()
        return self.dataloader

    def prepare_dataloader(self):
        if self.use_max_length:
            self.obtain_first_n_samples_fulllength()
        else:
            self.obtain_first_n_samples()
        self.is_ready = True

    def obtain_first_n_samples(self, seed=0):
        """Take the first nsamples batches as the calibration dataset, truncating over-long sequences."""
        self.dataloader.clear()
        random.seed(seed)
        for batch in self.dataloader_original:
            if len(self.dataloader) == self.nsamples:
                logger.info(f"Successfully collected {self.nsamples} calibration samples.")
                break
            if isinstance(batch, (list, tuple)):
                if batch[0].shape[-1] > self.max_seq_length:
                    i = random.randint(0, batch[0].shape[-1] - self.max_seq_length - 1)
                    j = i + self.max_seq_length
                    batch_final = []
                    for item in batch:
                        if isinstance(item, torch.Tensor) and item.ndim == 2:
                            batch_final.append(item[:, i:j])
                        else:
                            batch_final.append(item)
                else:
                    batch_final = batch[:]
            elif isinstance(batch, dict):
                try:
                    length = batch["input_ids"].shape[-1]
                except Exception:
                    logger.warning("Please make sure your dict-like data contains the key 'input_ids'.")
                    continue
                batch_final = {}
                if length > self.max_seq_length:
                    i = random.randint(0, length - self.max_seq_length - 1)
                    j = i + self.max_seq_length
                    for key in batch.keys():
                        if isinstance(batch[key], torch.Tensor):
                            batch_final[key] = batch[key][:, i:j]
                        else:
                            batch_final[key] = batch[key]
                else:
                    batch_final = batch
            else:
                if batch.shape[-1] > self.max_seq_length:
                    i = random.randint(0, batch.shape[-1] - self.max_seq_length - 1)
                    j = i + self.max_seq_length
                    batch_final = batch[:, i:j]
                else:
                    batch_final = batch
            self.dataloader.append(batch_final)
        if len(self.dataloader) < self.nsamples:
            logger.warning(f"Requested {self.nsamples} calibration samples, but the dataset only provides {len(self.dataloader)}.")

    def obtain_first_n_samples_fulllength(self, seed=0):
        """Take the first nsamples batches whose sequence length is exactly max_seq_length, skipping shorter ones."""
        self.dataloader.clear()
        random.seed(seed)
        unified_length = self.max_seq_length
        for batch in self.dataloader_original:
            if len(self.dataloader) == self.nsamples:
                logger.info(f"Successfully collected {self.nsamples} calibration samples.")
                break
            if isinstance(batch, (list, tuple)):
                if batch[0].shape[-1] == unified_length:
                    batch_final = batch[:]
                elif batch[0].shape[-1] > unified_length:
                    i = random.randint(0, batch[0].shape[-1] - unified_length - 1)
                    j = i + unified_length
                    batch_final = []
                    for item in batch:
                        if isinstance(item, torch.Tensor) and item.ndim == 2:
                            batch_final.append(item[:, i:j])
                        else:
                            batch_final.append(item)
                else:
                    # The sequence is shorter than the target length; skip it.
                    continue
            elif isinstance(batch, dict):
                try:
                    length = batch["input_ids"].shape[-1]
                except Exception:
                    logger.warning("Please make sure your dict-like data contains the key 'input_ids'.")
                    continue
                batch_final = {}
                if length == self.max_seq_length:
                    batch_final = batch
                elif length > self.max_seq_length:
                    i = random.randint(0, length - self.max_seq_length - 1)
                    j = i + self.max_seq_length
                    for key in batch.keys():
                        if isinstance(batch[key], torch.Tensor):
                            batch_final[key] = batch[key][:, i:j]
                        else:
                            batch_final[key] = batch[key]
                else:
                    continue
            else:
                if batch.shape[-1] == unified_length:
                    batch_final = batch
                elif batch.shape[-1] > unified_length:
                    i = random.randint(0, batch.shape[-1] - unified_length - 1)
                    j = i + unified_length
                    batch_final = batch[:, i:j]
                else:
                    continue
            self.dataloader.append(batch_final)
        if len(self.dataloader) < self.nsamples:
            logger.warning(
                f"Tried to collect {self.nsamples} samples with fixed length {unified_length}, "
                f"but only {len(self.dataloader)} were found. Please use a smaller 'max_seq_length' value."
            )
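
# Minimal usage sketch for DataloaderPreprocessor (not exercised in this script;
# `raw_dataloader` is a hypothetical calibration dataloader, not defined here):
#
#   preprocessor = DataloaderPreprocessor(raw_dataloader, use_max_length=True, max_seq_length=2048)
#   calib_samples = preprocessor.get_prepared_dataloader()
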
def get_example_inputs(model, dataloader):
    """Derive example inputs for model preparation from the first batch of a dataloader."""
    version = get_torch_version()
    from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device

    if dataloader is None:
        return None
    device = next(model.parameters()).device
    try:
        # Dataloaders that yield (input, label) pairs.
        for idx, (input, label) in enumerate(dataloader):
            input = move_input_to_device(input, device)
            if isinstance(input, (dict, UserDict)):
                assert version.release >= Version("1.12.0").release, "INC supports IPEX version >= 1.12.0"
                if "label" in input.keys():
                    input.pop("label")
                if version.release <= Version("2.0.1").release:
                    return tuple(input.values())
                else:
                    return dict(input)
            if isinstance(input, (list, tuple)):
                return tuple(input)
            if isinstance(input, torch.Tensor):
                return input
            break
    except Exception:
        # Dataloaders that yield the input alone.
        for idx, input in enumerate(dataloader):
            input = move_input_to_device(input, device)
            if isinstance(input, (dict, UserDict)):
                assert version.release >= Version("1.12.0").release, "INC supports IPEX version >= 1.12.0"
                if "label" in input.keys():
                    input.pop("label")
                if version.release <= Version("2.0.1").release:
                    return tuple(input.values())
                else:
                    return dict(input)
            if isinstance(input, (list, tuple)):
                return tuple(input)
            if isinstance(input, torch.Tensor):
                return input
            break
    if idx == 0:
        assert False, "Please check the example_inputs format."


class Args: |
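    """Hard-coded stand-in for the command-line arguments of this example script."""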
|
|
model = "meta-llama/Llama-3.2-1B" |
|
|
trust_remote_code = True |
|
|
revision = None |
|
|
dataset = "neuralmagic/LLM_compression_calibration" |
|
|
output_dir = "meta-llama_Llama-3.2-1B-TEQ-int4-gs128-sym" |
|
|
quantize = True |
|
|
seed = 42 |
|
|
load = False |
|
|
accuracy = False |
|
|
performance = False |
|
|
iters = 100 |
|
|
batch_size = 1 |
|
|
pad_max_length = 512 |
|
|
calib_iters = 512 |
|
|
tasks = "lambada_openai,hellaswag,winogrande,piqa" |
|
|
peft_model_id = None |
|
|
|
|
|
|
|
|
woq_algo = "TEQ" |
|
|
woq_bits = 4 |
|
|
woq_dtype = "int" |
|
|
woq_group_size = 128 |
|
|
woq_group_dim = 1 |
|
|
woq_scheme = "sym" |
|
|
woq_use_mse_search = False |
|
|
woq_use_full_range = False |
|
|
quant_lm_head = True |
|
|
use_hf_format = False |
|
|
|
|
|
|
|
|
    use_auto_scale = False
    use_auto_clip = False
    folding = False
    absorb_layer_dict = {}

    # Double-quantization settings (quantizing the quantization scales themselves).
    double_quant_type = None
    double_quant_dtype = "fp32"
    double_quant_bits = 8
    double_quant_use_sym = True
    double_quant_group_size = 256

args = Args() |
|
|
calib_size = 1

# Run on Habana Gaudi (HPU) when the Habana PyTorch bridge (hpex) is available; otherwise use CPU.
if is_hpex_available():
    import habana_frameworks.torch.core as htcore

    htcore.hpu_set_inference_env()
    device = "hpu"
else:
    device = "cpu"


def get_user_model(): |
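    """Load the FP32 model and tokenizer; TorchScript support is enabled for AWQ/TEQ."""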
|
|
    torchscript = False
    if args.woq_algo in ["AWQ", "TEQ"]:
        # AWQ and TEQ trace the model, so it must be loaded with TorchScript enabled.
        torchscript = True
    user_model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torchscript=torchscript,
        trust_remote_code=args.trust_remote_code,
        revision=args.revision,
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    user_model = user_model.float()
    user_model = user_model.to(memory_format=torch.channels_last)
    user_model.eval()
    return user_model, tokenizer


def calib_func(prepared_model): |
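    """Run forward passes over the calibration dataloader to drive calibration on the prepared model."""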
|
|
    for i, calib_input in enumerate(calib_dataloader):
        if i > args.calib_iters:
            break
        prepared_model(calib_input[0])


if args.quantize:
    user_model, tokenizer = get_user_model()
    calib_dataset = load_dataset(args.dataset, split="train")
    calib_dataset = calib_dataset.shuffle(seed=args.seed)

class Evaluator: |
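        """Tokenize and pad the dataset, and collate padded input_ids batches for calibration."""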
|
|
        def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False):
            self.dataset = dataset
            self.tokenizer = tokenizer
            self.batch_size = batch_size
            self.pad_val = pad_val
            self.pad_max = pad_max
            self.is_calib = is_calib
            # Tokenize once up front and expose only the input_ids column as tensors.
            self.dataset = self.dataset.map(self.tokenize_function, batched=True)
            self.dataset.set_format(type="torch", columns=["input_ids"])

        @torch.no_grad()
        def tokenize_function(self, examples):
            if args.woq_algo in ["TEQ"]:
                # TEQ calibration uses fixed-length inputs, so pad every example to pad_max.
                if self.tokenizer.pad_token is None:
                    self.tokenizer.pad_token = self.tokenizer.eos_token
                example = self.tokenizer(examples["text"], padding="max_length", max_length=self.pad_max)
            else:
                example = self.tokenizer(examples["text"])
            return example

        @torch.no_grad()
        def collate_batch(self, batch):
            input_ids_padded = []
            last_ind = []
            for text in batch:
                input_ids = text["input_ids"]
                last_ind.append(input_ids.shape[0] - 1)
                # Truncate over-long sequences first, then right-pad to exactly pad_max.
                # (Computing pad_len before truncation would produce a negative pad and shrink the sequence.)
                input_ids = input_ids[: self.pad_max] if len(input_ids) > self.pad_max else input_ids
                pad_len = self.pad_max - input_ids.shape[0]
                input_ids = torch.nn.functional.pad(input_ids, (0, pad_len), value=self.pad_val)
                input_ids_padded.append(input_ids)
            return (torch.vstack(input_ids_padded), torch.tensor(last_ind))

    calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True)
    calib_dataloader = DataLoader(
        calib_evaluator.dataset,
        batch_size=calib_size,
        shuffle=False,
        collate_fn=calib_evaluator.collate_batch,
    )

    from neural_compressor.torch.quantization import TEQConfig, convert, prepare

    weight_sym = args.woq_scheme == "sym"
    quant_config = TEQConfig(
        dtype=args.woq_dtype,
        bits=args.woq_bits,
        use_sym=weight_sym,
        group_size=args.woq_group_size,
        group_dim=args.woq_group_dim,
        folding=args.folding,
        quant_lm_head=args.quant_lm_head,
    )

    # TEQ's prepare step traces the model, so give it example inputs of the calibration shape.
    example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long)
    run_fn = calib_func

    # Standard INC 3.x flow: prepare the model, run calibration, then convert to the quantized model.
    user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
    run_fn(user_model)
    user_model = convert(user_model)

    os.makedirs(args.output_dir, exist_ok=True)
    print("Saving weight-only quantized model to", args.output_dir)
    if args.use_hf_format:
        # Hugging Face format also saves the tokenizer so the checkpoint is self-contained.
        user_model.save(args.output_dir, format="huggingface")
        tokenizer.save_pretrained(args.output_dir)
    else:
        user_model.save(args.output_dir)
    print("Saved weight-only quantized model.")
else:
    print("Quantization not enabled. Exiting.")