diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py
index 5f0b9140c..8b6da5b0b 100644
--- a/QEfficient/cloud/compile.py
+++ b/QEfficient/cloud/compile.py
@@ -85,29 +85,17 @@
 parser.add_argument(
     "--enable_qnn",
     "--enable-qnn",
-    nargs="?",
-    const=True,
-    type=str,
+    action="store_true",
     default=False,
     help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
     If not provided, the default configuration will be used.\
     Sample Config: QEfficient/compile/qnn_config.json",
 )
-
-args, compiler_options = parser.parse_known_args()
-
-if isinstance(args.enable_qnn, str):
-    args.qnn_config = args.enable_qnn
-    args.enable_qnn = True
-
-compiler_options_dict = {}
-for i in range(0, len(compiler_options)):
-    if compiler_options[i].startswith("--"):
-        key = compiler_options[i].lstrip("-").replace("-", "_")
-        value = (
-            compiler_options[i + 1]
-            if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
-            else True
-        )
-        compiler_options_dict[key] = value
-QEfficient.compile(**args.__dict__, **compiler_options_dict)
+parser.add_argument(
+    "qnn_config",
+    nargs="?",
+    type=str,
+)
+# FIXME(ochougul): Allow extra compilation arguments
+args = parser.parse_args()
+QEfficient.compile(**vars(args))
diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py
index c440e73c0..f312d00cb 100644
--- a/QEfficient/cloud/finetune.py
+++ b/QEfficient/cloud/finetune.py
@@ -7,7 +7,6 @@
 
 import random
 import warnings
-from typing import Any, Dict, Optional, Union
 
 import fire
 import numpy as np
@@ -18,9 +17,8 @@
 import torch.utils.data
 from peft import PeftModel, get_peft_model
 from torch.optim.lr_scheduler import StepLR
-from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
 
-from QEfficient.finetune.configs.training import TrainConfig
+from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG
 from QEfficient.finetune.utils.config_utils import (
     generate_dataset_config,
     generate_peft_config,
@@ -34,81 +32,52 @@
 from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
 from QEfficient.utils._utils import login_and_download_hf_lm
 
-# Try importing QAIC-specific module, proceed without it if unavailable
 try:
     import torch_qaic  # noqa: F401
 except ImportError as e:
-    print(f"Warning: {e}. Proceeding without QAIC modules.")
+    print(f"Warning: {e}. Moving ahead without these qaic modules.")
 
-from transformers import AutoModelForSequenceClassification
+from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 
 # Suppress all warnings
 warnings.filterwarnings("ignore")
 
 
-def setup_distributed_training(train_config: TrainConfig) -> None:
-    """Initialize distributed training environment if enabled.
-
-    Args:
-        train_config (TrainConfig): Training configuration object.
-
-    Notes:
-        - If distributed data parallel (DDP) is disabled, this function does nothing.
-        - Ensures the device is not CPU and does not specify an index for DDP compatibility.
-        - Initializes the process group using the specified distributed backend.
-
-    Raises:
-        AssertionError: If device is CPU or includes an index with DDP enabled.
+def main(**kwargs):
     """
-    if not train_config.enable_ddp:
-        return
+    Helper function to finetune the model on QAic.
- torch_device = torch.device(train_config.device) - assert torch_device.type != "cpu", "Host doesn't support single-node DDP" - assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}" + .. code-block:: bash - dist.init_process_group(backend=train_config.dist_backend) - # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank - getattr(torch, torch_device.type).set_device(dist.get_rank()) + python -m QEfficient.cloud.finetune OPTIONS + """ + # update the configuration for the training process + train_config = TRAIN_CONFIG() + update_config(train_config, **kwargs) + dataset_config = generate_dataset_config(train_config, kwargs) + device = train_config.device -def setup_seeds(seed: int) -> None: - """Set random seeds across libraries for reproducibility. + # dist init + if train_config.enable_ddp: + # TODO: may have to init qccl backend, next try run with torchrun command + torch_device = torch.device(device) + assert torch_device.type != "cpu", "Host doesn't support single-node DDP" + assert torch_device.index is None, ( + f"DDP requires specification of device type only, however provided device index as well: {torch_device}" + ) + dist.init_process_group(backend=train_config.dist_backend) + # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank + getattr(torch, torch_device.type).set_device(dist.get_rank()) - Args: - seed (int): Seed value to set for random number generators. + # Set the seeds for reproducibility + torch.manual_seed(train_config.seed) + random.seed(train_config.seed) + np.random.seed(train_config.seed) - Notes: - - Sets seeds for PyTorch, Python's random module, and NumPy. - """ - torch.manual_seed(seed) - random.seed(seed) - np.random.seed(seed) - - -def load_model_and_tokenizer( - train_config: TrainConfig, dataset_config: Any, peft_config_file: str, **kwargs -) -> tuple[AutoModelForCausalLM, AutoTokenizer]: - """Load the pre-trained model and tokenizer from Hugging Face. - - Args: - config (TrainConfig): Training configuration object containing model and tokenizer names. - dataset_config (Any): A dataclass object representing dataset configuration. - peft_config_file (str): Path to PEFT config file used for PEFT finetuning. - kwargs: Additional arguments to override PEFT config. - - Returns: - tuple: A tuple of two values. - - Model with pretrained weights loaded. - - Model's tokenizer (AutoTokenizer). - - Notes: - - Downloads the model if not already cached using login_and_download_hf_lm. - - Configures the model with FP16 precision and disables caching for training. - - Resizes model embeddings if tokenizer vocab size exceeds model embedding size. - - Sets pad_token_id to eos_token_id if not defined in the tokenizer. 
- """ + # Load the pre-trained model and setup its configuration + # config = AutoConfig.from_pretrained(train_config.model_name) pretrained_model_path = login_and_download_hf_lm(train_config.model_name) if train_config.task_type == "seq_classification": model = AutoModelForSequenceClassification.from_pretrained( @@ -135,6 +104,7 @@ def load_model_and_tokenizer( torch_dtype=torch.float16, ) + # Load the tokenizer and add special tokens tokenizer = AutoTokenizer.from_pretrained( train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name ) @@ -144,12 +114,14 @@ def load_model_and_tokenizer( # If there is a mismatch between tokenizer vocab size and embedding matrix, # throw a warning and then expand the embedding matrix if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: - print("WARNING: Resizing embedding matrix to match tokenizer vocab size.") + print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.") model.resize_token_embeddings(len(tokenizer)) - # FIXME (Meet): Cover below line inside the logger once it is implemented. print_model_size(model, train_config) + # print the datatype of the model parameters + # print(get_parameter_dtypes(model)) + # Note: Need to call this before calling PeftModel.from_pretrained or get_peft_model. # Because, both makes model.is_gradient_checkpointing = True which is used in peft library to # apply gradient checkpointing related hooks to the input embeddings. Without this we will get @@ -162,70 +134,17 @@ def load_model_and_tokenizer( else: raise RuntimeError("Given model doesn't support gradient checkpointing. Please disable it and run it.") - model = apply_peft(model, train_config, peft_config_file, **kwargs) - - return model, tokenizer - - -def apply_peft( - model: AutoModel, train_config: TrainConfig, peft_config_file: Dict, **kwargs -) -> Union[AutoModel, PeftModel]: - """Apply Parameter-Efficient Fine-Tuning (PEFT) to the model if enabled. - - Args: - model (AutoModel): Huggingface model. - train_config (TrainConfig): Training configuration object. - peft_config_file (str, optional): Path to YAML/JSON file containing - PEFT (LoRA) config. Defaults to None. - kwargs: Additional arguments to override PEFT config params. + if train_config.use_peft: + # Load the pre-trained peft model checkpoint and setup its configuration + if train_config.from_peft_checkpoint: + model = PeftModel.from_pretrained(model, train_config.from_peft_checkpoint, is_trainable=True) + peft_config = model.peft_config + # Generate the peft config and start fine-tuning from original model + else: + peft_config = generate_peft_config(train_config, kwargs) + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() - Returns: - Union[AutoModel, PeftModel]: If the use_peft in train_config is True - then PeftModel object is returned else original model object - (AutoModel) is returned. 
- """ - if not train_config.use_peft: - return model - - # Load the pre-trained peft model checkpoint and setup its configuration - if train_config.from_peft_checkpoint: - model = PeftModel.from_pretrained(model, train_config.from_peft_checkpoint, is_trainable=True) - peft_config = model.peft_config - # Generate the peft config and start fine-tuning from original model - else: - peft_config = generate_peft_config(train_config, peft_config_file, **kwargs) - model = get_peft_model(model, peft_config) - model.print_trainable_parameters() - - return model - - -def setup_dataloaders( - train_config: TrainConfig, - dataset_config: Any, - tokenizer: AutoTokenizer, -) -> tuple[torch.utils.data.DataLoader, Optional[torch.utils.data.DataLoader], int]: - """Set up training and validation DataLoaders. - - Args: - train_config (TrainConfig): Training configuration object. - dataset_config (Any): Configuration for the dataset (generated from train_config). - tokenizer (AutoTokenizer): Tokenizer for preprocessing data. - - Returns: - tuple: A tuple of three values. - - First value represents train_dataloader - - Second value represents eval_dataloader. It is None if - validation is disabled. - - Length of longest sequence in the dataset. - - Raises: - ValueError: If validation is enabled but the validation set is too small. - - Notes: - - Applies a custom data collator if provided by get_custom_data_collator. - - Configures DataLoader kwargs using get_dataloader_kwargs for train and val splits. - """ # Get the dataset utils dataset_processer = tokenizer @@ -245,8 +164,6 @@ def setup_dataloaders( ## train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train") print("length of dataset_train", len(dataset_train)) - - # FIXME (Meet): Add custom data collator registration from the outside by the user. custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config) if custom_data_collator: print("custom_data_collator is used") @@ -291,66 +208,40 @@ def setup_dataloaders( else: longest_seq_length, _ = get_longest_seq_length(train_dataloader.dataset) - return train_dataloader, eval_dataloader, longest_seq_length - - -def main(peft_config_file: str = None, **kwargs) -> None: - """ - Fine-tune a model on QAIC hardware with configurable training and LoRA parameters. - - Args: - peft_config_file (str, optional): Path to YAML/JSON file containing PEFT (LoRA) config. Defaults to None. - kwargs: Additional arguments to override TrainConfig. - - Example: - .. 
code-block:: bash - - # Using a YAML config file for PEFT - python -m QEfficient.cloud.finetune \\ - --model_name "meta-llama/Llama-3.2-1B" \\ - --lr 5e-4 \\ - --peft_config_file "lora_config.yaml" - - # Using default LoRA config - python -m QEfficient.cloud.finetune \\ - --model_name "meta-llama/Llama-3.2-1B" \\ - --lr 5e-4 - """ - train_config = TrainConfig() - update_config(train_config, **kwargs) - dataset_config = generate_dataset_config(train_config.dataset) - update_config(dataset_config, **kwargs) - - setup_distributed_training(train_config) - setup_seeds(train_config.seed) - model, tokenizer = load_model_and_tokenizer(train_config, dataset_config, peft_config_file, **kwargs) - - # Create DataLoaders for the training and validation dataset - train_dataloader, eval_dataloader, longest_seq_length = setup_dataloaders(train_config, dataset_config, tokenizer) print( f"The longest sequence length in the train data is {longest_seq_length}, " f"passed context length is {train_config.context_length} and overall model's context length is " f"{model.config.max_position_embeddings}" ) - model.to(train_config.device) - optimizer = optim.AdamW(model.parameters(), lr=train_config.lr, weight_decay=train_config.weight_decay) + optimizer = optim.AdamW( + model.parameters(), + lr=train_config.lr, + weight_decay=train_config.weight_decay, + ) scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma) + + # wrap model with DDP if train_config.enable_ddp: model = nn.parallel.DistributedDataParallel(model, device_ids=[dist.get_rank()]) - results = train( + + _ = train( model, - tokenizer, train_dataloader, eval_dataloader, + tokenizer, optimizer, scheduler, + train_config.gradient_accumulation_steps, train_config, + train_config.device, dist.get_rank() if train_config.enable_ddp else None, + None, ) + + # finalize torch distributed if train_config.enable_ddp: dist.destroy_process_group() - return results if __name__ == "__main__": diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 30e67344a..68be72fa8 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -197,10 +197,6 @@ def main( **kwargs, ) - # If the io-encrypt flag is passed we will exit after QPC generation. 
- if kwargs.get("io_encrypt", None): - exit() - ######### # Execute ######### diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index 70a912cd7..5ce22bed9 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -64,6 +64,9 @@ def compile_kv_model_on_cloud_ai_100( DeprecationWarning, stacklevel=2, ) + if kwargs: + # FIXME + raise NotImplementedError("Can't handle extra compilation args now!") aic_binary_dir = os.path.join(base_path, "qpcs") if os.path.isdir(aic_binary_dir): @@ -108,13 +111,6 @@ def compile_kv_model_on_cloud_ai_100( with open(mdp_ts_config_path, "w") as file: json.dump(mdp_ts_config, file, indent=4) command.append(f"-mdp-load-partition-config={mdp_ts_config_path}") - for key, value in kwargs.items(): - option = "-" + key.replace("_", "-") - if isinstance(value, bool): - if value: - command.append(option) - continue - command.append(f"{option}={value}") print("Running AI 100 compiler:", " ".join(command)) result = subprocess.run(command, capture_output=True, text=True) if result.returncode != 0: @@ -225,13 +221,6 @@ def compile( allow_mxint8_mdp_io=allow_mxint8_mdp_io, mos=mos, device_group=device_group, - **kwargs, ) - if kwargs.get("io_encrypt", None): - logger.warning( - f"Compilation for IO-Encrypt has been successfully completed at path: {qpc_path}. However, Efficient-Transformers do not support IO-Encrypt execution. Please run the execution separately" - ) - else: - logger.info(f"Compiled QPC files can be found here: {qpc_path}") - + logger.info(f"Compiled QPC files can be found here: {qpc_path}") return qpc_path diff --git a/QEfficient/finetune/configs/peft_config.py b/QEfficient/finetune/configs/peft_config.py index a47774500..e2d018f05 100644 --- a/QEfficient/finetune/configs/peft_config.py +++ b/QEfficient/finetune/configs/peft_config.py @@ -9,24 +9,15 @@ from typing import List +# Currently, the support is for Lora Configs only +# In future, we can expand to llama_adapters and prefix tuning +# TODO: vbaddi: Check back once FSDP is enabled @dataclass -class LoraConfig: - """LoRA-specific configuration for parameter-efficient fine-tuning. - - Attributes: - r (int): LoRA rank (default: 8). - lora_alpha (int): LoRA scaling factor (default: 32). - target_modules (List[str]): Modules to apply LoRA to (default: ["q_proj", "v_proj"]). - bias (str): Bias handling in LoRA (default: "none"). - task_type (str): Task type for LoRA (default: "CAUSAL_LM"). - lora_dropout (float): Dropout rate for LoRA (default: 0.0). - inference_mode (bool): Whether model is in inference mode (default: False). - """ - +class lora_config: r: int = 8 lora_alpha: int = 32 target_modules: List[str] = field(default_factory=lambda: ["q_proj", "v_proj"]) - bias: str = "none" + bias = "none" task_type: str = "CAUSAL_LM" lora_dropout: float = 0.05 inference_mode: bool = False # should be False for finetuning @@ -34,6 +25,6 @@ class LoraConfig: # CAUTION prefix tuning is currently not supported @dataclass -class PrefixConfig: +class prefix_config: num_virtual_tokens: int = 30 task_type: str = "CAUSAL_LM" diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py index 69b083b6a..c50954c4c 100644 --- a/QEfficient/finetune/configs/training.py +++ b/QEfficient/finetune/configs/training.py @@ -7,54 +7,8 @@ from dataclasses import dataclass -# Configuration Classes @dataclass -class TrainConfig: - """Training configuration for model fine-tuning. 
-
-    Attributes:
-        model_name (str): Name of the pre-trained model to fine-tune (default: "meta-llama/Llama-3.2-1B").
-        tokenizer_name (str): Name of the tokenizer (defaults to model_name if None).
-        run_validation (bool): Whether to run validation during training (default: True).
-        batch_size_training (int): Batch size for training (default: 1).
-        context_length (Optional[int]): Maximum sequence length for inputs (default: None).
-        gradient_accumulation_steps (int): Steps for gradient accumulation (default: 4).
-        gradient checkpointing (bool): Enable gradient checkpointing to save the memory by compromising the speed. (default: False).
-        num_epochs (int): Number of training epochs (default: 1).
-        max_train_step (int): Maximum training steps (default: 0, unlimited if 0).
-        max_eval_step (int): Maximum evaluation steps (default: 0, unlimited if 0).
-        device (str): Device to train on (default: "qaic").
-        num_workers_dataloader (int): Number of workers for data loading (default: 1).
-        lr (float): Learning rate (default: 3e-4).
-        weight_decay (float): Weight decay for optimizer (default: 0.0).
-        gamma (float): Learning rate decay factor (default: 0.85).
-        seed (int): Random seed for reproducibility (default: 42).
-        use_fp16 (bool): Use mixed precision training (default: True).
-        use_autocast (bool): Use autocast for mixed precision (default: True).
-        val_batch_size (int): Batch size for validation (default: 1).
-        dataset (str): Dataset name for training (default: "samsum_dataset").
-        task_type (str): Type of task for which the finetuning is to be done. Options: "generation" and "seq_classification". (default: "generation")
-        peft_method (str): Parameter-efficient fine-tuning method (default: "lora").
-        use_peft (bool): Whether to use PEFT (default: True).
-        from_peft_checkpoint (str): Path to PEFT checkpoint (default: "").
-        output_dir (str): Directory to save outputs (default: "meta-llama-samsum").
-        num_freeze_layers (int): Number of layers to freeze (default: 1).
-        one_qaic (bool): Use single QAIC device (default: False).
-        save_model (bool): Save the trained model (default: True).
-        save_metrics (bool): Save training metrics (default: True).
-        intermediate_step_save (int): Steps between intermediate saves (default: 1000).
-        batching_strategy (str): Batching strategy (default: "packing").
-        enable_sorting_for_ddp (bool): Sort data for DDP (default: True).
-        convergence_counter (int): Steps to check convergence (default: 5).
-        convergence_loss (float): Loss threshold for convergence (default: 1e-4).
-        use_profiler (bool): Enable profiling (default: False).
-        enable_ddp (bool): Enable distributed data parallel (default: False).
-        dist_backend (str): Backend for distributed training (default: "cpu:gloo,qaic:qccl,cuda:gloo").
-        grad_scaler (bool): Use gradient scaler (default: True).
-        dump_root_dir (str): Directory for mismatch dumps (default: "meta-llama-samsum-mismatches/step_").
-        opByOpVerifier (bool): Enable operation-by-operation verification (default: False).
- """ - +class train_config: model_name: str = "meta-llama/Llama-3.2-1B" tokenizer_name: str = None # if not passed as an argument, it uses the value of model_name run_validation: bool = True diff --git a/QEfficient/finetune/eval.py b/QEfficient/finetune/eval.py index 3fe6e0d81..918230554 100644 --- a/QEfficient/finetune/eval.py +++ b/QEfficient/finetune/eval.py @@ -11,6 +11,7 @@ import fire import numpy as np import torch +from configs.training import train_config as TRAIN_CONFIG from peft import AutoPeftModelForCausalLM from transformers import AutoModelForCausalLM, AutoTokenizer from utils.config_utils import ( @@ -24,8 +25,6 @@ ) from utils.train_utils import evaluation, print_model_size -from QEfficient.finetune.configs.training import TrainConfig - try: import torch_qaic # noqa: F401 @@ -40,7 +39,7 @@ def main(**kwargs): # update the configuration for the training process - train_config = TrainConfig() + train_config = TRAIN_CONFIG() update_config(train_config, **kwargs) # Set the seeds for reproducibility diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py index c5c7fe615..e979961d6 100644 --- a/QEfficient/finetune/utils/config_utils.py +++ b/QEfficient/finetune/utils/config_utils.py @@ -4,39 +4,27 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + import inspect -import json -import os from dataclasses import asdict -from typing import Any, Dict import torch.distributed as dist import torch.utils.data as data_utils -import yaml from peft import ( AdaptionPromptConfig, + LoraConfig, PrefixTuningConfig, ) -from peft import LoraConfig as PeftLoraConfig from transformers.data import DataCollatorForSeq2Seq import QEfficient.finetune.configs.dataset_config as datasets -from QEfficient.finetune.configs.peft_config import LoraConfig, PrefixConfig -from QEfficient.finetune.configs.training import TrainConfig +from QEfficient.finetune.configs.peft_config import lora_config, prefix_config +from QEfficient.finetune.configs.training import train_config from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC def update_config(config, **kwargs): - """Update the attributes of a config object based on provided keyword arguments. - - Args: - config: The configuration object (e.g., TrainConfig, LoraConfig) or a list/tuple of such objects. - **kwargs: Keyword arguments representing attributes to update. - - Raises: - ValueError: If an unknown parameter is provided and the config type doesn't support nested updates. - """ if isinstance(config, (tuple, list)): for c in config: update_config(c, **kwargs) @@ -45,73 +33,40 @@ def update_config(config, **kwargs): if hasattr(config, k): setattr(config, k, v) elif "." in k: - config_name, param_name = k.split(".", 1) - if type(config).__name__.lower() == config_name.lower(): + # allow --some_config.some_param=True + config_name, param_name = k.split(".") + if type(config).__name__ == config_name: if hasattr(config, param_name): setattr(config, param_name, v) else: - raise ValueError(f"Config '{config_name}' does not have parameter: '{param_name}'") - else: - config_type = type(config).__name__ - # FIXME (Meet): Once logger is available put this in debug level. 
- print(f"[WARNING]: Unknown parameter '{k}' for config type '{config_type}'") + # In case of specialized config we can warn user + assert False, f"Warning: {config_name} does not accept parameter: {k}" + elif isinstance(config, train_config): + assert False, f"Warning: unknown parameter {k}" -def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None, **kwargs) -> Any: - """Generate a PEFT-compatible configuration from a custom config based on peft_method. +def generate_peft_config(train_config, kwargs): + configs = (lora_config, prefix_config) + peft_configs = (LoraConfig, AdaptionPromptConfig, PrefixTuningConfig) + names = tuple(c.__name__.rstrip("_config") for c in configs) - Args: - train_config (TrainConfig): Training configuration with peft_method. - custom_config: Custom configuration object (e.g., LoraConfig). + if train_config.peft_method not in names: + raise RuntimeError(f"Peft config not found: {train_config.peft_method}") - Returns: - Any: A PEFT-specific configuration object (e.g., PeftLoraConfig). + config = configs[names.index(train_config.peft_method)]() - Raises: - RuntimeError: If the peft_method is not supported. - """ - if peft_config_file: - peft_config_data = load_config_file(peft_config_file) - validate_config(peft_config_data, config_type="lora") - peft_config = PeftLoraConfig(**peft_config_data) - else: - config_map = { - "lora": (LoraConfig, PeftLoraConfig), - "prefix": (PrefixConfig, PrefixTuningConfig), - "adaption_prompt": (None, AdaptionPromptConfig), - } - - if train_config.peft_method not in config_map: - raise RuntimeError(f"Peft config not found: {train_config.peft_method}") - - config_cls, peft_config_cls = config_map[train_config.peft_method] - if config_cls is None: - params = kwargs - else: - config = config_cls() - update_config(config, **kwargs) - params = asdict(config) + update_config(config, **kwargs) + params = asdict(config) + peft_config = peft_configs[names.index(train_config.peft_method)](**params) - peft_config = peft_config_cls(**params) return peft_config -def generate_dataset_config(dataset_name: str) -> Any: - """Generate a dataset configuration based on the specified dataset. - - Args: - dataset_name (str): Name of the dataset to be used for finetuning. - - Returns: - Any: A dataset configuration object. - - Raises: - AssertionError: If the dataset name is not recognized. - """ - supported_datasets = DATASET_PREPROC.keys() - assert dataset_name in supported_datasets, f"Given dataset '{dataset_name}' is not supported." - # FIXME (Meet): Replace below logic by creating using auto registry of datasets. - dataset_config = {k: v for k, v in inspect.getmembers(datasets)}[dataset_name]() +def generate_dataset_config(train_config, kwargs): + names = tuple(DATASET_PREPROC.keys()) + assert train_config.dataset in names, f"Unknown dataset: {train_config.dataset}" + dataset_config = {k: v for k, v in inspect.getmembers(datasets)}[train_config.dataset]() + update_config(dataset_config, **kwargs) return dataset_config @@ -143,84 +98,3 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, mode): kwargs["drop_last"] = True kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer) return kwargs - - -def validate_config(config_data: Dict[str, Any], config_type: str = "lora") -> None: - """Validate the provided YAML/JSON configuration for required fields and types. - - Args: - config_data (Dict[str, Any]): The configuration dictionary loaded from YAML/JSON. 
- config_type (str): Type of config to validate ("lora" for LoraConfig, default: "lora"). - - Raises: - ValueError: If required fields are missing or have incorrect types. - FileNotFoundError: If the config file path is invalid (handled upstream). - - Notes: - - Validates required fields for LoraConfig: r, lora_alpha, target_modules. - - Ensures types match expected values (int, float, list, etc.). - """ - if config_type.lower() != "lora": - raise ValueError(f"Unsupported config_type: {config_type}. Only 'lora' is supported.") - - required_fields = { - "r": int, - "lora_alpha": int, - "target_modules": list, - } - optional_fields = { - "bias": str, - "task_type": str, - "lora_dropout": float, - "inference_mode": bool, - } - - # Check for missing required fields - missing_fields = [field for field in required_fields if field not in config_data] - if missing_fields: - raise ValueError(f"Missing required fields in {config_type} config: {missing_fields}") - - # Validate types of required fields - for field, expected_type in required_fields.items(): - if not isinstance(config_data[field], expected_type): - raise ValueError( - f"Field '{field}' in {config_type} config must be of type {expected_type.__name__}, " - f"got {type(config_data[field]).__name__}" - ) - - # Validate target_modules contains strings - if not all(isinstance(mod, str) for mod in config_data["target_modules"]): - raise ValueError("All elements in 'target_modules' must be strings") - - # Validate types of optional fields if present - for field, expected_type in optional_fields.items(): - if field in config_data and not isinstance(config_data[field], expected_type): - raise ValueError( - f"Field '{field}' in {config_type} config must be of type {expected_type.__name__}, " - f"got {type(config_data[field]).__name__}" - ) - - -def load_config_file(config_path: str) -> Dict[str, Any]: - """Load a configuration from a YAML or JSON file. - - Args: - config_path (str): Path to the YAML or JSON file. - - Returns: - Dict[str, Any]: The loaded configuration as a dictionary. - - Raises: - FileNotFoundError: If the file does not exist. - ValueError: If the file format is unsupported. - """ - if not os.path.exists(config_path): - raise FileNotFoundError(f"Config file not found: {config_path}") - - with open(config_path, "r") as f: - if config_path.endswith(".yaml") or config_path.endswith(".yml"): - return yaml.safe_load(f) - elif config_path.endswith(".json"): - return json.load(f) - else: - raise ValueError("Unsupported config file format. 
Use .yaml, .yml, or .json") diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index 8693ae32d..2bc701008 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -18,7 +18,7 @@ from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm -from QEfficient.finetune.configs.training import TrainConfig +from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG try: import torch_qaic # noqa: F401 @@ -34,31 +34,34 @@ def train( model, - tokenizer, train_dataloader, eval_dataloader, + tokenizer, optimizer, lr_scheduler, - train_config: TrainConfig, + gradient_accumulation_steps, + train_config: TRAIN_CONFIG, + device, local_rank=None, + rank=None, ): """ Trains the model on the given dataloader Args: model: The model to be trained - tokenizer: tokenizer used in the eval for decoding the predicitons train_dataloader: The dataloader containing the training data - eval_dataloader: The dataloader containing the eval data optimizer: The optimizer used for training lr_scheduler: The learning rate scheduler - train_config: The training configuration + gradient_accumulation_steps: The number of steps to accumulate gradients before performing a backward/update operation + num_epochs: The number of epochs to train for local_rank: The rank of the current node in a distributed setting + train_config: The training configuration + eval_dataloader: The dataloader containing the eval data + tokenizer: tokenizer used in the eval for decoding the predicitons Returns: results dictionary containing average training and validation perplexity and loss """ - device = train_config.device - train_metric = [] train_loss = [] val_metric = [] @@ -458,7 +461,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): # Print evaluation metrics print(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") - return eval_epoch_loss, eval_metric, val_step_loss, val_step_metric + return eval_metric, eval_epoch_loss, val_step_loss, val_step_metric def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index 70601489d..58b837e9c 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -560,9 +560,16 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower - def forward(self, pixel_values): + def forward(self, input_ids, pixel_values): + inputs_embeds = self.model.get_input_embeddings()(input_ids) + B, N, C = inputs_embeds.shape image_features = self.model.get_image_features(pixel_values=pixel_values) - return image_features + selected = input_ids == self.model.config.image_token_index + indices1 = selected.to(torch.int64).cumsum(1) - 1 + indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1) + image_features_expanded = image_features.reshape(-1, C).unsqueeze(0)[indices0, indices1] + image_input_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds) + return image_input_embeds class QEffGemma3DecoderWrapper(nn.Module): @@ -572,21 +579,14 @@ def __init__(self, model): self.language_model = self.model.language_model self.config = self.model.config - def forward(self, input_ids, vision_embeds, position_ids, index, past_key_values): - inputs_embeds = 
self.model.get_input_embeddings()(input_ids) - B, N, C = inputs_embeds.shape - selected = input_ids == self.model.config.image_token_index - indices1 = selected.to(torch.int64).cumsum(1) - 1 - indices1 = torch.where(indices1 != -1, indices1 + index, indices1) - indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1) - image_features_expanded = vision_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1] - image_input_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds) - inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) + def forward(self, input_ids, vision_embeds, position_ids, past_key_values): + image_embeds = vision_embeds[:, : input_ids.shape[1], :] + inputs_embeds = self.model.language_model.get_input_embeddings()(input_ids) + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_embeds) outputs = self.model.language_model( inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, use_cache=True ) - index = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) - return outputs.logits, vision_embeds, index, outputs.past_key_values + return outputs.logits, vision_embeds, outputs.past_key_values class QEffGemma3ForConditionalGeneration(Gemma3ForConditionalGeneration): @@ -605,6 +605,11 @@ def get_specializations( kv_offload: bool = False, **compiler_options, ): + vision_seq_len = compiler_options.pop("vision_seq_len", None) + if vision_seq_len is None: + # TODO: Check properly for Gemma3, Not verified yet. + vision_seq_len = 512 # for Gemma3 Vision feature shape is (1, 4096, 1152) --> 1152 is hidden size) + prefill_seq_len = prefill_seq_len if prefill_seq_len else 32 ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN if img_size is None and hasattr(self.config.vision_config, "image_size"): @@ -612,13 +617,12 @@ def get_specializations( elif img_size is None: img_size = 896 # FIXME based on gemma3 Image size logger.warning("Setting img_size to be 336, as it was neither passed nor found in vision_config") - mm_tokens_per_image = getattr(self.config, "mm_tokens_per_image", 256) vision = [ { "batch_size": batch_size, "img_size": img_size, - "seq_len": prefill_seq_len, + "seq_len": vision_seq_len, "ctx_len": ctx_len, } ] @@ -628,14 +632,14 @@ def get_specializations( "seq_len": prefill_seq_len, "ctx_len": ctx_len, "img_size": img_size, - "mm_tokens_per_image": mm_tokens_per_image, + "chunk_length": prefill_seq_len, }, { "batch_size": batch_size, "seq_len": "1", "ctx_len": ctx_len, "img_size": img_size, - "mm_tokens_per_image": mm_tokens_per_image, + "chunk_length": prefill_seq_len, }, ] @@ -654,8 +658,9 @@ def get_onnx_dynamic_axes(self, kv_offload: bool = False): lang_dynamic_axes = {} lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} - lang_dynamic_axes["vision_embeds"] = {0: "batch_size", 1: "mm_tokens_per_image"} + lang_dynamic_axes["vision_embeds"] = {0: "batch_size", 1: "chunk_length"} vision_dynamic_axes["pixel_values"] = {0: "batch_size", 2: "img_size", 3: "img_size"} + vision_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"} for i in range(self.language_model.config.num_hidden_layers): @@ -680,7 +685,6 @@ def get_output_names(self, kv_offload: bool = False): output_names = {} if kv_offload: lang_output_names.insert(1, "vision_embeds_RetainedState") - lang_output_names.insert(2, 
"index_output") output_names["vision"] = vision_output_names output_names["lang"] = lang_output_names else: @@ -694,13 +698,12 @@ def get_dummy_inputs(self, kv_offload: bool = False): else: img_size = 896 - mm_tokens_per_image = getattr(self.config, "mm_tokens_per_image", 256) # Define shapes inputs_shapes = {} inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) inputs_shapes["vision_embeds"] = ( 1, # constants.INTERN_NUM_PATCHES, - mm_tokens_per_image, # constants.INTERN_FEATURE_SIZE, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, # constants.INTERN_FEATURE_SIZE, self.language_model.config.hidden_size, # 5120 ) inputs_shapes["position_ids"] = ( @@ -713,12 +716,12 @@ def get_dummy_inputs(self, kv_offload: bool = False): img_size, img_size, ) - inputs_shapes["index"] = (1, 1) # Define inputs vision_inputs = {} lang_inputs = {} vision_inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32) + vision_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=torch.float32) lang_inputs["position_ids"] = ( @@ -726,7 +729,7 @@ def get_dummy_inputs(self, kv_offload: bool = False): .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) - lang_inputs["index"] = torch.zeros((inputs_shapes["index"]), dtype=torch.int64) + # Add data for KV kv_cache_shape = get_padding_shape_from_config( config=self.language_model.config, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index ebfd529cc..1a9610187 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -751,8 +751,8 @@ def kv_offload_generate( input_len = inputs["attention_mask"].sum(1, keepdims=True) input_ids_length = inputs["input_ids"].shape[1] num_chunks = -(input_ids_length // -prefill_seq_len) # ceil divide without float - padded_len = num_chunks * prefill_seq_len # Convert to a multiple of prompt_len - + # padded_len = num_chunks * prefill_seq_len # Convert to a multiple of prompt_len + padded_len = vision_session.bindings[vision_session.binding_index_map["input_ids"]].dims[1] if generation_len is None: generation_len = ctx_len - input_len.max() assert generation_len > 0, "generation length should be greater than zero" @@ -783,11 +783,13 @@ def kv_offload_generate( } vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + vision_inputs["input_ids"] = inputs["input_ids"] vision_start = perf_counter() vision_outputs = vision_session.run(vision_inputs) vision_end = perf_counter() lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} + lang_inputs["input_ids"] = inputs["input_ids"] lang_inputs["position_ids"] = np.where( lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid tokens @@ -795,27 +797,25 @@ def kv_offload_generate( vision_session.deactivate() lang_session.activate() lang_inputs["vision_embeds"] = vision_outputs["vision_embeds"] - lang_session.set_buffers(vision_outputs) + # lang_session.set_buffers(vision_outputs) prefill_start = perf_counter() # Run prefill - chunk_inputs = lang_inputs.copy() - chunk_inputs["index"] = np.array([[0]]) for i in range(num_chunks): + chunk_inputs = 
lang_inputs.copy() chunk_inputs["input_ids"] = lang_inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] chunk_inputs["position_ids"] = lang_inputs["position_ids"][ :, i * prefill_seq_len : (i + 1) * prefill_seq_len ] + chunk_inputs["vision_embeds"] = lang_inputs["vision_embeds"][ + :, i * prefill_seq_len : (i + 1) * prefill_seq_len + ] outputs = lang_session.run(chunk_inputs) - chunk_inputs["index"] = outputs["index_output"] prefill_time = perf_counter() - prefill_start + vision_end - vision_start + lang_inputs["vision_embeds"] = lang_inputs["vision_embeds"][:, :prefill_seq_len] # Skip inputs/outputs again lang_session.skip_buffers( - [ - x - for x in lang_session.input_names + lang_session.output_names - if x.startswith("past_") or x.endswith("_RetainedState") - ] + [x for x in lang_session.input_names + lang_session.output_names if x.startswith("past_")] ) # Get first token @@ -1643,11 +1643,6 @@ def compile( **compiler_options, ) - if compiler_options.get("io_encrypt", None): - logger.warning( - "Compilation for IO-Encrypt has been successfully completed. However, Efficient-Transformers do not support IO-Encrypt execution. Please run the execution separately with QPC compiled without io-encrypt." - ) - return qpc_path # FIXME: Update this method to match with transformers AutoModelForCausalLM.generate diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 564bdd94d..b6af66be5 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -521,57 +521,27 @@ def __repr__(self): def dump_qconfig(func): def wrapper(self, *args, **kwargs): result = func(self, *args, **kwargs) - try: - create_and_dump_qconfigs( - self.qpc_path, - self.onnx_path, - self.get_model_config, - [cls.__name__ for cls in self._pytorch_transforms], - [cls.__name__ for cls in self._onnx_transforms], - kwargs.get("specializations"), - kwargs.get("mdp_ts_num_devices", 1), - kwargs.get("num_speculative_tokens"), - **{ - k: v - for k, v in kwargs.items() - if k - not in ["specializations", "mdp_ts_num_devices", "num_speculative_tokens", "custom_io", "onnx_path"] - }, - ) - except Exception as e: - print(f"An unexpected error occurred while dumping the qconfig: {e}") + create_and_dump_qconfigs( + self.qpc_path, + self.onnx_path, + self.get_model_config, + [cls.__name__ for cls in self._pytorch_transforms], + [cls.__name__ for cls in self._onnx_transforms], + kwargs.get("specializations"), + kwargs.get("mdp_ts_num_devices", 1), + kwargs.get("num_speculative_tokens"), + **{ + k: v + for k, v in kwargs.items() + if k + not in ["specializations", "mdp_ts_num_devices", "num_speculative_tokens", "custom_io", "onnx_path"] + }, + ) return result return wrapper -def get_qaic_sdk_version(qaic_sdk_xml_path: str) -> Optional[str]: - """ - Extracts the QAIC SDK version from the given SDK XML file. - - Args: - qaic_sdk_xml_path (str): Path to the SDK XML file. - Returns: - The SDK version as a string if found, otherwise None. 
- """ - qaic_sdk_version = None - - # Check and extract version from the given SDK XML file - if os.path.exists(qaic_sdk_xml_path): - try: - tree = ET.parse(qaic_sdk_xml_path) - root = tree.getroot() - base_version_element = root.find(".//base_version") - if base_version_element is not None: - qaic_sdk_version = base_version_element.text - except ET.ParseError as e: - print(f"Error parsing XML file {qaic_sdk_xml_path}: {e}") - except Exception as e: - print(f"An unexpected error occurred while processing {qaic_sdk_xml_path}: {e}") - - return qaic_sdk_version - - def create_and_dump_qconfigs( qpc_path, onnx_path, @@ -588,12 +558,29 @@ def create_and_dump_qconfigs( Such as huggingface configs, QEff transforms, QAIC sdk version, QNN sdk, compilation dir, qpc dir and many other compilation options. """ - enable_qnn = compiler_options.get("enable_qnn", False) - qnn_config_path = compiler_options.get("qnn_config", None) + qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None + enable_qnn = True if "qnn_config" in compiler_options else None + qconfig_file_path = os.path.join(os.path.dirname(qpc_path), "qconfig.json") onnx_path = str(onnx_path) specializations_file_path = str(os.path.join(os.path.dirname(qpc_path), "specializations.json")) compile_dir = str(os.path.dirname(qpc_path)) + qnn_config_path = ( + (qnn_config if qnn_config is not None else "QEfficient/compile/qnn_config.json") if enable_qnn else None + ) + + # Extract QAIC SDK Apps Version from SDK XML file + tree = ET.parse(Constants.SDK_APPS_XML) + root = tree.getroot() + qaic_version = root.find(".//base_version").text + + # Extract QNN SDK details from YAML file if the environment variable is set + qnn_sdk_details = None + qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) + if enable_qnn and qnn_sdk_path: + qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML) + with open(qnn_sdk_yaml_path, "r") as file: + qnn_sdk_details = yaml.safe_load(file) # Ensure all objects in the configs dictionary are JSON serializable def make_serializable(obj): @@ -615,38 +602,29 @@ def make_serializable(obj): "onnx_transforms": make_serializable(onnx_transforms), "onnx_path": onnx_path, }, - "compiler_config": { - "enable_qnn": enable_qnn, - "compile_dir": compile_dir, - "specializations_file_path": specializations_file_path, - "specializations": make_serializable(specializations), - "mdp_ts_num_devices": mdp_ts_num_devices, - "num_speculative_tokens": num_speculative_tokens, - **compiler_options, - }, - "aic_sdk_config": { - "qaic_apps_version": get_qaic_sdk_version(Constants.SDK_APPS_XML), - "qaic_platform_version": get_qaic_sdk_version(Constants.SDK_PLATFORM_XML), - }, }, } + aic_compiler_config = { + "apps_sdk_version": qaic_version, + "compile_dir": compile_dir, + "specializations_file_path": specializations_file_path, + "specializations": make_serializable(specializations), + "mdp_ts_num_devices": mdp_ts_num_devices, + "num_speculative_tokens": num_speculative_tokens, + **compiler_options, + } + qnn_config = { + "enable_qnn": enable_qnn, + "qnn_config_path": qnn_config_path, + } + # Put AIC or qnn details. if enable_qnn: - qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) - if not qnn_sdk_path: - raise EnvironmentError( - f"QNN_SDK_PATH {qnn_sdk_path} is not set. 
Please set {QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME}" - ) - qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML) - qnn_sdk_details = load_yaml( - qnn_sdk_yaml_path - ) # Extract QNN SDK details from YAML file if the environment variable is set - qnn_config = { - "qnn_config_path": qnn_config_path, - } qconfigs["qpc_config"]["qnn_config"] = qnn_config if qnn_sdk_details: qconfigs["qpc_config"]["qnn_config"].update(qnn_sdk_details) + else: + qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config create_json(qconfig_file_path, qconfigs) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index c8f74907a..b1ff9701e 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -97,10 +97,7 @@ class Constants: MAX_QPC_LIMIT = 30 MAX_RETRIES = 10 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download NUM_SPECULATIVE_TOKENS = 2 - SDK_APPS_XML = "/opt/qti-aic/versions/apps.xml" # This xml file is parsed to find out the SDK apps version. - SDK_PLATFORM_XML = ( - "/opt/qti-aic/versions/platform.xml" # This xml file is parsed to find out the SDK platform version. - ) + SDK_APPS_XML = "/opt/qti-aic/versions/apps.xml" # This xml file is parsed to find out the SDK version. @dataclass diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 7036d6f6d..fcd2fece5 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -171,4 +171,4 @@ pipeline { deleteDir() } } -} \ No newline at end of file +} diff --git a/scripts/finetune/run_ft_model.py b/scripts/finetune/run_ft_model.py index ef014923b..5e88db641 100644 --- a/scripts/finetune/run_ft_model.py +++ b/scripts/finetune/run_ft_model.py @@ -12,7 +12,7 @@ from peft import AutoPeftModelForCausalLM from transformers import AutoModelForCausalLM, AutoTokenizer -from QEfficient.finetune.configs.training import TrainConfig +from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG # Suppress all warnings warnings.filterwarnings("ignore") @@ -25,7 +25,7 @@ print(f"Warning: {e}. Moving ahead without these qaic modules.") device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -train_config = TrainConfig() +train_config = TRAIN_CONFIG() model = AutoModelForCausalLM.from_pretrained( train_config.model_name, use_cache=False, diff --git a/tests/finetune/test_finetune.py b/tests/finetune/test_finetune.py index fb4a84dc0..45330cad6 100644 --- a/tests/finetune/test_finetune.py +++ b/tests/finetune/test_finetune.py @@ -8,7 +8,6 @@ import os import shutil -import numpy as np import pytest import torch.optim as optim from torch.utils.data import DataLoader @@ -23,25 +22,12 @@ def clean_up(path): shutil.rmtree(path) -configs = [ - pytest.param( - "meta-llama/Llama-3.2-1B", # model_name - 10, # max_eval_step - 20, # max_train_step - 1, # intermediate_step_save - None, # context_length - True, # run_validation - True, # use_peft - "qaic", # device - id="llama_config", # config name - ) -] +configs = [pytest.param("meta-llama/Llama-3.2-1B", 1, 1, 1, None, True, True, "cpu", id="llama_config")] -@pytest.mark.skip(reason="Currently CI is broken. 
Once it is fixed we will enable this test.") -@pytest.mark.cli +# TODO:enable this once docker is available @pytest.mark.on_qaic -@pytest.mark.finetune +@pytest.mark.skip(reason="eager docker not available in sdk") @pytest.mark.parametrize( "model_name,max_eval_step,max_train_step,intermediate_step_save,context_length,run_validation,use_peft,device", configs, @@ -57,7 +43,7 @@ def test_finetune( device, mocker, ): - train_config_spy = mocker.spy(QEfficient.cloud.finetune, "TrainConfig") + train_config_spy = mocker.spy(QEfficient.cloud.finetune, "TRAIN_CONFIG") generate_dataset_config_spy = mocker.spy(QEfficient.cloud.finetune, "generate_dataset_config") generate_peft_config_spy = mocker.spy(QEfficient.cloud.finetune, "generate_peft_config") get_dataloader_kwargs_spy = mocker.spy(QEfficient.cloud.finetune, "get_dataloader_kwargs") @@ -79,28 +65,23 @@ def test_finetune( "device": device, } - results = finetune(**kwargs) - assert np.allclose(results["avg_train_loss"], 0.00232327, atol=1e-5), "Train loss is not matching." - assert np.allclose(results["avg_train_metric"], 1.002326, atol=1e-5), "Train metric is not matching." - assert np.allclose(results["avg_eval_loss"], 0.0206124, atol=1e-5), "Eval loss is not matching." - assert np.allclose(results["avg_eval_metric"], 1.020826, atol=1e-5), "Eval metric is not matching." - assert results["avg_epoch_time"] < 60, "Training should complete within 60 seconds." + finetune(**kwargs) train_config_spy.assert_called_once() generate_dataset_config_spy.assert_called_once() generate_peft_config_spy.assert_called_once() + update_config_spy.assert_called_once() get_custom_data_collator_spy.assert_called_once() get_longest_seq_length_spy.assert_called_once() print_model_size_spy.assert_called_once() train_spy.assert_called_once() - assert update_config_spy.call_count == 2 assert get_dataloader_kwargs_spy.call_count == 2 assert get_preprocessed_dataset_spy.call_count == 2 args, kwargs = train_spy.call_args - train_dataloader = args[2] - eval_dataloader = args[3] + train_dataloader = args[1] + eval_dataloader = args[2] optimizer = args[4] batch = next(iter(train_dataloader)) @@ -116,19 +97,12 @@ def test_finetune( else: assert eval_dataloader is None - args, kwargs = update_config_spy.call_args_list[0] + args, kwargs = update_config_spy.call_args train_config = args[0] - assert max_train_step >= train_config.gradient_accumulation_steps, ( - "Total training step should be more than " - f"{train_config.gradient_accumulation_steps} which is gradient accumulation steps." 
- ) - saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/adapter_model.safetensors") + saved_file = os.path.join(train_config.output_dir, "adapter_model.safetensors") assert os.path.isfile(saved_file) clean_up(train_config.output_dir) clean_up("runs") clean_up(train_config.dump_root_dir) - - -# TODO (Meet): Add seperate tests for BERT FT and LLama FT diff --git a/tests/transformers/spd/test_pld_inference.py b/tests/transformers/spd/test_pld_inference.py index 71b4e01cd..c80fe5969 100644 --- a/tests/transformers/spd/test_pld_inference.py +++ b/tests/transformers/spd/test_pld_inference.py @@ -262,7 +262,7 @@ def test_pld_spec_decode_inference( num_speculative_tokens=num_speculative_tokens, ) # init qaic session - target_model_session = QAICInferenceSession(target_model_qpc_path) + target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=device_group) draft_model_session = None # skip inputs/outputs buffers @@ -453,7 +453,7 @@ def test_pld_spec_decode_inference( del draft_model_session generated_ids = np.asarray(generated_ids[0]).flatten() gen_len = generated_ids.shape[0] - exec_info = target_model.generate(tokenizer, Constants.INPUT_STR) + exec_info = target_model.generate(tokenizer, Constants.INPUT_STR, device_group) cloud_ai_100_tokens = exec_info.generated_ids[0][ :gen_len ] # Because we always run for single input and single batch size diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py index e87c51d5f..6f6bdb268 100644 --- a/tests/transformers/spd/test_spd_inference.py +++ b/tests/transformers/spd/test_spd_inference.py @@ -157,8 +157,8 @@ def test_spec_decode_inference( full_batch_size=full_batch_size, ) # init qaic session - target_model_session = QAICInferenceSession(target_model_qpc_path) - draft_model_session = QAICInferenceSession(draft_model_qpc_path) + target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=device_group) + draft_model_session = QAICInferenceSession(draft_model_qpc_path, device_ids=device_group) # skip inputs/outputs buffers target_model_session.skip_buffers(set([x for x in target_model_session.input_names if x.startswith("past_")])) @@ -341,7 +341,7 @@ def test_spec_decode_inference( del draft_model_session generated_ids = np.asarray(generated_ids[0]).flatten() gen_len = generated_ids.shape[0] - exec_info = draft_model.generate(tokenizer, Constants.INPUT_STR) + exec_info = draft_model.generate(tokenizer, Constants.INPUT_STR, device_group) cloud_ai_100_tokens = exec_info.generated_ids[0][ :gen_len ] # Because we always run for single input and single batch size
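
The two speculative-decoding tests above now pin both the target and draft sessions to an explicit device group and pass the same group to generate(). A minimal usage sketch of that pattern follows; the QPC path and device IDs are hypothetical placeholders, and the import path is assumed from the QEfficient package layout.

# Sketch only: pinning an inference session to an explicit Cloud AI 100 device group,
# mirroring the device_ids plumbing added in the test changes above.
from QEfficient.generation.cloud_infer import QAICInferenceSession  # assumed import path

device_group = [0]  # hypothetical: a single-device group
session = QAICInferenceSession("path/to/qpcs", device_ids=device_group)  # QPC path is a placeholder

# Keep KV-cache buffers on device between calls, as the tests do.
session.skip_buffers(set(x for x in session.input_names if x.startswith("past_")))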