"Gemma3" #403

Closed
wants to merge 1 commit into from
30 changes: 9 additions & 21 deletions QEfficient/cloud/compile.py
@@ -85,29 +85,17 @@
parser.add_argument(
"--enable_qnn",
"--enable-qnn",
nargs="?",
const=True,
type=str,
action="store_true",
default=False,
help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
If not provided, the default configuration will be used.\
Sample Config: QEfficient/compile/qnn_config.json",
)

args, compiler_options = parser.parse_known_args()

if isinstance(args.enable_qnn, str):
args.qnn_config = args.enable_qnn
args.enable_qnn = True

compiler_options_dict = {}
for i in range(0, len(compiler_options)):
if compiler_options[i].startswith("--"):
key = compiler_options[i].lstrip("-").replace("-", "_")
value = (
compiler_options[i + 1]
if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
else True
)
compiler_options_dict[key] = value
QEfficient.compile(**args.__dict__, **compiler_options_dict)
parser.add_argument(
"qnn_config",
nargs="?",
type=str,
)
# FIXME(ochougul): Allow extra compilation arguments
args = parser.parse_args()
QEfficient.compile(**vars(args))
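For reference, a minimal standalone sketch of how the new argument handling behaves (standard-library argparse only; the config file name used in the parse calls is a hypothetical example):

import argparse

parser = argparse.ArgumentParser(description="Sketch of the new --enable_qnn handling")
parser.add_argument(
    "--enable_qnn",
    "--enable-qnn",
    action="store_true",
    default=False,
    help="Enables QNN compilation.",
)
parser.add_argument(
    "qnn_config",
    nargs="?",
    type=str,
    help="Optional path to a QNN config JSON, e.g. QEfficient/compile/qnn_config.json",
)

# "--enable_qnn my_qnn_config.json" -> enable_qnn=True, qnn_config="my_qnn_config.json"
print(parser.parse_args(["--enable_qnn", "my_qnn_config.json"]))
# "--enable_qnn" alone -> enable_qnn=True, qnn_config=None
print(parser.parse_args(["--enable_qnn"]))

With the previous definition (nargs="?", const=True, type=str) the flag's value could be either a bool or a config path, which is why the follow-up isinstance check was needed; action="store_true" plus a separate positional argument removes that branching. The parse_known_args loop that forwarded arbitrary extra compiler flags is dropped here, which is what the FIXME about extra compilation arguments refers to.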
229 changes: 60 additions & 169 deletions QEfficient/cloud/finetune.py
@@ -7,7 +7,6 @@

import random
import warnings
from typing import Any, Dict, Optional, Union

import fire
import numpy as np
@@ -18,9 +17,8 @@
import torch.utils.data
from peft import PeftModel, get_peft_model
from torch.optim.lr_scheduler import StepLR
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

from QEfficient.finetune.configs.training import TrainConfig
from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG
from QEfficient.finetune.utils.config_utils import (
generate_dataset_config,
generate_peft_config,
@@ -34,81 +32,52 @@
from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
from QEfficient.utils._utils import login_and_download_hf_lm

# Try importing QAIC-specific module, proceed without it if unavailable
try:
import torch_qaic # noqa: F401
except ImportError as e:
print(f"Warning: {e}. Proceeding without QAIC modules.")
print(f"Warning: {e}. Moving ahead without these qaic modules.")


from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

# Suppress all warnings
warnings.filterwarnings("ignore")


def setup_distributed_training(train_config: TrainConfig) -> None:
"""Initialize distributed training environment if enabled.

Args:
train_config (TrainConfig): Training configuration object.

Notes:
- If distributed data parallel (DDP) is disabled, this function does nothing.
- Ensures the device is not CPU and does not specify an index for DDP compatibility.
- Initializes the process group using the specified distributed backend.

Raises:
AssertionError: If device is CPU or includes an index with DDP enabled.
def main(**kwargs):
"""
if not train_config.enable_ddp:
return
Helper function to finetune the model on QAic.

torch_device = torch.device(train_config.device)
assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
.. code-block:: bash

dist.init_process_group(backend=train_config.dist_backend)
# from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
getattr(torch, torch_device.type).set_device(dist.get_rank())
python -m QEfficient.cloud.finetune OPTIONS

"""
# update the configuration for the training process
train_config = TRAIN_CONFIG()
update_config(train_config, **kwargs)
dataset_config = generate_dataset_config(train_config, kwargs)
device = train_config.device

def setup_seeds(seed: int) -> None:
"""Set random seeds across libraries for reproducibility.
# dist init
if train_config.enable_ddp:
# TODO: may have to init qccl backend, next try run with torchrun command
torch_device = torch.device(device)
assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
assert torch_device.index is None, (
f"DDP requires specification of device type only, however provided device index as well: {torch_device}"
)
dist.init_process_group(backend=train_config.dist_backend)
# from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
getattr(torch, torch_device.type).set_device(dist.get_rank())

Args:
seed (int): Seed value to set for random number generators.
# Set the seeds for reproducibility
torch.manual_seed(train_config.seed)
random.seed(train_config.seed)
np.random.seed(train_config.seed)

Notes:
- Sets seeds for PyTorch, Python's random module, and NumPy.
"""
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)


def load_model_and_tokenizer(
train_config: TrainConfig, dataset_config: Any, peft_config_file: str, **kwargs
) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
"""Load the pre-trained model and tokenizer from Hugging Face.

Args:
config (TrainConfig): Training configuration object containing model and tokenizer names.
dataset_config (Any): A dataclass object representing dataset configuration.
peft_config_file (str): Path to PEFT config file used for PEFT finetuning.
kwargs: Additional arguments to override PEFT config.

Returns:
tuple: A tuple of two values.
- Model with pretrained weights loaded.
- Model's tokenizer (AutoTokenizer).

Notes:
- Downloads the model if not already cached using login_and_download_hf_lm.
- Configures the model with FP16 precision and disables caching for training.
- Resizes model embeddings if tokenizer vocab size exceeds model embedding size.
- Sets pad_token_id to eos_token_id if not defined in the tokenizer.
"""
# Load the pre-trained model and setup its configuration
# config = AutoConfig.from_pretrained(train_config.model_name)
pretrained_model_path = login_and_download_hf_lm(train_config.model_name)
if train_config.task_type == "seq_classification":
model = AutoModelForSequenceClassification.from_pretrained(
@@ -135,6 +104,7 @@ def load_model_and_tokenizer(
torch_dtype=torch.float16,
)

# Load the tokenizer and add special tokens
tokenizer = AutoTokenizer.from_pretrained(
train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name
)
@@ -144,12 +114,14 @@ def load_model_and_tokenizer(
# If there is a mismatch between tokenizer vocab size and embedding matrix,
# throw a warning and then expand the embedding matrix
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
print("WARNING: Resizing embedding matrix to match tokenizer vocab size.")
print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
model.resize_token_embeddings(len(tokenizer))

# FIXME (Meet): Cover below line inside the logger once it is implemented.
print_model_size(model, train_config)

# print the datatype of the model parameters
# print(get_parameter_dtypes(model))

# Note: Need to call this before calling PeftModel.from_pretrained or get_peft_model.
# Because, both makes model.is_gradient_checkpointing = True which is used in peft library to
# apply gradient checkpointing related hooks to the input embeddings. Without this we will get
@@ -162,70 +134,17 @@ def load_model_and_tokenizer(
else:
raise RuntimeError("Given model doesn't support gradient checkpointing. Please disable it and run it.")

model = apply_peft(model, train_config, peft_config_file, **kwargs)

return model, tokenizer


def apply_peft(
model: AutoModel, train_config: TrainConfig, peft_config_file: Dict, **kwargs
) -> Union[AutoModel, PeftModel]:
"""Apply Parameter-Efficient Fine-Tuning (PEFT) to the model if enabled.

Args:
model (AutoModel): Huggingface model.
train_config (TrainConfig): Training configuration object.
peft_config_file (str, optional): Path to YAML/JSON file containing
PEFT (LoRA) config. Defaults to None.
kwargs: Additional arguments to override PEFT config params.
if train_config.use_peft:
# Load the pre-trained peft model checkpoint and setup its configuration
if train_config.from_peft_checkpoint:
model = PeftModel.from_pretrained(model, train_config.from_peft_checkpoint, is_trainable=True)
peft_config = model.peft_config
# Generate the peft config and start fine-tuning from original model
else:
peft_config = generate_peft_config(train_config, kwargs)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Returns:
Union[AutoModel, PeftModel]: If the use_peft in train_config is True
then PeftModel object is returned else original model object
(AutoModel) is returned.
"""
if not train_config.use_peft:
return model

# Load the pre-trained peft model checkpoint and setup its configuration
if train_config.from_peft_checkpoint:
model = PeftModel.from_pretrained(model, train_config.from_peft_checkpoint, is_trainable=True)
peft_config = model.peft_config
# Generate the peft config and start fine-tuning from original model
else:
peft_config = generate_peft_config(train_config, peft_config_file, **kwargs)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

return model


def setup_dataloaders(
train_config: TrainConfig,
dataset_config: Any,
tokenizer: AutoTokenizer,
) -> tuple[torch.utils.data.DataLoader, Optional[torch.utils.data.DataLoader], int]:
"""Set up training and validation DataLoaders.

Args:
train_config (TrainConfig): Training configuration object.
dataset_config (Any): Configuration for the dataset (generated from train_config).
tokenizer (AutoTokenizer): Tokenizer for preprocessing data.

Returns:
tuple: A tuple of three values.
- First value represents train_dataloader
- Second value represents eval_dataloader. It is None if
validation is disabled.
- Length of longest sequence in the dataset.

Raises:
ValueError: If validation is enabled but the validation set is too small.

Notes:
- Applies a custom data collator if provided by get_custom_data_collator.
- Configures DataLoader kwargs using get_dataloader_kwargs for train and val splits.
"""
# Get the dataset utils
dataset_processer = tokenizer

@@ -245,8 +164,6 @@ def setup_dataloaders(
##
train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train")
print("length of dataset_train", len(dataset_train))

# FIXME (Meet): Add custom data collator registration from the outside by the user.
custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config)
if custom_data_collator:
print("custom_data_collator is used")
@@ -291,66 +208,40 @@ def setup_dataloaders(
else:
longest_seq_length, _ = get_longest_seq_length(train_dataloader.dataset)

return train_dataloader, eval_dataloader, longest_seq_length


def main(peft_config_file: str = None, **kwargs) -> None:
"""
Fine-tune a model on QAIC hardware with configurable training and LoRA parameters.

Args:
peft_config_file (str, optional): Path to YAML/JSON file containing PEFT (LoRA) config. Defaults to None.
kwargs: Additional arguments to override TrainConfig.

Example:
.. code-block:: bash

# Using a YAML config file for PEFT
python -m QEfficient.cloud.finetune \\
--model_name "meta-llama/Llama-3.2-1B" \\
--lr 5e-4 \\
--peft_config_file "lora_config.yaml"

# Using default LoRA config
python -m QEfficient.cloud.finetune \\
--model_name "meta-llama/Llama-3.2-1B" \\
--lr 5e-4
"""
train_config = TrainConfig()
update_config(train_config, **kwargs)
dataset_config = generate_dataset_config(train_config.dataset)
update_config(dataset_config, **kwargs)

setup_distributed_training(train_config)
setup_seeds(train_config.seed)
model, tokenizer = load_model_and_tokenizer(train_config, dataset_config, peft_config_file, **kwargs)

# Create DataLoaders for the training and validation dataset
train_dataloader, eval_dataloader, longest_seq_length = setup_dataloaders(train_config, dataset_config, tokenizer)
print(
f"The longest sequence length in the train data is {longest_seq_length}, "
f"passed context length is {train_config.context_length} and overall model's context length is "
f"{model.config.max_position_embeddings}"
)

model.to(train_config.device)
optimizer = optim.AdamW(model.parameters(), lr=train_config.lr, weight_decay=train_config.weight_decay)
optimizer = optim.AdamW(
model.parameters(),
lr=train_config.lr,
weight_decay=train_config.weight_decay,
)
scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma)

# wrap model with DDP
if train_config.enable_ddp:
model = nn.parallel.DistributedDataParallel(model, device_ids=[dist.get_rank()])
results = train(

_ = train(
model,
tokenizer,
train_dataloader,
eval_dataloader,
tokenizer,
optimizer,
scheduler,
train_config.gradient_accumulation_steps,
train_config,
train_config.device,
dist.get_rank() if train_config.enable_ddp else None,
None,
)

# finalize torch distributed
if train_config.enable_ddp:
dist.destroy_process_group()
return results


if __name__ == "__main__":
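For context on the PEFT branch exercised in this file, a minimal standalone sketch of the non-checkpoint path (the model name is taken from the docstring example above; the LoRA hyperparameters are illustrative assumptions, not QEfficient defaults):

import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    torch_dtype=torch.float16,
    use_cache=False,
)

# Enable gradient checkpointing before applying PEFT, so the checkpointing
# hooks also cover the input embeddings (see the note above about calling this
# before PeftModel.from_pretrained / get_peft_model).
model.gradient_checkpointing_enable()

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA adapter weights are trainable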
4 changes: 0 additions & 4 deletions QEfficient/cloud/infer.py
@@ -197,10 +197,6 @@ def main(
**kwargs,
)

# If the io-encrypt flag is passed we will exit after QPC generation.
if kwargs.get("io_encrypt", None):
exit()

#########
# Execute
#########