diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index e16c1322e5d1..7522996b2424 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -194,6 +194,59 @@ Currently, [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`] only support +### Hotswapping LoRA adapters + +A common use case when serving multiple adapters is to load one adapter first, generate images, load another adapter, generate more images, load another adapter, and so on. This workflow normally requires calling [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`], and possibly [`~loaders.peft.PeftAdapterMixin.delete_adapters`] to save memory. Moreover, if the model is compiled using `torch.compile`, performing these steps requires recompilation, which takes time. + +To better support this common workflow, you can "hotswap" a LoRA adapter to avoid accumulating memory and, in some cases, recompilation. Hotswapping requires an adapter to already be loaded; the new adapter weights are then swapped in-place for the existing adapter. + +Pass `hotswap=True` when loading a LoRA adapter to enable this feature. It is important to indicate the name of the existing adapter to be swapped (`default_0` is the default adapter name). If you loaded the first adapter with a different name, use that name instead. + +```python +pipeline = ... +# load adapter 1 as normal +pipeline.load_lora_weights(file_name_adapter_1) +# generate some images with adapter 1 +... +# now hotswap the 2nd adapter +pipeline.load_lora_weights(file_name_adapter_2, hotswap=True, adapter_name="default_0") +# generate images with adapter 2 +``` + + + +Hotswapping is not currently supported for LoRA adapters that target the text encoder. + + + +For compiled models, it is often necessary (though not always, for example if the second adapter targets identical LoRA ranks and scales) to call [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] to avoid recompilation. Call [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] _before_ loading the first adapter, and call `torch.compile` _after_ loading the first adapter. + +```python +pipeline = ... +# call this extra method +pipeline.enable_lora_hotswap(target_rank=max_rank) +# now load adapter 1 +pipeline.load_lora_weights(file_name_adapter_1) +# now compile the unet of the pipeline +pipeline.unet = torch.compile(pipeline.unet, ...) +# generate some images with adapter 1 +... +# now hotswap adapter 2 +pipeline.load_lora_weights(file_name_adapter_2, hotswap=True, adapter_name="default_0") +# generate images with adapter 2 +``` + +The `target_rank=max_rank` argument sets the maximum rank among all LoRA adapters that will be loaded. If you have one adapter with rank 8 and another with rank 16, pass `target_rank=16`. If in doubt, use a higher value; by default, this value is 128. + +However, there can be situations where recompilation is unavoidable. For example, if the hotswapped adapter targets more layers than the initial adapter, then recompilation is triggered. Try to load the adapter that targets the most layers first. Refer to the PEFT docs on [hotswapping](https://huggingface.co/docs/peft/main/en/package_reference/hotswap#peft.utils.hotswap.hotswap_adapter) for more details about the limitations of this feature.
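+
+To confirm that hotswapping did not trigger recompilation, you can run the swap and the follow-up generation under `torch._dynamo.config.patch(error_on_recompile=True)`, which raises an error on any recompilation instead of letting it happen silently. The sketch below is illustrative: it reuses the placeholder names from the examples above (`file_name_adapter_1`, `file_name_adapter_2`, `max_rank`), and `prompt` stands in for whatever prompt you generate with.
+
+```python
+import torch
+
+pipeline = ...  # create the pipeline as usual
+pipeline.enable_lora_hotswap(target_rank=max_rank)
+pipeline.load_lora_weights(file_name_adapter_1)
+pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead")
+
+# the first call triggers the initial compilation
+image = pipeline(prompt).images[0]
+
+# from here on, any recompilation raises an error instead of happening silently
+with torch._dynamo.config.patch(error_on_recompile=True):
+    pipeline.load_lora_weights(file_name_adapter_2, hotswap=True, adapter_name="default_0")
+    image = pipeline(prompt).images[0]
+```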
+ + + +Move your code inside the `with torch._dynamo.config.patch(error_on_recompile=True)` context manager, as shown above, to detect if a model was recompiled. If you detect recompilation despite following all the steps above, please open an issue in the [Diffusers](https://github.com/huggingface/diffusers/issues) repository with a reproducible example. + + + ### Kohya and TheLastBen Other popular LoRA trainers from the community include those by [Kohya](https://github.com/kohya-ss/sd-scripts/) and [TheLastBen](https://github.com/TheLastBen/fast-stable-diffusion). These trainers create different LoRA checkpoints than those trained by 🤗 Diffusers, but they can still be loaded in the same way. diff --git a/src/diffusers/loaders/lora_base.py b/src/diffusers/loaders/lora_base.py index 17ed8c5444fc..280a9fa6e73f 100644 --- a/src/diffusers/loaders/lora_base.py +++ b/src/diffusers/loaders/lora_base.py @@ -316,6 +316,7 @@ def _load_lora_into_text_encoder( adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, + hotswap: bool = False, ): if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -341,6 +342,10 @@ def _load_lora_into_text_encoder( # their prefixes. prefix = text_encoder_name if prefix is None else prefix + # Hotswapping is not yet supported for the text encoder, so raise early if text encoder keys are present. + if hotswap and any(text_encoder_name in key for key in state_dict.keys()): + raise ValueError("At the moment, hotswapping is not supported for text encoders, please pass `hotswap=False`.") + # Load the layers corresponding to text encoder and make necessary adjustments. if prefix is not None: state_dict = {k[len(f"{prefix}.") :]: v for k, v in state_dict.items() if k.startswith(f"{prefix}.")} @@ -908,3 +913,23 @@ def lora_scale(self) -> float: # property function that returns the lora scale which can be set at run time by the pipeline. # if _lora_scale has not been set, return 1 return self._lora_scale if hasattr(self, "_lora_scale") else 1.0 + + def enable_lora_hotswap(self, **kwargs) -> None: + """Enables hotswapping of LoRA adapters. + + Calling this method is only required when hotswapping adapters and if the model is compiled or if the ranks of + the loaded adapters differ. + + Args: + target_rank (`int`): + The highest rank among all the adapters that will be loaded. + check_compiled (`str`, *optional*, defaults to `"error"`): + How to handle the case when the model is already compiled, which should generally be avoided.
The + options are: + - "error" (default): raise an error + - "warn": issue a warning + - "ignore": do nothing + """ + for key, component in self.components.items(): + if hasattr(component, "enable_lora_hotswap") and (key in self._lora_loadable_modules): + component.enable_lora_hotswap(**kwargs) diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py index e522778deeed..513c15ac5dab 100644 --- a/src/diffusers/loaders/lora_pipeline.py +++ b/src/diffusers/loaders/lora_pipeline.py @@ -79,10 +79,13 @@ class StableDiffusionLoraLoaderMixin(LoraBaseMixin): text_encoder_name = TEXT_ENCODER_NAME def load_lora_weights( - self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + adapter_name=None, + hotswap: bool = False, + **kwargs, ): - """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and + """Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. @@ -105,6 +108,29 @@ def load_lora_weights( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap kwargs (`dict`, *optional*): See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. """ @@ -135,6 +161,7 @@ def load_lora_weights( adapter_name=adapter_name, _pipeline=self, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) self.load_lora_into_text_encoder( state_dict, @@ -146,6 +173,7 @@ def load_lora_weights( adapter_name=adapter_name, _pipeline=self, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -265,7 +293,14 @@ def lora_state_dict( @classmethod def load_lora_into_unet( - cls, state_dict, network_alphas, unet, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, + state_dict, + network_alphas, + unet, + adapter_name=None, + _pipeline=None, + low_cpu_mem_usage=False, + hotswap: bool = False, ): """ This will load the LoRA layers specified in `state_dict` into `unet`. 
@@ -287,6 +322,29 @@ def load_lora_into_unet( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -307,6 +365,7 @@ def load_lora_into_unet( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -320,6 +379,7 @@ def load_lora_into_text_encoder( adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, + hotswap: bool = False, ): """ This will load the LoRA layers specified in `state_dict` into `text_encoder` @@ -345,6 +405,29 @@ def load_lora_into_text_encoder( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. 
There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ _load_lora_into_text_encoder( state_dict=state_dict, @@ -356,6 +439,7 @@ def load_lora_into_text_encoder( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -700,7 +784,14 @@ def lora_state_dict( @classmethod # Copied from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.load_lora_into_unet def load_lora_into_unet( - cls, state_dict, network_alphas, unet, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, + state_dict, + network_alphas, + unet, + adapter_name=None, + _pipeline=None, + low_cpu_mem_usage=False, + hotswap: bool = False, ): """ This will load the LoRA layers specified in `state_dict` into `unet`. @@ -722,6 +813,29 @@ def load_lora_into_unet( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -742,6 +856,7 @@ def load_lora_into_unet( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -756,6 +871,7 @@ def load_lora_into_text_encoder( adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, + hotswap: bool = False, ): """ This will load the LoRA layers specified in `state_dict` into `text_encoder` @@ -781,6 +897,29 @@ def load_lora_into_text_encoder( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. 
However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ _load_lora_into_text_encoder( state_dict=state_dict, @@ -792,6 +931,7 @@ def load_lora_into_text_encoder( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -1035,7 +1175,11 @@ def lora_state_dict( return state_dict def load_lora_weights( - self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + adapter_name=None, + hotswap: bool = False, + **kwargs, ): """ Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and @@ -1058,6 +1202,29 @@ def load_lora_weights( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap kwargs (`dict`, *optional*): See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. 
""" @@ -1087,6 +1254,7 @@ def load_lora_weights( adapter_name=adapter_name, _pipeline=self, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) self.load_lora_into_text_encoder( state_dict, @@ -1097,6 +1265,7 @@ def load_lora_weights( adapter_name=adapter_name, _pipeline=self, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) self.load_lora_into_text_encoder( state_dict, @@ -1107,11 +1276,12 @@ def load_lora_weights( adapter_name=adapter_name, _pipeline=self, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod def load_lora_into_transformer( - cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, hotswap: bool = False ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -1129,6 +1299,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -1143,6 +1336,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -1157,6 +1351,7 @@ def load_lora_into_text_encoder( adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, + hotswap: bool = False, ): """ This will load the LoRA layers specified in `state_dict` into `text_encoder` @@ -1182,6 +1377,29 @@ def load_lora_into_text_encoder( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. 
However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ _load_lora_into_text_encoder( state_dict=state_dict, @@ -1193,6 +1411,7 @@ def load_lora_into_text_encoder( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -1476,7 +1695,11 @@ def lora_state_dict( return state_dict def load_lora_weights( - self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + adapter_name=None, + hotswap: bool = False, + **kwargs, ): """ Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and @@ -1501,6 +1724,26 @@ def load_lora_weights( low_cpu_mem_usage (`bool`, *optional*): `Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. If the new + adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need to call an + additional method before loading the adapter: + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + Note that hotswapping adapters of the text encoder is not yet supported. 
There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -1569,6 +1812,7 @@ def load_lora_weights( adapter_name=adapter_name, _pipeline=self, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) if len(transformer_norm_state_dict) > 0: @@ -1587,11 +1831,19 @@ def load_lora_weights( adapter_name=adapter_name, _pipeline=self, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod def load_lora_into_transformer( - cls, state_dict, network_alphas, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, + state_dict, + network_alphas, + transformer, + adapter_name=None, + _pipeline=None, + low_cpu_mem_usage=False, + hotswap: bool = False, ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -1613,6 +1865,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and not is_peft_version(">=", "0.13.1"): raise ValueError( @@ -1627,6 +1902,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -1695,6 +1971,7 @@ def load_lora_into_text_encoder( adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, + hotswap: bool = False, ): """ This will load the LoRA layers specified in `state_dict` into `text_encoder` @@ -1720,6 +1997,29 @@ def load_lora_into_text_encoder( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. 
However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ _load_lora_into_text_encoder( state_dict=state_dict, @@ -1731,6 +2031,7 @@ def load_lora_into_text_encoder( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -2141,7 +2442,14 @@ class AmusedLoraLoaderMixin(StableDiffusionLoraLoaderMixin): @classmethod # Copied from diffusers.loaders.lora_pipeline.FluxLoraLoaderMixin.load_lora_into_transformer with FluxTransformer2DModel->UVit2DModel def load_lora_into_transformer( - cls, state_dict, network_alphas, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, + state_dict, + network_alphas, + transformer, + adapter_name=None, + _pipeline=None, + low_cpu_mem_usage=False, + hotswap: bool = False, ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -2163,6 +2471,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. 
There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and not is_peft_version(">=", "0.13.1"): raise ValueError( @@ -2177,6 +2508,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -2191,6 +2523,7 @@ def load_lora_into_text_encoder( adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, + hotswap: bool = False, ): """ This will load the LoRA layers specified in `state_dict` into `text_encoder` @@ -2216,6 +2549,29 @@ def load_lora_into_text_encoder( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ _load_lora_into_text_encoder( state_dict=state_dict, @@ -2227,6 +2583,7 @@ def load_lora_into_text_encoder( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -2443,7 +2800,7 @@ def load_lora_weights( @classmethod # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->CogVideoXTransformer3DModel def load_lora_into_transformer( - cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, hotswap: bool = False ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -2461,6 +2818,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. 
However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -2475,6 +2855,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -2750,7 +3131,7 @@ def load_lora_weights( @classmethod # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->MochiTransformer3DModel def load_lora_into_transformer( - cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, hotswap: bool = False ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -2768,6 +3149,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. 
There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -2782,6 +3186,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -3059,7 +3464,7 @@ def load_lora_weights( @classmethod # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->LTXVideoTransformer3DModel def load_lora_into_transformer( - cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, hotswap: bool = False ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -3077,6 +3482,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -3091,6 +3519,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -3368,7 +3797,7 @@ def load_lora_weights( @classmethod # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->SanaTransformer2DModel def load_lora_into_transformer( - cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, hotswap: bool = False ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -3386,6 +3815,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. 
Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -3400,6 +3852,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -3680,7 +4133,7 @@ def load_lora_weights( @classmethod # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->HunyuanVideoTransformer3DModel def load_lora_into_transformer( - cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, hotswap: bool = False ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -3698,6 +4151,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. 
There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -3712,6 +4188,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -3993,7 +4470,7 @@ def load_lora_weights( @classmethod # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->Lumina2Transformer2DModel def load_lora_into_transformer( - cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, hotswap: bool = False ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -4011,6 +4488,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -4025,6 +4525,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -4333,7 +4834,7 @@ def load_lora_weights( @classmethod # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->WanTransformer3DModel def load_lora_into_transformer( - cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, hotswap: bool = False ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -4351,6 +4852,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. 
Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -4365,6 +4889,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod @@ -4642,7 +5167,7 @@ def load_lora_weights( @classmethod # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->CogView4Transformer2DModel def load_lora_into_transformer( - cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False + cls, state_dict, transformer, adapter_name=None, _pipeline=None, low_cpu_mem_usage=False, hotswap: bool = False ): """ This will load the LoRA layers specified in `state_dict` into `transformer`. @@ -4660,6 +5185,29 @@ def load_lora_into_transformer( low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. 
There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -4674,6 +5222,7 @@ def load_lora_into_transformer( adapter_name=adapter_name, _pipeline=_pipeline, low_cpu_mem_usage=low_cpu_mem_usage, + hotswap=hotswap, ) @classmethod diff --git a/src/diffusers/loaders/peft.py b/src/diffusers/loaders/peft.py index 8b52cf63456c..1809a5d56c8f 100644 --- a/src/diffusers/loaders/peft.py +++ b/src/diffusers/loaders/peft.py @@ -16,7 +16,7 @@ import os from functools import partial from pathlib import Path -from typing import Dict, List, Optional, Union +from typing import Dict, List, Literal, Optional, Union import safetensors import torch @@ -128,6 +128,8 @@ class PeftAdapterMixin: """ _hf_peft_config_loaded = False + # kwargs for prepare_model_for_compiled_hotswap, if required + _prepare_lora_hotswap_kwargs: Optional[dict] = None @classmethod # Copied from diffusers.loaders.lora_base.LoraBaseMixin._optionally_disable_offloading @@ -145,7 +147,9 @@ def _optionally_disable_offloading(cls, _pipeline): """ return _func_optionally_disable_offloading(_pipeline=_pipeline) - def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="transformer", **kwargs): + def load_lora_adapter( + self, pretrained_model_name_or_path_or_dict, prefix="transformer", hotswap: bool = False, **kwargs + ): r""" Loads a LoRA adapter into the underlying model. @@ -189,6 +193,29 @@ def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="trans low_cpu_mem_usage (`bool`, *optional*): Speed up model loading by only loading the pretrained LoRA weights and not initializing the random weights. + hotswap : (`bool`, *optional*) + Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter + in-place. This means that, instead of loading an additional adapter, this will take the existing + adapter weights and replace them with the weights of the new adapter. This can be faster and more + memory efficient. However, the main advantage of hotswapping is that when the model is compiled with + torch.compile, loading the new adapter does not require recompilation of the model. When using + hotswapping, the passed `adapter_name` should be the name of an already loaded adapter. + + If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need + to call an additional method before loading the adapter: + + ```py + pipeline = ... # load diffusers pipeline + max_rank = ... # the highest rank among all LoRAs that you want to load + # call *before* compiling and loading the LoRA adapter + pipeline.enable_lora_hotswap(target_rank=max_rank) + pipeline.load_lora_weights(file_name) + # optionally compile the model now + ``` + + Note that hotswapping adapters of the text encoder is not yet supported. 
There are some further + limitations to this technique, which are documented here: + https://huggingface.co/docs/peft/main/en/package_reference/hotswap """ from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict from peft.tuners.tuners_utils import BaseTunerLayer @@ -239,10 +266,15 @@ def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="trans state_dict = {k[len(f"{prefix}.") :]: v for k, v in state_dict.items() if k.startswith(f"{prefix}.")} if len(state_dict) > 0: - if adapter_name in getattr(self, "peft_config", {}): + if adapter_name in getattr(self, "peft_config", {}) and not hotswap: raise ValueError( f"Adapter name {adapter_name} already in use in the model - please select a new adapter name." ) + elif adapter_name not in getattr(self, "peft_config", {}) and hotswap: + raise ValueError( + f"Trying to hotswap LoRA adapter '{adapter_name}' but there is no existing adapter by that name. " + "Please choose an existing adapter name or set `hotswap=False` to prevent hotswapping." + ) # check with first key if is not in peft format first_key = next(iter(state_dict.keys())) @@ -302,11 +334,68 @@ def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="trans if is_peft_version(">=", "0.13.1"): peft_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage + if hotswap or (self._prepare_lora_hotswap_kwargs is not None): + if is_peft_version(">", "0.14.0"): + from peft.utils.hotswap import ( + check_hotswap_configs_compatible, + hotswap_adapter_from_state_dict, + prepare_model_for_compiled_hotswap, + ) + else: + msg = ( + "Hotswapping requires PEFT > v0.14. Please upgrade PEFT to a higher version or install it " + "from source." + ) + raise ImportError(msg) + + if hotswap: + + def map_state_dict_for_hotswap(sd): + # For hotswapping, we need the adapter name to be present in the state dict keys + new_sd = {} + for k, v in sd.items(): + if k.endswith("lora_A.weight") or k.endswith("lora_B.weight"): + k = k[: -len(".weight")] + f".{adapter_name}.weight" + elif k.endswith("lora_B.bias"): # lora_bias=True option + k = k[: -len(".bias")] + f".{adapter_name}.bias" + new_sd[k] = v + return new_sd + # To handle scenarios where we cannot successfully set state dict. If it's unsucessful, # we should also delete the `peft_config` associated to the `adapter_name`. try: - inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs) - incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs) + if hotswap: + state_dict = map_state_dict_for_hotswap(state_dict) + check_hotswap_configs_compatible(self.peft_config[adapter_name], lora_config) + try: + hotswap_adapter_from_state_dict( + model=self, + state_dict=state_dict, + adapter_name=adapter_name, + config=lora_config, + ) + except Exception as e: + logger.error(f"Hotswapping {adapter_name} was unsuccessful with the following error: \n{e}") + raise + # the hotswap function raises if there are incompatible keys, so if we reach this point we can set + # it to None + incompatible_keys = None + else: + inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs) + incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs) + + if self._prepare_lora_hotswap_kwargs is not None: + # For hotswapping of compiled models or adapters with different ranks.
+                    # If the user called enable_lora_hotswap, we need to ensure that prepare_model_for_compiled_hotswap
+                    # is called:
+                    # - after the first adapter was loaded
+                    # - before the model is compiled and the 2nd adapter is being hotswapped in
+                    # Therefore, it needs to be called here
+                    prepare_model_for_compiled_hotswap(
+                        self, config=lora_config, **self._prepare_lora_hotswap_kwargs
+                    )
+                    # We only want to call prepare_model_for_compiled_hotswap once
+                    self._prepare_lora_hotswap_kwargs = None
+
             # Set peft config loaded flag to True if module has been successfully injected and incompatible keys retrieved
             if not self._hf_peft_config_loaded:
                 self._hf_peft_config_loaded = True
@@ -769,3 +858,36 @@ def delete_adapters(self, adapter_names: Union[List[str], str]):
             # Pop also the corresponding adapter from the config
             if hasattr(self, "peft_config"):
                 self.peft_config.pop(adapter_name, None)
+
+    def enable_lora_hotswap(
+        self, target_rank: int = 128, check_compiled: Literal["error", "warn", "ignore"] = "error"
+    ) -> None:
+        """Enables the possibility to hotswap LoRA adapters.
+
+        Calling this method is only required when hotswapping adapters and if the model is compiled or if the ranks of
+        the loaded adapters differ.
+
+        Args:
+            target_rank (`int`, *optional*, defaults to `128`):
+                The highest rank among all the adapters that will be loaded.
+
+            check_compiled (`str`, *optional*, defaults to `"error"`):
+                How to handle the case when the model is already compiled, which should generally be avoided. The
+                options are:
+                - "error" (default): raise an error
+                - "warn": issue a warning
+                - "ignore": do nothing
+        """
+        if getattr(self, "peft_config", {}):
+            if check_compiled == "error":
+                raise RuntimeError("Call `enable_lora_hotswap` before loading the first adapter.")
+            elif check_compiled == "warn":
+                logger.warning(
+                    "It is recommended to call `enable_lora_hotswap` before loading the first adapter to avoid recompilation."
+                )
+            elif check_compiled != "ignore":
+                raise ValueError(
+                    f"check_compiled should be one of 'error', 'warn', or 'ignore', got '{check_compiled}' instead."
+                )
+
+        self._prepare_lora_hotswap_kwargs = {"target_rank": target_rank, "check_compiled": check_compiled}
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index fc4a3128dd9f..d55ff6e62872 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -24,6 +24,7 @@
 import unittest
 import unittest.mock as mock
 import uuid
+import warnings
 from collections import defaultdict
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -56,15 +57,20 @@
 from diffusers.utils.hub_utils import _add_variant
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
+    floats_tensor,
     get_python_version,
     is_torch_compile,
     numpy_cosine_similarity_distance,
+    require_peft_backend,
+    require_peft_version_greater,
     require_torch_2,
     require_torch_accelerator,
     require_torch_accelerator_with_training,
     require_torch_gpu,
     require_torch_multi_accelerator,
     run_test_in_subprocess,
+    slow,
     torch_all_close,
     torch_device,
 )
@@ -1659,3 +1665,234 @@ def test_push_to_hub_library_name(self):
 
         # Reset repo
         delete_repo(self.repo_id, token=TOKEN)
+
+
+@slow
+@require_torch_2
+@require_torch_accelerator
+@require_peft_backend
+@require_peft_version_greater("0.14.0")
+@is_torch_compile
+class TestLoraHotSwappingForModel(unittest.TestCase):
+    """Test that hotswapping does not result in recompilation on the model directly.
+
+    We're not extensively testing the hotswapping functionality since it is implemented in PEFT and is extensively
+    tested there. The goal of this test is specifically to ensure that hotswapping with diffusers does not require
+    recompilation.
+
+    See
+    https://github.com/huggingface/peft/blob/eaab05e18d51fb4cce20a73c9acd82a00c013b83/tests/test_gpu_examples.py#L4252
+    for the analogous PEFT test.
+
+    """
+
+    def tearDown(self):
+        # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
+        # there will be recompilation errors, as torch caches the model when run in the same process.
+        super().tearDown()
+        torch._dynamo.reset()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def get_small_unet(self):
+        # from diffusers UNet2DConditionModelTests
+        torch.manual_seed(0)
+        init_dict = {
+            "block_out_channels": (4, 8),
+            "norm_num_groups": 4,
+            "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"),
+            "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"),
+            "cross_attention_dim": 8,
+            "attention_head_dim": 2,
+            "out_channels": 4,
+            "in_channels": 4,
+            "layers_per_block": 1,
+            "sample_size": 16,
+        }
+        model = UNet2DConditionModel(**init_dict)
+        return model.to(torch_device)
+
+    def get_unet_lora_config(self, lora_rank, lora_alpha, target_modules):
+        # from diffusers test_models_unet_2d_condition.py
+        from peft import LoraConfig
+
+        unet_lora_config = LoraConfig(
+            r=lora_rank,
+            lora_alpha=lora_alpha,
+            target_modules=target_modules,
+            init_lora_weights=False,
+            use_dora=False,
+        )
+        return unet_lora_config
+
+    def get_dummy_input(self):
+        # from UNet2DConditionModelTests
+        batch_size = 4
+        num_channels = 4
+        sizes = (16, 16)
+
+        noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor([10]).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, 4, 8)).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
+
+    def check_model_hotswap(self, do_compile, rank0, rank1, target_modules0, target_modules1=None):
+        """
+        Check that hotswapping works on a small unet.
+
+        Steps:
+        - create 2 LoRA adapters and save them
+        - load the first adapter
+        - hotswap the second adapter
+        - check that the outputs are correct
+        - optionally compile the model
+
+        Note: We set rank == alpha here because save_lora_adapter does not save the alpha scalings, thus the test would
+        fail if the values are different. Since rank != alpha does not matter for the purpose of this test, this is
+        fine.
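+
+        When `do_compile=True`, the calling test wraps this check in `torch._dynamo.config.patch(error_on_recompile=True)`,
+        so any recompilation triggered by the hotswap makes the test fail.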
+        """
+        # create 2 adapters with different ranks and alphas
+        dummy_input = self.get_dummy_input()
+        alpha0, alpha1 = rank0, rank1
+        max_rank = max([rank0, rank1])
+        if target_modules1 is None:
+            target_modules1 = target_modules0[:]
+        lora_config0 = self.get_unet_lora_config(rank0, alpha0, target_modules0)
+        lora_config1 = self.get_unet_lora_config(rank1, alpha1, target_modules1)
+
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config0, adapter_name="adapter0")
+        with torch.inference_mode():
+            output0_before = unet(**dummy_input)["sample"]
+
+        unet.add_adapter(lora_config1, adapter_name="adapter1")
+        unet.set_adapter("adapter1")
+        with torch.inference_mode():
+            output1_before = unet(**dummy_input)["sample"]
+
+        # sanity checks:
+        tol = 5e-3
+        assert not torch.allclose(output0_before, output1_before, atol=tol, rtol=tol)
+        assert not (output0_before == 0).all()
+        assert not (output1_before == 0).all()
+
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            # save the adapter checkpoints
+            unet.save_lora_adapter(os.path.join(tmp_dirname, "0"), safe_serialization=True, adapter_name="adapter0")
+            unet.save_lora_adapter(os.path.join(tmp_dirname, "1"), safe_serialization=True, adapter_name="adapter1")
+            del unet
+
+            # load the first adapter
+            unet = self.get_small_unet()
+            if do_compile or (rank0 != rank1):
+                # prepare the model for hotswapping; this is only needed if the model is compiled or if the ranks differ
+                unet.enable_lora_hotswap(target_rank=max_rank)
+
+            file_name0 = os.path.join(os.path.join(tmp_dirname, "0"), "pytorch_lora_weights.safetensors")
+            file_name1 = os.path.join(os.path.join(tmp_dirname, "1"), "pytorch_lora_weights.safetensors")
+            unet.load_lora_adapter(file_name0, safe_serialization=True, adapter_name="adapter0", prefix=None)
+
+            if do_compile:
+                unet = torch.compile(unet, mode="reduce-overhead")
+
+            with torch.inference_mode():
+                output0_after = unet(**dummy_input)["sample"]
+            assert torch.allclose(output0_before, output0_after, atol=tol, rtol=tol)
+
+            # hotswap the 2nd adapter
+            unet.load_lora_adapter(file_name1, adapter_name="adapter0", hotswap=True, prefix=None)
+
+            # we need to call forward to potentially trigger recompilation
+            with torch.inference_mode():
+                output1_after = unet(**dummy_input)["sample"]
+            assert torch.allclose(output1_before, output1_after, atol=tol, rtol=tol)
+
+            # check error when not passing valid adapter name
+            name = "does-not-exist"
+            msg = f"Trying to hotswap LoRA adapter '{name}' but there is no existing adapter by that name"
+            with self.assertRaisesRegex(ValueError, msg):
+                unet.load_lora_adapter(file_name1, adapter_name=name, hotswap=True, prefix=None)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_model(self, rank0, rank1):
+        self.check_model_hotswap(
+            do_compile=False, rank0=rank0, rank1=rank1, target_modules0=["to_q", "to_k", "to_v", "to_out.0"]
+        )
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_compiled_model_linear(self, rank0, rank1):
+        # It's important to add this context to raise an error on recompilation
+        target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_compiled_model_conv2d(self, rank0, rank1):
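+        # same as the Linear test above, but targeting the LoRA layers on the UNet's Conv2d modules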
+        # It's important to add this context to raise an error on recompilation
+        target_modules = ["conv", "conv1", "conv2"]
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_compiled_model_both_linear_and_conv2d(self, rank0, rank1):
+        # It's important to add this context to raise an error on recompilation
+        target_modules = ["to_q", "conv"]
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
+
+    def test_enable_lora_hotswap_called_after_adapter_added_raises(self):
+        # ensure that enable_lora_hotswap is called before loading the first adapter
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config)
+        msg = re.escape("Call `enable_lora_hotswap` before loading the first adapter.")
+        with self.assertRaisesRegex(RuntimeError, msg):
+            unet.enable_lora_hotswap(target_rank=32)
+
+    def test_enable_lora_hotswap_called_after_adapter_added_warning(self):
+        # ensure that enable_lora_hotswap is called before loading the first adapter
+        from diffusers.loaders.peft import logger
+
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config)
+        msg = (
+            "It is recommended to call `enable_lora_hotswap` before loading the first adapter to avoid recompilation."
+        )
+        with self.assertLogs(logger=logger, level="WARNING") as cm:
+            unet.enable_lora_hotswap(target_rank=32, check_compiled="warn")
+        assert any(msg in log for log in cm.output)
+
+    def test_enable_lora_hotswap_called_after_adapter_added_ignore(self):
+        # check possibility to ignore the error/warning
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")  # Capture all warnings
+            unet.enable_lora_hotswap(target_rank=32, check_compiled="warn")
+            self.assertEqual(len(w), 0, f"Expected no warnings, but got: {[str(warn.message) for warn in w]}")
+
+    def test_enable_lora_hotswap_wrong_check_compiled_argument_raises(self):
+        # check that wrong argument value raises an error
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config)
+        msg = re.escape("check_compiled should be one of 'error', 'warn', or 'ignore', got 'wrong-argument' instead.")
+        with self.assertRaisesRegex(ValueError, msg):
+            unet.enable_lora_hotswap(target_rank=32, check_compiled="wrong-argument")
+
+    def test_hotswap_second_adapter_targets_more_layers_raises(self):
+        # check the error and log
+        from diffusers.loaders.peft import logger
+
+        # at the moment, PEFT requires the 2nd adapter to target the same or a subset of layers
+        target_modules0 = ["to_q"]
+        target_modules1 = ["to_q", "to_k"]
+        with self.assertRaises(RuntimeError):  # peft raises RuntimeError
+            with self.assertLogs(logger=logger, level="ERROR") as cm:
+                self.check_model_hotswap(
+                    do_compile=True, rank0=8, rank1=8, target_modules0=target_modules0, target_modules1=target_modules1
+                )
+        assert any("Hotswapping adapter0 was unsuccessful" in log for log in cm.output)
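For reference, here is a minimal sketch of the model-level flow these tests exercise, assuming a `UNet2DConditionModel` with the PEFT backend available and two LoRA checkpoints saved via `save_lora_adapter` (the file names below are placeholders):

```python
import torch

unet = ...  # e.g. a UNet2DConditionModel with LoRA support
# call before loading the first adapter; use the largest rank among the adapters that will be hotswapped
unet.enable_lora_hotswap(target_rank=16)
# prefix=None because the checkpoints were saved from the model via save_lora_adapter, not from a pipeline
unet.load_lora_adapter("adapter0/pytorch_lora_weights.safetensors", adapter_name="adapter0", prefix=None)
unet = torch.compile(unet, mode="reduce-overhead")

# hotswap the second adapter under the existing adapter name; error_on_recompile makes any recompilation fail loudly
with torch._dynamo.config.patch(error_on_recompile=True):
    unet.load_lora_adapter("adapter1/pytorch_lora_weights.safetensors", adapter_name="adapter0", hotswap=True, prefix=None)
    ...  # run inference with the swapped-in adapter; no recompilation should be triggered
```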
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index 48c89d399216..ae5a12e04ba8 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -17,12 +17,14 @@
 import json
 import os
 import random
+import re
 import shutil
 import sys
 import tempfile
 import traceback
 import unittest
 import unittest.mock as mock
+import warnings
 
 import numpy as np
 import PIL.Image
@@ -78,6 +80,8 @@
     require_flax,
     require_hf_hub_version_greater,
     require_onnxruntime,
+    require_peft_backend,
+    require_peft_version_greater,
     require_torch_2,
     require_torch_accelerator,
     require_transformers_version_greater,
@@ -2175,3 +2179,264 @@ def test_ddpm_ddim_equality_batched(self):
 
     # the values aren't exactly equal, but the images look the same visually
     assert np.abs(ddpm_images - ddim_images).max() < 1e-1
+
+
+@slow
+@require_torch_2
+@require_torch_accelerator
+@require_peft_backend
+@require_peft_version_greater("0.14.0")
+@is_torch_compile
+class TestLoraHotSwappingForPipeline(unittest.TestCase):
+    """Test that hotswapping does not result in recompilation in a pipeline.
+
+    We're not extensively testing the hotswapping functionality since it is implemented in PEFT and is extensively
+    tested there. The goal of this test is specifically to ensure that hotswapping with diffusers does not require
+    recompilation.
+
+    See
+    https://github.com/huggingface/peft/blob/eaab05e18d51fb4cce20a73c9acd82a00c013b83/tests/test_gpu_examples.py#L4252
+    for the analogous PEFT test.
+
+    """
+
+    def tearDown(self):
+        # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
+        # there will be recompilation errors, as torch caches the model when run in the same process.
+        super().tearDown()
+        torch._dynamo.reset()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def get_unet_lora_config(self, lora_rank, lora_alpha, target_modules):
+        # from diffusers test_models_unet_2d_condition.py
+        from peft import LoraConfig
+
+        unet_lora_config = LoraConfig(
+            r=lora_rank,
+            lora_alpha=lora_alpha,
+            target_modules=target_modules,
+            init_lora_weights=False,
+            use_dora=False,
+        )
+        return unet_lora_config
+
+    def get_lora_state_dicts(self, modules_to_save, adapter_name):
+        from peft import get_peft_model_state_dict
+
+        state_dicts = {}
+        for module_name, module in modules_to_save.items():
+            if module is not None:
+                state_dicts[f"{module_name}_lora_layers"] = get_peft_model_state_dict(
+                    module, adapter_name=adapter_name
+                )
+        return state_dicts
+
+    def get_dummy_input(self):
+        pipeline_inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "num_inference_steps": 5,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+            "return_dict": False,
+        }
+        return pipeline_inputs
+
+    def check_pipeline_hotswap(self, do_compile, rank0, rank1, target_modules0, target_modules1=None):
+        """
+        Check that hotswapping works on a pipeline.
+
+        Steps:
+        - create 2 LoRA adapters and save them
+        - load the first adapter
+        - hotswap the second adapter
+        - check that the outputs are correct
+        - optionally compile the model
+
+        Note: We set rank == alpha here because save_lora_adapter does not save the alpha scalings, thus the test would
+        fail if the values are different. Since rank != alpha does not matter for the purpose of this test, this is
+        fine.
+        """
+        # create 2 adapters with different ranks and alphas
+        dummy_input = self.get_dummy_input()
+        pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
+        alpha0, alpha1 = rank0, rank1
+        max_rank = max([rank0, rank1])
+        if target_modules1 is None:
+            target_modules1 = target_modules0[:]
+        lora_config0 = self.get_unet_lora_config(rank0, alpha0, target_modules0)
+        lora_config1 = self.get_unet_lora_config(rank1, alpha1, target_modules1)
+
+        torch.manual_seed(0)
+        pipeline.unet.add_adapter(lora_config0, adapter_name="adapter0")
+        output0_before = pipeline(**dummy_input, generator=torch.manual_seed(0))[0]
+
+        torch.manual_seed(1)
+        pipeline.unet.add_adapter(lora_config1, adapter_name="adapter1")
+        pipeline.unet.set_adapter("adapter1")
+        output1_before = pipeline(**dummy_input, generator=torch.manual_seed(0))[0]
+
+        # sanity check
+        tol = 1e-3
+        assert not np.allclose(output0_before, output1_before, atol=tol, rtol=tol)
+        assert not (output0_before == 0).all()
+        assert not (output1_before == 0).all()
+
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            # save the adapter checkpoints
+            lora0_state_dicts = self.get_lora_state_dicts({"unet": pipeline.unet}, adapter_name="adapter0")
+            StableDiffusionPipeline.save_lora_weights(
+                save_directory=os.path.join(tmp_dirname, "adapter0"), safe_serialization=True, **lora0_state_dicts
+            )
+            lora1_state_dicts = self.get_lora_state_dicts({"unet": pipeline.unet}, adapter_name="adapter1")
+            StableDiffusionPipeline.save_lora_weights(
+                save_directory=os.path.join(tmp_dirname, "adapter1"), safe_serialization=True, **lora1_state_dicts
+            )
+            del pipeline
+
+            # load the first adapter
+            pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
+            if do_compile or (rank0 != rank1):
+                # prepare the pipeline for hotswapping; this is only needed if the model is compiled or if the ranks differ
+                pipeline.enable_lora_hotswap(target_rank=max_rank)
+
+            file_name0 = os.path.join(tmp_dirname, "adapter0", "pytorch_lora_weights.safetensors")
+            file_name1 = os.path.join(tmp_dirname, "adapter1", "pytorch_lora_weights.safetensors")
+
+            pipeline.load_lora_weights(file_name0)
+            if do_compile:
+                pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead")
+
+            output0_after = pipeline(**dummy_input, generator=torch.manual_seed(0))[0]
+
+            # sanity check: still same result
+            assert np.allclose(output0_before, output0_after, atol=tol, rtol=tol)
+
+            # hotswap the 2nd adapter
+            pipeline.load_lora_weights(file_name1, hotswap=True, adapter_name="default_0")
+            output1_after = pipeline(**dummy_input, generator=torch.manual_seed(0))[0]
+
+            # sanity check: since it's the same LoRA, the results should be identical
+            assert np.allclose(output1_before, output1_after, atol=tol, rtol=tol)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_pipeline(self, rank0, rank1):
+        self.check_pipeline_hotswap(
+            do_compile=False, rank0=rank0, rank1=rank1, target_modules0=["to_q", "to_k", "to_v", "to_out.0"]
+        )
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_compiled_pipeline_linear(self, rank0, rank1):
+        # It's important to add this context to raise an error on recompilation
+        target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_compiled_pipeline_conv2d(self, rank0, rank1):
+        # It's important to add this context to raise an error on recompilation
+        target_modules = ["conv", "conv1", "conv2"]
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_compiled_pipeline_both_linear_and_conv2d(self, rank0, rank1):
+        # It's important to add this context to raise an error on recompilation
+        target_modules = ["to_q", "conv"]
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
+
+    def test_enable_lora_hotswap_called_after_adapter_added_raises(self):
+        # ensure that enable_lora_hotswap is called before loading the first adapter
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
+        pipeline.unet.add_adapter(lora_config)
+        msg = re.escape("Call `enable_lora_hotswap` before loading the first adapter.")
+        with self.assertRaisesRegex(RuntimeError, msg):
+            pipeline.enable_lora_hotswap(target_rank=32)
+
+    def test_enable_lora_hotswap_called_after_adapter_added_warns(self):
+        # ensure that enable_lora_hotswap is called before loading the first adapter
+        from diffusers.loaders.peft import logger
+
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
+        pipeline.unet.add_adapter(lora_config)
+        msg = (
+            "It is recommended to call `enable_lora_hotswap` before loading the first adapter to avoid recompilation."
+        )
+        with self.assertLogs(logger=logger, level="WARNING") as cm:
+            pipeline.enable_lora_hotswap(target_rank=32, check_compiled="warn")
+        assert any(msg in log for log in cm.output)
+
+    def test_enable_lora_hotswap_called_after_adapter_added_ignore(self):
+        # check possibility to ignore the error/warning
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
+        pipeline.unet.add_adapter(lora_config)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")  # Capture all warnings
+            pipeline.enable_lora_hotswap(target_rank=32, check_compiled="warn")
+            self.assertEqual(len(w), 0, f"Expected no warnings, but got: {[str(warn.message) for warn in w]}")
+
+    def test_enable_lora_hotswap_wrong_check_compiled_argument_raises(self):
+        # check that wrong argument value raises an error
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
+        pipeline.unet.add_adapter(lora_config)
+        msg = re.escape("check_compiled should be one of 'error', 'warn', or 'ignore', got 'wrong-argument' instead.")
+        with self.assertRaisesRegex(ValueError, msg):
+            pipeline.enable_lora_hotswap(target_rank=32, check_compiled="wrong-argument")
+
+    def test_hotswap_second_adapter_targets_more_layers_raises(self):
+        # check the error and log
+        from diffusers.loaders.peft import logger
+
+        # at the moment, PEFT requires the 2nd adapter to target the same or a subset of layers
+        target_modules0 = ["to_q"]
+        target_modules1 = ["to_q", "to_k"]
+        with self.assertRaises(RuntimeError):  # peft raises RuntimeError
+            with self.assertLogs(logger=logger, level="ERROR") as cm:
+                self.check_pipeline_hotswap(
+                    do_compile=True, rank0=8, rank1=8, target_modules0=target_modules0, target_modules1=target_modules1
+                )
+        assert any("Hotswapping adapter0 was unsuccessful" in log for log in cm.output)
+
+    def test_hotswap_component_not_supported_raises(self):
+        # right now, some components don't support hotswapping, e.g. the text_encoder
+        from peft import LoraConfig
+
+        pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
+        lora_config0 = LoraConfig(target_modules=["q_proj"])
+        lora_config1 = LoraConfig(target_modules=["q_proj"])
+
+        pipeline.text_encoder.add_adapter(lora_config0, adapter_name="adapter0")
+        pipeline.text_encoder.add_adapter(lora_config1, adapter_name="adapter1")
+
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            # save the adapter checkpoints
+            lora0_state_dicts = self.get_lora_state_dicts(
+                {"text_encoder": pipeline.text_encoder}, adapter_name="adapter0"
+            )
+            StableDiffusionPipeline.save_lora_weights(
+                save_directory=os.path.join(tmp_dirname, "adapter0"), safe_serialization=True, **lora0_state_dicts
+            )
+            lora1_state_dicts = self.get_lora_state_dicts(
+                {"text_encoder": pipeline.text_encoder}, adapter_name="adapter1"
+            )
+            StableDiffusionPipeline.save_lora_weights(
+                save_directory=os.path.join(tmp_dirname, "adapter1"), safe_serialization=True, **lora1_state_dicts
+            )
+            del pipeline
+
+            # load the first adapter
+            pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device)
+            file_name0 = os.path.join(tmp_dirname, "adapter0", "pytorch_lora_weights.safetensors")
+            file_name1 = os.path.join(tmp_dirname, "adapter1", "pytorch_lora_weights.safetensors")
+
+            pipeline.load_lora_weights(file_name0)
+            msg = re.escape(
+                "At the moment, hotswapping is not supported for text encoders, please pass `hotswap=False`"
+            )
+            with self.assertRaisesRegex(ValueError, msg):
+                pipeline.load_lora_weights(file_name1, hotswap=True, adapter_name="default_0")