@@ -16,7 +16,6 @@
 from QEfficient.base.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader
 from QEfficient.base.modeling_qeff import QEFFBaseModel
 from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort
-from QEfficient.lora.auto import QEffAutoLoraModelForCausalLM
 from QEfficient.transformers.modeling_utils import get_lists_of_cb_qeff_models
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
 from QEfficient.utils import load_hf_tokenizer
@@ -149,7 +148,6 @@ def convert_to_cloud_kvstyle(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     onnx_dir_path: str,
     seq_len: int,
-    max_num_adapters: int,
 ) -> str:
     """
     API to convert model with kv retention and export to ONNX.
@@ -178,7 +176,7 @@ def convert_to_cloud_kvstyle(
 
     # Decide path for saving exported ONNX files.
     model_name = export_kvstyle_transformed_model_to_onnx(
-        model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len, max_num_adapters
+        model_name, qeff_model.model, tokenizer, onnx_dir_path, seq_len
     )  # type: ignore
 
     # return the model path for automation.
@@ -192,7 +190,6 @@ def export_kvstyle_transformed_model_to_onnx(
     onnx_dir_path: str,
     seq_len: int,
     full_batch_size: Optional[int] = None,
-    max_num_adapters: Optional[int] = None,
 ) -> str:
     # Disabling requires_grad on all parameters
     for _, p in enumerate(transformed_model.parameters()):
@@ -211,7 +208,6 @@ def export_kvstyle_transformed_model_to_onnx(
         prompt_len=Constants.PROMPT_LEN,
         ctx_len=seq_len,
         full_batch_size=full_batch_size,
-        max_num_adapters=max_num_adapters,
     )
 
     inputs = input_handler.prepare_pytorch_inputs()
@@ -319,7 +315,6 @@ def export_for_cloud(
     onnx_dir_path: str,
     seq_length: int = Constants.SEQ_LEN,
     full_batch_size: Optional[int] = None,
-    max_num_adapters: Optional[int] = None,
 ) -> str:
     # Check if model architecture is supported for continuous batching.
     if full_batch_size and qeff_model.model.config.architectures[0].lower() not in {
@@ -330,18 +325,14 @@ def export_for_cloud(
         )
 
     # FIXME: move all this to class instead of here, and just call qeff_model.export here.
-    if (
-        AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM
-        or qeff_model.__class__ == QEffAutoLoraModelForCausalLM
-    ):  # type: ignore
+    if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM:  # type: ignore
         return export_lm_model_for_cloud(
             model_name=model_name,
             qeff_model=qeff_model,  # type: ignore
             tokenizer=tokenizer,
             onnx_dir_path=onnx_dir_path,
             seq_length=seq_length,
             full_batch_size=full_batch_size,
-            max_num_adapters=max_num_adapters,
         )
     else:
         raise NotImplementedError(
@@ -356,7 +347,6 @@ def export_lm_model_for_cloud(
     onnx_dir_path: str,
     seq_length: int,
     full_batch_size: Optional[int] = None,
-    max_num_adapters: Optional[int] = None,
 ) -> str:
     if os.path.exists(onnx_dir_path):
         logger.warning(f"Overriding {onnx_dir_path}")
@@ -385,7 +375,6 @@ def qualcomm_efficient_converter(
     kv: bool = True,
     form_factor: str = "cloud",
     full_batch_size: Optional[int] = None,
-    max_num_adapters: Optional[int] = None,
 ) -> Tuple[str, str]:
     """
     This method is an alias for ``QEfficient.export``.
@@ -461,7 +450,6 @@ def qualcomm_efficient_converter(
             onnx_dir_path=onnx_dir_path,
             seq_length=seq_length,
             full_batch_size=full_batch_size,
-            max_num_adapters=max_num_adapters,
         )
         return onnx_dir_path, generated_onnx_model_path
     else: