1
1
import logging
2
2
import os
3
+ import re
3
4
import traceback
5
+ import xml .etree .ElementTree as ET
6
+ from io import BytesIO
4
7
5
8
import librosa
6
- import re
7
9
import numpy as np
8
- import xml .etree .ElementTree as ET
9
-
10
- from contants import config
11
10
import soundfile as sf
12
- from io import BytesIO
13
11
from graiax import silkcoder
14
- from contants import ModelType
15
12
from scipy .signal import resample_poly
16
13
14
+ from contants import ModelType
15
+ from contants import config
17
16
from logger import logger
18
17
from manager .observer import Observer
19
18
from utils .data_utils import check_is_none
@@ -248,9 +247,10 @@ def process_ssml_infer_task(self, tasks, format):
248
247
raise ValueError (f"Unsupported model type: { task .get ('model_type' )} " )
249
248
model_type = ModelType (model_type_str )
250
249
model = self .get_model (model_type , task .get ("id" ))
251
- task ["id" ] = self .get_real_id (model_type , task .get ("id" ))
252
250
sampling_rates .append (model .sampling_rate )
253
251
last_sampling_rate = model .sampling_rate
252
+
253
+ # self.logger.debug(model, model.sampling_rate, task)
254
254
audio = self .infer_map [model_type ](task , encode = False )
255
255
audios .append (audio )
256
256
# 得到最高的采样率
@@ -394,7 +394,7 @@ def bert_vits2_infer(self, state, encode=True):
394
394
state ["text" ] = re .sub (r'\s+' , ' ' , state ["text" ]).strip ()
395
395
sampling_rate = model .sampling_rate
396
396
sentences_list = sentence_split (state ["text" ], state ["segment_size" ])
397
-
397
+
398
398
if model .zh_bert_extra :
399
399
infer_func = model .infer
400
400
state ["lang" ] = "zh"
@@ -411,7 +411,7 @@ def bert_vits2_infer(self, state, encode=True):
411
411
state ["text" ] = sentences
412
412
audio = infer_func (** state )
413
413
audios .append (audio )
414
-
414
+
415
415
audio = np .concatenate (audios )
416
416
417
417
return self .encode (sampling_rate , audio , state ["format" ]) if encode else audio
@@ -430,7 +430,7 @@ def stream_bert_vits2_infer(self, state, encode=True):
430
430
infer_func = model .infer_multilang
431
431
else :
432
432
infer_func = model .infer
433
-
433
+
434
434
for sentences in sentences_list :
435
435
state ["text" ] = sentences
436
436
audio = infer_func (** state )
0 commit comments