API Reference
️🎞 Subtitles generation tool (Web-UI + CLI + Python package) powered by OpenAI's Whisper and its variants 🎞️
subsai.main
SubsAI: Subtitles AI. A subtitles generation tool powered by OpenAI's Whisper and its variants.
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/.
SubsAI
Subs AI class
Example usage:
file = './assets/test1.mp4'
subs_ai = SubsAI()
model = subs_ai.create_model('openai/whisper', {'model_type': 'base'})
subs = subs_ai.transcribe(file, model)
subs.save('test1.srt')
available_models
staticmethod
available_models()
Returns the supported models
Returns:
-
list
–list of available models
Source code in src/subsai/main.py
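For example, a minimal sketch of listing the registered models (assuming the subsai package exposes SubsAI at the top level; adjust the import to your install if it differs):
from subsai import SubsAI
# print every model name that can be passed to create_model / transcribe
for name in SubsAI.available_models():
    print(name)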
model_info
staticmethod
model_info(model)
Returns general info about the model (brief description and URL)
Parameters:
-
model
(str
) –model name
Returns:
-
dict
–dict of infos
Source code in src/subsai/main.py
config_schema
staticmethod
config_schema(model)
Returns the configs associated with a model
Parameters:
-
model
(str
) –model name
Returns:
-
dict
–dict of configs
Source code in src/subsai/main.py
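A hedged sketch of inspecting a model before creating it; the 'openai/whisper' name comes from the class example above, and the printed fields follow the schema layout (type/description/options/default) shown throughout this page:
from subsai import SubsAI
print(SubsAI.model_info('openai/whisper'))  # dict with 'description' and 'url'
schema = SubsAI.config_schema('openai/whisper')
for key, spec in schema.items():
    print(key, spec['type'], spec['default'])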
create_model
staticmethod
create_model(model_name, model_config={})
Returns a model instance
Parameters:
-
model_name
(str
) –the name of the model
-
model_config
(dict
, default:{}
) –the configuration dict
Returns:
-
AbstractModel
–the model instance
Source code in src/subsai/main.py
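A minimal sketch of creating a model with a non-default configuration; the config keys come from the 'openai/whisper' schema documented below, and 'cpu' is assumed to be among the available devices on your machine:
from subsai import SubsAI
model = SubsAI.create_model('openai/whisper', {'model_type': 'small', 'device': 'cpu'})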
transcribe
staticmethod
transcribe(media_file, model, model_config={})
Takes a model instance (created by :func:create_model) or a model name.
Returns a :class:pysubs2.SSAFile (see https://pysubs2.readthedocs.io/en/latest/api-reference.html#ssafile-a-subtitle-file).
Parameters:
-
media_file
(str
) –path of the media file (video/audio)
-
model
(Union[AbstractModel, str]
) –model instance or model name
-
model_config
(dict
, default:{}
) –model configs' dict
Returns:
-
SSAFile
–SSAFile: list of subtitles
Source code in src/subsai/main.py
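Since transcribe also accepts a model name, the class example above can be condensed into a short sketch (the file path is illustrative):
from subsai import SubsAI
subs = SubsAI.transcribe('./assets/test1.mp4', 'openai/whisper', {'model_type': 'base'})
subs.save('test1.srt')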
Tools
Tools()
Some tools related to subtitles processing (e.g. translation)
Source code in src/subsai/main.py
available_translation_models
staticmethod
available_translation_models()
Returns available translation models
A simple link to :func:utils.available_translation_models
for easy access
Returns:
-
list
–list of available models
Source code in src/subsai/main.py
available_translation_languages
staticmethod
available_translation_languages(model)
Returns the languages supported by the translation model
Parameters:
-
model
(Union[str, TranslationModel]
) –the name of the model
Returns:
-
list
–list of available languages
Source code in src/subsai/main.py
create_translation_model
staticmethod
create_translation_model(
model_name="m2m100", model_family=None
)
Creates and returns a translation model instance.
Parameters:
-
model_name
(str
, default:'m2m100'
) –name of the model. To get available models use :func:
available_translation_models
-
model_family
(str
, default:None
) –Either "mbart50" or "m2m100". See the dl-translate docs for the default behavior.
Returns:
-
TranslationModel
–A translation model instance
Source code in src/subsai/main.py
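A short sketch combining the two helpers above to pick a translation model and check its languages; 'm2m100' is the documented default model name:
from subsai import Tools
tr_model = Tools.create_translation_model('m2m100')
print(Tools.available_translation_languages(tr_model))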
translate
staticmethod
translate(
subs,
source_language,
target_language,
model="m2m100",
model_family=None,
translation_configs={},
)
Translates a subtitles SSAFile object (the same type returned by :func:SubsAI.transcribe).
Parameters:
-
subs
(SSAFile
) –SSAFile
object -
source_language
(str
) –the language of the subtitles
-
target_language
(str
) –the target language
-
model
(Union[str, TranslationModel]
, default:'m2m100'
) –the translation model, either an
str
or the model instance created by :func:create_translation_model
-
model_family
(str
, default:None
) –Either "mbart50" or "m2m100". See the dl-translate docs for the default behavior.
-
translation_configs
(dict
, default:{}
) –dict of translation configs (see :attr:
configs.ADVANCED_TOOLS_CONFIGS
)
Returns:
-
SSAFile
–returns an
SSAFile
subtitles translated to the target language
Source code in src/subsai/main.py
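For example, a hedged sketch that translates the subtitles produced earlier; the language names must match what available_translation_languages reports ('English' and 'French' are assumptions here):
from subsai import Tools
translated_subs = Tools.translate(subs,
                                  source_language='English',
                                  target_language='French',
                                  model='m2m100')
translated_subs.save('test1-fr.srt')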
auto_sync
staticmethod
auto_sync(subs, media_file, **kwargs)
Uses [ffsubsync](https://github.com/smacke/ffsubsync) to auto-sync subtitles to the media file
Parameters:
-
subs
(SSAFile
) –SSAFile
file -
media_file
(str
) –path of the media_file
-
kwargs
–configs to pass to ffsubsync (see :attr:
configs.ADVANCED_TOOLS_CONFIGS
)
Returns:
-
SSAFile
–SSAFile
auto-synced
Source code in src/subsai/main.py
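A minimal sketch, assuming subs and the media file come from the transcription example above; extra keyword arguments are forwarded to ffsubsync (see :attr:configs.ADVANCED_TOOLS_CONFIGS):
from subsai import Tools
synced_subs = Tools.auto_sync(subs, './assets/test1.mp4')
synced_subs.save('test1-synced.srt')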
merge_subs_with_video
staticmethod
merge_subs_with_video(
subs, media_file, output_filename=None, **kwargs
)
Uses ffmpeg to merge subtitles into a video media file.
You can merge multiple subtitles at the same time by providing a dict with (lang, SSAFile object) key/value pairs
Example:
file = '../../assets/video/test1.webm'
subs_ai = SubsAI()
model = subs_ai.create_model('openai/whisper', {'model_type': 'tiny'})
en_subs = subs_ai.transcribe(file, model)
ar_subs = pysubs2.load('../../assets/video/test0-ar.srt')
Tools.merge_subs_with_video({'English': en_subs, 'Arabic': ar_subs}, file)
Parameters:
-
subs
(Dict[str, SSAFile]
) –dict with (lang,
SSAFile
object) key,value pairs -
media_file
(str
) –path of the video media_file
-
output_filename
(str
, default:None
) –Output file name (without the extension as it will be inferred from the media file)
Returns:
-
str
–Absolute path of the output file
Source code in src/subsai/main.py
subsai.models
abstract_model
API that the transcription models should follow
AbstractModel
AbstractModel(model_name=None, model_config={})
Bases: ABC
Abstract Model class
Source code in src/subsai/models/abstract_model.py
model_name
instance-attribute
model_name = model_name
model_config
instance-attribute
model_config = model_config
transcribe
abstractmethod
transcribe(media_file)
Transcribe the media_file
to subtitles.
Example use case from pysubs2.whisper:
subs = SSAFile()
for segment in segments:
    event = SSAEvent(start=make_time(s=segment["start"]), end=make_time(s=segment["end"]))
    event.plaintext = segment["text"].strip()
    subs.append(event)
Parameters:
-
media_file
–Path of the media file
Returns:
-
SSAFile
–Collection of SSAEvent(s) (see :mod:
pysubs2.ssaevent
)
Source code in src/subsai/models/abstract_model.py
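A hedged sketch of a custom model that follows this API; the constructor mirrors AbstractModel(model_name=None, model_config={}) shown above, the 'example/echo' name is hypothetical, and the body emits a placeholder event rather than doing real transcription:
from pysubs2 import SSAFile, SSAEvent, make_time
from subsai.models.abstract_model import AbstractModel

class EchoModel(AbstractModel):
    # toy model: returns a single fixed subtitle instead of a real transcription
    def __init__(self, model_config={}):
        super().__init__(model_name='example/echo', model_config=model_config)

    def transcribe(self, media_file) -> SSAFile:
        subs = SSAFile()
        event = SSAEvent(start=make_time(s=0), end=make_time(s=2))
        event.plaintext = f'placeholder for {media_file}'
        subs.append(event)
        return subs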
faster_whisper_model
Faster Whisper Model
See guillaumekln/faster-whisper
FasterWhisperModel
FasterWhisperModel(model_config)
Bases: AbstractModel
Source code in src/subsai/models/faster_whisper_model.py
model_name
class-attribute
instance-attribute
model_name = 'guillaumekln/faster-whisper'
config_schema
class-attribute
instance-attribute
config_schema = {
"model_size_or_path": {
"type": list,
"description": 'Size of the model to use (e.g. "large-v2", "small", "tiny.en", etc.)or a path to a converted model directory. When a size is configured, the convertedmodel is downloaded from the Hugging Face Hub.',
"options": available_models(),
"default": "base",
},
"device": {
"type": list,
"description": 'Device to use for computation ("cpu", "cuda", "auto")',
"options": ["auto", "cpu", "cuda"],
"default": "auto",
},
"device_index": {
"type": int,
"description": "Device ID to use.The model can also be loaded on multiple GPUs by passing a list of IDs(e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallelwhen transcribe() is called from multiple Python threads (see also num_workers).",
"options": None,
"default": 0,
},
"compute_type": {
"type": str,
"description": "Type to use for computation.See https://opennmt.net/CTranslate2/quantization.html.",
"options": None,
"default": "default",
},
"cpu_threads": {
"type": int,
"description": "Number of threads to use when running on CPU (4 by default).A non zero value overrides the OMP_NUM_THREADS environment variable.",
"options": None,
"default": 0,
},
"num_workers": {
"type": int,
"description": "When transcribe() is called from multiple Python threads,having multiple workers enables true parallelism when running the model(concurrent calls to self.model.generate() will run in parallel).This can improve the global throughput at the cost of increased memory usage.",
"options": None,
"default": 1,
},
"temperature": {
"type": Tuple,
"description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
"options": None,
"default": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
},
"compression_ratio_threshold": {
"type": float,
"description": "If the gzip compression ratio is above this value, treat as failed",
"options": None,
"default": 2.4,
},
"log_prob_threshold": {
"type": float,
"description": "If the average log probability over sampled tokens is below this value, treat as failed",
"options": None,
"default": -1.0,
},
"no_speech_threshold": {
"type": float,
"description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
"options": None,
"default": 0.6,
},
"condition_on_previous_text": {
"type": bool,
"description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
"options": None,
"default": True,
},
"task": {
"type": list,
"description": "whether to perform X->X 'transcribe' or X->English 'translate'",
"options": ["transcribe", "translate"],
"default": "transcribe",
},
"language": {
"type": str,
"description": "language that the audio is in; uses detected language if None",
"options": None,
"default": None,
},
"best_of": {
"type": int,
"description": "number of independent samples to collect, when t > 0",
"options": None,
"default": 5,
},
"beam_size": {
"type": int,
"description": "number of beams in beam search, when t == 0",
"options": None,
"default": 5,
},
"patience": {
"type": float,
"description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
"options": None,
"default": 1.0,
},
"length_penalty": {
"type": float,
"description": "'alpha' in Google NMT, None defaults to length norm",
"options": None,
"default": 1.0,
},
"prefix": {
"type": str,
"description": "text or tokens to prefix the current context",
"options": None,
"default": None,
},
"suppress_blank": {
"type": bool,
"description": "this will suppress blank outputs",
"options": None,
"default": True,
},
"suppress_tokens": {
"type": Tuple,
"description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
"options": None,
"default": [-1],
},
"without_timestamps": {
"type": bool,
"description": "use <|notimestamps|> to sample text tokens only",
"options": None,
"default": False,
},
"max_initial_timestamp": {
"type": float,
"description": "the initial timestamp cannot be later than this",
"options": None,
"default": 1.0,
},
"initial_prompt": {
"type": str,
"description": "Optional text to provide as a prompt for the first window.",
"options": None,
"default": None,
},
"word_timestamps": {
"type": bool,
"description": "Extract word-level timestamps using the cross-attention patternand dynamic time warping, and include the timestamps for each word in each segment.",
"options": None,
"default": False,
},
"prepend_punctuations": {
"type": str,
"description": "If word_timestamps is True, merge these punctuation symbolswith the next word",
"options": None,
"default": "\"'“¿([{-",
},
"append_punctuations": {
"type": str,
"description": "If word_timestamps is True, merge these punctuation symbolswith the previous word",
"options": None,
"default": "\"'.。,,!!??::”)]}、",
},
"vad_filter": {
"type": bool,
"description": "If True, use the integrated Silero VAD model to filter out parts of the audio without speech.",
"options": None,
"default": False,
},
"vad_parameters": {
"type": dict,
"description": "Parameters for splitting long audios into speech chunks using silero VAD.",
"options": None,
"default": {
"threshold": 0.5,
"min_speech_duration_ms": 250,
"max_speech_duration_s": float("inf"),
"min_silence_duration_ms": 2000,
"window_size_samples": 1024,
"speech_pad_ms": 400,
},
},
}
transcribe_configs
instance-attribute
transcribe_configs = {
config: _load_config(
config, model_config, config_schema
)
for config in config_schema
if not hasattr(self, f"_{config}")
}
model
instance-attribute
model = WhisperModel(
model_size_or_path=_model_size_or_path,
device=_device,
device_index=_device_index,
compute_type=_compute_type,
cpu_threads=_cpu_threads,
num_workers=_num_workers,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/faster_whisper_model.py
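As an example, a hedged sketch of creating this backend through SubsAI with a few of the configs listed above (keys and values come from the schema; the file path is illustrative):
from subsai import SubsAI
model = SubsAI.create_model('guillaumekln/faster-whisper',
                            {'model_size_or_path': 'base',
                             'device': 'auto',
                             'vad_filter': True})
subs = SubsAI.transcribe('./assets/test1.mp4', model)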
hugging_face_model
Hugging Face Model
See automatic-speech-recognition
devices
module-attribute
devices = get_available_devices()
HuggingFaceModel
HuggingFaceModel(model_config)
Bases: AbstractModel
Source code in src/subsai/models/hugging_face_model.py
model_name
class-attribute
instance-attribute
model_name = 'HuggingFaceModel'
config_schema
class-attribute
instance-attribute
config_schema = {
"model_id": {
"type": str,
"description": "The model id from the Hugging Face Hub.",
"options": None,
"default": "openai/whisper-tiny",
},
"device": {
"type": list,
"description": "Pytorch device",
"options": devices,
"default": devices[0],
},
"segment_type": {
"type": list,
"description": "Sentence-level or word-level timestamps",
"options": ["sentence", "word"],
"default": "sentence",
},
"chunk_length_s": {
"type": float,
"description": "(`float`, *optional*, defaults to 0):The input length for in each chunk. If `chunk_length_s = 0` then chunking is disabled (default).",
"options": None,
"default": 30,
},
}
segment_type
instance-attribute
segment_type = _load_config(
"segment_type", model_config, config_schema
)
model
instance-attribute
model = pipeline(
"automatic-speech-recognition",
model=_model_id,
device=_device,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/hugging_face_model.py
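A minimal sketch using the schema above; 'openai/whisper-tiny' is the documented default model_id, and 'cpu' is assumed to be among the devices reported by get_available_devices:
from subsai import SubsAI
model = SubsAI.create_model('HuggingFace',
                            {'model_id': 'openai/whisper-tiny',
                             'device': 'cpu',
                             'segment_type': 'sentence'})
subs = SubsAI.transcribe('./assets/test1.mp4', model)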
stable_ts_model
Stable-ts Model
StableTsModel
StableTsModel(model_config)
Bases: AbstractModel
Source code in src/subsai/models/stable_ts_model.py
model_name
class-attribute
instance-attribute
model_name = 'jianfch/stable-ts'
config_schema
class-attribute
instance-attribute
config_schema = {
"model_type": {
"type": list,
"description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
"options": available_models(),
"default": "base",
},
"device": {
"type": list,
"description": "The PyTorch device to put the model into",
"options": [None, *get_available_devices()],
"default": None,
},
"in_memory": {
"type": bool,
"description": "bool, default False, Whether to preload the model weights into host memory.",
"options": None,
"default": False,
},
"cpu_preload": {
"type": bool,
"description": "Load model into CPU memory first then move model to specified device to reduce GPU memory usage when loading model",
"options": None,
"default": True,
},
"dq": {
"type": bool,
"description": "Whether to apply Dynamic Quantization to model to reduced memory usage and increase inference speed but at the cost of a slight decrease in accuracy. Only for CPU.",
"options": None,
"default": False,
},
"verbose": {
"type": bool,
"description": "Whether to display the text being decoded to the console. If True, displays all the details,If False, displays minimal details. If None, does not display anything",
"options": None,
"default": None,
},
"temperature": {
"type": Tuple,
"description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
"options": None,
"default": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
},
"compression_ratio_threshold": {
"type": float,
"description": "If the gzip compression ratio is above this value, treat as failed",
"options": None,
"default": 2.4,
},
"logprob_threshold": {
"type": float,
"description": "If the average log probability over sampled tokens is below this value, treat as failed",
"options": None,
"default": -1.0,
},
"no_speech_threshold": {
"type": float,
"description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
"options": None,
"default": 0.6,
},
"condition_on_previous_text": {
"type": bool,
"description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
"options": None,
"default": True,
},
"initial_prompt": {
"type": str,
"description": "Optional text to provide as a prompt for the first window.",
"options": None,
"default": None,
},
"word_timestamps": {
"type": bool,
"description": "Extract word-level timestamps using the cross-attention patternand dynamic time warping, and include the timestamps for each word in each segment.",
"options": None,
"default": True,
},
"regroup": {
"type": bool,
"description": "default True, meaning the default regroup algorithmString for customizing the regrouping algorithm. False disables regrouping.Ignored if ``word_timestamps = False``.",
"options": None,
"default": True,
},
"ts_num": {
"type": int,
"description": "meaning disable this optionNumber of extra timestamp inferences to perform then use average of these extra timestamps.An experimental option that might hurt performance.",
"options": None,
"default": 0,
},
"ts_noise": {
"type": float,
"description": "Percentage of noise to add to audio_features to perform inferences for ``ts_num``.",
"options": None,
"default": 0.1,
},
"suppress_silence": {
"type": bool,
"description": "Whether to enable timestamps adjustments based on the detected silence.",
"options": None,
"default": True,
},
"suppress_word_ts": {
"type": bool,
"description": "Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.",
"options": None,
"default": True,
},
"q_levels": {
"type": int,
"description": "Quantization levels for generating timestamp suppression mask; ignored if ``vad = true``.Acts as a threshold to marking sound as silent.Fewer levels will increase the threshold of volume at which to mark a sound as silent.",
"options": None,
"default": 20,
},
"k_size": {
"type": int,
"description": "Kernel size for avg-pooling waveform to generate timestamp suppression mask; ignored if ``vad = true``.Recommend 5 or 3; higher sizes will reduce detection of silence.",
"options": None,
"default": 5,
},
"time_scale": {
"type": float,
"description": "Factor for scaling audio duration for inference.Greater than 1.0 'slows down' the audio, and less than 1.0 'speeds up' the audio. None is same as 1.0.A factor of 1.5 will stretch 10s audio to 15s for inference. This increases the effective resolutionof the model but can increase word error rate.",
"options": None,
"default": None,
},
"demucs": {
"type": bool,
"description": "Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance ofa Demucs model to avoid reloading the model for each run.Demucs must be installed to use. Official repo. https://github.com/facebookresearch/demucs.",
"options": None,
"default": False,
},
"demucs_output": {
"type": str,
"description": "Path to save the vocals isolated by Demucs as WAV file. Ignored if ``demucs = False``.Demucs must be installed to use. Official repo. https://github.com/facebookresearch/demucs.",
"options": None,
"default": None,
},
"demucs_options": {
"type": dict,
"description": "Options to use for :func:`stable_whisper.audio.demucs_audio`.",
"options": None,
"default": None,
},
"vad": {
"type": bool,
"description": "Whether to use Silero VAD to generate timestamp suppression mask.Silero VAD requires PyTorch 1.12.0+. Official repo, https://github.com/snakers4/silero-vad.",
"options": None,
"default": False,
},
"vad_threshold": {
"type": float,
"description": "Threshold for detecting speech with Silero VAD. Low threshold reduces false positives for silence detection.",
"options": None,
"default": 0.35,
},
"vad_onnx": {
"type": bool,
"description": "Whether to use ONNX for Silero VAD.",
"options": None,
"default": False,
},
"min_word_dur": {
"type": float,
"description": "Shortest duration each word is allowed to reach for silence suppression.",
"options": None,
"default": 0.1,
},
"only_voice_freq": {
"type": bool,
"description": "Whether to only use sound between 200 - 5000 Hz, where majority of human speech are.",
"options": None,
"default": False,
},
"prepend_punctuations": {
"type": str,
"description": "If word_timestamps is True, merge these punctuation symbolswith the next word",
"options": None,
"default": "\"'“¿([{-",
},
"append_punctuations": {
"type": str,
"description": "If word_timestamps is True, merge these punctuation symbolswith the previous word",
"options": None,
"default": "\"'.。,,!!??::”)]}、",
},
"mel_first": {
"type": bool,
"description": "Process entire audio track into log-Mel spectrogram first instead in chunks.Used if odd behavior seen in stable-ts but not in whisper, but use significantly more memory for long audio.",
"options": None,
"default": False,
},
"suppress_ts_tokens": {
"type": bool,
"description": " Whether to suppress timestamp tokens during inference for timestamps are detected at silent.Reduces hallucinations in some cases, but also prone to ignore disfluencies and repetitions.This option is ignored if ``suppress_silence = False``.",
"options": None,
"default": False,
},
"gap_padding": {
"type": str,
"description": "Padding prepend to each segments for word timing alignment.Used to reduce the probability of model predicting timestamps earlier than the first utterance.",
"options": None,
"default": "...",
},
"only_ffmpeg": {
"type": bool,
"description": "Whether to use only FFmpeg (instead of not yt-dlp) for URls",
"options": None,
"default": False,
},
"max_instant_words": {
"type": float,
"description": "If percentage of instantaneous words in a segment exceed this amount, the segment is removed.",
"options": None,
"default": 0.5,
},
"avg_prob_threshold": {
"type": float,
"description": "Transcribe the gap after the previous word and if the average word proababiliy of a segment falls below thisvalue, discard the segment. If ``None``, skip transcribing the gap to reduce chance of timestamps startingbefore the next utterance.",
"options": None,
"default": None,
},
"ignore_compatibility": {
"type": bool,
"description": "Whether to ignore warnings for compatibility issues with the detected Whisper version.",
"options": None,
"default": False,
},
"task": {
"type": list,
"description": "whether to perform X->X 'transcribe' or X->English 'translate'",
"options": ["transcribe", "translate"],
"default": "transcribe",
},
"language": {
"type": str,
"description": "language that the audio is in; uses detected language if None",
"options": None,
"default": None,
},
"sample_len": {
"type": int,
"description": "maximum number of tokens to sample",
"options": None,
"default": None,
},
"best_of": {
"type": int,
"description": "number of independent samples to collect, when t > 0",
"options": None,
"default": None,
},
"beam_size": {
"type": int,
"description": "number of beams in beam search, when t == 0",
"options": None,
"default": None,
},
"patience": {
"type": float,
"description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
"options": None,
"default": None,
},
"length_penalty": {
"type": float,
"description": "'alpha' in Google NMT, None defaults to length norm",
"options": None,
"default": None,
},
"prompt": {
"type": str,
"description": "text or tokens for the previous context",
"options": None,
"default": None,
},
"prefix": {
"type": str,
"description": "text or tokens to prefix the current context",
"options": None,
"default": None,
},
"suppress_blank": {
"type": bool,
"description": "this will suppress blank outputs",
"options": None,
"default": True,
},
"suppress_tokens": {
"type": str,
"description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
"options": None,
"default": "-1",
},
"without_timestamps": {
"type": bool,
"description": "use <|notimestamps|> to sample text tokens only",
"options": None,
"default": False,
},
"max_initial_timestamp": {
"type": float,
"description": "the initial timestamp cannot be later than this",
"options": None,
"default": 1.0,
},
"fp16": {
"type": bool,
"description": "use fp16 for most of the calculation",
"options": None,
"default": True,
},
}
transcribe_configs
instance-attribute
transcribe_configs = {
config: _load_config(
config, model_config, config_schema
)
for config in config_schema
if not hasattr(self, f"_{config}")
}
model
instance-attribute
model = load_model(
name=_model_type,
device=_device,
in_memory=_in_memory,
cpu_preload=_cpu_preload,
dq=_dq,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/stable_ts_model.py
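A minimal sketch for the stable-ts backend, using a few of the configs listed above (key names come from the schema; the file path is illustrative):
from subsai import SubsAI
model = SubsAI.create_model('jianfch/stable-ts',
                            {'model_type': 'base',
                             'vad': True,
                             'word_timestamps': True})
subs = SubsAI.transcribe('./assets/test1.mp4', model)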
whisperX_model
WhisperX Model
See m-bain/whisperX
WhisperXModel
WhisperXModel(model_config)
Bases: AbstractModel
Source code in src/subsai/models/whisperX_model.py
model_name
class-attribute
instance-attribute
model_name = 'm-bain/whisperX'
config_schema
class-attribute
instance-attribute
config_schema = {
"model_type": {
"type": list,
"description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
"options": available_models(),
"default": "base",
},
"device": {
"type": list,
"description": 'Device to use for computation ("cpu", "cuda")',
"options": ["cpu", "cuda"],
"default": "cpu",
},
"compute_type": {
"type": list,
"description": "change to 'int8' if low on GPU mem (may reduce accuracy)",
"options": ["default", "float16", "int8"],
"default": "default",
},
"download_root": {
"type": str,
"description": "Path to download the model files; by default, it uses '~/.cache/whisper'",
"options": None,
"default": None,
},
"language": {
"type": str,
"description": "language that the audio is in; uses detected language if None",
"options": None,
"default": None,
},
"segment_type": {
"type": list,
"description": "Word-level timestamps, Choose here between sentence-level and word-level",
"options": ["sentence", "word"],
"default": "sentence",
},
"batch_size": {
"type": int,
"description": "reduce if low on GPU mem",
"options": None,
"default": 16,
},
"return_char_alignments": {
"type": bool,
"description": "Whether to return char alignments",
"options": None,
"default": False,
},
"speaker_labels": {
"type": bool,
"description": "Run Diarization Pipeline",
"options": None,
"default": False,
},
"HF_TOKEN": {
"type": str,
"description": "if speaker labels is True, you will need Hugging Face access token to use the diarization models, https://github.com/m-bain/whisperX#speaker-diarization",
"options": None,
"default": None,
},
"min_speakers": {
"type": int,
"description": "min speakers",
"options": None,
"default": None,
},
"max_speakers": {
"type": int,
"description": "max speakers",
"options": None,
"default": None,
},
}
model_type
instance-attribute
model_type = _load_config(
"model_type", model_config, config_schema
)
device
instance-attribute
device = _load_config("device", model_config, config_schema)
compute_type
instance-attribute
compute_type = _load_config(
"compute_type", model_config, config_schema
)
download_root
instance-attribute
download_root = _load_config(
"download_root", model_config, config_schema
)
language
instance-attribute
language = _load_config(
"language", model_config, config_schema
)
segment_type
instance-attribute
segment_type = _load_config(
"segment_type", model_config, config_schema
)
batch_size
instance-attribute
batch_size = _load_config(
"batch_size", model_config, config_schema
)
return_char_alignments
instance-attribute
return_char_alignments = _load_config(
"return_char_alignments", model_config, config_schema
)
speaker_labels
instance-attribute
speaker_labels = _load_config(
"speaker_labels", model_config, config_schema
)
HF_TOKEN
instance-attribute
HF_TOKEN = _load_config(
"HF_TOKEN", model_config, config_schema
)
min_speakers
instance-attribute
min_speakers = _load_config(
"min_speakers", model_config, config_schema
)
max_speakers
instance-attribute
max_speakers = _load_config(
"max_speakers", model_config, config_schema
)
model
instance-attribute
model = load_model(
model_type,
device=device,
compute_type=compute_type,
download_root=download_root,
language=language,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/whisperX_model.py
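A hedged sketch of enabling speaker diarization with this backend; the config keys come from the schema above, and the HF_TOKEN value is a placeholder you must replace with your own Hugging Face access token:
from subsai import SubsAI
model = SubsAI.create_model('m-bain/whisperX',
                            {'model_type': 'base',
                             'speaker_labels': True,
                             'HF_TOKEN': '<your-hugging-face-token>'})
subs = SubsAI.transcribe('./assets/test1.mp4', model)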
whisper_api_model
Whisper API Model
See openai/whisper
TMPDIR
module-attribute
TMPDIR = gettempdir()
OPENAI_API_SIZE_LIMIT_MB
module-attribute
OPENAI_API_SIZE_LIMIT_MB = 24
WhisperAPIModel
WhisperAPIModel(model_config)
Bases: AbstractModel
Source code in src/subsai/models/whisper_api_model.py
model_name
class-attribute
instance-attribute
model_name = 'openai/whisper'
config_schema
class-attribute
instance-attribute
config_schema = {
"model_type": {
"type": list,
"description": "OpenAI Whisper API, currently only supports large-v2 which is named as whisper-1/ There is a 25mb upload limit so audio is chunked locally, this may lead to lower performance.",
"options": ["whisper-1"],
"default": "whisper-1",
},
"api_key": {
"type": str,
"description": "Your OpenAI API key",
"options": None,
"default": get("OPENAI_KEY", None),
},
"language": {
"type": str,
"description": "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.",
"options": None,
"default": None,
},
"prompt": {
"type": str,
"description": "An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.",
"options": None,
"default": None,
},
"temperature": {
"type": float,
"description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.",
"options": None,
"default": 0,
},
}
model_type
instance-attribute
model_type = _load_config(
"model_type", model_config, config_schema
)
api_key
instance-attribute
api_key = _load_config(
"api_key", model_config, config_schema
)
language
instance-attribute
language = _load_config(
"language", model_config, config_schema
)
prompt
instance-attribute
prompt = _load_config("prompt", model_config, config_schema)
temperature
instance-attribute
temperature = _load_config(
"temperature", model_config, config_schema
)
client
instance-attribute
client = OpenAI(api_key=api_key)
chunk_audio
chunk_audio(audio_file_path)
Source code in src/subsai/models/whisper_api_model.py
transcribe
transcribe(media_file)
Source code in src/subsai/models/whisper_api_model.py
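A minimal sketch, assuming an OpenAI API key is available; per the schema above, api_key falls back to the OPENAI_KEY environment variable when not provided, so passing it explicitly here is optional:
import os
from subsai import SubsAI
model = SubsAI.create_model('API/openai/whisper',
                            {'api_key': os.environ.get('OPENAI_KEY')})
subs = SubsAI.transcribe('./assets/test1.mp4', model)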
split_filename
split_filename(filepath)
Source code in src/subsai/models/whisper_api_model.py
convert_video_to_audio_ffmpeg
convert_video_to_audio_ffmpeg(video_file, output_ext='mp3')
Source code in src/subsai/models/whisper_api_model.py
whisper_model
Whisper Model
See openai/whisper
WhisperModel
WhisperModel(model_config)
Bases: AbstractModel
Source code in src/subsai/models/whisper_model.py
model_name
class-attribute
instance-attribute
model_name = 'openai/whisper'
config_schema
class-attribute
instance-attribute
config_schema = {
"model_type": {
"type": list,
"description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
"options": available_models(),
"default": "base",
},
"device": {
"type": list,
"description": "The PyTorch device to put the model into",
"options": [None, *get_available_devices()],
"default": None,
},
"download_root": {
"type": str,
"description": "Path to download the model files; by default, it uses '~/.cache/whisper'",
"options": None,
"default": None,
},
"in_memory": {
"type": bool,
"description": "whether to preload the model weights into host memory",
"options": None,
"default": False,
},
"verbose": {
"type": bool,
"description": "Whether to display the text being decoded to the console. If True, displays all the details,If False, displays minimal details. If None, does not display anything",
"options": None,
"default": None,
},
"temperature": {
"type": Tuple,
"description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
"options": None,
"default": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
},
"compression_ratio_threshold": {
"type": float,
"description": "If the gzip compression ratio is above this value, treat as failed",
"options": None,
"default": 2.4,
},
"logprob_threshold": {
"type": float,
"description": "If the average log probability over sampled tokens is below this value, treat as failed",
"options": None,
"default": -1.0,
},
"no_speech_threshold": {
"type": float,
"description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
"options": None,
"default": 0.6,
},
"condition_on_previous_text": {
"type": bool,
"description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
"options": None,
"default": True,
},
"task": {
"type": list,
"description": "whether to perform X->X 'transcribe' or X->English 'translate'",
"options": ["transcribe", "translate"],
"default": "transcribe",
},
"language": {
"type": str,
"description": "language that the audio is in; uses detected language if None",
"options": None,
"default": None,
},
"sample_len": {
"type": int,
"description": "maximum number of tokens to sample",
"options": None,
"default": None,
},
"best_of": {
"type": int,
"description": "number of independent samples to collect, when t > 0",
"options": None,
"default": None,
},
"beam_size": {
"type": int,
"description": "number of beams in beam search, when t == 0",
"options": None,
"default": None,
},
"patience": {
"type": float,
"description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
"options": None,
"default": None,
},
"length_penalty": {
"type": float,
"description": "'alpha' in Google NMT, None defaults to length norm",
"options": None,
"default": None,
},
"prompt": {
"type": str,
"description": "text or tokens for the previous context",
"options": None,
"default": None,
},
"prefix": {
"type": str,
"description": "text or tokens to prefix the current context",
"options": None,
"default": None,
},
"suppress_blank": {
"type": bool,
"description": "this will suppress blank outputs",
"options": None,
"default": True,
},
"suppress_tokens": {
"type": str,
"description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
"options": None,
"default": "-1",
},
"without_timestamps": {
"type": bool,
"description": "use <|notimestamps|> to sample text tokens only",
"options": None,
"default": False,
},
"max_initial_timestamp": {
"type": float,
"description": "the initial timestamp cannot be later than this",
"options": None,
"default": 1.0,
},
"fp16": {
"type": bool,
"description": "use fp16 for most of the calculation",
"options": None,
"default": True,
},
}
model_type
instance-attribute
model_type = _load_config(
"model_type", model_config, config_schema
)
device
instance-attribute
device = _load_config("device", model_config, config_schema)
download_root
instance-attribute
download_root = _load_config(
"download_root", model_config, config_schema
)
in_memory
instance-attribute
in_memory = _load_config(
"in_memory", model_config, config_schema
)
verbose
instance-attribute
verbose = _load_config(
"verbose", model_config, config_schema
)
temperature
instance-attribute
temperature = _load_config(
"temperature", model_config, config_schema
)
compression_ratio_threshold
instance-attribute
compression_ratio_threshold = _load_config(
"compression_ratio_threshold",
model_config,
config_schema,
)
logprob_threshold
instance-attribute
logprob_threshold = _load_config(
"logprob_threshold", model_config, config_schema
)
no_speech_threshold
instance-attribute
no_speech_threshold = _load_config(
"no_speech_threshold", model_config, config_schema
)
condition_on_previous_text
instance-attribute
condition_on_previous_text = _load_config(
"condition_on_previous_text",
model_config,
config_schema,
)
decode_options
instance-attribute
decode_options = {
config: _load_config(
config, model_config, config_schema
)
for config in config_schema
if not hasattr(self, config)
}
model
instance-attribute
model = load_model(
name=model_type,
device=device,
download_root=download_root,
in_memory=in_memory,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/whisper_model.py
whisper_timestamped_model
whisper_timestamped
See linto-ai/whisper-timestamped
WhisperTimeStamped
WhisperTimeStamped(model_config={})
Bases: AbstractModel
Source code in src/subsai/models/whisper_timestamped_model.py
model_name
class-attribute
instance-attribute
model_name = 'linto-ai/whisper-timestamped'
config_schema
class-attribute
instance-attribute
config_schema = {
"model_type": {
"type": list,
"description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
"options": available_models(),
"default": "base",
},
"segment_type": {
"type": list,
"description": "Whisper_timestamps gives the ability to have word-level timestamps, Choose here between sentence-level and word-level",
"options": ["sentence", "word"],
"default": "sentence",
},
"device": {
"type": list,
"description": "The PyTorch device to put the model into",
"options": [None, *get_available_devices()],
"default": None,
},
"download_root": {
"type": str,
"description": "Path to download the model files; by default, it uses '~/.cache/whisper'",
"options": None,
"default": None,
},
"in_memory": {
"type": bool,
"description": "whether to preload the model weights into host memory",
"options": None,
"default": False,
},
"verbose": {
"type": bool,
"description": "Whether to display the text being decoded to the console. If True, displays all the details,If False, displays minimal details. If None, does not display anything",
"options": None,
"default": None,
},
"temperature": {
"type": Tuple,
"description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
"options": None,
"default": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
},
"compression_ratio_threshold": {
"type": float,
"description": "If the gzip compression ratio is above this value, treat as failed",
"options": None,
"default": 2.4,
},
"logprob_threshold": {
"type": float,
"description": "If the average log probability over sampled tokens is below this value, treat as failed",
"options": None,
"default": -1.0,
},
"no_speech_threshold": {
"type": float,
"description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
"options": None,
"default": 0.6,
},
"condition_on_previous_text": {
"type": bool,
"description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
"options": None,
"default": True,
},
"task": {
"type": list,
"description": "whether to perform X->X 'transcribe' or X->English 'translate'",
"options": ["transcribe", "translate"],
"default": "transcribe",
},
"language": {
"type": str,
"description": "language that the audio is in; uses detected language if None",
"options": None,
"default": None,
},
"sample_len": {
"type": int,
"description": "maximum number of tokens to sample",
"options": None,
"default": None,
},
"best_of": {
"type": int,
"description": "number of independent samples to collect, when t > 0",
"options": None,
"default": None,
},
"beam_size": {
"type": int,
"description": "number of beams in beam search, when t == 0",
"options": None,
"default": None,
},
"patience": {
"type": float,
"description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
"options": None,
"default": None,
},
"length_penalty": {
"type": float,
"description": "'alpha' in Google NMT, None defaults to length norm",
"options": None,
"default": None,
},
"suppress_tokens": {
"type": str,
"description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
"options": None,
"default": "-1",
},
"fp16": {
"type": bool,
"description": "use fp16 for most of the calculation",
"options": None,
"default": True,
},
"remove_punctuation_from_words": {
"type": bool,
"description": "If False, words will be glued with the next punctuation mark (if any).If True, there will be no punctuation mark in the `words[:]['text']` list.It only affects these strings; This has no influence on the computation of the word confidence, whatever the value of `include_punctuation_in_confidence` is.",
"options": None,
"default": False,
},
"refine_whisper_precision": {
"type": float,
"description": "How much can we refine Whisper segment positions, in seconds. Must be a multiple of 0.02.",
"options": None,
"default": 0.5,
},
"min_word_duration": {
"type": float,
"description": "Minimum duration of a word, in seconds. If a word is shorter than this, timestamps will be adjusted.",
"options": None,
"default": 0.04,
},
"plot_word_alignment": {
"type": bool,
"description": "Whether to plot the word alignment for each segment. matplotlib must be installed to use this option.",
"options": None,
"default": False,
},
"seed": {
"type": int,
"description": "Random seed to use for temperature sampling, for the sake of reproducibility.Choose None for unpredictable randomness",
"options": None,
"default": 1234,
},
"vad": {
"type": bool,
"description": "Whether to perform voice activity detection (VAD) on the audio file, to remove silent parts before transcribing with Whisper model. This should decrease hallucinations from the Whisper model.",
"options": None,
"default": False,
},
"detect_disfluencies": {
"type": bool,
"description": 'Whether to detect disfluencies (i.e. hesitations, filler words, repetitions, corrections, etc.) that Whisper model might have omitted in the transcription. This should make the word timestamp prediction more accurate.And probable disfluencies will be marked as special words "[*]"',
"options": None,
"default": False,
},
"trust_whisper_timestamps": {
"type": bool,
"description": "Whether to rely on Whisper's timestamps to get approximative first estimate of segment positions (up to refine_whisper_precision).",
"options": None,
"default": True,
},
"naive_approach": {
"type": bool,
"description": "Force the naive approach that consists in decoding twice the audio file, once to get the transcription and once with the decoded tokens to get the alignment. Note that this approach is used anyway when beam_size is not None and/or when the temperature is a list with more than one element.",
"options": None,
"default": False,
},
}
model_type
instance-attribute
model_type = _load_config(
"model_type", model_config, config_schema
)
segment_type
instance-attribute
segment_type = _load_config(
"segment_type", model_config, config_schema
)
device
instance-attribute
device = _load_config("device", model_config, config_schema)
download_root
instance-attribute
download_root = _load_config(
"download_root", model_config, config_schema
)
in_memory
instance-attribute
in_memory = _load_config(
"in_memory", model_config, config_schema
)
verbose
instance-attribute
verbose = _load_config(
"verbose", model_config, config_schema
)
temperature
instance-attribute
temperature = _load_config(
"temperature", model_config, config_schema
)
compression_ratio_threshold
instance-attribute
compression_ratio_threshold = _load_config(
"compression_ratio_threshold",
model_config,
config_schema,
)
logprob_threshold
instance-attribute
logprob_threshold = _load_config(
"logprob_threshold", model_config, config_schema
)
no_speech_threshold
instance-attribute
no_speech_threshold = _load_config(
"no_speech_threshold", model_config, config_schema
)
condition_on_previous_text
instance-attribute
condition_on_previous_text = _load_config(
"condition_on_previous_text",
model_config,
config_schema,
)
decode_options
instance-attribute
decode_options = {
config: _load_config(
config, model_config, config_schema
)
for config in config_schema
if not hasattr(self, config)
}
model
instance-attribute
model = load_model(
name=model_type,
device=device,
download_root=download_root,
in_memory=in_memory,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/whisper_timestamped_model.py
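For word-level output with this backend, a hedged sketch using the segment_type option documented above (the file path is illustrative):
from subsai import SubsAI
model = SubsAI.create_model('linto-ai/whisper-timestamped',
                            {'model_type': 'base',
                             'segment_type': 'word'})
subs = SubsAI.transcribe('./assets/test1.mp4', model)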
whispercpp_model
Whisper.cpp Model
See whisper.cpp, See pywhispercpp
WhisperCppModel
WhisperCppModel(model_config)
Bases: AbstractModel
Source code in src/subsai/models/whispercpp_model.py
model_name
class-attribute
instance-attribute
model_name = 'ggerganov/whisper.cpp'
config_schema
class-attribute
instance-attribute
config_schema = {
"model_type": {
"type": list,
"description": "Available whisper.cpp models",
"options": AVAILABLE_MODELS,
"default": "base",
},
"n_threads": {
"type": int,
"description": "Number of threads to allocate for the inferencedefault to min(4, available hardware_concurrency)",
"options": None,
"default": 4,
},
"n_max_text_ctx": {
"type": int,
"description": "max tokens to use from past text as prompt for the decoder",
"options": None,
"default": 16384,
},
"offset_ms": {
"type": int,
"description": "start offset in ms",
"options": None,
"default": 0,
},
"duration_ms": {
"type": int,
"description": "audio duration to process in ms",
"options": None,
"default": 0,
},
"translate": {
"type": bool,
"description": "whether to translate the audio to English",
"options": None,
"default": False,
},
"no_context": {
"type": bool,
"description": "do not use past transcription (if any) as initial prompt for the decoder",
"options": None,
"default": False,
},
"single_segment": {
"type": bool,
"description": "force single segment output (useful for streaming)",
"options": None,
"default": False,
},
"print_special": {
"type": bool,
"description": "print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)",
"options": None,
"default": False,
},
"print_progress": {
"type": bool,
"description": "print progress information",
"options": None,
"default": True,
},
"print_realtime": {
"type": bool,
"description": "print results from within whisper.cpp (avoid it, use callback instead)",
"options": None,
"default": False,
},
"print_timestamps": {
"type": bool,
"description": "print timestamps for each text segment when printing realtime",
"options": None,
"default": True,
},
"token_timestamps": {
"type": bool,
"description": "enable token-level timestamps",
"options": None,
"default": False,
},
"thold_pt": {
"type": float,
"description": "timestamp token probability threshold (~0.01)",
"options": None,
"default": 0.01,
},
"thold_ptsum": {
"type": float,
"description": "timestamp token sum probability threshold (~0.01)",
"options": None,
"default": 0.01,
},
"max_len": {
"type": int,
"description": "max segment length in characters",
"options": None,
"default": 0,
},
"split_on_word": {
"type": bool,
"description": "split on word rather than on token (when used with max_len)",
"options": None,
"default": False,
},
"max_tokens": {
"type": int,
"description": "max tokens per segment (0 = no limit)",
"options": None,
"default": 0,
},
"speed_up": {
"type": bool,
"description": "speed-up the audio by 2x using Phase Vocoder",
"options": None,
"default": False,
},
"audio_ctx": {
"type": int,
"description": "overwrite the audio context size (0 = use default)",
"options": None,
"default": 0,
},
"prompt_n_tokens": {
"type": int,
"description": "tokens to provide to the whisper decoder as initial prompt",
"options": None,
"default": 0,
},
"language": {
"type": str,
"description": 'for auto-detection, set to None, "" or "auto"',
"options": None,
"default": "en",
},
"suppress_blank": {
"type": bool,
"description": "common decoding parameters",
"options": None,
"default": True,
},
"suppress_non_speech_tokens": {
"type": bool,
"description": "common decoding parameters",
"options": None,
"default": False,
},
"temperature": {
"type": float,
"description": "initial decoding temperature",
"options": None,
"default": 0.0,
},
"max_initial_ts": {
"type": float,
"description": "max_initial_ts",
"options": None,
"default": 1.0,
},
"length_penalty": {
"type": float,
"description": "length_penalty",
"options": None,
"default": -1.0,
},
"temperature_inc": {
"type": float,
"description": "temperature_inc",
"options": None,
"default": 0.2,
},
"entropy_thold": {
"type": float,
"description": 'similar to OpenAI\'s "compression_ratio_threshold"',
"options": None,
"default": 2.4,
},
"logprob_thold": {
"type": float,
"description": "logprob_thold",
"options": None,
"default": -1.0,
},
"no_speech_thold": {
"type": float,
"description": "no_speech_thold",
"options": None,
"default": 0.6,
},
"greedy": {
"type": dict,
"description": "greedy",
"options": None,
"default": {"best_of": -1},
},
"beam_search": {
"type": dict,
"description": "beam_search",
"options": None,
"default": {"beam_size": -1, "patience": -1.0},
},
}
model_type
instance-attribute
model_type = _load_config(
"model_type", model_config, config_schema
)
params
instance-attribute
params = {}
model
instance-attribute
model = Model(model=model_type, **params)
transcribe
transcribe(media_file)
Source code in src/subsai/models/whispercpp_model.py
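A minimal sketch for the whisper.cpp backend, using a couple of the parameters listed above (values are illustrative):
from subsai import SubsAI
model = SubsAI.create_model('ggerganov/whisper.cpp',
                            {'model_type': 'base',
                             'n_threads': 4,
                             'translate': False})
subs = SubsAI.transcribe('./assets/test1.mp4', model)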
subsai.configs
Configurations file
AVAILABLE_MODELS
module-attribute
AVAILABLE_MODELS = {
"openai/whisper": {
"class": WhisperModel,
"description": "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.",
"url": "https://github.com/openai/whisper",
"config_schema": config_schema,
},
"linto-ai/whisper-timestamped": {
"class": WhisperTimeStamped,
"description": "Multilingual Automatic Speech Recognition with word-level timestamps and confidence.",
"url": "https://github.com/linto-ai/whisper-timestamped",
"config_schema": config_schema,
},
"ggerganov/whisper.cpp": {
"class": WhisperCppModel,
"description": "High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model\n* Plain C/C++ implementation without dependencies\n* Runs on the CPU\n",
"url": "https://github.com/ggerganov/whisper.cpp\nhttps://github.com/abdeladim-s/pywhispercpp",
"config_schema": config_schema,
},
"guillaumekln/faster-whisper": {
"class": FasterWhisperModel,
"description": "**faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.\nThis implementation is up to 4 times faster than [openai/whisper]( https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.",
"url": "https://github.com/guillaumekln/faster-whisper",
"config_schema": config_schema,
},
"m-bain/whisperX": {
"class": WhisperXModel,
"description": "**whisperX** is a fast automatic speech recognition (70x realtime with large-v2) with word-level timestamps and speaker diarization.",
"url": "https://github.com/m-bain/whisperX",
"config_schema": config_schema,
},
"jianfch/stable-ts": {
"class": StableTsModel,
"description": "**Stabilizing Timestamps for Whisper** This library modifies [Whisper](https://github.com/openai/whisper) to produce more reliable timestamps and extends its functionality.",
"url": "https://github.com/jianfch/stable-ts",
"config_schema": config_schema,
},
"API/openai/whisper": {
"class": WhisperAPIModel,
"description": "API for the OpenAI large-v2 Whisper model, requires an API key.",
"url": "https://platform.openai.com/docs/guides/speech-to-text",
"config_schema": config_schema,
},
"HuggingFace": {
"class": HuggingFaceModel,
"description": "Hugging Face implementation of Whisper. Any speech recognition pretrained model from the Hugging Face hub can be used as well",
"url": "https://huggingface.co/tasks/automatic-speech-recognition",
"config_schema": config_schema,
},
}
BASIC_TOOLS_CONFIGS
module-attribute
BASIC_TOOLS_CONFIGS = {
"set time": {
"description": "Set time to a subtitle",
"config_schema": {
"h": {
"type": float,
"description": "hours: Integer or float values, may be positive or negative",
"options": None,
"default": 0,
},
"m": {
"type": float,
"description": "minutes: Integer or float values, may be positive or negative",
"options": None,
"default": 0,
},
"s": {
"type": float,
"description": "seconds: Integer or float values, may be positive or negative",
"options": None,
"default": 0,
},
"ms": {
"type": float,
"description": "milliseconds: Integer or float values, may be positive or negative",
"options": None,
"default": 0,
},
},
},
"shift": {
"description": "Shift all subtitles by constant time amount",
"config_schema": {
"h": {
"type": float,
"description": "hours: Integer or float values, may be positive or negative",
"options": None,
"default": 0,
},
"m": {
"type": float,
"description": "minutes: Integer or float values, may be positive or negative",
"options": None,
"default": 0,
},
"s": {
"type": float,
"description": "seconds: Integer or float values, may be positive or negative",
"options": None,
"default": 0,
},
"ms": {
"type": float,
"description": "milliseconds: Integer or float values, may be positive or negative",
"options": None,
"default": 0,
},
"frames": {
"type": int,
"description": "When specified, must be an integer number of frames",
"options": None,
"default": None,
},
"fps": {
"type": float,
"description": "When specified, must be a positive number.",
"options": None,
"default": None,
},
},
},
}
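The h/m/s/ms (and frames/fps) parameters above match pysubs2's SSAFile.shift signature; a hedged sketch of an equivalent shift applied directly with pysubs2 (the offset values are arbitrary):
import pysubs2
subs = pysubs2.load('test1.srt')
# shift every subtitle 2.5 seconds later
subs.shift(s=2, ms=500)
subs.save('test1-shifted.srt')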
ADVANCED_TOOLS_CONFIGS
module-attribute
ADVANCED_TOOLS_CONFIGS = {
"ffsubsync": {
"description": "Language-agnostic automatic synchronization of subtitles with video, so that subtitles are aligned to the correct starting point within the video.",
"url": "https://github.com/smacke/ffsubsync",
"config_schema": {
"vad": {
"type": list,
"description": "Which voice activity detector to use for speech extraction (if using video / audio as a reference",
"options": [
"subs_then_webrtc",
"webrtc",
"subs_then_auditok",
"auditok",
"subs_then_silero",
"silero",
],
"default": DEFAULT_VAD,
},
"max-subtitle-seconds": {
"type": float,
"description": "Maximum duration for a subtitle to appear on-screen",
"options": None,
"default": DEFAULT_MAX_SUBTITLE_SECONDS,
},
"start-seconds": {
"type": int,
"description": "Start time for processing",
"options": None,
"default": DEFAULT_START_SECONDS,
},
"max-offset-seconds": {
"type": float,
"description": "The max allowed offset seconds for any subtitle segment",
"options": None,
"default": DEFAULT_MAX_OFFSET_SECONDS,
},
"apply-offset-seconds": {
"type": float,
"description": "Apply a predefined offset in seconds to all subtitle segments",
"options": None,
"default": DEFAULT_APPLY_OFFSET_SECONDS,
},
"suppress-output-if-offset-less-than": {
"type": float,
"description": "Apply a predefined offset in seconds to all subtitle segments",
"options": None,
"default": None,
},
"frame-rate": {
"type": int,
"description": "Frame rate for audio extraction",
"options": None,
"default": DEFAULT_FRAME_RATE,
},
"output-encoding": {
"type": str,
"description": 'What encoding to use for writing output subtitles (default=utf-8). Can indicate "same" to use same encoding as that of the input.',
"options": None,
"default": "utf-8",
},
"skip-infer-framerate-ratio": {
"type": bool,
"description": "If set, do not try to infer framerate ratio based on duration ratio.",
"options": None,
"default": False,
},
"no-fix-framerate": {
"type": bool,
"description": "If specified, subsync will not attempt to correct a framerate",
"options": None,
"default": False,
},
"serialize-speech": {
"type": bool,
"description": "If specified, serialize reference speech to a numpy array.",
"options": None,
"default": False,
},
"gss": {
"type": bool,
"description": "If specified, use golden-section search to try to findthe optimal framerate ratio between video and subtitles.",
"options": None,
"default": False,
},
},
},
"Translation": {
"description": "Translate to different languages using AI",
"url": "https://github.com/xhluca/dl-translate",
"config_schema": {
"model": {
"type": list,
"description": "The model",
"options": available_translation_models(),
"default": available_translation_models()[
0
],
},
"device": {
"type": list,
"description": '"cpu", "gpu" or "auto". If it\'s set to "auto", will try to select a GPU when available or else fall back to CPU',
"options": [
"auto",
*get_available_devices(),
],
"default": "auto",
},
"batch_size": {
"type": int,
"description": "The number of samples to load at once. If set to `None`, it will process everything at once\nA smaller value is preferred for `batch_size` if your (video) RAM is limited",
"options": None,
"default": 32,
},
"verbose": {
"type": bool,
"description": "Whether to display the progress bar for every batch processed.",
"options": None,
"default": True,
},
},
},
}
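A hedged sketch of passing some of these advanced-tool configs through the Tools API documented above; subs is assumed to come from an earlier transcription, the chosen values are illustrative, and the mapping of config names to keyword arguments is an assumption:
from subsai import Tools
# ffsubsync options are forwarded as keyword arguments to Tools.auto_sync
synced = Tools.auto_sync(subs, './assets/test1.mp4', vad='auditok')
# translation options go through the translation_configs dict of Tools.translate
translated = Tools.translate(subs, 'English', 'French',
                             model='m2m100',
                             translation_configs={'batch_size': 16, 'verbose': False})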