API Reference

️🎞 Subtitles generation tool (Web-UI + CLI + Python package) powered by OpenAI's Whisper and its variants 🎞️

subsai.main

SubsAI: Subtitles AI. A subtitles generation tool powered by OpenAI's Whisper and its variants.

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/.

SubsAI

Subs AI class

Example usage:

from subsai.main import SubsAI

file = './assets/test1.mp4'
subs_ai = SubsAI()
model = subs_ai.create_model('openai/whisper', {'model_type': 'base'})
subs = subs_ai.transcribe(file, model)
subs.save('test1.srt')

available_models staticmethod

available_models()

Returns the supported models

Returns:

  • list

    list of available models

Source code in src/subsai/main.py, lines 54-61
@staticmethod
def available_models() -> list:
    """
    Returns the supported models

    :return: list of available models
    """
    return list(AVAILABLE_MODELS.keys())
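
A quick usage sketch (assuming the package is installed; SubsAI lives in subsai.main, the module documented above):

    from subsai.main import SubsAI

    # print the registry keys that can be passed to create_model / transcribe
    for name in SubsAI.available_models():
        print(name)  # e.g. 'openai/whisper'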

model_info staticmethod

model_info(model)

Returns general info about the model (brief description and URL)

Parameters:

  • model (str) –

    model name

Returns:

  • dict

    dict of info

Source code in src/subsai/main.py, lines 63-73
@staticmethod
def model_info(model: str) -> dict:
    """
    Returns general infos about the model (brief description and url)

    :param model: model name

    :return: dict of infos
    """
    return {'description': AVAILABLE_MODELS[model]['description'],
            'url': AVAILABLE_MODELS[model]['url']}

config_schema staticmethod

config_schema(model)

Returns the configs associated with a model

Parameters:

  • model (str) –

    model name

Returns:

  • dict

    dict of configs

Source code in src/subsai/main.py, lines 75-84
@staticmethod
def config_schema(model: str) -> dict:
    """
    Returns the configs associated with a model

    :param model: model name

    :return: dict of configs
    """
    return AVAILABLE_MODELS[model]['config_schema']
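
A short sketch of inspecting a model before creating it (the model name comes from the example at the top; the exact schema keys depend on the model):

    from subsai.main import SubsAI

    name = 'openai/whisper'
    print(SubsAI.model_info(name))        # {'description': ..., 'url': ...}
    schema = SubsAI.config_schema(name)
    for key, spec in schema.items():
        # each entry describes one configurable option and its default value
        print(key, spec.get('default'))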

create_model staticmethod

create_model(model_name, model_config={})

Returns a model instance

Parameters:

  • model_name (str) –

    the name of the model

  • model_config (dict, default: {} ) –

    the configuration dict

Returns:

  • AbstractModel

    the model instance

Source code in src/subsai/main.py, lines 86-96
@staticmethod
def create_model(model_name: str, model_config: dict = {}) -> AbstractModel:
    """
    Returns a model instance

    :param model_name: the name of the model
    :param model_config: the configuration dict

    :return: the model instance
    """
    return AVAILABLE_MODELS[model_name]['class'](model_config)

transcribe staticmethod

transcribe(media_file, model, model_config={})

Takes the model instance (created by create_model) or the model name. Returns a pysubs2.SSAFile (https://pysubs2.readthedocs.io/en/latest/api-reference.html#ssafile-a-subtitle-file).

Parameters:

  • media_file (str) –

    path of the media file (video/audio)

  • model (Union[AbstractModel, str]) –

    model instance or model name

  • model_config (dict, default: {} ) –

    model configs' dict

Returns:

  • SSAFile

    SSAFile: list of subtitles

Source code in src/subsai/main.py, lines 98-115
@staticmethod
def transcribe(media_file: str, model: Union[AbstractModel, str], model_config: dict = {}) -> SSAFile:
    """
    Takes the model instance (created by :func:`create_model`) or the model name.
    Returns a :class:`pysubs2.SSAFile` <https://pysubs2.readthedocs.io/en/latest/api-reference.html#ssafile-a-subtitle-file>`_

    :param media_file: path of the media file (video/audio)
    :param model: model instance or model name
    :param model_config: model configs' dict

    :return: SSAFile: list of subtitles
    """
    if type(model) == str:
        stt_model = SubsAI.create_model(model, model_config)
    else:
        stt_model = model
    media_file = str(pathlib.Path(media_file).resolve())
    return stt_model.transcribe(media_file)
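
Since transcribe accepts either a model instance or a registry name, a one-off transcription can skip create_model entirely (a minimal sketch; the file path and config are placeholders):

    from subsai.main import SubsAI

    # passing the model name directly; transcribe builds the model internally
    subs = SubsAI.transcribe('./assets/test1.mp4', 'openai/whisper', {'model_type': 'base'})
    subs.save('test1.srt')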

Tools

Tools()

Some tools related to subtitles processing (e.g. translation)

Source code in src/subsai/main.py, lines 123-124
def __init__(self):
    pass

available_translation_models staticmethod

available_translation_models()

Returns available translation models. A simple alias of utils.available_translation_models for easy access.

Returns:

  • list

    list of available models

Source code in src/subsai/main.py, lines 126-135
@staticmethod
def available_translation_models() -> list:
    """
    Returns available translation models
    A simple link to :func:`utils.available_translation_models` for easy access

    :return: list of available models
    """

    return available_translation_models()

available_translation_languages staticmethod

available_translation_languages(model)

Returns the languages supported by the translation model

Parameters:

  • model (Union[str, TranslationModel]) –

    the name of the model

Returns:

  • list

    list of available languages

Source code in src/subsai/main.py, lines 137-149
@staticmethod
def available_translation_languages(model: Union[str, TranslationModel]) -> list:
    """
    Returns the languages supported by the translation model

    :param model: the name of the model
    :return: list of available languages
    """
    if type(model) == str:
        langs = Tools.create_translation_model(model).available_languages()
    else:
        langs = model.available_languages()
    return langs
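
For example, to check that a language pair is supported before translating (a sketch; instantiating the translation model may download weights on first use):

    from subsai.main import Tools

    print(Tools.available_translation_models())             # e.g. ['m2m100', ...]
    langs = Tools.available_translation_languages('m2m100')
    print('English' in langs, 'Arabic' in langs)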

create_translation_model staticmethod

create_translation_model(
    model_name="m2m100", model_family=None
)

Creates and returns a translation model instance.

Parameters:

  • model_name (str, default: 'm2m100' ) –

    name of the model. To get available models, use available_translation_models.

  • model_family (str, default: None ) –

    Either "mbart50" or "m2m100". By default, See dl-translate docs

Returns:

  • TranslationModel

    A translation model instance

Source code in src/subsai/main.py, lines 151-161
@staticmethod
def create_translation_model(model_name: str = "m2m100", model_family: str = None) -> TranslationModel:
    """
    Creates and returns a translation model instance.

    :param model_name: name of the model. To get available models use :func:`available_translation_models`
    :param model_family: Either "mbart50" or "m2m100". By default, See `dl-translate` docs
    :return: A translation model instance
    """
    mt = TranslationModel(model_or_path=model_name, model_family=model_family)
    return mt

translate staticmethod

translate(
    subs,
    source_language,
    target_language,
    model="m2m100",
    model_family=None,
    translation_configs={},
)

Translates a subtitles SSAFile object (the type returned by SubsAI.transcribe).

Parameters:

  • subs (SSAFile) –

    SSAFile object

  • source_language (str) –

    the language of the subtitles

  • target_language (str) –

    the target language

  • model (Union[str, TranslationModel], default: 'm2m100' ) –

    the translation model, either a str or a model instance created by create_translation_model

  • model_family (str, default: None ) –

    Either "mbart50" or "m2m100". By default, See dl-translate docs

  • translation_configs (dict, default: {} ) –

    dict of translation configs (see configs.ADVANCED_TOOLS_CONFIGS)

Returns:

  • SSAFile

    an SSAFile with the subtitles translated to the target language

Source code in src/subsai/main.py, lines 163-199
@staticmethod
def translate(subs: SSAFile,
              source_language: str,
              target_language: str,
              model: Union[str, TranslationModel] = "m2m100",
              model_family: str = None,
              translation_configs: dict = {}) -> SSAFile:
    """
    Translates a subtitles `SSAFile` object, what :func:`SubsAI.transcribe` is returning

    :param subs: `SSAFile` object
    :param source_language: the language of the subtitles
    :param target_language: the target language
    :param model: the translation model, either an `str` or the model instance created by
                    :func:`create_translation_model`
    :param model_family: Either "mbart50" or "m2m100". By default, See `dl-translate` docs
    :param translation_configs: dict of translation configs (see :attr:`configs.ADVANCED_TOOLS_CONFIGS`)

    :return: returns an `SSAFile` subtitles translated to the target language
    """
    if type(model) == str:
        translation_model = Tools.create_translation_model(model_name=model, model_family=model_family)
    else:
        translation_model = model

    translated_subs = SSAFile()
    for sub in subs:
        translated_sub = sub.copy()
        translated_sub.text = translation_model.translate(text=sub.text,
                                                          source=source_language,
                                                          target=target_language,
                                                          batch_size=translation_configs[
                                                              'batch_size'] if 'batch_size' in translation_configs else 32,
                                                          verbose=translation_configs[
                                                              'verbose'] if 'verbose' in translation_configs else False)
        translated_subs.append(translated_sub)
    return translated_subs
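
Putting it together with SubsAI.transcribe, a translation pass might look like this (a sketch; the language names must match what available_translation_languages reports for the chosen model):

    from subsai.main import SubsAI, Tools

    subs_ai = SubsAI()
    model = subs_ai.create_model('openai/whisper', {'model_type': 'base'})
    subs = subs_ai.transcribe('./assets/test1.mp4', model)
    translated = Tools.translate(subs,
                                 source_language='English',
                                 target_language='French',
                                 model='m2m100')
    translated.save('test1-fr.srt')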

auto_sync staticmethod

auto_sync(subs, media_file, **kwargs)

Uses ffsubsync (https://github.com/smacke/ffsubsync) to auto-sync subtitles to the media file

Parameters:

  • subs (SSAFile) –

    SSAFile object

  • media_file (str) –

    path of the media_file

  • kwargs

    configs to pass to ffsubsync (see configs.ADVANCED_TOOLS_CONFIGS)

Returns:

  • SSAFile

    SSAFile auto-synced

Source code in src/subsai/main.py, lines 201-242
@staticmethod
def auto_sync(subs: SSAFile,
              media_file: str,
              **kwargs
              ) -> SSAFile:
    """
    Uses (ffsubsync)[https://github.com/smacke/ffsubsync] to auto-sync subtitles to the media file

    :param subs: `SSAFile` file
    :param media_file: path of the media_file
    :param kwargs: configs to pass to ffsubsync (see :attr:`configs.ADVANCED_TOOLS_CONFIGS`)

    :return: `SSAFile` auto-synced
    """
    parser = make_parser()
    srtin_file = tempfile.NamedTemporaryFile(delete=False)
    srtout_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        srtin = srtin_file.name + '.ass'
        srtout = srtout_file.name + '.srt'
        subs.save(srtin)
        cmd = [media_file,
               '-i', srtin,
               '-o', srtout]
        for config_name in kwargs:
            value = kwargs[config_name]
            if value is None or value is False:
                continue
            elif type(value) == bool and value is True:
                cmd.append(f'--{config_name}')
            else:
                cmd.append(f'--{config_name}')
                cmd.append(f'{value}')
        parsed_args = parser.parse_args(cmd)
        retval = run(parsed_args)["retval"]
        synced_subs = pysubs2.load(srtout)
        return synced_subs
    finally:
        srtin_file.close()
        os.unlink(srtin_file.name)
        srtout_file.close()
        os.unlink(srtout_file.name)
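
A minimal call; any keyword arguments are forwarded to ffsubsync as CLI flags, as shown in the source above:

    import pysubs2
    from subsai.main import Tools

    subs = pysubs2.load('test1.srt')                     # any SSAFile, e.g. from SubsAI.transcribe
    synced = Tools.auto_sync(subs, './assets/test1.mp4')
    synced.save('test1-synced.srt')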

merge_subs_with_video staticmethod

merge_subs_with_video(
    subs, media_file, output_filename=None, **kwargs
)

Uses ffmpeg to merge subtitles into a video media file. You can merge multiple subs at the same time by providing a dict with (lang, SSAFile object) key/value pairs. Example:

    file = '../../assets/video/test1.webm'
    subs_ai = SubsAI()
    model = subs_ai.create_model('openai/whisper', {'model_type': 'tiny'})
    en_subs = subs_ai.transcribe(file, model)
    ar_subs = pysubs2.load('../../assets/video/test0-ar.srt')
    Tools.merge_subs_with_video({'English': en_subs, "Arabic": ar_subs}, file)

Parameters:

  • subs (Dict[str, SSAFile]) –

    dict with (lang, SSAFile object) key/value pairs

  • media_file (str) –

    path of the video media_file

  • output_filename (str, default: None ) –

    Output file name (without the extension as it will be inferred from the media file)

Returns:

  • str

    Absolute path of the output file

Source code in src/subsai/main.py, lines 243-305
@staticmethod
def merge_subs_with_video(subs: Dict[str, SSAFile],
              media_file: str,
              output_filename: str = None,
              **kwargs
              ) -> str:
    """
    Uses ffmpeg to merge subtitles into a video media file.
    You can merge multiple subs at the same time by providing a dict with (lang,`SSAFile` object) key,value pairs
    Example:
    ```python
        file = '../../assets/video/test1.webm'
        subs_ai = SubsAI()
        model = subs_ai.create_model('openai/whisper', {'model_type': 'tiny'})
        en_subs = subs_ai.transcribe(file, model)
        ar_subs = pysubs2.load('../../assets/video/test0-ar.srt')
        Tools.merge_subs_with_video({'English': en_subs, "Arabic": ar_subs}, file)
    ```

    :param subs: dict with (lang,`SSAFile` object) key,value pairs
    :param media_file: path of the video media_file
    :param output_filename: Output file name (without the extension as it will be inferred from the media file)

    :return: Absolute path of the output file
    """
    metadata = ffmpeg.probe(media_file, select_streams="v")['streams'][0]
    assert metadata['codec_type'] == 'video', f'File {media_file} is not a video'


    srtin_files = {key: tempfile.NamedTemporaryFile(delete=False) for key in subs}
    try:
        in_file = pathlib.Path(media_file)
        if output_filename is not None:
            out_file = in_file.parent / f"{output_filename}{in_file.suffix}"
        else:
            out_file = in_file.parent / f"{in_file.stem}-subs-merged{in_file.suffix}"

        video = str(in_file.resolve())
        metadata_subs = {'scodec': 'mov_text'} if metadata['codec_name'] == 'h264' else {}
        ffmpeg_subs_inputs = []
        for i,lang in enumerate(srtin_files):
            srtin = srtin_files[lang].name + '.srt'
            subs[lang].save(srtin)
            ffmpeg_subs_inputs.append(ffmpeg.input(srtin)['s'])
            metadata_subs[f'metadata:s:s:{i}'] = "title=" + lang

        output_file = str(out_file.resolve())
        input_ffmpeg = ffmpeg.input(video)
        input_video = input_ffmpeg['v']
        input_audio = input_ffmpeg['a']
        output_ffmpeg = ffmpeg.output(
            input_video, input_audio, *ffmpeg_subs_inputs, output_file,
            vcodec='copy', acodec='copy',
            # scodec='mov_text',
            **metadata_subs
        )
        output_ffmpeg = ffmpeg.overwrite_output(output_ffmpeg)
        ffmpeg.run(output_ffmpeg)
    finally:
        for srtin_file in srtin_files.values():
            srtin_file.close()
            os.unlink(srtin_file.name)
    return str(out_file.resolve())
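
A self-contained version of the example above, also showing output_filename (paths are placeholders):

    import pysubs2
    from subsai.main import SubsAI, Tools

    file = './assets/video/test1.webm'
    subs_ai = SubsAI()
    model = subs_ai.create_model('openai/whisper', {'model_type': 'tiny'})
    en_subs = subs_ai.transcribe(file, model)
    ar_subs = pysubs2.load('./assets/video/test0-ar.srt')
    out_path = Tools.merge_subs_with_video({'English': en_subs, 'Arabic': ar_subs},
                                           file,
                                           output_filename='test1-with-subs')
    print(out_path)  # absolute path of the merged output file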

subsai.models

abstract_model

API that the transcription models should follow

AbstractModel

AbstractModel(model_name=None, model_config={})

Bases: ABC

Abstract Model class

Source code in src/subsai/models/abstract_model.py, lines 15-17
def __init__(self, model_name=None, model_config={}):
    self.model_name = model_name
    self.model_config = model_config
model_name instance-attribute
model_name = model_name
model_config instance-attribute
model_config = model_config
transcribe abstractmethod
transcribe(media_file)

Transcribe the media_file to subtitles.

Example use case from pysubs2.whisper:

    subs = SSAFile()
    for segment in segments:
        event = SSAEvent(start=make_time(s=segment["start"]), end=make_time(s=segment["end"]))
        event.plaintext = segment["text"].strip()
        subs.append(event)

Parameters:

  • media_file

    Path of the media file

Returns:

  • SSAFile

    Collection of SSAEvent(s) (see pysubs2.ssaevent)

Source code in src/subsai/models/abstract_model.py, lines 19-38
@abstractmethod
def transcribe(self, media_file) -> SSAFile:
    """
    Transcribe the `media_file` to subtitles.

    example use case from pysubs2.whisper:

    .. code-block:: python
        :linenos:

    subs = SSAFile()
    for segment in segments:
        event = SSAEvent(start=make_time(s=segment["start"]), end=make_time(s=segment["end"]))
        event.plaintext = segment["text"].strip()
        subs.append(event)

    :param media_file: Path of the media file
    :return: Collection of SSAEvent(s) (see :mod:`pysubs2.ssaevent`)
    """
    pass
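
Because the contract is just a transcribe method returning a pysubs2.SSAFile, plugging in a custom backend takes very little code. A hypothetical skeleton (EchoModel and its single fixed event are stand-ins, not a real engine):

    import pysubs2
    from pysubs2 import SSAEvent, SSAFile
    from subsai.models.abstract_model import AbstractModel

    class EchoModel(AbstractModel):
        """Toy model that emits one fixed subtitle; illustrates the contract only."""
        model_name = 'echo'

        def __init__(self, model_config={}):
            super().__init__(model_name=self.model_name, model_config=model_config)

        def transcribe(self, media_file) -> SSAFile:
            subs = SSAFile()
            event = SSAEvent(start=pysubs2.make_time(s=0), end=pysubs2.make_time(s=2))
            event.plaintext = f'transcribed: {media_file}'
            subs.append(event)
            return subs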

faster_whisper_model

Faster Whisper Model

See guillaumekln/faster-whisper

FasterWhisperModel

FasterWhisperModel(model_config)

Bases: AbstractModel

Source code in src/subsai/models/faster_whisper_model.py, lines 225-252
def __init__(self, model_config):
    super(FasterWhisperModel, self).__init__(model_config=model_config,
                                       model_name=self.model_name)
    # config
    self._model_size_or_path = _load_config('model_size_or_path', model_config, self.config_schema)
    self._device = _load_config('device', model_config, self.config_schema)
    self._device_index = _load_config('device_index', model_config, self.config_schema)
    self._compute_type = _load_config('compute_type', model_config, self.config_schema)
    self._cpu_threads = _load_config('cpu_threads', model_config, self.config_schema)
    self._num_workers = _load_config('num_workers', model_config, self.config_schema)

    self.transcribe_configs = \
        {config: _load_config(config, model_config, self.config_schema)
         for config in self.config_schema if not hasattr(self, f"_{config}")}

    self.model = WhisperModel(model_size_or_path=self._model_size_or_path,
                              device=self._device,
                              device_index=self._device_index,
                              compute_type=self._compute_type,
                              cpu_threads=self._cpu_threads,
                              num_workers=self._num_workers)


    # to show the progress
    import logging

    logging.basicConfig()
    logging.getLogger("faster_whisper").setLevel(logging.DEBUG)
model_name class-attribute instance-attribute
model_name = 'guillaumekln/faster-whisper'
config_schema class-attribute instance-attribute
config_schema = {
    "model_size_or_path": {
        "type": list,
        "description": 'Size of the model to use (e.g. "large-v2", "small", "tiny.en", etc.)or a path to a converted model directory. When a size is configured, the convertedmodel is downloaded from the Hugging Face Hub.',
        "options": available_models(),
        "default": "base",
    },
    "device": {
        "type": list,
        "description": 'Device to use for computation ("cpu", "cuda", "auto")',
        "options": ["auto", "cpu", "cuda"],
        "default": "auto",
    },
    "device_index": {
        "type": int,
        "description": "Device ID to use.The model can also be loaded on multiple GPUs by passing a list of IDs(e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallelwhen transcribe() is called from multiple Python threads (see also num_workers).",
        "options": None,
        "default": 0,
    },
    "compute_type": {
        "type": str,
        "description": "Type to use for computation.See https://opennmt.net/CTranslate2/quantization.html.",
        "options": None,
        "default": "default",
    },
    "cpu_threads": {
        "type": int,
        "description": "Number of threads to use when running on CPU (4 by default).A non zero value overrides the OMP_NUM_THREADS environment variable.",
        "options": None,
        "default": 0,
    },
    "num_workers": {
        "type": int,
        "description": "When transcribe() is called from multiple Python threads,having multiple workers enables true parallelism when running the model(concurrent calls to self.model.generate() will run in parallel).This can improve the global throughput at the cost of increased memory usage.",
        "options": None,
        "default": 1,
    },
    "temperature": {
        "type": Tuple,
        "description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
        "options": None,
        "default": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    },
    "compression_ratio_threshold": {
        "type": float,
        "description": "If the gzip compression ratio is above this value, treat as failed",
        "options": None,
        "default": 2.4,
    },
    "log_prob_threshold": {
        "type": float,
        "description": "If the average log probability over sampled tokens is below this value, treat as failed",
        "options": None,
        "default": -1.0,
    },
    "no_speech_threshold": {
        "type": float,
        "description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
        "options": None,
        "default": 0.6,
    },
    "condition_on_previous_text": {
        "type": bool,
        "description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
        "options": None,
        "default": True,
    },
    "task": {
        "type": list,
        "description": "whether to perform X->X 'transcribe' or X->English 'translate'",
        "options": ["transcribe", "translate"],
        "default": "transcribe",
    },
    "language": {
        "type": str,
        "description": "language that the audio is in; uses detected language if None",
        "options": None,
        "default": None,
    },
    "best_of": {
        "type": int,
        "description": "number of independent samples to collect, when t > 0",
        "options": None,
        "default": 5,
    },
    "beam_size": {
        "type": int,
        "description": "number of beams in beam search, when t == 0",
        "options": None,
        "default": 5,
    },
    "patience": {
        "type": float,
        "description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
        "options": None,
        "default": 1.0,
    },
    "length_penalty": {
        "type": float,
        "description": "'alpha' in Google NMT, None defaults to length norm",
        "options": None,
        "default": 1.0,
    },
    "prefix": {
        "type": str,
        "description": "text or tokens to prefix the current context",
        "options": None,
        "default": None,
    },
    "suppress_blank": {
        "type": bool,
        "description": "this will suppress blank outputs",
        "options": None,
        "default": True,
    },
    "suppress_tokens": {
        "type": Tuple,
        "description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
        "options": None,
        "default": [-1],
    },
    "without_timestamps": {
        "type": bool,
        "description": "use <|notimestamps|> to sample text tokens only",
        "options": None,
        "default": False,
    },
    "max_initial_timestamp": {
        "type": float,
        "description": "the initial timestamp cannot be later than this",
        "options": None,
        "default": 1.0,
    },
    "initial_prompt": {
        "type": str,
        "description": "Optional text to provide as a prompt for the first window.",
        "options": None,
        "default": None,
    },
    "word_timestamps": {
        "type": bool,
        "description": "Extract word-level timestamps using the cross-attention patternand dynamic time warping, and include the timestamps for each word in each segment.",
        "options": None,
        "default": False,
    },
    "prepend_punctuations": {
        "type": str,
        "description": "If word_timestamps is True, merge these punctuation symbolswith the next word",
        "options": None,
        "default": "\"'“¿([{-",
    },
    "append_punctuations": {
        "type": str,
        "description": "If word_timestamps is True, merge these punctuation symbolswith the previous word",
        "options": None,
        "default": "\"'.。,,!!??::”)]}、",
    },
    "vad_filter": {
        "type": bool,
        "description": "If True, use the integrated Silero VAD model to filter out parts of the audio without speech.",
        "options": None,
        "default": False,
    },
    "vad_parameters": {
        "type": dict,
        "description": "Parameters for splitting long audios into speech chunks using silero VAD.",
        "options": None,
        "default": {
            "threshold": 0.5,
            "min_speech_duration_ms": 250,
            "max_speech_duration_s": float("inf"),
            "min_silence_duration_ms": 2000,
            "window_size_samples": 1024,
            "speech_pad_ms": 400,
        },
    },
}
transcribe_configs instance-attribute
transcribe_configs = {
    config: _load_config(
        config, model_config, config_schema
    )
    for config in config_schema
    if not hasattr(self, f"_{config}")
}
model instance-attribute
model = WhisperModel(
    model_size_or_path=_model_size_or_path,
    device=_device,
    device_index=_device_index,
    compute_type=_compute_type,
    cpu_threads=_cpu_threads,
    num_workers=_num_workers,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/faster_whisper_model.py, lines 254-280
def transcribe(self, media_file) -> str:
    segments, info = self.model.transcribe(media_file, **self.transcribe_configs)
    subs = SSAFile()
    total_duration = round(info.duration, 2)  # Same precision as the Whisper timestamps.
    timestamps = 0.0  # to get the current segments
    with tqdm(total=total_duration, unit=" audio seconds") as pbar:
        if self.transcribe_configs['word_timestamps']:  # word level timestamps
            for segment in segments:
                pbar.update(segment.end - timestamps)
                timestamps = segment.end
                if timestamps < info.duration:
                    pbar.update(info.duration - timestamps)
                for word in segment.words:
                    event = SSAEvent(start=pysubs2.make_time(s=word.start), end=pysubs2.make_time(s=word.end))
                    event.plaintext = word.word.strip()
                    subs.append(event)
        else:
            for segment in segments:
                pbar.update(segment.end - timestamps)
                timestamps = segment.end
                if timestamps < info.duration:
                    pbar.update(info.duration - timestamps)
                event = SSAEvent(start=pysubs2.make_time(s=segment.start), end=pysubs2.make_time(s=segment.end))
                event.plaintext = segment.text.strip()
                subs.append(event)

    return subs
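
To use this backend through the high-level API, create it by its registry name with keys from config_schema above (a sketch; the registry key is assumed to match the model_name attribute, 'guillaumekln/faster-whisper'):

    from subsai.main import SubsAI

    subs_ai = SubsAI()
    model = subs_ai.create_model('guillaumekln/faster-whisper',
                                 {'model_size_or_path': 'base',
                                  'device': 'auto',
                                  'vad_filter': True})
    subs = subs_ai.transcribe('./assets/test1.mp4', model)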

hugging_face_model

Hugging Face Model

See automatic-speech-recognition

devices module-attribute

devices = get_available_devices()

HuggingFaceModel

HuggingFaceModel(model_config)

Bases: AbstractModel

Source code in src/subsai/models/hugging_face_model.py, lines 51-65
def __init__(self, model_config):
    super(HuggingFaceModel, self).__init__(model_config=model_config,
                                           model_name=self.model_name)
    # config
    self._model_id = _load_config('model_id', model_config, self.config_schema)
    self._device = _load_config('device', model_config, self.config_schema)
    self.segment_type = _load_config('segment_type', model_config, self.config_schema)
    self._chunk_length_s = _load_config('chunk_length_s', model_config, self.config_schema)


    self.model = pipeline(
        "automatic-speech-recognition",
        model=self._model_id,
        device=self._device,
    )
model_name class-attribute instance-attribute
model_name = 'HuggingFaceModel'
config_schema class-attribute instance-attribute
config_schema = {
    "model_id": {
        "type": str,
        "description": "The model id from the Hugging Face Hub.",
        "options": None,
        "default": "openai/whisper-tiny",
    },
    "device": {
        "type": list,
        "description": "Pytorch device",
        "options": devices,
        "default": devices[0],
    },
    "segment_type": {
        "type": list,
        "description": "Sentence-level or word-level timestamps",
        "options": ["sentence", "word"],
        "default": "sentence",
    },
    "chunk_length_s": {
        "type": float,
        "description": "(`float`, *optional*, defaults to 0):The input length for in each chunk. If `chunk_length_s = 0` then chunking is disabled (default).",
        "options": None,
        "default": 30,
    },
}
segment_type instance-attribute
segment_type = _load_config(
    "segment_type", model_config, config_schema
)
model instance-attribute
model = pipeline(
    "automatic-speech-recognition",
    model=_model_id,
    device=_device,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/hugging_face_model.py, lines 67-79
def transcribe(self, media_file):
    results = self.model(
        media_file,
        chunk_length_s=self._chunk_length_s,
        return_timestamps=True if self.segment_type == 'sentence' else 'word',
    )
    subs = SSAFile()
    for chunk in results['chunks']:
        event = SSAEvent(start=pysubs2.make_time(s=chunk['timestamp'][0]),
                         end=pysubs2.make_time(s=chunk['timestamp'][1]))
        event.plaintext = chunk['text']
        subs.append(event)
    return subs
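
Any compatible ASR checkpoint on the Hugging Face Hub can be used by setting model_id (a sketch; the registry key is assumed to match the model_name attribute, 'HuggingFaceModel'):

    from subsai.main import SubsAI

    subs_ai = SubsAI()
    model = subs_ai.create_model('HuggingFaceModel',
                                 {'model_id': 'openai/whisper-tiny',
                                  'segment_type': 'sentence'})
    subs = subs_ai.transcribe('./assets/test1.mp4', model)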

stable_ts_model

Stable-ts Model

See jianfch/stable-ts

StableTsModel

StableTsModel(model_config)

Bases: AbstractModel

Source code in src/subsai/models/stable_ts_model.py, lines 395-447
def __init__(self, model_config):
    super(StableTsModel, self).__init__(model_config=model_config,
                                        model_name=self.model_name)
    # config
    self._model_type = _load_config('model_type', model_config, self.config_schema)
    self._device = _load_config('device', model_config, self.config_schema)
    self._in_memory = _load_config('in_memory', model_config, self.config_schema)
    self._cpu_preload = _load_config('cpu_preload', model_config, self.config_schema)
    self._dq = _load_config('dq', model_config, self.config_schema)

    self._verbose = _load_config('verbose', model_config, self.config_schema)
    self._temperature = _load_config('temperature', model_config, self.config_schema)
    self._compression_ratio_threshold = _load_config('compression_ratio_threshold', model_config, self.config_schema)
    self._logprob_threshold = _load_config('logprob_threshold', model_config, self.config_schema)
    self._no_speech_threshold = _load_config('no_speech_threshold', model_config, self.config_schema)
    self._condition_on_previous_text = _load_config('condition_on_previous_text', model_config, self.config_schema)
    self._initial_prompt = _load_config('initial_prompt', model_config, self.config_schema)
    self._word_timestamps = _load_config('word_timestamps', model_config, self.config_schema)
    self._regroup = _load_config('regroup', model_config, self.config_schema)
    self._ts_num = _load_config('ts_num', model_config, self.config_schema)
    self._ts_noise = _load_config('ts_noise', model_config, self.config_schema)
    self._suppress_silence = _load_config('suppress_silence', model_config, self.config_schema)
    self._suppress_word_ts = _load_config('suppress_word_ts', model_config, self.config_schema)
    self._q_levels = _load_config('q_levels', model_config, self.config_schema)
    self._k_size = _load_config('k_size', model_config, self.config_schema)
    self._time_scale = _load_config('time_scale', model_config, self.config_schema)
    self._demucs = _load_config('demucs', model_config, self.config_schema)
    self._demucs_output = _load_config('demucs_output', model_config, self.config_schema)
    self._demucs_options = _load_config('demucs_options', model_config, self.config_schema)
    self._vad = _load_config('vad', model_config, self.config_schema)
    self._vad_threshold = _load_config('vad_threshold', model_config, self.config_schema)
    self._vad_onnx = _load_config('vad_onnx', model_config, self.config_schema)
    self._min_word_dur = _load_config('min_word_dur', model_config, self.config_schema)
    self._only_voice_freq = _load_config('only_voice_freq', model_config, self.config_schema)
    self._prepend_punctuations = _load_config('prepend_punctuations', model_config, self.config_schema)
    self._append_punctuations = _load_config('append_punctuations', model_config, self.config_schema)
    self._mel_first = _load_config('mel_first', model_config, self.config_schema)
    self._suppress_ts_tokens = _load_config('suppress_ts_tokens', model_config, self.config_schema)
    self._gap_padding = _load_config('gap_padding', model_config, self.config_schema)
    self._only_ffmpeg = _load_config('only_ffmpeg', model_config, self.config_schema)
    self._max_instant_words = _load_config('max_instant_words', model_config, self.config_schema)
    self._avg_prob_threshold = _load_config('avg_prob_threshold', model_config, self.config_schema)
    self._ignore_compatibility = _load_config('ignore_compatibility', model_config, self.config_schema)

    self.transcribe_configs = \
        {config: _load_config(config, model_config, self.config_schema)
         for config in self.config_schema if not hasattr(self, f"_{config}")}

    self.model = load_model(name=self._model_type,
                            device=self._device,
                            in_memory=self._in_memory,
                            cpu_preload=self._cpu_preload,
                            dq=self._dq)
model_name class-attribute instance-attribute
model_name = 'jianfch/stable-ts'
config_schema class-attribute instance-attribute
config_schema = {
    "model_type": {
        "type": list,
        "description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
        "options": available_models(),
        "default": "base",
    },
    "device": {
        "type": list,
        "description": "The PyTorch device to put the model into",
        "options": [None, *get_available_devices()],
        "default": None,
    },
    "in_memory": {
        "type": bool,
        "description": "bool, default False, Whether to preload the model weights into host memory.",
        "options": None,
        "default": False,
    },
    "cpu_preload": {
        "type": bool,
        "description": "Load model into CPU memory first then move model to specified device to reduce GPU memory usage when loading model",
        "options": None,
        "default": True,
    },
    "dq": {
        "type": bool,
        "description": "Whether to apply Dynamic Quantization to model to reduced memory usage and increase inference speed but at the cost of a slight decrease in accuracy. Only for CPU.",
        "options": None,
        "default": False,
    },
    "verbose": {
        "type": bool,
        "description": "Whether to display the text being decoded to the console. If True, displays all the details,If False, displays minimal details. If None, does not display anything",
        "options": None,
        "default": None,
    },
    "temperature": {
        "type": Tuple,
        "description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
        "options": None,
        "default": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    },
    "compression_ratio_threshold": {
        "type": float,
        "description": "If the gzip compression ratio is above this value, treat as failed",
        "options": None,
        "default": 2.4,
    },
    "logprob_threshold": {
        "type": float,
        "description": "If the average log probability over sampled tokens is below this value, treat as failed",
        "options": None,
        "default": -1.0,
    },
    "no_speech_threshold": {
        "type": float,
        "description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
        "options": None,
        "default": 0.6,
    },
    "condition_on_previous_text": {
        "type": bool,
        "description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
        "options": None,
        "default": True,
    },
    "initial_prompt": {
        "type": str,
        "description": "Optional text to provide as a prompt for the first window.",
        "options": None,
        "default": None,
    },
    "word_timestamps": {
        "type": bool,
        "description": "Extract word-level timestamps using the cross-attention patternand dynamic time warping, and include the timestamps for each word in each segment.",
        "options": None,
        "default": True,
    },
    "regroup": {
        "type": bool,
        "description": "default True, meaning the default regroup algorithmString for customizing the regrouping algorithm. False disables regrouping.Ignored if ``word_timestamps = False``.",
        "options": None,
        "default": True,
    },
    "ts_num": {
        "type": int,
        "description": "meaning disable this optionNumber of extra timestamp inferences to perform then use average of these extra timestamps.An experimental option that might hurt performance.",
        "options": None,
        "default": 0,
    },
    "ts_noise": {
        "type": float,
        "description": "Percentage of noise to add to audio_features to perform inferences for ``ts_num``.",
        "options": None,
        "default": 0.1,
    },
    "suppress_silence": {
        "type": bool,
        "description": "Whether to enable timestamps adjustments based on the detected silence.",
        "options": None,
        "default": True,
    },
    "suppress_word_ts": {
        "type": bool,
        "description": "Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.",
        "options": None,
        "default": True,
    },
    "q_levels": {
        "type": int,
        "description": "Quantization levels for generating timestamp suppression mask; ignored if ``vad = true``.Acts as a threshold to marking sound as silent.Fewer levels will increase the threshold of volume at which to mark a sound as silent.",
        "options": None,
        "default": 20,
    },
    "k_size": {
        "type": int,
        "description": "Kernel size for avg-pooling waveform to generate timestamp suppression mask; ignored if ``vad = true``.Recommend 5 or 3; higher sizes will reduce detection of silence.",
        "options": None,
        "default": 5,
    },
    "time_scale": {
        "type": float,
        "description": "Factor for scaling audio duration for inference.Greater than 1.0 'slows down' the audio, and less than 1.0 'speeds up' the audio. None is same as 1.0.A factor of 1.5 will stretch 10s audio to 15s for inference. This increases the effective resolutionof the model but can increase word error rate.",
        "options": None,
        "default": None,
    },
    "demucs": {
        "type": bool,
        "description": "Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance ofa Demucs model to avoid reloading the model for each run.Demucs must be installed to use. Official repo. https://github.com/facebookresearch/demucs.",
        "options": None,
        "default": False,
    },
    "demucs_output": {
        "type": str,
        "description": "Path to save the vocals isolated by Demucs as WAV file. Ignored if ``demucs = False``.Demucs must be installed to use. Official repo. https://github.com/facebookresearch/demucs.",
        "options": None,
        "default": None,
    },
    "demucs_options": {
        "type": dict,
        "description": "Options to use for :func:`stable_whisper.audio.demucs_audio`.",
        "options": None,
        "default": None,
    },
    "vad": {
        "type": bool,
        "description": "Whether to use Silero VAD to generate timestamp suppression mask.Silero VAD requires PyTorch 1.12.0+. Official repo, https://github.com/snakers4/silero-vad.",
        "options": None,
        "default": False,
    },
    "vad_threshold": {
        "type": float,
        "description": "Threshold for detecting speech with Silero VAD. Low threshold reduces false positives for silence detection.",
        "options": None,
        "default": 0.35,
    },
    "vad_onnx": {
        "type": bool,
        "description": "Whether to use ONNX for Silero VAD.",
        "options": None,
        "default": False,
    },
    "min_word_dur": {
        "type": float,
        "description": "Shortest duration each word is allowed to reach for silence suppression.",
        "options": None,
        "default": 0.1,
    },
    "only_voice_freq": {
        "type": bool,
        "description": "Whether to only use sound between 200 - 5000 Hz, where majority of human speech are.",
        "options": None,
        "default": False,
    },
    "prepend_punctuations": {
        "type": str,
        "description": "If word_timestamps is True, merge these punctuation symbolswith the next word",
        "options": None,
        "default": "\"'“¿([{-",
    },
    "append_punctuations": {
        "type": str,
        "description": "If word_timestamps is True, merge these punctuation symbolswith the previous word",
        "options": None,
        "default": "\"'.。,,!!??::”)]}、",
    },
    "mel_first": {
        "type": bool,
        "description": "Process entire audio track into log-Mel spectrogram first instead in chunks.Used if odd behavior seen in stable-ts but not in whisper, but use significantly more memory for long audio.",
        "options": None,
        "default": False,
    },
    "suppress_ts_tokens": {
        "type": bool,
        "description": " Whether to suppress timestamp tokens during inference for timestamps are detected at silent.Reduces hallucinations in some cases, but also prone to ignore disfluencies and repetitions.This option is ignored if ``suppress_silence = False``.",
        "options": None,
        "default": False,
    },
    "gap_padding": {
        "type": str,
        "description": "Padding prepend to each segments for word timing alignment.Used to reduce the probability of model predicting timestamps earlier than the first utterance.",
        "options": None,
        "default": "...",
    },
    "only_ffmpeg": {
        "type": bool,
        "description": "Whether to use only FFmpeg (instead of not yt-dlp) for URls",
        "options": None,
        "default": False,
    },
    "max_instant_words": {
        "type": float,
        "description": "If percentage of instantaneous words in a segment exceed this amount, the segment is removed.",
        "options": None,
        "default": 0.5,
    },
    "avg_prob_threshold": {
        "type": float,
        "description": "Transcribe the gap after the previous word and if the average word proababiliy of a segment falls below thisvalue, discard the segment. If ``None``, skip transcribing the gap to reduce chance of timestamps startingbefore the next utterance.",
        "options": None,
        "default": None,
    },
    "ignore_compatibility": {
        "type": bool,
        "description": "Whether to ignore warnings for compatibility issues with the detected Whisper version.",
        "options": None,
        "default": False,
    },
    "task": {
        "type": list,
        "description": "whether to perform X->X 'transcribe' or X->English 'translate'",
        "options": ["transcribe", "translate"],
        "default": "transcribe",
    },
    "language": {
        "type": str,
        "description": "language that the audio is in; uses detected language if None",
        "options": None,
        "default": None,
    },
    "sample_len": {
        "type": int,
        "description": "maximum number of tokens to sample",
        "options": None,
        "default": None,
    },
    "best_of": {
        "type": int,
        "description": "number of independent samples to collect, when t > 0",
        "options": None,
        "default": None,
    },
    "beam_size": {
        "type": int,
        "description": "number of beams in beam search, when t == 0",
        "options": None,
        "default": None,
    },
    "patience": {
        "type": float,
        "description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
        "options": None,
        "default": None,
    },
    "length_penalty": {
        "type": float,
        "description": "'alpha' in Google NMT, None defaults to length norm",
        "options": None,
        "default": None,
    },
    "prompt": {
        "type": str,
        "description": "text or tokens for the previous context",
        "options": None,
        "default": None,
    },
    "prefix": {
        "type": str,
        "description": "text or tokens to prefix the current context",
        "options": None,
        "default": None,
    },
    "suppress_blank": {
        "type": bool,
        "description": "this will suppress blank outputs",
        "options": None,
        "default": True,
    },
    "suppress_tokens": {
        "type": str,
        "description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
        "options": None,
        "default": "-1",
    },
    "without_timestamps": {
        "type": bool,
        "description": "use <|notimestamps|> to sample text tokens only",
        "options": None,
        "default": False,
    },
    "max_initial_timestamp": {
        "type": float,
        "description": "the initial timestamp cannot be later than this",
        "options": None,
        "default": 1.0,
    },
    "fp16": {
        "type": bool,
        "description": "use fp16 for most of the calculation",
        "options": None,
        "default": True,
    },
}
transcribe_configs instance-attribute
transcribe_configs = {
    config: _load_config(
        config, model_config, config_schema
    )
    for config in config_schema
    if not hasattr(self, f"_{config}")
}
model instance-attribute
model = load_model(
    name=_model_type,
    device=_device,
    in_memory=_in_memory,
    cpu_preload=_cpu_preload,
    dq=_dq,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/stable_ts_model.py, lines 457-516
def transcribe(self, media_file) -> SSAFile:
    result = transcribe_stable(self.model,
                               audio=media_file,
                               verbose=self._verbose,
                               temperature=self._temperature,
                               compression_ratio_threshold=self._compression_ratio_threshold,
                               logprob_threshold=self._logprob_threshold,
                               no_speech_threshold=self._no_speech_threshold,
                               condition_on_previous_text=self._condition_on_previous_text,
                               initial_prompt=self._initial_prompt,
                               word_timestamps=self._word_timestamps,
                               regroup=self._regroup,
                               ts_num=self._ts_num,
                               ts_noise=self._ts_noise,
                               suppress_silence=self._suppress_silence,
                               suppress_word_ts=self._suppress_word_ts,
                               q_levels=self._q_levels,
                               k_size=self._k_size,
                               time_scale=self._time_scale,
                               demucs=self._demucs,
                               demucs_output=self._demucs_output,
                               demucs_options=self._demucs_options,
                               vad=self._vad,
                               vad_threshold=self._vad_threshold,
                               vad_onnx=self._vad_onnx,
                               min_word_dur=self._min_word_dur,
                               only_voice_freq=self._only_voice_freq,
                               prepend_punctuations=self._prepend_punctuations,
                               append_punctuations=self._append_punctuations,
                               mel_first=self._mel_first,
                               suppress_ts_tokens=self._suppress_ts_tokens,
                               gap_padding=self._gap_padding,
                               only_ffmpeg=self._only_ffmpeg,
                               max_instant_words=self._max_instant_words,
                               avg_prob_threshold=self._avg_prob_threshold,
                               ignore_compatibility=self._ignore_compatibility,
                               **self.transcribe_configs,
                               )

    subs = SSAFile()

    if self._word_timestamps:  # word level timestamps
        for segment in result.segments:
            for word in segment.words:
                try:
                    event = SSAEvent(start=pysubs2.make_time(s=word.start), end=pysubs2.make_time(s=word.end))
                    event.plaintext = word.word.strip()
                    subs.append(event)
                except Exception as e:
                    logging.warning(f"Something wrong with {word}")
                    logging.warning(e)

    else:
        for segment in result.segments:
            event = SSAEvent(start=pysubs2.make_time(s=segment.start), end=pysubs2.make_time(s=segment.end))
            event.plaintext = segment.text.strip()
            subs.append(event)


    return subs
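
This backend defaults to word-level timestamps (word_timestamps is True in config_schema above); turning it off yields sentence-level events. A sketch (the registry key is assumed to match the model_name attribute, 'jianfch/stable-ts'):

    from subsai.main import SubsAI

    subs_ai = SubsAI()
    model = subs_ai.create_model('jianfch/stable-ts',
                                 {'model_type': 'base',
                                  'word_timestamps': False})
    subs = subs_ai.transcribe('./assets/test1.mp4', model)
    subs.save('test1.srt')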

whisperX_model

WhisperX Model

See m-bain/whisperX

WhisperXModel

WhisperXModel(model_config)

Bases: AbstractModel

Source code in src/subsai/models/whisperX_model.py, lines 105-127
def __init__(self, model_config):
    super(WhisperXModel, self).__init__(model_config=model_config,
                                        model_name=self.model_name)
    # config
    self.model_type = _load_config('model_type', model_config, self.config_schema)
    self.device = _load_config('device', model_config, self.config_schema)
    self.compute_type = _load_config('compute_type', model_config, self.config_schema)
    self.download_root = _load_config('download_root', model_config, self.config_schema)
    self.language = _load_config('language', model_config, self.config_schema)
    self.segment_type = _load_config('segment_type', model_config, self.config_schema)
    # transcribe config
    self.batch_size = _load_config('batch_size', model_config, self.config_schema)
    self.return_char_alignments = _load_config('return_char_alignments', model_config, self.config_schema)
    self.speaker_labels = _load_config('speaker_labels', model_config, self.config_schema)
    self.HF_TOKEN = _load_config('HF_TOKEN', model_config, self.config_schema)
    self.min_speakers = _load_config('min_speakers', model_config, self.config_schema)
    self.max_speakers = _load_config('max_speakers', model_config, self.config_schema)

    self.model = whisperx.load_model(self.model_type,
                                     device=self.device,
                                     compute_type=self.compute_type,
                                     download_root=self.download_root,
                                     language=self.language)
model_name class-attribute instance-attribute
model_name = 'm-bain/whisperX'
config_schema class-attribute instance-attribute
config_schema = {
    "model_type": {
        "type": list,
        "description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
        "options": available_models(),
        "default": "base",
    },
    "device": {
        "type": list,
        "description": 'Device to use for computation ("cpu", "cuda")',
        "options": ["cpu", "cuda"],
        "default": "cpu",
    },
    "compute_type": {
        "type": list,
        "description": "change to 'int8' if low on GPU mem (may reduce accuracy)",
        "options": ["default", "float16", "int8"],
        "default": "default",
    },
    "download_root": {
        "type": str,
        "description": "Path to download the model files; by default, it uses '~/.cache/whisper'",
        "options": None,
        "default": None,
    },
    "language": {
        "type": str,
        "description": "language that the audio is in; uses detected language if None",
        "options": None,
        "default": None,
    },
    "segment_type": {
        "type": list,
        "description": "Word-level timestamps, Choose here between sentence-level and word-level",
        "options": ["sentence", "word"],
        "default": "sentence",
    },
    "batch_size": {
        "type": int,
        "description": "reduce if low on GPU mem",
        "options": None,
        "default": 16,
    },
    "return_char_alignments": {
        "type": bool,
        "description": "Whether to return char alignments",
        "options": None,
        "default": False,
    },
    "speaker_labels": {
        "type": bool,
        "description": "Run Diarization Pipeline",
        "options": None,
        "default": False,
    },
    "HF_TOKEN": {
        "type": str,
        "description": "if speaker labels is True, you will need Hugging Face access token to use the diarization models, https://github.com/m-bain/whisperX#speaker-diarization",
        "options": None,
        "default": None,
    },
    "min_speakers": {
        "type": int,
        "description": "min speakers",
        "options": None,
        "default": None,
    },
    "max_speakers": {
        "type": int,
        "description": "max speakers",
        "options": None,
        "default": None,
    },
}
model_type instance-attribute
model_type = _load_config(
    "model_type", model_config, config_schema
)
device instance-attribute
device = _load_config("device", model_config, config_schema)
compute_type instance-attribute
compute_type = _load_config(
    "compute_type", model_config, config_schema
)
download_root instance-attribute
download_root = _load_config(
    "download_root", model_config, config_schema
)
language instance-attribute
language = _load_config(
    "language", model_config, config_schema
)
segment_type instance-attribute
segment_type = _load_config(
    "segment_type", model_config, config_schema
)
batch_size instance-attribute
batch_size = _load_config(
    "batch_size", model_config, config_schema
)
return_char_alignments instance-attribute
return_char_alignments = _load_config(
    "return_char_alignments", model_config, config_schema
)
speaker_labels instance-attribute
speaker_labels = _load_config(
    "speaker_labels", model_config, config_schema
)
HF_TOKEN instance-attribute
HF_TOKEN = _load_config(
    "HF_TOKEN", model_config, config_schema
)
min_speakers instance-attribute
min_speakers = _load_config(
    "min_speakers", model_config, config_schema
)
max_speakers instance-attribute
max_speakers = _load_config(
    "max_speakers", model_config, config_schema
)
model instance-attribute
model = load_model(
    model_type,
    device=device,
    compute_type=compute_type,
    download_root=download_root,
    language=language,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/whisperX_model.py
def transcribe(self, media_file) -> str:
    audio = whisperx.load_audio(media_file)
    result = self.model.transcribe(audio, batch_size=self.batch_size)
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=self.device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, self.device,
                            return_char_alignments=self.return_char_alignments)
    self._clear_gpu()
    del model_a
    if self.speaker_labels:
        diarize_model = whisperx.DiarizationPipeline(use_auth_token=self.HF_TOKEN, device=self.device)
        diarize_segments = diarize_model(audio, min_speakers=self.min_speakers, max_speakers=self.max_speakers)
        result = whisperx.assign_word_speakers(diarize_segments, result)
        self._clear_gpu()
        del diarize_model

    subs = SSAFile()

    if self.segment_type == 'word':  # word level timestamps
        for segment in result['segments']:
            for word in segment['words']:
                try:
                    event = SSAEvent(start=pysubs2.make_time(s=word["start"]), end=pysubs2.make_time(s=word["end"]),
                                     name=segment["speaker"] if self.speaker_labels else "")
                    event.plaintext = segment["speaker"] + ": " + word["word"].strip() if self.speaker_labels else word["word"].strip()
                    subs.append(event)
                except Exception as e:
                    logging.warning(f"Something wrong with {word}")
                    logging.warning(e)

    elif self.segment_type == 'sentence':
        for segment in result['segments']:
            event = SSAEvent(start=pysubs2.make_time(s=segment["start"]), end=pysubs2.make_time(s=segment["end"]),
                             name=segment["speaker"] if self.speaker_labels else "")
            event.plaintext = segment["speaker"] + ": "+ segment["text"].strip() if self.speaker_labels else segment["text"].strip()
            subs.append(event)
    else:
        raise Exception(f'Unknown `segment_type` value, it should be one of the following: '
                        f' {self.config_schema["segment_type"]["options"]}')
    return subs
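Example usage (a minimal sketch: the media path and the Hugging Face token are placeholders, and speaker diarization requires whisperX's diarization dependencies):

from subsai import SubsAI

subs_ai = SubsAI()
# word-level events with speaker diarization; HF_TOKEN is a placeholder
model = subs_ai.create_model('m-bain/whisperX', {
    'model_type': 'base',
    'device': 'cpu',
    'segment_type': 'word',
    'speaker_labels': True,
    'HF_TOKEN': '<your-huggingface-token>',
})
subs = subs_ai.transcribe('./assets/video.mp4', model)
subs.save('video.srt')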

whisper_api_model

Whisper API Model

See openai/whisper

TMPDIR module-attribute

TMPDIR = gettempdir()

OPENAI_API_SIZE_LIMIT_MB module-attribute

OPENAI_API_SIZE_LIMIT_MB = 24

WhisperAPIModel

WhisperAPIModel(model_config)

Bases: AbstractModel

Source code in src/subsai/models/whisper_api_model.py
def __init__(self, model_config):
    # config
    self.model_type = _load_config('model_type', model_config, self.config_schema)
    self.api_key = _load_config('api_key', model_config, self.config_schema)
    self.language = _load_config('language', model_config, self.config_schema)
    self.prompt = _load_config('prompt', model_config, self.config_schema)
    self.temperature = _load_config('temperature', model_config, self.config_schema)

    self.client = OpenAI(api_key=self.api_key)
model_name class-attribute instance-attribute
model_name = 'openai/whisper'
config_schema class-attribute instance-attribute
config_schema = {
    "model_type": {
        "type": list,
        "description": "OpenAI Whisper API, currently only supports large-v2 which is named as whisper-1/                                 There is a 25mb upload limit so audio is chunked locally, this may lead to lower performance.",
        "options": ["whisper-1"],
        "default": "whisper-1",
    },
    "api_key": {
        "type": str,
        "description": "Your OpenAI API key",
        "options": None,
        "default": get("OPENAI_KEY", None),
    },
    "language": {
        "type": str,
        "description": "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.",
        "options": None,
        "default": None,
    },
    "prompt": {
        "type": str,
        "description": "An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.",
        "options": None,
        "default": None,
    },
    "temperature": {
        "type": float,
        "description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.",
        "options": None,
        "default": 0,
    },
}
model_type instance-attribute
model_type = _load_config(
    "model_type", model_config, config_schema
)
api_key instance-attribute
api_key = _load_config(
    "api_key", model_config, config_schema
)
language instance-attribute
language = _load_config(
    "language", model_config, config_schema
)
prompt instance-attribute
prompt = _load_config("prompt", model_config, config_schema)
temperature instance-attribute
temperature = _load_config(
    "temperature", model_config, config_schema
)
client instance-attribute
client = OpenAI(api_key=api_key)
chunk_audio
chunk_audio(audio_file_path)
Source code in src/subsai/models/whisper_api_model.py
def chunk_audio(self,audio_file_path) -> list:
    # Load the audio file
    audio = AudioSegment.from_mp3(audio_file_path)

    # Desired chunk size in megabytes (MB)
    chunk_size_bits = OPENAI_API_SIZE_LIMIT_MB * 1024 * 1024 * 8
    bitrate = audio.frame_rate * audio.frame_width
    chunk_duration_ms = ((chunk_size_bits) / bitrate) * 1000

    chunks = []

    # Split the audio into chunks
    current_ms = 0
    while current_ms < len(audio):
        # Calculate the end of the current chunk
        end_ms = current_ms + chunk_duration_ms
        # Create a chunk from the current position to the end position
        chunk = audio[current_ms:int(end_ms)]
        # Add the chunk to the list of chunks and include offset
        chunks.append((chunk,current_ms))
        # Update the current position
        current_ms = end_ms

    return chunks
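As a rough worked example of the chunk-duration arithmetic above (the values assume 16-bit stereo audio at 44.1 kHz, i.e. frame_width = 4 bytes per frame; the real numbers depend on the decoded file):

OPENAI_API_SIZE_LIMIT_MB = 24
frame_rate, frame_width = 44_100, 4          # 16-bit stereo PCM at 44.1 kHz

chunk_size_bits = OPENAI_API_SIZE_LIMIT_MB * 1024 * 1024 * 8   # 201_326_592
bitrate = frame_rate * frame_width                              # 176_400
chunk_duration_ms = (chunk_size_bits / bitrate) * 1000          # ~1_141_308 ms
print(f'{chunk_duration_ms / 60_000:.1f} minutes per chunk')    # ~19.0 minutes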
transcribe
transcribe(media_file)
Source code in src/subsai/models/whisper_api_model.py
def transcribe(self, media_file) -> str:

    audio_file_path = convert_video_to_audio_ffmpeg(media_file)

    chunks = self.chunk_audio(audio_file_path)

    results = ''

    for i, (chunk,offset) in enumerate(chunks):
        chunk_path = os.path.join(TMPDIR,f'chunk_{i}.mp3')
        print('Transcribing audio chunk {}/{}'.format(i,len(chunks)))
        chunk.export(chunk_path, format='mp3')
        audio_file = open(chunk_path, "rb")

        # Use OpenAI Whisper API
        result = self.client.audio.transcriptions.create(
            model=self.model_type,
            language=self.language,
            prompt=self.prompt,
            temperature=self.temperature,
            file=audio_file,
            response_format="srt"
        )

        with open(chunk_path+'.srt','w') as f:
            f.write(result)

        # shift subtitles by offset
        result = SSAFile.from_string(result)
        result.shift(ms=offset)
        results += result.to_string('srt')

    results = ''.join(results)

    return SSAFile.from_string(results)
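Example usage (a sketch: the API key and media path are placeholders; by default the key is read from the OPENAI_KEY environment variable):

from subsai import SubsAI

subs_ai = SubsAI()
# the model name matches the 'API/openai/whisper' entry in AVAILABLE_MODELS
model = subs_ai.create_model('API/openai/whisper', {'api_key': 'sk-...'})
subs = subs_ai.transcribe('./assets/video.mp4', model)
subs.save('video.srt')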

split_filename

split_filename(filepath)
Source code in src/subsai/models/whisper_api_model.py
def split_filename(filepath):
    path, full_filename = os.path.split(filepath)
    filename, ext = os.path.splitext(full_filename)
    return path,filename,ext

convert_video_to_audio_ffmpeg

convert_video_to_audio_ffmpeg(video_file, output_ext='mp3')
Source code in src/subsai/models/whisper_api_model.py
def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"):
    # Construct the output file name
    path,filename,ext = split_filename(video_file)
    output_file = os.path.join(TMPDIR,f"{filename}.{output_ext}")


    print('Saving audio to {} with ffmpeg...'.format(output_file))
    # Execute the ffmpeg conversion
    (
        ffmpeg
        .input(video_file)
        .output(output_file)
        .overwrite_output()
        .run(quiet=True)
    )
    return output_file
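A possible standalone call (requires the ffmpeg binary and the ffmpeg-python package; the input path is a placeholder):

# extracts the audio track into the system temp dir and returns its path
audio_path = convert_video_to_audio_ffmpeg('./assets/video.mp4')
print(audio_path)  # e.g. /tmp/video.mp3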

whisper_model

Whisper Model

See openai/whisper

WhisperModel

WhisperModel(model_config)

Bases: AbstractModel

Source code in src/subsai/models/whisper_model.py
def __init__(self, model_config):
    super(WhisperModel, self).__init__(model_config=model_config,
                                       model_name=self.model_name)
    # config
    self.model_type = _load_config('model_type', model_config, self.config_schema)
    self.device = _load_config('device', model_config, self.config_schema)
    self.download_root = _load_config('download_root', model_config, self.config_schema)
    self.in_memory = _load_config('in_memory', model_config, self.config_schema)

    self.verbose = _load_config('verbose', model_config, self.config_schema)
    self.temperature = _load_config('temperature', model_config, self.config_schema)
    self.compression_ratio_threshold = _load_config('compression_ratio_threshold', model_config, self.config_schema)
    self.logprob_threshold = _load_config('logprob_threshold', model_config, self.config_schema)
    self.no_speech_threshold = _load_config('no_speech_threshold', model_config, self.config_schema)
    self.condition_on_previous_text = _load_config('condition_on_previous_text', model_config, self.config_schema)

    self.decode_options = \
        {config: _load_config(config, model_config, self.config_schema)
         for config in self.config_schema if not hasattr(self, config)}

    self.model = whisper.load_model(name=self.model_type,
                                    device=self.device,
                                    download_root=self.download_root,
                                    in_memory=self.in_memory)
model_name class-attribute instance-attribute
model_name = 'openai/whisper'
config_schema class-attribute instance-attribute
config_schema = {
    "model_type": {
        "type": list,
        "description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
        "options": available_models(),
        "default": "base",
    },
    "device": {
        "type": list,
        "description": "The PyTorch device to put the model into",
        "options": [None, *get_available_devices()],
        "default": None,
    },
    "download_root": {
        "type": str,
        "description": "Path to download the model files; by default, it uses '~/.cache/whisper'",
        "options": None,
        "default": None,
    },
    "in_memory": {
        "type": bool,
        "description": "whether to preload the model weights into host memory",
        "options": None,
        "default": False,
    },
    "verbose": {
        "type": bool,
        "description": "Whether to display the text being decoded to the console. If True, displays all the details,If False, displays minimal details. If None, does not display anything",
        "options": None,
        "default": None,
    },
    "temperature": {
        "type": Tuple,
        "description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
        "options": None,
        "default": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    },
    "compression_ratio_threshold": {
        "type": float,
        "description": "If the gzip compression ratio is above this value, treat as failed",
        "options": None,
        "default": 2.4,
    },
    "logprob_threshold": {
        "type": float,
        "description": "If the average log probability over sampled tokens is below this value, treat as failed",
        "options": None,
        "default": -1.0,
    },
    "no_speech_threshold": {
        "type": float,
        "description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
        "options": None,
        "default": 0.6,
    },
    "condition_on_previous_text": {
        "type": bool,
        "description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
        "options": None,
        "default": True,
    },
    "task": {
        "type": list,
        "description": "whether to perform X->X 'transcribe' or X->English 'translate'",
        "options": ["transcribe", "translate"],
        "default": "transcribe",
    },
    "language": {
        "type": str,
        "description": "language that the audio is in; uses detected language if None",
        "options": None,
        "default": None,
    },
    "sample_len": {
        "type": int,
        "description": "maximum number of tokens to sample",
        "options": None,
        "default": None,
    },
    "best_of": {
        "type": int,
        "description": "number of independent samples to collect, when t > 0",
        "options": None,
        "default": None,
    },
    "beam_size": {
        "type": int,
        "description": "number of beams in beam search, when t == 0",
        "options": None,
        "default": None,
    },
    "patience": {
        "type": float,
        "description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
        "options": None,
        "default": None,
    },
    "length_penalty": {
        "type": float,
        "description": "'alpha' in Google NMT, None defaults to length norm",
        "options": None,
        "default": None,
    },
    "prompt": {
        "type": str,
        "description": "text or tokens for the previous context",
        "options": None,
        "default": None,
    },
    "prefix": {
        "type": str,
        "description": "text or tokens to prefix the current context",
        "options": None,
        "default": None,
    },
    "suppress_blank": {
        "type": bool,
        "description": "this will suppress blank outputs",
        "options": None,
        "default": True,
    },
    "suppress_tokens": {
        "type": str,
        "description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
        "options": None,
        "default": "-1",
    },
    "without_timestamps": {
        "type": bool,
        "description": "use <|notimestamps|> to sample text tokens only",
        "options": None,
        "default": False,
    },
    "max_initial_timestamp": {
        "type": float,
        "description": "the initial timestamp cannot be later than this",
        "options": None,
        "default": 1.0,
    },
    "fp16": {
        "type": bool,
        "description": "use fp16 for most of the calculation",
        "options": None,
        "default": True,
    },
}
model_type instance-attribute
model_type = _load_config(
    "model_type", model_config, config_schema
)
device instance-attribute
device = _load_config("device", model_config, config_schema)
download_root instance-attribute
download_root = _load_config(
    "download_root", model_config, config_schema
)
in_memory instance-attribute
in_memory = _load_config(
    "in_memory", model_config, config_schema
)
verbose instance-attribute
verbose = _load_config(
    "verbose", model_config, config_schema
)
temperature instance-attribute
temperature = _load_config(
    "temperature", model_config, config_schema
)
compression_ratio_threshold instance-attribute
compression_ratio_threshold = _load_config(
    "compression_ratio_threshold",
    model_config,
    config_schema,
)
logprob_threshold instance-attribute
logprob_threshold = _load_config(
    "logprob_threshold", model_config, config_schema
)
no_speech_threshold instance-attribute
no_speech_threshold = _load_config(
    "no_speech_threshold", model_config, config_schema
)
condition_on_previous_text instance-attribute
condition_on_previous_text = _load_config(
    "condition_on_previous_text",
    model_config,
    config_schema,
)
decode_options instance-attribute
decode_options = {
    config: _load_config(
        config, model_config, config_schema
    )
    for config in config_schema
    if not hasattr(self, config)
}
model instance-attribute
model = load_model(
    name=model_type,
    device=device,
    download_root=download_root,
    in_memory=in_memory,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/whisper_model.py
def transcribe(self, media_file) -> str:
    audio = whisper.load_audio(media_file)
    result = self.model.transcribe(audio,
                                   verbose=self.verbose,
                                   temperature=self.temperature,
                                   compression_ratio_threshold=self.compression_ratio_threshold,
                                   logprob_threshold=self.logprob_threshold,
                                   no_speech_threshold=self.no_speech_threshold,
                                   condition_on_previous_text=self.condition_on_previous_text,
                                   **self.decode_options)
    subs = pysubs2.load_from_whisper(result)
    return subs

whisper_timestamped_model

whisper_timestamped

See linto-ai/whisper-timestamped

WhisperTimeStamped

WhisperTimeStamped(model_config={})

Bases: AbstractModel

Source code in src/subsai/models/whisper_timestamped_model.py
def __init__(self, model_config={}):
    super(WhisperTimeStamped, self).__init__(model_config=model_config,
                                             model_name=self.model_name)
    # config
    self.model_type = _load_config('model_type', model_config, self.config_schema)
    self.segment_type = _load_config('segment_type', model_config, self.config_schema)
    self.device = _load_config('device', model_config, self.config_schema)
    self.download_root = _load_config('download_root', model_config, self.config_schema)
    self.in_memory = _load_config('in_memory', model_config, self.config_schema)

    self.verbose = _load_config('verbose', model_config, self.config_schema)
    self.temperature = _load_config('temperature', model_config, self.config_schema)
    self.compression_ratio_threshold = _load_config('compression_ratio_threshold', model_config, self.config_schema)
    self.logprob_threshold = _load_config('logprob_threshold', model_config, self.config_schema)
    self.no_speech_threshold = _load_config('no_speech_threshold', model_config, self.config_schema)
    self.condition_on_previous_text = _load_config('condition_on_previous_text', model_config, self.config_schema)

    self.decode_options = \
        {config: _load_config(config, model_config, self.config_schema)
         for config in self.config_schema if not hasattr(self, config)}

    self.model = whisper_timestamped.load_model(name=self.model_type,
                                                device=self.device,
                                                download_root=self.download_root,
                                                in_memory=self.in_memory)
model_name class-attribute instance-attribute
model_name = 'linto-ai/whisper-timestamped'
config_schema class-attribute instance-attribute
config_schema = {
    "model_type": {
        "type": list,
        "description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
        "options": available_models(),
        "default": "base",
    },
    "segment_type": {
        "type": list,
        "description": "Whisper_timestamps gives the ability to have word-level timestamps, Choose here between sentence-level and word-level",
        "options": ["sentence", "word"],
        "default": "sentence",
    },
    "device": {
        "type": list,
        "description": "The PyTorch device to put the model into",
        "options": [None, *get_available_devices()],
        "default": None,
    },
    "download_root": {
        "type": str,
        "description": "Path to download the model files; by default, it uses '~/.cache/whisper'",
        "options": None,
        "default": None,
    },
    "in_memory": {
        "type": bool,
        "description": "whether to preload the model weights into host memory",
        "options": None,
        "default": False,
    },
    "verbose": {
        "type": bool,
        "description": "Whether to display the text being decoded to the console. If True, displays all the details,If False, displays minimal details. If None, does not display anything",
        "options": None,
        "default": None,
    },
    "temperature": {
        "type": Tuple,
        "description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
        "options": None,
        "default": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    },
    "compression_ratio_threshold": {
        "type": float,
        "description": "If the gzip compression ratio is above this value, treat as failed",
        "options": None,
        "default": 2.4,
    },
    "logprob_threshold": {
        "type": float,
        "description": "If the average log probability over sampled tokens is below this value, treat as failed",
        "options": None,
        "default": -1.0,
    },
    "no_speech_threshold": {
        "type": float,
        "description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
        "options": None,
        "default": 0.6,
    },
    "condition_on_previous_text": {
        "type": bool,
        "description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
        "options": None,
        "default": True,
    },
    "task": {
        "type": list,
        "description": "whether to perform X->X 'transcribe' or X->English 'translate'",
        "options": ["transcribe", "translate"],
        "default": "transcribe",
    },
    "language": {
        "type": str,
        "description": "language that the audio is in; uses detected language if None",
        "options": None,
        "default": None,
    },
    "sample_len": {
        "type": int,
        "description": "maximum number of tokens to sample",
        "options": None,
        "default": None,
    },
    "best_of": {
        "type": int,
        "description": "number of independent samples to collect, when t > 0",
        "options": None,
        "default": None,
    },
    "beam_size": {
        "type": int,
        "description": "number of beams in beam search, when t == 0",
        "options": None,
        "default": None,
    },
    "patience": {
        "type": float,
        "description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
        "options": None,
        "default": None,
    },
    "length_penalty": {
        "type": float,
        "description": "'alpha' in Google NMT, None defaults to length norm",
        "options": None,
        "default": None,
    },
    "suppress_tokens": {
        "type": str,
        "description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
        "options": None,
        "default": "-1",
    },
    "fp16": {
        "type": bool,
        "description": "use fp16 for most of the calculation",
        "options": None,
        "default": True,
    },
    "remove_punctuation_from_words": {
        "type": bool,
        "description": "If False, words will be glued with the next punctuation mark (if any).If True, there will be no punctuation mark in the `words[:]['text']` list.It only affects these strings; This has no influence on the computation of the word confidence, whatever the value of `include_punctuation_in_confidence` is.",
        "options": None,
        "default": False,
    },
    "refine_whisper_precision": {
        "type": float,
        "description": "How much can we refine Whisper segment positions, in seconds. Must be a multiple of 0.02.",
        "options": None,
        "default": 0.5,
    },
    "min_word_duration": {
        "type": float,
        "description": "Minimum duration of a word, in seconds. If a word is shorter than this, timestamps will be adjusted.",
        "options": None,
        "default": 0.04,
    },
    "plot_word_alignment": {
        "type": bool,
        "description": "Whether to plot the word alignment for each segment. matplotlib must be installed to use this option.",
        "options": None,
        "default": False,
    },
    "seed": {
        "type": int,
        "description": "Random seed to use for temperature sampling, for the sake of reproducibility.Choose None for unpredictable randomness",
        "options": None,
        "default": 1234,
    },
    "vad": {
        "type": bool,
        "description": "Whether to perform voice activity detection (VAD) on the audio file, to remove silent parts before transcribing with Whisper model. This should decrease hallucinations from the Whisper model.",
        "options": None,
        "default": False,
    },
    "detect_disfluencies": {
        "type": bool,
        "description": 'Whether to detect disfluencies (i.e. hesitations, filler words, repetitions, corrections, etc.) that Whisper model might have omitted in the transcription. This should make the word timestamp prediction more accurate.And probable disfluencies will be marked as special words "[*]"',
        "options": None,
        "default": False,
    },
    "trust_whisper_timestamps": {
        "type": bool,
        "description": "Whether to rely on Whisper's timestamps to get approximative first estimate of segment positions (up to refine_whisper_precision).",
        "options": None,
        "default": True,
    },
    "naive_approach": {
        "type": bool,
        "description": "Force the naive approach that consists in decoding twice the audio file, once to get the transcription and once with the decoded tokens to get the alignment. Note that this approach is used anyway when beam_size is not None and/or when the temperature is a list with more than one element.",
        "options": None,
        "default": False,
    },
}
model_type instance-attribute
model_type = _load_config(
    "model_type", model_config, config_schema
)
segment_type instance-attribute
segment_type = _load_config(
    "segment_type", model_config, config_schema
)
device instance-attribute
device = _load_config("device", model_config, config_schema)
download_root instance-attribute
download_root = _load_config(
    "download_root", model_config, config_schema
)
in_memory instance-attribute
in_memory = _load_config(
    "in_memory", model_config, config_schema
)
verbose instance-attribute
verbose = _load_config(
    "verbose", model_config, config_schema
)
temperature instance-attribute
temperature = _load_config(
    "temperature", model_config, config_schema
)
compression_ratio_threshold instance-attribute
compression_ratio_threshold = _load_config(
    "compression_ratio_threshold",
    model_config,
    config_schema,
)
logprob_threshold instance-attribute
logprob_threshold = _load_config(
    "logprob_threshold", model_config, config_schema
)
no_speech_threshold instance-attribute
no_speech_threshold = _load_config(
    "no_speech_threshold", model_config, config_schema
)
condition_on_previous_text instance-attribute
condition_on_previous_text = _load_config(
    "condition_on_previous_text",
    model_config,
    config_schema,
)
decode_options instance-attribute
decode_options = {
    config: _load_config(
        config, model_config, config_schema
    )
    for config in config_schema
    if not hasattr(self, config)
}
model instance-attribute
model = load_model(
    name=model_type,
    device=device,
    download_root=download_root,
    in_memory=in_memory,
)
transcribe
transcribe(media_file)
Source code in src/subsai/models/whisper_timestamped_model.py
def transcribe(self, media_file) -> str:
    audio = whisper_timestamped.load_audio(media_file)
    results = whisper_timestamped.transcribe(self.model, audio,
                                             verbose=self.verbose,
                                             temperature=self.temperature,
                                             compression_ratio_threshold=self.compression_ratio_threshold,
                                             logprob_threshold=self.logprob_threshold,
                                             no_speech_threshold=self.no_speech_threshold,
                                             condition_on_previous_text=self.condition_on_previous_text,
                                             **self.decode_options
                                             )
    subs = SSAFile()
    if self.segment_type == 'word':  # word level timestamps
        for segment in results['segments']:
            for word in segment['words']:
                event = SSAEvent(start=pysubs2.make_time(s=word["start"]), end=pysubs2.make_time(s=word["end"]))
                event.plaintext = word["text"].strip()
                subs.append(event)
    elif self.segment_type == 'sentence':
        for segment in results['segments']:
            event = SSAEvent(start=pysubs2.make_time(s=segment["start"]), end=pysubs2.make_time(s=segment["end"]))
            event.plaintext = segment["text"].strip()
            subs.append(event)
    else:
        raise Exception(f'Unknown `segment_type` value, it should be one of the following: '
                        f' {self.config_schema["segment_type"]["options"]}')
    return subs
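Example usage (a sketch: the media path is a placeholder; `segment_type` and `vad` are options from the schema above):

from subsai import SubsAI

subs_ai = SubsAI()
# word-level events with voice activity detection enabled
model = subs_ai.create_model('linto-ai/whisper-timestamped', {
    'model_type': 'base',
    'segment_type': 'word',
    'vad': True,
})
subs = subs_ai.transcribe('./assets/video.mp4', model)
subs.save('video-words.srt')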

whispercpp_model

Whisper.cpp Model

See whisper.cpp, See pywhispercpp

WhisperCppModel

WhisperCppModel(model_config)

Bases: AbstractModel

Source code in src/subsai/models/whispercpp_model.py
def __init__(self, model_config):
    super(WhisperCppModel, self).__init__(model_config=model_config,
                                       model_name=self.model_name)
    # config
    self.model_type = _load_config('model_type', model_config, self.config_schema)

    self.params = {}
    for config in self.config_schema:
        if not hasattr(self, config):
            config_value = _load_config(config, model_config, self.config_schema)
            if config_value is None:
                continue
            self.params[config] = config_value

    self.model = Model(model=self.model_type, **self.params)
model_name class-attribute instance-attribute
model_name = 'ggerganov/whisper.cpp'
config_schema class-attribute instance-attribute
config_schema = {
    "model_type": {
        "type": list,
        "description": "Available whisper.cpp models",
        "options": AVAILABLE_MODELS,
        "default": "base",
    },
    "n_threads": {
        "type": int,
        "description": "Number of threads to allocate for the inferencedefault to min(4, available hardware_concurrency)",
        "options": None,
        "default": 4,
    },
    "n_max_text_ctx": {
        "type": int,
        "description": "max tokens to use from past text as prompt for the decoder",
        "options": None,
        "default": 16384,
    },
    "offset_ms": {
        "type": int,
        "description": "start offset in ms",
        "options": None,
        "default": 0,
    },
    "duration_ms": {
        "type": int,
        "description": "audio duration to process in ms",
        "options": None,
        "default": 0,
    },
    "translate": {
        "type": bool,
        "description": "whether to translate the audio to English",
        "options": None,
        "default": False,
    },
    "no_context": {
        "type": bool,
        "description": "do not use past transcription (if any) as initial prompt for the decoder",
        "options": None,
        "default": False,
    },
    "single_segment": {
        "type": bool,
        "description": "force single segment output (useful for streaming)",
        "options": None,
        "default": False,
    },
    "print_special": {
        "type": bool,
        "description": "print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)",
        "options": None,
        "default": False,
    },
    "print_progress": {
        "type": bool,
        "description": "print progress information",
        "options": None,
        "default": True,
    },
    "print_realtime": {
        "type": bool,
        "description": "print results from within whisper.cpp (avoid it, use callback instead)",
        "options": None,
        "default": False,
    },
    "print_timestamps": {
        "type": bool,
        "description": "print timestamps for each text segment when printing realtime",
        "options": None,
        "default": True,
    },
    "token_timestamps": {
        "type": bool,
        "description": "enable token-level timestamps",
        "options": None,
        "default": False,
    },
    "thold_pt": {
        "type": float,
        "description": "timestamp token probability threshold (~0.01)",
        "options": None,
        "default": 0.01,
    },
    "thold_ptsum": {
        "type": float,
        "description": "timestamp token sum probability threshold (~0.01)",
        "options": None,
        "default": 0.01,
    },
    "max_len": {
        "type": int,
        "description": "max segment length in characters",
        "options": None,
        "default": 0,
    },
    "split_on_word": {
        "type": bool,
        "description": "split on word rather than on token (when used with max_len)",
        "options": None,
        "default": False,
    },
    "max_tokens": {
        "type": int,
        "description": "max tokens per segment (0 = no limit)",
        "options": None,
        "default": 0,
    },
    "speed_up": {
        "type": bool,
        "description": "speed-up the audio by 2x using Phase Vocoder",
        "options": None,
        "default": False,
    },
    "audio_ctx": {
        "type": int,
        "description": "overwrite the audio context size (0 = use default)",
        "options": None,
        "default": 0,
    },
    "prompt_n_tokens": {
        "type": int,
        "description": "tokens to provide to the whisper decoder as initial prompt",
        "options": None,
        "default": 0,
    },
    "language": {
        "type": str,
        "description": 'for auto-detection, set to None, "" or "auto"',
        "options": None,
        "default": "en",
    },
    "suppress_blank": {
        "type": bool,
        "description": "common decoding parameters",
        "options": None,
        "default": True,
    },
    "suppress_non_speech_tokens": {
        "type": bool,
        "description": "common decoding parameters",
        "options": None,
        "default": False,
    },
    "temperature": {
        "type": float,
        "description": "initial decoding temperature",
        "options": None,
        "default": 0.0,
    },
    "max_initial_ts": {
        "type": float,
        "description": "max_initial_ts",
        "options": None,
        "default": 1.0,
    },
    "length_penalty": {
        "type": float,
        "description": "length_penalty",
        "options": None,
        "default": -1.0,
    },
    "temperature_inc": {
        "type": float,
        "description": "temperature_inc",
        "options": None,
        "default": 0.2,
    },
    "entropy_thold": {
        "type": float,
        "description": 'similar to OpenAI\'s "compression_ratio_threshold"',
        "options": None,
        "default": 2.4,
    },
    "logprob_thold": {
        "type": float,
        "description": "logprob_thold",
        "options": None,
        "default": -1.0,
    },
    "no_speech_thold": {
        "type": float,
        "description": "no_speech_thold",
        "options": None,
        "default": 0.6,
    },
    "greedy": {
        "type": dict,
        "description": "greedy",
        "options": None,
        "default": {"best_of": -1},
    },
    "beam_search": {
        "type": dict,
        "description": "beam_search",
        "options": None,
        "default": {"beam_size": -1, "patience": -1.0},
    },
}
model_type instance-attribute
model_type = _load_config(
    "model_type", model_config, config_schema
)
params instance-attribute
params = {}
model instance-attribute
model = Model(model=model_type, **params)
transcribe
transcribe(media_file)
Source code in src/subsai/models/whispercpp_model.py
def transcribe(self, media_file) -> str:
    segments = self.model.transcribe(media=media_file)
    subs = SSAFile()
    for seg in segments:
        event = SSAEvent(start=seg.t0*10, end=seg.t1*10)
        event.plaintext = seg.text.strip()
        subs.append(event)
    return subs
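Note that pywhispercpp segments report timestamps in centiseconds, hence the `* 10` conversion to milliseconds above. Example usage (a sketch; the media path is a placeholder):

from subsai import SubsAI

subs_ai = SubsAI()
# CPU-only inference through pywhispercpp
model = subs_ai.create_model('ggerganov/whisper.cpp', {
    'model_type': 'base',
    'n_threads': 4,
})
subs = subs_ai.transcribe('./assets/video.mp4', model)
subs.save('video.srt')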

subsai.configs

Configurations file

AVAILABLE_MODELS module-attribute

AVAILABLE_MODELS = {
    "openai/whisper": {
        "class": WhisperModel,
        "description": "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.",
        "url": "https://github.com/openai/whisper",
        "config_schema": config_schema,
    },
    "linto-ai/whisper-timestamped": {
        "class": WhisperTimeStamped,
        "description": "Multilingual Automatic Speech Recognition with word-level timestamps and confidence.",
        "url": "https://github.com/linto-ai/whisper-timestamped",
        "config_schema": config_schema,
    },
    "ggerganov/whisper.cpp": {
        "class": WhisperCppModel,
        "description": "High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model\n* Plain C/C++ implementation without dependencies\n* Runs on the CPU\n",
        "url": "https://github.com/ggerganov/whisper.cpp\nhttps://github.com/abdeladim-s/pywhispercpp",
        "config_schema": config_schema,
    },
    "guillaumekln/faster-whisper": {
        "class": FasterWhisperModel,
        "description": "**faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.\nThis implementation is up to 4 times faster than [openai/whisper]( https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.",
        "url": "https://github.com/guillaumekln/faster-whisper",
        "config_schema": config_schema,
    },
    "m-bain/whisperX": {
        "class": WhisperXModel,
        "description": "**whisperX** is a fast automatic speech recognition (70x realtime with large-v2) with word-level timestamps and speaker diarization.",
        "url": "https://github.com/m-bain/whisperX",
        "config_schema": config_schema,
    },
    "jianfch/stable-ts": {
        "class": StableTsModel,
        "description": "**Stabilizing Timestamps for Whisper** This library modifies [Whisper](https://github.com/openai/whisper) to produce more reliable timestamps and extends its functionality.",
        "url": "https://github.com/jianfch/stable-ts",
        "config_schema": config_schema,
    },
    "API/openai/whisper": {
        "class": WhisperAPIModel,
        "description": "API for the OpenAI large-v2 Whisper model, requires an API key.",
        "url": "https://platform.openai.com/docs/guides/speech-to-text",
        "config_schema": config_schema,
    },
    "HuggingFace": {
        "class": HuggingFaceModel,
        "description": "Hugging Face implementation of Whisper. Any speech recognition pretrained model from the Hugging Face hub can be used as well",
        "url": "https://huggingface.co/tasks/automatic-speech-recognition",
        "config_schema": config_schema,
    },
}
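
These entries back the static helpers on SubsAI; for instance, a short sketch that lists every registered model with its project URL:

from subsai import SubsAI

for name in SubsAI.available_models():
    info = SubsAI.model_info(name)       # {'description': ..., 'url': ...}
    print(f"{name}: {info['url']}")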

BASIC_TOOLS_CONFIGS module-attribute

BASIC_TOOLS_CONFIGS = {
    "set time": {
        "description": "Set time to a subtitle",
        "config_schema": {
            "h": {
                "type": float,
                "description": "hours: Integer or float values, may be positive or negative",
                "options": None,
                "default": 0,
            },
            "m": {
                "type": float,
                "description": "minutes: Integer or float values, may be positive or negative",
                "options": None,
                "default": 0,
            },
            "s": {
                "type": float,
                "description": "seconds: Integer or float values, may be positive or negative",
                "options": None,
                "default": 0,
            },
            "ms": {
                "type": float,
                "description": "milliseconds: Integer or float values, may be positive or negative",
                "options": None,
                "default": 0,
            },
        },
    },
    "shift": {
        "description": "Shift all subtitles by constant time amount",
        "config_schema": {
            "h": {
                "type": float,
                "description": "hours: Integer or float values, may be positive or negative",
                "options": None,
                "default": 0,
            },
            "m": {
                "type": float,
                "description": "minutes: Integer or float values, may be positive or negative",
                "options": None,
                "default": 0,
            },
            "s": {
                "type": float,
                "description": "seconds: Integer or float values, may be positive or negative",
                "options": None,
                "default": 0,
            },
            "ms": {
                "type": float,
                "description": "milliseconds: Integer or float values, may be positive or negative",
                "options": None,
                "default": 0,
            },
            "frames": {
                "type": int,
                "description": "When specified, must be an integer number of frames",
                "options": None,
                "default": None,
            },
            "fps": {
                "type": float,
                "description": "When specified, must be a positive number.",
                "options": None,
                "default": None,
            },
        },
    },
}
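
The field names (h, m, s, ms, frames, fps) mirror pysubs2's time keyword arguments; the "shift" tool, for example, corresponds to SSAFile.shift. A minimal sketch with placeholder file names:

import pysubs2

subs = pysubs2.load('video.srt')
# delay every subtitle by 2.5 seconds (h/m/s/ms as in the schema above)
subs.shift(s=2, ms=500)
subs.save('video-shifted.srt')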

ADVANCED_TOOLS_CONFIGS module-attribute

ADVANCED_TOOLS_CONFIGS = {
    "ffsubsync": {
        "description": "Language-agnostic automatic synchronization of subtitles with video, so that subtitles are aligned to the correct starting point within the video.",
        "url": "https://github.com/smacke/ffsubsync",
        "config_schema": {
            "vad": {
                "type": list,
                "description": "Which voice activity detector to use for speech extraction (if using video / audio as a reference",
                "options": [
                    "subs_then_webrtc",
                    "webrtc",
                    "subs_then_auditok",
                    "auditok",
                    "subs_then_silero",
                    "silero",
                ],
                "default": DEFAULT_VAD,
            },
            "max-subtitle-seconds": {
                "type": float,
                "description": "Maximum duration for a subtitle to appear on-screen",
                "options": None,
                "default": DEFAULT_MAX_SUBTITLE_SECONDS,
            },
            "start-seconds": {
                "type": int,
                "description": "Start time for processing",
                "options": None,
                "default": DEFAULT_START_SECONDS,
            },
            "max-offset-seconds": {
                "type": float,
                "description": "The max allowed offset seconds for any subtitle segment",
                "options": None,
                "default": DEFAULT_MAX_OFFSET_SECONDS,
            },
            "apply-offset-seconds": {
                "type": float,
                "description": "Apply a predefined offset in seconds to all subtitle segments",
                "options": None,
                "default": DEFAULT_APPLY_OFFSET_SECONDS,
            },
            "suppress-output-if-offset-less-than": {
                "type": float,
                "description": "Apply a predefined offset in seconds to all subtitle segments",
                "options": None,
                "default": None,
            },
            "frame-rate": {
                "type": int,
                "description": "Frame rate for audio extraction",
                "options": None,
                "default": DEFAULT_FRAME_RATE,
            },
            "output-encoding": {
                "type": str,
                "description": 'What encoding to use for writing output subtitles (default=utf-8). Can indicate "same" to use same encoding as that of the input.',
                "options": None,
                "default": "utf-8",
            },
            "skip-infer-framerate-ratio": {
                "type": bool,
                "description": "If set, do not try to infer framerate ratio based on duration ratio.",
                "options": None,
                "default": False,
            },
            "no-fix-framerate": {
                "type": bool,
                "description": "If specified, subsync will not attempt to correct a framerate",
                "options": None,
                "default": False,
            },
            "serialize-speech": {
                "type": bool,
                "description": "If specified, serialize reference speech to a numpy array.",
                "options": None,
                "default": False,
            },
            "gss": {
                "type": bool,
                "description": "If specified, use golden-section search to try to findthe optimal framerate ratio between video and subtitles.",
                "options": None,
                "default": False,
            },
        },
    },
    "Translation": {
        "description": "Translate to different languages using AI",
        "url": "https://github.com/xhluca/dl-translate",
        "config_schema": {
            "model": {
                "type": list,
                "description": "The model",
                "options": available_translation_models(),
                "default": available_translation_models()[
                    0
                ],
            },
            "device": {
                "type": list,
                "description": '"cpu", "gpu" or "auto". If it\'s set to "auto", will try to select a GPU when available or else fall back to CPU',
                "options": [
                    "auto",
                    *get_available_devices(),
                ],
                "default": "auto",
            },
            "batch_size": {
                "type": int,
                "description": "The number of samples to load at once. If set to `None`, it will process everything at once\nA smaller value is preferred for `batch_size` if your (video) RAM is limited",
                "options": None,
                "default": 32,
            },
            "verbose": {
                "type": bool,
                "description": "Whether to display the progress bar for every batch processed.",
                "options": None,
                "default": True,
            },
        },
    },
}
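
The Translation tool wraps dl-translate; a rough standalone sketch of the underlying API (the default multilingual model is downloaded on first use):

import dl_translate as dlt

mt = dlt.TranslationModel(device='auto')   # 'auto' picks a GPU when available
text = 'Bonjour tout le monde'
print(mt.translate(text, source=dlt.lang.FRENCH, target=dlt.lang.ENGLISH))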