
PyLLaMACpp API Reference

pyllamacpp.model

This module contains a simple Python API around llama.cpp

Model

Model(
    model_path,
    prompt_context="",
    prompt_prefix="",
    prompt_suffix="",
    log_level=logging.ERROR,
    n_ctx=512,
    seed=0,
    n_gpu_layers=0,
    f16_kv=False,
    logits_all=False,
    vocab_only=False,
    use_mlock=False,
    embedding=False,
)

A simple Python class on top of llama.cpp

Example usage

from pyllamacpp.model import Model

model = Model(model_path='path/to/ggml/model')
for token in model.generate("Tell me a joke ?"):
    print(token, end='', flush=True)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model_path` | `str` | the path to the ggml model | *required* |
| `prompt_context` | `str` | the global context of the interaction | `''` |
| `prompt_prefix` | `str` | the prompt prefix | `''` |
| `prompt_suffix` | `str` | the prompt suffix | `''` |
| `log_level` | `int` | logging level, set to `logging.ERROR` by default | `logging.ERROR` |
| `n_ctx` | `int` | LLaMA context size | `512` |
| `seed` | `int` | random seed | `0` |
| `n_gpu_layers` | `int` | number of layers to store in VRAM | `0` |
| `f16_kv` | `bool` | use fp16 for the KV cache | `False` |
| `logits_all` | `bool` | the `llama_eval()` call computes all logits, not just the last one | `False` |
| `vocab_only` | `bool` | only load the vocabulary, no weights | `False` |
| `use_mlock` | `bool` | force the system to keep the model in RAM | `False` |
| `embedding` | `bool` | embedding mode only | `False` |
Source code in pyllamacpp/model.py
def __init__(self,
             model_path: str,
             prompt_context: str = '',
             prompt_prefix: str = '',
             prompt_suffix: str = '',
             log_level: int = logging.ERROR,
             n_ctx: int = 512,
             seed: int = 0,
             n_gpu_layers: int = 0,
             f16_kv: bool = False,
             logits_all: bool = False,
             vocab_only: bool = False,
             use_mlock: bool = False,
             embedding: bool = False):
    """
    :param model_path: the path to the ggml model
    :param prompt_context: the global context of the interaction
    :param prompt_prefix: the prompt prefix
    :param prompt_suffix: the prompt suffix
    :param log_level: logging level, set to `logging.ERROR` by default
    :param n_ctx: LLaMA context
    :param seed: random seed
    :param n_gpu_layers: number of layers to store in VRAM
    :param f16_kv: use fp16 for KV cache
    :param logits_all: the llama_eval() call computes all logits, not just the last one
    :param vocab_only: only load the vocabulary, no weights
    :param use_mlock: force system to keep model in RAM
    :param embedding: embedding mode only
    """

    # set logging level
    set_log_level(log_level)
    self._ctx = None

    if not Path(model_path).is_file():
        raise Exception(f"File {model_path} not found!")

    self.llama_params = pp.llama_context_default_params()
    # update llama_params
    self.llama_params.n_ctx = n_ctx
    self.llama_params.seed = seed
    self.llama_params.n_gpu_layers = n_gpu_layers
    self.llama_params.f16_kv = f16_kv
    self.llama_params.logits_all = logits_all
    self.llama_params.vocab_only = vocab_only
    self.llama_params.use_mlock = use_mlock
    self.llama_params.embedding = embedding

    self._ctx = pp.llama_init_from_file(model_path, self.llama_params)

    # gpt params
    self.gpt_params = pp.gpt_params()

    self.res = ""

    self._n_ctx = pp.llama_n_ctx(self._ctx)
    self._last_n_tokens = [0] * self._n_ctx  # n_ctx elements
    self._n_past = 0
    self.prompt_cntext = prompt_context
    self.prompt_prefix = prompt_prefix
    self.prompt_suffix = prompt_suffix

    self._prompt_context_tokens = []
    self._prompt_prefix_tokens = []
    self._prompt_suffix_tokens = []

    self.reset()
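
For illustration, a minimal sketch of constructing a model with a larger context and some layers offloaded to VRAM. The model path is a placeholder, and the values are arbitrary examples; pick ones that fit your model and hardware:

from pyllamacpp.model import Model

model = Model(
    model_path='path/to/ggml/model',  # placeholder path to a ggml model file
    n_ctx=1024,        # larger LLaMA context size
    n_gpu_layers=20,   # store 20 layers in VRAM, if the build supports it
    seed=42,           # fixed seed for reproducible sampling
)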

reset

reset()

Resets the context

Source code in pyllamacpp/model.py
def reset(self) -> None:
    """Resets the context"""
    self._prompt_context_tokens = pp.llama_tokenize(self._ctx, self.prompt_cntext, True)
    self._prompt_prefix_tokens = pp.llama_tokenize(self._ctx, self.prompt_prefix, True)
    self._prompt_suffix_tokens = pp.llama_tokenize(self._ctx, self.prompt_suffix, False)
    self._last_n_tokens = [0] * self._n_ctx  # n_ctx elements
    self._n_past = 0

tokenize

tokenize(text)

Returns a list of tokens for the text

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | text to be tokenized | *required* |

Returns:

| Type | Description |
| --- | --- |
|  | List of tokens |

Source code in pyllamacpp/model.py
def tokenize(self, text: str):
    """
    Returns a list of tokens for the text
    :param text: text to be tokenized
    :return: List of tokens
    """
    return pp.llama_tokenize(self._ctx, text, True)

detokenize

detokenize(tokens)

Returns the text corresponding to a list of tokens

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `tokens` | `list` | the list of tokens | *required* |

Returns:

| Type | Description |
| --- | --- |
|  | A string representing the text extracted from the tokens |

Source code in pyllamacpp/model.py
def detokenize(self, tokens: list):
    """
    Returns the text corresponding to a list of tokens
    :param tokens: the list of tokens
    :return: A string representing the text extracted from the tokens
    """
    return pp.llama_tokens_to_str(self._ctx, tokens)
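
A small round-trip sketch: tokenize a string and turn the tokens back into text. The model path is a placeholder:

from pyllamacpp.model import Model

model = Model(model_path='path/to/ggml/model')   # placeholder path
tokens = model.tokenize("Hello, world!")
print(tokens)                     # list of token ids
print(model.detokenize(tokens))   # text reconstructed from the tokens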

generate

generate(
    prompt,
    n_predict=None,
    n_threads=4,
    seed=None,
    antiprompt=None,
    n_batch=512,
    n_keep=0,
    top_k=40,
    top_p=0.95,
    tfs_z=1.0,
    typical_p=1.0,
    temp=0.8,
    repeat_penalty=1.1,
    repeat_last_n=64,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    mirostat=0,
    mirostat_tau=5.0,
    mirostat_eta=0.1,
    infinite_generation=False,
)

Runs llama.cpp inference and yields new predicted tokens from the prompt provided as input

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `prompt` | `str` | The prompt :) | *required* |
| `n_predict` | `Union[None, int]` | if `n_predict` is not `None`, the inference will stop once it reaches `n_predict` tokens, otherwise it will continue until `EOS` | `None` |
| `n_threads` | `int` | The number of CPU threads | `4` |
| `seed` | `Union[None, int]` | Set the RNG seed, leave it `None` for random | `None` |
| `antiprompt` | `str` | aka the stop word; the generation will stop if this word is predicted, keep it `None` to handle it your own way | `None` |
| `n_batch` | `int` | batch size for prompt processing (must be >=32 to use BLAS) | `512` |
| `n_keep` | `int` | number of tokens to keep from the initial prompt | `0` |
| `top_k` | `int` | top-K sampling parameter, <= 0 to use vocab size | `40` |
| `top_p` | `float` | top-P sampling parameter, 1.0 = disabled | `0.95` |
| `tfs_z` | `float` | tail-free sampling parameter, 1.0 = disabled | `1.0` |
| `typical_p` | `float` | typical sampling parameter, 1.0 = disabled | `1.0` |
| `temp` | `float` | temperature, 1.0 = disabled | `0.8` |
| `repeat_penalty` | `float` | repeat penalty sampling parameter, 1.0 = disabled | `1.1` |
| `repeat_last_n` | `int` | last n tokens to penalize (0 = disable penalty, -1 = context size) | `64` |
| `frequency_penalty` | `float` | 0.0 = disabled | `0.0` |
| `presence_penalty` | `float` | 0.0 = disabled | `0.0` |
| `mirostat` | `int` | 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 | `0` |
| `mirostat_tau` | `float` | target entropy | `5.0` |
| `mirostat_eta` | `float` | learning rate | `0.1` |
| `infinite_generation` | `bool` | set it to `True` to make the generation run indefinitely | `False` |

Returns:

| Type | Description |
| --- | --- |
| `Generator` | Tokens generator |

Source code in pyllamacpp/model.py
def generate(self,
             prompt: str,
             n_predict: Union[None, int] = None,
             n_threads: int = 4,
             seed: Union[None, int] = None,
             antiprompt: str = None,
             n_batch: int = 512,
             n_keep: int = 0,
             top_k: int = 40,
             top_p: float = 0.95,
             tfs_z: float = 1.00,
             typical_p: float = 1.00,
             temp: float = 0.8,
             repeat_penalty: float = 1.10,
             repeat_last_n: int = 64,
             frequency_penalty: float = 0.00,
             presence_penalty: float = 0.00,
             mirostat: int = 0,
             mirostat_tau: float = 5.00,
             mirostat_eta: float = 0.1,
             infinite_generation: bool = False) -> Generator:
    """
    Runs llama.cpp inference and yields new predicted tokens from the prompt provided as input

    :param prompt: The prompt :)
    :param n_predict: if n_predict is not None, the inference will stop if it reaches `n_predict` tokens, otherwise
                      it will continue until `EOS`
    :param n_threads: The number of CPU threads
    :param seed: Set rng seed, leave it None for random
    :param antiprompt: aka the stop word, the generation will stop if this word is predicted,
                       keep it None to handle it in your own way
    :param n_batch: batch size for prompt processing (must be >=32 to use BLAS)
    :param n_keep: number of tokens to keep from initial prompt
    :param top_k: top K sampling parameter, <= 0 to use vocab size
    :param top_p: top P sampling parameter, 1.0 = disabled
    :param tfs_z: tfs_z sampling parameter, 1.0 = disabled
    :param typical_p: typical_p sampling parameter, 1.0 = disabled
    :param temp: Temperature, 1.0 = disabled
    :param repeat_penalty: repeat penalty sampling parameter, 1.0 = disabled
    :param repeat_last_n: last n tokens to penalize (0 = disable penalty, -1 = context size)
    :param frequency_penalty: 0.0 = disabled
    :param presence_penalty: 0.0 = disabled
    :param mirostat: 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    :param mirostat_tau: target entropy
    :param mirostat_eta: learning rate
    :param infinite_generation: set it to `True` to make the generation go infinitely

    :return: Tokens generator
    """
    # update params
    self.gpt_params.n_batch = n_batch
    self.gpt_params.n_keep = n_keep
    self.gpt_params.top_k = top_k
    self.gpt_params.top_p = top_p
    self.gpt_params.tfs_z = tfs_z
    self.gpt_params.typical_p = typical_p
    self.gpt_params.temp = temp
    self.gpt_params.repeat_penalty = repeat_penalty
    self.gpt_params.repeat_last_n = repeat_last_n
    self.gpt_params.frequency_penalty = frequency_penalty
    self.gpt_params.presence_penalty = presence_penalty
    self.gpt_params.mirostat = mirostat
    self.gpt_params.mirostat_tau = mirostat_tau
    self.gpt_params.mirostat_eta = mirostat_eta

    if seed is not None:
        pp.llama_set_rng_seed(self._ctx, seed)
    else:
        seed = int(time.time())
        pp.llama_set_rng_seed(self._ctx, seed)

    input_tokens = self._prompt_prefix_tokens + \
                   pp.llama_tokenize(self._ctx, prompt, len(self._prompt_prefix_tokens) == 0) + \
                   self._prompt_suffix_tokens

    # input_tokens = pp.llama_tokenize(self._ctx, prompt, True)

    if len(input_tokens) > self._n_ctx - 4:
        raise Exception('Prompt too long!')
    predicted_tokens = []
    predicted_token = 0

    # add global context if no past yet
    if self._n_past == 0:
        for tok in self._prompt_context_tokens:
            predicted_tokens.append(tok)
            self._last_n_tokens.pop(0)
            self._last_n_tokens.append(tok)

    # consume input tokens
    for tok in input_tokens:
        predicted_tokens.append(tok)
        self._last_n_tokens.pop(0)
        self._last_n_tokens.append(tok)

    n_remain = 0
    if antiprompt is not None:
        sequence_queue = []
        stop_word = antiprompt.strip()

    n_ctx = pp.llama_n_ctx(self._ctx)

    while infinite_generation or predicted_token != pp.llama_token_eos():
        if len(predicted_tokens) > 0:
            # infinite text generation via context swapping
            if (self._n_past + len(predicted_tokens)) > n_ctx:
                n_left = self._n_past - self.gpt_params.n_keep
                self._n_past = max(1, self.gpt_params.n_keep)
                predicted_tokens[:0] = self._last_n_tokens[
                                       n_ctx - n_left // 2 - len(predicted_tokens):len(self._last_n_tokens) - len(
                                           predicted_tokens)]

            for i in range(0, len(predicted_tokens), self.gpt_params.n_batch):
                n_eval = len(predicted_tokens) - i
                if n_eval > self.gpt_params.n_batch:
                    n_eval = self.gpt_params.n_batch

                pp.llama_eval(self._ctx,
                              predicted_tokens[i:],
                              n_eval,
                              self._n_past,
                              n_threads)
                self._n_past += n_eval

        predicted_tokens.clear()

        # sampling
        predicted_token = pp.sample_next_token(self._ctx, self.gpt_params, self._last_n_tokens)

        predicted_tokens.append(predicted_token)
        # tokens come as raw undecoded bytes,
        # and we decode them, replacing those that can't be decoded.
        # I decode here for fear of breaking the stopword logic.
        token_str = pp.llama_token_to_str(self._ctx, predicted_token).decode('utf-8', "replace")
        if antiprompt is not None:
            if token_str == '\n':
                sequence_queue.append(token_str)
                continue
            if len(sequence_queue) != 0:
                if stop_word.startswith(''.join(sequence_queue).strip()):
                    sequence_queue.append(token_str)
                    if ''.join(sequence_queue).strip() == stop_word:
                        break
                    else:
                        continue
                else:
                    # consume sequence queue tokens
                    while len(sequence_queue) != 0:
                        yield sequence_queue.pop(0)
                    sequence_queue = []
        self._last_n_tokens.pop(0)
        self._last_n_tokens.append(predicted_token)
        if n_predict is not None:
            if n_remain == n_predict:
                break
            else:
                n_remain += 1
        yield token_str
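
A usage sketch for streaming generation. The model path is a placeholder, and the stop word and sampling values are arbitrary examples:

from pyllamacpp.model import Model

model = Model(model_path='path/to/ggml/model')   # placeholder path
for token in model.generate("Q: Name three colors.\nA:",
                            n_predict=64,        # stop after at most 64 new tokens
                            n_threads=8,
                            temp=0.7,
                            antiprompt="Q:"):    # stop if the model starts a new question
    print(token, end='', flush=True)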

cpp_generate

cpp_generate(
    prompt,
    n_predict=128,
    new_text_callback=None,
    n_threads=4,
    top_k=40,
    top_p=0.95,
    tfs_z=1.0,
    typical_p=1.0,
    temp=0.8,
    repeat_penalty=1.1,
    repeat_last_n=64,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    mirostat=0,
    mirostat_tau=5.0,
    mirostat_eta=0.1,
    n_batch=8,
    n_keep=0,
    interactive=False,
    antiprompt=[],
    instruct=False,
    verbose_prompt=False,
)

The generate function from llama.cpp

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `prompt` | `str` | the prompt | *required* |
| `n_predict` | `int` | number of tokens to generate | `128` |
| `new_text_callback` | `Callable[[bytes], None]` | a callback function called when new text is generated, default `None` | `None` |
| `n_threads` | `int` | The number of CPU threads | `4` |
| `top_k` | `int` | top-K sampling parameter, <= 0 to use vocab size | `40` |
| `top_p` | `float` | top-P sampling parameter, 1.0 = disabled | `0.95` |
| `tfs_z` | `float` | tail-free sampling parameter, 1.0 = disabled | `1.0` |
| `typical_p` | `float` | typical sampling parameter, 1.0 = disabled | `1.0` |
| `temp` | `float` | temperature, 1.0 = disabled | `0.8` |
| `repeat_penalty` | `float` | repeat penalty sampling parameter, 1.0 = disabled | `1.1` |
| `repeat_last_n` | `int` | last n tokens to penalize (0 = disable penalty, -1 = context size) | `64` |
| `frequency_penalty` | `float` | 0.0 = disabled | `0.0` |
| `presence_penalty` | `float` | 0.0 = disabled | `0.0` |
| `mirostat` | `int` | 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 | `0` |
| `mirostat_tau` | `float` | target entropy | `5.0` |
| `mirostat_eta` | `float` | learning rate | `0.1` |
| `n_batch` | `int` | GPT params `n_batch` | `8` |
| `n_keep` | `int` | GPT params `n_keep` | `0` |
| `interactive` | `bool` | interactive communication | `False` |
| `antiprompt` | `List` | list of anti prompts | `[]` |
| `instruct` | `bool` | Activate instruct mode | `False` |
| `verbose_prompt` | `bool` | verbose prompt | `False` |

Returns:

| Type | Description |
| --- | --- |
| `str` | the new generated text |

Source code in pyllamacpp/model.py
def cpp_generate(self, prompt: str,
                 n_predict: int = 128,
                 new_text_callback: Callable[[bytes], None] = None,
                 n_threads: int = 4,
                 top_k: int = 40,
                 top_p: float = 0.95,
                 tfs_z: float = 1.00,
                 typical_p: float = 1.00,
                 temp: float = 0.8,
                 repeat_penalty: float = 1.10,
                 repeat_last_n: int = 64,
                 frequency_penalty: float = 0.00,
                 presence_penalty: float = 0.00,
                 mirostat: int = 0,
                 mirostat_tau: float = 5.00,
                 mirostat_eta: float = 0.1,
                 n_batch: int = 8,
                 n_keep: int = 0,
                 interactive: bool = False,
                 antiprompt: List = [],
                 instruct: bool = False,
                 verbose_prompt: bool = False,
                 ) -> str:
    """
    The generate function from `llama.cpp`

    :param prompt: the prompt
    :param n_predict: number of tokens to generate
    :param new_text_callback: a callback function called when new text is generated, default `None`
    :param n_threads: The number of CPU threads
    :param top_k: top K sampling parameter, <= 0 to use vocab size
    :param top_p: top P sampling parameter, 1.0 = disabled
    :param tfs_z: tfs_z sampling parameter, 1.0 = disabled
    :param typical_p: typical_p sampling parameter, 1.0 = disabled
    :param temp: Temperature, 1.0 = disabled
    :param repeat_penalty: repeat penalty sampling parameter, 1.0 = disabled
    :param repeat_last_n: last n tokens to penalize (0 = disable penalty, -1 = context size)
    :param frequency_penalty: 0.0 = disabled
    :param presence_penalty: 0.0 = disabled
    :param mirostat: 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    :param mirostat_tau: target entropy
    :param mirostat_eta: learning rate
    :param n_batch: GPT params n_batch
    :param n_keep: GPT params n_keep
    :param interactive: interactive communication
    :param antiprompt: list of anti prompts
    :param instruct: Activate instruct mode
    :param verbose_prompt: verbose prompt
    :return: the new generated text
    """
    self.gpt_params.prompt = prompt
    self.gpt_params.n_predict = n_predict
    self.gpt_params.n_threads = n_threads
    self.gpt_params.top_k = top_k
    self.gpt_params.top_p = top_p
    self.gpt_params.tfs_z = tfs_z
    self.gpt_params.typical_p = typical_p
    self.gpt_params.temp = temp
    self.gpt_params.repeat_penalty = repeat_penalty
    self.gpt_params.repeat_last_n = repeat_last_n
    self.gpt_params.frequency_penalty = frequency_penalty
    self.gpt_params.presence_penalty = presence_penalty
    self.gpt_params.mirostat = mirostat
    self.gpt_params.mirostat_tau = mirostat_tau
    self.gpt_params.mirostat_eta = mirostat_eta
    self.gpt_params.n_batch = n_batch
    self.gpt_params.n_keep = n_keep
    self.gpt_params.interactive = interactive
    self.gpt_params.antiprompt = antiprompt
    self.gpt_params.instruct = instruct
    self.gpt_params.verbose_prompt = verbose_prompt

    # assign new_text_callback
    self.res = ""
    Model._new_text_callback = new_text_callback

    # run the prediction
    pp.llama_generate(self._ctx, self.gpt_params, self._call_new_text_callback)
    return self.res
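
A sketch of cpp_generate with a streaming callback. The model path is a placeholder; the callback is declared above as receiving bytes, so the example decodes defensively:

from pyllamacpp.model import Model

def on_new_text(chunk) -> None:
    # the signature above declares bytes; decode defensively in case a str is passed
    text = chunk.decode('utf-8', errors='replace') if isinstance(chunk, bytes) else chunk
    print(text, end='', flush=True)

model = Model(model_path='path/to/ggml/model')   # placeholder path
result = model.cpp_generate("Once upon a time",
                            n_predict=64,
                            new_text_callback=on_new_text)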

get_params staticmethod

get_params(params)

Returns a dict representation of the params

Returns:

| Type | Description |
| --- | --- |
| `dict` | params dict |

Source code in pyllamacpp/model.py
@staticmethod
def get_params(params) -> dict:
    """
    Returns a `dict` representation of the params
    :return: params dict
    """
    res = {}
    for param in dir(params):
        if param.startswith('__'):
            continue
        res[param] = getattr(params, param)
    return res
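
For example, to inspect the current context or sampling parameters as plain dictionaries (placeholder model path):

from pyllamacpp.model import Model

model = Model(model_path='path/to/ggml/model')   # placeholder path
print(Model.get_params(model.llama_params))      # dict of llama_context_params fields
print(Model.get_params(model.gpt_params))        # dict of gpt_params fields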

get_embeddings

get_embeddings()

Get the embeddings for the input

Returns:

| Type | Description |
| --- | --- |
| `List[float]` | the last embeddings vector from the context (shape: `[n_embd]`, 1-dimensional) |

Source code in pyllamacpp/model.py
def get_embeddings(self) -> List[float]:
    """
    Get the embeddings for the input

    :return: the last embeddings vector from the context (shape: [n_embd], 1-dimensional)
    """
    assert self.llama_params.embedding, "The model should be instantiated with embedding=True to get the embeddings"
    return pp.llama_get_embeddings(self._ctx)

get_prompt_embeddings

get_prompt_embeddings(prompt, n_threads=4, n_batch=512)

Get the embeddings of a specific prompt

⚠ this will reset the context

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `prompt` | `str` | the prompt :) | *required* |
| `n_threads` | `int` | The number of CPU threads | `4` |
| `n_batch` | `int` | batch size for prompt processing (must be >=32 to use BLAS) | `512` |
Source code in pyllamacpp/model.py
def get_prompt_embeddings(self,
                          prompt: str,
                          n_threads: int = 4,
                          n_batch: int = 512
                          ) -> List[float]:
    """
    Get the embeddings of a specific prompt

    :warning: this will reset the context

    :param prompt: the prompt :)
    :param n_threads: The number of CPU threads
    :param n_batch: batch size for prompt processing (must be >=32 to use BLAS)
    :return: the embeddings vector
    """
    assert self.llama_params.embedding, "The model should be instantiated with embedding=True to get the embeddings"
    self.reset()
    tokens = self.tokenize(prompt)
    for i in range(0, len(tokens), n_batch):
        n_eval = len(tokens) - i
        if n_eval > n_batch:
            n_eval = n_batch

        pp.llama_eval(self._ctx,
                      tokens[i:],
                      n_eval,
                      0,
                      n_threads)
    embeddings = self.get_embeddings()
    self.reset()
    return embeddings
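
A sketch of comparing two prompts with cosine similarity over their embeddings. The model path is a placeholder, and the model must be loaded with embedding=True:

import math
from pyllamacpp.model import Model

model = Model(model_path='path/to/ggml/model', embedding=True)   # placeholder path

a = model.get_prompt_embeddings("The cat sat on the mat")
b = model.get_prompt_embeddings("A feline rested on the rug")

dot = sum(x * y for x, y in zip(a, b))
norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
print(f"cosine similarity: {dot / norm:.3f}")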

pyllamacpp.utils

Helper functions

llama_to_ggml

llama_to_ggml(dir_model, ftype=1)

A helper function to convert LLaMA PyTorch models to ggml, the same exact script as convert-pth-to-ggml.py from the llama.cpp repository, copied here for convenience purposes only!

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `dir_model` | `str` | llama model directory | *required* |
| `ftype` | `int` | 0 or 1, 0 -> f32, 1 -> f16 | `1` |

Returns:

| Type | Description |
| --- | --- |
| `str` | ggml model path |

Source code in pyllamacpp/utils.py
def llama_to_ggml(dir_model: str, ftype: int = 1) -> str:
    """
    A helper function to convert LLaMA PyTorch models to ggml,
    the same exact script as `convert-pth-to-ggml.py` from the [llama.cpp](https://github.com/ggerganov/llama.cpp)
    repository, copied here for convenience purposes only!

    :param dir_model: llama model directory
    :param ftype: 0 or 1, 0-> f32, 1-> f16
    :return: ggml model path
    """
    # output in the same directory as the model
    assert ftype in [0, 1], f"ftype should be in [0,1], 0-> f32, 1-> f16"

    fname_hparams = str((Path(dir_model) / "params.json").absolute())
    fname_tokenizer = str((Path(dir_model).parent / "tokenizer.model").absolute())

    def get_n_parts(dim):
        if dim == 4096:
            return 1
        elif dim == 5120:
            return 2
        elif dim == 6656:
            return 4
        elif dim == 8192:
            return 8
        else:
            print("Invalid dim: " + str(dim))
            sys.exit(1)

    # possible data types
    #   ftype == 0 -> float32
    #   ftype == 1 -> float16
    #
    # map from ftype to string
    ftype_str = ["f32", "f16"]

    with open(fname_hparams, "r") as f:
        hparams = json.load(f)

    tokenizer = SentencePieceProcessor(fname_tokenizer)

    hparams.update({"vocab_size": tokenizer.vocab_size()})

    n_parts = get_n_parts(hparams["dim"])

    print(hparams)
    print('n_parts = ', n_parts)

    for p in range(n_parts):
        print('Processing part ', p)

        # fname_model = dir_model + "/consolidated.00.pth"

        fname_model = str(Path(dir_model) / f"consolidated.0{str(p)}.pth")
        fname_out = str(Path(dir_model) / f"ggml-model-{ftype_str[ftype]}.bin")
        if (p > 0):
            fname_out = str(Path(dir_model) / f"ggml-model-{ftype_str[ftype]}.bin.{str(p)}")

        model = torch.load(fname_model, map_location="cpu")

        fout = open(fname_out, "wb")

        fout.write(struct.pack("i", 0x67676d6c))  # magic: ggml in hex
        fout.write(struct.pack("i", hparams["vocab_size"]))
        fout.write(struct.pack("i", hparams["dim"]))
        fout.write(struct.pack("i", hparams["multiple_of"]))
        fout.write(struct.pack("i", hparams["n_heads"]))
        fout.write(struct.pack("i", hparams["n_layers"]))
        fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"]))  # rot (obsolete)
        fout.write(struct.pack("i", ftype))

        # Is this correct??
        for i in range(32000):
            if tokenizer.is_unknown(i):
                # "<unk>" token (translated as ??)
                text = " \u2047 ".encode("utf-8")
                fout.write(struct.pack("i", len(text)))
                fout.write(text)
            elif tokenizer.is_control(i):
                # "<s>"/"</s>" tokens
                fout.write(struct.pack("i", 0))
            elif tokenizer.is_byte(i):
                # "<U+XX>" tokens (which may be invalid UTF-8)
                piece = tokenizer.id_to_piece(i)
                if len(piece) != 6:
                    print("Invalid token: " + piece)
                    sys.exit(1)
                byte_value = int(piece[3:-1], 16)
                fout.write(struct.pack("i", 1))
                fout.write(struct.pack("B", byte_value))
            else:
                # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
                fout.write(struct.pack("i", len(text)))
                fout.write(text)

        for k, v in model.items():
            name = k
            shape = v.shape

            # skip layers.X.attention.inner_attention.rope.freqs
            if name[-5:] == "freqs":
                continue

            print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)

            # data = tf.train.load_variable(dir_model, name).squeeze()
            data = v.numpy().squeeze()
            n_dims = len(data.shape)

            # for efficiency - transpose some matrices
            # "model/h.*/attn/c_attn/w"
            # "model/h.*/attn/c_proj/w"
            # "model/h.*/mlp/c_fc/w"
            # "model/h.*/mlp/c_proj/w"
            # if name[-14:] == "/attn/c_attn/w" or \
            #   name[-14:] == "/attn/c_proj/w" or \
            #   name[-11:] == "/mlp/c_fc/w" or \
            #   name[-13:] == "/mlp/c_proj/w":
            #    print("  Transposing")
            #    data = data.transpose()

            dshape = data.shape

            # default type is fp16
            ftype_cur = 1
            if ftype == 0 or n_dims == 1:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0

            # header
            sname = name.encode('utf-8')
            fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
            for i in range(n_dims):
                fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
            fout.write(sname)

            # data
            data.tofile(fout)

        # I hope this deallocates the memory ..
        model = None

        fout.close()

        print("Done. Output file: " + fname_out + ", (part ", p, ")")
        print("")
        return fname_out

quantize

quantize(ggml_model_path, output_model_path=None, itype=2)

Quantizes the ggml model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `ggml_model_path` | `str` | path of the ggml model | *required* |
| `output_model_path` | `str` | output file path for the quantized model | `None` |
| `itype` | `int` | quantization type: 2 -> Q4_0, 3 -> Q4_1 | `2` |

Returns:

| Type | Description |
| --- | --- |
| `str` | quantized model path |

Source code in pyllamacpp/utils.py
def quantize(ggml_model_path: str, output_model_path: str = None, itype: int = 2) -> str:
    """
    Quantizes the ggml model.

    :param ggml_model_path: path of the ggml model
    :param output_model_path: output file path for the quantized model
    :param itype: quantization type: 2 -> Q4_0, 3 -> Q4_1
    :return: quantized model path
    """
    if output_model_path is None:
        output_model_path = ggml_model_path + f'-q4_{0 if itype == 2 else 1}.bin'
    logging.info("Quantization will start soon ... (This my take a while)")
    pp.llama_quantize(ggml_model_path, output_model_path, itype)
    logging.info(f"Quantized model is created successfully {output_model_path}")
    return output_model_path
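
An end-to-end sketch: convert a LLaMA PyTorch checkpoint directory to ggml, then quantize it. The directory path is a placeholder; as the source above expects, it should contain params.json and consolidated.*.pth, with tokenizer.model in its parent directory:

from pyllamacpp.utils import llama_to_ggml, quantize

ggml_path = llama_to_ggml('path/to/llama/7B', ftype=1)   # placeholder directory, f16 output
q4_path = quantize(ggml_path, itype=2)                   # Q4_0 quantization
print(q4_path)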

_pyllamacpp

PyLLaMACpp: Python binding for llama.cpp


LLaMAModel

LLaMAModel()

Bases: pybind11_builtins.pybind11_object

__init__(self: _pyllamacpp.LLaMAModel, arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.gpt_params, arg2: int) -> None

generate method descriptor

generate()

generate(self: _pyllamacpp.LLaMAModel, arg0: _pyllamacpp.gpt_params) -> int

setup method descriptor

setup()

setup(self: _pyllamacpp.LLaMAModel, arg0: _pyllamacpp.gpt_params) -> int

update_prompt method descriptor

update_prompt()

update_prompt(self: _pyllamacpp.LLaMAModel, arg0: _pyllamacpp.gpt_params, arg1: str) -> None

gpt_params

gpt_params()

Bases: pybind11_builtins.pybind11_object

__init__(self: _pyllamacpp.gpt_params) -> None

llama_context

llama_context(*args, **kwargs)

Bases: pybind11_builtins.pybind11_object

Initialize self. See help(type(self)) for accurate signature.

llama_context_params

llama_context_params()

Bases: pybind11_builtins.pybind11_object

__init__(self: _pyllamacpp.llama_context_params) -> None

llama_ftype

llama_ftype()

Bases: pybind11_builtins.pybind11_object

Members:

LLAMA_FTYPE_ALL_F32

LLAMA_FTYPE_MOSTLY_F16

LLAMA_FTYPE_MOSTLY_Q4_0

LLAMA_FTYPE_MOSTLY_Q4_1

LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16

LLAMA_FTYPE_MOSTLY_Q8_0

LLAMA_FTYPE_MOSTLY_Q5_0

LLAMA_FTYPE_MOSTLY_Q5_1

__init__(self: _pyllamacpp.llama_ftype, value: int) -> None

name property

name

name(self: handle) -> str

llama_token_data

llama_token_data()

Bases: pybind11_builtins.pybind11_object

__init__(self: _pyllamacpp.llama_token_data) -> None

llama_token_data_array

llama_token_data_array()

Bases: pybind11_builtins.pybind11_object

__init__(self: _pyllamacpp.llama_token_data_array) -> None

llama_apply_lora_from_file builtin

llama_apply_lora_from_file()

llama_apply_lora_from_file(arg0: _pyllamacpp.llama_context, arg1: str, arg2: str, arg3: int) -> None

llama_context_default_params builtin

llama_context_default_params()

llama_context_default_params() -> _pyllamacpp.llama_context_params

llama_eval builtin

llama_eval()

llama_eval(arg0: _pyllamacpp.llama_context, arg1: numpy.ndarray[numpy.int32], arg2: int, arg3: int, arg4: int) -> int

llama_free builtin

llama_free()

llama_free(arg0: _pyllamacpp.llama_context) -> None

llama_generate builtin

llama_generate()

llama_generate(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.gpt_params, arg2: function) -> int

llama_get_embeddings builtin

llama_get_embeddings()

llama_get_embeddings(arg0: _pyllamacpp.llama_context) -> List[float]

llama_get_kv_cache_token_count builtin

llama_get_kv_cache_token_count()

llama_get_kv_cache_token_count(arg0: _pyllamacpp.llama_context) -> int

llama_get_logits builtin

llama_get_logits()

llama_get_logits(arg0: _pyllamacpp.llama_context) -> float

llama_get_state_size builtin

llama_get_state_size()

llama_get_state_size(arg0: _pyllamacpp.llama_context) -> int

llama_init_from_file builtin

llama_init_from_file()

llama_init_from_file(arg0: str, arg1: _pyllamacpp.llama_context_params) -> _pyllamacpp.llama_context

llama_load_session_file builtin

llama_load_session_file()

llama_load_session_file(arg0: _pyllamacpp.llama_context, arg1: str, arg2: numpy.ndarray[numpy.int32], arg3: int, arg4: int) -> bool

llama_mlock_supported builtin

llama_mlock_supported()

llama_mlock_supported() -> bool

llama_mmap_supported builtin

llama_mmap_supported()

llama_mmap_supported() -> bool

llama_model_quantize builtin

llama_model_quantize()

llama_model_quantize(*args, **kwargs): Overloaded function.

  1. llama_model_quantize(arg0: str, arg1: str, arg2: _pyllamacpp.llama_ftype, arg3: int) -> int

  2. llama_model_quantize(arg0: str, arg1: str, arg2: _pyllamacpp.llama_ftype, arg3: int) -> int

llama_n_ctx builtin

llama_n_ctx()

llama_n_ctx(arg0: _pyllamacpp.llama_context) -> int

llama_n_embd builtin

llama_n_embd()

llama_n_embd(arg0: _pyllamacpp.llama_context) -> int

llama_n_vocab builtin

llama_n_vocab()

llama_n_vocab(arg0: _pyllamacpp.llama_context) -> int

llama_print_system_info builtin

llama_print_system_info()

llama_print_system_info() -> str

llama_print_timings builtin

llama_print_timings()

llama_print_timings(arg0: _pyllamacpp.llama_context) -> None

llama_reset_timings builtin

llama_reset_timings()

llama_reset_timings(arg0: _pyllamacpp.llama_context) -> None

llama_sample_frequency_and_presence_penalties builtin

llama_sample_frequency_and_presence_penalties()

llama_sample_frequency_and_presence_penalties(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array, arg2: numpy.ndarray[numpy.int32], arg3: int, arg4: float, arg5: float) -> None

llama_sample_repetition_penalty builtin

llama_sample_repetition_penalty()

llama_sample_repetition_penalty(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array, arg2: numpy.ndarray[numpy.int32], arg3: int, arg4: float) -> None

llama_sample_softmax builtin

llama_sample_softmax()

llama_sample_softmax(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array) -> None

llama_sample_tail_free builtin

llama_sample_tail_free()

llama_sample_tail_free(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array, arg2: float, arg3: int) -> None

llama_sample_temperature builtin

llama_sample_temperature()

llama_sample_temperature(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array, arg2: float) -> None

llama_sample_token builtin

llama_sample_token()

llama_sample_token(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array) -> int

llama_sample_token_greedy builtin

llama_sample_token_greedy()

llama_sample_token_greedy(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array) -> int

llama_sample_token_mirostat builtin

llama_sample_token_mirostat()

llama_sample_token_mirostat(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array, arg2: float, arg3: float, arg4: int, arg5: float) -> int

llama_sample_token_mirostat_v2 builtin

llama_sample_token_mirostat_v2()

llama_sample_token_mirostat_v2(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array, arg2: float, arg3: float, arg4: float) -> int

llama_sample_top_k builtin

llama_sample_top_k()

llama_sample_top_k(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array, arg2: int, arg3: int) -> None

llama_sample_top_p builtin

llama_sample_top_p()

llama_sample_top_p(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array, arg2: float, arg3: int) -> None

llama_sample_typical builtin

llama_sample_typical()

llama_sample_typical(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.llama_token_data_array, arg2: float, arg3: int) -> None

llama_save_session_file builtin

llama_save_session_file()

llama_save_session_file(arg0: _pyllamacpp.llama_context, arg1: str, arg2: numpy.ndarray[numpy.int32], arg3: int) -> bool

llama_set_rng_seed builtin

llama_set_rng_seed()

llama_set_rng_seed(arg0: _pyllamacpp.llama_context, arg1: int) -> None

llama_token_bos builtin

llama_token_bos()

llama_token_bos() -> int

llama_token_eos builtin

llama_token_eos()

llama_token_eos() -> int

llama_token_nl builtin

llama_token_nl()

llama_token_nl() -> int

llama_token_to_str builtin

llama_token_to_str()

llama_token_to_str(arg0: _pyllamacpp.llama_context, arg1: int) -> bytes

llama_tokenize builtin

llama_tokenize()

llama_tokenize(arg0: _pyllamacpp.llama_context, arg1: str, arg2: bool) -> List[int]

llama_tokens_to_str builtin

llama_tokens_to_str()

llama_tokens_to_str(arg0: _pyllamacpp.llama_context, arg1: numpy.ndarray[numpy.int32]) -> str

sample_next_token builtin

sample_next_token()

sample_next_token(arg0: _pyllamacpp.llama_context, arg1: _pyllamacpp.gpt_params, arg2: List[int]) -> int
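
To close, a sketch of driving the low-level bindings directly, using only the calls listed above; the high-level Model class is usually the better entry point, and the module is assumed to be importable under the name shown here. The model path is a placeholder:

import _pyllamacpp as pp

params = pp.llama_context_default_params()
params.n_ctx = 512
ctx = pp.llama_init_from_file('path/to/ggml/model', params)   # placeholder path

tokens = pp.llama_tokenize(ctx, "Hello", True)    # tokenize with BOS
pp.llama_eval(ctx, tokens, len(tokens), 0, 4)     # evaluate the prompt, n_past=0, 4 threads

# keep a recent-token window for the repeat penalty, as Model.generate does above
last_n = [0] * pp.llama_n_ctx(ctx)
for tok in tokens:
    last_n.pop(0)
    last_n.append(tok)

gpt_params = pp.gpt_params()
next_token = pp.sample_next_token(ctx, gpt_params, last_n)
print(pp.llama_token_to_str(ctx, next_token).decode('utf-8', 'replace'))

pp.llama_free(ctx)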