diff --git a/CHANGELOG.md b/CHANGELOG.md index 16954eb88..d7153ca76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,43 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.18] + +- feat: Update llama.cpp to ggerganov/llama.cpp@3bcc99099 (b7868) +- feat: GLM 4.7 Flash support via upstream CUDA/Flash Attention optimizations +- feat: Qwen3 Next model support +- feat: Self-speculative decoding support (no draft model required) +- feat: Add `use_direct_io` field to `llama_model_params` +- feat: Add `llama_model_is_hybrid()` for hybrid models (Jamba, Granite) +- feat: Add `llama_max_tensor_buft_overrides()` +- feat: Add adapter metadata functions (`llama_adapter_meta_*`) +- feat: Add aLoRA invocation token functions (`llama_adapter_get_alora_*`) +- feat: Add `llama_memory_breakdown_print()` for debugging +- feat: Add `llama_log_get()` to retrieve current log callback +- feat: Add `llama_sampler_init_adaptive_p()` (commented, requires rebuild) +- fix: Map `flash_attn=False` to `DISABLED` (was incorrectly `AUTO`) +- NOTE: `llama_adapter_lora_free` is now deprecated (adapters freed with model) + +## [0.3.17] + +- feat: Update llama.cpp to ggerganov/llama.cpp@95ea9e086 (b7652) +- feat: Add `flash_attn_type` parameter to `Llama()` for forward compatibility +- feat: Add `LLAMA_FLASH_ATTN_TYPE_*` enum (AUTO, DISABLED, ENABLED) +- feat: Add `LLAMA_PARAMS_FIT_STATUS_*` enum +- feat: Add `LLAMA_MODEL_META_KEY_*` enum constants +- feat: Add `LLAMA_ROPE_TYPE_IMROPE` constant +- feat: Add `no_host` and `no_alloc` fields to `llama_model_params` +- feat: Add `flash_attn_type`, `samplers`, `n_samplers` fields to `llama_context_params` +- feat: Add `llama_n_ctx_seq`, `llama_model_n_embd_inp`, `llama_model_n_embd_out` functions +- fix: Replace deprecated `llama_sampler_init_softmax` with `llama_sampler_init_temp(1.0)` in LlamaSampler.add_softmax() +- fix: Update embed() to use wrapper methods instead of direct C API calls +- fix: Add `LLAMA_INSTALL_VERSION` fallback in CMakeLists.txt for mtmd build +- removed: `llama_get_kv_self` binding (use `llama_get_memory` wrapper instead) +- removed: `llama_kv_self_*` bindings (11 functions, use `llama_memory_*` wrappers instead) +- removed: `llama_sampler_init_softmax` binding +- BREAKING (C API): `flash_attn` bool replaced with `flash_attn_type` enum in context params +- NOTE: Python API backward compatible - both `flash_attn` bool and `flash_attn_type` enum work + ## [0.3.16] - feat: Update llama.cpp to ggerganov/llama.cpp@4227c9be4268ac844921b90f31595f81236bd317 @@ -105,7 +142,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - fix: Fix memory allocation of ndarray in by @xu-song in #1704 - fix: Use system message in og qwen format by @abetlen in 98eb092d3c6e7c142c4ba2faaca6c091718abbb3 - ## [0.2.90] - feat: Update llama.cpp to ggerganov/llama.cpp@1d1ccce67613674c75c9c7e3fa4c1e24e428ba48 @@ -120,7 +156,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.2.88] - feat: Update llama.cpp to ggerganov/llama.cpp@fc4ca27b25464a11b3b86c9dbb5b6ed6065965c2 -- fix: only print 'cache saved' in verbose mode by @lsorber in #1668 +- fix: only print 'cache saved' in verbose mode by @lsorber in #1668 - fix: Added back from_file method to LlamaGrammar by @ExtReMLapin in #1673 - fix: grammar prints on each call by @abetlen in 0998ea0deea076a547d54bd598d6b413b588ee2b - feat: Enable recursive search of HFFS.ls when using from_pretrained by 
@benHeidabetlen in #1656 diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b06d98b3..d027a14fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,6 +154,10 @@ if (LLAMA_BUILD) endif() # Building llava + # Set LLAMA_INSTALL_VERSION if not already defined (needed by mtmd CMakeLists.txt) + if(NOT DEFINED LLAMA_INSTALL_VERSION) + set(LLAMA_INSTALL_VERSION "0.0.0" CACHE STRING "llama.cpp install version" FORCE) + endif() add_subdirectory(vendor/llama.cpp/tools/mtmd) if (WIN32) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index c1dde7046..bdaefb9e0 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.16" +__version__ = "0.3.18" diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b5175a7f2..780fcd50f 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -355,7 +355,9 @@ def get_embeddings_seq(self, seq_id: int): # Sampling functions - deprecated, use LlamaSampler instead def set_rng_seed(self, seed: int): - raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "set_rng_seed is deprecated, use LlamaSampler instead" + ) def sample_repetition_penalties( self, @@ -366,30 +368,44 @@ def sample_repetition_penalties( penalty_freq: float, penalty_present: float, ): - raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_repetition_penalties is deprecated, use LlamaSampler instead" + ) def sample_softmax(self, candidates: "_LlamaTokenDataArray"): - raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_softmax is deprecated, use LlamaSampler instead" + ) def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): - raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_top_k is deprecated, use LlamaSampler instead" + ) def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_top_p is deprecated, use LlamaSampler instead" + ) def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_min_p is deprecated, use LlamaSampler instead" + ) def sample_typical( self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int ): - raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_typical is deprecated, use LlamaSampler instead" + ) def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead") def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): - raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_grammar is deprecated, use LlamaSampler instead" + ) def sample_token_mirostat( self, @@ -399,7 +415,9 @@ def sample_token_mirostat( m: int, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead") + raise NotImplementedError( + 
"sample_token_mirostat is deprecated, use LlamaSampler instead" + ) def sample_token_mirostat_v2( self, @@ -408,17 +426,25 @@ def sample_token_mirostat_v2( eta: float, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_mirostat_v2 is deprecated, use LlamaSampler instead" + ) def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_greedy is deprecated, use LlamaSampler instead" + ) def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token is deprecated, use LlamaSampler instead" + ) # Grammar def grammar_accept_token(self, grammar: LlamaGrammar, token: int): - raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "grammar_accept_token is deprecated, use LlamaSampler instead" + ) def reset_timings(self): llama_cpp.llama_perf_context_reset(self.ctx) @@ -602,16 +628,16 @@ def sample( logits_array: Optional[npt.NDArray[np.single]] = None, ): # This method is deprecated in favor of using LlamaSampler directly - raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "LlamaSamplingContext.sample is deprecated, use LlamaSampler instead" + ) def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): self.prev.append(id) class CustomSampler: - def __init__( - self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] - ): + def __init__(self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]): self.apply_func = apply_func def apply_wrapper( @@ -673,7 +699,7 @@ def add_dist(self, seed: int): llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_softmax(self): - sampler = llama_cpp.llama_sampler_init_softmax() + sampler = llama_cpp.llama_sampler_init_temp(1.0) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_top_k(self, k: int): @@ -723,20 +749,20 @@ def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_grammar_lazy_patterns( - self, - model: LlamaModel, + self, + model: LlamaModel, grammar: LlamaGrammar, trigger_patterns: List[str], - trigger_tokens: List[int] + trigger_tokens: List[int], ): # Convert patterns to C array pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))() for i, pattern in enumerate(trigger_patterns): pattern_ptrs[i] = pattern.encode("utf-8") - + # Convert tokens to C array token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens) - + sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns( model.vocab, grammar._grammar.encode("utf-8"), @@ -744,7 +770,7 @@ def add_grammar_lazy_patterns( pattern_ptrs, len(trigger_patterns), token_array, - len(trigger_tokens) + len(trigger_tokens), ) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) @@ -771,13 +797,13 @@ def add_dry( dry_base: float, dry_allowed_length: int, dry_penalty_last_n: int, - seq_breakers: List[str] + seq_breakers: List[str], ): # Convert seq_breakers to C array breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))() for i, breaker in enumerate(seq_breakers): breaker_ptrs[i] = 
breaker.encode("utf-8") - + sampler = llama_cpp.llama_sampler_init_dry( model.vocab, n_ctx_train, @@ -786,25 +812,19 @@ def add_dry( dry_allowed_length, dry_penalty_last_n, breaker_ptrs, - len(seq_breakers) + len(seq_breakers), ) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_logit_bias( - self, - n_vocab: int, - logit_bias: Dict[int, float] - ): + def add_logit_bias(self, n_vocab: int, logit_bias: Dict[int, float]): # Convert logit_bias dict to C array bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))() for i, (token, bias) in enumerate(logit_bias.items()): bias_array[i].token = token bias_array[i].bias = bias - + sampler = llama_cpp.llama_sampler_init_logit_bias( - n_vocab, - len(logit_bias), - bias_array + n_vocab, len(logit_bias), bias_array ) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) @@ -838,15 +858,17 @@ def reset(self): def clone(self): # NOTE: Custom samplers cannot be cloned due to Python callback limitations if self.custom_samplers: - raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers") - + raise NotImplementedError( + "Cannot clone LlamaSampler that contains custom samplers" + ) + cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler) # Create a new wrapper around the cloned sampler new_sampler = LlamaSampler.__new__(LlamaSampler) new_sampler.sampler = cloned_sampler new_sampler.custom_samplers = [] new_sampler._exit_stack = ExitStack() - + def free_sampler(): if new_sampler.sampler is not None: llama_cpp.llama_sampler_free(new_sampler.sampler) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 71d94ebd8..b82ff0ed3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -92,6 +92,7 @@ def __init__( embedding: bool = False, offload_kqv: bool = True, flash_attn: bool = False, + flash_attn_type: Optional[int] = None, op_offload: Optional[bool] = None, swa_full: Optional[bool] = None, # Sampling Params @@ -341,7 +342,14 @@ def __init__( self._logits_all = logits_all if draft_model is None else True self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv - self.context_params.flash_attn = flash_attn + if flash_attn_type is not None: + self.context_params.flash_attn_type = flash_attn_type + else: + self.context_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + if flash_attn + else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED + ) if op_offload is not None: self.context_params.op_offload = op_offload @@ -934,7 +942,8 @@ def generate( sample_idx += 1 if stopping_criteria is not None and stopping_criteria( - self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :] + self._input_ids[:sample_idx], + self._scores[sample_idx - self.n_tokens, :], ): return tokens_or_none = yield token @@ -1041,7 +1050,7 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + self._ctx.kv_cache_clear() self._ctx.decode(self._batch) self._batch.reset() @@ -1112,7 +1121,7 @@ def decode_batch(seq_sizes: List[int]): output = data[0] if isinstance(input, str) else data - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + self._ctx.kv_cache_clear() self.reset() if return_count: @@ -1157,9 +1166,9 @@ def _create_completion( bos_token_id: int = self.token_bos() cls_token_id: int = self._model.token_cls() sep_token_id: int = self._model.token_sep() - prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix - 
middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix - suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix + prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix + middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix + suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix add_space_prefix: bool = ( self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" ) @@ -1315,7 +1324,7 @@ def logit_bias_processor( if seed is not None: self.set_seed(seed) else: - self.set_seed(random.Random(self._seed).randint(0, 2 ** 32)) + self.set_seed(random.Random(self._seed).randint(0, 2**32)) finish_reason = "length" multibyte_fix = 0 @@ -2056,7 +2065,10 @@ def create_chat_completion_openai_v1( stream = kwargs.get("stream", False) # type: ignore assert isinstance(stream, bool) if stream: - return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore + return ( + ChatCompletionChunk(**chunk) + for chunk in self.create_chat_completion(*args, **kwargs) + ) # type: ignore else: return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore except ImportError: @@ -2096,7 +2108,9 @@ def __getstate__(self): logits_all=self._logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, - flash_attn=self.context_params.flash_attn, + flash_attn=self.context_params.flash_attn_type + == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED, + flash_attn_type=self.context_params.flash_attn_type, op_offload=self.context_params.op_offload, swa_full=self.context_params.swa_full, # Sampling Params @@ -2318,7 +2332,11 @@ def from_pretrained( if additional_files: for additonal_file_name in additional_files: # find the additional shard file: - matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)] + matching_additional_files = [ + file + for file in file_list + if fnmatch.fnmatch(file, additonal_file_name) + ] if len(matching_additional_files) == 0: raise ValueError( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index b63c1f561..fbf83c136 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -286,11 +286,15 @@ def _convert_text_completion_logprobs_to_chat( } for top_token, top_logprob in top_logprobs.items() ], - } for (token, logprob, top_logprobs) in zip(logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"]) + } + for (token, logprob, top_logprobs) in zip( + logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"] + ) ], "refusal": None, } + def _convert_text_completion_to_chat( completion: llama_types.Completion, ) -> llama_types.ChatCompletion: @@ -307,7 +311,9 @@ def _convert_text_completion_to_chat( "role": "assistant", "content": completion["choices"][0]["text"], }, - "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), "finish_reason": completion["choices"][0]["finish_reason"], } ], @@ -351,7 +357,9 @@ def _convert_text_completion_chunks_to_chat( if chunk["choices"][0]["finish_reason"] is None else {} ), - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "finish_reason": chunk["choices"][0]["finish_reason"], } ], @@ -368,7 +376,9 @@ def 
_convert_completion_to_chat( llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk] ]: if stream: - chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = ( + completion_or_chunks # type: ignore + ) return _convert_text_completion_chunks_to_chat(chunks) else: completion: llama_types.Completion = completion_or_chunks # type: ignore @@ -414,7 +424,9 @@ def _convert_completion_to_chat_function( } ], }, - "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), "finish_reason": "tool_calls", } ], @@ -422,7 +434,9 @@ def _convert_completion_to_chat_function( } return chat_completion else: - chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = ( + completion_or_chunks # type: ignore + ) def _stream_response_to_function_stream( chunks: Iterator[llama_types.CreateCompletionStreamResponse], @@ -467,7 +481,9 @@ def _stream_response_to_function_stream( { "index": 0, "finish_reason": None, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": None, "content": None, @@ -504,7 +520,9 @@ def _stream_response_to_function_stream( { "index": 0, "finish_reason": None, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": None, "content": None, @@ -702,7 +720,7 @@ def chat_completion_handler( def hf_autotokenizer_to_chat_formatter( - pretrained_model_name_or_path: Union[str, os.PathLike[str]] + pretrained_model_name_or_path: Union[str, os.PathLike[str]], ) -> ChatFormatter: # https://huggingface.co/docs/transformers/main/chat_templating # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format @@ -727,7 +745,7 @@ def format_autotokenizer( def hf_autotokenizer_to_chat_completion_handler( - pretrained_model_name_or_path: Union[str, os.PathLike[str]] + pretrained_model_name_or_path: Union[str, os.PathLike[str]], ) -> LlamaChatCompletionHandler: chat_formatter = hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path) return chat_formatter_to_chat_completion_handler(chat_formatter) @@ -1552,9 +1570,9 @@ def prepare_messages_for_inference( message["name"] = f"functions.{message['name']}" # Function call requests by assistant if "function_call" in message: - message["function_call"][ - "name" - ] = f"functions.{message['function_call']['name']}" + message["function_call"]["name"] = ( + f"functions.{message['function_call']['name']}" + ) all_messages.append(message) all_messages.append( @@ -1632,7 +1650,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): logits_processor=logits_processor, grammar=grammar, ) - return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore + return _convert_completion_to_chat( + completion_or_completion_chunks, stream=stream + ) # type: ignore if function_call is None or ( isinstance(function_call, str) and function_call == "auto" @@ -1748,7 +1768,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): } ], }, 
- "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), "finish_reason": "tool_calls", } ], @@ -1789,9 +1811,9 @@ def functionary_v1_v2_chat_handler( SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" tokenizer = llama.tokenizer_ - assert hasattr( - tokenizer, "hf_tokenizer" - ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" + assert hasattr(tokenizer, "hf_tokenizer"), ( + "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" + ) from transformers import AutoTokenizer if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens: @@ -1941,9 +1963,9 @@ def prepare_messages_for_inference( message["name"] = f"functions.{message['name']}" # Function call requests by assistant if "function_call" in message: - message["function_call"][ - "name" - ] = f"functions.{message['function_call']['name']}" + message["function_call"]["name"] = ( + f"functions.{message['function_call']['name']}" + ) all_messages.append(message) if version == "v1": @@ -2005,7 +2027,9 @@ def prepare_messages_for_inference( completion_or_completion_chunks["choices"][0]["text"] = ( completion_or_completion_chunks["choices"][0]["text"].lstrip() ) - return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore + return _convert_completion_to_chat( + completion_or_completion_chunks, stream=stream + ) # type: ignore def get_grammar(function_call): function_body = None @@ -2160,7 +2184,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": None, "content": None, @@ -2262,7 +2288,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": "assistant", "content": None, @@ -2300,7 +2328,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": "assistant", "content": buffer.pop(0), @@ -2323,7 +2353,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": "assistant", "content": ( @@ -2409,7 +2441,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), 
"delta": { "role": None, "content": None, @@ -2643,7 +2677,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), "message": { "role": "assistant", "content": None if content == "" else content, @@ -2716,20 +2752,20 @@ def _init_mtmd_context(self, llama_model: llama.Llama): with suppress_stdout_stderr(disable=self.verbose): # Get default parameters ctx_params = self._mtmd_cpp.mtmd_context_params_default() - ctx_params.use_gpu = True # TODO: Make this configurable + ctx_params.use_gpu = True # TODO: Make this configurable ctx_params.print_timings = self.verbose ctx_params.n_threads = llama_model.n_threads ctx_params.verbosity = 2 if self.verbose else 0 # GGML_LOG_LEVEL_INFO = 2 # Initialize mtmd context self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.clip_model_path.encode(), - llama_model.model, - ctx_params + self.clip_model_path.encode(), llama_model.model, ctx_params ) if self.mtmd_ctx is None: - raise ValueError(f"Failed to load mtmd context from: {self.clip_model_path}") + raise ValueError( + f"Failed to load mtmd context from: {self.clip_model_path}" + ) # Check if vision is supported if not self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx): @@ -2756,12 +2792,12 @@ def _create_bitmap_from_bytes(self, image_bytes: bytes): bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( self.mtmd_ctx, (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)), - len(image_bytes) + len(image_bytes), ) - + if bitmap is None: raise ValueError("Failed to create bitmap from image bytes") - + return bitmap def __call__( @@ -2820,10 +2856,10 @@ def __call__( trim_blocks=True, lstrip_blocks=True, ).from_string(self.CHAT_FORMAT) - + # Get the default media marker - media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') - + media_marker = self._mtmd_cpp.mtmd_default_marker().decode("utf-8") + # Replace image URLs with media markers in the template text = template.render( messages=messages, @@ -2831,7 +2867,7 @@ def __call__( eos_token=llama.detokenize([llama.token_eos()]), bos_token=llama.detokenize([llama.token_bos()]), ) - + # Replace image URLs in text with media markers for image_url in image_urls: text = text.replace(image_url, media_marker) @@ -2851,7 +2887,7 @@ def __call__( # Create input text structure input_text = self._mtmd_cpp.mtmd_input_text() - input_text.text = text.encode('utf-8') + input_text.text = text.encode("utf-8") input_text.add_special = True input_text.parse_special = True @@ -2862,13 +2898,15 @@ def __call__( try: # Tokenize text and images together - bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) + bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))( + *bitmaps + ) result = self._mtmd_cpp.mtmd_tokenize( self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, - len(bitmaps) + len(bitmaps), ) if result != 0: @@ -2881,40 +2919,45 @@ def __call__( # Process each chunk n_past = llama_cpp.llama_pos(0) n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) - + for i in range(n_chunks): chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) if chunk is None: continue chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) - + if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT: # Handle text chunk n_tokens_out = ctypes.c_size_t() tokens_ptr = 
self._mtmd_cpp.mtmd_input_chunk_get_tokens_text( chunk, ctypes.byref(n_tokens_out) ) - + if tokens_ptr and n_tokens_out.value > 0: # Convert ctypes array to Python list tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - + if llama.n_tokens + len(tokens) > llama.n_ctx(): raise ValueError( f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" ) llama.eval(tokens) - - elif chunk_type in [self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO]: + + elif chunk_type in [ + self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, + self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO, + ]: # Handle image/audio chunk using helper - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens( + chunk + ) + if llama.n_tokens + chunk_n_tokens > llama.n_ctx(): raise ValueError( f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}" ) - + new_n_past = llama_cpp.llama_pos(0) result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( self.mtmd_ctx, @@ -2924,12 +2967,14 @@ def __call__( llama_cpp.llama_seq_id(0), llama.n_batch, False, # logits_last - ctypes.byref(new_n_past) + ctypes.byref(new_n_past), ) - + if result != 0: - raise ValueError(f"Failed to evaluate chunk: error code {result}") - + raise ValueError( + f"Failed to evaluate chunk: error code {result}" + ) + # Update llama's token count llama.n_tokens = new_n_past.value @@ -3019,7 +3064,7 @@ def __call__( grammar=grammar, logit_bias=logit_bias, ) - + if tool is not None: tool_name = tool["function"]["name"] return _convert_completion_to_chat_function( @@ -3032,10 +3077,12 @@ def _load_image(image_url: str) -> bytes: # TODO: Add Pillow support for other image formats beyond (jpg, png) if image_url.startswith("data:"): import base64 + image_bytes = base64.b64decode(image_url.split(",")[1]) return image_bytes else: import urllib.request + with urllib.request.urlopen(image_url) as f: image_bytes = f.read() return image_bytes @@ -3062,6 +3109,7 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): @staticmethod def split_text_on_image_urls(text: str, image_urls: List[str]): """This method is no longer used in the new implementation.""" + def find_first(s: str, substrs: List[str]): for i, substr in enumerate(substrs): pos = s.find(substr) @@ -3443,7 +3491,6 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): "{% endif %}" "{% endif %}" "{% endfor %}" - "{% for content in message['content'] %}" "{% if content.type == 'text' %}" "{{ content.text }}" @@ -3465,8 +3512,8 @@ class Qwen25VLChatHandler(Llava15ChatHandler): DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." 
CHAT_FORMAT = ( - #"{% set image_count = namespace(value=0) %}" - #"{% set video_count = namespace(value=0) %}" + # "{% set image_count = namespace(value=0) %}" + # "{% set video_count = namespace(value=0) %}" "{% for message in messages %}" "{% if loop.first and message['role'] != 'system' %}" "<|im_start|>system\n" @@ -3483,7 +3530,7 @@ class Qwen25VLChatHandler(Llava15ChatHandler): "{% else %}" "{{ content.image_url.url }}" "{% endif %}" - #"{% set image_count.value = image_count.value + 1 %}" + # "{% set image_count.value = image_count.value + 1 %}" "{% elif content['type'] == 'text' %}" "{{ content['text'] }}" "{% endif %}" @@ -3495,25 +3542,28 @@ class Qwen25VLChatHandler(Llava15ChatHandler): ) def __call__(self, **kwargs): - llama = kwargs['llama'] + llama = kwargs["llama"] # Clear state for multiple runs llama.reset() llama._ctx.kv_cache_clear() llama.n_tokens = 0 - if hasattr(llama, 'input_ids'): + if hasattr(llama, "input_ids"): llama.input_ids.fill(0) # Clear any handler state - if hasattr(self, '_last_image_embed'): + if hasattr(self, "_last_image_embed"): self._last_image_embed = None self._last_image_hash = None if self.verbose: - messages = kwargs.get('messages', []) + messages = kwargs.get("messages", []) image_count = len(self.get_image_urls(messages)) - print(f"Minimal - Cleared state, processing {image_count} images", file=sys.stderr) + print( + f"Minimal - Cleared state, processing {image_count} images", + file=sys.stderr, + ) # Use parent implementation return super().__call__(**kwargs) @@ -3534,15 +3584,15 @@ class Gemma3ChatHandler(Llava15ChatHandler): "{% endif %}" "{% set loop_messages = messages[1:] %}" "{% else %}" - "{% set first_user_prefix = \"\" %}" + '{% set first_user_prefix = "" %}' "{% set loop_messages = messages %}" "{% endif %}" "{% for message in loop_messages %}" "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" + '{{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}' "{% endif %}" "{% if (message['role'] == 'assistant') %}" - "{% set role = \"model\" %}" + '{% set role = "model" %}' "{% else %}" "{% set role = message['role'] %}" "{% endif %}" @@ -3560,7 +3610,7 @@ class Gemma3ChatHandler(Llava15ChatHandler): "{% endif %}" "{% endfor %}" "{% else %}" - "{{ raise_exception(\"Invalid content type\") }}" + '{{ raise_exception("Invalid content type") }}' "{% endif %}" "{{ '\n' }}" "{% endfor %}" @@ -3687,7 +3737,9 @@ def chatml_function_calling( stop = ( [stop, "<|im_end|>"] if isinstance(stop, str) - else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + else stop + ["<|im_end|>"] + if stop + else ["<|im_end|>"] ) # Case 1: No tool choice by user @@ -3789,7 +3841,7 @@ def chatml_function_calling( # Case 3: Automatic tool choice assert isinstance(tool_choice, str) and tool_choice == "auto" function_names = " | ".join( - [f'''"functions.{tool['function']['name']}:"''' for tool in tools] + [f'''"functions.{tool["function"]["name"]}:"''' for tool in tools] ) initial_gbnf_tool_grammar = ( """root ::= functions | "message:"\n""" @@ -3965,7 +4017,9 @@ def chatml_function_calling( { "finish_reason": "tool_calls", "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), "message": { "role": "assistant", "content": None, diff --git a/llama_cpp/llama_cpp.py 
b/llama_cpp/llama_cpp.py index 711d42a6a..d5bdca2e5 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -33,7 +33,11 @@ # Specify the base name of the shared library to load _lib_base_name = "llama" _override_base_path = os.environ.get("LLAMA_CPP_LIB_PATH") -_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _override_base_path is None else pathlib.Path(_override_base_path) +_base_path = ( + pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" + if _override_base_path is None + else pathlib.Path(_override_base_path) +) # Load the library _lib = load_shared_library(_lib_base_name, _base_path) @@ -294,6 +298,7 @@ LLAMA_ROPE_TYPE_NORM = 0 LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 +LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 16 LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 @@ -462,16 +467,31 @@ LLAMA_ATTENTION_TYPE_CAUSAL = 0 LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1 +LLAMA_FLASH_ATTN_TYPE_AUTO = -1 +LLAMA_FLASH_ATTN_TYPE_DISABLED = 0 +LLAMA_FLASH_ATTN_TYPE_ENABLED = 1 -# enum llama_split_mode { -# LLAMA_SPLIT_MODE_NONE = 0, // single GPU -# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs -# LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported -# }; LLAMA_SPLIT_MODE_NONE = 0 LLAMA_SPLIT_MODE_LAYER = 1 LLAMA_SPLIT_MODE_ROW = 2 +LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE = 0 +LLAMA_MODEL_META_KEY_SAMPLING_TOP_K = 1 +LLAMA_MODEL_META_KEY_SAMPLING_TOP_P = 2 +LLAMA_MODEL_META_KEY_SAMPLING_MIN_P = 3 +LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY = 4 +LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD = 5 +LLAMA_MODEL_META_KEY_SAMPLING_TEMP = 6 +LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N = 7 +LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT = 8 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT = 9 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU = 10 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA = 11 + +LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0 +LLAMA_PARAMS_FIT_STATUS_FAILURE = 1 +LLAMA_PARAMS_FIT_STATUS_ERROR = 2 + # typedef struct llama_token_data { # llama_token id; // token id @@ -559,6 +579,7 @@ class llama_token_data_array(ctypes.Structure): # typedef struct llama_batch { # int32_t n_tokens; + # llama_token * token; # float * embd; # llama_pos * pos; @@ -688,6 +709,7 @@ class llama_model_kv_override(ctypes.Structure): # // override key-value pairs of the model meta data # const struct llama_model_kv_override * kv_overrides; + # // Keep the booleans together to avoid misalignment during copy-by-value. 
# bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible @@ -716,7 +738,9 @@ class llama_model_params(ctypes.Structure): if TYPE_CHECKING: devices: CtypesArray[ctypes.c_void_p] # NOTE: unused - tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] # NOTE: unused + tensor_buft_overrides: CtypesArray[ + llama_model_tensor_buft_override + ] # NOTE: unused n_gpu_layers: int split_mode: int main_gpu: int @@ -726,13 +750,16 @@ class llama_model_params(ctypes.Structure): kv_overrides: CtypesArray[llama_model_kv_override] vocab_only: bool use_mmap: bool + use_direct_io: bool use_mlock: bool check_tensors: bool use_extra_bufts: bool + no_host: bool + no_alloc: bool _fields_ = [ - ("devices", ctypes.c_void_p), # NOTE: unnused - ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused + ("devices", ctypes.c_void_p), # NOTE: unnused + ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused ("n_gpu_layers", ctypes.c_int32), ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), @@ -742,9 +769,12 @@ class llama_model_params(ctypes.Structure): ("kv_overrides", ctypes.POINTER(llama_model_kv_override)), ("vocab_only", ctypes.c_bool), ("use_mmap", ctypes.c_bool), + ("use_direct_io", ctypes.c_bool), ("use_mlock", ctypes.c_bool), ("check_tensors", ctypes.c_bool), ("use_extra_bufts", ctypes.c_bool), + ("no_host", ctypes.c_bool), + ("no_alloc", ctypes.c_bool), ] @@ -784,6 +814,7 @@ class llama_model_params(ctypes.Structure): # ggml_abort_callback abort_callback; # void * abort_callback_data; + # // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. # bool embeddings; // if true, extract embeddings (together with logits) # bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU @@ -859,11 +890,13 @@ class llama_context_params(ctypes.Structure): abort_callback_data: ctypes.c_void_p embeddings: bool offload_kqv: bool - flash_attn: bool no_perf: bool op_offload: bool swa_full: bool kv_unified: bool + flash_attn_type: int + samplers: ctypes.c_void_p + n_samplers: int _fields_ = [ ("n_ctx", ctypes.c_uint32), @@ -875,6 +908,7 @@ class llama_context_params(ctypes.Structure): ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), + ("flash_attn_type", ctypes.c_int), ("rope_freq_base", ctypes.c_float), ("rope_freq_scale", ctypes.c_float), ("yarn_ext_factor", ctypes.c_float), @@ -891,11 +925,12 @@ class llama_context_params(ctypes.Structure): ("abort_callback_data", ctypes.c_void_p), ("embeddings", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), - ("flash_attn", ctypes.c_bool), ("no_perf", ctypes.c_bool), ("op_offload", ctypes.c_bool), ("swa_full", ctypes.c_bool), ("kv_unified", ctypes.c_bool), + ("samplers", ctypes.c_void_p), + ("n_samplers", ctypes.c_size_t), ] @@ -1137,8 +1172,7 @@ def llama_backend_free(): [ctypes.c_int], None, ) -def llama_numa_init(numa: int, /): - ... +def llama_numa_init(numa: int, /): ... # // Optional: an auto threadpool gets created in ggml if not passed explicitly @@ -1164,8 +1198,7 @@ def llama_numa_init(numa: int, /): ) def llama_load_model_from_file( path_model: bytes, params: llama_model_params, / -) -> Optional[llama_model_p]: - ... +) -> Optional[llama_model_p]: ... # // Load the model from a file @@ -1230,8 +1263,7 @@ def llama_model_save_to_file(model: llama_model_p, path_model: bytes, /): [llama_model_p_ctypes], None, ) -def llama_free_model(model: llama_model_p, /): - ... 
+def llama_free_model(model: llama_model_p, /): ... # LLAMA_API void llama_model_free(struct llama_model * model); @@ -1240,8 +1272,7 @@ def llama_free_model(model: llama_model_p, /): [llama_model_p_ctypes], None, ) -def llama_model_free(model: llama_model_p, /): - ... +def llama_model_free(model: llama_model_p, /): ... # LLAMA_API struct llama_context * llama_init_from_model( @@ -1254,8 +1285,7 @@ def llama_model_free(model: llama_model_p, /): ) def llama_init_from_model( model: llama_model_p, params: llama_context_params, / -) -> Optional[llama_context_p]: - ... +) -> Optional[llama_context_p]: ... # DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model( @@ -1269,8 +1299,7 @@ def llama_init_from_model( ) def llama_new_context_with_model( model: llama_model_p, params: llama_context_params, / -) -> Optional[llama_context_p]: - ... +) -> Optional[llama_context_p]: ... # // Frees all allocated memory @@ -1291,104 +1320,96 @@ def llama_free(ctx: llama_context_p, /): [], ctypes.c_int64, ) -def llama_time_us() -> int: - ... +def llama_time_us() -> int: ... # LLAMA_API size_t llama_max_devices(void); @ctypes_function("llama_max_devices", [], ctypes.c_size_t) -def llama_max_devices() -> int: - ... +def llama_max_devices() -> int: ... # LLAMA_API size_t llama_max_parallel_sequences(void); @ctypes_function("llama_max_parallel_sequences", [], ctypes.c_size_t) -def llama_max_parallel_sequences() -> int: - ... +def llama_max_parallel_sequences() -> int: ... + + +# LLAMA_API size_t llama_max_tensor_buft_overrides(void); +@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t) +def llama_max_tensor_buft_overrides() -> int: ... # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) -def llama_supports_mmap() -> bool: - ... +def llama_supports_mmap() -> bool: ... # LLAMA_API bool llama_supports_mlock (void); @ctypes_function("llama_supports_mlock", [], ctypes.c_bool) -def llama_supports_mlock() -> bool: - ... +def llama_supports_mlock() -> bool: ... # LLAMA_API bool llama_supports_gpu_offload(void); @ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool) -def llama_supports_gpu_offload() -> bool: - ... +def llama_supports_gpu_offload() -> bool: ... # LLAMA_API bool llama_supports_rpc (void); @ctypes_function("llama_supports_rpc", [], ctypes.c_bool) -def llama_supports_rpc() -> bool: - ... +def llama_supports_rpc() -> bool: ... # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_ctx(ctx: llama_context_p, /) -> int: - ... +def llama_n_ctx(ctx: llama_context_p, /) -> int: ... + + +@ctypes_function("llama_n_ctx_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_ctx_seq(ctx: llama_context_p, /) -> int: ... # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_batch(ctx: llama_context_p, /) -> int: - ... +def llama_n_batch(ctx: llama_context_p, /) -> int: ... # LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); @ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_ubatch(ctx: llama_context_p, /) -> int: - ... +def llama_n_ubatch(ctx: llama_context_p, /) -> int: ... 
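The `llama_context_params` change earlier in this file replaces the `flash_attn` bool with a `flash_attn_type` enum, and `Llama.__init__` in llama.py keeps the old keyword working by mapping it onto the new constants. A minimal sketch of that resolution logic, using only names introduced in this diff (the helper name `resolve_flash_attn_type` is ours; it mirrors, not reuses, the code in `Llama.__init__`):

```python
from typing import Optional

import llama_cpp


def resolve_flash_attn_type(
    flash_attn: bool = False, flash_attn_type: Optional[int] = None
) -> int:
    # An explicit flash_attn_type always wins over the legacy bool.
    if flash_attn_type is not None:
        return flash_attn_type
    # Per the 0.3.18 fix, flash_attn=False maps to DISABLED, never AUTO.
    return (
        llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
        if flash_attn
        else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
    )


assert resolve_flash_attn_type() == llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
assert resolve_flash_attn_type(True) == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
assert (
    resolve_flash_attn_type(flash_attn_type=llama_cpp.LLAMA_FLASH_ATTN_TYPE_AUTO)
    == llama_cpp.LLAMA_FLASH_ATTN_TYPE_AUTO
)
```

Only the enum form can request `AUTO`, which is why `flash_attn_type` takes precedence when both keywords are supplied.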
# LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); @ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_seq_max(ctx: llama_context_p, /) -> int: - ... +def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_ctx_train(model: llama_model_p, /) -> int: - ... +def llama_n_ctx_train(model: llama_model_p, /) -> int: ... # DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead"); @ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_embd(model: llama_model_p, /) -> int: - ... +def llama_n_embd(model: llama_model_p, /) -> int: ... # DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead"); @ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_layer(model: llama_model_p, /) -> int: - ... +def llama_n_layer(model: llama_model_p, /) -> int: ... # DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead"); @ctypes_function("llama_n_head", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_head(model: llama_model_p, /) -> int: - ... +def llama_n_head(model: llama_model_p, /) -> int: ... # DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); @ctypes_function("llama_n_vocab", [llama_vocab_p_ctypes], ctypes.c_int32) -def llama_n_vocab(model: llama_vocab_p, /) -> int: - ... +def llama_n_vocab(model: llama_vocab_p, /) -> int: ... # LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes) -def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: - ... +def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... # LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); @@ -1400,74 +1421,63 @@ def llama_get_memory(ctx: llama_context_p, /) -> Optional[llama_memory_t]: # LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) -def llama_pooling_type(ctx: llama_context_p, /) -> int: - ... - - -# DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); -@ctypes_function( - "llama_get_kv_self", - [llama_context_p_ctypes], - llama_kv_cache_p_ctypes, -) -def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: - """Get the KV cache for self-attention (DEPRECATED)""" - ... +def llama_pooling_type(ctx: llama_context_p, /) -> int: ... # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes) -def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: - ... +def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: ... 
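Several of the stubs above are deprecated `llama_n_*` accessors whose replacements take a vocab or model handle instead. A hedged sketch of the vocabulary-size migration (assumes `model` is an already-loaded `llama_model_p`):

```python
vocab = llama_cpp.llama_model_get_vocab(model)   # may return None on failure
# deprecated: llama_cpp.llama_n_vocab(vocab)
n_vocab = llama_cpp.llama_vocab_n_tokens(vocab)  # current spelling per the header
```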
# LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); @ctypes_function("llama_model_rope_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_model_rope_type(model: llama_model_p, /) -> int: - ... +def llama_model_rope_type(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model); @ctypes_function("llama_model_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_ctx_train(model: llama_model_p, /) -> int: - ... +def llama_model_n_ctx_train(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); @ctypes_function("llama_model_n_embd", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_embd(model: llama_model_p, /) -> int: - ... +def llama_model_n_embd(model: llama_model_p, /) -> int: ... + + +@ctypes_function("llama_model_n_embd_inp", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_embd_inp(model: llama_model_p, /) -> int: ... + + +@ctypes_function("llama_model_n_embd_out", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_embd_out(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); @ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_layer(model: llama_model_p, /) -> int: - ... +def llama_model_n_layer(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); @ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_head(model: llama_model_p, /) -> int: - ... +def llama_model_n_head(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); @ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_head_kv(model: llama_model_p, /) -> int: - ... +def llama_model_n_head_kv(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); @ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_swa(model: llama_model_p, /) -> int: - ... +def llama_model_n_swa(model: llama_model_p, /) -> int: ... # // Get the model's RoPE frequency scaling factor # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); -@ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) -def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: - ... +@ctypes_function( + "llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float +) +def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: ... # // Returns the number of classifier outputs (only valid for classifier models) @@ -1481,7 +1491,9 @@ def llama_model_n_cls_out(model: llama_model_p, /) -> int: # // Returns label of classifier output by index ( Optional[bytes]: """Returns label of classifier output by index. Returns None if no label provided""" ... @@ -1489,14 +1501,12 @@ def llama_model_cls_label(model: llama_model_p, i: int, /) -> Optional[bytes]: # LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); @ctypes_function("llama_vocab_type", [llama_vocab_p_ctypes], ctypes.c_int) -def llama_vocab_type(vocab: llama_vocab_p, /) -> int: - ... +def llama_vocab_type(vocab: llama_vocab_p, /) -> int: ... 
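Among the additions above are `llama_model_n_embd_inp()` and `llama_model_n_embd_out()` next to the existing `llama_model_n_embd()`. A small sketch of querying them together (the helper name is ours; `model` is assumed to be a loaded `llama_model_p`, and the input/output widths presumably only diverge for certain architectures):

```python
def embedding_dims(model) -> tuple[int, int, int]:
    # (hidden size, input embedding width, output embedding width)
    return (
        llama_cpp.llama_model_n_embd(model),
        llama_cpp.llama_model_n_embd_inp(model),
        llama_cpp.llama_model_n_embd_out(model),
    )
```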
# LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab); @ctypes_function("llama_vocab_n_tokens", [llama_vocab_p_ctypes], ctypes.c_int32) -def llama_vocab_n_tokens(vocab: llama_vocab_p, /) -> int: - ... +def llama_vocab_n_tokens(vocab: llama_vocab_p, /) -> int: ... # // Functions to access the model's GGUF metadata scalar values @@ -1611,8 +1621,14 @@ def llama_model_size(model: llama_model_p, /) -> int: # // Get the default chat template. Returns nullptr if not available # // If name is NULL, returns the default chat template # LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name); -@ctypes_function("llama_model_chat_template", [llama_model_p_ctypes, ctypes.c_char_p], ctypes.c_char_p) -def llama_model_chat_template(model: llama_model_p, name: Optional[bytes], /) -> Optional[bytes]: +@ctypes_function( + "llama_model_chat_template", + [llama_model_p_ctypes, ctypes.c_char_p], + ctypes.c_char_p, +) +def llama_model_chat_template( + model: llama_model_p, name: Optional[bytes], / +) -> Optional[bytes]: """Get the default chat template. Returns None if not available If name is None, returns the default chat template""" ... @@ -1663,6 +1679,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool: ... +# // Returns true if the model is hybrid (like Jamba, Granite, etc.) +# LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model); +@ctypes_function("llama_model_is_hybrid", [llama_model_p_ctypes], ctypes.c_bool) +def llama_model_is_hybrid(model: llama_model_p, /) -> bool: + """Returns true if the model is hybrid (like Jamba, Granite, etc.)""" + ... + + # // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) # LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model); @ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool) @@ -1699,6 +1723,7 @@ def llama_model_quantize( # // Adapters # // + # // Load a LoRA adapter from file # LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( # struct llama_model * model, @@ -1710,20 +1735,98 @@ def llama_model_quantize( ) def llama_adapter_lora_init( model: llama_model_p, path_lora: bytes, / -) -> Optional[llama_adapter_lora_p]: - ... +) -> Optional[llama_adapter_lora_p]: ... + + +# // Get metadata value as a string by key name +# LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size); +@ctypes_function( + "llama_adapter_meta_val_str", + [llama_adapter_lora_p_ctypes, ctypes.c_char_p, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_val_str( + adapter: llama_adapter_lora_p, + key: bytes, + buf: Union[bytes, CtypesArray[ctypes.c_char]], + buf_size: int, + /, +) -> int: ... + + +# // Get the number of metadata key/value pairs +# LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter); +@ctypes_function( + "llama_adapter_meta_count", [llama_adapter_lora_p_ctypes], ctypes.c_int32 +) +def llama_adapter_meta_count(adapter: llama_adapter_lora_p, /) -> int: ... 
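`llama_model_is_hybrid()` above joins the existing `llama_model_is_recurrent()` and `llama_model_is_diffusion()` predicates. A hedged classification helper (our naming, not part of the bindings; `model` is a loaded `llama_model_p`):

```python
def model_kind(model) -> str:
    # Coarse classification based on the introspection predicates bound above.
    if llama_cpp.llama_model_is_hybrid(model):
        return "hybrid"       # e.g. Jamba, Granite (per the header comment)
    if llama_cpp.llama_model_is_recurrent(model):
        return "recurrent"
    if llama_cpp.llama_model_is_diffusion(model):
        return "diffusion"    # e.g. LLaDA, Dream
    return "standard attention"
```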
+ + +# // Get metadata key name by index +# LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size); +@ctypes_function( + "llama_adapter_meta_key_by_index", + [llama_adapter_lora_p_ctypes, ctypes.c_int32, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_key_by_index( + adapter: llama_adapter_lora_p, + i: int, + buf: Union[bytes, CtypesArray[ctypes.c_char]], + buf_size: int, + /, +) -> int: ... + + +# // Get metadata value as a string by index +# LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size); +@ctypes_function( + "llama_adapter_meta_val_str_by_index", + [llama_adapter_lora_p_ctypes, ctypes.c_int32, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_val_str_by_index( + adapter: llama_adapter_lora_p, + i: int, + buf: Union[bytes, CtypesArray[ctypes.c_char]], + buf_size: int, + /, +) -> int: ... # // Manually free a LoRA adapter -# // Note: loaded adapters will be free when the associated model is deleted -# LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); +# // NOTE: loaded adapters will be free when the associated model is deleted (DEPRECATED) +# LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter), +# "adapters are now freed together with the associated model"); @ctypes_function( "llama_adapter_lora_free", [llama_adapter_lora_p_ctypes], None, ) -def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): - ... +def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): ... + + +# // Get the invocation tokens if the current lora is an alora +# LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter); +@ctypes_function( + "llama_adapter_get_alora_n_invocation_tokens", + [llama_adapter_lora_p_ctypes], + ctypes.c_uint64, +) +def llama_adapter_get_alora_n_invocation_tokens( + adapter: llama_adapter_lora_p, / +) -> int: ... + + +# LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter); +@ctypes_function( + "llama_adapter_get_alora_invocation_tokens", + [llama_adapter_lora_p_ctypes], + ctypes.POINTER(llama_token), +) +def llama_adapter_get_alora_invocation_tokens( + adapter: llama_adapter_lora_p, / +) -> CtypesPointer[llama_token]: ... # // The following functions operate on a llama_context, hence the naming: llama_verb_... @@ -1825,6 +1928,7 @@ def llama_apply_adapter_cvec( # // Memory # // + # // Clear the memory contents # // If data == true, the data buffers will also be cleared together with the metadata # LLAMA_API void llama_memory_clear( @@ -1916,9 +2020,7 @@ def llama_memory_seq_cp( # LLAMA_API void llama_memory_seq_keep( # llama_memory_t mem, # llama_seq_id seq_id); -@ctypes_function( - "llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None -) +@ctypes_function("llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None) def llama_memory_seq_keep(mem: llama_memory_t, seq_id: Union[llama_seq_id, int], /): """Removes all tokens that do not belong to the specified sequence""" ... @@ -2036,261 +2138,23 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: ... 
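The new `llama_adapter_meta_*` bindings above mirror the existing model metadata accessors. A sketch of enumerating an adapter's key/value pairs (the helper and the 512-byte buffer are our choices; `adapter` is assumed to come from `llama_adapter_lora_init()`, and a negative return is assumed to signal failure, as with the model metadata functions):

```python
import ctypes

import llama_cpp


def adapter_metadata(adapter) -> dict[str, str]:
    # Walk all metadata entries of a loaded LoRA adapter.
    meta: dict[str, str] = {}
    buf = ctypes.create_string_buffer(512)
    for i in range(llama_cpp.llama_adapter_meta_count(adapter)):
        if llama_cpp.llama_adapter_meta_key_by_index(adapter, i, buf, len(buf)) < 0:
            continue
        key = buf.value.decode("utf-8")
        if llama_cpp.llama_adapter_meta_val_str_by_index(adapter, i, buf, len(buf)) < 0:
            continue
        meta[key] = buf.value.decode("utf-8")
    return meta
```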
-# // -# // KV cache for self-attention (TODO: deprecate in favor of llama_memory) -# // - -# // Returns the number of tokens in the KV cache (slow, use only for debug) -# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), -# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: - """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" - ... - - -# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) -# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), -# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: - """Returns the number of used KV cells (DEPRECATED)""" - ... - - -# // Clear the KV cache - both cell info is erased and KV data is zeroed -# DEPRECATED(LLAMA_API void llama_kv_self_clear( -# struct llama_context * ctx), -# "Use llama_memory_clear() instead"); -@ctypes_function( - "llama_kv_self_clear", [llama_context_p_ctypes], None -) -def llama_kv_self_clear(ctx: llama_context_p, /): - """Clear the KV cache (DEPRECATED)""" - ... - - -# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) -# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails -# // seq_id < 0 : match any sequence -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1), -# "Use llama_memory_seq_rm() instead"); -@ctypes_function( - "llama_kv_self_seq_rm", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ], - ctypes.c_bool, -) -def llama_kv_self_seq_rm( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -) -> bool: - """Remove tokens from KV cache (DEPRECATED)""" - ... - - -# // Copy all tokens that belong to the specified sequence to another sequence -# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( -# struct llama_context * ctx, -# llama_seq_id seq_id_src, -# llama_seq_id seq_id_dst, -# llama_pos p0, -# llama_pos p1), -# "Use llama_memory_seq_cp() instead"); -@ctypes_function( - "llama_kv_self_seq_cp", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_seq_id, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_cp( - ctx: llama_context_p, - seq_id_src: Union[llama_seq_id, int], - seq_id_dst: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -): - """Copy tokens in KV cache (DEPRECATED)""" - ... 
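For callers migrating off the removed `llama_kv_self_*` bindings above, the same operations go through the memory handle. A migration sketch, assuming a `llama_get_memory()` binding is available (as the deprecation notes direct) and `ctx` is a `llama_context_p`:

```python
import llama_cpp

def clear_and_trim(ctx, seq_id: int, keep_up_to: int) -> None:
    mem = llama_cpp.llama_get_memory(ctx)
    # Was: llama_kv_self_seq_rm(ctx, seq_id, keep_up_to, -1)
    llama_cpp.llama_memory_seq_rm(mem, seq_id, keep_up_to, -1)
    # Was: llama_kv_self_clear(ctx); data=True also zeroes the KV buffers.
    llama_cpp.llama_memory_clear(mem, True)
```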
- - -# // Removes all tokens that do not belong to the specified sequence -# DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_keep() instead"); -@ctypes_function( - "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None -) -def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): - """Keep only specified sequence in KV cache (DEPRECATED)""" - ... - - -# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_add( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# llama_pos delta), -# "Use llama_memory_seq_add() instead"); -@ctypes_function( - "llama_kv_self_seq_add", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_add( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - delta: Union[llama_pos, int], - /, -): - """Add delta to sequence positions in KV cache (DEPRECATED)""" - ... - - -# // Integer division of the positions by factor of `d > 1` -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_div( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# int d), -# "Use llama_memory_seq_div() instead"); -@ctypes_function( - "llama_kv_self_seq_div", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ctypes.c_int, - ], - None, -) -def llama_kv_self_seq_div( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - d: Union[ctypes.c_int, int], - /, -): - """Divide sequence positions in KV cache (DEPRECATED)""" - ... - - -# // Returns the smallest position present in the KV cache for the specified sequence -# // This is typically non-zero only for SWA caches -# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache -# // Return -1 if the sequence is empty -# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_pos_min() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_min( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the smallest position in KV cache for sequence (DEPRECATED)""" - ... 
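The position-manipulation variants removed here (`seq_keep`, `seq_add`, `seq_div`, `seq_pos_min`) likewise map one-to-one onto `llama_memory_*` calls. A context-shift sketch under the same assumptions as above (hypothetical helper name, `llama_memory_seq_add`/`seq_pos_min` bindings assumed present as the deprecation messages indicate):

```python
import llama_cpp

def shift_sequence_back(ctx, seq_id: int, n_discard: int) -> None:
    mem = llama_cpp.llama_get_memory(ctx)
    if not llama_cpp.llama_memory_can_shift(mem):
        return
    start = llama_cpp.llama_memory_seq_pos_min(mem, seq_id)
    # Drop the oldest n_discard positions, then slide the remainder back so
    # decoding can continue without exceeding the context window.
    llama_cpp.llama_memory_seq_rm(mem, seq_id, start, start + n_discard)
    llama_cpp.llama_memory_seq_add(mem, seq_id, start + n_discard, -1, -n_discard)
```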
- - -# // Returns the largest position present in the KV cache for the specified sequence -# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache -# // Return -1 if the sequence is empty -# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_pos_max() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_max( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the largest position in KV cache for sequence (DEPRECATED)""" - ... - - -# // Defragment the KV cache -# // This will be applied: -# // - lazily on next llama_decode() -# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), -# "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); -@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) -def llama_kv_self_defrag(ctx: llama_context_p, /): - """Defragment the KV cache (DEPRECATED)""" +# // Print memory breakdown (for debugging) +# LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx); +@ctypes_function("llama_memory_breakdown_print", [llama_context_p_ctypes], None) +def llama_memory_breakdown_print(ctx: llama_context_p, /) -> None: + """Print memory breakdown (for debugging)""" ... -# // Check if the context supports KV cache shifting -# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), -# "use llama_memory_can_shift() instead"); -@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: - """Check if the context supports KV cache shifting (DEPRECATED)""" - ... - - -# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) -# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), -# "simply remove this call, updates are applied lazily on the next llama_decode()"); -@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) -def llama_kv_self_update(ctx: llama_context_p, /): - """Apply the KV cache updates (DEPRECATED)""" - ... +# // +# // KV cache for self-attention (TODO: deprecate in favor of llama_memory) +# // +# +# State / sessions # // -# // State / sessions -# // + # // Returns the *actual* size in bytes of the state # // (logits, embedding and memory) @@ -2420,8 +2284,7 @@ def llama_state_load_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> bool: - ... +) -> bool: ... # LLAMA_API DEPRECATED(bool llama_load_session_file( @@ -2449,8 +2312,7 @@ def llama_load_session_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> bool: - ... +) -> bool: ... # LLAMA_API bool llama_state_save_file( @@ -2474,8 +2336,7 @@ def llama_state_save_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> bool: - ... +) -> bool: ... # LLAMA_API DEPRECATED(bool llama_save_session_file( @@ -2500,8 +2361,7 @@ def llama_save_session_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> bool: - ... +) -> bool: ... 
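The new `llama_memory_breakdown_print()` pairs naturally with the existing state/session bindings when debugging memory use before a checkpoint. A sketch, assuming `ctx` is a `llama_context_p` and `tokens` is the Python list of token ids already evaluated in that context (helper name and file path are illustrative):

```python
import llama_cpp

def debug_and_checkpoint(ctx, tokens: list, path: bytes = b"session.bin") -> None:
    # New in this release: print a per-buffer memory breakdown for debugging.
    llama_cpp.llama_memory_breakdown_print(ctx)

    # Persist the full context state plus the token history to disk.
    tok_array = (llama_cpp.llama_token * len(tokens))(*tokens)
    if not llama_cpp.llama_state_save_file(ctx, path, tok_array, len(tokens)):
        raise RuntimeError("failed to save session state")
```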
# // Get the exact size needed to copy the state of a single sequence @@ -2599,8 +2459,7 @@ def llama_state_seq_save_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> int: - ... +) -> int: ... # LLAMA_API size_t llama_state_seq_load_file( @@ -2630,14 +2489,14 @@ def llama_state_seq_load_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> int: - ... +) -> int: ... # // # // Decoding # // + # // Return batch for single sequence of tokens # // The sequence ID will be fixed to 0 # // The position of the tokens will be tracked automatically by llama_decode @@ -2947,14 +2806,14 @@ def llama_get_embeddings_seq( # // Vocab # // + # LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token); @ctypes_function( "llama_vocab_get_text", [llama_vocab_p_ctypes, llama_token], ctypes.c_char_p ) def llama_vocab_get_text( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> bytes: - ... +) -> bytes: ... # LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token); @@ -2963,8 +2822,7 @@ def llama_vocab_get_text( ) def llama_vocab_get_score( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> float: - ... +) -> float: ... # LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token); @@ -2973,8 +2831,7 @@ def llama_vocab_get_score( ) def llama_vocab_get_attr( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> int: - ... +) -> int: ... # // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) @@ -3055,8 +2912,7 @@ def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], ctypes.c_bool, ) -def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool: - ... +def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool: ... # LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab); @@ -3065,8 +2921,7 @@ def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool: [llama_vocab_p_ctypes], ctypes.c_bool, ) -def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: - ... +def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: ... # LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab); @@ -3075,8 +2930,7 @@ def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: [llama_vocab_p_ctypes], ctypes.c_bool, ) -def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: - ... +def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: ... # LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab); @@ -3085,8 +2939,7 @@ def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ... # LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab); @@ -3095,8 +2948,7 @@ def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ... 
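The vocab accessors reflowed above are straightforward to drive from Python. A short illustrative loop, assuming `vocab` is a `llama_vocab_p` (e.g. obtained via `llama_model_get_vocab()`):

```python
import llama_cpp

def dump_vocab_head(vocab, limit: int = 10) -> None:
    n = min(limit, llama_cpp.llama_vocab_n_tokens(vocab))
    for tok in range(n):
        text = llama_cpp.llama_vocab_get_text(vocab, tok)
        score = llama_cpp.llama_vocab_get_score(vocab, tok)
        print(tok, text, score)
    # Tokenizer behaviour flags exposed by the same API family.
    print("add_bos:", llama_cpp.llama_vocab_get_add_bos(vocab))
    print("add_eos:", llama_cpp.llama_vocab_get_add_eos(vocab))
```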
# LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab); @@ -3105,8 +2957,7 @@ def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ... # LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab); @@ -3115,8 +2966,7 @@ def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ... # LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab); @@ -3125,8 +2975,7 @@ def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ... # LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab); @@ -3135,8 +2984,7 @@ def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ... # DEPRECATED functions @@ -3148,8 +2996,7 @@ def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ) def llama_token_get_text( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> bytes: - ... +) -> bytes: ... # DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead"); @@ -3160,8 +3007,8 @@ def llama_token_get_text( ) def llama_token_get_score( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> float: - ... +) -> float: ... + # DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead"); @ctypes_function( @@ -3171,8 +3018,8 @@ def llama_token_get_score( ) def llama_token_get_attr( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> int: - ... +) -> int: ... + # DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead"); @ctypes_function( @@ -3182,8 +3029,8 @@ def llama_token_get_attr( ) def llama_token_is_eog( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> bool: - ... +) -> bool: ... + # DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead"); @ctypes_function( @@ -3193,8 +3040,8 @@ def llama_token_is_eog( ) def llama_token_is_control( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> bool: - ... +) -> bool: ... + # DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead"); @ctypes_function( @@ -3202,8 +3049,8 @@ def llama_token_is_control( [llama_vocab_p_ctypes], llama_token, ) -def llama_token_bos(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_bos(vocab: llama_vocab_p, /) -> int: ... 
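The `llama_vocab_fim_*` getters above return the special tokens needed to build a fill-in-the-middle prompt. A sketch of that assembly, assuming the usual convention that a missing special token is reported as -1 (LLAMA_TOKEN_NULL); the helper name is illustrative:

```python
import llama_cpp

def fim_prompt_tokens(vocab, prefix_tokens: list, suffix_tokens: list) -> list:
    pre = llama_cpp.llama_vocab_fim_pre(vocab)
    suf = llama_cpp.llama_vocab_fim_suf(vocab)
    mid = llama_cpp.llama_vocab_fim_mid(vocab)
    if -1 in (pre, suf, mid):
        raise ValueError("model does not define FIM special tokens")
    # Standard PSM ordering: <pre> prefix <suf> suffix <mid> ... completion
    return [pre, *prefix_tokens, suf, *suffix_tokens, mid]
```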
+ # DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead"); @ctypes_function( @@ -3211,8 +3058,8 @@ def llama_token_bos(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_eos(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_eos(vocab: llama_vocab_p, /) -> int: ... + # DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead"); @ctypes_function( @@ -3220,8 +3067,8 @@ def llama_token_eos(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_eot(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_eot(vocab: llama_vocab_p, /) -> int: ... + # DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead"); @ctypes_function( @@ -3229,8 +3076,8 @@ def llama_token_eot(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_cls(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_cls(vocab: llama_vocab_p, /) -> int: ... + # DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead"); @ctypes_function( @@ -3238,8 +3085,7 @@ def llama_token_cls(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_sep(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_sep(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead"); @@ -3248,8 +3094,7 @@ def llama_token_sep(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_nl(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_nl(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead"); @@ -3258,8 +3103,7 @@ def llama_token_nl(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_pad(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_pad(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead"); @@ -3268,8 +3112,8 @@ def llama_token_pad(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], ctypes.c_bool, ) -def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: - ... +def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: ... + # DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead"); @ctypes_function( @@ -3277,8 +3121,7 @@ def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: [llama_vocab_p_ctypes], ctypes.c_bool, ) -def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: - ... +def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: ... # DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead"); @@ -3287,8 +3130,8 @@ def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ... 
+ # DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead"); @ctypes_function( @@ -3296,8 +3139,8 @@ def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ... + # DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead"); @ctypes_function( @@ -3305,8 +3148,8 @@ def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ... + # DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead"); @ctypes_function( @@ -3314,8 +3157,8 @@ def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ... + # DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead"); @ctypes_function( @@ -3323,8 +3166,8 @@ def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ... + # DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead"); @ctypes_function( @@ -3332,8 +3175,8 @@ def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ... + # // CLS is equivalent to BOS # DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification @@ -3343,8 +3186,7 @@ def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: ... # // @@ -3353,6 +3195,7 @@ def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: # // The API is thread-safe. # // + # /// @details Convert the provided text into tokens. # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. # /// @return Returns the number of tokens on success, no more than n_tokens_max @@ -3512,6 +3355,7 @@ def llama_detokenize( # // Chat templates # // + # /// Apply chat template. Inspired by hf apply_chat_template() on python. # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" # /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. 
See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template @@ -3535,9 +3379,9 @@ def llama_detokenize( ctypes.c_char_p, # tmpl ctypes.POINTER(llama_chat_message), # chat ctypes.c_size_t, # n_msg - ctypes.c_bool, # add_ass (added) + ctypes.c_bool, # add_ass (added) ctypes.c_char_p, # buf - ctypes.c_int32, # length + ctypes.c_int32, # length ], ctypes.c_int32, ) @@ -3611,11 +3455,11 @@ def llama_chat_builtin_templates( # struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL # void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL + # // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph # //void (*apply_ggml) (struct llama_sampler * smpl, ...); # }; -class llama_sampler_i(ctypes.Structure): - ... +class llama_sampler_i(ctypes.Structure): ... # struct llama_sampler { @@ -3662,8 +3506,7 @@ class llama_sampler(ctypes.Structure): ) def llama_sampler_init( iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); @@ -3672,8 +3515,7 @@ def llama_sampler_init( [llama_sampler_p_ctypes], ctypes.c_char_p, ) -def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes: - ... +def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes: ... # LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token); @@ -3682,8 +3524,7 @@ def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes: [llama_sampler_p_ctypes, llama_token], None, ) -def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], /): - ... +def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], /): ... # LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p); @@ -3694,8 +3535,7 @@ def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], ) def llama_sampler_apply( smpl: llama_sampler_p, cur_p: CtypesArray[llama_token_data_array], / -): - ... +): ... # LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl); @@ -3704,8 +3544,7 @@ def llama_sampler_apply( [llama_sampler_p_ctypes], None, ) -def llama_sampler_reset(smpl: llama_sampler_p, /): - ... +def llama_sampler_reset(smpl: llama_sampler_p, /): ... # LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl); @@ -3714,8 +3553,7 @@ def llama_sampler_reset(smpl: llama_sampler_p, /): [llama_sampler_p_ctypes], llama_sampler_p_ctypes, ) -def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p: - ... +def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p: ... # // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add) @@ -3725,21 +3563,22 @@ def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p: [llama_sampler_p_ctypes], None, ) -def llama_sampler_free(smpl: llama_sampler_p, /): - ... +def llama_sampler_free(smpl: llama_sampler_p, /): ... # // llama_sampler_chain # // a type of llama_sampler that can chain multiple samplers one after another + # LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params); @ctypes_function( "llama_sampler_chain_init", [llama_sampler_chain_params], llama_sampler_p_ctypes, ) -def llama_sampler_chain_init(params: llama_sampler_chain_params, /) -> llama_sampler_p: - ... 
+def llama_sampler_chain_init( + params: llama_sampler_chain_params, / +) -> llama_sampler_p: ... # // important: takes ownership of the sampler object and will free it when llama_sampler_free is called @@ -3749,8 +3588,7 @@ def llama_sampler_chain_init(params: llama_sampler_chain_params, /) -> llama_sam [llama_sampler_p_ctypes, llama_sampler_p_ctypes], None, ) -def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): - ... +def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): ... # LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); @@ -3761,8 +3599,7 @@ def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): ) def llama_sampler_chain_get( chain: llama_sampler_p, i: Union[ctypes.c_int32, int], / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); @@ -3771,8 +3608,7 @@ def llama_sampler_chain_get( [llama_sampler_p_ctypes], ctypes.c_int, ) -def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int: - ... +def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int: ... # // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed @@ -3784,39 +3620,27 @@ def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int: ) def llama_sampler_chain_remove( chain: llama_sampler_p, i: Union[ctypes.c_int32, int], / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # // available samplers: + # LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes) -def llama_sampler_init_greedy() -> llama_sampler_p: - ... +def llama_sampler_init_greedy() -> llama_sampler_p: ... # LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); @ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes) -def llama_sampler_init_dist(seed: int) -> llama_sampler_p: - ... - - -# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. -# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. -# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), -# "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); -@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) -def llama_sampler_init_softmax() -> llama_sampler_p: - ... +def llama_sampler_init_dist(seed: int) -> llama_sampler_p: ... # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 # /// Setting k <= 0 makes this a noop # LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); @ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes) -def llama_sampler_init_top_k(k: int) -> llama_sampler_p: - ... +def llama_sampler_init_top_k(k: int) -> llama_sampler_p: ... # /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -3826,8 +3650,7 @@ def llama_sampler_init_top_k(k: int) -> llama_sampler_p: [ctypes.c_float, ctypes.c_size_t], llama_sampler_p_ctypes, ) -def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: - ... 
+def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: ... # /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841 @@ -3837,8 +3660,7 @@ def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: [ctypes.c_float, ctypes.c_size_t], llama_sampler_p_ctypes, ) -def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: - ... +def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: ... # /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. @@ -3848,15 +3670,13 @@ def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: [ctypes.c_float, ctypes.c_size_t], llama_sampler_p_ctypes, ) -def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p: - ... +def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p: ... # /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf # LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t); @ctypes_function("llama_sampler_init_temp", [ctypes.c_float], llama_sampler_p_ctypes) -def llama_sampler_init_temp(t: float) -> llama_sampler_p: - ... +def llama_sampler_init_temp(t: float) -> llama_sampler_p: ... # /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. @@ -3868,8 +3688,7 @@ def llama_sampler_init_temp(t: float) -> llama_sampler_p: ) def llama_sampler_init_temp_ext( t: float, delta: float, exponent: float -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 @@ -3881,8 +3700,7 @@ def llama_sampler_init_temp_ext( ) def llama_sampler_init_xtc( p: float, t: float, min_keep: int, seed: int, / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641 @@ -3892,8 +3710,7 @@ def llama_sampler_init_xtc( [ctypes.c_float], llama_sampler_p_ctypes, ) -def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: - ... +def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: ... # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @@ -3910,8 +3727,7 @@ def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: ) def llama_sampler_init_mirostat( n_vocab: int, seed: int, tau: float, eta: float, m: int, / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @@ -3926,8 +3742,30 @@ def llama_sampler_init_mirostat( ) def llama_sampler_init_mirostat_v2( seed: int, tau: float, eta: float, / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... + + +# /// adaptive-p: select tokens near a configurable target probability over time. 
+# /// +# /// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) +# /// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) +# /// @param seed RNG seed +# /// +# /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 +# /// +# LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p( +# float target, +# float decay, +# uint32_t seed); +# NOTE: Binding commented out - requires library rebuild against b7868+ +# @ctypes_function( +# "llama_sampler_init_adaptive_p", +# [ctypes.c_float, ctypes.c_float, ctypes.c_uint32], +# llama_sampler_p_ctypes, +# ) +# def llama_sampler_init_adaptive_p( +# target: float, decay: float, seed: int, / +# ) -> llama_sampler_p: ... # /// @details Intializes a GBNF grammar, see grammars/README.md for details. @@ -3942,8 +3780,7 @@ def llama_sampler_init_mirostat_v2( ) def llama_sampler_init_grammar( vocab: llama_vocab_p, grammar_str: bytes, grammar_root: bytes, / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy( @@ -3977,8 +3814,7 @@ def llama_sampler_init_grammar_lazy( trigger_tokens: CtypesArray[llama_token], num_trigger_tokens: int, /, -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639 @@ -4012,8 +3848,7 @@ def llama_sampler_init_grammar_lazy_patterns( trigger_tokens: CtypesArray[llama_token], num_trigger_tokens: int, /, -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. @@ -4033,8 +3868,7 @@ def llama_sampler_init_penalties( penalty_freq: float, penalty_present: float, /, -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 @@ -4071,8 +3905,7 @@ def llama_sampler_init_dry( seq_breakers, num_breakers: int, /, -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( @@ -4086,8 +3919,7 @@ def llama_sampler_init_dry( ) def llama_sampler_init_logit_bias( n_vocab: int, n_logit_bias: int, logit_bias: CtypesArray[llama_logit_bias], / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # // this sampler is meant to be used for fill-in-the-middle infilling @@ -4097,8 +3929,7 @@ def llama_sampler_init_logit_bias( [llama_vocab_p_ctypes], llama_sampler_p_ctypes, ) -def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p: - ... +def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p: ... # // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise @@ -4108,8 +3939,7 @@ def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p: [llama_sampler_p_ctypes], ctypes.c_uint32, ) -def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: - ... +def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: ... # /// @details Sample and accept a token from the idx-th output of the last evaluation @@ -4121,14 +3951,14 @@ def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: ) def llama_sampler_sample( smpl: llama_sampler_p, ctx: llama_context_p, idx: int, / -) -> int: - ... 
+) -> int: ... # // # // Model split # // + # /// @details Build a split GGUF final path for this chunk. # LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); @ctypes_function( @@ -4170,8 +4000,7 @@ def llama_split_prefix( # // Print system information # LLAMA_API const char * llama_print_system_info(void); @ctypes_function("llama_print_system_info", [], ctypes.c_char_p) -def llama_print_system_info() -> bytes: - ... +def llama_print_system_info() -> bytes: ... # // Set callback for all future logging events. @@ -4193,6 +4022,22 @@ def llama_log_set( ... +# // Get the current log callback and user data +# LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data); +@ctypes_function( + "llama_log_get", + [ctypes.c_void_p, ctypes.c_void_p], + None, +) +def llama_log_get( + log_callback: ctypes.c_void_p, + user_data: ctypes.c_void_p, + /, +): + """Get the current log callback and user data.""" + ... + + # // # // Performance utils # // @@ -4203,6 +4048,7 @@ def llama_log_set( # double t_p_eval_ms; # double t_eval_ms; + # int32_t n_p_eval; # int32_t n_eval; # int32_t n_reused; // number of times a ggml compute graph had been reused @@ -4222,6 +4068,7 @@ class llama_perf_context_data(ctypes.Structure): # struct llama_perf_sampler_data { # double t_sample_ms; + # int32_t n_sample; # }; class llama_perf_sampler_data(ctypes.Structure): @@ -4237,8 +4084,7 @@ class llama_perf_sampler_data(ctypes.Structure): [llama_context_p_ctypes], llama_perf_context_data, ) -def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data: - ... +def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data: ... # LLAMA_API void llama_perf_context_print(const struct llama_context * ctx); @@ -4247,8 +4093,7 @@ def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data: [llama_context_p_ctypes], None, ) -def llama_perf_context_print(ctx: llama_context_p, /): - ... +def llama_perf_context_print(ctx: llama_context_p, /): ... # LLAMA_API void llama_perf_context_reset( struct llama_context * ctx); @@ -4257,8 +4102,7 @@ def llama_perf_context_print(ctx: llama_context_p, /): [llama_context_p_ctypes], None, ) -def llama_perf_context_reset(ctx: llama_context_p, /): - ... +def llama_perf_context_reset(ctx: llama_context_p, /): ... # // NOTE: the following work only with samplers constructed via llama_sampler_chain_init @@ -4268,8 +4112,7 @@ def llama_perf_context_reset(ctx: llama_context_p, /): [llama_sampler_p_ctypes], llama_perf_sampler_data, ) -def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data: - ... +def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data: ... # LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); @@ -4278,8 +4121,7 @@ def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data: [llama_sampler_p_ctypes], None, ) -def llama_perf_sampler_print(chain: llama_sampler_p, /): - ... +def llama_perf_sampler_print(chain: llama_sampler_p, /): ... # LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); @@ -4288,8 +4130,7 @@ def llama_perf_sampler_print(chain: llama_sampler_p, /): [llama_sampler_p_ctypes], None, ) -def llama_perf_sampler_reset(chain: llama_sampler_p, /): - ... +def llama_perf_sampler_reset(chain: llama_sampler_p, /): ... 
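The sampler bindings reformatted in the hunks above are normally composed into a chain, sampled from after each decode, and profiled with the `llama_perf_sampler_*` helpers. A sketch of that flow, assuming a `llama_sampler_chain_default_params()` binding is available and `ctx` is a decoded `llama_context_p`:

```python
import llama_cpp

def build_sampler_chain(seed: int = 1234) -> "llama_cpp.llama_sampler_p":
    # Typical top-k -> top-p -> temperature -> dist chain.
    params = llama_cpp.llama_sampler_chain_default_params()
    chain = llama_cpp.llama_sampler_chain_init(params)
    for smpl in (
        llama_cpp.llama_sampler_init_top_k(40),
        llama_cpp.llama_sampler_init_top_p(0.95, 1),
        llama_cpp.llama_sampler_init_temp(0.8),
        llama_cpp.llama_sampler_init_dist(seed),
    ):
        # The chain takes ownership; do not call llama_sampler_free() on smpl.
        llama_cpp.llama_sampler_chain_add(chain, smpl)
    return chain

# After llama_decode(), sample from the most recent logits and print timings:
#   token = llama_cpp.llama_sampler_sample(chain, ctx, -1)
#   llama_cpp.llama_perf_sampler_print(chain)
```

The commented `llama_sampler_init_adaptive_p` binding would slot into the same chain once it is enabled against a b7868+ build of the shared library.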
# // @@ -4298,7 +4139,10 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /): # // function that returns whether or not a given tensor contains trainable parameters # typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata); -llama_opt_param_filter = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) +llama_opt_param_filter = ctypes.CFUNCTYPE( + ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p +) + # // always returns true # LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata); @@ -4307,8 +4151,9 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /): [ctypes.c_void_p, ctypes.c_void_p], ctypes.c_bool, ) -def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, /) -> bool: - ... +def llama_opt_param_filter_all( + tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, / +) -> bool: ... # struct llama_opt_params { @@ -4317,6 +4162,7 @@ def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_ # llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters # void * param_filter_ud; // userdata for determining which tensors contain trainable parameters + # ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters # void * get_opt_pars_ud; // userdata for calculating optimizer parameters # }; @@ -4325,7 +4171,10 @@ class llama_opt_params(ctypes.Structure): ("n_ctx_train", ctypes.c_uint32), ("param_filter", llama_opt_param_filter), ("param_filter_ud", ctypes.c_void_p), - ("get_opt_pars", ctypes.c_void_p), # ggml_opt_get_optimizer_params - not implemented here + ( + "get_opt_pars", + ctypes.c_void_p, + ), # ggml_opt_get_optimizer_params - not implemented here ("get_opt_pars_ud", ctypes.c_void_p), ] @@ -4336,8 +4185,9 @@ class llama_opt_params(ctypes.Structure): [llama_context_p_ctypes, llama_model_p_ctypes, llama_opt_params], None, ) -def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, /): - ... +def llama_opt_init( + lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, / +): ... # LLAMA_API void llama_opt_epoch( @@ -4353,7 +4203,7 @@ def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: lla [ llama_context_p_ctypes, ctypes.c_void_p, # ggml_opt_dataset_t - ctypes.c_void_p, # ggml_opt_result_t + ctypes.c_void_p, # ggml_opt_result_t ctypes.c_void_p, # ggml_opt_result_t ctypes.c_int64, ctypes.c_void_p, # ggml_opt_epoch_callback @@ -4370,5 +4220,4 @@ def llama_opt_epoch( callback_train: ctypes.c_void_p, callback_eval: ctypes.c_void_p, /, -): - ... +): ... diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4227c9be4..3bcc99099 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4227c9be4268ac844921b90f31595f81236bd317 +Subproject commit 3bcc990997f201114ee6b6abdec5eb43683d7af2
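The training hooks in the hunk above expose `llama_opt_param_filter` as a plain `CFUNCTYPE`, so a Python callback can decide which tensors are trainable. A sketch of wiring up `llama_opt_params` under that assumption; note that `get_opt_pars` is deliberately left NULL here because the ggml optimizer-parameter callback is not bound on the Python side in this revision, so this is illustrative rather than a complete training setup:

```python
import ctypes
import llama_cpp

# Python-side parameter filter: train every tensor (mirrors
# llama_opt_param_filter_all, but shows how a custom filter is wired up).
@llama_cpp.llama_opt_param_filter
def train_everything(tensor: ctypes.c_void_p, userdata: ctypes.c_void_p) -> bool:
    return True

def make_opt_params(n_ctx_train: int) -> "llama_cpp.llama_opt_params":
    params = llama_cpp.llama_opt_params()
    params.n_ctx_train = n_ctx_train
    params.param_filter = train_everything
    params.param_filter_ud = None
    # ggml_opt_get_optimizer_params callback is not wrapped; left NULL here.
    params.get_opt_pars = None
    params.get_opt_pars_ud = None
    return params
```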