40 changes: 38 additions & 2 deletions CHANGELOG.md
@@ -7,6 +7,43 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.18]

- feat: Update llama.cpp to ggerganov/llama.cpp@3bcc99099 (b7868)
- feat: GLM 4.7 Flash support via upstream CUDA/Flash Attention optimizations
- feat: Qwen3 Next model support
- feat: Self-speculative decoding support (no draft model required)
- feat: Add `use_direct_io` field to `llama_model_params`
- feat: Add `llama_model_is_hybrid()` for hybrid models (Jamba, Granite); see the sketch after this list
- feat: Add `llama_max_tensor_buft_overrides()`
- feat: Add adapter metadata functions (`llama_adapter_meta_*`)
- feat: Add aLoRA invocation token functions (`llama_adapter_get_alora_*`)
- feat: Add `llama_memory_breakdown_print()` for debugging
- feat: Add `llama_log_get()` to retrieve current log callback
- feat: Add `llama_sampler_init_adaptive_p()` (commented, requires rebuild)
- fix: Map `flash_attn=False` to `DISABLED` (was incorrectly `AUTO`)
- NOTE: `llama_adapter_lora_free` is now deprecated (adapters freed with model)
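
A minimal sketch of calling the new capability query from the low-level bindings. The ctypes signature of `llama_model_is_hybrid()` (model pointer in, bool out) is assumed by analogy with the existing `llama_model_is_recurrent`, the model path is a placeholder, and `llm._model.model` is an internal attribute rather than a stable API:

```python
import llama_cpp
from llama_cpp import Llama

llm = Llama(model_path="model.gguf", verbose=False)  # placeholder path

# Assumption: llama_model_is_hybrid takes the raw llama_model pointer and
# returns a bool, mirroring llama_model_is_recurrent; llm._model.model is
# an implementation detail of the current wrapper, not a public interface.
is_hybrid = llama_cpp.llama_model_is_hybrid(llm._model.model)
print("hybrid (Jamba/Granite-style) memory:", bool(is_hybrid))
```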

## [0.3.17]

- feat: Update llama.cpp to ggerganov/llama.cpp@95ea9e086 (b7652)
- feat: Add `flash_attn_type` parameter to `Llama()` for forward compatibility
- feat: Add `LLAMA_FLASH_ATTN_TYPE_*` enum (AUTO, DISABLED, ENABLED)
- feat: Add `LLAMA_PARAMS_FIT_STATUS_*` enum
- feat: Add `LLAMA_MODEL_META_KEY_*` enum constants
- feat: Add `LLAMA_ROPE_TYPE_IMROPE` constant
- feat: Add `no_host` and `no_alloc` fields to `llama_model_params`
- feat: Add `flash_attn_type`, `samplers`, `n_samplers` fields to `llama_context_params`
- feat: Add `llama_n_ctx_seq`, `llama_model_n_embd_inp`, `llama_model_n_embd_out` functions
- fix: Replace deprecated `llama_sampler_init_softmax` with `llama_sampler_init_temp(1.0)` in LlamaSampler.add_softmax()
- fix: Update embed() to use wrapper methods instead of direct C API calls
- fix: Add `LLAMA_INSTALL_VERSION` fallback in CMakeLists.txt for mtmd build
- removed: `llama_get_kv_self` binding (use `llama_get_memory` wrapper instead)
- removed: `llama_kv_self_*` bindings (11 functions, use `llama_memory_*` wrappers instead)
- removed: `llama_sampler_init_softmax` binding
- BREAKING (C API): `flash_attn` bool replaced with `flash_attn_type` enum in context params
- NOTE: Python API backward compatible: both `flash_attn` bool and `flash_attn_type` enum work (see the sketch after this list)
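
A hedged sketch of the two equivalent ways to configure flash attention after this release. The model path is a placeholder, and it is assumed that the `LLAMA_FLASH_ATTN_TYPE_*` constants are re-exported at the package level like the other `LLAMA_*` enums:

```python
import llama_cpp
from llama_cpp import Llama

# New style: explicit enum (AUTO / DISABLED / ENABLED).
llm_new = Llama(
    model_path="model.gguf",  # placeholder path
    flash_attn_type=llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED,
)

# Old style still accepted: the bool is translated to the enum internally
# (and, as of 0.3.18, flash_attn=False maps to DISABLED instead of AUTO).
llm_old = Llama(model_path="model.gguf", flash_attn=True)
```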

## [0.3.16]

- feat: Update llama.cpp to ggerganov/llama.cpp@4227c9be4268ac844921b90f31595f81236bd317
@@ -105,7 +142,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- fix: Fix memory allocation of ndarray by @xu-song in #1704
- fix: Use system message in og qwen format by @abetlen in 98eb092d3c6e7c142c4ba2faaca6c091718abbb3


## [0.2.90]

- feat: Update llama.cpp to ggerganov/llama.cpp@1d1ccce67613674c75c9c7e3fa4c1e24e428ba48
@@ -120,7 +156,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [0.2.88]

- feat: Update llama.cpp to ggerganov/llama.cpp@fc4ca27b25464a11b3b86c9dbb5b6ed6065965c2
- fix: only print 'cache saved' in verbose mode by @lsorber in #1668
- fix: only print 'cache saved' in verbose mode by @lsorber in #1668
- fix: Added back from_file method to LlamaGrammar by @ExtReMLapin in #1673
- fix: grammar prints on each call by @abetlen in 0998ea0deea076a547d54bd598d6b413b588ee2b
- feat: Enable recursive search of HFFS.ls when using from_pretrained by @benHeidabetlen in #1656
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -154,6 +154,10 @@ if (LLAMA_BUILD)
endif()

# Building llava
# Set LLAMA_INSTALL_VERSION if not already defined (needed by mtmd CMakeLists.txt)
if(NOT DEFINED LLAMA_INSTALL_VERSION)
set(LLAMA_INSTALL_VERSION "0.0.0" CACHE STRING "llama.cpp install version" FORCE)
endif()
add_subdirectory(vendor/llama.cpp/tools/mtmd)

if (WIN32)
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.3.16"
__version__ = "0.3.18"
100 changes: 61 additions & 39 deletions llama_cpp/_internals.py
@@ -355,7 +355,9 @@ def get_embeddings_seq(self, seq_id: int):
# Sampling functions - deprecated, use LlamaSampler instead

def set_rng_seed(self, seed: int):
raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"set_rng_seed is deprecated, use LlamaSampler instead"
)

def sample_repetition_penalties(
self,
@@ -366,30 +368,44 @@ def sample_repetition_penalties(
penalty_freq: float,
penalty_present: float,
):
raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_repetition_penalties is deprecated, use LlamaSampler instead"
)

def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_softmax is deprecated, use LlamaSampler instead"
)

def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_top_k is deprecated, use LlamaSampler instead"
)

def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_top_p is deprecated, use LlamaSampler instead"
)

def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_min_p is deprecated, use LlamaSampler instead"
)

def sample_typical(
self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
):
raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_typical is deprecated, use LlamaSampler instead"
)

def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead")

def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_grammar is deprecated, use LlamaSampler instead"
)

def sample_token_mirostat(
self,
@@ -399,7 +415,9 @@ def sample_token_mirostat(
m: int,
mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
) -> int:
raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_token_mirostat is deprecated, use LlamaSampler instead"
)

def sample_token_mirostat_v2(
self,
@@ -408,17 +426,25 @@ def sample_token_mirostat_v2(
eta: float,
mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
) -> int:
raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_token_mirostat_v2 is deprecated, use LlamaSampler instead"
)

def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_token_greedy is deprecated, use LlamaSampler instead"
)

def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_token is deprecated, use LlamaSampler instead"
)

# Grammar
def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"grammar_accept_token is deprecated, use LlamaSampler instead"
)

def reset_timings(self):
llama_cpp.llama_perf_context_reset(self.ctx)
@@ -602,16 +628,16 @@ def sample(
logits_array: Optional[npt.NDArray[np.single]] = None,
):
# This method is deprecated in favor of using LlamaSampler directly
raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"LlamaSamplingContext.sample is deprecated, use LlamaSampler instead"
)

def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool):
self.prev.append(id)


class CustomSampler:
def __init__(
self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]
):
def __init__(self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]):
self.apply_func = apply_func

def apply_wrapper(
@@ -673,7 +699,7 @@ def add_dist(self, seed: int):
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

def add_softmax(self):
sampler = llama_cpp.llama_sampler_init_softmax()
sampler = llama_cpp.llama_sampler_init_temp(1.0)
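# NOTE: temp(1.0) leaves logits unchanged; it stands in for the removed softmax sampler (downstream samplers such as dist normalize probabilities themselves)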
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

def add_top_k(self, k: int):
@@ -723,28 +749,28 @@ def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar):
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

def add_grammar_lazy_patterns(
self,
model: LlamaModel,
self,
model: LlamaModel,
grammar: LlamaGrammar,
trigger_patterns: List[str],
trigger_tokens: List[int]
trigger_tokens: List[int],
):
# Convert patterns to C array
pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))()
for i, pattern in enumerate(trigger_patterns):
pattern_ptrs[i] = pattern.encode("utf-8")

# Convert tokens to C array
token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens)

sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
model.vocab,
grammar._grammar.encode("utf-8"),
grammar._root.encode("utf-8"),
pattern_ptrs,
len(trigger_patterns),
token_array,
len(trigger_tokens)
len(trigger_tokens),
)
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

@@ -771,13 +797,13 @@ def add_dry(
dry_base: float,
dry_allowed_length: int,
dry_penalty_last_n: int,
seq_breakers: List[str]
seq_breakers: List[str],
):
# Convert seq_breakers to C array
breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))()
for i, breaker in enumerate(seq_breakers):
breaker_ptrs[i] = breaker.encode("utf-8")

sampler = llama_cpp.llama_sampler_init_dry(
model.vocab,
n_ctx_train,
Expand All @@ -786,25 +812,19 @@ def add_dry(
dry_allowed_length,
dry_penalty_last_n,
breaker_ptrs,
len(seq_breakers)
len(seq_breakers),
)
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

def add_logit_bias(
self,
n_vocab: int,
logit_bias: Dict[int, float]
):
def add_logit_bias(self, n_vocab: int, logit_bias: Dict[int, float]):
# Convert logit_bias dict to C array
bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))()
for i, (token, bias) in enumerate(logit_bias.items()):
bias_array[i].token = token
bias_array[i].bias = bias

sampler = llama_cpp.llama_sampler_init_logit_bias(
n_vocab,
len(logit_bias),
bias_array
n_vocab, len(logit_bias), bias_array
)
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

@@ -838,15 +858,17 @@ def reset(self):
def clone(self):
# NOTE: Custom samplers cannot be cloned due to Python callback limitations
if self.custom_samplers:
raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers")

raise NotImplementedError(
"Cannot clone LlamaSampler that contains custom samplers"
)

cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler)
# Create a new wrapper around the cloned sampler
new_sampler = LlamaSampler.__new__(LlamaSampler)
new_sampler.sampler = cloned_sampler
new_sampler.custom_samplers = []
new_sampler._exit_stack = ExitStack()

def free_sampler():
if new_sampler.sampler is not None:
llama_cpp.llama_sampler_free(new_sampler.sampler)
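
For context, a minimal sketch of how the sampler-chain helpers reformatted above might be combined. It assumes `LlamaSampler()` takes no constructor arguments (as in the class above), that `add_top_p(p, min_keep)` keeps its current signature, and that the vocabulary size and bias values are illustrative placeholders; `llama_cpp._internals` is an internal module, not a stable public API:

```python
from llama_cpp._internals import LlamaSampler  # internal module, subject to change

sampler = LlamaSampler()    # assumed no-arg constructor building an empty chain
sampler.add_top_k(40)       # keep the 40 most likely tokens
sampler.add_top_p(0.95, 1)  # nucleus sampling, keep at least 1 token
sampler.add_logit_bias(n_vocab=32000, logit_bias={2: -100.0})  # placeholder vocab size; suppress token id 2
sampler.add_dist(seed=1234)  # terminal sampler that draws the token
```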