40 changes: 38 additions & 2 deletions CHANGELOG.md
@@ -7,6 +7,43 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.18]

- feat: Update llama.cpp to ggerganov/llama.cpp@3bcc99099 (b7868)
- feat: GLM 4.7 Flash support via upstream CUDA/Flash Attention optimizations
- feat: Qwen3 Next model support
- feat: Self-speculative decoding support (no draft model required)
- feat: Add `use_direct_io` field to `llama_model_params`
- feat: Add `llama_model_is_hybrid()` for hybrid models (Jamba, Granite); see the sketch after this list
- feat: Add `llama_max_tensor_buft_overrides()`
- feat: Add adapter metadata functions (`llama_adapter_meta_*`)
- feat: Add aLoRA invocation token functions (`llama_adapter_get_alora_*`)
- feat: Add `llama_memory_breakdown_print()` for debugging
- feat: Add `llama_log_get()` to retrieve current log callback
- feat: Add `llama_sampler_init_adaptive_p()` (commented, requires rebuild)
- fix: Map `flash_attn=False` to `DISABLED` (was incorrectly `AUTO`)
- NOTE: `llama_adapter_lora_free` is now deprecated (adapters freed with model)
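
A minimal sketch of calling the new capability query from the low-level bindings. The ctypes signature of `llama_model_is_hybrid()` (model pointer in, bool out) is assumed by analogy with the existing `llama_model_is_recurrent`, the model path is a placeholder, and `llm._model.model` is an internal attribute rather than a stable API:

```python
import llama_cpp
from llama_cpp import Llama

llm = Llama(model_path="model.gguf", verbose=False)  # placeholder path

# Assumption: llama_model_is_hybrid takes the raw llama_model pointer and
# returns a bool, mirroring llama_model_is_recurrent; llm._model.model is
# an implementation detail of the current wrapper, not a public interface.
is_hybrid = llama_cpp.llama_model_is_hybrid(llm._model.model)
print("hybrid (Jamba/Granite-style) memory:", bool(is_hybrid))
```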

## [0.3.17]

- feat: Update llama.cpp to ggerganov/llama.cpp@95ea9e086 (b7652)
- feat: Add `flash_attn_type` parameter to `Llama()` for forward compatibility
- feat: Add `LLAMA_FLASH_ATTN_TYPE_*` enum (AUTO, DISABLED, ENABLED)
- feat: Add `LLAMA_PARAMS_FIT_STATUS_*` enum
- feat: Add `LLAMA_MODEL_META_KEY_*` enum constants
- feat: Add `LLAMA_ROPE_TYPE_IMROPE` constant
- feat: Add `no_host` and `no_alloc` fields to `llama_model_params`
- feat: Add `flash_attn_type`, `samplers`, `n_samplers` fields to `llama_context_params`
- feat: Add `llama_n_ctx_seq`, `llama_model_n_embd_inp`, `llama_model_n_embd_out` functions
- fix: Replace deprecated `llama_sampler_init_softmax` with `llama_sampler_init_temp(1.0)` in LlamaSampler.add_softmax()
- fix: Update embed() to use wrapper methods instead of direct C API calls
- fix: Add `LLAMA_INSTALL_VERSION` fallback in CMakeLists.txt for mtmd build
- removed: `llama_get_kv_self` binding (use `llama_get_memory` wrapper instead)
- removed: `llama_kv_self_*` bindings (11 functions, use `llama_memory_*` wrappers instead)
- removed: `llama_sampler_init_softmax` binding
- BREAKING (C API): `flash_attn` bool replaced with `flash_attn_type` enum in context params
- NOTE: Python API backward compatible: both `flash_attn` bool and `flash_attn_type` enum work (see the sketch after this list)
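
A hedged sketch of the two equivalent ways to configure flash attention after this release. The model path is a placeholder, and it is assumed that the `LLAMA_FLASH_ATTN_TYPE_*` constants are re-exported at the package level like the other `LLAMA_*` enums:

```python
import llama_cpp
from llama_cpp import Llama

# New style: explicit enum (AUTO / DISABLED / ENABLED).
llm_new = Llama(
    model_path="model.gguf",  # placeholder path
    flash_attn_type=llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED,
)

# Old style still accepted: the bool is translated to the enum internally
# (and, as of 0.3.18, flash_attn=False maps to DISABLED instead of AUTO).
llm_old = Llama(model_path="model.gguf", flash_attn=True)
```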

## [0.3.16]

- feat: Update llama.cpp to ggerganov/llama.cpp@4227c9be4268ac844921b90f31595f81236bd317
@@ -105,7 +142,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- fix: Fix memory allocation of ndarray by @xu-song in #1704
- fix: Use system message in og qwen format by @abetlen in 98eb092d3c6e7c142c4ba2faaca6c091718abbb3


## [0.2.90]

- feat: Update llama.cpp to ggerganov/llama.cpp@1d1ccce67613674c75c9c7e3fa4c1e24e428ba48
@@ -120,7 +156,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [0.2.88]

- feat: Update llama.cpp to ggerganov/llama.cpp@fc4ca27b25464a11b3b86c9dbb5b6ed6065965c2
- fix: only print 'cache saved' in verbose mode by @lsorber in #1668
- fix: only print 'cache saved' in verbose mode by @lsorber in #1668
- fix: Added back from_file method to LlamaGrammar by @ExtReMLapin in #1673
- fix: grammar prints on each call by @abetlen in 0998ea0deea076a547d54bd598d6b413b588ee2b
- feat: Enable recursive search of HFFS.ls when using from_pretrained by @benHeidabetlen in #1656
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -154,6 +154,10 @@ if (LLAMA_BUILD)
endif()

# Building llava
# Set LLAMA_INSTALL_VERSION if not already defined (needed by mtmd CMakeLists.txt)
if(NOT DEFINED LLAMA_INSTALL_VERSION)
set(LLAMA_INSTALL_VERSION "0.0.0" CACHE STRING "llama.cpp install version" FORCE)
endif()
add_subdirectory(vendor/llama.cpp/tools/mtmd)

if (WIN32)
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.3.16"
__version__ = "0.3.18"
100 changes: 61 additions & 39 deletions llama_cpp/_internals.py
@@ -355,7 +355,9 @@ def get_embeddings_seq(self, seq_id: int):
# Sampling functions - deprecated, use LlamaSampler instead

def set_rng_seed(self, seed: int):
raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"set_rng_seed is deprecated, use LlamaSampler instead"
)

def sample_repetition_penalties(
self,
@@ -366,30 +368,44 @@ def sample_repetition_penalties(
penalty_freq: float,
penalty_present: float,
):
raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_repetition_penalties is deprecated, use LlamaSampler instead"
)

def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_softmax is deprecated, use LlamaSampler instead"
)

def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_top_k is deprecated, use LlamaSampler instead"
)

def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_top_p is deprecated, use LlamaSampler instead"
)

def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_min_p is deprecated, use LlamaSampler instead"
)

def sample_typical(
self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
):
raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_typical is deprecated, use LlamaSampler instead"
)

def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead")

def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_grammar is deprecated, use LlamaSampler instead"
)

def sample_token_mirostat(
self,
@@ -399,7 +415,9 @@ def sample_token_mirostat(
m: int,
mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
) -> int:
raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_token_mirostat is deprecated, use LlamaSampler instead"
)

def sample_token_mirostat_v2(
self,
@@ -408,17 +426,25 @@ def sample_token_mirostat_v2(
eta: float,
mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
) -> int:
raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_token_mirostat_v2 is deprecated, use LlamaSampler instead"
)

def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_token_greedy is deprecated, use LlamaSampler instead"
)

def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"sample_token is deprecated, use LlamaSampler instead"
)

# Grammar
def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"grammar_accept_token is deprecated, use LlamaSampler instead"
)

def reset_timings(self):
llama_cpp.llama_perf_context_reset(self.ctx)
@@ -602,16 +628,16 @@ def sample(
logits_array: Optional[npt.NDArray[np.single]] = None,
):
# This method is deprecated in favor of using LlamaSampler directly
raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead")
raise NotImplementedError(
"LlamaSamplingContext.sample is deprecated, use LlamaSampler instead"
)

def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool):
self.prev.append(id)


class CustomSampler:
def __init__(
self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]
):
def __init__(self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]):
self.apply_func = apply_func

def apply_wrapper(
@@ -673,7 +699,7 @@ def add_dist(self, seed: int):
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

def add_softmax(self):
sampler = llama_cpp.llama_sampler_init_softmax()
sampler = llama_cpp.llama_sampler_init_temp(1.0)
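# NOTE: temp(1.0) leaves logits unchanged; it stands in for the removed softmax sampler (downstream samplers such as dist normalize probabilities themselves)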
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

def add_top_k(self, k: int):
@@ -723,28 +749,28 @@ def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar):
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

def add_grammar_lazy_patterns(
self,
model: LlamaModel,
self,
model: LlamaModel,
grammar: LlamaGrammar,
trigger_patterns: List[str],
trigger_tokens: List[int]
trigger_tokens: List[int],
):
# Convert patterns to C array
pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))()
for i, pattern in enumerate(trigger_patterns):
pattern_ptrs[i] = pattern.encode("utf-8")

# Convert tokens to C array
token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens)

sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
model.vocab,
grammar._grammar.encode("utf-8"),
grammar._root.encode("utf-8"),
pattern_ptrs,
len(trigger_patterns),
token_array,
len(trigger_tokens)
len(trigger_tokens),
)
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

@@ -771,13 +797,13 @@ def add_dry(
dry_base: float,
dry_allowed_length: int,
dry_penalty_last_n: int,
seq_breakers: List[str]
seq_breakers: List[str],
):
# Convert seq_breakers to C array
breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))()
for i, breaker in enumerate(seq_breakers):
breaker_ptrs[i] = breaker.encode("utf-8")

sampler = llama_cpp.llama_sampler_init_dry(
model.vocab,
n_ctx_train,
Expand All @@ -786,25 +812,19 @@ def add_dry(
dry_allowed_length,
dry_penalty_last_n,
breaker_ptrs,
len(seq_breakers)
len(seq_breakers),
)
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

def add_logit_bias(
self,
n_vocab: int,
logit_bias: Dict[int, float]
):
def add_logit_bias(self, n_vocab: int, logit_bias: Dict[int, float]):
# Convert logit_bias dict to C array
bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))()
for i, (token, bias) in enumerate(logit_bias.items()):
bias_array[i].token = token
bias_array[i].bias = bias

sampler = llama_cpp.llama_sampler_init_logit_bias(
n_vocab,
len(logit_bias),
bias_array
n_vocab, len(logit_bias), bias_array
)
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

@@ -838,15 +858,17 @@ def reset(self):
def clone(self):
# NOTE: Custom samplers cannot be cloned due to Python callback limitations
if self.custom_samplers:
raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers")

raise NotImplementedError(
"Cannot clone LlamaSampler that contains custom samplers"
)

cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler)
# Create a new wrapper around the cloned sampler
new_sampler = LlamaSampler.__new__(LlamaSampler)
new_sampler.sampler = cloned_sampler
new_sampler.custom_samplers = []
new_sampler._exit_stack = ExitStack()

def free_sampler():
if new_sampler.sampler is not None:
llama_cpp.llama_sampler_free(new_sampler.sampler)
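
For context, a minimal sketch of how the sampler-chain helpers reformatted above might be combined. It assumes `LlamaSampler()` takes no constructor arguments (as in the class above), that `add_top_p(p, min_keep)` keeps its current signature, and that the vocabulary size and bias values are illustrative placeholders; `llama_cpp._internals` is an internal module, not a stable public API:

```python
from llama_cpp._internals import LlamaSampler  # internal module, subject to change

sampler = LlamaSampler()    # assumed no-arg constructor building an empty chain
sampler.add_top_k(40)       # keep the 40 most likely tokens
sampler.add_top_p(0.95, 1)  # nucleus sampling, keep at least 1 token
sampler.add_logit_bias(n_vocab=32000, logit_bias={2: -100.0})  # placeholder vocab size; suppress token id 2
sampler.add_dist(seed=1234)  # terminal sampler that draws the token
```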