Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
1301 commits
Select commit Hold shift + click to select a range
33f890e
vulkan: support flash attention GQA/split_k with small batches (#18938)
jeffbolznv Jan 21, 2026
fbbf3ad
server: /v1/responses (partial) (#18486)
openingnow Jan 21, 2026
14be5a3
common : improve error message when HTTPS is missing but required (#1…
angt Jan 21, 2026
bd544c9
vulkan: Remove transfer_ctx, do everything in compute_ctx. (#18945)
jeffbolznv Jan 21, 2026
9da3dcd
llama : clarify nemotron-h.cpp comment about RoPE [no ci] (#18997)
danbev Jan 21, 2026
3802d3c
fix: Use `tabular-nums` for chat message statistics (#18915)
nathanlesage Jan 21, 2026
c301172
jinja: support none|string (#18995)
pwilkin Jan 21, 2026
77078e8
convert : add Devstral-2 (Ministral3ForCausalLM) arch (#18972)
emsi Jan 21, 2026
6b99a22
ci : update GitHub Actions versions [no ci] (#18935)
pgoslatara Jan 21, 2026
94242a6
ggml-zdnn : mark zDNN buffers as non-host (#18967)
AlekseiNikiforovIBM Jan 22, 2026
5516b9c
opencl: add TRI op support (#18979)
shaofeiqi Jan 22, 2026
b70d251
CUDA: add gqa_ratio 4 for GLM 4.7 flash (#18953)
am17an Jan 22, 2026
c6926d1
server: Reorder methods in `server-task.cpp` (#19016)
openingnow Jan 22, 2026
9eb5bfe
mtmd : update docs to use llama_model_n_embd_inp (#18999)
ngxson Jan 22, 2026
8b30840
release: update github api (#19022)
taronaeo Jan 22, 2026
0e4ebeb
quant : manual overrides of tensor types take precedence (#18952)
ggerganov Jan 22, 2026
4e595b2
server: do not log certain endpoints (avoid log spam) (#19028)
ngxson Jan 22, 2026
9c96465
opencl: enable the general fp mm for non-cont input and as a fallback…
lhez Jan 22, 2026
e34d6d0
convert_hf_to_gguf.py: refactor modify_tensors to call super (#18866)
am17an Jan 22, 2026
e2baf02
CUDA: fix alignment check for FA (#19023)
JohannesGaessler Jan 22, 2026
a5eaa1d
mla : make the V tensor a view of K (#18986)
ggerganov Jan 22, 2026
51fa458
server : support preserving reasoning_content in assistant message (#…
ngxson Jan 22, 2026
a3e8128
cli : load parser definition (#19031)
aldehir Jan 23, 2026
091a46c
ggml-cpu: aarm64: q5_K repack gemm and gemv (and generic) implementat…
Alcpz Jan 23, 2026
a14b960
model-conversion : use BUILD_DIR variable in all scripts (#19015)
danbev Jan 23, 2026
b5b8fa1
chat : fix translategemma crash on common_chat_format_example (#19019)
ngxson Jan 23, 2026
cb6caca
[SYCL] use malloc to support both iGPU and dGPU in same time (#18992)
arthw Jan 23, 2026
557515b
graph : utilize `ggml_build_forward_select()` to avoid reallocations …
ggerganov Jan 23, 2026
8af1f5f
ggml-hexagon: flash-attn opt (#19025)
chraac Jan 24, 2026
81ab64f
ggml-cuda: enable cuda-graphs for `n-cpu-moe` (#18934)
am17an Jan 24, 2026
8f91ca5
CUDA: re-use MLA K data for V in MMA FA (#19057)
JohannesGaessler Jan 24, 2026
bb02f74
chat: fix language input for translategemma (#19052)
ngxson Jan 24, 2026
4e5b83b
GGUF: check that tensor size is representable (#19072)
JohannesGaessler Jan 24, 2026
e9fd8dc
llama-fit-params: keep explicit --ctx-size 0 (#19070)
JohannesGaessler Jan 24, 2026
9981c30
convert : fix conversion for inheriting models that were bypassing mo…
bartowski1182 Jan 25, 2026
16639ba
common : use two decimal places for float arg help messages (#19048)
danbev Jan 25, 2026
24bc238
llama: fix integer type consistency in split helpers (#18894)
MaheshJakkala Jan 25, 2026
1243f93
readme: update RWKV7 model links (#19061)
MollySophia Jan 25, 2026
080b161
completion : fix prompt cache for recurrent models (#19045)
ggerganov Jan 25, 2026
70d8608
convert : fix Gemma3N, GraniteMoe and Ernie4.5Moe (#19084)
CISC Jan 25, 2026
d9c6ce4
kv-cache : support V-less cache (#19067)
ggerganov Jan 25, 2026
bcb4316
ggml-cpu: Use tiled FA for prompt-processing (#19012)
am17an Jan 25, 2026
0bf5636
convert : yield Gemma3N custom_map tensors directly (#19091)
CISC Jan 25, 2026
0440bfd
metal : fix recommendedMaxWorkingSetSize availability on legacy iOS/m…
ccbinn Jan 25, 2026
0c21677
CUDA: faster FA for GQA > 1 but not power of 2 (#19092)
JohannesGaessler Jan 25, 2026
56f3ebf
model : add correct type for GLM 4.7 Flash (#19106)
ggerganov Jan 26, 2026
142cbe2
ci : use new 1vCPU runner for lightweight jobs (#19107)
CISC Jan 26, 2026
8f80d1b
graph : fix nkvo offload with FA (#19105)
ggerganov Jan 26, 2026
b0311c1
CUDA: fix padding of GQA to power of 2 in FA (#19115)
JohannesGaessler Jan 26, 2026
94eeb59
opencl: add flattened q6_K mv (#19054)
lhez Jan 27, 2026
7afdfc9
ggml-cpu: Enable FP16 MMA kernels on PPC (#19060)
shalinib-ibm Jan 27, 2026
fc3cdf3
common : clarify HTTPS build options in error message (#19103)
danbev Jan 27, 2026
a83c73a
[CUDA] Reduce CPU-side stalls due to the CUDA command buffer being fu…
gaugarg-nv Jan 27, 2026
be8890e
ggml-cpu: aarm64: q6_K repack gemm and gemv (and generic) implementat…
Alcpz Jan 27, 2026
c0204a0
ci : revert slim runner for winget (#19129)
CISC Jan 27, 2026
a5bb8ba
CUDA: tune GLM 4.7 Flash FA kernel selection logic (#19097)
JohannesGaessler Jan 27, 2026
68ac3ac
docs: Remove duplicated word on CUDA build section (#19136)
davelima Jan 27, 2026
2b4cbd2
jinja : implement mixed type object keys (#18955)
CISC Jan 27, 2026
f2571df
ggml-zendnn : update ZenDNN git tag to main branch (#19133)
z-vishal Jan 27, 2026
06961e2
ggml webgpu: Split shared state (webgpu_context) into global state an…
nikhilJain17 Jan 28, 2026
eef375c
sampling : remove sampling branching in output_reserve (#18811)
danbev Jan 28, 2026
c5c64f7
llama : disable Direct IO by default (#19109)
ggerganov Jan 28, 2026
b931f81
server : adjust spec tests to generate up to 16 tokens (#19093)
ggerganov Jan 28, 2026
2eee6c8
CUDA: tune GLM 4.7 Flash FA kernel selection logic (DGX Spark) (#19142)
ggerganov Jan 28, 2026
631cbfc
cuda : fix "V is K view" check for non-unified KV cache (#19145)
ggerganov Jan 28, 2026
6ad70c5
ggml-cpu: arm64: Q4_K scale unroll and vectorization (#19108)
Alcpz Jan 28, 2026
b7feacf
ggml: new backend for Virglrenderer API Remoting acceleration (v2) (#…
kpouget Jan 28, 2026
0a95026
doc: add build instruction to use Vulkan backend on macos (#19029)
chanbengz Jan 28, 2026
88d23ad
vulkan: handle device dedup on MacOS + Vega II Duo cards (#19058)
okuvshynov Jan 28, 2026
60368e1
jinja : undefined should be treated as sequence/iterable (return stri…
CISC Jan 28, 2026
0cd7032
ggml-sycl: remove unused syclcompat header (#19140)
PatKamin Jan 28, 2026
ebf5725
convert : yield Mamba2Model/GraniteMoeModel modify_tensors (#19157)
danbev Jan 28, 2026
72d3b18
spec : add self‑speculative decoding (no draft model required) + refa…
srogmann Jan 28, 2026
f6b533d
Vulkan Flash Attention Coopmat1 Refactor (#19075)
0cc4m Jan 28, 2026
50e8962
ci : find latest release with asset for winget (#19161)
CISC Jan 28, 2026
d4964a7
sycl: fix norm kernels: l2_norm, group_norm, rms_norm by remove asser…
arthw Jan 29, 2026
3bcc990
CUDA: refactor topk-moe to enable more models (GLM 4.7, Nemotron etc.…
am17an Jan 29, 2026
b33df26
ggml-zendnn : resolve ZenDNN backend cross-module symbol dependency (…
z-vishal Jan 29, 2026
eed25bc
arg : add -kvu to llama-batched-bench (#19172)
ggerganov Jan 29, 2026
f3dd7b8
HIP: add mmf for CDNA (#18896)
zhang-hui-yulo Jan 29, 2026
b45ef27
jinja : do not pass empty tools and add some none filters (#19176)
CISC Jan 29, 2026
84b0a98
webui: Update Svelte to fix effect_update_depth_exceeded errors (#19144)
amarshall Jan 29, 2026
7b7ae85
chat : add parsing for solar-open-100b (#18540)
aldehir Jan 29, 2026
4fdbc1e
cuda : fix nkvo, offload and cuda graph node properties matching (#19…
ggerganov Jan 29, 2026
ce38a4d
hexagon: enable offloading to Hexagon on Windows on Snapdragon (#19150)
tboinovski1 Jan 29, 2026
bd90fc7
ggml-webgpu: improve flastAttention performance by software pipelinin…
ArberSephirotheca Jan 29, 2026
d284baf
Fix typos in SYCL documentation (#19162)
DDXDB Jan 30, 2026
c7358dd
sycl: implement GGML_OP_TRI (#19089)
RachelMantel Jan 30, 2026
1025fd2
sycl: implement GGML_UNARY_OP_SOFTPLUS (#19114)
s8322 Jan 30, 2026
ecbf01d
add tensor type checking as part of cuda graph properties (#19186)
bssrdf Jan 30, 2026
b316895
docs: Add LlamaLib to UI projects (#19181)
amakropoulos Jan 30, 2026
83bcdf7
memory : remove unused tmp_buf (#19199)
danbev Jan 30, 2026
0562503
convert : add missing return statement for GraniteMoeModel (#19202)
danbev Jan 30, 2026
c3b87ce
tests : add GQA=20 FA test (#19095)
ggerganov Jan 30, 2026
f3bc988
memory : clarify comments for r_l and s_l tensors [no ci] (#19203)
danbev Jan 30, 2026
2e916f9
jinja : add unordered_map include to value.h [no ci] (#19205)
mseri Jan 30, 2026
dabaa2e
spec : add ngram-mod (#19164)
ggerganov Jan 30, 2026
13f3ebf
Correctly fetch q8_1 quantize pipeline in test as needed by 8a3519b (…
sredman Jan 30, 2026
bbada8b
server : wrap around the "id_slot" parameter (#19207)
ggerganov Jan 30, 2026
dfd6106
cuda : fix compile warnings (whisper/0)
ggerganov Jan 30, 2026
d9a2a4b
sync : ggml
ggerganov Jan 30, 2026
971facc
opencl: add optimized q8_0 mm kernel for adreno (#18871)
shaofeiqi Jan 30, 2026
4927795
ngram-mod : fix build [no ci] (#19216)
ggerganov Jan 30, 2026
1488339
lookup, lookahead: fix crash when n_ctx not specified (#18729)
pestopoppa Jan 30, 2026
ec6c742
mtmd: support MiniCPM-o 4.5(vision only) (#19211)
tc-mb Jan 30, 2026
3dd9591
quantize: add option --tensor-type-file to llama-quantize (#18572)
EugeoSynthesisThirtyTwo Jan 31, 2026
89f10ba
ggml-hexagon: flash-attention and reduce-sum optimizations (#19141)
chraac Jan 31, 2026
41ea261
nix: fix nix develop .#python-scripts (#19218)
teto Jan 31, 2026
2634ed2
create test.sh to enhance the parameters for testing, update the guid…
arthw Feb 1, 2026
8a98ba4
nix: fix allowUnfreePredicate for packages with multiple licenses (#1…
typedrat Feb 1, 2026
3bc8d2c
Bump cmake max version (needed for Windows on Snapdragon builds) (#19…
max-krasnyansky Feb 1, 2026
2dc3ce2
Remove pipeline cache mutexes (#19195)
nikhilJain17 Feb 2, 2026
b4d05a3
spec : various improvements ton ngram-map + docs (#19253)
srogmann Feb 2, 2026
7a4ca3c
docs : Minor cleanups (#19252)
ckastner Feb 2, 2026
1239267
authors : update (#19263)
ggerganov Feb 2, 2026
59377a6
ggml-backend: fix async set/get fallback sync (#19179)
JohannesGaessler Feb 2, 2026
6156ae5
model-conversion : add debug option to conversion script (#19265)
danbev Feb 2, 2026
6fdddb4
metal : support virtual devices (#18919)
ggerganov Feb 2, 2026
4d5e972
sycl: implement GGML_OP_TOP_K (#19242)
tdevelope Feb 2, 2026
bf38346
Remove support for Nvidia & AMD GPU, because the oneAPI plugin for Nv…
arthw Feb 2, 2026
15818ac
ci: add test-backend-ops test for CPU (#19268)
am17an Feb 2, 2026
a3fa035
server: print actual model name in 'model not found" error (#19117)
teto Feb 2, 2026
9f682fb
ggml-cpu: FA split across kv for faster TG (#19209)
am17an Feb 2, 2026
07a7412
mtmd: add min/max pixels gguf metadata (#19273)
ngxson Feb 2, 2026
0dfcd3b
jinja : add missing 'in' test to template engine (#19004) (#19239)
sidmohan0 Feb 2, 2026
91ea44e
opencl: refactor some ops, concat, repeat, tanh and scale (#19226)
lhez Feb 2, 2026
aeb827a
spec : simplify time measurement using common_time_meas (#19262)
ggerganov Feb 3, 2026
1efb5f7
vocab: add Falcon-H1-Tiny-Coder FIM tokens (#19249)
vhsw Feb 3, 2026
41e3f02
cuda : revert CUDA_SCALE_LAUNCH_QUEUES override until investigated (#…
gaugarg-nv Feb 3, 2026
e9a859d
ggml: added cleanups in ggml_quantize_free (#19278)
noctrex Feb 3, 2026
1f1e57f
CUDA: Fix loop unrolling for BW in mul_mat_q_stream_k_fixup (#19053)
ORippler Feb 3, 2026
c55bce4
metal : minor cleanup (#19251)
ggerganov Feb 3, 2026
a6fd8ca
models : remove unnecessary cont in openelm (#19289)
CISC Feb 3, 2026
8bece2e
CUDA: use mmvq for mul-mat-id for small batch sizes (#18958)
am17an Feb 3, 2026
32b17ab
vulkan: disable coopmat1 fa on Nvidia Turing (#19290)
0cc4m Feb 3, 2026
faa1bc2
sampling : delegate input allocation to the scheduler (#19266)
ggerganov Feb 3, 2026
6a9bf2f
ci : add sanitizer runs for server (#19291)
ggerganov Feb 3, 2026
44008ce
metal : add solve_tri (#19302)
ggerganov Feb 3, 2026
2ceda3f
ggml-cpu: use LUT for converting e8->f32 scales on x86 (#19288)
am17an Feb 4, 2026
015deb9
ggml-virtgpu: make the code thread safe (#19204)
kpouget Feb 4, 2026
25f40ca
completion : simplify batch (embd) processing (#19286)
danbev Feb 4, 2026
d838c22
spec : fix the check-rate logic of ngram-simple (#19261)
ggerganov Feb 4, 2026
6ab881b
model-conversion : add tensor-info.py utility (#18954)
danbev Feb 4, 2026
eaba92c
tests : add non-cont, inplace rope tests (#19296)
ggerganov Feb 4, 2026
8abcc70
model: (qwen3next) correct vectorized key_gdiff calculation (#19324)
ngxson Feb 4, 2026
423bee4
ci : fix sanitize workflow to enable ggml sanitizers too (#19323)
ggerganov Feb 4, 2026
e0c93af
debug: make common_debug_print_tensor readable (#19331)
ngxson Feb 4, 2026
b536eb0
codeowners : add danbev for examples/debug (#19332)
danbev Feb 4, 2026
e6e934c
vendor: update cpp-httplib version (#19313)
taronaeo Feb 4, 2026
11fb327
vendor : add missing llama_add_compile_flags (#19322)
CISC Feb 5, 2026
af252d0
metal : add missing includes (#19348)
will-lms Feb 5, 2026
c342c3b
vulkan: fix non-contig rope (#19299)
jeffbolznv Feb 5, 2026
3409ab8
vulkan: Set k_load_shmem to false when K is too large (#19301)
jeffbolznv Feb 5, 2026
a498c75
vulkan: fix GPU deduplication logic. (#19222)
okuvshynov Feb 5, 2026
7a4f97d
metal : add diag (#19330)
ggerganov Feb 5, 2026
a4ea7a1
vendor : update BoringSSL to 0.20260204.0 (#19333)
angt Feb 5, 2026
b828e18
docker : fix vulkan build (#19352)
CISC Feb 5, 2026
3795cc1
benches : update models + numbers (#19359)
ggerganov Feb 5, 2026
449ec2a
vulkan: Preprocess FA mask to detect all-neg-inf and all-zero. (#19281)
jeffbolznv Feb 5, 2026
22cae83
metal : adaptive CPU/GPU interleave based on number of nodes (#19369)
ggerganov Feb 5, 2026
3e21647
cuda : cuda graphs now compare all node params (#19383)
ggerganov Feb 6, 2026
e696cfc
llama : rename llama-sampling to llama-sampler (#19363)
danbev Feb 6, 2026
7fcf1ef
metal : skip loading all-zero mask (#19337)
ggerganov Feb 6, 2026
f9bd518
vulkan: make FA mask/softcap enables spec constants (#19309)
jeffbolznv Feb 6, 2026
1946e46
vulkan: For coopmat2 FA, use fp16 accumulators for the final result (…
jeffbolznv Feb 6, 2026
3688c4f
Kimi-Linear support (backend agnostic + MLA KV cache) (#18755)
ymcki Feb 6, 2026
06bf379
unicode : MSVC regex fix (#19340)
Iemand005 Feb 6, 2026
dfde599
common : add common_speculative_is_compat() (#19270)
ggerganov Feb 6, 2026
db6adb3
tests: reduce number of FA test permutations (#19381)
jeffbolznv Feb 6, 2026
537eadb
sycl: add F16 support for GGML_OP_CEIL (#19306)
NechamaKrashinski Feb 6, 2026
7fbd36c
ggml-webgpu: JIT compile binary operators and handle binding overlaps…
abhijitramesh Feb 6, 2026
3228e77
gguf-py : bump sentencepiece version (#19319)
Ahajha Feb 6, 2026
b831118
model : support Step3.5-Flash (#19283)
forforever73 Feb 6, 2026
34ba7b5
metal : fix event synchronization in cpy_tensor_async (#19402)
ggerganov Feb 7, 2026
8872ad2
metal : consolidate bin kernels (#19390)
ggerganov Feb 7, 2026
96441c9
ci : use -j param correctly when building with sanitizers (#19411)
ggerganov Feb 7, 2026
9a5f577
ci : remove server job from webui and move slow test (#19424)
CISC Feb 8, 2026
5999b50
llama-quantize : cleanup `--help` output (#19317)
ddh0 Feb 8, 2026
eb449cd
server : improve context checkpoint logic (#19408)
ggerganov Feb 8, 2026
5fa1c19
rpc : update from common.cpp (#19400)
angt Feb 8, 2026
e06088d
CUDA: Fix non-contig rope (#19338)
ORippler Feb 8, 2026
39bf692
[Model] Qwen3.5 dense and MoE support (no vision) (#19435)
pwilkin Feb 8, 2026
1e8924f
cmake : add variable to skip installing tests (#19370)
WhyNotHugo Feb 9, 2026
f5e7734
ggml-virtgpu: add backend documentation (#19354)
kpouget Feb 9, 2026
972f323
revert : "[Model] Qwen3.5 dense and MoE support (no vision) (#19435)"…
ggerganov Feb 9, 2026
81ddc60
ci : add metal server workflows (#19293)
ggerganov Feb 9, 2026
292f690
spec : remove check rate (#19377)
srogmann Feb 9, 2026
820ebfa
Server: log when converting requests to chat completions format (#19457)
openingnow Feb 9, 2026
262364e
mtmd: Implement tiling for LFM2-VL (#19454)
tdakhran Feb 9, 2026
98e57ca
chat: fix case where template accepts type content only (#19419)
ngxson Feb 9, 2026
a0d5855
cuda : extend GGML_OP_PAD to work with non-cont src0 (#19429)
ggerganov Feb 10, 2026
52e38fa
CANN: implement quantized MUL_MAT_ID for MoE models (#19228)
hipudding Feb 10, 2026
f0bfe54
CANN: Remove unnecessary wrapper for `gml_backend_buft_is_cann` (#18968)
rauletorresc Feb 10, 2026
66d403c
tts : fix typos in README.md [no ci] (#19463)
danbev Feb 10, 2026
854b09f
convert : move experts permutation from Qwen2MoeModel to Qwen3VLMoeTe…
pwilkin Feb 10, 2026
6948adc
ggml : use noexcept overload for is_regular_file in backend registrat…
k4ss4n Feb 10, 2026
c03a5a4
ggml-cpu: arm64: q6_K repack gemm and gemv (and generic) implementati…
Alcpz Feb 10, 2026
9a96352
test: fix IMROPE perf test case (#19465)
ngxson Feb 10, 2026
fc0fe40
models : support qwen3.5 series (#19468)
JJJYmmm Feb 10, 2026
57487a6
[WebGPU] Plug memory leaks and free resources on shutdown (#19315)
nikhilJain17 Feb 10, 2026
612db61
CUDA : Update CCCL-tag for 3.2 to final release from RC (#19486)
ORippler Feb 10, 2026
2cce9fd
llama : refactor sampling_info to use buffer_view template (#19368)
danbev Feb 11, 2026
ceaa89b
metal : consolidate unary ops (#19490)
ggerganov Feb 11, 2026
89181c0
ggml : extend bin bcast for permuted src1 (#19484)
ggerganov Feb 11, 2026
6d95707
model : fix wavtokenizer embedding notions (#19479)
ggerganov Feb 11, 2026
8ee538c
llama : correct typos 'occured' and 'occurences' (#19414)
thecaptain789 Feb 11, 2026
73cd5e1
hexagon: Add ARGSORT, DIV, SQR, SQRT, SUM_ROWS, GEGLU (#19406)
max-krasnyansky Feb 11, 2026
0c1f39a
common : improve download error reporting (#19491)
angt Feb 11, 2026
ada90bf
docs: ban AI for issues and discussions [no CI] (#19512)
JohannesGaessler Feb 11, 2026
9ab072e
metal : extend l2_norm support for non-cont src0 (#19502)
ggerganov Feb 11, 2026
53de59f
build : fix case in dSYMs path for build-macos [no ci] (#19515)
danbev Feb 11, 2026
e463bbd
model: Add Kimi-K2.5 support (#19170)
AesSedai Feb 11, 2026
3136a84
common : remove unused token util functions (#19506)
danbev Feb 11, 2026
914dde7
ggml : unary ops support non-cont src0 + metal F16 unary ops (#19511)
ggerganov Feb 11, 2026
4d3daf8
opencl: add general Q6_K mm and Q4_K mv (#19347)
lhez Feb 11, 2026
4ae1b75
common : replace deprecated codecvt using parse_utf8_codepoint (#19517)
angt Feb 12, 2026
b1ff83b
hexagon: further optimization and tuning of matmul and dot kernels (#…
max-krasnyansky Feb 12, 2026
313493d
docs : update path in snapdragon README.md (#19533)
TriDefender Feb 12, 2026
fa16e51
server : fix typo in README.md for features list (#19510)
RichardScottOZ Feb 12, 2026
6845f7f
Add a workaround for compilation with ROCWMMA_FATTN and gfx9 (#19461)
superm1 Feb 12, 2026
3b3a948
metal : update sum_rows kernel to support float4 (#19524)
ggerganov Feb 12, 2026
38adc7d
WebUI Architecture Cleanup (#19541)
allozaur Feb 12, 2026
f486ce9
(webui) REFACTOR: UI primitives and polish (#19551)
allozaur Feb 12, 2026
ff59903
scripts : add support for forks in pr2wt.sh (#19540)
danbev Feb 12, 2026
4d688f9
(webui) FEATURE: Enable adding or injecting System Message into chat …
allozaur Feb 12, 2026
f488429
llama : update outdated comment in llama.h (#19428)
MonkeybreadSoftware Feb 12, 2026
4b385bf
vendor : update cpp-httplib (#19537)
angt Feb 12, 2026
4c61875
webui: Add switcher to Chat Message UI to show raw LLM output (#19571)
allozaur Feb 12, 2026
338085c
args : add -kvu to llama-parallel (#19577)
ggerganov Feb 12, 2026
79cc0f2
opencl: add basic support for q4_1 (#19534)
lhez Feb 12, 2026
3bb7813
hexagon: fix typo in vtcm_needs_release (#19545)
FanShupei Feb 12, 2026
490eb96
metal : support GGML_OP_SET (#19548)
ggerganov Feb 13, 2026
0644bae
metal : improve concurrency (#19555)
ggerganov Feb 13, 2026
bb96bfd
memory : fix kv cache size for hybrid models (#19559)
ggerganov Feb 13, 2026
2f5d8f8
vendor : update BoringSSL to 0.20260211.0 (#19562)
angt Feb 13, 2026
25224c8
llama : remove deprecated codecvt (#19565)
angt Feb 13, 2026
33a56f9
model : Kimi Linear fix conv state update (#19531)
ymcki Feb 13, 2026
423cf0b
docs : fix broken link and typo (#19560)
pavan-sh Feb 13, 2026
43919b7
CUDA: Do not mutate cgraph for fused ADDs (#19566)
ORippler Feb 13, 2026
5174d72
webui: UI and routing fixes (#19586)
allozaur Feb 13, 2026
5065da5
CUDA: loop over ne2*ne3 in case it overflows (#19538)
am17an Feb 13, 2026
b2ecc0c
support --verbose-prompt (#19576)
CISC Feb 13, 2026
0e21991
fix vulkan ggml_acc only works in 3d but not 4d (#19426)
ymcki Feb 13, 2026
cc2aa81
Fix wrong memcpy length for block_interleave == 4 (#19575)
Alcpz Feb 13, 2026
752584d
model: support GLM MoE DSA arch (NOTE: indexer is not yet supported) …
ngxson Feb 13, 2026
b48e80f
common : update download code (#19573)
angt Feb 13, 2026
05a6f0e
vulkan: restore -inf check in FA shaders (#19582)
jeffbolznv Feb 13, 2026
94a602d
github : add missing backends to issue templates (#19603)
mengshengwu Feb 13, 2026
0ccbfde
hexagon: further optimizations and refactoring for flash attention (#…
max-krasnyansky Feb 14, 2026
bf45430
feat:upgrade to 0ccbfde
lochjin Feb 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
20 changes: 10 additions & 10 deletions .devops/cann.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,17 @@
# ==============================================================================

# Define the CANN base image for easier version updates later
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
ARG CHIP_TYPE=910b
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11

# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

# Define the Ascend chip model for compilation. Default is Ascend910B3
ARG ASCEND_SOC_TYPE=Ascend910B3

# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
yum clean all && \
rm -rf /var/cache/yum

Expand All @@ -36,20 +34,22 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# For brevity, only core variables are listed here. You can paste the original ENV list here.

# -- Build llama.cpp --
# Use the passed ASCEND_SOC_TYPE argument and add general build options
# Use the passed CHIP_TYPE argument and add general build options
ARG CHIP_TYPE
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
&& \
cmake -B build \
-DGGML_CANN=ON \
-DCMAKE_BUILD_TYPE=Release \
-DSOC_TYPE=${ASCEND_SOC_TYPE} \
-DSOC_TYPE=ascend${CHIP_TYPE} \
-DUSE_ACL_GRAPH=ON \
. && \
cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

# Create a full directory to store all executables and Python scripts
RUN mkdir -p /app/full && \
Expand Down Expand Up @@ -108,11 +108,11 @@ ENTRYPOINT ["/app/tools.sh"]
# ENTRYPOINT ["/app/llama-server"]

### Target: light
# Lightweight image containing only llama-cli
# Lightweight image containing only llama-cli and llama-completion
# ==============================================================================
FROM base AS light

COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

ENTRYPOINT [ "/app/llama-cli" ]

Expand Down
6 changes: 3 additions & 3 deletions .devops/cpu.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH

RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
apt-get install -y build-essential git cmake libssl-dev

WORKDIR /app

Expand All @@ -20,7 +20,7 @@ RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
cmake --build build -j $(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
Expand Down Expand Up @@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

WORKDIR /app

Expand Down
95 changes: 95 additions & 0 deletions .devops/cuda-new.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Multi-stage image definition for a CUDA-enabled build.
# Stages: build (compile) -> base (runtime libraries) -> full / light / server (final targets).

# Ubuntu release used in the tag of both the CUDA devel and runtime base images.
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=13.1.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

# Runtime counterpart of the image above; used as the parent of the "base" stage.
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

## Build stage: compiles the project inside the CUDA devel image.
FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

# Toolchain and build-time dependencies (compiler, cmake, python, git, OpenSSL headers, OpenMP runtime).
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1

WORKDIR /app

# Copy the whole build context (the source tree) into /app.
COPY . .

# Configure and build.
# - CMAKE_CUDA_ARCHITECTURES is only passed when CUDA_DOCKER_ARCH is overridden from "default".
# - -Wl,--allow-shlib-undefined tells the linker to accept undefined symbols in shared libraries
#   when linking executables.
# NOTE(review): GGML_BACKEND_DL / GGML_CPU_ALL_VARIANTS presumably build the backends as
# dynamically loadable libraries with several CPU variants -- confirm against the ggml CMake options.
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)

# Collect every shared object into /app/lib. The "*.so*" pattern also matches versioned
# names (e.g. libfoo.so.1), and cp -P copies symlinks as symlinks instead of dereferencing them.
RUN mkdir -p /app/lib && \
find build -name "*.so*" -exec cp -P {} /app/lib \;

# Stage everything the "full" target needs: all built binaries, the top-level Python scripts,
# gguf-py, the requirements files and the tools.sh entrypoint script.
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base

# Runtime dependencies only: the OpenMP runtime and curl (curl is what the server stage's
# HEALTHCHECK invokes). Package caches and temp files are removed in the same layer.
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete

# Shared libraries are placed directly in /app, the same directory the binaries are copied to.
COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

# Python tooling plus the packages listed in requirements.txt; --break-system-packages lets pip
# install into the distribution-managed Python environment. Caches are cleaned in the same layer.
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only (ships llama-cli and llama-completion; llama-cli is the entrypoint)
FROM base AS light

COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

# Sets the host address to 0.0.0.0 (all interfaces) via environment variable.
# NOTE(review): presumably read by llama-server as its default --host value -- confirm.
ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

# Probes the /health endpoint with curl; -f makes curl exit non-zero on an HTTP error status.
# NOTE(review): assumes the server listens on port 8080 inside the container -- confirm.
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
6 changes: 3 additions & 3 deletions .devops/cuda.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1

WORKDIR /app

Expand All @@ -25,7 +25,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
Expand Down Expand Up @@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

WORKDIR /app

Expand Down
6 changes: 3 additions & 3 deletions .devops/intel.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
apt-get install -y git libssl-dev

WORKDIR /app

Expand All @@ -21,7 +21,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
Expand Down Expand Up @@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
FROM base AS light

COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

WORKDIR /app

Expand Down
7 changes: 4 additions & 3 deletions .devops/llama-cli-cann.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ WORKDIR /app

COPY . .

RUN yum install -y gcc g++ cmake make libcurl-devel
RUN yum install -y gcc g++ cmake make openssl-devel
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
Expand All @@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
cmake --build build --config Release --target llama-cli
cmake --build build --config Release --target llama-cli && \
cmake --build build --config Release --target llama-completion

# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

ENV LC_ALL=C.utf8

Expand Down
2 changes: 2 additions & 0 deletions .devops/llama-cpp-cuda.srpm.spec
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ make -j GGML_CUDA=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

Expand Down Expand Up @@ -68,6 +69,7 @@ rm -rf %{_builddir}/*

%files
%{_bindir}/llama-cuda-cli
%{_bindir}/llama-cuda-completion
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service
Expand Down
2 changes: 2 additions & 0 deletions .devops/llama-cpp.srpm.spec
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

Expand Down Expand Up @@ -70,6 +71,7 @@ rm -rf %{_builddir}/*

%files
%{_bindir}/llama-cli
%{_bindir}/llama-completion
%{_bindir}/llama-server
%{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service
Expand Down
6 changes: 3 additions & 3 deletions .devops/musa.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ RUN apt-get update && \
python3 \
python3-pip \
git \
libcurl4-openssl-dev \
libssl-dev \
libgomp1

WORKDIR /app
Expand All @@ -32,7 +32,7 @@ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
Expand Down Expand Up @@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

WORKDIR /app

Expand Down
4 changes: 2 additions & 2 deletions .devops/nix/nixpkgs-instances.nix
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# the module `{ pkgs ... }: { /* config */ }` implicitly uses
# `_module.args.pkgs` (defined in this case by flake-parts).
perSystem =
{ system, ... }:
{ lib, system, ... }:
{
_module.args = {
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
Expand Down Expand Up @@ -33,7 +33,7 @@
"CUDA EULA"
"cuDNN EULA"
]
) (p.meta.licenses or [ p.meta.license ]);
) (p.meta.licenses or (lib.toList p.meta.license));
};
# Ensure dependencies use ROCm consistently
pkgsRocm = import inputs.nixpkgs {
Expand Down
2 changes: 2 additions & 0 deletions .devops/nix/package-gguf-py.nix
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
llamaVersion,
numpy,
tqdm,
requests,
sentencepiece,
pyyaml,
poetry-core,
Expand All @@ -20,6 +21,7 @@ buildPythonPackage {
tqdm
sentencepiece
pyyaml
requests
];
src = lib.cleanSource ../../gguf-py;
pythonImportsCheck = [
Expand Down
7 changes: 3 additions & 4 deletions .devops/nix/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
useMpi ? false,
useRocm ? config.rocmSupport,
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true,
useVulkan ? false,
useRpc ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

# It's necessary to consistently use backendStdenv when building with CUDA support,
Expand Down Expand Up @@ -159,22 +159,21 @@ effectiveStdenv.mkDerivation (finalAttrs: {
++ optionals useMpi [ mpi ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];
++ optionals useVulkan vulkanBuildInputs;

cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_CURL" enableCurl)
(cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CUDA" useCuda)
(cmakeBool "GGML_HIP" useRocm)
(cmakeBool "GGML_METAL" useMetalKit)
(cmakeBool "GGML_VULKAN" useVulkan)
(cmakeBool "GGML_STATIC" enableStatic)
(cmakeBool "GGML_RPC" useRpc)
]
++ optionals useCuda [
(
Expand Down
Loading
Loading