#1. Bench...

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,1 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-bench -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf -ngl 62 -fa --warmup-batch --main-gpu 0 -ot "blk\.(3|4)\.ffn_.*=CUDA0" -ot "blk\.(5|6)\.ffn_.*=CUDA1" -ot "blk\.(7|8)\.ffn_.*=CUDA2" --override-tensor exps=CPU
```

```
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 3 CUDA devices:
  Device 0: NVIDIA GeForce RTX 5090, compute capability 12.0, VMM: yes
  Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
  Device 2: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
```

| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------------: | ---------------: |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | pp512 | 13.65 ± 0.37 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | tg128 | 0.58 ± 0.00 |

build: f26fe36 (1)
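Each `-ot`/`--override-tensor` argument is a `regex=backend` pair matched against tensor names; here they pin the FFN tensors of layers 3-8 to specific GPUs while the broad `exps=CPU` rule catches the remaining expert tensors. A minimal sketch of that routing (the `route` helper and the first-match-wins ordering are illustrative assumptions, not ik_llama.cpp's actual code):

```python
import re

# The override rules from the command above, in the order given.
# Assumption: the first matching rule wins.
rules = [
    (r"blk\.(3|4)\.ffn_.*", "CUDA0"),
    (r"blk\.(5|6)\.ffn_.*", "CUDA1"),
    (r"blk\.(7|8)\.ffn_.*", "CUDA2"),
    (r"exps", "CPU"),
]

def route(tensor_name: str) -> str:
    """Return the backend of the first rule whose regex matches the name."""
    for pattern, backend in rules:
        if re.search(pattern, tensor_name):
            return backend
    return "default"  # no override: placed by the normal -ngl logic

print(route("blk.4.ffn_gate_exps.weight"))  # CUDA0
print(route("blk.6.ffn_down_exps.weight"))  # CUDA1
print(route("blk.20.ffn_up_exps.weight"))   # CPU (caught by exps=CPU)
```

Tensor names follow the usual GGUF `blk.<layer>.<name>` convention, so `blk\.(3|4)\.` anchors the match to the two-digit-free layer numbers 3 and 4.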
#2. Bench...

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,1 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-bench -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf -ngl 62 -fa --warmup-batch --main-gpu 0 -ot ".ffn_(up|down)_exps.=CPU" --override-tensor exps=CPU
```

| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------------: | ---------------: |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | pp512 | 14.08 ± 0.22 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | tg128 | 2.47 ± 0.01 |

build: f26fe36 (1)

#3. Bench...

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,1 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-bench -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf -ngl 62 -fa --warmup-batch --main-gpu 0 -ot ".ffn_(up|down)_exps.=CPU" --override-tensor exps=CPU -rtr 1
```

| model | size | params | backend | ngl | rtr | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | --: | ------------: | ---------------: |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 1 | pp512 | 14.44 ± 0.11 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 1 | tg128 | 2.45 ± 0.02 |

build: f26fe36 (1)

#4. Bench...

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,1 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-bench -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf -ngl 62 -fa --warmup-batch --main-gpu 0 -ot ".ffn_(up|down)_exps.=CPU" --override-tensor exps=CPU -t 36
```

| model | size | params | backend | ngl | threads | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | ------------: | ---------------: |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | pp512 | 19.71 ± 1.24 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | tg128 | 3.67 ± 0.02 |

build: f26fe36 (1)
#5. Bench...

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,1 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-bench -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf -ngl 62 -fa --warmup-batch --main-gpu 0 -ot ".ffn_(up|down)_exps.=CPU" --override-tensor exps=CPU -t 36 -mla 2
```

| model | size | params | backend | ngl | threads | mla | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | --: | ------------: | ---------------: |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 2 | pp512 | 19.16 ± 1.23 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 2 | tg128 | 3.55 ± 0.02 |

build: f26fe36 (1)
#6. Bench...

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,1 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-bench -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf -ngl 62 -fa --warmup-batch --main-gpu 0 -ot ".ffn_(up|down)_exps.=CPU" --override-tensor exps=CPU -t 36 -mla 3
```

| model | size | params | backend | ngl | threads | mla | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | --: | ------------: | ---------------: |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 3 | pp512 | 19.18 ± 1.07 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 3 | tg128 | 3.57 ± 0.02 |

build: f26fe36 (1)
#7. Bench...

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,1 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-bench -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf -ngl 62 -fa --warmup-batch --main-gpu 0 -ot "(4|5|6)\d{1}.ffn_(up|down)_exps.=CPU" --override-tensor exps=CPU -t 36
```

| model | size | params | backend | ngl | threads | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | ------------: | ---------------: |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | pp512 | 19.73 ± 1.26 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | tg128 | 3.65 ± 0.01 |

build: f26fe36 (1)
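The `(4|5|6)\d{1}` part of the override pattern in run #7 matches a digit 4-6 followed by exactly one more digit, i.e. two-digit layer numbers 40-69, so it selects only the up/down expert tensors of the upper layers. A quick check with plain Python `re` (the `blk.<n>.ffn_up_exps.weight` names are illustrative GGUF-style tensor names):

```python
import re

# The CPU-override pattern from run #7: (4|5|6) then exactly one digit,
# so it only fires on layer numbers 40..69.
pattern = re.compile(r"(4|5|6)\d{1}.ffn_(up|down)_exps.")

# This 62-layer model has layers 0..61, so the pattern covers 40..61.
matched = [n for n in range(0, 62)
           if pattern.search(f"blk.{n}.ffn_up_exps.weight")]
print(matched[0], matched[-1])  # 40 61
```

Note that single-digit and 10-39 layers fall through this rule, and `ffn_gate_exps` tensors are never named by it; in run #7 those are still sent to the CPU by the following `--override-tensor exps=CPU`.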
#8. Bench...

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,1 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-bench -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf -ngl 62 -fa --warmup-batch --main-gpu 0 --override-tensor exps=CPU -t 36 -mla 3
```

| model | size | params | backend | ngl | threads | mla | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | --: | ------------: | ---------------: |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 3 | pp512 | 19.20 ± 1.12 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 3 | tg128 | 3.57 ± 0.03 |

build: f26fe36 (1)

#9. ubergarm's params

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-sweep-bench -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf -mla 3 -fa \
  -amb 512 \
  -fmoe \
  -ctk f16 \
  -c 16384 \
  -ngl 99 \
  -ot "blk\.(3|4|5|6|7)\.ffn_.*=CUDA0" \
  -ot exps=CPU \
  -b 4096 -ub 4096 \
  --warmup-batch \
  --no-mmap \
  --threads 36
```

```
main: n_kv_max = 16384, n_batch = 4096, n_ubatch = 4096, flash_attn = 1, n_gpu_layers = 99, n_threads = 36, n_threads_batch = 36
```

| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|-------|--------|--------|----------|----------|----------|----------|
| 4096 | 1024 | 0 | 189.111 | 21.66 | 1030.959 | 0.99 |
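The two `-ot` rules in #9 overlap: layer 3-7 expert tensors contain `exps` in their names, yet they are intended for CUDA0, not CPU. The sketch below illustrates why listing the narrow GPU rule before the broad `exps=CPU` rule gives that result, under the assumption that rules are tried in the order given and the first match wins (an assumption about behavior, not a statement about ik_llama.cpp internals):

```python
import re

# The two overlapping override rules from run #9, in command-line order.
rules = [
    (r"blk\.(3|4|5|6|7)\.ffn_.*", "CUDA0"),  # narrow rule first
    (r"exps", "CPU"),                        # broad catch-all second
]

def route(name: str) -> str:
    """First matching rule wins (assumed ordering)."""
    for pat, backend in rules:
        if re.search(pat, name):
            return backend
    return "default"

# Layer 5 experts hit the CUDA0 rule before the broader exps=CPU rule:
print(route("blk.5.ffn_gate_exps.weight"))   # CUDA0
print(route("blk.42.ffn_gate_exps.weight"))  # CPU
```

If the order were reversed, `exps` would match first and every expert tensor, including layers 3-7, would land on the CPU.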
#10. FASTEST - MULTI-GPU, which leaves plenty of room for context size!

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,1 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-cli -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf --threads 36 --ctx-size 16384 -ngl 99 --flash-attn --main-gpu 0 --warmup-batch -fmoe -p '<|begin▁of▁sentence|><|User|>What is the solution of x+5=-2?<|Assistant|>\n' -mla 3 --override-tensor exps=CPU -rtr
```

```
llama_print_timings:        load time = 69348.38 ms
llama_print_timings:      sample time =    17.76 ms /    96 runs   (    0.18 ms per token,  5406.62 tokens per second)
llama_print_timings: prompt eval time =  3469.41 ms /    32 tokens (  108.42 ms per token,     9.22 tokens per second)
llama_print_timings:        eval time = 22193.81 ms /    95 runs   (  233.62 ms per token,     4.28 tokens per second)
llama_print_timings:       total time = 25903.35 ms /   127 tokens
```

#11. BENCH IT!

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0,2,1 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-bench -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf -fmoe 1 -n 0 --override-tensor exps=CPU -ngl 62 -fa --warmup-batch --main-gpu 0 --override-tensor exps=CPU -t 36 -mla 3 -b 128,512,1024,2048,4096 -ub 128,512,1024,2048,4096
```

| model | size | params | backend | ngl | threads | n_batch | n_ubatch | mla | fmoe | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | ------: | -------: | --: | ---: | ------------: | ---------------: |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 128 | 128 | 3 | 1 | pp512 | 15.53 ± 1.43 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 128 | 512 | 3 | 1 | pp512 | 16.20 ± 0.10 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 128 | 1024 | 3 | 1 | pp512 | 16.23 ± 0.13 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 128 | 2048 | 3 | 1 | pp512 | 16.15 ± 0.15 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 128 | 4096 | 3 | 1 | pp512 | 16.12 ± 0.07 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 512 | 128 | 3 | 1 | pp512 | 16.26 ± 0.15 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 512 | 512 | 3 | 1 | pp512 | 20.17 ± 0.09 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 512 | 1024 | 3 | 1 | pp512 | 20.10 ± 0.03 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 512 | 2048 | 3 | 1 | pp512 | 20.15 ± 0.12 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 512 | 4096 | 3 | 1 | pp512 | 20.18 ± 0.07 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 1024 | 128 | 3 | 1 | pp512 | 16.26 ± 0.16 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 1024 | 512 | 3 | 1 | pp512 | 20.13 ± 0.11 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 1024 | 1024 | 3 | 1 | pp512 | 20.13 ± 0.07 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 1024 | 2048 | 3 | 1 | pp512 | 20.16 ± 0.08 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 1024 | 4096 | 3 | 1 | pp512 | 20.22 ± 0.09 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 2048 | 128 | 3 | 1 | pp512 | 16.19 ± 0.11 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 2048 | 512 | 3 | 1 | pp512 | 20.23 ± 0.09 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 2048 | 1024 | 3 | 1 | pp512 | 20.17 ± 0.08 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 2048 | 2048 | 3 | 1 | pp512 | 20.11 ± 0.10 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 2048 | 4096 | 3 | 1 | pp512 | 20.07 ± 0.20 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 4096 | 128 | 3 | 1 | pp512 | 16.19 ± 0.12 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 4096 | 512 | 3 | 1 | pp512 | 20.16 ± 0.10 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 4096 | 1024 | 3 | 1 | pp512 | 20.22 ± 0.09 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 4096 | 2048 | 3 | 1 | pp512 | 20.13 ± 0.11 |
| deepseek2 671B IQ1_S_R4 - 1.5 bpw | 130.20 GiB | 672.05 B | CUDA | 62 | 36 | 4096 | 4096 | 3 | 1 | pp512 | 20.25 ± 0.06 |

build: f26fe36 (1)

#12. AS FAST AS #10. - SINGLE GPU, which doesn't leave room for context size!

```
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 ~/ik_llama-main-b3746-f26fe36-bin-win-cuda-12.8-x64/llama-cli -m DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf --threads 36 --ctx-size 16384 -ngl 99 --flash-attn --main-gpu 0 --warmup-batch -fmoe -p '<|begin▁of▁sentence|><|User|>What is the solution of x+5=-2?<|Assistant|>\n' -mla 3 --override-tensor exps=CPU
```