magiccodingman committed
Commit 92970fa · verified · 1 Parent(s): d98377c

initial upload

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +7 -0
  2. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/bench_metrics.json +44 -0
  3. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md +11 -0
  4. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log +177 -0
  5. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log +177 -0
  6. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log +177 -0
  7. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/bench_metrics.json +44 -0
  8. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/llamabench.md +11 -0
  9. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_code.log +177 -0
  10. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_general.log +177 -0
  11. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_math.log +177 -0
  12. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/bench_metrics.json +44 -0
  13. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/llamabench.md +11 -0
  14. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_code.log +177 -0
  15. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_general.log +177 -0
  16. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_math.log +177 -0
  17. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json +44 -0
  18. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md +11 -0
  19. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log +177 -0
  20. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log +177 -0
  21. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log +177 -0
  22. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json +44 -0
  23. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md +11 -0
  24. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log +177 -0
  25. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log +177 -0
  26. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log +177 -0
  27. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json +44 -0
  28. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md +11 -0
  29. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log +177 -0
  30. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log +177 -0
  31. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log +177 -0
  32. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/bench_metrics.json +44 -0
  33. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/llamabench.md +11 -0
  34. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_code.log +176 -0
  35. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_general.log +176 -0
  36. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_math.log +176 -0
  37. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json +44 -0
  38. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md +11 -0
  39. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log +176 -0
  40. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log +176 -0
  41. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log +176 -0
  42. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/bench_metrics.json +44 -0
  43. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md +11 -0
  44. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log +178 -0
  45. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log +178 -0
  46. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log +178 -0
  47. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/bench_metrics.json +44 -0
  48. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/llamabench.md +11 -0
  49. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_code.log +178 -0
  50. Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_general.log +178 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Thinking-2507-Q5_K.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Thinking-2507-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Thinking-2507-iq4_nl-QKOUD-IQ4NL-E-MXFP4-H-Q5K.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Thinking-2507-iq4_nl-QKOUD-IQ4NL-EH-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Thinking-2507-mxfp4_moe-H-B16-EUD-IQ4NL-R-Q6K-QKO-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Thinking-2507-mxfp4_moe-HQKOR-B16-U-Q5K-E-Q6K-D-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "raw_metrics": {
+ "llamabench": {
+ "backend": "CUDA",
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md",
+ "ngl": "35",
+ "raw_row": {
+ "backend": "CUDA",
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
+ "ngl": "35",
+ "params": "30.53 B",
+ "size": "16.07 GiB",
+ "t/s": "147.62 \u00b1 3.10",
+ "test": "pp8",
+ "tps_value": 147.62
+ },
+ "test": "pp8",
+ "tps": 147.62
+ },
+ "perplexity": {
+ "code": {
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log",
+ "ppl": 1.3063,
+ "ppl_error": 0.00707
+ },
+ "general": {
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log",
+ "ppl": 6.3691,
+ "ppl_error": 0.12991
+ },
+ "math": {
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log",
+ "ppl": 5.7368,
+ "ppl_error": 0.10519
+ }
+ }
+ },
+ "summary": {
+ "avg_prec_loss_pct": 1.1729,
+ "bench_tps": 147.62,
+ "file_size_bytes": 17263163424,
+ "file_size_gb": 16.08
+ }
+ }
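For reference, a minimal sketch (Python assumed; the helper name and the example path comment are illustrative, not part of this commit) of how a bench_metrics.json with the layout above can be loaded to pull out the throughput, size, and perplexity figures:

```python
import json
from pathlib import Path

def load_bench_metrics(path: str) -> dict:
    """Read one quant variant's bench_metrics.json and flatten the key numbers."""
    data = json.loads(Path(path).read_text())
    summary = data["summary"]
    ppl = data["raw_metrics"]["perplexity"]
    return {
        "tps": summary["bench_tps"],                  # llama-bench pp8 tokens/sec
        "size_gb": summary["file_size_gb"],           # GGUF size on disk
        "avg_prec_loss_pct": summary["avg_prec_loss_pct"],
        "ppl": {name: entry["ppl"] for name, entry in ppl.items()},  # code / general / math
    }

# Hypothetical usage against one of the directories listed above:
# print(load_bench_metrics("Benchmarks/DataCollection/<variant>/bench_metrics.json"))
```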
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md ADDED
@@ -0,0 +1,11 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ | model | size | params | backend | ngl | test | t/s |
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.07 GiB | 30.53 B | CUDA | 35 | pp8 | 147.62 ± 3.10 |
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.07 GiB | 30.53 B | CUDA | 35 | tg128 | 49.93 ± 0.96 |
+
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20685 MiB free
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
+ llama_model_loader: - kv 1: general.type str = model
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
+ llama_model_loader: - kv 3: general.version str = 2507
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
+ llama_model_loader: - kv 5: general.basename str = Qwen3
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
+ llama_model_loader: - kv 7: general.license str = apache-2.0
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
+ llama_model_loader: - kv 38: general.file_type u32 = 25
+ llama_model_loader: - type f32: 241 tensors
+ llama_model_loader: - type q5_K: 1 tensors
+ llama_model_loader: - type iq4_nl: 337 tensors
+ print_info: file format = GGUF V3 (latest)
+ print_info: file type = IQ4_NL - 4.5 bpw
+ print_info: file size = 16.07 GiB (4.52 BPW)
+ load: printing all EOG tokens:
+ load: - 151643 ('<|endoftext|>')
+ load: - 151645 ('<|im_end|>')
+ load: - 151662 ('<|fim_pad|>')
+ load: - 151663 ('<|repo_name|>')
+ load: - 151664 ('<|file_sep|>')
+ load: special tokens cache size = 26
+ load: token to piece cache size = 0.9311 MB
+ print_info: arch = qwen3moe
+ print_info: vocab_only = 0
+ print_info: n_ctx_train = 262144
+ print_info: n_embd = 2048
+ print_info: n_embd_inp = 2048
+ print_info: n_layer = 48
+ print_info: n_head = 32
+ print_info: n_head_kv = 4
+ print_info: n_rot = 128
+ print_info: n_swa = 0
+ print_info: is_swa_any = 0
+ print_info: n_embd_head_k = 128
+ print_info: n_embd_head_v = 128
+ print_info: n_gqa = 8
+ print_info: n_embd_k_gqa = 512
+ print_info: n_embd_v_gqa = 512
+ print_info: f_norm_eps = 0.0e+00
+ print_info: f_norm_rms_eps = 1.0e-06
+ print_info: f_clamp_kqv = 0.0e+00
+ print_info: f_max_alibi_bias = 0.0e+00
+ print_info: f_logit_scale = 0.0e+00
+ print_info: f_attn_scale = 0.0e+00
+ print_info: n_ff = 6144
+ print_info: n_expert = 128
+ print_info: n_expert_used = 8
+ print_info: n_expert_groups = 0
+ print_info: n_group_used = 0
+ print_info: causal attn = 1
+ print_info: pooling type = 0
+ print_info: rope type = 2
+ print_info: rope scaling = linear
+ print_info: freq_base_train = 10000000.0
+ print_info: freq_scale_train = 1
+ print_info: n_ctx_orig_yarn = 262144
+ print_info: rope_finetuned = unknown
+ print_info: model type = 30B.A3B
+ print_info: model params = 30.53 B
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
+ print_info: n_ff_exp = 768
+ print_info: vocab type = BPE
+ print_info: n_vocab = 151936
+ print_info: n_merges = 151387
+ print_info: BOS token = 11 ','
+ print_info: EOS token = 151645 '<|im_end|>'
+ print_info: EOT token = 151645 '<|im_end|>'
+ print_info: PAD token = 151654 '<|vision_pad|>'
+ print_info: LF token = 198 'Ċ'
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
+ print_info: FIM REP token = 151663 '<|repo_name|>'
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
+ print_info: EOG token = 151643 '<|endoftext|>'
+ print_info: EOG token = 151645 '<|im_end|>'
+ print_info: EOG token = 151662 '<|fim_pad|>'
+ print_info: EOG token = 151663 '<|repo_name|>'
+ print_info: EOG token = 151664 '<|file_sep|>'
+ print_info: max token length = 256
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
+ load_tensors: offloading 20 repeating layers to GPU
+ load_tensors: offloaded 20/49 layers to GPU
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
+ ....................................................................................................
+ llama_context: constructing llama_context
+ llama_context: n_seq_max = 1
+ llama_context: n_ctx = 2048
+ llama_context: n_ctx_seq = 2048
+ llama_context: n_batch = 2048
+ llama_context: n_ubatch = 512
+ llama_context: causal_attn = 1
+ llama_context: flash_attn = auto
+ llama_context: kv_unified = false
+ llama_context: freq_base = 10000000.0
+ llama_context: freq_scale = 1
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
+ llama_context: CPU output buffer size = 0.58 MiB
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
+ llama_context: Flash Attention was auto, set to enabled
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
+ llama_context: graph nodes = 3031
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
+ common_init_from_params: added <|endoftext|> logit bias = -inf
+ common_init_from_params: added <|im_end|> logit bias = -inf
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
+ common_init_from_params: added <|repo_name|> logit bias = -inf
+ common_init_from_params: added <|file_sep|> logit bias = -inf
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
+
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
+ perplexity: tokenizing the input ..
+ perplexity: tokenization took 111.624 ms
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
+ perplexity: 3.31 seconds per pass - ETA 2.42 minutes
+ [1]1.5158,[2]1.4623,[3]1.2888,[4]1.2489,[5]1.3367,[6]1.3996,[7]1.4022,[8]1.4004,[9]1.3605,[10]1.3375,[11]1.3229,[12]1.3248,[13]1.3094,[14]1.3004,[15]1.2965,[16]1.2849,[17]1.2792,[18]1.2783,[19]1.2709,[20]1.2615,[21]1.2587,[22]1.2589,[23]1.2749,[24]1.2678,[25]1.2660,[26]1.2573,[27]1.2519,[28]1.2510,[29]1.2637,[30]1.2656,[31]1.2592,[32]1.2541,[33]1.2549,[34]1.2541,[35]1.2530,[36]1.2742,[37]1.2836,[38]1.2884,[39]1.2952,[40]1.2964,[41]1.2930,[42]1.3064,[43]1.3059,[44]1.3063,
+ Final estimate: PPL = 1.3063 +/- 0.00707
+
+ llama_perf_context_print: load time = 2360.00 ms
+ llama_perf_context_print: prompt eval time = 123483.57 ms / 90112 tokens ( 1.37 ms per token, 729.75 tokens per second)
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+ llama_perf_context_print: total time = 124693.97 ms / 90113 tokens
+ llama_perf_context_print: graphs reused = 0
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16670 + (3896 = 3351 + 40 + 504) + 3549 |
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20692 MiB free
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
+ llama_model_loader: - kv 1: general.type str = model
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
+ llama_model_loader: - kv 3: general.version str = 2507
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
+ llama_model_loader: - kv 5: general.basename str = Qwen3
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
+ llama_model_loader: - kv 7: general.license str = apache-2.0
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
+ llama_model_loader: - kv 38: general.file_type u32 = 25
+ llama_model_loader: - type f32: 241 tensors
+ llama_model_loader: - type q5_K: 1 tensors
+ llama_model_loader: - type iq4_nl: 337 tensors
+ print_info: file format = GGUF V3 (latest)
+ print_info: file type = IQ4_NL - 4.5 bpw
+ print_info: file size = 16.07 GiB (4.52 BPW)
+ load: printing all EOG tokens:
+ load: - 151643 ('<|endoftext|>')
+ load: - 151645 ('<|im_end|>')
+ load: - 151662 ('<|fim_pad|>')
+ load: - 151663 ('<|repo_name|>')
+ load: - 151664 ('<|file_sep|>')
+ load: special tokens cache size = 26
+ load: token to piece cache size = 0.9311 MB
+ print_info: arch = qwen3moe
+ print_info: vocab_only = 0
+ print_info: n_ctx_train = 262144
+ print_info: n_embd = 2048
+ print_info: n_embd_inp = 2048
+ print_info: n_layer = 48
+ print_info: n_head = 32
+ print_info: n_head_kv = 4
+ print_info: n_rot = 128
+ print_info: n_swa = 0
+ print_info: is_swa_any = 0
+ print_info: n_embd_head_k = 128
+ print_info: n_embd_head_v = 128
+ print_info: n_gqa = 8
+ print_info: n_embd_k_gqa = 512
+ print_info: n_embd_v_gqa = 512
+ print_info: f_norm_eps = 0.0e+00
+ print_info: f_norm_rms_eps = 1.0e-06
+ print_info: f_clamp_kqv = 0.0e+00
+ print_info: f_max_alibi_bias = 0.0e+00
+ print_info: f_logit_scale = 0.0e+00
+ print_info: f_attn_scale = 0.0e+00
+ print_info: n_ff = 6144
+ print_info: n_expert = 128
+ print_info: n_expert_used = 8
+ print_info: n_expert_groups = 0
+ print_info: n_group_used = 0
+ print_info: causal attn = 1
+ print_info: pooling type = 0
+ print_info: rope type = 2
+ print_info: rope scaling = linear
+ print_info: freq_base_train = 10000000.0
+ print_info: freq_scale_train = 1
+ print_info: n_ctx_orig_yarn = 262144
+ print_info: rope_finetuned = unknown
+ print_info: model type = 30B.A3B
+ print_info: model params = 30.53 B
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
+ print_info: n_ff_exp = 768
+ print_info: vocab type = BPE
+ print_info: n_vocab = 151936
+ print_info: n_merges = 151387
+ print_info: BOS token = 11 ','
+ print_info: EOS token = 151645 '<|im_end|>'
+ print_info: EOT token = 151645 '<|im_end|>'
+ print_info: PAD token = 151654 '<|vision_pad|>'
+ print_info: LF token = 198 'Ċ'
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
+ print_info: FIM REP token = 151663 '<|repo_name|>'
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
+ print_info: EOG token = 151643 '<|endoftext|>'
+ print_info: EOG token = 151645 '<|im_end|>'
+ print_info: EOG token = 151662 '<|fim_pad|>'
+ print_info: EOG token = 151663 '<|repo_name|>'
+ print_info: EOG token = 151664 '<|file_sep|>'
+ print_info: max token length = 256
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
+ load_tensors: offloading 20 repeating layers to GPU
+ load_tensors: offloaded 20/49 layers to GPU
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
+ ....................................................................................................
+ llama_context: constructing llama_context
+ llama_context: n_seq_max = 1
+ llama_context: n_ctx = 2048
+ llama_context: n_ctx_seq = 2048
+ llama_context: n_batch = 2048
+ llama_context: n_ubatch = 512
+ llama_context: causal_attn = 1
+ llama_context: flash_attn = auto
+ llama_context: kv_unified = false
+ llama_context: freq_base = 10000000.0
+ llama_context: freq_scale = 1
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
+ llama_context: CPU output buffer size = 0.58 MiB
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
+ llama_context: Flash Attention was auto, set to enabled
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
+ llama_context: graph nodes = 3031
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
+ common_init_from_params: added <|endoftext|> logit bias = -inf
+ common_init_from_params: added <|im_end|> logit bias = -inf
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
+ common_init_from_params: added <|repo_name|> logit bias = -inf
+ common_init_from_params: added <|file_sep|> logit bias = -inf
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
+
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
+ perplexity: tokenizing the input ..
+ perplexity: tokenization took 49.041 ms
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
+ perplexity: 3.25 seconds per pass - ETA 0.80 minutes
+ [1]5.4570,[2]6.4839,[3]6.8398,[4]6.7548,[5]6.6892,[6]5.7698,[7]5.2742,[8]5.2992,[9]5.6087,[10]5.7400,[11]5.7881,[12]6.0905,[13]6.1602,[14]6.2905,[15]6.3691,
+ Final estimate: PPL = 6.3691 +/- 0.12991
+
+ llama_perf_context_print: load time = 2312.99 ms
+ llama_perf_context_print: prompt eval time = 44838.75 ms / 30720 tokens ( 1.46 ms per token, 685.12 tokens per second)
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+ llama_perf_context_print: total time = 45404.54 ms / 30721 tokens
+ llama_perf_context_print: graphs reused = 0
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16681 + (3896 = 3351 + 40 + 504) + 3537 |
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20668 MiB free
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
+ llama_model_loader: - kv 1: general.type str = model
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
+ llama_model_loader: - kv 3: general.version str = 2507
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
+ llama_model_loader: - kv 5: general.basename str = Qwen3
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
+ llama_model_loader: - kv 7: general.license str = apache-2.0
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
+ llama_model_loader: - kv 38: general.file_type u32 = 25
+ llama_model_loader: - type f32: 241 tensors
+ llama_model_loader: - type q5_K: 1 tensors
+ llama_model_loader: - type iq4_nl: 337 tensors
+ print_info: file format = GGUF V3 (latest)
+ print_info: file type = IQ4_NL - 4.5 bpw
+ print_info: file size = 16.07 GiB (4.52 BPW)
+ load: printing all EOG tokens:
+ load: - 151643 ('<|endoftext|>')
+ load: - 151645 ('<|im_end|>')
+ load: - 151662 ('<|fim_pad|>')
+ load: - 151663 ('<|repo_name|>')
+ load: - 151664 ('<|file_sep|>')
+ load: special tokens cache size = 26
+ load: token to piece cache size = 0.9311 MB
+ print_info: arch = qwen3moe
+ print_info: vocab_only = 0
+ print_info: n_ctx_train = 262144
+ print_info: n_embd = 2048
+ print_info: n_embd_inp = 2048
+ print_info: n_layer = 48
+ print_info: n_head = 32
+ print_info: n_head_kv = 4
+ print_info: n_rot = 128
+ print_info: n_swa = 0
+ print_info: is_swa_any = 0
+ print_info: n_embd_head_k = 128
+ print_info: n_embd_head_v = 128
+ print_info: n_gqa = 8
+ print_info: n_embd_k_gqa = 512
+ print_info: n_embd_v_gqa = 512
+ print_info: f_norm_eps = 0.0e+00
+ print_info: f_norm_rms_eps = 1.0e-06
+ print_info: f_clamp_kqv = 0.0e+00
+ print_info: f_max_alibi_bias = 0.0e+00
+ print_info: f_logit_scale = 0.0e+00
+ print_info: f_attn_scale = 0.0e+00
+ print_info: n_ff = 6144
+ print_info: n_expert = 128
+ print_info: n_expert_used = 8
+ print_info: n_expert_groups = 0
+ print_info: n_group_used = 0
+ print_info: causal attn = 1
+ print_info: pooling type = 0
+ print_info: rope type = 2
+ print_info: rope scaling = linear
+ print_info: freq_base_train = 10000000.0
+ print_info: freq_scale_train = 1
+ print_info: n_ctx_orig_yarn = 262144
+ print_info: rope_finetuned = unknown
+ print_info: model type = 30B.A3B
+ print_info: model params = 30.53 B
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
+ print_info: n_ff_exp = 768
+ print_info: vocab type = BPE
+ print_info: n_vocab = 151936
+ print_info: n_merges = 151387
+ print_info: BOS token = 11 ','
+ print_info: EOS token = 151645 '<|im_end|>'
+ print_info: EOT token = 151645 '<|im_end|>'
+ print_info: PAD token = 151654 '<|vision_pad|>'
+ print_info: LF token = 198 'Ċ'
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
+ print_info: FIM REP token = 151663 '<|repo_name|>'
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
+ print_info: EOG token = 151643 '<|endoftext|>'
+ print_info: EOG token = 151645 '<|im_end|>'
+ print_info: EOG token = 151662 '<|fim_pad|>'
+ print_info: EOG token = 151663 '<|repo_name|>'
+ print_info: EOG token = 151664 '<|file_sep|>'
+ print_info: max token length = 256
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
+ load_tensors: offloading 20 repeating layers to GPU
+ load_tensors: offloaded 20/49 layers to GPU
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
+ ....................................................................................................
+ llama_context: constructing llama_context
+ llama_context: n_seq_max = 1
+ llama_context: n_ctx = 2048
+ llama_context: n_ctx_seq = 2048
+ llama_context: n_batch = 2048
+ llama_context: n_ubatch = 512
+ llama_context: causal_attn = 1
+ llama_context: flash_attn = auto
+ llama_context: kv_unified = false
+ llama_context: freq_base = 10000000.0
+ llama_context: freq_scale = 1
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
+ llama_context: CPU output buffer size = 0.58 MiB
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
+ llama_context: Flash Attention was auto, set to enabled
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
+ llama_context: graph nodes = 3031
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
+ common_init_from_params: added <|endoftext|> logit bias = -inf
+ common_init_from_params: added <|im_end|> logit bias = -inf
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
+ common_init_from_params: added <|repo_name|> logit bias = -inf
+ common_init_from_params: added <|file_sep|> logit bias = -inf
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
+
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
+ perplexity: tokenizing the input ..
+ perplexity: tokenization took 44.543 ms
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
+ perplexity: 3.35 seconds per pass - ETA 0.88 minutes
+ [1]4.5913,[2]5.0009,[3]5.2857,[4]5.4514,[5]5.6706,[6]5.6722,[7]5.6540,[8]5.6046,[9]5.6586,[10]5.6478,[11]5.6541,[12]5.6483,[13]5.7230,[14]5.7342,[15]5.7287,[16]5.7368,
+ Final estimate: PPL = 5.7368 +/- 0.10519
+
+ llama_perf_context_print: load time = 2333.37 ms
+ llama_perf_context_print: prompt eval time = 49087.99 ms / 32768 tokens ( 1.50 ms per token, 667.54 tokens per second)
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+ llama_perf_context_print: total time = 49532.14 ms / 32769 tokens
+ llama_perf_context_print: graphs reused = 0
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16667 + (3896 = 3351 + 40 + 504) + 3551 |
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "raw_metrics": {
+ "llamabench": {
+ "backend": "CUDA",
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/llamabench.md",
+ "ngl": "35",
+ "raw_row": {
+ "backend": "CUDA",
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
+ "ngl": "35",
+ "params": "30.53 B",
+ "size": "16.11 GiB",
+ "t/s": "134.10 \u00b1 5.78",
+ "test": "pp8",
+ "tps_value": 134.1
+ },
+ "test": "pp8",
+ "tps": 134.1
+ },
+ "perplexity": {
+ "code": {
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_code.log",
+ "ppl": 1.3066,
+ "ppl_error": 0.00709
+ },
+ "general": {
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_general.log",
+ "ppl": 6.3484,
+ "ppl_error": 0.12911
+ },
+ "math": {
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_math.log",
+ "ppl": 5.7026,
+ "ppl_error": 0.10404
+ }
+ }
+ },
+ "summary": {
+ "avg_prec_loss_pct": 0.8703,
+ "bench_tps": 134.1,
+ "file_size_bytes": 17304490016,
+ "file_size_gb": 16.12
+ }
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/llamabench.md ADDED
@@ -0,0 +1,11 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ | model | size | params | backend | ngl | test | t/s |
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.11 GiB | 30.53 B | CUDA | 35 | pp8 | 134.10 ± 5.78 |
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.11 GiB | 30.53 B | CUDA | 35 | tg128 | 48.35 ± 0.50 |
+
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20935 MiB free
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K.gguf (version GGUF V3 (latest))
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
+ llama_model_loader: - kv 1: general.type str = model
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
+ llama_model_loader: - kv 3: general.version str = 2507
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
+ llama_model_loader: - kv 5: general.basename str = Qwen3
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
+ llama_model_loader: - kv 7: general.license str = apache-2.0
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
+ llama_model_loader: - kv 38: general.file_type u32 = 25
+ llama_model_loader: - type f32: 241 tensors
+ llama_model_loader: - type q6_K: 1 tensors
+ llama_model_loader: - type iq4_nl: 337 tensors
+ print_info: file format = GGUF V3 (latest)
+ print_info: file type = IQ4_NL - 4.5 bpw
+ print_info: file size = 16.11 GiB (4.53 BPW)
+ load: printing all EOG tokens:
+ load: - 151643 ('<|endoftext|>')
+ load: - 151645 ('<|im_end|>')
+ load: - 151662 ('<|fim_pad|>')
+ load: - 151663 ('<|repo_name|>')
+ load: - 151664 ('<|file_sep|>')
+ load: special tokens cache size = 26
+ load: token to piece cache size = 0.9311 MB
+ print_info: arch = qwen3moe
+ print_info: vocab_only = 0
+ print_info: n_ctx_train = 262144
+ print_info: n_embd = 2048
+ print_info: n_embd_inp = 2048
+ print_info: n_layer = 48
+ print_info: n_head = 32
+ print_info: n_head_kv = 4
+ print_info: n_rot = 128
+ print_info: n_swa = 0
+ print_info: is_swa_any = 0
+ print_info: n_embd_head_k = 128
+ print_info: n_embd_head_v = 128
+ print_info: n_gqa = 8
+ print_info: n_embd_k_gqa = 512
+ print_info: n_embd_v_gqa = 512
+ print_info: f_norm_eps = 0.0e+00
+ print_info: f_norm_rms_eps = 1.0e-06
+ print_info: f_clamp_kqv = 0.0e+00
+ print_info: f_max_alibi_bias = 0.0e+00
+ print_info: f_logit_scale = 0.0e+00
+ print_info: f_attn_scale = 0.0e+00
+ print_info: n_ff = 6144
+ print_info: n_expert = 128
+ print_info: n_expert_used = 8
+ print_info: n_expert_groups = 0
+ print_info: n_group_used = 0
+ print_info: causal attn = 1
+ print_info: pooling type = 0
+ print_info: rope type = 2
+ print_info: rope scaling = linear
+ print_info: freq_base_train = 10000000.0
+ print_info: freq_scale_train = 1
+ print_info: n_ctx_orig_yarn = 262144
+ print_info: rope_finetuned = unknown
+ print_info: model type = 30B.A3B
+ print_info: model params = 30.53 B
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
+ print_info: n_ff_exp = 768
+ print_info: vocab type = BPE
+ print_info: n_vocab = 151936
+ print_info: n_merges = 151387
+ print_info: BOS token = 11 ','
+ print_info: EOS token = 151645 '<|im_end|>'
+ print_info: EOT token = 151645 '<|im_end|>'
+ print_info: PAD token = 151654 '<|vision_pad|>'
+ print_info: LF token = 198 'Ċ'
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
+ print_info: FIM REP token = 151663 '<|repo_name|>'
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
+ print_info: EOG token = 151643 '<|endoftext|>'
+ print_info: EOG token = 151645 '<|im_end|>'
+ print_info: EOG token = 151662 '<|fim_pad|>'
+ print_info: EOG token = 151663 '<|repo_name|>'
+ print_info: EOG token = 151664 '<|file_sep|>'
+ print_info: max token length = 256
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
+ load_tensors: offloading 20 repeating layers to GPU
+ load_tensors: offloaded 20/49 layers to GPU
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 544.18 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 113.646 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.31 seconds per pass - ETA 2.42 minutes
166
+ [1]1.5148,[2]1.4623,[3]1.2888,[4]1.2491,[5]1.3370,[6]1.4009,[7]1.4038,[8]1.4016,[9]1.3615,[10]1.3384,[11]1.3239,[12]1.3258,[13]1.3104,[14]1.3014,[15]1.2974,[16]1.2857,[17]1.2800,[18]1.2791,[19]1.2716,[20]1.2621,[21]1.2593,[22]1.2594,[23]1.2754,[24]1.2683,[25]1.2665,[26]1.2577,[27]1.2523,[28]1.2514,[29]1.2641,[30]1.2660,[31]1.2595,[32]1.2544,[33]1.2553,[34]1.2544,[35]1.2533,[36]1.2744,[37]1.2839,[38]1.2887,[39]1.2955,[40]1.2967,[41]1.2933,[42]1.3066,[43]1.3063,[44]1.3066,
167
+ Final estimate: PPL = 1.3066 +/- 0.00709
168
+
169
+ llama_perf_context_print: load time = 2340.34 ms
170
+ llama_perf_context_print: prompt eval time = 123762.53 ms / 90112 tokens ( 1.37 ms per token, 728.10 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 124977.94 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16892 + (3935 = 3351 + 40 + 544) + 3286 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
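The per-chunk perplexity stream and the `Final estimate: PPL = ... +/- ...` line above are what populate the `ppl` / `ppl_error` fields in each variant's `bench_metrics.json`. A minimal sketch of reducing such a log to those two numbers (the regular expression and the local file name are illustrative assumptions, not the repo's actual collection tooling):

```python
# Sketch: extract "Final estimate: PPL = 1.3066 +/- 0.00709" from a llama-perplexity log.
# Any perplexity_*.log in this repo has the same shape; the path below is an example.
import re
from pathlib import Path

def read_final_ppl(log_path: str) -> tuple[float, float]:
    """Return (ppl, ppl_error) from a llama.cpp perplexity log."""
    text = Path(log_path).read_text(encoding="utf-8", errors="replace")
    match = re.search(r"Final estimate: PPL = ([\d.]+) \+/- ([\d.]+)", text)
    if match is None:
        raise ValueError(f"no final PPL estimate found in {log_path}")
    return float(match.group(1)), float(match.group(2))

if __name__ == "__main__":
    ppl, err = read_final_ppl("perplexity_code.log")  # hypothetical local copy of the log above
    print(f"PPL = {ppl} +/- {err}")
```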
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20911 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 544.18 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 47.554 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.26 seconds per pass - ETA 0.80 minutes
166
+ [1]5.4142,[2]6.4548,[3]6.8451,[4]6.7474,[5]6.6742,[6]5.7565,[7]5.2607,[8]5.2909,[9]5.5957,[10]5.7255,[11]5.7724,[12]6.0754,[13]6.1445,[14]6.2748,[15]6.3484,
167
+ Final estimate: PPL = 6.3484 +/- 0.12911
168
+
169
+ llama_perf_context_print: load time = 2343.92 ms
170
+ llama_perf_context_print: prompt eval time = 44899.49 ms / 30720 tokens ( 1.46 ms per token, 684.19 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 45321.84 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16892 + (3935 = 3351 + 40 + 544) + 3287 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20936 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 544.18 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 43.891 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.36 seconds per pass - ETA 0.88 minutes
166
+ [1]4.5753,[2]4.9789,[3]5.2511,[4]5.4213,[5]5.6313,[6]5.6336,[7]5.6162,[8]5.5661,[9]5.6202,[10]5.6090,[11]5.6163,[12]5.6108,[13]5.6870,[14]5.6986,[15]5.6948,[16]5.7026,
167
+ Final estimate: PPL = 5.7026 +/- 0.10404
168
+
169
+ llama_perf_context_print: load time = 2344.79 ms
170
+ llama_perf_context_print: prompt eval time = 49162.82 ms / 32768 tokens ( 1.50 ms per token, 666.52 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 49605.37 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16887 + (3935 = 3351 + 40 + 544) + 3292 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.18 GiB",
13
+ "t/s": "131.55 \u00b1 3.54",
14
+ "test": "pp8",
15
+ "tps_value": 131.55
16
+ },
17
+ "test": "pp8",
18
+ "tps": 131.55
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_code.log",
23
+ "ppl": 1.3062,
24
+ "ppl_error": 0.00708
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_general.log",
28
+ "ppl": 6.3521,
29
+ "ppl_error": 0.12931
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_math.log",
33
+ "ppl": 5.7063,
34
+ "ppl_error": 0.10414
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 0.9013,
40
+ "bench_tps": 131.55,
41
+ "file_size_bytes": 17379850272,
42
+ "file_size_gb": 16.19
43
+ }
44
+ }
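Every variant directory pairs a `bench_metrics.json` like the one above with the logs it references, so variants can be compared from the `summary` block alone. A small sketch under that assumption (the field names are taken from the JSON above; the root directory is whatever has been downloaded locally):

```python
# Sketch: rank local quant variants by the summary fields of their bench_metrics.json.
import json
from pathlib import Path

def load_summaries(root: str) -> list[dict]:
    rows = []
    for metrics_file in Path(root).glob("*/bench_metrics.json"):
        summary = json.loads(metrics_file.read_text())["summary"]
        rows.append({
            "variant": metrics_file.parent.name,
            "size_gb": summary["file_size_gb"],
            "tps": summary["bench_tps"],
            "avg_prec_loss_pct": summary["avg_prec_loss_pct"],
        })
    # Lowest precision loss first, then highest throughput.
    return sorted(rows, key=lambda r: (r["avg_prec_loss_pct"], -r["tps"]))

if __name__ == "__main__":
    for row in load_summaries("Benchmarks/DataCollection"):
        print(row)
```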
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/llamabench.md ADDED
@@ -0,0 +1,11 @@
 
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.18 GiB | 30.53 B | CUDA | 35 | pp8 | 131.55 ± 3.54 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.18 GiB | 30.53 B | CUDA | 35 | tg128 | 45.73 ± 0.43 |
10
+
11
+ build: 92bb442ad (7040)
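The two table rows record llama-bench's pp8 (prompt processing, 8 tokens) and tg128 (128 generated tokens) throughput at 35 offloaded layers. If you want those numbers without eyeballing the markdown, here is a sketch that only assumes the seven-column layout shown above:

```python
# Sketch: read test name -> tokens/sec out of a llamabench.md table like the one above.
from pathlib import Path

def parse_llamabench(md_path: str) -> dict[str, float]:
    results = {}
    for line in Path(md_path).read_text(encoding="utf-8").splitlines():
        cells = [c.strip() for c in line.strip().strip("|").split("|")]
        # Keep only 7-column data rows; skip the header and the dashed separator row.
        if len(cells) == 7 and cells[0] != "model" and not set(cells[0]) <= {"-", ":", " "}:
            results[cells[5]] = float(cells[6].split("±")[0])  # drop the "± stddev" part
    return results

if __name__ == "__main__":
    print(parse_llamabench("llamabench.md"))  # e.g. {'pp8': 131.55, 'tg128': 45.73}
```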
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20926 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 616.05 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 115.016 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.34 seconds per pass - ETA 2.43 minutes
166
+ [1]1.5133,[2]1.4614,[3]1.2882,[4]1.2486,[5]1.3366,[6]1.4002,[7]1.4030,[8]1.4009,[9]1.3609,[10]1.3379,[11]1.3233,[12]1.3253,[13]1.3099,[14]1.3009,[15]1.2970,[16]1.2853,[17]1.2797,[18]1.2787,[19]1.2712,[20]1.2618,[21]1.2590,[22]1.2592,[23]1.2751,[24]1.2680,[25]1.2662,[26]1.2575,[27]1.2521,[28]1.2512,[29]1.2638,[30]1.2657,[31]1.2592,[32]1.2541,[33]1.2550,[34]1.2541,[35]1.2530,[36]1.2740,[37]1.2835,[38]1.2883,[39]1.2951,[40]1.2963,[41]1.2929,[42]1.3063,[43]1.3059,[44]1.3062,
167
+ Final estimate: PPL = 1.3062 +/- 0.00708
168
+
169
+ llama_perf_context_print: load time = 4892.48 ms
170
+ llama_perf_context_print: prompt eval time = 124979.07 ms / 90112 tokens ( 1.39 ms per token, 721.02 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 126196.66 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16810 + (4007 = 3351 + 40 + 616) + 3297 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
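Perplexity here is the exponential of the mean per-token negative log-likelihood, so the final estimates above translate directly into nats or bits per token; the code-corpus figure of 1.3062 works out to roughly 0.27 nats (about 0.39 bits) per token. A one-line check:

```python
# PPL = exp(mean NLL per token), so the code-log estimate converts to nats/bits per token.
import math

ppl = 1.3062                 # "Final estimate" from the code log above
nats = math.log(ppl)         # ≈ 0.267 nats per token
bits = nats / math.log(2)    # ≈ 0.386 bits per token
print(f"{nats:.3f} nats/token, {bits:.3f} bits/token")
```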
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20929 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 616.05 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 48.552 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.34 seconds per pass - ETA 0.83 minutes
166
+ [1]5.4240,[2]6.4552,[3]6.8390,[4]6.7454,[5]6.6753,[6]5.7589,[7]5.2620,[8]5.2916,[9]5.5976,[10]5.7269,[11]5.7740,[12]6.0750,[13]6.1452,[14]6.2766,[15]6.3521,
167
+ Final estimate: PPL = 6.3521 +/- 0.12931
168
+
169
+ llama_perf_context_print: load time = 2322.23 ms
170
+ llama_perf_context_print: prompt eval time = 45246.07 ms / 30720 tokens ( 1.47 ms per token, 678.95 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 45672.70 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16810 + (4007 = 3351 + 40 + 616) + 3297 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20926 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 616.05 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 44.117 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.37 seconds per pass - ETA 0.88 minutes
166
+ [1]4.5758,[2]4.9793,[3]5.2509,[4]5.4211,[5]5.6307,[6]5.6351,[7]5.6187,[8]5.5700,[9]5.6239,[10]5.6121,[11]5.6193,[12]5.6142,[13]5.6899,[14]5.7023,[15]5.6984,[16]5.7063,
167
+ Final estimate: PPL = 5.7063 +/- 0.10414
168
+
169
+ llama_perf_context_print: load time = 2343.02 ms
170
+ llama_perf_context_print: prompt eval time = 49355.25 ms / 32768 tokens ( 1.51 ms per token, 663.92 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 49799.01 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16809 + (4007 = 3351 + 40 + 616) + 3298 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.07 GiB",
13
+ "t/s": "140.60 \u00b1 5.97",
14
+ "test": "pp8",
15
+ "tps_value": 140.6
16
+ },
17
+ "test": "pp8",
18
+ "tps": 140.6
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log",
23
+ "ppl": 1.3094,
24
+ "ppl_error": 0.00712
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log",
28
+ "ppl": 6.5314,
29
+ "ppl_error": 0.13444
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log",
33
+ "ppl": 5.8549,
34
+ "ppl_error": 0.10805
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 2.8064,
40
+ "bench_tps": 140.6,
41
+ "file_size_bytes": 17263163424,
42
+ "file_size_gb": 16.08
43
+ }
44
+ }
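The bench_metrics.json entries above simply aggregate the "Final estimate: PPL = … +/- …" lines from the three perplexity logs and the pp8 row of llamabench.md. As a hedged illustration only (this is not the repository's actual collection script; the paths, function names, and the `base` placeholder are hypothetical), a minimal parser for those two formats could look like this:

```python
# Hypothetical helper, not part of this repo's tooling. It only assumes the
# literal line formats visible in the files above:
#   perplexity_*.log -> "Final estimate: PPL = 6.5314 +/- 0.13444"
#   llamabench.md    -> "| ... | 35 | pp8 | 140.60 ± 5.97 |"
import re
from pathlib import Path

def read_ppl(log_path: str) -> dict:
    """Extract the final PPL estimate and its error from a perplexity log."""
    text = Path(log_path).read_text(encoding="utf-8")
    m = re.search(r"Final estimate: PPL = ([0-9.]+) \+/- ([0-9.]+)", text)
    if m is None:
        raise ValueError(f"no final PPL estimate in {log_path}")
    return {"log_path": log_path, "ppl": float(m.group(1)), "ppl_error": float(m.group(2))}

def read_pp8_tps(md_path: str) -> float:
    """Extract the pp8 tokens/sec mean from a llama-bench markdown table."""
    for line in Path(md_path).read_text(encoding="utf-8").splitlines():
        cells = [c.strip() for c in line.split("|")]
        if "pp8" in cells:
            # the t/s cell looks like "140.60 ± 5.97"; keep the mean only
            return float(cells[-2].split()[0])
    raise ValueError(f"no pp8 row in {md_path}")

if __name__ == "__main__":
    base = "some-quant-directory"  # placeholder for one of the DataCollection folders
    metrics = {
        "perplexity": {k: read_ppl(f"{base}/perplexity_{k}.log")
                       for k in ("code", "general", "math")},
        "bench_tps": read_pp8_tps(f"{base}/llamabench.md"),
    }
    print(metrics)
```

Run against one of the directories shown here, this would reproduce the `ppl`, `ppl_error`, and `bench_tps` values recorded in the corresponding bench_metrics.json.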
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.07 GiB | 30.53 B | CUDA | 35 | pp8 | 140.60 ± 5.97 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.07 GiB | 30.53 B | CUDA | 35 | tg128 | 50.90 ± 0.12 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20671 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.07 GiB (4.52 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 117.87 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.31 seconds per pass - ETA 2.42 minutes
166
+ [1]1.5203,[2]1.4651,[3]1.2904,[4]1.2484,[5]1.3391,[6]1.4023,[7]1.4060,[8]1.4050,[9]1.3644,[10]1.3415,[11]1.3268,[12]1.3283,[13]1.3123,[14]1.3032,[15]1.2987,[16]1.2873,[17]1.2816,[18]1.2804,[19]1.2729,[20]1.2636,[21]1.2605,[22]1.2608,[23]1.2769,[24]1.2698,[25]1.2677,[26]1.2590,[27]1.2536,[28]1.2528,[29]1.2654,[30]1.2671,[31]1.2607,[32]1.2556,[33]1.2566,[34]1.2560,[35]1.2550,[36]1.2767,[37]1.2862,[38]1.2911,[39]1.2982,[40]1.2997,[41]1.2962,[42]1.3097,[43]1.3091,[44]1.3094,
167
+ Final estimate: PPL = 1.3094 +/- 0.00712
168
+
169
+ llama_perf_context_print: load time = 2353.55 ms
170
+ llama_perf_context_print: prompt eval time = 123585.53 ms / 90112 tokens ( 1.37 ms per token, 729.15 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 124806.28 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16703 + (3859 = 3351 + 40 + 467) + 3552 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20663 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.07 GiB (4.52 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 50.065 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.26 seconds per pass - ETA 0.80 minutes
166
+ [1]5.5363,[2]6.5899,[3]7.0472,[4]6.9218,[5]6.8426,[6]5.8956,[7]5.3900,[8]5.4252,[9]5.7401,[10]5.8727,[11]5.9266,[12]6.2397,[13]6.3078,[14]6.4487,[15]6.5314,
167
+ Final estimate: PPL = 6.5314 +/- 0.13444
168
+
169
+ llama_perf_context_print: load time = 2400.47 ms
170
+ llama_perf_context_print: prompt eval time = 44875.22 ms / 30720 tokens ( 1.46 ms per token, 684.56 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 45305.33 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16703 + (3859 = 3351 + 40 + 467) + 3552 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20669 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.07 GiB (4.52 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 44.494 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.35 seconds per pass - ETA 0.88 minutes
166
+ [1]4.6628,[2]5.0886,[3]5.3551,[4]5.5435,[5]5.7587,[6]5.7679,[7]5.7713,[8]5.7213,[9]5.7670,[10]5.7605,[11]5.7618,[12]5.7592,[13]5.8437,[14]5.8542,[15]5.8479,[16]5.8549,
167
+ Final estimate: PPL = 5.8549 +/- 0.10805
168
+
169
+ llama_perf_context_print: load time = 2442.75 ms
170
+ llama_perf_context_print: prompt eval time = 48979.64 ms / 32768 tokens ( 1.49 ms per token, 669.01 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 49425.17 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16703 + (3859 = 3351 + 40 + 467) + 3552 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.11 GiB",
13
+ "t/s": "143.24 \u00b1 4.42",
14
+ "test": "pp8",
15
+ "tps_value": 143.24
16
+ },
17
+ "test": "pp8",
18
+ "tps": 143.24
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log",
23
+ "ppl": 1.3091,
24
+ "ppl_error": 0.00711
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log",
28
+ "ppl": 6.5299,
29
+ "ppl_error": 0.13434
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log",
33
+ "ppl": 5.8552,
34
+ "ppl_error": 0.10806
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 2.7924,
40
+ "bench_tps": 143.24,
41
+ "file_size_bytes": 17304490016,
42
+ "file_size_gb": 16.12
43
+ }
44
+ }
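These files do not spell out how the `avg_prec_loss_pct` summary field is computed; it reads like the mean percent PPL increase of this quant over some reference quant, averaged across the code, general, and math runs. A minimal sketch under that assumption only (the reference values are not part of this section, and the helper name is hypothetical):

```python
# Assumption, not documented in these files: average percent perplexity
# degradation of a quant relative to a reference quant, over the three tasks.
def avg_prec_loss_pct(quant_ppl: dict, reference_ppl: dict) -> float:
    """Mean percent PPL increase relative to a reference quant."""
    losses = [
        (quant_ppl[k] - reference_ppl[k]) / reference_ppl[k] * 100.0
        for k in ("code", "general", "math")
    ]
    return round(sum(losses) / len(losses), 4)
```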
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.11 GiB | 30.53 B | CUDA | 35 | pp8 | 143.24 ± 4.42 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.11 GiB | 30.53 B | CUDA | 35 | tg128 | 53.04 ± 0.43 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20673 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 120.29 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.29 seconds per pass - ETA 2.40 minutes
166
+ [1]1.5278,[2]1.4668,[3]1.2915,[4]1.2495,[5]1.3401,[6]1.4029,[7]1.4067,[8]1.4054,[9]1.3647,[10]1.3419,[11]1.3268,[12]1.3283,[13]1.3125,[14]1.3034,[15]1.2991,[16]1.2875,[17]1.2820,[18]1.2811,[19]1.2736,[20]1.2642,[21]1.2612,[22]1.2615,[23]1.2772,[24]1.2701,[25]1.2680,[26]1.2592,[27]1.2538,[28]1.2530,[29]1.2656,[30]1.2672,[31]1.2609,[32]1.2558,[33]1.2568,[34]1.2561,[35]1.2551,[36]1.2768,[37]1.2863,[38]1.2912,[39]1.2983,[40]1.2996,[41]1.2959,[42]1.3094,[43]1.3088,[44]1.3091,
167
+ Final estimate: PPL = 1.3091 +/- 0.00711
168
+
169
+ llama_perf_context_print: load time = 2322.79 ms
170
+ llama_perf_context_print: prompt eval time = 123226.59 ms / 90112 tokens ( 1.37 ms per token, 731.27 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 124445.92 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16704 + (3859 = 3351 + 40 + 467) + 3551 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20675 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 48.333 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.22 seconds per pass - ETA 0.80 minutes
166
+ [1]5.5268,[2]6.5916,[3]7.0599,[4]6.9248,[5]6.8390,[6]5.8915,[7]5.3858,[8]5.4210,[9]5.7348,[10]5.8667,[11]5.9198,[12]6.2353,[13]6.3051,[14]6.4444,[15]6.5299,
167
+ Final estimate: PPL = 6.5299 +/- 0.13434
168
+
169
+ llama_perf_context_print: load time = 2330.41 ms
170
+ llama_perf_context_print: prompt eval time = 44414.37 ms / 30720 tokens ( 1.45 ms per token, 691.67 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 44837.11 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16713 + (3859 = 3351 + 40 + 467) + 3542 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20665 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 45.264 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.35 seconds per pass - ETA 0.88 minutes
166
+ [1]4.6592,[2]5.0932,[3]5.3619,[4]5.5518,[5]5.7639,[6]5.7737,[7]5.7765,[8]5.7245,[9]5.7687,[10]5.7621,[11]5.7632,[12]5.7604,[13]5.8453,[14]5.8550,[15]5.8489,[16]5.8552,
167
+ Final estimate: PPL = 5.8552 +/- 0.10806
168
+
169
+ llama_perf_context_print: load time = 2341.69 ms
170
+ llama_perf_context_print: prompt eval time = 49041.53 ms / 32768 tokens ( 1.50 ms per token, 668.17 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 49486.92 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16702 + (3859 = 3351 + 40 + 467) + 3554 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.18 GiB",
13
+ "t/s": "147.38 \u00b1 9.26",
14
+ "test": "pp8",
15
+ "tps_value": 147.38
16
+ },
17
+ "test": "pp8",
18
+ "tps": 147.38
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log",
23
+ "ppl": 1.3092,
24
+ "ppl_error": 0.0071
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log",
28
+ "ppl": 6.527,
29
+ "ppl_error": 0.13429
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log",
33
+ "ppl": 5.8546,
34
+ "ppl_error": 0.10803
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 2.7761,
40
+ "bench_tps": 147.38,
41
+ "file_size_bytes": 17379850272,
42
+ "file_size_gb": 16.19
43
+ }
44
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.18 GiB | 30.53 B | CUDA | 35 | pp8 | 147.38 ± 9.26 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.18 GiB | 30.53 B | CUDA | 35 | tg128 | 53.39 ± 0.70 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20666 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 115.443 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.29 seconds per pass - ETA 2.40 minutes
166
+ [1]1.5173,[2]1.4633,[3]1.2894,[4]1.2472,[5]1.3385,[6]1.4012,[7]1.4050,[8]1.4039,[9]1.3630,[10]1.3404,[11]1.3259,[12]1.3272,[13]1.3114,[14]1.3024,[15]1.2981,[16]1.2869,[17]1.2810,[18]1.2799,[19]1.2726,[20]1.2632,[21]1.2603,[22]1.2606,[23]1.2765,[24]1.2694,[25]1.2674,[26]1.2587,[27]1.2532,[28]1.2523,[29]1.2650,[30]1.2667,[31]1.2604,[32]1.2554,[33]1.2565,[34]1.2557,[35]1.2548,[36]1.2766,[37]1.2860,[38]1.2910,[39]1.2981,[40]1.2996,[41]1.2959,[42]1.3094,[43]1.3089,[44]1.3092,
167
+ Final estimate: PPL = 1.3092 +/- 0.00710
168
+
169
+ llama_perf_context_print: load time = 2330.47 ms
170
+ llama_perf_context_print: prompt eval time = 123459.47 ms / 90112 tokens ( 1.37 ms per token, 729.89 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 124970.84 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16692 + (3859 = 3351 + 40 + 467) + 3563 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20669 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 46.314 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.24 seconds per pass - ETA 0.80 minutes
166
+ [1]5.5380,[2]6.6025,[3]7.0556,[4]6.9240,[5]6.8321,[6]5.8898,[7]5.3840,[8]5.4182,[9]5.7337,[10]5.8682,[11]5.9228,[12]6.2366,[13]6.3046,[14]6.4432,[15]6.5270,
167
+ Final estimate: PPL = 6.5270 +/- 0.13429
168
+
169
+ llama_perf_context_print: load time = 2539.66 ms
170
+ llama_perf_context_print: prompt eval time = 44703.27 ms / 30720 tokens ( 1.46 ms per token, 687.20 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 45125.62 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16695 + (3859 = 3351 + 40 + 467) + 3561 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20663 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 45.982 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.37 seconds per pass - ETA 0.88 minutes
166
+ [1]4.6504,[2]5.0826,[3]5.3555,[4]5.5459,[5]5.7597,[6]5.7685,[7]5.7712,[8]5.7222,[9]5.7676,[10]5.7611,[11]5.7628,[12]5.7604,[13]5.8451,[14]5.8550,[15]5.8482,[16]5.8546,
167
+ Final estimate: PPL = 5.8546 +/- 0.10803
168
+
169
+ llama_perf_context_print: load time = 2562.13 ms
170
+ llama_perf_context_print: prompt eval time = 49239.31 ms / 32768 tokens ( 1.50 ms per token, 665.48 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 49718.33 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16693 + (3859 = 3351 + 40 + 467) + 3563 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
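The `ppl` and `ppl_error` fields in the accompanying bench_metrics.json files correspond to the `Final estimate: PPL = X +/- Y` line near the end of each perplexity log above. Below is a minimal sketch of extracting that pair from a log; the helper name and approach are assumptions for illustration, not part of this repo's tooling.

```python
import re

# Assumed helper (not from this repo): pull the final perplexity and its
# reported error out of a llama-perplexity log such as perplexity_math.log.
PPL_RE = re.compile(r"Final estimate: PPL = ([0-9.]+) \+/- ([0-9.]+)")

def read_final_ppl(log_path: str) -> tuple[float, float]:
    """Return (ppl, ppl_error) from the 'Final estimate' line of a log."""
    with open(log_path, encoding="utf-8") as f:
        for line in f:
            m = PPL_RE.search(line)
            if m:
                return float(m.group(1)), float(m.group(2))
    raise ValueError(f"no final PPL estimate found in {log_path}")

# e.g. read_final_ppl("perplexity_math.log") -> (5.8546, 0.10803) for the log above
```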
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/llamabench.md",
6
+ "ngl": "30",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "30",
11
+ "params": "30.53 B",
12
+ "size": "56.89 GiB",
13
+ "t/s": "51.98 \u00b1 2.02",
14
+ "test": "pp8",
15
+ "tps_value": 51.98
16
+ },
17
+ "test": "pp8",
18
+ "tps": 51.98
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_code.log",
23
+ "ppl": 1.2903,
24
+ "ppl_error": 0.00687
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_general.log",
28
+ "ppl": 6.2878,
29
+ "ppl_error": 0.1285
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_math.log",
33
+ "ppl": 5.6808,
34
+ "ppl_error": 0.10471
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 0.0,
40
+ "bench_tps": 51.98,
41
+ "file_size_bytes": 61095802912,
42
+ "file_size_gb": 56.9
43
+ }
44
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 56.89 GiB | 30.53 B | CUDA | 30 | pp8 | 51.98 ± 2.02 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 56.89 GiB | 30.53 B | CUDA | 30 | tg128 | 16.62 ± 0.03 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_code.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20791 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type bf16: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 56.89 GiB (16.01 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 34479.47 MiB
126
+ load_tensors: CUDA0 model buffer size = 11890.17 MiB
127
+ load_tensors: CUDA1 model buffer size = 11890.17 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 894.25 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 110.9 ms
163
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 9.77 seconds per pass - ETA 7.15 minutes
165
+ [1]1.4786,[2]1.4104,[3]1.2579,[4]1.2238,[5]1.3138,[6]1.3756,[7]1.3806,[8]1.3798,[9]1.3416,[10]1.3201,[11]1.3058,[12]1.3073,[13]1.2927,[14]1.2844,[15]1.2775,[16]1.2673,[17]1.2611,[18]1.2588,[19]1.2523,[20]1.2429,[21]1.2407,[22]1.2417,[23]1.2584,[24]1.2520,[25]1.2492,[26]1.2413,[27]1.2364,[28]1.2355,[29]1.2479,[30]1.2498,[31]1.2438,[32]1.2389,[33]1.2396,[34]1.2390,[35]1.2383,[36]1.2590,[37]1.2686,[38]1.2732,[39]1.2799,[40]1.2804,[41]1.2772,[42]1.2901,[43]1.2899,[44]1.2903,
166
+ Final estimate: PPL = 1.2903 +/- 0.00687
167
+
168
+ llama_perf_context_print: load time = 7194.73 ms
169
+ llama_perf_context_print: prompt eval time = 386399.67 ms / 90112 tokens ( 4.29 ms per token, 233.21 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 387959.02 ms / 90113 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 7760 + (12824 = 11890 + 40 + 894) + 3530 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 11454 + (12012 = 11890 + 40 + 82) + 657 |
176
+ llama_memory_breakdown_print: | - Host | 34599 = 34479 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_general.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20798 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type bf16: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 56.89 GiB (16.01 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 34479.47 MiB
126
+ load_tensors: CUDA0 model buffer size = 11890.17 MiB
127
+ load_tensors: CUDA1 model buffer size = 11890.17 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 894.25 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 47.691 ms
163
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 9.62 seconds per pass - ETA 2.40 minutes
165
+ [1]5.3521,[2]6.3539,[3]6.7273,[4]6.6668,[5]6.5895,[6]5.6932,[7]5.2097,[8]5.2274,[9]5.5344,[10]5.6618,[11]5.7134,[12]6.0129,[13]6.0809,[14]6.2123,[15]6.2878,
166
+ Final estimate: PPL = 6.2878 +/- 0.12850
167
+
168
+ llama_perf_context_print: load time = 7182.82 ms
169
+ llama_perf_context_print: prompt eval time = 139724.78 ms / 30720 tokens ( 4.55 ms per token, 219.86 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 140144.23 ms / 30721 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 7763 + (12824 = 11890 + 40 + 894) + 3527 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 11454 + (12012 = 11890 + 40 + 82) + 657 |
176
+ llama_memory_breakdown_print: | - Host | 34599 = 34479 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_math.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20787 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type bf16: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 56.89 GiB (16.01 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 34479.47 MiB
126
+ load_tensors: CUDA0 model buffer size = 11890.17 MiB
127
+ load_tensors: CUDA1 model buffer size = 11890.17 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 894.25 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 46.364 ms
163
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 9.95 seconds per pass - ETA 2.65 minutes
165
+ [1]4.6325,[2]5.0040,[3]5.2627,[4]5.4294,[5]5.6199,[6]5.6158,[7]5.6044,[8]5.5563,[9]5.6047,[10]5.5880,[11]5.5970,[12]5.5936,[13]5.6653,[14]5.6749,[15]5.6701,[16]5.6808,
166
+ Final estimate: PPL = 5.6808 +/- 0.10471
167
+
168
+ llama_perf_context_print: load time = 7238.09 ms
169
+ llama_perf_context_print: prompt eval time = 153467.89 ms / 32768 tokens ( 4.68 ms per token, 213.52 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 154260.57 ms / 32769 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 7767 + (12824 = 11890 + 40 + 894) + 3523 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 11454 + (12012 = 11890 + 40 + 82) + 657 |
176
+ llama_memory_breakdown_print: | - Host | 34599 = 34479 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.04 GiB",
13
+ "t/s": "143.64 \u00b1 9.45",
14
+ "test": "pp8",
15
+ "tps_value": 143.64
16
+ },
17
+ "test": "pp8",
18
+ "tps": 143.64
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log",
23
+ "ppl": 1.3098,
24
+ "ppl_error": 0.00712
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log",
28
+ "ppl": 6.5252,
29
+ "ppl_error": 0.13424
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log",
33
+ "ppl": 5.8597,
34
+ "ppl_error": 0.10817
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 2.812,
40
+ "bench_tps": 143.64,
41
+ "file_size_bytes": 17224267808,
42
+ "file_size_gb": 16.04
43
+ }
44
+ }
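The `avg_prec_loss_pct` summary value of 2.812 for this all-IQ4_NL quant is consistent with averaging its relative perplexity increase over the all-BF16 reference above (code 1.2903, general 6.2878, math 5.6808) across the three domains. The sketch below reproduces that figure; it is an assumption about how the summary is computed, not a confirmed excerpt of the benchmark script.

```python
# Assumed reconstruction of avg_prec_loss_pct: mean relative PPL increase (in %)
# of a candidate quant versus the BF16 reference, over the three perplexity domains.
BASELINE = {"code": 1.2903, "general": 6.2878, "math": 5.6808}   # all-BF16 run above
CANDIDATE = {"code": 1.3098, "general": 6.5252, "math": 5.8597}  # all-IQ4_NL run above

def avg_prec_loss_pct(candidate: dict, baseline: dict) -> float:
    losses = [
        (candidate[d] - baseline[d]) / baseline[d] * 100.0  # % PPL increase per domain
        for d in baseline
    ]
    return round(sum(losses) / len(losses), 3)

print(avg_prec_loss_pct(CANDIDATE, BASELINE))  # -> 2.812, matching the JSON summary
```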
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.04 GiB | 30.53 B | CUDA | 35 | pp8 | 143.64 ± 9.45 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.04 GiB | 30.53 B | CUDA | 35 | tg128 | 50.81 ± 0.75 |
10
+
11
+ build: 92bb442ad (7040)
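bench_metrics.json records its `tps` value from the `pp8` row of this llama-bench table. A minimal sketch of reading that value back out of llamabench.md follows; the parser name and column handling are assumptions, not repo code.

```python
# Assumed helper (not from this repo): return the pp8 throughput from a
# llama-bench markdown table like the one above.
def pp8_tps(md_path: str) -> float:
    with open(md_path, encoding="utf-8") as f:
        for line in f:
            cells = [c.strip() for c in line.split("|") if c.strip()]
            # columns: model | size | params | backend | ngl | test | t/s
            if len(cells) >= 7 and cells[5] == "pp8":
                return float(cells[6].split("±")[0])  # "143.64 ± 9.45" -> 143.64
    raise ValueError(f"no pp8 row found in {md_path}")

# e.g. pp8_tps("llamabench.md") -> 143.64 for the table above
```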
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20789 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type iq4_nl: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 16.04 GiB (4.51 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 9717.82 MiB
126
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
127
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 113.985 ms
163
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 3.36 seconds per pass - ETA 2.45 minutes
165
+ [1]1.5216,[2]1.4688,[3]1.2927,[4]1.2509,[5]1.3411,[6]1.4040,[7]1.4075,[8]1.4065,[9]1.3658,[10]1.3428,[11]1.3276,[12]1.3292,[13]1.3133,[14]1.3040,[15]1.3000,[16]1.2883,[17]1.2826,[18]1.2817,[19]1.2742,[20]1.2648,[21]1.2618,[22]1.2620,[23]1.2779,[24]1.2707,[25]1.2686,[26]1.2598,[27]1.2544,[28]1.2535,[29]1.2662,[30]1.2680,[31]1.2615,[32]1.2564,[33]1.2575,[34]1.2567,[35]1.2557,[36]1.2772,[37]1.2866,[38]1.2916,[39]1.2987,[40]1.3002,[41]1.2966,[42]1.3101,[43]1.3095,[44]1.3098,
166
+ Final estimate: PPL = 1.3098 +/- 0.00712
167
+
168
+ llama_perf_context_print: load time = 2525.79 ms
169
+ llama_perf_context_print: prompt eval time = 126559.42 ms / 90112 tokens ( 1.40 ms per token, 712.01 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 127840.44 ms / 90113 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16883 + (3859 = 3351 + 40 + 467) + 3373 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
176
+ llama_memory_breakdown_print: | - Host | 9837 = 9717 + 112 + 8 |
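
The running values `[1]1.5216, …, [44]1.3098` in the code-perplexity log above are cumulative per-chunk estimates; the final line reports the perplexity over all 44 chunks. As a minimal sketch of the conventional definition (an assumption that this matches the token-level formulation used by the perplexity tool, not something restated in the log), with N scored tokens and model probability p_i for the i-th reference token:

$$
\mathrm{PPL} = \exp\left(-\frac{1}{N}\sum_{i=1}^{N}\ln p_i\right)
$$

The `+/-` value on the "Final estimate" line is the reported uncertainty of that estimate.
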
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20856 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type iq4_nl: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 16.04 GiB (4.51 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 9717.82 MiB
126
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
127
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 50.436 ms
163
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 3.30 seconds per pass - ETA 0.82 minutes
165
+ [1]5.5164,[2]6.5847,[3]7.0380,[4]6.9090,[5]6.8306,[6]5.8839,[7]5.3798,[8]5.4143,[9]5.7279,[10]5.8593,[11]5.9151,[12]6.2282,[13]6.2985,[14]6.4406,[15]6.5252,
166
+ Final estimate: PPL = 6.5252 +/- 0.13424
167
+
168
+ llama_perf_context_print: load time = 2632.70 ms
169
+ llama_perf_context_print: prompt eval time = 45810.21 ms / 30720 tokens ( 1.49 ms per token, 670.59 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 46251.63 ms / 30721 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16832 + (3859 = 3351 + 40 + 467) + 3423 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
176
+ llama_memory_breakdown_print: | - Host | 9837 = 9717 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20838 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type iq4_nl: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 16.04 GiB (4.51 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 9717.82 MiB
126
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
127
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 44.471 ms
163
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 3.39 seconds per pass - ETA 0.90 minutes
165
+ [1]4.6492,[2]5.0826,[3]5.3607,[4]5.5500,[5]5.7673,[6]5.7750,[7]5.7793,[8]5.7272,[9]5.7721,[10]5.7634,[11]5.7653,[12]5.7635,[13]5.8485,[14]5.8593,[15]5.8536,[16]5.8597,
166
+ Final estimate: PPL = 5.8597 +/- 0.10817
167
+
168
+ llama_perf_context_print: load time = 2501.90 ms
169
+ llama_perf_context_print: prompt eval time = 50382.21 ms / 32768 tokens ( 1.54 ms per token, 650.39 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 50841.87 ms / 32769 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16728 + (3859 = 3351 + 40 + 467) + 3528 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
176
+ llama_memory_breakdown_print: | - Host | 9837 = 9717 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.06 GiB",
13
+ "t/s": "153.05 \u00b1 4.67",
14
+ "test": "pp8",
15
+ "tps_value": 153.05
16
+ },
17
+ "test": "pp8",
18
+ "tps": 153.05
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log",
23
+ "ppl": 1.3056,
24
+ "ppl_error": 0.00706
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log",
28
+ "ppl": 6.3772,
29
+ "ppl_error": 0.13014
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log",
33
+ "ppl": 5.7351,
34
+ "ppl_error": 0.1051
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 1.1878,
40
+ "bench_tps": 153.05,
41
+ "file_size_bytes": 17253439520,
42
+ "file_size_gb": 16.07
43
+ }
44
+ }
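
A rough sketch of how the `summary` block above could be reproduced from `raw_metrics` (this is an assumption about the aggregation, not the repository's own collection script; the baseline perplexities below are hypothetical placeholders, since the full-precision reference values are not part of this file):

```python
# Sketch only: plausible derivation of the summary fields (assumed, not the
# actual benchmark pipeline). Baseline PPLs are hypothetical placeholders.

quant_ppl = {"code": 1.3056, "general": 6.3772, "math": 5.7351}  # from raw_metrics above
baseline_ppl = {"code": 1.29, "general": 6.31, "math": 5.67}     # hypothetical full-precision reference

def avg_prec_loss_pct(quant: dict, base: dict) -> float:
    """Mean percentage increase in perplexity across the three domains."""
    losses = [(quant[k] - base[k]) / base[k] * 100.0 for k in quant]
    return round(sum(losses) / len(losses), 4)

# File size in GiB follows directly from the byte count recorded above.
file_size_gb = round(17_253_439_520 / 1024**3, 2)  # -> 16.07, matching "file_size_gb"

print(avg_prec_loss_pct(quant_ppl, baseline_ppl), file_size_gb)
```
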
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.06 GiB | 30.53 B | CUDA | 35 | pp8 | 153.05 ± 4.67 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.06 GiB | 30.53 B | CUDA | 35 | tg128 | 50.86 ± 0.32 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log ADDED
@@ -0,0 +1,178 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20668 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round1_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 336 tensors
53
+ llama_model_loader: - type mxfp4: 1 tensors
54
+ print_info: file format = GGUF V3 (latest)
55
+ print_info: file type = IQ4_NL - 4.5 bpw
56
+ print_info: file size = 16.06 GiB (4.52 BPW)
57
+ load: printing all EOG tokens:
58
+ load: - 151643 ('<|endoftext|>')
59
+ load: - 151645 ('<|im_end|>')
60
+ load: - 151662 ('<|fim_pad|>')
61
+ load: - 151663 ('<|repo_name|>')
62
+ load: - 151664 ('<|file_sep|>')
63
+ load: special tokens cache size = 26
64
+ load: token to piece cache size = 0.9311 MB
65
+ print_info: arch = qwen3moe
66
+ print_info: vocab_only = 0
67
+ print_info: n_ctx_train = 262144
68
+ print_info: n_embd = 2048
69
+ print_info: n_embd_inp = 2048
70
+ print_info: n_layer = 48
71
+ print_info: n_head = 32
72
+ print_info: n_head_kv = 4
73
+ print_info: n_rot = 128
74
+ print_info: n_swa = 0
75
+ print_info: is_swa_any = 0
76
+ print_info: n_embd_head_k = 128
77
+ print_info: n_embd_head_v = 128
78
+ print_info: n_gqa = 8
79
+ print_info: n_embd_k_gqa = 512
80
+ print_info: n_embd_v_gqa = 512
81
+ print_info: f_norm_eps = 0.0e+00
82
+ print_info: f_norm_rms_eps = 1.0e-06
83
+ print_info: f_clamp_kqv = 0.0e+00
84
+ print_info: f_max_alibi_bias = 0.0e+00
85
+ print_info: f_logit_scale = 0.0e+00
86
+ print_info: f_attn_scale = 0.0e+00
87
+ print_info: n_ff = 6144
88
+ print_info: n_expert = 128
89
+ print_info: n_expert_used = 8
90
+ print_info: n_expert_groups = 0
91
+ print_info: n_group_used = 0
92
+ print_info: causal attn = 1
93
+ print_info: pooling type = 0
94
+ print_info: rope type = 2
95
+ print_info: rope scaling = linear
96
+ print_info: freq_base_train = 10000000.0
97
+ print_info: freq_scale_train = 1
98
+ print_info: n_ctx_orig_yarn = 262144
99
+ print_info: rope_finetuned = unknown
100
+ print_info: model type = 30B.A3B
101
+ print_info: model params = 30.53 B
102
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
103
+ print_info: n_ff_exp = 768
104
+ print_info: vocab type = BPE
105
+ print_info: n_vocab = 151936
106
+ print_info: n_merges = 151387
107
+ print_info: BOS token = 11 ','
108
+ print_info: EOS token = 151645 '<|im_end|>'
109
+ print_info: EOT token = 151645 '<|im_end|>'
110
+ print_info: PAD token = 151654 '<|vision_pad|>'
111
+ print_info: LF token = 198 'Ċ'
112
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
113
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
114
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
115
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
116
+ print_info: FIM REP token = 151663 '<|repo_name|>'
117
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
118
+ print_info: EOG token = 151643 '<|endoftext|>'
119
+ print_info: EOG token = 151645 '<|im_end|>'
120
+ print_info: EOG token = 151662 '<|fim_pad|>'
121
+ print_info: EOG token = 151663 '<|repo_name|>'
122
+ print_info: EOG token = 151664 '<|file_sep|>'
123
+ print_info: max token length = 256
124
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
125
+ load_tensors: offloading 20 repeating layers to GPU
126
+ load_tensors: offloaded 20/49 layers to GPU
127
+ load_tensors: CPU_Mapped model buffer size = 9745.64 MiB
128
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
129
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
130
+ ...................................................................................................
131
+ llama_context: constructing llama_context
132
+ llama_context: n_seq_max = 1
133
+ llama_context: n_ctx = 2048
134
+ llama_context: n_ctx_seq = 2048
135
+ llama_context: n_batch = 2048
136
+ llama_context: n_ubatch = 512
137
+ llama_context: causal_attn = 1
138
+ llama_context: flash_attn = auto
139
+ llama_context: kv_unified = false
140
+ llama_context: freq_base = 10000000.0
141
+ llama_context: freq_scale = 1
142
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
143
+ llama_context: CPU output buffer size = 0.58 MiB
144
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
145
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
147
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
148
+ llama_context: Flash Attention was auto, set to enabled
149
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
150
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
151
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
152
+ llama_context: graph nodes = 3031
153
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
154
+ common_init_from_params: added <|endoftext|> logit bias = -inf
155
+ common_init_from_params: added <|im_end|> logit bias = -inf
156
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
157
+ common_init_from_params: added <|repo_name|> logit bias = -inf
158
+ common_init_from_params: added <|file_sep|> logit bias = -inf
159
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
160
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
161
+
162
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
163
+ perplexity: tokenizing the input ..
164
+ perplexity: tokenization took 112.915 ms
165
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
166
+ perplexity: 3.31 seconds per pass - ETA 2.42 minutes
167
+ [1]1.5133,[2]1.4561,[3]1.2850,[4]1.2452,[5]1.3348,[6]1.3979,[7]1.4007,[8]1.3980,[9]1.3584,[10]1.3355,[11]1.3212,[12]1.3232,[13]1.3079,[14]1.2995,[15]1.2950,[16]1.2837,[17]1.2777,[18]1.2768,[19]1.2694,[20]1.2601,[21]1.2574,[22]1.2577,[23]1.2737,[24]1.2667,[25]1.2652,[26]1.2566,[27]1.2512,[28]1.2505,[29]1.2631,[30]1.2650,[31]1.2585,[32]1.2534,[33]1.2542,[34]1.2534,[35]1.2523,[36]1.2737,[37]1.2830,[38]1.2878,[39]1.2945,[40]1.2956,[41]1.2922,[42]1.3056,[43]1.3053,[44]1.3056,
168
+ Final estimate: PPL = 1.3056 +/- 0.00706
169
+
170
+ llama_perf_context_print: load time = 5258.07 ms
171
+ llama_perf_context_print: prompt eval time = 123618.62 ms / 90112 tokens ( 1.37 ms per token, 728.95 tokens per second)
172
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
173
+ llama_perf_context_print: total time = 124982.52 ms / 90113 tokens
174
+ llama_perf_context_print: graphs reused = 0
175
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
176
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16666 + (3896 = 3351 + 40 + 504) + 3552 |
177
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
178
+ llama_memory_breakdown_print: | - Host | 9865 = 9745 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log ADDED
@@ -0,0 +1,178 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20669 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round1_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 336 tensors
53
+ llama_model_loader: - type mxfp4: 1 tensors
54
+ print_info: file format = GGUF V3 (latest)
55
+ print_info: file type = IQ4_NL - 4.5 bpw
56
+ print_info: file size = 16.06 GiB (4.52 BPW)
57
+ load: printing all EOG tokens:
58
+ load: - 151643 ('<|endoftext|>')
59
+ load: - 151645 ('<|im_end|>')
60
+ load: - 151662 ('<|fim_pad|>')
61
+ load: - 151663 ('<|repo_name|>')
62
+ load: - 151664 ('<|file_sep|>')
63
+ load: special tokens cache size = 26
64
+ load: token to piece cache size = 0.9311 MB
65
+ print_info: arch = qwen3moe
66
+ print_info: vocab_only = 0
67
+ print_info: n_ctx_train = 262144
68
+ print_info: n_embd = 2048
69
+ print_info: n_embd_inp = 2048
70
+ print_info: n_layer = 48
71
+ print_info: n_head = 32
72
+ print_info: n_head_kv = 4
73
+ print_info: n_rot = 128
74
+ print_info: n_swa = 0
75
+ print_info: is_swa_any = 0
76
+ print_info: n_embd_head_k = 128
77
+ print_info: n_embd_head_v = 128
78
+ print_info: n_gqa = 8
79
+ print_info: n_embd_k_gqa = 512
80
+ print_info: n_embd_v_gqa = 512
81
+ print_info: f_norm_eps = 0.0e+00
82
+ print_info: f_norm_rms_eps = 1.0e-06
83
+ print_info: f_clamp_kqv = 0.0e+00
84
+ print_info: f_max_alibi_bias = 0.0e+00
85
+ print_info: f_logit_scale = 0.0e+00
86
+ print_info: f_attn_scale = 0.0e+00
87
+ print_info: n_ff = 6144
88
+ print_info: n_expert = 128
89
+ print_info: n_expert_used = 8
90
+ print_info: n_expert_groups = 0
91
+ print_info: n_group_used = 0
92
+ print_info: causal attn = 1
93
+ print_info: pooling type = 0
94
+ print_info: rope type = 2
95
+ print_info: rope scaling = linear
96
+ print_info: freq_base_train = 10000000.0
97
+ print_info: freq_scale_train = 1
98
+ print_info: n_ctx_orig_yarn = 262144
99
+ print_info: rope_finetuned = unknown
100
+ print_info: model type = 30B.A3B
101
+ print_info: model params = 30.53 B
102
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
103
+ print_info: n_ff_exp = 768
104
+ print_info: vocab type = BPE
105
+ print_info: n_vocab = 151936
106
+ print_info: n_merges = 151387
107
+ print_info: BOS token = 11 ','
108
+ print_info: EOS token = 151645 '<|im_end|>'
109
+ print_info: EOT token = 151645 '<|im_end|>'
110
+ print_info: PAD token = 151654 '<|vision_pad|>'
111
+ print_info: LF token = 198 'Ċ'
112
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
113
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
114
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
115
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
116
+ print_info: FIM REP token = 151663 '<|repo_name|>'
117
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
118
+ print_info: EOG token = 151643 '<|endoftext|>'
119
+ print_info: EOG token = 151645 '<|im_end|>'
120
+ print_info: EOG token = 151662 '<|fim_pad|>'
121
+ print_info: EOG token = 151663 '<|repo_name|>'
122
+ print_info: EOG token = 151664 '<|file_sep|>'
123
+ print_info: max token length = 256
124
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
125
+ load_tensors: offloading 20 repeating layers to GPU
126
+ load_tensors: offloaded 20/49 layers to GPU
127
+ load_tensors: CPU_Mapped model buffer size = 9745.64 MiB
128
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
129
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
130
+ ...................................................................................................
131
+ llama_context: constructing llama_context
132
+ llama_context: n_seq_max = 1
133
+ llama_context: n_ctx = 2048
134
+ llama_context: n_ctx_seq = 2048
135
+ llama_context: n_batch = 2048
136
+ llama_context: n_ubatch = 512
137
+ llama_context: causal_attn = 1
138
+ llama_context: flash_attn = auto
139
+ llama_context: kv_unified = false
140
+ llama_context: freq_base = 10000000.0
141
+ llama_context: freq_scale = 1
142
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
143
+ llama_context: CPU output buffer size = 0.58 MiB
144
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
145
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
147
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
148
+ llama_context: Flash Attention was auto, set to enabled
149
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
150
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
151
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
152
+ llama_context: graph nodes = 3031
153
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
154
+ common_init_from_params: added <|endoftext|> logit bias = -inf
155
+ common_init_from_params: added <|im_end|> logit bias = -inf
156
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
157
+ common_init_from_params: added <|repo_name|> logit bias = -inf
158
+ common_init_from_params: added <|file_sep|> logit bias = -inf
159
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
160
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
161
+
162
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
163
+ perplexity: tokenizing the input ..
164
+ perplexity: tokenization took 48.471 ms
165
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
166
+ perplexity: 3.26 seconds per pass - ETA 0.80 minutes
167
+ [1]5.4538,[2]6.4958,[3]6.8567,[4]6.7701,[5]6.6970,[6]5.7723,[7]5.2801,[8]5.3017,[9]5.6168,[10]5.7496,[11]5.7953,[12]6.0979,[13]6.1691,[14]6.2968,[15]6.3772,
168
+ Final estimate: PPL = 6.3772 +/- 0.13014
169
+
170
+ llama_perf_context_print: load time = 2328.69 ms
171
+ llama_perf_context_print: prompt eval time = 44950.22 ms / 30720 tokens ( 1.46 ms per token, 683.42 tokens per second)
172
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
173
+ llama_perf_context_print: total time = 45519.81 ms / 30721 tokens
174
+ llama_perf_context_print: graphs reused = 0
175
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
176
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16664 + (3896 = 3351 + 40 + 504) + 3554 |
177
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
178
+ llama_memory_breakdown_print: | - Host | 9865 = 9745 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log ADDED
@@ -0,0 +1,178 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20670 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round1_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 336 tensors
53
+ llama_model_loader: - type mxfp4: 1 tensors
54
+ print_info: file format = GGUF V3 (latest)
55
+ print_info: file type = IQ4_NL - 4.5 bpw
56
+ print_info: file size = 16.06 GiB (4.52 BPW)
57
+ load: printing all EOG tokens:
58
+ load: - 151643 ('<|endoftext|>')
59
+ load: - 151645 ('<|im_end|>')
60
+ load: - 151662 ('<|fim_pad|>')
61
+ load: - 151663 ('<|repo_name|>')
62
+ load: - 151664 ('<|file_sep|>')
63
+ load: special tokens cache size = 26
64
+ load: token to piece cache size = 0.9311 MB
65
+ print_info: arch = qwen3moe
66
+ print_info: vocab_only = 0
67
+ print_info: n_ctx_train = 262144
68
+ print_info: n_embd = 2048
69
+ print_info: n_embd_inp = 2048
70
+ print_info: n_layer = 48
71
+ print_info: n_head = 32
72
+ print_info: n_head_kv = 4
73
+ print_info: n_rot = 128
74
+ print_info: n_swa = 0
75
+ print_info: is_swa_any = 0
76
+ print_info: n_embd_head_k = 128
77
+ print_info: n_embd_head_v = 128
78
+ print_info: n_gqa = 8
79
+ print_info: n_embd_k_gqa = 512
80
+ print_info: n_embd_v_gqa = 512
81
+ print_info: f_norm_eps = 0.0e+00
82
+ print_info: f_norm_rms_eps = 1.0e-06
83
+ print_info: f_clamp_kqv = 0.0e+00
84
+ print_info: f_max_alibi_bias = 0.0e+00
85
+ print_info: f_logit_scale = 0.0e+00
86
+ print_info: f_attn_scale = 0.0e+00
87
+ print_info: n_ff = 6144
88
+ print_info: n_expert = 128
89
+ print_info: n_expert_used = 8
90
+ print_info: n_expert_groups = 0
91
+ print_info: n_group_used = 0
92
+ print_info: causal attn = 1
93
+ print_info: pooling type = 0
94
+ print_info: rope type = 2
95
+ print_info: rope scaling = linear
96
+ print_info: freq_base_train = 10000000.0
97
+ print_info: freq_scale_train = 1
98
+ print_info: n_ctx_orig_yarn = 262144
99
+ print_info: rope_finetuned = unknown
100
+ print_info: model type = 30B.A3B
101
+ print_info: model params = 30.53 B
102
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
103
+ print_info: n_ff_exp = 768
104
+ print_info: vocab type = BPE
105
+ print_info: n_vocab = 151936
106
+ print_info: n_merges = 151387
107
+ print_info: BOS token = 11 ','
108
+ print_info: EOS token = 151645 '<|im_end|>'
109
+ print_info: EOT token = 151645 '<|im_end|>'
110
+ print_info: PAD token = 151654 '<|vision_pad|>'
111
+ print_info: LF token = 198 'Ċ'
112
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
113
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
114
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
115
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
116
+ print_info: FIM REP token = 151663 '<|repo_name|>'
117
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
118
+ print_info: EOG token = 151643 '<|endoftext|>'
119
+ print_info: EOG token = 151645 '<|im_end|>'
120
+ print_info: EOG token = 151662 '<|fim_pad|>'
121
+ print_info: EOG token = 151663 '<|repo_name|>'
122
+ print_info: EOG token = 151664 '<|file_sep|>'
123
+ print_info: max token length = 256
124
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
125
+ load_tensors: offloading 20 repeating layers to GPU
126
+ load_tensors: offloaded 20/49 layers to GPU
127
+ load_tensors: CPU_Mapped model buffer size = 9745.64 MiB
128
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
129
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
130
+ ...................................................................................................
131
+ llama_context: constructing llama_context
132
+ llama_context: n_seq_max = 1
133
+ llama_context: n_ctx = 2048
134
+ llama_context: n_ctx_seq = 2048
135
+ llama_context: n_batch = 2048
136
+ llama_context: n_ubatch = 512
137
+ llama_context: causal_attn = 1
138
+ llama_context: flash_attn = auto
139
+ llama_context: kv_unified = false
140
+ llama_context: freq_base = 10000000.0
141
+ llama_context: freq_scale = 1
142
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
143
+ llama_context: CPU output buffer size = 0.58 MiB
144
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
145
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
147
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
148
+ llama_context: Flash Attention was auto, set to enabled
149
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
150
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
151
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
152
+ llama_context: graph nodes = 3031
153
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
154
+ common_init_from_params: added <|endoftext|> logit bias = -inf
155
+ common_init_from_params: added <|im_end|> logit bias = -inf
156
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
157
+ common_init_from_params: added <|repo_name|> logit bias = -inf
158
+ common_init_from_params: added <|file_sep|> logit bias = -inf
159
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
160
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
161
+
162
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
163
+ perplexity: tokenizing the input ..
164
+ perplexity: tokenization took 45.919 ms
165
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
166
+ perplexity: 3.36 seconds per pass - ETA 0.88 minutes
167
+ [1]4.5968,[2]5.0125,[3]5.2798,[4]5.4458,[5]5.6643,[6]5.6658,[7]5.6468,[8]5.5969,[9]5.6531,[10]5.6451,[11]5.6541,[12]5.6482,[13]5.7220,[14]5.7333,[15]5.7271,[16]5.7351,
168
+ Final estimate: PPL = 5.7351 +/- 0.10510
169
+
170
+ llama_perf_context_print: load time = 2390.66 ms
171
+ llama_perf_context_print: prompt eval time = 49186.34 ms / 32768 tokens ( 1.50 ms per token, 666.20 tokens per second)
172
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
173
+ llama_perf_context_print: total time = 49704.78 ms / 32769 tokens
174
+ llama_perf_context_print: graphs reused = 0
175
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
176
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16667 + (3896 = 3351 + 40 + 504) + 3551 |
177
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
178
+ llama_memory_breakdown_print: | - Host | 9865 = 9745 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.17 GiB",
13
+ "t/s": "138.04 \u00b1 8.36",
14
+ "test": "pp8",
15
+ "tps_value": 138.04
16
+ },
17
+ "test": "pp8",
18
+ "tps": 138.04
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_code.log",
23
+ "ppl": 1.3055,
24
+ "ppl_error": 0.00706
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_general.log",
28
+ "ppl": 6.3615,
29
+ "ppl_error": 0.12957
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_math.log",
33
+ "ppl": 5.7044,
34
+ "ppl_error": 0.10406
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 0.9219,
40
+ "bench_tps": 138.04,
41
+ "file_size_bytes": 17370126368,
42
+ "file_size_gb": 16.18
43
+ }
44
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/llamabench.md ADDED
@@ -0,0 +1,11 @@
 
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.17 GiB | 30.53 B | CUDA | 35 | pp8 | 138.04 ± 8.36 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.17 GiB | 30.53 B | CUDA | 35 | tg128 | 44.46 ± 0.42 |
10
+
11
+ build: 92bb442ad (7040)
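The llamabench block inside the neighbouring bench_metrics.json stores a raw_row whose fields line up with the columns of this table (model, size, params, backend, ngl, test, t/s). A minimal parsing sketch for a table in this format; this is a hypothetical helper, not necessarily the script that produced the JSON:

```python
# Hypothetical parser (not the repo's collection script): extract the
# test name and t/s value from a llamabench.md table in the format above.
import re

def parse_llamabench(md_text: str) -> list[dict]:
    rows = []
    for line in md_text.splitlines():
        cells = [c.strip() for c in line.strip().strip("|").split("|")]
        if len(cells) != 7:
            continue                      # not a 7-column table row
        m = re.match(r"^([\d.]+)\s*±", cells[6])
        if not m:
            continue                      # header or separator row ("t/s", "---:")
        rows.append({"model": cells[0], "test": cells[5],
                     "tps_value": float(m.group(1))})
    return rows

# parse_llamabench(open("llamabench.md").read()) ->
# [{'model': 'qwen3moe 30B.A3B IQ4_NL - 4.5 bpw', 'test': 'pp8', 'tps_value': 138.04}, ...]
```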
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_code.log ADDED
@@ -0,0 +1,178 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20658 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round2_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 336 tensors
53
+ llama_model_loader: - type mxfp4: 1 tensors
54
+ print_info: file format = GGUF V3 (latest)
55
+ print_info: file type = IQ4_NL - 4.5 bpw
56
+ print_info: file size = 16.17 GiB (4.55 BPW)
57
+ load: printing all EOG tokens:
58
+ load: - 151643 ('<|endoftext|>')
59
+ load: - 151645 ('<|im_end|>')
60
+ load: - 151662 ('<|fim_pad|>')
61
+ load: - 151663 ('<|repo_name|>')
62
+ load: - 151664 ('<|file_sep|>')
63
+ load: special tokens cache size = 26
64
+ load: token to piece cache size = 0.9311 MB
65
+ print_info: arch = qwen3moe
66
+ print_info: vocab_only = 0
67
+ print_info: n_ctx_train = 262144
68
+ print_info: n_embd = 2048
69
+ print_info: n_embd_inp = 2048
70
+ print_info: n_layer = 48
71
+ print_info: n_head = 32
72
+ print_info: n_head_kv = 4
73
+ print_info: n_rot = 128
74
+ print_info: n_swa = 0
75
+ print_info: is_swa_any = 0
76
+ print_info: n_embd_head_k = 128
77
+ print_info: n_embd_head_v = 128
78
+ print_info: n_gqa = 8
79
+ print_info: n_embd_k_gqa = 512
80
+ print_info: n_embd_v_gqa = 512
81
+ print_info: f_norm_eps = 0.0e+00
82
+ print_info: f_norm_rms_eps = 1.0e-06
83
+ print_info: f_clamp_kqv = 0.0e+00
84
+ print_info: f_max_alibi_bias = 0.0e+00
85
+ print_info: f_logit_scale = 0.0e+00
86
+ print_info: f_attn_scale = 0.0e+00
87
+ print_info: n_ff = 6144
88
+ print_info: n_expert = 128
89
+ print_info: n_expert_used = 8
90
+ print_info: n_expert_groups = 0
91
+ print_info: n_group_used = 0
92
+ print_info: causal attn = 1
93
+ print_info: pooling type = 0
94
+ print_info: rope type = 2
95
+ print_info: rope scaling = linear
96
+ print_info: freq_base_train = 10000000.0
97
+ print_info: freq_scale_train = 1
98
+ print_info: n_ctx_orig_yarn = 262144
99
+ print_info: rope_finetuned = unknown
100
+ print_info: model type = 30B.A3B
101
+ print_info: model params = 30.53 B
102
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
103
+ print_info: n_ff_exp = 768
104
+ print_info: vocab type = BPE
105
+ print_info: n_vocab = 151936
106
+ print_info: n_merges = 151387
107
+ print_info: BOS token = 11 ','
108
+ print_info: EOS token = 151645 '<|im_end|>'
109
+ print_info: EOT token = 151645 '<|im_end|>'
110
+ print_info: PAD token = 151654 '<|vision_pad|>'
111
+ print_info: LF token = 198 'Ċ'
112
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
113
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
114
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
115
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
116
+ print_info: FIM REP token = 151663 '<|repo_name|>'
117
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
118
+ print_info: EOG token = 151643 '<|endoftext|>'
119
+ print_info: EOG token = 151645 '<|im_end|>'
120
+ print_info: EOG token = 151662 '<|fim_pad|>'
121
+ print_info: EOG token = 151663 '<|repo_name|>'
122
+ print_info: EOG token = 151664 '<|file_sep|>'
123
+ print_info: max token length = 256
124
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
125
+ load_tensors: offloading 20 repeating layers to GPU
126
+ load_tensors: offloaded 20/49 layers to GPU
127
+ load_tensors: CPU_Mapped model buffer size = 9856.92 MiB
128
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
129
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
130
+ ...................................................................................................
131
+ llama_context: constructing llama_context
132
+ llama_context: n_seq_max = 1
133
+ llama_context: n_ctx = 2048
134
+ llama_context: n_ctx_seq = 2048
135
+ llama_context: n_batch = 2048
136
+ llama_context: n_ubatch = 512
137
+ llama_context: causal_attn = 1
138
+ llama_context: flash_attn = auto
139
+ llama_context: kv_unified = false
140
+ llama_context: freq_base = 10000000.0
141
+ llama_context: freq_scale = 1
142
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
143
+ llama_context: CPU output buffer size = 0.58 MiB
144
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
145
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
147
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
148
+ llama_context: Flash Attention was auto, set to enabled
149
+ llama_context: CUDA0 compute buffer size = 616.05 MiB
150
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
151
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
152
+ llama_context: graph nodes = 3031
153
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
154
+ common_init_from_params: added <|endoftext|> logit bias = -inf
155
+ common_init_from_params: added <|im_end|> logit bias = -inf
156
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
157
+ common_init_from_params: added <|repo_name|> logit bias = -inf
158
+ common_init_from_params: added <|file_sep|> logit bias = -inf
159
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
160
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
161
+
162
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
163
+ perplexity: tokenizing the input ..
164
+ perplexity: tokenization took 114.273 ms
165
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
166
+ perplexity: 3.35 seconds per pass - ETA 2.45 minutes
167
+ [1]1.5110,[2]1.4554,[3]1.2846,[4]1.2451,[5]1.3348,[6]1.3984,[7]1.4012,[8]1.3983,[9]1.3587,[10]1.3358,[11]1.3216,[12]1.3237,[13]1.3084,[14]1.3000,[15]1.2953,[16]1.2841,[17]1.2782,[18]1.2772,[19]1.2697,[20]1.2604,[21]1.2577,[22]1.2579,[23]1.2739,[24]1.2669,[25]1.2654,[26]1.2567,[27]1.2513,[28]1.2506,[29]1.2632,[30]1.2650,[31]1.2585,[32]1.2535,[33]1.2542,[34]1.2534,[35]1.2523,[36]1.2735,[37]1.2829,[38]1.2876,[39]1.2944,[40]1.2955,[41]1.2921,[42]1.3054,[43]1.3052,[44]1.3055,
168
+ Final estimate: PPL = 1.3055 +/- 0.00706
169
+
170
+ llama_perf_context_print: load time = 2382.39 ms
171
+ llama_perf_context_print: prompt eval time = 125241.33 ms / 90112 tokens ( 1.39 ms per token, 719.51 tokens per second)
172
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
173
+ llama_perf_context_print: total time = 126477.18 ms / 90113 tokens
174
+ llama_perf_context_print: graphs reused = 0
175
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
176
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16537 + (4007 = 3351 + 40 + 616) + 3570 |
177
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
178
+ llama_memory_breakdown_print: | - Host | 9976 = 9856 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_general.log ADDED
@@ -0,0 +1,178 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20659 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/ToBench/Qwen3-30B-A3B-Thinking-2507-unsloth/Magic_Quant/GGUF/dc_round2_Qwen3-30B-A3B-Thinking-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_MXFP4-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Thinking 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Thinking-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Thinking 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,3] = ["qwen", "qwen3", "unsloth"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 336 tensors
53
+ llama_model_loader: - type mxfp4: 1 tensors
54
+ print_info: file format = GGUF V3 (latest)
55
+ print_info: file type = IQ4_NL - 4.5 bpw
56
+ print_info: file size = 16.17 GiB (4.55 BPW)
57
+ load: printing all EOG tokens:
58
+ load: - 151643 ('<|endoftext|>')
59
+ load: - 151645 ('<|im_end|>')
60
+ load: - 151662 ('<|fim_pad|>')
61
+ load: - 151663 ('<|repo_name|>')
62
+ load: - 151664 ('<|file_sep|>')
63
+ load: special tokens cache size = 26
64
+ load: token to piece cache size = 0.9311 MB
65
+ print_info: arch = qwen3moe
66
+ print_info: vocab_only = 0
67
+ print_info: n_ctx_train = 262144
68
+ print_info: n_embd = 2048
69
+ print_info: n_embd_inp = 2048
70
+ print_info: n_layer = 48
71
+ print_info: n_head = 32
72
+ print_info: n_head_kv = 4
73
+ print_info: n_rot = 128
74
+ print_info: n_swa = 0
75
+ print_info: is_swa_any = 0
76
+ print_info: n_embd_head_k = 128
77
+ print_info: n_embd_head_v = 128
78
+ print_info: n_gqa = 8
79
+ print_info: n_embd_k_gqa = 512
80
+ print_info: n_embd_v_gqa = 512
81
+ print_info: f_norm_eps = 0.0e+00
82
+ print_info: f_norm_rms_eps = 1.0e-06
83
+ print_info: f_clamp_kqv = 0.0e+00
84
+ print_info: f_max_alibi_bias = 0.0e+00
85
+ print_info: f_logit_scale = 0.0e+00
86
+ print_info: f_attn_scale = 0.0e+00
87
+ print_info: n_ff = 6144
88
+ print_info: n_expert = 128
89
+ print_info: n_expert_used = 8
90
+ print_info: n_expert_groups = 0
91
+ print_info: n_group_used = 0
92
+ print_info: causal attn = 1
93
+ print_info: pooling type = 0
94
+ print_info: rope type = 2
95
+ print_info: rope scaling = linear
96
+ print_info: freq_base_train = 10000000.0
97
+ print_info: freq_scale_train = 1
98
+ print_info: n_ctx_orig_yarn = 262144
99
+ print_info: rope_finetuned = unknown
100
+ print_info: model type = 30B.A3B
101
+ print_info: model params = 30.53 B
102
+ print_info: general.name = Qwen3 30B A3B Thinking 2507 Unsloth
103
+ print_info: n_ff_exp = 768
104
+ print_info: vocab type = BPE
105
+ print_info: n_vocab = 151936
106
+ print_info: n_merges = 151387
107
+ print_info: BOS token = 11 ','
108
+ print_info: EOS token = 151645 '<|im_end|>'
109
+ print_info: EOT token = 151645 '<|im_end|>'
110
+ print_info: PAD token = 151654 '<|vision_pad|>'
111
+ print_info: LF token = 198 'Ċ'
112
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
113
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
114
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
115
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
116
+ print_info: FIM REP token = 151663 '<|repo_name|>'
117
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
118
+ print_info: EOG token = 151643 '<|endoftext|>'
119
+ print_info: EOG token = 151645 '<|im_end|>'
120
+ print_info: EOG token = 151662 '<|fim_pad|>'
121
+ print_info: EOG token = 151663 '<|repo_name|>'
122
+ print_info: EOG token = 151664 '<|file_sep|>'
123
+ print_info: max token length = 256
124
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
125
+ load_tensors: offloading 20 repeating layers to GPU
126
+ load_tensors: offloaded 20/49 layers to GPU
127
+ load_tensors: CPU_Mapped model buffer size = 9856.92 MiB
128
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
129
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
130
+ ...................................................................................................
131
+ llama_context: constructing llama_context
132
+ llama_context: n_seq_max = 1
133
+ llama_context: n_ctx = 2048
134
+ llama_context: n_ctx_seq = 2048
135
+ llama_context: n_batch = 2048
136
+ llama_context: n_ubatch = 512
137
+ llama_context: causal_attn = 1
138
+ llama_context: flash_attn = auto
139
+ llama_context: kv_unified = false
140
+ llama_context: freq_base = 10000000.0
141
+ llama_context: freq_scale = 1
142
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
143
+ llama_context: CPU output buffer size = 0.58 MiB
144
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
145
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
147
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
148
+ llama_context: Flash Attention was auto, set to enabled
149
+ llama_context: CUDA0 compute buffer size = 616.05 MiB
150
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
151
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
152
+ llama_context: graph nodes = 3031
153
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
154
+ common_init_from_params: added <|endoftext|> logit bias = -inf
155
+ common_init_from_params: added <|im_end|> logit bias = -inf
156
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
157
+ common_init_from_params: added <|repo_name|> logit bias = -inf
158
+ common_init_from_params: added <|file_sep|> logit bias = -inf
159
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
160
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
161
+
162
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
163
+ perplexity: tokenizing the input ..
164
+ perplexity: tokenization took 48.386 ms
165
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
166
+ perplexity: 3.29 seconds per pass - ETA 0.82 minutes
167
+ [1]5.4240,[2]6.4707,[3]6.8596,[4]6.7633,[5]6.6858,[6]5.7635,[7]5.2691,[8]5.2953,[9]5.6070,[10]5.7377,[11]5.7823,[12]6.0837,[13]6.1553,[14]6.2838,[15]6.3615,
168
+ Final estimate: PPL = 6.3615 +/- 0.12957
169
+
170
+ llama_perf_context_print: load time = 2389.34 ms
171
+ llama_perf_context_print: prompt eval time = 45389.07 ms / 30720 tokens ( 1.48 ms per token, 676.81 tokens per second)
172
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
173
+ llama_perf_context_print: total time = 45816.96 ms / 30721 tokens
174
+ llama_perf_context_print: graphs reused = 0
175
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
176
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16542 + (4007 = 3351 + 40 + 616) + 3565 |
177
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 649 |
178
+ llama_memory_breakdown_print: | - Host | 9976 = 9856 + 112 + 8 |
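For reference, the "Final estimate: PPL = x +/- e" lines in these logs are perplexities, i.e. exp of the mean per-token negative log-likelihood over the evaluated chunks, and the bracketed values ([1], [2], ...) are the running estimate after each chunk. A minimal sketch of one standard way to compute such a figure and attach an uncertainty; the exact error formula used by the tool that wrote these logs is not shown here:

```python
# Minimal sketch: perplexity as exp(mean negative log-likelihood), with an
# uncertainty from the standard error of the mean NLL propagated through exp().
import math

def perplexity_with_error(logprobs: list[float]) -> tuple[float, float]:
    # `logprobs` = natural-log probabilities the model assigned to each evaluated token
    n = len(logprobs)
    nll = [-lp for lp in logprobs]
    mean_nll = sum(nll) / n
    ppl = math.exp(mean_nll)
    var = sum((x - mean_nll) ** 2 for x in nll) / max(n - 1, 1)
    sem = math.sqrt(var / n)          # standard error of the mean NLL
    return ppl, ppl * sem             # first-order error propagation through exp()

# Example: perplexity_with_error([-1.2, -0.7, -2.3]) -> (ppl, ppl_error)
```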