bearzi committed on
Commit
98cffed
·
verified ·
1 Parent(s): 5f8b605

Upload Qwen3-Coder-Next-oQ3

Browse files
README.md CHANGED
@@ -6,24 +6,35 @@ license: apache-2.0
6
  tags:
7
  - mlx
8
  - omlx
9
- - quantized
10
  - oq3
 
11
  ---
12
 
13
  # Qwen3-Coder-Next-oQ3
14
 
15
  oQ3 mixed-precision MLX quantization produced via [oMLX](https://github.com/jundot/omlx).
16
 
17
- - **Quantization:** oQ3 (sensitivity-driven, group_size=64)
18
- - **Format:** MLX safetensors, loadable with `mlx-vlm` and `mlx-lm`
 
19
 
20
  ## Usage
21
 
22
- ```bash
23
- pip install mlx-vlm
24
- python3 -m mlx_vlm generate --model bearzi/Qwen3-Coder-Next-oQ3 --prompt "Your prompt here" --max-tokens 512
 
 
 
 
 
25
  ```
26
 
27
  ## About oQ
28
 
29
- oQ measures per-layer quantization sensitivity through calibration inference and allocates bits where they matter most — critical layers stay at higher precision, tolerant layers compress aggressively. See [oMLX docs](https://github.com/jundot/omlx/blob/main/docs/oQ_Quantization.md).
 
 
 
 
 
6
  tags:
7
  - mlx
8
  - omlx
9
+ - oq
10
  - oq3
11
+ - quantized
12
  ---
13
 
14
  # Qwen3-Coder-Next-oQ3
15
 
16
  oQ3 mixed-precision MLX quantization produced via [oMLX](https://github.com/jundot/omlx).
17
 
18
+ - **Quantization:** oQ3 (sensitivity-driven mixed precision, group_size=64)
19
+ - **Format:** MLX safetensors
20
+ - **Compatible with:** mlx-lm, mlx-vlm, oMLX on Apple Silicon
21
 
22
  ## Usage
23
 
24
+ ```python
25
+ from mlx_lm import load, generate
26
+ model, tokenizer = load("bearzi/Qwen3-Coder-Next-oQ3")
27
+ prompt = tokenizer.apply_chat_template(
28
+ [{"role": "user", "content": "Hello"}],
29
+ add_generation_prompt=True,
30
+ )
31
+ print(generate(model, tokenizer, prompt=prompt, max_tokens=512, verbose=True))
32
  ```
33
 
34
  ## About oQ
35
 
36
+ oQ measures per-layer quantization sensitivity through calibration and allocates bits where they matter most — critical layers stay at higher precision, tolerant layers compress aggressively. Target averages of 2/3/4/6/8 bits are provided; actual per-layer bits vary by measured sensitivity.
37
+
38
+ See [oQ documentation](https://github.com/jundot/omlx/blob/main/docs/oQ_Quantization.md).
39
+
40
+ Comparative benchmarks and feedback are welcome — please open a discussion.
config.json CHANGED
@@ -56,7 +56,7 @@
56
  "mode": "affine"
57
  },
58
  "model.layers.0.linear_attn.in_proj_qkvz": {
59
- "bits": 6,
60
  "group_size": 64,
61
  "mode": "affine"
62
  },
@@ -321,22 +321,22 @@
321
  "mode": "affine"
322
  },
323
  "model.layers.7.self_attn.k_proj": {
324
- "bits": 5,
325
  "group_size": 64,
326
  "mode": "affine"
327
  },
328
  "model.layers.7.self_attn.o_proj": {
329
- "bits": 5,
330
  "group_size": 64,
331
  "mode": "affine"
332
  },
333
  "model.layers.7.self_attn.q_proj": {
334
- "bits": 5,
335
  "group_size": 64,
336
  "mode": "affine"
337
  },
338
  "model.layers.7.self_attn.v_proj": {
339
- "bits": 5,
340
  "group_size": 64,
341
  "mode": "affine"
342
  },
@@ -1481,7 +1481,7 @@
1481
  "mode": "affine"
1482
  },
1483
  "model.layers.39.self_attn.k_proj": {
1484
- "bits": 5,
1485
  "group_size": 64,
1486
  "mode": "affine"
1487
  },
@@ -1491,12 +1491,12 @@
1491
  "mode": "affine"
1492
  },
1493
  "model.layers.39.self_attn.q_proj": {
1494
- "bits": 5,
1495
  "group_size": 64,
1496
  "mode": "affine"
1497
  },
1498
  "model.layers.39.self_attn.v_proj": {
1499
- "bits": 5,
1500
  "group_size": 64,
1501
  "mode": "affine"
1502
  },
@@ -1811,7 +1811,7 @@
1811
  "mode": "affine"
1812
  },
1813
  "model.layers.0.linear_attn.in_proj_qkvz": {
1814
- "bits": 6,
1815
  "group_size": 64,
1816
  "mode": "affine"
1817
  },
@@ -2076,22 +2076,22 @@
2076
  "mode": "affine"
2077
  },
2078
  "model.layers.7.self_attn.k_proj": {
2079
- "bits": 5,
2080
  "group_size": 64,
2081
  "mode": "affine"
2082
  },
2083
  "model.layers.7.self_attn.o_proj": {
2084
- "bits": 5,
2085
  "group_size": 64,
2086
  "mode": "affine"
2087
  },
2088
  "model.layers.7.self_attn.q_proj": {
2089
- "bits": 5,
2090
  "group_size": 64,
2091
  "mode": "affine"
2092
  },
2093
  "model.layers.7.self_attn.v_proj": {
2094
- "bits": 5,
2095
  "group_size": 64,
2096
  "mode": "affine"
2097
  },
@@ -3236,7 +3236,7 @@
3236
  "mode": "affine"
3237
  },
3238
  "model.layers.39.self_attn.k_proj": {
3239
- "bits": 5,
3240
  "group_size": 64,
3241
  "mode": "affine"
3242
  },
@@ -3246,12 +3246,12 @@
3246
  "mode": "affine"
3247
  },
3248
  "model.layers.39.self_attn.q_proj": {
3249
- "bits": 5,
3250
  "group_size": 64,
3251
  "mode": "affine"
3252
  },
3253
  "model.layers.39.self_attn.v_proj": {
3254
- "bits": 5,
3255
  "group_size": 64,
3256
  "mode": "affine"
3257
  },
 
56
  "mode": "affine"
57
  },
58
  "model.layers.0.linear_attn.in_proj_qkvz": {
59
+ "bits": 5,
60
  "group_size": 64,
61
  "mode": "affine"
62
  },
 
321
  "mode": "affine"
322
  },
323
  "model.layers.7.self_attn.k_proj": {
324
+ "bits": 4,
325
  "group_size": 64,
326
  "mode": "affine"
327
  },
328
  "model.layers.7.self_attn.o_proj": {
329
+ "bits": 4,
330
  "group_size": 64,
331
  "mode": "affine"
332
  },
333
  "model.layers.7.self_attn.q_proj": {
334
+ "bits": 4,
335
  "group_size": 64,
336
  "mode": "affine"
337
  },
338
  "model.layers.7.self_attn.v_proj": {
339
+ "bits": 4,
340
  "group_size": 64,
341
  "mode": "affine"
342
  },
 
1481
  "mode": "affine"
1482
  },
1483
  "model.layers.39.self_attn.k_proj": {
1484
+ "bits": 6,
1485
  "group_size": 64,
1486
  "mode": "affine"
1487
  },
 
1491
  "mode": "affine"
1492
  },
1493
  "model.layers.39.self_attn.q_proj": {
1494
+ "bits": 6,
1495
  "group_size": 64,
1496
  "mode": "affine"
1497
  },
1498
  "model.layers.39.self_attn.v_proj": {
1499
+ "bits": 8,
1500
  "group_size": 64,
1501
  "mode": "affine"
1502
  },
 
1811
  "mode": "affine"
1812
  },
1813
  "model.layers.0.linear_attn.in_proj_qkvz": {
1814
+ "bits": 5,
1815
  "group_size": 64,
1816
  "mode": "affine"
1817
  },
 
2076
  "mode": "affine"
2077
  },
2078
  "model.layers.7.self_attn.k_proj": {
2079
+ "bits": 4,
2080
  "group_size": 64,
2081
  "mode": "affine"
2082
  },
2083
  "model.layers.7.self_attn.o_proj": {
2084
+ "bits": 4,
2085
  "group_size": 64,
2086
  "mode": "affine"
2087
  },
2088
  "model.layers.7.self_attn.q_proj": {
2089
+ "bits": 4,
2090
  "group_size": 64,
2091
  "mode": "affine"
2092
  },
2093
  "model.layers.7.self_attn.v_proj": {
2094
+ "bits": 4,
2095
  "group_size": 64,
2096
  "mode": "affine"
2097
  },
 
3236
  "mode": "affine"
3237
  },
3238
  "model.layers.39.self_attn.k_proj": {
3239
+ "bits": 6,
3240
  "group_size": 64,
3241
  "mode": "affine"
3242
  },
 
3246
  "mode": "affine"
3247
  },
3248
  "model.layers.39.self_attn.q_proj": {
3249
+ "bits": 6,
3250
  "group_size": 64,
3251
  "mode": "affine"
3252
  },
3253
  "model.layers.39.self_attn.v_proj": {
3254
+ "bits": 8,
3255
  "group_size": 64,
3256
  "mode": "affine"
3257
  },
model-00001-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c392a664a13c8707b639971a9d1a1d661a038afe38b2a1323b8d618ff35288a5
3
- size 5227126440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12ea7a8ee31c305afb0cb2a70f03964a7af1dd751c267a5660a2ea2ad32954bb
3
+ size 5223194280
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 35761708772
4
  },
5
  "weight_map": {
6
  "lm_head.biases": "model-00001-of-00007.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 35757776612
4
  },
5
  "weight_map": {
6
  "lm_head.biases": "model-00001-of-00007.safetensors",