vLLM with this?

#21
by vamcrizer - opened

I'm considering using the model in large-scale production. Does vLLM support GGUF, or do you have a safetensors version of the model? Thanks, and great work!

You probably will have to convert GGUF back to SafeTensors for it to work in vLLM. At least I never had much luck running GGUFs with vLLM. Barely any architectures are supported, and the ones that are supported run much slower.

You probably will have to convert GGUF back to SafeTensors for it to work in vLLM. At least I never had much luck running GGUFs with vLLM. Barely any architectures are supported, and the ones that are supported run much slower.

How do I do that? Does llama.cpp have a script for that? Thanks! I've been searching for a solution but none worked. Also, does converting back guarantee keeping the model quality?

@vamcrizer You can simply use https://github.com/purinnohito/gguf_to_safetensors. It does work with the latest llama.cpp, despite their claims otherwise — at least after updating it to use the latest requirements:

diff --git a/requirements.txt b/requirements.txt
index 6266ea5..0d0cfa8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-torch==2.6.0
-numpy<2
-safetensors==0.4.5
-gguf==0.14.0
-transformers==4.47.0
+torch
+numpy
+safetensors
+gguf
+transformers

If you use https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive/blob/main/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf, the quality will be exactly equal to the original SafeTensors model from which these GGUFs were created (which the author didn't bother to upload), and equal to using the BF16 GGUF directly. Qwen 3.5 is a BF16 model, and the round trip from BF16 SafeTensors to BF16 GGUF and back to SafeTensors is lossless, because it only converts BF16 to BF16.

After the GGUF to SafeTensors conversion, you can use the following Python script I created to fix up the resulting SafeTensors shards and copy over the missing files:

import os
import shutil
from safetensors import safe_open
from safetensors.torch import save_file
from concurrent.futures import ThreadPoolExecutor, as_completed

SOURCE_DIR = "/nvme/Huihui-MiniMax-M2-abliterated"
TARGET_DIR = "/nvme/Huihui-MiniMax-M2-abliterated-repaired"
NUM_THREADS = 3

os.makedirs(TARGET_DIR, exist_ok=True)

# Copy every auxiliary file (config, tokenizer, custom code, ...) verbatim;
# the weight shards themselves are re-serialized separately by repair_shard.
for entry in os.listdir(SOURCE_DIR):
    if entry.startswith("model-") and entry.endswith(".safetensors"):
        continue  # shards handled separately
    source_path = os.path.join(SOURCE_DIR, entry)
    if os.path.isfile(source_path):
        shutil.copy2(source_path, os.path.join(TARGET_DIR, entry))

print("βœ“ Copied config/tokenizer files")

# Collect the weight shard filenames in deterministic (sorted) order.
shards = sorted(
    entry
    for entry in os.listdir(SOURCE_DIR)
    if entry.startswith("model-") and entry.endswith(".safetensors")
)

print(f"Found {len(shards)} shards total\n")

def repair_shard(shard, source_dir=None, target_dir=None):
    """Re-serialize one safetensors shard from *source_dir* into *target_dir*.

    Reading every tensor through ``safe_open`` and re-saving with
    ``save_file`` rewrites the shard with a clean header and layout.
    Unlike the naive re-save, this also carries over the file-level
    ``__metadata__`` header (e.g. ``{"format": "pt"}``, which
    ``transformers`` checks when loading) instead of silently dropping it.

    Args:
        shard: Shard filename, e.g. ``model-00001-of-00003.safetensors``.
        source_dir: Directory containing the shard; defaults to SOURCE_DIR.
        target_dir: Destination directory; defaults to TARGET_DIR.

    Returns:
        A status string: ``"repaired <shard>"`` or ``"skipped <shard>"``.
    """
    source_dir = SOURCE_DIR if source_dir is None else source_dir
    target_dir = TARGET_DIR if target_dir is None else target_dir
    src_path = os.path.join(source_dir, shard)
    dst_path = os.path.join(target_dir, shard)

    # Skip shards already produced by a previous (possibly interrupted) run.
    if os.path.exists(dst_path):
        return f"skipped {shard}"

    tensors = {}
    with safe_open(src_path, framework="pt") as f:
        # Preserve the shard's header metadata (may be None).
        metadata = f.metadata()
        for key in f.keys():
            tensors[key] = f.get_tensor(key)

    save_file(tensors, dst_path, metadata=metadata)
    return f"repaired {shard}"

# Repair all shards concurrently, reporting each one as soon as it finishes.
with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    pending = {executor.submit(repair_shard, name): name for name in shards}
    for done in as_completed(pending):
        print("βœ“", done.result())

print("\nAll shards processed. Output stored in:")
print(TARGET_DIR)

After the GGUF to SafeTensors conversion, you can use the following Python script I created to fix up the resulting SafeTensors shards and copy over the missing files:

import os
import shutil
from safetensors import safe_open
from safetensors.torch import save_file
from concurrent.futures import ThreadPoolExecutor, as_completed

SOURCE_DIR = "/nvme/Huihui-MiniMax-M2-abliterated"
TARGET_DIR = "/nvme/Huihui-MiniMax-M2-abliterated-repaired"
NUM_THREADS = 3

os.makedirs(TARGET_DIR, exist_ok=True)


def _is_shard(name):
    # Weight shards are named "model-*.safetensors"; everything else is a
    # config/tokenizer/custom-code file that gets copied over verbatim.
    return name.startswith("model-") and name.endswith(".safetensors")


# Copy the non-shard files straight across to the target directory.
for name in os.listdir(SOURCE_DIR):
    if _is_shard(name):
        continue  # shards handled separately
    src = os.path.join(SOURCE_DIR, name)
    if os.path.isfile(src):
        shutil.copy2(src, os.path.join(TARGET_DIR, name))

print("βœ“ Copied config/tokenizer files")

# Deterministically ordered list of shard filenames to repair.
shards = sorted(filter(_is_shard, os.listdir(SOURCE_DIR)))

print(f"Found {len(shards)} shards total\n")

def repair_shard(shard, source_dir=None, target_dir=None):
    """Re-serialize one safetensors shard from *source_dir* into *target_dir*.

    Loads every tensor via ``safe_open`` and re-saves the shard with
    ``save_file``, producing a cleanly re-serialized file. The file-level
    ``__metadata__`` header (e.g. ``{"format": "pt"}``, which
    ``transformers`` checks on load) is carried over instead of being
    silently dropped as in the original re-save.

    Args:
        shard: Shard filename, e.g. ``model-00001-of-00003.safetensors``.
        source_dir: Directory containing the shard; defaults to SOURCE_DIR.
        target_dir: Destination directory; defaults to TARGET_DIR.

    Returns:
        A status string: ``"repaired <shard>"`` or ``"skipped <shard>"``.
    """
    source_dir = SOURCE_DIR if source_dir is None else source_dir
    target_dir = TARGET_DIR if target_dir is None else target_dir
    src_path = os.path.join(source_dir, shard)
    dst_path = os.path.join(target_dir, shard)

    # Skip shards already produced by a previous (possibly interrupted) run.
    if os.path.exists(dst_path):
        return f"skipped {shard}"

    tensors = {}
    with safe_open(src_path, framework="pt") as f:
        # Preserve the shard's header metadata (may be None).
        metadata = f.metadata()
        for key in f.keys():
            tensors[key] = f.get_tensor(key)

    save_file(tensors, dst_path, metadata=metadata)
    return f"repaired {shard}"

# Fan the shard repairs out over a small thread pool; results are printed
# in completion order, not submission order.
with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    futures = {executor.submit(repair_shard, s): s for s in shards}
    for fut in as_completed(futures):
        print("βœ“", fut.result())

print("\nAll shards processed. Output stored in:")
print(TARGET_DIR)

thanks

vamcrizer changed discussion status to closed

Sign up or log in to comment