Spaces:

GhostScientist
/

qwen2.5-coder-7b

Paused

App Files Community

GhostScientist committed on Jan 7

Commit

545ff31

verified ·

1 Parent(s): 844ffb0

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

.claude/settings.local.json +2 -1
app.py +49 -17
requirements.txt +4 -1

.claude/settings.local.json CHANGED Viewed

@@ -1,7 +1,8 @@
 {
   "permissions": {
     "allow": [
-      "Skill(writing-skills:hugging-face-space-deployer)"
     ]
   }
 }

 {
   "permissions": {
     "allow": [
+      "Skill(writing-skills:hugging-face-space-deployer)",
+      "Bash(hf upload:*)"
     ]
   }
 }

app.py CHANGED Viewed

@@ -1,11 +1,32 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
 MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
-client = InferenceClient(MODEL_ID)
-def respond(message, history, system_message, max_tokens, temperature, top_p):
     messages = [{"role": "system", "content": system_message}]
     for user_msg, assistant_msg in history:
@@ -16,31 +37,42 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     messages.append({"role": "user", "content": message})
-    response = ""
-    for token in client.chat_completion(
         messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        delta = token.choices[0].delta.content or ""
-        response += delta
-        yield response
 demo = gr.ChatInterface(
-    respond,
     title="Qwen2.5 Coder 7B",
-    description="A coding assistant powered by Qwen2.5-Coder-7B-Instruct via Hugging Face Inference API",
     additional_inputs=[
         gr.Textbox(
             value="You are Qwen, a helpful coding assistant. You excel at writing clean, efficient code and explaining programming concepts clearly.",
             label="System message",
             lines=2,
         ),
-        gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max tokens"),
-        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
     ],
     examples=[

 import gradio as gr
+import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
 MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
+# Load tokenizer at startup
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+# Global model - loaded lazily on first GPU call
+model = None
+def load_model():
+    global model
+    if model is None:
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            torch_dtype=torch.float16,
+            device_map="auto",
+        )
+    return model
+@spaces.GPU(duration=120)
+def generate_response(message, history, system_message, max_tokens, temperature, top_p):
+    loaded_model = load_model()
     messages = [{"role": "system", "content": system_message}]
     for user_msg, assistant_msg in history:
     messages.append({"role": "user", "content": message})
+    text = tokenizer.apply_chat_template(
         messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    inputs = tokenizer([text], return_tensors="pt").to(loaded_model.device)
+    with torch.no_grad():
+        outputs = loaded_model.generate(
+            **inputs,
+            max_new_tokens=int(max_tokens),
+            temperature=float(temperature),
+            top_p=float(top_p),
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    response = tokenizer.decode(
+        outputs[0][inputs['input_ids'].shape[1]:],
+        skip_special_tokens=True
+    )
+    return response
 demo = gr.ChatInterface(
+    generate_response,
     title="Qwen2.5 Coder 7B",
+    description="A coding assistant powered by Qwen2.5-Coder-7B-Instruct on ZeroGPU",
     additional_inputs=[
         gr.Textbox(
             value="You are Qwen, a helpful coding assistant. You excel at writing clean, efficient code and explaining programming concepts clearly.",
             label="System message",
             lines=2,
         ),
+        gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max tokens"),
+        gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
     ],
     examples=[

requirements.txt CHANGED Viewed

@@ -1,2 +1,5 @@
 gradio>=5.0.0
-huggingface_hub>=0.26.0

 gradio>=5.0.0
+torch
+transformers
+accelerate
+spaces