GhostScientist committed on
Commit
545ff31
·
verified ·
1 Parent(s): 844ffb0

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. .claude/settings.local.json +2 -1
  2. app.py +49 -17
  3. requirements.txt +4 -1
.claude/settings.local.json CHANGED
@@ -1,7 +1,8 @@
1
  {
2
  "permissions": {
3
  "allow": [
4
- "Skill(writing-skills:hugging-face-space-deployer)"
 
5
  ]
6
  }
7
  }
 
1
  {
2
  "permissions": {
3
  "allow": [
4
+ "Skill(writing-skills:hugging-face-space-deployer)",
5
+ "Bash(hf upload:*)"
6
  ]
7
  }
8
  }
app.py CHANGED
@@ -1,11 +1,32 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
3
 
4
  MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
5
- client = InferenceClient(MODEL_ID)
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- def respond(message, history, system_message, max_tokens, temperature, top_p):
9
  messages = [{"role": "system", "content": system_message}]
10
 
11
  for user_msg, assistant_msg in history:
@@ -16,31 +37,42 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
16
 
17
  messages.append({"role": "user", "content": message})
18
 
19
- response = ""
20
- for token in client.chat_completion(
21
  messages,
22
- max_tokens=max_tokens,
23
- stream=True,
24
- temperature=temperature,
25
- top_p=top_p,
26
- ):
27
- delta = token.choices[0].delta.content or ""
28
- response += delta
29
- yield response
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
 
32
  demo = gr.ChatInterface(
33
- respond,
34
  title="Qwen2.5 Coder 7B",
35
- description="A coding assistant powered by Qwen2.5-Coder-7B-Instruct via Hugging Face Inference API",
36
  additional_inputs=[
37
  gr.Textbox(
38
  value="You are Qwen, a helpful coding assistant. You excel at writing clean, efficient code and explaining programming concepts clearly.",
39
  label="System message",
40
  lines=2,
41
  ),
42
- gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max tokens"),
43
- gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
44
  gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
45
  ],
46
  examples=[
 
1
  import gradio as gr
2
+ import spaces
3
+ import torch
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
 
6
  MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
 
7
 
8
+ # Load tokenizer at startup
9
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
10
+
11
+ # Global model - loaded lazily on first GPU call
12
+ model = None
13
+
14
+
15
+ def load_model():
16
+ global model
17
+ if model is None:
18
+ model = AutoModelForCausalLM.from_pretrained(
19
+ MODEL_ID,
20
+ torch_dtype=torch.float16,
21
+ device_map="auto",
22
+ )
23
+ return model
24
+
25
+
26
+ @spaces.GPU(duration=120)
27
+ def generate_response(message, history, system_message, max_tokens, temperature, top_p):
28
+ loaded_model = load_model()
29
 
 
30
  messages = [{"role": "system", "content": system_message}]
31
 
32
  for user_msg, assistant_msg in history:
 
37
 
38
  messages.append({"role": "user", "content": message})
39
 
40
+ text = tokenizer.apply_chat_template(
 
41
  messages,
42
+ tokenize=False,
43
+ add_generation_prompt=True
44
+ )
45
+ inputs = tokenizer([text], return_tensors="pt").to(loaded_model.device)
46
+
47
+ with torch.no_grad():
48
+ outputs = loaded_model.generate(
49
+ **inputs,
50
+ max_new_tokens=int(max_tokens),
51
+ temperature=float(temperature),
52
+ top_p=float(top_p),
53
+ do_sample=True,
54
+ pad_token_id=tokenizer.eos_token_id,
55
+ )
56
+
57
+ response = tokenizer.decode(
58
+ outputs[0][inputs['input_ids'].shape[1]:],
59
+ skip_special_tokens=True
60
+ )
61
+ return response
62
 
63
 
64
  demo = gr.ChatInterface(
65
+ generate_response,
66
  title="Qwen2.5 Coder 7B",
67
+ description="A coding assistant powered by Qwen2.5-Coder-7B-Instruct on ZeroGPU",
68
  additional_inputs=[
69
  gr.Textbox(
70
  value="You are Qwen, a helpful coding assistant. You excel at writing clean, efficient code and explaining programming concepts clearly.",
71
  label="System message",
72
  lines=2,
73
  ),
74
+ gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max tokens"),
75
+ gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
76
  gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
77
  ],
78
  examples=[
requirements.txt CHANGED
@@ -1,2 +1,5 @@
1
  gradio>=5.0.0
2
- huggingface_hub>=0.26.0
 
 
 
 
1
  gradio>=5.0.0
2
+ torch
3
+ transformers
4
+ accelerate
5
+ spaces