Instructions to use tiny-random/phi-4-multimodal with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use tiny-random/phi-4-multimodal with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="tiny-random/phi-4-multimodal", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("tiny-random/phi-4-multimodal", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use tiny-random/phi-4-multimodal with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "tiny-random/phi-4-multimodal"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "tiny-random/phi-4-multimodal",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/tiny-random/phi-4-multimodal
- SGLang
How to use tiny-random/phi-4-multimodal with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "tiny-random/phi-4-multimodal" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "tiny-random/phi-4-multimodal",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "tiny-random/phi-4-multimodal" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "tiny-random/phi-4-multimodal",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use tiny-random/phi-4-multimodal with Docker Model Runner:
docker model run hf.co/tiny-random/phi-4-multimodal
| library_name: transformers | |
| pipeline_tag: text-generation | |
| inference: true | |
| widget: | |
| - text: Hello! | |
| example_title: Hello world | |
| group: Python | |
| This tiny model is for debugging. It is randomly initialized with the config adapted from [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct). | |
| ### Example usage: | |
| ```python | |
import io
import os
from urllib.request import urlopen

import torch
import requests
import soundfile as sf
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Define model path
model_id = "tiny-random/phi-4-multimodal"

# Load model and processor
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# device_map="cuda" already loads the weights onto the GPU, so no trailing
# .cuda() call is needed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    attn_implementation='flash_attention_2',
)

# Load generation config
generation_config = GenerationConfig.from_pretrained(model_id)

# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'


def generate_response(inputs):
    """Generate at most 8 new tokens for *inputs* and return the decoded text.

    *inputs* is the processor output (it must contain 'input_ids'). Only the
    newly generated tokens are decoded; the prompt tokens are sliced off.
    """
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=8,
        generation_config=generation_config,
    )
    # generate() returns prompt + continuation; keep the continuation only.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    return processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]


# Part 1: Image Processing
print("\n--- IMAGE PROCESSING ---")
image_url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
prompt = f'{user_prompt}<|image_1|>What is shown in this image?{prompt_suffix}{assistant_prompt}'
print(f'>>> Prompt\n{prompt}')

# Download and open image. The body is read fully into memory so the HTTP
# connection is released, and an HTTP error raises here instead of surfacing
# later as an undecodable image.
image_response = requests.get(image_url, timeout=60)
image_response.raise_for_status()
image = Image.open(io.BytesIO(image_response.content))
inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')

# Generate response
response = generate_response(inputs)
print(f'>>> Response\n{response}')

# Part 2: Audio Processing
print("\n--- AUDIO PROCESSING ---")
audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
print(f'>>> Prompt\n{prompt}')

# Download and open audio file; the context manager closes the HTTP response.
with urlopen(audio_url, timeout=60) as audio_response:
    audio, samplerate = sf.read(io.BytesIO(audio_response.read()))

# Process with the model
inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
response = generate_response(inputs)
print(f'>>> Response\n{response}')
| ``` | |
| ### Code to create this repo: | |
| ```python | |
import json
import shutil
import sys
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    pipeline,
    set_seed,
)

source_model_id = "microsoft/Phi-4-multimodal-instruct"
save_folder = "/tmp/tiny-random/phi-4-multimodal"
save_path = Path(save_folder)
# parents=True: the intermediate "/tmp/tiny-random" directory may not exist
# yet, and mkdir() without it raises FileNotFoundError in that case.
save_path.mkdir(parents=True, exist_ok=True)
AutoTokenizer.from_pretrained(source_model_id).save_pretrained(save_folder)

# preprocessor config
# Copy each JSON config from the source repo, rewriting every dotted
# `auto_map` entry to "<source_model_id>--<module.Class>".
# NOTE(review): presumably this makes transformers resolve the custom code
# from the source repo rather than from this one -- confirm.
for json_file in ['preprocessor_config.json', 'processor_config.json', 'config.json']:
    with open(hf_hub_download(source_model_id, json_file), 'r', encoding='utf-8') as f:
        config = json.load(f)
    auto_map = config.get('auto_map', {})
    for key, value in auto_map.items():
        if '.' in value:
            auto_map[key] = f'{source_model_id}--{value}'
    with open(save_path / json_file, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)

# model config
# Shrink the language model, the audio encoder and the LoRA adapters.
config_path = save_path / 'config.json'
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)
config['hidden_size'] = 16
config['intermediate_size'] = 32
config['num_attention_heads'] = 2
config['num_hidden_layers'] = 2
config['num_key_value_heads'] = 1
audio_config = config['audio_processor']['config']
audio_config['num_blocks'] = 2
audio_config['attention_dim'] = 16
audio_config['attention_heads'] = 2
audio_config['nemo_conv_settings']['conv_channels'] = 16
# The misspelled key ("seperable") is written exactly as the script expects it.
audio_config['depthwise_seperable_out_channel'] = 16
audio_config['ext_pw_out_channel'] = 16
audio_config['linear_units'] = 24
config['vision_lora']['r'] = 8
config['vision_lora']['lora_alpha'] = 16
config['speech_lora']['r'] = 8
config['speech_lora']['lora_alpha'] = 16
config['rope_scaling']['long_factor'] = [1.0] * 3
config['rope_scaling']['short_factor'] = [1.0] * 3
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)

config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)

# Stage the source repo's custom modeling code in a temporary local `phi4mm`
# package so that it can be patched and imported below.
package_path = save_path / 'phi4mm'
package_path.mkdir(exist_ok=True)
for python_file in ['modeling_phi4mm.py', 'configuration_phi4mm.py', 'speech_conformer_encoder.py', 'vision_siglip_navit.py', 'processing_phi4mm.py']:
    codes = Path(hf_hub_download(source_model_id, python_file)).read_text(encoding='utf-8')
    (package_path / python_file).write_text(codes, encoding='utf-8')

# Patch the vision tower sizes that are hard-coded in the upstream source.
# NOTE: str.replace() is a silent no-op when a pattern is absent, so these
# edits stop taking effect if the upstream literals ever change.
vision_path = package_path / 'vision_siglip_navit.py'
codes = vision_path.read_text(encoding='utf-8')
codes = codes.replace('def get_siglip_vision_model', '# modified for tiny-random\ndef get_siglip_vision_model')
codes = codes.replace('"hidden_size": 1152,', '"hidden_size": 16,')
codes = codes.replace('"intermediate_size": 4304,', '"intermediate_size": 32,')
codes = codes.replace('"num_attention_heads": 16,', '"num_attention_heads": 2,')
codes = codes.replace('"num_hidden_layers": 27,', '"num_hidden_layers": 2,')
vision_path.write_text(codes, encoding='utf-8')

# Make the staged package importable, then build the model from the patched code.
sys.path.append(str(save_path))
from phi4mm.modeling_phi4mm import Phi4MMForCausalLM
print(Phi4MMForCausalLM)  # ensure imported

model = Phi4MMForCausalLM(config).to(torch.bfloat16)
# Seed before re-initialising, and visit parameters in sorted-name order, so
# that the random weights are reproducible.
set_seed(42)
with torch.no_grad():
    for name, p in sorted(model.named_parameters(), key=lambda item: item[0]):
        torch.nn.init.normal_(p, 0, 0.5)
        print(name, p.shape)
model.save_pretrained(save_path)
# The staged package was only needed to build the model; remove it from the output.
shutil.rmtree(package_path)

generation_config = GenerationConfig.from_pretrained(
    source_model_id, trust_remote_code=True,
)
generation_config.save_pretrained(save_folder)
| ``` |