
Commit bc5a061

[Example] ggml: add Qwen2-VL example (second-state#166)
Signed-off-by: dm4 <sunrisedm4@gmail.com>
1 parent 56d497a commit bc5a061

File tree

5 files changed: 270 additions, 1 deletion

.github/workflows/llama.yml

Lines changed: 18 additions & 1 deletion
@@ -25,7 +25,7 @@ jobs:
     strategy:
       matrix:
         runner: [ubuntu-20.04, macos-m1]
-        wasmedge: ["0.13.5", "0.14.0"]
+        wasmedge: ["0.14.1"]
         plugin: [wasi_nn-ggml]
         job:
           - name: "Tiny Llama"
@@ -301,6 +301,23 @@ jobs:
                 default \
                 $'[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always output JSON format string.\n<</SYS>>\nGive me a JSON array of Apple products.[/INST]'
 
+          - name: Qwen2-VL
+            run: |
+              test -f ~/.wasmedge/env && source ~/.wasmedge/env
+              cd wasmedge-ggml/qwen2vl
+              curl -LO https://huggingface.co/second-state/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-vision-encoder.gguf
+              curl -LO https://huggingface.co/second-state/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-Q5_K_M.gguf
+              curl -LO https://llava-vl.github.io/static/images/monalisa.jpg
+              cargo build --target wasm32-wasi --release
+              time wasmedge --dir .:. \
+                --env n_gpu_layers="$NGL" \
+                --nn-preload default:GGML:AUTO:Qwen2-VL-2B-Instruct-Q5_K_M.gguf \
+                --env mmproj=Qwen2-VL-2B-Instruct-vision-encoder.gguf \
+                --env image=monalisa.jpg \
+                target/wasm32-wasi/release/wasmedge-ggml-qwen2vl.wasm \
+                default \
+                $'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><image><|vision_end|>what is in this picture?<|im_end|>\n<|im_start|>assistant\n'
+
           - name: Build llama-stream
             run: |
               cd wasmedge-ggml/llama-stream

wasmedge-ggml/qwen2vl/Cargo.toml

Lines changed: 8 additions & 0 deletions
[package]
name = "wasmedge-ggml-qwen2vl"
version = "0.1.0"
edition = "2021"

[dependencies]
serde_json = "1.0"
wasmedge-wasi-nn = "0.7.1"

wasmedge-ggml/qwen2vl/README.md

Lines changed: 47 additions & 0 deletions
# Qwen2-VL Example For WASI-NN with GGML Backend

> [!NOTE]
> Please refer to the [wasmedge-ggml/README.md](../README.md) for the general introduction and the setup of the WASI-NN plugin with the GGML backend. This document focuses on the specific example of the Qwen2-VL model.

## Get Qwen2-VL Model

In this example, we are going to use the pre-converted [Qwen2-VL-2B](https://huggingface.co/second-state/Qwen2-VL-2B-Instruct-GGUF/tree/main) model.

Download the model:

```bash
curl -LO https://huggingface.co/second-state/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-vision-encoder.gguf
curl -LO https://huggingface.co/second-state/Qwen2-VL-2B-Instruct-GGUF/resolve/main/Qwen2-VL-2B-Instruct-Q5_K_M.gguf
```

## Prepare the Image

Download the image you want to perform inference on:

```bash
curl -LO https://llava-vl.github.io/static/images/monalisa.jpg
```

## Parameters

> [!NOTE]
> Please check the parameters section of [wasmedge-ggml/README.md](https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters) first.

For Qwen2-VL inference, we recommend using a `ctx-size` of at least `4096` for better results.

```rust
options.insert("ctx-size", Value::from(4096));
```
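Under the hood, these options are serialized to a JSON string and passed to the GGML backend when the graph is built. Below is a minimal, self-contained sketch of that wiring, mirroring this example's `src/main.rs`; the hard-coded file names are just the ones downloaded above, and in the real example the values come from environment variables instead:

```rust
use std::collections::HashMap;

use serde_json::Value;
use wasmedge_wasi_nn::{ExecutionTarget, GraphBuilder, GraphEncoding};

fn main() {
    // Equivalent to passing --env mmproj=..., --env image=..., --env ctx_size=4096.
    let mut options: HashMap<&str, Value> = HashMap::new();
    options.insert("mmproj", Value::from("Qwen2-VL-2B-Instruct-vision-encoder.gguf"));
    options.insert("image", Value::from("monalisa.jpg"));
    options.insert("ctx-size", Value::from(4096));

    // The serialized options become the graph config; "default" is the model name
    // registered with --nn-preload on the wasmedge command line.
    let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
        .config(serde_json::to_string(&options).expect("Failed to serialize options"))
        .build_from_cache("default")
        .expect("Failed to build graph");
    let _context = graph
        .init_execution_context()
        .expect("Failed to init context");
}
```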
## Execute

Execute the WASM with `wasmedge` using the named model feature to preload a large model:

```bash
wasmedge --dir .:. \
  --nn-preload default:GGML:AUTO:Qwen2-VL-2B-Instruct-Q5_K_M.gguf \
  --env mmproj=Qwen2-VL-2B-Instruct-vision-encoder.gguf \
  --env image=monalisa.jpg \
  --env ctx_size=4096 \
  wasmedge-ggml-qwen2vl.wasm default
```
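After the model loads, the example enters an interactive loop: it prints `USER:`, reads a question from stdin, wraps it in the Qwen2-VL chat template (embedding `<|vision_start|><image><|vision_end|>` for the image on the first turn), and prints the reply under `ASSISTANT:`. If an additional prompt argument is passed on the command line (as the CI workflow does), the example runs a single non-interactive generation instead.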

wasmedge-ggml/qwen2vl/src/main.rs

Lines changed: 197 additions & 0 deletions
use serde_json::Value;
use std::collections::HashMap;
use std::env;
use std::io;
use wasmedge_wasi_nn::{
    self, BackendError, Error, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext,
    TensorType,
};

fn read_input() -> String {
    loop {
        let mut answer = String::new();
        io::stdin()
            .read_line(&mut answer)
            .expect("Failed to read line");
        if !answer.is_empty() && answer != "\n" && answer != "\r\n" {
            return answer.trim().to_string();
        }
    }
}

fn get_options_from_env() -> HashMap<&'static str, Value> {
    let mut options = HashMap::new();

    // Required parameters for llava
    if let Ok(val) = env::var("mmproj") {
        options.insert("mmproj", Value::from(val.as_str()));
    } else {
        eprintln!("Failed to get mmproj model.");
        std::process::exit(1);
    }
    if let Ok(val) = env::var("image") {
        options.insert("image", Value::from(val.as_str()));
    } else {
        eprintln!("Failed to get the target image.");
        std::process::exit(1);
    }

    // Optional parameters
    if let Ok(val) = env::var("enable_log") {
        options.insert("enable-log", serde_json::from_str(val.as_str()).unwrap());
    } else {
        options.insert("enable-log", Value::from(false));
    }
    if let Ok(val) = env::var("ctx_size") {
        options.insert("ctx-size", serde_json::from_str(val.as_str()).unwrap());
    } else {
        options.insert("ctx-size", Value::from(4096));
    }
    if let Ok(val) = env::var("n_gpu_layers") {
        options.insert("n-gpu-layers", serde_json::from_str(val.as_str()).unwrap());
    } else {
        options.insert("n-gpu-layers", Value::from(0));
    }
    options
}

fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec<u8>) -> Result<(), Error> {
    context.set_input(0, TensorType::U8, &[1], &data)
}

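// Output index 0 holds the generated text; output index 1 holds run metadata
// (token counts, llama.cpp build info) as a JSON string. See the helpers below.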
fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
    // Reserve space for 4096 tokens with an average token length of 6 bytes.
    const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
    let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
    let mut output_size = context
        .get_output(index, &mut output_buffer)
        .expect("Failed to get output");
    output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size);

    String::from_utf8_lossy(&output_buffer[..output_size]).to_string()
}

fn get_output_from_context(context: &GraphExecutionContext) -> String {
    get_data_from_context(context, 0)
}

fn get_metadata_from_context(context: &GraphExecutionContext) -> Value {
    serde_json::from_str(&get_data_from_context(context, 1)).expect("Failed to get metadata")
}

fn main() {
    let args: Vec<String> = env::args().collect();
    let model_name: &str = &args[1];

    // Set options for the graph. Check our README for more details:
    // https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters
    let options = get_options_from_env();
    // You could also set the options manually like this:
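    // (illustrative values, equivalent to the environment-variable defaults above)
    //
    //     let mut options = HashMap::new();
    //     options.insert("mmproj", Value::from("Qwen2-VL-2B-Instruct-vision-encoder.gguf"));
    //     options.insert("image", Value::from("monalisa.jpg"));
    //     options.insert("ctx-size", Value::from(4096));
    //     options.insert("n-gpu-layers", Value::from(0));
    //     options.insert("enable-log", Value::from(false));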
    // Create graph and initialize context.
    let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
        .config(serde_json::to_string(&options).expect("Failed to serialize options"))
        .build_from_cache(model_name)
        .expect("Failed to build graph");
    let mut context = graph
        .init_execution_context()
        .expect("Failed to init context");

    // If there is a third argument, use it as the prompt and enter non-interactive mode.
    // This is mainly for the CI workflow.
    if args.len() >= 3 {
        let prompt = &args[2];
        // Set the prompt.
        println!("Prompt:\n{}", prompt);
        let tensor_data = prompt.as_bytes().to_vec();
        context
            .set_input(0, TensorType::U8, &[1], &tensor_data)
            .expect("Failed to set input");
        println!("Response:");

        // Get the number of input tokens and llama.cpp versions.
        let input_metadata = get_metadata_from_context(&context);
        println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
        println!(
            "[INFO] llama_build_number: {}",
            input_metadata["llama_build_number"]
        );
        println!(
            "[INFO] Number of input tokens: {}",
            input_metadata["input_tokens"]
        );

        // Get the output.
        context.compute().expect("Failed to compute");
        let output = get_output_from_context(&context);
        println!("{}", output.trim());

        // Retrieve the output metadata.
        let metadata = get_metadata_from_context(&context);
        println!(
            "[INFO] Number of input tokens: {}",
            metadata["input_tokens"]
        );
        println!(
            "[INFO] Number of output tokens: {}",
            metadata["output_tokens"]
        );
        std::process::exit(0);
    }

    let mut saved_prompt = String::new();
    let system_prompt = String::from("You are a helpful assistant.");
    let image_placeholder = "<image>";

    loop {
        println!("USER:");
        let input = read_input();

        // Qwen2VL prompt format: <|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n<|vision_start|>{image_placeholder}<|vision_end|>{user_prompt}<|im_end|>\n<|im_start|>assistant\n
        if saved_prompt.is_empty() {
            saved_prompt = format!(
                "<|im_start|>system\n{}<|im_end|>\n<|im_start|>user\n<|vision_start|>{}<|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n",
                system_prompt, image_placeholder, input
            );
        } else {
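            // Follow-up turns append only the new user text; the image placeholder is
            // embedded only in the first turn of the conversation.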
            saved_prompt = format!(
                "{}<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
                saved_prompt, input
            );
        }

        // Set prompt to the input tensor.
        set_data_to_context(&mut context, saved_prompt.as_bytes().to_vec())
            .expect("Failed to set input");

        // Execute the inference.
        let mut reset_prompt = false;
        match context.compute() {
            Ok(_) => (),
            Err(Error::BackendError(BackendError::ContextFull)) => {
                println!("\n[INFO] Context full, we'll reset the context and continue.");
                reset_prompt = true;
            }
            Err(Error::BackendError(BackendError::PromptTooLong)) => {
                println!("\n[INFO] Prompt too long, we'll reset the context and continue.");
                reset_prompt = true;
            }
            Err(err) => {
                println!("\n[ERROR] {}", err);
                std::process::exit(1);
            }
        }

        // Retrieve the output.
        let mut output = get_output_from_context(&context);
        println!("ASSISTANT:\n{}", output.trim());

        // Update the saved prompt.
        if reset_prompt {
            saved_prompt.clear();
        } else {
            output = output.trim().to_string();
            saved_prompt = format!("{}{}<|im_end|>\n", saved_prompt, output);
        }
    }
}
Binary file (1.74 MB) not shown.
