
Commit c573af5

[Example] ggml: add basic example with CI (second-state#110)
* [Example] ggml: add basic example
* [CI] llama: add job for StarCoder 2 model

Signed-off-by: dm4 <dm4@secondstate.io>
1 parent 4dffec2 commit c573af5

File tree

5 files changed, +228 -0 lines changed


.github/workflows/llama.yml

Lines changed: 14 additions & 0 deletions
```diff
@@ -100,6 +100,20 @@ jobs:
            default \
            $'[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you do not know the answer to a question, please do not share false information.\n<</SYS>>\nWhat is the capital of Japan?[/INST]'

+      - name: StarCoder 2 7B
+        run: |
+          test -f ~/.wasmedge/env && source ~/.wasmedge/env
+          cd wasmedge-ggml/basic
+          curl -LO https://huggingface.co/second-state/StarCoder2-7B-GGUF/resolve/main/starcoder2-7b-Q5_K_M.gguf
+          cargo build --target wasm32-wasi --release
+          time wasmedge --dir .:. \
+            --env n_gpu_layers="$NGL" \
+            --env n_predict=100 \
+            --nn-preload default:GGML:AUTO:starcoder2-7b-Q5_K_M.gguf \
+            target/wasm32-wasi/release/wasmedge-ggml-basic.wasm \
+            default \
+            'def print_hello_world():'
+
       - name: Build llama-stream
         run: |
           cd wasmedge-ggml/llama-stream
```

wasmedge-ggml/basic/Cargo.toml

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@

```toml
[package]
name = "wasmedge-ggml-basic"
version = "0.1.0"
edition = "2021"

[dependencies]
serde_json = "1.0"
wasmedge-wasi-nn = "0.7.0"
```
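The new crate carries no build script or special flags; it is compiled to a plain WASI module. A minimal build sketch, assuming the `wasm32-wasi` Rust target is already installed (these are the same commands the CI step above runs):

```bash
# Assumption: the wasm32-wasi target has been added via `rustup target add wasm32-wasi`.
cd wasmedge-ggml/basic
cargo build --target wasm32-wasi --release
# The module lands at target/wasm32-wasi/release/wasmedge-ggml-basic.wasm,
# which is the path the CI job passes to wasmedge.
```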

wasmedge-ggml/basic/README.md

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@

# Basic Example For WASI-NN with GGML Backend

> [!NOTE]
> Please refer to [wasmedge-ggml/README.md](../README.md) for the general introduction and the setup of the WASI-NN plugin with the GGML backend. This document focuses on models that do not use prompt templates.

## Get the Model

This example targets models without a prompt template, such as the `StarCoder2` model.

Download the model:

```bash
curl -LO https://huggingface.co/second-state/StarCoder2-7B-GGUF/resolve/main/starcoder2-7b-Q5_K_M.gguf
```

## Parameters

> [!NOTE]
> Please check the parameters section of [wasmedge-ggml/README.md](https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters) first.

- For GPU offloading, set the `n-gpu-layers` option to the number of layers you want to offload to the GPU (see the sketch after this list).
- When using the `StarCoder2` model, use the `n-predict` option to cap the number of predicted tokens. Since the inference may not stop as expected, it is recommended to set such a limit.

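As a rough sketch (not part of the committed README), both options map to environment variables read by the example; the layer count `35` below is only an illustrative placeholder and should be tuned to your GPU:

```console
$ # n_gpu_layers=35 is a placeholder value, not taken from this commit
$ wasmedge --dir .:. \
    --env n_gpu_layers=35 \
    --env n_predict=100 \
    --nn-preload default:GGML:AUTO:starcoder2-7b-Q5_K_M.gguf \
    wasmedge-ggml-basic.wasm default
```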
## Execute

Execute the WASM with `wasmedge`, using the named model feature to preload a large model:

```console
$ wasmedge --dir .:. \
    --env n_predict=100 \
    --nn-preload default:GGML:AUTO:/disk/starcoder2-7b-Q5_K_M.gguf \
    wasmedge-ggml-basic.wasm default

USER:
def print_hello_world():
ASSISTANT:

    print("Hello World!")

def main():
    print_hello_world()


if __name__ == "__main__":
    main()/README.md
# python-learning

This repository is for learning Python.

## References

* [Python 3.8.0 Documentation](https://docs.python.org/3/)
```

wasmedge-ggml/basic/src/main.rs

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@

```rust
use serde_json::json;
use serde_json::Value;
use std::env;
use std::io;
use wasmedge_wasi_nn::{
    self, BackendError, Error, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext,
    TensorType,
};

fn read_input() -> String {
    loop {
        let mut answer = String::new();
        io::stdin()
            .read_line(&mut answer)
            .expect("Failed to read line");
        if !answer.is_empty() && answer != "\n" && answer != "\r\n" {
            return answer.trim().to_string();
        }
    }
}

fn get_options_from_env() -> Value {
    let mut options = json!({});
    if let Ok(val) = env::var("enable_log") {
        options["enable-log"] =
            serde_json::from_str(val.as_str()).expect("invalid enable-log value (true/false)")
    } else {
        options["enable-log"] = serde_json::from_str("false").unwrap()
    }
    if let Ok(val) = env::var("ctx_size") {
        options["ctx-size"] =
            serde_json::from_str(val.as_str()).expect("invalid ctx-size value (unsigned integer)")
    } else {
        options["ctx-size"] = serde_json::from_str("512").unwrap()
    }
    if let Ok(val) = env::var("n_gpu_layers") {
        options["n-gpu-layers"] =
            serde_json::from_str(val.as_str()).expect("invalid ngl (unsigned integer)")
    } else {
        options["n-gpu-layers"] = serde_json::from_str("100").unwrap()
    }
    if let Ok(val) = env::var("n_predict") {
        options["n-predict"] =
            serde_json::from_str(val.as_str()).expect("invalid n-predict (unsigned integer)")
    } else {
        options["n-predict"] = serde_json::from_str("512").unwrap()
    }

    options
}

fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec<u8>) -> Result<(), Error> {
    context.set_input(0, TensorType::U8, &[1], &data)
}

#[allow(dead_code)]
fn set_metadata_to_context(
    context: &mut GraphExecutionContext,
    data: Vec<u8>,
) -> Result<(), Error> {
    context.set_input(1, TensorType::U8, &[1], &data)
}

fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
    // Preserve for 4096 tokens with average token length 6
    const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
    let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
    let mut output_size = context
        .get_output(index, &mut output_buffer)
        .expect("Failed to get output");
    output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size);

    return String::from_utf8_lossy(&output_buffer[..output_size]).to_string();
}

fn get_output_from_context(context: &GraphExecutionContext) -> String {
    get_data_from_context(context, 0)
}

#[allow(dead_code)]
fn get_metadata_from_context(context: &GraphExecutionContext) -> Value {
    serde_json::from_str(&get_data_from_context(context, 1)).expect("Failed to get metadata")
}

fn main() {
    let args: Vec<String> = env::args().collect();
    let model_name: &str = &args[1];

    // Set options for the graph. Check our README for more details:
    // https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters
    let mut options = get_options_from_env();
    // Set the stream-stdout option to true to make the response more interactive.
    options["stream-stdout"] = serde_json::from_str("true").unwrap();
    // We set the temperature to 0.1 in this example to make the response more consistent.
    options["temp"] = Value::from(0.1);

    // Create graph and initialize context.
    let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
        .config(serde_json::to_string(&options).expect("Failed to serialize options"))
        .build_from_cache(model_name)
        .expect("Failed to build graph");
    let mut context = graph
        .init_execution_context()
        .expect("Failed to init context");

    // If there is a third argument, use it as the prompt and enter non-interactive mode.
    // This is mainly for the CI workflow.
    if args.len() >= 3 {
        let prompt = &args[2];
        println!("Prompt:\n{}", prompt);
        let tensor_data = prompt.as_bytes().to_vec();
        context
            .set_input(0, TensorType::U8, &[1], &tensor_data)
            .expect("Failed to set input");
        println!("Response:");
        context.compute().expect("Failed to compute");
        let output = get_output_from_context(&context);
        println!("{}", output.trim());
        std::process::exit(0);
    }

    loop {
        println!("USER:");
        let input = read_input();

        // Set prompt to the input tensor.
        set_data_to_context(&mut context, input.as_bytes().to_vec()).expect("Failed to set input");

        // Execute the inference.
        println!("ASSISTANT:");
        match context.compute() {
            Ok(_) => (),
            Err(Error::BackendError(BackendError::ContextFull)) => {
                println!("\n[INFO] Context full, we'll reset the context and continue.");
            }
            Err(Error::BackendError(BackendError::PromptTooLong)) => {
                println!("\n[INFO] Prompt too long, we'll reset the context and continue.");
            }
            Err(err) => {
                println!("\n[ERROR] {}", err);
            }
        }

        // Retrieve the output.
        let output = get_output_from_context(&context);
        if let Some(true) = options["stream-stdout"].as_bool() {
            println!();
        } else {
            println!("{}", output.trim());
        }
    }
}
```
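For reference, the non-interactive branch above (prompt passed as an extra CLI argument) is what the new CI step exercises. A minimal invocation sketch, assuming the model file was downloaded into `wasmedge-ggml/basic` and the module was built as in the CI job:

```console
$ # Assumes the working directory layout of the CI step above.
$ wasmedge --dir .:. \
    --env n_predict=100 \
    --nn-preload default:GGML:AUTO:starcoder2-7b-Q5_K_M.gguf \
    target/wasm32-wasi/release/wasmedge-ggml-basic.wasm \
    default \
    'def print_hello_world():'
```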
Binary file (2.14 MB) not shown.
