
Commit 9cf37f2

dm4 authored and hydai committed
[Example] ggml: add multimodel example with CI
Signed-off-by: dm4 <dm4@secondstate.io>
1 parent c573af5 commit 9cf37f2


5 files changed: +276 -0 lines changed


.github/workflows/llama.yml

Lines changed: 18 additions & 0 deletions
@@ -114,6 +114,24 @@ jobs:
           default \
           'def print_hello_world():'
 
+      - name: Multiple Models Example
+        run: |
+          test -f ~/.wasmedge/env && source ~/.wasmedge/env
+          cd wasmedge-ggml/multimodel
+          curl -LO https://huggingface.co/second-state/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf
+          curl -LO https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/vicuna-7b-q5_k.gguf
+          curl -LO https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/mmproj-vicuna7b-f16.gguf
+          curl -LO https://llava-vl.github.io/static/images/monalisa.jpg
+          cargo build --target wasm32-wasi --release
+          time wasmedge --dir .:. \
+            --env n_gpu_layers="$NGL" \
+            --env image=monalisa.jpg \
+            --env mmproj=mmproj-vicuna7b-f16.gguf \
+            --nn-preload llama2:GGML:AUTO:llama-2-7b-chat.Q5_K_M.gguf \
+            --nn-preload llava:GGML:AUTO:vicuna-7b-q5_k.gguf \
+            target/wasm32-wasi/release/wasmedge-ggml-multimodel.wasm \
+            'describe this picture please'
+
       - name: Build llama-stream
         run: |
           cd wasmedge-ggml/llama-stream
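
The `--env` values passed in this CI step are read back inside the guest and forwarded to the GGML plugin as a JSON config. The following is a condensed sketch of that mapping, based on the `get_options_from_env` helper added in this commit (the full version also handles `enable_log` and exits on missing required values):

```rust
use serde_json::{json, Value};
use std::env;

// Sketch: the host's `--env image=...`, `--env mmproj=...`, and `--env n_gpu_layers=...`
// become fields of the JSON options string handed to the GGML backend.
fn options_from_env() -> Value {
    let mut options = json!({});
    options["mmproj"] = Value::from(env::var("mmproj").expect("mmproj is required"));
    options["image"] = Value::from(env::var("image").expect("image is required"));
    options["n-gpu-layers"] = env::var("n_gpu_layers")
        .ok()
        .and_then(|v| v.parse::<u64>().ok())
        .map(Value::from)
        .unwrap_or_else(|| Value::from(0));
    options
}
```
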
wasmedge-ggml/multimodel/Cargo.toml

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
[package]
name = "wasmedge-ggml-multimodel"
version = "0.1.0"
edition = "2021"

[dependencies]
serde_json = "1.0"
wasmedge-wasi-nn = "0.7.0"

wasmedge-ggml/multimodel/README.md

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
# Multiple Models Example For WASI-NN with GGML Backend

> [!NOTE]
> Please refer to the [wasmedge-ggml/README.md](../README.md) for the general introduction and the setup of the WASI-NN plugin with the GGML backend. This document focuses on chaining the results of multiple models.

In this example, we ask the `Llava` model a question about an image and then pass its answer to the `Llama2` model for a follow-up response. The example demonstrates how to use the WasmEdge WASI-NN plugin to link two or more models together, as sketched below.

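The following is a condensed sketch of that chaining flow, adapted from the `main.rs` added in this commit. The `mmproj`, `image`, and `ctx-size` options that the full program passes via `.config(...)`, as well as its handling of `ContextFull`/`PromptTooLong` errors, are omitted here for brevity.

```rust
use wasmedge_wasi_nn::{
    Error, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext, TensorType,
};

// Read one output string from an execution context (same approach as
// get_data_from_context in the full main.rs).
fn output_of(ctx: &GraphExecutionContext) -> String {
    let mut buf = vec![0u8; 4096 * 6];
    let n = ctx.get_output(0, &mut buf).expect("Failed to get output");
    String::from_utf8_lossy(&buf[..n.min(buf.len())]).to_string()
}

// Ask llava about the image, then hand its answer to llama2 for a summary.
fn chain_models(system_prompt: &str, question: &str) -> Result<(), Error> {
    // Both graphs are preloaded by the host via --nn-preload <name>:GGML:AUTO:<file>.
    let llava = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
        .build_from_cache("llava")?;
    let llama2 = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
        .build_from_cache("llama2")?;
    let mut llava_ctx = llava.init_execution_context()?;
    let mut llama2_ctx = llama2.init_execution_context()?;

    // 1. Llava answers the question about the image; the <image> placeholder is
    //    resolved from the `image` option by the GGML backend.
    let llava_prompt = format!("{}\nUSER:<image>\n{}\nASSISTANT:", system_prompt, question);
    llava_ctx.set_input(0, TensorType::U8, &[1], llava_prompt.as_bytes())?;
    llava_ctx.compute()?;
    let answer = output_of(&llava_ctx);

    // 2. Llama2 post-processes llava's answer (here, a one-sentence summary).
    let llama2_prompt = format!(
        "[INST] <<SYS>> {} <</SYS>> Summarize the following text in 1 sentence: {} [/INST]",
        system_prompt,
        answer.trim()
    );
    llama2_ctx.set_input(0, TensorType::U8, &[1], llama2_prompt.as_bytes())?;
    llama2_ctx.compute()?;
    println!("ASSISTANT (llama2):\n{}", output_of(&llama2_ctx).trim());
    Ok(())
}
```
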
## Get the Model

This example uses the `Llama2` model and `Llava` model. You can download the models from the following links:

```bash
curl -LO https://huggingface.co/second-state/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf
curl -LO https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/vicuna-7b-q5_k.gguf
curl -LO https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/mmproj-vicuna7b-f16.gguf
```

## Parameters

> [!NOTE]
> Please check the parameters section of [wasmedge-ggml/README.md](https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters) first.

Download the image for the Llava model:

```bash
curl -LO https://llava-vl.github.io/static/images/monalisa.jpg
```

## Execute

Execute the WASM with `wasmedge`, using the named model feature to preload the two large models:

```console
$ wasmedge --dir .:. \
  --env image=monalisa.jpg \
  --env mmproj=mmproj-vicuna7b-f16.gguf \
  --nn-preload llama2:GGML:AUTO:llama-2-7b-chat.Q5_K_M.gguf \
  --nn-preload llava:GGML:AUTO:vicuna-7b-q5_k.gguf \
  wasmedge-ggml-multimodel.wasm

USER:
describe this picture please
ASSISTANT (llava):
The image you've provided appears to be a painting of the Mona Lisa, one of Leonardo da Vinci's most famous works. It is a portrait of a woman with a serene and enigmatic expression, looking directly at the viewer. Her hair is styled in an updo, and she wears a dark dress that drapes elegantly around her shoulders. The background features a landscape with rolling hills and a river, which adds depth to the composition. The painting is renowned for its subtle changes in expression and the enigmatic smile on the subject's face, which has intrigued viewers for centuries.
ASSISTANT (llama2):
The image provided is a painting of the Mona Lisa, one of Leonardo da Vinci's most famous works, depicting a woman with a serene and enigmatic expression, styled in an updo with a dark dress draped elegantly around her shoulders, set against a landscape background with rolling hills and a river.
```
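
The names `llama2` and `llava` given to `--nn-preload` are the handles the guest program uses to look the preloaded models up again; in this commit that lookup is done with `build_from_cache`. A minimal sketch of the mapping (it assumes the crate's `Graph` type as the return value; the option values are the ones used above):

```rust
use wasmedge_wasi_nn::{Error, ExecutionTarget, Graph, GraphBuilder, GraphEncoding};

// "llava" must match the name given on the host side:
//   --nn-preload llava:GGML:AUTO:vicuna-7b-q5_k.gguf
fn load_preloaded_llava() -> Result<Graph, Error> {
    let options = serde_json::json!({
        "mmproj": "mmproj-vicuna7b-f16.gguf",
        "image": "monalisa.jpg",
    });
    GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
        .config(options.to_string())
        .build_from_cache("llava")
}
```
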

wasmedge-ggml/multimodel/src/main.rs

Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
1+
use serde_json::json;
2+
use serde_json::Value;
3+
use std::env;
4+
use std::io;
5+
use wasmedge_wasi_nn::{
6+
self, BackendError, Error, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext,
7+
TensorType,
8+
};
9+
10+
fn read_input() -> String {
11+
loop {
12+
let mut answer = String::new();
13+
io::stdin()
14+
.read_line(&mut answer)
15+
.expect("Failed to read line");
16+
if !answer.is_empty() && answer != "\n" && answer != "\r\n" {
17+
return answer.trim().to_string();
18+
}
19+
}
20+
}
21+
22+
fn get_options_from_env() -> Value {
23+
let mut options = json!({});
24+
25+
// Required parameters for llava
26+
if let Ok(val) = env::var("mmproj") {
27+
options["mmproj"] = Value::from(val.as_str());
28+
} else {
29+
eprintln!("Failed to get mmproj model.");
30+
std::process::exit(1);
31+
}
32+
if let Ok(val) = env::var("image") {
33+
options["image"] = Value::from(val.as_str());
34+
} else {
35+
eprintln!("Failed to get the target image.");
36+
std::process::exit(1);
37+
}
38+
39+
// Optional parameters
40+
if let Ok(val) = env::var("enable_log") {
41+
options["enable-log"] = serde_json::from_str(val.as_str())
42+
.expect("invalid value for enable-log option (true/false)")
43+
} else {
44+
options["enable-log"] = serde_json::from_str("false").unwrap()
45+
}
46+
if let Ok(val) = env::var("n_gpu_layers") {
47+
options["n-gpu-layers"] =
48+
serde_json::from_str(val.as_str()).expect("invalid ngl value (unsigned integer")
49+
} else {
50+
options["n-gpu-layers"] = serde_json::from_str("0").unwrap()
51+
}
52+
53+
options
54+
}
55+
56+
fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec<u8>) -> Result<(), Error> {
57+
context.set_input(0, TensorType::U8, &[1], &data)
58+
}
59+
60+
#[allow(dead_code)]
61+
fn set_metadata_to_context(
62+
context: &mut GraphExecutionContext,
63+
data: Vec<u8>,
64+
) -> Result<(), Error> {
65+
context.set_input(1, TensorType::U8, &[1], &data)
66+
}
67+
68+
fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
69+
// Preserve for 4096 tokens with average token length 6
70+
const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
71+
let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
72+
let mut output_size = context
73+
.get_output(index, &mut output_buffer)
74+
.expect("Failed to get output");
75+
output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size);
76+
77+
return String::from_utf8_lossy(&output_buffer[..output_size]).to_string();
78+
}
79+
80+
fn get_output_from_context(context: &GraphExecutionContext) -> String {
81+
get_data_from_context(context, 0)
82+
}
83+
84+
#[allow(dead_code)]
85+
fn get_metadata_from_context(context: &GraphExecutionContext) -> Value {
86+
serde_json::from_str(&get_data_from_context(context, 1)).expect("Failed to get metadata")
87+
}
88+
89+
fn main() {
90+
let args: Vec<String> = env::args().collect();
91+
92+
// Set options for the graph. Check our README for more details:
93+
// https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters
94+
let mut options = get_options_from_env();
95+
// We set the temperature to 0.1 for more consistent results.
96+
options["temp"] = Value::from(0.1);
97+
// Set the context size to 4096 tokens for the llava 1.6 model.
98+
options["ctx-size"] = Value::from(4096);
99+
100+
// Create the llava model.
101+
let mut graphs = Vec::new();
102+
graphs.push(
103+
GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
104+
.config(serde_json::to_string(&options).expect("Failed to serialize options"))
105+
.build_from_cache("llava")
106+
.expect("Failed to build graph"),
107+
);
108+
109+
// Remove unnecessary options for the llama2 model.
110+
options
111+
.as_object_mut()
112+
.expect("Failed to get jsons object")
113+
.remove("mmproj");
114+
options
115+
.as_object_mut()
116+
.expect("Failed to get json object")
117+
.remove("image");
118+
// Create the llama2 model.
119+
graphs.push(
120+
GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
121+
.config(serde_json::to_string(&options).expect("Failed to serialize options"))
122+
.build_from_cache("llama2")
123+
.expect("Failed to build graph"),
124+
);
125+
126+
// Initilize the execution contexts.
127+
let mut contexts = Vec::new();
128+
contexts.push(
129+
graphs[0]
130+
.init_execution_context()
131+
.expect("Failed to init context"),
132+
);
133+
contexts.push(
134+
graphs[1]
135+
.init_execution_context()
136+
.expect("Failed to init context"),
137+
);
138+
139+
let system_prompt = String::from("You are a helpful, respectful and honest assistant.");
140+
let mut input = String::from("");
141+
142+
// If the user provides a prompt, use it.
143+
println!("USER:");
144+
if args.len() >= 2 {
145+
input += &args[1];
146+
println!("{}", input);
147+
} else {
148+
input = read_input();
149+
}
150+
151+
// Llava inference.
152+
let image_placeholder = "<image>";
153+
let mut saved_prompt = format!(
154+
"{}\nUSER:{}\n{}\nASSISTANT:",
155+
system_prompt, image_placeholder, input
156+
);
157+
set_data_to_context(&mut contexts[0], saved_prompt.as_bytes().to_vec())
158+
.expect("Failed to set input");
159+
match contexts[0].compute() {
160+
Ok(_) => (),
161+
Err(Error::BackendError(BackendError::ContextFull)) => {
162+
println!("\n[INFO] Context full, we'll reset the context and continue.");
163+
}
164+
Err(Error::BackendError(BackendError::PromptTooLong)) => {
165+
println!("\n[INFO] Prompt too long, we'll reset the context and continue.");
166+
}
167+
Err(err) => {
168+
println!("\n[ERROR] {}", err);
169+
}
170+
}
171+
172+
// Retrieve the llava output.
173+
let mut output = get_output_from_context(&contexts[0]);
174+
println!("ASSISTANT (llava):\n{}", output.trim());
175+
176+
// Llama2 inference.
177+
let llama2_prompt = "Summarize the following text in 1 sentence:";
178+
saved_prompt = format!(
179+
"[INST] <<SYS>> {} <</SYS>> {} {} [/INST]",
180+
system_prompt,
181+
llama2_prompt,
182+
output.trim()
183+
);
184+
set_data_to_context(&mut contexts[1], saved_prompt.as_bytes().to_vec())
185+
.expect("Failed to set input");
186+
match contexts[1].compute() {
187+
Ok(_) => (),
188+
Err(Error::BackendError(BackendError::ContextFull)) => {
189+
println!("\n[INFO] Context full, we'll reset the context and continue.");
190+
}
191+
Err(Error::BackendError(BackendError::PromptTooLong)) => {
192+
println!("\n[INFO] Prompt too long, we'll reset the context and continue.");
193+
}
194+
Err(err) => {
195+
println!("\n[ERROR] {}", err);
196+
}
197+
}
198+
199+
// Retrieve the llama2 output.
200+
output = get_output_from_context(&contexts[1]);
201+
println!("ASSISTANT (llama2):\n{}", output.trim());
202+
}
Binary file not shown (2.16 MB).

0 commit comments
