
Commit 26ab476

[Example] ggml: add nnrpc example for RPC usage (second-state#119)
* [Example] ggml: add nnrpc example for RPC usage
* [CI] llama: add tests for nnrpc example

Signed-off-by: dm4 <dm4@secondstate.io>
1 parent 1be5d43 commit 26ab476

File tree

5 files changed (+253, -0 lines)

.github/workflows/llama.yml

Lines changed: 13 additions & 0 deletions
@@ -145,6 +145,19 @@ jobs:
           default \
           'hello world'
 
+    - name: RPC Example
+      run: |
+        test -f ~/.wasmedge/env && source ~/.wasmedge/env
+        cd wasmedge-ggml/nnrpc
+        curl -LO https://huggingface.co/second-state/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf
+        cargo build --target wasm32-wasi --release
+        time wasmedge --dir .:. \
+          --env n_gpu_layers="$NGL" \
+          --nn-preload default:GGML:AUTO:llama-2-7b-chat.Q5_K_M.gguf \
+          target/wasm32-wasi/release/wasmedge-ggml-nnrpc.wasm \
+          default \
+          $'[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you do not know the answer to a question, please do not share false information.\n<</SYS>>\nWhat is the capital of Japan?[/INST]'
+
     - name: Build llama-stream
       run: |
         cd wasmedge-ggml/llama-stream

wasmedge-ggml/nnrpc/Cargo.toml

Lines changed: 8 additions & 0 deletions
[package]
name = "wasmedge-ggml-nnrpc"
version = "0.1.0"
edition = "2021"

[dependencies]
serde_json = "1.0"
wasmedge-wasi-nn = "0.7.0"

wasmedge-ggml/nnrpc/README.md

Lines changed: 44 additions & 0 deletions
# RPC Example For WASI-NN with GGML Backend

> [!NOTE]
> Please refer to the [wasmedge-ggml/README.md](../README.md) for the general introduction and the setup of the WASI-NN plugin with the GGML backend. This document focuses on the WASI-NN RPC usage.

## Parameters

> [!NOTE]
> Please check the parameters section of [wasmedge-ggml/README.md](https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters) first.

For GPU offloading, set the `n-gpu-layers` option to the number of layers you want to offload to the GPU.

```rust
options.insert("n-gpu-layers", Value::from(...));
```

For llava inference, we recommend a `ctx-size` of at least `2048` when using llava-v1.5 and at least `4096` when using llava-v1.6 for better results.

```rust
options.insert("ctx-size", Value::from(4096));
```

## Execute

```console
# Run the RPC server.
$ wasi_nn_rpcserver --nn-rpc-uri unix://$PWD/nn_server.sock \
    --nn-preload default:GGML:AUTO:llama-2-7b-chat.Q5_K_M.gguf

# Run wasmedge and perform inference through the RPC server.
$ wasmedge \
    --nn-rpc-uri unix://$PWD/nn_server.sock \
    wasmedge-ggml-nnrpc.wasm default

USER:
What's the capital of the United States?
ASSISTANT:
The capital of the United States is Washington, D.C. (District of Columbia).
USER:
How about France?
ASSISTANT:
The capital of France is Paris.
```
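
The `wasmedge-ggml-nnrpc.wasm` module used above can be built from this directory with the same command the CI workflow uses (this assumes the Rust `wasm32-wasi` target is installed):

```console
# Build the example; the module is written to
# target/wasm32-wasi/release/wasmedge-ggml-nnrpc.wasm.
$ cargo build --target wasm32-wasi --release
```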

wasmedge-ggml/nnrpc/src/main.rs

Lines changed: 188 additions & 0 deletions
use serde_json::json;
use serde_json::Value;
use std::env;
use std::io;
use wasmedge_wasi_nn::{
    self, BackendError, Error, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext,
    TensorType,
};

fn read_input() -> String {
    loop {
        let mut answer = String::new();
        io::stdin()
            .read_line(&mut answer)
            .expect("Failed to read line");
        if !answer.is_empty() && answer != "\n" && answer != "\r\n" {
            return answer.trim().to_string();
        }
    }
}

fn get_options_from_env() -> Value {
    let mut options = json!({});
    if let Ok(val) = env::var("enable_log") {
        options["enable-log"] = serde_json::from_str(val.as_str())
            .expect("invalid value for enable-log option (true/false)")
    } else {
        options["enable-log"] = serde_json::from_str("false").unwrap()
    }
    if let Ok(val) = env::var("n_gpu_layers") {
        options["n-gpu-layers"] =
            serde_json::from_str(val.as_str()).expect("invalid ngl value (unsigned integer)")
    } else {
        options["n-gpu-layers"] = serde_json::from_str("0").unwrap()
    }
    options["ctx-size"] = serde_json::from_str("1024").unwrap();

    options
}

fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec<u8>) -> Result<(), Error> {
    context.set_input(0, TensorType::U8, &[1], &data)
}

#[allow(dead_code)]
fn set_metadata_to_context(
    context: &mut GraphExecutionContext,
    data: Vec<u8>,
) -> Result<(), Error> {
    context.set_input(1, TensorType::U8, &[1], &data)
}

fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
    // Reserve space for 4096 tokens with an average token length of 6.
    const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
    let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
    let mut output_size = context
        .get_output(index, &mut output_buffer)
        .expect("Failed to get output");
    output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size);

    return String::from_utf8_lossy(&output_buffer[..output_size]).to_string();
}

fn get_output_from_context(context: &GraphExecutionContext) -> String {
    get_data_from_context(context, 0)
}

#[allow(dead_code)]
fn get_metadata_from_context(context: &GraphExecutionContext) -> Value {
    serde_json::from_str(&get_data_from_context(context, 1)).expect("Failed to get metadata")
}

fn main() {
    let args: Vec<String> = env::args().collect();
    let model_name: &str = &args[1];

    // Set options for the graph. Check our README for more details:
    // https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters
    let options = get_options_from_env();

    // Create graph and initialize context.
    let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
        .build_from_cache(model_name)
        .expect("Failed to build graph");
    let mut context = graph
        .init_execution_context()
        .expect("Failed to init context");

    // We also support setting the options via the input tensor with index 1.
    // Check our README for more details.
    set_metadata_to_context(
        &mut context,
        serde_json::to_string(&options)
            .expect("Failed to serialize options")
            .as_bytes()
            .to_vec(),
    )
    .expect("Failed to set metadata");

    // If there is a third argument, use it as the prompt and enter non-interactive mode.
    // This is mainly for the CI workflow.
    if args.len() >= 3 {
        // Set the prompt.
        let prompt = &args[2];
        println!("Prompt:\n{}", prompt);
        let tensor_data = prompt.as_bytes().to_vec();
        context
            .set_input(0, TensorType::U8, &[1], &tensor_data)
            .expect("Failed to set input");
        // Get the number of input tokens and llama.cpp versions.
        let input_metadata = get_metadata_from_context(&context);
        println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
        println!(
            "[INFO] llama_build_number: {}",
            input_metadata["llama_build_number"]
        );
        println!(
            "[INFO] Number of input tokens: {}",
            input_metadata["input_tokens"]
        );
        // Get the response.
        println!("Response:");
        context.compute().expect("Failed to compute");
        let output = get_output_from_context(&context);
        println!("{}", output.trim());
        // Retrieve the output metadata.
        let metadata = get_metadata_from_context(&context);
        println!(
            "[INFO] Number of input tokens: {}",
            metadata["input_tokens"]
        );
        println!(
            "[INFO] Number of output tokens: {}",
            metadata["output_tokens"]
        );
        std::process::exit(0);
    }

    let mut saved_prompt = String::new();
    let system_prompt = String::from("You are a helpful, respectful and honest assistant. Always answer as short as possible, while being safe.");

    loop {
        println!("USER:");
        let input = read_input();
        if saved_prompt.is_empty() {
            saved_prompt = format!(
                "[INST] <<SYS>> {} <</SYS>> {} [/INST]",
                system_prompt, input
            );
        } else {
            saved_prompt = format!("{} [INST] {} [/INST]", saved_prompt, input);
        }

        // Set the prompt to the input tensor.
        set_data_to_context(&mut context, saved_prompt.as_bytes().to_vec())
            .expect("Failed to set input");

        // Execute the inference.
        let mut reset_prompt = false;
        match context.compute() {
            Ok(_) => (),
            Err(Error::BackendError(BackendError::ContextFull)) => {
                println!("\n[INFO] Context full, we'll reset the context and continue.");
                reset_prompt = true;
            }
            Err(Error::BackendError(BackendError::PromptTooLong)) => {
                println!("\n[INFO] Prompt too long, we'll reset the context and continue.");
                reset_prompt = true;
            }
            Err(err) => {
                println!("\n[ERROR] {}", err);
            }
        }

        // Retrieve the output.
        let mut output = get_output_from_context(&context);
        println!("ASSISTANT:\n{}", output.trim());

        // Update the saved prompt.
        if reset_prompt {
            saved_prompt.clear();
        } else {
            output = output.trim().to_string();
            saved_prompt = format!("{} {}", saved_prompt, output);
        }
    }
}
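
As a usage note, `get_options_from_env` reads its options from environment variables, which the CI step passes with wasmedge's `--env` flag. The sketch below combines that flag with the RPC invocation from the README's Execute section; it is not part of the commit, and the `enable_log` and `n_gpu_layers` values are placeholders:

```console
# Start the RPC server with the model preloaded (as in the Execute section).
$ wasi_nn_rpcserver --nn-rpc-uri unix://$PWD/nn_server.sock \
    --nn-preload default:GGML:AUTO:llama-2-7b-chat.Q5_K_M.gguf

# Run the example interactively; enable_log and n_gpu_layers are read by
# get_options_from_env() inside the module (values here are placeholders).
$ wasmedge \
    --env enable_log=true \
    --env n_gpu_layers=35 \
    --nn-rpc-uri unix://$PWD/nn_server.sock \
    wasmedge-ggml-nnrpc.wasm default
```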
2.15 MB binary file not shown.

0 commit comments
