
Commit ed96fcd

[Example] ggml: add test for set input twice (second-state#121)
* [Example] ggml: show metadata on CI for llama, llava examples

  Signed-off-by: dm4 <dm4@secondstate.io>

* [Example] ggml: add test for set input twice

  Signed-off-by: dm4 <dm4@secondstate.io>

---------

Signed-off-by: dm4 <dm4@secondstate.io>
1 parent 26ab476 commit ed96fcd

9 files changed: 193 additions, 24 deletions

‎.github/workflows/llama.yml‎

Lines changed: 13 additions & 0 deletions
@@ -158,6 +158,19 @@ jobs:
             default \
             $'[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you do not know the answer to a question, please do not share false information.\n<</SYS>>\nWhat is the capital of Japan?[/INST]'
 
+      - name: Set Input Twice
+        run: |
+          test -f ~/.wasmedge/env && source ~/.wasmedge/env
+          cd wasmedge-ggml/test/set-input-twice
+          curl -LO https://huggingface.co/second-state/Gemma-2b-it-GGUF/resolve/main/gemma-2b-it-Q5_K_M.gguf
+          cargo build --target wasm32-wasi --release
+          time wasmedge --dir .:. \
+            --env n_gpu_layers="$NGL" \
+            --nn-preload default:GGML:AUTO:gemma-2b-it-Q5_K_M.gguf \
+            target/wasm32-wasi/release/wasmedge-ggml-set-input-twice.wasm \
+            default \
+            '<start_of_turn>user Where is the capital of Japan? <end_of_turn><start_of_turn>model'
+
       - name: Build llama-stream
         run: |
           cd wasmedge-ggml/llama-stream

‎wasmedge-ggml/llama/src/main.rs‎

Lines changed: 26 additions & 24 deletions
@@ -66,7 +66,6 @@ fn get_output_from_context(context: &GraphExecutionContext) -> String {
     get_data_from_context(context, 0)
 }
 
-#[allow(dead_code)]
 fn get_metadata_from_context(context: &GraphExecutionContext) -> Value {
     serde_json::from_str(&get_data_from_context(context, 1)).expect("Failed to get metadata")
 }
@@ -103,15 +102,41 @@
     // This is mainly for the CI workflow.
     if args.len() >= 3 {
         let prompt = &args[2];
+        // Set the prompt.
         println!("Prompt:\n{}", prompt);
         let tensor_data = prompt.as_bytes().to_vec();
         context
             .set_input(0, TensorType::U8, &[1], &tensor_data)
             .expect("Failed to set input");
         println!("Response:");
+
+        // Get the number of input tokens and llama.cpp versions.
+        let input_metadata = get_metadata_from_context(&context);
+        println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
+        println!(
+            "[INFO] llama_build_number: {}",
+            input_metadata["llama_build_number"]
+        );
+        println!(
+            "[INFO] Number of input tokens: {}",
+            input_metadata["input_tokens"]
+        );
+
+        // Get the output.
         context.compute().expect("Failed to compute");
         let output = get_output_from_context(&context);
         println!("{}", output.trim());
+
+        // Retrieve the output metadata.
+        let metadata = get_metadata_from_context(&context);
+        println!(
+            "[INFO] Number of input tokens: {}",
+            metadata["input_tokens"]
+        );
+        println!(
+            "[INFO] Number of output tokens: {}",
+            metadata["output_tokens"]
+        );
         std::process::exit(0);
     }
 
@@ -134,18 +159,6 @@ fn main() {
         set_data_to_context(&mut context, saved_prompt.as_bytes().to_vec())
             .expect("Failed to set input");
 
-        // Get the number of input tokens and llama.cpp versions.
-        // let input_metadata = get_metadata_from_context(&context);
-        // println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
-        // println!(
-        //     "[INFO] llama_build_number: {}",
-        //     input_metadata["llama_build_number"]
-        // );
-        // println!(
-        //     "[INFO] Number of input tokens: {}",
-        //     input_metadata["input_tokens"]
-        // );
-
         // Execute the inference.
         let mut reset_prompt = false;
         match context.compute() {
@@ -174,16 +187,5 @@ fn main() {
             output = output.trim().to_string();
            saved_prompt = format!("{} {}", saved_prompt, output);
        }
-
-        // Retrieve the output metadata.
-        // let metadata = get_metadata_from_context(&context);
-        // println!(
-        //     "[INFO] Number of input tokens: {}",
-        //     metadata["input_tokens"]
-        // );
-        // println!(
-        //     "[INFO] Number of output tokens: {}",
-        //     metadata["output_tokens"]
-        // );
     }
 }
Binary file (16.7 KB) not shown.

‎wasmedge-ggml/llava/src/main.rs‎

Lines changed: 30 additions & 0 deletions
@@ -75,6 +75,10 @@ fn get_output_from_context(context: &GraphExecutionContext) -> String {
     get_data_from_context(context, 0)
 }
 
+fn get_metadata_from_context(context: &GraphExecutionContext) -> Value {
+    serde_json::from_str(&get_data_from_context(context, 1)).expect("Failed to get metadata")
+}
+
 fn main() {
     let args: Vec<String> = env::args().collect();
     let model_name: &str = &args[1];
@@ -98,15 +102,41 @@
     // This is mainly for the CI workflow.
     if args.len() >= 3 {
         let prompt = &args[2];
+        // Set the prompt.
         println!("Prompt:\n{}", prompt);
         let tensor_data = prompt.as_bytes().to_vec();
         context
             .set_input(0, TensorType::U8, &[1], &tensor_data)
             .expect("Failed to set input");
         println!("Response:");
+
+        // Get the number of input tokens and llama.cpp versions.
+        let input_metadata = get_metadata_from_context(&context);
+        println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
+        println!(
+            "[INFO] llama_build_number: {}",
+            input_metadata["llama_build_number"]
+        );
+        println!(
+            "[INFO] Number of input tokens: {}",
+            input_metadata["input_tokens"]
+        );
+
+        // Get the output.
         context.compute().expect("Failed to compute");
         let output = get_output_from_context(&context);
         println!("{}", output.trim());
+
+        // Retrieve the output metadata.
+        let metadata = get_metadata_from_context(&context);
+        println!(
+            "[INFO] Number of input tokens: {}",
+            metadata["input_tokens"]
+        );
+        println!(
+            "[INFO] Number of output tokens: {}",
+            metadata["output_tokens"]
+        );
         std::process::exit(0);
     }

Binary file (16.9 KB) not shown.

wasmedge-ggml/test/set-input-twice/Cargo.toml

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+[package]
+name = "wasmedge-ggml-set-input-twice"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+serde_json = "1.0"
+wasmedge-wasi-nn = "0.7.0"

wasmedge-ggml/test/set-input-twice/README.md

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+# `set-input-twice`
+
+Ensure that we get the same result from executing `set_input` twice.
+
+## Execute
+
+```console
+$ curl -LO https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf
+$ wasmedge --dir .:. \
+    --nn-preload default:GGML:AUTO:llama-2-7b-chat.Q5_K_M.gguf \
+    wasmedge-ggml-set-input-twice.wasm default '<PROMPT>'
+```
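
Note: the Execute step above assumes `wasmedge-ggml-set-input-twice.wasm` is already present next to the downloaded model. The CI step in `llama.yml` builds it with cargo first; a minimal local sketch following that workflow is shown below (the final copy step is an assumption for convenience, not part of this commit):

```console
$ cd wasmedge-ggml/test/set-input-twice
$ cargo build --target wasm32-wasi --release
# Assumed step: place the built module next to the model so the Execute command above finds it.
$ cp target/wasm32-wasi/release/wasmedge-ggml-set-input-twice.wasm .
```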

wasmedge-ggml/test/set-input-twice/src/main.rs

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+use serde_json::json;
+use serde_json::Value;
+use std::env;
+use wasmedge_wasi_nn::{
+    self, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext, TensorType,
+};
+
+fn get_options_from_env() -> Value {
+    let mut options = json!({});
+    if let Ok(val) = env::var("enable_log") {
+        options["enable-log"] = serde_json::from_str(val.as_str())
+            .expect("invalid value for enable-log option (true/false)")
+    } else {
+        options["enable-log"] = serde_json::from_str("false").unwrap()
+    }
+    if let Ok(val) = env::var("n_gpu_layers") {
+        options["n-gpu-layers"] =
+            serde_json::from_str(val.as_str()).expect("invalid ngl value (unsigned integer)")
+    } else {
+        options["n-gpu-layers"] = serde_json::from_str("0").unwrap()
+    }
+    options["ctx-size"] = serde_json::from_str("1024").unwrap();
+
+    options
+}
+
+fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
+    // Preserve for 4096 tokens with average token length 6
+    const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
+    let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
+    let mut output_size = context
+        .get_output(index, &mut output_buffer)
+        .expect("Failed to get output");
+    output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size);
+
+    return String::from_utf8_lossy(&output_buffer[..output_size]).to_string();
+}
+
+fn get_metadata_from_context(context: &GraphExecutionContext) -> Value {
+    serde_json::from_str(&get_data_from_context(context, 1)).expect("Failed to get metadata")
+}
+
+fn main() {
+    let args: Vec<String> = env::args().collect();
+    let model_name: &str = &args[1];
+
+    // Set options for the graph. Check our README for more details:
+    // https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters
+    let options = get_options_from_env();
+
+    // Create graph and initialize context.
+    let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
+        .config(serde_json::to_string(&options).expect("Failed to serialize options"))
+        .build_from_cache(model_name)
+        .expect("Failed to build graph");
+    let mut context = graph
+        .init_execution_context()
+        .expect("Failed to init context");
+
+    // If there is a third argument, use it as the prompt and enter non-interactive mode.
+    // This is mainly for the CI workflow.
+    if args.len() < 3 {
+        println!("Usage: {} <model_name> <prompt>", args[0]);
+    } else {
+        let prompt = &args[2];
+
+        // Set the prompt.
+        println!("Prompt:\n{}", prompt);
+        let tensor_data = prompt.as_bytes().to_vec();
+        context
+            .set_input(0, TensorType::U8, &[1], &tensor_data)
+            .expect("Failed to set input");
+        println!("Response:");
+
+        // Get the number of input tokens and llama.cpp versions.
+        let input_metadata = get_metadata_from_context(&context);
+        println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
+        println!(
+            "[INFO] llama_build_number: {}",
+            input_metadata["llama_build_number"]
+        );
+        println!(
+            "[INFO] Number of input tokens: {}",
+            input_metadata["input_tokens"]
+        );
+
+        // Set the prompt, twice.
+        context
+            .set_input(0, TensorType::U8, &[1], &tensor_data)
+            .expect("Failed to set input");
+
+        // Get the number of input tokens and llama.cpp versions.
+        let input_metadata_after = get_metadata_from_context(&context);
+        println!(
+            "[INFO] Number of input tokens: {}",
+            input_metadata_after["input_tokens"]
+        );
+
+        // Check if the numbers of input_tokens are the same.
+        if input_metadata["input_tokens"] != input_metadata_after["input_tokens"] {
+            panic!("The number of input tokens is different after setting the input twice.");
+        }
+    }
+}
Binary file not shown.
