
Commit

update
goliaro committed Feb 24, 2025
1 parent 91bcf2d commit 20850ad
Showing 7 changed files with 53 additions and 11 deletions.
2 changes: 1 addition & 1 deletion inference/python/incr_decoding.py
@@ -101,7 +101,7 @@ def main():
     )
     llm.compile(
         generation_config,
-        max_requests_per_batch=1,
+        max_requests_per_batch=4,
         max_seq_length=256,
         max_tokens_per_batch=64,
     )
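
Raising max_requests_per_batch from 1 to 4 lets one compiled model serve several requests in the same decoding batch. A hedged sketch of the batched call path, with names mirroring this repo's Python examples (the exact generate signature and result fields are assumptions):

    # Illustrative only, not part of the commit: with max_requests_per_batch=4,
    # up to four of these prompts can share a single decoding batch.
    prompts = [
        "The largest ocean on Earth is",
        "The inventor of the telephone was",
        "The speed of light is",
        "The tallest mountain in the world is",
    ]
    results = llm.generate(prompts)   # assumed API, as in the repo's examples
    for result in results:
        print(result.output_text)     # field name assumed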
4 changes: 2 additions & 2 deletions inference/python/spec_infer.py
@@ -130,15 +130,15 @@ def main():
     for ssm in ssms:
         ssm.compile(
             generation_config,
-            max_requests_per_batch=1,
+            max_requests_per_batch=4,
             max_seq_length=256,
             max_tokens_per_batch=64,
         )
 
     # Compile the LLM for inference and load the weights into memory
     llm.compile(
         generation_config,
-        max_requests_per_batch=1,
+        max_requests_per_batch=4,
         max_seq_length=256,
         max_tokens_per_batch=64,
         ssms=ssms,
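
The same capacity bump is applied to both the draft models (SSMs) and the target LLM, so a speculative-decoding batch can carry up to four requests through both the draft and verification stages. A minimal hedged sketch (assuming, as the examples suggest, that generation is driven through the target LLM once it is compiled with ssms=ssms):

    # Illustrative only: the compiled SSMs act as draft models internally;
    # requests are still issued against the target LLM.
    results = llm.generate(prompts)   # up to 4 requests per speculative batch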
2 changes: 1 addition & 1 deletion python/flexflow/core/flexflow_cffi.py
@@ -4718,7 +4718,7 @@ def generate(self, requests_list: List[Request]):
         ] # entry will be None for finetuning requests
         c_output_texts = [
             (
-                ffi.new("char[]", max_sequence_length * 5)
+                ffi.new("char[]", max_sequence_length * 10)
                 if request.req_type == RequestType.REQ_INFERENCE
                 else ffi.NULL
             )
5 changes: 2 additions & 3 deletions src/c/flexflow_c.cc
@@ -1780,10 +1780,9 @@ void flexflow_model_generate(flexflow_model_t handle_,
     if (max_lengths[i] >= 0) {
       assert(total_tokens <= max_lengths[i] || num_output_tokens == 0);
     }
-    // assert(results[i].output_tokens.size() <= max_seq_lengths[i] ||
-    //        results[i].output_tokens.size() ==
-    //            results[i].input_tokens.size());
     output_length_and_tokens[i][0] = results[i].output_tokens.size();
+    assert(results[i].output_tokens.size() <= max_lengths[i]+100 && "Exceeding python buffer size for token ids");
+    assert(results[i].output_text.length() <= max_lengths[i]*10 && "Exceeding python buffer size for output text");
     std::copy(results[i].output_tokens.begin(),
               results[i].output_tokens.end(),
               output_length_and_tokens[i] + 1);
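
These two asserts pin down the contract with the Python-side buffers: the cffi layer above now allocates max_sequence_length * 10 bytes per output text (hence the change from * 5), and the token check implies the ID buffer holds max_length + 100 entries. A minimal sketch of the invariant, using a hypothetical helper name:

    # Hypothetical helper mirroring the two C-side asserts; the sizes come from
    # the cffi allocation above and the +100 slack implied by the token assert.
    def fits_python_buffers(max_length, num_output_tokens, output_text):
        return (num_output_tokens <= max_length + 100       # token-ID buffer entries
                and len(output_text) <= max_length * 10)    # text buffer bytes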
3 changes: 1 addition & 2 deletions tests/inference/generate_inf_test_configs.py
@@ -19,7 +19,6 @@
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
@@ -34,7 +33,7 @@
"full_precision": True,
"prompt": "",
"output_file": "",
"max_length": 128,
"max_length": 255,
}
ssm_configs = {
"ssms": [
37 changes: 36 additions & 1 deletion tests/inference/test_inference_output.py
@@ -43,7 +43,41 @@ def compare_single_line(file_a, file_b):
         raise AssertionError(
             f"File contents differ at position {i}:\n {file_a} -> {list_a[i]}\n {file_b} -> {list_b[i]}"
         )
+def compare_token_ids(file1_path, file2_path):
+    prefix = "token IDs: "
+
+    # Read lines from both files.
+    with open(file1_path, 'r') as f1, open(file2_path, 'r') as f2:
+        lines1 = f1.readlines()
+        lines2 = f2.readlines()
+
+    # Filter lines that start with the specified prefix.
+    token_lines1 = [line for line in lines1 if line.startswith(prefix)]
+    token_lines2 = [line for line in lines2 if line.startswith(prefix)]
+
+    # Check if both files have the same number of token lines.
+    if len(token_lines1) != len(token_lines2):
+        print(f"Error: Number of token ID lines differ: {len(token_lines1)} vs {len(token_lines2)}")
+        return False
+
+    # Compare corresponding token lines.
+    for i, (line1, line2) in enumerate(zip(token_lines1, token_lines2)):
+        try:
+            tokens1 = [int(tok.strip()) for tok in line1[len(prefix):].strip().split(",") if tok.strip()]
+        except ValueError as e:
+            print(f"Error parsing integers in file1, line {i}: {line1}\n{e}")
+            continue
+
+        try:
+            tokens2 = [int(tok.strip()) for tok in line2[len(prefix):].strip().split(",") if tok.strip()]
+        except ValueError as e:
+            print(f"Error parsing integers in file2, line {i}: {line2}\n{e}")
+            continue
+
+        # Determine the number of tokens to compare: the first 50, or fewer if a list is shorter.
+        n_to_compare = min(50, len(tokens1), len(tokens2))
+        if tokens1[:n_to_compare] != tokens2[:n_to_compare]:
+            raise AssertionError(f"Mismatch in line {i}:\nFile1 tokens (first {n_to_compare}): {tokens1[:n_to_compare]}\nFile2 tokens (first {n_to_compare}): {tokens2[:n_to_compare]}")
+
 def group_model_files(prefix):
     """
@@ -118,7 +152,8 @@ def test_output_alignment(file_a, file_b):
"""
Each file pair is tested and reported separately.
"""
compare_single_line(file_a, file_b)
# compare_single_line(file_a, file_b)
compare_token_ids(file_a, file_b)
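
For context: the new check reads output dumps whose lines look like "token IDs: 1, 15043, 29892, ..." and compares the first 50 IDs of each corresponding line. A hedged usage sketch (the file paths are hypothetical; real pairs come from group_model_files via the test parametrization):

    # Hypothetical invocation; in the test suite, pytest supplies the file pairs.
    compare_token_ids(
        "inference/output/model_half_precision.txt",   # hypothetical paths
        "inference/output/model_full_precision.txt",
    )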



11 changes: 10 additions & 1 deletion tests/inference_tests.sh
@@ -18,7 +18,16 @@ CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-OFF}
 rm -rf inference/prompt inference/output inference/inf_test_configs || true
 # Create test prompt file
 mkdir -p ./inference/prompt
-echo '["Three tips for staying healthy are: "]' > ./inference/prompt/test.json
+# echo '["Three tips for staying healthy are: "]' > ./inference/prompt/test.json
+sample_prompts='[
+    "The largest ocean on Earth is",
+    "The inventor of the telephone was",
+    "The speed of light is",
+    "The tallest mountain in the world is",
+    "The first man on the moon was"
+]'
+echo "$sample_prompts" > ./inference/prompt/test.json
+
 # Create output folder
 mkdir -p ./inference/output
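
The prompt file now carries five prompts instead of one, which exercises the max_requests_per_batch=4 bumps above with more requests than fit in a single batch. A short hedged check of the file the script writes (illustrative, not part of the script):

    # Sketch: the JSON written above should parse to five prompt strings.
    import json
    with open("./inference/prompt/test.json") as f:
        prompts = json.load(f)
    assert len(prompts) == 5   # five prompts now, vs. one before this commit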

