From d7b7f4b734ef72c5bd0c648a1e3e709810fff715 Mon Sep 17 00:00:00 2001
From: sitloboi2012 <huyvo6812@gmail.com>
Date: Tue, 18 Feb 2025 08:27:15 +0700
Subject: [PATCH] modify to keep the original comment info

Signed-off-by: sitloboi2012 <huyvo6812@gmail.com>
---
 src/vllm_router/request_stats.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/vllm_router/request_stats.py b/src/vllm_router/request_stats.py
index 369d4447..1c724f69 100644
--- a/src/vllm_router/request_stats.py
+++ b/src/vllm_router/request_stats.py
@@ -87,16 +87,17 @@ def __init__(self, sliding_window_size: float):
         """
         self.sliding_window_size = sliding_window_size
 
-        # Monitors for calculating QPS and TTFT
+        # QPS and TTFT monitors over the finished requests of each serving engine
+        # The elements tracked by a monitor should be sorted by 'complete' time
         self.qps_monitors: Dict[str, MovingAverageMonitor] = {}
         self.ttft_monitors: Dict[str, MovingAverageMonitor] = {}
 
-        # Record initial request start time: (engine_url, request_id) -> timestamp
+        # The time when the request arrives: (engine_url, request_id) -> timestamp
         self.request_start_time: Dict[Tuple[str, str], float] = {}
         # Record time when first token is received: (engine_url, request_id) -> timestamp
         self.first_token_time: Dict[Tuple[str, str], float] = {}
 
-        # Counters for requests in different stages
+        # Number of requests in different stages (from the start of the router)
         self.in_prefill_requests: Dict[str, int] = {}
         self.in_decoding_requests: Dict[str, int] = {}
         self.finished_requests: Dict[str, int] = {}
@@ -195,13 +196,16 @@ def on_request_swapped(self, engine_url: str, request_id: str, timestamp: float)
 
     def get_request_stats(self, current_time: float) -> Dict[str, RequestStats]:
         """
-        Get the request statistics from the monitor.
+        Get the request statistics for each serving engine.
 
         Args:
-            current_time: The current timestamp
+            current_time: The current timestamp in seconds
 
         Returns:
-            A dictionary mapping engine URLs to RequestStats objects
+            A dictionary where the key is the serving engine URL and the value
+            is the request statistics for that engine.
+            The TTFT and inter-token latency will be -1 if no requests
+            finished within the sliding window.
         """
         ret = {}
         urls = set(self.in_prefill_requests.keys()).union(