From d7b7f4b734ef72c5bd0c648a1e3e709810fff715 Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Tue, 18 Feb 2025 08:27:15 +0700 Subject: [PATCH] modify to keep the original comment info Signed-off-by: sitloboi2012 --- src/vllm_router/request_stats.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/vllm_router/request_stats.py b/src/vllm_router/request_stats.py index 369d4447..1c724f69 100644 --- a/src/vllm_router/request_stats.py +++ b/src/vllm_router/request_stats.py @@ -87,16 +87,17 @@ def __init__(self, sliding_window_size: float): """ self.sliding_window_size = sliding_window_size - # Monitors for calculating QPS and TTFT + # Finished requests for each serving engine + # The elements in the deque should be sorted by 'complete' time self.qps_monitors: Dict[str, MovingAverageMonitor] = {} self.ttft_monitors: Dict[str, MovingAverageMonitor] = {} - # Record initial request start time: (engine_url, request_id) -> timestamp + # The time when the request is coming (engine_url, request_id) -> timestamp self.request_start_time: Dict[Tuple[str, str], float] = {} # Record time when first token is received: (engine_url, request_id) -> timestamp self.first_token_time: Dict[Tuple[str, str], float] = {} - # Counters for requests in different stages + # Number of requests in different stages (from the start of the router) self.in_prefill_requests: Dict[str, int] = {} self.in_decoding_requests: Dict[str, int] = {} self.finished_requests: Dict[str, int] = {} @@ -195,13 +196,16 @@ def on_request_swapped(self, engine_url: str, request_id: str, timestamp: float) def get_request_stats(self, current_time: float) -> Dict[str, RequestStats]: """ - Get the request statistics from the monitor. 
+ Get the request statistics for each serving engine Args: - current_time: The current timestamp + current_time: The current timestamp in seconds Returns: - A dictionary mapping engine URLs to RequestStats objects + A dictionary where the key is the serving engine URL and the value + is the request statistics for that engine. + The TTFT and inter token latency will be -1 if there are no requests + finished in the sliding window. """ ret = {} urls = set(self.in_prefill_requests.keys()).union(