Skip to content

Commit

Permalink
modify to keep the original comment info
Browse files Browse the repository at this point in the history
Signed-off-by: sitloboi2012 <[email protected]>
  • Loading branch information
sitloboi2012 committed Feb 18, 2025
1 parent feec809 commit 4f190d3
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions src/vllm_router/request_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,17 @@ def __init__(self, sliding_window_size: float):
"""
self.sliding_window_size = sliding_window_size

# Monitors for calculating QPS and TTFT
# Finished requests for each serving engine
# The elements in the deque should be sorted by 'complete' time
self.qps_monitors: Dict[str, MovingAverageMonitor] = {}
self.ttft_monitors: Dict[str, MovingAverageMonitor] = {}

# Record initial request start time: (engine_url, request_id) -> timestamp
# The time when the request arrives: (engine_url, request_id) -> timestamp
self.request_start_time: Dict[Tuple[str, str], float] = {}
# Record time when first token is received: (engine_url, request_id) -> timestamp
self.first_token_time: Dict[Tuple[str, str], float] = {}

# Counters for requests in different stages
# Number of requests in different stages (from the start of the router)
self.in_prefill_requests: Dict[str, int] = {}
self.in_decoding_requests: Dict[str, int] = {}
self.finished_requests: Dict[str, int] = {}
Expand Down Expand Up @@ -195,13 +196,16 @@ def on_request_swapped(self, engine_url: str, request_id: str, timestamp: float)

def get_request_stats(self, current_time: float) -> Dict[str, RequestStats]:
"""
Get the request statistics from the monitor.
Get the request statistics for each serving engine
Args:
current_time: The current timestamp
current_time: The current timestamp in seconds
Returns:
A dictionary mapping engine URLs to RequestStats objects
A dictionary where the key is the serving engine URL and the value
is the request statistics for that engine.
The TTFT and inter-token latency will be -1 if no requests
finished in the sliding window.
"""
ret = {}
urls = set(self.in_prefill_requests.keys()).union(
Expand Down

0 comments on commit 4f190d3

Please sign in to comment.