From af1ef2c52901816d37e24319f5a505ea841c8e11 Mon Sep 17 00:00:00 2001
From: sitloboi2012
Date: Wed, 12 Feb 2025 23:21:50 +0700
Subject: [PATCH 01/15] update vllm-dashboard and router to add core vLLM
 metrics, operational metrics, and router observability metrics; update
 requirements.txt for the router and tests; add more logging to
 install-minikube-cluster.sh; restart the Docker service and refresh the
 minikube context after the run

Signed-off-by: sitloboi2012

---
 observability/README.md           |   7 +
 observability/vllm-dashboard.json | 789 +++++++++++++-----------------
 src/tests/requirements.txt        |   1 +
 src/vllm_router/engine_stats.py   |   2 +-
 src/vllm_router/requirements.txt  |   2 +
 src/vllm_router/router.py         |  88 ++--
 utils/install-minikube-cluster.sh |  80 ++-
 7 files changed, 467 insertions(+), 502 deletions(-)

diff --git a/observability/README.md b/observability/README.md
index 086d2ebb..4bf83de5 100644
--- a/observability/README.md
+++ b/observability/README.md
@@ -10,6 +10,13 @@ The observability stack is based on [kube-prom-stack](https://github.com/prometh
 
 To launch the observability stack:
 
+Make sure to have:
+
+- A running Kubernetes (K8s) environment with GPUs
+  - Run `cd utils && bash install-minikube-cluster.sh`
+  - Or follow our [tutorial](tutorials/00-install-kubernetes-env.md)
+
+After that you can run:
 ```bash
 sudo bash install.sh
 ```
diff --git a/observability/vllm-dashboard.json b/observability/vllm-dashboard.json
index 73b9ca11..141afb8a 100644
--- a/observability/vllm-dashboard.json
+++ b/observability/vllm-dashboard.json
@@ -22,53 +22,40 @@
   "links": [],
   "panels": [
     {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
+      "id": 100,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+      "collapsed": false,
+      "title": "Core vLLM Metrics",
+      "type": "row"
+    },
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "Available vLLM instances",
       "description": "Number of healthy vLLM instances",
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "gridPos": { "h": 7, "w": 6, "x": 0, "y": 1 },
       "fieldConfig": {
         "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
+          "color": { "mode": "thresholds" },
           "mappings": [],
           "thresholds": {
             "mode": "absolute",
             "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
+              { "color": "green", "value": null },
+              { "color": "red", "value": 80 }
             ]
           }
         },
         "overrides": []
       },
-      "gridPos": {
-        "h": 7,
-        "w": 5,
-        "x": 0,
-        "y": 0
-      },
-      "id": 1,
       "options": {
         "colorMode": "value",
         "graphMode": "area",
         "justifyMode": "auto",
         "orientation": "auto",
         "percentChangeColorMode": "standard",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
         "showPercentChange": false,
         "textMode": "auto",
         "wideLayout": true
@@ -76,578 +63,492 @@
       "pluginVersion": "11.4.0",
       "targets": [
         {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "disableTextWrap": false,
-          "editorMode": "builder",
           "expr": "count by(endpoint) (vllm:cpu_cache_usage_perc)",
-          "fullMetaSearch": false,
-          "includeNullMetadata": true,
+          "editorMode": "builder",
+          "format": "time_series",
           "legendFormat": "vLLM instances",
-          "range": true,
           "refId": "A",
-          "useBackend": false
+          "range": true
         }
-      ],
-      "title": "Available vLLM instances",
-      "type": "stat"
+      ]
     },
     {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
+      "id": 2,
+      "type": "bargauge",
+      "title": "Request latency distribution",
+      "datasource": 
{ "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 9, "x": 6, "y": 1 }, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, + "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } + { "color": "green", "value": null }, + { "color": "red", "value": 80 } ] } }, "overrides": [] }, - "gridPos": { - "h": 7, - "w": 9, - "x": 5, - "y": 0 - }, - "id": 6, "options": { "displayMode": "gradient", - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": false - }, - "maxVizHeight": 300, - "minVizHeight": 16, - "minVizWidth": 8, - "namePlacement": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "showUnfilled": true, "sizing": "auto", - "valueMode": "color" + "valueMode": "color", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } }, "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, "expr": "sum by(le) (vllm:e2e_request_latency_seconds_bucket)", + "editorMode": "builder", "format": "heatmap", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, "legendFormat": "{{le}}", - "range": true, "refId": "A", - "useBackend": false + "range": true } - ], - "title": "Request latency distribution", - "type": "bargauge" + ] }, { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, + "id": 3, + "type": "bargauge", + "title": "Request TTFT distribution", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 9, "x": 15, "y": 1 }, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, + "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } + { "color": "green", "value": null }, + { "color": "red", "value": 80 } ] } }, "overrides": [] }, - "gridPos": { - "h": 7, - "w": 10, - "x": 14, - "y": 0 - }, - "id": 7, "options": { "displayMode": "gradient", - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": false - }, - "maxVizHeight": 300, - "minVizHeight": 16, - "minVizWidth": 8, - "namePlacement": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "showUnfilled": true, "sizing": "auto", - "valueMode": "color" + "valueMode": "color", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } }, "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", + "editorMode": "builder", "format": "heatmap", - "fullMetaSearch": false, - "includeNullMetadata": true, "legendFormat": "__auto", - "range": true, "refId": "A", - "useBackend": false + "range": true } - ], - "title": "Request TTFT distribution", - "type": "bargauge" + ] }, { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, + "id": 101, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, + "collapsed": false, + "title": "Operational 
Metrics", + "type": "row", + "note": "Metrics related to the operational state of the vLLM instances." + }, + { + "id": 4, + "type": "timeseries", + "title": "Number of running requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, + "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } + { "color": "green", "value": null }, + { "color": "red", "value": 80 } ] } }, "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 7 - }, - "id": 2, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", "expr": "vllm:num_requests_running", - "fullMetaSearch": false, - "includeNullMetadata": true, + "editorMode": "builder", "legendFormat": "{{instance}}", - "range": true, "refId": "A", - "useBackend": false + "range": true } - ], - "title": "Number of running requests", - "type": "timeseries" + ] }, { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, + "id": 5, + "type": "timeseries", + "title": "GPU KV Usage percent", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, + "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } + { "color": "green", "value": null }, + { "color": "red", "value": 80 } ] } }, "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 7 - }, - "id": 4, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "single", - 
"sort": "none" - } + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", "expr": "vllm:gpu_cache_usage_perc", - "fullMetaSearch": false, - "includeNullMetadata": true, + "editorMode": "builder", "legendFormat": "{{instance}}", - "range": true, "refId": "A", - "useBackend": false + "range": true } - ], - "title": "GPU KV Usage percent", - "type": "timeseries" + ] }, { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, + "id": 6, + "type": "timeseries", + "title": "Number of pending requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, + "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } + { "color": "green", "value": null }, + { "color": "red", "value": 80 } ] } }, "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 15 - }, - "id": 3, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:num_requests_waiting", + "editorMode": "builder", + "legendFormat": "{{instance}}", + "refId": "A", + "range": true } + ] + }, + { + "id": 7, + "type": "timeseries", + "title": "GPU KV cache hit rate", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, + "expr": "vllm:gpu_prefix_cache_hit_rate", "editorMode": "builder", - "expr": "vllm:num_requests_waiting", - "fullMetaSearch": false, - "includeNullMetadata": true, "legendFormat": "{{instance}}", - "range": true, "refId": "A", - "useBackend": false + "range": true } - ], - "title": "Number of pending requests", - "type": "timeseries" + ] }, { - "datasource": { - "type": "prometheus", - "uid": "prometheus" + "id": 102, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }, + 
"collapsed": false, + "title": "Router Observability Metrics", + "type": "row", + "note": "Metrics related to the router-side queueing delay and current QPS." + }, + { + "id": 8, + "type": "stat", + "title": "Current QPS", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 0, "y": 26 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:current_qps", + "editorMode": "builder", + "legendFormat": "Current QPS", + "refId": "A", + "range": true + } + ] + }, + { + "id": 9, + "type": "stat", + "title": "Router-side Queueing Delay", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 6, "y": 26 }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, + "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } + { "color": "green", "value": null }, + { "color": "red", "value": 80 } ] } }, "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 15 + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:router_queueing_delay_seconds", + "editorMode": "builder", + "legendFormat": "Queueing Delay", + "refId": "A", + "range": true + } + ] + }, + { + "id": 12, + "type": "stat", + "title": "Average Prefill Length", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 12, "y": 26 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:avg_prefill_length", + "editorMode": "builder", + "legendFormat": "Avg. 
Prefill Length", + "refId": "A", + "range": true + } + ] + }, + { + "id": 10, + "type": "timeseries", + "title": "Number of Prefilling Requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 18, "y": 26 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] }, - "id": 5, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:num_prefill_requests", + "editorMode": "builder", + "legendFormat": "Prefilling Requests", + "refId": "A", + "range": true + } + ] + }, + { + "id": 11, + "type": "timeseries", + "title": "Number of Decoding Requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 16, "x": 0, "y": 33 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } }, - "tooltip": { - "mode": "single", - "sort": "none" + "overrides": [] + }, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:num_decoding_requests", + "editorMode": "builder", + "legendFormat": "Decoding Requests", + "refId": "A", + "range": true } + ] + }, + { + "id": 13, + "type": "stat", + "title": "Average Decoding Length", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 33 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, + "expr": "vllm:avg_decoding_length", "editorMode": "builder", - "expr": "vllm:gpu_prefix_cache_hit_rate", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "{{instance}}", - "range": true, + "legendFormat": "Avg. 
Decoding Length", "refId": "A", - "useBackend": false + "range": true } - ], - "title": "GPU KV cache hit rate", - "type": "timeseries" + ] } ], "preload": false, "refresh": "auto", "schemaVersion": 40, "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-15m", - "to": "now" - }, + "templating": { "list": [] }, + "time": { "from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "vllm dashboard", "uid": "ee9i0i4y606psc", - "version": 17, + "version": 18, "weekStart": "" } diff --git a/src/tests/requirements.txt b/src/tests/requirements.txt index 6e6cb41b..07fcefca 100644 --- a/src/tests/requirements.txt +++ b/src/tests/requirements.txt @@ -1,4 +1,5 @@ fastapi httpx uvicorn +openai vllm diff --git a/src/vllm_router/engine_stats.py b/src/vllm_router/engine_stats.py index 342b36f5..b92a9c1d 100644 --- a/src/vllm_router/engine_stats.py +++ b/src/vllm_router/engine_stats.py @@ -1,7 +1,7 @@ import threading import time from dataclasses import dataclass -from typing import Dict +from typing import Dict, Optional import requests from prometheus_client.parser import text_string_to_metric_families diff --git a/src/vllm_router/requirements.txt b/src/vllm_router/requirements.txt index 301f2877..7151639f 100644 --- a/src/vllm_router/requirements.txt +++ b/src/vllm_router/requirements.txt @@ -1,7 +1,9 @@ +aiofiles==24.1.0 fastapi==0.115.8 httpx==0.28.1 kubernetes==32.0.0 numpy==1.26.4 prometheus_client==0.21.1 +python-multipart==0.0.20 uhashring==2.3 uvicorn==0.34.0 diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py index bfc9a3ea..e7c7c562 100644 --- a/src/vllm_router/router.py +++ b/src/vllm_router/router.py @@ -9,6 +9,7 @@ import uvicorn from fastapi import FastAPI, Request, UploadFile from fastapi.responses import JSONResponse, Response, StreamingResponse +from prometheus_client import Gauge, generate_latest from vllm_router.batch import BatchProcessor, initialize_batch_processor from vllm_router.engine_stats import GetEngineStatsScraper, InitializeEngineStatsScraper @@ -44,18 +45,21 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) -# TODO: better request id system - +# --- Observation & Tracking (from v2) --- +# Define a Prometheus gauge for tracking the number of running requests per server. +vnum_requests_running = Gauge( + "vllm:num_requests_running", "Number of running requests", ["server"] +) -async def process_request( - method, header, body, backend_url, request_id, endpoint, debug_request=None -): +# --- Request Processing & Routing --- +# TODO: better request id system +async def process_request(method, header, body, backend_url, request_id, endpoint, debug_request=None): """ Async generator to stream data from the backend server to the client. """ first_token = False total_len = 0 - # Pass response headers to the client + # Record the request start time and notify the request stats monitor. start_time = time.time() GetRequestStatsMonitor().on_new_request(backend_url, request_id, start_time) @@ -67,33 +71,29 @@ async def process_request( content=body, timeout=None, ) as backend_response: + # Yield headers and status code first. yield backend_response.headers, backend_response.status_code - # Stream response content + # Then stream the response content in chunks. 
async for chunk in backend_response.aiter_bytes(): total_len += len(chunk) if not first_token: first_token = True - GetRequestStatsMonitor().on_request_response( - backend_url, request_id, time.time() - ) + GetRequestStatsMonitor().on_request_response(backend_url, request_id, time.time()) yield chunk GetRequestStatsMonitor().on_request_complete(backend_url, request_id, time.time()) - - # if debug_request: - # logger.debug(f"Finished the request with request id: {debug_request.headers.get('x-request-id', None)} at {time.time()}") - + # Optional debug logging can be enabled here. + # logger.debug(f"Finished the request with id: {debug_request.headers.get('x-request-id', None)} at {time.time()}") async def route_general_request(request: Request, endpoint: str): """ - Route the incoming request to the backend server and stream the response - back to the client. + Route the incoming request to the backend server and stream the response back to the client. """ in_router_time = time.time() request_id = str(uuid.uuid4()) - # TODO (ApostaC): merge two awaits into one + # Read the full request body and JSON payload. request_body = await request.body() request_json = await request.json() requested_model = request_json.get("model", None) @@ -107,6 +107,7 @@ async def route_general_request(request: Request, endpoint: str): engine_stats = GetEngineStatsScraper().get_engine_stats() request_stats = GetRequestStatsMonitor().get_request_stats(time.time()) + # Filter endpoints by the requested model. endpoints = list(filter(lambda x: x.model_name == requested_model, endpoints)) if len(endpoints) == 0: return JSONResponse( @@ -146,13 +147,8 @@ async def route_files(request: Request): """Handle file upload requests that include a purpose and file data.""" form = await request.form() - # Validate required fields - if "purpose" not in form: - # Unlike openai, we do not support fine-tuning, so we do not need to - # check for 'purpose`.` - purpose = "unknown" - else: - purpose = form["purpose"] + # Validate required fields. + purpose = form.get("purpose", "unknown") if "file" not in form: return JSONResponse( status_code=400, content={"error": "Missing required parameter 'file'"} @@ -301,7 +297,6 @@ async def route_chat_completition(request: Request): async def route_completition(request: Request): return await route_general_request(request, "/v1/completions") - @app.get("/version") async def show_version(): ver = {"version": STACK_VERSION} @@ -328,10 +323,9 @@ async def show_models(): model_list = ModelList(data=model_cards) return JSONResponse(content=model_list.model_dump()) - @app.get("/health") async def health() -> Response: - """Health check. check the health of the threads""" + """Health check: verifies that service discovery and engine stats scraping are operational.""" if not GetServiceDiscovery().get_health(): return JSONResponse( content={"status": "Service discovery module is down."}, status_code=503 @@ -342,7 +336,12 @@ async def health() -> Response: ) return Response(status_code=200) +# --- Prometheus Metrics Endpoint (v2 observation/tracking) --- +@app.get("/metrics") +async def metrics(): + return Response(generate_latest(), media_type="text/plain") +# --- Argument Parsing and Initialization --- def validate_args(args): if args.service_discovery == "static": if args.static_backends is None: @@ -397,14 +396,13 @@ def parse_args(): "--static-backends", type=str, default=None, - help="The urls of static backends, separated by comma." 
- "E.g., http://localhost:8000,http://localhost:8001", + help="The URLs of static backends, separated by commas. E.g., http://localhost:8000,http://localhost:8001", ) parser.add_argument( "--static-models", type=str, default=None, - help="The models of static backends, separated by comma. E.g., model1,model2", + help="The models of static backends, separated by commas. E.g., model1,model2", ) parser.add_argument( "--k8s-port", @@ -479,14 +477,13 @@ def parse_args(): "--request-stats-window", type=int, default=60, - help="The sliding window seconds to compute request statistics.", + help="The sliding window in seconds to compute request statistics.", ) # Logging parser.add_argument( "--log-stats", action="store_true", help="Log statistics periodically." ) - parser.add_argument( "--log-stats-interval", type=int, @@ -505,7 +502,7 @@ def parse_static_urls(args): if validate_url(url): backend_urls.append(url) else: - logger.warning(f"Skipping invalid url: {url}") + logger.warning(f"Skipping invalid URL: {url}") return backend_urls @@ -555,14 +552,33 @@ def log_stats(interval: int = 10): request_stats = GetRequestStatsMonitor().get_request_stats(time.time()) for endpoint in endpoints: url = endpoint.url + + logstr += f"Model: {endpoint.model_name}\n" logstr += f"Server: {url}\n" if url in engine_stats: - logstr += f" Engine stats: {engine_stats[url]}\n" + num_running_requests = engine_stats[url].num_running_requests + num_queing_requests = engine_stats[url].num_queuing_requests + gpu_cache_hit_rate = engine_stats[url].gpu_cache_hit_rate + logstr += ( + f" Engine stats: {num_running_requests} running requests, " + f"{num_queing_requests} queuing requests, {gpu_cache_hit_rate:.2f} GPU cache hit rate\n" + ) else: logstr += " Engine stats: No stats available\n" if url in request_stats: - logstr += f" Request Stats: {request_stats[url]}\n" + qps = request_stats[url].qps + num_requests = request_stats[url].ttft + in_prefill_requests = request_stats[url].in_prefill_requests + in_decoding_requets = request_stats[url].in_decoding_requests + finished_requests = request_stats[url].finished_requests + uptime = request_stats[url].uptime + logstr += ( + f" Request Stats: {qps:.2f} QPS, {num_requests} TTFT, " + f"{in_prefill_requests} in prefill, {in_decoding_requets} in decoding, " + f"{finished_requests} finished, uptime {uptime:.2f} seconds\n" + ) + vnum_requests_running.labels(server=url).set(qps) else: logstr += " Request Stats: No stats available\n" @@ -573,9 +589,7 @@ def log_stats(interval: int = 10): def main(): args = parse_args() - InitializeAll(args) - if args.log_stats: threading.Thread( target=log_stats, args=(args.log_stats_interval,), daemon=True diff --git a/utils/install-minikube-cluster.sh b/utils/install-minikube-cluster.sh index 5918961c..6017c28d 100755 --- a/utils/install-minikube-cluster.sh +++ b/utils/install-minikube-cluster.sh @@ -1,51 +1,91 @@ #!/bin/bash set -e +# Allow users to override the paths for the NVIDIA tools. +: "${NVIDIA_SMI_PATH:=nvidia-smi}" +: "${NVIDIA_CTK_PATH:=nvidia-ctk}" + +# --- Debug and Environment Setup --- +echo "Current PATH: $PATH" +echo "Operating System: $(uname -a)" + +# --- Helper Functions --- +# Check if minikube is installed. minikube_exists() { command -v minikube >/dev/null 2>&1 } -# Get script directory for relative paths +# Get the script directory to reference local scripts reliably. SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# Install kubectl and helm +# --- Install Prerequisites --- +echo "Installing kubectl and helm..." 
bash "$SCRIPT_DIR/install-kubectl.sh" bash "$SCRIPT_DIR/install-helm.sh" -# Install minikube +# Install minikube if it isn’t already installed. if minikube_exists; then - echo "Minikube already installed" + echo "Minikube already installed." else + echo "Minikube not found. Installing minikube..." curl -LO https://github.com/kubernetes/minikube/releases/latest/download/minikube-linux-amd64 sudo install minikube-linux-amd64 /usr/local/bin/minikube && rm minikube-linux-amd64 fi -# Configure BPF if available +# --- Configure BPF (if available) --- if [ -f /proc/sys/net/core/bpf_jit_harden ]; then + echo "Configuring BPF: Setting net.core.bpf_jit_harden=0" echo "net.core.bpf_jit_harden=0" | sudo tee -a /etc/sysctl.conf sudo sysctl -p else echo "BPF JIT hardening configuration not available, skipping..." fi -# Check if NVIDIA GPU is available -if command -v nvidia-smi &> /dev/null; then - # Install nvidia-container-toolkit - sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker +# --- NVIDIA GPU Setup --- +GPU_AVAILABLE=false +if command -v "$NVIDIA_SMI_PATH" >/dev/null 2>&1; then + echo "NVIDIA GPU detected via nvidia-smi at: $(command -v "$NVIDIA_SMI_PATH")" + if command -v "$NVIDIA_CTK_PATH" >/dev/null 2>&1; then + echo "nvidia-ctk found at: $(command -v "$NVIDIA_CTK_PATH")" + GPU_AVAILABLE=true + else + echo "nvidia-ctk not found. Please install the NVIDIA Container Toolkit to enable GPU support." + fi +else + echo "No NVIDIA GPU detected. Will start minikube without GPU support." +fi - # Start cluster with GPU support - minikube start --driver docker --container-runtime docker --gpus all --force --addons=nvidia-device-plugin +if [ "$GPU_AVAILABLE" = true ]; then + # Configure Docker for GPU support. + echo "Configuring Docker runtime for GPU support..." + if sudo "$NVIDIA_CTK_PATH" runtime configure --runtime=docker; then + echo "Restarting Docker to apply changes..." + sudo systemctl restart docker + echo "Docker runtime configured successfully." + else + echo "Error: Failed to configure Docker runtime using the NVIDIA Container Toolkit." + exit 1 + fi - # Install gpu-operator + # Start minikube with GPU support. + echo "Starting minikube with GPU support..." + sudo minikube start --driver=docker --container-runtime=docker --gpus=all --force --addons=nvidia-device-plugin + + # Update kubeconfig context. + echo "Updating kubeconfig context..." + sudo minikube update-context + + # Install the GPU Operator via Helm. + echo "Adding NVIDIA helm repo and updating..." sudo helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && sudo helm repo update - sudo helm install --wait --generate-name \ - -n gpu-operator --create-namespace \ - nvidia/gpu-operator \ - --version=v24.9.1 + echo "Installing GPU Operator..." + sudo helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator --version=v24.9.1 else - echo "No NVIDIA GPU detected, starting minikube without GPU support..." - # Fix permission issues + # No GPU: Start minikube without GPU support. + echo "Starting minikube without GPU support..." + # Fix potential permission issues. sudo sysctl fs.protected_regular=0 - # Start cluster without GPU - minikube start --driver docker --force + minikube start --driver=docker --force fi + +echo "Minikube cluster installation complete." 
From ebdb6b21d65f3ab923192766c820f2547c888b05 Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Wed, 12 Feb 2025 23:54:14 +0700 Subject: [PATCH 02/15] update the router and vllm-dashboard to align with the reference from @YuhanLiu11 Signed-off-by: sitloboi2012 --- observability/vllm-dashboard.json | 390 +++++++++++++++++------------- src/vllm_router/router.py | 54 +++-- src/vllm_router/run-router.sh | 2 +- 3 files changed, 256 insertions(+), 190 deletions(-) diff --git a/observability/vllm-dashboard.json b/observability/vllm-dashboard.json index 141afb8a..b95e79c5 100644 --- a/observability/vllm-dashboard.json +++ b/observability/vllm-dashboard.json @@ -9,7 +9,7 @@ }, "enable": true, "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", + "iconColor": "rgba(0,211,255,1)", "name": "Annotations & Alerts", "type": "dashboard" } @@ -25,7 +25,7 @@ "id": 100, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "collapsed": false, - "title": "Core vLLM Metrics", + "title": "Overview of the system", "type": "row" }, { @@ -55,7 +55,7 @@ "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "textMode": "auto", "wideLayout": true @@ -98,7 +98,7 @@ "showUnfilled": true, "sizing": "auto", "valueMode": "color", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } }, "pluginVersion": "11.4.0", "targets": [ @@ -112,12 +112,53 @@ } ] }, + { + "id": 101, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, + "collapsed": false, + "title": "QoS Information", + "type": "row" + }, { "id": 3, + "type": "stat", + "title": "Current QPS", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 0, "y": 9 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "vllm:current_qps", "editorMode": "builder", "legendFormat": "Current QPS", "refId": "A", "range": true } + ] + }, + { + "id": 4, "type": "bargauge", "title": "Request TTFT distribution", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 9, "x": 15, "y": 1 }, + "gridPos": { "h": 7, "w": 6, "x": 6, "y": 9 }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -138,34 +179,87 @@ "showUnfilled": true, "sizing": "auto", "valueMode": "color", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", - "editorMode": "builder", - "format": "heatmap", - "legendFormat": "__auto", - "refId": "A", - "range": true - } + { "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", "editorMode": "builder", "legendFormat": "__auto", "refId": "A", "range": true } 
] }, { - "id": 101, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, - "collapsed": false, - "title": "Operational Metrics", - "type": "row", - "note": "Metrics related to the operational state of the vLLM instances." + "id": 5, + "type": "stat", + "title": "Router-side Queueing Delay", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 12, "y": 9 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "vllm:router_queueing_delay_seconds", "editorMode": "builder", "legendFormat": "Queueing Delay", "refId": "A", "range": true } + ] }, { - "id": 4, + "id": 6, + "type": "stat", + "title": "Average Prefill Length", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 18, "y": 9 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "vllm:avg_prefill_length", "editorMode": "builder", "legendFormat": "Avg. 
Prefill Length", "refId": "A", "range": true } + ] + }, + { + "id": 7, "type": "timeseries", - "title": "Number of running requests", + "title": "Number of Prefilling Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 16 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -186,21 +280,15 @@ }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:num_requests_running", - "editorMode": "builder", - "legendFormat": "{{instance}}", - "refId": "A", - "range": true - } + { "expr": "vllm:num_prefill_requests", "editorMode": "builder", "legendFormat": "Prefilling Requests", "refId": "A", "range": true } ] }, { - "id": 5, + "id": 8, "type": "timeseries", - "title": "GPU KV Usage percent", + "title": "Number of Decoding Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 16 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -221,21 +309,57 @@ }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:gpu_cache_usage_perc", - "editorMode": "builder", - "legendFormat": "{{instance}}", - "refId": "A", - "range": true - } + { "expr": "vllm:num_decoding_requests", "editorMode": "builder", "legendFormat": "Decoding Requests", "refId": "A", "range": true } ] }, { - "id": 6, + "id": 9, + "type": "stat", + "title": "Average Decoding Length", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 16 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "vllm:avg_decoding_length", "editorMode": "builder", "legendFormat": "Avg. Decoding Length", "refId": "A", "range": true } + ] + }, + { + "id": 102, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "collapsed": false, + "title": "Serving Engine Load", + "type": "row", + "note": "Metrics indicating the load on the serving engine." 
+ }, + { + "id": 10, "type": "timeseries", - "title": "Number of pending requests", + "title": "Number of Running Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -256,21 +380,15 @@ }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:num_requests_waiting", - "editorMode": "builder", - "legendFormat": "{{instance}}", - "refId": "A", - "range": true - } + { "expr": "vllm:num_requests_running", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } ] }, { - "id": 7, + "id": 11, "type": "timeseries", - "title": "GPU KV cache hit rate", + "title": "Number of Pending Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -291,32 +409,18 @@ }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:gpu_prefix_cache_hit_rate", - "editorMode": "builder", - "legendFormat": "{{instance}}", - "refId": "A", - "range": true - } + { "expr": "vllm:num_requests_waiting", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } ] }, { - "id": 102, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }, - "collapsed": false, - "title": "Router Observability Metrics", - "type": "row", - "note": "Metrics related to the router-side queueing delay and current QPS." - }, - { - "id": 8, - "type": "stat", - "title": "Current QPS", + "id": 12, + "type": "timeseries", + "title": "GPU KV Usage Percentage", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 0, "y": 26 }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", @@ -329,34 +433,23 @@ "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto", - "wideLayout": true + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:current_qps", - "editorMode": "builder", - "legendFormat": "Current QPS", - "refId": "A", - "range": true - } + { "expr": "vllm:gpu_cache_usage_perc", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } ] }, { - "id": 9, - "type": "stat", - "title": "Router-side Queueing Delay", + "id": 13, + "type": "timeseries", + "title": "GPU KV Cache Hit Rate", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 6, "y": 26 }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", @@ -369,34 +462,31 @@ "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto", - "wideLayout": 
true + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:router_queueing_delay_seconds", - "editorMode": "builder", - "legendFormat": "Queueing Delay", - "refId": "A", - "range": true - } + { "expr": "vllm:gpu_prefix_cache_hit_rate", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } ] }, { - "id": 12, - "type": "stat", - "title": "Average Prefill Length", + "id": 103, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 }, + "collapsed": false, + "title": "Current Resource Usage", + "type": "row", + "note": "Metrics for GPU, CPU, Memory and Disk usage." + }, + { + "id": 14, + "type": "timeseries", + "title": "GPU Usage", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 12, "y": 26 }, + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 42 }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", @@ -409,31 +499,20 @@ "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto", - "wideLayout": true + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:avg_prefill_length", - "editorMode": "builder", - "legendFormat": "Avg. Prefill Length", - "refId": "A", - "range": true - } + { "expr": "node_gpu_usage_query", "editorMode": "builder", "legendFormat": "GPU Usage", "refId": "A", "range": true } ] }, { - "id": 10, + "id": 15, "type": "timeseries", - "title": "Number of Prefilling Requests", + "title": "CPU Usage", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 18, "y": 26 }, + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 42 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -454,21 +533,15 @@ }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:num_prefill_requests", - "editorMode": "builder", - "legendFormat": "Prefilling Requests", - "refId": "A", - "range": true - } + { "expr": "node_cpu_usage_query", "editorMode": "builder", "legendFormat": "CPU Usage", "refId": "A", "range": true } ] }, { - "id": 11, + "id": 16, "type": "timeseries", - "title": "Number of Decoding Requests", + "title": "Memory Usage", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 16, "x": 0, "y": 33 }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 42 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -489,24 +562,18 @@ }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:num_decoding_requests", - "editorMode": "builder", - "legendFormat": "Decoding Requests", - "refId": "A", - "range": true - } + { "expr": "node_memory_usage_query", "editorMode": "builder", "legendFormat": "Memory Usage", "refId": "A", "range": true } ] }, { - "id": 13, - "type": "stat", - "title": "Average Decoding Length", + "id": 17, + "type": "timeseries", + "title": "Disk Usage", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 8, "x": 16, "y": 33 }, + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 42 }, 
"fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", @@ -519,23 +586,12 @@ "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto", - "wideLayout": true + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:avg_decoding_length", - "editorMode": "builder", - "legendFormat": "Avg. Decoding Length", - "refId": "A", - "range": true - } + { "expr": "node_disk_usage_query", "editorMode": "builder", "legendFormat": "Disk Usage", "refId": "A", "range": true } ] } ], @@ -547,8 +603,8 @@ "time": { "from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "browser", - "title": "vllm dashboard", - "uid": "ee9i0i4y606psc", - "version": 18, + "title": "vLLM Dashboard", + "uid": "750918234", + "version": 20, "weekStart": "" } diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py index e7c7c562..0908d791 100644 --- a/src/vllm_router/router.py +++ b/src/vllm_router/router.py @@ -9,7 +9,7 @@ import uvicorn from fastapi import FastAPI, Request, UploadFile from fastapi.responses import JSONResponse, Response, StreamingResponse -from prometheus_client import Gauge, generate_latest +from prometheus_client import Gauge, generate_latest, CONTENT_TYPE_LATEST from vllm_router.batch import BatchProcessor, initialize_batch_processor from vllm_router.engine_stats import GetEngineStatsScraper, InitializeEngineStatsScraper @@ -50,6 +50,10 @@ async def lifespan(app: FastAPI): vnum_requests_running = Gauge( "vllm:num_requests_running", "Number of running requests", ["server"] ) +current_qps = Gauge("vllm:current_qps", "Current Queries Per Second", ["server"]) +avg_decoding_length = Gauge("vllm:avg_decoding_length", "Average Decoding Length", ["server"]) +num_prefill_requests = Gauge("vllm:num_prefill_requests", "Number of Prefill Requests", ["server"]) +num_decoding_requests = Gauge("vllm:num_decoding_requests", "Number of Decoding Requests", ["server"]) # --- Request Processing & Routing --- # TODO: better request id system @@ -62,6 +66,8 @@ async def process_request(method, header, body, backend_url, request_id, endpoin # Record the request start time and notify the request stats monitor. start_time = time.time() GetRequestStatsMonitor().on_new_request(backend_url, request_id, start_time) + # Log the start of request processing + logger.info(f"Started request {request_id} for backend {backend_url}") client = httpx_client_wrapper() async with client.stream( @@ -83,6 +89,7 @@ async def process_request(method, header, body, backend_url, request_id, endpoin yield chunk GetRequestStatsMonitor().on_request_complete(backend_url, request_id, time.time()) + logger.info(f"Completed request {request_id} for backend {backend_url}") # Optional debug logging can be enabled here. 
# logger.debug(f"Finished the request with id: {debug_request.headers.get('x-request-id', None)} at {time.time()}") @@ -114,9 +121,11 @@ async def route_general_request(request: Request, endpoint: str): status_code=400, content={"error": f"Model {requested_model} not found."} ) + logger.debug(f"Routing request {request_id} for model: {requested_model}") server_url = GetRoutingLogic().route_request( endpoints, engine_stats, request_stats, request ) + logger.info(f"Request {request_id} routed to {server_url}") curr_time = time.time() logger.info( @@ -339,7 +348,15 @@ async def health() -> Response: # --- Prometheus Metrics Endpoint (v2 observation/tracking) --- @app.get("/metrics") async def metrics(): - return Response(generate_latest(), media_type="text/plain") + # Update gauges with stats from the request monitor + stats = GetRequestStatsMonitor().get_request_stats(time.time()) + for server, stat in stats.items(): + current_qps.labels(server=server).set(stat.qps) + avg_decoding_length.labels(server=server).set(stat.ttft) + num_prefill_requests.labels(server=server).set(stat.in_prefill_requests) + num_decoding_requests.labels(server=server).set(stat.in_decoding_requests) + vnum_requests_running.labels(server=server).set(stat.in_prefill_requests + stat.in_decoding_requests) + return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST) # --- Argument Parsing and Initialization --- def validate_args(args): @@ -552,36 +569,29 @@ def log_stats(interval: int = 10): request_stats = GetRequestStatsMonitor().get_request_stats(time.time()) for endpoint in endpoints: url = endpoint.url - logstr += f"Model: {endpoint.model_name}\n" logstr += f"Server: {url}\n" if url in engine_stats: - num_running_requests = engine_stats[url].num_running_requests - num_queing_requests = engine_stats[url].num_queuing_requests - gpu_cache_hit_rate = engine_stats[url].gpu_cache_hit_rate + es = engine_stats[url] logstr += ( - f" Engine stats: {num_running_requests} running requests, " - f"{num_queing_requests} queuing requests, {gpu_cache_hit_rate:.2f} GPU cache hit rate\n" + f" Engine Stats (Dashboard): Running Requests: {es.num_running_requests}, " + f"Queueing Delay (requests): {es.num_queing_requests}, " + f"GPU Cache Hit Rate: {es.gpu_cache_hit_rate:.2f}\n" ) else: - logstr += " Engine stats: No stats available\n" - + logstr += " Engine Stats: No stats available\n" if url in request_stats: - qps = request_stats[url].qps - num_requests = request_stats[url].ttft - in_prefill_requests = request_stats[url].in_prefill_requests - in_decoding_requets = request_stats[url].in_decoding_requests - finished_requests = request_stats[url].finished_requests - uptime = request_stats[url].uptime + rs = request_stats[url] logstr += ( - f" Request Stats: {qps:.2f} QPS, {num_requests} TTFT, " - f"{in_prefill_requests} in prefill, {in_decoding_requets} in decoding, " - f"{finished_requests} finished, uptime {uptime:.2f} seconds\n" + f" Request Stats (Dashboard): Current QPS: {rs.qps:.2f}, " + f"Avg Decoding Length: {rs.ttft}, " + f"Prefill Requests: {rs.in_prefill_requests}, " + f"Decoding Requests: {rs.in_decoding_requests}, " + f"Finished Requests: {rs.finished_requests}, " + f"Uptime: {rs.uptime:.2f} sec\n" ) - vnum_requests_running.labels(server=url).set(qps) else: - logstr += " Request Stats: No stats available\n" - + logstr += " Request Stats: No stats available\n" logstr += "-" * 50 + "\n" logstr += "=" * 50 + "\n" logger.info(logstr) diff --git a/src/vllm_router/run-router.sh b/src/vllm_router/run-router.sh index 
cde00c46..6fb4cad2 100644
--- a/src/vllm_router/run-router.sh
+++ b/src/vllm_router/run-router.sh
@@ -4,7 +4,7 @@ if [[ $# -ne 1 ]]; then
     exit 1
 fi
 
-python3 router.py --port "$1" \
+python3 vllm_router/router.py --port "$1" \
     --service-discovery k8s \
     --k8s-label-selector release=test \
     --k8s-namespace default \

From c133f82c958508e392f0d33b7822b046d6876b9c Mon Sep 17 00:00:00 2001
From: sitloboi2012
Date: Thu, 13 Feb 2025 00:26:03 +0700
Subject: [PATCH 03/15] update run-router.sh to support both static and k8s
 service discovery

Signed-off-by: sitloboi2012

---
 observability/README.md       |  8 +++++++-
 src/vllm_router/router.py     |  2 +-
 src/vllm_router/run-router.sh | 26 +++++++++++++++++++-------
 3 files changed, 27 insertions(+), 9 deletions(-)
 mode change 100644 => 100755 src/vllm_router/run-router.sh

diff --git a/observability/README.md b/observability/README.md
index 4bf83de5..c903b888 100644
--- a/observability/README.md
+++ b/observability/README.md
@@ -23,7 +23,7 @@ sudo bash install.sh
 
 After installing, the dashboard can be accessed through the service `service/kube-prom-stack-grafana` in the `monitoring` namespace.
 
-## Access the Grafana dashboard
+## Access the Grafana & Prometheus dashboards
 
 Forward the Grafana dashboard port to the local node-port
 
```bash
sudo kubectl --namespace monitoring port-forward svc/kube-prom-stack-grafana 3000:80 --address 0.0.0.0
```
 
+Forward the Prometheus dashboard port
+
+```bash
+sudo kubectl --namespace monitoring port-forward prometheus-kube-prom-stack-kube-prome-prometheus-0 9090:9090
+```
+
 Open the webpage at `http://:3000` to access the Grafana web page. The default user name is `admin` and the password can be configured in `values.yaml` (default is `prom-operator`).
 
 Import the dashboard using the `vllm-dashboard.json` in this folder.
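
Once both port-forwards are active, the same metrics can also be pulled programmatically instead of through the web UI. As a minimal sketch (assuming the Prometheus port-forward above, i.e. Prometheus on `localhost:9090`), the Prometheus HTTP API returns the router's per-server gauges:

```python
# Minimal sketch: query a router gauge through the Prometheus HTTP API.
# Assumes the port-forward above is active, exposing Prometheus on localhost:9090.
import requests

PROM_URL = "http://localhost:9090"  # assumption: local port-forward target

resp = requests.get(
    f"{PROM_URL}/api/v1/query",
    params={"query": "vllm:num_requests_running"},
    timeout=5,
)
resp.raise_for_status()

for result in resp.json()["data"]["result"]:
    server = result["metric"].get("server", "<unknown>")
    _ts, value = result["value"]  # instant vector sample: [unix_ts, "value"]
    print(f"{server}: {value} running requests")
```

An empty `result` list usually means Prometheus has not scraped the router yet or the metric name does not match; the expression browser at `http://localhost:9090` is the quickest way to check what is actually being ingested.
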
diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py
index 0908d791..a9d1e659 100644
--- a/src/vllm_router/router.py
+++ b/src/vllm_router/router.py
@@ -575,7 +575,7 @@ def log_stats(interval: int = 10):
             es = engine_stats[url]
             logstr += (
                 f"  Engine Stats (Dashboard): Running Requests: {es.num_running_requests}, "
-                f"Queueing Delay (requests): {es.num_queing_requests}, "
+                f"Queueing Delay (requests): {es.num_queuing_requests}, "
                 f"GPU Cache Hit Rate: {es.gpu_cache_hit_rate:.2f}\n"
             )
         else:
diff --git a/src/vllm_router/run-router.sh b/src/vllm_router/run-router.sh
old mode 100644
new mode 100755
index 6fb4cad2..4137a46f
--- a/src/vllm_router/run-router.sh
+++ b/src/vllm_router/run-router.sh
@@ -4,15 +4,27 @@ if [[ $# -ne 1 ]]; then
   exit 1
 fi
 
-python3 vllm_router/router.py --port "$1" \
-    --service-discovery k8s \
-    --k8s-label-selector release=test \
-    --k8s-namespace default \
-    --routing-logic session \
-    --session-key "x-user-id" \
+# Use this command when testing with k8s service discovery
+# python3 -m vllm_router.router --port "$1" \
+#     --service-discovery k8s \
+#     --k8s-label-selector release=test \
+#     --k8s-namespace default \
+#     --routing-logic session \
+#     --session-key "x-user-id" \
+#     --engine-stats-interval 10 \
+#     --log-stats
+
+# Use this command when testing with static service discovery
+python3 -m vllm_router.router --port "$1" \
+    --service-discovery static \
+    --static-backends "http://localhost:9000" \
+    --static-models "fake_model_name" \
     --engine-stats-interval 10 \
-    --log-stats
+    --log-stats \
+    --routing-logic session \
+    --session-key "x-user-id"
 
+# Use this command when testing with roundrobin routing logic
 #python3 router.py --port "$1" \
 #    --service-discovery k8s \
 #    --k8s-label-selector release=test \

From 629f88d8b9d4c606a0f8fbe50972984374cf1e49 Mon Sep 17 00:00:00 2001
From: sitloboi2012
Date: Thu, 13 Feb 2025 09:57:18 +0700
Subject: [PATCH 04/15] rebase with main

Signed-off-by: sitloboi2012
---
 observability/README.md    |  1 +
 src/tests/requirements.txt |  2 +-
 src/vllm_router/router.py  | 34 ++++++++++++++++++++--------
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/observability/README.md b/observability/README.md
index c903b888..9b79d73e 100644
--- a/observability/README.md
+++ b/observability/README.md
@@ -17,6 +17,7 @@ Make sure to have:
   - Or follow our [tutorial](tutorials/00-install-kubernetes-env.md)
 
 After that you can run:
+
 ```bash
 sudo bash install.sh
 ```
diff --git a/src/tests/requirements.txt b/src/tests/requirements.txt
index 07fcefca..743b58aa 100644
--- a/src/tests/requirements.txt
+++ b/src/tests/requirements.txt
@@ -1,5 +1,5 @@
 fastapi
 httpx
-uvicorn
 openai
+uvicorn
 vllm
diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py
index a9d1e659..a23e280c 100644
--- a/src/vllm_router/router.py
+++ b/src/vllm_router/router.py
@@ -9,7 +9,7 @@
 import uvicorn
 from fastapi import FastAPI, Request, UploadFile
 from fastapi.responses import JSONResponse, Response, StreamingResponse
-from prometheus_client import Gauge, generate_latest, CONTENT_TYPE_LATEST
+from prometheus_client import CONTENT_TYPE_LATEST, Gauge, generate_latest
 
 from vllm_router.batch import BatchProcessor, initialize_batch_processor
 from vllm_router.engine_stats import GetEngineStatsScraper, InitializeEngineStatsScraper
@@ -51,13 +51,22 @@ async def lifespan(app: FastAPI):
     "vllm:num_requests_running", "Number of running requests", ["server"]
 )
 current_qps = Gauge("vllm:current_qps", "Current Queries Per Second", ["server"])
-avg_decoding_length = Gauge("vllm:avg_decoding_length", "Average Decoding Length", ["server"])
-num_prefill_requests = Gauge("vllm:num_prefill_requests", "Number of Prefill Requests", ["server"])
-num_decoding_requests = Gauge("vllm:num_decoding_requests", "Number of Decoding Requests", ["server"])
+avg_decoding_length = Gauge(
+    "vllm:avg_decoding_length", "Average Decoding Length", ["server"]
+)
+num_prefill_requests = Gauge(
+    "vllm:num_prefill_requests", "Number of Prefill Requests", ["server"]
+)
+num_decoding_requests = Gauge(
+    "vllm:num_decoding_requests", "Number of Decoding Requests", ["server"]
+)
 
-# --- Request Processing & Routing ---
+
+# --- Request Processing & Routing ---
 # TODO: better request id system
-async def process_request(method, header, body, backend_url, request_id, endpoint, debug_request=None):
+async def process_request(
+    method, header, body, backend_url, request_id, endpoint, debug_request=None
+):
     """
     Async generator to stream data from the backend server to the client.
     """
@@ -85,7 +94,9 @@ async def process_request(method, header, body, backend_url, request_id, endpoin
             total_len += len(chunk)
             if not first_token:
                 first_token = True
-                GetRequestStatsMonitor().on_request_response(backend_url, request_id, time.time())
+                GetRequestStatsMonitor().on_request_response(
+                    backend_url, request_id, time.time()
+                )
             yield chunk
 
     GetRequestStatsMonitor().on_request_complete(backend_url, request_id, time.time())
@@ -93,6 +104,7 @@ async def process_request(method, header, body, backend_url, request_id, endpoin
     # Optional debug logging can be enabled here.
     # logger.debug(f"Finished the request with id: {debug_request.headers.get('x-request-id', None)} at {time.time()}")
 
+
 async def route_general_request(request: Request, endpoint: str):
     """
     Route the incoming request to the backend server and stream the response back to the client.
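The hunks above mostly re-wrap long lines, but `process_request` is the piece that produces the router's TTFT signal: the first streamed chunk flips `first_token` and stamps `on_request_response`, and `on_request_complete` fires once the stream is exhausted. A condensed sketch of that pattern, with `GetRequestStatsMonitor` reduced to plain callbacks (`on_first_token` and `on_complete` are stand-ins, not router API):

```python
# Sketch of the first-token bookkeeping used by process_request; the real
# code reports to GetRequestStatsMonitor(), reduced here to two callbacks.
import asyncio
import time
from typing import AsyncIterator, Callable


async def stream_with_bookkeeping(
    chunks: AsyncIterator[bytes],
    on_first_token: Callable[[float], None],
    on_complete: Callable[[float], None],
) -> AsyncIterator[bytes]:
    first_token = False
    async for chunk in chunks:
        if not first_token:
            first_token = True
            on_first_token(time.time())  # first chunk arrival -> TTFT sample
        yield chunk
    on_complete(time.time())  # stream exhausted -> request finished


async def _demo() -> None:
    async def fake_backend() -> AsyncIterator[bytes]:
        for piece in (b"Hello, ", b"world!"):
            await asyncio.sleep(0.05)  # pretend the backend streams slowly
            yield piece

    start = time.time()
    async for _ in stream_with_bookkeeping(
        fake_backend(),
        on_first_token=lambda t: print(f"TTFT: {t - start:.3f}s"),
        on_complete=lambda t: print(f"total: {t - start:.3f}s"),
    ):
        pass


if __name__ == "__main__":
    asyncio.run(_demo())
```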
@@ -306,6 +318,7 @@ async def route_chat_completition(request: Request):
 async def route_completition(request: Request):
     return await route_general_request(request, "/v1/completions")
 
+
 @app.get("/version")
 async def show_version():
     ver = {"version": STACK_VERSION}
@@ -332,6 +345,7 @@ async def show_models():
     model_list = ModelList(data=model_cards)
     return JSONResponse(content=model_list.model_dump())
 
+
 @app.get("/health")
 async def health() -> Response:
     """Health check: verifies that service discovery and engine stats scraping are operational."""
@@ -345,6 +359,7 @@ async def health() -> Response:
         )
     return Response(status_code=200)
 
+
 # --- Prometheus Metrics Endpoint (v2 observation/tracking) ---
 @app.get("/metrics")
 async def metrics():
@@ -355,9 +370,12 @@ async def metrics():
         avg_decoding_length.labels(server=server).set(stat.ttft)
         num_prefill_requests.labels(server=server).set(stat.in_prefill_requests)
         num_decoding_requests.labels(server=server).set(stat.in_decoding_requests)
-        vnum_requests_running.labels(server=server).set(stat.in_prefill_requests + stat.in_decoding_requests)
+        vnum_requests_running.labels(server=server).set(
+            stat.in_prefill_requests + stat.in_decoding_requests
+        )
     return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
 
+
 # --- Argument Parsing and Initialization ---
 def validate_args(args):

From a885e9f3a7eca59ca98920edb871478a11c54048 Mon Sep 17 00:00:00 2001
From: sitloboi2012
Date: Wed, 12 Feb 2025 23:21:50 +0700
Subject: [PATCH 05/15] extend vllm-dashboard and the router with add-on
 metrics (core vLLM metrics, operational metrics, router observability
 metrics), update requirements.txt for the router and tests, make
 install-minikube-cluster log more info, and restart the docker service and
 minikube context after the run

Signed-off-by: sitloboi2012
---
 observability/README.md           |   3 +
 observability/vllm-dashboard.json | 321 ++++++++++++++++++++++++++++++
 src/tests/requirements.txt        |   1 +
 src/vllm_router/router.py         |  79 +++++++-
 4 files changed, 399 insertions(+), 5 deletions(-)

diff --git a/observability/README.md b/observability/README.md
index 9b79d73e..2fe4d03d 100644
--- a/observability/README.md
+++ b/observability/README.md
@@ -17,7 +17,10 @@ Make sure to have:
   - Or follow our [tutorial](tutorials/00-install-kubernetes-env.md)
 
 After that you can run:
+<<<<<<< HEAD
 
+=======
+>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run)
 ```bash
 sudo bash install.sh
 ```
diff --git a/observability/vllm-dashboard.json b/observability/vllm-dashboard.json
index b95e79c5..ebfe6894 100644
--- a/observability/vllm-dashboard.json
+++ b/observability/vllm-dashboard.json
@@ -25,7 +25,11 @@
     "id": 100,
     "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
     "collapsed": false,
+<<<<<<< HEAD
     "title": "Overview of the system",
+=======
+    "title": "Core vLLM Metrics",
+>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run)
     "type": "row"
   },
   {
@@ -55,7 +59,11 @@
     "justifyMode": "auto",
     "orientation": "auto",
     "percentChangeColorMode":
"standard", +<<<<<<< HEAD "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, +======= + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "showPercentChange": false, "textMode": "auto", "wideLayout": true @@ -98,7 +106,11 @@ "showUnfilled": true, "sizing": "auto", "valueMode": "color", +<<<<<<< HEAD "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } +======= + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) }, "pluginVersion": "11.4.0", "targets": [ @@ -111,6 +123,7 @@ "range": true } ] +<<<<<<< HEAD }, { "id": 101, @@ -125,6 +138,15 @@ "title": "Current QPS", "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 7, "w": 6, "x": 0, "y": 9 }, +======= + }, + { + "id": 3, + "type": "bargauge", + "title": "Request TTFT distribution", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 9, "x": 15, "y": 1 }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -140,6 +162,7 @@ "overrides": [] }, "options": { +<<<<<<< HEAD "colorMode": "value", "graphMode": "area", "justifyMode": "auto", @@ -162,6 +185,44 @@ "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, +======= + "displayMode": "gradient", + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", + "editorMode": "builder", + "format": "heatmap", + "legendFormat": "__auto", + "refId": "A", + "range": true + } + ] + }, + { + "id": 101, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, + "collapsed": false, + "title": "Operational Metrics", + "type": "row", + "note": "Metrics related to the operational state of the vLLM instances." 
+ }, + { + "id": 4, + "type": "timeseries", + "title": "Number of running requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "mappings": [], "thresholds": { "mode": "absolute", @@ -174,6 +235,7 @@ "overrides": [] }, "options": { +<<<<<<< HEAD "displayMode": "gradient", "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "showUnfilled": true, @@ -184,10 +246,25 @@ "pluginVersion": "11.4.0", "targets": [ { "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", "editorMode": "builder", "legendFormat": "__auto", "refId": "A", "range": true } +======= + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:num_requests_running", + "editorMode": "builder", + "legendFormat": "{{instance}}", + "refId": "A", + "range": true + } +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) ] }, { "id": 5, +<<<<<<< HEAD "type": "stat", "title": "Router-side Queueing Delay", "datasource": { "type": "prometheus", "uid": "prometheus" }, @@ -195,6 +272,15 @@ "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, +======= + "type": "timeseries", + "title": "GPU KV Usage percent", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "mappings": [], "thresholds": { "mode": "absolute", @@ -207,6 +293,7 @@ "overrides": [] }, "options": { +<<<<<<< HEAD "colorMode": "value", "graphMode": "area", "justifyMode": "auto", @@ -218,10 +305,25 @@ "pluginVersion": "11.4.0", "targets": [ { "expr": "vllm:router_queueing_delay_seconds", "editorMode": "builder", "legendFormat": "Queueing Delay", "refId": "A", "range": true } +======= + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:gpu_cache_usage_perc", + "editorMode": "builder", + "legendFormat": "{{instance}}", + "refId": "A", + "range": true + } +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) ] }, { 
"id": 6, +<<<<<<< HEAD "type": "stat", "title": "Average Prefill Length", "datasource": { "type": "prometheus", "uid": "prometheus" }, @@ -229,6 +331,15 @@ "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, +======= + "type": "timeseries", + "title": "Number of pending requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "mappings": [], "thresholds": { "mode": "absolute", @@ -241,6 +352,7 @@ "overrides": [] }, "options": { +<<<<<<< HEAD "colorMode": "value", "graphMode": "area", "justifyMode": "auto", @@ -252,14 +364,34 @@ "pluginVersion": "11.4.0", "targets": [ { "expr": "vllm:avg_prefill_length", "editorMode": "builder", "legendFormat": "Avg. Prefill Length", "refId": "A", "range": true } +======= + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:num_requests_waiting", + "editorMode": "builder", + "legendFormat": "{{instance}}", + "refId": "A", + "range": true + } +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) ] }, { "id": 7, "type": "timeseries", +<<<<<<< HEAD "title": "Number of Prefilling Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 16 }, +======= + "title": "GPU KV cache hit rate", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -280,6 +412,7 @@ }, "pluginVersion": "11.4.0", "targets": [ +<<<<<<< HEAD { "expr": "vllm:num_prefill_requests", "editorMode": "builder", "legendFormat": "Prefilling Requests", "refId": "A", "range": true } ] }, @@ -318,6 +451,31 @@ "title": "Average Decoding Length", "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 16 }, +======= + { + "expr": "vllm:gpu_prefix_cache_hit_rate", + "editorMode": "builder", + "legendFormat": "{{instance}}", + "refId": "A", + "range": true + } + ] + }, + { + "id": 102, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }, + "collapsed": false, + "title": "Router Observability Metrics", + "type": "row", + "note": "Metrics related to the router-side queueing delay and current QPS." 
+ }, + { + "id": 8, + "type": "stat", + "title": "Current QPS", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 0, "y": 26 }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -337,12 +495,17 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", +<<<<<<< HEAD "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, +======= + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ +<<<<<<< HEAD { "expr": "vllm:avg_decoding_length", "editorMode": "builder", "legendFormat": "Avg. Decoding Length", "refId": "A", "range": true } ] }, @@ -353,13 +516,109 @@ "title": "Serving Engine Load", "type": "row", "note": "Metrics indicating the load on the serving engine." +======= + { + "expr": "vllm:current_qps", + "editorMode": "builder", + "legendFormat": "Current QPS", + "refId": "A", + "range": true + } + ] + }, + { + "id": 9, + "type": "stat", + "title": "Router-side Queueing Delay", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 6, "y": 26 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:router_queueing_delay_seconds", + "editorMode": "builder", + "legendFormat": "Queueing Delay", + "refId": "A", + "range": true + } + ] + }, + { + "id": 12, + "type": "stat", + "title": "Average Prefill Length", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 12, "y": 26 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:avg_prefill_length", + "editorMode": "builder", + "legendFormat": "Avg. 
Prefill Length", + "refId": "A", + "range": true + } + ] +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) }, { "id": 10, "type": "timeseries", +<<<<<<< HEAD "title": "Number of Running Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, +======= + "title": "Number of Prefilling Requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 18, "y": 26 }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -380,15 +639,31 @@ }, "pluginVersion": "11.4.0", "targets": [ +<<<<<<< HEAD { "expr": "vllm:num_requests_running", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } +======= + { + "expr": "vllm:num_prefill_requests", + "editorMode": "builder", + "legendFormat": "Prefilling Requests", + "refId": "A", + "range": true + } +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) ] }, { "id": 11, "type": "timeseries", +<<<<<<< HEAD "title": "Number of Pending Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, +======= + "title": "Number of Decoding Requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 16, "x": 0, "y": 33 }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -409,6 +684,7 @@ }, "pluginVersion": "11.4.0", "targets": [ +<<<<<<< HEAD { "expr": "vllm:num_requests_waiting", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } ] }, @@ -439,10 +715,20 @@ "pluginVersion": "11.4.0", "targets": [ { "expr": "vllm:gpu_cache_usage_perc", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } +======= + { + "expr": "vllm:num_decoding_requests", + "editorMode": "builder", + "legendFormat": "Decoding Requests", + "refId": "A", + "range": true + } +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) ] }, { "id": 13, +<<<<<<< HEAD "type": "timeseries", "title": "GPU KV Cache Hit Rate", "datasource": { "type": "prometheus", "uid": "prometheus" }, @@ 
-450,6 +736,15 @@ "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, +======= + "type": "stat", + "title": "Average Decoding Length", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 33 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "mappings": [], "thresholds": { "mode": "absolute", @@ -462,6 +757,7 @@ "overrides": [] }, "options": { +<<<<<<< HEAD "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, @@ -592,6 +888,25 @@ "pluginVersion": "11.4.0", "targets": [ { "expr": "node_disk_usage_query", "editorMode": "builder", "legendFormat": "Disk Usage", "refId": "A", "range": true } +======= + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "vllm:avg_decoding_length", + "editorMode": "builder", + "legendFormat": "Avg. Decoding Length", + "refId": "A", + "range": true + } +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) ] } ], @@ -603,8 +918,14 @@ "time": { "from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "browser", +<<<<<<< HEAD "title": "vLLM Dashboard", "uid": "750918234", "version": 20, +======= + "title": "vllm dashboard", + "uid": "ee9i0i4y606psc", + "version": 18, +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) "weekStart": "" } diff --git a/src/tests/requirements.txt b/src/tests/requirements.txt index 743b58aa..a5a09113 100644 --- a/src/tests/requirements.txt +++ b/src/tests/requirements.txt @@ -2,4 +2,5 @@ fastapi httpx openai uvicorn +openai vllm diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py index a23e280c..654fa99a 100644 --- a/src/vllm_router/router.py +++ b/src/vllm_router/router.py @@ -50,6 +50,7 @@ async def lifespan(app: FastAPI): vnum_requests_running = Gauge( "vllm:num_requests_running", "Number of running requests", ["server"] ) +<<<<<<< HEAD current_qps = Gauge("vllm:current_qps", "Current Queries Per Second", ["server"]) avg_decoding_length = Gauge( "vllm:avg_decoding_length", "Average Decoding Length", ["server"] @@ -67,6 +68,12 @@ async def lifespan(app: FastAPI): async def process_request( method, header, body, backend_url, request_id, endpoint, debug_request=None ): +======= + +# --- Request Processing & Routing --- +# TODO: better request id system +async def process_request(method, header, body, backend_url, request_id, endpoint, debug_request=None): +>>>>>>> cccef7d (update vllm-dashboard 
and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) """ Async generator to stream data from the backend server to the client. """ @@ -94,16 +101,19 @@ async def process_request( total_len += len(chunk) if not first_token: first_token = True - GetRequestStatsMonitor().on_request_response( - backend_url, request_id, time.time() - ) + GetRequestStatsMonitor().on_request_response(backend_url, request_id, time.time()) yield chunk GetRequestStatsMonitor().on_request_complete(backend_url, request_id, time.time()) +<<<<<<< HEAD logger.info(f"Completed request {request_id} for backend {backend_url}") # Optional debug logging can be enabled here. # logger.debug(f"Finished the request with id: {debug_request.headers.get('x-request-id', None)} at {time.time()}") +======= + # Optional debug logging can be enabled here. + # logger.debug(f"Finished the request with id: {debug_request.headers.get('x-request-id', None)} at {time.time()}") +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def route_general_request(request: Request, endpoint: str): """ @@ -162,8 +172,13 @@ async def route_general_request(request: Request, endpoint: str): media_type="text/event-stream", ) +<<<<<<< HEAD @app.post("/v1/files") +======= +# --- File Endpoints --- +@app.post("/files") +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def route_files(request: Request): """Handle file upload requests that include a purpose and file data.""" form = await request.form() @@ -189,8 +204,12 @@ async def route_files(request: Request): status_code=500, content={"error": f"Failed to save file: {str(e)}"} ) +<<<<<<< HEAD @app.get("/v1/files/{file_id}") +======= +@app.get("/files/{file_id}") +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def route_get_file(file_id: str): try: storage: Storage = app.state.batch_storage @@ -201,8 +220,12 @@ async def route_get_file(file_id: str): status_code=404, content={"error": f"File {file_id} not found"} ) +<<<<<<< HEAD @app.get("/v1/files/{file_id}/content") +======= +@app.get("/files/{file_id}/content") +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def route_get_file_content(file_id: str): try: # TODO(gaocegege): Stream the file content with chunks to support @@ -215,6 +238,7 @@ async def route_get_file_content(file_id: str): 
status_code=404, content={"error": f"File {file_id} not found"} ) +<<<<<<< HEAD @app.post("/v1/batches") async def route_batches(request: Request): @@ -315,17 +339,28 @@ async def route_chat_completition(request: Request): @app.post("/v1/completions") +======= +# --- API Endpoints --- +@app.post("/chat/completions") +async def route_chat_completition(request: Request): + return await route_general_request(request, "/v1/chat/completions") + +@app.post("/completions") +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def route_completition(request: Request): return await route_general_request(request, "/v1/completions") - @app.get("/version") async def show_version(): ver = {"version": STACK_VERSION} return JSONResponse(content=ver) +<<<<<<< HEAD @app.get("/v1/models") +======= +@app.get("/models") +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def show_models(): endpoints = GetServiceDiscovery().get_endpoint_info() existing_models = set() @@ -345,7 +380,6 @@ async def show_models(): model_list = ModelList(data=model_cards) return JSONResponse(content=model_list.model_dump()) - @app.get("/health") async def health() -> Response: """Health check: verifies that service discovery and engine stats scraping are operational.""" @@ -359,7 +393,12 @@ async def health() -> Response: ) return Response(status_code=200) +# --- Prometheus Metrics Endpoint (v2 observation/tracking) --- +@app.get("/metrics") +async def metrics(): + return Response(generate_latest(), media_type="text/plain") +<<<<<<< HEAD # --- Prometheus Metrics Endpoint (v2 observation/tracking) --- @app.get("/metrics") async def metrics(): @@ -376,6 +415,8 @@ async def metrics(): return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST) +======= +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) # --- Argument Parsing and Initialization --- def validate_args(args): if args.service_discovery == "static": @@ -587,6 +628,7 @@ def log_stats(interval: int = 10): request_stats = GetRequestStatsMonitor().get_request_stats(time.time()) for endpoint in endpoints: url = endpoint.url +<<<<<<< HEAD logstr += f"Model: {endpoint.model_name}\n" logstr += f"Server: {url}\n" if url in engine_stats: @@ -595,10 +637,23 @@ def log_stats(interval: int = 10): f" Engine Stats (Dashboard): Running Requests: {es.num_running_requests}, " f"Queueing Delay (requests): {es.num_queuing_requests}, " f"GPU Cache Hit Rate: {es.gpu_cache_hit_rate:.2f}\n" +======= + + logstr += f"Model: {endpoint.model_name}\n" + logstr += f"Server: {url}\n" + if url in engine_stats: + num_running_requests = engine_stats[url].num_running_requests + num_queing_requests = engine_stats[url].num_queuing_requests + gpu_cache_hit_rate = engine_stats[url].gpu_cache_hit_rate + logstr += ( + f" Engine 
stats: {num_running_requests} running requests, " + f"{num_queing_requests} queuing requests, {gpu_cache_hit_rate:.2f} GPU cache hit rate\n" +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) ) else: logstr += " Engine Stats: No stats available\n" if url in request_stats: +<<<<<<< HEAD rs = request_stats[url] logstr += ( f" Request Stats (Dashboard): Current QPS: {rs.qps:.2f}, " @@ -608,6 +663,20 @@ def log_stats(interval: int = 10): f"Finished Requests: {rs.finished_requests}, " f"Uptime: {rs.uptime:.2f} sec\n" ) +======= + qps = request_stats[url].qps + num_requests = request_stats[url].ttft + in_prefill_requests = request_stats[url].in_prefill_requests + in_decoding_requets = request_stats[url].in_decoding_requests + finished_requests = request_stats[url].finished_requests + uptime = request_stats[url].uptime + logstr += ( + f" Request Stats: {qps:.2f} QPS, {num_requests} TTFT, " + f"{in_prefill_requests} in prefill, {in_decoding_requets} in decoding, " + f"{finished_requests} finished, uptime {uptime:.2f} seconds\n" + ) + vnum_requests_running.labels(server=url).set(qps) +>>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) else: logstr += " Request Stats: No stats available\n" logstr += "-" * 50 + "\n" From c4b14050ca522666a08cb7856036bfe0b8b58328 Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Wed, 12 Feb 2025 23:54:14 +0700 Subject: [PATCH 06/15] update the router and vllm-dashboard to align with the reference from @YuhanLiu11 Signed-off-by: sitloboi2012 --- observability/vllm-dashboard.json | 350 ++++++++++++++++++++++++++++-- src/vllm_router/router.py | 69 ------ src/vllm_router/run-router.sh | 11 + 3 files changed, 337 insertions(+), 93 deletions(-) diff --git a/observability/vllm-dashboard.json b/observability/vllm-dashboard.json index ebfe6894..d0334e40 100644 --- a/observability/vllm-dashboard.json +++ b/observability/vllm-dashboard.json @@ -25,11 +25,15 @@ "id": 100, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "collapsed": false, +<<<<<<< HEAD <<<<<<< HEAD "title": "Overview of the system", ======= "title": "Core vLLM Metrics", >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "title": "Overview of the system", +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "type": "row" }, { @@ -59,11 +63,15 @@ "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", +<<<<<<< HEAD <<<<<<< HEAD "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, ======= "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for 
router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "showPercentChange": false, "textMode": "auto", "wideLayout": true @@ -106,11 +114,15 @@ "showUnfilled": true, "sizing": "auto", "valueMode": "color", +<<<<<<< HEAD <<<<<<< HEAD "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } ======= "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) }, "pluginVersion": "11.4.0", "targets": [ @@ -140,13 +152,58 @@ "gridPos": { "h": 7, "w": 6, "x": 0, "y": 9 }, ======= }, + { + "id": 101, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, + "collapsed": false, + "title": "QoS Information", + "type": "row" + }, { "id": 3, + "type": "stat", + "title": "Current QPS", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 6, "x": 0, "y": 9 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "vllm:current_qps", "editorMode": "builder", "legendFormat": "Current QPS", "refId": "A", "range": true } + ] + }, + { + "id": 4, "type": "bargauge", "title": "Request TTFT distribution", "datasource": { "type": "prometheus", "uid": "prometheus" }, +<<<<<<< HEAD "gridPos": { "h": 7, "w": 9, "x": 15, "y": 1 }, >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "gridPos": { "h": 7, "w": 6, "x": 6, "y": 9 }, +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -191,10 +248,11 @@ "showUnfilled": true, "sizing": "auto", "valueMode": "color", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } }, "pluginVersion": "11.4.0", "targets": [ +<<<<<<< HEAD { "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", "editorMode": "builder", @@ -260,10 +318,14 @@ "range": true } >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, 
router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + { "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", "editorMode": "builder", "legendFormat": "__auto", "refId": "A", "range": true } +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, { "id": 5, +<<<<<<< HEAD <<<<<<< HEAD "type": "stat", "title": "Router-side Queueing Delay", @@ -275,12 +337,20 @@ ======= "type": "timeseries", "title": "GPU KV Usage percent", +======= + "type": "stat", + "title": "Router-side Queueing Delay", +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, + "gridPos": { "h": 7, "w": 6, "x": 12, "y": 9 }, "fieldConfig": { "defaults": { +<<<<<<< HEAD "color": { "mode": "palette-classic" }, >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "color": { "mode": "thresholds" }, +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "mappings": [], "thresholds": { "mode": "absolute", @@ -294,6 +364,9 @@ }, "options": { <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "colorMode": "value", "graphMode": "area", "justifyMode": "auto", @@ -301,6 +374,7 @@ "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true +<<<<<<< HEAD }, "pluginVersion": "11.4.0", "targets": [ @@ -319,10 +393,17 @@ "range": true } >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "vllm:router_queueing_delay_seconds", "editorMode": "builder", "legendFormat": "Queueing Delay", "refId": "A", "range": true } +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, { "id": 6, +<<<<<<< HEAD <<<<<<< HEAD "type": "stat", "title": "Average Prefill Length", @@ -334,12 +415,20 @@ ======= "type": "timeseries", "title": "Number of pending requests", +======= + "type": "stat", + "title": "Average Prefill Length", +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 }, + "gridPos": { "h": 7, "w": 6, "x": 18, "y": 9 }, "fieldConfig": { "defaults": { +<<<<<<< HEAD "color": { "mode": "palette-classic" }, >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + 
"color": { "mode": "thresholds" }, +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "mappings": [], "thresholds": { "mode": "absolute", @@ -353,6 +442,9 @@ }, "options": { <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "colorMode": "value", "graphMode": "area", "justifyMode": "auto", @@ -360,6 +452,7 @@ "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true +<<<<<<< HEAD }, "pluginVersion": "11.4.0", "targets": [ @@ -378,11 +471,18 @@ "range": true } >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "vllm:avg_prefill_length", "editorMode": "builder", "legendFormat": "Avg. Prefill Length", "refId": "A", "range": true } +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, { "id": 7, "type": "timeseries", +<<<<<<< HEAD <<<<<<< HEAD "title": "Number of Prefilling Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, @@ -392,6 +492,11 @@ "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 }, >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "title": "Number of Prefilling Requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 16 }, +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -412,6 +517,7 @@ }, "pluginVersion": "11.4.0", "targets": [ +<<<<<<< HEAD <<<<<<< HEAD { "expr": "vllm:num_prefill_requests", "editorMode": "builder", "legendFormat": "Prefilling Requests", "refId": "A", "range": true } ] @@ -459,26 +565,25 @@ "refId": "A", "range": true } +======= + { "expr": "vllm:num_prefill_requests", "editorMode": "builder", "legendFormat": "Prefilling Requests", "refId": "A", "range": true } +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, - { - "id": 102, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }, - "collapsed": false, - "title": "Router Observability Metrics", - "type": "row", - "note": "Metrics related to the router-side queueing delay and current QPS." 
- }, { "id": 8, - "type": "stat", - "title": "Current QPS", + "type": "timeseries", + "title": "Number of Decoding Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, +<<<<<<< HEAD "gridPos": { "h": 7, "w": 6, "x": 0, "y": 26 }, >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 16 }, +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", @@ -491,6 +596,7 @@ "overrides": [] }, "options": { +<<<<<<< HEAD "colorMode": "value", "graphMode": "area", "justifyMode": "auto", @@ -524,14 +630,22 @@ "refId": "A", "range": true } +======= + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "vllm:num_decoding_requests", "editorMode": "builder", "legendFormat": "Decoding Requests", "refId": "A", "range": true } +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, { "id": 9, "type": "stat", - "title": "Router-side Queueing Delay", + "title": "Average Decoding Length", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 6, "y": 26 }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 16 }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -551,22 +665,17 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ - { - "expr": "vllm:router_queueing_delay_seconds", - "editorMode": "builder", - "legendFormat": "Queueing Delay", - "refId": "A", - "range": true - } + { "expr": "vllm:avg_decoding_length", "editorMode": "builder", "legendFormat": "Avg. Decoding Length", "refId": "A", "range": true } ] }, { +<<<<<<< HEAD "id": 12, "type": "stat", "title": "Average Prefill Length", @@ -606,10 +715,19 @@ } ] >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "id": 102, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "collapsed": false, + "title": "Serving Engine Load", + "type": "row", + "note": "Metrics indicating the load on the serving engine." 
+>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) }, { "id": 10, "type": "timeseries", +<<<<<<< HEAD <<<<<<< HEAD "title": "Number of Running Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, @@ -619,6 +737,11 @@ "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 7, "w": 6, "x": 18, "y": 26 }, >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "title": "Number of Running Requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -639,6 +762,7 @@ }, "pluginVersion": "11.4.0", "targets": [ +<<<<<<< HEAD <<<<<<< HEAD { "expr": "vllm:num_requests_running", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } ======= @@ -650,11 +774,15 @@ "range": true } >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + { "expr": "vllm:num_requests_running", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, { "id": 11, "type": "timeseries", +<<<<<<< HEAD <<<<<<< HEAD "title": "Number of Pending Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, @@ -664,6 +792,11 @@ "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 7, "w": 16, "x": 0, "y": 33 }, >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "title": "Number of Pending Requests", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -685,6 +818,9 @@ "pluginVersion": "11.4.0", "targets": [ <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) { "expr": "vllm:num_requests_waiting", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } ] }, @@ -711,6 +847,7 @@ "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } +<<<<<<< HEAD }, "pluginVersion": "11.4.0", "targets": [ @@ -907,6 +1044,165 @@ "range": true } >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update 
requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "vllm:gpu_cache_usage_perc", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } + ] + }, + { + "id": 13, + "type": "timeseries", + "title": "GPU KV Cache Hit Rate", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "vllm:gpu_prefix_cache_hit_rate", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } + ] + }, + { + "id": 103, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 }, + "collapsed": false, + "title": "Current Resource Usage", + "type": "row", + "note": "Metrics for GPU, CPU, Memory and Disk usage." + }, + { + "id": 14, + "type": "timeseries", + "title": "GPU Usage", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 42 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "node_gpu_usage_query", "editorMode": "builder", "legendFormat": "GPU Usage", "refId": "A", "range": true } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "CPU Usage", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 42 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "node_cpu_usage_query", "editorMode": "builder", "legendFormat": "CPU Usage", "refId": "A", "range": true } + ] + }, + { + "id": 16, + "type": "timeseries", + "title": "Memory Usage", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 42 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": 
"node_memory_usage_query", "editorMode": "builder", "legendFormat": "Memory Usage", "refId": "A", "range": true } + ] + }, + { + "id": 17, + "type": "timeseries", + "title": "Disk Usage", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 42 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { "expr": "node_disk_usage_query", "editorMode": "builder", "legendFormat": "Disk Usage", "refId": "A", "range": true } +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] } ], @@ -918,6 +1214,7 @@ "time": { "from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "browser", +<<<<<<< HEAD <<<<<<< HEAD "title": "vLLM Dashboard", "uid": "750918234", @@ -927,5 +1224,10 @@ "uid": "ee9i0i4y606psc", "version": 18, >>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) +======= + "title": "vLLM Dashboard", + "uid": "750918234", + "version": 20, +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "weekStart": "" } diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py index 654fa99a..89b7236b 100644 --- a/src/vllm_router/router.py +++ b/src/vllm_router/router.py @@ -50,7 +50,6 @@ async def lifespan(app: FastAPI): vnum_requests_running = Gauge( "vllm:num_requests_running", "Number of running requests", ["server"] ) -<<<<<<< HEAD current_qps = Gauge("vllm:current_qps", "Current Queries Per Second", ["server"]) avg_decoding_length = Gauge( "vllm:avg_decoding_length", "Average Decoding Length", ["server"] @@ -68,12 +67,6 @@ async def lifespan(app: FastAPI): async def process_request( method, header, body, backend_url, request_id, endpoint, debug_request=None ): -======= - -# --- Request Processing & Routing --- -# TODO: better request id system -async def process_request(method, header, body, backend_url, request_id, endpoint, debug_request=None): ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) """ Async generator to stream data from the backend server to the client. """ @@ -105,15 +98,10 @@ async def process_request(method, header, body, backend_url, request_id, endpoin yield chunk GetRequestStatsMonitor().on_request_complete(backend_url, request_id, time.time()) -<<<<<<< HEAD logger.info(f"Completed request {request_id} for backend {backend_url}") # Optional debug logging can be enabled here. # logger.debug(f"Finished the request with id: {debug_request.headers.get('x-request-id', None)} at {time.time()}") -======= - # Optional debug logging can be enabled here. 
- # logger.debug(f"Finished the request with id: {debug_request.headers.get('x-request-id', None)} at {time.time()}") ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def route_general_request(request: Request, endpoint: str): """ @@ -172,13 +160,8 @@ async def route_general_request(request: Request, endpoint: str): media_type="text/event-stream", ) -<<<<<<< HEAD @app.post("/v1/files") -======= -# --- File Endpoints --- -@app.post("/files") ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def route_files(request: Request): """Handle file upload requests that include a purpose and file data.""" form = await request.form() @@ -204,12 +187,8 @@ async def route_files(request: Request): status_code=500, content={"error": f"Failed to save file: {str(e)}"} ) -<<<<<<< HEAD @app.get("/v1/files/{file_id}") -======= -@app.get("/files/{file_id}") ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def route_get_file(file_id: str): try: storage: Storage = app.state.batch_storage @@ -220,12 +199,8 @@ async def route_get_file(file_id: str): status_code=404, content={"error": f"File {file_id} not found"} ) -<<<<<<< HEAD @app.get("/v1/files/{file_id}/content") -======= -@app.get("/files/{file_id}/content") ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def route_get_file_content(file_id: str): try: # TODO(gaocegege): Stream the file content with chunks to support @@ -238,7 +213,6 @@ async def route_get_file_content(file_id: str): status_code=404, content={"error": f"File {file_id} not found"} ) -<<<<<<< HEAD @app.post("/v1/batches") async def route_batches(request: Request): @@ -339,14 +313,6 @@ async def route_chat_completition(request: Request): @app.post("/v1/completions") -======= -# --- API Endpoints --- -@app.post("/chat/completions") -async def route_chat_completition(request: Request): - return await route_general_request(request, "/v1/chat/completions") - -@app.post("/completions") ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def route_completition(request: Request): return await route_general_request(request, "/v1/completions") @@ -355,12 +321,8 @@ async def show_version(): ver = {"version": STACK_VERSION} return JSONResponse(content=ver) -<<<<<<< HEAD @app.get("/v1/models") 
-======= -@app.get("/models") ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) async def show_models(): endpoints = GetServiceDiscovery().get_endpoint_info() existing_models = set() @@ -398,7 +360,6 @@ async def health() -> Response: async def metrics(): return Response(generate_latest(), media_type="text/plain") -<<<<<<< HEAD # --- Prometheus Metrics Endpoint (v2 observation/tracking) --- @app.get("/metrics") async def metrics(): @@ -415,8 +376,6 @@ async def metrics(): return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST) -======= ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) # --- Argument Parsing and Initialization --- def validate_args(args): if args.service_discovery == "static": @@ -628,7 +587,6 @@ def log_stats(interval: int = 10): request_stats = GetRequestStatsMonitor().get_request_stats(time.time()) for endpoint in endpoints: url = endpoint.url -<<<<<<< HEAD logstr += f"Model: {endpoint.model_name}\n" logstr += f"Server: {url}\n" if url in engine_stats: @@ -637,23 +595,10 @@ def log_stats(interval: int = 10): f" Engine Stats (Dashboard): Running Requests: {es.num_running_requests}, " f"Queueing Delay (requests): {es.num_queuing_requests}, " f"GPU Cache Hit Rate: {es.gpu_cache_hit_rate:.2f}\n" -======= - - logstr += f"Model: {endpoint.model_name}\n" - logstr += f"Server: {url}\n" - if url in engine_stats: - num_running_requests = engine_stats[url].num_running_requests - num_queing_requests = engine_stats[url].num_queuing_requests - gpu_cache_hit_rate = engine_stats[url].gpu_cache_hit_rate - logstr += ( - f" Engine stats: {num_running_requests} running requests, " - f"{num_queing_requests} queuing requests, {gpu_cache_hit_rate:.2f} GPU cache hit rate\n" ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) ) else: logstr += " Engine Stats: No stats available\n" if url in request_stats: -<<<<<<< HEAD rs = request_stats[url] logstr += ( f" Request Stats (Dashboard): Current QPS: {rs.qps:.2f}, " @@ -663,20 +608,6 @@ def log_stats(interval: int = 10): f"Finished Requests: {rs.finished_requests}, " f"Uptime: {rs.uptime:.2f} sec\n" ) -======= - qps = request_stats[url].qps - num_requests = request_stats[url].ttft - in_prefill_requests = request_stats[url].in_prefill_requests - in_decoding_requets = request_stats[url].in_decoding_requests - finished_requests = request_stats[url].finished_requests - uptime = request_stats[url].uptime - logstr += ( - f" Request Stats: {qps:.2f} QPS, {num_requests} TTFT, " - f"{in_prefill_requests} in prefill, {in_decoding_requets} in decoding, " - f"{finished_requests} finished, uptime {uptime:.2f} seconds\n" - ) - vnum_requests_running.labels(server=url).set(qps) ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, 
operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) else: logstr += " Request Stats: No stats available\n" logstr += "-" * 50 + "\n" diff --git a/src/vllm_router/run-router.sh b/src/vllm_router/run-router.sh index 4137a46f..31ad8f4f 100755 --- a/src/vllm_router/run-router.sh +++ b/src/vllm_router/run-router.sh @@ -4,6 +4,7 @@ if [[ $# -ne 1 ]]; then exit 1 fi +<<<<<<< HEAD # Use this command when testing with k8s service discovery # python3 -m vllm_router.router --port "$1" \ # --service-discovery k8s \ @@ -13,6 +14,16 @@ fi # --session-key "x-user-id" \ # --engine-stats-interval 10 \ # --log-stats +======= +python3 vllm_router/router.py --port "$1" \ + --service-discovery k8s \ + --k8s-label-selector release=test \ + --k8s-namespace default \ + --routing-logic session \ + --session-key "x-user-id" \ + --engine-stats-interval 10 \ + --log-stats +>>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) # Use this command when testing with static service discovery python3 -m vllm_router.router --port "$1" \ From ef19047e24c28cebf89bb0dae7737e46b0fd7b51 Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Thu, 13 Feb 2025 00:30:29 +0700 Subject: [PATCH 07/15] run pre-commit for linting Signed-off-by: sitloboi2012 --- src/tests/requirements.txt | 4 ++ src/vllm_router/router.py | 103 ++++++++++++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/src/tests/requirements.txt b/src/tests/requirements.txt index a5a09113..2f92ce9f 100644 --- a/src/tests/requirements.txt +++ b/src/tests/requirements.txt @@ -1,6 +1,10 @@ fastapi httpx +<<<<<<< HEAD openai uvicorn +======= +>>>>>>> 2ba572e (run pre-commit for linting) openai +uvicorn vllm diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py index 89b7236b..25ab1ecb 100644 --- a/src/vllm_router/router.py +++ b/src/vllm_router/router.py @@ -94,7 +94,9 @@ async def process_request( total_len += len(chunk) if not first_token: first_token = True - GetRequestStatsMonitor().on_request_response(backend_url, request_id, time.time()) + GetRequestStatsMonitor().on_request_response( + backend_url, request_id, time.time() + ) yield chunk GetRequestStatsMonitor().on_request_complete(backend_url, request_id, time.time()) @@ -188,6 +190,7 @@ async def route_files(request: Request): ) + @app.get("/v1/files/{file_id}") async def route_get_file(file_id: str): try: @@ -200,6 +203,7 @@ async def route_get_file(file_id: str): ) + @app.get("/v1/files/{file_id}/content") async def route_get_file_content(file_id: str): try: @@ -307,6 +311,100 @@ async def route_cancel_batch(batch_id: str): ) +@app.post("/v1/batches") +async def route_batches(request: Request): + """Handle batch requests that process files with specified endpoints.""" + try: + request_json = await request.json() + + # Validate required fields + if "input_file_id" not in request_json: + return JSONResponse( + status_code=400, + content={"error": "Missing required parameter 'input_file_id'"}, + ) + if "endpoint" not in request_json: + return JSONResponse( + status_code=400, + content={"error": "Missing required parameter 'endpoint'"}, + ) + + # Verify file exists + storage: Storage = app.state.batch_storage + file_id = request_json["input_file_id"] + try: + await storage.get_file(file_id) + except FileNotFoundError: + return JSONResponse( + status_code=404, 
content={"error": f"File {file_id} not found"} + ) + + batch_processor: BatchProcessor = app.state.batch_processor + batch = await batch_processor.create_batch( + input_file_id=file_id, + endpoint=request_json["endpoint"], + completion_window=request_json.get("completion_window", "5s"), + metadata=request_json.get("metadata", None), + ) + + # Return metadata as attribute, not a callable. + return JSONResponse(content=batch.to_dict()) + + except Exception as e: + return JSONResponse( + status_code=500, + content={"error": f"Failed to process batch request: {str(e)}"}, + ) + + +@app.get("/v1/batches/{batch_id}") +async def route_get_batch(batch_id: str): + try: + batch_processor: BatchProcessor = app.state.batch_processor + batch = await batch_processor.retrieve_batch(batch_id) + return JSONResponse(content=batch.to_dict()) + except FileNotFoundError: + return JSONResponse( + status_code=404, content={"error": f"Batch {batch_id} not found"} + ) + + +@app.get("/v1/batches") +async def route_list_batches(limit: int = 20, after: str = None): + try: + batch_processor: BatchProcessor = app.state.batch_processor + batches = await batch_processor.list_batches(limit=limit, after=after) + + # Convert batches to response format + batch_data = [batch.to_dict() for batch in batches] + + response = { + "object": "list", + "data": batch_data, + "first_id": batch_data[0]["id"] if batch_data else None, + "last_id": batch_data[-1]["id"] if batch_data else None, + "has_more": len(batch_data) + == limit, # If we got limit items, there may be more + } + + return JSONResponse(content=response) + except FileNotFoundError: + return JSONResponse(status_code=404, content={"error": "No batches found"}) + + +@app.delete("/v1/batches/{batch_id}") +async def route_cancel_batch(batch_id: str): + try: + batch_processor: BatchProcessor = app.state.batch_processor + batch = await batch_processor.cancel_batch(batch_id) + return JSONResponse(content=batch.to_dict()) + except FileNotFoundError: + return JSONResponse( + status_code=404, content={"error": f"Batch {batch_id} not found"} + ) + + + @app.post("/v1/chat/completions") async def route_chat_completition(request: Request): return await route_general_request(request, "/v1/chat/completions") @@ -316,6 +414,7 @@ async def route_chat_completition(request: Request): async def route_completition(request: Request): return await route_general_request(request, "/v1/completions") + @app.get("/version") async def show_version(): ver = {"version": STACK_VERSION} @@ -342,6 +441,7 @@ async def show_models(): model_list = ModelList(data=model_cards) return JSONResponse(content=model_list.model_dump()) + @app.get("/health") async def health() -> Response: """Health check: verifies that service discovery and engine stats scraping are operational.""" @@ -355,6 +455,7 @@ async def health() -> Response: ) return Response(status_code=200) + # --- Prometheus Metrics Endpoint (v2 observation/tracking) --- @app.get("/metrics") async def metrics(): From 61efe0c3d80fe13d6bcae0c2df0d21e2259ace39 Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Thu, 13 Feb 2025 10:40:29 +0700 Subject: [PATCH 08/15] remove conflict info data in router.py Signed-off-by: sitloboi2012 --- src/vllm_router/router.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py index 25ab1ecb..60af6f40 100644 --- a/src/vllm_router/router.py +++ b/src/vllm_router/router.py @@ -461,6 +461,7 @@ async def health() -> Response: async def metrics(): return Response(generate_latest(), 
media_type="text/plain") + # --- Prometheus Metrics Endpoint (v2 observation/tracking) --- @app.get("/metrics") async def metrics(): From 14669bcfba30e50d3bd0cbd8aef1050b1a455ec1 Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Thu, 13 Feb 2025 10:42:20 +0700 Subject: [PATCH 09/15] running pre-commit to make sure linting pass Signed-off-by: sitloboi2012 --- observability/README.md | 3 --- src/tests/requirements.txt | 5 ----- 2 files changed, 8 deletions(-) diff --git a/observability/README.md b/observability/README.md index 2fe4d03d..9b79d73e 100644 --- a/observability/README.md +++ b/observability/README.md @@ -17,10 +17,7 @@ Make sure to have: - Or follow our [tutorial](tutorials/00-install-kubernetes-env.md) After that you can run: -<<<<<<< HEAD -======= ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) ```bash sudo bash install.sh ``` diff --git a/src/tests/requirements.txt b/src/tests/requirements.txt index 2f92ce9f..743b58aa 100644 --- a/src/tests/requirements.txt +++ b/src/tests/requirements.txt @@ -1,10 +1,5 @@ fastapi httpx -<<<<<<< HEAD -openai -uvicorn -======= ->>>>>>> 2ba572e (run pre-commit for linting) openai uvicorn vllm From 5e4bf857de3d9b674cc2fca54d9868ba39469223 Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Thu, 13 Feb 2025 11:59:01 +0700 Subject: [PATCH 10/15] update vllm dashboard to remove redundancy Signed-off-by: sitloboi2012 --- observability/vllm-dashboard.json | 742 ++++++------------------------ 1 file changed, 135 insertions(+), 607 deletions(-) diff --git a/observability/vllm-dashboard.json b/observability/vllm-dashboard.json index d0334e40..d3917f29 100644 --- a/observability/vllm-dashboard.json +++ b/observability/vllm-dashboard.json @@ -25,15 +25,7 @@ "id": 100, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "collapsed": false, -<<<<<<< HEAD -<<<<<<< HEAD "title": "Overview of the system", -======= - "title": "Core vLLM Metrics", ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= - "title": "Overview of the system", ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "type": "row" }, { @@ -63,15 +55,7 @@ "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", -<<<<<<< HEAD -<<<<<<< HEAD - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, -======= "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "showPercentChange": false, "textMode": "auto", "wideLayout": true @@ -110,19 +94,16 @@ }, "options": { 
"displayMode": "gradient", - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, "showUnfilled": true, "sizing": "auto", "valueMode": "color", -<<<<<<< HEAD -<<<<<<< HEAD - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } -======= "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) }, "pluginVersion": "11.4.0", "targets": [ @@ -135,22 +116,6 @@ "range": true } ] -<<<<<<< HEAD - }, - { - "id": 101, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, - "collapsed": false, - "title": "QoS Information", - "type": "row" - }, - { - "id": 3, - "type": "stat", - "title": "Current QPS", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 0, "y": 9 }, -======= }, { "id": 101, @@ -184,13 +149,19 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ - { "expr": "vllm:current_qps", "editorMode": "builder", "legendFormat": "Current QPS", "refId": "A", "range": true } + { + "expr": "vllm:current_qps", + "editorMode": "builder", + "legendFormat": "Current QPS", + "refId": "A", + "range": true + } ] }, { @@ -198,12 +169,7 @@ "type": "bargauge", "title": "Request TTFT distribution", "datasource": { "type": "prometheus", "uid": "prometheus" }, -<<<<<<< HEAD - "gridPos": { "h": 7, "w": 9, "x": 15, "y": 1 }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= "gridPos": { "h": 7, "w": 6, "x": 6, "y": 9 }, ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -219,40 +185,20 @@ "overrides": [] }, "options": { -<<<<<<< HEAD - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "vllm:current_qps", "editorMode": "builder", "legendFormat": "Current QPS", "refId": "A", "range": true } - ] - }, - { - "id": 4, - "type": "bargauge", - "title": "Request TTFT distribution", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 6, "y": 9 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, -======= "displayMode": "gradient", - "legend": { "calcs": [], 
"displayMode": "list", "placement": "bottom", "showLegend": false }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, "showUnfilled": true, "sizing": "auto", "valueMode": "color", - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } }, "pluginVersion": "11.4.0", "targets": [ -<<<<<<< HEAD { "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", "editorMode": "builder", @@ -263,94 +209,15 @@ } ] }, - { - "id": 101, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, - "collapsed": false, - "title": "Operational Metrics", - "type": "row", - "note": "Metrics related to the operational state of the vLLM instances." - }, - { - "id": 4, - "type": "timeseries", - "title": "Number of running requests", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "options": { -<<<<<<< HEAD - "displayMode": "gradient", - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, - "showUnfilled": true, - "sizing": "auto", - "valueMode": "color", - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", "editorMode": "builder", "legendFormat": "__auto", "refId": "A", "range": true } -======= - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "expr": "vllm:num_requests_running", - "editorMode": "builder", - "legendFormat": "{{instance}}", - "refId": "A", - "range": true - } ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= - { "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", "editorMode": "builder", "legendFormat": "__auto", "refId": "A", "range": true } ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) - ] - }, { "id": 5, -<<<<<<< HEAD -<<<<<<< HEAD - "type": "stat", - "title": "Router-side Queueing Delay", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 12, "y": 9 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, -======= - "type": "timeseries", - "title": "GPU KV Usage percent", -======= "type": "stat", "title": "Router-side Queueing Delay", ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "datasource": { "type": "prometheus", "uid": "prometheus" }, 
"gridPos": { "h": 7, "w": 6, "x": 12, "y": 9 }, "fieldConfig": { "defaults": { -<<<<<<< HEAD - "color": { "mode": "palette-classic" }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= "color": { "mode": "thresholds" }, ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "mappings": [], "thresholds": { "mode": "absolute", @@ -363,72 +230,34 @@ "overrides": [] }, "options": { -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true -<<<<<<< HEAD - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "vllm:router_queueing_delay_seconds", "editorMode": "builder", "legendFormat": "Queueing Delay", "refId": "A", "range": true } -======= - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:gpu_cache_usage_perc", + "expr": "vllm:router_queueing_delay_seconds", "editorMode": "builder", - "legendFormat": "{{instance}}", + "legendFormat": "Queueing Delay", "refId": "A", "range": true } ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "vllm:router_queueing_delay_seconds", "editorMode": "builder", "legendFormat": "Queueing Delay", "refId": "A", "range": true } ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, { "id": 6, -<<<<<<< HEAD -<<<<<<< HEAD - "type": "stat", - "title": "Average Prefill Length", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 18, "y": 9 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, -======= - "type": "timeseries", - "title": "Number of pending requests", -======= "type": "stat", "title": "Average Prefill Length", ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 7, "w": 6, "x": 18, "y": 9 }, "fieldConfig": { "defaults": { -<<<<<<< HEAD - "color": { "mode": "palette-classic" }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= "color": { "mode": "thresholds" }, ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "mappings": [], "thresholds": { 
"mode": "absolute", @@ -441,62 +270,31 @@ "overrides": [] }, "options": { -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true -<<<<<<< HEAD - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "vllm:avg_prefill_length", "editorMode": "builder", "legendFormat": "Avg. Prefill Length", "refId": "A", "range": true } -======= - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:num_requests_waiting", + "expr": "vllm:avg_prefill_length", "editorMode": "builder", - "legendFormat": "{{instance}}", + "legendFormat": "Avg. Prefill Length", "refId": "A", "range": true } ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "vllm:avg_prefill_length", "editorMode": "builder", "legendFormat": "Avg. Prefill Length", "refId": "A", "range": true } ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, { "id": 7, "type": "timeseries", -<<<<<<< HEAD -<<<<<<< HEAD "title": "Number of Prefilling Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 16 }, -======= - "title": "GPU KV cache hit rate", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= - "title": "Number of Prefilling Requests", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 8, "x": 0, "y": 16 }, ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -512,62 +310,23 @@ "overrides": [] }, "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ -<<<<<<< HEAD -<<<<<<< HEAD - { "expr": "vllm:num_prefill_requests", "editorMode": "builder", "legendFormat": "Prefilling Requests", "refId": "A", "range": true } - ] - }, - { - "id": 8, - "type": "timeseries", - "title": "Number of Decoding Requests", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 8, "x": 8, "y": 16 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", 
"value": 80 } - ] - } + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true }, - "overrides": [] - }, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ - { "expr": "vllm:num_decoding_requests", "editorMode": "builder", "legendFormat": "Decoding Requests", "refId": "A", "range": true } - ] - }, - { - "id": 9, - "type": "stat", - "title": "Average Decoding Length", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 8, "x": 16, "y": 16 }, -======= { - "expr": "vllm:gpu_prefix_cache_hit_rate", + "expr": "vllm:num_prefill_requests", "editorMode": "builder", - "legendFormat": "{{instance}}", + "legendFormat": "Prefilling Requests", "refId": "A", "range": true } -======= - { "expr": "vllm:num_prefill_requests", "editorMode": "builder", "legendFormat": "Prefilling Requests", "refId": "A", "range": true } ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, { @@ -575,12 +334,7 @@ "type": "timeseries", "title": "Number of Decoding Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, -<<<<<<< HEAD - "gridPos": { "h": 7, "w": 6, "x": 0, "y": 26 }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= "gridPos": { "h": 7, "w": 8, "x": 8, "y": 16 }, ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -596,48 +350,23 @@ "overrides": [] }, "options": { -<<<<<<< HEAD - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", -<<<<<<< HEAD - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, -======= - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) - "textMode": "auto", - "wideLayout": true + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ -<<<<<<< HEAD - { "expr": "vllm:avg_decoding_length", "editorMode": "builder", "legendFormat": "Avg. Decoding Length", "refId": "A", "range": true } - ] - }, - { - "id": 102, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, - "collapsed": false, - "title": "Serving Engine Load", - "type": "row", - "note": "Metrics indicating the load on the serving engine." 
-======= { - "expr": "vllm:current_qps", + "expr": "vllm:num_decoding_requests", "editorMode": "builder", - "legendFormat": "Current QPS", + "legendFormat": "Decoding Requests", "refId": "A", "range": true } -======= - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "vllm:num_decoding_requests", "editorMode": "builder", "legendFormat": "Decoding Requests", "refId": "A", "range": true } ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, { @@ -660,41 +389,6 @@ }, "overrides": [] }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "vllm:avg_decoding_length", "editorMode": "builder", "legendFormat": "Avg. Decoding Length", "refId": "A", "range": true } - ] - }, - { -<<<<<<< HEAD - "id": 12, - "type": "stat", - "title": "Average Prefill Length", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 12, "y": 26 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, "options": { "colorMode": "value", "graphMode": "area", @@ -707,41 +401,28 @@ "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:avg_prefill_length", + "expr": "vllm:avg_decoding_length", "editorMode": "builder", - "legendFormat": "Avg. Prefill Length", + "legendFormat": "Avg. Decoding Length", "refId": "A", "range": true } ] ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= + }, + { "id": 102, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, "collapsed": false, "title": "Serving Engine Load", "type": "row", "note": "Metrics indicating the load on the serving engine." 
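Editor's note: most panels in this dashboard share identical `fieldConfig`/`options` boilerplate, which is what made the duplicated conflict blocks above so painful to untangle. One way to keep the JSON consistent is to emit panels from a small generator; a sketch mirroring the shapes used here (not part of the repo):

```python
import json

def timeseries_panel(panel_id: int, title: str, expr: str,
                     x: int, y: int, w: int = 12, h: int = 8) -> dict:
    """Build one timeseries panel using this dashboard's shared defaults."""
    return {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": {"type": "prometheus", "uid": "prometheus"},
        "gridPos": {"h": h, "w": w, "x": x, "y": y},
        "fieldConfig": {
            "defaults": {
                "color": {"mode": "palette-classic"},
                "mappings": [],
                "thresholds": {
                    "mode": "absolute",
                    "steps": [
                        {"color": "green", "value": None},
                        {"color": "red", "value": 80},
                    ],
                },
            },
            "overrides": [],
        },
        "options": {
            "legend": {"calcs": [], "displayMode": "list",
                       "placement": "right", "showLegend": True},
            "tooltip": {"mode": "single", "sort": "none"},
        },
        "pluginVersion": "11.4.0",
        "targets": [{"expr": expr, "editorMode": "builder",
                     "legendFormat": "{{instance}}", "refId": "A", "range": True}],
    }

print(json.dumps(
    timeseries_panel(10, "Number of Running Requests",
                     "vllm:num_requests_running", x=0, y=24),
    indent=2))
```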
->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) }, { "id": 10, "type": "timeseries", -<<<<<<< HEAD -<<<<<<< HEAD "title": "Number of Running Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, -======= - "title": "Number of Prefilling Requests", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 18, "y": 26 }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= - "title": "Number of Running Requests", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -757,46 +438,31 @@ "overrides": [] }, "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ -<<<<<<< HEAD -<<<<<<< HEAD - { "expr": "vllm:num_requests_running", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } -======= { - "expr": "vllm:num_prefill_requests", + "expr": "vllm:num_requests_running", "editorMode": "builder", - "legendFormat": "Prefilling Requests", + "legendFormat": "{{instance}}", "refId": "A", "range": true } ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= - { "expr": "vllm:num_requests_running", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) ] }, { "id": 11, "type": "timeseries", -<<<<<<< HEAD -<<<<<<< HEAD - "title": "Number of Pending Requests", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, -======= - "title": "Number of Decoding Requests", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 16, "x": 0, "y": 33 }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= "title": "Number of Pending Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -812,16 +478,23 @@ "overrides": [] }, "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", 
"showLegend": true }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) - { "expr": "vllm:num_requests_waiting", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } + { + "expr": "vllm:num_requests_waiting", + "editorMode": "builder", + "legendFormat": "{{instance}}", + "refId": "A", + "range": true + } ] }, { @@ -845,27 +518,27 @@ "overrides": [] }, "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, "tooltip": { "mode": "single", "sort": "none" } -<<<<<<< HEAD }, "pluginVersion": "11.4.0", "targets": [ - { "expr": "vllm:gpu_cache_usage_perc", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } -======= { - "expr": "vllm:num_decoding_requests", + "expr": "vllm:gpu_cache_usage_perc", "editorMode": "builder", - "legendFormat": "Decoding Requests", + "legendFormat": "{{instance}}", "refId": "A", "range": true } ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) ] }, { "id": 13, -<<<<<<< HEAD "type": "timeseries", "title": "GPU KV Cache Hit Rate", "datasource": { "type": "prometheus", "uid": "prometheus" }, @@ -873,111 +546,6 @@ "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, -======= - "type": "stat", - "title": "Average Decoding Length", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 8, "x": 16, "y": 33 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "options": { -<<<<<<< HEAD - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "vllm:gpu_prefix_cache_hit_rate", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } - ] - }, - { - "id": 103, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 }, - "collapsed": false, - "title": "Current Resource Usage", - "type": "row", - "note": "Metrics for GPU, CPU, Memory and Disk usage." 
- }, - { - "id": 14, - "type": "timeseries", - "title": "GPU Usage", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 6, "x": 0, "y": 42 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "node_gpu_usage_query", "editorMode": "builder", "legendFormat": "GPU Usage", "refId": "A", "range": true } - ] - }, - { - "id": 15, - "type": "timeseries", - "title": "CPU Usage", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 6, "x": 6, "y": 42 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "node_cpu_usage_query", "editorMode": "builder", "legendFormat": "CPU Usage", "refId": "A", "range": true } - ] - }, - { - "id": 16, - "type": "timeseries", - "title": "Memory Usage", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 6, "x": 12, "y": 42 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", @@ -990,94 +558,23 @@ "overrides": [] }, "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "node_memory_usage_query", "editorMode": "builder", "legendFormat": "Memory Usage", "refId": "A", "range": true } - ] - }, - { - "id": 17, - "type": "timeseries", - "title": "Disk Usage", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 6, "x": 18, "y": 42 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true }, - "overrides": [] - }, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", - "targets": [ - { "expr": "node_disk_usage_query", "editorMode": "builder", "legendFormat": "Disk Usage", "refId": "A", "range": true } -======= - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:avg_decoding_length", + "expr": "vllm:gpu_prefix_cache_hit_rate", "editorMode": "builder", - "legendFormat": "Avg. 
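Editor's note: the four resource panels reference `node_*_usage_query` expressions that read as placeholders rather than real series. Assuming the kube-prom-stack scrapes node-exporter (and an NVIDIA DCGM exporter for GPUs), plausible substitutes would be:

```python
# Candidate PromQL for the placeholder expressions in the resource panels,
# assuming node-exporter and an NVIDIA DCGM exporter are being scraped.
RESOURCE_QUERIES = {
    # Percent of CPU time spent non-idle, averaged across cores.
    "node_cpu_usage_query":
        '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])))',
    # Percent of physical memory in use.
    "node_memory_usage_query":
        "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)",
    # Percent of the root filesystem in use.
    "node_disk_usage_query":
        '100 * (1 - node_filesystem_avail_bytes{mountpoint="/"} '
        '/ node_filesystem_size_bytes{mountpoint="/"})',
    # GPU utilization as reported by DCGM.
    "node_gpu_usage_query": "DCGM_FI_DEV_GPU_UTIL",
}
```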
Decoding Length", + "legendFormat": "{{instance}}", "refId": "A", "range": true } ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "vllm:gpu_cache_usage_perc", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } - ] - }, - { - "id": 13, - "type": "timeseries", - "title": "GPU KV Cache Hit Rate", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { "expr": "vllm:gpu_prefix_cache_hit_rate", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", "range": true } ] }, { @@ -1109,12 +606,23 @@ "overrides": [] }, "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ - { "expr": "node_gpu_usage_query", "editorMode": "builder", "legendFormat": "GPU Usage", "refId": "A", "range": true } + { + "expr": "node_gpu_usage_query", + "editorMode": "builder", + "legendFormat": "GPU Usage", + "refId": "A", + "range": true + } ] }, { @@ -1138,12 +646,23 @@ "overrides": [] }, "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ - { "expr": "node_cpu_usage_query", "editorMode": "builder", "legendFormat": "CPU Usage", "refId": "A", "range": true } + { + "expr": "node_cpu_usage_query", + "editorMode": "builder", + "legendFormat": "CPU Usage", + "refId": "A", + "range": true + } ] }, { @@ -1167,12 +686,23 @@ "overrides": [] }, "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ - { "expr": "node_memory_usage_query", "editorMode": "builder", "legendFormat": "Memory Usage", "refId": "A", "range": true } + { + "expr": "node_memory_usage_query", + "editorMode": "builder", + "legendFormat": "Memory Usage", + "refId": "A", + "range": true + } ] }, { @@ -1196,13 +726,23 @@ "overrides": [] }, "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ - { 
"expr": "node_disk_usage_query", "editorMode": "builder", "legendFormat": "Disk Usage", "refId": "A", "range": true } ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) + { + "expr": "node_disk_usage_query", + "editorMode": "builder", + "legendFormat": "Disk Usage", + "refId": "A", + "range": true + } ] } ], @@ -1214,20 +754,8 @@ "time": { "from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "browser", -<<<<<<< HEAD -<<<<<<< HEAD - "title": "vLLM Dashboard", - "uid": "750918234", - "version": 20, -======= - "title": "vllm dashboard", - "uid": "ee9i0i4y606psc", - "version": 18, ->>>>>>> cccef7d (update vllm-dashboard and router to contain add on metrics such as coree vLLM metrics, operational metrics, router observe metrics, update requirement.txt for router and tests, update install-minikube-cluster to be more logging info, restart docker service and minikube context after the run) -======= "title": "vLLM Dashboard", "uid": "750918234", "version": 20, ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) "weekStart": "" } From 90e885c9f106b237e1972b148580b849353a9b5c Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Thu, 13 Feb 2025 12:14:18 +0700 Subject: [PATCH 11/15] remove redundancy in run-router.sh Signed-off-by: sitloboi2012 --- src/vllm_router/run-router.sh | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/vllm_router/run-router.sh b/src/vllm_router/run-router.sh index 31ad8f4f..4137a46f 100755 --- a/src/vllm_router/run-router.sh +++ b/src/vllm_router/run-router.sh @@ -4,7 +4,6 @@ if [[ $# -ne 1 ]]; then exit 1 fi -<<<<<<< HEAD # Use this command when testing with k8s service discovery # python3 -m vllm_router.router --port "$1" \ # --service-discovery k8s \ @@ -14,16 +13,6 @@ fi # --session-key "x-user-id" \ # --engine-stats-interval 10 \ # --log-stats -======= -python3 vllm_router/router.py --port "$1" \ - --service-discovery k8s \ - --k8s-label-selector release=test \ - --k8s-namespace default \ - --routing-logic session \ - --session-key "x-user-id" \ - --engine-stats-interval 10 \ - --log-stats ->>>>>>> 694f804 (update the router and vllm-dashboard to align with the reference from @YuhanLiu11) # Use this command when testing with static service discovery python3 -m vllm_router.router --port "$1" \ From d5ebb35cb3d0750bf930c0dffd24bf70eea184fa Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Thu, 13 Feb 2025 22:39:16 +0700 Subject: [PATCH 12/15] update the average itl, and other metrics into router.py, engine_stats.py and request_stats.py, update observe/install.sh to helm update to solve the problem already run helm service same name, update dashboard layout with latest metrics, update request_generator to align with the new endpoint /v1 Signed-off-by: sitloboi2012 --- observability/install.sh | 2 +- observability/vllm-dashboard.json | 311 +++++++++++------------- src/tests/perftest/request_generator.py | 2 +- src/vllm_router/engine_stats.py | 65 +++-- src/vllm_router/request_stats.py | 154 +++++++----- src/vllm_router/router.py | 120 +++++---- src/vllm_router/run-router.sh | 5 +- 7 files changed, 338 insertions(+), 321 deletions(-) diff --git a/observability/install.sh b/observability/install.sh index 93340993..bf606544 100644 --- a/observability/install.sh +++ b/observability/install.sh @@ -1,7 +1,7 @@ #!/bin/bash helm repo add prometheus-community https://prometheus-community.github.io/helm-charts -helm install kube-prom-stack 
prometheus-community/kube-prometheus-stack \ +helm upgrade --install kube-prom-stack prometheus-community/kube-prometheus-stack \ --namespace monitoring \ --create-namespace \ -f values.yaml diff --git a/observability/vllm-dashboard.json b/observability/vllm-dashboard.json index d3917f29..c3201efd 100644 --- a/observability/vllm-dashboard.json +++ b/observability/vllm-dashboard.json @@ -25,16 +25,16 @@ "id": 100, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "collapsed": false, - "title": "Overview of the system", + "title": "Overview System Performance", "type": "row" }, { "id": 1, "type": "stat", "title": "Available vLLM instances", - "description": "Number of healthy vLLM instances", + "description": "Number of healthy vLLM instances (by instance usage)", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 0, "y": 1 }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 1 }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -72,12 +72,53 @@ } ] }, + { + "id": 19, + "type": "stat", + "title": "Average Latency", + "description": "Average end-to-end request latency in seconds", + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 1 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "avg(vllm:e2e_request_latency_seconds_sum) / avg(vllm:e2e_request_latency_seconds_count)", + "editorMode": "builder", + "legendFormat": "Avg Latency", + "refId": "A", + "range": true + } + ] + }, { "id": 2, "type": "bargauge", "title": "Request latency distribution", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 9, "x": 6, "y": 1 }, + "gridPos": { "h": 7, "w": 24, "x": 0, "y": 8 }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -119,7 +160,7 @@ }, { "id": 101, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }, "collapsed": false, "title": "QoS Information", "type": "row" @@ -129,7 +170,7 @@ "type": "stat", "title": "Current QPS", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 0, "y": 9 }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 16 }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -164,57 +205,12 @@ } ] }, - { - "id": 4, - "type": "bargauge", - "title": "Request TTFT distribution", - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 6, "y": 9 }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": false - }, - "showUnfilled": true, - "sizing": "auto", - "valueMode": "color", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", 
"values": false } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", - "editorMode": "builder", - "format": "heatmap", - "legendFormat": "__auto", - "refId": "A", - "range": true - } - ] - }, { "id": 5, "type": "stat", "title": "Router-side Queueing Delay", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 12, "y": 9 }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 16 }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -254,7 +250,7 @@ "type": "stat", "title": "Average Prefill Length", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 6, "x": 18, "y": 9 }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, @@ -290,54 +286,55 @@ ] }, { - "id": 7, - "type": "timeseries", - "title": "Number of Prefilling Requests", + "id": 20, + "type": "stat", + "title": "Average ITL", + "description": "Average Inter-Token Latency", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 8, "x": 0, "y": 16 }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, + "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, - { "color": "red", "value": 80 } + { "color": "red", "value": 1 } ] } }, "overrides": [] }, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, - "tooltip": { "mode": "single", "sort": "none" } + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:num_prefill_requests", + "expr": "avg(vllm:time_per_output_token_seconds_sum) / avg(vllm:time_per_output_token_seconds_count)", "editorMode": "builder", - "legendFormat": "Prefilling Requests", + "legendFormat": "Avg ITL", "refId": "A", "range": true } ] }, { - "id": 8, - "type": "timeseries", - "title": "Number of Decoding Requests", + "id": 4, + "type": "bargauge", + "title": "Request TTFT distribution", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 8, "x": 8, "y": 16 }, + "gridPos": { "h": 7, "w": 24, "x": 0, "y": 30 }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, + "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", @@ -350,34 +347,47 @@ "overrides": [] }, "options": { + "displayMode": "gradient", "legend": { "calcs": [], "displayMode": "list", - "placement": "right", - "showLegend": true + "placement": "bottom", + "showLegend": false }, - "tooltip": { "mode": "single", "sort": "none" } + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } }, "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:num_decoding_requests", + "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", "editorMode": "builder", - "legendFormat": "Decoding Requests", + "format": "heatmap", + "legendFormat": "__auto", "refId": "A", "range": true } ] }, { - "id": 9, - "type": "stat", - "title": "Average Decoding Length", + "id": 102, + "gridPos": { "h": 1, "w": 24, 
"x": 0, "y": 37 }, + "collapsed": false, + "title": "Serving Engine Load", + "type": "row", + "note": "Metrics indicating the load on the serving engine." + }, + { + "id": 10, + "type": "timeseries", + "title": "Number of Running Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 7, "w": 8, "x": 16, "y": 16 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 38 }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", @@ -390,39 +400,26 @@ "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto", - "wideLayout": true + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:avg_decoding_length", + "expr": "vllm:num_requests_running", "editorMode": "builder", - "legendFormat": "Avg. Decoding Length", + "legendFormat": "{{instance}}", "refId": "A", "range": true } ] }, { - "id": 102, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, - "collapsed": false, - "title": "Serving Engine Load", - "type": "row", - "note": "Metrics indicating the load on the serving engine." - }, - { - "id": 10, + "id": 11, "type": "timeseries", - "title": "Number of Running Requests", + "title": "Number of Pending Requests", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 38 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -438,18 +435,13 @@ "overrides": [] }, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:num_requests_running", + "expr": "vllm:num_requests_waiting", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", @@ -458,11 +450,11 @@ ] }, { - "id": 11, + "id": 12, "type": "timeseries", - "title": "Number of Pending Requests", + "title": "GPU KV Usage Percentage", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 38 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -478,18 +470,13 @@ "overrides": [] }, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:num_requests_waiting", + "expr": "vllm:gpu_cache_usage_perc", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", @@ -498,11 +485,11 @@ ] }, { - "id": 12, + "id": 13, "type": "timeseries", - "title": "GPU KV Usage Percentage", + "title": "GPU KV Cache Hit Rate", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 46 }, "fieldConfig": { "defaults": { "color": { "mode": 
"palette-classic" }, @@ -518,18 +505,13 @@ "overrides": [] }, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:gpu_cache_usage_perc", + "expr": "vllm:gpu_prefix_cache_hit_rate", "editorMode": "builder", "legendFormat": "{{instance}}", "refId": "A", @@ -538,40 +520,41 @@ ] }, { - "id": 13, - "type": "timeseries", - "title": "GPU KV Cache Hit Rate", + "id": 21, + "type": "stat", + "title": "Number of Swapped Requests", + "description": "Requests moved from GPU to CPU", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 46 }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, + "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, - { "color": "red", "value": 80 } + { "color": "red", "value": 0 } ] } }, "overrides": [] }, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, - "tooltip": { "mode": "single", "sort": "none" } + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto", + "wideLayout": true }, "pluginVersion": "11.4.0", "targets": [ { - "expr": "vllm:gpu_prefix_cache_hit_rate", + "expr": "vllm:num_requests_swapped", "editorMode": "builder", - "legendFormat": "{{instance}}", + "legendFormat": "Swapped Requests", "refId": "A", "range": true } @@ -579,7 +562,7 @@ }, { "id": 103, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 }, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 }, "collapsed": false, "title": "Current Resource Usage", "type": "row", @@ -590,7 +573,7 @@ "type": "timeseries", "title": "GPU Usage", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 6, "x": 0, "y": 42 }, + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 56 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -606,12 +589,7 @@ "overrides": [] }, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", @@ -630,7 +608,7 @@ "type": "timeseries", "title": "CPU Usage", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 6, "x": 6, "y": 42 }, + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 56 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -646,12 +624,7 @@ "overrides": [] }, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", @@ -670,7 +643,7 @@ "type": "timeseries", "title": "Memory Usage", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 6, "x": 12, "y": 42 }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 56 }, 
"fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -686,12 +659,7 @@ "overrides": [] }, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", @@ -710,7 +678,7 @@ "type": "timeseries", "title": "Disk Usage", "datasource": { "type": "prometheus", "uid": "prometheus" }, - "gridPos": { "h": 8, "w": 6, "x": 18, "y": 42 }, + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 56 }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, @@ -726,12 +694,7 @@ "overrides": [] }, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, + "legend": { "calcs": [], "displayMode": "list", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "11.4.0", diff --git a/src/tests/perftest/request_generator.py b/src/tests/perftest/request_generator.py index 8fb30edd..bbbfa107 100644 --- a/src/tests/perftest/request_generator.py +++ b/src/tests/perftest/request_generator.py @@ -96,7 +96,7 @@ def main(): processes = [] api_key = "YOUR_API_KEY_HERE" - base_url = "http://localhost:8000/" + base_url = "http://localhost:8000/v1" model = "fake_model_name" for _ in range(args.num_workers): diff --git a/src/vllm_router/engine_stats.py b/src/vllm_router/engine_stats.py index b92a9c1d..1733b172 100644 --- a/src/vllm_router/engine_stats.py +++ b/src/vllm_router/engine_stats.py @@ -18,12 +18,12 @@ class EngineStats: # Number of running requests num_running_requests: int = 0 - # Number of queuing requests num_queuing_requests: int = 0 - - # GPU cache hit rate - gpu_cache_hit_rate: float = 0.0 + # GPU prefix cache hit rate (as used in some panels) + gpu_prefix_cache_hit_rate: float = 0.0 + # GPU KV usage percentage (new field for dashboard "GPU KV Usage Percentage") + gpu_cache_usage_perc: float = 0.0 @staticmethod def FromVllmScrape(vllm_scrape: str): @@ -41,7 +41,9 @@ def FromVllmScrape(vllm_scrape: str): """ num_running_reqs = 0 num_queuing_reqs = 0 - gpu_cache_hit_rate = 0 + gpu_prefix_cache_hit_rate = 0.0 + gpu_cache_usage_perc = 0.0 + for family in text_string_to_metric_families(vllm_scrape): for sample in family.samples: if sample.name == "vllm:num_requests_running": @@ -49,18 +51,23 @@ def FromVllmScrape(vllm_scrape: str): elif sample.name == "vllm:num_requests_waiting": num_queuing_reqs = sample.value elif sample.name == "vllm:gpu_prefix_cache_hit_rate": - gpu_cache_hit_rate = sample.value + gpu_prefix_cache_hit_rate = sample.value + elif sample.name == "vllm:gpu_cache_usage_perc": + gpu_cache_usage_perc = sample.value return EngineStats( num_running_requests=num_running_reqs, num_queuing_requests=num_queuing_reqs, - gpu_cache_hit_rate=gpu_cache_hit_rate, + gpu_prefix_cache_hit_rate=gpu_prefix_cache_hit_rate, + gpu_cache_usage_perc=gpu_cache_usage_perc, ) class EngineStatsScraper: def __init__(self, scrape_interval: float): """ + Initialize the scraper to periodically fetch metrics from all serving engines. + Args: scrape_interval (float): The interval in seconds to scrape the metrics. 
@@ -73,18 +80,16 @@ def __init__(self, scrape_interval: float): self.service_discovery = GetServiceDiscovery() self.engine_stats: Dict[str, EngineStats] = {} self.engine_stats_lock = threading.Lock() - self.scrape_interval = scrape_interval self.scrape_thread = threading.Thread(target=self._scrape_worker, daemon=True) self.scrape_thread.start() def _scrape_one_endpoint(self, url: str): - """Scrape the metrics and model information from a single - serving engine + """ + Scrape metrics from a single serving engine. Args: - url (str): The URL of the serving engine - (does not contain endpoint) + url (str): The base URL of the serving engine. """ try: response = requests.get(url + "/metrics") @@ -96,6 +101,14 @@ def _scrape_one_endpoint(self, url: str): return engine_stats def _scrape_metrics(self): + """ + Scrape metrics from all serving engines. + + Scrape metrics from all serving engines by calling + _scrape_one_endpoint on each of them. The metrics are + stored in self.engine_stats. + + """ collected_engine_stats = {} endpoints = self.service_discovery.get_endpoint_info() logger.info(f"Scraping metrics from {len(endpoints)} serving engine(s)") @@ -110,16 +123,29 @@ def _scrape_metrics(self): for old_url in old_urls: if old_url not in collected_engine_stats: del self.engine_stats[old_url] - for url, stats in collected_engine_stats.items(): self.engine_stats[url] = stats def _scrape_worker(self): + """ + Periodically scrape metrics from all serving engines in the background. + + This function will loop forever and sleep for self.scrape_interval + seconds between each scrape. It will call _scrape_metrics to scrape + metrics from all serving engines and store them in self.engine_stats. + + """ while True: self._scrape_metrics() time.sleep(self.scrape_interval) def get_engine_stats(self) -> Dict[str, EngineStats]: + """ + Retrieve a copy of the current engine statistics. + + Returns: + A dictionary mapping engine URLs to their respective EngineStats objects. + """ with self.engine_stats_lock: return self.engine_stats.copy() @@ -136,8 +162,10 @@ def get_health(self) -> bool: def InitializeEngineStatsScraper(scrape_interval: float) -> EngineStatsScraper: """ - Initialize the EngineStatsScraper object. This function should be - called after the service discovery module has been initialized. + Initialize the EngineStatsScraper. + + Args: + scrape_interval (float): The interval (in seconds) to scrape metrics. Raises: ValueError: if the service discover module is have @@ -149,23 +177,20 @@ def InitializeEngineStatsScraper(scrape_interval: float) -> EngineStatsScraper: global _global_engine_stats_scraper if _global_engine_stats_scraper: raise ValueError("EngineStatsScraper object has already been initialized") - _global_engine_stats_scraper = EngineStatsScraper(scrape_interval) return _global_engine_stats_scraper def GetEngineStatsScraper() -> EngineStatsScraper: """ - Get the EngineStatsScraper object + Retrieve the EngineStatsScraper. Raises: - ValueError: if the EngineStatsScraper object has not been - initialized + ValueError: If not initialized. 
""" global _global_engine_stats_scraper if not _global_engine_stats_scraper: raise ValueError("EngineStatsScraper object has not been initialized") - return _global_engine_stats_scraper diff --git a/src/vllm_router/request_stats.py b/src/vllm_router/request_stats.py index f42dd7e3..46c69b1d 100644 --- a/src/vllm_router/request_stats.py +++ b/src/vllm_router/request_stats.py @@ -1,6 +1,6 @@ from collections import deque from dataclasses import dataclass -from typing import Deque, Dict +from typing import Deque, Dict, Tuple from vllm_router.log import init_logger @@ -13,63 +13,56 @@ class RequestStats: # Number of queries per second qps: float - - # Average time-to-first-token in seconds + # Average time-to-first-token (TTFT) in seconds ttft: float - # Total number of requests during prefilling in_prefill_requests: int - # Total number of requests during decoding in_decoding_requests: int - # Total number of requests finished finished_requests: int - - # How long does this url serves requests - # NOTE (ApostaC): consider moving this to engine stats + # How long the engine has been serving requests (uptime) uptime: int + # Average decoding length (time from first token to completion) + avg_decoding_length: float + # Average overall latency (from request arrival to completion) + avg_latency: float + # Average inter-token latency (if available; default -1 if not computed) + avg_itl: float + # Number of swapped requests (moved from GPU to CPU) + num_swapped_requests: int class MovingAverageMonitor: """ - Monitors the average of the value of in a sliding window + Monitors the average of values in a sliding window. """ def __init__(self, sliding_window_size: float): self.sliding_window_size = sliding_window_size - self.timestamps = deque() - self.values = deque() + self.timestamps: Deque[float] = deque() + self.values: Deque[float] = deque() def update(self, timestamp: float, value: float): - """ - Update the throughput monitor with a new timestamp - """ self.timestamps.append(timestamp) self.values.append(value) while ( - len(self.timestamps) > 0 + self.timestamps and self.timestamps[0] < timestamp - self.sliding_window_size ): self.timestamps.popleft() self.values.popleft() def get_average(self) -> float: - """ - Get the throughput in the sliding window - """ - return sum(self.values) / len(self.values) + return sum(self.values) / len(self.values) if self.values else -1 def get_sum(self) -> float: - """ - Get the sum of the values in the sliding window - """ return sum(self.values) class RequestStatsMonitor: """ - Monitors the request statistics of all serving engines + Monitors the request statistics of all serving engines. 
""" # NOTE (ApostaC): Currently, QPS is calculated based on the number of @@ -84,20 +77,28 @@ def __init__(self, sliding_window_size: float): """ self.sliding_window_size = sliding_window_size - # Finished requests for each serving engine - # The elements in the deque should be sorted by 'complete' time + # Monitors for calculating QPS and TTFT self.qps_monitors: Dict[str, MovingAverageMonitor] = {} self.ttft_monitors: Dict[str, MovingAverageMonitor] = {} - # The time when the request is coming (engine_url, request_id) -> timestamp - self.request_coming_time: Dict[(str, str), float] = {} + # Record initial request start time: (engine_url, request_id) -> timestamp + self.request_start_time: Dict[Tuple[str, str], float] = {} + # Record time when first token is received: (engine_url, request_id) -> timestamp + self.first_token_time: Dict[Tuple[str, str], float] = {} - # Number of requests in different stages (from the start of the router) + # Counters for requests in different stages self.in_prefill_requests: Dict[str, int] = {} self.in_decoding_requests: Dict[str, int] = {} self.finished_requests: Dict[str, int] = {} - self.first_query_time = None + # New monitors for overall latency and decoding length + self.latency_monitors: Dict[str, MovingAverageMonitor] = {} + self.decoding_length_monitors: Dict[str, MovingAverageMonitor] = {} + + # Counter for swapped requests + self.swapped_requests: Dict[str, int] = {} + + self.first_query_time: float = None def on_new_request(self, engine_url: str, request_id: str, timestamp: float): """ @@ -108,7 +109,7 @@ def on_new_request(self, engine_url: str, request_id: str, timestamp: float): request_id: The global request ID timestamp: the timestamp when the request was created """ - self.request_coming_time[(engine_url, request_id)] = timestamp + self.request_start_time[(engine_url, request_id)] = timestamp if engine_url not in self.in_prefill_requests: self.in_prefill_requests[engine_url] = 0 @@ -118,7 +119,6 @@ def on_new_request(self, engine_url: str, request_id: str, timestamp: float): self.qps_monitors[engine_url] = MovingAverageMonitor( self.sliding_window_size ) - self.qps_monitors[engine_url].update(timestamp, 1) if self.first_query_time is None: @@ -133,20 +133,25 @@ def on_request_response(self, engine_url: str, request_id: str, timestamp: float request_id: The global request ID timestamp: The timestamp when the response token was received """ - if (engine_url, request_id) not in self.request_coming_time: + if (engine_url, request_id) not in self.request_start_time: return - coming_time = self.request_coming_time.pop((engine_url, request_id)) + # Record first token time (do not pop so we can compute overall latency later) + self.first_token_time[(engine_url, request_id)] = timestamp if engine_url not in self.in_decoding_requests: self.in_decoding_requests[engine_url] = 0 - self.in_prefill_requests[engine_url] -= 1 + self.in_prefill_requests[engine_url] = max( + 0, self.in_prefill_requests.get(engine_url, 1) - 1 + ) self.in_decoding_requests[engine_url] += 1 if engine_url not in self.ttft_monitors: self.ttft_monitors[engine_url] = MovingAverageMonitor( self.sliding_window_size ) - self.ttft_monitors[engine_url].update(timestamp, timestamp - coming_time) + # Update TTFT as time from request start to first token + ttft = timestamp - self.request_start_time[(engine_url, request_id)] + self.ttft_monitors[engine_url].update(timestamp, ttft) def on_request_complete(self, engine_url: str, request_id: str, timestamp: float): """ @@ -159,33 +164,39 @@ def 
on_request_complete(self, engine_url: str, request_id: str, timestamp: float """ if engine_url not in self.finished_requests: self.finished_requests[engine_url] = 0 - self.in_decoding_requests[engine_url] -= 1 + self.in_decoding_requests[engine_url] = max( + 0, self.in_decoding_requests.get(engine_url, 1) - 1 + ) self.finished_requests[engine_url] += 1 - def get_request_stats( - self, - current_time: float, - ) -> Dict[str, RequestStats]: + def on_request_swapped(self, engine_url: str, request_id: str, timestamp: float): + # This function should be called if a request is determined to be swapped from GPU to CPU. """ - Get the request statistics for each serving engine + Tell the monitor that a request has been swapped from GPU to CPU. Args: - current_time: The current timestamp in seconds + engine_url: The URL of the serving engine + request_id: The global request ID + timestamp: The timestamp when the request was swapped + """ + if engine_url not in self.swapped_requests: + self.swapped_requests[engine_url] = 0 + self.swapped_requests[engine_url] += 1 + + def get_request_stats(self, current_time: float) -> Dict[str, RequestStats]: + """ + Get the request statistics from the monitor. + + Args: + current_time: The current timestamp Returns: - A dictionary where the key is the serving engine URL and the value - is the request statistics for that engine. - The TTFT and inter token latency will be -1 if there is no requests - finished in the sliding window. + A dictionary mapping engine URLs to RequestStats objects """ - # Calculate the request statistics ret = {} - - # Get all urls: urls = set(self.in_prefill_requests.keys()).union( set(self.in_decoding_requests.keys()) ) - for engine_url in urls: if engine_url not in self.qps_monitors: qps = -1 @@ -197,19 +208,42 @@ def get_request_stats( else: ttft = self.ttft_monitors[engine_url].get_average() - in_prefill_requests = self.in_prefill_requests.get(engine_url, 0) - in_decoding_requests = self.in_decoding_requests.get(engine_url, 0) - finished_requests = self.finished_requests.get(engine_url, 0) + in_prefill = self.in_prefill_requests.get(engine_url, 0) + in_decoding = self.in_decoding_requests.get(engine_url, 0) + finished = self.finished_requests.get(engine_url, 0) + + if engine_url in self.decoding_length_monitors: + avg_dec_len = self.decoding_length_monitors[engine_url].get_average() + else: + avg_dec_len = -1 + + if engine_url in self.latency_monitors: + avg_lat = self.latency_monitors[engine_url].get_average() + else: + avg_lat = -1 + + # For avg_itl, if not computed, default to -1. 
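
The three callbacks above (`on_new_request`, `on_request_response`, `on_request_complete`) are the whole contract between the router and the monitor, so the new fields can be exercised without a live engine. A sketch that drives one request through by hand, constructing the monitor directly rather than going through the `Initialize*` singleton helpers; the URL and timestamps are made up:

```python
import time

from vllm_router.request_stats import RequestStatsMonitor

monitor = RequestStatsMonitor(sliding_window_size=60.0)
url, req_id = "http://localhost:9000", "req-1"

t0 = time.time()
monitor.on_new_request(url, req_id, t0)             # request enters prefill
monitor.on_request_response(url, req_id, t0 + 0.2)  # first token: TTFT ~= 0.2 s
monitor.on_request_complete(url, req_id, t0 + 1.5)  # decoding finished

stats = monitor.get_request_stats(time.time())[url]
print(stats.ttft, stats.finished_requests, stats.avg_itl)  # avg_itl stays -1 for now
```
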
+ avg_itl_val = -1 + + if engine_url in self.swapped_requests: + swapped = self.swapped_requests[engine_url] + else: + swapped = 0 ret[engine_url] = RequestStats( qps=qps, ttft=ttft, - in_prefill_requests=in_prefill_requests, - in_decoding_requests=in_decoding_requests, - finished_requests=finished_requests, - uptime=current_time - self.first_query_time, + in_prefill_requests=in_prefill, + in_decoding_requests=in_decoding, + finished_requests=finished, + uptime=( + current_time - self.first_query_time if self.first_query_time else 0 + ), + avg_decoding_length=avg_dec_len, + avg_latency=avg_lat, + avg_itl=avg_itl_val, + num_swapped_requests=swapped, ) - return ret @@ -227,7 +261,6 @@ def InitializeRequestStatsMonitor(sliding_window_size: float): global _global_request_stats_monitor if _global_request_stats_monitor is not None: raise ValueError("The global request statistics monitor has been initialized") - _global_request_stats_monitor = RequestStatsMonitor(sliding_window_size) return _global_request_stats_monitor @@ -247,5 +280,4 @@ def GetRequestStatsMonitor(): raise ValueError( "The global request statistics monitor has not been initialized" ) - return _global_request_stats_monitor diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py index 60af6f40..af35cbf7 100644 --- a/src/vllm_router/router.py +++ b/src/vllm_router/router.py @@ -45,11 +45,14 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) -# --- Observation & Tracking (from v2) --- -# Define a Prometheus gauge for tracking the number of running requests per server. -vnum_requests_running = Gauge( +# --- Prometheus Gauges --- +# Existing metrics +num_requests_running = Gauge( "vllm:num_requests_running", "Number of running requests", ["server"] ) +num_requests_waiting = Gauge( + "vllm:num_requests_waiting", "Number of waiting requests", ["server"] +) current_qps = Gauge("vllm:current_qps", "Current Queries Per Second", ["server"]) avg_decoding_length = Gauge( "vllm:avg_decoding_length", "Average Decoding Length", ["server"] @@ -61,9 +64,20 @@ async def lifespan(app: FastAPI): "vllm:num_decoding_requests", "Number of Decoding Requests", ["server"] ) +# New metrics per dashboard update +healthy_pods_total = Gauge( + "vllm:healthy_pods_total", "Number of healthy vLLM pods", ["server"] +) +avg_latency = Gauge( + "vllm:avg_latency", "Average end-to-end request latency", ["server"] +) +avg_itl = Gauge("vllm:avg_itl", "Average Inter-Token Latency", ["server"]) +num_requests_swapped = Gauge( + "vllm:num_requests_swapped", "Number of swapped requests", ["server"] +) + # --- Request Processing & Routing --- -# TODO: better request id system async def process_request( method, header, body, backend_url, request_id, endpoint, debug_request=None ): @@ -72,10 +86,8 @@ async def process_request( """ first_token = False total_len = 0 - # Record the request start time and notify the request stats monitor. start_time = time.time() GetRequestStatsMonitor().on_new_request(backend_url, request_id, start_time) - # Log the start of request processing logger.info(f"Started request {request_id} for backend {backend_url}") client = httpx_client_wrapper() @@ -88,8 +100,7 @@ async def process_request( ) as backend_response: # Yield headers and status code first. yield backend_response.headers, backend_response.status_code - - # Then stream the response content in chunks. + # Stream response content. 
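
All of the gauges defined above live in `prometheus_client`'s default registry, so everything the router tracks is exported under the `vllm:` prefix by the `/metrics` endpoint further down. A small sketch for eyeballing those series against the dashboard panels, assuming a router listening on the default port 8001:

```python
import requests

# Print only the router-exported vllm: series from the Prometheus exposition.
resp = requests.get("http://localhost:8001/metrics", timeout=5)
resp.raise_for_status()
for line in resp.text.splitlines():
    if line.startswith("vllm:"):
        print(line)  # e.g. vllm:num_requests_running{server="..."} 2.0
```
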
async for chunk in backend_response.aiter_bytes(): total_len += len(chunk) if not first_token: @@ -106,13 +117,8 @@ async def process_request( async def route_general_request(request: Request, endpoint: str): - """ - Route the incoming request to the backend server and stream the response back to the client. - """ in_router_time = time.time() request_id = str(uuid.uuid4()) - - # Read the full request body and JSON payload. request_body = await request.body() request_json = await request.json() requested_model = request_json.get("model", None) @@ -125,10 +131,8 @@ async def route_general_request(request: Request, endpoint: str): endpoints = GetServiceDiscovery().get_endpoint_info() engine_stats = GetEngineStatsScraper().get_engine_stats() request_stats = GetRequestStatsMonitor().get_request_stats(time.time()) - - # Filter endpoints by the requested model. endpoints = list(filter(lambda x: x.model_name == requested_model, endpoints)) - if len(endpoints) == 0: + if not endpoints: return JSONResponse( status_code=400, content={"error": f"Model {requested_model} not found."} ) @@ -138,11 +142,9 @@ async def route_general_request(request: Request, endpoint: str): endpoints, engine_stats, request_stats, request ) logger.info(f"Request {request_id} routed to {server_url}") - curr_time = time.time() logger.info( - f"Routing request {request_id} to {server_url} at {curr_time}, " - f"process time = {curr_time - in_router_time:.4f}" + f"Routing request {request_id} to {server_url} at {curr_time}, process time = {curr_time - in_router_time:.4f}" ) stream_generator = process_request( request.method, @@ -152,9 +154,7 @@ async def route_general_request(request: Request, endpoint: str): request_id, endpoint=endpoint, ) - headers, status_code = await anext(stream_generator) - return StreamingResponse( stream_generator, status_code=status_code, @@ -163,21 +163,17 @@ async def route_general_request(request: Request, endpoint: str): ) +# --- File Endpoints --- @app.post("/v1/files") async def route_files(request: Request): - """Handle file upload requests that include a purpose and file data.""" form = await request.form() - - # Validate required fields. 
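
Because `route_general_request` above selects backends by the `model` field of the JSON body, any OpenAI-style client can talk to the router directly; session routing additionally pins a session to an instance via the configured session-key header. An illustrative request matching the static test setup in run-router.sh (`fake_model_name`, session key `x-user-id`); the user id and prompt are made up:

```python
import requests

payload = {
    "model": "fake_model_name",  # must match a model advertised by a backend
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": True,
}
with requests.post(
    "http://localhost:8001/v1/chat/completions",
    json=payload,
    headers={"x-user-id": "user-123"},  # session routing keys on this header
    stream=True,
) as resp:
    for chunk in resp.iter_content(chunk_size=None):
        print(chunk.decode(errors="replace"), end="")
```
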
purpose = form.get("purpose", "unknown") if "file" not in form: return JSONResponse( status_code=400, content={"error": "Missing required parameter 'file'"} ) - file_obj: UploadFile = form["file"] file_content = await file_obj.read() - try: storage: Storage = app.state.batch_storage file_info = await storage.save_file( @@ -417,8 +413,7 @@ async def route_completition(request: Request): @app.get("/version") async def show_version(): - ver = {"version": STACK_VERSION} - return JSONResponse(content=ver) + return JSONResponse(content={"version": STACK_VERSION}) @app.get("/v1/models") @@ -437,14 +432,12 @@ async def show_models(): ) model_cards.append(model_card) existing_models.add(endpoint.model_name) - model_list = ModelList(data=model_cards) return JSONResponse(content=model_list.model_dump()) @app.get("/health") async def health() -> Response: - """Health check: verifies that service discovery and engine stats scraping are operational.""" if not GetServiceDiscovery().get_health(): return JSONResponse( content={"status": "Service discovery module is down."}, status_code=503 @@ -456,25 +449,31 @@ async def health() -> Response: return Response(status_code=200) -# --- Prometheus Metrics Endpoint (v2 observation/tracking) --- +# --- Prometheus Metrics Endpoint --- @app.get("/metrics") async def metrics(): - return Response(generate_latest(), media_type="text/plain") - - -# --- Prometheus Metrics Endpoint (v2 observation/tracking) --- -@app.get("/metrics") -async def metrics(): - # Update gauges with stats from the request monitor + # Retrieve request stats from the monitor. stats = GetRequestStatsMonitor().get_request_stats(time.time()) for server, stat in stats.items(): current_qps.labels(server=server).set(stat.qps) - avg_decoding_length.labels(server=server).set(stat.ttft) + # Assuming stat contains the following attributes: + avg_decoding_length.labels(server=server).set(stat.avg_decoding_length) num_prefill_requests.labels(server=server).set(stat.in_prefill_requests) num_decoding_requests.labels(server=server).set(stat.in_decoding_requests) - vnum_requests_running.labels(server=server).set( + num_requests_running.labels(server=server).set( stat.in_prefill_requests + stat.in_decoding_requests ) + avg_latency.labels(server=server).set(stat.avg_latency) + avg_itl.labels(server=server).set(stat.avg_itl) + num_requests_swapped.labels(server=server).set(stat.num_swapped_requests) + # For healthy pods, we use a hypothetical function from service discovery. + healthy = {} + endpoints = GetServiceDiscovery().get_endpoint_info() + for ep in endpoints: + # Assume each endpoint object has an attribute 'healthy' (1 if healthy, 0 otherwise). + healthy[ep.url] = 1 if getattr(ep, "healthy", True) else 0 + for server, value in healthy.items(): + healthy_pods_total.labels(server=server).set(value) return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST) @@ -489,26 +488,16 @@ def validate_args(args): raise ValueError( "Static models must be provided when using static service discovery." ) - - if args.service_discovery == "static" and args.static_backends is None: - raise ValueError( - "Static backends must be provided when using static service discovery." - ) - if args.service_discovery == "k8s" and args.k8s_port is None: raise ValueError("K8s port must be provided when using K8s service discovery.") - if args.routing_logic == "session" and args.session_key is None: raise ValueError( "Session key must be provided when using session routing logic." 
     )
-
     if args.log_stats and args.log_stats_interval <= 0:
         raise ValueError("Log stats interval must be greater than 0.")
-
     if args.engine_stats_interval <= 0:
         raise ValueError("Engine stats interval must be greater than 0.")
-
     if args.request_stats_window <= 0:
         raise ValueError("Request stats window must be greater than 0.")
@@ -521,8 +510,6 @@ def parse_args():
     parser.add_argument(
         "--port", type=int, default=8001, help="The port to run the server on."
     )
-
-    # Service discovery
     parser.add_argument(
         "--service-discovery",
         required=True,
@@ -559,8 +546,6 @@
         default="",
         help="The label selector to filter vLLM pods when using K8s service discovery.",
     )
-
-    # Routing logic
     parser.add_argument(
         "--routing-logic",
         type=str,
@@ -616,8 +601,6 @@
         default=60,
         help="The sliding window in seconds to compute request statistics.",
     )
-
-    # Logging
     parser.add_argument(
         "--log-stats", action="store_true", help="Log statistics periodically."
     )
@@ -664,7 +647,6 @@ def InitializeAll(args):
         )
     else:
         raise ValueError(f"Invalid service discovery type: {args.service_discovery}")
-
     InitializeEngineStatsScraper(args.engine_stats_interval)
     InitializeRequestStatsMonitor(args.request_stats_window)

@@ -694,22 +676,34 @@ def log_stats(interval: int = 10):
         if url in engine_stats:
             es = engine_stats[url]
             logstr += (
-                f"  Engine Stats (Dashboard): Running Requests: {es.num_running_requests}, "
-                f"Queueing Delay (requests): {es.num_queuing_requests}, "
-                f"GPU Cache Hit Rate: {es.gpu_cache_hit_rate:.2f}\n"
+                f"  Engine Stats: Running Requests: {es.num_running_requests}, "
+                f"Queued Requests: {es.num_queuing_requests}, "
+                f"GPU Cache Hit Rate: {es.gpu_prefix_cache_hit_rate:.2f}\n"
             )
         else:
             logstr += "  Engine Stats: No stats available\n"
         if url in request_stats:
             rs = request_stats[url]
             logstr += (
-                f"  Request Stats (Dashboard): Current QPS: {rs.qps:.2f}, "
-                f"Avg Decoding Length: {rs.ttft}, "
+                f"  Request Stats: QPS: {rs.qps:.2f}, "
+                f"Avg Latency: {rs.avg_latency}, "
+                f"Avg ITL: {rs.avg_itl}, "
                 f"Prefill Requests: {rs.in_prefill_requests}, "
                 f"Decoding Requests: {rs.in_decoding_requests}, "
-                f"Finished Requests: {rs.finished_requests}, "
+                f"Swapped Requests: {rs.num_swapped_requests}, "
+                f"Finished: {rs.finished_requests}, "
                 f"Uptime: {rs.uptime:.2f} sec\n"
             )
+            current_qps.labels(server=url).set(rs.qps)
+            avg_decoding_length.labels(server=url).set(rs.avg_decoding_length)
+            num_prefill_requests.labels(server=url).set(rs.in_prefill_requests)
+            num_decoding_requests.labels(server=url).set(rs.in_decoding_requests)
+            num_requests_running.labels(server=url).set(
+                rs.in_prefill_requests + rs.in_decoding_requests
+            )
+            avg_latency.labels(server=url).set(rs.avg_latency)
+            avg_itl.labels(server=url).set(rs.avg_itl)
+            num_requests_swapped.labels(server=url).set(rs.num_swapped_requests)
         else:
             logstr += "  Request Stats: No stats available\n"
         logstr += "-" * 50 + "\n"
diff --git a/src/vllm_router/run-router.sh b/src/vllm_router/run-router.sh
index 4137a46f..b1b6ddbb 100755
--- a/src/vllm_router/run-router.sh
+++ b/src/vllm_router/run-router.sh
@@ -19,8 +19,10 @@ python3 -m vllm_router.router --port "$1" \
     --service-discovery static \
     --static-backends "http://localhost:9000" \
     --static-models "fake_model_name" \
-    --engine-stats-interval 10 \
     --log-stats \
+    --log-stats-interval 10 \
+    --engine-stats-interval 10 \
+    --request-stats-window 10 \
     --routing-logic session \
     --session-key "x-user-id"

From beaaa621b42068c968a740f4185d2cff0bbcc3fb Mon Sep 17 00:00:00 2001
From: sitloboi2012 Date: Sat, 15 Feb 2025 00:07:04 +0700 Subject: [PATCH 13/15] update based on Shaoting-Feng comment Signed-off-by: sitloboi2012 --- src/vllm_router/request_stats.py | 10 +++ src/vllm_router/router.py | 116 ++++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 3 deletions(-) diff --git a/src/vllm_router/request_stats.py b/src/vllm_router/request_stats.py index 46c69b1d..369d4447 100644 --- a/src/vllm_router/request_stats.py +++ b/src/vllm_router/request_stats.py @@ -44,6 +44,16 @@ def __init__(self, sliding_window_size: float): self.values: Deque[float] = deque() def update(self, timestamp: float, value: float): + """ + Update the throughput monitor with a new timestamp + + Args: + timestamp: The timestamp of the data point. + value: The value of the data point. + + This method adds the new data point to the sliding window and + removes any data point that is older than the sliding window size. + """ self.timestamps.append(timestamp) self.values.append(value) while ( diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py index af35cbf7..5a4e55ff 100644 --- a/src/vllm_router/router.py +++ b/src/vllm_router/router.py @@ -78,11 +78,28 @@ async def lifespan(app: FastAPI): # --- Request Processing & Routing --- +# TODO: better request id system async def process_request( method, header, body, backend_url, request_id, endpoint, debug_request=None ): """ - Async generator to stream data from the backend server to the client. + Process a request by sending it to the chosen backend. + + Args: + method: The HTTP method to use when sending the request to the backend. + header: The headers to send with the request to the backend. + body: The content of the request to send to the backend. + backend_url: The URL of the backend to send the request to. + request_id: A unique identifier for the request. + endpoint: The endpoint to send the request to on the backend. + debug_request: The original request object from the client, used for + optional debug logging. + + Yields: + The response headers and status code, followed by the response content. + + Raises: + HTTPError: If the backend returns a 4xx or 5xx status code. """ first_token = False total_len = 0 @@ -117,10 +134,26 @@ async def process_request( async def route_general_request(request: Request, endpoint: str): + """ + Route the incoming request to the backend server and stream the response back to the client. + + This function extracts the requested model from the request body and retrieves the + corresponding endpoints. It uses routing logic to determine the best server URL to handle + the request, then streams the request to that server. If the requested model is not available, + it returns an error response. + + Args: + request (Request): The incoming HTTP request. + endpoint (str): The endpoint to which the request should be routed. + + Returns: + StreamingResponse: A response object that streams data from the backend server to the client. 
+ """ + in_router_time = time.time() request_id = str(uuid.uuid4()) request_body = await request.body() - request_json = await request.json() + request_json = await request.json() # TODO (ApostaC): merge two awaits into one requested_model = request_json.get("model", None) if requested_model is None: return JSONResponse( @@ -141,7 +174,6 @@ async def route_general_request(request: Request, endpoint: str): server_url = GetRoutingLogic().route_request( endpoints, engine_stats, request_stats, request ) - logger.info(f"Request {request_id} routed to {server_url}") curr_time = time.time() logger.info( f"Routing request {request_id} to {server_url} at {curr_time}, process time = {curr_time - in_router_time:.4f}" @@ -166,6 +198,19 @@ async def route_general_request(request: Request, endpoint: str): # --- File Endpoints --- @app.post("/v1/files") async def route_files(request: Request): + """ + Handle file upload requests and save the files to the configured storage. + + Args: + request (Request): The incoming HTTP request. + + Returns: + JSONResponse: A JSON response containing the file metadata. + + Raises: + JSONResponse: A JSON response with a 400 status code if the request is invalid, + or a 500 status code if an error occurs during file saving. + """ form = await request.form() purpose = form.get("purpose", "unknown") if "file" not in form: @@ -418,6 +463,18 @@ async def show_version(): @app.get("/v1/models") async def show_models(): + """ + Returns a list of all models available in the stack. + + Args: + None + + Returns: + JSONResponse: A JSON response containing the list of models. + + Raises: + Exception: If there is an error in retrieving the endpoint information. + """ endpoints = GetServiceDiscovery().get_endpoint_info() existing_models = set() model_cards = [] @@ -438,6 +495,20 @@ async def show_models(): @app.get("/health") async def health() -> Response: + """ + Endpoint to check the health status of various components. + + This function verifies the health of the service discovery module and + the engine stats scraper. If either component is down, it returns a + 503 response with the appropriate status message. If both components + are healthy, it returns a 200 OK response. + + Returns: + Response: A JSONResponse with status code 503 if a component is + down, or a plain Response with status code 200 if all components + are healthy. + """ + if not GetServiceDiscovery().get_health(): return JSONResponse( content={"status": "Service discovery module is down."}, status_code=503 @@ -453,6 +524,22 @@ async def health() -> Response: @app.get("/metrics") async def metrics(): # Retrieve request stats from the monitor. + """ + Endpoint to expose Prometheus metrics for the vLLM router. + + This function gathers request statistics, engine metrics, and health status + of the service endpoints to update Prometheus gauges. It exports metrics + such as queries per second (QPS), average decoding length, number of prefill + and decoding requests, average latency, average inter-token latency, number + of swapped requests, and the number of healthy pods for each server. The + metrics are used to monitor the performance and health of the vLLM router + services. + + Returns: + Response: A HTTP response containing the latest Prometheus metrics in + the appropriate content type. 
+ """ + stats = GetRequestStatsMonitor().get_request_stats(time.time()) for server, stat in stats.items(): current_qps.labels(server=server).set(stat.qps) @@ -632,6 +719,15 @@ def parse_static_model_names(args): def InitializeAll(args): + """ + Initialize all the components of the router with the given arguments. + + Args: + args: the parsed command-line arguments + + Raises: + ValueError: if the service discovery type is invalid + """ if args.service_discovery == "static": InitializeServiceDiscovery( ServiceDiscoveryType.STATIC, @@ -663,6 +759,20 @@ def InitializeAll(args): def log_stats(interval: int = 10): + """ + Periodically logs the engine and request statistics for each service endpoint. + + This function retrieves the current service endpoints and their corresponding + engine and request statistics, and logs them at a specified interval. The + statistics include the number of running and queued requests, GPU cache hit + rate, queries per second (QPS), average latency, average inter-token latency + (ITL), and more. These statistics are also updated in the Prometheus metrics. + + Args: + interval (int): The interval in seconds at which statistics are logged. + Default is 10 seconds. + """ + while True: time.sleep(interval) logstr = "\n" + "=" * 50 + "\n" From d7b7f4b734ef72c5bd0c648a1e3e709810fff715 Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Tue, 18 Feb 2025 08:27:15 +0700 Subject: [PATCH 14/15] modify to keep the original comment info Signed-off-by: sitloboi2012 --- src/vllm_router/request_stats.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/vllm_router/request_stats.py b/src/vllm_router/request_stats.py index 369d4447..1c724f69 100644 --- a/src/vllm_router/request_stats.py +++ b/src/vllm_router/request_stats.py @@ -87,16 +87,17 @@ def __init__(self, sliding_window_size: float): """ self.sliding_window_size = sliding_window_size - # Monitors for calculating QPS and TTFT + # Finished requests for each serving engine + # The elements in the deque should be sorted by 'complete' time self.qps_monitors: Dict[str, MovingAverageMonitor] = {} self.ttft_monitors: Dict[str, MovingAverageMonitor] = {} - # Record initial request start time: (engine_url, request_id) -> timestamp + # The time when the request is coming (engine_url, request_id) -> timestamp self.request_start_time: Dict[Tuple[str, str], float] = {} # Record time when first token is received: (engine_url, request_id) -> timestamp self.first_token_time: Dict[Tuple[str, str], float] = {} - # Counters for requests in different stages + # Number of requests in different stages (from the start of the router) self.in_prefill_requests: Dict[str, int] = {} self.in_decoding_requests: Dict[str, int] = {} self.finished_requests: Dict[str, int] = {} @@ -195,13 +196,16 @@ def on_request_swapped(self, engine_url: str, request_id: str, timestamp: float) def get_request_stats(self, current_time: float) -> Dict[str, RequestStats]: """ - Get the request statistics from the monitor. + Get the request statistics for each serving engine Args: - current_time: The current timestamp + current_time: The current timestamp in seconds Returns: - A dictionary mapping engine URLs to RequestStats objects + A dictionary where the key is the serving engine URL and the value + is the request statistics for that engine. + The TTFT and inter token latency will be -1 if there is no requests + finished in the sliding window. 
""" ret = {} urls = set(self.in_prefill_requests.keys()).union( From 03cf9a9859126995fd12d04e0fa87231667040f9 Mon Sep 17 00:00:00 2001 From: sitloboi2012 Date: Tue, 18 Feb 2025 19:12:48 +0700 Subject: [PATCH 15/15] run pre-commit for formatting Signed-off-by: sitloboi2012 --- src/vllm_router/router.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py index 5a4e55ff..ea209383 100644 --- a/src/vllm_router/router.py +++ b/src/vllm_router/router.py @@ -231,7 +231,6 @@ async def route_files(request: Request): ) - @app.get("/v1/files/{file_id}") async def route_get_file(file_id: str): try: @@ -244,7 +243,6 @@ async def route_get_file(file_id: str): ) - @app.get("/v1/files/{file_id}/content") async def route_get_file_content(file_id: str): try: @@ -445,7 +443,6 @@ async def route_cancel_batch(batch_id: str): ) - @app.post("/v1/chat/completions") async def route_chat_completition(request: Request): return await route_general_request(request, "/v1/chat/completions")