From fcd75b43dc5faa3c1d0252a8b3f939951479ac88 Mon Sep 17 00:00:00 2001
From: Shaoting
Date: Mon, 3 Mar 2025 13:29:35 -0600
Subject: [PATCH] [CI/Build][Router] Make semantic caching optional (#218)

* Make semantic caching optional

Signed-off-by: Shaoting Feng

* Avoid check semantic cache model

Signed-off-by: Shaoting Feng

---------

Signed-off-by: Shaoting Feng
---
 .../workflows/functionality-helm-chart.yml |   2 +-
 docker/Dockerfile                          |   3 +
 setup.py                                   |  38 +++--
 src/vllm_router/router.py                  | 156 ++++++++++--------
 4 files changed, 113 insertions(+), 86 deletions(-)

diff --git a/.github/workflows/functionality-helm-chart.yml b/.github/workflows/functionality-helm-chart.yml
index 4ad6c096..63c94e7a 100644
--- a/.github/workflows/functionality-helm-chart.yml
+++ b/.github/workflows/functionality-helm-chart.yml
@@ -32,7 +32,7 @@ jobs:
         DOCKER_BUILDKIT: 1
       run: |
         cd ${{ github.workspace }}
-        sudo docker build -t localhost:5000/git-act-router -f docker/Dockerfile .
+        sudo docker build --build-arg INSTALL_SENTENCE_TRANSFORMERS=false -t localhost:5000/git-act-router -f docker/Dockerfile .
         sudo docker push localhost:5000/git-act-router
         sudo sysctl fs.protected_regular=0
         sudo minikube image load localhost:5000/git-act-router
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 57801e7a..04ec54ee 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -15,6 +15,9 @@ COPY .git/ .git/
 # Copy the rest of the application code
 COPY src/ src/
 
+ARG INSTALL_SENTENCE_TRANSFORMERS=true
+ENV INSTALL_SENTENCE_TRANSFORMERS=${INSTALL_SENTENCE_TRANSFORMERS}
+
 # Install dependencies (use cache, and delete after install, to speed up the build)
 RUN pip install --upgrade --no-cache-dir pip setuptools_scm && \
     pip install --no-cache-dir .
diff --git a/setup.py b/setup.py
index b3ccfa9c..b823f564 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,28 @@
+import os
+
 from setuptools import find_packages, setup
 
+install_sentence_transformers = (
+    os.getenv("INSTALL_SENTENCE_TRANSFORMERS", "true") == "true"
+)
+
+install_requires = [
+    "numpy==1.26.4",
+    "fastapi==0.115.8",
+    "httpx==0.28.1",
+    "uvicorn==0.34.0",
+    "kubernetes==32.0.0",
+    "prometheus_client==0.21.1",
+    "uhashring==2.3",
+    "aiofiles==24.1.0",
+    "python-multipart==0.0.20",
+    "faiss-cpu==1.10.0",
+    "huggingface-hub==0.25.2",  # downgrade to 0.25.2 to avoid breaking changes
+]
+
+if install_sentence_transformers:
+    install_requires.append("sentence-transformers==2.2.2")
+
 setup(
     name="vllm-router",
     use_scm_version=True,
@@ -7,20 +30,7 @@
     packages=find_packages(where="src"),
     package_dir={"": "src"},
     # Should be the same as src/router/requirements.txt
-    install_requires=[
-        "numpy==1.26.4",
-        "fastapi==0.115.8",
-        "httpx==0.28.1",
-        "uvicorn==0.34.0",
-        "kubernetes==32.0.0",
-        "prometheus_client==0.21.1",
-        "uhashring==2.3",
-        "aiofiles==24.1.0",
-        "python-multipart==0.0.20",
-        "sentence-transformers==2.2.2",
-        "faiss-cpu==1.10.0",
-        "huggingface-hub==0.25.2",  # downgrade to 0.25.2 to avoid breaking changes
-    ],
+    install_requires=install_requires,
     entry_points={
         "console_scripts": [
             "vllm-router=vllm_router.router:main",
diff --git a/src/vllm_router/router.py b/src/vllm_router/router.py
index 94d935f5..258185e5 100644
--- a/src/vllm_router/router.py
+++ b/src/vllm_router/router.py
@@ -26,23 +26,29 @@
     initialize_feature_gates,
 )
 
-# Semantic cache integration
-from vllm_router.experimental.semantic_cache import (
-    GetSemanticCache,
-    InitializeSemanticCache,
-    enable_semantic_cache,
-    is_semantic_cache_enabled,
-)
-from vllm_router.experimental.semantic_cache_integration import (
-    add_semantic_cache_args,
-    check_semantic_cache,
-    semantic_cache_hit_ratio,
-    semantic_cache_hits,
-    semantic_cache_latency,
-    semantic_cache_misses,
-    semantic_cache_size,
-    store_in_semantic_cache,
-)
+try:
+    # Semantic cache integration
+    from vllm_router.experimental.semantic_cache import (
+        GetSemanticCache,
+        InitializeSemanticCache,
+        enable_semantic_cache,
+        is_semantic_cache_enabled,
+    )
+    from vllm_router.experimental.semantic_cache_integration import (
+        add_semantic_cache_args,
+        check_semantic_cache,
+        semantic_cache_hit_ratio,
+        semantic_cache_hits,
+        semantic_cache_latency,
+        semantic_cache_misses,
+        semantic_cache_size,
+        store_in_semantic_cache,
+    )
+
+    semantic_cache_available = True
+except ImportError:
+    semantic_cache_available = False
+
 from vllm_router.files import Storage, initialize_storage
 from vllm_router.httpx_client import HTTPXClientWrapper
 from vllm_router.protocols import ModelCard, ModelList
@@ -195,14 +201,15 @@ async def process_request(
             backend_url, request_id, time.time()
         )
 
-    # if debug_request:
-    #     logger.debug(f"Finished the request with request id: {debug_request.headers.get('x-request-id', None)} at {time.time()}")
-    # Store in semantic cache if applicable
-    # Use the full response for non-streaming requests, or the last chunk for streaming
-    cache_chunk = bytes(full_response) if full_response is not None else chunk
-    await store_in_semantic_cache(
-        endpoint=endpoint, method=method, body=body, chunk=cache_chunk
-    )
+    if semantic_cache_available:
+        # if debug_request:
+        #     logger.debug(f"Finished the request with request id: {debug_request.headers.get('x-request-id', None)} at {time.time()}")
+        # Store in semantic cache if applicable
+        # Use the full response for non-streaming requests, or the last chunk for streaming
+        cache_chunk = bytes(full_response) if full_response is not None else chunk
+        await store_in_semantic_cache(
+            endpoint=endpoint, method=method, body=body, chunk=cache_chunk
+        )
 
 
 async def route_general_request(request: Request, endpoint: str):
@@ -521,13 +528,14 @@ async def route_cancel_batch(batch_id: str):
 
 @app.post("/v1/chat/completions")
 async def route_chat_completition(request: Request):
-    # Check if the request can be served from the semantic cache
-    logger.debug("Received chat completion request, checking semantic cache")
-    cache_response = await check_semantic_cache(request=request)
+    if semantic_cache_available:
+        # Check if the request can be served from the semantic cache
+        logger.debug("Received chat completion request, checking semantic cache")
+        cache_response = await check_semantic_cache(request=request)
 
-    if cache_response:
-        logger.info("Serving response from semantic cache")
-        return cache_response
+        if cache_response:
+            logger.info("Serving response from semantic cache")
+            return cache_response
 
     logger.debug("No cache hit, forwarding request to backend")
     return await route_general_request(request, "/v1/chat/completions")
@@ -832,8 +840,9 @@ def parse_args():
         help="Show version and exit",
     )
 
-    # Add semantic cache arguments
-    add_semantic_cache_args(parser)
+    if semantic_cache_available:
+        # Add semantic cache arguments
+        add_semantic_cache_args(parser)
 
     # Add feature gates argument
     parser.add_argument(
@@ -901,53 +910,58 @@ def InitializeAll(args):
     initialize_feature_gates(args.feature_gates)
     # Check if the SemanticCache feature gate is enabled
     feature_gates = get_feature_gates()
-    if feature_gates.is_enabled("SemanticCache"):
-        # The feature gate is enabled, explicitly enable the semantic cache
-        enable_semantic_cache()
+    if semantic_cache_available:
+        if feature_gates.is_enabled("SemanticCache"):
+            # The feature gate is enabled, explicitly enable the semantic cache
+            enable_semantic_cache()
 
-        # Verify that the semantic cache was successfully enabled
-        if not is_semantic_cache_enabled():
-            logger.error("Failed to enable semantic cache feature")
+            # Verify that the semantic cache was successfully enabled
+            if not is_semantic_cache_enabled():
+                logger.error("Failed to enable semantic cache feature")
 
-        logger.info("SemanticCache feature gate is enabled")
+            logger.info("SemanticCache feature gate is enabled")
 
-        # Initialize the semantic cache with the model if specified
-        if args.semantic_cache_model:
-            logger.info(
-                f"Initializing semantic cache with model: {args.semantic_cache_model}"
-            )
-            logger.info(
-                f"Semantic cache directory: {args.semantic_cache_dir or 'default'}"
-            )
-            logger.info(f"Semantic cache threshold: {args.semantic_cache_threshold}")
+            # Initialize the semantic cache with the model if specified
+            if args.semantic_cache_model:
+                logger.info(
+                    f"Initializing semantic cache with model: {args.semantic_cache_model}"
+                )
+                logger.info(
+                    f"Semantic cache directory: {args.semantic_cache_dir or 'default'}"
+                )
+                logger.info(
+                    f"Semantic cache threshold: {args.semantic_cache_threshold}"
+                )
 
-            cache = InitializeSemanticCache(
-                embedding_model=args.semantic_cache_model,
-                cache_dir=args.semantic_cache_dir,
-                default_similarity_threshold=args.semantic_cache_threshold,
-            )
+                cache = InitializeSemanticCache(
+                    embedding_model=args.semantic_cache_model,
+                    cache_dir=args.semantic_cache_dir,
+                    default_similarity_threshold=args.semantic_cache_threshold,
+                )
+
+                # Update cache size metric
+                if cache and hasattr(cache, "db") and hasattr(cache.db, "index"):
+                    semantic_cache_size.labels(server="router").set(
+                        cache.db.index.ntotal
+                    )
+                    logger.info(
+                        f"Semantic cache initialized with {cache.db.index.ntotal} entries"
+                    )
 
-            # Update cache size metric
-            if cache and hasattr(cache, "db") and hasattr(cache.db, "index"):
-                semantic_cache_size.labels(server="router").set(cache.db.index.ntotal)
                 logger.info(
-                    f"Semantic cache initialized with {cache.db.index.ntotal} entries"
+                    f"Semantic cache initialized with model {args.semantic_cache_model}"
                 )
-
-            logger.info(
-                f"Semantic cache initialized with model {args.semantic_cache_model}"
-            )
-        else:
+            else:
+                logger.warning(
+                    "SemanticCache feature gate is enabled but no embedding model specified. "
+                    "The semantic cache will not be functional without an embedding model. "
+                    "Use --semantic-cache-model to specify an embedding model."
+                )
+        elif args.semantic_cache_model:
             logger.warning(
-                "SemanticCache feature gate is enabled but no embedding model specified. "
-                "The semantic cache will not be functional without an embedding model. "
-                "Use --semantic-cache-model to specify an embedding model."
+                "Semantic cache model specified but SemanticCache feature gate is not enabled. "
+                "Enable the feature gate with --feature-gates=SemanticCache=true"
             )
-        elif args.semantic_cache_model:
-            logger.warning(
-                "Semantic cache model specified but SemanticCache feature gate is not enabled. "
-                "Enable the feature gate with --feature-gates=SemanticCache=true"
-            )
 
     # --- Hybrid addition: attach singletons to FastAPI state ---
     app.state.engine_stats_scraper = GetEngineStatsScraper()
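
Note for reviewers: the flag flows end to end. Building with
`docker build --build-arg INSTALL_SENTENCE_TRANSFORMERS=false` (as the updated
workflow does) sets the environment variable that setup.py reads at install
time, so `sentence-transformers` is never installed; the try/except guard in
router.py then records the failed import in `semantic_cache_available`, and
every semantic-cache code path checks that flag before running. The sketch
below illustrates the guarded-import pattern in isolation. It is a minimal
illustration, not part of the patch; `heavy_extra` and `expensive_lookup` are
hypothetical stand-ins for the real semantic-cache modules.

# optional_import_sketch.py - minimal sketch of the guard used in router.py.
# ASSUMPTION: "heavy_extra" is a hypothetical optional package standing in
# for the sentence-transformers-backed semantic cache modules.
try:
    from heavy_extra import expensive_lookup  # absent when the extra is skipped

    feature_available = True
except ImportError:
    feature_available = False


def answer(prompt: str) -> str:
    # Mirror router.py's `if semantic_cache_available:` checks: the optional
    # path runs only when the import succeeded; otherwise fall through.
    if feature_available:
        cached = expensive_lookup(prompt)
        if cached is not None:
            return cached
    return f"computed: {prompt}"


if __name__ == "__main__":
    # With heavy_extra missing, this prints "computed: hello".
    print(answer("hello"))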