Updates (#157)
Fix recursion issue in the page transition animation
- Add support for multiple GitHub repositories (Feature Request: Multi-Repo Indexing #113)
- Add per-guru website/YouTube/PDF limits and tie the link crawl limit to them (Bug Report: The ArduPilot Guru needs around 2000 documents #148)
- Fix YouTube link sidebar close
- Fix widget APIs for self-hosted deployments (Bug Report: self-hosted widget api returns 404 not found #153)
- More granular error messages for codebase reindexing failures
- Add an hourly task that checks failed GitHub repositories
- Add wildcard support for widget URLs

---------

Co-authored-by: aralyekta <[email protected]>
kursataktas and aralyekta authored Mar 4, 2025
1 parent 15125eb commit 822302d
Showing 34 changed files with 2,020 additions and 396 deletions.
5 changes: 3 additions & 2 deletions src/gurubase-backend/backend/backend/celery.py
@@ -1,6 +1,4 @@
import os

import celery.signals
from celery import Celery

# Set the default Django settings module for the 'celery' program.
@@ -14,5 +12,8 @@
# should have a `CELERY_` prefix.
app.config_from_object('django.conf:settings', namespace='CELERY')

# Disable worker hijacking the root logger
app.conf.worker_hijack_root_logger = False

# Load task modules from all registered Django apps.
app.autodiscover_tasks()
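
Setting worker_hijack_root_logger = False keeps Celery workers from replacing the root logger that Django's LOGGING dict configures, so worker output keeps the same format as the web process. An equivalent, more explicit option is Celery's setup_logging signal; a minimal sketch of that approach (a comparison, not what this commit does):

from celery.signals import setup_logging

@setup_logging.connect
def configure_logging(*args, **kwargs):
    # Connecting any receiver to setup_logging stops Celery from configuring
    # logging at all; re-apply Django's LOGGING dict explicitly instead.
    import logging.config
    from django.conf import settings
    logging.config.dictConfig(settings.LOGGING)
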
35 changes: 30 additions & 5 deletions src/gurubase-backend/backend/backend/settings.py
@@ -15,6 +15,7 @@
import os
import sys
import logging
from datetime import datetime

logger = logging.getLogger(__name__)

@@ -206,19 +207,47 @@
LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'verbose': {
'format': '[%(asctime)s] [%(levelname)s] %(message)s',
'datefmt': '%Y-%m-%d %H:%M:%S'
},
},
'handlers': {
'console': {
'level': 'INFO',
'class': 'logging.StreamHandler',
'formatter': 'verbose',
'filters': ['hide_info_specific_task'],
},
},
'loggers': {
'': { # This means all loggers will use this configuration
'': { # Default logger
'handlers': ['console'],
'level': 'INFO',
'propagate': True,
},
'celery': {
'handlers': ['console'],
'level': 'INFO',
'propagate': False,
},
'celery.beat': {
'handlers': ['console'],
'level': 'INFO',
},
},
'filters': {
'hide_info_specific_task': {
'()': 'django.utils.log.CallbackFilter',
'callback': lambda record: not (
('task_stop_inactive_ui_crawls' in record.getMessage() or 'core.tasks.stop_inactive_ui_crawls' in record.getMessage()) and
record.levelno == logging.INFO and
not (datetime.fromtimestamp(record.created).minute == 0 and
datetime.fromtimestamp(record.created).second <= 3)
)
}
}
}
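
The hide_info_specific_task filter relies on django.utils.log.CallbackFilter, which drops any log record for which the callback returns False; here it silences routine INFO lines from the stop_inactive_ui_crawls task except during the first few seconds of each hour, so one sample per hour still reaches the console. A simplified standalone sketch of the same pattern (the handler wiring below is illustrative, not the project's settings):

import logging
from datetime import datetime
from django.utils.log import CallbackFilter

def keep_record(record):
    # Suppress INFO messages mentioning the periodic task unless the record
    # was emitted within the first 3 seconds of the hour.
    ts = datetime.fromtimestamp(record.created)
    is_task_line = 'stop_inactive_ui_crawls' in record.getMessage()
    near_top_of_hour = ts.minute == 0 and ts.second <= 3
    return not (is_task_line and record.levelno == logging.INFO and not near_top_of_hour)

handler = logging.StreamHandler()
handler.addFilter(CallbackFilter(keep_record))
logging.getLogger('celery.beat').addHandler(handler)
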


@@ -389,10 +418,6 @@

BINGE_HISTORY_PAGE_SIZE = config('BINGE_HISTORY_PAGE_SIZE', default=30, cast=int)

DATA_SOURCES_LIMIT_TOTAL_WEBSITES_COUNT = config('DATA_SOURCES_LIMIT_TOTAL_WEBSITES_COUNT', default=1500, cast=int)
DATA_SOURCES_LIMIT_TOTAL_YOUTUBE_COUNT = config('DATA_SOURCES_LIMIT_TOTAL_YOUTUBE_COUNT', default=100, cast=int)
DATA_SOURCES_LIMIT_TOTAL_PDF_MB = config('DATA_SOURCES_LIMIT_TOTAL_PDF_MB', default=100, cast=int)

GITHUB_REPO_CODE_COLLECTION_NAME = config('GITHUB_REPO_CODE_COLLECTION_NAME', default='github_repo_code')

CLOUDFLARE_BASE_URL = config('CLOUDFLARE_BASE_URL', default='https://api.cloudflare.com/client/v4')
2 changes: 2 additions & 0 deletions src/gurubase-backend/backend/backend/urls.py
@@ -88,6 +88,8 @@
urlpatterns += [
path('api/<str:guru_type>/answer/', core_views.answer, name="answer-api"),
path('api/analytics/', include('analytics.urls')),
path('api/widget/ask/', core_views.ask_widget, name='ask_widget_api'),
path('api/widget/guru/', core_views.get_guru_visuals, name='get_guru_visuals_api'),
path('settings/', core_views.manage_settings, name='manage_settings'), # New settings endpoint
]
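
The two new routes back the self-hosted widget fix (#153) and resolve to /api/widget/ask/ and /api/widget/guru/. A quick sanity check from a Django shell, assuming this root URLconf is active:

from django.urls import reverse

assert reverse('ask_widget_api') == '/api/widget/ask/'
assert reverse('get_guru_visuals_api') == '/api/widget/guru/'
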

2 changes: 1 addition & 1 deletion src/gurubase-backend/backend/core/admin.py
@@ -160,7 +160,7 @@ def guru_type(self, obj):

@admin.register(GuruType)
class GuruTypeAdmin(admin.ModelAdmin):
list_display = ['id', 'slug', 'active', 'has_sitemap_added_questions', 'icon_url', 'stackoverflow_tag', 'domain_knowledge', 'colors', 'custom', 'maintainers_list', 'github_repo', 'date_created', 'date_updated', 'github_details_updated_date']
list_display = ['id', 'slug', 'active', 'has_sitemap_added_questions', 'icon_url', 'stackoverflow_tag', 'domain_knowledge', 'colors', 'custom', 'maintainers_list', 'github_repos', 'date_created', 'date_updated', 'github_details_updated_date']
search_fields = ['id', 'slug', 'icon_url', 'stackoverflow_tag', 'domain_knowledge', 'date_created', 'date_updated', 'maintainers__email']
list_filter = ('active', 'custom', 'has_sitemap_added_questions')
ordering = ('-id',)
6 changes: 1 addition & 5 deletions src/gurubase-backend/backend/core/auth.py
@@ -193,11 +193,7 @@ def wrapper(request, *args, **kwargs):
if not origin:
return Response({'msg': 'Origin header is required'}, status=status.HTTP_401_UNAUTHORIZED)

# Parse origin to get domain
parsed_origin = urlparse(origin)
request_domain = f"{parsed_origin.scheme}://{parsed_origin.netloc}"

if request_domain != widget_id_obj.domain:
if not widget_id_obj.domain_matches_pattern(origin, widget_id_obj.domain):
return Response({'msg': 'Invalid domain. Please check your domain URL in Gurubase platform'}, status=status.HTTP_401_UNAUTHORIZED)

request.guru_type = widget_id_obj.guru_type
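
The widget auth check now delegates to domain_matches_pattern, whose implementation is not part of this hunk. A hedged sketch of how such a wildcard check could look (this standalone function is an assumption for illustration, not the project's actual method):

import fnmatch
from urllib.parse import urlparse

def domain_matches_pattern(origin: str, pattern: str) -> bool:
    # Normalize the requesting origin to scheme://host[:port], then apply a
    # shell-style wildcard match so patterns like 'https://*.example.com' work.
    parsed = urlparse(origin)
    request_domain = f"{parsed.scheme}://{parsed.netloc}"
    return fnmatch.fnmatch(request_domain, pattern)

# Hypothetical patterns:
assert domain_matches_pattern("https://docs.example.com", "https://*.example.com")
assert not domain_matches_pattern("https://evil.com", "https://*.example.com")
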
15 changes: 12 additions & 3 deletions src/gurubase-backend/backend/core/cors.py
@@ -9,7 +9,7 @@
def cors_allow_api_keys(sender, request, **kwargs):
"""
Signal handler to allow CORS for domains registered with widget IDs.
Checks if the requesting origin matches any widget ID's domain.
Checks if the requesting origin matches any widget ID's domain or wildcard pattern.
"""
try:
# Get origin from headers
@@ -21,8 +21,17 @@ def cors_allow_api_keys(sender, request, **kwargs):
parsed_origin = urlparse(origin)
request_domain = f"{parsed_origin.scheme}://{parsed_origin.netloc}"

# Check if this domain exists in any widget ID
return WidgetId.objects.filter(domain=request_domain).exists()
# First check for exact domain matches (more efficient query)
if WidgetId.objects.filter(domain=request_domain, is_wildcard=False).exists():
return True

# Then check for wildcard patterns
wildcard_patterns = WidgetId.objects.filter(is_wildcard=True).values_list('domain', flat=True)
for pattern in wildcard_patterns:
if WidgetId.domain_matches_pattern(request_domain, pattern):
return True

return False

except Exception as e:
# Log the error but don't block the request - let other CORS rules handle it
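
This handler is presumably wired to django-cors-headers' check_request_enabled signal, which lets a project grant CORS per request beyond the static allowlist; returning True here lets the widget origin through. A minimal sketch of that wiring (where the project actually connects it is not shown in this diff):

# e.g. in an AppConfig.ready() method
from corsheaders.signals import check_request_enabled
from core.cors import cors_allow_api_keys

check_request_enabled.connect(cors_allow_api_keys)
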
22 changes: 12 additions & 10 deletions src/gurubase-backend/backend/core/data_sources.py
@@ -358,7 +358,7 @@ def run_spider_process(url, crawl_state_id, link_limit):
logger.error(traceback.format_exc())


def get_internal_links(url: str, crawl_state_id: int = None, link_limit: int = 1500) -> List[str]:
def get_internal_links(url: str, crawl_state_id: int, link_limit: int) -> List[str]:
"""
Crawls a website starting from the given URL and returns a list of all internal links found.
The crawler only follows links that start with the same domain as the initial URL.
@@ -397,9 +397,10 @@ def __init__(self, *args, **kwargs):
self.link_limit = kwargs.get('link_limit', 1500)
self.should_close = False
if settings.ENV != 'selfhosted':
self.proxies = format_proxies(get_random_proxies())
proxies = format_proxies(get_random_proxies())
else:
self.proxies = None
proxies = None
self.proxies = proxies
except Exception as e:
logger.error(f"Error initializing InternalLinkSpider: {str(e)}", traceback.format_exc())
CrawlState.objects.get(id=self.crawl_state_id).status = CrawlState.Status.FAILED
@@ -451,10 +452,11 @@ def parse(self, response):
if settings.ENV != 'selfhosted' and len(self.internal_links) >= self.link_limit:
if self.crawl_state_id:
crawl_state = CrawlState.objects.get(id=self.crawl_state_id)
crawl_state.status = CrawlState.Status.FAILED
crawl_state.error_message = f"Link limit of {self.link_limit} exceeded"
crawl_state.end_time = timezone.now()
crawl_state.save()
if not crawl_state.user.is_admin:
crawl_state.status = CrawlState.Status.FAILED
crawl_state.error_message = f"Link limit of {self.link_limit} exceeded"
crawl_state.end_time = timezone.now()
crawl_state.save()
return

if len(self.internal_links) % 100 == 0:
@@ -543,7 +545,7 @@ def get_user(user):
return user

@staticmethod
def start_crawl(guru_slug, user, url, link_limit=1500, source=CrawlState.Source.API):
def start_crawl(guru_slug, user, url, source=CrawlState.Source.API):
from core.serializers import CrawlStateSerializer
from core.tasks import crawl_website
import re
@@ -574,12 +576,12 @@ def start_crawl(guru_slug, user, url, source=CrawlState.Source.API):
crawl_state = CrawlState.objects.create(
url=url,
status=CrawlState.Status.RUNNING,
link_limit=link_limit,
link_limit=guru_type.website_count_limit,
guru_type=guru_type,
user=user,
source=source
)
crawl_website.delay(url, crawl_state.id, link_limit)
crawl_website.delay(url, crawl_state.id, guru_type.website_count_limit)
return CrawlStateSerializer(crawl_state).data, 200
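
start_crawl no longer accepts a link_limit argument; the limit comes from the guru type's website_count_limit and is forwarded to the crawl task. A hedged usage sketch — the CrawlService name, slug, and URL are placeholders, only the signature above comes from this diff:

# Hypothetical caller; the class that owns start_crawl is not named in this hunk.
data, status_code = CrawlService.start_crawl(
    guru_slug="ardupilot",         # assumed slug
    user=request.user,
    url="https://ardupilot.org/",  # assumed URL
    source=CrawlState.Source.API,
)
# Internally a CrawlState is created with link_limit=guru_type.website_count_limit
# and crawl_website.delay(url, crawl_state.id, guru_type.website_count_limit) is queued.
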

@staticmethod
9 changes: 9 additions & 0 deletions src/gurubase-backend/backend/core/gcp.py
@@ -15,6 +15,15 @@ def replace_media_root_with_nginx_base_url(url):
return url
return url

def replace_media_root_with_localhost(url):
if settings.ENV == 'selfhosted':
port = settings.NGINX_BASE_URL[settings.NGINX_BASE_URL.rfind(":"):][1:]
# Replace also for development environment
url = url.replace(settings.MEDIA_ROOT, f'http://localhost:{port}/media')
url = url.replace("/workspace/backend/media", f'http://localhost:{port}/media')
return url
return url
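
replace_media_root_with_localhost rewrites on-disk media paths into URLs served from localhost, taking the port from the tail of NGINX_BASE_URL. A worked example of the same string handling (the concrete values are assumptions for illustration):

# Standalone illustration of the rewrite, outside Django settings.
NGINX_BASE_URL = "http://localhost:8029"    # assumed value
MEDIA_ROOT = "/workspace/backend/media"     # assumed value

port = NGINX_BASE_URL[NGINX_BASE_URL.rfind(":"):][1:]   # -> "8029"

url = "/workspace/backend/media/avatars/guru.png"
url = url.replace(MEDIA_ROOT, f"http://localhost:{port}/media")
print(url)  # -> http://localhost:8029/media/avatars/guru.png
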


class GCP:
def __init__(self, bucket_name=settings.GS_BUCKET_NAME):