Skip to content

Commit

Permalink
Selfhosted fixes (#172)
Browse files Browse the repository at this point in the history
Fix selfhosted guru creation adding multiple repositories
Fix crawling on guru creation. The guru limit still applies for other crawls
Remove unnecessary guru slug arg on crawl stop/status endpoints
Fix widget follow up / binge
GitHub reindex status update fix
Fix empty data source sidebar not resetting the pending urls
Add sitemap parse confirmation
Fix deleting individual urls on sidebar
Fix binge map mobile button not being hidden for bot responses
Fix pdf links on selfhosted (ui/widget/slack/api/discord)
Fix discord referencing with sources having underscores in their name
Fix guru icon update still showing "pending changes"
Fix the pdf references in the answer (not the references section)
---------

Co-authored-by: aralyekta <[email protected]>
  • Loading branch information
kursataktas and aralyekta authored Mar 8, 2025
1 parent 648dfa5 commit 02351a1
Show file tree
Hide file tree
Showing 21 changed files with 245 additions and 119 deletions.
8 changes: 6 additions & 2 deletions src/gurubase-backend/backend/backend/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@
path('analytics/', include('analytics.urls')),

path('<str:guru_slug>/crawl/start/', core_views.start_crawl_admin, name='start_crawl_admin'),
path('<str:guru_slug>/crawl/<int:crawl_id>/stop/', core_views.stop_crawl_admin, name='stop_crawl_admin'),
path('<str:guru_slug>/crawl/<int:crawl_id>/status/', core_views.get_crawl_status_admin, name='get_crawl_status_admin'),
path('crawl/<int:crawl_id>/stop/', core_views.stop_crawl_admin, name='stop_crawl_admin'),
path('crawl/<int:crawl_id>/status/', core_views.get_crawl_status_admin, name='get_crawl_status_admin'),
]

if settings.STREAM_ENABLED:
Expand All @@ -86,10 +86,14 @@
]
if settings.ENV == 'selfhosted':
urlpatterns += [
# Define the urls that are accessed by the selfhosted nginx proxy ('localhost:8029/api/')
path('api/<str:guru_type>/answer/', core_views.answer, name="answer-api"),
path('api/analytics/', include('analytics.urls')),
path('api/widget/ask/', core_views.ask_widget, name='ask_widget_api'),
path('api/widget/guru/', core_views.get_guru_visuals, name='get_guru_visuals_api'),
path('api/widget/binge/', core_views.widget_create_binge, name='widget_create_binge_api'),
path('api/<str:guru_type>/follow_up/examples/', core_views.follow_up_examples, name='follow_up_examples_api'),
path('api/slack/events/', core_views.slack_events, name='slack_events_api'),
path('settings/', core_views.manage_settings, name='manage_settings'), # New settings endpoint
]

Expand Down
38 changes: 22 additions & 16 deletions src/gurubase-backend/backend/core/data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def youtube_content_extraction(youtube_url):

def pdf_content_extraction(pdf_path):
try:
pdf_path = replace_media_root_with_nginx_base_url(pdf_path)
loader = PyPDFLoader(pdf_path)
pages = loader.load()
except Exception as e:
Expand Down Expand Up @@ -563,36 +562,44 @@ def start_crawl(guru_slug, user, url, source=CrawlState.Source.API):
return {'msg': 'Invalid URL format'}, 400

user = CrawlService.get_user(user)
guru_type = CrawlService.validate_and_get_guru_type(guru_slug, user)
try:
guru_type = CrawlService.validate_and_get_guru_type(guru_slug, user)
link_limit = guru_type.website_count_limit
except NotFoundError as e:
if source == CrawlState.Source.UI:
guru_type = None
link_limit = 1500
else:
raise e

# Existing crawl start logic
existing_crawl = CrawlState.objects.filter(
guru_type=guru_type,
status=CrawlState.Status.RUNNING
).first()
if existing_crawl:
return {'msg': 'A crawl is already running for this guru type. Please wait for it to complete or stop it.'}, 400
if guru_type:
existing_crawl = CrawlState.objects.filter(
guru_type=guru_type,
status=CrawlState.Status.RUNNING
).first()
if existing_crawl:
return {'msg': 'A crawl is already running for this guru type. Please wait for it to complete or stop it.'}, 400

crawl_state = CrawlState.objects.create(
url=url,
status=CrawlState.Status.RUNNING,
link_limit=guru_type.website_count_limit,
link_limit=link_limit,
guru_type=guru_type,
user=user,
source=source
)
crawl_website.delay(url, crawl_state.id, guru_type.website_count_limit)
crawl_website.delay(url, crawl_state.id, link_limit)
return CrawlStateSerializer(crawl_state).data, 200

@staticmethod
def stop_crawl(guru_slug, user, crawl_id):
def stop_crawl(user, crawl_id):
from core.serializers import CrawlStateSerializer
user = CrawlService.get_user(user)
guru_type = CrawlService.validate_and_get_guru_type(guru_slug, user)

# Existing stop logic
try:
crawl_state = CrawlState.objects.get(id=crawl_id, guru_type=guru_type)
crawl_state = CrawlState.objects.get(id=crawl_id)
if crawl_state.status == CrawlState.Status.RUNNING:
crawl_state.status = CrawlState.Status.STOPPED
crawl_state.end_time = datetime.now(UTC)
Expand All @@ -602,14 +609,13 @@ def stop_crawl(guru_slug, user, crawl_id):
return {'msg': 'Crawl not found'}, 404

@staticmethod
def get_crawl_status(guru_slug, user, crawl_id):
def get_crawl_status(user, crawl_id):
from core.serializers import CrawlStateSerializer
user = CrawlService.get_user(user)
guru_type = CrawlService.validate_and_get_guru_type(guru_slug, user)

# Existing status logic
try:
crawl_state = CrawlState.objects.get(id=crawl_id, guru_type=guru_type)
crawl_state = CrawlState.objects.get(id=crawl_id)
# Update last_polled_at
crawl_state.last_polled_at = datetime.now(UTC)
crawl_state.save(update_fields=['last_polled_at'])
Expand Down
23 changes: 14 additions & 9 deletions src/gurubase-backend/backend/core/gcp.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,32 @@
import logging
from django.conf import settings

import traceback
from django.core.files.storage import FileSystemStorage as DjangoFileSystemStorage

# storage = GoogleCloudStorage()
logger = logging.getLogger(__name__)


def replace_media_root_with_nginx_base_url(url):
    """Rewrite a media path so it points at the selfhosted nginx proxy.

    Outside the selfhosted environment the value is returned untouched.

    Args:
        url: Path (or URL) of a media file; on selfhosted it is expected to
            contain ``settings.MEDIA_ROOT``.

    Returns:
        ``f'{settings.NGINX_BASE_URL}/media{path}'`` where ``path`` is the part
        of ``url`` following the first occurrence of ``settings.MEDIA_ROOT``.
        Returns ``''`` when ``url`` is empty/None, and ``url`` unchanged when
        the environment is not selfhosted or MEDIA_ROOT does not occur in it.
    """
    # TODO: Update this when selfhosted url setting is added
    if settings.ENV != 'selfhosted':
        return url

    if not url:
        # No exception is active here, so traceback.format_exc() carried no
        # information, and passing it as a %-argument without a placeholder
        # made the logging call itself fail. stack_info records the caller.
        logger.error("URL is None", stack_info=True)
        return ''

    _, separator, path = url.partition(settings.MEDIA_ROOT)
    if not separator:
        # split(...)[1] used to raise IndexError here; pass the value through
        # instead so already-rewritten or external URLs keep working.
        logger.warning("MEDIA_ROOT not found in url: %s", url)
        return url

    return f'{settings.NGINX_BASE_URL}/media{path}'

def replace_media_root_with_base_url(url):
    """Rewrite a media path so it is served from ``settings.BASE_URL``.

    Outside the selfhosted environment the value is returned untouched.

    Args:
        url: Path (or URL) of a media file; on selfhosted it is expected to
            contain ``settings.MEDIA_ROOT``.

    Returns:
        ``f'{settings.BASE_URL}/media{path}'`` where ``path`` is the part of
        ``url`` following the first occurrence of ``settings.MEDIA_ROOT``.
        Returns ``''`` when ``url`` is empty/None, and ``url`` unchanged when
        the environment is not selfhosted or MEDIA_ROOT does not occur in it.
    """
    # TODO: Update this when selfhosted url setting is added
    if settings.ENV != 'selfhosted':
        return url

    if not url:
        # No exception is active here, so traceback.format_exc() carried no
        # information, and passing it as a %-argument without a placeholder
        # made the logging call itself fail. stack_info records the caller.
        logger.error("URL is None", stack_info=True)
        return ''

    _, separator, path = url.partition(settings.MEDIA_ROOT)
    if not separator:
        # split(...)[1] used to raise IndexError here; pass the value through
        # instead so already-rewritten or external URLs keep working.
        logger.warning("MEDIA_ROOT not found in url: %s", url)
        return url

    return f'{settings.BASE_URL}/media{path}'


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def format_response(self, response):
clean_title
).strip()

formatted_msg.append(f"• [_{clean_title}_](<{ref['link']}>)")
formatted_msg.append(f"• [*{clean_title}*](<{ref['link']}>)")

# Add space for frontend link
formatted_msg.append(f":eyes: [_View on Gurubase for a better UX_](<{response['question_url']}>)")
Expand Down Expand Up @@ -453,7 +453,7 @@ async def on_message(message):
' ',
clean_title
).strip()
metadata += f"\n• [_{clean_title}_](<{ref['link']}>)"
metadata += f"\n• [*{clean_title}*](<{ref['link']}>)"

metadata += f"\n:eyes: [_View on Gurubase for a better UX_](<{response['question_url']}>)"

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 4.2.18 on 2025-03-04 12:22

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Make ``CrawlState.guru_type`` optional.

    Alters the foreign key to ``core.GuruType`` so it accepts NULL
    (``null=True, blank=True``) while keeping ``on_delete=CASCADE``.
    """

    dependencies = [
        ('core', '0059_gurucreationform_source'),
    ]

    operations = [
        # Only nullability/blank change; target model and cascade are kept.
        migrations.AlterField(
            model_name='crawlstate',
            name='guru_type',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='core.gurutype'),
        ),
    ]
2 changes: 1 addition & 1 deletion src/gurubase-backend/backend/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1650,7 +1650,7 @@ class Source(models.TextChoices):
end_time = models.DateTimeField(null=True, blank=True)
last_polled_at = models.DateTimeField(auto_now_add=True)
link_limit = models.IntegerField(default=1500)
guru_type = models.ForeignKey(GuruType, on_delete=models.CASCADE)
guru_type = models.ForeignKey(GuruType, on_delete=models.CASCADE, null=True, blank=True)
user = models.ForeignKey(User, on_delete=models.CASCADE, null=True, blank=True) # null on selfhosted

def __str__(self):
Expand Down
2 changes: 1 addition & 1 deletion src/gurubase-backend/backend/core/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
2. Contexts are not the exact answer, but they are relevant information to answer the question.
3. Highlight critical information in bold for emphasis.
4. Explain concepts whenever possible, being informative and helpful.
5. Provide references and links to sources mentioned in the context links and titles when applicable. Do not reference like "Context 1" or "Context 2". Add references like [Title](link) if applicable.
5. Provide references and links to sources mentioned in the context links and titles when applicable. Do not reference like "Context 1" or "Context 2". Add references like [Title](link) if applicable. However, for pdf files, only refer to the pdf title.
6. Demonstrate concepts with examples when possible.
7. Use code blocks for any code snippets.
8. Use exact names from contexts for functions/classes/methods.
Expand Down
2 changes: 1 addition & 1 deletion src/gurubase-backend/backend/core/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,5 +161,5 @@ class Meta:

def to_representation(self, instance):
repr = super().to_representation(instance)
repr['guru_type'] = instance.guru_type.slug
repr['guru_type'] = instance.guru_type.slug if instance.guru_type else None
return repr
9 changes: 5 additions & 4 deletions src/gurubase-backend/backend/core/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1370,14 +1370,14 @@ def process_guru_type(guru_type):

if len(structure) > data_source.guru_type.github_file_count_limit_per_repo_hard:
raise GithubRepoFileCountLimitError(
f"The codebase exceeds the maximum file limit of {data_source.guru_type.github_file_count_limit_per_repo_hard} files supported."
f"The codebase ({len(structure)}) exceeds the maximum file limit of {data_source.guru_type.github_file_count_limit_per_repo_hard} files supported."
)

# Calculate total size
total_size = sum(file['size'] for file in structure)
if total_size > data_source.guru_type.github_repo_size_limit_mb * 1024 * 1024:
raise GithubRepoSizeLimitError(
f"The codebase exceeds the maximum size limit of {data_source.guru_type.github_repo_size_limit_mb} MB supported."
f"The codebase ({total_size / (1024 * 1024):.2f} MB) exceeds the maximum size limit of {data_source.guru_type.github_repo_size_limit_mb} MB supported."
)

# Get existing files for this data source
Expand Down Expand Up @@ -1461,6 +1461,7 @@ def process_guru_type(guru_type):
data_source.in_milvus = False
data_source.error = ""
data_source.user_error = ""
data_source.status = DataSource.Status.SUCCESS
data_source.save()
data_source.write_to_milvus()

Expand Down Expand Up @@ -1488,7 +1489,7 @@ def process_guru_type(guru_type):
data_source.error = error_msg
data_source.status = DataSource.Status.FAIL
if data_source.last_successful_index_date:
user_error = f"An issue occurred while reindexing the codebase. The repository has grown beyond our size limit of {data_source.guru_type.github_repo_size_limit_mb} MB. No worries though - this guru still uses the codebase indexed on {data_source.last_successful_index_date.strftime('%B %d')}. Reindexing will be attempted again later."
user_error = f"An issue occurred while reindexing the codebase. The repository size ({total_size / (1024 * 1024):.2f} MB) has grown beyond our size limit of {data_source.guru_type.github_repo_size_limit_mb} MB. No worries though - this guru still uses the codebase indexed on {data_source.last_successful_index_date.strftime('%B %d')}. Reindexing will be attempted again later."
else:
user_error = str(e)
data_source.user_error = user_error
Expand All @@ -1501,7 +1502,7 @@ def process_guru_type(guru_type):
data_source.error = error_msg
data_source.status = DataSource.Status.FAIL
if data_source.last_successful_index_date:
user_error = f"An issue occurred while reindexing the codebase. The repository has grown beyond our file count limit of {data_source.guru_type.github_file_count_limit_per_repo_hard} files. No worries though - this guru still uses the codebase indexed on {data_source.last_successful_index_date.strftime('%B %d')}. Reindexing will be attempted again later."
user_error = f"An issue occurred while reindexing the codebase. The repository has grown to {len(structure)} files, which exceeds our file count limit of {data_source.guru_type.github_file_count_limit_per_repo_hard} files. No worries though - this guru still uses the codebase indexed on {data_source.last_successful_index_date.strftime('%B %d')}. Reindexing will be attempted again later."
else:
user_error = str(e)
data_source.user_error = user_error
Expand Down
4 changes: 3 additions & 1 deletion src/gurubase-backend/backend/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3167,6 +3167,7 @@ def format_date_updated(date_updated: datetime) -> str:
return date_updated.strftime('%-d %B %Y') if date_updated else None

def format_references(references: list, api: bool = False) -> list:
from core.gcp import replace_media_root_with_base_url
processed_references = []
for reference in references:
if 'question' in reference and 'link' in reference:
Expand Down Expand Up @@ -3205,7 +3206,8 @@ def format_references(references: list, api: bool = False) -> list:
if settings.ENV == 'selfhosted':
for reference in processed_references:
if reference['link'] == pdf_data_source.url:
reference['link'] = reference['link'].replace("/workspace/backend", "")
reference['link'] = replace_media_root_with_base_url(reference['link'])



return processed_references
Expand Down
16 changes: 6 additions & 10 deletions src/gurubase-backend/backend/core/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from core.data_sources import CrawlService
from core.serializers import WidgetIdSerializer, BingeSerializer, DataSourceSerializer, GuruTypeSerializer, GuruTypeInternalSerializer, QuestionCopySerializer, FeaturedDataSourceSerializer, APIKeySerializer, DataSourceAPISerializer, SettingsSerializer
from core.auth import auth, follow_up_examples_auth, jwt_auth, combined_auth, stream_combined_auth, api_key_auth
from core.gcp import replace_media_root_with_localhost, replace_media_root_with_nginx_base_url
from core.gcp import replace_media_root_with_base_url, replace_media_root_with_nginx_base_url
from core.models import CrawlState, FeaturedDataSource, Question, ContentPageStatistics, WidgetId, Binge, DataSource, GuruType, Integration, Thread, APIKey, GuruCreationForm
from accounts.models import User
from core.utils import (
Expand Down Expand Up @@ -1519,7 +1519,7 @@ def get_guru_visuals(request):
guru_type = request.guru_type
response = {
'colors': guru_type.colors,
'icon_url': replace_media_root_with_localhost(guru_type.icon_url),
'icon_url': replace_media_root_with_base_url(guru_type.icon_url),
'name': guru_type.name,
'slug': guru_type.slug,
}
Expand Down Expand Up @@ -2796,10 +2796,9 @@ def start_crawl_api(request, guru_slug):

@api_view(['POST'])
@jwt_auth
def stop_crawl_admin(request, guru_slug, crawl_id):
def stop_crawl_admin(request, crawl_id):
try:
data, return_status = CrawlService.stop_crawl(
guru_slug,
request.user,
crawl_id
)
Expand All @@ -2811,10 +2810,9 @@ def stop_crawl_admin(request, guru_slug, crawl_id):
@api_view(['POST'])
@api_key_auth
@throttle_classes([ConcurrencyThrottleApiKey])
def stop_crawl_api(request, guru_slug, crawl_id):
def stop_crawl_api(request, crawl_id):
try:
data, return_status = CrawlService.stop_crawl(
guru_slug,
request.user,
crawl_id
)
Expand All @@ -2825,10 +2823,9 @@ def stop_crawl_api(request, guru_slug, crawl_id):

@api_view(['GET'])
@jwt_auth
def get_crawl_status_admin(request, guru_slug, crawl_id):
def get_crawl_status_admin(request, crawl_id):
try:
data, return_status = CrawlService.get_crawl_status(
guru_slug,
request.user,
crawl_id
)
Expand All @@ -2840,10 +2837,9 @@ def get_crawl_status_admin(request, guru_slug, crawl_id):
@api_view(['GET'])
@api_key_auth
@throttle_classes([ConcurrencyThrottleApiKey])
def get_crawl_status_api(request, guru_slug, crawl_id):
def get_crawl_status_api(request, crawl_id):
try:
data, return_status = CrawlService.get_crawl_status(
guru_slug,
request.user,
crawl_id
)
Expand Down
8 changes: 4 additions & 4 deletions src/gurubase-frontend/src/app/actions.js
Original file line number Diff line number Diff line change
Expand Up @@ -1152,10 +1152,10 @@ export async function startCrawl(url, guruSlug) {
}
}

export async function stopCrawl(crawlId, guruSlug) {
export async function stopCrawl(crawlId) {
try {
const response = await makeAuthenticatedRequest(
`${process.env.NEXT_PUBLIC_BACKEND_FETCH_URL}/${guruSlug}/crawl/${crawlId}/stop/`,
`${process.env.NEXT_PUBLIC_BACKEND_FETCH_URL}/crawl/${crawlId}/stop/`,
{
method: "POST",
headers: { "Content-Type": "application/json" }
Expand All @@ -1172,10 +1172,10 @@ export async function stopCrawl(crawlId, guruSlug) {
}
}

export async function getCrawlStatus(crawlId, guruSlug) {
export async function getCrawlStatus(crawlId) {
try {
const response = await makeAuthenticatedRequest(
`${process.env.NEXT_PUBLIC_BACKEND_FETCH_URL}/${guruSlug}/crawl/${crawlId}/status/`,
`${process.env.NEXT_PUBLIC_BACKEND_FETCH_URL}/crawl/${crawlId}/status/`,
{
method: "GET",
headers: { "Content-Type": "application/json" }
Expand Down
5 changes: 3 additions & 2 deletions src/gurubase-frontend/src/components/Content/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,7 @@ const Content = (props) => {
<Clock className="w-8 h-8 text-white" />
</div>
<h1 className="text-2xl font-semibold">Bot Conversation</h1>
<p className="text-md text-muted-foreground text-center max-w-lg">
<p className="text-md text-muted-foreground text-center max-w-lg mx-4">
This binge is from a conversation on{" "}
{finalSource?.charAt(0).toUpperCase() +
finalSource?.slice(1).toLowerCase()}
Expand Down Expand Up @@ -763,7 +763,8 @@ const Content = (props) => {
{/* Mobile Binge Map section */}
{typeof window !== "undefined" && slug && (
<>
{finalBingeId &&
{treeData?.children?.length > 0 &&
finalBingeId &&
!isBingeMapOpen &&
!isLoading &&
!streamingStatus &&
Expand Down
4 changes: 3 additions & 1 deletion src/gurubase-frontend/src/components/GuruEditPageSidebar.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,9 @@ export default function GuruEditPageSidebar({ guruData }) {
variant="outline"
size="smButtonLgText"
className="w-full text-black hover:bg-gray-800 hover:text-white rounded-full"
onClick={() => handleNavigation(`/g/${guruSlug}`)}>
onClick={() =>
window.open(`/g/${guruSlug}`, "_blank", "noopener,noreferrer")
}>
<div className="inline-flex items-center gap-1">
<span>Visit Guru</span>
<svg
Expand Down
Loading

0 comments on commit 02351a1

Please sign in to comment.