Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions backend/apps/ai/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@ ai-update-project-context:
@echo "Updating project context"
@CMD="python manage.py ai_update_project_context" $(MAKE) exec-backend-command

# Run the `ai_update_repository_chunks` management command through the
# `exec-backend-command` target.
ai-update-repository-chunks:
@echo "Updating repository chunks"
@CMD="python manage.py ai_update_repository_chunks" $(MAKE) exec-backend-command

# Run the `ai_update_repository_context` management command through the
# `exec-backend-command` target.
ai-update-repository-context:
@echo "Updating repository context"
@CMD="python manage.py ai_update_repository_context" $(MAKE) exec-backend-command

# Run the `ai_update_slack_message_chunks` management command through the
# `exec-backend-command` target.
ai-update-slack-message-chunks:
@echo "Updating Slack message chunks"
@CMD="python manage.py ai_update_slack_message_chunks" $(MAKE) exec-backend-command
Expand Down
13 changes: 9 additions & 4 deletions backend/apps/ai/common/base/chunk_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from apps.ai.common.utils import create_chunks_and_embeddings
from apps.ai.models.chunk import Chunk
from apps.ai.models.context import Context
from apps.common.utils import is_valid_json


class BaseChunkCommand(BaseAICommand):
Expand Down Expand Up @@ -43,10 +44,14 @@ def process_chunks_batch(self, entities: list[Model]) -> int:
count, _ = context.chunks.all().delete()
self.stdout.write(f"Deleted {count} stale chunks for {entity_key}")

prose_content, metadata_content = self.extract_content(entity)
full_content = (
f"{metadata_content}\n\n{prose_content}" if metadata_content else prose_content
)
content, metadata_content = self.extract_content(entity)

if is_valid_json(content):
full_content = content
else:
full_content = (
f"{metadata_content}\n\n{content}" if metadata_content else content
)

if not full_content.strip():
self.stdout.write(f"No content to chunk for {self.entity_name} {entity_key}")
Expand Down
2 changes: 2 additions & 0 deletions backend/apps/ai/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@
DEFAULT_CHUNKS_RETRIEVAL_LIMIT = 5
DEFAULT_SIMILARITY_THRESHOLD = 0.4
DELIMITER = "\n\n"
GITHUB_REQUEST_INTERVAL_SECONDS = 0.5
MIN_REQUEST_INTERVAL_SECONDS = 1.2
QUEUE_RESPONSE_TIME_MINUTES = 1
177 changes: 177 additions & 0 deletions backend/apps/ai/common/extractors/repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
"""Content extractor for Repository."""

import json
import logging
import time

from apps.ai.common.constants import DELIMITER, GITHUB_REQUEST_INTERVAL_SECONDS
from apps.common.utils import is_valid_json
from apps.github.utils import get_repository_file_content

logger = logging.getLogger(__name__)


# Markdown files that are always requested for a repository, in fetch order.
_BASE_MARKDOWN_FILES = ("README.md", "index.md", "info.md", "leaders.md")


def _build_repository_data(repository) -> dict:
    """Collect the repository's truthy attributes into a JSON-serializable dict.

    Falsy values are omitted, and a nested section (status, funding, features,
    statistics, dates, ownership) is only added when it has at least one entry.

    Args:
        repository: Repository instance

    Returns:
        dict: Structured repository data, keys in a stable insertion order.

    """
    data = {}

    # These attributes are stored under a key equal to the attribute name.
    for attribute in ("name", "key", "description", "homepage", "license", "topics"):
        value = getattr(repository, attribute)
        if value:
            data[attribute] = value

    status = {
        label: True
        for attribute, label in (
            ("is_archived", "archived"),
            ("is_empty", "empty"),
            ("is_owasp_repository", "owasp_repository"),
            ("is_owasp_site_repository", "owasp_site_repository"),
        )
        if getattr(repository, attribute)
    }
    if status:
        data["status"] = status

    funding = {
        label: True
        for attribute, label in (
            ("is_funding_policy_compliant", "policy_compliant"),
            ("has_funding_yml", "has_funding_yml"),
        )
        if getattr(repository, attribute)
    }
    if funding:
        data["funding"] = funding

    if repository.pages_status:
        data["pages_status"] = repository.pages_status

    features = [
        feature
        for feature in ("downloads", "issues", "pages", "projects", "wiki")
        if getattr(repository, f"has_{feature}")
    ]
    if features:
        data["features"] = features

    statistics = {}
    for label in (
        "commits",
        "contributors",
        "forks",
        "open_issues",
        "stars",
        "subscribers",
        "watchers",
    ):
        count = getattr(repository, f"{label}_count")
        if count:
            statistics[label] = count
    if statistics:
        data["statistics"] = statistics

    dates = {}
    for attribute, label in (
        ("created_at", "created"),
        ("updated_at", "last_updated"),
        ("pushed_at", "last_pushed"),
    ):
        moment = getattr(repository, attribute)
        if moment:
            dates[label] = moment.strftime("%Y-%m-%d")
    if dates:
        data["dates"] = dates

    ownership = {}
    if repository.organization:
        ownership["organization"] = repository.organization.login
    if repository.owner:
        ownership["owner"] = repository.owner.login
    if ownership:
        data["ownership"] = ownership

    return data


def _list_tab_files(owner: str, repository_key: str, branch: str) -> list[str]:
    """Return the names of ``tab_*.md`` files found in the repository contents listing.

    Args:
        owner: Login of the organization or user that owns the repository
        repository_key: Repository key used in the GitHub URL
        branch: Branch to list

    Returns:
        list[str]: Matching file names, or an empty list when the listing is
            missing, is not valid JSON, or is not a JSON array.

    """
    contents_url = (
        f"https://api.github.com/repos/{owner}/{repository_key}/contents/?ref={branch}"
    )
    response = get_repository_file_content(contents_url)
    if not (response and is_valid_json(response)):
        return []

    items = json.loads(response)
    # Only a JSON array can hold file entries. Other valid payloads (an object,
    # `null`, a scalar) must not be iterated: some of them would raise TypeError.
    if not isinstance(items, list):
        return []

    tab_files = []
    for item in items:
        if not isinstance(item, dict):
            continue
        name = item.get("name", "")
        # Guard against a non-string name (e.g. JSON null) before string checks.
        if isinstance(name, str) and name.startswith("tab_") and name.endswith(".md"):
            tab_files.append(name)
    return tab_files


def _fetch_markdown_files(
    owner: str, repository_key: str, branch: str, file_names: list[str]
) -> dict[str, str]:
    """Download the given files and return the ones with non-blank content.

    Args:
        owner: Login of the organization or user that owns the repository
        repository_key: Repository key used in the GitHub URL
        branch: Branch to read the files from
        file_names: File paths to request, relative to the repository root

    Returns:
        dict[str, str]: Mapping of file path to its content.

    """
    fetched = {}
    for file_name in file_names:
        raw_url = (
            f"https://raw.githubusercontent.com/{owner}/{repository_key}/"
            f"{branch}/{file_name}"
        )
        try:
            content = get_repository_file_content(raw_url)
            if content and content.strip():
                fetched[file_name] = content
            # Space out consecutive requests by the configured interval.
            time.sleep(GITHUB_REQUEST_INTERVAL_SECONDS)
        except (ValueError, TypeError, OSError):
            # Best effort: a file that cannot be fetched is simply left out.
            logger.debug("Failed to fetch markdown file")
            continue
    return fetched


def extract_repository_content(repository) -> tuple[str, str]:
    """Extract structured content from repository data.

    Builds a JSON document from the repository's attributes and, when the
    owner and key are known, from its markdown files (a fixed set plus any
    ``tab_*.md`` files), together with a short plain-text metadata summary.

    Args:
        repository: Repository instance

    Returns:
        tuple[str, str]: (json_content, metadata_content)

    """
    repository_data = _build_repository_data(repository)

    # The organization login takes precedence over the owner login for URLs.
    if repository.organization:
        owner = repository.organization.login
    else:
        owner = repository.owner.login if repository.owner else ""
    branch = repository.default_branch or "main"

    # Without both an owner and a key no GitHub URL can be built, so skip all
    # network access in one place instead of re-checking per file.
    if owner and repository.key:
        tab_files = _list_tab_files(owner, repository.key, branch)
        markdown_content = _fetch_markdown_files(
            owner,
            repository.key,
            branch,
            [*_BASE_MARKDOWN_FILES, *tab_files],
        )
        if markdown_content:
            repository_data["markdown_content"] = markdown_content

    json_content = json.dumps(repository_data, indent=2)

    metadata_parts = []
    if repository.name:
        metadata_parts.append(f"Repository Name: {repository.name}")
    if repository.key:
        metadata_parts.append(f"Repository Key: {repository.key}")
    if repository.organization:
        metadata_parts.append(f"Organization: {repository.organization.login}")
    if repository.owner:
        metadata_parts.append(f"Owner: {repository.owner.login}")

    return (
        json_content,
        DELIMITER.join(filter(None, metadata_parts)),
    )
41 changes: 41 additions & 0 deletions backend/apps/ai/management/commands/ai_update_repository_chunks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""A command to create chunks of OWASP repository data for RAG."""

from django.db.models import QuerySet

from apps.ai.common.base.chunk_command import BaseChunkCommand
from apps.ai.common.extractors.repository import extract_repository_content
from apps.github.models.repository import Repository


class Command(BaseChunkCommand):
    """Chunk-building command for OWASP repository content."""

    key_field_name = "key"
    model_class = Repository

    def __init__(self, *args, **kwargs):
        """Set up the command and its plural entity label."""
        super().__init__(*args, **kwargs)
        self.entity_name_plural = "repositories"

    def extract_content(self, entity: Repository) -> tuple[str, str]:
        """Return the extracted content pair for the given repository."""
        extracted = extract_repository_content(entity)
        return extracted

    def get_base_queryset(self) -> QuerySet:
        """Limit the parent queryset to non-archived, non-empty OWASP site repositories."""
        site_repository_filters = {
            "is_owasp_site_repository": True,
            "is_archived": False,
            "is_empty": False,
        }
        parent_queryset = super().get_base_queryset()
        return parent_queryset.filter(**site_repository_filters)

    def get_default_queryset(self) -> QuerySet:
        """Use the base queryset as-is; Repository has no is_active field to filter on."""
        return self.get_base_queryset()

    def source_name(self) -> str:
        """Name of the source recorded when creating context."""
        return "owasp_repository"
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""A command to update context for OWASP repository data."""

from django.db.models import QuerySet

from apps.ai.common.base.context_command import BaseContextCommand
from apps.ai.common.extractors.repository import extract_repository_content
from apps.github.models.repository import Repository


class Command(BaseContextCommand):
    """Context-updating command for OWASP repository content."""

    key_field_name = "key"
    model_class = Repository

    def __init__(self, *args, **kwargs):
        """Set up the command and its plural entity label."""
        super().__init__(*args, **kwargs)
        self.entity_name_plural = "repositories"

    def extract_content(self, entity: Repository) -> tuple[str, str]:
        """Return the extracted content pair for the given repository."""
        extracted = extract_repository_content(entity)
        return extracted

    def get_base_queryset(self) -> QuerySet:
        """Limit the parent queryset to non-archived, non-empty OWASP site repositories."""
        site_repository_filters = {
            "is_owasp_site_repository": True,
            "is_archived": False,
            "is_empty": False,
        }
        parent_queryset = super().get_base_queryset()
        return parent_queryset.filter(**site_repository_filters)

    def get_default_queryset(self) -> QuerySet:
        """Use the base queryset as-is; Repository has no is_active field to filter on."""
        return self.get_base_queryset()

    def source_name(self) -> str:
        """Name of the source recorded when creating context."""
        return "owasp_repository"
4 changes: 2 additions & 2 deletions backend/apps/ai/models/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def bulk_save(chunks, fields=None):
def split_text(text: str) -> list[str]:
    """Split text into chunks.

    Chunks are at most 500 units long with an 80-unit overlap, where length is
    measured with ``len``. The separators are listed from coarsest (blank
    line) to finest (empty string).

    Args:
        text: The text to split

    Returns:
        list[str]: The resulting chunks.

    """
    # Each size argument must be passed exactly once: repeating a keyword
    # argument in a call is a SyntaxError.
    return RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=80,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    ).split_text(text)
Expand Down
18 changes: 18 additions & 0 deletions backend/apps/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import json
import re
from datetime import UTC, datetime
from urllib.parse import urlparse
Expand Down Expand Up @@ -102,6 +103,23 @@ def get_user_ip_address(request) -> str:
return x_forwarded_for.split(",")[0] if x_forwarded_for else request.META.get("REMOTE_ADDR")


def is_valid_json(content: str) -> bool:
    """Report whether the given content parses as JSON.

    Any JSON value counts, including bare scalars such as numbers.

    Args:
        content: The content to check

    Returns:
        bool: True when ``json.loads`` accepts the content, False when it
            raises ``TypeError`` or ``ValueError``.

    """
    try:
        json.loads(content)
    except (TypeError, ValueError):
        parsed_ok = False
    else:
        parsed_ok = True
    return parsed_ok


def join_values(fields: list, delimiter: str = " ") -> str:
"""Join non-empty field values using a specified delimiter.

Expand Down
9 changes: 9 additions & 0 deletions backend/apps/slack/MANIFEST.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ features:
description: OWASP users list
usage_hint: <user>
should_escape: false
- command: /ai
url: https://nest.owasp.org/integrations/slack/commands/
description: AI-powered OWASP Nest assistant
usage_hint: <your question>
should_escape: false
oauth_config:
scopes:
user:
Expand All @@ -103,6 +108,7 @@ oauth_config:
- mpim:read
- users:read
bot:
- app_mentions:read
- channels:read
- chat:write
- commands
Expand All @@ -115,6 +121,7 @@ oauth_config:
- users:read
- groups:write
- channels:manage
- channels:history
settings:
event_subscriptions:
request_url: https://nest.owasp.org/integrations/slack/events/
Expand All @@ -123,7 +130,9 @@ settings:
- team_join
bot_events:
- app_home_opened
- app_mention
- member_joined_channel
- message.channels
- team_join
interactivity:
is_enabled: true
Expand Down
1 change: 1 addition & 0 deletions backend/apps/slack/admin/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class ConversationAdmin(admin.ModelAdmin):
"is_private",
"is_archived",
"is_general",
"is_nest_bot_assistant_enabled",
)
},
),
Expand Down
Loading