4 changes: 4 additions & 0 deletions backend/apps/ai/Makefile
@@ -1,3 +1,7 @@
+ai-create-chapter-chunks:
+	@echo "Creating chapter chunks"
+	@CMD="python manage.py ai_create_chapter_chunks" $(MAKE) exec-backend-command
+
 ai-create-slack-message-chunks:
 	@echo "Creating Slack message chunks"
 	@CMD="python manage.py ai_create_slack_message_chunks" $(MAKE) exec-backend-command
7 changes: 2 additions & 5 deletions backend/apps/ai/admin.py
@@ -8,13 +8,10 @@
 class ChunkAdmin(admin.ModelAdmin):
     list_display = (
         "id",
-        "message",
         "text",
-    )
-    search_fields = (
-        "message__slack_message_id",
-        "text",
+        "content_type",
     )
+    search_fields = ("text", "object_id")
 
 
 admin.site.register(Chunk, ChunkAdmin)
backend/apps/ai/common/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions backend/apps/ai/common/constants.py
@@ -0,0 +1,5 @@
"""AI app constants."""

DEFAULT_LAST_REQUEST_OFFSET_SECONDS = 2
DELIMITER = "\n\n"
MIN_REQUEST_INTERVAL_SECONDS = 1.2
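
These constants throttle the OpenAI embedding calls made by both chunk commands in this PR. A condensed sketch of the pattern (the real commands keep last_request_time on the command instance rather than at module scope):

import time
from datetime import UTC, datetime, timedelta

from apps.ai.common.constants import (
    DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
    MIN_REQUEST_INTERVAL_SECONDS,
)

# Assume the previous request happened DEFAULT_LAST_REQUEST_OFFSET_SECONDS ago.
last_request_time = datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS)

def wait_for_request_slot() -> datetime:
    """Sleep until at least MIN_REQUEST_INTERVAL_SECONDS separates consecutive calls."""
    elapsed = datetime.now(UTC) - last_request_time
    if elapsed < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
        time.sleep(MIN_REQUEST_INTERVAL_SECONDS - elapsed.total_seconds())
    return datetime.now(UTC)  # callers store this as the new last_request_time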
211 changes: 211 additions & 0 deletions backend/apps/ai/management/commands/ai_create_chapter_chunks.py
@@ -0,0 +1,211 @@
"""A command to create chunks of OWASP chapter data for RAG."""

import os
import time
from datetime import UTC, datetime, timedelta

import openai
from django.core.management.base import BaseCommand

from apps.ai.common.constants import (
DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
DELIMITER,
MIN_REQUEST_INTERVAL_SECONDS,
)
from apps.ai.models.chunk import Chunk
from apps.owasp.models.chapter import Chapter


class Command(BaseCommand):
help = "Create chunks for OWASP chapter data"

def add_arguments(self, parser):
parser.add_argument(
"--chapter",
type=str,
help="Process only the chapter with this key",
)
parser.add_argument(
"--all",
action="store_true",
help="Process all the chapters",
Comment on lines +29 to +31

🧹 Nitpick (assertive)

Clarify that --all includes inactive chapters.

The help text should explicitly mention that this option processes both active and inactive chapters to avoid confusion.

         parser.add_argument(
             "--all",
             action="store_true",
-            help="Process all the chapters",
+            help="Process all chapters (including inactive)",
         )
🤖 Prompt for AI Agents
In backend/apps/ai/management/commands/ai_create_chapter_chunks.py around lines
29 to 31, update the help text for the "--all" argument to explicitly state that
it processes both active and inactive chapters. Modify the help string to
clarify this behavior so users understand that using "--all" includes inactive
chapters as well.

        )
        parser.add_argument(
            "--batch-size",
            type=int,
            default=50,
            help="Number of chapters to process in each batch",
Comment on lines +34 to +37

🧹 Nitpick (assertive)

Add validation for batch-size argument.

Consider adding validation to ensure batch-size is positive.

         parser.add_argument(
             "--batch-size",
             type=int,
             default=50,
             help="Number of chapters to process in each batch",
+            choices=range(1, 1001),
+            metavar="{1..1000}",
         )

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In backend/apps/ai/management/commands/ai_create_chapter_chunks.py around lines
34 to 37, the batch-size argument lacks validation to ensure it is positive. Add
a check after parsing the batch-size argument to verify it is greater than zero,
and raise an appropriate error or exit with a message if the value is zero or
negative. This will prevent invalid batch sizes from being used during
processing.
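
An alternative to the choices-based suggestion is a validating argparse type, sketched below (positive_int is illustrative, not part of this PR):

import argparse

def positive_int(value: str) -> int:
    """Argparse type accepting only integers >= 1."""
    number = int(value)
    if number < 1:
        raise argparse.ArgumentTypeError(f"expected a positive integer, got {value!r}")
    return number

# usage: parser.add_argument("--batch-size", type=positive_int, default=50, ...)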

        )

    def handle(self, *args, **options):
        if not (openai_api_key := os.getenv("DJANGO_OPEN_AI_SECRET_KEY")):
            self.stdout.write(
                self.style.ERROR("DJANGO_OPEN_AI_SECRET_KEY environment variable not set")
            )
            return

        self.openai_client = openai.OpenAI(api_key=openai_api_key)

        if chapter := options["chapter"]:
            queryset = Chapter.objects.filter(key=chapter)
        elif options["all"]:
            queryset = Chapter.objects.all()
        else:
            queryset = Chapter.objects.filter(is_active=True)

        if not (total_chapters := queryset.count()):
            self.stdout.write("No chapters found to process")
            return

        self.stdout.write(f"Found {total_chapters} chapters to process")

        batch_size = options["batch_size"]
        for offset in range(0, total_chapters, batch_size):
            batch_chapters = queryset[offset : offset + batch_size]

            batch_chunks = []
            for chapter in batch_chapters:
                batch_chunks.extend(self.create_chunks(chapter))

            if batch_chunks:
                Chunk.bulk_save(batch_chunks)
                self.stdout.write(f"Saved {len(batch_chunks)} chunks")

Comment on lines +63 to +73

🧹 Nitpick (assertive)

Add progress tracking for better user feedback.

Consider adding progress indicators showing which batch is being processed.

         batch_size = options["batch_size"]
+        batch_count = 0
         for offset in range(0, total_chapters, batch_size):
+            batch_count += 1
+            self.stdout.write(f"Processing batch {batch_count}/{(total_chapters + batch_size - 1) // batch_size}...")
             batch_chapters = queryset[offset : offset + batch_size]
🧰 Tools
🪛 Flake8 (7.2.0)

[error] 64-64: whitespace before ':'

(E203)
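
(E203 here is the familiar Black-vs-Flake8 conflict: Black puts spaces around the colon in slices with complex expressions, so Black-formatted projects typically extend-ignore E203 rather than reflow the code.)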

🤖 Prompt for AI Agents
In backend/apps/ai/management/commands/ai_create_chapter_chunks.py around lines
63 to 73, add progress tracking by including a print or log statement before
processing each batch to indicate the current batch number and total batches.
Calculate the total number of batches based on total_chapters and batch_size,
then output progress like "Processing batch X of Y" before processing each batch
to provide better user feedback.
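
The batch-count expression in the suggestion is plain ceiling division; an equivalent form, if the integer idiom reads poorly:

import math

total_batches = math.ceil(total_chapters / batch_size)
# same result as (total_chapters + batch_size - 1) // batch_size for positive ints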

self.stdout.write(f"Completed processing all {total_chapters} chapters")

def create_chunks(self, chapter: Chapter) -> list[Chunk]:
"""Create chunks from a chapter's data."""
prose_content, metadata_content = self.extract_chapter_content(chapter)

all_chunk_texts = []

if metadata_content.strip():
all_chunk_texts.append(metadata_content)

if prose_content.strip():
all_chunk_texts.extend(Chunk.split_text(prose_content))

if not all_chunk_texts:
self.stdout.write(f"No content to chunk for chapter {chapter.key}")
return []

try:
time_since_last_request = datetime.now(UTC) - getattr(
self,
"last_request_time",
datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
)

if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
time.sleep(MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds())

response = self.openai_client.embeddings.create(
input=all_chunk_texts,
model="text-embedding-3-small",
)
self.last_request_time = datetime.now(UTC)

return [
chunk
for text, embedding in zip(
all_chunk_texts,
[d.embedding for d in response.data],
strict=True,
)
if (
chunk := Chunk.update_data(
text=text,
content_object=chapter,
embedding=embedding,
save=False,
)
)
]
except openai.OpenAIError as e:
self.stdout.write(self.style.ERROR(f"OpenAI API error for chapter {chapter.key}: {e}"))
return []

    def extract_chapter_content(self, chapter: Chapter) -> tuple[str, str]:
        """Extract and separate prose content from metadata for a chapter.

        Returns:
            tuple[str, str]: (prose_content, metadata_content)

        """
        prose_parts = []
        metadata_parts = []

        if chapter.description:
            prose_parts.append(f"Description: {chapter.description}")

        if chapter.summary:
            prose_parts.append(f"Summary: {chapter.summary}")

        if hasattr(chapter, "owasp_repository") and chapter.owasp_repository:
            repo = chapter.owasp_repository
            if repo.description:
                prose_parts.append(f"Repository Description: {repo.description}")
            if repo.topics:
                metadata_parts.append(f"Repository Topics: {', '.join(repo.topics)}")

        if chapter.name:
            metadata_parts.append(f"Chapter Name: {chapter.name}")

        location_parts = []
        if chapter.country:
            location_parts.append(f"Country: {chapter.country}")
        if chapter.region:
            location_parts.append(f"Region: {chapter.region}")
        if chapter.postal_code:
            location_parts.append(f"Postal Code: {chapter.postal_code}")
        if chapter.suggested_location:
            location_parts.append(f"Location: {chapter.suggested_location}")

        if location_parts:
            metadata_parts.append(f"Location Information: {', '.join(location_parts)}")

        if chapter.level:
            metadata_parts.append(f"Chapter Level: {chapter.level}")

        if chapter.currency:
            metadata_parts.append(f"Currency: {chapter.currency}")

        if chapter.meetup_group:
            metadata_parts.append(f"Meetup Group: {chapter.meetup_group}")

        if chapter.tags:
            metadata_parts.append(f"Tags: {', '.join(chapter.tags)}")

        if chapter.topics:
            metadata_parts.append(f"Topics: {', '.join(chapter.topics)}")

        if chapter.leaders_raw:
            leaders_info = []
            for leader in chapter.leaders_raw:
                if isinstance(leader, dict):
                    leader_name = leader.get("name", "")
                    leader_email = leader.get("email", "")
                    if leader_name:
                        leader_text = f"Leader: {leader_name}"
                        if leader_email:
                            leader_text += f" ({leader_email})"
                        leaders_info.append(leader_text)

            if leaders_info:
                metadata_parts.append(f"Chapter Leaders: {', '.join(leaders_info)}")

        if chapter.related_urls:
            valid_urls = [
                url
                for url in chapter.related_urls
                if url and url not in (chapter.invalid_urls or [])
            ]
            if valid_urls:
                metadata_parts.append(f"Related URLs: {', '.join(valid_urls)}")

        metadata_parts.append(f"Active Chapter: {'Yes' if chapter.is_active else 'No'}")

        return (
            DELIMITER.join(filter(None, prose_parts)),
            DELIMITER.join(filter(None, metadata_parts)),
        )
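
For reference, chunks written by this command can be read back through the generic relation; a minimal sketch using the models from this PR (the slice is illustrative):

from django.contrib.contenttypes.models import ContentType

from apps.ai.models.chunk import Chunk
from apps.owasp.models.chapter import Chapter

chapter_type = ContentType.objects.get_for_model(Chapter)
for chunk in Chunk.objects.filter(content_type=chapter_type)[:5]:
    # content_object resolves back to the originating Chapter
    print(chunk.content_object.key, chunk.text[:50])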
backend/apps/ai/management/commands/ai_create_slack_message_chunks.py
@@ -7,12 +7,13 @@
 import openai
 from django.core.management.base import BaseCommand
 
+from apps.ai.common.constants import (
+    DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
+    MIN_REQUEST_INTERVAL_SECONDS,
+)
 from apps.ai.models.chunk import Chunk
 from apps.slack.models.message import Message
 
-MIN_REQUEST_INTERVAL_SECONDS = 1.2
-DEFAULT_LAST_REQUEST_OFFSET_SECONDS = 2
-
 
 class Command(BaseCommand):
     help = "Create chunks for Slack messages"
@@ -78,10 +79,10 @@ def create_chunks(self, message: Message) -> list[Chunk]:
             )
             if (
                 chunk := Chunk.update_data(
-                    text=text,
+                    content_object=message,
                     embedding=embedding,
-                    message=message,
                     save=False,
+                    text=text,
                 )
             )
         ]
@@ -0,0 +1,43 @@
# Generated by Django 5.2.3 on 2025-07-01 10:39

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("ai", "0003_alter_chunk_options_alter_chunk_embedding_and_more"),
        ("contenttypes", "0002_remove_content_type_name"),
    ]

    operations = [
        migrations.AlterUniqueTogether(
            name="chunk",
            unique_together=set(),
        ),
        migrations.AddField(
            model_name="chunk",
            name="content_type",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                to="contenttypes.contenttype",
            ),
        ),
        migrations.AddField(
            model_name="chunk",
            name="object_id",
            field=models.PositiveIntegerField(default=0),
        ),
        migrations.RemoveField(
            model_name="chunk",
            name="message",
        ),
        # The old unique constraint must be dropped (and the message field
        # removed) before the new generic-key constraint can be added.
        migrations.AlterUniqueTogether(
            name="chunk",
            unique_together={("content_type", "object_id", "text")},
        ),
    ]
36 changes: 25 additions & 11 deletions backend/apps/ai/models/chunk.py
@@ -1,30 +1,38 @@
"""Slack app chunk model."""
"""AI app chunk model."""

from django.contrib.contenttypes.fields import GenericForeignKey
from django.contrib.contenttypes.models import ContentType
from django.db import models
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pgvector.django import VectorField

from apps.common.models import BulkSaveModel, TimestampedModel
from apps.common.utils import truncate
from apps.slack.models.message import Message


class Chunk(TimestampedModel):
"""Slack Chunk model."""
"""AI Chunk model for storing text chunks with embeddings."""

class Meta:
db_table = "ai_chunks"
verbose_name = "Chunk"
unique_together = ("message", "text")
unique_together = ("content_type", "object_id", "text")

content_object = GenericForeignKey("content_type", "object_id")
content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE, blank=True, null=True)
embedding = VectorField(verbose_name="Embedding", dimensions=1536)
message = models.ForeignKey(Message, on_delete=models.CASCADE, related_name="chunks")
object_id = models.PositiveIntegerField(default=0)
text = models.TextField(verbose_name="Text")

def __str__(self):
"""Human readable representation."""
content_name = (
getattr(self.content_object, "name", None)
or getattr(self.content_object, "key", None)
or str(self.content_object)
)
return (
f"Chunk {self.id} for Message {self.message.slack_message_id}: "
f"Chunk {self.id} for {self.content_type.model} {content_name}: "
f"{truncate(self.text, 50)}"
)

@@ -46,7 +54,7 @@ def split_text(text: str) -> list[str]:
     @staticmethod
     def update_data(
         text: str,
-        message: Message,
+        content_object,
         embedding,
         *,
         save: bool = True,
@@ -55,18 +63,24 @@
 
         Args:
             text (str): The text content of the chunk.
-            message (Message): The message this chunk belongs to.
+            content_object: The object this chunk belongs to (Message, Chapter, etc.).
             embedding (list): The embedding vector for the chunk.
             save (bool): Whether to save the chunk to the database.
 
         Returns:
-            Chunk: The updated chunk instance.
+            Chunk: The updated chunk instance or None if it already exists.
 
         """
-        if Chunk.objects.filter(message=message, text=text).exists():
+        content_type = ContentType.objects.get_for_model(content_object)
+
+        if Chunk.objects.filter(
+            content_type=content_type, object_id=content_object.id, text=text
+        ).exists():
             return None
 
-        chunk = Chunk(message=message, text=text, embedding=embedding)
+        chunk = Chunk(
+            content_type=content_type, object_id=content_object.id, text=text, embedding=embedding
+        )
 
         if save:
             chunk.save()
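
Taken together, a sketch of how a caller is expected to use the reworked Chunk API after this change (the embedding below is a placeholder, not real OpenAI output; chapter is any model instance):

texts = Chunk.split_text(long_prose)  # long_prose: whatever text is being embedded
chunks = [
    chunk
    for text in texts
    if (
        chunk := Chunk.update_data(
            text=text,
            content_object=chapter,  # e.g. a Chapter or Message instance
            embedding=[0.0] * 1536,  # placeholder vector matching dimensions=1536
            save=False,
        )
    )
]
Chunk.bulk_save(chunks)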