4 changes: 4 additions & 0 deletions backend/apps/ai/Makefile
@@ -1,3 +1,7 @@
+ai-create-chapter-chunks:
+	@echo "Creating chapter chunks"
+	@CMD="python manage.py ai_create_chapter_chunks" $(MAKE) exec-backend-command
+
 ai-create-slack-message-chunks:
 	@echo "Creating Slack message chunks"
 	@CMD="python manage.py ai_create_slack_message_chunks" $(MAKE) exec-backend-command
7 changes: 2 additions & 5 deletions backend/apps/ai/admin.py
@@ -8,13 +8,10 @@
 class ChunkAdmin(admin.ModelAdmin):
     list_display = (
         "id",
-        "message",
         "text",
-    )
-    search_fields = (
-        "message__slack_message_id",
-        "text",
+        "content_type",
     )
+    search_fields = ("text", "object_id")
 
 
 admin.site.register(Chunk, ChunkAdmin)
backend/apps/ai/common/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions backend/apps/ai/common/constants.py
@@ -0,0 +1,5 @@
"""AI app constants."""

DEFAULT_LAST_REQUEST_OFFSET_SECONDS = 2
DELIMITER = "\n\n"
MIN_REQUEST_INTERVAL_SECONDS = 1.2
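
These constants throttle the OpenAI embedding calls made by both chunk commands in this PR. A condensed sketch of the pattern (the real commands keep last_request_time on the command instance rather than at module scope):

import time
from datetime import UTC, datetime, timedelta

from apps.ai.common.constants import (
    DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
    MIN_REQUEST_INTERVAL_SECONDS,
)

# Assume the previous request happened DEFAULT_LAST_REQUEST_OFFSET_SECONDS ago.
last_request_time = datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS)

def wait_for_request_slot() -> datetime:
    """Sleep until at least MIN_REQUEST_INTERVAL_SECONDS separates consecutive calls."""
    elapsed = datetime.now(UTC) - last_request_time
    if elapsed < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
        time.sleep(MIN_REQUEST_INTERVAL_SECONDS - elapsed.total_seconds())
    return datetime.now(UTC)  # callers store this as the new last_request_time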
211 changes: 211 additions & 0 deletions backend/apps/ai/management/commands/ai_create_chapter_chunks.py
@@ -0,0 +1,211 @@
"""A command to create chunks of OWASP chapter data for RAG."""

import os
import time
from datetime import UTC, datetime, timedelta

import openai
from django.core.management.base import BaseCommand

from apps.ai.common.constants import (
DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
DELIMITER,
MIN_REQUEST_INTERVAL_SECONDS,
)
from apps.ai.models.chunk import Chunk
from apps.owasp.models.chapter import Chapter


class Command(BaseCommand):
help = "Create chunks for OWASP chapter data"

def add_arguments(self, parser):
parser.add_argument(
"--chapter",
type=str,
help="Process only the chapter with this key",
)
parser.add_argument(
"--all",
action="store_true",
help="Process all the chapters",
Comment on lines +29 to +31

🧹 Nitpick (assertive)

Clarify that --all includes inactive chapters.

The help text should explicitly mention that this option processes both active and inactive chapters to avoid confusion.

         parser.add_argument(
             "--all",
             action="store_true",
-            help="Process all the chapters",
+            help="Process all chapters (including inactive)",
         )
🤖 Prompt for AI Agents
In backend/apps/ai/management/commands/ai_create_chapter_chunks.py around lines
29 to 31, update the help text for the "--all" argument to explicitly state that
it processes both active and inactive chapters. Modify the help string to
clarify this behavior so users understand that using "--all" includes inactive
chapters as well.

        )
        parser.add_argument(
            "--batch-size",
            type=int,
            default=50,
            help="Number of chapters to process in each batch",
Comment on lines +34 to +37

🧹 Nitpick (assertive)

Add validation for batch-size argument.

Consider adding validation to ensure batch-size is positive.

         parser.add_argument(
             "--batch-size",
             type=int,
             default=50,
             help="Number of chapters to process in each batch",
+            choices=range(1, 1001),
+            metavar="{1..1000}",
         )

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In backend/apps/ai/management/commands/ai_create_chapter_chunks.py around lines
34 to 37, the batch-size argument lacks validation to ensure it is positive. Add
a check after parsing the batch-size argument to verify it is greater than zero,
and raise an appropriate error or exit with a message if the value is zero or
negative. This will prevent invalid batch sizes from being used during
processing.
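
An alternative to the choices-based suggestion is a validating argparse type, sketched below (positive_int is illustrative, not part of this PR):

import argparse

def positive_int(value: str) -> int:
    """Argparse type accepting only integers >= 1."""
    number = int(value)
    if number < 1:
        raise argparse.ArgumentTypeError(f"expected a positive integer, got {value!r}")
    return number

# usage: parser.add_argument("--batch-size", type=positive_int, default=50, ...)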

        )

    def handle(self, *args, **options):
        if not (openai_api_key := os.getenv("DJANGO_OPEN_AI_SECRET_KEY")):
            self.stdout.write(
                self.style.ERROR("DJANGO_OPEN_AI_SECRET_KEY environment variable not set")
            )
            return

        self.openai_client = openai.OpenAI(api_key=openai_api_key)

        if chapter := options["chapter"]:
            queryset = Chapter.objects.filter(key=chapter)
        elif options["all"]:
            queryset = Chapter.objects.all()
        else:
            queryset = Chapter.objects.filter(is_active=True)

        if not (total_chapters := queryset.count()):
            self.stdout.write("No chapters found to process")
            return

        self.stdout.write(f"Found {total_chapters} chapters to process")

        batch_size = options["batch_size"]
        for offset in range(0, total_chapters, batch_size):
            batch_chapters = queryset[offset : offset + batch_size]

            batch_chunks = []
            for chapter in batch_chapters:
                batch_chunks.extend(self.create_chunks(chapter))

            if batch_chunks:
                Chunk.bulk_save(batch_chunks)
                self.stdout.write(f"Saved {len(batch_chunks)} chunks")

Comment on lines +63 to +73

🧹 Nitpick (assertive)

Add progress tracking for better user feedback.

Consider adding progress indicators showing which batch is being processed.

         batch_size = options["batch_size"]
+        batch_count = 0
         for offset in range(0, total_chapters, batch_size):
+            batch_count += 1
+            self.stdout.write(f"Processing batch {batch_count}/{(total_chapters + batch_size - 1) // batch_size}...")
             batch_chapters = queryset[offset : offset + batch_size]
🧰 Tools
🪛 Flake8 (7.2.0)

[error] 64-64: whitespace before ':'

(E203)
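
(E203 here is the familiar Black-vs-Flake8 conflict: Black puts spaces around the colon in slices with complex expressions, so Black-formatted projects typically extend-ignore E203 rather than reflow the code.)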

🤖 Prompt for AI Agents
In backend/apps/ai/management/commands/ai_create_chapter_chunks.py around lines
63 to 73, add progress tracking by including a print or log statement before
processing each batch to indicate the current batch number and total batches.
Calculate the total number of batches based on total_chapters and batch_size,
then output progress like "Processing batch X of Y" before processing each batch
to provide better user feedback.
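
The batch-count expression in the suggestion is plain ceiling division; an equivalent form, if the integer idiom reads poorly:

import math

total_batches = math.ceil(total_chapters / batch_size)
# same result as (total_chapters + batch_size - 1) // batch_size for positive ints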

self.stdout.write(f"Completed processing all {total_chapters} chapters")

def create_chunks(self, chapter: Chapter) -> list[Chunk]:
"""Create chunks from a chapter's data."""
prose_content, metadata_content = self.extract_chapter_content(chapter)

all_chunk_texts = []

if metadata_content.strip():
all_chunk_texts.append(metadata_content)

if prose_content.strip():
all_chunk_texts.extend(Chunk.split_text(prose_content))

if not all_chunk_texts:
self.stdout.write(f"No content to chunk for chapter {chapter.key}")
return []

try:
time_since_last_request = datetime.now(UTC) - getattr(
self,
"last_request_time",
datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
)

if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
time.sleep(MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds())

response = self.openai_client.embeddings.create(
input=all_chunk_texts,
model="text-embedding-3-small",
)
self.last_request_time = datetime.now(UTC)

return [
chunk
for text, embedding in zip(
all_chunk_texts,
[d.embedding for d in response.data],
strict=True,
)
if (
chunk := Chunk.update_data(
text=text,
content_object=chapter,
embedding=embedding,
save=False,
)
)
]
except openai.OpenAIError as e:
self.stdout.write(self.style.ERROR(f"OpenAI API error for chapter {chapter.key}: {e}"))
return []

    def extract_chapter_content(self, chapter: Chapter) -> tuple[str, str]:
        """Extract and separate prose content from metadata for a chapter.

        Returns:
            tuple[str, str]: (prose_content, metadata_content)

        """
        prose_parts = []
        metadata_parts = []

        if chapter.description:
            prose_parts.append(f"Description: {chapter.description}")

        if chapter.summary:
            prose_parts.append(f"Summary: {chapter.summary}")

        if hasattr(chapter, "owasp_repository") and chapter.owasp_repository:
            repo = chapter.owasp_repository
            if repo.description:
                prose_parts.append(f"Repository Description: {repo.description}")
            if repo.topics:
                metadata_parts.append(f"Repository Topics: {', '.join(repo.topics)}")

        if chapter.name:
            metadata_parts.append(f"Chapter Name: {chapter.name}")

        location_parts = []
        if chapter.country:
            location_parts.append(f"Country: {chapter.country}")
        if chapter.region:
            location_parts.append(f"Region: {chapter.region}")
        if chapter.postal_code:
            location_parts.append(f"Postal Code: {chapter.postal_code}")
        if chapter.suggested_location:
            location_parts.append(f"Location: {chapter.suggested_location}")

        if location_parts:
            metadata_parts.append(f"Location Information: {', '.join(location_parts)}")

        if chapter.level:
            metadata_parts.append(f"Chapter Level: {chapter.level}")

        if chapter.currency:
            metadata_parts.append(f"Currency: {chapter.currency}")

        if chapter.meetup_group:
            metadata_parts.append(f"Meetup Group: {chapter.meetup_group}")

        if chapter.tags:
            metadata_parts.append(f"Tags: {', '.join(chapter.tags)}")

        if chapter.topics:
            metadata_parts.append(f"Topics: {', '.join(chapter.topics)}")

        if chapter.leaders_raw:
            leaders_info = []
            for leader in chapter.leaders_raw:
                if isinstance(leader, dict):
                    leader_name = leader.get("name", "")
                    leader_email = leader.get("email", "")
                    if leader_name:
                        leader_text = f"Leader: {leader_name}"
                        if leader_email:
                            leader_text += f" ({leader_email})"
                        leaders_info.append(leader_text)

            if leaders_info:
                metadata_parts.append(f"Chapter Leaders: {', '.join(leaders_info)}")

        if chapter.related_urls:
            valid_urls = [
                url
                for url in chapter.related_urls
                if url and url not in (chapter.invalid_urls or [])
            ]
            if valid_urls:
                metadata_parts.append(f"Related URLs: {', '.join(valid_urls)}")

        metadata_parts.append(f"Active Chapter: {'Yes' if chapter.is_active else 'No'}")

        return (
            DELIMITER.join(filter(None, prose_parts)),
            DELIMITER.join(filter(None, metadata_parts)),
        )
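
For reference, chunks written by this command can be read back through the generic relation; a minimal sketch using the models from this PR (the slice is illustrative):

from django.contrib.contenttypes.models import ContentType

from apps.ai.models.chunk import Chunk
from apps.owasp.models.chapter import Chapter

chapter_type = ContentType.objects.get_for_model(Chapter)
for chunk in Chunk.objects.filter(content_type=chapter_type)[:5]:
    # content_object resolves back to the originating Chapter
    print(chunk.content_object.key, chunk.text[:50])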
backend/apps/ai/management/commands/ai_create_slack_message_chunks.py
@@ -7,12 +7,13 @@
 import openai
 from django.core.management.base import BaseCommand
 
+from apps.ai.common.constants import (
+    DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
+    MIN_REQUEST_INTERVAL_SECONDS,
+)
 from apps.ai.models.chunk import Chunk
 from apps.slack.models.message import Message
 
-MIN_REQUEST_INTERVAL_SECONDS = 1.2
-DEFAULT_LAST_REQUEST_OFFSET_SECONDS = 2
-
 
 class Command(BaseCommand):
     help = "Create chunks for Slack messages"
@@ -78,10 +79,10 @@ def create_chunks(self, message: Message) -> list[Chunk]:
             )
             if (
                 chunk := Chunk.update_data(
-                    text=text,
+                    content_object=message,
                     embedding=embedding,
-                    message=message,
                     save=False,
+                    text=text,
                 )
             )
         ]
@@ -0,0 +1,43 @@
# Generated by Django 5.2.3 on 2025-07-01 10:39

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("ai", "0003_alter_chunk_options_alter_chunk_embedding_and_more"),
        ("contenttypes", "0002_remove_content_type_name"),
    ]

    operations = [
        migrations.AlterUniqueTogether(
            name="chunk",
            unique_together=set(),
        ),
        migrations.AddField(
            model_name="chunk",
            name="content_type",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                to="contenttypes.contenttype",
            ),
        ),
        migrations.AddField(
            model_name="chunk",
            name="object_id",
            field=models.PositiveIntegerField(default=0),
        ),
        migrations.RemoveField(
            model_name="chunk",
            name="message",
        ),
        # The old unique constraint must be dropped (and the message field
        # removed) before the new generic-key constraint can be added.
        migrations.AlterUniqueTogether(
            name="chunk",
            unique_together={("content_type", "object_id", "text")},
        ),
    ]
36 changes: 25 additions & 11 deletions backend/apps/ai/models/chunk.py
@@ -1,30 +1,38 @@
"""Slack app chunk model."""
"""AI app chunk model."""

from django.contrib.contenttypes.fields import GenericForeignKey
from django.contrib.contenttypes.models import ContentType
from django.db import models
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pgvector.django import VectorField

from apps.common.models import BulkSaveModel, TimestampedModel
from apps.common.utils import truncate
from apps.slack.models.message import Message


class Chunk(TimestampedModel):
"""Slack Chunk model."""
"""AI Chunk model for storing text chunks with embeddings."""

class Meta:
db_table = "ai_chunks"
verbose_name = "Chunk"
unique_together = ("message", "text")
unique_together = ("content_type", "object_id", "text")

content_object = GenericForeignKey("content_type", "object_id")
content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE, blank=True, null=True)
embedding = VectorField(verbose_name="Embedding", dimensions=1536)
message = models.ForeignKey(Message, on_delete=models.CASCADE, related_name="chunks")
object_id = models.PositiveIntegerField(default=0)
text = models.TextField(verbose_name="Text")

def __str__(self):
"""Human readable representation."""
content_name = (
getattr(self.content_object, "name", None)
or getattr(self.content_object, "key", None)
or str(self.content_object)
)
return (
f"Chunk {self.id} for Message {self.message.slack_message_id}: "
f"Chunk {self.id} for {self.content_type.model} {content_name}: "
f"{truncate(self.text, 50)}"
)

@@ -46,7 +54,7 @@ def split_text(text: str) -> list[str]:
     @staticmethod
     def update_data(
         text: str,
-        message: Message,
+        content_object,
         embedding,
         *,
         save: bool = True,
@@ -55,18 +63,24 @@
 
         Args:
             text (str): The text content of the chunk.
-            message (Message): The message this chunk belongs to.
+            content_object: The object this chunk belongs to (Message, Chapter, etc.).
             embedding (list): The embedding vector for the chunk.
             save (bool): Whether to save the chunk to the database.
 
         Returns:
-            Chunk: The updated chunk instance.
+            Chunk: The updated chunk instance or None if it already exists.
 
         """
-        if Chunk.objects.filter(message=message, text=text).exists():
+        content_type = ContentType.objects.get_for_model(content_object)
+
+        if Chunk.objects.filter(
+            content_type=content_type, object_id=content_object.id, text=text
+        ).exists():
             return None
 
-        chunk = Chunk(message=message, text=text, embedding=embedding)
+        chunk = Chunk(
+            content_type=content_type, object_id=content_object.id, text=text, embedding=embedding
+        )
 
         if save:
             chunk.save()
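
Taken together, a sketch of how a caller is expected to use the reworked Chunk API after this change (the embedding below is a placeholder, not real OpenAI output; chapter is any model instance):

texts = Chunk.split_text(long_prose)  # long_prose: whatever text is being embedded
chunks = [
    chunk
    for text in texts
    if (
        chunk := Chunk.update_data(
            text=text,
            content_object=chapter,  # e.g. a Chapter or Message instance
            embedding=[0.0] * 1536,  # placeholder vector matching dimensions=1536
            save=False,
        )
    )
]
Chunk.bulk_save(chunks)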