diff --git a/backend/apps/ai/Makefile b/backend/apps/ai/Makefile
index b74300a6c3..2a4714f2e6 100644
--- a/backend/apps/ai/Makefile
+++ b/backend/apps/ai/Makefile
@@ -1,3 +1,7 @@
+ai-create-chapter-chunks:
+	@echo "Creating chapter chunks"
+	@CMD="python manage.py ai_create_chapter_chunks" $(MAKE) exec-backend-command
+
 ai-create-slack-message-chunks:
 	@echo "Creating Slack message chunks"
 	@CMD="python manage.py ai_create_slack_message_chunks" $(MAKE) exec-backend-command
diff --git a/backend/apps/ai/admin.py b/backend/apps/ai/admin.py
index e168d67d27..8d7c410c4d 100644
--- a/backend/apps/ai/admin.py
+++ b/backend/apps/ai/admin.py
@@ -8,13 +8,10 @@ class ChunkAdmin(admin.ModelAdmin):
     list_display = (
         "id",
-        "message",
-        "text",
-    )
-    search_fields = (
-        "message__slack_message_id",
         "text",
+        "content_type",
     )
+    search_fields = ("text", "object_id")
 
 
 admin.site.register(Chunk, ChunkAdmin)
diff --git a/backend/apps/ai/common/__init__.py b/backend/apps/ai/common/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/backend/apps/ai/common/constants.py b/backend/apps/ai/common/constants.py
new file mode 100644
index 0000000000..98aa2a5a4e
--- /dev/null
+++ b/backend/apps/ai/common/constants.py
@@ -0,0 +1,5 @@
+"""AI app constants."""
+
+DEFAULT_LAST_REQUEST_OFFSET_SECONDS = 2
+DELIMITER = "\n\n"
+MIN_REQUEST_INTERVAL_SECONDS = 1.2
diff --git a/backend/apps/ai/management/commands/ai_create_chapter_chunks.py b/backend/apps/ai/management/commands/ai_create_chapter_chunks.py
new file mode 100644
index 0000000000..bd80c3ea7d
--- /dev/null
+++ b/backend/apps/ai/management/commands/ai_create_chapter_chunks.py
@@ -0,0 +1,211 @@
+"""A command to create chunks of OWASP chapter data for RAG."""
+
+import os
+import time
+from datetime import UTC, datetime, timedelta
+
+import openai
+from django.core.management.base import BaseCommand
+
+from apps.ai.common.constants import (
+    DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
+    DELIMITER,
+    MIN_REQUEST_INTERVAL_SECONDS,
+)
+from apps.ai.models.chunk import Chunk
+from apps.owasp.models.chapter import Chapter
+
+
+class Command(BaseCommand):
+    help = "Create chunks for OWASP chapter data"
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--chapter",
+            type=str,
+            help="Process only the chapter with this key",
+        )
+        parser.add_argument(
+            "--all",
+            action="store_true",
+            help="Process all the chapters",
+        )
+        parser.add_argument(
+            "--batch-size",
+            type=int,
+            default=50,
+            help="Number of chapters to process in each batch",
+        )
+
+    def handle(self, *args, **options):
+        if not (openai_api_key := os.getenv("DJANGO_OPEN_AI_SECRET_KEY")):
+            self.stdout.write(
+                self.style.ERROR("DJANGO_OPEN_AI_SECRET_KEY environment variable not set")
+            )
+            return
+
+        self.openai_client = openai.OpenAI(api_key=openai_api_key)
+
+        if chapter := options["chapter"]:
+            queryset = Chapter.objects.filter(key=chapter)
+        elif options["all"]:
+            queryset = Chapter.objects.all()
+        else:
+            queryset = Chapter.objects.filter(is_active=True)
+
+        if not (total_chapters := queryset.count()):
+            self.stdout.write("No chapters found to process")
+            return
+
+        self.stdout.write(f"Found {total_chapters} chapters to process")
+
+        batch_size = options["batch_size"]
+        for offset in range(0, total_chapters, batch_size):
+            batch_chapters = queryset[offset : offset + batch_size]
+
+            batch_chunks = []
+            for chapter in batch_chapters:
+                batch_chunks.extend(self.create_chunks(chapter))
+
+            if batch_chunks:
+                Chunk.bulk_save(batch_chunks)
+                self.stdout.write(f"Saved {len(batch_chunks)} chunks")
+
+        self.stdout.write(f"Completed processing all {total_chapters} chapters")
+
+    def create_chunks(self, chapter: Chapter) -> list[Chunk]:
+        """Create chunks from a chapter's data."""
+        prose_content, metadata_content = self.extract_chapter_content(chapter)
+
+        all_chunk_texts = []
+
+        if metadata_content.strip():
+            all_chunk_texts.append(metadata_content)
+
+        if prose_content.strip():
+            all_chunk_texts.extend(Chunk.split_text(prose_content))
+
+        if not all_chunk_texts:
+            self.stdout.write(f"No content to chunk for chapter {chapter.key}")
+            return []
+
+        try:
+            time_since_last_request = datetime.now(UTC) - getattr(
+                self,
+                "last_request_time",
+                datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
+            )
+
+            if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
+                time.sleep(MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds())
+
+            response = self.openai_client.embeddings.create(
+                input=all_chunk_texts,
+                model="text-embedding-3-small",
+            )
+            self.last_request_time = datetime.now(UTC)
+
+            return [
+                chunk
+                for text, embedding in zip(
+                    all_chunk_texts,
+                    [d.embedding for d in response.data],
+                    strict=True,
+                )
+                if (
+                    chunk := Chunk.update_data(
+                        text=text,
+                        content_object=chapter,
+                        embedding=embedding,
+                        save=False,
+                    )
+                )
+            ]
+        except openai.OpenAIError as e:
+            self.stdout.write(self.style.ERROR(f"OpenAI API error for chapter {chapter.key}: {e}"))
+            return []
+
+    def extract_chapter_content(self, chapter: Chapter) -> tuple[str, str]:
+        """Extract and separate prose content from metadata for a chapter.
+
+        Returns:
+            tuple[str, str]: (prose_content, metadata_content)
+
+        """
+        prose_parts = []
+        metadata_parts = []
+
+        if chapter.description:
+            prose_parts.append(f"Description: {chapter.description}")
+
+        if chapter.summary:
+            prose_parts.append(f"Summary: {chapter.summary}")
+
+        if hasattr(chapter, "owasp_repository") and chapter.owasp_repository:
+            repo = chapter.owasp_repository
+            if repo.description:
+                prose_parts.append(f"Repository Description: {repo.description}")
+            if repo.topics:
+                metadata_parts.append(f"Repository Topics: {', '.join(repo.topics)}")
+
+        if chapter.name:
+            metadata_parts.append(f"Chapter Name: {chapter.name}")
+
+        location_parts = []
+        if chapter.country:
+            location_parts.append(f"Country: {chapter.country}")
+        if chapter.region:
+            location_parts.append(f"Region: {chapter.region}")
+        if chapter.postal_code:
+            location_parts.append(f"Postal Code: {chapter.postal_code}")
+        if chapter.suggested_location:
+            location_parts.append(f"Location: {chapter.suggested_location}")
+
+        if location_parts:
+            metadata_parts.append(f"Location Information: {', '.join(location_parts)}")
+
+        if chapter.level:
+            metadata_parts.append(f"Chapter Level: {chapter.level}")
+
+        if chapter.currency:
+            metadata_parts.append(f"Currency: {chapter.currency}")
+
+        if chapter.meetup_group:
+            metadata_parts.append(f"Meetup Group: {chapter.meetup_group}")
+
+        if chapter.tags:
+            metadata_parts.append(f"Tags: {', '.join(chapter.tags)}")
+
+        if chapter.topics:
+            metadata_parts.append(f"Topics: {', '.join(chapter.topics)}")
+
+        if chapter.leaders_raw:
+            leaders_info = []
+            for leader in chapter.leaders_raw:
+                if isinstance(leader, dict):
+                    leader_name = leader.get("name", "")
+                    leader_email = leader.get("email", "")
+                    if leader_name:
+                        leader_text = f"Leader: {leader_name}"
+                        if leader_email:
+                            leader_text += f" ({leader_email})"
+                        leaders_info.append(leader_text)
+
+            if leaders_info:
+                metadata_parts.append(f"Chapter Leaders: {', '.join(leaders_info)}")
+
+        if chapter.related_urls:
+            valid_urls = [
+                url
+                for url in chapter.related_urls
+                if url and url not in (chapter.invalid_urls or [])
+            ]
+            if valid_urls:
+                metadata_parts.append(f"Related URLs: {', '.join(valid_urls)}")
+
+        metadata_parts.append(f"Active Chapter: {'Yes' if chapter.is_active else 'No'}")
+
+        return (
+            DELIMITER.join(filter(None, prose_parts)),
+            DELIMITER.join(filter(None, metadata_parts)),
+        )
diff --git a/backend/apps/ai/management/commands/ai_create_slack_message_chunks.py b/backend/apps/ai/management/commands/ai_create_slack_message_chunks.py
index 266e7171e0..6eeb90dda6 100644
--- a/backend/apps/ai/management/commands/ai_create_slack_message_chunks.py
+++ b/backend/apps/ai/management/commands/ai_create_slack_message_chunks.py
@@ -7,12 +7,13 @@
 import openai
 from django.core.management.base import BaseCommand
 
+from apps.ai.common.constants import (
+    DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
+    MIN_REQUEST_INTERVAL_SECONDS,
+)
 from apps.ai.models.chunk import Chunk
 from apps.slack.models.message import Message
 
-MIN_REQUEST_INTERVAL_SECONDS = 1.2
-DEFAULT_LAST_REQUEST_OFFSET_SECONDS = 2
-
 
 class Command(BaseCommand):
     help = "Create chunks for Slack messages"
@@ -78,10 +79,10 @@ def create_chunks(self, message: Message) -> list[Chunk]:
             )
             if (
                 chunk := Chunk.update_data(
+                    text=text,
+                    content_object=message,
                     embedding=embedding,
-                    message=message,
                     save=False,
-                    text=text,
                 )
             )
         ]
diff --git a/backend/apps/ai/migrations/0004_alter_chunk_unique_together_chunk_content_type_and_more.py b/backend/apps/ai/migrations/0004_alter_chunk_unique_together_chunk_content_type_and_more.py
new file mode 100644
index 0000000000..76d04690e5
--- /dev/null
+++ b/backend/apps/ai/migrations/0004_alter_chunk_unique_together_chunk_content_type_and_more.py
@@ -0,0 +1,43 @@
+# Generated by Django 5.2.3 on 2025-07-01 10:39
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("ai", "0003_alter_chunk_options_alter_chunk_embedding_and_more"),
+        ("contenttypes", "0002_remove_content_type_name"),
+    ]
+
+    operations = [
+        migrations.AlterUniqueTogether(
+            name="chunk",
+            unique_together=set(),
+        ),
+        migrations.AddField(
+            model_name="chunk",
+            name="content_type",
+            field=models.ForeignKey(
+                blank=True,
+                null=True,
+                on_delete=django.db.models.deletion.CASCADE,
+                to="contenttypes.contenttype",
+            ),
+        ),
+        migrations.AddField(
+            model_name="chunk",
+            name="object_id",
+            field=models.PositiveIntegerField(default=0),
+        ),
+        migrations.RemoveField(
+            model_name="chunk",
+            name="message",
+        ),
+        # we need to remove the unique constraint before adding the new one
+        # message needs to be removed first and then we can add the new unique constraint
+        migrations.AlterUniqueTogether(
+            name="chunk",
+            unique_together={("content_type", "object_id", "text")},
+        ),
+    ]
diff --git a/backend/apps/ai/models/chunk.py b/backend/apps/ai/models/chunk.py
index 1d59caac67..8362948ffe 100644
--- a/backend/apps/ai/models/chunk.py
+++ b/backend/apps/ai/models/chunk.py
@@ -1,30 +1,38 @@
-"""Slack app chunk model."""
+"""AI app chunk model."""
 
+from django.contrib.contenttypes.fields import GenericForeignKey
+from django.contrib.contenttypes.models import ContentType
 from django.db import models
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from pgvector.django import VectorField
 
 from apps.common.models import BulkSaveModel, TimestampedModel
 from apps.common.utils import truncate
-from apps.slack.models.message import Message
 
 
 class Chunk(TimestampedModel):
-    """Slack Chunk model."""
+    """AI Chunk model for storing text chunks with embeddings."""
 
     class Meta:
         db_table = "ai_chunks"
         verbose_name = "Chunk"
-        unique_together = ("message", "text")
+        unique_together = ("content_type", "object_id", "text")
 
+    content_object = GenericForeignKey("content_type", "object_id")
+    content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE, blank=True, null=True)
     embedding = VectorField(verbose_name="Embedding", dimensions=1536)
-    message = models.ForeignKey(Message, on_delete=models.CASCADE, related_name="chunks")
+    object_id = models.PositiveIntegerField(default=0)
     text = models.TextField(verbose_name="Text")
 
     def __str__(self):
         """Human readable representation."""
+        content_name = (
+            getattr(self.content_object, "name", None)
+            or getattr(self.content_object, "key", None)
+            or str(self.content_object)
+        )
         return (
-            f"Chunk {self.id} for Message {self.message.slack_message_id}: "
+            f"Chunk {self.id} for {self.content_type.model} {content_name}: "
             f"{truncate(self.text, 50)}"
         )
 
@@ -46,7 +54,7 @@ def split_text(text: str) -> list[str]:
     @staticmethod
     def update_data(
         text: str,
-        message: Message,
+        content_object,
         embedding,
         *,
         save: bool = True,
@@ -55,18 +63,24 @@
 
         Args:
             text (str): The text content of the chunk.
-            message (Message): The message this chunk belongs to.
+            content_object: The object this chunk belongs to (Message, Chapter, etc.).
            embedding (list): The embedding vector for the chunk.
             save (bool): Whether to save the chunk to the database.
 
         Returns:
-            Chunk: The updated chunk instance.
+            Chunk: The updated chunk instance or None if it already exists.
 
         """
-        if Chunk.objects.filter(message=message, text=text).exists():
+        content_type = ContentType.objects.get_for_model(content_object)
+
+        if Chunk.objects.filter(
+            content_type=content_type, object_id=content_object.id, text=text
+        ).exists():
             return None
 
-        chunk = Chunk(message=message, text=text, embedding=embedding)
+        chunk = Chunk(
+            content_type=content_type, object_id=content_object.id, text=text, embedding=embedding
+        )
 
         if save:
             chunk.save()
diff --git a/backend/tests/apps/ai/models/chunk_test.py b/backend/tests/apps/ai/models/chunk_test.py
index 3520d8191c..dca223f800 100644
--- a/backend/tests/apps/ai/models/chunk_test.py
+++ b/backend/tests/apps/ai/models/chunk_test.py
@@ -1,5 +1,6 @@
 from unittest.mock import Mock, patch
 
+from django.contrib.contenttypes.models import ContentType
 from django.db import models
 
 from apps.ai.models.chunk import Chunk
@@ -10,27 +11,31 @@ def create_model_mock(model_class):
     mock = Mock(spec=model_class)
     mock._state = Mock()
     mock.pk = 1
+    mock.id = 1
 
     return mock
 
 
 class TestChunkModel:
     def test_str_method(self):
-        """Test the string representation of a chunk."""
         mock_message = create_model_mock(Message)
-        mock_message.slack_message_id = "123456.789"
+        mock_message.name = "Test Message"
 
-        chunk = Chunk(
-            id=1,
-            text="This is a test chunk with some content that should be displayed",
-            message=mock_message,
-        )
+        mock_content_type = Mock(spec=ContentType)
+        mock_content_type.model = "message"
+
+        with (
+            patch.object(Chunk, "content_type", mock_content_type),
+            patch.object(Chunk, "content_object", mock_message),
+        ):
+            chunk = Chunk()
+            chunk.id = 1
+            chunk.text = "This is a test chunk with some content that should be displayed"
 
-        result = str(chunk)
-        assert "Chunk 1 for Message 123456.789:" in result
-        assert "This is a test chunk with some content that" in result
+            result = str(chunk)
+            assert "Chunk 1 for message Test Message:" in result
+            assert "This is a test chunk with some content that" in result
 
     def test_bulk_save_with_chunks(self):
-        """Test bulk_save method with valid chunks."""
         mock_chunks = [Mock(), Mock(), Mock()]
         with patch("apps.common.models.BulkSaveModel.bulk_save") as mock_bulk_save:
@@ -38,7 +43,6 @@ def test_bulk_save_with_fields_parameter(self):
         mock_bulk_save.assert_called_once_with(Chunk, mock_chunks, fields=None)
 
     def test_bulk_save_with_fields_parameter(self):
-        """Test bulk_save method with custom fields parameter."""
         mock_chunks = [Mock(), Mock()]
         fields = ["text", "embedding"]
 
@@ -46,81 +50,128 @@
             Chunk.bulk_save(mock_chunks, fields=fields)
         mock_bulk_save.assert_called_once_with(Chunk, mock_chunks, fields=fields)
 
-    def test_update_data_new_chunk(self, mocker):
-        """Test update_data method creates new chunk when it doesn't exist."""
+    def test_split_text(self):
+        text = "This is a long text that should be split into multiple chunks. " * 10
+
+        result = Chunk.split_text(text)
+
+        assert isinstance(result, list)
+        assert len(result) > 1
+        assert all(isinstance(chunk, str) for chunk in result)
+
+    @patch("apps.ai.models.chunk.Chunk.save")
+    @patch("apps.ai.models.chunk.Chunk.__init__")
+    def test_update_data_new_chunk(self, mock_init, mock_save, mocker):
+        mock_init.return_value = None
+
         mock_message = create_model_mock(Message)
         text = "Test chunk content"
         embedding = [0.1, 0.2, 0.3]
 
-        mocker.patch(
+        mock_content_type = Mock(spec=ContentType)
+        mock_get_for_model = mocker.patch(
+            "django.contrib.contenttypes.models.ContentType.objects.get_for_model",
+            return_value=mock_content_type,
+        )
+
+        mock_filter = mocker.patch(
             "apps.ai.models.chunk.Chunk.objects.filter",
             return_value=Mock(exists=Mock(return_value=False)),
         )
 
-        patched_save = mocker.patch("apps.ai.models.chunk.Chunk.save")
+        result = Chunk.update_data(
+            text=text, content_object=mock_message, embedding=embedding, save=True
+        )
 
-        with patch.object(Chunk, "message", create=True):
-            result = Chunk.update_data(
-                text=text, message=mock_message, embedding=embedding, save=True
-            )
+        mock_get_for_model.assert_called_once_with(mock_message)
+        mock_filter.assert_called_once_with(
+            content_type=mock_content_type, object_id=mock_message.id, text=text
+        )
+        mock_init.assert_called_once_with(
+            content_type=mock_content_type,
+            object_id=mock_message.id,
+            text=text,
+            embedding=embedding,
+        )
+        mock_save.assert_called_once()
 
-        assert result is not None
-        assert isinstance(result, Chunk)
-        assert result.text == text
-        assert result.message == mock_message
-        assert result.embedding == embedding
-        patched_save.assert_called_once()
+        assert result is not None
+        assert isinstance(result, Chunk)
 
     def test_update_data_existing_chunk(self, mocker):
-        """Test update_data method returns None when chunk already exists."""
         mock_message = create_model_mock(Message)
         text = "Existing chunk content"
         embedding = [0.1, 0.2, 0.3]
 
-        mocker.patch(
+        mock_content_type = Mock(spec=ContentType)
+        mock_get_for_model = mocker.patch(
+            "django.contrib.contenttypes.models.ContentType.objects.get_for_model",
+            return_value=mock_content_type,
+        )
+
+        mock_filter = mocker.patch(
             "apps.ai.models.chunk.Chunk.objects.filter",
             return_value=Mock(exists=Mock(return_value=True)),
        )
 
-        result = Chunk.update_data(text=text, message=mock_message, embedding=embedding, save=True)
+        result = Chunk.update_data(
+            text=text, content_object=mock_message, embedding=embedding, save=True
+        )
 
+        mock_get_for_model.assert_called_once_with(mock_message)
+        mock_filter.assert_called_once_with(
+            content_type=mock_content_type, object_id=mock_message.id, text=text
+        )
         assert result is None
 
-    def test_update_data_no_save(self, mocker):
-        """Test update_data method with save=False."""
+    @patch("apps.ai.models.chunk.Chunk.save")
+    @patch("apps.ai.models.chunk.Chunk.__init__")
+    def test_update_data_no_save(self, mock_init, mock_save, mocker):
+        mock_init.return_value = None
+
         mock_message = create_model_mock(Message)
         text = "Test chunk content"
         embedding = [0.1, 0.2, 0.3]
 
-        mocker.patch(
+        mock_content_type = Mock(spec=ContentType)
+        mock_get_for_model = mocker.patch(
+            "django.contrib.contenttypes.models.ContentType.objects.get_for_model",
+            return_value=mock_content_type,
+        )
+
+        mock_filter = mocker.patch(
             "apps.ai.models.chunk.Chunk.objects.filter",
             return_value=Mock(exists=Mock(return_value=False)),
         )
 
-        patched_save = mocker.patch("apps.ai.models.chunk.Chunk.save")
+        result = Chunk.update_data(
+            text=text, content_object=mock_message, embedding=embedding, save=False
+        )
 
-        with patch.object(Chunk, "message", create=True):
-            result = Chunk.update_data(
-                text=text, message=mock_message, embedding=embedding, save=False
-            )
+        mock_get_for_model.assert_called_once_with(mock_message)
+        mock_filter.assert_called_once_with(
+            content_type=mock_content_type, object_id=mock_message.id, text=text
+        )
+        mock_init.assert_called_once_with(
+            content_type=mock_content_type,
+            object_id=mock_message.id,
+            text=text,
+            embedding=embedding,
+        )
+        mock_save.assert_not_called()
 
-        assert result is not None
-        assert isinstance(result, Chunk)
-        assert result.text == text
-        assert result.message == mock_message
-        assert result.embedding == embedding
-        patched_save.assert_not_called()
+        assert result is not None
+        assert isinstance(result, Chunk)
 
     def test_meta_class_attributes(self):
-        """Test the Meta class attributes of the Chunk model."""
         assert Chunk._meta.db_table == "ai_chunks"
         assert Chunk._meta.verbose_name == "Chunk"
-        assert ("message", "text") in Chunk._meta.unique_together
+        assert ("content_type", "object_id", "text") in Chunk._meta.unique_together
 
-    def test_message_foreign_key_relationship(self):
-        """Test the foreign key relationship with Message model."""
-        message_field = Chunk._meta.get_field("message")
+    def test_generic_foreign_key_relationship(self):
+        content_type_field = Chunk._meta.get_field("content_type")
+        object_id_field = Chunk._meta.get_field("object_id")
 
-        assert isinstance(message_field, models.ForeignKey)
-        assert message_field.remote_field.model == Message
-        assert message_field.remote_field.related_name == "chunks"
+        assert isinstance(content_type_field, models.ForeignKey)
+        assert content_type_field.remote_field.model == ContentType
+        assert isinstance(object_id_field, models.PositiveIntegerField)
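
Usage note (not part of the diff): the changes above turn Chunk into a generic container keyed by (content_type, object_id, text), so any model instance can own chunks. The sketch below shows how a caller is expected to use the new API under that assumption; Chunk.split_text, Chunk.update_data, Chunk.bulk_save and the content_type/object_id fields come from this diff, while embed() is a hypothetical placeholder for the OpenAI "text-embedding-3-small" call made by the management commands.

    from django.contrib.contenttypes.models import ContentType

    from apps.ai.models.chunk import Chunk


    def embed(texts: list[str]) -> list[list[float]]:
        """Hypothetical embedding helper; the real commands call OpenAI's embeddings API."""
        raise NotImplementedError


    def chunk_object(obj, raw_text: str) -> None:
        """Split raw_text, embed it, and attach the resulting chunks to any model instance."""
        texts = Chunk.split_text(raw_text)
        chunks = [
            chunk
            for text, embedding in zip(texts, embed(texts), strict=True)
            # update_data returns None for an existing (content_type, object_id, text) row
            if (
                chunk := Chunk.update_data(
                    text=text, content_object=obj, embedding=embedding, save=False
                )
            )
        ]
        if chunks:
            Chunk.bulk_save(chunks)


    def chunks_for(obj):
        """Fetch previously stored chunks for an object through the content-type machinery."""
        return Chunk.objects.filter(
            content_type=ContentType.objects.get_for_model(obj),
            object_id=obj.id,
        )

With this shape, the Slack command passes a Message as content_object and the new chapter command passes a Chapter; adding chunking for another model should only require a new extraction step, not further schema changes.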