Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions backend/Makefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Run the ai_create_slack_message_chunks Django management command by handing
# it, via CMD, to the exec-backend-command target (defined elsewhere).
ai-create-slack-message-chunks:
	@echo "Creating Slack message chunks"
	@CMD="python manage.py ai_create_slack_message_chunks" $(MAKE) exec-backend-command

clean-backend-dependencies:
@rm -rf backend/.cache
@rm -rf backend/.local
Expand Down
Empty file added backend/apps/ai/__init__.py
Empty file.
20 changes: 20 additions & 0 deletions backend/apps/ai/admin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""AI app admin."""

from django.contrib import admin

from apps.ai.models.chunk import Chunk


@admin.register(Chunk)
class ChunkAdmin(admin.ModelAdmin):
    """Admin configuration for Chunk records."""

    # Columns shown on the change-list page.
    list_display = ("id", "message", "text")
    # Lookups used by the admin search box.
    search_fields = ("message__slack_message_id", "text")
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""A command to create chunks of Slack messages."""

import os
import time
from datetime import UTC, datetime, timedelta

import openai
from django.core.management.base import BaseCommand

from apps.ai.models.chunk import Chunk
from apps.slack.models.message import Message

# Minimum spacing, in seconds, kept between two consecutive embedding requests.
MIN_REQUEST_INTERVAL_SECONDS = 1.2
# Before any request has been made, the "last request" is assumed to lie this
# many seconds in the past, so the first request is not delayed.
DEFAULT_LAST_REQUEST_OFFSET_SECONDS = 2


class Command(BaseCommand):
    """Management command that embeds Slack messages and stores them as chunks."""

    help = "Create chunks for Slack messages"

    def handle(self, *args, **options):
        """Embed all stored Slack messages in batches and bulk-save the chunks.

        Writes an error and returns early when the DJANGO_OPEN_AI_SECRET_KEY
        environment variable is not set.
        """
        if not (openai_api_key := os.getenv("DJANGO_OPEN_AI_SECRET_KEY")):
            self.stdout.write(
                self.style.ERROR("DJANGO_OPEN_AI_SECRET_KEY environment variable not set")
            )
            return

        self.openai_client = openai.OpenAI(api_key=openai_api_key)

        # Offset-based batching is only reliable on an explicitly ordered
        # queryset; without an ORDER BY the database may return rows in a
        # different order per query, skipping or repeating messages.
        messages = Message.objects.order_by("pk")
        total_messages = messages.count()
        self.stdout.write(f"Found {total_messages} messages to process")

        batch_size = 100
        for offset in range(0, total_messages, batch_size):
            Chunk.bulk_save(
                [
                    chunk
                    for message in messages[offset : offset + batch_size]
                    for chunk in self.create_chunks(message)
                ]
            )

        self.stdout.write(f"Completed processing all {total_messages} messages")

    def create_chunks(self, message: Message) -> list[Chunk]:
        """Create unsaved chunks (text plus embedding) for a message.

        Args:
            message (Message): The Slack message to split and embed.

        Returns:
            list[Chunk]: New, unsaved chunks. Empty for join/leave messages,
                for messages without usable text, when every fragment already
                exists, or when the OpenAI request fails.

        """
        if message.subtype in {"channel_join", "channel_leave"}:
            return []

        if not (chunk_texts := Chunk.split_text(message.cleaned_text)):
            self.stdout.write(
                f"No chunks created for message {message.slack_message_id}: "
                f"`{message.cleaned_text}`"
            )
            return []

        # Identical fragments of one message would collide on the
        # (message, text) uniqueness constraint when saved in the same batch,
        # and embedding them twice is wasted work. Keep first occurrences.
        chunk_texts = list(dict.fromkeys(chunk_texts))

        self._wait_for_rate_limit()
        try:
            response = self.openai_client.embeddings.create(
                input=chunk_texts,
                model="text-embedding-3-small",
            )
        except openai.OpenAIError as e:
            self.stdout.write(
                self.style.ERROR(f"OpenAI API error for message {message.slack_message_id}: {e}")
            )
            return []
        finally:
            # Failed requests were still sent, so they must also be counted
            # when spacing out the next request.
            self.last_request_time = datetime.now(UTC)

        return [
            chunk
            for text, embedding in zip(
                chunk_texts,
                [d.embedding for d in response.data],  # Embedding data from OpenAI response.
                strict=True,
            )
            # update_data returns None for chunks that already exist.
            if (
                chunk := Chunk.update_data(
                    embedding=embedding,
                    message=message,
                    save=False,
                    text=text,
                )
            )
        ]

    def _wait_for_rate_limit(self) -> None:
        """Sleep so requests are at least MIN_REQUEST_INTERVAL_SECONDS apart."""
        now = datetime.now(UTC)
        # Before the first request, pretend the previous one happened long
        # enough ago that no waiting is needed.
        last_request_time = getattr(
            self,
            "last_request_time",
            now - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
        )
        remaining = MIN_REQUEST_INTERVAL_SECONDS - (now - last_request_time).total_seconds()
        if remaining > 0:
            time.sleep(remaining)
51 changes: 51 additions & 0 deletions backend/apps/ai/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Generated by Django 5.2.3 on 2025-06-22 06:17

import django.db.models.deletion
import pgvector.django.vector
from django.db import migrations, models
from pgvector.django import VectorExtension


class Migration(migrations.Migration):
    """Initial migration of the ai app: creates the Chunk model."""

    initial = True

    # The Chunk.message foreign key points at slack.Message, so the slack
    # app's migrations have to be applied first.
    dependencies = [
        ("slack", "0018_conversation_sync_messages"),
    ]

    operations = [
        # pgvector-provided operation, run before the table using VectorField
        # is created (presumably installs the Postgres `vector` extension --
        # confirm against the pgvector documentation).
        VectorExtension(),
        migrations.CreateModel(
            name="Chunk",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
                    ),
                ),
                # Creation/modification timestamps maintained automatically.
                ("nest_created_at", models.DateTimeField(auto_now_add=True)),
                ("nest_updated_at", models.DateTimeField(auto_now=True)),
                ("chunk_text", models.TextField(verbose_name="Chunk Text")),
                (
                    # Vector column with 1536 dimensions.
                    "embedding",
                    pgvector.django.vector.VectorField(
                        dimensions=1536, verbose_name="Chunk Embedding"
                    ),
                ),
                (
                    # Deleting a Slack message cascades to its chunks.
                    "message",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="chunks",
                        to="slack.message",
                    ),
                ),
            ],
            options={
                "verbose_name": "Chunks",
                "db_table": "ai_chunks",
                # The same text may be stored only once per message.
                "unique_together": {("message", "chunk_text")},
            },
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 5.2.3 on 2025-06-26 21:04

from django.db import migrations


class Migration(migrations.Migration):
    """Rename Chunk.chunk_text to Chunk.text and update the unique constraint."""

    dependencies = [
        ("ai", "0001_initial"),
        ("slack", "0018_conversation_sync_messages"),
    ]

    operations = [
        migrations.RenameField(
            model_name="chunk",
            old_name="chunk_text",
            new_name="text",
        ),
        # Re-declare the uniqueness constraint using the renamed field.
        migrations.AlterUniqueTogether(
            name="chunk",
            unique_together={("message", "text")},
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Generated by Django 5.2.3 on 2025-06-26 21:45

import pgvector.django.vector
from django.db import migrations, models


class Migration(migrations.Migration):
    """Change the verbose names of the Chunk model and two of its fields."""

    dependencies = [
        ("ai", "0002_rename_chunk_text_chunk_text_and_more"),
    ]

    operations = [
        # Model verbose name becomes the singular "Chunk".
        migrations.AlterModelOptions(
            name="chunk",
            options={"verbose_name": "Chunk"},
        ),
        # Same 1536-dimension vector field; only the verbose name changes.
        migrations.AlterField(
            model_name="chunk",
            name="embedding",
            field=pgvector.django.vector.VectorField(dimensions=1536, verbose_name="Embedding"),
        ),
        migrations.AlterField(
            model_name="chunk",
            name="text",
            field=models.TextField(verbose_name="Text"),
        ),
    ]
Empty file.
1 change: 1 addition & 0 deletions backend/apps/ai/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .chunk import Chunk
74 changes: 74 additions & 0 deletions backend/apps/ai/models/chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Slack app chunk model."""

from django.db import models
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pgvector.django import VectorField

from apps.common.models import BulkSaveModel, TimestampedModel
from apps.common.utils import truncate
from apps.slack.models.message import Message


class Chunk(TimestampedModel):
    """A fragment of a Slack message's text stored with its embedding vector."""

    class Meta:
        db_table = "ai_chunks"
        verbose_name = "Chunk"
        unique_together = ("message", "text")

    embedding = VectorField(verbose_name="Embedding", dimensions=1536)
    message = models.ForeignKey(Message, on_delete=models.CASCADE, related_name="chunks")
    text = models.TextField(verbose_name="Text")

    def __str__(self):
        """Return the chunk id, the Slack message id and a short text preview."""
        preview = truncate(self.text, 50)
        return f"Chunk {self.id} for Message {self.message.slack_message_id}: {preview}"

    @staticmethod
    def bulk_save(chunks, fields=None):
        """Persist many chunks at once through BulkSaveModel.bulk_save."""
        BulkSaveModel.bulk_save(Chunk, chunks, fields=fields)

    @staticmethod
    def split_text(text: str) -> list[str]:
        """Break text into overlapping pieces of at most 300 characters."""
        splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", " ", ""],
            length_function=len,
            chunk_size=300,
            chunk_overlap=40,
        )
        return splitter.split_text(text)

    @staticmethod
    def update_data(
        text: str,
        message: Message,
        embedding,
        *,
        save: bool = True,
    ) -> "Chunk | None":
        """Build a chunk for a message unless the same text is already stored.

        Args:
            text (str): The text content of the chunk.
            message (Message): The message this chunk belongs to.
            embedding (list): The embedding vector for the chunk.
            save (bool): Whether to save the chunk to the database.

        Returns:
            Chunk | None: The new chunk, or None when a chunk with this
                message and text already exists.

        """
        already_stored = Chunk.objects.filter(message=message, text=text).exists()
        if already_stored:
            return None

        new_chunk = Chunk(message=message, text=text, embedding=embedding)
        if save:
            new_chunk.save()

        return new_chunk
26 changes: 26 additions & 0 deletions backend/apps/slack/models/message.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Slack app message model."""

import re
from datetime import UTC, datetime

import emoji
from django.db import models

from apps.common.models import BulkSaveModel, TimestampedModel
Expand Down Expand Up @@ -46,6 +48,20 @@ def __str__(self):
else truncate(self.raw_data["text"], 50)
)

@property
def cleaned_text(self) -> str:
    """Return the message text without emoji, user mentions and links.

    Whitespace runs are collapsed to single spaces and the result is
    stripped. An empty string is returned when the message has no text.
    """
    if not self.text:
        return ""

    # Strip Unicode emoji directly. Converting them to `:alias:` form and
    # deleting the aliases by regex leaves residue for emoji whose names
    # contain non-word characters (hyphens, apostrophes, ...).
    text = emoji.replace_emoji(self.text, replace="")
    text = re.sub(r"<@U[A-Z0-9]+>", "", text)  # Remove user mentions.
    text = re.sub(r"<https?://[^>]+>", "", text)  # Remove links.
    # Remove Slack emoji aliases such as `:tada:`, `:+1:` or `:e-mail:`.
    # The negative lookahead skips a digits-only group sitting between two
    # digits, so clock times like `10:30:00` are not turned into `1000`.
    text = re.sub(r":(?!(?<=\d:)\d+:\d)[\w+-]+:", "", text)
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace.

    return text.strip()

@property
def latest_reply(self) -> "Message | None":
"""Get the latest reply to this message."""
Expand All @@ -58,6 +74,16 @@ def latest_reply(self) -> "Message | None":
.first()
)

@property
def subtype(self) -> str | None:
"""Get the subtype of the message if it exists."""
return self.raw_data.get("subtype")

@property
def text(self) -> str:
    """Return the "text" entry of the raw Slack payload, or "" if absent."""
    payload = self.raw_data
    return payload.get("text", "")

def from_slack(
self,
message_data: dict,
Expand Down
Loading