Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions backend/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ slack-sync-messages:
@echo "Syncing Slack messages"
@CMD="python manage.py slack_sync_messages" $(MAKE) exec-backend-command

slack-create-message-chunks:
@echo "creating message chunks"
@CMD="python manage.py slack_create_chunks" $(MAKE) exec-backend-command

sync-data: \
update-data \
enrich-data \
Expand Down
Empty file added backend/apps/ai/__init__.py
Empty file.
20 changes: 20 additions & 0 deletions backend/apps/ai/admin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""AI app admin."""

from django.contrib import admin

from apps.ai.models.chunk import Chunk


class ChunkAdmin(admin.ModelAdmin):
list_display = (
"id",
"message",
"chunk_text",
)
search_fields = (
"message__slack_message_id",
"chunk_text",
)


admin.site.register(Chunk, ChunkAdmin)
Empty file.
121 changes: 121 additions & 0 deletions backend/apps/ai/management/commands/slack_create_chunks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""A command to create chunks of Slack messages."""

import os
import re
import time
from datetime import UTC, datetime, timedelta

import emoji
import openai
from django.core.management.base import BaseCommand
from langchain.text_splitter import RecursiveCharacterTextSplitter

from apps.ai.models.chunk import Chunk
from apps.slack.models.message import Message

MIN_REQUEST_INTERVAL_SECONDS = 1.2
DEFAULT_LAST_REQUEST_OFFSET_SECONDS = 2


class Command(BaseCommand):
help = "Create chunks for Slack messages"

def handle(self, *args, **options):
openai_api_key = os.getenv("DJANGO_OPEN_AI_SECRET_KEY")

if not openai_api_key:
self.stdout.write(
self.style.ERROR("DJANGO_OPEN_AI_SECRET_KEY environment variable not set")
)
return

self.openai_client = openai.OpenAI(api_key=openai_api_key)

total_messages = Message.objects.count()
self.stdout.write(f"Found {total_messages} messages to process")

batch_size = 1000
processed_count = 0

for offset in range(0, total_messages, batch_size):
batch_messages = Message.objects.all()[offset : offset + batch_size]
batch_chunks = []

for message in batch_messages:
cleaned_text = self.clean_message_text(message.raw_data.get("text", ""))
chunks = self.create_chunks_from_message(message, cleaned_text)
batch_chunks.extend(chunks)

if batch_chunks:
Chunk.bulk_save(batch_chunks)

processed_count += len(batch_messages)

self.stdout.write(f"Completed processing all {total_messages} messages")

def create_chunks_from_message(self, message: Message, cleaned_text: str) -> list[Chunk]:
"""Create chunks from a message."""
if message.subtype in ["channel_join", "channel_leave"]:
return []

chunk_texts = self.split_message_text(cleaned_text)

if not chunk_texts:
self.stdout.write(
f"No chunks created for message {message.slack_message_id} - text too short"
)
return []

try:
time_since_last_request = datetime.now(UTC) - getattr(
self,
"last_request_time",
datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
)

if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
time.sleep(MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds())

response = self.openai_client.embeddings.create(
model="text-embedding-3-small", input=chunk_texts
)
self.last_request_time = datetime.now(UTC)
embeddings = [d.embedding for d in response.data]
chunks = [
Chunk.update_data(
chunk_text=text,
message=message,
embedding=embedding,
save=False,
)
for text, embedding in zip(chunk_texts, embeddings, strict=True)
]
return [chunk for chunk in chunks if chunk is not None]
except openai.OpenAIError as e:
self.stdout.write(
self.style.ERROR(f"OpenAI API error for message {message.slack_message_id}: {e}")
)
return []

def split_message_text(self, message_text: str) -> list[str]:
"""Split message text into chunks."""
splitter = RecursiveCharacterTextSplitter(
chunk_size=300,
chunk_overlap=40,
length_function=len,
separators=["\n\n", "\n", " ", ""],
)
return splitter.split_text(message_text)

def clean_message_text(self, message_text: str) -> str:
"""Clean message text by removing emojis and other noise while preserving context."""
if not message_text:
return ""

cleaned_text = emoji.demojize(message_text, delimiters=("", ""))
cleaned_text = re.sub(r"<@U[A-Z0-9]+>", "", cleaned_text)
cleaned_text = re.sub(r"<https?://[^>]+>", "", cleaned_text)
cleaned_text = re.sub(r":\w+:", "", cleaned_text)
cleaned_text = re.sub(r"\s+", " ", cleaned_text)

return cleaned_text.strip()
51 changes: 51 additions & 0 deletions backend/apps/ai/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Generated by Django 5.2.3 on 2025-06-22 06:17

import django.db.models.deletion
import pgvector.django.vector
from django.db import migrations, models
from pgvector.django import VectorExtension


class Migration(migrations.Migration):
initial = True

dependencies = [
("slack", "0018_conversation_sync_messages"),
]

operations = [
VectorExtension(),
migrations.CreateModel(
name="Chunk",
fields=[
(
"id",
models.BigAutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
),
),
("nest_created_at", models.DateTimeField(auto_now_add=True)),
("nest_updated_at", models.DateTimeField(auto_now=True)),
("chunk_text", models.TextField(verbose_name="Chunk Text")),
(
"embedding",
pgvector.django.vector.VectorField(
dimensions=1536, verbose_name="Chunk Embedding"
),
),
(
"message",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="chunks",
to="slack.message",
),
),
],
options={
"verbose_name": "Chunks",
"db_table": "ai_chunks",
"unique_together": {("message", "chunk_text")},
},
),
]
Empty file.
1 change: 1 addition & 0 deletions backend/apps/ai/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .chunk import Chunk
63 changes: 63 additions & 0 deletions backend/apps/ai/models/chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Slack app chunk model."""

from django.db import models
from pgvector.django import VectorField

from apps.common.models import BulkSaveModel, TimestampedModel
from apps.common.utils import truncate
from apps.slack.models.message import Message


class Chunk(TimestampedModel):
"""Slack Chunk model."""

class Meta:
db_table = "ai_chunks"
verbose_name = "Chunks"
unique_together = ("message", "chunk_text")

message = models.ForeignKey(Message, on_delete=models.CASCADE, related_name="chunks")
chunk_text = models.TextField(verbose_name="Chunk Text")
embedding = VectorField(verbose_name="Chunk Embedding", dimensions=1536)

def __str__(self):
"""Human readable representation."""
text_preview = truncate(self.chunk_text, 50)
return f"Chunk {self.id} for Message {self.message.slack_message_id}: {text_preview}"

@staticmethod
def bulk_save(chunks, fields=None):
"""Bulk save chunks."""
chunks = [chunk for chunk in chunks if chunk is not None]
if chunks:
BulkSaveModel.bulk_save(Chunk, chunks, fields=fields)

@staticmethod
def update_data(
chunk_text: str,
message: Message,
embedding,
*,
save: bool = True,
) -> "Chunk | None":
"""Update chunk data.

Args:
chunk_text (str): The text content of the chunk.
message (Message): The message this chunk belongs to.
embedding (list): The embedding vector for the chunk.
save (bool): Whether to save the chunk to the database.

Returns:
Chunk: The updated chunk instance.

"""
if Chunk.objects.filter(message=message, chunk_text=chunk_text).exists():
return None

chunk = Chunk(message=message, chunk_text=chunk_text, embedding=embedding)

if save:
chunk.save()

return chunk
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def handle(self, *args, **options):
conversations = (
Conversation.objects.filter(slack_channel_id=channel_id)
if channel_id
else Conversation.objects.filter(sync_messages=True, workspace=workspace)
else Conversation.objects.filter(workspace=workspace)
)

for conversation in conversations:
Expand Down
5 changes: 5 additions & 0 deletions backend/apps/slack/models/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ def latest_reply(self) -> "Message | None":
.first()
)

@property
def subtype(self) -> str | None:
"""Get the subtype of the message if it exists."""
return self.raw_data.get("subtype")

def from_slack(
self,
message_data: dict,
Expand Down
Loading