Merged · Changes from 6 commits
4 changes: 4 additions & 0 deletions backend/Makefile
@@ -175,6 +175,10 @@ slack-sync-messages:
@echo "Syncing Slack messages"
@CMD="python manage.py slack_sync_messages" $(MAKE) exec-backend-command

slack-create-message-chunks:
@echo "creating message chunks"
@CMD="python manage.py slack_create_chunks" $(MAKE) exec-backend-command

sync-data: \
update-data \
enrich-data \
Empty file added backend/apps/ai/__init__.py
Empty file.
20 changes: 20 additions & 0 deletions backend/apps/ai/admin.py
@@ -0,0 +1,20 @@
"""AI app admin."""

from django.contrib import admin

from apps.ai.models.chunk import Chunk


class ChunkAdmin(admin.ModelAdmin):
list_display = (
"id",
"message",
"chunk_text",
)
search_fields = (
"message__slack_message_id",
"chunk_text",
)


admin.site.register(Chunk, ChunkAdmin)
Empty file.
116 changes: 116 additions & 0 deletions backend/apps/ai/management/commands/slack_create_chunks.py
@@ -0,0 +1,116 @@
"""A command to create chunks of Slack messages."""

import os
import re
import time
from datetime import UTC, datetime, timedelta

import emoji
import openai
from django.core.management.base import BaseCommand
from langchain.text_splitter import RecursiveCharacterTextSplitter

from apps.ai.models.chunk import Chunk
from apps.slack.models.message import Message


class Command(BaseCommand):
help = "Create chunks for Slack messages"

def handle(self, *args, **options):
openai_api_key = os.getenv("DJANGO_OPEN_AI_SECRET_KEY")

if not openai_api_key:
self.stdout.write(
self.style.ERROR("DJANGO_OPEN_AI_SECRET_KEY environment variable not set")
)
return

        self.openai_client = openai.OpenAI(api_key=openai_api_key)
        self.last_request_time = None

total_messages = Message.objects.count()
self.stdout.write(f"Found {total_messages} messages to process")

batch_size = 1000
processed_count = 0

for offset in range(0, total_messages, batch_size):
            batch_messages = Message.objects.order_by("id")[offset : offset + batch_size]
batch_chunks = []

for message in batch_messages:
cleaned_text = self.clean_message_text(message.raw_data.get("text", ""))
chunks = self.create_chunks_from_message(message, cleaned_text)
batch_chunks.extend(chunks)

if batch_chunks:
Chunk.bulk_save(batch_chunks)

            processed_count += len(batch_messages)
            self.stdout.write(f"Processed {processed_count}/{total_messages} messages")

self.stdout.write(f"Completed processing all {total_messages} messages")

def create_chunks_from_message(
self, message: Message, cleaned_text: str
) -> list[Chunk | None]:
"""Create chunks from a message."""
if message.raw_data.get("subtype") in ["channel_join", "channel_leave"]:
return []

chunk_texts = self.split_message_text(cleaned_text)

if not chunk_texts:
self.stdout.write(
f"No chunks created for message {message.slack_message_id} - text too short"
)
return []

        try:
            # Throttle embedding requests to roughly one every 1.2 seconds.
            if self.last_request_time is not None:
                time_since_last_request = datetime.now(UTC) - self.last_request_time
                if time_since_last_request < timedelta(seconds=1.2):
                    time.sleep(1.2 - time_since_last_request.total_seconds())

            response = self.openai_client.embeddings.create(
                model="text-embedding-3-small", input=chunk_texts
            )
            self.last_request_time = datetime.now(UTC)
embeddings = [d.embedding for d in response.data]
return [
Chunk.update_data(
chunk_text=text,
message=message,
embedding=embedding,
save=False,
)
for text, embedding in zip(chunk_texts, embeddings, strict=True)
]
        except openai.OpenAIError as e:
self.stdout.write(
self.style.ERROR(f"OpenAI API error for message {message.slack_message_id}: {e}")
)
return []

def split_message_text(self, message_text: str) -> list[str]:
"""Split message text into chunks."""
splitter = RecursiveCharacterTextSplitter(
chunk_size=300,
chunk_overlap=40,
length_function=len,
separators=["\n\n", "\n", " ", ""],
)
return splitter.split_text(message_text)

def clean_message_text(self, message_text: str) -> str:
"""Clean message text by removing emojis and other noise while preserving context."""
if not message_text:
return ""

        cleaned_text = emoji.demojize(message_text, delimiters=("", ""))
        cleaned_text = re.sub(r"<@U[A-Z0-9]+>", "", cleaned_text)
cleaned_text = re.sub(r"<https?://[^>]+>", "", cleaned_text)
cleaned_text = re.sub(r":\w+:", "", cleaned_text)
cleaned_text = re.sub(r"\s+", " ", cleaned_text)

return cleaned_text.strip()
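
Note (not part of the diff): a minimal, self-contained sketch of what the cleaning and splitting steps above do to a raw Slack message. The sample text and the resulting string are illustrative assumptions, not taken from this PR.

import re

import emoji
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Hypothetical sample message (not from the PR).
sample = "Hey <@U02ABCDEF>, see <https://owasp.org/nest> :tada:   thanks!"

# Same cleaning steps as clean_message_text above.
text = emoji.demojize(sample, delimiters=("", ""))  # Unicode emoji become plain words
text = re.sub(r"<@U[A-Z0-9]+>", "", text)           # drop user mentions
text = re.sub(r"<https?://[^>]+>", "", text)        # drop links
text = re.sub(r":\w+:", "", text)                   # drop :shortcode: emoji
text = re.sub(r"\s+", " ", text).strip()            # collapse whitespace
# text is now roughly "Hey , see thanks!"

# Same splitter configuration as split_message_text above; short messages
# yield a single chunk, longer ones overlap by 40 characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=40,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)
chunks = splitter.split_text(text)
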
51 changes: 51 additions & 0 deletions backend/apps/ai/migrations/0001_initial.py
@@ -0,0 +1,51 @@
# Generated by Django 5.2.3 on 2025-06-22 06:17

import django.db.models.deletion
import pgvector.django.vector
from django.db import migrations, models
from pgvector.django import VectorExtension


class Migration(migrations.Migration):
initial = True

dependencies = [
("slack", "0018_conversation_sync_messages"),
]

operations = [
VectorExtension(),
migrations.CreateModel(
name="Chunk",
fields=[
(
"id",
models.BigAutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
),
),
("nest_created_at", models.DateTimeField(auto_now_add=True)),
("nest_updated_at", models.DateTimeField(auto_now=True)),
("chunk_text", models.TextField(verbose_name="Chunk Text")),
(
"embedding",
pgvector.django.vector.VectorField(
dimensions=1536, verbose_name="Chunk Embedding"
),
),
(
"message",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="chunks",
to="slack.message",
),
),
],
options={
"verbose_name": "Chunks",
"db_table": "slack_chunks",
"unique_together": {("message", "chunk_text")},
},
),
]
Empty file.
1 change: 1 addition & 0 deletions backend/apps/ai/models/__init__.py
@@ -0,0 +1 @@
from .chunk import Chunk
70 changes: 70 additions & 0 deletions backend/apps/ai/models/chunk.py
@@ -0,0 +1,70 @@
"""Slack app chunk model."""

from django.db import models
from pgvector.django import VectorField

from apps.common.models import BulkSaveModel, TimestampedModel
from apps.common.utils import truncate
from apps.slack.models.message import Message


class Chunk(TimestampedModel):
"""Slack Chunk model."""

class Meta:
db_table = "slack_chunks"
verbose_name = "Chunks"
unique_together = ("message", "chunk_text")

message = models.ForeignKey(Message, on_delete=models.CASCADE, related_name="chunks")
chunk_text = models.TextField(verbose_name="Chunk Text")
embedding = VectorField(verbose_name="Chunk Embedding", dimensions=1536)

def __str__(self):
"""Human readable representation."""
text_preview = truncate(self.chunk_text, 50)
return f"Chunk {self.id} for Message {self.message.slack_message_id}: {text_preview}"

def from_chunk(self, chunk_text: str, message: Message, embedding=None) -> None:
"""Update instance based on chunk data."""
self.chunk_text = chunk_text
self.message = message
self.embedding = embedding

@staticmethod
def bulk_save(chunks, fields=None):
"""Bulk save chunks."""
chunks = [chunk for chunk in chunks if chunk is not None]
if chunks:
BulkSaveModel.bulk_save(Chunk, chunks, fields=fields)

@staticmethod
def update_data(
chunk_text: str,
message: Message,
embedding,
*,
save: bool = True,
) -> "Chunk | None":
"""Update chunk data.
Args:
chunk_text (str): The text content of the chunk.
message (Message): The message this chunk belongs to.
embedding (list): The embedding vector for the chunk.
save (bool): Whether to save the chunk to the database.
Returns:
Chunk: The updated chunk instance.
"""
if Chunk.objects.filter(message=message, chunk_text=chunk_text).exists():
return None

chunk = Chunk(message=message)
chunk.from_chunk(chunk_text, message, embedding)

if save:
chunk.save()

return chunk
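
Note (not part of the diff): a sketch of how the model above is intended to be consumed. The first half mirrors the slack_create_chunks command (update_data(save=False) followed by bulk_save); the similarity query at the end is an assumption based on pgvector's Django helpers, not something this PR adds, and the placeholder vectors and query_embedding are hypothetical.

from pgvector.django import CosineDistance

from apps.ai.models.chunk import Chunk
from apps.slack.models.message import Message

message = Message.objects.first()
chunk_texts = ["example chunk of text"]  # hypothetical chunk texts
embeddings = [[0.0] * 1536]              # placeholder 1536-dimension vectors

# Build unsaved Chunk instances, then persist them in one bulk operation
# (duplicates come back as None and are filtered out by bulk_save).
chunks = [
    Chunk.update_data(chunk_text=text, message=message, embedding=vector, save=False)
    for text, vector in zip(chunk_texts, embeddings, strict=True)
]
Chunk.bulk_save(chunks)

# Illustration only: rank stored chunks by cosine distance to a query embedding
# produced with the same text-embedding-3-small model (1536 dimensions).
query_embedding = [0.0] * 1536  # hypothetical query vector
nearest = Chunk.objects.order_by(CosineDistance("embedding", query_embedding))[:5]
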
@@ -64,7 +64,7 @@ def handle(self, *args, **options):
conversations = (
Conversation.objects.filter(slack_channel_id=channel_id)
if channel_id
else Conversation.objects.filter(sync_messages=True, workspace=workspace)
else Conversation.objects.filter(workspace=workspace)
)

for conversation in conversations: