-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.ts
126 lines (107 loc) · 3.38 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import { Hono } from "hono";
import { cors } from "hono/cors";
import { env } from "hono/adapter";
import { Index } from "@upstash/vector";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
// Application instance; middleware and routes are registered on it.
const app = new Hono();

// Register Hono's CORS middleware with its default options.
app.use(cors());

/** Environment bindings the request handler reads through `env<Environment>(c)`. */
type Environment = {
  VECTOR_URL: string;
  VECTOR_TOKEN: string;
};

// Splitter used to cut a message into overlapping multi-word chunks.
// Text is only split on single spaces; size and overlap are expressed in the
// splitter's length units (characters with LangChain's default length
// function — TODO confirm against the installed version).
const semanticSplitter = new RecursiveCharacterTextSplitter({
  separators: [" "],
  chunkSize: 25,
  chunkOverlap: 12,
});

// Minimum similarity score at which a multi-word chunk is flagged as profane.
const PROFANITY_THRESHOLD = 0.86;

// Words (compared in lower case) that are removed from a message before it is checked.
const WHITELIST = ["swear"];
/**
 * POST / — reports whether the `message` field of a JSON body looks profane.
 *
 * The message is stripped of whitelisted words, then queried against the
 * vector index twice: once per individual word and once per overlapping
 * multi-word chunk. A word is flagged above a stricter similarity threshold
 * than a chunk.
 *
 * Responses:
 * - 200 `{ isProfanity: true, score, flaggedFor }` — `flaggedFor` is the text of
 *   the highest-scoring flagged match.
 * - 200 `{ isProfanity: false, score }` — `score` is the highest multi-word chunk
 *   score seen (0 when there were none).
 * - 400 when the body is not valid JSON or `message` is missing / not a string.
 * - 406 when the request is not declared as JSON.
 * - 413 when `message` is longer than 1000 characters.
 * - 500 on any unexpected failure (e.g. the vector index is unreachable).
 */
app.post("/", async (c) => {
  // Compare only the media type so that parameters such as "; charset=utf-8"
  // (sent by many HTTP clients) do not cause a rejection.
  // NOTE(review): 415 would be the more accurate status; 406 is kept so that
  // existing clients matching on it keep working.
  const mediaType = c.req
    .header("Content-Type")
    ?.split(";")[0]
    ?.trim()
    .toLowerCase();
  if (mediaType !== "application/json") {
    return c.json({ error: "JSON body expected" }, { status: 406 });
  }

  try {
    // A malformed body is a client error, not a server failure.
    let body: unknown;
    try {
      body = await c.req.json();
    } catch {
      return c.json({ error: "Invalid JSON body" }, { status: 400 });
    }

    // Narrow the untrusted payload instead of asserting its shape.
    const rawMessage =
      typeof body === "object" && body !== null
        ? (body as { message?: unknown }).message
        : undefined;
    if (typeof rawMessage !== "string" || rawMessage.length === 0) {
      return c.json({ error: "Message argument is required" }, { status: 400 });
    }
    if (rawMessage.length > 1000) {
      return c.json(
        { error: "Message can only be at most 1000 character" },
        { status: 413 }
      );
    }

    // Drop whitelisted words and the empty tokens produced by repeated
    // whitespace, so no empty string is ever sent to the index.
    const message = rawMessage
      .split(/\s/)
      .filter(
        (word) => word.length > 0 && !WHITELIST.includes(word.toLowerCase())
      )
      .join(" ");

    // Nothing left to check (only whitespace and/or whitelisted words).
    if (message.length === 0) {
      return c.json({ isProfanity: false, score: 0 });
    }

    const { VECTOR_URL, VECTOR_TOKEN } = env<Environment>(c);
    const index = new Index({
      url: VECTOR_URL,
      token: VECTOR_TOKEN,
      cache: false,
    });

    const [semanticChunks, wordChunks] = await Promise.all([
      splitTextIntoSemantics(message),
      splitTextIntoWords(message),
    ]);

    type Match = { score: number; text: string };

    // Single words must match more closely than multi-word chunks to be flagged.
    const WORD_MATCH_THRESHOLD = 0.95;

    // Returns the closest index entry for `chunk`, or undefined when the index
    // yields no result. Falls back to the chunk itself when the entry carries
    // no textual metadata.
    const queryTopMatch = async (chunk: string): Promise<Match | undefined> => {
      const [vector] = await index.query({
        topK: 1,
        data: chunk,
        includeMetadata: true,
      });
      if (!vector) return undefined;
      const text = vector.metadata?.text;
      return {
        score: vector.score,
        text: typeof text === "string" ? text : chunk,
      };
    };

    const [wordMatches, semanticMatches] = await Promise.all([
      Promise.all(wordChunks.map((chunk) => queryTopMatch(chunk))),
      Promise.all(semanticChunks.map((chunk) => queryTopMatch(chunk))),
    ]);

    const flagged: Match[] = [];
    for (const match of wordMatches) {
      if (match && match.score > WORD_MATCH_THRESHOLD) flagged.push(match);
    }
    for (const match of semanticMatches) {
      if (match && match.score > PROFANITY_THRESHOLD) flagged.push(match);
    }

    if (flagged.length > 0) {
      const worst = flagged.reduce((best, match) =>
        match.score > best.score ? match : best
      );
      return c.json({
        isProfanity: true,
        score: worst.score,
        flaggedFor: worst.text,
      });
    }

    // Only multi-word chunks contribute to the reported score of a clean
    // message; missing results count as 0.
    const highestChunkScore = semanticMatches.reduce(
      (max, match) => Math.max(max, match?.score ?? 0),
      0
    );
    return c.json({ isProfanity: false, score: highestChunkScore });
  } catch (error) {
    console.error(error);
    return c.json({ error: "Something went wrong." }, { status: 500 });
  }
});
/**
 * Splits `text` at every single whitespace character.
 *
 * Consecutive whitespace characters yield empty strings in the result, and an
 * empty input yields `[""]`.
 *
 * @param text - The text to tokenize.
 * @returns The pieces between whitespace characters, in order.
 */
function splitTextIntoWords(text: string): string[] {
  const whitespace = /\s/;
  const words = text.split(whitespace);
  return words;
}
/**
 * Cuts `text` into multi-word chunks using the module-level `semanticSplitter`.
 *
 * @param text - The text to chunk.
 * @returns The `pageContent` of every document produced by the splitter, or an
 *   empty array when `text` contains no whitespace (a single token), in which
 *   case the splitter is not invoked.
 */
async function splitTextIntoSemantics(text: string): Promise<string[]> {
  const tokenCount = text.split(/\s/).length;
  if (tokenCount === 1) {
    return [];
  }

  const documents = await semanticSplitter.createDocuments([text]);
  return documents.map(({ pageContent }) => pageContent);
}
// Expose the configured Hono application as this module's default export.
export default app;