Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: add support for indexing documents using pagefind, and pushing to s3 #2934

Merged
merged 3 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.server
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,4 @@ FIRECRAWL_API_KEY=fc-abdef**************
PDF2MD_URL="http://localhost:8081"
BATCH_CHUNK_LIMIT=120
CHAT_COMPLETION_TIMEOUT_SECS=10
YOUTUBE_API_KEY=""
50 changes: 50 additions & 0 deletions .github/workflows/push-server.yml
Original file line number Diff line number Diff line change
Expand Up @@ -802,3 +802,53 @@ jobs:
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}


# Builds the image described by server/Dockerfile.pagefind-worker and pushes it
# to Docker Hub as trieve/pagefind-worker, tagged `latest` plus a commit-SHA tag.
pagefind-worker:
name: Push Pagefind Index worker
runs-on: ${{ matrix.runner }}
strategy:
matrix:
runner: [blacksmith-8vcpu-ubuntu-2204]
platform: [linux/amd64]
# NOTE(review): the matrix above yields a single combination
# (blacksmith-8vcpu-ubuntu-2204 + linux/amd64); neither exclude entry below
# matches it, so they currently have no effect. Presumably kept for parity
# with multi-arch jobs — confirm before removing.
exclude:
- runner: blacksmith-8vcpu-ubuntu-2204
platform: linux/arm64
- runner: blacksmith-8vcpu-ubuntu-2204-arm
platform: linux/amd64
steps:
- name: Checkout the repo
uses: actions/checkout@v4

# - name: Set up QEMU
# uses: docker/setup-qemu-action@v3

- name: Setup buildx
uses: docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

# The `meta` step's outputs (tags, labels) are consumed by the build step below.
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# list of Docker images to use as base name for tags
images: |
trieve/pagefind-worker
tags: |
type=raw,latest
type=sha

- name: Build and push Docker image
uses: useblacksmith/[email protected]
with:
platforms: ${{ matrix.platform }}
context: server/
file: ./server/Dockerfile.pagefind-worker
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
6 changes: 5 additions & 1 deletion clients/search-component/example/.env.dist
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,8 @@ VITE_BRAND_NAME="Trieve"
VITE_BRAND_COLOR="#CB53EB"
VITE_PROBLEM_LINK="mailto:[email protected]?subject="
VITE_SHOW_FLOATING_BTN="true"
VITE_FLOATING_BTN_POSITION="bottom-right"
VITE_FLOATING_BTN_POSITION="bottom-right"
VITE_USE_PAGEFIND="false"
VITE_PAGEFIND_URL="https://pagefind-testing-index.trieve.ai/pagfind-index-west/pagefind"
VITE_USE_GROUP_SEARCH="false"
VITE_DEFAULT_TAGS='[{"tag": "pink", "label": "Pink"}]'
4 changes: 2 additions & 2 deletions clients/search-component/example/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
<link rel="icon" type="image/png" sizes="32x32" href="https://cdn.trieve.ai/favicon-32x32.png" />
<link rel="icon" type="image/png" sizes="16x16" href="https://cdn.trieve.ai/favicon-16x16.png" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta property="og:title" content="Trieve Drop-In RAG and Search Component">
<meta property="og:title" content="Search Component By Trieve">
<meta property="og:site_name" content="advanced relevance semantic search and RAG API">
<meta property="og:url" content="https://docsearch.trieve.ai">
<meta property="og:description" content="Build better, faster, and more relevant search and RAG with our open source API. Date recency biasing, re-ranker models, dense vector search, sub-sentence highlighting, and more all on one endpoint that you can host yourself." >
<meta property="og:type" content="">
<meta property="og:image" content="https://cdn.trieve.ai/trieve-og.png">
<title>Trieve Drop-In RAG and Search Component</title>
<title>Search Component By Trieve</title>
</head>
<body>
<div id="root"></div>
Expand Down
18 changes: 15 additions & 3 deletions clients/search-component/example/src/routes/ecommerce.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ export default function ECommerce() {
const useGroupSearch = import.meta.env.VITE_USE_GROUP_SEARCH == "true";
const showFloatingButton = import.meta.env.VITE_SHOW_FLOATING_BTN == "true";
const floatingButtonPosition = import.meta.env.VITE_FLOATING_BTN_POSITION;
const usePagefind = import.meta.env.VITE_USE_PAGEFIND == "true";
const pagefindUrl = import.meta.env.VITE_PAGEFIND_URL;

const defaultSearchQueries: string[] = (
import.meta.env.VITE_DEFAULT_SEARCH_QUERIES ?? ""
Expand All @@ -33,9 +35,8 @@ export default function ECommerce() {
return (
<>
<div
className={`p-12 flex flex-col items-center justify-center w-screen h-screen relative ${
theme === "dark" ? "bg-zinc-900 text-zinc-50" : ""
}`}
className={`p-12 flex flex-col items-center justify-center w-screen h-screen relative ${theme === "dark" ? "bg-zinc-900 text-zinc-50" : ""
}`}
>
<div className="absolute top-6 right-6">
<ul>
Expand Down Expand Up @@ -81,10 +82,21 @@ export default function ECommerce() {
use_autocomplete: false,
search_type: "fulltext",
}}
buttonTriggers={[
{
selector: ".random-trigger-location",
mode: "chat",
},
]}
pagefindOptions={{
usePagefind: usePagefind,
cdnBaseUrl: pagefindUrl,
}}
defaultSearchQueries={defaultSearchQueries}
tags={defaultTags}
floatingButtonPosition={floatingButtonPosition}
showFloatingButton={showFloatingButton}
debounceMs={500}
/>
</>
) : (
Expand Down
6 changes: 6 additions & 0 deletions clients/search-component/example/src/routes/index.lazy.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ export default function Home() {
const brandFontFamily = import.meta.env.VITE_BRAND_FONT_FAMILY;
const problemLink = import.meta.env.VITE_PROBLEM_LINK;
const useGroupSearch = import.meta.env.VITE_USE_GROUP_SEARCH == "true";
const usePagefind = import.meta.env.VITE_USE_PAGEFIND == "true";
const pagefindUrl = import.meta.env.VITE_PAGEFIND_URL;
const defaultSearchQueries: string[] = (
import.meta.env.VITE_DEFAULT_SEARCH_QUERIES ?? ""
).split(",");
Expand Down Expand Up @@ -127,6 +129,10 @@ export default function Home() {
},
]}
useGroupSearch={useGroupSearch}
pagefindOptions={{
usePagefind: usePagefind,
cdnBaseUrl: pagefindUrl,
}}
defaultAiQuestions={[
"What is Trieve?",
"How to perform autocomplete search?",
Expand Down
98 changes: 80 additions & 18 deletions clients/search-component/src/utils/hooks/modal-context.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ import {
} from "trieve-ts-sdk";
import {
countChunks,
countChunksWithPagefind,
groupSearchWithPagefind,
groupSearchWithTrieve,
searchWithPagefind,
searchWithTrieve,
} from "../trieve";

Expand All @@ -33,11 +36,19 @@ type customAutoCompleteAddOn = {
use_autocomplete?: boolean;
};

/**
 * Handle to the Pagefind module object obtained through a dynamic `import()`
 * of `pagefind.js`. The module is loaded from a URL assembled at runtime, so
 * no static type is imported for it here and the alias is left as `any`.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export type PagefindApi = any;

/** Placement of a currency marker relative to a value — presumably used for price display; confirm against usage. */
export type currencyPosition = "before" | "after";
/** Variant of the modal, accepted by the `type` prop of `ModalProps`. */
export type ModalTypes = "ecommerce" | "docs";
/** Interaction mode of the modal; accepted by `defaultSearchMode` in `ModalProps`. */
export type SearchModes = "chat" | "search";
/** Search request payload combined with the optional `use_autocomplete` add-on flag. */
export type searchOptions = simpleSearchReqPayload & customAutoCompleteAddOn;

/**
 * Configuration for serving search results from a Pagefind index instead of
 * the Trieve search API.
 */
export interface PagefindOptions {
  /** When true, search, group search and tag counts are routed through the Pagefind helpers. */
  usePagefind: boolean;
  /**
   * Base URL under which per-dataset Pagefind bundles are hosted. The loader
   * imports `${cdnBaseUrl}/${datasetId}/pagefind.js`, so no trailing slash is expected.
   */
  cdnBaseUrl?: string;
}

export type ModalProps = {
datasetId: string;
apiKey: string;
Expand Down Expand Up @@ -66,6 +77,7 @@ export type ModalProps = {
icon?: () => JSX.Element;
}[];
defaultSearchMode?: SearchModes;
pagefindOptions?: PagefindOptions;
type?: ModalTypes;
useGroupSearch?: boolean;
allowSwitchingModes?: boolean;
Expand Down Expand Up @@ -147,6 +159,7 @@ const ModalContext = createContext<{
currentGroup: ChunkGroup | null;
setCurrentGroup: React.Dispatch<React.SetStateAction<ChunkGroup | null>>;
tagCounts: CountChunkQueryResponseBody[];
pagefind?: PagefindApi;
}>({
props: defaultProps,
trieveSDK: (() => {}) as unknown as TrieveSDK,
Expand All @@ -170,6 +183,7 @@ const ModalContext = createContext<{
setCurrentGroup: () => {},
tagCounts: [],
setContextProps: () => {},
pagefind: null,
});

const ModalProvider = ({
Expand Down Expand Up @@ -197,6 +211,7 @@ const ModalProvider = ({
const [currentTag, setCurrentTag] = useState(
props.tags?.find((t) => t.selected)?.tag || "all"
);
const [pagefind, setPagefind] = useState<PagefindApi | null>(null);

const [currentGroup, setCurrentGroup] = useState<ChunkGroup | null>(null);

Expand All @@ -214,7 +229,7 @@ const ModalProvider = ({

try {
setLoadingResults(true);
if (props.useGroupSearch) {
if (props.useGroupSearch && !props.pagefindOptions?.usePagefind) {
const results = await groupSearchWithTrieve({
query: query,
searchOptions: props.searchOptions,
Expand All @@ -236,6 +251,33 @@ const ModalProvider = ({

setResults(Array.from(groupMap.values()));
setRequestID(results.requestID);
} else if (props.useGroupSearch && props.pagefindOptions?.usePagefind) {

const results = await groupSearchWithPagefind(
pagefind,
query,
props.datasetId,
currentTag !== "all" ? currentTag : undefined
);
const groupMap = new Map<string, GroupChunk[]>();
results.groups.forEach((group) => {
const title = group.chunks[0].chunk.metadata?.title;
if (groupMap.has(title)) {
groupMap.get(title)?.push(group);
} else {
groupMap.set(title, [group]);
}
});
setResults(Array.from(groupMap.values()));

} else if (!props.useGroupSearch && props.pagefindOptions?.usePagefind) {
const results = await searchWithPagefind(
pagefind,
query,
props.datasetId,
currentTag !== "all" ? currentTag : undefined
);
setResults(results);
} else {
const results = await searchWithTrieve({
query: query,
Expand Down Expand Up @@ -266,24 +308,33 @@ const ModalProvider = ({
return;
}
if (props.tags?.length) {
try {
const numberOfRecords = await Promise.all(
[ALL_TAG, ...props.tags].map((tag) =>
countChunks({
query: query,
trieve: trieve,
abortController,
...(tag.tag !== "all" && { tag: tag.tag }),
})
)
if (props.pagefindOptions?.usePagefind) {
const filterCounts = await countChunksWithPagefind(
pagefind,
query,
props.tags
);
setTagCounts(numberOfRecords);
} catch (e) {
if (
e != "AbortError" &&
e != "AbortError: signal is aborted without reason"
) {
console.error(e);
setTagCounts(filterCounts);
} else {
try {
const numberOfRecords = await Promise.all(
[ALL_TAG, ...props.tags].map((tag) =>
countChunks({
query: query,
trieve: trieve,
abortController,
...(tag.tag !== "all" && { tag: tag.tag }),
})
)
);
setTagCounts(numberOfRecords);
} catch (e) {
if (
e != "AbortError" &&
e != "AbortError: signal is aborted without reason"
) {
console.error(e);
}
}
}
}
Expand All @@ -296,6 +347,17 @@ const ModalProvider = ({
}));
}, [onLoadProps]);

// Load the Pagefind bundle once on mount when Pagefind-backed search is enabled.
// The bundle lives at `${cdnBaseUrl}/${datasetId}/pagefind.js` and is pulled in
// with a dynamic import; the resulting module is stored in state so the search
// helpers can use it.
useEffect(() => {
  const pagefindOptions = props.pagefindOptions;
  if (!pagefindOptions?.usePagefind) {
    return;
  }

  // Without a base URL the import target would be "undefined/<datasetId>/pagefind.js",
  // which can never resolve; report the misconfiguration instead of attempting it.
  if (!pagefindOptions.cdnBaseUrl) {
    console.error(
      "pagefindOptions.cdnBaseUrl is required when pagefindOptions.usePagefind is true"
    );
    return;
  }

  const pagefind_base_url = `${pagefindOptions.cdnBaseUrl}/${props.datasetId}`;
  import(`${pagefind_base_url}/pagefind.js`)
    .then((pagefind) => {
      setPagefind(pagefind);
      // Kick off loading of the index's filter metadata; the result itself is
      // not needed here. Returning the promise lets the catch below see failures.
      return pagefind.filters();
    })
    .catch((e: unknown) => {
      // A failed network fetch or a broken bundle must not surface as an
      // unhandled promise rejection; `pagefind` simply stays null.
      console.error(e);
    });
}, []);

useEffect(() => {
props.onOpenChange?.(open);
}, [open]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ export const setClickTriggers = (
let element: Element | null = document.querySelector(trigger.selector);
if (trigger.removeListeners ?? true) {
element = removeAllClickListeners(trigger.selector);
console.log("Removed click listeners from", trigger.selector);
}

if (element) {
Expand Down
Loading
Loading