Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implements RAPTOR for better chunking #882 #883

Merged
merged 2 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions web/src/components/chunk-method-modal/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ import { useFetchParserListOnMount } from './hooks';

import { useTranslate } from '@/hooks/commonHooks';
import LayoutRecognize from '../layout-recognize';
import ParseConfiguration, {
showRaptorParseConfiguration,
} from '../parse-configuration';
import styles from './index.less';

interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
Expand Down Expand Up @@ -111,6 +114,7 @@ const ChunkMethodModal: React.FC<IProps> = ({
onCancel={hideModal}
afterClose={afterClose}
confirmLoading={loading}
width={700}
>
<Space size={[0, 8]} wrap>
<Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
Expand Down Expand Up @@ -255,6 +259,9 @@ const ChunkMethodModal: React.FC<IProps> = ({
</Form.Item>
)}
{showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
{showRaptorParseConfiguration(selectedTag) && (
<ParseConfiguration></ParseConfiguration>
)}
</Form>
</Modal>
);
Expand Down
206 changes: 206 additions & 0 deletions web/src/components/parse-configuration/index.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
import { useTranslate } from '@/hooks/commonHooks';
import { PlusOutlined } from '@ant-design/icons';
import {
Button,
Divider,
Flex,
Form,
Input,
InputNumber,
Slider,
Switch,
} from 'antd';
import random from 'lodash/random';

/**
 * Parser ids for which the RAPTOR configuration section is not displayed.
 */
export const excludedParseMethods: string[] = [
  'table',
  'resume',
  'one',
];

/**
 * Tells whether the RAPTOR configuration section should be rendered for the
 * given parser.
 *
 * @param parserId - Id of the currently selected chunk/parse method.
 * @returns `true` unless `parserId` is listed in `excludedParseMethods`.
 */
export const showRaptorParseConfiguration = (parserId: string): boolean => {
  const isAllowed = excludedParseMethods.every(
    (excluded) => excluded !== parserId,
  );
  return isAllowed;
};

/**
 * RAPTOR settings section of a parser configuration form.
 *
 * Renders a "use RAPTOR" switch and, while the switch is on, the prompt,
 * max-token, threshold, max-cluster and random-seed fields. Every field is
 * stored under `parser_config.raptor.*` of the surrounding form.
 *
 * Must be rendered inside an antd `<Form>`, because it obtains the form via
 * `Form.useFormInstance()`. The three types "table", "resume" and "one" do
 * not display this configuration; callers gate rendering with
 * `showRaptorParseConfiguration`.
 */
const ParseConfiguration = () => {
  const form = Form.useFormInstance();
  const { t } = useTranslate('knowledgeConfiguration');

  // Writes a freshly generated integer (lodash `random(10000)`) into the
  // random-seed field.
  const handleGenerate = () => {
    form.setFieldValue(
      ['parser_config', 'raptor', 'random_seed'],
      random(10000),
    );
  };

  return (
    <>
      <Divider></Divider>
      <Form.Item
        name={['parser_config', 'raptor', 'use_raptor']}
        label={t('useRaptor')}
        initialValue={false}
        valuePropName="checked"
        tooltip={t('useRaptorTip')}
      >
        <Switch />
      </Form.Item>
      <Form.Item
        // Re-render the dependent fields only when the switch value changes.
        // Optional chaining guards against store snapshots that do not contain
        // `parser_config` or `parser_config.raptor`; an unguarded property
        // access would throw a TypeError in that case.
        shouldUpdate={(prevValues, curValues) =>
          prevValues?.parser_config?.raptor?.use_raptor !==
          curValues?.parser_config?.raptor?.use_raptor
        }
      >
        {({ getFieldValue }) => {
          const useRaptor = getFieldValue([
            'parser_config',
            'raptor',
            'use_raptor',
          ]);

          return (
            useRaptor && (
              <>
                <Form.Item
                  name={['parser_config', 'raptor', 'prompt']}
                  label={t('prompt')}
                  initialValue={t('promptText')}
                  tooltip={t('promptTip')}
                  rules={[
                    {
                      required: true,
                      message: t('promptMessage'),
                    },
                  ]}
                >
                  <Input.TextArea rows={8} />
                </Form.Item>
                {/* Slider and number input share one field name, so they
                    edit the same form value. */}
                <Form.Item label={t('maxToken')} tooltip={t('maxTokenTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'max_token']}
                        noStyle
                        initialValue={128}
                        rules={[
                          {
                            required: true,
                            message: t('maxTokenMessage'),
                          },
                        ]}
                      >
                        <Slider max={2048} style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_token']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('maxTokenMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={2048} min={0} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('threshold')} tooltip={t('thresholdTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'threshold']}
                        noStyle
                        initialValue={0.1}
                        rules={[
                          {
                            required: true,
                            message: t('thresholdMessage'),
                          },
                        ]}
                      >
                        <Slider
                          min={0}
                          max={1}
                          style={{ width: '100%' }}
                          step={0.01}
                        />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'threshold']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('thresholdMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={1} min={0} step={0.01} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('maxCluster')} tooltip={t('maxClusterTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'max_cluster']}
                        noStyle
                        initialValue={64}
                        rules={[
                          {
                            required: true,
                            message: t('maxClusterMessage'),
                          },
                        ]}
                      >
                        <Slider min={1} max={1024} style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_cluster']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('maxClusterMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={1024} min={1} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('randomSeed')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'random_seed']}
                        noStyle
                        initialValue={0}
                        rules={[
                          {
                            required: true,
                            message: t('randomSeedMessage'),
                          },
                        ]}
                      >
                        <InputNumber style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    {/* Button that replaces the seed with a generated value. */}
                    <Form.Item noStyle>
                      <Button type="primary" onClick={handleGenerate}>
                        <PlusOutlined />
                      </Button>
                    </Form.Item>
                  </Flex>
                </Form.Item>
              </>
            )
          );
        }}
      </Form.Item>
    </>
  );
};

export default ParseConfiguration;
20 changes: 20 additions & 0 deletions web/src/locales/en.ts
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,26 @@ export default {
</p><p>
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
</p>`,
useRaptor: 'Use RAPTOR to enhance retrieval',
useRaptorTip:
'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
prompt: 'Prompt',
promptTip: 'LLM prompt used for summarization.',
promptMessage: 'Prompt is required',
promptText: `Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:
{cluster_content}
The above is the content you need to summarize.`,
maxToken: 'Max token',
maxTokenTip: 'Maximum token number for summarization.',
maxTokenMessage: 'Max token is required',
threshold: 'Threshold',
thresholdTip: 'The bigger the threshold is the less cluster will be.',
thresholdMessage: 'Threshold is required',
maxCluster: 'Max cluster',
maxClusterTip: 'Maximum cluster number.',
maxClusterMessage: 'Max cluster is required',
randomSeed: 'Random seed',
randomSeedMessage: 'Random seed is required',
},
chunk: {
chunk: 'Chunk',
Expand Down
19 changes: 19 additions & 0 deletions web/src/locales/zh-traditional.ts
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,25 @@ export default {
</p><p>
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
</p>`,
useRaptor: '使用RAPTOR文件增強策略',
useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
prompt: '提示詞',
promptMessage: '提示詞是必填項',
// Default RAPTOR summarization prompt. The `{cluster_content}` placeholder is
// kept untranslated so it matches the English locale; presumably it is
// substituted by name when the prompt is used (TODO confirm against the backend).
promptText: `請總結以下段落。 小心數字,不要編造。 段落如下:
{cluster_content}
以上就是你需要總結的內容。`,
maxToken: '最大token數',
maxTokenMessage: '最大token數是必填項',
threshold: '臨界點',
thresholdMessage: '臨界點是必填項',
maxCluster: '最大聚類數',
maxClusterMessage: '最大聚類數是必填項',
randomSeed: '隨機種子',
randomSeedMessage: '隨機種子是必填項',
promptTip: 'LLM提示用於總結。',
maxTokenTip: '用於匯總的最大token數。',
thresholdTip: '閾值越大,聚類越少。',
maxClusterTip: '最大聚類數。',
},
chunk: {
chunk: '解析塊',
Expand Down
19 changes: 19 additions & 0 deletions web/src/locales/zh.ts
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,25 @@ export default {
</p><p>
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
</p>`,
useRaptor: '使用召回增强RAPTOR策略',
useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
prompt: '提示词',
promptMessage: '提示词是必填项',
// Default RAPTOR summarization prompt. The `{cluster_content}` placeholder is
// kept untranslated so it matches the English locale; presumably it is
// substituted by name when the prompt is used (TODO confirm against the backend).
promptText: `请总结以下段落。 小心数字,不要编造。 段落如下:
{cluster_content}
以上就是你需要总结的内容。`,
maxToken: '最大token数',
maxTokenMessage: '最大token数是必填项',
threshold: '临界点',
thresholdMessage: '临界点是必填项',
maxCluster: '最大聚类数',
maxClusterMessage: '最大聚类数是必填项',
randomSeed: '随机种子',
randomSeedMessage: '随机种子是必填项',
promptTip: 'LLM提示用于总结。',
maxTokenTip: '用于汇总的最大token数。',
thresholdTip: '阈值越大,聚类越少。',
maxClusterTip: '最大聚类数。',
},
chunk: {
chunk: '解析块',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import {

import LayoutRecognize from '@/components/layout-recognize';
import MaxTokenNumber from '@/components/max-token-number';
import ParseConfiguration, {
showRaptorParseConfiguration,
} from '@/components/parse-configuration';
import { useTranslate } from '@/hooks/commonHooks';
import { FormInstance } from 'antd/lib';
import styles from './index.less';
Expand Down Expand Up @@ -99,15 +102,19 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
{({ getFieldValue }) => {
const parserId = getFieldValue('parser_id');

if (parserId === 'naive') {
return (
<>
<MaxTokenNumber></MaxTokenNumber>
<LayoutRecognize></LayoutRecognize>
</>
);
}
return null;
return (
<>
{parserId === 'naive' && (
<>
<MaxTokenNumber></MaxTokenNumber>
<LayoutRecognize></LayoutRecognize>
</>
)}
{showRaptorParseConfiguration(parserId) && (
<ParseConfiguration></ParseConfiguration>
)}
</>
);
}}
</Form.Item>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
'embd_id',
'parser_id',
'language',
'parser_config.chunk_token_num',
'parser_config',
]),
avatar: fileList,
});
Expand Down