feat: 拆分文本增加滑块,增加直接分段导入方式

This commit is contained in:
archer
2023-04-23 22:36:04 +08:00
parent 2774940851
commit e0b1a78344
15 changed files with 317 additions and 155 deletions

View File

@@ -85,8 +85,12 @@ export const postModelDataInput = (data: {
/**
* 拆分数据
*/
export const postModelDataSplitData = (data: { modelId: string; text: string; prompt: string }) =>
POST(`/model/data/splitData`, data);
/** Request body for splitting/importing text chunks into a model's dataset. */
type SplitDataBody = {
  modelId: string;
  chunks: string[];
  prompt: string;
  mode: 'qa' | 'subsection';
};

/**
 * Import pre-split text chunks for a model.
 * mode 'qa' queues QA-pair generation; 'subsection' inserts chunks directly.
 */
export const postModelDataSplitData = (data: SplitDataBody) =>
  POST(`/model/data/splitData`, data);
/**
* json导入数据

View File

@@ -0,0 +1,52 @@
import React from 'react';
import { Stack, Box, Flex, useTheme } from '@chakra-ui/react';
import type { StackProps } from '@chakra-ui/react';
// Props for the custom radio group: the selectable options, the currently
// selected value, and a change callback. Layout props are inherited from
// Chakra's Stack. (Removed a stray `// @ts-ignore` that suppressed nothing.)
interface Props extends StackProps {
  list: { label: string; value: string | number }[];
  value: string | number;
  onChange: (e: string | number) => void;
}

/**
 * Horizontal radio-button group. Each option is a clickable Flex item whose
 * ::before pseudo-element draws the radio ring: a thick blue border when
 * selected, a thin gray border otherwise, with a hover highlight.
 */
const Radio = ({ list, value, onChange, ...props }: Props) => {
  return (
    <Stack {...props} spacing={5} direction={'row'}>
      {list.map((item) => (
        <Flex
          key={item.value}
          alignItems={'center'}
          cursor={'pointer'}
          userSelect={'none'}
          _before={{
            content: '""',
            w: '16px',
            h: '16px',
            mr: 1,
            borderRadius: '16px',
            transition: '0.2s',
            ...(value === item.value
              ? {
                  border: '5px solid',
                  borderColor: 'blue.500'
                }
              : {
                  border: '2px solid',
                  borderColor: 'gray.200'
                })
          }}
          _hover={{
            _before: {
              borderColor: 'blue.400'
            }
          }}
          onClick={() => onChange(item.value)}
        >
          {item.label}
        </Flex>
      ))}
    </Stack>
  );
};

export default Radio;

View File

@@ -106,16 +106,20 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
});
} else {
// 有匹配情况下,添加知识库内容。
// 系统提示词过滤,最多 2000 tokens
const systemPrompt = systemPromptFilter(formatRedisPrompt, 2000);
// 系统提示词过滤,最多 3000 tokens
const systemPrompt = systemPromptFilter(formatRedisPrompt, 3000);
prompts.unshift({
obj: 'SYSTEM',
value: `${
model.systemPrompt || '根据知识库内容回答'
} 知识库是最新的,下面是知识库内容:当前时间为${dayjs().format(
'YYYY/MM/DD HH:mm:ss'
)}\n${systemPrompt}`
value: `
${model.systemPrompt}
${
model.search.mode === ModelVectorSearchModeEnum.hightSimilarity
? `你只能从知识库选择内容回答.不在知识库内容拒绝回复`
: ''
}
知识库内容为: 当前时间为${dayjs().format('YYYY/MM/DD HH:mm:ss')}\n${systemPrompt}'
`
});
}

View File

@@ -4,7 +4,7 @@ import { connectToDatabase, Model } from '@/service/mongo';
import { authToken } from '@/service/utils/tools';
import { ModelDataSchema } from '@/types/mongoSchema';
import { generateVector } from '@/service/events/generateVector';
import { connectPg, PgClient } from '@/service/pg';
import { PgClient } from '@/service/pg';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
@@ -26,7 +26,6 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
const userId = await authToken(authorization);
await connectToDatabase();
const pg = await connectPg();
// 验证是否是该用户的 model
const model = await Model.findOne({

View File

@@ -2,14 +2,20 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response';
import { connectToDatabase, SplitData, Model } from '@/service/mongo';
import { authToken } from '@/service/utils/tools';
import { generateVector } from '@/service/events/generateVector';
import { generateQA } from '@/service/events/generateQA';
import { encode } from 'gpt-token-utils';
import { PgClient } from '@/service/pg';
/* 拆分数据成QA */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
const { text, modelId, prompt } = req.body as { text: string; modelId: string; prompt: string };
if (!text || !modelId || !prompt) {
const { chunks, modelId, prompt, mode } = req.body as {
modelId: string;
chunks: string[];
prompt: string;
mode: 'qa' | 'subsection';
};
if (!chunks || !modelId || !prompt) {
throw new Error('参数错误');
}
await connectToDatabase();
@@ -28,46 +34,31 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
throw new Error('无权操作该模型');
}
const replaceText = text.replace(/\\n/g, '\n');
if (mode === 'qa') {
// 批量QA拆分插入数据
await SplitData.create({
userId,
modelId,
textList: chunks,
prompt
});
// 文本拆分成 chunk
const chunks = replaceText.split('\n').filter((item) => item.trim());
generateQA();
} else if (mode === 'subsection') {
// 插入记录
await PgClient.insert('modelData', {
values: chunks.map((item) => [
{ key: 'user_id', value: userId },
{ key: 'model_id', value: modelId },
{ key: 'q', value: item },
{ key: 'a', value: '' },
{ key: 'status', value: 'waiting' }
])
});
const textList: string[] = [];
let splitText = '';
/* 取 2.5k ~ 3.5K tokens 内容 */
chunks.forEach((chunk) => {
const tokens = encode(splitText + chunk).length;
if (tokens >= 3500) {
// 超过 3500不要这块内容
splitText && textList.push(splitText);
splitText = chunk;
} else if (tokens >= 2500) {
// 超过 3000取内容
splitText && textList.push(splitText + chunk);
splitText = '';
} else {
//没超过 3000继续添加
splitText += chunk;
}
});
if (splitText) {
textList.push(splitText);
generateVector();
}
// 批量插入数据
await SplitData.create({
userId,
modelId,
rawText: text,
textList,
prompt
});
generateQA();
jsonRes(res);
} catch (err) {
jsonRes(res, {

View File

@@ -126,16 +126,20 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
});
} else {
// 有匹配或者低匹配度模式情况下,添加知识库内容。
// 系统提示词过滤,最多 2000 tokens
const systemPrompt = systemPromptFilter(formatRedisPrompt, 2000);
// 系统提示词过滤,最多 3000 tokens
const systemPrompt = systemPromptFilter(formatRedisPrompt, 3000);
prompts.unshift({
obj: 'SYSTEM',
value: `${
model.systemPrompt || '根据知识库内容回答'
} 知识库是最新的,下面是知识库内容:当前时间为${dayjs().format(
'YYYY/MM/DD HH:mm:ss'
)}\n${systemPrompt}`
value: `
${model.systemPrompt}
${
model.search.mode === ModelVectorSearchModeEnum.hightSimilarity
? `你只能从知识库选择内容回答.不在知识库内容拒绝回复`
: ''
}
知识库内容为: 当前时间为${dayjs().format('YYYY/MM/DD HH:mm:ss')}\n${systemPrompt}'
`
});
}

View File

@@ -133,7 +133,7 @@ const Chat = ({ modelId, chatId }: { modelId: string; chatId: string }) => {
if (isScroll && res.history.length > 0) {
setTimeout(() => {
scrollToBottom('auto');
}, 2000);
}, 1200);
}
} catch (e: any) {
toast({

View File

@@ -122,9 +122,9 @@ const InputDataModal = ({
<Box h={'30px'}></Box>
<Textarea
placeholder={
'相关问题,可以输入多个问法, 最多500字。例如\n1. laf 是什么?\n2. laf 可以做什么?\n3. laf怎么用'
'相关问题,可以输入多个问法, 最多 1000 字。例如:\n1. laf 是什么?\n2. laf 可以做什么?\n3. laf怎么用'
}
maxLength={500}
maxLength={1000}
resize={'none'}
h={'calc(100% - 30px)'}
{...register(`q`, {
@@ -136,9 +136,9 @@ const InputDataModal = ({
<Box h={'30px'}></Box>
<Textarea
placeholder={
'知识点,最多1000字。请保持主语的完整性,缺少主语会导致效果不佳。例如:\n1. laf是一个云函数开发平台。\n2. laf 什么都能做。\n3. 下面是使用 laf 的例子: ……'
'知识点,最多 2000 字。例如:\n1. laf是一个云函数开发平台。\n2. laf 什么都能做。\n3. 下面是使用 laf 的例子: ……'
}
maxLength={1000}
maxLength={2000}
resize={'none'}
h={'calc(100% - 30px)'}
{...register(`a`, {

View File

@@ -18,6 +18,7 @@ import {
MenuItem,
Input
} from '@chakra-ui/react';
import type { BoxProps } from '@chakra-ui/react';
import type { ModelSchema } from '@/types/mongoSchema';
import type { ModelDataItemType } from '@/types/model';
import { ModelDataStatusMap } from '@/constants/model';
@@ -114,6 +115,14 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
}
});
const tdStyles: BoxProps = {
fontSize: 'xs',
maxW: '500px',
whiteSpace: 'pre-wrap',
maxH: '250px',
overflowY: 'auto'
};
return (
<>
<Flex>
@@ -156,8 +165,8 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
>
</MenuItem>
<MenuItem onClick={onOpenSelectFileModal}>/ QA </MenuItem>
<MenuItem onClick={onOpenSelectUrlModal}> QA </MenuItem>
<MenuItem onClick={onOpenSelectFileModal}>/</MenuItem>
{/* <MenuItem onClick={onOpenSelectUrlModal}>网站内容拆分</MenuItem> */}
<MenuItem onClick={onOpenSelectCsvModal}>csv </MenuItem>
</MenuList>
</Menu>
@@ -191,33 +200,23 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
<Box mt={4}>
<TableContainer minH={'500px'}>
<Table variant={'simple'}>
<Table variant={'simple'} w={'100%'}>
<Thead>
<Tr>
<Th>Question</Th>
<Th>Text</Th>
<Th>Status</Th>
<Th>()</Th>
<Th></Th>
<Th></Th>
<Th></Th>
</Tr>
</Thead>
<Tbody>
{modelDataList.map((item) => (
<Tr key={item.id}>
<Td minW={'200px'}>
<Box fontSize={'xs'} whiteSpace={'pre-wrap'}>
{item.q}
</Box>
<Td>
<Box {...tdStyles}>{item.q}</Box>
</Td>
<Td minW={'200px'}>
<Box
w={'100%'}
fontSize={'xs'}
whiteSpace={'pre-wrap'}
maxH={'250px'}
overflowY={'auto'}
>
{item.a}
</Box>
<Td>
<Box {...tdStyles}>{item.a || '-'}</Box>
</Td>
<Td>{ModelDataStatusMap[item.status]}</Td>
<Td>

View File

@@ -1,4 +1,4 @@
import React, { useState, useCallback } from 'react';
import React, { useState, useCallback, useMemo } from 'react';
import {
Box,
Flex,
@@ -20,9 +20,26 @@ import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
import { useMutation } from '@tanstack/react-query';
import { postModelDataSplitData } from '@/api/model';
import { formatPrice } from '@/utils/user';
import Radio from '@/components/Radio';
import { splitText } from '@/utils/file';
const fileExtension = '.txt,.doc,.docx,.pdf,.md';
// Per-mode import settings:
//   maxLen   - max tokens per chunk (spread into splitText)
//   slideLen - token overlap carried into the next chunk (spread into splitText)
//   price    - per-token price multiplier used for the on-screen cost estimate
//   isPrompt - whether the prompt input field is shown for this mode
const modeMap = {
  qa: {
    maxLen: 2800,
    slideLen: 800,
    price: 3,
    isPrompt: true
  },
  subsection: {
    maxLen: 1000,
    slideLen: 300,
    price: 0.4,
    isPrompt: false
  }
};
const SelectFileModal = ({
onClose,
onSuccess,
@@ -36,38 +53,45 @@ const SelectFileModal = ({
const { toast } = useToast();
const [prompt, setPrompt] = useState('');
const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true });
const [fileText, setFileText] = useState('');
const [mode, setMode] = useState<'qa' | 'subsection'>('qa');
const [fileTextArr, setFileTextArr] = useState<string[]>(['']);
const { openConfirm, ConfirmChild } = useConfirm({
content: '确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,任务讲被终止。'
});
const fileText = useMemo(() => {
const chunks = fileTextArr.map((item) =>
splitText({
text: item,
...modeMap[mode]
})
);
return chunks.join('');
}, [fileTextArr, mode]);
const onSelectFile = useCallback(
async (e: File[]) => {
setSelecting(true);
try {
const fileTexts = (
await Promise.all(
e.map((file) => {
// @ts-ignore
const extension = file?.name?.split('.').pop().toLowerCase();
switch (extension) {
case 'txt':
case 'md':
return readTxtContent(file);
case 'pdf':
return readPdfContent(file);
case 'doc':
case 'docx':
return readDocContent(file);
default:
return '';
}
})
)
)
.join(' ')
.replace(/(\\n|\n)+/g, '\n');
setFileText(fileTexts);
const fileTexts = await Promise.all(
e.map((file) => {
// @ts-ignore
const extension = file?.name?.split('.').pop().toLowerCase();
switch (extension) {
case 'txt':
case 'md':
return readTxtContent(file);
case 'pdf':
return readPdfContent(file);
case 'doc':
case 'docx':
return readDocContent(file);
default:
return '';
}
})
);
setFileTextArr(fileTexts);
} catch (error: any) {
console.log(error);
toast({
@@ -77,16 +101,25 @@ const SelectFileModal = ({
}
setSelecting(false);
},
[setSelecting, toast]
[toast]
);
const { mutate, isLoading } = useMutation({
mutationFn: async () => {
if (!fileText) return;
const chunks = fileTextArr
.map((item) =>
splitText({
text: item,
...modeMap[mode]
})
)
.flat();
await postModelDataSplitData({
modelId,
text: fileText.replace(/\\n/g, '\n').replace(/\n+/g, '\n'),
prompt: `下面是"${prompt || '一段长文本'}"`
chunks,
prompt: `下面是"${prompt || '一段长文本'}"`,
mode
});
toast({
title: '导入数据成功,需要一段拆解和训练',
@@ -106,58 +139,82 @@ const SelectFileModal = ({
return (
<Modal isOpen={true} onClose={onClose} isCentered>
<ModalOverlay />
<ModalContent maxW={'min(900px, 90vw)'} m={0} position={'relative'} h={'90vh'}>
<ModalContent maxW={'min(1000px, 90vw)'} m={0} position={'relative'} h={'90vh'}>
<ModalHeader></ModalHeader>
<ModalCloseButton />
<ModalBody
display={'flex'}
flexDirection={'column'}
p={4}
p={0}
h={'100%'}
alignItems={'center'}
justifyContent={'center'}
fontSize={'sm'}
>
<Button isLoading={selecting} onClick={onOpen}>
</Button>
<Box mt={2} maxW={['100%', '70%']}>
<Box mt={2} px={4} maxW={['100%']} textAlign={'justify'} color={'blackAlpha.600'}>
{fileExtension} QA
tokens
tokens {encode(fileText).length}{' '}
tokens {formatPrice(encode(fileText).length * modeMap[mode].price)}
</Box>
<Box mt={2}>
{encode(fileText).length} tokens {formatPrice(encode(fileText).length * 3)}
</Box>
<Flex w={'100%'} alignItems={'center'} my={4}>
<Box flex={'0 0 auto'} mr={2}>
</Box>
<Input
placeholder="提示词,例如: Laf的介绍/关于gpt4的论文/一段长文本"
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
size={'sm'}
{/* 拆分模式 */}
<Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
<Box flex={'0 0 70px'}>:</Box>
<Radio
ml={3}
list={[
{ label: 'QA拆分', value: 'qa' },
{ label: '直接分段', value: 'subsection' }
]}
value={mode}
onChange={(e) => setMode(e as 'subsection' | 'qa')}
/>
</Flex>
<Textarea
flex={'1 0 0'}
h={0}
w={'100%'}
placeholder="文件内容"
maxLength={-1}
resize={'none'}
fontSize={'xs'}
whiteSpace={'pre-wrap'}
value={fileText}
onChange={(e) => setFileText(e.target.value)}
/>
{/* 内容介绍 */}
{modeMap[mode].isPrompt && (
<Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
<Box flex={'0 0 70px'} mr={2}>
</Box>
<Input
placeholder="提示词,例如: Laf的介绍/关于gpt4的论文/一段长文本"
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
size={'sm'}
/>
</Flex>
)}
{/* 文本内容 */}
<Box flex={'1 0 0'} px={5} h={0} w={'100%'} overflowY={'auto'} mt={4}>
{fileTextArr.map((item, i) => (
<Box key={i} mb={5}>
<Box mb={1}>{i + 1}</Box>
<Textarea
placeholder="文件内容"
maxLength={-1}
rows={10}
fontSize={'xs'}
whiteSpace={'pre-wrap'}
value={item}
onChange={(e) => {
setFileTextArr([
...fileTextArr.slice(0, i),
e.target.value,
...fileTextArr.slice(i + 1)
]);
}}
/>
</Box>
))}
</Box>
</ModalBody>
<Flex px={6} pt={2} pb={4}>
<Button isLoading={selecting} onClick={onOpen}>
</Button>
<Box flex={1}></Box>
<Button variant={'outline'} mr={3} onClick={onClose}>
<Button variant={'outline'} colorScheme={'gray'} mr={3} onClick={onClose}>
</Button>
<Button isLoading={isLoading} isDisabled={fileText === ''} onClick={openConfirm(mutate)}>

View File

@@ -44,8 +44,9 @@ const SelectUrlModal = ({
if (!webText) return;
await postModelDataSplitData({
modelId,
text: webText,
prompt: `下面是"${prompt || '一段长文本'}"`
chunks: [],
prompt: `下面是"${prompt || '一段长文本'}"`,
mode: 'qa'
});
toast({
title: '导入数据成功,需要一段拆解和训练',
@@ -89,7 +90,7 @@ const SelectUrlModal = ({
<Modal isOpen={true} onClose={onClose} isCentered>
<ModalOverlay />
<ModalContent maxW={'min(900px, 90vw)'} m={0} position={'relative'} h={'90vh'}>
<ModalHeader></ModalHeader>
<ModalHeader></ModalHeader>
<ModalCloseButton />
<ModalBody
@@ -102,7 +103,7 @@ const SelectUrlModal = ({
fontSize={'sm'}
>
<Box mt={2} maxW={['100%', '70%']}>
QA tokens
</Box>
<Box mt={2}>

View File

@@ -69,9 +69,13 @@ export async function generateQA(next = false): Promise<any> {
const chatAPI = getOpenAIApi(userApiKey || systemKey);
const systemPrompt: ChatCompletionRequestMessage = {
role: 'system',
content: `你是出题官.${
dataItem.prompt || '下面是"一段长文本"'
},从中选出5至20个题目和答案,题目包含问答题,计算题,代码题等.答案要详细.按格式返回: Q1:\nA1:\nQ2:\nA2:\n`
content: `你是出题
${dataItem.prompt || '下面是"一段长文本"'}
从中选出5至20个题目和答案,题目包含问答题,计算题,代码题等.答案要详细.按格式返回: Q1:
A1:
Q2:
A2:
...`
};
// 请求 chatgpt 获取回答

View File

@@ -18,10 +18,6 @@ const SplitDataSchema = new Schema({
ref: 'model',
required: true
},
rawText: {
type: String,
required: true
},
textList: {
type: [String],
default: []

View File

@@ -75,7 +75,6 @@ export interface ModelSplitDataSchema {
_id: string;
userId: string;
modelId: string;
rawText: string;
prompt: string;
errorText: string;
textList: string[];

View File

@@ -1,5 +1,6 @@
import mammoth from 'mammoth';
import Papa from 'papaparse';
import { encode } from 'gpt-token-utils';
/**
* 读取 txt 文件内容
@@ -137,3 +138,54 @@ export const fileDownload = ({
downloadLink.click();
document.body.removeChild(downloadLink);
};
/**
 * Split text into chunks of roughly `maxLen` tokens, carrying a sliding-window
 * overlap of about `slideLen` tokens from the end of one chunk into the next.
 * maxLen - one chunk len. max: 3500
 * slideLen - The size of the before and after Text
 * maxLen > slideLen
 */
export const splitText = ({
  text,
  maxLen,
  slideLen
}: {
  text: string;
  maxLen: number;
  slideLen: number;
}) => {
  // Tokenize into runs of sentence punctuation or runs of non-whitespace,
  // dropping empty pieces and bare newlines.
  // NOTE(review): `[^\s]+` also swallows embedded punctuation, so sentences
  // without surrounding whitespace are not split at punctuation — confirm intent.
  const textArr =
    text.match(/[!?。\n.]+|[^\s]+/g)?.filter((item) => {
      const piece = item.replace(/(\\n)/g, '\n').trim();
      return piece !== '' && piece !== '\n';
    }) || [];

  // Each chunk tracks its accumulated token count plus the pieces it holds.
  const chunks: { sum: number; arr: string[] }[] = [{ sum: 0, arr: [] }];

  for (let i = 0; i < textArr.length; i++) {
    const tokenLen = encode(textArr[i]).length;
    const current = chunks[chunks.length - 1];
    current.sum += tokenLen;
    current.arr.push(textArr[i]);

    // Current chunk reached maxLen: start a new chunk.
    // Fixed: the original compared `current.sum + tokenLen >= maxLen` AFTER
    // already adding tokenLen to sum, double-counting the piece just pushed
    // and closing chunks roughly one piece's tokens early.
    if (current.sum >= maxLen) {
      // Seed the new chunk with the trailing ~slideLen tokens of the old one
      // so consecutive chunks overlap.
      const next: { sum: number; arr: string[] } = { sum: 0, arr: [] };
      for (let j = current.arr.length - 1; j >= 0; j--) {
        const piece = current.arr[j];
        next.sum += encode(piece).length;
        next.arr.unshift(piece);
        if (next.sum >= slideLen) {
          break;
        }
      }
      chunks.push(next);
    }
  }

  return chunks.map((item) => item.arr.join(''));
};