mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00
feat: 拆分文本增加滑块,增加直接分段导入方式
This commit is contained in:
@@ -85,8 +85,12 @@ export const postModelDataInput = (data: {
|
||||
/**
|
||||
* 拆分数据
|
||||
*/
|
||||
export const postModelDataSplitData = (data: { modelId: string; text: string; prompt: string }) =>
|
||||
POST(`/model/data/splitData`, data);
|
||||
export const postModelDataSplitData = (data: {
|
||||
modelId: string;
|
||||
chunks: string[];
|
||||
prompt: string;
|
||||
mode: 'qa' | 'subsection';
|
||||
}) => POST(`/model/data/splitData`, data);
|
||||
|
||||
/**
|
||||
* json导入数据
|
||||
|
52
src/components/Radio/index.tsx
Normal file
52
src/components/Radio/index.tsx
Normal file
@@ -0,0 +1,52 @@
|
||||
import React from 'react';
|
||||
import { Stack, Box, Flex, useTheme } from '@chakra-ui/react';
|
||||
import type { StackProps } from '@chakra-ui/react';
|
||||
|
||||
/**
 * Props accepted by the Radio selector.
 *
 * Every Stack prop is forwarded to the wrapping Stack except the native
 * `onChange`, which is omitted so it can be redeclared below as a value-based
 * callback without suppressing the type checker.
 */
interface Props extends Omit<StackProps, 'onChange'> {
  /** Options to render, in display order. */
  list: { label: string; value: string | number }[];
  /** Value of the option currently shown as selected. */
  value: string | number;
  /** Invoked with the value of the option the user clicked. */
  onChange: (e: string | number) => void;
}
|
||||
|
||||
/**
 * Horizontal single-choice selector.
 *
 * Each option is a clickable Flex whose `::before` pseudo element draws the
 * radio ring: a thick blue border for the selected option, a thin gray one for
 * the rest. Clicking an option reports its value through `onChange`; the
 * component keeps no state of its own. Remaining props are forwarded to the
 * wrapping Stack.
 */
const Radio = ({ list, value, onChange, ...props }: Props) => {
  // Border of the ring depends only on whether the option is the selected one.
  const ringBorder = (selected: boolean) => {
    if (selected) {
      return { border: '5px solid', borderColor: 'blue.500' };
    }
    return { border: '2px solid', borderColor: 'gray.200' };
  };

  return (
    <Stack {...props} spacing={5} direction={'row'}>
      {list.map(({ label, value: optionValue }) => {
        const pick = () => onChange(optionValue);

        return (
          <Flex
            key={optionValue}
            alignItems={'center'}
            cursor={'pointer'}
            userSelect={'none'}
            _before={{
              content: '""',
              w: '16px',
              h: '16px',
              mr: 1,
              borderRadius: '16px',
              transition: '0.2s',
              ...ringBorder(value === optionValue)
            }}
            _hover={{
              _before: {
                borderColor: 'blue.400'
              }
            }}
            onClick={pick}
          >
            {label}
          </Flex>
        );
      })}
    </Stack>
  );
};
|
||||
|
||||
export default Radio;
|
@@ -106,16 +106,20 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
|
||||
});
|
||||
} else {
|
||||
// 有匹配情况下,添加知识库内容。
|
||||
// 系统提示词过滤,最多 2000 tokens
|
||||
const systemPrompt = systemPromptFilter(formatRedisPrompt, 2000);
|
||||
// 系统提示词过滤,最多 3000 tokens
|
||||
const systemPrompt = systemPromptFilter(formatRedisPrompt, 3000);
|
||||
|
||||
prompts.unshift({
|
||||
obj: 'SYSTEM',
|
||||
value: `${
|
||||
model.systemPrompt || '根据知识库内容回答'
|
||||
} 知识库是最新的,下面是知识库内容:当前时间为${dayjs().format(
|
||||
'YYYY/MM/DD HH:mm:ss'
|
||||
)}\n${systemPrompt}`
|
||||
value: `
|
||||
${model.systemPrompt}
|
||||
${
|
||||
model.search.mode === ModelVectorSearchModeEnum.hightSimilarity
|
||||
? `你只能从知识库选择内容回答.不在知识库内容拒绝回复`
|
||||
: ''
|
||||
}
|
||||
知识库内容为: 当前时间为${dayjs().format('YYYY/MM/DD HH:mm:ss')}\n${systemPrompt}'
|
||||
`
|
||||
});
|
||||
}
|
||||
|
||||
|
@@ -4,7 +4,7 @@ import { connectToDatabase, Model } from '@/service/mongo';
|
||||
import { authToken } from '@/service/utils/tools';
|
||||
import { ModelDataSchema } from '@/types/mongoSchema';
|
||||
import { generateVector } from '@/service/events/generateVector';
|
||||
import { connectPg, PgClient } from '@/service/pg';
|
||||
import { PgClient } from '@/service/pg';
|
||||
|
||||
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
|
||||
try {
|
||||
@@ -26,7 +26,6 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
|
||||
const userId = await authToken(authorization);
|
||||
|
||||
await connectToDatabase();
|
||||
const pg = await connectPg();
|
||||
|
||||
// 验证是否是该用户的 model
|
||||
const model = await Model.findOne({
|
||||
|
@@ -2,14 +2,20 @@ import type { NextApiRequest, NextApiResponse } from 'next';
|
||||
import { jsonRes } from '@/service/response';
|
||||
import { connectToDatabase, SplitData, Model } from '@/service/mongo';
|
||||
import { authToken } from '@/service/utils/tools';
|
||||
import { generateVector } from '@/service/events/generateVector';
|
||||
import { generateQA } from '@/service/events/generateQA';
|
||||
import { encode } from 'gpt-token-utils';
|
||||
import { PgClient } from '@/service/pg';
|
||||
|
||||
/* 拆分数据成QA */
|
||||
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
|
||||
try {
|
||||
const { text, modelId, prompt } = req.body as { text: string; modelId: string; prompt: string };
|
||||
if (!text || !modelId || !prompt) {
|
||||
const { chunks, modelId, prompt, mode } = req.body as {
|
||||
modelId: string;
|
||||
chunks: string[];
|
||||
prompt: string;
|
||||
mode: 'qa' | 'subsection';
|
||||
};
|
||||
if (!chunks || !modelId || !prompt) {
|
||||
throw new Error('参数错误');
|
||||
}
|
||||
await connectToDatabase();
|
||||
@@ -28,46 +34,31 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
|
||||
throw new Error('无权操作该模型');
|
||||
}
|
||||
|
||||
const replaceText = text.replace(/\\n/g, '\n');
|
||||
if (mode === 'qa') {
|
||||
// 批量QA拆分插入数据
|
||||
await SplitData.create({
|
||||
userId,
|
||||
modelId,
|
||||
textList: chunks,
|
||||
prompt
|
||||
});
|
||||
|
||||
// 文本拆分成 chunk
|
||||
const chunks = replaceText.split('\n').filter((item) => item.trim());
|
||||
generateQA();
|
||||
} else if (mode === 'subsection') {
|
||||
// 插入记录
|
||||
await PgClient.insert('modelData', {
|
||||
values: chunks.map((item) => [
|
||||
{ key: 'user_id', value: userId },
|
||||
{ key: 'model_id', value: modelId },
|
||||
{ key: 'q', value: item },
|
||||
{ key: 'a', value: '' },
|
||||
{ key: 'status', value: 'waiting' }
|
||||
])
|
||||
});
|
||||
|
||||
const textList: string[] = [];
|
||||
let splitText = '';
|
||||
|
||||
/* 取 2.5k ~ 3.5K tokens 内容 */
|
||||
chunks.forEach((chunk) => {
|
||||
const tokens = encode(splitText + chunk).length;
|
||||
if (tokens >= 3500) {
|
||||
// 超过 3500,不要这块内容
|
||||
splitText && textList.push(splitText);
|
||||
splitText = chunk;
|
||||
} else if (tokens >= 2500) {
|
||||
// 超过 3000,取内容
|
||||
splitText && textList.push(splitText + chunk);
|
||||
splitText = '';
|
||||
} else {
|
||||
//没超过 3000,继续添加
|
||||
splitText += chunk;
|
||||
}
|
||||
});
|
||||
|
||||
if (splitText) {
|
||||
textList.push(splitText);
|
||||
generateVector();
|
||||
}
|
||||
|
||||
// 批量插入数据
|
||||
await SplitData.create({
|
||||
userId,
|
||||
modelId,
|
||||
rawText: text,
|
||||
textList,
|
||||
prompt
|
||||
});
|
||||
|
||||
generateQA();
|
||||
|
||||
jsonRes(res);
|
||||
} catch (err) {
|
||||
jsonRes(res, {
|
||||
|
@@ -126,16 +126,20 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
|
||||
});
|
||||
} else {
|
||||
// 有匹配或者低匹配度模式情况下,添加知识库内容。
|
||||
// 系统提示词过滤,最多 2000 tokens
|
||||
const systemPrompt = systemPromptFilter(formatRedisPrompt, 2000);
|
||||
// 系统提示词过滤,最多 3000 tokens
|
||||
const systemPrompt = systemPromptFilter(formatRedisPrompt, 3000);
|
||||
|
||||
prompts.unshift({
|
||||
obj: 'SYSTEM',
|
||||
value: `${
|
||||
model.systemPrompt || '根据知识库内容回答'
|
||||
} 知识库是最新的,下面是知识库内容:当前时间为${dayjs().format(
|
||||
'YYYY/MM/DD HH:mm:ss'
|
||||
)}\n${systemPrompt}`
|
||||
value: `
|
||||
${model.systemPrompt}
|
||||
${
|
||||
model.search.mode === ModelVectorSearchModeEnum.hightSimilarity
|
||||
? `你只能从知识库选择内容回答.不在知识库内容拒绝回复`
|
||||
: ''
|
||||
}
|
||||
知识库内容为: 当前时间为${dayjs().format('YYYY/MM/DD HH:mm:ss')}\n${systemPrompt}'
|
||||
`
|
||||
});
|
||||
}
|
||||
|
||||
|
@@ -133,7 +133,7 @@ const Chat = ({ modelId, chatId }: { modelId: string; chatId: string }) => {
|
||||
if (isScroll && res.history.length > 0) {
|
||||
setTimeout(() => {
|
||||
scrollToBottom('auto');
|
||||
}, 2000);
|
||||
}, 1200);
|
||||
}
|
||||
} catch (e: any) {
|
||||
toast({
|
||||
|
@@ -122,9 +122,9 @@ const InputDataModal = ({
|
||||
<Box h={'30px'}>问题</Box>
|
||||
<Textarea
|
||||
placeholder={
|
||||
'相关问题,可以输入多个问法, 最多500字。例如:\n1. laf 是什么?\n2. laf 可以做什么?\n3. laf怎么用'
|
||||
'相关问题,可以输入多个问法, 最多 1000 字。例如:\n1. laf 是什么?\n2. laf 可以做什么?\n3. laf怎么用'
|
||||
}
|
||||
maxLength={500}
|
||||
maxLength={1000}
|
||||
resize={'none'}
|
||||
h={'calc(100% - 30px)'}
|
||||
{...register(`q`, {
|
||||
@@ -136,9 +136,9 @@ const InputDataModal = ({
|
||||
<Box h={'30px'}>知识点</Box>
|
||||
<Textarea
|
||||
placeholder={
|
||||
'知识点,最多1000字。请保持主语的完整性,缺少主语会导致效果不佳。例如:\n1. laf是一个云函数开发平台。\n2. laf 什么都能做。\n3. 下面是使用 laf 的例子: ……'
|
||||
'知识点,最多 2000 字。例如:\n1. laf是一个云函数开发平台。\n2. laf 什么都能做。\n3. 下面是使用 laf 的例子: ……'
|
||||
}
|
||||
maxLength={1000}
|
||||
maxLength={2000}
|
||||
resize={'none'}
|
||||
h={'calc(100% - 30px)'}
|
||||
{...register(`a`, {
|
||||
|
@@ -18,6 +18,7 @@ import {
|
||||
MenuItem,
|
||||
Input
|
||||
} from '@chakra-ui/react';
|
||||
import type { BoxProps } from '@chakra-ui/react';
|
||||
import type { ModelSchema } from '@/types/mongoSchema';
|
||||
import type { ModelDataItemType } from '@/types/model';
|
||||
import { ModelDataStatusMap } from '@/constants/model';
|
||||
@@ -114,6 +115,14 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
}
|
||||
});
|
||||
|
||||
const tdStyles: BoxProps = {
|
||||
fontSize: 'xs',
|
||||
maxW: '500px',
|
||||
whiteSpace: 'pre-wrap',
|
||||
maxH: '250px',
|
||||
overflowY: 'auto'
|
||||
};
|
||||
|
||||
return (
|
||||
<>
|
||||
<Flex>
|
||||
@@ -156,8 +165,8 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
>
|
||||
手动输入
|
||||
</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectFileModal}>文本/文件 QA 拆分</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectUrlModal}>网站内容 QA 拆分</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectFileModal}>文本/文件拆分</MenuItem>
|
||||
{/* <MenuItem onClick={onOpenSelectUrlModal}>网站内容拆分</MenuItem> */}
|
||||
<MenuItem onClick={onOpenSelectCsvModal}>csv 问答对导入</MenuItem>
|
||||
</MenuList>
|
||||
</Menu>
|
||||
@@ -191,33 +200,23 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
|
||||
<Box mt={4}>
|
||||
<TableContainer minH={'500px'}>
|
||||
<Table variant={'simple'}>
|
||||
<Table variant={'simple'} w={'100%'}>
|
||||
<Thead>
|
||||
<Tr>
|
||||
<Th>Question</Th>
|
||||
<Th>Text</Th>
|
||||
<Th>Status</Th>
|
||||
<Th>匹配内容(问题)</Th>
|
||||
<Th>对应答案</Th>
|
||||
<Th>状态</Th>
|
||||
<Th>操作</Th>
|
||||
</Tr>
|
||||
</Thead>
|
||||
<Tbody>
|
||||
{modelDataList.map((item) => (
|
||||
<Tr key={item.id}>
|
||||
<Td minW={'200px'}>
|
||||
<Box fontSize={'xs'} whiteSpace={'pre-wrap'}>
|
||||
{item.q}
|
||||
</Box>
|
||||
<Td>
|
||||
<Box {...tdStyles}>{item.q}</Box>
|
||||
</Td>
|
||||
<Td minW={'200px'}>
|
||||
<Box
|
||||
w={'100%'}
|
||||
fontSize={'xs'}
|
||||
whiteSpace={'pre-wrap'}
|
||||
maxH={'250px'}
|
||||
overflowY={'auto'}
|
||||
>
|
||||
{item.a}
|
||||
</Box>
|
||||
<Td>
|
||||
<Box {...tdStyles}>{item.a || '-'}</Box>
|
||||
</Td>
|
||||
<Td>{ModelDataStatusMap[item.status]}</Td>
|
||||
<Td>
|
||||
|
@@ -1,4 +1,4 @@
|
||||
import React, { useState, useCallback } from 'react';
|
||||
import React, { useState, useCallback, useMemo } from 'react';
|
||||
import {
|
||||
Box,
|
||||
Flex,
|
||||
@@ -20,9 +20,26 @@ import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
|
||||
import { useMutation } from '@tanstack/react-query';
|
||||
import { postModelDataSplitData } from '@/api/model';
|
||||
import { formatPrice } from '@/utils/user';
|
||||
import Radio from '@/components/Radio';
|
||||
import { splitText } from '@/utils/file';
|
||||
|
||||
const fileExtension = '.txt,.doc,.docx,.pdf,.md';
|
||||
|
||||
const modeMap = {
|
||||
qa: {
|
||||
maxLen: 2800,
|
||||
slideLen: 800,
|
||||
price: 3,
|
||||
isPrompt: true
|
||||
},
|
||||
subsection: {
|
||||
maxLen: 1000,
|
||||
slideLen: 300,
|
||||
price: 0.4,
|
||||
isPrompt: false
|
||||
}
|
||||
};
|
||||
|
||||
const SelectFileModal = ({
|
||||
onClose,
|
||||
onSuccess,
|
||||
@@ -36,38 +53,45 @@ const SelectFileModal = ({
|
||||
const { toast } = useToast();
|
||||
const [prompt, setPrompt] = useState('');
|
||||
const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true });
|
||||
const [fileText, setFileText] = useState('');
|
||||
const [mode, setMode] = useState<'qa' | 'subsection'>('qa');
|
||||
const [fileTextArr, setFileTextArr] = useState<string[]>(['']);
|
||||
const { openConfirm, ConfirmChild } = useConfirm({
|
||||
content: '确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,任务讲被终止。'
|
||||
});
|
||||
|
||||
const fileText = useMemo(() => {
|
||||
const chunks = fileTextArr.map((item) =>
|
||||
splitText({
|
||||
text: item,
|
||||
...modeMap[mode]
|
||||
})
|
||||
);
|
||||
return chunks.join('');
|
||||
}, [fileTextArr, mode]);
|
||||
|
||||
const onSelectFile = useCallback(
|
||||
async (e: File[]) => {
|
||||
setSelecting(true);
|
||||
try {
|
||||
const fileTexts = (
|
||||
await Promise.all(
|
||||
e.map((file) => {
|
||||
// @ts-ignore
|
||||
const extension = file?.name?.split('.').pop().toLowerCase();
|
||||
switch (extension) {
|
||||
case 'txt':
|
||||
case 'md':
|
||||
return readTxtContent(file);
|
||||
case 'pdf':
|
||||
return readPdfContent(file);
|
||||
case 'doc':
|
||||
case 'docx':
|
||||
return readDocContent(file);
|
||||
default:
|
||||
return '';
|
||||
}
|
||||
})
|
||||
)
|
||||
)
|
||||
.join(' ')
|
||||
.replace(/(\\n|\n)+/g, '\n');
|
||||
setFileText(fileTexts);
|
||||
const fileTexts = await Promise.all(
|
||||
e.map((file) => {
|
||||
// @ts-ignore
|
||||
const extension = file?.name?.split('.').pop().toLowerCase();
|
||||
switch (extension) {
|
||||
case 'txt':
|
||||
case 'md':
|
||||
return readTxtContent(file);
|
||||
case 'pdf':
|
||||
return readPdfContent(file);
|
||||
case 'doc':
|
||||
case 'docx':
|
||||
return readDocContent(file);
|
||||
default:
|
||||
return '';
|
||||
}
|
||||
})
|
||||
);
|
||||
setFileTextArr(fileTexts);
|
||||
} catch (error: any) {
|
||||
console.log(error);
|
||||
toast({
|
||||
@@ -77,16 +101,25 @@ const SelectFileModal = ({
|
||||
}
|
||||
setSelecting(false);
|
||||
},
|
||||
[setSelecting, toast]
|
||||
[toast]
|
||||
);
|
||||
|
||||
const { mutate, isLoading } = useMutation({
|
||||
mutationFn: async () => {
|
||||
if (!fileText) return;
|
||||
const chunks = fileTextArr
|
||||
.map((item) =>
|
||||
splitText({
|
||||
text: item,
|
||||
...modeMap[mode]
|
||||
})
|
||||
)
|
||||
.flat();
|
||||
await postModelDataSplitData({
|
||||
modelId,
|
||||
text: fileText.replace(/\\n/g, '\n').replace(/\n+/g, '\n'),
|
||||
prompt: `下面是"${prompt || '一段长文本'}"`
|
||||
chunks,
|
||||
prompt: `下面是"${prompt || '一段长文本'}"`,
|
||||
mode
|
||||
});
|
||||
toast({
|
||||
title: '导入数据成功,需要一段拆解和训练',
|
||||
@@ -106,58 +139,82 @@ const SelectFileModal = ({
|
||||
return (
|
||||
<Modal isOpen={true} onClose={onClose} isCentered>
|
||||
<ModalOverlay />
|
||||
<ModalContent maxW={'min(900px, 90vw)'} m={0} position={'relative'} h={'90vh'}>
|
||||
<ModalContent maxW={'min(1000px, 90vw)'} m={0} position={'relative'} h={'90vh'}>
|
||||
<ModalHeader>文件导入</ModalHeader>
|
||||
<ModalCloseButton />
|
||||
|
||||
<ModalBody
|
||||
display={'flex'}
|
||||
flexDirection={'column'}
|
||||
p={4}
|
||||
p={0}
|
||||
h={'100%'}
|
||||
alignItems={'center'}
|
||||
justifyContent={'center'}
|
||||
fontSize={'sm'}
|
||||
>
|
||||
<Button isLoading={selecting} onClick={onOpen}>
|
||||
选择文件
|
||||
</Button>
|
||||
<Box mt={2} maxW={['100%', '70%']}>
|
||||
<Box mt={2} px={4} maxW={['100%']} textAlign={'justify'} color={'blackAlpha.600'}>
|
||||
支持 {fileExtension} 文件。模型会自动对文本进行 QA 拆分,需要较长训练时间,拆分需要消耗
|
||||
tokens,账号余额不足时,未拆分的数据会被删除。
|
||||
tokens,账号余额不足时,未拆分的数据会被删除。当前一共 {encode(fileText).length}{' '}
|
||||
个tokens,大约 {formatPrice(encode(fileText).length * modeMap[mode].price)}元
|
||||
</Box>
|
||||
<Box mt={2}>
|
||||
一共 {encode(fileText).length} 个tokens,大约 {formatPrice(encode(fileText).length * 3)}
|
||||
元
|
||||
</Box>
|
||||
<Flex w={'100%'} alignItems={'center'} my={4}>
|
||||
<Box flex={'0 0 auto'} mr={2}>
|
||||
下面是
|
||||
</Box>
|
||||
<Input
|
||||
placeholder="提示词,例如: Laf的介绍/关于gpt4的论文/一段长文本"
|
||||
value={prompt}
|
||||
onChange={(e) => setPrompt(e.target.value)}
|
||||
size={'sm'}
|
||||
{/* 拆分模式 */}
|
||||
<Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
|
||||
<Box flex={'0 0 70px'}>分段模式:</Box>
|
||||
<Radio
|
||||
ml={3}
|
||||
list={[
|
||||
{ label: 'QA拆分', value: 'qa' },
|
||||
{ label: '直接分段', value: 'subsection' }
|
||||
]}
|
||||
value={mode}
|
||||
onChange={(e) => setMode(e as 'subsection' | 'qa')}
|
||||
/>
|
||||
</Flex>
|
||||
<Textarea
|
||||
flex={'1 0 0'}
|
||||
h={0}
|
||||
w={'100%'}
|
||||
placeholder="文件内容"
|
||||
maxLength={-1}
|
||||
resize={'none'}
|
||||
fontSize={'xs'}
|
||||
whiteSpace={'pre-wrap'}
|
||||
value={fileText}
|
||||
onChange={(e) => setFileText(e.target.value)}
|
||||
/>
|
||||
{/* 内容介绍 */}
|
||||
{modeMap[mode].isPrompt && (
|
||||
<Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
|
||||
<Box flex={'0 0 70px'} mr={2}>
|
||||
下面是
|
||||
</Box>
|
||||
<Input
|
||||
placeholder="提示词,例如: Laf的介绍/关于gpt4的论文/一段长文本"
|
||||
value={prompt}
|
||||
onChange={(e) => setPrompt(e.target.value)}
|
||||
size={'sm'}
|
||||
/>
|
||||
</Flex>
|
||||
)}
|
||||
{/* 文本内容 */}
|
||||
<Box flex={'1 0 0'} px={5} h={0} w={'100%'} overflowY={'auto'} mt={4}>
|
||||
{fileTextArr.map((item, i) => (
|
||||
<Box key={i} mb={5}>
|
||||
<Box mb={1}>文本{i + 1}</Box>
|
||||
<Textarea
|
||||
placeholder="文件内容"
|
||||
maxLength={-1}
|
||||
rows={10}
|
||||
fontSize={'xs'}
|
||||
whiteSpace={'pre-wrap'}
|
||||
value={item}
|
||||
onChange={(e) => {
|
||||
setFileTextArr([
|
||||
...fileTextArr.slice(0, i),
|
||||
e.target.value,
|
||||
...fileTextArr.slice(i + 1)
|
||||
]);
|
||||
}}
|
||||
/>
|
||||
</Box>
|
||||
))}
|
||||
</Box>
|
||||
</ModalBody>
|
||||
|
||||
<Flex px={6} pt={2} pb={4}>
|
||||
<Button isLoading={selecting} onClick={onOpen}>
|
||||
选择文件
|
||||
</Button>
|
||||
<Box flex={1}></Box>
|
||||
<Button variant={'outline'} mr={3} onClick={onClose}>
|
||||
<Button variant={'outline'} colorScheme={'gray'} mr={3} onClick={onClose}>
|
||||
取消
|
||||
</Button>
|
||||
<Button isLoading={isLoading} isDisabled={fileText === ''} onClick={openConfirm(mutate)}>
|
||||
|
@@ -44,8 +44,9 @@ const SelectUrlModal = ({
|
||||
if (!webText) return;
|
||||
await postModelDataSplitData({
|
||||
modelId,
|
||||
text: webText,
|
||||
prompt: `下面是"${prompt || '一段长文本'}"`
|
||||
chunks: [],
|
||||
prompt: `下面是"${prompt || '一段长文本'}"`,
|
||||
mode: 'qa'
|
||||
});
|
||||
toast({
|
||||
title: '导入数据成功,需要一段拆解和训练',
|
||||
@@ -89,7 +90,7 @@ const SelectUrlModal = ({
|
||||
<Modal isOpen={true} onClose={onClose} isCentered>
|
||||
<ModalOverlay />
|
||||
<ModalContent maxW={'min(900px, 90vw)'} m={0} position={'relative'} h={'90vh'}>
|
||||
<ModalHeader>网站地址导入</ModalHeader>
|
||||
<ModalHeader>静态网站内容导入</ModalHeader>
|
||||
<ModalCloseButton />
|
||||
|
||||
<ModalBody
|
||||
@@ -102,7 +103,7 @@ const SelectUrlModal = ({
|
||||
fontSize={'sm'}
|
||||
>
|
||||
<Box mt={2} maxW={['100%', '70%']}>
|
||||
根据网站地址,获取网站文本内容(请注意获取后的内容,不是每个网站内容都能获取到的)。模型会对文本进行
|
||||
根据网站地址,获取网站文本内容(请注意仅能获取静态网站文本,注意看下获取后的内容是否正确)。模型会对文本进行
|
||||
QA 拆分,需要较长训练时间,拆分需要消耗 tokens,账号余额不足时,未拆分的数据会被删除。
|
||||
</Box>
|
||||
<Box mt={2}>
|
||||
|
@@ -69,9 +69,13 @@ export async function generateQA(next = false): Promise<any> {
|
||||
const chatAPI = getOpenAIApi(userApiKey || systemKey);
|
||||
const systemPrompt: ChatCompletionRequestMessage = {
|
||||
role: 'system',
|
||||
content: `你是出题官.${
|
||||
dataItem.prompt || '下面是"一段长文本"'
|
||||
},从中选出5至20个题目和答案,题目包含问答题,计算题,代码题等.答案要详细.按格式返回: Q1:\nA1:\nQ2:\nA2:\n`
|
||||
content: `你是出题人
|
||||
${dataItem.prompt || '下面是"一段长文本"'}
|
||||
从中选出5至20个题目和答案,题目包含问答题,计算题,代码题等.答案要详细.按格式返回: Q1:
|
||||
A1:
|
||||
Q2:
|
||||
A2:
|
||||
...`
|
||||
};
|
||||
|
||||
// 请求 chatgpt 获取回答
|
||||
|
@@ -18,10 +18,6 @@ const SplitDataSchema = new Schema({
|
||||
ref: 'model',
|
||||
required: true
|
||||
},
|
||||
rawText: {
|
||||
type: String,
|
||||
required: true
|
||||
},
|
||||
textList: {
|
||||
type: [String],
|
||||
default: []
|
||||
|
1
src/types/mongoSchema.d.ts
vendored
1
src/types/mongoSchema.d.ts
vendored
@@ -75,7 +75,6 @@ export interface ModelSplitDataSchema {
|
||||
_id: string;
|
||||
userId: string;
|
||||
modelId: string;
|
||||
rawText: string;
|
||||
prompt: string;
|
||||
errorText: string;
|
||||
textList: string[];
|
||||
|
@@ -1,5 +1,6 @@
|
||||
import mammoth from 'mammoth';
|
||||
import Papa from 'papaparse';
|
||||
import { encode } from 'gpt-token-utils';
|
||||
|
||||
/**
|
||||
* 读取 txt 文件内容
|
||||
@@ -137,3 +138,54 @@ export const fileDownload = ({
|
||||
downloadLink.click();
|
||||
document.body.removeChild(downloadLink);
|
||||
};
|
||||
|
||||
/**
 * Split text into overlapping chunks bounded by a token budget.
 *
 * The text is cut at whitespace into pieces. Pieces are appended to the current
 * chunk until its token total reaches `maxLen`; the chunk is then closed and the
 * next one is seeded with the trailing pieces of the closed chunk (roughly
 * `slideLen` tokens) so that neighbouring chunks share context.
 *
 * Notes:
 * - Whitespace between pieces is kept, collapsed to a single space, or to a
 *   single newline when the gap contained a line break, so words are not fused.
 * - Pieces that consist only of literal `\n` escape sequences are dropped.
 * - Pieces are never cut, so one piece larger than `maxLen` yields an oversized chunk.
 * - The overlap never re-includes the first piece of the closed chunk, so a
 *   chunk is never repeated in full inside the next one.
 *
 * @param text - raw text to split
 * @param maxLen - token total at which the current chunk is closed
 * @param slideLen - approximate number of tokens carried over into the next
 *   chunk; expected to be smaller than `maxLen`, `0` disables the overlap
 * @param countTokens - optional token counter for one piece; defaults to the
 *   length of `encode(piece)`
 * @returns the chunk strings in document order; an empty array when the text
 *   holds no content
 */
export const splitText = ({
  text,
  maxLen,
  slideLen,
  countTokens = (piece: string) => encode(piece).length
}: {
  text: string;
  maxLen: number;
  slideLen: number;
  countTokens?: (piece: string) => number;
}): string[] => {
  type Piece = { text: string; tokens: number };

  // Each match is a run of non-whitespace followed by the whitespace after it.
  const pieces: Piece[] = (text.match(/\S+\s*/g) ?? [])
    .filter((raw) => raw.replace(/\\n/g, '\n').trim() !== '')
    .map((raw) => {
      const content = raw.replace(/\s+$/, '');
      const gap = raw.slice(content.length);
      const separator = gap === '' ? '' : gap.includes('\n') ? '\n' : ' ';
      const normalized = content + separator;
      // Count once here so the overlap step below does not re-tokenize.
      return { text: normalized, tokens: countTokens(normalized) };
    });

  const chunks: Piece[][] = [];
  let current: Piece[] = [];
  let currentTokens = 0;

  pieces.forEach((piece, index) => {
    current.push(piece);
    currentTokens += piece.tokens;

    // Only roll over when more text follows; otherwise the next chunk would
    // contain nothing but overlap.
    const hasMore = index < pieces.length - 1;
    if (currentTokens < maxLen || !hasMore) return;

    chunks.push(current);

    // Seed the next chunk with the tail of the one just closed. Index 0 is
    // excluded so the seed is always strictly smaller than the closed chunk.
    const seed: Piece[] = [];
    let seedTokens = 0;
    for (let j = current.length - 1; j >= 1 && seedTokens < slideLen; j--) {
      seed.unshift(current[j]);
      seedTokens += current[j].tokens;
    }
    current = seed;
    currentTokens = seedTokens;
  });

  if (current.length > 0) {
    chunks.push(current);
  }

  return chunks.map((chunk) =>
    chunk
      .map((piece) => piece.text)
      .join('')
      .trim()
  );
};
|
||||
|
Reference in New Issue
Block a user