mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-24 13:53:50 +00:00
feat: 根据url获取网站文本
This commit is contained in:
@@ -5,15 +5,30 @@ import { TrainingItemType } from '../types/training';
|
||||
import { RequestPaging } from '../types/index';
|
||||
import { Obj2Query } from '@/utils/tools';
|
||||
|
||||
/**
 * Fetch the model list (`GET /model/list`).
 *
 * @returns The request result, typed as an array of `ModelSchema`.
 */
export const getMyModels = () => {
  return GET<ModelSchema[]>('/model/list');
};
|
||||
|
||||
/**
 * Create a model (`POST /model/create`).
 *
 * @param data - Request body carrying the new model's `name` and `serviceModelName`.
 * @returns The request result, typed as `ModelSchema`.
 */
export const postCreateModel = (data: { name: string; serviceModelName: string }) => {
  return POST<ModelSchema>('/model/create', data);
};
|
||||
|
||||
/**
 * Delete a model by id (`DELETE /model/del`).
 *
 * @param id - Model id, sent as the `modelId` query parameter. It is
 *   percent-encoded so reserved characters (`&`, `#`, `=`, spaces, ...)
 *   cannot alter the query string.
 */
export const delModelById = (id: string) =>
  DELETE(`/model/del?modelId=${encodeURIComponent(id)}`);
|
||||
|
||||
/**
 * Fetch a single model by id (`GET /model/detail`).
 *
 * @param id - Model id, sent as the `modelId` query parameter. It is
 *   percent-encoded so reserved characters cannot alter the query string.
 * @returns The request result, typed as `ModelSchema`.
 */
export const getModelById = (id: string) =>
  GET<ModelSchema>(`/model/detail?modelId=${encodeURIComponent(id)}`);
|
||||
|
||||
/**
 * Update a model by id (`PUT /model/update`).
 *
 * @param id - Model id, sent as the `modelId` query parameter. It is
 *   percent-encoded so reserved characters cannot alter the query string.
 * @param data - Fields to update, sent as the request body.
 */
export const putModelById = (id: string, data: ModelUpdateParams) =>
  PUT(`/model/update?modelId=${encodeURIComponent(id)}`, data);
|
||||
|
||||
@@ -35,29 +50,58 @@ export const getModelTrainings = (id: string) =>
|
||||
/** Parameters for listing a model's data: the `RequestPaging` fields plus `modelId`. */
type GetModelDataListProps = RequestPaging & { modelId: string };
|
||||
/**
 * Fetch a model's knowledge-base data (`GET /model/data/getModelData`).
 *
 * @param props - Paging fields plus `modelId`; serialized into the query
 *   string with `Obj2Query`.
 */
export const getModelDataList = (props: GetModelDataListProps) => {
  const query = Obj2Query(props);
  return GET(`/model/data/getModelData?${query}`);
};
|
||||
|
||||
/**
 * Fetch a model's data for export, without paging
 * (`GET /model/data/exportModelData`).
 *
 * @param modelId - Model id, percent-encoded before being placed in the
 *   query string so reserved characters cannot alter the request.
 * @returns The request result, typed as a string.
 */
export const getExportDataList = (modelId: string) =>
  GET<string>(`/model/data/exportModelData?modelId=${encodeURIComponent(modelId)}`);
|
||||
|
||||
/**
 * Fetch a model's split-data records (`GET /model/data/getSplitData`).
 *
 * @param modelId - Model id, percent-encoded before being placed in the
 *   query string so reserved characters cannot alter the request.
 * @returns The request result, typed as an array of `ModelSplitDataSchema`.
 */
export const getModelSplitDataList = (modelId: string) =>
  GET<ModelSplitDataSchema[]>(
    `/model/data/getSplitData?modelId=${encodeURIComponent(modelId)}`
  );
|
||||
/**
 * Fetch the number of a model's data items that are currently being split
 * (`GET /model/data/getSplitData`).
 *
 * @param modelId - Model id, percent-encoded before being placed in the
 *   query string so reserved characters cannot alter the request.
 * @returns The request result, typed as a number.
 */
export const getModelSplitDataListLen = (modelId: string) =>
  GET<number>(`/model/data/getSplitData?modelId=${encodeURIComponent(modelId)}`);
|
||||
|
||||
/**
 * Fetch the content of a web page through the backend
 * (`POST /model/data/fetchingUrlData`).
 *
 * @param url - Address of the page to fetch; sent as `{ url }` in the request body.
 * @returns The request result, typed as a string.
 */
export const getWebContent = (url: string) => {
  const body = { url };
  return POST<string>('/model/data/fetchingUrlData', body);
};
|
||||
|
||||
/**
 * Push manually entered data to a model (`POST /model/data/pushModelDataInput`).
 *
 * @param data - Request body: the target `modelId` and the list of items,
 *   each with a `text` and a `q` field.
 * @returns The request result, typed as a number.
 */
export const postModelDataInput = (data: {
  modelId: string;
  data: { text: ModelDataSchema['text']; q: ModelDataSchema['q'] }[];
}) => {
  return POST<number>('/model/data/pushModelDataInput', data);
};
|
||||
|
||||
export const postModelDataFileText = (data: { modelId: string; text: string; prompt: string }) =>
|
||||
/**
 * Submit a text for splitting (`POST /model/data/splitData`).
 *
 * @param data - Request body: the target `modelId`, the `text` to split and
 *   the `prompt` that accompanies it.
 */
export const postModelDataSplitData = (data: { modelId: string; text: string; prompt: string }) => {
  return POST('/model/data/splitData', data);
};
|
||||
|
||||
/**
 * Import data from JSON (`POST /model/data/pushModelDataJson`).
 *
 * @param modelId - Target model id.
 * @param jsonData - Items to import; each has a `prompt`, a `completion` and
 *   an optional `vector`. Sent under the `data` key of the request body.
 */
export const postModelDataJsonData = (
  modelId: string,
  jsonData: { prompt: string; completion: string; vector?: number[] }[]
) => {
  const payload = { modelId, data: jsonData };
  return POST('/model/data/pushModelDataJson', payload);
};
|
||||
|
||||
/**
 * Update one model data item (`PUT /model/data/putModelData`).
 *
 * @param data - Request body: the item's `dataId`, its new `text` and an
 *   optional `q`.
 */
export const putModelDataById = (data: { dataId: string; text: string; q?: string }) => {
  return PUT('/model/data/putModelData', data);
};
|
||||
/**
 * Delete one model data item (`DELETE /model/data/delModelDataById`).
 *
 * @param dataId - Id of the item to delete, percent-encoded before being
 *   placed in the query string so reserved characters cannot alter the request.
 */
export const delOneModelData = (dataId: string) =>
  DELETE(`/model/data/delModelDataById?dataId=${encodeURIComponent(dataId)}`);
|
||||
|
@@ -4,7 +4,6 @@ import { connectToDatabase } from '@/service/mongo';
|
||||
import { authToken } from '@/service/utils/tools';
|
||||
import { connectRedis } from '@/service/redis';
|
||||
import { VecModelDataIdx } from '@/constants/redis';
|
||||
import { BufferToVector } from '@/utils/tools';
|
||||
|
||||
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
|
||||
try {
|
||||
|
36
src/pages/api/model/data/fetchingUrlData.ts
Normal file
36
src/pages/api/model/data/fetchingUrlData.ts
Normal file
@@ -0,0 +1,36 @@
|
||||
import type { NextApiRequest, NextApiResponse } from 'next';
|
||||
import { jsonRes } from '@/service/response';
|
||||
import { connectToDatabase } from '@/service/mongo';
|
||||
import { authToken } from '@/service/utils/tools';
|
||||
import axios from 'axios';
|
||||
import { httpsAgent } from '@/service/utils/tools';
|
||||
|
||||
/**
 * Read the content of a web page on behalf of the caller.
 *
 * Expects a request body of the form `{ url: string }` and an `authorization`
 * header that is checked with `authToken`. On success the fetched body is
 * returned as text through `jsonRes`; any failure is reported through
 * `jsonRes` with code 500.
 *
 * NOTE(review): this endpoint makes the server request a caller-supplied
 * address (an SSRF surface). Only the URL scheme is restricted here; consider
 * additionally rejecting loopback / private / link-local targets.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    const { url } = req.body as { url?: unknown };
    if (typeof url !== 'string' || !url) {
      throw new Error('缺少 url');
    }

    // Only absolute http(s) URLs may be fetched; reject everything else
    // before any network or database work is done.
    let target: URL;
    try {
      target = new URL(url);
    } catch {
      throw new Error('url 格式错误');
    }
    if (target.protocol !== 'http:' && target.protocol !== 'https:') {
      throw new Error('仅支持 http/https 地址');
    }

    await connectToDatabase();

    const { authorization } = req.headers;
    await authToken(authorization);

    const response = await axios.get<string>(target.href, {
      httpsAgent,
      // Keep the body as text instead of letting axios try to parse it.
      responseType: 'text',
      // Do not let a slow or unresponsive site hold the request open indefinitely.
      timeout: 30000
    });

    jsonRes(res, { data: response.data });
  } catch (err) {
    jsonRes(res, {
      code: 500,
      error: err
    });
  }
}
|
@@ -24,7 +24,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
|
||||
});
|
||||
|
||||
jsonRes(res, {
|
||||
data
|
||||
data: data.map((item) => item.textList).flat().length
|
||||
});
|
||||
} catch (err) {
|
||||
jsonRes(res, {
|
||||
|
@@ -24,7 +24,7 @@ import { usePagination } from '@/hooks/usePagination';
|
||||
import {
|
||||
getModelDataList,
|
||||
delOneModelData,
|
||||
getModelSplitDataList,
|
||||
getModelSplitDataListLen,
|
||||
getExportDataList
|
||||
} from '@/api/model';
|
||||
import { DeleteIcon, RepeatIcon, EditIcon } from '@chakra-ui/icons';
|
||||
@@ -36,6 +36,7 @@ import type { FormData as InputDataType } from './InputDataModal';
|
||||
|
||||
const InputModel = dynamic(() => import('./InputDataModal'));
|
||||
const SelectFileModel = dynamic(() => import('./SelectFileModal'));
|
||||
const SelectUrlModel = dynamic(() => import('./SelectUrlModal'));
|
||||
const SelectJsonModel = dynamic(() => import('./SelectJsonModal'));
|
||||
|
||||
const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
@@ -63,14 +64,19 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
onOpen: onOpenSelectFileModal,
|
||||
onClose: onCloseSelectFileModal
|
||||
} = useDisclosure();
|
||||
const {
|
||||
isOpen: isOpenSelectUrlModal,
|
||||
onOpen: onOpenSelectUrlModal,
|
||||
onClose: onCloseSelectUrlModal
|
||||
} = useDisclosure();
|
||||
const {
|
||||
isOpen: isOpenSelectJsonModal,
|
||||
onOpen: onOpenSelectJsonModal,
|
||||
onClose: onCloseSelectJsonModal
|
||||
} = useDisclosure();
|
||||
|
||||
const { data: splitDataList, refetch } = useQuery(['getModelSplitDataList'], () =>
|
||||
getModelSplitDataList(model._id)
|
||||
const { data: splitDataLen, refetch } = useQuery(['getModelSplitDataList'], () =>
|
||||
getModelSplitDataListLen(model._id)
|
||||
);
|
||||
|
||||
const refetchData = useCallback(
|
||||
@@ -143,14 +149,13 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
手动输入
|
||||
</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectFileModal}>文件导入</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectUrlModal}>网站地址导入</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectJsonModal}>JSON导入</MenuItem>
|
||||
</MenuList>
|
||||
</Menu>
|
||||
</Flex>
|
||||
{splitDataList && splitDataList.length > 0 && (
|
||||
<Box fontSize={'xs'}>
|
||||
{splitDataList.map((item) => item.textList).flat().length}条数据正在拆分...
|
||||
</Box>
|
||||
{!!(splitDataLen && splitDataLen > 0) && (
|
||||
<Box fontSize={'xs'}>{splitDataLen}条数据正在拆分...</Box>
|
||||
)}
|
||||
<Box mt={4}>
|
||||
<TableContainer minH={'500px'}>
|
||||
@@ -236,6 +241,13 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
onSuccess={refetchData}
|
||||
/>
|
||||
)}
|
||||
{isOpenSelectUrlModal && (
|
||||
<SelectUrlModel
|
||||
modelId={model._id}
|
||||
onClose={onCloseSelectUrlModal}
|
||||
onSuccess={refetchData}
|
||||
/>
|
||||
)}
|
||||
{isOpenSelectJsonModal && (
|
||||
<SelectJsonModel
|
||||
modelId={model._id}
|
||||
|
@@ -19,7 +19,8 @@ import { encode } from 'gpt-token-utils';
|
||||
import { useConfirm } from '@/hooks/useConfirm';
|
||||
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/tools';
|
||||
import { useMutation } from '@tanstack/react-query';
|
||||
import { postModelDataFileText } from '@/api/model';
|
||||
import { postModelDataSplitData } from '@/api/model';
|
||||
import { formatPrice } from '@/utils/user';
|
||||
|
||||
const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 12);
|
||||
|
||||
@@ -85,7 +86,7 @@ const SelectFileModal = ({
|
||||
const { mutate, isLoading } = useMutation({
|
||||
mutationFn: async () => {
|
||||
if (!fileText) return;
|
||||
await postModelDataFileText({
|
||||
await postModelDataSplitData({
|
||||
modelId,
|
||||
text: fileText,
|
||||
prompt: `下面是${prompt || '一段长文本'}`
|
||||
@@ -126,10 +127,11 @@ const SelectFileModal = ({
|
||||
</Button>
|
||||
<Box mt={2} maxW={['100%', '70%']}>
|
||||
支持 {fileExtension} 文件。模型会自动对文本进行 QA 拆分,需要较长训练时间,拆分需要消耗
|
||||
tokens,大约0.04元/1k tokens,请确保账号余额充足。
|
||||
tokens,账号余额不足时,未拆分的数据会被删除。
|
||||
</Box>
|
||||
<Box mt={2}>
|
||||
一共 {fileText.length} 个字,{encode(fileText).length} 个tokens
|
||||
一共 {encode(fileText).length} 个tokens,大约 {formatPrice(encode(fileText).length * 4)}
|
||||
元
|
||||
</Box>
|
||||
<Flex w={'100%'} alignItems={'center'} my={4}>
|
||||
<Box flex={'0 0 auto'} mr={2}>
|
||||
|
168
src/pages/model/detail/components/SelectUrlModal.tsx
Normal file
168
src/pages/model/detail/components/SelectUrlModal.tsx
Normal file
@@ -0,0 +1,168 @@
|
||||
import React, { useState } from 'react';
|
||||
import {
|
||||
Box,
|
||||
Flex,
|
||||
Button,
|
||||
Modal,
|
||||
ModalOverlay,
|
||||
ModalContent,
|
||||
ModalHeader,
|
||||
ModalCloseButton,
|
||||
ModalBody,
|
||||
Input,
|
||||
Textarea
|
||||
} from '@chakra-ui/react';
|
||||
import { useToast } from '@/hooks/useToast';
|
||||
import { customAlphabet } from 'nanoid';
|
||||
import { encode } from 'gpt-token-utils';
|
||||
import { useConfirm } from '@/hooks/useConfirm';
|
||||
import { useMutation } from '@tanstack/react-query';
|
||||
import { postModelDataSplitData, getWebContent } from '@/api/model';
|
||||
import { formatPrice } from '@/utils/user';
|
||||
|
||||
const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 12);
|
||||
|
||||
/**
 * Modal for importing data from a web page.
 *
 * Flow: the user enters an address and presses the fetch button; the page is
 * retrieved with `getWebContent`, parsed as HTML and reduced to plain text,
 * which is shown in an editable textarea. After confirmation the text is
 * submitted with `postModelDataSplitData` together with an optional prompt.
 *
 * @param onClose - Called to close the modal (also after a successful import).
 * @param onSuccess - Called after a successful import.
 * @param modelId - Id of the model that receives the data.
 */
const SelectUrlModal = ({
  onClose,
  onSuccess,
  modelId
}: {
  onClose: () => void;
  onSuccess: () => void;
  modelId: string;
}) => {
  const { toast } = useToast();
  const [webUrl, setWebUrl] = useState('');
  const [webText, setWebText] = useState('');
  const [prompt, setPrompt] = useState(''); // prompt describing the content
  const { openConfirm, ConfirmChild } = useConfirm({
    content: '确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,任务将被终止。'
  });

  // Tokenize once per text change instead of twice on every render.
  const tokenCount = React.useMemo(() => encode(webText).length, [webText]);

  // Submit the current text for splitting.
  const { mutate: onclickImport, isLoading: isImporting } = useMutation({
    mutationFn: async () => {
      if (!webText) return;
      await postModelDataSplitData({
        modelId,
        text: webText,
        prompt: `下面是${prompt || '一段长文本'}`
      });
      toast({
        title: '导入数据成功,需要一段拆解和训练',
        status: 'success'
      });
      onClose();
      onSuccess();
    },
    onError(error) {
      console.log(error);
      toast({
        title: '导入数据失败',
        status: 'error'
      });
    }
  });

  // Fetch the page behind `webUrl` and turn it into plain text.
  const { mutate: onclickFetchingUrl, isLoading: isFetching } = useMutation({
    mutationFn: async () => {
      const targetUrl = webUrl.trim();
      if (!targetUrl) return;

      const html = await getWebContent(targetUrl);
      const htmlDoc = new DOMParser().parseFromString(html, 'text/html');

      // The parsed document is never rendered, so `innerText` behaves like
      // `textContent` and would include script/style source. Drop those
      // nodes before reading the text.
      htmlDoc.querySelectorAll('script, style, noscript').forEach((node) => node.remove());

      // Collapse whitespace first so a whitespace-only page counts as empty.
      const text = (htmlDoc?.body?.innerText || '').replace(/\s+/g, ' ').trim();
      if (!text) {
        throw new Error('获取不到数据');
      }
      setWebText(text);
    },
    onError(error) {
      console.log(error);
      toast({
        status: 'error',
        title: '获取网站内容失败'
      });
    }
  });

  return (
    <Modal isOpen={true} onClose={onClose} isCentered>
      <ModalOverlay />
      <ModalContent maxW={'min(900px, 90vw)'} m={0} position={'relative'} h={'90vh'}>
        <ModalHeader>网站地址导入</ModalHeader>
        <ModalCloseButton />

        <ModalBody
          display={'flex'}
          flexDirection={'column'}
          p={4}
          h={'100%'}
          alignItems={'center'}
          justifyContent={'center'}
          fontSize={'sm'}
        >
          <Box mt={2} maxW={['100%', '70%']}>
            根据网站地址,获取网站文本内容(请注意获取后的内容,不是每个网站内容都能获取到的)。模型会对文本进行
            QA 拆分,需要较长训练时间,拆分需要消耗 tokens,账号余额不足时,未拆分的数据会被删除。
          </Box>
          <Box mt={2}>
            一共 {tokenCount} 个tokens,大约 {formatPrice(tokenCount * 4)}元
          </Box>
          <Flex w={'100%'} alignItems={'center'} my={4}>
            <Box flex={'0 0 70px'}>网站地址</Box>
            <Input
              mx={2}
              placeholder="需要获取内容的地址。例如:https://fastgpt.ahapocket.cn"
              value={webUrl}
              onChange={(e) => setWebUrl(e.target.value)}
              size={'sm'}
            />
            <Button isLoading={isFetching} onClick={() => onclickFetchingUrl()}>
              获取
            </Button>
          </Flex>
          <Flex w={'100%'} alignItems={'center'} my={4}>
            <Box flex={'0 0 70px'} mr={2}>
              下面是
            </Box>
            <Input
              placeholder="内容提示词。例如: Laf的介绍/关于gpt4的论文/一段长文本"
              value={prompt}
              onChange={(e) => setPrompt(e.target.value)}
              size={'sm'}
            />
          </Flex>
          <Textarea
            flex={'1 0 0'}
            h={0}
            w={'100%'}
            placeholder="网站的内容"
            maxLength={-1}
            resize={'none'}
            fontSize={'xs'}
            whiteSpace={'pre-wrap'}
            value={webText}
            onChange={(e) => setWebText(e.target.value)}
          />
        </ModalBody>

        <Flex px={6} pt={2} pb={4}>
          <Box flex={1}></Box>
          <Button variant={'outline'} mr={3} onClick={onClose}>
            取消
          </Button>
          <Button
            isLoading={isImporting}
            isDisabled={webText === ''}
            onClick={openConfirm(onclickImport)}
          >
            确认导入
          </Button>
        </Flex>
      </ModalContent>
      <ConfirmChild />
    </Modal>
  );
};
|
||||
|
||||
export default SelectUrlModal;
|
@@ -134,15 +134,7 @@ export const vectorToBuffer = (vector: number[]) => {
|
||||
|
||||
return buffer;
|
||||
};
|
||||
/**
 * Decode a binary string of packed 32-bit floats into a plain number array.
 *
 * Each character of `bufferStr` is read as one byte (`'binary'` encoding) and
 * every 4 bytes are reinterpreted as one float32 in platform byte order.
 * NOTE(review): presumably the inverse of `vectorToBuffer`, whose body is not
 * visible here — confirm the byte layout matches.
 *
 * @param bufferStr - Binary-encoded string holding the float32 bytes.
 * @returns The decoded values; trailing bytes that do not form a complete
 *   float32 are ignored. An empty string yields an empty array.
 */
export const BufferToVector = (bufferStr: string) => {
  // Decode the argument itself, one byte per character.
  const buffer = Buffer.from(bufferStr, 'binary');

  // Copy into a fresh, zero-offset ArrayBuffer: Node may hand out pooled
  // Buffers whose byteOffset is not 4-byte aligned, which Float32Array rejects.
  const bytes = new Uint8Array(buffer);
  const count = Math.floor(bytes.byteLength / Float32Array.BYTES_PER_ELEMENT);

  // Reinterpret the bytes as float32 values (passing the underlying
  // ArrayBuffer, not the byte array, so elements are not copied one per byte).
  const floats = new Float32Array(bytes.buffer, 0, count);
  return Array.from(floats);
};
|
||||
|
||||
export function formatVector(vector: number[]) {
|
||||
let formattedVector = vector.slice(0, 1536); // 截取前1536个元素
|
||||
if (vector.length > 1536) {
|
||||
|
Reference in New Issue
Block a user