feat: 根据url获取网站文本

This commit is contained in:
archer
2023-04-05 16:10:47 +08:00
parent 5feb2e19bf
commit dc329041f3
8 changed files with 278 additions and 25 deletions

View File

@@ -5,15 +5,30 @@ import { TrainingItemType } from '../types/training';
import { RequestPaging } from '../types/index';
import { Obj2Query } from '@/utils/tools';
/**
* 获取模型列表
*/
export const getMyModels = () => GET<ModelSchema[]>('/model/list');
/**
* 创建一个模型
*/
export const postCreateModel = (data: { name: string; serviceModelName: string }) =>
POST<ModelSchema>('/model/create', data);
/**
* 根据 ID 删除模型
*/
export const delModelById = (id: string) => DELETE(`/model/del?modelId=${id}`);
/**
* 根据 ID 获取模型
*/
export const getModelById = (id: string) => GET<ModelSchema>(`/model/detail?modelId=${id}`);
/**
* 根据 ID 更新模型
*/
export const putModelById = (id: string, data: ModelUpdateParams) =>
PUT(`/model/update?modelId=${id}`, data);
@@ -35,29 +50,58 @@ export const getModelTrainings = (id: string) =>
type GetModelDataListProps = RequestPaging & {
modelId: string;
};
/**
* 获取模型的知识库数据
*/
export const getModelDataList = (props: GetModelDataListProps) =>
GET(`/model/data/getModelData?${Obj2Query(props)}`);
/**
* 获取导出数据(不分页)
*/
export const getExportDataList = (modelId: string) =>
GET<string>(`/model/data/exportModelData?modelId=${modelId}`);
export const getModelSplitDataList = (modelId: string) =>
GET<ModelSplitDataSchema[]>(`/model/data/getSplitData?modelId=${modelId}`);
/**
* 获取模型正在拆分数据的数量
*/
export const getModelSplitDataListLen = (modelId: string) =>
GET<number>(`/model/data/getSplitData?modelId=${modelId}`);
/**
* 获取 web 页面内容
*/
export const getWebContent = (url: string) => POST<string>(`/model/data/fetchingUrlData`, { url });
/**
* 手动输入数据
*/
export const postModelDataInput = (data: {
modelId: string;
data: { text: ModelDataSchema['text']; q: ModelDataSchema['q'] }[];
}) => POST<number>(`/model/data/pushModelDataInput`, data);
export const postModelDataFileText = (data: { modelId: string; text: string; prompt: string }) =>
/**
* 拆分数据
*/
export const postModelDataSplitData = (data: { modelId: string; text: string; prompt: string }) =>
POST(`/model/data/splitData`, data);
/**
* json导入数据
*/
export const postModelDataJsonData = (
modelId: string,
jsonData: { prompt: string; completion: string; vector?: number[] }[]
) => POST(`/model/data/pushModelDataJson`, { modelId, data: jsonData });
/**
* 更新模型数据
*/
export const putModelDataById = (data: { dataId: string; text: string; q?: string }) =>
PUT('/model/data/putModelData', data);
/**
* 删除一条模型数据
*/
export const delOneModelData = (dataId: string) =>
DELETE(`/model/data/delModelDataById?dataId=${dataId}`);

View File

@@ -4,7 +4,6 @@ import { connectToDatabase } from '@/service/mongo';
import { authToken } from '@/service/utils/tools';
import { connectRedis } from '@/service/redis';
import { VecModelDataIdx } from '@/constants/redis';
import { BufferToVector } from '@/utils/tools';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {

View File

@@ -0,0 +1,36 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response';
import { connectToDatabase } from '@/service/mongo';
import { authToken } from '@/service/utils/tools';
import axios from 'axios';
import { httpsAgent } from '@/service/utils/tools';
/**
* 读取网站的内容
*/
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
const { url } = req.body as { url: string };
if (!url) {
throw new Error('缺少 url');
}
await connectToDatabase();
const { authorization } = req.headers;
await authToken(authorization);
const data = await axios
.get(url, {
httpsAgent
})
.then((res) => res.data as string);
jsonRes(res, { data });
} catch (err) {
jsonRes(res, {
code: 500,
error: err
});
}
}

View File

@@ -24,7 +24,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
});
jsonRes(res, {
data
data: data.map((item) => item.textList).flat().length
});
} catch (err) {
jsonRes(res, {

View File

@@ -24,7 +24,7 @@ import { usePagination } from '@/hooks/usePagination';
import {
getModelDataList,
delOneModelData,
getModelSplitDataList,
getModelSplitDataListLen,
getExportDataList
} from '@/api/model';
import { DeleteIcon, RepeatIcon, EditIcon } from '@chakra-ui/icons';
@@ -36,6 +36,7 @@ import type { FormData as InputDataType } from './InputDataModal';
const InputModel = dynamic(() => import('./InputDataModal'));
const SelectFileModel = dynamic(() => import('./SelectFileModal'));
const SelectUrlModel = dynamic(() => import('./SelectUrlModal'));
const SelectJsonModel = dynamic(() => import('./SelectJsonModal'));
const ModelDataCard = ({ model }: { model: ModelSchema }) => {
@@ -63,14 +64,19 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
onOpen: onOpenSelectFileModal,
onClose: onCloseSelectFileModal
} = useDisclosure();
const {
isOpen: isOpenSelectUrlModal,
onOpen: onOpenSelectUrlModal,
onClose: onCloseSelectUrlModal
} = useDisclosure();
const {
isOpen: isOpenSelectJsonModal,
onOpen: onOpenSelectJsonModal,
onClose: onCloseSelectJsonModal
} = useDisclosure();
const { data: splitDataList, refetch } = useQuery(['getModelSplitDataList'], () =>
getModelSplitDataList(model._id)
const { data: splitDataLen, refetch } = useQuery(['getModelSplitDataList'], () =>
getModelSplitDataListLen(model._id)
);
const refetchData = useCallback(
@@ -143,14 +149,13 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
</MenuItem>
<MenuItem onClick={onOpenSelectFileModal}></MenuItem>
<MenuItem onClick={onOpenSelectUrlModal}></MenuItem>
<MenuItem onClick={onOpenSelectJsonModal}>JSON导入</MenuItem>
</MenuList>
</Menu>
</Flex>
{splitDataList && splitDataList.length > 0 && (
<Box fontSize={'xs'}>
{splitDataList.map((item) => item.textList).flat().length}...
</Box>
{!!(splitDataLen && splitDataLen > 0) && (
<Box fontSize={'xs'}>{splitDataLen}...</Box>
)}
<Box mt={4}>
<TableContainer minH={'500px'}>
@@ -236,6 +241,13 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
onSuccess={refetchData}
/>
)}
{isOpenSelectUrlModal && (
<SelectUrlModel
modelId={model._id}
onClose={onCloseSelectUrlModal}
onSuccess={refetchData}
/>
)}
{isOpenSelectJsonModal && (
<SelectJsonModel
modelId={model._id}

View File

@@ -19,7 +19,8 @@ import { encode } from 'gpt-token-utils';
import { useConfirm } from '@/hooks/useConfirm';
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/tools';
import { useMutation } from '@tanstack/react-query';
import { postModelDataFileText } from '@/api/model';
import { postModelDataSplitData } from '@/api/model';
import { formatPrice } from '@/utils/user';
const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 12);
@@ -85,7 +86,7 @@ const SelectFileModal = ({
const { mutate, isLoading } = useMutation({
mutationFn: async () => {
if (!fileText) return;
await postModelDataFileText({
await postModelDataSplitData({
modelId,
text: fileText,
prompt: `下面是${prompt || '一段长文本'}`
@@ -126,10 +127,11 @@ const SelectFileModal = ({
</Button>
<Box mt={2} maxW={['100%', '70%']}>
{fileExtension} QA
tokens0.04/1k tokens
tokens
</Box>
<Box mt={2}>
{fileText.length} {encode(fileText).length} tokens
{encode(fileText).length} tokens {formatPrice(encode(fileText).length * 4)}
</Box>
<Flex w={'100%'} alignItems={'center'} my={4}>
<Box flex={'0 0 auto'} mr={2}>

View File

@@ -0,0 +1,168 @@
import React, { useState } from 'react';
import {
Box,
Flex,
Button,
Modal,
ModalOverlay,
ModalContent,
ModalHeader,
ModalCloseButton,
ModalBody,
Input,
Textarea
} from '@chakra-ui/react';
import { useToast } from '@/hooks/useToast';
import { customAlphabet } from 'nanoid';
import { encode } from 'gpt-token-utils';
import { useConfirm } from '@/hooks/useConfirm';
import { useMutation } from '@tanstack/react-query';
import { postModelDataSplitData, getWebContent } from '@/api/model';
import { formatPrice } from '@/utils/user';
const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 12);
const SelectUrlModal = ({
onClose,
onSuccess,
modelId
}: {
onClose: () => void;
onSuccess: () => void;
modelId: string;
}) => {
const { toast } = useToast();
const [webUrl, setWebUrl] = useState('');
const [webText, setWebText] = useState('');
const [prompt, setPrompt] = useState(''); // 提示词
const { openConfirm, ConfirmChild } = useConfirm({
content: '确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,任务讲被终止。'
});
const { mutate: onclickImport, isLoading: isImporting } = useMutation({
mutationFn: async () => {
if (!webText) return;
await postModelDataSplitData({
modelId,
text: webText,
prompt: `下面是${prompt || '一段长文本'}`
});
toast({
title: '导入数据成功,需要一段拆解和训练',
status: 'success'
});
onClose();
onSuccess();
},
onError(error) {
console.log(error);
toast({
title: '导入数据失败',
status: 'error'
});
}
});
const { mutate: onclickFetchingUrl, isLoading: isFetching } = useMutation({
mutationFn: async () => {
if (!webUrl) return;
const res = await getWebContent(webUrl);
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(res, 'text/html');
const data = htmlDoc?.body?.innerText || '';
if (!data) {
throw new Error('获取不到数据');
}
setWebText(data.replace(/\s+/g, ' '));
},
onError(error) {
console.log(error);
toast({
status: 'error',
title: '获取网站内容失败'
});
}
});
return (
<Modal isOpen={true} onClose={onClose} isCentered>
<ModalOverlay />
<ModalContent maxW={'min(900px, 90vw)'} m={0} position={'relative'} h={'90vh'}>
<ModalHeader></ModalHeader>
<ModalCloseButton />
<ModalBody
display={'flex'}
flexDirection={'column'}
p={4}
h={'100%'}
alignItems={'center'}
justifyContent={'center'}
fontSize={'sm'}
>
<Box mt={2} maxW={['100%', '70%']}>
QA tokens
</Box>
<Box mt={2}>
{encode(webText).length} tokens {formatPrice(encode(webText).length * 4)}
</Box>
<Flex w={'100%'} alignItems={'center'} my={4}>
<Box flex={'0 0 70px'}></Box>
<Input
mx={2}
placeholder="需要获取内容的地址。例如https://fastgpt.ahapocket.cn"
value={webUrl}
onChange={(e) => setWebUrl(e.target.value)}
size={'sm'}
/>
<Button isLoading={isFetching} onClick={() => onclickFetchingUrl()}>
</Button>
</Flex>
<Flex w={'100%'} alignItems={'center'} my={4}>
<Box flex={'0 0 70px'} mr={2}>
</Box>
<Input
placeholder="内容提示词。例如: Laf的介绍/关于gpt4的论文/一段长文本"
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
size={'sm'}
/>
</Flex>
<Textarea
flex={'1 0 0'}
h={0}
w={'100%'}
placeholder="网站的内容"
maxLength={-1}
resize={'none'}
fontSize={'xs'}
whiteSpace={'pre-wrap'}
value={webText}
onChange={(e) => setWebText(e.target.value)}
/>
</ModalBody>
<Flex px={6} pt={2} pb={4}>
<Box flex={1}></Box>
<Button variant={'outline'} mr={3} onClick={onClose}>
</Button>
<Button
isLoading={isImporting}
isDisabled={webText === ''}
onClick={openConfirm(onclickImport)}
>
</Button>
</Flex>
</ModalContent>
<ConfirmChild />
</Modal>
);
};
export default SelectUrlModal;

View File

@@ -134,15 +134,7 @@ export const vectorToBuffer = (vector: number[]) => {
return buffer;
};
export const BufferToVector = (bufferStr: string) => {
let buffer = Buffer.from(`bufferStr`, 'binary'); // 将字符串转换成 Buffer 对象
const npVector = new Float32Array(
buffer,
buffer.byteOffset,
buffer.byteLength / Float32Array.BYTES_PER_ELEMENT
);
return Array.from(npVector);
};
export function formatVector(vector: number[]) {
let formattedVector = vector.slice(0, 1536); // 截取前1536个元素
if (vector.length > 1536) {