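perf: code cleanup — rename TrainingTypeEnum to TrainingModeEnum, drop the modelData status field, and report errors through getErrText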

This commit is contained in:
archer
2023-05-27 15:18:10 +08:00
parent 4f0bd677f2
commit a287ace126
23 changed files with 82 additions and 118 deletions

View File

@@ -6,14 +6,13 @@ CREATE EXTENSION IF NOT EXISTS vector;
-- Knowledge-base data table: each row holds a q/a text pair together with
-- its 1536-dimension pgvector column, plus the owning user_id and an optional kb_id.
CREATE TABLE IF NOT EXISTS modelData (
id BIGSERIAL PRIMARY KEY,
vector VECTOR(1536) NOT NULL,
status VARCHAR(50) NOT NULL,
user_id VARCHAR(50) NOT NULL,
kb_id VARCHAR(50),
q TEXT NOT NULL,
a TEXT NOT NULL
);
-- create index
CREATE INDEX IF NOT EXISTS modelData_status_index ON modelData USING HASH (status);
CREATE INDEX IF NOT EXISTS modelData_userId_index ON modelData USING HASH (user_id);
CREATE INDEX IF NOT EXISTS modelData_kbId_index ON modelData USING HASH (kb_id);
-- vector 索引,可以到 pg vector 去配置,根据数据量去配置
EOSQL

View File

@@ -1,7 +1,7 @@
import { GET, POST, PUT, DELETE } from '../request';
import type { KbItemType } from '@/types/plugin';
import { RequestPaging } from '@/types/index';
import { TrainingTypeEnum } from '@/constants/plugin';
import { TrainingModeEnum } from '@/constants/plugin';
import { Props as PushDataProps } from '@/pages/api/openapi/kb/pushData';
export type KbUpdateParams = { id: string; name: string; tags: string; avatar: string };
@@ -66,5 +66,5 @@ export const postSplitData = (data: {
kbId: string;
chunks: string[];
prompt: string;
mode: `${TrainingTypeEnum}`;
mode: `${TrainingModeEnum}`;
}) => POST(`/openapi/text/pushData`, data);

View File

@@ -96,16 +96,6 @@ export const formatModelStatus = {
}
};
/** Values stored in the `status` column of a `modelData` row. */
export enum ModelDataStatusEnum {
ready = 'ready',
waiting = 'waiting'
}
/** Display label for each `ModelDataStatusEnum` value, keyed by the enum's string value. */
export const ModelDataStatusMap: Record<`${ModelDataStatusEnum}`, string> = {
// "training finished"
ready: '训练完成',
// "training in progress"
waiting: '训练中'
};
/* 知识库搜索时的配置 */
// 搜索方式
export enum appVectorSearchModeEnum {

View File

@@ -1,8 +1,8 @@
export enum TrainingTypeEnum {
export enum TrainingModeEnum {
'qa' = 'qa',
'index' = 'index'
}
export const TrainingTypeMap = {
[TrainingTypeEnum.qa]: 'qa',
[TrainingTypeEnum.index]: 'index'
[TrainingModeEnum.qa]: 'qa',
[TrainingModeEnum.index]: 'index'
};

View File

@@ -3,7 +3,7 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response';
import { authUser } from '@/service/utils/auth';
import { connectToDatabase, TrainingData } from '@/service/mongo';
import { TrainingTypeEnum } from '@/constants/plugin';
import { TrainingModeEnum } from '@/constants/plugin';
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
@@ -23,8 +23,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
jsonRes(res, {
data: {
qaListLen: result.find((item) => item._id === TrainingTypeEnum.qa)?.count || 0,
vectorListLen: result.find((item) => item._id === TrainingTypeEnum.index)?.count || 0
qaListLen: result.find((item) => item._id === TrainingModeEnum.qa)?.count || 0,
vectorListLen: result.find((item) => item._id === TrainingModeEnum.index)?.count || 0
}
});
} catch (error) {

View File

@@ -10,7 +10,6 @@ import { authModel } from '@/service/utils/auth';
import { ChatModelMap } from '@/constants/model';
import { ChatRoleEnum } from '@/constants/chat';
import { openaiEmbedding } from '../plugin/openaiEmbedding';
import { ModelDataStatusEnum } from '@/constants/model';
import { modelToolMap } from '@/utils/plugin';
export type QuoteItemType = { id: string; q: string; a: string; isEdit: boolean };
@@ -102,8 +101,6 @@ export async function appKbSearch({
PgClient.select<QuoteItemType>('modelData', {
fields: ['id', 'q', 'a'],
where: [
['status', ModelDataStatusEnum.ready],
'AND',
`kb_id IN (${model.chat.relatedKbs.map((item) => `'${item}'`).join(',')})`,
'AND',
`vector <=> '[${promptVector}]' < ${similarity}`

View File

@@ -5,13 +5,13 @@ import { connectToDatabase, TrainingData } from '@/service/mongo';
import { authUser } from '@/service/utils/auth';
import { authKb } from '@/service/utils/auth';
import { withNextCors } from '@/service/utils/tools';
import { TrainingTypeEnum } from '@/constants/plugin';
import { TrainingModeEnum } from '@/constants/plugin';
import { startQueue } from '@/service/utils/tools';
export type Props = {
kbId: string;
data: { a: KbDataItemType['a']; q: KbDataItemType['q'] }[];
mode: `${TrainingTypeEnum}`;
mode: `${TrainingModeEnum}`;
prompt?: string;
};
@@ -60,6 +60,39 @@ export async function pushDataToKb({
return {};
}
// 去重
// 过滤重复的 qa 内容
// const searchRes = await Promise.allSettled(
// dataItems.map(async ({ q, a = '' }) => {
// if (!q) {
// return Promise.reject('q为空');
// }
// q = q.replace(/\\n/g, '\n');
// a = a.replace(/\\n/g, '\n');
// // Exactly the same data, not push
// try {
// const count = await PgClient.count('modelData', {
// where: [['user_id', userId], 'AND', ['kb_id', kbId], 'AND', ['q', q], 'AND', ['a', a]]
// });
// if (count > 0) {
// return Promise.reject('已经存在');
// }
// } catch (error) {
// error;
// }
// return Promise.resolve({
// q,
// a
// });
// })
// );
// const filterData = searchRes
// .filter((item) => item.status === 'fulfilled')
// .map<{ q: string; a: string }>((item: any) => item.value);
// 插入记录
await TrainingData.insertMany(
data.map((item) => ({

View File

@@ -1,8 +1,6 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response';
import { authUser } from '@/service/utils/auth';
import { ModelDataStatusEnum } from '@/constants/model';
import { generateVector } from '@/service/events/generateVector';
import { PgClient } from '@/service/pg';
import { withNextCors } from '@/service/utils/tools';
import { openaiEmbedding } from '../plugin/openaiEmbedding';

View File

@@ -22,7 +22,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
const where: any = [['user_id', userId], 'AND', ['id', dataId]];
const searchRes = await PgClient.select<PgKBDataItemType>('modelData', {
fields: ['id', 'q', 'a', 'status'],
fields: ['id', 'q', 'a'],
where,
limit: 1
});

View File

@@ -35,7 +35,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
];
const searchRes = await PgClient.select<PgKBDataItemType>('modelData', {
fields: ['id', 'q', 'a', 'status'],
fields: ['id', 'q', 'a'],
where,
order: [{ field: 'id', mode: 'DESC' }],
limit: pageSize,

View File

@@ -2,10 +2,9 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response';
import { connectToDatabase, TrainingData } from '@/service/mongo';
import { authUser } from '@/service/utils/auth';
import { generateQA } from '@/service/events/generateQA';
import { generateVector } from '@/service/events/generateVector';
import { TrainingTypeEnum } from '@/constants/plugin';
import { TrainingModeEnum } from '@/constants/plugin';
import { Types } from 'mongoose';
import { startQueue } from '@/service/utils/tools';
/* 拆分数据成QA */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
@@ -36,23 +35,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
jsonRes(res, {
data: {
qaListLen: result.find((item) => item._id === TrainingTypeEnum.qa)?.count || 0,
vectorListLen: result.find((item) => item._id === TrainingTypeEnum.index)?.count || 0
qaListLen: result.find((item) => item._id === TrainingModeEnum.qa)?.count || 0,
vectorListLen: result.find((item) => item._id === TrainingModeEnum.index)?.count || 0
}
});
if (init) {
const list = await TrainingData.find(
{
userId,
kbId
},
'_id'
).limit(10);
list.forEach((item) => {
generateQA();
generateVector();
});
startQueue();
}
} catch (err) {
jsonRes(res, {

View File

@@ -22,7 +22,6 @@ import {
import { QuestionOutlineIcon } from '@chakra-ui/icons';
import type { BoxProps } from '@chakra-ui/react';
import type { KbDataItemType } from '@/types/plugin';
import { ModelDataStatusMap } from '@/constants/model';
import { usePagination } from '@/hooks/usePagination';
import {
getKbDataList,
@@ -92,7 +91,7 @@ const DataCard = ({ kbId }: { kbId: string }) => {
} = useDisclosure();
const { data: { qaListLen = 0, vectorListLen = 0 } = {}, refetch } = useQuery(
['getModelSplitDataList'],
['getModelSplitDataList', kbId],
() => getTrainingData({ kbId, init: false }),
{
onError(err) {
@@ -240,7 +239,6 @@ const DataCard = ({ kbId }: { kbId: string }) => {
</Tooltip>
</Th>
<Th></Th>
<Th></Th>
<Th></Th>
</Tr>
</Thead>
@@ -253,7 +251,6 @@ const DataCard = ({ kbId }: { kbId: string }) => {
<Td>
<Box {...tdStyles.current}>{item.a || '-'}</Box>
</Td>
<Td>{ModelDataStatusMap[item.status]}</Td>
<Td>
<IconButton
mr={5}

View File

@@ -56,13 +56,13 @@ const Detail = ({ kbId }: { kbId: string }) => {
}
},
onError(err: any) {
loadKbList(true);
setLastKbId('');
router.replace(`/kb`);
toast({
title: getErrText(err, '获取知识库异常'),
status: 'error'
});
loadKbList(true);
setLastKbId('');
router.replace(`/kb?kbId=${myKbList[0]?._id || ''}`);
}
});

View File

@@ -13,7 +13,8 @@ import {
import { useForm } from 'react-hook-form';
import { postKbDataFromList, putKbDataById } from '@/api/plugins/kb';
import { useToast } from '@/hooks/useToast';
import { TrainingTypeEnum } from '@/constants/plugin';
import { TrainingModeEnum } from '@/constants/plugin';
import { getErrText } from '@/utils/tools';
export type FormData = { dataId?: string; a: string; q: string };
@@ -61,7 +62,7 @@ const InputDataModal = ({
q: e.q
}
],
mode: TrainingTypeEnum.index
mode: TrainingModeEnum.index
});
toast({
@@ -75,10 +76,9 @@ const InputDataModal = ({
onSuccess();
} catch (err: any) {
toast({
title: err?.message || '出现了点意外~',
title: getErrText(err, '出现了点意外~'),
status: 'error'
});
console.log(err);
}
setLoading(false);
},

View File

@@ -19,7 +19,8 @@ import { postKbDataFromList } from '@/api/plugins/kb';
import Markdown from '@/components/Markdown';
import { useMarkdown } from '@/hooks/useMarkdown';
import { fileDownload } from '@/utils/file';
import { TrainingTypeEnum } from '@/constants/plugin';
import { TrainingModeEnum } from '@/constants/plugin';
import { getErrText } from '@/utils/tools';
const csvTemplate = `question,answer\n"什么是 laf","laf 是一个云函数开发平台……"\n"什么是 sealos","Sealos 是以 kubernetes 为内核的云操作系统发行版,可以……"`;
@@ -56,9 +57,8 @@ const SelectJsonModal = ({
}))
);
} catch (error: any) {
console.log(error);
toast({
title: error?.message || 'csv 文件格式有误',
title: getErrText(error, 'csv 文件格式有误'),
status: 'error'
});
}
@@ -74,7 +74,7 @@ const SelectJsonModal = ({
const res = await postKbDataFromList({
kbId,
data: fileData,
mode: TrainingTypeEnum.index
mode: TrainingModeEnum.index
});
toast({

View File

@@ -20,19 +20,19 @@ import { useMutation } from '@tanstack/react-query';
import { postKbDataFromList } from '@/api/plugins/kb';
import Radio from '@/components/Radio';
import { splitText_token } from '@/utils/file';
import { TrainingTypeEnum } from '@/constants/plugin';
import { TrainingModeEnum } from '@/constants/plugin';
import { getErrText } from '@/utils/tools';
const fileExtension = '.txt,.doc,.docx,.pdf,.md';
const modeMap = {
qa: {
[TrainingModeEnum.qa]: {
maxLen: 2800,
slideLen: 800,
price: 4,
isPrompt: true
},
index: {
[TrainingModeEnum.index]: {
maxLen: 800,
slideLen: 300,
price: 0.4,
@@ -53,7 +53,7 @@ const SelectFileModal = ({
const { toast } = useToast();
const [prompt, setPrompt] = useState('');
const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true });
const [mode, setMode] = useState<`${TrainingTypeEnum}`>(TrainingTypeEnum.index);
const [mode, setMode] = useState<`${TrainingModeEnum}`>(TrainingModeEnum.index);
const [fileTextArr, setFileTextArr] = useState<string[]>(['']);
const [splitRes, setSplitRes] = useState<{ tokens: number; chunks: string[] }>({
tokens: 0,
@@ -122,9 +122,9 @@ const SelectFileModal = ({
onClose();
onSuccess();
},
onError() {
onError(err) {
toast({
title: '导入文件失败',
title: getErrText(err, '导入文件失败'),
status: 'error'
});
}

View File

@@ -7,7 +7,7 @@ import { modelServiceToolMap } from '../utils/chat';
import { ChatRoleEnum } from '@/constants/chat';
import { BillTypeEnum } from '@/constants/user';
import { pushDataToKb } from '@/pages/api/openapi/kb/pushData';
import { TrainingTypeEnum } from '@/constants/plugin';
import { TrainingModeEnum } from '@/constants/plugin';
import { ERROR_ENUM } from '../errorCode';
export async function generateQA(): Promise<any> {
@@ -23,7 +23,7 @@ export async function generateQA(): Promise<any> {
// 找出一个需要生成的 dataItem (4分钟锁)
const data = await TrainingData.findOneAndUpdate(
{
mode: TrainingTypeEnum.qa,
mode: TrainingModeEnum.qa,
lockTime: { $lte: new Date(Date.now() - 2 * 60 * 1000) }
},
{
@@ -115,7 +115,7 @@ A2:
kbId,
data: responseList,
userId,
mode: TrainingTypeEnum.index
mode: TrainingModeEnum.index
});
// delete data from training
@@ -126,6 +126,7 @@ A2:
global.qaQueueLen--;
generateQA();
} catch (err: any) {
global.qaQueueLen--;
// log
if (err?.response) {
console.log('openai error: 生成QA错误');
@@ -144,7 +145,6 @@ A2:
}
// unlock
global.qaQueueLen--;
await TrainingData.findByIdAndUpdate(trainingId, {
lockTime: new Date('2000/1/1')
});

View File

@@ -3,7 +3,7 @@ import { insertKbItem, PgClient } from '@/service/pg';
import { openaiEmbedding } from '@/pages/api/openapi/plugin/openaiEmbedding';
import { TrainingData } from '../models/trainingData';
import { ERROR_ENUM } from '../errorCode';
import { TrainingTypeEnum } from '@/constants/plugin';
import { TrainingModeEnum } from '@/constants/plugin';
/* 索引生成队列。每导入一次,就是一个单独的线程 */
export async function generateVector(): Promise<any> {
@@ -18,7 +18,7 @@ export async function generateVector(): Promise<any> {
try {
const data = await TrainingData.findOneAndUpdate(
{
mode: TrainingTypeEnum.index,
mode: TrainingModeEnum.index,
lockTime: { $lte: new Date(Date.now() - 2 * 60 * 1000) }
},
{
@@ -50,38 +50,6 @@ export async function generateVector(): Promise<any> {
}
];
// 过滤重复的 qa 内容
// const searchRes = await Promise.allSettled(
// dataItems.map(async ({ q, a = '' }) => {
// if (!q) {
// return Promise.reject('q为空');
// }
// q = q.replace(/\\n/g, '\n');
// a = a.replace(/\\n/g, '\n');
// // Exactly the same data, not push
// try {
// const count = await PgClient.count('modelData', {
// where: [['user_id', userId], 'AND', ['kb_id', kbId], 'AND', ['q', q], 'AND', ['a', a]]
// });
// if (count > 0) {
// return Promise.reject('已经存在');
// }
// } catch (error) {
// error;
// }
// return Promise.resolve({
// q,
// a
// });
// })
// );
// const filterData = searchRes
// .filter((item) => item.status === 'fulfilled')
// .map<{ q: string; a: string }>((item: any) => item.value);
// 生成词向量
const vectors = await openaiEmbedding({
input: dataItems.map((item) => item.q),
@@ -107,6 +75,7 @@ export async function generateVector(): Promise<any> {
global.vectorQueueLen--;
generateVector();
} catch (err: any) {
global.vectorQueueLen--;
// log
if (err?.response) {
console.log('openai error: 生成向量错误');
@@ -125,7 +94,6 @@ export async function generateVector(): Promise<any> {
}
// unlock
global.vectorQueueLen--;
await TrainingData.findByIdAndUpdate(trainingId, {
lockTime: new Date('2000/1/1')
});

View File

@@ -1,6 +1,5 @@
import { Pool } from 'pg';
import type { QueryResultRow } from 'pg';
import { ModelDataStatusEnum } from '@/constants/model';
export const connectPg = async () => {
if (global.pgClient) {
@@ -180,8 +179,7 @@ export const insertKbItem = ({
{ key: 'kb_id', value: kbId },
{ key: 'q', value: item.q },
{ key: 'a', value: item.a },
{ key: 'vector', value: `[${item.vector}]` },
{ key: 'status', value: ModelDataStatusEnum.ready }
{ key: 'vector', value: `[${item.vector}]` }
])
});
};

View File

@@ -45,7 +45,7 @@ export const jsonRes = <T = any>(
} else if (openaiError[error?.response?.statusText]) {
msg = openaiError[error.response.statusText];
}
console.log(error?.message || error);
console.log(error);
}
res.json({

View File

@@ -8,7 +8,7 @@ import {
} from '@/constants/model';
import type { DataType } from './data';
import { BillTypeEnum } from '@/constants/user';
import { TrainingTypeEnum } from '@/constants/plugin';
import { TrainingModeEnum } from '@/constants/plugin';
export interface UserModelSchema {
_id: string;
@@ -74,7 +74,7 @@ export interface TrainingDataSchema {
userId: string;
kbId: string;
lockTime: Date;
mode: `${TrainingTypeEnum}`;
mode: `${TrainingModeEnum}`;
prompt: string;
q: string;
a: string;

8
src/types/pg.d.ts vendored
View File

@@ -1,11 +1,7 @@
import { ModelDataStatusEnum } from '@/constants/model';
/**
 * Row shape used as the generic argument of `PgClient.select` when reading
 * from the `modelData` Postgres table.
 */
export interface PgKBDataItemType {
id: string;
q: string;
a: string;
status: `${ModelDataStatusEnum}`;
// model_id: string;
// user_id: string;
// kb_id: string;
user_id: string;
kb_id: string;
}

View File

@@ -8,7 +8,6 @@ export interface KbItemType extends kbSchema {
export interface KbDataItemType {
id: string;
status: 'waiting' | 'ready';
q: string; // 提问词
a: string; // 原文
kbId: string;