Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-23 05:12:39 +00:00)
4.6.7 fix (#752)
@@ -3,10 +3,12 @@ export const fileImgs = [
   { suffix: 'csv', src: 'file/fill/csv' },
   { suffix: '(doc|docs)', src: 'file/fill/doc' },
   { suffix: 'txt', src: 'file/fill/txt' },
-  { suffix: 'md', src: 'file/fill/markdown' }
+  { suffix: 'md', src: 'file/fill/markdown' },
+  { suffix: 'html', src: 'file/fill/html' }
+
   // { suffix: '.', src: '/imgs/files/file.svg' }
 ];

-export function getFileIcon(name = '', defaultImg = '/imgs/files/file.svg') {
+export function getFileIcon(name = '', defaultImg = 'file/fill/file') {
   return fileImgs.find((item) => new RegExp(item.suffix, 'gi').test(name))?.src || defaultImg;
 }
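For context, a minimal standalone sketch of how the icon lookup behaves after this change. The entries and the new 'file/fill/file' default are taken from the hunk above; the sample file names are invented:

```ts
// Suffixes are treated as regex fragments, so '(doc|docs)' matches both extensions.
const fileImgs = [
  { suffix: 'csv', src: 'file/fill/csv' },
  { suffix: '(doc|docs)', src: 'file/fill/doc' },
  { suffix: 'txt', src: 'file/fill/txt' },
  { suffix: 'md', src: 'file/fill/markdown' },
  { suffix: 'html', src: 'file/fill/html' }
];

function getFileIcon(name = '', defaultImg = 'file/fill/file') {
  return fileImgs.find((item) => new RegExp(item.suffix, 'gi').test(name))?.src || defaultImg;
}

getFileIcon('report.docx'); // 'file/fill/doc'
getFileIcon('archive.zip'); // falls back to 'file/fill/file' (an icon name now, not an image path)
```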
@@ -51,19 +51,18 @@ export const uploadMarkdownBase64 = async ({
   // match base64, upload and replace it
   const base64Regex = /data:image\/.*;base64,([^\)]+)/g;
   const base64Arr = rawText.match(base64Regex) || [];
-  // upload base64 and replace it
-  await Promise.all(
-    base64Arr.map(async (base64Img) => {
-      try {
-        const str = await uploadImgController(base64Img);
-
-        rawText = rawText.replace(base64Img, str);
-      } catch (error) {
-        rawText = rawText.replace(base64Img, '');
-        rawText = rawText.replace(/!\[.*\]\(\)/g, '');
-      }
-    })
-  );
+  // upload base64 and replace it
+  for await (const base64Img of base64Arr) {
+    try {
+      const str = await uploadImgController(base64Img);
+
+      rawText = rawText.replace(base64Img, str);
+    } catch (error) {
+      rawText = rawText.replace(base64Img, '');
+      rawText = rawText.replace(/!\[.*\]\(\)/g, '');
+    }
+  }
 }

 // Remove white space on both sides of the picture
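The change above swaps `Promise.all` for a sequential `for await` loop, so images are uploaded one at a time instead of all at once, and a failed upload only removes that image and its now-empty markdown tag. A standalone sketch of the same pattern, with a placeholder `upload` function standing in for `uploadImgController`:

```ts
// Sketch only: `upload` is a stand-in for the project's uploadImgController.
const base64Regex = /data:image\/.*;base64,([^\)]+)/g;

export async function replaceBase64Images(
  rawText: string,
  upload: (base64Img: string) => Promise<string>
): Promise<string> {
  const base64Arr = rawText.match(base64Regex) || [];

  for await (const base64Img of base64Arr) {
    try {
      // replace the inline base64 payload with the uploaded image URL
      rawText = rawText.replace(base64Img, await upload(base64Img));
    } catch (error) {
      // on failure, drop the image and any markdown tag left with an empty target
      rawText = rawText.replace(base64Img, '');
      rawText = rawText.replace(/!\[.*\]\(\)/g, '');
    }
  }
  return rawText;
}
```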
packages/global/core/dataset/api.d.ts (vendored, 15 changed lines)
@@ -48,10 +48,6 @@ export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams
   name: string;
   rawTextLength: number;
   hashRawText: string;
-  trainingType: `${TrainingModeEnum}`;
-  chunkSize: number;
-  chunkSplitter: string;
-  qaPrompt: string;

   fileMetadata?: Record<string, any>;
   collectionMetadata?: Record<string, any>;
@@ -74,3 +70,14 @@ export type PostWebsiteSyncParams = {
   datasetId: string;
   billId: string;
 };
+
+export type PushDatasetDataProps = {
+  collectionId: string;
+  data: PushDatasetDataChunkProps[];
+  trainingMode: `${TrainingModeEnum}`;
+  prompt?: string;
+  billId?: string;
+};
+export type PushDatasetDataResponse = {
+  insertLen: number;
+};
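A hypothetical payload shaped like the new `PushDatasetDataProps`. The chunk fields (`q`, `a`, `chunkIndex`) are inferred from how `insertData` consumes them later in this commit, the ObjectId is made up, and `TrainingModeEnum.chunk` is assumed to serialize to 'chunk':

```ts
const pushProps = {
  collectionId: '65a1f0c2e4b0a1b2c3d4e5f6', // made-up ObjectId
  trainingMode: 'chunk',
  data: [
    { q: 'What does FastGPT index?', a: 'Dataset chunks.', chunkIndex: 0 },
    { q: 'How are csv files parsed?', a: 'With papaparse.', chunkIndex: 1 }
  ]
};
```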
packages/global/core/dataset/controller.d.ts (vendored, 2 changed lines)
@@ -21,7 +21,7 @@ export type UpdateDatasetDataProps = {
 };

 export type PatchIndexesProps = {
-  type: 'create' | 'update' | 'delete';
+  type: 'create' | 'update' | 'delete' | 'unChange';
   index: Omit<DatasetDataIndexItemType, 'dataId'> & {
     dataId?: string;
   };
@@ -46,8 +46,17 @@ export async function readMongoImg({ id }: { id: string }) {
   return data?.binary;
 }

-export async function delImgByRelatedId(relateIds: string[]) {
+export async function delImgByRelatedId({
+  teamId,
+  relateIds
+}: {
+  teamId: string;
+  relateIds: string[];
+}) {
   if (relateIds.length === 0) return;

   return MongoImage.deleteMany({
+    teamId,
     'metadata.relatedId': { $in: relateIds.map((id) => String(id)) }
   });
 }
@@ -34,9 +34,8 @@ const ImageSchema = new Schema({
 try {
   ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
   ImageSchema.index({ type: 1 });
-  ImageSchema.index({ teamId: 1 });
   ImageSchema.index({ createTime: 1 });
-  ImageSchema.index({ 'metadata.relatedId': 1 });
+  ImageSchema.index({ teamId: 1, 'metadata.relatedId': 1 });
 } catch (error) {
   console.log(error);
 }
@@ -28,12 +28,16 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
       // },
       filename: async (req, file, cb) => {
         const { ext } = path.parse(decodeURIComponent(file.originalname));
-        cb(null, `${getNanoid(32)}${ext}`);
+        cb(null, `${getNanoid()}${ext}`);
       }
     })
   }).single('file');

-  async doUpload<T = Record<string, any>>(req: NextApiRequest, res: NextApiResponse) {
+  async doUpload<T = Record<string, any>>(
+    req: NextApiRequest,
+    res: NextApiResponse,
+    originBuckerName?: `${BucketNameEnum}`
+  ) {
     return new Promise<{
       file: FileType;
       metadata: Record<string, any>;
@@ -47,7 +51,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
       }

       // check bucket name
-      const bucketName = req.body?.bucketName as `${BucketNameEnum}`;
+      const bucketName = (req.body?.bucketName || originBuckerName) as `${BucketNameEnum}`;
       if (bucketName && !bucketNameMap[bucketName]) {
         return reject('BucketName is invalid');
       }
@@ -39,14 +39,15 @@ export const insertDatasetDataVector = async (
   }
 ): Promise<{ insertId: string }> => {
   const { teamId, datasetId, collectionId, vectors, retry = 3 } = props;

   try {
     const { rows } = await PgClient.insert(PgDatasetTableName, {
       values: [
         [
           { key: 'vector', value: `[${vectors[0]}]` },
           { key: 'team_id', value: String(teamId) },
-          { key: 'dataset_id', value: datasetId },
-          { key: 'collection_id', value: collectionId }
+          { key: 'dataset_id', value: String(datasetId) },
+          { key: 'collection_id', value: String(collectionId) }
         ]
       ]
     });
@@ -176,8 +177,8 @@ export const getVectorDataByTime = async (start: Date, end: Date) => {
   `);

   return rows.map((item) => ({
-    id: item.id,
-    datasetId: item.dataset_id,
-    teamId: item.team_id
+    id: String(item.id),
+    teamId: item.team_id,
+    datasetId: item.dataset_id
   }));
 };
@@ -89,6 +89,7 @@ try {
   close custom feedback;
 */
 ChatItemSchema.index({ appId: 1, chatId: 1, dataId: 1 }, { background: true });
 ChatItemSchema.index({ time: -1 }, { background: true });
 ChatItemSchema.index({ userGoodFeedback: 1 }, { background: true });
 ChatItemSchema.index({ userBadFeedback: 1 }, { background: true });
 ChatItemSchema.index({ customFeedbacks: 1 }, { background: true });
@@ -25,7 +25,7 @@ export async function createOneCollection({
   type,

   trainingType = TrainingModeEnum.chunk,
-  chunkSize = 0,
+  chunkSize = 512,
   chunkSplitter,
   qaPrompt,
@@ -134,7 +134,10 @@ export async function delCollectionAndRelatedSources({

   // delete file and imgs
   await Promise.all([
-    delImgByRelatedId(relatedImageIds),
+    delImgByRelatedId({
+      teamId,
+      relateIds: relatedImageIds
+    }),
     delFileByFileIdList({
       bucketName: BucketNameEnum.dataset,
       fileIdList
@@ -1,5 +1,15 @@
 import { delay } from '@fastgpt/global/common/system/utils';
 import { MongoDatasetTraining } from './schema';
+import type {
+  PushDatasetDataChunkProps,
+  PushDatasetDataProps,
+  PushDatasetDataResponse
+} from '@fastgpt/global/core/dataset/api.d';
+import { getCollectionWithDataset } from '../controller';
+import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
+import { simpleText } from '@fastgpt/global/common/string/tools';
+import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
+import type { VectorModelItemType, LLMModelItemType } from '@fastgpt/global/core/ai/model.d';

 export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promise<any> => {
   try {
@@ -19,3 +29,165 @@ export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promi
     return Promise.reject(error);
   }
 };
+
+export async function pushDataListToTrainingQueue({
+  teamId,
+  tmbId,
+  collectionId,
+  data,
+  prompt,
+  billId,
+  trainingMode = TrainingModeEnum.chunk,
+
+  vectorModelList = [],
+  qaModelList = []
+}: {
+  teamId: string;
+  tmbId: string;
+  vectorModelList: VectorModelItemType[];
+  qaModelList: LLMModelItemType[];
+} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
+  const {
+    datasetId: { _id: datasetId, vectorModel, agentModel }
+  } = await getCollectionWithDataset(collectionId);
+
+  const checkModelValid = async ({ collectionId }: { collectionId: string }) => {
+    if (!collectionId) return Promise.reject(`CollectionId is empty`);
+
+    if (trainingMode === TrainingModeEnum.chunk) {
+      const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel);
+      if (!vectorModelData) {
+        return Promise.reject(`Model ${vectorModel} is inValid`);
+      }
+
+      return {
+        maxToken: vectorModelData.maxToken * 1.5,
+        model: vectorModelData.model,
+        weight: vectorModelData.weight
+      };
+    }
+
+    if (trainingMode === TrainingModeEnum.qa) {
+      const qaModelData = qaModelList?.find((item) => item.model === agentModel);
+      if (!qaModelData) {
+        return Promise.reject(`Model ${agentModel} is inValid`);
+      }
+      return {
+        maxToken: qaModelData.maxContext * 0.8,
+        model: qaModelData.model,
+        weight: 0
+      };
+    }
+    return Promise.reject(`Training mode "${trainingMode}" is inValid`);
+  };
+
+  const { model, maxToken, weight } = await checkModelValid({
+    collectionId
+  });
+
+  // format q and a, remove empty char
+  data.forEach((item) => {
+    item.q = simpleText(item.q);
+    item.a = simpleText(item.a);
+
+    item.indexes = item.indexes
+      ?.map((index) => {
+        return {
+          ...index,
+          text: simpleText(index.text)
+        };
+      })
+      .filter(Boolean);
+  });
+
+  // filter repeat or equal content
+  const set = new Set();
+  const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
+    success: [],
+    overToken: [],
+    repeat: [],
+    error: []
+  };
+
+  // filter repeat content
+  data.forEach((item) => {
+    if (!item.q) {
+      filterResult.error.push(item);
+      return;
+    }
+
+    const text = item.q + item.a;
+
+    // count q token
+    const token = countPromptTokens(item.q);
+
+    if (token > maxToken) {
+      filterResult.overToken.push(item);
+      return;
+    }
+
+    if (set.has(text)) {
+      console.log('repeat', item);
+      filterResult.repeat.push(item);
+    } else {
+      filterResult.success.push(item);
+      set.add(text);
+    }
+  });
+
+  // insert data to db
+  const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => {
+    try {
+      const results = await MongoDatasetTraining.insertMany(
+        dataList.map((item, i) => ({
+          teamId,
+          tmbId,
+          datasetId,
+          collectionId,
+          billId,
+          mode: trainingMode,
+          prompt,
+          model,
+          q: item.q,
+          a: item.a,
+          chunkIndex: item.chunkIndex ?? i,
+          weight: weight ?? 0,
+          indexes: item.indexes
+        }))
+      );
+      await delay(500);
+      return results.length;
+    } catch (error) {
+      if (retry > 0) {
+        await delay(500);
+        return insertData(dataList, retry - 1);
+      }
+      return Promise.reject(error);
+    }
+  };
+
+  let insertLen = 0;
+  const chunkSize = 50;
+  const chunkList = filterResult.success.reduce(
+    (acc, cur) => {
+      const lastChunk = acc[acc.length - 1];
+      if (lastChunk.length < chunkSize) {
+        lastChunk.push(cur);
+      } else {
+        acc.push([cur]);
+      }
+      return acc;
+    },
+    [[]] as PushDatasetDataChunkProps[][]
+  );
+  for await (const chunks of chunkList) {
+    insertLen += await insertData(chunks);
+  }
+
+  delete filterResult.success;
+
+  return {
+    insertLen,
+    ...filterResult
+  };
+}
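The queue function above batches the accepted items into groups of 50 and inserts each batch with `insertMany`, waiting 500 ms between batches. A standalone sketch of just that reduce, detached from the FastGPT types:

```ts
// Split an array into chunks of at most `size` items (mirrors the reduce above).
// Note: an empty input yields [[]] (one empty chunk), matching the original seed value.
function chunkArray<T>(items: T[], size = 50): T[][] {
  return items.reduce(
    (acc, cur) => {
      const lastChunk = acc[acc.length - 1];
      if (lastChunk.length < size) {
        lastChunk.push(cur);
      } else {
        acc.push([cur]);
      }
      return acc;
    },
    [[]] as T[][]
  );
}

chunkArray(Array.from({ length: 120 }, (_, i) => i)).map((c) => c.length); // [50, 50, 20]
```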
@@ -52,7 +52,7 @@ const BillSchema = new Schema({
 });

 try {
-  BillSchema.index({ teamId: 1, tmbId: 1, time: -1 });
+  BillSchema.index({ teamId: 1, time: -1 });
   BillSchema.index({ time: 1 }, { expireAfterSeconds: 180 * 24 * 60 * 60 });
 } catch (error) {
   console.log(error);
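For reference, the TTL index above amounts to 180 * 24 * 60 * 60 = 15,552,000 seconds, so bill documents are dropped roughly 180 days after their `time` value.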
packages/web/common/file/read/csv.ts (new file, 40 lines)
@@ -0,0 +1,40 @@
+import Papa from 'papaparse';
+import { readFileRawText } from './rawText';
+
+/**
+ * read csv to json
+ * @response {
+ *   header: string[],
+ *   data: string[][]
+ * }
+ */
+export const readCsvContent = async ({ file }: { file: File }) => {
+  try {
+    const { rawText: textArr } = await readFileRawText(file);
+    const csvArr = Papa.parse(textArr).data as string[][];
+    if (csvArr.length === 0) {
+      throw new Error('csv 解析失败');
+    }
+
+    const header = csvArr.shift() as string[];
+
+    // add title to data
+    const rawText = csvArr
+      .map((item) =>
+        item.map((value, index) => {
+          if (!header[index]) return value;
+          return `${header[index]}: ${value}`;
+        })
+      )
+      .flat()
+      .join('\n');
+
+    return {
+      rawText,
+      header,
+      data: csvArr.map((item) => item)
+    };
+  } catch (error) {
+    return Promise.reject('解析 csv 文件失败');
+  }
+};
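A rough standalone sketch of the "add title to data" step above: a two-row CSV is parsed with papaparse (added to package.json later in this commit) and each value is prefixed with its column header. The sample CSV is invented and the File/readFileRawText plumbing is skipped:

```ts
import Papa from 'papaparse';

const csvText = 'name,role\nAlice,admin\nBob,viewer';
const rows = Papa.parse<string[]>(csvText).data;
const header = rows.shift() as string[];

const rawText = rows
  .map((item) => item.map((value, index) => (header[index] ? `${header[index]}: ${value}` : value)))
  .flat()
  .join('\n');

// rawText:
// name: Alice
// role: admin
// name: Bob
// role: viewer
```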
@@ -1,4 +1,5 @@
 import { loadFile2Buffer } from '../utils';
+import { readCsvContent } from './csv';
 import { readHtmlFile } from './html';
 import { readMdFile } from './md';
 import { readPdfFile } from './pdf';
@@ -29,6 +30,8 @@ export const readFileRawContent = async ({
         file,
         uploadImgController: uploadBase64Controller
       });
+    case 'csv':
+      return readCsvContent({ file });
     case 'pdf':
       const pdf = await loadFile2Buffer({ file });
       return readPdfFile({ pdf });
@@ -74,7 +74,7 @@ const JSONEditor = ({ defaultValue, value, onChange, resize, ...props }: Props)

     <Box
       borderWidth={'1px'}
-      borderRadius={'base'}
+      borderRadius={'md'}
       borderColor={'myGray.200'}
       py={2}
       {...props}
@@ -22,13 +22,15 @@
     "react-dom": "18.2.0",
     "react-i18next": "^12.3.1",
     "turndown": "^7.1.2",
-    "lexical":"0.12.6",
+    "lexical": "0.12.6",
     "@lexical/react": "0.12.6",
+    "papaparse": "^5.4.1",
     "@lexical/utils": "0.12.6",
     "@lexical/text": "0.12.6"
   },
   "devDependencies": {
     "@types/react": "18.2.0",
+    "@types/papaparse": "^5.3.7",
     "@types/react-dom": "18.2.0",
     "@types/turndown": "^5.0.4"
   }