diff --git a/client/package.json b/client/package.json index 7d84398c2..5a9076498 100644 --- a/client/package.json +++ b/client/package.json @@ -31,6 +31,7 @@ "i18next": "^22.5.1", "immer": "^9.0.19", "js-cookie": "^3.0.5", + "jschardet": "^3.0.0", "jsdom": "^22.1.0", "jsonwebtoken": "^9.0.0", "lodash": "^4.17.21", diff --git a/client/pnpm-lock.yaml b/client/pnpm-lock.yaml index 3c6d39e45..3030193f9 100644 --- a/client/pnpm-lock.yaml +++ b/client/pnpm-lock.yaml @@ -71,6 +71,9 @@ dependencies: js-cookie: specifier: ^3.0.5 version: registry.npmmirror.com/js-cookie@3.0.5 + jschardet: + specifier: ^3.0.0 + version: registry.npmmirror.com/jschardet@3.0.0 jsdom: specifier: ^22.1.0 version: registry.npmmirror.com/jsdom@22.1.0 @@ -8918,6 +8921,13 @@ packages: argparse: registry.npmmirror.com/argparse@2.0.1 dev: true + registry.npmmirror.com/jschardet@3.0.0: + resolution: {integrity: sha512-lJH6tJ77V8Nzd5QWRkFYCLc13a3vADkh3r/Fi8HupZGWk2OVVDfnZP8V/VgQgZ+lzW0kG2UGb5hFgt3V3ndotQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/jschardet/-/jschardet-3.0.0.tgz} + name: jschardet + version: 3.0.0 + engines: {node: '>=0.1.90'} + dev: false + registry.npmmirror.com/jsdom@22.1.0: resolution: {integrity: sha512-/9AVW7xNbsBv6GfWho4TTNjEo9fe6Zhf9O7s0Fhhr3u+awPwAJMKwAMXnkk5vBxflqLW9hTHX/0cs+P3gW+cQw==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/jsdom/-/jsdom-22.1.0.tgz} name: jsdom diff --git a/client/public/locales/en/common.json b/client/public/locales/en/common.json index 379c6df86..3889b959c 100644 --- a/client/public/locales/en/common.json +++ b/client/public/locales/en/common.json @@ -88,13 +88,16 @@ }, "file": { "Click to download CSV template": "Click to download CSV template", + "Click to view file": "Click to view file", "Create File": "Create File", "Create file": "Create file", "Drag and drop": "Drag and drop files here", "Fetch Url": "Fetch Url", "If the imported file is garbled, please convert CSV to UTF-8 encoding format": "If the imported file is garbled, please convert CSV to UTF-8 encoding format", + "Parse": "{{name}} Parsing...", "Release the mouse to upload the file": "Release the mouse to upload the file", "Select a maximum of 10 files": "Select a maximum of 10 files", + "Uploading": "Uploading: {{name}}, Progress: {{percent}}%", "max 10": "Max 10 files", "select a document": "select a document", "support": "support {{fileExtension}} file", diff --git a/client/public/locales/zh/common.json b/client/public/locales/zh/common.json index 203f801bf..e1ba315da 100644 --- a/client/public/locales/zh/common.json +++ b/client/public/locales/zh/common.json @@ -88,13 +88,16 @@ }, "file": { "Click to download CSV template": "点击下载 CSV 模板", + "Click to view file": "点击查看原始文件", "Create File": "创建新文件", "Create file": "创建文件", "Drag and drop": "拖拽文件至此", "Fetch Url": "链接读取", "If the imported file is garbled, please convert CSV to UTF-8 encoding format": "如果导入文件乱码,请将 CSV 转成 UTF-8 编码格式", + "Parse": "{{name}} 解析中...", "Release the mouse to upload the file": "松开鼠标上传文件", "Select a maximum of 10 files": "最多选择10个文件", + "Uploading": "正在上传 {{name}},进度: {{percent}}%", "max 10": "最多选择 10 个文件", "select a document": "选择文件", "support": "支持 {{fileExtension}} 文件", diff --git a/client/src/api/plugins/kb.ts b/client/src/api/plugins/kb.ts index a705c7c58..ac24a5ef7 100644 --- a/client/src/api/plugins/kb.ts +++ b/client/src/api/plugins/kb.ts @@ -1,5 +1,5 @@ import { GET, POST, PUT, DELETE } from '../request'; -import type { KbItemType, KbListItemType } from '@/types/plugin'; +import type { DatasetItemType, KbItemType, KbListItemType } from '@/types/plugin'; import { RequestPaging } from '@/types/index'; import { TrainingModeEnum } from '@/constants/plugin'; import { @@ -13,6 +13,7 @@ import { import { Response as KbDataItemType } from '@/pages/api/plugins/kb/data/getDataById'; import { Props as UpdateDataProps } from '@/pages/api/openapi/kb/updateData'; import type { KbUpdateParams, CreateKbParams } from '../request/kb'; +import { QuoteItemType } from '@/types/chat'; /* knowledge base */ export const getKbList = () => GET(`/plugins/kb/list`); @@ -58,7 +59,7 @@ export const getTrainingData = (data: { kbId: string; init: boolean }) => export const getTrainingQueueLen = () => GET(`/plugins/kb/data/getQueueLen`); export const getKbDataItemById = (dataId: string) => - GET(`/plugins/kb/data/getDataById`, { dataId }); + GET(`/plugins/kb/data/getDataById`, { dataId }); /** * 直接push数据 @@ -69,10 +70,8 @@ export const postKbDataFromList = (data: PushDataProps) => /** * insert one data to dataset */ -export const insertData2Kb = (data: { - kbId: string; - data: { a: string; q: string; source?: string }; -}) => POST(`/plugins/kb/data/insertData`, data); +export const insertData2Kb = (data: { kbId: string; data: DatasetItemType }) => + POST(`/plugins/kb/data/insertData`, data); /** * 更新一条数据 diff --git a/client/src/api/request.ts b/client/src/api/request.ts index fd6bbb6a8..ed754c9e4 100644 --- a/client/src/api/request.ts +++ b/client/src/api/request.ts @@ -1,4 +1,9 @@ -import axios, { Method, InternalAxiosRequestConfig, AxiosResponse } from 'axios'; +import axios, { + Method, + InternalAxiosRequestConfig, + AxiosResponse, + AxiosProgressEvent +} from 'axios'; import { clearToken, getToken } from '@/utils/user'; import { TOKEN_ERROR_CODE } from '@/service/errorCode'; @@ -6,6 +11,7 @@ interface ConfigType { headers?: { [key: string]: string }; hold?: boolean; timeout?: number; + onUploadProgress?: (progressEvent: AxiosProgressEvent) => void; } interface ResponseDataType { code: number; diff --git a/client/src/api/system.ts b/client/src/api/system.ts index cf384ace5..3982628ea 100644 --- a/client/src/api/system.ts +++ b/client/src/api/system.ts @@ -1,6 +1,20 @@ import { GET, POST, PUT } from './request'; import type { InitDateResponse } from '@/pages/api/system/getInitData'; +import { AxiosProgressEvent } from 'axios'; export const getInitData = () => GET('/system/getInitData'); export const uploadImg = (base64Img: string) => POST('/system/uploadImage', { base64Img }); + +export const postUploadFiles = ( + data: FormData, + onUploadProgress: (progressEvent: AxiosProgressEvent) => void +) => + POST('/plugins/file/upload', data, { + onUploadProgress, + headers: { + 'Content-Type': 'multipart/form-data; charset=utf-8' + } + }); + +export const getFileViewUrl = (fileId: string) => GET('/plugins/file/readUrl', { fileId }); diff --git a/client/src/components/ChatBox/ContextModal.tsx b/client/src/components/ChatBox/ContextModal.tsx index fa2e4484a..6802bdae4 100644 --- a/client/src/components/ChatBox/ContextModal.tsx +++ b/client/src/components/ChatBox/ContextModal.tsx @@ -21,7 +21,13 @@ const ContextModal = ({ minW={['90vw', '600px']} isCentered > - + {context.map((item, i) => ( (); + const [editDataItem, setEditDataItem] = useState(); /** * click edit, get new kbDataItem @@ -44,19 +36,14 @@ const QuoteModal = ({ if (!item.id) return; try { setIsLoading(true); - const data = (await getKbDataItemById(item.id)) as QuoteItemType; + const data = await getKbDataItemById(item.id); if (!data) { onUpdateQuote(item.id, '已删除'); throw new Error('该数据已被删除'); } - setEditDataItem({ - kbId: data.kb_id, - dataId: data.id, - q: data.q, - a: data.a - }); + setEditDataItem(data); } catch (err) { toast({ status: 'warning', @@ -85,7 +72,13 @@ const QuoteModal = ({ } > - + {rawSearch.map((item, i) => ( - {item.source && ({item.source})} + {item.source && } {item.q} {item.a} {item.id && ( @@ -136,10 +129,13 @@ const QuoteModal = ({ {editDataItem && ( setEditDataItem(undefined)} - onSuccess={() => onUpdateQuote(editDataItem.dataId, '手动修改')} - onDelete={() => onUpdateQuote(editDataItem.dataId, '已删除')} - kbId={editDataItem.kbId} - defaultValues={editDataItem} + onSuccess={() => onUpdateQuote(editDataItem.id, '手动修改')} + onDelete={() => onUpdateQuote(editDataItem.id, '已删除')} + kbId={editDataItem.kb_id} + defaultValues={{ + ...editDataItem, + dataId: editDataItem.id + }} /> )} diff --git a/client/src/components/Loading/index.tsx b/client/src/components/Loading/index.tsx index e9441e5fb..919ca7592 100644 --- a/client/src/components/Loading/index.tsx +++ b/client/src/components/Loading/index.tsx @@ -1,7 +1,7 @@ import React from 'react'; -import { Spinner, Flex } from '@chakra-ui/react'; +import { Spinner, Flex, Box } from '@chakra-ui/react'; -const Loading = ({ fixed = true }: { fixed?: boolean }) => { +const Loading = ({ fixed = true, text = '' }: { fixed?: boolean; text?: string }) => { return ( { bottom={0} alignItems={'center'} justifyContent={'center'} + flexDirection={'column'} > + {text && ( + + {text} + + )} ); }; diff --git a/client/src/constants/common.ts b/client/src/constants/common.ts index e11da54d5..0604a33f1 100644 --- a/client/src/constants/common.ts +++ b/client/src/constants/common.ts @@ -15,7 +15,8 @@ export const fileImgs = [ export enum TrackEventName { windowError = 'windowError', - pageError = 'pageError' + pageError = 'pageError', + wordReadError = 'wordReadError' } export const htmlTemplate = ` diff --git a/client/src/hooks/useLoading.tsx b/client/src/hooks/useLoading.tsx index 64bbde086..5d87e6c0b 100644 --- a/client/src/hooks/useLoading.tsx +++ b/client/src/hooks/useLoading.tsx @@ -5,8 +5,16 @@ export const useLoading = (props?: { defaultLoading: boolean }) => { const [isLoading, setIsLoading] = useState(props?.defaultLoading || false); const Loading = useCallback( - ({ loading, fixed = true }: { loading?: boolean; fixed?: boolean }): JSX.Element | null => { - return isLoading || loading ? : null; + ({ + loading, + fixed = true, + text = '' + }: { + loading?: boolean; + fixed?: boolean; + text?: string; + }): JSX.Element | null => { + return isLoading || loading ? : null; }, [isLoading] ); diff --git a/client/src/pages/api/admin/initv43.ts b/client/src/pages/api/admin/initv43.ts new file mode 100644 index 000000000..9ff322e30 --- /dev/null +++ b/client/src/pages/api/admin/initv43.ts @@ -0,0 +1,35 @@ +// Next.js API route support: https://nextjs.org/docs/api-routes/introduction +import type { NextApiRequest, NextApiResponse } from 'next'; +import { jsonRes } from '@/service/response'; +import { authUser } from '@/service/utils/auth'; +import { PgClient } from '@/service/pg'; +import { PgTrainingTableName } from '@/constants/plugin'; + +export default async function handler(req: NextApiRequest, res: NextApiResponse) { + try { + await authUser({ req, authRoot: true }); + + const { rowCount } = await PgClient.query(`SELECT 1 + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = '${PgTrainingTableName}' + AND column_name = 'file_id'`); + + if (rowCount > 0) { + return jsonRes(res, { + data: '已经存在file_id字段' + }); + } + + jsonRes(res, { + data: await PgClient.query( + `ALTER TABLE ${PgTrainingTableName} ADD COLUMN file_id VARCHAR(100)` + ) + }); + } catch (error) { + jsonRes(res, { + code: 500, + error + }); + } +} diff --git a/client/src/pages/api/openapi/kb/pushData.ts b/client/src/pages/api/openapi/kb/pushData.ts index d48242f3c..8d3fe412d 100644 --- a/client/src/pages/api/openapi/kb/pushData.ts +++ b/client/src/pages/api/openapi/kb/pushData.ts @@ -9,12 +9,11 @@ import { startQueue } from '@/service/utils/tools'; import { PgClient } from '@/service/pg'; import { modelToolMap } from '@/utils/plugin'; import { getVectorModel } from '@/service/utils/data'; - -export type DateItemType = { a: string; q: string; source?: string }; +import { DatasetItemType } from '@/types/plugin'; export type Props = { kbId: string; - data: DateItemType[]; + data: DatasetItemType[]; mode: `${TrainingModeEnum}`; prompt?: string; }; @@ -95,7 +94,7 @@ export async function pushDataToKb({ // 过滤重复的 qa 内容 const set = new Set(); - const filterData: DateItemType[] = []; + const filterData: DatasetItemType[] = []; data.forEach((item) => { if (!item.q) return; @@ -120,13 +119,10 @@ export async function pushDataToKb({ // 数据库去重 const insertData = ( await Promise.allSettled( - filterData.map(async ({ q, a = '', source }) => { + filterData.map(async (data) => { + let { q, a } = data; if (mode !== TrainingModeEnum.index) { - return Promise.resolve({ - q, - a, - source - }); + return Promise.resolve(data); } if (!q) { @@ -152,23 +148,17 @@ export async function pushDataToKb({ console.log(error); error; } - return Promise.resolve({ - q, - a, - source - }); + return Promise.resolve(data); }) ) ) .filter((item) => item.status === 'fulfilled') - .map((item: any) => item.value); + .map((item: any) => item.value); // 插入记录 const insertRes = await TrainingData.insertMany( insertData.map((item) => ({ - q: item.q, - a: item.a, - source: item.source, + ...item, userId, kbId, mode, diff --git a/client/src/pages/api/openapi/kb/searchTest.ts b/client/src/pages/api/openapi/kb/searchTest.ts index 9283735b2..a19330a85 100644 --- a/client/src/pages/api/openapi/kb/searchTest.ts +++ b/client/src/pages/api/openapi/kb/searchTest.ts @@ -41,7 +41,7 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex const response: any = await PgClient.query( `BEGIN; SET LOCAL ivfflat.probes = ${global.systemEnv.pgIvfflatProbe || 10}; - select id,q,a,source,(vector <#> '[${ + select id, q, a, source, file_id, (vector <#> '[${ vectors[0] }]') * -1 AS score from ${PgTrainingTableName} where kb_id='${kbId}' AND user_id='${userId}' order by vector <#> '[${ vectors[0] @@ -49,7 +49,9 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex COMMIT;` ); - jsonRes(res, { data: response?.[2]?.rows || [] }); + jsonRes(res, { + data: response?.[2]?.rows || [] + }); } catch (err) { console.log(err); jsonRes(res, { diff --git a/client/src/pages/api/plugins/file/read.ts b/client/src/pages/api/plugins/file/read.ts index b9572f368..4da034948 100644 --- a/client/src/pages/api/plugins/file/read.ts +++ b/client/src/pages/api/plugins/file/read.ts @@ -3,6 +3,7 @@ import { jsonRes } from '@/service/response'; import { connectToDatabase } from '@/service/mongo'; import { GridFSStorage } from '@/service/lib/gridfs'; import { authFileToken } from './readUrl'; +import jschardet from 'jschardet'; export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { @@ -12,6 +13,10 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< const { fileId, userId } = await authFileToken(token); + if (!fileId) { + throw new Error('fileId is empty'); + } + const gridFs = new GridFSStorage('dataset', userId); const [file, buffer] = await Promise.all([ @@ -19,9 +24,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< gridFs.download(fileId) ]); - res.setHeader('encoding', file.encoding); + const encoding = jschardet.detect(buffer)?.encoding; + + res.setHeader('encoding', encoding); res.setHeader('Content-Type', file.contentType); res.setHeader('Cache-Control', 'public, max-age=3600'); + res.setHeader('Content-Disposition', `inline; filename="${encodeURIComponent(file.filename)}"`); res.end(buffer); } catch (error) { diff --git a/client/src/pages/api/plugins/file/upload.ts b/client/src/pages/api/plugins/file/upload.ts index 0c4401caa..efeccf310 100644 --- a/client/src/pages/api/plugins/file/upload.ts +++ b/client/src/pages/api/plugins/file/upload.ts @@ -28,9 +28,10 @@ class UploadModel { limits: { fieldSize: maxSize }, + preservePath: true, storage: multer.diskStorage({ filename: (_req, file, cb) => { - const { ext } = path.parse(file.originalname); + const { ext } = path.parse(decodeURIComponent(file.originalname)); cb(null, nanoid() + ext); } }) @@ -44,8 +45,13 @@ class UploadModel { return reject(error); } - // @ts-ignore - resolve({ files: req.files }); + resolve({ + // @ts-ignore + files: req.files?.map((file) => ({ + ...file, + originalname: decodeURIComponent(file.originalname) + })) + }); }); }); } @@ -56,9 +62,9 @@ const upload = new UploadModel(); export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { await connectToDatabase(); - const { userId } = await authUser({ req }); + const { userId } = await authUser({ req, authToken: true }); - const { files } = await upload.doUpload(req, res); + const { files = [] } = await upload.doUpload(req, res); const gridFs = new GridFSStorage('dataset', userId); diff --git a/client/src/pages/api/plugins/kb/data/getDataById.ts b/client/src/pages/api/plugins/kb/data/getDataById.ts index 5d6ea143d..834e886b9 100644 --- a/client/src/pages/api/plugins/kb/data/getDataById.ts +++ b/client/src/pages/api/plugins/kb/data/getDataById.ts @@ -30,7 +30,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< const where: any = [['user_id', userId], 'AND', ['id', dataId]]; const searchRes = await PgClient.select(PgTrainingTableName, { - fields: ['kb_id', 'id', 'q', 'a', 'source'], + fields: ['kb_id', 'id', 'q', 'a', 'source', 'file_id'], where, limit: 1 }); diff --git a/client/src/pages/api/plugins/kb/data/getDataList.ts b/client/src/pages/api/plugins/kb/data/getDataList.ts index daf9b529d..cffbc45eb 100644 --- a/client/src/pages/api/plugins/kb/data/getDataList.ts +++ b/client/src/pages/api/plugins/kb/data/getDataList.ts @@ -43,7 +43,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< const [searchRes, total] = await Promise.all([ PgClient.select(PgTrainingTableName, { - fields: ['id', 'q', 'a', 'source'], + fields: ['id', 'q', 'a', 'source', 'file_id'], where, order: [{ field: 'id', mode: 'DESC' }], limit: pageSize, diff --git a/client/src/pages/api/plugins/kb/data/insertData.ts b/client/src/pages/api/plugins/kb/data/insertData.ts index a54e1c19c..fc47d9040 100644 --- a/client/src/pages/api/plugins/kb/data/insertData.ts +++ b/client/src/pages/api/plugins/kb/data/insertData.ts @@ -8,10 +8,11 @@ import { insertKbItem, PgClient } from '@/service/pg'; import { modelToolMap } from '@/utils/plugin'; import { getVectorModel } from '@/service/utils/data'; import { getVector } from '@/pages/api/openapi/plugin/vector'; +import { DatasetItemType } from '@/types/plugin'; export type Props = { kbId: string; - data: { a: string; q: string; source?: string }; + data: DatasetItemType; }; export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse) { diff --git a/client/src/pages/kb/detail/components/DataCard.tsx b/client/src/pages/kb/detail/components/DataCard.tsx index a9fec1df5..d3b6033da 100644 --- a/client/src/pages/kb/detail/components/DataCard.tsx +++ b/client/src/pages/kb/detail/components/DataCard.tsx @@ -198,8 +198,7 @@ const DataCard = ({ kbId }: { kbId: string }) => { onClick={() => setEditInputData({ dataId: item.id, - q: item.q, - a: item.a + ...item }) } > diff --git a/client/src/pages/kb/detail/components/Import/Chunk.tsx b/client/src/pages/kb/detail/components/Import/Chunk.tsx index 871541ac6..d28516795 100644 --- a/client/src/pages/kb/detail/components/Import/Chunk.tsx +++ b/client/src/pages/kb/detail/components/Import/Chunk.tsx @@ -109,10 +109,9 @@ const ChunkImport = ({ kbId }: { kbId: string }) => { return { ...file, tokens: splitRes.tokens, - chunks: splitRes.chunks.map((chunk) => ({ - q: chunk, - a: '', - source: file.filename + chunks: file.chunks.map((chunk, i) => ({ + ...chunk, + q: splitRes.chunks[i] })) }; }) diff --git a/client/src/pages/kb/detail/components/Import/Csv.tsx b/client/src/pages/kb/detail/components/Import/Csv.tsx index 68c99214b..15d701631 100644 --- a/client/src/pages/kb/detail/components/Import/Csv.tsx +++ b/client/src/pages/kb/detail/components/Import/Csv.tsx @@ -1,11 +1,10 @@ -import React, { useState, useCallback, useMemo } from 'react'; +import React, { useState, useMemo } from 'react'; import { Box, Flex, Button, useTheme, Image } from '@chakra-ui/react'; import { useToast } from '@/hooks/useToast'; import { useConfirm } from '@/hooks/useConfirm'; import { useMutation } from '@tanstack/react-query'; import { postKbDataFromList } from '@/api/plugins/kb'; import { getErrText } from '@/utils/tools'; -import { vectorModelList } from '@/store/static'; import MyIcon from '@/components/Icon'; import DeleteIcon, { hoverDeleteStyles } from '@/components/Icon/delete'; import { TrainingModeEnum } from '@/constants/plugin'; diff --git a/client/src/pages/kb/detail/components/Import/FileSelect.tsx b/client/src/pages/kb/detail/components/Import/FileSelect.tsx index 751d5813e..e52aedd3c 100644 --- a/client/src/pages/kb/detail/components/Import/FileSelect.tsx +++ b/client/src/pages/kb/detail/components/Import/FileSelect.tsx @@ -2,7 +2,13 @@ import MyIcon from '@/components/Icon'; import { useLoading } from '@/hooks/useLoading'; import { useSelectFile } from '@/hooks/useSelectFile'; import { useToast } from '@/hooks/useToast'; -import { fileDownload, readCsvContent, simpleText, splitText2Chunks } from '@/utils/file'; +import { + fileDownload, + readCsvContent, + simpleText, + splitText2Chunks, + uploadFiles +} from '@/utils/file'; import { Box, Flex, useDisclosure, type BoxProps } from '@chakra-ui/react'; import { fileImgs } from '@/constants/common'; import { DragEvent, useCallback, useState } from 'react'; @@ -11,7 +17,8 @@ import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file'; import { customAlphabet } from 'nanoid'; import dynamic from 'next/dynamic'; import MyTooltip from '@/components/MyTooltip'; -import { FetchResultItem } from '@/types/plugin'; +import { FetchResultItem, DatasetItemType } from '@/types/plugin'; +import { getErrText } from '@/utils/tools'; const UrlFetchModal = dynamic(() => import('./UrlFetchModal')); const CreateFileModal = dynamic(() => import('./CreateFileModal')); @@ -22,7 +29,7 @@ const csvTemplate = `question,answer,source\n"什么是 laf","laf 是一个云 export type FileItemType = { id: string; filename: string; - chunks: { q: string; a: string; source?: string }[]; + chunks: DatasetItemType[]; text: string; icon: string; tokens: number; @@ -58,7 +65,7 @@ const FileSelect = ({ }); const [isDragging, setIsDragging] = useState(false); - const [selecting, setSelecting] = useState(false); + const [selectingText, setSelectingText] = useState(); const { isOpen: isOpenUrlFetch, @@ -73,7 +80,6 @@ const FileSelect = ({ const onSelectFile = useCallback( async (files: File[]) => { - setSelecting(true); try { // Parse file by file const chunkFiles: FileItemType[] = []; @@ -88,19 +94,31 @@ const FileSelect = ({ continue; } - let text = await (async () => { - switch (extension) { - case 'txt': - case 'md': - return readTxtContent(file); - case 'pdf': - return readPdfContent(file); - case 'doc': - case 'docx': - return readDocContent(file); - } - return ''; - })(); + // parse and upload files + let [text, filesId] = await Promise.all([ + (async () => { + switch (extension) { + case 'txt': + case 'md': + return readTxtContent(file); + case 'pdf': + return readPdfContent(file); + case 'doc': + case 'docx': + return readDocContent(file); + } + return ''; + })(), + uploadFiles(files, (percent) => { + if (percent < 100) { + setSelectingText( + t('file.Uploading', { name: file.name.slice(0, 20), percent }) || '' + ); + } else { + setSelectingText(t('file.Parse', { name: file.name.slice(0, 20) }) || ''); + } + }) + ]); if (text) { text = simpleText(text); @@ -117,7 +135,8 @@ const FileSelect = ({ chunks: splitRes.chunks.map((chunk) => ({ q: chunk, a: '', - source: file.name + source: file.name, + file_id: filesId[0] })) }; chunkFiles.unshift(fileItem); @@ -139,7 +158,8 @@ const FileSelect = ({ chunks: data.map((item) => ({ q: item[0], a: item[1], - source: item[2] || file.name + source: item[2] || file.name, + file_id: filesId[0] })) }; @@ -150,13 +170,13 @@ const FileSelect = ({ } catch (error: any) { console.log(error); toast({ - title: typeof error === 'string' ? error : '解析文件失败', + title: getErrText(error, '解析文件失败'), status: 'error' }); } - setSelecting(false); + setSelectingText(undefined); }, - [chunkLen, onPushFiles, toast] + [chunkLen, onPushFiles, t, toast] ); const onUrlFetch = useCallback( (e: FetchResultItem[]) => { @@ -353,7 +373,9 @@ const FileSelect = ({ {t('file.Click to download CSV template')} )} - + {selectingText !== undefined && ( + + )} {isOpenUrlFetch && } {isOpenCreateFile && } diff --git a/client/src/pages/kb/detail/components/Import/QA.tsx b/client/src/pages/kb/detail/components/Import/QA.tsx index dfb40231d..5b3f875a9 100644 --- a/client/src/pages/kb/detail/components/Import/QA.tsx +++ b/client/src/pages/kb/detail/components/Import/QA.tsx @@ -97,10 +97,9 @@ const QAImport = ({ kbId }: { kbId: string }) => { return { ...file, tokens: splitRes.tokens, - chunks: splitRes.chunks.map((chunk) => ({ - q: chunk, - a: '', - source: file.filename + chunks: file.chunks.map((chunk, i) => ({ + ...chunk, + q: splitRes.chunks[i] })) }; }) diff --git a/client/src/pages/kb/detail/components/InputDataModal.tsx b/client/src/pages/kb/detail/components/InputDataModal.tsx index 47b3753b4..c45f87975 100644 --- a/client/src/pages/kb/detail/components/InputDataModal.tsx +++ b/client/src/pages/kb/detail/components/InputDataModal.tsx @@ -1,7 +1,8 @@ import React, { useState, useCallback } from 'react'; -import { Box, Flex, Button, Textarea, IconButton } from '@chakra-ui/react'; +import { Box, Flex, Button, Textarea, IconButton, BoxProps } from '@chakra-ui/react'; import { useForm } from 'react-hook-form'; import { insertData2Kb, putKbDataById, delOneKbDataByDataId } from '@/api/plugins/kb'; +import { getFileViewUrl } from '@/api/system'; import { useToast } from '@/hooks/useToast'; import { getErrText } from '@/utils/tools'; import MyIcon from '@/components/Icon'; @@ -10,8 +11,10 @@ import MyTooltip from '@/components/MyTooltip'; import { QuestionOutlineIcon } from '@chakra-ui/icons'; import { useUserStore } from '@/store/user'; import { useQuery } from '@tanstack/react-query'; +import { DatasetItemType } from '@/types/plugin'; +import { useTranslation } from 'react-i18next'; -export type FormData = { dataId?: string; a: string; q: string; source?: string }; +export type FormData = { dataId?: string } & DatasetItemType; const InputDataModal = ({ onClose, @@ -29,12 +32,13 @@ const InputDataModal = ({ kbId: string; defaultValues?: FormData; }) => { + const { t } = useTranslation(); const [loading, setLoading] = useState(false); const { toast } = useToast(); const { kbDetail, getKbDetail } = useUserStore(); - const { register, handleSubmit, reset } = useForm({ + const { getValues, register, handleSubmit, reset } = useForm({ defaultValues }); @@ -183,7 +187,16 @@ const InputDataModal = ({ - + + + {defaultValues.dataId && onDelete && ( )} - - + + + + @@ -233,3 +248,44 @@ const InputDataModal = ({ }; export default InputDataModal; + +interface RawFileTextProps extends BoxProps { + filename?: string; + fileId?: string; +} +export function RawFileText({ fileId, filename = '', ...props }: RawFileTextProps) { + const { t } = useTranslation(); + const { toast } = useToast(); + return ( + + { + try { + const url = await getFileViewUrl(fileId); + const asPath = `${location.origin}${url}`; + window.open(asPath, '_blank'); + } catch (error) { + toast({ + title: getErrText(error, '获取文件地址失败'), + status: 'error' + }); + } + } + } + : {})} + {...props} + > + {filename} + + + ); +} diff --git a/client/src/pages/kb/detail/components/Test.tsx b/client/src/pages/kb/detail/components/Test.tsx index 3be73d994..079865b30 100644 --- a/client/src/pages/kb/detail/components/Test.tsx +++ b/client/src/pages/kb/detail/components/Test.tsx @@ -207,8 +207,7 @@ const Test = ({ kbId }: { kbId: string }) => { setEditData({ dataId: data.id, - q: data.q, - a: data.a + ...data }); } catch (err) { toast({ diff --git a/client/src/service/events/generateQA.ts b/client/src/service/events/generateQA.ts index 714dc4d01..58dbde75d 100644 --- a/client/src/service/events/generateQA.ts +++ b/client/src/service/events/generateQA.ts @@ -38,7 +38,7 @@ export async function generateQA(): Promise { prompt: 1, q: 1, source: 1, - model: 1 + file_id: 1 }); // task preemption @@ -136,7 +136,8 @@ A2: kbId, data: responseList.map((item) => ({ ...item, - source: data.source + source: data.source, + file_id: data.file_id })), userId, mode: TrainingModeEnum.index diff --git a/client/src/service/events/generateVector.ts b/client/src/service/events/generateVector.ts index 74bd72a05..3cc486a14 100644 --- a/client/src/service/events/generateVector.ts +++ b/client/src/service/events/generateVector.ts @@ -38,6 +38,7 @@ export async function generateVector(): Promise { q: 1, a: 1, source: 1, + file_id: 1, vectorModel: 1 }); @@ -74,6 +75,7 @@ export async function generateVector(): Promise { q: dataItems[i].q, a: dataItems[i].a, source: data.source, + file_id: data.file_id, vector })) }); diff --git a/client/src/service/models/trainingData.ts b/client/src/service/models/trainingData.ts index 93a8582c2..58cc3a1c7 100644 --- a/client/src/service/models/trainingData.ts +++ b/client/src/service/models/trainingData.ts @@ -49,6 +49,10 @@ const TrainingDataSchema = new Schema({ source: { type: String, default: '' + }, + file_id: { + type: String, + default: '' } }); diff --git a/client/src/service/moduleDispatch/kb/search.ts b/client/src/service/moduleDispatch/kb/search.ts index 834d4403b..8281e09e0 100644 --- a/client/src/service/moduleDispatch/kb/search.ts +++ b/client/src/service/moduleDispatch/kb/search.ts @@ -42,7 +42,7 @@ export async function dispatchKBSearch(props: Record): Promise `'${item.kbId}'`) .join(',')}) AND vector <#> '[${vectors[0]}]' < -${similarity} order by vector <#> '[${ vectors[0] diff --git a/client/src/service/pg.ts b/client/src/service/pg.ts index 61d2ff107..e2d91b9e8 100644 --- a/client/src/service/pg.ts +++ b/client/src/service/pg.ts @@ -1,8 +1,8 @@ import { Pool } from 'pg'; import type { QueryResultRow } from 'pg'; import { PgTrainingTableName } from '@/constants/plugin'; -import { exit } from 'process'; import { addLog } from './utils/tools'; +import { DatasetItemType } from '@/types/plugin'; export const connectPg = async (): Promise => { if (global.pgClient) { @@ -45,7 +45,7 @@ type DeleteProps = { where: WhereProps; }; -type ValuesProps = { key: string; value: string | number }[]; +type ValuesProps = { key: string; value?: string | number }[]; type UpdateProps = { values: ValuesProps; where: WhereProps; @@ -168,18 +168,16 @@ export const insertKbItem = ({ }: { userId: string; kbId: string; - data: { + data: (DatasetItemType & { vector: number[]; - q: string; - a: string; - source?: string; - }[]; + })[]; }) => { return PgClient.insert(PgTrainingTableName, { values: data.map((item) => [ { key: 'user_id', value: userId }, { key: 'kb_id', value: kbId }, { key: 'source', value: item.source?.slice(0, 30)?.trim() || '' }, + { key: 'file_id', value: item.file_id }, { key: 'q', value: item.q.replace(/'/g, '"') }, { key: 'a', value: item.a.replace(/'/g, '"') }, { key: 'vector', value: `[${item.vector}]` } @@ -196,10 +194,11 @@ export async function initPg() { id BIGSERIAL PRIMARY KEY, vector VECTOR(1536) NOT NULL, user_id VARCHAR(50) NOT NULL, - kb_id VARCHAR(50) NOT NULL, + kb_id VARCHAR(50), source VARCHAR(100), + file_id VARCHAR(100), q TEXT NOT NULL, - a TEXT NOT NULL + a TEXT ); CREATE INDEX IF NOT EXISTS modelData_userId_index ON ${PgTrainingTableName} USING HASH (user_id); CREATE INDEX IF NOT EXISTS modelData_kbId_index ON ${PgTrainingTableName} USING HASH (kb_id); diff --git a/client/src/types/chat.d.ts b/client/src/types/chat.d.ts index f6a9c9e4b..86802b887 100644 --- a/client/src/types/chat.d.ts +++ b/client/src/types/chat.d.ts @@ -3,6 +3,7 @@ import type { InitChatResponse, InitShareChatResponse } from '@/api/response/cha import { TaskResponseKeyEnum } from '@/constants/chat'; import { ClassifyQuestionAgentItemType } from './app'; import { ChatItemSchema } from './mongoSchema'; +import { KbDataItemType } from './plugin'; export type ExportChatType = 'md' | 'pdf' | 'html'; @@ -41,12 +42,8 @@ export type ShareChatType = InitShareChatResponse & { history: ShareChatHistoryItemType; }; -export type QuoteItemType = { +export type QuoteItemType = KbDataItemType & { kb_id: string; - id: string; - q: string; - a: string; - source?: string; }; export type ChatHistoryItemResType = { diff --git a/client/src/types/mongoSchema.d.ts b/client/src/types/mongoSchema.d.ts index 171e7ad82..674c31fa9 100644 --- a/client/src/types/mongoSchema.d.ts +++ b/client/src/types/mongoSchema.d.ts @@ -78,6 +78,7 @@ export interface TrainingDataSchema { q: string; a: string; source: string; + file_id: string; } export interface ChatSchema { diff --git a/client/src/types/pg.d.ts b/client/src/types/pg.d.ts deleted file mode 100644 index e69de29bb..000000000 diff --git a/client/src/types/plugin.d.ts b/client/src/types/plugin.d.ts index ad755da0d..3fcf3b788 100644 --- a/client/src/types/plugin.d.ts +++ b/client/src/types/plugin.d.ts @@ -20,12 +20,15 @@ export interface KbItemType { tags: string; } -export interface KbDataItemType { - id: string; +export type DatasetItemType = { q: string; // 提问词 a: string; // 原文 - source: string; -} + source?: string; + file_id?: string; +}; +export type KbDataItemType = DatasetItemType & { + id: string; +}; export type KbTestItemType = { id: string; diff --git a/client/src/utils/file.ts b/client/src/utils/file.ts index 8bb01f8d2..502013835 100644 --- a/client/src/utils/file.ts +++ b/client/src/utils/file.ts @@ -2,7 +2,23 @@ import mammoth from 'mammoth'; import Papa from 'papaparse'; import { getOpenAiEncMap } from './plugin/openai'; import { getErrText } from './tools'; -import { uploadImg } from '@/api/system'; +import { uploadImg, postUploadFiles } from '@/api/system'; + +/** + * upload file to mongo gridfs + */ +export const uploadFiles = (files: File[], percentListen?: (percent: number) => void) => { + const form = new FormData(); + files.forEach((file) => { + form.append('file', file, encodeURIComponent(file.name)); + }); + return postUploadFiles(form, (e) => { + if (!e.total) return; + + const percent = Math.round((e.loaded / e.total) * 100); + percentListen && percentListen(percent); + }); +}; /** * 读取 txt 文件内容 @@ -37,7 +53,11 @@ export const readPdfContent = (file: File) => const readPDFPage = async (doc: any, pageNo: number) => { const page = await doc.getPage(pageNo); const tokenizedText = await page.getTextContent(); - const pageText = tokenizedText.items.map((token: any) => token.str).join(' '); + + const pageText = tokenizedText.items + .map((token: any) => token.str) + .filter((item: string) => item) + .join(''); return pageText; }; @@ -54,12 +74,12 @@ export const readPdfContent = (file: File) => const pageTexts = await Promise.all(pageTextPromises); resolve(pageTexts.join('\n')); } catch (err) { - console.log(err, 'pdfjs error'); + console.log(err, 'pdf load error'); reject('解析 PDF 失败'); } }; reader.onerror = (err) => { - console.log(err, 'reader error'); + console.log(err, 'pdf load error'); reject('解析 PDF 失败'); }; } catch (error) { @@ -83,10 +103,18 @@ export const readDocContent = (file: File) => }); resolve(res?.value); } catch (error) { + window.umami?.track('wordReadError', { + err: error?.toString() + }); + console.log('error doc read:', error); + reject('读取 doc 文件失败, 请转换成 PDF'); } }; reader.onerror = (err) => { + window.umami?.track('wordReadError', { + err: err?.toString() + }); console.log('error doc read:', err); reject('读取 doc 文件失败');