perf: 文本拆分

This commit is contained in:
archer
2023-03-27 19:19:47 +08:00
parent af385b1b42
commit 7fb6f62cf6

View File

@@ -8,11 +8,10 @@ import { generateAbstract } from '@/service/events/generateAbstract';
/* 拆分数据成QA */ /* 拆分数据成QA */
export default async function handler(req: NextApiRequest, res: NextApiResponse) { export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try { try {
let { text, dataId } = req.body as { text: string; dataId: string }; const { text, dataId } = req.body as { text: string; dataId: string };
if (!text || !dataId) { if (!text || !dataId) {
throw new Error('参数错误'); throw new Error('参数错误');
} }
text = text.replace(/\n+/g, '\n');
await connectToDatabase(); await connectToDatabase();
const { authorization } = req.headers; const { authorization } = req.headers;
@@ -24,19 +23,27 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
if (!DataRecord) { if (!DataRecord) {
throw new Error('找不到数据集'); throw new Error('找不到数据集');
} }
const replaceText = text.replace(/[\r\n\\n]+/g, ' ');
// 文本拆分成 chunk
let chunks = replaceText.match(/[^!?.。]+[!?.。]/g) || [];
const dataItems: any[] = []; const dataItems: any[] = [];
let splitText = '';
// 每 1000 字符一组 chunks.forEach((chunk) => {
for (let i = 0; i <= text.length / 1000; i++) { splitText += chunk;
dataItems.push({ if (splitText.length >= 980) {
userId, dataItems.push({
dataId, userId,
type: DataRecord.type, dataId,
text: text.slice(i * 1000, (i + 1) * 1000), type: DataRecord.type,
status: 1 text: splitText,
}); status: 1
} });
splitText = '';
}
});
// 批量插入数据 // 批量插入数据
await DataItem.insertMany(dataItems); await DataItem.insertMany(dataItems);
@@ -49,7 +56,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
} }
jsonRes(res, { jsonRes(res, {
data: '' data: { chunks, replaceText }
}); });
} catch (err) { } catch (err) {
jsonRes(res, { jsonRes(res, {