perf: 文本拆分

This commit is contained in:
archer
2023-03-27 19:19:47 +08:00
parent af385b1b42
commit 7fb6f62cf6

View File

@@ -8,11 +8,10 @@ import { generateAbstract } from '@/service/events/generateAbstract';
/* 拆分数据成QA */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
let { text, dataId } = req.body as { text: string; dataId: string };
const { text, dataId } = req.body as { text: string; dataId: string };
if (!text || !dataId) {
throw new Error('参数错误');
}
text = text.replace(/\n+/g, '\n');
await connectToDatabase();
const { authorization } = req.headers;
@@ -24,19 +23,27 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
if (!DataRecord) {
throw new Error('找不到数据集');
}
const replaceText = text.replace(/[\r\n\\n]+/g, ' ');
// 文本拆分成 chunk
let chunks = replaceText.match(/[^!?.。]+[!?.。]/g) || [];
const dataItems: any[] = [];
let splitText = '';
// 每 1000 字符一组
for (let i = 0; i <= text.length / 1000; i++) {
chunks.forEach((chunk) => {
splitText += chunk;
if (splitText.length >= 980) {
dataItems.push({
userId,
dataId,
type: DataRecord.type,
text: text.slice(i * 1000, (i + 1) * 1000),
text: splitText,
status: 1
});
splitText = '';
}
});
// 批量插入数据
await DataItem.insertMany(dataItems);
@@ -49,7 +56,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
}
jsonRes(res, {
data: ''
data: { chunks, replaceText }
});
} catch (err) {
jsonRes(res, {