fix: resolve crawler being unable to get docs (#5344)

2026-06-15 01:08:07 +08:00 · 2025-07-30 15:38:30 +08:00
parent 061547a983
commit 6c37776de1
29 changed files with 358 additions and 421 deletions
@@ -1,71 +1,64 @@
 import * as fs from 'node:fs/promises';
+import * as path from 'node:path';
 import fg from 'fast-glob';
 import matter from 'gray-matter';
 import { i18n } from '@/lib/i18n';

 export const revalidate = false;

-// 将文件路径转换为URL路径
+// 黑名单路径（不带语言前缀）
+const blacklist = ['use-cases/index', 'protocol/index', 'api/index'];
+
+// 将文件路径转换为 URL 路径（包括文件名）
 function filePathToUrl(filePath: string, defaultLanguage: string): string {
-  // 移除 ./content/docs/ 前缀
-  let urlPath = filePath.replace('./content/docs/', '');
-  
-  // 确定基础路径
+  let relativePath = filePath.replace('./content/docs/', '');
+
  const basePath = defaultLanguage === 'zh-CN' ? '/docs' : '/en/docs';
-  
-  // 如果是英文文件，移除 .en 后缀
-  if (defaultLanguage !== 'zh-CN' && urlPath.endsWith('.en.mdx')) {
-    urlPath = urlPath.replace('.en.mdx', '');
-  } else if (urlPath.endsWith('.mdx')) {
-    urlPath = urlPath.replace('.mdx', '');
+
+  if (defaultLanguage !== 'zh-CN' && relativePath.endsWith('.en.mdx')) {
+    relativePath = relativePath.replace(/\.en\.mdx$/, '');
+  } else if (relativePath.endsWith('.mdx')) {
+    relativePath = relativePath.replace(/\.mdx$/, '');
  }
-  
-  // 处理 index 文件
-  if (urlPath.endsWith('/index')) {
-    urlPath = urlPath.replace('/index', '');
-  }
-  
-  // 拼接完整路径
-  return `${basePath}/${urlPath}`.replace(/\/\/+/g, '/');
+
+  return `${basePath}/${relativePath}`.replace(/\/\/+/g, '/');
+}
+
+// 判断是否为黑名单路径
+function isBlacklisted(url: string): boolean {
+  return blacklist.some(
+    (item) => url.endsWith(`/docs/${item}`) || url.endsWith(`/en/docs/${item}`)
+  );
 }

 export async function GET(request: Request) {
  const defaultLanguage = i18n.defaultLanguage;
-  
-  // 检查请求路径是否为 /en/robots
+
  const requestUrl = new URL(request.url);
  const isEnRobotsRoute = requestUrl.pathname === '/en/robots';

  let globPattern;
-
  if (isEnRobotsRoute) {
-    // 如果是 /en/robots 路由，只选择 .en.mdx 文件
    globPattern = ['./content/docs/**/*.en.mdx'];
  } else if (defaultLanguage === 'zh-CN') {
-    // 中文环境下的普通路由
    globPattern = ['./content/docs/**/*.mdx'];
  } else {
-    // 英文环境下的普通路由
    globPattern = ['./content/docs/**/*.en.mdx'];
  }

-  const files = await fg(globPattern);
+  const files = await fg(globPattern, { caseSensitiveMatch: true });

-  const urls = await Promise.all(
-    files.map(async (file: string) => {
-      const urlPath = filePathToUrl(file, defaultLanguage);
-      return `${urlPath}`;
-    })
-  );
+  // 转换文件路径为 URL，并过滤黑名单
+  const urls = files
+    .map((file) => filePathToUrl(file, defaultLanguage))
+    .filter((url) => !isBlacklisted(url));

-  // 按URL排序
  urls.sort((a, b) => a.localeCompare(b));

-  // 生成HTML链接列表
  const html = `
    <html>
      <head>
-        <title>FastGPT Documentation Links</title>
+        <title>FastGPT 文档目录</title>
        <style>
          body { font-family: Arial, sans-serif; margin: 20px; }
          h1 { color: #333; }
@@ -78,7 +71,7 @@ export async function GET(request: Request) {
      <body>
        <h1>Documentation Links</h1>
        <ul>
-          ${urls.map(url => `<li><a href="${url}">${url}</a></li>`).join('')}
+          ${urls.map((url) => `<li><a href="${url}">${url}</a></li>`).join('')}
        </ul>
      </body>
    </html>
@@ -86,7 +79,7 @@ export async function GET(request: Request) {

  return new Response(html, {
    headers: {
-      'Content-Type': 'text/html',
-    },
+      'Content-Type': 'text/html'
+    }
  });
-}
+}
@@ -0,0 +1,86 @@
+import type { NextRequest } from 'next/server';
+import { NextResponse } from 'next/server';
+import fs from 'fs/promises';
+import path from 'path';
+
+const docsRoot = path.resolve(process.cwd(), 'content/docs');
+
+function isInvalidPage(str: string): boolean {
+  if (!str || typeof str !== 'string') return true;
+  if (/\[.*?\]\(.*?\)/.test(str) || /^https?:\/\//.test(str) || /[()]/.test(str)) return true;
+  if (/^\s*---[\s\S]*---\s*$/.test(str)) return true;
+  return false;
+}
+
+function getPageName(str: string): string {
+  return str.startsWith('...') ? str.slice(3) : str;
+}
+
+async function findFirstValidPage(dirRelPath: string): Promise<string | null> {
+  const absDir = path.join(docsRoot, dirRelPath);
+  const metaPath = path.join(absDir, 'meta.json');
+
+  try {
+    const metaRaw = await fs.readFile(metaPath, 'utf-8');
+    const meta = JSON.parse(metaRaw);
+    if (!Array.isArray(meta.pages)) return null;
+
+    for (const page of meta.pages) {
+      if (isInvalidPage(page)) continue;
+
+      const pageName = getPageName(page);
+      const pagePath = path.join(dirRelPath, pageName);
+
+      const candidateDir = path.join(docsRoot, pagePath);
+      const candidateFile = candidateDir + '.mdx';
+
+      try {
+        await fs.access(candidateFile);
+        return pagePath;
+      } catch {
+        try {
+          const stat = await fs.stat(candidateDir);
+          if (stat.isDirectory()) {
+            const recursiveResult = await findFirstValidPage(pagePath);
+            if (recursiveResult) return recursiveResult;
+          }
+        } catch {
+          // ignore
+        }
+      }
+    }
+  } catch {
+    // ignore
+  }
+
+  return null;
+}
+
+export async function GET(req: NextRequest) {
+  const url = new URL(req.url);
+  const rawPath = url.searchParams.get('path');
+
+  if (!rawPath || !rawPath.startsWith('/docs')) {
+    return NextResponse.json({ error: 'Invalid path' }, { status: 400 });
+  }
+
+  // 去除 /docs 前缀，且清理首尾斜杠
+  const relPath = rawPath.replace(/^\/docs\/?/, '').replace(/^\/|\/$/g, '');
+
+  try {
+    // 先检测是否有该 mdx 文件
+    const maybeFile = path.join(docsRoot, relPath + '.mdx');
+    await fs.access(maybeFile);
+    // 如果存在，返回完整路径（带 /docs）
+    return NextResponse.json('/docs/' + relPath);
+  } catch {
+    // 不存在，尝试递归寻找第一个有效页面
+    const found = await findFirstValidPage(relPath);
+    if (found) {
+      // 返回带 /docs 前缀的完整路径
+      return NextResponse.json('/docs/' + found.replace(/\\/g, '/'));
+    } else {
+      return NextResponse.json({ error: 'No valid mdx page found' }, { status: 404 });
+    }
+  }
+}