Add llms.txt (#502)

2026-04-16 07:26:35 +00:00 · 2025-12-01 20:39:03 +01:00
parent 1b73ce2855
commit 62d1627412
3 changed files with 245 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,7 @@ package-lock.json
 /generator/openapi/
 /generator/openapi.yml
 /generator/expandOpenAPIRef
 # LLM documentation (generated)
 /public/llms/
 /public/llms.txt
--- a/package.json
+++ b/package.json
@@ -3,9 +3,10 @@
  "version": "1.0.0",
  "private": true,
  "scripts": {
-    "dev": "next dev",
+    "dev": "npm run gen:llm && next dev",
-    "build": "next build",
+    "build": "npm run gen:llm && next build",
    "gen": "swagger-codegen generate -i https://raw.githubusercontent.com/netbirdio/netbird/main/management/server/http/api/openapi.yml -l openapi -o generator/openapi && npx ts-node generator/index.ts gen --input generator/openapi/openapi.json --output src/pages/ipa/resources",
    "gen:llm": "node scripts/generate-llm-docs.mjs",
    "start": "next start",
    "lint": "next lint"
  },
--- a/scripts/generate-llm-docs.mjs
+++ b/scripts/generate-llm-docs.mjs
@@ -0,0 +1,238 @@
 #!/usr/bin/env node
 /**
 * LLM Documentation Generator
 *
 * Generates clean markdown files from MDX pages for LLM indexing.
 * Creates:
 * - public/llms/*.md - Clean markdown versions of each page
 * - public/llms.txt - Index file linking to all pages
 */
 import fs from 'fs';
 import path from 'path';
 import { fileURLToPath } from 'url';
 // ES modules have no built-in __dirname; derive it from this module's URL.
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 // Project root: the parent of the directory that holds this script.
 const ROOT_DIR = path.resolve(__dirname, '..');
 // Where the MDX source pages are read from.
 const PAGES_DIR = path.join(ROOT_DIR, 'src/pages');
 // Where the per-page markdown files are written.
 const OUTPUT_DIR = path.join(ROOT_DIR, 'public/llms');
 // Location of the generated index file.
 const LLMS_TXT_PATH = path.join(ROOT_DIR, 'public/llms.txt');
 // Public origin of the docs site, used to build absolute links.
 const BASE_URL = 'https://docs.netbird.io';
 /**
  * Strip JSX/React components from MDX content, keeping clean markdown.
  *
  * The rules are regex-based and order-sensitive: specific conversions
  * (Button -> markdown link, iframe -> "[Video content]") run first, then
  * self-closing tags are dropped, and only then are paired components
  * removed or unwrapped. Running the generic strippers first would delete
  * the Buttons/iframes before they can be converted, and would let a
  * self-closing tag with attributes be mistaken for an opening tag.
  *
  * @param {string} content - Raw MDX source of a page.
  * @returns {string} Markdown with imports, exports and JSX removed, trimmed.
  */
 function stripJsx(content) {
   let result = content;

   // Remove import statements
   result = result.replace(/^import\s+.*?[;\n]/gm, '');

   // Remove export statements (the description export is dropped as well)
   result = result.replace(/^export\s+const\s+description\s*=\s*(['"`])(.+?)\1;?\s*$/gm, '');
   result = result.replace(/^export\s+.*?[;\n]/gm, '');

   // Convert Button components to markdown links. Self-closing forms go
   // first so the paired pattern never has to look past a "/>".
   result = result.replace(/<Button[^>]*children="([^"]*)"[^>]*href="([^"]*)"[^>]*\/>/g, '[$1]($2)');
   result = result.replace(/<Button[^>]*href="([^"]*)"[^>]*children="([^"]*)"[^>]*\/>/g, '[$2]($1)');
   result = result.replace(/<Button[^>]*href="([^"]*)"[^>]*>([^<]*)<\/Button>/g, '[$2]($1)');

   // Replace iframe embeds (videos, etc.) with a placeholder
   result = result.replace(/<iframe[^>]*\/>/g, '[Video content]');
   result = result.replace(/<iframe[\s\S]*?<\/iframe>/g, '[Video content]');

   // Remove every remaining self-closing tag, with or without attributes.
   // Doing this before the paired-component pass prevents `<Foo a="b" />`
   // from being read as an opening tag that swallows text up to a later
   // `</Foo>`.
   result = result.replace(/<[a-zA-Z]+[^>]*\/>/g, '');

   // Remove JSX components with children like <Note>...</Note>
   // (handles multi-line blocks)
   result = result.replace(/<([A-Z][a-zA-Z]*)[^>]*>[\s\S]*?<\/\1>/g, (match, tagName) => {
     // For callout components, keep the text content as a blockquote
     if (tagName === 'Note' || tagName === 'Warning' || tagName === 'Info') {
       const innerContent = match.replace(/<[^>]+>/g, '').trim();
       if (innerContent) {
         return `> **Note:** ${innerContent}\n`;
       }
     }
     return '';
   });

   // Unwrap remaining lowercase (HTML-like) paired tags, keeping inner text
   result = result.replace(/<([a-z][a-zA-Z]*)[^>]*>([\s\S]*?)<\/\1>/g, '$2');

   // Remove any div with className that survived the unwrap pass above
   // (e.g. the inner div of a nested pair)
   result = result.replace(/<div[^>]*className[^>]*>[\s\S]*?<\/div>/g, '');

   // Clean up excessive newlines
   result = result.replace(/\n{3,}/g, '\n\n');

   // Clean up leading/trailing whitespace
   return result.trim();
 }
 /**
  * Find the page title, i.e. the text of the first level-one (`# `) heading.
  *
  * @param {string} content - Markdown/MDX text to search.
  * @returns {string|null} Trimmed heading text, or null when there is no `# ` heading.
  */
 function extractTitle(content) {
   const found = content.match(/^#\s+(.+)$/m);
   return found ? found[1].trim() : null;
 }
 /**
  * Recursively collect all `.mdx` files under a directory.
  *
  * Entries are visited in name order so the result (and everything generated
  * from it) is the same on every machine; `readdirSync` itself makes no
  * ordering promise.
  *
  * @param {string} dir - Directory to scan.
  * @param {string} [baseDir=dir] - Directory that relative paths and slugs are computed against.
  * @returns {{fullPath: string, relativePath: string, slug: string}[]}
  *   One record per file. `relativePath` uses the OS separator; `slug` always
  *   uses `/`, has the `.mdx` extension removed and a trailing `/index` dropped.
  */
 function getMdxFiles(dir, baseDir = dir) {
   const entries = fs
     .readdirSync(dir, { withFileTypes: true })
     // Plain code-unit comparison: locale-independent and stable across hosts.
     .sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

   const files = [];
   for (const entry of entries) {
     const fullPath = path.join(dir, entry.name);
     if (entry.isDirectory()) {
       // Every subdirectory is included; nothing is skipped here.
       files.push(...getMdxFiles(fullPath, baseDir));
     } else if (entry.name.endsWith('.mdx')) {
       const relativePath = path.relative(baseDir, fullPath);
       // Slugs end up in URLs, so normalise Windows "\" separators to "/"
       // before stripping the extension and the trailing "/index".
       const slug = relativePath
         .split(path.sep)
         .join('/')
         .replace(/\.mdx$/, '')
         .replace(/\/index$/, '');
       files.push({ fullPath, relativePath, slug });
     }
   }
   return files;
 }
 /**
  * Entry point: rebuild public/llms/*.md and public/llms.txt from the MDX pages.
  *
  * Declared async so callers get a promise, although every step inside uses
  * the synchronous fs API. Any filesystem error rejects the returned promise.
  *
  * @returns {Promise<void>}
  */
 async function generate() {
   console.log('Generating LLM documentation...\n');

   // Start from an empty output directory so files for deleted pages do not linger.
   if (fs.existsSync(OUTPUT_DIR)) {
     fs.rmSync(OUTPUT_DIR, { recursive: true });
   }
   fs.mkdirSync(OUTPUT_DIR, { recursive: true });

   const mdxFiles = getMdxFiles(PAGES_DIR);
   console.log(`Found ${mdxFiles.length} MDX files\n`);

   // Convert each page and remember what was written for the index.
   const pages = mdxFiles.map(({ fullPath, slug }) => {
     const markdown = stripJsx(fs.readFileSync(fullPath, 'utf-8'));
     // Fall back to the last slug segment when the page has no "# " heading.
     const title = extractTitle(markdown) || slug.split('/').pop();

     // Mirror the page's directory structure below the output directory.
     const target = path.join(OUTPUT_DIR, `${slug}.md`);
     fs.mkdirSync(path.dirname(target), { recursive: true });

     // Prepend a small header pointing back at the original page.
     fs.writeFileSync(target, `# ${title}\n\nSource: ${BASE_URL}/${slug}\n\n---\n\n${markdown}`);
     console.log(`  Generated: ${slug}.md`);

     return { title, slug, path: `/llms/${slug}.md` };
   });

   // Write the index that links to every generated file.
   fs.writeFileSync(LLMS_TXT_PATH, generateLlmsTxt(pages));
   console.log(`\nGenerated llms.txt with ${pages.length} pages`);
   console.log('\nDone!');
 }
 /**
  * Build the text of llms.txt: a fixed preamble, one "###" section per
  * top-level slug segment listing its pages as markdown links, and a footer
  * with a generation timestamp.
  *
  * @param {{title: string, slug: string, path: string}[]} pages - Generated
  *   pages; `path` is the site-relative URL of the markdown file.
  * @returns {string} Complete llms.txt content.
  */
 function generateLlmsTxt(pages) {
   // Group pages by section (first slug segment). A Map is used instead of a
   // plain object so that a directory named like an Object.prototype member
   // (e.g. "constructor") cannot break grouping, and so that sections keep
   // first-seen order even when a name looks like an integer.
   const sections = new Map();
   for (const page of pages) {
     const section = page.slug.split('/')[0] || 'root';
     if (!sections.has(section)) {
       sections.set(section, []);
     }
     sections.get(section).push(page);
   }
   let content = `# NetBird Documentation
 > NetBird is an open-source WireGuard-based mesh VPN platform that creates secure private networks.
 > This file provides an index of all documentation pages in markdown format for LLM consumption.
 ## About
 - Website: https://netbird.io
 - Documentation: ${BASE_URL}
 - GitHub: https://github.com/netbirdio/netbird
 ## Documentation Index
 Each link below points to a clean markdown version of the documentation page.
 `;
   // Section display names; sections not listed fall back to a capitalised slug.
   const sectionNames = new Map([
     ['introduction', 'Introduction'],
     ['about-netbird', 'About NetBird'],
     ['get-started', 'Getting Started'],
     ['manage', 'Managing NetBird'],
     ['selfhosted', 'Self-Hosting'],
     ['ipa', 'API Reference'],
     ['use-cases', 'Use Cases'],
     ['client', 'Client'],
     ['help', 'Help & Troubleshooting'],
   ]);
   for (const [section, sectionPages] of sections) {
     const sectionTitle = sectionNames.get(section) || section.charAt(0).toUpperCase() + section.slice(1);
     content += `### ${sectionTitle}\n\n`;
     for (const page of sectionPages) {
       content += `- [${page.title}](${BASE_URL}${page.path})\n`;
     }
     content += '\n';
   }
   content += `---
 ## Usage
 LLMs can fetch individual markdown files to get detailed information about specific topics.
 Each markdown file contains the clean documentation content with a link back to the original page.
 Generated: ${new Date().toISOString()}
 `;
   return content;
 }
 // Run. On failure, report the error AND mark the process as failed so that
 // chained commands such as `npm run gen:llm && next build` stop instead of
 // continuing with missing or partially generated docs.
 generate().catch((error) => {
   console.error(error);
   process.exitCode = 1;
 });