205 changes: 205 additions & 0 deletions dev/create-release/generate-llms-txt.py
@@ -0,0 +1,205 @@
#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script generates the llms.txt file for Apache Spark documentation.
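#
# Example invocation (run from the repository root; the version string below
# is illustrative):
#
#   python dev/create-release/generate-llms-txt.py \
#       --docs-path docs --output llms.txt --version 4.0.0
#
# The generated file is Markdown: an H1 title, a blockquote summary, then
# H2 sections of link lists, e.g.:
#
#   # Apache Spark
#   ## Programming Guides
#   - [Quick Start](https://spark.apache.org/docs/4.0.0/quick-start.html)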

import sys
import argparse
from pathlib import Path


def generate_llms_txt(docs_path: Path, output_path: Path, version: str = "latest") -> None:
"""
Generate the llms.txt file for Apache Spark documentation with hardcoded categories.
"""
content = []
content.append("# Apache Spark")
content.append("")
content.append(
"> Apache Spark™ is a unified analytics engine for large-scale data processing. "
"It provides high-level APIs in Java, Scala, Python, and R, and an optimized engine "
"that supports general execution graphs. It also supports a rich set of higher-level "
"tools including Spark SQL for SQL and structured data processing, MLlib for machine "
"learning, GraphX for graph processing, and Structured Streaming for incremental "
"computation and stream processing."
)
content.append("")

doc_home_url = f"https://spark.apache.org/docs/{version}/"
content.append(f"Documentation home: {doc_home_url}")
content.append("")

content.append("## Programming Guides")
content.append("")
programming_guides = [
("Quick Start", f"https://spark.apache.org/docs/{version}/quick-start.html"),
(
"RDD Programming Guide",
f"https://spark.apache.org/docs/{version}/rdd-programming-guide.html",
),
(
"Spark SQL, Datasets, and DataFrames",
f"https://spark.apache.org/docs/{version}/sql-programming-guide.html",
),
(
"Structured Streaming",
f"https://spark.apache.org/docs/{version}/structured-streaming-programming-guide.html",
),
(
"Spark Streaming",
f"https://spark.apache.org/docs/{version}/streaming-programming-guide.html",
),
("MLlib", f"https://spark.apache.org/docs/{version}/ml-guide.html"),
("GraphX", f"https://spark.apache.org/docs/{version}/graphx-programming-guide.html"),
("SparkR", f"https://spark.apache.org/docs/{version}/sparkr.html"),
(
"PySpark",
f"https://spark.apache.org/docs/{version}/api/python/getting_started/index.html",
),
(
"Spark SQL CLI",
f"https://spark.apache.org/docs/{version}/sql-distributed-sql-engine-spark-sql-cli.html",
),
]
for title, url in programming_guides:
content.append(f"- [{title}]({url})")
content.append("")

content.append("## API Docs")
content.append("")
# TODO: Update API docs to point to their own llms.txt files once available
# e.g., https://spark.apache.org/docs/{version}/api/python/llms.txt
api_docs = [
("Spark Python API", f"https://spark.apache.org/docs/{version}/api/python/index.html"),
(
"Spark Scala API",
f"https://spark.apache.org/docs/{version}/api/scala/org/apache/spark/index.html",
),
("Spark Java API", f"https://spark.apache.org/docs/{version}/api/java/index.html"),
("Spark R API", f"https://spark.apache.org/docs/{version}/api/R/index.html"),
(
"Spark SQL Built-in Functions",
f"https://spark.apache.org/docs/{version}/api/sql/index.html",
),
]
for title, url in api_docs:
content.append(f"- [{title}]({url})")
content.append("")

content.append("## Deployment Guides")
content.append("")
deployment_guides = [
("Cluster Overview", f"https://spark.apache.org/docs/{version}/cluster-overview.html"),
(
"Submitting Applications",
f"https://spark.apache.org/docs/{version}/submitting-applications.html",
),
(
"Standalone Deploy Mode",
f"https://spark.apache.org/docs/{version}/spark-standalone.html",
),
("YARN", f"https://spark.apache.org/docs/{version}/running-on-yarn.html"),
("Kubernetes", f"https://spark.apache.org/docs/{version}/running-on-kubernetes.html"),
]
for title, url in deployment_guides:
content.append(f"- [{title}]({url})")
content.append("")

content.append("## Other Documents")
content.append("")
other_docs = [
("Configuration", f"https://spark.apache.org/docs/{version}/configuration.html"),
("Monitoring", f"https://spark.apache.org/docs/{version}/monitoring.html"),
("Web UI", f"https://spark.apache.org/docs/{version}/web-ui.html"),
("Tuning Guide", f"https://spark.apache.org/docs/{version}/tuning.html"),
("Job Scheduling", f"https://spark.apache.org/docs/{version}/job-scheduling.html"),
("Security", f"https://spark.apache.org/docs/{version}/security.html"),
(
"Hardware Provisioning",
f"https://spark.apache.org/docs/{version}/hardware-provisioning.html",
),
(
"Cloud Infrastructures",
f"https://spark.apache.org/docs/{version}/cloud-integration.html",
),
("Migration Guide", f"https://spark.apache.org/docs/{version}/migration-guide.html"),
("Building Spark", f"https://spark.apache.org/docs/{version}/building-spark.html"),
]
for title, url in other_docs:
content.append(f"- [{title}]({url})")
content.append("")

content.append("## External Resources")
content.append("")
content.append("- [Apache Spark Home](https://spark.apache.org/)")
content.append("- [Downloads](https://spark.apache.org/downloads.html)")
content.append("- [GitHub Repository](https://github.com/apache/spark)")
content.append("- [Issue Tracker (JIRA)](https://issues.apache.org/jira/projects/SPARK)")
content.append("- [Mailing Lists](https://spark.apache.org/mailing-lists.html)")
content.append("- [Community](https://spark.apache.org/community.html)")
content.append("- [Contributing](https://spark.apache.org/contributing.html)")
content.append("")

with open(output_path, "w", encoding="utf-8") as f:
f.write("\n".join(content))

print(f"Generated {output_path}")

    total_docs = len(programming_guides) + len(api_docs) + len(deployment_guides) + len(other_docs)
    # Programming Guides, API Docs, Deployment Guides, Other Documents, External Resources
    sections_count = 5

print(f"Total documentation pages indexed: {total_docs}")
print(f"Sections: {sections_count}")


def main():
parser = argparse.ArgumentParser(
description="Generate llms.txt file for Apache Spark documentation"
)
parser.add_argument(
"--docs-path", type=str, default="docs", help="Path to the docs directory (default: docs)"
)
parser.add_argument(
"--output", type=str, default="llms.txt", help="Output file path (default: llms.txt)"
)
parser.add_argument(
"--version",
type=str,
default="latest",
help="Spark documentation version (default: latest)",
)

args = parser.parse_args()

# Convert to Path objects
script_dir = Path(__file__).parent
project_root = script_dir.parent.parent # Go up two levels from dev/create-release/
docs_path = project_root / args.docs_path
output_path = project_root / args.output

# Check if docs directory exists
if not docs_path.exists():
print(f"Error: Documentation directory '{docs_path}' does not exist")
sys.exit(1)

# Generate the llms.txt file
generate_llms_txt(docs_path, output_path, args.version)


if __name__ == "__main__":
main()
8 changes: 8 additions & 0 deletions dev/create-release/release-build.sh
@@ -833,6 +833,14 @@ if [[ "$1" == "docs" ]]; then
fi
bundle install
PRODUCTION=1 RELEASE_VERSION="$SPARK_VERSION" bundle exec jekyll build

# Generate llms.txt for LLM consumption
echo "Generating llms.txt..."
python "$SELF/generate-llms-txt.py" \
--docs-path . \
--output _site/llms.txt \
--version "$SPARK_VERSION"

cd ..
cd ..
