From fafcbff876c482000a418fbf2a7eeeb950e10576 Mon Sep 17 00:00:00 2001 From: Allison Wang Date: Mon, 22 Sep 2025 13:21:11 -0700 Subject: [PATCH 1/3] llmtxt --- dev/create-release/generate-llm-txt.py | 323 +++++++++++++++++++++++++ 1 file changed, 323 insertions(+) create mode 100755 dev/create-release/generate-llm-txt.py diff --git a/dev/create-release/generate-llm-txt.py b/dev/create-release/generate-llm-txt.py new file mode 100755 index 0000000000000..6603dddc41936 --- /dev/null +++ b/dev/create-release/generate-llm-txt.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This script generates llm.txt file for Apache Spark documentation + +import os +import re +import sys +import argparse +from pathlib import Path +from typing import List, Tuple, Dict, Optional + + +def extract_title_from_md(file_path: Path) -> str: + """ + Extract the title from a markdown file by looking for Jekyll front matter. + Falls back to first H1 header, then to filename if neither found. + """ + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # First, try to extract from Jekyll front matter + if content.startswith('---'): + front_matter_end = content.find('---', 3) + if front_matter_end != -1: + front_matter = content[3:front_matter_end] + # Look for displayTitle first, then title + for line in front_matter.split('\n'): + if line.startswith('displayTitle:'): + title = line[13:].strip().strip('"').strip("'") + if title: + return title + elif line.startswith('title:'): + title = line[6:].strip().strip('"').strip("'") + if title and title != "Overview": # Skip generic "Overview" + return title + + # Then try to find first H1 header after front matter + lines = content.split('\n') + for line in lines: + if line.startswith('# '): + title = line[2:].strip() + if title and not title.startswith('{'): # Skip Jekyll variables + return title + except Exception: + pass + + # Fallback to filename-based title + filename = file_path.stem + # Convert filename to title (e.g., building-spark -> Building Spark) + title = filename.replace('-', ' ').replace('_', ' ') + return title.title() + + +def parse_index_md(index_path: Path) -> Dict[str, List[Tuple[str, str]]]: + """ + Parse the index.md file to extract the documentation structure. + Returns a dictionary with section names as keys and lists of (title, link) tuples as values. 
+ """ + structure = {} + current_section = None + + try: + with open(index_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Skip front matter + if content.startswith('---'): + front_matter_end = content.find('---', 3) + if front_matter_end != -1: + content = content[front_matter_end + 3:] + + lines = content.split('\n') + i = 0 + while i < len(lines): + line = lines[i].strip() + + # Detect main sections + if line == "**Programming Guides:**": + current_section = "Programming Guides" + structure[current_section] = [] + elif line == "**API Docs:**": + current_section = "API Docs" + structure[current_section] = [] + elif line == "**Deployment Guides:**": + current_section = "Deployment Guides" + structure[current_section] = [] + elif line == "**Other Documents:**": + current_section = "Other Documents" + structure[current_section] = [] + elif line == "**External Resources:**": + current_section = "External Resources" + structure[current_section] = [] + # Parse links in the current section + elif current_section and line.startswith('*'): + # Extract markdown links [title](url) + link_pattern = r'\[([^\]]+)\]\(([^\)]+)\)' + matches = re.findall(link_pattern, line) + for title, url in matches: + # Only include internal documentation links + if not url.startswith('http') and url.endswith('.html'): + structure[current_section].append((title, url)) + elif url.startswith('api/'): + # Include API doc links + structure[current_section].append((title, url)) + + i += 1 + except Exception as e: + print(f"Warning: Could not parse index.md: {e}", file=sys.stderr) + + return structure + + +def get_all_docs_files(docs_path: Path) -> Dict[str, Tuple[str, str]]: + """ + Get all documentation files and their titles. + Returns a dictionary with filename (without .md) as key and (title, path) as value. + """ + docs_files = {} + + # Process root level markdown files + for md_file in docs_path.glob("*.md"): + if md_file.name in ["README.md", "404.md"]: + continue + filename = md_file.stem + title = extract_title_from_md(md_file) + docs_files[filename] = (title, str(md_file.relative_to(docs_path))) + + # Process subdirectory markdown files (e.g., streaming/) + for subdir in docs_path.iterdir(): + if subdir.is_dir() and not subdir.name.startswith('_') and subdir.name not in ['css', 'img', 'js', 'api']: + for md_file in subdir.glob("*.md"): + if md_file.name == "README.md": + continue + # Use relative path as key + rel_path = md_file.relative_to(docs_path) + filename = str(rel_path.with_suffix('')) + title = extract_title_from_md(md_file) + docs_files[filename] = (title, str(rel_path)) + + return docs_files + + +def generate_llm_txt(docs_path: Path, output_path: Path, version: str = "latest") -> None: + """ + Generate the llm.txt file for Apache Spark documentation using structure from index.md. + """ + # Parse the index.md file to get the documentation structure + index_path = docs_path / "index.md" + structure = parse_index_md(index_path) + + # Get all available documentation files + all_docs = get_all_docs_files(docs_path) + + # Generate the content + content = [] + content.append("# Apache Spark") + content.append("") + content.append("> Apache Spark™ is a unified analytics engine for large-scale data processing. " + "It provides high-level APIs in Java, Scala, Python, and R, and an optimized engine " + "that supports general execution graphs. 
It also supports a rich set of higher-level " + "tools including Spark SQL for SQL and structured data processing, MLlib for machine " + "learning, GraphX for graph processing, and Structured Streaming for incremental " + "computation and stream processing.") + content.append("") + + # Main documentation link + doc_home_url = f"https://spark.apache.org/docs/{version}/" if version != "latest" else "https://spark.apache.org/docs/latest/" + content.append(f"Documentation home: {doc_home_url}") + content.append("") + + # Process each section from index.md + sections_to_include = ["Programming Guides", "API Docs", "Deployment Guides", "Other Documents"] + + for section in sections_to_include: + if section in structure and structure[section]: + content.append(f"## {section}") + content.append("") + + for title, link in structure[section]: + # Convert relative links to absolute URLs + if link.startswith('api/'): + # API documentation links + url = f"https://spark.apache.org/docs/{version}/{link}" + elif link.endswith('.html'): + # Regular documentation links + base_name = link.replace('.html', '') + + # Handle special cases like streaming/index.html + if '/' in base_name: + url = f"https://spark.apache.org/docs/{version}/{link}" + else: + url = f"https://spark.apache.org/docs/{version}/{link}" + else: + # External links (shouldn't happen in these sections) + continue + + content.append(f"- [{title}]({url})") + content.append("") + + # Add main SQL reference pages (not every single SQL command) + sql_refs = [] + # Only include high-level SQL reference pages, not individual commands + important_sql_refs = [ + 'sql-ref', # Main SQL reference + 'sql-ref-ansi-compliance', # ANSI compliance + 'sql-ref-datatypes', # Data types + 'sql-ref-datetime-pattern', # Datetime patterns + 'sql-ref-functions', # Functions overview + 'sql-ref-functions-builtin', # Built-in functions + 'sql-ref-identifier', # Identifiers + 'sql-ref-literals', # Literals + 'sql-ref-null-semantics', # NULL semantics + 'sql-ref-syntax', # SQL syntax overview + ] + + for filename, (title, _) in all_docs.items(): + if filename in important_sql_refs: + url = f"https://spark.apache.org/docs/{version}/{filename}.html" + sql_refs.append((title, url)) + + if sql_refs: + content.append("## SQL Reference") + content.append("") + for title, url in sorted(sql_refs): + content.append(f"- [{title}]({url})") + content.append("") + + # Add streaming documentation from the streaming/ subdirectory + streaming_docs = [] + for filename, (title, _) in all_docs.items(): + if filename.startswith('streaming/'): + url = f"https://spark.apache.org/docs/{version}/{filename}.html" + streaming_docs.append((title, url)) + + if streaming_docs: + content.append("## Structured Streaming (Detailed)") + content.append("") + for title, url in sorted(streaming_docs): + content.append(f"- [{title}]({url})") + content.append("") + + # External Resources + content.append("## External Resources") + content.append("") + content.append("- [Apache Spark Home](https://spark.apache.org/)") + content.append("- [Downloads](https://spark.apache.org/downloads.html)") + content.append("- [GitHub Repository](https://github.com/apache/spark)") + content.append("- [Issue Tracker (JIRA)](https://issues.apache.org/jira/projects/SPARK)") + content.append("- [Mailing Lists](https://spark.apache.org/mailing-lists.html)") + content.append("- [Community](https://spark.apache.org/community.html)") + content.append("- [Contributing](https://spark.apache.org/contributing.html)") + 
content.append("") + + # Write to file + with open(output_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(content)) + + print(f"Generated {output_path}") + + # Count total docs included + total_docs = sum(len(v) for v in structure.values()) + total_docs += len(sql_refs) + len(streaming_docs) + print(f"Total documentation pages indexed: {total_docs}") + print(f"Sections: {len([s for s in structure if structure[s]]) + (1 if sql_refs else 0) + (1 if streaming_docs else 0)}") + + +def main(): + parser = argparse.ArgumentParser(description='Generate llm.txt file for Apache Spark documentation') + parser.add_argument( + '--docs-path', + type=str, + default='docs', + help='Path to the docs directory (default: docs)' + ) + parser.add_argument( + '--output', + type=str, + default='llm.txt', + help='Output file path (default: llm.txt)' + ) + parser.add_argument( + '--version', + type=str, + default='latest', + help='Spark documentation version (default: latest)' + ) + + args = parser.parse_args() + + # Convert to Path objects + script_dir = Path(__file__).parent + project_root = script_dir.parent.parent # Go up two levels from dev/create-release/ + docs_path = project_root / args.docs_path + output_path = project_root / args.output + + # Check if docs directory exists + if not docs_path.exists(): + print(f"Error: Documentation directory '{docs_path}' does not exist") + sys.exit(1) + + # Generate the llm.txt file + generate_llm_txt(docs_path, output_path, args.version) + + +if __name__ == "__main__": + main() \ No newline at end of file From 5a80169e46737347d34746029d771b6e548dce2d Mon Sep 17 00:00:00 2001 From: Allison Wang Date: Mon, 22 Sep 2025 13:37:31 -0700 Subject: [PATCH 2/3] fix --- dev/create-release/generate-llm-txt.py | 323 ------------------------ dev/create-release/generate-llms-txt.py | 171 +++++++++++++ dev/create-release/release-build.sh | 8 + 3 files changed, 179 insertions(+), 323 deletions(-) delete mode 100755 dev/create-release/generate-llm-txt.py create mode 100755 dev/create-release/generate-llms-txt.py diff --git a/dev/create-release/generate-llm-txt.py b/dev/create-release/generate-llm-txt.py deleted file mode 100755 index 6603dddc41936..0000000000000 --- a/dev/create-release/generate-llm-txt.py +++ /dev/null @@ -1,323 +0,0 @@ -#!/usr/bin/env python3 - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This script generates llm.txt file for Apache Spark documentation - -import os -import re -import sys -import argparse -from pathlib import Path -from typing import List, Tuple, Dict, Optional - - -def extract_title_from_md(file_path: Path) -> str: - """ - Extract the title from a markdown file by looking for Jekyll front matter. - Falls back to first H1 header, then to filename if neither found. 
- """ - try: - with open(file_path, 'r', encoding='utf-8') as f: - content = f.read() - - # First, try to extract from Jekyll front matter - if content.startswith('---'): - front_matter_end = content.find('---', 3) - if front_matter_end != -1: - front_matter = content[3:front_matter_end] - # Look for displayTitle first, then title - for line in front_matter.split('\n'): - if line.startswith('displayTitle:'): - title = line[13:].strip().strip('"').strip("'") - if title: - return title - elif line.startswith('title:'): - title = line[6:].strip().strip('"').strip("'") - if title and title != "Overview": # Skip generic "Overview" - return title - - # Then try to find first H1 header after front matter - lines = content.split('\n') - for line in lines: - if line.startswith('# '): - title = line[2:].strip() - if title and not title.startswith('{'): # Skip Jekyll variables - return title - except Exception: - pass - - # Fallback to filename-based title - filename = file_path.stem - # Convert filename to title (e.g., building-spark -> Building Spark) - title = filename.replace('-', ' ').replace('_', ' ') - return title.title() - - -def parse_index_md(index_path: Path) -> Dict[str, List[Tuple[str, str]]]: - """ - Parse the index.md file to extract the documentation structure. - Returns a dictionary with section names as keys and lists of (title, link) tuples as values. - """ - structure = {} - current_section = None - - try: - with open(index_path, 'r', encoding='utf-8') as f: - content = f.read() - - # Skip front matter - if content.startswith('---'): - front_matter_end = content.find('---', 3) - if front_matter_end != -1: - content = content[front_matter_end + 3:] - - lines = content.split('\n') - i = 0 - while i < len(lines): - line = lines[i].strip() - - # Detect main sections - if line == "**Programming Guides:**": - current_section = "Programming Guides" - structure[current_section] = [] - elif line == "**API Docs:**": - current_section = "API Docs" - structure[current_section] = [] - elif line == "**Deployment Guides:**": - current_section = "Deployment Guides" - structure[current_section] = [] - elif line == "**Other Documents:**": - current_section = "Other Documents" - structure[current_section] = [] - elif line == "**External Resources:**": - current_section = "External Resources" - structure[current_section] = [] - # Parse links in the current section - elif current_section and line.startswith('*'): - # Extract markdown links [title](url) - link_pattern = r'\[([^\]]+)\]\(([^\)]+)\)' - matches = re.findall(link_pattern, line) - for title, url in matches: - # Only include internal documentation links - if not url.startswith('http') and url.endswith('.html'): - structure[current_section].append((title, url)) - elif url.startswith('api/'): - # Include API doc links - structure[current_section].append((title, url)) - - i += 1 - except Exception as e: - print(f"Warning: Could not parse index.md: {e}", file=sys.stderr) - - return structure - - -def get_all_docs_files(docs_path: Path) -> Dict[str, Tuple[str, str]]: - """ - Get all documentation files and their titles. - Returns a dictionary with filename (without .md) as key and (title, path) as value. 
- """ - docs_files = {} - - # Process root level markdown files - for md_file in docs_path.glob("*.md"): - if md_file.name in ["README.md", "404.md"]: - continue - filename = md_file.stem - title = extract_title_from_md(md_file) - docs_files[filename] = (title, str(md_file.relative_to(docs_path))) - - # Process subdirectory markdown files (e.g., streaming/) - for subdir in docs_path.iterdir(): - if subdir.is_dir() and not subdir.name.startswith('_') and subdir.name not in ['css', 'img', 'js', 'api']: - for md_file in subdir.glob("*.md"): - if md_file.name == "README.md": - continue - # Use relative path as key - rel_path = md_file.relative_to(docs_path) - filename = str(rel_path.with_suffix('')) - title = extract_title_from_md(md_file) - docs_files[filename] = (title, str(rel_path)) - - return docs_files - - -def generate_llm_txt(docs_path: Path, output_path: Path, version: str = "latest") -> None: - """ - Generate the llm.txt file for Apache Spark documentation using structure from index.md. - """ - # Parse the index.md file to get the documentation structure - index_path = docs_path / "index.md" - structure = parse_index_md(index_path) - - # Get all available documentation files - all_docs = get_all_docs_files(docs_path) - - # Generate the content - content = [] - content.append("# Apache Spark") - content.append("") - content.append("> Apache Spark™ is a unified analytics engine for large-scale data processing. " - "It provides high-level APIs in Java, Scala, Python, and R, and an optimized engine " - "that supports general execution graphs. It also supports a rich set of higher-level " - "tools including Spark SQL for SQL and structured data processing, MLlib for machine " - "learning, GraphX for graph processing, and Structured Streaming for incremental " - "computation and stream processing.") - content.append("") - - # Main documentation link - doc_home_url = f"https://spark.apache.org/docs/{version}/" if version != "latest" else "https://spark.apache.org/docs/latest/" - content.append(f"Documentation home: {doc_home_url}") - content.append("") - - # Process each section from index.md - sections_to_include = ["Programming Guides", "API Docs", "Deployment Guides", "Other Documents"] - - for section in sections_to_include: - if section in structure and structure[section]: - content.append(f"## {section}") - content.append("") - - for title, link in structure[section]: - # Convert relative links to absolute URLs - if link.startswith('api/'): - # API documentation links - url = f"https://spark.apache.org/docs/{version}/{link}" - elif link.endswith('.html'): - # Regular documentation links - base_name = link.replace('.html', '') - - # Handle special cases like streaming/index.html - if '/' in base_name: - url = f"https://spark.apache.org/docs/{version}/{link}" - else: - url = f"https://spark.apache.org/docs/{version}/{link}" - else: - # External links (shouldn't happen in these sections) - continue - - content.append(f"- [{title}]({url})") - content.append("") - - # Add main SQL reference pages (not every single SQL command) - sql_refs = [] - # Only include high-level SQL reference pages, not individual commands - important_sql_refs = [ - 'sql-ref', # Main SQL reference - 'sql-ref-ansi-compliance', # ANSI compliance - 'sql-ref-datatypes', # Data types - 'sql-ref-datetime-pattern', # Datetime patterns - 'sql-ref-functions', # Functions overview - 'sql-ref-functions-builtin', # Built-in functions - 'sql-ref-identifier', # Identifiers - 'sql-ref-literals', # Literals - 
'sql-ref-null-semantics', # NULL semantics - 'sql-ref-syntax', # SQL syntax overview - ] - - for filename, (title, _) in all_docs.items(): - if filename in important_sql_refs: - url = f"https://spark.apache.org/docs/{version}/{filename}.html" - sql_refs.append((title, url)) - - if sql_refs: - content.append("## SQL Reference") - content.append("") - for title, url in sorted(sql_refs): - content.append(f"- [{title}]({url})") - content.append("") - - # Add streaming documentation from the streaming/ subdirectory - streaming_docs = [] - for filename, (title, _) in all_docs.items(): - if filename.startswith('streaming/'): - url = f"https://spark.apache.org/docs/{version}/{filename}.html" - streaming_docs.append((title, url)) - - if streaming_docs: - content.append("## Structured Streaming (Detailed)") - content.append("") - for title, url in sorted(streaming_docs): - content.append(f"- [{title}]({url})") - content.append("") - - # External Resources - content.append("## External Resources") - content.append("") - content.append("- [Apache Spark Home](https://spark.apache.org/)") - content.append("- [Downloads](https://spark.apache.org/downloads.html)") - content.append("- [GitHub Repository](https://github.com/apache/spark)") - content.append("- [Issue Tracker (JIRA)](https://issues.apache.org/jira/projects/SPARK)") - content.append("- [Mailing Lists](https://spark.apache.org/mailing-lists.html)") - content.append("- [Community](https://spark.apache.org/community.html)") - content.append("- [Contributing](https://spark.apache.org/contributing.html)") - content.append("") - - # Write to file - with open(output_path, 'w', encoding='utf-8') as f: - f.write('\n'.join(content)) - - print(f"Generated {output_path}") - - # Count total docs included - total_docs = sum(len(v) for v in structure.values()) - total_docs += len(sql_refs) + len(streaming_docs) - print(f"Total documentation pages indexed: {total_docs}") - print(f"Sections: {len([s for s in structure if structure[s]]) + (1 if sql_refs else 0) + (1 if streaming_docs else 0)}") - - -def main(): - parser = argparse.ArgumentParser(description='Generate llm.txt file for Apache Spark documentation') - parser.add_argument( - '--docs-path', - type=str, - default='docs', - help='Path to the docs directory (default: docs)' - ) - parser.add_argument( - '--output', - type=str, - default='llm.txt', - help='Output file path (default: llm.txt)' - ) - parser.add_argument( - '--version', - type=str, - default='latest', - help='Spark documentation version (default: latest)' - ) - - args = parser.parse_args() - - # Convert to Path objects - script_dir = Path(__file__).parent - project_root = script_dir.parent.parent # Go up two levels from dev/create-release/ - docs_path = project_root / args.docs_path - output_path = project_root / args.output - - # Check if docs directory exists - if not docs_path.exists(): - print(f"Error: Documentation directory '{docs_path}' does not exist") - sys.exit(1) - - # Generate the llm.txt file - generate_llm_txt(docs_path, output_path, args.version) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/dev/create-release/generate-llms-txt.py b/dev/create-release/generate-llms-txt.py new file mode 100755 index 0000000000000..027e3089a70fb --- /dev/null +++ b/dev/create-release/generate-llms-txt.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This script generates llms.txt file for Apache Spark documentation + +import sys +import argparse +from pathlib import Path + + +def generate_llms_txt(docs_path: Path, output_path: Path, version: str = "latest") -> None: + """ + Generate the llms.txt file for Apache Spark documentation with hardcoded categories. + """ + content = [] + content.append("# Apache Spark") + content.append("") + content.append("> Apache Spark™ is a unified analytics engine for large-scale data processing. " + "It provides high-level APIs in Java, Scala, Python, and R, and an optimized engine " + "that supports general execution graphs. It also supports a rich set of higher-level " + "tools including Spark SQL for SQL and structured data processing, MLlib for machine " + "learning, GraphX for graph processing, and Structured Streaming for incremental " + "computation and stream processing.") + content.append("") + + doc_home_url = f"https://spark.apache.org/docs/{version}/" + content.append(f"Documentation home: {doc_home_url}") + content.append("") + + content.append("## Programming Guides") + content.append("") + programming_guides = [ + ("Quick Start", f"https://spark.apache.org/docs/{version}/quick-start.html"), + ("RDD Programming Guide", f"https://spark.apache.org/docs/{version}/rdd-programming-guide.html"), + ("Spark SQL, Datasets, and DataFrames", f"https://spark.apache.org/docs/{version}/sql-programming-guide.html"), + ("Structured Streaming", f"https://spark.apache.org/docs/{version}/structured-streaming-programming-guide.html"), + ("Spark Streaming", f"https://spark.apache.org/docs/{version}/streaming-programming-guide.html"), + ("MLlib", f"https://spark.apache.org/docs/{version}/ml-guide.html"), + ("GraphX", f"https://spark.apache.org/docs/{version}/graphx-programming-guide.html"), + ("SparkR", f"https://spark.apache.org/docs/{version}/sparkr.html"), + ("PySpark", f"https://spark.apache.org/docs/{version}/api/python/getting_started/index.html"), + ("Spark SQL CLI", f"https://spark.apache.org/docs/{version}/sql-distributed-sql-engine-spark-sql-cli.html"), + ] + for title, url in programming_guides: + content.append(f"- [{title}]({url})") + content.append("") + + content.append("## API Docs") + content.append("") + # TODO: Update API docs to point to their own llms.txt files once available + # e.g., https://spark.apache.org/docs/{version}/api/python/llms.txt + api_docs = [ + ("Spark Python API", f"https://spark.apache.org/docs/{version}/api/python/index.html"), + ("Spark Scala API", f"https://spark.apache.org/docs/{version}/api/scala/org/apache/spark/index.html"), + ("Spark Java API", f"https://spark.apache.org/docs/{version}/api/java/index.html"), + ("Spark R API", f"https://spark.apache.org/docs/{version}/api/R/index.html"), + ("Spark SQL Built-in Functions", f"https://spark.apache.org/docs/{version}/api/sql/index.html"), + 
] + for title, url in api_docs: + content.append(f"- [{title}]({url})") + content.append("") + + content.append("## Deployment Guides") + content.append("") + deployment_guides = [ + ("Cluster Overview", f"https://spark.apache.org/docs/{version}/cluster-overview.html"), + ("Submitting Applications", f"https://spark.apache.org/docs/{version}/submitting-applications.html"), + ("Standalone Deploy Mode", f"https://spark.apache.org/docs/{version}/spark-standalone.html"), + ("YARN", f"https://spark.apache.org/docs/{version}/running-on-yarn.html"), + ("Kubernetes", f"https://spark.apache.org/docs/{version}/running-on-kubernetes.html"), + ] + for title, url in deployment_guides: + content.append(f"- [{title}]({url})") + content.append("") + + content.append("## Other Documents") + content.append("") + other_docs = [ + ("Configuration", f"https://spark.apache.org/docs/{version}/configuration.html"), + ("Monitoring", f"https://spark.apache.org/docs/{version}/monitoring.html"), + ("Web UI", f"https://spark.apache.org/docs/{version}/web-ui.html"), + ("Tuning Guide", f"https://spark.apache.org/docs/{version}/tuning.html"), + ("Job Scheduling", f"https://spark.apache.org/docs/{version}/job-scheduling.html"), + ("Security", f"https://spark.apache.org/docs/{version}/security.html"), + ("Hardware Provisioning", f"https://spark.apache.org/docs/{version}/hardware-provisioning.html"), + ("Cloud Infrastructures", f"https://spark.apache.org/docs/{version}/cloud-integration.html"), + ("Migration Guide", f"https://spark.apache.org/docs/{version}/migration-guide.html"), + ("Building Spark", f"https://spark.apache.org/docs/{version}/building-spark.html"), + ] + for title, url in other_docs: + content.append(f"- [{title}]({url})") + content.append("") + + content.append("## External Resources") + content.append("") + content.append("- [Apache Spark Home](https://spark.apache.org/)") + content.append("- [Downloads](https://spark.apache.org/downloads.html)") + content.append("- [GitHub Repository](https://github.com/apache/spark)") + content.append("- [Issue Tracker (JIRA)](https://issues.apache.org/jira/projects/SPARK)") + content.append("- [Mailing Lists](https://spark.apache.org/mailing-lists.html)") + content.append("- [Community](https://spark.apache.org/community.html)") + content.append("- [Contributing](https://spark.apache.org/contributing.html)") + content.append("") + + with open(output_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(content)) + + print(f"Generated {output_path}") + + total_docs = len(programming_guides) + len(api_docs) + len(deployment_guides) + len(other_docs) + sections_count = 5 + + print(f"Total documentation pages indexed: {total_docs}") + print(f"Sections: {sections_count}") + + +def main(): + parser = argparse.ArgumentParser(description='Generate llms.txt file for Apache Spark documentation') + parser.add_argument( + '--docs-path', + type=str, + default='docs', + help='Path to the docs directory (default: docs)' + ) + parser.add_argument( + '--output', + type=str, + default='llms.txt', + help='Output file path (default: llms.txt)' + ) + parser.add_argument( + '--version', + type=str, + default='latest', + help='Spark documentation version (default: latest)' + ) + + args = parser.parse_args() + + # Convert to Path objects + script_dir = Path(__file__).parent + project_root = script_dir.parent.parent # Go up two levels from dev/create-release/ + docs_path = project_root / args.docs_path + output_path = project_root / args.output + + # Check if docs directory exists + if not 
docs_path.exists(): + print(f"Error: Documentation directory '{docs_path}' does not exist") + sys.exit(1) + + # Generate the llms.txt file + generate_llms_txt(docs_path, output_path, args.version) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index b984876f41643..73ea87cfe6ace 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -833,6 +833,14 @@ if [[ "$1" == "docs" ]]; then fi bundle install PRODUCTION=1 RELEASE_VERSION="$SPARK_VERSION" bundle exec jekyll build + + # Generate llms.txt for LLM consumption + echo "Generating llms.txt..." + python "$SELF/generate-llms-txt.py" \ + --docs-path . \ + --output _site/llms.txt \ + --version "$SPARK_VERSION" + cd .. cd .. From a48283e44b46eee97cd873672ccdaa3bf8fd02dd Mon Sep 17 00:00:00 2001 From: Allison Wang Date: Mon, 29 Sep 2025 14:34:20 -0700 Subject: [PATCH 3/3] fix --- dev/create-release/generate-llms-txt.py | 128 +++++++++++++++--------- 1 file changed, 81 insertions(+), 47 deletions(-) diff --git a/dev/create-release/generate-llms-txt.py b/dev/create-release/generate-llms-txt.py index 027e3089a70fb..e1d2690767603 100755 --- a/dev/create-release/generate-llms-txt.py +++ b/dev/create-release/generate-llms-txt.py @@ -30,64 +30,96 @@ def generate_llms_txt(docs_path: Path, output_path: Path, version: str = "latest content = [] content.append("# Apache Spark") content.append("") - content.append("> Apache Spark™ is a unified analytics engine for large-scale data processing. " - "It provides high-level APIs in Java, Scala, Python, and R, and an optimized engine " - "that supports general execution graphs. It also supports a rich set of higher-level " - "tools including Spark SQL for SQL and structured data processing, MLlib for machine " - "learning, GraphX for graph processing, and Structured Streaming for incremental " - "computation and stream processing.") + content.append( + "> Apache Spark™ is a unified analytics engine for large-scale data processing. " + "It provides high-level APIs in Java, Scala, Python, and R, and an optimized engine " + "that supports general execution graphs. It also supports a rich set of higher-level " + "tools including Spark SQL for SQL and structured data processing, MLlib for machine " + "learning, GraphX for graph processing, and Structured Streaming for incremental " + "computation and stream processing." 
+ ) content.append("") - + doc_home_url = f"https://spark.apache.org/docs/{version}/" content.append(f"Documentation home: {doc_home_url}") content.append("") - + content.append("## Programming Guides") content.append("") programming_guides = [ ("Quick Start", f"https://spark.apache.org/docs/{version}/quick-start.html"), - ("RDD Programming Guide", f"https://spark.apache.org/docs/{version}/rdd-programming-guide.html"), - ("Spark SQL, Datasets, and DataFrames", f"https://spark.apache.org/docs/{version}/sql-programming-guide.html"), - ("Structured Streaming", f"https://spark.apache.org/docs/{version}/structured-streaming-programming-guide.html"), - ("Spark Streaming", f"https://spark.apache.org/docs/{version}/streaming-programming-guide.html"), + ( + "RDD Programming Guide", + f"https://spark.apache.org/docs/{version}/rdd-programming-guide.html", + ), + ( + "Spark SQL, Datasets, and DataFrames", + f"https://spark.apache.org/docs/{version}/sql-programming-guide.html", + ), + ( + "Structured Streaming", + f"https://spark.apache.org/docs/{version}/structured-streaming-programming-guide.html", + ), + ( + "Spark Streaming", + f"https://spark.apache.org/docs/{version}/streaming-programming-guide.html", + ), ("MLlib", f"https://spark.apache.org/docs/{version}/ml-guide.html"), ("GraphX", f"https://spark.apache.org/docs/{version}/graphx-programming-guide.html"), ("SparkR", f"https://spark.apache.org/docs/{version}/sparkr.html"), - ("PySpark", f"https://spark.apache.org/docs/{version}/api/python/getting_started/index.html"), - ("Spark SQL CLI", f"https://spark.apache.org/docs/{version}/sql-distributed-sql-engine-spark-sql-cli.html"), + ( + "PySpark", + f"https://spark.apache.org/docs/{version}/api/python/getting_started/index.html", + ), + ( + "Spark SQL CLI", + f"https://spark.apache.org/docs/{version}/sql-distributed-sql-engine-spark-sql-cli.html", + ), ] for title, url in programming_guides: content.append(f"- [{title}]({url})") content.append("") - + content.append("## API Docs") content.append("") # TODO: Update API docs to point to their own llms.txt files once available # e.g., https://spark.apache.org/docs/{version}/api/python/llms.txt api_docs = [ ("Spark Python API", f"https://spark.apache.org/docs/{version}/api/python/index.html"), - ("Spark Scala API", f"https://spark.apache.org/docs/{version}/api/scala/org/apache/spark/index.html"), + ( + "Spark Scala API", + f"https://spark.apache.org/docs/{version}/api/scala/org/apache/spark/index.html", + ), ("Spark Java API", f"https://spark.apache.org/docs/{version}/api/java/index.html"), ("Spark R API", f"https://spark.apache.org/docs/{version}/api/R/index.html"), - ("Spark SQL Built-in Functions", f"https://spark.apache.org/docs/{version}/api/sql/index.html"), + ( + "Spark SQL Built-in Functions", + f"https://spark.apache.org/docs/{version}/api/sql/index.html", + ), ] for title, url in api_docs: content.append(f"- [{title}]({url})") content.append("") - + content.append("## Deployment Guides") content.append("") deployment_guides = [ ("Cluster Overview", f"https://spark.apache.org/docs/{version}/cluster-overview.html"), - ("Submitting Applications", f"https://spark.apache.org/docs/{version}/submitting-applications.html"), - ("Standalone Deploy Mode", f"https://spark.apache.org/docs/{version}/spark-standalone.html"), + ( + "Submitting Applications", + f"https://spark.apache.org/docs/{version}/submitting-applications.html", + ), + ( + "Standalone Deploy Mode", + f"https://spark.apache.org/docs/{version}/spark-standalone.html", + ), ("YARN", 
f"https://spark.apache.org/docs/{version}/running-on-yarn.html"), ("Kubernetes", f"https://spark.apache.org/docs/{version}/running-on-kubernetes.html"), ] for title, url in deployment_guides: content.append(f"- [{title}]({url})") content.append("") - + content.append("## Other Documents") content.append("") other_docs = [ @@ -97,15 +129,21 @@ def generate_llms_txt(docs_path: Path, output_path: Path, version: str = "latest ("Tuning Guide", f"https://spark.apache.org/docs/{version}/tuning.html"), ("Job Scheduling", f"https://spark.apache.org/docs/{version}/job-scheduling.html"), ("Security", f"https://spark.apache.org/docs/{version}/security.html"), - ("Hardware Provisioning", f"https://spark.apache.org/docs/{version}/hardware-provisioning.html"), - ("Cloud Infrastructures", f"https://spark.apache.org/docs/{version}/cloud-integration.html"), + ( + "Hardware Provisioning", + f"https://spark.apache.org/docs/{version}/hardware-provisioning.html", + ), + ( + "Cloud Infrastructures", + f"https://spark.apache.org/docs/{version}/cloud-integration.html", + ), ("Migration Guide", f"https://spark.apache.org/docs/{version}/migration-guide.html"), ("Building Spark", f"https://spark.apache.org/docs/{version}/building-spark.html"), ] for title, url in other_docs: content.append(f"- [{title}]({url})") content.append("") - + content.append("## External Resources") content.append("") content.append("- [Apache Spark Home](https://spark.apache.org/)") @@ -116,56 +154,52 @@ def generate_llms_txt(docs_path: Path, output_path: Path, version: str = "latest content.append("- [Community](https://spark.apache.org/community.html)") content.append("- [Contributing](https://spark.apache.org/contributing.html)") content.append("") - - with open(output_path, 'w', encoding='utf-8') as f: - f.write('\n'.join(content)) - + + with open(output_path, "w", encoding="utf-8") as f: + f.write("\n".join(content)) + print(f"Generated {output_path}") - + total_docs = len(programming_guides) + len(api_docs) + len(deployment_guides) + len(other_docs) sections_count = 5 - + print(f"Total documentation pages indexed: {total_docs}") print(f"Sections: {sections_count}") def main(): - parser = argparse.ArgumentParser(description='Generate llms.txt file for Apache Spark documentation') + parser = argparse.ArgumentParser( + description="Generate llms.txt file for Apache Spark documentation" + ) parser.add_argument( - '--docs-path', - type=str, - default='docs', - help='Path to the docs directory (default: docs)' + "--docs-path", type=str, default="docs", help="Path to the docs directory (default: docs)" ) parser.add_argument( - '--output', - type=str, - default='llms.txt', - help='Output file path (default: llms.txt)' + "--output", type=str, default="llms.txt", help="Output file path (default: llms.txt)" ) parser.add_argument( - '--version', + "--version", type=str, - default='latest', - help='Spark documentation version (default: latest)' + default="latest", + help="Spark documentation version (default: latest)", ) - + args = parser.parse_args() - + # Convert to Path objects script_dir = Path(__file__).parent project_root = script_dir.parent.parent # Go up two levels from dev/create-release/ docs_path = project_root / args.docs_path output_path = project_root / args.output - + # Check if docs directory exists if not docs_path.exists(): print(f"Error: Documentation directory '{docs_path}' does not exist") sys.exit(1) - + # Generate the llms.txt file generate_llms_txt(docs_path, output_path, args.version) if __name__ == "__main__": - main() \ 
No newline at end of file + main()
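
Usage note: besides the release-build.sh integration above, the generator can be run manually from a Spark checkout to preview the output. The flags mirror the script's argparse options; the command below is only a sketch, and the version string is an illustrative placeholder.

    # Hypothetical manual run (version is a placeholder); the script resolves
    # --docs-path and --output against the repository root containing dev/create-release/.
    python dev/create-release/generate-llms-txt.py \
        --docs-path docs \
        --output llms.txt \
        --version 4.1.0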