Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion .github/workflows/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,47 @@
# under the License.

name: Dev
on: [push, pull_request]
on: [push, pull_request, workflow_dispatch]

concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true

jobs:
breaking-changes:
name: Breaking Changes Detection
runs-on: ubuntu-latest
if: github.event_name == 'pull_request'
permissions:
contents: read
pull-requests: write
issues: write
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
# Need full history to access base branch for git diff operations
# GitHub Actions only fetches single commit by default (fetch-depth: 1)
# Our script uses: git diff "$BASE_REF"..HEAD which requires base branch
fetch-depth: 0

- name: Run breaking changes detection
shell: bash
run: |
chmod +x ./ci/scripts/detect-breaking-changes.sh
./ci/scripts/detect-breaking-changes.sh
timeout-minutes: 5
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Upload breaking changes report
if: failure()
uses: actions/upload-artifact@v4
with:
name: breaking-changes-report
path: breaking-changes-report.md
retention-days: 30

license-header-check:
runs-on: ubuntu-latest
name: Check License Header
Expand All @@ -35,6 +69,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need the full history I believe.

fetch-depth: 0 # Fetch full history
token: ${{ secrets.GITHUB_TOKEN }}
- uses: actions/setup-node@v4
with:
node-version: "20"
Expand Down
49 changes: 49 additions & 0 deletions ci/scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# CI Scripts

This directory contains scripts used in the DataFusion CI/CD pipeline.

## Breaking Changes Detection

### `detect-breaking-changes.sh`

Automatically detects potential breaking changes in DataFusion pull requests using git-based analysis.

**Features:**
- Detects changes to LogicalPlan enums and variants
- Monitors DataFrame API modifications
- Checks SQL parser keyword changes
- Generates detailed reports with diff snippets
- Integrates with GitHub to post PR comments
- Lightweight git-based analysis (no compilation required)

**Usage:**
```bash
# Run from repository root
./ci/scripts/detect-breaking-changes.sh

# With custom base ref
GITHUB_BASE_REF=v42.0.0 ./ci/scripts/detect-breaking-changes.sh
```

**Environment Variables:**
- `GITHUB_BASE_REF`: Base branch/tag to compare against (default: `main`)
- `GITHUB_HEAD_REF`: Head ref being tested (default: `HEAD`)
- `GITHUB_TOKEN`: Required for posting PR comments
- `GITHUB_ENV`: Set by GitHub Actions for CI integration

**Output:**
- Exit code 0: No breaking changes detected
- Exit code 1: Breaking changes detected
- Generates `breaking-changes-report.md` when issues found

### `breaking-changes-report.md`

Template/example of the report generated when breaking changes are detected. Contains:
- Summary of detected changes
- Links to DataFusion API stability guidelines
- Required actions for PR authors
- Approval requirements

## Integration

The breaking changes detection is integrated into the GitHub Actions workflow (`.github/workflows/dev.yml`) and runs automatically on all pull requests.
259 changes: 259 additions & 0 deletions ci/scripts/detect-breaking-changes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
#!/bin/bash

set -euo pipefail # Add -u for undefined variables and -o pipefail for pipe failures

# Ensure we're in the repository root
if [ ! -f "Cargo.toml" ] || [ ! -d ".git" ]; then
echo "❌ Error: This script must be run from the repository root directory"
echo "Current directory: $(pwd)"
echo "Expected: A directory containing Cargo.toml and .git"
exit 1
fi

BASE_REF="${GITHUB_BASE_REF:-main}"
HEAD_REF="${GITHUB_HEAD_REF:-HEAD}"

# In GitHub Actions PR context, use the proper base reference
if [ -n "$GITHUB_BASE_REF" ]; then
# For PRs, GitHub provides GITHUB_BASE_REF (e.g., "main")
# We need to reference it as origin/$GITHUB_BASE_REF
BASE_REF="origin/$GITHUB_BASE_REF"
fi

echo "🔍 DataFusion Breaking Changes Detection"
echo "Comparing against: $BASE_REF"
echo "Current ref: $HEAD_REF"

# Skip installation - we're only using git-based analysis
echo "📋 Using git-based analysis for breaking change detection..."

detect_datafusion_breaking_changes() {
local breaking_changes_found=1 # 1 = false in bash

echo "📋 Checking DataFusion-specific API rules..."

# Validate that we can access git and the base ref exists
if ! git rev-parse --verify "$BASE_REF" >/dev/null 2>&1; then
echo "⚠️ Warning: Base ref '$BASE_REF' not found, skipping git-based analysis"
return 1 # No breaking changes (can't detect them)
fi

# Only run git-based checks if we have a valid base ref
if [ -n "$BASE_REF" ]; then
echo "Checking for Rust source file changes..."
if git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -qE "\.rs$"; then
echo "Rust files modified - running detailed checks..."

echo "Checking LogicalPlan stability..."
if check_logical_plan_changes; then
echo "❌ Breaking changes detected in LogicalPlan"
breaking_changes_found=0
else
echo "✅ No LogicalPlan breaking changes detected"
fi

echo "Checking DataFrame API..."
if check_dataframe_api_changes; then
echo "❌ Breaking changes detected in DataFrame API"
breaking_changes_found=0
else
echo "✅ No DataFrame API breaking changes detected"
fi

echo "Checking SQL parser compatibility..."
if check_sql_parser_changes; then
echo "❌ Breaking changes detected in SQL parser"
breaking_changes_found=0
else
echo "✅ No SQL parser breaking changes detected"
fi
else
echo "✅ No Rust source files modified - no breaking changes possible"
fi
else
echo "⚠️ Skipping git-based checks (no base ref available)"
fi

return $breaking_changes_found
}

check_logical_plan_changes() {
# Check for changes in LogicalPlan files using git diff
if git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -qE "(logical_plan|logical.*expr)"; then
echo " LogicalPlan-related files modified"

# Check for enum variant removals (potential breaking change)
if git diff "$BASE_REF"..HEAD 2>/dev/null | grep -qE "^-.*enum.*LogicalPlan|^-.*pub.*enum"; then
echo " Enum definitions modified"
return 0 # Breaking change detected
fi

# Check for removed enum variants
if git diff "$BASE_REF"..HEAD 2>/dev/null | grep -qE "^-\s*[A-Z][a-zA-Z]*\s*[\(\{,]"; then
echo " Enum variants potentially removed"
return 0 # Breaking change detected
fi
fi

return 1 # No breaking changes
}

check_dataframe_api_changes() {
# Check if DataFrame public methods were changed
if git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -qE "dataframe.*\.rs$"; then
echo " DataFrame files modified, checking for API changes..."

# Check for ANY changes to public methods (additions, removals, modifications)
# Only look at actual DataFrame source files, not test files
if git diff "$BASE_REF"..HEAD 2>/dev/null -- "datafusion/core/src/dataframe/" | grep -qE "^[-+].*pub fn"; then
echo " Public DataFrame API methods modified"
return 0 # Breaking change detected
fi
fi

return 1 # No breaking changes
}

check_sql_parser_changes() {
# Check for SQL keyword removals
if git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -qE "keywords"; then
if git diff "$BASE_REF"..HEAD 2>/dev/null | grep -qE "^-.*,"; then
echo " SQL keywords potentially removed"
return 0 # Breaking change detected
fi
fi

return 1 # No breaking changes
}

generate_breaking_changes_report() {
local output_file="breaking-changes-report.md"
local timestamp=$(date -u +"%Y-%m-%d %H:%M:%S UTC")

cat > "$output_file" << EOF
# 🚨 Breaking Changes Report

**Generated:** $timestamp
**Base Ref:** $BASE_REF
**Head Ref:** $HEAD_REF

## Summary
Breaking changes detected in this PR that require the \`api-change\` label.

## DataFusion API Stability Guidelines
Per the [API Health Policy](https://datafusion.apache.org/contributor-guide/specification/api-health-policy.html):

EOF

echo "### Git-Based Analysis:" >> "$output_file"

# Only add git-based analysis if we have a base ref
if [ -n "$BASE_REF" ]; then
local changes_found=false

# Check for DataFrame API changes with details - EXCLUDE script files
if git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -qE "dataframe.*\.rs$"; then
# Only look at .rs files in DataFrame source directories
local dataframe_diff=$(git diff "$BASE_REF"..HEAD 2>/dev/null -- "datafusion/core/src/dataframe/" | grep -E "^[-+].*pub fn")
if [ -n "$dataframe_diff" ]; then
echo "- ⚠️ **DataFrame API changes detected:**" >> "$output_file"
echo "\`\`\`diff" >> "$output_file"
echo "$dataframe_diff" | head -10 >> "$output_file"
echo "\`\`\`" >> "$output_file"
echo "" >> "$output_file"
changes_found=true
fi
fi

# Check for LogicalPlan changes with details - EXCLUDE script files
local logical_plan_files=$(git diff "$BASE_REF" --name-only 2>/dev/null | grep -E "(logical_plan|logical.*expr).*\.rs$")
if [ -n "$logical_plan_files" ]; then
echo "- ⚠️ **LogicalPlan changes detected in:**" >> "$output_file"
echo "$logical_plan_files" | sed 's/^/ - /' >> "$output_file"
echo "" >> "$output_file"
changes_found=true
fi

# Show general public API changes - ONLY in .rs files, EXCLUDE ci/scripts and tests
local api_changes=$(git diff "$BASE_REF"..HEAD 2>/dev/null -- "datafusion/" "ballista/" | grep -E "^[-+].*pub (fn|struct|enum)" | grep -v "/tests/")
if [ -n "$api_changes" ]; then
echo "- ⚠️ **Public API signature changes:**" >> "$output_file"
echo "\`\`\`diff" >> "$output_file"
echo "$api_changes" | head -10 >> "$output_file"
echo "\`\`\`" >> "$output_file"
echo "" >> "$output_file"
changes_found=true
fi

if [ "$changes_found" = false ]; then
echo "- ⚠️ Breaking changes detected but specific patterns not identified" >> "$output_file"
echo "- Review the modified files below for potential API changes" >> "$output_file"
fi

# Add list of changed SOURCE files only
echo "" >> "$output_file"
echo "### Source Files Modified:" >> "$output_file"
git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | grep -E "\.rs$" | head -20 | sed 's/^/- /' >> "$output_file" || {
echo "- No Rust source files modified" >> "$output_file"
}

echo "" >> "$output_file"
echo "### All Files Modified:" >> "$output_file"
git diff "$BASE_REF"..HEAD --name-only 2>/dev/null | head -20 | sed 's/^/- /' >> "$output_file"
else
echo "- ⚠️ Git-based analysis skipped (no base ref available)" >> "$output_file"
fi

cat >> "$output_file" << EOF

## Required Actions:
1. Add the \`api-change\` label to this PR
2. Update CHANGELOG.md with breaking change details
3. Consider adding deprecation warnings before removal
4. Update migration guide if needed

## Approval Requirements:
- Breaking changes require approval from a DataFusion maintainer
- Consider if this change is necessary or if a deprecation path exists

## Note:
This analysis uses git-based pattern detection to identify common breaking change patterns.
Review the specific changes carefully to determine if they truly constitute breaking changes.
EOF

echo "📋 Report generated: $output_file"
}

main() {
if detect_datafusion_breaking_changes; then
echo ""
echo "🚨 FINAL RESULT: Breaking changes detected!"
# Only set GITHUB_ENV if it exists (in CI)
if [ -n "$GITHUB_ENV" ]; then
echo "BREAKING_CHANGES_DETECTED=true" >> "$GITHUB_ENV"
fi

generate_breaking_changes_report

# Only try to comment if we have GitHub CLI and are in a PR context
if command -v gh >/dev/null 2>&1 && [ -n "$GITHUB_TOKEN" ] && [ -n "$GITHUB_REPOSITORY" ] && [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
echo "💬 Adding PR comment..."
gh pr comment --body-file breaking-changes-report.md || {
echo "⚠️ Could not add PR comment, but report was generated"
}
else
echo "⚠️ Skipping PR comment (not in PR context or missing GitHub CLI)"
fi

exit 1
else
echo ""
echo "🎉 FINAL RESULT: No breaking changes detected"
# Only set GITHUB_ENV if it exists (in CI)
if [ -n "$GITHUB_ENV" ]; then
echo "BREAKING_CHANGES_DETECTED=false" >> "$GITHUB_ENV"
fi
fi
}

main "$@"