Skip to content

Commit 4f11ff0

Browse files
Introduce row_limit param (#607)
* introduce row_limit — Signed-off-by: varun-edachali-dbx <[email protected]>
* move use_sea init to Session constructor — Signed-off-by: varun-edachali-dbx <[email protected]>
* more explicit typing — Signed-off-by: varun-edachali-dbx <[email protected]>
* add row_limit to Thrift backend — Signed-off-by: varun-edachali-dbx <[email protected]>
* formatting (black) — Signed-off-by: varun-edachali-dbx <[email protected]>
* add e2e test for thrift resultRowLimit — Signed-off-by: varun-edachali-dbx <[email protected]>
* explicitly convert extra cursor params to dict — Signed-off-by: varun-edachali-dbx <[email protected]>
* remove excess tests — Signed-off-by: varun-edachali-dbx <[email protected]>
* add docstring for row_limit — Signed-off-by: varun-edachali-dbx <[email protected]>
---------
Signed-off-by: varun-edachali-dbx <[email protected]>
1 parent abf9aab commit 4f11ff0

File tree

5 files changed

+85
-12
lines changed

5 files changed

+85
-12
lines changed

src/databricks/sql/backend/databricks_client.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ def execute_command(
8585
parameters: List,
8686
async_op: bool,
8787
enforce_embedded_schema_correctness: bool,
88+
row_limit: Optional[int] = None,
8889
) -> Union["ResultSet", None]:
8990
"""
9091
Executes a SQL command or query within the specified session.
@@ -103,6 +104,7 @@ def execute_command(
103104
parameters: List of parameters to bind to the query
104105
async_op: Whether to execute the command asynchronously
105106
enforce_embedded_schema_correctness: Whether to enforce schema correctness
107+
row_limit: Maximum number of rows in the operation result.
106108
107109
Returns:
108110
If async_op is False, returns a ResultSet object containing the

src/databricks/sql/backend/sea/backend.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,7 @@ def execute_command(
405405
parameters: List[Dict[str, Any]],
406406
async_op: bool,
407407
enforce_embedded_schema_correctness: bool,
408+
row_limit: Optional[int] = None,
408409
) -> Union[SeaResultSet, None]:
409410
"""
410411
Execute a SQL command using the SEA backend.
@@ -462,7 +463,7 @@ def execute_command(
462463
format=format,
463464
wait_timeout=(WaitTimeout.ASYNC if async_op else WaitTimeout.SYNC).value,
464465
on_wait_timeout="CONTINUE",
465-
row_limit=max_rows,
466+
row_limit=row_limit,
466467
parameters=sea_parameters if sea_parameters else None,
467468
result_compression=result_compression,
468469
)

src/databricks/sql/backend/thrift_backend.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import math
55
import time
66
import threading
7-
from typing import List, Union, Any, TYPE_CHECKING
7+
from typing import List, Optional, Union, Any, TYPE_CHECKING
88

99
if TYPE_CHECKING:
1010
from databricks.sql.client import Cursor
@@ -929,6 +929,7 @@ def execute_command(
929929
parameters=[],
930930
async_op=False,
931931
enforce_embedded_schema_correctness=False,
932+
row_limit: Optional[int] = None,
932933
) -> Union["ResultSet", None]:
933934
thrift_handle = session_id.to_thrift_handle()
934935
if not thrift_handle:
@@ -969,6 +970,7 @@ def execute_command(
969970
useArrowNativeTypes=spark_arrow_types,
970971
parameters=parameters,
971972
enforceEmbeddedSchemaCorrectness=enforce_embedded_schema_correctness,
973+
resultRowLimit=row_limit,
972974
)
973975
resp = self.make_request(self._client.ExecuteStatement, req)
974976

src/databricks/sql/client.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,14 @@ def cursor(
335335
self,
336336
arraysize: int = DEFAULT_ARRAY_SIZE,
337337
buffer_size_bytes: int = DEFAULT_RESULT_BUFFER_SIZE_BYTES,
338+
row_limit: Optional[int] = None,
338339
) -> "Cursor":
339340
"""
341+
Args:
342+
arraysize: The maximum number of rows in direct results.
343+
buffer_size_bytes: The maximum number of bytes in direct results.
344+
row_limit: The maximum number of rows in the result.
345+
340346
Return a new Cursor object using the connection.
341347
342348
Will throw an Error if the connection has been closed.
@@ -349,6 +355,7 @@ def cursor(
349355
self.session.backend,
350356
arraysize=arraysize,
351357
result_buffer_size_bytes=buffer_size_bytes,
358+
row_limit=row_limit,
352359
)
353360
self._cursors.append(cursor)
354361
return cursor
@@ -382,6 +389,7 @@ def __init__(
382389
backend: DatabricksClient,
383390
result_buffer_size_bytes: int = DEFAULT_RESULT_BUFFER_SIZE_BYTES,
384391
arraysize: int = DEFAULT_ARRAY_SIZE,
392+
row_limit: Optional[int] = None,
385393
) -> None:
386394
"""
387395
These objects represent a database cursor, which is used to manage the context of a fetch
@@ -391,16 +399,18 @@ def __init__(
391399
visible by other cursors or connections.
392400
"""
393401

394-
self.connection = connection
395-
self.rowcount = -1 # Return -1 as this is not supported
396-
self.buffer_size_bytes = result_buffer_size_bytes
402+
self.connection: Connection = connection
403+
404+
self.rowcount: int = -1 # Return -1 as this is not supported
405+
self.buffer_size_bytes: int = result_buffer_size_bytes
397406
self.active_result_set: Union[ResultSet, None] = None
398-
self.arraysize = arraysize
407+
self.arraysize: int = arraysize
408+
self.row_limit: Optional[int] = row_limit
399409
# Note that Cursor closed => active result set closed, but not vice versa
400-
self.open = True
401-
self.executing_command_id = None
402-
self.backend = backend
403-
self.active_command_id = None
410+
self.open: bool = True
411+
self.executing_command_id: Optional[CommandId] = None
412+
self.backend: DatabricksClient = backend
413+
self.active_command_id: Optional[CommandId] = None
404414
self.escaper = ParamEscaper()
405415
self.lastrowid = None
406416

@@ -779,6 +789,7 @@ def execute(
779789
parameters=prepared_params,
780790
async_op=False,
781791
enforce_embedded_schema_correctness=enforce_embedded_schema_correctness,
792+
row_limit=self.row_limit,
782793
)
783794

784795
if self.active_result_set and self.active_result_set.is_staging_operation:
@@ -835,6 +846,7 @@ def execute_async(
835846
parameters=prepared_params,
836847
async_op=True,
837848
enforce_embedded_schema_correctness=enforce_embedded_schema_correctness,
849+
row_limit=self.row_limit,
838850
)
839851

840852
return self

tests/e2e/test_driver.py

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,12 @@ def connection(self, extra_params=()):
113113
conn.close()
114114

115115
@contextmanager
116-
def cursor(self, extra_params=()):
116+
def cursor(self, extra_params=(), extra_cursor_params=()):
117117
with self.connection(extra_params) as conn:
118118
cursor = conn.cursor(
119-
arraysize=self.arraysize, buffer_size_bytes=self.buffer_size_bytes
119+
arraysize=self.arraysize,
120+
buffer_size_bytes=self.buffer_size_bytes,
121+
**dict(extra_cursor_params),
120122
)
121123
try:
122124
yield cursor
@@ -943,6 +945,60 @@ def test_catalogs_returns_arrow_table(self):
943945
results = cursor.fetchall_arrow()
944946
assert isinstance(results, pyarrow.Table)
945947

948+
def test_row_limit_with_larger_result(self):
949+
"""Test that row_limit properly constrains results when query would return more rows"""
950+
row_limit = 1000
951+
with self.cursor(extra_cursor_params={"row_limit": row_limit}) as cursor:
952+
# Execute a query that returns more than row_limit rows
953+
cursor.execute("SELECT * FROM range(2000)")
954+
rows = cursor.fetchall()
955+
956+
# Check if the number of rows is limited to row_limit
957+
assert len(rows) == row_limit, f"Expected {row_limit} rows, got {len(rows)}"
958+
959+
def test_row_limit_with_smaller_result(self):
960+
"""Test that row_limit doesn't affect results when query returns fewer rows than limit"""
961+
row_limit = 100
962+
expected_rows = 50
963+
with self.cursor(extra_cursor_params={"row_limit": row_limit}) as cursor:
964+
# Execute a query that returns fewer than row_limit rows
965+
cursor.execute(f"SELECT * FROM range({expected_rows})")
966+
rows = cursor.fetchall()
967+
968+
# Check if all rows are returned (not limited by row_limit)
969+
assert (
970+
len(rows) == expected_rows
971+
), f"Expected {expected_rows} rows, got {len(rows)}"
972+
973+
@skipUnless(pysql_supports_arrow(), "arrow test needs arrow support")
974+
def test_row_limit_with_arrow_larger_result(self):
975+
"""Test that row_limit properly constrains arrow results when query would return more rows"""
976+
row_limit = 800
977+
with self.cursor(extra_cursor_params={"row_limit": row_limit}) as cursor:
978+
# Execute a query that returns more than row_limit rows
979+
cursor.execute("SELECT * FROM range(1500)")
980+
arrow_table = cursor.fetchall_arrow()
981+
982+
# Check if the number of rows in the arrow table is limited to row_limit
983+
assert (
984+
arrow_table.num_rows == row_limit
985+
), f"Expected {row_limit} rows, got {arrow_table.num_rows}"
986+
987+
@skipUnless(pysql_supports_arrow(), "arrow test needs arrow support")
988+
def test_row_limit_with_arrow_smaller_result(self):
989+
"""Test that row_limit doesn't affect arrow results when query returns fewer rows than limit"""
990+
row_limit = 200
991+
expected_rows = 100
992+
with self.cursor(extra_cursor_params={"row_limit": row_limit}) as cursor:
993+
# Execute a query that returns fewer than row_limit rows
994+
cursor.execute(f"SELECT * FROM range({expected_rows})")
995+
arrow_table = cursor.fetchall_arrow()
996+
997+
# Check if all rows are returned (not limited by row_limit)
998+
assert (
999+
arrow_table.num_rows == expected_rows
1000+
), f"Expected {expected_rows} rows, got {arrow_table.num_rows}"
1001+
9461002

9471003
# use a RetrySuite to encapsulate these tests which we'll typically want to run together; however keep
9481004
# the 429/503 subsuites separate since they execute under different circumstances.

0 commit comments

Comments (0)