Merge branch 'main' into test-build-test

tpackard1 · web-flow · commit d76bceab6812 · 2022-12-19T11:54:37.000-05:00
diff --git a/Dockerfile b/Dockerfile
@@ -8,6 +8,6 @@ RUN apt-get install -y build-essential
 RUN apt-get install -y libhdf5-dev
 
 RUN python -m pip install --upgrade pip
-RUN python -m pip install --use-deprecated=legacy-resolver \
+RUN python -m pip install \
     -r https://github.com/raw/pandas-dev/pandas/main/requirements-dev.txt
 CMD ["/bin/bash"]
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -38,6 +38,8 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_csv`
 * :func:`read_excel`
 * :func:`read_sql`
+* :func:`read_sql_query`
+* :func:`read_sql_table`
 
 Additionally, a new global configuration, ``mode.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
 to select the nullable dtypes implementation.
@@ -394,7 +396,7 @@ If installed, we now require:
 +-----------------+-----------------+----------+---------+
 | Package         | Minimum Version | Required | Changed |
 +=================+=================+==========+=========+
-| mypy (dev)      | 0.990           |          |    X    |
+| mypy (dev)      | 0.991           |          |    X    |
 +-----------------+-----------------+----------+---------+
 | python-dateutil | 2.8.2           |    X     |    X    |
 +-----------------+-----------------+----------+---------+
@@ -836,6 +838,7 @@ Indexing
 - Bug in :meth:`DataFrame.__setitem__` raising when indexer is a :class:`DataFrame` with ``boolean`` dtype (:issue:`47125`)
 - Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`)
 - Bug in :meth:`DataFrame.loc` coercing dtypes when setting values with a list indexer (:issue:`49159`)
+- Bug in :meth:`Series.loc` raising an error for an out-of-bounds end of slice indexer (:issue:`50161`)
 - Bug in :meth:`DataFrame.loc` raising ``ValueError`` with ``bool`` indexer and :class:`MultiIndex` (:issue:`47687`)
 - Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when right hand side is :class:`DataFrame` with :class:`MultiIndex` columns (:issue:`49121`)
 - Bug in :meth:`DataFrame.reindex` casting dtype to ``object`` when :class:`DataFrame` has single extension array column when re-indexing ``columns`` and ``index`` (:issue:`48190`)
@@ -879,7 +882,7 @@ I/O
 - Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`)
 - Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
 - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
--
+- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
 
 Period
 ^^^^^^
diff --git a/environment.yml b/environment.yml
@@ -80,7 +80,7 @@ dependencies:
   - flake8=6.0.0
   - flake8-bugbear=22.7.1 # used by flake8, find likely bugs
   - isort>=5.2.1  # check that imports are in the right order
-  - mypy=0.990
+  - mypy=0.991
   - pre-commit>=2.15.0
   - pycodestyle  # used by flake8
   - pyupgrade
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -332,9 +332,18 @@ static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
     return PyBytes_AS_STRING(obj);
 }
 
-static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
+static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc,
                              size_t *_outLen) {
-    return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen);
+    char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj,
+                                                    (Py_ssize_t *)_outLen);
+    if (encoded == NULL) {
+        /* Something went wrong.
+          Set errorMsg (to tell the encoder to stop),
+          and let the Python exception propagate. */
+        JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
+        enc->errorMsg = "Encoding failed.";
+    }
+    return encoded;
 }
 
 /* JSON callback. returns a char* and mutates the pointer to *len */
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -6158,6 +6158,11 @@ def _maybe_cast_slice_bound(self, label, side: str_t):
         # We are a plain index here (sub-class override this method if they
         # wish to have special treatment for floats/ints, e.g. Float64Index and
         # datetimelike Indexes
+        # Special case numeric EA Indexes, since they are not handled by NumericIndex
+
+        if is_extension_array_dtype(self.dtype) and is_numeric_dtype(self.dtype):
+            return self._maybe_cast_indexer(label)
+
         # reject them, if index does not contain label
         if (is_float(label) or is_integer(label)) and label not in self:
             self._raise_invalid_indexer("slice", label)
diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py
@@ -160,8 +160,7 @@ def _get_strcols(self) -> list[list[str]]:
             def pad_empties(x):
                 for pad in reversed(x):
                     if pad:
-                        break
-                return [x[0]] + [i if i else " " * len(pad) for i in x[1:]]
+                        return [x[0]] + [i if i else " " * len(pad) for i in x[1:]]
 
             gen = (pad_empties(i) for i in out)
 
diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py
@@ -417,9 +417,9 @@ def best_len(values: list[str]) -> int:
             for max_items in reversed(range(1, len(value) + 1)):
                 pprinted_seq = _pprint_seq(value, max_seq_items=max_items)
                 if len(pprinted_seq) < max_space:
+                    head = [_pprint_seq(x, max_seq_items=max_items) for x in head]
+                    tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail]
                     break
-            head = [_pprint_seq(x, max_seq_items=max_items) for x in head]
-            tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail]
 
         summary = ""
         line = space2
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
@@ -2304,8 +2304,8 @@ def set_sticky(
                             "selector": f"thead tr:nth-child({obj.nlevels+1}) th",
                             "props": props
                             + (
-                                f"top:{(i+1) * pixel_size}px; height:{pixel_size}px; "
-                                "z-index:2;"
+                                f"top:{(len(levels_)) * pixel_size}px; "
+                                f"height:{pixel_size}px; z-index:2;"
                             ),
                         }
                     )
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -224,6 +224,7 @@ def read_sql_table(
     parse_dates: list[str] | dict[str, str] | None = ...,
     columns: list[str] | None = ...,
     chunksize: None = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> DataFrame:
     ...
 
@@ -238,6 +239,7 @@ def read_sql_table(
     parse_dates: list[str] | dict[str, str] | None = ...,
     columns: list[str] | None = ...,
     chunksize: int = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> Iterator[DataFrame]:
     ...
 
@@ -251,6 +253,7 @@ def read_sql_table(
     parse_dates: list[str] | dict[str, str] | None = None,
     columns: list[str] | None = None,
     chunksize: int | None = None,
+    use_nullable_dtypes: bool = False,
 ) -> DataFrame | Iterator[DataFrame]:
     """
     Read SQL database table into a DataFrame.
@@ -287,6 +290,12 @@ def read_sql_table(
     chunksize : int, default None
         If specified, returns an iterator where `chunksize` is the number of
         rows to include in each chunk.
+    use_nullable_dtypes : bool, default False
+        Whether to use nullable dtypes as default when reading data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
+
+        .. versionadded:: 2.0
 
     Returns
     -------
@@ -318,6 +327,7 @@ def read_sql_table(
             parse_dates=parse_dates,
             columns=columns,
             chunksize=chunksize,
+            use_nullable_dtypes=use_nullable_dtypes,
         )
 
     if table is not None:
@@ -336,6 +346,7 @@ def read_sql_query(
     parse_dates: list[str] | dict[str, str] | None = ...,
     chunksize: None = ...,
     dtype: DtypeArg | None = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> DataFrame:
     ...
 
@@ -350,6 +361,7 @@ def read_sql_query(
     parse_dates: list[str] | dict[str, str] | None = ...,
     chunksize: int = ...,
     dtype: DtypeArg | None = ...,
+    use_nullable_dtypes: bool = ...,
 ) -> Iterator[DataFrame]:
     ...
 
@@ -363,6 +375,7 @@ def read_sql_query(
     parse_dates: list[str] | dict[str, str] | None = None,
     chunksize: int | None = None,
     dtype: DtypeArg | None = None,
+    use_nullable_dtypes: bool = False,
 ) -> DataFrame | Iterator[DataFrame]:
     """
     Read SQL query into a DataFrame.
@@ -406,6 +419,12 @@ def read_sql_query(
         {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’}.
 
         .. versionadded:: 1.3.0
+    use_nullable_dtypes : bool, default False
+        Whether to use nullable dtypes as default when reading data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
+
+        .. versionadded:: 2.0
 
     Returns
     -------
@@ -430,6 +449,7 @@ def read_sql_query(
             parse_dates=parse_dates,
             chunksize=chunksize,
             dtype=dtype,
+            use_nullable_dtypes=use_nullable_dtypes,
         )
 
 
diff --git a/pandas/tests/frame/test_iteration.py b/pandas/tests/frame/test_iteration.py
@@ -159,4 +159,4 @@ def test_sequence_like_with_categorical(self):
             str(s)
 
         for c, col in df.items():
-            str(s)
+            str(col)
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
@@ -291,6 +291,15 @@ def test_encode_unicode_4bytes_utf8highest(self):
         assert enc == json.dumps(four_bytes_input)
         assert dec == json.loads(enc)
 
+    def test_encode_unicode_error(self):
+        string = "'\udac0'"
+        msg = (
+            r"'utf-8' codec can't encode character '\\udac0' "
+            r"in position 1: surrogates not allowed"
+        )
+        with pytest.raises(UnicodeEncodeError, match=msg):
+            ujson.dumps([string])
+
     def test_encode_array_in_array(self):
         arr_in_arr_input = [[[[]]]]
         output = ujson.encode(arr_in_arr_input)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -2276,21 +2276,22 @@ def test_get_engine_auto_error_message(self):
         pass
         # TODO(GH#36893) fill this in when we add more engines
 
-    def test_read_sql_nullable_dtypes(self, string_storage):
+    @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"])
+    def test_read_sql_nullable_dtypes(self, string_storage, func):
         # GH#50048
         table = "test"
         df = self.nullable_data()
         df.to_sql(table, self.conn, index=False, if_exists="replace")
 
         with pd.option_context("mode.string_storage", string_storage):
-            result = pd.read_sql(
+            result = getattr(pd, func)(
                 f"Select * from {table}", self.conn, use_nullable_dtypes=True
             )
         expected = self.nullable_expected(string_storage)
         tm.assert_frame_equal(result, expected)
 
         with pd.option_context("mode.string_storage", string_storage):
-            iterator = pd.read_sql(
+            iterator = getattr(pd, func)(
                 f"Select * from {table}",
                 self.conn,
                 use_nullable_dtypes=True,
@@ -2300,20 +2301,21 @@ def test_read_sql_nullable_dtypes(self, string_storage):
             for result in iterator:
                 tm.assert_frame_equal(result, expected)
 
-    def test_read_sql_nullable_dtypes_table(self, string_storage):
+    @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"])
+    def test_read_sql_nullable_dtypes_table(self, string_storage, func):
         # GH#50048
         table = "test"
         df = self.nullable_data()
         df.to_sql(table, self.conn, index=False, if_exists="replace")
 
         with pd.option_context("mode.string_storage", string_storage):
-            result = pd.read_sql(table, self.conn, use_nullable_dtypes=True)
+            result = getattr(pd, func)(table, self.conn, use_nullable_dtypes=True)
         expected = self.nullable_expected(string_storage)
         tm.assert_frame_equal(result, expected)
 
         with pd.option_context("mode.string_storage", string_storage):
-            iterator = pd.read_sql(
-                f"Select * from {table}",
+            iterator = getattr(pd, func)(
+                table,
                 self.conn,
                 use_nullable_dtypes=True,
                 chunksize=3,
@@ -2463,7 +2465,8 @@ class Test(BaseModel):
     def nullable_expected(self, storage) -> DataFrame:
         return super().nullable_expected(storage).astype({"e": "Int64", "f": "Int64"})
 
-    def test_read_sql_nullable_dtypes_table(self, string_storage):
+    @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"])
+    def test_read_sql_nullable_dtypes_table(self, string_storage, func):
         # GH#50048 Not supported for sqlite
         pass
 
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
@@ -10,6 +10,7 @@
 from pandas import (
     NA,
     DataFrame,
+    Index,
     IndexSlice,
     MultiIndex,
     Series,
@@ -366,6 +367,14 @@ def test_loc_setitem_nested_data_enlargement():
     tm.assert_series_equal(ser, expected)
 
 
+def test_loc_ea_numeric_index_oob_slice_end():
+    # GH#50161
+    ser = Series(1, index=Index([0, 1, 2], dtype="Int64"))
+    result = ser.loc[2:3]
+    expected = Series(1, index=Index([2], dtype="Int64"))
+    tm.assert_series_equal(result, expected)
+
+
 def test_getitem_bool_int_key():
     # GH#48653
     ser = Series({True: 1, False: 0})
diff --git a/pyproject.toml b/pyproject.toml
@@ -217,6 +217,7 @@ disable = [
   "too-many-public-methods",
   "too-many-return-statements",
   "too-many-statements",
+  "undefined-loop-variable",
   "unexpected-keyword-arg",
   "ungrouped-imports",
   "unsubscriptable-object",
@@ -276,7 +277,6 @@ disable = [
   "signature-differs",
   "super-init-not-called",
   "try-except-raise",
-  "undefined-loop-variable",
   "unnecessary-lambda",
   "unspecified-encoding",
   "unused-argument",
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -57,7 +57,7 @@ cpplint
 flake8==6.0.0
 flake8-bugbear==22.7.1
 isort>=5.2.1
-mypy==0.990
+mypy==0.991
 pre-commit>=2.15.0
 pycodestyle
 pyupgrade

Original file line number	Diff line number	Diff line change
`@@ -2304,8 +2304,8 @@ def set_sticky(`
`2304`	`2304`	`"selector": f"thead tr:nth-child({obj.nlevels+1}) th",`
`2305`	`2305`	`"props": props`
`2306`	`2306`	`+ (`
`2307`		`- f"top:{(i+1) * pixel_size}px; height:{pixel_size}px; "`
`2308`		`- "z-index:2;"`
	`2307`	`+ f"top:{(len(levels_)) * pixel_size}px; "`
	`2308`	`+ f"height:{pixel_size}px; z-index:2;"`
`2309`	`2309`	`),`
`2310`	`2310`	`}`
`2311`	`2311`	`)`