Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ Timezones

Numeric
^^^^^^^
- Bug in :meth:`Series.searchsorted` with numeric or boolean dtype failing to raise ``TypeError`` for incompatible value types like ``timedelta64``, ``datetime64``, or ``str``, returning incorrect results instead (:issue:`65015`)
- Fixed bug in :func:`read_excel` where a column containing a mixture of numeric and boolean values would be typecast to the data type of the first value encountered, since ``1 == True`` and ``0 == False`` (:issue:`60088`)
- Fixed bug in :meth:`Series.clip` where passing a scalar numpy array (e.g. ``np.array(0)``) would raise a ``TypeError`` (:issue:`59053`)
- Fixed bug in :meth:`Series.mean` and :meth:`Series.sum` (and their :class:`DataFrame` counterparts) overflowing for ``float16`` dtypes instead of upcasting to ``float64`` (:issue:`43929`)
Expand Down
66 changes: 66 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@
lib,
)
from pandas._libs.missing import NA
from pandas._libs.tslibs import (
NaTType,
Period,
Timedelta,
Timestamp,
)
from pandas.util._decorators import set_module
from pandas.util._exceptions import find_stack_level

Expand Down Expand Up @@ -1219,6 +1225,63 @@ def take(
# ------------ #


def validate_searchsorted_value(
    arr_dtype: np.dtype, value: NumpyValueArrayLike | ExtensionArray
) -> None:
    """
    Check that value is compatible with arr_dtype for searchsorted.

    Raises TypeError for incompatible types like datetime64/timedelta64/str
    values searched against numeric/bool arrays.

    Everything allowed by _validate_setitem_value is allowed here, but
    searchsorted is strictly more permissive because it only requires
    comparability, not storability. For example, a non-round float like
    1.5 can be searchsorted against an integer array but cannot be set
    into one, and a datetime64[ns] value can be searchsorted against a
    datetime64[s] array even though it would lose precision on setitem.
    DatetimeLikeArrayMixin subclasses already validate via
    _validate_setitem_value in NDArrayBackedExtensionArray.searchsorted.
    This function covers the paths that lack a _validate_setitem_value
    implementation: numpy-backed Series/Index (via algorithms.searchsorted),
    NumpyExtensionArray, BaseMaskedArray, and ArrowExtensionArray.

    Parameters
    ----------
    arr_dtype : np.dtype
        Dtype of the array being searched. Validation only happens when its
        kind is one of "i", "u", "f" or "b"; any other dtype is accepted
        without inspection.
    value : scalar or array-like
        The value(s) that will be searched for.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``value`` is a datetime-like, timedelta-like, Period, NaT, str or
        bytes scalar; an ndarray or list-like whose dtype kind is one of
        "m", "M", "U", "S"; an ExtensionArray whose dtype kind is "m" or
        "M"; or an object-dtype list-like whose first element is one of
        the above.
    """
    if arr_dtype.kind not in "iufb":
        return

    def _incompatible(found: object) -> TypeError:
        # Single place for the message so every branch reports identically.
        return TypeError(
            f"searchsorted requires compatible dtype or scalar, got {found}"
        )

    incompatible_scalars = (
        np.datetime64,
        np.timedelta64,
        str,
        bytes,
        Timestamp,
        Timedelta,
        Period,
        NaTType,
    )
    if isinstance(value, incompatible_scalars):
        raise _incompatible(type(value).__name__)

    if isinstance(value, np.ndarray):
        if value.dtype.kind in "mMUS":
            raise _incompatible(value.dtype)
    elif isinstance(value, ABCExtensionArray):
        if value.dtype.kind in "mM":
            raise _incompatible(value.dtype)
    elif is_list_like(value):
        _value = np.asarray(value)
        if _value.dtype.kind in "mMUS":
            raise _incompatible(_value.dtype)
        # lists of pandas objects result in object dtype; check first element.
        # Guard on ndim/size rather than len(): np.asarray wraps unordered
        # containers (set, dict, generator) in a 0-d object array, for which
        # len() raises an unrelated TypeError, and an object array with a
        # zero-length trailing axis has len() > 0 but no element to inspect.
        # Requiring ndim > 0 also keeps a 0-d wrapper from recursing on itself.
        if _value.dtype == object and _value.ndim > 0 and _value.size > 0:
            validate_searchsorted_value(arr_dtype, _value.ravel()[0])


def searchsorted(
arr: ArrayLike,
value: NumpyValueArrayLike | ExtensionArray,
Expand Down Expand Up @@ -1270,6 +1333,9 @@ def searchsorted(
if sorter is not None:
sorter = ensure_platform_int(sorter)

if isinstance(arr, np.ndarray):
validate_searchsorted_value(arr.dtype, value)

if (
isinstance(arr, np.ndarray)
and arr.dtype.kind in "iu"
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
nanops,
ops,
)
from pandas.core.algorithms import validate_searchsorted_value
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.construction import ensure_wrapped_if_datetimelike
Expand All @@ -63,7 +64,7 @@

from pandas import Index
from pandas.arrays import StringArray
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays import ExtensionArray


@set_module("pandas.arrays")
Expand Down Expand Up @@ -199,6 +200,7 @@ def searchsorted(
# Parent's searchsorted calls _validate_setitem_value, which is
# too strict for search (e.g. rejects float into int). Delegate
# directly to numpy which handles cross-dtype searches correctly.
validate_searchsorted_value(self._ndarray.dtype, value)
return self._ndarray.searchsorted(value, side=side, sorter=sorter) # type: ignore[arg-type]

def _cast_pointwise_result(self, values) -> ArrayLike:
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/series/methods/test_searchsorted.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@

import pandas as pd
from pandas import (
Index,
Series,
Timedelta,
Timestamp,
date_range,
)
import pandas._testing as tm
from pandas.api.types import is_scalar
from pandas.core.arrays.numpy_ import NumpyExtensionArray


class TestSeriesSearchSorted:
Expand Down Expand Up @@ -75,3 +78,51 @@ def test_searchsorted_dataframe_fail(self):
msg = "Value must be 1-D array-like or scalar, DataFrame is not supported"
with pytest.raises(ValueError, match=msg):
ser.searchsorted(vals)

@pytest.mark.parametrize(
    "value",
    [
        np.timedelta64(1, "ns"),
        np.datetime64("2020-01-01"),
        Timedelta("1 day"),
        Timestamp("2020-01-01"),
        pd.NaT,
        pd.Period("2020", "D"),
        "foo",
        np.array([1, 2], dtype="timedelta64[ns]"),
        np.array(["2020-01-01"], dtype="datetime64[ns]"),
        np.array(["a", "b"]),
        [Timestamp("2020-01-01")],
        [Timedelta("1 day")],
    ],
)
@pytest.mark.parametrize("dtype", ["int64", "float64", "uint8", "bool"])
def test_searchsorted_numeric_incompatible_dtype(self, dtype, value):
    """Searching a numeric/bool Series for a non-numeric value raises TypeError."""
    numeric_ser = Series([1, 2, 3], dtype=dtype)
    expected_msg = "searchsorted requires compatible dtype or scalar"
    with pytest.raises(TypeError, match=expected_msg):
        numeric_ser.searchsorted(value)

@pytest.mark.parametrize(
    "value",
    [
        np.timedelta64(1, "ns"),
        np.datetime64("2020-01-01"),
        Timedelta("1 day"),
        Timestamp("2020-01-01"),
        pd.NaT,
        "foo",
    ],
)
def test_searchsorted_numpy_ea_incompatible_dtype(self, value):
    """NumpyExtensionArray.searchsorted rejects non-numeric scalars."""
    # Exercise the NumpyExtensionArray path directly rather than via Series.
    backing = np.array([1, 2, 3])
    wrapped = NumpyExtensionArray(backing)
    expected_msg = "searchsorted requires compatible dtype or scalar"
    with pytest.raises(TypeError, match=expected_msg):
        wrapped.searchsorted(value)

@pytest.mark.parametrize("dtype", ["int64", "float64", "uint8"])
def test_searchsorted_numeric_with_index(self, dtype):
    """A valid numeric scalar is still located correctly via the Index path."""
    numeric_idx = Index([1, 2, 3], dtype=dtype)
    position = numeric_idx.searchsorted(2)
    assert position == 1
Loading