Summary of this patch (descriptive text only; not part of any hunk):

  * python-package/lightgbm/basic.py
      - deletes the helper `_json_default_with_numpy`, which converted
        np.integer / np.floating / np.bool_ scalars via `.item()` and
        np.ndarray via `.tolist()`.
      - drops the `default=_json_default_with_numpy` argument from the two
        `json.dumps(...)` calls that used it: one in
        `_dump_pandas_categorical` and one in `dump_model`.
  * tests/python_package_test/test_basic.py
      - adds `test_pandas_categorical_json_serialization_works`, which builds
        a pandas DataFrame with float32, bool and int64 categorical columns,
        trains for 2 boosting rounds, and asserts that
        `bst.dump_model()["pandas_categorical"]` equals the expected
        category lists.

NOTE(review): the new test only calls `dump_model()`. The change to
`_dump_pandas_categorical` is not exercised by anything visible in this
patch; its callers are outside this view, so confirm it is covered elsewhere.

NOTE(review): line breaks, blank lines and indentation below were restored to
match the hunk line counts and conventional Python formatting. Verify them
against the original patch before applying.

diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index 3342e5205393..5c7d57bdd61b 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -531,16 +531,6 @@ def _c_array(ctype: type, values: List[Any]) -> ctypes.Array:
     return (ctype * len(values))(*values)  # type: ignore[operator]
 
 
-def _json_default_with_numpy(obj: Any) -> Any:
-    """Convert numpy classes to JSON serializable objects."""
-    if isinstance(obj, (np.integer, np.floating, np.bool_)):
-        return obj.item()
-    elif isinstance(obj, np.ndarray):
-        return obj.tolist()
-    else:
-        return obj
-
-
 def _to_string(x: Union[int, float, str, List]) -> str:
     if isinstance(x, list):
         val_list = ",".join(str(val) for val in x)
@@ -876,7 +866,7 @@ def _dump_pandas_categorical(
     pandas_categorical: Optional[List[List]],
     file_name: Optional[Union[str, Path]] = None,
 ) -> str:
-    categorical_json = json.dumps(pandas_categorical, default=_json_default_with_numpy)
+    categorical_json = json.dumps(pandas_categorical)
     pandas_str = f"\npandas_categorical:{categorical_json}\n"
     if file_name is not None:
         with open(file_name, "a") as f:
@@ -4700,12 +4690,7 @@ def dump_model(
                 )
             )
         ret = json.loads(string_buffer.value.decode("utf-8"), object_hook=object_hook)
-        ret["pandas_categorical"] = json.loads(
-            json.dumps(
-                self.pandas_categorical,
-                default=_json_default_with_numpy,
-            )
-        )
+        ret["pandas_categorical"] = json.loads(json.dumps(self.pandas_categorical))
         return ret
 
     def predict(
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 108f37da0bee..5bd027f17179 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -880,6 +880,48 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, c
     np.testing.assert_equal(codes, data[:, 0])
 
 
+def test_pandas_categorical_json_serialization_works(rng):
+    # Some 'numpy' types aren't JSON-serializable by default, this checks
+    # that having them in pandas_categorical doesn't cause issues for JSON
+    # serialization of models.
+    pd = pytest.importorskip("pandas")
+    df = pd.DataFrame(
+        {
+            "np_float": pd.Categorical(np.array([1.1, 2.2, 3.3], dtype=np.float32)),
+            "np_bool": pd.Categorical(np.array([True, False, False], dtype=np.bool_)),
+            "np_int": pd.Categorical(np.array([1, 2, 3], dtype=np.int64)),
+        }
+    )
+
+    # confirm that the array dtypes also become the category dtypes
+    assert df["np_float"].dtype.categories.dtype in (np.float32, np.float64)
+    assert df["np_bool"].dtype.categories.dtype == np.bool_
+    assert df["np_int"].dtype.categories.dtype == np.int64
+
+    dtrain = lgb.Dataset(
+        df,
+        label=rng.random((df.shape[0],)),
+        categorical_feature=["np_float", "np_bool", "np_int"],
+    )
+    dtrain.construct()
+
+    bst = lgb.train(
+        params={
+            "objective": "regression",
+        },
+        train_set=dtrain,
+        num_boost_round=2,
+    )
+
+    # JSON serialization works
+    json_data = bst.dump_model()
+    assert json_data["pandas_categorical"] == [
+        [float(np.float32(1.1)), float(np.float32(2.2)), float(np.float32(3.3))],
+        [False, True],
+        [1, 2, 3],
+    ]
+
+
 @pytest.mark.parametrize("min_data_in_bin", [2, 10])
 def test_feature_num_bin(min_data_in_bin, rng):
     X = np.vstack(