Skip to content

Commit 180e35c

Browse files
Authored by Wota and jameslamb
[python-package] Add decision_function() to LGBMClassifier (#7159)
Co-authored-by: Wota <[email protected]> Co-authored-by: James Lamb <[email protected]>
1 parent eb27a2f commit 180e35c

File tree

2 files changed

+132
-16
lines changed

2 files changed

+132
-16
lines changed

python-package/lightgbm/sklearn.py

Lines changed: 69 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -727,25 +727,35 @@ def _more_tags(self) -> Dict[str, Any]:
727727
# "check_sample_weight_equivalence" can be removed when lightgbm's
728728
# minimum supported scikit-learn version is at least 1.6
729729
# ref: https://github.com/scikit-learn/scikit-learn/pull/30137
730+
xfail_checks = {
731+
"check_no_attributes_set_in_init": (
732+
"scikit-learn incorrectly asserts that private attributes "
733+
"cannot be set in __init__: "
734+
"(see https://github.com/lightgbm-org/LightGBM/issues/2628)"
735+
),
736+
"check_all_zero_sample_weights_error": (
737+
"Beginning in scikit-learn 1.9, by default estimators are expected to reject "
738+
"sample weight arrays that are all-0. LightGBM intentionally accepts such arrays. "
739+
"LightGBM supports some operations where training on an all-0-weight input could make sense, "
740+
"like batch updates with training continuation or manual model creation with forced splits."
741+
),
742+
"check_sample_weight_equivalence": check_sample_weight_str,
743+
"check_sample_weight_equivalence_on_dense_data": check_sample_weight_str,
744+
"check_sample_weight_equivalence_on_sparse_data": check_sample_weight_str,
745+
}
746+
# "check_decision_proba_consistency" can be removed when lightgbm's
747+
# minimum supported scikit-learn version is at least 1.2
748+
sklearn_major, sklearn_minor, *_ = _sklearn_version.split(".")
749+
if (int(sklearn_major), int(sklearn_minor)) < (1, 2):
750+
xfail_checks["check_decision_proba_consistency"] = (
751+
"decision_function() returns raw margins while predict_proba() applies sigmoid in C++ "
752+
"independently, causing different tie structures after rounding. "
753+
"scikit-learn >= 1.2 relaxed this check to accept monotonically consistent scores."
754+
)
730755
return {
731756
"allow_nan": True,
732757
"X_types": ["2darray", "sparse", "1dlabels"],
733-
"_xfail_checks": {
734-
"check_no_attributes_set_in_init": (
735-
"scikit-learn incorrectly asserts that private attributes "
736-
"cannot be set in __init__: "
737-
"(see https://github.com/lightgbm-org/LightGBM/issues/2628)"
738-
),
739-
"check_all_zero_sample_weights_error": (
740-
"Beginning in scikit-learn 1.9, by default estimators are expected to reject "
741-
"sample weight arrays that are all-0. LightGBM intentionally accepts such arrays. "
742-
"LightGBM supports some operations where training on an all-0-weight input could make sense, "
743-
"like batch updates with training continuation or manual model creation with forced splits."
744-
),
745-
"check_sample_weight_equivalence": check_sample_weight_str,
746-
"check_sample_weight_equivalence_on_dense_data": check_sample_weight_str,
747-
"check_sample_weight_equivalence_on_sparse_data": check_sample_weight_str,
748-
},
758+
"_xfail_checks": xfail_checks,
749759
}
750760

751761
@staticmethod
@@ -1737,6 +1747,49 @@ def predict_proba(
17371747
X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects",
17381748
)
17391749

1750+
def decision_function(
    self,
    X: _LGBM_ScikitMatrixLike,
    *,
    start_iteration: int = 0,
    num_iteration: Optional[int] = None,
    validate_features: bool = False,
    **kwargs: Any,
) -> _LGBM_PredictReturnType:
    """Return the raw margin score for each sample.

    Parameters
    ----------
    X : numpy array, pandas DataFrame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]
        Input features matrix.
    start_iteration : int, optional (default=0)
        Start index of the iteration to predict.
        If <= 0, starts from the first iteration.
    num_iteration : int or None, optional (default=None)
        Total number of iterations used in the prediction.
        If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
        otherwise, all iterations from ``start_iteration`` are used (no limits).
        If <= 0, all iterations from ``start_iteration`` are used (no limits).
    validate_features : bool, optional (default=False)
        If True, ensure that the features used to predict match the ones used to train.
        Used only if data is pandas DataFrame.
    **kwargs
        Other parameters forwarded to ``predict()``.

    Returns
    -------
    raw_score : array-like of shape = [n_samples] or shape = [n_samples, n_classes]
        The predicted values.
    """
    # Everything funnels through predict(); raw_score=True asks the booster
    # for untransformed margins rather than probabilities or class labels.
    margins = super().predict(
        X=X,
        raw_score=True,
        start_iteration=start_iteration,
        num_iteration=num_iteration,
        validate_features=validate_features,
        **kwargs,
    )
    return margins
1792+
17401793
@property
17411794
def classes_(self) -> np.ndarray:
17421795
""":obj:`array` of shape = [n_classes]: The class label array."""

tests/python_package_test/test_sklearn.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import scipy.sparse
1414
from scipy.stats import spearmanr
1515
from sklearn.base import clone
16+
from sklearn.calibration import CalibratedClassifierCV
1617
from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification
1718
from sklearn.ensemble import StackingClassifier, StackingRegressor
1819
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, r2_score
@@ -41,6 +42,7 @@
4142
load_digits,
4243
load_iris,
4344
load_linnerud,
45+
logistic_sigmoid,
4446
make_ranking,
4547
make_synthetic_regression,
4648
np_assert_array_equal,
@@ -973,6 +975,67 @@ def test_predict():
973975
np.testing.assert_allclose(res_class_sklearn, y_train)
974976

975977

978+
def test_decision_function_and_predict_proba_consistency():
    # Fit a small classifier on a dataset and hand back the model plus held-out features.
    def _fit_small_classifier(loader):
        X, y = loader(return_X_y=True)
        X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)
        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbose=-1).fit(X_train, y_train)
        return model, X_test

    # binary: margins equal predict(raw_score=True); sigmoid(margin) equals P(class=1)
    model, X_test = _fit_small_classifier(load_breast_cancer)
    raw = model.decision_function(X_test)
    np.testing.assert_allclose(raw, model.predict(X_test, raw_score=True))
    np.testing.assert_allclose(logistic_sigmoid(raw), model.predict_proba(X_test)[:, 1])

    # multiclass: margins equal predict(raw_score=True); softmax(margins) equals predict_proba()
    model, X_test = _fit_small_classifier(load_iris)
    raw = model.decision_function(X_test)
    np.testing.assert_allclose(raw, model.predict(X_test, raw_score=True))
    np.testing.assert_allclose(softmax(raw), model.predict_proba(X_test))
994+
995+
996+
@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
def test_calibrated_classifier_cv(method):
    # Force reproducible training so calibration results are stable across runs.
    deterministic_params = {
        "deterministic": True,
        "force_col_wise": True,
        "n_jobs": 1,
        "seed": 312,
    }
    # Same checks for the binary and the multiclass case; only the dataset
    # and the expected number of probability columns differ.
    for loader, n_classes in ((load_breast_cancer, 2), (load_iris, 3)):
        X, y = loader(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        calibrated = CalibratedClassifierCV(
            lgb.LGBMClassifier(n_estimators=10, verbose=-1, **deterministic_params),
            method=method,
            cv=3,
        )
        calibrated.fit(X_train, y_train)
        proba = calibrated.predict_proba(X_test)
        # Probabilities: correct shape, each entry in [0, 1] (small float slack),
        # rows summing to 1.
        assert proba.shape == (X_test.shape[0], n_classes)
        np.testing.assert_array_less(proba, 1.0 + 1e-9)
        np.testing.assert_array_less(-1e-9, proba)
        np.testing.assert_allclose(proba.sum(axis=1), 1.0)
        # Calibration should not destroy predictive accuracy.
        score = accuracy_score(y_test, calibrated.predict(X_test))
        assert 0.8 <= score <= 1.0
1037+
1038+
9761039
def test_predict_with_params_from_init():
9771040
X, y = load_iris(return_X_y=True)
9781041
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

0 commit comments

Comments
 (0)