@@ -35,14 +35,19 @@ def eval_terms(X, n_samples, feature_names_in, feature_types_in, bins, term_feat
3535
3636 # prior to calling this function, call remove_extra_bins which will eliminate extra work in this function
3737
38- # this generator function returns data in whatever order it thinks is most efficient . Normally for
38+ # This generator function returns data as the feature data within terms gets read. Normally for
3939 # mains it returns them in order, but pairs will be returned as their data completes and they can
4040 # be mixed in with mains. So, if we request data for [(0), (1), (2), (3), (4), (1, 3)] the return sequence
41- # could be [(0), (1), (2), (3), (1, 3), (4)]. More complicated pair/triples return even more randomized ordering.
41+ # would be [(0), (1), (2), (3), (1, 3), (4)]. More complicated pair/triples return even more randomized ordering.
4242 # For additive models the results can be processed in any order, so this imposes no penalties on us.
4343
4444 _log .info ("eval_terms" )
4545
46+ # Flatten the term_features array to make one entry per feature within each term
47+ # each item in the list contains placeholders for the binned array that we need
48+ # to complete the term. We fill these with None initially. At the end of the array
49+ # is the term_idx. So it looks approximately like this:
50+ # eg: [[None, 0], [None, 1], [None, 2], [None, None, 3], [None, None, None, 4]]
4651 all_requirements = list (
4752 chain .from_iterable (
4853 map (
@@ -59,25 +64,37 @@ def eval_terms(X, n_samples, feature_names_in, feature_types_in, bins, term_feat
5964 ),
6065 )
6166
67+ # get the per-feature (per-term) binning for all levels
68+ # eg: [[{'a': 0, 'b': 1}], [[1, 2, 3, 4, 5], [1, 3, 5]]]
6269 all_bin_levels1 , all_bin_levels2 = tee (
6370 map (bins .__getitem__ , chain .from_iterable (term_features )), 2
6471 )
6572
73+ # get a per-feature per-term list of indexes that will be used to index into the bins list
74+ # eg: [0, 1, 0, 0, 1, 2]
75+ levels = list (
76+ map (
77+ min ,
78+ zip (
79+ map ((- 1 ).__add__ , map (len , all_bin_levels2 )),
80+ map ((- 2 ).__add__ , map (len , all_requirements )),
81+ ),
82+ )
83+ )
84+
85+ # index into the bins list per-feature per-term to get the actual binning
86+ # eg: [{'a': 0, 'b': 1}, [3.5, 6.5, 8.5], {'x': 0, 'y': 1}]
6687 feature_bins1 , feature_bins2 = tee (
6788 map (
6889 getitem ,
6990 all_bin_levels1 ,
70- map (
71- min ,
72- zip (
73- map ((- 1 ).__add__ , map (len , all_bin_levels2 )),
74- map ((- 2 ).__add__ , map (len , all_requirements )),
75- ),
76- ),
91+ levels ,
7792 ),
7893 2 ,
7994 )
8095
96+ # replace the continuous bins with None values
97+ # eg: [{'a': 0, 'b': 1}, None, {'x': 0, 'y': 1}]
8198 all_feature_bins = list (
8299 map (
83100 getitem ,
@@ -86,15 +103,18 @@ def eval_terms(X, n_samples, feature_names_in, feature_types_in, bins, term_feat
86103 )
87104 )
88105
106+ # generate the list of requests which consist of keys (feature_idx, id(categories))
107+ # and values (bin_level_idx, categories)
108+ # where categories is None for continuous features
109+ # eg: {(0, id({'a': 0, 'b': 1})): (1, {'a': 0, 'b': 1}), ...}
89110 requests = dict (
90111 zip (
91112 zip (chain .from_iterable (term_features ), map (id , all_feature_bins )),
92- zip (count () , all_feature_bins ),
113+ zip (levels , all_feature_bins ),
93114 )
94115 )
95116
96- # Order requests by (feature_idx, term order) for implementation independence.
97- # Since term_features is sorted by # dimensions, this also orders by # dimensions.
117+ # Order requests by (feature_idx, category_level) for implementation independence.
98118 requests = sorted (
99119 zip (
100120 map (_itemgetter0 , requests .keys ()),
@@ -105,12 +125,17 @@ def eval_terms(X, n_samples, feature_names_in, feature_types_in, bins, term_feat
105125
106126 request_feature_idxs = list (map (_itemgetter0 , requests ))
107127
128+ # the request keys that we expect back
108129 keys1 , keys2 = tee (
109130 zip (chain .from_iterable (term_features ), map (id , all_feature_bins )), 2
110131 )
111132
112133 waiting = {}
113134 # sum is used to iterate outside the interpreter. The result is not used.
135+ # make a fast lookup so that we can determine which terms are affected
136+ # by the completion of a feature. The value returned from the key
137+ # has space to store the result to accumulate binned features for higher
138+ # order interactions that require more than one feature.
114139 sum (
115140 map (
116141 truth ,
0 commit comments