# Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause from libc.string cimport memcpy from libc.string cimport memset from libc.math cimport INFINITY import numpy as np cimport numpy as cnp cnp.import_array() from scipy.special.cython_special cimport xlogy from sklearn.tree._utils cimport log from sklearn.tree._utils cimport WeightedFenwickTree from sklearn.tree._partitioner cimport sort # EPSILON is used in the Poisson criterion cdef float64_t EPSILON = 10 * np.finfo('double').eps cdef class Criterion: """Interface for impurity criteria. This object stores methods on how to calculate how good a split is using different metrics. """ def __getstate__(self): return {} def __setstate__(self, d): pass cdef int init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, float64_t weighted_n_samples, const intp_t[:] sample_indices, intp_t start, intp_t end, ) except -1 nogil: """Placeholder for a method which will initialize the criterion. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. Parameters ---------- y : ndarray, dtype=float64_t y is a buffer that can store values for n_outputs target variables stored as a Cython memoryview. sample_weight : ndarray, dtype=float64_t The weight of each sample stored as a Cython memoryview. weighted_n_samples : float64_t The total weight of the samples being considered sample_indices : ndarray, dtype=intp_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. start : intp_t The first sample to be used on this node end : intp_t The last sample used on this node """ pass cdef void init_missing(self, intp_t n_missing) noexcept nogil: """Initialize sum_missing if there are missing values. This method assumes that caller placed the missing samples in self.sample_indices[-n_missing:] Parameters ---------- n_missing: intp_t Number of missing values for specific feature. """ pass cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. This method must be implemented by the subclass. """ pass cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. This method must be implemented by the subclass. """ pass cdef int update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. Parameters ---------- new_pos : intp_t New starting index position of the sample_indices in the right child """ pass cdef float64_t node_impurity(self) noexcept nogil: """Placeholder for calculating the impurity of the node. Placeholder for a method which will evaluate the impurity of the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the better. """ pass cdef void children_impurity(self, float64_t* impurity_left, float64_t* impurity_right) noexcept nogil: """Placeholder for calculating the impurity of children. Placeholder for a method which evaluates the impurity in children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity of sample_indices[pos:end]. Parameters ---------- impurity_left : float64_t pointer The memory address where the impurity of the left child should be stored. 
impurity_right : float64_t pointer The memory address where the impurity of the right child should be stored """ pass cdef void node_value(self, float64_t* dest) noexcept nogil: """Placeholder for storing the node value. Placeholder for a method which will compute the node value of sample_indices[start:end] and save the value into dest. Parameters ---------- dest : float64_t pointer The memory address where the node value should be stored. """ pass cdef void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil: pass cdef float64_t middle_value(self) noexcept nogil: """Compute the middle value of a split for monotonicity constraints This method is implemented in ClassificationCriterion and RegressionCriterion. """ pass cdef float64_t proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ cdef float64_t impurity_left cdef float64_t impurity_right self.children_impurity(&impurity_left, &impurity_right) return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) cdef float64_t impurity_improvement(self, float64_t impurity_parent, float64_t impurity_left, float64_t impurity_right) noexcept nogil: """Compute the improvement in impurity. This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, Parameters ---------- impurity_parent : float64_t The initial impurity of the parent node before the split impurity_left : float64_t The impurity of the left child impurity_right : float64_t The impurity of the right child Return ------ float64_t : improvement in impurity after the split occurs """ return ((self.weighted_n_node_samples / self.weighted_n_samples) * (impurity_parent - (self.weighted_n_right / self.weighted_n_node_samples * impurity_right) - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) cdef bint check_monotonicity( self, cnp.int8_t monotonic_cst, float64_t lower_bound, float64_t upper_bound, ) noexcept nogil: pass cdef inline bint _check_monotonicity( self, cnp.int8_t monotonic_cst, float64_t lower_bound, float64_t upper_bound, float64_t value_left, float64_t value_right, ) noexcept nogil: cdef: bint check_lower_bound = ( (value_left >= lower_bound) & (value_right >= lower_bound) ) bint check_upper_bound = ( (value_left <= upper_bound) & (value_right <= upper_bound) ) bint check_monotonic_cst = ( (value_left - value_right) * monotonic_cst <= 0 ) return check_lower_bound & check_upper_bound & check_monotonic_cst cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" cdef inline void _move_sums_classification( ClassificationCriterion criterion, float64_t[:, ::1] sum_1, float64_t[:, ::1] sum_2, float64_t* weighted_n_1, float64_t* weighted_n_2, bint put_missing_in_1, ) noexcept nogil: """Distribute sum_total and 
    sum_missing into sum_1 and sum_2.

    If there are missing values and:
    - put_missing_in_1 is True, then missing values go to sum_1. Specifically:
        sum_1 = sum_missing
        sum_2 = sum_total - sum_missing
    - put_missing_in_1 is False, then missing values go to sum_2. Specifically:
        sum_1 = 0
        sum_2 = sum_total
    """
    cdef intp_t k, c, n_bytes
    if criterion.n_missing != 0 and put_missing_in_1:
        for k in range(criterion.n_outputs):
            n_bytes = criterion.n_classes[k] * sizeof(float64_t)
            memcpy(&sum_1[k, 0], &criterion.sum_missing[k, 0], n_bytes)

        for k in range(criterion.n_outputs):
            for c in range(criterion.n_classes[k]):
                sum_2[k, c] = criterion.sum_total[k, c] - criterion.sum_missing[k, c]

        weighted_n_1[0] = criterion.weighted_n_missing
        weighted_n_2[0] = criterion.weighted_n_node_samples - criterion.weighted_n_missing
    else:
        # Assigning sum_2 = sum_total for all outputs.
        for k in range(criterion.n_outputs):
            n_bytes = criterion.n_classes[k] * sizeof(float64_t)
            memset(&sum_1[k, 0], 0, n_bytes)
            memcpy(&sum_2[k, 0], &criterion.sum_total[k, 0], n_bytes)

        weighted_n_1[0] = 0.0
        weighted_n_2[0] = criterion.weighted_n_node_samples


cdef class ClassificationCriterion(Criterion):
    """Abstract criterion for classification."""

    def __cinit__(self, intp_t n_outputs,
                  cnp.ndarray[intp_t, ndim=1] n_classes):
        """Initialize attributes for this criterion.

        Parameters
        ----------
        n_outputs : intp_t
            The number of targets, the dimensionality of the prediction
        n_classes : numpy.ndarray, dtype=intp_t
            The number of unique classes in each target
        """
        self.start = 0
        self.pos = 0
        self.end = 0
        self.missing_go_to_left = 0

        self.n_outputs = n_outputs
        self.n_samples = 0
        self.n_node_samples = 0
        self.weighted_n_node_samples = 0.0
        self.weighted_n_left = 0.0
        self.weighted_n_right = 0.0
        self.weighted_n_missing = 0.0

        self.n_classes = np.empty(n_outputs, dtype=np.intp)

        cdef intp_t k = 0
        cdef intp_t max_n_classes = 0

        # For each target, set the number of unique classes in that target,
        # and also compute the maximal stride of all targets
        for k in range(n_outputs):
            self.n_classes[k] = n_classes[k]

            if n_classes[k] > max_n_classes:
                max_n_classes = n_classes[k]

        self.max_n_classes = max_n_classes

        # Count labels for each output
        self.sum_total = np.zeros((n_outputs, max_n_classes), dtype=np.float64)
        self.sum_left = np.zeros((n_outputs, max_n_classes), dtype=np.float64)
        self.sum_right = np.zeros((n_outputs, max_n_classes), dtype=np.float64)

    def __reduce__(self):
        return (type(self),
                (self.n_outputs, np.asarray(self.n_classes)),
                self.__getstate__())

    cdef int init(
        self,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
        float64_t weighted_n_samples,
        const intp_t[:] sample_indices,
        intp_t start,
        intp_t end
    ) except -1 nogil:
        """Initialize the criterion.

        This initializes the criterion at node sample_indices[start:end] and children
        sample_indices[start:start] and sample_indices[start:end].

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.

        Parameters
        ----------
        y : ndarray, dtype=float64_t
            The target stored as a buffer for memory efficiency.
        sample_weight : ndarray, dtype=float64_t
            The weight of each sample stored as a Cython memoryview.
        weighted_n_samples : float64_t
            The total weight of all samples
        sample_indices : ndarray, dtype=intp_t
            A mask on the samples. Indices of the samples in X and y we want to use,
            where sample_indices[start:end] correspond to the samples in this node.
        start : intp_t
            The first sample to use in the mask
        end : intp_t
            The last sample to use in the mask
        """
        self.y = y
        self.sample_weight = sample_weight
        self.sample_indices = sample_indices
        self.start = start
        self.end = end
        self.n_node_samples = end - start
        self.weighted_n_samples = weighted_n_samples
        self.weighted_n_node_samples = 0.0

        cdef intp_t i
        cdef intp_t p
        cdef intp_t k
        cdef intp_t c
        cdef float64_t w = 1.0

        for k in range(self.n_outputs):
            memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(float64_t))

        for p in range(start, end):
            i = sample_indices[p]

            # w is originally set to be 1.0, meaning that if no sample weights
            # are given, the default weight of each sample is 1.0.
            if sample_weight is not None:
                w = sample_weight[i]

            # Count weighted class frequency for each target
            for k in range(self.n_outputs):
                c = <intp_t> self.y[i, k]
                self.sum_total[k, c] += w

            self.weighted_n_node_samples += w

        # Reset to pos=start
        self.reset()
        return 0

    cdef void init_sum_missing(self):
        """Init sum_missing to hold sums for missing values."""
        self.sum_missing = np.zeros((self.n_outputs, self.max_n_classes), dtype=np.float64)

    cdef void init_missing(self, intp_t n_missing) noexcept nogil:
        """Initialize sum_missing if there are missing values.

        This method assumes that caller placed the missing samples
        in self.sample_indices[-n_missing:]
        """
        cdef intp_t i, p, k, c
        cdef float64_t w = 1.0

        self.n_missing = n_missing
        if n_missing == 0:
            return

        memset(&self.sum_missing[0, 0], 0, self.max_n_classes * self.n_outputs * sizeof(float64_t))

        self.weighted_n_missing = 0.0

        # The missing samples are assumed to be in self.sample_indices[-n_missing:]
        for p in range(self.end - n_missing, self.end):
            i = self.sample_indices[p]
            if self.sample_weight is not None:
                w = self.sample_weight[i]

            for k in range(self.n_outputs):
                c = <intp_t> self.y[i, k]
                self.sum_missing[k, c] += w

            self.weighted_n_missing += w

    cdef int reset(self) except -1 nogil:
        """Reset the criterion at pos=start.

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.
        """
        self.pos = self.start
        _move_sums_classification(
            self,
            self.sum_left,
            self.sum_right,
            &self.weighted_n_left,
            &self.weighted_n_right,
            self.missing_go_to_left,
        )
        return 0

    cdef int reverse_reset(self) except -1 nogil:
        """Reset the criterion at pos=end.

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.
        """
        self.pos = self.end
        _move_sums_classification(
            self,
            self.sum_right,
            self.sum_left,
            &self.weighted_n_right,
            &self.weighted_n_left,
            not self.missing_go_to_left
        )
        return 0

    cdef int update(self, intp_t new_pos) except -1 nogil:
        """Update statistics by moving sample_indices[pos:new_pos] to the left child.

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.

        Parameters
        ----------
        new_pos : intp_t
            The new ending position for which to move sample_indices from the right
            child to the left child.
        """
        cdef intp_t pos = self.pos
        # The missing samples are assumed to be in
        # self.sample_indices[-self.n_missing:] that is
        # self.sample_indices[end_non_missing:self.end].
        cdef intp_t end_non_missing = self.end - self.n_missing

        cdef intp_t i
        cdef intp_t p
        cdef intp_t k
        cdef intp_t c
        cdef float64_t w = 1.0

        # Update statistics up to new_pos
        #
        # Given that
        #     sum_left[x] + sum_right[x] = sum_total[x]
        # and that sum_total is known, we are going to update
        # sum_left from the direction that requires the least amount
        # of computations, i.e. from pos to new_pos or from end to new_pos.
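        #
        # A small worked example of this choice (illustration only): with
        # pos=2, new_pos=8 and end_non_missing=10, the forward pass would
        # visit new_pos - pos = 6 samples, while resetting to the right end
        # and removing samples 8 and 9 visits only
        # end_non_missing - new_pos = 2, so the second branch is cheaper.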
        if (new_pos - pos) <= (end_non_missing - new_pos):
            for p in range(pos, new_pos):
                i = self.sample_indices[p]

                if self.sample_weight is not None:
                    w = self.sample_weight[i]

                for k in range(self.n_outputs):
                    self.sum_left[k, <intp_t> self.y[i, k]] += w

                self.weighted_n_left += w

        else:
            self.reverse_reset()

            for p in range(end_non_missing - 1, new_pos - 1, -1):
                i = self.sample_indices[p]

                if self.sample_weight is not None:
                    w = self.sample_weight[i]

                for k in range(self.n_outputs):
                    self.sum_left[k, <intp_t> self.y[i, k]] -= w

                self.weighted_n_left -= w

        # Update right part statistics
        self.weighted_n_right = self.weighted_n_node_samples - self.weighted_n_left
        for k in range(self.n_outputs):
            for c in range(self.n_classes[k]):
                self.sum_right[k, c] = self.sum_total[k, c] - self.sum_left[k, c]

        self.pos = new_pos
        return 0

    cdef float64_t node_impurity(self) noexcept nogil:
        pass

    cdef void children_impurity(self, float64_t* impurity_left,
                                float64_t* impurity_right) noexcept nogil:
        pass

    cdef void node_value(self, float64_t* dest) noexcept nogil:
        """Compute the node value of sample_indices[start:end] and save it into dest.

        Parameters
        ----------
        dest : float64_t pointer
            The memory address which we will save the node value into.
        """
        cdef intp_t k, c

        for k in range(self.n_outputs):
            for c in range(self.n_classes[k]):
                dest[c] = self.sum_total[k, c] / self.weighted_n_node_samples
            dest += self.max_n_classes

    cdef inline void clip_node_value(
        self, float64_t * dest, float64_t lower_bound, float64_t upper_bound
    ) noexcept nogil:
        """Clip the values in dest such that predicted probabilities stay between
        `lower_bound` and `upper_bound` when monotonic constraints are enforced.

        Note that monotonicity constraints are only supported for:
        - single-output trees and
        - binary classification.
        """
        if dest[0] < lower_bound:
            dest[0] = lower_bound
        elif dest[0] > upper_bound:
            dest[0] = upper_bound

        # Values for binary classification must sum to 1.
        dest[1] = 1 - dest[0]

    cdef inline float64_t middle_value(self) noexcept nogil:
        """Compute the middle value of a split for monotonicity constraints as the
        simple average of the left and right children values.

        Note that monotonicity constraints are only supported for:
        - single-output trees and
        - binary classification.
        """
        return (
            (self.sum_left[0, 0] / (2 * self.weighted_n_left)) +
            (self.sum_right[0, 0] / (2 * self.weighted_n_right))
        )

    cdef inline bint check_monotonicity(
        self,
        cnp.int8_t monotonic_cst,
        float64_t lower_bound,
        float64_t upper_bound,
    ) noexcept nogil:
        """Check monotonicity constraint is satisfied at the current classification split"""
        cdef:
            float64_t value_left = self.sum_left[0][0] / self.weighted_n_left
            float64_t value_right = self.sum_right[0][0] / self.weighted_n_right

        return self._check_monotonicity(monotonic_cst, lower_bound, upper_bound,
                                        value_left, value_right)


cdef class Entropy(ClassificationCriterion):
    r"""Cross Entropy impurity criterion.

    This handles cases where the target is a classification taking values
    0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations,
    then let

        count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k)

    be the proportion of class k observations in node m.

    The cross-entropy is then defined as

        cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k)
    """

    cdef float64_t node_impurity(self) noexcept nogil:
        """Evaluate the impurity of the current node.

        Evaluate the cross-entropy criterion as impurity of the current node,
        i.e. the impurity of sample_indices[start:end]. The smaller the impurity the
        better.
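
        As an illustration, a binary node holding weighted class counts
        (2, 2) has impurity -2 * 0.5 * log(0.5) = log(2) (i.e. 1.0 when
        `log` is base 2, as in `sklearn.tree._utils`), while a pure node
        with counts (4, 0) has impurity 0.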
""" cdef float64_t entropy = 0.0 cdef float64_t count_k cdef intp_t k cdef intp_t c for k in range(self.n_outputs): for c in range(self.n_classes[k]): count_k = self.sum_total[k, c] if count_k > 0.0: count_k /= self.weighted_n_node_samples entropy -= count_k * log(count_k) return entropy / self.n_outputs cdef void children_impurity(self, float64_t* impurity_left, float64_t* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). Parameters ---------- impurity_left : float64_t pointer The memory address to save the impurity of the left node impurity_right : float64_t pointer The memory address to save the impurity of the right node """ cdef float64_t entropy_left = 0.0 cdef float64_t entropy_right = 0.0 cdef float64_t count_k cdef intp_t k cdef intp_t c for k in range(self.n_outputs): for c in range(self.n_classes[k]): count_k = self.sum_left[k, c] if count_k > 0.0: count_k /= self.weighted_n_left entropy_left -= count_k * log(count_k) count_k = self.sum_right[k, c] if count_k > 0.0: count_k /= self.weighted_n_right entropy_right -= count_k * log(count_k) impurity_left[0] = entropy_left / self.n_outputs impurity_right[0] = entropy_right / self.n_outputs cdef class Gini(ClassificationCriterion): r"""Gini Index impurity criterion. This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) be the proportion of class k observations in node m. The Gini Index is then defined as: index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ cdef float64_t node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. Evaluate the Gini criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. """ cdef float64_t gini = 0.0 cdef float64_t sq_count cdef float64_t count_k cdef intp_t k cdef intp_t c for k in range(self.n_outputs): sq_count = 0.0 for c in range(self.n_classes[k]): count_k = self.sum_total[k, c] sq_count += count_k * count_k gini += 1.0 - sq_count / (self.weighted_n_node_samples * self.weighted_n_node_samples) return gini / self.n_outputs cdef void children_impurity(self, float64_t* impurity_left, float64_t* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]) using the Gini index. 
        Parameters
        ----------
        impurity_left : float64_t pointer
            The memory address to save the impurity of the left node to
        impurity_right : float64_t pointer
            The memory address to save the impurity of the right node to
        """
        cdef float64_t gini_left = 0.0
        cdef float64_t gini_right = 0.0
        cdef float64_t sq_count_left
        cdef float64_t sq_count_right
        cdef float64_t count_k
        cdef intp_t k
        cdef intp_t c

        for k in range(self.n_outputs):
            sq_count_left = 0.0
            sq_count_right = 0.0

            for c in range(self.n_classes[k]):
                count_k = self.sum_left[k, c]
                sq_count_left += count_k * count_k

                count_k = self.sum_right[k, c]
                sq_count_right += count_k * count_k

            gini_left += 1.0 - sq_count_left / (self.weighted_n_left *
                                                self.weighted_n_left)

            gini_right += 1.0 - sq_count_right / (self.weighted_n_right *
                                                  self.weighted_n_right)

        impurity_left[0] = gini_left / self.n_outputs
        impurity_right[0] = gini_right / self.n_outputs


cdef inline void _move_sums_regression(
    RegressionCriterion criterion,
    float64_t[::1] sum_1,
    float64_t[::1] sum_2,
    float64_t* weighted_n_1,
    float64_t* weighted_n_2,
    bint put_missing_in_1,
) noexcept nogil:
    """Distribute sum_total and sum_missing into sum_1 and sum_2.

    If there are missing values and:
    - put_missing_in_1 is True, then missing values go to sum_1. Specifically:
        sum_1 = sum_missing
        sum_2 = sum_total - sum_missing
    - put_missing_in_1 is False, then missing values go to sum_2. Specifically:
        sum_1 = 0
        sum_2 = sum_total
    """
    cdef:
        intp_t i
        intp_t n_bytes = criterion.n_outputs * sizeof(float64_t)
        bint has_missing = criterion.n_missing != 0

    if has_missing and put_missing_in_1:
        memcpy(&sum_1[0], &criterion.sum_missing[0], n_bytes)
        for i in range(criterion.n_outputs):
            sum_2[i] = criterion.sum_total[i] - criterion.sum_missing[i]
        weighted_n_1[0] = criterion.weighted_n_missing
        weighted_n_2[0] = criterion.weighted_n_node_samples - criterion.weighted_n_missing
    else:
        memset(&sum_1[0], 0, n_bytes)
        # Assigning sum_2 = sum_total for all outputs.
        memcpy(&sum_2[0], &criterion.sum_total[0], n_bytes)
        weighted_n_1[0] = 0.0
        weighted_n_2[0] = criterion.weighted_n_node_samples


cdef class RegressionCriterion(Criterion):
    r"""Abstract regression criterion.

    This handles cases where the target is a continuous value, and is
    evaluated by computing the variance of the target values left and right
    of the split point. The computation takes linear time with `n_samples`
    by using ::

        var = \sum_i^n (y_i - y_bar) ** 2
            = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2
    """

    def __cinit__(self, intp_t n_outputs, intp_t n_samples):
        """Initialize parameters for this criterion.

        Parameters
        ----------
        n_outputs : intp_t
            The number of targets to be predicted
        n_samples : intp_t
            The total number of samples to fit on
        """
        # Default values
        self.start = 0
        self.pos = 0
        self.end = 0

        self.n_outputs = n_outputs
        self.n_samples = n_samples
        self.n_node_samples = 0
        self.weighted_n_node_samples = 0.0
        self.weighted_n_left = 0.0
        self.weighted_n_right = 0.0
        self.weighted_n_missing = 0.0

        self.sq_sum_total = 0.0

        self.sum_total = np.zeros(n_outputs, dtype=np.float64)
        self.sum_left = np.zeros(n_outputs, dtype=np.float64)
        self.sum_right = np.zeros(n_outputs, dtype=np.float64)

    def __reduce__(self):
        return (type(self), (self.n_outputs, self.n_samples), self.__getstate__())

    cdef int init(
        self,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
        float64_t weighted_n_samples,
        const intp_t[:] sample_indices,
        intp_t start,
        intp_t end,
    ) except -1 nogil:
        """Initialize the criterion.
        This initializes the criterion at node sample_indices[start:end] and children
        sample_indices[start:start] and sample_indices[start:end].
        """
        # Initialize fields
        self.y = y
        self.sample_weight = sample_weight
        self.sample_indices = sample_indices
        self.start = start
        self.end = end
        self.n_node_samples = end - start
        self.weighted_n_samples = weighted_n_samples
        self.weighted_n_node_samples = 0.

        cdef intp_t i
        cdef intp_t p
        cdef intp_t k
        cdef float64_t y_ik
        cdef float64_t w_y_ik
        cdef float64_t w = 1.0

        self.sq_sum_total = 0.0
        memset(&self.sum_total[0], 0, self.n_outputs * sizeof(float64_t))

        for p in range(start, end):
            i = sample_indices[p]

            if sample_weight is not None:
                w = sample_weight[i]

            for k in range(self.n_outputs):
                y_ik = self.y[i, k]
                w_y_ik = w * y_ik
                self.sum_total[k] += w_y_ik
                self.sq_sum_total += w_y_ik * y_ik

            self.weighted_n_node_samples += w

        # Reset to pos=start
        self.reset()
        return 0

    cdef void init_sum_missing(self):
        """Init sum_missing to hold sums for missing values."""
        self.sum_missing = np.zeros(self.n_outputs, dtype=np.float64)

    cdef void init_missing(self, intp_t n_missing) noexcept nogil:
        """Initialize sum_missing if there are missing values.

        This method assumes that caller placed the missing samples
        in self.sample_indices[-n_missing:]
        """
        cdef intp_t i, p, k
        cdef float64_t y_ik
        cdef float64_t w_y_ik
        cdef float64_t w = 1.0

        self.n_missing = n_missing
        if n_missing == 0:
            return

        memset(&self.sum_missing[0], 0, self.n_outputs * sizeof(float64_t))

        self.weighted_n_missing = 0.0

        # The missing samples are assumed to be in self.sample_indices[-n_missing:]
        for p in range(self.end - n_missing, self.end):
            i = self.sample_indices[p]
            if self.sample_weight is not None:
                w = self.sample_weight[i]

            for k in range(self.n_outputs):
                y_ik = self.y[i, k]
                w_y_ik = w * y_ik
                self.sum_missing[k] += w_y_ik

            self.weighted_n_missing += w

    cdef int reset(self) except -1 nogil:
        """Reset the criterion at pos=start."""
        self.pos = self.start
        _move_sums_regression(
            self,
            self.sum_left,
            self.sum_right,
            &self.weighted_n_left,
            &self.weighted_n_right,
            self.missing_go_to_left
        )
        return 0

    cdef int reverse_reset(self) except -1 nogil:
        """Reset the criterion at pos=end."""
        self.pos = self.end
        _move_sums_regression(
            self,
            self.sum_right,
            self.sum_left,
            &self.weighted_n_right,
            &self.weighted_n_left,
            not self.missing_go_to_left
        )
        return 0

    cdef int update(self, intp_t new_pos) except -1 nogil:
        """Update statistics by moving sample_indices[pos:new_pos] to the left."""
        cdef intp_t pos = self.pos

        # The missing samples are assumed to be in
        # self.sample_indices[-self.n_missing:] that is
        # self.sample_indices[end_non_missing:self.end].
        cdef intp_t end_non_missing = self.end - self.n_missing
        cdef intp_t i
        cdef intp_t p
        cdef intp_t k
        cdef float64_t w = 1.0

        # Update statistics up to new_pos
        #
        # Given that
        #     sum_left[x] + sum_right[x] = sum_total[x]
        # and that sum_total is known, we are going to update
        # sum_left from the direction that requires the least amount
        # of computations, i.e. from pos to new_pos or from end to new_pos.
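        #
        # For instance (illustration only), if sum_total[k] == 7.5 and the
        # chosen pass leaves sum_left[k] == 3.0, then sum_right[k] is simply
        # 7.5 - 3.0 = 4.5: no sample in the right child is visited.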
        if (new_pos - pos) <= (end_non_missing - new_pos):
            for p in range(pos, new_pos):
                i = self.sample_indices[p]

                if self.sample_weight is not None:
                    w = self.sample_weight[i]

                for k in range(self.n_outputs):
                    self.sum_left[k] += w * self.y[i, k]

                self.weighted_n_left += w
        else:
            self.reverse_reset()

            for p in range(end_non_missing - 1, new_pos - 1, -1):
                i = self.sample_indices[p]

                if self.sample_weight is not None:
                    w = self.sample_weight[i]

                for k in range(self.n_outputs):
                    self.sum_left[k] -= w * self.y[i, k]

                self.weighted_n_left -= w

        self.weighted_n_right = (self.weighted_n_node_samples -
                                 self.weighted_n_left)
        for k in range(self.n_outputs):
            self.sum_right[k] = self.sum_total[k] - self.sum_left[k]

        self.pos = new_pos
        return 0

    cdef float64_t node_impurity(self) noexcept nogil:
        pass

    cdef void children_impurity(self, float64_t* impurity_left,
                                float64_t* impurity_right) noexcept nogil:
        pass

    cdef void node_value(self, float64_t* dest) noexcept nogil:
        """Compute the node value of sample_indices[start:end] into dest."""
        cdef intp_t k

        for k in range(self.n_outputs):
            dest[k] = self.sum_total[k] / self.weighted_n_node_samples

    cdef inline void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil:
        """Clip the value in dest between lower_bound and upper_bound for
        monotonic constraints."""
        if dest[0] < lower_bound:
            dest[0] = lower_bound
        elif dest[0] > upper_bound:
            dest[0] = upper_bound

    cdef float64_t middle_value(self) noexcept nogil:
        """Compute the middle value of a split for monotonicity constraints as the
        simple average of the left and right children values.

        Monotonicity constraints are only supported for single-output trees,
        so we can safely assume n_outputs == 1.
        """
        return (
            (self.sum_left[0] / (2 * self.weighted_n_left)) +
            (self.sum_right[0] / (2 * self.weighted_n_right))
        )

    cdef bint check_monotonicity(
        self,
        cnp.int8_t monotonic_cst,
        float64_t lower_bound,
        float64_t upper_bound,
    ) noexcept nogil:
        """Check monotonicity constraint is satisfied at the current regression split"""
        cdef:
            float64_t value_left = self.sum_left[0] / self.weighted_n_left
            float64_t value_right = self.sum_right[0] / self.weighted_n_right

        return self._check_monotonicity(monotonic_cst, lower_bound, upper_bound,
                                        value_left, value_right)


cdef class MSE(RegressionCriterion):
    """Mean squared error impurity criterion.

        MSE = var_left + var_right
    """

    cdef float64_t node_impurity(self) noexcept nogil:
        """Evaluate the impurity of the current node.

        Evaluate the MSE criterion as impurity of the current node,
        i.e. the impurity of sample_indices[start:end]. The smaller the impurity the
        better.
        """
        cdef float64_t impurity
        cdef intp_t k

        impurity = self.sq_sum_total / self.weighted_n_node_samples
        for k in range(self.n_outputs):
            impurity -= (self.sum_total[k] / self.weighted_n_node_samples)**2.0

        return impurity / self.n_outputs

    cdef float64_t proxy_impurity_improvement(self) noexcept nogil:
        """Compute a proxy of the impurity reduction.

        This method is used to speed up the search for the best split.
        It is a proxy quantity such that the split that maximizes this value
        also maximizes the impurity improvement. It neglects all constant terms
        of the impurity decrease for a given split.

        The absolute impurity improvement is only computed by the
        impurity_improvement method once the best split has been found.
        The MSE proxy is derived from

            sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2
            = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2

        Neglecting constant terms, this gives:

            - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2
        """
        cdef intp_t k
        cdef float64_t proxy_impurity_left = 0.0
        cdef float64_t proxy_impurity_right = 0.0

        for k in range(self.n_outputs):
            proxy_impurity_left += self.sum_left[k] * self.sum_left[k]
            proxy_impurity_right += self.sum_right[k] * self.sum_right[k]

        return (proxy_impurity_left / self.weighted_n_left +
                proxy_impurity_right / self.weighted_n_right)

    cdef void children_impurity(self, float64_t* impurity_left,
                                float64_t* impurity_right) noexcept nogil:
        """Evaluate the impurity in children nodes.

        i.e. the impurity of the left child (sample_indices[start:pos]) and the
        impurity of the right child (sample_indices[pos:end]).
        """
        cdef const float64_t[:] sample_weight = self.sample_weight
        cdef const intp_t[:] sample_indices = self.sample_indices
        cdef intp_t pos = self.pos
        cdef intp_t start = self.start

        cdef float64_t y_ik

        cdef float64_t sq_sum_left = 0.0
        cdef float64_t sq_sum_right

        cdef intp_t i
        cdef intp_t p
        cdef intp_t k
        cdef float64_t w = 1.0
        cdef intp_t end_non_missing

        for p in range(start, pos):
            i = sample_indices[p]

            if sample_weight is not None:
                w = sample_weight[i]

            for k in range(self.n_outputs):
                y_ik = self.y[i, k]
                sq_sum_left += w * y_ik * y_ik

        if self.missing_go_to_left:
            # add up the impact of these missing values on the left child
            # statistics.
            # Note: this only impacts the square sum as the sum
            # is modified elsewhere.
            end_non_missing = self.end - self.n_missing

            for p in range(end_non_missing, self.end):
                i = sample_indices[p]
                if sample_weight is not None:
                    w = sample_weight[i]

                for k in range(self.n_outputs):
                    y_ik = self.y[i, k]
                    sq_sum_left += w * y_ik * y_ik

        sq_sum_right = self.sq_sum_total - sq_sum_left

        impurity_left[0] = sq_sum_left / self.weighted_n_left
        impurity_right[0] = sq_sum_right / self.weighted_n_right

        for k in range(self.n_outputs):
            impurity_left[0] -= (self.sum_left[k] / self.weighted_n_left) ** 2.0
            impurity_right[0] -= (self.sum_right[k] / self.weighted_n_right) ** 2.0

        impurity_left[0] /= self.n_outputs
        impurity_right[0] /= self.n_outputs


# Helper for MAE criterion:
cdef void precompute_absolute_errors(
    const float64_t[::1] sorted_y,
    const intp_t[::1] ranks,
    const float64_t[:] sample_weight,
    const intp_t[:] sample_indices,
    WeightedFenwickTree tree,
    intp_t start,
    intp_t end,
    float64_t[::1] abs_errors,
    float64_t[::1] medians,
) noexcept nogil:
    """
    Fill `abs_errors` and `medians`.

    If start < end:
        Forward pass: Computes the "prefix" AEs/medians, i.e. the AEs for each
        set of indices sample_indices[start:start + i] with i in {1, ..., n},
        where n = end - start.
    Else:
        Backward pass: Computes the "suffix" AEs/medians, i.e. the AEs for each
        set of indices sample_indices[start - i:start] with i in {1, ..., n},
        where n = start - end.

    Parameters
    ----------
    sorted_y : const float64_t[::1]
        Target values, sorted
    ranks : const intp_t[::1]
        Ranks of the node-local values of y for points in sample_indices
        such that: sorted_y[ranks[p]] == y[sample_indices[p]]
        for any p in [start, end) or (end, start].
    sample_weight : const float64_t[:]
    sample_indices : const intp_t[:]
        indices indicating which samples to use.
        Shape: (n_samples,)
    tree : WeightedFenwickTree
        pre-instantiated tree
    start : intp_t
        Start index in `sample_indices`
    end : intp_t
        End index (exclusive) in `sample_indices`
    abs_errors : float64_t[::1]
        array to store (increment) the computed absolute errors.
        Shape: (n,) with n := end - start
    medians : float64_t[::1]
        array to store (overwrite) the computed medians. Shape: (n,)

    Complexity: O(n log n)
    """
    cdef:
        intp_t p, i, step, n, rank, median_rank, median_prev_rank
        float64_t w = 1.
        float64_t half_weight, median
        float64_t w_right, w_left, wy_left, wy_right

    if start < end:
        step = 1
        n = end - start
    else:
        n = start - end
        step = -1
    tree.reset(n)

    p = start
    # We iterate exactly `n` samples starting at absolute index `start` and
    # move by `step` (+1 for the forward pass, -1 for the backward pass).
    for _ in range(n):
        i = sample_indices[p]
        if sample_weight is not None:
            w = sample_weight[i]

        # Activate sample i at its rank:
        rank = ranks[p]
        tree.add(rank, sorted_y[rank], w)

        # Weighted median by cumulative weight: the median is where the
        # cumulative weight crosses half of the total weight.
        half_weight = 0.5 * tree.total_w
        # find the smallest activated rank with cumulative weight > half_weight
        # while returning the prefix sums (`w_left` and `wy_left`)
        # up to (and excluding) that index:
        median_rank = tree.search(half_weight, &w_left, &wy_left, &median_prev_rank)
        if median_rank != median_prev_rank:
            # The cumulative weights hit half_weight exactly between two
            # consecutive activated ranks: the cumulative weight up to
            # `median_rank` (excluded) is exactly half_weight, and
            # `median_prev_rank` is the activated rank such that the
            # cumulative weight up to it (included) is exactly half_weight.
            # In this case we take the mid-point:
            median = (sorted_y[median_prev_rank] + sorted_y[median_rank]) / 2
        else:
            # if there is no exact match for half_weight in the cumulative
            # weights, `median_rank == median_prev_rank` and the median is:
            median = sorted_y[median_rank]

        # Convert left prefix sums into right-hand complements.
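        # Worked example (illustration only): activated values y = [2, 4, 9]
        # with weights w = [1, 2, 1] give total_w = 4 and half_weight = 2.
        # The cumulative weights [1, 3, 4] first exceed 2 at the rank of 4,
        # with no exact match, so median = 4, w_left = 1 and wy_left = 2;
        # the identity below then yields (17 - 4*3) + (4*1 - 2) = 7, i.e.
        # |2-4| + 2*|4-4| + |9-4|.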
        w_right = tree.total_w - w_left
        wy_right = tree.total_wy - wy_left
        medians[p] = median
        # Pinball-loss identity for absolute error at the current set:
        #     sum_{y_i >= m} w_i (y_i - m) = wy_right - m * w_right
        #     sum_{y_i < m} w_i (m - y_i) = m * w_left - wy_left
        abs_errors[p] += (
            (wy_right - median * w_right) + (median * w_left - wy_left)
        )
        p += step


cdef inline void compute_ranks(
    float64_t* sorted_y, intp_t* sorted_indices, intp_t* ranks, intp_t n
) noexcept nogil:
    """Sort `sorted_y` inplace and fill `ranks` accordingly"""
    cdef intp_t i
    for i in range(n):
        sorted_indices[i] = i
    sort(sorted_y, sorted_indices, n)
    for i in range(n):
        ranks[sorted_indices[i]] = i


def _py_precompute_absolute_errors(
    const float64_t[:, ::1] ys,
    const float64_t[:] sample_weight,
    const intp_t[:] sample_indices,
    const intp_t start,
    const intp_t end,
    const intp_t n,
):
    """Used for testing precompute_absolute_errors."""
    cdef:
        intp_t p, i
        intp_t s = start
        intp_t e = end
        WeightedFenwickTree tree = WeightedFenwickTree(n)
        float64_t[::1] sorted_y = np.empty(n, dtype=np.float64)
        intp_t[::1] sorted_indices = np.empty(n, dtype=np.intp)
        intp_t[::1] ranks = np.empty(n, dtype=np.intp)
        float64_t[::1] abs_errors = np.zeros(n, dtype=np.float64)
        float64_t[::1] medians = np.empty(n, dtype=np.float64)

    if start > end:
        s = end + 1
        e = start + 1
    for p in range(s, e):
        i = sample_indices[p]
        sorted_y[p - s] = ys[i, 0]
    compute_ranks(&sorted_y[0], &sorted_indices[0], &ranks[s], n)
    precompute_absolute_errors(
        sorted_y, ranks, sample_weight, sample_indices,
        tree, start, end, abs_errors, medians
    )
    return np.asarray(abs_errors)[s:e], np.asarray(medians)[s:e]


cdef class MAE(Criterion):
    r"""Mean absolute error impurity criterion.

    It has almost nothing in common with the other regression criteria, so it
    doesn't inherit from RegressionCriterion.

    MAE = (1 / n)*(\sum_i |y_i - p_i|), where y_i is the true
    value and p_i is the predicted value.
    In a decision tree, that prediction is the (weighted) median of the
    targets in the node.

    How this implementation works
    -----------------------------
    This class precomputes in `reset`, for the current node, the
    absolute-error values and corresponding medians for all potential split
    positions: every p in [start, end). For that:

    - We first compute the rank of each sample in the node-local sorted order
      of target values. `self.ranks[p]` gives the rank of sample p.
    - While iterating the segment of indices (p in [start, end)), we
      * "activate" one sample at a time at its rank within a prefix sum
        tree, the `WeightedFenwickTree`: `tree.add(rank, y, weight)`
        The tree maintains cumulative sums of weights and of `weight * y`
      * search for the half total weight in the tree:
        `tree.search(current_total_weight / 2)`.
        This allows us to retrieve/compute:
        * the current weighted median value
        * the absolute-error contribution via the standard pinball-loss
          identity: AE = (wy_right - median * w_right) + (median * w_left - wy_left)
    - We perform two such passes:
      * one forward from `start` to `end - 1` to fill `left_abs_errors[p]`
        and `left_medians[p]` for left children.
      * one backward from `end - 1` down to `start` to fill
        `right_abs_errors[p]` and `right_medians[p]` for right children.

    Complexity: time complexity is O(n log n), indeed:
    - computing ranks is based on sorting: O(n log n)
    - add and search operations in the Fenwick tree are O(log n).
    => the forward and backward passes are O(n log n).

    How the other methods use the precomputations
    ---------------------------------------------
    - `reset` performs the precomputation described above.
It also stores the node weighted median per output in `node_medians` (prediction value of the node). - `update(new_pos)` only updates `weighted_n_left` and `weighted_n_right`; no recomputation of errors is needed. - `children_impurity` reads the precomputed absolute errors at `left_abs_errors[pos - 1]` and `right_abs_errors[pos]` and scales them by the corresponding child weights and `n_outputs` to report the impurity of each child. - `middle_value` and `check_monotonicity` use the precomputed `left_medians[pos - 1]` and `right_medians[pos]` to derive the mid-point value and to validate monotonic constraints when enabled. - Missing values are not supported for MAE: `init_missing` raises. For a complementary, in-depth discussion of the mathematics and design choices, see the external report: https://github.com/cakedev0/fast-mae-split/blob/main/report.ipynb """ cdef float64_t[::1] node_medians cdef float64_t[::1] left_abs_errors cdef float64_t[::1] right_abs_errors cdef float64_t[::1] left_medians cdef float64_t[::1] right_medians cdef float64_t[::1] sorted_y cdef intp_t [::1] sorted_indices cdef intp_t[::1] ranks cdef WeightedFenwickTree prefix_sum_tree def __cinit__(self, intp_t n_outputs, intp_t n_samples): """Initialize parameters for this criterion. Parameters ---------- n_outputs : intp_t The number of targets to be predicted n_samples : intp_t The total number of samples to fit on """ # Default values self.start = 0 self.pos = 0 self.end = 0 self.n_outputs = n_outputs self.n_samples = n_samples self.n_node_samples = 0 self.weighted_n_node_samples = 0.0 self.weighted_n_left = 0.0 self.weighted_n_right = 0.0 self.node_medians = np.zeros(n_outputs, dtype=np.float64) # Note: this criterion has a n_samples x 64 bytes memory footprint, which is # fine as it's instantiated only once to build an entire tree self.left_abs_errors = np.empty(n_samples, dtype=np.float64) self.right_abs_errors = np.empty(n_samples, dtype=np.float64) self.left_medians = np.empty(n_samples, dtype=np.float64) self.right_medians = np.empty(n_samples, dtype=np.float64) self.ranks = np.empty(n_samples, dtype=np.intp) # Important: The arrays declared above are indexed with # the absolute position `p` in `sample_indices` (not with a 0-based offset). # The forward and backward passes in `reset` method ensure that # for any current split position `pos` we can read: # - left child precomputed values at `p = pos - 1`, and # - right child precomputed values at `p = pos`. self.prefix_sum_tree = WeightedFenwickTree(n_samples) # used memory: 2 float64 arrays of size n_samples + 1 # we reuse a single `WeightedFenwickTree` instance to build prefix # and suffix aggregates over the node samples. # Work buffer arrays, used with 0-based offset: self.sorted_y = np.empty(n_samples, dtype=np.float64) self.sorted_indices = np.empty(n_samples, dtype=np.intp) cdef int init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, float64_t weighted_n_samples, const intp_t[:] sample_indices, intp_t start, intp_t end, ) except -1 nogil: """Initialize the criterion. This initializes the criterion at node sample_indices[start:end] and children sample_indices[start:start] and sample_indices[start:end]. WARNING: sample_indices will be modified in-place externally after this method is called. 
""" cdef: intp_t i, p intp_t n = end - start float64_t w = 1.0 # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices self.start = start self.end = end self.n_node_samples = n self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. for p in range(start, end): i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] self.weighted_n_node_samples += w # Reset to pos=start self.reset() return 0 cdef void init_missing(self, intp_t n_missing) noexcept nogil: """Raise error if n_missing != 0.""" if n_missing == 0: return with gil: raise ValueError("missing values is not supported for MAE.") cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. Reset might be called after an external class has changed inplace self.sample_indices[start:end], hence re-computing the absolute errors is needed. """ cdef intp_t k, p, i self.weighted_n_left = 0.0 self.weighted_n_right = self.weighted_n_node_samples self.pos = self.start n_bytes = self.n_node_samples * sizeof(float64_t) memset(&self.left_abs_errors[self.start], 0, n_bytes) memset(&self.right_abs_errors[self.start], 0, n_bytes) # Multi-output handling: # absolute errors are accumulated across outputs by # incrementing `left_abs_errors` and `right_abs_errors` on each pass. # The per-output medians arrays are overwritten at each output iteration # as they are only used for monotonicity checks when `n_outputs == 1`. for k in range(self.n_outputs): # 1) Node-local ordering: # for each output k, the values `y[sample_indices[p], k]` for p # in [start, end) are copied into self.sorted_y[0:n_node_samples]` # and ranked with `compute_ranks`. # The resulting `self.ranks[p]` gives the rank of sample p in the # node-local sorted order. for p in range(self.start, self.end): i = self.sample_indices[p] self.sorted_y[p - self.start] = self.y[i, k] compute_ranks( &self.sorted_y[0], &self.sorted_indices[0], &self.ranks[self.start], self.n_node_samples, ) # 2) Forward pass # from `start` to `end - 1` to fill `left_abs_errors[p]` and # `left_medians[p]` for left children. precompute_absolute_errors( self.sorted_y, self.ranks, self.sample_weight, self.sample_indices, self.prefix_sum_tree, self.start, self.end, # left_abs_errors is incremented, left_medians is overwritten self.left_abs_errors, self.left_medians ) # 3) Backward pass # from `end - 1` down to `start` to fill `right_abs_errors[p]` # and `right_medians[p]` for right children. precompute_absolute_errors( self.sorted_y, self.ranks, self.sample_weight, self.sample_indices, self.prefix_sum_tree, self.end - 1, self.start - 1, # right_abs_errors is incremented, right_medians is overwritten self.right_abs_errors, self.right_medians ) # Store the median for the current node: when p == self.start all the # node's data points are sent to the right child, so the current node # median value and the right child median value would be equal. self.node_medians[k] = self.right_medians[self.start] return 0 cdef int reverse_reset(self) except -1 nogil: """For this class, this method is never called.""" raise NotImplementedError("This method is not implemented for this subclass") cdef int update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. new_pos is guaranteed to be greater than pos. Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. 
        Time complexity: O(new_pos - pos) (which usually is O(1), at least
        for dense data).
        """
        cdef intp_t pos = self.pos
        cdef intp_t i, p
        cdef float64_t w = 1.0

        # Update statistics up to new_pos
        for p in range(pos, new_pos):
            i = self.sample_indices[p]
            if self.sample_weight is not None:
                w = self.sample_weight[i]
            self.weighted_n_left += w
        self.weighted_n_right = (self.weighted_n_node_samples -
                                 self.weighted_n_left)
        self.pos = new_pos
        return 0

    cdef void node_value(self, float64_t* dest) noexcept nogil:
        """Computes the node value of sample_indices[start:end] into dest."""
        cdef intp_t k
        for k in range(self.n_outputs):
            dest[k] = self.node_medians[k]

    cdef inline float64_t middle_value(self) noexcept nogil:
        """Compute the middle value of a split for monotonicity constraints as the
        simple average of the left and right children values.

        Monotonicity constraints are only supported for single-output trees,
        so we can safely assume n_outputs == 1.
        """
        return (
            self.left_medians[self.pos - 1]
            + self.right_medians[self.pos]
        ) / 2

    cdef inline bint check_monotonicity(
        self,
        cnp.int8_t monotonic_cst,
        float64_t lower_bound,
        float64_t upper_bound,
    ) noexcept nogil:
        """Check monotonicity constraint is satisfied at the current regression split"""
        return self._check_monotonicity(
            monotonic_cst, lower_bound, upper_bound,
            self.left_medians[self.pos - 1], self.right_medians[self.pos])

    cdef float64_t node_impurity(self) noexcept nogil:
        """Evaluate the impurity of the current node.

        Evaluate the MAE criterion as impurity of the current node,
        i.e. the impurity of sample_indices[start:end]. The smaller the impurity the
        better.

        Time complexity: O(1) (precomputed in `.reset()`)
        """
        # The precomputed arrays are indexed by the absolute position `p` in
        # `sample_indices`; at p == start the right child covers the whole node.
        return (
            self.right_abs_errors[self.start]
            / (self.weighted_n_node_samples * self.n_outputs)
        )

    cdef void children_impurity(self, float64_t* p_impurity_left,
                                float64_t* p_impurity_right) noexcept nogil:
        """Evaluate the impurity in children nodes.

        i.e. the impurity of the left child (sample_indices[start:pos]) and the
        impurity of the right child (sample_indices[pos:end]).

        Time complexity: O(1) (precomputed in `.reset()`)
        """
        cdef float64_t impurity_left = 0.0
        cdef float64_t impurity_right = 0.0

        # if pos == start, left child is empty, hence impurity is 0
        if self.pos > self.start:
            impurity_left += self.left_abs_errors[self.pos - 1]
        p_impurity_left[0] = impurity_left / (self.weighted_n_left * self.n_outputs)

        # if pos == end, right child is empty, hence impurity is 0
        if self.pos < self.end:
            impurity_right += self.right_abs_errors[self.pos]
        p_impurity_right[0] = impurity_right / (self.weighted_n_right * self.n_outputs)

    # These two methods are copied from the RegressionCriterion abstract class:
    def __reduce__(self):
        return (type(self), (self.n_outputs, self.n_samples), self.__getstate__())

    cdef inline void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil:
        """Clip the value in dest between lower_bound and upper_bound for
        monotonic constraints."""
        if dest[0] < lower_bound:
            dest[0] = lower_bound
        elif dest[0] > upper_bound:
            dest[0] = upper_bound


cdef class FriedmanMSE(MSE):
    """Mean squared error impurity criterion with improvement score by Friedman.

    Uses the formula (35) in Friedman's original Gradient Boosting paper:

        diff = mean_left - mean_right
        improvement = n_left * n_right * diff^2 / (n_left + n_right)
    """

    cdef float64_t proxy_impurity_improvement(self) noexcept nogil:
        """Compute a proxy of the impurity reduction.

        This method is used to speed up the search for the best split.
        It is a proxy quantity such that the split that maximizes this value
        also maximizes the impurity improvement. It neglects all constant terms
        of the impurity decrease for a given split.

        The absolute impurity improvement is only computed by the
        impurity_improvement method once the best split has been found.
        """
        cdef float64_t total_sum_left = 0.0
        cdef float64_t total_sum_right = 0.0

        cdef intp_t k
        cdef float64_t diff = 0.0

        for k in range(self.n_outputs):
            total_sum_left += self.sum_left[k]
            total_sum_right += self.sum_right[k]

        diff = (self.weighted_n_right * total_sum_left -
                self.weighted_n_left * total_sum_right)

        return diff * diff / (self.weighted_n_left * self.weighted_n_right)

    cdef float64_t impurity_improvement(self, float64_t impurity_parent, float64_t
                                        impurity_left, float64_t impurity_right) noexcept nogil:
        # Note: none of the arguments are used here
        cdef float64_t total_sum_left = 0.0
        cdef float64_t total_sum_right = 0.0

        cdef intp_t k
        cdef float64_t diff = 0.0

        for k in range(self.n_outputs):
            total_sum_left += self.sum_left[k]
            total_sum_right += self.sum_right[k]

        diff = (self.weighted_n_right * total_sum_left -
                self.weighted_n_left * total_sum_right) / self.n_outputs

        return (diff * diff / (self.weighted_n_left * self.weighted_n_right *
                               self.weighted_n_node_samples))


cdef class Poisson(RegressionCriterion):
    """Half Poisson deviance as impurity criterion.

        Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true)

    Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)`
    at the leaves, one always has `sum(y_pred - y_true) = 0`. Therefore, the
    implemented impurity (with the factor 2 dropped) is:

        1/n * sum(y_true * log(y_true/y_pred))
    """
    # FIXME in 1.0:
    # min_impurity_split with default = 0 forces us to use a non-negative
    # impurity like the Poisson deviance. Without this restriction, one could
    # throw away the 'constant' term sum(y_true * log(y_true)) and just use
    # Poisson loss = - 1/n * sum(y_true * log(y_pred))
    #              = - 1/n * sum(y_true * log(mean(y_true)))
    #              = - mean(y_true) * log(mean(y_true))
    # With this trick (used in proxy_impurity_improvement()), as for MSE,
    # children_impurity would only need to go over left xor right split, not
    # both. This could be faster.

    cdef float64_t node_impurity(self) noexcept nogil:
        """Evaluate the impurity of the current node.

        Evaluate the Poisson criterion as impurity of the current node,
        i.e. the impurity of sample_indices[start:end]. The smaller the impurity the
        better.
        """
        return self.poisson_loss(self.start, self.end, self.sum_total,
                                 self.weighted_n_node_samples)

    cdef float64_t proxy_impurity_improvement(self) noexcept nogil:
        """Compute a proxy of the impurity reduction.

        This method is used to speed up the search for the best split.
        It is a proxy quantity such that the split that maximizes this value
        also maximizes the impurity improvement. It neglects all constant terms
        of the impurity decrease for a given split.

        The absolute impurity improvement is only computed by the
        impurity_improvement method once the best split has been found.
The Poisson proxy is derived from: sum_{i left }(y_i * log(y_i / y_pred_L)) + sum_{i right}(y_i * log(y_i / y_pred_R)) = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i)) - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i)) Neglecting constant terms, this gives - sum{i left }(y_i) * log(mean{i left}(y_i)) - sum{i right}(y_i) * log(mean{i right}(y_i)) """ cdef intp_t k cdef float64_t proxy_impurity_left = 0.0 cdef float64_t proxy_impurity_right = 0.0 cdef float64_t y_mean_left = 0. cdef float64_t y_mean_right = 0. for k in range(self.n_outputs): if (self.sum_left[k] <= EPSILON) or (self.sum_right[k] <= EPSILON): # Poisson loss does not allow non-positive predictions. We # therefore forbid splits that have child nodes with # sum(y_i) <= 0. # Since sum_right = sum_total - sum_left, it can lead to # floating point rounding error and will not give zero. Thus, # we relax the above comparison to sum(y_i) <= EPSILON. return -INFINITY else: y_mean_left = self.sum_left[k] / self.weighted_n_left y_mean_right = self.sum_right[k] / self.weighted_n_right proxy_impurity_left -= self.sum_left[k] * log(y_mean_left) proxy_impurity_right -= self.sum_right[k] * log(y_mean_right) return - proxy_impurity_left - proxy_impurity_right cdef void children_impurity(self, float64_t* impurity_left, float64_t* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity of the right child (sample_indices[pos:end]) for Poisson. """ cdef intp_t start = self.start cdef intp_t pos = self.pos cdef intp_t end = self.end impurity_left[0] = self.poisson_loss(start, pos, self.sum_left, self.weighted_n_left) impurity_right[0] = self.poisson_loss(pos, end, self.sum_right, self.weighted_n_right) cdef inline float64_t poisson_loss( self, intp_t start, intp_t end, const float64_t[::1] y_sum, float64_t weight_sum ) noexcept nogil: """Helper function to compute Poisson loss (~deviance) of a given node. """ cdef const float64_t[:, ::1] y = self.y cdef const float64_t[:] sample_weight = self.sample_weight cdef const intp_t[:] sample_indices = self.sample_indices cdef float64_t y_mean = 0. cdef float64_t poisson_loss = 0. cdef float64_t w = 1.0 cdef intp_t i, k, p cdef intp_t n_outputs = self.n_outputs for k in range(n_outputs): if y_sum[k] <= EPSILON: # y_sum could be computed from the subtraction # sum_right = sum_total - sum_left leading to a potential # floating point rounding error. # Thus, we relax the comparison y_sum <= 0 to # y_sum <= EPSILON. return INFINITY y_mean = y_sum[k] / weight_sum for p in range(start, end): i = sample_indices[p] if sample_weight is not None: w = sample_weight[i] poisson_loss += w * xlogy(y[i, k], y[i, k] / y_mean) return poisson_loss / (weight_sum * n_outputs)
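

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the scikit-learn API and not used by the
# tree builder): a plain NumPy reference for the weighted-median /
# pinball-loss identity that `precompute_absolute_errors` relies on. The
# helper name `_example_weighted_median_abs_error` is ours, and the function
# mirrors a single step of the forward pass under the same mid-point
# convention for exact half-weight matches.
def _example_weighted_median_abs_error(y, w):
    """Return (median, abs_error) for targets ``y`` with positive weights ``w``.

    >>> _example_weighted_median_abs_error([1., 3., 8.], [1., 1., 2.])
    (5.5, 12.0)
    """
    y = np.asarray(y, dtype=np.float64)
    w = np.asarray(w, dtype=np.float64)
    order = np.argsort(y)
    y_sorted, w_sorted = y[order], w[order]
    cum_w = np.cumsum(w_sorted)
    cum_wy = np.cumsum(w_sorted * y_sorted)
    half_weight = 0.5 * cum_w[-1]
    # Smallest rank whose cumulative weight exceeds half the total weight.
    rank = int(np.searchsorted(cum_w, half_weight, side="right"))
    if rank > 0 and cum_w[rank - 1] == half_weight:
        # The cumulative weight hits exactly half_weight between two
        # consecutive ranks: take the mid-point, as in
        # `precompute_absolute_errors`.
        median = 0.5 * (y_sorted[rank - 1] + y_sorted[rank])
    else:
        median = y_sorted[rank]
    # Prefix sums up to (and excluding) `rank`, and their right-hand
    # complements, as maintained by the Fenwick tree.
    w_left = cum_w[rank - 1] if rank > 0 else 0.0
    wy_left = cum_wy[rank - 1] if rank > 0 else 0.0
    w_right = cum_w[-1] - w_left
    wy_right = cum_wy[-1] - wy_left
    # Pinball-loss identity: weighted absolute error around the median.
    abs_error = (wy_right - median * w_right) + (median * w_left - wy_left)
    assert np.isclose(abs_error, np.sum(w * np.abs(y - median)))
    return float(median), float(abs_error)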