"""Demo of ``neighbours.KNNRegressor`` on a synthetic one-dimensional problem.

Originally ``demo/regressor_demo.py``. Generates noisy samples of a piecewise
function, fits a kNN regressor to them and plots the train points together
with the predicted curve.
"""

import math
import random

import numpy as np


# function for generating a synthetic regression problem
def f(x):
    """Return the noiseless target value of the synthetic problem at ``x``.

    The function is piecewise: ``log2(x) - 6`` for ``x > 40`` and
    ``cos(0.1 * x)`` otherwise.

    :param x: x coordinate (a real number)
    :return: value of the function at ``x``
    """

    if x > 40:
        # math.log2 is exact for powers of two, unlike math.log(x, 2),
        # which divides two rounded natural logarithms
        return math.log2(x) - 6

    return math.cos(x * 0.1)


def main():
    """Generate train data, fit the regressor and show the resulting plot."""

    # imported here so that importing this module (e.g. to reuse ``f``)
    # needs neither a plotting backend nor the package under demonstration
    import matplotlib.pyplot as plt
    import neighbours as ns

    # generate x coordinates: one jittered sample around each integer in [1, 100)
    X = np.array(
        [[i + random.uniform(-1, 1)] for i in np.arange(start=1, stop=100, step=1)]
    )

    # calculate corresponding y coordinates, adding a small uniform noise
    y = np.array([f(sample[0]) + random.uniform(-0.1, 0.1) for sample in X])

    # generate x coordinates for demo plot (one single-feature sample per row)
    x_points = np.arange(start=0, stop=100, step=0.1)
    X_demo = x_points.reshape(-1, 1)

    # create a regressor then load data
    regressor = ns.KNNRegressor(features=1, trees_count=10, rpt_m=7)
    regressor.load(X, y)

    # get predictions for all samples in X_demo (bandwidth h = 3)
    y_predicted = [
        regressor.predict(sample, ns.distance.euclidean, ns.kernel.gaussian, 3)
        for sample in X_demo
    ]

    # plot train points
    plt.plot(X, y, 'bo')

    # plot predicted y against x
    plt.plot(x_points, y_predicted, 'r')

    plt.show()


if __name__ == "__main__":
    main()
# neighbours/regressor.py (state after PATCH 2/2).
# NOTE: this class relies on the module-level imports of that file: numpy as
# ``np`` plus the star-imports from ``.rp_neighbours`` and ``.exceptions``,
# which provide RPTForest, InvalidType and InvalidDimensionError.


class KNNRegressor:
    """K-nearest neighbors regressor

    Nadaraya-Watson kNN regressor based on random projection forest.

    Supports different (including custom) smoothing kernels and distance metrics.

    Attributes:
        features: number of features in each sample
        forest: an instance of RPTForest
        targets: an array of target values corresponding to loaded train points
            (None until data is loaded)
    """

    def __init__(self, features, trees_count, rpt_m):
        """Initializes new regressor

        :param features: number of features in each sample
        :param trees_count: number of trees in the forest
        :param rpt_m: maximum number of samples in one leaf of an RP tree
        """

        self.features = features
        self.forest = RPTForest(features, trees_count, rpt_m)
        self.targets = None

    def load(self, points, targets):
        """Loads train data, builds a corresponding forest

        All arguments are validated before any state is changed, so an invalid
        call never leaves new targets paired with previously loaded points.

        :param points: two-dimensional np.ndarray of train samples (one sample per row)
        :param targets: np.ndarray or list of target values, one per train point
        :raises InvalidType: if points is not an np.ndarray or targets is
            neither an np.ndarray nor a list
        :raises InvalidDimensionError: if points is not two-dimensional, has an
            unexpected number of features, or the number of targets differs
            from the number of points
        """

        if not isinstance(points, np.ndarray):
            raise InvalidType("points should be represented as np.ndarray")

        if not isinstance(targets, (np.ndarray, list)):
            raise InvalidType("targets should be represented as np.ndarray or list")

        if points.ndim != 2:
            raise InvalidDimensionError("points array should be two-dimensional")

        if points.shape[1] != self.features:
            raise InvalidDimensionError(
                "invalid number of features in sample (expected {}, got {})".format(self.features, points.shape[1])
            )

        # a zero-dimensional array has no length and cannot be indexed per point
        if isinstance(targets, np.ndarray) and targets.ndim == 0:
            raise InvalidDimensionError("targets array should be at least one-dimensional")

        # predict() indexes targets by train point index, so the sizes must agree
        if len(targets) != points.shape[0]:
            raise InvalidDimensionError(
                "number of targets ({}) does not match number of points ({})".format(len(targets), points.shape[0])
            )

        # commit the targets only after the forest accepted the points
        self.forest.load(points)
        self.targets = targets

    def predict(self, point: np.ndarray, distance, kernel, h):
        """Predict target value for given point

        Computes the Nadaraya-Watson estimate over the neighbours returned by
        the forest: a weighted mean of their targets with weights
        ``kernel(distance(point, neighbour) / h)``.

        :param point: target point as np.ndarray
        :param distance: distance metric function (e.g., from neighbours.distance)
        :param kernel: smoothing kernel function (e.g., from neighbours.kernel)
        :param h: bandwidth; distances are divided by it, so it must be non-zero
        :return: predicted value or numpy.nan if unable to obtain a prediction
            (the weights sum to zero, e.g. no neighbours were found)
        """

        neighbour_indexes = self.forest.get_neighbours(point)

        # Nadaraya-Watson estimator

        numerator = 0.0
        denominator = 0.0

        for point_ix in neighbour_indexes:
            weight = kernel(distance(point, self.forest.get_point(point_ix)) / h)
            numerator += weight * self.targets[point_ix]
            denominator += weight

        # an exactly zero total weight means there is nothing to average
        if denominator == 0:
            return np.nan

        return numerator / denominator