From 3fd2d954f4d19fd8d17d818ab0a37b9d9e18fb1d Mon Sep 17 00:00:00 2001
From: hashlag <90853356+hashlag@users.noreply.github.com>
Date: Sat, 3 Feb 2024 22:45:50 +0300
Subject: [PATCH 1/2] add regressor

---
 demo/regressor_demo.py                        | 49 ++++++++++++++++++
 neighbours/__init__.py                        |  3 +-
 .../{knn_classifier.py => classifier.py}      |  0
 neighbours/regressor.py                       | 51 +++++++++++++++++++
 4 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 demo/regressor_demo.py
 rename neighbours/{knn_classifier.py => classifier.py} (100%)
 create mode 100644 neighbours/regressor.py

diff --git a/demo/regressor_demo.py b/demo/regressor_demo.py
new file mode 100644
index 0000000..6d25286
--- /dev/null
+++ b/demo/regressor_demo.py
@@ -0,0 +1,49 @@
+import matplotlib.pyplot as plt
+import neighbours as ns
+
+import numpy as np
+import random
+import math
+
+
+# function for generating a synthetic regression problem
+def f(x):
+    """Return the noise-free demo target: cos(0.1 * x) for x <= 40, log2(x) - 6 above that."""
+    if x > 40:
+        return math.log2(x) - 6
+    return math.cos(x * 0.1)
+
+
+# training inputs: every integer from 1 to 99 shifted by uniform noise from [-1, 1]
+X = [[base + random.uniform(-1, 1)] for base in np.arange(start=1, stop=100, step=1)]
+
+# training targets: f at each input plus uniform noise from [-0.1, 0.1]
+y = [f(row[0]) + random.uniform(-0.1, 0.1) for row in X]
+
+# switch both containers to numpy arrays
+X, y = np.array(X), np.array(y)
+
+# dense grid on [0, 100) for drawing the fitted curve,
+# viewed as a column so that every row is a single-feature sample
+x_points = np.arange(start=0, stop=100, step=0.1)
+X_demo = x_points.reshape(-1, 1)
+
+# build a regressor (features=1, trees_count=10, rpt_m=7) and feed it the train data
+regressor = ns.KNNRegressor(1, 10, 7)
+regressor.load(X, y)
+
+# predict y for every grid sample using euclidean distance,
+# a gaussian kernel and bandwidth h=3
+y_predicted = [
+    regressor.predict(sample, ns.distance.euclidean, ns.kernel.gaussian, 3)
+    for sample in X_demo
+]
+
+# train points as blue dots
+plt.plot(X, y, 'bo')
+
+# predicted curve as a red line
+plt.plot(x_points, y_predicted, 'r')
+
+# display the figure
+plt.show()
diff --git a/neighbours/__init__.py b/neighbours/__init__.py
index 1644ecb..caf8708 100644
--- a/neighbours/__init__.py
+++ b/neighbours/__init__.py
@@ -1,3 +1,4 @@
-from .knn_classifier import *
+from .classifier import *
+from .regressor import *
 from . import distance
 from . import kernel
diff --git a/neighbours/knn_classifier.py b/neighbours/classifier.py
similarity index 100%
rename from neighbours/knn_classifier.py
rename to neighbours/classifier.py
diff --git a/neighbours/regressor.py b/neighbours/regressor.py
new file mode 100644
index 0000000..9ae3ec8
--- /dev/null
+++ b/neighbours/regressor.py
@@ -0,0 +1,51 @@
+import numpy as np
+
+from .rp_neighbours import *
+from .exceptions import *
+
+
+class KNNRegressor:
+    def __init__(self, features, trees_count, rpt_m):
+        self.features = features
+        self.forest = RPTForest(features, trees_count, rpt_m)
+        self.targets = None
+
+    def load(self, points, targets):
+        """Loads train data, builds a corresponding forest
+
+        :param points: two-dimensional np.ndarray of train samples, one row per sample
+        :param targets: np.ndarray or list of target values corresponding to loaded train points
+        :raises InvalidType: if points is not an np.ndarray or targets is neither np.ndarray nor list
+        :raises InvalidDimensionError: if points is not two-dimensional or its row width differs from self.features
+        """
+
+        if not isinstance(points, np.ndarray):
+            raise InvalidType("points should be represented as np.ndarray")
+
+        if not isinstance(targets, (np.ndarray, list)):
+            raise InvalidType("targets should be represented as np.ndarray or list")
+
+        if points.ndim != 2:
+            raise InvalidDimensionError("points array should be two-dimensional")
+
+        if points.shape[1] != self.features:
+            message = "invalid number of features in sample (expected {}, got {})"
+            raise InvalidDimensionError(message.format(self.features, points.shape[1]))
+
+        self.targets = targets  # assigned only after the checks above, so a rejected call keeps the old targets
+        self.forest.load(points)
+
+    def predict(self, point: np.ndarray, distance, kernel, h):
+        nearest_point_indexes = self.forest.get_neighbours(point)
+
+        # Nadaraya-Watson estimator
+
+        numerator = float(0)
+        denominator = float(0.0000001)
+
+        for point_ix in nearest_point_indexes:
+            weight = kernel(distance(point, self.forest.get_point(point_ix)) / h)
+            numerator += weight * self.targets[point_ix]
+            denominator += weight
+
+        return numerator / denominator

From f9f724fa7d920474982e38811e3b2677f61155d0 Mon Sep 17 00:00:00 2001
From: hashlag <90853356+hashlag@users.noreply.github.com>
Date: Sun, 4 Feb 2024 17:51:50 +0300
Subject: [PATCH 2/2] upd regressor

---
 neighbours/regressor.py | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/neighbours/regressor.py b/neighbours/regressor.py
index 9ae3ec8..119b6c8 100644
--- a/neighbours/regressor.py
+++ b/neighbours/regressor.py
@@ -5,7 +5,26 @@ from .exceptions import *
 
 
 class KNNRegressor:
+    """K-nearest neighbors regressor
+
+    Nadaraya-Watson kNN regressor based on random projection forest.
+
+    Supports different (including custom) smoothing kernels and distance metrics.
+
+    Attributes:
+        features: number of features in each sample
+        forest: an instance of RPTForest
+        targets: an array of target values corresponding to loaded train points
+    """
+
     def __init__(self, features, trees_count, rpt_m):
+        """Initializes new regressor
+
+        :param features: number of features in each sample
+        :param trees_count: number of trees in the forest
+        :param rpt_m: maximum number of samples in one leaf of an RP tree
+        """
+
         self.features = features
         self.forest = RPTForest(features, trees_count, rpt_m)
         self.targets = None
@@ -36,16 +55,25 @@ class KNNRegressor:
         self.forest.load(points)
 
     def predict(self, point: np.ndarray, distance, kernel, h):
+        """Predict target value for given point
+
+        :param point: query point as np.ndarray
+        :param distance: distance metric function (e.g., from neighbours.distance)
+        :param kernel: smoothing kernel function (e.g., from neighbours.kernel)
+        :param h: bandwidth
+        :return: predicted value, or numpy.nan if the neighbours' kernel weights sum to zero (e.g. no neighbours found)
+        """
+
         nearest_point_indexes = self.forest.get_neighbours(point)
 
         # Nadaraya-Watson estimator
 
         numerator = float(0)
-        denominator = float(0.0000001)
+        denominator = float(0)
 
         for point_ix in nearest_point_indexes:
             weight = kernel(distance(point, self.forest.get_point(point_ix)) / h)
             numerator += weight * self.targets[point_ix]
             denominator += weight
 
-        return numerator / denominator
+        return np.nan if denominator == 0 else numerator / denominator