# Copyright (c) 2022 Pratyush Kumar, Abhinav Prakash, and Yu Ding
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
class BayesTreePowerCurve(object):
    """
    Tree-based power curve model backed by ``bartpy.sklearnmodel.SklearnModel``.

    Input features are always standardized before fitting. The target is
    standardized as well, unless every target value is integer-valued, in
    which case it is cast to ``int`` and passed to the model unscaled.

    Parameters
    ----------
    n_trees: int
        Number of trees to use. An integer greater than 0.

    Raises
    ------
    ValueError
        If ``n_trees`` is not an ``int`` or is not greater than 0.
    """

    def __init__(self, n_trees=200):
        try:
            # Probe for the optional dependency so the user is warned early;
            # the model itself is only constructed inside fit().
            from bartpy.sklearnmodel import SklearnModel  # noqa: F401
        except ImportError:
            print(
                "To run this model, please install the bartpy python library. "
                "You can see more details about it on the installation page "
                "of DSWE package."
            )

        if not isinstance(n_trees, int):
            raise ValueError("The number of trees must be an integer value.")
        if n_trees <= 0:
            raise ValueError("The number of trees must be greater than 0.")

        self.n_trees = n_trees

    def fit(self, X_train, y_train):
        """
        Standardize the training data and fit the underlying tree model.

        Parameters
        ----------
        X_train: list, np.ndarray, pd.Series or pd.DataFrame
            A matrix or dataframe of input variable values in the training
            dataset. A one-dimensional input is treated as a single feature.

        y_train: list, np.ndarray, pd.Series or pd.DataFrame
            Numeric response values in the training dataset. A single-column
            two-dimensional input is flattened to one dimension.

        Returns
        -------
        BayesTreePowerCurve
            self with trained parameter values.

        Raises
        ------
        ValueError
            If an input has an unsupported type or the two inputs differ in
            length.
        """
        # Validate before touching the optional dependency so that bad inputs
        # are reported as such even when bartpy is unavailable.
        if not isinstance(X_train, (list, pd.DataFrame, pd.Series, np.ndarray)):
            raise ValueError(
                "The X_train should be either a list or numpy array or dataframe.")

        if not isinstance(y_train, (list, np.ndarray, pd.Series, pd.DataFrame)):
            raise ValueError(
                "The target data should be either a list or numpy array or dataframe.")

        if len(X_train) != len(y_train):
            raise ValueError(
                "The X_train and y_train should have same number of data points.")

        # Imported lazily so the class can be constructed without bartpy.
        from bartpy.sklearnmodel import SklearnModel

        features = np.array(X_train)
        if features.ndim == 1:
            features = features.reshape(-1, 1)

        target = np.array(y_train)
        if target.ndim == 2 and target.shape[1] == 1:
            # A column vector (e.g. a one-column DataFrame) is a 1-D target.
            target = target.ravel()

        # Scale the features.
        self.scale_features = StandardScaler()
        self.X_train = self.scale_features.fit_transform(features)

        # Integer-valued targets are treated as discrete and left unscaled.
        self.is_discrete = bool((target == target.astype(int)).all())
        if self.is_discrete:
            self.y_train = target.astype(int)
        else:
            # Continuous target: scale it. ravel() (not squeeze()) keeps the
            # result one-dimensional even for a single sample.
            self.scale_target = StandardScaler()
            self.y_train = self.scale_target.fit_transform(
                target.reshape(-1, 1)).ravel()

        self.model = SklearnModel(self.n_trees)
        self.model.fit(self.X_train, self.y_train)

        return self

    def predict(self, X_test):
        """
        Predict the response at new input points.

        Parameters
        ----------
        X_test: list, np.ndarray, pd.Series or pd.DataFrame
            A matrix or dataframe of test input variable values to compute
            predictions. A one-dimensional input is treated as a single
            feature.

        Returns
        -------
        np.array
            A numeric array for predictions at the data points in X_test.
            For a continuous target the predictions are mapped back to the
            original target scale and returned as a one-dimensional array.

        Raises
        ------
        ValueError
            If ``X_test`` has an unsupported type or a different number of
            features than the training data.
        AttributeError
            If the model has not been fitted yet.
        """
        if not isinstance(X_test, (list, pd.DataFrame, pd.Series, np.ndarray)):
            raise ValueError(
                "The X_test should be either a list or numpy array or dataframe.")

        if not hasattr(self, "model"):
            raise AttributeError(
                "The model must be fitted with fit() before calling predict().")

        features = np.array(X_test)
        if features.ndim == 1:
            features = features.reshape(-1, 1)

        # X_train is always stored two-dimensional by fit().
        if features.shape[1] != self.X_train.shape[1]:
            raise ValueError(
                "The number of features in train and test set must be same.")

        # Apply the same scaling that was learned on the training features.
        features = self.scale_features.transform(features)
        predictions = self.model.predict(features)

        if not self.is_discrete:
            # Undo the target scaling; ravel() keeps a single prediction as a
            # length-1 array, matching the discrete branch.
            predictions = self.scale_target.inverse_transform(
                predictions.reshape(-1, 1)).ravel()

        return predictions