subsampling_opt_src
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.gaussian_process.kernels import Matern, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors, cm
from math import log, floor, sqrt
copper = mpl.cm.copper
cNorm = colors.Normalize(vmin=-0.9, vmax=-0.25)
scalarMap = cm.ScalarMappable(norm=cNorm, cmap=copper)
Utilities to estimate the allocation of compute time: cost models, budget splits, sample sizes, …
def cost_per_model(pct, algo='rf'):
    '''Returns how many models can be trained on a `pct` fraction of the data
    for the cost of one model trained on the full dataset (100%).'''
    x = list(range(1, 101))
    idx = max(1, min(100, round(pct * 100)))  # round to guard against float error, e.g. size() returning 0.6499999... instead of 0.65
    if algo == 'rf':
        # random forests scale roughly as O(n log n) per tree
        nlogn = [i * log(i) for i in x]
        return nlogn[99] / nlogn[idx - 1]
    if algo == 'svm':
        # kernel SVMs scale roughly as O(n^2)
        n_n = [i * i for i in x]
        return n_n[99] / n_n[idx - 1]
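A quick sanity check of the cost model (illustrative only; it assumes the n log n and n^2 proxies above are reasonable):
# How many models trained on a data fraction fit into the cost of one full-dataset model?
for pct in (0.3, 0.5, 1.0):
    print(f"rf  @ {pct:.0%}: {cost_per_model(pct, 'rf'):.2f} models per full-size model")
    print(f"svm @ {pct:.0%}: {cost_per_model(pct, 'svm'):.2f} models per full-size model")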
def budget_division(budget, how='equal', steps=3, lower=0.4):
    '''Returns how the budget should be divided among the different sample-size steps.
    (`lower` is accepted for call-site symmetry with size() but is not used here.)'''
    def normalizing_factor(lst, budget):
        '''Scaling factor X such that sum(lst) * X == budget.'''
        return budget / sum(lst)
    if how == 'equal':
        return [int(budget / steps) for _ in range(steps)]
    else:
        # harmonic slices (1, 1/2, 1/3, ...), rescaled so they sum to the total budget
        slices = [budget / (1 + s) for s in range(steps)]
        norm_factor = normalizing_factor(slices, budget)
        normalized_slices = [norm_factor * s for s in slices]
        if how == 'linear_asc':
            return normalized_slices[::-1]  # smallest share first, largest share last
        if how == 'linear_desc':
            return normalized_slices        # largest share first, smallest share last
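A small illustration of the three budget-splitting strategies (values are per-level budgets, in units of full-size model trainings):
# Compare the per-level budget produced by each splitting strategy.
for how in ('equal', 'linear_asc', 'linear_desc'):
    print(how, [round(s, 1) for s in budget_division(100, how=how, steps=3)])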
def models_at_sample_size(budget, sample_size, algo):
    '''Given a budget (in units of full-dataset trainings) and a sample fraction,
    returns the number of models that can be trained.'''
    return int(budget * cost_per_model(sample_size, algo))
def size(i, lower=0.4, steps=3):
    '''Sample fraction for step i, linearly interpolated from `lower` (step 0) up to 1.0 (last step).'''
    return lower + i * (1 - lower) / (steps - 1)
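Putting the utilities together: with an equal split, the sample fraction per level and the number of random-forest models that budget buys.
# Example: 3 levels, smallest sample at 30% of the data, total budget of 100 full-size trainings.
for i, b in enumerate(budget_division(100, how='equal', steps=3)):
    frac = size(i, lower=0.3, steps=3)
    print(f"level {i + 1}: {frac:.0%} of the data -> {models_at_sample_size(b, frac, 'rf')} models")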
def get_data():
"""Synthetic binary classification dataset."""
data, targets = make_classification(
n_samples=5_000,
n_features=22,
n_informative=12,
n_redundant=4,
random_state=0,
)
return data, targets
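A quick look at the synthetic dataset (shape and class balance):
# Illustrative check; make_classification with these settings gives a roughly balanced binary target.
X_demo, y_demo = get_data()
print(X_demo.shape, np.bincount(y_demo))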
Function to optimize: mean 3-fold cross-validated negative log-loss of a random forest.
def rfc_cv(n_estimators, min_samples_split, max_features, data, targets):
    '''3-fold cross-validated negative log-loss of a random forest with the given hyperparameters.'''
    estimator = RFC(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        max_features=max_features,
        random_state=2,
    )
    cval = cross_val_score(estimator, data, targets,
                           scoring='neg_log_loss', cv=3)
    return cval.mean()
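A single evaluation looks like this (illustrative; the score is a negative log-loss, so values closer to 0 are better):
# One-off evaluation of a single hyperparameter configuration (not part of the search itself).
X_demo, y_demo = get_data()
print(rfc_cv(n_estimators=50, min_samples_split=0.05, max_features=0.5,
             data=X_demo, targets=y_demo))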
# Number of top points to carry forward and probe at the next level.
# Something dynamic like sqrt(observations) could do the job but would need additional control mechanisms.
n_points = 5
# Alternative kernels to experiment with as the GP prior (only Matern, imported above, is used below).
from sklearn.gaussian_process.kernels import (RBF, RationalQuadratic,
                                              ExpSineSquared, DotProduct,
                                              ConstantKernel)
def points_to_probe(optimizer):
    '''Plots the observed points in 3D and returns the top n_points configurations to probe at the next level.'''
    x0_obs = np.array([res["params"]["max_features"] for res in optimizer.res])
    x1_obs = np.array([res["params"]["min_samples_split"] for res in optimizer.res])
    x2_obs = np.array([res["params"]["n_estimators"] for res in optimizer.res])
    y_obs = np.array([res["target"] for res in optimizer.res])
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(x0_obs, x1_obs, x2_obs, c=scalarMap.to_rgba(y_obs), alpha=0.5)
    ax.set_xlabel('max_features')
    ax.set_ylabel('min_samples_split')
    ax.set_zlabel('n_estimators')
    fig.colorbar(scalarMap, ax=ax)
    plt.tight_layout()
    plt.show()
    #plt.matshow(optimizer._gp.L_)
    #plt.title('Lower-triangular Cholesky decomposition of cov')
    #plt.show()
    # indices of the n_points best observations, best first
    idx = y_obs.argsort()[-n_points:][::-1]
    probe = [[float(x0_obs[i]), float(x1_obs[i]), float(x2_obs[i])] for i in idx]
    return probe
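Note that passing positional lists to optimizer.probe relies on bayes_opt keeping parameters in alphabetical key order (max_features, min_samples_split, n_estimators), which the list above happens to match; a sketch of an order-independent alternative (hypothetical helper, not used below) returns explicit dicts instead.
# Sketch: return the top configurations as dicts so probing does not depend on key ordering.
def points_to_probe_as_dicts(optimizer, n_points=5):
    best = sorted(optimizer.res, key=lambda r: r["target"], reverse=True)[:n_points]
    return [dict(r["params"]) for r in best]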
def optimize_rfc(data, targets, level, cov_function_prior, n_iter=0, bounds=None, to_probe=None):
    """
    level: index + 1 of the sample size in [pct0, pct1, .., pctN].
    cov_function_prior: covariance function used as the prior of the Gaussian process regressor; it is updated after every level.
    n_iter: number of models to evaluate at this sample size, constrained by the total budget.
    bounds: updated boundaries for the hyperparameter space (currently unused).
    to_probe: promising points found at smaller sample sizes.
    """
def rfc_crossval(n_estimators, min_samples_split, max_features):
return rfc_cv(
n_estimators=int(n_estimators),
min_samples_split=float(min_samples_split),
max_features=max(min(max_features, 0.999), 1e-3),
data=data,
targets=targets,
)
optimizer = BayesianOptimization(
f=rfc_crossval,
pbounds={
"n_estimators": (10, 250),
"min_samples_split": (0.01, 0.999),
"max_features": (0.1, 0.999),
},
random_state=1234,
verbose=1
)
# model noise in each sample size (level)
optimizer._gp.kernel = cov_function_prior + WhiteKernel(noise_level=0.01/(level + 1))
    if to_probe:  # also guards against the default None
for point in to_probe:
optimizer.probe(
params=point,
lazy=True,
)
# control structure to constrain compute budget
    if level == 1:
        init_points = 2  # minimum number of randomly generated points needed to start inference
    else:
        init_points = 0
        n_iter -= n_points  # probed points from the previous level count against this level's budget
    optimizer.maximize(init_points=init_points, n_iter=n_iter, acq="ucb", kappa=20/level)  # exploration (kappa) shrinks as the sample size grows
#print(Colours.yellow(f'Prior kernel: {optimizer._gp.kernel}'))
#print(Colours.purple(f'Posterior kernel: {optimizer._gp.kernel_}'))
cov_function_posterior = optimizer._gp.kernel_
return points_to_probe(optimizer), cov_function_posterior
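The warm start between levels relies on scikit-learn exposing the fitted kernel (with optimized hyperparameters) as kernel_; below is a minimal sketch of the same pattern on toy data, outside of bayes_opt.
# Illustrative only: fit a GP on toy data, then reuse its fitted kernel as the prior of the next GP.
rng = np.random.RandomState(0)
X_small = rng.uniform(size=(20, 3))
y_small = X_small.sum(axis=1) + 0.1 * rng.randn(20)
gp = GaussianProcessRegressor(kernel=Matern(nu=2.5) + WhiteKernel(noise_level=0.01), normalize_y=True)
gp.fit(X_small, y_small)
print(gp.kernel_)  # length scales and noise level tuned on the small sample
gp_next = GaussianProcessRegressor(kernel=gp.kernel_, normalize_y=True)  # prior for the next level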
data, targets = get_data()
lower = 0.3   # smallest fraction of the dataset to sample
steps = 3     # number of sample sizes to experiment with (linearly interpolated between `lower` and 1.0)
budget = 100  # total budget, in units of full-dataset model trainings
bounds = None # updated hyperparameter bounds (currently unused)
to_probe = []
cov_function_prior = Matern(nu=2.5)
for level, b in enumerate(budget_division(budget, how='equal', steps=steps, lower=lower)):
sample_size = size(level, lower, steps)
n_iter = models_at_sample_size(b, sample_size, 'rf')
# sampling
rows = int(len(data) * sample_size)
idx = np.random.choice(len(data), rows, replace=False)
sampled_X = data[idx,:]
sampled_y = targets[idx]
print(Colours.green(f"--- Optimizing Random Forest: {n_iter} models; budget: {b} --- "))
to_probe, cov_function_posterior = optimize_rfc(sampled_X, sampled_y, level + 1, cov_function_prior, n_iter, bounds, to_probe)
cov_function_prior = cov_function_posterior
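After the last level, to_probe holds the most promising configurations found on the full-size sample, best first; one possible way to finish, sketched below, is to refit a single random forest on all the data with the top entry.
# Sketch: train a final model on the full dataset using the best configuration from the last level.
# Entries in to_probe follow the [max_features, min_samples_split, n_estimators] ordering used above.
best_max_features, best_min_samples_split, best_n_estimators = to_probe[0]
final_model = RFC(
    n_estimators=int(best_n_estimators),
    min_samples_split=float(best_min_samples_split),
    max_features=float(best_max_features),
    random_state=2,
)
final_model.fit(data, targets)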