Multi-Objective Bayesian Optimization with Spearmint

The first step is to start MongoDB. For that, open a new terminal and type:

In [ ]:
sudo mongod

Next, we have to write a config.json file, which contains the configuration of the BO experiment. In the folder,

Practicasl/tune_neural_network_multi-objective/bo,

you should be able to find the corresponding config.json file. This file is similar for the unconstrained case. However, it specifies the objective and the constraints to use:

In [ ]:
{
    "language"          : "PYTHON",
    "main_file"         : "wrapper",
    "grid_size"         : 1000,
    "experiment-name"   : "Tune-NN-Hypers-Multi-Objective",
    "acquisition"       : "PESM",
    "random_seed"       : 1,
    "moo_use_grid_only_to_solve_problem"       : true,
    "moo_grid_size_to_solve_problem"       : 1000,
    "pesm_use_grid_only_to_solve_problem"       : true,
    "pesm_pareto_set_size"       : 50,
    "pesm_not_constrain_predictions"       : false,
    "likelihood"        : "GAUSSIAN",
    "max_finished_jobs" : 50,
    "variables" : {
        "log_learning_rate" : {
            "type" : "FLOAT",
            "size" : 1,
            "min"  : -10,
            "max"  : -1
        },
        "log_weight_decay" : {
            "type" : "FLOAT",
            "size" : 1,
            "min"  : -10,
            "max"  : 10
        },
        "n_hidden_units" : {
            "type" : "INT",
            "size" : 1,
            "min"  : 10,
            "max"  : 50
        }
    },
    "tasks": {
      "error" : {
          "type"        : "OBJECTIVE"
      },
      "time" : {
          "type"        : "OBJECTIVE"
      }
    }
}

This file contains all the information needed by Spearmint to run the BO experiment. It specifies the variables the objective has and its ranges and types, the acquistiion function to use, etc. Below you have the code of the wrapper, which evaluates the objective. It does not do much, since all the code is found in nn.py

In [ ]:
from nn import *
  
if __name__ == '__main__':

    main(0, {u'log_learning_rate': np.array([ np.log(1e-3) ]), u'log_weight_decay': np.array([ np.log(1.0) ]),
        u'n_hidden_units': np.array([ 20 ]) })

The content of the nn.py file is given below. Spearmint will simply call the main method, which now return the two objectives. The second objective is simply that the prediction time of the network normalized by the prediction time of the fastest network (smallest one).

In [ ]:
from __future__ import absolute_import
from __future__ import print_function

import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy.stats.norm as norm
from autograd import grad
import sys

def make_nn_funs(layer_sizes, weight_decay):
    """These functions implement a standard multi-layer perceptron."""
    shapes = zip(layer_sizes[:-1], layer_sizes[1:])
    num_weights = sum((m+1)*n for m, n in shapes)

    def unpack_layers(weights):
        for m, n in shapes:
            cur_layer_weights = weights[:m*n]     .reshape((m, n))
            cur_layer_biases  = weights[m*n:m*n+n].reshape((1, n))
            yield cur_layer_weights, cur_layer_biases
            weights = weights[(m+1)*n:]

    def predictions(weights, inputs):
        for W, b in unpack_layers(weights):
            outputs = np.dot(inputs, W) + b
            inputs = np.maximum(outputs, 0.0)
        return outputs

    def loss(weights, inputs, targets, N):
        preds = predictions(weights, inputs)
        return np.sum(weights**2) * weight_decay + 1.0 * N / inputs.shape[ 0 ] * np.sum((preds - targets)**2)

    return num_weights, predictions, loss

def load_data():

    # We load the data

    data = np.loadtxt('../../data/data.txt')
    index_train = np.loadtxt('../../data/index_train_0.txt', dtype = int)
    index_validation = np.loadtxt('../../data/index_validation_0.txt', dtype = int)
    index_test = np.loadtxt('../../data/index_test_0.txt', dtype = int)
    index_features = np.loadtxt('../../data/index_features.txt', dtype = int)
    index_target = np.array([ np.loadtxt('../../data/index_target.txt', dtype = int) ])

    X_train = data[ index_train, : ][ :, index_features ]
    X_validation = data[ index_validation, : ][ :, index_features ]
    X_test = data[ index_test, : ][ :, index_features ]

    y_train = np.array(data[ index_train, index_target ], ndmin = 2).T
    y_validation = np.array(data[ index_validation, index_target ], ndmin = 2).T
    y_test = np.array(data[ index_test, index_target ], ndmin = 2).T

    # We normalize the input features

    mean_X_train = np.mean(X_train, axis = 0)
    std_X_train = np.std(X_train, axis = 0)
    std_X_train[ std_X_train == 0 ] = 1
    X_train = (X_train - mean_X_train) / std_X_train
    X_validation = (X_validation - mean_X_train) / std_X_train
    X_test = (X_test - mean_X_train) / std_X_train

    # We normalize the output variables

    mean_y_train = np.mean(y_train)
    std_y_train = np.std(y_train)

    y_train = (y_train - mean_y_train) / std_y_train
    y_validation = (y_validation - mean_y_train) / std_y_train
    y_test = (y_test - mean_y_train) / std_y_train

    return X_train, X_validation, X_test, y_train, y_validation, y_test, mean_y_train, std_y_train

def main(job_id, params, test = False):

    #We load the data
    X_train, X_validation, X_test, y_train, y_validation, y_test, mean_y_train, std_y_train = load_data()

    if test:
        X_train = np.concatenate((X_train, X_validation), 0)
        y_train = np.concatenate((y_train, y_validation), 0)

    # Implement a 3-hidden layer neural network.
    learning_rate = np.exp(params[ 'log_learning_rate' ][ 0 ])
    weight_decay = np.exp(params[ 'log_weight_decay' ][ 0 ])
    n_hidden_units = params[ 'n_hidden_units' ][ 0 ]
    num_weights, predictions, loss = \
        make_nn_funs([ X_train.shape[ 1 ], n_hidden_units, n_hidden_units, n_hidden_units, 1], weight_decay)

    gradient_objective = grad(loss)

    np.random.seed(1)
    rs = npr.RandomState(0)
    weights = 0.1 * rs.randn(num_weights)

    def print_perf(epoch, w):
        print("{0:3}|{1:15}".format(epoch, -loss(w, X_train, y_train, len(y_train))))
        sys.stdout.flush()

    # Train with sgd and adam

    def make_batches(N_data, batch_size):
        return [ slice(i, min(i + batch_size, N_data)) for i in range(0, N_data, batch_size) ]

    batch_size = 100
    batch_idxs = make_batches(X_train.shape[0], batch_size)

    m1 = 0
    m2 = 0
    beta1 = 0.9
    beta2 = 0.999
    epsilon = 1e-8
    alpha = learning_rate
    t = 0

    epochs = 50
    N = len(X_train)
    for epoch in range(epochs):
        permutation = np.random.choice(range(N), N, replace = False)
        print_perf(epoch, weights)
        for idxs in batch_idxs:
            t += 1
            grad_w = gradient_objective(weights, X_train[ permutation[ idxs ] ], y_train[ permutation[ idxs ] ], N)
            m1 = beta1 * m1 + (1 - beta1) * grad_w
            m2 = beta2 * m2 + (1 - beta2) * grad_w**2
            m1_hat = m1 / (1 - beta1**t)
            m2_hat = m2 / (1 - beta2**t)
            weights -= alpha * m1_hat / (np.sqrt(m2_hat) + epsilon)

    # We evaluate the validation error or test error

    if test:
        preds = predictions(weights, X_test)
        error = np.sqrt(np.mean((preds - y_test)**2)) * std_y_train
    else:
        preds = predictions(weights, X_validation)
        error = np.sqrt(np.mean((preds - y_validation)**2)) * std_y_train

    print("RMSE:", error)
    sys.stdout.flush()

    # We evaluate the prediction time

    X_test_time = np.random.randn(int(5e5), X_train.shape[ 1 ])
    import time

    start = time.time()
    preds = predictions(weights, X_test_time)
    end = time.time()

    elapsed_time_candidate = end - start

    _, predictions, _= \
        make_nn_funs([ X_train.shape[ 1 ], 10, 10, 10, 1], weight_decay)

    start = time.time()
    preds = predictions(weights, X_test_time)
    end = time.time()

    elapsed_time_fastest = end - start

    time_ratio = elapsed_time_candidate /elapsed_time_fastest

    print("Time ratio:", time_ratio)
    
    return { 'error': error, 'time': time_ratio}

Now we are ready to run Spearmint. Before, that however, we will have to install the right version. Simply go to the folder Spearmint-PESMO and type

In [ ]:
sudo python setup.py install

Go back to tune_neural_network/bo. The first step is to clean the MongoDB Base. For that, type

In [ ]:
python ../../Spearmint-PESMO/spearmint/cleanup.py .

Now we can start our BO experimnet. Simply type

In [ ]:
python ../../Spearmint-PESMO/spearmint/main.py .

This will run Spearmint for a total of 50 iterations. After that, we can extract the recommendation for the problem's solution. Simply type,

In [ ]:
python get_final_result.py

This python script will connect to MongoDB to extract the recommendaiton. Below you can see the code.

In [ ]:
from nn import *

import os
import sys
import importlib
import imp
import pdb
import numpy             as np
import numpy.random      as npr
import numpy.linalg      as npla
import matplotlib        as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt

from spearmint.visualizations         import plots_2d
from spearmint.utils.parsing          import parse_config_file
from spearmint.utils.parsing          import parse_tasks_from_jobs
from spearmint.utils.parsing          import repeat_experiment_name
from spearmint.utils.parsing          import get_objectives_and_constraints
from spearmint.utils.parsing          import DEFAULT_TASK_NAME
from spearmint.utils.database.mongodb import MongoDB
from spearmint.tasks.input_space      import InputSpace
from spearmint.tasks.input_space      import paramify_no_types
from spearmint.main                   import load_jobs

if __name__ == '__main__':

    options         = parse_config_file('.', 'config.json')
    experiment_name = options["experiment-name"]

    db               = MongoDB(database_address=options['database']['address'])
    recommendations  = db.load(experiment_name, 'recommendations')

    values = paramify_no_types(recommendations[ -1 ][ 'params_o' ])

    n_points = len(values[ values.keys()[ 0 ] ])

    frontier = np.zeros((n_points, 2))

    for i in range(n_points):
        value = dict()
        for j in values.keys():
            value[ j ] = values[ j ][ i ]

        ret = main(0, value)
        frontier[ i, 0 ] = ret['error']
        frontier[ i, 1 ] = ret['time']

    frontier = frontier[ np.argsort(frontier[ :, 0 ]), : ]
    np.savetxt('pareto_frontier_bo.txt', frontier)

We will also compare results with a random exploration of the parameter space. The code is found in the file random_search.py located in the folder random_search. Below you can see the code. The file pareto_frontier_bo.txt contains the solution.

In [ ]:
from nn import *

if __name__ == '__main__':

    np.random.seed(1)

    grid_size = 50
    samples_log_learning_rate = np.array(-10.0 + np.random.rand(grid_size) * (10.0 - -1.0), ndmin = 2).T
    samples_log_weight_decay  = np.array(-10.0 + np.random.rand(grid_size) * (10.0 - -10.0), ndmin = 2).T
    samples_h_hidden_units  = np.array(10.0 + np.round(np.random.rand(grid_size) * (50.0 - 10.0) - 0.5), dtype = int, ndmin = 2).T

    values = np.zeros((grid_size, 1))
    for i in range(grid_size):
        values[ i ] = main(0, {u'log_learning_rate': samples_log_learning_rate[ i ],
            u'log_weight_decay': samples_log_weight_decay[ i ],
            u'n_hidden_units': samples_h_hidden_units[ i ]})

        print(values[ i ], samples_log_learning_rate[ i ], samples_log_weight_decay[ i ], samples_h_hidden_units[ i ])

    results = np.concatenate((values, samples_log_learning_rate, samples_log_weight_decay, samples_h_hidden_units), 1)
    np.savetxt("results.txt", results)

This will store the results of the random search strategy in the file results.txt. To extract the best configuration found simply execute the script get_final_results.py, whose code is found below. It is the same as the previous one, but it will only Pareto optimal solutions. The file, pareto_frontier_random_search.txt contains the results.

In [ ]:
from nn import *
from spearmint.utils.moop import _cull_algorithm
import numpy as np

if __name__ == '__main__':

    results = np.loadtxt('results.txt')

    # We retain only non-dominated optimal points

    indices_optimal_points = _cull_algorithm(results[ : , 0 :  2 ])
    optimal_results = results[ indices_optimal_points, : ]
    values = np.zeros((optimal_results.shape[ 0 ], 2))

    # We evaluate again the optimal points

    for i in range(optimal_results.shape[ 0 ]):
        best = optimal_results[ i, 2 :  ]
        params = {u'log_learning_rate': np.array([ best[ 0 ] ]), \
            u'log_weight_decay': np.array([ best[ 1 ] ]), \
            u'n_hidden_units': np.array([ int(best[ 2 ]) ]) }

        ret = main(0, params)
        values[ i, 0 ] = ret['error']
        values[ i, 1 ] = ret['time']

    # We save the Pareto frontier

    values = values[ np.argsort(values[ :, 0 ]), : ]

    np.savetxt('pareto_frontier_random_search.txt', values)

We can plot the optimal points found by each method by using the python script located at tune_neural_network_multi-objective, called plot_frontiers.py, whose content is shown below

In [ ]:
import numpy as np
import matplotlib.pyplot as plt

bo = np.loadtxt('bo/pareto_frontier_bo.txt')
rs = np.loadtxt('random_search/pareto_frontier_random_search.txt')

plt.plot(bo[ :, 0 ], bo[ :, 1 ],color="blue", label = "Bayesian Optimizaton")
plt.plot(rs[ :, 0 ], rs[ :, 1 ],color="red", label = "Random Search")

plt.scatter(bo[ :, 0 ], bo[ :, 1 ],color="blue")
plt.scatter(rs[ :, 0 ], rs[ :, 1 ],color="red")


plt.xlabel("Error")
plt.ylabel("Time Ratio")
plt.legend()
plt.show()

import pdb; pdb.set_trace()