Source code for rom.generators.random_forest

import os
import time
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import multiprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import export_graphviz

from .model_generator_base import ModelGeneratorBase
from ..shared import pickle_file, save_dict_to_csv, zipdir


[docs]class RandomForest(ModelGeneratorBase):
    def __init__(self, analysis_id, random_seed=None, **kwargs):
        super(RandomForest, self).__init__(analysis_id, random_seed, **kwargs)

[docs]    def export_tree_png(self, tree, covariates, filename):
        export_graphviz(tree, feature_names=np.asarray(covariates), filled=True, rounded=True)
        copy_tree_path = os.path.join(os.path.dirname(filename), 'tree.dot')
        os.rename('tree.dot', copy_tree_path)
        os.system('dot -Tpng %s -o %s' % (copy_tree_path, filename))
        if os.path.exists(copy_tree_path):
            os.remove(copy_tree_path)

[docs]    def evaluate(self, model, model_name, model_type, x_data, y_data, downsample,
                 build_time, cv_time, covariates=None, scaler=None):
        """

        Evaluate the performance of the forest based on known x_data and y_data.

        :param model:
        :param model_name:
        :param model_type:
        :param x_data:
        :param y_data:
        :param downsample:
        :param build_time:
        :param cv_time:
        :param covariates:
        :return:
        """
        _yhat, performance = super(RandomForest, self).evaluate(
            model, model_name, model_type, x_data, y_data, downsample,
            build_time, cv_time, covariates, scaler
        )

        importance_data = pd.Series(model.feature_importances_, index=np.asarray(covariates))
        importance_data = importance_data.nlargest(20)

        fig = plt.figure(figsize=(8, 3), dpi=100)
        # defaults to the ax in the figure.
        ax = sns.barplot(x=list(importance_data), y=list(importance_data.index.values),
                         color="grey", ci=None)
        # ax.set(xlabel='Relative Importance', ylabel='')
        ax.set_xlabel('Relative Importance')
        plt.tight_layout()
        fig.savefig('%s/fig_importance_%s.png' % (self.images_dir, model_name))
        fig.clf()
        plt.clf()

        # plot a single tree
        if downsample <= 0.01:
            tree_file_name = '%s/fig_first_tree_%s.png' % (self.images_dir, model_name)
            self.export_tree_png(model.estimators_[0], covariates, tree_file_name)

        # add some more data to the model evaluation dict
        performance['n_estimators'] = model.n_estimators
        performance['max_depth'] = model.max_depth if not model.max_depth else 0
        performance['max_features'] = model.max_features
        performance['min_samples_leaf'] = model.min_samples_leaf
        performance['min_samples_split'] = model.min_samples_leaf

        return performance

[docs]    def save_cv_results(self, cv_results, response, downsample, filename):
        """
        Save the cv_results to a CSV file. Data in the cv_results file looks like the following.

        The CV results are the results of the GridSearch k-fold cross validation. The form of the
        results take the following from:

        .. code-block:: python

            {
                'param_kernel': masked_array(data=['poly', 'poly', 'rbf', 'rbf'],
                                             mask=[False False False False]...)
                'param_gamma': masked_array(data=[-- -- 0.1 0.2],
                                            mask=[True  True False False]...),
                'param_degree': masked_array(data=[2.0 3.0 - - --],
                                             mask=[False False  True  True]...),
                'split0_test_score': [0.8, 0.7, 0.8, 0.9],
                'split1_test_score': [0.82, 0.5, 0.7, 0.78],
                'mean_test_score': [0.81, 0.60, 0.75, 0.82],
                'std_test_score': [0.02, 0.01, 0.03, 0.03],
                'rank_test_score': [2, 4, 3, 1],
                'split0_train_score': [0.8, 0.9, 0.7],
                'split1_train_score': [0.82, 0.5, 0.7],
                'mean_train_score': [0.81, 0.7, 0.7],
                'std_train_score': [0.03, 0.03, 0.04],
                'mean_fit_time': [0.73, 0.63, 0.43, 0.49],
                'std_fit_time': [0.01, 0.02, 0.01, 0.01],
                'mean_score_time': [0.007, 0.06, 0.04, 0.04],
                'std_score_time': [0.001, 0.002, 0.003, 0.005],
                'params': [{'kernel': 'poly', 'degree': 2}, ...],
            }

        :param cv_results:
        :param filename:
        :return:
        """

        data = {}
        data['downsample'] = []
        for params in cv_results['params']:
            for param, value in params.items():
                if not data.get(param, None):
                    data[param] = []
                data[param].append(value)
                data['downsample'] = downsample
                data['response'] = response
        data['mean_train_score'] = cv_results['mean_train_score']
        data['mean_test_score'] = cv_results['mean_test_score']
        data['mean_fit_time'] = cv_results['mean_fit_time']
        data['mean_score_time'] = cv_results['mean_score_time']
        data['rank_test_score'] = cv_results['rank_test_score']

        df = pd.DataFrame.from_dict(data)
        df.to_csv(filename)

[docs]    def build(self, data_file, metamodel, **kwargs):
        super(RandomForest, self).build(data_file, metamodel, **kwargs)

        analysis_options = kwargs.get('algorithm_options', {})

        train_x, test_x, train_y, test_y, validate_xy, _scaler = self.train_test_validate_split(
            self.dataset,
            metamodel,
            downsample=self.downsample
        )

        # save the validate dataframe to be used later to validate the accuracy of the models
        self.save_dataframe(validate_xy, "%s/rf_validation" % self.validation_dir)

        for response in metamodel.available_response_names(self.model_type):
            print("Fitting random forest model for %s" % response)

            start = time.time()
            base_fit_params = analysis_options.get('base_fit_params', {})
            rf = RandomForestRegressor(**base_fit_params)
            base_model = rf.fit(train_x, train_y[response])
            build_time = time.time() - start

            # Evaluate the forest when building them
            self.model_results.append(
                self.evaluate(
                    base_model, response, 'base', test_x, test_y[response],
                    self.downsample, build_time, 0,
                    covariates=metamodel.covariate_names(self.model_type),
                    scaler=_scaler
                )
            )

            if not kwargs.get('skip_cv', False):
                rf = RandomForestRegressor()

                kfold = 3
                print('Perfoming CV with k-fold equal to %s' % kfold)
                # grab the param grid from what was specified in the metamodels.json file
                param_grid = analysis_options.get('param_grid', {})
                total_candidates = 1
                for param, options in param_grid.items():
                    total_candidates = len(options) * total_candidates

                print('CV will result in %s candidates' % total_candidates)

                # allow for the computer to be responsive during grid_search
                n_jobs = multiprocessing.cpu_count() - 1
                grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=kfold,
                                           verbose=2, refit=True, n_jobs=n_jobs)

                start = time.time()
                grid_search.fit(train_x, train_y[response])
                cv_time = time.time() - start

                # This should work, but the size of the model is really large after, so
                # trying to recreate the best_rf to save space.
                # best_rf = grid_search.best_estimator_

                print('The best params were %s' % grid_search.best_params_)

                # rebuild only the best rf, and save the results
                model = RandomForestRegressor(**grid_search.best_params_)
                best_model = model.fit(train_x, train_y[response])

                pickle_file(best_model, '%s/%s' % (self.models_dir, response))

                # save the cv results
                self.save_cv_results(
                    grid_search.cv_results_, response, self.downsample,
                    '%s/cv_results_%s.csv' % (self.base_dir, response)
                )

                self.model_results.append(
                    self.evaluate(
                        best_model, response, 'best', test_x, test_y[response],
                        self.downsample, build_time, cv_time,
                        covariates=metamodel.covariate_names(self.model_type),
                        scaler=_scaler
                    )
                )
            else:
                pickle_file(base_model, '%s/%s' % (self.models_dir, response))

        if self.model_results:
            save_dict_to_csv(self.model_results, '%s/model_results.csv' % self.base_dir)

        # zip up the models
        zipf = zipfile.ZipFile(
            '%s/models.zip' % self.models_dir, 'w', zipfile.ZIP_DEFLATED, allowZip64=True
        )
        zipdir(self.models_dir, zipf, '.pkl')
        zipf.close()

        # save the data that was used in the models for future processing and analysis
        self.dataset.to_csv('%s/data.csv' % self.data_dir)