//! Naive Bayes Classifiers
//!
//! The classifier supports Gaussian, Bernoulli and Multinomial distributions.
//!
//! A naive Bayes classifier works by treating the features of each input as independent
//! observations. Under this assumption we utilize Bayes' rule to compute the
//! probability that each input belongs to a given class.
//!
//! # Examples
//!
//! ```
//! use rusty_machine::learning::naive_bayes::{NaiveBayes, Gaussian};
//! use rusty_machine::linalg::Matrix;
//! use rusty_machine::learning::SupModel;
//!
//! let inputs = Matrix::new(6, 2, vec![1.0, 1.1,
//!                                     1.1, 0.9,
//!                                     2.2, 2.3,
//!                                     2.5, 2.7,
//!                                     5.2, 4.3,
//!                                     6.2, 7.3]);
//!
//! let targets = Matrix::new(6,3, vec![1.0, 0.0, 0.0,
//!                                     1.0, 0.0, 0.0,
//!                                     0.0, 1.0, 0.0,
//!                                     0.0, 1.0, 0.0,
//!                                     0.0, 0.0, 1.0,
//!                                     0.0, 0.0, 1.0]);
//!
//! // Create a Gaussian Naive Bayes classifier.
//! let mut model = NaiveBayes::<Gaussian>::new();
//!
//! // Train the model.
//! model.train(&inputs, &targets).unwrap();
//!
//! // Predict the classes on the input data
//! let outputs = model.predict(&inputs).unwrap();
//!
//! // Will output the target classes - otherwise our classifier is bad!
//! println!("Final outputs --\n{}", outputs);
//! ```

use linalg::{Matrix, Axes, BaseMatrix, BaseMatrixMut};
use learning::{LearningResult, SupModel};
use learning::error::{Error, ErrorKind};
use rulinalg::utils;

use std::f64::consts::PI;

/// The Naive Bayes model.
#[derive(Debug)]
pub struct NaiveBayes<T: Distribution> {
    distr: Option<T>,
    cluster_count: Option<usize>,
    class_prior: Option<Vec<f64>>,
    class_counts: Vec<usize>,
}

impl<T: Distribution> NaiveBayes<T> {
    /// Create a new NaiveBayes model from a given
    /// distribution.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::naive_bayes::{NaiveBayes, Gaussian};
    ///
    /// // Create a new Gaussian Naive Bayes model.
    /// let _ = NaiveBayes::<Gaussian>::new();
    /// ```
    pub fn new() -> NaiveBayes<T> {
        NaiveBayes {
            distr: None,
            cluster_count: None,
            class_prior: None,
            class_counts: Vec::new(),
        }
    }

    /// Get the cluster count for this model.
    ///
    /// Returns an option which is `None` until the model has been trained.
    pub fn cluster_count(&self) -> Option<&usize> {
        self.cluster_count.as_ref()
    }

    /// Get the class prior distribution for this model.
    ///
    /// Returns an option which is `None` until the model has been trained.
    pub fn class_prior(&self) -> Option<&Vec<f64>> {
        self.class_prior.as_ref()
    }

    /// Get the distribution for this model.
    ///
    /// Returns an option which is `None` until the model has been trained.
    pub fn distr(&self) -> Option<&T> {
        self.distr.as_ref()
    }
}

/// Train and predict from the Naive Bayes model.
///
/// The input matrix must be rows made up of features.
/// The target matrix should have indicator vectors in each row specifying
/// the input class. e.g. [[1,0,0],[0,0,1]] shows class 1 first, then class 3.
impl<T: Distribution> SupModel<Matrix<f64>, Matrix<f64>> for NaiveBayes<T> {
    /// Train the model using inputs and targets.
    fn train(&mut self, inputs: &Matrix<f64>, targets: &Matrix<f64>) -> LearningResult<()> {
        self.distr = Some(T::from_model_params(targets.cols(), inputs.cols()));
        self.update_params(inputs, targets)
    }

    /// Predict output from inputs.
    fn predict(&self, inputs: &Matrix<f64>) -> LearningResult<Matrix<f64>> {
        let log_probs = try!(self.get_log_probs(inputs));
        let input_classes = NaiveBayes::<T>::get_classes(log_probs);

        if let Some(cluster_count) = self.cluster_count {
            let mut class_data = Vec::with_capacity(inputs.rows() * cluster_count);

            for c in input_classes {
                let mut row = vec![0f64; cluster_count];
                row[c] = 1f64;

                class_data.append(&mut row);
            }

            Ok(Matrix::new(inputs.rows(), cluster_count, class_data))
        } else {
            Err(Error::new(ErrorKind::UntrainedModel, "The model has not been trained."))
        }
    }
}

impl<T: Distribution> NaiveBayes<T> {
    /// Get the log-probabilities per class for each input.
    pub fn get_log_probs(&self, inputs: &Matrix<f64>) -> LearningResult<Matrix<f64>> {

        if let (&Some(ref distr), &Some(ref prior)) = (&self.distr, &self.class_prior) {
            // Get the joint log likelihood from the distribution
            distr.joint_log_lik(inputs, prior)
        } else {
            Err(Error::new_untrained())
        }
    }

    fn update_params(&mut self, inputs: &Matrix<f64>, targets: &Matrix<f64>) -> LearningResult<()> {
        let class_count = targets.cols();
        let total_data = inputs.rows();

        self.class_counts = vec![0; class_count];
        let mut class_data = vec![Vec::new(); class_count];

        for (idx, row) in targets.iter_rows().enumerate() {
            // Find the class of this input
            let class = try!(NaiveBayes::<T>::find_class(row));

            // Note the class of the input
            class_data[class].push(idx);
            self.class_counts[class] += 1;
        }

        if let Some(ref mut distr) = self.distr {
            for (idx, c) in class_data.into_iter().enumerate() {
                // If this class' vector has not been populated, we can safely
                // skip this iteration, since the user is clearly not interested
                // in associating features with this class
                if c.is_empty() {
                    continue;
                }
                // Update the parameters within this class
                try!(distr.update_params(&inputs.select_rows(&c), idx));
            }
        }

        let mut class_prior = Vec::with_capacity(class_count);

        // Compute the prior as the proportion in each class
        class_prior.extend(self.class_counts.iter().map(|c| *c as f64 / total_data as f64));

        self.class_prior = Some(class_prior);
        self.cluster_count = Some(class_count);
        Ok(())
    }

    fn find_class(row: &[f64]) -> LearningResult<usize> {
        // Find the `1` entry in the row
        for (idx, r) in row.into_iter().enumerate() {
            if *r == 1f64 {
                return Ok(idx);
            }
        }

        Err(Error::new(ErrorKind::InvalidState,
                       "No class found for entry in targets"))
    }

    fn get_classes(log_probs: Matrix<f64>) -> Vec<usize> {
        let mut data_classes = Vec::with_capacity(log_probs.rows());

        data_classes.extend(log_probs.iter_rows().map(|row| {
            // Argmax each class log-probability per input
            let (class, _) = utils::argmax(row);
            class
        }));

        data_classes
    }
}

/// Naive Bayes Distribution.
pub trait Distribution {
    /// Initialize the distribution parameters.
    fn from_model_params(class_count: usize, features: usize) -> Self;

    /// Updates the distribution parameters.
    fn update_params(&mut self, data: &Matrix<f64>, class: usize) -> LearningResult<()>;

    /// Compute the joint log likelihood of the data.
    ///
    /// Returns a matrix with rows containing the probability that the input lies in each class.
    fn joint_log_lik(&self,
                     data: &Matrix<f64>,
                     class_prior: &[f64])
                     -> LearningResult<Matrix<f64>>;
}

/// The Gaussian Naive Bayes model distribution.
///
/// Defines:
///
/// p(x|C<sub>k</sub>) = ∏<sub>i</sub> N(x<sub>i</sub> ;
/// μ<sub>k</sub>, σ<sup>2</sup><sub>k</sub>)
#[derive(Debug)]
pub struct Gaussian {
    theta: Matrix<f64>,
    sigma: Matrix<f64>,
}

impl Gaussian {
    /// Returns the distribution means.
    ///
    /// This is a matrix of class by feature means.
    pub fn theta(&self) -> &Matrix<f64> {
        &self.theta
    }

    /// Returns the distribution variances.
    ///
    /// This is a matrix of class by feature variances.
    pub fn sigma(&self) -> &Matrix<f64> {
        &self.sigma
    }
}

impl Distribution for Gaussian {
    fn from_model_params(class_count: usize, features: usize) -> Gaussian {
        Gaussian {
            theta: Matrix::zeros(class_count, features),
            sigma: Matrix::zeros(class_count, features),
        }
    }

    fn update_params(&mut self, data: &Matrix<f64>, class: usize) -> LearningResult<()> {
        // Compute mean and sample variance
        let mean = data.mean(Axes::Row).into_vec();
        let var = try!(data.variance(Axes::Row).map_err(|_| {
                Error::new(ErrorKind::InvalidData,
                           "Cannot compute variance for Gaussian distribution.")
            }))
            .into_vec();

        let features = data.cols();

        for (idx, (m, v)) in mean.into_iter().zip(var.into_iter()).enumerate() {
            self.theta.mut_data()[class * features + idx] = m;
            self.sigma.mut_data()[class * features + idx] = v;
        }

        Ok(())
    }

    fn joint_log_lik(&self,
                     data: &Matrix<f64>,
                     class_prior: &[f64])
                     -> LearningResult<Matrix<f64>> {
        let class_count = class_prior.len();
        let mut log_lik = Vec::with_capacity(class_count);

        for (i, item) in class_prior.into_iter().enumerate() {
            let joint_i = item.ln();
            let n_ij = -0.5 * (self.sigma.select_rows(&[i]) * 2.0 * PI).apply(&|x| x.ln()).sum();

            // NOTE: Here we are copying the row data which is inefficient
            let r_ij = (data - self.theta.select_rows(&vec![i; data.rows()]))
                .apply(&|x| x * x)
                .elediv(&self.sigma.select_rows(&vec![i; data.rows()]))
                .sum_cols();

            let res = (-r_ij * 0.5) + n_ij;

            log_lik.append(&mut (res + joint_i).into_vec());
        }

        Ok(Matrix::new(class_count, data.rows(), log_lik).transpose())
    }
}

/// The Bernoulli Naive Bayes model distribution.
///
/// Defines:
///
///    p(x|C<sub>k</sub>) = ∏<sub>i</sub> p<sub>k</sub><sup>x<sub>i</sub></sup>
/// (1-p)<sub>k</sub><sup>1-x<sub>i</sub></sup>
#[derive(Debug)]
pub struct Bernoulli {
    log_probs: Matrix<f64>,
    pseudo_count: f64,
}

impl Bernoulli {
    /// The log probability matrix.
    ///
    /// A matrix of class by feature model log-probabilities.
    pub fn log_probs(&self) -> &Matrix<f64> {
        &self.log_probs
    }
}

impl Distribution for Bernoulli {
    fn from_model_params(class_count: usize, features: usize) -> Bernoulli {
        Bernoulli {
            log_probs: Matrix::zeros(class_count, features),
            pseudo_count: 1f64,
        }
    }

    fn update_params(&mut self, data: &Matrix<f64>, class: usize) -> LearningResult<()> {
        let features = data.cols();

        // We add the pseudo count to the class count and feature count
        let pseudo_cc = data.rows() as f64 + (2f64 * self.pseudo_count);
        let pseudo_fc = data.sum_rows() + self.pseudo_count;

        let log_probs = (pseudo_fc.apply(&|x| x.ln()) - pseudo_cc.ln()).into_vec();

        for (i, item) in log_probs.iter().enumerate().take(features) {
            self.log_probs[[class, i]] = *item;
        }

        Ok(())

    }

    fn joint_log_lik(&self,
                     data: &Matrix<f64>,
                     class_prior: &[f64])
                     -> LearningResult<Matrix<f64>> {
        let class_count = class_prior.len();

        let neg_prob = self.log_probs.clone().apply(&|x| (1f64 - x.exp()).ln());

        let res = data * (&self.log_probs - &neg_prob).transpose();

        // NOTE: Some messy stuff now to get the class row contribution.
        // Really we want to add to each row the class log-priors and the
        // neg_prob_sum contribution - the last term in
        // x log(p) + (1-x)log(1-p) = x (log(p) - log(1-p)) + log(1-p)

        let mut per_class_row = Vec::with_capacity(class_count);
        let neg_prob_sum = neg_prob.sum_cols();

        for (idx, p) in class_prior.into_iter().enumerate() {
            per_class_row.push(p.ln() + neg_prob_sum[idx]);
        }

        let class_row_mat = Matrix::new(1, class_count, per_class_row);

        Ok(res + class_row_mat.select_rows(&vec![0; data.rows()]))
    }
}

/// The Multinomial Naive Bayes model distribution.
///
/// Defines:
///
///    p(x|C<sub>k</sub>) ∝ ∏<sub>i</sub> p<sub>k</sub><sup>x<sub>i</sub></sup>
#[derive(Debug)]
pub struct Multinomial {
    log_probs: Matrix<f64>,
    pseudo_count: f64,
}

impl Multinomial {
    /// The log probability matrix.
    ///
    /// A matrix of class by feature model log-probabilities.
    pub fn log_probs(&self) -> &Matrix<f64> {
        &self.log_probs
    }
}

impl Distribution for Multinomial {
    fn from_model_params(class_count: usize, features: usize) -> Multinomial {
        Multinomial {
            log_probs: Matrix::zeros(class_count, features),
            pseudo_count: 1f64,
        }
    }

    fn update_params(&mut self, data: &Matrix<f64>, class: usize) -> LearningResult<()> {
        let features = data.cols();

        let pseudo_fc = data.sum_rows() + self.pseudo_count;
        let pseudo_cc = pseudo_fc.sum();

        let log_probs = (pseudo_fc.apply(&|x| x.ln()) - pseudo_cc.ln()).into_vec();

        for (i, item) in log_probs.iter().enumerate().take(features) {
            self.log_probs[[class, i]] = *item;
        }

        Ok(())
    }

    fn joint_log_lik(&self,
                     data: &Matrix<f64>,
                     class_prior: &[f64])
                     -> LearningResult<Matrix<f64>> {
        let class_count = class_prior.len();

        let res = data * self.log_probs.transpose();

        let mut per_class_row = Vec::with_capacity(class_count);
        for p in class_prior {
            per_class_row.push(p.ln());
        }

        let class_row_mat = Matrix::new(1, class_count, per_class_row);

        Ok(res + class_row_mat.select_rows(&vec![0; data.rows()]))
    }
}

#[cfg(test)]
mod tests {
    use super::NaiveBayes;
    use super::Gaussian;
    use super::Bernoulli;
    use super::Multinomial;

    use learning::SupModel;

    use linalg::Matrix;

    #[test]
    fn test_gaussian() {
        let inputs = Matrix::new(6,
                                 2,
                                 vec![1.0, 1.1, 1.1, 0.9, 2.2, 2.3, 2.5, 2.7, 5.2, 4.3, 6.2, 7.3]);

        let targets = Matrix::new(6,
                                  3,
                                  vec![1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0,
                                       0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0]);

        let mut model = NaiveBayes::<Gaussian>::new();
        model.train(&inputs, &targets).unwrap();

        let outputs = model.predict(&inputs).unwrap();
        assert_eq!(outputs.into_vec(), targets.into_vec());
    }

    #[test]
    fn test_bernoulli() {
        let inputs = Matrix::new(4,
                                 3,
                                 vec![1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0]);

        let targets = Matrix::new(4, 2, vec![1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0]);

        let mut model = NaiveBayes::<Bernoulli>::new();
        model.train(&inputs, &targets).unwrap();

        let outputs = model.predict(&inputs).unwrap();
        assert_eq!(outputs.into_vec(), targets.into_vec());
    }

    #[test]
    fn test_multinomial() {
        let inputs = Matrix::new(4,
                                 3,
                                 vec![1.0, 0.0, 5.0, 0.0, 0.0, 11.0, 13.0, 1.0, 0.0, 12.0, 3.0,
                                      0.0]);

        let targets = Matrix::new(4, 2, vec![1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0]);

        let mut model = NaiveBayes::<Multinomial>::new();
        model.train(&inputs, &targets).unwrap();

        let outputs = model.predict(&inputs).unwrap();
        assert_eq!(outputs.into_vec(), targets.into_vec());
    }
}