B
    ZvdD                 @   s6  d Z ddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. e ej/e&e'dG dd deeZ0dS )z9Bagging classifier trained on balanced bootstrap samples.    N)Parallel)clone)BaggingClassifier)_parallel_decision_function)_partition_estimators)DecisionTreeClassifier)delayed)check_is_fitted   )_ParamsValidationMixin)Pipeline)RandomUnderSampler)BaseUnderSampler)Substitutioncheck_sampling_strategycheck_target_type)available_if)_n_jobs_docstring_random_state_docstring)
HasMethodsInterval
StrOptions   )_bagging_parameter_constraints_estimator_has)sampling_strategyn_jobsrandom_statec                   s  e Zd ZdZeedr$eejZn
ee	Ze
eejddddeddd	d
dheegdgedgdgd d)dddddddddddddd fddZ fddZe fddZedd Zedd Z fd d!Zd* fd"d#	Zeed$d%d& Z fd'd(Z  ZS )+BalancedBaggingClassifieru  A Bagging classifier with additional balancing.

    This implementation of Bagging is similar to the scikit-learn
    implementation. It includes an additional step to balance the training set
    at fit time using a given sampler.

    This classifier can serves as a basis to implement various methods such as
    Exactly Balanced Bagging [6]_, Roughly Balanced Bagging [7]_,
    Over-Bagging [6]_, or SMOTE-Bagging [8]_.

    Read more in the :ref:`User Guide <bagging>`.

    Parameters
    ----------
    estimator : estimator object, default=None
        The base estimator to fit on random subsets of the dataset.
        If None, then the base estimator is a decision tree.

        .. versionadded:: 0.10

    n_estimators : int, default=10
        The number of base estimators in the ensemble.

    max_samples : int or float, default=1.0
        The number of samples to draw from X to train each base estimator.

        - If int, then draw ``max_samples`` samples.
        - If float, then draw ``max_samples * X.shape[0]`` samples.

    max_features : int or float, default=1.0
        The number of features to draw from X to train each base estimator.

        - If int, then draw ``max_features`` features.
        - If float, then draw ``max_features * X.shape[1]`` features.

    bootstrap : bool, default=True
        Whether samples are drawn with replacement.

        .. note::
           Note that this bootstrap will be generated from the resampled
           dataset.

    bootstrap_features : bool, default=False
        Whether features are drawn with replacement.

    oob_score : bool, default=False
        Whether to use out-of-bag samples to estimate
        the generalization error.

    warm_start : bool, default=False
        When set to True, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit
        a whole new ensemble.

    {sampling_strategy}

    replacement : bool, default=False
        Whether or not to randomly sample with replacement or not when
        `sampler is None`, corresponding to a
        :class:`~imblearn.under_sampling.RandomUnderSampler`.

    {n_jobs}

    {random_state}

    verbose : int, default=0
        Controls the verbosity of the building process.

    sampler : sampler object, default=None
        The sampler used to balanced the dataset before to bootstrap
        (if `bootstrap=True`) and `fit` a base estimator. By default, a
        :class:`~imblearn.under_sampling.RandomUnderSampler` is used.

        .. versionadded:: 0.8

    base_estimator : estimator object, default=None
        The base estimator to fit on random subsets of the dataset.
        If None, then the base estimator is a decision tree.

        .. deprecated:: 0.10
           `base_estimator` was renamed to `estimator` in version 0.10 and
           will be removed in 0.12.

    Attributes
    ----------
    estimator_ : estimator
        The base estimator from which the ensemble is grown.

        .. versionadded:: 0.10

    base_estimator_ : estimator
        The base estimator from which the ensemble is grown.

        .. deprecated:: 1.2
           `base_estimator_` is deprecated in `scikit-learn` 1.2 and will be
           removed in 1.4. Use `estimator_` instead. When the minimum version
           of `scikit-learn` supported by `imbalanced-learn` will reach 1.4,
           this attribute will be removed.

    n_features_ : int
        The number of features when `fit` is performed.

        .. deprecated:: 1.0
           `n_features_` is deprecated in `scikit-learn` 1.0 and will be removed
           in version 1.2. When the minimum version of `scikit-learn` supported
           by `imbalanced-learn` will reach 1.2, this attribute will be removed.

    estimators_ : list of estimators
        The collection of fitted base estimators.

    sampler_ : sampler object
        The validate sampler created from the `sampler` parameter.

    estimators_samples_ : list of ndarray
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator. Each subset is defined by a boolean mask.

    estimators_features_ : list of ndarray
        The subset of drawn features for each base estimator.

    classes_ : ndarray of shape (n_classes,)
        The classes labels.

    n_classes_ : int or list
        The number of classes.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.

    oob_decision_function_ : ndarray of shape (n_samples, n_classes)
        Decision function computed with out-of-bag estimate on the training
        set. If n_estimators is small it might be possible that a data point
        was never left out during the bootstrap. In this case,
        ``oob_decision_function_`` might contain NaN.

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.9

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 0.9

    See Also
    --------
    BalancedRandomForestClassifier : Random forest applying random-under
        sampling to balance the different bootstraps.

    EasyEnsembleClassifier : Ensemble of AdaBoost classifier trained on
        balanced bootstraps.

    RUSBoostClassifier : AdaBoost classifier were each bootstrap is balanced
        using random-under sampling at each round of boosting.

    Notes
    -----
    This is possible to turn this classifier into a balanced random forest [5]_
    by passing a :class:`~sklearn.tree.DecisionTreeClassifier` with
    `max_features='auto'` as a base estimator.

    See
    :ref:`sphx_glr_auto_examples_ensemble_plot_comparison_ensemble_classifier.py`.

    References
    ----------
    .. [1] L. Breiman, "Pasting small votes for classification in large
           databases and on-line", Machine Learning, 36(1), 85-103, 1999.

    .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140,
           1996.

    .. [3] T. Ho, "The random subspace method for constructing decision
           forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844,
           1998.

    .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
           Learning and Knowledge Discovery in Databases, 346-361, 2012.

    .. [5] C. Chen Chao, A. Liaw, and L. Breiman. "Using random forest to
           learn imbalanced data." University of California, Berkeley 110,
           2004.

    .. [6] R. Maclin, and D. Opitz. "An empirical evaluation of bagging and
           boosting." AAAI/IAAI 1997 (1997): 546-551.

    .. [7] S. Hido, H. Kashima, and Y. Takahashi. "Roughly balanced bagging
           for imbalanced data." Statistical Analysis and Data Mining: The ASA
           Data Science Journal 2.5‐6 (2009): 412-426.

    .. [8] S. Wang, and X. Yao. "Diversity analysis on imbalanced data sets by
           using ensemble models." 2009 IEEE symposium on computational
           intelligence and data mining. IEEE, 2009.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.metrics import confusion_matrix
    >>> from imblearn.ensemble import BalancedBaggingClassifier
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=0)
    >>> bbc = BalancedBaggingClassifier(random_state=42)
    >>> bbc.fit(X_train, y_train)
    BalancedBaggingClassifier(...)
    >>> y_pred = bbc.predict(X_test)
    >>> print(confusion_matrix(y_test, y_pred))
    [[ 23   0]
     [  2 225]]
    _parameter_constraintsr   r   right)closedautoZmajorityznot minorityznot majorityallbooleanZfit_resampleN)r   replacementsampler
   g      ?TF
deprecated)max_samplesmax_features	bootstrapbootstrap_features	oob_score
warm_startr   r%   r   r   verboser&   base_estimatorc               sn   t t j}d|i}d|jkr*||d< n|| _t jf |||||||||||d
 |	| _|
| _|| _d S )Nr0   	estimator)
n_estimatorsr)   r*   r+   r,   r-   r.   r   r   r/   )	inspect	signaturesuper__init__
parametersr1   r   r%   r&   )selfr1   r2   r)   r*   r+   r,   r-   r.   r   r%   r   r   r/   r&   r0   Zbagging_classifier_signatureZestimator_params)	__class__ L/var/www/html/venv/lib/python3.7/site-packages/imblearn/ensemble/_bagging.pyr6     s(    

z"BalancedBaggingClassifier.__init__c                sX   t  |}t jtrL jjdkrL fddt j| jj D  _	n j _	|S )Nbypassc                s*   i | ]"\}}|t  j|kd  d  qS )r   )npwhereZclasses_).0keyvalue)r8   r:   r;   
<dictcomp>H  s   z9BalancedBaggingClassifier._validate_y.<locals>.<dictcomp>)
r5   _validate_y
isinstancer   dictsampler__sampling_typer   items_sampling_strategy)r8   yZ	y_encoded)r9   )r8   r;   rC   B  s    
z%BalancedBaggingClassifier._validate_yc             C   s   | j dk	r| jdkrtd| j dk	r2t| j }n*| jdkrTtdt t| j}nt|}| jjdkrx| jj	| j
d td| jfd|fg| _y| j| _W n tk
r   Y nX dS )	zZCheck the estimator and the n_estimator attribute, set the
        `estimator_` attribute.N)Nr(   zEBoth `estimator` and `base_estimator` were set. Only set `estimator`.zX`base_estimator` was renamed to `estimator` in version 0.10 and will be removed in 0.12.r<   )r   r&   
classifier)r1   r0   
ValueErrorr   warningswarnFutureWarningrF   rG   Z
set_paramsrI   r   
_estimatorZbase_estimator_AttributeError)r8   defaultr0   r:   r:   r;   _validate_estimatorT  s(    



z-BalancedBaggingClassifier._validate_estimatorc             C   s   | j S )z$Estimator used to grow the ensemble.)rP   )r8   r:   r:   r;   
estimator_w  s    z$BalancedBaggingClassifier.estimator_c             C   s   t dt | jS )z-Number of features when ``fit`` is performed.z`n_features_` was deprecated in scikit-learn 1.0. This attribute will not be accessible when the minimum supported version of scikit-learn is 1.2.)rM   rN   rO   Zn_features_in_)r8   r:   r:   r;   n_features_}  s    z%BalancedBaggingClassifier.n_features_c                s   |    t ||S )a+  Build a Bagging ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like of shape (n_samples,)
            The target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : object
            Fitted estimator.
        )Z_validate_paramsr5   fit)r8   XrJ   )r9   r:   r;   rV     s    zBalancedBaggingClassifier.fitc                sD   t | | jd kr"t| jd| _nt| j| _t j||| jd dS )N)r%   )sample_weight)	r   r&   r   r%   rF   r   r5   _fitr)   )r8   rW   rJ   r)   	max_depthrX   )r9   r:   r;   rY     s    
zBalancedBaggingClassifier._fitdecision_functionc                sn   t  j ddgdddd tjj\}}t|jd fddt|D }t|j }|S )	a  Average of the decision functions of the base classifiers.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        Returns
        -------
        score : ndarray of shape (n_samples, k)
            The decision function of the input samples. The columns correspond
            to the classes in sorted order, as they appear in the attribute
            ``classes_``. Regression and binary classification are special
            cases with ``k == 1``, otherwise ``k==n_classes``.
        ZcsrZcscNF)Zaccept_sparseZdtypeZforce_all_finitereset)r   r/   c             3   sJ   | ]B}t tj| |d    j| |d     V  qdS )r   N)r   r   Zestimators_Zestimators_features_)r?   i)rW   r8   startsr:   r;   	<genexpr>  s   z>BalancedBaggingClassifier.decision_function.<locals>.<genexpr>)	r	   Z_validate_datar   r2   r   r   r/   rangesum)r8   rW   r   _Zall_decisionsZ	decisionsr:   )rW   r8   r^   r;   r[     s    z+BalancedBaggingClassifier.decision_functionc                s<   t   }d}d}d}||kr,||| |< n||i||< |S )NZ_xfail_checksZcheck_estimators_nan_infz9Fails because the sampler removed infinity and NaN values)r5   
_more_tags)r8   tagsZtags_keyZfailing_testreason)r9   r:   r;   rc     s    
z$BalancedBaggingClassifier._more_tags)Nr'   )NNN) __name__
__module____qualname____doc__hasattrr   copydeepcopyr   r   updater   numbersRealr   rE   callabler   r6   rC   r   rS   propertyrT   rU   rV   rY   r   r   r[   rc   __classcell__r:   r:   )r9   r;   r   !   sF    `

 #.r   )1ri   rk   r3   rn   rM   numpyr=   Zjoblibr   Zsklearn.baser   Zsklearn.ensembler   Zsklearn.ensemble._baggingr   Zsklearn.ensemble._baser   Zsklearn.treer   Zsklearn.utils.fixesr   Zsklearn.utils.validationr	   baser   Zpipeliner   Zunder_samplingr   Zunder_sampling.baser   utilsr   r   r   Zutils._available_ifr   Zutils._docstringr   r   Zutils._param_validationr   r   r   Z_commonr   r   Z_sampling_strategy_docstringr   r:   r:   r:   r;   <module>   s6   