Source code for paddlets.transform.sklearn_transforms

# !/usr/bin/env python3
# -*- coding:utf-8 -*-

import abc
from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, Dict

import pandas as pd
import numpy as np
import scipy

from sklearn import preprocessing

from paddlets.transform.sklearn_transforms_base import SklearnTransformWrapper

[docs]class OneHot(SklearnTransformWrapper):
    """
    Transform categorical columns with OneHot encoder.
    
    Args:
        cols(str|List): Column(s) to be encoded.
        handle_unknown(str): {'error', 'ignore'}, default='error'
        drop(bool): Whether to delete the original column, default=False
        dtype(object): Data type, default=float
        categorie(str|List): 'auto' or a list of array-like, default='auto', if categorie is 'auto', it determine categories automatically from the dataset.
        
    Returns:
        None
    """
    def __init__(
        self, 
        cols: Union[str, List[str]], 
        dtype: object = np.float64, 
        handle_unknown: str = "error", 
        categories: Union[str, List[str]] = 'auto', 
        drop: bool = False
    ):
        super().__init__(
            preprocessing.__getattribute__('OneHotEncoder'),
            in_col_names=cols,
            per_col_transform=True,
            drop_origin_columns=drop,
            dtype=dtype,
            handle_unknown=handle_unknown,
            categories=categories,
        )

[docs]class Ordinal(SklearnTransformWrapper):
    """
    Encode categorical features as an integer array.
    
    Args:
        cols(str|List): Name of columns to Encode 
        handle_unknown(str): {‘error’, ‘use_encoded_value’}, default=’error’
        drop(bool): Whether to delete the original column, default=False.
        dtype(object): Number type, default=float.
        unknown_value(str): int or np.nan, default=None.
        categorie(str|List): 'auto' or a list of array-like, default='auto',if categorie is 'auto', it determine categories automatically from the training data. if categorie is list, categories[i] holds the categories expected in the ith column. The passed categories should not mix strings and numeric values, and should be sorted in case of numeric values.
    
    Returns:
        None
    """
    def __init__(
        self, 
        cols: Union[str, List[str]], 
        dtype: np.dtype = np.dtype("float64"), 
        categories: Union[str, List[str]] = 'auto', 
        unknown_value: Union[int, None] = None,
        handle_unknown: str = 'error',
        drop: bool = False
    ):
        super().__init__(
            preprocessing.__getattribute__('OrdinalEncoder'),
            in_col_names=cols,
            per_col_transform=False,
            drop_origin_columns=drop,
            dtype=dtype,
            handle_unknown=handle_unknown,
            categories=categories,
            unknown_value=unknown_value,
        )

[docs]class MinMaxScaler(SklearnTransformWrapper):
    """
    Transform a dataset by scaling the values of sepcified column(s) to the expected range: [min, max].
    
    The transformation is done by:
    
    X_std = (X - X.min) / (X.max - X.min)
    
    X_scaled = X_std * (max - min) + min


    Args:
        cols(str|List): Column name(s) to be scaled.
        f_range(tuple): tuple (min, max), default=(0, 1), Desired range of transformed values.
        clip(bool): Set to True to clip transformed values of held-out data to provided feature range.

    Returns:
        None
    """ 
    def __init__(
        self, 
        cols: Union[str, List[str]]=None, 
        f_range: tuple=(0, 1), 
        clip: bool=False
    ):
        super().__init__(
            preprocessing.__getattribute__('MinMaxScaler'),
            in_col_names=cols,
            drop_origin_columns=True,
            per_col_transform=True,
            feature_range=f_range,
            clip=clip
        )

[docs]class StandardScaler(SklearnTransformWrapper):
    """
    Transform a dataset by scaling the values of sepcified column(s) to zero mean and unit variance.

    The transformation is done by:
    z = (x - u) / s.

    where u is the MEAN or zero if with_mean=False, and s is the standard deviation or one if with_std=False.

    Args:
        cols(str|List): Column name or names to be scaled.
        with_mean(bool): If True, center the data before scaling. 
        with_std(bool):If True, scale the data to unit variance.

    Returns:
        None
    """
    def __init__(
        self, 
        cols: Union[str, List[str]]=None, 
        with_mean: bool=True, 
        with_std: bool=True
    ):
        super().__init__(
            preprocessing.__getattribute__('StandardScaler'),
            in_col_names=cols,
            drop_origin_columns=True,
            per_col_transform=True,
            with_mean=with_mean,
            with_std=with_std
        )