Source code for paddlets.transform.sklearn_transforms

# !/usr/bin/env python3
# -*- coding:utf-8 -*-

import abc
from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, Dict

import pandas as pd
import numpy as np
import scipy

from sklearn import preprocessing

from paddlets.transform.sklearn_transforms_base import SklearnTransformWrapper

class OneHot(SklearnTransformWrapper):
    """
    Transform categorical columns with a one-hot encoder.

    Thin wrapper that passes ``sklearn.preprocessing.OneHotEncoder`` and its
    keyword arguments to ``SklearnTransformWrapper``.

    Args:
        cols(str|List): Column(s) to be encoded.
        dtype(object): Data type, default=np.float64.
        handle_unknown(str): {'error', 'ignore'}, default='error'.
        categories(str|List): 'auto' or a list of array-like, default='auto'.
            If categories is 'auto', the categories are determined
            automatically from the dataset.
        drop(bool): Whether to delete the original column, default=False.

    Returns:
        None
    """

    def __init__(
        self,
        cols: Union[str, List[str]],
        dtype: object = np.float64,
        handle_unknown: str = "error",
        categories: Union[str, List[Any]] = "auto",
        drop: bool = False,
    ):
        super().__init__(
            # Plain attribute access; equivalent to the former
            # preprocessing.__getattribute__('OneHotEncoder') call.
            preprocessing.OneHotEncoder,
            in_col_names=cols,
            # NOTE(review): presumably one encoder is fitted per column;
            # confirm against SklearnTransformWrapper.
            per_col_transform=True,
            drop_origin_columns=drop,
            # Remaining keyword arguments are forwarded as given.
            dtype=dtype,
            handle_unknown=handle_unknown,
            categories=categories,
        )
class Ordinal(SklearnTransformWrapper):
    """
    Encode categorical features as an integer array.

    Thin wrapper that passes ``sklearn.preprocessing.OrdinalEncoder`` and its
    keyword arguments to ``SklearnTransformWrapper``.

    Args:
        cols(str|List): Name of columns to encode.
        dtype(object): Number type, default=np.dtype("float64").
        categories(str|List): 'auto' or a list of array-like, default='auto'.
            If categories is 'auto', the categories are determined
            automatically from the training data. If categories is a list,
            categories[i] holds the categories expected in the ith column.
            The passed categories should not mix strings and numeric values,
            and should be sorted in case of numeric values.
        unknown_value(int|float|None): int or np.nan, default=None.
        handle_unknown(str): {'error', 'use_encoded_value'}, default='error'.
        drop(bool): Whether to delete the original column, default=False.

    Returns:
        None
    """

    def __init__(
        self,
        cols: Union[str, List[str]],
        dtype: np.dtype = np.dtype("float64"),
        categories: Union[str, List[Any]] = "auto",
        unknown_value: Union[int, float, None] = None,
        handle_unknown: str = "error",
        drop: bool = False,
    ):
        super().__init__(
            # Plain attribute access; equivalent to the former
            # preprocessing.__getattribute__('OrdinalEncoder') call.
            preprocessing.OrdinalEncoder,
            in_col_names=cols,
            # NOTE(review): presumably a single encoder handles all listed
            # columns together (categories[i] maps to the ith column);
            # confirm against SklearnTransformWrapper.
            per_col_transform=False,
            drop_origin_columns=drop,
            # Remaining keyword arguments are forwarded as given.
            dtype=dtype,
            handle_unknown=handle_unknown,
            categories=categories,
            unknown_value=unknown_value,
        )
class MinMaxScaler(SklearnTransformWrapper):
    """
    Transform a dataset by scaling the values of specified column(s) to the
    expected range: [min, max].

    The transformation is done by:

        X_std = (X - X.min) / (X.max - X.min)
        X_scaled = X_std * (max - min) + min

    Thin wrapper that passes ``sklearn.preprocessing.MinMaxScaler`` and its
    keyword arguments to ``SklearnTransformWrapper``. The wrapper is always
    called with ``drop_origin_columns=True``; this class offers no parameter
    to change that.

    Args:
        cols(str|List|None): Column name(s) to be scaled, default=None.
            NOTE(review): how None is interpreted is decided by
            SklearnTransformWrapper; confirm there.
        f_range(tuple): tuple (min, max), default=(0, 1). Desired range of
            transformed values.
        clip(bool): Set to True to clip transformed values of held-out data
            to the provided feature range, default=False.

    Returns:
        None
    """

    def __init__(
        self,
        cols: Optional[Union[str, List[str]]] = None,
        f_range: tuple = (0, 1),
        clip: bool = False,
    ):
        super().__init__(
            # Plain attribute access; equivalent to the former
            # preprocessing.__getattribute__('MinMaxScaler') call.
            preprocessing.MinMaxScaler,
            in_col_names=cols,
            drop_origin_columns=True,
            # NOTE(review): presumably one scaler is fitted per column;
            # confirm against SklearnTransformWrapper.
            per_col_transform=True,
            # f_range is forwarded under the sklearn name feature_range.
            feature_range=f_range,
            clip=clip,
        )
class StandardScaler(SklearnTransformWrapper):
    """
    Transform a dataset by scaling the values of specified column(s) to zero
    mean and unit variance.

    The transformation is done by:

        z = (x - u) / s

    where u is the mean, or zero if with_mean=False, and s is the standard
    deviation, or one if with_std=False.

    Thin wrapper that passes ``sklearn.preprocessing.StandardScaler`` and its
    keyword arguments to ``SklearnTransformWrapper``. The wrapper is always
    called with ``drop_origin_columns=True``; this class offers no parameter
    to change that.

    Args:
        cols(str|List|None): Column name or names to be scaled, default=None.
            NOTE(review): how None is interpreted is decided by
            SklearnTransformWrapper; confirm there.
        with_mean(bool): If True, center the data before scaling,
            default=True.
        with_std(bool): If True, scale the data to unit variance,
            default=True.

    Returns:
        None
    """

    def __init__(
        self,
        cols: Optional[Union[str, List[str]]] = None,
        with_mean: bool = True,
        with_std: bool = True,
    ):
        super().__init__(
            # Plain attribute access; equivalent to the former
            # preprocessing.__getattribute__('StandardScaler') call.
            preprocessing.StandardScaler,
            in_col_names=cols,
            drop_origin_columns=True,
            # NOTE(review): presumably one scaler is fitted per column;
            # confirm against SklearnTransformWrapper.
            per_col_transform=True,
            with_mean=with_mean,
            with_std=with_std,
        )