Source code for paddlets.transform.statistical

# !/usr/bin/env python3
# -*- coding:utf-8 -*-

from typing import Union, List

import pandas as pd
import numpy as np

from paddlets.transform.base import BaseTransform
from paddlets.datasets.tsdataset import TimeSeries, TSDataset
from paddlets.logger import Logger, raise_if_not, raise_if, raise_log
from paddlets.logger.logger import log_decorator

# Logger instance for this module, constructed with the module's __name__.
logger = Logger(__name__)

# Names of the statistics StatsTransform accepts. Each name is looked up as a
# method on a pandas Series window, and the list is also the default value of
# the `statistics` constructor argument.
STATISTICS = ['median', 'mean', 'max', 'min', 'std']

class StatsTransform(BaseTransform):
    """
    Statistical features: 'median', 'mean', 'max', 'min', 'std'

    For every row ``i`` of a selected column, each requested statistic is computed
    over the rows ``i - end + 1`` .. ``i - start`` (both inclusive), i.e. a trailing
    window of ``end - start`` rows. Rows whose window would start before the first
    row of the series get ``nan``. Results are written to new columns named
    ``<col>_<statistic>``.

    Args:
        cols(str|List): Name of columns to transform.
        start(int): Start coordinates.
        end(int): End coordinates.
        statistics(str|List): Indicators that need to be counted, default=['median', 'mean', 'max', 'min', 'std'].

    Raises:
        ValueError: If no column or statistic is given, if an unknown statistic is
            requested, if ``start`` or ``end`` is negative, or if ``end <= start``.

    Examples:
        .. code-block:: python

            Given X:
                X
                1
                2
                3
                4

            statistics = ['mean'], start = 0, end = 2

            after transform:
                X X_mean
                1 nan
                2 1.5
                3 2.5
                4 3.5
            Remark: since the first element(1) start index has no value, the result of calculating the mean is nan

            statistics = ['mean'], start = 1, end = 3

            after transform:
                X X_mean
                1 nan
                2 nan
                3 1.5
                4 2.5
    """

    def __init__(self, cols: Union[str, List],
                 start: int = 0,
                 end: int = 1,
                 statistics: Union[str, List] = STATISTICS):
        super(StatsTransform, self).__init__()
        self._cols = [cols] if isinstance(cols, str) else cols
        if len(self._cols) < 1:
            raise_log(ValueError("The feature column is not specified!"))

        # A single statistic may be given as a plain string. Without wrapping it,
        # set("mean") would be a set of characters and always fail validation.
        if isinstance(statistics, str):
            statistics = [statistics]
        # Copy so the module-level default list is never aliased, and drop
        # duplicates (order preserved) so each output column is produced once.
        self._statistics = list(dict.fromkeys(statistics))
        if len(self._statistics) < 1:
            raise_log(ValueError("The statistics are not specified!"))
        if not set(self._statistics) <= set(STATISTICS):
            raise_log(ValueError("%s not in %s" % (self._statistics, STATISTICS)))

        if start < 0 or end < 0:
            raise_log(ValueError("Start or end index less than 0"))
        if end <= start:
            raise_log(ValueError("Start index greater than end"))

        self._start = start
        self._end = end

        self.need_previous_data = True
        self.n_rows_pre_data_need = self._end

    @log_decorator
    def fit_one(self, tsdata: TSDataset):
        """
        Fit the StatsTransform to dataset.

        Nothing is learned from the data; the instance is returned unchanged.

        Args:
            tsdata(TSDataset): Dataset to be fitted.

        Returns:
            StatsTransform
        """
        return self

    def _window_stats(self, series: pd.Series) -> dict:
        """
        Compute every requested statistic over the trailing window of each row.

        Args:
            series(pd.Series): Float-valued series of one column.

        Returns:
            dict: Maps each statistic name to a list with one value per row of ``series``.
        """
        results = {name: [] for name in self._statistics}
        # Rows whose window would begin before the series get an empty window, so
        # the pandas reducers return nan. A negative slice start must not be passed
        # on: it would wrap around (or clamp to 0) and yield a bogus partial window
        # when the series is shorter than the window.
        empty_window = series.iloc[0:0]
        for i in range(len(series)):
            lo = i - self._end + 1
            hi = i - self._start + 1
            window = series.iloc[lo:hi] if lo >= 0 else empty_window
            for name in self._statistics:
                results[name].append(getattr(window, name)())
        return results

    @log_decorator
    def transform_one(self, tsdata: TSDataset, inplace: bool = False) -> TSDataset:
        """
        Transform dataset to statstransform codes.

        Columns whose values cannot be converted to float are skipped with a warning.
        Statistics of a 'target' column are added as 'observed_cov' columns; for any
        other column they are added next to the source column.

        Args:
            tsdata(TSDataset): Dataset to be transformed.
            inplace(bool): Whether to perform the transformation inplace. default=False

        Returns:
            TSDataset
        """
        new_ts = tsdata if inplace else tsdata.copy()

        statics_df = tsdata[self._cols]
        if isinstance(statics_df, pd.Series):
            statics_df = statics_df.to_frame()

        for col in self._cols:
            try:
                cur_series = statics_df[col].astype('float')
            except ValueError:
                logger.warning("Values in the column %s should be numerical" % (col))
                # Skip the column: continuing would either fail on an undefined
                # series or silently reuse the previous column's values.
                continue

            data_item = new_ts.get_item_from_column(col).data
            # Results are held in a local dict (not on self), so a failure midway
            # cannot leak stale values into a later call.
            stats = self._window_stats(cur_series)

            for name, values in stats.items():
                new_name = '%s_%s' % (col, name)
                if new_ts.columns[col] == 'target':
                    new_value = pd.Series(values)
                    new_value.index = new_ts[col].index
                    new_ts.set_column(new_name, new_value, 'observed_cov')
                else:
                    data_item[new_name] = values

        return new_ts