# Source code for paddlets.transform.statistical

# !/usr/bin/env python3
# -*- coding:utf-8 -*-

from typing import Union, List

import pandas as pd
import numpy as np

from paddlets.transform.base import BaseTransform
from paddlets.datasets.tsdataset import TimeSeries, TSDataset
from paddlets.logger import Logger, raise_if_not, raise_if, raise_log
from paddlets.logger.logger import log_decorator

# Module-level logger, named after this module.
logger = Logger(__name__)

# Names of the statistics StatsTransform accepts; each one is looked up as a
# method of the same name on a pandas Series window.
STATISTICS = ['median', 'mean', 'max', 'min', 'std']

class StatsTransform(BaseTransform):
    """
    Statistical features: 'median', 'mean', 'max', 'min', 'std'.

    For row ``i`` of every selected column, each requested statistic is computed
    over the positional window ``[i - end + 1, i - start + 1)`` of that column
    and stored in a new column named ``<col>_<statistic>``. Rows whose window
    would begin before the first row get ``nan``.

    Args:
        cols(str|List): Name of columns to transform.
        start(int): Start coordinates.
        end(int): End coordinates.
        statistics(str|List): Indicators that need to be counted,
            default=['median', 'mean', 'max', 'min', 'std'].

    Returns:
        None

    Raises:
        ValueError: Passed to ``raise_log`` when no column or statistic is
            given, when an unknown statistic is requested, when ``start`` or
            ``end`` is negative, or when ``end <= start``.

    Examples:
        .. code-block:: python

            Given X:
                X
                1
                2
                3
                4

            statistics = ['mean'], start = 0, end = 2
            after transform:
                X   X_mean
                1   nan
                2   1.5
                3   2.5
                4   3.5
            Remark: since the first element(1) start index has no value,
            the result of calculating the mean is nan

            statistics = ['mean'], start = 1, end = 3
            after transform:
                X   X_mean
                1   nan
                2   nan
                3   1.5
                4   2.5
    """

    def __init__(
        self,
        cols: Union[str, List],
        start: int = 0,
        end: int = 1,
        statistics: Union[str, List] = STATISTICS
    ):
        super(StatsTransform, self).__init__()
        self._cols = cols
        if isinstance(cols, str):
            self._cols = [cols]
        if len(self._cols) < 1:
            raise_log(ValueError("The feature column is not specified!"))

        # A bare string would otherwise be validated character by character.
        if isinstance(statistics, str):
            statistics = [statistics]
        # Copy so the instance never aliases the module-level default list, and
        # drop duplicates (keeping order) so each output column is built once.
        self._statistics = list(dict.fromkeys(statistics))
        if len(self._statistics) < 1:
            raise_log(ValueError("The statistics are not specified!"))
        if not set(self._statistics) <= set(STATISTICS):
            raise_log(ValueError("%s not in %s" % (self._statistics, STATISTICS)))

        if start < 0 or end < 0:
            raise_log(ValueError("Start or end index less than 0"))
        if end <= start:
            raise_log(ValueError("Start index greater than end"))
        self._start = start
        self._end = end

        # NOTE(review): these two attributes are not read in this class;
        # presumably the base class / pipeline uses them to prepend history
        # rows before transforming -- confirm against BaseTransform.
        self.need_previous_data = True
        self.n_rows_pre_data_need = self._end

    @log_decorator
    def fit_one(self, tsdata: TSDataset):
        """
        Fit the StatsTransform to dataset.

        This transform has nothing to learn, so the dataset is not inspected.

        Args:
            tsdata(TSDataset): Dataset to be fitted.

        Returns:
            StatsTransform
        """
        return self

    def _windows(self, series: pd.Series) -> list:
        """
        Build the window of ``series`` belonging to every row.

        Args:
            series(pd.Series): Column values, already cast to float.

        Returns:
            list: One entry per row; a Series slice covering positions
            ``[i - end + 1, i - start + 1)``, or None when that window would
            start before the first row.
        """
        windows = []
        for i in range(len(series)):
            first = i - self._end + 1
            last = i - self._start + 1
            # A negative bound must not reach the slice: it would wrap around
            # or clamp and return rows that are not part of the window.
            windows.append(series.iloc[first:last] if first >= 0 else None)
        return windows

    @log_decorator
    def transform_one(self, tsdata: TSDataset, inplace: bool = False) -> TSDataset:
        """
        Add the configured window statistics to the dataset as new columns.

        Statistics of a 'target' column are added through ``set_column`` as
        'observed_cov'; statistics of any other column are written next to the
        source column in its own data frame. A column that cannot be cast to
        float is skipped after a warning.

        Args:
            tsdata(TSDataset): Dataset to be transformed.
            inplace(bool): Whether to perform the transformation inplace. default=False

        Returns:
            TSDataset
        """
        new_ts = tsdata if inplace else tsdata.copy()

        statics_df = tsdata[self._cols]
        if isinstance(statics_df, pd.Series):
            statics_df = statics_df.to_frame()

        for col in self._cols:
            data_item = new_ts.get_item_from_column(col).data
            try:
                cur_series = statics_df[col].astype('float')
            except (ValueError, TypeError):
                logger.warning("Values in the column %s should be numerical" % (col))
                # Without this the loop would go on with an unbound series or
                # with the previous column's values.
                continue

            windows = self._windows(cur_series)
            is_target = new_ts.columns[col] == 'target'
            for e in self._statistics:
                # Results are kept in a local list so a failure part-way
                # through cannot leak values into the next call.
                values = [
                    np.nan if window is None else getattr(window, e)()
                    for window in windows
                ]
                new_name = '%s_%s' % (col, e)
                if is_target:
                    new_value = pd.Series(values)
                    new_value.index = new_ts[col].index
                    new_ts.set_column(new_name, new_value, 'observed_cov')
                else:
                    data_item[new_name] = values
        return new_ts