Edit on GitHub

utils.math.statistics

Mathematical statistics.

View Source

  1"""
  2Mathematical statistics.
  3"""
  4
  5import numpy
  6import pandas
  7
  8from typing import List, Hashable
  9
 10from ..logs.log import logger
 11
 12
 13def findOutliers(
 14    srs: pandas.Series,
 15    sigma: int = 3,
 16    joinNeighbouringOutliers: bool = True,
 17    howManyCountAsNeighbours: int = 3
 18) -> pandas.Series:
 19    """
 20    Find outliers in the array and return the mask where outliers
 21    are marked with `True`. NaNs and INFs, if any, do not count
 22    as outliers.
 23
 24    If `joinNeighbouringOutliers` is `True`, then non-outlier values
 25    between neighbouring outliers will be marked as outliers too. The same
 26    goes for starting/ending elements, if they are close enough to an outlier.
 27
 28    Example with `joinNeighbouringOutliers` set to `True`:
 29
 30    ``` py
 31    import pandas
 32    import numpy
 33
 34    from phab.utils.math import statistics
 35
 36    srs = pandas.Series(
 37        [4, 111, 4, 4, 5, 6, numpy.inf, 2, 4, 4, numpy.nan, 1, 1e15, 4, 3, 3, 101, 2, 4, 3]
 38    )
 39
 40    markedOutliers = statistics.findOutliers(
 41        srs,
 42        sigma=3,
 43        joinNeighbouringOutliers=True,
 44        howManyCountAsNeighbours=3
 45    )
 46
 47    for i, v in srs.items():
 48        if markedOutliers[i] == True:
 49            print(f"{v} - outlier")
 50        else:
 51            print(f"{v} - regular")
 52    # 4.0 - outlier
 53    # 111.0 - outlier
 54    # 4.0 - regular
 55    # 4.0 - regular
 56    # 5.0 - regular
 57    # 6.0 - regular
 58    # inf - regular
 59    # 2.0 - regular
 60    # 4.0 - regular
 61    # 4.0 - regular
 62    # nan - regular
 63    # 1.0 - regular
 64    # 1000000000000000.0 - outlier
 65    # 4.0 - outlier
 66    # 3.0 - outlier
 67    # 3.0 - outlier
 68    # 101.0 - outlier
 69    # 2.0 - outlier
 70    # 4.0 - outlier
 71    # 3.0 - outlier
 72    ```
 73
 74    Example with `joinNeighbouringOutliers` set to `False`:
 75
 76    ``` py
 77    # ...
 78
 79    markedOutliers = statistics.findOutliers(
 80        srs,
 81        sigma=3,
 82        joinNeighbouringOutliers=False,
 83        howManyCountAsNeighbours=3
 84    )
 85
 86    for i, v in srs.items():
 87        if markedOutliers[i] == True:
 88            print(f"{v} - outlier")
 89        else:
 90            print(f"{v} - regular")
 91    # 4.0 - regular
 92    # 111.0 - outlier
 93    # 4.0 - regular
 94    # 4.0 - regular
 95    # 5.0 - regular
 96    # 6.0 - regular
 97    # inf - regular
 98    # 2.0 - regular
 99    # 4.0 - regular
100    # 4.0 - regular
101    # nan - regular
102    # 1.0 - regular
103    # 1000000000000000.0 - outlier
104    # 4.0 - regular
105    # 3.0 - regular
106    # 3.0 - regular
107    # 101.0 - outlier
108    # 2.0 - regular
109    # 4.0 - regular
110    # 3.0 - regular
111    ```
112    """
113    # tbl = pandas.DataFrame({"vls": srs})
114    tbl = pandas.DataFrame(srs)
115
116    # find NaNs/INFs
117    tbl["finite"] = numpy.isfinite(srs)
118
119    # every element is an outlier by default
120    tbl["outliers"] = numpy.ones(len(tbl.index), dtype=bool)
121
122    # NaNs and INFs are not outliers
123    # print(tbl["finite"].loc[lambda x: x == False].index)
124    tbl.loc[
125        tbl["finite"] == False,
126        "outliers"
127    ] = False
128
129    tableFinite = tbl[tbl["finite"] == True].iloc[:, 0]
130
131    med = numpy.median(tableFinite.values)  # type:ignore[arg-type] # ya hz
132    sig = 1.48 * numpy.median(numpy.abs(tableFinite - med))  # type:ignore[operator] # ya hz
133
134    for i, v in tableFinite.items():
135        if v > med - sigma * sig:
136            tbl.at[i, "outliers"] = False
137
138    for i, v in tableFinite.items():
139        if v >= med + sigma * sig:
140            tbl.at[i, "outliers"] = True
141
142    if not joinNeighbouringOutliers:
143        # the entire table with all columns (the original and two new ones,
144        # "finite" and "ouliers")
145        # return tbl
146        #
147        # or just two columns: first one with the values and "outliers"
148        # return tbl.iloc[:,0:3:2]
149        #
150        # or just the outliers
151        return tbl["outliers"]
152        #
153        # or just the "outliers" that are True, because it's a waste to pass
154        # around all the values, let alone the entire table
155        # return tbl[tbl["outliers"] == True]["outliers"]
156    else:
157        outlrs: pandas.Series = tbl["outliers"].copy()
158        elementsSincePreviousOutlier: int = 0
159        #
160        # later we might want to have the number of neighbours to scale
161        # with the list length, but for now it is passed as a fixed
162        # number in the `howManyCountAsNeighbours` argument of the function
163        # howManyCountAsNeighbours = round(len(outlrs.index) / 10)
164        #
165        # intuitively it should be `False`, but we need to account
166        # for possible neighbours from the very start
167        countingSincePreviousOutlier: bool = True
168        potentialNeighbourOutliers: List[Hashable] = []
169        for i, v in outlrs.items():
170            if v == True:
171                if countingSincePreviousOutlier:
172                    # make all the previous elements to be outliers too
173                    for pno in potentialNeighbourOutliers:
174                        outlrs[pno] = True
175                    potentialNeighbourOutliers = []
176                    elementsSincePreviousOutlier = 0
177                else:
178                    countingSincePreviousOutlier = True
179            else:
180                if countingSincePreviousOutlier:
181                    elementsSincePreviousOutlier += 1
182                    if elementsSincePreviousOutlier > howManyCountAsNeighbours:
183                        elementsSincePreviousOutlier = 0
184                        countingSincePreviousOutlier = False
185                        potentialNeighbourOutliers = []
186                    else:
187                        potentialNeighbourOutliers.append(i)
188        # if there are some pending potential neighbour outliers
189        # after we finished iterating the list, make them outliers
190        if len(potentialNeighbourOutliers) > 0:
191            for pno in potentialNeighbourOutliers:
192                outlrs[pno] = True
193            potentialNeighbourOutliers = []
194        return outlrs

def findOutliers( srs: pandas.core.series.Series, sigma: int = 3, joinNeighbouringOutliers: bool = True, howManyCountAsNeighbours: int = 3) -> pandas.core.series.Series: View Source

 14def findOutliers(
 15    srs: pandas.Series,
 16    sigma: int = 3,
 17    joinNeighbouringOutliers: bool = True,
 18    howManyCountAsNeighbours: int = 3
 19) -> pandas.Series:
 20    """
 21    Find outliers in the array and return the mask where outliers
 22    are marked with `True`. NaNs and INFs, if any, do not count
 23    as outliers.
 24
 25    If `joinNeighbouringOutliers` is `True`, then non-outlier values
 26    between neighbouring outliers will be marked as outliers too. The same
 27    goes for starting/ending elements, if they are close enough to an outlier.
 28
 29    Example with `joinNeighbouringOutliers` set to `True`:
 30
 31    ``` py
 32    import pandas
 33    import numpy
 34
 35    from phab.utils.math import statistics
 36
 37    srs = pandas.Series(
 38        [4, 111, 4, 4, 5, 6, numpy.inf, 2, 4, 4, numpy.nan, 1, 1e15, 4, 3, 3, 101, 2, 4, 3]
 39    )
 40
 41    markedOutliers = statistics.findOutliers(
 42        srs,
 43        sigma=3,
 44        joinNeighbouringOutliers=True,
 45        howManyCountAsNeighbours=3
 46    )
 47
 48    for i, v in srs.items():
 49        if markedOutliers[i] == True:
 50            print(f"{v} - outlier")
 51        else:
 52            print(f"{v} - regular")
 53    # 4.0 - outlier
 54    # 111.0 - outlier
 55    # 4.0 - regular
 56    # 4.0 - regular
 57    # 5.0 - regular
 58    # 6.0 - regular
 59    # inf - regular
 60    # 2.0 - regular
 61    # 4.0 - regular
 62    # 4.0 - regular
 63    # nan - regular
 64    # 1.0 - regular
 65    # 1000000000000000.0 - outlier
 66    # 4.0 - outlier
 67    # 3.0 - outlier
 68    # 3.0 - outlier
 69    # 101.0 - outlier
 70    # 2.0 - outlier
 71    # 4.0 - outlier
 72    # 3.0 - outlier
 73    ```
 74
 75    Example with `joinNeighbouringOutliers` set to `False`:
 76
 77    ``` py
 78    # ...
 79
 80    markedOutliers = statistics.findOutliers(
 81        srs,
 82        sigma=3,
 83        joinNeighbouringOutliers=False,
 84        howManyCountAsNeighbours=3
 85    )
 86
 87    for i, v in srs.items():
 88        if markedOutliers[i] == True:
 89            print(f"{v} - outlier")
 90        else:
 91            print(f"{v} - regular")
 92    # 4.0 - regular
 93    # 111.0 - outlier
 94    # 4.0 - regular
 95    # 4.0 - regular
 96    # 5.0 - regular
 97    # 6.0 - regular
 98    # inf - regular
 99    # 2.0 - regular
100    # 4.0 - regular
101    # 4.0 - regular
102    # nan - regular
103    # 1.0 - regular
104    # 1000000000000000.0 - outlier
105    # 4.0 - regular
106    # 3.0 - regular
107    # 3.0 - regular
108    # 101.0 - outlier
109    # 2.0 - regular
110    # 4.0 - regular
111    # 3.0 - regular
112    ```
113    """
114    # tbl = pandas.DataFrame({"vls": srs})
115    tbl = pandas.DataFrame(srs)
116
117    # find NaNs/INFs
118    tbl["finite"] = numpy.isfinite(srs)
119
120    # every element is an outlier by default
121    tbl["outliers"] = numpy.ones(len(tbl.index), dtype=bool)
122
123    # NaNs and INFs are not outliers
124    # print(tbl["finite"].loc[lambda x: x == False].index)
125    tbl.loc[
126        tbl["finite"] == False,
127        "outliers"
128    ] = False
129
130    tableFinite = tbl[tbl["finite"] == True].iloc[:, 0]
131
132    med = numpy.median(tableFinite.values)  # type:ignore[arg-type] # ya hz
133    sig = 1.48 * numpy.median(numpy.abs(tableFinite - med))  # type:ignore[operator] # ya hz
134
135    for i, v in tableFinite.items():
136        if v > med - sigma * sig:
137            tbl.at[i, "outliers"] = False
138
139    for i, v in tableFinite.items():
140        if v >= med + sigma * sig:
141            tbl.at[i, "outliers"] = True
142
143    if not joinNeighbouringOutliers:
144        # the entire table with all columns (the original and two new ones,
145        # "finite" and "ouliers")
146        # return tbl
147        #
148        # or just two columns: first one with the values and "outliers"
149        # return tbl.iloc[:,0:3:2]
150        #
151        # or just the outliers
152        return tbl["outliers"]
153        #
154        # or just the "outliers" that are True, because it's a waste to pass
155        # around all the values, let alone the entire table
156        # return tbl[tbl["outliers"] == True]["outliers"]
157    else:
158        outlrs: pandas.Series = tbl["outliers"].copy()
159        elementsSincePreviousOutlier: int = 0
160        #
161        # later we might want to have the number of neighbours to scale
162        # with the list length, but for now it is passed as a fixed
163        # number in the `howManyCountAsNeighbours` argument of the function
164        # howManyCountAsNeighbours = round(len(outlrs.index) / 10)
165        #
166        # intuitively it should be `False`, but we need to account
167        # for possible neighbours from the very start
168        countingSincePreviousOutlier: bool = True
169        potentialNeighbourOutliers: List[Hashable] = []
170        for i, v in outlrs.items():
171            if v == True:
172                if countingSincePreviousOutlier:
173                    # make all the previous elements to be outliers too
174                    for pno in potentialNeighbourOutliers:
175                        outlrs[pno] = True
176                    potentialNeighbourOutliers = []
177                    elementsSincePreviousOutlier = 0
178                else:
179                    countingSincePreviousOutlier = True
180            else:
181                if countingSincePreviousOutlier:
182                    elementsSincePreviousOutlier += 1
183                    if elementsSincePreviousOutlier > howManyCountAsNeighbours:
184                        elementsSincePreviousOutlier = 0
185                        countingSincePreviousOutlier = False
186                        potentialNeighbourOutliers = []
187                    else:
188                        potentialNeighbourOutliers.append(i)
189        # if there are some pending potential neighbour outliers
190        # after we finished iterating the list, make them outliers
191        if len(potentialNeighbourOutliers) > 0:
192            for pno in potentialNeighbourOutliers:
193                outlrs[pno] = True
194            potentialNeighbourOutliers = []
195        return outlrs

Find outliers in the array and return the mask where outliers are marked with True. NaNs and INFs, if any, do not count as outliers.

If joinNeighbouringOutliers is True, then runs of at most howManyCountAsNeighbours non-outlier values between neighbouring outliers will be marked as outliers too. The same goes for starting/ending elements, if they are close enough to an outlier.

Example with joinNeighbouringOutliers set to True:

import pandas
import numpy

from phab.utils.math import statistics

srs = pandas.Series(
    [4, 111, 4, 4, 5, 6, numpy.inf, 2, 4, 4, numpy.nan, 1, 1e15, 4, 3, 3, 101, 2, 4, 3]
)

markedOutliers = statistics.findOutliers(
    srs,
    sigma=3,
    joinNeighbouringOutliers=True,
    howManyCountAsNeighbours=3
)

for i, v in srs.items():
    if markedOutliers[i] == True:
        print(f"{v} - outlier")
    else:
        print(f"{v} - regular")
# 4.0 - outlier
# 111.0 - outlier
# 4.0 - regular
# 4.0 - regular
# 5.0 - regular
# 6.0 - regular
# inf - regular
# 2.0 - regular
# 4.0 - regular
# 4.0 - regular
# nan - regular
# 1.0 - regular
# 1000000000000000.0 - outlier
# 4.0 - outlier
# 3.0 - outlier
# 3.0 - outlier
# 101.0 - outlier
# 2.0 - outlier
# 4.0 - outlier
# 3.0 - outlier

Example with joinNeighbouringOutliers set to False:

# ...

markedOutliers = statistics.findOutliers(
    srs,
    sigma=3,
    joinNeighbouringOutliers=False,
    howManyCountAsNeighbours=3
)

for i, v in srs.items():
    if markedOutliers[i] == True:
        print(f"{v} - outlier")
    else:
        print(f"{v} - regular")
# 4.0 - regular
# 111.0 - outlier
# 4.0 - regular
# 4.0 - regular
# 5.0 - regular
# 6.0 - regular
# inf - regular
# 2.0 - regular
# 4.0 - regular
# 4.0 - regular
# nan - regular
# 1.0 - regular
# 1000000000000000.0 - outlier
# 4.0 - regular
# 3.0 - regular
# 3.0 - regular
# 101.0 - outlier
# 2.0 - regular
# 4.0 - regular
# 3.0 - regular