utils.math.statistics
Mathematical statistics.
"""
Mathematical statistics.
"""

import numpy
import pandas

from typing import List, Hashable

from ..logs.log import logger


def findOutliers(
    srs: pandas.Series,
    sigma: int = 3,
    joinNeighbouringOutliers: bool = True,
    howManyCountAsNeighbours: int = 3
) -> pandas.Series:
    """
    Find outliers in the array and return the mask where outliers
    are marked with `True`. NaNs and INFs, if any, do not count
    as outliers.

    A finite value is an outlier when it lies `sigma` robust standard
    deviations or more away from the median of the finite values; the robust
    standard deviation is estimated as 1.48 times the median absolute
    deviation. Values equal to the median are never outliers, so a constant
    series has none.

    If `joinNeighbouringOutliers` is `True`, then every run of at most
    `howManyCountAsNeighbours` non-outlier values between neighbouring
    outliers will be marked as outliers too. The same goes for
    starting/ending elements, if they are close enough to an outlier.
    NaNs and INFs that sit inside such a run are marked together with it.
    If there are no outliers at all, nothing is joined.

    The returned mask is a boolean series named `outliers` that has
    the same index as `srs`.

    Example with `joinNeighbouringOutliers` set to `True`:

    ``` py
    import pandas
    import numpy

    from phab.utils.math import statistics

    srs = pandas.Series(
        [4, 111, 4, 4, 5, 6, numpy.inf, 2, 4, 4, numpy.nan, 1, 1e15, 4, 3, 3, 101, 2, 4, 3]
    )

    markedOutliers = statistics.findOutliers(
        srs,
        sigma=3,
        joinNeighbouringOutliers=True,
        howManyCountAsNeighbours=3
    )

    for i, v in srs.items():
        if markedOutliers[i] == True:
            print(f"{v} - outlier")
        else:
            print(f"{v} - regular")
    # 4.0 - outlier
    # 111.0 - outlier
    # 4.0 - regular
    # 4.0 - regular
    # 5.0 - regular
    # 6.0 - regular
    # inf - regular
    # 2.0 - regular
    # 4.0 - regular
    # 4.0 - regular
    # nan - regular
    # 1.0 - regular
    # 1000000000000000.0 - outlier
    # 4.0 - outlier
    # 3.0 - outlier
    # 3.0 - outlier
    # 101.0 - outlier
    # 2.0 - outlier
    # 4.0 - outlier
    # 3.0 - outlier
    ```

    Example with `joinNeighbouringOutliers` set to `False`:

    ``` py
    # ...

    markedOutliers = statistics.findOutliers(
        srs,
        sigma=3,
        joinNeighbouringOutliers=False,
        howManyCountAsNeighbours=3
    )

    for i, v in srs.items():
        if markedOutliers[i] == True:
            print(f"{v} - outlier")
        else:
            print(f"{v} - regular")
    # 4.0 - regular
    # 111.0 - outlier
    # 4.0 - regular
    # 4.0 - regular
    # 5.0 - regular
    # 6.0 - regular
    # inf - regular
    # 2.0 - regular
    # 4.0 - regular
    # 4.0 - regular
    # nan - regular
    # 1.0 - regular
    # 1000000000000000.0 - outlier
    # 4.0 - regular
    # 3.0 - regular
    # 3.0 - regular
    # 101.0 - outlier
    # 2.0 - regular
    # 4.0 - regular
    # 3.0 - regular
    ```
    """
    # work on plain positional arrays: this keeps the result independent
    # of the series name and of duplicated index labels
    values = srs.to_numpy()
    finite = numpy.isfinite(values)

    # nothing is an outlier until proven otherwise, so NaNs and INFs
    # never become outliers on their own
    isOutlier = numpy.zeros(len(values), dtype=bool)

    finiteValues = values[finite]
    if finiteValues.size > 0:
        med = numpy.median(finiteValues)
        sig = 1.48 * numpy.median(numpy.abs(finiteValues - med))
        spread = sigma * sig
        beyond = (finiteValues <= med - spread) | (finiteValues >= med + spread)
        # when the spread is zero both bounds collapse onto the median,
        # and without this guard every value would be flagged
        beyond &= finiteValues != med
        isOutlier[finite] = beyond

    # joining only makes sense when there is at least one outlier
    # for the other elements to be neighbours of
    if joinNeighbouringOutliers and isOutlier.any():
        joined = isOutlier.copy()
        # position where the current run of non-outliers started,
        # or `None` while we are not inside such a run
        gapStart = None
        for pos, flagged in enumerate(isOutlier):
            if flagged:
                if (
                    gapStart is not None
                    and pos - gapStart <= howManyCountAsNeighbours
                ):
                    joined[gapStart:pos] = True
                gapStart = None
            elif gapStart is None:
                gapStart = pos
        # trailing run after the last outlier
        if (
            gapStart is not None
            and len(isOutlier) - gapStart <= howManyCountAsNeighbours
        ):
            joined[gapStart:] = True
        isOutlier = joined

    return pandas.Series(isOutlier, index=srs.index, name="outliers")
def
findOutliers( srs: pandas.core.series.Series, sigma: int = 3, joinNeighbouringOutliers: bool = True, howManyCountAsNeighbours: int = 3) -> pandas.core.series.Series:
def findOutliers(
    srs: pandas.Series,
    sigma: int = 3,
    joinNeighbouringOutliers: bool = True,
    howManyCountAsNeighbours: int = 3
) -> pandas.Series:
    """
    Find outliers in the array and return the mask where outliers
    are marked with `True`. NaNs and INFs, if any, do not count
    as outliers.

    A finite value is an outlier when it lies `sigma` robust standard
    deviations or more away from the median of the finite values; the robust
    standard deviation is estimated as 1.48 times the median absolute
    deviation. Values equal to the median are never outliers, so a constant
    series has none.

    If `joinNeighbouringOutliers` is `True`, then every run of at most
    `howManyCountAsNeighbours` non-outlier values between neighbouring
    outliers will be marked as outliers too. The same goes for
    starting/ending elements, if they are close enough to an outlier.
    NaNs and INFs that sit inside such a run are marked together with it.
    If there are no outliers at all, nothing is joined.

    The returned mask is a boolean series named `outliers` that has
    the same index as `srs`.

    Example with `joinNeighbouringOutliers` set to `True`:

    ``` py
    import pandas
    import numpy

    from phab.utils.math import statistics

    srs = pandas.Series(
        [4, 111, 4, 4, 5, 6, numpy.inf, 2, 4, 4, numpy.nan, 1, 1e15, 4, 3, 3, 101, 2, 4, 3]
    )

    markedOutliers = statistics.findOutliers(
        srs,
        sigma=3,
        joinNeighbouringOutliers=True,
        howManyCountAsNeighbours=3
    )

    for i, v in srs.items():
        if markedOutliers[i] == True:
            print(f"{v} - outlier")
        else:
            print(f"{v} - regular")
    # 4.0 - outlier
    # 111.0 - outlier
    # 4.0 - regular
    # 4.0 - regular
    # 5.0 - regular
    # 6.0 - regular
    # inf - regular
    # 2.0 - regular
    # 4.0 - regular
    # 4.0 - regular
    # nan - regular
    # 1.0 - regular
    # 1000000000000000.0 - outlier
    # 4.0 - outlier
    # 3.0 - outlier
    # 3.0 - outlier
    # 101.0 - outlier
    # 2.0 - outlier
    # 4.0 - outlier
    # 3.0 - outlier
    ```

    Example with `joinNeighbouringOutliers` set to `False`:

    ``` py
    # ...

    markedOutliers = statistics.findOutliers(
        srs,
        sigma=3,
        joinNeighbouringOutliers=False,
        howManyCountAsNeighbours=3
    )

    for i, v in srs.items():
        if markedOutliers[i] == True:
            print(f"{v} - outlier")
        else:
            print(f"{v} - regular")
    # 4.0 - regular
    # 111.0 - outlier
    # 4.0 - regular
    # 4.0 - regular
    # 5.0 - regular
    # 6.0 - regular
    # inf - regular
    # 2.0 - regular
    # 4.0 - regular
    # 4.0 - regular
    # nan - regular
    # 1.0 - regular
    # 1000000000000000.0 - outlier
    # 4.0 - regular
    # 3.0 - regular
    # 3.0 - regular
    # 101.0 - outlier
    # 2.0 - regular
    # 4.0 - regular
    # 3.0 - regular
    ```
    """
    # work on plain positional arrays: this keeps the result independent
    # of the series name and of duplicated index labels
    values = srs.to_numpy()
    finite = numpy.isfinite(values)

    # nothing is an outlier until proven otherwise, so NaNs and INFs
    # never become outliers on their own
    isOutlier = numpy.zeros(len(values), dtype=bool)

    finiteValues = values[finite]
    if finiteValues.size > 0:
        med = numpy.median(finiteValues)
        sig = 1.48 * numpy.median(numpy.abs(finiteValues - med))
        spread = sigma * sig
        beyond = (finiteValues <= med - spread) | (finiteValues >= med + spread)
        # when the spread is zero both bounds collapse onto the median,
        # and without this guard every value would be flagged
        beyond &= finiteValues != med
        isOutlier[finite] = beyond

    # joining only makes sense when there is at least one outlier
    # for the other elements to be neighbours of
    if joinNeighbouringOutliers and isOutlier.any():
        joined = isOutlier.copy()
        # position where the current run of non-outliers started,
        # or `None` while we are not inside such a run
        gapStart = None
        for pos, flagged in enumerate(isOutlier):
            if flagged:
                if (
                    gapStart is not None
                    and pos - gapStart <= howManyCountAsNeighbours
                ):
                    joined[gapStart:pos] = True
                gapStart = None
            elif gapStart is None:
                gapStart = pos
        # trailing run after the last outlier
        if (
            gapStart is not None
            and len(isOutlier) - gapStart <= howManyCountAsNeighbours
        ):
            joined[gapStart:] = True
        isOutlier = joined

    return pandas.Series(isOutlier, index=srs.index, name="outliers")
Find outliers in the array and return the mask where outliers
are marked with `True`. NaNs and INFs, if any, do not count
as outliers.
If `joinNeighbouringOutliers` is `True`, then non-outlier values
between neighbouring outliers will be marked as outliers too. The same
goes for starting/ending elements, if they are close enough to an outlier.
Example with `joinNeighbouringOutliers` set to `True`:
import pandas
import numpy
from phab.utils.math import statistics
srs = pandas.Series(
[4, 111, 4, 4, 5, 6, numpy.inf, 2, 4, 4, numpy.nan, 1, 1e15, 4, 3, 3, 101, 2, 4, 3]
)
markedOutliers = statistics.findOutliers(
srs,
sigma=3,
joinNeighbouringOutliers=True,
howManyCountAsNeighbours=3
)
for i, v in srs.items():
    if markedOutliers[i] == True:
        print(f"{v} - outlier")
    else:
        print(f"{v} - regular")
# 4.0 - outlier
# 111.0 - outlier
# 4.0 - regular
# 4.0 - regular
# 5.0 - regular
# 6.0 - regular
# inf - regular
# 2.0 - regular
# 4.0 - regular
# 4.0 - regular
# nan - regular
# 1.0 - regular
# 1000000000000000.0 - outlier
# 4.0 - outlier
# 3.0 - outlier
# 3.0 - outlier
# 101.0 - outlier
# 2.0 - outlier
# 4.0 - outlier
# 3.0 - outlier
Example with `joinNeighbouringOutliers` set to `False`:
# ...
markedOutliers = statistics.findOutliers(
srs,
sigma=3,
joinNeighbouringOutliers=False,
howManyCountAsNeighbours=3
)
for i, v in srs.items():
    if markedOutliers[i] == True:
        print(f"{v} - outlier")
    else:
        print(f"{v} - regular")
# 4.0 - regular
# 111.0 - outlier
# 4.0 - regular
# 4.0 - regular
# 5.0 - regular
# 6.0 - regular
# inf - regular
# 2.0 - regular
# 4.0 - regular
# 4.0 - regular
# nan - regular
# 1.0 - regular
# 1000000000000000.0 - outlier
# 4.0 - regular
# 3.0 - regular
# 3.0 - regular
# 101.0 - outlier
# 2.0 - regular
# 4.0 - regular
# 3.0 - regular